def set_member(self, c_id, m_obj):
    if isinstance(m_obj, Model):
        m_obj = [m_obj]
    elif not isinstance(m_obj, list):
        raise ParseError()
    c_ldp_id = self.marmotta.ldp(encoder.encode(c_id))
    collection = self.get_collection(c_id).pop()  # 404 if collection not found
    # Member ids must be unique within one request
    if len(set(m.id for m in m_obj)) != len(m_obj):
        raise ForbiddenError()
    if not collection.capabilities.membershipIsMutable:
        raise ForbiddenError()
    if collection.capabilities.restrictedToType:
        for m in m_obj:
            if not (hasattr(m, "datatype") and m.datatype in collection.capabilities.restrictedToType):
                raise ForbiddenError()
    if collection.capabilities.maxLength >= 0:
        size = self.sparql.size(c_ldp_id).bindings.pop().get(Variable('size'))
        if int(size) > collection.capabilities.maxLength - len(m_obj):
            # Operation forbidden: a collection of maximum size maxLength is full.
            raise ForbiddenError()
    ds = Dataset()
    ldp = ds.graph(identifier=LDP.ns)
    for m in m_obj:
        m_id = self.marmotta.ldp(encoder.encode(c_id) + "/member/" + encoder.encode(m.id))
        member = ds.graph(identifier=m_id)
        member += self.RDA.object_to_graph(member.identifier, m)
        ldp += LDP.add_contains(c_ldp_id + "/member", m_id, False)
    res = self.sparql.insert(ds)
    if res.status_code != 200:
        raise DBError()
    return m_obj
def set_service(self, s_obj):
    ds = Dataset()
    service = ds.graph(identifier=self.marmotta.ldp("service"))
    service += self.RDA.object_to_graph(service.identifier, s_obj)
    ldp = ds.graph(identifier=LDP.ns)
    ldp += LDP.add_contains(self.marmotta.ldp(), service.identifier, False)
    response = self.sparql.insert(ds)
    if response.status_code == 200:
        return s_obj
    else:
        raise DBError()
def set_collection(self, c_obj, over_write=False):
    if isinstance(c_obj, Model):
        c_obj = [c_obj]
    elif not isinstance(c_obj, list):
        raise ParseError()
    # create LD collection and declare as ldp:BasicContainer
    ds = Dataset()
    ldp = ds.graph(identifier=LDP.ns)
    for c in c_obj:
        c_id = encoder.encode(c.id)
        collection = ds.graph(identifier=self.marmotta.ldp(c_id))
        collection += self.RDA.object_to_graph(collection.identifier, c)
        ldp += LDP.add_contains(self.marmotta.ldp(), collection.identifier)
        member = ds.graph(identifier=self.marmotta.ldp(c_id + '/member'))
        ldp += LDP.add_contains(collection.identifier, member.identifier)
    ins = self.sparql.insert(ds)
    if ins.status_code == 200:
        return c_obj
    else:
        raise DBError()
class Fragment(object):
    HYDRA = Namespace("http://www.w3.org/ns/hydra/core#")
    VOID = Namespace("http://rdfs.org/ns/void#")
    FOAF = Namespace("http://xmlns.com/foaf/0.1/")
    DCTERMS = Namespace("http://purl.org/dc/terms/")

    def __init__(self):
        self.rdf_graph = Dataset()

    def add_data_triple(self, subject, predicate, obj):
        self.rdf_graph.add((subject, predicate, obj))

    def add_graph(self, identifier):
        self.rdf_graph.graph(identifier)

    def add_meta_quad(self, graph, subject, predicate, obj):
        # rdflib expects quads as (subject, predicate, object, context)
        self.rdf_graph.add((subject, predicate, obj, graph))

    def add_prefix(self, prefix, uri):
        self.rdf_graph.bind(prefix, uri)

    def serialize(self):
        return self.rdf_graph.serialize(format="trig", encoding="utf-8")
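# A minimal usage sketch for the Fragment helper above. The example namespace,
# graph identifier and terms are invented for illustration only; they assume the
# quad order fixed in add_meta_quad and the standard rdflib API.
from rdflib import Namespace, Literal

EX = Namespace("http://example.org/")

fragment = Fragment()
fragment.add_prefix("ex", EX)
fragment.add_graph(EX.metadata)
# a data triple goes into the default graph
fragment.add_data_triple(EX.dataset, Fragment.DCTERMS.title, Literal("Example dataset"))
# a meta quad is stored in the named graph ex:metadata
fragment.add_meta_quad(EX.metadata, EX.dataset, Fragment.VOID.triples, Literal(1))
print(fragment.serialize().decode("utf-8"))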
class DatasetTestCase(unittest.TestCase):
    store = 'default'
    slow = True
    tmppath = None

    def setUp(self):
        try:
            self.graph = Dataset(store=self.store)
        except ImportError:
            raise SkipTest(
                "Dependencies for store '%s' not available!" % self.store)
        if self.store == "SQLite":
            _, self.tmppath = mkstemp(
                prefix='test', dir='/tmp', suffix='.sqlite')
        elif self.store == "SPARQLUpdateStore":
            root = HOST + DB
            self.graph.open((root + "sparql", root + "update"))
        else:
            self.tmppath = mkdtemp()

        if self.store != "SPARQLUpdateStore":
            self.graph.open(self.tmppath, create=True)
        self.michel = URIRef(u'urn:michel')
        self.tarek = URIRef(u'urn:tarek')
        self.bob = URIRef(u'urn:bob')
        self.likes = URIRef(u'urn:likes')
        self.hates = URIRef(u'urn:hates')
        self.pizza = URIRef(u'urn:pizza')
        self.cheese = URIRef(u'urn:cheese')

        # Use regular URIs because SPARQL endpoints like Fuseki alter short names
        self.c1 = URIRef(u'urn:context-1')
        self.c2 = URIRef(u'urn:context-2')

        # delete the graph for each test!
        self.graph.remove((None, None, None))
        for c in self.graph.contexts():
            c.remove((None, None, None))
            assert len(c) == 0
            self.graph.remove_graph(c)

    def tearDown(self):
        self.graph.close()
        if self.store == "SPARQLUpdateStore":
            pass
        else:
            if os.path.isdir(self.tmppath):
                shutil.rmtree(self.tmppath)
            else:
                os.remove(self.tmppath)

    def testGraphAware(self):
        if not self.graph.store.graph_aware:
            return

        g = self.graph
        g1 = g.graph(self.c1)

        # Some SPARQL endpoint backends (e.g. TDB) do not consider
        # empty named graphs
        if self.store != "SPARQLUpdateStore":
            # added graph exists
            self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                             set([self.c1, DATASET_DEFAULT_GRAPH_ID]))

        # added graph is empty
        self.assertEqual(len(g1), 0)

        g1.add((self.tarek, self.likes, self.pizza))

        # added graph still exists
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([self.c1, DATASET_DEFAULT_GRAPH_ID]))

        # added graph contains one triple
        self.assertEqual(len(g1), 1)

        g1.remove((self.tarek, self.likes, self.pizza))

        # added graph is empty
        self.assertEqual(len(g1), 0)

        # Some SPARQL endpoint backends (e.g. TDB) do not consider
        # empty named graphs
        if self.store != "SPARQLUpdateStore":
            # graph still exists, although empty
            self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                             set([self.c1, DATASET_DEFAULT_GRAPH_ID]))

        g.remove_graph(self.c1)

        # graph is gone
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([DATASET_DEFAULT_GRAPH_ID]))

    def testDefaultGraph(self):
        # Sometimes the default graph is read-only (e.g. TDB in union mode)
        if self.store == "SPARQLUpdateStore":
            print("Please make sure updating the default graph "
                  "is supported by your SPARQL endpoint")

        self.graph.add((self.tarek, self.likes, self.pizza))
        self.assertEqual(len(self.graph), 1)
        # only default exists
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([DATASET_DEFAULT_GRAPH_ID]))

        # removing default graph removes triples but not actual graph
        self.graph.remove_graph(DATASET_DEFAULT_GRAPH_ID)

        self.assertEqual(len(self.graph), 0)
        # default still exists
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([DATASET_DEFAULT_GRAPH_ID]))

    def testNotUnion(self):
        # Union depends on the SPARQL endpoint configuration
        if self.store == "SPARQLUpdateStore":
            print("Please make sure your SPARQL endpoint has not configured "
                  "its default graph as the union of the named graphs")

        g1 = self.graph.graph(self.c1)
        g1.add((self.tarek, self.likes, self.pizza))

        self.assertEqual(list(self.graph.objects(self.tarek, None)), [])
        self.assertEqual(list(g1.objects(self.tarek, None)), [self.pizza])
class DatasetTestCase(unittest.TestCase):
    store = 'default'
    slow = True
    tmppath = None

    def setUp(self):
        try:
            self.graph = Dataset(store=self.store)
        except ImportError:
            raise SkipTest(
                "Dependencies for store '%s' not available!" % self.store)
        if self.store == "SQLite":
            _, self.tmppath = mkstemp(
                prefix='test', dir='/tmp', suffix='.sqlite')
        else:
            self.tmppath = mkdtemp()
        self.graph.open(self.tmppath, create=True)
        self.michel = URIRef(u'michel')
        self.tarek = URIRef(u'tarek')
        self.bob = URIRef(u'bob')
        self.likes = URIRef(u'likes')
        self.hates = URIRef(u'hates')
        self.pizza = URIRef(u'pizza')
        self.cheese = URIRef(u'cheese')

        self.c1 = URIRef(u'context-1')
        self.c2 = URIRef(u'context-2')

        # delete the graph for each test!
        self.graph.remove((None, None, None))

    def tearDown(self):
        self.graph.close()
        if os.path.isdir(self.tmppath):
            shutil.rmtree(self.tmppath)
        else:
            os.remove(self.tmppath)

    def testGraphAware(self):
        if not self.graph.store.graph_aware:
            return

        g = self.graph
        g1 = g.graph(self.c1)

        # added graph exists
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([self.c1, DATASET_DEFAULT_GRAPH_ID]))

        # added graph is empty
        self.assertEqual(len(g1), 0)

        g1.add((self.tarek, self.likes, self.pizza))

        # added graph still exists
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([self.c1, DATASET_DEFAULT_GRAPH_ID]))

        # added graph contains one triple
        self.assertEqual(len(g1), 1)

        g1.remove((self.tarek, self.likes, self.pizza))

        # added graph is empty
        self.assertEqual(len(g1), 0)

        # graph still exists, although empty
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([self.c1, DATASET_DEFAULT_GRAPH_ID]))

        g.remove_graph(self.c1)

        # graph is gone
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([DATASET_DEFAULT_GRAPH_ID]))

    def testDefaultGraph(self):
        self.graph.add((self.tarek, self.likes, self.pizza))
        self.assertEqual(len(self.graph), 1)
        # only default exists
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([DATASET_DEFAULT_GRAPH_ID]))

        # removing default graph removes triples but not actual graph
        self.graph.remove_graph(DATASET_DEFAULT_GRAPH_ID)

        self.assertEqual(len(self.graph), 0)
        # default still exists
        self.assertEqual(set(x.identifier for x in self.graph.contexts()),
                         set([DATASET_DEFAULT_GRAPH_ID]))

    def testNotUnion(self):
        g1 = self.graph.graph(self.c1)
        g1.add((self.tarek, self.likes, self.pizza))

        self.assertEqual(list(self.graph.objects(self.tarek, None)), [])
        self.assertEqual(list(g1.objects(self.tarek, None)), [self.pizza])
def main(source, target, geometryfile='data/point2wkt.json'):
    with open(source) as infile:
        data = json.load(infile)

    with open(geometryfile) as infile:
        point2wkt = json.load(infile)

    ds = Dataset()
    dataset = lp.term('')

    g = rdfSubject.db = ds.graph(identifier=lp)

    ### Custom triples / Ontology
    g.add((lpOnt.Adres, OWL.equivalentClass, schema.PostalAddress))
    g.add((lpOnt.Straat, OWL.equivalentClass, hg.Street))
    g.add((lpOnt.Buurt, OWL.equivalentClass, hg.Neighbourhood))
    g.add((lpOnt.adres, OWL.equivalentProperty, schema.address))

    ########
    # Data #
    ########

    adres2locatie = defaultdict(lambda: defaultdict(list))

    for n, adresLabel in enumerate(data, 1):
        if n % 5000 == 0:
            print(f"{n}/{len(data)}", end='\r')
            # break

        # # geometry
        # wkt = point2wkt.get(locatiepunt)
        # wktLiteral = Literal(wkt, datatype=geo.wktLiteral)
        # geometry = Geometry(lpGeo.term(str(locatiepunt)),
        #                     asWKT=wktLiteral,
        #                     label=[str(locatiepunt)])

        addresses = getAdres(data[adresLabel], adresLabel, point2wkt)

        # adres2locatie[adres][year].append(geometry)

        # observations.append(locpdetail)
        # locp.observation = observations

        # addresses.append(
        #     Role(
        #         None,
        #         label=address.label,
        #         address=address,
        #         hasLatestBeginTimeStamp=locpdetail.hasLatestBeginTimeStamp,
        #         hasEarliestEndTimeStamp=locpdetail.hasEarliestEndTimeStamp,
        #         startDate=Literal(year, datatype=XSD.gYear)))

    ds.bind('create', create)
    ds.bind('schema', schema)
    ds.bind('sem', sem)
    ds.bind('geo', geo)
    ds.bind('juso', juso)
    ds.bind('qb', qb)
    ds.bind('void', void)

    print("Serializing!")
    ds.serialize(target, format='trig')
# We initialize a dataset, and bind our namespaces
dataset = Dataset()
dataset.bind('g13data', DATA)
dataset.bind('g13vocab', VOCAB)
dataset.bind('g13set', SETNAME)
dataset.bind('geo', GEO)
dataset.bind('geof', GEOF)
dataset.bind('dbo', DBO)
dataset.bind('dbp', DBP)
dataset.bind('schema', SCHEMA)
dataset.bind('vcard', VCARD)
dataset.bind('wgs', WGS)
dataset.bind('void', VOID)

# We then get a new graph object with our URI from the dataset.
graph = dataset.graph(graph_uri)

# Load the externally defined schema into the default graph (context) of the dataset
# dataset.default_context.parse('vocab.ttl', format='turtle')

# Let's iterate over the dictionary, and create some triples
# Let's pretend we know exactly what the 'schema' of our CSV file is
for row in csv_contents:
    thing = ""
    points = ""
    lat = ""
    lng = ""
    name = ""
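    # A hedged sketch of how each row might be turned into triples; the column
    # names ('thing', 'name', 'lat', 'lng') and vocabulary terms used here are
    # assumptions for illustration (and assume the usual rdflib RDF/XSD/Literal
    # imports), not the original mapping.
    thing = DATA[row['thing'].replace(' ', '_')]
    graph.add((thing, RDF.type, VOCAB['Thing']))
    graph.add((thing, VCARD['fn'], Literal(row['name'])))
    graph.add((thing, WGS['lat'], Literal(row['lat'], datatype=XSD.decimal)))
    graph.add((thing, WGS['long'], Literal(row['lng'], datatype=XSD.decimal)))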
def data_structure_definition(profile, dataset_name, dataset_base_uri, variables, source_path, source_hash): """Converts the dataset + variables to a set of rdflib Graphs (a nanopublication with provenance annotations) that contains the data structure definition (from the DataCube vocabulary) and the mappings to external datasets. Arguments: dataset -- the name of the dataset variables -- the list of dictionaries with the variables and their mappings to URIs profile -- the Google signin profile source_path -- the path to the dataset file that was annotated source_hash -- the Git hash of the dataset file version of the dataset :returns: an RDF graph store containing a nanopublication """ BASE = Namespace('{}/'.format(dataset_base_uri)) dataset_uri = URIRef(dataset_base_uri) # Initialize a conjunctive graph for the whole lot rdf_dataset = Dataset() rdf_dataset.bind('qbrv', QBRV) rdf_dataset.bind('qbr', QBR) rdf_dataset.bind('qb', QB) rdf_dataset.bind('skos', SKOS) rdf_dataset.bind('prov', PROV) rdf_dataset.bind('np', NP) rdf_dataset.bind('foaf', FOAF) # Initialize the graphs needed for the nanopublication timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M") # Shorten the source hash to 8 digits (similar to Github) source_hash = source_hash[:8] hash_part = source_hash + '/' + timestamp # The Nanopublication consists of three graphs assertion_graph_uri = BASE['assertion/' + hash_part] assertion_graph = rdf_dataset.graph(assertion_graph_uri) provenance_graph_uri = BASE['provenance/' + hash_part] provenance_graph = rdf_dataset.graph(provenance_graph_uri) pubinfo_graph_uri = BASE['pubinfo/' + hash_part] pubinfo_graph = rdf_dataset.graph(pubinfo_graph_uri) # A URI that represents the author author_uri = QBR['person/' + profile['email']] rdf_dataset.add((author_uri, RDF.type, FOAF['Person'])) rdf_dataset.add((author_uri, FOAF['name'], Literal(profile['name']))) rdf_dataset.add((author_uri, FOAF['email'], Literal(profile['email']))) rdf_dataset.add((author_uri, QBRV['googleId'], Literal(profile['id']))) try: rdf_dataset.add( (author_uri, FOAF['depiction'], URIRef(profile['image']))) except KeyError: pass # A URI that represents the version of the dataset source file dataset_version_uri = BASE[source_hash] # Some information about the source file used rdf_dataset.add((dataset_version_uri, QBRV['path'], Literal(source_path, datatype=XSD.string))) rdf_dataset.add((dataset_version_uri, QBRV['sha1_hash'], Literal(source_hash, datatype=XSD.string))) # ---- # The nanopublication itself # ---- nanopublication_uri = BASE['nanopublication/' + hash_part] rdf_dataset.add((nanopublication_uri, RDF.type, NP['Nanopublication'])) rdf_dataset.add( (nanopublication_uri, NP['hasAssertion'], assertion_graph_uri)) rdf_dataset.add((assertion_graph_uri, RDF.type, NP['Assertion'])) rdf_dataset.add( (nanopublication_uri, NP['hasProvenance'], provenance_graph_uri)) rdf_dataset.add((provenance_graph_uri, RDF.type, NP['Provenance'])) rdf_dataset.add( (nanopublication_uri, NP['hasPublicationInfo'], pubinfo_graph_uri)) rdf_dataset.add((pubinfo_graph_uri, RDF.type, NP['PublicationInfo'])) # ---- # The provenance graph # ---- # Provenance information for the assertion graph (the data structure definition itself) provenance_graph.add( (assertion_graph_uri, PROV['wasDerivedFrom'], dataset_version_uri)) provenance_graph.add( (dataset_uri, PROV['wasDerivedFrom'], dataset_version_uri)) provenance_graph.add((assertion_graph_uri, PROV['generatedAtTime'], Literal(timestamp, datatype=XSD.datetime))) provenance_graph.add( 
(assertion_graph_uri, PROV['wasAttributedTo'], author_uri)) # ---- # The publication info graph # ---- # The URI of the latest version of QBer # TODO: should point to the actual latest commit of this QBer source file. # TODO: consider linking to this as the plan of some activity, rather than an activity itself. qber_uri = URIRef('https://github.com/CLARIAH/qber.git') pubinfo_graph.add((nanopublication_uri, PROV['wasGeneratedBy'], qber_uri)) pubinfo_graph.add((nanopublication_uri, PROV['generatedAtTime'], Literal(timestamp, datatype=XSD.datetime))) pubinfo_graph.add( (nanopublication_uri, PROV['wasAttributedTo'], author_uri)) # ---- # The assertion graph # ---- structure_uri = BASE['structure'] assertion_graph.add((dataset_uri, RDF.type, QB['DataSet'])) assertion_graph.add((dataset_uri, RDFS.label, Literal(dataset_name))) assertion_graph.add( (structure_uri, RDF.type, QB['DataStructureDefinition'])) assertion_graph.add((dataset_uri, QB['structure'], structure_uri)) for variable_id, variable in variables.items(): variable_uri = URIRef(variable['original']['uri']) variable_label = Literal(variable['original']['label']) variable_type = URIRef(variable['type']) codelist_uri = URIRef(variable['codelist']['original']['uri']) codelist_label = Literal(variable['codelist']['original']['label']) # The variable as component of the definition component_uri = safe_url(BASE, 'component/' + variable['original']['label']) # Add link between the definition and the component assertion_graph.add((structure_uri, QB['component'], component_uri)) # Add label to variable # TODO: We may need to do something with a changed label for the variable assertion_graph.add((variable_uri, RDFS.label, variable_label)) if 'description' in variable and variable['description'] != "": assertion_graph.add( (variable_uri, RDFS.comment, Literal(variable['description']))) # If the variable URI is not the same as the original, # it is a specialization of a prior variable property. if variable['uri'] != str(variable_uri): assertion_graph.add( (variable_uri, RDFS['subPropertyOf'], URIRef(variable['uri']))) if variable_type == QB['DimensionProperty']: assertion_graph.add((variable_uri, RDF.type, variable_type)) assertion_graph.add((component_uri, QB['dimension'], variable_uri)) # Coded variables are also of type coded property (a subproperty of dimension property) if variable['category'] == 'coded': assertion_graph.add( (variable_uri, RDF.type, QB['CodedProperty'])) elif variable_type == QB['MeasureProperty']: # The category 'other' assertion_graph.add((variable_uri, RDF.type, variable_type)) assertion_graph.add((component_uri, QB['measure'], variable_uri)) elif variable_type == QB['AttributeProperty']: # Actually never produced by QBer at this stage assertion_graph.add((variable_uri, RDF.type, variable_type)) assertion_graph.add((component_uri, QB['attribute'], variable_uri)) # If this variable is of category 'coded', we add codelist and URIs for # each variable (including mappings between value uris and etc....) if variable['category'] == 'coded': assertion_graph.add((codelist_uri, RDF.type, SKOS['Collection'])) assertion_graph.add( (codelist_uri, RDFS.label, Literal(codelist_label))) # The variable should point to the codelist assertion_graph.add((variable_uri, QB['codeList'], codelist_uri)) # The variable is mapped onto an external code list. # If the codelist uri is not the same as the original one, we # have a derived codelist. 
if variable['codelist']['uri'] != str(codelist_uri): assertion_graph.add((codelist_uri, PROV['wasDerivedFrom'], URIRef(variable['codelist']['uri']))) # Generate a SKOS concept for each of the values and map it to the # assigned codelist for value in variable['values']: value_uri = URIRef(value['original']['uri']) value_label = Literal(value['original']['label']) assertion_graph.add((value_uri, RDF.type, SKOS['Concept'])) assertion_graph.add( (value_uri, SKOS['prefLabel'], Literal(value_label))) assertion_graph.add((codelist_uri, SKOS['member'], value_uri)) # The value has been changed, and therefore there is a mapping if value['original']['uri'] != value['uri']: assertion_graph.add( (value_uri, SKOS['exactMatch'], URIRef(value['uri']))) assertion_graph.add( (value_uri, RDFS.label, Literal(value['label']))) elif variable['category'] == 'identifier': # Generate a SKOS concept for each of the values for value in variable['values']: value_uri = URIRef(value['original']['uri']) value_label = Literal(value['original']['label']) assertion_graph.add((value_uri, RDF.type, SKOS['Concept'])) assertion_graph.add( (value_uri, SKOS['prefLabel'], value_label)) # The value has been changed, and therefore there is a mapping if value['original']['uri'] != value['uri']: assertion_graph.add( (value_uri, SKOS['exactMatch'], URIRef(value['uri']))) assertion_graph.add( (value_uri, RDFS.label, Literal(value['label']))) elif variable['category'] == 'other': # Generate a literal for each of the values when converting the dataset (but not here) pass return rdf_dataset
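# A hedged usage sketch for data_structure_definition() above: the profile,
# variable mapping, paths and hash below are invented placeholders that only
# illustrate the expected argument shapes, not real QBer input; it also assumes
# the module-level QB namespace and safe_url() helper used in the function.
profile = {"email": "user@example.org", "name": "Example User", "id": "1234567890"}
variables = {
    "sex": {
        "original": {"uri": "http://example.org/variable/sex", "label": "sex"},
        "uri": "http://example.org/variable/sex",
        "type": str(QB["DimensionProperty"]),
        "category": "coded",
        "codelist": {
            "original": {"uri": "http://example.org/codelist/sex", "label": "sex codes"},
            "uri": "http://example.org/codelist/sex",
        },
        "values": [],
        "description": "",
    }
}
rdf_dataset = data_structure_definition(
    profile, "example-dataset", "http://example.org/dataset/example",
    variables, "data/example.csv", "0123456789abcdef")
print(rdf_dataset.serialize(format="trig"))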
class RDFtoUmlDiagram(): """ Transform a RDF dataset to an UML diagram """ def __init__(self, showObjs, showClasses, namespace): self.ds = Dataset() self.d = UmlPygraphVizDiagram() self.show_objs = showObjs self.show_classes = showClasses self.namespace = namespace self.add_namespaces(self.namespace) def load_rdf(self, filename, input_format=None): if input_format: rdf_format = input_format elif filename is not sys.stdin: format_list = {'.xml': 'xml', '.rdf': 'xml', '.owl': 'xml', '.n3': 'n3', '.ttl': 'turtle', '.nt': 'nt', '.trig': 'trig', '.nq': 'nquads', '': 'turtle'} extension = splitext(filename.name)[1] rdf_format = format_list[extension] else: rdf_format = 'turtle' print("using rdf format: " + rdf_format) temp = self.ds.graph("file://"+filename.name) temp.parse(filename.name, format=rdf_format) def add_namespaces(self, namespaces): if namespaces: for ns in namespaces: self.ds.namespace_manager.bind(ns[0],ns[1]) def start_subgraph(self, graph_name): self.d.start_subgraph(graph_name.strip('[<>]:_')) def add_object_node(self, object_name, classes_name, attributes): self.d.add_object_node(self.ds.namespace_manager.qname(object_name), classes_name, attributes) def add_class_node(self, class_name, attributes): self.d.add_class_node(self.ds.namespace_manager.qname(class_name), attributes) def add_edge(self, src, dst, predicate): self.d.add_edge(self.ds.namespace_manager.qname(src), self.ds.namespace_manager.qname(dst), self.ds.namespace_manager.qname(predicate)) def add_subclass_edge(self, src, dst): self.d.add_subclass_edge(self.ds.namespace_manager.qname(src), self.ds.namespace_manager.qname(dst)) def create_namespace_box(self): # Create Namespace box label = """< <table align="left" cellborder="0"> <tr><td align='center' colspan='2'><b>Namespaces</b></td></tr>""" for ns in sorted(self.ds.namespaces()): label += "<tr><td align='left'>%s:</td><td align='left'>%s</td></tr>" % (ns[0], ns[1] ) label += "</table> >" self.d.set_label(label) def output_dot(self, filename): self.d.write_to_file(filename) def visualize(self, filename): self.d.visualize(filename, self.ds.namespaces()) def create_diagram(self, object_nodes=True, class_nodes=False): # Iterate over all graphs for graph in self.ds.contexts(): graph_name = graph.n3() if graph_name == "[<urn:x-rdflib:default>]": break graph = graph.skolemize() if len(graph) > 0: self.start_subgraph(graph_name) if self.show_objs: self.create_object_nodes(graph) if self.show_classes: self.create_class_nodes(graph) self.d.add_undescribed_nodes() self.create_namespace_box() def create_object_nodes(self, graph): # object nodes query_nodes = """PREFIX owl: <http://www.w3.org/2002/07/owl#> SELECT DISTINCT ?node WHERE { ?node a ?class. FILTER (?class not IN (rdfs:Class, owl:Class, owl:Property, owl:ObjectProperty, owl:DatatypeProperty)) } ORDER BY ?node""" result_nodes = graph.query(query_nodes) for row_nodes in result_nodes: # adding the classes to the node (can be more than one) query_classes = """SELECT DISTINCT ?class WHERE { %s a ?class. } ORDER BY ?class""" % row_nodes['node'].n3() result_classes = graph.query(query_classes) classes = [] for row_classes in result_classes: if not self.show_classes: classes.append(self.ds.namespace_manager.qname(row_classes['class'])) else: self.add_edge(row_nodes['node'], row_classes['class'], "http://www.w3.org/1999/02/22-rdf-syntax-ns#type") # adding the attributes to the node query_attributes = """SELECT DISTINCT ?p ?o WHERE { %s ?p ?o. 
FILTER (isLiteral(?o)) } ORDER BY ?p ?o""" % row_nodes['node'].n3() result_attributes = graph.query(query_attributes) attributes = [] for row_attributes in result_attributes: attributes.append( self.ds.namespace_manager.qname(row_attributes['p']) + " = " + str(row_attributes['o'])) self.add_object_node(row_nodes['node'], ", ".join(classes), attributes) # object node connections query_connections = """SELECT DISTINCT ?c1 ?c2 ?p WHERE { ?c1 ?p ?c2. FILTER (!isLiteral(?c2)) FILTER (?p not IN (rdf:type, rdfs:domain, rdfs:range, rdfs:subClassOf)) } ORDER BY ?c1 ?p ?c2""" result_connections = graph.query(query_connections) for row_connections in result_connections: self.add_edge(row_connections['c1'], row_connections['c2'], row_connections['p']) def create_class_nodes(self, graph): # RDFS stuff query_classes = """PREFIX owl: <http://www.w3.org/2002/07/owl#> SELECT DISTINCT ?class WHERE { ?class a ?c . FILTER (?c in (rdfs:Class, owl:Class)) } ORDER BY ?class""" result_classes = graph.query(query_classes) for row_classes in result_classes: query_datatype_property = """ PREFIX owl: <http://www.w3.org/2002/07/owl#> PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> SELECT DISTINCT ?property ?range WHERE { ?property rdfs:domain %s; a owl:DatatypeProperty. OPTIONAL{ ?property rdfs:range ?range. } } ORDER BY ?property""" % row_classes['class'].n3() result_datatype_property = graph.query(query_datatype_property) attributes = [] for r in result_datatype_property: text = self.ds.namespace_manager.qname(r['property']) if r['range']: text += " = " + self.ds.namespace_manager.qname(r['range']) attributes.append(text) self.add_class_node(row_classes['class'], attributes) query_object_property = """SELECT DISTINCT ?src ?dest ?property WHERE { ?property a <http://www.w3.org/2002/07/owl#ObjectProperty>; rdfs:domain ?src; rdfs:range ?dest. } ORDER BY ?src ?property ?dest""" result_object_property = graph.query(query_object_property) for row_object_property in result_object_property: self.add_edge(row_object_property['src'], row_object_property['dest'], row_object_property['property']) query_subclass = """SELECT DISTINCT ?src ?dest WHERE { ?src rdfs:subClassOf ?dest. } ORDER BY ?src ?dest""" result_subclass = graph.query(query_subclass) for row_subclass in result_subclass: self.add_subclass_edge(row_subclass['src'], row_subclass['dest'])
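# A hedged driver sketch for the RDFtoUmlDiagram class above; the file names are
# placeholders. load_rdf() expects an open file object (it reads .name), and the
# namespace argument is a list of (prefix, uri) pairs or None.
diagram = RDFtoUmlDiagram(showObjs=True, showClasses=False, namespace=None)
with open("data.trig") as infile:
    diagram.load_rdf(infile)
diagram.create_diagram(object_nodes=True, class_nodes=False)
diagram.output_dot("diagram.dot")
diagram.visualize("diagram.png")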
def data_structure_definition(profile, dataset_name, dataset_base_uri, variables, source_path, source_hash): """Converts the dataset + variables to a set of rdflib Graphs (a nanopublication with provenance annotations) that contains the data structure definition (from the DataCube vocabulary) and the mappings to external datasets. Arguments: dataset -- the name of the dataset variables -- the list of dictionaries with the variables and their mappings to URIs profile -- the Google signin profile source_path -- the path to the dataset file that was annotated source_hash -- the Git hash of the dataset file version of the dataset :returns: an RDF graph store containing a nanopublication """ BASE = Namespace("{}/".format(dataset_base_uri)) dataset_uri = URIRef(dataset_base_uri) # Initialize a conjunctive graph for the whole lot rdf_dataset = Dataset() rdf_dataset.bind("qbrv", QBRV) rdf_dataset.bind("qbr", QBR) rdf_dataset.bind("qb", QB) rdf_dataset.bind("skos", SKOS) rdf_dataset.bind("prov", PROV) rdf_dataset.bind("np", NP) rdf_dataset.bind("foaf", FOAF) # Initialize the graphs needed for the nanopublication timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M") # Shorten the source hash to 8 digits (similar to Github) source_hash = source_hash[:8] hash_part = source_hash + "/" + timestamp # The Nanopublication consists of three graphs assertion_graph_uri = BASE["assertion/" + hash_part] assertion_graph = rdf_dataset.graph(assertion_graph_uri) provenance_graph_uri = BASE["provenance/" + hash_part] provenance_graph = rdf_dataset.graph(provenance_graph_uri) pubinfo_graph_uri = BASE["pubinfo/" + hash_part] pubinfo_graph = rdf_dataset.graph(pubinfo_graph_uri) # A URI that represents the author author_uri = QBR["person/" + profile["email"]] rdf_dataset.add((author_uri, RDF.type, FOAF["Person"])) rdf_dataset.add((author_uri, FOAF["name"], Literal(profile["name"]))) rdf_dataset.add((author_uri, FOAF["email"], Literal(profile["email"]))) rdf_dataset.add((author_uri, QBRV["googleId"], Literal(profile["id"]))) try: rdf_dataset.add((author_uri, FOAF["depiction"], URIRef(profile["image"]))) except KeyError: pass # A URI that represents the version of the dataset source file dataset_version_uri = BASE[source_hash] # Some information about the source file used rdf_dataset.add((dataset_version_uri, QBRV["path"], Literal(source_path, datatype=XSD.string))) rdf_dataset.add((dataset_version_uri, QBRV["sha1_hash"], Literal(source_hash, datatype=XSD.string))) # ---- # The nanopublication itself # ---- nanopublication_uri = BASE["nanopublication/" + hash_part] rdf_dataset.add((nanopublication_uri, RDF.type, NP["Nanopublication"])) rdf_dataset.add((nanopublication_uri, NP["hasAssertion"], assertion_graph_uri)) rdf_dataset.add((assertion_graph_uri, RDF.type, NP["Assertion"])) rdf_dataset.add((nanopublication_uri, NP["hasProvenance"], provenance_graph_uri)) rdf_dataset.add((provenance_graph_uri, RDF.type, NP["Provenance"])) rdf_dataset.add((nanopublication_uri, NP["hasPublicationInfo"], pubinfo_graph_uri)) rdf_dataset.add((pubinfo_graph_uri, RDF.type, NP["PublicationInfo"])) # ---- # The provenance graph # ---- # Provenance information for the assertion graph (the data structure definition itself) provenance_graph.add((assertion_graph_uri, PROV["wasDerivedFrom"], dataset_version_uri)) provenance_graph.add((dataset_uri, PROV["wasDerivedFrom"], dataset_version_uri)) provenance_graph.add((assertion_graph_uri, PROV["generatedAtTime"], Literal(timestamp, datatype=XSD.datetime))) 
provenance_graph.add((assertion_graph_uri, PROV["wasAttributedTo"], author_uri)) # ---- # The publication info graph # ---- # The URI of the latest version of QBer # TODO: should point to the actual latest commit of this QBer source file. # TODO: consider linking to this as the plan of some activity, rather than an activity itself. qber_uri = URIRef("https://github.com/CLARIAH/qber.git") pubinfo_graph.add((nanopublication_uri, PROV["wasGeneratedBy"], qber_uri)) pubinfo_graph.add((nanopublication_uri, PROV["generatedAtTime"], Literal(timestamp, datatype=XSD.datetime))) pubinfo_graph.add((nanopublication_uri, PROV["wasAttributedTo"], author_uri)) # ---- # The assertion graph # ---- structure_uri = BASE["structure"] assertion_graph.add((dataset_uri, RDF.type, QB["DataSet"])) assertion_graph.add((dataset_uri, RDFS.label, Literal(dataset_name))) assertion_graph.add((structure_uri, RDF.type, QB["DataStructureDefinition"])) assertion_graph.add((dataset_uri, QB["structure"], structure_uri)) for variable_id, variable in variables.items(): variable_uri = URIRef(variable["original"]["uri"]) variable_label = Literal(variable["original"]["label"]) variable_type = URIRef(variable["type"]) codelist_uri = URIRef(variable["codelist"]["original"]["uri"]) codelist_label = Literal(variable["codelist"]["original"]["label"]) # The variable as component of the definition component_uri = safe_url(BASE, "component/" + variable["original"]["label"]) # Add link between the definition and the component assertion_graph.add((structure_uri, QB["component"], component_uri)) # Add label to variable # TODO: We may need to do something with a changed label for the variable assertion_graph.add((variable_uri, RDFS.label, variable_label)) if "description" in variable and variable["description"] != "": assertion_graph.add((variable_uri, RDFS.comment, Literal(variable["description"]))) # If the variable URI is not the same as the original, # it is a specialization of a prior variable property. if variable["uri"] != str(variable_uri): assertion_graph.add((variable_uri, RDFS["subPropertyOf"], URIRef(variable["uri"]))) if variable_type == QB["DimensionProperty"]: assertion_graph.add((variable_uri, RDF.type, variable_type)) assertion_graph.add((component_uri, QB["dimension"], variable_uri)) # Coded variables are also of type coded property (a subproperty of dimension property) if variable["category"] == "coded": assertion_graph.add((variable_uri, RDF.type, QB["CodedProperty"])) elif variable_type == QB["MeasureProperty"]: # The category 'other' assertion_graph.add((variable_uri, RDF.type, variable_type)) assertion_graph.add((component_uri, QB["measure"], variable_uri)) elif variable_type == QB["AttributeProperty"]: # Actually never produced by QBer at this stage assertion_graph.add((variable_uri, RDF.type, variable_type)) assertion_graph.add((component_uri, QB["attribute"], variable_uri)) # If this variable is of category 'coded', we add codelist and URIs for # each variable (including mappings between value uris and etc....) if variable["category"] == "coded": assertion_graph.add((codelist_uri, RDF.type, SKOS["Collection"])) assertion_graph.add((codelist_uri, RDFS.label, Literal(codelist_label))) # The variable should point to the codelist assertion_graph.add((variable_uri, QB["codeList"], codelist_uri)) # The variable is mapped onto an external code list. # If the codelist uri is not the same as the original one, we # have a derived codelist. 
if variable["codelist"]["uri"] != str(codelist_uri): assertion_graph.add((codelist_uri, PROV["wasDerivedFrom"], URIRef(variable["codelist"]["uri"]))) # Generate a SKOS concept for each of the values and map it to the # assigned codelist for value in variable["values"]: value_uri = URIRef(value["original"]["uri"]) value_label = Literal(value["original"]["label"]) assertion_graph.add((value_uri, RDF.type, SKOS["Concept"])) assertion_graph.add((value_uri, SKOS["prefLabel"], Literal(value_label))) assertion_graph.add((codelist_uri, SKOS["member"], value_uri)) # The value has been changed, and therefore there is a mapping if value["original"]["uri"] != value["uri"]: assertion_graph.add((value_uri, SKOS["exactMatch"], URIRef(value["uri"]))) assertion_graph.add((value_uri, RDFS.label, Literal(value["label"]))) elif variable["category"] == "identifier": # Generate a SKOS concept for each of the values for value in variable["values"]: value_uri = URIRef(value["original"]["uri"]) value_label = Literal(value["original"]["label"]) assertion_graph.add((value_uri, RDF.type, SKOS["Concept"])) assertion_graph.add((value_uri, SKOS["prefLabel"], value_label)) # The value has been changed, and therefore there is a mapping if value["original"]["uri"] != value["uri"]: assertion_graph.add((value_uri, SKOS["exactMatch"], URIRef(value["uri"]))) assertion_graph.add((value_uri, RDFS.label, Literal(value["label"]))) elif variable["category"] == "other": # Generate a literal for each of the values when converting the dataset (but not here) pass return rdf_dataset
class DatasetTestCase(unittest.TestCase): store = "default" slow = True tmppath = None def setUp(self): try: self.graph = Dataset(store=self.store) except ImportError: raise SkipTest("Dependencies for store '%s' not available!" % self.store) if self.store == "SQLite": _, self.tmppath = mkstemp(prefix="test", dir="/tmp", suffix=".sqlite") elif self.store == "SPARQLUpdateStore": root = HOST + DB self.graph.open((root + "sparql", root + "update")) else: self.tmppath = mkdtemp() if self.store != "SPARQLUpdateStore": self.graph.open(self.tmppath, create=True) self.michel = URIRef("urn:michel") self.tarek = URIRef("urn:tarek") self.bob = URIRef("urn:bob") self.likes = URIRef("urn:likes") self.hates = URIRef("urn:hates") self.pizza = URIRef("urn:pizza") self.cheese = URIRef("urn:cheese") # Use regular URIs because SPARQL endpoints like Fuseki alter short names self.c1 = URIRef("urn:context-1") self.c2 = URIRef("urn:context-2") # delete the graph for each test! self.graph.remove((None, None, None)) for c in self.graph.contexts(): c.remove((None, None, None)) assert len(c) == 0 self.graph.remove_graph(c) def tearDown(self): self.graph.close() if self.store == "SPARQLUpdateStore": pass else: if os.path.isdir(self.tmppath): shutil.rmtree(self.tmppath) else: os.remove(self.tmppath) def testGraphAware(self): if not self.graph.store.graph_aware: return g = self.graph g1 = g.graph(self.c1) # Some SPARQL endpoint backends (e.g. TDB) do not consider # empty named graphs if self.store != "SPARQLUpdateStore": # added graph exists self.assertEqual( set(x.identifier for x in self.graph.contexts()), set([self.c1, DATASET_DEFAULT_GRAPH_ID]), ) # added graph is empty self.assertEqual(len(g1), 0) g1.add((self.tarek, self.likes, self.pizza)) # added graph still exists self.assertEqual( set(x.identifier for x in self.graph.contexts()), set([self.c1, DATASET_DEFAULT_GRAPH_ID]), ) # added graph contains one triple self.assertEqual(len(g1), 1) g1.remove((self.tarek, self.likes, self.pizza)) # added graph is empty self.assertEqual(len(g1), 0) # Some SPARQL endpoint backends (e.g. TDB) do not consider # empty named graphs if self.store != "SPARQLUpdateStore": # graph still exists, although empty self.assertEqual( set(x.identifier for x in self.graph.contexts()), set([self.c1, DATASET_DEFAULT_GRAPH_ID]), ) g.remove_graph(self.c1) # graph is gone self.assertEqual( set(x.identifier for x in self.graph.contexts()), set([DATASET_DEFAULT_GRAPH_ID]), ) def testDefaultGraph(self): # Something the default graph is read-only (e.g. 
TDB in union mode) if self.store == "SPARQLUpdateStore": print("Please make sure updating the default graph " "is supported by your SPARQL endpoint") self.graph.add((self.tarek, self.likes, self.pizza)) self.assertEqual(len(self.graph), 1) # only default exists self.assertEqual( set(x.identifier for x in self.graph.contexts()), set([DATASET_DEFAULT_GRAPH_ID]), ) # removing default graph removes triples but not actual graph self.graph.remove_graph(DATASET_DEFAULT_GRAPH_ID) self.assertEqual(len(self.graph), 0) # default still exists self.assertEqual( set(x.identifier for x in self.graph.contexts()), set([DATASET_DEFAULT_GRAPH_ID]), ) def testNotUnion(self): # Union depends on the SPARQL endpoint configuration if self.store == "SPARQLUpdateStore": print("Please make sure your SPARQL endpoint has not configured " "its default graph as the union of the named graphs") g1 = self.graph.graph(self.c1) g1.add((self.tarek, self.likes, self.pizza)) self.assertEqual(list(self.graph.objects(self.tarek, None)), []) self.assertEqual(list(g1.objects(self.tarek, None)), [self.pizza]) def testIter(self): """PR 1382: adds __iter__ to Dataset""" d = Dataset() uri_a = URIRef("https://example.com/a") uri_b = URIRef("https://example.com/b") uri_c = URIRef("https://example.com/c") uri_d = URIRef("https://example.com/d") d.add_graph(URIRef("https://example.com/g1")) d.add((uri_a, uri_b, uri_c, URIRef("https://example.com/g1"))) d.add((uri_a, uri_b, uri_c, URIRef("https://example.com/g1") )) # pointless addition: duplicates above d.add_graph(URIRef("https://example.com/g2")) d.add((uri_a, uri_b, uri_c, URIRef("https://example.com/g2"))) d.add((uri_a, uri_b, uri_d, URIRef("https://example.com/g1"))) # new, uri_d # traditional iterator i_trad = 0 for t in d.quads((None, None, None)): i_trad += 1 # new Dataset.__iter__ iterator i_new = 0 for t in d: i_new += 1 self.assertEqual(i_new, i_trad) # both should be 3
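# Small standalone sketch of what testIter relies on (PR 1382): iterating a
# Dataset directly yields quads, matching Dataset.quads() over an open pattern.
# The URIs are illustrative only.
from rdflib import Dataset, URIRef

d = Dataset()
g1 = URIRef("https://example.com/g1")
d.add_graph(g1)
d.add((URIRef("https://example.com/a"), URIRef("https://example.com/b"),
       URIRef("https://example.com/c"), g1))
assert len(list(d)) == len(list(d.quads((None, None, None))))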
class BurstConverter(object): """The actual converter, that processes the chunk of lines from the CSV file, and uses the instructions from the ``schema`` graph to produce RDF.""" def __init__(self, identifier, columns, schema, metadata_graph, encoding, output_format): self.ds = Dataset() # self.ds = apply_default_namespaces(Dataset()) self.g = self.ds.graph(URIRef(identifier)) self.columns = columns self.schema = schema self.metadata_graph = metadata_graph self.encoding = encoding self.output_format = output_format self.templates = {} self.aboutURLSchema = self.schema.csvw_aboutUrl def equal_to_null(self, nulls, row): """Determines whether a value in a cell matches a 'null' value as specified in the CSVW schema)""" for n in nulls: n = Item(self.metadata_graph, n) col = str(n.csvw_name) val = str(n.csvw_null) if row[col] == val: logger.debug("Value of column {} ('{}') is equal to specified 'null' value: '{}'".format(col, unicode(row[col]).encode('utf-8'), val)) # There is a match with null value return True # There is no match with null value return False def process(self, count, rows, chunksize): """Process the rows fed to the converter. Count and chunksize are used to determine the current row number (needed for default observation identifiers)""" obs_count = count * chunksize # logger.info("Row: {}".format(obs_count)) #removed for readability # We iterate row by row, and then column by column, as given by the CSVW mapping file. mult_proc_counter = 0 iter_error_counter= 0 for row in rows: # This fixes issue:10 if row is None: mult_proc_counter += 1 # logger.debug( #removed for readability # "Skipping empty row caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...") continue # set the '_row' value in case we need to generate 'default' URIs for each observation () # logger.debug("row: {}".format(obs_count)) #removed for readability row[u'_row'] = obs_count count += 1 # The self.columns dictionary gives the mapping definition per column in the 'columns' # array of the CSVW tableSchema definition. for c in self.columns: c = Item(self.metadata_graph, c) # default about URL s = self.expandURL(self.aboutURLSchema, row) try: # Can also be used to prevent the triggering of virtual # columns! # Get the raw value from the cell in the CSV file value = row[unicode(c.csvw_name)] # This checks whether we should continue parsing this cell, or skip it. if self.isValueNull(value, c): continue # If the null values are specified in an array, we need to parse it as a collection (list) elif isinstance(c.csvw_null, Item): nulls = Collection(self.metadata_graph, BNode(c.csvw_null)) if self.equal_to_null(nulls, row): # Continue to next column specification in this row, if the value is equal to (one of) the null values. continue except: # No column name specified (virtual) because there clearly was no c.csvw_name key in the row. # logger.debug(traceback.format_exc()) #removed for readability iter_error_counter +=1 if isinstance(c.csvw_null, Item): nulls = Collection(self.metadata_graph, BNode(c.csvw_null)) if self.equal_to_null(nulls, row): # Continue to next column specification in this row, if the value is equal to (one of) the null values. continue try: # This overrides the subject resource 's' that has been created earlier based on the # schema wide aboutURLSchema specification. 
if unicode(c.csvw_virtual) == u'true' and c.csvw_aboutUrl is not None: s = self.expandURL(c.csvw_aboutUrl, row) if c.csvw_valueUrl is not None: # This is an object property, because the value needs to be cast to a URL p = self.expandURL(c.csvw_propertyUrl, row) o = self.expandURL(c.csvw_valueUrl, row) if self.isValueNull(os.path.basename(unicode(o)), c): logger.debug("skipping empty value") continue if unicode(c.csvw_virtual) == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.anyURI: # Special case: this is a virtual column with object values that are URIs # For now using a test special property value = row[unicode(c.csvw_name)].encode('utf-8') o = URIRef(iribaker.to_iri(value)) if unicode(c.csvw_virtual) == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.linkURI: about_url = str(c.csvw_aboutUrl) about_url = about_url[about_url.find("{"):about_url.find("}")+1] s = self.expandURL(about_url, row) # logger.debug("s: {}".format(s)) value_url = str(c.csvw_valueUrl) value_url = value_url[value_url.find("{"):value_url.find("}")+1] o = self.expandURL(value_url, row) # logger.debug("o: {}".format(o)) # For coded properties, the collectionUrl can be used to indicate that the # value URL is a concept and a member of a SKOS Collection with that URL. if c.csvw_collectionUrl is not None: collection = self.expandURL(c.csvw_collectionUrl, row) self.g.add((collection, RDF.type, SKOS['Collection'])) self.g.add((o, RDF.type, SKOS['Concept'])) self.g.add((collection, SKOS['member'], o)) # For coded properties, the schemeUrl can be used to indicate that the # value URL is a concept and a member of a SKOS Scheme with that URL. if c.csvw_schemeUrl is not None: scheme = self.expandURL(c.csvw_schemeUrl, row) self.g.add((scheme, RDF.type, SKOS['Scheme'])) self.g.add((o, RDF.type, SKOS['Concept'])) self.g.add((o, SKOS['inScheme'], scheme)) else: # This is a datatype property if c.csvw_value is not None: value = self.render_pattern(unicode(c.csvw_value), row) elif c.csvw_name is not None: # print s # print c.csvw_name, self.encoding # print row[unicode(c.csvw_name)], type(row[unicode(c.csvw_name)]) # print row[unicode(c.csvw_name)].encode('utf-8') # print '...' value = row[unicode(c.csvw_name)].encode('utf-8') else: raise Exception("No 'name' or 'csvw:value' attribute found for this column specification") # If propertyUrl is specified, use it, otherwise use # the column name if c.csvw_propertyUrl is not None: p = self.expandURL(c.csvw_propertyUrl, row) else: if "" in self.metadata_graph.namespaces(): propertyUrl = self.metadata_graph.namespaces()[""][ unicode(c.csvw_name)] else: propertyUrl = "{}{}".format(get_namespaces()['sdv'], unicode(c.csvw_name)) p = self.expandURL(propertyUrl, row) if c.csvw_datatype is not None: if URIRef(c.csvw_datatype) == XSD.anyURI: # The xsd:anyURI datatype will be cast to a proper IRI resource. o = URIRef(iribaker.to_iri(value)) elif URIRef(c.csvw_datatype) == XSD.string and c.csvw_lang is not None: # If it is a string datatype that has a language, we turn it into a # language tagged literal # We also render the lang value in case it is a # pattern. o = Literal(value, lang=self.render_pattern( c.csvw_lang, row)) else: o = Literal(value, datatype=c.csvw_datatype, normalize=False) else: # It's just a plain literal without datatype. 
o = Literal(value) # Add the triple to the assertion graph self.g.add((s, p, o)) # Add provenance relating the propertyUrl to the column id if '@id' in c: self.g.add((p, PROV['wasDerivedFrom'], URIRef(c['@id']))) except: # print row[0], value traceback.print_exc() # We increment the observation (row number) with one obs_count += 1 logger.debug( "{} row skips caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...".format(mult_proc_counter)) logger.debug( "{} errors encountered while trying to iterate over a NoneType...".format(mult_proc_counter)) logger.info("... done") return self.ds.serialize(format=self.output_format) # def serialize(self): # trig_file_name = self.file_name + '.trig' # logger.info("Starting serialization to {}".format(trig_file_name)) # # with open(trig_file_name, 'w') as f: # self.np.serialize(f, format='trig') # logger.info("... done") def render_pattern(self, pattern, row): """Takes a Jinja or Python formatted string, and applies it to the row value""" # Significant speedup by not re-instantiating Jinja templates for every # row. if pattern in self.templates: template = self.templates[pattern] else: template = self.templates[pattern] = Template(pattern) # TODO This should take into account the special CSVW instructions such as {_row} # First we interpret the url_pattern as a Jinja2 template, and pass all # column/value pairs as arguments rendered_template = template.render(**row) try: # We then format the resulting string using the standard Python2 # expressions return rendered_template.format(**row) except: logger.warning( u"Could not apply python string formatting, probably due to mismatched curly brackets. IRI will be '{}'. ".format(rendered_template)) return rendered_template def expandURL(self, url_pattern, row, datatype=False): """Takes a Jinja or Python formatted string, applies it to the row values, and returns it as a URIRef""" url = self.render_pattern(unicode(url_pattern), row) # DEPRECATED # for ns, nsuri in namespaces.items(): # if url.startswith(ns): # url = url.replace(ns + ':', nsuri) # break try: iri = iribaker.to_iri(url) rfc3987.parse(iri, rule='IRI') except: raise Exception(u"Cannot convert `{}` to valid IRI".format(url)) # print "Baked: ", iri return URIRef(iri) def isValueNull(self, value, c): """This checks whether we should continue parsing this cell, or skip it because it is empty or a null value.""" try: if len(value) == 0 and unicode(c.csvw_parseOnEmpty) == u"true": print("Not skipping empty value") return False #because it should not be skipped elif len(value) == 0 or value == unicode(c.csvw_null) or value in [unicode(n) for n in c.csvw_null] or value == unicode(self.schema.csvw_null): # Skip value if length is zero and equal to (one of) the null value(s) logger.debug( "Length is 0 or value is equal to specified 'null' value") return True except: logger.debug("null does not exist or is not a list.") return False
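# Minimal sketch of the two-stage pattern expansion used by render_pattern()
# and expandURL() above (simplified; the pattern and row values are invented):
# a CSVW URL template is first rendered as a Jinja2 template against the row
# dict, then passed through str.format for the remaining {column} references.
from jinja2 import Template

row = {"name": "amsterdam", "_row": 1}
pattern = "http://example.org/place/{{ name }}/{_row}"
rendered = Template(pattern).render(**row)  # -> "http://example.org/place/amsterdam/{_row}"
print(rendered.format(**row))               # -> "http://example.org/place/amsterdam/1"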
vocabs['rdavf'] = Namespace('http://rdaregistry.info/termList/videoFormat/')
vocabs['rdagd'] = Namespace('http://rdaregistry.info/termList/gender/')
vocabs['iso639-2'] = Namespace('http://id.loc.gov/vocabulary/iso639-2/')
vocabs['wd'] = Namespace('http://www.wikidata.org/entity/')
vocabs['gnd'] = Namespace('http://d-nb.info/gnd/')
vocabs['rico'] = Namespace('https://www.ica.org/standards/RiC/ontology#')
vocabs['ric-rst'] = Namespace(
    'https://www.ica.org/standards/RiC/vocabularies/recordSetTypes#')
vocabs['ebucore'] = Namespace(
    'https://www.ebu.ch/metadata/ontologies/ebucore#')
vocabs['eclap'] = Namespace('http://www.eclap.eu/schema/eclap/')
vocabs['dct'] = Namespace('http://purl.org/dc/terms/')
vocabs['premis'] = Namespace('http://www.loc.gov/premis/rdf/v1#')

ds = Dataset()
graph = ds.graph(URIRef('http://vocab.performing-arts.ch/container/context'))
graph.bind('rdfs', RDFS)
graph.bind('skos', SKOS)
graph.bind('owl', OWL)
graph.bind('spav', vocab_ns)
for k, v in vocabs.items():
    graph.bind(k, v)

ldp = URIRef('http://vocab.performing-arts.ch/container')
graph.add((ldp, RDF.type, vocabs['prov'].Entity))
graph.add((ldp, RDF.type, vocabs['ldp'].Resource))
graph.add(
    (ldp, vocabs['prov'].generatedAtTime,
     Literal(datetime.datetime.now(pytz.timezone('Europe/Zurich')).isoformat(),
             datatype=XSD.dateTime)))
if r['neighbourhood']: locationObservation = LocationObservation( nsLocationObservation.term(r['neighbourhood']), label=[r['neighbourhood']]) personObservation.hasLocation = [locationObservation] DERIVATION.entity = sorted(images) return personObservation # exampleResource if __name__ == "__main__": ds = Dataset() g = rdfSubject.db = ds.graph( identifier="https://data.create.humanities.uva.nl/id/kohier1674/") exampleResource = main('data/records.csv', g) rdfSubject.db = ds description = """Van dit handgeschreven kohier (SAA inventarisnummer 5028:662) bestaat een getypte index op achternaam (SAA inventarisnummber 5028:662A). Hierin is de naam van een persoon, de relatie tot een andere persoon of groep (e.g. wed. van, of kinderen van), beroep en de woonwijk opgenomen. Ook is genoteerd op welk foliant de persoon beschreven is. In totaal zijn 60 wijken beschreven in het kohier, aangegeven met cijfers. Daarna volgt een sectie van de 'Magistraten' (M), 'Joodse Natie' (J), 'Paden buijten de Stadt' (P1-P6), 'Officianten' (O), 'Regerende heeren' (R), 'Personen van andere Steden' (AS) en 'Testamenten' (T). De wijkindeling correspondeert waarschijnlijk met die van een kaart uit 1766, vervaardigd door C. Philips Jacobsz. (1732-1789) en F.W. Greebe en is beschikbaar in de Beeldbank van het Stadsarchief, afbeelding 010001000849. """ contributors = "" download = DataDownload(
def test_scenarios() -> None:
    """
    Testing scenarios:
        1. no base set
        2. base set at graph creation
        3. base set at serialization
        4. base set at both graph creation & serialization, serialization overrides
        5. multiple serialization side effect checking
        6. checking results for RDF/XML
        7. checking results for N3
        8. checking results for TriX & TriG
    """
    # variables
    base_one = Namespace("http://one.org/")
    base_two = Namespace("http://two.org/")
    title = Literal("Title", lang="en")
    description = Literal("Test Description", lang="en")
    creator = URIRef("https://creator.com")
    cs = URIRef("")

    # starting graph
    g = Graph()
    g.add((cs, RDF.type, SKOS.ConceptScheme))
    g.add((cs, DCTERMS.creator, creator))
    g.add((cs, DCTERMS.source, URIRef("nick")))
    g.bind("dct", DCTERMS)
    g.bind("skos", SKOS)

    # 1. no base set for graph, no base set for serialization
    g1 = Graph()
    g1 += g
    # @base should not be in output
    assert "@base" not in g.serialize(format="turtle")

    # 2. base one set for graph, no base set for serialization
    g2 = Graph(base=base_one)
    g2 += g
    # @base should be in output, from Graph (one)
    assert "@base <http://one.org/> ." in g2.serialize(format="turtle")

    # 3. no base set for graph, base two set for serialization
    g3 = Graph()
    g3 += g
    # @base should be in output, from serialization (two)
    assert "@base <http://two.org/> ." in g3.serialize(format="turtle", base=base_two)

    # 4. base one set for graph, base two set for serialization, serialization overrides
    g4 = Graph(base=base_one)
    g4 += g
    # @base should be in output, from serialization (two)
    assert "@base <http://two.org/> ." in g4.serialize(format="turtle", base=base_two)
    # just checking that the graph setting (one) hasn't snuck through
    assert "@base <http://one.org/> ." not in g4.serialize(format="turtle", base=base_two)

    # 5. multiple serialization side effect checking
    g5 = Graph()
    g5 += g
    # @base should be in output, from serialization (two)
    assert "@base <http://two.org/> ." in g5.serialize(format="turtle", base=base_two)

    # checking for side effects - no base now set for this serialization
    # @base should not be in output
    assert "@base" not in g5.serialize(format="turtle")

    # 6. checking results for RDF/XML
    g6 = Graph()
    g6 += g
    g6.bind("dct", DCTERMS)
    g6.bind("skos", SKOS)
    assert "@xml:base" not in g6.serialize(format="xml")
    assert 'xml:base="http://one.org/"' in g6.serialize(format="xml", base=base_one)
    g6.base = base_two
    assert 'xml:base="http://two.org/"' in g6.serialize(format="xml")
    assert 'xml:base="http://one.org/"' in g6.serialize(format="xml", base=base_one)

    # 7. checking results for N3
    g7 = Graph()
    g7 += g
    g7.bind("dct", DCTERMS)
    g7.bind("skos", SKOS)
    assert "@xml:base" not in g7.serialize(format="xml")
    assert "@base <http://one.org/> ." in g7.serialize(format="n3", base=base_one)
    g7.base = base_two
    assert "@base <http://two.org/> ." in g7.serialize(format="n3")
    assert "@base <http://one.org/> ." in g7.serialize(format="n3", base=base_one)

    # 8. checking results for TriX & TriG
    # TriX can specify a base per graph but setting a base for the whole
    base_three = Namespace("http://three.org/")
    ds1 = Dataset()
    ds1.bind("dct", DCTERMS)
    ds1.bind("skos", SKOS)
    g8 = ds1.graph(URIRef("http://g8.com/"), base=base_one)
    g9 = ds1.graph(URIRef("http://g9.com/"))
    g8 += g
    g9 += g
    g9.base = base_two
    ds1.base = base_three

    trix = ds1.serialize(format="trix", base=Namespace("http://two.org/"))
    assert '<graph xml:base="http://one.org/">' in trix
    assert '<graph xml:base="http://two.org/">' in trix
    assert '<TriX xml:base="http://two.org/"' in trix

    trig = ds1.serialize(format="trig", base=Namespace("http://two.org/"))
    assert "@base <http://one.org/> ." not in trig
    assert "@base <http://three.org/> ." not in trig
    assert "@base <http://two.org/> ." in trig
class RDFtoUmlDiagram():
    """
    Transform a RDF dataset to an UML diagram
    """

    def __init__(self, output_filename='output.png'):
        self.ds = Dataset()
        # self.d = UmlGraphVizDiagram(output_filename)
        self.d = UmlPygraphVizDiagram(output_filename)

    def load_rdf(self, filename, input_format=None):
        if input_format:
            rdf_format = input_format
        elif filename is not sys.stdin:
            format_list = {'.xml': 'xml',
                           '.rdf': 'xml',
                           '.owl': 'xml',
                           '.n3': 'n3',
                           '.ttl': 'turtle',
                           '.nt': 'nt',
                           '.trig': 'trig',
                           '.nq': 'nquads',
                           '': 'turtle'}
            extension = splitext(filename.name)[1]
            rdf_format = format_list[extension]
        else:
            rdf_format = 'turtle'
        temp = self.ds.graph("file://" + filename.name)
        temp.parse(filename.name, format=rdf_format)

    def add_namespaces(self, namespaces):
        if namespaces:
            for ns in namespaces:
                self.ds.namespace_manager.bind(ns[0], ns[1])

    def start_subgraph(self, graph_name):
        self.d.start_subgraph(graph_name.strip('[<>]:_'))

    def close_subgraph(self):
        self.d.close_subgraph()

    def add_object_node(self, object_name, classes_name, attributes):
        self.d.add_object_node(self.ds.namespace_manager.qname(object_name),
                               classes_name, attributes)

    def add_class_node(self, class_name, attributes):
        self.d.add_class_node(self.ds.namespace_manager.qname(class_name),
                              attributes)

    def add_edge(self, src, dst, predicate):
        self.d.add_edge(self.ds.namespace_manager.qname(src),
                        self.ds.namespace_manager.qname(dst),
                        self.ds.namespace_manager.qname(predicate))

    def add_subclass_edge(self, src, dst):
        self.d.add_subclass_edge(self.ds.namespace_manager.qname(src),
                                 self.ds.namespace_manager.qname(dst))

    def create_namespace_box(self):
        # Create Namespace box
        self.d.add_label("Namespaces:\l")
        for ns in sorted(self.ds.namespaces()):
            self.d.add_label("%s:\t%s \l" % (ns[0], ns[1]))

    def output_dot(self):
        self.d.write_to_file()

    def close(self):
        self.create_namespace_box()
        self.d.close()

    def visualize(self):
        self.d.visualize()
def main(search=None, cache=None, identifiers=[]): ns = Namespace("https://data.create.humanities.uva.nl/id/rkd/") ds = Dataset() ds.bind('rdfs', RDFS) ds.bind('schema', schema) ds.bind('sem', sem) ds.bind('bio', bio) ds.bind('foaf', foaf) ds.bind('void', void) ds.bind('skos', SKOS) ds.bind('owl', OWL) ds.bind('dc', dc) ds.bind('rkdArtist', URIRef("https://data.rkd.nl/artists/")) ds.bind('rkdThes', nsThesaurus) ds.bind('rkdPerson', nsPerson) ds.bind('rkdImage', URIRef("https://rkd.nl/explore/images/")) ds.bind('rkdThumb', URIRef("https://images.rkd.nl/rkd/thumb/650x650/")) ds.bind('aat', URIRef("http://vocab.getty.edu/aat/")) ## First the images g = rdfSubject.db = ds.graph(identifier=ns) # Load cache thesaurus if os.path.isfile('rkdthesaurus.json'): with open('rkdthesaurus.json') as infile: thesaurusDict = json.load(infile) else: thesaurusDict = dict() # Load cache images if os.path.isfile('imagecache.json'): with open('imagecache.json') as infile: imageCache = json.load(infile) else: imageCache = dict() # to fetch all identifiers from the search if search: thesaurusDict, imageCache = parseURL(search, thesaurusDict=thesaurusDict, imageCache=imageCache) elif cache: # assume that everything in the thesaurus is also cached for doc in cache.values(): parseData(doc, thesaurusDict=thesaurusDict) elif identifiers: for i in identifiers: thesaurusDict, imageCache = parseURL(APIURL + str(i), thesaurusDict=thesaurusDict, imageCache=imageCache) # Any images without labels? # These were not included in the search, but fetch them anyway. print("Finding referred images that were not included") q = """ PREFIX schema: <http://schema.org/> SELECT ?uri WHERE { ?role a schema:Role ; schema:isRelatedTo ?uri . FILTER NOT EXISTS { ?uri schema:name ?name } } """ images = g.query(q) print(f"Found {len(images)}!") for i in images: identifier = str(i['uri']).replace('https://rkd.nl/explore/images/', '') thesaurusDict, imageCache = parseURL( "https://api.rkd.nl/api/record/images/" + str(identifier), thesaurusDict=thesaurusDict, imageCache=imageCache) ## Then the thesaurus print("Converting the thesaurus") rdfSubject.db = ds.graph(identifier=ns.term('thesaurus/')) ids = list(thesaurusDict.keys()) for i in ids: _, thesaurusDict = getThesaurus(i, thesaurusDict, 'concept') # Save updated cache with open('rkdthesaurus.json', 'w') as outfile: json.dump(thesaurusDict, outfile) with open('imagecache.json', 'w') as outfile: json.dump(imageCache, outfile) ## Serialize print("Serializing!") ds.serialize('rkdportraits14751825.trig', format='trig')
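# Minimal sketch (plain rdflib, no rdfalchemy/rdfSubject) of the overall shape of the
# main() conversion above: bind prefixes on a Dataset, fill two named graphs (data and
# thesaurus), query one of them with SPARQL, then serialize everything to TriG. All
# URIs and the output file name are illustrative assumptions.
from rdflib import Dataset, Literal, Namespace, URIRef
from rdflib.namespace import RDF, SKOS

ns = Namespace("https://example.org/id/demo/")

ds = Dataset()
ds.bind("skos", SKOS)

data = ds.graph(identifier=ns)                        # main data graph
thesaurus = ds.graph(identifier=ns.term("thesaurus/"))

data.add((ns["image1"], RDF.type, URIRef("http://schema.org/ImageObject")))
thesaurus.add((ns["concept1"], RDF.type, SKOS.Concept))
thesaurus.add((ns["concept1"], SKOS.prefLabel, Literal("portrait", lang="en")))

# named-graph content can be queried through the graph object itself
for row in data.query("SELECT ?s WHERE { ?s a <http://schema.org/ImageObject> }"):
    print(row["s"])

ds.serialize("demo.trig", format="trig")              # hypothetical output file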
class BurstConverter(object): """The actual converter, that processes the chunk of lines from the CSV file, and uses the instructions from the ``schema`` graph to produce RDF.""" def __init__(self, identifier, columns, schema, metadata_graph, encoding, output_format): self.ds = Dataset() # self.ds = apply_default_namespaces(Dataset()) self.g = self.ds.graph(URIRef(identifier)) self.columns = columns self.schema = schema self.metadata_graph = metadata_graph self.encoding = encoding self.output_format = output_format self.templates = {} self.aboutURLSchema = self.schema.csvw_aboutUrl def equal_to_null(self, nulls, row): """Determines whether a value in a cell matches a 'null' value as specified in the CSVW schema)""" for n in nulls: n = Item(self.metadata_graph, n) col = str(n.csvw_name) val = str(n.csvw_null) if row[col] == val: # logger.debug("Value of column {} ('{}') is equal to specified 'null' value: '{}'".format(col, unicode(row[col]).encode('utf-8'), val)) # There is a match with null value return True # There is no match with null value return False def process(self, count, rows, chunksize): """Process the rows fed to the converter. Count and chunksize are used to determine the current row number (needed for default observation identifiers)""" obs_count = count * chunksize # logger.info("Row: {}".format(obs_count)) #removed for readability # We iterate row by row, and then column by column, as given by the CSVW mapping file. mult_proc_counter = 0 iter_error_counter= 0 for row in rows: # This fixes issue:10 if row is None: mult_proc_counter += 1 # logger.debug( #removed for readability # "Skipping empty row caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...") continue # set the '_row' value in case we need to generate 'default' URIs for each observation () # logger.debug("row: {}".format(obs_count)) #removed for readability row[u'_row'] = obs_count count += 1 # print(row) # The self.columns dictionary gives the mapping definition per column in the 'columns' # array of the CSVW tableSchema definition. for c in self.columns: c = Item(self.metadata_graph, c) # default about URL s = self.expandURL(self.aboutURLSchema, row) try: # Can also be used to prevent the triggering of virtual # columns! # Get the raw value from the cell in the CSV file try: # Python 2 value = row[unicode(c.csvw_name)] except NameError: # Python 3 value = row[str(c.csvw_name)] # This checks whether we should continue parsing this cell, or skip it. if self.isValueNull(value, c): continue # If the null values are specified in an array, we need to parse it as a collection (list) elif isinstance(c.csvw_null, Item): nulls = Collection(self.metadata_graph, BNode(c.csvw_null)) if self.equal_to_null(nulls, row): # Continue to next column specification in this row, if the value is equal to (one of) the null values. continue except: # No column name specified (virtual) because there clearly was no c.csvw_name key in the row. # logger.debug(traceback.format_exc()) #removed for readability iter_error_counter +=1 if isinstance(c.csvw_null, Item): nulls = Collection(self.metadata_graph, BNode(c.csvw_null)) if self.equal_to_null(nulls, row): # Continue to next column specification in this row, if the value is equal to (one of) the null values. continue try: # This overrides the subject resource 's' that has been created earlier based on the # schema wide aboutURLSchema specification. 
try: csvw_virtual = unicode(c.csvw_virtual) csvw_name = unicode(c.csvw_name) csvw_value = unicode(c.csvw_value) about_url = unicode(c.csvw_aboutUrl) value_url = unicode(c.csvw_valueUrl) except NameError: csvw_virtual = str(c.csvw_virtual) csvw_name = str(c.csvw_name) csvw_value = str(c.csvw_value) about_url = str(c.csvw_aboutUrl) value_url = str(c.csvw_valueUrl) if csvw_virtual == u'true' and c.csvw_aboutUrl is not None: s = self.expandURL(c.csvw_aboutUrl, row) if c.csvw_valueUrl is not None: # This is an object property, because the value needs to be cast to a URL p = self.expandURL(c.csvw_propertyUrl, row) o = self.expandURL(c.csvw_valueUrl, row) try: if self.isValueNull(os.path.basename(unicode(o)), c): logger.debug("skipping empty value") continue except NameError: if self.isValueNull(os.path.basename(str(o)), c): logger.debug("skipping empty value") continue if csvw_virtual == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.anyURI: # Special case: this is a virtual column with object values that are URIs # For now using a test special property value = row[unicode(c.csvw_name)].encode('utf-8') o = URIRef(iribaker.to_iri(value)) if csvw_virtual == u'true' and c.csvw_datatype is not None and URIRef(c.csvw_datatype) == XSD.linkURI: about_url = about_url[about_url.find("{"):about_url.find("}")+1] s = self.expandURL(about_url, row) # logger.debug("s: {}".format(s)) value_url = value_url[value_url.find("{"):value_url.find("}")+1] o = self.expandURL(value_url, row) # logger.debug("o: {}".format(o)) # For coded properties, the collectionUrl can be used to indicate that the # value URL is a concept and a member of a SKOS Collection with that URL. if c.csvw_collectionUrl is not None: collection = self.expandURL(c.csvw_collectionUrl, row) self.g.add((collection, RDF.type, SKOS['Collection'])) self.g.add((o, RDF.type, SKOS['Concept'])) self.g.add((collection, SKOS['member'], o)) # For coded properties, the schemeUrl can be used to indicate that the # value URL is a concept and a member of a SKOS Scheme with that URL. if c.csvw_schemeUrl is not None: scheme = self.expandURL(c.csvw_schemeUrl, row) self.g.add((scheme, RDF.type, SKOS['Scheme'])) self.g.add((o, RDF.type, SKOS['Concept'])) self.g.add((o, SKOS['inScheme'], scheme)) else: # This is a datatype property if c.csvw_value is not None: value = self.render_pattern(csvw_value, row) elif c.csvw_name is not None: # print s # print c.csvw_name, self.encoding # print row[unicode(c.csvw_name)], type(row[unicode(c.csvw_name)]) # print row[unicode(c.csvw_name)].encode('utf-8') # print '...' value = row[csvw_name].encode('utf-8') else: raise Exception("No 'name' or 'csvw:value' attribute found for this column specification") # If propertyUrl is specified, use it, otherwise use # the column name if c.csvw_propertyUrl is not None: p = self.expandURL(c.csvw_propertyUrl, row) else: if "" in self.metadata_graph.namespaces(): propertyUrl = self.metadata_graph.namespaces()[""][ csvw_name] else: propertyUrl = "{}{}".format(get_namespaces()['sdv'], csvw_name) p = self.expandURL(propertyUrl, row) if c.csvw_datatype is not None: if URIRef(c.csvw_datatype) == XSD.anyURI: # The xsd:anyURI datatype will be cast to a proper IRI resource. o = URIRef(iribaker.to_iri(value)) elif URIRef(c.csvw_datatype) == XSD.string and c.csvw_lang is not None: # If it is a string datatype that has a language, we turn it into a # language tagged literal # We also render the lang value in case it is a # pattern. 
o = Literal(value, lang=self.render_pattern( c.csvw_lang, row)) else: try: csvw_datatype = unicode(c.csvw_datatype) except NameError: csvw_datatype = str(c.csvw_datatype).split(')')[0].split('(')[-1] # csvw_datatype = str(c.csvw_datatype) # print(type(csvw_datatype)) # print(csvw_datatype) o = Literal(value, datatype=csvw_datatype, normalize=False) else: # It's just a plain literal without datatype. o = Literal(value) # Add the triple to the assertion graph self.g.add((s, p, o)) # Add provenance relating the propertyUrl to the column id if '@id' in c: self.g.add((p, PROV['wasDerivedFrom'], URIRef(c['@id']))) except: # print row[0], value traceback.print_exc() # We increment the observation (row number) with one obs_count += 1 # for s,p,o in self.g.triples((None,None,None)): # print(s.__repr__,p.__repr__,o.__repr__) logger.debug( "{} row skips caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...".format(mult_proc_counter)) logger.debug( "{} errors encountered while trying to iterate over a NoneType...".format(mult_proc_counter)) logger.info("... done") return self.ds.serialize(format=self.output_format) # def serialize(self): # trig_file_name = self.file_name + '.trig' # logger.info("Starting serialization to {}".format(trig_file_name)) # # with open(trig_file_name, 'w') as f: # self.np.serialize(f, format='trig') # logger.info("... done") def render_pattern(self, pattern, row): """Takes a Jinja or Python formatted string, and applies it to the row value""" # Significant speedup by not re-instantiating Jinja templates for every # row. if pattern in self.templates: template = self.templates[pattern] else: template = self.templates[pattern] = Template(pattern) # TODO This should take into account the special CSVW instructions such as {_row} # First we interpret the url_pattern as a Jinja2 template, and pass all # column/value pairs as arguments # row = {str('Int'): int('104906'), str('Country'): str('Luxembourg'), str('_row'): 1, str('Rank'): str('2')} # print(pattern) # print(type(pattern)) # print(row) # print(type(row)) # rendered_template = template.render(Int=120000) rendered_template = template.render(**row) try: # We then format the resulting string using the standard Python2 # expressions return rendered_template.format(**row) except: logger.warning( u"Could not apply python string formatting, probably due to mismatched curly brackets. IRI will be '{}'. 
".format(rendered_template)) return rendered_template def expandURL(self, url_pattern, row, datatype=False): """Takes a Jinja or Python formatted string, applies it to the row values, and returns it as a URIRef""" try: unicode_url_pattern = unicode(url_pattern) except NameError: unicode_url_pattern = str(url_pattern).split(')')[0].split('(')[-1] # print(unicode_url_pattern) url = self.render_pattern(unicode_url_pattern, row) # DEPRECATED # for ns, nsuri in namespaces.items(): # if url.startswith(ns): # url = url.replace(ns + ':', nsuri) # break try: iri = iribaker.to_iri(url) rfc3987.parse(iri, rule='IRI') except: raise Exception(u"Cannot convert `{}` to valid IRI".format(url)) # print(iri) return URIRef(iri) def isValueNull(self, value, c): """This checks whether we should continue parsing this cell, or skip it because it is empty or a null value.""" try: if len(value) == 0 and unicode(c.csvw_parseOnEmpty) == u"true": # print("Not skipping empty value") return False #because it should not be skipped elif len(value) == 0 or value == unicode(c.csvw_null) or value in [unicode(n) for n in c.csvw_null] or value == unicode(self.schema.csvw_null): # Skip value if length is zero and equal to (one of) the null value(s) # logger.debug( # "Length is 0 or value is equal to specified 'null' value") return True except: # logger.debug("null does not exist or is not a list.") #this line will print for every cell in a csv without a defined null value. pass return False
from oldman import ClientResourceManager, parse_graph_safely, SPARQLDataStore
from oldman.rest.crud import HashLessCRUDer

logging.config.fileConfig(path.join(path.dirname(__file__), 'logging.ini'))

sesame_iri = "http://*****:*****@context": [ { "myvoc": MY_VOC,
class LongTermMemory(object): ONE_TO_ONE_PREDICATES = [ 'age', 'born_in', 'faceID', 'favorite', 'favorite_of', 'id', 'is_from', 'manufactured_in', 'mother_is', 'name' ] def __init__(self, address=config.BRAIN_URL_LOCAL): """ Interact with Triple store Parameters ---------- address: str IP address and port of the Triple store """ self.address = address self.namespaces = {} self.ontology_paths = {} self.format = 'trig' self.dataset = Dataset() self.query_prefixes = """ prefix gaf: <http://groundedannotationframework.org/gaf#> prefix grasp: <http://groundedannotationframework.org/grasp#> prefix leolaniInputs: <http://cltl.nl/leolani/inputs/> prefix leolaniFriends: <http://cltl.nl/leolani/friends/> prefix leolaniTalk: <http://cltl.nl/leolani/talk/> prefix leolaniTime: <http://cltl.nl/leolani/time/> prefix leolaniWorld: <http://cltl.nl/leolani/world/> prefix n2mu: <http://cltl.nl/leolani/n2mu/> prefix ns1: <urn:x-rdflib:> prefix owl: <http://www.w3.org/2002/07/owl#> prefix prov: <http://www.w3.org/ns/prov#> prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> prefix sem: <http://semanticweb.cs.vu.nl/2009/11/sem/> prefix skos: <http://www.w3.org/2004/02/skos/core#> prefix time: <http://www.w3.org/TR/owl-time/#> prefix xml: <http://www.w3.org/XML/1998/namespace> prefix xml1: <https://www.w3.org/TR/xmlschema-2/#> prefix xsd: <http://www.w3.org/2001/XMLSchema#> """ self._define_namespaces() self._get_ontology_path() self._bind_namespaces() self.my_uri = None self._log = logger.getChild(self.__class__.__name__) self._log.debug("Booted") #################################### Main functions to interact with the brain #################################### def update(self, capsule): """ Main function to interact with if a statement is coming into the brain. Takes in a structured parsed statement, transforms them to triples, and posts them to the triple store :param statement: Structured data of a parsed statement :return: json response containing the status for posting the triples, and the original statement """ # Case fold capsule = casefold_capsule(capsule) # Create graphs and triples self._model_graphs_(capsule) data = self._serialize(config.BRAIN_LOG) code = self._upload_to_brain(data) # Create JSON output capsule["date"] = str(capsule["date"]) output = {'response': code, 'statement': capsule} return output def experience(self, capsule): """ Main function to interact with if a statement is coming into the brain. Takes in a structured parsed statement, transforms them to triples, and posts them to the triple store :param capsule: Structured data of a parsed statement :return: json response containing the status for posting the triples, and the original statement """ # Case fold capsule = casefold_capsule(capsule) # Create graphs and triples self._model_graphs_(capsule, type='Experience') data = self._serialize(config.BRAIN_LOG) code = self._upload_to_brain(data) # Create JSON output capsule["date"] = str(capsule["date"]) output = {'response': code, 'statement': capsule} return output def query_brain(self, capsule): """ Main function to interact with if a question is coming into the brain. 
Takes in a structured parsed question, transforms it into a query, and queries the triple store for a response :param capsule: Structured data of a parsed question :return: json response containing the results of the query, and the original question """ # Case fold capsule = casefold_capsule(capsule) # Generate query query = self._create_query(capsule) # Perform query response = self._submit_query(query) # Create JSON output if 'date' in capsule.keys(): capsule["date"] = str(capsule["date"]) output = {'response': response, 'question': capsule} return output def process_visual(self, item, exact_only=True): """ Main function to determine if this item can be recognized by the brain, learned, or none :param item: :return: """ if casefold(item) in self.get_classes(): # If this is in the ontology already, create sensor triples directly text = 'I know about %s. I will remember this object' % item return item, text temp = self.get_labels_and_classes() if casefold(item) in temp.keys(): # If this is in the ontology already, create sensor triples directly text = 'I know about %s. It is of type %s. I will remember this object' % ( item, temp[item]) return item, text # Query the web for information class_type, description = self.exact_match_dbpedia(item) if class_type is not None: # Had to learn it, but I can create triples now text = 'I did not know what %s is, but I searched on the web and I found that it is a %s. ' \ 'I will remember this object' % (item, class_type) return casefold(class_type), text if not exact_only: # Second go at dbpedia, relaxed approach class_type, description = self.keyword_match_dbpedia(item) if class_type is not None: # Had to really search for it to learn it, but I can create triples now text = 'I did not know what %s is, but I searched for fuzzy matches on the web and I found that it ' \ 'is a %s. 
I will remember this object' % (item, class_type) return casefold(class_type), text # Failure, nothing found text = 'I am sorry, I could not learn anything on %s so I will not remember it' % item return None, text ########## management system for keeping track of chats and turns ########## def get_last_chat_id(self): """ Get the id for the last interaction recorded :return: id """ query = read_query('last_chat_id') response = self._submit_query(query) return int(response[0]['chatid']['value']) if response else 0 def get_last_turn_id(self, chat_id): """ Get the id for the last turn in the given chat :param chat_id: id for chat of interest :return: id """ query = read_query('last_turn_id') % (chat_id) response = self._submit_query(query) last_turn = 0 for turn in response: turn_uri = turn['s']['value'] turn_id = turn_uri.split('/')[-1][10:] turn_id = int(turn_id) if turn_id > last_turn: last_turn = turn_id return last_turn ########## brain structure exploration ########## def get_predicates(self): """ Get predicates in social ontology :return: """ query = read_query('predicates') response = self._submit_query(query) return [elem['p']['value'].split('/')[-1] for elem in response] def get_classes(self): """ Get classes in social ontology :return: """ query = read_query('classes') response = self._submit_query(query) return [elem['o']['value'].split('/')[-1] for elem in response] def get_labels_and_classes(self): """ Get classes in social ontology :return: """ query = read_query('labels_and_classes') response = self._submit_query(query) temp = dict() for r in response: temp[r['l']['value']] = r['o']['value'].split('/')[-1] return temp ########## learned facts exploration ########## def count_statements(self): """ Count statements or 'facts' in the brain :return: """ query = read_query('count_statements') response = self._submit_query(query) return response[0]['count']['value'] def count_friends(self): """ Count number of people I have talked to :return: """ query = read_query('count_friends') response = self._submit_query(query) return response[0]['count']['value'] def get_my_friends(self): """ Get names of people I have talked to :return: """ query = read_query('my_friends') response = self._submit_query(query) return [elem['name']['value'].split('/')[-1] for elem in response] def get_best_friends(self): """ Get names of the 5 people I have talked to the most :return: """ query = read_query('best_friends') response = self._submit_query(query) return [elem['name']['value'] for elem in response] def get_instance_of_type(self, instance_type): """ Get isntances of a certain class type :param instance_type: name of class in ontology :return: """ query = read_query('instance_of_type') % (instance_type) response = self._submit_query(query) return [elem['name']['value'] for elem in response] def when_last_chat_with(self, actor_label): """ Get time value for the last time I chatted with this person :param actor_label: name of person :return: """ query = read_query('when_last_chat_with') % (actor_label) response = self._submit_query(query) return response[0]['time']['value'].split('/')[-1] def get_triples_with_predicate(self, predicate): """ Get triples that contain this predicate :param predicate: :return: """ query = read_query('triples_with_predicate') % predicate response = self._submit_query(query) return [(elem['sname']['value'], elem['oname']['value']) for elem in response] ########## conflicts ########## def get_all_conflicts(self): """ Aggregate all conflicts in brain :return: """ conflicts = 
[] for predicate in self.ONE_TO_ONE_PREDICATES: conflicts.extend(self._get_conflicts_with_predicate(predicate)) return conflicts ########## semantic web ########## def exact_match_dbpedia(self, item): """ Query dbpedia for information on this item to get it's semantic type and description. :param item: :return: """ # Gather combinations combinations = [item, item.lower(), item.capitalize(), item.title()] for comb in combinations: # Try exact matching query query = read_query('dbpedia_type_and_description') % (comb) response = self._submit_query(query) # break if we have a hit if response: break class_type = response[0]['label_type']['value'] if response else None description = response[0]['description']['value'].split( '.')[0] if response else None return class_type, description def keyword_match_dbpedia(self, item): # Query API r = requests.get( 'http://lookup.dbpedia.org/api/search.asmx/KeywordSearch', params={ 'QueryString': item, 'MaxHits': '10' }, headers={ 'Accept': 'application/json' }).json()['results'] # Fuzzy match choices = [e['label'] for e in r] best_match = process.extractOne("item", choices) # Get best match object r = [{ 'label': e['label'], 'classes': e['classes'], 'description': e['description'] } for e in r if e['label'] == best_match[0]] if r: r = r[0] if r['classes']: # process dbpedia classes only r['classes'] = [ c['label'] for c in r['classes'] if 'dbpedia' in c['uri'] ] else: r = {'label': None, 'classes': None, 'description': None} return r['classes'][0] if r['classes'] else None, r[ 'description'].split('.')[0] if r['description'] else None ######################################## Helpers for setting up connection ######################################## def _define_namespaces(self): """ Define namespaces for different layers (ontology/vocab and resource). 
Assign them to self :return: """ # Namespaces for the instance layer instance_vocab = 'http://cltl.nl/leolani/n2mu/' self.namespaces['N2MU'] = Namespace(instance_vocab) instance_resource = 'http://cltl.nl/leolani/world/' self.namespaces['LW'] = Namespace(instance_resource) # Namespaces for the mention layer mention_vocab = 'http://groundedannotationframework.org/gaf#' self.namespaces['GAF'] = Namespace(mention_vocab) mention_resource = 'http://cltl.nl/leolani/talk/' self.namespaces['LTa'] = Namespace(mention_resource) # Namespaces for the attribution layer attribution_vocab = 'http://groundedannotationframework.org/grasp#' self.namespaces['GRASP'] = Namespace(attribution_vocab) attribution_resource_friends = 'http://cltl.nl/leolani/friends/' self.namespaces['LF'] = Namespace(attribution_resource_friends) attribution_resource_inputs = 'http://cltl.nl/leolani/inputs/' self.namespaces['LI'] = Namespace(attribution_resource_inputs) # Namespaces for the temporal layer-ish time_vocab = 'http://www.w3.org/TR/owl-time/#' self.namespaces['TIME'] = Namespace(time_vocab) time_resource = 'http://cltl.nl/leolani/time/' self.namespaces['LTi'] = Namespace(time_resource) # The namespaces of external ontologies skos = 'http://www.w3.org/2004/02/skos/core#' self.namespaces['SKOS'] = Namespace(skos) prov = 'http://www.w3.org/ns/prov#' self.namespaces['PROV'] = Namespace(prov) sem = 'http://semanticweb.cs.vu.nl/2009/11/sem/' self.namespaces['SEM'] = Namespace(sem) xml = 'https://www.w3.org/TR/xmlschema-2/#' self.namespaces['XML'] = Namespace(xml) def _get_ontology_path(self): """ Define ontology paths to key vocabularies :return: """ self.ontology_paths[ 'n2mu'] = './../../knowledge_representation/ontologies/leolani.ttl' self.ontology_paths[ 'gaf'] = './../../knowledge_representation/ontologies/gaf.rdf' self.ontology_paths[ 'grasp'] = './../../knowledge_representation/ontologies/grasp.rdf' self.ontology_paths[ 'sem'] = './../../knowledge_representation/ontologies/sem.rdf' def _bind_namespaces(self): """ Bnd namespaces :return: """ self.dataset.bind('n2mu', self.namespaces['N2MU']) self.dataset.bind('leolaniWorld', self.namespaces['LW']) self.dataset.bind('gaf', self.namespaces['GAF']) self.dataset.bind('leolaniTalk', self.namespaces['LTa']) self.dataset.bind('grasp', self.namespaces['GRASP']) self.dataset.bind('leolaniFriends', self.namespaces['LF']) self.dataset.bind('leolaniInputs', self.namespaces['LI']) self.dataset.bind('time', self.namespaces['TIME']) self.dataset.bind('leolaniTime', self.namespaces['LTi']) self.dataset.bind('skos', self.namespaces['SKOS']) self.dataset.bind('prov', self.namespaces['PROV']) self.dataset.bind('sem', self.namespaces['SEM']) self.dataset.bind('xml', self.namespaces['XML']) self.dataset.bind('owl', OWL) ######################################## Helpers for statement processing ######################################## def create_chat_id(self, actor, date): """ Determine chat id depending on my last conversation with this person :param actor: :param date: :return: """ self._log.debug('Chat with {} on {}'.format(actor, date)) query = read_query('last_chat_with') % (actor) response = self._submit_query(query) if response and int(response[0]['day']['value']) == int(date.day) \ and int(response[0]['month']['value']) == int(date.month) \ and int(response[0]['year']['value']) == int(date.year): # Chatted with this person today so same chat id chat_id = int(response[0]['chatid']['value']) else: # Either have never chatted with this person, or I have but not today. 
Add one to latest chat chat_id = self.get_last_chat_id() + 1 return chat_id def create_turn_id(self, chat_id): self._log.debug('Turn in chat {}'.format(chat_id)) query = read_query('last_turn_in_chat') % (chat_id) response = self._submit_query(query) return int(response['turnid']['value']) + 1 if response else 1 def _generate_leolani(self, instance_graph): # Create Leolani leolani_id = 'leolani' leolani_label = 'leolani' leolani = URIRef(to_iri(self.namespaces['LW'] + leolani_id)) leolani_label = Literal(leolani_label) leolani_type1 = URIRef(to_iri(self.namespaces['N2MU'] + 'robot')) leolani_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance')) instance_graph.add((leolani, RDFS.label, leolani_label)) instance_graph.add((leolani, RDF.type, leolani_type1)) instance_graph.add((leolani, RDF.type, leolani_type2)) self.my_uri = leolani return leolani def _generate_subject(self, capsule, instance_graph): if capsule['subject']['type'] == '': # We only get the label subject_vocab = OWL subject_type = 'Thing' else: subject_vocab = self.namespaces['N2MU'] subject_type = capsule['subject']['type'] subject_id = capsule['subject']['label'] subject = URIRef(to_iri(self.namespaces['LW'] + subject_id)) subject_label = Literal(subject_id) subject_type1 = URIRef(to_iri(subject_vocab + subject_type)) subject_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance')) instance_graph.add((subject, RDFS.label, subject_label)) instance_graph.add((subject, RDF.type, subject_type1)) instance_graph.add((subject, RDF.type, subject_type2)) return subject, subject_label def _create_leolani_world(self, capsule, type='Statement'): # Instance graph instance_graph_uri = URIRef(to_iri(self.namespaces['LW'] + 'Instances')) instance_graph = self.dataset.graph(instance_graph_uri) # Subject if type == 'Statement': subject, subject_label = self._generate_subject( capsule, instance_graph) elif type == 'Experience': subject = self._generate_leolani( instance_graph) if self.my_uri is None else self.my_uri subject_label = 'leolani' # Object if capsule['object']['type'] == '': # We only get the label object_vocab = OWL object_type = 'Thing' else: object_vocab = self.namespaces['N2MU'] object_type = capsule['object']['type'] object_id = capsule['object']['label'] object = URIRef(to_iri(self.namespaces['LW'] + object_id)) object_label = Literal(object_id) object_type1 = URIRef(to_iri(object_vocab + object_type)) object_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance')) instance_graph.add((object, RDFS.label, object_label)) instance_graph.add((object, RDF.type, object_type1)) instance_graph.add((object, RDF.type, object_type2)) if type == 'Statement': claim_graph, statement = self._create_claim_graph( subject, subject_label, object, object_label, capsule['predicate']['type'], type='Statement') elif type == 'Experience': claim_graph, statement = self._create_claim_graph( subject, subject_label, object, object_label, 'sees', type='Experience') return instance_graph, claim_graph, subject, object, statement def _create_claim_graph(self, subject, subject_label, object, object_label, predicate, type='Statement'): # Claim graph claim_graph_uri = URIRef(to_iri(self.namespaces['LW'] + 'Claims')) claim_graph = self.dataset.graph(claim_graph_uri) # Statement statement_id = hash_statement_id( [subject_label, predicate, object_label]) statement = URIRef(to_iri(self.namespaces['LW'] + statement_id)) statement_type1 = URIRef(to_iri(self.namespaces['GRASP'] + type)) statement_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 
'Instance')) statement_type3 = URIRef(to_iri(self.namespaces['SEM'] + 'Event')) # Create graph and add triple graph = self.dataset.graph(statement) graph.add((subject, self.namespaces['N2MU'][predicate], object)) claim_graph.add((statement, RDF.type, statement_type1)) claim_graph.add((statement, RDF.type, statement_type2)) claim_graph.add((statement, RDF.type, statement_type3)) return claim_graph, statement def _create_leolani_talk(self, capsule, leolani, type='Statement'): # Interaction graph if type == 'Statement': graph_to_write = 'Interactions' elif type == 'Experience': graph_to_write = 'Sensors' interaction_graph_uri = URIRef( to_iri(self.namespaces['LTa'] + graph_to_write)) interaction_graph = self.dataset.graph(interaction_graph_uri) # Time date = capsule["date"] time = URIRef( to_iri(self.namespaces['LTi'] + str(capsule["date"].isoformat()))) time_type = URIRef( to_iri(self.namespaces['TIME'] + 'DateTimeDescription')) day = Literal(date.day, datatype=self.namespaces['XML']['gDay']) month = Literal(date.month, datatype=self.namespaces['XML']['gMonthDay']) year = Literal(date.year, datatype=self.namespaces['XML']['gYear']) time_unitType = URIRef(to_iri(self.namespaces['TIME'] + 'unitDay')) interaction_graph.add((time, RDF.type, time_type)) interaction_graph.add((time, self.namespaces['TIME']['day'], day)) interaction_graph.add((time, self.namespaces['TIME']['month'], month)) interaction_graph.add((time, self.namespaces['TIME']['year'], year)) interaction_graph.add( (time, self.namespaces['TIME']['unitType'], time_unitType)) # Actor actor_id = capsule['author'] actor_label = capsule['author'] actor = URIRef(to_iri(to_iri(self.namespaces['LF'] + actor_id))) actor_label = Literal(actor_label) actor_type1 = URIRef(to_iri(self.namespaces['SEM'] + 'Actor')) actor_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Instance')) if type == 'Statement': actor_type3 = URIRef(to_iri(self.namespaces['N2MU'] + 'person')) elif type == 'Experience': actor_type3 = URIRef(to_iri(self.namespaces['N2MU'] + 'sensor')) interaction_graph.add((actor, RDFS.label, actor_label)) interaction_graph.add((actor, RDF.type, actor_type1)) interaction_graph.add((actor, RDF.type, actor_type2)) interaction_graph.add((actor, RDF.type, actor_type3)) # Add leolani knows/senses actor if type == 'Statement': predicate = 'knows' elif type == 'Experience': predicate = 'senses' interaction_graph.add( (leolani, self.namespaces['N2MU'][predicate], actor)) _, _ = self._create_claim_graph(leolani, 'leolani', actor, actor_label, predicate, type) # Event and subevent event_id = self.create_chat_id(actor_label, date) if type == 'Statement': event_label = 'chat%s' % event_id elif type == 'Experience': event_label = 'visual%s' % event_id subevent_id = self.create_turn_id(event_id) if type == 'Statement': subevent_label = event_label + '_turn%s' % subevent_id elif type == 'Experience': subevent_label = event_label + '_object%s' % subevent_id turn = URIRef(to_iri(self.namespaces['LTa'] + subevent_label)) turn_type1 = URIRef(to_iri(self.namespaces['SEM'] + 'Event')) if type == 'Statement': turn_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Turn')) elif type == 'Experience': turn_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Object')) interaction_graph.add((turn, RDF.type, turn_type1)) interaction_graph.add((turn, RDF.type, turn_type2)) interaction_graph.add( (turn, self.namespaces['N2MU']['id'], Literal(subevent_id))) interaction_graph.add( (turn, self.namespaces['SEM']['hasActor'], actor)) interaction_graph.add((turn, 
self.namespaces['SEM']['hasTime'], time)) chat = URIRef(to_iri(self.namespaces['LTa'] + event_label)) chat_type1 = URIRef(to_iri(self.namespaces['SEM'] + 'Event')) if type == 'Statement': chat_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Chat')) elif type == 'Experience': chat_type2 = URIRef(to_iri(self.namespaces['GRASP'] + 'Visual')) interaction_graph.add((chat, RDF.type, chat_type1)) interaction_graph.add((chat, RDF.type, chat_type2)) interaction_graph.add( (chat, self.namespaces['N2MU']['id'], Literal(event_id))) interaction_graph.add( (chat, self.namespaces['SEM']['hasActor'], actor)) interaction_graph.add((chat, self.namespaces['SEM']['hasTime'], time)) interaction_graph.add( (chat, self.namespaces['SEM']['hasSubevent'], turn)) perspective_graph, mention, attribution = self._create_perspective_graph( capsule, subevent_label) # Link interactions and perspectives perspective_graph.add( (mention, self.namespaces['GRASP']['wasAttributedTo'], actor)) perspective_graph.add( (mention, self.namespaces['GRASP']['hasAttribution'], attribution)) perspective_graph.add( (mention, self.namespaces['PROV']['wasDerivedFrom'], chat)) perspective_graph.add( (mention, self.namespaces['PROV']['wasDerivedFrom'], turn)) return interaction_graph, perspective_graph, actor, time, mention, attribution def _create_perspective_graph(self, capsule, turn_label, type='Statement'): # Perspective graph perspective_graph_uri = URIRef( to_iri(self.namespaces['LTa'] + 'Perspectives')) perspective_graph = self.dataset.graph(perspective_graph_uri) # Mention if type == 'Statement': mention_id = turn_label + '_char%s' % capsule['position'] elif type == 'Experience': mention_id = turn_label + '_pixel%s' % capsule['position'] mention = URIRef(to_iri(self.namespaces['LTa'] + mention_id)) mention_type = URIRef(to_iri(self.namespaces['GRASP'] + 'Mention')) perspective_graph.add((mention, RDF.type, mention_type)) # Attribution attribution_id = mention_id + '_CERTAIN' attribution = URIRef(to_iri(self.namespaces['LTa'] + attribution_id)) attribution_type = URIRef( to_iri(self.namespaces['GRASP'] + 'Attribution')) attribution_value = URIRef(to_iri(self.namespaces['GRASP'] + 'CERTAIN')) perspective_graph.add((attribution, RDF.type, attribution_type)) perspective_graph.add((attribution, RDF.value, attribution_value)) return perspective_graph, mention, attribution def _serialize(self, file_path): """ Save graph to local file and return the serialized string :param file_path: path to where data will be saved :return: serialized data as string """ # Save to file but return the python representation with open(file_path + '.' 
+ self.format, 'w') as f: self.dataset.serialize(f, format=self.format) return self.dataset.serialize(format=self.format) def _upload_to_brain(self, data): """ Post data to the brain :param data: serialized data as string :return: response status """ self._log.debug("Posting triples") # From serialized string post_url = self.address + "/statements" response = requests.post( post_url, data=data, headers={'Content-Type': 'application/x-' + self.format}) return str(response.status_code) def _model_graphs_(self, capsule, type='Statement'): # Leolani world (includes instance and claim graphs) instance_graph, claim_graph, subject, object, instance = self._create_leolani_world( capsule, type) # Identity leolani = self._generate_leolani( instance_graph) if self.my_uri is None else self.my_uri # Leolani talk (includes interaction and perspective graphs) interaction_graph, perspective_graph, actor, time, mention, attribution = self._create_leolani_talk( capsule, leolani, type) # Interconnections instance_graph.add( (subject, self.namespaces['GRASP']['denotedIn'], mention)) instance_graph.add( (object, self.namespaces['GRASP']['denotedIn'], mention)) instance_graph.add( (instance, self.namespaces['GRASP']['denotedBy'], mention)) instance_graph.add( (instance, self.namespaces['SEM']['hasActor'], actor)) instance_graph.add((instance, self.namespaces['SEM']['hasTime'], time)) perspective_graph.add( (mention, self.namespaces['GRASP']['containsDenotation'], subject)) perspective_graph.add( (mention, self.namespaces['GRASP']['containsDenotation'], object)) perspective_graph.add( (mention, self.namespaces['GRASP']['denotes'], instance)) perspective_graph.add( (attribution, self.namespaces['GRASP']['isAttributionFor'], mention)) ######################################### Helpers for question processing ######################################### def _create_query(self, parsed_question): _ = hash_statement_id([ parsed_question['subject']['label'], parsed_question['predicate']['type'], parsed_question['object']['label'] ]) # Query subject if parsed_question['subject']['label'] == "": # Case fold # object_label = casefold_label(parsed_question['object']['label']) query = """ SELECT ?slabel ?authorlabel WHERE { ?s n2mu:%s ?o . ?s rdfs:label ?slabel . ?o rdfs:label '%s' . GRAPH ?g { ?s n2mu:%s ?o . } . ?g grasp:denotedBy ?m . ?m grasp:wasAttributedTo ?author . ?author rdfs:label ?authorlabel . } """ % (parsed_question['predicate']['type'], parsed_question['object']['label'], parsed_question['predicate']['type']) # Query object elif parsed_question['object']['label'] == "": query = """ SELECT ?olabel ?authorlabel WHERE { ?s n2mu:%s ?o . ?s rdfs:label '%s' . ?o rdfs:label ?olabel . GRAPH ?g { ?s n2mu:%s ?o . } . ?g grasp:denotedBy ?m . ?m grasp:wasAttributedTo ?author . ?author rdfs:label ?authorlabel . } """ % (parsed_question['predicate']['type'], parsed_question['subject']['label'], parsed_question['predicate']['type']) # Query existence else: query = """ SELECT ?authorlabel ?v WHERE { ?s n2mu:%s ?o . ?s rdfs:label '%s' . ?o rdfs:label '%s' . GRAPH ?g { ?s n2mu:%s ?o . } . ?g grasp:denotedBy ?m . ?m grasp:wasAttributedTo ?author . ?author rdfs:label ?authorlabel . ?m grasp:hasAttribution ?att . ?att rdf:value ?v . 
} """ % (parsed_question['predicate']['type'], parsed_question['subject']['label'], parsed_question['object']['label'], parsed_question['predicate']['type']) query = self.query_prefixes + query return query def _submit_query(self, query): # Set up connection sparql = SPARQLWrapper(self.address) # Response parameters sparql.setQuery(query) sparql.setReturnFormat(JSON) sparql.addParameter('Accept', 'application/sparql-results+json') response = sparql.query().convert() return response["results"]["bindings"] ######################################### Helpers for conflict processing ######################################### def _get_conflicts_with_predicate(self, one_to_one_predicate): query = """ PREFIX n2mu: <http://cltl.nl/leolani/n2mu/> PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> PREFIX grasp: <http://groundedannotationframework.org/grasp#> select ?sname (group_concat(?oname ; separator=";") as ?onames) (group_concat(?authorlabel ; separator=";") as ?authorlabels) where { GRAPH ?g { ?s n2mu:%s ?o . } . ?s rdfs:label ?sname . ?o rdfs:label ?oname . ?g grasp:denotedBy ?m . ?m grasp:wasAttributedTo ?author . ?author rdfs:label ?authorlabel . } group by ?sname having (count(distinct ?oname) > 1) """ % one_to_one_predicate response = self._submit_query(query) conflicts = [] for item in response: conflict = { 'subject': item['sname']['value'], 'predicate': one_to_one_predicate, 'objects': [] } values = item['onames']['value'].split(';') authors = item['authorlabels']['value'].split(';') for val, auth in zip(values, authors): option = {'value': val, 'author': auth} conflict['objects'].append(option) conflicts.append(conflict) return conflicts