def create_ontology(self, tr, predicate, subClass, address, booktitle):
    LDT = Namespace("http://www.JceFinalProjectOntology.com/")
    ut = Namespace("http://www.JceFinalProjectOntology.com/subject/#")
    usubClass = URIRef("http://www.JceFinalProjectOntology.com/subject/" + subClass.strip() + '#')
    #LDT.subClass=LDT[subClass]
    print(ut)
    print(usubClass)
    store = IOMemory()
    sty = LDT[predicate]
    g = rdflib.Graph(store=store, identifier=LDT)
    t = ConjunctiveGraph(store=store, identifier=ut)
    print('Triples in graph before add: ', len(t))
    #g.add((LDT,RDF.type,RDFS.Class))
    g.add((URIRef(LDT), RDF.type, RDFS.Class))
    g.add((URIRef(LDT), RDFS.label, Literal("JFPO")))
    g.add((URIRef(LDT), RDFS.comment, Literal('class of all properties')))
    for v in self.symbols.values():
        if self.if_compoTerm(v) == True:
            vs = self.splitTerms(v)[0]
        else:
            vs = v
        g.add((LDT[vs], RDF.type, RDF.Property))
        g.add((LDT[vs], RDFS.label, Literal('has' + vs)))
        g.add((LDT[vs], RDFS.comment, Literal(v)))
        g.add((LDT[vs], RDFS.range, OWL.Class))
        g.add((LDT[vs], RDFS.domain, Literal(vs)))
    g.bind('JFPO', LDT)
    #g.commit()
    g.serialize('trtst.rdf', format='turtle')

    t.add((ut[tr], RDF.type, OWL.Class))
    t.add((ut[tr], RDFS.subClassOf, OWL.Thing))
    t.add((ut[tr], RDFS.label, Literal(tr)))
    t.add((ut[tr], DC.title, Literal(booktitle)))
    t.add((ut[tr], DC.source, Literal(address)))
    t.add((ut[tr], DC[predicate], URIRef(usubClass)))
    t.add((ut[tr], LDT[predicate], RDF.Property))
    relation = 'has' + predicate
    t.add((ut[tr], LDT.term(predicate), URIRef(usubClass)))
    t.add((usubClass, RDF.type, OWL.Class))
    t.add((usubClass, RDFS.subClassOf, OWL.Thing))
    t.add((usubClass, RDFS.subClassOf, URIRef(sty)))
    t.add((usubClass, RDFS.label, Literal(subClass)))
    #tc=Graph(store=store,identifier=usubClass)
    t.bind("dc", "http://purl.org/dc/elements/1.1/")
    t.bind('JFPO', LDT)
    t.commit()
    #print(t.serialize(format='pretty-xml'))
    t.serialize('test2.owl', format='turtle')
def locationtoturtle(ellist, meta):
    rdf = Graph()
    cs = Namespace("http://cs.unibo.it/ontology/")
    colon = Namespace("http://www.essepuntato.it/resource/")
    dcterms = Namespace("http://purl.org/dc/terms/")
    xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
    this = Namespace("http://vitali.web.cs.unibo.it/twiki/pub/TechWeb12/DataSource2/posteBO2011.ttl#")
    vcard = Namespace("http://www.w3.org/2006/vcard/ns#")
    rdf.bind("vcard", vcard)
    rdf.bind("cs", cs)
    rdf.bind("", colon)
    rdf.bind("dcterms", dcterms)
    rdf.bind("xsd", xsd)
    rdf.bind("this", this)
    rdf.add((this["metadata"], dcterms["creator"], Literal(meta.creator)))
    rdf.add((this["metadata"], dcterms["created"], Literal(meta.created, datatype=XSD.date)))
    rdf.add((this["metadata"], dcterms["description"], Literal(meta.version)))
    rdf.add((this["metadata"], dcterms["valid"], Literal(meta.valid, datatype=XSD.date)))
    rdf.add((this["metadata"], dcterms["source"], Literal(meta.source)))
    for location in ellist:
        rdf.add((colon[location.id], vcard["fn"], Literal(location.name)))
        rdf.add((colon[location.id], vcard["extended-address"], Literal(location.address)))
        rdf.add((colon[location.id], vcard["category"], Literal(location.category)))
        rdf.add((colon[location.id], vcard["latitude"], Literal(location.lat)))
        rdf.add((colon[location.id], vcard["longitude"], Literal(location.long)))
        if location.tel:
            rdf.add((colon[location.id], vcard["tel"], Literal(location.tel)))
        if location.note:
            rdf.add((colon[location.id], vcard["note"], Literal(location.note)))
        rdf.add((colon[location.id], cs["opening"], Literal(location.opening)))
        rdf.add((colon[location.id], cs["closing"], Literal(location.closing)))
    print("Content-type: text/turtle; charset=UTF-8\n")
    print(rdf.serialize(format="n3"))
def parse_and_serialize(input_files, input_format, guess,
                        outfile, output_format, ns_bindings,
                        store_conn="", store_type=None):
    if store_type:
        store = plugin.get(store_type, Store)()
        store.open(store_conn)
        graph = ConjunctiveGraph(store)
    else:
        store = None
        graph = ConjunctiveGraph()

    for prefix, uri in list(ns_bindings.items()):
        graph.namespace_manager.bind(prefix, uri, override=False)

    for fpath in input_files:
        use_format, kws = _format_and_kws(input_format)
        if fpath == '-':
            fpath = sys.stdin
        elif not input_format and guess:
            use_format = guess_format(fpath) or DEFAULT_INPUT_FORMAT
        graph.parse(fpath, format=use_format, **kws)

    if outfile:
        output_format, kws = _format_and_kws(output_format)
        kws.setdefault('base', None)
        graph.serialize(destination=outfile, format=output_format, **kws)

    if store:
        store.rollback()
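# The snippet above relies on a helper `_format_and_kws` (from rdflib's
# rdfpipe tool) that is not part of this excerpt. A minimal sketch of its
# plausible behaviour, reconstructed as an assumption rather than the exact
# upstream code: "fmt:+flag,-flag,key=value" splits into a format name and a
# keyword-argument dict for parse()/serialize().
def _format_and_kws(fmt):
    fmt, kws = fmt, {}
    if fmt and ':' in fmt:
        fmt, kwrepr = fmt.split(':', 1)
        for kw in kwrepr.split(','):
            if '=' in kw:
                k, v = kw.split('=', 1)
                kws[k] = v
            elif kw.startswith('-'):
                kws[kw[1:]] = False
            elif kw.startswith('+'):
                kws[kw[1:]] = True
            else:
                kws[kw] = True  # bare option, same as "+"
    return fmt, kws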
def test_turtle_namespace_prefixes(self):
    g = ConjunctiveGraph()
    n3 = """
    @prefix _9: <http://data.linkedmdb.org/resource/movie/> .
    @prefix p_9: <urn:test:> .
    @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
    p_9:a p_9:b p_9:c .
    <http://data.linkedmdb.org/resource/director/1>
        a <http://data.linkedmdb.org/resource/movie/director>;
        rdfs:label "Cecil B. DeMille (Director)";
        _9:director_name "Cecil B. DeMille" ."""

    g.parse(data=n3, format='n3')
    turtle = g.serialize(format="turtle")

    # Check round-tripping, just for kicks.
    g = ConjunctiveGraph()
    g.parse(data=turtle, format='turtle')
    # Shouldn't have got to here
    s = g.serialize(format="turtle")
    self.assertTrue(b('@prefix _9') not in s)
def serialize_store(db_conn, filename):
    createdb = False
    rdfstore = rdflib.plugin.get('Sleepycat', rdflib.store.Store)()
    # rdflib can create necessary structures if the store is empty
    rt = rdfstore.open(db_conn, create=False)
    cg = ConjunctiveGraph(store=rdfstore)
    f = open(filename, 'w')
    cg.serialize(f)
    f.close()
    return True
def extract_rdfa(url, outfile=sys.stdout, parser="rdfa", serializer="n3"):
    """
    Extract RDFa from a given URL

    Parsers are listed at https://rdflib.readthedocs.org/en/4.1.0/plugin_parsers.html
    Serializers are listed at https://rdflib.readthedocs.org/en/4.1.0/plugin_serializers.html
    """
    store = None
    graph = ConjunctiveGraph()
    graph.parse(url, format=parser)
    graph.serialize(destination=outfile, format=serializer)
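# Usage sketch for extract_rdfa (illustrative URL; assumes an RDFa-capable
# parser plugin is installed for the rdflib version in use):
#
#   with open("out.ttl", "w") as out:
#       extract_rdfa("https://www.w3.org/", outfile=out, serializer="turtle")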
def get(self):
    g = ConjunctiveGraph()
    ns = Namespace('http://purl.org/NET/mediatype#')
    for mt in models.MediaType.all():
        g.add((URIRef(mt.uri), RDF.type, ns['MediaType']))
        g.add((URIRef(mt.uri), RDFS.label, Literal(mt.name)))
        if mt.rfc_url:
            g.add((URIRef(mt.uri), RDFS.seeAlso, URIRef(mt.rfc_url)))
        if mt.application_url:
            g.add((URIRef(mt.uri), RDFS.seeAlso, URIRef(mt.application_url)))
    self.response.headers['Content-Type'] = 'application/rdf+xml'
    g.serialize(self.response.out)
def convert_gml(self, ttl_output_file, uri_part, specific_part):
    """
    Pelagios conversion GML to TTL
    @type ttl_output_file: string
    @param ttl_output_file: Absolute path to TTL output file
    @type uri_part: string
    @param uri_part: URI for the region to be displayed
        (e.g. http://earkdev.ait.ac.at/earkweb/sip2aip/working_area/sip2aip/34809536-b9f8-4c51-83d1-ef365ca658f5/)
    @type specific_part: string
    @param specific_part: Specific part that distinguishes the URI from other URIs (e.g. 1994)
    """
    cito_ns = Namespace("http://purl.org/spar/cito")
    cnt_ns = Namespace("http://www.w3.org/2011/content#")
    dcterms_ns = Namespace("http://purl.org/dc/terms/")
    foaf_ns = Namespace("http://xmlns.com/foaf/0.1/")
    geo_ns = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")
    geosparql_ns = Namespace("http://www.opengis.net/ont/geosparql#")
    gn_ns = Namespace("http://www.geonames.org/ontology#")
    lawd_ns = Namespace("http://lawd.info/ontology/")
    rdfs_ns = Namespace("http://www.w3.org/2000/01/rdf-schema#")
    skos_ns = Namespace("http://www.w3.org/2004/02/skos/core#")

    slovenia = URIRef("http://earkdev.ait.ac.at/earkweb/sip2aip/working_area/sip2aip/5c6f5563-7665-4719-a2b6-4356ea033c1d/#place/Slovenia")

    store = IOMemory()
    g = ConjunctiveGraph(store=store)
    g.bind("cito", cito_ns)
    g.bind("cnt", cnt_ns)
    g.bind("dcterms", dcterms_ns)
    g.bind("foaf", foaf_ns)
    g.bind("geo", geo_ns)
    g.bind("geosparql", geosparql_ns)
    g.bind("gn", gn_ns)
    g.bind("lawd", lawd_ns)
    g.bind("rdfs", rdfs_ns)
    g.bind("skos", skos_ns)

    graph_slovenian_districts = Graph(store=store, identifier=slovenia)
    gml_to_wkt = GMLtoWKT(self.gml_file)
    district_included = {}
    i = 1
    print "Processing GML file: %s" % self.gml_file
    for district_wkt in gml_to_wkt.get_wkt_linear_ring():
        techname = whsp_to_unsc(district_wkt["name"])
        print "District %d: %s" % (i, whsp_to_unsc(district_wkt["name"]))
        if techname not in district_included:
            district = URIRef("%s#place/%s/%s" % (uri_part, whsp_to_unsc(district_wkt["name"]), specific_part))
            graph_slovenian_districts.add((district, RDF.type, lawd_ns.Place))
            graph_slovenian_districts.add((district, dcterms_ns['isPartOf'], slovenia))
            graph_slovenian_districts.add((district, dcterms_ns['temporal'], Literal(str(district_wkt["year"]))))
            graph_slovenian_districts.add((district, gn_ns['countryCode'], Literal(u'SI')))
            graph_slovenian_districts.add((district, rdfs_ns['label'], Literal(district_wkt["name"], lang=u'si')))
            polygons = BNode()
            graph_slovenian_districts.add((district, geosparql_ns['hasGeometry'], polygons))
            g.add((polygons, geosparql_ns['asWKT'], Literal(district_wkt["polygon"])))
            district_included[techname] = True
        i += 1
    with open(ttl_output_file, 'w') as f:
        f.write(g.serialize(format='n3'))
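# Hypothetical call of convert_gml, using the example values from its
# docstring; the owning class and its constructor are not part of this
# excerpt, so `converter` here is an assumed name:
#
#   converter.convert_gml(
#       "/tmp/slovenia.ttl",
#       "http://earkdev.ait.ac.at/earkweb/sip2aip/working_area/sip2aip/34809536-b9f8-4c51-83d1-ef365ca658f5/",
#       "1994")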
def test_issue_250(self):
    """
    https://github.com/RDFLib/rdflib/issues/250

    When I have a ConjunctiveGraph with the default namespace set, for example

    import rdflib
    g = rdflib.ConjunctiveGraph()
    g.bind(None, "http://defaultnamespace")

    then the Trix serializer binds the default namespace twice in its XML
    output, once for the Trix namespace and once for the namespace I used:

    print(g.serialize(format='trix').decode('UTF-8'))

    <?xml version="1.0" encoding="utf-8"?>
    <TriX
      xmlns:xml="http://www.w3.org/XML/1998/namespace"
      xmlns="http://defaultnamespace"
      xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
      xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
      xmlns="http://www.w3.org/2004/03/trix/trix-1/"
    />
    """
    graph = ConjunctiveGraph()
    graph.bind(None, "http://defaultnamespace")
    sg = graph.serialize(format='trix').decode('UTF-8')
    self.assertTrue('xmlns="http://defaultnamespace"' not in sg, sg)
    self.assertTrue('xmlns="http://www.w3.org/2004/03/trix/trix-1/' in sg, sg)
class SiocWiki(object):
    def __init__(self, uri, title=None, created=None):
        self.graph = Graph()
        self.graph.bind('sioc', SIOC)
        self.graph.bind('dc', DC)
        self.graph.bind('dcterms', DCTERMS)
        self.graph.bind('rdf', RDF)
        self._add_site(uri, title)

    def _add_site(self, uri, title):
        node = URIRef(uri)
        self.graph.add((node, RDF.type, SIOC['Site']))
        self.graph.add((node, DC['title'], Literal(title)))
        return node

    def add_page(self, content, title, uri, updated):
        node = URIRef(uri)
        self.graph.add((node, RDF.type, SIOC['Wiki']))
        self.graph.add((node, SIOC['link'], URIRef(uri)))
        self.graph.add((node, DC['title'], Literal(title)))
        self.graph.add((node, DC['content'], Literal(content)))
        self.graph.add((node, DCTERMS['updated'], Literal(updated)))

    def to_str(self):
        return self.graph.serialize(format="pretty-xml")
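# Usage sketch for SiocWiki. The SIOC/DC/DCTERMS namespace constants are
# assumed to be defined at module level, e.g. SIOC = Namespace("http://rdfs.org/sioc/ns#");
# the URIs below are illustrative only:
#
#   wiki = SiocWiki("http://example.org/wiki", title="Example Wiki")
#   wiki.add_page("Hello, world", "Front page",
#                 "http://example.org/wiki/FrontPage", "2012-01-01")
#   print(wiki.to_str())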
def check_n3_serialize(fpath, fmt, verbose=False):
    g = ConjunctiveGraph()
    _parse_or_report(verbose, g, fpath, format=fmt)
    if verbose:
        for t in g:
            print t
        print "========================================"
        print "Parsed OK!"
    s = g.serialize(format='n3')
    if verbose:
        print s
    g2 = ConjunctiveGraph()
    _parse_or_report(verbose, g2, data=s, format='n3')
    if verbose:
        print g2.serialize()
    crapCompare(g, g2)
def check_nt_serialize(fpath, fmt, verbose=False):
    g = ConjunctiveGraph()
    _parse_or_report(verbose, g, fpath, format=fmt)
    if verbose:
        for t in g:
            print t
        print "========================================"
        print "Parsed OK!"
    s = g.serialize(format='nt')
    if verbose:
        print "Serialized to: ", s
    g2 = ConjunctiveGraph()
    _parse_or_report(verbose, g2, data=s, format='nt')
    if verbose:
        print g2.serialize()
    crapCompare(g, g2)
def output_to_oac(fileid, dir, metadata, annotations):
    """
    TODO
    """
    # import libraries
    from rdflib import Namespace, BNode, Literal, URIRef, RDF, RDFS
    from rdflib.graph import Graph, ConjunctiveGraph
    from rdflib.plugins.memory import IOMemory

    # declare namespaces
    oac = Namespace("http://www.w3.org/ns/oa#")
    perseus = Namespace("http://data.perseus.org/citations/")
    myanno = Namespace("http://hellespont.org/annotations/jstor")

    store = IOMemory()
    # initialise the graph
    g = ConjunctiveGraph(store=store)
    # bind namespaces
    g.bind("oac", oac)
    g.bind("perseus", perseus)
    g.bind("myanno", myanno)

    for n, ann in enumerate(metadata["citations"]):
        anno1 = URIRef(myanno["#%i" % n])
        g.add((anno1, RDF.type, oac["Annotation"]))
        g.add((anno1, oac["hasTarget"], URIRef("%s%s" % ("http://jstor.org/stable/", metadata["doi"]))))
        g.add((anno1, RDFS.label, Literal(ann["label"])))
        g.add((anno1, oac["hasBody"], perseus[ann["ctsurn"]]))
        g.add((anno1, oac["motivatedBy"], oac["linking"]))

    fname = "%s%s" % (dir, fileid.replace(".txt", ".ttl"))
    f = open(fname, "w")
    f.write(g.serialize(format="turtle"))
    f.close()
    return
def view(name=None, format=None, view=None):
    self.db.store.nsBindings = {}
    content_type = None
    if format is not None:
        if format in extensions:
            content_type = extensions[format]
        else:
            name = '.'.join([name, format])
    #argstring = '&'.join(["%s=%s"%(k,v) for k,v in request.args.iteritems(multi=True) if k != 'value'])
    if name is not None:
        #if len(argstring) > 0:
        #    name = name + "?" + argstring
        entity = self.NS.local[name]
    elif 'uri' in request.args:
        entity = URIRef(request.args['uri'])
    else:
        entity = self.NS.local.Home
    #print(request.method, 'view()', entity, view)
    if request.method == 'POST':
        print("uploading file", entity)
        if len(request.files) == 0:
            flash('No file uploaded')
            return redirect(request.url)
        upload_type = rdflib.URIRef(request.form['upload_type'])
        self.add_files(entity, [y for x, y in request.files.items(multi=True)],
                       upload_type=upload_type)
        url = "/about?%s" % urlencode(dict(uri=str(entity), view="view"))
        print("redirecting to", url)
        return redirect(url)
    elif request.method == 'DELETE':
        self.delete_file(entity)
        return '', 204
    elif request.method == 'GET':
        resource = self.get_resource(entity)

        # 'view' is the default view
        fileid = resource.value(self.NS.whyis.hasFileID)
        if fileid is not None and 'view' not in request.args:
            print(resource.identifier, fileid)
            f = self.file_depot.get(fileid)
            fsa = FileServeApp(f, self.config["file_archive"].get("cache_max_age", 3600 * 24 * 7))
            return fsa

        if content_type is None:
            content_type = request.headers['Accept'] if 'Accept' in request.headers else 'text/turtle'
        #print entity
        fmt = sadi.mimeparse.best_match([mt for mt in list(dataFormats.keys()) if mt is not None],
                                        content_type)
        if 'view' in request.args or fmt in htmls:
            return render_view(resource)
        elif fmt in dataFormats:
            output_graph = ConjunctiveGraph()
            result, status, headers = render_view(resource, view='describe')
            output_graph.parse(data=result, format="json-ld")
            return output_graph.serialize(format=dataFormats[fmt]), 200, {'Content-Type': content_type}
        #elif 'view' in request.args or sadi.mimeparse.best_match(htmls, content_type) in htmls:
        else:
            return render_view(resource)
def main():
    import optparse
    prs = optparse.OptionParser()
    prs.add_option("-i", "--ipython", action="store_true")
    prs.add_option("--print-store", action="store_true")
    prs.add_option("--drop-store", action="store_true")
    (opts, args) = prs.parse_args()

    logging.getLogger().setLevel(logging.DEBUG)

    initialize_rdflib()
    store = store_from_connstr(DEFAULT_STORE_URI)

    if opts.drop_store:
        store.destroy(DEFAULT_STORE_URI)
        exit(0)

    # Create a new named graph
    graph_uri = get_session_uri()
    graph = Graph(store, identifier=URIRef(graph_uri))  # !

    if opts.print_store:
        print store

    cmt = """
    contexts = sorted(set(graph.contexts()))
    print "Contexts:"
    for c in contexts:
        print c
    print ""
    for c in contexts:
        print '\n\n', c
        for t in graph.triples((None,None,None), context=c):
            print t
    """

    print graph.serialize(format='n3')

    if opts.ipython:
        #import sys
        import IPython
        IPython.Shell.IPShellEmbed(argv=args)(local_ns=locals(), global_ns=globals())
def test_pretty_broken_xmlliteral(self):
    # given:
    g = ConjunctiveGraph()
    g.add((BNode(), RDF.value, Literal(u'''<p ''', datatype=RDF.XMLLiteral)))
    # when:
    xmlrepr = g.serialize(format='pretty-xml')
    # then:
    assert u'''<rdf:value rdf:datatype="http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral"><p '''.encode('utf-8') in xmlrepr
def test_pretty_xmlliteral(self):
    # given:
    g = ConjunctiveGraph()
    g.add((BNode(), RDF.value, Literal(
        u'''<p xmlns="http://www.w3.org/1999/xhtml">See also <a href="#aring">Å</a></p>''',
        datatype=RDF.XMLLiteral)))
    # when:
    xmlrepr = g.serialize(format='pretty-xml')
    # then:
    assert u'''<rdf:value rdf:parseType="Literal"><p xmlns="http://www.w3.org/1999/xhtml">See also <a href="#aring">Å</a></p></rdf:value>'''.encode('utf-8') in xmlrepr
def post(self):
    query = self.request.get("content")
    nrOfResults = self.request.get("amount")
    try:
        number = int(nrOfResults)
    except ValueError:
        number = 0

    literals = re.findall(r'"(.+?)"', query)
    urls = processLiterals(literals, number)

    graph = ConjunctiveGraph()
    for url in urls:
        # Original URL fetch
        xmlresult = urlfetch.fetch(url, deadline=60, method=urlfetch.GET)
        if xmlresult.status_code == 200:
            iwa = Namespace('http://iwa2012-18-2.appspot.com/#')
            idns = Namespace('http://iwa2012-18-2.appspot.com/id/#')
            venuens = Namespace('http://iwa2012-18-2.appspot.com/venueid/#')
            tree = etree.fromstring(xmlresult.content)
            for event in tree.findall('events/event'):
                id = event.attrib['id']
                title = event.find('title')
                url = event.find('url')
                venueid = event.find('venue_id')
                venueurl = event.find('venue_url')
                venuename = event.find('venue_name')
                graph.add((idns[id], iwa['hasTitle'], Literal(title.text)))
                graph.add((idns[id], iwa['hasUrl'], Literal(url.text)))
                graph.add((venuens[id], iwa['hasVenueName'], Literal(venuename.text)))
                graph.add((venuens[id], iwa['hasUrl'], Literal(venueurl.text)))
                # fixed: the original had a stray extra closing parenthesis here
                graph.add((idns[id], iwa['atVenue'], venuens[id]))
        else:
            # fixed: the original referenced an undefined name `xml` and
            # concatenated an int onto a str
            print "Something went wrong with the connection to the Eventful server. Status code: " + str(xmlresult.status_code)
    print graph.serialize()
def rdf_description(name, notation='xml'):
    """
    Function takes title of node, and rdf notation.
    """
    valid_formats = ["xml", "n3", "ntriples", "trix"]
    default_graph_uri = "http://gstudio.gnowledge.org/rdfstore"
    configString = "/var/tmp/rdfstore"

    # Get the Sleepycat plugin.
    store = plugin.get('Sleepycat', Store)('rdfstore')

    # Open previously created store, or create it if it doesn't exist yet
    graph = Graph(store="Sleepycat", identifier=URIRef(default_graph_uri))
    path = mkdtemp()
    rt = graph.open(path, create=False)
    if rt == NO_STORE:
        # There is no underlying Sleepycat infrastructure, create it
        graph.open(path, create=True)
    else:
        assert rt == VALID_STORE, "The underlying store is corrupt"

    # Now we'll add some triples to the graph & commit the changes
    rdflib = Namespace('http://sbox.gnowledge.org/gstudio/')
    graph.bind("gstudio", "http://gnowledge.org/")
    exclusion_fields = ["id", "rght", "node_ptr_id", "image", "lft", "_state",
                        "_altnames_cache", "_tags_cache", "nid_ptr_id",
                        "_mptt_cached_fields"]
    node = Objecttype.objects.get(title=name)
    node_dict = node.__dict__
    subject = str(node_dict['id'])
    for key in node_dict:
        if key not in exclusion_fields:
            predicate = str(key)
            pobject = str(node_dict[predicate])
            graph.add((rdflib[subject], rdflib[predicate], Literal(pobject)))
    graph.commit()
    print graph.serialize(format=notation)
    graph.close()
def writeFile(self, stmts, ctx, fileWords):
    g = ConjunctiveGraph()
    doc = {'ctx': ctx}
    for s in stmts:
        g.add(s)
        if s[1] == SIOC.has_reply:
            doc['topic'] = s[0]
        if s[1] == DCTERMS.created:
            # expecting 2 of these, but same value
            doc['created'] = parse(s[2])
    doc['n3'] = g.serialize(format="n3")
    self.mongo['comment'].insert(doc, safe=True)
def gnis2rdf(gnisfilename, rdffilename):
    gnisfile = open(gnisfilename, "rb")
    store = ConjunctiveGraph(identifier="temp")
    if not gnisfile:
        print("Error opening gnis file!")
        return False
    gnisreader = csv.reader(gnisfile, delimiter="|")
    # Drop first row
    gnisreader.next()
    for r in gnisreader:
        InsertGNISFeature(r, store)
    # Add prefixes to store
    store.bind("gnis", gnis)
    store.bind("geo", geo)
    print("Serializing rdf...")
    store.serialize(destination=rdffilename, format="n3")
    print("created " + str(len(store)) + " triples")
def test_escaping_of_triple_doublequotes():
    """
    Issue 186 - Check escaping of multiple doublequotes.

    A serialization/deserialization roundtrip of a certain class of Literals
    fails when there are both newline characters and multiple subsequent
    quotation marks in the lexical form of the Literal. In this case invalid
    N3 is emitted by the serializer, which in turn cannot be parsed correctly.
    """
    g = ConjunctiveGraph()
    g.add((URIRef('http://foobar'), URIRef('http://fooprop'), Literal('abc\ndef"""""')))
    # assert g.serialize(format='n3') == '@prefix ns1: <http:// .\n\nns1:foobar ns1:fooprop """abc\ndef\\"\\"\\"\\"\\"""" .\n\n'
    g2 = ConjunctiveGraph()
    g2.parse(data=g.serialize(format='n3'), format='n3')
    assert g.isomorphic(g2) is True
def track(self, resource):
    graph = ConjunctiveGraph()
    sparql = SPARQLWrapper(self.conf.get_SPARQL())
    queue = [resource]
    while len(queue) != 0:
        target = queue.pop()
        query = DESCRIBE_QUERY.replace('__RESOURCE__', target.n3())
        query = query.replace('__RELEASE__', self.conf.get_graph_name('release'))
        query = query.replace('__RULES__', self.conf.get_graph_name('rules'))
        query = query.replace('__RAW_DATA__', self.conf.get_graph_name('raw-data'))
        sparql.setQuery(query)
        results = sparql.query().convert()
        for statement in results:
            # Add the statement to the graph
            graph.add(statement)
            # If it relates to another resource we describe, queue it
            (_, p, o) = statement
            if p.startswith(PROV):
                if o.startswith(self.conf.get_namespace('data')):
                    queue.append(o)
    print graph.serialize(format='turtle')
def gatherAndExportUserData(repo_name, userId, userToken):
    store = IOMemory()
    g = ConjunctiveGraph(store=store)
    g.bind("av", ns)
    g.bind("sc", sc)
    g.bind("dbo", dbo)
    g.bind("fb", fb)

    createGraphForFBUser(store, repo_name, userId, userToken)

    graphString = g.serialize(format="n3")
    with open("user.ttl", "w") as f:
        f.write(graphString)
    response = sesame.import_content(repo_name, graphString)
def write_graph(data_handle, out_handle, format='n3'):
    graph = Graph()
    count = 0
    for record in generate_records(data_handle):
        count += 1
        if count % 1000:
            sys.stderr.write(".")
        else:
            sys.stderr.write(str(count))
        for triple in get_triples(record):
            graph.add(triple)
    graph.commit()
    current_site = Site.objects.get_current()
    domain = 'https://%s' % current_site.domain
    out_handle.write(graph.serialize(format=format, base=domain, include_base=True))
    return count
def testSerialize(self):
    s1 = URIRef('store:1')
    r1 = URIRef('resource:1')
    r2 = URIRef('resource:2')
    label = URIRef('predicate:label')

    g1 = Graph(identifier=s1)
    g1.add((r1, label, Literal("label 1", lang="en")))
    g1.add((r1, label, Literal("label 2")))

    s2 = URIRef('store:2')
    g2 = Graph(identifier=s2)
    g2.add((r2, label, Literal("label 3")))

    g = ConjunctiveGraph()
    for s, p, o in g1.triples((None, None, None)):
        g.addN([(s, p, o, g1)])
    for s, p, o in g2.triples((None, None, None)):
        g.addN([(s, p, o, g2)])
    r3 = URIRef('resource:3')
    g.add((r3, label, Literal(4)))

    r = g.serialize(format='trix')
    g3 = ConjunctiveGraph()
    from StringIO import StringIO
    g3.parse(StringIO(r), format='trix')

    for q in g3.quads((None, None, None)):
        # TODO: Fix once getGraph/getContext is in conjunctive graph
        if isinstance(q[3].identifier, URIRef):
            tg = Graph(store=g.store, identifier=q[3].identifier)
        else:
            # BNode, this is a bit ugly
            # we cannot match the bnode to the right graph automagically
            # here I know there is only one anonymous graph,
            # and that is the default one, but this is not always the case
            tg = g.default_context
        self.assertTrue(q[0:3] in tg)
def gatherAndExportGenreData(repo_name):
    store = IOMemory()
    g = ConjunctiveGraph(store=store)
    g.bind("av", ns)
    g.bind("sc", sc)
    g.bind("dbo", dbo)
    g.bind("fb", fb)

    genreRelations = dbpedia.getDBpediaGenreRelations()
    genreNames = dbpedia.getDbpediaMusicGenres()
    createGraphForGenres(store, genreNames, genreRelations)

    graphString = g.serialize(format="n3")
    with open("genres.ttl", "w") as f:
        f.write(graphString)
    response = sesame.import_content(repo_name, graphString)
def processHEAD(request, return_content=False):
    '''
    Returns an httplib.HTTPRequest
    '''
    graph = get_graph_from_request(request)
    accept = http_accept(request)
    if accept not in FORMAT_MAP.values():
        return HttpResponse(status=406)

    if graph is None:
        g = ConjunctiveGraph(store=CharmeMiddleware.get_store())
    else:
        g = generate_graph(CharmeMiddleware.get_store(), URIRef(graph))
    content = g.serialize(format=rdf_format_from_mime(accept))
    if return_content:
        return HttpResponse(content=content)
    return HttpResponse()
def gatherAndExportGlobalData(repo_name):
    store = IOMemory()
    g = ConjunctiveGraph(store=store)
    g.bind("av", ns)
    g.bind("sc", sc)
    g.bind("dbo", dbo)
    g.bind("fb", fb)

    venues = importVenuesFromFile("fb_data_stuff/venues.txt")
    events = importEventsFromDirectory("fb_data_stuff/events/")
    createGraphForEvents(store, repo_name, events)
    createGraphForVenues(store, venues)
    createGraphForEventArtistsAndGenres(store, repo_name, events)

    graphString = g.serialize(format="n3")
    with open("global.ttl", "w") as f:
        f.write(graphString)
def generictest(testFile):
    func_name = __name__ = __doc__ = id = 'test_sparql.' + \
        os.path.splitext(testFile)[0][8:].translate(maketrans('-/', '__'))
    store = plugin.get(STORE, Store)()
    bootStrapStore(store)
    store.commit()
    prefix = testFile.split('.rq')[-1]
    manifestPath = '/'.join(testFile.split('/')[:-1] + ['manifest.n3'])
    manifestPath2 = '/'.join(testFile.split('/')[:-1] + ['manifest.ttl'])
    queryFileName = testFile.split('/')[-1]
    store = plugin.get(STORE, Store)()
    store.open(configString, create=False)
    assert len(store) == 0
    manifestG = ConjunctiveGraph(store)
    if not os.path.exists(manifestPath):
        assert os.path.exists(manifestPath2)
        manifestPath = manifestPath2
    manifestG.default_context.parse(open(manifestPath), publicID=URIRef(TEST_BASE), format='n3')
    manifestData = manifestG.query(
        MANIFEST_QUERY,
        processor='sparql',
        initBindings={'query': TEST_BASE[queryFileName]},
        initNs=manifestNS,
        DEBUG=False)
    store.rollback()
    store.close()
    for source, testCaseName, testCaseComment, expectedRT in manifestData:
        if expectedRT:
            expectedRT = '/'.join(testFile.split('/')[:-1] + [expectedRT.replace(TEST_BASE, '')])
        if source:
            source = '/'.join(testFile.split('/')[:-1] + [source.replace(TEST_BASE, '')])
        testCaseName = testCaseComment and testCaseComment or testCaseName
        # log.debug("## Source: %s ##" % source)
        # log.debug("## Test: %s ##" % testCaseName)
        # log.debug("## Result: %s ##" % expectedRT)

        # Expected results
        if expectedRT:
            store = plugin.get(STORE, Store)()
            store.open(configString, create=False)
            resultG = ConjunctiveGraph(store).default_context
            log.debug("###" * 10)
            log.debug("parsing: %s" % open(expectedRT).read())
            log.debug("###" * 10)
            assert len(store) == 0
            # log.debug("## Parsing (%s) ##" % (expectedRT))
            if not trialAndErrorRTParse(resultG, expectedRT, DEBUG):
                log.debug("Unexpected result format (for %s), skipping" % (expectedRT))
                store.rollback()
                store.close()
                continue
            log.debug("## Done .. ##")
            rtVars = [rtVar for rtVar in resultG.objects(None, RESULT_NS.resultVariable)]
            bindings = []
            resultSetNode = resultG.value(predicate=RESULT_NS.value, object=RESULT_NS.ResultSet)
            for solutionNode in resultG.objects(resultSetNode, RESULT_NS.solution):
                bindingDict = dict([(key, None) for key in rtVars])
                for bindingNode in resultG.objects(solutionNode, RESULT_NS.binding):
                    value = resultG.value(subject=bindingNode, predicate=RESULT_NS.value)
                    name = resultG.value(subject=bindingNode, predicate=RESULT_NS.variable)
                    bindingDict[name] = value
                rbinds = [bindingDict[vName] for vName in rtVars]
                # print("Rbinds", rbinds)
                if len(rbinds) > 1 and (isinstance(rbinds, list) or isinstance(rbinds, tuple)):
                    bindings.append(frozenset(rbinds))
                elif len(rbinds) == 1 and (isinstance(rbinds, list) or isinstance(rbinds, tuple)):
                    bindings.append(rbinds[0])
                else:
                    bindings.append(rbinds)
                # bindings.append(tuple([bindingDict[vName] for vName in rtVars]))
            log.debug(open(expectedRT).read())
            store.rollback()
            store.close()

        if testFile in tests2Skip.keys():
            log.debug("Skipping test (%s) %s\n" % (testFile, tests2Skip[testFile]))
            raise SkipTest("Skipping test (%s) %s\n" % (testFile, tests2Skip[testFile]))

        query = open(testFile).read()
        log.debug("### %s (%s) ###" % (testCaseName, testFile))
        log.debug(query)
        p = parse(query)  # ,DEBUG_PARSE)
        log.debug(p)
        if EVALUATE and source:
            log.debug("### Source Graph: ###")
            log.debug(open(source).read())
            store = plugin.get(STORE, Store)()
            store.open(configString, create=False)
            g = ConjunctiveGraph(store)
            try:
                g.parse(open(source), format='n3')
            except:
                log.debug("Unexpected data format (for %s), skipping" % (source))
                store.rollback()
                store.close()
                continue
            rt = g.query(query, processor='sparql', DEBUG=False)
            if expectedRT:
                try:
                    result = rt.result
                except AttributeError:
                    result = rt
                if isinstance(result, Graph):
                    resgraph = open(graphtests[testFile]).read()
                    store = plugin.get(STORE, Store)()
                    store.open(configString, create=False)
                    g = ConjunctiveGraph(store)
                    g.parse(data=resgraph, format="n3")
                    assert result == g, \
                        "### Test Failed: ###\n\nB:\n%s\n\nR:\n%s\n\n" % \
                        (g.serialize(format="n3"), result.serialize(format="n3"))
                else:
                    # result = [r[0] for r in result if isinstance(r, (tuple, list))]
                    def stab(r):
                        if isinstance(r, (tuple, list)):
                            return frozenset(r)
                        else:
                            return r
                    results = set([stab(r) for r in result])
                    assert set(bindings).difference(results) == set([]) or set(bindings) == results, \
                        "### Test Failed: ###\n\nB:\n%s\n\nR:\n%s\n\n" % \
                        (set(bindings), results)
                log.debug("### Test Passed: ###")
        store.rollback()
def query_lode(self, id):
    var = "http://inpho.cogs.indiana.edu/thinker/" + id

    # initialize dictionaries to store temporary results
    dbPropResults = {}
    inpho_DB = {}
    DB_inpho = {}
    dbpedia_web = {}
    triples = {}

    # init graphs for LODE and mapped data
    gLODE = ConjunctiveGraph()
    gReturn = ConjunctiveGraph()

    # import InPhO data
    gLODE.parse("http://inphodata.cogs.indiana.edu/lode/out_n3.20140207.rdf", format="n3")

    # builds a set of triples with the inpho id as the first entry and the
    # dbpedia id as the second
    resultsLODE = gLODE.query("""
        SELECT ?thinker_LODE ?thinkerDB WHERE {
            ?thinker_LODE owl:sameAs ?thinkerDB
            FILTER (regex(str(?thinker_LODE),"http://inpho.cogs.indiana.edu","i")
                    && regex(str(?thinkerDB),"http://dbpedia.org/resource/","i")).
        }
        """)

    # load in property mapping between inpho-dbpedia
    prop_map_filename = config.get_data_path("rdf_map.txt")
    with open(prop_map_filename, "r") as f:
        dbprops = csv.reader(f, delimiter="\t")
        for dbprop in dbprops:
            dbPropResults[dbprop[1]] = dbprop[0]
            dbpedia_web[dbprop[1].split(":")[1]] = dbprop[2]

    # iterate through triples and store mappings
    for triple in resultsLODE:
        # key: inpho url, value: dbpedia url
        inpho_DB[str(triple[0])] = str(triple[1])
        # key: dbpedia url, value: inpho url
        DB_inpho[str(triple[1])] = str(triple[0])

    # queries for all relationships in dbpedia
    sparqlDB = SPARQLWrapper("http://inpho-dataserve.cogs.indiana.edu:8890/sparql/")
    sparqlDB.setReturnFormat(JSON)
    for inpho, DB in inpho_DB.iteritems():
        predicate = {}
        if str(DB_inpho.get(DB)) == var:
            for dbprop in dbPropResults:
                sparqlDB.setQuery("""
                    PREFIX dbpprop: <http://dbpedia.org/ontology/>
                    SELECT ?b WHERE {
                        <""" + DB + """> """ + dbprop + """ ?b.
                        FILTER (regex(str(?b),"dbpedia.org/resource/","i")).
                    }""")
                resultsDB = sparqlDB.query().convert()
                predicate[dbprop] = resultsDB["results"]["bindings"]
            triples[DB] = predicate

    # retrieve native python object
    c.entity = h.fetch_obj(Entity, id, new_id=True)
    existing_predicate_list = []
    existing_object_list = []
    predicates_to_compare = ["influenced", "influenced_by", "teachers", "students"]
    for subject, predicate in triples.iteritems():
        for predicate1, objectn in predicate.iteritems():
            predicate_to_match = predicate1.split(":")[1]
            attr = getattr(c.entity, dbpedia_web[predicate_to_match])
            for attr1 in attr:
                if dbpedia_web[predicate_to_match] in predicates_to_compare:
                    existing_predicate_list.append(dbpedia_web[predicate_to_match] + ":" + attr1.wiki)

    # maps from dbpedia relationships back to inpho relationships
    for subject, predicate in triples.iteritems():
        for predicate1, objectn in predicate.iteritems():
            for object1 in objectn:
                temp_str = (dbpedia_web[predicate1.split(":")[1]] + ":" +
                            str(object1["b"]["value"]).split("/")[-1])
                if temp_str not in existing_predicate_list:
                    # reverse lookup of the inpho id for the object
                    DB_Entry = DB_inpho.get(object1["b"]["value"])
                    if DB_Entry is None:
                        # if there is no inpho id, leave it as the dbpedia id
                        gReturn.add((URIRef(subject),
                                     URIRef(dbPropResults.get(predicate1)),
                                     URIRef(object1["b"]["value"])))
                    else:
                        # return the properly mapped id
                        # TODO: use attr to filter DB_Entry
                        gReturn.add((URIRef(subject),
                                     URIRef(dbPropResults.get(predicate1)),
                                     URIRef(DB_Entry)))
    return gReturn.serialize()
gNode1.add((SNode2, hs['EndTime'], Literal("2012-06-19T01:52:02Z")))
gNode1.add((SNode3, hs['hasTemperature'], Literal('64')))
gNode1.add((SNode3, hs['hasLight'], Literal('67')))
gNode1.add((SNode3, hs['hasHumidity'], Literal('88')))
gNode1.add((SNode3, hs['Located'], Literal('')))
gNode1.add((SNode3, hs['StartTime'], Literal("2012-06-19T01:52:02Z")))
gNode1.add((SNode3, hs['EndTime'], Literal("2012-06-19T01:52:02Z")))

# separate and display the graphs in n3
print gNode1.serialize(format='n3')
print "==================="

# Display full graph
print g.serialize(format='n3')
# add a graph for Mary's facts to the Conjunctive Graph
gmary = Graph(store=store, identifier=cmary)

# Mary's graph only contains the URI of the person she loves, not his cute name
gmary.add((mary, ns["hasName"], Literal("Mary")))
gmary.add((mary, ns["loves"], john))

# add a graph for John's facts to the Conjunctive Graph
gjohn = Graph(store=store, identifier=cjohn)

# John's graph contains his cute name
gjohn.add((john, ns["hasCuteName"], Literal("Johnny Boy")))

# enumerate contexts
for c in g.contexts():
    print("-- %s " % c)

# separate graphs
print(gjohn.serialize(format="n3"))
print("===================")
print(gmary.serialize(format="n3"))
print("===================")

# full graph
print(g.serialize(format="n3"))

# query the conjunction of all graphs
xx = None
for x in g[mary : ns.loves / ns.hasCuteName]:  # type: ignore[misc]
    xx = x
print("Q: Who does Mary love?")
print("A: Mary loves {}".format(xx))
gmary.add((mary, LOVE.loves, john))

# Add a graph containing John's facts to the Conjunctive Graph
gjohn = Graph(store=store, identifier=cjohn)

# John's graph contains his cute name
gjohn.add((john, LOVE.hasCuteName, Literal("Johnny Boy")))

# Enumerate contexts
print("Contexts:")
for c in g.contexts():
    print(f"-- {c.identifier} ")
print("===================")

# Separate graphs
print("John's Graph:")
print(gjohn.serialize())
print("===================")
print("Mary's Graph:")
print(gmary.serialize())
print("===================")
print("Full Graph")
print(g.serialize())
print("===================")

print("Query the conjunction of all graphs:")
xx = None
for x in g[mary : LOVE.loves / LOVE.hasCuteName]:
    xx = x
print("Q: Who does Mary love?")
print("A: Mary loves {}".format(xx))
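# The two excerpts above are fragments; the variables they use (store, g, ns
# or LOVE, mary, john, cmary, cjohn) come from setup along these lines. This
# is a sketch modelled on the rdflib "contexts" example (compare
# DoTheTestMemory below); with rdflib >= 6 the in-memory store is Memory
# rather than IOMemory:
from rdflib import ConjunctiveGraph, Graph, Literal, Namespace, URIRef
from rdflib.plugins.stores.memory import Memory

ns = Namespace("http://love.com#")  # the second excerpt calls this LOVE
mary = URIRef("http://love.com/lovers/mary")
john = URIRef("http://love.com/lovers/john")
cmary = URIRef("http://love.com/lovers/context_mary")
cjohn = URIRef("http://love.com/lovers/context_john")

store = Memory()
g = ConjunctiveGraph(store=store)
g.bind("love", ns)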
class Hisco2RDF():
    '''
    Scrapes the HISCO Web site
    The hierarchy goes as "master > minor > rubri > micro"
    '''
    def __init__(self):
        # The graph to store the data
        self.graph = ConjunctiveGraph()
        self.graph.namespace_manager.bind('skos', SKOS)
        self.graph.namespace_manager.bind('hisco', HISCO)
        self.graph.namespace_manager.bind('dcterms', DCTERMS)
        self.graph.namespace_manager.bind('sdmx-dimension', SDMX_DIMENSION)
        self.graph.namespace_manager.bind('sdmx-code', SDMX_CODE)
        self.graph.namespace_manager.bind('qb', QB)

        # SQLite DB for the cache
        self.cache = sqlite3.connect('cache.db')
        cursor = self.cache.cursor()
        cursor.execute("CREATE TABLE IF NOT EXISTS page (url text, html text)")
        self.cache.commit()

    def __del__(self):
        self.cache.close()

    def get_page(self, url):
        #log.debug("Load %s" % url)
        c = self.cache.cursor()
        c.execute("SELECT * FROM page WHERE url = ?", (url,))
        res = c.fetchone()
        doc = None
        if res == None:
            doc = requests.get(url).content
            c.execute("INSERT INTO page VALUES (?,?)", (url, doc))
            self.cache.commit()
        else:
            (_, doc) = res
        return BeautifulSoup(doc)

    def save_output(self):
        # Add more things needed for DataCubes
        dimprop = HISCO['occupation']
        self.graph.add((dimprop, RDF.type, QB['DimensionProperty']))
        self.graph.add((dimprop, RDFS.range, SKOS.Collection))
        self.graph.add((dimprop, QB['Concept'], SKOS.Collection))
        self.graph.add((dimprop, RDFS.label, Literal('Occupation code', lang='en')))
        self.graph.add((dimprop, RDFS.comment, Literal('The HISCO group of the occupation', lang='en')))

        # Print to the screen
        #outfile = sys.stdout.buffer
        #self.graph.serialize(destination=outfile, format='n3')

        # Save to the file
        outfile = open('../hisco.ttl', "wb")
        self.graph.serialize(destination=outfile, format='n3')
        outfile.close()

    def parse_hisco_tree(self):
        '''
        Parse the hisco tree
        '''
        # Load the page
        doc = self.get_page(ROOT + HISCO_TREE)

        # Find the major groups
        major_groups = []
        major_group = None
        for table in doc.find_all('table', attrs={'border': '0'}):
            for row in table.find_all('tr'):
                for col in row.find_all('td'):
                    # Skip empty rows
                    if len(col.text) == 1:
                        continue
                    # We are starting a new group
                    if col.text.startswith('Majorgroup'):
                        # Save the one we were building if any
                        if major_group != None:
                            major_groups.append(major_group)
                        m = re.search("Majorgroup ([^ ]*) ", col.text)
                        major_group = {}
                        major_group['title'] = col.text
                        major_group['code'] = m.group(1).replace('/', '-')
                    # We have a description
                    if col.text.startswith('Workers'):
                        major_group['description'] = col.text
                    # We have links to minor
                    if col.text.startswith('List Minor'):
                        link = col.find_all('a')[0]['href']
                        major_group.setdefault('links', [])
                        major_group['links'].append(link)
        # Add the last group in the making
        if major_group != None:
            major_groups.append(major_group)

        # Add the groups to the graph
        for group in major_groups:
            major_group_uri = self._get_group_uri(group['code'])
            self.graph.add((major_group_uri, RDF.type, SKOS['ConceptScheme']))
            self.graph.add((major_group_uri, DCTERMS.title, Literal(group['title'])))
            self.graph.add((major_group_uri, DCTERMS.description, Literal(group['description'])))

        # Now move onto the minor groups following the links
        for major_group in major_groups:
            major_group_uri = self._get_group_uri(major_group['code'])
            for minor_link in major_group['links']:
                # Look for the minor groups
                minor_groups = self._parse_records_table(minor_link, 2)

                # Add the groups to the graph
                for minor_group in minor_groups:
                    minor_group_uri = self._get_group_uri(minor_group['code'])
                    self.graph.add((minor_group_uri, RDF.type, SKOS['ConceptScheme']))
                    self.graph.add((minor_group_uri, RDFS.label, Literal(minor_group['title'])))
                    self.graph.add((minor_group_uri, DCTERMS.description, Literal(minor_group['description'])))
                    self.graph.add((major_group_uri, SKOS.related, minor_group_uri))

                    # Go one level deeper into the rubri
                    for rubri_link in minor_group['links']:
                        # Look for the rubri groups
                        rubri_groups = self._parse_records_table(rubri_link, 3)

                        # Add the groups to the graph
                        for rubri_group in rubri_groups:
                            rubri_group_uri = self._get_group_uri(rubri_group['code'])
                            self.graph.add((rubri_group_uri, RDF.type, SKOS['ConceptScheme']))
                            self.graph.add((rubri_group_uri, RDFS.label, Literal(rubri_group['title'])))
                            self.graph.add((rubri_group_uri, DCTERMS.description, Literal(rubri_group['description'])))
                            self.graph.add((minor_group_uri, SKOS.related, rubri_group_uri))

                            # And one deeper for the micro
                            for micro_link in rubri_group['links']:
                                # Look for the micro groups
                                micro_groups = self._parse_records_table(micro_link, 5)

                                # Add the groups to the graph
                                for micro_group in micro_groups:
                                    hisco_uri = self._get_hisco_uri(micro_group['code'])
                                    self.graph.add((hisco_uri, RDF.type, SKOS['Collection']))
                                    self.graph.add((hisco_uri, RDFS.label, Literal(micro_group['title'])))
                                    self.graph.add((hisco_uri, DCTERMS.description, Literal(micro_group['description'])))
                                    self.graph.add((rubri_group_uri, SKOS.related, hisco_uri))

    def parse_occupational_titles(self):
        '''
        Scrape the section of the site about occupational titles
        Last page = http://historyofwork.iisg.nl/list_hiswi.php?step=1845&publish=Y&modus=ftsearch
        '''
        parsed_status_page = set()
        next_page = OCCUPATIONAL_TITLES
        while next_page != None:
            log.info("Parse titles %s" % next_page)

            # Load the page
            doc = self.get_page(ROOT + next_page)

            # Find the right table
            table = doc.find('table', attrs={'cellspacing': '0', 'cellpadding': '2', 'border': '0'})

            # Look for all the titles, skipping the header row
            for row in table.find_all('tr')[1:]:
                cols = row.find_all('td')
                occupation_title = cols[1].text
                details_page_link = cols[1].find_all('a')[0]['href']
                language = LANG_MAP[cols[2].text]
                hisco_code = cols[3].text.replace('*', '')

                # Get the DB index from details_page_link
                m = re.search('know_id=([^&]*)&', details_page_link)
                occupation_index = m.group(1)

                # Add the concept to the graph
                resource = self._get_occupation_title_uri(occupation_index)
                self.graph.add((resource, RDF.type, SKOS['Concept']))
                self.graph.add((resource, SKOS.prefLabel, Literal(occupation_title, lang=language)))
                self.graph.add((resource, SKOS.member, self._get_hisco_uri(hisco_code)))

                # Get more information about the title and add it as a member of the collection
                details_page = self.get_page(ROOT + details_page_link)
                details_table = details_page.find('table', attrs={'cellspacing': '8', 'cellpadding': '0'})
                keyvalues = {}
                for details_row in details_table.find_all('tr'):
                    details_cols = details_row.find_all('td')
                    keyvalues[details_cols[0].text.strip()] = details_cols[-1]

                # We already dealt with these two
                del keyvalues['Hisco code']
                del keyvalues['Occupational title']

                # TODO Country, use refArea
                # TODO Language

                # Do we know the gender ?
                if 'Gender' in keyvalues:
                    sex = SDMX_CODE['sex-U']  # Also applies to "Male/Female"
                    if keyvalues['Gender'].text.strip() == 'Male':
                        sex = SDMX_CODE['sex-M']
                    elif keyvalues['Gender'].text.strip() == 'Female':
                        sex = SDMX_CODE['sex-F']
                    self.graph.add((resource, SDMX_DIMENSION['sex'], sex))
                    del keyvalues['Gender']

                # Do we know the status ?
                if 'Status' in keyvalues:
                    # Add the status
                    status = keyvalues['Status'].text.strip()
                    self.graph.add((resource, HISCO['status'], self._get_status_uri(status)))
                    # Parse the status page if necessary
                    status_page = keyvalues['Status'].find_all('a')[0]['href']
                    if status_page not in parsed_status_page:
                        self._parse_status_page(status_page)
                        parsed_status_page.add(status_page)
                    del keyvalues['Status']

                # TODO Relation
                # TODO Product
                # TODO Provenance

                # Do we have a translation in English ?
                if 'Translation' in keyvalues:
                    trans = Literal(keyvalues['Translation'].text.strip().replace('´', "'"), lang='en')
                    self.graph.add((resource, SKOS.altLabel, trans))
                    del keyvalues['Translation']

                # Print whatever is left
                #if len(keyvalues.keys()) != 0:
                #    log.info(keyvalues.keys())

            # Look for the "next" link
            next_table = doc.find('table', class_='nextprev')
            next_page = None
            for link in next_table.find_all('a'):
                if 'Next' in link.text:
                    next_page = link['href']

    def _parse_status_page(self, url):
        '''
        Parses a status page such as http://historyofwork.iisg.nl/status.php?int02=32
        '''
        # Work-around broken content
        if url == 'status.php?int02=15':
            return

        # Load the page
        doc = self.get_page(ROOT + url)

        # Find the data about this status
        status_uri = None
        for line in doc.find('pre').text.split('\n'):
            if re.match("^[0-9]* [a-zA-Z]*", line):
                m = re.search("^([0-9]*) ([a-zA-Z]*)", line)
                status_uri = self._get_status_uri(m.group(1))
                self.graph.add((status_uri, RDF.type, HISCO['Status']))
                self.graph.add((status_uri, RDFS.label, Literal(m.group(2))))
                self.graph.add((status_uri, SKOS.prefLabel, Literal(m.group(2))))
                self.graph.add((status_uri, SKOS.notation, Literal(m.group(1))))
            if re.match("^[A-Z]{2}:\t[a-zA-Z]*", line):
                m = re.search("^([A-Z]{2}):\t([a-zA-Z]*)", line)
                lang_code = m.group(1).lower()
                label = Literal(m.group(2), lang=lang_code)
                self.graph.add((status_uri, SKOS.altLabel, label))

        # Describe the class
        status_class = HISCO['Status']
        descr = doc.find('table', attrs={'width': '600'}).text.strip().split('\r\n')
        self.graph.add((status_class, RDF.type, RDFS.Class))
        self.graph.add((status_class, RDFS.label, Literal("Status code")))
        self.graph.add((status_class, DCTERMS.comment, Literal(descr[1])))

        # Describe the property
        status_property = HISCO['status']
        self.graph.add((status_property, RDF.type, RDF.Property))
        self.graph.add((status_property, RDFS.label, Literal("status associated to the occupation")))
        self.graph.add((status_property, RDFS.range, HISCO['Status']))
        self.graph.add((status_property, RDFS.domain, SKOS.Concept))

    def _parse_records_table(self, url, size):
        '''
        Minor, Rubri and Micro have the same structure except an additional
        column for Micro with links to the titles
        '''
        # Load the page
        doc = self.get_page(ROOT + url)

        # Find the right table
        table = doc.find('table', attrs={'cellspacing': '8', 'cellpadding': '0'})

        # If we can't find the table return an empty list
        # work around for http://historyofwork.iisg.nl/list_micro.php?keywords=920&keywords_qt=lstrict
        if table == None:
            return []

        # Look for the groups
        groups = []
        group = None
        columns = table.find_all('td')
        for index in range(0, len(columns)):
            # New group
            if re.match("[0-9]{%d}" % size, columns[index].text):
                if group != None:
                    groups.append(group)
                group = {}
                group['code'] = columns[index].text
                group['title'] = columns[index + 1].text
                link = columns[index + 1].find_all('a')[0]['href']
                group.setdefault('links', [])
                group['links'].append(link)
                group['description'] = columns[index + 2].text
                if columns[index + 3].text == "Display Titles":
                    link = columns[index + 3].find_all('a')[0]['href']
                    group['titles_link'] = link
        groups.append(group)
        return groups

    def _get_group_uri(self, code):
        return HISCO['group-%s' % code]

    def _get_hisco_uri(self, code):
        return HISCO['hisco-%s' % code]

    def _get_occupation_title_uri(self, code):
        return HISCO['occupation-%s' % code]

    def _get_status_uri(self, code):
        return HISCO['status-%s' % code]
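# Usage sketch for Hisco2RDF. It assumes the module-level constants ROOT,
# HISCO_TREE, OCCUPATIONAL_TITLES, the namespace objects and `log` used
# above are defined elsewhere in the module; running it scrapes the live
# site, so this is illustrative only:
#
#   scraper = Hisco2RDF()
#   scraper.parse_hisco_tree()
#   scraper.parse_occupational_titles()
#   scraper.save_output()  # writes ../hisco.ttl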
def DoTheTestMemory():
    ns = Namespace("http://love.com#")

    # AssertionError: ConjunctiveGraph must be backed by a context aware store.
    mary = URIRef("http://love.com/lovers/mary")
    john = URIRef("http://love.com/lovers/john")

    cmary = URIRef("http://love.com/lovers/context_mary")
    cjohn = URIRef("http://love.com/lovers/context_john")

    # my_store = Memory()
    store_input = IOMemory()

    gconjunctive = ConjunctiveGraph(store=store_input)
    gconjunctive.bind("love", ns)

    # add a graph for Mary's facts to the Conjunctive Graph
    gmary = Graph(store=store_input, identifier=cmary)

    # Mary's graph only contains the URI of the person she loves, not his cute name
    gmary.add((mary, ns["hasName"], Literal("Mary")))
    gmary.add((mary, ns["loves"], john))

    # add a graph for John's facts to the Conjunctive Graph
    gjohn = Graph(store=store_input, identifier=cjohn)

    # John's graph contains his cute name
    gjohn.add((john, ns["hasCuteName"], Literal("Johnny Boy")))

    # enumerate contexts
    print("Input contexts")
    for c in gconjunctive.contexts():
        print("-- %s " % c)

    # separate graphs
    if False:
        print("===================")
        print("GJOHN")
        print(gjohn.serialize(format="n3").decode("utf-8"))
        print("===================")
        print("GMARY")
        print(gmary.serialize(format="n3").decode("utf-8"))
        print("===================")

    # full graph
    print("===================")
    print("GCONJUNCTIVE NATIVE")
    print(gconjunctive.serialize(format="n3").decode("utf-8"))

    # query the conjunction of all graphs
    xx = None
    for x in gconjunctive[mary:ns.loves / ns.hasCuteName]:
        xx = x
    print("Q: Who does Mary love?")
    print("A: Mary loves {}".format(xx))

    # Next, save a single subgraph, then reload it; the result must be the same.
    gjohn.serialize(destination='gjohn_copy.xml', format='xml')
    gmary.serialize(destination='gmary_copy.xml', format='xml')

    gjohn_copy = Graph()
    gjohn_copy.parse('gjohn_copy.xml', format='xml')
    gmary_copy = Graph()
    gmary_copy.parse('gmary_copy.xml', format='xml')

    if True:
        print("===================")
        print("GJOHN")
        print(gjohn_copy.serialize(format="n3").decode("utf-8"))
        print("===================")
        print("GMARY")
        print(gmary_copy.serialize(format="n3").decode("utf-8"))
        print("===================")

    print("===================")
    print("GCONJUNCTIVE WITH QUADS")
    print(list(gconjunctive.quads(None)))
    print("===================")

    gconjunctive.serialize(destination='gconjunctive_copy.xml', format='xml')

    gconjunctive_copy = ConjunctiveGraph()
    gconjunctive_copy.parse('gconjunctive_copy.xml', format='xml')

    print("===================")
    print("GCONJUNCTIVE AS CONJUNCTIVE")
    print(gconjunctive_copy.serialize(format="n3").decode("utf-8"))
    print("Output contexts")
    for c in gconjunctive_copy.contexts():
        print("-- %s " % c)
    print("===================")

    gconjunctive_graph_copy = Graph()
    gconjunctive_graph_copy.parse('gconjunctive_copy.xml', format='xml')

    print("===================")
    print("GCONJUNCTIVE AS GRAPH")
    print(gconjunctive_graph_copy.serialize(format="n3").decode("utf-8"))
    #print("Output contexts")
    #for c in gconjunctive_graph_copy.contexts():
    #    print("-- %s " % c)
    print("===================")
def query_lode(self, id):
    var = "http://inpho.cogs.indiana.edu/thinker/" + id
    # initialize dictionaries to store temporary results
    dbPropResults = {}
    inpho_DB = {}
    DB_inpho = {}
    dbpedia_web = {}
    triples = {}
    # init graphs for LODE and mapped data
    gLODE = ConjunctiveGraph()
    gReturn = ConjunctiveGraph()
    # import InPhO data
    gLODE.parse("http://inphodata.cogs.indiana.edu/lode/out_n3.20140207.rdf",
                format="n3")
    # build a set of triples with the inpho id as the first entry and the
    # dbpedia id as the second
    resultsLODE = gLODE.query("""
        SELECT ?thinker_LODE ?thinkerDB
        WHERE {
            ?thinker_LODE owl:sameAs ?thinkerDB
            FILTER (regex(str(?thinker_LODE), "http://inpho.cogs.indiana.edu", "i") &&
                    regex(str(?thinkerDB), "http://dbpedia.org/resource/", "i")).
        }
        """)
    # load the property mapping between inpho and dbpedia
    prop_map_filename = config.get_data_path('rdf_map.txt')
    with open(prop_map_filename, 'r') as f:
        dbprops = csv.reader(f, delimiter='\t')
        for dbprop in dbprops:
            dbPropResults[dbprop[1]] = dbprop[0]
            dbpedia_web[dbprop[1].split(":")[1]] = dbprop[2]
    # iterate through the triples and store the mappings in both directions
    for triple in resultsLODE:
        inpho_DB[str(triple[0])] = str(triple[1])  # key: inpho url, value: dbpedia url
        DB_inpho[str(triple[1])] = str(triple[0])  # key: dbpedia url, value: inpho url
    # query for all relationships in dbpedia
    sparqlDB = SPARQLWrapper("http://inpho-dataserve.cogs.indiana.edu:8890/sparql/")
    sparqlDB.setReturnFormat(JSON)
    for inpho, DB in inpho_DB.items():
        predicate = {}
        if str(DB_inpho.get(DB)) == var:
            for dbprop in dbPropResults:
                sparqlDB.setQuery("""
                    PREFIX dbpprop: <http://dbpedia.org/ontology/>
                    SELECT ?b
                    WHERE {
                        <""" + DB + """> """ + dbprop + """ ?b.
                        FILTER (regex(str(?b), "dbpedia.org/resource/", "i")).
                    }""")
                resultsDB = sparqlDB.query().convert()
                predicate[dbprop] = resultsDB["results"]["bindings"]
            triples[DB] = predicate
    # retrieve the native python object (pylons-style c and h helpers)
    c.entity = h.fetch_obj(Entity, id, new_id=True)
    existing_predicate_list = []
    existing_object_list = []
    predicates_to_compare = ['influenced', 'influenced_by', 'teachers', 'students']
    for subject, predicate in triples.items():
        for predicate1, objectn in predicate.items():
            predicate_to_match = predicate1.split(":")[1]
            attr = getattr(c.entity, dbpedia_web[predicate_to_match])
            for attr1 in attr:
                if dbpedia_web[predicate_to_match] in predicates_to_compare:
                    existing_predicate_list.append(
                        dbpedia_web[predicate_to_match] + ':' + attr1.wiki)
    # map the dbpedia relationships back to inpho relationships
    for subject, predicate in triples.items():
        for predicate1, objectn in predicate.items():
            for object1 in objectn:
                temp_str = (dbpedia_web[predicate1.split(":")[1]] + ':' +
                            str(object1['b']['value']).split("/")[-1])
                if temp_str not in existing_predicate_list:
                    # reverse lookup of the inpho id for the object
                    DB_Entry = DB_inpho.get(object1['b']['value'])
                    if DB_Entry is None:
                        # if there is no inpho id, leave it as the dbpedia id
                        gReturn.add((URIRef(subject),
                                     URIRef(dbPropResults.get(predicate1)),
                                     URIRef(object1['b']['value'])))
                    else:
                        # otherwise return the properly mapped id
                        # TODO: use attr to filter DB_Entry
                        gReturn.add((URIRef(subject),
                                     URIRef(dbPropResults.get(predicate1)),
                                     URIRef(DB_Entry)))
    return gReturn.serialize()
# Setup assumed by this fragment (imports and names as in the examples above;
# IOMemory lives here in the rdflib 4/5.x layout)
from rdflib import ConjunctiveGraph, Graph, Literal, Namespace, URIRef
from rdflib.plugins.memory import IOMemory

ns = Namespace("http://love.com#")
mary = URIRef("http://love.com/lovers/mary")
john = URIRef("http://love.com/lovers/john")
cmary = URIRef("http://love.com/lovers/context_mary")
cjohn = URIRef("http://love.com/lovers/context_john")
store = IOMemory()

g = ConjunctiveGraph(store=store)
g.bind("love", ns)

gmary = Graph(store=store, identifier=cmary)
gmary.add((mary, ns['hasName'], Literal("Mary")))
gmary.add((mary, ns['loves'], john))

gjohn = Graph(store=store, identifier=cjohn)
gjohn.add((john, ns['hasName'], Literal("John")))

# enumerate contexts
for c in g.contexts():
    print("-- %s " % c)

# separate graphs
print(gjohn.serialize(format='n3'))
print("===================")
print(gmary.serialize(format='n3'))
print("===================")

# full graph
print(g.serialize(format='xml'))

# query the conjunction of all graphs
print('Mary loves:')
for x in g[mary:ns.loves / ns.hasName]:
    print(x)
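The slice g[mary:ns.loves / ns.hasName] uses rdflib's property-path operators; the same query expressed in SPARQL, as a sketch against the graph built above:

    # Sketch: equivalent SPARQL property-path query (same g, ns, mary as above)
    qres = g.query(
        "SELECT ?name WHERE { ?mary love:loves/love:hasName ?name }",
        initNs={'love': ns},
        initBindings={'mary': mary},
    )
    for row in qres:
        print(row.name)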
# escapechar must be a 1-character string or None; the empty string raises a TypeError
data = csv.DictReader(fd, delimiter="\t", quotechar='"', escapechar=None)
for r in data:
    raw_id = r['raw_id']
    # Check if valid with regex
    match = re.match(r"^(tt)*(?P<id>\d{7,10}).*", raw_id)
    if not match:
        progress.count()
        wrongs.append(raw_id)
        continue
    imdb_id = match.group('id')
    film_node = n['Movie/tt' + imdb_id]
    # Create a node for dbpedia
    uri = r['uri']
    wiki_node = URIRef(uri)
    g.add((film_node, n['has' + source + 'Node'], wiki_node))
    progress.count()
    if progress.finished():
        break

g.serialize(destination=outfile, format='turtle')
end = time.time()
print('Wrong formatted IMDB IDs found: ', len(wrongs))
print(wrongs)
print("Total Items Processed: ", progress.total)
print("Total Time: ", end - start)
g.close()
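The regex above accepts IMDB IDs with or without the tt prefix; a quick standalone check of that behavior:

    import re

    # Standalone check of the ID regex used above
    pattern = re.compile(r"^(tt)*(?P<id>\d{7,10}).*")
    for raw in ("tt0111161", "0111161", "bad-id"):
        m = pattern.match(raw)
        print(raw, "->", m.group('id') if m else None)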
class Hisco2RDF():
    '''
    Scrapes the HISCO Web site
    The hierarchy goes as "master > minor > rubri > micro"
    '''

    def __init__(self):
        # The graph to store the data
        self.graph = ConjunctiveGraph()
        self.graph.namespace_manager.bind('skos', SKOS)
        self.graph.namespace_manager.bind('hisco', HISCO)
        self.graph.namespace_manager.bind('dcterms', DCTERMS)
        self.graph.namespace_manager.bind('sdmx-dimension', SDMX_DIMENSION)
        self.graph.namespace_manager.bind('sdmx-code', SDMX_CODE)
        self.graph.namespace_manager.bind('qb', QB)

        # SQLite DB for the cache
        self.cache = sqlite3.connect('cache.db')
        cursor = self.cache.cursor()
        cursor.execute("CREATE TABLE IF NOT EXISTS page (url text, html text)")
        self.cache.commit()

    def __del__(self):
        self.cache.close()

    def get_page(self, url):
        # log.debug("Load %s" % url)
        c = self.cache.cursor()
        c.execute("SELECT * FROM page WHERE url = ?", (url,))
        res = c.fetchone()
        doc = None
        if res is None:
            doc = requests.get(url).content
            c.execute("INSERT INTO page VALUES (?,?)", (url, doc))
            self.cache.commit()
        else:
            (_, doc) = res
        return BeautifulSoup(doc)

    def save_output(self):
        # Add more things needed for DataCubes
        dimprop = HISCO['occupation']
        self.graph.add((dimprop, RDF.type, QB['DimensionProperty']))
        self.graph.add((dimprop, RDFS.range, SKOS.Collection))
        self.graph.add((dimprop, QB['Concept'], SKOS.Collection))
        self.graph.add((dimprop, RDFS.label, Literal('Occupation code', lang='en')))
        self.graph.add((dimprop, RDFS.comment,
                        Literal('The HISCO group of the occupation', lang='en')))

        # Print to the screen
        # outfile = sys.stdout.buffer
        # self.graph.serialize(destination=outfile, format='n3')

        # Save to the file
        outfile = open('../hisco.ttl', "wb")
        self.graph.serialize(destination=outfile, format='n3')
        outfile.close()

    def parse_hisco_tree(self):
        '''
        Parse the hisco tree
        '''
        # Load the page
        doc = self.get_page(ROOT + HISCO_TREE)

        # Find the major groups
        major_groups = []
        major_group = None
        for table in doc.find_all('table', attrs={'border': '0'}):
            for row in table.find_all('tr'):
                for col in row.find_all('td'):
                    # Skip empty rows
                    if len(col.text) == 1:
                        continue
                    # We are starting a new group
                    if col.text.startswith('Majorgroup'):
                        # Save the one we were building, if any
                        if major_group is not None:
                            major_groups.append(major_group)
                        m = re.search("Majorgroup ([^ ]*) ", col.text)
                        major_group = {}
                        major_group['title'] = col.text
                        major_group['code'] = m.group(1).replace('/', '-')
                    # We have a description
                    if col.text.startswith('Workers'):
                        major_group['description'] = col.text
                    # We have links to minor groups
                    if col.text.startswith('List Minor'):
                        link = col.find_all('a')[0]['href']
                        major_group.setdefault('links', [])
                        major_group['links'].append(link)
        # Add the last group in the making
        if major_group is not None:
            major_groups.append(major_group)

        # Add the groups to the graph
        for group in major_groups:
            major_group_uri = self._get_group_uri(group['code'])
            self.graph.add((major_group_uri, RDF.type, SKOS['ConceptScheme']))
            self.graph.add((major_group_uri, DCTERMS.title, Literal(group['title'])))
            self.graph.add((major_group_uri, DCTERMS.description,
                            Literal(group['description'])))

        # Now move onto the minor groups following the links
        for major_group in major_groups:
            major_group_uri = self._get_group_uri(major_group['code'])
            for minor_link in major_group['links']:
                # Look for the minor groups
                minor_groups = self._parse_records_table(minor_link, 2)

                # Add the groups to the graph
                for minor_group in minor_groups:
                    minor_group_uri = self._get_group_uri(minor_group['code'])
                    self.graph.add((minor_group_uri, RDF.type, SKOS['ConceptScheme']))
                    self.graph.add((minor_group_uri, RDFS.label,
                                    Literal(minor_group['title'])))
                    self.graph.add((minor_group_uri, DCTERMS.description,
                                    Literal(minor_group['description'])))
                    self.graph.add((major_group_uri, SKOS.related, minor_group_uri))

                    # Go one level deeper into the rubri
                    for rubri_link in minor_group['links']:
                        # Look for the rubri groups
                        rubri_groups = self._parse_records_table(rubri_link, 3)

                        # Add the groups to the graph
                        for rubri_group in rubri_groups:
                            rubri_group_uri = self._get_group_uri(rubri_group['code'])
                            self.graph.add((rubri_group_uri, RDF.type,
                                            SKOS['ConceptScheme']))
                            self.graph.add((rubri_group_uri, RDFS.label,
                                            Literal(rubri_group['title'])))
                            self.graph.add((rubri_group_uri, DCTERMS.description,
                                            Literal(rubri_group['description'])))
                            self.graph.add((minor_group_uri, SKOS.related,
                                            rubri_group_uri))

                            # And one deeper for the micro groups
                            for micro_link in rubri_group['links']:
                                # Look for the micro groups
                                micro_groups = self._parse_records_table(micro_link, 5)

                                # Add the groups to the graph
                                for micro_group in micro_groups:
                                    hisco_uri = self._get_hisco_uri(micro_group['code'])
                                    self.graph.add((hisco_uri, RDF.type,
                                                    SKOS['Collection']))
                                    self.graph.add((hisco_uri, RDFS.label,
                                                    Literal(micro_group['title'])))
                                    self.graph.add((hisco_uri, DCTERMS.description,
                                                    Literal(micro_group['description'])))
                                    self.graph.add((rubri_group_uri, SKOS.related,
                                                    hisco_uri))

    def parse_occupational_titles(self):
        '''
        Scrape the section of the site about occupational titles
        Last page = http://historyofwork.iisg.nl/list_hiswi.php?step=1845&publish=Y&modus=ftsearch
        '''
        parsed_status_page = set()
        next_page = OCCUPATIONAL_TITLES
        while next_page is not None:
            log.info("Parse titles %s" % next_page)

            # Load the page
            doc = self.get_page(ROOT + next_page)

            # Find the right table
            table = doc.find('table', attrs={'cellspacing': '0',
                                             'cellpadding': '2',
                                             'border': '0'})

            # Look for all the titles, skipping the header row
            for row in table.find_all('tr')[1:]:
                cols = row.find_all('td')
                occupation_title = cols[1].text
                details_page_link = cols[1].find_all('a')[0]['href']
                language = LANG_MAP[cols[2].text]
                hisco_code = cols[3].text.replace('*', '')

                # Get the DB index from details_page_link
                m = re.search('know_id=([^&]*)&', details_page_link)
                occupation_index = m.group(1)

                # Add the concept to the graph
                resource = self._get_occupation_title_uri(occupation_index)
                self.graph.add((resource, RDF.type, SKOS['Concept']))
                self.graph.add((resource, SKOS.prefLabel,
                                Literal(occupation_title, lang=language)))
                self.graph.add((resource, SKOS.member,
                                self._get_hisco_uri(hisco_code)))

                # Get more information about the title and add it as a member
                # of the collection
                details_page = self.get_page(ROOT + details_page_link)
                details_table = details_page.find('table',
                                                  attrs={'cellspacing': '8',
                                                         'cellpadding': '0'})
                keyvalues = {}
                for details_row in details_table.find_all('tr'):
                    details_cols = details_row.find_all('td')
                    keyvalues[details_cols[0].text.strip()] = details_cols[-1]

                # We already dealt with these two
                del keyvalues['Hisco code']
                del keyvalues['Occupational title']

                # TODO Country, use refArea
                # TODO Language

                # Do we know the gender?
                if 'Gender' in keyvalues:
                    sex = SDMX_CODE['sex-U']  # Also applies to "Male/Female"
                    if keyvalues['Gender'].text.strip() == 'Male':
                        sex = SDMX_CODE['sex-M']
                    elif keyvalues['Gender'].text.strip() == 'Female':
                        sex = SDMX_CODE['sex-F']
                    self.graph.add((resource, SDMX_DIMENSION['sex'], sex))
                    del keyvalues['Gender']

                # Do we know the status?
                if 'Status' in keyvalues:
                    # Add the status
                    status = keyvalues['Status'].text.strip()
                    self.graph.add((resource, HISCO['status'],
                                    self._get_status_uri(status)))
                    # Parse the status page if necessary
                    status_page = keyvalues['Status'].find_all('a')[0]['href']
                    if status_page not in parsed_status_page:
                        self._parse_status_page(status_page)
                        parsed_status_page.add(status_page)
                    del keyvalues['Status']

                # TODO Relation
                # TODO Product
                # TODO Provenance

                # Do we have a translation in English?
                if 'Translation' in keyvalues:
                    trans = Literal(
                        keyvalues['Translation'].text.strip().replace('´', "'"),
                        lang='en')
                    self.graph.add((resource, SKOS.altLabel, trans))
                    del keyvalues['Translation']

                # Print whatever is left
                # if len(keyvalues.keys()) != 0:
                #     log.info(keyvalues.keys())

            # Look for the "next" link
            next_table = doc.find('table', class_='nextprev')
            next_page = None
            for link in next_table.find_all('a'):
                if 'Next' in link.text:
                    next_page = link['href']

    def _parse_status_page(self, url):
        '''
        Parses a status page such as
        http://historyofwork.iisg.nl/status.php?int02=32
        '''
        # Work-around broken content
        if url == 'status.php?int02=15':
            return

        # Load the page
        doc = self.get_page(ROOT + url)

        # Find the data about this status
        status_uri = None
        for line in doc.find('pre').text.split('\n'):
            if re.match("^[0-9]* [a-zA-Z]*", line):
                m = re.search("^([0-9]*) ([a-zA-Z]*)", line)
                status_uri = self._get_status_uri(m.group(1))
                self.graph.add((status_uri, RDF.type, HISCO['Status']))
                self.graph.add((status_uri, RDFS.label, Literal(m.group(2))))
                self.graph.add((status_uri, SKOS.prefLabel, Literal(m.group(2))))
                self.graph.add((status_uri, SKOS.notation, Literal(m.group(1))))
            if re.match("^[A-Z]{2}:\t[a-zA-Z]*", line):
                m = re.search("^([A-Z]{2}):\t([a-zA-Z]*)", line)
                lang_code = m.group(1).lower()
                label = Literal(m.group(2), lang=lang_code)
                self.graph.add((status_uri, SKOS.altLabel, label))

        # Describe the class
        status_class = HISCO['Status']
        descr = doc.find('table', attrs={'width': '600'}).text.strip().split('\r\n')
        self.graph.add((status_class, RDF.type, RDFS.Class))
        self.graph.add((status_class, RDFS.label, Literal("Status code")))
        self.graph.add((status_class, DCTERMS.comment, Literal(descr[1])))

        # Describe the property
        status_property = HISCO['status']
        self.graph.add((status_property, RDF.type, RDF.Property))
        self.graph.add((status_property, RDFS.label,
                        Literal("status associated to the occupation")))
        self.graph.add((status_property, RDFS.range, HISCO['Status']))
        self.graph.add((status_property, RDFS.domain, SKOS.Concept))

    def _parse_records_table(self, url, size):
        '''
        Minor, Rubri and Micro have the same structure except an additional
        column for Micro with links to the titles
        '''
        # Load the page
        doc = self.get_page(ROOT + url)

        # Find the right table
        table = doc.find('table', attrs={'cellspacing': '8', 'cellpadding': '0'})

        # If we can't find the table, return an empty list (work-around for
        # http://historyofwork.iisg.nl/list_micro.php?keywords=920&keywords_qt=lstrict)
        if table is None:
            return []

        # Look for the groups
        groups = []
        group = None
        columns = table.find_all('td')
        for index in range(0, len(columns)):
            # New group
            if re.match("[0-9]{%d}" % size, columns[index].text):
                if group is not None:
                    groups.append(group)
                group = {}
                group['code'] = columns[index].text
                group['title'] = columns[index + 1].text
                link = columns[index + 1].find_all('a')[0]['href']
                group.setdefault('links', [])
                group['links'].append(link)
                group['description'] = columns[index + 2].text
                if columns[index + 3].text == "Display Titles":
                    link = columns[index + 3].find_all('a')[0]['href']
                    group['titles_link'] = link
        groups.append(group)
        return groups

    def _get_group_uri(self, code):
        return HISCO['group-%s' % code]

    def _get_hisco_uri(self, code):
        return HISCO['hisco-%s' % code]

    def _get_occupation_title_uri(self, code):
        return HISCO['occupation-%s' % code]

    def _get_status_uri(self, code):
        return HISCO['status-%s' % code]
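A minimal driver for the scraper above, sketched under the assumption that the module-level constants the class relies on (ROOT, HISCO_TREE, OCCUPATIONAL_TITLES, the namespace objects, and log) are defined elsewhere in the original script:

    # Hypothetical entry point: drives the scraper end to end.
    if __name__ == '__main__':
        scraper = Hisco2RDF()
        scraper.parse_hisco_tree()           # major > minor > rubri > micro groups
        scraper.parse_occupational_titles()  # titles, genders, statuses, translations
        scraper.save_output()                # writes ../hisco.ttl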
# Assumes the same setup as the previous snippet: store, ns, the URIRefs
# mary, john, cmary, cjohn, and the conjunctive graph g.
gmary = Graph(store=store, identifier=cmary)  # a graph for Mary, to hold her properties
gmary.add((mary, ns['hasName'], Literal("Mary")))
gmary.add((mary, ns['loves'], john))

gjohn = Graph(store=store, identifier=cjohn)
gjohn.add((john, ns['hasName'], Literal("John")))

# With the graphs for both Mary and John created, show their content
print('# Content of the conjunctive graph')
print()
print()
for c in g.contexts():
    print("-- %s " % c)
print()
print("# Content of John's graph in N3 notation")
print(gjohn.serialize(format='n3'))
print("===================")
print("# Content of Mary's graph in N3 notation")
print(gmary.serialize(format='n3'))
print("===================")

# full graph
print(g.serialize(format='n3'))

# query the conjunction of all graphs: from the values of the ns.loves
# predicate we only keep their ns.hasName property
print('Mary loves:')
for x in g[mary:ns.loves / ns.hasName]:
    print(x)
class RDFAggregator(Aggregator):

    def __init__(self, *args, **kw):
        """Initialize the RDF aggregator."""
        super(RDFAggregator, self).__init__('csv', *args, **kw)
        self.aggregator = ConjunctiveGraph()
        self.aggregator.bind(u'owl', OWL)
        self.aggregator.bind(u'lic', LIC)
        self.aggregator.bind(u'siorg', SIORG)
        self.aggregator.bind(u'siafi', SIAFI)
        self.aggregator.bind(u'geo', GEO)
        self.aggregator.bind(u'dbpedia', DBPEDIA)
        self.aggregator.bind(u'dbprop', DBPROP)
        self.aggregator.bind(u'dbo', DBONT)
        self.aggregator.bind(u'void', VOID)
        self.aggregator.bind(u'foaf', FOAF)
        self.aggregator.bind(u'vcard', VCARD)

    def add(self, obj):
        """Add the object's triples to the aggregator graph."""
        if getattr(obj, 'repr_rdf', None):
            # the object has a method providing its own RDF representation
            triplas = obj.repr_rdf()
            for t in triplas:
                self.aggregator.add(t)
        else:
            # the object has no such method; try to build triples heuristically
            subject = obj.uri
            doc = obj.doc_uri
            if doc == subject:
                doc = None
            class_uri = getattr(obj.__class__, '__class_uri__', None)
            expostos = getattr(obj.__class__, self.atributo_serializar, set())
            prop_map = getattr(obj.__class__, '__rdf_prop__', {})
            g = self.aggregator
            # class
            if class_uri:
                g.add((URIRef(subject), RDF['type'], URIRef(class_uri)))
            # document
            if doc:
                g.add((URIRef(doc), RDF['type'], FOAF['Document']))
                g.add((URIRef(subject), FOAF['isPrimaryTopicOf'], URIRef(doc)))
                g.add((URIRef(doc), FOAF['primaryTopic'], URIRef(subject)))
            # name
            if getattr(obj, 'nome', None):
                if getattr(obj, '__rdf_prop__', None) is None or \
                        obj.__rdf_prop__.get('nome', None) is None:
                    g.add((URIRef(subject), RDFS['label'], Literal(obj.nome)))
            # geographic location
            if getattr(obj, 'geo_ponto', None):
                ponto = obj.geo_ponto
                if ponto:
                    g.add((URIRef(subject), GEO['lat'], Literal(ponto['lat'])))
                    g.add((URIRef(subject), GEO['long'], Literal(ponto['lon'])))
            # properties
            for atr in expostos:
                if atr in prop_map.keys():
                    if getattr(prop_map[atr], '__call__', None):
                        # the property's triples are given by a function
                        triplas = prop_map[atr](obj)
                        if triplas:
                            for t in triplas:
                                g.add(t)
                    elif prop_map[atr].get('metodo', None):
                        # the property's triples are given by a method
                        m = getattr(obj, prop_map[atr]['metodo'])
                        triplas = m(atr)
                        if triplas:
                            for t in triplas:
                                g.add(t)
                    elif prop_map[atr].get('pred_uri', None):
                        # the property corresponds to a single triple
                        pred_uri = prop_map[atr]['pred_uri']
                        object = getattr(obj, atr, None)
                        if object:
                            obj_uri = getattr(object, 'uri', lambda: None)()
                            obj_cls_uri = getattr(object, '__class_uri__', None)
                            # does the object have a URI of its own?
                            if obj_uri:
                                g.add((URIRef(subject), URIRef(pred_uri),
                                       URIRef(obj_uri)))
                            elif obj_cls_uri:
                                # if the object has no URI but its class has one,
                                # try to create a blank node
                                bn = BNode()
                                g.add((URIRef(subject), URIRef(pred_uri), bn))
                                g.add((bn, RDF['type'], URIRef(obj_cls_uri)))
                                g.add((bn, RDFS['comment'], Literal(unicode(obj))))
                            else:
                                # otherwise, treat the property as a literal
                                g.add((URIRef(subject), URIRef(pred_uri),
                                       Literal(unicode(object))))

    def serialize(self, format="n3"):
        """Return the serialization of the RDF aggregator (union of the graphs).
        """
        format_map = {
            'xml': 'xml',
            'rdf': 'pretty-xml',
            'rdf/xml': 'pretty-xml',
            'ttl': 'n3',
            'n3': 'n3',
            'nt': 'nt',
        }
        f = format_map.get(format, 'n3')
        current_url = self.dataset_split.get('current_url', '')  # URL of the current document
        dataset_url = self.dataset_split.get('dataset_url', '')  # general URL of the dataset
        next_url = self.dataset_split.get('next_url', '')  # URL of the next page
        # the dataset URI: the document URL plus #dataset
        if current_url:
            self.aggregator.add((URIRef(current_url + "#dataset"), RDF['type'], VOID['Dataset']))
            self.aggregator.add((URIRef(current_url), RDF['type'], VOID['DatasetDescription']))
            self.aggregator.add((URIRef(current_url), FOAF['primaryTopic'], URIRef(current_url + "#dataset")))
            if next_url:
                self.aggregator.add((URIRef(current_url + "#dataset"), RDFS['seeAlso'], URIRef(next_url + "#dataset")))
        if next_url:
            self.aggregator.add((URIRef(next_url + "#dataset"), RDF['type'], VOID['Dataset']))
            self.aggregator.add((URIRef(next_url), RDF['type'], VOID['DatasetDescription']))
            self.aggregator.add((URIRef(next_url), FOAF['primaryTopic'], URIRef(next_url + "#dataset")))
        if dataset_url:
            self.aggregator.add((URIRef(dataset_url + "#dataset"), RDF['type'], VOID['Dataset']))
            self.aggregator.add((URIRef(dataset_url), RDF['type'], VOID['DatasetDescription']))
            self.aggregator.add((URIRef(dataset_url), FOAF['primaryTopic'], URIRef(dataset_url + "#dataset")))
            if current_url:
                self.aggregator.add((URIRef(dataset_url + "#dataset"), VOID['subset'], URIRef(current_url + "#dataset")))
            if next_url:
                self.aggregator.add((URIRef(dataset_url + "#dataset"), VOID['subset'], URIRef(next_url + "#dataset")))
        return self.aggregator.serialize(format=f)
def testQuotedSerialization(self):
    g = ConjunctiveGraph()
    g.parse(data=test_data, format="n3")
    g.serialize(format="n3")
# with the object init (and it added some namespaces as well)
# By default, your main namespace is the URI of your
# current working directory, so let's make that simpler:
myNS = Namespace(URIRef('http://www.w3.org/2000/10/swap/Primer#'))
primer.bind('', myNS)
primer.bind('owl', 'http://www.w3.org/2002/07/owl#')
primer.bind('dc', 'http://purl.org/dc/elements/1.1/')
primer.bind('swap', 'http://www.w3.org/2000/10/swap/')
sourceCode = StringInputSource(mySource, myNS)

# Let's load it up!
primer.parse(sourceCode, format='n3')

# Now you can query, either directly straight into a list:
[(x, y, z) for x, y, z in primer]

# or spit it back out (mostly) the way we created it:
print(primer.serialize(format='n3'))

# for more insight into things already done, let's see the namespaces
list(primer.namespaces())

# let's ask something about the data
list(primer.objects(myNS.pat, myNS.child))
def rdf_description(name, notation='xml'):
    """
    Function takes the title of a node and an rdf notation.
    """
    valid_formats = ["xml", "n3", "ntriples", "trix"]
    default_graph_uri = "http://gstudio.gnowledge.org/rdfstore"
    # default_graph_uri = "http://example.com/"
    configString = "/var/tmp/rdfstore"

    # Get the IOMemory plugin.
    store = plugin.get('IOMemory', Store)('rdfstore')

    # Open a previously created store, or create it if it doesn't exist yet
    graph = Graph(store="IOMemory", identifier=URIRef(default_graph_uri))
    path = mkdtemp()
    rt = graph.open(path, create=False)
    if rt == NO_STORE:
        graph.open(path, create=True)
    else:
        assert rt == VALID_STORE, "The underlying store is corrupt"

    # Now we'll add some triples to the graph & commit the changes
    # rdflib = Namespace('http://sbox.gnowledge.org/gstudio/')
    graph.bind("gstudio", "http://gnowledge.org/")
    exclusion_fields = [
        "id", "rght", "node_ptr_id", "image", "lft", "_state",
        "_altnames_cache", "_tags_cache", "nid_ptr_id", "_mptt_cached_fields"
    ]

    # determine the type of the node
    node = NID.objects.get(title=name)
    node_type = node.reftype

    if node_type == 'Gbobject':
        node = Gbobject.objects.get(title=name)
        rdflib = link(node)
    elif node_type == 'None':
        node = Gbobject.objects.get(title=name)
        rdflib = link(node)
    elif node_type == 'Processes':
        node = Gbobject.objects.get(title=name)
        rdflib = link(node)
    elif node_type == 'System':
        node = Gbobject.objects.get(title=name)
        rdflib = link(node)
    elif node_type == 'Objecttype':
        node = Objecttype.objects.get(title=name)
        rdflib = link(node)
    elif node_type == 'Attributetype':
        node = Attributetype.objects.get(title=name)
        rdflib = link(node)
    elif node_type == 'Complement':
        node = Complement.objects.get(title=name)
        rdflib = link(node)
    elif node_type == 'Union':
        node = Union.objects.get(title=name)
        rdflib = link(node)
    elif node_type == 'Intersection':
        node = Intersection.objects.get(title=name)
        rdflib = link(node)
    elif node_type == 'Expression':
        node = Expression.objects.get(title=name)
        rdflib = link(node)
    elif node_type == 'Processtype':
        node = Processtype.objects.get(title=name)
        rdflib = link(node)
    elif node_type == 'Systemtype':
        node = Systemtype.objects.get(title=name)
        rdflib = link(node)
    elif node_type == 'AttributeSpecification':
        node = AttributeSpecification.objects.get(title=name)
        rdflib = link(node)
    elif node_type == 'RelationSpecification':
        node = RelationSpecification.objects.get(title=name)
        rdflib = link(node)
    elif node_type == 'Attribute':
        node = Attribute.objects.get(title=name)
        rdflib = Namespace('http://sbox.gnowledge.org/gstudio/')
    elif node_type == 'Relationtype':
        node = Relationtype.objects.get(title=name)
        rdflib = Namespace('http://sbox.gnowledge.org/gstudio/')
    elif node_type == 'Metatype':
        node = Metatype.objects.get(title=name)
        rdflib = Namespace('http://sbox.gnowledge.org/gstudio/')
    else:
        rdflib = Namespace('http://sbox.gnowledge.org/gstudio/')

    node_dict = node.__dict__
    subject = str(node_dict['id'])
    for key in node_dict:
        if key not in exclusion_fields:
            predicate = str(key)
            pobject = str(node_dict[predicate])
            graph.add((rdflib[subject], rdflib[predicate], Literal(pobject)))

    rdf_code = graph.serialize(format=notation)
    graph.commit()
    print(rdf_code)
    graph.close()
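A call sketch for the function above; the node title is hypothetical, and notation should be one of the entries in valid_formats:

    # Hypothetical call: emit the node's triples in N3
    rdf_description("Water", notation='n3')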