def command(self):
    """Generate RDF for every RDFDoc row and serialize it to ``lodstats.nt``.

    Steps, in order:
      1. open the ``lodstatsrdf`` RDF file storage and wrap it in a model;
      2. unpickle the CKAN catalog list from ``/tmp/ckan_catalogs.pickled``;
      3. load all ``model.RDFDoc`` rows and call
         ``self.generateRdfForRdfDoc`` for each of them;
      4. write the model to ``lodstats.nt`` as N-Triples.

    A failure while processing a single rdfdoc is printed and the loop moves
    on to the next one, so one bad record does not abort the export.
    """
    print("Loading RDF database...")
    rdfStorage = RDF.FileStorage("lodstatsrdf")
    rdfModel = RDF.Model(rdfStorage)
    print("Finished!")

    ckanCatalogPath = "/tmp/ckan_catalogs.pickled"
    print("Reading " + ckanCatalogPath)
    # Pickles are binary data: open in 'rb' (newline translation can corrupt
    # them) and let the context manager close the handle even if load() fails.
    # NOTE(security): unpickling executes arbitrary code and /tmp is usually
    # world-writable -- only run this where that file is trusted.
    with open(ckanCatalogPath, 'rb') as f:
        ckanCatalogs = pickle.load(f)
    print("Finished!")

    # Fetch all the rdfdocs from the DB
    print("Fetching the data from DB...")
    rdfdocs = Session.query(model.RDFDoc).all()
    print("Fetched the data from DB!")

    overall = len(rdfdocs)
    for num, rdfdoc in enumerate(rdfdocs):
        print("Processing %d out of %d" % (num, overall))
        try:
            self.generateRdfForRdfDoc(rdfdoc, rdfModel, ckanCatalogs)
        except Exception as e:
            # Best-effort per document. Catch Exception rather than
            # BaseException so Ctrl-C / SystemExit still stop the run.
            print("Oops, exception occured: " + str(e))

    serializer = RDF.Serializer(name="ntriples")
    serializer.serialize_model_to_file("lodstats.nt", rdfModel, base_uri=None)
def command(self):
    """Create an RDFDoc row for every pickled CKAN RDF package not yet stored.

    Reads the catalog list from ``/tmp/ckan_catalogs.pickled``. For each
    package in each catalog's ``rdfpackages`` it takes the first resource
    that is not ``None`` and, unless an RDFDoc with the package's name
    already exists, adds a new RDFDoc (name, uri, format, ckan_catalog) and
    commits it.

    Packages that have no usable resource are skipped instead of aborting
    the whole import.
    """
    ckanCatalogPath = "/tmp/ckan_catalogs.pickled"
    # Binary mode is the correct mode for pickles; the context manager closes
    # the handle even when load() raises.
    # NOTE(security): unpickling executes arbitrary code and /tmp is usually
    # world-writable -- only run this where that file is trusted.
    with open(ckanCatalogPath, 'rb') as f:
        ckanCatalogs = pickle.load(f)

    for catalog in ckanCatalogs:
        prefix = catalog['prefix']  # e.g. datagov
        for package in catalog['rdfpackages']:
            # The name is part of the dataset URI,
            # e.g. http://catalog.data.gov/dataset/<name>
            rdfPackageName = package['name']

            # Just pick up the first resource which is not None.
            rdfResource = next(
                (resource for resource in package['resources']
                 if resource is not None),
                None)
            if rdfResource is None:
                # Nothing to register; previously this raised TypeError on
                # rdfResource['url'] and stopped the import.
                continue

            existing = Session.query(model.RDFDoc).filter(
                model.RDFDoc.name == rdfPackageName).first()
            if existing:
                continue

            newRdfdoc = model.RDFDoc(name=rdfPackageName,
                                     uri=rdfResource['url'],
                                     format=rdfResource['format'],
                                     ckan_catalog=prefix)
            Session.add(newRdfdoc)
            Session.commit()
def command(self):
    """Send the id of one active RDFDoc to the LODStats datasets queue.

    The RDFDoc id is taken from ``self.args[0]``. The exchange and queue are
    declared and bound through ``Messaging`` first; then the active RDFDoc
    with that id is fetched with ``.one()`` (which raises unless exactly one
    row matches) and a JSON object ``{"id": <rdfdoc id>}`` is sent to the
    queue.
    """
    requested_id = self.args[0]
    exchange_name = "lodstats_datasets_exchange"
    queue_name = "lodstats_datasets_queue"

    broker = Messaging()
    broker.declareDirectExchange(exchange_name)
    broker.declareQueue(queue_name)
    broker.bindExchangeToQueue(exchange_name, queue_name)

    # `== True` is deliberate: it builds a SQL expression, not a Python bool.
    active_match = Session.query(model.RDFDoc).filter(
        model.RDFDoc.active == True,
        model.RDFDoc.id == requested_id)
    rdfdoc = active_match.one()

    payload = json.dumps({'id': rdfdoc.id})
    broker.sendMessageToQueue(queue_name, payload)
def getVoid(self):
    """Dump every stored VoID description to ``./void/`` under its public URI.

    For each StatResult (joined with its RDFDoc) that has a ``void`` value:

    * if the text contains a ``<http://stats.lod2.eu/rdf/void/?source...>``
      IRI, each such IRI is replaced textually with
      ``<http://lodstats.aksw.org/stat_result/<id>.void>`` and the result is
      written to ``./void/<id>.ttl``;
    * otherwise the Turtle is parsed with rdflib, the triples of the dataset
      subject are copied onto the new URI, the old subject is removed and
      the graph is written to ``./void/<id>.nt`` as N-Triples.

    Errors for a single StatResult are printed and the loop continues.
    Returns nothing; the output is the files written.
    """
    # Join on rdfdoc here (!): replace the uri of the dataset with e.g.
    # http://lodstats.aksw.org/stat_result/6702.void
    statResults = Session.query(model.StatResult, model.RDFDoc).\
        filter(model.StatResult.rdfdoc_id == model.RDFDoc.id).\
        all()

    for statResult, rdfdoc in statResults:
        if statResult.void is None:
            continue
        try:
            statResultUri = ("http://lodstats.aksw.org/stat_result/"
                             + str(statResult.id) + ".void")
            # `[^>]*>` stops at the IRI's own closing bracket. The previous
            # greedy `.*>` ran to the LAST `>` on the line and could swallow
            # other IRIs that follow the subject.
            if re.search("<http://stats.lod2.eu/rdf/void/.source[^>]*>",
                         statResult.void):
                replacedVoid = re.sub(
                    "<http...stats.lod2.eu.rdf.void..source[^>]*>",
                    "<" + statResultUri + ">",
                    statResult.void)
                # Context manager: the file is closed even if write() fails.
                with codecs.open("./void/" + str(statResult.id) + ".ttl",
                                 'w', 'utf-8') as f:
                    f.write(replacedVoid)
            else:
                g = rdflib.Graph()
                g.parse(data=statResult.void, format='turtle')
                # NOTE(review): the VoID vocabulary names the class
                # void:Dataset (capital D); the lowercase IRI below is kept
                # unchanged -- confirm it matches the stored documents.
                g.update("""
                    INSERT {<""" + statResultUri + """> ?p ?o}
                    WHERE {
                        ?s ?p ?o .
                        ?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://rdfs.org/ns/void#dataset>
                    }
                    """)
                g.commit()
                result = g.query("""
                    SELECT DISTINCT ?s ?p ?o
                    WHERE {
                        ?s ?p ?o .
                        ?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://rdfs.org/ns/void#dataset> .
                        FILTER(?s != <""" + statResultUri + """>)
                    }
                    """)
                # Only the first matching subject is removed.
                subjToDelete = None
                for res in result:
                    subjToDelete = res.s
                    break
                if subjToDelete is not None:
                    # Skip the removal when nothing matched rather than
                    # passing a placeholder string as the subject.
                    g.remove((subjToDelete, None, None))
                g.commit()
                with codecs.open("./void/" + str(statResult.id) + ".nt",
                                 'w', 'utf-8') as f:
                    f.write(g.serialize(format="nt"))
        except Exception as e:
            # Best-effort per record. Exception (not BaseException) so that
            # Ctrl-C / SystemExit still terminate the run.
            print(str(e))
def command(self):
    """Synchronise local RDFDoc rows with the CKAN package list.

    After configuring logging and installing SIGINT/SIGTERM handlers, the
    command returns 0 immediately when two or more WorkerProc rows exist.
    Otherwise it:

    1. logs every local RDFDoc whose name is not in ``package_list``
       (the actual deletion is commented out);
    2. for every package name, fetches the package from ``ckan``, finds or
       creates the RDFDoc of that name and sets its ``format``/``uri`` from
       the best available resource. The preference order is nt, nq, rdf,
       ttl, n3, then -- only if a SPARQL resource exists -- a sitemap
       resource over the SPARQL endpoint itself;
    3. commits when the RDFDoc has a format, otherwise rolls back.

    ``config_file``, ``package_list`` and ``ckan`` are not defined in this
    method; they are expected to come from the enclosing module.
    """
    self.logging_file_config(config_file)
    log = logging.getLogger(__name__)
    self.worker_proc = None
    self.rdfdoc_to_do = None
    signal.signal(signal.SIGINT, self.term_handler)
    signal.signal(signal.SIGTERM, self.term_handler)

    # do not spawn more than two workers
    number_of_workers = Session.query(model.WorkerProc).with_lockmode('read').count()
    if number_of_workers >= 2:
        return 0

    # (RDFDoc format, accepted CKAN resource formats), most preferred first.
    format_priority = (
        ("nt", ("application/x-ntriples", "nt", "gzip:ntriples")),
        ("nq", ("application/x-nquads", "nquads")),
        ("rdf", ("application/rdf+xml", "rdf")),
        ("ttl", ("text/turtle", "rdf/turtle", "ttl")),
        ("n3", ("text/n3", "n3")),
    )

    def resource_format(resource):
        # Lower-cased resource format; a missing/None format becomes "" so it
        # simply never matches instead of crashing the whole run.
        return (resource.get('format') or '').lower()

    def pick_resource(resources):
        # Return (format, url) for the preferred resource, or None if no
        # resource has a supported format.
        for doc_format, accepted in format_priority:
            for resource in resources:
                if resource_format(resource) in accepted:
                    return doc_format, resource['url']
        sparql_resources = [resource for resource in resources
                            if resource_format(resource) in ("api/sparql", "sparql")]
        if not sparql_resources:
            return None
        # prefer a sitemap.xml over sparql, if any
        for resource in resources:
            if resource_format(resource) == "meta/sitemap":
                return "sitemap", resource['url']
        # With several SPARQL resources the last one wins, as before.
        return "sparql", sparql_resources[-1]['url']

    # check for orphaned local packages
    allLocalPackages = Session.query(model.RDFDoc).all()
    # Iterate the list fetched above; the loop used to reference the
    # undefined name `all_local_pkgs`.
    for pkg in allLocalPackages:
        if pkg.name not in package_list:
            log.debug("%s is gone and will be deleted" % pkg.name)
            # Deletion is currently disabled; orphans are only logged.
            #Session.delete(pkg)
            #Session.commit()

    for package_name in package_list:
        try:
            package = ckan.package_entity_get(package_name)
        except Exception as errorstr:
            log.debug("ERROR with %s: %s" % (package_name, errorstr))
            continue

        rdfdoc = Session.query(model.RDFDoc).filter(model.RDFDoc.name == package['name']).first()
        if rdfdoc is None:
            rdfdoc = model.RDFDoc()
            Session.add(rdfdoc)
            rdfdoc.name = package['name']

        choice = pick_resource(package['resources'])
        if choice is not None:
            rdfdoc.format, rdfdoc.uri = choice

        # An RDFDoc without any known format is not persisted.
        if rdfdoc.format is not None:
            Session.commit()
        else:
            Session.rollback()
def _get_dataset(self, id):
    """Return the first RDFDoc whose id equals ``id``, or None if none match."""
    matching_docs = Session.query(model.RDFDoc).filter(model.RDFDoc.id == id)
    return matching_docs.first()
except Exception, errorstr: log.error(errorstr) error = errorstr if error is None and (modified or rdfdoc_to_do.current_stats is None): stat_result.triples = rdfdocstats.get_no_of_triples() stat_result.void = rdfdocstats.voidify('turtle') stat_result.warnings = rdfdocstats.get_no_of_warnings() if stat_result.warnings > 0: last_warning = rdfdocstats.get_last_warning() stat_result.last_warning = unicode(last_warning.message, errors='replace') stat_result.has_errors = False stat_result.errors = None stats_results = rdfdocstats.get_stats_results() for class_uri,usage_count in stats_results['usedclasses']['usage_count'].iteritems(): c = Session.query(model.RDFClass).filter(model.RDFClass.uri==class_uri).first() if c is None: c = model.RDFClass() c.uri = class_uri Session.add(c) rcs = model.RDFClassStat() rcs.rdf_class = c rcs.stat_result = stat_result rcs.count = usage_count Session.add(rcs) # vocab: for base_uri,result in stats_results['vocabularies'].iteritems(): if result > 0: v = Session.query(model.Vocab).filter(model.Vocab.uri==base_uri).first() if v is None: v = model.Vocab()