def command(self):
    """Load pickled CKAN catalog descriptions and register unknown RDF packages.

    Reads /tmp/ckan_catalogs.pickled — a list of catalog dicts with keys
    'prefix', 'ckanApiUrl' and 'rdfpackages' — and, for every package whose
    name is not yet in the database, creates a model.RDFDoc from the first
    non-None resource of that package.
    """
    ckanCatalogPath = "/tmp/ckan_catalogs.pickled"
    # Pickle streams are binary: open with 'rb', not 'rU' (universal-newline
    # translation can corrupt the data).  'with' guarantees the file is closed.
    with open(ckanCatalogPath, 'rb') as f:
        ckanCatalogs = pickle.load(f)
    for catalog in ckanCatalogs:
        prefix = catalog['prefix']          # e.g. "datagov"
        ckanApiUrl = catalog['ckanApiUrl']  # e.g. http://catalog.data.gov/api (kept for parity; unused here)
        for package in catalog['rdfpackages']:
            # name is a part of the URI http://catalog.data.gov/dataset/<name>
            rdfPackageName = package['name']
            # just pick up the first resource which is not None
            rdfResource = next((r for r in package['resources'] if r is not None), None)
            if rdfResource is None:
                # original code subscripted None here (TypeError); skip instead
                continue
            # skip packages we already track
            rdfdoc = Session.query(model.RDFDoc).filter(model.RDFDoc.name == rdfPackageName).first()
            if rdfdoc:
                continue
            newRdfdoc = model.RDFDoc(name=rdfPackageName,
                                     uri=rdfResource['url'],
                                     format=rdfResource['format'],
                                     ckan_catalog=prefix)
            Session.add(newRdfdoc)
            Session.commit()
def callback_stats(self, rdfdocstat):
    """Statistics progress hook: checkpoint progress every 10000 triples.

    Persists the running triple count and accumulated warnings to the
    current StatResult, but only at each 10000-triple boundary.
    """
    done = rdfdocstat.get_no_of_triples()
    # nothing counted yet, or not at a checkpoint boundary -> no DB write
    if done <= 0 or done % 10000 != 0:
        return
    self.stat_result.triples_done = done
    self.stat_result.warnings = rdfdocstat.warnings
    Session.commit()
def process_dataset(self, id):
    """Claim dataset *id*, register this process as its worker and run LODStats.

    Flow: look the RDFDoc up, mark it as worked on, attach a fresh StatResult,
    then drive RDFStats with the progress callbacks defined on this class.
    NOTE(review): this function appears truncated here — the success/error
    bookkeeping continues in a non-contiguous fragment later in the file.
    """
    self.logging_file_config(config_file)
    log = logging.getLogger(__name__)
    self.worker_proc = None
    self.rdfdoc_to_do = None
    # make sure an interrupted run still unregisters the worker row
    signal.signal(signal.SIGINT, self.term_handler)
    signal.signal(signal.SIGTERM, self.term_handler)
    rdfdoc_to_do = self._get_dataset(id)
    if rdfdoc_to_do is None:
        log.warning("rdfdoc_to_do is None")
        return 0
    # register this worker
    self.worker_proc = model.WorkerProc()
    self.worker_proc.pid = os.getpid()
    self.worker_proc.rdfdoc = rdfdoc_to_do
    Session.add(self.worker_proc)
    rdfdoc_to_do.worked_on = True
    self.rdfdoc_to_do = rdfdoc_to_do
    log.debug("worker %i working on %i" % (self.worker_proc.pid, self.rdfdoc_to_do.id))
    # datasets previously flagged 'broken' are released immediately
    if rdfdoc_to_do.current_stats and rdfdoc_to_do.current_stats.errors == 'broken':
        rdfdoc_to_do.worked_on = False
        rdfdoc_to_do.last_updated = datetime.now()
        Session.delete(self.worker_proc)
        Session.commit()
        sys.exit(0)
    # keep the previous stats around so we can restore them if the remote
    # file turns out to be unmodified
    last_stat_result = rdfdoc_to_do.current_stats
    stat_result = model.StatResult()
    self.stat_result = stat_result
    rdfdoc_to_do.stats.append(stat_result)
    rdfdoc_to_do.current_stats = stat_result
    # reset all progress fields; the callbacks below fill them in
    stat_result.triples_done = None
    stat_result.content_length = None
    stat_result.bytes_download = None
    stat_result.bytes = None
    stat_result.warnings = None
    stat_result.last_warning = None
    Session.commit()
    log.info(rdfdoc_to_do.format)
    error = None
    modified = True  # set True if remote file has been modified
    try:
        rdfdocstats = RDFStats(rdfdoc_to_do.uri.encode('utf-8'), format=rdfdoc_to_do.format, stats=lodstats_stats)
        rdfdocstats.set_callback_function_download(self.callback_function_download)
        rdfdocstats.set_callback_function_extraction(self.callback_function_extraction)
        rdfdocstats.set_callback_function_statistics(self.callback_stats)
        rdfdocstats.start_statistics()
    except NotModified, errorstr:
        # remote source unchanged since last run: keep the old stats
        log.warning("not modified")
        modified = False
def term_handler(self, signum, frame):
    """SIGINT/SIGTERM handler: undo this worker's registration and exit.

    Rolls back the open transaction, deletes the WorkerProc row and, when a
    dataset had already been claimed, releases it and discards the
    half-finished StatResult.

    Args:
        signum: delivered signal number (unused beyond logging context).
        frame: current stack frame (unused).
    """
    log.debug("exiting through term handler")
    Session.rollback()
    if self.rdfdoc_to_do is None or not self.rdfdoc_to_do.worked_on:
        # no dataset claimed: just unregister the worker, if one was registered
        if self.worker_proc is not None:
            Session.delete(self.worker_proc)
            Session.commit()
        # exit unconditionally — the original only exited when worker_proc
        # existed, which would let a signalled process keep running
        sys.exit(0)
    # dataset was claimed: release it and drop the partial stats row
    self.rdfdoc_to_do.worked_on = False
    # the signal can arrive after the dataset is claimed but before the
    # StatResult is created; guard against a missing attribute
    stat_result = getattr(self, 'stat_result', None)
    if stat_result is not None:
        Session.delete(stat_result)
    Session.delete(self.worker_proc)
    Session.commit()
    sys.exit(0)
def reset_current_stats_and_worker(self):
    """Delete this document's current StatResult and worker registration,
    then clear the scheduling fields so the dataset can be processed again.

    NOTE(review): reconstructed from whitespace-mangled source; the trailing
    field resets and final commit are assumed to run unconditionally — verify
    against version history.
    """
    if self.current_stats is not None:
        # let the StatResult detach its dependents before it is deleted
        self.current_stats.prep_delete()
        Session.commit()
        Session.delete(self.current_stats)
        Session.commit()
    if self.worker is not None:
        Session.delete(self.worker)
    # reset scheduling bookkeeping
    self.last_updated = None
    self.file_last_modified = None
    self.worked_on = False
    Session.commit()
def command(self): self.logging_file_config(config_file) log = logging.getLogger(__name__) self.worker_proc = None self.rdfdoc_to_do = None signal.signal(signal.SIGINT, self.term_handler) signal.signal(signal.SIGTERM, self.term_handler) # do not spawn more than two workers number_of_workers = Session.query(model.WorkerProc).with_lockmode('read').count() if number_of_workers >= 2: return 0 # check for orphaned local packages allLocalPackages = Session.query(model.RDFDoc).all() for pkg in all_local_pkgs: if pkg.name not in package_list: log.debug("%s is gone and will be deleted" % pkg.name) #Session.delete(pkg) #Session.commit() for package_name in package_list: try: package = ckan.package_entity_get(package_name) except Exception, errorstr: log.debug("ERROR with %s: %s" % (package_name, errorstr)) continue rdfdoc = Session.query(model.RDFDoc).filter(model.RDFDoc.name==package['name']).first() if rdfdoc is None: rdfdoc = model.RDFDoc() Session.add(rdfdoc) rdfdoc.name = package['name'] class BreakIt: pass try: for resource in package['resources']: if resource['format'].lower() in ["application/x-ntriples", "nt", "gzip:ntriples"]: rdfdoc.format = "nt" rdfdoc.uri = resource['url'] raise BreakIt for resource in package['resources']: if resource['format'].lower() in ["application/x-nquads", "nquads"]: rdfdoc.format = "nq" rdfdoc.uri = resource['url'] raise BreakIt for resource in package['resources']: if resource['format'].lower() in ["application/rdf+xml", "rdf"]: rdfdoc.format = "rdf" rdfdoc.uri = resource['url'] raise BreakIt for resource in package['resources']: if resource['format'].lower() in ["text/turtle", "rdf/turtle", "ttl"]: rdfdoc.format = "ttl" rdfdoc.uri = resource['url'] raise BreakIt for resource in package['resources']: if resource['format'].lower() in ["text/n3", "n3"]: rdfdoc.format = "n3" rdfdoc.uri = resource['url'] raise BreakIt for resource in package['resources']: if resource['format'].lower() in ["api/sparql", "sparql"]: # prefer a sitemap.xml 
over sparql, if any for sitemap_resource in package['resources']: if sitemap_resource['format'].lower() in ["meta/sitemap"]: rdfdoc.format = "sitemap" rdfdoc.uri = sitemap_resource['url'] raise BreakIt rdfdoc.format = "sparql" rdfdoc.uri = resource['url'] except BreakIt: pass if rdfdoc.format is not None: Session.commit() else: Session.rollback()
def callback_function_extraction(self, rdfdocstat):
    """Extraction progress hook: persist how many bytes have been parsed."""
    extracted_so_far = rdfdocstat.bytes_extracted
    self.stat_result.bytes = extracted_so_far
    Session.commit()
def callback_function_download(self, rdfdocstat):
    """Download progress hook: persist expected and received byte counts.

    NOTE(review): writes 'bytes_downloaded', while process_dataset() resets a
    field named 'bytes_download' (no 'ed') — verify which attribute the
    StatResult model actually declares.
    """
    stats = self.stat_result
    stats.content_length = rdfdocstat.content_length
    stats.bytes_downloaded = rdfdocstat.bytes_downloaded
    Session.commit()
# NOTE(review): non-contiguous tail fragment of process_dataset().  The "if"
# matching the "elif not modified:" below (presumably the success branch of
# the statistics run) is outside this view, as are the definitions of
# namespacelinks_ordered, nsl_count, stat_result, last_stat_result, error,
# rdfdocstats and rdfdoc_to_do.  Indentation reconstructed — confirm nesting.
        # success path: persist per-namespace link counts, capped at 500 rows
        for link_uri, result in namespacelinks_ordered.iteritems():
            c = Session.query(model.Link).filter(model.Link.code == link_uri).first()
            if c is None:
                # first time this namespace is seen: create the Link row
                c = model.Link()
                c.code = link_uri
                Session.add(c)
            rcs = model.LinkStat()
            rcs.link = c
            rcs.stat_result = stat_result
            rcs.count = result
            Session.add(rcs)
            nsl_count += 1
            if nsl_count >= 500:
                break
    elif not modified:
        # remote file unchanged: drop the fresh StatResult, keep the old one
        rdfdoc_to_do.current_stats = last_stat_result
        Session.delete(stat_result)
    else:
        # statistics run failed: record the error on the new StatResult
        stat_result.triples = None
        stat_result.void = None
        stat_result.has_errors = True
        stat_result.errors = unicode(error)
    # common epilogue: release the dataset and unregister this worker
    rdfdoc_to_do.worked_on = False
    rdfdoc_to_do.last_updated = datetime.now()
    rdfdoc_to_do.file_last_modified = rdfdocstats.last_modified
    stat_result.last_updated = datetime.now()
    Session.delete(self.worker_proc)
    Session.commit()
    log.debug("Done!")