def dump_datasets(self):
    """
    Output all public datasets as a .jl file
    """
    registry = LocalCKAN('visitor')
    package_names = registry.action.package_list()

    cmd = [sys.argv[0], 'canada', 'dump-datasets-worker',
        '-c', self.options.config]
    stats = completion_stats(self.options.processes)
    pool = worker_pool(cmd, self.options.processes,
        enumerate(package_names))

    sink = sys.stdout
    if self.options.gzip:
        sink = gzip.GzipFile(fileobj=sys.stdout, mode='wb')

    expecting_number = 0
    results = {}
    with _quiet_int_pipe():
        for job_ids, finished, result in pool:
            sys.stderr.write("%s %s %s\n" % (
                job_ids, stats.next(), finished))
            results[finished] = result
            # keep the output in the same order as package_names
            while expecting_number in results:
                sink.write(results.pop(expecting_number))
                expecting_number += 1
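# _quiet_int_pipe is used above but not shown in this excerpt. A minimal
# sketch, assuming it wraps the same errno-32 (EPIPE) handling that the
# older functions below perform inline: swallow broken-pipe errors and
# Ctrl-C so that piping output through e.g. `head` exits silently while
# the workers report any real tracebacks.
from contextlib import contextmanager

@contextmanager
def _quiet_int_pipe():
    try:
        yield
    except KeyboardInterrupt:
        pass
    except IOError, e:
        if e.errno != 32:  # 32 == EPIPE: the reading process closed the pipe
            raise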
def portal_update(self, source, activity_date=None):
    """
    collect batches of package ids modified at source since activity_date
    and apply the package updates to the local CKAN instance for all
    packages with published_date set to any time in the past.
    """
    if activity_date:
        # XXX local time :-(
        activity_date = isodate(activity_date, None)
    else:
        activity_date = datetime.now() - timedelta(days=7)

    seen_package_id_set = set()

    def changed_package_id_runs(start_date):
        while True:
            package_ids, next_date = self._changed_package_ids_since(
                source, start_date, seen_package_id_set)
            if next_date is None:
                return
            yield package_ids, next_date
            start_date = next_date

    pool = worker_pool(
        [sys.argv[0], 'canada', 'portal-update-worker', source,
         '-c', self.options.config],
        self.options.processes,
        [],
        stop_when_jobs_done=False,
        stop_on_keyboard_interrupt=False,
        )
    pool.next()  # advance generator so we may call send() below

    try:
        for package_ids, next_date in changed_package_id_runs(
                activity_date):
            stats = dict(created=0, updated=0, deleted=0, unchanged=0)

            jobs = ((i, i + '\n') for i in package_ids)
            try:
                job_ids, finished, result = pool.send(jobs)
                while result is not None:
                    stats[result.strip()] += 1
                    job_ids, finished, result = pool.next()
            except KeyboardInterrupt:
                break

            print next_date.isoformat(),
            print " ".join("%s:%s" % kv for kv in sorted(stats.items()))
    except IOError, e:
        # let pipe errors cause silent exit --
        # the worker will have provided the real traceback
        if e.errno != 32:
            raise
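# worker_pool is also external to this excerpt. The real pool spawns
# `processes` copies of `cmd`, feeds each job's text to a worker's stdin
# and yields (job_ids, finished_id, result_line) tuples as replies come
# back. The hypothetical single-process stand-in below sketches only the
# generator protocol the callers rely on: plain iteration when all jobs
# are known up front, or -- with stop_when_jobs_done=False -- a priming
# next(), send() to start a batch, and a None result marking the end of
# each batch.
def toy_worker_pool(cmd, processes, jobs, stop_when_jobs_done=True,
        stop_on_keyboard_interrupt=True):
    job_ids = [None] * processes  # the real pool tracks one job per worker
    jobs = iter(jobs)
    while True:
        try:
            finished, job = jobs.next()
        except StopIteration:
            if stop_when_jobs_done:
                return
            # end of batch: report a None result, then wait for send()
            new_jobs = yield job_ids, None, None
            if new_jobs is not None:
                jobs = iter(new_jobs)
            continue
        # a real pool would pipe `job` to a worker and read its reply;
        # this toy echoes a fixed result line instead
        new_jobs = yield job_ids, finished, 'unchanged\n'
        if new_jobs is not None:
            jobs = iter(new_jobs)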
def portal_update(self, source, activity_date=None):
    """
    collect batches of package ids modified at source since activity_date
    and apply the package updates to the local CKAN instance for all
    packages with published_date set to any time in the past.
    """
    if activity_date:
        # XXX local time :-(
        activity_date = isodate(activity_date, None)
    else:
        activity_date = datetime.now() - timedelta(days=7)

    seen_package_id_set = set()

    def changed_package_id_runs(start_date):
        while True:
            package_ids, next_date = self._changed_package_ids_since(
                source, start_date, seen_package_id_set)
            if next_date is None:
                return
            yield package_ids, next_date
            start_date = next_date

    pool = worker_pool(
        [sys.argv[0], 'canada', 'portal-update-worker', source,
         '-c', self.options.config],
        self.options.processes,
        [],
        stop_when_jobs_done=False,
        stop_on_keyboard_interrupt=False,
        )
    pool.next()  # advance generator so we may call send() below

    with _quiet_int_pipe():
        for package_ids, next_date in changed_package_id_runs(
                activity_date):
            stats = dict(created=0, updated=0, deleted=0, unchanged=0)

            job_ids, finished, result = pool.send(enumerate(package_ids))
            while result is not None:
                stats[result.strip()] += 1
                job_ids, finished, result = pool.next()

            print next_date.isoformat(),
            print " ".join("%s:%s" % kv for kv in sorted(stats.items()))
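# isodate is another helper not shown here. A hypothetical stand-in,
# assuming a (value, default) signature where `default` is returned for
# an empty value; the real helper may accept more formats or report
# parse errors differently.
from datetime import datetime

def isodate(value, default):
    if not value:
        return default
    for fmt in ('%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    raise ValueError("invalid date: %r" % value)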
def load_datasets(self, jl_source, start_line=1, max_count=None):
    """
    Load datasets from a .jl source file, one JSON dataset per line,
    distributing the work to load-dataset-worker processes
    """
    start_line = int(start_line)
    if max_count is not None:
        max_count = int(max_count)

    log = None
    if self.options.log:
        log = open(self.options.log, 'a')

    def line_reader():
        for num, line in enumerate(open(jl_source), 1):
            if num < start_line:
                continue
            if max_count is not None and num >= start_line + max_count:
                break
            yield num, line.strip() + '\n'

    cmd = [sys.argv[0], 'canada', 'load-dataset-worker',
        '-c', self.options.config]
    if self.options.ckan_user:
        cmd += ['-u', self.options.ckan_user]
    if self.options.replace_datasets:
        cmd += ['-r']

    stats = completion_stats(self.options.processes)
    pool = worker_pool(cmd, self.options.processes, line_reader())

    try:
        for job_ids, finished, result in pool:
            timestamp, action, error, response = json.loads(result)
            print job_ids, stats.next(), finished, action,
            print json.dumps(response) if response else ''
            if log:
                log.write(json.dumps([
                    timestamp,
                    finished,
                    action,
                    error,
                    response,
                    ]) + '\n')
                log.flush()
    except IOError, e:
        # let pipe errors cause silent exit --
        # the worker will have provided the real traceback
        if e.errno != 32:
            raise
def load_datasets(self, jl_source, start_line=1, max_count=None):
    """
    Load datasets from a .jl source file (optionally gzipped), one JSON
    dataset per line, distributing the work to load-dataset-worker
    processes
    """
    start_line = int(start_line)
    if max_count is not None:
        max_count = int(max_count)

    log = None
    if self.options.log:
        log = open(self.options.log, 'a')

    def line_reader():
        if self.options.gzip:
            source_file = gzip.GzipFile(jl_source)
        else:
            source_file = open(jl_source)
        for num, line in enumerate(source_file, 1):
            if num < start_line:
                continue
            if max_count is not None and num >= start_line + max_count:
                break
            yield num, line.strip()

    cmd = [sys.argv[0], 'canada', 'load-dataset-worker',
        '-c', self.options.config]
    if self.options.ckan_user:
        cmd += ['-u', self.options.ckan_user]
    if self.options.replace_datasets:
        cmd += ['-r']

    stats = completion_stats(self.options.processes)
    pool = worker_pool(cmd, self.options.processes, line_reader())

    with _quiet_int_pipe():
        for job_ids, finished, result in pool:
            timestamp, action, error, response = json.loads(result)
            print job_ids, stats.next(), finished, action,
            print json.dumps(response) if response else ''
            if log:
                log.write(json.dumps([
                    timestamp,
                    finished,
                    action,
                    error,
                    response,
                    ]) + '\n')
                log.flush()
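# completion_stats is not defined in this excerpt either. Going by its
# use above -- stats.next() called once per finished job, the value
# printed inline as a progress indicator -- a plausible sketch is a
# generator reporting the recent completion rate:
import time

def completion_stats(processes):
    times = []
    while True:
        times.append(time.time())
        times = times[-(processes + 1):]  # keep only recent completions
        if len(times) > 1 and times[-1] > times[0]:
            rate = (len(times) - 1) / (times[-1] - times[0])
            yield '[%0.1f/s]' % rate
        else:
            yield '[-/s]'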
def _portal_update(self, source, activity_date):
    """
    Collect batches of package ids modified since activity_date and
    dispatch them to copy-datasets worker processes, recording progress
    in an optional log file
    """
    if activity_date:
        past = re.match(PAST_RE, activity_date)
        if past:
            days, hours, minutes = (
                int(x) if x else 0 for x in past.groups())
            activity_date = datetime.now() - timedelta(
                days=days,
                seconds=(hours * 60 + minutes) * 60)
        else:
            activity_date = isodate(activity_date, None)
    else:
        activity_date = datetime.now() - timedelta(days=7)

    log = None
    if self.options.log:
        log = open(self.options.log, 'a')

    seen_package_id_set = set()

    if self.options.push_apikey and not self.options.fetch:
        registry = LocalCKAN()
    elif self.options.fetch:
        registry = RemoteCKAN(source)
    else:
        print "exactly one of -f or -a options must be specified"
        return

    def changed_package_id_runs(start_date):
        while True:
            package_ids, next_date = self._changed_package_ids_since(
                registry, start_date, seen_package_id_set)
            if next_date is None:
                return
            yield package_ids, next_date
            start_date = next_date

    cmd = [sys.argv[0], 'canada', 'copy-datasets', source,
        '-c', self.options.config]
    if self.options.push_apikey:
        cmd.extend(['-a', self.options.push_apikey])
    else:
        cmd.append('-f')
    if self.options.mirror:
        cmd.append('-m')

    pool = worker_pool(
        cmd,
        self.options.processes,
        [],
        stop_when_jobs_done=False,
        stop_on_keyboard_interrupt=False,
        )
    pool.next()  # advance generator so we may call send() below

    def append_log(finished, package_id, action, reason):
        if not log:
            return
        log.write(json.dumps([
            datetime.now().isoformat(),
            finished,
            package_id,
            action,
            reason,
            ]) + '\n')
        log.flush()

    with _quiet_int_pipe():
        append_log(None, None, "started updating from:",
            activity_date.isoformat())

        for package_ids, next_date in changed_package_id_runs(
                activity_date):
            job_ids, finished, result = pool.send(enumerate(package_ids))
            stats = completion_stats(self.options.processes)
            while result is not None:
                package_id, action, reason = json.loads(result)
                print job_ids, stats.next(), finished, package_id, \
                    action, reason
                append_log(finished, package_id, action, reason)
                job_ids, finished, result = pool.next()

            print " --- next batch starting at: " + next_date.isoformat()
            append_log(None, None, "next batch starting at:",
                next_date.isoformat())
            self._portal_update_activity_date = next_date.isoformat()
        self._portal_update_completed = True
def _portal_update(self, source, activity_date):
    """
    Collect batches of package ids modified since activity_date and
    dispatch them to copy-datasets worker processes, recording progress
    in an optional log file
    """
    if activity_date:
        past = re.match(PAST_RE, activity_date)
        if past:
            days, hours, minutes = (
                int(x) if x else 0 for x in past.groups())
            activity_date = datetime.now() - timedelta(
                days=days,
                seconds=(hours * 60 + minutes) * 60)
        else:
            activity_date = isodate(activity_date, None)
    else:
        activity_date = datetime.now() - timedelta(days=7)

    log = None
    if self.options.log:
        log = open(self.options.log, 'a')

    if self.options.push_apikey and not self.options.fetch:
        registry = LocalCKAN()
    elif self.options.fetch:
        registry = RemoteCKAN(source)
    else:
        print "exactly one of -f or -a options must be specified"
        return

    def changed_package_id_runs(start_date):
        while True:
            package_ids, next_date = self._changed_package_ids_since(
                registry, start_date)
            if next_date is None:
                return
            yield package_ids, next_date
            start_date = next_date

    cmd = [sys.argv[0], 'canada', 'copy-datasets', source,
        '-c', self.options.config]
    if self.options.push_apikey:
        cmd.extend(['-a', self.options.push_apikey])
    else:
        cmd.append('-f')
    if self.options.mirror:
        cmd.append('-m')

    pool = worker_pool(
        cmd,
        self.options.processes,
        [],
        stop_when_jobs_done=False,
        stop_on_keyboard_interrupt=False,
        )
    pool.next()  # advance generator so we may call send() below

    def append_log(finished, package_id, action, reason):
        if not log:
            return
        log.write(json.dumps([
            datetime.now().isoformat(),
            finished,
            package_id,
            action,
            reason,
            ]) + '\n')
        log.flush()

    with _quiet_int_pipe():
        append_log(None, None, "started updating from:",
            activity_date.isoformat())

        for package_ids, next_date in changed_package_id_runs(
                activity_date):
            job_ids, finished, result = pool.send(enumerate(package_ids))
            stats = completion_stats(self.options.processes)
            while result is not None:
                package_id, action, reason = json.loads(result)
                print job_ids, stats.next(), finished, package_id, \
                    action, reason
                append_log(finished, package_id, action, reason)
                job_ids, finished, result = pool.next()

            print " --- next batch starting at: " + next_date.isoformat()
            append_log(None, None, "next batch starting at:",
                next_date.isoformat())
            self._portal_update_activity_date = next_date.isoformat()
        self._portal_update_completed = True
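# PAST_RE is referenced above but not defined in this excerpt. One
# plausible definition, consistent with the three optional groups
# consumed above (days, hours, minutes), matching relative dates such
# as '7d', '12h30m' or '2d6h':
PAST_RE = (
    r'^'
    r'(?:(\d+)d)?'  # days
    r'(?:(\d+)h)?'  # hours
    r'(?:(\d+)m)?'  # minutes
    r'$')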