def sixodp_to_opendata_postprocess(package_dict):
    """Adjust a harvested sixodp package dict in place for opendata.

    Forces the collection type, guarantees non-empty maintainer fields,
    normalizes the release date, and drops invalid resource time-series
    bounds.
    """
    package_dict['collection_type'] = 'Open Data'

    # Maintainer fields must be present and non-empty; fall back to a
    # single space when missing or blank.
    for contact_key in ('maintainer', 'maintainer_email'):
        package_dict[contact_key] = package_dict.get(contact_key, ' ') or ' '

    released = parse_datetime(package_dict['date_released'])
    if released:
        # Strip any timezone offset and append microseconds so the value
        # matches CKAN's timestamp format.
        stamp = "%s.000000" % released.isoformat().split('+', 2)[0]
        package_dict['date_released'] = stamp
        package_dict['metadata_created'] = stamp

    # Remove time-series bounds that fail ISO-date validation.
    for resource in package_dict['resources']:
        for bound_key in ('time_series_start', 'time_series_end'):
            value = resource.get(bound_key)
            if value:
                try:
                    isodate(value, {})
                except Invalid:
                    resource.pop(bound_key)
def taxonomy_update(context, data_dict):
    """ Updates an existing taxonomy.

    title, name and uri are required

    :returns: The newly updated taxonomy
    :rtype: A dictionary
    """
    _check_access('taxonomy_update', context, data_dict)

    model = context['model']

    id = logic.get_or_bust(data_dict, 'id')
    tax = Taxonomy.get(id)
    if not tax:
        raise logic.NotFound()

    tax.name = data_dict.get('name', tax.name)
    tax.title = data_dict.get('title', tax.title)
    # BUG FIX: this previously read data_dict.get('name', ...), silently
    # overwriting the taxonomy URI with its name on every update that
    # supplied a name.
    tax.uri = data_dict.get('uri', tax.uri)

    last_modified = data_dict.get('last_modified', tax.last_modified)
    if tax.last_modified != last_modified:
        # Only re-parse when the caller actually changed the timestamp.
        tax.last_modified = isodate(last_modified, context)

    model.Session.add(tax)
    model.Session.commit()

    return tax.as_dict()
def _changed_packages_since(self, registry, since_time):
    """
    Query the source ckan instance for packages changed since since_time.

    registry - LocalCKAN or RemoteCKAN instance
    since_time - local datetime to start looking for changes

    Returns (packages, next since_time to query), where each package is
    a JSON-encoded package_show result, or (None, None) when no more
    changes are found.
    """
    activities = registry.action.changed_packages_activity_list_since(
        since_time=since_time.isoformat())
    if not activities:
        return None, None

    packages = []
    for activity in activities:
        pkg_id = activity['data']['package']['id']
        try:
            pkg = registry.action.package_show(id=pkg_id)
        except NotFound:
            # Package vanished between the activity query and now.
            continue
        packages.append(json.dumps(pkg))

    # Resume the next query from the last activity's timestamp.
    next_time = isodate(activities[-1]['timestamp'], None)
    return packages, next_time
def _trim_package(pkg): """ remove keys from pkg that we don't care about when comparing or updating/creating packages. Also try to convert types and create missing fields that will be present in package_show. """ # XXX full of custom hacks and deep knowledge of our schema :-( if not pkg: return for k in ['extras', 'metadata_modified', 'metadata_created', 'revision_id', 'revision_timestamp', 'organization', 'version', 'tracking_summary', 'tags', # just because we don't use them 'num_tags', 'num_resources', 'maintainer', 'isopen', 'relationships_as_object', 'license_title', 'license_title_fra', 'license_url_fra', 'license_url', 'author', 'groups', # just because we don't use them 'relationships_as_subject', 'department_number', # FIXME: remove these when we can: 'resource_type', # new in 2.3: 'creator_user_id', ]: if k in pkg: del pkg[k] for r in pkg['resources']: for k in ['package_id', 'revision_id', 'revision_timestamp', 'cache_last_updated', 'webstore_last_updated', 'state', 'hash', 'description', 'tracking_summary', 'mimetype_inner', 'mimetype', 'cache_url', 'created', 'webstore_url', 'last_modified', 'position']: if k in r: del r[k] for k in ['name', 'size']: if k not in r: r[k] = None for k in ['private']: pkg[k] = boolean_validator(unicode(pkg.get(k, '')), None) if 'name' not in pkg: pkg['name'] = pkg['id'] if 'type' not in pkg: pkg['type'] = 'dataset' if 'state' not in pkg: pkg['state'] = 'active' for k in ['url']: if k not in pkg: pkg[k] = '' for name, lang, field in schema_description.dataset_field_iter(): if field['type'] == 'date': try: pkg[name] = str(isodate(pkg[name], None)) if pkg.get(name) else '' except Invalid: pass # not for us to fail validation elif field['type'] == 'url': if not pkg.get(name): # be consistent about what an empty url is pkg[name] = "" elif field['type'] == 'fixed' and name in pkg: del pkg[name]
def portal_update_worker(self, source):
    """
    a process that accepts package ids on stdin which are passed to
    the package_show API on the remote CKAN instance and compared
    to the local version of the same package.  The local package is
    then created, updated, deleted or left unchanged.  This process
    outputs that action as a string 'created', 'updated', 'deleted'
    or 'unchanged'

    :param source: base URL of the remote (registry) CKAN instance
    """
    registry = RemoteCKAN(source)
    portal = LocalCKAN()
    now = datetime.now()

    # Read package ids line-by-line until EOF (empty string sentinel).
    for package_id in iter(sys.stdin.readline, ''):
        try:
            data = registry.action.package_show(id=package_id.strip())
            source_pkg = data['result']
        except NotAuthorized:
            # Not visible on the registry: treat as absent.
            source_pkg = None

        _trim_package(source_pkg)

        if source_pkg:
            # treat unpublished packages same as deleted packages
            if not source_pkg['portal_release_date'] or isodate(
                    source_pkg['portal_release_date'], None) > now:
                source_pkg = None

        try:
            # don't pass user in context so deleted packages
            # raise NotAuthorized
            target_pkg = portal.call_action('package_show',
                {'id': package_id.strip()}, {})
        except (NotFound, NotAuthorized):
            target_pkg = None

        _trim_package(target_pkg)

        if target_pkg is None and source_pkg is None:
            result = 'unchanged'
        elif target_pkg is None:
            # CREATE
            portal.action.package_create(**source_pkg)
            result = 'created'
        elif source_pkg is None:
            # DELETE
            portal.action.package_delete(id=package_id.strip())
            result = 'deleted'
        elif source_pkg == target_pkg:
            result = 'unchanged'
        else:
            # UPDATE
            portal.action.package_update(**source_pkg)
            result = 'updated'

        # Report the action to the parent process; a broken pipe on
        # flush means the parent is gone, so exit quietly.
        sys.stdout.write(result + '\n')
        try:
            sys.stdout.flush()
        except IOError:
            break
def portal_update(self, source, activity_date=None):
    """
    collect batches of package ids modified at source since activity_date
    and apply the package updates to the local CKAN instance for all
    packages with published_date set to any time in the past.

    :param source: base URL of the source (registry) CKAN instance
    :param activity_date: ISO datetime string to start from; defaults
        to one week ago when omitted
    """
    if activity_date:
        # XXX local time :-(
        activity_date = isodate(activity_date, None)
    else:
        activity_date = datetime.now() - timedelta(days=7)

    seen_package_id_set = set()

    def changed_package_id_runs(start_date):
        # Generator yielding (package_ids, next_date) batches until the
        # registry reports no further changes.
        while True:
            package_ids, next_date = self._changed_package_ids_since(
                source, start_date, seen_package_id_set)
            if next_date is None:
                return
            yield package_ids, next_date
            start_date = next_date

    # Pool of subprocesses running the portal-update-worker command.
    pool = worker_pool(
        [sys.argv[0], 'canada', 'portal-update-worker', source,
            '-c', self.options.config],
        self.options.processes,
        [],
        stop_when_jobs_done=False,
        stop_on_keyboard_interrupt=False,
        )
    pool.next()  # advance generator so we may call send() below

    try:
        for package_ids, next_date in changed_package_id_runs(activity_date):
            stats = dict(created=0, updated=0, deleted=0, unchanged=0)

            # Each job is (id, line-to-send-on-worker-stdin).
            jobs = ((i, i + '\n') for i in package_ids)
            try:
                job_ids, finished, result = pool.send(jobs)
                while result is not None:
                    # Workers report one action word per package.
                    stats[result.strip()] += 1
                    job_ids, finished, result = pool.next()
            except KeyboardInterrupt:
                break

            print next_date.isoformat(),
            print " ".join("%s:%s" % kv for kv in sorted(stats.items()))
    except IOError, e:
        # let pipe errors cause silent exit --
        # the worker will have provided the real traceback
        if e.errno != 32:
            raise
def changed_packages_activity_list_since(context, data_dict):
    '''Return the activity stream of all recently added or changed packages.

    :param since_time: starting date/time

    Limited to 31 records (configurable via the
    ckan.activity_list_hard_limit setting) but may be called repeatedly
    with the timestamp of the last record to collect all activities.

    :rtype: list of dictionaries
    '''
    raw_since = get_or_bust(data_dict, 'since_time')
    try:
        since_time = isodate(raw_since, None)
    except Invalid as e:
        raise ValidationError({'since_time': e.error})
    # NOTE(review): since_time is unused within the visible block; the
    # activity query presumably follows — confirm against full source.
def changed_packages_activity_list_since(context, data_dict): '''Return the activity stream of all recently added or changed packages. :param since_time: starting date/time Limited to 31 records (configurable via the ckan.activity_list_hard_limit setting) but may be called repeatedly with the timestamp of the last record to collect all activities. :rtype: list of dictionaries ''' since = get_or_bust(data_dict, 'since_time') try: since_time = isodate(since, None) except Invalid, e: raise ValidationError({'since_time':e.error})
def changed_datasets(self, since_date): """ Produce a list of dataset ids and requested dates. Each package id will appear at most once, showing the activity date closest to since_date. Requested dates are preceeded with a "#" """ since_date = isodate(since_date, None) seen_ids = set() while True: ids, since_date = self._changed_package_ids_since( self.options.server, since_date, seen_ids) if not ids: return for i in ids: print i if not self.options.brief: print "# {0}".format(since_date.isoformat())
def portal_update(self, source, activity_date=None):
    """
    collect batches of package ids modified at source since activity_date
    and apply the package updates to the local CKAN instance for all
    packages with published_date set to any time in the past.

    :param source: base URL of the source (registry) CKAN instance
    :param activity_date: ISO datetime string to start from; defaults
        to one week ago when omitted
    """
    if activity_date:
        # XXX local time :-(
        activity_date = isodate(activity_date, None)
    else:
        activity_date = datetime.now() - timedelta(days=7)

    seen_package_id_set = set()

    def changed_package_id_runs(start_date):
        # Generator yielding (package_ids, next_date) batches until the
        # registry reports no further changes.
        while True:
            package_ids, next_date = self._changed_package_ids_since(
                source, start_date, seen_package_id_set)
            if next_date is None:
                return
            yield package_ids, next_date
            start_date = next_date

    # Pool of subprocesses running the portal-update-worker command.
    pool = worker_pool(
        [sys.argv[0], 'canada', 'portal-update-worker', source,
            '-c', self.options.config],
        self.options.processes,
        [],
        stop_when_jobs_done=False,
        stop_on_keyboard_interrupt=False,
        )
    pool.next()  # advance generator so we may call send() below

    # Suppress broken-pipe noise when a worker exits early.
    with _quiet_int_pipe():
        for package_ids, next_date in changed_package_id_runs(activity_date):
            stats = dict(created=0, updated=0, deleted=0, unchanged=0)

            job_ids, finished, result = pool.send(enumerate(package_ids))
            while result is not None:
                # Workers report one action word per package.
                stats[result.strip()] += 1
                job_ids, finished, result = pool.next()

            print next_date.isoformat(),
            print " ".join("%s:%s" % kv for kv in sorted(stats.items()))
def taxonomy_create(context, data_dict):
    """ Creates a new taxonomy.

    Terms are not created here; create them with taxonomy_term_create
    using the taxonomy id returned by this call.

    :param owner_org: the id of the dataset's owning organization, see
    :returns: The newly created taxonomy
    :rtype: A dictionary.
    """
    _check_access('taxonomy_create', context, data_dict)
    model = context['model']

    title = logic.get_or_bust(data_dict, 'title')
    uri = logic.get_or_bust(data_dict, 'uri')
    # Fall back to a munged title when no name was supplied.
    name = data_dict.get('name') or munge_name(title)
    last_modified = data_dict.get('last_modified', "")

    # Reject duplicate names up front.
    existing = model.Session.query(Taxonomy).filter(
        Taxonomy.name == name).count()
    if existing > 0:
        raise logic.ValidationError("Name is already in use")

    # NOTE(review): the fallback calls datetime.date.today(); other code
    # in this codebase uses datetime.now(), so confirm how `datetime`
    # is imported in this module.
    if last_modified != "":
        stamp = isodate(last_modified, context)
    else:
        stamp = datetime.date.today()

    taxonomy = Taxonomy(name=name, title=title, uri=uri,
                        last_modified=stamp)
    model.Session.add(taxonomy)
    model.Session.commit()
    return taxonomy.as_dict()
def _changed_package_ids_since(self, registry, since_time, seen_id_set=None):
    """
    Query source ckan instance for package ids changed since_time.

    returns (package ids, next since_time to query) or (None, None)
    when no more changes are found.

    registry - LocalCKAN or RemoteCKAN instance
    since_time - local datetime to start looking for changes
    seen_id_set - set of package ids already processed, this set is
                  modified by calling this function

    If all the package ids found were included in seen_id_set this
    function will return an empty list of package ids.  Note that this
    is different than when no more changes found and (None, None)
    is returned.
    """
    if seen_id_set is None:
        seen_id_set = set()

    activities = registry.action.changed_packages_activity_list_since(
        since_time=since_time.isoformat())
    if not activities:
        return None, None

    fresh_ids = []
    for activity in activities:
        pkg_id = activity['data']['package']['id']
        if pkg_id not in seen_id_set:
            seen_id_set.add(pkg_id)
            fresh_ids.append(pkg_id)

    # Resume the next query from the last activity's timestamp.
    next_time = isodate(activities[-1]['timestamp'], None)
    return fresh_ids, next_time
def changed_datasets(self, since_date): """ Produce a list of dataset ids and requested dates. Each package id will appear at most once, showing the activity date closest to since_date. Requested dates are preceeded with a "#" """ since_date = isodate(since_date, None) seen_ids = set() if self.options.server: registry = RemoteCKAN(self.options.server) else: registry = LocalCKAN() while True: ids, since_date = self._changed_package_ids_since( registry, since_date, seen_ids) if not ids: return for i in ids: print i if not self.options.brief: print "# {0}".format(since_date.isoformat())
def copy_datasets(self, remote, package_ids=None):
    """
    a process that accepts packages on stdin which are compared
    to the local version of the same package.  The local package is
    then created, updated, deleted or left unchanged.  This process
    outputs that action as a string 'created', 'updated', 'deleted'
    or 'unchanged'
    """
    portal = LocalCKAN()
    now = datetime.now()

    # Each stdin line is a full JSON-encoded source package.
    packages = iter(sys.stdin.readline, '')

    for package in packages:
        source_pkg = json.loads(package)
        package_id = source_pkg['id']
        reason = None
        target_deleted = False

        if source_pkg and source_pkg['state'] == 'deleted':
            source_pkg = None

        if source_pkg and source_pkg['type'] not in DATASET_TYPES:
            # non-default dataset types ignored
            source_pkg = None

        _trim_package(source_pkg)

        action = None
        if source_pkg and not self.options.mirror:
            # Unpublished / not-ready packages are treated like deleted
            # ones; the reason is recorded for the log output.
            if source_pkg.get('ready_to_publish') == 'false':
                source_pkg = None
                reason = 'marked not ready to publish'
            elif not source_pkg.get('portal_release_date'):
                source_pkg = None
                reason = 'release date not set'
            elif isodate(source_pkg['portal_release_date'], None) > now:
                source_pkg = None
                reason = 'release date in future'
            else:
                # portal packages published public
                source_pkg['private'] = False

        # NOTE(review): action is never set to 'skip' in the visible
        # code, so this branch always runs — confirm against full source.
        if action != 'skip':
            try:
                target_pkg = portal.call_action('package_show', {
                    'id': package_id
                })
            except (NotFound, NotAuthorized):
                target_pkg = None
            except (CKANAPIError, urllib2.URLError), e:
                sys.stdout.write(
                    json.dumps([
                        package_id,
                        'target error',
                        unicode(e.args)
                    ]) + '\n'
                )
                raise

            if target_pkg and target_pkg['state'] == 'deleted':
                # Deleted on the target: treat as absent but remember so
                # we can undelete instead of create.
                target_pkg = None
                target_deleted = True

            _trim_package(target_pkg)

        if action == 'skip':
            pass
        elif target_pkg is None and source_pkg is None:
            action = 'unchanged'
            reason = reason or 'deleted on registry'
        elif target_deleted:
            action = 'updated'
            reason = 'undeleting on target'
            portal.action.package_update(**source_pkg)
        elif target_pkg is None:
            action = 'created'
            portal.action.package_create(**source_pkg)
        elif source_pkg is None:
            action = 'deleted'
            portal.action.package_delete(id=package_id)
        elif source_pkg == target_pkg:
            action = 'unchanged'
            reason = 'no difference found'
        else:
            action = 'updated'
            portal.action.package_update(**source_pkg)

        # Report [id, action, reason] to the parent process.
        sys.stdout.write(json.dumps([package_id, action, reason]) + '\n')
        sys.stdout.flush()
def copy_datasets(self, remote, package_ids=None):
    """
    a process that accepts package ids on stdin which are passed to
    the package_show API on the remote CKAN instance and compared
    to the local version of the same package.  The local package is
    then created, updated, deleted or left unchanged.  This process
    outputs that action as a string 'created', 'updated', 'deleted'
    or 'unchanged'
    """
    # Direction depends on options: push to remote (-a) or fetch
    # from remote (-f); exactly one must be given.
    if self.options.push_apikey and not self.options.fetch:
        registry = LocalCKAN()
        portal = RemoteCKAN(remote, apikey=self.options.push_apikey)
    elif self.options.fetch:
        registry = RemoteCKAN(remote)
        portal = LocalCKAN()
    else:
        print "exactly one of -f or -a options must be specified"
        return

    now = datetime.now()

    if not package_ids:
        # Fall back to reading ids line-by-line from stdin.
        package_ids = iter(sys.stdin.readline, '')

    for package_id in package_ids:
        package_id = package_id.strip()
        reason = None
        target_deleted = False

        try:
            source_pkg = registry.action.package_show(id=package_id)
        except NotAuthorized:
            source_pkg = None
        except (CKANAPIError, urllib2.URLError), e:
            sys.stdout.write(json.dumps([package_id, 'source error',
                unicode(e.args)]) + '\n')
            raise

        if source_pkg and source_pkg['state'] == 'deleted':
            source_pkg = None

        if source_pkg and source_pkg['type'] != 'dataset':
            # non-default dataset types ignored
            source_pkg = None

        _trim_package(source_pkg)

        if source_pkg and not self.options.mirror:
            # treat unpublished packages same as deleted packages
            if not source_pkg['portal_release_date']:
                source_pkg = None
                reason = 'release date not set'
            elif isodate(source_pkg['portal_release_date'], None) > now:
                source_pkg = None
                reason = 'release date in future'

        try:
            target_pkg = portal.call_action('package_show',
                {'id': package_id})
        except (NotFound, NotAuthorized):
            target_pkg = None
        except (CKANAPIError, urllib2.URLError), e:
            sys.stdout.write(json.dumps([package_id, 'target error',
                unicode(e.args)]) + '\n')
            raise
def _portal_update(self, portal_ini, activity_date):
    """Drive a batched portal update against the portal described by
    portal_ini, starting from activity_date (ISO string or a relative
    "past" expression matching PAST_RE, e.g. days/hours/minutes ago).
    """
    if activity_date:
        past = re.match(PAST_RE, activity_date)
        if past:
            # Relative form: convert captured days/hours/minutes into a
            # timedelta back from now; missing groups count as zero.
            days, hours, minutes = (
                int(x) if x else 0 for x in past.groups()
            )
            activity_date = datetime.now() - timedelta(
                days=days, seconds=(hours * 60 + minutes) * 60
            )
        else:
            activity_date = isodate(activity_date, None)
    else:
        activity_date = datetime.now() - timedelta(days=7)

    log = None
    if self.options.log:
        log = open(self.options.log, 'a')

    registry = LocalCKAN()

    def changed_package_id_runs(start_date):
        # Generator yielding (packages, next_date) batches until the
        # registry reports no further changes.
        while True:
            packages, next_date = self._changed_packages_since(
                registry, start_date)
            if next_date is None:
                return
            yield packages, next_date
            start_date = next_date

    cmd = [
        sys.argv[0],
        'canada',
        'copy-datasets',
        '-c',
        portal_ini
    ]
    if self.options.mirror:
        cmd.append('-m')

    pool = worker_pool(
        cmd,
        self.options.processes,
        [],
        stop_when_jobs_done=False,
        stop_on_keyboard_interrupt=False,
    )

    # Advance generator so we may call send() below
    pool.next()

    def append_log(finished, package_id, action, reason):
        # Write one JSON log line per event when logging is enabled.
        if not log:
            return
        log.write(json.dumps([
            datetime.now().isoformat(),
            finished,
            package_id,
            action,
            reason,
        ]) + '\n')
        log.flush()

    # Suppress broken-pipe noise when a worker exits early.
    with _quiet_int_pipe():
        append_log(
            None,
            None,
            "started updating from:",
            activity_date.isoformat()
        )

        for packages, next_date in (
                changed_package_id_runs(activity_date)):
            job_ids, finished, result = pool.send(enumerate(packages))
            stats = completion_stats(self.options.processes)
            while result is not None:
                # Workers report [package_id, action, reason] as JSON.
                package_id, action, reason = json.loads(result)
                print job_ids, stats.next(), finished, package_id, \
                    action, reason
                append_log(finished, package_id, action, reason)
                job_ids, finished, result = pool.next()

            print " --- next batch starting at: " + next_date.isoformat()
            append_log(
                None,
                None,
                "next batch starting at:",
                next_date.isoformat()
            )
            # Remember progress so a restart can resume from here.
            self._portal_update_activity_date = next_date.isoformat()
        self._portal_update_completed = True
def _trim_package(pkg):
    """
    remove keys from pkg that we don't care about when comparing
    or updating/creating packages.  Also try to convert types and
    create missing fields that will be present in package_show.

    Mutates pkg in place; a falsy pkg (e.g. None) is left untouched.
    """
    # XXX full of custom hacks and deep knowledge of our schema :-(
    if not pkg:
        return
    # Top-level keys that never take part in package comparison.
    for k in ['extras', 'metadata_modified', 'metadata_created',
            'revision_id', 'revision_timestamp', 'organization',
            'version', 'tracking_summary',
            'tags', # just because we don't use them
            'num_tags', 'num_resources', 'maintainer',
            'isopen', 'relationships_as_object', 'license_title',
            'license_title_fra', 'license_url_fra', 'license_url',
            'maintainer_email', 'author',
            'groups', # just because we don't use them
            'relationships_as_subject', 'department_number',
            # FIXME: remove these when we can:
            'resource_type',
            ]:
        if k in pkg:
            del pkg[k]
    for r in pkg['resources']:
        # Per-resource keys ignored for comparison purposes.
        for k in ['resource_group_id', 'revision_id',
                'revision_timestamp', 'cache_last_updated',
                'webstore_last_updated', 'id', 'state', 'hash',
                'description', 'tracking_summary', 'mimetype_inner',
                'mimetype', 'cache_url', 'created', 'webstore_url',
                'last_modified', 'position',
                ]:
            if k in r:
                del r[k]
        # package_show always reports these; fill them in when absent.
        for k in ['name', 'size']:
            if k not in r:
                r[k] = None
    # Normalize to real booleans so string and bool forms compare equal.
    for k in ['ready_to_publish', 'private']:
        pkg[k] = boolean_validator(unicode(pkg.get(k, '')), None)
    # Defaults package_show would have supplied.
    if 'name' not in pkg:
        pkg['name'] = pkg['id']
    if 'type' not in pkg:
        pkg['type'] = 'dataset'
    if 'state' not in pkg:
        pkg['state'] = 'active'
    for k in ['url']:
        if k not in pkg:
            pkg[k] = ''
    # Schema-driven per-field normalization.
    for name, lang, field in schema_description.dataset_field_iter():
        if field['type'] == 'date':
            try:
                pkg[name] = str(isodate(pkg[name], None)) if pkg.get(name) else ''
            except Invalid:
                pass # not for us to fail validation
        elif field['type'] == 'tag_vocabulary' and not isinstance(
                pkg.get(name), list):
            # Convert legacy pilot-uuid strings to tag lists.
            pkg[name] = convert_pilot_uuid_list(field)(pkg.get(name, []))
        elif field['type'] == 'url':
            if not pkg.get(name):
                # be consistent about what an empty url is
                pkg[name] = ""
        elif field['type'] == 'fixed' and name in pkg:
            # Fixed fields are server-controlled; never compare them.
            del pkg[name]
def _portal_update(self, source, activity_date):
    """Drive a batched update between this instance and source,
    starting from activity_date (ISO string or a relative "past"
    expression matching PAST_RE, e.g. days/hours/minutes ago).
    Direction depends on options: push (-a) or fetch (-f).
    """
    if activity_date:
        past = re.match(PAST_RE, activity_date)
        if past:
            # Relative form: convert captured days/hours/minutes into a
            # timedelta back from now; missing groups count as zero.
            days, hours, minutes = (int(x) if x else 0
                for x in past.groups())
            activity_date = datetime.now() - timedelta(days=days,
                seconds=(hours * 60 + minutes) * 60)
        else:
            activity_date = isodate(activity_date, None)
    else:
        activity_date = datetime.now() - timedelta(days=7)

    log = None
    if self.options.log:
        log = open(self.options.log, 'a')

    seen_package_id_set = set()

    # Source of changed-package activity depends on direction.
    if self.options.push_apikey and not self.options.fetch:
        registry = LocalCKAN()
    elif self.options.fetch:
        registry = RemoteCKAN(source)
    else:
        print "exactly one of -f or -a options must be specified"
        return

    def changed_package_id_runs(start_date):
        # Generator yielding (package_ids, next_date) batches until the
        # registry reports no further changes.
        while True:
            package_ids, next_date = self._changed_package_ids_since(
                registry, start_date, seen_package_id_set)
            if next_date is None:
                return
            yield package_ids, next_date
            start_date = next_date

    # Worker command inherits the push/fetch and mirror options.
    cmd = [sys.argv[0], 'canada', 'copy-datasets', source,
        '-c', self.options.config]
    if self.options.push_apikey:
        cmd.extend(['-a', self.options.push_apikey])
    else:
        cmd.append('-f')
    if self.options.mirror:
        cmd.append('-m')

    pool = worker_pool(
        cmd,
        self.options.processes,
        [],
        stop_when_jobs_done=False,
        stop_on_keyboard_interrupt=False,
        )
    pool.next()  # advance generator so we may call send() below

    def append_log(finished, package_id, action, reason):
        # Write one JSON log line per event when logging is enabled.
        if not log:
            return
        log.write(json.dumps([
            datetime.now().isoformat(),
            finished,
            package_id,
            action,
            reason,
            ]) + '\n')
        log.flush()

    # Suppress broken-pipe noise when a worker exits early.
    with _quiet_int_pipe():
        append_log(None, None, "started updating from:",
            activity_date.isoformat())

        for package_ids, next_date in changed_package_id_runs(
                activity_date):
            job_ids, finished, result = pool.send(enumerate(package_ids))
            stats = completion_stats(self.options.processes)
            while result is not None:
                # Workers report [package_id, action, reason] as JSON.
                package_id, action, reason = json.loads(result)
                print job_ids, stats.next(), finished, package_id, \
                    action, reason
                append_log(finished, package_id, action, reason)
                job_ids, finished, result = pool.next()

            print " --- next batch starting at: " + next_date.isoformat()
            append_log(None, None, "next batch starting at:",
                next_date.isoformat())
            # Remember progress so a restart can resume from here.
            self._portal_update_activity_date = next_date.isoformat()
        self._portal_update_completed = True