def test_0_package_role(self):
    """Check PackageRole creation and the removal of roles that lose their user.

    An ADMIN ``UserObjectRole`` and an ADMIN ``PackageRole`` are added for the
    user ``tester`` (the latter on package ``test0``).  After committing, the
    test asserts that exactly one matching package role exists, that its
    context is ``'Package'``, and that un-setting ``user`` on the
    ``user_object`` role drops the user's role count from two to one.
    """
    package = model.Package.by_name(u'test0')
    admin_user = model.User.by_name(u'tester')

    model.Session.add(
        model.UserObjectRole(role=model.Role.ADMIN, user=admin_user))
    model.Session.add(
        model.PackageRole(role=model.Role.ADMIN, package=package,
                          user=admin_user))

    # Re-fetch the package and build the query before the session is
    # committed and removed, exactly as the checks below expect.
    package = model.Package.by_name(u'test0')
    package_roles = model.Session.query(model.PackageRole).filter_by(
        role=model.Role.ADMIN, package=package, user=admin_user)
    model.repo.commit_and_remove()

    # basic test of existence
    assert len(package_roles.all()) == 1, package_roles.all()
    package_role = package_roles.first()
    assert package_role.context == 'Package', package_role.context

    # test delete-orphan
    user_roles = model.Session.query(model.UserObjectRole).filter_by(
        user=admin_user)
    assert user_roles.count() == 2, user_roles.all()
    orphaned_role = user_roles.filter_by(context=u'user_object').first()
    orphaned_role.user = None
    model.repo.commit_and_remove()
    assert user_roles.count() == 1, user_roles.all()

    # now test delete-orphan on PackageRole
    package_roles = model.Session.query(model.PackageRole)
    package_role = package_roles.first()
    package_role.user = None
    model.repo.commit_and_remove()
    # NOTE(review): this fresh query is not inspected in the visible source;
    # a final assertion may be missing -- confirm against the full file.
    package_roles = model.Session.query(model.PackageRole)
def import_stage(self, harvest_object):
    """Create or update the CKAN dataset described by ``harvest_object``.

    The harvest object's ``content`` is parsed as JSON into a package dict.
    The dict receives the object's GUID as ``id``, the values returned by
    ``_find_or_create_groups`` and ``_find_or_create_organization``, and a
    ``license_url`` extra when that key is present.  It is then passed to
    ``_create_or_update_package``, term translations are submitted and the
    session is committed.

    :param harvest_object: harvest object whose ``content`` and ``guid``
        attributes are read
    :returns: ``False`` when no harvest object is supplied, ``True`` once
        the session has been committed
    :raises Exception: any error raised during the import is logged with
        its traceback and re-raised unchanged
    """
    log.debug('In ZhGisHarvester import_stage')

    if not harvest_object:
        log.error('No harvest object received')
        return False

    try:
        package_dict = json.loads(harvest_object.content)
        package_dict['id'] = harvest_object.guid

        user = model.User.get(self.HARVEST_USER)
        context = {
            'model': model,
            'session': Session,
            'user': self.HARVEST_USER
        }

        # Find or create group the dataset should get assigned to
        package_dict['groups'] = self._find_or_create_groups(context)

        # Find or create the organization
        # the dataset should get assigned to
        package_dict['owner_org'] = self._find_or_create_organization(
            context)

        # Save license url in extras
        extras = []
        if 'license_url' in package_dict:
            extras.append(('license_url', package_dict['license_url']))
        package_dict['extras'] = extras

        # NOTE(review): the role is built before the package is created or
        # updated below, so ``package`` is presumably None on a first
        # import -- confirm this ordering is intended.
        package = model.Package.get(package_dict['id'])
        model.PackageRole(package=package, user=user, role=model.Role.ADMIN)

        log.debug('Save or update package %s (%s)',
                  package_dict['name'], package_dict['id'])
        self._create_or_update_package(package_dict, harvest_object)

        log.debug('Save or update term translations')
        self._submit_term_translations(context, package_dict)
        Session.commit()
    except Exception as e:
        log.exception(e)
        raise

    # Report success explicitly instead of falling off the end with None.
    return True
def import_stage(self, harvest_object):
    """Create or update the CKAN dataset described by ``harvest_object``.

    The harvest object's ``content`` is parsed as JSON.  Every group named
    in the package dict is looked up with ``group_show`` and created with
    ``group_create`` when the lookup fails.  The organization described by
    ``self.ORGANIZATION[u'de']`` is resolved the same way and stored as
    ``owner_org``.  The package is then passed to
    ``_create_or_update_package``, each entry of
    ``package_dict['translations']`` is sent to ``term_translation_update``
    and the session is committed.

    :param harvest_object: harvest object whose ``content`` and ``guid``
        attributes are read
    :returns: ``False`` when no harvest object is supplied, ``True`` once
        the session has been committed
    :raises GroupNotFoundError: when the dataset lists an empty group name
    :raises Exception: any error raised during the import is logged with
        its traceback and re-raised unchanged
    """
    log.debug('In ZhstatHarvester import_stage')

    if not harvest_object:
        log.error('No harvest object received')
        return False

    try:
        package_dict = json.loads(harvest_object.content)
        package_dict['id'] = harvest_object.guid
        package_dict['name'] = self._gen_new_name(package_dict['title'],
                                                  package_dict['id'])

        user = model.User.get(self.HARVEST_USER)
        context = {
            'model': model,
            'session': Session,
            'user': self.HARVEST_USER
        }

        # Find or create group the dataset should get assigned to
        for group_name in package_dict['groups']:
            if not group_name:
                raise GroupNotFoundError(
                    'Group is not defined for dataset %s' %
                    package_dict['title'])
            data_dict = {
                'id': group_name,
                'name': munge_title_to_name(group_name),
                'title': group_name
            }
            try:
                group = get_action('group_show')(context, data_dict)
                log.info('found group ' + group['id'])
            except Exception:
                # Any lookup failure is treated as "group missing"; unlike
                # a bare ``except`` this lets KeyboardInterrupt and
                # SystemExit propagate.
                group = get_action('group_create')(context, data_dict)
                log.info('created the group ' + group['id'])

        # Find or create the organization
        # the dataset should get assigned to
        organization_info = self.ORGANIZATION[u'de']
        organization_name = munge_title_to_name(organization_info['name'])
        data_dict = {
            'permission': 'edit_group',
            'id': organization_name,
            'name': organization_name,
            'title': organization_info['name'],
            'description': organization_info['description'],
            'extras': [{
                'key': 'website',
                'value': organization_info['website']
            }]
        }
        try:
            package_dict['owner_org'] = get_action('organization_show')(
                context, data_dict)['id']
        except Exception:
            organization = get_action('organization_create')(context,
                                                             data_dict)
            package_dict['owner_org'] = organization['id']

        # Save additional metadata in extras
        extras = []
        if 'license_url' in package_dict:
            extras.append(('license_url', package_dict['license_url']))
        package_dict['extras'] = extras
        log.debug('Extras %s' % extras)

        # NOTE(review): the role is built before the package is created or
        # updated below, so ``package`` is presumably None on a first
        # import -- confirm this ordering is intended.
        package = model.Package.get(package_dict['id'])
        model.PackageRole(package=package, user=user, role=model.Role.ADMIN)

        self._create_or_update_package(package_dict, harvest_object)

        # Add the translations to the term_translations table
        for translation in package_dict['translations']:
            action.update.term_translation_update(context, translation)
        Session.commit()
    except Exception as detail:
        log.exception(detail)
        raise

    # Report success explicitly instead of falling off the end with None.
    return True
result = self._create_or_update_package(package_dict, harvest_object) if result is True and self.config.get('read_only', False) is True: package = model.Package.get(package_dict['id']) # Clear default permissions model.clear_user_roles(package) # Setup harvest user as admin user_name = self.config.get('user', u'harvest') user = model.User.get(user_name) pkg_role = model.PackageRole(package=package, user=user, role=model.Role.ADMIN) # Other users can only read for user_name in (u'visitor', u'logged_in'): user = model.User.get(user_name) pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER) return result except ValidationError, e: self._save_object_error( 'Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict), harvest_object, 'Import') except Exception, e:
def import_stage(self, harvest_object):
    """Import one harvested remote CKAN dataset into the local instance.

    The harvest object's ``content`` is parsed as JSON and adjusted
    according to the source configuration (``self.config``): default tags,
    remote/default groups, remote/local organization and default extras.
    ``url_type`` is stripped from every resource before the dict is handed
    to ``_create_or_update_package``.  When the ``read_only`` option is set
    and the package call reports success, the package's roles are cleared
    and re-created: the harvest user as ADMIN, ``visitor`` and
    ``logged_in`` as READER.

    :param harvest_object: harvest object providing ``content``, ``id``,
        ``guid``, ``source`` and ``job``
    :returns: ``True`` when the dataset was processed or is itself a
        harvest source (which is ignored); ``False`` when the object is
        missing, has no content, or an error was recorded through
        ``_save_object_error``
    """
    log.debug("In CKANHarvester import_stage")

    context = {
        "model": model,
        "session": Session,
        "user": self._get_user_name()
    }
    if not harvest_object:
        log.error("No harvest object received")
        return False

    if harvest_object.content is None:
        self._save_object_error(
            "Empty content for object %s" % harvest_object.id,
            harvest_object,
            "Import",
        )
        return False

    self._set_config(harvest_object.job.source.config)

    try:
        package_dict = json.loads(harvest_object.content)

        if package_dict.get("type") == "harvest":
            log.warning("Remote dataset is a harvest source, ignoring...")
            return True

        # Set default tags if needed
        default_tags = self.config.get("default_tags", [])
        if default_tags:
            if "tags" not in package_dict:
                package_dict["tags"] = []
            package_dict["tags"].extend(
                [t for t in default_tags if t not in package_dict["tags"]])

        remote_groups = self.config.get("remote_groups", None)
        if remote_groups not in ("only_local", "create"):
            # Ignore remote groups
            package_dict.pop("groups", None)
        else:
            if "groups" not in package_dict:
                package_dict["groups"] = []

            # check if remote groups exist locally, otherwise remove
            validated_groups = []

            for group_name in package_dict["groups"]:
                try:
                    data_dict = {"id": group_name}
                    group = get_action("group_show")(context, data_dict)
                    if self.api_version == 1:
                        validated_groups.append(group["name"])
                    else:
                        validated_groups.append(group["id"])
                except NotFound:
                    log.info("Group %s is not available" % group_name)
                    if remote_groups == "create":
                        try:
                            group = self._get_group(
                                harvest_object.source.url, group_name)
                        except RemoteResourceError:
                            log.error("Could not get remote group %s" %
                                      group_name)
                            continue

                        # Drop these keys from the remote group dict
                        # before it is passed to group_create.
                        for key in [
                                "packages",
                                "created",
                                "users",
                                "groups",
                                "tags",
                                "extras",
                                "display_name",
                        ]:
                            group.pop(key, None)

                        get_action("group_create")(context, group)
                        log.info("Group %s has been newly created" %
                                 group_name)
                        if self.api_version == 1:
                            validated_groups.append(group["name"])
                        else:
                            validated_groups.append(group["id"])

            package_dict["groups"] = validated_groups

        # Local harvest source organization
        source_dataset = get_action("package_show")(
            context, {
                "id": harvest_object.source.id
            })
        local_org = source_dataset.get("owner_org")

        remote_orgs = self.config.get("remote_orgs", None)

        if remote_orgs not in ("only_local", "create"):
            # Assign dataset to the source organization
            package_dict["owner_org"] = local_org
        else:
            if "owner_org" not in package_dict:
                package_dict["owner_org"] = None

            # check if remote org exist locally, otherwise remove
            validated_org = None
            remote_org = package_dict["owner_org"]

            if remote_org:
                try:
                    data_dict = {"id": remote_org}
                    org = get_action("organization_show")(context,
                                                          data_dict)
                    validated_org = org["id"]
                except NotFound:
                    log.info("Organization %s is not available" %
                             remote_org)
                    if remote_orgs == "create":
                        try:
                            try:
                                org = self._get_organization(
                                    harvest_object.source.url, remote_org)
                            except RemoteResourceError:
                                # fallback if remote CKAN exposes
                                # organizations as groups; this especially
                                # targets older versions of CKAN
                                org = self._get_group(
                                    harvest_object.source.url, remote_org)
                            for key in [
                                    "packages",
                                    "created",
                                    "users",
                                    "groups",
                                    "tags",
                                    "extras",
                                    "display_name",
                                    "type",
                            ]:
                                org.pop(key, None)
                            get_action("organization_create")(context, org)
                            log.info(
                                "Organization %s has been newly created" %
                                remote_org)
                            validated_org = org["id"]
                        except (RemoteResourceError, ValidationError):
                            log.error("Could not get remote org %s" %
                                      remote_org)

            package_dict["owner_org"] = validated_org or local_org

        # Set default groups if needed
        default_groups = self.config.get("default_groups", [])
        if default_groups:
            if "groups" not in package_dict:
                package_dict["groups"] = []
            package_dict["groups"].extend([
                g for g in default_groups
                if g not in package_dict["groups"]
            ])

        # Find any extras whose values are not strings and try to convert
        # them to strings, as non-string extras are not allowed anymore in
        # CKAN 2.0.  A dataset without an "extras" key is tolerated here,
        # matching the default-extras handling further down.
        extras = package_dict.get("extras", {})
        for key in list(extras):
            if not isinstance(extras[key], str):
                try:
                    extras[key] = json.dumps(extras[key])
                except TypeError:
                    # If converting to a string fails, just delete it.
                    del extras[key]

        # Set default extras if needed
        default_extras = self.config.get("default_extras", {})
        if default_extras:
            override_extras = self.config.get("override_extras", False)
            if "extras" not in package_dict:
                package_dict["extras"] = {}
            for key, value in default_extras.items():
                if key not in package_dict["extras"] or override_extras:
                    # Look for replacement strings
                    if isinstance(value, str):
                        value = value.format(
                            harvest_source_id=harvest_object.job.source.id,
                            harvest_source_url=harvest_object.job.source.
                            url.strip("/"),
                            harvest_source_title=harvest_object.job.source.
                            title,
                            harvest_job_id=harvest_object.job.id,
                            harvest_object_id=harvest_object.id,
                            dataset_id=package_dict["id"],
                        )

                    package_dict["extras"][key] = value

        # Clear remote url_type for resources (eg datastore, upload) as we
        # are only creating normal resources with links to the remote ones
        for resource in package_dict.get("resources", []):
            resource.pop("url_type", None)

        result = self._create_or_update_package(package_dict,
                                                harvest_object)

        if result and self.config.get("read_only", False) == True:

            package = model.Package.get(package_dict["id"])

            # Clear default permissions
            model.clear_user_roles(package)

            # Setup harvest user as admin
            user_name = self.config.get("user", "harvest")
            user = model.User.get(user_name)
            model.PackageRole(package=package,
                              user=user,
                              role=model.Role.ADMIN)

            # Other users can only read
            for user_name in ("visitor", "logged_in"):
                user = model.User.get(user_name)
                model.PackageRole(package=package,
                                  user=user,
                                  role=model.Role.READER)

        # NOTE(review): True is returned even when ``result`` is falsy --
        # confirm whether ``result`` should be propagated instead.
        return True
    except ValidationError as e:
        self._save_object_error(
            "Invalid package with GUID %s: %r" %
            (harvest_object.guid, e.error_dict),
            harvest_object,
            "Import",
        )
    except Exception as e:
        self._save_object_error("%r" % e, harvest_object, "Import")

    # Reached only after an error was recorded above.
    return False
def import_stage(self, harvest_object):
    """Import one dataset from Socrata into the CKAN server.

    The harvest object's content is converted to a package dict by
    ``socrataAdaptor.convertViewXml``.  The dict is tagged with the
    catalogue URL and platform, any ``category`` key is moved into its
    extras, and the dict is stored through ``socrata_db`` (a fresh save
    when its id is not in ``ids``, otherwise a remove-and-save that keeps
    the stored ``metadata_created``).  Default tags and groups from the
    source configuration are then added and the dict is passed to
    ``_create_or_update_package``.  With the ``read_only`` option set, the
    package's roles are reset to harvest user = ADMIN and
    ``visitor``/``logged_in`` = READER.

    ``ids`` and ``socrata_db`` are defined outside this function.

    :param harvest_object: harvest object providing ``content``, ``id``,
        ``guid``, ``source`` and ``job``
    :returns: ``True`` when the dataset was processed; ``False`` when the
        object is missing, has no content, or a ``ValidationError`` was
        recorded
    """
    log.debug('In SocrataHarvester import_stage')

    if not harvest_object:
        log.error('No harvest object received')
        return False

    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    self._set_config(harvest_object.job.source.config)
    log.debug(harvest_object.job.source.config)

    try:
        d = socrataAdaptor()
        log.debug("Converting View")
        stripped_source = harvest_object.source.url.rstrip('/')
        package_dict = d.convertViewXml(harvest_object.id, stripped_source,
                                        harvest_object.content)
        package_dict['catalogue_url'] = str(stripped_source)
        package_dict['platform'] = "socrata"

        if 'category' in package_dict:
            # Move the category from the top level into the extras.
            package_dict['extras']['category'] = package_dict['category']
            del package_dict['category']
        log.debug(package_dict)

        if package_dict['id'] not in ids:
            package_dict['metadata_created'] = str(datetime.datetime.now())
            socrata_db.save(package_dict)
            log.info('Metadata saved succesfully to MongoDb.')
        else:
            # Keep the creation timestamp of the stored document and
            # replace the rest of the record.
            document = socrata_db.find_one({"id": package_dict['id']})
            package_dict['metadata_created'] = document['metadata_created']
            package_dict['metadata_updated'] = str(datetime.datetime.now())
            package_dict['updated_dataset'] = True
            socrata_db.remove({"id": package_dict['id']})
            socrata_db.save(package_dict)
            log.info('Metadata updated succesfully to MongoDb.')

        # Set default tags if needed
        default_tags = self.config.get('default_tags', [])
        if default_tags:
            if 'tags' not in package_dict:
                package_dict['tags'] = []
            package_dict['tags'].extend(
                [t for t in default_tags if t not in package_dict['tags']])

        # Set default groups if needed
        default_groups = self.config.get('default_groups', [])
        if default_groups:
            if 'groups' not in package_dict:
                package_dict['groups'] = []
            package_dict['groups'].extend(
                [g for g in default_groups
                 if g not in package_dict['groups']])

        log.debug(package_dict)
        result = self._create_or_update_package(package_dict,
                                                harvest_object)

        if result and self.config.get('read_only', False) == True:
            package = model.Package.get(package_dict['id'])

            # Clear default permissions
            model.clear_user_roles(package)

            # Setup harvest user as admin
            user_name = self.config.get('user', u'harvest')
            user = model.User.get(user_name)
            model.PackageRole(package=package, user=user,
                              role=model.Role.ADMIN)

            # Other users can only read
            for user_name in (u'visitor', u'logged_in'):
                user = model.User.get(user_name)
                model.PackageRole(package=package, user=user,
                                  role=model.Role.READER)

        return True
    except ValidationError as e:
        self._save_object_error(
            'Invalid package with GUID %s: %r' %
            (harvest_object.guid, e.error_dict), harvest_object, 'Import')
        log.debug("Validation Error: %s", harvest_object.guid)
        # Make the failure explicit instead of returning None implicitly.
        return False
def import_stage(self,harvest_object): log.debug('In NTPCHarvester import_stage') if not harvest_object: log.error('No harvest object received') return False if harvest_object.content is None: self._save_object_error('Empty content for object %s' % harvest_object.id, harvest_object, 'Import') return False self._set_config(harvest_object.job.source.config) try: package_dict = json.loads(harvest_object.content) log.debug(package_dict) log.debug('=============================================') package_dict["id"] = harvest_object.guid # Set default tags if needed default_tags = self.config.get('default_tags',[]) if default_tags: if not 'tags' in package_dict: package_dict['tags'] = [] package_dict['tags'].extend([t for t in default_tags if t not in package_dict['tags']]) remote_groups = self.config.get('remote_groups', None) log.debug(remote_groups) log.debug('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') if not remote_groups in ('only_local', 'create'): # Ignore remote groups package_dict.pop('groups', None) else: if not 'groups' in package_dict: package_dict['groups'] = [] # check if remote groups exist locally, otherwise remove validated_groups = [] context = {'model': model, 'session': Session, 'user': '******'} for group_name in package_dict['groups']: log.debug(group_name) try: data_dict = {'id': group_name} group = get_action('group_show')(context, data_dict) if self.api_version == 1: validated_groups.append(group['name']) else: validated_groups.append(group['id']) except NotFound, e: log.info('Group %s is not available' % group_name) if remote_groups == 'create': try: group = self._get_group(harvest_object.source.url, group_name) except: log.error('Could not get remote group %s' % group_name) continue for key in ['packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name']: group.pop(key, None) get_action('group_create')(context, group) log.info('Group %s has been newly created' % group_name) if self.api_version == 1: 
validated_groups.append(group['name']) else: validated_groups.append(group['id']) package_dict['groups'] = validated_groups # Ignore remote orgs for the time being package_dict.pop('owner_org', None) # Set default groups if needed default_groups = self.config.get('default_groups', []) if default_groups: package_dict['groups'].extend([g for g in default_groups if g not in package_dict['groups']]) # Find any extras whose values are not strings and try to convert # them to strings, as non-string extras are not allowed anymore in # CKAN 2.0. for key in package_dict['extras'].keys(): if not isinstance(package_dict['extras'][key], basestring): try: package_dict['extras'][key] = json.dumps( package_dict['extras'][key]) except TypeError: # If converting to a string fails, just delete it. del package_dict['extras'][key] # Set default extras if needed default_extras = self.config.get('default_extras',{}) if default_extras: override_extras = self.config.get('override_extras',False) if not 'extras' in package_dict: package_dict['extras'] = {} for key,value in default_extras.iteritems(): if not key in package_dict['extras'] or override_extras: # Look for replacement strings if isinstance(value,basestring): value = value.format(harvest_source_id=harvest_object.job.source.id, harvest_source_url=harvest_object.job.source.url.strip('/'), harvest_source_title=harvest_object.job.source.title, harvest_job_id=harvest_object.job.id, harvest_object_id=harvest_object.id, dataset_id=package_dict['id']) package_dict['extras'][key] = value log.debug('_create_or_update_package') log.debug(package_dict) log.debug(harvest_object) result = self._create_or_update_package(package_dict,harvest_object) if result and self.config.get('read_only',False) == True: package = model.Package.get(package_dict['id']) # Clear default permissions model.clear_user_roles(package) # Setup harvest user as admin user_name = self.config.get('user',u'harvest') user = model.User.get(user_name) pkg_role = 
model.PackageRole(package=package, user=user, role=model.Role.ADMIN) # Other users can only read for user_name in (u'visitor',u'logged_in'): user = model.User.get(user_name) pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER) log.debug('import_stage return true') return True
def import_stage(self, harvest_object): log.debug('In HTMLHarvester import_stage') if not harvest_object: log.error('No harvest object received') return False if harvest_object.content is None: self._save_object_error( 'Empty content for object %s' % harvest_object.id, harvest_object, 'Import') return False self._set_config(harvest_object.job.source.config) try: harvest_object.content = harvest_object.content.replace("'", '"') #package_dict=harvest_object.content package_dict = json.loads( harvest_object.content.decode('utf-8', 'ignore')) ## handle notes validation errors as existance of: " and / extrasjson = [] try: extras = package_dict['extras'] except: extras = "" j = 0 ##transformations to json's extras if 'value' in str(extras) and 'key' in str(extras): extrasjson[:] = [] extrasjson2 = "" while j < len(package_dict['extras']): extra_key = package_dict['extras'][j]['key'] extra_value = package_dict['extras'][j]['value'] if len(extra_value) > 0: c = 0 extra_value1 = "" while c < len(extra_value): extra_value1 = extra_value1 + extra_value[c] c += 1 c = 0 extra_value = extra_value1 extra = '"' + str( extra_key.encode('utf-8')) + '":' + '"' + str( extra_value.encode('utf-8')) + '"' extrasjson.append(extra) j += 1 k = 0 extrasjson1 = "" while k < len(extrasjson): extrasjson1 = extrasjson1 + extrasjson[k] + "," k += 1 k = 0 j = 0 extrasjson1 = "{" + extrasjson1.rstrip(',') + "}" try: extrasjson2 = json.loads(extrasjson1) except: errorscounter += 1 if len(extrasjson) > 0: package_dict.update({"extras": extrasjson2}) try: tags = package_dict['tags'] j = 0 if 'name' in str(tags): while j < len(package_dict['tags']): tag = package_dict['tags'][j]['name'] tagsarray.append(tag) j += 1 if len(tagsarray) > 0: package_dict.update({"tags": tagsarray}) tagsarray[:] = [] j = 0 except: pass if package_dict.get('type') == 'harvest': log.warn('Remote dataset is a harvest source, ignoring...') return True # Set default tags if needed default_tags = self.config.get('default_tags', 
[]) if default_tags: if not 'tags' in package_dict: package_dict['tags'] = [] package_dict['tags'].extend( [t for t in default_tags if t not in package_dict['tags']]) remote_groups = self.config.get('remote_groups', None) if not remote_groups in ('only_local', 'create'): # Ignore remote groups package_dict.pop('groups', None) else: if not 'groups' in package_dict: package_dict['groups'] = [] # check if remote groups exist locally, otherwise remove validated_groups = [] context = { 'model': model, 'session': Session, 'user': '******' } for group_name in package_dict['groups']: try: data_dict = {'id': group_name} group = get_action('group_show')(context, data_dict) if self.api_version == 1: validated_groups.append(group['name']) else: validated_groups.append(group['id']) except NotFound, e: log.info('Group %s is not available' % group_name) if remote_groups == 'create': try: group = self._get_group( harvest_object.source.url, group_name) except: log.error('Could not get remote group %s' % group_name) continue for key in [ 'packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name' ]: group.pop(key, None) get_action('group_create')(context, group) log.info('Group %s has been newly created' % group_name) if self.api_version == 1: validated_groups.append(group['name']) else: validated_groups.append(group['id']) package_dict['groups'] = validated_groups context = {'model': model, 'session': Session, 'user': '******'} # Local harvest source organization #source_dataset = get_action('package_show')(context, {'id': harvest_object.source.id}) #local_org = source_dataset.get('owner_org') #remote_orgs = self.config.get('remote_orgs', None) #if not remote_orgs in ('only_local', 'create'): ## Assign dataset to the source organization #package_dict['owner_org'] = local_org #else: #if not 'owner_org' in package_dict: #package_dict['owner_org'] = None ## check if remote org exist locally, otherwise remove #validated_org = None #remote_org = package_dict['owner_org'] 
#if remote_org: #try: #data_dict = {'id': remote_org} #org = get_action('organization_show')(context, data_dict) #validated_org = org['id'] #except NotFound, e: #log.info('Organization %s is not available' % remote_org) #if remote_orgs == 'create': #try: #org = self._get_group(harvest_object.source.url, remote_org) #for key in ['packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name', 'type']: #org.pop(key, None) #get_action('organization_create')(context, org) #log.info('Organization %s has been newly created' % remote_org) #validated_org = org['id'] #except: #log.error('Could not get remote org %s' % remote_org) #package_dict['owner_org'] = validated_org or local_org # Set default groups if needed default_groups = self.config.get('default_groups', []) if default_groups: package_dict['groups'].extend([ g for g in default_groups if g not in package_dict['groups'] ]) # Set default extras if needed default_extras = self.config.get('default_extras', {}) if default_extras: override_extras = self.config.get('override_extras', False) if not 'extras' in package_dict: package_dict['extras'] = {} for key, value in default_extras.iteritems(): if not key in package_dict['extras'] or override_extras: # Look for replacement strings if isinstance(value, basestring): value = value.format( harvest_source_id=harvest_object.job.source.id, harvest_source_url=harvest_object.job.source. url.strip('/'), harvest_source_title=harvest_object.job.source. 
title, harvest_job_id=harvest_object.job.id, harvest_object_id=harvest_object.id, dataset_id=package_dict['id']) package_dict['extras'][key] = value # Clear remote url_type for resources (eg datastore, upload) as we # are only creating normal resources with links to the remote ones for resource in package_dict.get('resources', []): resource.pop('url_type', None) result = self._create_or_update_package(package_dict, harvest_object) if result and self.config.get('read_only', False) == True: package = model.Package.get(package_dict['id']) # Clear default permissions model.clear_user_roles(package) # Setup harvest user as admin user_name = self.config.get('user', u'harvest') user = model.User.get(user_name) pkg_role = model.PackageRole(package=package, user=user, role=model.Role.ADMIN) # Other users can only read for user_name in (u'visitor', u'logged_in'): user = model.User.get(user_name) pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER) return True
def import_stage(self, harvest_object):
    """Import one harvested remote CKAN dataset into the local instance.

    The harvest object's ``content`` is parsed as JSON.  Default tags are
    merged in, remote groups are discarded and replaced by the configured
    default groups, and default extras are applied (string values may use
    ``str.format`` placeholders such as ``{harvest_source_id}``).  The
    dict is then passed to ``_create_or_update_package``.  With the
    ``read_only`` option set, the package's roles are reset to harvest
    user = ADMIN and ``visitor``/``logged_in`` = READER.

    :param harvest_object: harvest object providing ``content``, ``id``,
        ``guid`` and ``job``
    :returns: ``True`` when the dataset was processed; ``False`` when the
        object is missing, has no content, or a ``ValidationError`` was
        recorded
    """
    log.debug('In CKANHarvester import_stage')

    if not harvest_object:
        log.error('No harvest object received')
        return False

    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    self._set_config(harvest_object.job.source.config)

    try:
        package_dict = json.loads(harvest_object.content)

        # Set default tags if needed
        default_tags = self.config.get('default_tags', [])
        if default_tags:
            if 'tags' not in package_dict:
                package_dict['tags'] = []
            package_dict['tags'].extend(
                [t for t in default_tags if t not in package_dict['tags']])

        # Ignore remote groups for the time being.  ``pop`` with a default
        # tolerates datasets that carry no 'groups' key, where ``del``
        # would raise an uncaught KeyError.
        package_dict.pop('groups', None)

        # Set default groups if needed
        default_groups = self.config.get('default_groups', [])
        if default_groups:
            if 'groups' not in package_dict:
                package_dict['groups'] = []
            package_dict['groups'].extend([
                g for g in default_groups
                if g not in package_dict['groups']
            ])

        # Set default extras if needed
        default_extras = self.config.get('default_extras', {})
        if default_extras:
            override_extras = self.config.get('override_extras', False)
            if 'extras' not in package_dict:
                package_dict['extras'] = {}
            for key, value in default_extras.items():
                if key not in package_dict['extras'] or override_extras:
                    # Look for replacement strings
                    if isinstance(value, basestring):
                        value = value.format(
                            harvest_source_id=harvest_object.job.source.id,
                            harvest_source_url=harvest_object.job.source.
                            url.strip('/'),
                            harvest_source_title=harvest_object.job.source.
                            title,
                            harvest_job_id=harvest_object.job.id,
                            harvest_object_id=harvest_object.id,
                            dataset_id=package_dict['id'])

                    package_dict['extras'][key] = value

        result = self._create_or_update_package(package_dict,
                                                harvest_object)

        if result and self.config.get('read_only', False) == True:

            package = model.Package.get(package_dict['id'])

            # Clear default permissions
            model.clear_user_roles(package)

            # Setup harvest user as admin
            user_name = self.config.get('user', u'harvest')
            user = model.User.get(user_name)
            model.PackageRole(package=package,
                              user=user,
                              role=model.Role.ADMIN)

            # Other users can only read
            for user_name in (u'visitor', u'logged_in'):
                user = model.User.get(user_name)
                model.PackageRole(package=package,
                                  user=user,
                                  role=model.Role.READER)

    except ValidationError as e:
        self._save_object_error(
            'Invalid package with GUID %s: %r' %
            (harvest_object.guid, e.error_dict), harvest_object, 'Import')
        # Make the failure explicit instead of returning None implicitly.
        return False

    # Report success explicitly instead of falling off the end with None.
    return True
class CKANSchemingHarvester(CKANHarvester): ''' A Harvester for CKAN instances with custom scheming dataset ''' def info(self): return { 'name': 'ckan-scheming', 'title': 'CKAN-scheming', 'description': 'Harvests remote CKAN instances with ckanext-scheming', 'form_config_interface': 'Text' } def import_stage(self, harvest_object): log.debug('In CKANHarvester import_stage') context = { 'model': model, 'session': Session, 'user': self._get_user_name() } if not harvest_object: log.error('No harvest object received') return False if harvest_object.content is None: self._save_object_error( 'Empty content for object %s' % harvest_object.id, harvest_object, 'Import') return False self._set_config(harvest_object.job.source.config) try: package_dict = json.loads(harvest_object.content) if package_dict.get('type') == 'harvest': log.warn('Remote dataset is a harvest source, ignoring...') return True # Set default tags if needed default_tags = self.config.get('default_tags', []) if default_tags: if not 'tags' in package_dict: package_dict['tags'] = [] package_dict['tags'].extend( [t for t in default_tags if t not in package_dict['tags']]) remote_groups = self.config.get('remote_groups', None) if not remote_groups in ('only_local', 'create'): # Ignore remote groups package_dict.pop('groups', None) else: if not 'groups' in package_dict: package_dict['groups'] = [] # check if remote groups exist locally, otherwise remove validated_groups = [] for group_name in package_dict['groups']: try: data_dict = {'id': group_name} group = get_action('group_show')(context, data_dict) if self.api_version == 1: validated_groups.append(group['name']) else: validated_groups.append(group['id']) except NotFound, e: log.info('Group %s is not available' % group_name) if remote_groups == 'create': try: group = self._get_group( harvest_object.source.url, group_name) except RemoteResourceError: log.error('Could not get remote group %s' % group_name) continue for key in [ 'packages', 'created', 'users', 
'groups', 'tags', 'extras', 'display_name' ]: group.pop(key, None) get_action('group_create')(context, group) log.info('Group %s has been newly created' % group_name) if self.api_version == 1: validated_groups.append(group['name']) else: validated_groups.append(group['id']) package_dict['groups'] = validated_groups # Local harvest source organization source_dataset = get_action('package_show')( context, { 'id': harvest_object.source.id }) local_org = source_dataset.get('owner_org') remote_orgs = self.config.get('remote_orgs', None) if not remote_orgs in ('only_local', 'create'): # Assign dataset to the source organization package_dict['owner_org'] = local_org else: if not 'owner_org' in package_dict: package_dict['owner_org'] = None # check if remote org exist locally, otherwise remove validated_org = None remote_org = package_dict['owner_org'] if remote_org: try: data_dict = {'id': remote_org} org = get_action('organization_show')(context, data_dict) validated_org = org['id'] except NotFound, e: log.info('Organization %s is not available' % remote_org) if remote_orgs == 'create': try: try: org = self._get_organization( harvest_object.source.url, remote_org) except RemoteResourceError: # fallback if remote CKAN exposes organizations as groups # this especially targets older versions of CKAN org = self._get_group( harvest_object.source.url, remote_org) for key in [ 'packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name', 'type' ]: org.pop(key, None) get_action('organization_create')(context, org) log.info( 'Organization %s has been newly created' % remote_org) validated_org = org['id'] except (RemoteResourceError, ValidationError): log.error('Could not get remote org %s' % remote_org) package_dict['owner_org'] = validated_org or local_org # Set default groups if needed default_groups = self.config.get('default_groups', []) if default_groups: if not 'groups' in package_dict: package_dict['groups'] = [] package_dict['groups'].extend([ g for g in 
default_groups if g not in package_dict['groups'] ]) # FIXME: enable only if not using ckanext-scheming dataset schemas # handle extras in harvested schema # """ # Find any extras whose values are not strings and try to convert # them to strings, as non-string extras are not allowed anymore in # CKAN 2.0. for key in package_dict['extras'].keys(): if not isinstance(package_dict['extras'][key], basestring): try: package_dict['extras'][key] = json.dumps( package_dict['extras'][key]) except TypeError: # If converting to a string fails, just delete it. del package_dict['extras'][key] # Set default extras if needed default_extras = self.config.get('default_extras',{}) if default_extras: override_extras = self.config.get('override_extras',False) if not 'extras' in package_dict: package_dict['extras'] = {} for key,value in default_extras.iteritems(): if not key in package_dict['extras'] or override_extras: # Look for replacement strings if isinstance(value,basestring): value = value.format(harvest_source_id=harvest_object.job.source.id, harvest_source_url=harvest_object.job.source.url.strip('/'), harvest_source_title=harvest_object.job.source.title, harvest_job_id=harvest_object.job.id, harvest_object_id=harvest_object.id, dataset_id=package_dict['id']) package_dict['extras'][key] = value """ # Clear remote url_type for resources (eg datastore, upload) as we # are only creating normal resources with links to the remote ones for resource in package_dict.get('resources', []): resource.pop('url_type', None) result = self._create_or_update_package(package_dict, harvest_object) if result and self.config.get('read_only', False) == True: package = model.Package.get(package_dict['id']) # Clear default permissions model.clear_user_roles(package) # Setup harvest user as admin user_name = self.config.get('user', u'harvest') user = model.User.get(user_name) pkg_role = model.PackageRole(package=package, user=user, role=model.Role.ADMIN) # Other users can only read for user_name in 
(u'visitor', u'logged_in'): user = model.User.get(user_name) pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER) return True
def import_stage(self, harvest_object):
    """Import one dataset from Socrata into the CKAN server.

    The harvest object's content is converted to a package dict by
    ``socrataAdaptor.convertViewXml``, default tags and groups from the
    source configuration are merged in, and the dict is passed to
    ``_create_or_update_package``.  With the ``read_only`` option set, the
    package's roles are reset to harvest user = ADMIN and
    ``visitor``/``logged_in`` = READER.

    :param harvest_object: harvest object providing ``content``, ``id``,
        ``guid``, ``source`` and ``job``
    :returns: ``True`` when the dataset was processed; ``False`` when the
        object is missing, has no content, or a ``ValidationError`` was
        recorded
    """
    log.debug('In SocrataHarvester import_stage')

    if not harvest_object:
        log.error('No harvest object received')
        return False

    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    self._set_config(harvest_object.job.source.config)
    log.debug(harvest_object.job.source.config)

    try:
        d = socrataAdaptor()
        log.debug("Converting View")
        package_dict = d.convertViewXml(harvest_object.id,
                                        harvest_object.source.url.rstrip('/'),
                                        harvest_object.content)
        log.debug(package_dict)

        # Set default tags if needed
        default_tags = self.config.get('default_tags', [])
        if default_tags:
            if 'tags' not in package_dict:
                package_dict['tags'] = []
            package_dict['tags'].extend(
                [t for t in default_tags if t not in package_dict['tags']])

        # Set default groups if needed
        default_groups = self.config.get('default_groups', [])
        if default_groups:
            if 'groups' not in package_dict:
                package_dict['groups'] = []
            package_dict['groups'].extend(
                [g for g in default_groups
                 if g not in package_dict['groups']])

        log.debug(package_dict)
        result = self._create_or_update_package(package_dict,
                                                harvest_object)

        if result and self.config.get('read_only', False) == True:
            package = model.Package.get(package_dict['id'])

            # Clear default permissions
            model.clear_user_roles(package)

            # Setup harvest user as admin
            user_name = self.config.get('user', u'harvest')
            user = model.User.get(user_name)
            model.PackageRole(package=package, user=user,
                              role=model.Role.ADMIN)

            # Other users can only read
            for user_name in (u'visitor', u'logged_in'):
                user = model.User.get(user_name)
                model.PackageRole(package=package, user=user,
                                  role=model.Role.READER)

    except ValidationError as e:
        self._save_object_error(
            'Invalid package with GUID %s: %r' %
            (harvest_object.guid, e.error_dict), harvest_object, 'Import')
        # Make the failure explicit instead of returning None implicitly.
        return False

    # Report success explicitly instead of falling off the end with None.
    return True