def test_basic_query(self):
    """Search inside/outside a bbox, then confirm a deleted package
    no longer appears in the results."""
    context = {
        'model': model,
        'session': Session,
        'user': '******',
        'extras_as_string': True,
        'schema': default_create_package_schema(),
        'api_version': 2,
    }
    package_create(context, self.package_fixture_data)
    package_id = context.get('id')

    # A point that falls inside the default bbox must match the package.
    res = self.app.get(self._offset_with_bbox(), status=200)
    res_dict = self.data_from_res(res)
    assert res_dict['count'] == 1
    assert res_dict['results'][0] == package_id

    # A bbox that excludes the point must return nothing.
    res = self.app.get(self._offset_with_bbox(-10, 10, -20, 20), status=200)
    res_dict = self.data_from_res(res)
    assert res_dict['count'] == 0
    assert res_dict['results'] == []

    # Delete the package and ensure it does not come up on search results.
    package_delete(context, {'id': package_id})
    res = self.app.get(self._offset_with_bbox(), status=200)
    res_dict = self.data_from_res(res)
    assert res_dict['count'] == 0
    assert res_dict['results'] == []
def create_package_schema():
    """Return the default create-package schema extended with our
    custom form-validation fields."""
    base = default_create_package_schema()
    _schema_update(base, 'create')
    return base
def setup(self):
    """Create the user, organization and project fixtures for each test."""
    super(TestProjectBase, self).setup()
    self.user = factories.User()
    context = {
        'model': model,
        'session': model.Session,
        'user': self.user['name'],
    }

    # BUGFIX: ``org_create_context`` (carrying the group schema) was built
    # but the bare ``context`` was passed to organization_create, so the
    # custom schema was silently dropped. Pass the prepared context.
    org_create_context = context.copy()
    org_create_context['schema'] = schema.default_group_schema()
    self.organization = helpers.call_action(
        'organization_create',
        context=org_create_context,
        id='1',
        name='organization'
    )

    project_context = context.copy()
    project_context['schema'] = schema.default_create_package_schema()
    self.project = helpers.call_action(
        'package_create',
        context=project_context,
        type='project',
        id='1',
        name='test',
        title='Test',
        owner_org=self.organization['name'],
    )
def setup(self):
    """Create the user, organization and project fixtures for each test."""
    super(TestProjectBase, self).setup()
    self.user = factories.User()
    context = {
        'model': model,
        'session': model.Session,
        'user': self.user['name'],
    }

    # BUGFIX: the prepared ``org_create_context`` (which includes the group
    # schema) was never used -- ``organization_create`` received the plain
    # ``context`` instead. Use the context that was built for it.
    org_create_context = context.copy()
    org_create_context['schema'] = schema.default_group_schema()
    self.organization = helpers.call_action(
        'organization_create', context=org_create_context,
        id='1', name='organization')

    project_context = context.copy()
    project_context['schema'] = schema.default_create_package_schema()
    self.project = helpers.call_action(
        'package_create',
        context=project_context,
        type='project',
        id='1',
        name='test',
        title='Test',
        owner_org=self.organization['name'],
    )
def form_to_db_schema_options(self, options=None):
    """Return the schema for saving a dataset from the form.

    Resolution order: an explicit schema in the context wins; API calls
    fall back to CKAN's stock create/update schemas; otherwise the
    customized form schema is used, loosened for sysadmins editing
    UKLP datasets.
    """
    # BUGFIX: avoid the shared mutable-default-argument pitfall
    # (``options={}``); use a None sentinel instead.
    if options is None:
        options = {}
    context = options.get('context', {})
    schema = context.get('schema', None)
    if schema:
        return schema
    elif options.get('api'):
        if options.get('type') == 'create':
            return default_schema.default_create_package_schema()
        else:
            return default_schema.default_update_package_schema()
    schema = self.form_to_db_schema()
    # Sysadmins can save UKLP datasets with looser validation
    # constraints. This is because UKLP datasets are created using
    # a custom schema passed in from the harvester. However, when it
    # comes to re-saving the dataset via the dataset form, there are
    # some validation requirements we need to drop. That's what this
    # section of code does.
    pkg = context.get('package')
    user = context.get('user', '')
    if Authorizer().is_sysadmin(unicode(user)) and \
       pkg and pkg.extras.get('UKLP', 'False') == 'True':
        schema.update(self._uklp_sysadmin_schema_updates)
    return schema
def test_1_basic(self):
    """package_search with an ext_bbox extra returns only the dataset
    inside the bounding box."""
    context = {
        'model': model,
        'session': Session,
        'user': '******',
        'extras_as_string': True,
        'schema': default_create_package_schema(),
        'api_version': 2,
    }
    package_create(context, self.package_fixture_data_1)
    del context['package']
    package_create(context, self.package_fixture_data_2)

    query = {
        'q': 'test',
        'facet.field': ('groups', 'tags', 'res_format', 'license'),
        'rows': 20,
        'start': 0,
        'extras': {'ext_bbox': '%s,%s,%s,%s' % (10, 10, 40, 40)},
    }
    postparams = '%s=1' % json.dumps(query)
    response = self.app.post('/api/action/package_search', params=postparams)
    body = json.loads(response.body)
    result = body['result']

    # Only one dataset returned
    assert_equal(body['success'], True)
    assert_equal(result['count'], 1)
    assert_equal(result['results'][0]['name'],
                 'test-spatial-dataset-search-point-2')
def create_package_schema(self):
    """Create-package schema with a tag-vocabulary field added."""
    base = default_create_package_schema()
    base['vocab_tags'] = [ignore_missing, convert_to_tags(TEST_VOCAB_NAME)]
    return base
def form_to_db_schema_options(self, options=None):
    """Return the schema for saving a dataset from the form.

    An explicit schema in the context wins; API calls fall back to
    CKAN's stock create/update schemas; otherwise the customized form
    schema is used, loosened for sysadmins editing UKLP or ONSHUB
    datasets.
    """
    # BUGFIX: ``options={}`` was a shared mutable default argument.
    if options is None:
        options = {}
    context = options.get('context', {})
    schema = context.get('schema', None)
    if schema:
        return schema
    elif options.get('api'):
        if options.get('type') == 'create':
            return default_schema.default_create_package_schema()
        else:
            return default_schema.default_update_package_schema()
    schema = self.form_to_db_schema()
    # Sysadmins can save UKLP datasets with looser validation
    # constraints. This is because UKLP datasets are created using
    # a custom schema passed in from the harvester. However, when it
    # comes to re-saving the dataset via the dataset form, there are
    # some validation requirements we need to drop. That's what this
    # section of code does.
    pkg = context.get('package')
    user = context.get('user', '')
    # Compute the sysadmin check once instead of twice.
    is_sysadmin = Authorizer().is_sysadmin(unicode(user))
    if is_sysadmin and pkg and pkg.extras.get('UKLP', 'False') == 'True':
        schema.update(self._uklp_sysadmin_schema_updates)
    if is_sysadmin and pkg and \
       pkg.extras.get('external_reference') == 'ONSHUB':
        self._ons_sysadmin_schema_updates(schema)
    return schema
def create_package_schema(self):
    """Create-package schema extended with relationship sub-schemas."""
    base = default_create_package_schema()
    base['relationships_as_object'] = default_relationship_schema()
    base['relationships_as_subject'] = default_relationship_schema()
    return base
def datajson_create(context, data_dict):
    """Create a package from a data.json record, first creating the
    owning organization if it does not exist yet."""
    model = context['model']
    new_package = create_data_dict(data_dict)
    owner_org = model.Group.get(new_package['owner_org'])
    group_name = new_package.pop('owner_name', None)
    new_package['name'] = _slugify(new_package['title'])[:80]

    # De-duplicate the slug against any existing package.
    if model.Package.get(new_package['name']):
        new_package['name'] = new_package['name'] + '-' + \
            new_package['id'].lower()

    if not owner_org:
        org_dict = {
            'name': new_package['owner_org'],
            'title': group_name,
            'extras': [{'key': 'organization_type',
                        'value': "Federal Government"}],
        }
        p.toolkit.get_action('organization_create')(context, org_dict)

    context['schema'] = schema.default_create_package_schema()
    # The harvested record supplies its own id, so require it.
    context['schema']['id'] = [p.toolkit.get_validator('not_empty')]
    context['return_id_only'] = True
    return p.toolkit.get_action('package_create')(context, new_package)
def create_package(cls, **package_dict):
    """Create a package through the v2 API schema and return its id."""
    context = {
        'model': model,
        'session': model.Session,
        'user': '******',
        'extras_as_string': True,
        'schema': default_create_package_schema(),
        'api_version': 2,
    }
    package_create(context, package_dict)
    # package_create records the new package id in the context.
    return context.get('id')
def create_package_schema():
    """Create-package schema with pending-dataset name and title
    uniqueness checks added."""
    schema = default_create_package_schema()
    _modify_schema(schema)
    name_validators = schema['name']
    name_validators.append(no_pending_dataset_with_same_name)
    title_validators = schema['title']
    title_validators.extend([
        unique_title_within_organization,
        no_pending_dataset_with_same_title_in_same_org,
    ])
    return schema
def project_create_schema():
    """Schema for creating a cadasta project package."""
    schema = default_create_package_schema()
    overrides = {
        'id': [if_empty_generate_uuid],
        'title': [not_missing, unicode,
                  project_title_blacklist_char_validator],
        'name': [ignore_missing, unicode, slugify_title_to_name,
                 project_name_validator],
        'ona_api_key': [ignore_missing, unicode],
        # Create the remote cadasta project once validation succeeds.
        '__after': [create_cadasta_project],
    }
    schema.update(overrides)
    schema.update(project_schema())
    return schema
def package_create_validate(context, data_dict):
    """Validate a package dict against the create schema without saving.

    Returns the validated data, or raises ValidationError after rolling
    the session back.
    """
    model = context['model']
    user = context['user']  # kept for parity with package_create
    create_schema = context.get('schema') or default_create_package_schema()

    # Bind a fresh session carrying the call context.
    model.Session.remove()
    model.Session()._context = context

    check_access('package_create', context, data_dict)
    data, errors = validate(data_dict, create_schema, context)
    if not errors:
        return data
    model.Session.rollback()
    raise ValidationError(errors, package_error_summary(errors))
def package_create_validate(context, data_dict):
    """Run package-create validation only; nothing is written.

    Raises ValidationError (after a session rollback) when the data is
    invalid, otherwise returns the validated dict.
    """
    model = context['model']
    user = context['user']  # looked up for parity; not used below
    chosen = context.get('schema') or default_create_package_schema()
    model.Session.remove()
    model.Session()._context = context
    check_access('package_create', context, data_dict)
    validated, errors = validate(data_dict, chosen, context)
    if errors:
        model.Session.rollback()
        raise ValidationError(errors, package_error_summary(errors))
    return validated
def doi_create(context, data_dict):
    """Import one DOI-harvested record as a new CKAN package.

    Adds provenance extras (source hash, metadata source, harvest object
    id), maps the source organization through ORG_MAPPING, de-duplicates
    the package name, strips per-resource ids, and calls package_create
    with a schema that requires the harvested ``id``.

    NOTE(review): Python 2 code (``print`` statements).
    """
    model = context['model']
    new_package = data_dict
    # Fingerprint the raw record so later runs can detect changes.
    source_hash = hashlib.sha1(json.dumps(data_dict, sort_keys=True)).hexdigest()
    new_package["extras"].append({"key": "source_hash", "value": source_hash})
    new_package["extras"].append({"key": "metadata-source", "value": "doi"})
    new_package["extras"].append({
        "key": "source_doi_import_identifier",
        "value": True
    })
    # The owning organization must already exist locally; bail out otherwise.
    owner_org = model.Group.get(
        ORG_MAPPING.get(new_package['organization']['name']))
    if not owner_org:
        print str(
            datetime.datetime.now()) + ' Fail to import doi id ' + new_package[
            'id'] + '. Organization ' + new_package['organization'][
            'name'] + ' does not exist.'
        return
    new_package['owner_org'] = owner_org.name
    group_name = new_package.pop('owner_name', None)  # NOTE(review): unused after pop
    new_package['name'] = _slugify(new_package['title'])[:80]
    existing_package = model.Package.get(new_package['name'])
    if existing_package:
        # De-duplicate the slug with a timestamp suffix.
        new_package['name'] = new_package['name'] + '-' + str(int(time.time()))
    resources = []
    for resource in new_package['resources']:
        # Drop source-side identifiers so CKAN assigns fresh ones.
        resource.pop('resource_group_id', None)
        resource.pop('revision_id', None)
        resource.pop('id', None)
        resources.append(resource)
    new_package['resources'] = resources
    # Record a harvest object linking this import to its job.
    obj = HarvestObject(guid=uuid.uuid4().hex,
                        job=context['harvest_job'],
                        content=context['harvestobj'])
    obj.save()
    new_package["extras"].append({"key": "harvest_object_id", "value": obj.id})
    context['schema'] = schema.default_create_package_schema()
    # The harvested record supplies its own id, so it must be present.
    context['schema']['id'] = [p.toolkit.get_validator('not_empty')]
    context['return_id_only'] = True
    p.toolkit.get_action('package_create')(context, new_package)
    print str(
        datetime.datetime.now()) + ' Imported doi id ' + new_package['id']
def package_create(context, data_dict):
    """Create a package and return its dictized form.

    When ``context['preview']`` is truthy, validation runs and the package
    object is built but no revision is created and nothing is committed;
    the validated data is returned instead.

    Raises ValidationError (after a session rollback) on invalid data.

    NOTE(review): pre-auth-functions CKAN — uses ``model.Action`` based
    authorization and Python 2 (``user.decode('utf8')``).
    """
    model = context['model']
    user = context['user']
    preview = context.get('preview', False)
    schema = context.get('schema') or default_create_package_schema()
    # Bind a fresh session that carries the call context.
    model.Session.remove()
    model.Session()._context = context
    check_access(model.System(), model.Action.PACKAGE_CREATE, context)
    check_group_auth(context, data_dict)
    data, errors = validate(data_dict, schema, context)
    if errors:
        model.Session.rollback()
        raise ValidationError(errors, package_error_summary(errors))
    if not preview:
        # Open a revision so the save below is recorded.
        rev = model.repo.new_revision()
        rev.author = user
        if 'message' in context:
            rev.message = context['message']
        else:
            rev.message = _(u'REST API: Create object %s') % data.get("name")
    pkg = package_dict_save(data, context)
    admins = []
    if user:
        admins = [model.User.by_name(user.decode('utf8'))]
    if not preview:
        # Grant the creating user admin rights, notify plugins, commit.
        model.setup_default_user_roles(pkg, admins)
        for item in PluginImplementations(IPackageController):
            item.create(pkg)
        model.repo.commit()
    ## need to let rest api create and preview
    context["package"] = pkg
    ## this is added so that the rest controller can make a new location
    context["id"] = pkg.id
    log.debug('Created object %s' % str(pkg.name))
    if not preview:
        return package_dictize(pkg, context)
    else:
        return data
def package_create(self, context, data_dict):
    """Create the matching Drupal node first, then the CKAN package.

    If the CKAN create fails, the freshly created Drupal node is removed
    again via HTTP DELETE before the exception is re-raised.

    NOTE(review): Python 2 code (``urllib2``); all Drupal requests use a
    3-second timeout.
    """
    preview = context.get('preview', False)
    schema = context.get('schema') or default_create_package_schema()
    if preview:
        return
    session = context['model'].Session
    url = urlparse.urljoin(self.base_url, 'services/package.json')
    data_dict['body'] = data_dict.get('notes', '')
    ## run through validate to make sure tags are in correct place
    # NOTE(review): ``errors`` is ignored here; only the normalized
    # ``data`` (for its tags) is used.
    data, errors = validate(data_dict, schema, context)
    # Drupal expects tags as a {index: name} mapping under 'terms'.
    terms = {}
    for num, tag in enumerate(data.get('tags', [])):
        terms[str(num)] = tag['name']
    data_dict['terms'] = terms
    data = json.dumps({'data': data_dict})
    req = urllib2.Request(url, data, {'Content-type': 'application/json'})
    ##XXX think about error conditions a bit more
    f = urllib2.urlopen(req, None, 3)
    try:
        drupal_info = json.loads(f.read())
    finally:
        f.close()
    nid = drupal_info['nid']
    context['nid'] = nid
    try:
        package_create = create.package_create(context, data_dict)
    except:
        # Compensate: delete the Drupal node we just created, then
        # re-raise the original error.
        url = urlparse.urljoin(self.base_url,
                               'services/package/%s.json' % (nid))
        req = urllib2.Request(url)
        req.get_method = lambda: 'DELETE'
        f = urllib2.urlopen(req, None, 3)
        try:
            drupal_info = f.read()
        finally:
            f.close()
        raise
    package_create['nid'] = context['nid']
    package_create['revision_message'] = '%s-%s' % (
        session.revision.id, session.revision.message)
    return package_create
def package_create_validate(context, data_dict):
    """Validate a package dict for creation without saving anything.

    Returns the validated data or raises ValidationError after rolling
    the session back.
    """
    model = context['model']
    user = context['user']
    preview = context.get('preview', False)  # kept for interface parity
    chosen_schema = context.get('schema') or default_create_package_schema()

    # Bind a fresh session that carries the call context.
    model.Session.remove()
    model.Session()._context = context

    # Old-style (pre-auth-functions) authorization checks.
    check_access(model.System(), model.Action.PACKAGE_CREATE, context)
    check_group_auth(context, data_dict)

    data, errors = validate(data_dict, chosen_schema, context)
    if not errors:
        return data
    model.Session.rollback()
    raise ValidationError(errors, package_error_summary(errors))
def validate(self, context, data_dict, schema, action):
    """Pick the right schema for package create/update, then validate."""
    if action in ('package_update', 'package_create') \
            and not context.get('schema'):
        # If the caller specified a schema (e.g. harvesters specify the
        # default schema) then we don't want to override that.
        if 'api_version' in context:
            # When accessed by the API, just use the default schemas.
            # It's only the forms that are customized to make it easier
            # for humans.
            if action == 'package_create':
                schema = default_schema.default_create_package_schema()
            else:
                schema = default_schema.default_update_package_schema()
        else:
            # Customized schema for the DGU form.
            schema = self.form_to_db_schema_options(context)
    return toolkit.navl_validate(data_dict, schema, context)
def package_create(self, context, data_dict):
    """Create a Drupal node for the dataset, then the CKAN package.

    On CKAN failure the new Drupal node is deleted (HTTP DELETE) and the
    original exception is re-raised.

    NOTE(review): Python 2 code (``urllib2``); 3-second request timeout.
    """
    preview = context.get('preview', False)
    schema = context.get('schema') or default_create_package_schema()
    if preview:
        return
    session = context['model'].Session
    url = urlparse.urljoin(self.base_url, 'services/package.json')
    data_dict['body'] = data_dict.get('notes', '')
    ## run through validate to make sure tags are in correct place
    # NOTE(review): validation errors are ignored; only the normalized
    # tags from ``data`` are used.
    data, errors = validate(data_dict, schema, context)
    # Drupal expects tags as a {index: name} mapping under 'terms'.
    terms = {}
    for num, tag in enumerate(data.get('tags', [])):
        terms[str(num)] = tag['name']
    data_dict['terms'] = terms
    data = json.dumps({'data': data_dict})
    req = urllib2.Request(url, data, {'Content-type': 'application/json'})
    ##XXX think about error conditions a bit more
    f = urllib2.urlopen(req, None, 3)
    try:
        drupal_info = json.loads(f.read())
    finally:
        f.close()
    nid = drupal_info['nid']
    context['nid'] = nid
    try:
        package_create = create.package_create(context, data_dict)
    except:
        # Compensate: remove the Drupal node we just created, then
        # re-raise the original error.
        url = urlparse.urljoin(self.base_url,
                               'services/package/%s.json' % (nid))
        req = urllib2.Request(url)
        req.get_method = lambda: 'DELETE'
        f = urllib2.urlopen(req, None, 3)
        try:
            drupal_info = f.read()
        finally:
            f.close()
        raise
    package_create['nid'] = context['nid']
    package_create['revision_message'] = '%s-%s' % (
        session.revision.id, session.revision.message)
    return package_create
def datajson_create(context, data_dict):
    """Create a package from a data.json record.

    Creates the owning organization on the fly when it is not present
    locally, and requires the record's own ``id`` in the create schema.
    Returns the new package id (``return_id_only``).
    """
    model = context['model']
    new_package = create_data_dict(data_dict)
    owner_org = model.Group.get(new_package['owner_org'])
    group_name = new_package.pop('owner_name', None)
    new_package['name'] = _slugify(new_package['title'])[:80]
    existing_package = model.Package.get(new_package['name'])
    if existing_package:
        # De-duplicate the slug with the record id.
        new_package['name'] = new_package['name'] + '-' + new_package['id'].lower()
    if not owner_org:
        p.toolkit.get_action('organization_create')(
            context,
            {'name': new_package['owner_org'],
             'title': group_name,
             'extras': [{'key': 'organization_type',
                         'value': "Federal Government"}]})
    context['schema'] = schema.default_create_package_schema()
    # The harvested record supplies its own id, so it must be present.
    context['schema']['id'] = [p.toolkit.get_validator('not_empty')]
    context['return_id_only'] = True
    return p.toolkit.get_action('package_create')(context, new_package)
def test_basic_query(self):
    """Bounding-box search finds the fixture package, and deleting the
    package removes it from the results."""
    context = {
        'model': model,
        'session': Session,
        'user': '******',
        'extras_as_string': True,
        'schema': default_create_package_schema(),
        'api_version': 2,
    }
    package_create(context, self.package_fixture_data)
    package_id = context.get('id')

    def search(*bbox):
        # Helper: run a bbox search and decode the response body.
        response = self.app.get(self._offset_with_bbox(*bbox), status=200)
        return self.data_from_res(response)

    # Point inside bbox
    found = search()
    assert found['count'] == 1
    assert found['results'][0] == package_id

    # Point outside bbox
    missed = search(-10, 10, -20, 20)
    assert missed['count'] == 0
    assert missed['results'] == []

    # Deleted packages must not appear in search results.
    package_delete(context, {'id': package_id})
    gone = search()
    assert gone['count'] == 0
    assert gone['results'] == []
def package_create_schema():
    """Create-package schema with frequency / retention / coverage
    fields stored as extras."""
    def text_extra():
        # Common validator chain for optional free-text extras.
        return [ignore_missing, unicode, convert_to_extras]

    schema = default_create_package_schema()
    schema['frequency_time_modifier'] = text_extra()
    schema['frequency_count'] = [ignore_missing, convert_to_extras]
    schema['frequency_update_period'] = text_extra()
    schema['frequency_period'] = text_extra()
    # frequency is constructed from the other frequency_ fields
    schema['frequency'] = [ignore_missing]
    schema['retention_count'] = [ignore_missing, is_positive_integer,
                                 convert_to_extras]
    schema['retention_period'] = text_extra()
    schema['delivery_unit'] = text_extra()
    schema['service'] = text_extra()
    schema['next_update'] = text_extra()
    schema['review_date'] = text_extra()
    schema['coverage_start_date'] = text_extra()
    schema['coverage_end_date'] = text_extra()
    return schema
def doi_create(context, data_dict):
    """Import one DOI-harvested record as a new CKAN package.

    Adds provenance extras, maps the source organization through
    ORG_MAPPING (aborting when the organization is unknown),
    de-duplicates the package name, strips per-resource identifiers and
    creates the package with a schema that requires the harvested id.

    NOTE(review): Python 2 code (``print`` statements).
    """
    model = context['model']
    new_package = data_dict
    # Fingerprint the raw record so re-imports can detect changes.
    source_hash = hashlib.sha1(json.dumps(data_dict, sort_keys=True)).hexdigest()
    new_package["extras"].append({"key": "source_hash", "value": source_hash})
    new_package["extras"].append({"key": "metadata-source", "value": "doi"})
    new_package["extras"].append({"key": "source_doi_import_identifier", "value": True})
    # The owning organization must already exist locally; abort otherwise.
    owner_org = model.Group.get(ORG_MAPPING.get(new_package['organization']['name']))
    if not owner_org:
        print str(datetime.datetime.now()) + ' Fail to import doi id ' + new_package['id'] + '. Organization ' + new_package['organization']['name'] + ' does not exist.'
        return
    new_package['owner_org'] = owner_org.name
    group_name = new_package.pop('owner_name', None)  # NOTE(review): unused after pop
    new_package['name'] = _slugify(new_package['title'])[:80]
    existing_package = model.Package.get(new_package['name'])
    if existing_package:
        # De-duplicate the slug with a timestamp suffix.
        new_package['name'] = new_package['name'] + '-' + str(int(time.time()))
    resources = []
    for resource in new_package['resources']:
        # Drop source-side identifiers so CKAN assigns fresh ones.
        resource.pop('resource_group_id', None)
        resource.pop('revision_id', None)
        resource.pop('id', None)
        resources.append(resource)
    new_package['resources'] = resources
    # Record a harvest object linking this import to its job.
    obj = HarvestObject(
        guid=uuid.uuid4().hex,
        job=context['harvest_job'],
        content=context['harvestobj'])
    obj.save()
    new_package["extras"].append({"key": "harvest_object_id", "value": obj.id})
    context['schema'] = schema.default_create_package_schema()
    # The harvested record supplies its own id, so it must be present.
    context['schema']['id'] = [p.toolkit.get_validator('not_empty')]
    context['return_id_only'] = True
    p.toolkit.get_action('package_create')(context, new_package)
    print str(datetime.datetime.now()) + ' Imported doi id ' + new_package['id']
def test_1_basic(self):
    """Spatial package_search returns just the dataset inside the bbox."""
    context = {
        'model': model,
        'session': Session,
        'user': '******',
        'extras_as_string': True,
        'schema': default_create_package_schema(),
        'api_version': 2,
    }
    package_create(context, self.package_fixture_data_1)
    # package_create leaves the created package in the context; clear it
    # before creating the second fixture.
    del context['package']
    package_create(context, self.package_fixture_data_2)

    payload = json.dumps({
        'q': 'test',
        'facet.field': ('groups', 'tags', 'res_format', 'license'),
        'rows': 20,
        'start': 0,
        'extras': {'ext_bbox': '%s,%s,%s,%s' % (10, 10, 40, 40)},
    })
    res = self.app.post('/api/action/package_search',
                        params='%s=1' % payload)
    body = json.loads(res.body)
    assert_equal(body['success'], True)

    result = body['result']
    # Only one dataset returned
    assert_equal(result['count'], 1)
    assert_equal(result['results'][0]['name'],
                 'test-spatial-dataset-search-point-2')
def _create_or_update_package(self, package_dict, harvest_object):
    '''
    Creates a new package or updates an existing one according to the
    package dictionary provided.

    The package dictionary should look like the REST API response for a
    package:

    http://ckan.net/api/rest/package/statistics-catalunya

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    Returns True on success; returns None when the package was skipped
    (not modified) or when a ValidationError was recorded as an import
    error.
    '''
    log.debug('_create_or_update_package')
    try:
        # Change default schema: allow the remote id through and swallow
        # unrecognized junk instead of failing.
        schema = default_create_package_schema()
        schema['id'] = [ignore_missing, unicode]
        schema['__junk'] = [ignore]
        # Check API version
        if self.config:
            try:
                api_version = int(self.config.get('api_version', 2))
            except ValueError:
                raise ValueError('api_version must be an integer')
            #TODO: use site user when available
            user_name = self.config.get('user', u'harvest')
        else:
            api_version = 2
            user_name = u'harvest'
        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
            'ignore_auth': True,
        }
        # Normalize and de-duplicate tags.
        tags = package_dict.get('tags', [])
        tags = [munge_tag(t) for t in tags]
        tags = list(set(tags))
        package_dict['tags'] = tags
        # Check if package exists
        data_dict = {}
        data_dict['id'] = package_dict['id']
        try:
            existing_package_dict = get_action('package_show')(context, data_dict)
            # Check modified date; only update when the remote copy is
            # newer (or no modification date was provided).
            if not 'metadata_modified' in package_dict or \
               package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
                # Update package
                context.update({'id': package_dict['id']})
                new_package = get_action('package_update_rest')(context, package_dict)
            else:
                log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid)
                return
            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                .where(harvest_object_table.c.package_id == bindparam('b_package_id')) \
                .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])
            # Flag this as the current harvest object
            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()
        except NotFound:
            # Package needs to be created
            # Check if name has not already been used
            package_dict['name'] = self._gen_new_name(package_dict['title'])
            log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
            harvest_object.current = True
            harvest_object.package_id = package_dict['id']
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            harvest_object.add()
            model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()
            new_package = get_action('package_create_rest')(context, package_dict)
        Session.commit()
        return True
    # NOTE(review): Python 2 except syntax.
    except ValidationError, e:
        log.exception(e)
        self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict), harvest_object, 'Import')
def create_package_schema(self):
    """Build the create schema from CKAN's default, the shared schema
    definitions, and this plugin's own modifications."""
    from ckan.logic.schema import default_create_package_schema
    base = schema_defs.create_package_schema(
        default_create_package_schema())
    return self._modify_package_schema(base)
def _create_package_schema(cls):
    """ Create common schema for dataset create and update.
    Used by user interfaces and harvesters.

    Starts from CKAN's default create-package schema, removes 'author',
    and overlays the Kata metadata model: required/recommended extras,
    multi-language title/notes, agents, contacts, events, PIDs, access
    URLs and resource-level validators.
    """
    # Note: harvester schemas
    schema = default_create_package_schema()
    schema.pop('author')

    # Required Kata fields: must be present, stored as extras.
    for key in settings.KATA_FIELDS_REQUIRED:
        schema[key] = [not_empty, co.convert_to_extras_kata, unicode,
                       va.validate_general]
    # Recommended Kata fields: optional, also stored as extras.
    for key in settings.KATA_FIELDS_RECOMMENDED:
        schema[key] = [ignore_missing, co.convert_to_extras_kata, unicode,
                       va.validate_general]

    schema['accept-terms'] = [va.usage_terms_accepted, ignore]
    # Build the JSON translation strings for title/notes after validation.
    schema['__after'] = [co.gen_translation_str_from_langtitle,
                        co.gen_translation_str_from_langnotes]

    # Repeating agent sub-dicts, flattened into extras.
    schema['agent'] = {'role': [not_empty, va.check_agent_fields,
                                va.validate_general, unicode,
                                co.flattened_to_extras],
                       'name': [ignore_empty, va.validate_general, unicode,
                                va.contains_alphanumeric,
                                co.flattened_to_extras],
                       'id': [ignore_empty, va.validate_general, unicode,
                              co.flattened_to_extras],
                       'organisation': [ignore_empty, va.validate_general,
                                        unicode, va.contains_alphanumeric,
                                        co.flattened_to_extras],
                       'URL': [ignore_empty, co.remove_trailing_spaces,
                               url_validator, va.validate_general, unicode,
                               co.flattened_to_extras],
                       'fundingid': [ignore_empty, va.validate_general,
                                     unicode, co.flattened_to_extras]}
    schema['contact'] = {'name': [not_empty, va.validate_general, unicode,
                                  va.contains_alphanumeric,
                                  co.flattened_to_extras],
                         'email': [not_empty, co.remove_trailing_spaces,
                                   unicode, va.validate_email,
                                   co.flattened_to_extras],
                         'URL': [ignore_empty, co.remove_trailing_spaces,
                                 url_validator, va.validate_general,
                                 unicode, co.flattened_to_extras],
                         # phone number can be missing from the first users
                         'phone': [ignore_missing, co.remove_trailing_spaces,
                                   unicode, va.validate_phonenum,
                                   co.flattened_to_extras]}
    schema['event'] = {'type': [ignore_missing, va.check_events, unicode,
                                co.flattened_to_extras,
                                va.validate_general],
                       'who': [ignore_missing, unicode,
                               co.flattened_to_extras, va.validate_general,
                               va.contains_alphanumeric],
                       'when': [ignore_missing, unicode,
                                co.flattened_to_extras,
                                va.validate_kata_interval_date],
                       'descr': [ignore_missing, unicode,
                                 co.flattened_to_extras,
                                 va.validate_general,
                                 va.contains_alphanumeric]}
    schema['id'] = [not_empty, va.validate_package_id_format, unicode]

    # Langtitle fields are used by the UI, to construct a 'title' field with translations in JSON format
    # This is not necessarily needed for the API calls
    schema['langtitle'] = {'value': [unicode, va.validate_title,
                                     va.validate_title_duplicates,
                                     co.escape_quotes],
                           'lang': [unicode, co.convert_languages]}

    # The title field contains all the title translations in JSON format.
    # The converter gen_translation_str_from_langtitle
    # needs to be called to construct the JSON string from the UI's langtitle fields.
    schema['title'] = [va.not_empty_if_langtitle_empty]

    # Description (notes) is a multilanguage field similar to title
    schema['langnotes'] = {'value': [unicode, va.validate_notes_duplicates,
                                     co.escape_quotes],
                           'lang': [unicode, co.convert_languages]}
    schema['notes'] = [ignore_empty]

    schema['language'] = \
        [ignore_missing, co.convert_languages, co.remove_disabled_languages,
         co.convert_to_extras_kata, unicode]
    schema['license_id'] = [co.to_license_id, unicode]
    schema['temporal_coverage_begin'] = \
        [ignore_missing, va.validate_kata_date, co.convert_to_extras_kata,
         unicode]
    schema['temporal_coverage_end'] = \
        [ignore_missing, va.validate_kata_date, co.convert_to_extras_kata,
         unicode]

    # Persistent identifiers; the primary id must be present and unique.
    schema['pids'] = {'provider': [ignore_missing, unicode,
                                   co.flattened_to_extras],
                      'id': [not_empty, va.validate_general,
                             va.validate_primary_pid_uniqueness, unicode,
                             co.flattened_to_extras],
                      'type': [not_missing, co.remove_trailing_spaces,
                               va.validate_pid_type, unicode,
                               co.flattened_to_extras],
                      'relation': [ignore_missing, co.remove_trailing_spaces,
                                   co.to_relation,
                                   va.validate_pid_relation_type, unicode,
                                   co.flattened_to_extras]}

    schema['tag_string'] = [ignore_missing, not_empty,
                            va.kata_tag_string_convert]
    # otherwise the tags would be validated with default tag validator during update
    schema['tags'] = cls.tags_schema()
    schema['xpaths'] = [ignore_missing, co.to_extras_json]
    schema['version'] = [not_empty, unicode, va.validate_kata_date]
    schema['availability'] = [not_missing, va.validate_availability,
                              co.convert_to_extras_kata]
    schema['langdis'] = [co.checkbox_to_boolean, co.convert_to_extras_kata]
    # Cross-field checks run against the flattened extras / junk.
    schema['__extras'] = [va.check_agent, va.check_contact,
                          va.check_langtitle]
    schema['__junk'] = [va.check_junk]
    schema['name'] = [va.continue_if_missing, co.default_name_from_id,
                      unicode, package_name_validator, va.validate_general]
    schema['external_id'] = [ignore_missing, co.remove_trailing_spaces,
                             co.convert_external_id,
                             va.validate_external_id_uniqueness, unicode,
                             va.validate_general, co.convert_to_extras_kata]
    schema['access_application_download_URL'] = [
        ignore_missing, co.remove_trailing_spaces,
        va.validate_access_application_download_url, unicode,
        va.validate_general, co.convert_to_extras_kata]
    schema['access_application_URL'] = [
        ignore_missing, co.remove_trailing_spaces,
        va.validate_access_application_url, unicode, va.validate_general,
        co.convert_to_extras_kata]
    schema['access_request_URL'] = [
        ignore_missing, co.remove_trailing_spaces,
        va.check_access_request_url, url_validator, unicode,
        va.validate_general, co.convert_to_extras_kata]
    schema['discipline'] = [ignore_missing, va.validate_discipline,
                            co.convert_to_extras_kata, unicode]
    schema['geographic_coverage'] = [ignore_missing, va.validate_spatial,
                                     co.convert_to_extras_kata, unicode]
    schema['license_URL'] = [
        va.continue_if_missing, va.validate_license_url,
        co.populate_license_URL_if_license_id_not_resolved,
        co.convert_to_extras_kata, unicode, va.validate_general]
    schema['owner_org'] = [va.kata_owner_org_validator, unicode]

    schema['resources']['url'] = [
        default(settings.DATASET_URL_UNKNOWN),
        va.check_resource_url_for_direct_download_url, unicode,
        va.validate_general]
    # Conversion (and validation) of direct_download_URL to resource['url'] is in utils.py:dataset_to_resource()
    schema['resources']['algorithm'] = [ignore_missing, unicode,
                                        va.validate_algorithm]
    schema['resources']['format'] = [ignore_missing, unicode,
                                     va.validate_general]
    schema['resources']['hash'].append(va.validate_general)
    schema['resources']['mimetype'].append(va.validate_mimetype)

    return schema
def _create_package_schema(cls):
    """
    Create the common validation schema for dataset create and update.

    Used by both the user interfaces and the harvesters.  Starts from
    CKAN's default create-package schema, drops the stock 'author' field
    and installs Kata-specific validator/converter chains for every
    custom field.

    :returns: the modified schema dict
    """
    # Note: harvester schemas
    schema = default_create_package_schema()
    # The stock 'author' field is superseded by the 'agent' fields below.
    schema.pop('author')
    # Required Kata fields fail validation when empty ...
    for key in settings.KATA_FIELDS_REQUIRED:
        schema[key] = [not_empty, co.convert_to_extras_kata, unicode,
                       va.validate_general]
    # ... recommended ones may be missing.
    for key in settings.KATA_FIELDS_RECOMMENDED:
        schema[key] = [ignore_missing, co.convert_to_extras_kata, unicode,
                       va.validate_general]
    schema['accept-terms'] = [va.usage_terms_accepted, ignore]
    # '__after' validators run once the rest of the schema has been
    # processed; these build the JSON translation strings from the
    # flattened langtitle/langnotes fields.
    schema['__after'] = [
        co.gen_translation_str_from_langtitle,
        co.gen_translation_str_from_langnotes
    ]
    # Repeating sub-fields describing the dataset's agents (authors,
    # funders, ...); flattened into package extras.
    schema['agent'] = {
        'role': [
            not_empty, va.check_agent_fields, va.validate_general, unicode,
            co.flattened_to_extras
        ],
        'name': [
            ignore_empty, va.validate_general, unicode,
            va.contains_alphanumeric, co.flattened_to_extras
        ],
        'id': [
            ignore_empty, va.validate_general, unicode, co.flattened_to_extras
        ],
        'organisation': [
            ignore_empty, va.validate_general, unicode,
            va.contains_alphanumeric, co.flattened_to_extras
        ],
        'URL': [
            ignore_empty, co.remove_trailing_spaces, url_validator,
            va.validate_general, unicode, co.flattened_to_extras
        ],
        'fundingid': [
            ignore_empty, va.validate_general, unicode, co.flattened_to_extras
        ]
    }
    # Contact person for the dataset; flattened into package extras.
    schema['contact'] = {
        'name': [
            not_empty, va.validate_general, unicode, va.contains_alphanumeric,
            co.flattened_to_extras
        ],
        'email': [
            not_empty, co.remove_trailing_spaces, unicode, va.validate_email,
            co.flattened_to_extras
        ],
        'URL': [
            ignore_empty, co.remove_trailing_spaces, url_validator,
            va.validate_general, unicode, co.flattened_to_extras
        ],
        # phone number can be missing from the first users
        'phone': [
            ignore_missing, co.remove_trailing_spaces, unicode,
            va.validate_phonenum, co.flattened_to_extras
        ]
    }
    # Life-cycle events of the dataset; flattened into package extras.
    schema['event'] = {
        'type': [
            ignore_missing, va.check_events, unicode, co.flattened_to_extras,
            va.validate_general
        ],
        'who': [
            ignore_missing, unicode, co.flattened_to_extras,
            va.validate_general, va.contains_alphanumeric
        ],
        'when': [
            ignore_missing, unicode, co.flattened_to_extras,
            va.validate_kata_interval_date
        ],
        'descr': [
            ignore_missing, unicode, co.flattened_to_extras,
            va.validate_general, va.contains_alphanumeric
        ]
    }
    schema['id'] = [not_empty, va.validate_package_id_format, unicode]
    # Langtitle fields are used by the UI, to construct a 'title' field with translations in JSON format
    # This is not necessarily needed for the API calls
    schema['langtitle'] = {
        'value': [
            unicode, va.validate_title, va.validate_title_duplicates,
            co.escape_quotes
        ],
        'lang': [unicode, co.convert_languages]
    }
    # The title field contains all the title translations in JSON format.
    # The converter gen_translation_str_from_langtitle
    # needs to be called to construct the JSON string from the UI's langtitle fields.
    schema['title'] = [va.not_empty_if_langtitle_empty]
    # Description (notes) is a multilanguage field similar to title
    schema['langnotes'] = {
        'value': [unicode, va.validate_notes_duplicates, co.escape_quotes],
        'lang': [unicode, co.convert_languages]
    }
    schema['notes'] = [ignore_empty]
    schema['language'] = \
        [ignore_missing, co.convert_languages, co.remove_disabled_languages,
         co.convert_to_extras_kata, unicode]
    schema['license_id'] = [co.to_license_id, unicode]
    schema['temporal_coverage_begin'] = \
        [ignore_missing, va.validate_kata_date, co.convert_to_extras_kata,
         unicode]
    schema['temporal_coverage_end'] = \
        [ignore_missing, va.validate_kata_date, co.convert_to_extras_kata,
         unicode]
    # Persistent identifiers attached to the dataset; the primary PID must
    # be unique across the catalogue.
    schema['pids'] = {
        'provider': [ignore_missing, unicode, co.flattened_to_extras],
        'id': [
            not_empty, va.validate_general, va.validate_primary_pid_uniqueness,
            unicode, co.flattened_to_extras
        ],
        'type': [
            not_missing, co.remove_trailing_spaces, va.validate_pid_type,
            unicode, co.flattened_to_extras
        ],
        'relation': [
            ignore_missing, co.remove_trailing_spaces, co.to_relation,
            va.validate_pid_relation_type, unicode, co.flattened_to_extras
        ]
    }
    schema['tag_string'] = [
        ignore_missing, not_empty, va.kata_tag_string_convert
    ]
    # otherwise the tags would be validated with default tag validator
    # during update
    schema['tags'] = cls.tags_schema()
    schema['xpaths'] = [ignore_missing, co.to_extras_json]
    schema['version'] = [not_empty, unicode, va.validate_kata_date]
    schema['availability'] = [
        not_missing, va.validate_availability, co.convert_to_extras_kata
    ]
    schema['langdis'] = [co.checkbox_to_boolean, co.convert_to_extras_kata]
    # '__extras' / '__junk' catch fields not declared in the schema.
    schema['__extras'] = [
        va.check_agent, va.check_contact, va.check_langtitle
    ]
    schema['__junk'] = [va.check_junk]
    # A missing name is derived from the package id.
    schema['name'] = [
        va.continue_if_missing, co.default_name_from_id, unicode,
        package_name_validator, va.validate_general
    ]
    schema['external_id'] = [
        ignore_missing, co.remove_trailing_spaces, co.convert_external_id,
        va.validate_external_id_uniqueness, unicode, va.validate_general,
        co.convert_to_extras_kata
    ]
    schema['access_application_download_URL'] = [
        ignore_missing, co.remove_trailing_spaces,
        va.validate_access_application_download_url, unicode,
        va.validate_general, co.convert_to_extras_kata
    ]
    schema['access_application_URL'] = [
        ignore_missing, co.remove_trailing_spaces,
        va.validate_access_application_url, unicode, va.validate_general,
        co.convert_to_extras_kata
    ]
    schema['access_request_URL'] = [
        ignore_missing, co.remove_trailing_spaces,
        va.check_access_request_url, url_validator, unicode,
        va.validate_general, co.convert_to_extras_kata
    ]
    schema['discipline'] = [
        ignore_missing, va.validate_discipline, co.convert_to_extras_kata,
        unicode
    ]
    schema['geographic_coverage'] = [
        ignore_missing, va.validate_spatial, co.convert_to_extras_kata,
        unicode
    ]
    schema['license_URL'] = [
        va.continue_if_missing, va.validate_license_url,
        co.populate_license_URL_if_license_id_not_resolved,
        co.convert_to_extras_kata, unicode, va.validate_general
    ]
    schema['owner_org'] = [va.kata_owner_org_validator, unicode]
    schema['resources']['url'] = [
        default(settings.DATASET_URL_UNKNOWN),
        va.check_resource_url_for_direct_download_url, unicode,
        va.validate_general
    ]
    # Conversion (and validation) of direct_download_URL to
    # resource['url'] is in utils.py:dataset_to_resource()
    schema['resources']['algorithm'] = [
        ignore_missing, unicode, va.validate_algorithm
    ]
    schema['resources']['format'] = [
        ignore_missing, unicode, va.validate_general
    ]
    schema['resources']['hash'].append(va.validate_general)
    schema['resources']['mimetype'].append(va.validate_mimetype)
    return schema
def create_package_schema(self):
    """Return CKAN's stock package-creation schema, unmodified."""
    stock_schema = default_schema.default_create_package_schema()
    return stock_schema
def _create_or_update_package(self, package_dict, harvest_object):
    '''
    Creates a new package or updates an existing one according to the
    package dictionary provided. The package dictionary should look like
    the REST API response for a package:

    http://ckan.net/api/rest/package/statistics-catalunya

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    :param package_dict: dataset dict in REST-API form; must contain 'id'
    :param harvest_object: harvest object to flag as current and link to
        the package on success
    :returns: True on success; None when the package was skipped or a
        validation error was recorded
    '''
    try:
        # Change default schema: accept a caller-supplied id and silently
        # drop unrecognised junk keys instead of failing validation.
        schema = default_create_package_schema()
        schema['id'] = [ignore_missing, unicode]
        schema['__junk'] = [ignore]

        # Check API version
        if self.config:
            api_version = self.config.get('api_version', '2')
            #TODO: use site user when available
            user_name = self.config.get('user', u'harvest')
        else:
            api_version = '2'
            user_name = u'harvest'

        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
        }

        # Munge tags and drop empties/duplicates (set() loses order).
        tags = package_dict.get('tags', [])
        tags = [munge_tag(t) for t in tags if munge_tag(t) != '']
        tags = list(set(tags))
        package_dict['tags'] = tags

        # Check if package exists
        data_dict = {}
        data_dict['id'] = package_dict['id']
        try:
            existing_package_dict = get_action('package_show')(context,
                                                               data_dict)
            # In case name has been modified when first importing. See issue #101.
            package_dict['name'] = existing_package_dict['name']

            # Check modified date: update when no remote date is supplied
            # or the remote copy is newer than the local one.
            if not 'metadata_modified' in package_dict or \
               package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                log.info(
                    'Package with GUID %s exists and needs to be updated'
                    % harvest_object.guid)
                # Update package
                context.update({'id': package_dict['id']})
                new_package = get_action('package_update_rest')(
                    context, package_dict)
            else:
                log.info('Package with GUID %s not updated, skipping...'
                         % harvest_object.guid)
                return
        except NotFound:
            # Package needs to be created

            # Get rid of auth audit on the context otherwise we'll get an
            # exception
            context.pop('__auth_audit', None)

            # Check if name has not already been used
            package_dict['name'] = self._check_name(package_dict['name'])

            log.info(
                'Package with GUID %s does not exist, let\'s create it'
                % harvest_object.guid)
            new_package = get_action('package_create_rest')(context,
                                                            package_dict)
            harvest_object.package_id = new_package['id']

        # Flag the other objects linking to this package as not current anymore
        from ckanext.harvest.model import harvest_object_table
        conn = Session.connection()
        u = update(harvest_object_table) \
            .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
            .values(current=False)
        conn.execute(u, b_package_id=new_package['id'])
        Session.commit()

        # Flag this as the current harvest object
        harvest_object.package_id = new_package['id']
        harvest_object.current = True
        harvest_object.save()

        return True
    except ValidationError, e:
        log.exception(e)
        self._save_object_error(
            'Invalid package with GUID %s: %r' % (harvest_object.guid,
                                                  e.error_dict),
            harvest_object, 'Import')
def _create_or_update_package(self, package_dict, harvest_object):
    '''
    Creates a new package or updates an existing one according to the
    package dictionary provided. The package dictionary should look like
    the REST API response for a package:

    http://ckan.net/api/rest/package/statistics-catalunya

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    :param package_dict: dataset dict in REST-API form; must contain 'id'
    :param harvest_object: harvest object to flag as current and link to
        the package
    :returns: True on success; None when skipped or on validation error

    TODO: Not sure it is worth keeping this function. If useful it should
    use the output of package_show logic function (maybe keeping support
    for rest api based dicts)
    '''
    try:
        # Change default schema: accept a caller-supplied id and silently
        # drop unrecognised junk keys instead of failing validation.
        schema = default_create_package_schema()
        schema['id'] = [ignore_missing, unicode]
        schema['__junk'] = [ignore]

        # Check API version
        if self.config:
            try:
                api_version = int(self.config.get('api_version', 2))
            except ValueError:
                raise ValueError('api_version must be an integer')
        else:
            api_version = 2

        user_name = self._get_user_name()

        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
            'ignore_auth': True,
        }

        # Optional tag cleanup: munge and de-duplicate (order not kept).
        if self.config and self.config.get('clean_tags', False):
            tags = package_dict.get('tags', [])
            tags = [munge_tag(t) for t in tags if munge_tag(t) != '']
            tags = list(set(tags))
            package_dict['tags'] = tags

        # Check if package exists
        data_dict = {}
        data_dict['id'] = package_dict['id']
        try:
            existing_package_dict = get_action('package_show')(context,
                                                               data_dict)
            # In case name has been modified when first importing. See issue #101.
            package_dict['name'] = existing_package_dict['name']

            # Check modified date: update when no remote date is supplied
            # or the remote copy is newer than the local one.
            if not 'metadata_modified' in package_dict or \
               package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                log.info('Package with GUID %s exists and needs to be updated'
                         % harvest_object.guid)
                # Update package
                context.update({'id': package_dict['id']})
                package_dict.setdefault('name',
                                        existing_package_dict['name'])
                new_package = get_action('package_update_rest')(context,
                                                                package_dict)
            else:
                log.info('Package with GUID %s not updated, skipping...'
                         % harvest_object.guid)
                return

            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])

            # Flag this as the current harvest object
            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

        except NotFound:
            # Package needs to be created

            # Get rid of auth audit on the context otherwise we'll get an
            # exception
            context.pop('__auth_audit', None)

            # Set name for new package to prevent name conflict, see issue #117
            if package_dict.get('name', None):
                package_dict['name'] = self._gen_new_name(package_dict['name'])
            else:
                package_dict['name'] = self._gen_new_name(package_dict['title'])

            log.info('Package with GUID %s does not exist, let\'s create it'
                     % harvest_object.guid)

            harvest_object.current = True
            harvest_object.package_id = package_dict['id']
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            harvest_object.add()
            model.Session.execute(
                'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            new_package = get_action('package_create_rest')(context,
                                                            package_dict)

        Session.commit()

        return True

    except ValidationError, e:
        log.exception(e)
        self._save_object_error(
            'Invalid package with GUID %s: %r' % (harvest_object.guid,
                                                  e.error_dict),
            harvest_object, 'Import')
def _create_or_update_package(self, package_dict, harvest_object,
                              package_dict_form='rest'):
    '''
    Creates a new package or updates an existing one according to the
    package dictionary provided.

    The package dictionary can be in one of two forms:

    1. 'rest' - as seen on the RESTful API:

            http://datahub.io/api/rest/dataset/1996_population_census_data_canada

       This is the legacy form. It is the default to provide backward
       compatibility.

       * 'extras' is a dict e.g. {'theme': 'health', 'sub-theme': 'cancer'}
       * 'tags' is a list of strings e.g. ['large-river', 'flood']

    2. 'package_show' form, as provided by the Action API (CKAN v2.0+):

           http://datahub.io/api/action/package_show?id=1996_population_census_data_canada

       * 'extras' is a list of dicts e.g. [{'key': 'theme',
         'value': 'health'}, {'key': 'sub-theme', 'value': 'cancer'}]
       * 'tags' is a list of dicts e.g. [{'name': 'large-river'},
         {'name': 'flood'}]

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    :returns: The same as what import_stage should return. i.e. True if
        the create or update occurred ok, 'unchanged' if it didn't need
        updating or False if there were errors.

    TODO: Not sure it is worth keeping this function. If useful it should
    use the output of package_show logic function (maybe keeping support
    for rest api based dicts)
    '''
    assert package_dict_form in ('rest', 'package_show')
    try:
        # Change default schema: accept a caller-supplied id and silently
        # drop unrecognised junk keys instead of failing validation.
        schema = default_create_package_schema()
        schema['id'] = [ignore_missing, unicode]
        schema['__junk'] = [ignore]

        # Check API version
        if self.config:
            try:
                api_version = int(self.config.get('api_version', 2))
            except ValueError:
                raise ValueError('api_version must be an integer')
        else:
            api_version = 2

        user_name = self._get_user_name()

        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
            'ignore_auth': True,
        }

        # Optional tag cleanup: munge and de-duplicate (order not kept).
        if self.config and self.config.get('clean_tags', False):
            tags = package_dict.get('tags', [])
            tags = [munge_tag(t) for t in tags if munge_tag(t) != '']
            tags = list(set(tags))
            package_dict['tags'] = tags

        # Check if package exists
        try:
            # _find_existing_package can be overridden if necessary
            existing_package_dict = self._find_existing_package(package_dict)

            # In case name has been modified when first importing. See issue #101.
            package_dict['name'] = existing_package_dict['name']

            # Check modified date: update when no remote date is supplied
            # or the remote copy is newer than the local one.
            if not 'metadata_modified' in package_dict or \
               package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                log.info('Package with GUID %s exists and needs to be updated'
                         % harvest_object.guid)
                # Update package
                context.update({'id': package_dict['id']})
                package_dict.setdefault('name',
                                        existing_package_dict['name'])
                new_package = p.toolkit.get_action(
                    'package_update' if package_dict_form == 'package_show'
                    else 'package_update_rest')(context, package_dict)
            else:
                log.info('No changes to package with GUID %s, skipping...'
                         % harvest_object.guid)
                # NB harvest_object.current/package_id are not set
                return 'unchanged'

            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])

            # Flag this as the current harvest object
            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

        except p.toolkit.ObjectNotFound:
            # Package needs to be created

            # Get rid of auth audit on the context otherwise we'll get an
            # exception
            context.pop('__auth_audit', None)

            # Set name for new package to prevent name conflict, see issue #117
            if package_dict.get('name', None):
                package_dict['name'] = self._gen_new_name(package_dict['name'])
            else:
                package_dict['name'] = self._gen_new_name(package_dict['title'])

            log.info('Package with GUID %s does not exist, let\'s create it'
                     % harvest_object.guid)

            harvest_object.current = True
            harvest_object.package_id = package_dict['id']
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            harvest_object.add()
            model.Session.execute(
                'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            new_package = p.toolkit.get_action(
                'package_create' if package_dict_form == 'package_show'
                else 'package_create_rest')(context, package_dict)

        Session.commit()

        return True

    except p.toolkit.ValidationError, e:
        log.exception(e)
        self._save_object_error(
            'Invalid package with GUID %s: %r' % (harvest_object.guid,
                                                  e.error_dict),
            harvest_object, 'Import')
def test_theme_to_group_mapping(self):
    """
    Check that dcatapit theme-to-group mapping assigns the expected
    groups to a package on create/update:

    * no mapping configured  -> no groups assigned
    * mapping w/o auto-create -> only already-existing groups assigned
    * mapping w/ auto-create  -> missing groups are created and assigned
    * repeated updates do not produce duplicate group assignments
    """
    # multilang requires lang to be set
    # class dummyreq(object):
    #     class p(object):
    #         translator = object()
    #     environ = {'pylons.pylons': p()}
    # CKANRequest(dummyreq)
    # pylons.request = dummyreq()
    # pylons.translator.pylons_lang = ['en_GB']
    #set_lang('en_GB')
    #assert get_lang() == ['en_GB']
    assert 'dcatapit_theme_group_mapper' in config[
        'ckan.plugins'], 'No dcatapit_theme_group_mapper plugin in config'

    # Parse a sample DCAT-AP_IT RDF document into exactly one dataset dict.
    with open(get_example_file('dataset.rdf'), 'r') as f:
        contents = f.read()

    p = RDFParser(profiles=['it_dcat_ap'])

    p.parse(contents)
    datasets = [d for d in p.datasets()]
    self.assertEqual(len(datasets), 1)
    package_dict = datasets[0]

    # Create fixture user, organization and one pre-existing group.
    user = User.get('dummy')

    if not user:
        user = call_action('user_create',
                           name='dummy',
                           password='******',
                           email='*****@*****.**')
        user_name = user['name']
    else:
        user_name = user.name
    org = Group.by_name('dummy')
    if org is None:
        org = call_action('organization_create',
                          context={'user': user_name},
                          name='dummy',
                          identifier='aaaaaa')
    existing_g = Group.by_name('existing-group')
    if existing_g is None:
        existing_g = call_action('group_create',
                                 context={'user': user_name},
                                 name='existing-group')

    context = {'user': '******',
               'ignore_auth': True,
               'defer_commit': False}
    package_schema = schema.default_create_package_schema()
    context['schema'] = package_schema
    _p = {'frequency': 'manual',
          'publisher_name': 'dummy',
          'extras': [{'key': 'theme',
                      'value': ['non-mappable', 'thememap1']}],
          'groups': [],  # [{'name':existing_g.name}],
          'title': 'dummy',
          'holder_name': 'dummy',
          'holder_identifier': 'dummy',
          'name': 'dummy-' + uuid4().hex,
          'identifier': 'dummy' + uuid4().hex,
          'notes': 'dummy',
          'owner_org': 'dummy',
          'modified': datetime.now(),
          'publisher_identifier': 'dummy',
          'metadata_created': datetime.now(),
          'metadata_modified': datetime.now(),
          # BUG FIX: was str(uuid.uuid4) which stringified the *function*
          # object instead of generating a UUID.
          'guid': str(uuid.uuid4()),
          }
    package_dict.update(_p)

    # No mapping configured: groups must stay untouched.
    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = ''
    config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'false'

    package_data = call_action('package_create',
                               context=context,
                               **package_dict)

    p = Package.get(package_data['id'])

    # no groups should be assigned at this point (no map applied)
    assert {
        'theme': ['non-mappable', 'thememap1']
    } == p.extras, '{} vs {}'.format(_p['extras'], p.extras)
    assert [] == p.get_groups(
        group_type='group'), 'should be {}, got {}'.format(
            [], p.get_groups(group_type='group'))

    package_data = call_action('package_show',
                               context=context,
                               id=package_data['id'])

    # use test mapping, which replaces thememap1 to thememap2 and thememap3
    test_map_file = os.path.join(os.path.dirname(__file__), '..', '..',
                                 '..', 'examples', 'test_map.ini')

    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file
    config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'false'

    # package_dict['theme'] = ['non-mappable', 'thememap1']
    package_dict.pop('extras', None)
    p = Package.get(package_data['id'])
    context['package'] = p

    package_data = call_action('package_update',
                               context=context,
                               **package_dict)

    # check - only existing group should be assigned
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    # the map file maps ECON to existing group, and 2 other unexisting groups that will not be created
    expected_groups = ['existing-group']
    self.assertSetEqual(set(expected_groups), set(groups),
                        'Error in assigned groups')

    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file
    config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'true'

    # package_dict['theme'] = ['non-mappable', 'thememap1']
    package_data = call_action('package_update',
                               context=context,
                               **package_dict)

    meta.Session.flush()

    # recheck - this time, new groups should appear
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    # the map file maps ECON to existing group and 2 other groups that have been automatically created
    expected_groups = expected_groups + ['somegroup1', 'somegroup2']
    self.assertSetEqual(set(expected_groups), set(groups), 'Groups differ')

    # package_dict['theme'] = ['non-mappable', 'thememap1', 'thememap-multi']
    aggr = json.loads(package_dict[FIELD_THEMES_AGGREGATE])
    aggr.append({'theme': 'thememap-multi', 'subthemes': []})
    package_dict[FIELD_THEMES_AGGREGATE] = json.dumps(aggr)

    package_data = call_action('package_update',
                               context=context,
                               **package_dict)

    meta.Session.flush()

    # recheck - there should be no duplicates
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    # added theme 'thememap-multi', that maps to 'othergroup' and other already exisintg groups
    expected_groups = expected_groups + ['othergroup']
    self.assertEqual(len(expected_groups), len(groups),
                     'New groups differ - there may be duplicated groups')
    self.assertSetEqual(set(expected_groups), set(groups),
                        'New groups differ')

    package_data = call_action('package_update',
                               context=context,
                               **package_dict)

    meta.Session.flush()

    # recheck - there still should be no duplicates
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    self.assertEqual(len(expected_groups), len(groups),
                     'New groups differ - there may be duplicated groups')
    self.assertSetEqual(set(expected_groups), set(groups),
                        'New groups differ')

    meta.Session.rollback()
def create_package_schema():
    """Build the package-creation schema: CKAN's default plus our custom
    field modifications applied by ``_modify_schema``."""
    base = default_create_package_schema()
    _modify_schema(base)
    return base
def _create_or_update_package(self, package_dict, harvest_object,
                              package_dict_form='rest'):
    '''
    Creates a new package or updates an existing one according to the
    package dictionary provided.

    The package dictionary can be in one of two forms:

    1. 'rest' - as seen on the RESTful API:

            http://datahub.io/api/rest/dataset/1996_population_census_data_canada

       This is the legacy form. It is the default to provide backward
       compatibility.

       * 'extras' is a dict e.g. {'theme': 'health', 'sub-theme': 'cancer'}
       * 'tags' is a list of strings e.g. ['large-river', 'flood']

    2. 'package_show' form, as provided by the Action API (CKAN v2.0+):

           http://datahub.io/api/action/package_show?id=1996_population_census_data_canada

       * 'extras' is a list of dicts e.g. [{'key': 'theme',
         'value': 'health'}, {'key': 'sub-theme', 'value': 'cancer'}]
       * 'tags' is a list of dicts e.g. [{'name': 'large-river'},
         {'name': 'flood'}]

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    :returns: The same as what import_stage should return. i.e. True if
        the create or update occurred ok, 'unchanged' if it didn't need
        updating or False if there were errors.

    TODO: Not sure it is worth keeping this function. If useful it should
    use the output of package_show logic function (maybe keeping support
    for rest api based dicts)
    '''
    assert package_dict_form in ('rest', 'package_show')
    try:
        # Change default schema: accept a caller-supplied id and silently
        # drop unrecognised junk keys instead of failing validation.
        schema = default_create_package_schema()
        schema['id'] = [ignore_missing, unicode]
        schema['__junk'] = [ignore]

        # Check API version
        if self.config:
            try:
                api_version = int(self.config.get('api_version', 2))
            except ValueError:
                raise ValueError('api_version must be an integer')
        else:
            api_version = 2

        user_name = self._get_user_name()

        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
            'ignore_auth': True,
        }

        # Optional tag cleanup: munge and de-duplicate (order not kept).
        if self.config and self.config.get('clean_tags', False):
            tags = package_dict.get('tags', [])
            tags = [munge_tag(t) for t in tags if munge_tag(t) != '']
            tags = list(set(tags))
            package_dict['tags'] = tags

        # Check if package exists
        try:
            # _find_existing_package can be overridden if necessary
            existing_package_dict = self._find_existing_package(
                package_dict)

            # In case name has been modified when first importing. See issue #101.
            package_dict['name'] = existing_package_dict['name']

            # Check modified date: update when no remote date is supplied
            # or the remote copy is newer than the local one.
            if not 'metadata_modified' in package_dict or \
               package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                log.info(
                    'Package with GUID %s exists and needs to be updated'
                    % harvest_object.guid)
                # Update package
                context.update({'id': package_dict['id']})
                package_dict.setdefault('name',
                                        existing_package_dict['name'])

                new_package = p.toolkit.get_action(
                    'package_update' if package_dict_form == 'package_show'
                    else 'package_update_rest')(
                        context, package_dict)

            else:
                log.info(
                    'No changes to package with GUID %s, skipping...'
                    % harvest_object.guid)
                # NB harvest_object.current/package_id are not set
                return 'unchanged'

            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])

            # Flag this as the current harvest object
            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

        except p.toolkit.ObjectNotFound:
            # Package needs to be created

            # Get rid of auth audit on the context otherwise we'll get an
            # exception
            context.pop('__auth_audit', None)

            # Set name for new package to prevent name conflict, see issue #117
            if package_dict.get('name', None):
                package_dict['name'] = self._gen_new_name(
                    package_dict['name'])
            else:
                package_dict['name'] = self._gen_new_name(
                    package_dict['title'])

            log.info(
                'Package with GUID %s does not exist, let\'s create it'
                % harvest_object.guid)

            harvest_object.current = True
            harvest_object.package_id = package_dict['id']
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            harvest_object.add()

            model.Session.execute(
                'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            new_package = p.toolkit.get_action(
                'package_create' if package_dict_form == 'package_show'
                else 'package_create_rest')(context, package_dict)

        Session.commit()

        return True

    except p.toolkit.ValidationError, e:
        log.exception(e)
        self._save_object_error(
            'Invalid package with GUID %s: %r' %
            (harvest_object.guid, e.error_dict),
            harvest_object, 'Import')
def _create_or_update_package(self, package_dict, harvest_object):
    '''
    Creates a new package or updates an existing one according to the
    package dictionary provided. The package dictionary should look like
    the REST API response for a package:

    http://ckan.net/api/rest/package/statistics-catalunya

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    :returns: The same as what import_stage should return. i.e. True if
        the create or update occurred ok, 'unchanged' if it didn't need
        updating or False if there were errors.

    TODO: Not sure it is worth keeping this function. If useful it should
    use the output of package_show logic function (maybe keeping support
    for rest api based dicts)
    '''
    try:
        # Change default schema: accept a caller-supplied id and silently
        # drop unrecognised junk keys instead of failing validation.
        schema = default_create_package_schema()
        schema['id'] = [ignore_missing, unicode]
        schema['__junk'] = [ignore]

        # Check API version
        if self.config:
            try:
                api_version = int(self.config.get('api_version', 2))
            except ValueError:
                raise ValueError('api_version must be an integer')
        else:
            api_version = 2

        user_name = self._get_user_name()

        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
            'ignore_auth': True,
        }

        # Optional tag cleanup: munge and de-duplicate (order not kept).
        if self.config and self.config.get('clean_tags', False):
            tags = package_dict.get('tags', [])
            tags = [munge_tag(t) for t in tags if munge_tag(t) != '']
            tags = list(set(tags))
            package_dict['tags'] = tags

        # Check if package exists
        try:
            existing_package_dict = self._find_existing_package(package_dict)

            # In case name has been modified when first importing. See issue #101.
            package_dict['name'] = existing_package_dict['name']

            # Check modified date: update when no remote date is supplied
            # or the remote copy is newer than the local one.
            if not 'metadata_modified' in package_dict or \
               package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                log.info('Package with GUID %s exists and needs to be updated'
                         % harvest_object.guid)
                # Update package
                context.update({'id': package_dict['id']})
                package_dict.setdefault('name',
                                        existing_package_dict['name'])
                new_package = get_action('package_update_rest')(context,
                                                                package_dict)
            else:
                log.info('Package with GUID %s not updated, skipping...'
                         % harvest_object.guid)
                # NB harvest_object.current/package_id are not set
                return 'unchanged'

            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])

            # Flag this as the current harvest object
            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

        except NotFound:
            # Package needs to be created

            # Get rid of auth audit on the context otherwise we'll get an
            # exception
            context.pop('__auth_audit', None)

            # Set name for new package to prevent name conflict, see issue #117
            if package_dict.get('name', None):
                package_dict['name'] = self._gen_new_name(package_dict['name'])
            else:
                package_dict['name'] = self._gen_new_name(package_dict['title'])

            log.info('Package with GUID %s does not exist, let\'s create it'
                     % harvest_object.guid)

            harvest_object.current = True
            harvest_object.package_id = package_dict['id']
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            harvest_object.add()
            model.Session.execute(
                'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            new_package = get_action('package_create_rest')(context,
                                                            package_dict)

        Session.commit()

        return True

    except ValidationError, e:
        log.exception(e)
        self._save_object_error(
            'Invalid package with GUID %s: %r' % (harvest_object.guid,
                                                  e.error_dict),
            harvest_object, 'Import')
def _create_or_update_package(self, package_dict, harvest_object):
    '''
    Creates a new package or updates an exisiting one according to the
    package dictionary provided. The package dictionary should look like
    the REST API response for a package:

    http://ckan.net/api/rest/package/statistics-catalunya

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    TODO: Not sure it is worth keeping this function. If useful it
    should use the output of package_show logic function (maybe keeping
    support for rest api based dicts
    '''
    try:
        # Change default schema: accept a caller-supplied id and ignore
        # any unexpected keys instead of rejecting the dict.
        schema = default_create_package_schema()
        schema['id'] = [ignore_missing, unicode]
        schema['__junk'] = [ignore]

        # Check API version (harvest source config may override it)
        if self.config:
            try:
                api_version = int(self.config.get('api_version', 2))
            except ValueError:
                raise ValueError('api_version must be an integer')
            #TODO: use site user when available
            user_name = self.config.get('user', self._get_user_name())
        else:
            api_version = 2
            user_name = self._get_user_name()

        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
            'ignore_auth': True,
        }

        tags = package_dict.get('tags', [])
        package_dict['tags'] = tags

        # Check if package exists
        data_dict = {}
        data_dict['id'] = package_dict['id']
        try:
            existing_package_dict = get_action('package_show')(context, data_dict)
            # NOTE(review): when the package already exists this variant
            # performs no update at all — it only logs, then falls
            # through to Session.commit() and returns True.
            log.info('Package with GUID %s not updated, skipping...'
                     % harvest_object.guid)
        except NotFound:
            # Package needs to be created

            # Check if name has not already been used
            package_dict['name'] = self._gen_new_name(
                package_dict['title'])

            log.info(
                'Package with GUID %s does not exist, let\'s create it'
                % harvest_object.guid)

            harvest_object.current = True
            harvest_object.package_id = package_dict['id']
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            harvest_object.add()

            model.Session.execute(
                'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            new_package = get_action('package_create_rest')(context,
                                                            package_dict)

        Session.commit()

        return True

    # Python 2 syntax; validation failures are recorded as harvest
    # object errors rather than propagated.
    except ValidationError, e:
        log.exception(e)
        self._save_object_error(
            'Invalid package with GUID %s: %r' % (harvest_object.guid,
                                                  e.error_dict),
            harvest_object, 'Import')
def test_1_package_schema(self):
    # Dictize a known fixture dataset, rename it, and check it validates
    # cleanly against the default create schema; then exercise the
    # duplicate-URL and bad-name validation errors.
    pkg = (
        model.Session.query(model.Package)
        .filter_by(name="annakarenina")
        .first()
    )
    package_id = pkg.id
    result = package_dictize(pkg, self.context)
    self.remove_changable_columns(result)
    result["name"] = "anna2"
    # we need to remove these as they have been added
    del result["relationships_as_object"]
    del result["relationships_as_subject"]
    converted_data, errors = validate(
        result, default_create_package_schema(), self.context
    )
    # Expected output of validation: the fixture's metadata with the
    # new name substituted.
    expected_data = {
        "extras": [
            {"key": u"genre", "value": u"romantic novel"},
            {"key": u"original media", "value": u"book"},
        ],
        "groups": [
            {u"name": u"david", u"title": u"Dave's books"},
            {u"name": u"roger", u"title": u"Roger's books"},
        ],
        "license_id": u"other-open",
        "name": u"anna2",
        "type": u"dataset",
        "notes": u"Some test notes\n\n### A 3rd level heading\n\n**Some bolded text.**\n\n*Some italicized text.*\n\nForeign characters:\nu with umlaut \xfc\n66-style quote \u201c\nforeign word: th\xfcmb\n\nNeeds escaping:\nleft arrow <\n\n<http://ckan.net/>\n\n",
        "private": False,
        "resources": [
            {
                "alt_url": u"alt123",
                "description": u'Full text. Needs escaping: " Umlaut: \xfc',
                "format": u"plain text",
                "hash": u"abc123",
                "size_extra": u"123",
                "url": u"http://datahub.io/download/x=1&y=2",
            },
            {
                "alt_url": u"alt345",
                "description": u"Index of the novel",
                "format": u"JSON",
                "hash": u"def456",
                "size_extra": u"345",
                "url": u"http://datahub.io/index.json",
            },
        ],
        "tags": [
            {"name": u"Flexible \u30a1"},
            {"name": u"russian"},
            {"name": u"tolstoy"},
        ],
        "title": u"A Novel By Tolstoy",
        "url": u"http://datahub.io",
        "version": u"0.7a",
    }

    assert converted_data == expected_data, pformat(converted_data)
    assert not errors, errors

    # Reusing the existing fixture name must fail the uniqueness check.
    data = converted_data
    data["name"] = u"annakarenina"
    data.pop("title")
    data["resources"][0]["url"] = "fsdfafasfsaf"
    data["resources"][1].pop("url")

    converted_data, errors = validate(
        data, default_create_package_schema(), self.context
    )

    assert errors == {"name": [u"That URL is already in use."]}, pformat(
        errors
    )

    # With the package's own id set, the update schema should reject
    # only the invalid (non-lowercase-ascii) name.
    data["id"] = package_id
    data["name"] = "????jfaiofjioafjij"

    converted_data, errors = validate(
        data, default_update_package_schema(), self.context
    )

    assert errors == {
        "name": [
            u"Must be purely lowercase alphanumeric (ascii) "
            "characters and these symbols: -_"
        ]
    }, pformat(errors)
def _create_or_update_package(self, package_dict, harvest_object):
    '''
    Creates a new package or updates an exisiting one according to the
    package dictionary provided. The package dictionary should look like
    the REST API response for a package:

    http://ckan.net/api/rest/package/statistics-catalunya

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].
    '''
    try:
        # Change default schema: accept a caller-supplied id and ignore
        # unexpected keys instead of rejecting the dict.
        schema = default_create_package_schema()
        schema['id'] = [ignore_missing, unicode]
        schema['__junk'] = [ignore]

        # Check API version (harvest source config may override it)
        if self.config:
            api_version = self.config.get('api_version','2')
            #TODO: use site user when available
            user_name = self.config.get('user',u'harvest')
        else:
            api_version = '2'
            user_name = u'harvest'

        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
        }

        # Normalise tags: munge each one, drop empties, de-duplicate.
        tags = package_dict.get('tags', [])
        tags = [munge_tag(t) for t in tags if munge_tag(t) != '']
        tags = list(set(tags))
        package_dict['tags'] = tags

        # Check if package exists
        data_dict = {}
        data_dict['id'] = package_dict['id']
        try:
            existing_package_dict = get_action('package_show')(context, data_dict)

            # In case name has been modified when first importing. See issue #101.
            package_dict['name'] = existing_package_dict['name']

            # Check modified date: update when the remote copy is newer
            # or when no modification date is provided at all.
            if not 'metadata_modified' in package_dict or \
               package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
                # Update package
                context.update({'id':package_dict['id']})
                new_package = get_action('package_update_rest')(context, package_dict)

            else:
                # NOTE(review): bare `return` (None) here, while the
                # success paths return True.
                log.info('Package with GUID %s not updated, skipping...'
                         % harvest_object.guid)
                return

        except NotFound:
            # Package needs to be created

            # Get rid of auth audit on the context otherwise we'll get an
            # exception
            context.pop('__auth_audit', None)

            # Check if name has not already been used
            package_dict['name'] = self._check_name(package_dict['name'])

            log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
            new_package = get_action('package_create_rest')(context, package_dict)
            harvest_object.package_id = new_package['id']

        # Flag the other objects linking to this package as not current anymore
        from ckanext.harvest.model import harvest_object_table
        conn = Session.connection()
        u = update(harvest_object_table) \
            .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
            .values(current=False)
        conn.execute(u, b_package_id=new_package['id'])
        Session.commit()

        # Flag this as the current harvest object
        harvest_object.package_id = new_package['id']
        harvest_object.current = True
        harvest_object.save()

        return True

    # Python 2 syntax; validation failures are recorded as harvest
    # object errors rather than propagated.
    except ValidationError,e:
        log.exception(e)
        self._save_object_error('Invalid package with GUID %s: %r'%(harvest_object.guid,e.error_dict),harvest_object,'Import')
def test_package_schema(self):
    # Build a dataset from factories, dictize it, and check it validates
    # round-trip against the default create schema; then exercise the
    # duplicate-URL and bad-name validation errors.
    group1 = factories.Group(title="Dave's books")
    group2 = factories.Group(title="Roger's books")
    first_name = factories.Dataset.stub().name
    second_name = factories.Dataset.stub().name
    expected_data = {
        "extras": [
            {"key": u"genre", "value": u"romantic novel"},
            {"key": u"original media", "value": u"book"},
        ],
        "groups": [
            {u"name": group1["name"], u"title": group1["title"]},
            {u"name": group2["name"], u"title": group2["title"]},
        ],
        "license_id": u"other-open",
        "name": first_name,
        "type": u"dataset",
        "notes": u"Some test notes\n\n### A 3rd level heading\n\n**Some bolded text.**\n\n*Some italicized text.*\n\nForeign characters:\nu with umlaut \xfc\n66-style quote \u201c\nforeign word: th\xfcmb\n\nNeeds escaping:\nleft arrow <\n\n<http://ckan.net/>\n\n",
        "private": False,
        "resources": [
            {
                "alt_url": u"alt123",
                "description": u'Full text. Needs escaping: " Umlaut: \xfc',
                "format": u"plain text",
                "hash": u"abc123",
                "size_extra": u"123",
                "url": u"http://datahub.io/download/x=1&y=2",
            },
            {
                "alt_url": u"alt345",
                "description": u"Index of the novel",
                "format": u"JSON",
                "hash": u"def456",
                "size_extra": u"345",
                "url": u"http://datahub.io/index.json",
            },
        ],
        # Tags sorted by name so the comparison is order-insensitive.
        "tags": sorted([
            {"name": factories.Tag.stub().name},
            {"name": factories.Tag.stub().name},
            {"name": factories.Tag.stub().name},
        ], key=operator.itemgetter("name")),
        "title": u"A Novel By Tolstoy",
        "url": u"http://datahub.io",
        "version": u"0.7a",
        "relationships_as_subject": [],
        "relationships_as_object": [],
    }
    context = {"model": model, "session": model.Session}
    pkg = factories.Dataset.model(**expected_data)
    package_id = pkg.id
    result = package_dictize(pkg, context)
    self.remove_changable_columns(result)
    result["name"] = second_name
    expected_data["name"] = second_name
    converted_data, errors = validate(
        result, default_create_package_schema(), context
    )

    assert converted_data == expected_data, pformat(converted_data)
    assert not errors, errors

    # Reusing the first dataset's name must fail the uniqueness check.
    data = converted_data
    data["name"] = first_name
    data.pop("title")
    data["resources"][0]["url"] = "fsdfafasfsaf"
    data["resources"][1].pop("url")

    converted_data, errors = validate(
        data, default_create_package_schema(), context
    )

    assert errors == {"name": [u"That URL is already in use."]}, pformat(
        errors
    )

    # With the package's own id set, the update schema should reject
    # only the invalid (non-lowercase-ascii) name.
    data["id"] = package_id
    data["name"] = "????jfaiofjioafjij"

    converted_data, errors = validate(
        data, default_update_package_schema(), context
    )

    assert errors == {
        "name": [
            u"Must be purely lowercase alphanumeric (ascii) "
            "characters and these symbols: -_"
        ]
    }, pformat(errors)
def test_1_package_schema(self):
    # Dictize a known fixture dataset, rename it, and check it validates
    # cleanly against the default create schema; then exercise the
    # duplicate-URL, missing-resource-url and bad-name validation errors.
    pkg = model.Session.query(model.Package)\
        .filter_by(name='annakarenina')\
        .first()
    package_id = pkg.id
    result = package_dictize(pkg, self.context)
    self.remove_changable_columns(result)
    result['name'] = 'anna2'
    # we need to remove these as they have been added
    del result['relationships_as_object']
    del result['relationships_as_subject']
    converted_data, errors = validate(result,
                                      default_create_package_schema(),
                                      self.context)
    # Expected output of validation: the fixture's metadata with the
    # new name substituted.
    expected_data = {
        'extras': [{'key': u'genre', 'value': u'romantic novel'},
                   {'key': u'original media', 'value': u'book'}],
        'groups': [{u'name': u'david', u'title': u"Dave's books"},
                   {u'name': u'roger', u'title': u"Roger's books"}],
        'license_id': u'other-open',
        'name': u'anna2',
        'type': u'dataset',
        'notes': u'Some test notes\n\n### A 3rd level heading\n\n**Some bolded text.**\n\n*Some italicized text.*\n\nForeign characters:\nu with umlaut \xfc\n66-style quote \u201c\nforeign word: th\xfcmb\n\nNeeds escaping:\nleft arrow <\n\n<http://ckan.net/>\n\n',
        'private': False,
        'resources': [{'alt_url': u'alt123',
                       'description': u'Full text. Needs escaping: " Umlaut: \xfc',
                       'format': u'plain text',
                       'hash': u'abc123',
                       'size_extra': u'123',
                       'url': u'http://datahub.io/download/x=1&y=2'},
                      {'alt_url': u'alt345',
                       'description': u'Index of the novel',
                       'format': u'JSON',
                       'hash': u'def456',
                       'size_extra': u'345',
                       'url': u'http://datahub.io/index.json'}],
        'tags': [{'name': u'Flexible \u30a1'},
                 {'name': u'russian'},
                 {'name': u'tolstoy'}],
        'title': u'A Novel By Tolstoy',
        'url': u'http://datahub.io',
        'version': u'0.7a'
    }

    assert converted_data == expected_data, pformat(converted_data)
    assert not errors, errors

    # Reusing the existing fixture name and dropping a resource url
    # must produce both validation errors.
    data = converted_data
    data['name'] = u'annakarenina'
    data.pop("title")
    data["resources"][0]["url"] = 'fsdfafasfsaf'
    data["resources"][1].pop("url")

    converted_data, errors = validate(data,
                                      default_create_package_schema(),
                                      self.context)

    assert errors == {
        'name': [u'That URL is already in use.'],
        'resources': [{}, {'url': [u'Missing value']}]
    }, pformat(errors)

    # With the package's own id set, the name clash disappears but the
    # missing resource url remains.
    data["id"] = package_id

    converted_data, errors = validate(data,
                                      default_update_package_schema(),
                                      self.context)

    assert errors == {
        'resources': [{}, {'url': [u'Missing value']}]
    }, pformat(errors)

    # An invalid (non-lowercase-ascii) name adds a name error on top.
    data['name'] = '????jfaiofjioafjij'

    converted_data, errors = validate(data,
                                      default_update_package_schema(),
                                      self.context)

    assert errors == {
        'name': [u'Must be purely lowercase alphanumeric (ascii) '
                 'characters and these symbols: -_'],
        'resources': [{}, {'url': [u'Missing value']}]
    }, pformat(errors)
# Resource-level fields exposed by this schema module.
RESOURCE_FIELDS = [
    'name',
    'resource_type',
    'url',
    'size',
    'format',
    'language',
]

# Resource fields already provided by CKAN's default resource schema.
EXISTING_RESOURCE_FIELDS = set(default_resource_schema())

# Resource fields that carry bilingual (translated) values.
BILINGUAL_RESOURCE_FIELDS = set([
    'name',
])

# Dataset fields already covered by CKAN's default create schema,
# plus the spatial extra.
EXISTING_FIELDS = set(default_create_package_schema()
    ) | set(['spatial'])

# The field order here must match the proposed schema spreadsheet
# NOTE(review): the field-name string below is truncated at this chunk
# boundary; the closing quotes live outside the visible source.
ProposedField = namedtuple("ProposedField", """
    class_
    sub_class
    property_name
    property_label
    iso_multiplicity
    property_name_fra
    property_label_fra
    gc_multiplicity
    type_
    ckan_type
    description
# Resource-level fields exposed by this schema module.
RESOURCE_FIELDS = [
    'name',
    'resource_type',
    'url',
    'size',
    'format',
    'language',
]

# Resource fields already provided by CKAN's default resource schema.
EXISTING_RESOURCE_FIELDS = set(default_resource_schema())

# Resource fields that carry bilingual (translated) values.
BILINGUAL_RESOURCE_FIELDS = set([
    'name',
])

# Dataset fields already covered by CKAN's default create schema,
# plus the spatial extra.
EXISTING_FIELDS = set(default_create_package_schema()) | set(['spatial'])

# The field order here must match the proposed schema spreadsheet
# NOTE(review): the field-name string below is truncated at this chunk
# boundary; the closing quotes live outside the visible source.
ProposedField = namedtuple(
    "ProposedField", """
    class_
    sub_class
    property_name
    property_label
    iso_multiplicity
    property_name_fra
    property_label_fra
    gc_multiplicity
    type_
    ckan_type
    description
def test_1_package_schema(self):
    # Dictize a known fixture dataset, rename it, and check it validates
    # cleanly against the default create schema; then exercise the
    # duplicate-URL, missing-resource-url and bad-name validation errors.
    pkg = model.Session.query(model.Package)\
        .filter_by(name='annakarenina')\
        .first()
    package_id = pkg.id
    result = package_dictize(pkg, self.context)
    self.remove_changable_columns(result)
    result['name'] = 'anna2'
    # we need to remove these as they have been added
    del result['relationships_as_object']
    del result['relationships_as_subject']
    converted_data, errors = validate(result,
                                      default_create_package_schema(),
                                      self.context)
    # Expected output of validation: the fixture's metadata with the
    # new name substituted.
    expected_data = {
        'extras': [{
            'key': u'genre',
            'value': u'romantic novel'
        }, {
            'key': u'original media',
            'value': u'book'
        }],
        'groups': [{
            u'name': u'david',
            u'title': u"Dave's books"
        }, {
            u'name': u'roger',
            u'title': u"Roger's books"
        }],
        'license_id': u'other-open',
        'name': u'anna2',
        'type': u'dataset',
        'notes': u'Some test notes\n\n### A 3rd level heading\n\n**Some bolded text.**\n\n*Some italicized text.*\n\nForeign characters:\nu with umlaut \xfc\n66-style quote \u201c\nforeign word: th\xfcmb\n\nNeeds escaping:\nleft arrow <\n\n<http://ckan.net/>\n\n',
        'private': False,
        'resources': [{
            'alt_url': u'alt123',
            'description': u'Full text. Needs escaping: " Umlaut: \xfc',
            'format': u'plain text',
            'hash': u'abc123',
            'size_extra': u'123',
            'url': u'http://www.annakarenina.com/download/x=1&y=2'
        }, {
            'alt_url': u'alt345',
            'description': u'Index of the novel',
            'format': u'JSON',
            'hash': u'def456',
            'size_extra': u'345',
            'url': u'http://www.annakarenina.com/index.json'
        }],
        'tags': [{
            'name': u'Flexible \u30a1'
        }, {
            'name': u'russian'
        }, {
            'name': u'tolstoy'
        }],
        'title': u'A Novel By Tolstoy',
        'url': u'http://www.annakarenina.com',
        'version': u'0.7a'
    }

    assert converted_data == expected_data, pformat(converted_data)
    assert not errors, errors

    # Reusing the existing fixture name and dropping a resource url
    # must produce both validation errors.
    data = converted_data
    data['name'] = u'annakarenina'
    data.pop("title")
    data["resources"][0]["url"] = 'fsdfafasfsaf'
    data["resources"][1].pop("url")

    converted_data, errors = validate(data,
                                      default_create_package_schema(),
                                      self.context)

    assert errors == {
        'name': [u'That URL is already in use.'],
        'resources': [{}, {
            'url': [u'Missing value']
        }]
    }, pformat(errors)

    # With the package's own id set, the name clash disappears but the
    # missing resource url remains.
    data["id"] = package_id

    converted_data, errors = validate(data,
                                      default_update_package_schema(),
                                      self.context)

    assert errors == {
        'resources': [{}, {
            'url': [u'Missing value']
        }]
    }, pformat(errors)

    # An invalid (non-lowercase-ascii) name adds a name error on top.
    data['name'] = '????jfaiofjioafjij'

    converted_data, errors = validate(data,
                                      default_update_package_schema(),
                                      self.context)

    assert errors == {
        'name': [
            u'Url must be purely lowercase alphanumeric (ascii) '
            'characters and these symbols: -_'
        ],
        'resources': [{}, {
            'url': [u'Missing value']
        }]
    }, pformat(errors)
def test_mapping(self):
    # End-to-end test of the dcatapit theme-to-group mapper: parse an RDF
    # dataset, create/update it under different mapping configurations,
    # and check which CKAN groups get assigned.

    # multilang requires lang to be set
    from pylons.i18n.translation import set_lang, get_lang
    import pylons

    # Minimal stand-in for a pylons request so set_lang() can run
    # outside a real request cycle.
    class dummyreq(object):
        class p(object):
            translator = object()
        environ = {'pylons.pylons': p()}

    pylons.request = dummyreq()
    pylons.translator.pylons_lang = ['en_GB']
    set_lang('en_GB')
    assert get_lang() == ['en_GB']

    assert 'dcatapit_theme_group_mapper' in config['ckan.plugins'], "No dcatapit_theme_group_mapper plugin in config"

    contents = self._get_file_contents('dataset.rdf')

    p = RDFParser(profiles=['it_dcat_ap'])
    p.parse(contents)
    datasets = [d for d in p.datasets()]
    eq_(len(datasets), 1)
    package_dict = datasets[0]

    # Fixture user / organization / pre-existing group.
    user = User.get('dummy')

    if not user:
        user = call_action('user_create',
                           name='dummy',
                           password='******',
                           email='*****@*****.**')
        user_name = user['name']
    else:
        user_name = user.name

    org = Group.by_name('dummy')
    if org is None:
        org = call_action('organization_create',
                          context={'user': user_name},
                          name='dummy',
                          identifier='aaaaaa')
    existing_g = Group.by_name('existing-group')
    if existing_g is None:
        existing_g = call_action('group_create',
                                 context={'user': user_name},
                                 name='existing-group')

    context = {'user': '******',
               'ignore_auth': True,
               'defer_commit': False}
    package_schema = schema.default_create_package_schema()
    context['schema'] = package_schema

    # Base metadata merged over the parsed RDF dataset.
    # NOTE(review): uuid.uuid4 is not called here — 'guid' becomes the
    # unicode repr of the function object, not a fresh UUID; presumably
    # uuid.uuid4() was intended.
    _p = {'frequency': 'manual',
          'publisher_name': 'dummy',
          'extras': [{'key':'theme', 'value':['non-mappable', 'thememap1']}],
          'groups': [],
          'title': 'dummy',
          'holder_name': 'dummy',
          'holder_identifier': 'dummy',
          'name': 'dummy',
          'notes': 'dummy',
          'owner_org': 'dummy',
          'modified': datetime.now(),
          'publisher_identifier': 'dummy',
          'metadata_created' : datetime.now(),
          'metadata_modified': datetime.now(),
          'guid': unicode(uuid.uuid4),
          'identifier': 'dummy'}

    package_dict.update(_p)

    # With no mapping source configured, creation must not assign groups.
    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = ''

    package_data = call_action('package_create', context=context, **package_dict)

    p = Package.get(package_data['id'])

    # no groups should be assigned at this point (no map applied)
    assert {'theme': ['non-mappable', 'thememap1']} == p.extras, '{} vs {}'.format(_p['extras'], p.extras)
    assert [] == p.get_groups(group_type='group'), 'should be {}, got {}'.format([], p.get_groups(group_type='group'))

    package_data = call_action('package_show', context=context, id=package_data['id'])

    # use test mapping, which replaces thememap1 to thememap2 and thememap3
    test_map_file = os.path.join(os.path.dirname(__file__),
                                 '..',
                                 '..',
                                 '..',
                                 'examples',
                                 'test_map.ini')

    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file
    package_dict['theme'] = ['non-mappable', 'thememap1']

    expected_groups_existing = ['existing-group']
    expected_groups_new = expected_groups_existing + ['somegroup1', 'somegroup2']
    expected_groups_multi = expected_groups_new + ['othergroup']

    package_dict.pop('extras', None)
    p = Package.get(package_data['id'])
    context['package'] = p

    package_data = call_action('package_update',
                               context=context,
                               **package_dict)

    #meta.Session.flush()
    #meta.Session.revision = repo.new_revision()

    # check - only existing group should be assigned
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    assert expected_groups_existing == groups, (expected_groups_existing, 'vs', groups,)

    # Allow the mapper to create missing groups on the fly.
    config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'true'

    package_dict['theme'] = ['non-mappable', 'thememap1']
    package_data = call_action('package_update', context=context, **package_dict)

    meta.Session.flush()
    meta.Session.revision = repo.new_revision()

    # recheck - this time, new groups should appear
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    assert len(expected_groups_new) == len(groups), (expected_groups_new, 'vs', groups,)
    assert set(expected_groups_new) == set(groups), (expected_groups_new, 'vs', groups,)

    # A theme mapping to multiple groups must not duplicate any of them.
    package_dict['theme'] = ['non-mappable', 'thememap1', 'thememap-multi']
    package_data = call_action('package_update', context=context, **package_dict)

    meta.Session.flush()
    meta.Session.revision = repo.new_revision()

    # recheck - there should be no duplicates
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    assert len(expected_groups_multi) == len(groups), (expected_groups_multi, 'vs', groups,)
    assert set(expected_groups_multi) == set(groups), (expected_groups_multi, 'vs', groups,)

    # Re-running the same update must be idempotent.
    package_data = call_action('package_update', context=context, **package_dict)

    meta.Session.flush()
    meta.Session.revision = repo.new_revision()

    # recheck - there still should be no duplicates
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    assert len(expected_groups_multi) == len(groups), (expected_groups_multi, 'vs', groups,)
    assert set(expected_groups_multi) == set(groups), (expected_groups_multi, 'vs', groups,)

    meta.Session.rollback()
def create_package_schema(self) -> Schema:
    """Return CKAN's stock schema for validating dataset creation."""
    create_schema = schema.default_create_package_schema()
    return create_schema
def _create_or_update_package(self, package_dict, harvest_object,
                              package_dict_form='rest'):
    '''
    Creates a new package or updates an existing one according to the
    package dictionary provided.

    The package dictionary can be in one of two forms:

    1. 'rest' - as seen on the RESTful API:

            http://datahub.io/api/rest/dataset/1996_population_census_data_canada

       This is the legacy form. It is the default to provide backward
       compatibility.

       * 'extras' is a dict e.g. {'theme': 'health', 'sub-theme': 'cancer'}
       * 'tags' is a list of strings e.g. ['large-river', 'flood']

    2. 'package_show' form, as provided by the Action API (CKAN v2.0+):

           http://datahub.io/api/action/package_show?id=1996_population_census_data_canada

       * 'extras' is a list of dicts
            e.g. [{'key': 'theme', 'value': 'health'},
                  {'key': 'sub-theme', 'value': 'cancer'}]
       * 'tags' is a list of dicts
            e.g. [{'name': 'large-river'}, {'name': 'flood'}]

    Note that the package_dict must contain an id, which will be used to
    check if the package needs to be created or updated (use the remote
    dataset id).

    If the remote server provides the modification date of the remote
    package, add it to package_dict['metadata_modified'].

    :returns: The same as what import_stage should return. i.e. True if the
              create or update occurred ok, 'unchanged' if it didn't need
              updating or False if there were errors.

    TODO: Not sure it is worth keeping this function. If useful it should
    use the output of package_show logic function (maybe keeping support
    for rest api based dicts
    '''
    assert package_dict_form in ('rest', 'package_show')
    try:
        # Change default schema: accept a caller-supplied id and ignore
        # unexpected keys instead of rejecting the dict.
        schema = default_create_package_schema()
        schema['id'] = [ignore_missing, six.text_type]
        schema['__junk'] = [ignore]

        # Check API version (harvest source config may override it)
        if self.config:
            try:
                api_version = int(self.config.get('api_version', 2))
            except ValueError:
                raise ValueError('api_version must be an integer')
        else:
            api_version = 2

        user_name = self._get_user_name()
        context = {
            'model': model,
            'session': Session,
            'user': user_name,
            'api_version': api_version,
            'schema': schema,
            'ignore_auth': True,
        }

        if self.config and self.config.get('clean_tags', False):
            tags = package_dict.get('tags', [])
            package_dict['tags'] = self._clean_tags(tags)

        # Check if package exists
        try:
            # _find_existing_package can be overridden if necessary
            existing_package_dict = self._find_existing_package(
                package_dict)

            # In case name has been modified when first importing. See issue #101.
            package_dict['name'] = existing_package_dict['name']

            # Check modified date.
            # NOTE(review): the two trailing disjuncts force an update
            # for one specific hard-coded dataset name/id regardless of
            # its modification date — a dataset-specific override that
            # should probably live in configuration.
            if 'metadata_modified' not in package_dict or \
                    package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified') or package_dict['name'] == "status-of-covid-19-cases-in-ontario-by-public-health-unit-phu" or package_dict['id'] == 'ecb75ea0-8b72-4f46-a14a-9bd54841d6ab':
                log.info(
                    'Package with GUID %s exists and needs to be updated'
                    % harvest_object.guid)
                # Update package
                context.update({'id': package_dict['id']})

                package_dict.setdefault('name',
                                        existing_package_dict['name'])
                '''
                what we want to do here is
                - not overwrite maintainer name or maintainer email
                  or maintainer branch with blank information
                - not include resources because it will overwrite
                  the existing resources
                - match owner_org
                - not overwrite all keywords (just add)
                '''
                # Merge (union) remote and local keywords per language.
                package_dict['keywords'] = {
                    "en": list(
                        set(existing_package_dict['keywords']['en'] +
                            package_dict['keywords']['en'])),
                    "fr": list(
                        set(existing_package_dict['keywords']['fr'] +
                            package_dict['keywords']['fr']))
                }

                package_dict['owner_org'] = package_dict['organization'][
                    'name']
                package_dict['harvester'] = "ontario-data-catalogue"

                # Drop blank maintainer fields so they don't clobber
                # existing values; backfill one missing translation
                # from the other language when only one is present.
                if package_dict.get("maintainer_email", "") == "":
                    del package_dict['maintainer_email']

                if "maintainer_translated" in package_dict:
                    if package_dict['maintainer_translated'].get(
                            "en", ""
                    ) == "" and package_dict['maintainer_translated'].get(
                            "fr", "") == "":
                        del package_dict['maintainer_translated']
                    elif package_dict['maintainer_translated'].get(
                            "en", ""
                    ) != "" and package_dict['maintainer_translated'].get(
                            "fr", "") == "":
                        package_dict['maintainer_translated'][
                            'fr'] = package_dict['maintainer_translated'][
                                'en']
                    elif package_dict['maintainer_translated'].get(
                            "en", ""
                    ) == "" and package_dict['maintainer_translated'].get(
                            "fr", "") != "":
                        package_dict['maintainer_translated'][
                            'en'] = package_dict['maintainer_translated'][
                                'fr']

                if "maintainer_branch" in package_dict:
                    if package_dict['maintainer_branch'].get(
                            "en",
                            ""
                    ) == "" and package_dict['maintainer_branch'].get(
                            "fr", "") == "":
                        del package_dict['maintainer_branch']

                if 'resources' in package_dict:
                    # Patch resources that already exist locally,
                    # create the rest; each is tagged as harvested.
                    for resource in package_dict['resources']:
                        resource.update({"harvested_resource": True})
                        resource_context = {
                            'model': model,
                            'session': Session,
                            'user': user_name,
                            'api_version': api_version,
                            'id': resource['id'],
                            'ignore_auth': True,
                        }
                        p.toolkit.get_action(
                            "resource_patch" if resource['id'] in list(
                                map(lambda x: x["id"],
                                    existing_package_dict["resources"])
                            ) else "resource_create")(resource_context,
                                                      resource)

                    # NOTE(review): this filter assumes every local
                    # resource dict has a 'harvested_resource' key —
                    # TODO confirm manually-added resources carry it.
                    list_of_remote_resources = list(
                        map(lambda x: x["id"], package_dict["resources"]))
                    for resource in list(
                            filter(
                                lambda x: x["harvested_resource"] == True,
                                existing_package_dict["resources"])):
                        # if there's a harvested resource locally that isn't in the latest harvested list of resources, delete it
                        if resource['id'] not in list_of_remote_resources:
                            resource_context = {
                                'model': model,
                                'session': Session,
                                'user': user_name,
                                'api_version': api_version,
                                'id': resource['id'],
                                'ignore_auth': True,
                            }
                            p.toolkit.get_action("resource_delete")(
                                resource_context, {
                                    'id': resource['id']
                                })

                    # Resources were handled above; keep them out of the
                    # package_patch call so they aren't overwritten.
                    del package_dict['resources']

                new_package = p.toolkit.get_action("package_patch")(
                    context, package_dict)
            else:
                log.info(
                    'No changes to package with GUID %s, skipping...'
                    % harvest_object.guid)
                # NB harvest_object.current/package_id are not set
                return 'unchanged'

            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table)\
                .where(harvest_object_table.c.package_id == bindparam('b_package_id')) \
                .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])

            # Flag this as the current harvest object
            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

        except p.toolkit.ObjectNotFound:
            # Package needs to be created

            # Get rid of auth audit on the context otherwise we'll get an
            # exception
            context.pop('__auth_audit', None)

            # Set name for new package to prevent name conflict, see issue #117
            if package_dict.get('name', None):
                package_dict['name'] = self._gen_new_name(
                    package_dict['name'])
            else:
                package_dict['name'] = self._gen_new_name(
                    package_dict['title'])

            log.info(
                'Package with GUID %s does not exist, let\'s create it'
                % harvest_object.guid)

            harvest_object.current = True
            harvest_object.package_id = package_dict['id']
            # Defer constraints and flush so the dataset can be indexed with
            # the harvest object id (on the after_show hook from the harvester
            # plugin)
            harvest_object.add()

            model.Session.execute(
                'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
            model.Session.flush()

            package_dict['owner_org'] = package_dict['organization'][
                'name']
            package_dict['harvester'] = "ontario-data-catalogue"
            for resource in package_dict['resources']:
                resource.update({"harvested_resource": True})

            # Fill in default maintainer contact details when the remote
            # record left them blank; backfill one missing translation
            # from the other language when only one is present.
            if package_dict.get("maintainer_email", "") == "":
                package_dict['maintainer_email'] = "*****@*****.**"
            if "maintainer_translated" in package_dict:
                if package_dict['maintainer_translated'].get(
                        "en", ""
                ) == "" and package_dict['maintainer_translated'].get(
                        "fr", "") == "":
                    package_dict['maintainer_translated'] = {
                        "en": "Open Data",
                        "fr": "Données ouvertes"
                    }
                elif package_dict['maintainer_translated'].get(
                        "en", ""
                ) != "" and package_dict['maintainer_translated'].get(
                        "fr", "") == "":
                    package_dict['maintainer_translated'][
                        'fr'] = package_dict['maintainer_translated']['en']
                elif package_dict['maintainer_translated'].get(
                        "en", ""
                ) == "" and package_dict['maintainer_translated'].get(
                        "fr", "") != "":
                    package_dict['maintainer_translated'][
                        'en'] = package_dict['maintainer_translated']['fr']
            else:
                package_dict['maintainer_translated'] = {
                    "en": "Open Data",
                    "fr": "Données ouvertes"
                }

            new_package = p.toolkit.get_action(
                'package_create' if package_dict_form == 'package_show'
                else 'package_create_rest')(context, package_dict)

        Session.commit()

        return True

    except p.toolkit.ValidationError as e:
        log.exception(e)
        self._save_object_error(
            'Invalid package with GUID %s: %r' %
            (harvest_object.guid, e.error_dict),
            harvest_object, 'Import')
    except Exception as e:
        # Catch-all so one bad record doesn't abort the whole import;
        # the error is attached to the harvest object.
        log.exception(e)
        self._save_object_error('%r' % e, harvest_object, 'Import')

    return None
def _create_package_schema(cls):
    """
    Create the common validation schema shared by dataset create and update.

    Starts from CKAN's ``default_create_package_schema()`` and overlays the
    Kata-specific fields. Each schema value is an ordered list of
    validator/converter callables (CKAN navl pipeline) — the order is
    significant, so do not reorder entries.

    :returns: schema dict mapping field name -> list of validators, or
              (for compound fields such as ``agent``/``contact``) a nested
              dict of subfield -> list of validators.
    """
    # TODO: MIKKO: Use the general converter for lang_title and check that lang_title exists!
    # Note: harvester schemas
    schema = default_create_package_schema()
    # Kata replaces CKAN's single 'author' field with the 'agent' compound field below.
    schema.pop('author')

    # Required Kata fields must be non-empty; recommended ones may be absent.
    # Both are stored as package extras via convert_to_extras_kata.
    for key in settings.KATA_FIELDS_REQUIRED:
        schema[key] = [not_empty, co.convert_to_extras_kata, unicode, va.validate_general]
    for key in settings.KATA_FIELDS_RECOMMENDED:
        schema[key] = [ignore_missing, co.convert_to_extras_kata, unicode, va.validate_general]

    # Compound (repeating) fields: each subfield is flattened into extras.
    schema['agent'] = {'role': [not_empty, va.check_agent_fields, va.validate_general, unicode, co.flattened_to_extras],
                       'name': [ignore_empty, va.validate_general, unicode, va.contains_alphanumeric, co.flattened_to_extras],
                       'id': [ignore_empty, va.validate_general, unicode, co.flattened_to_extras],
                       'organisation': [ignore_empty, va.validate_general, unicode, va.contains_alphanumeric, co.flattened_to_extras],
                       'URL': [ignore_empty, url_validator, va.validate_general, unicode, co.flattened_to_extras],
                       'fundingid': [ignore_empty, va.validate_general, unicode, co.flattened_to_extras]}
    schema['contact'] = {'name': [not_empty, va.validate_general, unicode, va.contains_alphanumeric, co.flattened_to_extras],
                         'email': [not_empty, unicode, va.validate_email, co.flattened_to_extras],
                         'URL': [ignore_empty, url_validator, va.validate_general, unicode, co.flattened_to_extras],
                         # phone number can be missing from the first users
                         'phone': [ignore_missing, unicode, va.validate_phonenum, co.flattened_to_extras]}
    # phone number can be missing from the first users
    # schema['contact_phone'] = [ignore_missing, validate_phonenum, convert_to_extras_kata, unicode]
    # schema['contact_URL'] = [ignore_missing, url_validator, convert_to_extras_kata, unicode, validate_general]
    schema['event'] = {'type': [ignore_missing, va.check_events, unicode, co.flattened_to_extras, va.validate_general],
                       'who': [ignore_missing, unicode, co.flattened_to_extras, va.validate_general,
                               va.contains_alphanumeric],
                       'when': [ignore_missing, unicode, co.flattened_to_extras, va.validate_kata_date],
                       'descr': [ignore_missing, unicode, co.flattened_to_extras, va.validate_general,
                                 va.contains_alphanumeric]}
    # 'id' gets a PID-based value when empty (co.update_pid).
    schema['id'] = [default(u''), co.update_pid, unicode]
    # Translated titles: stored in extras per language via ltitle_to_extras.
    schema['langtitle'] = {'value': [not_missing, unicode, va.validate_title, va.validate_title_duplicates, co.ltitle_to_extras],
                           'lang': [not_missing, unicode, co.convert_languages]}
    schema['language'] = \
        [ignore_missing, co.convert_languages, co.remove_disabled_languages, co.convert_to_extras_kata, unicode]
    schema['temporal_coverage_begin'] = \
        [ignore_missing, va.validate_kata_date, co.convert_to_extras_kata, unicode]
    schema['temporal_coverage_end'] = \
        [ignore_missing, va.validate_kata_date, co.convert_to_extras_kata, unicode]
    schema['pids'] = {'provider': [ignore_missing, unicode, co.flattened_to_extras],
                      'id': [not_empty, va.validate_general, unicode, co.flattened_to_extras],
                      'type': [not_missing, unicode, co.flattened_to_extras],
                      'primary': [ignore_missing, unicode, co.flattened_to_extras]}
    schema['tag_string'] = [ignore_missing, not_empty, va.kata_tag_string_convert]
    # otherwise the tags would be validated with default tag validator during update
    schema['tags'] = cls.tags_schema()
    schema['xpaths'] = [ignore_missing, co.to_extras_json]
    # these two can be missing from the first Kata end users
    # TODO: version date validation should be tighter, see metadata schema
    schema['version'] = [not_empty, unicode, va.validate_kata_date]
    schema['availability'] = [not_missing, co.convert_to_extras_kata]
    schema['langdis'] = [co.checkbox_to_boolean, co.convert_to_extras_kata]
    # TODO: MIKKO: __extras: check_langtitle needed? Its 'raise' seems to be unreachable
    schema['__extras'] = [va.check_agent, va.check_langtitle, va.check_contact, va.check_pids]
    schema['__junk'] = [va.check_junk]
    # 'name' may be derived from the package id when missing (co.default_name_from_id).
    schema['name'] = [ignore_missing, unicode, co.default_name_from_id, package_name_validator,
                      va.validate_general]
    schema['access_application_download_URL'] = [ignore_missing, va.validate_access_application_download_url,
                                                 unicode, va.validate_general, co.convert_to_extras_kata]
    schema['access_application_new_form'] = [co.checkbox_to_boolean, co.convert_to_extras_kata,
                                             co.remove_access_application_new_form]
    schema['access_application_URL'] = [ignore_missing, va.validate_access_application_url, unicode,
                                        va.validate_general, co.convert_to_extras_kata]
    schema['access_request_URL'] = [ignore_missing, va.check_access_request_url, url_validator, unicode,
                                    va.validate_general, co.convert_to_extras_kata]
    schema['through_provider_URL'] = [ignore_missing, va.check_through_provider_url, url_validator, unicode,
                                      va.validate_general, co.convert_to_extras_kata]
    schema['discipline'] = [ignore_missing, va.validate_discipline, co.convert_to_extras_kata, unicode]
    schema['geographic_coverage'] = [ignore_missing, va.validate_spatial, co.convert_to_extras_kata, unicode]
    schema['license_URL'] = [ignore_missing, co.convert_to_extras_kata, unicode, va.validate_general]
    schema['owner_org'] = [ignore_missing, va.kata_owner_org_validator, unicode]

    # Resource-level overrides on top of CKAN's default resource schema.
    schema['resources']['url'] = [default(settings.DATASET_URL_UNKNOWN), va.check_direct_download_url,
                                  unicode, va.validate_general]
    # Conversion (and validation) of direct_download_URL to resource['url'] is in utils.py:dataset_to_resource()
    schema['resources']['algorithm'] = [ignore_missing, unicode, va.validate_algorithm]
    schema['resources']['hash'].append(va.validate_general)
    schema['resources']['mimetype'].append(va.validate_mimetype)

    return schema