def test_gather(self):
    '''Run the gather stage for a freshly saved CMDI source and job.'''
    cmdi_source = HarvestSource(type="cmdi", url="http://localhost/test_cmdi")
    cmdi_source.save()

    gather_job = HarvestJob(source=cmdi_source)
    gather_job.save()

    # Replace the harvester's client with the _FakeClient stub before
    # gathering.
    harvester = self.harvester
    harvester.client = _FakeClient()
    harvester.gather_stage(gather_job)
def setup_class(cls):
    '''
    Store two harvest objects, the second one with an
    ``original_document`` extra, and remember their ids on the class.

    Raises SkipTest when the harvest extension model cannot be imported.
    '''
    try:
        from ckanext.harvest.model import HarvestObject, HarvestJob, HarvestSource, HarvestObjectExtra
    except ImportError:
        raise SkipTest('The harvester extension is needed for these tests')

    def _new_job():
        # Each harvest object gets its own job and source
        return HarvestJob(source=HarvestSource(url='http://', type='xx'))

    cls.content1 = '<xml>Content 1</xml>'
    first_object = HarvestObject(guid='test-ho-1',
                                 job=_new_job(),
                                 content=cls.content1)

    cls.content2 = '<xml>Content 2</xml>'
    cls.original_content2 = '<xml>Original Content 2</xml>'
    second_object = HarvestObject(guid='test-ho-2',
                                  job=_new_job(),
                                  content=cls.content2)
    original_document = HarvestObjectExtra(key='original_document',
                                           value=cls.original_content2,
                                           object=second_object)

    for record in (first_object, second_object, original_document):
        Session.add(record)
    Session.commit()

    cls.object_id_1 = first_object.id
    cls.object_id_2 = second_object.id
def harvest_source_create(context, data_dict):
    '''
    Validates ``data_dict`` and stores a new HarvestSource.

    :param context: must contain ``session``; an optional ``schema``
        overrides ``default_harvest_source_schema()``
    :param data_dict: the source fields; ``url`` and ``type`` are read
        from the validated data, the remaining fields are optional
    :returns: the new source as returned by ``harvest_source_dictize``
    :raises ValidationError: if ``data_dict`` does not validate (the
        session is rolled back first)
    '''
    log.info('Creating harvest source: %r', data_dict)
    check_access('harvest_source_create', context, data_dict)

    session = context['session']
    schema = context.get('schema') or default_harvest_source_schema()

    data, errors = validate(data_dict, schema)
    if errors:
        session.rollback()
        log.warning('Harvest source does not validate: %r', errors)
        raise ValidationError(errors, _error_summary(errors))

    source = HarvestSource()
    source.url = data['url'].strip()
    source.type = data['type']

    # Optional fields are only copied when they validated to a value
    optional_fields = ('active', 'title', 'description', 'user_id',
                       'publisher_id', 'config')
    for field in optional_fields:
        value = data.get(field)
        if value is not None:
            setattr(source, field, value)

    # An explicitly passed 'active' always wins, even when it validated
    # to None and was therefore skipped by the loop above
    if 'active' in data_dict:
        source.active = data['active']

    source.save()
    log.info('Harvest source created: %s', source.id)

    return harvest_source_dictize(source, context)
def test_import(self):
    '''
    Import two CMDI fixtures, check the resulting packages and then
    run the import stage for a harvest object reported as deleted.
    '''
    def _error_text(obj):
        # Failure message listing every error attached to the object
        return u"\n".join(unicode(error.message) for error in (obj.errors or []))

    def _show(package_id):
        return get_action('package_show')({'user': '******'}, {'id': package_id})

    source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
    source.save()
    job = HarvestJob(source=source)
    job.save()

    # First fixture
    imported = self._run_import("cmdi_1.xml", job)
    self.assertEquals(len(imported.errors), 0, _error_text(imported))

    package = _show('urn-nbn-fi-lb-20140730180')
    self.assertEquals(package.get('id', None), 'http://urn.fi/urn:nbn:fi:lb-20140730180')
    self.assertEquals(package.get('name', None), 'urn-nbn-fi-lb-20140730180')
    self.assertEquals(package.get('notes', None), u'{"eng": "Test description"}')
    self.assertEquals(package.get('version', None), '2012-09-07')
    self.assertEquals(package.get('title', []), '{"eng": "Longi Corpus"}')
    self.assertEquals(package.get('license_id', None), 'undernegotiation')

    provider = config['ckan.site_url']
    expected_pid = {
        u'id': u'http://islrn.org/resources/248-895-085-557-0',
        u'provider': provider,
        u'type': u'metadata',
    }
    self.assertTrue(expected_pid in package.get('pids'))

    model.Session.flush()

    # Second fixture
    imported = self._run_import("cmdi_2.xml", job)
    self.assertEquals(len(imported.errors), 0, _error_text(imported))

    package = _show('urn-nbn-fi-lb-20140730186')
    self.assertEquals(package['temporal_coverage_begin'], '1880')
    self.assertEquals(package['temporal_coverage_end'], '1939')
    self.assertEquals(package.get('license_id', None), 'other')

    # Delete package
    deletion = HarvestObject()
    deletion.content = None
    deletion.id = "test-cmdi-delete"
    deletion.guid = "test-cmdi-delete"
    deletion.source = job.source
    deletion.harvest_source_id = None
    deletion.job = job
    deletion.package_id = package.get('id')
    deletion.report_status = "deleted"
    deletion.save()

    self.harvester.import_stage(deletion)

    model.Session.flush()
    self.assertEquals(model.Package.get(package['id']).state, 'deleted')
def harvest_source_create(context, data_dict):
    '''
    Creates a HarvestSource from a validated data dict.

    :param context: must contain ``session``; ``schema`` is optional and
        defaults to ``default_harvest_source_schema()``
    :param data_dict: the fields of the new source
    :returns: the saved source, dictized with ``harvest_source_dictize``
    :raises ValidationError: when validation reports errors; the session
        is rolled back before raising
    '''
    log.info('Creating harvest source: %r', data_dict)
    check_access('harvest_source_create', context, data_dict)

    session = context['session']
    schema = context.get('schema') or default_harvest_source_schema()

    data, errors = validate(data_dict, schema)
    if errors:
        session.rollback()
        log.warning('Harvest source does not validate: %r', errors)
        raise ValidationError(errors, _error_summary(errors))

    source = HarvestSource()
    source.url = data['url'].strip()
    source.type = data['type']

    # Copy the optional fields that came out of validation with a value
    for name in ('active', 'title', 'description', 'user_id',
                 'publisher_id', 'config'):
        value = data.get(name)
        if value is not None:
            setattr(source, name, value)

    # If the caller passed 'active' at all, use the validated value even
    # when it is None
    if 'active' in data_dict:
        source.active = data['active']

    source.save()
    log.info('Harvest source created: %s', source.id)

    return harvest_source_dictize(source, context)
def harvest_job_list(context, data_dict):
    '''
    Auth function: decides whether the user in ``context`` may list the
    jobs of the harvest source given by ``data_dict['source_id']``.

    :param context: the ``user`` entry is read
    :param data_dict: ``source_id`` selects the harvest source
    :returns: ``{'success': True}`` or ``{'success': False, 'msg': ...}``
    :raises p.toolkit.ObjectNotFound: if the source does not exist
    '''
    user = context.get('user')

    source_id = data_dict.get('source_id', False)
    if not source_id:
        # This message has no placeholder; %-formatting it with the user
        # name raised a TypeError instead of returning the denial.
        return {
            'success': False,
            'msg': _('Only sysadmins can list all harvest jobs')
        }

    source = HarvestSource.get(source_id)
    if not source:
        raise p.toolkit.ObjectNotFound

    # Check the user is admin/editor for the publisher - i.e. has
    # update_dataset permission
    can_update = ckan.new_authz.has_user_permission_for_group_or_org(
        source.publisher_id, user, 'update_dataset')
    if not can_update:
        return {
            'success': False,
            'msg': _('User %s not authorized to list jobs from source %s')
            % (str(user), source.id)
        }

    return {'success': True}
def test_auth_publisher_profile_different_publisher(self):
    '''
    A user of publisher 2 gets the list and the creation form, while the
    pages of a source owned by publisher 1 are requested expecting 401.
    '''
    # Create a source for publisher 1
    source = HarvestSource(url=u'http://test-source.com',
                           type='ckan',
                           publisher_id=self.publisher1.id)
    Session.add(source)
    Session.commit()

    # Every request is made as the publisher 2 user
    as_publisher2 = {
        'REMOTE_USER': self.publisher2_user.name.encode('utf8')
    }

    # List (publishers can see the sources list)
    listing = self.app.get('/harvest', extra_environ=as_publisher2)
    assert 'Harvesting Sources' in listing

    # Create
    creation_form = self.app.get('/harvest/new', extra_environ=as_publisher2)
    assert 'New harvest source' in creation_form
    assert 'publisher_id' in creation_form

    # Check that this publisher is not allowed to manage sources from
    # other publishers: read, edit and refresh, in that order
    expected_status = 401
    for pattern in ('/harvest/%s', '/harvest/edit/%s', '/harvest/refresh/%s'):
        self.app.get(pattern % source.id,
                     status=expected_status,
                     extra_environ=as_publisher2)
def harvest_job_create(context, data_dict):
    '''
    Creates a new harvest job for an existing, active harvest source.

    :param source_id: id of the harvest source (read from ``data_dict``)
    :returns: the saved job as returned by ``harvest_job_dictize``
    :raises NotFound: if the harvest source does not exist
    :raises Exception: if the harvest source is not active
    :raises HarvestJobExists: if ``_check_for_existing_jobs`` reports a
        job for this source
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise Exception('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    return harvest_job_dictize(job, context)
def harvest_source_index_clear(context, data_dict):
    '''
    Deletes from the search index the documents that belong to a harvest
    source.

    A delete query matching the source id (``harvest_source_id`` field)
    and the configured ``ckan.site_id`` is sent through the connection
    returned by ``make_connection()``. It is committed when the
    ``ckan.search.solr_commit`` option is true (the default). Access is
    checked with the ``harvest_source_clear`` auth function.

    :param id: the id of the harvest source to clear
    :type id: string
    :raises NotFound: if the harvest source does not exist
    :raises SearchIndexError: if the delete query or the commit fails
    '''
    check_access('harvest_source_clear', context, data_dict)
    harvest_source_id = data_dict.get('id')

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    # Use the id stored on the source object from here on
    harvest_source_id = source.id

    conn = make_connection()
    query = ''' +%s:"%s" +site_id:"%s" ''' % (
        'harvest_source_id', harvest_source_id, config.get('ckan.site_id'))
    try:
        conn.delete_query(query)
        if asbool(config.get('ckan.search.solr_commit', 'true')):
            conn.commit()
    except Exception as e:
        # "except ... as" is accepted by Python 2.6+ and Python 3
        log.exception(e)
        raise SearchIndexError(e)
def harvest_source_id_exists(value, context):
    '''
    Validator: returns ``value`` unchanged when ``HarvestSource.get``
    finds a source for it, and raises ``Invalid`` otherwise.
    '''
    if HarvestSource.get(value):
        return value
    raise Invalid('Harvest Source with id %r does not exist.' % str(value))
def harvest_job_create(context, data_dict):
    '''
    Creates a new harvest job for an existing, active harvest source.

    :param source_id: id of the harvest source (read from ``data_dict``)
    :returns: the saved job as returned by ``harvest_job_dictize``
    :raises NotFound: if the harvest source does not exist
    :raises HarvestError: if the source is inactive, or if
        ``harvest_job_list`` returns a job with status ``New`` for it
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source. A separate
    # dict is used so the caller's data_dict name is not rebound.
    unrun_filter = {
        'source_id': source_id,
        'status': u'New'
    }
    exists = harvest_job_list(context, unrun_filter)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    return harvest_job_dictize(job, context)
def harvest_job_create(context, data_dict):
    '''
    Creates a harvest job for the source named in ``data_dict``.

    :param source_id: id of the harvest source (read from ``data_dict``)
    :returns: the saved job, dictized with ``harvest_job_dictize``
    :raises NotFound: if the harvest source does not exist
    :raises Exception: if the harvest source is not active
    :raises HarvestJobExists: if ``_check_for_existing_jobs`` reports a
        job for this source
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise Exception('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    return harvest_job_dictize(job, context)
def after_show(self, context, data_dict):
    '''
    Adds harvest information to a dataset dict and returns it.

    Harvest source datasets get a ``status`` entry. For other datasets
    the extras describing the current harvest object are added when
    ``validate`` is False in the context.
    '''
    is_harvest_source = ('type' in data_dict
                         and data_dict['type'] == DATASET_TYPE_NAME)

    if is_harvest_source:
        # This is a harvest source dataset, add extra info from the
        # HarvestSource object
        source = HarvestSource.get(data_dict['id'])
        if source:
            data_dict['status'] = harvest_logic.action.get.harvest_source_show_status(
                context, {'id': source.id})
        else:
            log.error('Harvest source not found for dataset {0}'.format(data_dict['id']))
        return data_dict

    # This is a normal dataset, check if it was harvested and if so, add
    # info about the HarvestObject and HarvestSource
    current_object = (
        model.Session.query(HarvestObject)
        .filter(HarvestObject.package_id == data_dict['id'])
        .filter(HarvestObject.current == True)
        .first()
    )

    # validate is False is passed only on indexing.
    if current_object and not context.get('validate', True):
        harvest_extras = (
            ('harvest_object_id', current_object.id),
            ('harvest_source_id', current_object.source.id),
            ('harvest_source_title', current_object.source.title),
        )
        for extra_key, extra_value in harvest_extras:
            _add_extra(data_dict, extra_key, extra_value)

    return data_dict
def harvest_source_index_clear(context, data_dict): ''' Clears all datasets, jobs and objects related to a harvest source, but keeps the source itself. This is useful to clean history of long running harvest sources to start again fresh. :param id: the id of the harvest source to clear :type id: string ''' check_access('harvest_source_clear', context, data_dict) harvest_source_id = data_dict.get('id') source = HarvestSource.get(harvest_source_id) if not source: log.error('Harvest source %s does not exist', harvest_source_id) raise NotFound('Harvest source %s does not exist' % harvest_source_id) harvest_source_id = source.id conn = make_connection() query = ''' +%s:"%s" +site_id:"%s" ''' % ( 'harvest_source_id', harvest_source_id, config.get('ckan.site_id')) solr_commit = toolkit.asbool(config.get('ckan.search.solr_commit', 'true')) if toolkit.check_ckan_version(max_version='2.5.99'): # conn is solrpy try: conn.delete_query(query) if solr_commit: conn.commit() except Exception, e: log.exception(e) raise SearchIndexError(e) finally:
def harvest_job_list(context, data_dict):
    '''
    Auth function (publisher profile): decides whether the user in
    ``context`` may list harvest jobs.

    Sysadmins are always allowed. Other users must belong to a publisher
    group, must ask for one source (``source_id``) and that source must
    belong to one of their publisher groups.

    :param context: the ``user`` entry is read
    :param data_dict: ``source_id`` selects the harvest source
    :returns: ``{'success': True}`` or ``{'success': False, 'msg': ...}``
    :raises NotFound: if the requested source does not exist
    '''
    user = context.get('user')

    # Check user is logged in
    if not user:
        return {'success': False,
                'msg': _('Only logged users are authorized to see their sources')}

    user_obj = User.get(user)

    # The remaining checks only apply to non sysadmin users
    if Authorizer().is_sysadmin(user):
        return {'success': True}

    # Look the groups up once; they are needed for both checks below
    publisher_groups = user_obj.get_groups(u'publisher') if user_obj else []
    if not publisher_groups:
        return {'success': False,
                'msg': _('User %s must belong to a publisher to list harvest jobs') % str(user)}

    source_id = data_dict.get('source_id', False)
    if not source_id:
        # This message has no placeholder; %-formatting it with the user
        # name raised a TypeError instead of returning the denial.
        return {'success': False,
                'msg': _('Only sysadmins can list all harvest jobs')}

    source = HarvestSource.get(source_id)
    if not source:
        raise NotFound

    if source.publisher_id not in [g.id for g in publisher_groups]:
        return {'success': False,
                'msg': _('User %s not authorized to list jobs from source %s') % (str(user), source.id)}

    return {'success': True}
def _test_auth_not_allowed(self, user_name=None, source=None, status=401):
    '''
    Request the list, create, read, edit and refresh pages, each with the
    expected ``status``.

    A harvest source is created when none is given. The requests carry
    ``REMOTE_USER`` only when ``user_name`` is given.
    '''
    if not source:
        # Create harvest source
        source = HarvestSource(url=u'http://test-source.com', type='ckan')
        Session.add(source)
        Session.commit()

    extra_environ = {}
    if user_name:
        extra_environ = {'REMOTE_USER': user_name.encode('utf8')}

    # List, Create, Read, Edit, Refresh - in that order
    paths = (
        '/harvest',
        '/harvest/new',
        '/harvest/%s' % source.id,
        '/harvest/edit/%s' % source.id,
        '/harvest/refresh/%s' % source.id,
    )
    for path in paths:
        res = self.app.get(path, status=status, extra_environ=extra_environ)
def harvest_job_create(context, data_dict):
    '''
    Auth function: decides whether the user in ``context`` may create a
    harvest job for ``data_dict['source_id']``.

    Sysadmins are allowed. Other users are allowed when the source's
    publisher is one of their organization groups.

    :returns: ``{'success': True}`` or ``{'success': False, 'msg': ...}``
    :raises NotFound: if the source does not exist
    '''
    model = context['model']
    user = context.get('user')
    source_id = data_dict['source_id']

    if not user:
        denial = _('Non-logged in users are not authorized to create harvest jobs')
        return {'success': False, 'msg': denial}

    if ckan.new_authz.is_sysadmin(user):
        return {'success': True}

    user_obj = User.get(user)
    source = HarvestSource.get(source_id)
    if not source:
        raise NotFound

    # Allowed when the source's publisher is among the user's organizations
    if user_obj and source.publisher_id in [
            group.id for group in user_obj.get_groups(u'organization')]:
        return {'success': True}

    return {
        'success': False,
        'msg': _('User %s not authorized to create a job for source %s')
        % (str(user), source.id)
    }
def harvest_source_clear(context, data_dict):
    '''
    Clears all datasets, jobs and objects related to a harvest source, but
    keeps the source itself. This is useful to clean history of long running
    harvest sources to start again fresh.

    Steps, in order: the ``harvest_source_clear`` access check, the source
    lookup, ``harvest_source_index_clear``, one raw SQL script run through
    ``model.Session.execute`` and finally a re-index of the source's own
    package. ``context`` is updated in place with ``validate=False`` and
    ``ignore_auth=True`` before the re-index.

    :param id: the id of the harvest source to clear
    :type id: string
    :returns: a dict with the ``id`` of the cleared harvest source
    :raises NotFound: if the harvest source does not exist
    '''
    check_access('harvest_source_clear', context, data_dict)

    harvest_source_id = data_dict.get('id', None)

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error('Harvest source %s does not exist', harvest_source_id)
        raise NotFound('Harvest source %s does not exist' % harvest_source_id)

    # From here on use the id stored on the source object
    harvest_source_id = source.id

    # Clear all datasets from this source from the index
    harvest_source_index_clear(context, data_dict)

    # One script wrapped in begin/commit. The packages of the source's
    # harvest objects are first flagged with state 'to_delete'; the later
    # package-related deletes all select on that state.
    # NOTE(review): the id is put into the SQL text with str.format rather
    # than bound as a parameter. The value is source.id as returned by
    # HarvestSource.get, not the raw data_dict value.
    # NOTE(review): the statement order presumably follows the foreign key
    # dependencies between the tables - confirm against the schema before
    # reordering anything.
    sql = '''begin; update package set state = 'to_delete' where id in (select package_id from harvest_object where harvest_source_id = '{harvest_source_id}'); delete from harvest_object_error where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}'); delete from harvest_object_extra where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}'); delete from harvest_object where harvest_source_id = '{harvest_source_id}'; delete from harvest_gather_error where harvest_job_id in (select id from harvest_job where source_id = '{harvest_source_id}'); delete from harvest_job where source_id = '{harvest_source_id}'; delete from package_role where package_id in (select id from package where state = 'to_delete' ); delete from user_object_role where id not in (select user_object_role_id from package_role) and context = 'Package'; delete from resource_revision where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete')); delete from resource_group_revision where package_id in (select id from package where state = 'to_delete'); delete from package_tag_revision where package_id in (select id from package 
where state = 'to_delete'); delete from member_revision where table_id in (select id from package where state = 'to_delete'); delete from package_extra_revision where package_id in (select id from package where state = 'to_delete'); delete from package_revision where id in (select id from package where state = 'to_delete'); delete from package_tag where package_id in (select id from package where state = 'to_delete'); delete from resource where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete')); delete from package_extra where package_id in (select id from package where state = 'to_delete'); delete from member where table_id in (select id from package where state = 'to_delete'); delete from resource_group where package_id in (select id from package where state = 'to_delete'); delete from package where id in (select id from package where state = 'to_delete'); commit;'''.format(
        harvest_source_id=harvest_source_id)

    model = context['model']

    model.Session.execute(sql)

    # Refresh the index for this source to update the status object
    context.update({'validate': False, 'ignore_auth': True})
    package_dict = logic.get_action('package_show')(context, {
        'id': harvest_source_id
    })

    if package_dict:
        package_index = PackageSearchIndex()
        package_index.index_package(package_dict)

    return {'id': harvest_source_id}
def harvest_job_create(context, data_dict):
    '''
    Creates a harvest job for the source named in ``data_dict``.

    :param source_id: id of the harvest source (read from ``data_dict``)
    :returns: the saved job, dictized with ``harvest_job_dictize``
    :raises NotFound: if the harvest source does not exist
    :raises HarvestError: if the source is inactive, or if
        ``harvest_job_list`` returns a job with status ``New`` for it
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warning('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source. The query
    # gets its own dict instead of rebinding the data_dict parameter.
    unrun_filter = {'source_id': source_id, 'status': u'New'}
    exists = harvest_job_list(context, unrun_filter)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    return harvest_job_dictize(job, context)
def harvest_source_show(context, data_dict):
    '''
    Returns the metadata of a harvest source

    The source is looked up with ``HarvestSource.get`` and returned as
    dictized by ``harvest_source_dictize``. On success the source object
    is stored in ``context['source']``, and ``context['include_status']``
    is set to True unless the caller already set it.

    :param id: the value identifying the harvest source
    :type id: string
    :param attr: optional, passed on to ``HarvestSource.get`` as ``attr``

    :returns: harvest source metadata
    :rtype: dictionary
    :raises NotFound: if no harvest source matches
    '''
    check_access('harvest_source_show', context, data_dict)

    source_id = data_dict.get('id')
    attr = data_dict.get('attr', None)

    source = HarvestSource.get(source_id, attr=attr)
    if not source:
        # Do not leave a None (or stale) 'source' entry behind: code that
        # finds the key in the context uses its value without looking the
        # source up again.
        context.pop('source', None)
        raise NotFound

    context['source'] = source
    if 'include_status' not in context:
        context['include_status'] = True

    return harvest_source_dictize(source, context)
def harvest_source_clear(context, data_dict):
    """
    Clears all datasets, jobs and objects related to a harvest source, but
    keeps the source itself. This is useful to clean history of long running
    harvest sources to start again fresh.

    Steps, in order: the ``harvest_source_clear`` access check, the source
    lookup, ``harvest_source_index_clear``, one raw SQL script run through
    ``model.Session.execute`` and finally a re-index of the source's own
    package. ``context`` is updated in place with ``validate=False`` and
    ``ignore_auth=True`` before the re-index.

    :param id: the id of the harvest source to clear
    :type id: string
    :returns: a dict with the ``id`` of the cleared harvest source
    :raises NotFound: if the harvest source does not exist
    """
    check_access("harvest_source_clear", context, data_dict)

    harvest_source_id = data_dict.get("id", None)

    source = HarvestSource.get(harvest_source_id)
    if not source:
        log.error("Harvest source %s does not exist", harvest_source_id)
        raise NotFound("Harvest source %s does not exist" % harvest_source_id)

    # From here on use the id stored on the source object
    harvest_source_id = source.id

    # Clear all datasets from this source from the index
    harvest_source_index_clear(context, data_dict)

    # One script wrapped in begin/commit. The packages of the source's
    # harvest objects are first flagged with state 'to_delete'; the later
    # package-related deletes all select on that state.
    # NOTE(review): the id is put into the SQL text with str.format rather
    # than bound as a parameter. The value is source.id as returned by
    # HarvestSource.get, not the raw data_dict value.
    # NOTE(review): the statement order presumably follows the foreign key
    # dependencies between the tables - confirm against the schema before
    # reordering anything.
    sql = """begin; update package set state = 'to_delete' where id in (select package_id from harvest_object where harvest_source_id = '{harvest_source_id}'); delete from harvest_object_error where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}'); delete from harvest_object_extra where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}'); delete from harvest_object where harvest_source_id = '{harvest_source_id}'; delete from harvest_gather_error where harvest_job_id in (select id from harvest_job where source_id = '{harvest_source_id}'); delete from harvest_job where source_id = '{harvest_source_id}'; delete from package_role where package_id in (select id from package where state = 'to_delete' ); delete from user_object_role where id not in (select user_object_role_id from package_role) and context = 'Package'; delete from resource_revision where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete')); delete from resource_group_revision where package_id in (select id from package where state = 'to_delete'); delete from package_tag_revision where package_id in (select id from package 
where state = 'to_delete'); delete from member_revision where table_id in (select id from package where state = 'to_delete'); delete from package_extra_revision where package_id in (select id from package where state = 'to_delete'); delete from package_revision where id in (select id from package where state = 'to_delete'); delete from package_tag where package_id in (select id from package where state = 'to_delete'); delete from resource where resource_group_id in (select id from resource_group where package_id in (select id from package where state = 'to_delete')); delete from package_extra where package_id in (select id from package where state = 'to_delete'); delete from member where table_id in (select id from package where state = 'to_delete'); delete from resource_group where package_id in (select id from package where state = 'to_delete'); delete from package where id in (select id from package where state = 'to_delete'); commit;""".format(
        harvest_source_id=harvest_source_id
    )

    model = context["model"]

    model.Session.execute(sql)

    # Refresh the index for this source to update the status object
    context.update({"validate": False, "ignore_auth": True})
    package_dict = logic.get_action("package_show")(context, {"id": harvest_source_id})

    if package_dict:
        package_index = PackageSearchIndex()
        package_index.index_package(package_dict)

    return {"id": harvest_source_id}
def test_form_bound_to_new_object(self):
    '''The fieldset bound to an unsaved source renders its url and description.'''
    unsaved = HarvestSource(url=u'http://localhost/',
                            description=u'My source',
                            type=u'Gemini')

    fieldset = form.get_harvest_source_fieldset().bind(unsaved)
    rendered = fieldset.render()

    # Field names and their values must both show up in the output
    for fragment in ('url', 'http://localhost/', 'description', 'My source'):
        assert fragment in rendered
def test_form_validate_new_object_and_sync(self):
    '''Posted data bound to the HarvestSource class validates and syncs to a stored source.'''
    def _find_by_url():
        return HarvestSource.get(u'http://localhost/', None, 'url')

    # Nothing with this url may exist beforehand
    assert not _find_by_url()

    posted = {
        'HarvestSource--url': u'http://localhost/',
        'HarvestSource--type': u'Gemini',
        'HarvestSource--description': u'My source'
    }
    fieldset = form.get_harvest_source_fieldset()
    # Bind to the class itself, not to an instance
    fieldset = fieldset.bind(HarvestSource, data=posted, session=model.Session)

    # Test bound_fields.validate().
    fieldset.validate()
    assert not fieldset.errors

    # Test bound_fields.sync().
    fieldset.sync()
    model.Session.commit()

    stored = _find_by_url()
    assert stored.id
def _update_harvest_source_object(context, data_dict):
    '''
    Updates an actual HarvestSource object with the data dict of the
    harvest_source dataset. All validation and authorization checks
    should be used by now, so this function is not to be used directly to
    update harvest sources.

    The source is only added to the session, not committed. A ``url``
    value in ``data_dict`` is replaced by its stripped form. If the source
    ends up inactive, its jobs with status ``New`` are set to ``Aborted``.

    :param data_dict: A standard package data_dict

    :returns: The updated HarvestSource object
    :rtype: HarvestSource object
    :raises logic.NotFound: if the harvest source does not exist
    '''
    source_id = data_dict.get('id')
    log.info('Harvest source %s update: %r', source_id, data_dict)

    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise logic.NotFound('Harvest source %s does not exist' % source_id)

    fields = ('url', 'title', 'description', 'user_id', 'publisher_id',
              'frequency', 'time')
    for field in fields:
        value = data_dict.get(field)
        if value is None:
            continue
        if field == 'url':
            value = value.strip()
            # Kept from the original behaviour: the caller's dict also
            # receives the stripped url
            data_dict[field] = value
        setattr(source, field, value)

    # Avoids clashes with the dataset type
    if 'source_type' in data_dict:
        source.type = data_dict['source_type']

    if 'config' in data_dict:
        source.config = data_dict['config']

    # Don't change state unless explicitly set in the dict
    if 'state' in data_dict:
        source.active = data_dict.get('state') == 'active'

    # Don't commit yet, let the calling package action do it
    source.add()

    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source, status=u'New')
        log.info(
            'Harvest source %s not active, so aborting %i outstanding jobs',
            source_id, jobs.count())
        for job in jobs:
            job.status = u'Aborted'
            job.add()

    return source
def setup(self):
    '''
    Per-test fixture: creates a sysadmin user, a harvest source, a job
    for it and a harvest object, and stores a package context on the
    instance.
    '''
    print("")
    print("TestUM:setup() before each test method")

    # Add sysadmin user
    self.harvestUser = model.User(name=u'harvest', password=u'test', sysadmin=True)
    model.Session.add(self.harvestUser)
    model.Session.commit()

    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'xml/sample.xml',
        'source_type': u'ngds',
    }

    # The same context dict is passed to both create actions below
    user_context = {
        'model': model,
        'session': model.Session,
        'user': u'harvest',
    }

    publisher_profile = config.get('ckan.harvest.auth.profile') == u'publisher'
    if publisher_profile and 'publisher_id' not in source_fixture:
        source_fixture['publisher_id'] = self.publisher.id

    created_source = get_action('harvest_source_create')(user_context, source_fixture)
    self.oHarvestSource = HarvestSource.get(created_source['id'])

    created_job = get_action('harvest_job_create')(
        user_context, {'source_id': self.oHarvestSource.id})
    self.oHarvestJob = HarvestJob.get(created_job['id'])

    # The harvest object is created with authorization checks switched off
    unchecked_context = {
        'model': model,
        'session': model.Session,
        'ignore_auth': True,
    }
    object_fixture = {
        'guid': 'guid',
        'content': self.contentDataset,
        'job_id': self.oHarvestJob.id,
        'extras': {'a key': 'a value'},
    }
    created_object = toolkit.get_action('harvest_object_create')(
        unchecked_context, object_fixture)
    self.oHarvestObject = HarvestObject.get(created_object['id'])

    self.context = {
        'model': model,
        'session': model.Session,
        'user': u'harvest',
        'schema': default_update_package_schema(),
        'api_version': '2',
    }
def after_show(self, context, data_dict):
    '''
    Adds harvest information to a dataset dict and returns it.

    Harvest source datasets get a ``status`` entry. For other datasets
    any existing harvest extras are removed, and they are added again
    only when ``validate`` is False in the context.
    '''
    harvest_extra_keys = (
        'harvest_object_id',
        'harvest_source_id',
        'harvest_source_title',
    )

    is_harvest_source = ('type' in data_dict
                         and data_dict['type'] == DATASET_TYPE_NAME)

    if is_harvest_source:
        # This is a harvest source dataset, add extra info from the
        # HarvestSource object
        source = HarvestSource.get(data_dict['id'])
        if not source:
            log.error('Harvest source not found for dataset {0}'.format(
                data_dict['id']))
            return data_dict

        action_name = 'harvest_source_show_status'
        try:
            show_status = p.toolkit.get_action(action_name)
        except KeyError:
            # Retry the lookup once after clearing the actions cache
            logic.clear_actions_cache()
            show_status = p.toolkit.get_action(action_name)

        data_dict['status'] = show_status(context, {'id': source.id})
        return data_dict

    # This is a normal dataset, check if it was harvested and if so, add
    # info about the HarvestObject and HarvestSource
    current_object = (
        model.Session.query(HarvestObject)
        .filter(HarvestObject.package_id == data_dict['id'])
        .filter(HarvestObject.current == True)
        .first()
    )

    # If the harvest extras are there, remove them. This can happen eg
    # when calling package_update or resource_update, which call
    # package_show. The list is changed in place.
    if data_dict.get('extras'):
        data_dict['extras'][:] = [
            extra for extra in data_dict.get('extras', [])
            if extra['key'] not in harvest_extra_keys
        ]

    # We only want to add these extras at index time so they are part
    # of the cached data_dict used to display, search results etc. We
    # don't want them added when editing the dataset, otherwise we get
    # duplicated key errors.
    # The only way to detect indexing right now is checking that
    # validate is set to False.
    if current_object and not context.get('validate', True):
        new_extras = (
            ('harvest_object_id', current_object.id),
            ('harvest_source_id', current_object.source.id),
            ('harvest_source_title', current_object.source.title),
        )
        for extra_key, extra_value in new_extras:
            _add_extra(data_dict, extra_key, extra_value)

    return data_dict
def run_job_synchronously(self):
    '''
    Runs a complete harvest job for one source in this process.

    The source id is read from ``self.args[1]``. The harvester whose
    ``info()['name']`` equals the source type runs the gather stage, then
    ``fetch_and_import_stages`` is called for every gathered object, and
    finally the source's package is re-indexed.

    The job is saved with status ``Done`` on success. If anything raises
    after the job was created, it is saved with status ``Error`` and the
    exception propagates. Nothing is returned.
    '''
    import datetime
    from ckan import model
    from ckan.plugins import PluginImplementations
    from ckanext.harvest.interfaces import IHarvester
    from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
    from ckanext.harvest.queue import fetch_and_import_stages
    from ckan.lib.search.index import PackageSearchIndex

    package_index = PackageSearchIndex()

    source_id = unicode(self.args[1])
    source = HarvestSource.get(source_id)
    if not source:
        # Without this guard a missing source failed with an
        # AttributeError on source.type below
        print("Harvest source %s does not exist." % source_id)
        return

    # for/else: the else branch runs only when no harvester matched
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == source.type:
            break
    else:
        print("No harvester found to handle the job.")
        return

    job = HarvestJob()
    job.source = source
    job.status = "Running"
    job.gather_started = datetime.datetime.utcnow()
    job.save()

    try:
        harvest_object_ids = harvester.gather_stage(job)
        job.gather_finished = datetime.datetime.utcnow()
        job.save()

        for obj_id in harvest_object_ids:
            obj = HarvestObject.get(obj_id)
            obj.retry_times += 1
            obj.save()
            fetch_and_import_stages(harvester, obj)

        job.finished = datetime.datetime.utcnow()
        job.status = "Done"
        job.save()

        # And reindex the harvest source so it gets its counts right.
        # Must call update on a data_dict as returned by package_show, not
        # the class object.
        package_index.index_package(
            get_action('package_show')({
                'validate': False,
                'ignore_auth': True
            }, {
                'id': source.id
            }))
    finally:
        # Runs on success too, so the finish time is set once more after
        # the re-index
        job.finished = datetime.datetime.utcnow()
        if job.status != "Done":
            job.status = "Error"
        job.save()
def get_source_object(context, data_dict=None):
    '''
    Returns the harvest source to work on.

    If ``context`` has a ``source`` entry, its value is returned as is.
    Otherwise the source is looked up by ``data_dict['id']``.

    :param context: may contain an already loaded ``source``
    :param data_dict: optional dict whose ``id`` identifies the source
    :returns: the harvest source object
    :raises NotFound: if the lookup finds no source
    '''
    if 'source' in context:
        return context['source']

    # None replaces the former mutable default argument
    source_id = (data_dict or {}).get('id', None)
    source = HarvestSource.get(source_id)
    if not source:
        raise NotFound
    return source
def get_source_object(context, data_dict=None):
    '''
    Returns the harvest source the current call refers to.

    A ``source`` entry in ``context`` is returned without any lookup.
    Without one, ``HarvestSource.get`` is called with ``data_dict['id']``.

    :param context: may contain an already loaded ``source``
    :param data_dict: optional dict whose ``id`` identifies the source
    :returns: the harvest source object
    :raises NotFound: if the lookup finds no source
    '''
    if 'source' in context:
        return context['source']

    # A None default avoids sharing one dict object between calls
    lookup = data_dict if data_dict is not None else {}
    source = HarvestSource.get(lookup.get('id', None))
    if not source:
        raise NotFound
    return source
def _update_harvest_source_object(context, data_dict):
    '''
    Updates an actual HarvestSource object with the data dict of the
    harvest_source dataset. All validation and authorization checks
    should be used by now, so this function is not to be used directly to
    update harvest sources.

    The source is only added to the session, not committed. A ``url``
    value in ``data_dict`` is replaced by its stripped form. If the source
    ends up inactive, its jobs with status ``New`` are set to ``Aborted``.

    :param data_dict: A standard package data_dict

    :returns: The updated HarvestSource object
    :rtype: HarvestSource object
    :raises logic.NotFound: if the harvest source does not exist
    '''
    source_id = data_dict.get('id')
    log.info('Harvest source %s update: %r', source_id, data_dict)

    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise logic.NotFound('Harvest source %s does not exist' % source_id)

    fields = ('url', 'title', 'description', 'user_id', 'publisher_id',
              'frequency')
    for field in fields:
        value = data_dict.get(field)
        if value is None:
            continue
        if field == 'url':
            value = value.strip()
            # Kept from the original behaviour: the caller's dict also
            # receives the stripped url
            data_dict[field] = value
        setattr(source, field, value)

    # Avoids clashes with the dataset type
    if 'source_type' in data_dict:
        source.type = data_dict['source_type']

    if 'config' in data_dict:
        source.config = data_dict['config']

    # Don't change state unless explicitly set in the dict
    if 'state' in data_dict:
        source.active = data_dict.get('state') == 'active'

    # Don't commit yet, let the calling package action do it
    source.add()

    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source, status=u'New')
        log.info('Harvest source %s not active, so aborting %i outstanding jobs',
                 source_id, jobs.count())
        for job in jobs:
            job.status = u'Aborted'
            job.add()

    return source
def _create_source(self, source_fixture=FISBROKER_HARVESTER_CONFIG):
    '''
    Create a harvest source through the ``harvest_source_create`` action
    and return the HarvestSource object looked up by the returned id.

    :param source_fixture: data dict handed to the create action
    :returns: the HarvestSource found for the created id
    '''
    create_source = get_action('harvest_source_create')
    created = create_source(
        {'model': model, 'session': Session, 'user': u'harvest'},
        source_fixture)

    source = HarvestSource.get(created['id'])
    assert source

    return source
def test_form_bound_to_existing_object(self):
    '''
    A fieldset bound to a committed HarvestSource renders the field names
    and the stored values of its url and description.
    '''
    source = HarvestSource(url=u'http://localhost/',
                           description=u'My source',
                           type=u'Gemini')
    model.Session.add(source)
    model.Session.commit()
    model.Session.remove()

    bound_fieldset = form.get_harvest_source_fieldset().bind(source)
    rendered = bound_fieldset.render()

    for expected in ('url', 'http://localhost/',
                     'description', 'My source'):
        assert expected in rendered
def harvest_source_update(context, data_dict):
    '''
    Update an existing harvest source from ``data_dict``.

    The dict is validated against ``context['schema']`` or, if that is not
    set, the default harvest source schema. When the source ends up
    inactive, its jobs with status 'New' are marked 'Aborted'.

    :param context: dict providing at least 'model' and 'session'
    :param data_dict: new values; 'id' selects the source to update
    :returns: the dictized harvest source
    :raises NotFound: if no source exists for the given id
    :raises ValidationError: if validation reports errors
    '''
    check_access('harvest_source_update', context, data_dict)

    model = context['model']
    session = context['session']

    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)
    if errors:
        session.rollback()
        raise ValidationError(errors, _error_summary(errors))

    for name in ('url', 'title', 'type', 'description', 'user_id',
                 'publisher_id'):
        if name not in data or data[name] is None:
            continue
        if name == 'url':
            data[name] = data[name].strip()
        setattr(source, name, data[name])

    # Presence is checked on the raw input, the value comes from the
    # validated data
    if 'active' in data_dict:
        source.active = data['active']
    if 'config' in data_dict:
        source.config = data['config']

    source.save()

    # Abort any pending jobs
    if not source.active:
        pending_jobs = HarvestJob.filter(source=source, status=u'New')
        log.info(
            'Harvest source %s not active, so aborting %i outstanding jobs',
            source_id, pending_jobs.count())
        if pending_jobs:
            for pending in pending_jobs:
                pending.status = u'Aborted'
                pending.save()

    # Ensure sqlalchemy writes to the db immediately, since the gather/fetch
    # runs in a different process and needs the latest source info. Not sure
    # if this works, but try it.
    model.repo.commit_and_remove()

    return harvest_source_dictize(source, context)
def harvest_source_show(context, data_dict):
    '''
    Return the dictized harvest source selected by ``data_dict['id']``.

    An optional ``data_dict['attr']`` is passed on to ``HarvestSource.get``
    as its ``attr`` argument.

    :returns: the result of ``harvest_source_dictize`` for the source
    :raises NotFound: if ``HarvestSource.get`` returns nothing
    '''
    p.toolkit.check_access('harvest_source_show', context, data_dict)

    source_id = data_dict.get('id')
    lookup_attr = data_dict.get('attr', None)

    source = HarvestSource.get(source_id, attr=lookup_attr)
    if source:
        return harvest_source_dictize(source, context)

    raise NotFound
def harvest_source_show(context, data_dict):
    '''
    Look up a harvest source and return it in dictized form.

    The source is fetched with ``HarvestSource.get`` using the 'id' from
    ``data_dict``; the optional 'attr' entry is forwarded as ``attr``.

    :returns: the dictized harvest source
    :raises NotFound: if no source is found
    '''
    check_access('harvest_source_show', context, data_dict)

    source = HarvestSource.get(data_dict.get('id'),
                               attr=data_dict.get('attr', None))
    if not source:
        raise NotFound

    dictized = harvest_source_dictize(source, context)
    return dictized
def harvest_sources(self):
    '''
    Save two harvest sources, one of type 'DDI' and one of type 'OAI-PMH'.
    '''
    sources = (
        ('http://www.fsd.uta.fi/fi/aineistot/luettelo/'
         'fsd-ddi-records-uris-fi.txt', 'DDI'),
        ('http://helda.helsinki.fi/oai/request', 'OAI-PMH'),
    )
    for source_url, source_type in sources:
        harvest_source = HarvestSource(url=source_url, type=source_type)
        harvest_source.save()
def after_show(self, context, data_dict):
    '''
    Enrich a dataset dict with harvest related information.

    For harvest source datasets the result of the
    ``harvest_source_show_status`` action is stored under 'status'. For any
    other dataset the harvest extras are stripped and, when
    ``context['validate']`` is False and a current HarvestObject exists for
    the package, added again from that object.

    :returns: the (modified in place) ``data_dict``
    '''
    is_harvest_source = ('type' in data_dict
                         and data_dict['type'] == DATASET_TYPE_NAME)

    if is_harvest_source:
        # This is a harvest source dataset, add extra info from the
        # HarvestSource object
        source = HarvestSource.get(data_dict['id'])
        if not source:
            log.error('Harvest source not found for dataset {0}'.format(
                data_dict['id']))
            return data_dict

        action_name = 'harvest_source_show_status'
        try:
            show_status = p.toolkit.get_action(action_name)
        except KeyError:
            # Retry once after clearing the actions cache
            logic.clear_actions_cache()
            show_status = p.toolkit.get_action(action_name)

        data_dict['status'] = show_status(context, {'id': source.id})
        return data_dict

    # This is a normal dataset, check if it was harvested and if so, add
    # info about the HarvestObject and HarvestSource
    current_objects = model.Session.query(HarvestObject) \
        .filter(HarvestObject.package_id == data_dict['id']) \
        .filter(HarvestObject.current == True)  # noqa: SQLAlchemy expression
    harvest_object = current_objects.first()

    # If the harvest extras are there, remove them. This can happen eg
    # when calling package_update or resource_update, which call
    # package_show
    if data_dict.get('extras'):
        harvest_keys = ('harvest_object_id',
                        'harvest_source_id',
                        'harvest_source_title',)
        data_dict['extras'][:] = [
            extra for extra in data_dict.get('extras', [])
            if not extra['key'] in harvest_keys
        ]

    # We only want to add these extras at index time so they are part
    # of the cached data_dict used to display, search results etc. We
    # don't want them added when editing the dataset, otherwise we get
    # duplicated key errors.
    # The only way to detect indexing right now is checking that
    # validate is set to False.
    if harvest_object and not context.get('validate', True):
        harvest_extras = [
            ('harvest_object_id', harvest_object.id),
            ('harvest_source_id', harvest_object.source.id),
            ('harvest_source_title', harvest_object.source.title),
        ]
        for extra_key, extra_value in harvest_extras:
            _add_extra(data_dict, extra_key, extra_value)

    return data_dict
def _create_source_and_job(self, source_fixture):
    '''
    Create a harvest source from ``source_fixture`` and a job for it.

    With the "publisher" auth profile configured, a missing "publisher_id"
    is filled in (in place) from ``self.publisher.id``.

    :returns: tuple of (HarvestSource, result of ``self._create_job``)
    '''
    uses_publisher_profile = (
        config.get("ckan.harvest.auth.profile") == u"publisher")
    if uses_publisher_profile and "publisher_id" not in source_fixture:
        source_fixture["publisher_id"] = self.publisher.id

    context = {"model": model, "session": Session, "user": u"harvest"}
    create_source = get_action("harvest_source_create")
    source_dict = create_source(context, source_fixture)

    source = HarvestSource.get(source_dict["id"])
    assert source

    return source, self._create_job(source.id)
def _create_harvest_source_object(context, data_dict):
    '''
    Build a HarvestSource object from the data dict of a harvest_source
    dataset.

    Validation and authorization are expected to have been done by the
    caller, so this helper should not be used directly to create harvest
    sources. The new source gets the id found in ``data_dict`` and is only
    added, not committed.

    :param data_dict: A standard package data_dict
    :returns: The created HarvestSource object
    :rtype: HarvestSource object
    '''
    log.info('Creating harvest source: %r', data_dict)

    source = HarvestSource()
    source.id = data_dict['id']
    source.url = data_dict['url'].strip()
    # Avoids clashes with the dataset type
    source.type = data_dict['source_type']

    optional_fields = ('active', 'title', 'description', 'user_id',
                       'publisher_id', 'config', 'frequency')
    for name in optional_fields:
        if name not in data_dict:
            continue
        value = data_dict[name]
        if value is not None:
            setattr(source, name, value)

    # Assigned after the loop, so this wins over any 'active' copied above
    source.active = data_dict.get('state', None) != 'deleted'

    # Don't commit yet, let package_create do it
    source.add()
    log.info('Harvest source created: %s', source.id)

    return source
def harvest_source_update(context, data_dict):
    '''
    Apply ``data_dict`` to an existing harvest source and save it.

    Validation uses ``context['schema']`` when present, otherwise the
    default harvest source schema. If the source is inactive after the
    update, its jobs with status 'New' are set to 'Aborted'.

    :param context: dict providing at least 'model' and 'session'
    :param data_dict: new values; 'id' selects the source to update
    :returns: the dictized harvest source
    :raises NotFound: if the source does not exist
    :raises ValidationError: if validation reports errors
    '''
    check_access('harvest_source_update', context, data_dict)

    model = context['model']
    session = context['session']

    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)
    if errors:
        session.rollback()
        raise ValidationError(errors, _error_summary(errors))

    updatable = ['url', 'title', 'type', 'description', 'user_id',
                 'publisher_id']
    for key in updatable:
        if key in data and data[key] is not None:
            if key == 'url':
                # Stored without surrounding whitespace
                data[key] = data[key].strip()
            setattr(source, key, data[key])

    if 'active' in data_dict:
        source.active = data['active']

    if 'config' in data_dict:
        source.config = data['config']

    source.save()

    # Abort any pending jobs
    if not source.active:
        new_jobs = HarvestJob.filter(source=source, status=u'New')
        log.info('Harvest source %s not active, so aborting %i outstanding jobs',
                 source_id, new_jobs.count())
        if new_jobs:
            for new_job in new_jobs:
                new_job.status = u'Aborted'
                new_job.save()

    # Ensure sqlalchemy writes to the db immediately, since the gather/fetch
    # runs in a different process and needs the latest source info. Not sure
    # if this works, but try it.
    model.repo.commit_and_remove()

    return harvest_source_dictize(source, context)
def harvest_objects_import(context, data_dict):
    '''
    Re-run the import stage for the current harvest objects.

    Only objects flagged as current whose package is active are considered,
    optionally restricted to the source given in ``data_dict['source_id']``.
    Nothing is fetched from the remote server: only objects already present
    in the database are imported again.

    :returns: list with the dictized harvest objects that were processed
    :raises NotFound: if the given source does not exist
    :raises Exception: if the given source is not active
    '''
    log.info('Harvest objects import: %r', data_dict)
    check_access('harvest_objects_import', context, data_dict)

    model = context['model']
    session = context['session']
    source_id = data_dict.get('source_id', None)

    if source_id:
        source = HarvestSource.get(source_id)
        if not source:
            log.error('Harvest source %s does not exist', source_id)
            raise NotFound('Harvest source %s does not exist' % source_id)

        if not source.active:
            log.warn('Harvest source %s is not active.', source_id)
            raise Exception('This harvest source is not active')

        id_query = session.query(HarvestObject.id) \
            .join(HarvestSource).join(Package) \
            .filter(HarvestObject.source == source)
    else:
        id_query = session.query(HarvestObject.id).join(Package)

    # ``== True`` builds a SQLAlchemy expression and must stay as it is
    last_objects_ids = id_query \
        .filter(HarvestObject.current == True) \
        .filter(Package.state == u'active') \
        .all()

    last_objects = []
    for obj_id in last_objects_ids:
        obj = session.query(HarvestObject).get(obj_id)

        # Only the first harvester whose name matches the source type runs
        for harvester in PluginImplementations(IHarvester):
            if harvester.info()['name'] != obj.source.type:
                continue
            if hasattr(harvester, 'force_import'):
                harvester.force_import = True
            harvester.import_stage(obj)
            break

        last_objects.append(harvest_object_dictize(obj, context))

    log.info('Harvest objects imported: %r', last_objects)

    return last_objects
def setup_class(cls):
    '''
    Create the test data: a harvest source, a job and a harvest object for
    the 'annakarenina' package, and store the object id in the package
    extras.
    '''
    package_name = u'annakarenina'

    # Create package and its harvest object
    CreateTestData.create()
    harvest_setup()

    source = HarvestSource(url=u'http://test-source.org', type='test')
    source.save()

    job = HarvestJob(source=source)
    job.save()

    harvest_object = HarvestObject(
        package=model.Package.by_name(package_name),
        job=job,
        guid=u'test-guid',
        content=u'<xml>test content</xml>')
    harvest_object.save()

    # Save a reference to the harvest object in the package
    revision = model.repo.new_revision()  # return value is not used here
    package = model.Package.by_name(package_name)
    package.extras['harvest_object_id'] = harvest_object.id
    package.save()

    model.repo.commit_and_remove()
def _create_source_and_job(self, source_fixture):
    '''
    Create a harvest source from ``source_fixture`` and a job for it.

    A missing 'publisher_id' is filled in (in place) from
    ``self.publisher['id']``.

    :returns: tuple of (HarvestSource, result of ``self._create_job``)
    '''
    if 'publisher_id' not in source_fixture:
        source_fixture['publisher_id'] = self.publisher['id']

    create_source = get_action('harvest_source_create')
    source_dict = create_source(
        {'model': model, 'session': Session, 'user': u'harvest'},
        source_fixture)

    source = HarvestSource.get(source_dict['id'])
    assert source

    job = self._create_job(source.id)
    return source, job
def setup_class(cls):
    '''
    Set up a harvest source, job and harvest object linked to the
    'annakarenina' package and record the harvest object id in the
    package extras.
    '''
    # Create package and its harvest object
    CreateTestData.create()
    harvest_setup()

    test_source = HarvestSource(url=u'http://test-source.org', type='test')
    test_source.save()

    test_job = HarvestJob(source=test_source)
    test_job.save()

    object_fields = {
        'package': model.Package.by_name(u'annakarenina'),
        'job': test_job,
        'guid': u'test-guid',
        'content': u'<xml>test content</xml>',
    }
    ho = HarvestObject(**object_fields)
    ho.save()

    # Save a reference to the harvest object in the package
    rev = model.repo.new_revision()  # result is not used further
    pkg = model.Package.by_name(u'annakarenina')
    pkg.extras['harvest_object_id'] = ho.id
    pkg.save()

    model.repo.commit_and_remove()
def run_job_synchronously(self):
    '''
    Run a complete harvest job for one source in the current process.

    The source id is read from ``self.args[1]``. The first harvester whose
    ``info()['name']`` equals the source type runs the gather stage, then
    ``fetch_and_import_stages`` is called for every gathered object and
    finally the source dataset is reindexed. The job ends with status
    "Done", or "Error" if it did not get that far.
    '''
    import datetime
    from ckan import model
    from ckan.plugins import PluginImplementations
    from ckanext.harvest.interfaces import IHarvester
    from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
    from ckanext.harvest.queue import fetch_and_import_stages
    from ckan.lib.search.index import PackageSearchIndex

    package_index = PackageSearchIndex()

    source_id = unicode(self.args[1])
    source = HarvestSource.get(source_id)

    harvester = None
    for candidate in PluginImplementations(IHarvester):
        if candidate.info()['name'] == source.type:
            harvester = candidate
            break
    if harvester is None:
        print("No harvester found to handle the job.")
        return

    job = HarvestJob()
    job.source = source
    job.status = "Running"
    job.gather_started = datetime.datetime.utcnow()
    job.save()

    try:
        harvest_object_ids = harvester.gather_stage(job)
        job.gather_finished = datetime.datetime.utcnow()
        job.save()

        for obj_id in harvest_object_ids:
            harvest_object = HarvestObject.get(obj_id)
            harvest_object.retry_times += 1
            harvest_object.save()
            fetch_and_import_stages(harvester, harvest_object)

        job.finished = datetime.datetime.utcnow()
        job.status = "Done"
        job.save()

        # And reindex the harvest source so it gets its counts right.
        # Must call update on a data_dict as returned by package_show, not
        # the class object.
        show_context = {'validate': False, 'ignore_auth': True}
        source_dict = get_action('package_show')(show_context,
                                                 {'id': source.id})
        package_index.index_package(source_dict)
    finally:
        # Runs on success and on failure; anything not "Done" is an error
        job.finished = datetime.datetime.utcnow()
        if job.status != "Done":
            job.status = "Error"
        job.save()
def harvest_job_create(context, data_dict):
    '''
    Create a Harvest Job for a Harvest Source and, unless told otherwise,
    run it by calling the ``harvest_send_job_to_gather_queue`` action.

    :param source_id: id of the harvest source to create a job for
    :type source_id: string
    :param run: whether to also run it or not (default: True)
    :type run: bool

    :returns: the dictized harvest job
    :raises toolkit.ObjectNotFound: if the source does not exist
    :raises HarvestSourceInactiveError: if the source is not active
    :raises HarvestJobExists: if ``_check_for_existing_jobs`` reports a job
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    run_it = data_dict.get('run', True)

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise toolkit.ObjectNotFound(
            'Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise HarvestSourceInactiveError(
            'Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    existing_job = _check_for_existing_jobs(context, source_id)
    if existing_job:
        log.warn('There is already an unrun job %r for this source %s',
                 existing_job, source_id)
        raise HarvestJobExists(
            'There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    if run_it:
        send_to_gather_queue = toolkit.get_action(
            'harvest_send_job_to_gather_queue')
        send_to_gather_queue(context, {'id': job.id})

    return harvest_job_dictize(job, context)
def _create_source_and_job(self, source_fixture):
    '''
    Create a harvest source and a job for it.

    ``source_fixture`` gets a 'publisher_id' taken from
    ``self.publisher['id']`` when it does not define one itself.

    :returns: tuple of (HarvestSource, result of ``self._create_job``)
    '''
    context = {
        'model': model,
        'session': Session,
        'user': u'harvest',
    }

    publisher_missing = 'publisher_id' not in source_fixture
    if publisher_missing:
        source_fixture['publisher_id'] = self.publisher['id']

    source_dict = get_action('harvest_source_create')(context,
                                                      source_fixture)
    source = HarvestSource.get(source_dict['id'])
    assert source

    return source, self._create_job(source.id)
def harvest_object_list(context, data_dict):
    '''
    Auth check for listing harvest objects, based on publisher membership.

    Sysadmins are always authorized. Any other user must be logged in,
    belong to at least one 'publisher' group, name a ``source_id`` in
    ``data_dict`` and belong to the group that matches the
    ``publisher_id`` of that source.

    :param context: dict with the name of the user under 'user'
    :param data_dict: dict with the 'source_id' the objects belong to
    :returns: dict with 'success' and, when denied, a translated 'msg'
    :raises NotFound: if the requested source does not exist
    '''
    user = context.get('user')

    # Check user is logged in
    if not user:
        return {
            'success': False,
            'msg': _('Only logged users are authorized to see their sources')
        }

    user_obj = User.get(user)

    if Authorizer().is_sysadmin(user):
        return {'success': True}

    # Checks for non sysadmin users
    publisher_groups = user_obj.get_groups(u'publisher') if user_obj else []
    if len(publisher_groups) == 0:
        return {
            'success': False,
            'msg': _('User %s must belong to a publisher to list harvest objects') % str(user)
        }

    source_id = data_dict.get('source_id', False)
    if not source_id:
        # This message has no placeholder: applying ``% str(user)`` to it
        # raises TypeError instead of returning the denial.
        return {
            'success': False,
            'msg': _('Only sysadmins can list all harvest objects')
        }

    source = HarvestSource.get(source_id)
    if not source:
        raise NotFound

    if source.publisher_id not in [group.id for group in publisher_groups]:
        return {
            'success': False,
            'msg': _('User %s not authorized to list objects from source %s') % (str(user), source.id)
        }

    return {'success': True}
def _create_source_and_job(self, source_fixture):
    '''
    Create a harvest source from ``source_fixture`` plus a job for it.

    When the 'publisher' auth profile is configured and the fixture has no
    'publisher_id', it is set (in place) to ``self.publisher.id``.

    :returns: tuple of (HarvestSource, result of ``self._create_job``)
    '''
    context = {'model': model, 'session': Session, 'user': u'harvest'}

    profile = config.get('ckan.harvest.auth.profile')
    if profile == u'publisher' and 'publisher_id' not in source_fixture:
        source_fixture['publisher_id'] = self.publisher.id

    create_action = get_action('harvest_source_create')
    source_dict = create_action(context, source_fixture)

    source = HarvestSource.get(source_dict['id'])
    assert source

    job = self._create_job(source.id)
    return source, job
def harvest_job_create(context, data_dict):
    '''
    Creates a Harvest Job for a Harvest Source and runs it (by calling the
    ``harvest_send_job_to_gather_queue`` action)

    :param source_id: id of the harvest source to create a job for
    :type source_id: string
    :param run: whether to also run it or not (default: True)
    :type run: bool

    :returns: the dictized harvest job
    :raises toolkit.ObjectNotFound: if the source does not exist
    :raises HarvestSourceInactiveError: if the source is not active
    :raises HarvestJobExists: if ``_check_for_existing_jobs`` reports a job
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    run_it = data_dict.get('run', True)

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warning('Harvest source %s does not exist', source_id)
        # NOTE(review): assumes ``toolkit`` is ckan.plugins.toolkit, which
        # exposes the not-found exception as ObjectNotFound
        raise toolkit.ObjectNotFound(
            'Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        raise HarvestSourceInactiveError(
            'Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    if run_it:
        toolkit.get_action('harvest_send_job_to_gather_queue')(
            context, {'id': job.id})

    return harvest_job_dictize(job, context)
def _create_harvest_source_object(context, data_dict):
    '''
    Create the HarvestSource object that belongs to a harvest_source
    dataset.

    All validation and authorization checks are expected to have run
    already, so this function is not meant to be used directly to create
    harvest sources. The source reuses the id found in ``data_dict`` and is
    added without committing.

    :param data_dict: A standard package data_dict
    :returns: The created HarvestSource object
    :rtype: HarvestSource object
    '''
    log.info('Creating harvest source: %r', data_dict)

    new_source = HarvestSource()
    new_source.id = data_dict['id']
    new_source.url = data_dict['url'].strip()

    # Avoids clashes with the dataset type
    new_source.type = data_dict['source_type']

    # Optional values are only copied when present and not None
    for key in ('active', 'title', 'description', 'user_id',
                'publisher_id', 'config', 'frequency'):
        if key in data_dict and data_dict[key] is not None:
            setattr(new_source, key, data_dict[key])

    # The state of the dataset decides, whatever 'active' held above
    is_deleted = data_dict.get('state', None) == 'deleted'
    new_source.active = not is_deleted

    # Don't commit yet, let package_create do it
    new_source.add()
    log.info('Harvest source created: %s', new_source.id)

    return new_source
def harvest_source_update(context, data_dict):
    '''
    Update an existing harvest source with the values of ``data_dict``.

    The dict is validated against ``context['schema']`` or, if that is not
    set, the default harvest source schema. When the source ends up
    inactive, its jobs with status 'New' are marked 'Aborted'.

    :param context: dict providing at least 'session'
    :param data_dict: new values; 'id' selects the source to update
    :returns: the dictized harvest source
    :raises NotFound: if no source exists for the given id
    :raises ValidationError: if validation reports errors
    '''
    check_access('harvest_source_update', context, data_dict)

    session = context['session']

    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)
    if errors:
        session.rollback()
        raise ValidationError(errors, _error_summary(errors))

    fields = ['url', 'title', 'type', 'description', 'user_id',
              'publisher_id']
    for f in fields:
        if f in data and data[f] is not None:
            if f == 'url':
                # Store the URL without surrounding whitespace, as is done
                # when a source is created
                data[f] = data[f].strip()
            setattr(source, f, data[f])

    # Presence is checked on the raw input, the value comes from the
    # validated data
    if 'active' in data_dict:
        source.active = data['active']

    if 'config' in data_dict:
        source.config = data['config']

    source.save()

    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source, status=u'New')
        log.info('Harvest source %s not active, so aborting %i outstanding jobs',
                 source_id, jobs.count())
        if jobs:
            for job in jobs:
                job.status = u'Aborted'
                job.save()

    return harvest_source_dictize(source, context)
def _create_source_and_job(self):
    '''
    Create a harvest source of type 'csw' and a job for it.

    With the 'publisher' auth profile configured, the fixture also gets a
    'publisher_id' taken from ``self.publisher.id``.

    :returns: tuple of (HarvestSource, result of ``self._create_job``)
    '''
    source_fixture = {
        'url': u'http://csw/GetCapabilities',
        'type': u'csw',
    }

    publisher_profile = (
        config.get('ckan.harvest.auth.profile') == u'publisher')
    if publisher_profile and 'publisher_id' not in source_fixture:
        source_fixture['publisher_id'] = self.publisher.id

    context = {'model': model, 'session': model.Session, 'user': u'harvest'}
    create_source = get_action('harvest_source_create')
    source_dict = create_source(context, source_fixture)

    source = HarvestSource.get(source_dict['id'])
    assert source

    return source, self._create_job(source.id)
def _create_source_and_job(self):
    '''
    Build a 'csw' harvest source through the ``harvest_source_create``
    action and create a job for it.

    ``self.publisher.id`` is added as 'publisher_id' when the 'publisher'
    auth profile is configured.

    :returns: tuple of (HarvestSource, result of ``self._create_job``)
    '''
    context = {
        'model': model,
        'session': model.Session,
        'user': u'harvest',
    }
    source_fixture = {'url': u'http://csw/GetCapabilities', 'type': u'csw'}

    if config.get('ckan.harvest.auth.profile') == u'publisher':
        if 'publisher_id' not in source_fixture:
            source_fixture['publisher_id'] = self.publisher.id

    source_dict = get_action('harvest_source_create')(context,
                                                      source_fixture)
    source = HarvestSource.get(source_dict['id'])
    assert source

    job = self._create_job(source.id)
    return source, job
def get_harvest_source_config(harvester_id):
    '''
    Return the JSON config of a harvest source as a dict, best effort.

    Any failure while loading or parsing the config (and a config that is
    not a JSON object) yields an empty dict. For the keys
    'default_groups', 'private_datasets' and 'validator_profiles' a
    non-empty list value is replaced by its first element.

    :param harvester_id: value passed to ``HarvestSource.get``
    :returns: the config dict, ``{}`` when none could be loaded
    :rtype: dict
    '''
    keys_lookfor = (
        'default_groups',
        'private_datasets',
        'validator_profiles',
    )

    try:
        harvest_source = HarvestSource.get(harvester_id)
        source_config = json.loads(harvest_source.config)
    except Exception:
        # Deliberately best effort, but unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit through.
        return {}

    if not isinstance(source_config, dict):
        # e.g. ``null`` or a JSON list: nothing usable as a config
        return {}

    # convert single string element list to string
    for key in keys_lookfor:
        value = source_config.get(key, '')
        # Empty lists are left alone, they have no first element
        if isinstance(value, list) and value:
            source_config[key] = value[0]

    return source_config
def get_harvest_source_config(harvester_id):
    '''
    Load the JSON config of a harvest source, falling back to ``{}``.

    The source is looked up with ``HarvestSource.get`` and its ``config``
    is parsed with ``json.loads``. If that fails, or the parsed value is
    not a dict, an empty dict is returned. List values stored under
    'default_groups', 'private_datasets' or 'validator_profiles' are
    reduced to their first element when they are not empty.

    :param harvester_id: value passed to ``HarvestSource.get``
    :returns: the config dict, ``{}`` when none could be loaded
    :rtype: dict
    '''
    source_config = {}
    try:
        harvest_source = HarvestSource.get(harvester_id)
        source_config = json.loads(harvest_source.config)
    except Exception:
        # Best effort on purpose; ``Exception`` (not a bare ``except``)
        # keeps KeyboardInterrupt and SystemExit propagating.
        source_config = {}

    if not isinstance(source_config, dict):
        # A JSON value that is not an object cannot act as a config
        source_config = {}

    # convert single string element list to string
    for key in ('default_groups', 'private_datasets', 'validator_profiles'):
        value = source_config.get(key, '')
        # Guard against empty lists, which have no element to pick
        if isinstance(value, list) and len(value) > 0:
            source_config[key] = value[0]

    return source_config