def harvest_job_create(context,data_dict):
    '''Create a HarvestJob for a harvest source and return it dictized.

    :param data_dict: must contain 'source_id' — the id of the harvest source
    :raises NotFound: if the source does not exist
    :raises Exception: if the source is inactive
        (NOTE(review): a bare ``Exception`` — other variants of this action
        raise a domain-specific error class; confirm what callers catch
        before changing it)
    :raises HarvestJobExists: if an unrun or running job already exists
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create',context,data_dict)
    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise Exception('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warn('There is already an unrun job %r for this source %s',
                 exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job,context)
def test_gather(self):
    """Gather stage runs against a saved CMDI source/job using a fake client."""
    cmdi_source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
    cmdi_source.save()
    cmdi_job = HarvestJob(source=cmdi_source)
    cmdi_job.save()
    # Replace the real client so the test never touches the network.
    self.harvester.client = _FakeClient()
    self.harvester.gather_stage(cmdi_job)
def harvest_job_create(context,data_dict):
    '''Create a HarvestJob for a harvest source and return it dictized.

    :param data_dict: must contain 'source_id' — the id of the harvest source
    :raises NotFound: if the source does not exist
    :raises HarvestError: if the source is inactive or an unrun job exists
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create',context,data_dict)
    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source.
    # Use a fresh dict rather than clobbering the caller's ``data_dict``.
    list_data_dict = {
        'source_id': source_id,
        'status': u'New'
    }
    exists = harvest_job_list(context, list_data_dict)
    # Truthiness test instead of ``len(...)`` (PEP 8).
    if exists:
        log.warn('There is already an unrun job %r for this source %s',
                 exists, source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job,context)
def harvest_job_create(context, data_dict):
    """Create and persist a harvest job for an existing, active source.

    Rejects unknown sources (NotFound), inactive sources (Exception) and
    sources that already have an unrun/running job (HarvestJobExists).
    """
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    harvest_source = HarvestSource.get(source_id)

    # No such source: nothing to attach a job to.
    if not harvest_source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Jobs may only be created on active sources.
    if not harvest_source.active:
        log.warn('Harvest job cannot be created for inactive source %s', source_id)
        raise Exception('Can not create jobs on inactive sources')

    # One unrun/running job per source at a time.
    pending = _check_for_existing_jobs(context, source_id)
    if pending:
        log.warn('There is already an unrun job %r for this source %s', pending, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    new_job = HarvestJob()
    new_job.source = harvest_source
    new_job.save()
    log.info('Harvest job saved %s', new_job.id)
    return harvest_job_dictize(new_job, context)
def setup_class(cls):
    """Create two harvest objects — the second with an 'original_document'
    extra — and store their ids on the class for the tests to use."""
    try:
        from ckanext.harvest.model import HarvestObject, HarvestJob, HarvestSource, HarvestObjectExtra
    except ImportError:
        # These tests only make sense with ckanext-harvest installed.
        raise SkipTest('The harvester extension is needed for these tests')

    cls.content1 = '<xml>Content 1</xml>'
    ho1 = HarvestObject(
        guid='test-ho-1',
        job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
        content=cls.content1)

    cls.content2 = '<xml>Content 2</xml>'
    cls.original_content2 = '<xml>Original Content 2</xml>'
    ho2 = HarvestObject(
        guid='test-ho-2',
        job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
        content=cls.content2)

    # Extra record carrying the pre-transformation document for ho2.
    hoe = HarvestObjectExtra(
        key='original_document',
        value=cls.original_content2,
        object=ho2)

    Session.add(ho1)
    Session.add(ho2)
    Session.add(hoe)
    Session.commit()

    # Remember ids so tests can fetch the objects back.
    cls.object_id_1 = ho1.id
    cls.object_id_2 = ho2.id
def harvest_job_create(context, data_dict):
    '''Create a HarvestJob for a harvest source and return it dictized.

    :param data_dict: must contain 'source_id' — the id of the harvest source
    :raises NotFound: if the source does not exist
    :raises HarvestError: if the source is inactive or an unrun job exists
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)
    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source.
    # Build a new dict instead of overwriting the ``data_dict`` parameter.
    list_data_dict = {'source_id': source_id, 'status': u'New'}
    exists = harvest_job_list(context, list_data_dict)
    # Truthiness test instead of ``len(...)`` (PEP 8).
    if exists:
        log.warn('There is already an unrun job %r for this source %s',
                 exists, source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job, context)
def test_zaincremental_harvester(self):
    """Incremental OAI-PMH harvest picks up a package added after the job's
    (future-dated) gather_started timestamp."""
    # Build an in-process OAI-PMH server and patch the client class so the
    # harvester talks to it instead of the network.
    client = CKANServer()
    metadata_registry = metadata.MetadataRegistry()
    metadata_registry.registerReader('oai_dc', oai_dc_reader)
    metadata_registry.registerWriter('oai_dc', oai_dc_writer)
    serv = BatchingServer(client, metadata_registry=metadata_registry)
    oaipmh.client.Client = mock.Mock(return_value=ServerClient(serv, metadata_registry))

    harv = OAIPMHHarvester()
    harvest_job = HarvestJob()
    harvest_job.source = HarvestSource()
    harvest_job.source.title = "Test"
    harvest_job.source.url = "http://helda.helsinki.fi/oai/request"
    # Future timestamp so the package revision below (days=2) is "newer".
    harvest_job.gather_started = ((datetime.now() + timedelta(days=1)))
    harvest_job.source.config = '{"incremental":"True"}'
    harvest_job.source.type = "OAI-PMH"
    Session.add(harvest_job)

    # Create a package in a revision dated after gather_started.
    rev = model.repo.new_revision()
    rev.timestamp = ((datetime.now() + timedelta(days=2)))
    pkg = Package(name='footest', revision=rev)
    Session.add(pkg)
    pkg.save()
    roger = Group.get('roger')
    roger.add_package_by_name('footest')
    Session.add(roger)
    roger.save()

    gathered = harv.gather_stage(harvest_job)
    harvest_object = HarvestObject.get(gathered[0])
    harv.fetch_stage(harvest_object)
    harvobj = json.loads(harvest_object.content)
    self.assert_(harvobj['records'])
def harvest_job_abort(context, data_dict):
    '''
    Aborts a harvest job. Given a harvest source_id, it looks for the latest
    one and (assuming it is not already Finished) marks it as Finished. It
    also checks any of that source's harvest objects and (if not complete or
    error) marks them "ERROR", so any left in limbo are cleaned up. Does not
    actually stop running any queued harvest fetches/objects.

    :param source_id: the name or id of the harvest source with a job to abort
    :type source_id: string
    '''
    check_access('harvest_job_abort', context, data_dict)

    model = context['model']

    source_id = data_dict.get('source_id')
    source = harvest_source_show(context, {'id': source_id})

    # HarvestJob set status to 'Finished'
    # Do not use harvest_job_list since it can use a lot of memory
    last_job = model.Session.query(HarvestJob) \
        .filter_by(source_id=source['id']) \
        .order_by(HarvestJob.created.desc()).first()
    if not last_job:
        raise NotFound('Error: source has no jobs')
    job = get_action('harvest_job_show')(context, {'id': last_job.id})

    if job['status'] != 'Finished':
        # i.e. New or Running
        job_obj = HarvestJob.get(job['id'])
        job_obj.status = new_status = 'Finished'
        model.repo.commit_and_remove()
        log.info('Harvest job changed status from "%s" to "%s"',
                 job['status'], new_status)
    else:
        log.info('Harvest job unchanged. Source %s status is: "%s"',
                 job['id'], job['status'])

    # HarvestObjects set to ERROR
    job_obj = HarvestJob.get(job['id'])
    objs = job_obj.objects
    for obj in objs:
        if obj.state not in ('COMPLETE', 'ERROR'):
            old_state = obj.state
            obj.state = 'ERROR'
            log.info('Harvest object changed state from "%s" to "%s": %s',
                     old_state, obj.state, obj.id)
        else:
            log.info('Harvest object not changed from "%s": %s',
                     obj.state, obj.id)
    model.repo.commit_and_remove()

    # Re-fetch after commit_and_remove detached the previous instance.
    job_obj = HarvestJob.get(job['id'])
    return harvest_job_dictize(job_obj, context)
def test_0harvester_url_error(self):
    """Gather stage returns None when the source URL cannot be fetched."""
    self.harv = MetadataHarvester()
    self.harv.config = "{}"
    harvest_job = HarvestJob()
    harvest_job.source = HarvestSource()
    harvest_job.source.title = "Test"
    harvest_job.source.url = "http://foo"
    harvest_job.source.type = "Metadata"
    # Restore the real urlopen so the bogus URL actually fails.
    urllib2.urlopen = realopen
    # PEP 8: compare with None using ``is``, not ``==``.
    self.assert_(self.harv.gather_stage(harvest_job) is None)
def test_harvester_1gather_ddi(self):
    """Gathering DDI content yields one harvest object and selects a DDIHarvester."""
    self.harv = MetadataHarvester()
    self.harv.config = "{}"
    job = HarvestJob()
    job.source = HarvestSource()
    job.source.title = "Test"
    job.source.url = "http://foo"
    job.source.type = "Metadata"
    # Serve canned DDI payloads instead of hitting the network.
    urllib2.urlopen = mock.Mock(side_effect=self._side_effect_ddi_datas)
    self.gathered = self.harv.gather_stage(job)
    self.assert_(len(self.gathered) == 1)
    self.assert_(isinstance(self.harv.harvester, DDIHarvester))
def _create_harvester_info(self, config=True):
    """Return a saved (job, harvester) pair for OAI-PMH tests.

    When ``config`` is true the source gets an empty-query JSON config.
    """
    rev = model.repo.new_revision()
    harvester = OAIPMHHarvester()
    job = HarvestJob()
    job.source = HarvestSource()
    job.source.title = "Test"
    job.source.url = "http://helda.helsinki.fi/oai/request"
    if config:
        job.source.config = '{"query": ""}'
    job.source.type = "OAI-PMH"
    Session.add(job)
    return job, harvester
def test_import(self):
    """Import two CMDI records, check the resulting package fields, then
    delete the second package via a 'deleted' harvest object."""
    source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
    source.save()
    job = HarvestJob(source=source)
    job.save()

    # First record: verify all mapped metadata fields.
    harvest_object = self._run_import("cmdi_1.xml", job)
    self.assertEquals(len(harvest_object.errors), 0, u"\n".join(unicode(error.message) for error in (harvest_object.errors or [])))
    package = get_action('package_show')({'user': '******'}, {'id': 'urn-nbn-fi-lb-20140730180'})
    self.assertEquals(package.get('id', None), 'http://urn.fi/urn:nbn:fi:lb-20140730180')
    self.assertEquals(package.get('name', None), 'urn-nbn-fi-lb-20140730180')
    self.assertEquals(package.get('notes', None), u'{"eng": "Test description"}')
    self.assertEquals(package.get('version', None), '2012-09-07')
    self.assertEquals(package.get('title', []), '{"eng": "Longi Corpus"}')
    self.assertEquals(package.get('license_id', None), 'undernegotiation')
    provider = config['ckan.site_url']
    expected_pid = {u'id': u'http://islrn.org/resources/248-895-085-557-0',
                    u'provider': provider,
                    u'type': u'metadata'}
    self.assertTrue(expected_pid in package.get('pids'))
    model.Session.flush()

    # Second record: temporal coverage and license.
    harvest_object = self._run_import("cmdi_2.xml", job)
    self.assertEquals(len(harvest_object.errors), 0, u"\n".join(unicode(error.message) for error in (harvest_object.errors or [])))
    package = get_action('package_show')({'user': '******'}, {'id': 'urn-nbn-fi-lb-20140730186'})
    self.assertEquals(package['temporal_coverage_begin'], '1880')
    self.assertEquals(package['temporal_coverage_end'], '1939')
    self.assertEquals(package.get('license_id', None), 'other')

    # Delete package: a content-less harvest object with
    # report_status "deleted" triggers deletion in import_stage.
    harvest_object = HarvestObject()
    harvest_object.content = None
    harvest_object.id = "test-cmdi-delete"
    harvest_object.guid = "test-cmdi-delete"
    harvest_object.source = job.source
    harvest_object.harvest_source_id = None
    harvest_object.job = job
    harvest_object.package_id = package.get('id')
    harvest_object.report_status = "deleted"
    harvest_object.save()

    self.harvester.import_stage(harvest_object)

    model.Session.flush()
    self.assertEquals(model.Package.get(package['id']).state, 'deleted')
def _create_harvester(self, config=True):
    """Return a (harvester, job) pair for DDI tests.

    ``config`` toggles whether the source config is an empty string or None.
    """
    harvester = DDIHarvester()
    harvester.config = "{}"
    job = HarvestJob()
    job.source = HarvestSource()
    job.source.title = "Test"
    job.source.url = "http://foo"
    job.source.config = '' if config else None
    job.source.type = "DDI"
    Session.add(job)
    return harvester, job
def harvest_job_create(context, data_dict):
    '''
    Creates a Harvest Job for a Harvest Source and runs it (by putting it on
    the gather queue)

    :param source_id: id of the harvest source to create a job for
    :type source_id: string
    :param run: whether to also run it or not (default: True)
    :type run: bool
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    should_run = data_dict.get('run', True)

    # The job must hang off an existing source.
    harvest_source = HarvestSource.get(source_id)
    if not harvest_source:
        log.warn('Harvest source %s does not exist', source_id)
        raise toolkit.ObjectNotFound('Harvest source %s does not exist' % source_id)

    # Inactive sources cannot receive new jobs.
    if not harvest_source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise HarvestSourceInactiveError(
            'Can not create jobs on inactive sources')

    # Refuse to pile up jobs: bail out if one is already queued or running.
    pending = _check_for_existing_jobs(context, source_id)
    if pending:
        log.warn('There is already an unrun job %r for this source %s',
                 pending, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    new_job = HarvestJob()
    new_job.source = harvest_source
    new_job.save()
    log.info('Harvest job saved %s', new_job.id)

    if should_run:
        toolkit.get_action('harvest_send_job_to_gather_queue')(context, {
            'id': new_job.id
        })

    return harvest_job_dictize(new_job, context)
def harvest_job_report(context, data_dict):
    '''Return gather and per-object errors for a harvest job.

    :param id: id of the harvest job
    :raises NotFound: if the job does not exist
    :returns: dict with 'gather_errors' (list) and 'object_errors'
        (dict keyed by harvest object id)
    '''
    check_access('harvest_job_show', context, data_dict)

    model = context['model']
    id = data_dict.get('id')

    job = HarvestJob.get(id)
    if not job:
        raise NotFound

    report = {
        'gather_errors': [],
        'object_errors': {}
    }

    # Gather errors
    q = model.Session.query(harvest_model.HarvestGatherError) \
        .join(harvest_model.HarvestJob) \
        .filter(harvest_model.HarvestGatherError.harvest_job_id==job.id) \
        .order_by(harvest_model.HarvestGatherError.created.desc())
    for error in q.all():
        report['gather_errors'].append({
            'message': error.message
        })

    # Object errors
    # Check if the harvester for this job's source has a method for returning
    # the URL to the original document
    original_url_builder = None
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == job.source.type:
            if hasattr(harvester, 'get_original_url'):
                original_url_builder = harvester.get_original_url

    q = model.Session.query(harvest_model.HarvestObjectError,
                            harvest_model.HarvestObject.guid) \
        .join(harvest_model.HarvestObject) \
        .filter(harvest_model.HarvestObject.harvest_job_id==job.id) \
        .order_by(harvest_model.HarvestObjectError.harvest_object_id)

    for error, guid in q.all():
        # PEP 8: ``x not in y`` rather than ``not x in y``.
        if error.harvest_object_id not in report['object_errors']:
            report['object_errors'][error.harvest_object_id] = {
                'guid': guid,
                'errors': []
            }
            if original_url_builder:
                url = original_url_builder(error.harvest_object_id)
                if url:
                    report['object_errors'][error.harvest_object_id]['original_url'] = url
        report['object_errors'][error.harvest_object_id]['errors'].append({
            'message': error.message,
            'line': error.line,
            'type': error.stage
        })

    return report
def test_harvest_jobs_run_does_not_timeout_if_timeout_not_set(
        self, mock_error_log):
    """An old Running job is NOT errored when no harvest timeout is set."""
    harvest_source = factories.HarvestSourceObj(**SOURCE_DICT.copy())
    harvest_job = factories.HarvestJobObj(source=harvest_source, run=True)
    # Date far in the past: would trip any configured timeout.
    # NOTE(review): the original comment claimed ckan.harvest.timeout was
    # set to 5 minutes, which contradicts this test's name; the assertion
    # below expects NO timeout to fire — confirm the intended config.
    harvest_job.created = '2020-05-29 10:00:00.0'
    harvest_job.save()
    context = {
        'model': model,
        'session': model.Session,
        'ignore_auth': True,
        'user': ''
    }
    data_dict = {
        'guid': 'guid',
        'content': 'content',
        'job_id': harvest_job.id,
        'source_id': harvest_source.id
    }
    job_obj = HarvestJob.get(harvest_job.id)

    job = toolkit.get_action('harvest_jobs_run')(context, data_dict)

    # No timeout error should have been logged.
    assert not mock_error_log.called

    status = toolkit.get_action('harvest_source_show_status')(
        context, {
            'id': harvest_source.id
        })
    assert status['last_job']['status'] == 'Running'
    assert status['last_job']['stats']['errored'] == 0
def test_harvest_jobs_run_does_not_timeout_if_within_time(
        self, mock_error_log):
    """A freshly created Running job is not errored by the timeout check."""
    harvest_source = factories.HarvestSourceObj(**SOURCE_DICT.copy())
    harvest_job = factories.HarvestJobObj(source=harvest_source, run=True)
    # job has just been created, so no timeout expected
    context = {
        'model': model,
        'session': model.Session,
        'ignore_auth': True,
        'user': ''
    }
    data_dict = {
        'guid': 'guid',
        'content': 'content',
        'job_id': harvest_job.id,
        'source_id': harvest_source.id
    }
    job_obj = HarvestJob.get(harvest_job.id)

    job = toolkit.get_action('harvest_jobs_run')(context, data_dict)

    # No timeout error should have been logged.
    assert not mock_error_log.called

    status = toolkit.get_action('harvest_source_show_status')(
        context, {
            'id': harvest_source.id
        })
    assert status['last_job']['status'] == 'Running'
    assert status['last_job']['stats']['errored'] == 0
def harvest_job_exists(value, context):
    '''Check if a harvest job exists and returns the model if it does'''
    job = HarvestJob.get(value)
    if job:
        return job
    # No such job: reject the value with a validation error.
    raise Invalid('Harvest Job with id %r does not exist.' % str(value))
def harvest_send_job_to_gather_queue(context, data_dict):
    '''
    Sends a harvest job to the gather queue.

    :param id: the id of the harvest job
    :type id: string
    '''
    log.info('Send job to gather queue: %r', data_dict)

    job_id = logic.get_or_bust(data_dict, 'id')
    job = toolkit.get_action('harvest_job_show')(context, {'id': job_id})
    check_access('harvest_send_job_to_gather_queue', context, job)

    # Publisher for the gather queue.
    gather_publisher = get_gather_publisher()

    # Only active sources may have jobs queued.
    source = harvest_source_show(context, {'id': job['source_id']})
    if not source['active']:
        raise toolkit.ValidationError('Source is not active')

    # Mark the job as running before handing it to the queue.
    job_obj = HarvestJob.get(job['id'])
    job_obj.status = job['status'] = u'Running'
    job_obj.save()

    gather_publisher.send({'harvest_job_id': job['id']})
    log.info('Sent job %s to the gather queue', job['id'])

    return harvest_job_dictize(job_obj, context)
def test_harvester(self):
    """End-to-end gather → fetch → import of the inventory fixture, checking
    the created package lands in the expected organisation."""
    job = HarvestJob(source = self.source)
    harvester = InventoryHarvester()

    # Gather all of the datasets from the XML content and make sure
    # we have created some harvest objects
    result = harvester.gather_stage(job, test_content=self._get_file_content('inventory.xml'))
    self.assertEqual(len(result), 79)

    # We only want one for testing
    harvest_object_id = result[0]
    harvest_obj = HarvestObject.get(harvest_object_id)

    # Run the fetch stage
    fetch_result = harvester.fetch_stage(harvest_obj)
    self.assertTrue(fetch_result)

    # Make sure we can create a dataset by running the import stage
    harvester.import_stage(harvest_obj)
    self.assertIsNotNone(harvest_obj.package_id)

    # Get the newly created package and make sure it is in the correct
    # organisation
    pkg = toolkit.get_action('package_show')(
        {
            'ignore_auth': True,
            'user': self.sysadmin['name']
        },
        {
            'id': harvest_obj.package_id
        },
    )
    self.assertEqual(pkg['organization']['id'], self.publisher['id'])
def harvest_send_job_to_gather_queue(context, data_dict):
    '''
    Sends a harvest job to the gather queue.

    :param id: the id of the harvest job
    :type id: string
    '''
    log.info('Send job to gather queue: %r', data_dict)
    job_id = logic.get_or_bust(data_dict, 'id')
    job_dict = toolkit.get_action('harvest_job_show')(
        context, {'id': job_id})
    check_access('harvest_send_job_to_gather_queue', context, job_dict)

    # Publisher for the gather queue.
    queue_publisher = get_gather_publisher()

    # Refuse to queue work for an inactive source.
    source_dict = harvest_source_show(context, {'id': job_dict['source_id']})
    if not source_dict['active']:
        raise toolkit.ValidationError('Source is not active')

    # Flip the job to Running (both the model row and the dict) and queue it.
    job_obj = HarvestJob.get(job_dict['id'])
    job_obj.status = job_dict['status'] = u'Running'
    job_obj.save()
    queue_publisher.send({'harvest_job_id': job_dict['id']})
    log.info('Sent job %s to the gather queue', job_dict['id'])

    return harvest_job_dictize(job_obj, context)
def run_test_harvester(source_id_or_name, force_import):
    """CLI helper: create (or reuse/abort) a job for the given source and
    run it synchronously with the matching harvester plugin."""
    # Imported lazily so the command fails gracefully without the extension.
    from ckanext.harvest import queue
    from ckanext.harvest.tests import lib
    from ckanext.harvest.logic import HarvestJobExists
    from ckanext.harvest.model import HarvestJob

    context = {
        "model": model,
        "session": model.Session,
        "user": _admin_user()["name"],
    }
    source = tk.get_action("harvest_source_show")(context, {
        "id": source_id_or_name
    })

    # Determine the job
    try:
        job_dict = tk.get_action("harvest_job_create")(
            context, {
                "source_id": source["id"]
            })
    except HarvestJobExists:
        # A job already exists: either abort a Running one (interactively)
        # or reuse the single "New" job.
        running_jobs = tk.get_action("harvest_job_list")(
            context, {
                "source_id": source["id"],
                "status": "Running"
            })
        if running_jobs:
            print('\nSource "{0}" apparently has a "Running" job:\n{1}'.format(
                source.get("name") or source["id"], running_jobs))

            # raw_input was renamed to input in Python 3.
            if six.PY2:
                resp = raw_input("Abort it? (y/n)")
            else:
                resp = input("Abort it? (y/n)")
            if not resp.lower().startswith("y"):
                sys.exit(1)
            job_dict = tk.get_action("harvest_job_abort")(
                context, {
                    "source_id": source["id"]
                })
        else:
            print("Reusing existing harvest job")
            jobs = tk.get_action("harvest_job_list")(context, {
                "source_id": source["id"],
                "status": "New"
            })
            assert (len(jobs) == 1
                    ), 'Multiple "New" jobs for this source! {0}'.format(jobs)
            job_dict = jobs[0]
    job_obj = HarvestJob.get(job_dict["id"])

    if force_import:
        job_obj.force_import = force_import

    harvester = queue.get_harvester(source["source_type"])
    assert harvester, "No harvester found for type: {0}".format(
        source["source_type"])
    lib.run_harvest_job(job_obj, harvester)
def test_harvester_4gather_oaipmh(self):
    """MetadataHarvester delegates to an OAIPMHHarvester for OAI-PMH content."""
    self.harv = MetadataHarvester()
    self.harv.config = "{}"
    harvest_job = HarvestJob()
    harvest_job.source = HarvestSource()
    harvest_job.source.title = "Test"
    harvest_job.source.url = "http://foo"
    harvest_job.source.type = "Metadata"

    # Build an in-process OAI-PMH server and patch the client class so the
    # gather stage talks to it instead of the network.
    client = CKANServer()
    metadata_registry = metadata.MetadataRegistry()
    metadata_registry.registerReader('oai_dc', oai_dc_reader)
    metadata_registry.registerWriter('oai_dc', oai_dc_writer)
    serv = BatchingServer(client, metadata_registry=metadata_registry)
    oaipmh.client.Client = mock.Mock(return_value=ServerClient(serv, metadata_registry))

    self.gathered = self.harv.gather_stage(harvest_job)
    self.assert_(len(self.gathered) > 1)
    self.assert_(isinstance(self.harv.harvester, OAIPMHHarvester))
def harvest_job_create(context, data_dict):
    '''
    Creates a Harvest Job for a Harvest Source and runs it (by putting it on
    the gather queue)

    :param source_id: id of the harvest source to create a job for
    :type source_id: string
    :param run: whether to also run it or not (default: True)
    :type run: bool
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    should_run = data_dict.get('run', True)

    # The job must hang off an existing source.
    harvest_source = HarvestSource.get(source_id)
    if not harvest_source:
        log.warn('Harvest source %s does not exist', source_id)
        raise toolkit.NotFound('Harvest source %s does not exist' % source_id)

    # Inactive sources cannot receive new jobs.
    if not harvest_source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise HarvestSourceInactiveError('Can not create jobs on inactive sources')

    # Only one queued/running job per source at a time.
    pending = _check_for_existing_jobs(context, source_id)
    if pending:
        log.warn('There is already an unrun job %r for this source %s',
                 pending, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    new_job = HarvestJob()
    new_job.source = harvest_source
    new_job.save()
    log.info('Harvest job saved %s', new_job.id)

    if should_run:
        toolkit.get_action('harvest_send_job_to_gather_queue')(
            context, {'id': new_job.id})

    return harvest_job_dictize(new_job, context)
def run_test_harvest(self):
    """Paster command (Python 2): create (or reuse/abort) a harvest job for
    the source given on the command line and run it synchronously."""
    # Imported lazily so the command fails gracefully without the extension.
    from ckanext.harvest import queue
    from ckanext.harvest.tests import lib
    from ckanext.harvest.logic import HarvestJobExists
    from ckanext.harvest.model import HarvestJob

    # Determine the source
    if len(self.args) >= 2:
        source_id_or_name = unicode(self.args[1])
    else:
        print 'Please provide a source id'
        sys.exit(1)
    context = {
        'model': model,
        'session': model.Session,
        'user': self.admin_user['name']
    }
    source = get_action('harvest_source_show')(context, {
        'id': source_id_or_name
    })

    # Determine the job
    try:
        job_dict = get_action('harvest_job_create')(
            context, {
                'source_id': source['id']
            })
    except HarvestJobExists:
        # A job already exists: either abort a Running one (interactively)
        # or reuse the single "New" job.
        running_jobs = get_action('harvest_job_list')(
            context, {
                'source_id': source['id'],
                'status': 'Running'
            })
        if running_jobs:
            print '\nSource "%s" apparently has a "Running" job:\n%r' \
                % (source.get('name') or source['id'], running_jobs)
            resp = raw_input('Abort it? (y/n)')
            if not resp.lower().startswith('y'):
                sys.exit(1)
            job_dict = get_action('harvest_job_abort')(
                context, {
                    'source_id': source['id']
                })
        else:
            print 'Reusing existing harvest job'
            jobs = get_action('harvest_job_list')(context, {
                'source_id': source['id'],
                'status': 'New'
            })
            assert len(jobs) == 1, \
                'Multiple "New" jobs for this source! %r' % jobs
            job_dict = jobs[0]
    job_obj = HarvestJob.get(job_dict['id'])

    harvester = queue.get_harvester(source['source_type'])
    assert harvester, \
        'No harvester found for type: %s' % source['source_type']
    lib.run_harvest_job(job_obj, harvester)
def _create_job(self, source_id):
    """Create a harvest job for ``source_id`` via the action API and return
    the corresponding model object."""
    ctx = {"model": model, "session": Session, "user": u"harvest"}
    created = get_action("harvest_job_create")(ctx, {"source_id": source_id})
    job = HarvestJob.get(created["id"])
    assert job
    return job
def _update_harvest_source_object(context, data_dict):
    '''
    Updates an actual HarvestSource object with the data dict
    of the harvest_source dataset. All validation and authorization
    checks should be used by now, so this function is not to be used
    directly to update harvest sources.

    :param data_dict: A standard package data_dict

    :returns: The created HarvestSource object
    :rtype: HarvestSource object
    '''
    source_id = data_dict.get('id')

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise logic.NotFound('Harvest source %s does not exist' % source_id)

    fields = [
        'url', 'title', 'description', 'user_id', 'publisher_id', 'frequency',
        'time'
    ]
    for f in fields:
        if f in data_dict and data_dict[f] is not None:
            if f == 'url':
                data_dict[f] = data_dict[f].strip()
            # Idiomatic builtin setattr() rather than calling __setattr__
            # directly (identical behaviour).
            setattr(source, f, data_dict[f])

    # Avoids clashes with the dataset type
    if 'source_type' in data_dict:
        source.type = data_dict['source_type']

    if 'config' in data_dict:
        source.config = data_dict['config']

    # Don't change state unless explicitly set in the dict
    if 'state' in data_dict:
        source.active = data_dict.get('state') == 'active'

    # Don't commit yet, let package_create do it
    source.add()

    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source, status=u'New')
        log.info(
            'Harvest source %s not active, so aborting %i outstanding jobs',
            source_id, jobs.count())
        if jobs:
            for job in jobs:
                job.status = u'Aborted'
                job.add()

    return source
def fetch_callback(channel, method, header, body):
    """AMQP callback for the fetch queue: load the harvest object named in
    the message body and run the fetch/import stages with the matching
    harvester. Acks the message in every handled case except a transient
    DB failure, which is left un-acked so the broker redelivers it."""
    try:
        id = json.loads(body)['harvest_object_id']
        log.info('Received harvest object id: %s' % id)
    except KeyError:
        log.error('No harvest object id received')
        channel.basic_ack(method.delivery_tag)
        return False

    try:
        obj = HarvestObject.get(id)
    except sqlalchemy.exc.DatabaseError:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        # or DatabaseError "connection timed out"
        log.exception('Connection Error during fetch of job %s', id)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return
    if not obj:
        log.error('Harvest object does not exist: %s' % id)
        channel.basic_ack(method.delivery_tag)
        return False

    # Count this delivery attempt; give up after 5.
    obj.retry_times += 1
    obj.save()

    if obj.retry_times >= 5:
        obj.state = "ERROR"
        obj.save()
        log.error('Too many consecutive retries for object {0}'.format(obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # check if job has been set to finished
    job = HarvestJob.get(obj.harvest_job_id)
    if job.status == 'Finished':
        obj.state = "ERROR"
        obj.report_status = "errored"
        obj.save()
        log.error(
            'Job {0} was aborted or timed out, object {1} set to error'.format(
                job.id, obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest object to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == obj.source.type:
            fetch_and_import_stages(harvester, obj)

    model.Session.remove()
    channel.basic_ack(method.delivery_tag)
def setup(self):
    """Per-test fixture: create a sysadmin user, a harvest source, a job,
    one harvest object, and the context used by the tests."""
    print ("")
    print ("TestUM:setup() before each test method")

    # Add sysadmin user
    self.harvestUser = model.User(name=u'harvest', password=u'test', sysadmin=True)
    model.Session.add(self.harvestUser)
    model.Session.commit()

    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'xml/sample.xml',
        'source_type': u'ngds'
    }

    context = {
        'model': model,
        'session': model.Session,
        'user': u'harvest'
    }

    # Publisher auth profile requires a publisher_id on the source.
    if config.get('ckan.harvest.auth.profile') == u'publisher' \
       and not 'publisher_id' in source_fixture:
        source_fixture['publisher_id'] = self.publisher.id

    source_dict=get_action('harvest_source_create')(context, source_fixture)
    self.oHarvestSource = HarvestSource.get(source_dict['id'])

    job_dict=get_action('harvest_job_create')(context,{'source_id': self.oHarvestSource.id})
    self.oHarvestJob = HarvestJob.get(job_dict['id'])

    context = {
        'model' : model,
        'session': model.Session,
        'ignore_auth': True,
    }

    data_dict = {
        'guid' : 'guid',
        'content' : self.contentDataset,
        'job_id' : self.oHarvestJob.id,
        'extras' : { 'a key' : 'a value' },
    }

    # Create the harvest object via the action API, then load the model row.
    oHarvestObject = toolkit.get_action('harvest_object_create')(context, data_dict)
    self.oHarvestObject = HarvestObject.get(oHarvestObject['id'])

    package_schema = default_update_package_schema()
    self.context = {
        'model':model,
        'session': model.Session,
        'user':u'harvest',
        'schema':package_schema,
        'api_version': '2'
    }
def get_job_object(context, data_dict=None):
    '''Return the HarvestJob for this request.

    Prefers a 'job' already placed in ``context``; otherwise loads the job
    by the 'id' key of ``data_dict``.

    :raises NotFound: if no job with the given id exists
    '''
    # Avoid the mutable-default-argument pitfall (was ``data_dict={}``);
    # passing None (or nothing) behaves exactly as before.
    if data_dict is None:
        data_dict = {}
    if 'job' not in context:
        model = context['model']
        id = data_dict.get('id', None)
        job = HarvestJob.get(id)
        if not job:
            raise NotFound
    else:
        job = context['job']
    return job
def get_job_object(context, data_dict=None):
    '''Return the HarvestJob for this request.

    Prefers a 'job' already placed in ``context``; otherwise loads the job
    by the 'id' key of ``data_dict``.

    :raises NotFound: if no job with the given id exists
    '''
    # Avoid the mutable-default-argument pitfall (was ``data_dict={}``);
    # passing None (or nothing) behaves exactly as before.
    if data_dict is None:
        data_dict = {}
    if 'job' not in context:
        model = context['model']
        id = data_dict.get('id', None)
        job = HarvestJob.get(id)
        if not job:
            raise NotFound
    else:
        job = context['job']
    return job
def _create_job(self, source_id):
    """Create and return a HarvestJob for ``source_id`` using the action API."""
    action_context = {'model': model, 'session': Session, 'user': u'harvest'}
    job_dict = get_action('harvest_job_create')(action_context,
                                                {'source_id': source_id})
    created = HarvestJob.get(job_dict['id'])
    assert created
    return created
def _update_harvest_source_object(context, data_dict):
    '''
    Updates an actual HarvestSource object with the data dict
    of the harvest_source dataset. All validation and authorization
    checks should be used by now, so this function is not to be used
    directly to update harvest sources.

    :param data_dict: A standard package data_dict

    :returns: The created HarvestSource object
    :rtype: HarvestSource object
    '''
    source_id = data_dict.get('id')

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise logic.NotFound('Harvest source %s does not exist' % source_id)

    fields = ['url', 'title', 'description', 'user_id',
              'publisher_id', 'frequency']
    for f in fields:
        if f in data_dict and data_dict[f] is not None:
            if f == 'url':
                data_dict[f] = data_dict[f].strip()
            # Idiomatic builtin setattr() rather than calling __setattr__
            # directly (identical behaviour).
            setattr(source, f, data_dict[f])

    # Avoids clashes with the dataset type
    if 'source_type' in data_dict:
        source.type = data_dict['source_type']

    if 'config' in data_dict:
        source.config = data_dict['config']

    # Don't change state unless explicitly set in the dict
    if 'state' in data_dict:
        source.active = data_dict.get('state') == 'active'

    # Don't commit yet, let package_create do it
    source.add()

    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source, status=u'New')
        log.info('Harvest source %s not active, so aborting %i outstanding jobs',
                 source_id, jobs.count())
        if jobs:
            for job in jobs:
                job.status = u'Aborted'
                job.add()

    return source
def gather_callback(message_data, message):
    """Queue callback for the gather stage: run the matching harvester's
    gather_stage for the job named in the message and enqueue each returned
    harvest object id on the fetch queue. Always acks the message."""
    try:
        id = message_data['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)

        # Get rid of any old session state that may still be around. This is
        # a simple alternative to creating a new session for this callback.
        model.Session.expire_all()

        # Get a publisher for the fetch queue
        publisher = get_fetch_publisher()

        try:
            job = HarvestJob.get(id)
            if not job:
                log.error('Harvest job does not exist: %s' % id)
                return

            # Send the harvest job to the plugins that implement
            # the Harvester interface, only if the source type
            # matches
            harvester_found = False
            for harvester in PluginImplementations(IHarvester):
                if harvester.info()['name'] == job.source.type:
                    harvester_found = True
                    # Get a list of harvest object ids from the plugin
                    job.gather_started = datetime.datetime.now()
                    harvest_object_ids = harvester.gather_stage(job)
                    job.gather_finished = datetime.datetime.now()
                    job.save()
                    # NOTE: adjacent string literals concatenate to
                    # "Received from plugins gather_stage: ...".
                    log.debug('Received from plugin'
                              's gather_stage: %r' % harvest_object_ids)

                    if harvest_object_ids and len(harvest_object_ids) > 0:
                        for id in harvest_object_ids:
                            # Send the id to the fetch queue
                            publisher.send({'harvest_object_id': id})
                            log.debug('Sent object %s to the fetch queue' % id)

            if not harvester_found:
                # Record the configuration problem on the job itself.
                msg = 'No harvester could be found for source type %s' % job.source.type
                err = HarvestGatherError(message=msg, job=job)
                err.save()
                log.error(msg)

            job.status = u'Finished'
            job.save()
        finally:
            publisher.close()
    except KeyError:
        log.error('No harvest job id received')
    finally:
        message.ack()
def harvest_job_report(context, data_dict):
    '''
    Build an error report for a harvest job.

    :param data_dict: must contain 'id', the harvest job id

    :returns: dict with two keys: 'gather_errors' (list of
        {'message': ...}, newest first) and 'object_errors' (dict keyed
        by harvest object id, each entry with 'guid', 'errors' and,
        when the harvester provides one, 'original_url')
    :raises NotFound: if no job exists for the given id
    '''
    check_access("harvest_job_show", context, data_dict)

    model = context["model"]
    id = data_dict.get("id")

    job = HarvestJob.get(id)
    if not job:
        raise NotFound

    report = {"gather_errors": [], "object_errors": {}}

    # Gather errors
    q = (
        model.Session.query(harvest_model.HarvestGatherError)
        .join(harvest_model.HarvestJob)
        .filter(harvest_model.HarvestGatherError.harvest_job_id == job.id)
        .order_by(harvest_model.HarvestGatherError.created.desc())
    )
    for error in q.all():
        report["gather_errors"].append({"message": error.message})

    # Object errors

    # Check if the harvester for this job's source has a method for returning
    # the URL to the original document
    original_url_builder = None
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()["name"] == job.source.type:
            if hasattr(harvester, "get_original_url"):
                original_url_builder = harvester.get_original_url

    q = (
        model.Session.query(harvest_model.HarvestObjectError,
                            harvest_model.HarvestObject.guid)
        .join(harvest_model.HarvestObject)
        .filter(harvest_model.HarvestObject.harvest_job_id == job.id)
        .order_by(harvest_model.HarvestObjectError.harvest_object_id)
    )

    for error, guid in q.all():
        # Fixed: use the idiomatic `x not in y` membership test
        # (was `if not x in y`)
        if error.harvest_object_id not in report["object_errors"]:
            report["object_errors"][error.harvest_object_id] = {"guid": guid, "errors": []}
            if original_url_builder:
                url = original_url_builder(error.harvest_object_id)
                if url:
                    report["object_errors"][error.harvest_object_id]["original_url"] = url

        report["object_errors"][error.harvest_object_id]["errors"].append(
            {"message": error.message, "line": error.line, "type": error.stage}
        )

    return report
def setup_class(cls):
    # Create package and its harvest object
    CreateTestData.create()
    harvest_setup()
    # Persist the job first so it gets an id, then re-query it from a
    # fresh session before attaching the harvest object to it.
    job = HarvestJob()
    job.save()
    model.repo.commit_and_remove()

    job = model.Session.query(HarvestJob).first()

    ho = HarvestObject(package=model.Package.by_name(u'annakarenina'),
                       harvest_job=job,
                       guid='test-guid',
                       content='<xml>test content</xml>')
    ho.save()

    # Save a reference to the harvest object in the package
    rev = model.repo.new_revision()
    pkg = model.Package.by_name(u'annakarenina')
    pkg.extras['harvest_object_id'] = ho.id
    pkg.save()

    model.repo.commit_and_remove()
def harvest_job_show(context, data_dict):
    """Fetch a single harvest job by id and return it dictized.

    :raises NotFound: when no job matches the given id
    """
    check_access("harvest_job_show", context, data_dict)

    job_id = data_dict.get("id")
    job_attr = data_dict.get("attr", None)

    job = HarvestJob.get(job_id, attr=job_attr)
    if not job:
        raise NotFound

    return harvest_job_dictize(job, context)
def harvest_source_update(context, data_dict):
    '''
    Updates an existing harvest source and, if the update makes the
    source inactive, aborts any of its jobs still in the u'New' state.

    :param data_dict: source fields to update; must include 'id'
    :raises NotFound: if the source does not exist
    :raises ValidationError: if the data fails schema validation
    :returns: the dictized, updated harvest source
    '''
    check_access('harvest_source_update', context, data_dict)

    model = context['model']
    session = context['session']

    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)
    if errors:
        # Discard any pending changes before reporting validation errors
        session.rollback()
        raise ValidationError(errors, _error_summary(errors))

    fields = ['url', 'title', 'type', 'description', 'user_id',
              'publisher_id']
    for f in fields:
        if f in data and data[f] is not None:
            if f == 'url':
                # Normalize the URL before storing it
                data[f] = data[f].strip()
            source.__setattr__(f, data[f])

    # Presence is tested on the raw data_dict, but the validated values
    # from `data` are the ones applied
    if 'active' in data_dict:
        source.active = data['active']
    if 'config' in data_dict:
        source.config = data['config']

    source.save()
    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source, status=u'New')
        log.info(
            'Harvest source %s not active, so aborting %i outstanding jobs',
            source_id, jobs.count())
        if jobs:
            for job in jobs:
                job.status = u'Aborted'
                job.save()

    # Ensure sqlalchemy writes to the db immediately, since the gather/fetch
    # runs in a different process and needs the latest source info. Not sure if
    # this works, but try it.
    model.repo.commit_and_remove()

    return harvest_source_dictize(source, context)
def harvest_job_show(context, data_dict):
    '''Look up a harvest job by id and return its dictized form.

    :raises NotFound: if the id matches no job
    '''
    check_access('harvest_job_show', context, data_dict)

    job = HarvestJob.get(data_dict.get('id'), attr=data_dict.get('attr', None))
    if not job:
        raise NotFound

    return harvest_job_dictize(job, context)
def harvest_job_show(context, data_dict):
    '''Return the harvest job with the given id, dictized.

    :raises NotFound: when no such job exists
    '''
    p.toolkit.check_access('harvest_job_show', context, data_dict)

    requested_id = data_dict.get('id')
    requested_attr = data_dict.get('attr', None)

    job = HarvestJob.get(requested_id, attr=requested_attr)
    if not job:
        raise NotFound

    return harvest_job_dictize(job, context)
def _get_source_status(source, context):
    '''
    TODO: Deprecated, use harvest_source_show_status instead

    Summarize the job history of a harvest source.

    :param source: a HarvestSource object
    :param context: action context (unused here)

    :returns: dict with 'job_count', 'next_harvest' and
        'last_harvest_request'; plus 'msg' when there are no jobs yet
    '''
    # Fixed: removed a dead `out = dict()` that was immediately
    # overwritten, and an unused `model = context.get('model')` local.
    out = {
        'job_count': 0,
        'next_harvest': '',
        'last_harvest_request': '',
    }

    job_count = HarvestJob.filter(source=source).count()
    if not job_count:
        out['msg'] = 'No jobs yet'
        return out

    out['job_count'] = job_count

    # Get next scheduled job
    next_job = HarvestJob.filter(source=source, status=u'New').first()
    if next_job:
        out['next_harvest'] = 'Scheduled'
    else:
        out['next_harvest'] = 'Not yet scheduled'

    # Get the last finished job
    last_job = HarvestJob.filter(source=source, status=u'Finished') \
        .order_by(HarvestJob.created.desc()).first()
    if last_job:
        out['last_harvest_request'] = str(last_job.gather_finished)
    else:
        out['last_harvest_request'] = 'Not yet harvested'

    return out
def harvest_source_update(context, data_dict):
    '''
    Updates an existing harvest source; if the source is made inactive,
    any of its jobs still in the u'New' state are aborted.

    :param data_dict: source fields to update; must include 'id'
    :raises NotFound: if the source does not exist
    :raises ValidationError: if the data fails schema validation
    :returns: the dictized, updated harvest source
    '''
    check_access('harvest_source_update', context, data_dict)

    model = context['model']
    session = context['session']

    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)
    if errors:
        # Discard pending changes before reporting validation errors
        session.rollback()
        raise ValidationError(errors, _error_summary(errors))

    fields = ['url', 'title', 'type', 'description', 'user_id', 'publisher_id']
    for f in fields:
        if f in data and data[f] is not None:
            if f == 'url':
                # Normalize the URL before storing it
                data[f] = data[f].strip()
            source.__setattr__(f, data[f])

    # Presence is tested on the raw data_dict, but the validated values
    # from `data` are the ones applied
    if 'active' in data_dict:
        source.active = data['active']
    if 'config' in data_dict:
        source.config = data['config']

    source.save()
    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source, status=u'New')
        log.info('Harvest source %s not active, so aborting %i outstanding jobs',
                 source_id, jobs.count())
        if jobs:
            for job in jobs:
                job.status = u'Aborted'
                job.save()

    # Ensure sqlalchemy writes to the db immediately, since the gather/fetch
    # runs in a different process and needs the latest source info. Not sure if
    # this works, but try it.
    model.repo.commit_and_remove()

    return harvest_source_dictize(source, context)
def setup_class(cls):
    # Create package and its harvest object
    CreateTestData.create()
    harvest_setup()
    # Build the full chain: source -> job -> harvest object
    source = HarvestSource(url=u'http://test-source.org', type='test')
    source.save()

    job = HarvestJob(source=source)
    job.save()

    ho = HarvestObject(package=model.Package.by_name(u'annakarenina'),
                       job=job,
                       guid=u'test-guid',
                       content=u'<xml>test content</xml>')
    ho.save()

    # Save a reference to the harvest object in the package
    rev = model.repo.new_revision()
    pkg = model.Package.by_name(u'annakarenina')
    pkg.extras['harvest_object_id'] = ho.id
    pkg.save()

    model.repo.commit_and_remove()
def setup_class(cls):
    # Create package and its harvest object
    CreateTestData.create()
    harvest_setup()
    # Fixture chain: a test source, one job on it, and one harvest
    # object linked to the 'annakarenina' package.
    source = HarvestSource(url=u'http://test-source.org', type='test')
    source.save()

    job = HarvestJob(source=source)
    job.save()

    ho = HarvestObject(package=model.Package.by_name(u'annakarenina'),
                       job=job,
                       guid=u'test-guid',
                       content=u'<xml>test content</xml>')
    ho.save()

    # Save a reference to the harvest object in the package
    rev = model.repo.new_revision()
    pkg = model.Package.by_name(u'annakarenina')
    pkg.extras['harvest_object_id'] = ho.id
    pkg.save()

    model.repo.commit_and_remove()
def test_zzcomplete(self):
    # End-to-end harvest against the live FSD service: gathers every
    # record, then times fetch+import per object. Permanently skipped
    # because it hits the network and takes very long.
    raise SkipTest('Takes ages, do not run')
    # NOTE: everything below is unreachable unless the SkipTest above
    # is removed. Python 2 code (print statements).
    urllib2.urlopen = realopen
    harv = DDIHarvester()
    harv.config = "{}"
    harvest_job = HarvestJob()
    harvest_job.source = HarvestSource()
    harvest_job.source.title = "Test"
    harvest_job.source.url = "http://www.fsd.uta.fi/fi/aineistot/luettelo/fsd-ddi-records-uris-fi.txt"
    harvest_job.source.config = ''
    harvest_job.source.type = "DDI"
    Session.add(harvest_job)
    gathered = harv.gather_stage(harvest_job)
    diffs = []
    for gath in gathered:
        harvest_object = HarvestObject.get(gath)
        print json.loads(harvest_object.content)['url']
        before = datetime.now()
        harv.fetch_stage(harvest_object)
        harv.import_stage(harvest_object)
        # Wall-clock time for fetch+import of this object
        diff = datetime.now() - before
        print diff
        diffs.append(diff)
    print sum(diffs, timedelta)
def gather_callback(message_data, message):
    '''
    Consumer callback for the gather queue.

    Looks up the harvest job referenced in the message, runs the gather
    stage of the matching harvester plugin, and sends each returned
    harvest object id to the fetch queue. The message is always acked.

    :param message_data: dict payload, expected to contain 'harvest_job_id'
    :param message: the queue message object (acked in the outer finally)
    '''
    try:
        id = message_data['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)

        # Get a publisher for the fetch queue
        publisher = get_fetch_publisher()

        try:
            # Fixed: HarvestJob.get returns None for an unknown id rather
            # than raising, so the previous bare `except:` never fired and
            # a missing job crashed later on `job.source`. The bare except
            # also swallowed unrelated errors (even KeyboardInterrupt).
            job = HarvestJob.get(id)
            if not job:
                log.error('Harvest job does not exist: %s' % id)
            else:
                # Send the harvest job to the plugins that implement
                # the Harvester interface, only if the source type
                # matches
                harvester_found = False
                for harvester in PluginImplementations(IHarvester):
                    if harvester.info()['name'] == job.source.type:
                        harvester_found = True
                        # Get a list of harvest object ids from the plugin
                        job.gather_started = datetime.datetime.now()
                        harvest_object_ids = harvester.gather_stage(job)
                        job.gather_finished = datetime.datetime.now()
                        job.save()
                        log.debug('Received from plugin''s gather_stage: %r'
                                  % harvest_object_ids)

                        if harvest_object_ids and len(harvest_object_ids) > 0:
                            for id in harvest_object_ids:
                                # Send the id to the fetch queue
                                publisher.send({'harvest_object_id': id})
                                log.debug('Sent object %s to the fetch queue' % id)

                if not harvester_found:
                    # No plugin handles this source type: record the error
                    # and close the job out.
                    msg = 'No harvester could be found for source type %s' % job.source.type
                    err = HarvestGatherError(message=msg, job=job)
                    err.save()
                    log.error(msg)
                    job.status = u'Finished'
                    job.save()
        finally:
            publisher.close()
    except KeyError:
        log.error('No harvest job id received')
    finally:
        message.ack()
def _create_job(self, source_id):
    '''Create a harvest job for the given source via the
    ``harvest_job_create`` action and return the HarvestJob object.

    :param source_id: id of the harvest source to attach the job to
    '''
    create_job = toolkit.get_action(u'harvest_job_create')
    job_dict = create_job({u'user': u'harvest'}, {u'source_id': source_id})

    job = HarvestJob.get(job_dict[u'id'])
    assert job
    return job
def run_test_harvest(self):
    '''CLI helper: create (or reuse/abort) a harvest job for the source
    given on the command line, then run a full harvest for it
    synchronously via the test lib. Python 2 code (print, raw_input).'''
    from ckanext.harvest import queue
    from ckanext.harvest.tests import lib
    from ckanext.harvest.logic import HarvestJobExists
    from ckanext.harvest.model import HarvestJob

    # Determine the source
    if len(self.args) >= 2:
        source_id_or_name = unicode(self.args[1])
    else:
        print 'Please provide a source id'
        sys.exit(1)
    context = {'model': model, 'session': model.Session,
               'user': self.admin_user['name']}
    source = get_action('harvest_source_show')(
        context, {'id': source_id_or_name})

    # Determine the job
    try:
        job_dict = get_action('harvest_job_create')(
            context, {'source_id': source['id']})
    except HarvestJobExists:
        # A job already exists for this source: either abort a running
        # one (after confirmation) or reuse the single 'New' one.
        running_jobs = get_action('harvest_job_list')(
            context, {'source_id': source['id'], 'status': 'Running'})
        if running_jobs:
            print '\nSource "%s" apparently has a "Running" job:\n%r' \
                % (source.get('name') or source['id'], running_jobs)
            resp = raw_input('Abort it? (y/n)')
            if not resp.lower().startswith('y'):
                sys.exit(1)
            job_dict = get_action('harvest_job_abort')(
                context, {'source_id': source['id']})
        else:
            print 'Reusing existing harvest job'
            jobs = get_action('harvest_job_list')(
                context, {'source_id': source['id'], 'status': 'New'})
            assert len(jobs) == 1, \
                'Multiple "New" jobs for this source! %r' % jobs
            job_dict = jobs[0]
    job_obj = HarvestJob.get(job_dict['id'])

    harvester = queue.get_harvester(source['source_type'])
    assert harvester, \
        'No harvester found for type: %s' % source['source_type']
    lib.run_harvest_job(job_obj, harvester)
def harvest_source_update(context, data_dict):
    '''
    Updates an existing harvest source; if the source is made inactive,
    any of its jobs still in the u'New' state are aborted.

    :param data_dict: source fields to update; must include 'id'
    :raises NotFound: if the source does not exist
    :raises ValidationError: if the data fails schema validation
    :returns: the dictized, updated harvest source
    '''
    check_access('harvest_source_update', context, data_dict)

    model = context['model']
    session = context['session']

    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)
    if errors:
        # Discard pending changes before reporting validation errors
        session.rollback()
        raise ValidationError(errors, _error_summary(errors))

    fields = ['url', 'title', 'type', 'description', 'user_id', 'publisher_id']
    for f in fields:
        if f in data and data[f] is not None:
            if f == 'url':
                # Fixed: normalize the URL like the other versions of this
                # action do — stray whitespace breaks source lookups
                data[f] = data[f].strip()
            source.__setattr__(f, data[f])

    # Presence is tested on the raw data_dict, but the validated values
    # from `data` are the ones applied
    if 'active' in data_dict:
        source.active = data['active']
    if 'config' in data_dict:
        source.config = data['config']

    source.save()
    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source, status=u'New')
        log.info('Harvest source %s not active, so aborting %i outstanding jobs',
                 source_id, jobs.count())
        if jobs:
            for job in jobs:
                job.status = u'Aborted'
                job.save()

    return harvest_source_dictize(source, context)
def test_error_mail_sent(self, mock_mailer_mail_recipient): context, harvest_source, job = self._create_harvest_source_and_job_if_not_existing() # create a HarvestGatherError job_model = HarvestJob.get(job['id']) msg = 'System error - No harvester could be found for source type %s' % job_model.source.type err = HarvestGatherError(message=msg, job=job_model) err.save() status = toolkit.get_action('harvest_source_show_status')(context, {'id': harvest_source['id']}) send_error_mail( context, harvest_source['id'], status ) assert_equal(1, status['last_job']['stats']['errored']) assert mock_mailer_mail_recipient.called
def _delete_harvest_source_object(context, data_dict):
    '''Flag the HarvestSource matching ``data_dict['id']`` as inactive.

    Mirrors dataset deletion semantics: nothing is removed from the
    database, the source is only deactivated and its pending (u'New')
    jobs are marked u'Aborted'. All validation and authorization checks
    must already have been performed; do not call this directly to
    delete harvest sources.

    :param data_dict: A standard package data_dict
    :returns: The deleted HarvestSource object
    :rtype: HarvestSource object
    '''
    source_id = data_dict.get('id')

    log.info('Deleting harvest source: %s', source_id)

    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise p.toolkit.ObjectNotFound('Harvest source %s does not exist' % source_id)

    # Keep the row; just clear the active flag
    source.active = False
    source.save()

    # Any not-yet-run job on this source is aborted
    pending = HarvestJob.filter(source=source, status=u'New')
    if pending:
        log.info('Aborting %i jobs due to deleted harvest source', pending.count())
        for pending_job in pending:
            pending_job.status = u'Aborted'
            pending_job.save()

    log.debug('Harvest source %s deleted', source_id)

    return source
def gather_callback(channel, method, header, body):
    '''
    Gather-queue callback with the pika/AMQP consumer signature
    (Python 2 except syntax).

    Parses the harvest job id out of the JSON message body and looks the
    job up, handling the transient DB connection error by leaving the
    message unacked so it is retried.
    '''
    try:
        id = json.loads(body)['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)
    except KeyError:
        # Malformed message: ack it so it is not redelivered forever
        log.error('No harvest job id received')
        channel.basic_ack(method.delivery_tag)
        return False

    # Get a publisher for the fetch queue
    publisher = get_fetch_publisher()

    try:
        job = HarvestJob.get(id)
    except sqlalchemy.exc.OperationalError, e:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        log.exception(e)
        log.error('Connection Error during gather of job %s: %r %r',
                  id, e, e.args)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return
    # NOTE(review): the success path (running the harvester and acking)
    # is not present in this version of the callback — confirm whether
    # this is intentional or the rest lives elsewhere.
def harvest_source_delete(context, data_dict):
    '''Deactivate a harvest source and abort its pending jobs.

    The source row is kept; only its active flag is cleared. Jobs still
    in the u'New' state are marked u'Aborted'.

    :raises NotFound: if no source exists for the given id
    :returns: True on success
    '''
    log.info('Deleting harvest source: %r', data_dict)
    check_access('harvest_source_delete', context, data_dict)

    sid = data_dict.get('id')
    source = HarvestSource.get(sid)
    if not source:
        log.warn('Harvest source %s does not exist', sid)
        raise NotFound('Harvest source %s does not exist' % sid)

    # Don't actually delete the record, just flag it as inactive
    source.active = False
    source.save()

    # Abort any pending jobs
    pending = HarvestJob.filter(source=source, status=u'New')
    if pending:
        log.info('Aborting %i jobs due to deleted harvest source', pending.count())
        for pending_job in pending:
            pending_job.status = u'Aborted'
            pending_job.save()

    log.info('Harvest source %s deleted', sid)
    return True
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job.
    If you need to send do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)
    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                # A job is finished once none of its objects are still
                # pending (neither COMPLETE nor ERROR)
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .count()

                if num_objects_in_progress == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                            HarvestObject.import_finished != None  # noqa: E711
                        ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {'id': job_obj.source.id})

                    status = get_action('harvest_source_show_status')(
                        context, {'id': job_obj.source.id})

                    # Depending on configuration, mail about errors only or
                    # send a summary for every finished job
                    if config.get('ckan.harvest.status_mail') == 'errors' \
                            and status['last_job']['stats']['errored']:
                        subject, body = prepare_error_mail(
                            context, job_obj.source_id, status,
                            'emails/error_email.txt')

                        log.info('Sending error mail')
                        send_mail(context, job_obj.source.id, subject, body)

                    if config.get('ckan.harvest.status_mail') == 'all':
                        subject, body = prepare_summary_mail(
                            context, job_obj.source.id, status,
                            'emails/summary_email.txt')

                        log.info('Sending summary email')
                        send_mail(context, job_obj.source.id, subject, body)
                else:
                    log.debug('Ongoing job:%s source:%s',
                              job['id'], job['source_id'])
    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    return []  # merely for backwards compatibility