def harvest_job_abort(context, data_dict):
    '''
    Aborts a harvest job. Given a harvest source_id, it looks for the latest
    job for that source and (assuming it is not already Finished) marks it as
    Finished. It also looks through that source's harvest objects and, if
    they are not COMPLETE or ERROR, marks them ERROR, so any left in limbo
    are cleaned up. Does not actually stop any queued harvest
    fetches/objects from running.

    :param source_id: the name or id of the harvest source with a job to
                      abort
    :type source_id: string
    '''
    check_access('harvest_job_abort', context, data_dict)

    model = context['model']

    source_id = data_dict.get('source_id')
    source = harvest_source_show(context, {'id': source_id})

    # HarvestJob set status to 'Finished'
    # Do not use harvest_job_list since it can use a lot of memory
    last_job = model.Session.query(HarvestJob) \
                    .filter_by(source_id=source['id']) \
                    .order_by(HarvestJob.created.desc()).first()
    if not last_job:
        raise NotFound('Error: source has no jobs')
    job = get_action('harvest_job_show')(context, {'id': last_job.id})

    if job['status'] != 'Finished':
        # i.e. New or Running
        job_obj = HarvestJob.get(job['id'])
        job_obj.status = new_status = 'Finished'
        model.repo.commit_and_remove()
        log.info('Harvest job changed status from "%s" to "%s"',
                 job['status'], new_status)
    else:
        log.info('Harvest job unchanged. Source %s status is: "%s"',
                 job['id'], job['status'])

    # HarvestObjects set to ERROR
    job_obj = HarvestJob.get(job['id'])
    objs = job_obj.objects
    for obj in objs:
        if obj.state not in ('COMPLETE', 'ERROR'):
            old_state = obj.state
            obj.state = 'ERROR'
            log.info('Harvest object changed state from "%s" to "%s": %s',
                     old_state, obj.state, obj.id)
        else:
            log.info('Harvest object not changed from "%s": %s',
                     obj.state, obj.id)
    model.repo.commit_and_remove()

    job_obj = HarvestJob.get(job['id'])
    return harvest_job_dictize(job_obj, context)
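# --- Usage sketch (not part of the original sources): calling the
# harvest_job_abort action above through CKAN's action layer. The user name
# and the helper function below are assumptions for illustration only.
from ckan.plugins import toolkit

def abort_latest_job(source_name_or_id):
    context = {'user': 'some-sysadmin', 'ignore_auth': True}
    job = toolkit.get_action('harvest_job_abort')(
        context, {'source_id': source_name_or_id})
    # the action returns the dictized job, now marked Finished
    assert job['status'] == 'Finished'
    return job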
def test_harvest_jobs_run_does_not_timeout_if_within_time(
        self, mock_error_log):
    harvest_source = factories.HarvestSourceObj(**SOURCE_DICT.copy())
    harvest_job = factories.HarvestJobObj(source=harvest_source, run=True)
    # job has just been created, so no timeout expected
    context = {
        'model': model,
        'session': model.Session,
        'ignore_auth': True,
        'user': ''
    }
    data_dict = {
        'guid': 'guid',
        'content': 'content',
        'job_id': harvest_job.id,
        'source_id': harvest_source.id
    }
    job_obj = HarvestJob.get(harvest_job.id)

    job = toolkit.get_action('harvest_jobs_run')(context, data_dict)

    assert not mock_error_log.called

    status = toolkit.get_action('harvest_source_show_status')(
        context, {'id': harvest_source.id})

    assert status['last_job']['status'] == 'Running'
    assert status['last_job']['stats']['errored'] == 0
def test_harvest_jobs_run_does_not_timeout_if_timeout_not_set(
        self, mock_error_log):
    harvest_source = factories.HarvestSourceObj(**SOURCE_DICT.copy())
    harvest_job = factories.HarvestJobObj(source=harvest_source, run=True)
    # date in the past, but ckan.harvest.timeout is not set,
    # so no timeout expected
    harvest_job.created = '2020-05-29 10:00:00.0'
    harvest_job.save()
    context = {
        'model': model,
        'session': model.Session,
        'ignore_auth': True,
        'user': ''
    }
    data_dict = {
        'guid': 'guid',
        'content': 'content',
        'job_id': harvest_job.id,
        'source_id': harvest_source.id
    }
    job_obj = HarvestJob.get(harvest_job.id)

    job = toolkit.get_action('harvest_jobs_run')(context, data_dict)

    assert not mock_error_log.called

    status = toolkit.get_action('harvest_source_show_status')(
        context, {'id': harvest_source.id})

    assert status['last_job']['status'] == 'Running'
    assert status['last_job']['stats']['errored'] == 0
def harvest_job_report(context, data_dict):

    check_access('harvest_job_show', context, data_dict)

    model = context['model']
    id = data_dict.get('id')

    job = HarvestJob.get(id)
    if not job:
        raise NotFound

    report = {
        'gather_errors': [],
        'object_errors': {}
    }

    # Gather errors
    q = model.Session.query(harvest_model.HarvestGatherError) \
        .join(harvest_model.HarvestJob) \
        .filter(harvest_model.HarvestGatherError.harvest_job_id == job.id) \
        .order_by(harvest_model.HarvestGatherError.created.desc())

    for error in q.all():
        report['gather_errors'].append({
            'message': error.message
        })

    # Object errors

    # Check if the harvester for this job's source has a method for returning
    # the URL to the original document
    original_url_builder = None
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == job.source.type:
            if hasattr(harvester, 'get_original_url'):
                original_url_builder = harvester.get_original_url

    q = model.Session.query(harvest_model.HarvestObjectError,
                            harvest_model.HarvestObject.guid) \
        .join(harvest_model.HarvestObject) \
        .filter(harvest_model.HarvestObject.harvest_job_id == job.id) \
        .order_by(harvest_model.HarvestObjectError.harvest_object_id)

    for error, guid in q.all():
        if error.harvest_object_id not in report['object_errors']:
            report['object_errors'][error.harvest_object_id] = {
                'guid': guid,
                'errors': []
            }
            if original_url_builder:
                url = original_url_builder(error.harvest_object_id)
                if url:
                    report['object_errors'][error.harvest_object_id]['original_url'] = url

        report['object_errors'][error.harvest_object_id]['errors'].append({
            'message': error.message,
            'line': error.line,
            'type': error.stage
        })

    return report
def harvest_job_exists(value, context):
    '''Check if a harvest job exists and returns the model if it does'''
    result = HarvestJob.get(value)

    if not result:
        raise Invalid('Harvest Job with id %r does not exist.' % str(value))
    return result
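# --- Usage sketch (an assumption, not from the original sources): a validator
# like harvest_job_exists is typically wired into a navl schema, so after
# validation the value is the HarvestJob model object rather than a plain id.
# The schema below is hypothetical.
from ckan.lib.navl.validators import not_missing

def hypothetical_harvest_object_schema():
    return {
        'job_id': [not_missing, harvest_job_exists],
    }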
def harvest_send_job_to_gather_queue(context, data_dict):
    '''
    Sends a harvest job to the gather queue.

    :param id: the id of the harvest job
    :type id: string
    '''
    log.info('Send job to gather queue: %r', data_dict)

    job_id = logic.get_or_bust(data_dict, 'id')
    job = toolkit.get_action('harvest_job_show')(
        context, {'id': job_id})

    check_access('harvest_send_job_to_gather_queue', context, job)

    # gather queue
    publisher = get_gather_publisher()

    # Check the source is active
    source = harvest_source_show(context, {'id': job['source_id']})
    if not source['active']:
        raise toolkit.ValidationError('Source is not active')

    job_obj = HarvestJob.get(job['id'])
    job_obj.status = job['status'] = u'Running'
    job_obj.save()
    publisher.send({'harvest_job_id': job['id']})
    log.info('Sent job %s to the gather queue', job['id'])

    return harvest_job_dictize(job_obj, context)
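# --- Usage sketch (not part of the original sources): creating a job and
# explicitly pushing it onto the gather queue with the action above. The
# context user and the helper name are placeholders.
from ckan.plugins import toolkit

def create_and_enqueue(source_id):
    context = {'user': 'some-sysadmin', 'ignore_auth': True}
    job = toolkit.get_action('harvest_job_create')(
        context, {'source_id': source_id})
    job = toolkit.get_action('harvest_send_job_to_gather_queue')(
        context, {'id': job['id']})
    assert job['status'] == 'Running'
    return job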
def run_test_harvester(source_id_or_name, force_import):
    from ckanext.harvest import queue
    from ckanext.harvest.tests import lib
    from ckanext.harvest.logic import HarvestJobExists
    from ckanext.harvest.model import HarvestJob

    context = {
        "model": model,
        "session": model.Session,
        "user": _admin_user()["name"],
    }
    source = tk.get_action("harvest_source_show")(context, {
        "id": source_id_or_name
    })

    # Determine the job
    try:
        job_dict = tk.get_action("harvest_job_create")(
            context, {
                "source_id": source["id"]
            })
    except HarvestJobExists:
        running_jobs = tk.get_action("harvest_job_list")(
            context, {
                "source_id": source["id"],
                "status": "Running"
            })
        if running_jobs:
            print('\nSource "{0}" apparently has a "Running" job:\n{1}'.format(
                source.get("name") or source["id"], running_jobs))

            if six.PY2:
                resp = raw_input("Abort it? (y/n)")
            else:
                resp = input("Abort it? (y/n)")
            if not resp.lower().startswith("y"):
                sys.exit(1)
            job_dict = tk.get_action("harvest_job_abort")(
                context, {
                    "source_id": source["id"]
                })
        else:
            print("Reusing existing harvest job")
            jobs = tk.get_action("harvest_job_list")(context, {
                "source_id": source["id"],
                "status": "New"
            })
            assert (len(jobs) == 1
                    ), 'Multiple "New" jobs for this source! {0}'.format(jobs)
            job_dict = jobs[0]
    job_obj = HarvestJob.get(job_dict["id"])
    if force_import:
        job_obj.force_import = force_import
    harvester = queue.get_harvester(source["source_type"])
    assert harvester, "No harvester found for type: {0}".format(
        source["source_type"])
    lib.run_harvest_job(job_obj, harvester)
def run_test_harvest(self):
    from ckanext.harvest import queue
    from ckanext.harvest.tests import lib
    from ckanext.harvest.logic import HarvestJobExists
    from ckanext.harvest.model import HarvestJob

    # Determine the source
    if len(self.args) >= 2:
        source_id_or_name = unicode(self.args[1])
    else:
        print 'Please provide a source id'
        sys.exit(1)
    context = {
        'model': model,
        'session': model.Session,
        'user': self.admin_user['name']
    }
    source = get_action('harvest_source_show')(context, {
        'id': source_id_or_name
    })

    # Determine the job
    try:
        job_dict = get_action('harvest_job_create')(
            context, {
                'source_id': source['id']
            })
    except HarvestJobExists:
        running_jobs = get_action('harvest_job_list')(
            context, {
                'source_id': source['id'],
                'status': 'Running'
            })
        if running_jobs:
            print '\nSource "%s" apparently has a "Running" job:\n%r' \
                % (source.get('name') or source['id'], running_jobs)
            resp = raw_input('Abort it? (y/n)')
            if not resp.lower().startswith('y'):
                sys.exit(1)
            job_dict = get_action('harvest_job_abort')(
                context, {
                    'source_id': source['id']
                })
        else:
            print 'Reusing existing harvest job'
            jobs = get_action('harvest_job_list')(context, {
                'source_id': source['id'],
                'status': 'New'
            })
            assert len(jobs) == 1, \
                'Multiple "New" jobs for this source! %r' % jobs
            job_dict = jobs[0]
    job_obj = HarvestJob.get(job_dict['id'])

    harvester = queue.get_harvester(source['source_type'])
    assert harvester, \
        'No harvester found for type: %s' % source['source_type']
    lib.run_harvest_job(job_obj, harvester)
def _create_job(self, source_id):
    # Create a job
    context = {"model": model, "session": Session, "user": u"harvest"}

    job_dict = get_action("harvest_job_create")(context, {"source_id": source_id})
    job = HarvestJob.get(job_dict["id"])
    assert job
    return job
def fetch_callback(channel, method, header, body):
    try:
        id = json.loads(body)['harvest_object_id']
        log.info('Received harvest object id: %s' % id)
    except KeyError:
        log.error('No harvest object id received')
        channel.basic_ack(method.delivery_tag)
        return False

    try:
        obj = HarvestObject.get(id)
    except sqlalchemy.exc.DatabaseError:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        # or DatabaseError "connection timed out"
        log.exception('Connection Error during fetch of job %s', id)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return
    if not obj:
        log.error('Harvest object does not exist: %s' % id)
        channel.basic_ack(method.delivery_tag)
        return False

    obj.retry_times += 1
    obj.save()

    if obj.retry_times >= 5:
        obj.state = "ERROR"
        obj.save()
        log.error('Too many consecutive retries for object {0}'.format(obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # check if job has been set to finished
    job = HarvestJob.get(obj.harvest_job_id)
    if job.status == 'Finished':
        obj.state = "ERROR"
        obj.report_status = "errored"
        obj.save()
        log.error(
            'Job {0} was aborted or timed out, object {1} set to error'.format(
                job.id, obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest object to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == obj.source.type:
            fetch_and_import_stages(harvester, obj)

    model.Session.remove()
    channel.basic_ack(method.delivery_tag)
def setup(self):
    print("")
    print("TestUM:setup() before each test method")

    # Add sysadmin user
    self.harvestUser = model.User(name=u'harvest', password=u'test',
                                  sysadmin=True)
    model.Session.add(self.harvestUser)
    model.Session.commit()

    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'xml/sample.xml',
        'source_type': u'ngds'
    }

    context = {
        'model': model,
        'session': model.Session,
        'user': u'harvest'
    }

    if config.get('ckan.harvest.auth.profile') == u'publisher' \
            and 'publisher_id' not in source_fixture:
        source_fixture['publisher_id'] = self.publisher.id

    source_dict = get_action('harvest_source_create')(context, source_fixture)
    self.oHarvestSource = HarvestSource.get(source_dict['id'])

    job_dict = get_action('harvest_job_create')(
        context, {'source_id': self.oHarvestSource.id})
    self.oHarvestJob = HarvestJob.get(job_dict['id'])

    context = {
        'model': model,
        'session': model.Session,
        'ignore_auth': True,
    }

    data_dict = {
        'guid': 'guid',
        'content': self.contentDataset,
        'job_id': self.oHarvestJob.id,
        'extras': {'a key': 'a value'},
    }

    oHarvestObject = toolkit.get_action('harvest_object_create')(
        context, data_dict)
    self.oHarvestObject = HarvestObject.get(oHarvestObject['id'])

    package_schema = default_update_package_schema()
    self.context = {
        'model': model,
        'session': model.Session,
        'user': u'harvest',
        'schema': package_schema,
        'api_version': '2'
    }
def _create_job(self, source_id):
    # Create a job
    context = {'model': model, 'session': Session, 'user': u'harvest'}

    job_dict = get_action('harvest_job_create')(context, {'source_id': source_id})
    job = HarvestJob.get(job_dict['id'])
    assert job
    return job
def get_job_object(context, data_dict={}):
    if 'job' not in context:
        model = context['model']
        id = data_dict.get('id', None)
        job = HarvestJob.get(id)
        if not job:
            raise NotFound
    else:
        job = context['job']

    return job
def harvest_job_report(context, data_dict):

    check_access("harvest_job_show", context, data_dict)

    model = context["model"]
    id = data_dict.get("id")

    job = HarvestJob.get(id)
    if not job:
        raise NotFound

    report = {"gather_errors": [], "object_errors": {}}

    # Gather errors
    q = (
        model.Session.query(harvest_model.HarvestGatherError)
        .join(harvest_model.HarvestJob)
        .filter(harvest_model.HarvestGatherError.harvest_job_id == job.id)
        .order_by(harvest_model.HarvestGatherError.created.desc())
    )

    for error in q.all():
        report["gather_errors"].append({"message": error.message})

    # Object errors

    # Check if the harvester for this job's source has a method for returning
    # the URL to the original document
    original_url_builder = None
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()["name"] == job.source.type:
            if hasattr(harvester, "get_original_url"):
                original_url_builder = harvester.get_original_url

    q = (
        model.Session.query(harvest_model.HarvestObjectError,
                            harvest_model.HarvestObject.guid)
        .join(harvest_model.HarvestObject)
        .filter(harvest_model.HarvestObject.harvest_job_id == job.id)
        .order_by(harvest_model.HarvestObjectError.harvest_object_id)
    )

    for error, guid in q.all():
        if error.harvest_object_id not in report["object_errors"]:
            report["object_errors"][error.harvest_object_id] = {"guid": guid, "errors": []}
            if original_url_builder:
                url = original_url_builder(error.harvest_object_id)
                if url:
                    report["object_errors"][error.harvest_object_id]["original_url"] = url

        report["object_errors"][error.harvest_object_id]["errors"].append(
            {"message": error.message, "line": error.line, "type": error.stage}
        )

    return report
def gather_callback(message_data, message):
    try:
        id = message_data['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)

        # Get rid of any old session state that may still be around. This is
        # a simple alternative to creating a new session for this callback.
        model.Session.expire_all()

        # Get a publisher for the fetch queue
        publisher = get_fetch_publisher()

        try:
            job = HarvestJob.get(id)
            if not job:
                log.error('Harvest job does not exist: %s' % id)
                return

            # Send the harvest job to the plugins that implement
            # the Harvester interface, only if the source type
            # matches
            harvester_found = False
            for harvester in PluginImplementations(IHarvester):
                if harvester.info()['name'] == job.source.type:
                    harvester_found = True
                    # Get a list of harvest object ids from the plugin
                    job.gather_started = datetime.datetime.now()
                    harvest_object_ids = harvester.gather_stage(job)
                    job.gather_finished = datetime.datetime.now()
                    job.save()
                    log.debug("Received from plugin's gather_stage: %r"
                              % harvest_object_ids)

                    if harvest_object_ids and len(harvest_object_ids) > 0:
                        for id in harvest_object_ids:
                            # Send the id to the fetch queue
                            publisher.send({'harvest_object_id': id})
                            log.debug('Sent object %s to the fetch queue' % id)

            if not harvester_found:
                msg = 'No harvester could be found for source type %s' % job.source.type
                err = HarvestGatherError(message=msg, job=job)
                err.save()
                log.error(msg)
                job.status = u'Finished'
                job.save()

        finally:
            publisher.close()

    except KeyError:
        log.error('No harvest job id received')

    finally:
        message.ack()
def harvest_job_show(context, data_dict):

    p.toolkit.check_access('harvest_job_show', context, data_dict)

    id = data_dict.get('id')
    attr = data_dict.get('attr', None)

    job = HarvestJob.get(id, attr=attr)
    if not job:
        raise NotFound

    return harvest_job_dictize(job, context)
def harvest_job_show(context, data_dict):

    check_access('harvest_job_show', context, data_dict)

    id = data_dict.get('id')
    attr = data_dict.get('attr', None)

    job = HarvestJob.get(id, attr=attr)
    if not job:
        raise NotFound

    return harvest_job_dictize(job, context)
def harvest_job_show(context, data_dict):

    check_access("harvest_job_show", context, data_dict)

    id = data_dict.get("id")
    attr = data_dict.get("attr", None)

    job = HarvestJob.get(id, attr=attr)
    if not job:
        raise NotFound

    return harvest_job_dictize(job, context)
def gather_callback(message_data, message):
    try:
        id = message_data['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)

        # Get a publisher for the fetch queue
        publisher = get_fetch_publisher()

        try:
            job = HarvestJob.get(id)
        except:
            log.error('Harvest job does not exist: %s' % id)
        else:
            # Send the harvest job to the plugins that implement
            # the Harvester interface, only if the source type
            # matches
            harvester_found = False
            for harvester in PluginImplementations(IHarvester):
                if harvester.info()['name'] == job.source.type:
                    harvester_found = True
                    # Get a list of harvest object ids from the plugin
                    job.gather_started = datetime.datetime.now()
                    harvest_object_ids = harvester.gather_stage(job)
                    job.gather_finished = datetime.datetime.now()
                    job.save()
                    log.debug("Received from plugin's gather_stage: %r"
                              % harvest_object_ids)

                    if harvest_object_ids and len(harvest_object_ids) > 0:
                        for id in harvest_object_ids:
                            # Send the id to the fetch queue
                            publisher.send({'harvest_object_id': id})
                            log.debug('Sent object %s to the fetch queue' % id)

            if not harvester_found:
                msg = 'No harvester could be found for source type %s' % job.source.type
                err = HarvestGatherError(message=msg, job=job)
                err.save()
                log.error(msg)
                job.status = u'Finished'
                job.save()

        finally:
            publisher.close()

    except KeyError:
        log.error('No harvest job id received')

    finally:
        message.ack()
def _create_job(self, source_id):
    '''

    :param source_id:

    '''
    # Create a job
    context = {u'user': u'harvest'}

    job_dict = toolkit.get_action(u'harvest_job_create')(
        context, {
            u'source_id': source_id
        })
    job = HarvestJob.get(job_dict[u'id'])
    assert job
    return job
def test_error_mail_sent(self, mock_mailer_mail_recipient):
    context, harvest_source, job = self._create_harvest_source_and_job_if_not_existing()

    # create a HarvestGatherError
    job_model = HarvestJob.get(job['id'])
    msg = 'System error - No harvester could be found for source type %s' % job_model.source.type
    err = HarvestGatherError(message=msg, job=job_model)
    err.save()

    status = toolkit.get_action('harvest_source_show_status')(
        context, {'id': harvest_source['id']})

    send_error_mail(
        context,
        harvest_source['id'],
        status
    )

    assert_equal(1, status['last_job']['stats']['errored'])
    assert mock_mailer_mail_recipient.called
def run_test_harvest(self):
    from ckanext.harvest import queue
    from ckanext.harvest.tests import lib
    from ckanext.harvest.logic import HarvestJobExists
    from ckanext.harvest.model import HarvestJob

    # Determine the source
    if len(self.args) >= 2:
        source_id_or_name = unicode(self.args[1])
    else:
        print "Please provide a source id"
        sys.exit(1)
    context = {"model": model, "session": model.Session,
               "user": self.admin_user["name"]}
    source = get_action("harvest_source_show")(context, {"id": source_id_or_name})

    # Determine the job
    try:
        job_dict = get_action("harvest_job_create")(context, {"source_id": source["id"]})
    except HarvestJobExists:
        running_jobs = get_action("harvest_job_list")(
            context, {"source_id": source["id"], "status": "Running"})
        if running_jobs:
            print '\nSource "%s" apparently has a "Running" job:\n%r' % (
                source.get("name") or source["id"],
                running_jobs,
            )
            resp = raw_input("Abort it? (y/n)")
            if not resp.lower().startswith("y"):
                sys.exit(1)
            job_dict = get_action("harvest_job_abort")(context, {"source_id": source["id"]})
        else:
            print "Reusing existing harvest job"
            jobs = get_action("harvest_job_list")(
                context, {"source_id": source["id"], "status": "New"})
            assert len(jobs) == 1, 'Multiple "New" jobs for this source! %r' % jobs
            job_dict = jobs[0]
    job_obj = HarvestJob.get(job_dict["id"])

    harvester = queue.get_harvester(source["source_type"])
    assert harvester, "No harvester found for type: %s" % source["source_type"]
    lib.run_harvest_job(job_obj, harvester)
def gather_callback(channel, method, header, body):
    try:
        id = json.loads(body)['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)
    except KeyError:
        log.error('No harvest job id received')
        channel.basic_ack(method.delivery_tag)
        return False

    # Get a publisher for the fetch queue
    publisher = get_fetch_publisher()

    try:
        job = HarvestJob.get(id)
    except sqlalchemy.exc.OperationalError, e:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        log.exception(e)
        log.error('Connection Error during gather of job %s: %r %r',
                  id, e, e.args)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                    .filter(HarvestObject.harvest_job_id == job['id']) \
                    .filter(and_((HarvestObject.state != u'COMPLETE'),
                                 (HarvestObject.state != u'ERROR'))) \
                    .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # recreate job for datajson collection or the like.
                    source = job_obj.source
                    source_config = json.loads(source.config or '{}')
                    datajson_collection = source_config.get(
                        'datajson_collection')
                    if datajson_collection == 'parents_run':
                        new_job = HarvestJob()
                        new_job.source = source
                        new_job.save()
                        source_config['datajson_collection'] = 'children_run'
                        source.config = json.dumps(source_config)
                        source.save()
                    elif datajson_collection:
                        # reset the key if 'children_run', or anything.
                        source_config.pop("datajson_collection", None)
                        source.config = json.dumps(source_config)
                        source.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                    .filter(HarvestObject.harvest_job_id == job['id']) \
                    .filter(and_((HarvestObject.state != u'COMPLETE'),
                                 (HarvestObject.state != u'ERROR'))) \
                    .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {'id': job_obj.source.id})

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise NoNewHarvestJobError('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
def test_fetch_doesnt_process_remaining_objects_if_job_status_finished(
        self):

    # make sure queues/exchanges are created first and are empty
    consumer = queue.get_gather_consumer()
    consumer_fetch = queue.get_fetch_consumer()
    consumer.queue_purge(queue=queue.get_gather_queue_name())
    consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

    user = logic.get_action('get_site_user')({
        'model': model,
        'ignore_auth': True
    }, {})['name']

    context = {
        'model': model,
        'session': model.Session,
        'user': user,
        'api_version': 3,
        'ignore_auth': True
    }

    source_dict = {
        'title': 'Test Job Finished',
        'name': 'test-job-finished',
        'url': 'basic_test_1',
        'source_type': 'test-nose',
    }

    harvest_source = logic.get_action('harvest_source_create')(
        context, source_dict)

    assert harvest_source['source_type'] == 'test-nose', harvest_source
    assert harvest_source['url'] == 'basic_test_1', harvest_source

    harvest_job = logic.get_action('harvest_job_create')(
        context, {
            'source_id': harvest_source['id'],
            'run': True
        })

    job_id = harvest_job['id']

    assert harvest_job['source_id'] == harvest_source['id'], harvest_job
    assert harvest_job['status'] == u'Running'

    assert logic.get_action('harvest_job_show')(context, {
        'id': job_id
    })['status'] == u'Running'

    # pop one item off the queue and run the callback
    reply = consumer.basic_get(queue='ckan.harvest.gather')
    queue.gather_callback(consumer, *reply)

    all_objects = model.Session.query(HarvestObject).filter(
        HarvestObject.harvest_job_id == harvest_job['id']).all()

    assert len(all_objects) == 3
    assert all_objects[0].state == 'WAITING'
    assert all_objects[1].state == 'WAITING'
    assert all_objects[2].state == 'WAITING'

    # artificially set the job to finished to simulate a job abort or timeout
    job_obj = HarvestJob.get(harvest_job['id'])
    job_obj.status = 'Finished'
    job_obj.save()

    original_dataset_count = model.Session.query(model.Package) \
        .filter(model.Package.type == 'dataset') \
        .count()

    # do three times as three harvest objects
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)
    reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
    queue.fetch_callback(consumer_fetch, *reply)

    all_objects = model.Session.query(HarvestObject).filter(
        HarvestObject.harvest_job_id == harvest_job['id']).all()

    assert len(all_objects) == 3
    assert all_objects[0].state == 'ERROR'
    assert all_objects[1].state == 'ERROR'
    assert all_objects[2].state == 'ERROR'

    count = model.Session.query(model.Package) \
        .filter(model.Package.type == 'dataset') \
        .count()
    assert count == original_dataset_count

    # fire run again to check if job is set to Finished
    logic.get_action('harvest_jobs_run')(context, {
        'source_id': harvest_source['id']
    })

    harvest_job = logic.get_action('harvest_job_show')(context, {
        'id': job_id
    })

    assert_equal(harvest_job['status'], u'Finished')
    assert_equal(
        harvest_job['stats'], {
            'added': 0,
            'updated': 0,
            'not modified': 0,
            'errored': 3,
            'deleted': 0
        })

    harvest_source_dict = logic.get_action('harvest_source_show')(
        context, {
            'id': harvest_source['id']
        })

    assert_equal(
        harvest_source_dict['status']['last_job']['stats'], {
            'added': 0,
            'updated': 0,
            'not modified': 0,
            'errored': 3,
            'deleted': 0
        })
    assert_equal(harvest_source_dict['status']['total_datasets'], 0)
    assert_equal(harvest_source_dict['status']['job_count'], 1)
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    If ckan.harvest.timeout is set, check whether the duration of a job is
    longer than ckan.harvest.timeout, and if so mark that job as finished,
    as there is probably an underlying issue with the harvest process.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job.
    If you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)
    timeout = config.get('ckan.harvest.timeout')

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if timeout:
                created = datetime.datetime.strptime(job['created'],
                                                     '%Y-%m-%d %H:%M:%S.%f')
                now = datetime.datetime.now()
                if now - created > datetime.timedelta(minutes=int(timeout)):
                    msg = 'Job timeout: %s is taking longer than %s minutes' % (
                        job['id'], timeout)
                    log.error(msg)

                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    job_obj.finished = now
                    job_obj.save()

                    err = HarvestGatherError(message=msg, job=job_obj)
                    err.save()
                    log.info('Marking job as finished due to error: %s %s',
                             job_obj.source.url, job_obj.id)
                    continue

            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .count()

                if num_objects_in_progress == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                            HarvestObject.import_finished != None  # noqa: E711
                        ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {'id': job_obj.source.id})

                    status = get_action('harvest_source_show_status')(
                        context, {'id': job_obj.source.id})
                    if toolkit.asbool(config.get('ckan.harvest.status_mail.errored')) \
                            and (status['last_job']['stats']['errored']):
                        send_error_mail(context, job_obj.source.id, status)
                else:
                    log.debug('Ongoing job:%s source:%s',
                              job['id'], job['source_id'])

    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    return []  # merely for backwards compatibility
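# --- Minimal sketch (not from the original sources) of the timeout check the
# function above performs, isolated so it can be reasoned about on its own;
# the helper name is hypothetical and the timestamp format mirrors the
# strptime pattern used there.
import datetime

def job_timed_out(created_str, timeout_minutes, now=None):
    created = datetime.datetime.strptime(created_str, '%Y-%m-%d %H:%M:%S.%f')
    now = now or datetime.datetime.now()
    return now - created > datetime.timedelta(minutes=int(timeout_minutes))

# e.g. a job created six minutes ago times out with a 5-minute timeout:
assert job_timed_out('2020-05-29 10:00:00.0', 5,
                     now=datetime.datetime(2020, 5, 29, 10, 6))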
def harvest_jobs_run(context, data_dict):
    log.info("Harvest job run: %r", data_dict)
    check_access("harvest_jobs_run", context, data_dict)

    session = context["session"]

    source_id = data_dict.get("source_id", None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context["return_objects"] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {"source_id": source_id, "status": u"Running"})
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job["gather_finished"]:
                objects = (
                    session.query(HarvestObject.id)
                    .filter(HarvestObject.harvest_job_id == job["id"])
                    .filter(and_((HarvestObject.state != u"COMPLETE"),
                                 (HarvestObject.state != u"ERROR")))
                    .order_by(HarvestObject.import_finished.desc())
                )

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job["id"])
                    job_obj.status = u"Finished"

                    last_object = (
                        session.query(HarvestObject)
                        .filter(HarvestObject.harvest_job_id == job["id"])
                        .filter(HarvestObject.import_finished != None)
                        .order_by(HarvestObject.import_finished.desc())
                        .first()
                    )
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if "extras_as_string" in context:
                        del context["extras_as_string"]
                    context.update({"validate": False, "ignore_auth": True})
                    package_dict = logic.get_action("package_show")(
                        context, {"id": job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {"source_id": source_id, "status": u"New"})
    if len(jobs) == 0:
        log.info("No new harvest jobs.")
        raise Exception("There are no new harvesting jobs")

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context["detailed"] = False
        source = harvest_source_show(context, {"id": job["source_id"]})
        if source["active"]:
            job_obj = HarvestJob.get(job["id"])
            job_obj.status = job["status"] = u"Running"
            job_obj.save()
            publisher.send({"harvest_job_id": job["id"]})
            log.info("Sent job %s to the gather queue" % job["id"])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
def test_harvester(test_config, expected_count):
    """
    Test the harvester by running it for real with mocked requests.

    We need to convert some blocks to helper functions or fixtures, but this
    is an easy way to verify that a harvester does what it's supposed to over
    the course of one or more runs, and we should build on it for future
    tests.
    """
    helpers.reset_db()
    context = {}
    context.setdefault('user', 'test_user')
    context.setdefault('ignore_auth', True)
    context['model'] = model
    context['session'] = model.Session
    user = {}
    user['name'] = 'test_user'
    user['email'] = '*****@*****.**'
    user['password'] = '******'
    helpers.call_action('user_create', context, **user)

    org = {'name': 'gome2_test_org', 'url': 'http://example.com/gome2'}
    owner_org = helpers.call_action('organization_create', context, **org)

    config = json.dumps(test_config)
    source = {
        'url': 'http://example.com/gome2_test_harvester',
        'name': 'gome2_test_harvester',
        'owner_org': owner_org['id'],
        'source_type': 'gome2',
        'config': config
    }
    harvest_source_create(context, source)
    source = harvest_source_show(context, {'id': source['name']})

    job_dict = get_action('harvest_job_create')(context, {
        'source_id': source['id']
    })
    job_obj = HarvestJob.get(job_dict['id'])

    harvester = queue.get_harvester(source['source_type'])

    # Mock the missing_days endpoint for each product and start date
    # (equivalent to registering each of the fifteen URLs individually)
    mocked_responses = {
        'GOME2_O3': o3_response,
        'GOME2_NO2': no2_response,
        'GOME2_TropNO2': tropno2_response,
        'GOME2_SO2': so2_response,
        'GOME2_SO2mass': so2mass_response,
    }
    start_dates = ('2018-01-01', '2018-01-02', '2018-01-03')

    with requests_mock.Mocker(real_http=True) as m:
        for wpid, response_text in mocked_responses.items():
            for start_date in start_dates:
                m.register_uri(
                    'GET',
                    'https://wdc.dlr.de/data_products/VIEWER/missing_days.php'
                    '?start_date={0}&wpid={1}'.format(start_date, wpid),
                    text=response_text)
        lib.run_harvest_job(job_obj, harvester)

    source = harvest_source_show(context, {'id': source['name']})

    assert source['status']['last_job']['status'] == 'Finished'
    assert source['status']['last_job']['stats']['added'] == expected_count

    # Re-run the harvester without forcing updates
    job_dict = get_action('harvest_job_create')(context, {
        'source_id': source['id']
    })
    job_obj = HarvestJob.get(job_dict['id'])

    harvester = queue.get_harvester(source['source_type'])
    with requests_mock.Mocker(real_http=True) as m:
        for wpid, response_text in mocked_responses.items():
            for start_date in start_dates:
                m.register_uri(
                    'GET',
                    'https://wdc.dlr.de/data_products/VIEWER/missing_days.php'
                    '?start_date={0}&wpid={1}'.format(start_date, wpid),
                    text=response_text)
        lib.run_harvest_job(job_obj, harvester)

    source = harvest_source_show(context, {'id': source['name']})

    assert source['status']['last_job']['status'] == 'Finished'
    assert source['status']['last_job']['stats']['added'] == 0
    assert source['status']['last_job']['stats']['updated'] == 0

    # Verify that the org has the expected number of datasets now
    org_response = helpers.call_action('organization_show', context,
                                       **{'id': org['name']})
    assert org_response['package_count'] == expected_count
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                    .filter(HarvestObject.harvest_job_id == job['id']) \
                    .filter(and_((HarvestObject.state != u'COMPLETE'),
                                 (HarvestObject.state != u'ERROR'))) \
                    .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {'id': job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {'source_id': source_id, 'status': u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    If ckan.harvest.timeout is set, check whether the duration of a job is
    longer than ckan.harvest.timeout, and if so mark that job as finished,
    as there is probably an underlying issue with the harvest process.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job.
    If you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)
    timeout = config.get('ckan.harvest.timeout')

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            job_obj = HarvestJob.get(job['id'])
            if timeout:
                last_time = job_obj.get_last_action_time()
                now = datetime.datetime.utcnow()
                if now - last_time > datetime.timedelta(minutes=int(timeout)):
                    msg = 'Job {} timeout ({} minutes)\n'.format(
                        job_obj.id, timeout)
                    msg += '\tJob created: {}\n'.format(job_obj.created)
                    msg += '\tJob gather finished: {}\n'.format(
                        job_obj.gather_finished)
                    msg += '\tJob last action time: {}\n'.format(last_time)

                    job_obj.status = u'Finished'
                    job_obj.finished = now
                    job_obj.save()

                    err = HarvestGatherError(message=msg, job=job_obj)
                    err.save()
                    log.info('Marking job as finished due to error: %s %s',
                             job_obj.source.url, job_obj.id)
                    continue

            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .count()

                if num_objects_in_progress == 0:
                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                            HarvestObject.import_finished != None  # noqa: E711
                        ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {'id': job_obj.source.id})

                    status = get_action('harvest_source_show_status')(
                        context, {'id': job_obj.source.id})

                    notify_all = toolkit.asbool(
                        config.get('ckan.harvest.status_mail.all'))
                    notify_errors = toolkit.asbool(
                        config.get('ckan.harvest.status_mail.errored'))
                    last_job_errors = status['last_job']['stats'].get(
                        'errored', 0)
                    log.debug(
                        'Notifications: All:{} On error:{} Errors:{}'.format(
                            notify_all, notify_errors, last_job_errors))

                    if last_job_errors > 0 and (notify_all or notify_errors):
                        # send_error_mail_ncar(context, job_obj)
                        # get_mail_extra_vars(context, job_obj.source.id, status)
                        send_error_email(context, job_obj.source.id, status)
                    elif notify_all:
                        send_summary_email(context, job_obj.source.id, status)
                else:
                    log.debug('%d Ongoing jobs for %s (source:%s)',
                              num_objects_in_progress,
                              job['id'], job['source_id'])

    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    # log.debug('Start of commit and close')
    # session.commit()
    # log.debug(' (Start of close)')
    # session.close()
    # log.debug('End of commit and close')

    return []  # merely for backwards compatibility
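# --- Explanatory note (an inference from the notification branch above, not
# from the original sources): how the two status-mail config flags combine.
#
#   ckan.harvest.status_mail.all = True     -> error mail when the finished
#                                              job had errors, summary mail
#                                              otherwise
#   ckan.harvest.status_mail.errored = True -> mail only when
#                                              stats['errored'] > 0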
def test_harvester(self):
    """
    Test the harvester by running it for real with mocked requests.

    We need to convert some blocks to helper functions or fixtures, but this
    is an easy way to verify that a harvester does what it's supposed to over
    the course of one or more runs, and we should build on it for future
    tests.
    """
    helpers.reset_db()
    context = {}
    context.setdefault('user', 'test_user')
    context.setdefault('ignore_auth', True)
    context['model'] = model
    context['session'] = model.Session
    user = {}
    user['name'] = 'test_user'
    user['email'] = '*****@*****.**'
    user['password'] = '******'
    helpers.call_action('user_create', context, **user)

    org = {'name': 'test_org', 'url': 'https://www.example.com'}
    owner_org = helpers.call_action('organization_create', context, **org)

    config_dict = {
        'source': 'esa_scihub',
        'update_all': False,
        'datasets_per_job': 10,
        'timeout': 10,
        'skip_raw': False
    }
    config = json.dumps(config_dict)
    source = {
        'url': 'http://www.scihub.org',
        'name': 'scihub_test_harvester',
        'owner_org': owner_org['id'],
        'source_type': 'esasentinel',
        'config': config
    }
    harvest_source_create(context, source)
    source = harvest_source_show(context, {'id': 'scihub_test_harvester'})

    job_dict = get_action('harvest_job_create')(context, {
        'source_id': source['id']
    })
    job_obj = HarvestJob.get(job_dict['id'])

    harvester = queue.get_harvester(source['source_type'])
    with requests_mock.Mocker(real_http=True) as m:
        m.register_uri('GET', '/dhus/search?q', text=self.raw_results)
        lib.run_harvest_job(job_obj, harvester)

    source = harvest_source_show(context, {'id': 'scihub_test_harvester'})

    assert source['status']['last_job']['status'] == 'Finished'
    assert source['status']['last_job']['stats']['added'] == 10

    # Re-run the harvester
    job_dict = get_action('harvest_job_create')(context, {
        'source_id': source['id']
    })
    job_obj = HarvestJob.get(job_dict['id'])

    harvester = queue.get_harvester(source['source_type'])
    with requests_mock.Mocker(real_http=True) as m:
        m.register_uri('GET', '/dhus/search?q', text=self.raw_results)
        lib.run_harvest_job(job_obj, harvester)

    source = harvest_source_show(context, {'id': 'scihub_test_harvester'})

    assert source['status']['last_job']['status'] == 'Finished'
    assert source['status']['last_job']['stats']['added'] == 0
    assert source['status']['last_job']['stats']['updated'] == 0

    # Re-run the harvester but force updates
    config_dict = {
        'source': 'esa_scihub',
        'update_all': True,
        'datasets_per_job': 10,
        'timeout': 10,
        'skip_raw': False
    }
    config = json.dumps(config_dict)
    source['config'] = config
    harvest_source_update(context, source)

    job_dict = get_action('harvest_job_create')(context, {
        'source_id': source['id']
    })
    job_obj = HarvestJob.get(job_dict['id'])

    harvester = queue.get_harvester(source['source_type'])
    with requests_mock.Mocker(real_http=True) as m:
        m.register_uri('GET', '/dhus/search?q', text=self.raw_results)
        lib.run_harvest_job(job_obj, harvester)

    source = harvest_source_show(context, {'id': 'scihub_test_harvester'})

    assert source['status']['last_job']['status'] == 'Finished'
    assert source['status']['last_job']['stats']['added'] == 0
    assert source['status']['last_job']['stats']['updated'] == 10

    # Verify that the org now has 10 datasets
    org = helpers.call_action('organization_show', context,
                              **{'id': 'test_org'})
    assert org['package_count'] == 10
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job.
    If you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                objects = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()
                    log.info('Marking job as finished: %s', job_obj)

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {'id': job_obj.source.id})
                else:
                    log.debug('Ongoing job:%s source:%s',
                              job['id'], job['source_id'])

    # resubmit old redis tasks
    resubmit_jobs()

    return []  # merely for backwards compatibility
def reimport_batch(self, package_ids, context):
    '''Batch-reimport all packages in `package_ids` from their original
       harvest source.'''

    ckan_fb_mapping = {}

    # first, do checks that can be done without connection to FIS-Broker
    for package_id in package_ids:
        package = Package.get(package_id)

        if not package:
            raise PackageIdDoesNotExistError(package_id)

        if not dataset_was_harvested(package):
            raise PackageNotHarvestedError(package_id)

        harvester = harvester_for_package(package)
        harvester_url = harvester.url
        harvester_type = harvester.type
        if not harvester_type == HARVESTER_ID:
            raise PackageNotHarvestedInFisbrokerError(package_id)

        fb_guid = fisbroker_guid(package)
        if not fb_guid:
            raise NoFisbrokerIdError(package_id)

        ckan_fb_mapping[package.id] = fb_guid

    # get the harvest source for FIS-Broker datasets
    fb_source = get_fisbroker_source()
    if not fb_source:
        raise NoFBHarvesterDefined()
    source_id = fb_source.get('id', None)

    # Create and start a new harvest job
    job_dict = toolkit.get_action('harvest_job_create')(
        context, {'source_id': source_id})
    harvest_job = HarvestJob.get(job_dict['id'])
    harvest_job.gather_started = datetime.datetime.utcnow()
    assert harvest_job

    # instantiate the CSW connector (on the reasonable assumption that
    # harvester_url is the same for all package_ids)
    package_id = None
    reimported_packages = []
    try:
        csw = CatalogueServiceWeb(harvester_url)
        for package_id, fb_guid in ckan_fb_mapping.items():
            # query connector to get resource document
            csw.getrecordbyid([fb_guid], outputschema=namespaces['gmd'])

            # show resource document
            record = csw.records.get(fb_guid, None)
            if record:
                obj = HarvestObject(guid=fb_guid,
                                    job=harvest_job,
                                    content=record.xml,
                                    package_id=package_id,
                                    extras=[
                                        HarvestObjectExtra(key='status', value='change'),
                                        HarvestObjectExtra(key='type', value='reimport'),
                                    ])
                obj.save()

                assert obj, obj.content

                harvester = FisbrokerPlugin()
                harvester.force_import = True
                harvester.import_stage(obj)
                rejection_reason = self._dataset_rejected(obj)
                if rejection_reason:
                    raise FBImportError(package_id, rejection_reason)

                harvester.force_import = False
                Session.refresh(obj)

                reimported_packages.append(record)
            else:
                raise NotFoundInFisbrokerError(package_id, fb_guid)

    except RequestException as error:
        raise NoConnectionError(package_id, harvester_url,
                                str(error.__class__.__name__))

    # successfully finish harvest job
    harvest_job.status = u'Finished'
    harvest_job.finished = datetime.datetime.utcnow()
    harvest_job.save()

    return reimported_packages
def distributed_harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('distributed_harvest_jobs_run', context, data_dict)

    session = context['session']
    source_id = data_dict.get('source_id', None)
    routing_key = data_dict.get('gather_routing_key', None)
    exchange_name = data_dict.get('exchange_name', None)
    fetch_routing_key = data_dict.get('fetch_routing_key', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                    .filter(HarvestObject.harvest_job_id == job['id']) \
                    .filter(and_((HarvestObject.state != u'COMPLETE'),
                                 (HarvestObject.state != u'ERROR'))) \
                    .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {'id': job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_distributed_gather_publisher(exchange_name, routing_key)
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id'],
                            'exchange_name': exchange_name,
                            'fetch_routing_key': fetch_routing_key})
            log.info('Sent job %s to the gather queue', job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
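The distributed variant reads four keys from data_dict; a sketch of invoking it, where the exchange and routing key values are deployment-specific assumptions:

import ckan.plugins.toolkit as toolkit
import ckan.model as model

context = {
    'model': model,
    'session': model.Session,
    'ignore_auth': True,
    'user': ''
}
# The keys below are exactly the ones distributed_harvest_jobs_run reads;
# the values are illustrative assumptions for a multi-node deployment.
data_dict = {
    'source_id': None,                      # or a specific source id
    'exchange_name': 'ckan.harvest',        # assumed exchange name
    'gather_routing_key': 'gather.node-1',  # assumed routing key
    'fetch_routing_key': 'fetch.node-1',    # assumed routing key
}
sent_jobs = toolkit.get_action('distributed_harvest_jobs_run')(
    context, data_dict)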
def gather_callback(channel, method, header, body):
    try:
        id = json.loads(body)['harvest_job_id']
        log.debug('Received harvest job id: %s', id)
    except KeyError:
        log.error('No harvest job id received')
        channel.basic_ack(method.delivery_tag)
        return False

    # Get a publisher for the fetch queue
    publisher = get_fetch_publisher()

    job = HarvestJob.get(id)
    if not job:
        log.error('Harvest job does not exist: %s', id)
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest job to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    harvester_found = False
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == job.source.type:
            harvester_found = True
            # Get a list of harvest object ids from the plugin
            job.gather_started = datetime.datetime.utcnow()

            try:
                harvest_object_ids = harvester.gather_stage(job)
            except (Exception, KeyboardInterrupt):
                channel.basic_ack(method.delivery_tag)
                harvest_objects = model.Session.query(HarvestObject).filter_by(
                    harvest_job_id=job.id)
                for harvest_object in harvest_objects:
                    model.Session.delete(harvest_object)
                model.Session.commit()
                raise
            finally:
                job.gather_finished = datetime.datetime.utcnow()
                job.save()

            if not isinstance(harvest_object_ids, list):
                log.error('Gather stage failed')
                publisher.close()
                channel.basic_ack(method.delivery_tag)
                return False

            if len(harvest_object_ids) == 0:
                log.info('No harvest objects to fetch')
                publisher.close()
                channel.basic_ack(method.delivery_tag)
                return False

            log.debug(
                'Received from plugin gather_stage: {0} objects (first: {1} last: {2})'
                .format(len(harvest_object_ids), harvest_object_ids[:1],
                        harvest_object_ids[-1:]))
            for id in harvest_object_ids:
                # Send the id to the fetch queue
                publisher.send({'harvest_object_id': id})
            log.debug('Sent {0} objects to the fetch queue'.format(
                len(harvest_object_ids)))

    if not harvester_found:
        msg = 'No harvester could be found for source type %s' % job.source.type
        err = HarvestGatherError(message=msg, job=job)
        err.save()
        log.error(msg)

    model.Session.remove()
    publisher.close()
    channel.basic_ack(method.delivery_tag)
def gather_callback(channel, method, header, body):
    try:
        id = json.loads(body)['harvest_job_id']
        log.debug('Received harvest job id: %s', id)
    except KeyError:
        log.error('No harvest job id received')
        channel.basic_ack(method.delivery_tag)
        return False

    # Get a publisher for the fetch queue
    publisher = get_fetch_publisher()

    try:
        job = HarvestJob.get(id)
    except sqlalchemy.exc.DatabaseError:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        # or DatabaseError "connection timed out"
        log.exception('Connection Error during gather of job %s', id)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return
    if not job:
        log.error('Harvest job does not exist: %s', id)
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest job to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    harvester = get_harvester(job.source.type)
    if harvester:
        try:
            harvest_object_ids = gather_stage(harvester, job)
        except (Exception, KeyboardInterrupt):
            channel.basic_ack(method.delivery_tag)
            raise

        if not isinstance(harvest_object_ids, list):
            log.error('Gather stage failed')
            publisher.close()
            channel.basic_ack(method.delivery_tag)
            return False

        if len(harvest_object_ids) == 0:
            log.info('No harvest objects to fetch')
            publisher.close()
            channel.basic_ack(method.delivery_tag)
            return False

        log.debug(
            'Received from plugin gather_stage: {0} objects (first: {1} last: {2})'
            .format(len(harvest_object_ids), harvest_object_ids[:1],
                    harvest_object_ids[-1:]))
        for id in harvest_object_ids:
            # Send the id to the fetch queue
            publisher.send({'harvest_object_id': id})
        log.debug('Sent {0} objects to the fetch queue'.format(
            len(harvest_object_ids)))
    else:
        # This can occur if you:
        # * remove a harvester and it still has sources that are then refreshed
        # * add a new harvester and restart CKAN but not the gather queue.
        msg = 'System error - No harvester could be found for source type %s' \
            % job.source.type
        err = HarvestGatherError(message=msg, job=job)
        err.save()
        log.error(msg)

    model.Session.remove()
    publisher.close()
    channel.basic_ack(method.delivery_tag)
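Because gather_callback acks (or deliberately withholds the ack) through the channel argument, it can be exercised directly with stubs; a sketch assuming unittest.mock, a reachable database, and that get_fetch_publisher() is patched out or a broker is available:

import json
from unittest import mock

fake_channel = mock.Mock()                     # records basic_ack calls
fake_method = mock.Mock(delivery_tag='tag-1')  # pika-style delivery tag
body = json.dumps({'harvest_job_id': 'no-such-job'})

# note: get_fetch_publisher() runs before the job lookup, so this
# assumes that function is patched or a broker is reachable
gather_callback(fake_channel, fake_method, None, body)

# a missing job is logged and acked rather than left for a retry
fake_channel.basic_ack.assert_called_once_with('tag-1')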
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job.
    If you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
        for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .count()

                if num_objects_in_progress == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                            HarvestObject.import_finished != None  # noqa: E711
                        ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {'id': job_obj.source.id})

                    status = get_action('harvest_source_show_status')(
                        context, {'id': job_obj.source.id})

                    if config.get('ckan.harvest.status_mail') == 'errors' \
                            and status['last_job']['stats']['errored']:
                        subject, body = prepare_error_mail(
                            context, job_obj.source_id, status,
                            'emails/error_email.txt')
                        log.info('Sending error mail')
                        send_mail(context, job_obj.source.id, subject, body)

                    if config.get('ckan.harvest.status_mail') == 'all':
                        subject, body = prepare_summary_mail(
                            context, job_obj.source.id, status,
                            'emails/summary_email.txt')
                        log.info('Sending summary email')
                        send_mail(context, job_obj.source.id, subject, body)
            else:
                log.debug('Ongoing job:%s source:%s',
                          job['id'], job['source_id'])

    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    return []  # merely for backwards compatibility
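The mail branches above key off the ckan.harvest.status_mail option; a short sketch of the two recognised values, using the same config lookup as the action (the ini lines in the comment are assumptions about deployment configuration):

# In the CKAN ini this would look like, e.g.:
#   ckan.harvest.status_mail = errors   -> mail only when a job errored
#   ckan.harvest.status_mail = all      -> mail a summary for every job
status_mail = config.get('ckan.harvest.status_mail')
assert status_mail in (None, 'errors', 'all'), status_mail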
def setup(self):
    print("")
    print("TestUM:setup() before each test method")

    # Add sysadmin user
    self.harvestUser = model.User(name=u'harvest', password=u'test',
                                  sysadmin=True)
    model.Session.add(self.harvestUser)
    model.Session.commit()

    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'xml/sample.xml',
        'source_type': u'ngds'
    }

    context = {
        'model': model,
        'session': model.Session,
        'user': u'harvest'
    }

    if config.get('ckan.harvest.auth.profile') == u'publisher' \
            and 'publisher_id' not in source_fixture:
        source_fixture['publisher_id'] = self.publisher.id

    source_dict = get_action('harvest_source_create')(context, source_fixture)
    self.oHarvestSource = HarvestSource.get(source_dict['id'])

    job_dict = get_action('harvest_job_create')(
        context, {'source_id': self.oHarvestSource.id})
    self.oHarvestJob = HarvestJob.get(job_dict['id'])

    context = {
        'model': model,
        'session': model.Session,
        'ignore_auth': True,
    }

    data_dict = {
        'guid': 'guid',
        'content': self.contentDataset,
        'job_id': self.oHarvestJob.id,
        'extras': {'a key': 'a value'},
    }

    oHarvestObject = toolkit.get_action('harvest_object_create')(
        context, data_dict)
    self.oHarvestObject = HarvestObject.get(oHarvestObject['id'])

    package_schema = default_update_package_schema()
    self.context = {
        'model': model,
        'session': model.Session,
        'user': u'harvest',
        'schema': package_schema,
        'api_version': '2'
    }
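A sketch of a test method that could follow this setup(), relying only on the attributes setup() assigns; the specific assertions are assumptions about the fixture contents:

def test_fixture_objects_created(self):
    # setup() stored the created objects on self, so a test can
    # inspect them directly
    assert self.oHarvestObject.guid == 'guid'
    assert self.oHarvestObject.harvest_job_id == self.oHarvestJob.id
    assert self.oHarvestJob.source_id == self.oHarvestSource.id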