def _save_gather_error(self, message, job):
    ''' Helper function to create an error during the gather stage. '''
    err = HarvestGatherError(message=message, job=job)
    err.save()
    log.error(message)
def _save_gather_error(self, message, job):
    err = HarvestGatherError(message=message, job=job)
    try:
        err.save()
    except InvalidRequestError:
        Session.rollback()
        err.save()
    finally:
        log.error(message)
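A minimal usage sketch for the helper above, assuming a harvester subclass that inherits it from HarvesterBase; the ExampleHarvester class and its _list_remote_identifiers helper are illustrative only, not part of ckanext-harvest:

from ckanext.harvest.harvesters.base import HarvesterBase
from ckanext.harvest.model import HarvestObject


class ExampleHarvester(HarvesterBase):

    def gather_stage(self, harvest_job):
        try:
            # Hypothetical helper that lists dataset identifiers at the source URL
            remote_ids = self._list_remote_identifiers(harvest_job.source.url)
        except Exception as e:
            # Record the failure against the job; returning a non-list marks
            # the gather stage as failed in the queue code
            self._save_gather_error(
                'Unable to list remote datasets: %s' % e, harvest_job)
            return None

        object_ids = []
        for remote_id in remote_ids:
            obj = HarvestObject(guid=remote_id, job=harvest_job)
            obj.save()
            object_ids.append(obj.id)
        return object_ids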
def gather_callback(message_data, message):
    try:
        id = message_data['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)

        # Get rid of any old session state that may still be around. This is
        # a simple alternative to creating a new session for this callback.
        model.Session.expire_all()

        # Get a publisher for the fetch queue
        publisher = get_fetch_publisher()

        try:
            job = HarvestJob.get(id)
            if not job:
                log.error('Harvest job does not exist: %s' % id)
                return

            # Send the harvest job to the plugins that implement
            # the Harvester interface, only if the source type
            # matches
            harvester_found = False
            for harvester in PluginImplementations(IHarvester):
                if harvester.info()['name'] == job.source.type:
                    harvester_found = True
                    # Get a list of harvest object ids from the plugin
                    job.gather_started = datetime.datetime.now()
                    harvest_object_ids = harvester.gather_stage(job)
                    job.gather_finished = datetime.datetime.now()
                    job.save()
                    log.debug("Received from plugin's gather_stage: %r" %
                              harvest_object_ids)

                    if harvest_object_ids and len(harvest_object_ids) > 0:
                        for id in harvest_object_ids:
                            # Send the id to the fetch queue
                            publisher.send({'harvest_object_id': id})
                            log.debug('Sent object %s to the fetch queue' % id)

            if not harvester_found:
                msg = 'No harvester could be found for source type %s' % job.source.type
                err = HarvestGatherError(message=msg, job=job)
                err.save()
                log.error(msg)

            job.status = u'Finished'
            job.save()

        finally:
            publisher.close()

    except KeyError:
        log.error('No harvest job id received')

    finally:
        message.ack()
def test_error_mail_sent(self, mock_mailer_mail_recipient):
    context, harvest_source, job = self._create_harvest_source_and_job_if_not_existing()

    # create a HarvestGatherError
    job_model = HarvestJob.get(job['id'])
    msg = 'System error - No harvester could be found for source type %s' % job_model.source.type
    err = HarvestGatherError(message=msg, job=job_model)
    err.save()

    status = toolkit.get_action('harvest_source_show_status')(
        context, {'id': harvest_source['id']})

    send_error_mail(context, harvest_source['id'], status)

    assert_equal(1, status['last_job']['stats']['errored'])
    assert mock_mailer_mail_recipient.called
            channel.basic_ack(method.delivery_tag)
            return False

        log.debug('Received from plugin gather_stage: {0} objects (first: {1} last: {2})'.format(
                  len(harvest_object_ids), harvest_object_ids[:1], harvest_object_ids[-1:]))

        for id in harvest_object_ids:
            # Send the id to the fetch queue
            publisher.send({'harvest_object_id': id})
        log.debug('Sent {0} objects to the fetch queue'.format(len(harvest_object_ids)))

    else:
        # This can occur if you:
        # * remove a harvester and it still has sources that are then refreshed
        # * add a new harvester and restart CKAN but not the gather queue.
        msg = 'System error - No harvester could be found for source type %s' % job.source.type
        err = HarvestGatherError(message=msg, job=job)
        err.save()
        log.error(msg)

    model.Session.remove()
    publisher.close()
    channel.basic_ack(method.delivery_tag)


def get_harvester(harvest_source_type):
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == harvest_source_type:
            return harvester


def gather_stage(harvester, job):
def gather_callback(channel, method, header, body):
    try:
        id = json.loads(body)['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)
    except KeyError:
        log.error('No harvest job id received')
        channel.basic_ack(method.delivery_tag)
        return False

    # Get a publisher for the fetch queue
    publisher = get_fetch_publisher()

    try:
        job = HarvestJob.get(id)
    except sqlalchemy.exc.DatabaseError:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        # or DatabaseError "connection timed out"
        log.exception('Connection Error during gather of job %s', id)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return

    if not job:
        log.error('Harvest job does not exist: %s' % id)
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest job to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    harvester = get_harvester(job.source.type)
    if harvester:
        try:
            harvest_object_ids = gather_stage(harvester, job)
        except (Exception, KeyboardInterrupt):
            channel.basic_ack(method.delivery_tag)
            raise

        if not isinstance(harvest_object_ids, list):
            log.error('Gather stage failed')
            publisher.close()
            channel.basic_ack(method.delivery_tag)
            return False

        if len(harvest_object_ids) == 0:
            log.info('No harvest objects to fetch')
            publisher.close()
            channel.basic_ack(method.delivery_tag)
            return False

        log.debug('Received from plugin gather_stage: {0} objects (first: {1} last: {2})'.format(
                  len(harvest_object_ids), harvest_object_ids[:1], harvest_object_ids[-1:]))

        for id in harvest_object_ids:
            # Send the id to the fetch queue
            publisher.send({'harvest_object_id': id})
        log.debug('Sent {0} objects to the fetch queue'.format(len(harvest_object_ids)))

    else:
        # This can occur if you:
        # * remove a harvester and it still has sources that are then refreshed
        # * add a new harvester and restart CKAN but not the gather queue.
        msg = 'System error - No harvester could be found for source type %s' % job.source.type
        err = HarvestGatherError(message=msg, job=job)
        err.save()
        log.error(msg)

    model.Session.remove()
    publisher.close()
    channel.basic_ack(method.delivery_tag)
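A minimal sketch of exercising the missing-job-id branch above with stand-in channel/method objects; the test name and the mocks are illustrative and not taken from ckanext-harvest's test suite:

import json
from unittest import mock

from ckanext.harvest.queue import gather_callback


def test_gather_callback_acks_message_without_job_id():
    channel = mock.Mock()
    method = mock.Mock(delivery_tag=42)

    # A body with no 'harvest_job_id' hits the KeyError branch: the message
    # is acked and the callback returns False without touching the database.
    result = gather_callback(channel, method, None, json.dumps({}))

    assert result is False
    channel.basic_ack.assert_called_once_with(42)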
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    If ckanext.harvest.timeout is set:
    Check if the duration of the job is longer than ckanext.harvest.timeout,
    then mark that job as finished as there is probably an underlying issue
    with the harvest process.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job.
    If you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)
    timeout = config.get('ckan.harvest.timeout')

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        for job in jobs:
            if timeout:
                created = datetime.datetime.strptime(job['created'],
                                                     '%Y-%m-%d %H:%M:%S.%f')
                now = datetime.datetime.now()
                if now - created > datetime.timedelta(minutes=int(timeout)):
                    msg = 'Job timeout: %s is taking longer than %s minutes' % (
                        job['id'], timeout)
                    log.error(msg)

                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    job_obj.finished = now
                    job_obj.save()

                    err = HarvestGatherError(message=msg, job=job_obj)
                    err.save()
                    log.info('Marking job as finished due to error: %s %s',
                             job_obj.source.url, job_obj.id)
                    continue

            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .count()

                if num_objects_in_progress == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                            HarvestObject.import_finished != None  # noqa: E711
                        ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {'id': job_obj.source.id})

                    status = get_action('harvest_source_show_status')(
                        context, {'id': job_obj.source.id})

                    if toolkit.asbool(config.get('ckan.harvest.status_mail.errored')) \
                            and (status['last_job']['stats']['errored']):
                        send_error_mail(context, job_obj.source.id, status)
                else:
                    log.debug('Ongoing job:%s source:%s',
                              job['id'], job['source_id'])
    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    return []  # merely for backwards compatibility
.order_by("gather_finished desc").first() # We thought about using the document's modified date to see if it is # unchanged from the previous harvest, but it's hard to tell if the # previous harvest was not successful due to whatever reason, so don't # skip the doc because of its modified date. # We create a new HarvestObject for each inv:Dataset within the # Inventory document ids = [] harvested_identifiers = set() for dataset_node in doc.dataset_nodes(): dataset = doc.dataset_to_dict(dataset_node) if dataset['identifier'] in harvested_identifiers: HarvestGatherError.create( 'Dataset with duplicate identifier "%s" - discarding' % dataset['identifier'], harvest_job) continue harvested_identifiers.add(dataset['identifier']) guid = self.build_guid(doc_metadata['identifier'], dataset['identifier']) # Use the most recent modification date out of the doc and dataset, # since they might have forgotten to enter or update the dataset # date. dataset_last_modified = dataset['modified'] or doc_last_modified if dataset_last_modified and doc_last_modified: dataset_last_modified = max(dataset_last_modified, doc_last_modified) if previous: # object may be in the previous harvest, or an older one
def gather_callback(channel, method, header, body):
    try:
        id = json.loads(body)['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)
    except KeyError:
        log.error('No harvest job id received')
        channel.basic_ack(method.delivery_tag)
        return False

    # Get a publisher for the fetch queue
    publisher = get_fetch_publisher()

    job = HarvestJob.get(id)

    if not job:
        log.error('Harvest job does not exist: %s' % id)
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest job to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    harvester_found = False
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == job.source.type:
            harvester_found = True
            # Get a list of harvest object ids from the plugin
            job.gather_started = datetime.datetime.utcnow()

            try:
                harvest_object_ids = harvester.gather_stage(job)
            except (Exception, KeyboardInterrupt):
                channel.basic_ack(method.delivery_tag)
                harvest_objects = model.Session.query(HarvestObject).filter_by(
                    harvest_job_id=job.id)
                for harvest_object in harvest_objects:
                    model.Session.delete(harvest_object)
                model.Session.commit()
                raise
            finally:
                job.gather_finished = datetime.datetime.utcnow()
                job.save()

            if not isinstance(harvest_object_ids, list):
                log.error('Gather stage failed')
                publisher.close()
                channel.basic_ack(method.delivery_tag)
                return False

            if len(harvest_object_ids) == 0:
                log.info('No harvest objects to fetch')
                publisher.close()
                channel.basic_ack(method.delivery_tag)
                return False

            log.debug('Received from plugin gather_stage: {0} objects (first: {1} last: {2})'.format(
                      len(harvest_object_ids), harvest_object_ids[:1], harvest_object_ids[-1:]))

            for id in harvest_object_ids:
                # Send the id to the fetch queue
                publisher.send({'harvest_object_id': id})
            log.debug('Sent {0} objects to the fetch queue'.format(len(harvest_object_ids)))

    if not harvester_found:
        msg = 'No harvester could be found for source type %s' % job.source.type
        err = HarvestGatherError(message=msg, job=job)
        err.save()
        log.error(msg)

    model.Session.remove()
    publisher.close()
    channel.basic_ack(method.delivery_tag)
.order_by("gather_finished desc").first() # We thought about using the document's modified date to see if it is # unchanged from the previous harvest, but it's hard to tell if the # previous harvest was not successful due to whatever reason, so don't # skip the doc because of its modified date. # We create a new HarvestObject for each inv:Dataset within the # Inventory document ids = [] harvested_identifiers = set() for dataset_node in doc.dataset_nodes(): dataset = doc.dataset_to_dict(dataset_node) if dataset['identifier'] in harvested_identifiers: HarvestGatherError.create( 'Dataset with duplicate identifier "%s" - discarding' % dataset['identifier'], harvest_job) continue harvested_identifiers.add(dataset['identifier']) guid = self.build_guid(doc_metadata['identifier'], dataset['identifier']) # Use the most recent modification date out of the doc and dataset, # since they might have forgotten to enter or update the dataset # date. dataset_last_modified = dataset['modified'] or doc_last_modified if dataset_last_modified and doc_last_modified: dataset_last_modified = max(dataset_last_modified, doc_last_modified) if previous: # object may be in the previous harvest, or an older one existing_object = model.Session.query(HarvestObject)\ .filter_by(guid=guid)\
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    If ckanext.harvest.timeout is set:
    Check if the duration of the job is longer than ckanext.harvest.timeout,
    then mark that job as finished as there is probably an underlying issue
    with the harvest process.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job.
    If you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)
    timeout = config.get('ckan.harvest.timeout')

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        for job in jobs:
            job_obj = HarvestJob.get(job['id'])
            if timeout:
                last_time = job_obj.get_last_action_time()
                now = datetime.datetime.utcnow()
                if now - last_time > datetime.timedelta(minutes=int(timeout)):
                    msg = 'Job {} timeout ({} minutes)\n'.format(
                        job_obj.id, timeout)
                    msg += '\tJob created: {}\n'.format(job_obj.created)
                    msg += '\tJob gather finished: {}\n'.format(
                        job_obj.gather_finished)
                    msg += '\tJob last action time: {}\n'.format(last_time)

                    job_obj.status = u'Finished'
                    job_obj.finished = now
                    job_obj.save()

                    err = HarvestGatherError(message=msg, job=job_obj)
                    err.save()
                    log.info('Marking job as finished due to error: %s %s',
                             job_obj.source.url, job_obj.id)
                    continue

            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .count()

                if num_objects_in_progress == 0:
                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                            HarvestObject.import_finished != None  # noqa: E711
                        ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {'id': job_obj.source.id})

                    status = get_action('harvest_source_show_status')(
                        context, {'id': job_obj.source.id})

                    notify_all = toolkit.asbool(
                        config.get('ckan.harvest.status_mail.all'))
                    notify_errors = toolkit.asbool(
                        config.get('ckan.harvest.status_mail.errored'))
                    last_job_errors = status['last_job']['stats'].get(
                        'errored', 0)
                    log.debug(
                        'Notifications: All:{} On error:{} Errors:{}'.format(
                            notify_all, notify_errors, last_job_errors))

                    if last_job_errors > 0 and (notify_all or notify_errors):
                        # send_error_mail_ncar(context, job_obj)
                        # get_mail_extra_vars(context, job_obj.source.id, status)
                        send_error_email(context, job_obj.source.id, status)
                    elif notify_all:
                        send_summary_email(context, job_obj.source.id, status)
                else:
                    log.debug('%d Ongoing jobs for %s (source:%s)',
                              num_objects_in_progress, job['id'], job['source_id'])
    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    # log.debug('Start of commit and close')
    # session.commit()
    # log.debug(' (Start of close)')
    # session.close()
    # log.debug('End of commit and close')

    return []  # merely for backwards compatibility
def gather_stage(self, harvest_job):
    ''' Analyze the source, return a list of IDs
        and create one HarvestObject per dataset '''
    logger.info('Starts Gather SIU Transp')

    # load paths
    self.set_paths()
    self.siu_data_lib.get_query_files()

    # basic things you'll need
    self.source = harvest_job.source
    self.source_config = json.loads(self.source.config)

    # allow getting the config from a URL
    # Sample: https://raw.githubusercontent.com/avdata99/ckan-env/develop/docs/full_config.json
    config_from_url = self.source_config.get('from_url', None)
    if config_from_url is not None:
        logger.info('Updating config from URL')
        response = requests.get(config_from_url)
        update_config = response.json()
        self.source_config.update(update_config)

    self.siu_data_lib.base_url = self.source.url
    self.siu_data_lib.username = self.source_config['username']
    self.siu_data_lib.password = self.source_config['password']

    # ####################################
    # get previous harvested packages
    pfr = self.get_packages_for_source(harvest_source_id=self.source.id)
    prev_names = [pkg['name'] for pkg in pfr['results']]
    logger.info('Get previous harvested objects {}'.format(prev_names))
    # TODO
    # ####################################

    object_ids = []  # list of IDs to process; this is what this function returns

    self.source_dataset = get_harvest_source(self.source.id)
    owner_org = self.source_dataset.get('owner_org')
    logger.info('Gather SIU Transp to ORG {}'.format(owner_org))

    # Iterate over each query to obtain different data sets.
    # Each file in siu_transp_data/queries will generate multiple datasets to publish.
    report = []  # summary of all results
    logger.info('Iter files')

    # check whether the config asks to override metadata in the datasets from each file
    override = self.source_config.get('override', {})
    logger.info("General override {}".format(override))

    for qf in self.siu_data_lib.query_files:
        only_files = self.source_config.get('only_files', None)
        query_file_name = qf.split('/')[-1]
        if only_files is not None:
            if query_file_name not in only_files:
                logger.info('Skipping file by config {}'.format(query_file_name))
                continue

        logger.info('Gather SIU Transp FILE {}'.format(qf))
        stqf = SIUTranspQueryFile(portal=self.siu_data_lib, path=qf)
        # open to read query params
        stqf.open()
        # request all data
        stqf.request_all(results_folder_path=self.results_folder_path)
        for err in stqf.errors:
            hgerr = HarvestGatherError(message=err, job=harvest_job)
            hgerr.save()

        # ====== Prepare dict to override datasets metadata ============
        override_this = override.get(query_file_name, {})
        logger.info("To override {}: {}".format(query_file_name, override_this))

        # extras need to be {"key": "extra name", "value": "extra value"}
        extras = override_this.get('extras', {})
        new_extras = []
        for extra_key, extra_value in extras.iteritems():
            logger.info("Override extra found {}: {}".format(extra_key, extra_value))
            if not isinstance(extra_value, str):
                extra_value = str(extra_value)
            new_extras.append({"key": extra_key, "value": extra_value})
        if len(new_extras) > 0:
            override_this['extras'] = new_extras

        # tags need to be {"name": "tag name"}
        tags = override_this.get('tags', [])
        new_tags = []
        for tag in tags:
            logger.info("Override tag found {}".format(unicode(tag).encode("utf-8")))
            new_tags.append({"name": tag})
        if len(new_tags) > 0:
            override_this['tags'] = new_tags

        # groups need to be {"name": "group name"}
        groups = override_this.get('groups', [])
        new_groups = []
        for group in groups:
            logger.info("Override group found {}".format(group))
            # check if groups must be created
            context = {'model': model, 'session': model.Session,
                       'user': self._get_user_name()}
            try:
                p.toolkit.get_action('group_create')(context, {"name": group})
            except Exception as e:
                logger.error('Error creating group (skipped) {}: {}'.format(group, e))
            new_groups.append({"name": group})
        if len(new_groups) > 0:
            override_this['groups'] = new_groups
        # ================================

        report += stqf.requests
        for dataset in stqf.datasets:
            if dataset['name'] in prev_names:
                action = 'update'
                # leave this list just with packages to remove
                prev_names.remove(dataset['name'])
            else:
                action = 'create'
            logger.info('Dataset {} to {}'.format(dataset['name'], action))
            ho_dict = {
                'title': dataset['title'],
                'name': dataset['name'],
                'owner_org': owner_org,
                'notes': dataset['notes'],
                'tags': dataset['tags'],
                'resources': dataset['resources'],
                'action': action
            }
            # apply metadata overrides (extras, tags, groups) if any exist
            ho_dict.update(override_this)
            logger.info("Overrided ho_dict {}".format(ho_dict))
            # Each harvest object will be passed to other stages in harvest process
            obj = HarvestObject(guid=dataset['name'],
                                job=harvest_job,
                                content=json.dumps(ho_dict))
            obj.save()
            logger.info('Objects ID appends {}'.format(obj.id))
            object_ids.append(obj.id)

    # TODO compare with previously harvested data to remove datasets no longer at the harvest source
    # final summary
    logger.info('REQUESTS: \n{}'.format('\n\t'.join(report)))

    return object_ids
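A minimal sketch of the harvest source config this gather stage reads, inferred from the keys used above (username, password, from_url, only_files, override); the file name and all values are illustrative only:

example_source_config = {
    "username": "siu_user",
    "password": "secret",
    # Optionally fetch and merge additional config from a URL
    "from_url": "https://example.org/full_config.json",
    # Optionally restrict the harvest to specific query files
    "only_files": ["presupuesto.json"],
    # Per-query-file metadata overrides applied to every dataset it produces
    "override": {
        "presupuesto.json": {
            "extras": {"frequency": "annual"},
            "tags": ["budget", "transparency"],
            "groups": ["economy"]
        }
    }
}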