def handle_worker_offline(event):
    """
    Celery event handler for 'worker-offline' events.

    The 'worker-offline' event is emitted when a worker gracefully shuts down.
    It is not emitted when a worker is killed instantly.

    The event is first parsed and logged. If this event is from the resource
    manager, there is no further processing to be done. Otherwise, a worker is
    shutting down, and a _delete_worker() task is dispatched so that the
    resource manager will remove the record, and handle any work cleanup
    associated with a worker going offline. Logging at the info and debug
    level is also done.

    :param event: A celery event to handle.
    :type event: dict
    """
    event_info = _parse_and_log_event(event)

    # The resource manager's own offline event requires no cleanup work.
    if _is_resource_manager(event):
        return

    _logger.info(_("Worker '%(worker_name)s' shutdown") % event_info)
    _delete_worker.apply_async(
        args=(event_info['worker_name'],),
        kwargs={'normal_shutdown': True},
        queue=RESOURCE_MANAGER_QUEUE)
def check_workers(self):
    """
    Look for missing workers, and dispatch a cleanup task if one goes missing.

    To find a missing worker, filter the Workers model for entries older than
    utcnow() - WORKER_TIMEOUT_SECONDS. The heartbeat times are stored in native
    UTC, so this is a comparable datetime.

    For each missing worker found, dispatch a _delete_worker task requesting
    that the resource manager delete the Worker and cleanup any associated
    work.

    This method logs at the debug and error levels.
    """
    msg = _('Looking for workers missing for more than %s seconds') % \
        self.WORKER_TIMEOUT_SECONDS
    _logger.debug(msg)

    # A worker whose last heartbeat predates this cutoff is considered missing.
    oldest_heartbeat_time = datetime.utcnow() - \
        timedelta(seconds=self.WORKER_TIMEOUT_SECONDS)
    worker_criteria = Criteria(
        filters={'last_heartbeat': {'$lt': oldest_heartbeat_time}},
        fields=('_id', 'last_heartbeat', 'num_reservations'))
    worker_list = list(resources.filter_workers(worker_criteria))
    for worker in worker_list:
        # Message fixed for grammar: singular "Worker ... has gone missing".
        msg = _("Worker '%s' has gone missing, removing from list of workers") % \
            worker.name
        _logger.error(msg)
        _delete_worker.apply_async(args=(worker.name,),
                                   queue=RESOURCE_MANAGER_QUEUE)
def handle_worker_offline(event):
    """
    Celery event handler for 'worker-offline' events.

    The 'worker-offline' event is emitted when a worker gracefully shuts down.
    It is not emitted when a worker is killed instantly.

    The event is first parsed and logged. If this event is from the resource
    manager, there is no further processing to be done. Otherwise, a worker is
    shutting down, and a _delete_worker() task is dispatched so that the
    resource manager will remove the record, and handle any work cleanup
    associated with a worker going offline. Logging at the info and debug
    level is also done.

    :param event: A celery event to handle.
    :type event: dict
    """
    event_info = _parse_and_log_event(event)

    # Only real workers need cleanup; skip the resource manager's own event.
    if not _is_resource_manager(event):
        shutdown_msg = _("Worker '%(worker_name)s' shutdown") % event_info
        _logger.info(shutdown_msg)
        _delete_worker.apply_async(
            args=(event_info['worker_name'],),
            kwargs={'normal_shutdown': True},
            queue=RESOURCE_MANAGER_QUEUE)
def check_workers(self):
    """
    Look for missing workers, and dispatch a cleanup task if one goes missing.

    To find a missing worker, filter the Workers model for entries older than
    utcnow() - WORKER_TIMEOUT_SECONDS. The heartbeat times are stored in native
    UTC, so this is a comparable datetime.

    For each missing worker found, dispatch a _delete_worker task requesting
    that the resource manager delete the Worker and cleanup any associated
    work.

    This method logs at the debug and error levels.
    """
    _logger.debug(
        _('Looking for workers missing for more than %s seconds')
        % self.WORKER_TIMEOUT_SECONDS)

    # A worker whose last heartbeat predates this cutoff is considered missing.
    cutoff = datetime.utcnow() - timedelta(seconds=self.WORKER_TIMEOUT_SECONDS)
    criteria = Criteria(
        filters={'last_heartbeat': {'$lt': cutoff}},
        fields=('_id', 'last_heartbeat', 'num_reservations'))
    for worker in list(resources.filter_workers(criteria)):
        # Message fixed for grammar: singular "Worker ... has gone missing".
        _logger.error(
            _("Worker '%s' has gone missing, removing from list of workers")
            % worker.name)
        _delete_worker.apply_async(args=(worker.name,),
                                   queue=RESOURCE_MANAGER_QUEUE)