def _record_heartbeat(self, consumer):
    """
    This method creates or updates the worker record.

    :param consumer: The consumer instance
    :type  consumer: celery.worker.consumer.Consumer
    """
    name = consumer.hostname

    # Update the worker record timestamp and handle logging new workers
    worker_watcher.handle_worker_heartbeat(name)

    # If the worker is a resource manager, update the associated ResourceManagerLock timestamp
    if name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME):
        ResourceManagerLock.objects(name=name).update_one(set__timestamp=datetime.utcnow(),
                                                          upsert=False)


def _record_heartbeat(self, consumer):
    """
    This method creates or updates the worker record.

    :param consumer: The consumer instance
    :type  consumer: celery.worker.consumer.Consumer
    """
    name = consumer.hostname

    # Update the worker record timestamp and handle logging new workers
    worker_watcher.handle_worker_heartbeat(name)

    # If the worker is a resource manager, update the associated ResourceManagerLock timestamp
    if name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME):
        ResourceManagerLock.objects(name=name).update_one(
            set__timestamp=datetime.utcnow(), upsert=False)


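# The heartbeat handler above assumes mongoengine documents roughly like the following. This is
# a minimal illustrative sketch: the field names are inferred from the queries above
# (set__timestamp, set__last_heartbeat, name), not copied from the actual Pulp model definitions.
# A unique field with a constant default (the hypothetical 'lock' field here) is one way a second
# ResourceManagerLock.save() can raise NotUniqueError, which the acquisition loops below rely on.
import mongoengine
from datetime import datetime


class Worker(mongoengine.Document):
    """One record per running process; refreshed on every heartbeat."""
    name = mongoengine.StringField(primary_key=True)
    last_heartbeat = mongoengine.DateTimeField(default=datetime.utcnow)


class ResourceManagerLock(mongoengine.Document):
    """At most one document can exist because 'lock' is unique with a constant default."""
    name = mongoengine.StringField(required=True)
    timestamp = mongoengine.DateTimeField()
    lock = mongoengine.StringField(default='resource_manager_lock', unique=True, required=True)

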
def _delete_worker(name, normal_shutdown=False):
    """
    Delete the Worker with _id name from the database, cancel any associated tasks and
    reservations.

    If the worker shut down normally, no message is logged, otherwise an error level message is
    logged. The default is to assume the worker did not shut down normally.

    Any resource reservations associated with this worker are cleaned up by this function.

    Any tasks associated with this worker are explicitly canceled.

    :param name:            The name of the worker you wish to delete.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shut down normally, False otherwise. Defaults to
                            False.
    :type  normal_shutdown: bool
    """
    if normal_shutdown is False:
        msg = _('The worker named %(name)s is missing. Canceling the tasks in its queue.')
        msg = msg % {'name': name}
        _logger.error(msg)
    else:
        msg = _("Cleaning up shutdown worker '%s'.") % name
        _logger.info(msg)

    # Delete the worker document
    Worker.objects(name=name).delete()

    # Delete all reserved_resource documents for the worker
    ReservedResource.objects(worker_name=name).delete()

    # If the worker is a resource manager, we also need to delete the associated lock
    if name.startswith(RESOURCE_MANAGER_WORKER_NAME):
        ResourceManagerLock.objects(name=name).delete()

    # If the worker is a scheduler, we also need to delete the associated lock
    if name.startswith(SCHEDULER_WORKER_NAME):
        CeleryBeatLock.objects(name=name).delete()

    # Cancel all of the tasks that were assigned to this worker's queue
    for task_status in TaskStatus.objects(worker_name=name,
                                          state__in=constants.CALL_INCOMPLETE_STATES):
        cancel(task_status['task_id'])


def get_resource_manager_lock(name):
    """
    Tries to acquire the resource manager lock.

    If the lock cannot be acquired immediately, it will wait until the currently active instance
    becomes unavailable, at which point the worker cleanup routine will clear the lock for us to
    acquire.

    A worker record will be created so that the waiting resource manager will appear in the Status
    API. We override the SIGTERM signal handler so that the worker record is cleaned up
    immediately if the process is killed while in this state.

    :param name: The hostname of the worker
    :type  name: basestring
    """
    assert name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME)

    lock = ResourceManagerLock(name=name)
    with custom_sigterm_handler(name):
        # Whether this is the first lock availability check for this instance
        _first_check = True

        while True:
            # Create / update the worker record so that Pulp knows we exist
            Worker.objects(name=name).update_one(set__last_heartbeat=datetime.utcnow(),
                                                 upsert=True)
            try:
                lock.save()

                msg = _("Resource manager '%s' has acquired the resource manager lock") % name
                _logger.info(msg)
                break
            except mongoengine.NotUniqueError:
                # Only log the message the first time
                if _first_check:
                    msg = _("Resource manager '%s' attempted to acquire the resource manager "
                            "lock but was unable to do so. It will retry every %d seconds until "
                            "the lock can be acquired.") % (name, constants.CELERY_CHECK_INTERVAL)
                    _logger.info(msg)
                    _first_check = False

                time.sleep(constants.CELERY_CHECK_INTERVAL)


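# A hypothetical sketch of the custom_sigterm_handler() context manager used above; the actual
# Pulp implementation is not shown in this section. The idea it illustrates: install a SIGTERM
# handler for the duration of the wait loop so the worker record created above is removed
# immediately if the waiting process is killed, then restore the previous handler on exit.
import signal
import sys
from contextlib import contextmanager


@contextmanager
def custom_sigterm_handler(name):
    def handler(signum, frame):
        # Clean up this instance's worker record and any reservations, then exit.
        _delete_worker(name, normal_shutdown=True)
        sys.exit(0)

    previous = signal.signal(signal.SIGTERM, handler)
    try:
        yield
    finally:
        signal.signal(signal.SIGTERM, previous)

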
def _delete_worker(name, normal_shutdown=False):
    """
    Delete the Worker with _id name from the database, cancel any associated tasks and
    reservations.

    If the worker shut down normally, no message is logged, otherwise an error level message is
    logged. The default is to assume the worker did not shut down normally.

    Any resource reservations associated with this worker are cleaned up by this function.

    Any tasks associated with this worker are explicitly canceled.

    :param name:            The name of the worker you wish to delete.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shut down normally, False otherwise. Defaults to
                            False.
    :type  normal_shutdown: bool
    """
    if normal_shutdown is False:
        msg = _('The worker named %(name)s is missing. Canceling the tasks in its queue.')
        msg = msg % {'name': name}
        _logger.error(msg)
    else:
        msg = _("Cleaning up shutdown worker '%s'.") % name
        _logger.info(msg)

    # Delete the worker document
    Worker.objects(name=name).delete()

    # Delete all reserved_resource documents for the worker
    ReservedResource.objects(worker_name=name).delete()

    # If the worker is a resource manager, we also need to delete the associated lock
    if name.startswith(RESOURCE_MANAGER_WORKER_NAME):
        ResourceManagerLock.objects(name=name).delete()

    # If the worker is a scheduler, we also need to delete the associated lock
    if name.startswith(SCHEDULER_WORKER_NAME):
        CeleryBeatLock.objects(name=name).delete()

    # Cancel all of the tasks that were assigned to this worker's queue
    for task_status in TaskStatus.objects(worker_name=name,
                                          state__in=constants.CALL_INCOMPLETE_STATES):
        cancel(task_status['task_id'], revoke_task=False)


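# Illustrative only: one way _delete_worker() could be driven from a periodic monitoring task
# that watches the heartbeat timestamps written by _record_heartbeat(). The timeout value and
# the _reap_missing_workers() helper are assumptions made for this example, not part of the
# actual Pulp monitoring code.
from datetime import datetime, timedelta


def _reap_missing_workers(timeout_seconds=300):
    """Clean up every worker whose last heartbeat is older than the timeout."""
    oldest_acceptable = datetime.utcnow() - timedelta(seconds=timeout_seconds)
    for worker in Worker.objects(last_heartbeat__lt=oldest_acceptable):
        # No recent heartbeat, so assume the process died without a normal shutdown.
        _delete_worker(worker.name, normal_shutdown=False)

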
def get_resource_manager_lock(name):
    """
    Tries to acquire the resource manager lock.

    If the lock cannot be acquired immediately, it will wait until the currently active instance
    becomes unavailable, at which point the worker cleanup routine will clear the lock for us to
    acquire.

    A worker record will be created so that the waiting resource manager will appear in the Status
    API. This worker record will be cleaned up through the regular worker shutdown routine.

    :param name: The hostname of the worker
    :type  name: basestring
    """
    assert name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME)

    lock = ResourceManagerLock(name=name)

    # Whether this is the first lock availability check for this instance
    _first_check = True

    while True:
        now = dateutils.ensure_tz(datetime.utcnow())
        old_timestamp = now - timedelta(seconds=constants.PULP_PROCESS_TIMEOUT_INTERVAL)

        ResourceManagerLock.objects(timestamp__lte=old_timestamp).delete()

        # Create / update the worker record so that Pulp knows we exist
        Worker.objects(name=name).update_one(
            set__last_heartbeat=datetime.utcnow(), upsert=True)
        try:
            lock.timestamp = now
            lock.save()

            msg = _("Resource manager '%s' has acquired the resource manager lock") % name
            _logger.debug(msg)

            if not _first_check:
                msg = _("Failover occurred: '%s' is now the primary resource manager") % name
                _logger.warning(msg)

            break
        except mongoengine.NotUniqueError:
            # Only log the message the first time
            if _first_check:
                _logger.info(_("Hot spare pulp_resource_manager instance '%(name)s' detected.")
                             % {'name': name})
                _first_check = False

            time.sleep(constants.PULP_PROCESS_HEARTBEAT_INTERVAL)


def get_resource_manager_lock(name):
    """
    Tries to acquire the resource manager lock.

    If the lock cannot be acquired immediately, it will wait until the currently active instance
    becomes unavailable, at which point the worker cleanup routine will clear the lock for us to
    acquire.

    A worker record will be created so that the waiting resource manager will appear in the Status
    API. This worker record will be cleaned up through the regular worker shutdown routine.

    :param name: The hostname of the worker
    :type  name: basestring
    """
    assert name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME)

    lock = ResourceManagerLock(name=name)

    # Whether this is the first lock availability check for this instance
    _first_check = True

    while True:
        # Create / update the worker record so that Pulp knows we exist
        Worker.objects(name=name).update_one(set__last_heartbeat=datetime.utcnow(), upsert=True)
        try:
            lock.save()

            msg = _("Resource manager '%s' has acquired the resource manager lock") % name
            _logger.info(msg)
            break
        except mongoengine.NotUniqueError:
            # Only log the message the first time
            if _first_check:
                msg = _("Resource manager '%(name)s' attempted to acquire the resource manager "
                        "lock but was unable to do so. It will retry every %(interval)d seconds "
                        "until the lock can be acquired.") % \
                    {'name': name, 'interval': constants.CELERY_CHECK_INTERVAL}
                _logger.info(msg)
                _first_check = False

            time.sleep(constants.CELERY_CHECK_INTERVAL)


def get_resource_manager_lock(name):
    """
    Tries to acquire the resource manager lock.

    If the lock cannot be acquired immediately, it will wait until the currently active instance
    becomes unavailable, at which point the worker cleanup routine will clear the lock for us to
    acquire.

    A worker record will be created so that the waiting resource manager will appear in the Status
    API. This worker record will be cleaned up through the regular worker shutdown routine.

    :param name: The hostname of the worker
    :type  name: basestring
    """
    assert name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME)

    lock = ResourceManagerLock(name=name)

    # Whether this is the first lock availability check for this instance
    _first_check = True

    while True:
        now = dateutils.ensure_tz(datetime.utcnow())
        old_timestamp = now - timedelta(seconds=PULP_PROCESS_TIMEOUT_INTERVAL)

        ResourceManagerLock.objects(timestamp__lte=old_timestamp).delete()

        # Create / update the worker record so that Pulp knows we exist
        Worker.objects(name=name).update_one(set__last_heartbeat=datetime.utcnow(), upsert=True)
        try:
            lock.timestamp = now
            lock.save()

            msg = _("Resource manager '%s' has acquired the resource manager lock") % name
            _logger.debug(msg)

            if not _first_check:
                msg = _("Failover occurred: '%s' is now the primary resource manager") % name
                _logger.warning(msg)

            break
        except mongoengine.NotUniqueError:
            # Only log the message the first time
            if _first_check:
                _logger.info(_("Hot spare pulp_resource_manager instance '%(name)s' detected.")
                             % {'name': name})
                _first_check = False

            time.sleep(PULP_PROCESS_HEARTBEAT_INTERVAL)


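# A small usage sketch of how the failover variant above is intended to fit together, with
# assumed constant values and an assumed naming scheme for illustration: the active instance
# refreshes its lock timestamp on every heartbeat (see _record_heartbeat above), while each hot
# spare wakes up every PULP_PROCESS_HEARTBEAT_INTERVAL seconds, deletes any lock whose timestamp
# is older than PULP_PROCESS_TIMEOUT_INTERVAL, and then tries to save its own lock.
PULP_PROCESS_HEARTBEAT_INTERVAL = 30  # assumed value for illustration
PULP_PROCESS_TIMEOUT_INTERVAL = 90    # assumed value; a few missed heartbeats trigger failover


def start_resource_manager(hostname):
    """Block until this instance holds the lock, then begin dispatching work."""
    # Assumed naming scheme; the only hard requirement in the code above is that the name
    # starts with constants.RESOURCE_MANAGER_WORKER_NAME.
    name = '%s@%s' % (constants.RESOURCE_MANAGER_WORKER_NAME, hostname)
    get_resource_manager_lock(name)
    # ... only after the lock is held should this process start accepting tasks ...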