Пример #1
0
def _delete_worker(name, normal_shutdown=False):
    """
    Remove a worker from the database together with its reservations and unfinished tasks.

    The Worker document with the given name is deleted, then every ReservedResource document
    recorded for that worker, and finally each TaskStatus assigned to the worker whose state is
    one of constants.CALL_INCOMPLETE_STATES is canceled.

    An error level message is logged when normal_shutdown is False (the default); for any other
    value nothing is logged.

    :param name:            The name of the worker you wish to delete.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shutdown normally, False otherwise.  Defaults to
                            False.
    :type normal_shutdown:  bool
    """
    if normal_shutdown is False:
        template = _('The worker named %(name)s is missing. Canceling the tasks in its queue.')
        _logger.error(template % {'name': name})

    # The worker record goes first, followed by the reservations made for it
    Worker.objects(name=name).delete()
    ReservedResource.objects(worker_name=name).delete()

    # Cancel whatever was assigned to this worker and is still in an incomplete state
    incomplete_tasks = TaskStatus.objects(worker_name=name,
                                          state__in=constants.CALL_INCOMPLETE_STATES)
    for incomplete in incomplete_tasks:
        cancel(incomplete['task_id'])
Пример #2
0
def handle_worker_heartbeat(worker_name):
    """
    This is a generic function for updating worker heartbeat records.

    The Worker document with the given name gets its last_heartbeat set to the current UTC time.
    The update is an upsert, so a new Worker entry is created when none exists yet; in that case
    the discovery is also logged at the info level. Every heartbeat is logged at the debug level.

    If handling the heartbeat took longer than PULP_PROCESS_HEARTBEAT_INTERVAL seconds, a warning
    containing the measured duration is logged.

    :param worker_name: The hostname of the worker
    :type  worker_name: basestring
    """
    start = datetime.utcnow()
    existing_worker = Worker.objects(name=worker_name).first()

    if not existing_worker:
        msg = _("New worker '%s' discovered") % worker_name
        _logger.info(msg)

    timestamp = datetime.utcnow()
    msg = _("Worker heartbeat from '{name}' at time {timestamp}").format(timestamp=timestamp,
                                                                         name=worker_name)
    _logger.debug(msg)

    Worker.objects(name=worker_name).update_one(set__last_heartbeat=timestamp,
                                                upsert=True)

    # Measure once so that the duration reported below is exactly the value that was compared
    # against the heartbeat interval.
    elapsed = datetime.utcnow() - start
    if elapsed > timedelta(seconds=PULP_PROCESS_HEARTBEAT_INTERVAL):
        msg = _("Worker {name} heartbeat time {time}s exceeds heartbeat interval. Consider "
                "adjusting the worker_timeout setting.").format(time=elapsed.total_seconds(),
                                                                name=worker_name)
        # Logger.warn is a deprecated alias of Logger.warning
        _logger.warning(msg)
Пример #3
0
def _delete_worker(name, normal_shutdown=False):
    """
    Delete a worker and clean up everything that was recorded for it.

    In order, this removes the Worker document with the given name, removes all ReservedResource
    documents for that worker, cancels every TaskStatus assigned to the worker whose state is in
    constants.CALL_INCOMPLETE_STATES, and asks common_utils to delete the worker's working
    directory.

    When normal_shutdown is False (the default) an error level message is logged first; for any
    other value nothing is logged.

    :param name:            The name of the worker you wish to delete.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shutdown normally, False otherwise.  Defaults to
                            False.
    :type normal_shutdown:  bool
    """
    if normal_shutdown is False:
        _logger.error(
            _('The worker named %(name)s is missing. Canceling the tasks in its queue.')
            % {'name': name}
        )

    # Worker document
    Worker.objects(name=name).delete()

    # Reservations made for the worker
    ReservedResource.objects(worker_name=name).delete()

    # Tasks assigned to the worker that have not completed
    unfinished = TaskStatus.objects(
        worker_name=name, state__in=constants.CALL_INCOMPLETE_STATES)
    for status in unfinished:
        cancel(status['task_id'])

    # Working directory
    common_utils.delete_worker_working_directory(name)
Пример #4
0
def get_resource_manager_lock(name):
    """
    Block until this process has saved the resource manager lock.

    Each attempt first deletes ResourceManagerLock documents whose timestamp is at or before
    now minus constants.PULP_PROCESS_TIMEOUT_INTERVAL seconds, then upserts a Worker record for
    this process so that the waiting resource manager shows up as a worker, and finally tries to
    save the lock. If saving raises mongoengine.NotUniqueError, the function sleeps for
    constants.PULP_PROCESS_HEARTBEAT_INTERVAL seconds and tries again.

    Acquiring the lock is logged at the debug level. If at least one attempt failed before the
    lock was acquired, the first failure is logged at the info level (hot spare) and the eventual
    acquisition is additionally logged as a failover warning.

    :param name:   The hostname of the worker
    :type  name:   basestring
    """
    assert name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME)

    lock = ResourceManagerLock(name=name)

    # Becomes True after the first failed attempt to save the lock
    waited_for_lock = False

    while True:
        now = dateutils.ensure_tz(datetime.utcnow())
        stale_cutoff = now - timedelta(seconds=constants.PULP_PROCESS_TIMEOUT_INTERVAL)

        # Remove locks that have not been refreshed since the cutoff
        ResourceManagerLock.objects(timestamp__lte=stale_cutoff).delete()

        # Create / update the worker record so that Pulp knows we exist
        Worker.objects(name=name).update_one(set__last_heartbeat=datetime.utcnow(), upsert=True)
        try:
            lock.timestamp = now
            lock.save()

            _logger.debug(
                _("Resource manager '%s' has acquired the resource manager lock") % name)

            if waited_for_lock:
                _logger.warning(
                    _("Failover occurred: '%s' is now the primary resource manager") % name)

            return
        except mongoengine.NotUniqueError:
            # Only log the message the first time
            if not waited_for_lock:
                hot_spare = _("Hot spare pulp_resource_manager instance '%(name)s' detected.")
                _logger.info(hot_spare % {'name': name})
                waited_for_lock = True

            time.sleep(constants.PULP_PROCESS_HEARTBEAT_INTERVAL)
Пример #5
0
    def test_deletes_workers(self, mock_worker, mock_delete_worker):
        """
        Only the worker whose heartbeat is old should be passed to _delete_worker.

        The mocked Worker.objects.all() returns one worker whose last heartbeat is 400 seconds
        old and one whose heartbeat is current.
        """
        mock_worker.objects.all.return_value = [
            Worker(name='name1', last_heartbeat=datetime.utcnow() - timedelta(seconds=400)),
            Worker(name='name2', last_heartbeat=datetime.utcnow()),
        ]

        scheduler.CeleryProcessTimeoutMonitor().check_celery_processes()

        # make sure _delete_worker is only called for the old worker. assert_has_calls() would
        # also pass if 'name2' had been deleted as well, so require exactly one call.
        mock_delete_worker.assert_called_once_with('name1')
Пример #6
0
    def test_deletes_workers(self, mock_worker, mock_delete_worker):
        """
        check_workers() should pass every worker returned by the mocked query to _delete_worker.
        """
        first = Worker('name1', datetime.utcnow())
        second = Worker('name2', datetime.utcnow())
        mock_worker.objects.return_value = [first, second]

        scheduler.WorkerTimeoutMonitor().check_workers()

        # both workers returned by the query are expected to have been deleted, in order
        expected_calls = [mock.call(worker_name) for worker_name in ('name1', 'name2')]
        mock_delete_worker.assert_has_calls(expected_calls)
Пример #7
0
    def test_resource_not_in_resource_map(self):
        """
        Calling _release_resource() with an id that matches nothing in the database must not
        raise and must leave both the workers and the reserved resources untouched.
        """
        # Two workers ...
        first_worker = Worker(WORKER_1, datetime.utcnow())
        first_worker.save()
        second_worker = Worker(WORKER_2, datetime.utcnow())
        second_worker.save()
        # ... each holding one reservation
        first_reservation = ReservedResource(str(uuid.uuid4()), first_worker.name, 'resource_1')
        first_reservation.save()
        second_reservation = ReservedResource(str(uuid.uuid4()), second_worker.name, 'resource_2')
        second_reservation.save()

        # Nothing in the database matches this id, so this should be a no-op
        tasks._release_resource('made_up_resource_id')

        # Both workers must still be there
        self.assertEqual(Worker.objects().count(), 2)
        for saved_worker in (first_worker, second_worker):
            self.assertTrue(Worker.objects().get(name=saved_worker.name))

        # Both reservations must still be there, with their original values
        self.assertEqual(ReservedResource.objects.count(), 2)
        expectations = (
            (first_reservation, 'resource_1'),
            (second_reservation, 'resource_2'),
        )
        for reservation, resource_id in expectations:
            stored = ReservedResource.objects.get(task_id=reservation.task_id)
            self.assertEqual(stored['worker_name'], reservation.worker_name)
            self.assertEqual(stored['resource_id'], resource_id)
Пример #8
0
    def test_logs_resource_manager_missing(self, mock__logger, mock_worker, mock_delete_worker):
        """
        With no resource manager among the known workers, exactly one error must be logged.
        """
        scheduler_record = Worker(name=constants.SCHEDULER_WORKER_NAME,
                                  last_heartbeat=datetime.utcnow())
        other_record = Worker(name='name2', last_heartbeat=datetime.utcnow())
        mock_worker.objects.all.return_value = [scheduler_record, other_record]

        scheduler.CeleryProcessTimeoutMonitor().check_celery_processes()

        # The text has to match the logged message exactly, including its spelling
        expected_message = (
            'There are 0 pulp_resource_manager processes running. Pulp will not operate '
            'correctly without at least one pulp_resource_mananger process running.'
        )
        mock__logger.error.assert_called_once_with(expected_message)
Пример #9
0
Файл: app.py Проект: alexxa/pulp
def get_resource_manager_lock(name):
    """
    Keep trying to save the resource manager lock until it succeeds.

    Before every attempt, ResourceManagerLock documents whose timestamp is at or before now minus
    PULP_PROCESS_TIMEOUT_INTERVAL seconds are deleted, and the Worker record for this process is
    upserted with a fresh heartbeat so that a waiting resource manager is still recorded as a
    worker. When saving the lock raises mongoengine.NotUniqueError, the function sleeps for
    PULP_PROCESS_HEARTBEAT_INTERVAL seconds before the next attempt.

    The first failed attempt is logged at the info level. A successful acquisition is logged at
    the debug level, plus a failover warning if it was not the first attempt.

    :param name:   The hostname of the worker
    :type  name:   basestring
    """
    assert name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME)

    lock = ResourceManagerLock(name=name)

    # True until one attempt to save the lock has failed
    first_attempt = True

    while True:
        now = dateutils.ensure_tz(datetime.utcnow())
        expiry = now - timedelta(seconds=PULP_PROCESS_TIMEOUT_INTERVAL)
        ResourceManagerLock.objects(timestamp__lte=expiry).delete()

        # Create / update the worker record so that Pulp knows we exist
        worker_record = Worker.objects(name=name)
        worker_record.update_one(set__last_heartbeat=datetime.utcnow(), upsert=True)
        try:
            lock.timestamp = now
            lock.save()

            acquired = _("Resource manager '%s' has acquired the resource manager lock") % name
            _logger.debug(acquired)

            if not first_attempt:
                notice = _("Failover occurred: '%s' is now the primary resource manager") % name
                _logger.warning(notice)

            return
        except mongoengine.NotUniqueError:
            # Only log the message the first time
            if first_attempt:
                spare = _("Hot spare pulp_resource_manager instance '%(name)s' detected.")
                _logger.info(spare % {'name': name})
                first_attempt = False

            time.sleep(PULP_PROCESS_HEARTBEAT_INTERVAL)
Пример #10
0
    def test_resource_in_resource_map(self):
        """
        Releasing an existing reservation through _release_resource() must remove that
        reservation from the database and leave the other one unchanged.
        """
        # Two workers sharing the same heartbeat time
        heartbeat = datetime.utcnow()
        worker_1 = Worker(name=WORKER_1, last_heartbeat=heartbeat)
        worker_1.save()
        worker_2 = Worker(name=WORKER_2, last_heartbeat=heartbeat)
        worker_2.save()

        # One reservation per worker
        kept = ReservedResource(task_id=str(uuid.uuid4()), worker_name=worker_1.name,
                                resource_id='resource_1')
        kept.save()
        released = ReservedResource(task_id=str(uuid.uuid4()), worker_name=worker_2.name,
                                    resource_id='resource_2')
        released.save()

        # Release the second reservation, identified by its task id
        tasks._release_resource(released.task_id)

        # Only the first reservation should remain, with its original values
        self.assertEqual(ReservedResource.objects.count(), 1)
        remaining = ReservedResource.objects.get(task_id=kept.task_id)
        self.assertEqual(remaining['worker_name'], kept.worker_name)
        self.assertEqual(remaining['resource_id'], 'resource_1')
Пример #11
0
    def test_debug_logging(self, mock__logger, mock_worker, mock_delete_worker):
        """
        check_celery_processes() should log the "checking" message and the per-type process
        counts at the debug level.
        """
        known_workers = [
            Worker(name='name1', last_heartbeat=datetime.utcnow() - timedelta(seconds=400)),
            Worker(name='name2', last_heartbeat=datetime.utcnow()),
            Worker(name=RESOURCE_MANAGER_WORKER_NAME, last_heartbeat=datetime.utcnow()),
            Worker(name=SCHEDULER_WORKER_NAME, last_heartbeat=datetime.utcnow()),
        ]
        mock_worker.objects.all.return_value = known_workers

        scheduler.CeleryProcessTimeoutMonitor().check_celery_processes()

        checking_message = (
            'Checking if pulp_workers, pulp_celerybeat, or '
            'pulp_resource_manager processes are missing for more than 300 seconds'
        )
        summary_message = (
            '1 pulp_worker processes, 1 pulp_celerybeat processes, '
            'and 1 pulp_resource_manager processes'
        )
        mock__logger.debug.assert_has_calls(
            [mock.call(checking_message), mock.call(summary_message)])
Пример #12
0
 def test_get_worker_for_reservation_breaks_out_of_loop(self):
     """
     When a worker for the reservation is returned right away, _queue_reserved_task() should
     neither ask for an unreserved worker nor sleep.
     """
     reserved_worker = Worker('worker1', datetime.utcnow())
     self.mock_get_worker_for_reservation.return_value = reserved_worker

     tasks._queue_reserved_task('task_name', 'my_task_id', 'my_resource_id', [1, 2], {'a': 2})

     self.assertFalse(self.mock_get_unreserved_worker.called)
     self.assertFalse(self.mock_time.sleep.called)
Пример #13
0
def _get_unreserved_worker():
    """
    Pick a Worker that currently has no reserved_resource entries.

    Only worker names accepted by _is_worker are considered. Which of several unreserved workers
    is returned is not specified.

    :raises NoWorkers: If every eligible worker has reserved_resource entries associated with it.

    :returns:          A Worker instance that has no reserved_resource entries associated
                       with it.
    :rtype:            pulp.server.db.model.resources.Worker
    """
    # Index every known Worker by its name
    workers_by_name = {}
    for worker in Worker.objects():
        workers_by_name[worker['name']] = worker
    candidate_names = workers_by_name.keys()

    # Names of the workers that hold at least one reservation
    reserved = set(entry['worker_name'] for entry in ReservedResource.objects.all())

    # Keep the names that may be assigned work and drop the ones that are reserved
    available = set(filter(_is_worker, candidate_names)) - reserved

    if not available:
        # All workers are reserved
        raise NoWorkers()
    return workers_by_name[available.pop()]
Пример #14
0
 def test_dispatches_inner_task(self):
     """
     _queue_reserved_task() should dispatch the named task to the reserved worker, forwarding
     the positional and keyword arguments and the task id.
     """
     reserved_worker = Worker(name='worker1', last_heartbeat=datetime.utcnow())
     self.mock_get_worker_for_reservation.return_value = reserved_worker

     tasks._queue_reserved_task('task_name', 'my_task_id', 'my_resource_id', [1, 2], {'a': 2})

     inner_task = self.mock_celery.tasks['task_name']
     inner_task.apply_async.assert_called_once_with(
         1, 2, a=2, routing_key='worker1', task_id='my_task_id', exchange='C.dq')
Пример #15
0
 def test_dispatches__release_resource(self):
     """
     _queue_reserved_task() should dispatch _release_resource for the task id to the same
     worker that received the reservation.
     """
     reserved_worker = Worker(name='worker1', last_heartbeat=datetime.utcnow())
     self.mock_get_worker_for_reservation.return_value = reserved_worker

     tasks._queue_reserved_task('task_name', 'my_task_id', 'my_resource_id', [1, 2], {'a': 2})

     release_task = self.mock__release_resource
     release_task.apply_async.assert_called_once_with(
         ('my_task_id', ), routing_key='worker1', exchange='C.dq')
Пример #16
0
def _get_unreserved_worker():
    """
    Return a Worker instance that has no reserved_resource entries associated with it.

    Worker names rejected by _is_worker are never returned. When more than one worker
    qualifies, which one is returned is not specified.

    :raises NoWorkers: If all eligible workers have reserved_resource entries associated with
                       them.

    :returns:          The Worker instance that has no reserved_resource
                       entries associated with it.
    :rtype:            pulp.server.db.model.resources.Worker
    """
    # Map each worker name to its Worker object
    all_workers = dict((record['name'], record) for record in Worker.objects())
    all_names = all_workers.keys()

    # Worker names that appear in at least one reservation
    busy = set(reservation['worker_name'] for reservation in ReservedResource.objects.all())

    # Names that may be assigned work, minus the ones that already hold a reservation
    idle = set(filter(_is_worker, all_names)) - busy

    try:
        chosen = idle.pop()
    except KeyError:
        # Popping from an empty set: all workers are reserved
        raise NoWorkers()
    return all_workers[chosen]
Пример #17
0
    def check_workers(self):
        """
        Look for missing workers and clean up after each one that is found.

        A worker counts as missing when its last_heartbeat is older than
        utcnow() - WORKER_TIMEOUT_SECONDS. The heartbeat times are stored in native UTC, so this
        is a comparable datetime.

        For each missing worker found, _delete_worker is called with the worker's name.

        This method logs at the debug and error levels.
        """
        _logger.debug(
            _('Looking for workers missing for more than %s seconds')
            % self.WORKER_TIMEOUT_SECONDS
        )

        cutoff = datetime.utcnow() - timedelta(seconds=self.WORKER_TIMEOUT_SECONDS)
        for missing in Worker.objects(last_heartbeat__lt=cutoff):
            report = _("Workers '%s' has gone missing, removing from list of workers")
            _logger.error(report % missing.name)
            _delete_worker(missing.name)
Пример #18
0
    def test_update_repo_and_plugins(self, distributor_update,
                                     mock_get_worker_for_reservation):
        """
        Tests the aggregate call to update a repo and its plugins.

        A repo with one importer and two distributors is created, then the repo's display name,
        the importer config and the config of only one of the distributors are updated in a
        single call. The test verifies the returned TaskResult, the stored configs, and that a
        task was spawned for the updated distributor.
        """
        mock_get_worker_for_reservation.return_value = Worker(
            'some_queue', datetime.datetime.now())
        self.manager.create_repo('repo-1', 'Original', 'Original Description')

        importer_manager = manager_factory.repo_importer_manager()
        distributor_manager = manager_factory.repo_distributor_manager()

        importer_manager.set_importer('repo-1', 'mock-importer',
                                      {'key-i1': 'orig-1'})
        distributor_manager.add_distributor('repo-1',
                                            'mock-distributor',
                                            {'key-d1': 'orig-1'},
                                            True,
                                            distributor_id='dist-1')
        distributor_manager.add_distributor('repo-1',
                                            'mock-distributor',
                                            {'key-d2': 'orig-2'},
                                            True,
                                            distributor_id='dist-2')

        # Test
        repo_delta = {'display_name': 'Updated'}
        new_importer_config = {'key-i1': 'updated-1', 'key-i2': 'new-1'}
        new_distributor_configs = {
            'dist-1': {
                'key-d1': 'updated-1'
            },
        }  # only update one of the two distributors

        result = self.manager.update_repo_and_plugins('repo-1', repo_delta,
                                                      new_importer_config,
                                                      new_distributor_configs)

        self.assertTrue(isinstance(result, TaskResult))
        # assertEqual rather than the deprecated assertEquals alias, matching the rest of the test
        self.assertEqual(None, result.error)
        repo = result.return_value

        # Verify
        self.assertEqual(repo['id'], 'repo-1')
        self.assertEqual(repo['display_name'], 'Updated')
        self.assertEqual(repo['description'], 'Original Description')

        importer = importer_manager.get_importer('repo-1')
        self.assertEqual(importer['config'], new_importer_config)

        dist_1 = distributor_manager.get_distributor('repo-1', 'dist-1')
        self.assertEqual(dist_1['config'], new_distributor_configs['dist-1'])

        # The second distributor was not part of the update and must keep its original config
        dist_2 = distributor_manager.get_distributor('repo-1', 'dist-2')
        self.assertEqual(dist_2['config'], {'key-d2': 'orig-2'})

        # There should have been a spawned task for the new distributor config
        expected_task_id = TaskStatus.objects.get(
            tags='pulp:repository_distributor:dist-1')['task_id']
        self.assertEqual(result.spawned_tasks, [{'task_id': expected_task_id}])
Пример #19
0
 def test_get_unreserved_worker_breaks_out_of_loop(self):
     """
     When no worker holds the reservation but an unreserved worker is available,
     _queue_reserved_task() should not sleep.
     """
     self.mock_get_worker_for_reservation.side_effect = NoWorkers()
     unreserved_worker = Worker(name='worker1', last_heartbeat=datetime.utcnow())
     self.mock_get_unreserved_worker.return_value = unreserved_worker

     tasks._queue_reserved_task('task_name', 'my_task_id', 'my_resource_id', [1, 2], {'a': 2})

     self.assertFalse(self.mock_time.sleep.called)
Пример #20
0
 def test_creates_and_saves_reserved_resource(self):
     """
     _queue_reserved_task() should build one ReservedResource for the task, worker and resource
     and save it.
     """
     reserved_worker = Worker(name='worker1', last_heartbeat=datetime.utcnow())
     self.mock_get_worker_for_reservation.return_value = reserved_worker

     tasks._queue_reserved_task('task_name', 'my_task_id', 'my_resource_id', [1, 2], {'a': 2})

     reservation_cls = self.mock_reserved_resource
     reservation_cls.assert_called_once_with(
         task_id='my_task_id', worker_name='worker1', resource_id='my_resource_id')
     reservation_cls.return_value.save.assert_called_once_with()
Пример #21
0
def _delete_worker(name, normal_shutdown=False):
    """
    Delete the Worker with the given name and clean up what was recorded for it.

    When normal_shutdown is False (the default) an error level message is logged; for any other
    value an info level message about the cleanup is logged instead.

    The Worker document and all ReservedResource documents for the worker are deleted. If the
    name starts with RESOURCE_MANAGER_WORKER_NAME the matching ResourceManagerLock is deleted,
    and if it starts with SCHEDULER_WORKER_NAME the matching CeleryBeatLock is deleted.

    Every TaskStatus assigned to the worker whose state is in constants.CALL_INCOMPLETE_STATES
    is canceled with revoke_task=False.

    :param name:            The name of the worker you wish to delete.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shutdown normally, False otherwise.  Defaults to
                            False.
    :type normal_shutdown:  bool
    """
    if normal_shutdown is False:
        missing = _('The worker named %(name)s is missing. Canceling the tasks in its queue.')
        _logger.error(missing % {'name': name})
    else:
        _logger.info(_("Cleaning up shutdown worker '%s'.") % name)

    # Worker document, then the reservations made for the worker
    Worker.objects(name=name).delete()
    ReservedResource.objects(worker_name=name).delete()

    # Resource managers and schedulers each have an associated lock that has to go as well
    is_resource_manager = name.startswith(RESOURCE_MANAGER_WORKER_NAME)
    if is_resource_manager:
        ResourceManagerLock.objects(name=name).delete()

    is_scheduler = name.startswith(SCHEDULER_WORKER_NAME)
    if is_scheduler:
        CeleryBeatLock.objects(name=name).delete()

    # Cancel all of the tasks that were assigned to this worker's queue
    unfinished = TaskStatus.objects(worker_name=name,
                                    state__in=constants.CALL_INCOMPLETE_STATES)
    for status in unfinished:
        cancel(status['task_id'], revoke_task=False)
Пример #22
0
def get_resource_manager_lock(name):
    """
    Tries to acquire the resource manager lock. If the lock cannot be acquired immediately, it
    will wait until the currently active instance becomes unavailable, at which point the worker
    cleanup routine will clear the lock for us to acquire. A worker record will be created so that
    the waiting resource manager will appear in the Status API. We override the SIGTERM signal
    handler so that the worker record will be immediately cleaned up if the process is killed
    while in this state.

    Saving the lock is retried every constants.CELERY_CHECK_INTERVAL seconds for as long as it
    raises mongoengine.NotUniqueError. Only the first failed attempt is logged.

    :param name:   The hostname of the worker
    :type  name:   basestring
    """
    assert name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME)

    lock = ResourceManagerLock(name=name)

    with custom_sigterm_handler(name):
        # Whether this is the first lock availability check for this instance
        _first_check = True

        while True:
            # Create / update the worker record so that Pulp knows we exist
            Worker.objects(name=name).update_one(
                set__last_heartbeat=datetime.utcnow(), upsert=True)
            try:
                lock.save()

                # Translate the template first and interpolate afterwards: looking up the
                # already-formatted string would never match an entry in the message catalog.
                msg = _("Resource manager '%s' has acquired the resource manager lock") % name
                _logger.info(msg)
                break
            except mongoengine.NotUniqueError:
                # Only log the message the first time
                if _first_check:
                    msg = _(
                        "Resource manager '%s' attempted to acquire the the resource manager "
                        "lock but was unable to do so. It will retry every %d seconds until "
                        "the lock can be acquired.") % (name, constants.CELERY_CHECK_INTERVAL)
                    _logger.info(msg)
                    _first_check = False

                time.sleep(constants.CELERY_CHECK_INTERVAL)
Пример #23
0
def _delete_worker(name, normal_shutdown=False):
    """
    Remove a worker from the database and clean up its reservations, locks and tasks.

    An error level message is logged when normal_shutdown is False (the default); for any other
    value an info level message about the cleanup is logged instead.

    After the Worker document and the worker's ReservedResource documents are deleted, the lock
    document matching the worker's role is deleted too: ResourceManagerLock for names starting
    with RESOURCE_MANAGER_WORKER_NAME and CeleryBeatLock for names starting with
    SCHEDULER_WORKER_NAME.

    Finally, every TaskStatus assigned to the worker whose state is in
    constants.CALL_INCOMPLETE_STATES is canceled with revoke_task=False.

    :param name:            The name of the worker you wish to delete.
    :type  name:            basestring
    :param normal_shutdown: True if the worker shutdown normally, False otherwise.  Defaults to
                            False.
    :type normal_shutdown:  bool
    """
    if normal_shutdown is False:
        _logger.error(
            _('The worker named %(name)s is missing. Canceling the tasks in its queue.')
            % {'name': name}
        )
    else:
        cleanup_notice = _("Cleaning up shutdown worker '%s'.") % name
        _logger.info(cleanup_notice)

    # Delete the worker document
    Worker.objects(name=name).delete()

    # Delete all reserved_resource documents for the worker
    ReservedResource.objects(worker_name=name).delete()

    # Resource managers and schedulers each own a lock document that must be deleted as well
    role_locks = (
        (RESOURCE_MANAGER_WORKER_NAME, ResourceManagerLock),
        (SCHEDULER_WORKER_NAME, CeleryBeatLock),
    )
    for name_prefix, lock_model in role_locks:
        if name.startswith(name_prefix):
            lock_model.objects(name=name).delete()

    # Cancel all of the tasks that were assigned to this worker's queue
    pending = TaskStatus.objects(
        worker_name=name, state__in=constants.CALL_INCOMPLETE_STATES)
    for task in pending:
        cancel(task['task_id'], revoke_task=False)
Пример #24
0
Файл: app.py Проект: pulp/pulp
def get_resource_manager_lock(name):
    """
    Keep trying to save the resource manager lock until it succeeds.

    Before every attempt the Worker record for this process is upserted with a fresh heartbeat,
    so that the waiting resource manager is still recorded as a worker. When saving the lock
    raises mongoengine.NotUniqueError, the function sleeps for constants.CELERY_CHECK_INTERVAL
    seconds and tries again. The first failed attempt and the eventual acquisition are logged at
    the info level.

    :param name:   The hostname of the worker
    :type  name:   basestring
    """
    assert name.startswith(constants.RESOURCE_MANAGER_WORKER_NAME)

    lock = ResourceManagerLock(name=name)

    # Set once the "unable to acquire" message has been logged, so it is logged only one time
    already_reported = False

    while True:
        # Create / update the worker record so that Pulp knows we exist
        heartbeat = datetime.utcnow()
        Worker.objects(name=name).update_one(set__last_heartbeat=heartbeat, upsert=True)
        try:
            lock.save()

            _logger.info(_("Resource manager '%s' has acquired the resource manager lock") % name)
            break
        except mongoengine.NotUniqueError:
            if not already_reported:
                template = _(
                    "Resource manager '%(name)s' attempted to acquire the the resource manager "
                    "lock but was unable to do so. It will retry every %(interval)d seconds "
                    "until the lock can be acquired.")
                retry_details = {'name': name, 'interval': constants.CELERY_CHECK_INTERVAL}
                _logger.info(template % retry_details)
                already_reported = True

            time.sleep(constants.CELERY_CHECK_INTERVAL)
Пример #25
0
def handle_worker_heartbeat(event):
    """
    Celery event handler for 'worker-heartbeat' events.

    The event is passed to _parse_and_log_event, and the resulting worker name is looked up
    among the existing Worker objects. If no Worker exists yet, the discovery is logged at the
    info level. The Worker's last_heartbeat is then set to the event's 'local_received' value
    with an upsert, so a missing Worker entry is created.

    :param event: A celery event to handle.
    :type event: dict
    """
    event_info = _parse_and_log_event(event)
    worker_name = event_info['worker_name']

    known_worker = Worker.objects(name=worker_name).first()
    if not known_worker:
        _logger.info(_("New worker '%(worker_name)s' discovered") % event_info)

    Worker.objects(name=worker_name).update_one(
        set__last_heartbeat=event_info['local_received'], upsert=True)
Пример #26
0
def handle_worker_heartbeat(event):
    """
    Celery event handler that stores the heartbeat carried by a 'worker-heartbeat' event.

    _parse_and_log_event() turns the raw event into a dict; its 'worker_name' entry selects
    the Worker to touch and its 'local_received' entry becomes the new last_heartbeat. When no
    Worker with that name is found, an info-level "discovered" message is logged before the
    upserting update is issued.

    :param event: A celery event to handle.
    :type event: dict
    """
    heartbeat_info = _parse_and_log_event(event)
    reporting_worker = heartbeat_info['worker_name']

    already_known = Worker.objects(name=reporting_worker).first()
    if not already_known:
        discovery_msg = _("New worker '%(worker_name)s' discovered") % heartbeat_info
        _logger.info(discovery_msg)

    # The update is an upsert, so it is issued regardless of the lookup result above
    Worker.objects(name=reporting_worker).update_one(
        upsert=True, set__last_heartbeat=heartbeat_info['local_received'])
Пример #27
0
    def test_debug_logging(self, mock__logger, mock_worker, mock_delete_worker):
        """
        Assert the debug messages emitted by CeleryProcessTimeoutMonitor.check_celery_processes().

        The mocked Worker.objects.all() yields four workers: 'name1', whose heartbeat is
        PULP_PROCESS_TIMEOUT_INTERVAL + PULP_PROCESS_HEARTBEAT_INTERVAL seconds old, plus
        'name2', the resource manager and the scheduler, which all carry a current heartbeat.
        """
        current_time = datetime.utcnow()
        stale_age = timedelta(
            seconds=constants.PULP_PROCESS_TIMEOUT_INTERVAL +
            constants.PULP_PROCESS_HEARTBEAT_INTERVAL)

        known_workers = [Worker(name='name1', last_heartbeat=current_time - stale_age)]
        for fresh_name in ('name2', constants.RESOURCE_MANAGER_WORKER_NAME,
                           constants.SCHEDULER_WORKER_NAME):
            known_workers.append(Worker(name=fresh_name, last_heartbeat=current_time))
        mock_worker.objects.all.return_value = known_workers

        scheduler.CeleryProcessTimeoutMonitor().check_celery_processes()

        checking_msg = (
            'Checking if pulp_workers, pulp_celerybeat, or pulp_resource_manager processes are '
            'missing for more than %d seconds' % constants.PULP_PROCESS_TIMEOUT_INTERVAL)
        summary_msg = ('1 pulp_worker processes, 1 pulp_celerybeat processes, '
                       'and 1 pulp_resource_manager processes')
        expected_calls = [mock.call(checking_msg), mock.call(summary_msg)]
        mock__logger.debug.assert_has_calls(expected_calls)
Пример #28
0
def handle_worker_heartbeat(worker_name):
    """
    Store a fresh heartbeat timestamp for the worker called worker_name.

    A lookup for an existing Worker is done first; when none is found the discovery is logged
    at the info level. The current UTC time is then logged at the debug level and written to
    the Worker's last_heartbeat with an upsert, so the update is issued in both cases.

    :param worker_name: The hostname of the worker
    :type  worker_name: basestring
    """
    if not Worker.objects(name=worker_name).first():
        _logger.info(_("New worker '%s' discovered") % worker_name)

    # One timestamp is shared by the debug message and the stored heartbeat
    heartbeat_time = datetime.utcnow()
    debug_template = _("Worker heartbeat from '{name}' at time {timestamp}")
    _logger.debug(debug_template.format(name=worker_name, timestamp=heartbeat_time))

    worker_query = Worker.objects(name=worker_name)
    worker_query.update_one(upsert=True, set__last_heartbeat=heartbeat_time)
Пример #29
0
def get_worker_for_reservation(resource_id):
    """
    Look up the Worker named by the first ReservedResource that matches resource_id.

    :param resource_id:    The name of the resource you wish to reserve for your task.
    :type resource_id:     basestring

    :raises NoWorkers:     If no ReservedResource entry exists for resource_id.

    :returns:              The first Worker whose name equals the reservation's worker_name.
                           NOTE(review): the Worker lookup uses first(), so this is presumably
                           None when the named Worker no longer exists - confirm with callers.
    :rtype:                pulp.server.db.model.resources.Worker
    """
    matching_reservation = ReservedResource.objects(resource_id=resource_id).first()
    if not matching_reservation:
        raise NoWorkers()

    holder_name = matching_reservation['worker_name']
    return Worker.objects(name=holder_name).first()
Пример #30
0
def get_worker_for_reservation(resource_id):
    """
    Return the Worker that the first reservation found for resource_id is assigned to.

    :param resource_id:    The name of the resource you wish to reserve for your task.
    :type resource_id:     basestring

    :raises NoWorkers:     When there is no ReservedResource entry with this resource_id.

    :returns:              The first Worker matching the reservation's worker_name.
                           NOTE(review): presumably None if that Worker has been removed,
                           since the lookup uses first() - verify against callers.
    :rtype:                pulp.server.db.model.resources.Worker
    """
    reservations = ReservedResource.objects(resource_id=resource_id)
    found = reservations.first()
    if not found:
        raise NoWorkers()

    workers = Worker.objects(name=found['worker_name'])
    return workers.first()
Пример #31
0
def get_worker_for_reservation_list(resources):
    """
    Return a Worker suitable for a task that needs the reservations named in 'resources'.

    The reservations for 'resources' are examined repeatedly:

    * if exactly one worker holds reservations for any of them, that Worker is returned;
    * if no worker holds any of them, the result of _get_unreserved_worker() is returned;
    * if several workers hold them, or _get_unreserved_worker() raises NoWorkers, the
      function sleeps for 0.25 seconds and tries again.

    This call therefore blocks until the request can be fulfilled.

    :param resources:   A list of the names of the resources you wish to reserve for your task.
    :type resources:    list

    :returns:           The single Worker currently holding at least one of 'resources', or
                        the Worker returned by _get_unreserved_worker() when none are held.
    :rtype:             pulp.server.db.model.resources.Worker
    """
    # Hand 'resources' to the logger as an argument instead of %-formatting it inline: inline
    # formatting raises TypeError for a multi-element tuple and unwraps a one-element tuple.
    _logger.debug('get_worker_for_reservation_list [%s]', resources)

    # We leave this loop once we find a Worker to return - otherwise, sleep and try again
    while True:
        # Distinct names of the workers currently holding any of the requested resources
        reservation_workers = set(
            reservation['worker_name']
            for reservation in ReservedResource.objects(resource_id__in=resources)
        )
        _logger.debug('...num-RR is %d', len(reservation_workers))

        if len(reservation_workers) == 1:
            # Exactly one worker holds any of the desired resources
            _logger.debug('...one-holds')
            return Worker.objects(name=next(iter(reservation_workers))).first()

        if not reservation_workers:
            # No worker holds any of the desired resources
            _logger.debug('...zero-holds')
            try:
                return _get_unreserved_worker()
            except NoWorkers:
                # Every worker is busy right now; fall through to the sleep and retry
                _logger.debug('...unresolved NoWorkers - WAIT')
        else:
            # The resources are spread over several workers; wait for that to change
            _logger.debug('...multiple-holds - WAIT')

        time.sleep(0.25)
Пример #32
0
    def check_workers(self):
        """
        Look for missing workers and clean up after each one that has gone missing.

        A worker counts as missing when its last_heartbeat is older than
        utcnow() - WORKER_TIMEOUT_SECONDS. The cutoff is computed with datetime.utcnow(), so
        the comparison assumes heartbeats are stored as naive UTC datetimes.

        For each missing worker an error is logged and _delete_worker() is called with the
        worker's name.

        This method logs at the debug and error levels.
        """
        timeout_seconds = self.WORKER_TIMEOUT_SECONDS
        _logger.debug(_('Looking for workers missing for more than %s seconds') % timeout_seconds)

        heartbeat_cutoff = datetime.utcnow() - timedelta(seconds=timeout_seconds)
        for missing_worker in Worker.objects(last_heartbeat__lt=heartbeat_cutoff):
            missing_name = missing_worker.name
            _logger.error(
                _("Workers '%s' has gone missing, removing from list of workers") % missing_name)
            _delete_worker(missing_name)
Пример #33
0
    def get(self, request, task_id):
        """
        Return a response containing a single task.

        When the serialized task has a 'worker_name', a 'queue' entry holding that worker's
        queue_name is added to it before the response is generated.

        :param request: WSGI request object
        :type  request: django.core.handlers.wsgi.WSGIRequest
        :param task_id: The ID of the task to retrieve
        :type  task_id: basestring

        :return: Response containing a serialized dict of the requested task
        :rtype : django.http.HttpResponse
        :raises MissingResource: if task is not found
        """
        try:
            task_status = TaskStatus.objects.get(task_id=task_id)
        except DoesNotExist:
            raise MissingResource(task_id)

        serialized = task_serializer(task_status)
        if 'worker_name' in serialized:
            # This Worker is never saved here; it is built only to read its queue_name
            transient_worker = Worker(name=serialized['worker_name'],
                                      last_heartbeat=datetime.now())
            serialized['queue'] = transient_worker.queue_name
        return generate_json_response_with_pulp_encoder(serialized)
Пример #34
0
 def tearDown(self):
     """Delete all Worker, ReservedResource and TaskStatus documents."""
     for document_class in (Worker, ReservedResource, TaskStatus):
         # No filter is given, so every document of the class is removed
         document_class.objects().delete()
Пример #35
0
 def tearDown(self):
     """Empty the Worker, ReservedResource and TaskStatus collections."""
     collections_to_clear = (Worker, ReservedResource, TaskStatus)
     for model in collections_to_clear:
         # An unfiltered delete() removes every document of the model
         model.objects().delete()
Пример #36
0
def get_workers():
    """
    Fetch every Worker, with no filtering applied.

    :returns:          the unfiltered result of Worker.objects(), i.e. all workers with
                       their heartbeats
    :rtype:            whatever Worker.objects() returns - presumably a mongoengine QuerySet
                       rather than a plain list (TODO confirm)
    """
    all_workers = Worker.objects()
    return all_workers
Пример #37
0
    def test_resource_not_in_resource_map(self):
        """
        Call _release_resource() with a resource id that has no reservation. The call must not
        raise, and both the Worker and the ReservedResource collections must be left as they
        were.
        """
        resource_ids = ('resource_1', 'resource_2')

        # Set up two workers
        workers = []
        for worker_name in (WORKER_1, WORKER_2):
            worker = Worker(name=worker_name, last_heartbeat=datetime.utcnow())
            worker.save()
            workers.append(worker)

        # Set up one resource reservation per worker
        reservations = []
        for worker, resource_id in zip(workers, resource_ids):
            reservation = ReservedResource(task_id=str(uuid.uuid4()),
                                           worker_name=worker.name, resource_id=resource_id)
            reservation.save()
            reservations.append(reservation)

        # Releasing an unknown resource should neither raise nor touch either collection
        tasks._release_resource('made_up_resource_id')

        # The workers collection still holds exactly the two workers created above
        self.assertEqual(Worker.objects().count(), 2)
        for worker in workers:
            self.assertTrue(Worker.objects().get(name=worker.name))

        # The reserved resources collection still holds both reservations, unchanged
        self.assertEqual(ReservedResource.objects.count(), 2)
        for reservation, resource_id in zip(reservations, resource_ids):
            stored = ReservedResource.objects.get(task_id=reservation.task_id)
            self.assertEqual(stored['worker_name'], reservation.worker_name)
            self.assertEqual(stored['resource_id'], resource_id)
Пример #38
0
def get_workers():
    """
    Return all known workers without applying any filter.

    :returns:          the result of calling Worker.objects() with no arguments; each worker
                       carries its heartbeat
    :rtype:            presumably a mongoengine QuerySet of Worker documents, not a plain
                       list (TODO confirm)
    """
    unfiltered_workers = Worker.objects()
    return unfiltered_workers