def handle_worker_heartbeat(event):
    """
    Celery event handler for 'worker-heartbeat' events.

    The event is handed to _parse_and_log_event() first. Events coming from the resource manager
    need no further processing. For any other worker, the AvailableQueue collection is searched
    for a record whose _id is the worker's name: when one exists its last_heartbeat is set to the
    event's timestamp, otherwise a new AvailableQueue is created, announced at the info log level,
    and saved.

    :param event: A celery event to handle.
    :type  event: dict
    """
    event_info = _parse_and_log_event(event)

    # Nothing is tracked for the resource manager itself
    if _is_resource_manager(event):
        return

    worker_name = event_info['worker_name']
    criteria = Criteria(filters={'_id': worker_name},
                        fields=('_id', 'last_heartbeat', 'num_reservations'))
    known_workers = list(resources.filter_available_queues(criteria))

    if not known_workers:
        # First heartbeat seen from this worker: record it and announce the discovery
        new_available_queue = AvailableQueue(worker_name, event_info['timestamp'])
        _logger.info(_("New worker '%(worker_name)s' discovered") % event_info)
        new_available_queue.save()
        return

    # Known worker: only the heartbeat timestamp needs refreshing
    AvailableQueue.get_collection().find_and_modify(
        query={'_id': worker_name},
        update={'$set': {'last_heartbeat': event_info['timestamp']}}
    )
def test__reserve_resource_without_existing_reservation(self):
    """
    Exercise _reserve_resource() for a resource that has no reservation in the database yet.

    The only available queue must be picked, a reservation pointing at that queue must be
    written to the database, and the queue's name must be returned.
    """
    # A single available queue to choose from
    only_queue = AvailableQueue(RESERVED_WORKER_1, 0)
    only_queue.save()
    expected_queue_name = RESERVED_WORKER_1 + '.dq'

    assigned_queue = tasks._reserve_resource('resource_1')

    self.assertEqual(assigned_queue, expected_queue_name)
    # The AvailableQueue record should now carry one reservation
    queue_collection = AvailableQueue.get_collection()
    self.assertEqual(queue_collection.count(), 1)
    queue_doc = queue_collection.find_one({'_id': only_queue.name})
    self.assertEqual(queue_doc['num_reservations'], 1)
    # A matching ReservedResource record should have been created
    reservation_collection = ReservedResource.get_collection()
    self.assertEqual(reservation_collection.count(), 1)
    reservation_doc = reservation_collection.find_one({'_id': 'resource_1'})
    self.assertEqual(reservation_doc['assigned_queue'], expected_queue_name)
    self.assertEqual(reservation_doc['num_reservations'], 1)
def test__reserve_resource_with_existing_reservation(self):
    """
    Exercise _reserve_resource() for a resource that already has a reservation in the database.

    The queue recorded in the database must be returned, and the reservation counters of both
    the queue and the resource must go from 1 to 2.
    """
    # One available queue that already carries a single reservation
    heartbeat = datetime.utcnow()
    busy_queue = AvailableQueue(RESERVED_WORKER_1, heartbeat, 1)
    busy_queue.save()
    # A reservation for resource_1 that points at the queue created above
    existing_reservation = ReservedResource('resource_1', busy_queue.name,
                                            busy_queue.num_reservations)
    existing_reservation.save()

    assigned_queue = tasks._reserve_resource('resource_1')

    # The name of the already-assigned queue is what comes back
    self.assertEqual(assigned_queue, RESERVED_WORKER_1)
    # The queue's reservation count should have been bumped to 2
    queue_collection = AvailableQueue.get_collection()
    self.assertEqual(queue_collection.count(), 1)
    queue_doc = queue_collection.find_one({'_id': busy_queue.name})
    self.assertEqual(queue_doc['num_reservations'], 2)
    # The resource's reservation count should have been bumped to 2 as well
    reservation_collection = ReservedResource.get_collection()
    self.assertEqual(reservation_collection.count(), 1)
    reservation_doc = reservation_collection.find_one({'_id': existing_reservation.name})
    self.assertEqual(reservation_doc['assigned_queue'], RESERVED_WORKER_1)
    self.assertEqual(reservation_doc['num_reservations'], 2)
def test__delete_queue(self, logger, cancel, active_queues, mock_add_consumer):
    """
    Deleting a queue must cancel exactly the incomplete Tasks assigned to it and remove the
    queue's record from the database, leaving other queues and their Tasks alone.
    """
    # Register two workers by feeding a heartbeat for each to the worker watcher
    for hostname in (RESERVED_WORKER_1, RESERVED_WORKER_2):
        worker_watcher.handle_worker_heartbeat({
            'timestamp': time.time(),
            'type': 'worker-heartbeat',
            'hostname': hostname,
        })
    # Three tasks live on RESERVED_WORKER_2: task_1 (running) and task_2 (waiting) are
    # incomplete and must get canceled, task_3 is finished and must not. task_4 lives on
    # RESERVED_WORKER_1, which stays around, so it must not be touched either.
    task_fixtures = (
        ('task_1', RESERVED_WORKER_2, CALL_RUNNING_STATE),
        ('task_2', RESERVED_WORKER_2, CALL_WAITING_STATE),
        ('task_3', RESERVED_WORKER_2, CALL_FINISHED_STATE),
        ('task_4', RESERVED_WORKER_1, CALL_RUNNING_STATE),
    )
    for task_id, worker_name, state in task_fixtures:
        TaskStatusManager.create_task_status(task_id, worker_name, state=state)
    # Sanity check on the setup: RESERVED_WORKER_2 has an AvailableQueue record
    queue_collection = AvailableQueue.get_collection()
    self.assertEqual(queue_collection.find({'_id': RESERVED_WORKER_2}).count(), 1)

    # Delete the queue named RESERVED_WORKER_2
    tasks._delete_queue.apply_async(args=(RESERVED_WORKER_2,),
                                    queue=tasks.RESOURCE_MANAGER_QUEUE)

    # cancel() should have run once for task_1 and once for task_2
    self.assertEqual(cancel.call_count, 2)
    # The cancellation order is not guaranteed, so compare the positional arguments as a set
    # (task_3 must not show up in it)
    canceled_args = set(call[1] for call in cancel.mock_calls)
    self.assertEqual(canceled_args, set([('task_1',), ('task_2',)]))
    # The cancellation should have been logged. The logger mock itself is never invoked
    # directly, hence a call_count of 0; calls to its methods are recorded in mock_calls.
    self.assertEqual(logger.call_count, 0)
    first_logged_message = logger.mock_calls[0][1][0]
    self.assertTrue(RESERVED_WORKER_2 in first_logged_message)
    self.assertTrue('Canceling the tasks' in first_logged_message)
    # The deleted queue's record is gone...
    self.assertEqual(queue_collection.find({'_id': RESERVED_WORKER_2}).count(), 0)
    # ...while the record for RESERVED_WORKER_1 remains
    self.assertEqual(queue_collection.find({'_id': RESERVED_WORKER_1}).count(), 1)
def test_babysit_creates_correct_records(self, add_consumer, active_queues):
    """
    Run babysit() against an empty database and verify that it creates one AvailableQueue per
    reserved worker found in the mocked active_queues() data.
    """
    tasks.babysit()

    # babysit() must have asked for the active queues exactly once, without arguments
    active_queues.assert_called_once_with()
    # One AvailableQueue per reserved worker in the mock data
    queue_collection = AvailableQueue.get_collection()
    self.assertEqual(queue_collection.count(), 3)
    # Each of them must exist under its worker name with no reservations
    for worker_name in (RESERVED_WORKER_1, RESERVED_WORKER_2, RESERVED_WORKER_3):
        queue_doc = queue_collection.find_one({'_id': worker_name})
        self.assertEqual(queue_doc['num_reservations'], 0)
    # Reserved worker 3 had no queue assigned, so babysit() should have given it one
    add_consumer.assert_called_once_with(queue=RESERVED_WORKER_3,
                                         destination=(RESERVED_WORKER_3,))
def test__release_resource_task_count_two(self):
    """
    Exercise _release_resource() on a resource whose reservation count is two.

    The count must simply drop to one; the resource must stay in the database.
    """
    # Two available queues with 7 and 2 reservations respectively
    heartbeat = datetime.utcnow()
    first_queue = AvailableQueue(RESERVED_WORKER_1, heartbeat, 7)
    first_queue.save()
    second_queue = AvailableQueue(RESERVED_WORKER_2, heartbeat, 2)
    second_queue.save()
    # One reserved resource per queue, mirroring each queue's reservation count
    first_resource = ReservedResource('resource_1', first_queue.name,
                                      first_queue.num_reservations)
    first_resource.save()
    second_resource = ReservedResource('resource_2', second_queue.name,
                                       second_queue.num_reservations)
    second_resource.save()

    # Both resource_2 and its queue should go from 2 reservations to 1
    tasks._release_resource('resource_2')

    # Check the AvailableQueue records
    queue_collection = AvailableQueue.get_collection()
    self.assertEqual(queue_collection.count(), 2)
    for queue, expected_count in ((first_queue, 7), (second_queue, 1)):
        queue_doc = queue_collection.find_one({'_id': queue.name})
        self.assertEqual(queue_doc['num_reservations'], expected_count)
    # Check the ReservedResource records
    reservation_collection = ReservedResource.get_collection()
    self.assertEqual(reservation_collection.count(), 2)
    for resource, expected_count in ((first_resource, 7), (second_resource, 1)):
        reservation_doc = reservation_collection.find_one({'_id': resource.name})
        self.assertEqual(reservation_doc['assigned_queue'], resource.assigned_queue)
        self.assertEqual(reservation_doc['num_reservations'], expected_count)
def test__release_resource_not_in__resource_map(self):
    """
    Exercise _release_resource() with a resource id that is not in the database.

    The call must not raise, and neither the AvailableQueue collection nor the ReservedResource
    collection may change.
    """
    # Two available queues with 7 and 3 reservations respectively
    first_queue = AvailableQueue(RESERVED_WORKER_1, datetime.utcnow(), 7)
    first_queue.save()
    second_queue = AvailableQueue(RESERVED_WORKER_2, datetime.utcnow(), 3)
    second_queue.save()
    # One reserved resource per queue, mirroring each queue's reservation count
    first_resource = ReservedResource('resource_1', first_queue.name,
                                      first_queue.num_reservations)
    first_resource.save()
    second_resource = ReservedResource('resource_2', second_queue.name,
                                       second_queue.num_reservations)
    second_resource.save()

    # Releasing an unknown resource should be handled gracefully
    tasks._release_resource('made_up_resource_id')

    # The available queues are exactly as they were
    queue_collection = AvailableQueue.get_collection()
    self.assertEqual(queue_collection.count(), 2)
    for queue, expected_count in ((first_queue, 7), (second_queue, 3)):
        queue_doc = queue_collection.find_one({'_id': queue.name})
        self.assertEqual(queue_doc['num_reservations'], expected_count)
    # The reserved resources are exactly as they were
    reservation_collection = ReservedResource.get_collection()
    self.assertEqual(reservation_collection.count(), 2)
    for resource, expected_count in ((first_resource, 7), (second_resource, 3)):
        reservation_doc = reservation_collection.find_one({'_id': resource.name})
        self.assertEqual(reservation_doc['assigned_queue'], resource.assigned_queue)
        self.assertEqual(reservation_doc['num_reservations'], expected_count)
def test_babysit_resets_missing_since_on_reappearing_workers(self, add_consumer, active_queues):
    """
    Simulate an AvailableQueue that went missing in the past by saving it with a missing_since
    of six minutes ago. Its worker is part of the mocked active_queues() data, so babysit() is
    expected to set missing_since back to None. Even though the queue has been missing for more
    than five minutes, it reappears just in time to avoid being deleted.
    """
    went_missing_at = datetime.utcnow() - timedelta(minutes=6)
    reappearing_queue = AvailableQueue(name=RESERVED_WORKER_2, missing_since=went_missing_at)
    reappearing_queue.save()

    tasks.babysit()

    # babysit() must have asked for the active queues exactly once, without arguments
    active_queues.assert_called_once_with()
    # One AvailableQueue per reserved worker in the mock data
    queue_collection = AvailableQueue.get_collection()
    self.assertEqual(queue_collection.count(), 3)
    # The reappearing queue is no longer flagged as missing
    queue_doc = queue_collection.find_one({'_id': RESERVED_WORKER_2})
    self.assertEqual(queue_doc['num_reservations'], 0)
    self.assertEqual(queue_doc['missing_since'], None)
def test__release_resource_task_count_one(self):
    """
    Exercise _release_resource() on a resource whose reservation count is one.

    The resource must be removed from the database and its queue's count must drop to zero.
    """
    # Two available queues with 7 and 1 reservations respectively
    heartbeat = datetime.utcnow()
    first_queue = AvailableQueue(RESERVED_WORKER_1, heartbeat, 7)
    first_queue.save()
    second_queue = AvailableQueue(RESERVED_WORKER_2, heartbeat, 1)
    second_queue.save()
    # One reserved resource per queue, mirroring each queue's reservation count
    first_resource = ReservedResource('resource_1', first_queue.name,
                                      first_queue.num_reservations)
    first_resource.save()
    second_resource = ReservedResource('resource_2', second_queue.name,
                                       second_queue.num_reservations)
    second_resource.save()

    # resource_2 held its last reservation, so releasing it should remove it entirely
    tasks._release_resource('resource_2')

    # The second queue drops to 0 reservations; the first one is untouched
    queue_collection = AvailableQueue.get_collection()
    self.assertEqual(queue_collection.count(), 2)
    for queue, expected_count in ((first_queue, 7), (second_queue, 0)):
        queue_doc = queue_collection.find_one({'_id': queue.name})
        self.assertEqual(queue_doc['num_reservations'], expected_count)
    # Only resource_1 is left in the database
    reservation_collection = ReservedResource.get_collection()
    self.assertEqual(reservation_collection.count(), 1)
    remaining_doc = reservation_collection.find_one({'_id': first_resource.name})
    self.assertEqual(remaining_doc['assigned_queue'], first_resource.assigned_queue)
    self.assertEqual(remaining_doc['num_reservations'], 7)
def test__release_resource_queue_task_count_zero(self):
    """
    Exercise _release_resource() on a resource whose queue already has a reservation count of
    zero. The queue's count must not be decremented into the negative range.
    """
    # Two available queues; the second one has no reservations at all
    heartbeat = datetime.utcnow()
    first_queue = AvailableQueue(RESERVED_WORKER_1, heartbeat, 7)
    first_queue.save()
    empty_queue = AvailableQueue(RESERVED_WORKER_2, heartbeat, 0)
    empty_queue.save()
    # Two reserved resources; the second is deliberately out of sync with its queue by
    # claiming one reservation while the queue claims none
    first_resource = ReservedResource('resource_1', first_queue.name,
                                      first_queue.num_reservations)
    first_resource.save()
    out_of_sync_resource = ReservedResource('resource_2', empty_queue.name, 1)
    out_of_sync_resource.save()

    # resource_2 should be removed, but its queue's count should stay at 0
    tasks._release_resource('resource_2')

    # Queue counts are as before, since counts below zero are not wanted
    queue_collection = AvailableQueue.get_collection()
    self.assertEqual(queue_collection.count(), 2)
    for queue, expected_count in ((first_queue, 7), (empty_queue, 0)):
        queue_doc = queue_collection.find_one({'_id': queue.name})
        self.assertEqual(queue_doc['num_reservations'], expected_count)
    # Only resource_1 is left in the database
    reservation_collection = ReservedResource.get_collection()
    self.assertEqual(reservation_collection.count(), 1)
    remaining_doc = reservation_collection.find_one({'_id': first_resource.name})
    self.assertEqual(remaining_doc['assigned_queue'], first_resource.assigned_queue)
    self.assertEqual(remaining_doc['num_reservations'], 7)
def test_get(self):
    """
    get_or_create_available_queue() must return the existing record when the requested queue
    is already in the database.
    """
    # Two AvailableQueues are stored so that returning the wrong one would be noticed
    decoy_queue = AvailableQueue('queue_1')
    decoy_queue.save()
    missing_since = datetime(2013, 12, 16)
    stored_queue = AvailableQueue('queue_2', 7, missing_since)
    stored_queue.save()

    fetched_queue = resources.get_or_create_available_queue('queue_2')

    # The right kind of object, for the right queue, came back
    self.assertEqual(type(fetched_queue), AvailableQueue)
    self.assertEqual(fetched_queue.name, 'queue_2')
    # Its attributes match what was stored
    self.assertEqual(fetched_queue.num_reservations, 7)
    self.assertEqual(fetched_queue.missing_since, missing_since)
    # The database record is unchanged
    queue_collection = fetched_queue.get_collection()
    stored_doc = queue_collection.find_one({'_id': 'queue_2'})
    self.assertEqual(stored_doc['num_reservations'], 7)
    self.assertEqual(stored_doc['missing_since'], missing_since)
def tearDown(self):
    """
    Empty the AvailableQueue, ReservedResource and TaskStatus collections after each test.
    """
    for model in (AvailableQueue, ReservedResource, TaskStatus):
        model.get_collection().remove()
def test_babysit_deletes_correct_records(self, add_consumer, _delete_queue_apply_async,
                                         active_queues):
    """
    Run babysit() against pre-existing state. It must create the AvailableQueues that are
    absent, request deletion of the one that has been missing for five minutes, and leave the
    remaining ones in place.
    """
    queue_4_name = '%s4' % tasks.RESERVED_WORKER_NAME_PREFIX
    queue_5_name = '%s5' % tasks.RESERVED_WORKER_NAME_PREFIX
    queue_6_name = '%s6' % tasks.RESERVED_WORKER_NAME_PREFIX
    # Part of the mock results, so this one must remain in the DB
    surviving_queue = AvailableQueue(name=RESERVED_WORKER_2)
    surviving_queue.save()
    # Absent from the mock results and missing for five minutes: must get deleted
    expired_queue = AvailableQueue(name=queue_4_name,
                                   missing_since=datetime.utcnow() - timedelta(minutes=5))
    expired_queue.save()
    # Absent from the mock results but missing for less than five minutes: must not get deleted
    recently_missing_queue = AvailableQueue(
        name=queue_5_name, missing_since=datetime.utcnow() - timedelta(minutes=2))
    recently_missing_queue.save()
    # Absent from the mock results and never flagged as missing before (missing_since is None):
    # must not get deleted, but its missing_since must be set to a datetime
    newly_missing_queue = AvailableQueue(name=queue_6_name, missing_since=None)
    newly_missing_queue.save()

    # Queue 4 should get deleted and queue 6 should get marked as missing
    tasks.babysit()

    # babysit() must have asked for the active queues exactly once, without arguments
    active_queues.assert_called_once_with()
    # Five AvailableQueues are expected: one per reserved worker in the mock data (3), plus
    # numbers 5 and 6 from above, which have been missing for less than five minutes
    queue_collection = AvailableQueue.get_collection()
    self.assertEqual(queue_collection.count(), 5)
    # The three active workers have no reservations and are not flagged as missing
    for worker_name in (RESERVED_WORKER_1, RESERVED_WORKER_2, RESERVED_WORKER_3):
        active_doc = queue_collection.find_one({'_id': worker_name})
        self.assertEqual(active_doc['num_reservations'], 0)
        self.assertEqual(active_doc['missing_since'], None)
    # Numbers 5 and 6 still exist, each with a datetime in missing_since
    doc_5 = queue_collection.find_one({'_id': queue_5_name})
    self.assertEqual(doc_5['num_reservations'], 0)
    self.assertEqual(type(doc_5['missing_since']), datetime)
    self.assertTrue(doc_5['missing_since'] < datetime.utcnow() - timedelta(minutes=2))
    doc_6 = queue_collection.find_one({'_id': queue_6_name})
    self.assertEqual(doc_6['num_reservations'], 0)
    self.assertEqual(type(doc_6['missing_since']), datetime)
    self.assertTrue(doc_6['missing_since'] < datetime.utcnow())
    # Reserved worker 3 had no queue assigned, so babysit() should have given it one
    add_consumer.assert_called_once_with(queue=RESERVED_WORKER_3,
                                         destination=(RESERVED_WORKER_3,))
    # The deletion of #4 must have been dispatched to the RESOURCE_MANAGER_QUEUE
    _delete_queue_apply_async.assert_called_once_with(
        args=(queue_4_name,), queue=tasks.RESOURCE_MANAGER_QUEUE)