def test_handle_task_update(self):
    """Tests calling TaskManager.handle_task_update()

    Launches a single task, delivers a FINISHED status update for it and checks that the task has ended with
    the update's timestamp recorded as its end time. A second update that names a task ID the manager never
    launched must be ignored without raising an error.
    """

    task_id = 'task_1'
    task_name = 'My Task'
    agent_id = 'agent_1'
    task_1 = ImplementedTask(task_id, task_name, agent_id)

    when_launched = now()
    manager = TaskManager()
    manager.launch_tasks([task_1], when_launched)

    # The update's `when` must be a point in time (one second after launch), not a bare duration
    when_finished = when_launched + datetime.timedelta(seconds=1)
    update_1 = job_test_utils.create_task_status_update(
        task_1.id, task_1.agent_id, TaskStatusUpdate.FINISHED, when=when_finished)
    manager.handle_task_update(update_1)

    self.assertTrue(task_1.has_ended)
    self.assertEqual(task_1._ended, when_finished)

    # Update for a task ID and agent that were never launched through this manager
    update_2 = job_test_utils.create_task_status_update(
        'task_2', 'New Agent', TaskStatusUpdate.RUNNING, when=now())
    manager.handle_task_update(update_2)  # Should ignore, no error
def test_job_exe_clean_task(self):
    """Tests that the NodeManager returns a cleanup task for a job execution that must be cleaned up"""

    when = now()

    node_mgr = NodeManager()
    node_mgr.register_agents([self.agent_1, self.agent_2])
    node_mgr.sync_with_database(scheduler_mgr.config)

    cleanup_mgr = CleanupManager()
    cleanup_mgr.update_nodes(node_mgr.get_nodes())

    first_tasks = node_mgr.get_next_tasks(when)
    task_mgr = TaskManager()

    # Launch each of the initial cleanup tasks and report it as finished right away
    for first_task in first_tasks:
        task_mgr.launch_tasks([first_task], now())
        finished_update = job_test_utils.create_task_status_update(
            first_task.id, first_task.agent_id, TaskStatusUpdate.FINISHED, now())
        task_mgr.handle_task_update(finished_update)
        node_mgr.handle_task_update(finished_update)

    # Flag the image pull as done on every node so that no image tasks are handed out
    for managed_node in node_mgr.get_nodes():
        managed_node._image_pull_completed()
        managed_node._update_state()

    # Register a job execution for cleanup, then ask for the next tasks again
    running_exe = job_test_utils.create_running_job_exe(agent_id=self.agent_1, node=self.node_1)
    cleanup_mgr.add_job_execution(running_exe)
    next_tasks = node_mgr.get_next_tasks(when)

    self.assertEqual(len(next_tasks), 1)
    cleanup_task = next_tasks[0]
    self.assertEqual(cleanup_task.agent_id, self.agent_1.agent_id)
    self.assertFalse(cleanup_task.is_initial_cleanup)
    self.assertEqual(len(cleanup_task.job_exes), 1)
class TestIngestJobType(TestCase):
    """Test cases for behavior that is specific to the ingest job type"""

    fixtures = [
        'basic_errors.json',
        'basic_job_errors.json',
        'ingest_job_types.json',
        'ingest_errors.json',
    ]

    def setUp(self):
        """Sets up Django, resets the error cache and creates a fresh task manager"""
        django.setup()
        reset_error_cache()
        self.task_mgr = TaskManager()

    def test_timed_out_system_job_task(self):
        """Tests walking a job execution through a system job task that times out"""

        job_type = Ingest.objects.get_ingest_job_type()
        job_type.max_tries = 1
        job_type.save()
        exe = job_test_utils.create_running_job_exe(
            agent_id='agent_1', job_type=job_type, num_exes=1)

        def check_failed_by_timeout(timed_out_at):
            """Asserts the failure details expected once the timeout has been applied"""
            self.assertEqual(exe.status, 'FAILED')
            self.assertEqual(exe.error_category, 'SYSTEM')
            self.assertEqual(exe.error.name, 'ingest-timeout')
            self.assertEqual(exe.finished, timed_out_at)
            self.assertFalse(exe.is_next_task_ready())

        # Timeline: launched -> running -> timed out -> killed, each step one second apart
        one_second = timedelta(seconds=1)
        launched_at = now() + one_second
        started_at = launched_at + one_second
        timed_out_at = started_at + one_second
        killed_at = timed_out_at + one_second

        # Start the job task, report it running, then time it out
        task = exe.start_next_task()
        self.task_mgr.launch_tasks([task], launched_at)
        running_update = job_test_utils.create_task_status_update(
            task.id, 'agent', TaskStatusUpdate.RUNNING, started_at)
        self.task_mgr.handle_task_update(running_update)
        exe.task_update(running_update)
        exe.execution_timed_out(task, timed_out_at)

        # The execution is not finished until the killed task update arrives
        self.assertFalse(exe.is_finished())
        check_failed_by_timeout(timed_out_at)

        # Once the killed task update arrives, the job execution is finished
        killed_update = job_test_utils.create_task_status_update(
            task.id, 'agent', TaskStatusUpdate.KILLED, killed_at)
        self.task_mgr.handle_task_update(killed_update)
        exe.task_update(killed_update)

        self.assertTrue(exe.is_finished())
        check_failed_by_timeout(timed_out_at)
class TestIngestJobType(TestCase):
    """Test cases for behavior that is specific to the ingest job type"""

    fixtures = [
        'basic_errors.json',
        'basic_job_errors.json',
        'ingest_job_types.json',
        'ingest_errors.json',
    ]

    def setUp(self):
        """Sets up Django and creates a fresh task manager"""
        django.setup()
        self.task_mgr = TaskManager()

    def test_timed_out_system_job_task(self):
        """Tests walking a job execution through a system job task that times out"""

        job_type = Ingest.objects.get_ingest_job_type()
        job_type.max_tries = 1
        job_type.save()

        job_exe = job_test_utils.create_job_exe(
            job=job_test_utils.create_job(job_type=job_type, num_exes=1))
        exe = RunningJobExecution(job_exe)

        # Timeline: launched -> running -> timed out, each step one second apart
        one_second = timedelta(seconds=1)
        launched_at = now() + one_second
        started_at = launched_at + one_second
        timed_out_at = started_at + one_second

        # Start the job task, report it running, then time it out
        task = exe.start_next_task()
        self.task_mgr.launch_tasks([task], launched_at)
        running_update = job_test_utils.create_task_status_update(
            task.id, 'agent', TaskStatusUpdate.RUNNING, started_at)
        self.task_mgr.handle_task_update(running_update)
        exe.task_update(running_update)
        exe.execution_timed_out(task, timed_out_at)

        self.assertTrue(exe.is_finished())
        self.assertFalse(exe.is_next_task_ready())

        # Re-read the execution model from the database and check what was stored
        stored_exe = JobExecution.objects.get(id=job_exe.id)
        self.assertEqual('FAILED', stored_exe.status)
        self.assertEqual('ingest-timeout', stored_exe.error.name)
        self.assertEqual(timed_out_at, stored_exe.ended)
def test_pull_task_change_agent_id(self):
    """Tests the NodeManager where a node's agent ID changes during a pull task

    A task is launched for agent 2, agent 2 is then lost and agent 3 registers in its place. The manager
    should hand out a single new task addressed to agent 3, and a late FAILED update for the original
    agent 2 task must be handled without raising an exception.
    """

    when = now()
    manager = NodeManager()
    manager.register_agents([self.agent_1, self.agent_2])
    manager.sync_with_database(scheduler_mgr.config)
    # Prepare every node through its private hooks before asking for tasks
    for node in manager.get_nodes():
        node._last_health_task = when
        node._initial_cleanup_completed()
        node._update_state()
    tasks = manager.get_next_tasks(when)

    task_mgr = TaskManager()
    task_2 = None
    for task in tasks:
        task_mgr.launch_tasks([task], when)
        if task.agent_id == self.agent_2.agent_id:
            task_2 = task
    # Fail with a clear message here rather than with an AttributeError on task_2.id further down
    self.assertIsNotNone(task_2, 'No task was returned for agent 2')

    # Node 2 changes agent ID to 3
    manager.lost_node(self.agent_2.agent_id)
    manager.register_agents([self.agent_3])
    manager.sync_with_database(scheduler_mgr.config)
    # Apply the same preparation again now that agent 3 has registered
    for node in manager.get_nodes():
        node._last_health_task = when
        node._initial_cleanup_completed()
        node._update_state()

    # Should get new Docker pull task for node 2
    tasks = manager.get_next_tasks(when)
    self.assertEqual(len(tasks), 1)
    new_task_2 = tasks[0]
    self.assertEqual(new_task_2.agent_id, self.agent_3.agent_id)

    # Task update comes back for original node 2 Docker pull task, manager should ignore with no exception
    update = job_test_utils.create_task_status_update(
        task_2.id, task_2.agent_id, TaskStatusUpdate.FAILED, when)
    task_mgr.handle_task_update(update)
    manager.handle_task_update(update)
class TestNode(TestCase):
    """Tests the Node class: status JSON generation and the handling of cleanup, health and Docker pull tasks

    Most tests follow the same pattern: ask the node for its next tasks, launch a task through the
    TaskManager, feed task status updates to both the TaskManager and the node, then check which task (if
    any) the node hands out next.
    """

    def setUp(self):
        """Creates a node model, a job execution on that node and a fresh TaskManager for each test"""
        django.setup()

        self.node_agent = 'agent_1'
        self.node = node_test_utils.create_node(hostname='host_1', slave_id=self.node_agent)
        self.job_exe = job_test_utils.create_job_exe(node=self.node)
        self.task_mgr = TaskManager()

    @patch('scheduler.node.conditions.now')
    def test_generate_status_json(self, mock_now):
        """Tests calling generate_status_json() successfully"""

        # Pin the clock seen by scheduler.node.conditions so the expected timestamps are known
        right_now = now()
        mock_now.return_value = right_now
        # One more than the warning threshold, used as the cleanup count below
        num_job_exes = JOB_EXES_WARNING_THRESHOLD + 1

        node = Node(self.node_agent, self.node)
        # Register a failed pull task and a cleanup count so the JSON carries one error and one warning
        node._conditions.handle_pull_task_failed()
        node._conditions.update_cleanup_count(num_job_exes)
        node._update_state()
        nodes_list = []
        node.generate_status_json(nodes_list)

        expected_results = [{
            'id': node.id,
            'hostname': node.hostname,
            'agent_id': self.node_agent,
            'is_active': True,
            'state': {
                'name': 'DEGRADED',
                'title': Node.DEGRADED.title,
                'description': Node.DEGRADED.description
            },
            'errors': [{
                'name': 'IMAGE_PULL',
                'title': NodeConditions.IMAGE_PULL_ERR.title,
                'description': NodeConditions.IMAGE_PULL_ERR.description,
                'started': datetime_to_string(right_now),
                'last_updated': datetime_to_string(right_now)
            }],
            'warnings': [{
                'name': 'CLEANUP',
                'title': NodeConditions.CLEANUP_WARNING.title,
                'description': NodeConditions.CLEANUP_WARNING.description % num_job_exes,
                'started': datetime_to_string(right_now),
                'last_updated': datetime_to_string(right_now)
            }]
        }]
        self.assertListEqual(nodes_list, expected_results)

    def test_handle_failed_cleanup_task(self):
        """Tests handling failed cleanup task"""

        when = now()
        node = Node(self.node_agent, self.node)
        # NOTE(review): the attribute is spelled `_last_heath_task` throughout this class - confirm that this
        # matches the attribute Node actually reads, otherwise these assignments have no effect
        node._last_heath_task = when
        # Get initial cleanup task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        task_1_id = task.id

        # Fail task after running and get different task next time
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.FAILED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

        # No new cleanup task right away
        tasks = node.get_next_tasks(when + datetime.timedelta(seconds=5))
        self.assertListEqual([], tasks)
        self.assertFalse(node._is_initial_cleanup_completed)

        # After error threshold, we should get new cleanup task
        new_time = when + Node.CLEANUP_ERR_THRESHOLD + datetime.timedelta(
            seconds=5)
        node._last_heath_task = new_time  # Get rid of health check task
        task = node.get_next_tasks(new_time)[0]
        self.assertNotEqual(task.id, task_1_id)
        self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))

    def test_handle_initial_cleanup_task(self):
        """Tests handling the initial cleanup task"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._last_heath_task = when

        # Get initial cleanup task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertTrue(task.is_initial_cleanup)
        self.assertEqual(task.agent_id, self.node_agent)

        # Schedule initial cleanup and make sure no new task is ready
        self.task_mgr.launch_tasks([task], now())
        self.assertListEqual([], node.get_next_tasks(when))
        self.assertFalse(node._is_initial_cleanup_completed)

        # Complete initial clean up, verify no new cleanup task
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        # Other kinds of task may be returned here; only cleanup tasks are ruled out
        for task in node.get_next_tasks(when):
            self.assertFalse(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertTrue(node._is_initial_cleanup_completed)

    def test_handle_killed_cleanup_task(self):
        """Tests handling killed cleanup task"""

        when = now()
        node = Node(self.node_agent, self.node)
        # Get initial cleanup task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        task_1_id = task.id

        # Kill task after running and get different task next time
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.KILLED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertNotEqual(task.id, task_1_id)
        self.assertFalse(node._is_initial_cleanup_completed)

    def test_handle_lost_cleanup_tasks(self):
        """Tests handling lost cleanup tasks"""

        when = now()
        node = Node(self.node_agent, self.node)
        # Get initial cleanup task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        task_1_id = task.id

        # Lose task without scheduling and get same task again
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertEqual(task.id, task_1_id)
        self.assertFalse(node._is_initial_cleanup_completed)

        # Lose task with scheduling and get same task again
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertEqual(task.id, task_1_id)
        self.assertFalse(node._is_initial_cleanup_completed)

        # Lose task after running and get same task again
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertEqual(task.id, task_1_id)
        self.assertFalse(node._is_initial_cleanup_completed)

    def test_handle_regular_cleanup_task(self):
        """Tests handling a regular cleanup task"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._last_heath_task = when
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()

        # No task since there are no job executions to clean
        self.assertListEqual([], node.get_next_tasks(when))

        # Add job execution and complete task to clean it up
        job_exe = RunningJobExecution(self.job_exe)
        node.add_job_execution(job_exe)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertFalse(task.is_initial_cleanup)
        self.assertListEqual(task.job_exes, [job_exe])
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

        # No task since all job executions have been cleaned
        self.assertListEqual([], node.get_next_tasks(when))

    def test_paused_node_cleanup_task(self):
        """Tests not returning cleanup task when its node is paused"""

        when = now()
        paused_node = node_test_utils.create_node(hostname='host_1',
                                                  slave_id=self.node_agent)
        paused_node.is_paused = True
        node = Node(self.node_agent, paused_node)
        # Turn off health task
        node._last_heath_task = when
        # No task due to paused node
        self.assertListEqual([], node.get_next_tasks(when))

    def test_handle_failed_health_task(self):
        """Tests handling failed health task"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()
        # Get health task
        task = node.get_next_tasks(when)[0]
        task_1_id = task.id
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))

        # Fail task after running
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.FAILED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

        # Check node state
        self.assertEqual(node._state, Node.DEGRADED)
        self.assertTrue(NodeConditions.HEALTH_FAIL_ERR.name in
                        node._conditions._active_errors)

        # No new health task right away
        tasks = node.get_next_tasks(when + datetime.timedelta(seconds=5))
        self.assertListEqual([], tasks)
        self.assertFalse(node._conditions.is_health_check_normal)

        # After error threshold, we should get new health task
        new_time = when + Node.HEALTH_ERR_THRESHOLD + datetime.timedelta(
            seconds=5)
        task = node.get_next_tasks(new_time)[0]
        self.assertNotEqual(task.id, task_1_id)
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))

    def test_handle_failed_health_task_bad_daemon(self):
        """Tests handling a failed health task where the Docker daemon is bad"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()
        # Get health task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))

        # Fail task with bad daemon exit code
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.FAILED, now(),
            exit_code=HealthTask.BAD_DAEMON_CODE)
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

        # Check node state
        self.assertEqual(node._state, Node.DEGRADED)
        self.assertTrue(NodeConditions.BAD_DAEMON_ERR.name in
                        node._conditions._active_errors)

    def test_handle_failed_health_task_bad_logstash(self):
        """Tests handling a failed health task where logstash is unreachable"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()
        # Get health task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))

        # Fail task with bad logstash exit code
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.FAILED, now(),
            exit_code=HealthTask.BAD_LOGSTASH_CODE)
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

        # Check node state
        self.assertEqual(node._state, Node.DEGRADED)
        self.assertTrue(NodeConditions.BAD_LOGSTASH_ERR.name in
                        node._conditions._active_errors)

    def test_handle_failed_health_task_low_docker_space(self):
        """Tests handling a failed health task where Docker has low disk space"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()
        # Get health task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))

        # Fail task with low Docker space exit code
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.FAILED, now(),
            exit_code=HealthTask.LOW_DOCKER_SPACE_CODE)
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

        # Check node state
        self.assertEqual(node._state, Node.DEGRADED)
        self.assertTrue(NodeConditions.LOW_DOCKER_SPACE_ERR.name in
                        node._conditions._active_errors)

    def test_handle_successful_health_task(self):
        """Tests handling the health task successfully"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()

        # Get health task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
        self.assertEqual(task.agent_id, self.node_agent)

        # Schedule health task and make sure no new task is ready
        self.task_mgr.launch_tasks([task], now())
        self.assertListEqual([], node.get_next_tasks(when))
        self.assertTrue(node._conditions.is_health_check_normal)

        # Complete health task, verify no new task
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        self.assertListEqual([], node.get_next_tasks(when))
        self.assertTrue(node._conditions.is_health_check_normal)

    def test_handle_killed_health_task(self):
        """Tests handling killed health task"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()
        # Get health task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
        task_1_id = task.id

        # Kill task after running and get different task next time
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.KILLED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
        self.assertNotEqual(task.id, task_1_id)
        self.assertTrue(node._conditions.is_health_check_normal)

    def test_handle_lost_health_task(self):
        """Tests handling lost health task"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()
        # Get health task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
        task_1_id = task.id
        self.assertIsNotNone(task)

        # Lose task without scheduling and get same task again
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
        self.assertEqual(task.id, task_1_id)
        self.assertTrue(node._conditions.is_health_check_normal)

        # Lose task with scheduling and get same task again
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
        self.assertEqual(task.id, task_1_id)
        self.assertTrue(node._conditions.is_health_check_normal)

        # Lose task after running and get same task again
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
        self.assertEqual(task.id, task_1_id)
        self.assertTrue(node._conditions.is_health_check_normal)

    def test_handle_failed_pull_task(self):
        """Tests handling failed Docker pull task"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._last_heath_task = when
        node._initial_cleanup_completed()
        node._update_state()
        # Get Docker pull task
        task = node.get_next_tasks(when)[0]
        task_1_id = task.id
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))

        # Fail task after running
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.FAILED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

        # No new pull task right away
        tasks = node.get_next_tasks(when + datetime.timedelta(seconds=5))
        self.assertListEqual([], tasks)
        self.assertFalse(node._is_image_pulled)

        # After error threshold, we should get new pull task
        new_time = when + Node.IMAGE_PULL_ERR_THRESHOLD + datetime.timedelta(
            seconds=5)
        node._last_heath_task = new_time  # Get rid of health check task
        task = node.get_next_tasks(new_time)[0]
        self.assertNotEqual(task.id, task_1_id)
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))

    def test_handle_successful_pull_task(self):
        """Tests handling the Docker pull task successfully"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._last_heath_task = when
        node._initial_cleanup_completed()
        node._update_state()

        # Get Docker pull task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
        self.assertEqual(task.agent_id, self.node_agent)

        # Schedule pull task and make sure no new task is ready
        self.task_mgr.launch_tasks([task], now())
        self.assertListEqual([], node.get_next_tasks(when))
        self.assertFalse(node._is_image_pulled)

        # Complete pull task, verify no new task
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        self.assertListEqual([], node.get_next_tasks(when))
        self.assertTrue(node._is_image_pulled)
        # Node should now be ready
        self.assertEqual(node._state, Node.READY)

    def test_handle_killed_pull_task(self):
        """Tests handling killed pull task"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._last_heath_task = when
        node._initial_cleanup_completed()
        node._update_state()
        # Get pull task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
        task_1_id = task.id

        # Kill task after running and get different task next time
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.KILLED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
        self.assertNotEqual(task.id, task_1_id)
        self.assertFalse(node._is_image_pulled)

    def test_handle_lost_pull_task(self):
        """Tests handling lost pull task"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._last_heath_task = when
        node._initial_cleanup_completed()
        node._update_state()
        # Get pull task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
        task_1_id = task.id
        self.assertIsNotNone(task)

        # Lose task without scheduling and get same task again
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
        self.assertEqual(task.id, task_1_id)
        self.assertFalse(node._is_image_pulled)

        # Lose task with scheduling and get same task again
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
        self.assertEqual(task.id, task_1_id)
        self.assertFalse(node._is_image_pulled)

        # Lose task after running and get same task again
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
        self.assertEqual(task.id, task_1_id)
        self.assertFalse(node._is_image_pulled)

    def test_paused_node_pull_task(self):
        """Tests not returning pull task when its node is paused"""

        when = now()
        paused_node = node_test_utils.create_node(hostname='host_1',
                                                  slave_id=self.node_agent)
        paused_node.is_paused = True
        node = Node(self.node_agent, paused_node)
        node._last_heath_task = when
        node._initial_cleanup_completed()
        node._update_state()
        tasks = node.get_next_tasks(when)
        # No task due to paused node
        self.assertListEqual([], tasks)

    def test_node_that_is_not_cleaned_yet_no_pull_task(self):
        """Tests not returning pull task when the node hasn't been cleaned up yet"""

        when = now()
        node = Node(self.node_agent, self.node)
        tasks = node.get_next_tasks(when)
        # No pull task due to node not cleaned yet
        for task in tasks:
            self.assertFalse(task.id.startswith(PULL_TASK_ID_PREFIX))
class TestJobExecutionManager(TransactionTestCase):
    """Tests the JobExecutionManager class"""

    fixtures = ['basic_errors.json', 'basic_job_errors.json']

    def setUp(self):
        """Creates two running job executions (each on its own node) plus fresh task and job execution managers"""
        django.setup()

        # Clear error cache so tests work correctly
        reset_error_cache()

        self.agent_id = 'agent'
        self.node_model_1 = node_test_utils.create_node()
        self.job_exe_1 = job_test_utils.create_running_job_exe(agent_id=self.agent_id, node=self.node_model_1)
        self.node_model_2 = node_test_utils.create_node()
        self.job_exe_2 = job_test_utils.create_running_job_exe(agent_id=self.agent_id, node=self.node_model_2)

        self.task_mgr = TaskManager()
        self.job_exe_mgr = JobExecutionManager()

    def test_check_for_starvation(self):
        """Tests calling check_for_starvation() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2], [])

        # Start and complete first task of execution
        task_1_launched = now()
        task_1 = self.job_exe_1.start_next_task()
        self.task_mgr.launch_tasks([task_1], task_1_launched)
        task_1_started = task_1_launched + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.RUNNING,
                                                          task_1_started)
        self.task_mgr.handle_task_update(update)
        self.job_exe_mgr.handle_task_update(update)
        task_1_completed = task_1_started + timedelta(seconds=10)
        update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.FINISHED,
                                                          task_1_completed)
        self.task_mgr.handle_task_update(update)
        self.job_exe_mgr.handle_task_update(update)

        # Check after the time threshold has passed and task 2 has still not been launched
        check_time = task_1_completed + RESOURCE_STARVATION_THRESHOLD + timedelta(seconds=1)
        finished_job_exes = self.job_exe_mgr.check_for_starvation(check_time)

        # Check that execution 1 was failed for starvation
        self.assertEqual(len(finished_job_exes), 1)
        starved_job_exe = finished_job_exes[0]
        self.assertEqual(starved_job_exe.id, self.job_exe_1.id)
        self.assertEqual(starved_job_exe.status, 'FAILED')
        self.assertEqual(starved_job_exe.error.name, 'resource-starvation')
        self.assertEqual(starved_job_exe.finished, check_time)

    def test_generate_status_json(self):
        """Tests calling generate_status_json() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2], [])
        json_dict = [{'id': self.node_model_1.id}, {'id': self.node_model_2.id}]
        self.job_exe_mgr.generate_status_json(json_dict, now())

        # Each node has exactly one of the scheduled executions running on it
        for node_dict in json_dict:
            self.assertEqual(node_dict['job_executions']['running']['total'], 1)

    def test_get_messages_for_canceled_job_exes(self):
        """Tests calling get_messages() successfully when canceled job_exes have been added"""

        job_exe_ends = []
        for _ in range(int(MAX_NUM * 2.5)):  # Should result in 3 messages
            job_exe = job_test_utils.create_job_exe()
            job_exe_ends.append(job_exe.create_canceled_job_exe_end_model(now()))

        self.job_exe_mgr.add_canceled_job_exes(job_exe_ends)
        messages = self.job_exe_mgr.get_messages()

        self.assertEqual(len(messages), 3)

    def test_handle_task_timeout(self):
        """Tests calling handle_task_timeout() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2], [])

        task = self.job_exe_1.start_next_task()
        self.job_exe_mgr.handle_task_timeout(task, now())

        self.assertEqual(self.job_exe_1.status, 'FAILED')

    def test_handle_task_update(self):
        """Tests calling handle_task_update() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2], [])

        # Start tasks
        task_1 = self.job_exe_1.start_next_task()
        task_1_started = now() - timedelta(minutes=5)
        update_1 = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.RUNNING,
                                                            task_1_started)
        task_2 = self.job_exe_2.start_next_task()
        # Shortcut job exe 2 so that there is only one task to complete
        self.job_exe_2._remaining_tasks = []
        task_2_started = now() - timedelta(minutes=5)
        update_2 = job_test_utils.create_task_status_update(task_2.id, 'agent', TaskStatusUpdate.RUNNING,
                                                            task_2_started)

        # Job execution is not finished, so None should be returned and no message is available
        result = self.job_exe_mgr.handle_task_update(update_1)
        self.assertIsNone(result)
        result = self.job_exe_mgr.handle_task_update(update_2)
        self.assertIsNone(result)
        self.assertListEqual(self.job_exe_mgr.get_messages(), [])

        # Fail task 1 for job exe 1
        task_1_failed = task_1_started + timedelta(seconds=1)
        update_1 = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.FAILED,
                                                            task_1_failed, exit_code=1)
        # Complete task 2 for job exe 2
        task_2_completed = task_2_started + timedelta(seconds=1)
        update_2 = job_test_utils.create_task_status_update(task_2.id, 'agent', TaskStatusUpdate.FINISHED,
                                                            task_2_completed)

        # Job executions are finished, so they should be returned and a create_job_exe_ends message, a failed_jobs
        # message, and a completed_jobs message is available
        result = self.job_exe_mgr.handle_task_update(update_1)
        self.assertEqual(self.job_exe_1.id, result.id)
        result = self.job_exe_mgr.handle_task_update(update_2)
        self.assertEqual(self.job_exe_2.id, result.id)
        messages = self.job_exe_mgr.get_messages()
        self.assertEqual(len(messages), 3)
        job_exe_ends_msg = messages[0]
        self.assertEqual(job_exe_ends_msg.type, 'create_job_exe_ends')
        self.assertEqual(job_exe_ends_msg._job_exe_ends[0].job_exe_id, self.job_exe_1.id)
        self.assertEqual(job_exe_ends_msg._job_exe_ends[1].job_exe_id, self.job_exe_2.id)
        completed_jobs_msg = messages[1]
        self.assertEqual(completed_jobs_msg.type, 'completed_jobs')
        self.assertEqual(completed_jobs_msg._completed_jobs[0].job_id, self.job_exe_2.job_id)
        failed_jobs_msg = messages[2]
        self.assertEqual(failed_jobs_msg.type, 'failed_jobs')
        # list() is required because dict.values() is a non-indexable view on Python 3
        failed_job_lists = list(failed_jobs_msg._failed_jobs.values())
        self.assertEqual(failed_job_lists[0][0].job_id, self.job_exe_1.job_id)

    def test_init_with_database(self):
        """Tests calling init_with_database() successfully"""

        # Smoke test: passes as long as the call does not raise
        self.job_exe_mgr.init_with_database()

    def test_lost_node(self):
        """Tests calling lost_node() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2], [])

        task_1 = self.job_exe_1.start_next_task()
        task_1_started = now() - timedelta(minutes=5)
        update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.RUNNING,
                                                          task_1_started)
        self.job_exe_mgr.handle_task_update(update)

        # Lose node and get lost task update
        self.job_exe_mgr.lost_node(self.node_model_1.id, now())
        update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.LOST, task_1_started)
        lost_job_exe = self.job_exe_mgr.handle_task_update(update)
        self.assertEqual(lost_job_exe.id, self.job_exe_1.id)
        self.assertEqual(lost_job_exe.status, 'FAILED')
        self.assertEqual(lost_job_exe.error.name, 'node-lost')

        # Make sure a create_job_exe_ends message and failed_jobs message exists for the lost job execution
        messages = self.job_exe_mgr.get_messages()
        self.assertEqual(len(messages), 2)
        job_exe_ends_msg = messages[0]
        self.assertEqual(job_exe_ends_msg.type, 'create_job_exe_ends')
        self.assertEqual(job_exe_ends_msg._job_exe_ends[0].job_exe_id, self.job_exe_1.id)
        failed_jobs_msg = messages[1]
        self.assertEqual(failed_jobs_msg.type, 'failed_jobs')
        self.assertIn(get_builtin_error('node-lost').id, failed_jobs_msg._failed_jobs)
        # list() is required because dict.values() is a non-indexable view on Python 3
        failed_job_lists = list(failed_jobs_msg._failed_jobs.values())
        self.assertEqual(failed_job_lists[0][0].job_id, self.job_exe_1.job_id)

    def test_schedule_job_exes(self):
        """Tests calling schedule_job_exes() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2], [])

        # Both executions should be in the manager and ready
        self.assertEqual(len(self.job_exe_mgr.get_running_job_exes()), 2)
        self.assertIsNotNone(self.job_exe_mgr.get_running_job_exe(self.job_exe_1.cluster_id))
        self.assertIsNotNone(self.job_exe_mgr.get_running_job_exe(self.job_exe_2.cluster_id))

    def test_sync_with_database(self):
        """Tests calling sync_with_database() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2], [])

        task_1 = self.job_exe_1.start_next_task()
        task_1_started = now() - timedelta(minutes=5)
        update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.RUNNING,
                                                          task_1_started)
        self.job_exe_mgr.handle_task_update(update)

        # Cancel job_exe_1 and job_exe_2 and have manager sync with database
        Job.objects.update_jobs_to_canceled([self.job_exe_1._job_exe.job, self.job_exe_2._job_exe.job], now())
        finished_job_exes = self.job_exe_mgr.sync_with_database()

        self.assertEqual(self.job_exe_1.status, 'CANCELED')
        self.assertFalse(self.job_exe_1.is_finished())
        self.assertEqual(self.job_exe_2.status, 'CANCELED')
        self.assertTrue(self.job_exe_2.is_finished())

        # Only job_exe_2 is finished, job_exe_1 has a task to kill
        self.assertEqual(len(finished_job_exes), 1)
        self.assertEqual(finished_job_exes[0].id, self.job_exe_2.id)
        # Make sure a create_job_exe_ends message exists for job_exe_2
        message = self.job_exe_mgr.get_messages()[0]
        self.assertEqual(message.type, 'create_job_exe_ends')
        self.assertEqual(message._job_exe_ends[0].job_exe_id, self.job_exe_2.id)

        # Task killed for job_exe_1
        task_1_killed = task_1_started + timedelta(minutes=5)
        update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.KILLED, task_1_killed)
        self.job_exe_mgr.handle_task_update(update)

        # Make sure a create_job_exe_ends message exists for job_exe_1
        self.assertTrue(self.job_exe_1.is_finished())
        message = self.job_exe_mgr.get_messages()[0]
        self.assertEqual(message.type, 'create_job_exe_ends')
        self.assertEqual(message._job_exe_ends[0].job_exe_id, self.job_exe_1.id)
class TestSystemTaskManager(TestCase):
    """Tests the SystemTaskManager class's handling of its database update task"""

    def setUp(self):
        """Creates fresh system task and task managers and turns off message handlers"""
        django.setup()

        self.agent_id = 'agent_1'
        self.system_task_mgr = SystemTaskManager()
        self.task_mgr = TaskManager()
        # Make sure messaging service is "off" for these tests
        # NOTE(review): this changes the shared scheduler_mgr config and is never restored - confirm no other test
        # relies on the previous value
        scheduler_mgr.config.num_message_handlers = 0

    def _launch(self, task):
        """Assigns the test agent to the task and launches it through the task manager"""
        task.agent_id = self.agent_id
        self.task_mgr.launch_tasks([task], now())

    def _send_update(self, task, status):
        """Creates a status update for the task and delivers it to the task manager, then the system task manager"""
        update = job_test_utils.create_task_status_update(task.id, task.agent_id, status, now())
        self.task_mgr.handle_task_update(update)
        self.system_task_mgr.handle_task_update(update)

    def test_handle_completed_db_update_task(self):
        """Tests handling completed database update task"""

        # Get database update task
        when = now()
        self.assertFalse(self.system_task_mgr._is_db_update_completed)
        task = self.system_task_mgr.get_tasks_to_schedule(when)[0]
        self.assertTrue(task.id.startswith(DB_UPDATE_TASK_ID_PREFIX))

        # Schedule database update task and make sure there are no more system tasks
        self._launch(task)
        self.assertListEqual([], self.system_task_mgr.get_tasks_to_schedule(now()))

        # Complete task, verify no new tasks
        self._send_update(task, TaskStatusUpdate.RUNNING)
        self._send_update(task, TaskStatusUpdate.FINISHED)
        self.assertListEqual([], self.system_task_mgr.get_tasks_to_schedule(now()))
        self.assertTrue(self.system_task_mgr._is_db_update_completed)

    def test_handle_failed_db_update_task(self):
        """Tests handling failed database update task"""

        # Get database update task
        when = now()
        self.assertFalse(self.system_task_mgr._is_db_update_completed)
        task = self.system_task_mgr.get_tasks_to_schedule(when)[0]
        self.assertTrue(task.id.startswith(DB_UPDATE_TASK_ID_PREFIX))
        task_1_id = task.id

        # Fail task after running and get different task next time
        self._launch(task)
        self._send_update(task, TaskStatusUpdate.RUNNING)
        self._send_update(task, TaskStatusUpdate.FAILED)

        # No new database update right away
        tasks = self.system_task_mgr.get_tasks_to_schedule(when + datetime.timedelta(seconds=5))
        self.assertListEqual([], tasks)
        self.assertFalse(self.system_task_mgr._is_db_update_completed)

        # After error threshold, we should get new database update task
        new_time = when + SystemTaskManager.DATABASE_UPDATE_ERR_THRESHOLD + datetime.timedelta(seconds=5)
        task = self.system_task_mgr.get_tasks_to_schedule(new_time)[0]
        self.assertNotEqual(task.id, task_1_id)
        self.assertTrue(task.id.startswith(DB_UPDATE_TASK_ID_PREFIX))
        self.assertFalse(self.system_task_mgr._is_db_update_completed)

    def test_handle_killed_db_update_task(self):
        """Tests handling killed database update task"""

        # Get database update task
        when = now()
        self.assertFalse(self.system_task_mgr._is_db_update_completed)
        task = self.system_task_mgr.get_tasks_to_schedule(when)[0]
        self.assertTrue(task.id.startswith(DB_UPDATE_TASK_ID_PREFIX))
        task_1_id = task.id

        # Kill task after running and get different task next time
        self._launch(task)
        self._send_update(task, TaskStatusUpdate.RUNNING)
        self._send_update(task, TaskStatusUpdate.KILLED)
        task = self.system_task_mgr.get_tasks_to_schedule(when)[0]
        self.assertTrue(task.id.startswith(DB_UPDATE_TASK_ID_PREFIX))
        self.assertNotEqual(task.id, task_1_id)
        self.assertFalse(self.system_task_mgr._is_db_update_completed)

    def test_handle_lost_db_update_task(self):
        """Tests handling lost database update task"""

        # Get database update task
        when = now()
        self.assertFalse(self.system_task_mgr._is_db_update_completed)
        task = self.system_task_mgr.get_tasks_to_schedule(when)[0]
        self.assertTrue(task.id.startswith(DB_UPDATE_TASK_ID_PREFIX))
        task_1_id = task.id

        # Lose task after scheduling and get different task next time
        self._launch(task)
        self._send_update(task, TaskStatusUpdate.LOST)
        task = self.system_task_mgr.get_tasks_to_schedule(when)[0]
        task_2_id = task.id
        self.assertTrue(task.id.startswith(DB_UPDATE_TASK_ID_PREFIX))
        self.assertNotEqual(task.id, task_1_id)
        self.assertFalse(self.system_task_mgr._is_db_update_completed)

        # Lose task after running and get different task next time
        self._launch(task)
        self._send_update(task, TaskStatusUpdate.RUNNING)
        self._send_update(task, TaskStatusUpdate.LOST)
        task = self.system_task_mgr.get_tasks_to_schedule(when)[0]
        self.assertTrue(task.id.startswith(DB_UPDATE_TASK_ID_PREFIX))
        self.assertNotEqual(task.id, task_1_id)
        self.assertNotEqual(task.id, task_2_id)
        self.assertFalse(self.system_task_mgr._is_db_update_completed)