def test_determine_error(self):
    """Tests that a failed pre-task resolves each Scale error exit code to the matching error name"""

    expected_errors = [ScaleDatabaseError(), ScaleIOError(), ScaleOperationalError(), MissingSetting('')]

    for expected in expected_errors:
        # Build a fresh configuration and pre-task for every error being checked
        exe_config = ExecutionConfiguration()
        exe_config.create_tasks(['pre'])
        exe_config.set_task_ids(self.job_exe.get_cluster_id())
        pre_task = PreTask('agent_1', self.job_exe, self.job_exe.job_type, exe_config)

        # Move the task to RUNNING, then fail it with the exit code of the expected error
        running = job_test_utils.create_task_status_update(pre_task.id, pre_task.agent_id,
                                                           TaskStatusUpdate.RUNNING, now())
        pre_task.update(running)
        failed = job_test_utils.create_task_status_update(pre_task.id, pre_task.agent_id,
                                                          TaskStatusUpdate.FAILED, now(),
                                                          exit_code=expected.exit_code)

        determined = pre_task.determine_error(failed)
        self.assertEqual(expected.error_name, determined.name)
def test_handle_failed_cleanup_task(self):
    """Tests that a failed initial cleanup task is only replaced after the cleanup error threshold"""

    start = now()
    node = Node(self.node_agent, self.node)
    # NOTE(review): "heath" spelling kept as-is; confirm it matches the attribute name on Node
    node._last_heath_task = start

    # The first task offered is the initial cleanup task
    first_task = node.get_next_tasks(start)[0]
    self.assertTrue(first_task.id.startswith(CLEANUP_TASK_ID_PREFIX))
    first_task_id = first_task.id

    # Launch the task, then report it as running and afterwards as failed
    self.task_mgr.launch_tasks([first_task], now())
    for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.FAILED):
        update = job_test_utils.create_task_status_update(first_task.id, first_task.agent_id, status, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

    # Five seconds later no task is offered and the initial cleanup is still not completed
    self.assertListEqual([], node.get_next_tasks(start + datetime.timedelta(seconds=5)))
    self.assertFalse(node._is_initial_cleanup_completed)

    # Once the error threshold has passed, a new cleanup task is offered
    retry_time = start + Node.CLEANUP_ERR_THRESHOLD + datetime.timedelta(seconds=5)
    node._last_heath_task = retry_time  # Get rid of health check task
    retry_task = node.get_next_tasks(retry_time)[0]
    self.assertNotEqual(retry_task.id, first_task_id)
    self.assertTrue(retry_task.id.startswith(CLEANUP_TASK_ID_PREFIX))
def test_lost_task(self):
    """Tests that a job task reported as lost is reset and handed out again with the same task ID"""

    exe_model = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_exe = RunningJobExecution(exe_model)

    def report(task_id, status, when):
        """Sends a status update for the given task to the running job execution"""
        running_exe.task_update(job_test_utils.create_task_status_update(task_id, 'agent', status, when))

    # Pre-task: start, run, and complete
    pre_task = running_exe.start_next_task()
    pre_started = now()
    report(pre_task.id, TaskStatusUpdate.RUNNING, pre_started)
    pre_completed = pre_started + timedelta(seconds=1)
    report(pre_task.id, TaskStatusUpdate.FINISHED, pre_completed)

    # Job-task: start and run
    job_task = running_exe.start_next_task()
    job_task_id = job_task.id
    job_started = pre_completed + timedelta(seconds=1)
    report(job_task.id, TaskStatusUpdate.RUNNING, job_started)
    self.assertTrue(job_task.has_started)

    # Lose the job-task; it should no longer count as started and should be the next task again
    lost_at = job_started + timedelta(seconds=1)
    report(job_task_id, TaskStatusUpdate.LOST, lost_at)
    self.assertFalse(job_task.has_started)
    rescheduled = running_exe.start_next_task()
    self.assertEqual(job_task_id, rescheduled.id)
def test_lost_job_execution(self):
    """Tests that losing a job execution finishes it and records it as FAILED with the node-lost error"""

    exe_model = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_exe = RunningJobExecution(exe_model)

    # Pre-task: start, run, and complete
    pre_task = running_exe.start_next_task()
    pre_started = now()
    running_exe.task_update(
        job_test_utils.create_task_status_update(pre_task.id, 'agent', TaskStatusUpdate.RUNNING, pre_started))
    pre_completed = pre_started + timedelta(seconds=1)
    running_exe.task_update(
        job_test_utils.create_task_status_update(pre_task.id, 'agent', TaskStatusUpdate.FINISHED, pre_completed))

    # Start the job-task, then lose the whole execution
    lost_at = pre_completed + timedelta(seconds=1)
    job_task = running_exe.start_next_task()
    lost_task = running_exe.execution_lost(lost_at)

    # The in-memory execution is finished and the lost task is the job-task
    self.assertEqual(job_task.id, lost_task.id)
    self.assertTrue(running_exe.is_finished())
    self.assertFalse(running_exe.is_next_task_ready())

    # The database model reflects the failure
    stored_exe = JobExecution.objects.get(id=self._job_exe_id)
    self.assertEqual('FAILED', stored_exe.status)
    self.assertEqual(Error.objects.get_builtin_error('node-lost').id, stored_exe.error_id)
    self.assertEqual(lost_at, stored_exe.ended)
def test_check_for_starvation(self):
    """Tests that check_for_starvation() fails an execution whose next task was not launched in time"""

    self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2], [])

    # Launch the first task of execution 1
    launched = now()
    first_task = self.job_exe_1.start_next_task()
    self.task_mgr.launch_tasks([first_task], launched)

    def deliver(status, when):
        """Sends a status update for the first task to the task manager and the job execution manager"""
        update = job_test_utils.create_task_status_update(first_task.id, 'agent', status, when)
        self.task_mgr.handle_task_update(update)
        self.job_exe_mgr.handle_task_update(update)

    # Run and complete the first task
    started = launched + timedelta(seconds=1)
    deliver(TaskStatusUpdate.RUNNING, started)
    completed = started + timedelta(seconds=10)
    deliver(TaskStatusUpdate.FINISHED, completed)

    # Check just after the starvation threshold, while the second task has still not been launched
    check_time = completed + RESOURCE_STARVATION_THRESHOLD + timedelta(seconds=1)
    finished_exes = self.job_exe_mgr.check_for_starvation(check_time)

    # Only execution 1 should be returned, failed for resource starvation at the check time
    self.assertEqual(len(finished_exes), 1)
    starved = finished_exes[0]
    self.assertEqual(starved.id, self.job_exe_1.id)
    self.assertEqual(starved.status, 'FAILED')
    self.assertEqual(starved.error.name, 'resource-starvation')
    self.assertEqual(starved.finished, check_time)
def test_handle_failed_pull_task(self):
    """Tests that a failed Docker pull task is only replaced after the image pull error threshold"""

    start = now()
    node = Node(self.node_agent, self.node, self.scheduler)
    node._last_health_task = start
    node._initial_cleanup_completed()
    node._update_state()

    # The first task offered is the Docker pull task
    pull_task = node.get_next_tasks(start)[0]
    first_task_id = pull_task.id
    self.assertTrue(pull_task.id.startswith(PULL_TASK_ID_PREFIX))

    # Launch the task, then report it as running and afterwards as failed
    self.task_mgr.launch_tasks([pull_task], now())
    for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.FAILED):
        update = job_test_utils.create_task_status_update(pull_task.id, pull_task.agent_id, status, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

    # Five seconds later no task is offered and the image is still not pulled
    self.assertListEqual([], node.get_next_tasks(start + datetime.timedelta(seconds=5)))
    self.assertFalse(node._is_image_pulled)

    # Once the error threshold has passed, a new pull task is offered
    retry_time = start + Node.IMAGE_PULL_ERR_THRESHOLD + datetime.timedelta(seconds=5)
    node._last_health_task = retry_time  # Get rid of health check task
    retry_task = node.get_next_tasks(retry_time)[0]
    self.assertNotEqual(retry_task.id, first_task_id)
    self.assertTrue(retry_task.id.startswith(PULL_TASK_ID_PREFIX))
def test_sync_with_database(self):
    """Tests that sync_with_database() cancels an execution and reports its end once the task is killed"""

    self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2], [])

    # Get the first task of execution 1 running
    running_task = self.job_exe_1.start_next_task()
    started = now() - timedelta(minutes=5)
    self.job_exe_mgr.handle_task_update(
        job_test_utils.create_task_status_update(running_task.id, 'agent', TaskStatusUpdate.RUNNING, started))

    # Cancel the job in the database and let the manager sync
    Job.objects.update_jobs_to_canceled([self.job_exe_1.job_id], now())
    to_kill = self.job_exe_mgr.sync_with_database()

    # The execution is canceled and its running task is returned to be killed
    self.assertEqual(self.job_exe_1.status, 'CANCELED')
    self.assertEqual(len(to_kill), 1)
    self.assertEqual(to_kill[0].id, running_task.id)
    # No message yet since we wait for the canceled task to be killed
    self.assertListEqual(self.job_exe_mgr.get_messages(), [])

    # Report the task as killed
    killed = started + timedelta(minutes=5)
    self.job_exe_mgr.handle_task_update(
        job_test_utils.create_task_status_update(running_task.id, 'agent', TaskStatusUpdate.KILLED, killed))

    # A create_job_exe_ends message should now exist for the canceled job execution
    ends_msg = self.job_exe_mgr.get_messages()[0]
    self.assertEqual(ends_msg.type, 'create_job_exe_ends')
    self.assertEqual(ends_msg._job_exe_ends[0].job_exe_id, self.job_exe_1.id)
def test_handle_task_update(self):
    """Tests that handle_task_update() returns nothing while running and the execution once it fails"""

    self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2], [])

    # Start the first task of execution 1
    first_task = self.job_exe_1.start_next_task()
    started = now() - timedelta(minutes=5)
    running = job_test_utils.create_task_status_update(first_task.id, 'agent', TaskStatusUpdate.RUNNING, started)

    # Job execution is not finished, so None should be returned and no message is available
    self.assertIsNone(self.job_exe_mgr.handle_task_update(running))
    self.assertListEqual(self.job_exe_mgr.get_messages(), [])

    # Fail the task one second after it started
    failed_at = started + timedelta(seconds=1)
    failed = job_test_utils.create_task_status_update(first_task.id, 'agent', TaskStatusUpdate.FAILED, failed_at,
                                                      exit_code=1)

    # Job execution is finished, so it should be returned and a create_job_exe_ends message is available
    finished_exe = self.job_exe_mgr.handle_task_update(failed)
    self.assertEqual(self.job_exe_1.id, finished_exe.id)
    ends_msg = self.job_exe_mgr.get_messages()[0]
    self.assertEqual(ends_msg.type, 'create_job_exe_ends')
    self.assertEqual(ends_msg._job_exe_ends[0].job_exe_id, self.job_exe_1.id)
def test_handle_regular_cleanup_task(self):
    """Tests that a node offers a regular cleanup task for an added job execution and none after it finishes"""

    start = now()
    node = Node(self.node_agent, self.node)
    # NOTE(review): "heath" spelling kept as-is; confirm it matches the attribute name on Node
    node._last_heath_task = start
    node._initial_cleanup_completed()
    node._image_pull_completed()
    node._update_state()

    # No task since there are no job executions to clean
    self.assertListEqual([], node.get_next_tasks(start))

    # Adding a job execution produces a regular (non-initial) cleanup task covering it
    running_exe = RunningJobExecution(self.job_exe)
    node.add_job_execution(running_exe)
    cleanup_task = node.get_next_tasks(start)[0]
    self.assertTrue(cleanup_task.id.startswith(CLEANUP_TASK_ID_PREFIX))
    self.assertFalse(cleanup_task.is_initial_cleanup)
    self.assertListEqual(cleanup_task.job_exes, [running_exe])

    # Launch the task, then report it as running and afterwards as finished
    self.task_mgr.launch_tasks([cleanup_task], now())
    for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.FINISHED):
        update = job_test_utils.create_task_status_update(cleanup_task.id, cleanup_task.agent_id, status, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

    # No task since all job executions have been cleaned
    self.assertListEqual([], node.get_next_tasks(start))
def test_handle_failed_health_task_low_docker_space(self):
    """Tests handling a failed health task where Docker has low disk space

    The health task is failed with HealthTask.LOW_DOCKER_SPACE_CODE; the node is then expected to be DEGRADED with
    the LOW_DOCKER_SPACE_ERR condition active.
    """

    when = now()
    node = Node(self.node_agent, self.node)
    node._initial_cleanup_completed()
    node._image_pull_completed()
    node._update_state()

    # Get health task
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))

    # Fail task with low Docker space exit code
    self.task_mgr.launch_tasks([task], now())
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.FAILED, now(),
                                                      exit_code=HealthTask.LOW_DOCKER_SPACE_CODE)
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)

    # Check node state; assertIn reports the actual active errors when the check fails
    self.assertEqual(node._state, Node.DEGRADED)
    self.assertIn(NodeConditions.LOW_DOCKER_SPACE_ERR.name, node._conditions._active_errors)
def test_handle_killed_pull_task(self):
    """Tests handling killed pull task

    After the pull task is killed while running, the node is expected to offer a new pull task with a different ID
    and to still consider the image not pulled.
    """

    when = now()
    node = Node(self.node_agent, self.node)
    # NOTE(review): "heath" spelling kept as-is; confirm it matches the attribute name on Node
    node._last_heath_task = when
    node._initial_cleanup_completed()
    node._update_state()

    # Get pull task
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
    task_1_id = task.id

    # Kill task after running and get different task next time
    self.task_mgr.launch_tasks([task], now())
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.KILLED, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)

    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
    self.assertNotEqual(task.id, task_1_id)
    self.assertFalse(node._is_image_pulled)
def test_handle_failed_health_task_bad_logstash(self):
    """Tests handling a failed health task where logstash is unreachable

    The health task is failed with HealthTask.BAD_LOGSTASH_CODE; the node is then expected to be DEGRADED with the
    BAD_LOGSTASH_ERR condition active.
    """

    when = now()
    node = Node(self.node_agent, self.node)
    node._initial_cleanup_completed()
    node._image_pull_completed()
    node._update_state()

    # Get health task
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))

    # Fail task with bad logstash exit code
    self.task_mgr.launch_tasks([task], now())
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.FAILED, now(),
                                                      exit_code=HealthTask.BAD_LOGSTASH_CODE)
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)

    # Check node state; assertIn reports the actual active errors when the check fails
    self.assertEqual(node._state, Node.DEGRADED)
    self.assertIn(NodeConditions.BAD_LOGSTASH_ERR.name, node._conditions._active_errors)
def test_handle_completed_db_update_task(self):
    """Tests handling completed database update task

    The system task manager is expected to offer no further tasks while the database update task is launched, and
    to mark the database update as completed once the task finishes.
    """

    # Get database update task
    when = now()
    self.assertFalse(self.system_task_mgr._is_db_update_completed)
    task = self.system_task_mgr.get_tasks_to_schedule(when)[0]
    self.assertTrue(task.id.startswith(DB_UPDATE_TASK_ID_PREFIX))

    # Schedule database update task and make sure there are no more system tasks
    task.agent_id = self.agent_id
    self.task_mgr.launch_tasks([task], now())
    self.assertListEqual([], self.system_task_mgr.get_tasks_to_schedule(now()))

    # Complete task, verify no new tasks
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(update)
    self.system_task_mgr.handle_task_update(update)
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
    self.task_mgr.handle_task_update(update)
    self.system_task_mgr.handle_task_update(update)
    self.assertListEqual([], self.system_task_mgr.get_tasks_to_schedule(now()))
    self.assertTrue(self.system_task_mgr._is_db_update_completed)
def test_handle_regular_cleanup_task(self):
    """Tests that NodeCleanup offers a regular cleanup task for an added job execution and none after it finishes"""

    node = Node(self.node_agent, self.node)
    node.initial_cleanup_completed()
    cleanup = NodeCleanup(node)

    # No task since there are no job executions to clean
    self.assertIsNone(cleanup.get_next_task())

    # Adding a job execution produces a regular (non-initial) cleanup task covering it
    running_exe = RunningJobExecution(self.job_exe)
    cleanup.add_job_execution(running_exe)
    cleanup_task = cleanup.get_next_task()
    self.assertIsNotNone(cleanup_task)
    self.assertFalse(cleanup_task.is_initial_cleanup)
    self.assertListEqual(cleanup_task.job_exes, [running_exe])

    # Launch the task, then report it as running and afterwards as finished
    cleanup_task.launch(now())
    for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.FINISHED):
        cleanup.handle_task_update(
            job_test_utils.create_task_status_update(cleanup_task.id, cleanup_task.agent_id, status, now()))

    # No task since all job executions have been cleaned
    self.assertIsNone(cleanup.get_next_task())
def test_handle_task_update(self):
    """Tests calling TaskManager.handle_task_update()

    A FINISHED update for a launched task is expected to end that task at the update's timestamp. An update for a
    task that was never launched through this manager is expected to be ignored without raising an error.
    """

    task_id = 'task_1'
    task_name = 'My Task'
    agent_id = 'agent_1'
    task_1 = ImplementedTask(task_id, task_name, agent_id)

    when_launched = now()
    manager = TaskManager()
    manager.launch_tasks([task_1], when_launched)

    # The finish time is a point in time one second after launch (previously a bare timedelta was passed here)
    when_finished = when_launched + datetime.timedelta(seconds=1)
    update_1 = job_test_utils.create_task_status_update(task_1.id, task_1.agent_id, TaskStatusUpdate.FINISHED,
                                                        when=when_finished)
    manager.handle_task_update(update_1)

    self.assertTrue(task_1.has_ended)
    self.assertEqual(task_1._ended, when_finished)

    update_2 = job_test_utils.create_task_status_update('task_2', 'New Agent', TaskStatusUpdate.RUNNING,
                                                        when=now())
    manager.handle_task_update(update_2)  # Should ignore, no error
def test_handle_killed_db_update_task(self):
    """Tests that a killed database update task is replaced by a new database update task"""

    start = now()
    self.assertFalse(self.system_task_mgr._is_db_update_completed)

    # The first system task offered is the database update task
    db_task = self.system_task_mgr.get_tasks_to_schedule(start)[0]
    self.assertTrue(db_task.id.startswith(DB_UPDATE_TASK_ID_PREFIX))
    first_task_id = db_task.id

    # Launch the task, then report it as running and afterwards as killed
    db_task.agent_id = self.agent_id
    self.task_mgr.launch_tasks([db_task], now())
    for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.KILLED):
        update = job_test_utils.create_task_status_update(db_task.id, db_task.agent_id, status, now())
        self.task_mgr.handle_task_update(update)
        self.system_task_mgr.handle_task_update(update)

    # A different database update task is offered and the update is still not completed
    next_task = self.system_task_mgr.get_tasks_to_schedule(start)[0]
    self.assertTrue(next_task.id.startswith(DB_UPDATE_TASK_ID_PREFIX))
    self.assertNotEqual(next_task.id, first_task_id)
    self.assertFalse(self.system_task_mgr._is_db_update_completed)
def test_handle_successful_health_task(self):
    """Tests handling the health task successfully

    No further task is expected while the health task is launched or after it finishes, and the node's health
    check condition is expected to stay normal throughout.
    """

    when = now()
    node = Node(self.node_agent, self.node)
    node._initial_cleanup_completed()
    node._image_pull_completed()
    node._update_state()

    # Get health task
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
    self.assertEqual(task.agent_id, self.node_agent)

    # Schedule health task and make sure no new task is ready
    self.task_mgr.launch_tasks([task], now())
    self.assertListEqual([], node.get_next_tasks(when))
    self.assertTrue(node._conditions.is_health_check_normal)

    # Complete health task, verify no new task
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    self.assertListEqual([], node.get_next_tasks(when))
    self.assertTrue(node._conditions.is_health_check_normal)
def test_handle_initial_cleanup_task(self):
    """Tests that the initial cleanup task is offered once and marks initial cleanup completed when it finishes"""

    start = now()
    node = Node(self.node_agent, self.node)
    # NOTE(review): "heath" spelling kept as-is; confirm it matches the attribute name on Node
    node._last_heath_task = start

    # The first task offered is the initial cleanup task for this node's agent
    cleanup_task = node.get_next_tasks(start)[0]
    self.assertTrue(cleanup_task.id.startswith(CLEANUP_TASK_ID_PREFIX))
    self.assertTrue(cleanup_task.is_initial_cleanup)
    self.assertEqual(cleanup_task.agent_id, self.node_agent)

    # While the task is launched, nothing else is offered and cleanup is not yet completed
    self.task_mgr.launch_tasks([cleanup_task], now())
    self.assertListEqual([], node.get_next_tasks(start))
    self.assertFalse(node._is_initial_cleanup_completed)

    # Report the task as running and afterwards as finished
    for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.FINISHED):
        update = job_test_utils.create_task_status_update(cleanup_task.id, cleanup_task.agent_id, status, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

    # None of the tasks offered afterwards is a cleanup task
    for next_task in node.get_next_tasks(start):
        self.assertFalse(next_task.id.startswith(CLEANUP_TASK_ID_PREFIX))
    self.assertTrue(node._is_initial_cleanup_completed)
def test_handle_successful_pull_task(self):
    """Tests that finishing the Docker pull task marks the image as pulled and makes the node READY"""

    start = now()
    node = Node(self.node_agent, self.node)
    # NOTE(review): "heath" spelling kept as-is; confirm it matches the attribute name on Node
    node._last_heath_task = start
    node._initial_cleanup_completed()
    node._update_state()

    # The first task offered is the Docker pull task for this node's agent
    pull_task = node.get_next_tasks(start)[0]
    self.assertTrue(pull_task.id.startswith(PULL_TASK_ID_PREFIX))
    self.assertEqual(pull_task.agent_id, self.node_agent)

    # While the task is launched, nothing else is offered and the image is not yet pulled
    self.task_mgr.launch_tasks([pull_task], now())
    self.assertListEqual([], node.get_next_tasks(start))
    self.assertFalse(node._is_image_pulled)

    # Report the task as running and afterwards as finished
    for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.FINISHED):
        update = job_test_utils.create_task_status_update(pull_task.id, pull_task.agent_id, status, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

    # No new task is offered, the image is pulled, and the node should now be ready
    self.assertListEqual([], node.get_next_tasks(start))
    self.assertTrue(node._is_image_pulled)
    self.assertEqual(node._state, Node.READY)
def test_docker_terminated_error(self):
    """Tests that a pre-task failing with REASON_EXECUTOR_TERMINATED fails the execution as docker-terminated"""

    # Clear error cache so test works correctly
    CACHED_BUILTIN_ERRORS.clear()

    exe_model = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_exe = RunningJobExecution(exe_model)

    # Start the pre-task and report it as running
    pre_task_id = running_exe.start_next_task().id
    pre_started = now()
    running_exe.task_update(
        job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.RUNNING, pre_started))

    # The pre-task's Docker container terminates
    running_exe.task_update(
        job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.FAILED, now(),
                                                 reason='REASON_EXECUTOR_TERMINATED'))

    # The stored execution is failed with the docker-terminated error
    stored_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
    self.assertEqual(stored_exe.status, 'FAILED')
    self.assertEqual(stored_exe.error.name, 'docker-terminated')
def test_lost_node(self):
    """Tests calling lost_node() successfully

    After the node is lost and the running task is reported LOST, the execution is expected to be returned as
    FAILED with the node-lost error, and both a create_job_exe_ends and a failed_jobs message are expected.
    """

    self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2], [])
    task_1 = self.job_exe_1.start_next_task()
    task_1_started = now() - timedelta(minutes=5)
    update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.RUNNING, task_1_started)
    self.job_exe_mgr.handle_task_update(update)

    # Lose node and get lost task update
    self.job_exe_mgr.lost_node(self.node_model_1.id, now())
    update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.LOST, task_1_started)
    lost_job_exe = self.job_exe_mgr.handle_task_update(update)
    self.assertEqual(lost_job_exe.id, self.job_exe_1.id)
    self.assertEqual(lost_job_exe.status, 'FAILED')
    self.assertEqual(lost_job_exe.error.name, 'node-lost')

    # Make sure a create_job_exe_ends message and failed_jobs message exists for the lost job execution
    messages = self.job_exe_mgr.get_messages()
    self.assertEqual(len(messages), 2)
    job_exe_ends_msg = messages[0]
    self.assertEqual(job_exe_ends_msg.type, 'create_job_exe_ends')
    self.assertEqual(job_exe_ends_msg._job_exe_ends[0].job_exe_id, self.job_exe_1.id)
    failed_jobs_msg = messages[1]
    self.assertEqual(failed_jobs_msg.type, 'failed_jobs')
    self.assertIn(get_builtin_error('node-lost').id, failed_jobs_msg._failed_jobs)
    # list() keeps the indexing valid on Python 3, where dict.values() returns a non-subscriptable view
    first_failed_jobs = list(failed_jobs_msg._failed_jobs.values())[0]
    self.assertEqual(first_failed_jobs[0].job_id, self.job_exe_1.job_id)
def test_handle_task_update(self):
    """Tests calling handle_task_update() successfully

    Execution 1 fails and execution 2 completes. Each is expected to be returned by the update that finishes it,
    and the manager is expected to hold a create_job_exe_ends, a completed_jobs, and a failed_jobs message.
    """

    self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2], [])

    # Start tasks
    task_1 = self.job_exe_1.start_next_task()
    task_1_started = now() - timedelta(minutes=5)
    update_1 = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.RUNNING,
                                                        task_1_started)
    task_2 = self.job_exe_2.start_next_task()
    # Shortcut job exe 2 so that there is only one task to complete
    self.job_exe_2._remaining_tasks = []
    task_2_started = now() - timedelta(minutes=5)
    update_2 = job_test_utils.create_task_status_update(task_2.id, 'agent', TaskStatusUpdate.RUNNING,
                                                        task_2_started)

    # Job execution is not finished, so None should be returned and no message is available
    result = self.job_exe_mgr.handle_task_update(update_1)
    self.assertIsNone(result)
    result = self.job_exe_mgr.handle_task_update(update_2)
    self.assertIsNone(result)
    self.assertListEqual(self.job_exe_mgr.get_messages(), [])

    # Fail task 1 for job exe 1
    task_1_failed = task_1_started + timedelta(seconds=1)
    update_1 = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.FAILED, task_1_failed,
                                                        exit_code=1)
    # Complete task 2 for job exe 2
    task_2_completed = task_2_started + timedelta(seconds=1)
    update_2 = job_test_utils.create_task_status_update(task_2.id, 'agent', TaskStatusUpdate.FINISHED,
                                                        task_2_completed)

    # Job executions are finished, so they should be returned and a create_job_exe_ends message, a failed_jobs
    # message, and a completed_jobs message is available
    result = self.job_exe_mgr.handle_task_update(update_1)
    self.assertEqual(self.job_exe_1.id, result.id)
    result = self.job_exe_mgr.handle_task_update(update_2)
    self.assertEqual(self.job_exe_2.id, result.id)
    messages = self.job_exe_mgr.get_messages()
    self.assertEqual(len(messages), 3)
    job_exe_ends_msg = messages[0]
    self.assertEqual(job_exe_ends_msg.type, 'create_job_exe_ends')
    self.assertEqual(job_exe_ends_msg._job_exe_ends[0].job_exe_id, self.job_exe_1.id)
    self.assertEqual(job_exe_ends_msg._job_exe_ends[1].job_exe_id, self.job_exe_2.id)
    completed_jobs_msg = messages[1]
    self.assertEqual(completed_jobs_msg.type, 'completed_jobs')
    self.assertEqual(completed_jobs_msg._completed_jobs[0].job_id, self.job_exe_2.job_id)
    failed_jobs_msg = messages[2]
    self.assertEqual(failed_jobs_msg.type, 'failed_jobs')
    # list() keeps the indexing valid on Python 3, where dict.values() returns a non-subscriptable view
    first_failed_jobs = list(failed_jobs_msg._failed_jobs.values())[0]
    self.assertEqual(first_failed_jobs[0].job_id, self.job_exe_1.job_id)
def test_post_task_launch_error(self):
    """Tests that a post-task failing before it runs fails the job execution with docker-task-launch"""

    # Clear error cache so test works correctly
    CACHED_BUILTIN_ERRORS.clear()

    exe_model = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_exe = RunningJobExecution(exe_model)

    def report(task_id, status, when):
        """Sends a status update for the given task to the running job execution"""
        running_exe.task_update(job_test_utils.create_task_status_update(task_id, 'agent', status, when))

    # Pre-task: start, run, and complete
    pre_task_id = running_exe.start_next_task().id
    pre_started = now()
    report(pre_task_id, TaskStatusUpdate.RUNNING, pre_started)
    pre_completed = pre_started + timedelta(seconds=1)
    report(pre_task_id, TaskStatusUpdate.FINISHED, pre_completed)

    # Job-task: start, run, and complete
    job_task_id = running_exe.start_next_task().id
    job_started = now()
    report(job_task_id, TaskStatusUpdate.RUNNING, job_started)
    job_completed = job_started + timedelta(seconds=1)
    report(job_task_id, TaskStatusUpdate.FINISHED, job_completed)

    # Post-task: start, then fail without ever reporting RUNNING (fails to launch)
    post_task_id = running_exe.start_next_task().id
    report(post_task_id, TaskStatusUpdate.FAILED, now())

    # The stored execution is failed with the docker-task-launch error
    stored_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
    self.assertEqual(stored_exe.status, 'FAILED')
    self.assertEqual(stored_exe.error.name, 'docker-task-launch')
def test_handle_lost_health_task(self):
    """Tests handling lost health task

    The health task is lost in three situations: before launch, after launch, and after running. Each time the
    next health task is expected to differ from the first one and the health check condition to stay normal.
    """

    when = now()
    node = Node(self.node_agent, self.node, self.scheduler)
    node._initial_cleanup_completed()
    node._image_pull_completed()
    node._update_state()

    # Get health task; check it exists before any of its attributes are read
    task = node.get_next_tasks(when)[0]
    self.assertIsNotNone(task)
    self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
    task_1_id = task.id

    # Lose task without scheduling and get different task next time
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now())
    node.handle_task_update(update)
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
    self.assertNotEqual(task.id, task_1_id)
    self.assertTrue(node._conditions.is_health_check_normal)

    # Lose task with scheduling and get different task next time
    self.task_mgr.launch_tasks([task], now())
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
    self.assertNotEqual(task.id, task_1_id)
    self.assertTrue(node._conditions.is_health_check_normal)

    # Lose task after running and get different task next time
    self.task_mgr.launch_tasks([task], now())
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
    self.assertNotEqual(task.id, task_1_id)
    self.assertTrue(node._conditions.is_health_check_normal)
def test_handle_lost_pull_task(self):
    """Tests handling lost pull task

    The pull task is lost in three situations: before launch, after launch, and after running. Each time the same
    pull task (same ID) is expected to be offered again and the image to remain not pulled.
    """

    when = now()
    node = Node(self.node_agent, self.node)
    # NOTE(review): "heath" spelling kept as-is; confirm it matches the attribute name on Node
    node._last_heath_task = when
    node._initial_cleanup_completed()
    node._update_state()

    # Get pull task; check it exists before any of its attributes are read
    task = node.get_next_tasks(when)[0]
    self.assertIsNotNone(task)
    self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
    task_1_id = task.id

    # Lose task without scheduling and get same task again
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now())
    node.handle_task_update(update)
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
    self.assertEqual(task.id, task_1_id)
    self.assertFalse(node._is_image_pulled)

    # Lose task with scheduling and get same task again
    self.task_mgr.launch_tasks([task], now())
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
    self.assertEqual(task.id, task_1_id)
    self.assertFalse(node._is_image_pulled)

    # Lose task after running and get same task again
    self.task_mgr.launch_tasks([task], now())
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
    self.assertEqual(task.id, task_1_id)
    self.assertFalse(node._is_image_pulled)
def test_need_reconciliation(self):
    """Tests calling needs_reconciliation() on tasks in different launch/update states"""

    task_1 = ImplementedTask('task_1', 'Task 1', 'agent_id')
    task_2 = ImplementedTask('task_2', 'Task 2', 'agent_id')
    task_3 = ImplementedTask('task_3', 'Task 3', 'agent_id')
    task_4 = ImplementedTask('task_4', 'Task 4', 'agent_id')
    task_5 = ImplementedTask('task_5', 'Task 5', 'agent_id')

    # Timeline: the second launch happens exactly one reconciliation threshold after the first,
    # and the two checks follow at one second intervals
    one_second = datetime.timedelta(seconds=1)
    early_launch = now()
    late_launch = early_launch + RUNNING_RECON_THRESHOLD
    first_check = late_launch + one_second
    second_check = first_check + one_second

    # Tasks 1 and 2 are launched first
    task_1.launch(early_launch)
    task_2.launch(early_launch)

    # Once the threshold has passed, tasks 3 and 5 are launched and task 2 receives an update
    task_3.launch(late_launch)
    task_5.launch(late_launch)
    running_update = job_test_utils.create_task_status_update(
        task_2.id, 'agent_id', TaskStatusUpdate.RUNNING, late_launch)
    task_2.update(running_update)

    # Reconciliation is forced for task 5
    task_5.force_reconciliation()

    # State one second later:
    # task 1 was launched longer ago than the threshold -> reconcile
    self.assertTrue(task_1.needs_reconciliation(first_check))
    # task 2 was updated one second ago -> do not reconcile
    self.assertFalse(task_2.needs_reconciliation(first_check))
    # task 3 was launched one second ago -> do not reconcile
    self.assertFalse(task_3.needs_reconciliation(first_check))
    # task 4 was never launched -> do not reconcile
    self.assertFalse(task_4.needs_reconciliation(first_check))
    # task 5 had force_reconciliation() called -> reconcile
    self.assertTrue(task_5.needs_reconciliation(first_check))

    # A task update for task 5 should clear the forced reconciliation
    running_update = job_test_utils.create_task_status_update(
        task_5.id, 'agent_id', TaskStatusUpdate.RUNNING, first_check)
    task_5.update(running_update)
    self.assertFalse(task_5.needs_reconciliation(second_check))
def test_change_agent_id(self):
    """Tests the CleanupManager where a node's agent ID changes

    After node 1 switches to a new agent ID, the manager must offer exactly one new task, for the new agent
    ID, and must accept a late FAILED update for the task issued under the old agent ID without raising.
    """

    manager = CleanupManager()
    node_1 = Node(self.node_agent_1, self.node_1)
    node_2 = Node(self.node_agent_2, self.node_2)
    manager.update_nodes([node_1, node_2])

    # Launch every task offered and remember the one belonging to node 1's original agent
    tasks = manager.get_next_tasks()
    task_1 = None
    for task in tasks:
        task.launch(now())
        if task.agent_id == self.node_agent_1:
            task_1 = task
    # Fail clearly here rather than with an AttributeError below if no task was offered for node 1
    self.assertIsNotNone(task_1)

    # Node 1 changes agent ID
    node_1.update_from_mesos(agent_id=self.node_agent_3)
    manager.update_nodes([node_1, node_2])

    # Should get new initial cleanup task for node 1
    tasks = manager.get_next_tasks()
    self.assertEqual(len(tasks), 1)
    new_task_1 = tasks[0]
    self.assertEqual(new_task_1.agent_id, self.node_agent_3)

    # Task update comes back for original node 1 initial cleanup task, manager should ignore with no exception
    update = job_test_utils.create_task_status_update(task_1.id, task_1.agent_id, TaskStatusUpdate.FAILED, now())
    manager.handle_task_update(update)
def test_timed_out_system_job_task(self):
    """Tests a job execution whose system (ingest) job task times out"""

    # Ingest job type with max_tries set to 1
    job_type = Ingest.objects.get_ingest_job_type()
    job_type.max_tries = 1
    job_type.save()
    job = job_test_utils.create_job(job_type=job_type, num_exes=1)
    job_exe = job_test_utils.create_job_exe(job=job)
    running_job_exe = RunningJobExecution(job_exe)

    # Launch, start and time-out are each one second apart
    when_launched = now() + timedelta(seconds=1)
    job_task_started = when_launched + timedelta(seconds=1)
    when_timed_out = job_task_started + timedelta(seconds=1)

    # Launch the job task and report it as running
    job_task = running_job_exe.start_next_task()
    self.task_mgr.launch_tasks([job_task], when_launched)
    running_update = job_test_utils.create_task_status_update(
        job_task.id, 'agent', TaskStatusUpdate.RUNNING, job_task_started)
    self.task_mgr.handle_task_update(running_update)
    running_job_exe.task_update(running_update)

    # Time the execution out
    running_job_exe.execution_timed_out(job_task, when_timed_out)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # The stored execution must record the failure, the timeout error and the end time
    saved_job_exe = JobExecution.objects.get(id=job_exe.id)
    self.assertEqual('FAILED', saved_job_exe.status)
    self.assertEqual('ingest-timeout', saved_job_exe.error.name)
    self.assertEqual(when_timed_out, saved_job_exe.ended)
def test_job_exe_clean_task(self):
    """Tests that the NodeManager returns a cleanup task for a job execution that needs cleaning"""

    when = now()

    # Set up the node manager with two agents and hand its nodes to the cleanup manager
    node_mgr = NodeManager()
    node_mgr.register_agents([self.agent_1, self.agent_2])
    node_mgr.sync_with_database(scheduler_mgr.config)
    cleanup_mgr = CleanupManager()
    cleanup_mgr.update_nodes(node_mgr.get_nodes())
    initial_tasks = node_mgr.get_next_tasks(when)

    # Launch and finish each of the initial tasks
    task_mgr = TaskManager()
    for initial_task in initial_tasks:
        task_mgr.launch_tasks([initial_task], now())
        finished_update = job_test_utils.create_task_status_update(
            initial_task.id, initial_task.agent_id, TaskStatusUpdate.FINISHED, now())
        task_mgr.handle_task_update(finished_update)
        node_mgr.handle_task_update(finished_update)

    # Flag the image pull as done on every node so that no pull tasks are offered
    for node in node_mgr.get_nodes():
        node._image_pull_completed()
        node._update_state()

    # Register a job execution for cleanup and fetch the resulting task
    job_exe = job_test_utils.create_running_job_exe(agent_id=self.agent_1, node=self.node_1)
    cleanup_mgr.add_job_execution(job_exe)
    next_tasks = node_mgr.get_next_tasks(when)

    self.assertEqual(len(next_tasks), 1)
    cleanup_task = next_tasks[0]
    self.assertEqual(cleanup_task.agent_id, self.agent_1.agent_id)
    self.assertFalse(cleanup_task.is_initial_cleanup)
    self.assertEqual(len(cleanup_task.job_exes), 1)
def test_get_tasks_to_kill(self):
    """Tests that get_tasks_to_kill() follows the configured number of message handlers"""

    # Five desired handlers: schedule and launch the tasks for them
    scheduler_mgr.config.num_message_handlers = 5
    service = MessagingService()
    scheduled = service.get_tasks_to_schedule()
    task_mgr.launch_tasks(scheduled, now())

    # Dropping the desired count to three should mark two tasks for killing
    scheduler_mgr.config.num_message_handlers = 3
    doomed = service.get_tasks_to_kill()
    self.assertEqual(len(doomed), 2)

    # Report both tasks as killed
    for doomed_task in doomed:
        killed_update = job_test_utils.create_task_status_update(
            doomed_task.id, doomed_task.agent_id, TaskStatusUpdate.KILLED, now())
        task_mgr.handle_task_update(killed_update)
        service.handle_task_update(killed_update)
    self.assertEqual(service.get_actual_task_count(), 3)

    # Raising the desired count to ten should leave nothing to kill
    scheduler_mgr.config.num_message_handlers = 10
    doomed = service.get_tasks_to_kill()
    self.assertEqual(len(doomed), 0)
def test_general_algorithm_error(self):
    """Tests a job execution whose job-task fails with a general algorithm error (non-zero exit code)"""

    # Empty the cache of built-in errors before running
    CACHED_BUILTIN_ERRORS.clear()

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    # Pre-task: start it, report it running, then report it finished one second later
    pre_task_id = running_job_exe.start_next_task().id
    pre_task_started = now()
    status = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.RUNNING,
                                                      pre_task_started)
    running_job_exe.task_update(status)
    pre_task_completed = pre_task_started + timedelta(seconds=1)
    status = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.FINISHED,
                                                      pre_task_completed)
    running_job_exe.task_update(status)

    # Job-task: start it, report it running, then fail it one second later with exit code 1
    job_task_id = running_job_exe.start_next_task().id
    job_task_started = now()
    status = job_test_utils.create_task_status_update(job_task_id, 'agent', TaskStatusUpdate.RUNNING,
                                                      job_task_started)
    running_job_exe.task_update(status)
    job_task_failed = job_task_started + timedelta(seconds=1)
    status = job_test_utils.create_task_status_update(job_task_id, 'agent', TaskStatusUpdate.FAILED,
                                                      job_task_failed, exit_code=1)
    running_job_exe.task_update(status)

    # The stored execution must be failed with the unknown algorithm error
    saved_job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
    self.assertEqual(saved_job_exe.status, 'FAILED')
    self.assertEqual(saved_job_exe.error.name, 'algorithm-unknown')
def test_handle_lost_cleanup_tasks(self):
    """Tests that a lost cleanup task is replaced by a cleanup task with a different ID"""

    when = now()
    node = Node(self.node_agent, self.node, self.scheduler)

    # Fetch the initial cleanup task and remember its ID
    current = node.get_next_tasks(when)[0]
    self.assertTrue(current.id.startswith(CLEANUP_TASK_ID_PREFIX))
    first_id = current.id

    # Case 1: the task is lost before it was ever scheduled
    lost = job_test_utils.create_task_status_update(
        current.id, current.agent_id, TaskStatusUpdate.LOST, now())
    node.handle_task_update(lost)
    current = node.get_next_tasks(when)[0]
    self.assertTrue(current.id.startswith(CLEANUP_TASK_ID_PREFIX))
    self.assertNotEqual(current.id, first_id)
    self.assertFalse(node._is_initial_cleanup_completed)

    # Case 2: the task is lost after being scheduled
    self.task_mgr.launch_tasks([current], now())
    lost = job_test_utils.create_task_status_update(
        current.id, current.agent_id, TaskStatusUpdate.LOST, now())
    self.task_mgr.handle_task_update(lost)
    node.handle_task_update(lost)
    current = node.get_next_tasks(when)[0]
    self.assertTrue(current.id.startswith(CLEANUP_TASK_ID_PREFIX))
    self.assertNotEqual(current.id, first_id)
    self.assertFalse(node._is_initial_cleanup_completed)

    # Case 3: the task is lost after it started running
    self.task_mgr.launch_tasks([current], now())
    running = job_test_utils.create_task_status_update(
        current.id, current.agent_id, TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(running)
    node.handle_task_update(running)
    lost = job_test_utils.create_task_status_update(
        current.id, current.agent_id, TaskStatusUpdate.LOST, now())
    self.task_mgr.handle_task_update(lost)
    node.handle_task_update(lost)
    current = node.get_next_tasks(when)[0]
    self.assertTrue(current.id.startswith(CLEANUP_TASK_ID_PREFIX))
    self.assertNotEqual(current.id, first_id)
    self.assertFalse(node._is_initial_cleanup_completed)
def test_sync_with_database(self):
    """Tests that sync_with_database() picks up canceled jobs and produces the expected messages"""

    self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2], [])

    # job_exe_1 gets a task that started running five minutes ago
    task_1 = self.job_exe_1.start_next_task()
    task_1_started = now() - timedelta(minutes=5)
    running_update = job_test_utils.create_task_status_update(
        task_1.id, 'agent', TaskStatusUpdate.RUNNING, task_1_started)
    self.job_exe_mgr.handle_task_update(running_update)

    # Cancel both jobs, then let the manager sync with the database
    canceled_job_ids = [self.job_exe_1.job_id, self.job_exe_2.job_id]
    Job.objects.update_jobs_to_canceled_old(canceled_job_ids, now())
    finished_job_exes = self.job_exe_mgr.sync_with_database()

    self.assertEqual(self.job_exe_1.status, 'CANCELED')
    self.assertFalse(self.job_exe_1.is_finished())
    self.assertEqual(self.job_exe_2.status, 'CANCELED')
    self.assertTrue(self.job_exe_2.is_finished())

    # job_exe_1 still has a task to kill, so job_exe_2 is the only finished execution
    self.assertEqual(len(finished_job_exes), 1)
    self.assertEqual(finished_job_exes[0].id, self.job_exe_2.id)

    # The first message must be a create_job_exe_ends message for job_exe_2
    first_message = self.job_exe_mgr.get_messages()[0]
    self.assertEqual(first_message.type, 'create_job_exe_ends')
    self.assertEqual(first_message._job_exe_ends[0].job_exe_id, self.job_exe_2.id)

    # The task of job_exe_1 is reported as killed
    task_1_killed = task_1_started + timedelta(minutes=5)
    killed_update = job_test_utils.create_task_status_update(
        task_1.id, 'agent', TaskStatusUpdate.KILLED, task_1_killed)
    self.job_exe_mgr.handle_task_update(killed_update)

    # job_exe_1 is now finished and the first message must be its create_job_exe_ends message
    self.assertTrue(self.job_exe_1.is_finished())
    first_message = self.job_exe_mgr.get_messages()[0]
    self.assertEqual(first_message.type, 'create_job_exe_ends')
    self.assertEqual(first_message._job_exe_ends[0].job_exe_id, self.job_exe_1.id)
def test_handle_killed_task(self):
    """Tests handling killed cleanup task

    After the initial cleanup task is killed while running, the next task offered must have a different ID
    and the node's initial cleanup must still be incomplete.
    """

    node = Node(self.node_agent, self.node)
    node_cleanup = NodeCleanup(node)

    # Get initial cleanup task
    task = node_cleanup.get_next_task()
    # Check for None before reading task.id so a missing task shows up as an assertion failure
    self.assertIsNotNone(task)
    task_1_id = task.id

    # Kill task after running and get different task next time
    task.launch(now())
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
    node_cleanup.handle_task_update(update)
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.KILLED, now())
    node_cleanup.handle_task_update(update)
    task = node_cleanup.get_next_task()
    self.assertIsNotNone(task)
    self.assertNotEqual(task.id, task_1_id)
    self.assertFalse(node.is_initial_cleanup_completed)
def test_timed_out_system_job_task(self):
    """Tests a job execution whose system (ingest) job task times out and is then killed"""

    # Ingest job type with max_tries set to 1
    job_type = Ingest.objects.get_ingest_job_type()
    job_type.max_tries = 1
    job_type.save()
    running_job_exe = job_test_utils.create_running_job_exe(
        agent_id='agent_1', job_type=job_type, num_exes=1)

    # Launch, start and time-out are each one second apart
    when_launched = now() + timedelta(seconds=1)
    job_task_started = when_launched + timedelta(seconds=1)
    when_timed_out = job_task_started + timedelta(seconds=1)

    # Launch the job task and report it as running
    job_task = running_job_exe.start_next_task()
    self.task_mgr.launch_tasks([job_task], when_launched)
    running_update = job_test_utils.create_task_status_update(
        job_task.id, 'agent', TaskStatusUpdate.RUNNING, job_task_started)
    self.task_mgr.handle_task_update(running_update)
    running_job_exe.task_update(running_update)

    # Time the execution out; it stays unfinished until the killed task update arrives
    running_job_exe.execution_timed_out(job_task, when_timed_out)
    self.assertFalse(running_job_exe.is_finished())
    self.assertEqual(running_job_exe.status, 'FAILED')
    self.assertEqual(running_job_exe.error_category, 'SYSTEM')
    self.assertEqual(running_job_exe.error.name, 'ingest-timeout')
    self.assertEqual(running_job_exe.finished, when_timed_out)
    self.assertFalse(running_job_exe.is_next_task_ready())

    # The killed update arrives one second later and finishes the job execution
    job_task_kill = when_timed_out + timedelta(seconds=1)
    killed_update = job_test_utils.create_task_status_update(
        job_task.id, 'agent', TaskStatusUpdate.KILLED, job_task_kill)
    self.task_mgr.handle_task_update(killed_update)
    running_job_exe.task_update(killed_update)
    self.assertTrue(running_job_exe.is_finished())
    self.assertEqual(running_job_exe.status, 'FAILED')
    self.assertEqual(running_job_exe.error_category, 'SYSTEM')
    self.assertEqual(running_job_exe.error.name, 'ingest-timeout')
    self.assertEqual(running_job_exe.finished, when_timed_out)
    self.assertFalse(running_job_exe.is_next_task_ready())
def test_handle_failed_health_task(self):
    """Tests handling failed health task

    After a running health task fails, the node must be DEGRADED with the health failure error active, must
    offer no task five seconds later, and must offer a new health task once Node.HEALTH_ERR_THRESHOLD has
    passed.
    """

    when = now()
    node = Node(self.node_agent, self.node)
    node._initial_cleanup_completed()
    node._image_pull_completed()
    node._update_state()

    # Get health task
    task = node.get_next_tasks(when)[0]
    task_1_id = task.id
    self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))

    # Fail task after running
    self.task_mgr.launch_tasks([task], now())
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.FAILED, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)

    # Check node state
    self.assertEqual(node._state, Node.DEGRADED)
    # assertIn reports both the missing name and the container contents on failure
    self.assertIn(NodeConditions.HEALTH_FAIL_ERR.name, node._conditions._active_errors)

    # No new health task right away
    tasks = node.get_next_tasks(when + datetime.timedelta(seconds=5))
    self.assertListEqual([], tasks)
    self.assertFalse(node._conditions.is_health_check_normal)

    # After error threshold, we should get new health task
    new_time = when + Node.HEALTH_ERR_THRESHOLD + datetime.timedelta(seconds=5)
    task = node.get_next_tasks(new_time)[0]
    self.assertNotEqual(task.id, task_1_id)
    self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
def test_canceled_job_execution(self):
    """Tests a job execution that is canceled after its job-task has been started"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    # Pre-task: start it, report it running, then report it finished one second later
    pre_task = running_job_exe.start_next_task()
    pre_task_started = now()
    status = job_test_utils.create_task_status_update(pre_task.id, 'agent', TaskStatusUpdate.RUNNING,
                                                      pre_task_started)
    running_job_exe.task_update(status)
    pre_task_completed = pre_task_started + timedelta(seconds=1)
    status = job_test_utils.create_task_status_update(pre_task.id, 'agent', TaskStatusUpdate.FINISHED,
                                                      pre_task_completed)
    running_job_exe.task_update(status)

    # Start the job-task and cancel the execution; the canceled task must be the job-task
    job_task = running_job_exe.start_next_task()
    canceled_task = running_job_exe.execution_canceled()
    self.assertEqual(job_task.id, canceled_task.id)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())
def test_parsing_container_name(self):
    """Tests that a task picks up its container name from the data of a RUNNING task update"""

    task_id = 'task_1'
    task_name = 'My Task'
    agent_id = 'agent_1'
    container_name = 'container_1234'

    # The container name is carried in the MESOS_CONTAINER_NAME entry of the environment list
    env_entries = ['DUMMY_ENV=DUMMY', 'MESOS_CONTAINER_NAME=' + container_name]
    data = {'Config': {'Env': env_entries}}

    task = ImplementedTask(task_id, task_name, agent_id)
    running_update = job_test_utils.create_task_status_update(task_id, agent_id, TaskStatusUpdate.RUNNING,
                                                              now(), data=data)
    task.update(running_update)

    self.assertEqual(task.container_name, container_name)
def test_determine_error(self):
    """Tests that a pre-task maps the exit code of each known Scale error to the matching error"""

    expected_errors = [ScaleDatabaseError(), ScaleIOError(), ScaleOperationalError(), MissingSetting('')]

    for expected in expected_errors:
        # A fresh pre-task is reported running, then a FAILED update carries the error's exit code
        pre_task = PreTask(self.job_exe)
        running_update = job_test_utils.create_task_status_update(
            pre_task.id, pre_task.agent_id, TaskStatusUpdate.RUNNING, now())
        pre_task.update(running_update)
        failed_update = job_test_utils.create_task_status_update(
            pre_task.id, pre_task.agent_id, TaskStatusUpdate.FAILED, now(), exit_code=expected.exit_code)

        determined = pre_task.determine_error(failed_update)
        self.assertEqual(expected.error_name, determined.name)
def test_handle_lost_tasks(self):
    """Tests handling lost cleanup tasks

    The initial cleanup task is lost three times: before being scheduled, after being scheduled, and after
    running. Each time the same task (same ID) must be offered again and the node's initial cleanup must
    still be incomplete.
    """

    node = Node(self.node_agent, self.node)
    node_cleanup = NodeCleanup(node)

    # Get initial cleanup task
    task = node_cleanup.get_next_task()
    # Check for None before reading task.id so a missing task shows up as an assertion failure
    self.assertIsNotNone(task)
    task_1_id = task.id

    # Lose task without scheduling and get same task again
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now())
    node_cleanup.handle_task_update(update)
    task = node_cleanup.get_next_task()
    self.assertIsNotNone(task)
    self.assertEqual(task.id, task_1_id)
    self.assertFalse(node.is_initial_cleanup_completed)

    # Lose task with scheduling and get same task again
    task.launch(now())
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now())
    node_cleanup.handle_task_update(update)
    task = node_cleanup.get_next_task()
    self.assertIsNotNone(task)
    self.assertEqual(task.id, task_1_id)
    self.assertFalse(node.is_initial_cleanup_completed)

    # Lose task after running and get same task again
    task.launch(now())
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
    node_cleanup.handle_task_update(update)
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now())
    node_cleanup.handle_task_update(update)
    task = node_cleanup.get_next_task()
    self.assertIsNotNone(task)
    self.assertEqual(task.id, task_1_id)
    self.assertFalse(node.is_initial_cleanup_completed)
def test_failed_normal_job_execution(self):
    """Tests a normal job execution whose pre-task fails"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # Start the pre-task
    pre_task_id = running_job_exe.start_next_task().id
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Report the pre-task as running five minutes in the past, so that now() called at completion is later
    pre_task_started = now() - timedelta(minutes=5)
    status = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.RUNNING,
                                                      pre_task_started)
    running_job_exe.task_update(status)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Fail the pre-task one second after it started, with exit code 1
    pre_task_failed = pre_task_started + timedelta(seconds=1)
    status = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.FAILED,
                                                      pre_task_failed, exit_code=1)
    running_job_exe.task_update(status)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Verify what was stored for the execution
    saved_job_exe = JobExecution.objects.get(id=self._job_exe_id)
    self.assertEqual(pre_task_started, saved_job_exe.pre_started)
    self.assertEqual(pre_task_failed, saved_job_exe.pre_completed)
    self.assertEqual(1, saved_job_exe.pre_exit_code)
    self.assertEqual('FAILED', saved_job_exe.status)
    self.assertIsNotNone(saved_job_exe.error_id)
    self.assertGreater(saved_job_exe.ended, pre_task_failed)
def test_handle_initial_cleanup_task(self):
    """Tests the life cycle of the initial cleanup task from creation to completion"""

    node = Node(self.node_agent, self.node)
    node_cleanup = NodeCleanup(node)

    # The first task offered must be the initial cleanup for this node's agent
    initial_task = node_cleanup.get_next_task()
    self.assertIsNotNone(initial_task)
    self.assertTrue(initial_task.is_initial_cleanup)
    self.assertEqual(initial_task.agent_id, self.node_agent)

    # Once the task is launched, no further task is offered and cleanup is still incomplete
    initial_task.launch(now())
    self.assertIsNone(node_cleanup.get_next_task())
    self.assertFalse(node.is_initial_cleanup_completed)

    # Run the task to completion; still no further task, and the node reports cleanup as completed
    running_update = job_test_utils.create_task_status_update(initial_task.id, initial_task.agent_id,
                                                              TaskStatusUpdate.RUNNING, now())
    node_cleanup.handle_task_update(running_update)
    finished_update = job_test_utils.create_task_status_update(initial_task.id, initial_task.agent_id,
                                                               TaskStatusUpdate.FINISHED, now())
    node_cleanup.handle_task_update(finished_update)
    self.assertIsNone(node_cleanup.get_next_task())
    self.assertTrue(node.is_initial_cleanup_completed)
def test_no_job_exes_to_clean(self):
    """Tests that the CleanupManager offers no tasks when there are no job executions to clean"""

    manager = CleanupManager()
    nodes = [Node(self.node_agent_1, self.node_1), Node(self.node_agent_2, self.node_2)]
    manager.update_nodes(nodes)

    # Launch and finish each of the initial cleanup tasks
    for initial_task in manager.get_next_tasks():
        initial_task.launch(now())
        finished_update = job_test_utils.create_task_status_update(initial_task.id, initial_task.agent_id,
                                                                   TaskStatusUpdate.FINISHED, now())
        manager.handle_task_update(finished_update)

    # With no job executions registered for cleanup, nothing further is offered
    remaining = manager.get_next_tasks()
    self.assertListEqual(remaining, [])
def test_job_exe_clean_task(self):
    """Tests that the CleanupManager returns a cleanup task for a job execution that needs cleaning"""

    manager = CleanupManager()
    nodes = [Node(self.node_agent_1, self.node_1), Node(self.node_agent_2, self.node_2)]
    manager.update_nodes(nodes)

    # Launch and finish each of the initial cleanup tasks
    for initial_task in manager.get_next_tasks():
        initial_task.launch(now())
        finished_update = job_test_utils.create_task_status_update(initial_task.id, initial_task.agent_id,
                                                                   TaskStatusUpdate.FINISHED, now())
        manager.handle_task_update(finished_update)

    # Register a job execution for cleanup and fetch the resulting task
    manager.add_job_execution(RunningJobExecution(self.job_exe_1))
    next_tasks = manager.get_next_tasks()

    self.assertEqual(len(next_tasks), 1)
    cleanup_task = next_tasks[0]
    self.assertEqual(cleanup_task.agent_id, self.node_agent_1)
    self.assertFalse(cleanup_task.is_initial_cleanup)
    self.assertEqual(len(cleanup_task.job_exes), 1)
def test_successful_normal_job_execution(self):
    """Tests a normal job execution that runs its pre-, job- and post-task successfully"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # --- Pre-task ---
    pre_task_id = running_job_exe.start_next_task().id
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Report the pre-task as running five minutes in the past, so that now() called at completion is later
    pre_task_started = now() - timedelta(minutes=5)
    status = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.RUNNING,
                                                      pre_task_started)
    running_job_exe.task_update(status)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Write new command arguments to the database, standing in for what the pre-task would do
    new_command_args = '-arg updated'
    JobExecution.objects.filter(id=self._job_exe_id).update(command_arguments=new_command_args)

    # Finish the pre-task one second after it started
    pre_task_completed = pre_task_started + timedelta(seconds=1)
    status = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.FINISHED,
                                                      pre_task_completed, exit_code=1)
    running_job_exe.task_update(status)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # --- Job-task ---
    job_task = running_job_exe.start_next_task()
    job_task_id = job_task.id
    # The job-task must carry the updated command arguments
    self.assertEqual(job_task._command_arguments, new_command_args)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Report the job-task as running one second after the pre-task completed
    job_task_started = pre_task_completed + timedelta(seconds=1)
    status = job_test_utils.create_task_status_update(job_task_id, 'agent', TaskStatusUpdate.RUNNING,
                                                      job_task_started)
    running_job_exe.task_update(status)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Finish the job-task one second after it started
    job_task_completed = job_task_started + timedelta(seconds=1)
    status = job_test_utils.create_task_status_update(job_task_id, 'agent', TaskStatusUpdate.FINISHED,
                                                      job_task_completed, exit_code=2)
    running_job_exe.task_update(status)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # --- Post-task ---
    post_task_id = running_job_exe.start_next_task().id
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Report the post-task as running one second after the job-task completed
    post_task_started = job_task_completed + timedelta(seconds=1)
    status = job_test_utils.create_task_status_update(post_task_id, 'agent', TaskStatusUpdate.RUNNING,
                                                      post_task_started)
    running_job_exe.task_update(status)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Finish the post-task one second after it started, which finishes the whole execution
    post_task_completed = post_task_started + timedelta(seconds=1)
    status = job_test_utils.create_task_status_update(post_task_id, 'agent', TaskStatusUpdate.FINISHED,
                                                      post_task_completed, exit_code=3)
    running_job_exe.task_update(status)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Verify the times, exit codes and status stored for the execution
    saved_job_exe = JobExecution.objects.get(id=self._job_exe_id)
    self.assertEqual(pre_task_started, saved_job_exe.pre_started)
    self.assertEqual(pre_task_completed, saved_job_exe.pre_completed)
    self.assertEqual(1, saved_job_exe.pre_exit_code)
    self.assertEqual(job_task_started, saved_job_exe.job_started)
    self.assertEqual(job_task_completed, saved_job_exe.job_completed)
    self.assertEqual(2, saved_job_exe.job_exit_code)
    self.assertEqual(post_task_started, saved_job_exe.post_started)
    self.assertEqual(post_task_completed, saved_job_exe.post_completed)
    self.assertEqual(3, saved_job_exe.post_exit_code)
    self.assertEqual('COMPLETED', saved_job_exe.status)
    self.assertGreater(saved_job_exe.ended, post_task_completed)