Exemplo n.º 1
0
    def test_determine_error(self):
        """Tests that a pre-task successfully determines the correct error"""

        scale_errors = [
            ScaleDatabaseError(),
            ScaleIOError(),
            ScaleOperationalError(),
            MissingSetting('')
        ]

        for scale_error in scale_errors:
            config = ExecutionConfiguration()
            config.create_tasks(['pre'])
            config.set_task_ids(self.job_exe.get_cluster_id())
            task = PreTask('agent_1', self.job_exe, self.job_exe.job_type,
                           config)
            update = job_test_utils.create_task_status_update(
                task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
            task.update(update)
            update = job_test_utils.create_task_status_update(
                task.id,
                task.agent_id,
                TaskStatusUpdate.FAILED,
                now(),
                exit_code=scale_error.exit_code)
            error = task.determine_error(update)
            self.assertEqual(scale_error.error_name, error.name)
Exemplo n.º 2
0
    def test_handle_failed_cleanup_task(self):
        """Tests handling failed cleanup task"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._last_heath_task = when
        # Get initial cleanup task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        task_1_id = task.id

        # Fail task after running and get different task next time
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.FAILED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

        # No new cleanup task right away
        tasks = node.get_next_tasks(when + datetime.timedelta(seconds=5))
        self.assertListEqual([], tasks)
        self.assertFalse(node._is_initial_cleanup_completed)

        # After error threshold, we should get new cleanup task
        new_time = when + Node.CLEANUP_ERR_THRESHOLD + datetime.timedelta(
            seconds=5)
        node._last_heath_task = new_time  # Get rid of health check task
        task = node.get_next_tasks(new_time)[0]
        self.assertNotEqual(task.id, task_1_id)
        self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
Exemplo n.º 3
0
    def test_lost_task(self):
        """Tests running through a job execution that has a task that gets lost"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start, run, and complete pre-task
        task = running_job_exe.start_next_task()
        pre_task_started = now()
        update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.RUNNING, pre_task_started)
        running_job_exe.task_update(update)
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.FINISHED,
                                                          pre_task_completed)
        running_job_exe.task_update(update)

        # Start job-task
        task = running_job_exe.start_next_task()
        job_task_id = task.id
        job_task_started = pre_task_completed + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.RUNNING, job_task_started)
        running_job_exe.task_update(update)
        self.assertTrue(task.has_started)

        # Lose task and make sure the same task is the next one to schedule again
        when_lost = job_task_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(job_task_id, 'agent', TaskStatusUpdate.LOST, when_lost)
        running_job_exe.task_update(update)
        self.assertFalse(task.has_started)
        task = running_job_exe.start_next_task()
        self.assertEqual(job_task_id, task.id)
Exemplo n.º 4
0
    def test_lost_job_execution(self):
        """Tests running through a job execution that gets lost"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start, run, and complete pre-task
        task = running_job_exe.start_next_task()
        pre_task_started = now()
        update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.RUNNING, pre_task_started)
        running_job_exe.task_update(update)
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.FINISHED,
                                                          pre_task_completed)
        running_job_exe.task_update(update)

        # Start job-task and then execution gets lost
        when_lost = pre_task_completed + timedelta(seconds=1)
        job_task = running_job_exe.start_next_task()
        lost_task = running_job_exe.execution_lost(when_lost)
        self.assertEqual(job_task.id, lost_task.id)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        job_exe = JobExecution.objects.get(id=self._job_exe_id)
        self.assertEqual('FAILED', job_exe.status)
        self.assertEqual(Error.objects.get_builtin_error('node-lost').id, job_exe.error_id)
        self.assertEqual(when_lost, job_exe.ended)
Exemplo n.º 5
0
    def test_check_for_starvation(self):
        """Tests calling check_for_starvation() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2], [])

        # Start and complete first task of execution
        task_1_launched = now()
        task_1 = self.job_exe_1.start_next_task()
        self.task_mgr.launch_tasks([task_1], task_1_launched)
        task_1_started = task_1_launched + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.RUNNING, task_1_started)
        self.task_mgr.handle_task_update(update)
        self.job_exe_mgr.handle_task_update(update)
        task_1_completed = task_1_started + timedelta(seconds=10)
        update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.FINISHED,
                                                          task_1_completed)
        self.task_mgr.handle_task_update(update)
        self.job_exe_mgr.handle_task_update(update)

        # Check after the time threshold has passed and task 2 has still not been launched
        check_time = task_1_completed + RESOURCE_STARVATION_THRESHOLD + timedelta(seconds=1)
        finished_job_exes = self.job_exe_mgr.check_for_starvation(check_time)

        # Check that execution 1 was failed for starvation
        self.assertEqual(len(finished_job_exes), 1)
        starved_job_exe = finished_job_exes[0]
        self.assertEqual(starved_job_exe.id, self.job_exe_1.id)
        self.assertEqual(starved_job_exe.status, 'FAILED')
        self.assertEqual(starved_job_exe.error.name, 'resource-starvation')
        self.assertEqual(starved_job_exe.finished, check_time)
Exemplo n.º 6
0
    def test_handle_failed_pull_task(self):
        """Tests handling failed Docker pull task"""

        when = now()
        node = Node(self.node_agent, self.node, self.scheduler)
        node._last_health_task = when
        node._initial_cleanup_completed()
        node._update_state()
        # Get Docker pull task
        task = node.get_next_tasks(when)[0]
        task_1_id = task.id
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))

        # Fail task after running
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.FAILED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

        # No new pull task right away
        tasks = node.get_next_tasks(when + datetime.timedelta(seconds=5))
        self.assertListEqual([], tasks)
        self.assertFalse(node._is_image_pulled)

        # After error threshold, we should get new pull task
        new_time = when + Node.IMAGE_PULL_ERR_THRESHOLD + datetime.timedelta(seconds=5)
        node._last_health_task = new_time  # Get rid of health check task
        task = node.get_next_tasks(new_time)[0]
        self.assertNotEqual(task.id, task_1_id)
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
Exemplo n.º 7
0
    def test_sync_with_database(self):
        """Tests calling sync_with_database() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2], [])

        task_1 = self.job_exe_1.start_next_task()
        task_1_started = now() - timedelta(minutes=5)
        update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.RUNNING, task_1_started)
        self.job_exe_mgr.handle_task_update(update)

        # Cancel job_exe_1 and have manager sync with database
        Job.objects.update_jobs_to_canceled([self.job_exe_1.job_id], now())
        tasks_to_kill = self.job_exe_mgr.sync_with_database()

        self.assertEqual(self.job_exe_1.status, 'CANCELED')
        self.assertEqual(len(tasks_to_kill), 1)
        self.assertEqual(tasks_to_kill[0].id, task_1.id)
        # No message yet since we wait for the canceled task to be killed
        self.assertListEqual(self.job_exe_mgr.get_messages(), [])

        # Task killed
        task_1_killed = task_1_started + timedelta(minutes=5)
        update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.KILLED, task_1_killed)
        self.job_exe_mgr.handle_task_update(update)

        # Make sure a create_job_exe_ends message exists for the canceled job execution
        message = self.job_exe_mgr.get_messages()[0]
        self.assertEqual(message.type, 'create_job_exe_ends')
        self.assertEqual(message._job_exe_ends[0].job_exe_id, self.job_exe_1.id)
Exemplo n.º 8
0
    def test_handle_task_update(self):
        """Tests calling handle_task_update() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2], [])

        # Start task
        task_1 = self.job_exe_1.start_next_task()
        task_1_started = now() - timedelta(minutes=5)
        update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.RUNNING, task_1_started)

        # Job execution is not finished, so None should be returned and no message is available
        result = self.job_exe_mgr.handle_task_update(update)
        self.assertIsNone(result)
        self.assertListEqual(self.job_exe_mgr.get_messages(), [])

        # Fail task
        task_1_failed = task_1_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.FAILED, task_1_failed,
                                                          exit_code=1)

        # Job execution is finished, so it should be returned and a create_job_exe_ends message is available
        result = self.job_exe_mgr.handle_task_update(update)
        self.assertEqual(self.job_exe_1.id, result.id)
        message = self.job_exe_mgr.get_messages()[0]
        self.assertEqual(message.type, 'create_job_exe_ends')
        self.assertEqual(message._job_exe_ends[0].job_exe_id, self.job_exe_1.id)
Exemplo n.º 9
0
    def test_handle_regular_cleanup_task(self):
        """Tests handling a regular cleanup task"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._last_heath_task = when
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()

        # No task since there are no job executions to clean
        self.assertListEqual([], node.get_next_tasks(when))

        # Add job execution and complete task to clean it up
        job_exe = RunningJobExecution(self.job_exe)
        node.add_job_execution(job_exe)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertFalse(task.is_initial_cleanup)
        self.assertListEqual(task.job_exes, [job_exe])
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

        # No task since all job executions have been cleaned
        self.assertListEqual([], node.get_next_tasks(when))
Exemplo n.º 10
0
    def test_handle_failed_health_task_low_docker_space(self):
        """Tests handling a failed health task where Docker has low disk space"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()
        # Get health task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))

        # Fail task with low Docker space exit code
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id,
            task.agent_id,
            TaskStatusUpdate.FAILED,
            now(),
            exit_code=HealthTask.LOW_DOCKER_SPACE_CODE)
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

        # Check node state
        self.assertEqual(node._state, Node.DEGRADED)
        self.assertTrue(NodeConditions.LOW_DOCKER_SPACE_ERR.name in
                        node._conditions._active_errors)
Exemplo n.º 11
0
    def test_handle_killed_pull_task(self):
        """Tests handling killed cleanup task"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._last_heath_task = when
        node._initial_cleanup_completed()
        node._update_state()
        # Get pull task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
        task_1_id = task.id

        # Kill task after running and get different task next time
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.KILLED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
        self.assertNotEqual(task.id, task_1_id)
        self.assertFalse(node._is_image_pulled)
Exemplo n.º 12
0
    def test_handle_failed_health_task_bad_logstash(self):
        """Tests handling a failed health task where logstash is unreachable"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()
        # Get health task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))

        # Fail task with bad logstash exit code
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id,
            task.agent_id,
            TaskStatusUpdate.FAILED,
            now(),
            exit_code=HealthTask.BAD_LOGSTASH_CODE)
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

        # Check node state
        self.assertEqual(node._state, Node.DEGRADED)
        self.assertTrue(NodeConditions.BAD_LOGSTASH_ERR.name in
                        node._conditions._active_errors)
Exemplo n.º 13
0
    def test_handle_completed_db_update_task(self):
        """Tests handling completed database update task"""

        # Get database update task
        when = now()
        self.assertFalse(self.system_task_mgr._is_db_update_completed)
        task = self.system_task_mgr.get_tasks_to_schedule(when)[0]
        self.assertTrue(task.id.startswith(DB_UPDATE_TASK_ID_PREFIX))
        task_1_id = task.id

        # Schedule database update task and make sure there are no more system tasks
        task.agent_id = self.agent_id
        self.task_mgr.launch_tasks([task], now())
        self.assertListEqual([],
                             self.system_task_mgr.get_tasks_to_schedule(now()))

        # Complete task, verify no new tasks
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        self.system_task_mgr.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
        self.task_mgr.handle_task_update(update)
        self.system_task_mgr.handle_task_update(update)
        self.assertListEqual([],
                             self.system_task_mgr.get_tasks_to_schedule(now()))
        self.assertTrue(self.system_task_mgr._is_db_update_completed)
Exemplo n.º 14
0
    def test_handle_regular_cleanup_task(self):
        """Tests handling a regular cleanup task"""

        node = Node(self.node_agent, self.node)
        node.initial_cleanup_completed()
        node_cleanup = NodeCleanup(node)

        # No task since there are no job executions to clean
        self.assertIsNone(node_cleanup.get_next_task())

        # Add job execution and complete task to clean it up
        job_exe = RunningJobExecution(self.job_exe)
        node_cleanup.add_job_execution(job_exe)
        task = node_cleanup.get_next_task()
        self.assertIsNotNone(task)
        self.assertFalse(task.is_initial_cleanup)
        self.assertListEqual(task.job_exes, [job_exe])
        task.launch(now())
        update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        node_cleanup.handle_task_update(update)
        update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
        node_cleanup.handle_task_update(update)

        # No task since all job executions have been cleaned
        self.assertIsNone(node_cleanup.get_next_task())
Exemplo n.º 15
0
    def test_handle_task_update(self):
        """Tests calling TaskManager.handle_task_update()"""

        task_id = 'task_1'
        task_name = 'My Task'
        agent_id = 'agent_1'
        task_1 = ImplementedTask(task_id, task_name, agent_id)

        when_launched = now()
        manager = TaskManager()
        manager.launch_tasks([task_1], when_launched)

        when_finished = datetime.timedelta(seconds=1)
        update_1 = job_test_utils.create_task_status_update(
            task_1.id,
            task_1.agent_id,
            TaskStatusUpdate.FINISHED,
            when=when_finished)
        manager.handle_task_update(update_1)

        self.assertTrue(task_1.has_ended)
        self.assertEqual(task_1._ended, when_finished)

        update_2 = job_test_utils.create_task_status_update(
            'task_2', 'New Agent', TaskStatusUpdate.RUNNING, when=now())
        manager.handle_task_update(update_2)  # Should ignore, no error
Exemplo n.º 16
0
    def test_handle_killed_db_update_task(self):
        """Tests handling killed database update task"""

        # Get database update task
        when = now()
        self.assertFalse(self.system_task_mgr._is_db_update_completed)
        task = self.system_task_mgr.get_tasks_to_schedule(when)[0]
        self.assertTrue(task.id.startswith(DB_UPDATE_TASK_ID_PREFIX))
        task_1_id = task.id

        # Kill task after running and get different task next time
        task.agent_id = self.agent_id
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        self.system_task_mgr.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.KILLED, now())
        self.task_mgr.handle_task_update(update)
        self.system_task_mgr.handle_task_update(update)
        task = self.system_task_mgr.get_tasks_to_schedule(when)[0]
        self.assertTrue(task.id.startswith(DB_UPDATE_TASK_ID_PREFIX))
        self.assertNotEqual(task.id, task_1_id)
        self.assertFalse(self.system_task_mgr._is_db_update_completed)
Exemplo n.º 17
0
    def test_handle_successful_health_task(self):
        """Tests handling the health task successfully"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()

        # Get health task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
        self.assertEqual(task.agent_id, self.node_agent)

        # Schedule health task and make sure no new task is ready
        self.task_mgr.launch_tasks([task], now())
        self.assertListEqual([], node.get_next_tasks(when))
        self.assertTrue(node._conditions.is_health_check_normal)

        # Complete pull task, verify no new task
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        self.assertListEqual([], node.get_next_tasks(when))
        self.assertTrue(node._conditions.is_health_check_normal)
Exemplo n.º 18
0
    def test_handle_initial_cleanup_task(self):
        """Tests handling the initial cleanup task"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._last_heath_task = when

        # Get initial cleanup task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertTrue(task.is_initial_cleanup)
        self.assertEqual(task.agent_id, self.node_agent)

        # Schedule initial cleanup and make sure no new task is ready
        self.task_mgr.launch_tasks([task], now())
        self.assertListEqual([], node.get_next_tasks(when))
        self.assertFalse(node._is_initial_cleanup_completed)

        # Complete initial clean up, verify no new cleanup task
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        for task in node.get_next_tasks(when):
            self.assertFalse(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertTrue(node._is_initial_cleanup_completed)
Exemplo n.º 19
0
    def test_handle_successful_pull_task(self):
        """Tests handling the Docker pull task successfully"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._last_heath_task = when
        node._initial_cleanup_completed()
        node._update_state()

        # Get Docker pull task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
        self.assertEqual(task.agent_id, self.node_agent)

        # Schedule pull task and make sure no new task is ready
        self.task_mgr.launch_tasks([task], now())
        self.assertListEqual([], node.get_next_tasks(when))
        self.assertFalse(node._is_image_pulled)

        # Complete pull task, verify no new task
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        self.assertListEqual([], node.get_next_tasks(when))
        self.assertTrue(node._is_image_pulled)
        # Node should now be ready
        self.assertEqual(node._state, Node.READY)
Exemplo n.º 20
0
    def test_docker_terminated_error(self):
        """Tests running through a job execution where a Docker container terminates"""

        # Clear error cache so test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id

        # Pre-task running
        pre_task_started = now()
        update = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.RUNNING,
                                                          pre_task_started)
        running_job_exe.task_update(update)

        # Pre-task Docker container terminates
        update = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.FAILED, now(),
                                                          reason='REASON_EXECUTOR_TERMINATED')
        running_job_exe.task_update(update)

        # Check results
        job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
        self.assertEqual(job_exe.status, 'FAILED')
        self.assertEqual(job_exe.error.name, 'docker-terminated')
Exemplo n.º 21
0
    def test_lost_node(self):
        """Tests calling lost_node() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2], [])

        task_1 = self.job_exe_1.start_next_task()
        task_1_started = now() - timedelta(minutes=5)
        update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.RUNNING, task_1_started)
        self.job_exe_mgr.handle_task_update(update)

        # Lose node and get lost task update
        self.job_exe_mgr.lost_node(self.node_model_1.id, now())
        update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.LOST, task_1_started)
        lost_job_exe = self.job_exe_mgr.handle_task_update(update)

        self.assertEqual(lost_job_exe.id, self.job_exe_1.id)
        self.assertEqual(lost_job_exe.status, 'FAILED')
        self.assertEqual(lost_job_exe.error.name, 'node-lost')

        # Make sure a create_job_exe_ends message and failed_jobs message exists for the lost job execution
        messages = self.job_exe_mgr.get_messages()
        self.assertEqual(len(messages), 2)
        job_exe_ends_msg = messages[0]
        self.assertEqual(job_exe_ends_msg.type, 'create_job_exe_ends')
        self.assertEqual(job_exe_ends_msg._job_exe_ends[0].job_exe_id, self.job_exe_1.id)
        failed_jobs_msg = messages[1]
        self.assertEqual(failed_jobs_msg.type, 'failed_jobs')
        self.assertTrue(get_builtin_error('node-lost').id in failed_jobs_msg._failed_jobs)
        self.assertEqual(failed_jobs_msg._failed_jobs.values()[0][0].job_id, self.job_exe_1.job_id)
Exemplo n.º 22
0
    def test_handle_task_update(self):
        """Tests calling handle_task_update() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2], [])

        # Start tasks
        task_1 = self.job_exe_1.start_next_task()
        task_1_started = now() - timedelta(minutes=5)
        update_1 = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.RUNNING,
                                                            task_1_started)
        task_2 = self.job_exe_2.start_next_task()
        # Shortcut job exe 2 so that there is only one task to complete
        self.job_exe_2._remaining_tasks = []
        task_2_started = now() - timedelta(minutes=5)
        update_2 = job_test_utils.create_task_status_update(task_2.id, 'agent', TaskStatusUpdate.RUNNING,
                                                            task_2_started)

        # Job execution is not finished, so None should be returned and no message is available
        result = self.job_exe_mgr.handle_task_update(update_1)
        self.assertIsNone(result)
        result = self.job_exe_mgr.handle_task_update(update_2)
        self.assertIsNone(result)
        self.assertListEqual(self.job_exe_mgr.get_messages(), [])

        # Fail task 1 for job exe 1
        task_1_failed = task_1_started + timedelta(seconds=1)
        update_1 = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.FAILED, task_1_failed,
                                                            exit_code=1)

        # Complete task 2 for job exe 2
        task_2_completed = task_2_started + timedelta(seconds=1)
        update_2 = job_test_utils.create_task_status_update(task_2.id, 'agent', TaskStatusUpdate.FINISHED,
                                                            task_2_completed)

        # Job executions are finished, so they should be returned and a create_job_exe_ends message, a failed_jobs
        # message, and a completed_jobs message is available
        result = self.job_exe_mgr.handle_task_update(update_1)
        self.assertEqual(self.job_exe_1.id, result.id)
        result = self.job_exe_mgr.handle_task_update(update_2)
        self.assertEqual(self.job_exe_2.id, result.id)

        messages = self.job_exe_mgr.get_messages()
        self.assertEqual(len(messages), 3)
        job_exe_ends_msg = messages[0]
        self.assertEqual(job_exe_ends_msg.type, 'create_job_exe_ends')
        self.assertEqual(job_exe_ends_msg._job_exe_ends[0].job_exe_id, self.job_exe_1.id)
        self.assertEqual(job_exe_ends_msg._job_exe_ends[1].job_exe_id, self.job_exe_2.id)
        completed_jobs_msg = messages[1]
        self.assertEqual(completed_jobs_msg.type, 'completed_jobs')
        self.assertEqual(completed_jobs_msg._completed_jobs[0].job_id, self.job_exe_2.job_id)
        failed_jobs_msg = messages[2]
        self.assertEqual(failed_jobs_msg.type, 'failed_jobs')
        self.assertEqual(failed_jobs_msg._failed_jobs.values()[0][0].job_id, self.job_exe_1.job_id)
Exemplo n.º 23
0
    def test_post_task_launch_error(self):
        """Tests running through a job execution where a post-task fails to launch"""

        # Clear error cache so test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id

        # Pre-task running
        pre_task_started = now()
        update = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.RUNNING,
                                                          pre_task_started)
        running_job_exe.task_update(update)

        # Complete pre-task
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.FINISHED,
                                                          pre_task_completed)
        running_job_exe.task_update(update)

        # Start job-task
        task = running_job_exe.start_next_task()
        job_task_id = task.id

        # Job-task running
        job_task_started = now()
        update = job_test_utils.create_task_status_update(job_task_id, 'agent', TaskStatusUpdate.RUNNING,
                                                          job_task_started)
        running_job_exe.task_update(update)

        # Complete job-task
        job_task_completed = job_task_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(job_task_id, 'agent', TaskStatusUpdate.FINISHED,
                                                          job_task_completed)
        running_job_exe.task_update(update)

        # Start post-task
        task = running_job_exe.start_next_task()
        post_task_id = task.id

        # Post-task fails to launch
        update = job_test_utils.create_task_status_update(post_task_id, 'agent', TaskStatusUpdate.FAILED, now())
        running_job_exe.task_update(update)

        # Check results
        job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
        self.assertEqual(job_exe.status, 'FAILED')
        self.assertEqual(job_exe.error.name, 'docker-task-launch')
Exemplo n.º 24
0
    def test_handle_lost_health_task(self):
        """Tests handling lost health task"""

        when = now()
        node = Node(self.node_agent, self.node, self.scheduler)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()
        # Get pull task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
        task_1_id = task.id
        self.assertIsNotNone(task)

        # Lose task without scheduling and get different task next time
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
        self.assertNotEqual(task.id, task_1_id)
        self.assertTrue(node._conditions.is_health_check_normal)

        # Lose task with scheduling and get different task next time
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
        self.assertNotEqual(task.id, task_1_id)
        self.assertTrue(node._conditions.is_health_check_normal)

        # Lose task after running and get different task next time
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
        self.assertNotEqual(task.id, task_1_id)
        self.assertTrue(node._conditions.is_health_check_normal)
Exemplo n.º 25
0
    def test_handle_lost_pull_task(self):
        """Tests handling lost pull task"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._last_heath_task = when
        node._initial_cleanup_completed()
        node._update_state()
        # Get pull task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
        task_1_id = task.id
        self.assertIsNotNone(task)

        # Lose task without scheduling and get same task again
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
        self.assertEqual(task.id, task_1_id)
        self.assertFalse(node._is_image_pulled)

        # Lose task with scheduling and get same task again
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
        self.assertEqual(task.id, task_1_id)
        self.assertFalse(node._is_image_pulled)

        # Lose task after running and get same task again
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(PULL_TASK_ID_PREFIX))
        self.assertEqual(task.id, task_1_id)
        self.assertFalse(node._is_image_pulled)
Exemplo n.º 26
0
    def test_need_reconciliation(self):
        """Tests calling Task.need_reconciliation()"""

        task_1 = ImplementedTask('task_1', 'Task 1', 'agent_id')
        task_2 = ImplementedTask('task_2', 'Task 2', 'agent_id')
        task_3 = ImplementedTask('task_3', 'Task 3', 'agent_id')
        task_4 = ImplementedTask('task_4', 'Task 4', 'agent_id')
        task_5 = ImplementedTask('task_5', 'Task 5', 'agent_id')

        task_1_and_2_launch_time = now()
        task_3_and_5_launch_time = task_1_and_2_launch_time + RUNNING_RECON_THRESHOLD
        check_time = task_3_and_5_launch_time + datetime.timedelta(seconds=1)
        check_time_2 = check_time + datetime.timedelta(seconds=1)

        # Task 1 and 2 launch
        task_1.launch(task_1_and_2_launch_time)
        task_2.launch(task_1_and_2_launch_time)

        # The reconciliation threshold has now expired
        # Task 3 and 5 launches and a task update comes for task 2
        task_3.launch(task_3_and_5_launch_time)
        task_5.launch(task_3_and_5_launch_time)
        update = job_test_utils.create_task_status_update(
            task_2.id, 'agent_id', TaskStatusUpdate.RUNNING,
            task_3_and_5_launch_time)
        task_2.update(update)

        # Task 5 gets force reconciliation call
        task_5.force_reconciliation()

        # A second later, we check for tasks needing reconciliation
        # Task 1 was launched a while ago (exceeding threshold) so it should be reconciled
        self.assertTrue(task_1.needs_reconciliation(check_time))
        # Task 2 received an update 1 second ago so it should not be reconciled
        self.assertFalse(task_2.needs_reconciliation(check_time))
        # Task 3 was launched 1 second ago so it should not be reconciled
        self.assertFalse(task_3.needs_reconciliation(check_time))
        # Task 4 did not even launch so it should not be reconciled
        self.assertFalse(task_4.needs_reconciliation(check_time))
        # Task 5 had force_reconciliation() called so it should be reconciled
        self.assertTrue(task_5.needs_reconciliation(check_time))

        # Task 5 gets task update to clear force recon
        update = job_test_utils.create_task_status_update(
            task_5.id, 'agent_id', TaskStatusUpdate.RUNNING, check_time)
        task_5.update(update)
        # Task 5 received an update so force recon should be cleared and it should be not reconciled
        self.assertFalse(task_5.needs_reconciliation(check_time_2))
Exemplo n.º 27
0
    def test_change_agent_id(self):
        """Tests the NodeManager where a node's agent ID changes"""

        manager = CleanupManager()
        node_1 = Node(self.node_agent_1, self.node_1)
        node_2 = Node(self.node_agent_2, self.node_2)
        manager.update_nodes([node_1, node_2])
        tasks = manager.get_next_tasks()

        task_1 = None
        for task in tasks:
            task.launch(now())
            if task.agent_id == self.node_agent_1:
                task_1 = task

        # Node 1 changes agent ID
        node_1.update_from_mesos(agent_id=self.node_agent_3)
        manager.update_nodes([node_1, node_2])

        # Should get new initial cleanup task for node 1
        tasks = manager.get_next_tasks()
        self.assertEqual(len(tasks), 1)
        new_task_1 = tasks[0]
        self.assertEqual(new_task_1.agent_id, self.node_agent_3)

        # Task update comes back for original node 1 initial cleanup task, manager should ignore with no exception
        update = job_test_utils.create_task_status_update(task_1.id, task_1.agent_id, TaskStatusUpdate.FAILED, now())
        manager.handle_task_update(update)
Exemplo n.º 28
0
    def test_timed_out_system_job_task(self):
        """Tests running through a job execution where a system job task times out"""

        ingest_job_type = Ingest.objects.get_ingest_job_type()
        ingest_job_type.max_tries = 1
        ingest_job_type.save()
        job = job_test_utils.create_job(job_type=ingest_job_type, num_exes=1)
        job_exe = job_test_utils.create_job_exe(job=job)
        running_job_exe = RunningJobExecution(job_exe)

        # Start job-task and then task times out
        when_launched = now() + timedelta(seconds=1)
        job_task_started = when_launched + timedelta(seconds=1)
        when_timed_out = job_task_started + timedelta(seconds=1)
        job_task = running_job_exe.start_next_task()
        self.task_mgr.launch_tasks([job_task], when_launched)
        update = job_test_utils.create_task_status_update(
            job_task.id, 'agent', TaskStatusUpdate.RUNNING, job_task_started)
        self.task_mgr.handle_task_update(update)
        running_job_exe.task_update(update)
        running_job_exe.execution_timed_out(job_task, when_timed_out)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        job_exe = JobExecution.objects.get(id=job_exe.id)
        self.assertEqual('FAILED', job_exe.status)
        self.assertEqual('ingest-timeout', job_exe.error.name)
        self.assertEqual(when_timed_out, job_exe.ended)
Exemplo n.º 29
0
    def test_job_exe_clean_task(self):
        """Tests the NodeManager where a cleanup task is returned to clean up a job execution"""

        when = now()
        node_mgr = NodeManager()
        node_mgr.register_agents([self.agent_1, self.agent_2])
        node_mgr.sync_with_database(scheduler_mgr.config)
        cleanup_mgr = CleanupManager()
        cleanup_mgr.update_nodes(node_mgr.get_nodes())
        tasks = node_mgr.get_next_tasks(when)

        task_mgr = TaskManager()
        # Complete initial cleanup tasks
        for task in tasks:
            task_mgr.launch_tasks([task], now())
            update = job_test_utils.create_task_status_update(
                task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
            task_mgr.handle_task_update(update)
            node_mgr.handle_task_update(update)

        # Mark image pull done to get rid of image tasks
        for node in node_mgr.get_nodes():
            node._image_pull_completed()
            node._update_state()

        job_exe = job_test_utils.create_running_job_exe(agent_id=self.agent_1,
                                                        node=self.node_1)
        # Add a job execution to clean up and get the cleanup task for it
        cleanup_mgr.add_job_execution(job_exe)
        tasks = node_mgr.get_next_tasks(when)
        self.assertEqual(len(tasks), 1)
        task = tasks[0]
        self.assertEqual(task.agent_id, self.agent_1.agent_id)
        self.assertFalse(task.is_initial_cleanup)
        self.assertEqual(len(task.job_exes), 1)
Exemplo n.º 30
0
    def test_get_tasks_to_kill(self):
        """Tests calling get_tasks_to_kill() successfully"""

        # Start with 5 tasks
        scheduler_mgr.config.num_message_handlers = 5
        service = MessagingService()
        tasks = service.get_tasks_to_schedule()
        task_mgr.launch_tasks(tasks, now())

        # Lower number of desired tasks to 3, should get 2 to kill
        scheduler_mgr.config.num_message_handlers = 3
        tasks_to_kill = service.get_tasks_to_kill()
        self.assertEqual(len(tasks_to_kill), 2)

        # Kill the 2 tasks
        for task in tasks_to_kill:
            update = job_test_utils.create_task_status_update(
                task.id, task.agent_id, TaskStatusUpdate.KILLED, now())
            task_mgr.handle_task_update(update)
            service.handle_task_update(update)
        self.assertEqual(service.get_actual_task_count(), 3)

        # Increase desired tasks to 10, should not get any to kill
        scheduler_mgr.config.num_message_handlers = 10
        tasks_to_kill = service.get_tasks_to_kill()
        self.assertEqual(len(tasks_to_kill), 0)
Exemplo n.º 31
0
    def test_general_algorithm_error(self):
        """Tests running through a job execution where the job-task has a general algorithm error (non-zero exit code)
        """

        # Clear error cache so test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id

        # Pre-task running
        pre_task_started = now()
        update = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.RUNNING,
                                                          pre_task_started)
        running_job_exe.task_update(update)

        # Complete pre-task
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.FINISHED,
                                                          pre_task_completed)
        running_job_exe.task_update(update)

        # Start job-task
        task = running_job_exe.start_next_task()
        job_task_id = task.id

        # Job-task running
        job_task_started = now()
        update = job_test_utils.create_task_status_update(job_task_id, 'agent', TaskStatusUpdate.RUNNING,
                                                          job_task_started)
        running_job_exe.task_update(update)

        # Fail job-task
        job_task_failed = job_task_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(job_task_id, 'agent', TaskStatusUpdate.FAILED,
                                                          job_task_failed, exit_code=1)
        running_job_exe.task_update(update)

        # Check results
        job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
        self.assertEqual(job_exe.status, 'FAILED')
        self.assertEqual(job_exe.error.name, 'algorithm-unknown')
Exemplo n.º 32
0
    def test_handle_lost_cleanup_tasks(self):
        """Tests handling lost cleanup tasks"""

        when = now()
        node = Node(self.node_agent, self.node, self.scheduler)
        # Get initial cleanup task
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        task_1_id = task.id

        # Lose task without scheduling and get different task next time
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertNotEqual(task.id, task_1_id)
        self.assertFalse(node._is_initial_cleanup_completed)

        # Lose task with scheduling and get different task next time
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertNotEqual(task.id, task_1_id)
        self.assertFalse(node._is_initial_cleanup_completed)

        # Lose task after running and get different task next time
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        task = node.get_next_tasks(when)[0]
        self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertNotEqual(task.id, task_1_id)
        self.assertFalse(node._is_initial_cleanup_completed)
Exemplo n.º 33
0
    def test_sync_with_database(self):
        """Tests calling sync_with_database() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2],
                                           [])

        task_1 = self.job_exe_1.start_next_task()
        task_1_started = now() - timedelta(minutes=5)
        update = job_test_utils.create_task_status_update(
            task_1.id, 'agent', TaskStatusUpdate.RUNNING, task_1_started)
        self.job_exe_mgr.handle_task_update(update)

        # Cancel job_exe_1 and job_exe_2 and have manager sync with database
        Job.objects.update_jobs_to_canceled_old(
            [self.job_exe_1.job_id, self.job_exe_2.job_id], now())
        finished_job_exes = self.job_exe_mgr.sync_with_database()

        self.assertEqual(self.job_exe_1.status, 'CANCELED')
        self.assertFalse(self.job_exe_1.is_finished())
        self.assertEqual(self.job_exe_2.status, 'CANCELED')
        self.assertTrue(self.job_exe_2.is_finished())

        # Only job_exe_2 is finished, job_exe_1 has a task to kill
        self.assertEqual(len(finished_job_exes), 1)
        self.assertEqual(finished_job_exes[0].id, self.job_exe_2.id)
        # Make sure a create_job_exe_ends message exists for job_exe_2
        message = self.job_exe_mgr.get_messages()[0]
        self.assertEqual(message.type, 'create_job_exe_ends')
        self.assertEqual(message._job_exe_ends[0].job_exe_id,
                         self.job_exe_2.id)

        # Task killed for job_exe_1
        task_1_killed = task_1_started + timedelta(minutes=5)
        update = job_test_utils.create_task_status_update(
            task_1.id, 'agent', TaskStatusUpdate.KILLED, task_1_killed)
        self.job_exe_mgr.handle_task_update(update)

        # Make sure a create_job_exe_ends message exists for job_exe_1
        self.assertTrue(self.job_exe_1.is_finished())
        message = self.job_exe_mgr.get_messages()[0]
        self.assertEqual(message.type, 'create_job_exe_ends')
        self.assertEqual(message._job_exe_ends[0].job_exe_id,
                         self.job_exe_1.id)
Exemplo n.º 34
0
    def test_handle_killed_task(self):
        """Tests handling killed cleanup task"""

        node = Node(self.node_agent, self.node)
        node_cleanup = NodeCleanup(node)
        # Get initial cleanup task
        task = node_cleanup.get_next_task()
        task_1_id = task.id
        self.assertIsNotNone(task)

        # Kill task after running and get different task next time
        task.launch(now())
        update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        node_cleanup.handle_task_update(update)
        update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.KILLED, now())
        node_cleanup.handle_task_update(update)
        task = node_cleanup.get_next_task()
        self.assertIsNotNone(task)
        self.assertNotEqual(task.id, task_1_id)
        self.assertFalse(node.is_initial_cleanup_completed)
Exemplo n.º 35
0
    def test_timed_out_system_job_task(self):
        """Tests running through a job execution where a system job task times out"""

        ingest_job_type = Ingest.objects.get_ingest_job_type()
        ingest_job_type.max_tries = 1
        ingest_job_type.save()
        running_job_exe = job_test_utils.create_running_job_exe(
            agent_id='agent_1', job_type=ingest_job_type, num_exes=1)

        # Start job-task and then task times out
        when_launched = now() + timedelta(seconds=1)
        job_task_started = when_launched + timedelta(seconds=1)
        when_timed_out = job_task_started + timedelta(seconds=1)
        job_task = running_job_exe.start_next_task()
        self.task_mgr.launch_tasks([job_task], when_launched)
        update = job_test_utils.create_task_status_update(
            job_task.id, 'agent', TaskStatusUpdate.RUNNING, job_task_started)
        self.task_mgr.handle_task_update(update)
        running_job_exe.task_update(update)
        running_job_exe.execution_timed_out(job_task, when_timed_out)

        self.assertFalse(running_job_exe.is_finished()
                         )  # Not finished until killed task update arrives
        self.assertEqual(running_job_exe.status, 'FAILED')
        self.assertEqual(running_job_exe.error_category, 'SYSTEM')
        self.assertEqual(running_job_exe.error.name, 'ingest-timeout')
        self.assertEqual(running_job_exe.finished, when_timed_out)
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Killed task update arrives, job execution is now finished
        job_task_kill = when_timed_out + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(
            job_task.id, 'agent', TaskStatusUpdate.KILLED, job_task_kill)
        self.task_mgr.handle_task_update(update)
        running_job_exe.task_update(update)
        self.assertTrue(running_job_exe.is_finished())
        self.assertEqual(running_job_exe.status, 'FAILED')
        self.assertEqual(running_job_exe.error_category, 'SYSTEM')
        self.assertEqual(running_job_exe.error.name, 'ingest-timeout')
        self.assertEqual(running_job_exe.finished, when_timed_out)
        self.assertFalse(running_job_exe.is_next_task_ready())
Exemplo n.º 36
0
    def test_handle_failed_health_task(self):
        """Tests handling failed health task"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()
        # Get health task
        task = node.get_next_tasks(when)[0]
        task_1_id = task.id
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))

        # Fail task after running
        self.task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)
        update = job_test_utils.create_task_status_update(
            task.id, task.agent_id, TaskStatusUpdate.FAILED, now())
        self.task_mgr.handle_task_update(update)
        node.handle_task_update(update)

        # Check node state
        self.assertEqual(node._state, Node.DEGRADED)
        self.assertTrue(NodeConditions.HEALTH_FAIL_ERR.name in
                        node._conditions._active_errors)

        # No new health task right away
        tasks = node.get_next_tasks(when + datetime.timedelta(seconds=5))
        self.assertListEqual([], tasks)
        self.assertFalse(node._conditions.is_health_check_normal)

        # After error threshold, we should get new health task
        new_time = when + Node.HEALTH_ERR_THRESHOLD + datetime.timedelta(
            seconds=5)
        task = node.get_next_tasks(new_time)[0]
        self.assertNotEqual(task.id, task_1_id)
        self.assertTrue(task.id.startswith(HEALTH_TASK_ID_PREFIX))
Exemplo n.º 37
0
    def test_canceled_job_execution(self):
        """Tests running through a job execution that gets canceled"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)

        # Start, run, and complete pre-task
        task = running_job_exe.start_next_task()
        pre_task_started = now()
        update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.RUNNING, pre_task_started)
        running_job_exe.task_update(update)
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.FINISHED,
                                                          pre_task_completed)
        running_job_exe.task_update(update)

        # Start job-task and then execution gets canceled
        job_task = running_job_exe.start_next_task()
        canceled_task = running_job_exe.execution_canceled()
        self.assertEqual(job_task.id, canceled_task.id)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())
Exemplo n.º 38
0
    def test_parsing_container_name(self):
        """Tests that a task successfully parses container name from a RUNNING task update"""

        task_id = 'task_1'
        task_name = 'My Task'
        agent_id = 'agent_1'
        container_name = 'container_1234'
        data = {'Config': {'Env': ['DUMMY_ENV=DUMMY', 'MESOS_CONTAINER_NAME=' + container_name]}}

        task = ImplementedTask(task_id, task_name, agent_id)
        update = job_test_utils.create_task_status_update(task_id, agent_id, TaskStatusUpdate.RUNNING, now(), data=data)
        task.update(update)

        self.assertEqual(task.container_name, container_name)
Exemplo n.º 39
0
    def test_determine_error(self):
        """Tests that a pre-task successfully determines the correct error"""

        scale_errors = [
            ScaleDatabaseError(),
            ScaleIOError(),
            ScaleOperationalError(),
            MissingSetting('')
        ]

        for scale_error in scale_errors:
            task = PreTask(self.job_exe)
            update = job_test_utils.create_task_status_update(
                task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
            task.update(update)
            update = job_test_utils.create_task_status_update(
                task.id,
                task.agent_id,
                TaskStatusUpdate.FAILED,
                now(),
                exit_code=scale_error.exit_code)
            error = task.determine_error(update)
            self.assertEqual(scale_error.error_name, error.name)
Exemplo n.º 40
0
    def test_handle_lost_tasks(self):
        """Tests handling lost cleanup tasks"""

        node = Node(self.node_agent, self.node)
        node_cleanup = NodeCleanup(node)
        # Get initial cleanup task
        task = node_cleanup.get_next_task()
        task_1_id = task.id
        self.assertIsNotNone(task)

        # Lose task without scheduling and get same task again
        update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        node_cleanup.handle_task_update(update)
        task = node_cleanup.get_next_task()
        self.assertIsNotNone(task)
        self.assertEqual(task.id, task_1_id)
        self.assertFalse(node.is_initial_cleanup_completed)

        # Lose task with scheduling and get same task again
        task.launch(now())
        update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        node_cleanup.handle_task_update(update)
        task = node_cleanup.get_next_task()
        self.assertIsNotNone(task)
        self.assertEqual(task.id, task_1_id)
        self.assertFalse(node.is_initial_cleanup_completed)

        # Lose task after running and get same task again
        task.launch(now())
        update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        node_cleanup.handle_task_update(update)
        update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now())
        node_cleanup.handle_task_update(update)
        task = node_cleanup.get_next_task()
        self.assertIsNotNone(task)
        self.assertEqual(task.id, task_1_id)
        self.assertFalse(node.is_initial_cleanup_completed)
Exemplo n.º 41
0
    def test_failed_normal_job_execution(self):
        """Tests running through a normal job execution that fails"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Pre-task running
        pre_task_started = now() - timedelta(minutes=5)  # Lots of time so now() called at completion is in future
        update = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.RUNNING,
                                                          pre_task_started)
        running_job_exe.task_update(update)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Fail pre-task
        pre_task_failed = pre_task_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.FAILED,
                                                          pre_task_failed, exit_code=1)
        running_job_exe.task_update(update)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        job_exe = JobExecution.objects.get(id=self._job_exe_id)
        self.assertEqual(pre_task_started, job_exe.pre_started)
        self.assertEqual(pre_task_failed, job_exe.pre_completed)
        self.assertEqual(1, job_exe.pre_exit_code)
        self.assertEqual('FAILED', job_exe.status)
        self.assertIsNotNone(job_exe.error_id)
        self.assertGreater(job_exe.ended, pre_task_failed)
Exemplo n.º 42
0
    def test_handle_initial_cleanup_task(self):
        """Tests handling the initial cleanup task"""

        node = Node(self.node_agent, self.node)
        node_cleanup = NodeCleanup(node)

        # Get initial cleanup task
        task = node_cleanup.get_next_task()
        self.assertIsNotNone(task)
        self.assertTrue(task.is_initial_cleanup)
        self.assertEqual(task.agent_id, self.node_agent)

        # Schedule initial cleanup and make sure no new task is ready
        task.launch(now())
        self.assertIsNone(node_cleanup.get_next_task())
        self.assertFalse(node.is_initial_cleanup_completed)

        # Complete initial clean up, verify no new task
        update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
        node_cleanup.handle_task_update(update)
        update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
        node_cleanup.handle_task_update(update)
        self.assertIsNone(node_cleanup.get_next_task())
        self.assertTrue(node.is_initial_cleanup_completed)
Exemplo n.º 43
0
    def test_no_job_exes_to_clean(self):
        """Tests the NodeManager where no cleanup tasks are returned due to no job executions to clean"""

        manager = CleanupManager()
        node_1 = Node(self.node_agent_1, self.node_1)
        node_2 = Node(self.node_agent_2, self.node_2)
        manager.update_nodes([node_1, node_2])
        tasks = manager.get_next_tasks()

        # Complete initial cleanup tasks
        for task in tasks:
            task.launch(now())
            update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
            manager.handle_task_update(update)

        tasks = manager.get_next_tasks()
        self.assertListEqual(tasks, [])  # No tasks since there are no job executions to clean up
Exemplo n.º 44
0
    def test_job_exe_clean_task(self):
        """Tests the NodeManager where a cleanup task is returned to clean up a job execution"""

        manager = CleanupManager()
        node_1 = Node(self.node_agent_1, self.node_1)
        node_2 = Node(self.node_agent_2, self.node_2)
        manager.update_nodes([node_1, node_2])
        tasks = manager.get_next_tasks()

        # Complete initial cleanup tasks
        for task in tasks:
            task.launch(now())
            update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
            manager.handle_task_update(update)

        # Add a job execution to clean up and get the cleanup task for it
        manager.add_job_execution(RunningJobExecution(self.job_exe_1))
        tasks = manager.get_next_tasks()
        self.assertEqual(len(tasks), 1)
        task = tasks[0]
        self.assertEqual(task.agent_id, self.node_agent_1)
        self.assertFalse(task.is_initial_cleanup)
        self.assertEqual(len(task.job_exes), 1)
Exemplo n.º 45
0
    def test_successful_normal_job_execution(self):
        """Tests running through a normal job execution successfully"""

        job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
        running_job_exe = RunningJobExecution(job_exe)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start pre-task
        task = running_job_exe.start_next_task()
        pre_task_id = task.id
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Pre-task running
        pre_task_started = now() - timedelta(minutes=5)  # Lots of time so now() called at completion is in future
        update = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.RUNNING,
                                                          pre_task_started)
        running_job_exe.task_update(update)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Pre-task sets updated command arguments
        updated_commands_args = '-arg updated'
        JobExecution.objects.filter(id=self._job_exe_id).update(command_arguments=updated_commands_args)

        # Complete pre-task
        pre_task_completed = pre_task_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.FINISHED,
                                                          pre_task_completed, exit_code=1)
        running_job_exe.task_update(update)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start job-task
        task = running_job_exe.start_next_task()
        job_task_id = task.id
        self.assertEqual(task._command_arguments, updated_commands_args)  # Make sure job task has updated command args
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Job-task running
        job_task_started = pre_task_completed + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(job_task_id, 'agent', TaskStatusUpdate.RUNNING,
                                                          job_task_started)
        running_job_exe.task_update(update)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Complete job-task
        job_task_completed = job_task_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(job_task_id, 'agent', TaskStatusUpdate.FINISHED,
                                                          job_task_completed, exit_code=2)
        running_job_exe.task_update(update)
        self.assertFalse(running_job_exe.is_finished())
        self.assertTrue(running_job_exe.is_next_task_ready())

        # Start post-task
        task = running_job_exe.start_next_task()
        post_task_id = task.id
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Post-task running
        post_task_started = job_task_completed + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(post_task_id, 'agent', TaskStatusUpdate.RUNNING,
                                                          post_task_started)
        running_job_exe.task_update(update)
        self.assertFalse(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        # Complete post-task
        post_task_completed = post_task_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(post_task_id, 'agent', TaskStatusUpdate.FINISHED,
                                                          post_task_completed, exit_code=3)
        running_job_exe.task_update(update)
        self.assertTrue(running_job_exe.is_finished())
        self.assertFalse(running_job_exe.is_next_task_ready())

        job_exe = JobExecution.objects.get(id=self._job_exe_id)
        self.assertEqual(pre_task_started, job_exe.pre_started)
        self.assertEqual(pre_task_completed, job_exe.pre_completed)
        self.assertEqual(1, job_exe.pre_exit_code)
        self.assertEqual(job_task_started, job_exe.job_started)
        self.assertEqual(job_task_completed, job_exe.job_completed)
        self.assertEqual(2, job_exe.job_exit_code)
        self.assertEqual(post_task_started, job_exe.post_started)
        self.assertEqual(post_task_completed, job_exe.post_completed)
        self.assertEqual(3, job_exe.post_exit_code)
        self.assertEqual('COMPLETED', job_exe.status)
        self.assertGreater(job_exe.ended, post_task_completed)