def test_lost_job_execution(self):
    """Tests running through a job execution that gets lost"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    # Start, run, and complete pre-task
    task = running_job_exe.start_next_task()
    pre_task_id = task.id
    pre_task_started = now()
    running_job_exe.task_start(pre_task_id, pre_task_started)
    pre_task_completed = pre_task_started + timedelta(seconds=1)
    pre_task_results = TaskResults(pre_task_id)
    pre_task_results.exit_code = 0  # exit code 0 indicates pre-task success
    pre_task_results.when = pre_task_completed
    running_job_exe.task_complete(pre_task_results)

    # Start job-task and then execution gets lost
    when_lost = pre_task_completed + timedelta(seconds=1)
    job_task = running_job_exe.start_next_task()
    lost_task = running_job_exe.execution_lost(when_lost)

    # The in-flight job-task is the one reported lost, and the execution is
    # finished with the builtin node-lost error at the lost timestamp
    self.assertEqual(job_task.id, lost_task.id)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())
    job_exe = JobExecution.objects.get(id=self._job_exe_id)
    self.assertEqual('FAILED', job_exe.status)
    self.assertEqual(Error.objects.get_builtin_error('node-lost').id, job_exe.error_id)
    self.assertEqual(when_lost, job_exe.ended)
def test_pre_task_launch_error(self):
    """Tests running through a job execution where a pre-task fails to launch"""

    # Clear error cache so test works correctly
    CACHED_BUILTIN_ERRORS.clear()

    exe_model = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_exe = RunningJobExecution(exe_model)

    # Launch the pre-task
    pre_task_id = running_exe.start_next_task().id

    # Fail the task at launch time with a non-zero exit code
    launch_failure = TaskResults(pre_task_id)
    launch_failure.exit_code = 1
    launch_failure.when = now()
    running_exe.task_fail(launch_failure)

    # The execution should be failed with the Docker task launch error
    exe_model = JobExecution.objects.select_related().get(id=self._job_exe_id)
    self.assertEqual(exe_model.status, 'FAILED')
    self.assertEqual(exe_model.error.name, 'docker-task-launch')
def test_lost_job_execution(self):
    """Tests running through a job execution that gets lost"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    # Start, run, and complete pre-task via status updates
    task = running_job_exe.start_next_task()
    pre_task_started = now()
    update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.RUNNING, pre_task_started)
    running_job_exe.task_update(update)
    pre_task_completed = pre_task_started + timedelta(seconds=1)
    update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.FINISHED, pre_task_completed)
    running_job_exe.task_update(update)

    # Start job-task and then execution gets lost
    when_lost = pre_task_completed + timedelta(seconds=1)
    job_task = running_job_exe.start_next_task()
    lost_task = running_job_exe.execution_lost(when_lost)

    # The in-flight job-task is the one reported lost, and the execution is
    # finished with the builtin node-lost error at the lost timestamp
    self.assertEqual(job_task.id, lost_task.id)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())
    job_exe = JobExecution.objects.get(id=self._job_exe_id)
    self.assertEqual('FAILED', job_exe.status)
    self.assertEqual(Error.objects.get_builtin_error('node-lost').id, job_exe.error_id)
    self.assertEqual(when_lost, job_exe.ended)
def test_consider_next_task(self):
    """Tests consider_next_task() and get_accepted_running_job_exes()"""

    node_offers = NodeOffers(self.node)
    offer_1 = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
    node_offers.add_offer(offer_1)
    offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
    node_offers.add_offer(offer_2)

    # Nothing has been accepted yet
    self.assertFalse(node_offers.has_accepted_job_exes())
    self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
    self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

    job_exe_1 = RunningJobExecution(self.running_job_exe_1)
    result = node_offers.consider_next_task(job_exe_1)
    self.assertEqual(result, NodeOffers.ACCEPTED)
    result = node_offers.consider_next_task(job_exe_1)  # Same job_exe, should have no effect
    self.assertEqual(result, NodeOffers.ACCEPTED)

    # Executions whose next task exceeds the combined offers are rejected,
    # with a distinct result per exhausted resource type
    job_exe_high_cpus = RunningJobExecution(self.running_job_exe_high_cpus)
    result = node_offers.consider_next_task(job_exe_high_cpus)
    self.assertEqual(result, NodeOffers.NOT_ENOUGH_CPUS)

    job_exe_high_mem = RunningJobExecution(self.running_job_exe_high_mem)
    result = node_offers.consider_next_task(job_exe_high_mem)
    self.assertEqual(result, NodeOffers.NOT_ENOUGH_MEM)

    job_exe_high_disk = RunningJobExecution(self.running_job_exe_high_disk)
    result = node_offers.consider_next_task(job_exe_high_disk)
    self.assertEqual(result, NodeOffers.NOT_ENOUGH_DISK)

    job_exe_2 = RunningJobExecution(self.running_job_exe_2)
    result = node_offers.consider_next_task(job_exe_2)
    self.assertEqual(result, NodeOffers.ACCEPTED)

    self.assertTrue(node_offers.has_accepted_job_exes())
    self.assertEqual(len(node_offers.get_accepted_running_job_exes()), 2)
    self.assertSetEqual(set(node_offers.get_accepted_running_job_exes()), {job_exe_1, job_exe_2})
    self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

    # Remaining resources after the two accepted tasks.
    # NOTE(review): expected remainders imply the accepted tasks consume 6.0 CPUs,
    # 1536.0 MiB memory and 850.0 MiB disk from the 74.0/3072.0/3072.0 offer totals
    # -- confirm against the running_job_exe_1/2 fixtures
    self.assertEqual(node_offers._available_cpus, 68.0)
    self.assertEqual(node_offers._available_mem, 1536.0)
    self.assertEqual(node_offers._available_disk, 2222.0)
def test_lost_node(self):
    """Tests accepting a running and queued job execution and then the node being lost"""

    offer_1 = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
    offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))
    manager = OfferManager()
    manager.add_new_offers([offer_1, offer_2])
    manager.update_nodes([self.node, self.paused_node])
    manager.ready_new_offers()

    # Accept one new (queued) and one running execution on the node
    job_exe_1 = QueuedJobExecution(self.queue_1)
    result = manager.consider_new_job_exe(job_exe_1)
    self.assertEqual(result, OfferManager.ACCEPTED)
    job_exe_2 = RunningJobExecution(self.running_job_exe_2)
    result = manager.consider_next_task(job_exe_2)
    self.assertEqual(result, OfferManager.ACCEPTED)

    # Losing the node's agent should discard its offers and accepted executions
    manager.lost_node(self.node_agent)
    node_offers = manager.pop_offers_with_accepted_job_exes()
    self.assertEqual(len(node_offers), 0)
def test_job_type_limit(self, mock_taskinfo):
    """Tests running the scheduling thread with a job type limit"""
    mock_taskinfo.return_value = MagicMock()

    # Six queued executions of a job type that allows at most 4 scheduled at once
    Queue.objects.all().delete()
    job_type_with_limit = job_test_utils.create_job_type()
    job_type_with_limit.max_scheduled = 4
    job_type_with_limit.save()
    job_exe_1 = job_test_utils.create_job_exe(job_type=job_type_with_limit, status='RUNNING')
    queue_1_limit = queue_test_utils.create_queue(job_type=job_type_with_limit)
    queue_2_limit = queue_test_utils.create_queue(job_type=job_type_with_limit)
    queue_3_limit = queue_test_utils.create_queue(job_type=job_type_with_limit)
    queue_4_limit = queue_test_utils.create_queue(job_type=job_type_with_limit)
    queue_5_limit = queue_test_utils.create_queue(job_type=job_type_with_limit)
    queue_6_limit = queue_test_utils.create_queue(job_type=job_type_with_limit)
    self._job_type_manager.sync_with_database()
    # One job of this type is already running
    self._job_exe_manager.add_job_exes([RunningJobExecution(job_exe_1)])

    # Offers have far more resources than needed, so only the limit constrains scheduling
    offer_1 = ResourceOffer('offer_1', self.node_agent_1, NodeResources(cpus=200.0, mem=102400.0, disk=102400.0))
    offer_2 = ResourceOffer('offer_2', self.node_agent_2, NodeResources(cpus=200.0, mem=204800.0, disk=204800.0))
    self._offer_manager.add_new_offers([offer_1, offer_2])

    num_tasks = self._scheduling_thread._perform_scheduling()
    self.assertEqual(num_tasks, 3)  # One is already running, should only be able to schedule 3 more
def test_job_exe_canceled(self):
    """Tests adding a job execution that becomes canceled while scheduling"""

    offers = NodeOffers(self.node)
    offers.add_offer(ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0)))
    offers.add_offer(ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0)))

    # Nothing accepted yet
    self.assertFalse(offers.has_accepted_job_exes())
    self.assertListEqual(offers.get_accepted_running_job_exes(), [])
    self.assertListEqual(offers.get_accepted_new_job_exes(), [])

    # Cancel the execution before offering its next task
    canceled_exe = RunningJobExecution(self.running_job_exe_1)
    canceled_exe.execution_canceled()

    # A canceled execution has no valid next task, so it must be rejected
    verdict = offers.consider_next_task(canceled_exe)
    self.assertEqual(verdict, NodeOffers.TASK_INVALID)
    self.assertFalse(offers.has_accepted_job_exes())
    self.assertListEqual(offers.get_accepted_running_job_exes(), [])
    self.assertListEqual(offers.get_accepted_new_job_exes(), [])
def test_pre_task_launch_error(self):
    """Tests running through a job execution where a pre-task fails to launch"""

    # Clear error cache so test works correctly
    CACHED_BUILTIN_ERRORS.clear()

    exe_model = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_exe = RunningJobExecution(exe_model)

    # Launch the pre-task, then report it FAILED before it ever runs
    pre_task = running_exe.start_next_task()
    failed_update = job_test_utils.create_task_status_update(pre_task.id, 'agent', TaskStatusUpdate.FAILED, now())
    running_exe.task_update(failed_update)

    # The execution should be failed with the Docker task launch error
    exe_model = JobExecution.objects.select_related().get(id=self._job_exe_id)
    self.assertEqual(exe_model.status, 'FAILED')
    self.assertEqual(exe_model.error.name, 'docker-task-launch')
def test_job_task_launch_error(self): """Tests running through a job execution where a Docker-based job-task fails to launch""" # Clear error cache so test works correctly CACHED_BUILTIN_ERRORS.clear() job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type( self._job_exe_id) running_job_exe = RunningJobExecution(job_exe) # Start pre-task task = running_job_exe.start_next_task() pre_task_id = task.id # Pre-task running pre_task_started = now() running_job_exe.task_running(pre_task_id, pre_task_started, '', '') # Complete pre-task pre_task_completed = pre_task_started + timedelta(seconds=1) pre_task_results = TaskResults(pre_task_id) pre_task_results.exit_code = 0 pre_task_results.when = pre_task_completed running_job_exe.task_complete(pre_task_results) # Start job-task task = running_job_exe.start_next_task() job_task_id = task.id # Job-task fails to launch job_task_results = TaskResults(job_task_id) job_task_results.exit_code = 1 job_task_results.when = now() running_job_exe.task_fail(job_task_results) # Check results job_exe = JobExecution.objects.select_related().get( id=self._job_exe_id) self.assertEqual(job_exe.status, 'FAILED') self.assertEqual(job_exe.error.name, 'docker-task-launch')
def test_docker_terminated_error(self): """Tests running through a job execution where a Docker container terminates""" # Clear error cache so test works correctly CACHED_BUILTIN_ERRORS.clear() job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id) running_job_exe = RunningJobExecution(job_exe) # Start pre-task task = running_job_exe.start_next_task() pre_task_id = task.id # Pre-task running pre_task_started = now() update = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.RUNNING, pre_task_started) running_job_exe.task_update(update) # Pre-task Docker container terminates update = job_test_utils.create_task_status_update(pre_task_id, 'agent', TaskStatusUpdate.FAILED, now(), reason='REASON_EXECUTOR_TERMINATED') running_job_exe.task_update(update) # Check results job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id) self.assertEqual(job_exe.status, 'FAILED') self.assertEqual(job_exe.error.name, 'docker-terminated')
def test_paused_node(self):
    """Tests adding job executions when the node is paused"""

    node_offers = NodeOffers(self.paused_node)
    offer_1 = ResourceOffer('offer_1', self.node_agent_paused, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
    node_offers.add_offer(offer_1)
    offer_2 = ResourceOffer('offer_2', self.node_agent_paused, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
    node_offers.add_offer(offer_2)
    self.assertFalse(node_offers.has_accepted_job_exes())
    self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
    self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

    # Ensure it accepts new tasks for already running job executions
    job_exe_1 = RunningJobExecution(self.running_job_exe_1)
    result = node_offers.consider_next_task(job_exe_1)
    self.assertEqual(result, NodeOffers.ACCEPTED)
    job_exe_2 = RunningJobExecution(self.running_job_exe_2)
    result = node_offers.consider_next_task(job_exe_2)
    self.assertEqual(result, NodeOffers.ACCEPTED)

    # Don't accept new job executions while paused
    job_exe_new = QueuedJobExecution(self.queue_1)
    result = node_offers.consider_new_job_exe(job_exe_new)
    self.assertEqual(result, NodeOffers.NODE_PAUSED)

    self.assertTrue(node_offers.has_accepted_job_exes())
    self.assertEqual(len(node_offers.get_accepted_running_job_exes()), 2)
    self.assertSetEqual(set(node_offers.get_accepted_running_job_exes()), {job_exe_1, job_exe_2})
    self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

    # Remaining resources after the two accepted running tasks.
    # NOTE(review): expected remainders match test_consider_next_task -- confirm
    # against the running_job_exe_1/2 fixtures
    self.assertEqual(node_offers._available_cpus, 68.0)
    self.assertEqual(node_offers._available_mem, 1536.0)
    self.assertEqual(node_offers._available_disk, 2222.0)
def test_pre_task_launch_error(self):
    """Tests running through a job execution where a pre-task fails to launch"""

    # Clear error cache so test works correctly
    CACHED_BUILTIN_ERRORS.clear()

    exe_model = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_exe = RunningJobExecution(exe_model)

    # Launch the pre-task
    pre_task_id = running_exe.start_next_task().id

    # Fail the task with no exit code set, i.e. it never actually launched
    launch_failure = TaskResults(pre_task_id)
    launch_failure.when = now()
    running_exe.task_fail(launch_failure)

    # The execution should be failed with the task launch error
    exe_model = JobExecution.objects.select_related().get(id=self._job_exe_id)
    self.assertEqual(exe_model.status, 'FAILED')
    self.assertEqual(exe_model.error.name, 'task-launch')
def test_canceled_job_execution(self):
    """Tests running through a job execution that gets canceled"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    # Start, run, and complete pre-task via status updates
    task = running_job_exe.start_next_task()
    pre_task_started = now()
    update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.RUNNING, pre_task_started)
    running_job_exe.task_update(update)
    pre_task_completed = pre_task_started + timedelta(seconds=1)
    update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.FINISHED, pre_task_completed)
    running_job_exe.task_update(update)

    # Start job-task and then execution gets canceled
    job_task = running_job_exe.start_next_task()
    canceled_task = running_job_exe.execution_canceled()

    # Canceling returns the in-flight job-task and finishes the execution
    self.assertEqual(job_task.id, canceled_task.id)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())
def test_canceled_job_execution(self):
    """Tests running through a job execution that gets canceled"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    # Start, run, and complete pre-task
    task = running_job_exe.start_next_task()
    pre_task_id = task.id
    pre_task_started = now()
    running_job_exe.task_start(pre_task_id, pre_task_started)
    pre_task_completed = pre_task_started + timedelta(seconds=1)
    pre_task_results = TaskResults(pre_task_id)
    pre_task_results.exit_code = 0  # exit code 0 indicates pre-task success
    pre_task_results.when = pre_task_completed
    running_job_exe.task_complete(pre_task_results)

    # Start job-task and then execution gets canceled
    job_task = running_job_exe.start_next_task()
    canceled_task = running_job_exe.execution_canceled()

    # Canceling returns the in-flight job-task and finishes the execution
    self.assertEqual(job_task.id, canceled_task.id)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())
def test_job_exe_canceled(self):
    """Tests adding a job execution that becomes canceled while scheduling"""

    node_offers = NodeOffers(self.node)
    offer_1 = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
    node_offers.add_offer(offer_1)
    offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
    node_offers.add_offer(offer_2)
    self.assertFalse(node_offers.has_accepted_job_exes())
    self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
    self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

    # Cancel the execution before considering it; its next task becomes invalid
    job_exe_1 = RunningJobExecution(self.running_job_exe_1)
    job_exe_1.execution_canceled()
    result = node_offers.consider_next_task(job_exe_1)
    self.assertEqual(result, NodeOffers.TASK_INVALID)

    # Nothing should have been accepted
    self.assertFalse(node_offers.has_accepted_job_exes())
    self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
    self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])
def test_job_task_launch_error(self): """Tests running through a job execution where a Docker-based job-task fails to launch""" # Clear error cache so test works correctly CACHED_BUILTIN_ERRORS.clear() job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id) running_job_exe = RunningJobExecution(job_exe) # Start pre-task task = running_job_exe.start_next_task() pre_task_id = task.id # Pre-task running pre_task_started = now() running_job_exe.task_start(pre_task_id, pre_task_started) # Complete pre-task pre_task_completed = pre_task_started + timedelta(seconds=1) pre_task_results = TaskResults(pre_task_id) pre_task_results.exit_code = 0 pre_task_results.when = pre_task_completed running_job_exe.task_complete(pre_task_results) # Start job-task task = running_job_exe.start_next_task() job_task_id = task.id # Job-task fails to launch job_task_results = TaskResults(job_task_id) job_task_results.exit_code = 1 job_task_results.when = now() running_job_exe.task_fail(job_task_results) # Check results job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id) self.assertEqual(job_exe.status, 'FAILED') self.assertEqual(job_exe.error.name, 'docker-task-launch')
def test_no_offers(self):
    """Tests adding job executions when there are no offers"""

    offers = NodeOffers(self.node)
    self.assertFalse(offers.has_accepted_job_exes())
    self.assertListEqual(offers.get_accepted_running_job_exes(), [])
    self.assertListEqual(offers.get_accepted_new_job_exes(), [])

    # With no offers, neither a running nor a new execution can be placed
    running_verdict = offers.consider_next_task(RunningJobExecution(self.running_job_exe_1))
    self.assertEqual(running_verdict, NodeOffers.NO_OFFERS)
    new_verdict = offers.consider_new_job_exe(QueuedJobExecution(self.queue_1))
    self.assertEqual(new_verdict, NodeOffers.NO_OFFERS)

    # Still nothing accepted
    self.assertFalse(offers.has_accepted_job_exes())
    self.assertListEqual(offers.get_accepted_running_job_exes(), [])
    self.assertListEqual(offers.get_accepted_new_job_exes(), [])
def create_task_update_model(status):
    """Builds a task update model populated from the given Mesos task status

    :param status: The task status
    :type status: :class:`mesos_pb2.TaskStatus`
    :returns: The task update model
    :rtype: :class:`job.models.TaskUpdate`
    """

    model = TaskUpdate()
    # The task ID encodes the job execution ID, so extract it after setting task_id
    model.task_id = get_status_task_id(status)
    model.job_exe_id = RunningJobExecution.get_job_exe_id(model.task_id)
    model.status = get_status_state(status)
    model.timestamp = get_status_timestamp(status)
    model.source = get_status_source(status)
    model.reason = get_status_reason(status)
    model.message = get_status_message(status)
    return model
def create_task_update_model(status):
    """Creates and returns a task update model for the given Mesos task status

    :param status: The task status
    :type status: :class:`mesos_pb2.TaskStatus`
    :returns: The task update model
    :rtype: :class:`job.models.TaskUpdate`
    """

    task_update = TaskUpdate()
    task_update.task_id = get_status_task_id(status)
    # The task ID encodes the job execution ID it belongs to
    task_update.job_exe_id = RunningJobExecution.get_job_exe_id(task_update.task_id)
    task_update.status = get_status_state(status)
    task_update.timestamp = get_status_timestamp(status)
    task_update.source = get_status_source(status)
    task_update.reason = get_status_reason(status)
    task_update.message = get_status_message(status)
    return task_update
def test_no_ready_offers(self):
    """Tests considering job executions when no offers are ready"""

    offer_mgr = OfferManager()
    offer_mgr.add_new_offers([
        ResourceOffer('offer_1', self.node_agent_paused, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0)),
        ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0)),
    ])

    # Offers were added but never readied, so nothing can be matched
    new_verdict = offer_mgr.consider_new_job_exe(QueuedJobExecution(self.queue_1))
    self.assertEqual(new_verdict, OfferManager.NO_NODES_AVAILABLE)
    running_verdict = offer_mgr.consider_next_task(RunningJobExecution(self.running_job_exe_1))
    self.assertEqual(running_verdict, OfferManager.NODE_OFFLINE)
def test_lost_node(self):
    """Tests when the node is lost"""

    node_offers = NodeOffers(self.node)
    offer_1 = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
    node_offers.add_offer(offer_1)
    offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
    node_offers.add_offer(offer_2)
    self.assertFalse(node_offers.has_accepted_job_exes())
    self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
    self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

    # Accept a couple job executions
    job_exe_1 = RunningJobExecution(self.running_job_exe_1)
    result = node_offers.consider_next_task(job_exe_1)
    self.assertEqual(result, NodeOffers.ACCEPTED)
    job_exe_2 = QueuedJobExecution(self.queue_1)
    result = node_offers.consider_new_job_exe(job_exe_2)
    self.assertEqual(result, NodeOffers.ACCEPTED)
    self.assertTrue(node_offers.has_accepted_job_exes())
    self.assertGreater(node_offers._available_cpus, 0.0)
    self.assertGreater(node_offers._available_mem, 0.0)
    self.assertGreater(node_offers._available_disk, 0.0)

    # Node is lost: accepted executions and all remaining resources are dropped
    node_offers.lost_node()
    self.assertFalse(node_offers.has_accepted_job_exes())
    self.assertEqual(node_offers._available_cpus, 0.0)
    self.assertEqual(node_offers._available_mem, 0.0)
    self.assertEqual(node_offers._available_disk, 0.0)
def test_canceled_job_execution(self):
    """Tests running through a job execution that gets canceled"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    # Start, run, and complete pre-task
    task = running_job_exe.start_next_task()
    pre_task_id = task.id
    pre_task_started = now()
    running_job_exe.task_start(pre_task_id, pre_task_started)
    pre_task_completed = pre_task_started + timedelta(seconds=1)
    pre_task_results = TaskResults(pre_task_id)
    pre_task_results.exit_code = 0  # exit code 0 indicates pre-task success
    pre_task_results.when = pre_task_completed
    running_job_exe.task_complete(pre_task_results)

    # Start job-task and then execution gets canceled
    job_task = running_job_exe.start_next_task()
    canceled_task = running_job_exe.execution_canceled()

    # Canceling returns the in-flight job-task and finishes the execution
    self.assertEqual(job_task.id, canceled_task.id)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())
def schedule_job_executions(self, framework_id, job_executions, workspaces):
    """Schedules the given job executions on the provided nodes and resources.

    The corresponding queue models will be deleted from the database. All database changes occur in an atomic
    transaction.

    :param framework_id: The scheduling framework ID
    :type framework_id: string
    :param job_executions: A list of queued job executions that have been given nodes and resources on which to run
    :type job_executions: list[:class:`queue.job_exe.QueuedJobExecution`]
    :param workspaces: A dict of all workspaces stored by name
    :type workspaces: {string: :class:`storage.models.Workspace`}
    :returns: The scheduled job executions
    :rtype: list[:class:`job.execution.running.job_exe.RunningJobExecution`]
    """

    # NOTE(review): the docstring promises an atomic transaction, but no transaction
    # decorator/context is visible in this method -- presumably provided by the caller
    # or a decorator outside this view; confirm
    if not job_executions:
        return []

    job_exe_ids = []
    for job_execution in job_executions:
        job_exe_ids.append(job_execution.id)

    # Lock corresponding job executions (row locks held until the enclosing
    # transaction commits; ordered by id to avoid deadlocks between schedulers)
    job_exes = {}
    for job_exe in JobExecution.objects.select_for_update().filter(id__in=job_exe_ids).order_by('id'):
        job_exes[job_exe.id] = job_exe

    # Set up job executions to schedule
    executions_to_schedule = []
    for job_execution in job_executions:
        queue = job_execution.queue
        node = job_execution.provided_node
        resources = job_execution.provided_resources
        job_exe = job_exes[job_execution.id]

        # Ignore executions that are no longer queued (executions may have been changed since queue model was last
        # queried)
        if job_exe.status != 'QUEUED':
            continue

        # Check that resources are sufficient
        if resources.cpus < queue.cpus_required:
            msg = 'Job execution requires %s CPUs and only %s were provided'
            raise Exception(msg % (str(queue.cpus_required), str(resources.cpus)))
        if resources.mem < queue.mem_required:
            msg = 'Job execution requires %s MiB of memory and only %s MiB were provided'
            raise Exception(msg % (str(queue.mem_required), str(resources.mem)))
        if resources.disk_in < queue.disk_in_required:
            msg = 'Job execution requires %s MiB of input disk space and only %s MiB were provided'
            raise Exception(msg % (str(queue.disk_in_required), str(resources.disk_in)))
        if resources.disk_out < queue.disk_out_required:
            msg = 'Job execution requires %s MiB of output disk space and only %s MiB were provided'
            raise Exception(msg % (str(queue.disk_out_required), str(resources.disk_out)))
        if resources.disk_total < queue.disk_total_required:
            msg = 'Job execution requires %s MiB of total disk space and only %s MiB were provided'
            raise Exception(msg % (str(queue.disk_total_required), str(resources.disk_total)))

        executions_to_schedule.append((job_exe, node, resources))

    # Schedule job executions
    scheduled_job_exes = []
    for job_exe in JobExecution.objects.schedule_job_executions(framework_id, executions_to_schedule, workspaces):
        scheduled_job_exes.append(RunningJobExecution(job_exe))

    # Clear the job executions from the queue
    Queue.objects.filter(job_exe_id__in=job_exe_ids).delete()

    return scheduled_job_exes
def statusUpdate(self, driver, status):
    """
    Invoked when the status of a task has changed (e.g., a slave is lost and so the task is lost, a task finishes
    and an executor sends a status update saying so, etc.) Note that returning from this callback acknowledges
    receipt of this status update. If for whatever reason the scheduler aborts during this callback (or the process
    exits) another status update will be delivered. Note, however, that this is currently not true if the slave
    sending the status update is lost or fails during that time.

    See documentation for :meth:`mesos_api.mesos.Scheduler.statusUpdate`.
    """

    started = now()

    task_id = status.task_id.value
    # Task IDs encode the owning job execution ID
    job_exe_id = RunningJobExecution.get_job_exe_id(task_id)
    logger.info('Status update for task %s: %s', task_id, utils.status_to_string(status.state))

    # Since we have a status update for this task, remove it from reconciliation set
    self._recon_thread.remove_task_id(task_id)

    try:
        running_job_exe = self._job_exe_manager.get_job_exe(job_exe_id)

        if running_job_exe:
            results = TaskResults(task_id)
            results.exit_code = utils.parse_exit_code(status)
            results.when = utils.get_status_timestamp(status)
            # For terminal states, try to pull the task logs from the slave; a
            # failure here is logged but does not block handling the update
            if status.state in [mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_ERROR, mesos_pb2.TASK_FAILED,
                                mesos_pb2.TASK_KILLED]:
                try:
                    log_start_time = now()
                    # NOTE(review): reaches into private attributes of the running
                    # job execution for the slave host/port
                    hostname = running_job_exe._node_hostname
                    port = running_job_exe._node_port
                    task_dir = get_slave_task_directory(hostname, port, task_id)
                    results.stdout = get_slave_task_file(hostname, port, task_dir, 'stdout')
                    results.stderr = get_slave_task_file(hostname, port, task_dir, 'stderr')
                    log_end_time = now()
                    logger.debug('Time to pull logs for task: %s', str(log_end_time - log_start_time))
                except Exception:
                    logger.exception('Error pulling logs for task %s', task_id)

            # Apply status update to running job execution
            if status.state == mesos_pb2.TASK_RUNNING:
                hostname = running_job_exe._node_hostname
                port = running_job_exe._node_port
                task_dir = get_slave_task_directory(hostname, port, task_id)
                stdout_url = get_slave_task_url(hostname, port, task_dir, 'stdout')
                stderr_url = get_slave_task_url(hostname, port, task_dir, 'stderr')
                running_job_exe.task_running(task_id, results.when, stdout_url, stderr_url)
            elif status.state == mesos_pb2.TASK_FINISHED:
                running_job_exe.task_complete(results)
            elif status.state == mesos_pb2.TASK_LOST:
                running_job_exe.task_fail(results, Error.objects.get_builtin_error('mesos-lost'))
            elif status.state in [mesos_pb2.TASK_ERROR, mesos_pb2.TASK_FAILED, mesos_pb2.TASK_KILLED]:
                running_job_exe.task_fail(results)

            # Remove finished job execution
            if running_job_exe.is_finished():
                self._job_exe_manager.remove_job_exe(job_exe_id)
        else:
            # Scheduler doesn't have any knowledge of this job execution
            Queue.objects.handle_job_failure(job_exe_id, now(), Error.objects.get_builtin_error('scheduler-lost'))
    except Exception:
        logger.exception('Error handling status update for job execution: %s', job_exe_id)
        # Error handling status update, add task so it can be reconciled
        self._recon_thread.add_task_ids([task_id])

    # Warn when this callback ran longer than the configured threshold
    duration = now() - started
    msg = 'Scheduler statusUpdate() took %.3f seconds'
    if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD:
        logger.warning(msg, duration.total_seconds())
    else:
        logger.debug(msg, duration.total_seconds())
def statusUpdate(self, driver, status):
    """
    Invoked when the status of a task has changed (e.g., a slave is lost and so the task is lost, a task finishes
    and an executor sends a status update saying so, etc.) Note that returning from this callback acknowledges
    receipt of this status update. If for whatever reason the scheduler aborts during this callback (or the process
    exits) another status update will be delivered. Note, however, that this is currently not true if the slave
    sending the status update is lost or fails during that time.

    See documentation for :meth:`mesos_api.mesos.Scheduler.statusUpdate`.
    """

    started = now()

    task_id = status.task_id.value
    # Task IDs encode the owning job execution ID
    job_exe_id = RunningJobExecution.get_job_exe_id(task_id)
    logger.info('Status update for task %s: %s', task_id, utils.status_to_string(status.state))

    # Since we have a status update for this task, remove it from reconciliation set
    self._recon_thread.remove_task_id(task_id)

    try:
        running_job_exe = self._job_exe_manager.get_job_exe(job_exe_id)

        if running_job_exe:
            results = TaskResults(task_id)
            results.exit_code = utils.parse_exit_code(status)
            results.when = utils.get_status_timestamp(status)
            # For terminal states, try to pull the task logs from the slave; a
            # failure here is logged but does not block handling the update
            if status.state in [mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_ERROR, mesos_pb2.TASK_FAILED,
                                mesos_pb2.TASK_KILLED]:
                try:
                    log_start_time = now()
                    # NOTE(review): reaches into private attributes of the running
                    # job execution for the slave host/port
                    hostname = running_job_exe._node_hostname
                    port = running_job_exe._node_port
                    task_dir = get_slave_task_directory(hostname, port, task_id)
                    results.stdout = get_slave_task_file(hostname, port, task_dir, 'stdout')
                    results.stderr = get_slave_task_file(hostname, port, task_dir, 'stderr')
                    log_end_time = now()
                    logger.debug('Time to pull logs for task: %s', str(log_end_time - log_start_time))
                except Exception:
                    logger.exception('Error pulling logs for task %s', task_id)

            # Apply status update to running job execution
            if status.state == mesos_pb2.TASK_RUNNING:
                hostname = running_job_exe._node_hostname
                port = running_job_exe._node_port
                task_dir = get_slave_task_directory(hostname, port, task_id)
                stdout_url = get_slave_task_url(hostname, port, task_dir, 'stdout')
                stderr_url = get_slave_task_url(hostname, port, task_dir, 'stderr')
                running_job_exe.task_running(task_id, results.when, stdout_url, stderr_url)
            elif status.state == mesos_pb2.TASK_FINISHED:
                running_job_exe.task_complete(results)
            elif status.state == mesos_pb2.TASK_LOST:
                running_job_exe.task_fail(results, Error.objects.get_builtin_error('mesos-lost'))
            elif status.state in [mesos_pb2.TASK_ERROR, mesos_pb2.TASK_FAILED, mesos_pb2.TASK_KILLED]:
                running_job_exe.task_fail(results)

            # Remove finished job execution
            if running_job_exe.is_finished():
                self._job_exe_manager.remove_job_exe(job_exe_id)
        else:
            # Scheduler doesn't have any knowledge of this job execution
            Queue.objects.handle_job_failure(job_exe_id, now(), Error.objects.get_builtin_error('scheduler-lost'))
    except Exception:
        logger.exception('Error handling status update for job execution: %s', job_exe_id)
        # Error handling status update, add task so it can be reconciled
        self._recon_thread.add_task_ids([task_id])

    # Warn when this callback ran longer than the configured threshold
    duration = now() - started
    msg = 'Scheduler statusUpdate() took %.3f seconds'
    if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD:
        logger.warning(msg, duration.total_seconds())
    else:
        logger.debug(msg, duration.total_seconds())
def test_general_algorithm_error(self):
    """Tests running through a job execution where the job-task has a general algorithm error (non-zero exit
    code)
    """

    # Clear error cache so test works correctly
    CACHED_BUILTIN_ERRORS.clear()

    def make_results(task_id, exit_code, when):
        # Build a TaskResults for the given task with the given exit code and timestamp
        results = TaskResults(task_id)
        results.exit_code = exit_code
        results.when = when
        return results

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    # Run the pre-task through to successful completion
    pre_task = running_job_exe.start_next_task()
    when_pre_started = now()
    running_job_exe.task_start(pre_task.id, when_pre_started)
    running_job_exe.task_complete(make_results(pre_task.id, 0, when_pre_started + timedelta(seconds=1)))

    # Run the job-task and fail it with a non-zero exit code
    job_task = running_job_exe.start_next_task()
    when_job_started = now()
    running_job_exe.task_start(job_task.id, when_job_started)
    running_job_exe.task_fail(make_results(job_task.id, 1, when_job_started + timedelta(seconds=1)))

    # Check results
    job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
    self.assertEqual(job_exe.status, 'FAILED')
    self.assertEqual(job_exe.error.name, 'algorithm-unknown')
def test_failed_normal_job_execution(self):
    """Tests running through a normal job execution that fails"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # Start pre-task
    pre_task = running_job_exe.start_next_task()
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Pre-task running; start well in the past so now() called at failure time is in the future
    when_started = now() - timedelta(minutes=5)
    running_job_exe.task_update(job_test_utils.create_task_status_update(
        pre_task.id, 'agent', TaskStatusUpdate.RUNNING, when_started))
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Fail pre-task
    when_failed = when_started + timedelta(seconds=1)
    running_job_exe.task_update(job_test_utils.create_task_status_update(
        pre_task.id, 'agent', TaskStatusUpdate.FAILED, when_failed, exit_code=1))
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Verify the recorded pre-task times, exit code, and failed status
    job_exe = JobExecution.objects.get(id=self._job_exe_id)
    self.assertEqual(when_started, job_exe.pre_started)
    self.assertEqual(when_failed, job_exe.pre_completed)
    self.assertEqual(1, job_exe.pre_exit_code)
    self.assertEqual('FAILED', job_exe.status)
    self.assertIsNotNone(job_exe.error_id)
    self.assertGreater(job_exe.ended, when_failed)
def test_lost_task(self):
    """Tests running through a job execution that has a task that gets lost"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    def send_update(task_id, state, when):
        # Deliver a task status update to the running execution
        update = job_test_utils.create_task_status_update(task_id, 'agent', state, when)
        running_job_exe.task_update(update)

    # Run the pre-task through to completion
    pre_task = running_job_exe.start_next_task()
    when_pre_started = now()
    send_update(pre_task.id, TaskStatusUpdate.RUNNING, when_pre_started)
    when_pre_completed = when_pre_started + timedelta(seconds=1)
    send_update(pre_task.id, TaskStatusUpdate.FINISHED, when_pre_completed)

    # Start the job-task and mark it running
    job_task = running_job_exe.start_next_task()
    when_job_started = when_pre_completed + timedelta(seconds=1)
    send_update(job_task.id, TaskStatusUpdate.RUNNING, when_job_started)
    self.assertTrue(job_task.has_started)

    # Lose the task and make sure the same task is the next one to schedule again
    when_lost = when_job_started + timedelta(seconds=1)
    send_update(job_task.id, TaskStatusUpdate.LOST, when_lost)
    self.assertFalse(job_task.has_started)
    next_task = running_job_exe.start_next_task()
    self.assertEqual(job_task.id, next_task.id)
def test_successful_normal_job_execution(self):
    """Tests running through a normal job execution successfully.

    Drives a RunningJobExecution through pre-task -> job-task -> post-task,
    completing each in turn, then verifies the recorded start/completion
    times, exit codes, and the final COMPLETED status.
    """
    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # Start pre-task
    task = running_job_exe.start_next_task()
    pre_task_id = task.id
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Pre-task running
    pre_task_started = now() - timedelta(minutes=5)  # Lots of time so now() called at completion is in future
    running_job_exe.task_start(pre_task_id, pre_task_started)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Pre-task sets updated command arguments
    updated_commands_args = '-arg updated'
    JobExecution.objects.filter(id=self._job_exe_id).update(command_arguments=updated_commands_args)

    # Complete pre-task (exit code 1 is deliberate here; it is asserted as pre_exit_code below)
    pre_task_completed = pre_task_started + timedelta(seconds=1)
    pre_task_results = TaskResults(pre_task_id)
    pre_task_results.exit_code = 1
    pre_task_results.when = pre_task_completed
    running_job_exe.task_complete(pre_task_results)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # Start job-task
    task = running_job_exe.start_next_task()
    job_task_id = task.id
    self.assertEqual(task._command_arguments, updated_commands_args)  # Make sure job task has updated command args
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Job-task running
    job_task_started = pre_task_completed + timedelta(seconds=1)
    running_job_exe.task_start(job_task_id, job_task_started)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Complete job-task
    job_task_completed = job_task_started + timedelta(seconds=1)
    job_task_results = TaskResults(job_task_id)
    job_task_results.exit_code = 2
    job_task_results.when = job_task_completed
    running_job_exe.task_complete(job_task_results)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # Start post-task
    task = running_job_exe.start_next_task()
    post_task_id = task.id
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Post-task running
    post_task_started = job_task_completed + timedelta(seconds=1)
    running_job_exe.task_start(post_task_id, post_task_started)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Complete post-task; execution should now be finished
    post_task_completed = post_task_started + timedelta(seconds=1)
    post_task_results = TaskResults(post_task_id)
    post_task_results.exit_code = 3
    post_task_results.when = post_task_completed
    running_job_exe.task_complete(post_task_results)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Verify all recorded timestamps, exit codes, and the final status
    job_exe = JobExecution.objects.get(id=self._job_exe_id)
    self.assertEqual(pre_task_started, job_exe.pre_started)
    self.assertEqual(pre_task_completed, job_exe.pre_completed)
    self.assertEqual(1, job_exe.pre_exit_code)
    self.assertEqual(job_task_started, job_exe.job_started)
    self.assertEqual(job_task_completed, job_exe.job_completed)
    self.assertEqual(2, job_exe.job_exit_code)
    self.assertEqual(post_task_started, job_exe.post_started)
    self.assertEqual(post_task_completed, job_exe.post_completed)
    self.assertEqual(3, job_exe.post_exit_code)
    self.assertEqual('COMPLETED', job_exe.status)
    self.assertGreater(job_exe.ended, post_task_completed)
def test_failed_normal_job_execution(self):
    """Tests running through a normal job execution that fails"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    error = error_test_utils.create_error()
    running_job_exe = RunningJobExecution(job_exe)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # Start pre-task
    pre_task = running_job_exe.start_next_task()
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Pre-task running; start far enough in the past that now() called at failure time is later
    when_started = now() - timedelta(minutes=5)
    running_job_exe.task_start(pre_task.id, when_started)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Fail pre-task with the created error
    when_failed = when_started + timedelta(seconds=1)
    failed_results = TaskResults(pre_task.id)
    failed_results.exit_code = 1
    failed_results.when = when_failed
    running_job_exe.task_fail(failed_results, error)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Verify recorded times, exit code, status, and the attached error
    job_exe = JobExecution.objects.get(id=self._job_exe_id)
    self.assertEqual(when_started, job_exe.pre_started)
    self.assertEqual(when_failed, job_exe.pre_completed)
    self.assertEqual(1, job_exe.pre_exit_code)
    self.assertEqual('FAILED', job_exe.status)
    self.assertEqual(error.id, job_exe.error_id)
    self.assertGreater(job_exe.ended, when_failed)
def test_lost_job_execution(self):
    """Tests running through a job execution that gets lost"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    # Run the pre-task from start through successful completion
    pre_task = running_job_exe.start_next_task()
    when_started = now()
    running_job_exe.task_start(pre_task.id, when_started)
    when_completed = when_started + timedelta(seconds=1)
    results = TaskResults(pre_task.id)
    results.exit_code = 0
    results.when = when_completed
    running_job_exe.task_complete(results)

    # Start job-task and then execution gets lost
    when_lost = when_completed + timedelta(seconds=1)
    job_task = running_job_exe.start_next_task()
    lost_task = running_job_exe.execution_lost(when_lost)

    # The lost task is the in-flight job-task and the execution is done
    self.assertEqual(job_task.id, lost_task.id)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Execution should be failed with the builtin node-lost error
    job_exe = JobExecution.objects.get(id=self._job_exe_id)
    self.assertEqual('FAILED', job_exe.status)
    self.assertEqual(Error.objects.get_builtin_error('node-lost').id, job_exe.error_id)
    self.assertEqual(when_lost, job_exe.ended)
def test_failed_normal_job_execution(self):
    """Tests running through a normal job execution that fails"""

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    error = error_test_utils.create_error()
    running_job_exe = RunningJobExecution(job_exe)

    def assert_state(finished, next_ready):
        # Check the execution's finished / next-task-ready flags
        if finished:
            self.assertTrue(running_job_exe.is_finished())
        else:
            self.assertFalse(running_job_exe.is_finished())
        if next_ready:
            self.assertTrue(running_job_exe.is_next_task_ready())
        else:
            self.assertFalse(running_job_exe.is_next_task_ready())

    assert_state(finished=False, next_ready=True)

    # Start pre-task
    pre_task = running_job_exe.start_next_task()
    assert_state(finished=False, next_ready=False)

    # Pre-task running (started well in the past so now() called at the end is in the future)
    pre_task_started = now() - timedelta(minutes=5)
    running_job_exe.task_start(pre_task.id, pre_task_started)
    assert_state(finished=False, next_ready=False)

    # Fail pre-task with the created error
    pre_task_failed = pre_task_started + timedelta(seconds=1)
    results = TaskResults(pre_task.id)
    results.exit_code = 1
    results.when = pre_task_failed
    running_job_exe.task_fail(results, error)
    assert_state(finished=True, next_ready=False)

    # Verify recorded times, exit code, status, and the attached error
    job_exe = JobExecution.objects.get(id=self._job_exe_id)
    self.assertEqual(pre_task_started, job_exe.pre_started)
    self.assertEqual(pre_task_failed, job_exe.pre_completed)
    self.assertEqual(1, job_exe.pre_exit_code)
    self.assertEqual('FAILED', job_exe.status)
    self.assertEqual(error.id, job_exe.error_id)
    self.assertGreater(job_exe.ended, pre_task_failed)
def test_general_algorithm_error(self):
    """Tests running through a job execution where the job-task has a general algorithm error (non-zero exit
    code)
    """

    # Clear error cache so test works correctly
    CACHED_BUILTIN_ERRORS.clear()

    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)

    # Start and run the pre-task
    pre_task = running_job_exe.start_next_task()
    when_pre_started = now()
    running_job_exe.task_start(pre_task.id, when_pre_started)

    # Pre-task succeeds
    when_pre_completed = when_pre_started + timedelta(seconds=1)
    pre_results = TaskResults(pre_task.id)
    pre_results.exit_code = 0
    pre_results.when = when_pre_completed
    running_job_exe.task_complete(pre_results)

    # Start and run the job-task
    job_task = running_job_exe.start_next_task()
    when_job_started = now()
    running_job_exe.task_start(job_task.id, when_job_started)

    # Job-task fails with a general (non-zero) exit code
    when_job_failed = when_job_started + timedelta(seconds=1)
    job_results = TaskResults(job_task.id)
    job_results.exit_code = 1
    job_results.when = when_job_failed
    running_job_exe.task_fail(job_results)

    # Check results
    job_exe = JobExecution.objects.select_related().get(id=self._job_exe_id)
    self.assertEqual(job_exe.status, 'FAILED')
    self.assertEqual(job_exe.error.name, 'algorithm-unknown')
def test_successful_normal_job_execution(self):
    """Tests running through a normal job execution successfully.

    Walks a RunningJobExecution through the full pre-task -> job-task ->
    post-task lifecycle and then checks every recorded timestamp, exit
    code, and the final COMPLETED status on the persisted JobExecution.
    """
    job_exe = JobExecution.objects.get_job_exe_with_job_and_job_type(self._job_exe_id)
    running_job_exe = RunningJobExecution(job_exe)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # Start pre-task
    task = running_job_exe.start_next_task()
    pre_task_id = task.id
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Pre-task running
    pre_task_started = now() - timedelta(minutes=5)  # Lots of time so now() called at completion is in future
    running_job_exe.task_start(pre_task_id, pre_task_started)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Pre-task sets updated command arguments
    updated_commands_args = '-arg updated'
    JobExecution.objects.filter(id=self._job_exe_id).update(command_arguments=updated_commands_args)

    # Complete pre-task (exit code 1 is deliberate; it is asserted as pre_exit_code below)
    pre_task_completed = pre_task_started + timedelta(seconds=1)
    pre_task_results = TaskResults(pre_task_id)
    pre_task_results.exit_code = 1
    pre_task_results.when = pre_task_completed
    running_job_exe.task_complete(pre_task_results)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # Start job-task
    task = running_job_exe.start_next_task()
    job_task_id = task.id
    self.assertEqual(task._command_arguments, updated_commands_args)  # Make sure job task has updated command args
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Job-task running
    job_task_started = pre_task_completed + timedelta(seconds=1)
    running_job_exe.task_start(job_task_id, job_task_started)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Complete job-task
    job_task_completed = job_task_started + timedelta(seconds=1)
    job_task_results = TaskResults(job_task_id)
    job_task_results.exit_code = 2
    job_task_results.when = job_task_completed
    running_job_exe.task_complete(job_task_results)
    self.assertFalse(running_job_exe.is_finished())
    self.assertTrue(running_job_exe.is_next_task_ready())

    # Start post-task
    task = running_job_exe.start_next_task()
    post_task_id = task.id
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Post-task running
    post_task_started = job_task_completed + timedelta(seconds=1)
    running_job_exe.task_start(post_task_id, post_task_started)
    self.assertFalse(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Complete post-task; execution should now be finished
    post_task_completed = post_task_started + timedelta(seconds=1)
    post_task_results = TaskResults(post_task_id)
    post_task_results.exit_code = 3
    post_task_results.when = post_task_completed
    running_job_exe.task_complete(post_task_results)
    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Verify all recorded timestamps, exit codes, and the final status
    job_exe = JobExecution.objects.get(id=self._job_exe_id)
    self.assertEqual(pre_task_started, job_exe.pre_started)
    self.assertEqual(pre_task_completed, job_exe.pre_completed)
    self.assertEqual(1, job_exe.pre_exit_code)
    self.assertEqual(job_task_started, job_exe.job_started)
    self.assertEqual(job_task_completed, job_exe.job_completed)
    self.assertEqual(2, job_exe.job_exit_code)
    self.assertEqual(post_task_started, job_exe.post_started)
    self.assertEqual(post_task_completed, job_exe.post_completed)
    self.assertEqual(3, job_exe.post_exit_code)
    self.assertEqual('COMPLETED', job_exe.status)
    self.assertGreater(job_exe.ended, post_task_completed)