def test_score_job_exe_for_reservation_insufficient_resources(self):
    """Tests calling score_job_exe_for_reservation() when there are not enough
    resources to reserve for the job"""

    mock_node = MagicMock()
    mock_node.hostname = 'host_1'
    mock_node.id = 1
    mock_node.is_ready_for_new_job = MagicMock(return_value=True)
    mock_node.is_ready_for_next_job_task = MagicMock(return_value=True)

    offered = NodeResources([Cpus(20.0), Mem(100.0)])
    watermark = NodeResources([Cpus(200.0), Mem(700.0)])
    resource_set = ResourceSet(offered, NodeResources(), watermark)
    # HealthTask resources are 0.1 CPUs and 32 MiB memory
    health_task = HealthTask('1234', 'agent_1')
    # Two existing running job executions, one lower and one higher priority than 120
    running_exe_low_pri = job_test_utils.create_running_job_exe(
        agent_id=self.agent_id, resources=NodeResources([Cpus(10.0), Mem(50.0)]),
        priority=1000)
    running_exe_high_pri = job_test_utils.create_running_job_exe(
        agent_id=self.agent_id, resources=NodeResources([Cpus(56.0), Mem(15.0)]),
        priority=100)
    scheduling_node = SchedulingNode('agent_1', mock_node, [health_task],
                                     [running_exe_low_pri, running_exe_high_pri],
                                     resource_set)

    # Accept two newly queued job executions, again one higher and one lower priority
    queued_exe_high_pri = QueuedJobExecution(queue_test_utils.create_queue(
        priority=100, cpus_required=8.0, mem_required=40.0, disk_in_required=0.0,
        disk_out_required=0.0, disk_total_required=0.0))
    queued_exe_low_pri = QueuedJobExecution(queue_test_utils.create_queue(
        priority=1000, cpus_required=8.0, mem_required=40.0, disk_in_required=0.0,
        disk_out_required=0.0, disk_total_required=0.0))
    scheduling_node.accept_new_job_exe(queued_exe_high_pri)
    scheduling_node.accept_new_job_exe(queued_exe_low_pri)

    # We are going to try to reserve the node for a job execution with priority 120.
    # Available resources for reservation:
    # Watermark (200, 700) - system tasks (0.1, 32) - higher-priority existing job
    # exes (56, 15) - higher-priority new job exes (8, 40) = 135.9 CPUs, 613 MiB memory.
    # This new job requires more than that, so it should NOT fit for reservation.
    queue_model = queue_test_utils.create_queue(
        priority=120, cpus_required=140.0, mem_required=600.0, disk_in_required=0.0,
        disk_out_required=0.0, disk_total_required=0.0)
    job_exe = QueuedJobExecution(queue_model)

    score = scheduling_node.score_job_exe_for_reservation(
        job_exe, [NodeResources([Cpus(2.0), Mem(10.0)])])
    self.assertIsNone(score)
def test_start_job_exe_tasks(self):
    """Tests calling start_job_exe_tasks() successfully"""

    mock_node = MagicMock()
    mock_node.hostname = 'host_1'
    mock_node.id = 1
    mock_node.is_ready_for_new_job = MagicMock(return_value=True)
    mock_node.is_ready_for_next_job_task = MagicMock(return_value=True)
    resource_set = ResourceSet(NodeResources([Cpus(20.0), Mem(100.0)]),
                               NodeResources(),
                               NodeResources([Cpus(200.0), Mem(700.0)]))
    sched_node = SchedulingNode('agent_1', mock_node, [], [], resource_set)

    # Allocate the next task of two running job executions to the node
    exe_to_cancel = job_test_utils.create_running_job_exe(
        agent_id=self.agent_id, resources=NodeResources([Cpus(10.0), Mem(50.0)]))
    exe_with_task = job_test_utils.create_running_job_exe(
        agent_id=self.agent_id, resources=NodeResources([Cpus(5.0), Mem(25.0)]))
    sched_node.accept_job_exe_next_task(exe_to_cancel, [])
    sched_node.accept_job_exe_next_task(exe_with_task, [])
    self.assertEqual(len(sched_node._allocated_running_job_exes), 2)

    # Cancel the first execution so it will not have a next task to start
    exe_to_cancel.execution_canceled(now())

    sched_node.start_job_exe_tasks()
    self.assertEqual(len(sched_node._allocated_running_job_exes), 0)
    # Only the second job execution still had a next task
    self.assertEqual(len(sched_node.allocated_tasks), 1)
def test_job_exe_no_offers(self):
    """Tests the NodeManager where a node is running an exe and has not given offers to
    Scale in 1 hour. Expected behavior: the scheduler and DB are in sync and the node is
    still active
    """
    last_offer = now() - datetime.timedelta(hours=1)

    node_mgr = NodeManager()
    node_mgr.register_agents([self.agent_1])
    node_mgr.sync_with_database(scheduler_mgr.config)

    # Add job to node
    job_test_utils.create_running_job_exe(agent_id=self.agent_1, node=self.node_1)

    # Set last_offer_received to 1 hour ago
    Node.objects.filter(id=self.node_1.id).update(last_offer_received=last_offer)

    # This inspects what nodes are running jobs and what nodes need to be removed if they
    # have not sent offers in the last 5 minutes
    node_mgr.sync_with_database(scheduler_mgr.config)

    # Get the DB and scheduler state and make sure they are consistent
    db_record = Node.objects.get(id=self.node_1.id)
    scheduler_record = node_mgr.get_node(self.agent_1.agent_id)
    # Bug fix: the original call passed True as assertEqual()'s msg parameter
    # (assertEqual(a, b, True)), so the "still active" expectation was never checked.
    # Assert the two states agree AND that the node is still active.
    self.assertEqual(db_record.is_active, scheduler_record._is_active)
    self.assertTrue(db_record.is_active)
def test_get_nodes_running_job_exes(self):
    """Tests calling NodeManager.get_nodes_running_job_exes()"""

    # Create nodes
    node_1 = node_test_utils.create_node(hostname='node_1')
    node_2 = node_test_utils.create_node(hostname='node_2')
    node_3 = node_test_utils.create_node(hostname='node_3')

    # No running jobs; result should be empty
    self.assertEqual(Node.objects.get_nodes_running_job_exes(), [])

    # Finished executions do not count as running
    job_test_utils.create_job_exe(node=node_3, status='COMPLETED')
    job_test_utils.create_job_exe(node=node_3, status='FAILED')
    job_test_utils.create_job_exe(node=node_3, status='CANCELED')
    self.assertEqual(Node.objects.get_nodes_running_job_exes(), [])

    # One running job execution on node_1
    job_test_utils.create_running_job_exe(node=node_1)
    running_nodes = Node.objects.get_nodes_running_job_exes()
    self.assertEqual(len(running_nodes), 1)
    self.assertEqual(running_nodes[0], node_1.id)

    # A second running job execution, created through the other helper path for
    # testing completeness
    job_test_utils.create_job_exe(node=node_2, status='RUNNING')
    running_nodes = Node.objects.get_nodes_running_job_exes()
    self.assertEqual(len(running_nodes), 2)
    self.assertIn(node_1.id, running_nodes)
    self.assertIn(node_2.id, running_nodes)
def setUp(self): django.setup() # Clear error cache so test works correctly CACHED_ERRORS.clear() self.agent_id = 'agent' self.node_model_1 = node_test_utils.create_node() self.job_exe_1 = job_test_utils.create_running_job_exe(agent_id=self.agent_id, node=self.node_model_1) self.node_model_2 = node_test_utils.create_node() self.job_exe_2 = job_test_utils.create_running_job_exe(agent_id=self.agent_id, node=self.node_model_2) self.job_exe_mgr = JobExecutionManager()
def setUp(self): django.setup() # Clear error cache so tests work correctly reset_error_cache() self.agent_id = 'agent' self.node_model_1 = node_test_utils.create_node() self.job_exe_1 = job_test_utils.create_running_job_exe(agent_id=self.agent_id, node=self.node_model_1) self.node_model_2 = node_test_utils.create_node() self.job_exe_2 = job_test_utils.create_running_job_exe(agent_id=self.agent_id, node=self.node_model_2) self.task_mgr = TaskManager() self.job_exe_mgr = JobExecutionManager()
def test_job_exe_clean_task(self):
    """Tests the NodeManager where a cleanup task is returned to clean up a job execution"""

    when = now()
    node_mgr = NodeManager()
    node_mgr.register_agents([self.agent_1, self.agent_2])
    node_mgr.sync_with_database(scheduler_mgr.config)
    cleanup_mgr = CleanupManager()
    cleanup_mgr.update_nodes(node_mgr.get_nodes())
    tasks = node_mgr.get_next_tasks(when)

    task_mgr = TaskManager()
    # Complete the initial cleanup tasks so the nodes are past their initial cleanup
    for initial_task in tasks:
        task_mgr.launch_tasks([initial_task], now())
        update = job_test_utils.create_task_status_update(
            initial_task.id, initial_task.agent_id, TaskStatusUpdate.FINISHED, now())
        task_mgr.handle_task_update(update)
        node_mgr.handle_task_update(update)

    # Mark image pull done to get rid of image tasks
    for node in node_mgr.get_nodes():
        node._image_pull_completed()
        node._update_state()

    job_exe = job_test_utils.create_running_job_exe(agent_id=self.agent_1,
                                                    node=self.node_1)

    # Add a job execution to clean up and get the cleanup task for it
    cleanup_mgr.add_job_execution(job_exe)
    cleanup_tasks = node_mgr.get_next_tasks(when)
    self.assertEqual(len(cleanup_tasks), 1)
    cleanup_task = cleanup_tasks[0]
    self.assertEqual(cleanup_task.agent_id, self.agent_1.agent_id)
    self.assertFalse(cleanup_task.is_initial_cleanup)
    self.assertEqual(len(cleanup_task.job_exes), 1)
def test_json(self):
    """Tests converting a RestartScheduler message to and from JSON"""
    started = now()
    scheduler_restarted = started + datetime.timedelta(seconds=30)
    running_job_exe = job_test_utils.create_running_job_exe(started=started)

    # Create message
    message = RestartScheduler()
    message.when = scheduler_restarted

    # Convert message to JSON and back, and then execute
    message_json_dict = message.to_json()
    new_message = RestartScheduler.from_json(message_json_dict)
    result = new_message.execute()

    self.assertTrue(result)
    failed_jobs_msg = None
    job_exe_end_msg = None
    self.assertEqual(len(new_message.new_messages), 2)
    for msg in new_message.new_messages:
        if msg.type == 'failed_jobs':
            failed_jobs_msg = msg
        elif msg.type == 'create_job_exe_ends':
            job_exe_end_msg = msg
    # Bug fix: wrap dict.values() in list() - in Python 3 it returns a
    # non-indexable view, so values()[0] raises TypeError
    self.assertEqual(list(failed_jobs_msg._failed_jobs.values())[0][0].job_id,
                     running_job_exe.job_id)
    self.assertEqual(job_exe_end_msg._job_exe_ends[0].job_exe_id, running_job_exe.id)
def test_accept_job_exe_next_task_canceled(self):
    """Tests calling accept_job_exe_next_task() when the job exe gets canceled (no next task)"""

    mock_node = MagicMock()
    mock_node.hostname = 'host_1'
    mock_node.id = 1
    mock_node.is_ready_for_new_job = MagicMock(return_value=True)
    mock_node.is_ready_for_next_job_task = MagicMock(return_value=True)
    offered = NodeResources([Cpus(10.0), Mem(50.0)])
    resource_set = ResourceSet(offered, NodeResources(),
                               NodeResources([Cpus(100.0), Mem(500.0)]))
    sched_node = SchedulingNode('agent_1', mock_node, [], [], resource_set)
    job_exe = job_test_utils.create_running_job_exe(
        agent_id=self.agent_id, resources=NodeResources([Cpus(1.0), Mem(10.0)]))
    waiting_tasks = []

    # Cancel the execution before offering its next task to the node
    job_exe.execution_canceled(now())
    had_waiting_task = sched_node.accept_job_exe_next_task(job_exe, waiting_tasks)

    # Nothing should have been accepted or allocated
    self.assertFalse(had_waiting_task)
    self.assertEqual(len(sched_node._allocated_running_job_exes), 0)
    self.assertTrue(sched_node.allocated_resources.is_equal(NodeResources()))
    self.assertTrue(sched_node._remaining_resources.is_equal(
        NodeResources([Cpus(10.0), Mem(50.0)])))
    self.assertListEqual(waiting_tasks, [])
def test_score_job_exe_for_scheduling_insufficient_resources(self):
    """Tests calling score_job_exe_for_scheduling() when there are not enough
    resources to schedule the job"""

    mock_node = MagicMock()
    mock_node.hostname = 'host_1'
    mock_node.id = 1
    mock_node.is_ready_for_new_job = MagicMock(return_value=True)
    mock_node.is_ready_for_next_job_task = MagicMock(return_value=True)
    resource_set = ResourceSet(NodeResources([Cpus(20.0), Mem(100.0)]),
                               NodeResources([Cpus(100.0), Mem(500.0)]),
                               NodeResources([Cpus(200.0), Mem(700.0)]))
    sched_node = SchedulingNode('agent_1', mock_node, [], [], resource_set)

    # Allocate 10 CPUs and 50 MiB memory to an existing job execution
    running_exe = job_test_utils.create_running_job_exe(
        agent_id=self.agent_id, resources=NodeResources([Cpus(10.0), Mem(50.0)]))
    sched_node.accept_job_exe_next_task(running_exe, [])

    # Only 10 CPUs and 50 MiB memory remain, so this queued job execution is too big
    queue_model = queue_test_utils.create_queue(
        cpus_required=15.0, mem_required=40.0, disk_in_required=0.0,
        disk_out_required=0.0, disk_total_required=0.0)
    queued_exe = QueuedJobExecution(queue_model)

    self.assertIsNone(sched_node.score_job_exe_for_scheduling(queued_exe, []))
def test_job_type_limit(self):
    """Tests calling perform_scheduling() with a job type limit"""
    Queue.objects.all().delete()

    job_type_with_limit = job_test_utils.create_seed_job_type()
    job_type_with_limit.max_scheduled = 4
    job_type_with_limit.save()
    running_job_exe_1 = job_test_utils.create_running_job_exe(
        agent_id=self.agent_1.agent_id, job_type=job_type_with_limit, node=self.node_1)
    # Queue up six more executions of the limited job type
    for _ in range(6):
        queue_test_utils.create_queue(job_type=job_type_with_limit)
    job_type_mgr.sync_with_database()
    # One job of this type is already running
    job_exe_mgr.schedule_job_exes([running_job_exe_1], [])

    offer_1 = ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                            NodeResources([Cpus(0.0), Mem(1024.0), Disk(1024.0)]),
                            now(), None)
    offer_2 = ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                            NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]),
                            now(), None)
    resource_mgr.add_new_offers([offer_1, offer_2])

    scheduling_manager = SchedulingManager()
    num_tasks = scheduling_manager.perform_scheduling(self._client, now())
    # One is already running, so only 3 more can be scheduled under the limit of 4
    self.assertEqual(num_tasks, 3)
def setUp(self):
    """Creates a scheduler, a node, a running job execution on that node, and a
    fresh TaskManager for each test
    """
    django.setup()

    self.scheduler = Scheduler()
    self.node_agent = 'agent_1'
    self.node = node_test_utils.create_node(hostname='host_1', slave_id=self.node_agent)
    self.job_exe = job_test_utils.create_running_job_exe(agent_id=self.node_agent, node=self.node)

    self.task_mgr = TaskManager()
def test_score_job_exe_for_scheduling(self):
    """Tests calling score_job_exe_for_scheduling() successfully"""

    mock_node = MagicMock()
    mock_node.hostname = 'host_1'
    mock_node.id = 1
    mock_node.is_ready_for_new_job = MagicMock(return_value=True)
    mock_node.is_ready_for_next_job_task = MagicMock(return_value=True)
    resource_set = ResourceSet(NodeResources([Cpus(20.0), Mem(100.0)]),
                               NodeResources([Cpus(100.0), Mem(500.0)]),
                               NodeResources([Cpus(200.0), Mem(700.0)]))
    sched_node = SchedulingNode('agent_1', mock_node, [], [], resource_set)

    # Allocate 10 CPUs and 50 MiB memory to an existing job execution
    running_exe = job_test_utils.create_running_job_exe(
        agent_id=self.agent_id, resources=NodeResources([Cpus(10.0), Mem(50.0)]))
    sched_node.accept_job_exe_next_task(running_exe, [])

    # 10 CPUs and 50 MiB memory remain, so this queued job execution should fit
    queue_model = queue_test_utils.create_queue(
        cpus_required=5.0, mem_required=40.0, disk_in_required=0.0,
        disk_out_required=0.0, disk_total_required=0.0)
    queued_exe = QueuedJobExecution(queue_model)

    # Expected available 85 CPUs and 110 MiB memory "left" on the node
    # (watermark - current tasks - allocated - the new job being scored).
    # The first two job type profiles fit, the last two are too big: score should be 2
    job_type_resources = [
        NodeResources([Cpus(2.0), Mem(10.0)]),
        NodeResources([Cpus(85.0), Mem(109.0)]),
        NodeResources([Cpus(86.0), Mem(10.0)]),
        NodeResources([Cpus(2.0), Mem(111.0)]),
    ]
    self.assertEqual(sched_node.score_job_exe_for_scheduling(queued_exe, job_type_resources), 2)
def test_timed_out_system_job_task(self):
    """Tests running through a job execution where a system job task times out"""
    # max_tries = 1 so a single timed-out try fails the job (statuses asserted below)
    ingest_job_type = Ingest.objects.get_ingest_job_type()
    ingest_job_type.max_tries = 1
    ingest_job_type.save()
    running_job_exe = job_test_utils.create_running_job_exe(
        agent_id='agent_1', job_type=ingest_job_type, num_exes=1)

    # Start job-task and then task times out
    when_launched = now() + timedelta(seconds=1)
    job_task_started = when_launched + timedelta(seconds=1)
    when_timed_out = job_task_started + timedelta(seconds=1)
    job_task = running_job_exe.start_next_task()
    self.task_mgr.launch_tasks([job_task], when_launched)
    update = job_test_utils.create_task_status_update(
        job_task.id, 'agent', TaskStatusUpdate.RUNNING, job_task_started)
    self.task_mgr.handle_task_update(update)
    running_job_exe.task_update(update)
    running_job_exe.execution_timed_out(job_task, when_timed_out)
    # Not finished until killed task update arrives, but error fields are set already
    self.assertFalse(running_job_exe.is_finished())
    self.assertEqual(running_job_exe.status, 'FAILED')
    self.assertEqual(running_job_exe.error_category, 'SYSTEM')
    self.assertEqual(running_job_exe.error.name, 'ingest-timeout')
    self.assertEqual(running_job_exe.finished, when_timed_out)
    self.assertFalse(running_job_exe.is_next_task_ready())

    # Killed task update arrives, job execution is now finished with the same error state
    job_task_kill = when_timed_out + timedelta(seconds=1)
    update = job_test_utils.create_task_status_update(
        job_task.id, 'agent', TaskStatusUpdate.KILLED, job_task_kill)
    self.task_mgr.handle_task_update(update)
    running_job_exe.task_update(update)
    self.assertTrue(running_job_exe.is_finished())
    self.assertEqual(running_job_exe.status, 'FAILED')
    self.assertEqual(running_job_exe.error_category, 'SYSTEM')
    self.assertEqual(running_job_exe.error.name, 'ingest-timeout')
    self.assertEqual(running_job_exe.finished, when_timed_out)
    self.assertFalse(running_job_exe.is_next_task_ready())
def test_add_allocated_offers_remove_all_tasks(self):
    """Tests calling add_allocated_offers() when there are not enough resources for
    the job exes or node tasks"""

    mock_node = MagicMock()
    mock_node.hostname = 'host_1'
    mock_node.id = 1
    health_task = HealthTask('1234', 'agent_1')
    pull_task = PullTask('1234', 'agent_1')
    mock_node.is_ready_for_new_job = MagicMock(return_value=True)
    mock_node.is_ready_for_next_job_task = MagicMock(return_value=True)
    mock_node.get_next_tasks = MagicMock(return_value=[health_task, pull_task])
    offered_resources = NodeResources([Cpus(100.0), Mem(500.0)])
    resource_set = ResourceSet(offered_resources, NodeResources(),
                               NodeResources([Cpus(100.0), Mem(500.0)]))
    sched_node = SchedulingNode('agent_1', mock_node, [], [], resource_set)
    running_exe_1 = job_test_utils.create_running_job_exe(
        agent_id=self.agent_id, resources=NodeResources([Cpus(1.0), Mem(10.0)]))
    running_exe_2 = job_test_utils.create_running_job_exe(
        agent_id=self.agent_id, resources=NodeResources([Cpus(2.0), Mem(20.0)]))

    # Totals for the node tasks and for everything that will be allocated
    node_task_resources = NodeResources()
    node_task_resources.add(health_task.get_resources())
    node_task_resources.add(pull_task.get_resources())
    all_required_resources = NodeResources()
    all_required_resources.add(node_task_resources)
    all_required_resources.add(running_exe_1.next_task().get_resources())
    all_required_resources.add(running_exe_2.next_task().get_resources())
    expected_remaining_resources = NodeResources()
    expected_remaining_resources.add(offered_resources)
    expected_remaining_resources.subtract(node_task_resources)

    # Set up node with node tasks and job exes (there would never be queued job exes
    # since they would be scheduled before add_allocated_offers() was called)
    sched_node.accept_node_tasks(now(), [])
    sched_node.accept_job_exe_next_task(running_exe_1, [])
    sched_node.accept_job_exe_next_task(running_exe_2, [])
    self.assertEqual(len(sched_node.allocated_tasks), 2)
    self.assertEqual(len(sched_node._allocated_running_job_exes), 2)
    self.assertEqual(len(sched_node._allocated_queued_job_exes), 0)
    self.assertTrue(sched_node.allocated_resources.is_equal(all_required_resources))

    # Set up an offer that is not enough for the job exes or the node tasks
    offer_1 = ResourceOffer('offer_1', 'agent_1', '1234',
                            NodeResources([Cpus(0.1), Mem(600.0)]), now(), None)
    sched_node.add_allocated_offers([offer_1])
    self.assertListEqual(sched_node.allocated_offers, [offer_1])
    # All allocated tasks and job exes should be gone
    self.assertEqual(len(sched_node.allocated_tasks), 0)
    self.assertEqual(len(sched_node._allocated_running_job_exes), 0)
    self.assertEqual(len(sched_node._allocated_queued_job_exes), 0)
    self.assertTrue(sched_node.allocated_resources.is_equal(NodeResources()))
    self.assertTrue(sched_node._remaining_resources.is_equal(offered_resources))
def setUp(self):
    """Creates node conditions and a single running job execution (with its job ID)
    for each test
    """
    django.setup()

    self.conditions = NodeConditions('test_node')
    self.job_exes = [job_test_utils.create_running_job_exe()]
    self.job_ids = [exe.job_id for exe in self.job_exes]
def test_score_job_exe_for_reservation(self):
    """Tests calling score_job_exe_for_reservation() successfully"""

    mock_node = MagicMock()
    mock_node.hostname = 'host_1'
    mock_node.id = 1
    mock_node.is_ready_for_new_job = MagicMock(return_value=True)
    mock_node.is_ready_for_next_job_task = MagicMock(return_value=True)

    offered = NodeResources([Cpus(20.0), Mem(100.0)])
    watermark = NodeResources([Cpus(200.0), Mem(700.0)])
    resource_set = ResourceSet(offered, NodeResources(), watermark)
    # HealthTask resources are 0.1 CPUs and 32 MiB memory
    health_task = HealthTask('1234', 'agent_1')
    # Two existing running job executions, one lower and one higher priority than 120
    running_exe_low_pri = job_test_utils.create_running_job_exe(
        agent_id=self.agent_id, resources=NodeResources([Cpus(10.0), Mem(50.0)]),
        priority=1000)
    running_exe_high_pri = job_test_utils.create_running_job_exe(
        agent_id=self.agent_id, resources=NodeResources([Cpus(56.0), Mem(15.0)]),
        priority=100)
    scheduling_node = SchedulingNode('agent_1', mock_node, [health_task],
                                     [running_exe_low_pri, running_exe_high_pri],
                                     resource_set)

    # Accept two newly queued job executions, again one higher and one lower priority
    queued_exe_high_pri = QueuedJobExecution(queue_test_utils.create_queue(
        priority=100, cpus_required=8.0, mem_required=40.0, disk_in_required=0.0,
        disk_out_required=0.0, disk_total_required=0.0))
    queued_exe_low_pri = QueuedJobExecution(queue_test_utils.create_queue(
        priority=1000, cpus_required=8.0, mem_required=40.0, disk_in_required=0.0,
        disk_out_required=0.0, disk_total_required=0.0))
    scheduling_node.accept_new_job_exe(queued_exe_high_pri)
    scheduling_node.accept_new_job_exe(queued_exe_low_pri)

    # We are going to try to reserve the node for a job execution with priority 120.
    # Available resources for reservation:
    # Watermark (200, 700) - system tasks (0.1, 32) - higher-priority existing job
    # exes (56, 15) - higher-priority new job exes (8, 40) = 135.9 CPUs, 613 MiB memory.
    # This new job should fit for reservation.
    queue_model = queue_test_utils.create_queue(
        priority=120, cpus_required=130.0, mem_required=600.0, disk_in_required=0.0,
        disk_out_required=0.0, disk_total_required=0.0)
    job_exe = QueuedJobExecution(queue_model)

    # Expected available 5.9 CPUs and 13 MiB memory "left" on the node
    # (available above - the new job being scored).
    # The first two job type profiles fit, the last two are too big: score should be 2
    job_type_resources = [
        NodeResources([Cpus(2.0), Mem(10.0)]),
        NodeResources([Cpus(5.5), Mem(12.0)]),
        NodeResources([Cpus(6.0), Mem(10.0)]),
        NodeResources([Cpus(2.0), Mem(14.0)]),
    ]
    score = scheduling_node.score_job_exe_for_reservation(job_exe, job_type_resources)
    self.assertEqual(score, 2)
def test_execute(self):
    """Tests calling RestartScheduler.execute() successfully"""
    started = now()
    scheduler_restarted = started + datetime.timedelta(seconds=30)
    started_later = scheduler_restarted + datetime.timedelta(seconds=30)
    running_job_exe_1 = job_test_utils.create_running_job_exe(started=started)
    running_job_exe_2 = job_test_utils.create_running_job_exe(started=started)
    running_job_exe_3 = job_test_utils.create_running_job_exe(started=started)
    # Started after scheduler restart, so it should not be failed
    running_job_exe_4 = job_test_utils.create_running_job_exe(started=started_later)
    # Set job 1 so it is still QUEUED
    Job.objects.filter(id=running_job_exe_1.job_id).update(status='QUEUED')
    # Set job 3 to COMPLETED, so it should not be failed by scheduler restart
    Job.objects.filter(id=running_job_exe_3.job_id).update(status='COMPLETED')

    # Create message
    message = RestartScheduler()
    message.when = scheduler_restarted

    # Execute message
    result = message.execute()
    self.assertTrue(result)

    failed_jobs_msg = None
    job_exe_end_msg = None
    self.assertEqual(len(message.new_messages), 2)
    for msg in message.new_messages:
        if msg.type == 'failed_jobs':
            failed_jobs_msg = msg
        elif msg.type == 'create_job_exe_ends':
            job_exe_end_msg = msg
    error = get_builtin_error('scheduler-lost')

    # Jobs 1 and 2 should be in messages to be failed, jobs 3 and 4 should not be included
    expected_failed_jobs = {
        FailedJob(running_job_exe_1.job_id, running_job_exe_1.exe_num, error.id),
        FailedJob(running_job_exe_2.job_id, running_job_exe_2.exe_num, error.id)
    }
    expected_failed_job_exe_ids = {running_job_exe_1.id, running_job_exe_2.id}
    # Bug fix: wrap dict.values() in list() - in Python 3 it returns a
    # non-indexable view, so values()[0] raises TypeError
    self.assertSetEqual(set(list(failed_jobs_msg._failed_jobs.values())[0]),
                        expected_failed_jobs)
    failed_job_exe_ids = set()
    for job_exe_end_model in job_exe_end_msg._job_exe_ends:
        failed_job_exe_ids.add(job_exe_end_model.job_exe_id)
    self.assertSetEqual(failed_job_exe_ids, expected_failed_job_exe_ids)

    # Test executing message again, should get same result
    message_json_dict = message.to_json()
    message = RestartScheduler.from_json(message_json_dict)
    result = message.execute()
    self.assertTrue(result)

    # Bug fix: re-extract the new messages from the re-executed message; the original
    # re-asserted the stale messages from the first execution, so the second half of the
    # test never actually checked the re-execution's output
    failed_jobs_msg = None
    job_exe_end_msg = None
    self.assertEqual(len(message.new_messages), 2)
    for msg in message.new_messages:
        if msg.type == 'failed_jobs':
            failed_jobs_msg = msg
        elif msg.type == 'create_job_exe_ends':
            job_exe_end_msg = msg

    # Jobs 1 and 2 should again be in messages to be failed, jobs 3 and 4 should not
    self.assertSetEqual(set(list(failed_jobs_msg._failed_jobs.values())[0]),
                        expected_failed_jobs)
    failed_job_exe_ids = set()
    for job_exe_end_model in job_exe_end_msg._job_exe_ends:
        failed_job_exe_ids.add(job_exe_end_model.job_exe_id)
    self.assertSetEqual(failed_job_exe_ids, expected_failed_job_exe_ids)
def test_json(self):
    """Tests converting a CreateJobExecutionEnd message to and from JSON"""
    job_exe_1 = job_test_utils.create_running_job_exe()
    job_exe_2 = job_test_utils.create_running_job_exe()
    job_exe_3 = job_test_utils.create_running_job_exe()
    job_exe_4 = job_test_utils.create_running_job_exe()
    job_exe_5 = job_test_utils.create_running_job_exe()
    job_exe_ids = [job_exe_1.id, job_exe_2.id, job_exe_3.id, job_exe_4.id, job_exe_5.id]

    # Execution that was immediately canceled
    job_exe_1.execution_canceled(now())

    # Execution that was canceled after a task launched
    task_2 = job_exe_2.start_next_task()
    task_2.launch(now())
    job_exe_2.execution_canceled(now())
    update = job_test_utils.create_task_status_update(
        task_2.id, task_2.agent_id, TaskStatusUpdate.KILLED, now())
    task_2.update(update)
    job_exe_2.task_update(update)

    # Execution where a task timed out
    task_3 = job_exe_3.start_next_task()
    task_3.launch(now())
    job_exe_3.execution_timed_out(task_3, now())
    update = job_test_utils.create_task_status_update(
        task_3.id, task_3.agent_id, TaskStatusUpdate.KILLED, now())
    task_3.update(update)
    job_exe_3.task_update(update)

    # Execution where a task failed
    task_4 = job_exe_4.start_next_task()
    task_4.launch(now())
    update = job_test_utils.create_task_status_update(
        task_4.id, task_4.agent_id, TaskStatusUpdate.FAILED, now())
    task_4.update(update)
    job_exe_4.task_update(update)

    # Execution that completed (run every task through RUNNING then FINISHED)
    while not job_exe_5.is_finished():
        task_5 = job_exe_5.start_next_task()
        task_5.launch(now())
        update = job_test_utils.create_task_status_update(
            task_5.id, task_5.agent_id, TaskStatusUpdate.RUNNING, now())
        task_5.update(update)
        job_exe_5.task_update(update)
        update = job_test_utils.create_task_status_update(
            task_5.id, task_5.agent_id, TaskStatusUpdate.FINISHED, now())
        task_5.update(update)
        job_exe_5.task_update(update)

    # Add models to message
    message = CreateJobExecutionEnd()
    if message.can_fit_more():
        message.add_job_exe_end(job_exe_1.create_job_exe_end_model())
    if message.can_fit_more():
        message.add_job_exe_end(job_exe_2.create_job_exe_end_model())
    if message.can_fit_more():
        message.add_job_exe_end(job_exe_3.create_job_exe_end_model())
    if message.can_fit_more():
        message.add_job_exe_end(job_exe_4.create_job_exe_end_model())
    if message.can_fit_more():
        message.add_job_exe_end(job_exe_5.create_job_exe_end_model())

    # Convert message to JSON and back, and then execute
    message_json_dict = message.to_json()
    new_message = CreateJobExecutionEnd.from_json(message_json_dict)
    result = new_message.execute()

    self.assertTrue(result)
    # Each execution should have a job_exe_end model with the expected final status
    job_exe_ends = JobExecutionEnd.objects.filter(
        job_exe_id__in=job_exe_ids).order_by('job_exe_id')
    self.assertEqual(len(job_exe_ends), 5)
    self.assertEqual(job_exe_ends[0].status, 'CANCELED')
    self.assertEqual(job_exe_ends[1].status, 'CANCELED')
    self.assertEqual(job_exe_ends[2].status, 'FAILED')
    self.assertEqual(job_exe_ends[3].status, 'FAILED')
    self.assertEqual(job_exe_ends[4].status, 'COMPLETED')
def test_execute(self):
    """Tests calling CreateJobExecutionEnd.execute() successfully"""

    # Add 3 job_exe_end models to messages 1, 2, and 3
    message_1 = CreateJobExecutionEnd()
    message_2 = CreateJobExecutionEnd()
    message_3 = CreateJobExecutionEnd()
    job_exe_ids = []
    for _ in range(3):
        job_exe = job_test_utils.create_running_job_exe()
        job_exe_ids.append(job_exe.id)
        # Run the execution's tasks to completion so an end model can be created
        while not job_exe.is_finished():
            task = job_exe.start_next_task()
            task.launch(now())
            update = job_test_utils.create_task_status_update(
                task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
            task.update(update)
            job_exe.task_update(update)
            update = job_test_utils.create_task_status_update(
                task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
            task.update(update)
            job_exe.task_update(update)
        message_1.add_job_exe_end(job_exe.create_job_exe_end_model())
        message_1.add_job_exe_end(job_exe.create_job_exe_end_model())  # Test having duplicate models
        message_2.add_job_exe_end(job_exe.create_job_exe_end_model())
        message_3.add_job_exe_end(job_exe.create_job_exe_end_model())

    # Execute message 1 with 3 job_exe_end models
    message_1.execute()
    self.assertEqual(
        JobExecutionEnd.objects.filter(job_exe_id__in=job_exe_ids).count(), 3)

    # Add more job_exe_end models to messages 2 and 3 until message 2 is full
    while message_2.can_fit_more():
        job_exe = job_test_utils.create_running_job_exe()
        job_exe_ids.append(job_exe.id)
        while not job_exe.is_finished():
            task = job_exe.start_next_task()
            task.launch(now())
            update = job_test_utils.create_task_status_update(
                task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
            task.update(update)
            job_exe.task_update(update)
            update = job_test_utils.create_task_status_update(
                task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
            task.update(update)
            job_exe.task_update(update)
        message_2.add_job_exe_end(job_exe.create_job_exe_end_model())
        message_3.add_job_exe_end(job_exe.create_job_exe_end_model())

    # Execute message 2 with same 3 job_exe_end models from before, plus new ones
    # Old models should not cause an error and only new ones should get created
    message_2.execute()
    self.assertEqual(
        JobExecutionEnd.objects.filter(job_exe_id__in=job_exe_ids).count(),
        len(job_exe_ids))

    # Execute message 3 with all old models
    # Old models should not cause an error and no new ones should get created
    message_3.execute()
    self.assertEqual(
        JobExecutionEnd.objects.filter(job_exe_id__in=job_exe_ids).count(),
        len(job_exe_ids))
def test_running_executions(self):
    """Tests the metrics with running executions that complete"""

    # Two nodes and two job types; executions are spread so that node 1 gets
    # 3 of type 1 + 1 of type 2, and node 2 gets 2 of type 1 + 5 of type 2
    node_model_1 = node_test_utils.create_node()
    node_model_2 = node_test_utils.create_node()
    job_type_1 = job_test_utils.create_seed_job_type()
    job_type_2 = job_test_utils.create_seed_job_type()
    job_exe_1 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                      job_type=job_type_1,
                                                      node=node_model_1)
    job_exe_2 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                      job_type=job_type_1,
                                                      node=node_model_1)
    job_exe_3 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                      job_type=job_type_1,
                                                      node=node_model_1)
    job_exe_4 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                      job_type=job_type_2,
                                                      node=node_model_1)
    job_exe_5 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                      job_type=job_type_1,
                                                      node=node_model_2)
    job_exe_6 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                      job_type=job_type_1,
                                                      node=node_model_2)
    job_exe_7 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                      job_type=job_type_2,
                                                      node=node_model_2)
    job_exe_8 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                      job_type=job_type_2,
                                                      node=node_model_2)
    job_exe_9 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                      job_type=job_type_2,
                                                      node=node_model_2)
    job_exe_10 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                       job_type=job_type_2,
                                                       node=node_model_2)
    job_exe_11 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                       job_type=job_type_2,
                                                       node=node_model_2)

    # NOTE: This unit test is about to get CRAZY. I apologize for the complexity, but this is needed for a
    # thorough testing
    self.metrics.add_running_job_exes([
        job_exe_1, job_exe_2, job_exe_3, job_exe_4, job_exe_5, job_exe_6,
        job_exe_7, job_exe_8, job_exe_9, job_exe_10, job_exe_11
    ])
    node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
    self.metrics.generate_status_json(node_list_dict, now())

    # Check expected totals
    # Checkpoint 1: nothing has finished yet - node 1 runs 4 exes
    # (3 of type 1, 1 of type 2), node 2 runs 7 (2 of type 1, 5 of type 2),
    # and all completed/failed counters are zero
    self.assertEqual(
        node_list_dict[0]['job_executions']['running']['total'], 4)
    for job_type_dict in node_list_dict[0]['job_executions']['running'][
            'by_job_type']:
        if job_type_dict['job_type_id'] == job_type_1.id:
            self.assertEqual(job_type_dict['count'], 3)
        elif job_type_dict['job_type_id'] == job_type_2.id:
            self.assertEqual(job_type_dict['count'], 1)
        else:
            self.fail('Unexpected job type ID')
    self.assertEqual(
        node_list_dict[0]['job_executions']['completed']['total'], 0)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['total'], 0)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['algorithm']
        ['total'], 0)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['data']['total'], 0)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['system']['total'],
        0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['running']['total'], 7)
    for job_type_dict in node_list_dict[1]['job_executions']['running'][
            'by_job_type']:
        if job_type_dict['job_type_id'] == job_type_1.id:
            self.assertEqual(job_type_dict['count'], 2)
        elif job_type_dict['job_type_id'] == job_type_2.id:
            self.assertEqual(job_type_dict['count'], 5)
        else:
            self.fail('Unexpected job type ID')
    self.assertEqual(
        node_list_dict[1]['job_executions']['completed']['total'], 0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['failed']['total'], 0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['failed']['algorithm']
        ['total'], 0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['failed']['system']['total'],
        0)

    # Finish some job executions
    # Node 1: exe 1 completes, exe 2 fails with a data error, exe 4 fails
    # with an algorithm error; node 2 is untouched
    end_time_1 = now()
    job_exe_1._set_final_status('COMPLETED', end_time_1)
    job_exe_2._set_final_status('FAILED', end_time_1, error=self.data_error)
    job_exe_4._set_final_status('FAILED', end_time_1, error=self.alg_error)
    self.metrics.job_exe_finished(job_exe_1)
    self.metrics.job_exe_finished(job_exe_2)
    self.metrics.job_exe_finished(job_exe_4)
    node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
    self.metrics.generate_status_json(
        node_list_dict, end_time_1 + datetime.timedelta(seconds=1))

    # Check expected totals
    # Checkpoint 2: node 1 now has 1 running (type 1), 1 completed (type 1),
    # and 2 failed (1 algorithm for type 2, 1 data for type 1); node 2 is
    # unchanged from checkpoint 1
    self.assertEqual(
        node_list_dict[0]['job_executions']['running']['total'], 1)
    self.assertEqual(
        len(node_list_dict[0]['job_executions']['running']['by_job_type']),
        1)
    self.assertEqual(
        node_list_dict[0]['job_executions']['running']['by_job_type'][0]
        ['count'], 1)
    self.assertEqual(
        node_list_dict[0]['job_executions']['running']['by_job_type'][0]
        ['job_type_id'], job_type_1.id)
    self.assertEqual(
        node_list_dict[0]['job_executions']['completed']['total'], 1)
    self.assertEqual(
        len(node_list_dict[0]['job_executions']['completed']
            ['by_job_type']), 1)
    self.assertEqual(
        node_list_dict[0]['job_executions']['completed']['by_job_type'][0]
        ['count'], 1)
    self.assertEqual(
        node_list_dict[0]['job_executions']['completed']['by_job_type'][0]
        ['job_type_id'], job_type_1.id)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['total'], 2)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['algorithm']
        ['total'], 1)
    self.assertEqual(
        len(node_list_dict[0]['job_executions']['failed']['algorithm']
            ['by_job_type']), 1)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['algorithm']
        ['by_job_type'][0]['count'], 1)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['algorithm']
        ['by_job_type'][0]['job_type_id'], job_type_2.id)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['data']['total'], 1)
    self.assertEqual(
        len(node_list_dict[0]['job_executions']['failed']['data']
            ['by_job_type']), 1)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['data']
        ['by_job_type'][0]['count'], 1)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['data']
        ['by_job_type'][0]['job_type_id'], job_type_1.id)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['system']['total'],
        0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['running']['total'], 7)
    for job_type_dict in node_list_dict[1]['job_executions']['running'][
            'by_job_type']:
        if job_type_dict['job_type_id'] == job_type_1.id:
            self.assertEqual(job_type_dict['count'], 2)
        elif job_type_dict['job_type_id'] == job_type_2.id:
            self.assertEqual(job_type_dict['count'], 5)
        else:
            self.fail('Unexpected job type ID')
    self.assertEqual(
        node_list_dict[1]['job_executions']['completed']['total'], 0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['failed']['total'], 0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['failed']['algorithm']
        ['total'], 0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['failed']['system']['total'],
        0)

    # Finish some job executions (all executions still on node 2)
    # One BLOCK_LENGTH later, all 7 of node 2's executions complete
    end_time_2 = end_time_1 + FinishedJobExeMetricsOverTime.BLOCK_LENGTH
    job_exe_5._set_final_status('COMPLETED', end_time_2)
    job_exe_6._set_final_status('COMPLETED', end_time_2)
    job_exe_7._set_final_status('COMPLETED', end_time_2)
    job_exe_8._set_final_status('COMPLETED', end_time_2)
    job_exe_9._set_final_status('COMPLETED', end_time_2)
    job_exe_10._set_final_status('COMPLETED', end_time_2)
    job_exe_11._set_final_status('COMPLETED', end_time_2)
    self.metrics.job_exe_finished(job_exe_5)
    self.metrics.job_exe_finished(job_exe_6)
    self.metrics.job_exe_finished(job_exe_7)
    self.metrics.job_exe_finished(job_exe_8)
    self.metrics.job_exe_finished(job_exe_9)
    self.metrics.job_exe_finished(job_exe_10)
    self.metrics.job_exe_finished(job_exe_11)
    node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
    self.metrics.generate_status_json(node_list_dict, end_time_2)

    # Check expected totals
    # Checkpoint 3: node 1 is unchanged (finished exes still within the
    # metrics time window); node 2 now has 0 running and 7 completed
    # (2 of type 1, 5 of type 2)
    self.assertEqual(
        node_list_dict[0]['job_executions']['running']['total'], 1)
    self.assertEqual(
        len(node_list_dict[0]['job_executions']['running']['by_job_type']),
        1)
    self.assertEqual(
        node_list_dict[0]['job_executions']['running']['by_job_type'][0]
        ['count'], 1)
    self.assertEqual(
        node_list_dict[0]['job_executions']['running']['by_job_type'][0]
        ['job_type_id'], job_type_1.id)
    self.assertEqual(
        node_list_dict[0]['job_executions']['completed']['total'], 1)
    self.assertEqual(
        len(node_list_dict[0]['job_executions']['completed']
            ['by_job_type']), 1)
    self.assertEqual(
        node_list_dict[0]['job_executions']['completed']['by_job_type'][0]
        ['count'], 1)
    self.assertEqual(
        node_list_dict[0]['job_executions']['completed']['by_job_type'][0]
        ['job_type_id'], job_type_1.id)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['total'], 2)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['algorithm']
        ['total'], 1)
    self.assertEqual(
        len(node_list_dict[0]['job_executions']['failed']['algorithm']
            ['by_job_type']), 1)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['algorithm']
        ['by_job_type'][0]['count'], 1)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['algorithm']
        ['by_job_type'][0]['job_type_id'], job_type_2.id)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['data']['total'], 1)
    self.assertEqual(
        len(node_list_dict[0]['job_executions']['failed']['data']
            ['by_job_type']), 1)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['data']
        ['by_job_type'][0]['count'], 1)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['data']
        ['by_job_type'][0]['job_type_id'], job_type_1.id)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['system']['total'],
        0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['running']['total'], 0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['completed']['total'], 7)
    for job_type_dict in node_list_dict[1]['job_executions']['completed'][
            'by_job_type']:
        if job_type_dict['job_type_id'] == job_type_1.id:
            self.assertEqual(job_type_dict['count'], 2)
        elif job_type_dict['job_type_id'] == job_type_2.id:
            self.assertEqual(job_type_dict['count'], 5)
        else:
            self.fail('Unexpected job type ID')
    self.assertEqual(
        node_list_dict[1]['job_executions']['failed']['total'], 0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['failed']['algorithm']
        ['total'], 0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['failed']['system']['total'],
        0)

    # Let all finished job executions roll off by time, only running remaining
    end_time_3 = end_time_2 + FinishedJobExeMetricsOverTime.TOTAL_TIME_PERIOD
    end_time_3 += FinishedJobExeMetricsOverTime.BLOCK_LENGTH + datetime.timedelta(
        seconds=1)
    node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
    self.metrics.generate_status_json(node_list_dict, end_time_3)

    # Check expected totals
    # Checkpoint 4: all finished metrics have aged out of the window; only
    # node 1's single still-running execution (type 1) remains
    self.assertEqual(
        node_list_dict[0]['job_executions']['running']['total'], 1)
    self.assertEqual(
        len(node_list_dict[0]['job_executions']['running']['by_job_type']),
        1)
    self.assertEqual(
        node_list_dict[0]['job_executions']['running']['by_job_type'][0]
        ['count'], 1)
    self.assertEqual(
        node_list_dict[0]['job_executions']['running']['by_job_type'][0]
        ['job_type_id'], job_type_1.id)
    self.assertEqual(
        node_list_dict[0]['job_executions']['completed']['total'], 0)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['total'], 0)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['algorithm']
        ['total'], 0)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['data']['total'], 0)
    self.assertEqual(
        node_list_dict[0]['job_executions']['failed']['system']['total'],
        0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['running']['total'], 0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['completed']['total'], 0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['failed']['total'], 0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['failed']['algorithm']
        ['total'], 0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
    self.assertEqual(
        node_list_dict[1]['job_executions']['failed']['system']['total'],
        0)