Пример #1
0
    def test_update_all_cluster_resources(self):
        """Tests successfully updating the all cluster resources database in a cluster"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(22048.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        offer_3 = ResourceOffer(
            'offer_3', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(225.0),
                           Mem(1024.0),
                           Disk(22048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2, offer_3])

        resource_mgr.refresh_agent_resources([], now())

        resource_db = ClusterResources.objects.first()

        self.assertIsNone(resource_db)

        resource_mgr.update_all_cluster_resources()

        resource_db = ClusterResources.objects.first()

        self.assertIsNotNone(resource_db)

        self.assertEqual(resource_db.mem, 25120.0)
        self.assertEqual(resource_db.gpus, 0.0)
        self.assertEqual(resource_db.disk, 25120.0)
        self.assertEqual(resource_db.cpus, 252.0)
Пример #2
0
    def test_schedule_system_tasks(self):
        """Tests successfully calling perform_scheduling() when scheduling system tasks"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(1024.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])

        # Clear the queue
        Queue.objects.all().delete()
        # Set us up to schedule a database update task
        system_task_mgr._is_db_update_completed = False
        # Set us up to schedule 2 message handler tasks
        Scheduler.objects.update(num_message_handlers=2)
        scheduler_mgr.sync_with_database()

        scheduling_manager = SchedulingManager()

        num_tasks = scheduling_manager.perform_scheduling(self._client, now())
        self.assertEqual(
            num_tasks,
            3)  # Schedule database update task and 2 message handler tasks
Пример #3
0
    def test_score_job_exe_for_scheduling_insufficient_resources(self):
        """Tests calling score_job_exe_for_scheduling() when there are not enough resources to schedule the job"""

        node = MagicMock()
        node.hostname = 'host_1'
        node.id = 1
        node.is_ready_for_new_job = MagicMock()
        node.is_ready_for_new_job.return_value = True
        node.is_ready_for_next_job_task = MagicMock()
        node.is_ready_for_next_job_task.return_value = True
        offered_resources = NodeResources([Cpus(20.0), Mem(100.0)])
        task_resources = NodeResources([Cpus(100.0), Mem(500.0)])
        watermark_resources = NodeResources([Cpus(200.0), Mem(700.0)])
        resource_set = ResourceSet(offered_resources, task_resources,
                                   watermark_resources)
        scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)
        # Allocate 10 CPUs and 50 MiB memory to existing job execution
        job_exe = job_test_utils.create_running_job_exe(
            agent_id=self.agent_id,
            resources=NodeResources([Cpus(10.0), Mem(50.0)]))
        scheduling_node.accept_job_exe_next_task(job_exe, [])

        # Should have 10 CPUs and 50 MiB memory left, so this job execution is too big
        queue_model = queue_test_utils.create_queue(cpus_required=15.0,
                                                    mem_required=40.0,
                                                    disk_in_required=0.0,
                                                    disk_out_required=0.0,
                                                    disk_total_required=0.0)
        job_exe = QueuedJobExecution(queue_model)

        score = scheduling_node.score_job_exe_for_scheduling(job_exe, [])
        self.assertIsNone(score)
Пример #4
0
    def test_successful_schedule(self):
        """Tests successfully calling perform_scheduling()"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(1024.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])
        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._client, now())

        self.assertEqual(num_tasks,
                         2)  # Schedule smaller queued job executions
        # Ensure job execution models are created and queue models are deleted
        self.assertEqual(
            JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 1)
        self.assertEqual(
            JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 1)
        self.assertEqual(
            JobExecution.objects.filter(
                job_id=self.queue_large.job_id).count(), 0)
        self.assertEqual(
            Queue.objects.filter(
                id__in=[self.queue_1.id, self.queue_2.id]).count(), 0)
Пример #5
0
    def test_canceled_queue_model(self):
        """Tests successfully calling perform_scheduling() when a queue model has been canceled"""
        offer_1 = ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                                NodeResources([Cpus(2.0), Mem(1024.0), Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                                NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])
        self.queue_1.is_canceled = True
        self.queue_1.save()

        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._client, now())

        self.assertEqual(num_tasks, 1)  # Scheduled non-canceled queued job execution
        # queue_1 should be canceled, queue_2 should be running, queue should be empty now
        self.assertEqual(JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 1)
        self.assertEqual(JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 1)
        self.assertEqual(Queue.objects.filter(id__in=[self.queue_1.id, self.queue_2.id]).count(), 0)
        # Job execution manager should have a message for the canceled job execution
        messages = job_exe_mgr.get_messages()
        found_job_exe_end_message = False
        for message in messages:
            if message.type == 'create_job_exe_ends':
                found_job_exe_end_message = True
        self.assertTrue(found_job_exe_end_message)
Пример #6
0
    def test_accept_job_exe_next_task_canceled(self):
        """Tests calling accept_job_exe_next_task() when job exe gets canceled (no next task)"""

        node = MagicMock()
        node.hostname = 'host_1'
        node.id = 1
        node.is_ready_for_new_job = MagicMock()
        node.is_ready_for_new_job.return_value = True
        node.is_ready_for_next_job_task = MagicMock()
        node.is_ready_for_next_job_task.return_value = True
        offered_resources = NodeResources([Cpus(10.0), Mem(50.0)])
        task_resources = NodeResources()
        watermark_resources = NodeResources([Cpus(100.0), Mem(500.0)])
        resource_set = ResourceSet(offered_resources, task_resources,
                                   watermark_resources)
        scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)

        job_exe = job_test_utils.create_running_job_exe(
            agent_id=self.agent_id,
            resources=NodeResources([Cpus(1.0), Mem(10.0)]))
        waiting_tasks = []

        job_exe.execution_canceled(now())
        had_waiting_task = scheduling_node.accept_job_exe_next_task(
            job_exe, waiting_tasks)
        self.assertFalse(had_waiting_task)
        self.assertEqual(len(scheduling_node._allocated_running_job_exes), 0)
        self.assertTrue(
            scheduling_node.allocated_resources.is_equal(NodeResources()))
        self.assertTrue(
            scheduling_node._remaining_resources.is_equal(
                NodeResources([Cpus(10.0), Mem(50.0)])))
        self.assertListEqual(waiting_tasks, [])
Пример #7
0
    def test_missing_workspace(self):
        """Tests calling perform_scheduling() when a queued job's workspace has not been synced to the scheduler"""

        offer_1 = ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                                NodeResources([Cpus(2.0), Mem(1024.0), Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                                NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])

        # Add workspaces to the queued jobs
        queue_1 = Queue.objects.get(id=self.queue_1.id)
        config = queue_1.get_execution_configuration()
        config.set_output_workspaces({'my_output': 'my_workspace'})
        queue_1.configuration = config.get_dict()
        queue_1.save()
        queue_2 = Queue.objects.get(id=self.queue_2.id)
        config = queue_2.get_execution_configuration()
        config.set_output_workspaces({'my_output': 'my_workspace'})
        queue_2.configuration = config.get_dict()
        queue_2.save()

        scheduling_manager = SchedulingManager()

        # Clear out workspace manager for scheduling
        with patch('scheduler.scheduling.manager.workspace_mgr.get_workspaces') as mock_get_workspaces:
            mock_get_workspaces.return_value = {}
            num_tasks = scheduling_manager.perform_scheduling(self._client, now())

        # Nothing should be scheduled
        self.assertEqual(num_tasks, 0)
        self.assertEqual(JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 0)
        self.assertEqual(JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 0)
        self.assertEqual(Queue.objects.filter(id__in=[self.queue_1.id, self.queue_2.id]).count(), 2)
Пример #8
0
    def test_accept_node_tasks_insufficient_resources(self):
        """Tests calling accept_node_tasks() when there are not enough resources"""

        node = MagicMock()
        node.hostname = 'host_1'
        node.id = 1
        health_task = HealthTask('1234', 'agent_1')
        pull_task = PullTask('1234', 'agent_1')
        node.is_ready_for_new_job = MagicMock()
        node.is_ready_for_new_job.return_value = True
        node.is_ready_for_next_job_task = MagicMock()
        node.is_ready_for_next_job_task.return_value = True
        node.get_next_tasks = MagicMock()
        node.get_next_tasks.return_value = [health_task, pull_task]
        offered_resources = NodeResources([Cpus(0.0), Mem(50.0)])
        task_resources = NodeResources()
        watermark_resources = NodeResources([Cpus(100.0), Mem(500.0)])
        resource_set = ResourceSet(offered_resources, task_resources,
                                   watermark_resources)
        scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)
        waiting_tasks = []

        had_waiting_task = scheduling_node.accept_node_tasks(
            now(), waiting_tasks)
        self.assertTrue(had_waiting_task)
        self.assertEqual(len(scheduling_node.allocated_tasks), 0)
        self.assertTrue(
            scheduling_node.allocated_resources.is_equal(NodeResources()))
        self.assertTrue(
            scheduling_node._remaining_resources.is_equal(offered_resources))
        self.assertListEqual(waiting_tasks, [health_task, pull_task])
Пример #9
0
    def test_accept_new_job_exe_gpu_partial_node_other_task(self):
        """Tests successfully calling accept_new_job_exe() when job requires less GPUs than available"""

        node = MagicMock()
        node.hostname = 'host_1'
        node.id = 1
        node.is_ready_for_new_job = MagicMock()
        node.is_ready_for_new_job.return_value = True
        node.is_ready_for_next_job_task = MagicMock()
        node.is_ready_for_next_job_task.return_value = True
        offered_resources = NodeResources([Cpus(10.0), Mem(50.0), Gpus(1.0)])
        task_resources = NodeResources([Gpus(1.0)])
        watermark_resources = NodeResources(
            [Cpus(100.0), Mem(500.0), Gpus(1.0)])
        resource_set = ResourceSet(offered_resources, task_resources,
                                   watermark_resources)
        scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)

        queue_model = queue_test_utils.create_queue(cpus_required=1.0,
                                                    mem_required=10.0,
                                                    disk_in_required=0.0,
                                                    disk_out_required=0.0,
                                                    disk_total_required=0.0,
                                                    gpus_required=2)
        job_exe = QueuedJobExecution(queue_model)

        accepted = scheduling_node.accept_new_job_exe(job_exe)
        self.assertFalse(accepted)
Пример #10
0
    def test_accept_new_job_exe_no_jobs(self):
        """Tests calling accept_new_job_exe() when new job exes are not allowed"""

        node = MagicMock()
        node.hostname = 'host_1'
        node.id = 1
        node.is_ready_for_new_job = MagicMock()
        node.is_ready_for_new_job.return_value = False
        node.is_ready_for_next_job_task = MagicMock()
        node.is_ready_for_next_job_task.return_value = True
        offered_resources = NodeResources([Cpus(10.0), Mem(50.0)])
        task_resources = NodeResources()
        watermark_resources = NodeResources([Cpus(100.0), Mem(500.0)])
        resource_set = ResourceSet(offered_resources, task_resources,
                                   watermark_resources)
        scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)

        queue_model = queue_test_utils.create_queue(cpus_required=1.0,
                                                    mem_required=10.0,
                                                    disk_in_required=0.0,
                                                    disk_out_required=0.0,
                                                    disk_total_required=0.0)
        job_exe = QueuedJobExecution(queue_model)

        accepted = scheduling_node.accept_new_job_exe(job_exe)
        self.assertFalse(accepted)
        self.assertEqual(len(scheduling_node._allocated_queued_job_exes), 0)
        self.assertTrue(
            scheduling_node.allocated_resources.is_equal(NodeResources()))
        self.assertTrue(
            scheduling_node._remaining_resources.is_equal(
                NodeResources([Cpus(10.0), Mem(50.0)])))
        self.assertIsNone(job_exe._scheduled_node_id)
Пример #11
0
    def test_max_resources(self):
        """Tests successfully calculating the max resources in a cluster"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(22048.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        offer_3 = ResourceOffer(
            'offer_3', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(225.0),
                           Mem(1024.0),
                           Disk(22048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2, offer_3])

        resource_mgr.refresh_agent_resources([], now())

        max = resource_mgr.get_max_available_resources()
        self.assertTrue(
            max.is_equal(
                NodeResources([Cpus(250.0),
                               Mem(22048.0),
                               Disk(24096.0)])))
Пример #12
0
    def test_all_available_resources(self):
        """Tests successfully calculating the available resources in a cluster"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(22048.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        offer_3 = ResourceOffer(
            'offer_3', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(225.0),
                           Mem(1024.0),
                           Disk(22048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2, offer_3])

        resource_mgr.refresh_agent_resources([], now())

        all_available_resources = resource_mgr.get_all_available_resources()
        self.assertDictEqual(all_available_resources, {
            'mem': 25120.0,
            'gpus': 0.0,
            'disk': 25120.0,
            'cpus': 252.0
        })
Пример #13
0
    def test_job_type_limit(self):
        """Tests calling perform_scheduling() with a job type limit"""
        Queue.objects.all().delete()
        job_type_with_limit = job_test_utils.create_seed_job_type()
        job_type_with_limit.max_scheduled = 4
        job_type_with_limit.save()
        running_job_exe_1 = job_test_utils.create_running_job_exe(agent_id=self.agent_1.agent_id,
                                                                  job_type=job_type_with_limit, node=self.node_1)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        job_type_mgr.sync_with_database()
        # One job of this type is already running
        job_exe_mgr.schedule_job_exes([running_job_exe_1], [])

        offer_1 = ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                                NodeResources([Cpus(0.0), Mem(1024.0), Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                                NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])

        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._client, now())
        self.assertEqual(num_tasks, 3)  # One is already running, should only be able to schedule 3 more
Пример #14
0
    def test_start_job_exe_tasks(self):
        """Tests calling start_job_exe_tasks() successfully"""

        node = MagicMock()
        node.hostname = 'host_1'
        node.id = 1
        node.is_ready_for_new_job = MagicMock()
        node.is_ready_for_new_job.return_value = True
        node.is_ready_for_next_job_task = MagicMock()
        node.is_ready_for_next_job_task.return_value = True
        offered_resources = NodeResources([Cpus(20.0), Mem(100.0)])
        watermark_resources = NodeResources([Cpus(200.0), Mem(700.0)])
        resource_set = ResourceSet(offered_resources, NodeResources(),
                                   watermark_resources)
        scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)
        job_exe_1 = job_test_utils.create_running_job_exe(
            agent_id=self.agent_id,
            resources=NodeResources([Cpus(10.0), Mem(50.0)]))
        job_exe_2 = job_test_utils.create_running_job_exe(
            agent_id=self.agent_id,
            resources=NodeResources([Cpus(5.0), Mem(25.0)]))
        scheduling_node.accept_job_exe_next_task(job_exe_1, [])
        scheduling_node.accept_job_exe_next_task(job_exe_2, [])
        self.assertEqual(len(scheduling_node._allocated_running_job_exes), 2)

        job_exe_1.execution_canceled(now(
        ))  # Execution canceled, so it will not have a next task to start

        scheduling_node.start_job_exe_tasks()
        self.assertEqual(len(scheduling_node._allocated_running_job_exes), 0)
        self.assertEqual(len(scheduling_node.allocated_tasks),
                         1)  # Only job_exe_2 had a next task
Пример #15
0
    def test_paused_job_type(self):
        """Tests calling perform_scheduling() when a job type is paused"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(1024.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])
        self.queue_1.job_type.is_paused = True
        self.queue_1.job_type.save()
        job_type_mgr.sync_with_database()

        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._client, now())

        self.assertEqual(num_tasks,
                         1)  # Schedule queued job execution that is not paused
        self.assertEqual(
            JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 0)
        self.assertEqual(
            JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 1)
        self.assertEqual(
            Queue.objects.filter(
                id__in=[self.queue_1.id, self.queue_2.id]).count(), 1)
Пример #16
0
    def test_missing_job_types(self):
        """Tests calling perform_scheduling() when a queued job type has not been synced to the scheduler"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(1024.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])

        scheduling_manager = SchedulingManager()

        # Clear out job type manager for scheduling
        with patch('scheduler.scheduling.manager.job_type_mgr.get_job_types'
                   ) as mock_get_job_types:
            mock_get_job_types.return_value = {}
            num_tasks = scheduling_manager.perform_scheduling(
                self._client, now())

        # Nothing should be scheduled
        self.assertEqual(num_tasks, 0)
        self.assertEqual(
            JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 0)
        self.assertEqual(
            JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 0)
        self.assertEqual(
            Queue.objects.filter(
                id__in=[self.queue_1.id, self.queue_2.id]).count(), 2)
Пример #17
0
    def test_paused_scheduler(self):
        """Tests calling perform_scheduling() with a paused scheduler"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(1024.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])
        Scheduler.objects.update(is_paused=True)
        scheduler_mgr.sync_with_database()
        node_mgr.sync_with_database(
            scheduler_mgr.config)  # Updates nodes with paused scheduler
        system_task_mgr._is_db_update_completed = False  # Make sure system tasks don't get scheduled

        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._client, now())
        self.assertEqual(num_tasks, 0)
        self.assertEqual(
            JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 0)
        self.assertEqual(
            JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 0)
        self.assertEqual(
            Queue.objects.filter(
                id__in=[self.queue_1.id, self.queue_2.id]).count(), 2)
Пример #18
0
    def test_score_job_exe_for_reservation_insufficient_resources(self):
        """Tests calling score_job_exe_for_reservation() when there are not enough resources to reserve for the job"""

        node = MagicMock()
        node.hostname = 'host_1'
        node.id = 1
        node.is_ready_for_new_job = MagicMock()
        node.is_ready_for_new_job.return_value = True
        node.is_ready_for_next_job_task = MagicMock()
        node.is_ready_for_next_job_task.return_value = True
        offered_resources = NodeResources([Cpus(20.0), Mem(100.0)])
        watermark_resources = NodeResources([Cpus(200.0), Mem(700.0)])
        resource_set = ResourceSet(offered_resources, NodeResources(),
                                   watermark_resources)
        task = HealthTask(
            '1234', 'agent_1')  # Resources are 0.1 CPUs and 32 MiB memory
        job_exe_1 = job_test_utils.create_running_job_exe(
            agent_id=self.agent_id,
            resources=NodeResources([Cpus(10.0), Mem(50.0)]),
            priority=1000)
        job_exe_2 = job_test_utils.create_running_job_exe(
            agent_id=self.agent_id,
            resources=NodeResources([Cpus(56.0), Mem(15.0)]),
            priority=100)
        scheduling_node = SchedulingNode('agent_1', node, [task],
                                         [job_exe_1, job_exe_2], resource_set)
        queue_model_1 = queue_test_utils.create_queue(priority=100,
                                                      cpus_required=8.0,
                                                      mem_required=40.0,
                                                      disk_in_required=0.0,
                                                      disk_out_required=0.0,
                                                      disk_total_required=0.0)
        job_exe_1 = QueuedJobExecution(queue_model_1)
        queue_model_2 = queue_test_utils.create_queue(priority=1000,
                                                      cpus_required=8.0,
                                                      mem_required=40.0,
                                                      disk_in_required=0.0,
                                                      disk_out_required=0.0,
                                                      disk_total_required=0.0)
        job_exe_2 = QueuedJobExecution(queue_model_2)
        scheduling_node.accept_new_job_exe(job_exe_1)
        scheduling_node.accept_new_job_exe(job_exe_2)

        # We are going to try to reserve the node for a job execution with priority 120
        # Calculate available resources for reservation:
        # Watermark (200, 700) - System Tasks (0.1, 32) - Higher Priority Existing Job Exes (56, 15) -  Higher Priority
        # New Job Exes (8, 40) = 135.9 CPUs, 613 memory
        # This new job should NOT fit for reservation
        queue_model = queue_test_utils.create_queue(priority=120,
                                                    cpus_required=140.0,
                                                    mem_required=600.0,
                                                    disk_in_required=0.0,
                                                    disk_out_required=0.0,
                                                    disk_total_required=0.0)
        job_exe = QueuedJobExecution(queue_model)
        job_type_resource_1 = NodeResources([Cpus(2.0), Mem(10.0)])

        score = scheduling_node.score_job_exe_for_reservation(
            job_exe, [job_type_resource_1])
        self.assertIsNone(score)
Пример #19
0
    def test_get_queued_resources(self):
        """Tests successfully getting queued resource information"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(22048.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        offer_3 = ResourceOffer(
            'offer_3', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(225.0),
                           Mem(1024.0),
                           Disk(22048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2, offer_3])

        resource_mgr.refresh_agent_resources([], now())

        resource_db = ClusterResources.objects.first()

        self.assertIsNone(resource_db)

        resource_mgr.update_all_cluster_resources()

        resource_db = ClusterResources.objects.first()

        self.assertIsNotNone(resource_db)

        self.assertEqual(resource_db.mem, 25120.0)
        self.assertEqual(resource_db.gpus, 0.0)
        self.assertEqual(resource_db.disk, 25120.0)
        self.assertEqual(resource_db.cpus, 252.0)

        queued_resources = resource_mgr.get_queued_resources()

        self.assertDictEqual(
            queued_resources, {
                "cluster_resources": {
                    'cpus': 252,
                    'disk': 25120,
                    'gpus': 0,
                    'mem': 25120
                },
                "queue_lengths": {
                    'PENDING': 0,
                    'QUEUED': 3,
                    'RUNNING': 0
                },
                "total_resources": {
                    'PENDING': {},
                    'QUEUED': {
                        'cpus': 3.0,
                        'mem': 384.0
                    },
                    'RUNNING': {}
                }
            })
Пример #20
0
    def test_node_with_new_agent_id(self):
        """Tests successfully calling perform_scheduling() when a node get a new agent ID"""
        # Host 2 gets new agent ID of agent_3
        node_mgr.lost_node(self.agent_2)
        node_mgr.register_agents([self.agent_3])
        node_mgr.sync_with_database(scheduler_mgr.config)

        offer = ResourceOffer(
            'offer', self.agent_3.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer])

        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._client, now())

        self.assertEqual(num_tasks, 2)  # Schedule both queued job executions
        # Check that created tasks have the correct agent ID
        calls = self._client.method_calls
        # One for checking for driver and second for task launch
        self.assertEqual(2, len(calls))
        # Get tasks off 2nd calls (index
        mesos_tasks = calls[1][1][1]
        for mesos_task in mesos_tasks:
            self.assertEqual(self.agent_3.agent_id,
                             mesos_task['agent_id']['value'])
Пример #21
0
    def __init__(self, resources=None):
        """Constructor

        :param resources: The list of node resources
        :type resources: list
        """

        self._resources = {}  # {Name: Resource}
        if resources:
            for resource in resources:
                if resource.resource_type != 'SCALAR':
                    raise ScaleLogicBug(
                        'Resource type "%s" is not currently supported',
                        resource.resource_type)
                self._resources[resource.name] = resource

        # Make sure standard resources are defined
        if 'cpus' not in self._resources:
            self._resources['cpus'] = Cpus(0.0)
        if 'mem' not in self._resources:
            self._resources['mem'] = Mem(0.0)
        if 'disk' not in self._resources:
            self._resources['disk'] = Disk(0.0)
        if 'gpus' not in self._resources:
            self._resources['gpus'] = Gpus(0.0)
Пример #22
0
def create_queue(job_type=None, priority=1, timeout=3600, cpus_required=1.0, mem_required=512.0, disk_in_required=200.0,
                 disk_out_required=100.0, disk_total_required=300.0, gpus_required=0, queued=timezone.now()):
    """Creates a queue model for unit testing

    :param job_type: The job type
    :type job_type: :class:`job.models.JobType`
    :param priority: The priority
    :type priority: int
    :param timeout: The timeout
    :type timeout: int
    :param cpus_required: The number of CPUs required
    :type cpus_required: float
    :param mem_required: The memory required in MiB
    :type mem_required: float
    :param disk_in_required: The input disk space required in MiB
    :type disk_in_required: float
    :param disk_out_required: The output disk space required in MiB
    :type disk_out_required: float
    :param disk_total_required: The total disk space required in MiB
    :type disk_total_required: float
    :param gpus_required: The number of GPUs required
    :type gpus_required: float
    :param queued: The time the execution was queued
    :type queued: :class:`datetime.datetime`
    """

    job = job_test_utils.create_job(job_type=job_type, status='QUEUED')
    resources = NodeResources([Cpus(cpus_required), Mem(mem_required), Disk(disk_total_required), Gpus(gpus_required)])

    return Queue.objects.create(job_type=job.job_type, job=job, exe_num=job.num_exes, priority=priority,
                                timeout=timeout, input_file_size=disk_in_required,
                                interface=job.get_job_interface().get_dict(),
                                configuration=ExecutionConfiguration().get_dict(),
                                resources=resources.get_json().get_dict(), queued=queued)
Пример #23
0
 def setUp(self):
     django.setup()
     resource_mgr.clear()
     self.agent_1 = Agent('agent_1', 'host_1')
     self.agent_2 = Agent('agent_2', 'host_2')
     self.framework_id = '1234'
     offer_1 = ResourceOffer(
         'offer_1', self.agent_1.agent_id, self.framework_id,
         NodeResources([Cpus(2.0), Mem(1024.0),
                        Disk(1024.0)]), now(), None)
     offer_2 = ResourceOffer(
         'offer_2', self.agent_2.agent_id, self.framework_id,
         NodeResources([Cpus(25.0), Mem(2048.0),
                        Disk(2048.0)]), now(), None)
     resource_mgr.add_new_offers([offer_1, offer_2])
     resource_mgr.refresh_agent_resources([], now())
Пример #24
0
    def populate_queue_resources(apps, schema_editor):
        from node.resources.node_resources import NodeResources
        from node.resources.resource import Cpus, Disk, Mem

        # Go through all of the queue models and populate their new resources columns
        Queue = apps.get_model('queue', 'Queue')
        total_count = Queue.objects.all().count()
        print 'Populating new resources field for %s queue models' % str(
            total_count)
        done_count = 0
        batch_size = 1000
        while done_count < total_count:
            percent = (float(done_count) / float(total_count)) * 100.00
            print 'Completed %s of %s queue models (%f%%)' % (
                done_count, total_count, percent)
            batch_end = done_count + batch_size
            for queue in Queue.objects.order_by(
                    'job_exe_id')[done_count:batch_end]:
                cpus = queue.cpus_required
                mem = queue.mem_required
                disk = queue.disk_total_required
                resources = NodeResources([Cpus(cpus), Mem(mem), Disk(disk)])
                queue.resources = resources.get_json().get_dict()
                queue.save()
            done_count += batch_size
        print 'All %s queue models completed' % str(total_count)
Пример #25
0
    def test_reset_new_job_exes(self):
        """Tests calling reset_new_job_exes() successfully"""

        node = MagicMock()
        node.hostname = 'host_1'
        node.id = 1
        node.is_ready_for_new_job = MagicMock()
        node.is_ready_for_new_job.return_value = True
        node.is_ready_for_next_job_task = MagicMock()
        node.is_ready_for_next_job_task.return_value = True
        offered_resources = NodeResources([Cpus(100.0), Mem(500.0)])
        watermark_resources = NodeResources([Cpus(100.0), Mem(500.0)])
        resource_set = ResourceSet(offered_resources, NodeResources(),
                                   watermark_resources)
        scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)
        queue_model_1 = queue_test_utils.create_queue(cpus_required=2.0,
                                                      mem_required=60.0,
                                                      disk_in_required=0.0,
                                                      disk_out_required=0.0,
                                                      disk_total_required=0.0)
        job_exe_1 = QueuedJobExecution(queue_model_1)
        queue_model_2 = queue_test_utils.create_queue(cpus_required=4.5,
                                                      mem_required=400.0,
                                                      disk_in_required=0.0,
                                                      disk_out_required=0.0,
                                                      disk_total_required=0.0)
        job_exe_2 = QueuedJobExecution(queue_model_2)
        allocated_resources = NodeResources()
        allocated_resources.add(job_exe_1.required_resources)
        allocated_resources.add(job_exe_2.required_resources)

        # Set up node with queued job exes
        scheduling_node.accept_new_job_exe(job_exe_1)
        scheduling_node.accept_new_job_exe(job_exe_2)
        self.assertEqual(len(scheduling_node._allocated_queued_job_exes), 2)
        self.assertTrue(
            scheduling_node.allocated_resources.is_equal(allocated_resources))

        # Reset queued job exes and check that everything is back to square one
        scheduling_node.reset_new_job_exes()
        self.assertEqual(len(scheduling_node._allocated_queued_job_exes), 0)
        self.assertTrue(
            scheduling_node.allocated_resources.is_equal(NodeResources()))
        self.assertTrue(
            scheduling_node._remaining_resources.is_equal(offered_resources))
Пример #26
0
    def test_no_default_workspace(self, mock_taskinfo):
        """Tests calling perform_scheduling() when a queued job's workspace has not been synced to the scheduler"""
        mock_taskinfo.return_value = MagicMock()

        offer_1 = ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                                NodeResources([Cpus(2.0), Mem(1024.0), Disk(1024.0)]), now())
        offer_2 = ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                                NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now())
        resource_mgr.add_new_offers([offer_1, offer_2])
        
        # Add output data to the first queued job:
        # output data + no workspace defined = fail
        queue_1 = Queue.objects.get(id=self.queue_1.id)
        queue_1.get_job_interface().definition['output_data'] = [{'name': 'my_output', 'type': 'file'}]
        config = queue_1.get_execution_configuration()
        queue_1.configuration = config.get_dict()
        queue_1.save()
        # No output data + no workspace = pass
        queue_2 = Queue.objects.get(id=self.queue_2.id)
        config = queue_2.get_execution_configuration()
        queue_2.configuration = config.get_dict()
        queue_2.save()
        
        scheduling_manager = SchedulingManager()
        
        # Set a workspace on the manager
        with patch('scheduler.scheduling.manager.workspace_mgr.get_workspaces') as mock_get_workspaces:
            mock_get_workspaces.return_value = {
                'name': 'my_workspace',
                'title': 'My Workspace',
                'description': 'My workspaces',
                'is_active': True,
                'json_config': {'version': '1.0','broker': {'type': 'host','host_path': '/host/path'}},
            }
            num_tasks = scheduling_manager.perform_scheduling(self._driver, now())
        
        # Only queue_2 should be scheduled
        self.assertEqual(num_tasks, 1)
        self.assertEqual(JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 0)
        self.assertEqual(JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 1)
        self.assertEqual(Queue.objects.filter(id__in=[self.queue_1.id, self.queue_2.id]).count(), 1)
Пример #27
0
    def setUp(self):
        django.setup()

        reset_error_cache()

        self.framework_id = '1234'
        Scheduler.objects.initialize_scheduler()
        Scheduler.objects.update(
            num_message_handlers=0
        )  # Prevent message handler tasks from scheduling
        self._client = MagicMock()

        scheduler_mgr.sync_with_database()
        scheduler_mgr.update_from_mesos(framework_id=self.framework_id)
        resource_mgr.clear()
        job_exe_mgr.clear()

        self.agent_1 = Agent('agent_1', 'host_1')
        self.agent_2 = Agent('agent_2', 'host_2')
        self.agent_3 = Agent('agent_3', 'host_2')
        node_mgr.clear()
        node_mgr.register_agents([self.agent_1, self.agent_2])
        node_mgr.sync_with_database(scheduler_mgr.config)
        # Ignore initial cleanup, health check, and image pull tasks
        for node in node_mgr.get_nodes():
            node._last_health_task = now()
            node._initial_cleanup_completed()
            node._is_image_pulled = True
            node._update_state()
            if node.agent_id == 'agent_1':
                self.node_1_id = node.id
        cleanup_mgr.update_nodes(node_mgr.get_nodes())
        self.node_1 = Node.objects.get(id=self.node_1_id)
        # Ignore system tasks
        system_task_mgr._is_db_update_completed = True

        self.queue_1 = queue_test_utils.create_queue(cpus_required=4.0,
                                                     mem_required=1024.0,
                                                     disk_in_required=100.0,
                                                     disk_out_required=200.0,
                                                     disk_total_required=300.0)
        self.queue_2 = queue_test_utils.create_queue(cpus_required=8.0,
                                                     mem_required=512.0,
                                                     disk_in_required=400.0,
                                                     disk_out_required=45.0,
                                                     disk_total_required=445.0)
        self.queue_large = queue_test_utils.create_queue(
            resources=NodeResources([Cpus(
                125.0), Mem(12048.0), Disk(12048.0)]))

        job_type_mgr.sync_with_database()
Пример #28
0
    def test_accept_new_job_exe_gpu_partial_node(self):
        """Tests successfully calling accept_new_job_exe() when job requires less GPUs than available"""

        node = MagicMock()
        node.hostname = 'host_1'
        node.id = 1
        node.is_ready_for_new_job = MagicMock()
        node.is_ready_for_new_job.return_value = True
        node.is_ready_for_next_job_task = MagicMock()
        node.is_ready_for_next_job_task.return_value = True
        offered_resources = NodeResources([Cpus(10.0), Mem(50.0), Gpus(4.0)])
        task_resources = NodeResources()
        watermark_resources = NodeResources(
            [Cpus(100.0), Mem(500.0), Gpus(4.0)])
        resource_set = ResourceSet(offered_resources, task_resources,
                                   watermark_resources)
        scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)

        queue_model = queue_test_utils.create_queue(cpus_required=1.0,
                                                    mem_required=10.0,
                                                    disk_in_required=0.0,
                                                    disk_out_required=0.0,
                                                    disk_total_required=0.0,
                                                    gpus_required=1)
        job_exe = QueuedJobExecution(queue_model)

        accepted = scheduling_node.accept_new_job_exe(job_exe)
        self.assertTrue(accepted)
        self.assertEqual(len(scheduling_node._allocated_queued_job_exes), 1)
        # Verify that our greedy GPU allocation logic is working
        self.assertTrue(
            scheduling_node.allocated_resources.is_equal(
                NodeResources([Cpus(1.0), Mem(10.0),
                               Gpus(4.0)])))
        self.assertTrue(
            scheduling_node._remaining_resources.is_equal(
                NodeResources([Cpus(9.0), Mem(40.0)])))
        self.assertEqual(job_exe._scheduled_node_id, node.id)
Пример #29
0
    def test_accept_node_tasks(self):
        """Tests successfully calling accept_node_tasks()"""

        node = MagicMock()
        node.hostname = 'host_1'
        node.id = 1
        health_task = HealthTask('1234', 'agent_1')
        pull_task = PullTask('1234', 'agent_1')
        node.is_ready_for_new_job = MagicMock()
        node.is_ready_for_new_job.return_value = True
        node.is_ready_for_next_job_task = MagicMock()
        node.is_ready_for_next_job_task.return_value = True
        node.get_next_tasks = MagicMock()
        node.get_next_tasks.return_value = [health_task, pull_task]
        node_task_resources = NodeResources()
        node_task_resources.add(health_task.get_resources())
        node_task_resources.add(pull_task.get_resources())
        offered_resources = NodeResources([Cpus(100.0), Mem(5000.0)])
        expected_remaining_resources = NodeResources()
        expected_remaining_resources.add(offered_resources)
        expected_remaining_resources.subtract(node_task_resources)
        task_resources = NodeResources()
        watermark_resources = NodeResources([Cpus(100.0), Mem(5000.0)])
        resource_set = ResourceSet(offered_resources, task_resources,
                                   watermark_resources)
        scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)
        waiting_tasks = []

        had_waiting_task = scheduling_node.accept_node_tasks(
            now(), waiting_tasks)
        self.assertFalse(had_waiting_task)
        self.assertEqual(len(scheduling_node.allocated_tasks), 2)
        self.assertTrue(
            scheduling_node.allocated_resources.is_equal(node_task_resources))
        self.assertTrue(
            scheduling_node._remaining_resources.is_equal(
                expected_remaining_resources))
        self.assertListEqual(waiting_tasks, [])
Пример #30
0
def job_type_get_resources(self):
    """Returns the resources required for jobs of this type

    :returns: The required resources
    :rtype: :class:`node.resources.node_resources.NodeResources`
    """

    resources = Resources(self.custom_resources).get_node_resources()
    resources.remove_resource('cpus')
    resources.remove_resource('mem')
    resources.remove_resource('disk')
    cpus = max(self.cpus_required, 0.25)
    resources.add(NodeResources([Cpus(cpus)]))
    return resources