Пример #1
0
    def test_adding_offers(self):
        """Checks the available resource totals after adding valid, repeated, and mismatched offers"""

        offers_for_node = NodeOffers(self.node)

        first_offer = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        offers_for_node.add_offer(first_offer)
        # Adding the very same offer again is expected to be ignored
        offers_for_node.add_offer(first_offer)

        second_offer = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=5.0, mem=2048.0, disk=2048.0))
        offers_for_node.add_offer(second_offer)

        third_offer = ResourceOffer('offer_3', self.node_agent, NodeResources(cpus=3.0, mem=512.0, disk=1024.0))
        offers_for_node.add_offer(third_offer)

        # An offer that names a different agent must be refused with an exception
        mismatched_offer = ResourceOffer('offer_4', 'bad_agent', NodeResources(cpus=1.0, mem=512.0, disk=1024.0))
        with self.assertRaises(Exception):
            offers_for_node.add_offer(mismatched_offer)

        # Totals are the sums of offers 1-3 only (the repeat and the bad offer add nothing)
        self.assertEqual(offers_for_node._available_cpus, 10.0)
        self.assertEqual(offers_for_node._available_mem, 3584.0)
        self.assertEqual(offers_for_node._available_disk, 4096.0)
Пример #2
0
    def test_paused_node(self):
        """Checks which job executions are accepted by the offers of a paused node"""

        paused_offers = NodeOffers(self.paused_node)
        first_offer = ResourceOffer(
            'offer_1', self.node_agent_paused,
            NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        paused_offers.add_offer(first_offer)
        second_offer = ResourceOffer(
            'offer_2', self.node_agent_paused,
            NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        paused_offers.add_offer(second_offer)

        # Nothing has been considered yet, so nothing is accepted
        self.assertFalse(paused_offers.has_accepted_job_exes())
        self.assertListEqual(paused_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(paused_offers.get_accepted_new_job_exes(), [])

        # Next tasks of job executions that are already running are still accepted
        running_exe_1 = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(paused_offers.consider_next_task(running_exe_1), NodeOffers.ACCEPTED)

        running_exe_2 = RunningJobExecution(self.running_job_exe_2)
        self.assertEqual(paused_offers.consider_next_task(running_exe_2), NodeOffers.ACCEPTED)

        # A new (queued) job execution is turned away while the node is paused
        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(paused_offers.consider_new_job_exe(queued_exe), NodeOffers.NODE_NOT_READY)

        self.assertTrue(paused_offers.has_accepted_job_exes())
        accepted_running = paused_offers.get_accepted_running_job_exes()
        self.assertEqual(len(accepted_running), 2)
        self.assertSetEqual(set(paused_offers.get_accepted_running_job_exes()), {running_exe_1, running_exe_2})
        self.assertListEqual(paused_offers.get_accepted_new_job_exes(), [])

        # Available resources remaining after the two accepted tasks
        self.assertEqual(paused_offers._available_cpus, 68.0)
        self.assertEqual(paused_offers._available_mem, 1536.0)
        self.assertEqual(paused_offers._available_disk, 2222.0)
Пример #3
0
    def test_lost_node(self):
        """Checks that accepted job executions are discarded once their node is reported lost"""

        small_offer = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        large_offer = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))

        mgr = OfferManager()
        mgr.add_new_offers([small_offer, large_offer])
        mgr.update_nodes([self.node, self.paused_node])
        mgr.ready_new_offers()

        # Accept one queued and one running job execution before the node goes away
        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(mgr.consider_new_job_exe(queued_exe), OfferManager.ACCEPTED)

        running_exe = RunningJobExecution(self.running_job_exe_2)
        self.assertEqual(mgr.consider_next_task(running_exe), OfferManager.ACCEPTED)

        # Once the node is lost there should be no offers with accepted job executions left
        mgr.lost_node(self.node_agent)
        popped_offers = mgr.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(popped_offers), 0)
Пример #4
0
    def test_lost_node(self):
        """Checks that losing the node clears accepted job executions and available resources"""

        offers_for_node = NodeOffers(self.node)
        first_offer = ResourceOffer(
            'offer_1', self.node_agent,
            NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        offers_for_node.add_offer(first_offer)
        second_offer = ResourceOffer(
            'offer_2', self.node_agent,
            NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        offers_for_node.add_offer(second_offer)

        # Starting point: offers present, nothing accepted
        self.assertFalse(offers_for_node.has_accepted_job_exes())
        self.assertListEqual(offers_for_node.get_accepted_running_job_exes(), [])
        self.assertListEqual(offers_for_node.get_accepted_new_job_exes(), [])

        # One running and one queued job execution are accepted
        running_exe = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(offers_for_node.consider_next_task(running_exe), NodeOffers.ACCEPTED)

        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(offers_for_node.consider_new_job_exe(queued_exe), NodeOffers.ACCEPTED)

        # Some of every resource must still be available at this point
        self.assertTrue(offers_for_node.has_accepted_job_exes())
        self.assertGreater(offers_for_node._available_cpus, 0.0)
        self.assertGreater(offers_for_node._available_mem, 0.0)
        self.assertGreater(offers_for_node._available_disk, 0.0)

        # Losing the node wipes both the accepted job executions and the resources
        offers_for_node.lost_node()
        self.assertFalse(offers_for_node.has_accepted_job_exes())
        self.assertEqual(offers_for_node._available_cpus, 0.0)
        self.assertEqual(offers_for_node._available_mem, 0.0)
        self.assertEqual(offers_for_node._available_disk, 0.0)
Пример #5
0
    def test_remove_offer(self):
        """Checks how remove_offer() affects accepted job executions and available resources"""

        offers_for_node = NodeOffers(self.node)
        first_offer = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        offers_for_node.add_offer(first_offer)
        second_offer = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        offers_for_node.add_offer(second_offer)

        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(offers_for_node.consider_new_job_exe(queued_exe), NodeOffers.ACCEPTED)

        # With only the first offer gone, the accepted job execution must survive
        offers_for_node.remove_offer(first_offer.id)
        self.assertTrue(offers_for_node.has_accepted_job_exes())
        self.assertListEqual(offers_for_node.get_accepted_new_job_exes(), [queued_exe])

        # With the second offer gone too, no resources remain and the job execution is dropped
        offers_for_node.remove_offer(second_offer.id)
        self.assertFalse(offers_for_node.has_accepted_job_exes())
        self.assertListEqual(offers_for_node.get_accepted_new_job_exes(), [])

        self.assertEqual(offers_for_node._available_cpus, 0.0)
        self.assertEqual(offers_for_node._available_mem, 0.0)
        self.assertEqual(offers_for_node._available_disk, 0.0)
Пример #6
0
    def test_job_type_limit(self, mock_taskinfo):
        """Tests running the scheduling thread with a job type limit

        A job type capped at 4 scheduled executions already has one running, and six more are queued.
        Even though the offers are large, only 3 additional tasks are expected to be scheduled.

        :param mock_taskinfo: Mock object supplied to the test by the caller (e.g. a patch decorator)
        """
        mock_taskinfo.return_value = MagicMock()

        # Start from an empty queue so only the entries created below are considered
        Queue.objects.all().delete()
        job_type_with_limit = job_test_utils.create_job_type()
        job_type_with_limit.max_scheduled = 4
        job_type_with_limit.save()
        job_exe_1 = job_test_utils.create_job_exe(job_type=job_type_with_limit, status='RUNNING')
        # Queue six jobs of the limited type; the created queue models themselves are not needed afterwards
        for _ in range(6):
            queue_test_utils.create_queue(job_type=job_type_with_limit)
        self._job_type_manager.sync_with_database()
        # One job of this type is already running
        self._job_exe_manager.add_job_exes([RunningJobExecution(job_exe_1)])

        offer_1 = ResourceOffer('offer_1', self.node_agent_1, NodeResources(cpus=200.0, mem=102400.0, disk=102400.0))
        offer_2 = ResourceOffer('offer_2', self.node_agent_2, NodeResources(cpus=200.0, mem=204800.0, disk=204800.0))
        self._offer_manager.add_new_offers([offer_1, offer_2])

        num_tasks = self._scheduling_thread._perform_scheduling()
        self.assertEqual(num_tasks, 3)  # One is already running, should only be able to schedule 3 more
Пример #7
0
    def test_successful_schedule(self, mock_taskinfo):
        """Checks that a scheduling pass with offers from two agents schedules two tasks"""
        mock_taskinfo.return_value = MagicMock()

        # One offer per agent
        new_offers = [
            ResourceOffer('offer_1', self.node_agent_1, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0)),
            ResourceOffer('offer_2', self.node_agent_2, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0)),
        ]
        self._offer_manager.add_new_offers(new_offers)

        scheduled_count = self._scheduling_thread._perform_scheduling()
        self.assertEqual(scheduled_count, 2)
Пример #8
0
    def test_paused_scheduler(self, mock_taskinfo):
        """Checks that a scheduling pass schedules nothing while the scheduler is paused"""
        mock_taskinfo.return_value = MagicMock()

        new_offers = [
            ResourceOffer('offer_1', self.node_agent_1, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0)),
            ResourceOffer('offer_2', self.node_agent_2, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0)),
        ]
        self._offer_manager.add_new_offers(new_offers)

        # Pause the scheduler in the database, then have the manager pick up the change
        Scheduler.objects.update(is_paused=True)
        self._scheduler_manager.sync_with_database()

        scheduled_count = self._scheduling_thread._perform_scheduling()
        self.assertEqual(scheduled_count, 0)
Пример #9
0
    def test_consider_next_task(self):
        """Exercises consider_next_task() and get_accepted_running_job_exes() with accepted and rejected tasks"""

        offers_for_node = NodeOffers(self.node)
        first_offer = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        offers_for_node.add_offer(first_offer)
        second_offer = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        offers_for_node.add_offer(second_offer)

        # Nothing has been considered yet
        self.assertFalse(offers_for_node.has_accepted_job_exes())
        self.assertListEqual(offers_for_node.get_accepted_running_job_exes(), [])
        self.assertListEqual(offers_for_node.get_accepted_new_job_exes(), [])

        running_exe_1 = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(offers_for_node.consider_next_task(running_exe_1), NodeOffers.ACCEPTED)
        # Considering the same job execution a second time should have no effect
        self.assertEqual(offers_for_node.consider_next_task(running_exe_1), NodeOffers.ACCEPTED)

        # Each of the following asks for too much of exactly one resource
        too_many_cpus = RunningJobExecution(self.running_job_exe_high_cpus)
        self.assertEqual(offers_for_node.consider_next_task(too_many_cpus), NodeOffers.NOT_ENOUGH_CPUS)

        too_much_mem = RunningJobExecution(self.running_job_exe_high_mem)
        self.assertEqual(offers_for_node.consider_next_task(too_much_mem), NodeOffers.NOT_ENOUGH_MEM)

        too_much_disk = RunningJobExecution(self.running_job_exe_high_disk)
        self.assertEqual(offers_for_node.consider_next_task(too_much_disk), NodeOffers.NOT_ENOUGH_DISK)

        running_exe_2 = RunningJobExecution(self.running_job_exe_2)
        self.assertEqual(offers_for_node.consider_next_task(running_exe_2), NodeOffers.ACCEPTED)

        # Only the two accepted running job executions are reported
        self.assertTrue(offers_for_node.has_accepted_job_exes())
        accepted_running = offers_for_node.get_accepted_running_job_exes()
        self.assertEqual(len(accepted_running), 2)
        self.assertSetEqual(set(offers_for_node.get_accepted_running_job_exes()), {running_exe_1, running_exe_2})
        self.assertListEqual(offers_for_node.get_accepted_new_job_exes(), [])

        # Available resources remaining after the two accepted tasks
        self.assertEqual(offers_for_node._available_cpus, 68.0)
        self.assertEqual(offers_for_node._available_mem, 1536.0)
        self.assertEqual(offers_for_node._available_disk, 2222.0)
Пример #10
0
    def test_job_exe_canceled(self):
        """Checks that a job execution canceled before being considered is reported as an invalid task"""

        offers_for_node = NodeOffers(self.node)
        first_offer = ResourceOffer(
            'offer_1', self.node_agent,
            NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
        offers_for_node.add_offer(first_offer)
        second_offer = ResourceOffer(
            'offer_2', self.node_agent,
            NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
        offers_for_node.add_offer(second_offer)

        self.assertFalse(offers_for_node.has_accepted_job_exes())
        self.assertListEqual(offers_for_node.get_accepted_running_job_exes(), [])
        self.assertListEqual(offers_for_node.get_accepted_new_job_exes(), [])

        # Cancel the job execution first, then offer it for consideration
        canceled_exe = RunningJobExecution(self.running_job_exe_1)
        canceled_exe.execution_canceled()
        self.assertEqual(offers_for_node.consider_next_task(canceled_exe), NodeOffers.TASK_INVALID)

        # The rejected job execution must not show up as accepted anywhere
        self.assertFalse(offers_for_node.has_accepted_job_exes())
        self.assertListEqual(offers_for_node.get_accepted_running_job_exes(), [])
        self.assertListEqual(offers_for_node.get_accepted_new_job_exes(), [])
Пример #11
0
    def test_no_ready_offers(self):
        """Checks the results of considering job executions when offers were added but never readied"""

        paused_agent_offer = ResourceOffer('offer_1', self.node_agent_paused,
                                           NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        agent_offer = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))

        # Offers are added, but neither update_nodes() nor ready_new_offers() is called
        mgr = OfferManager()
        mgr.add_new_offers([paused_agent_offer, agent_offer])

        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(mgr.consider_new_job_exe(queued_exe), OfferManager.NO_NODES_AVAILABLE)

        running_exe = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(mgr.consider_next_task(running_exe), OfferManager.NODE_OFFLINE)
Пример #12
0
    def test_job_type_limit(self, mock_taskinfo):
        """Checks that the scheduling thread honours a job type's max_scheduled limit"""
        mock_taskinfo.return_value = MagicMock()

        Queue.objects.all().delete()
        limited_job_type = job_test_utils.create_job_type()
        limited_job_type.max_scheduled = 4
        limited_job_type.save()
        running_model = job_test_utils.create_job_exe(job_type=limited_job_type, status='RUNNING')
        # Queue six jobs of the limited type
        for _ in range(6):
            queue_test_utils.create_queue(job_type=limited_job_type)
        job_type_mgr.sync_with_database()
        # A single job of the limited type counts as already running
        job_exe_mgr.schedule_job_exes([RunningJobExecution(running_model)])

        new_offers = [
            ResourceOffer('offer_1', self.node_agent_1, NodeResources(cpus=200.0, mem=102400.0, disk=102400.0)),
            ResourceOffer('offer_2', self.node_agent_2, NodeResources(cpus=200.0, mem=204800.0, disk=204800.0)),
        ]
        offer_mgr.add_new_offers(new_offers)

        # Mark images as pulled so Docker pull tasks are ignored
        for pulled_node in node_mgr.get_nodes():
            pulled_node._is_image_pulled = True

        # Mark initial cleanup as done so cleanup tasks are ignored
        for cleaned_node in node_mgr.get_nodes():
            cleaned_node._initial_cleanup_completed()
            cleaned_node._update_state()

        # With one already running and a limit of 4, only 3 more may be scheduled
        scheduled_count = self._scheduling_thread._perform_scheduling()
        self.assertEqual(scheduled_count, 3)
Пример #13
0
    def test_high_disk(self):
        """Checks that a queued job execution needing too much disk is rejected"""

        paused_agent_offer = ResourceOffer('offer_1', self.node_agent_paused,
                                           NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        agent_offer = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))

        mgr = OfferManager()
        mgr.add_new_offers([paused_agent_offer, agent_offer])
        mgr.update_nodes([self.node, self.paused_node])
        mgr.ready_new_offers()

        disk_hungry_exe = QueuedJobExecution(self.queue_high_disk)
        self.assertEqual(mgr.consider_new_job_exe(disk_hungry_exe), OfferManager.NOT_ENOUGH_DISK)

        # Nothing was accepted, so there is nothing to pop
        popped_offers = mgr.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(popped_offers), 0)
Пример #14
0
    def test_all_offers_paused(self):
        """Checks that a queued job execution is rejected when every offer belongs to a paused node"""

        first_offer = ResourceOffer('offer_1', self.node_agent_paused,
                                    NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        second_offer = ResourceOffer('offer_2', self.node_agent_paused,
                                     NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))

        # Only the paused node is registered with the manager
        mgr = OfferManager()
        mgr.add_new_offers([first_offer, second_offer])
        mgr.update_nodes([self.paused_node])
        mgr.ready_new_offers()

        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(mgr.consider_new_job_exe(queued_exe), OfferManager.NO_NODES_AVAILABLE)

        popped_offers = mgr.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(popped_offers), 0)
Пример #15
0
    def test_offers_with_no_nodes(self):
        """Checks the results of considering job executions when offers are readied without any node update"""

        paused_agent_offer = ResourceOffer('offer_1', self.node_agent_paused,
                                           NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        agent_offer = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))

        # ready_new_offers() is called, but update_nodes() never is
        mgr = OfferManager()
        mgr.add_new_offers([paused_agent_offer, agent_offer])
        mgr.ready_new_offers()

        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(mgr.consider_new_job_exe(queued_exe), OfferManager.NO_NODES_AVAILABLE)

        running_exe = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(mgr.consider_next_task(running_exe), OfferManager.NODE_NOT_READY)
Пример #16
0
    def test_lost_node_that_comes_back(self):
        """Checks that a lost node can schedule again after it returns under a new agent ID"""

        old_offer_1 = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        old_offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))

        mgr = OfferManager()
        mgr.add_new_offers([old_offer_1, old_offer_2])
        mgr.update_nodes([self.node])
        mgr.ready_new_offers()

        # The node is lost, then reappears with a different agent ID
        mgr.lost_node(self.node_agent)
        replacement_agent = 'i_am_a_new_node_agent'
        self.node.update_from_mesos(agent_id=replacement_agent)

        queued_exe = QueuedJobExecution(self.queue_1)

        # The old agent's offers should be gone, so the job execution cannot be placed yet
        self.assertEqual(mgr.consider_new_job_exe(queued_exe), OfferManager.NO_NODES_AVAILABLE)

        fresh_offer = ResourceOffer('offer_3', replacement_agent, NodeResources(cpus=35.0, mem=3048.0, disk=3048.0))
        mgr.add_new_offers([fresh_offer])
        mgr.update_nodes([self.node])
        mgr.ready_new_offers()

        # With an offer from the new agent ID in place, the job execution should now be accepted
        self.assertEqual(mgr.consider_new_job_exe(queued_exe), OfferManager.ACCEPTED)
        popped_offers = mgr.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(popped_offers), 1)
Пример #17
0
    def resourceOffers(self, driver, offers):
        """
        Callback invoked when resources have been offered to this framework.

        Each offer holds resources from exactly one slave. Offered resources
        are not re-offered to _this_ framework until either (a) the framework
        rejects them (see SchedulerDriver.launchTasks) or (b) they are
        rescinded (see Scheduler.offerRescinded). Depending on the allocator,
        the same resources may be offered to several frameworks at once; the
        first framework to launch tasks with them gets to use them and the
        others have the resources rescinded (a framework that had already
        launched tasks with them sees those tasks fail with a TASK_LOST status
        and a message saying as much).

        Every offer is reduced to its 'cpus', 'mem' and 'disk' scalar values
        (0 when absent; other resource names are ignored; if a name appears
        more than once the last value wins). The agent IDs are handed to the
        node manager and the resulting offers to the offer manager.

        See documentation for :meth:`mesos_api.mesos.Scheduler.resourceOffers`.
        """

        started = now()

        agent_ids = []
        resource_offers = []
        for offer in offers:
            offer_id = offer.id.value
            agent_id = offer.slave_id.value
            # Only these three resource kinds are tracked
            amounts = {'cpus': 0, 'mem': 0, 'disk': 0}
            for resource in offer.resources:
                if resource.name in amounts:
                    amounts[resource.name] = resource.scalar.value
            resources = NodeResources(**amounts)
            agent_ids.append(agent_id)
            resource_offers.append(ResourceOffer(offer_id, agent_id, resources))

        self._node_manager.add_agent_ids(agent_ids)
        self._offer_manager.add_new_offers(resource_offers)

        # Report the time spent in this callback; a slow call is logged as a warning
        duration = now() - started
        msg = 'Scheduler resourceOffers() took %.3f seconds'
        too_slow = duration > ScaleScheduler.NORMAL_WARN_THRESHOLD
        log = logger.warning if too_slow else logger.debug
        log(msg, duration.total_seconds())