Example #1
    def test_schedule_system_tasks(self):
        """Tests successfully calling perform_scheduling() when scheduling system tasks"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(1024.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])

        # Clear the queue
        Queue.objects.all().delete()
        # Set us up to schedule a database update task
        system_task_mgr._is_db_update_completed = False
        # Set us up to schedule 2 message handler tasks
        Scheduler.objects.update(num_message_handlers=2)
        scheduler_mgr.sync_with_database()

        scheduling_manager = SchedulingManager()

        num_tasks = scheduling_manager.perform_scheduling(self._client, now())
        self.assertEqual(
            num_tasks,
            3)  # Schedule database update task and 2 message handler tasks
Example #2
    def test_canceled_queue_model(self):
        """Tests successfully calling perform_scheduling() when a queue model has been canceled"""
        offer_1 = ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                                NodeResources([Cpus(2.0), Mem(1024.0), Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                                NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])
        self.queue_1.is_canceled = True
        self.queue_1.save()

        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._client, now())

        self.assertEqual(num_tasks, 1)  # Scheduled non-canceled queued job execution
        # queue_1 should be canceled, queue_2 should be running, queue should be empty now
        self.assertEqual(JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 1)
        self.assertEqual(JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 1)
        self.assertEqual(Queue.objects.filter(id__in=[self.queue_1.id, self.queue_2.id]).count(), 0)
        # Job execution manager should have a message for the canceled job execution
        messages = job_exe_mgr.get_messages()
        found_job_exe_end_message = False
        for message in messages:
            if message.type == 'create_job_exe_ends':
                found_job_exe_end_message = True
        self.assertTrue(found_job_exe_end_message)
Example #3
    def test_max_resources(self):
        """Tests successfully calculating the max resources in a cluster"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(22048.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        offer_3 = ResourceOffer(
            'offer_3', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(225.0),
                           Mem(1024.0),
                           Disk(22048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2, offer_3])

        resource_mgr.refresh_agent_resources([], now())

        max = resource_mgr.get_max_available_resources()
        self.assertTrue(
            max.is_equal(
                NodeResources([Cpus(250.0),
                               Mem(22048.0),
                               Disk(24096.0)])))
Example #4
    def test_all_available_resources(self):
        """Tests successfully calculating the available resources in a cluster"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(22048.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        offer_3 = ResourceOffer(
            'offer_3', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(225.0),
                           Mem(1024.0),
                           Disk(22048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2, offer_3])

        resource_mgr.refresh_agent_resources([], now())

        all_available_resources = resource_mgr.get_all_available_resources()
        self.assertDictEqual(all_available_resources, {
            'mem': 25120.0,
            'gpus': 0.0,
            'disk': 25120.0,
            'cpus': 252.0
        })
Example #5
    def test_job_type_limit(self):
        """Tests calling perform_scheduling() with a job type limit"""
        Queue.objects.all().delete()
        job_type_with_limit = job_test_utils.create_seed_job_type()
        job_type_with_limit.max_scheduled = 4
        job_type_with_limit.save()
        running_job_exe_1 = job_test_utils.create_running_job_exe(agent_id=self.agent_1.agent_id,
                                                                  job_type=job_type_with_limit, node=self.node_1)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        job_type_mgr.sync_with_database()
        # One job of this type is already running
        job_exe_mgr.schedule_job_exes([running_job_exe_1], [])

        offer_1 = ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                                NodeResources([Cpus(0.0), Mem(1024.0), Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                                NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])

        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._client, now())
        self.assertEqual(num_tasks, 3)  # One is already running, should only be able to schedule 3 more
Example #6
    def test_successful_schedule(self):
        """Tests successfully calling perform_scheduling()"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(1024.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])
        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._client, now())

        self.assertEqual(num_tasks,
                         2)  # Schedule smaller queued job executions
        # Ensure job execution models are created and queue models are deleted
        self.assertEqual(
            JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 1)
        self.assertEqual(
            JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 1)
        self.assertEqual(
            JobExecution.objects.filter(
                job_id=self.queue_large.job_id).count(), 0)
        self.assertEqual(
            Queue.objects.filter(
                id__in=[self.queue_1.id, self.queue_2.id]).count(), 0)
Example #7
    def test_missing_workspace(self):
        """Tests calling perform_scheduling() when a queued job's workspace has not been synced to the scheduler"""

        offer_1 = ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                                NodeResources([Cpus(2.0), Mem(1024.0), Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                                NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])

        # Add workspaces to the queued jobs
        queue_1 = Queue.objects.get(id=self.queue_1.id)
        config = queue_1.get_execution_configuration()
        config.set_output_workspaces({'my_output': 'my_workspace'})
        queue_1.configuration = config.get_dict()
        queue_1.save()
        queue_2 = Queue.objects.get(id=self.queue_2.id)
        config = queue_2.get_execution_configuration()
        config.set_output_workspaces({'my_output': 'my_workspace'})
        queue_2.configuration = config.get_dict()
        queue_2.save()

        scheduling_manager = SchedulingManager()

        # Clear out workspace manager for scheduling
        with patch('scheduler.scheduling.manager.workspace_mgr.get_workspaces') as mock_get_workspaces:
            mock_get_workspaces.return_value = {}
            num_tasks = scheduling_manager.perform_scheduling(self._client, now())

        # Nothing should be scheduled
        self.assertEqual(num_tasks, 0)
        self.assertEqual(JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 0)
        self.assertEqual(JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 0)
        self.assertEqual(Queue.objects.filter(id__in=[self.queue_1.id, self.queue_2.id]).count(), 2)
Example #8
    def test_update_all_cluster_resources(self):
        """Tests successfully updating the all cluster resources database in a cluster"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(22048.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        offer_3 = ResourceOffer(
            'offer_3', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(225.0),
                           Mem(1024.0),
                           Disk(22048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2, offer_3])

        resource_mgr.refresh_agent_resources([], now())

        resource_db = ClusterResources.objects.first()

        self.assertIsNone(resource_db)

        resource_mgr.update_all_cluster_resources()

        resource_db = ClusterResources.objects.first()

        self.assertIsNotNone(resource_db)

        self.assertEqual(resource_db.mem, 25120.0)
        self.assertEqual(resource_db.gpus, 0.0)
        self.assertEqual(resource_db.disk, 25120.0)
        self.assertEqual(resource_db.cpus, 252.0)
Example #9
    def test_paused_job_type(self):
        """Tests calling perform_scheduling() when a job type is paused"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(1024.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])
        self.queue_1.job_type.is_paused = True
        self.queue_1.job_type.save()
        job_type_mgr.sync_with_database()

        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._client, now())

        self.assertEqual(num_tasks,
                         1)  # Schedule queued job execution that is not paused
        self.assertEqual(
            JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 0)
        self.assertEqual(
            JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 1)
        self.assertEqual(
            Queue.objects.filter(
                id__in=[self.queue_1.id, self.queue_2.id]).count(), 1)
Example #10
    def test_missing_job_types(self):
        """Tests calling perform_scheduling() when a queued job type has not been synced to the scheduler"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(1024.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])

        scheduling_manager = SchedulingManager()

        # Clear out job type manager for scheduling
        with patch('scheduler.scheduling.manager.job_type_mgr.get_job_types'
                   ) as mock_get_job_types:
            mock_get_job_types.return_value = {}
            num_tasks = scheduling_manager.perform_scheduling(
                self._client, now())

        # Nothing should be scheduled
        self.assertEqual(num_tasks, 0)
        self.assertEqual(
            JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 0)
        self.assertEqual(
            JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 0)
        self.assertEqual(
            Queue.objects.filter(
                id__in=[self.queue_1.id, self.queue_2.id]).count(), 2)
Example #11
    def test_paused_scheduler(self):
        """Tests calling perform_scheduling() with a paused scheduler"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(1024.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])
        Scheduler.objects.update(is_paused=True)
        scheduler_mgr.sync_with_database()
        node_mgr.sync_with_database(
            scheduler_mgr.config)  # Updates nodes with paused scheduler
        system_task_mgr._is_db_update_completed = False  # Make sure system tasks don't get scheduled

        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._client, now())
        self.assertEqual(num_tasks, 0)
        self.assertEqual(
            JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 0)
        self.assertEqual(
            JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 0)
        self.assertEqual(
            Queue.objects.filter(
                id__in=[self.queue_1.id, self.queue_2.id]).count(), 2)
Example #12
    def test_get_queued_resources(self):
        """Tests successfully getting queued resource information"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(22048.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        offer_3 = ResourceOffer(
            'offer_3', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(225.0),
                           Mem(1024.0),
                           Disk(22048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2, offer_3])

        resource_mgr.refresh_agent_resources([], now())

        resource_db = ClusterResources.objects.first()

        self.assertIsNone(resource_db)

        resource_mgr.update_all_cluster_resources()

        resource_db = ClusterResources.objects.first()

        self.assertIsNotNone(resource_db)

        self.assertEqual(resource_db.mem, 25120.0)
        self.assertEqual(resource_db.gpus, 0.0)
        self.assertEqual(resource_db.disk, 25120.0)
        self.assertEqual(resource_db.cpus, 252.0)

        queued_resources = resource_mgr.get_queued_resources()

        self.assertDictEqual(
            queued_resources, {
                "cluster_resources": {
                    'cpus': 252,
                    'disk': 25120,
                    'gpus': 0,
                    'mem': 25120
                },
                "queue_lengths": {
                    'PENDING': 0,
                    'QUEUED': 3,
                    'RUNNING': 0
                },
                "total_resources": {
                    'PENDING': {},
                    'QUEUED': {
                        'cpus': 3.0,
                        'mem': 384.0
                    },
                    'RUNNING': {}
                }
            })
Example #13
    def test_node_with_new_agent_id(self):
        """Tests successfully calling perform_scheduling() when a node get a new agent ID"""
        # Host 2 gets new agent ID of agent_3
        node_mgr.lost_node(self.agent_2)
        node_mgr.register_agents([self.agent_3])
        node_mgr.sync_with_database(scheduler_mgr.config)

        offer = ResourceOffer(
            'offer', self.agent_3.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer])

        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._client, now())

        self.assertEqual(num_tasks, 2)  # Schedule both queued job executions
        # Check that created tasks have the correct agent ID
        calls = self._client.method_calls
        # One call for checking the driver and a second for the task launch
        self.assertEqual(2, len(calls))
        # Get the tasks off the 2nd call (index 1)
        mesos_tasks = calls[1][1][1]
        for mesos_task in mesos_tasks:
            self.assertEqual(self.agent_3.agent_id,
                             mesos_task['agent_id']['value'])
Example #14
    def setUp(self):
        django.setup()
        resource_mgr.clear()
        self.agent_1 = Agent('agent_1', 'host_1')
        self.agent_2 = Agent('agent_2', 'host_2')
        self.framework_id = '1234'
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(1024.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])
        resource_mgr.refresh_agent_resources([], now())
Example #15
    def resourceOffers(self, driver, offers):
        """
        Invoked when resources have been offered to this framework. A single
        offer will only contain resources from a single slave.  Resources
        associated with an offer will not be re-offered to _this_ framework
        until either (a) this framework has rejected those resources (see
        SchedulerDriver.launchTasks) or (b) those resources have been
        rescinded (see Scheduler.offerRescinded).  Note that resources may be
        concurrently offered to more than one framework at a time (depending
        on the allocator being used).  In that case, the first framework to
        launch tasks using those resources will be able to use them while the
        other frameworks will have those resources rescinded (or if a
        framework has already launched tasks with those resources then those
        tasks will fail with a TASK_LOST status and a message saying as much).

        See documentation for :meth:`mesos_api.mesos.Scheduler.resourceOffers`.
        """

        started = now()

        agents = {}
        resource_offers = []
        total_resources = NodeResources()
        for offer in offers:
            offer_id = offer.id.value
            agent_id = offer.slave_id.value
            framework_id = offer.framework_id.value
            hostname = offer.hostname
            resource_list = []
            for resource in offer.resources:
                if resource.type == 0:  # This is the SCALAR type
                    resource_list.append(
                        ScalarResource(resource.name, resource.scalar.value))
            resources = NodeResources(resource_list)
            total_resources.add(resources)
            agents[agent_id] = Agent(agent_id, hostname)
            resource_offers.append(
                ResourceOffer(offer_id, agent_id, framework_id, resources,
                              started))

        node_mgr.register_agents(agents.values())
        resource_mgr.add_new_offers(resource_offers)

        num_offers = len(resource_offers)
        logger.info('Received %d offer(s) with %s from %d node(s)', num_offers,
                    total_resources, len(agents))
        scheduler_mgr.add_new_offer_count(num_offers)

        duration = now() - started
        msg = 'Scheduler resourceOffers() took %.3f seconds'
        if duration > ScaleScheduler.NORMAL_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())
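
A minimal sketch (not part of the original source) of how this callback could be driven in a unit test with stubbed offer objects. The stubs only mirror the attributes that resourceOffers() reads above (id.value, slave_id.value, framework_id.value, hostname, and scalar resources); the helper name and the use of SimpleNamespace are assumptions, not the real Mesos protobuf API.

# Hypothetical stub: exposes only the attributes resourceOffers() accesses.
from types import SimpleNamespace

def _stub_offer(offer_id, agent_id, framework_id, hostname, cpus, mem):
    # type == 0 is treated as the SCALAR resource type by the callback above
    return SimpleNamespace(
        id=SimpleNamespace(value=offer_id),
        slave_id=SimpleNamespace(value=agent_id),
        framework_id=SimpleNamespace(value=framework_id),
        hostname=hostname,
        resources=[
            SimpleNamespace(type=0, name='cpus', scalar=SimpleNamespace(value=cpus)),
            SimpleNamespace(type=0, name='mem', scalar=SimpleNamespace(value=mem)),
        ])

# Example invocation (scheduler and driver would come from the test fixture):
# scheduler.resourceOffers(driver, [_stub_offer('offer_1', 'agent_1', '1234', 'host_1', 2.0, 1024.0)])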
Example #16
    def test_no_default_workspace(self, mock_taskinfo):
        """Tests calling perform_scheduling() when a queued job's workspace has not been synced to the scheduler"""
        mock_taskinfo.return_value = MagicMock()

        offer_1 = ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                                NodeResources([Cpus(2.0), Mem(1024.0), Disk(1024.0)]), now())
        offer_2 = ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                                NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now())
        resource_mgr.add_new_offers([offer_1, offer_2])
        
        # Add output data to the first queued job:
        # output data + no workspace defined = fail
        queue_1 = Queue.objects.get(id=self.queue_1.id)
        queue_1.get_job_interface().definition['output_data'] = [{'name': 'my_output', 'type': 'file'}]
        config = queue_1.get_execution_configuration()
        queue_1.configuration = config.get_dict()
        queue_1.save()
        # No output data + no workspace = pass
        queue_2 = Queue.objects.get(id=self.queue_2.id)
        config = queue_2.get_execution_configuration()
        queue_2.configuration = config.get_dict()
        queue_2.save()
        
        scheduling_manager = SchedulingManager()
        
        # Set a workspace on the manager
        with patch('scheduler.scheduling.manager.workspace_mgr.get_workspaces') as mock_get_workspaces:
            mock_get_workspaces.return_value = {
                'name': 'my_workspace',
                'title': 'My Workspace',
                'description': 'My workspaces',
                'is_active': True,
                'json_config': {'version': '1.0','broker': {'type': 'host','host_path': '/host/path'}},
            }
            num_tasks = scheduling_manager.perform_scheduling(self._driver, now())
        
        # Only queue_2 should be scheduled
        self.assertEqual(num_tasks, 1)
        self.assertEqual(JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 0)
        self.assertEqual(JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 1)
        self.assertEqual(Queue.objects.filter(id__in=[self.queue_1.id, self.queue_2.id]).count(), 1)
Example #17
    def test_node_with_new_agent_id(self, mock_taskinfo):
        """Tests successfully calling perform_scheduling() when a node get a new agent ID"""
        mock_taskinfo.return_value = MagicMock()

        # Host 2 gets new agent ID of agent_3
        node_mgr.lost_node(self.agent_2)
        node_mgr.register_agents([self.agent_3])
        node_mgr.sync_with_database(scheduler_mgr.config)

        offer = ResourceOffer('offer', self.agent_3.agent_id, self.framework_id,
                              NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now())
        resource_mgr.add_new_offers([offer])

        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._driver, now())
        self.assertEqual(num_tasks, 2)  # Schedule both queued job executions
        # Check that created tasks have the correct agent ID
        calls = self._driver.method_calls
        self.assertEqual(1, len(calls))
        mesos_tasks = calls[0][1][1]
        for mesos_task in mesos_tasks:
            self.assertEqual(self.agent_3.agent_id, mesos_task.slave_id.value)
Example #18
    def offers(self, offers):
        """
        Invoked when resources have been offered to this framework. A single
        offer will only contain resources from a single agent.  Resources
        associated with an offer will not be re-offered to _this_ framework
        until either (a) this framework has rejected those resources
        or (b) those resources have been rescinded.  Note that resources may be
        concurrently offered to more than one framework at a time (depending
        on the allocator being used).  In that case, the first framework to
        launch tasks using those resources will be able to use them while the
        other frameworks will have those resources rescinded (or if a
        framework has already launched tasks with those resources then those
        tasks will fail with a TASK_LOST status and a message saying as much).
        """

        started = now()

        agents = {}
        offered_nodes = []
        resource_offers = []
        total_resources = NodeResources()
        skipped_roles = set()
        for offer in offers:
            scale_offer = from_mesos_offer(offer)
            offer_id = scale_offer.id.value
            agent_id = scale_offer.agent_id.value
            framework_id = scale_offer.framework_id.value
            hostname = scale_offer.hostname
            offered_nodes.append(hostname)
            # ignore offers while we're paused
            if scheduler_mgr.config.is_paused:
                offer.decline()
                continue
            resource_list = []
            for resource in scale_offer.resources:
                # Only accept resources that are of the SCALAR type and have a role matching our accept list
                if resource.type == RESOURCE_TYPE_SCALAR:
                    if resource.role in settings.ACCEPTED_RESOURCE_ROLE:
                        logger.debug("Received scalar resource %s with value %i associated with role %s" %
                                     (resource.name, resource.scalar.value, resource.role))
                        resource_list.append(ScalarResource(resource.name, resource.scalar.value))
                    else:
                        skipped_roles.add(resource.role)
                        offer.decline()

            logger.debug("Number of resources: %i" % len(resource_list))

            # Only register the agent if it actually offered accepted resources
            if len(resource_list) > 0:
                resources = NodeResources(resource_list)
                total_resources.add(resources)
                agents[agent_id] = Agent(agent_id, hostname)
                resource_offers.append(ResourceOffer(offer_id, agent_id, framework_id, resources, started, offer))

        logger.debug("Offer analysis complete with %i resource offers." % len(resource_offers))

        node_mgr.register_agents(agents.values())
        logger.debug("Agents registered.")
        resource_mgr.add_new_offers(resource_offers)
        logger.debug("Resource offers added.")
        Node.objects.update_node_offers(offered_nodes, now())
        logger.debug("Node offer times updated.")

        num_offers = len(resource_offers)
        logger.info('Received %d offer(s) with %s from %d node(s)', num_offers, total_resources, len(agents))
        if len(skipped_roles):
            logger.warning('Skipped offers from roles that are not marked as accepted: %s', ','.join(skipped_roles))
        scheduler_mgr.add_new_offer_count(num_offers)

        duration = now() - started
        msg = 'Scheduler resourceOffers() took %.3f seconds'
        if duration > ScaleScheduler.NORMAL_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())
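
The role-filtering decision above can be summarized in isolation. The following is a hypothetical, standalone sketch (the helper name, the tuple return values, and the assumption that roles are plain strings in an accepted-roles collection are mine, not part of the original module):

# Hypothetical helper mirroring the filter above: keep scalar resources whose
# role is accepted, and report any roles that were skipped.
def filter_scalar_resources(resources, accepted_roles, scalar_type):
    kept = []
    skipped_roles = set()
    for resource in resources:
        if resource.type != scalar_type:
            continue
        if resource.role in accepted_roles:
            # Record the resource as a (name, value) pair
            kept.append((resource.name, resource.scalar.value))
        else:
            skipped_roles.add(resource.role)
    return kept, skipped_roles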
Example #19
    def test_add_allocated_offers_remove_all_tasks(self):
        """Tests calling add_allocated_offers() when there are not enough resources for the job exes or node tasks"""

        node = MagicMock()
        node.hostname = 'host_1'
        node.id = 1
        health_task = HealthTask('1234', 'agent_1')
        pull_task = PullTask('1234', 'agent_1')
        node.is_ready_for_new_job = MagicMock()
        node.is_ready_for_new_job.return_value = True
        node.is_ready_for_next_job_task = MagicMock()
        node.is_ready_for_next_job_task.return_value = True
        node.get_next_tasks = MagicMock()
        node.get_next_tasks.return_value = [health_task, pull_task]
        offered_resources = NodeResources([Cpus(100.0), Mem(500.0)])
        watermark_resources = NodeResources([Cpus(100.0), Mem(500.0)])
        resource_set = ResourceSet(offered_resources, NodeResources(),
                                   watermark_resources)
        scheduling_node = SchedulingNode('agent_1', node, [], [], resource_set)
        running_job_exe_1 = job_test_utils.create_running_job_exe(
            agent_id=self.agent_id,
            resources=NodeResources([Cpus(1.0), Mem(10.0)]))
        running_job_exe_2 = job_test_utils.create_running_job_exe(
            agent_id=self.agent_id,
            resources=NodeResources([Cpus(2.0), Mem(20.0)]))
        node_task_resources = NodeResources()
        node_task_resources.add(health_task.get_resources())
        node_task_resources.add(pull_task.get_resources())
        all_required_resources = NodeResources()
        all_required_resources.add(node_task_resources)
        all_required_resources.add(
            running_job_exe_1.next_task().get_resources())
        all_required_resources.add(
            running_job_exe_2.next_task().get_resources())
        expected_remaining_resources = NodeResources()
        expected_remaining_resources.add(offered_resources)
        expected_remaining_resources.subtract(node_task_resources)

        # Set up node with node tasks and job exes (there would never be queued job exes since they would be scheduled
        # before add_allocated_offers() was called)
        scheduling_node.accept_node_tasks(now(), [])
        scheduling_node.accept_job_exe_next_task(running_job_exe_1, [])
        scheduling_node.accept_job_exe_next_task(running_job_exe_2, [])
        self.assertEqual(len(scheduling_node.allocated_tasks), 2)
        self.assertEqual(len(scheduling_node._allocated_running_job_exes), 2)
        self.assertEqual(len(scheduling_node._allocated_queued_job_exes), 0)
        self.assertTrue(
            scheduling_node.allocated_resources.is_equal(
                all_required_resources))

        # Set up offers (not enough for job exes or node tasks)
        offer_1 = ResourceOffer('offer_1', 'agent_1', '1234',
                                NodeResources([Cpus(0.1),
                                               Mem(600.0)]), now(), None)

        scheduling_node.add_allocated_offers([offer_1])
        self.assertListEqual(scheduling_node.allocated_offers, [offer_1])
        # All allocated tasks and job exes should be gone
        self.assertEqual(len(scheduling_node.allocated_tasks), 0)
        self.assertEqual(len(scheduling_node._allocated_running_job_exes), 0)
        self.assertEqual(len(scheduling_node._allocated_queued_job_exes), 0)
        self.assertTrue(
            scheduling_node.allocated_resources.is_equal(NodeResources()))
        self.assertTrue(
            scheduling_node._remaining_resources.is_equal(offered_resources))