Example #1
    def test_node_with_new_agent_id(self):
        """Tests successfully calling perform_scheduling() when a node get a new agent ID"""
        # Host 2 gets new agent ID of agent_3
        node_mgr.lost_node(self.agent_2)
        node_mgr.register_agents([self.agent_3])
        node_mgr.sync_with_database(scheduler_mgr.config)

        offer = ResourceOffer(
            'offer', self.agent_3.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
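        # This offer (25 CPUs, 2048 MiB memory/disk) covers both queued
        # executions, which together need 12 CPUs, 1536 MiB memory, and 745 MiB disk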
        resource_mgr.add_new_offers([offer])

        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._client, now())

        self.assertEqual(num_tasks, 2)  # Schedule both queued job executions
        # Check that created tasks have the correct agent ID
        calls = self._client.method_calls
        # One call checks for the driver and the second launches the tasks
        self.assertEqual(2, len(calls))
        # Get the tasks from the 2nd call (index 1); each method_calls entry is
        # a (name, args, kwargs) tuple, so calls[1][1][1] is the launch call's
        # second positional argument
        mesos_tasks = calls[1][1][1]
        for mesos_task in mesos_tasks:
            self.assertEqual(self.agent_3.agent_id,
                             mesos_task['agent_id']['value'])
Example #2
    def resourceOffers(self, driver, offers):
        """
        Invoked when resources have been offered to this framework. A single
        offer will only contain resources from a single slave.  Resources
        associated with an offer will not be re-offered to _this_ framework
        until either (a) this framework has rejected those resources (see
        SchedulerDriver.launchTasks) or (b) those resources have been
        rescinded (see Scheduler.offerRescinded).  Note that resources may be
        concurrently offered to more than one framework at a time (depending
        on the allocator being used).  In that case, the first framework to
        launch tasks using those resources will be able to use them while the
        other frameworks will have those resources rescinded (or if a
        framework has already launched tasks with those resources then those
        tasks will fail with a TASK_LOST status and a message saying as much).

        See documentation for :meth:`mesos_api.mesos.Scheduler.resourceOffers`.
        """

        started = now()

        agents = {}
        resource_offers = []
        total_resources = NodeResources()
        for offer in offers:
            offer_id = offer.id.value
            agent_id = offer.slave_id.value
            framework_id = offer.framework_id.value
            hostname = offer.hostname
            resource_list = []
            for resource in offer.resources:
                if resource.type == 0:  # This is the SCALAR type
                    resource_list.append(
                        ScalarResource(resource.name, resource.scalar.value))
            resources = NodeResources(resource_list)
            total_resources.add(resources)
            agents[agent_id] = Agent(agent_id, hostname)
            resource_offers.append(
                ResourceOffer(offer_id, agent_id, framework_id, resources,
                              started))

        node_mgr.register_agents(agents.values())
        resource_mgr.add_new_offers(resource_offers)

        num_offers = len(resource_offers)
        logger.info('Received %d offer(s) with %s from %d node(s)', num_offers,
                    total_resources, len(agents))
        scheduler_mgr.add_new_offer_count(num_offers)

        duration = now() - started
        msg = 'Scheduler resourceOffers() took %.3f seconds'
        if duration > ScaleScheduler.NORMAL_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())
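
The callback above only reads a few fields from each offer (id, slave_id, framework_id, hostname, and scalar resources), so it can be exercised with plain mocks. A minimal sketch, assuming those protobuf-style attribute paths and a stand-in `scheduler` instance; none of the names below beyond the fields read above are confirmed by the source:

from unittest.mock import MagicMock

def make_mock_offer(offer_id, agent_id, hostname, cpus):
    """Builds a mock offer exposing only the fields resourceOffers() reads."""
    offer = MagicMock()
    offer.id.value = offer_id
    offer.slave_id.value = agent_id
    offer.framework_id.value = 'framework_1'  # hypothetical framework ID
    offer.hostname = hostname
    resource = MagicMock()
    resource.type = 0  # SCALAR in the Mesos Value.Type enum
    resource.name = 'cpus'
    resource.scalar.value = cpus
    offer.resources = [resource]
    return offer

# 'scheduler' stands in for whatever instance defines resourceOffers() above
scheduler.resourceOffers(MagicMock(), [make_mock_offer('offer_1', 'agent_1', 'host_1', 4.0)])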
Example #3
    def setUp(self):
        django.setup()

        reset_error_cache()

        self.framework_id = '1234'
        Scheduler.objects.initialize_scheduler()
        Scheduler.objects.update(
            num_message_handlers=0
        )  # Prevent message handler tasks from scheduling
        self._client = MagicMock()

        scheduler_mgr.sync_with_database()
        scheduler_mgr.update_from_mesos(framework_id=self.framework_id)
        resource_mgr.clear()
        job_exe_mgr.clear()

        self.agent_1 = Agent('agent_1', 'host_1')
        self.agent_2 = Agent('agent_2', 'host_2')
        self.agent_3 = Agent('agent_3', 'host_2')
        node_mgr.clear()
        node_mgr.register_agents([self.agent_1, self.agent_2])
        node_mgr.sync_with_database(scheduler_mgr.config)
        # Ignore initial cleanup, health check, and image pull tasks
        for node in node_mgr.get_nodes():
            node._last_health_task = now()
            node._initial_cleanup_completed()
            node._is_image_pulled = True
            node._update_state()
            if node.agent_id == 'agent_1':
                self.node_1_id = node.id
        cleanup_mgr.update_nodes(node_mgr.get_nodes())
        self.node_1 = Node.objects.get(id=self.node_1_id)
        # Ignore system tasks
        system_task_mgr._is_db_update_completed = True

        self.queue_1 = queue_test_utils.create_queue(cpus_required=4.0,
                                                     mem_required=1024.0,
                                                     disk_in_required=100.0,
                                                     disk_out_required=200.0,
                                                     disk_total_required=300.0)
        self.queue_2 = queue_test_utils.create_queue(cpus_required=8.0,
                                                     mem_required=512.0,
                                                     disk_in_required=400.0,
                                                     disk_out_required=45.0,
                                                     disk_total_required=445.0)
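        # This queue asks for more than any single offer in these tests
        # provides, so it is expected to remain queued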
        self.queue_large = queue_test_utils.create_queue(
            resources=NodeResources([Cpus(125.0), Mem(12048.0), Disk(12048.0)]))

        job_type_mgr.sync_with_database()
Example #4
    def test_node_with_new_agent_id(self, mock_taskinfo):
        """Tests successfully calling perform_scheduling() when a node get a new agent ID"""
        mock_taskinfo.return_value = MagicMock()

        # Host 2 gets new agent ID of agent_3
        node_mgr.lost_node(self.agent_2)
        node_mgr.register_agents([self.agent_3])
        node_mgr.sync_with_database(scheduler_mgr.config)

        offer = ResourceOffer('offer', self.agent_3.agent_id, self.framework_id,
                              NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now())
        resource_mgr.add_new_offers([offer])

        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._driver, now())
        self.assertEqual(num_tasks, 2)  # Schedule both queued job executions
        # Check that created tasks have the correct agent ID
        calls = self._driver.method_calls
        self.assertEqual(1, len(calls))
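        # The single call is the task launch; its second positional argument
        # holds the Mesos tasks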
        mesos_tasks = calls[0][1][1]
        for mesos_task in mesos_tasks:
            self.assertEqual(self.agent_3.agent_id, mesos_task.slave_id.value)
Example #5
    def offers(self, offers):
        """
        Invoked when resources have been offered to this framework. A single
        offer will only contain resources from a single agent.  Resources
        associated with an offer will not be re-offered to _this_ framework
        until either (a) this framework has rejected those resources
        or (b) those resources have been rescinded.  Note that resources may be
        concurrently offered to more than one framework at a time (depending
        on the allocator being used).  In that case, the first framework to
        launch tasks using those resources will be able to use them while the
        other frameworks will have those resources rescinded (or if a
        framework has already launched tasks with those resources then those
        tasks will fail with a TASK_LOST status and a message saying as much).
        """

        started = now()

        agents = {}
        offered_nodes = []
        resource_offers = []
        total_resources = NodeResources()
        skipped_roles = set()
        for offer in offers:
            scale_offer = from_mesos_offer(offer)
            offer_id = scale_offer.id.value
            agent_id = scale_offer.agent_id.value
            framework_id = scale_offer.framework_id.value
            hostname = scale_offer.hostname
            offered_nodes.append(hostname)
            # ignore offers while we're paused
            if scheduler_mgr.config.is_paused:
                offer.decline()
                continue
            resource_list = []
            for resource in scale_offer.resources:
                # Only accept resources that are of the SCALAR type and have a role on our accept list
                if resource.type == RESOURCE_TYPE_SCALAR:
                    if resource.role in settings.ACCEPTED_RESOURCE_ROLE:
                        logger.debug("Received scalar resource %s with value %i associated with role %s" %
                                     (resource.name, resource.scalar.value, resource.role))
                        resource_list.append(ScalarResource(resource.name, resource.scalar.value))
                    else:
                        skipped_roles.add(resource.role)
                        offer.decline()

            logger.debug("Number of resources: %i" % len(resource_list))

            # Only register the agent if the offer contained usable resources
            if len(resource_list) > 0:
                resources = NodeResources(resource_list)
                total_resources.add(resources)
                agents[agent_id] = Agent(agent_id, hostname)
                resource_offers.append(ResourceOffer(offer_id, agent_id, framework_id, resources, started, offer))

        logger.debug("Offer analysis complete with %i resource offers." % len(resource_offers))

        node_mgr.register_agents(agents.values())
        logger.debug("Agents registered.")
        resource_mgr.add_new_offers(resource_offers)
        logger.debug("Resource offers added.")
        Node.objects.update_node_offers(offered_nodes, now())
        logger.debug("Node offer times updated.")

        num_offers = len(resource_offers)
        logger.info('Received %d offer(s) with %s from %d node(s)', num_offers, total_resources, len(agents))
        if skipped_roles:
            logger.warning('Skipped offers from roles that are not marked as accepted: %s', ','.join(skipped_roles))
        scheduler_mgr.add_new_offer_count(num_offers)

        duration = now() - started
        msg = 'Scheduler resourceOffers() took %.3f seconds'
        if duration > ScaleScheduler.NORMAL_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())
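
Because the paused branch above declines each offer before parsing any resources, it can be checked in isolation. A minimal sketch, assuming a test class with fixtures like the setUp() in Example #3, that `self._scheduler` holds an instance of the class defining offers(), and that `scheduler_mgr.config.is_paused` is a patchable attribute; all three are assumptions, not confirmed fixtures:

from unittest.mock import MagicMock, patch

def test_offers_declined_while_paused(self):
    """Every offer should be declined while the scheduler is paused (sketch)."""
    offer = MagicMock()
    # Patch the pause flag for the duration of the call (attribute assumed patchable)
    with patch.object(scheduler_mgr.config, 'is_paused', True):
        self._scheduler.offers([offer])
    # The raw Mesos offer, not the converted scale_offer, receives the decline
    offer.decline.assert_called_once_with()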
Example #6
    def test_generate_nodes_status(self):
        """Tests the _generate_nodes_status method"""

        # Setup nodes
        from scheduler.node.manager import node_mgr
        node_mgr.clear()

        nodes = dependency_mgr._generate_nodes_status()
        self.assertDictEqual(
            nodes, {
                'OK': False,
                'detail': {
                    'msg': 'No nodes reported'
                },
                'errors': [{
                    'NODES_OFFLINE': 'No nodes reported.'
                }],
                'warnings': []
            })

        node_mgr.register_agents([
            self.agent_1, self.agent_2, self.agent_3, self.agent_4,
            self.agent_5, self.agent_6, self.agent_7, self.agent_8,
            self.agent_9, self.agent_10
        ])
        node_mgr.sync_with_database(scheduler_mgr.config)

        nodes = node_mgr.get_nodes()
        self.assertEqual(len(nodes), 10)

        nodes = dependency_mgr._generate_nodes_status()
        self.assertDictEqual(
            nodes, {
                'OK': True,
                'detail': {
                    'msg': 'Enough nodes are online to function.'
                },
                'errors': [],
                'warnings': []
            })

        node_mgr.lost_node(self.agent_1.agent_id)
        node_mgr.lost_node(self.agent_2.agent_id)
        node_mgr.lost_node(self.agent_3.agent_id)
        node_mgr.lost_node(self.agent_4.agent_id)
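        # Losing 4 of 10 nodes crosses the one-third threshold, so the status
        # should degrade from OK to an error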
        nodes = dependency_mgr._generate_nodes_status()
        self.assertDictEqual(
            nodes, {
                'OK': False,
                'detail': {
                    'msg': 'Over a third of nodes are in an error state'
                },
                'errors': [{
                    'NODES_ERRORED':
                    'Over a third of the nodes are offline or degraded.'
                }],
                'warnings': [{
                    'NODES_OFFLINE': '4 nodes are offline'
                }]
            })