Example #1
    def test_node_with_new_agent_id(self):
        """Tests successfully calling perform_scheduling() when a node get a new agent ID"""
        # Host 2 gets new agent ID of agent_3
        node_mgr.lost_node(self.agent_2)
        node_mgr.register_agents([self.agent_3])
        node_mgr.sync_with_database(scheduler_mgr.config)

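        # Offer resources from the new agent so tasks can be scheduled on it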
        offer = ResourceOffer(
            'offer', self.agent_3.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer])

        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._client, now())

        self.assertEqual(num_tasks, 2)  # Schedule both queued job executions
        # Check that created tasks have the correct agent ID
        calls = self._client.method_calls
        # One call checks for the driver and the second launches the tasks
        self.assertEqual(2, len(calls))
        # Get the tasks from the second call (index 1)
        mesos_tasks = calls[1][1][1]
        for mesos_task in mesos_tasks:
            self.assertEqual(self.agent_3.agent_id,
                             mesos_task['agent_id']['value'])
Example #2
    def slaveLost(self, driver, slaveId):
        """
        Invoked when a slave has been determined unreachable (e.g., machine
        failure, network partition). Most frameworks will need to reschedule
        any tasks launched on this slave on a new slave.

        See documentation for :meth:`mesos_api.mesos.Scheduler.slaveLost`.
        """

        started = now()

        agent_id = slaveId.value
        node = node_mgr.get_node(agent_id)

        if node:
            logger.warning('Node lost on host %s', node.hostname)
        else:
            logger.warning('Node lost on agent %s', agent_id)

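        # Tell the node and resource managers that this agent has been lost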
        node_mgr.lost_node(agent_id)
        resource_mgr.lost_agent(agent_id)

        # Fail job executions that were running on the lost node
        if node:
            for finished_job_exe in job_exe_mgr.lost_node(node.id, started):
                cleanup_mgr.add_job_execution(finished_job_exe)

        duration = now() - started
        msg = 'Scheduler slaveLost() took %.3f seconds'
        if duration > ScaleScheduler.NORMAL_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())
Example #3
    def slaveLost(self, driver, slaveId):
        """
        Invoked when a slave has been determined unreachable (e.g., machine
        failure, network partition). Most frameworks will need to reschedule
        any tasks launched on this slave on a new slave.

        See documentation for :meth:`mesos_api.mesos.Scheduler.slaveLost`.
        """

        started = now()

        agent_id = slaveId.value
        node = node_mgr.get_node(agent_id)

        if node:
            logger.error('Node lost on host %s', node.hostname)
        else:
            logger.error('Node lost on agent %s', agent_id)

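        # Tell the node and offer managers that this agent has been lost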
        node_mgr.lost_node(agent_id)
        offer_mgr.lost_node(agent_id)

        # Fail job executions that were running on the lost node
        if node:
            for running_job_exe in running_job_mgr.get_job_exes_on_node(node.id):
                try:
                    running_job_exe.execution_lost(started)
                except DatabaseError:
                    logger.exception('Error failing lost job execution: %s', running_job_exe.id)
                    # Error failing execution, add task so it can be reconciled
                    task = running_job_exe.current_task
                    if task:
                        recon_mgr.add_task_ids([task.id])
                if running_job_exe.is_finished():
                    running_job_mgr.remove_job_exe(running_job_exe.id)
                    cleanup_mgr.add_job_execution(running_job_exe)

        duration = now() - started
        msg = 'Scheduler slaveLost() took %.3f seconds'
        if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())
Example #4
    def test_node_with_new_agent_id(self, mock_taskinfo):
        """Tests successfully calling perform_scheduling() when a node get a new agent ID"""
        mock_taskinfo.return_value = MagicMock()

        # Host 2 gets new agent ID of agent_3
        node_mgr.lost_node(self.agent_2)
        node_mgr.register_agents([self.agent_3])
        node_mgr.sync_with_database(scheduler_mgr.config)

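        # Offer resources from the new agent so tasks can be scheduled on it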
        offer = ResourceOffer('offer', self.agent_3.agent_id, self.framework_id,
                              NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now())
        resource_mgr.add_new_offers([offer])

        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._driver, now())
        self.assertEqual(num_tasks, 2)  # Schedule both queued job executions
        # Check that created tasks have the correct agent ID
        calls = self._driver.method_calls
        self.assertEqual(1, len(calls))
        mesos_tasks = calls[0][1][1]
        for mesos_task in mesos_tasks:
            self.assertEqual(self.agent_3.agent_id, mesos_task.slave_id.value)
Example #5
    def test_generate_nodes_status(self):
        """Tests the _generate_nodes_status method"""

        # Setup nodes
        from scheduler.node.manager import node_mgr
        node_mgr.clear()

        nodes = dependency_mgr._generate_nodes_status()
        self.assertDictEqual(
            nodes, {
                'OK': False,
                'detail': {
                    'msg': 'No nodes reported'
                },
                'errors': [{
                    'NODES_OFFLINE': 'No nodes reported.'
                }],
                'warnings': []
            })

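        # Register all ten agents and verify the status reports healthy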
        node_mgr.register_agents([
            self.agent_1, self.agent_2, self.agent_3, self.agent_4,
            self.agent_5, self.agent_6, self.agent_7, self.agent_8,
            self.agent_9, self.agent_10
        ])
        node_mgr.sync_with_database(scheduler_mgr.config)

        nodes = node_mgr.get_nodes()
        self.assertEqual(len(nodes), 10)

        nodes = dependency_mgr._generate_nodes_status()
        self.assertDictEqual(
            nodes, {
                'OK': True,
                'detail': {
                    'msg': 'Enough nodes are online to function.'
                },
                'errors': [],
                'warnings': []
            })

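        # Lose four of the ten nodes so that over a third are offline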
        node_mgr.lost_node(self.agent_1.agent_id)
        node_mgr.lost_node(self.agent_2.agent_id)
        node_mgr.lost_node(self.agent_3.agent_id)
        node_mgr.lost_node(self.agent_4.agent_id)
        nodes = dependency_mgr._generate_nodes_status()
        self.assertDictEqual(
            nodes, {
                'OK': False,
                'detail': {
                    'msg': 'Over a third of nodes are in an error state'
                },
                'errors': [{
                    'NODES_ERRORED': 'Over a third of the nodes are offline or degraded.'
                }],
                'warnings': [{
                    'NODES_OFFLINE': '4 nodes are offline'
                }]
            })