Exemplo n.º 1
0
    def setUp(self):
        """Builds the scheduler, agent/node, and queue fixtures used by the tests"""
        django.setup()

        reset_error_cache()

        self.framework_id = '1234'
        Scheduler.objects.initialize_scheduler()
        # Prevent message handler tasks from scheduling
        Scheduler.objects.update(num_message_handlers=0)
        self._client = MagicMock()

        scheduler_mgr.sync_with_database()
        scheduler_mgr.update_from_mesos(framework_id=self.framework_id)
        resource_mgr.clear()
        job_exe_mgr.clear()

        # Three agents are created, but only the first two are registered below
        self.agent_1, self.agent_2, self.agent_3 = (
            Agent('agent_1', 'host_1'),
            Agent('agent_2', 'host_2'),
            Agent('agent_3', 'host_2'),
        )
        node_mgr.clear()
        node_mgr.register_agents([self.agent_1, self.agent_2])
        node_mgr.sync_with_database(scheduler_mgr.config)
        # Ignore initial cleanup, health check, and image pull tasks
        for managed_node in node_mgr.get_nodes():
            managed_node._last_health_task = now()
            managed_node._initial_cleanup_completed()
            managed_node._is_image_pulled = True
            managed_node._update_state()
            # Remember the ID of the node backed by agent_1 for the model lookup
            if managed_node.agent_id == 'agent_1':
                self.node_1_id = managed_node.id
        cleanup_mgr.update_nodes(node_mgr.get_nodes())
        self.node_1 = Node.objects.get(id=self.node_1_id)
        # Ignore system tasks
        system_task_mgr._is_db_update_completed = True

        queue_1_requirements = {
            'cpus_required': 4.0,
            'mem_required': 1024.0,
            'disk_in_required': 100.0,
            'disk_out_required': 200.0,
            'disk_total_required': 300.0,
        }
        self.queue_1 = queue_test_utils.create_queue(**queue_1_requirements)

        queue_2_requirements = {
            'cpus_required': 8.0,
            'mem_required': 512.0,
            'disk_in_required': 400.0,
            'disk_out_required': 45.0,
            'disk_total_required': 445.0,
        }
        self.queue_2 = queue_test_utils.create_queue(**queue_2_requirements)

        large_resources = NodeResources([Cpus(125.0), Mem(12048.0), Disk(12048.0)])
        self.queue_large = queue_test_utils.create_queue(resources=large_resources)

        job_type_mgr.sync_with_database()
Exemplo n.º 2
0
    def test_job_type_limit(self, mock_taskinfo):
        """Checks that scheduling respects a job type's max_scheduled limit"""
        mock_taskinfo.return_value = MagicMock()

        Queue.objects.all().delete()
        limited_job_type = job_test_utils.create_job_type()
        limited_job_type.max_scheduled = 4
        limited_job_type.save()
        running_exe = job_test_utils.create_job_exe(job_type=limited_job_type, status='RUNNING')
        # Queue six jobs of the limited type, more than the limit of four allows
        for _ in range(6):
            queue_test_utils.create_queue(job_type=limited_job_type)
        job_type_mgr.sync_with_database()
        # One job of this type is already running
        running_job_mgr.add_job_exes([RunningJobExecution(running_exe)])

        new_offers = [
            ResourceOffer('offer_1', self.node_agent_1, NodeResources(cpus=200.0, mem=102400.0, disk=102400.0)),
            ResourceOffer('offer_2', self.node_agent_2, NodeResources(cpus=200.0, mem=204800.0, disk=204800.0)),
        ]
        offer_mgr.add_new_offers(new_offers)

        # Ignore cleanup tasks
        for managed_node in node_mgr.get_nodes():
            managed_node.initial_cleanup_completed()

        scheduled_count = self._scheduling_thread._perform_scheduling()
        # One is already running, should only be able to schedule 3 more
        self.assertEqual(scheduled_count, 3)
Exemplo n.º 3
0
    def setUp(self):
        """Prepares the managers, two registered nodes, two queued jobs, and the scheduling thread"""
        django.setup()

        Scheduler.objects.initialize_scheduler()
        self._driver = MagicMock()

        scheduler_mgr.sync_with_database()
        offer_mgr.clear()

        self.node_agent_1 = 'agent_1'
        self.node_agent_2 = 'agent_2'
        self.slave_infos = [
            SlaveInfo('host_1', slave_id=self.node_agent_1),
            SlaveInfo('host_2', slave_id=self.node_agent_2),
        ]
        node_mgr.clear()
        node_mgr.register_agent_ids([self.node_agent_1, self.node_agent_2])
        # While syncing, the patched get_slaves returns the fake slave list
        with patch('scheduler.node.manager.api.get_slaves', return_value=self.slave_infos):
            node_mgr.sync_with_database('master_host', 5050)
        # Ignore initial cleanup tasks
        for managed_node in node_mgr.get_nodes():
            managed_node.initial_cleanup_completed()

        queue_1_requirements = {
            'cpus_required': 4.0,
            'mem_required': 1024.0,
            'disk_in_required': 100.0,
            'disk_out_required': 200.0,
            'disk_total_required': 300.0,
        }
        self.queue_1 = queue_test_utils.create_queue(**queue_1_requirements)
        queue_2_requirements = {
            'cpus_required': 8.0,
            'mem_required': 512.0,
            'disk_in_required': 400.0,
            'disk_out_required': 45.0,
            'disk_total_required': 445.0,
        }
        self.queue_2 = queue_test_utils.create_queue(**queue_2_requirements)
        job_type_mgr.sync_with_database()

        self._scheduling_thread = SchedulingThread(self._driver, '123')
Exemplo n.º 4
0
    def _perform_scheduling(self):
        """Performs a single scheduling pass and schedules the accepted tasks

        The node list is pushed to the cleanup and offer managers, the remaining capacity for each limited job type is
        computed, and then node tasks, running job executions, and new job executions are considered in that order.

        :returns: The number of Mesos tasks that were scheduled
        :rtype: int
        """

        when = now()

        # Get updated node and job type models from managers
        current_nodes = node_mgr.get_nodes()
        cleanup_mgr.update_nodes(current_nodes)
        offer_mgr.update_nodes(current_nodes)
        offer_mgr.ready_new_offers()
        self._job_types = job_type_mgr.get_job_types()

        # Start every limited job type at its full limit (job types with a falsy max_scheduled are not tracked)
        self._job_type_limit_available = {
            job_type.id: job_type.max_scheduled
            for job_type in self._job_types.values()
            if job_type.max_scheduled
        }
        # Each job execution already running uses up one slot of its job type's limit
        limits = self._job_type_limit_available
        for job_exe in job_exe_mgr.get_running_job_exes():
            exe_job_type_id = job_exe.job_type_id
            if exe_job_type_id in limits:
                limits[exe_job_type_id] -= 1

        self._consider_node_tasks(when)
        self._consider_running_job_exes()
        self._consider_new_job_exes()

        return self._schedule_accepted_tasks()
Exemplo n.º 5
0
    def _perform_scheduling(self):
        """Performs a single scheduling pass and schedules the accepted tasks

        The node list is pushed to the cleanup and offer managers and the remaining capacity for each limited job type
        is computed. Tasks are then sent for reconciliation, after which cleanup tasks, running job executions, and new
        job executions are considered in that order.

        :returns: The number of Mesos tasks that were scheduled
        :rtype: int
        """

        # Get updated node and job type models from managers
        current_nodes = node_mgr.get_nodes()
        cleanup_mgr.update_nodes(current_nodes)
        offer_mgr.update_nodes(current_nodes)
        offer_mgr.ready_new_offers()
        self._job_types = job_type_mgr.get_job_types()

        # Start every limited job type at its full limit (job types with a falsy max_scheduled are not tracked)
        self._job_type_limit_available = {
            job_type.id: job_type.max_scheduled
            for job_type in self._job_types.values()
            if job_type.max_scheduled
        }
        # Each job execution already running uses up one slot of its job type's limit
        limits = self._job_type_limit_available
        for job_exe in running_job_mgr.get_all_job_exes():
            exe_job_type_id = job_exe.job_type_id
            if exe_job_type_id in limits:
                limits[exe_job_type_id] -= 1

        self._send_tasks_for_reconciliation()
        self._consider_cleanup_tasks()
        self._consider_running_job_exes()
        self._consider_new_job_exes()

        return self._schedule_accepted_tasks()
Exemplo n.º 6
0
    def _prepare_nodes(self, tasks, running_job_exes, when):
        """Builds a scheduling node for every node known to the node manager

        Each scheduling node is given the tasks running on its agent, the job executions running on the node, and the
        agent's resource set (an empty one if the resource manager reports none for that agent).

        :param tasks: The tasks that are currently running
        :type tasks: list
        :param running_job_exes: The currently running job executions
        :type running_job_exes: list
        :param when: The current time
        :type when: :class:`datetime.datetime`
        :returns: The dict of scheduling nodes stored by node ID
        :rtype: dict
        """

        nodes = node_mgr.get_nodes()

        # Index the tasks by the agent they belong to
        agent_tasks = {}  # {Agent ID: List of tasks}
        for running_task in tasks:
            agent_tasks.setdefault(running_task.agent_id, []).append(running_task)

        # Index the job executions by the node they run on
        node_job_exes = {}  # {Node ID: List of running job exes}
        for job_exe in running_job_exes:
            node_job_exes.setdefault(job_exe.node_id, []).append(job_exe)

        agent_resources = resource_mgr.refresh_agent_resources(tasks, when)

        scheduling_nodes = {}  # {Node ID: SchedulingNode}
        for node in nodes:
            # Grab agent ID once since it could change while we are scheduling
            agent_id = node.agent_id

            tasks_for_node = agent_tasks.get(agent_id, [])
            exes_for_node = node_job_exes.get(node.id, [])
            # Only build an empty resource set when the agent has none reported
            resource_set = agent_resources[agent_id] if agent_id in agent_resources else ResourceSet()

            scheduling_node = SchedulingNode(agent_id, node, tasks_for_node, exes_for_node, resource_set)
            scheduling_nodes[scheduling_node.node_id] = scheduling_node
        return scheduling_nodes
Exemplo n.º 7
0
    def test_job_type_limit(self, mock_taskinfo):
        """Checks that scheduling respects a job type's max_scheduled limit"""
        mock_taskinfo.return_value = MagicMock()

        Queue.objects.all().delete()
        limited_job_type = job_test_utils.create_job_type()
        limited_job_type.max_scheduled = 4
        limited_job_type.save()
        running_exe = job_test_utils.create_job_exe(job_type=limited_job_type, status='RUNNING')
        # Queue six jobs of the limited type, more than the limit of four allows
        for _ in range(6):
            queue_test_utils.create_queue(job_type=limited_job_type)
        job_type_mgr.sync_with_database()
        # One job of this type is already running
        job_exe_mgr.schedule_job_exes([RunningJobExecution(running_exe)])

        new_offers = [
            ResourceOffer('offer_1', self.node_agent_1, NodeResources(cpus=200.0, mem=102400.0, disk=102400.0)),
            ResourceOffer('offer_2', self.node_agent_2, NodeResources(cpus=200.0, mem=204800.0, disk=204800.0)),
        ]
        offer_mgr.add_new_offers(new_offers)

        # Ignore Docker pull tasks
        for managed_node in node_mgr.get_nodes():
            managed_node._is_image_pulled = True

        # Ignore cleanup tasks
        for managed_node in node_mgr.get_nodes():
            managed_node._initial_cleanup_completed()
            managed_node._update_state()

        scheduled_count = self._scheduling_thread._perform_scheduling()
        # One is already running, should only be able to schedule 3 more
        self.assertEqual(scheduled_count, 3)
Exemplo n.º 8
0
Arquivo: sync.py Projeto: Fizz11/scale
    def _execute(self):
        """See :meth:`scheduler.threads.base_thread.BaseSchedulerThread._execute`

        Syncs the scheduler, job type, workspace, node, and resource managers, hands finished job executions to the
        cleanup manager, and syncs secrets when a secrets URL is configured.
        """

        scheduler_mgr.sync_with_database()
        job_type_mgr.sync_with_database()
        workspace_mgr.sync_with_database()

        node_mgr.sync_with_database(scheduler_mgr.config)
        cleanup_mgr.update_nodes(node_mgr.get_nodes())
        mesos_master = scheduler_mgr.mesos_address
        resource_mgr.sync_with_mesos(mesos_master.hostname, mesos_master.port)

        # Handle canceled job executions. The job execution manager is synced exactly once per pass: the return value
        # carries the finished executions, so an additional call whose result is discarded would keep those executions
        # from ever being handed to the cleanup manager.
        for finished_job_exe in job_exe_mgr.sync_with_database():
            cleanup_mgr.add_job_execution(finished_job_exe)

        # Secrets are only synced when a secrets backend URL is configured
        if settings.SECRETS_URL:
            secrets_mgr.sync_with_backend()
Exemplo n.º 9
0
    def _execute(self):
        """See :meth:`scheduler.threads.base_thread.BaseSchedulerThread._execute`

        Syncs the managers, then asks the driver to kill each task returned by the job execution manager's sync.
        """

        # These managers are refreshed from the database first, in this order
        for db_synced_mgr in (scheduler_mgr, job_type_mgr, workspace_mgr):
            db_synced_mgr.sync_with_database()

        node_mgr.sync_with_database(scheduler_mgr.config)
        cleanup_mgr.update_nodes(node_mgr.get_nodes())
        master_address = scheduler_mgr.mesos_address
        resource_mgr.sync_with_mesos(master_address.hostname, master_address.port)

        # Kill running tasks for canceled job executions
        tasks_to_kill = job_exe_mgr.sync_with_database()
        for doomed_task in tasks_to_kill:
            pb_task_id = mesos_pb2.TaskID()
            pb_task_id.value = doomed_task.id
            logger.info('Killing task %s', doomed_task.id)
            self._driver.killTask(pb_task_id)

        if settings.SECRETS_URL:
            secrets_mgr.sync_with_backend()
Exemplo n.º 10
0
    def setUp(self):
        """Prepares the managers, two registered nodes, two queued jobs, and the scheduling thread"""
        django.setup()

        Scheduler.objects.initialize_scheduler()
        # The Mesos driver is replaced by a mock
        self._driver = MagicMock()

        scheduler_mgr.sync_with_database()
        offer_mgr.clear()

        self.node_agent_1 = 'agent_1'
        self.node_agent_2 = 'agent_2'
        self.slave_infos = [
            SlaveInfo('host_1', slave_id=self.node_agent_1),
            SlaveInfo('host_2', slave_id=self.node_agent_2)
        ]
        node_mgr.clear()
        node_mgr.register_agent_ids([self.node_agent_1, self.node_agent_2])
        # While syncing, the patched get_slaves returns the fake slave list
        with patch('scheduler.node.manager.api.get_slaves') as mock_get_slaves:
            mock_get_slaves.return_value = self.slave_infos
            node_mgr.sync_with_database('master_host', 5050)
        # Ignore initial cleanup tasks and health check tasks
        for node in node_mgr.get_nodes():
            # NOTE(review): "_last_heath_task" looks like a misspelling of "_last_health_task". Confirm the attribute
            # name on the node class before renaming; if the class spells it differently, this assignment only
            # creates an unused attribute and health check tasks are not actually suppressed.
            node._last_heath_task = now()
            node._initial_cleanup_completed()
            node._update_state()

        # Two queued jobs with different CPU, memory, and disk requirements
        self.queue_1 = queue_test_utils.create_queue(cpus_required=4.0,
                                                     mem_required=1024.0,
                                                     disk_in_required=100.0,
                                                     disk_out_required=200.0,
                                                     disk_total_required=300.0)
        self.queue_2 = queue_test_utils.create_queue(cpus_required=8.0,
                                                     mem_required=512.0,
                                                     disk_in_required=400.0,
                                                     disk_out_required=45.0,
                                                     disk_total_required=445.0)
        job_type_mgr.sync_with_database()

        # Thread under test, built with the mock driver
        self._scheduling_thread = SchedulingThread(self._driver, '123')
Exemplo n.º 11
0
    def test_generate_nodes_status(self):
        """Tests the _generate_nodes_status method with no nodes, ten nodes, and four of the ten lost"""

        # Setup nodes
        from scheduler.node.manager import node_mgr
        node_mgr.clear()

        # With no nodes registered the status is an error
        expected_no_nodes = {
            'OK': False,
            'detail': {'msg': 'No nodes reported'},
            'errors': [{'NODES_OFFLINE': 'No nodes reported.'}],
            'warnings': [],
        }
        self.assertDictEqual(dependency_mgr._generate_nodes_status(), expected_no_nodes)

        all_agents = [
            self.agent_1, self.agent_2, self.agent_3, self.agent_4, self.agent_5,
            self.agent_6, self.agent_7, self.agent_8, self.agent_9, self.agent_10,
        ]
        node_mgr.register_agents(all_agents)
        node_mgr.sync_with_database(scheduler_mgr.config)

        registered_nodes = node_mgr.get_nodes()
        self.assertEqual(len(registered_nodes), 10)

        # With all ten nodes registered the status is healthy
        expected_all_online = {
            'OK': True,
            'detail': {'msg': 'Enough nodes are online to function.'},
            'errors': [],
            'warnings': [],
        }
        self.assertDictEqual(dependency_mgr._generate_nodes_status(), expected_all_online)

        # Losing four of the ten nodes yields an error plus an offline warning
        for lost_agent in all_agents[:4]:
            node_mgr.lost_node(lost_agent.agent_id)
        expected_four_lost = {
            'OK': False,
            'detail': {u'msg': u'Over a third of nodes are in an error state'},
            'errors': [{'NODES_ERRORED': 'Over a third of the nodes are offline or degraded.'}],
            'warnings': [{u'NODES_OFFLINE': u'4 nodes are offline'}],
        }
        self.assertDictEqual(dependency_mgr._generate_nodes_status(), expected_four_lost)