def test_node_with_new_agent_id(self):
    """Tests that perform_scheduling() succeeds when a node gets a new agent ID"""

    # Simulate host 2 coming back under the new agent ID agent_3
    node_mgr.lost_node(self.agent_2)
    node_mgr.register_agents([self.agent_3])
    node_mgr.sync_with_database(scheduler_mgr.config)

    resources = NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)])
    resource_mgr.add_new_offers([ResourceOffer('offer', self.agent_3.agent_id, self.framework_id,
                                               resources, now(), None)])

    scheduling_manager = SchedulingManager()
    num_tasks = scheduling_manager.perform_scheduling(self._client, now())
    # Both queued job executions should have been scheduled
    self.assertEqual(num_tasks, 2)

    # Verify that the created tasks carry the new agent ID
    client_calls = self._client.method_calls
    # First call checks for the driver, second call launches the tasks
    self.assertEqual(2, len(client_calls))
    # Pull the task list off the second call's positional args
    launched_tasks = client_calls[1][1][1]
    for task in launched_tasks:
        self.assertEqual(self.agent_3.agent_id, task['agent_id']['value'])
def slaveLost(self, driver, slaveId):
    """Callback invoked when a slave has been determined unreachable (e.g., machine
    failure, network partition.) Most frameworks will need to reschedule any tasks
    launched on this slave on a new slave.

    See documentation for :meth:`mesos_api.mesos.Scheduler.slaveLost`.
    """
    start = now()

    agent_id = slaveId.value
    node = node_mgr.get_node(agent_id)
    if node:
        logger.warning('Node lost on host %s', node.hostname)
    else:
        # Agent is not a known node; log the raw agent ID instead
        logger.warning('Node lost on agent %s', agent_id)

    # Drop the agent from the node and resource managers so nothing
    # else gets scheduled on it
    node_mgr.lost_node(agent_id)
    resource_mgr.lost_agent(agent_id)

    if node:
        # Fail any job executions that were running on the lost node and
        # queue each finished execution for cleanup
        for finished_job_exe in job_exe_mgr.lost_node(node.id, start):
            cleanup_mgr.add_job_execution(finished_job_exe)

    # Warn (instead of debug) when the handler ran longer than expected
    elapsed = now() - start
    log = logger.warning if elapsed > ScaleScheduler.NORMAL_WARN_THRESHOLD else logger.debug
    log('Scheduler slaveLost() took %.3f seconds', elapsed.total_seconds())
def slaveLost(self, driver, slaveId):
    """
    Invoked when a slave has been determined unreachable (e.g., machine failure, network partition.)
    Most frameworks will need to reschedule any tasks launched on this slave on a new slave.

    See documentation for :meth:`mesos_api.mesos.Scheduler.slaveLost`.
    """
    started = now()

    agent_id = slaveId.value
    node = node_mgr.get_node(agent_id)
    if node:
        logger.error('Node lost on host %s', node.hostname)
    else:
        # Agent is not a known node; log the raw agent ID instead
        logger.error('Node lost on agent %s', agent_id)

    # Remove the agent from the node and offer managers so nothing else is scheduled on it
    node_mgr.lost_node(agent_id)
    offer_mgr.lost_node(agent_id)

    # Fail job executions that were running on the lost node
    if node:
        for running_job_exe in running_job_mgr.get_job_exes_on_node(node.id):
            try:
                running_job_exe.execution_lost(started)
            except DatabaseError:
                logger.exception('Error failing lost job execution: %s', running_job_exe.id)
                # Error failing execution, add task so it can be reconciled
                task = running_job_exe.current_task
                if task:
                    recon_mgr.add_task_ids([task.id])
            if running_job_exe.is_finished():
                # Execution is done: drop it from the running set and queue it for cleanup
                running_job_mgr.remove_job_exe(running_job_exe.id)
                cleanup_mgr.add_job_execution(running_job_exe)

    # Warn (instead of debug) when the handler ran longer than the database threshold
    duration = now() - started
    msg = 'Scheduler slaveLost() took %.3f seconds'
    if duration > ScaleScheduler.DATABASE_WARN_THRESHOLD:
        logger.warning(msg, duration.total_seconds())
    else:
        logger.debug(msg, duration.total_seconds())
def test_node_with_new_agent_id(self, mock_taskinfo):
    """Tests that perform_scheduling() succeeds when a node gets a new agent ID"""
    mock_taskinfo.return_value = MagicMock()

    # Simulate host 2 coming back under the new agent ID agent_3
    node_mgr.lost_node(self.agent_2)
    node_mgr.register_agents([self.agent_3])
    node_mgr.sync_with_database(scheduler_mgr.config)

    resources = NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)])
    resource_mgr.add_new_offers([ResourceOffer('offer', self.agent_3.agent_id, self.framework_id,
                                               resources, now())])

    scheduling_manager = SchedulingManager()
    num_tasks = scheduling_manager.perform_scheduling(self._driver, now())
    # Both queued job executions should have been scheduled
    self.assertEqual(num_tasks, 2)

    # Verify that the launched tasks carry the new agent ID
    driver_calls = self._driver.method_calls
    self.assertEqual(1, len(driver_calls))
    # Pull the task list off the launch call's positional args
    for task in driver_calls[0][1][1]:
        self.assertEqual(self.agent_3.agent_id, task.slave_id.value)
def test_generate_nodes_status(self): """Tests the _generate_nodes_status method""" # Setup nodes from scheduler.node.manager import node_mgr node_mgr.clear() nodes = dependency_mgr._generate_nodes_status() self.assertDictEqual( nodes, { 'OK': False, 'detail': { 'msg': 'No nodes reported' }, 'errors': [{ 'NODES_OFFLINE': 'No nodes reported.' }], 'warnings': [] }) node_mgr.register_agents([ self.agent_1, self.agent_2, self.agent_3, self.agent_4, self.agent_5, self.agent_6, self.agent_7, self.agent_8, self.agent_9, self.agent_10 ]) node_mgr.sync_with_database(scheduler_mgr.config) nodes = node_mgr.get_nodes() self.assertEqual(len(nodes), 10) nodes = dependency_mgr._generate_nodes_status() self.assertDictEqual( nodes, { 'OK': True, 'detail': { 'msg': 'Enough nodes are online to function.' }, 'errors': [], 'warnings': [] }) node_mgr.lost_node(self.agent_1.agent_id) node_mgr.lost_node(self.agent_2.agent_id) node_mgr.lost_node(self.agent_3.agent_id) node_mgr.lost_node(self.agent_4.agent_id) nodes = dependency_mgr._generate_nodes_status() self.assertDictEqual( nodes, { 'OK': False, 'detail': { u'msg': u'Over a third of nodes are in an error state' }, 'errors': [{ 'NODES_ERRORED': 'Over a third of the nodes are offline or degraded.' }], 'warnings': [{ u'NODES_OFFLINE': u'4 nodes are offline' }] })