def __init__(self, agent_id, node, scheduler_config): """Constructor :param agent_id: The Mesos agent ID for the node :type agent_id: string :param node: The node model :type node: :class:`node.models.Node` :param scheduler_config: The scheduler configuration :type scheduler_config: :class:`scheduler.configuration.SchedulerConfiguration` """ self._hostname = str(node.hostname) # Never changes self._id = node.id # Never changes self._agent_id = agent_id self._cleanup = NodeCleanup() self._cleanup_task = None self._conditions = NodeConditions(self._hostname) self._health_task = None self._is_active = node.is_active self._is_image_pulled = False self._is_initial_cleanup_completed = False self._is_online = True self._is_paused = node.is_paused self._is_scheduler_paused = scheduler_config.is_paused self._last_health_task = None self._last_offer_received = node.last_offer_received self._lock = threading.Lock() self._pull_task = None self._state = None self._update_state()
def test_handle_regular_cleanup_task(self): """Tests handling a regular cleanup task""" node = Node(self.node_agent, self.node) node.initial_cleanup_completed() node_cleanup = NodeCleanup(node) # No task since there are no job executions to clean self.assertIsNone(node_cleanup.get_next_task()) # Add job execution and complete task to clean it up job_exe = RunningJobExecution(self.job_exe) node_cleanup.add_job_execution(job_exe) task = node_cleanup.get_next_task() self.assertIsNotNone(task) self.assertFalse(task.is_initial_cleanup) self.assertListEqual(task.job_exes, [job_exe]) task.launch(now()) update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now()) node_cleanup.handle_task_update(update) update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.FINISHED, now()) node_cleanup.handle_task_update(update) # No task since all job executions have been cleaned self.assertIsNone(node_cleanup.get_next_task())
def test_paused_node(self): """Tests not returning tasks when its node is paused""" paused_node = node_test_utils.create_node(hostname='host_1', slave_id=self.node_agent) paused_node.is_paused = True node = Node(self.node_agent, paused_node) node_cleanup = NodeCleanup(node) task = node_cleanup.get_next_task() # No task due to paused node self.assertIsNone(task)
def _reset_node(self): """Resets the node if it goes away so initial cleanup and the Scale image pull occur when the node comes back. Caller must have obtained the node's thread lock. """ self._cleanup = NodeCleanup() self._cleanup_task = None self._health_task = None self._is_image_pulled = False self._is_initial_cleanup_completed = False self._last_health_task = None self._pull_task = None
def test_handle_killed_task(self): """Tests handling killed cleanup task""" node = Node(self.node_agent, self.node) node_cleanup = NodeCleanup(node) # Get initial cleanup task task = node_cleanup.get_next_task() task_1_id = task.id self.assertIsNotNone(task) # Kill task after running and get different task next time task.launch(now()) update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now()) node_cleanup.handle_task_update(update) update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.KILLED, now()) node_cleanup.handle_task_update(update) task = node_cleanup.get_next_task() self.assertIsNotNone(task) self.assertNotEqual(task.id, task_1_id) self.assertFalse(node.is_initial_cleanup_completed)
def test_handle_initial_cleanup_task(self): """Tests handling the initial cleanup task""" node = Node(self.node_agent, self.node) node_cleanup = NodeCleanup(node) # Get initial cleanup task task = node_cleanup.get_next_task() self.assertIsNotNone(task) self.assertTrue(task.is_initial_cleanup) self.assertEqual(task.agent_id, self.node_agent) # Schedule initial cleanup and make sure no new task is ready task.launch(now()) self.assertIsNone(node_cleanup.get_next_task()) self.assertFalse(node.is_initial_cleanup_completed) # Complete initial clean up, verify no new task update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now()) node_cleanup.handle_task_update(update) update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.FINISHED, now()) node_cleanup.handle_task_update(update) self.assertIsNone(node_cleanup.get_next_task()) self.assertTrue(node.is_initial_cleanup_completed)
class Node(object): """This class represents a node in the scheduler. It combines information retrieved from the database node models as well as run-time information retrieved from Mesos. This class is thread-safe.""" # Thresholds for when to schedule node tasks again that have failed CLEANUP_ERR_THRESHOLD = datetime.timedelta(minutes=2) HEALTH_ERR_THRESHOLD = datetime.timedelta(minutes=2) IMAGE_PULL_ERR_THRESHOLD = datetime.timedelta(minutes=5) # Normal health check task threshold NORMAL_HEALTH_THRESHOLD = datetime.timedelta(minutes=5) # Node States deprecated_desc = 'Node is deprecated and will not be used by Scale. Existing jobs on the node will be failed.' DEPRECATED = NodeState(state='DEPRECATED', title='Deprecated', description=deprecated_desc) OFFLINE = NodeState( state='OFFLINE', title='Offline', description= 'Node is offline/unavailable, so no jobs can currently run on it.') stopped_desc = 'Scheduler is paused, so no new jobs will be scheduled. Existing jobs will continue to run.' SCHEDULER_STOPPED = NodeState(state='SCHEDULER_STOPPED', title='Scheduler Stopped', description=stopped_desc) paused_desc = 'Node is paused, so no new jobs will be scheduled. Existing jobs will continue to run.' PAUSED = NodeState(state='PAUSED', title='Paused', description=paused_desc) degraded_desc = 'Node has an error condition, putting it in a degraded state.' degraded_desc += ' New jobs will not be scheduled, and the node will attempt to continue to run existing jobs.' DEGRADED = NodeState(state='DEGRADED', title='Degraded', description=degraded_desc) cleanup_desc = 'Node is performing an initial cleanup step to remove existing Scale Docker containers and volumes.' INITIAL_CLEANUP = NodeState(state='INITIAL_CLEANUP', title='Cleaning up', description=cleanup_desc) pull_desc = 'Node is pulling the Scale Docker image.' IMAGE_PULL = NodeState(state='IMAGE_PULL', title='Pulling image', description=pull_desc) READY = NodeState(state='READY', title='Ready', description='Node is ready to run new jobs.') def __init__(self, agent_id, node, scheduler_config): """Constructor :param agent_id: The Mesos agent ID for the node :type agent_id: string :param node: The node model :type node: :class:`node.models.Node` :param scheduler_config: The scheduler configuration :type scheduler_config: :class:`scheduler.configuration.SchedulerConfiguration` """ self._hostname = str(node.hostname) # Never changes self._id = node.id # Never changes self._agent_id = agent_id self._cleanup = NodeCleanup() self._cleanup_task = None self._conditions = NodeConditions(self._hostname) self._health_task = None self._is_active = node.is_active self._is_image_pulled = False self._is_initial_cleanup_completed = False self._is_online = True self._is_paused = node.is_paused self._is_scheduler_paused = scheduler_config.is_paused self._last_health_task = None self._last_offer_received = node.last_offer_received self._lock = threading.Lock() self._pull_task = None self._state = None self._update_state() @property def agent_id(self): """Returns the agent ID of the node :returns: The agent ID :rtype: string """ return self._agent_id @property def hostname(self): """Returns the hostname of the node :returns: The hostname :rtype: string """ return self._hostname @property def id(self): """Returns the ID of the node :returns: The node ID :rtype: int """ return self._id @property def is_active(self): """Indicates whether this node is active (True) or not (False) :returns: Whether this node is active :rtype: bool """ return self._is_active def add_job_execution(self, job_exe): """Adds a job execution that needs to be cleaned up :param job_exe: The job execution to add :type job_exe: :class:`job.execution.job_exe.RunningJobExecution` """ with self._lock: self._cleanup.add_job_execution(job_exe) self._conditions.update_cleanup_count( self._cleanup.get_num_job_exes()) def generate_status_json(self, nodes_list): """Generates the portion of the status JSON that describes this node :param nodes_list: The list of nodes within the status JSON :type nodes_list: :func:`list` """ with self._lock: state_dict = { 'name': self._state.state, 'title': self._state.title, 'description': self._state.description } node_dict = { 'id': self._id, 'hostname': self._hostname, 'agent_id': self._agent_id, 'is_active': self._is_active, 'state': state_dict } self._conditions.generate_status_json(node_dict) nodes_list.append(node_dict) def get_next_tasks(self, when): """Returns the next node tasks to launch :param when: The current time :type when: :class:`datetime.datetime` :returns: The list of node tasks to launch :rtype: [:class:`job.tasks.base_task.Task`] """ with self._lock: self._create_next_tasks(when) tasks = [] # Check if ready for cleanup task and it hasn't been launched yet if self._is_ready_for_cleanup_task(when): if self._cleanup_task and not self._cleanup_task.has_been_launched: tasks.append(self._cleanup_task) # Check if ready for health check task and it hasn't been launched yet if self._is_ready_for_health_task( when ) and self._health_task and not self._health_task.has_been_launched: tasks.append(self._health_task) # Check if ready for pull task and it hasn't been launched yet if self._is_ready_for_pull_task( when ) and self._pull_task and not self._pull_task.has_been_launched: tasks.append(self._pull_task) return tasks def handle_task_timeout(self, task): """Handles the timeout of the given node task :param task: The task :type task: :class:`job.tasks.base_task.Task` """ with self._lock: if self._cleanup_task and self._cleanup_task.id == task.id: logger.warning('Cleanup task on node %s timed out', self._hostname) self._conditions.handle_cleanup_task_timeout( self._cleanup_task.job_exes) if self._cleanup_task.has_ended: self._cleanup_task = None elif self._health_task and self._health_task.id == task.id: logger.warning('Health check task on node %s timed out', self._hostname) if self._health_task.has_ended: self._health_task = None self._last_health_task = now() self._conditions.handle_health_task_timeout() elif self._pull_task and self._pull_task.id == task.id: logger.warning('Scale image pull task on node %s timed out', self._hostname) if self._pull_task.has_ended: self._pull_task = None self._conditions.handle_pull_task_timeout() self._update_state() def handle_task_update(self, task_update): """Handles the given task update :param task_update: The task update :type task_update: :class:`job.tasks.update.TaskStatusUpdate` """ with self._lock: if self._cleanup_task and self._cleanup_task.id == task_update.task_id: self._handle_cleanup_task_update(task_update) elif self._health_task and self._health_task.id == task_update.task_id: self._handle_health_task_update(task_update) elif self._pull_task and self._pull_task.id == task_update.task_id: self._handle_pull_task_update(task_update) self._update_state() def is_ready_for_new_job(self): """Indicates whether this node is ready to launch a new job execution :returns: True if this node is ready to launch a new job execution, False otherwise :rtype: bool """ return self._state == Node.READY def is_ready_for_next_job_task(self): """Indicates whether this node is ready to launch the next task of a job execution :returns: True if this node is ready to launch a job task, False otherwise :rtype: bool """ return self._state not in [Node.DEPRECATED, Node.OFFLINE ] and self._is_image_pulled def is_ready_for_system_task(self): """Indicates whether this node is ready to launch a new system task :returns: True if this node is ready to launch a new system task, False otherwise :rtype: bool """ return self._state == Node.READY def should_be_removed(self): """Indicates whether this node should be removed from the scheduler. If the node is no longer active and is also no longer online, there's no reason for the scheduler to continue to track it. :returns: True if this node should be removed from the scheduler :rtype: bool """ if not self._is_active and not self._is_online: return True elif self._last_offer_received < (now() - datetime.timedelta(minutes=5)): # Scale has not received an offer for the node in the last 5 min return True else: return False def update_from_mesos(self, agent_id=None, is_online=None): """Updates this node's data from Mesos :param agent_id: The Mesos agent ID for the node :type agent_id: string :param is_online: Whether the Mesos agent is online :type is_online: bool """ with self._lock: if agent_id: self._agent_id = agent_id if is_online is not None: self._is_online = is_online from node.models import Node if is_online: # If node comes online, sync database node state with scheduler node state Node.objects.update_node({'is_active': True}, node_id=self._id) else: # If node goes offline, sync scheduler node state with database node state and reset it Node.objects.update_node({'is_active': False}, node_id=self._id) self._reset_node() self._update_state() def update_from_model(self, node, scheduler_config): """Updates this node's data from the database models :param node: The node model :type node: :class:`node.models.Node` :param scheduler_config: The scheduler configuration :type scheduler_config: :class:`scheduler.configuration.SchedulerConfiguration` """ if self.id != node.id: raise Exception( 'Trying to update node from incorrect database model') with self._lock: self._is_active = node.is_active self._is_paused = node.is_paused self._last_offer_received = node.last_offer_received self._is_scheduler_paused = scheduler_config.is_paused if not node.is_active: # If node is deprecated, reset it self._reset_node() self._update_state() def _create_next_tasks(self, when): """Creates the next tasks that needs to be run for this node. Caller must have obtained the thread lock. :param when: The current time :type when: :class:`datetime.datetime` """ # If we have a cleanup task, check that node's agent ID has not changed if self._cleanup_task and self._cleanup_task.agent_id != self._agent_id: self._cleanup_task = None if not self._cleanup_task and self._is_ready_for_cleanup_task(when): self._cleanup_task = self._cleanup.create_next_task( self._agent_id, self._hostname, self._is_initial_cleanup_completed) # If we have a health task, check that node's agent ID has not changed if self._health_task and self._health_task.agent_id != self._agent_id: self._health_task = None if not self._health_task and self._is_ready_for_health_task(when): self._health_task = HealthTask(scheduler_mgr.framework_id, self._agent_id) # If we have a pull task, check that node's agent ID has not changed if self._pull_task and self._pull_task.agent_id != self._agent_id: self._pull_task = None if not self._pull_task and self._is_ready_for_pull_task(when): self._pull_task = PullTask(scheduler_mgr.framework_id, self._agent_id) def _image_pull_completed(self): """Tells this node that its image pull task has succeeded. Caller must have obtained the thread lock. """ logger.info('Node %s has finished pulling the Scale image', self._hostname) self._is_image_pulled = True def _initial_cleanup_completed(self): """Tells this node that its initial cleanup task has succeeded. Caller must have obtained the thread lock. """ logger.info('Node %s has completed initial clean up', self._hostname) self._is_initial_cleanup_completed = True def _is_ready_for_cleanup_task(self, when): """Indicates whether this node is ready to launch a cleanup task. Caller must have obtained the thread lock. :param when: The current time :type when: :class:`datetime.datetime` :returns: True if this node is ready to launch a cleanup task, False otherwise :rtype: bool """ if self._state in [Node.INITIAL_CLEANUP, Node.IMAGE_PULL, Node.READY]: return True elif self._state == Node.DEGRADED: # The cleanup task can be scheduled during DEGRADED state as long as the Docker daemon is OK if self._conditions.is_daemon_bad: return False # Schedule cleanup task if threshold has passed since last cleanup task error last_cleanup_error = self._conditions.last_cleanup_task_error() if last_cleanup_error: return when - last_cleanup_error > Node.CLEANUP_ERR_THRESHOLD # No cleanup error return True return False def _is_ready_for_health_task(self, when): """Indicates whether this node is ready to launch a health check task. Caller must have obtained the thread lock. :param when: The current time :type when: :class:`datetime.datetime` :returns: True if this node is ready to launch a health check task, False otherwise :rtype: bool """ if self._state in [Node.DEPRECATED, Node.OFFLINE]: return False elif not self._conditions.is_health_check_normal: # Schedule health task if threshold has passed since last health task error return not self._last_health_task or (when - self._last_health_task > Node.HEALTH_ERR_THRESHOLD) # Node is normal, use normal threshold for when to schedule next health check return not self._last_health_task or (when - self._last_health_task > Node.NORMAL_HEALTH_THRESHOLD) def _is_ready_for_pull_task(self, when): """Indicates whether this node is ready to launch the pull task for the Scale Docker image. Caller must have obtained the thread lock. :param when: The current time :type when: :class:`datetime.datetime` :returns: True if this node is ready to launch a pull task, False otherwise :rtype: bool """ if self._state == Node.IMAGE_PULL: return True elif self._state == Node.DEGRADED: # The pull task can be scheduled during DEGRADED state if other conditions match IMAGE_PULL state and the # DEGRADED errors do not affect the ability to do an image pull # Make sure initial cleanup is done, image pull is not done, and image pull on the node works if not self._is_initial_cleanup_completed or self._is_image_pulled or self._conditions.is_pull_bad: return False # Schedule pull task if threshold has passed since last pull task error last_pull_error = self._conditions.last_image_pull_task_error() if last_pull_error: return when - last_pull_error > Node.IMAGE_PULL_ERR_THRESHOLD # No pull error return True return False def _handle_cleanup_task_update(self, task_update): """Handles the given task update for a cleanup task. Caller must have obtained the thread lock. :param task_update: The cleanup task update :type task_update: :class:`job.tasks.update.TaskStatusUpdate` """ if task_update.status == TaskStatusUpdate.FINISHED: if self._cleanup_task.is_initial_cleanup: self._initial_cleanup_completed() else: # Clear job executions that were cleaned up self._cleanup.delete_job_executions( self._cleanup_task.job_exes) self._conditions.update_cleanup_count( self._cleanup.get_num_job_exes()) self._conditions.handle_cleanup_task_completed() elif task_update.status == TaskStatusUpdate.FAILED: logger.warning('Cleanup task on node %s failed', self._hostname) self._conditions.handle_cleanup_task_failed( self._cleanup_task.job_exes) elif task_update.status == TaskStatusUpdate.KILLED: logger.warning('Cleanup task on node %s killed', self._hostname) elif task_update.status == TaskStatusUpdate.LOST: logger.warning('Cleanup task on node %s lost', self._hostname) self._cleanup_task = None if self._cleanup_task and self._cleanup_task.has_ended: self._cleanup_task = None def _handle_health_task_update(self, task_update): """Handles the given task update for a health check task. Caller must have obtained the thread lock. :param task_update: The health check task update :type task_update: :class:`job.tasks.update.TaskStatusUpdate` """ if task_update.status == TaskStatusUpdate.FINISHED: self._last_health_task = now() self._conditions.handle_health_task_completed() elif task_update.status == TaskStatusUpdate.FAILED: logger.warning('Health check task on node %s failed', self._hostname) self._last_health_task = now() self._conditions.handle_health_task_failed(task_update) elif task_update.status == TaskStatusUpdate.KILLED: logger.warning('Health check task on node %s killed', self._hostname) elif task_update.status == TaskStatusUpdate.LOST: logger.warning('Health check task on node %s lost', self._hostname) self._health_task = None if self._health_task and self._health_task.has_ended: self._health_task = None def _handle_pull_task_update(self, task_update): """Handles the given task update for a pull task. Caller must have obtained the thread lock. :param task_update: The pull task update :type task_update: :class:`job.tasks.update.TaskStatusUpdate` """ if task_update.status == TaskStatusUpdate.FINISHED: self._image_pull_completed() self._conditions.handle_pull_task_completed() elif task_update.status == TaskStatusUpdate.FAILED: logger.warning('Scale image pull task on node %s failed', self._hostname) self._conditions.handle_pull_task_failed() elif task_update.status == TaskStatusUpdate.KILLED: logger.warning('Scale image pull task on node %s killed', self._hostname) elif task_update.status == TaskStatusUpdate.LOST: logger.warning('Scale image pull task on node %s lost', self._hostname) self._pull_task = None if self._pull_task and self._pull_task.has_ended: self._pull_task = None def _reset_node(self): """Resets the node if it goes away so initial cleanup and the Scale image pull occur when the node comes back. Caller must have obtained the node's thread lock. """ self._cleanup = NodeCleanup() self._cleanup_task = None self._health_task = None self._is_image_pulled = False self._is_initial_cleanup_completed = False self._last_health_task = None self._pull_task = None def _update_state(self): """Updates the node's state. Caller must have obtained the node's thread lock. """ old_state = self._state if not self._is_active: self._state = self.DEPRECATED elif not self._is_online: self._state = self.OFFLINE elif self._is_paused: self._state = self.PAUSED elif self._is_scheduler_paused: self._state = self.SCHEDULER_STOPPED elif self._conditions.has_active_errors(): self._state = self.DEGRADED elif not self._is_initial_cleanup_completed: self._state = self.INITIAL_CLEANUP elif not self._is_image_pulled: self._state = self.IMAGE_PULL else: self._state = self.READY if old_state and old_state != self._state: if self._state == self.DEGRADED: logger.warning('Node %s is now in %s state', self._hostname, self._state.state) else: logger.info('Node %s is now in %s state', self._hostname, self._state.state)
def test_handle_lost_tasks(self): """Tests handling lost cleanup tasks""" node = Node(self.node_agent, self.node) node_cleanup = NodeCleanup(node) # Get initial cleanup task task = node_cleanup.get_next_task() task_1_id = task.id self.assertIsNotNone(task) # Lose task without scheduling and get same task again update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now()) node_cleanup.handle_task_update(update) task = node_cleanup.get_next_task() self.assertIsNotNone(task) self.assertEqual(task.id, task_1_id) self.assertFalse(node.is_initial_cleanup_completed) # Lose task with scheduling and get same task again task.launch(now()) update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now()) node_cleanup.handle_task_update(update) task = node_cleanup.get_next_task() self.assertIsNotNone(task) self.assertEqual(task.id, task_1_id) self.assertFalse(node.is_initial_cleanup_completed) # Lose task after running and get same task again task.launch(now()) update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now()) node_cleanup.handle_task_update(update) update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.LOST, now()) node_cleanup.handle_task_update(update) task = node_cleanup.get_next_task() self.assertIsNotNone(task) self.assertEqual(task.id, task_1_id) self.assertFalse(node.is_initial_cleanup_completed)