def test_scheduler_event_handlers(mocker):
    """Check that SchedulerProxy forwards every Mesos callback to the scheduler.

    Each driver-facing callback is invoked exactly once on the proxy; afterwards
    the matching ``on_*`` handler of the wrapped scheduler mock must have been
    called exactly once.
    """
    sched = mocker.Mock()
    driver = mocker.Mock()
    proxy = SchedulerProxy(sched)

    proxy.executorLost(driver, mesos_pb2.ExecutorInfo(),
                       mesos_pb2.ExecutorID(), 1)
    proxy.frameworkMessage(driver, mesos_pb2.ExecutorID(),
                           mesos_pb2.SlaveID(), 'message')
    proxy.offerRescinded(driver, mesos_pb2.OfferID())
    proxy.registered(driver, mesos_pb2.FrameworkID(), mesos_pb2.MasterInfo())
    proxy.resourceOffers(driver, [mesos_pb2.Offer(), mesos_pb2.Offer()])
    proxy.slaveLost(driver, mesos_pb2.SlaveID())
    proxy.statusUpdate(driver, mesos_pb2.TaskStatus())
    proxy.reregistered(driver, mesos_pb2.MasterInfo())
    proxy.error(driver, 'message')
    proxy.disconnected(driver)

    # Compare call_count explicitly rather than calling assert_called_once():
    # on mock versions that predate that helper (it was added to unittest.mock
    # in Python 3.6) the attribute is auto-created by Mock, so the call is a
    # silent no-op and the test could never fail.
    expected_handlers = (
        'on_executor_lost',
        'on_message',
        'on_rescinded',
        'on_registered',
        'on_offers',
        'on_slave_lost',
        'on_update',
        'on_reregistered',
        'on_error',
        'on_disconnected',
    )
    for handler_name in expected_handlers:
        handler = getattr(sched, handler_name)
        assert handler.call_count == 1, (
            '%s was called %d time(s), expected exactly 1'
            % (handler_name, handler.call_count))
def _schedule_accepted_tasks(self):
    """Starts and launches the tasks for every job execution accepted in the popped offers

    Running job executions get their next task started, queued job executions are scheduled and get their first
    task started, and the resulting Mesos tasks are launched per node against that node's offers.

    :returns: The number of Mesos tasks that were scheduled
    :rtype: int
    """

    accepted_offers = self._offer_manager.pop_offers_with_accepted_job_exes()
    tasks_by_node = {}  # {Node ID: [Mesos Tasks]}
    new_job_exes = []

    for offers in accepted_offers:
        node_tasks = tasks_by_node[offers.node.id] = []

        # Already running job executions that were accepted move on to their next task
        for running_exe in offers.get_accepted_running_job_exes():
            next_task = running_exe.start_next_task()
            if next_task:
                node_tasks.append(create_mesos_task(next_task))

        # Queued job executions that were accepted are collected and scheduled together below
        new_job_exes.extend(offers.get_accepted_new_job_exes())

    try:
        # Schedule the queued job executions, then start the first task of each
        workspaces = self._workspace_manager.get_workspaces()
        scheduled_exes = self._schedule_queued_job_executions(new_job_exes, workspaces)
        self._job_exe_manager.add_job_exes(scheduled_exes)
        for scheduled_exe in scheduled_exes:
            first_task = scheduled_exe.start_next_task()
            if first_task:
                tasks_by_node[scheduled_exe.node_id].append(create_mesos_task(first_task))
    except OperationalError:
        logger.exception('Failed to schedule queued job executions')

    # Hand each node's tasks to Mesos together with that node's offers
    launched_tasks = 0
    used_nodes = 0
    for offers in accepted_offers:
        node_tasks = tasks_by_node[offers.node.id]
        task_count = len(node_tasks)
        if task_count:
            launched_tasks += task_count
            used_nodes += 1

        offer_messages = []
        for raw_offer_id in offers.offer_ids:
            offer_message = mesos_pb2.OfferID()
            offer_message.value = raw_offer_id
            offer_messages.append(offer_message)

        # launchTasks is called even when the node ended up with no tasks
        self._driver.launchTasks(offer_messages, node_tasks)

    if launched_tasks:
        logger.info('Launched %i Mesos task(s) on %i node(s)', launched_tasks, used_nodes)
    return launched_tasks
def setUp(self):
    """Create the Mesos protobuf messages that serve as fixtures on the test case."""
    framework_id = mesos_pb2.FrameworkID(value=self.FRAMEWORK_ID)
    framework_info = mesos_pb2.FrameworkInfo(
        user='******',
        name='fake_framework_name',
    )
    command_info = mesos_pb2.CommandInfo(value='fake-command')
    executor_id = mesos_pb2.ExecutorID(value='fake-executor-id')
    # The executor info ties together the executor ID, framework ID and command built above
    executor_info = mesos_pb2.ExecutorInfo(
        executor_id=executor_id,
        framework_id=framework_id,
        command=command_info,
    )
    slave_id = mesos_pb2.SlaveID(value='fake-slave-id')
    offer_id = mesos_pb2.OfferID(value='1')

    # Publish the fixtures as attributes of the test case
    self.framework_id = framework_id
    self.framework_info = framework_info
    self.command_info = command_info
    self.executor_id = executor_id
    self.executor_info = executor_info
    self.slave_id = slave_id
    self.offer_id = offer_id
def run(self):
    """Runs the scheduling loop for as long as ``self._running`` is true

    Each pass performs one round of scheduling and logs how long it took (as a warning when the duration exceeds
    ``SchedulingThread.SCHEDULE_LOOP_WARN_THRESHOLD``). When a pass schedules no tasks, all offers held by the
    offer manager are declined and the thread sleeps for ``SchedulingThread.DELAY`` seconds.
    """

    logger.info('Scheduling thread started')

    while self._running:
        loop_start = now()

        try:
            scheduled_count = self._perform_scheduling()
        except Exception:
            # A failed pass counts as having scheduled nothing
            scheduled_count = 0
            logger.exception('Critical error in scheduling thread')

        elapsed = now() - loop_start
        too_slow = elapsed > SchedulingThread.SCHEDULE_LOOP_WARN_THRESHOLD
        log_duration = logger.warning if too_slow else logger.debug
        log_duration('Scheduling thread loop took %.3f seconds', elapsed.total_seconds())

        if scheduled_count != 0:
            continue

        # Since we didn't schedule anything, give resources back to Mesos and pause a moment
        try:
            for offers in offer_mgr.pop_all_offers():
                for raw_offer_id in offers.offer_ids:
                    offer_message = mesos_pb2.OfferID()
                    offer_message.value = raw_offer_id
                    self._driver.declineOffer(offer_message)
        except Exception:
            logger.exception('Critical error in scheduling thread')

        logger.debug('Scheduling thread is pausing for %i second(s)', SchedulingThread.DELAY)
        time.sleep(SchedulingThread.DELAY)

    logger.info('Scheduling thread stopped')
def _launch_tasks(self, driver, nodes):
    """Starts the allocated tasks on the given nodes and launches them in Mesos

    The tasks are first started on each node and registered with the task manager; afterwards every node that has
    allocated offers gets a ``launchTasks`` call with its offers and tasks. A failure to launch on one node is
    logged and does not stop the remaining nodes from being processed.

    :param driver: The Mesos scheduler driver
    :type driver: :class:`mesos_api.mesos.SchedulerDriver`
    :param nodes: The dict of all scheduling nodes stored by node ID
    :type nodes: dict
    :returns: The number of tasks that were launched and the number of offers accepted
    :rtype: tuple
    """

    started = now()

    # Start the tasks on every node and hand the complete list to the task manager
    tasks_to_register = []
    for node in nodes.values():
        node.start_job_exe_tasks()
        tasks_to_register.extend(node.allocated_tasks)
    task_mgr.launch_tasks(tasks_to_register, started)

    # Launch the tasks in Mesos, node by node
    nodes_with_tasks = 0
    nodes_with_offers = 0
    offer_total = 0
    task_total = 0
    offered_resources = NodeResources()
    used_resources = NodeResources()

    for node in nodes.values():
        offer_messages = []
        task_messages = []

        for offer in node.allocated_offers:
            offer_total += 1
            offered_resources.add(offer.resources)
            offer_message = mesos_pb2.OfferID()
            offer_message.value = offer.id
            offer_messages.append(offer_message)

        node_tasks = node.allocated_tasks
        for task in node_tasks:
            used_resources.add(task.get_resources())
            task_messages.append(create_mesos_task(task))

        node_task_count = len(node_tasks)
        task_total += node_task_count
        if node_task_count:
            nodes_with_tasks += 1

        # Nothing is sent to Mesos for a node without allocated offers
        if not offer_messages:
            continue

        nodes_with_offers += 1
        try:
            driver.launchTasks(offer_messages, task_messages)
        except Exception:
            logger.exception('Error occurred while launching tasks on node %s', node.hostname)

    elapsed = now() - started
    log_duration = logger.warning if elapsed > LAUNCH_TASK_WARN_THRESHOLD else logger.debug
    log_duration('Launching tasks took %.3f seconds', elapsed.total_seconds())

    # Whatever was offered but not consumed by the launched tasks is reported as declined
    declined_resources = NodeResources()
    declined_resources.add(offered_resources)
    declined_resources.subtract(used_resources)

    if offer_total:
        logger.info(
            'Accepted %d offer(s) from %d node(s), launched %d task(s) with %s on %d node(s), declined %s',
            offer_total, nodes_with_offers, task_total, used_resources, nodes_with_tasks, declined_resources)

    return task_total, offer_total