def _schedule_accepted_tasks(self):
    """Launches on Mesos every task that was accepted against the popped offers

    Three sources of tasks are gathered per node: tasks accepted directly on the node's offers, the next task of
    each accepted running job execution, and the first task of each queued job execution that gets scheduled here.

    :returns: The number of Mesos tasks that were scheduled
    :rtype: int
    """

    def _as_mesos_offer_id(raw_offer_id):
        # Wrap a plain offer ID value in the protobuf message expected by the driver
        wrapped = mesos_pb2.OfferID()
        wrapped.value = raw_offer_id
        return wrapped

    # A single timestamp is shared by every task launched in this pass
    launch_time = now()
    mesos_tasks_by_node = {}  # {Node ID: [Mesos Tasks]}
    new_job_exes = []
    accepted_offers = offer_mgr.pop_offers_with_accepted_job_exes()

    for node_offers in accepted_offers:
        node_tasks = []
        mesos_tasks_by_node[node_offers.node.id] = node_tasks

        # Tasks accepted directly on the offers (cleanup tasks)
        for accepted_task in node_offers.get_accepted_tasks():
            accepted_task.launch(launch_time)
            node_tasks.append(create_mesos_task(accepted_task))

        # Running job executions that were accepted move on to their next task, if they have one
        for running_exe in node_offers.get_accepted_running_job_exes():
            next_task = running_exe.start_next_task()
            if not next_task:
                continue
            next_task.launch(launch_time)
            node_tasks.append(create_mesos_task(next_task))

        # Queued job executions are collected across all nodes and scheduled together below
        new_job_exes.extend(node_offers.get_accepted_new_job_exes())

    try:
        # Schedule the queued job executions, register them as running, and start their first tasks
        workspaces = workspace_mgr.get_workspaces()
        scheduled = self._schedule_queued_job_executions(new_job_exes, workspaces)
        running_job_mgr.add_job_exes(scheduled)
        for scheduled_exe in scheduled:
            first_task = scheduled_exe.start_next_task()
            if not first_task:
                continue
            first_task.launch(launch_time)
            mesos_tasks_by_node[scheduled_exe.node_id].append(create_mesos_task(first_task))
    except OperationalError:
        # Tasks gathered before the failure are still launched below
        logger.exception('Failed to schedule queued job executions')

    # Hand the tasks to Mesos. launchTasks() is called for every node's offers, even when the node's task list
    # ended up empty (presumably so the unused offers are released; confirm against the driver docs).
    launched_total = 0
    nodes_used = 0
    for node_offers in accepted_offers:
        node_tasks = mesos_tasks_by_node[node_offers.node.id]
        launched_here = len(node_tasks)
        launched_total += launched_here
        if launched_here:
            nodes_used += 1
        offer_ids = [_as_mesos_offer_id(raw_id) for raw_id in node_offers.offer_ids]
        self._driver.launchTasks(offer_ids, node_tasks)

    if launched_total:
        logger.info('Launched %i Mesos task(s) on %i node(s)', launched_total, nodes_used)
    return launched_total
def _schedule_accepted_tasks(self):
    """Launches on Mesos the tasks of every job execution accepted against the popped offers

    For each node, the next task of every accepted running job execution is started; accepted queued job
    executions are scheduled as a batch and their first tasks are started as well.

    :returns: The number of Mesos tasks that were scheduled
    :rtype: int
    """

    node_task_lists = {}  # {Node ID: [Mesos Tasks]}
    pending_job_exes = []
    popped_offers = self._offer_manager.pop_offers_with_accepted_job_exes()

    for offers_for_node in popped_offers:
        current_tasks = []
        node_task_lists[offers_for_node.node.id] = current_tasks

        # Advance each accepted running job execution; it may have no further task to start
        for job_exe in offers_for_node.get_accepted_running_job_exes():
            started_task = job_exe.start_next_task()
            if started_task:
                current_tasks.append(create_mesos_task(started_task))

        # Remember the accepted queued job executions so they can be scheduled in one batch
        pending_job_exes += list(offers_for_node.get_accepted_new_job_exes())

    try:
        # Schedule the queued job executions, hand them to the job execution manager, and start their first tasks
        workspaces = self._workspace_manager.get_workspaces()
        newly_scheduled = self._schedule_queued_job_executions(pending_job_exes, workspaces)
        self._job_exe_manager.add_job_exes(newly_scheduled)
        for job_exe in newly_scheduled:
            started_task = job_exe.start_next_task()
            if started_task:
                node_task_lists[job_exe.node_id].append(create_mesos_task(started_task))
    except OperationalError:
        # Whatever was collected before the failure is still launched below
        logger.exception('Failed to schedule queued job executions')

    # Launch on Mesos, one launchTasks() call per node's offers (made even if the node has no tasks)
    task_total = 0
    node_total = 0
    for offers_for_node in popped_offers:
        current_tasks = node_task_lists[offers_for_node.node.id]
        count = len(current_tasks)
        task_total += count
        node_total += 1 if count else 0

        # The driver expects protobuf OfferID messages rather than raw ID values
        pb_offer_ids = []
        for raw_id in offers_for_node.offer_ids:
            pb_offer_id = mesos_pb2.OfferID()
            pb_offer_id.value = raw_id
            pb_offer_ids.append(pb_offer_id)
        self._driver.launchTasks(pb_offer_ids, current_tasks)

    if task_total:
        logger.info('Launched %i Mesos task(s) on %i node(s)', task_total, node_total)
    return task_total
def _launch_tasks(self, client, nodes):
    """Starts the scheduled tasks on the given nodes and submits them to Mesos along with the allocated offers

    :param client: The Mesos scheduler client
    :type client: :class:`mesoshttp.client.MesosClient`
    :param nodes: The dict of all scheduling nodes stored by node ID
    :type nodes: dict
    :returns: The number of tasks that were launched and the number of offers accepted
    :rtype: tuple
    """

    begin = now()

    # First pass: start the job execution tasks on every node and register all allocated tasks as launched
    tasks_to_register = []
    for sched_node in nodes.values():
        sched_node.start_job_exe_tasks()
        tasks_to_register.extend(sched_node.allocated_tasks)
    task_mgr.launch_tasks(tasks_to_register, begin)

    # Second pass: submit each node's offers and tasks to Mesos while tallying totals for the summary log
    nodes_with_tasks = 0
    nodes_with_offers = 0
    offers_accepted = 0
    tasks_launched = 0
    offered_resources = NodeResources()
    used_resources = NodeResources()
    for sched_node in nodes.values():
        node_mesos_offers = []
        node_mesos_tasks = []

        for allocated_offer in sched_node.allocated_offers:
            offers_accepted += 1
            offered_resources.add(allocated_offer.resources)
            node_mesos_offers.append(allocated_offer.mesos_offer)

        node_tasks = sched_node.allocated_tasks
        for allocated_task in node_tasks:
            used_resources.add(allocated_task.get_resources())
            node_mesos_tasks.append(create_mesos_task(allocated_task))

        tasks_on_node = len(node_tasks)
        tasks_launched += tasks_on_node
        if tasks_on_node:
            nodes_with_tasks += 1

        # Nodes without any allocated offers have nothing to send to Mesos
        if not node_mesos_offers:
            continue
        nodes_with_offers += 1
        try:
            client.combine_offers(node_mesos_offers, node_mesos_tasks)
        except Exception:
            # A failure on one node is logged and the remaining nodes are still processed
            logger.exception('Error occurred while launching tasks on node %s', sched_node.hostname)

    # Report how long launching took, escalating to a warning past the configured threshold
    elapsed = now() - begin
    report = logger.warning if elapsed > LAUNCH_TASK_WARN_THRESHOLD else logger.debug
    report('Launching tasks took %.3f seconds', elapsed.total_seconds())

    # Declined resources = everything that was offered minus what the launched tasks consume
    unused_resources = NodeResources()
    unused_resources.add(offered_resources)
    unused_resources.subtract(used_resources)
    if offers_accepted:
        logger.info('Accepted %d offer(s) from %d node(s), launched %d task(s) with %s on %d node(s), declined %s',
                    offers_accepted, nodes_with_offers, tasks_launched, used_resources, nodes_with_tasks,
                    unused_resources)
    return tasks_launched, offers_accepted