Пример #1
0
    def _schedule_waiting_tasks(self, nodes, running_job_exes, when):
        """Offers waiting node tasks and the next tasks of running job executions to the scheduling nodes, then updates
        the per-task waiting counts and hands the resulting agent resource shortages to the resource manager. The nodes
        that are ready for a next job task and were left with no waiting tasks are returned.

        :param nodes: The dict of scheduling nodes stored by node ID
        :type nodes: dict
        :param running_job_exes: The currently running job executions
        :type running_job_exes: list
        :param when: The current time
        :type when: :class:`datetime.datetime`
        :returns: The dict of scheduling nodes stored by node ID that have no more waiting tasks
        :rtype: dict
        """

        # This list is handed to the nodes below; this method never appends to it itself
        still_waiting = []
        ready_nodes = {}  # {Node ID: SchedulingNode}

        # Node tasks get the first chance at being scheduled
        for sched_node in nodes.values():
            node_is_waiting = sched_node.accept_node_tasks(when, still_waiting)
            if not sched_node.is_ready_for_next_job_task or node_is_waiting:
                # Either the node cannot take job tasks or it still has waiting tasks, so it is not fulfilled
                continue
            ready_nodes[sched_node.node_id] = sched_node

        # Next, offer each running job execution's next task to the node it is on
        # TODO: fail job_exes with a "node lost" error if job_exe's node does not appear in the dict or is offline or
        # changed agent ID
        # TODO: fail job_exes if they are starving to get resources for their next task
        for job_exe in running_job_exes:
            if not job_exe.is_next_task_ready() or job_exe.node_id not in nodes:
                continue
            sched_node = nodes[job_exe.node_id]
            if sched_node.accept_job_exe_next_task(job_exe, still_waiting):
                # The node reports waiting tasks, so it no longer counts as fulfilled
                ready_nodes.pop(sched_node.node_id, None)

        # Rebuild the waiting counts from scratch so tasks that are no longer waiting drop out
        previous_counts = self._waiting_tasks
        updated_counts = {}  # {Task ID: int}
        shortages_by_agent = {}  # {Agent ID: NodeResources}
        for waiting_task in still_waiting:
            wait_count = previous_counts.get(waiting_task.id, 0) + 1
            updated_counts[waiting_task.id] = wait_count
            if wait_count < TASK_SHORTAGE_WAIT_COUNT:
                continue
            # The task has reached the wait threshold, so add its resources to its agent's shortage
            agent_shortage = shortages_by_agent.get(waiting_task.agent_id)
            if agent_shortage is None:
                agent_shortage = NodeResources()
                shortages_by_agent[waiting_task.agent_id] = agent_shortage
            agent_shortage.add(waiting_task.get_resources())
        self._waiting_tasks = updated_counts
        resource_mgr.set_agent_shortages(shortages_by_agent)

        return ready_nodes
Пример #2
0
    def _schedule_waiting_tasks(self, nodes, running_job_exes, when):
        """Offers waiting node tasks and the next tasks of running job executions to the scheduling nodes. Job
        executions whose node is missing, not ready for a next job task, or on a different agent ID are reported to the
        job execution manager as lost, and every job execution finished that way or by the starvation check is passed to
        the cleanup manager. Finally the per-task waiting counts are updated and the resulting agent resource shortages
        are handed to the resource manager.

        :param nodes: The dict of scheduling nodes stored by node ID
        :type nodes: dict
        :param running_job_exes: The currently running job executions
        :type running_job_exes: list
        :param when: The current time
        :type when: :class:`datetime.datetime`
        :returns: The dict of scheduling nodes stored by node ID that have no more waiting tasks
        :rtype: dict
        """

        # This list is handed to the nodes below; this method never appends to it itself
        still_waiting = []
        ready_nodes = {}  # {Node ID: SchedulingNode}

        # Node tasks get the first chance at being scheduled
        for sched_node in nodes.values():
            node_is_waiting = sched_node.accept_node_tasks(when, still_waiting)
            if not sched_node.is_ready_for_next_job_task or node_is_waiting:
                # Either the node cannot take job tasks or it still has waiting tasks, so it is not fulfilled
                continue
            ready_nodes[sched_node.node_id] = sched_node

        # Next, offer each running job execution's next task to the node it is on
        lost_job_exe_ids = []
        for job_exe in running_job_exes:
            if job_exe.node_id not in nodes:
                # The job execution's node is not among the scheduling nodes
                lost_job_exe_ids.append(job_exe.id)
                continue

            sched_node = nodes[job_exe.node_id]
            if not sched_node.is_ready_for_next_job_task or sched_node.agent_id != job_exe.agent_id:
                # The node cannot take a next job task or is now on a different agent ID than the job execution
                lost_job_exe_ids.append(job_exe.id)
                continue

            if not job_exe.is_next_task_ready():
                continue
            if sched_node.accept_job_exe_next_task(job_exe, still_waiting):
                # The node reports waiting tasks, so it no longer counts as fulfilled
                ready_nodes.pop(sched_node.node_id, None)

        # Collect job executions finished by starvation or node loss and send them all to cleanup
        job_exes_to_clean = job_exe_mgr.check_for_starvation(when)
        if lost_job_exe_ids:
            lost_job_exes = job_exe_mgr.lost_job_exes(lost_job_exe_ids, when)
            job_exes_to_clean.extend(lost_job_exes)
        for job_exe in job_exes_to_clean:
            cleanup_mgr.add_job_execution(job_exe)

        # Rebuild the waiting counts from scratch so tasks that are no longer waiting drop out
        previous_counts = self._waiting_tasks
        updated_counts = {}  # {Task ID: int}
        shortages_by_agent = {}  # {Agent ID: NodeResources}
        for waiting_task in still_waiting:
            wait_count = previous_counts.get(waiting_task.id, 0) + 1
            updated_counts[waiting_task.id] = wait_count
            if wait_count < TASK_SHORTAGE_WAIT_COUNT:
                continue
            # The task has reached the wait threshold, so add its resources to its agent's shortage
            agent_shortage = shortages_by_agent.get(waiting_task.agent_id)
            if agent_shortage is None:
                agent_shortage = NodeResources()
                shortages_by_agent[waiting_task.agent_id] = agent_shortage
            agent_shortage.add(waiting_task.get_resources())
        self._waiting_tasks = updated_counts
        resource_mgr.set_agent_shortages(shortages_by_agent)

        return ready_nodes