Example #1
class Simulator:
    """This autoscaler simulator consists of a few components.

    State is stored in 3 main data structures:
        * Resource management state is stored in self.ip_to_nodes
        * The scheduler's work queue is stored in self.work_queue
        * The event queue, which acts as the simulation's "timeline", is
          stored in self.event_queue

    The logic is organized into 3 functions (and their helpers):
        * self.run_autoscaler plays the role of `monitor.py` and translates
          resource management state for load_metrics to consume.
        * self.schedule is the only consumer of the work queue. It dispatches
          work to the appropriate schedulers, which mutate cluster state and
          produce events for the event queue.
        * self.process_event is the sole consumer of the event queue. It
          dispatches work to the appropriate event handlers.

    There are 3 main ways of interacting with the simulator:
        * simulator.submit: To submit tasks
        * simulator.step: To go to the next "event"
        * task/actor/placement group start/done callbacks

    """
    def __init__(
        self,
        config_path,
        provider,
        autoscaler_update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S,
        node_startup_delay_s=120,
    ):
        self.config_path = config_path
        self.provider = provider
        self.autoscaler_update_interval_s = autoscaler_update_interval_s
        self.node_startup_delay_s = node_startup_delay_s

        self._setup_autoscaler()
        self._setup_simulator()

    def _setup_autoscaler(self):
        self.runner = MockProcessRunner()
        with open(self.config_path) as f:
            self.config = yaml.safe_load(f)

        self.provider.create_node(
            {},
            {
                TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
                TAG_RAY_USER_NODE_TYPE: self.config["head_node_type"],
            },
            1,
        )
        self.head_ip = self.provider.non_terminated_node_ips({})[0]

        self.load_metrics = LoadMetrics()
        self.autoscaler = MockAutoscaler(
            self.config_path,
            self.load_metrics,
            MockNodeInfoStub(),
            # Don't let the autoscaler start any node launchers. Instead, we
            # will launch nodes ourselves after every update call.
            max_concurrent_launches=0,
            max_failures=0,
            process_runner=self.runner,
            update_interval_s=0,
        )

        # Manually create a node launcher. Note that we won't start it as a
        # separate thread.
        self.node_launcher = NodeLauncher(
            event_summarizer=EventSummarizer(),
            provider=self.autoscaler.provider,
            queue=self.autoscaler.launch_queue,
            index=0,
            pending=self.autoscaler.pending_launches,
            node_types=self.autoscaler.available_node_types,
        )

    def _setup_simulator(self):
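        # Virtual time starts at zero; the first autoscaler update fires
        # immediately.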
        self.virtual_time = 0
        self.ip_to_nodes = {}
        self._update_cluster_state(join_immediately=True)

        self.work_queue = []
        self.event_queue = PriorityQueue()
        self.event_queue.put(Event(0, SIMULATOR_EVENT_AUTOSCALER_UPDATE))

    def _update_cluster_state(self, join_immediately=False):
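        # Register provider nodes we haven't seen before. Unless they join
        # immediately, they enter the cluster node_startup_delay_s later via
        # a NODE_JOINED event.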
        nodes = self.provider.non_terminated_nodes(tag_filters={})
        for node_id in nodes:
            ip = self.provider.internal_ip(node_id)
            if ip in self.ip_to_nodes:
                continue
            node_tags = self.provider.node_tags(node_id)
            if TAG_RAY_USER_NODE_TYPE in node_tags:
                node_type = node_tags[TAG_RAY_USER_NODE_TYPE]
                resources = self.config["available_node_types"][node_type].get(
                    "resources", {})
                node = Node(resources, join_immediately, node_type,
                            self.virtual_time)
                self.ip_to_nodes[ip] = node
                if not join_immediately:
                    join_time = self.virtual_time + self.node_startup_delay_s
                    self.event_queue.put(
                        Event(join_time, SIMULATOR_EVENT_NODE_JOINED, node))

    def submit(self, work):
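        """Submit a task or placement group (or a list of them) to the
        work queue."""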
        if isinstance(work, list):
            self.work_queue.extend(work)
        else:
            self.work_queue.append(work)

    def _get_node_to_run(self, bundle, nodes):
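        # First-fit search: return the first node that can currently fit the
        # bundle, or (None, None) if none can.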
        for ip, node in nodes.items():
            if node.bundle_fits(bundle):
                return ip, node
        return None, None

    def _schedule_placement_group(self, pg, nodes):
        # This scheduling algorithm is bad, but it is approximately as bad as
        # the real placement group scheduler.
        to_allocate = []
        if (pg.strategy == PlacementStrategy.STRICT_PACK
                or pg.strategy == PlacementStrategy.PACK):
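            # (STRICT_)PACK: merge all bundles into one combined bundle and
            # place it on a single node.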
            combined = collections.defaultdict(float)
            for bundle in pg.bundles:
                for k, v in bundle.items():
                    combined[k] += v
            ip, node_to_run = self._get_node_to_run(combined, nodes)
            if node_to_run is None:
                return False
            to_allocate.append((combined, ip))
        elif (pg.strategy == PlacementStrategy.STRICT_SPREAD
              or pg.strategy == PlacementStrategy.SPREAD):
            # TODO (Alex): More accurate handling of non-STRICT_PACK groups.
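            # (STRICT_)SPREAD: place each bundle on a distinct node.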
            remaining_nodes = nodes.copy()
            for bundle in pg.bundles:
                ip, node_to_run = self._get_node_to_run(
                    bundle, remaining_nodes)
                if node_to_run is None:
                    return False
                del remaining_nodes[ip]
                to_allocate.append((bundle, ip))

        for bundle, ip in to_allocate:
            node = self.ip_to_nodes[ip]
            node.allocate(bundle)
        pg.start_time = self.virtual_time
        end_time = self.virtual_time + pg.duration
        self.event_queue.put(
            Event(end_time, SIMULATOR_EVENT_PG_DONE, (pg, to_allocate)))
        if pg.start_callback:
            pg.start_callback()
        return True

    def _schedule_task(self, task, nodes):
        ip, node = self._get_node_to_run(task.resources, nodes)
        if node is None:
            return False

        node.allocate(task.resources)
        task.node = node
        task.start_time = self.virtual_time
        end_time = self.virtual_time + task.duration
        self.event_queue.put(Event(end_time, SIMULATOR_EVENT_TASK_DONE, task))
        if task.start_callback:
            task.start_callback()
        return True

    def schedule(self):
        # TODO (Alex): Implement a more realistic scheduling algorithm.
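        # Try to place every queued item; anything that can't be scheduled
        # yet stays in the work queue for the next pass.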
        new_work_queue = []
        for work in self.work_queue:
            if isinstance(work, Task):
                scheduled = self._schedule_task(work, self.ip_to_nodes)
            elif isinstance(work, PlacementGroup):
                scheduled = self._schedule_placement_group(
                    work, self.ip_to_nodes)
            else:
                assert False, "Unknown work object!"

            if scheduled is False:
                new_work_queue.append(work)
        self.work_queue = new_work_queue

    def _launch_nodes(self):
        """Launch all queued nodes. Since this will be run serially after
        `autoscaler.update` there are no race conditions in checking if the
        queue is empty.
        """
        while not self.node_launcher.queue.empty():
            config, count, node_type = self.node_launcher.queue.get()
            try:
                self.node_launcher._launch_node(config, count, node_type)
            except Exception:
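                # A failed launch is dropped; the pending count is still
                # decremented below.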
                pass
            finally:
                self.node_launcher.pending.dec(node_type, count)

    def _infeasible(self, bundle):
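        # A bundle is infeasible if no existing node can ever satisfy it
        # (as judged by Node.feasible).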
        for node in self.ip_to_nodes.values():
            if node.feasible(bundle):
                return False
        return True

    def run_autoscaler(self):
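        # Mimic monitor.py: summarize pending work and per-node resources
        # into load_metrics, run an autoscaler update, then launch any queued
        # nodes and absorb them into the simulated cluster.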
        waiting_bundles = []
        infeasible_bundles = []
        placement_groups = []
        for work in self.work_queue:
            if isinstance(work, Task):
                shape = work.resources
                if self._infeasible(shape):
                    infeasible_bundles.append(shape)
                else:
                    waiting_bundles.append(shape)
            if isinstance(work, PlacementGroup):
                placement_groups.append(
                    PlacementGroupTableData(
                        state=PlacementGroupTableData.PENDING,
                        strategy=work.strategy,
                        bundles=[
                            Bundle(unit_resources=bundle)
                            for bundle in work.bundles
                        ],
                    ))

        for ip, node in self.ip_to_nodes.items():
            if not node.in_cluster:
                continue
            self.load_metrics.update(
                ip=ip,
                raylet_id=node.raylet_id,
                static_resources=node.total_resources,
                dynamic_resources=node.available_resources,
                resource_load={},
                waiting_bundles=waiting_bundles,
                infeasible_bundles=infeasible_bundles,
                pending_placement_groups=placement_groups,
            )

        self.autoscaler.update()
        self._launch_nodes()
        self._update_cluster_state()

    def process_event(self, event):
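        # Dispatch on event type: autoscaler updates re-arm themselves, task
        # and placement group completions free resources, and node joins mark
        # the node as part of the cluster.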
        if event.event_type == SIMULATOR_EVENT_AUTOSCALER_UPDATE:
            self.run_autoscaler()
            next_update = self.virtual_time + self.autoscaler_update_interval_s
            self.event_queue.put(
                Event(next_update, SIMULATOR_EVENT_AUTOSCALER_UPDATE))
        elif event.event_type == SIMULATOR_EVENT_TASK_DONE:
            task = event.data
            task.node.free(task.resources)
            if task.done_callback:
                task.done_callback()
        elif event.event_type == SIMULATOR_EVENT_NODE_JOINED:
            node = event.data
            node.in_cluster = True
        elif event.event_type == SIMULATOR_EVENT_PG_DONE:
            pg, allocated = event.data
            for bundle, ip in allocated:
                self.ip_to_nodes[ip].free(bundle)
            if pg.done_callback:
                pg.done_callback()
        else:
            assert False, "Unknown event!"

    def step(self):
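        # Advance virtual time to the next event, drain every event scheduled
        # for that timestamp, then run a scheduling pass over the work queue.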
        self.virtual_time = self.event_queue.queue[0].time
        while self.event_queue.queue[0].time == self.virtual_time:
            event = self.event_queue.get()
            self.process_event(event)
        self.schedule()
        print(self.info_string())
        return self.virtual_time

    def node_costs(self):
        """Returns the cost of nodes. Cost is measured in terms of cumulative hours of
        runtime per node type.

        """
        costs = collections.defaultdict(float)

        for node in self.ip_to_nodes.values():
            if not node.in_cluster:
                continue
            runtime = self.virtual_time - node.start_time
            costs[node.node_type] += runtime
        return costs

    def info_string(self):
        num_connected_nodes = len(
            [node for node in self.ip_to_nodes.values() if node.in_cluster])
        num_pending_nodes = len(self.ip_to_nodes) - num_connected_nodes
        return (f"[t={self.virtual_time}] "
                f"Connected: {num_connected_nodes}, "
                f"Pending: {num_pending_nodes}, "
                f"Remaining: {len(self.work_queue)}")