Example #1
    async def wait_for_goal(self, goal_id: GoalId) -> Optional[Exception]:
        """
        Wait for the given goal_id to be completed by external code calling
        complete_goal(goal_id); either a result or an exception may be set.

        Args:
            goal_id (GoalId): Target goal_id to wait on.
        Returns:
            Optional[Exception]: None if the goal completed successfully (or
                was not found), or the exception object set by the caller of
                complete_goal().
        """
        start = time.time()
        if goal_id not in self._pending_goals:
            logger.debug(f"Goal {goal_id} not found")
            return None

        async_goal = self._pending_goals[goal_id]
        await async_goal.wait()
        logger.debug(
            f"Waiting for goal {goal_id} took {time.time() - start} seconds")

        if async_goal.exception is not None:
            return async_goal.exception
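For context, here is a minimal sketch of the goal-manager pattern this method assumes. The AsyncGoal and GoalManager classes below are illustrative, not the real API: the point is that complete_goal() sets the event that wait_for_goal() awaits, optionally attaching an exception.

# Hypothetical sketch, not the real implementation: shows how
# complete_goal() unblocks wait_for_goal() via an asyncio.Event.
import asyncio
import uuid
from typing import Dict, Optional


class AsyncGoal:
    def __init__(self) -> None:
        self.event = asyncio.Event()
        self.exception: Optional[Exception] = None

    async def wait(self) -> None:
        await self.event.wait()


class GoalManager:
    def __init__(self) -> None:
        self._pending_goals: Dict[uuid.UUID, AsyncGoal] = {}

    def create_goal(self) -> uuid.UUID:
        goal_id = uuid.uuid4()
        self._pending_goals[goal_id] = AsyncGoal()
        return goal_id

    def complete_goal(self, goal_id: uuid.UUID,
                      exception: Optional[Exception] = None) -> None:
        # Setting the event wakes any coroutine blocked in wait_for_goal().
        goal = self._pending_goals.get(goal_id)
        if goal is not None:
            goal.exception = exception
            goal.event.set()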
Example #2
    def _set_backend_goal(self, backend_info: Optional[BackendInfo]
                          ) -> Tuple[GoalId, Optional[GoalId]]:
        """
        Set desirable state for a given backend, identified by tag.

        Args:
            backend_info (Optional[BackendInfo]): Contains backend and
                replica config, if passed in as None, we're marking
                target backend as shutting down.
        """
        existing_goal_id = self._curr_goal
        new_goal_id = self._goal_manager.create_goal()

        if backend_info is not None:
            self._target_info = backend_info
            self._target_replicas = backend_info.backend_config.num_replicas
            self._target_version = BackendVersion(
                backend_info.version,
                user_config=backend_info.backend_config.user_config)

        else:
            self._target_replicas = 0

        self._curr_goal = new_goal_id
        logger.debug(
            f"Set backend goal for {self._name} with version "
            f"{backend_info if backend_info is None else backend_info.version}"
        )
        return new_goal_id, existing_goal_id
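One plausible call site for the returned pair (illustrative only; deploy() and self._goal_manager are assumed names, not confirmed by the source): complete the superseded goal so stale waiters are released, then hand the new goal to the caller.

    # Hypothetical usage sketch of the (new, existing) goal pair.
    def deploy(self, backend_info: BackendInfo) -> GoalId:
        new_goal_id, existing_goal_id = self._set_backend_goal(backend_info)
        if existing_goal_id is not None:
            # Unblock anyone still waiting on the goal this one replaces.
            self._goal_manager.complete_goal(existing_goal_id)
        return new_goal_id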
Example #3
    def start(self, backend_info: BackendInfo):
        self._actor_resources = backend_info.replica_config.resource_dict

        try:
            self._placement_group = ray.util.get_placement_group(
                self._placement_group_name)
        except ValueError:
            logger.debug(
                "Creating placement group '{}' for backend '{}'".format(
                    self._placement_group_name, self._backend_tag))
            self._placement_group = ray.util.placement_group(
                [self._actor_resources],
                lifetime="detached",
                name=self._placement_group_name)

        try:
            self._actor_handle = ray.get_actor(self._actor_name)
        except ValueError:
            logger.debug("Starting replica '{}' for backend '{}'.".format(
                self._replica_tag, self._backend_tag))
            self._actor_handle = ray.remote(backend_info.worker_class).options(
                name=self._actor_name,
                lifetime="detached" if self._detached else None,
                placement_group=self._placement_group,
                placement_group_capture_child_tasks=False,
                **backend_info.replica_config.ray_actor_options).remote(
                    self._backend_tag, self._replica_tag,
                    backend_info.replica_config.init_args,
                    backend_info.backend_config, self._controller_name)
        self._startup_obj_ref = self._actor_handle.ready.remote()
Example #4
    def check_stopped(self) -> bool:
        """Check if the replica has stopped. If so, transition to STOPPED.

        Should handle the case where the replica has already stopped.
        """
        if self._state == ReplicaState.STOPPED:
            return True
        assert self._state == ReplicaState.STOPPING, (
            f"State must be {ReplicaState.STOPPING}, *not* {self._state}")

        stopped = self._actor.check_stopped()
        if stopped:
            self._state = ReplicaState.STOPPED
            # Clean up any associated resources (e.g., placement group).
            self._actor.cleanup()
            return True

        timeout_passed = time.time() > self._shutdown_deadline

        if timeout_passed:
            # Graceful period passed, kill it forcefully.
            # This will be called repeatedly until the replica shuts down.
            logger.debug(
                f"Replica {self._replica_tag} did not shutdown after "
                f"{self._graceful_shutdown_timeout_s}s, force-killing.")

            self._actor.force_stop()
        return False
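For reference, a minimal version of the state enum these assertions rely on. Only STOPPING and STOPPED are confirmed by the code above; the other members are assumptions about the rest of the replica lifecycle.

from enum import Enum, auto


class ReplicaState(Enum):
    STARTING = auto()   # Assumed member, not shown in the snippet above.
    RUNNING = auto()    # Assumed member.
    STOPPING = auto()   # Confirmed: graceful-shutdown in progress.
    STOPPED = auto()    # Confirmed: terminal state after cleanup().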
Example #5
    async def enqueue_request(self, request_meta, *request_args,
                              **request_kwargs):
        service = request_meta.service
        logger.debug("Received a request for service {}".format(service))

        # Check whether the specified SLO is given directly as an
        # absolute wall-clock time.
        if request_meta.absolute_slo_ms is not None:
            request_slo_ms = request_meta.absolute_slo_ms
        else:
            request_slo_ms = request_meta.adjust_relative_slo_ms()
        request_context = request_meta.request_context
        query = Query(
            request_args,
            request_kwargs,
            request_context,
            request_slo_ms,
            call_method=request_meta.call_method,
            async_future=asyncio.get_event_loop().create_future())
        await self.service_queues[service].put(query)
        await self.flush()

        # Note: a future change can be to directly return the ObjectID from
        # replica task submission
        result = await query.async_future
        return result
Example #6
    async def enqueue_request(self, request_meta, *request_args,
                              **request_kwargs):
        endpoint = request_meta.endpoint
        logger.debug("Received a request for endpoint {}".format(endpoint))
        self.num_router_requests.labels(endpoint=endpoint).add()

        # Check whether the specified SLO is given directly as an
        # absolute wall-clock time.
        if request_meta.absolute_slo_ms is not None:
            request_slo_ms = request_meta.absolute_slo_ms
        else:
            request_slo_ms = request_meta.adjust_relative_slo_ms()
        request_context = request_meta.request_context
        query = Query(request_args,
                      request_kwargs,
                      request_context,
                      request_slo_ms,
                      call_method=request_meta.call_method,
                      shard_key=request_meta.shard_key,
                      async_future=asyncio.get_event_loop().create_future())
        async with self.flush_lock:
            self.endpoint_queues[endpoint].appendleft(query)
            self.flush_endpoint_queue(endpoint)

        # Note: a future change can be to directly return the ObjectID from
        # replica task submission
        try:
            result = await query.async_future
        except RayTaskError as e:
            self.num_error_endpoint_request.labels(endpoint=endpoint).add()
            result = e
        return result
Example #7
    def _process_update(self, updates: Union[Dict[str, UpdatedObject],
                                             Exception]):
        if isinstance(updates, ray.exceptions.RayActorError):
            # This can happen during shutdown where the controller is
            # intentionally killed, the client should just gracefully
            # exit.
            logger.debug("LongPollClient failed to connect to host. "
                         "Shutting down.")
            return

        if isinstance(updates, ray.exceptions.RayTaskError):
            # Some error happened in the controller. It could be a bug or some
            # undesired state.
            logger.error("LongPollHost errored\n" + updates.traceback_str)
            self._poll_next()
            return

        logger.debug(f"LongPollClient {self} received updates for keys: "
                     f"{list(updates.keys())}.")
        for key, update in updates.items():
            self.object_snapshots[key] = update.object_snapshot
            self.snapshot_ids[key] = update.snapshot_id
            callback = self.key_listeners[key]

            # Bind the parameters because closures are late-binding.
            # https://docs.python-guide.org/writing/gotchas/#late-binding-closures # noqa: E501
            def chained(callback=callback, arg=update.object_snapshot):
                callback(arg)
                self._on_callback_completed(trigger_at=len(updates))

            if self.event_loop is None:
                chained()
            else:
                self.event_loop.call_soon_threadsafe(chained)
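The default-argument trick above is worth seeing in isolation. Closures capture variables, not values, so without the binding every callback would see the last iteration's update:

# Standalone demonstration of Python's late-binding closures.
late = [lambda: i for i in range(3)]
bound = [lambda i=i: i for i in range(3)]
print([f() for f in late])   # [2, 2, 2] -- all closures see the final i
print([f() for f in bound])  # [0, 1, 2] -- defaults bind at definition time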
Example #8
    def set_max_concurrent_queries(self, backend_config: BackendConfig):
        new_value: int = backend_config.max_concurrent_queries
        if new_value != self.max_concurrent_queries:
            self.max_concurrent_queries = new_value
            logger.debug(
                f"ReplicaSet: changing max_concurrent_queries to {new_value}")
            self.config_updated_event.set()
Example #9
    def check_stopped(self):
        if self._state == ReplicaState.STOPPED:
            return True
        assert self._state == ReplicaState.STOPPING, (
            f"State must be {ReplicaState.STOPPING}, *not* {self._state}")

        try:
            replica = ray.get_actor(self._actor_name)
        except ValueError:
            self._state = ReplicaState.STOPPED
            return True

        ready, _ = ray.wait([self._drain_obj_ref], timeout=0)
        timeout_passed = time.time() > self._shutdown_deadline

        if len(ready) == 1 or timeout_passed:
            if timeout_passed:
                # Graceful period passed, kill it forcefully.
                logger.debug(
                    f"{self._actor_name} did not shutdown after "
                    f"{self._graceful_shutdown_timeout_s}s, force-killing.")

            ray.kill(replica, no_restart=True)
            self._state = ReplicaState.STOPPED
            return True
        return False
Example #10
    async def assign_replica(self, query: Query) -> ray.ObjectRef:
        """Given a query, submit it to a replica and return the object ref.

        This method keeps track of the in-flight queries for each replica
        and only sends a query to an available replica (determined by the
        backend's max_concurrent_queries value).
        """
        endpoint = query.metadata.endpoint
        self.num_queued_queries += 1
        self.num_queued_queries_gauge.set(self.num_queued_queries,
                                          tags={"endpoint": endpoint})
        assigned_ref = self._try_assign_replica(query)
        while assigned_ref is None:  # Can't assign a replica right now.
            logger.debug("Failed to assign a replica for "
                         f"query {query.metadata.request_id}")
            # Maybe there exists a free replica, we just need to refresh our
            # query tracker.
            num_finished = self._drain_completed_object_refs()
            # All replicas are really busy, wait for a query to complete or
            # the config to be updated.
            if num_finished == 0:
                logger.debug(
                    "All replicas are busy, waiting for a free replica.")
                await asyncio.wait(self._all_query_refs +
                                   [self.config_updated_event.wait()],
                                   return_when=asyncio.FIRST_COMPLETED)
                if self.config_updated_event.is_set():
                    self.config_updated_event.clear()
            # A free replica should be available now; retry the assignment.
            assigned_ref = self._try_assign_replica(query)
        self.num_queued_queries -= 1
        self.num_queued_queries_gauge.set(self.num_queued_queries,
                                          tags={"endpoint": endpoint})
        return assigned_ref
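A minimal sketch of what _try_assign_replica could look like under these assumptions; the in_flight_queries bookkeeping is hypothetical, not shown in the source. It picks any replica with spare capacity, records the in-flight ref, and returns it, or None if every replica is saturated.

    def _try_assign_replica(self, query: Query) -> Optional[ray.ObjectRef]:
        # Hypothetical sketch: self.in_flight_queries maps each replica to
        # the set of object refs currently outstanding against it.
        for replica, in_flight in self.in_flight_queries.items():
            if len(in_flight) < self.max_concurrent_queries:
                ref = replica.handle_request.remote(query)
                in_flight.add(ref)
                return ref
        return None  # Every replica is at capacity right now.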
Example #11
    def start(self, backend_info: BackendInfo):
        self._actor_resources = backend_info.replica_config.resource_dict

        # Feature-flagged because placement groups don't handle
        # newly added nodes.
        # https://github.com/ray-project/ray/issues/15801
        if USE_PLACEMENT_GROUP:
            try:
                self._placement_group = ray.util.get_placement_group(
                    self._placement_group_name)
            except ValueError:
                logger.debug(
                    "Creating placement group '{}' for backend '{}'".format(
                        self._placement_group_name, self._backend_tag))
                self._placement_group = ray.util.placement_group(
                    [self._actor_resources],
                    lifetime="detached" if self._detached else None,
                    name=self._placement_group_name)

        try:
            self._actor_handle = ray.get_actor(self._actor_name)
        except ValueError:
            logger.debug("Starting replica '{}' for backend '{}'.".format(
                self._replica_tag, self._backend_tag))
            self._actor_handle = backend_info.actor_def.options(
                name=self._actor_name,
                lifetime="detached" if self._detached else None,
                placement_group=self._placement_group,
                placement_group_capture_child_tasks=False,
                **backend_info.replica_config.ray_actor_options).remote(
                    self._backend_tag, self._replica_tag,
                    backend_info.replica_config.init_args,
                    backend_info.backend_config, self._controller_name)
        self._startup_obj_ref = self._actor_handle.ready.remote()
Example #12
    async def _start_backend_replica(self, current_state: SystemState,
                                     backend_tag: BackendTag,
                                     replica_tag: ReplicaTag) -> ActorHandle:
        """Start a replica and return its actor handle.

        Checks if the named actor already exists before starting a new one.

        Assumes that the backend configuration is already in the Goal State.
        """
        # NOTE(edoakes): the replicas may already be created if we
        # failed after creating them but before writing a
        # checkpoint.
        replica_name = format_actor_name(replica_tag, self.controller_name)
        try:
            replica_handle = ray.get_actor(replica_name)
        except ValueError:
            logger.debug("Starting replica '{}' for backend '{}'.".format(
                replica_tag, backend_tag))
            backend_info = current_state.get_backend(backend_tag)

            replica_handle = ray.remote(backend_info.worker_class).options(
                name=replica_name,
                lifetime="detached" if self.detached else None,
                max_restarts=-1,
                max_task_retries=-1,
                **backend_info.replica_config.ray_actor_options).remote(
                    backend_tag, replica_tag,
                    backend_info.replica_config.actor_init_args,
                    backend_info.backend_config, self.controller_name)

        return replica_handle
Example #13
    def update_routes(self, endpoints: Dict[EndpointTag,
                                            EndpointInfo]) -> None:
        logger.debug(f"Got updated endpoints: {endpoints}.")

        existing_handles = set(self.handles.keys())
        routes = []
        route_info = {}
        for endpoint, info in endpoints.items():
            # Default case where the user did not specify a route prefix.
            if info.route is None:
                route = f"/{endpoint}"
            else:
                route = info.route

            routes.append(route)
            route_info[route] = endpoint
            if endpoint in self.handles:
                existing_handles.remove(endpoint)
            else:
                self.handles[endpoint] = self._get_handle(endpoint)

        # Clean up any handles that are no longer used.
        for endpoint in existing_handles:
            del self.handles[endpoint]

        # Routes are sorted in order of decreasing length to enable longest
        # prefix matching.
        self.sorted_routes = sorted(routes, key=lambda x: len(x), reverse=True)
        self.route_info = route_info
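The reason for the decreasing-length sort: scanning the sorted list and returning the first prefix hit yields the most specific route. A self-contained illustration (not the proxy's actual matcher):

# Longest-prefix matching via a length-sorted scan; routes are made up.
sorted_routes = sorted(["/", "/api", "/api/v2"], key=len, reverse=True)

def match_route(path):
    for route in sorted_routes:
        if path.startswith(route):
            return route  # First hit is the longest matching prefix.

assert match_route("/api/v2/users") == "/api/v2"
assert match_route("/status") == "/"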
Example #14
    async def _flush_service_queues(self):
        # perform traffic splitting for requests
        for service, queue in self.service_queues.items():
            # while there are incoming requests and there are backends
            while queue.qsize() and len(self.traffic[service]):
                backend_names = list(self.traffic[service].keys())
                backend_weights = list(self.traffic[service].values())
                if len(self.traffic[service]) >= 2:
                    # randomly pick 2 backends
                    backend1, backend2 = np.random.choice(backend_names,
                                                          2,
                                                          replace=False,
                                                          p=backend_weights)

                    # Compare the buffer queue lengths of the two backends
                    # and pick the one with fewer queries waiting.
                    if (len(self.buffer_queues[backend1]) <= len(
                            self.buffer_queues[backend2])):
                        chosen_backend = backend1
                    else:
                        chosen_backend = backend2
                    logger.debug("[Power of two chocies] found two backends "
                                 "{} and {}: choosing {}.".format(
                                     backend1, backend2, chosen_backend))
                else:
                    chosen_backend = np.random.choice(
                        backend_names, replace=False,
                        p=backend_weights).squeeze()
                request = await queue.get()
                self.buffer_queues[chosen_backend].add(request)
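The "power of two choices" policy in isolation: sampling two backends by traffic weight and taking the shorter queue approximates least-loaded routing at constant cost. A self-contained sketch with made-up queue lengths:

import numpy as np

queue_lens = {"b1": 5, "b2": 0, "b3": 9}
names = list(queue_lens)
weights = [0.2, 0.5, 0.3]  # Traffic split; must sum to 1.
first, second = np.random.choice(names, 2, replace=False, p=weights)
chosen = first if queue_lens[first] <= queue_lens[second] else second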
Example #15
    async def set_traffic(self, service, traffic_dict):
        logger.debug("Setting traffic for service %s to %s", service,
                     traffic_dict)
        self.traffic[service] = traffic_dict
        backend_names = list(self.traffic[service].keys())
        self.round_robin_iterator_map[service] = itertools.cycle(backend_names)
        await self.flush()
Example #16
    async def _start_single_replica(self, config_store: ConfigurationStore,
                                    backend_tag: BackendTag,
                                    replica_tag: ReplicaTag,
                                    replica_name: str) -> ActorHandle:
        """Creates a backend replica and waits for it to start up.

        Assumes that the backend configuration has already been registered
        in the ConfigurationStore.
        """
        logger.debug("Starting replica '{}' for backend '{}'.".format(
            replica_tag, backend_tag))
        backend_info = config_store.get_backend(backend_tag)

        replica_handle = ray.remote(backend_info.worker_class).options(
            name=replica_name,
            lifetime="detached" if self.detached else None,
            max_restarts=-1,
            max_task_retries=-1,
            **backend_info.replica_config.ray_actor_options).remote(
                backend_tag, replica_tag,
                backend_info.replica_config.actor_init_args,
                backend_info.backend_config, self.controller_name)
        # TODO(edoakes): we should probably have a timeout here.
        await replica_handle.ready.remote()
        return replica_handle
Example #17
    async def enqueue_request(self, request_meta, *request_args,
                              **request_kwargs):
        endpoint = request_meta.endpoint
        logger.debug("Received request {} for endpoint {}.".format(
            request_meta.request_id, endpoint))
        request_start = time.time()
        self.num_router_requests.record(1, tags={"endpoint": endpoint})

        request_context = request_meta.request_context
        query = Query(request_args,
                      request_kwargs,
                      request_context,
                      metadata=request_meta,
                      async_future=asyncio.get_event_loop().create_future())
        async with self.flush_lock:
            self.endpoint_queues[endpoint].appendleft(query)
            self.flush_endpoint_queue(endpoint)

        try:
            result = await query.async_future
        except RayTaskError as e:
            self.num_error_endpoint_requests.record(
                1, tags={"endpoint": endpoint})
            result = e

        request_time_ms = (time.time() - request_start) * 1000
        logger.debug("Finished request {} in {:.2f}ms".format(
            request_meta.request_id, request_time_ms))
        return result
Example #18
    async def _start_backend_worker(self, backend_tag, replica_tag):
        """Creates a backend worker and waits for it to start up.

        Assumes that the backend configuration has already been registered
        in self.backends.
        """
        logger.debug("Starting worker '{}' for backend '{}'.".format(
            replica_tag, backend_tag))
        (backend_worker, backend_config,
         replica_config) = self.backends[backend_tag]

        replica_name = format_actor_name(replica_tag, self.instance_name)
        worker_handle = ray.remote(backend_worker).options(
            name=replica_name,
            max_restarts=-1,
            max_task_retries=-1,
            **replica_config.ray_actor_options).remote(
                backend_tag,
                replica_tag,
                replica_config.actor_init_args,
                backend_config,
                instance_name=self.instance_name)
        # TODO(edoakes): we should probably have a timeout here.
        await worker_handle.ready.remote()
        return worker_handle
Example #19
    async def add_new_worker(self, backend_tag, replica_tag, worker_handle):
        backend_replica_tag = backend_tag + ":" + replica_tag
        if backend_replica_tag in self.replicas:
            return
        self.replicas[backend_replica_tag] = worker_handle

        logger.debug("New worker added for backend '{}'".format(backend_tag))
        await self.mark_worker_idle(backend_tag, backend_replica_tag)
Example #20
    async def remove_endpoint(self, endpoint):
        logger.debug("Removing endpoint {}".format(endpoint))
        async with self.flush_lock:
            self.flush_endpoint_queue(endpoint)
            if endpoint in self.endpoint_queues:
                del self.endpoint_queues[endpoint]
            if endpoint in self.traffic:
                del self.traffic[endpoint]
Example #21
    async def set_traffic(self, endpoint, traffic_dict):
        logger.debug("Setting traffic for endpoint %s to %s", endpoint,
                     traffic_dict)
        self.traffic[endpoint] = traffic_dict
        backend_names = list(self.traffic[endpoint].keys())
        self.round_robin_iterator_map[endpoint] = itertools.cycle(
            backend_names)
        await self.flush()
Example #22
    def notify_changed(self, object_key: str, updated_object: Any):
        self.snapshot_ids[object_key] += 1
        self.object_snapshots[object_key] = updated_object
        logger.debug(f"LongPollerHost: {object_key} = {updated_object}")

        if object_key in self.notifier_events:
            for event in self.notifier_events.pop(object_key):
                event.set()
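The client half of this protocol is not shown here. A rough sketch under the assumption that clients poll with their last-seen snapshot ids, and the host replies immediately for stale keys or parks an event that notify_changed() will set:

    async def listen_for_change(self, keys_to_snapshot_ids):
        # Assumed counterpart method, simplified; not the real signature.
        def stale_keys():
            return {
                key for key, sid in keys_to_snapshot_ids.items()
                if self.snapshot_ids.get(key, -1) > sid
            }

        if not stale_keys():
            # Nothing new yet: park one event per key until notify_changed().
            event = asyncio.Event()
            for key in keys_to_snapshot_ids:
                self.notifier_events.setdefault(key, set()).add(event)
            await event.wait()
        return {key: self.object_snapshots[key] for key in stale_keys()}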
Example #23
    def __del__(self):
        if not self._detached:
            logger.debug(
                "Shutting down Ray Serve because client went out of "
                "scope. To prevent this, either keep a reference to "
                "the client or use serve.start(detached=True).")
            self.shutdown()
Example #24
    async def set_traffic(self, service, traffic_dict):
        logger.debug("Setting traffic for service %s to %s", service,
                     traffic_dict)
        self.traffic[service] = traffic_dict
        backend_names = list(self.traffic[service].keys())
        self.fixed_packing_iterator_map[service] = itertools.cycle(
            itertools.chain.from_iterable(
                itertools.repeat(x, self.packing_num) for x in backend_names))
        await self.flush()
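What the fixed-packing iterator actually yields, shown standalone: each backend repeats packing_num times before the cycle advances, so consecutive requests are "packed" onto the same backend.

import itertools

packing_num, backends = 2, ["a", "b"]
it = itertools.cycle(
    itertools.chain.from_iterable(
        itertools.repeat(x, packing_num) for x in backends))
print([next(it) for _ in range(6)])  # ['a', 'a', 'b', 'b', 'a', 'a']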
Example #25
    async def remove_service(self, service):
        logger.debug("Removing service {}".format(service))
        async with self.flush_lock:
            await self._flush_service_queues()
            await self._flush_buffer_queues()
            if service in self.service_queues:
                del self.service_queues[service]
            if service in self.traffic:
                del self.traffic[service]
Example #26
    async def wait_for_goal(self, goal_id: GoalId) -> None:
        start = time.time()
        if goal_id not in self.pending_goals:
            logger.debug(f"Goal {goal_id} not found")
            return
        event = self.pending_goals[goal_id]
        await event.wait()
        logger.debug(
            f"Waiting for goal {goal_id} took {time.time() - start} seconds")
Example #27
    async def __init__(self, kv_store_connector, router_class, router_kwargs,
                       start_http_proxy, http_proxy_host, http_proxy_port,
                       metric_gc_window_s):
        # Used to read/write checkpoints.
        # TODO(edoakes): namespace the master actor and its checkpoints.
        self.kv_store_client = kv_store_connector("serve_checkpoints")
        # path -> (endpoint, methods).
        self.routes = {}
        # backend -> (worker_creator, init_args, backend_config).
        self.backends = {}
        # backend -> replica_tags.
        self.replicas = defaultdict(list)
        # replicas that should be started if recovering from a checkpoint.
        self.replicas_to_start = defaultdict(list)
        # replicas that should be stopped if recovering from a checkpoint.
        self.replicas_to_stop = defaultdict(list)
        # endpoint -> traffic_dict
        self.traffic_policies = dict()
        # Dictionary of backend tag to dictionaries of replica tag to worker.
        # TODO(edoakes): consider removing this and just using the names.
        self.workers = defaultdict(dict)

        # Used to ensure that only a single state-changing operation happens
        # at any given time.
        self.write_lock = asyncio.Lock()

        # Cached handles to actors in the system.
        self.router = None
        self.http_proxy = None
        self.metric_monitor = None

        # If starting the actor for the first time, starts up the other system
        # components. If recovering, fetches their actor handles.
        self._get_or_start_router(router_class, router_kwargs)
        if start_http_proxy:
            self._get_or_start_http_proxy(http_proxy_host, http_proxy_port)
        self._get_or_start_metric_monitor(metric_gc_window_s)

        # NOTE(edoakes): unfortunately, we can't completely recover from a
        # checkpoint in the constructor because we block while waiting for
        # other actors to start up, and those actors fetch soft state from
        # this actor. Because no other tasks will start executing until after
        # the constructor finishes, if we were to run this logic in the
        # constructor it could lead to deadlock between this actor and a child.
        # However we do need to guarantee that we have fully recovered from a
        # checkpoint before any other state-changing calls run. We address this
        # by acquiring the write_lock and then posting the task to recover from
        # a checkpoint to the event loop. Other state-changing calls acquire
        # this lock and will be blocked until recovering from the checkpoint
        # finishes.
        checkpoint = self.kv_store_client.get("checkpoint")
        if checkpoint is None:
            logger.debug("No checkpoint found")
        else:
            await self.write_lock.acquire()
            asyncio.get_event_loop().create_task(
                self._recover_from_checkpoint(checkpoint))
Example #28
    async def remove_backend(self, backend):
        logger.debug("Removing backend {}".format(backend))
        async with self.flush_lock:
            self.flush_backend_queues([backend])
            if backend in self.backend_info:
                del self.backend_info[backend]
            if backend in self.worker_queues:
                del self.worker_queues[backend]
            if backend in self.backend_queues:
                del self.backend_queues[backend]
Example #29
    async def _do_query(self, backend, backend_replica_tag, req):
        # If the worker died, this will be a RayActorError. Just return it
        # and let the HTTP proxy handle the retry logic.
        logger.debug("Sending query to replica: " + backend_replica_tag)
        start = time.time()
        worker = self.replicas[backend_replica_tag]
        result = await worker.handle_request.remote(req)
        await self.mark_worker_idle(backend, backend_replica_tag)
        logger.debug("Got result in {:.2f}s.".format(time.time() - start))
        return result
Example #30
    async def __init__(self,
                       controller_name: str,
                       http_config: HTTPConfig,
                       detached: bool = False):
        # Used to read/write checkpoints.
        self.kv_store = RayInternalKVStore(namespace=controller_name)
        self.actor_reconciler = ActorStateReconciler(controller_name, detached)

        # backend -> AutoscalingPolicy
        self.autoscaling_policies = dict()

        # Dictionary of backend_tag -> proxy_name -> most recent queue length.
        self.backend_stats = defaultdict(lambda: defaultdict(dict))

        # Used to ensure that only a single state-changing operation happens
        # at any given time.
        self.write_lock = asyncio.Lock()

        # Map of awaiting results
        # TODO(ilr): Checkpoint this once this becomes asynchronous
        self.inflight_results: Dict[UUID, asyncio.Event] = dict()
        self._serializable_inflight_results: Dict[UUID, FutureResult] = dict()

        # HTTP state doesn't currently require a checkpoint.
        self.http_state = HTTPState(controller_name, detached, http_config)

        checkpoint_bytes = self.kv_store.get(CHECKPOINT_KEY)
        if checkpoint_bytes is None:
            logger.debug("No checkpoint found")
            self.backend_state = BackendState()
            self.endpoint_state = EndpointState()
        else:
            checkpoint: Checkpoint = pickle.loads(checkpoint_bytes)
            self.backend_state = BackendState(
                checkpoint=checkpoint.backend_state_checkpoint)
            self.endpoint_state = EndpointState(
                checkpoint=checkpoint.endpoint_state_checkpoint)
            await self._recover_from_checkpoint(checkpoint)

        # NOTE(simon): Currently we do an all-to-all broadcast. This means
        # any listener will receive notifications for all changes. This
        # can be a problem at scale: e.g., updating a single backend config
        # will send over the entire set of configs. In the future, we should
        # optimize this logic to support subscription by key.
        self.long_poll_host = LongPollHost()

        # The configs pushed out here get updated by
        # self._recover_from_checkpoint in the failure scenario, so that must
        # be run before we notify the changes.
        self.notify_backend_configs_changed()
        self.notify_replica_handles_changed()
        self.notify_traffic_policies_changed()
        self.notify_route_table_changed()

        asyncio.get_event_loop().create_task(self.run_control_loop())