Exemplo n.º 1
0
    async def __init__(self,
                       controller_name: str,
                       http_config: HTTPOptions,
                       checkpoint_path: str,
                       detached: bool = False):
        # Used to read/write checkpoints.
        self.controller_namespace = ray.get_runtime_context().namespace
        self.controller_name = controller_name
        self.kv_store = make_kv_store(
            checkpoint_path,
            namespace=f"{self.controller_name}-{self.controller_namespace}")

        # Dictionary of backend_tag -> proxy_name -> most recent queue length.
        self.backend_stats = defaultdict(lambda: defaultdict(dict))

        # Used to ensure that only a single state-changing operation happens
        # at any given time.
        self.write_lock = asyncio.Lock()

        self.long_poll_host = LongPollHost()

        self.goal_manager = AsyncGoalManager()
        self.http_state = HTTPState(controller_name, detached, http_config)
        self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)
        # Fetch all running actors in current cluster as source of current
        # replica state for controller failure recovery
        all_current_actor_names = ray.util.list_named_actors()
        self.backend_state_manager = BackendStateManager(
            controller_name, detached, self.kv_store, self.long_poll_host,
            self.goal_manager, all_current_actor_names)

        # TODO(simon): move autoscaling related stuff into a manager.
        self.autoscaling_metrics_store = InMemoryMetricsStore()

        asyncio.get_event_loop().create_task(self.run_control_loop())
Exemplo n.º 2
0
    async def __init__(
        self,
        controller_name: str,
        http_config: HTTPOptions,
        checkpoint_path: str,
        detached: bool = False,
    ):
        configure_component_logger(component_name="controller",
                                   component_id=str(os.getpid()))

        # Used to read/write checkpoints.
        self.ray_worker_namespace = ray.get_runtime_context().namespace
        self.controller_name = controller_name
        self.checkpoint_path = checkpoint_path
        kv_store_namespace = f"{self.controller_name}-{self.ray_worker_namespace}"
        self.kv_store = make_kv_store(checkpoint_path,
                                      namespace=kv_store_namespace)
        self.snapshot_store = RayInternalKVStore(namespace=kv_store_namespace)

        # Dictionary of deployment_name -> proxy_name -> queue length.
        self.deployment_stats = defaultdict(lambda: defaultdict(dict))

        # Used to ensure that only a single state-changing operation happens
        # at any given time.
        self.write_lock = asyncio.Lock()

        self.long_poll_host = LongPollHost()

        self.http_state = HTTPState(
            controller_name,
            detached,
            http_config,
        )
        self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)

        # Fetch all running actors in current cluster as source of current
        # replica state for controller failure recovery
        all_current_actors = ray.util.list_named_actors(all_namespaces=True)
        all_serve_actor_names = [
            actor["name"] for actor in all_current_actors
            if actor["namespace"] == SERVE_NAMESPACE
        ]

        self.deployment_state_manager = DeploymentStateManager(
            controller_name,
            detached,
            self.kv_store,
            self.long_poll_host,
            all_serve_actor_names,
        )

        # Reference to Ray task executing most recent deployment request
        self.config_deployment_request_ref: ObjectRef = None

        # Unix timestamp of latest config deployment request. Defaults to 0.
        self.deployment_timestamp = 0

        asyncio.get_event_loop().create_task(self.run_control_loop())

        self._recover_config_from_checkpoint()
Exemplo n.º 3
0
    async def __init__(self,
                       controller_name: str,
                       http_config: HTTPOptions,
                       detached: bool = False):
        # Used to read/write checkpoints.
        self.kv_store = RayInternalKVStore(namespace=controller_name)

        # Dictionary of backend_tag -> proxy_name -> most recent queue length.
        self.backend_stats = defaultdict(lambda: defaultdict(dict))

        # Used to ensure that only a single state-changing operation happens
        # at any given time.
        self.write_lock = asyncio.Lock()

        # NOTE(simon): Currently we do all-to-all broadcast. This means
        # any listeners will receive notification for all changes. This
        # can be problem at scale, e.g. updating a single backend config
        # will send over the entire configs. In the future, we should
        # optimize the logic to support subscription by key.
        self.long_poll_host = LongPollHost()

        self.goal_manager = AsyncGoalManager()
        self.http_state = HTTPState(controller_name, detached, http_config)
        self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)
        self.backend_state = BackendState(controller_name, detached,
                                          self.kv_store, self.long_poll_host,
                                          self.goal_manager)

        asyncio.get_event_loop().create_task(self.run_control_loop())
Exemplo n.º 4
0
    async def __init__(self,
                       controller_name: str,
                       http_config: HTTPOptions,
                       detached: bool = False):
        # Used to read/write checkpoints.
        controller_namespace = ray.get_runtime_context().namespace
        self.kv_store = RayInternalKVStore(
            namespace=f"{controller_name}-{controller_namespace}")

        # Dictionary of backend_tag -> proxy_name -> most recent queue length.
        self.backend_stats = defaultdict(lambda: defaultdict(dict))

        # Used to ensure that only a single state-changing operation happens
        # at any given time.
        self.write_lock = asyncio.Lock()

        self.long_poll_host = LongPollHost()

        self.goal_manager = AsyncGoalManager()
        self.http_state = HTTPState(controller_name, detached, http_config)
        self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)
        self.backend_state_manager = BackendStateManager(
            controller_name, detached, self.kv_store, self.long_poll_host,
            self.goal_manager)

        asyncio.get_event_loop().create_task(self.run_control_loop())
Exemplo n.º 5
0
    async def __init__(
        self,
        controller_name: str,
        http_config: HTTPOptions,
        checkpoint_path: str,
        detached: bool = False,
        _override_controller_namespace: Optional[str] = None,
    ):
        configure_component_logger(component_name="controller",
                                   component_id=str(os.getpid()))

        # Used to read/write checkpoints.
        self.controller_namespace = ray.get_runtime_context().namespace
        self.controller_name = controller_name
        self.checkpoint_path = checkpoint_path
        kv_store_namespace = f"{self.controller_name}-{self.controller_namespace}"
        self.kv_store = make_kv_store(checkpoint_path,
                                      namespace=kv_store_namespace)
        self.snapshot_store = RayInternalKVStore(namespace=kv_store_namespace)

        # Dictionary of deployment_name -> proxy_name -> queue length.
        self.deployment_stats = defaultdict(lambda: defaultdict(dict))

        # Used to ensure that only a single state-changing operation happens
        # at any given time.
        self.write_lock = asyncio.Lock()

        self.long_poll_host = LongPollHost()

        self.http_state = HTTPState(
            controller_name,
            detached,
            http_config,
            _override_controller_namespace=_override_controller_namespace,
        )
        self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)
        # Fetch all running actors in current cluster as source of current
        # replica state for controller failure recovery
        all_current_actor_names = ray.util.list_named_actors()
        self.deployment_state_manager = DeploymentStateManager(
            controller_name,
            detached,
            self.kv_store,
            self.long_poll_host,
            all_current_actor_names,
            _override_controller_namespace=_override_controller_namespace,
        )

        # Reference to Ray task executing most recent deployment request
        self.config_deployment_request_ref: ObjectRef = None

        # Unix timestamp of latest config deployment request. Defaults to 0.
        self.deployment_timestamp = 0

        # TODO(simon): move autoscaling related stuff into a manager.
        self.autoscaling_metrics_store = InMemoryMetricsStore()

        asyncio.get_event_loop().create_task(self.run_control_loop())
Exemplo n.º 6
0
    async def __init__(self,
                       controller_name: str,
                       http_config: HTTPOptions,
                       detached: bool = False):
        # Used to read/write checkpoints.
        self.kv_store = RayInternalKVStore(namespace=controller_name)

        # Dictionary of backend_tag -> proxy_name -> most recent queue length.
        self.backend_stats = defaultdict(lambda: defaultdict(dict))

        # Used to ensure that only a single state-changing operation happens
        # at any given time.
        self.write_lock = asyncio.Lock()

        # Map of awaiting results
        # TODO(ilr): Checkpoint this once this becomes asynchronous
        self.inflight_results: Dict[UUID, asyncio.Event] = dict()
        self._serializable_inflight_results: Dict[UUID, FutureResult] = dict()

        # HTTP state doesn't currently require a checkpoint.
        self.http_state = HTTPState(controller_name, detached, http_config)

        checkpoint_bytes = self.kv_store.get(CHECKPOINT_KEY)
        if checkpoint_bytes is None:
            logger.debug("No checkpoint found")
            self.backend_state = BackendState(controller_name, detached)
            self.endpoint_state = EndpointState()
        else:
            checkpoint: Checkpoint = pickle.loads(checkpoint_bytes)
            self.backend_state = BackendState(
                controller_name,
                detached,
                checkpoint=checkpoint.backend_state_checkpoint)
            self.endpoint_state = EndpointState(
                checkpoint=checkpoint.endpoint_state_checkpoint)

            self._serializable_inflight_results = checkpoint.inflight_reqs
            for uuid, fut_result in self._serializable_inflight_results.items(
            ):
                self._create_event_with_result(fut_result.requested_goal, uuid)

        # NOTE(simon): Currently we do all-to-all broadcast. This means
        # any listeners will receive notification for all changes. This
        # can be problem at scale, e.g. updating a single backend config
        # will send over the entire configs. In the future, we should
        # optimize the logic to support subscription by key.
        self.long_poll_host = LongPollHost()

        self.notify_backend_configs_changed()
        self.notify_replica_handles_changed()
        self.notify_traffic_policies_changed()
        self.notify_route_table_changed()

        asyncio.get_event_loop().create_task(self.run_control_loop())
Exemplo n.º 7
0
 def _make_http_state(http_options):
     return HTTPState(
         "mock_controller_name",
         detached=True,
         config=http_options,
         _start_proxies_on_init=False,
     )
Exemplo n.º 8
0
class ServeController:
    """Responsible for managing the state of the serving system.

    The controller implements fault tolerance by persisting its state in
    a new checkpoint each time a state change is made. If the actor crashes,
    the latest checkpoint is loaded and the state is recovered. Checkpoints
    are written/read using a provided KV-store interface.

    All hard state in the system is maintained by this actor and persisted via
    these checkpoints. Soft state required by other components is fetched by
    those actors from this actor on startup and updates are pushed out from
    this actor.

    All other actors started by the controller are named, detached actors
    so they will not fate share with the controller if it crashes.

    The following guarantees are provided for state-changing calls to the
    controller:
        - If the call succeeds, the change was made and will be reflected in
          the system even if the controller or other actors die unexpectedly.
        - If the call fails, the change may have been made but isn't guaranteed
          to have been. The client should retry in this case. Note that this
          requires all implementations here to be idempotent.
    """
    async def __init__(
        self,
        controller_name: str,
        http_config: HTTPOptions,
        checkpoint_path: str,
        detached: bool = False,
        _override_controller_namespace: Optional[str] = None,
    ):
        configure_component_logger(component_name="controller",
                                   component_id=str(os.getpid()))

        # Used to read/write checkpoints.
        self.controller_namespace = ray.get_runtime_context().namespace
        self.controller_name = controller_name
        self.checkpoint_path = checkpoint_path
        kv_store_namespace = f"{self.controller_name}-{self.controller_namespace}"
        self.kv_store = make_kv_store(checkpoint_path,
                                      namespace=kv_store_namespace)
        self.snapshot_store = RayInternalKVStore(namespace=kv_store_namespace)

        # Dictionary of deployment_name -> proxy_name -> queue length.
        self.deployment_stats = defaultdict(lambda: defaultdict(dict))

        # Used to ensure that only a single state-changing operation happens
        # at any given time.
        self.write_lock = asyncio.Lock()

        self.long_poll_host = LongPollHost()

        self.http_state = HTTPState(
            controller_name,
            detached,
            http_config,
            _override_controller_namespace=_override_controller_namespace,
        )
        self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)
        # Fetch all running actors in current cluster as source of current
        # replica state for controller failure recovery
        all_current_actor_names = ray.util.list_named_actors()
        self.deployment_state_manager = DeploymentStateManager(
            controller_name,
            detached,
            self.kv_store,
            self.long_poll_host,
            all_current_actor_names,
            _override_controller_namespace=_override_controller_namespace,
        )

        # Reference to Ray task executing most recent deployment request
        self.config_deployment_request_ref: ObjectRef = None

        # Unix timestamp of latest config deployment request. Defaults to 0.
        self.deployment_timestamp = 0

        asyncio.get_event_loop().create_task(self.run_control_loop())

    def check_alive(self) -> None:
        """No-op to check if this controller is alive."""
        return

    def get_pid(self) -> int:
        return os.getpid()

    def record_autoscaling_metrics(self, data: Dict[str, float],
                                   send_timestamp: float):
        self.deployment_state_manager.record_autoscaling_metrics(
            data, send_timestamp)

    def record_handle_metrics(self, data: Dict[str, float],
                              send_timestamp: float):
        self.deployment_state_manager.record_handle_metrics(
            data, send_timestamp)

    def _dump_autoscaling_metrics_for_testing(self):
        return self.deployment_state_manager.get_autoscaling_metrics()

    def _dump_replica_states_for_testing(self, deployment_name):
        return self.deployment_state_manager._deployment_states[
            deployment_name]._replicas

    def _stop_one_running_replica_for_testing(self, deployment_name):
        self.deployment_state_manager._deployment_states[
            deployment_name]._stop_one_running_replica_for_testing()

    async def listen_for_change(self, keys_to_snapshot_ids: Dict[str, int]):
        """Proxy long pull client's listen request.

        Args:
            keys_to_snapshot_ids (Dict[str, int]): Snapshot IDs are used to
              determine whether or not the host should immediately return the
              data or wait for the value to be changed.
        """
        return await (
            self.long_poll_host.listen_for_change(keys_to_snapshot_ids))

    def get_checkpoint_path(self) -> str:
        return self.checkpoint_path

    def get_all_endpoints(self) -> Dict[EndpointTag, Dict[str, Any]]:
        """Returns a dictionary of deployment name to config."""
        return self.endpoint_state.get_endpoints()

    def get_http_proxies(self) -> Dict[NodeId, ActorHandle]:
        """Returns a dictionary of node ID to http_proxy actor handles."""
        return self.http_state.get_http_proxy_handles()

    def get_http_proxy_names(self) -> bytes:
        """Returns the http_proxy actor name list serialized by protobuf."""
        from ray.serve.generated.serve_pb2 import ActorNameList

        actor_name_list = ActorNameList(
            names=self.http_state.get_http_proxy_names().values())
        return actor_name_list.SerializeToString()

    async def run_control_loop(self) -> None:
        # NOTE(edoakes): we catch all exceptions here and simply log them,
        # because an unhandled exception would cause the main control loop to
        # halt, which should *never* happen.
        while True:

            async with self.write_lock:
                try:
                    self.http_state.update()
                except Exception:
                    logger.exception("Exception updating HTTP state.")

                try:
                    self.deployment_state_manager.update()
                except Exception:
                    logger.exception("Exception updating deployment state.")

            try:
                self._put_serve_snapshot()
            except Exception:
                logger.exception("Exception putting serve snapshot.")
            await asyncio.sleep(CONTROL_LOOP_PERIOD_S)

    def _put_serve_snapshot(self) -> None:
        val = dict()
        for deployment_name, (
                deployment_info,
                route_prefix,
        ) in self.list_deployments_internal(include_deleted=True).items():
            entry = dict()
            entry["name"] = deployment_name
            entry["namespace"] = ray.get_runtime_context().namespace
            entry["ray_job_id"] = deployment_info.deployer_job_id.hex()
            entry[
                "class_name"] = deployment_info.replica_config.deployment_def_name
            entry["version"] = deployment_info.version
            entry["http_route"] = route_prefix
            entry["start_time"] = deployment_info.start_time_ms
            entry["end_time"] = deployment_info.end_time_ms or 0
            entry[
                "status"] = "DELETED" if deployment_info.end_time_ms else "RUNNING"
            entry["actors"] = dict()
            if entry["status"] == "RUNNING":
                replicas = self.deployment_state_manager._deployment_states[
                    deployment_name]._replicas
                running_replicas = replicas.get([ReplicaState.RUNNING])
                for replica in running_replicas:
                    try:
                        actor_handle = replica.actor_handle
                    except ValueError:
                        # Actor died or hasn't yet been created.
                        continue
                    actor_id = actor_handle._ray_actor_id.hex()
                    replica_tag = replica.replica_tag
                    replica_version = (None if (replica.version is None
                                                or replica.version.unversioned)
                                       else replica.version.code_version)
                    entry["actors"][actor_id] = {
                        "replica_tag": replica_tag,
                        "version": replica_version,
                    }

            val[deployment_name] = entry
        self.snapshot_store.put(SNAPSHOT_KEY, json.dumps(val).encode("utf-8"))

    def _all_running_replicas(self) -> Dict[str, List[RunningReplicaInfo]]:
        """Used for testing."""
        return self.deployment_state_manager.get_running_replica_infos()

    def get_http_config(self):
        """Return the HTTP proxy configuration."""
        return self.http_state.get_config()

    def get_root_url(self):
        """Return the root url for the serve instance."""
        http_config = self.get_http_config()
        if http_config.root_url == "":
            if SERVE_ROOT_URL_ENV_KEY in os.environ:
                return os.environ[SERVE_ROOT_URL_ENV_KEY]
            else:
                return (f"http://{http_config.host}:{http_config.port}"
                        f"{http_config.root_path}")
        return http_config.root_url

    async def shutdown(self):
        """Shuts down the serve instance completely."""
        async with self.write_lock:
            self.deployment_state_manager.shutdown()
            self.endpoint_state.shutdown()
            self.http_state.shutdown()

    def deploy(
        self,
        name: str,
        deployment_config_proto_bytes: bytes,
        replica_config_proto_bytes: bytes,
        route_prefix: Optional[str],
        deployer_job_id: "ray._raylet.JobID",
    ) -> bool:
        if route_prefix is not None:
            assert route_prefix.startswith("/")

        deployment_config = DeploymentConfig.from_proto_bytes(
            deployment_config_proto_bytes)
        version = deployment_config.version
        prev_version = deployment_config.prev_version
        replica_config = ReplicaConfig.from_proto_bytes(
            replica_config_proto_bytes, deployment_config.deployment_language)

        if prev_version is not None:
            existing_deployment_info = self.deployment_state_manager.get_deployment(
                name)
            if existing_deployment_info is None or not existing_deployment_info.version:
                raise ValueError(
                    f"prev_version '{prev_version}' is specified but "
                    "there is no existing deployment.")
            if existing_deployment_info.version != prev_version:
                raise ValueError(
                    f"prev_version '{prev_version}' "
                    "does not match with the existing "
                    f"version '{existing_deployment_info.version}'.")

        autoscaling_config = deployment_config.autoscaling_config
        if autoscaling_config is not None:
            # TODO: is this the desired behaviour? Should this be a setting?
            deployment_config.num_replicas = autoscaling_config.min_replicas

            autoscaling_policy = BasicAutoscalingPolicy(autoscaling_config)
        else:
            autoscaling_policy = None

        deployment_info = DeploymentInfo(
            actor_name=name,
            version=version,
            deployment_config=deployment_config,
            replica_config=replica_config,
            deployer_job_id=deployer_job_id,
            start_time_ms=int(time.time() * 1000),
            autoscaling_policy=autoscaling_policy,
        )
        # TODO(architkulkarni): When a deployment is redeployed, even if
        # the only change was num_replicas, the start_time_ms is refreshed.
        # Is this the desired behaviour?

        updating = self.deployment_state_manager.deploy(name, deployment_info)

        if route_prefix is not None:
            endpoint_info = EndpointInfo(route=route_prefix)
            self.endpoint_state.update_endpoint(name, endpoint_info)
        else:
            self.endpoint_state.delete_endpoint(name)

        return updating

    def deploy_group(self, deployment_args_list: List[Dict]) -> List[bool]:
        """
        Takes in a list of dictionaries that contain keyword arguments for the
        controller's deploy() function. Calls deploy on all the argument
        dictionaries in the list. Effectively executes an atomic deploy on a
        group of deployments.
        """

        return [self.deploy(**args) for args in deployment_args_list]

    def deploy_app(
        self,
        import_path: str,
        runtime_env: Dict,
        deployment_override_options: List[Dict],
    ) -> None:
        """Kicks off a task that deploys a Serve application.

        Cancels any previous in-progress task that is deploying a Serve
        application.

        Args:
            import_path: Serve deployment graph's import path
            runtime_env: runtime_env to run the deployment graph in
            deployment_override_options: All dictionaries should
                contain argument-value options that can be passed directly
                into a set_options() call. Overrides deployment options set
                in the graph itself.
        """

        if self.config_deployment_request_ref is not None:
            ray.cancel(self.config_deployment_request_ref)
            logger.info("Received new config deployment request. Cancelling "
                        "previous request.")

        self.config_deployment_request_ref = run_graph.options(
            runtime_env=runtime_env).remote(import_path, runtime_env,
                                            deployment_override_options)

        self.deployment_timestamp = time.time()

    def delete_deployment(self, name: str):
        self.endpoint_state.delete_endpoint(name)
        return self.deployment_state_manager.delete_deployment(name)

    def delete_deployments(self, names: Iterable[str]) -> None:
        for name in names:
            self.delete_deployment(name)

    def get_deployment_info(self, name: str) -> bytes:
        """Get the current information about a deployment.

        Args:
            name(str): the name of the deployment.

        Returns:
            DeploymentRoute's protobuf serialized bytes

        Raises:
            KeyError if the deployment doesn't exist.
        """
        deployment_info = self.deployment_state_manager.get_deployment(name)
        if deployment_info is None:
            raise KeyError(f"Deployment {name} does not exist.")

        route = self.endpoint_state.get_endpoint_route(name)

        from ray.serve.generated.serve_pb2 import DeploymentRoute

        deployment_route = DeploymentRoute(
            deployment_info=deployment_info.to_proto(), route=route)
        return deployment_route.SerializeToString()

    def list_deployments_internal(
        self,
        include_deleted: Optional[bool] = False
    ) -> Dict[str, Tuple[DeploymentInfo, str]]:
        """Gets the current information about all deployments.

        Args:
            include_deleted(bool): Whether to include information about
                deployments that have been deleted.

        Returns:
            Dict(deployment_name, (DeploymentInfo, route))

        Raises:
            KeyError if the deployment doesn't exist.
        """
        return {
            name: (
                self.deployment_state_manager.get_deployment(
                    name, include_deleted=include_deleted),
                self.endpoint_state.get_endpoint_route(name),
            )
            for name in self.deployment_state_manager.get_deployment_configs(
                include_deleted=include_deleted)
        }

    def list_deployments(self,
                         include_deleted: Optional[bool] = False) -> bytes:
        """Gets the current information about all deployments.

        Args:
            include_deleted(bool): Whether to include information about
                deployments that have been deleted.

        Returns:
            DeploymentRouteList's protobuf serialized bytes
        """
        from ray.serve.generated.serve_pb2 import DeploymentRouteList, DeploymentRoute

        deployment_route_list = DeploymentRouteList()
        for deployment_name, (
                deployment_info,
                route_prefix,
        ) in self.list_deployments_internal(
                include_deleted=include_deleted).items():
            deployment_info_proto = deployment_info.to_proto()
            deployment_info_proto.name = deployment_name
            deployment_route_list.deployment_routes.append(
                DeploymentRoute(deployment_info=deployment_info_proto,
                                route=route_prefix))
        return deployment_route_list.SerializeToString()

    async def get_serve_status(self) -> bytes:

        serve_app_status = ApplicationStatus.RUNNING
        serve_app_message = ""
        deployment_timestamp = self.deployment_timestamp

        if self.config_deployment_request_ref:
            finished, pending = ray.wait([self.config_deployment_request_ref],
                                         timeout=0)

            if pending:
                serve_app_status = ApplicationStatus.DEPLOYING
            else:
                try:
                    await finished[0]
                except RayTaskError:
                    serve_app_status = ApplicationStatus.DEPLOY_FAILED
                    serve_app_message = f"Deployment failed:\n{traceback.format_exc()}"

        app_status = ApplicationStatusInfo(serve_app_status, serve_app_message,
                                           deployment_timestamp)
        deployment_statuses = self.deployment_state_manager.get_deployment_statuses(
        )

        status_info = StatusOverview(
            app_status=app_status,
            deployment_statuses=deployment_statuses,
        )

        return status_info.to_proto().SerializeToString()
Exemplo n.º 9
0
class ServeController:
    """Responsible for managing the state of the serving system.

    The controller implements fault tolerance by persisting its state in
    a new checkpoint each time a state change is made. If the actor crashes,
    the latest checkpoint is loaded and the state is recovered. Checkpoints
    are written/read using a provided KV-store interface.

    All hard state in the system is maintained by this actor and persisted via
    these checkpoints. Soft state required by other components is fetched by
    those actors from this actor on startup and updates are pushed out from
    this actor.

    All other actors started by the controller are named, detached actors
    so they will not fate share with the controller if it crashes.

    The following guarantees are provided for state-changing calls to the
    controller:
        - If the call succeeds, the change was made and will be reflected in
          the system even if the controller or other actors die unexpectedly.
        - If the call fails, the change may have been made but isn't guaranteed
          to have been. The client should retry in this case. Note that this
          requires all implementations here to be idempotent.
    """
    async def __init__(self,
                       controller_name: str,
                       http_config: HTTPOptions,
                       checkpoint_path: str,
                       detached: bool = False):
        # Used to read/write checkpoints.
        self.controller_namespace = ray.get_runtime_context().namespace
        self.controller_name = controller_name
        self.checkpoint_path = checkpoint_path
        kv_store_namespace = (
            f"{self.controller_name}-{self.controller_namespace}")
        self.kv_store = make_kv_store(checkpoint_path,
                                      namespace=kv_store_namespace)
        self.snapshot_store = RayInternalKVStore(namespace=kv_store_namespace)

        # Dictionary of deployment_name -> proxy_name -> queue length.
        self.deployment_stats = defaultdict(lambda: defaultdict(dict))

        # Used to ensure that only a single state-changing operation happens
        # at any given time.
        self.write_lock = asyncio.Lock()

        self.long_poll_host = LongPollHost()

        self.goal_manager = AsyncGoalManager()
        self.http_state = HTTPState(controller_name, detached, http_config)
        self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)
        # Fetch all running actors in current cluster as source of current
        # replica state for controller failure recovery
        all_current_actor_names = ray.util.list_named_actors()
        self.deployment_state_manager = DeploymentStateManager(
            controller_name, detached, self.kv_store, self.long_poll_host,
            self.goal_manager, all_current_actor_names)

        # TODO(simon): move autoscaling related stuff into a manager.
        self.autoscaling_metrics_store = InMemoryMetricsStore()

        asyncio.get_event_loop().create_task(self.run_control_loop())

    def record_autoscaling_metrics(self, data: Dict[str, float],
                                   send_timestamp: float):
        self.autoscaling_metrics_store.add_metrics_point(data, send_timestamp)

    def _dump_autoscaling_metrics_for_testing(self):
        return self.autoscaling_metrics_store.data

    def _dump_replica_states_for_testing(self, deployment_name):
        return self.deployment_state_manager._deployment_states[
            deployment_name]._replicas

    def _stop_one_running_replica_for_testing(self, deployment_name):
        self.deployment_state_manager._deployment_states[
            deployment_name]._stop_one_running_replica_for_testing()

    async def wait_for_goal(self, goal_id: GoalId) -> Optional[Exception]:
        return await self.goal_manager.wait_for_goal(goal_id)

    async def _num_pending_goals(self) -> int:
        return self.goal_manager.num_pending_goals()

    async def listen_for_change(self, keys_to_snapshot_ids: Dict[str, int]):
        """Proxy long pull client's listen request.

        Args:
            keys_to_snapshot_ids (Dict[str, int]): Snapshot IDs are used to
              determine whether or not the host should immediately return the
              data or wait for the value to be changed.
        """
        return await (
            self.long_poll_host.listen_for_change(keys_to_snapshot_ids))

    def get_checkpoint_path(self) -> str:
        return self.checkpoint_path

    def get_all_endpoints(self) -> Dict[EndpointTag, Dict[str, Any]]:
        """Returns a dictionary of deployment name to config."""
        return self.endpoint_state.get_endpoints()

    def get_http_proxies(self) -> Dict[NodeId, ActorHandle]:
        """Returns a dictionary of node ID to http_proxy actor handles."""
        return self.http_state.get_http_proxy_handles()

    def autoscale(self) -> None:
        """Updates autoscaling deployments with calculated num_replicas."""
        for deployment_name, (deployment_info,
                              route_prefix) in self.list_deployments().items():
            deployment_config = deployment_info.deployment_config
            autoscaling_policy = deployment_info.autoscaling_policy

            if autoscaling_policy is None:
                continue

            replicas = self.deployment_state_manager._deployment_states[
                deployment_name]._replicas
            running_replicas = replicas.get([ReplicaState.RUNNING])

            current_num_ongoing_requests = []
            for replica in running_replicas:
                replica_tag = replica.replica_tag
                num_ongoing_requests = (
                    self.autoscaling_metrics_store.window_average(
                        replica_tag,
                        time.time() -
                        autoscaling_policy.config.look_back_period_s))
                if num_ongoing_requests is not None:
                    current_num_ongoing_requests.append(num_ongoing_requests)

            if len(current_num_ongoing_requests) == 0:
                continue

            new_deployment_config = deployment_config.copy()

            decision_num_replicas = (
                autoscaling_policy.get_decision_num_replicas(
                    current_num_ongoing_requests=current_num_ongoing_requests,
                    curr_target_num_replicas=deployment_config.num_replicas))
            new_deployment_config.num_replicas = decision_num_replicas

            new_deployment_info = copy(deployment_info)
            new_deployment_info.deployment_config = new_deployment_config

            goal_id, updating = self.deployment_state_manager.deploy(
                deployment_name, new_deployment_info)

    async def run_control_loop(self) -> None:
        # NOTE(edoakes): we catch all exceptions here and simply log them,
        # because an unhandled exception would cause the main control loop to
        # halt, which should *never* happen.
        while True:
            try:
                self.autoscale()
            except Exception:
                logger.exception("Exception in autoscaling.")

            async with self.write_lock:
                try:
                    self.http_state.update()
                except Exception:
                    logger.exception("Exception updating HTTP state.")

                try:
                    self.deployment_state_manager.update()
                except Exception:
                    logger.exception("Exception updating deployment state.")

            try:
                self._put_serve_snapshot()
            except Exception:
                logger.exception("Exception putting serve snapshot.")
            await asyncio.sleep(CONTROL_LOOP_PERIOD_S)

    def _put_serve_snapshot(self) -> None:
        val = dict()
        for deployment_name, (deployment_info,
                              route_prefix) in self.list_deployments(
                                  include_deleted=True).items():
            entry = dict()
            entry["name"] = deployment_name
            entry["namespace"] = ray.get_runtime_context().namespace
            entry["ray_job_id"] = deployment_info.deployer_job_id.hex()
            entry["class_name"] = (
                deployment_info.replica_config.func_or_class_name)
            entry["version"] = deployment_info.version
            entry["http_route"] = route_prefix
            entry["start_time"] = deployment_info.start_time_ms
            entry["end_time"] = deployment_info.end_time_ms or 0
            entry["status"] = ("DELETED"
                               if deployment_info.end_time_ms else "RUNNING")
            entry["actors"] = dict()
            if entry["status"] == "RUNNING":
                replicas = self.deployment_state_manager._deployment_states[
                    deployment_name]._replicas
                running_replicas = replicas.get([ReplicaState.RUNNING])
                for replica in running_replicas:
                    try:
                        actor_handle = replica.actor_handle
                    except ValueError:
                        # Actor died or hasn't yet been created.
                        continue
                    actor_id = actor_handle._ray_actor_id.hex()
                    replica_tag = replica.replica_tag
                    replica_version = (None if (replica.version is None
                                                or replica.version.unversioned)
                                       else replica.version.code_version)
                    entry["actors"][actor_id] = {
                        "replica_tag": replica_tag,
                        "version": replica_version
                    }

            val[deployment_name] = entry
        self.snapshot_store.put(SNAPSHOT_KEY, json.dumps(val).encode("utf-8"))

    def _all_running_replicas(self) -> Dict[str, List[RunningReplicaInfo]]:
        """Used for testing."""
        return self.deployment_state_manager.get_running_replica_infos()

    def get_http_config(self):
        """Return the HTTP proxy configuration."""
        return self.http_state.get_config()

    def get_root_url(self):
        """Return the root url for the serve instance."""
        http_config = self.get_http_config()
        if http_config.root_url == "":
            if SERVE_ROOT_URL_ENV_KEY in os.environ:
                return os.environ[SERVE_ROOT_URL_ENV_KEY]
            else:
                return f"http://{http_config.host}:{http_config.port}"
        return http_config.root_url

    async def shutdown(self) -> List[GoalId]:
        """Shuts down the serve instance completely."""
        async with self.write_lock:
            goal_ids = self.deployment_state_manager.shutdown()
            self.endpoint_state.shutdown()
            self.http_state.shutdown()

            return goal_ids

    def deploy(
        self,
        name: str,
        deployment_config_proto_bytes: bytes,
        replica_config: ReplicaConfig,
        version: Optional[str],
        prev_version: Optional[str],
        route_prefix: Optional[str],
        deployer_job_id: "ray._raylet.JobID",
    ) -> Tuple[Optional[GoalId], bool]:
        if route_prefix is not None:
            assert route_prefix.startswith("/")

        deployment_config = DeploymentConfig.from_proto_bytes(
            deployment_config_proto_bytes)

        if prev_version is not None:
            existing_deployment_info = (
                self.deployment_state_manager.get_deployment(name))
            if (existing_deployment_info is None
                    or not existing_deployment_info.version):
                raise ValueError(
                    f"prev_version '{prev_version}' is specified but "
                    "there is no existing deployment.")
            if existing_deployment_info.version != prev_version:
                raise ValueError(
                    f"prev_version '{prev_version}' "
                    "does not match with the existing "
                    f"version '{existing_deployment_info.version}'.")

        autoscaling_config = deployment_config.autoscaling_config
        if autoscaling_config is not None:
            # TODO: is this the desired behaviour? Should this be a setting?
            deployment_config.num_replicas = autoscaling_config.min_replicas

            autoscaling_policy = BasicAutoscalingPolicy(autoscaling_config)
        else:
            autoscaling_policy = None

        deployment_info = DeploymentInfo(
            actor_name=name,
            serialized_deployment_def=replica_config.serialized_deployment_def,
            version=version,
            deployment_config=deployment_config,
            replica_config=replica_config,
            deployer_job_id=deployer_job_id,
            start_time_ms=int(time.time() * 1000),
            autoscaling_policy=autoscaling_policy)
        # TODO(architkulkarni): When a deployment is redeployed, even if
        # the only change was num_replicas, the start_time_ms is refreshed.
        # Is this the desired behaviour?

        goal_id, updating = self.deployment_state_manager.deploy(
            name, deployment_info)
        if route_prefix is not None:
            endpoint_info = EndpointInfo(route=route_prefix)
            self.endpoint_state.update_endpoint(name, endpoint_info)
        return goal_id, updating

    def delete_deployment(self, name: str) -> Optional[GoalId]:
        self.endpoint_state.delete_endpoint(name)
        return self.deployment_state_manager.delete_deployment(name)

    def get_deployment_info(self, name: str) -> Tuple[DeploymentInfo, str]:
        """Get the current information about a deployment.

        Args:
            name(str): the name of the deployment.

        Returns:
            (DeploymentInfo, route)

        Raises:
            KeyError if the deployment doesn't exist.
        """
        deployment_info = self.deployment_state_manager.get_deployment(name)
        if deployment_info is None:
            raise KeyError(f"Deployment {name} does not exist.")

        route = self.endpoint_state.get_endpoint_route(name)

        return deployment_info, route

    def list_deployments(
        self,
        include_deleted: Optional[bool] = False
    ) -> Dict[str, Tuple[DeploymentInfo, str]]:
        """Gets the current information about all deployments.

        Args:
            include_deleted(bool): Whether to include information about
                deployments that have been deleted.

        Returns:
            Dict(deployment_name, (DeploymentInfo, route))

        Raises:
            KeyError if the deployment doesn't exist.
        """
        return {
            name: (self.deployment_state_manager.get_deployment(
                name, include_deleted=include_deleted),
                   self.endpoint_state.get_endpoint_route(name))
            for name in self.deployment_state_manager.get_deployment_configs(
                include_deleted=include_deleted)
        }
Exemplo n.º 10
0
class ServeController:
    """Responsible for managing the state of the serving system.

    The controller implements fault tolerance by persisting its state in
    a new checkpoint each time a state change is made. If the actor crashes,
    the latest checkpoint is loaded and the state is recovered. Checkpoints
    are written/read using a provided KV-store interface.

    All hard state in the system is maintained by this actor and persisted via
    these checkpoints. Soft state required by other components is fetched by
    those actors from this actor on startup and updates are pushed out from
    this actor.

    All other actors started by the controller are named, detached actors
    so they will not fate share with the controller if it crashes.

    The following guarantees are provided for state-changing calls to the
    controller:
        - If the call succeeds, the change was made and will be reflected in
          the system even if the controller or other actors die unexpectedly.
        - If the call fails, the change may have been made but isn't guaranteed
          to have been. The client should retry in this case. Note that this
          requires all implementations here to be idempotent.
    """
    async def __init__(self,
                       controller_name: str,
                       http_config: HTTPOptions,
                       detached: bool = False):
        # Used to read/write checkpoints.
        self.kv_store = RayInternalKVStore(namespace=controller_name)

        # Dictionary of backend_tag -> proxy_name -> most recent queue length.
        self.backend_stats = defaultdict(lambda: defaultdict(dict))

        # Used to ensure that only a single state-changing operation happens
        # at any given time.
        self.write_lock = asyncio.Lock()

        self.long_poll_host = LongPollHost()

        self.goal_manager = AsyncGoalManager()
        self.http_state = HTTPState(controller_name, detached, http_config)
        self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)
        self.backend_state = BackendState(controller_name, detached,
                                          self.kv_store, self.long_poll_host,
                                          self.goal_manager)

        asyncio.get_event_loop().create_task(self.run_control_loop())

    async def wait_for_goal(self, goal_id: GoalId) -> None:
        await self.goal_manager.wait_for_goal(goal_id)

    async def _num_pending_goals(self) -> int:
        return self.goal_manager.num_pending_goals()

    async def listen_for_change(self, keys_to_snapshot_ids: Dict[str, int]):
        """Proxy long pull client's listen request.

        Args:
            keys_to_snapshot_ids (Dict[str, int]): Snapshot IDs are used to
              determine whether or not the host should immediately return the
              data or wait for the value to be changed.
        """
        return await (
            self.long_poll_host.listen_for_change(keys_to_snapshot_ids))

    def get_http_proxies(self) -> Dict[NodeId, ActorHandle]:
        """Returns a dictionary of node ID to http_proxy actor handles."""
        return self.http_state.get_http_proxy_handles()

    async def run_control_loop(self) -> None:
        while True:
            async with self.write_lock:
                try:
                    self.http_state.update()
                except Exception as e:
                    logger.error(f"Exception updating HTTP state: {e}")
                try:
                    self.backend_state.update()
                except Exception as e:
                    logger.error(f"Exception updating backend state: {e}")

            await asyncio.sleep(CONTROL_LOOP_PERIOD_S)

    def _all_replica_handles(
            self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]:
        """Used for testing."""
        return self.backend_state.get_running_replica_handles()

    def get_all_backends(self) -> Dict[BackendTag, BackendConfig]:
        """Returns a dictionary of backend tag to backend config."""
        return self.backend_state.get_backend_configs()

    def get_all_endpoints(self) -> Dict[EndpointTag, Dict[BackendTag, Any]]:
        """Returns a dictionary of backend tag to backend config."""
        return self.endpoint_state.get_endpoints()

    def _validate_traffic_dict(self, traffic_dict: Dict[str, float]):
        for backend in traffic_dict:
            if self.backend_state.get_backend(backend) is None:
                raise ValueError(
                    "Attempted to assign traffic to a backend '{}' that "
                    "is not registered.".format(backend))

    async def set_traffic(self, endpoint: str,
                          traffic_dict: Dict[str, float]) -> None:
        """Sets the traffic policy for the specified endpoint."""
        async with self.write_lock:
            self._validate_traffic_dict(traffic_dict)

            logger.info("Setting traffic for endpoint "
                        f"'{endpoint}' to '{traffic_dict}'.")

            self.endpoint_state.set_traffic_policy(endpoint,
                                                   TrafficPolicy(traffic_dict))

    async def shadow_traffic(self, endpoint_name: str, backend_tag: BackendTag,
                             proportion: float) -> None:
        """Shadow traffic from the endpoint to the backend."""
        async with self.write_lock:
            if self.backend_state.get_backend(backend_tag) is None:
                raise ValueError(
                    "Attempted to shadow traffic to a backend '{}' that "
                    "is not registered.".format(backend_tag))

            logger.info(
                "Shadowing '{}' of traffic to endpoint '{}' to backend '{}'.".
                format(proportion, endpoint_name, backend_tag))

            self.endpoint_state.shadow_traffic(endpoint_name, backend_tag,
                                               proportion)

    async def create_endpoint(
        self,
        endpoint: str,
        traffic_dict: Dict[str, float],
        route: Optional[str],
        methods: Set[str],
    ) -> None:
        """Create a new endpoint with the specified route and methods.

        If the route is None, this is a "headless" endpoint that will not
        be exposed over HTTP and can only be accessed via a handle.
        """
        async with self.write_lock:
            self._validate_traffic_dict(traffic_dict)

            logger.info(
                "Registering route '{}' to endpoint '{}' with methods '{}'.".
                format(route, endpoint, methods))

            self.endpoint_state.create_endpoint(
                endpoint, EndpointInfo(methods, route=route),
                TrafficPolicy(traffic_dict))

        # TODO(simon): Use GoalID mechanism for this so client can check for
        # goal id and http_state complete the goal id.
        await self.http_state.ensure_http_route_exists(endpoint, timeout_s=30)

    async def delete_endpoint(self, endpoint: str) -> None:
        """Delete the specified endpoint.

        Does not modify any corresponding backends.
        """
        logger.info("Deleting endpoint '{}'".format(endpoint))
        async with self.write_lock:
            self.endpoint_state.delete_endpoint(endpoint)

    async def create_backend(
            self, backend_tag: BackendTag, backend_config: BackendConfig,
            replica_config: ReplicaConfig) -> Optional[GoalId]:
        """Register a new backend under the specified tag."""
        async with self.write_lock:
            backend_info = BackendInfo(actor_def=ray.remote(
                create_backend_replica(backend_tag,
                                       replica_config.serialized_backend_def)),
                                       version=RESERVED_VERSION_TAG,
                                       backend_config=backend_config,
                                       replica_config=replica_config)
            goal_id, _ = self.backend_state.deploy_backend(
                backend_tag, backend_info)
            return goal_id

    async def delete_backend(self,
                             backend_tag: BackendTag,
                             force_kill: bool = False) -> Optional[GoalId]:
        async with self.write_lock:
            # Check that the specified backend isn't used by any endpoints.
            for endpoint, info in self.endpoint_state.get_endpoints().items():
                if (backend_tag in info["traffic"]
                        or backend_tag in info["shadows"]):
                    raise ValueError("Backend '{}' is used by endpoint '{}' "
                                     "and cannot be deleted. Please remove "
                                     "the backend from all endpoints and try "
                                     "again.".format(backend_tag, endpoint))
            return self.backend_state.delete_backend(backend_tag, force_kill)

    async def update_backend_config(self, backend_tag: BackendTag,
                                    config_options: BackendConfig) -> GoalId:
        """Set the config for the specified backend."""
        async with self.write_lock:
            existing_info = self.backend_state.get_backend(backend_tag)
            if existing_info is None:
                raise ValueError(f"Backend {backend_tag} is not registered.")

            backend_info = BackendInfo(
                actor_def=existing_info.actor_def,
                version=existing_info.version,
                backend_config=existing_info.backend_config.copy(
                    update=config_options.dict(exclude_unset=True)),
                replica_config=existing_info.replica_config)
            goal_id, _ = self.backend_state.deploy_backend(
                backend_tag, backend_info)
            return goal_id

    def get_backend_config(self, backend_tag: BackendTag) -> BackendConfig:
        """Get the current config for the specified backend."""
        if self.backend_state.get_backend(backend_tag) is None:
            raise ValueError(f"Backend {backend_tag} is not registered.")
        return self.backend_state.get_backend(backend_tag).backend_config

    def get_http_config(self):
        """Return the HTTP proxy configuration."""
        return self.http_state.get_config()

    async def shutdown(self) -> None:
        """Shuts down the serve instance completely."""
        async with self.write_lock:
            self.backend_state.shutdown()
            self.endpoint_state.shutdown()
            self.http_state.shutdown()

    async def deploy(
            self, name: str, backend_config: BackendConfig,
            replica_config: ReplicaConfig, python_methods: List[str],
            version: Optional[str],
            route_prefix: Optional[str]) -> Tuple[Optional[GoalId], bool]:
        if route_prefix is not None:
            assert route_prefix.startswith("/")

        async with self.write_lock:
            backend_info = BackendInfo(actor_def=ray.remote(
                create_backend_replica(name,
                                       replica_config.serialized_backend_def)),
                                       version=version,
                                       backend_config=backend_config,
                                       replica_config=replica_config)

            goal_id, updating = self.backend_state.deploy_backend(
                name, backend_info)
            endpoint_info = EndpointInfo(ALL_HTTP_METHODS,
                                         route=route_prefix,
                                         python_methods=python_methods,
                                         legacy=False)
            self.endpoint_state.update_endpoint(name, endpoint_info,
                                                TrafficPolicy({name: 1.0}))
            return goal_id, updating

    def delete_deployment(self, name: str) -> Optional[GoalId]:
        self.endpoint_state.delete_endpoint(name)
        return self.backend_state.delete_backend(name, force_kill=False)

    def get_deployment_info(self, name: str) -> Tuple[BackendInfo, str]:
        """Get the current information about a deployment.

        Args:
            name(str): the name of the deployment.

        Returns:
            (BackendInfo, route)

        Raises:
            KeyError if the deployment doesn't exist.
        """
        backend_info: BackendInfo = self.backend_state.get_backend(name)
        if backend_info is None:
            raise KeyError(f"Deployment {name} does not exist.")

        route = self.endpoint_state.get_endpoint_route(name)

        return backend_info, route

    def list_deployments(self) -> Dict[str, Tuple[BackendInfo, str]]:
        """Gets the current information about all active deployments."""
        return {
            name: (self.backend_state.get_backend(name),
                   self.endpoint_state.get_endpoint_route(name))
            for name in self.backend_state.get_backend_configs()
        }
Exemplo n.º 11
0
class ServeController:
    """Responsible for managing the state of the serving system.

    The controller implements fault tolerance by persisting its state in
    a new checkpoint each time a state change is made. If the actor crashes,
    the latest checkpoint is loaded and the state is recovered. Checkpoints
    are written/read using a provided KV-store interface.

    All hard state in the system is maintained by this actor and persisted via
    these checkpoints. Soft state required by other components is fetched by
    those actors from this actor on startup and updates are pushed out from
    this actor.

    All other actors started by the controller are named, detached actors
    so they will not fate share with the controller if it crashes.

    The following guarantees are provided for state-changing calls to the
    controller:
        - If the call succeeds, the change was made and will be reflected in
          the system even if the controller or other actors die unexpectedly.
        - If the call fails, the change may have been made but isn't guaranteed
          to have been. The client should retry in this case. Note that this
          requires all implementations here to be idempotent.
    """
    async def __init__(self,
                       controller_name: str,
                       http_config: HTTPOptions,
                       detached: bool = False):
        # Used to read/write checkpoints.
        controller_namespace = ray.get_runtime_context().namespace
        self.kv_store = RayInternalKVStore(
            namespace=f"{controller_name}-{controller_namespace}")

        # Dictionary of backend_tag -> proxy_name -> most recent queue length.
        self.backend_stats = defaultdict(lambda: defaultdict(dict))

        # Used to ensure that only a single state-changing operation happens
        # at any given time.
        self.write_lock = asyncio.Lock()

        self.long_poll_host = LongPollHost()

        self.goal_manager = AsyncGoalManager()
        self.http_state = HTTPState(controller_name, detached, http_config)
        self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)
        self.backend_state_manager = BackendStateManager(
            controller_name, detached, self.kv_store, self.long_poll_host,
            self.goal_manager)

        asyncio.get_event_loop().create_task(self.run_control_loop())

    async def wait_for_goal(self, goal_id: GoalId) -> Optional[Exception]:
        return await self.goal_manager.wait_for_goal(goal_id)

    async def _num_pending_goals(self) -> int:
        return self.goal_manager.num_pending_goals()

    async def listen_for_change(self, keys_to_snapshot_ids: Dict[str, int]):
        """Proxy long pull client's listen request.

        Args:
            keys_to_snapshot_ids (Dict[str, int]): Snapshot IDs are used to
              determine whether or not the host should immediately return the
              data or wait for the value to be changed.
        """
        return await (
            self.long_poll_host.listen_for_change(keys_to_snapshot_ids))

    def get_all_endpoints(self) -> Dict[EndpointTag, Dict[BackendTag, Any]]:
        """Returns a dictionary of backend tag to backend config."""
        return self.endpoint_state.get_endpoints()

    def get_http_proxies(self) -> Dict[NodeId, ActorHandle]:
        """Returns a dictionary of node ID to http_proxy actor handles."""
        return self.http_state.get_http_proxy_handles()

    async def run_control_loop(self) -> None:
        while True:
            async with self.write_lock:
                try:
                    self.http_state.update()
                except Exception:
                    logger.exception("Exception updating HTTP state.")
                try:
                    self.backend_state_manager.update()
                except Exception:
                    logger.exception("Exception updating backend state.")
            self._put_serve_snapshot()
            await asyncio.sleep(CONTROL_LOOP_PERIOD_S)

    def _put_serve_snapshot(self) -> None:
        val = dict()
        for deployment_name, (backend_info,
                              route_prefix) in self.list_deployments(
                                  include_deleted=True).items():
            entry = dict()
            entry["name"] = deployment_name
            entry["namespace"] = ray.get_runtime_context().namespace
            entry["ray_job_id"] = ("None"
                                   if backend_info.deployer_job_id is None else
                                   backend_info.deployer_job_id.hex())
            entry[
                "class_name"] = backend_info.replica_config.func_or_class_name
            entry["version"] = backend_info.version or "None"
            # TODO(architkulkarni): When we add the feature to allow
            # deployments with no HTTP route, update the below line.
            # Or refactor the route_prefix logic in the Deployment class.
            entry["http_route"] = route_prefix or f"/{deployment_name}"
            entry["start_time"] = backend_info.start_time_ms
            entry["end_time"] = backend_info.end_time_ms or 0
            entry["status"] = ("DELETED"
                               if backend_info.end_time_ms else "RUNNING")
            entry["actors"] = dict()
            if entry["status"] == "RUNNING":
                replicas = self.backend_state_manager._backend_states[
                    deployment_name]._replicas
                running_replicas = replicas.get([ReplicaState.RUNNING])
                for replica in running_replicas:
                    try:
                        actor_handle = replica.actor_handle
                    except ValueError:
                        # Actor died or hasn't yet been created.
                        continue
                    actor_id = actor_handle._ray_actor_id.hex()
                    replica_tag = replica.replica_tag
                    replica_version = ("None" if replica.version.unversioned
                                       else replica.version.code_version)
                    entry["actors"][actor_id] = {
                        "replica_tag": replica_tag,
                        "version": replica_version
                    }

            val[deployment_name] = entry
        self.kv_store.put(SNAPSHOT_KEY, json.dumps(val).encode("utf-8"))

    def _all_replica_handles(
            self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]:
        """Used for testing."""
        return self.backend_state_manager.get_running_replica_handles()

    def get_http_config(self):
        """Return the HTTP proxy configuration."""
        return self.http_state.get_config()

    async def shutdown(self) -> List[GoalId]:
        """Shuts down the serve instance completely."""
        async with self.write_lock:
            goal_ids = self.backend_state_manager.shutdown()
            self.endpoint_state.shutdown()
            self.http_state.shutdown()

            return goal_ids

    async def deploy(
        self,
        name: str,
        backend_config: BackendConfig,
        replica_config: ReplicaConfig,
        python_methods: List[str],
        version: Optional[str],
        prev_version: Optional[str],
        route_prefix: Optional[str],
        deployer_job_id: "Optional[ray._raylet.JobID]" = None
    ) -> Tuple[Optional[GoalId], bool]:
        if route_prefix is not None:
            assert route_prefix.startswith("/")

        async with self.write_lock:
            if prev_version is not None:
                existing_backend_info = self.backend_state_manager.get_backend(
                    name)
                if (existing_backend_info is None
                        or not existing_backend_info.version):
                    raise ValueError(
                        f"prev_version '{prev_version}' is specified but "
                        "there is no existing deployment.")
                if existing_backend_info.version != prev_version:
                    raise ValueError(
                        f"prev_version '{prev_version}' "
                        "does not match with the existing "
                        f"version '{existing_backend_info.version}'.")
            backend_info = BackendInfo(actor_def=ray.remote(
                create_backend_replica(name,
                                       replica_config.serialized_backend_def)),
                                       version=version,
                                       backend_config=backend_config,
                                       replica_config=replica_config,
                                       deployer_job_id=deployer_job_id,
                                       start_time_ms=int(time.time() * 1000))

            goal_id, updating = self.backend_state_manager.deploy_backend(
                name, backend_info)
            endpoint_info = EndpointInfo(route=route_prefix,
                                         python_methods=python_methods)
            self.endpoint_state.update_endpoint(name, endpoint_info)
            return goal_id, updating

    def delete_deployment(self, name: str) -> Optional[GoalId]:
        self.endpoint_state.delete_endpoint(name)
        return self.backend_state_manager.delete_backend(name,
                                                         force_kill=False)

    def get_deployment_info(self, name: str) -> Tuple[BackendInfo, str]:
        """Get the current information about a deployment.

        Args:
            name(str): the name of the deployment.

        Returns:
            (BackendInfo, route)

        Raises:
            KeyError if the deployment doesn't exist.
        """
        backend_info: BackendInfo = self.backend_state_manager.get_backend(
            name)
        if backend_info is None:
            raise KeyError(f"Deployment {name} does not exist.")

        route = self.endpoint_state.get_endpoint_route(name)

        return backend_info, route

    def list_deployments(
        self,
        include_deleted: Optional[bool] = False
    ) -> Dict[str, Tuple[BackendInfo, str]]:
        """Gets the current information about all deployments.

        Args:
            include_deleted(bool): Whether to include information about
                deployments that have been deleted.

        Returns:
            Dict(deployment_name, (BackendInfo, route))

        Raises:
            KeyError if the deployment doesn't exist.
        """
        return {
            name: (self.backend_state_manager.get_backend(
                name, include_deleted=include_deleted),
                   self.endpoint_state.get_endpoint_route(name))
            for name in self.backend_state_manager.get_backend_configs(
                include_deleted=include_deleted)
        }
Exemplo n.º 12
0
class ServeController:
    """Responsible for managing the state of the serving system.

    The controller implements fault tolerance by persisting its state in
    a new checkpoint each time a state change is made. If the actor crashes,
    the latest checkpoint is loaded and the state is recovered. Checkpoints
    are written/read using a provided KV-store interface.

    All hard state in the system is maintained by this actor and persisted via
    these checkpoints. Soft state required by other components is fetched by
    those actors from this actor on startup and updates are pushed out from
    this actor.

    All other actors started by the controller are named, detached actors
    so they will not fate share with the controller if it crashes.

    The following guarantees are provided for state-changing calls to the
    controller:
        - If the call succeeds, the change was made and will be reflected in
          the system even if the controller or other actors die unexpectedly.
        - If the call fails, the change may have been made but isn't guaranteed
          to have been. The client should retry in this case. Note that this
          requires all implementations here to be idempotent.
    """

    async def __init__(self,
                       controller_name: str,
                       http_config: HTTPOptions,
                       detached: bool = False):
        # Used to read/write checkpoints.
        self.kv_store = RayInternalKVStore(namespace=controller_name)

        # Dictionary of backend_tag -> proxy_name -> most recent queue length.
        self.backend_stats = defaultdict(lambda: defaultdict(dict))

        # Used to ensure that only a single state-changing operation happens
        # at any given time.
        self.write_lock = asyncio.Lock()

        # NOTE(simon): Currently we do all-to-all broadcast. This means
        # any listeners will receive notification for all changes. This
        # can be problem at scale, e.g. updating a single backend config
        # will send over the entire configs. In the future, we should
        # optimize the logic to support subscription by key.
        self.long_poll_host = LongPollHost()

        self.goal_manager = AsyncGoalManager()
        self.http_state = HTTPState(controller_name, detached, http_config)
        self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)
        self.backend_state = BackendState(controller_name, detached,
                                          self.kv_store, self.long_poll_host,
                                          self.goal_manager)

        asyncio.get_event_loop().create_task(self.run_control_loop())

    async def wait_for_goal(self, goal_id: GoalId) -> None:
        await self.goal_manager.wait_for_goal(goal_id)

    async def _num_pending_goals(self) -> int:
        return self.goal_manager.num_pending_goals()

    async def listen_for_change(self, keys_to_snapshot_ids: Dict[str, int]):
        """Proxy long pull client's listen request.

        Args:
            keys_to_snapshot_ids (Dict[str, int]): Snapshot IDs are used to
              determine whether or not the host should immediately return the
              data or wait for the value to be changed.
        """
        return await (
            self.long_poll_host.listen_for_change(keys_to_snapshot_ids))

    def get_http_proxies(self) -> Dict[NodeId, ActorHandle]:
        """Returns a dictionary of node ID to http_proxy actor handles."""
        return self.http_state.get_http_proxy_handles()

    async def run_control_loop(self) -> None:
        while True:
            async with self.write_lock:
                self.http_state.update()
                self.backend_state.update()

            await asyncio.sleep(CONTROL_LOOP_PERIOD_S)

    def _all_replica_handles(
            self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]:
        """Used for testing."""
        return self.backend_state.get_running_replica_handles()

    def get_all_backends(self) -> Dict[BackendTag, BackendConfig]:
        """Returns a dictionary of backend tag to backend config."""
        return self.backend_state.get_backend_configs()

    def get_all_endpoints(self) -> Dict[EndpointTag, Dict[BackendTag, Any]]:
        """Returns a dictionary of backend tag to backend config."""
        return self.endpoint_state.get_endpoints()

    def _validate_traffic_dict(self, traffic_dict: Dict[str, float]):
        for backend in traffic_dict:
            if self.backend_state.get_backend(backend) is None:
                raise ValueError(
                    "Attempted to assign traffic to a backend '{}' that "
                    "is not registered.".format(backend))

    async def set_traffic(self, endpoint: str,
                          traffic_dict: Dict[str, float]) -> None:
        """Sets the traffic policy for the specified endpoint."""
        async with self.write_lock:
            self._validate_traffic_dict(traffic_dict)

            logger.info("Setting traffic for endpoint "
                        f"'{endpoint}' to '{traffic_dict}'.")

            self.endpoint_state.set_traffic_policy(endpoint,
                                                   TrafficPolicy(traffic_dict))

    async def shadow_traffic(self, endpoint_name: str, backend_tag: BackendTag,
                             proportion: float) -> None:
        """Shadow traffic from the endpoint to the backend."""
        async with self.write_lock:
            if self.backend_state.get_backend(backend_tag) is None:
                raise ValueError(
                    "Attempted to shadow traffic to a backend '{}' that "
                    "is not registered.".format(backend_tag))

            logger.info(
                "Shadowing '{}' of traffic to endpoint '{}' to backend '{}'.".
                format(proportion, endpoint_name, backend_tag))

            self.endpoint_state.shadow_traffic(endpoint_name, backend_tag,
                                               proportion)

    async def create_endpoint(
            self,
            endpoint: str,
            traffic_dict: Dict[str, float],
            route: Optional[str],
            methods: List[str],
    ) -> None:
        """Create a new endpoint with the specified route and methods.

        If the route is None, this is a "headless" endpoint that will not
        be exposed over HTTP and can only be accessed via a handle.
        """
        async with self.write_lock:
            self._validate_traffic_dict(traffic_dict)

            logger.info(
                "Registering route '{}' to endpoint '{}' with methods '{}'.".
                format(route, endpoint, methods))

            self.endpoint_state.create_endpoint(endpoint, route, methods,
                                                TrafficPolicy(traffic_dict))

    async def delete_endpoint(self, endpoint: str) -> None:
        """Delete the specified endpoint.

        Does not modify any corresponding backends.
        """
        logger.info("Deleting endpoint '{}'".format(endpoint))
        async with self.write_lock:
            self.endpoint_state.delete_endpoint(endpoint)

    async def create_backend(
            self, backend_tag: BackendTag, backend_config: BackendConfig,
            replica_config: ReplicaConfig) -> Optional[GoalId]:
        """Register a new backend under the specified tag."""
        async with self.write_lock:
            return self.backend_state.create_backend(
                backend_tag, backend_config, replica_config)

    async def delete_backend(self,
                             backend_tag: BackendTag,
                             force_kill: bool = False) -> Optional[GoalId]:
        async with self.write_lock:
            # Check that the specified backend isn't used by any endpoints.
            for endpoint, info in self.endpoint_state.get_endpoints().items():
                if (backend_tag in info["traffic"]
                        or backend_tag in info["shadows"]):
                    raise ValueError("Backend '{}' is used by endpoint '{}' "
                                     "and cannot be deleted. Please remove "
                                     "the backend from all endpoints and try "
                                     "again.".format(backend_tag, endpoint))
            return self.backend_state.delete_backend(backend_tag, force_kill)

    async def update_backend_config(self, backend_tag: BackendTag,
                                    config_options: BackendConfig) -> GoalId:
        """Set the config for the specified backend."""
        async with self.write_lock:
            return self.backend_state.update_backend_config(
                backend_tag, config_options)

    def get_backend_config(self, backend_tag: BackendTag) -> BackendConfig:
        """Get the current config for the specified backend."""
        if self.backend_state.get_backend(backend_tag) is None:
            raise ValueError(f"Backend {backend_tag} is not registered.")
        return self.backend_state.get_backend(backend_tag).backend_config

    def get_http_config(self):
        """Return the HTTP proxy configuration."""
        return self.http_state.get_config()

    async def shutdown(self) -> None:
        """Shuts down the serve instance completely."""
        async with self.write_lock:
            for proxy in self.http_state.get_http_proxy_handles().values():
                ray.kill(proxy, no_restart=True)
            for replica_dict in self.backend_state.get_running_replica_handles(
            ).values():
                for replica in replica_dict.values():
                    ray.kill(replica, no_restart=True)
            self.kv_store.delete(CHECKPOINT_KEY)

    async def deploy(self, name: str, backend_config: BackendConfig,
                     replica_config: ReplicaConfig,
                     version: Optional[str]) -> Optional[GoalId]:
        """TODO."""
        async with self.write_lock:
            if version is None:
                version = RESERVED_VERSION_TAG
            else:
                if version == RESERVED_VERSION_TAG:
                    # TODO(edoakes): this is unlikely to ever be hit, but it's
                    # still ugly and should be removed once the old codepath
                    # can be deleted.
                    raise ValueError(
                        f"Version {RESERVED_VERSION_TAG} is reserved and "
                        "cannot be used by applications.")
            goal_id = self.backend_state.create_backend(
                name, backend_config, replica_config, version)

            self.endpoint_state.create_endpoint(name, f"/{name}",
                                                ["GET", "POST"],
                                                TrafficPolicy({
                                                    name: 1.0
                                                }))
            return goal_id
Exemplo n.º 13
0
class ServeController:
    """Responsible for managing the state of the serving system.

    The controller implements fault tolerance by persisting its state in
    a new checkpoint each time a state change is made. If the actor crashes,
    the latest checkpoint is loaded and the state is recovered. Checkpoints
    are written/read using a provided KV-store interface.

    All hard state in the system is maintained by this actor and persisted via
    these checkpoints. Soft state required by other components is fetched by
    those actors from this actor on startup and updates are pushed out from
    this actor.

    All other actors started by the controller are named, detached actors
    so they will not fate share with the controller if it crashes.

    The following guarantees are provided for state-changing calls to the
    controller:
        - If the call succeeds, the change was made and will be reflected in
          the system even if the controller or other actors die unexpectedly.
        - If the call fails, the change may have been made but isn't guaranteed
          to have been. The client should retry in this case. Note that this
          requires all implementations here to be idempotent.
    """

    async def __init__(self,
                       controller_name: str,
                       http_config: HTTPOptions,
                       detached: bool = False):
        # Used to read/write checkpoints.
        self.kv_store = RayInternalKVStore(namespace=controller_name)

        # Dictionary of backend_tag -> proxy_name -> most recent queue length.
        self.backend_stats = defaultdict(lambda: defaultdict(dict))

        # Used to ensure that only a single state-changing operation happens
        # at any given time.
        self.write_lock = asyncio.Lock()

        self.long_poll_host = LongPollHost()

        self.goal_manager = AsyncGoalManager()
        self.http_state = HTTPState(controller_name, detached, http_config)
        self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)
        self.backend_state = BackendState(controller_name, detached,
                                          self.kv_store, self.long_poll_host,
                                          self.goal_manager)

        asyncio.get_event_loop().create_task(self.run_control_loop())

    async def wait_for_goal(self, goal_id: GoalId) -> None:
        await self.goal_manager.wait_for_goal(goal_id)

    async def _num_pending_goals(self) -> int:
        return self.goal_manager.num_pending_goals()

    async def listen_for_change(self, keys_to_snapshot_ids: Dict[str, int]):
        """Proxy long pull client's listen request.

        Args:
            keys_to_snapshot_ids (Dict[str, int]): Snapshot IDs are used to
              determine whether or not the host should immediately return the
              data or wait for the value to be changed.
        """
        return await (
            self.long_poll_host.listen_for_change(keys_to_snapshot_ids))

    def get_http_proxies(self) -> Dict[NodeId, ActorHandle]:
        """Returns a dictionary of node ID to http_proxy actor handles."""
        return self.http_state.get_http_proxy_handles()

    async def run_control_loop(self) -> None:
        while True:
            async with self.write_lock:
                self.http_state.update()
                self.backend_state.update()

            await asyncio.sleep(CONTROL_LOOP_PERIOD_S)

    def _all_replica_handles(
            self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]:
        """Used for testing."""
        return self.backend_state.get_running_replica_handles()

    def get_all_backends(self) -> Dict[BackendTag, BackendConfig]:
        """Returns a dictionary of backend tag to backend config."""
        return self.backend_state.get_backend_configs()

    def get_all_endpoints(self) -> Dict[EndpointTag, Dict[BackendTag, Any]]:
        """Returns a dictionary of backend tag to backend config."""
        return self.endpoint_state.get_endpoints()

    def _validate_traffic_dict(self, traffic_dict: Dict[str, float]):
        for backend in traffic_dict:
            if self.backend_state.get_backend(backend) is None:
                raise ValueError(
                    "Attempted to assign traffic to a backend '{}' that "
                    "is not registered.".format(backend))

    async def set_traffic(self, endpoint: str,
                          traffic_dict: Dict[str, float]) -> None:
        """Sets the traffic policy for the specified endpoint."""
        async with self.write_lock:
            self._validate_traffic_dict(traffic_dict)

            logger.info("Setting traffic for endpoint "
                        f"'{endpoint}' to '{traffic_dict}'.")

            self.endpoint_state.set_traffic_policy(endpoint,
                                                   TrafficPolicy(traffic_dict))

    async def shadow_traffic(self, endpoint_name: str, backend_tag: BackendTag,
                             proportion: float) -> None:
        """Shadow traffic from the endpoint to the backend."""
        async with self.write_lock:
            if self.backend_state.get_backend(backend_tag) is None:
                raise ValueError(
                    "Attempted to shadow traffic to a backend '{}' that "
                    "is not registered.".format(backend_tag))

            logger.info(
                "Shadowing '{}' of traffic to endpoint '{}' to backend '{}'.".
                format(proportion, endpoint_name, backend_tag))

            self.endpoint_state.shadow_traffic(endpoint_name, backend_tag,
                                               proportion)

    async def create_endpoint(
            self,
            endpoint: str,
            traffic_dict: Dict[str, float],
            route: Optional[str],
            methods: List[str],
    ) -> None:
        """Create a new endpoint with the specified route and methods.

        If the route is None, this is a "headless" endpoint that will not
        be exposed over HTTP and can only be accessed via a handle.
        """
        async with self.write_lock:
            self._validate_traffic_dict(traffic_dict)

            logger.info(
                "Registering route '{}' to endpoint '{}' with methods '{}'.".
                format(route, endpoint, methods))

            self.endpoint_state.create_endpoint(endpoint, route, methods,
                                                TrafficPolicy(traffic_dict))

    async def delete_endpoint(self, endpoint: str) -> None:
        """Delete the specified endpoint.

        Does not modify any corresponding backends.
        """
        logger.info("Deleting endpoint '{}'".format(endpoint))
        async with self.write_lock:
            self.endpoint_state.delete_endpoint(endpoint)

    async def create_backend(
            self, backend_tag: BackendTag, backend_config: BackendConfig,
            replica_config: ReplicaConfig) -> Optional[GoalId]:
        """Register a new backend under the specified tag."""
        async with self.write_lock:
            backend_info = BackendInfo(
                worker_class=create_backend_replica(
                    replica_config.backend_def),
                version=RESERVED_VERSION_TAG,
                backend_config=backend_config,
                replica_config=replica_config)
            return self.backend_state.deploy_backend(backend_tag, backend_info)

    async def delete_backend(self,
                             backend_tag: BackendTag,
                             force_kill: bool = False) -> Optional[GoalId]:
        async with self.write_lock:
            # Check that the specified backend isn't used by any endpoints.
            for endpoint, info in self.endpoint_state.get_endpoints().items():
                if (backend_tag in info["traffic"]
                        or backend_tag in info["shadows"]):
                    raise ValueError("Backend '{}' is used by endpoint '{}' "
                                     "and cannot be deleted. Please remove "
                                     "the backend from all endpoints and try "
                                     "again.".format(backend_tag, endpoint))
            return self.backend_state.delete_backend(backend_tag, force_kill)

    async def update_backend_config(self, backend_tag: BackendTag,
                                    config_options: BackendConfig) -> GoalId:
        """Set the config for the specified backend."""
        async with self.write_lock:
            existing_info = self.backend_state.get_backend(backend_tag)
            if existing_info is None:
                raise ValueError(f"Backend {backend_tag} is not registered.")

            backend_info = BackendInfo(
                worker_class=existing_info.worker_class,
                version=existing_info.version,
                backend_config=existing_info.backend_config.copy(
                    update=config_options.dict(exclude_unset=True)),
                replica_config=existing_info.replica_config)
            return self.backend_state.deploy_backend(backend_tag, backend_info)

    def get_backend_config(self, backend_tag: BackendTag) -> BackendConfig:
        """Get the current config for the specified backend."""
        if self.backend_state.get_backend(backend_tag) is None:
            raise ValueError(f"Backend {backend_tag} is not registered.")
        return self.backend_state.get_backend(backend_tag).backend_config

    def get_http_config(self):
        """Return the HTTP proxy configuration."""
        return self.http_state.get_config()

    async def shutdown(self) -> None:
        """Shuts down the serve instance completely."""
        async with self.write_lock:
            for proxy in self.http_state.get_http_proxy_handles().values():
                ray.kill(proxy, no_restart=True)
            for replica_dict in self.backend_state.get_running_replica_handles(
            ).values():
                for replica in replica_dict.values():
                    ray.kill(replica, no_restart=True)
            self.kv_store.delete(CHECKPOINT_KEY)

    async def deploy(self, name: str, backend_config: BackendConfig,
                     replica_config: ReplicaConfig,
                     version: Optional[str]) -> Optional[GoalId]:
        # By default the path prefix is the deployment name.
        if replica_config.path_prefix is None:
            replica_config.path_prefix = f"/{name}"
            # Backend config should be synchronized so the backend worker
            # is aware of it.
            backend_config.internal_metadata.path_prefix = f"/{name}"

        if replica_config.is_asgi_app:
            # When the backend is asgi application, we want to proxy it
            # with a prefixed path as well as proxy all HTTP methods.
            # {wildcard:path} is used so HTTPProxy's Starlette router can match
            # arbitrary path.
            http_route = f"{replica_config.path_prefix}" + "/{wildcard:path}"
            # https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods
            http_methods = [
                "GET", "HEAD", "POST", "PUT", "DELETE", "CONNECT", "OPTIONS",
                "TRACE", "PATCH"
            ]
        else:
            http_route = replica_config.path_prefix
            # Generic endpoint should support a limited subset of HTTP methods.
            http_methods = ["GET", "POST"]

        python_methods = []
        if inspect.isclass(replica_config.backend_def):
            for method_name, _ in inspect.getmembers(
                    replica_config.backend_def, inspect.isfunction):
                python_methods.append(method_name)

        async with self.write_lock:
            backend_info = BackendInfo(
                worker_class=create_backend_replica(
                    replica_config.backend_def),
                version=version,
                backend_config=backend_config,
                replica_config=replica_config)

            goal_id = self.backend_state.deploy_backend(name, backend_info)
            self.endpoint_state.create_endpoint(
                name,
                http_route,
                http_methods,
                TrafficPolicy({
                    name: 1.0
                }),
                python_methods=python_methods)
            return goal_id

    def delete_deployment(self, name: str) -> Optional[GoalId]:
        self.endpoint_state.delete_endpoint(name)
        return self.backend_state.delete_backend(name, force_kill=False)

    def get_deployment_info(self, name: str) -> Tuple[BackendInfo, str]:
        """Get the current information about a deployment.

        Args:
            name(str): the name of the deployment.

        Returns:
            (BackendInfo, route)

        Raises:
            KeyError if the deployment doesn't exist.
        """
        backend_info: BackendInfo = self.backend_state.get_backend(name)
        if backend_info is None:
            raise KeyError(f"Deployment {name} does not exist.")

        route = self.endpoint_state.get_endpoint_route(name)

        return backend_info, route
Exemplo n.º 14
0
class ServeController:
    """Responsible for managing the state of the serving system.

    The controller implements fault tolerance by persisting its state in
    a new checkpoint each time a state change is made. If the actor crashes,
    the latest checkpoint is loaded and the state is recovered. Checkpoints
    are written/read using a provided KV-store interface.

    All hard state in the system is maintained by this actor and persisted via
    these checkpoints. Soft state required by other components is fetched by
    those actors from this actor on startup and updates are pushed out from
    this actor.

    All other actors started by the controller are named, detached actors
    so they will not fate share with the controller if it crashes.

    The following guarantees are provided for state-changing calls to the
    controller:
        - If the call succeeds, the change was made and will be reflected in
          the system even if the controller or other actors die unexpectedly.
        - If the call fails, the change may have been made but isn't guaranteed
          to have been. The client should retry in this case. Note that this
          requires all implementations here to be idempotent.
    """

    async def __init__(self,
                       controller_name: str,
                       http_config: HTTPOptions,
                       detached: bool = False):
        # Used to read/write checkpoints.
        self.kv_store = RayInternalKVStore(namespace=controller_name)

        # Dictionary of backend_tag -> proxy_name -> most recent queue length.
        self.backend_stats = defaultdict(lambda: defaultdict(dict))

        # Used to ensure that only a single state-changing operation happens
        # at any given time.
        self.write_lock = asyncio.Lock()

        # Map of awaiting results
        # TODO(ilr): Checkpoint this once this becomes asynchronous
        self.inflight_results: Dict[UUID, asyncio.Event] = dict()
        self._serializable_inflight_results: Dict[UUID, FutureResult] = dict()

        # NOTE(simon): Currently we do all-to-all broadcast. This means
        # any listeners will receive notification for all changes. This
        # can be problem at scale, e.g. updating a single backend config
        # will send over the entire configs. In the future, we should
        # optimize the logic to support subscription by key.
        self.long_poll_host = LongPollHost()

        self.http_state = HTTPState(controller_name, detached, http_config)
        self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)

        checkpoint_bytes = self.kv_store.get(CHECKPOINT_KEY)
        if checkpoint_bytes is None:
            logger.debug("No checkpoint found")
            self.backend_state = BackendState(controller_name, detached)
        else:
            checkpoint: Checkpoint = pickle.loads(checkpoint_bytes)
            self.backend_state = BackendState(
                controller_name,
                detached,
                checkpoint=checkpoint.backend_state_checkpoint)

            self._serializable_inflight_results = checkpoint.inflight_reqs
            for uuid, fut_result in self._serializable_inflight_results.items(
            ):
                self._create_event_with_result(fut_result.requested_goal, uuid)

        self.notify_backend_configs_changed()
        self.notify_replica_handles_changed()

        asyncio.get_event_loop().create_task(self.run_control_loop())

    async def wait_for_event(self, uuid: UUID) -> bool:
        start = time.time()
        if uuid not in self.inflight_results:
            logger.debug(f"UUID ({uuid}) not found!!!")
            return True
        event = self.inflight_results[uuid]
        await event.wait()
        self.inflight_results.pop(uuid)
        self._serializable_inflight_results.pop(uuid)
        async with self.write_lock:
            self._checkpoint()
        logger.debug(f"Waiting for {uuid} took {time.time() - start} seconds")

        return True

    def _create_event_with_result(
            self,
            goal_state: Dict[str, any],
            recreation_uuid: Optional[UUID] = None) -> UUID:
        # NOTE(ilr) Must be called before checkpointing!
        event = asyncio.Event()
        event.result = FutureResult(goal_state)
        uuid_val = recreation_uuid or uuid4()
        self.inflight_results[uuid_val] = event
        self._serializable_inflight_results[uuid_val] = event.result
        return uuid_val

    async def _num_inflight_results(self) -> int:
        return len(self.inflight_results)

    def notify_replica_handles_changed(self):
        self.long_poll_host.notify_changed(
            LongPollKey.REPLICA_HANDLES, {
                backend_tag: list(replica_dict.values())
                for backend_tag, replica_dict in
                self.backend_state.backend_replicas.items()
            })

    def notify_backend_configs_changed(self):
        self.long_poll_host.notify_changed(
            LongPollKey.BACKEND_CONFIGS,
            self.backend_state.get_backend_configs())

    async def listen_for_change(self, keys_to_snapshot_ids: Dict[str, int]):
        """Proxy long pull client's listen request.

        Args:
            keys_to_snapshot_ids (Dict[str, int]): Snapshot IDs are used to
              determine whether or not the host should immediately return the
              data or wait for the value to be changed.
        """
        return await (
            self.long_poll_host.listen_for_change(keys_to_snapshot_ids))

    def get_http_proxies(self) -> Dict[NodeId, ActorHandle]:
        """Returns a dictionary of node ID to http_proxy actor handles."""
        return self.http_state.get_http_proxy_handles()

    def _checkpoint(self) -> None:
        """Checkpoint internal state and write it to the KV store."""
        assert self.write_lock.locked()
        logger.debug("Writing checkpoint")
        start = time.time()

        checkpoint = pickle.dumps(
            Checkpoint(self.backend_state.checkpoint(),
                       self._serializable_inflight_results))

        self.kv_store.put(CHECKPOINT_KEY, checkpoint)
        logger.debug("Wrote checkpoint in {:.3f}s".format(time.time() - start))

        if random.random(
        ) < _CRASH_AFTER_CHECKPOINT_PROBABILITY and self.detached:
            logger.warning("Intentionally crashing after checkpoint")
            os._exit(0)

    async def reconcile_current_and_goal_backends(self):
        pass

    def set_goal_id(self, goal_id: UUID) -> None:
        event = self.inflight_results.get(goal_id)
        logger.debug(f"Setting goal id {goal_id}")
        if event:
            event.set()

    async def run_control_loop(self) -> None:
        while True:
            async with self.write_lock:
                self.http_state.update()

                completed_ids = self.backend_state.completed_goals()
                for done_id in completed_ids:
                    self.set_goal_id(done_id)
                delta_workers = await self.backend_state.update()
                if delta_workers:
                    self.notify_replica_handles_changed()
                    self._checkpoint()

            await asyncio.sleep(CONTROL_LOOP_PERIOD_S)

    def _all_replica_handles(
            self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]:
        """Used for testing."""
        return self.backend_state.get_replica_handles()

    def get_all_backends(self) -> Dict[BackendTag, BackendConfig]:
        """Returns a dictionary of backend tag to backend config."""
        return self.backend_state.get_backend_configs()

    def get_all_endpoints(self) -> Dict[EndpointTag, Dict[BackendTag, Any]]:
        """Returns a dictionary of backend tag to backend config."""
        return self.endpoint_state.get_endpoints()

    def _set_traffic(self, endpoint_name: str,
                     traffic_dict: Dict[str, float]) -> UUID:
        for backend in traffic_dict:
            if self.backend_state.get_backend(backend) is None:
                raise ValueError(
                    "Attempted to assign traffic to a backend '{}' that "
                    "is not registered.".format(backend))

        self.endpoint_state.set_traffic_policy(endpoint_name,
                                               TrafficPolicy(traffic_dict))

    def _validate_traffic_dict(self, traffic_dict: Dict[str, float]):
        for backend in traffic_dict:
            if self.backend_state.get_backend(backend) is None:
                raise ValueError(
                    "Attempted to assign traffic to a backend '{}' that "
                    "is not registered.".format(backend))

    async def set_traffic(self, endpoint_name: str,
                          traffic_dict: Dict[str, float]) -> None:
        """Sets the traffic policy for the specified endpoint."""
        async with self.write_lock:
            self._validate_traffic_dict(traffic_dict)
            self._set_traffic(endpoint_name, traffic_dict)

    async def shadow_traffic(self, endpoint_name: str, backend_tag: BackendTag,
                             proportion: float) -> UUID:
        """Shadow traffic from the endpoint to the backend."""
        async with self.write_lock:
            if self.backend_state.get_backend(backend_tag) is None:
                raise ValueError(
                    "Attempted to shadow traffic to a backend '{}' that "
                    "is not registered.".format(backend_tag))

            logger.info(
                "Shadowing '{}' of traffic to endpoint '{}' to backend '{}'.".
                format(proportion, endpoint_name, backend_tag))

            self.endpoint_state.shadow_traffic(endpoint_name, backend_tag,
                                               proportion)

    # TODO(architkulkarni): add Optional for route after cloudpickle upgrade
    async def create_endpoint(self, endpoint: str,
                              traffic_dict: Dict[str, float], route,
                              methods: List[str]) -> UUID:
        """Create a new endpoint with the specified route and methods.

        If the route is None, this is a "headless" endpoint that will not
        be exposed over HTTP and can only be accessed via a handle.
        """
        async with self.write_lock:
            self._validate_traffic_dict(traffic_dict)

            logger.info(
                "Registering route '{}' to endpoint '{}' with methods '{}'.".
                format(route, endpoint, methods))

            self.endpoint_state.create_endpoint(endpoint, route, methods,
                                                TrafficPolicy(traffic_dict))

    async def delete_endpoint(self, endpoint: str) -> None:
        """Delete the specified endpoint.

        Does not modify any corresponding backends.
        """
        logger.info("Deleting endpoint '{}'".format(endpoint))
        async with self.write_lock:
            self.endpoint_state.delete_endpoint(endpoint)

    async def set_backend_goal(self, backend_tag: BackendTag,
                               backend_info: BackendInfo,
                               new_id: GoalId) -> None:
        # NOTE(ilr) Must checkpoint after doing this!
        existing_id_to_set = self.backend_state._set_backend_goal(
            backend_tag, backend_info, new_id)
        if existing_id_to_set:
            self.set_goal_id(existing_id_to_set)

    async def create_backend(self, backend_tag: BackendTag,
                             backend_config: BackendConfig,
                             replica_config: ReplicaConfig) -> UUID:
        """Register a new backend under the specified tag."""
        async with self.write_lock:
            # Ensures this method is idempotent.
            backend_info = self.backend_state.get_backend(backend_tag)
            if backend_info is not None:
                if (backend_info.backend_config == backend_config
                        and backend_info.replica_config == replica_config):
                    return

            backend_replica = create_backend_replica(
                replica_config.func_or_class)

            # Save creator that starts replicas, the arguments to be passed in,
            # and the configuration for the backends.
            backend_info = BackendInfo(
                worker_class=backend_replica,
                backend_config=backend_config,
                replica_config=replica_config)

            return_uuid = self._create_event_with_result({
                backend_tag: backend_info
            })

            await self.set_backend_goal(backend_tag, backend_info, return_uuid)

            try:
                # This call should be to run control loop
                self.backend_state.scale_backend_replicas(
                    backend_tag, backend_config.num_replicas)
            except RayServeException as e:
                del self.backend_state.backends[backend_tag]
                raise e

            # NOTE(edoakes): we must write a checkpoint before starting new
            # or pushing the updated config to avoid inconsistent state if we
            # crash while making the change.
            self._checkpoint()
            self.notify_backend_configs_changed()
            return return_uuid

    async def delete_backend(self,
                             backend_tag: BackendTag,
                             force_kill: bool = False) -> UUID:
        async with self.write_lock:
            # This method must be idempotent. We should validate that the
            # specified backend exists on the client.
            if self.backend_state.get_backend(backend_tag) is None:
                return

            # Check that the specified backend isn't used by any endpoints.
            for endpoint, info in self.endpoint_state.get_endpoints().items():
                if (backend_tag in info["traffic"]
                        or backend_tag in info["shadows"]):
                    raise ValueError("Backend '{}' is used by endpoint '{}' "
                                     "and cannot be deleted. Please remove "
                                     "the backend from all endpoints and try "
                                     "again.".format(backend_tag, endpoint))

            # Scale its replicas down to 0.
            self.backend_state.scale_backend_replicas(backend_tag, 0,
                                                      force_kill)

            # Remove the backend's metadata.
            del self.backend_state.backends[backend_tag]

            # Add the intention to remove the backend from the routers.
            self.backend_state.backends_to_remove.append(backend_tag)

            return_uuid = self._create_event_with_result({backend_tag: None})
            # Remove the backend's metadata.
            await self.set_backend_goal(backend_tag, None, return_uuid)
            # NOTE(edoakes): we must write a checkpoint before removing the
            # backend from the routers to avoid inconsistent state if we crash
            # after pushing the update.
            self._checkpoint()
            return return_uuid

    async def update_backend_config(self, backend_tag: BackendTag,
                                    config_options: BackendConfig) -> UUID:
        """Set the config for the specified backend."""
        async with self.write_lock:
            assert (self.backend_state.get_backend(backend_tag)
                    ), "Backend {} is not registered.".format(backend_tag)
            assert isinstance(config_options, BackendConfig)

            stored_backend_config = self.backend_state.get_backend(
                backend_tag).backend_config
            backend_config = stored_backend_config.copy(
                update=config_options.dict(exclude_unset=True))
            backend_config._validate_complete()
            self.backend_state.get_backend(
                backend_tag).backend_config = backend_config
            backend_info = self.backend_state.get_backend(backend_tag)

            return_uuid = self._create_event_with_result({
                backend_tag: backend_info
            })
            await self.set_backend_goal(backend_tag, backend_info, return_uuid)

            # Scale the replicas with the new configuration.

            # This should be to run the control loop
            self.backend_state.scale_backend_replicas(
                backend_tag, backend_config.num_replicas)

            # NOTE(edoakes): we must write a checkpoint before pushing the
            # update to avoid inconsistent state if we crash after pushing the
            # update.
            self._checkpoint()

            # Inform the routers and backend replicas about config changes.
            self.notify_backend_configs_changed()

            return return_uuid

    def get_backend_config(self, backend_tag: BackendTag) -> BackendConfig:
        """Get the current config for the specified backend."""
        assert (self.backend_state.get_backend(backend_tag)
                ), "Backend {} is not registered.".format(backend_tag)
        return self.backend_state.get_backend(backend_tag).backend_config

    def get_http_config(self):
        """Return the HTTP proxy configuration."""
        return self.http_state.get_config()

    async def shutdown(self) -> None:
        """Shuts down the serve instance completely."""
        async with self.write_lock:
            for proxy in self.http_state.get_http_proxy_handles().values():
                ray.kill(proxy, no_restart=True)
            for replica_dict in self.backend_state.get_replica_handles(
            ).values():
                for replica in replica_dict.values():
                    ray.kill(replica, no_restart=True)
            self.kv_store.delete(CHECKPOINT_KEY)