def _get_or_start_routers(self, host, port):
    """Get the HTTP proxy belonging to this serve instance.

    If the HTTP proxy does not already exist, it will be started.
    """
    # TODO(simon): We don't handle nodes being added/removed. To do that,
    # we should implement some sort of control loop in master actor.
    for _, node_id_group in groupby(sorted(ray.state.node_ids())):
        for index, node_id in enumerate(node_id_group):
            proxy_name = format_actor_name(SERVE_PROXY_NAME,
                                           self.instance_name)
            proxy_name += "-{}-{}".format(node_id, index)
            try:
                router = ray.get_actor(proxy_name)
            except ValueError:
                logger.info(
                    "Starting HTTP proxy with name '{}' on node '{}' "
                    "listening on port {}".format(proxy_name, node_id, port))
                router = HTTPProxyActor.options(
                    name=proxy_name,
                    max_concurrency=ASYNC_CONCURRENCY,
                    max_restarts=-1,
                    max_task_retries=-1,
                    resources={node_id: 0.01},
                ).remote(host, port, instance_name=self.instance_name)

            self.routers.append(router)
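# The try/except ValueError around ray.get_actor above is the idempotent
# "get-or-create named actor" pattern used throughout this section: look the
# actor up by name first, and only create it when the lookup fails. A minimal
# self-contained sketch of the same pattern (the actor class and name are
# illustrative, not Serve code):
import ray

@ray.remote
class Counter:
    def __init__(self):
        self.n = 0

    def incr(self):
        self.n += 1
        return self.n

def get_or_create_counter(name: str = "my_counter"):
    try:
        # ray.get_actor raises ValueError if no actor has this name.
        return ray.get_actor(name)
    except ValueError:
        return Counter.options(name=name).remote()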
async def _start_backend_worker(self, backend_tag: str,
                                replica_tag: str) -> ActorHandle:
    """Creates a backend worker and waits for it to start up.

    Assumes that the backend configuration has already been registered in
    self.backends.
    """
    logger.debug("Starting worker '{}' for backend '{}'.".format(
        replica_tag, backend_tag))
    backend_info = self.backends[backend_tag]

    replica_name = format_actor_name(replica_tag, self.instance_name)
    worker_handle = ray.remote(backend_info.worker_class).options(
        name=replica_name,
        lifetime="detached",
        max_restarts=-1,
        max_task_retries=-1,
        **backend_info.replica_config.ray_actor_options).remote(
            backend_tag, replica_tag,
            backend_info.replica_config.actor_init_args,
            backend_info.backend_config,
            instance_name=self.instance_name)

    # TODO(edoakes): we should probably have a timeout here.
    await worker_handle.ready.remote()
    return worker_handle
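# One way to satisfy the TODO above: bound the readiness wait with
# asyncio.wait_for. This is a minimal sketch, not the fix that actually
# shipped; the timeout value and the error raised are assumptions.
import asyncio

async def _wait_for_worker_ready(worker_handle, timeout_s: float = 30.0):
    # as_future() (used elsewhere in this section) converts the ObjectRef
    # into an asyncio future that wait_for can cancel on timeout.
    try:
        await asyncio.wait_for(worker_handle.ready.remote().as_future(),
                               timeout=timeout_s)
    except asyncio.TimeoutError:
        raise RuntimeError(
            "Worker did not become ready within {}s.".format(timeout_s))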
def get_proxy_names():
    proxy_names = []
    for node_id, _ in get_all_node_ids():
        proxy_names.append(
            format_actor_name(SERVE_PROXY_NAME, client._controller_name,
                              node_id))
    return proxy_names
async def _start_backend_replica(self, config_store: ConfigurationStore,
                                 backend_tag: BackendTag,
                                 replica_tag: ReplicaTag) -> ActorHandle:
    """Start a replica and return its actor handle.

    Checks if the named actor already exists before starting a new one.

    Assumes that the backend configuration has already been registered in
    the ConfigurationStore.
    """
    # NOTE(edoakes): the replicas may already be created if we
    # failed after creating them but before writing a
    # checkpoint.
    replica_name = format_actor_name(replica_tag, self.controller_name)
    try:
        replica_handle = ray.get_actor(replica_name)
    except ValueError:
        logger.debug("Starting replica '{}' for backend '{}'.".format(
            replica_tag, backend_tag))
        backend_info = config_store.get_backend(backend_tag)

        replica_handle = ray.remote(backend_info.worker_class).options(
            name=replica_name,
            lifetime="detached" if self.detached else None,
            max_restarts=-1,
            max_task_retries=-1,
            **backend_info.replica_config.ray_actor_options).remote(
                backend_tag, replica_tag,
                backend_info.replica_config.actor_init_args,
                backend_info.backend_config, self.controller_name)

    return replica_handle
def _start_routers_if_needed(self, http_host: str, http_port: int,
                             http_middlewares: List[Any]) -> None:
    """Start a router on every node if it doesn't already exist."""
    for node_id, node_resource in get_all_node_ids():
        if node_id in self.routers_cache:
            continue

        router_name = format_actor_name(SERVE_PROXY_NAME,
                                        self.controller_name, node_id)
        try:
            router = ray.get_actor(router_name)
        except ValueError:
            logger.info("Starting router with name '{}' on node '{}' "
                        "listening on '{}:{}'".format(
                            router_name, node_id, http_host, http_port))
            router = HTTPProxyActor.options(
                name=router_name,
                lifetime="detached" if self.detached else None,
                max_concurrency=ASYNC_CONCURRENCY,
                max_restarts=-1,
                max_task_retries=-1,
                resources={node_resource: 0.01},
            ).remote(
                http_host,
                http_port,
                controller_name=self.controller_name,
                http_middlewares=http_middlewares)

        self.routers_cache[node_id] = router
async def _stop_pending_backend_replicas(self) -> None:
    """Stops the pending backend replicas in self.backend_replicas_to_stop.

    Removes backend_replicas from the router, kills them, and clears
    self.backend_replicas_to_stop.
    """
    for backend_tag, replicas_list in self.backend_replicas_to_stop.items():
        for replica_tag in replicas_list:
            # NOTE(edoakes): the replicas may already be stopped if we
            # failed after stopping them but before writing a checkpoint.
            replica_name = format_actor_name(replica_tag,
                                             self.controller_name)
            try:
                replica = ray.get_actor(replica_name)
            except ValueError:
                continue

            # TODO(edoakes): this logic isn't ideal because there may be
            # pending tasks still executing on the replica. However, if we
            # use replica.__ray_terminate__, we may send it while the
            # replica is being restarted and there's no way to tell if it
            # successfully killed the worker or not.
            ray.kill(replica, no_restart=True)

    self.backend_replicas_to_stop.clear()
def _get_or_start_http_proxy(self, node_id, host, port):
    """Get the HTTP proxy belonging to this serve instance.

    If the HTTP proxy does not already exist, it will be started.
    """
    proxy_name = format_actor_name(SERVE_PROXY_NAME, self.instance_name)
    try:
        self.http_proxy = ray.get_actor(proxy_name)
    except ValueError:
        logger.info(
            "Starting HTTP proxy with name '{}' on node '{}'".format(
                proxy_name, node_id))
        self.http_proxy = HTTPProxyActor.options(
            name=proxy_name,
            max_concurrency=ASYNC_CONCURRENCY,
            max_restarts=-1,
            max_task_retries=-1,
            resources={node_id: 0.01},
        ).remote(host, port, instance_name=self.instance_name)

    # Since the router is merged with the HTTP proxy actor, the router is
    # proxied via the HTTP actor. Even though the two variable names point
    # to the same object, their semantic differences make the code more
    # readable (e.g. http_proxy.set_route_table, router.add_worker).
    self.router = self.http_proxy
async def _enqueue_pending_scale_changes_loop(self,
                                              current_state: SystemState):
    for backend_tag, replicas_to_create in (
            self.backend_replicas_to_start.items()):
        for replica_tag in replicas_to_create:
            replica_handle = await self._start_backend_replica(
                current_state, backend_tag, replica_tag)
            ready_future = replica_handle.ready.remote().as_future()
            self.currently_starting_replicas[ready_future] = (
                backend_tag, replica_tag, replica_handle)

    for backend_tag, replicas_to_stop in (
            self.backend_replicas_to_stop.items()):
        for replica_tag in replicas_to_stop:
            replica_name = format_actor_name(replica_tag,
                                             self.controller_name)

            async def kill_actor(replica_name_to_use):
                # NOTE: the replicas may already be stopped if we failed
                # after stopping them but before writing a checkpoint.
                try:
                    replica = ray.get_actor(replica_name_to_use)
                except ValueError:
                    return

                # TODO(edoakes): this logic isn't ideal because there may
                # be pending tasks still executing on the replica. However,
                # if we use replica.__ray_terminate__, we may send it while
                # the replica is being restarted and there's no way to tell
                # if it successfully killed the worker or not.
                ray.kill(replica, no_restart=True)

            self.currently_stopping_replicas[asyncio.ensure_future(
                kill_actor(replica_name))] = (backend_tag, replica_tag)
def _start_proxies_if_needed(self) -> None:
    """Start a proxy on every node if it doesn't already exist."""
    for node_id, node_resource in self._get_target_nodes():
        if node_id in self._proxy_actors:
            continue

        name = format_actor_name(SERVE_PROXY_NAME, self._controller_name,
                                 node_id)
        try:
            proxy = ray.get_actor(
                name, namespace=self._controller_namespace)
        except ValueError:
            logger.info("Starting HTTP proxy with name '{}' on node '{}' "
                        "listening on '{}:{}'".format(
                            name, node_id, self._config.host,
                            self._config.port))
            proxy = HTTPProxyActor.options(
                num_cpus=self._config.num_cpus,
                name=name,
                lifetime="detached" if self._detached else None,
                max_concurrency=ASYNC_CONCURRENCY,
                max_restarts=-1,
                max_task_retries=-1,
                resources={node_resource: 0.01},
            ).remote(
                self._config.host,
                self._config.port,
                controller_name=self._controller_name,
                controller_namespace=self._controller_namespace,
                http_middlewares=self._config.middlewares)

        self._proxy_actors[node_id] = proxy
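# The resources={node_resource: 0.01} argument above pins each proxy to a
# particular node: every Ray node exports a unique custom resource, and
# requesting a small fraction of it constrains scheduling to that node.
# A minimal sketch of the same pattern; the actor class is illustrative:
import ray

@ray.remote
class PinnedActor:
    def ping(self):
        return "pong"

def start_on_node(node_resource: str):
    # node_resource is the per-node resource name, e.g. as yielded by
    # get_all_node_ids() in this section. Requesting 0.01 of it forces
    # placement on that node while reserving almost none of its capacity.
    return PinnedActor.options(resources={node_resource: 0.01}).remote()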
def _stop_pending_replicas(self):
    for backend_tag, replicas_to_stop in (
            self.backend_replicas_to_stop.items()):
        for replica_tag, shutdown_timeout in replicas_to_stop:
            replica_name = format_actor_name(replica_tag,
                                             self._controller_name)

            # Both loop variables are passed as arguments so each task
            # captures its own name and timeout rather than reading the
            # loop's final values through the closure.
            async def kill_actor(replica_name_to_use, timeout_s):
                # NOTE: the replicas may already be stopped if we failed
                # after stopping them but before writing a checkpoint.
                try:
                    replica = ray.get_actor(replica_name_to_use)
                except ValueError:
                    return

                try:
                    await asyncio.wait_for(
                        replica.drain_pending_queries.remote(),
                        timeout=timeout_s)
                except asyncio.TimeoutError:
                    # Grace period passed, kill it forcefully.
                    logger.debug(
                        f"{replica_name_to_use} did not shutdown after "
                        f"{timeout_s}s, killing.")
                finally:
                    ray.kill(replica, no_restart=True)

            self.currently_stopping_replicas[asyncio.ensure_future(
                kill_actor(replica_name, shutdown_timeout))] = (
                    backend_tag, replica_tag)
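# Why kill_actor takes the timeout as a parameter instead of reading it from
# the enclosing scope: asyncio.ensure_future only schedules the coroutine, so
# a closure over the loop variable would see the loop's final value by the
# time the coroutine runs. A self-contained demonstration (the values are
# illustrative, not Serve code):
import asyncio

async def demo():
    tasks = []
    for timeout in (1, 2, 3):
        async def late_bound():
            return timeout          # resolved when the coroutine runs

        async def early_bound(t=timeout):
            return t                # resolved at definition time

        tasks.append((asyncio.ensure_future(late_bound()),
                      asyncio.ensure_future(early_bound())))

    results = [(await late, await early) for late, early in tasks]
    # Every late-bound coroutine sees the loop's final value.
    assert results == [(3, 1), (3, 2), (3, 3)]

asyncio.run(demo())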
def _start_routers_if_needed(self):
    """Start a router on every node if it doesn't already exist."""
    for node_id, node_resource in get_all_node_ids():
        if node_id in self.routers:
            continue

        router_name = format_actor_name(SERVE_PROXY_NAME,
                                        self.instance_name, node_id)
        try:
            router = ray.get_actor(router_name)
        except ValueError:
            logger.info("Starting router with name '{}' on node '{}' "
                        "listening on '{}:{}'".format(
                            router_name, node_id, self.http_host,
                            self.http_port))
            router = HTTPProxyActor.options(
                name=router_name,
                max_concurrency=ASYNC_CONCURRENCY,
                max_restarts=-1,
                max_task_retries=-1,
                resources={node_resource: 0.01},
            ).remote(
                node_id,
                self.http_host,
                self.http_port,
                instance_name=self.instance_name,
                _http_middlewares=self._http_middlewares)

        self.routers[node_id] = router
def init(name=None,
         http_host=DEFAULT_HTTP_HOST,
         http_port=DEFAULT_HTTP_PORT,
         metric_exporter=InMemoryExporter):
    """Initialize or connect to a serve cluster.

    If the serve cluster is already initialized, this function just
    returns.

    If `ray.init` has not been called in this process, it will be called
    with no arguments. To specify kwargs to `ray.init`, it should be
    called separately before calling `serve.init`.

    Args:
        name (str): A unique name for this serve instance. This allows
            multiple serve instances to run on the same ray cluster. Must
            be specified in all subsequent serve.init() calls.
        http_host (str): Host for the HTTP server. Defaults to "0.0.0.0".
        http_port (int): Port for the HTTP server. Defaults to 8000.
        metric_exporter (ExporterInterface): A class that aggregates
            metrics from all RayServe actors and optionally exports them
            to external services. RayServe has two options built in:
            InMemoryExporter and PrometheusExporter.
    """
    if name is not None and not isinstance(name, str):
        raise TypeError("name must be a string.")

    # Initialize ray if needed.
    if not ray.is_initialized():
        ray.init()

    # Try to get the serve master actor if it exists.
    global master_actor
    master_actor_name = format_actor_name(SERVE_MASTER_NAME, name)
    try:
        master_actor = ray.get_actor(master_actor_name)
        return
    except ValueError:
        pass

    # Register the serialization context once.
    ray.register_custom_serializer(Query, Query.ray_serialize,
                                   Query.ray_deserialize)
    ray.register_custom_serializer(RequestMetadata,
                                   RequestMetadata.ray_serialize,
                                   RequestMetadata.ray_deserialize)

    # TODO(edoakes): for now, always start the HTTP proxy on the node that
    # serve.init() was run on. We should consider making this configurable
    # in the future.
    http_node_id = ray.state.current_node_id()
    master_actor = ServeMaster.options(
        name=master_actor_name,
        max_restarts=-1,
        max_task_retries=-1,
    ).remote(name, http_node_id, http_host, http_port, metric_exporter)

    block_until_http_ready(
        "http://{}:{}/-/routes".format(http_host, http_port),
        timeout=HTTP_PROXY_TIMEOUT)
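# Minimal usage sketch for the init() API above; the instance name and port
# are illustrative:
import ray
from ray import serve

ray.init()                                  # optional: init() calls it if needed
serve.init(name="staging", http_port=8000)  # starts or connects to the instance
serve.init(name="staging")                  # later calls must reuse the same name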
def _recover_actor_handles(self) -> None:
    # Refresh the RouterCache.
    for node_id in self.routers_cache.keys():
        router_name = format_actor_name(SERVE_PROXY_NAME,
                                        self.controller_name, node_id)
        self.routers_cache[node_id] = ray.get_actor(router_name)

    # Fetch actor handles for all of the backend replicas in the system.
    # All of these backend_replicas are guaranteed to already exist because
    # they would not be written to a checkpoint in self.backend_replicas
    # until they were created.
    for backend_tag, replica_dict in self.backend_replicas.items():
        for replica_tag in replica_dict.keys():
            replica_name = format_actor_name(replica_tag,
                                             self.controller_name)
            self.backend_replicas[backend_tag][
                replica_tag] = ray.get_actor(replica_name)
def init(name=None,
         http_host=DEFAULT_HTTP_HOST,
         http_port=DEFAULT_HTTP_PORT,
         metric_exporter=InMemoryExporter,
         _http_middlewares=[]):
    """Initialize or connect to a serve cluster.

    If the serve cluster is already initialized, this function just
    returns.

    If `ray.init` has not been called in this process, it will be called
    with no arguments. To specify kwargs to `ray.init`, it should be
    called separately before calling `serve.init`.

    Args:
        name (str): A unique name for this serve instance. This allows
            multiple serve instances to run on the same ray cluster. Must
            be specified in all subsequent serve.init() calls.
        http_host (str): Host for the HTTP servers. Defaults to "0.0.0.0".
            Serve starts one HTTP server per node in the Ray cluster.
        http_port (int, List[int]): Port for the HTTP server. Defaults to
            8000.
        metric_exporter (ExporterInterface): A class that aggregates
            metrics from all RayServe actors and optionally exports them
            to external services. Ray Serve has two options built in:
            InMemoryExporter and PrometheusExporter.
    """
    if name is not None and not isinstance(name, str):
        raise TypeError("name must be a string.")

    # Initialize ray if needed.
    if not ray.is_initialized():
        ray.init()

    # Try to get the serve controller if it exists.
    global controller
    controller_name = format_actor_name(SERVE_CONTROLLER_NAME, name)
    try:
        controller = ray.get_actor(controller_name)
        return
    except ValueError:
        pass

    controller = ServeController.options(
        name=controller_name,
        max_restarts=-1,
        max_task_retries=-1,
    ).remote(name, http_host, http_port, metric_exporter,
             _http_middlewares)

    futures = []
    for node_id in ray.state.node_ids():
        future = block_until_http_ready.options(
            num_cpus=0, resources={node_id: 0.01}).remote(
                "http://{}:{}/-/routes".format(http_host, http_port),
                timeout=HTTP_PROXY_TIMEOUT)
        futures.append(future)
    ray.get(futures)
def get_replica_actors(self, backend_tag: BackendTag) -> List[ActorHandle]:
    return_list = []
    for replica_tag in self.replicas.get(backend_tag, []):
        try:
            replica_name = format_actor_name(replica_tag,
                                             self.controller_name)
            return_list.append(ray.get_actor(replica_name))
        except ValueError:
            pass
    return return_list
def check_dead():
    for actor_name in [
            constants.SERVE_CONTROLLER_NAME, constants.SERVE_PROXY_NAME
    ]:
        try:
            ray.get_actor(format_actor_name(actor_name, instance_name))
            return False
        except ValueError:
            pass
    return True
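# Predicates like check_dead() are polled until they return True; the tests
# below do this with Ray's wait_for_condition test utility. A minimal
# stand-in for use outside the Ray test suite could look like this (the
# helper name and defaults are illustrative, not Ray's implementation):
import time

def poll_until(predicate, timeout_s: float = 10.0, interval_s: float = 0.1):
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        if predicate():
            return True
        time.sleep(interval_s)
    raise TimeoutError("condition not met within {}s".format(timeout_s))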
def test_shutdown(ray_shutdown):
    ray.init(num_cpus=16)
    serve.start(http_options=dict(port=8003))

    @serve.deployment
    def f():
        pass

    f.deploy()

    serve_controller_name = serve.context._global_client._controller_name
    actor_names = [
        serve_controller_name,
        format_actor_name(
            SERVE_PROXY_NAME,
            serve.context._global_client._controller_name,
            get_all_node_ids()[0][0],
        ),
    ]

    def check_alive():
        alive = True
        for actor_name in actor_names:
            try:
                if actor_name == serve_controller_name:
                    ray.get_actor(
                        actor_name,
                        namespace=ray.get_runtime_context().namespace)
                else:
                    ray.get_actor(actor_name)
            except ValueError:
                alive = False
        return alive

    wait_for_condition(check_alive)

    serve.shutdown()
    with pytest.raises(RayServeException):
        serve.list_deployments()

    def check_dead():
        for actor_name in actor_names:
            try:
                if actor_name == serve_controller_name:
                    ray.get_actor(
                        actor_name,
                        namespace=ray.get_runtime_context().namespace)
                else:
                    ray.get_actor(actor_name)
                return False
            except ValueError:
                pass
        return True

    wait_for_condition(check_dead)
def __init__(self, controller_name: str, detached: bool,
             replica_tag: ReplicaTag, backend_tag: BackendTag):
    self._actor_name = format_actor_name(replica_tag, controller_name)
    self._controller_name = controller_name
    self._detached = detached
    self._replica_tag = replica_tag
    self._backend_tag = backend_tag

    self._actor_handle = None
    self._startup_obj_ref = None
    self._drain_obj_ref = None
    self._state = ReplicaState.SHOULD_START
def _recover_actor_handles(self) -> None:
    # Fetch actor handles for all of the backend replicas in the system.
    # All of these backend_replicas are guaranteed to already exist because
    # they would not be written to a checkpoint in self.backend_replicas
    # until they were created.
    for backend_tag, replica_dict in self.backend_replicas.items():
        for replica_tag in replica_dict.keys():
            replica_name = format_actor_name(replica_tag,
                                             self.controller_name)
            self.backend_replicas[backend_tag][
                replica_tag] = ray.get_actor(replica_name)
def check_dead():
    for actor_name in [
            client._controller_name,
            format_actor_name(SERVE_PROXY_NAME, client._controller_name)
    ]:
        try:
            ray.get_actor(actor_name)
            return False
        except ValueError:
            pass
    return True
def __init__(self, controller_name: str, detached: bool,
             kv_store: RayInternalKVStore, long_poll_host: LongPollHost,
             goal_manager: AsyncGoalManager):
    self._controller_name = controller_name
    self._detached = detached
    self._kv_store = kv_store
    self._long_poll_host = long_poll_host
    self._goal_manager = goal_manager

    # Non-checkpointed state.
    self.currently_starting_replicas: Dict[asyncio.Future, Tuple[
        BackendTag, ReplicaTag, ActorHandle]] = dict()
    self.currently_stopping_replicas: Dict[asyncio.Future, Tuple[
        BackendTag, ReplicaTag]] = dict()

    # Checkpointed state.
    self.backends: Dict[BackendTag, BackendInfo] = dict()
    self.backend_replicas: Dict[BackendTag, Dict[
        ReplicaTag, ActorHandle]] = defaultdict(dict)
    self.backend_goals: Dict[BackendTag, GoalId] = dict()
    self.backend_replicas_to_start: Dict[
        BackendTag, List[ReplicaTag]] = defaultdict(list)
    self.backend_replicas_to_stop: Dict[BackendTag, List[Tuple[
        ReplicaTag, Duration]]] = defaultdict(list)
    self.backends_to_remove: List[BackendTag] = list()

    checkpoint = self._kv_store.get(CHECKPOINT_KEY)
    if checkpoint is not None:
        (self.backends, self.backend_replicas, self.backend_goals,
         self.backend_replicas_to_start, self.backend_replicas_to_stop,
         self.backends_to_remove,
         pending_goal_ids) = pickle.loads(checkpoint)

        for goal_id in pending_goal_ids:
            self._goal_manager.create_goal(goal_id)

        # Fetch actor handles for all backend replicas in the system.
        # All of these backend_replicas are guaranteed to already exist
        # because they would not be written to a checkpoint in
        # self.backend_replicas until they were created.
        for backend_tag, replica_dict in self.backend_replicas.items():
            for replica_tag in replica_dict.keys():
                replica_name = format_actor_name(replica_tag,
                                                 self._controller_name)
                self.backend_replicas[backend_tag][
                    replica_tag] = ray.get_actor(replica_name)

        self._notify_backend_configs_changed()
        self._notify_replica_handles_changed()
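# The recovery pattern above: the checkpoint stores plain state (tags,
# configs, goal ids) rather than actor handles, and handles are re-fetched
# by name with ray.get_actor after a controller restart. A minimal sketch of
# the idea; "store" and the checkpoint key are illustrative stand-ins:
import pickle
import ray

CHECKPOINT_KEY = "checkpoint"  # hypothetical key

def save_checkpoint(store, replica_names):
    # Persist only the names; handles are re-fetched on recovery,
    # mirroring the controller code above.
    store.put(CHECKPOINT_KEY, pickle.dumps(list(replica_names)))

def recover_handles(store):
    names = pickle.loads(store.get(CHECKPOINT_KEY))
    # Named actors are only checkpointed after they exist, so the lookup
    # is expected to succeed here.
    return {name: ray.get_actor(name) for name in names}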
def __init__(self, controller_name: str, detached: bool,
             replica_tag: ReplicaTag, backend_tag: BackendTag,
             version: str):
    self._actor = ActorReplicaWrapper(
        format_actor_name(replica_tag, controller_name), detached,
        controller_name, replica_tag, backend_tag)
    self._controller_name = controller_name
    self._replica_tag = replica_tag
    self._backend_tag = backend_tag
    self._version = version
    self._start_time = None
    self._prev_slow_startup_warning_time = None
    self._state = ReplicaState.SHOULD_START
async def _start_backend_replicas(self, config_store: ConfigurationStore,
                                  backend_tag: BackendTag,
                                  replica_tag: ReplicaTag) -> None:
    # NOTE(edoakes): the replicas may already be created if we
    # failed after creating them but before writing a
    # checkpoint.
    replica_name = format_actor_name(replica_tag, self.controller_name)
    try:
        replica_handle = ray.get_actor(replica_name)
    except ValueError:
        replica_handle = await self._start_single_replica(
            config_store, backend_tag, replica_tag, replica_name)

    self.backend_replicas[backend_tag][replica_tag] = replica_handle
def _get_or_start_metric_exporter(self, metric_exporter_class):
    """Get the metric exporter belonging to this serve instance.

    If the metric exporter does not already exist, it will be started.
    """
    metric_sink_name = format_actor_name(SERVE_METRIC_SINK_NAME,
                                         self.instance_name)
    try:
        self.metric_exporter = ray.get_actor(metric_sink_name)
    except ValueError:
        logger.info("Starting metric exporter with name '{}'".format(
            metric_sink_name))
        self.metric_exporter = MetricExporterActor.options(
            name=metric_sink_name).remote(metric_exporter_class)
def __init__(self, controller_name: str, detached: bool,
             replica_tag: ReplicaTag, backend_tag: BackendTag):
    self._actor_name = format_actor_name(replica_tag, controller_name)
    self._placement_group_name = self._actor_name + "_placement_group"
    self._controller_name = controller_name
    self._detached = detached
    self._replica_tag = replica_tag
    self._backend_tag = backend_tag

    self._actor_handle = None
    self._placement_group = None
    self._start_time = None
    self._prev_slow_startup_warning_time = None
    self._startup_obj_ref = None
    self._drain_obj_ref = None
    self._state = ReplicaState.SHOULD_START
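# The wrappers above all start life in ReplicaState.SHOULD_START. The enum
# itself is not shown in this section; a sketch of what such a lifecycle
# enum could look like (the exact members beyond SHOULD_START are an
# assumption):
from enum import Enum, auto

class ReplicaState(Enum):
    SHOULD_START = auto()   # queued for creation
    STARTING = auto()       # actor created, waiting on ready()
    RUNNING = auto()        # serving traffic
    SHOULD_STOP = auto()    # queued for teardown
    STOPPING = auto()       # draining / being killed
    STOPPED = auto()        # actor confirmed dead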
def _get_or_start_router(self):
    """Get the router belonging to this serve cluster.

    If the router does not already exist, it will be started.
    """
    router_name = format_actor_name(SERVE_ROUTER_NAME, self.cluster_name)
    try:
        self.router = ray.get_actor(router_name)
    except ValueError:
        logger.info("Starting router with name '{}'".format(router_name))
        self.router = async_retryable(ray.remote(Router)).options(
            name=router_name,
            max_concurrency=ASYNC_CONCURRENCY,
            max_restarts=-1,
        ).remote(cluster_name=self.cluster_name)
def __init__(self, controller_name: str, detached: bool,
             checkpoint: bytes = None):
    self.controller_name = controller_name
    self.detached = detached

    # Non-checkpointed state.
    self.currently_starting_replicas: Dict[asyncio.Future, Tuple[
        BackendTag, ReplicaTag, ActorHandle]] = dict()
    self.currently_stopping_replicas: Dict[asyncio.Future, Tuple[
        BackendTag, ReplicaTag]] = dict()

    # Checkpointed state.
    self.backends: Dict[BackendTag, BackendInfo] = dict()
    self.backend_replicas: Dict[BackendTag, Dict[
        ReplicaTag, ActorHandle]] = defaultdict(dict)
    self.goals: Dict[BackendTag, GoalId] = dict()
    self.backend_replicas_to_start: Dict[
        BackendTag, List[ReplicaTag]] = defaultdict(list)
    self.backend_replicas_to_stop: Dict[BackendTag, List[Tuple[
        ReplicaTag, Duration]]] = defaultdict(list)
    self.backends_to_remove: List[BackendTag] = list()

    if checkpoint is not None:
        (self.backends, self.backend_replicas, self.goals,
         self.backend_replicas_to_start, self.backend_replicas_to_stop,
         self.backends_to_remove) = pickle.loads(checkpoint)

        # Fetch actor handles for all of the backend replicas in the
        # system. All of these backend_replicas are guaranteed to already
        # exist because they would not be written to a checkpoint in
        # self.backend_replicas until they were created.
        for backend_tag, replica_dict in self.backend_replicas.items():
            for replica_tag in replica_dict.keys():
                replica_name = format_actor_name(replica_tag,
                                                 self.controller_name)
                self.backend_replicas[backend_tag][
                    replica_tag] = ray.get_actor(replica_name)
def test_shutdown(ray_shutdown):
    ray.init(num_cpus=16)
    serve.start(http_port=8003)

    @serve.deployment
    def f():
        pass

    f.deploy()

    actor_names = [
        serve.api._global_client._controller_name,
        format_actor_name(SERVE_PROXY_NAME,
                          serve.api._global_client._controller_name,
                          get_all_node_ids()[0][0])
    ]

    def check_alive():
        alive = True
        for actor_name in actor_names:
            try:
                ray.get_actor(actor_name)
            except ValueError:
                alive = False
        return alive

    wait_for_condition(check_alive)

    serve.shutdown()
    with pytest.raises(RayServeException):
        serve.list_backends()

    def check_dead():
        for actor_name in actor_names:
            try:
                ray.get_actor(actor_name)
                return False
            except ValueError:
                pass
        return True

    wait_for_condition(check_dead)
async def _start_replica(self, backend_tag: str, replica_tag: str) -> None:
    # NOTE(edoakes): the replicas may already be created if we
    # failed after creating them but before writing a
    # checkpoint.
    replica_name = format_actor_name(replica_tag, self.controller_name)
    try:
        worker_handle = ray.get_actor(replica_name)
    except ValueError:
        worker_handle = await self._start_backend_worker(
            backend_tag, replica_tag, replica_name)

    self.replicas[backend_tag].append(replica_tag)
    self.workers[backend_tag][replica_tag] = worker_handle

    # Register the worker with the router.
    await asyncio.gather(*[
        router.add_new_worker.remote(backend_tag, replica_tag,
                                     worker_handle)
        for router in self.routers.values()
    ])
def _get_or_start_http_proxy(self, node_id, host, port):
    """Get the HTTP proxy belonging to this serve cluster.

    If the HTTP proxy does not already exist, it will be started.
    """
    proxy_name = format_actor_name(SERVE_PROXY_NAME, self.cluster_name)
    try:
        self.http_proxy = ray.get_actor(proxy_name)
    except ValueError:
        logger.info(
            "Starting HTTP proxy with name '{}' on node '{}'".format(
                proxy_name, node_id))
        self.http_proxy = async_retryable(HTTPProxyActor).options(
            name=proxy_name,
            max_concurrency=ASYNC_CONCURRENCY,
            max_restarts=-1,
            resources={node_id: 0.01},
        ).remote(host, port, cluster_name=self.cluster_name)