async def wait_for_goal(self, goal_id: GoalId) -> Optional[Exception]:
    """Wait for the given goal_id to complete.

    Completion is signaled by external code calling
    complete_goal(goal_id) with either a result or an exception.

    Args:
        goal_id (GoalId): Target goal_id to wait on.

    Returns:
        Optional[Exception]: None if the goal completed successfully (or
            was not found), otherwise the exception object set by the
            caller of complete_goal().
    """
    start = time.time()
    if goal_id not in self._pending_goals:
        logger.debug(f"Goal {goal_id} not found")
        return None

    async_goal = self._pending_goals[goal_id]
    await async_goal.wait()
    logger.debug(
        f"Waiting for goal {goal_id} took {time.time() - start} seconds")

    if async_goal.exception is not None:
        return async_goal.exception
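A minimal sketch of the create/complete/wait handshake implied above, using a stripped-down stand-in manager (all names here are illustrative assumptions, not the real class):

import asyncio
from typing import Dict, Optional
from uuid import UUID, uuid4


class _Goal:
    def __init__(self):
        self.event = asyncio.Event()
        self.exception: Optional[Exception] = None


class MiniGoalManager:
    """Stand-in for the goal manager above (assumed API, for illustration)."""

    def __init__(self):
        self._pending_goals: Dict[UUID, _Goal] = {}

    def create_goal(self) -> UUID:
        goal_id = uuid4()
        self._pending_goals[goal_id] = _Goal()
        return goal_id

    def complete_goal(self, goal_id: UUID,
                      exception: Optional[Exception] = None) -> None:
        # The completer may attach an exception; waiters receive it as a
        # return value rather than having it raised.
        goal = self._pending_goals.get(goal_id)
        if goal is not None:
            goal.exception = exception
            goal.event.set()

    async def wait_for_goal(self, goal_id: UUID) -> Optional[Exception]:
        goal = self._pending_goals.get(goal_id)
        if goal is None:
            return None
        await goal.event.wait()
        return goal.exception


async def _demo():
    manager = MiniGoalManager()
    goal_id = manager.create_goal()
    # Complete the goal from "external code" shortly after waiting starts.
    asyncio.get_event_loop().call_later(0.1, manager.complete_goal, goal_id)
    assert await manager.wait_for_goal(goal_id) is None


asyncio.get_event_loop().run_until_complete(_demo())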
def _set_backend_goal(
        self, backend_info: Optional[BackendInfo]
) -> Tuple[GoalId, Optional[GoalId]]:
    """Set the desired state for a given backend, identified by tag.

    Args:
        backend_info (Optional[BackendInfo]): Contains backend and
            replica config. If passed in as None, the target backend is
            marked as shutting down.

    Returns:
        The newly created goal ID and the goal ID it replaced (None if
        there was no existing goal).
    """
    existing_goal_id = self._curr_goal
    new_goal_id = self._goal_manager.create_goal()

    if backend_info is not None:
        self._target_info = backend_info
        self._target_replicas = backend_info.backend_config.num_replicas
        self._target_version = BackendVersion(
            backend_info.version,
            user_config=backend_info.backend_config.user_config)
    else:
        self._target_replicas = 0

    self._curr_goal = new_goal_id
    logger.debug(
        f"Set backend goal for {self._name} with version "
        f"{None if backend_info is None else backend_info.version}")
    return new_goal_id, existing_goal_id
def start(self, backend_info: BackendInfo):
    self._actor_resources = backend_info.replica_config.resource_dict
    try:
        self._placement_group = ray.util.get_placement_group(
            self._placement_group_name)
    except ValueError:
        logger.debug(
            "Creating placement group '{}' for backend '{}'".format(
                self._placement_group_name, self._backend_tag))
        self._placement_group = ray.util.placement_group(
            [self._actor_resources],
            lifetime="detached",
            name=self._placement_group_name)

    try:
        self._actor_handle = ray.get_actor(self._actor_name)
    except ValueError:
        logger.debug("Starting replica '{}' for backend '{}'.".format(
            self._replica_tag, self._backend_tag))
        self._actor_handle = ray.remote(backend_info.worker_class).options(
            name=self._actor_name,
            lifetime="detached" if self._detached else None,
            placement_group=self._placement_group,
            placement_group_capture_child_tasks=False,
            **backend_info.replica_config.ray_actor_options).remote(
                self._backend_tag, self._replica_tag,
                backend_info.replica_config.init_args,
                backend_info.backend_config, self._controller_name)

    self._startup_obj_ref = self._actor_handle.ready.remote()
def check_stopped(self) -> bool:
    """Check if the replica has stopped. If so, transition to STOPPED.

    Should handle the case where the replica has already stopped.
    """
    if self._state == ReplicaState.STOPPED:
        return True
    assert self._state == ReplicaState.STOPPING, (
        f"State must be {ReplicaState.STOPPING}, *not* {self._state}")

    stopped = self._actor.check_stopped()
    if stopped:
        self._state = ReplicaState.STOPPED
        # Clean up any associated resources (e.g., placement group).
        self._actor.cleanup()
        return True

    timeout_passed = time.time() > self._shutdown_deadline
    if timeout_passed:
        # Graceful period passed, kill it forcefully.
        # This will be called repeatedly until the replica shuts down.
        logger.debug(
            f"Replica {self._replica_tag} did not shut down after "
            f"{self._graceful_shutdown_timeout_s}s, force-killing.")
        self._actor.force_stop()
    return False
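check_stopped() is built to be polled: once the deadline passes, force_stop() fires on every call until the actor actually exits, so the caller can stay simple. A hypothetical driver loop (names assumed for illustration):

import time


def wait_until_stopped(replica, poll_interval_s=0.5):
    # Hypothetical caller: check_stopped() owns the graceful-shutdown
    # deadline and force-kill logic, so the driver only needs to poll.
    while not replica.check_stopped():
        time.sleep(poll_interval_s)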
async def enqueue_request(self, request_meta, *request_args,
                          **request_kwargs):
    service = request_meta.service
    logger.debug("Received a request for service {}".format(service))

    # Check if the specified SLO is directly the wall clock time.
    if request_meta.absolute_slo_ms is not None:
        request_slo_ms = request_meta.absolute_slo_ms
    else:
        request_slo_ms = request_meta.adjust_relative_slo_ms()
    request_context = request_meta.request_context
    query = Query(
        request_args,
        request_kwargs,
        request_context,
        request_slo_ms,
        call_method=request_meta.call_method,
        async_future=asyncio.get_event_loop().create_future())
    await self.service_queues[service].put(query)
    await self.flush()

    # Note: a future change can be to directly return the ObjectID from
    # replica task submission.
    result = await query.async_future
    return result
async def enqueue_request(self, request_meta, *request_args,
                          **request_kwargs):
    endpoint = request_meta.endpoint
    logger.debug("Received a request for endpoint {}".format(endpoint))
    self.num_router_requests.labels(endpoint=endpoint).add()

    # Check if the specified SLO is directly the wall clock time.
    if request_meta.absolute_slo_ms is not None:
        request_slo_ms = request_meta.absolute_slo_ms
    else:
        request_slo_ms = request_meta.adjust_relative_slo_ms()
    request_context = request_meta.request_context
    query = Query(
        request_args,
        request_kwargs,
        request_context,
        request_slo_ms,
        call_method=request_meta.call_method,
        shard_key=request_meta.shard_key,
        async_future=asyncio.get_event_loop().create_future())
    async with self.flush_lock:
        self.endpoint_queues[endpoint].appendleft(query)
        self.flush_endpoint_queue(endpoint)

    # Note: a future change can be to directly return the ObjectID from
    # replica task submission.
    try:
        result = await query.async_future
    except RayTaskError as e:
        self.num_error_endpoint_request.labels(endpoint=endpoint).add()
        result = e
    return result
def _process_update(self, updates: Dict[str, UpdatedObject]):
    if isinstance(updates, (ray.exceptions.RayActorError)):
        # This can happen during shutdown where the controller is
        # intentionally killed, the client should just gracefully
        # exit.
        logger.debug("LongPollClient failed to connect to host. "
                     "Shutting down.")
        return

    if isinstance(updates, (ray.exceptions.RayTaskError)):
        # Some error happened in the controller. It could be a bug or
        # some undesired state.
        logger.error("LongPollHost errored\n" + updates.traceback_str)
        self._poll_next()
        return

    logger.debug(f"LongPollClient {self} received updates for keys: "
                 f"{list(updates.keys())}.")
    for key, update in updates.items():
        self.object_snapshots[key] = update.object_snapshot
        self.snapshot_ids[key] = update.snapshot_id
        callback = self.key_listeners[key]

        # Bind the parameters because closures are late-binding.
        # https://docs.python-guide.org/writing/gotchas/#late-binding-closures # noqa: E501
        def chained(callback=callback, arg=update.object_snapshot):
            callback(arg)
            self._on_callback_completed(trigger_at=len(updates))

        if self.event_loop is None:
            chained()
        else:
            self.event_loop.call_soon_threadsafe(chained)
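The late-binding gotcha that the comment links to is easy to reproduce in isolation; a standalone sketch:

# Closures capture variables, not values: without the default-argument
# trick, every callback sees the loop variable's final value.
late_bound = [lambda: i for i in range(3)]
print([f() for f in late_bound])  # [2, 2, 2]

# Binding via a default argument snapshots the value per iteration,
# which is exactly what `chained(callback=callback, ...)` does above.
early_bound = [lambda i=i: i for i in range(3)]
print([f() for f in early_bound])  # [0, 1, 2]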
def set_max_concurrent_queries(self, backend_config: BackendConfig):
    new_value: int = backend_config.max_concurrent_queries
    if new_value != self.max_concurrent_queries:
        self.max_concurrent_queries = new_value
        logger.debug(
            f"ReplicaSet: changing max_concurrent_queries to {new_value}")
        self.config_updated_event.set()
def check_stopped(self):
    if self._state == ReplicaState.STOPPED:
        return True
    assert self._state == ReplicaState.STOPPING, (
        f"State must be {ReplicaState.STOPPING}, *not* {self._state}")

    try:
        replica = ray.get_actor(self._actor_name)
    except ValueError:
        self._state = ReplicaState.STOPPED
        return True

    ready, _ = ray.wait([self._drain_obj_ref], timeout=0)
    timeout_passed = time.time() > self._shutdown_deadline
    if len(ready) == 1 or timeout_passed:
        if timeout_passed:
            # Graceful period passed, kill it forcefully.
            logger.debug(
                f"{self._actor_name} did not shut down after "
                f"{self._graceful_shutdown_timeout_s}s, force-killing.")
            ray.kill(replica, no_restart=True)
        self._state = ReplicaState.STOPPED
        return True
    return False
async def assign_replica(self, query: Query) -> ray.ObjectRef:
    """Given a query, submit it to a replica and return the object ref.

    This method will keep track of the in-flight queries for each
    replica and only send a query to available replicas (determined by
    the backend max_concurrent_queries value).
    """
    endpoint = query.metadata.endpoint
    self.num_queued_queries += 1
    self.num_queued_queries_gauge.set(
        self.num_queued_queries, tags={"endpoint": endpoint})
    assigned_ref = self._try_assign_replica(query)
    while assigned_ref is None:  # Can't assign a replica right now.
        logger.debug("Failed to assign a replica for "
                     f"query {query.metadata.request_id}")
        # Maybe there exists a free replica, we just need to refresh our
        # query tracker.
        num_finished = self._drain_completed_object_refs()
        # All replicas are really busy, wait for a query to complete or
        # the config to be updated.
        if num_finished == 0:
            logger.debug(
                "All replicas are busy, waiting for a free replica.")
            await asyncio.wait(
                self._all_query_refs + [self.config_updated_event.wait()],
                return_when=asyncio.FIRST_COMPLETED)
            if self.config_updated_event.is_set():
                self.config_updated_event.clear()
        # A free replica should be available now; retry the assignment
        # on the next loop iteration.
        assigned_ref = self._try_assign_replica(query)
    self.num_queued_queries -= 1
    self.num_queued_queries_gauge.set(
        self.num_queued_queries, tags={"endpoint": endpoint})
    return assigned_ref
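The blocking step in the loop waits on two different wake-up sources at once: any in-flight query finishing, or the config changing. A stripped-down sketch of that pattern (stand-in names, not the real router):

import asyncio


async def wait_for_slot_or_config(query_refs, config_updated_event):
    # Wake on whichever comes first: an in-flight query completes or a
    # config update arrives (mirrors the asyncio.wait call above).
    config_wait = asyncio.ensure_future(config_updated_event.wait())
    await asyncio.wait(
        list(query_refs) + [config_wait],
        return_when=asyncio.FIRST_COMPLETED)
    if config_updated_event.is_set():
        config_updated_event.clear()
    config_wait.cancel()  # don't leak the waiter if a query woke us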
def start(self, backend_info: BackendInfo):
    self._actor_resources = backend_info.replica_config.resource_dict
    # Feature-flagged because placement groups don't handle newly added
    # nodes: https://github.com/ray-project/ray/issues/15801
    if USE_PLACEMENT_GROUP:
        try:
            self._placement_group = ray.util.get_placement_group(
                self._placement_group_name)
        except ValueError:
            logger.debug(
                "Creating placement group '{}' for backend '{}'".format(
                    self._placement_group_name, self._backend_tag))
            self._placement_group = ray.util.placement_group(
                [self._actor_resources],
                lifetime="detached" if self._detached else None,
                name=self._placement_group_name)

    try:
        self._actor_handle = ray.get_actor(self._actor_name)
    except ValueError:
        logger.debug("Starting replica '{}' for backend '{}'.".format(
            self._replica_tag, self._backend_tag))
        self._actor_handle = backend_info.actor_def.options(
            name=self._actor_name,
            lifetime="detached" if self._detached else None,
            placement_group=self._placement_group,
            placement_group_capture_child_tasks=False,
            **backend_info.replica_config.ray_actor_options).remote(
                self._backend_tag, self._replica_tag,
                backend_info.replica_config.init_args,
                backend_info.backend_config, self._controller_name)

    self._startup_obj_ref = self._actor_handle.ready.remote()
async def _start_backend_replica(self, current_state: SystemState,
                                 backend_tag: BackendTag,
                                 replica_tag: ReplicaTag) -> ActorHandle:
    """Start a replica and return its actor handle.

    Checks if the named actor already exists before starting a new one.
    Assumes that the backend configuration is already in the Goal State.
    """
    # NOTE(edoakes): the replicas may already be created if we
    # failed after creating them but before writing a
    # checkpoint.
    replica_name = format_actor_name(replica_tag, self.controller_name)
    try:
        replica_handle = ray.get_actor(replica_name)
    except ValueError:
        logger.debug("Starting replica '{}' for backend '{}'.".format(
            replica_tag, backend_tag))
        backend_info = current_state.get_backend(backend_tag)

        replica_handle = ray.remote(backend_info.worker_class).options(
            name=replica_name,
            lifetime="detached" if self.detached else None,
            max_restarts=-1,
            max_task_retries=-1,
            **backend_info.replica_config.ray_actor_options).remote(
                backend_tag, replica_tag,
                backend_info.replica_config.actor_init_args,
                backend_info.backend_config, self.controller_name)

    return replica_handle
def update_routes(self,
                  endpoints: Dict[EndpointTag, EndpointInfo]) -> None:
    logger.debug(f"Got updated endpoints: {endpoints}.")

    existing_handles = set(self.handles.keys())
    routes = []
    route_info = {}
    for endpoint, info in endpoints.items():
        # Default case where the user did not specify a route prefix.
        if info.route is None:
            route = f"/{endpoint}"
        else:
            route = info.route

        routes.append(route)
        route_info[route] = endpoint
        if endpoint in self.handles:
            existing_handles.remove(endpoint)
        else:
            self.handles[endpoint] = self._get_handle(endpoint)

    # Clean up any handles that are no longer used.
    for endpoint in existing_handles:
        del self.handles[endpoint]

    # Routes are sorted in order of decreasing length to enable longest
    # prefix matching.
    self.sorted_routes = sorted(routes, key=lambda x: len(x), reverse=True)
    self.route_info = route_info
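Why sorting by decreasing length gives longest-prefix matching: the first hit in a linear scan is automatically the most specific route. A standalone illustration with made-up routes:

routes = ["/", "/api", "/api/v1"]
sorted_routes = sorted(routes, key=lambda x: len(x), reverse=True)


def match_route(path):
    # The first (i.e., longest) matching prefix wins.
    for route in sorted_routes:
        if path.startswith(route):
            return route


assert match_route("/api/v1/users") == "/api/v1"
assert match_route("/api/other") == "/api"
assert match_route("/healthz") == "/"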
async def _flush_service_queues(self):
    # Perform traffic splitting for requests.
    for service, queue in self.service_queues.items():
        # While there are incoming requests and there are backends.
        while queue.qsize() and len(self.traffic[service]):
            backend_names = list(self.traffic[service].keys())
            backend_weights = list(self.traffic[service].values())
            if len(self.traffic[service]) >= 2:
                # Randomly pick 2 backends.
                backend1, backend2 = np.random.choice(
                    backend_names, 2, replace=False, p=backend_weights)

                # Compare the lengths of the two backends' buffer queues
                # and pick the one with fewer queries buffered.
                if (len(self.buffer_queues[backend1]) <= len(
                        self.buffer_queues[backend2])):
                    chosen_backend = backend1
                else:
                    chosen_backend = backend2
                logger.debug("[Power of two choices] found two backends "
                             "{} and {}: choosing {}.".format(
                                 backend1, backend2, chosen_backend))
            else:
                chosen_backend = np.random.choice(
                    backend_names, replace=False,
                    p=backend_weights).squeeze()
            request = await queue.get()
            self.buffer_queues[chosen_backend].add(request)
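The selection logic above is the classic "power of two choices" policy; a self-contained sketch with hypothetical queue lengths:

import numpy as np


def pick_backend(backend_names, backend_weights, queue_lengths):
    """Sample two weighted-random backends, keep the less-loaded one."""
    if len(backend_names) >= 2:
        backend1, backend2 = np.random.choice(
            backend_names, 2, replace=False, p=backend_weights)
        if queue_lengths[backend1] <= queue_lengths[backend2]:
            return backend1
        return backend2
    return np.random.choice(backend_names, p=backend_weights)


# Hypothetical state: "b" is lightly loaded, so it wins whenever drawn.
queue_lengths = {"a": 5, "b": 1, "c": 9}
print(pick_backend(["a", "b", "c"], [0.5, 0.25, 0.25], queue_lengths))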
async def set_traffic(self, service, traffic_dict):
    logger.debug("Setting traffic for service %s to %s", service,
                 traffic_dict)
    self.traffic[service] = traffic_dict
    backend_names = list(self.traffic[service].keys())
    self.round_robin_iterator_map[service] = itertools.cycle(backend_names)
    await self.flush()
async def _start_single_replica(self, config_store: ConfigurationStore,
                                backend_tag: BackendTag,
                                replica_tag: ReplicaTag,
                                replica_name: str) -> ActorHandle:
    """Creates a backend replica and waits for it to start up.

    Assumes that the backend configuration has already been registered
    in the ConfigurationStore.
    """
    logger.debug("Starting replica '{}' for backend '{}'.".format(
        replica_tag, backend_tag))
    backend_info = config_store.get_backend(backend_tag)

    replica_handle = ray.remote(backend_info.worker_class).options(
        name=replica_name,
        lifetime="detached" if self.detached else None,
        max_restarts=-1,
        max_task_retries=-1,
        **backend_info.replica_config.ray_actor_options).remote(
            backend_tag, replica_tag,
            backend_info.replica_config.actor_init_args,
            backend_info.backend_config, self.controller_name)

    # TODO(edoakes): we should probably have a timeout here.
    await replica_handle.ready.remote()

    return replica_handle
async def enqueue_request(self, request_meta, *request_args,
                          **request_kwargs):
    endpoint = request_meta.endpoint
    logger.debug("Received request {} for endpoint {}.".format(
        request_meta.request_id, endpoint))
    request_start = time.time()
    self.num_router_requests.record(1, tags={"endpoint": endpoint})

    request_context = request_meta.request_context
    query = Query(
        request_args,
        request_kwargs,
        request_context,
        metadata=request_meta,
        async_future=asyncio.get_event_loop().create_future())
    async with self.flush_lock:
        self.endpoint_queues[endpoint].appendleft(query)
        self.flush_endpoint_queue(endpoint)

    try:
        result = await query.async_future
    except RayTaskError as e:
        self.num_error_endpoint_requests.record(
            1, tags={"endpoint": endpoint})
        result = e

    request_time_ms = (time.time() - request_start) * 1000
    logger.debug("Finished request {} in {:.2f}ms".format(
        request_meta.request_id, request_time_ms))
    return result
async def _start_backend_worker(self, backend_tag, replica_tag):
    """Creates a backend worker and waits for it to start up.

    Assumes that the backend configuration has already been registered
    in self.backends.
    """
    logger.debug("Starting worker '{}' for backend '{}'.".format(
        replica_tag, backend_tag))
    (backend_worker, backend_config,
     replica_config) = self.backends[backend_tag]

    replica_name = format_actor_name(replica_tag, self.instance_name)
    worker_handle = ray.remote(backend_worker).options(
        name=replica_name,
        max_restarts=-1,
        max_task_retries=-1,
        **replica_config.ray_actor_options).remote(
            backend_tag,
            replica_tag,
            replica_config.actor_init_args,
            backend_config,
            instance_name=self.instance_name)

    # TODO(edoakes): we should probably have a timeout here.
    await worker_handle.ready.remote()
    return worker_handle
async def add_new_worker(self, backend_tag, replica_tag, worker_handle):
    backend_replica_tag = backend_tag + ":" + replica_tag
    if backend_replica_tag in self.replicas:
        return
    self.replicas[backend_replica_tag] = worker_handle

    logger.debug("New worker added for backend '{}'".format(backend_tag))
    await self.mark_worker_idle(backend_tag, backend_replica_tag)
async def remove_endpoint(self, endpoint):
    logger.debug("Removing endpoint {}".format(endpoint))
    async with self.flush_lock:
        self.flush_endpoint_queue(endpoint)
        if endpoint in self.endpoint_queues:
            del self.endpoint_queues[endpoint]
        if endpoint in self.traffic:
            del self.traffic[endpoint]
async def set_traffic(self, endpoint, traffic_dict):
    logger.debug("Setting traffic for endpoint %s to %s", endpoint,
                 traffic_dict)
    self.traffic[endpoint] = traffic_dict
    backend_names = list(self.traffic[endpoint].keys())
    self.round_robin_iterator_map[endpoint] = itertools.cycle(
        backend_names)
    await self.flush()
def notify_changed(self, object_key: str, updated_object: Any):
    self.snapshot_ids[object_key] += 1
    self.object_snapshots[object_key] = updated_object
    logger.debug(f"LongPollerHost: {object_key} = {updated_object}")

    if object_key in self.notifier_events:
        for event in self.notifier_events.pop(object_key):
            event.set()
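notify_changed() is the write half of a long-poll host; the read half blocks until the snapshot ID advances. A stripped-down sketch of both halves together (an assumed shape for illustration, not the real class):

import asyncio
from collections import defaultdict


class MiniLongPollHost:
    """Illustrative stand-in pairing notify_changed() with a waiter."""

    def __init__(self):
        self.snapshot_ids = defaultdict(int)
        self.object_snapshots = {}
        self.notifier_events = defaultdict(set)

    def notify_changed(self, object_key, updated_object):
        self.snapshot_ids[object_key] += 1
        self.object_snapshots[object_key] = updated_object
        for event in self.notifier_events.pop(object_key, set()):
            event.set()

    async def wait_for_change(self, object_key, known_snapshot_id):
        # Return immediately if the caller is already stale; otherwise
        # park on an event that notify_changed() will set.
        if self.snapshot_ids[object_key] != known_snapshot_id:
            return self.object_snapshots[object_key]
        event = asyncio.Event()
        self.notifier_events[object_key].add(event)
        await event.wait()
        return self.object_snapshots[object_key]


async def _demo():
    host = MiniLongPollHost()
    asyncio.get_event_loop().call_later(0.1, host.notify_changed,
                                        "routes", {"/": "my_endpoint"})
    print(await host.wait_for_change("routes", known_snapshot_id=0))


asyncio.get_event_loop().run_until_complete(_demo())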
def __del__(self):
    if not self._detached:
        logger.debug("Shutting down Ray Serve because client went out of "
                     "scope. To prevent this, either keep a reference to "
                     "the client or use serve.start(detached=True).")
        self.shutdown()
async def set_traffic(self, service, traffic_dict):
    logger.debug("Setting traffic for service %s to %s", service,
                 traffic_dict)
    self.traffic[service] = traffic_dict
    backend_names = list(self.traffic[service].keys())
    self.fixed_packing_iterator_map[service] = itertools.cycle(
        itertools.chain.from_iterable(
            itertools.repeat(x, self.packing_num) for x in backend_names))
    await self.flush()
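The itertools pipeline above yields each backend packing_num times before moving on to the next; a quick standalone check with packing_num = 2 and made-up backend names:

import itertools

backend_names = ["a", "b"]
packing_num = 2
fixed_packing_iterator = itertools.cycle(
    itertools.chain.from_iterable(
        itertools.repeat(x, packing_num) for x in backend_names))
print([next(fixed_packing_iterator) for _ in range(6)])
# ['a', 'a', 'b', 'b', 'a', 'a']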
async def remove_service(self, service):
    logger.debug("Removing service {}".format(service))
    async with self.flush_lock:
        await self._flush_service_queues()
        await self._flush_buffer_queues()
        if service in self.service_queues:
            del self.service_queues[service]
        if service in self.traffic:
            del self.traffic[service]
async def wait_for_goal(self, goal_id: GoalId) -> None:
    start = time.time()
    if goal_id not in self.pending_goals:
        logger.debug(f"Goal {goal_id} not found")
        return

    event = self.pending_goals[goal_id]
    await event.wait()
    logger.debug(
        f"Waiting for goal {goal_id} took {time.time() - start} seconds")
async def __init__(self, kv_store_connector, router_class, router_kwargs,
                   start_http_proxy, http_proxy_host, http_proxy_port,
                   metric_gc_window_s):
    # Used to read/write checkpoints.
    # TODO(edoakes): namespace the master actor and its checkpoints.
    self.kv_store_client = kv_store_connector("serve_checkpoints")
    # path -> (endpoint, methods).
    self.routes = {}
    # backend -> (worker_creator, init_args, backend_config).
    self.backends = {}
    # backend -> replica_tags.
    self.replicas = defaultdict(list)
    # Replicas that should be started if recovering from a checkpoint.
    self.replicas_to_start = defaultdict(list)
    # Replicas that should be stopped if recovering from a checkpoint.
    self.replicas_to_stop = defaultdict(list)
    # endpoint -> traffic_dict
    self.traffic_policies = dict()
    # Dictionary of backend tag to dictionaries of replica tag to worker.
    # TODO(edoakes): consider removing this and just using the names.
    self.workers = defaultdict(dict)

    # Used to ensure that only a single state-changing operation happens
    # at any given time.
    self.write_lock = asyncio.Lock()

    # Cached handles to actors in the system.
    self.router = None
    self.http_proxy = None
    self.metric_monitor = None

    # If starting the actor for the first time, starts up the other system
    # components. If recovering, fetches their actor handles.
    self._get_or_start_router(router_class, router_kwargs)
    if start_http_proxy:
        self._get_or_start_http_proxy(http_proxy_host, http_proxy_port)
    self._get_or_start_metric_monitor(metric_gc_window_s)

    # NOTE(edoakes): unfortunately, we can't completely recover from a
    # checkpoint in the constructor because we block while waiting for
    # other actors to start up, and those actors fetch soft state from
    # this actor. Because no other tasks will start executing until after
    # the constructor finishes, if we were to run this logic in the
    # constructor it could lead to deadlock between this actor and a
    # child. However, we do need to guarantee that we have fully recovered
    # from a checkpoint before any other state-changing calls run. We
    # address this by acquiring the write_lock and then posting the task
    # to recover from a checkpoint to the event loop. Other state-changing
    # calls acquire this lock and will be blocked until recovering from
    # the checkpoint finishes.
    checkpoint = self.kv_store_client.get("checkpoint")
    if checkpoint is None:
        logger.debug("No checkpoint found")
    else:
        await self.write_lock.acquire()
        asyncio.get_event_loop().create_task(
            self._recover_from_checkpoint(checkpoint))
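The NOTE's lock-then-schedule trick generalizes; a minimal sketch of deferring recovery out of the constructor while still blocking state-changing calls until it completes (all names here are stand-ins, not the actual controller):

import asyncio


class RecoverableService:
    """Sketch: recovery runs as a background task, but it holds the
    write lock so every state-changing call waits for it to finish."""

    def __init__(self):
        self.write_lock = asyncio.Lock()
        self.state = {}

    async def start(self, checkpoint):
        if checkpoint is not None:
            # Acquire *before* scheduling recovery so no state-changing
            # call can run until _recover() releases the lock.
            await self.write_lock.acquire()
            asyncio.get_event_loop().create_task(self._recover(checkpoint))

    async def _recover(self, checkpoint):
        try:
            self.state.update(checkpoint)  # restore soft state
        finally:
            self.write_lock.release()

    async def update(self, key, value):
        async with self.write_lock:  # blocked until recovery finishes
            self.state[key] = value


async def _demo():
    svc = RecoverableService()
    await svc.start(checkpoint={"restored": True})
    await svc.update("new_key", 1)  # waits for _recover() to release
    print(svc.state)  # {'restored': True, 'new_key': 1}


asyncio.get_event_loop().run_until_complete(_demo())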
async def remove_backend(self, backend):
    logger.debug("Removing backend {}".format(backend))
    async with self.flush_lock:
        self.flush_backend_queues([backend])
        if backend in self.backend_info:
            del self.backend_info[backend]
        if backend in self.worker_queues:
            del self.worker_queues[backend]
        if backend in self.backend_queues:
            del self.backend_queues[backend]
async def _do_query(self, backend, backend_replica_tag, req):
    # If the worker died, this will be a RayActorError. Just return it and
    # let the HTTP proxy handle the retry logic.
    logger.debug("Sending query to replica: " + backend_replica_tag)
    start = time.time()
    worker = self.replicas[backend_replica_tag]
    result = await worker.handle_request.remote(req)
    await self.mark_worker_idle(backend, backend_replica_tag)
    logger.debug("Got result in {:.2f}s".format(time.time() - start))
    return result
async def __init__(self,
                   controller_name: str,
                   http_config: HTTPConfig,
                   detached: bool = False):
    # Used to read/write checkpoints.
    self.kv_store = RayInternalKVStore(namespace=controller_name)
    self.actor_reconciler = ActorStateReconciler(controller_name, detached)

    # backend -> AutoscalingPolicy
    self.autoscaling_policies = dict()

    # Dictionary of backend_tag -> proxy_name -> most recent queue length.
    self.backend_stats = defaultdict(lambda: defaultdict(dict))

    # Used to ensure that only a single state-changing operation happens
    # at any given time.
    self.write_lock = asyncio.Lock()

    # Map of awaiting results.
    # TODO(ilr): Checkpoint this once this becomes asynchronous.
    self.inflight_results: Dict[UUID, asyncio.Event] = dict()
    self._serializable_inflight_results: Dict[UUID, FutureResult] = dict()

    # HTTP state doesn't currently require a checkpoint.
    self.http_state = HTTPState(controller_name, detached, http_config)

    checkpoint_bytes = self.kv_store.get(CHECKPOINT_KEY)
    if checkpoint_bytes is None:
        logger.debug("No checkpoint found")
        self.backend_state = BackendState()
        self.endpoint_state = EndpointState()
    else:
        checkpoint: Checkpoint = pickle.loads(checkpoint_bytes)
        self.backend_state = BackendState(
            checkpoint=checkpoint.backend_state_checkpoint)
        self.endpoint_state = EndpointState(
            checkpoint=checkpoint.endpoint_state_checkpoint)
        await self._recover_from_checkpoint(checkpoint)

    # NOTE(simon): Currently we do an all-to-all broadcast. This means
    # any listener will receive notifications for all changes. This can
    # be a problem at scale, e.g. updating a single backend config sends
    # over all the configs. In the future, we should optimize the logic
    # to support subscription by key.
    self.long_poll_host = LongPollHost()

    # The configs pushed out here get updated by
    # self._recover_from_checkpoint in the failure scenario, so that must
    # be run before we notify the changes.
    self.notify_backend_configs_changed()
    self.notify_replica_handles_changed()
    self.notify_traffic_policies_changed()
    self.notify_route_table_changed()

    asyncio.get_event_loop().create_task(self.run_control_loop())