def __del__(self):
    if not self._detached:
        logger.info(
            "Shutting down Ray Serve because client went out of "
            "scope. To prevent this, either keep a reference to "
            "the client object or use serve.start(detached=True).")
        self.shutdown()
def start(
        detached: bool = False,
        http_options: Optional[Union[dict, HTTPOptions]] = None,
        dedicated_cpu: bool = False,
        _checkpoint_path: str = DEFAULT_CHECKPOINT_PATH,
        **kwargs,
) -> Client:
    """Initialize a serve instance.

    By default, the instance will be scoped to the lifetime of the returned
    Client object (or when the script exits). If detached is set to True, the
    instance will instead persist until serve.shutdown() is called. This is
    only relevant if connecting to a long-running Ray cluster (e.g., with
    ray.init(address="auto") or ray.init("ray://<remote_addr>")).

    Args:
        detached (bool): Whether or not the instance should be detached from
            this script. If set, the instance will live on the Ray cluster
            until it is explicitly stopped with serve.shutdown().
        http_options (Optional[Dict, serve.HTTPOptions]): Configuration
            options for HTTP proxy. You can pass in a dictionary or
            HTTPOptions object with fields:

            - host(str, None): Host for HTTP servers to listen on. Defaults
              to "127.0.0.1". To expose Serve publicly, you probably want to
              set this to "0.0.0.0".
            - port(int): Port for HTTP server. Defaults to 8000.
            - middlewares(list): A list of Starlette middlewares that will be
              applied to the HTTP servers in the cluster. Defaults to [].
            - location(str, serve.config.DeploymentMode): The deployment
              location of HTTP servers:

                - "HeadOnly": start one HTTP server on the head node. Serve
                  assumes the head node is the node you executed serve.start
                  on. This is the default.
                - "EveryNode": start one HTTP server per node.
                - "NoServer" or None: disable HTTP server.
            - num_cpus (int): The number of CPU cores to reserve for each
              internal Serve HTTP proxy actor. Defaults to 0.
        dedicated_cpu (bool): Whether to reserve a CPU core for the internal
            Serve controller actor. Defaults to False.
    """
    http_deprecated_args = ["http_host", "http_port", "http_middlewares"]
    for key in http_deprecated_args:
        if key in kwargs:
            raise ValueError(
                f"{key} is deprecated, please use serve.start(http_options="
                f'{{"{key}": {kwargs[key]}}}) instead.')

    # Initialize ray if needed.
    ray.worker.global_worker.filter_logs_by_job = False
    if not ray.is_initialized():
        ray.init(namespace="serve")

    controller_namespace = _get_controller_namespace(detached)

    try:
        client = _get_global_client()
        logger.info("Connecting to existing Serve instance in namespace "
                    f"'{controller_namespace}'.")
        return client
    except RayServeException:
        pass

    if detached:
        controller_name = SERVE_CONTROLLER_NAME
    else:
        controller_name = format_actor_name(get_random_letters(),
                                            SERVE_CONTROLLER_NAME)

    if isinstance(http_options, dict):
        http_options = HTTPOptions.parse_obj(http_options)
    if http_options is None:
        http_options = HTTPOptions()

    controller = ServeController.options(
        num_cpus=(1 if dedicated_cpu else 0),
        name=controller_name,
        lifetime="detached" if detached else None,
        max_restarts=-1,
        max_task_retries=-1,
        # Pin Serve controller on the head node.
        resources={
            get_current_node_resource_key(): 0.01
        },
        namespace=controller_namespace,
        max_concurrency=CONTROLLER_MAX_CONCURRENCY,
    ).remote(
        controller_name,
        http_options,
        _checkpoint_path,
        detached=detached,
    )

    proxy_handles = ray.get(controller.get_http_proxies.remote())
    if len(proxy_handles) > 0:
        try:
            ray.get(
                [handle.ready.remote() for handle in proxy_handles.values()],
                timeout=HTTP_PROXY_TIMEOUT,
            )
        except ray.exceptions.GetTimeoutError:
            raise TimeoutError(
                f"HTTP proxies not available after {HTTP_PROXY_TIMEOUT}s.")

    client = Client(controller, controller_name, detached=detached)
    _set_global_client(client)
    logger.info(f"Started{' detached ' if detached else ' '}Serve instance in "
                f"namespace '{controller_namespace}'.")
    return client
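
# A minimal usage sketch based on the docstring above (the cluster address
# and option values are illustrative, not prescribed by this module): start
# a detached Serve instance that listens publicly and runs one HTTP proxy
# per node.
import ray
from ray import serve

ray.init(address="auto")
serve.start(
    detached=True,
    http_options={"host": "0.0.0.0", "port": 8000, "location": "EveryNode"},
)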
ray.shutdown()
ray.init(address="auto")
client = serve.start()

# These numbers need to correspond with the autoscaler config file.
# The number of remote nodes in the autoscaler should upper bound
# these because sometimes nodes fail to update.
num_workers = 20
expected_num_nodes = num_workers + 1
cpus_per_node = 4
num_remote_cpus = expected_num_nodes * cpus_per_node

# Wait until the expected number of nodes have joined the cluster.
while True:
    num_nodes = len(ray.nodes())
    logger.info("Waiting for nodes {}/{}".format(num_nodes,
                                                 expected_num_nodes))
    if num_nodes >= expected_num_nodes:
        break
    time.sleep(5)
logger.info("Nodes have all joined. There are %s resources.",
            ray.cluster_resources())


def hey(_):
    time.sleep(0.01)  # Sleep for 10ms
    return b"hey"


num_connections = int(num_remote_cpus * 0.75)
num_threads = 2
time_to_run = "10s"
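
# Sanity check of the sizing arithmetic above (standalone sketch; the
# constants are copied from the script and can be run independently).
num_workers = 20
expected_num_nodes = num_workers + 1                   # 21 nodes incl. head
cpus_per_node = 4
num_remote_cpus = expected_num_nodes * cpus_per_node   # 84 CPUs total
num_connections = int(num_remote_cpus * 0.75)          # 63 wrk connections
assert (expected_num_nodes, num_remote_cpus, num_connections) == (21, 84, 63)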
def _scale_backend_replicas(
        self,
        backend_tag: BackendTag,
        target_replicas: int,
        target_version: str,
) -> bool:
    """Scale the given backend to the number of replicas.

    NOTE: this does not actually start or stop the replicas, but instead
    adds them to ReplicaState.SHOULD_START or ReplicaState.SHOULD_STOP.
    The caller is responsible for then first writing a checkpoint and then
    actually starting/stopping the intended replicas. This avoids
    inconsistencies with starting/stopping a replica and then crashing
    before writing a checkpoint.
    """
    assert (backend_tag in self._backend_metadata
            ), "Backend {} is not registered.".format(backend_tag)
    assert target_replicas >= 0, ("Number of replicas must be"
                                  " greater than or equal to 0.")

    backend_info: BackendInfo = self._backend_metadata[backend_tag]
    graceful_shutdown_timeout_s = (
        backend_info.backend_config.experimental_graceful_shutdown_timeout_s)

    stopped = self._stop_wrong_version_replicas(
        self._replicas[backend_tag], target_replicas, target_version,
        graceful_shutdown_timeout_s)
    if stopped > 0:
        logger.info(f"Stopping {stopped} replicas of backend "
                    f"'{backend_tag}' with outdated versions.")

    current_replicas = self._replicas[backend_tag].count(states=[
        ReplicaState.SHOULD_START, ReplicaState.STARTING, ReplicaState.RUNNING
    ])

    delta_replicas = target_replicas - current_replicas
    if delta_replicas == 0:
        return False

    elif delta_replicas > 0:
        # Don't ever exceed target_replicas.
        stopping_replicas = self._replicas[backend_tag].count(states=[
            ReplicaState.SHOULD_STOP,
            ReplicaState.STOPPING,
        ])
        to_add = max(delta_replicas - stopping_replicas, 0)
        if to_add > 0:
            logger.info(f"Adding {to_add} replicas "
                        f"to backend '{backend_tag}'.")
        for _ in range(to_add):
            replica_tag = "{}#{}".format(backend_tag, get_random_letters())
            self._replicas[backend_tag].add(
                ReplicaState.SHOULD_START,
                BackendReplica(self._controller_name, self._detached,
                               replica_tag, backend_tag, target_version))

    elif delta_replicas < 0:
        to_remove = -delta_replicas
        logger.info(f"Removing {to_remove} replicas "
                    f"from backend '{backend_tag}'.")
        replicas_to_stop = self._replicas[backend_tag].pop(
            states=[
                ReplicaState.SHOULD_START, ReplicaState.STARTING,
                ReplicaState.RUNNING
            ],
            max_replicas=to_remove)

        for replica in replicas_to_stop:
            replica.set_should_stop(graceful_shutdown_timeout_s)
            self._replicas[backend_tag].add(ReplicaState.SHOULD_STOP, replica)

    return True
def make_kv_store(checkpoint_path, namespace):
    """Create KVStore instance based on checkpoint_path configuration."""

    if checkpoint_path == DEFAULT_CHECKPOINT_PATH:
        logger.info(
            "Using RayInternalKVStore for controller checkpoint and recovery.")
        return RayInternalKVStore(namespace)
    else:
        parsed_url = urlparse(checkpoint_path)
        if parsed_url.scheme not in {"gs", "s3", "file", "custom"}:
            raise ValueError(
                f"Checkpoint must be one of `{DEFAULT_CHECKPOINT_PATH}`, "
                "`file://path...`, `gs://path...`, `s3://path...`, or "
                "`custom://my_module.ClassName?arg1=val1`. But it is "
                f"{checkpoint_path}")

        if parsed_url.scheme == "file":
            db_path = parsed_url.netloc + parsed_url.path
            logger.info("Using RayLocalKVStore for controller "
                        f"checkpoint and recovery: path={db_path}")
            return RayLocalKVStore(namespace, db_path)

        if parsed_url.scheme == "gs":
            bucket = parsed_url.netloc
            # We need to strip leading "/" in path as right key to use in
            # gcs. Ex: gs://bucket/folder/file.zip -> key = "folder/file.zip"
            prefix = parsed_url.path.lstrip("/")
            logger.info("Using Ray GCS KVStore for controller checkpoint and"
                        " recovery: "
                        f"bucket={bucket} checkpoint_path={checkpoint_path}")
            return RayGcsKVStore(
                namespace,
                bucket=bucket,
                prefix=prefix,
            )

        if parsed_url.scheme == "s3":
            bucket = parsed_url.netloc
            # We need to strip leading "/" in path as right key to use in
            # boto3. Ex: s3://bucket/folder/file.zip -> key = "folder/file.zip"
            prefix = parsed_url.path.lstrip("/")
            logger.info(
                "Using Ray S3 KVStore for controller checkpoint and recovery: "
                f"bucket={bucket} checkpoint_path={checkpoint_path}")
            return RayS3KVStore(
                namespace,
                bucket=bucket,
                prefix=prefix,
            )

        if parsed_url.scheme == "custom":
            kwargs = dict(parse_qsl(parsed_url.query))

            # Prepare the parameters to initialize imported class.
            checkpoint_provider = parsed_url.netloc
            KVStoreClass = import_attr(checkpoint_provider)
            if not issubclass(KVStoreClass, KVStoreBase):
                raise ValueError(
                    f"{KVStoreClass} doesn't inherit from "
                    "`ray.serve.storage.kv_store_base.KVStoreBase`.")

            logger.info(
                f"Using {checkpoint_provider} for controller checkpoint and "
                f"recovery: kwargs={kwargs}")
            return KVStoreClass(namespace=namespace, **kwargs)

    raise RuntimeError("This shouldn't be reachable.")
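
# Illustrative checkpoint_path values for each scheme accepted above; the
# bucket names, local file path, and custom class path are placeholders,
# not real resources.
make_kv_store("file:///tmp/serve_checkpoint.db", namespace="serve")
make_kv_store("s3://my-bucket/serve/checkpoint", namespace="serve")
make_kv_store("gs://my-bucket/serve/checkpoint", namespace="serve")
make_kv_store("custom://my_module.MyKVStore?arg1=val1", namespace="serve")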
def main():
    # Set up local cluster; note this cluster setup is the same for both
    # local and production Ray cluster environments.
    # Each test uses a different Ray namespace, so the KV storage key for
    # each checkpoint is different to avoid collisions.
    namespace = uuid.uuid4().hex

    # IS_SMOKE_TEST is set by args of releaser's e2e.py
    smoke_test = os.environ.get("IS_SMOKE_TEST", "1")
    if smoke_test == "1":
        path = Path("checkpoint.db")
        checkpoint_path = f"file://{path}"
        if path.exists():
            path.unlink()
    else:
        checkpoint_path = (
            "s3://serve-nightly-tests/fault-tolerant-test-checkpoint"  # noqa: E501
        )

    _, cluster = setup_local_single_node_cluster(
        1, checkpoint_path=checkpoint_path, namespace=namespace)

    # Deploy for the first time
    @serve.deployment(num_replicas=DEFAULT_NUM_REPLICAS)
    def hello():
        return serve.get_replica_context().deployment

    for name in ["hello", "world"]:
        hello.options(name=name).deploy()

        for _ in range(5):
            response = request_with_retries(f"/{name}/", timeout=3)
            assert response.text == name

    logger.info("Initial deployment successful with working endpoint.")

    # Kill current cluster, recover from remote checkpoint and ensure endpoint
    # is still available with expected results
    ray.kill(serve.api._global_client._controller, no_restart=True)
    ray.shutdown()
    cluster.shutdown()
    serve.api._set_global_client(None)

    # Start another ray cluster with same namespace to resume from previous
    # checkpoints with no new deploy() call.
    setup_local_single_node_cluster(
        1, checkpoint_path=checkpoint_path, namespace=namespace)

    for name in ["hello", "world"]:
        for _ in range(5):
            response = request_with_retries(f"/{name}/", timeout=3)
            assert response.text == name
    logger.info("Deployment recovery from s3 checkpoint is successful "
                "with working endpoint.")

    # Delete dangling checkpoints. If the script failed before this step, it's
    # up to the TTL policy on s3 to clean up, but this won't lead to collision
    # with subsequent tests since each test runs in a different uuid namespace.
    serve.shutdown()
    ray.shutdown()
    cluster.shutdown()

    # Checkpoints in S3 bucket are moved after 7 days with explicit lifecycle
    # rules. Each checkpoint is ~260 Bytes in size from this test.

    # Save results
    save_test_results(
        {"result": "success"},
        default_output_file="/tmp/serve_cluster_fault_tolerance.json",
    )
def _stop_wrong_version_replicas(
        self, backend_tag: BackendTag, replicas: ReplicaStateContainer,
        target_replicas: int, target_version: str,
        graceful_shutdown_timeout_s: float) -> int:
    """Stops replicas with outdated versions to implement rolling updates.

    This includes both explicit code version updates and changes to the
    user_config.
    """
    # NOTE(edoakes): this short-circuits when using the legacy
    # `create_backend` codepath -- it can be removed once we deprecate
    # that as the version should never be None.
    if target_version is None:
        return 0

    # Short circuit if target replicas is 0 (the backend is being deleted)
    # because this will be handled in the main loop.
    if target_replicas == 0:
        return 0

    # We include SHOULD_START and STARTING_OR_UPDATING replicas here
    # because if there are replicas still pending startup, we may as well
    # terminate them and start new version replicas instead.
    old_running_replicas = replicas.count(
        exclude_version=target_version,
        states=[
            ReplicaState.SHOULD_START, ReplicaState.STARTING_OR_UPDATING,
            ReplicaState.RUNNING
        ])
    old_stopping_replicas = replicas.count(
        exclude_version=target_version,
        states=[ReplicaState.SHOULD_STOP, ReplicaState.STOPPING])
    new_running_replicas = replicas.count(
        version=target_version, states=[ReplicaState.RUNNING])

    # If the backend is currently scaling down, let the scale down
    # complete before doing a rolling update.
    if target_replicas < old_running_replicas + old_stopping_replicas:
        return 0

    # The number of replicas that are currently in transition between
    # an old version and the new version. Note that we cannot directly
    # count the number of stopping replicas because once replicas finish
    # stopping, they are removed from the data structure.
    pending_replicas = (
        target_replicas - new_running_replicas - old_running_replicas)

    # Maximum number of replicas that can be updating at any given time.
    # There should never be more than rollout_size old replicas stopping
    # or rollout_size new replicas starting.
    rollout_size = max(int(0.2 * target_replicas), 1)
    max_to_stop = max(rollout_size - pending_replicas, 0)

    replicas_to_update = replicas.pop(
        exclude_version=target_version,
        states=[
            ReplicaState.SHOULD_START, ReplicaState.STARTING_OR_UPDATING,
            ReplicaState.RUNNING
        ],
        max_replicas=max_to_stop)

    code_version_changes = 0
    user_config_changes = 0
    for replica in replicas_to_update:
        # If the code version is a mismatch, we stop the replica. A new one
        # with the correct version will be started later as part of the
        # normal scale-up process.
        if replica.version.code_version != target_version.code_version:
            code_version_changes += 1
            replica.set_should_stop(graceful_shutdown_timeout_s)
            replicas.add(ReplicaState.SHOULD_STOP, replica)
        # If only the user_config is a mismatch, we update it dynamically
        # without restarting the replica.
        elif (replica.version.user_config_hash !=
              target_version.user_config_hash):
            user_config_changes += 1
            replica.start_or_update(self._backend_metadata[backend_tag],
                                    target_version)
            replicas.add(ReplicaState.STARTING_OR_UPDATING, replica)
        else:
            assert False, "Update must be code version or user config."

    if code_version_changes > 0:
        logger.info(f"Stopping {code_version_changes} replicas of backend "
                    f"'{backend_tag}' with outdated versions.")

    if user_config_changes > 0:
        logger.info(f"Updating {user_config_changes} replicas of backend "
                    f"'{backend_tag}' with outdated user_configs.")

    return len(replicas_to_update)
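
# Worked example of the rollout sizing above, with illustrative numbers only
# (not taken from any real deployment): a backend targeting 10 replicas,
# mid-rollout, with 7 old-version and 2 new-version replicas running.
target_replicas = 10
old_running_replicas = 7
new_running_replicas = 2

pending_replicas = (
    target_replicas - new_running_replicas - old_running_replicas)  # 1
rollout_size = max(int(0.2 * target_replicas), 1)  # 2 replicas per wave
max_to_stop = max(rollout_size - pending_replicas, 0)  # stop at most 1 more
assert (pending_replicas, rollout_size, max_to_stop) == (1, 2, 1)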
async def _recover_from_checkpoint(self, checkpoint_bytes):
    """Recover the cluster state from the provided checkpoint.

    Performs the following operations:
        1) Deserializes the internal state from the checkpoint.
        2) Pushes the latest configuration to the HTTP proxy and router
           in case we crashed before updating them.
        3) Starts/stops any worker replicas that are pending creation or
           deletion.

    NOTE: this requires that self.write_lock is already acquired and will
    release it before returning.
    """
    assert self.write_lock.locked()

    start = time.time()
    logger.info("Recovering from checkpoint")

    # Load internal state from the checkpoint data.
    (
        self.routes,
        self.backends,
        self.traffic_policies,
        self.replicas,
        self.replicas_to_start,
        self.replicas_to_stop,
        self.backends_to_remove,
        self.endpoints_to_remove,
    ) = pickle.loads(checkpoint_bytes)

    # Fetch actor handles for all of the backend replicas in the system.
    # All of these workers are guaranteed to already exist because they
    # would not be written to a checkpoint in self.workers until they
    # were created.
    for backend_tag, replica_tags in self.replicas.items():
        for replica_tag in replica_tags:
            replica_name = format_actor_name(replica_tag, self.cluster_name)
            self.workers[backend_tag][replica_tag] = ray.util.get_actor(
                replica_name)

    # Push configuration state to the router.
    # TODO(edoakes): should we make this a pull-only model for simplicity?
    for endpoint, traffic_policy in self.traffic_policies.items():
        await self.router.set_traffic.remote(endpoint, traffic_policy)

    for backend_tag, replica_dict in self.workers.items():
        for replica_tag, worker in replica_dict.items():
            await self.router.add_new_worker.remote(backend_tag, replica_tag,
                                                    worker)

    for backend, (_, backend_config, _) in self.backends.items():
        await self.router.set_backend_config.remote(backend, backend_config)

    # Push configuration state to the HTTP proxy.
    await self.http_proxy.set_route_table.remote(self.routes)

    # Start/stop any pending backend replicas.
    await self._start_pending_replicas()
    await self._stop_pending_replicas()

    # Remove any pending backends and endpoints.
    await self._remove_pending_backends()
    await self._remove_pending_endpoints()

    logger.info(
        "Recovered from checkpoint in {:.3f}s".format(time.time() - start))

    self.write_lock.release()
def main(
        min_replicas: Optional[int],
        max_replicas: Optional[int],
        trial_length: Optional[str],
        max_batch_size: Optional[int],
):
    # Give default cluster parameter values based on smoke_test config;
    # if user provided values explicitly, use them instead.
    # IS_SMOKE_TEST is set by args of releaser's e2e.py
    if is_smoke_test():
        min_replicas = min_replicas or DEFAULT_SMOKE_TEST_MIN_NUM_REPLICA
        max_replicas = max_replicas or DEFAULT_SMOKE_TEST_MAX_NUM_REPLICA
        trial_length = trial_length or DEFAULT_SMOKE_TEST_TRIAL_LENGTH
        logger.info(
            f"Running local / smoke test with min {min_replicas} and max "
            f"{max_replicas} replicas ..\n")

        # Choose cluster setup based on user config. Local test uses Cluster()
        # to mock actors that requires # of nodes to be specified, but ray
        # client doesn't need to
        num_nodes = int(math.ceil(max_replicas / NUM_CPU_PER_NODE))
        logger.info(
            f"Setting up local ray cluster with {num_nodes} nodes ..\n")
        serve_client = setup_local_single_node_cluster(num_nodes)[0]
    else:
        min_replicas = min_replicas or DEFAULT_FULL_TEST_MIN_NUM_REPLICA
        max_replicas = max_replicas or DEFAULT_FULL_TEST_MAX_NUM_REPLICA
        trial_length = trial_length or DEFAULT_FULL_TEST_TRIAL_LENGTH
        logger.info(f"Running full test with min {min_replicas} and max "
                    f"{max_replicas} replicas ..\n")
        logger.info("Setting up anyscale ray cluster .. \n")
        serve_client = setup_anyscale_cluster()

    http_host = str(serve_client._http_config.host)
    http_port = str(serve_client._http_config.port)
    logger.info(f"Ray serve http_host: {http_host}, http_port: {http_port}")

    logger.info(f"Deploying with min {min_replicas} and max {max_replicas} "
                f"target replicas ....\n")
    deploy_replicas(min_replicas, max_replicas, max_batch_size)

    logger.info("Warming up cluster ....\n")
    warm_up_one_cluster.remote(10, http_host, http_port, "echo")

    logger.info(f"Starting wrk trial on all nodes for {trial_length} ....\n")
    # For detailed discussion, see https://github.com/wg/wrk/issues/205
    # TODO:(jiaodong) What's the best number to use here ?
    all_endpoints = list(serve.list_deployments().keys())
    all_metrics, all_wrk_stdout = run_wrk_on_all_nodes(
        trial_length,
        NUM_CONNECTIONS,
        http_host,
        http_port,
        all_endpoints=all_endpoints)

    aggregated_metrics = aggregate_all_metrics(all_metrics)

    logger.info("Wrk stdout on each node: ")
    for wrk_stdout in all_wrk_stdout:
        logger.info(wrk_stdout)

    logger.info("Final aggregated metrics: ")
    for key, val in aggregated_metrics.items():
        logger.info(f"{key}: {val}")

    save_test_results(
        aggregated_metrics,
        default_output_file="/tmp/autoscaling_single_deployment.json",
    )
async def _recover_from_checkpoint(self, checkpoint_bytes: bytes) -> None:
    """Recover the instance state from the provided checkpoint.

    Performs the following operations:
        1) Deserializes the internal state from the checkpoint.
        2) Pushes the latest configuration to the routers in case we
           crashed before updating them.
        3) Starts/stops any worker replicas that are pending creation or
           deletion.

    NOTE: this requires that self.write_lock is already acquired and will
    release it before returning.
    """
    assert self.write_lock.locked()

    start = time.time()
    logger.info("Recovering from checkpoint")

    # Load internal state from the checkpoint data.
    (
        self.routes,
        router_node_ids,
        self.backends,
        self.traffic_policies,
        self.replicas,
        self.replicas_to_start,
        self.replicas_to_stop,
        self.backends_to_remove,
        self.endpoints_to_remove,
    ) = pickle.loads(checkpoint_bytes)

    for node_id in router_node_ids:
        router_name = format_actor_name(SERVE_PROXY_NAME,
                                        self.controller_name, node_id)
        self.routers[node_id] = ray.get_actor(router_name)

    # Fetch actor handles for all of the backend replicas in the system.
    # All of these workers are guaranteed to already exist because they
    # would not be written to a checkpoint in self.workers until they
    # were created.
    for backend_tag, replica_tags in self.replicas.items():
        for replica_tag in replica_tags:
            replica_name = format_actor_name(replica_tag,
                                             self.controller_name)
            self.workers[backend_tag][replica_tag] = ray.get_actor(
                replica_name)

    # Push configuration state to the router.
    # TODO(edoakes): should we make this a pull-only model for simplicity?
    for endpoint, traffic_policy in self.traffic_policies.items():
        await asyncio.gather(*[
            router.set_traffic.remote(endpoint, traffic_policy)
            for router in self.routers.values()
        ])

    for backend_tag, replica_dict in self.workers.items():
        for replica_tag, worker in replica_dict.items():
            await asyncio.gather(*[
                router.add_new_worker.remote(backend_tag, replica_tag, worker)
                for router in self.routers.values()
            ])

    for backend, info in self.backends.items():
        await asyncio.gather(*[
            router.set_backend_config.remote(backend, info.backend_config)
            for router in self.routers.values()
        ])
        await self.broadcast_backend_config(backend)
        metadata = info.backend_config.internal_metadata
        if metadata.autoscaling_config is not None:
            self.autoscaling_policies[backend] = BasicAutoscalingPolicy(
                backend, metadata.autoscaling_config)

    # Push configuration state to the routers.
    await asyncio.gather(*[
        router.set_route_table.remote(self.routes)
        for router in self.routers.values()
    ])

    # Start/stop any pending backend replicas.
    await self._start_pending_replicas()
    await self._stop_pending_replicas()

    # Remove any pending backends and endpoints.
    await self._remove_pending_backends()
    await self._remove_pending_endpoints()

    logger.info(
        "Recovered from checkpoint in {:.3f}s".format(time.time() - start))

    self.write_lock.release()
def main():
    # Set up local cluster; note this cluster setup is the same for both
    # local and production Ray cluster environments.
    # Each test uses a different Ray namespace, so the KV storage key for
    # each checkpoint is different to avoid collisions.
    namespace = uuid.uuid4().hex

    # IS_SMOKE_TEST is set by args of releaser's e2e.py
    smoke_test = os.environ.get("IS_SMOKE_TEST", "1")
    if smoke_test == "1":
        checkpoint_path = "file://checkpoint.db"
    else:
        checkpoint_path = (
            "gs://kazi_test/test/fault-tolerant-test-checkpoint"  # noqa: E501
        )

    _, cluster = setup_local_single_node_cluster(
        1, checkpoint_path=checkpoint_path, namespace=namespace)

    # Deploy for the first time
    @serve.deployment(name="echo", num_replicas=DEFAULT_NUM_REPLICAS)
    class Echo:
        def __init__(self):
            pass

        def __call__(self, request):
            return "hii"

    Echo.deploy()

    # Ensure endpoint is working
    for _ in range(5):
        response = request_with_retries("/echo/", timeout=3)
        assert response.text == "hii"

    logger.info("Initial deployment successful with working endpoint.")

    # Kill current cluster, recover from remote checkpoint and ensure endpoint
    # is still available with expected results
    ray.kill(serve.api._global_client._controller, no_restart=True)
    ray.shutdown()
    cluster.shutdown()
    serve.api._set_global_client(None)

    # Start another ray cluster with same namespace to resume from previous
    # checkpoints with no new deploy() call.
    setup_local_single_node_cluster(
        1, checkpoint_path=checkpoint_path, namespace=namespace)

    for _ in range(5):
        response = request_with_retries("/echo/", timeout=3)
        assert response.text == "hii"
    logger.info("Deployment recovery from Google Cloud Storage checkpoint "
                "is successful with working endpoint.")

    # Delete dangling checkpoints. If the script failed before this step, it's
    # up to the TTL policy on GCS to clean up, but this won't lead to collision
    # with subsequent tests since each test runs in a different uuid namespace.
    serve.shutdown()
    ray.shutdown()
    cluster.shutdown()

    # Checkpoints in GCS bucket are moved after 7 days with explicit lifecycle
    # rules. Each checkpoint is ~260 Bytes in size from this test.

    # Save results
    save_test_results(
        {"result": "success"},
        default_output_file="/tmp/serve_cluster_fault_tolerance.json",
    )
def deploy(self,
           name: str,
           backend_def: Union[Callable, Type[Callable], str],
           *init_args: Any,
           ray_actor_options: Optional[Dict] = None,
           config: Optional[Union[BackendConfig, Dict[str, Any]]] = None,
           version: Optional[str] = None,
           prev_version: Optional[str] = None,
           route_prefix: Optional[str] = None,
           url: str = "",
           _blocking: Optional[bool] = True) -> Optional[GoalId]:
    if config is None:
        config = {}
    if ray_actor_options is None:
        ray_actor_options = {}

    curr_job_env = ray.get_runtime_context().runtime_env
    if "runtime_env" in ray_actor_options:
        ray_actor_options["runtime_env"].setdefault(
            "uris", curr_job_env.get("uris"))
    else:
        ray_actor_options["runtime_env"] = curr_job_env

    if "working_dir" in ray_actor_options["runtime_env"]:
        del ray_actor_options["runtime_env"]["working_dir"]

    replica_config = ReplicaConfig(
        backend_def, *init_args, ray_actor_options=ray_actor_options)

    if isinstance(config, dict):
        backend_config = BackendConfig.parse_obj(config)
    elif isinstance(config, BackendConfig):
        backend_config = config
    else:
        raise TypeError("config must be a BackendConfig or a dictionary.")

    goal_id, updating = ray.get(
        self._controller.deploy.remote(name,
                                       backend_config.to_proto_bytes(),
                                       replica_config, version, prev_version,
                                       route_prefix,
                                       ray.get_runtime_context().job_id))

    tag = f"component=serve deployment={name}"

    if updating:
        msg = f"Updating deployment '{name}'"
        if version is not None:
            msg += f" to version '{version}'"
        logger.info(f"{msg}. {tag}")
    else:
        logger.info(f"Deployment '{name}' is already at version "
                    f"'{version}', not updating. {tag}")

    if _blocking:
        self._wait_for_goal(goal_id)
        logger.info(
            f"Deployment '{name}{':'+version if version else ''}' is ready"
            f" at `{url}`. {tag}")
    else:
        return goal_id
def main(num_replicas: Optional[int], num_trials: Optional[int],
         trial_length: Optional[str], max_batch_size: Optional[int],
         run_locally: Optional[bool]):
    # Give default cluster parameter values based on smoke_test config;
    # if user provided values explicitly, use them instead.
    # IS_SMOKE_TEST is set by args of releaser's e2e.py
    smoke_test = os.environ.get("IS_SMOKE_TEST", "1")
    if smoke_test == "0":
        num_replicas = num_replicas or DEFAULT_FULL_TEST_NUM_REPLICA
        num_trials = num_trials or DEFAULT_FULL_TEST_NUM_TRIALS
        trial_length = trial_length or DEFAULT_FULL_TEST_TRIAL_LENGTH
        logger.info(f"Running full test with {num_replicas} replicas, "
                    f"{num_trials} trials that last {trial_length} each.. \n")
    else:
        num_replicas = num_replicas or DEFAULT_SMOKE_TEST_NUM_REPLICA
        num_trials = num_trials or DEFAULT_SMOKE_TEST_NUM_TRIALS
        trial_length = trial_length or DEFAULT_SMOKE_TEST_TRIAL_LENGTH
        logger.info(f"Running smoke test with {num_replicas} replicas, "
                    f"{num_trials} trials that last {trial_length} each.. \n")

    # Choose cluster setup based on user config. Local test uses Cluster()
    # to mock actors that requires # of nodes to be specified, but ray
    # client doesn't need to
    if run_locally:
        num_nodes = int(math.ceil(num_replicas / NUM_CPU_PER_NODE))
        logger.info(
            f"Setting up local ray cluster with {num_nodes} nodes ....\n")
        serve_client = setup_local_single_node_cluster(num_nodes)
    else:
        logger.info("Setting up anyscale ray cluster .. \n")
        serve_client = setup_anyscale_cluster()

    http_host = str(serve_client._http_config.host)
    http_port = str(serve_client._http_config.port)
    logger.info(f"Ray serve http_host: {http_host}, http_port: {http_port}")

    logger.info(f"Deploying with {num_replicas} target replicas ....\n")
    deploy_replicas(num_replicas, max_batch_size)

    logger.info("Warming up cluster ....\n")
    warm_up_cluster(5, http_host, http_port)

    avg_throughput = []
    avg_latency = []
    p50_latency = []
    p75_latency = []
    p90_latency = []
    p99_latency = []
    for iteration in range(num_trials):
        logger.info(f"Starting wrk trial # {iteration + 1} ....\n")

        run_one_trial_remote = ray.remote(run_one_trial)
        refs = []
        for node in ray.nodes():
            if node["Alive"]:
                node_resource = f"node:{node['NodeManagerAddress']}"
                refs.append(
                    run_one_trial_remote.options(
                        num_cpus=0, resources={
                            node_resource: 0.01
                        }).remote(trial_length, http_host, http_port))

        for decoded_output in ray.get(refs):
            parsed = parse_wrk_decoded_stdout(decoded_output)
            avg_throughput.append(float(parsed["requests/sec"]))
            avg_latency.append(float(parsed["latency_avg_ms"]))
            p50_latency.append(float(parsed["P50_latency_ms"]))
            p75_latency.append(float(parsed["P75_latency_ms"]))
            p90_latency.append(float(parsed["P90_latency_ms"]))
            p99_latency.append(float(parsed["P99_latency_ms"]))

    final_results = {
        "avg_throughput_qps": sum(avg_throughput) / len(avg_throughput),
        "avg_latency_ms": sum(avg_latency) / len(avg_latency),
        "max_p50_latency_ms": max(p50_latency),
        "max_p75_latency_ms": max(p75_latency),
        "max_p90_latency_ms": max(p90_latency),
        "max_p99_latency_ms": max(p99_latency),
    }

    logger.info(f"Final results: {final_results}\n")

    test_output_json = os.environ.get(
        "TEST_OUTPUT_JSON", "/tmp/single_deployment_1k_noop_replica.json")
    with open(test_output_json, "wt") as f:
        json.dump(final_results, f)
def main(num_replicas: Optional[int], num_deployments: Optional[int],
         trial_length: Optional[str]):
    # Give default cluster parameter values based on smoke_test config;
    # if user provided values explicitly, use them instead.
    # IS_SMOKE_TEST is set by args of releaser's e2e.py
    smoke_test = os.environ.get("IS_SMOKE_TEST", "1")
    if smoke_test == "1":
        num_replicas = num_replicas or DEFAULT_SMOKE_TEST_NUM_REPLICA
        num_deployments = num_deployments or DEFAULT_SMOKE_TEST_NUM_DEPLOYMENTS
        trial_length = trial_length or DEFAULT_SMOKE_TEST_TRIAL_LENGTH
        logger.info(f"Running smoke test with {num_replicas} replicas, "
                    f"{num_deployments} deployments .. \n")
        # Choose cluster setup based on user config. Local test uses Cluster()
        # to mock actors that requires # of nodes to be specified, but ray
        # client doesn't need to
        num_nodes = int(math.ceil(num_replicas / NUM_CPU_PER_NODE))
        logger.info(
            f"Setting up local ray cluster with {num_nodes} nodes .. \n")
        serve_client = setup_local_single_node_cluster(num_nodes)
    else:
        num_replicas = num_replicas or DEFAULT_FULL_TEST_NUM_REPLICA
        num_deployments = num_deployments or DEFAULT_FULL_TEST_NUM_DEPLOYMENTS
        trial_length = trial_length or DEFAULT_FULL_TEST_TRIAL_LENGTH
        logger.info(f"Running full test with {num_replicas} replicas, "
                    f"{num_deployments} deployments .. \n")
        logger.info("Setting up anyscale ray cluster .. \n")
        serve_client = setup_anyscale_cluster()

    http_host = str(serve_client._http_config.host)
    http_port = str(serve_client._http_config.port)
    logger.info(f"Ray serve http_host: {http_host}, http_port: {http_port}")

    logger.info(f"Deploying with {num_replicas} target replicas ....\n")
    setup_multi_deployment_replicas(num_replicas, num_deployments)

    logger.info("Warming up cluster ....\n")
    rst_ray_refs = []
    all_endpoints = list(serve.list_deployments().keys())
    for endpoint in all_endpoints:
        rst_ray_refs.append(
            warm_up_one_cluster.options(num_cpus=0.1).remote(
                10, http_host, http_port, endpoint))
    for endpoint in ray.get(rst_ray_refs):
        logger.info(f"Finished warming up {endpoint}")

    logger.info(f"Starting wrk trial on all nodes for {trial_length} ....\n")
    # For detailed discussion, see https://github.com/wg/wrk/issues/205
    # TODO:(jiaodong) What's the best number to use here ?
    all_metrics, all_wrk_stdout = run_wrk_on_all_nodes(
        trial_length,
        NUM_CONNECTIONS,
        http_host,
        http_port,
        all_endpoints=all_endpoints)

    aggregated_metrics = aggregate_all_metrics(all_metrics)

    logger.info("Wrk stdout on each node: ")
    for wrk_stdout in all_wrk_stdout:
        logger.info(wrk_stdout)

    logger.info("Final aggregated metrics: ")
    for key, val in aggregated_metrics.items():
        logger.info(f"{key}: {val}")

    save_test_results(
        aggregated_metrics,
        default_output_file="/tmp/multi_deployment_1k_noop_replica.json")
def warm_up_cluster(num_warmup_iterations: int, http_host: str,
                    http_port: str) -> None:
    for _ in range(num_warmup_iterations):
        resp = requests.get(f"http://{http_host}:{http_port}/echo").text
        logger.info(resp)
        time.sleep(0.5)
async def trial(result_json, intermediate_handles, num_replicas,
                max_batch_size, max_concurrent_queries, data_size):
    trial_key_base = (
        f"replica:{num_replicas}/batch_size:{max_batch_size}/"
        f"concurrent_queries:{max_concurrent_queries}/"
        f"data_size:{data_size}/intermediate_handle:{intermediate_handles}")

    logger.info(f"intermediate_handles={intermediate_handles},"
                f"num_replicas={num_replicas},"
                f"max_batch_size={max_batch_size},"
                f"max_concurrent_queries={max_concurrent_queries},"
                f"data_size={data_size}")

    deployment_name = "api"
    if intermediate_handles:
        deployment_name = "downstream"

        @serve.deployment("api", max_concurrent_queries=1000)
        class ForwardActor:
            def __init__(self):
                self.handle = serve.get_deployment(
                    deployment_name).get_handle(sync=False)

            async def __call__(self, _):
                return await self.handle.remote()

        ForwardActor.deploy()
        routes = requests.get("http://localhost:8000/-/routes").json()
        assert "/api" in routes, routes

    @serve.deployment(
        name=deployment_name,
        num_replicas=num_replicas,
        max_concurrent_queries=max_concurrent_queries)
    class Backend:
        @serve.batch(max_batch_size=max_batch_size)
        async def batch(self, reqs):
            return [b"ok"] * len(reqs)

        async def __call__(self, req):
            if max_batch_size > 1:
                return await self.batch(req)
            else:
                return b"ok"

    Backend.deploy()
    routes = requests.get("http://localhost:8000/-/routes").json()
    assert f"/{deployment_name}" in routes, routes

    if data_size == "small":
        data = None
    elif data_size == "large":
        data = b"a" * 1024 * 1024
    else:
        raise ValueError("data_size should be 'small' or 'large'.")

    async with aiohttp.ClientSession() as session:

        async def single_client():
            for _ in range(CALLS_PER_BATCH):
                await fetch(session, data)

        single_client_avg_tps = await timeit(
            "single client {} data".format(data_size),
            single_client,
            multiplier=CALLS_PER_BATCH)
        key = "num_client:1/" + trial_key_base
        result_json.update({key: single_client_avg_tps})

    clients = [Client.remote() for _ in range(NUM_CLIENTS)]
    ray.get([client.ready.remote() for client in clients])

    async def many_clients():
        ray.get(
            [a.do_queries.remote(CALLS_PER_BATCH, data) for a in clients])

    multi_client_avg_tps = await timeit(
        "{} clients {} data".format(len(clients), data_size),
        many_clients,
        multiplier=CALLS_PER_BATCH * len(clients))
    key = f"num_client:{len(clients)}/" + trial_key_base
    result_json.update({key: multi_client_avg_tps})

    logger.info(result_json)
def main(num_replicas: Optional[int], num_trials: Optional[int],
         trial_length: Optional[str], max_batch_size: Optional[int],
         run_locally: Optional[bool]):
    # Give default cluster parameter values based on smoke_test config;
    # if user provided values explicitly, use them instead.
    # IS_SMOKE_TEST is set by args of releaser's e2e.py
    smoke_test = os.environ.get("IS_SMOKE_TEST", "0")
    if smoke_test == "0":
        num_replicas = num_replicas or DEFAULT_FULL_TEST_NUM_REPLICA
        num_trials = num_trials or DEFAULT_FULL_TEST_NUM_TRIALS
        trial_length = trial_length or DEFAULT_FULL_TEST_TRIAL_LENGTH
        logger.info(f"Running full test with {num_replicas} replicas, "
                    f"{num_trials} trials that last {trial_length} each.. \n")
    else:
        num_replicas = num_replicas or DEFAULT_SMOKE_TEST_NUM_REPLICA
        num_trials = num_trials or DEFAULT_SMOKE_TEST_NUM_TRIALS
        trial_length = trial_length or DEFAULT_SMOKE_TEST_TRIAL_LENGTH
        logger.info(f"Running smoke test with {num_replicas} replicas, "
                    f"{num_trials} trials that last {trial_length} each.. \n")

    # Choose cluster setup based on user config. Local test uses Cluster()
    # to mock actors that requires # of nodes to be specified, but ray
    # client doesn't need to
    if run_locally:
        num_nodes = int(math.ceil(num_replicas / NUM_CPU_PER_NODE))
        logger.info(
            f"Setting up local ray cluster with {num_nodes} nodes ....\n")
        serve_client = setup_local_single_node_cluster(num_nodes)
    else:
        logger.info("Setting up anyscale ray cluster .. \n")
        serve_client = setup_anyscale_cluster()

    http_host = str(serve_client._http_config.host)
    http_port = str(serve_client._http_config.port)
    logger.info(f"Ray serve http_host: {http_host}, http_port: {http_port}")

    logger.info(f"Deploying with {num_replicas} target replicas ....\n")
    deploy_replicas(num_replicas, max_batch_size)

    logger.info("Warming up cluster ....\n")
    warm_up_cluster(5, http_host, http_port)

    final_result = []
    for iteration in range(num_trials):
        logger.info(f"Starting wrk trial # {iteration + 1} ....\n")
        # For detailed discussion, see https://github.com/wg/wrk/issues/205
        # TODO:(jiaodong) What's the best number to use here ?
        num_connections = int(num_replicas * DEFAULT_MAX_BATCH_SIZE * 0.75)
        decoded_out = run_one_trial(trial_length, num_connections, http_host,
                                    http_port)
        metrics_dict = parse_wrk_decoded_stdout(decoded_out)
        final_result.append(metrics_dict)

    logger.info(f"Final results: {final_result}\n")

    test_output_json = os.environ.get(
        "TEST_OUTPUT_JSON", "/tmp/single_deployment_1k_noop_replica.json")
    with open(test_output_json, "wt") as f:
        json.dump(final_result, f)
def deploy(self,
           name: str,
           deployment_def: Union[Callable, Type[Callable], str],
           init_args: Tuple[Any],
           init_kwargs: Dict[Any, Any],
           ray_actor_options: Optional[Dict] = None,
           config: Optional[Union[DeploymentConfig, Dict[str, Any]]] = None,
           version: Optional[str] = None,
           prev_version: Optional[str] = None,
           route_prefix: Optional[str] = None,
           url: Optional[str] = None,
           _blocking: Optional[bool] = True) -> Optional[GoalId]:
    if config is None:
        config = {}
    if ray_actor_options is None:
        ray_actor_options = {}

    curr_job_env = ray.get_runtime_context().runtime_env
    if "runtime_env" in ray_actor_options:
        ray_actor_options["runtime_env"].setdefault(
            "working_dir", curr_job_env.get("working_dir"))
    else:
        ray_actor_options["runtime_env"] = curr_job_env

    replica_config = ReplicaConfig(
        deployment_def,
        init_args=init_args,
        init_kwargs=init_kwargs,
        ray_actor_options=ray_actor_options)

    if isinstance(config, dict):
        deployment_config = DeploymentConfig.parse_obj(config)
    elif isinstance(config, DeploymentConfig):
        deployment_config = config
    else:
        raise TypeError("config must be a DeploymentConfig or a dictionary.")

    if (deployment_config.autoscaling_config is not None
            and deployment_config.max_concurrent_queries <
            deployment_config.autoscaling_config.
            target_num_ongoing_requests_per_replica):
        logger.warning("Autoscaling will never happen because "
                       "'max_concurrent_queries' is less than "
                       "'target_num_ongoing_requests_per_replica'.")

    goal_id, updating = ray.get(
        self._controller.deploy.remote(name,
                                       deployment_config.to_proto_bytes(),
                                       replica_config, version, prev_version,
                                       route_prefix,
                                       ray.get_runtime_context().job_id))

    tag = f"component=serve deployment={name}"

    if updating:
        msg = f"Updating deployment '{name}'"
        if version is not None:
            msg += f" to version '{version}'"
        logger.info(f"{msg}. {tag}")
    else:
        logger.info(f"Deployment '{name}' is already at version "
                    f"'{version}', not updating. {tag}")

    if _blocking:
        self._wait_for_goal(goal_id)

        if url is not None:
            url_part = f" at `{url}`"
        else:
            url_part = ""
        logger.info(
            f"Deployment '{name}{':'+version if version else ''}' is ready"
            f"{url_part}. {tag}")
    else:
        return goal_id
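
# Illustrative config (hypothetical values, not from any real deployment)
# that would trigger the autoscaling warning above: each replica caps its
# in-flight requests below the autoscaler's per-replica target, so the
# scale-up condition can never be reached.
config = {
    "max_concurrent_queries": 2,
    "autoscaling_config": {"target_num_ongoing_requests_per_replica": 5},
}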
def _scale_backend_replicas(self) -> bool:
    """Scale the given backend to the number of replicas.

    NOTE: this does not actually start or stop the replicas, but instead
    adds them to ReplicaState.SHOULD_START or ReplicaState.SHOULD_STOP.
    The caller is responsible for then first writing a checkpoint and then
    actually starting/stopping the intended replicas. This avoids
    inconsistencies with starting/stopping a replica and then crashing
    before writing a checkpoint.
    """
    assert self._target_replicas >= 0, ("Number of replicas must be"
                                        " greater than or equal to 0.")

    graceful_shutdown_timeout_s = (
        self._target_info.backend_config.
        experimental_graceful_shutdown_timeout_s)

    self._stop_wrong_version_replicas()

    current_replicas = self._replicas.count(states=[
        ReplicaState.SHOULD_START, ReplicaState.STARTING_OR_UPDATING,
        ReplicaState.RUNNING
    ])

    delta_replicas = self._target_replicas - current_replicas
    if delta_replicas == 0:
        return False

    elif delta_replicas > 0:
        # Don't ever exceed self._target_replicas.
        stopping_replicas = self._replicas.count(states=[
            ReplicaState.SHOULD_STOP,
            ReplicaState.STOPPING,
        ])
        to_add = max(delta_replicas - stopping_replicas, 0)
        if to_add > 0:
            logger.info(f"Adding {to_add} replicas to deployment "
                        f"'{self._name}'. component=serve "
                        f"deployment={self._name}")
        for _ in range(to_add):
            replica_tag = "{}#{}".format(self._name, get_random_letters())
            self._replicas.add(
                ReplicaState.SHOULD_START,
                BackendReplica(self._controller_name, self._detached,
                               replica_tag, self._name,
                               self._target_version))
            logger.debug(
                f"Adding SHOULD_START to replica_tag: {replica_tag}, "
                f"backend_tag: {self._name}")

    elif delta_replicas < 0:
        to_remove = -delta_replicas
        logger.info(f"Removing {to_remove} replicas from deployment "
                    f"'{self._name}'. component=serve "
                    f"deployment={self._name}")
        replicas_to_stop = self._replicas.pop(
            states=[
                ReplicaState.SHOULD_START, ReplicaState.STARTING_OR_UPDATING,
                ReplicaState.RUNNING
            ],
            max_replicas=to_remove)

        for replica in replicas_to_stop:
            logger.debug(f"Adding SHOULD_STOP to replica_tag: {replica}, "
                         f"backend_tag: {self._name}")
            replica.set_should_stop(graceful_shutdown_timeout_s)
            self._replicas.add(ReplicaState.SHOULD_STOP, replica)

    return True