def test_not_killing_replicas(serve_instance):
    class BatchSimple:
        def __init__(self):
            self.count = 0

        @serve.accept_batch
        def __call__(self, flask_request, temp=None):
            batch_size = serve.context.batch_size
            return [1] * batch_size

    serve.create_endpoint("bsimple", "/bsimple")
    b_config = BackendConfig(num_replicas=3, max_batch_size=2)
    serve.create_backend(BatchSimple, "bsimple:v1", backend_config=b_config)

    global_state = serve.api._get_global_state()
    old_replica_tag_list = global_state.backend_table.list_replicas(
        "bsimple:v1")

    bnew_config = serve.get_backend_config("bsimple:v1")
    # Change the config.
    bnew_config.max_batch_size = 5
    # Set the config.
    serve.set_backend_config("bsimple:v1", bnew_config)
    new_replica_tag_list = global_state.backend_table.list_replicas(
        "bsimple:v1")
    global_state.refresh_actor_handle_cache()
    new_all_tag_list = list(global_state.actor_handle_cache.keys())

    # The old and new replica tag lists should be identical
    # and should be a subset of all_tag_list.
    assert set(old_replica_tag_list) <= set(new_all_tag_list)
    assert set(old_replica_tag_list) == set(new_replica_tag_list)


def test_killing_replicas(serve_instance):
    class Simple:
        def __init__(self):
            self.count = 0

        def __call__(self, flask_request, temp=None):
            return temp

    serve.create_endpoint("simple", "/simple")
    b_config = BackendConfig(num_replicas=3, num_cpus=2)
    serve.create_backend(Simple, "simple:v1", backend_config=b_config)

    global_state = serve.api._get_global_state()
    old_replica_tag_list = global_state.backend_table.list_replicas(
        "simple:v1")

    bnew_config = serve.get_backend_config("simple:v1")
    # Change the config.
    bnew_config.num_cpus = 1
    # Set the config.
    serve.set_backend_config("simple:v1", bnew_config)
    new_replica_tag_list = global_state.backend_table.list_replicas(
        "simple:v1")
    global_state.refresh_actor_handle_cache()
    new_all_tag_list = list(global_state.actor_handle_cache.keys())

    # The new_replica_tag_list must be a subset of all_tag_list.
    assert set(new_replica_tag_list) <= set(new_all_tag_list)

    # The old_replica_tag_list must not be a subset of all_tag_list.
    assert not set(old_replica_tag_list) <= set(new_all_tag_list)
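

# A hedged refactoring sketch (not part of the original tests): both tests
# above follow the same pattern of snapshotting replica tags before and after
# a backend config change. A helper like the one below could express that
# shared pattern; the helper name and structure are assumptions, not existing
# test utilities, and it only uses the same API calls the tests already make.
def _replica_tags_around_config_change(backend_tag, mutate_config):
    """Return (old_tags, new_tags, all_tags) around a backend config change."""
    global_state = serve.api._get_global_state()
    old_tags = global_state.backend_table.list_replicas(backend_tag)

    # Apply the caller-supplied mutation and push the updated config.
    config = serve.get_backend_config(backend_tag)
    mutate_config(config)
    serve.set_backend_config(backend_tag, config)

    new_tags = global_state.backend_table.list_replicas(backend_tag)
    global_state.refresh_actor_handle_cache()
    all_tags = list(global_state.actor_handle_cache.keys())
    return old_tags, new_tags, all_tags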


def test_scaling_replicas(serve_instance):
    class Counter:
        def __init__(self):
            self.count = 0

        def __call__(self, _):
            self.count += 1
            return self.count

    serve.create_endpoint("counter", "/increment")

    # Keep checking the routing table until /increment is populated.
    while "/increment" not in requests.get("http://127.0.0.1:8000/").json():
        time.sleep(0.2)

    b_config = BackendConfig(num_replicas=2)
    serve.create_backend(Counter, "counter:v1", backend_config=b_config)
    serve.link("counter", "counter:v1")

    counter_result = []
    for _ in range(10):
        resp = requests.get(
            "http://127.0.0.1:8000/increment").json()["result"]
        counter_result.append(resp)

    # If the load is shared between the two replicas, the max result
    # cannot be 10.
    assert max(counter_result) < 10

    b_config = serve.get_backend_config("counter:v1")
    b_config.num_replicas = 1
    serve.set_backend_config("counter:v1", b_config)

    counter_result = []
    for _ in range(10):
        resp = requests.get(
            "http://127.0.0.1:8000/increment").json()["result"]
        counter_result.append(resp)

    # It may take some time for a replica to spin down, but the majority of
    # the requests should be served by the only remaining replica.
    assert max(counter_result) - min(counter_result) > 6


serve.init(
    blocking=True, kv_store_connector=lambda ns: RayInternalKVStore(ns))


@serve.route("/echo")
@serve.accept_batch
def echo(_):
    time.sleep(0.01)  # Sleep for 10ms
    ray.show_in_webui(str(serve.context.batch_size), key="Current batch size")
    return ["hi {}".format(i) for i in range(serve.context.batch_size)]


print("Scaling to 30 replicas")
config = serve.get_backend_config("echo:v0")
config.num_replicas = 30
config.max_batch_size = 16
serve.set_backend_config("echo:v0", config)

print("Warming up")
for _ in range(5):
    resp = requests.get("http://127.0.0.1:8000/echo").json()
    print(resp)
    time.sleep(0.5)

connections = int(config.num_replicas * config.max_batch_size * 0.75)
proc = subprocess.Popen(
    [
        "./hey_linux_amd64", "-c", str(connections), "-z", "360m",
        "http://127.0.0.1:8000/echo"
    ],
    stdout=PIPE,
    stderr=PIPE)
print("started load testing")
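
# Hedged sketch (not in the original benchmark): if the script should block
# until the load generator exits and then report its summary, the Popen
# handle created above can be drained with communicate(). The variable names
# below are illustrative assumptions.
hey_stdout, hey_stderr = proc.communicate()
print("load test finished")
print(hey_stdout.decode())
if hey_stderr:
    print(hey_stderr.decode())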


# as well as within the ray system.

# We can also add a new backend and split the traffic.
def echo_v2(flask_request):
    # magic, only from web.
    return "something new"


serve.create_backend(echo_v2, "echo:v2")
backend_config_v2 = serve.get_backend_config("echo:v2")

# The two backends will now split the traffic 50%-50%.
serve.split("my_endpoint", {"echo:v1": 0.5, "echo:v2": 0.5})

# Observe that requests are now split between the two backends.
for _ in range(10):
    print(requests.get("http://127.0.0.1:8000/echo").json())
    time.sleep(0.5)

# You can also change the number of replicas
# for each backend independently.
backend_config_v1.num_replicas = 2
serve.set_backend_config("echo:v1", backend_config_v1)
backend_config_v2.num_replicas = 2
serve.set_backend_config("echo:v2", backend_config_v2)

# As well as retrieve relevant system metrics.
print(pformat_color_json(serve.stat()))
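
# Hedged extension (not part of the original example): the same serve.split
# call shown above can shift more weight onto the new backend once it looks
# healthy. The 25%/75% weights below are illustrative assumptions.
serve.split("my_endpoint", {"echo:v1": 0.25, "echo:v2": 0.75})
for _ in range(10):
    print(requests.get("http://127.0.0.1:8000/echo").json())
    time.sleep(0.5)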