async def startup_event():
    """Bring up Ray Serve and publish a handle to the GPT-2 endpoint.

    Calls ``ray.init(address="auto")``, starts a Serve client, registers the
    ``"gpt-2"`` backend with 10 replicas behind the ``"generate"`` endpoint,
    and stores the endpoint handle in the module-level ``serve_handle``.
    """
    global serve_handle

    ray.init(address="auto")
    client = serve.start()

    class GPT2:
        """Callable Serve backend wrapping a GPT-2 text-generation pipeline."""

        def __init__(self):
            # Built once per instance so each call reuses the same pipeline.
            self.nlp_model = pipeline('text-generation', model='gpt2')

        def __call__(self, request):
            # NOTE(review): reads the private ``_data`` attribute of the
            # request object -- confirm this is the supported accessor for
            # the Ray Serve version in use.
            prompt = request._data
            return self.nlp_model(prompt, max_length=50)

    client.create_backend(
        "gpt-2",
        GPT2,
        config=serve.BackendConfig(num_replicas=10),
    )
    client.create_endpoint("generate", backend="gpt-2")

    # Keep the handle around so the endpoint can be queried from Python.
    serve_handle = client.get_handle("generate")
async def startup_event():
    """Bring up Ray Serve and publish a handle to the GPT-2 endpoint.

    Connects to the running Ray cluster, starts the Ray Serve instance,
    registers the ``"gpt-2"`` backend with 2 replicas behind the
    ``"generate"`` endpoint, and stores the endpoint handle in the
    module-level ``serve_handle``.
    """
    global serve_handle

    # Attach to the Ray cluster that is already running.
    ray.init(address="auto")
    # Launch the Ray Serve instance.
    # NOTE(review): http_host=None presumably disables Serve's own HTTP
    # server because queries go through the Python handle -- confirm.
    serve.start(http_host=None)

    class GPT2:
        """Callable class used as the Ray Serve backend."""

        def __init__(self):
            # Built once per instance so each call reuses the same pipeline.
            self.nlp_model = pipeline("text-generation", model="gpt2")

        async def __call__(self, request):
            prompt = await request.body()
            return self.nlp_model(prompt, max_length=50)

    # Register the backend with the desired replica count, then expose it.
    serve.create_backend(
        "gpt-2",
        GPT2,
        config=serve.BackendConfig(num_replicas=2),
    )
    serve.create_endpoint("generate", backend="gpt-2")

    # Keep a handle to the endpoint so it can be queried from Python.
    serve_handle = serve.get_handle("generate")
# File name: deploy_serve.py
import ray
from ray import serve

# Attach to the Ray cluster that is already running.
ray.init(address="auto")

# Launch Ray Serve in detached mode so the instance outlives this script.
client = serve.start(http_host=None, detached=True)


# The served callable is a plain function here; a stateful class would also
# work as a backend.
async def my_model(request):
    """Return a fixed message that embeds the raw request body."""
    data = await request.body()
    return f"Model received data: {data}"


# Register the backend with two replicas, then expose it through an endpoint.
client.create_backend(
    "my_backend",
    my_model,
    config=serve.BackendConfig(num_replicas=2),
)
client.create_endpoint("my_endpoint", backend="my_backend")
def main(config_file):
    """Deploy every model in a JSON config, then the pipeline that chains them.

    The config is read as JSON. Each entry of its ``"models"`` list is
    registered as a backend plus an endpoint of the same name; afterwards a
    ``serve_pipeline.ModelPipeline`` backend is created under the top-level
    ``"name"`` and handed the list of model names.

    Args:
        config_file: Path to the JSON configuration file.
    """
    with open(config_file) as config_fp:
        spec = json.load(config_fp)

    stage_names = []
    for stage in spec["models"]:
        stage_name = stage["name"]
        stage_backend = ImportedBackend(stage["class"])
        # Optional positional init args and backend config default to empty.
        init_args = stage.get("args", [])
        stage_config = serve.BackendConfig(**stage.get("config", {}))

        client.create_backend(
            stage_name, stage_backend, *init_args, config=stage_config
        )
        client.create_endpoint(stage_name, backend=stage_name)
        stage_names.append(stage_name)

    # The pipeline backend receives the names of all endpoints created above.
    pipeline_name = spec["name"]
    client.create_backend(
        pipeline_name,
        ImportedBackend("serve_pipeline.ModelPipeline"),
        stage_names,
    )
    client.create_endpoint(
        pipeline_name,
        backend=pipeline_name,
        route=spec.get("route"),
    )
"--batch_size") else: args["--batch_size"] = 1 ray.init(address=args["--ray_address"], redis_password=args["--ray_password"]) serve.init(start_server=False) input_p = Path(args["--input_directory"]) output_p = Path(args["--output_directory"]) all_wavs = list(input_p.rglob("**/*.WAV")) # model = RunSplitter() # predictions = model(None, audio_paths=all_wavs[0:10]) # print(predictions) serve.create_endpoint("splitter") serve.create_backend( RunSplitter, "splitter:v0", backend_config=serve.BackendConfig(num_replicas=args["--num_nodes"], max_batch_size=args["--batch_size"]), ) serve.link("splitter", "splitter:v0") handle = serve.get_handle("splitter") ids = [handle.remote(audio_paths=audio_path) for audio_path in all_wavs] results = ray.get(ids) print(results)
"chmod +x hey_linux_amd64" ]) ray.init(address=cluster.address, include_webui=True, webui_host="0.0.0.0") serve.init(blocking=True, kv_store_connector=lambda ns: RayInternalKVStore(ns)) @serve.accept_batch def echo(_): time.sleep(0.01) # Sleep for 10ms ray.show_in_webui(str(serve.context.batch_size), key="Current batch size") return ["hi {}".format(i) for i in range(serve.context.batch_size)] serve.create_endpoint("echo", "/echo") config = serve.BackendConfig(num_replicas=30, max_batch_size=16) serve.create_backend(echo, "echo:v1", backend_config=config) serve.set_traffic("echo", {"echo:v1": 1}) print("Warming up") for _ in range(5): resp = requests.get("http://127.0.0.1:8000/echo").text print(resp) time.sleep(0.5) connections = int(config.num_replicas * config.max_batch_size * 0.75) while True: proc = subprocess.Popen([ "./hey_linux_amd64", "-c", str(connections), "-z", "60m", "http://127.0.0.1:8000/echo"