def benchmark_get_calls(ray):
    """Time repeated ``get`` calls on a single small stored object.

    Args:
        ray: Object providing ``put`` and ``get``; only those two
            attributes are used here.

    The value produced by ``timeit`` is not captured by this function.
    """
    # Store the object once, outside the timed callable, so that each
    # timed iteration performs only the ``get``.
    small_ref = ray.put(0)

    def fetch_small():
        ray.get(small_ref)

    timeit("client: get calls", fetch_small)
def benchmark_simple_actor(ray, results):
    """Time calls from one caller to one actor in three modes.

    The modes are: one blocking call per iteration, a batch of 1000 calls
    collected with a single ``get``, and the same batch against an actor
    created with ``max_concurrency=16``.

    Args:
        ray: Object providing ``remote`` and ``get``.
        results: Accumulator; each ``timeit`` output is added with ``+=``.
    """

    @ray.remote(num_cpus=0)
    class Actor:
        def small_value(self):
            return b"ok"

        # The two methods below are not invoked by any benchmark in this
        # function; they are kept as part of the actor definition.
        def small_value_arg(self, x):
            return b"ok"

        def small_value_batch(self, n):
            ray.get([self.small_value.remote() for _ in range(n)])

    # A single handle variable is rebound further down on purpose: the
    # closures read it at call time, and rebinding drops the reference to
    # the previous actor handle.
    actor = Actor.remote()

    def call_blocking():
        reply = actor.small_value.remote()
        ray.get(reply)

    results += timeit("client: 1:1 actor calls sync", call_blocking)

    def call_batched():
        pending = [actor.small_value.remote() for _ in range(1000)]
        ray.get(pending)

    results += timeit("client: 1:1 actor calls async", call_batched, 1000)

    actor = Actor.options(max_concurrency=16).remote()

    def call_batched_concurrent():
        pending = [actor.small_value.remote() for _ in range(1000)]
        ray.get(pending)

    results += timeit("client: 1:1 actor calls concurrent",
                      call_batched_concurrent, 1000)
def benchmark_remote_put_calls(ray):
    """Time ``put`` calls issued from inside remote tasks.

    Each timed iteration launches 10 tasks and every task performs 100
    ``put`` calls, i.e. 1000 puts per iteration, which is the value passed
    as the third ``timeit`` argument.

    Args:
        ray: Object providing ``remote``, ``put`` and ``get``.

    The value produced by ``timeit`` is not captured by this function.
    """

    @ray.remote
    def do_put_small():
        for _ in range(100):
            ray.put(0)

    def run_put_tasks():
        pending = [do_put_small.remote() for _ in range(10)]
        ray.get(pending)

    timeit("client: remote put calls", run_put_tasks, 1000)
def benchmark_put_large(ray, results):
    """Time ``put`` of one large NumPy array.

    Args:
        ray: Object providing ``put``.
        results: Accumulator; the ``timeit`` output is added with ``+=``.
    """
    # 100 * 1024 * 1024 int64 elements at 8 bytes each. The array is built
    # once so the timed callable only performs the put.
    payload = np.zeros(100 * 1024 * 1024, dtype=np.int64)

    def put_payload():
        ray.put(payload)

    # ``8 * 0.1`` is presumably the payload size in gigabytes used to scale
    # the reported rate -- confirm against ``timeit``.
    results += timeit("client: put gigabytes", put_payload, 8 * 0.1)
def benchmark_remote_put_calls(ray, results):
    """Time batches of remote tasks that each issue small ``put`` calls.

    Each timed iteration launches 10 tasks of 100 puts each (1000 puts),
    matching the third ``timeit`` argument.

    Args:
        ray: Object providing ``remote``, ``put`` and ``get``.
        results: Accumulator; the ``timeit`` output is added with ``+=``.
    """

    @ray.remote
    def do_put_small():
        for _ in range(100):
            ray.put(0)

    def run_put_tasks():
        pending = [do_put_small.remote() for _ in range(10)]
        ray.get(pending)

    results += timeit("client: tasks and put batch", run_put_tasks, 1000)
def benchmark_tasks_and_get_batch(ray, results):
    """Time submitting 1000 trivial tasks and fetching all their results.

    Args:
        ray: Object providing ``remote`` and ``get``.
        results: Accumulator; the ``timeit`` output is added with ``+=``.
    """

    @ray.remote
    def small_value():
        return b"ok"

    def submit_and_fetch():
        # One iteration = 1000 submissions followed by a single get.
        ray.get([small_value.remote() for _ in range(1000)])
        return 0

    # No multiplier is passed, so one whole batch counts as one iteration.
    results += timeit("client: tasks and get batch", submit_and_fetch)
def benchmark_put_calls(ray, results):
    """Time individual ``put`` calls of a small value.

    Args:
        ray: Object providing ``put``.
        results: Accumulator; the ``timeit`` output is added with ``+=``.
    """

    def store_small():
        ray.put(0)

    results += timeit("client: put calls", store_small)
def main(results=None):
    """Run the microbenchmark suite and return the accumulated results.

    The suite runs in two Ray sessions: the first is initialised with
    ``put_small_object_in_memory_store`` set to True, the second with it
    set to False (benchmarks labelled "(Plasma Store)" and everything
    after them). After the second session is shut down,
    ``client_microbenchmark_main`` is called with the same results object.

    ``timeit``, ``check_optimized_build``, ``small_value``, ``Actor``,
    ``AsyncActor``, ``Client`` and ``client_microbenchmark_main`` are
    defined outside this function. The optional third ``timeit`` argument
    is presumably the number of operations one call performs -- confirm
    against ``timeit``.

    Args:
        results: Optional accumulator. Any falsy value (``None`` or an
            empty list) is replaced by a new list, so an empty list passed
            by the caller is not the object that gets extended.

    Returns:
        The accumulator with every ``timeit`` output added via ``+=``.
    """
    results = results or []

    check_optimized_build()

    print("Tip: set TESTS_TO_RUN='pattern' to run a subset of benchmarks")

    # --- Session 1: small objects kept in the in-memory store. ---
    ray.init(_system_config={"put_small_object_in_memory_store": True})

    value = ray.put(0)

    # Reads ``value`` at call time, so it follows the rebinding of
    # ``value`` done after the re-init below.
    def get_small():
        ray.get(value)

    results += timeit("single client get calls", get_small)

    def put_small():
        ray.put(0)

    results += timeit("single client put calls", put_small)

    @ray.remote
    def do_put_small():
        for _ in range(100):
            ray.put(0)

    # 10 tasks x 100 puts = 1000 puts per call (the multiplier below).
    def put_multi_small():
        ray.get([do_put_small.remote() for _ in range(10)])

    results += timeit("multi client put calls", put_multi_small, 1000)

    # --- Session 2: same benchmarks with the in-memory store disabled. ---
    ray.shutdown()
    ray.init(_system_config={"put_small_object_in_memory_store": False})

    # Rebind ``value`` to an object stored in the new session; the
    # closures defined above are reused as-is.
    value = ray.put(0)
    # 100 * 1024 * 1024 int64 elements (8 bytes each).
    arr = np.zeros(100 * 1024 * 1024, dtype=np.int64)

    results += timeit("single client get calls (Plasma Store)", get_small)

    results += timeit("single client put calls (Plasma Store)", put_small)

    results += timeit("multi client put calls (Plasma Store)",
                      put_multi_small, 1000)

    def put_large():
        ray.put(arr)

    results += timeit("single client put gigabytes", put_large, 8 * 0.1)

    @ray.remote
    def do_put():
        for _ in range(10):
            ray.put(np.zeros(10 * 1024 * 1024, dtype=np.int64))

    # 10 tasks, each putting 10 arrays of 10 * 1024 * 1024 int64 elements.
    def put_multi():
        ray.get([do_put.remote() for _ in range(10)])

    results += timeit("multi client put gigabytes", put_multi, 10 * 8 * 0.1)

    # --- Plain task submission. ---
    def small_task():
        ray.get(small_value.remote())

    results += timeit("single client tasks sync", small_task)

    def small_task_async():
        ray.get([small_value.remote() for _ in range(1000)])

    results += timeit("single client tasks async", small_task_async, 1000)

    # ``n``, ``m``, ``n_cpu``, ``a``, ``actors`` and ``client`` are rebound
    # several times below; each closure reads whatever is bound when it is
    # called, so statement order matters.
    n = 10000
    m = 4
    actors = [Actor.remote() for _ in range(m)]

    def multi_task():
        submitted = [a.small_value_batch.remote(n) for a in actors]
        ray.get(submitted)

    results += timeit("multi client tasks async", multi_task, n * m)

    # --- Actor calls. ---
    a = Actor.remote()

    def actor_sync():
        ray.get(a.small_value.remote())

    results += timeit("1:1 actor calls sync", actor_sync)

    a = Actor.remote()

    def actor_async():
        ray.get([a.small_value.remote() for _ in range(1000)])

    results += timeit("1:1 actor calls async", actor_async, 1000)

    a = Actor.options(max_concurrency=16).remote()

    def actor_concurrent():
        ray.get([a.small_value.remote() for _ in range(1000)])

    results += timeit("1:1 actor calls concurrent", actor_concurrent, 1000)

    n = 5000
    # Half of the CPU count reported by ``multiprocessing``.
    n_cpu = multiprocessing.cpu_count() // 2
    actors = [Actor._remote() for _ in range(n_cpu)]
    client = Client.remote(actors)

    def actor_async_direct():
        ray.get(client.small_value_batch.remote(n))

    results += timeit("1:n actor calls async", actor_async_direct,
                      n * len(actors))

    n_cpu = multiprocessing.cpu_count() // 2
    # From here ``a`` is a list of actor handles, not a single handle.
    a = [Actor.remote() for _ in range(n_cpu)]

    # Issues ``n`` calls spread round-robin over the passed handles;
    # ``n`` and ``n_cpu`` come from the enclosing scope.
    @ray.remote
    def work(actors):
        ray.get([actors[i % n_cpu].small_value.remote() for i in range(n)])

    def actor_multi2():
        ray.get([work.remote(a) for _ in range(m)])

    results += timeit("n:n actor calls async", actor_multi2, m * n)

    n = 1000
    actors = [Actor._remote() for _ in range(n_cpu)]
    # One ``Client`` per actor; each receives a single handle.
    clients = [Client.remote(a) for a in actors]

    def actor_multi2_direct_arg():
        ray.get([c.small_value_batch_arg.remote(n) for c in clients])

    results += timeit("n:n actor calls with arg async",
                      actor_multi2_direct_arg, n * len(clients))

    # --- Same patterns against ``AsyncActor``. ---
    a = AsyncActor.remote()

    # Redefinition: replaces the earlier ``actor_sync`` closure.
    def actor_sync():
        ray.get(a.small_value.remote())

    results += timeit("1:1 async-actor calls sync", actor_sync)

    a = AsyncActor.remote()

    def async_actor():
        ray.get([a.small_value.remote() for _ in range(1000)])

    results += timeit("1:1 async-actor calls async", async_actor, 1000)

    a = AsyncActor.remote()

    # Redefinition: this variant passes the loop index as an argument.
    def async_actor():
        ray.get([a.small_value_with_arg.remote(i) for i in range(1000)])

    results += timeit("1:1 async-actor calls with args async", async_actor,
                      1000)

    n = 5000
    n_cpu = multiprocessing.cpu_count() // 2
    actors = [AsyncActor.remote() for _ in range(n_cpu)]
    client = Client.remote(actors)

    def async_actor_async():
        ray.get(client.small_value_batch.remote(n))

    results += timeit("1:n async-actor calls async", async_actor_async,
                      n * len(actors))

    n = 5000
    m = 4
    n_cpu = multiprocessing.cpu_count() // 2
    a = [AsyncActor.remote() for _ in range(n_cpu)]

    @ray.remote
    def async_actor_work(actors):
        ray.get([actors[i % n_cpu].small_value.remote() for i in range(n)])

    def async_actor_multi():
        ray.get([async_actor_work.remote(a) for _ in range(m)])

    results += timeit("n:n async-actor calls async", async_actor_multi, m * n)
    ray.shutdown()

    # Runs after Ray has been shut down; receives the same accumulator.
    client_microbenchmark_main(results)

    return results
def main(results=None):
    """Run the microbenchmark suite and return the accumulated results.

    Object-store, task, actor and async-actor benchmarks run in a first
    Ray session started with default ``ray.init()``. A second session,
    started with a ``"custom"`` resource of 100, runs the placement group
    create/removal benchmark. After that session is shut down,
    ``client_microbenchmark_main`` is called with the same results object.

    ``timeit``, ``check_optimized_build``, ``small_value``,
    ``create_object_containing_ref``, ``Actor``, ``AsyncActor``, ``Client``
    and ``client_microbenchmark_main`` are defined outside this function.
    The optional third ``timeit`` argument is presumably the number of
    operations one call performs -- confirm against ``timeit``.

    Args:
        results: Optional accumulator. Any falsy value (``None`` or an
            empty list) is replaced by a new list, so an empty list passed
            by the caller is not the object that gets extended.

    Returns:
        The accumulator with every ``timeit`` output added via ``+=``.
    """
    results = results or []

    check_optimized_build()

    print("Tip: set TESTS_TO_RUN='pattern' to run a subset of benchmarks")

    ray.init()

    value = ray.put(0)

    def get_small():
        ray.get(value)

    def put_small():
        ray.put(0)

    @ray.remote
    def do_put_small():
        for _ in range(100):
            ray.put(0)

    # 10 tasks x 100 puts = 1000 puts per call (the multiplier below).
    def put_multi_small():
        ray.get([do_put_small.remote() for _ in range(10)])

    # 100 * 1024 * 1024 int64 elements (8 bytes each).
    arr = np.zeros(100 * 1024 * 1024, dtype=np.int64)

    results += timeit("single client get calls (Plasma Store)", get_small)

    results += timeit("single client put calls (Plasma Store)", put_small)

    results += timeit("multi client put calls (Plasma Store)",
                      put_multi_small, 1000)

    def put_large():
        ray.put(arr)

    results += timeit("single client put gigabytes", put_large, 8 * 0.1)

    def small_value_batch():
        submitted = [small_value.remote() for _ in range(1000)]
        ray.get(submitted)
        return 0

    results += timeit("single client tasks and get batch", small_value_batch)

    @ray.remote
    def do_put():
        for _ in range(10):
            ray.put(np.zeros(10 * 1024 * 1024, dtype=np.int64))

    # 10 tasks, each putting 10 arrays of 10 * 1024 * 1024 int64 elements.
    def put_multi():
        ray.get([do_put.remote() for _ in range(10)])

    results += timeit("multi client put gigabytes", put_multi, 10 * 8 * 0.1)

    # Created once so the timed callable measures only the ``get``.
    obj_containing_ref = create_object_containing_ref.remote()

    def get_containing_object_ref():
        ray.get(obj_containing_ref)

    results += timeit("single client get object containing 10k refs",
                      get_containing_object_ref)

    # --- Plain task submission. ---
    def small_task():
        ray.get(small_value.remote())

    results += timeit("single client tasks sync", small_task)

    def small_task_async():
        ray.get([small_value.remote() for _ in range(1000)])

    results += timeit("single client tasks async", small_task_async, 1000)

    # ``n``, ``m``, ``n_cpu``, ``a``, ``actors`` and ``client`` are rebound
    # several times below; each closure reads whatever is bound when it is
    # called, so statement order matters.
    n = 10000
    m = 4
    actors = [Actor.remote() for _ in range(m)]

    def multi_task():
        submitted = [a.small_value_batch.remote(n) for a in actors]
        ray.get(submitted)

    results += timeit("multi client tasks async", multi_task, n * m)

    # --- Actor calls. ---
    a = Actor.remote()

    def actor_sync():
        ray.get(a.small_value.remote())

    results += timeit("1:1 actor calls sync", actor_sync)

    a = Actor.remote()

    def actor_async():
        ray.get([a.small_value.remote() for _ in range(1000)])

    results += timeit("1:1 actor calls async", actor_async, 1000)

    a = Actor.options(max_concurrency=16).remote()

    def actor_concurrent():
        ray.get([a.small_value.remote() for _ in range(1000)])

    results += timeit("1:1 actor calls concurrent", actor_concurrent, 1000)

    n = 5000
    # Half of the CPU count reported by ``multiprocessing``.
    n_cpu = multiprocessing.cpu_count() // 2
    actors = [Actor._remote() for _ in range(n_cpu)]
    client = Client.remote(actors)

    def actor_async_direct():
        ray.get(client.small_value_batch.remote(n))

    results += timeit("1:n actor calls async", actor_async_direct,
                      n * len(actors))

    n_cpu = multiprocessing.cpu_count() // 2
    # From here ``a`` is a list of actor handles, not a single handle.
    a = [Actor.remote() for _ in range(n_cpu)]

    # Issues ``n`` calls spread round-robin over the passed handles;
    # ``n`` and ``n_cpu`` come from the enclosing scope.
    @ray.remote
    def work(actors):
        ray.get([actors[i % n_cpu].small_value.remote() for i in range(n)])

    def actor_multi2():
        ray.get([work.remote(a) for _ in range(m)])

    results += timeit("n:n actor calls async", actor_multi2, m * n)

    n = 1000
    actors = [Actor._remote() for _ in range(n_cpu)]
    # One ``Client`` per actor; each receives a single handle.
    clients = [Client.remote(a) for a in actors]

    def actor_multi2_direct_arg():
        ray.get([c.small_value_batch_arg.remote(n) for c in clients])

    results += timeit("n:n actor calls with arg async",
                      actor_multi2_direct_arg, n * len(clients))

    # --- Same patterns against ``AsyncActor``. ---
    a = AsyncActor.remote()

    # Redefinition: replaces the earlier ``actor_sync`` closure.
    def actor_sync():
        ray.get(a.small_value.remote())

    results += timeit("1:1 async-actor calls sync", actor_sync)

    a = AsyncActor.remote()

    def async_actor():
        ray.get([a.small_value.remote() for _ in range(1000)])

    results += timeit("1:1 async-actor calls async", async_actor, 1000)

    a = AsyncActor.remote()

    # Redefinition: this variant passes the loop index as an argument.
    def async_actor():
        ray.get([a.small_value_with_arg.remote(i) for i in range(1000)])

    results += timeit("1:1 async-actor calls with args async", async_actor,
                      1000)

    n = 5000
    n_cpu = multiprocessing.cpu_count() // 2
    actors = [AsyncActor.remote() for _ in range(n_cpu)]
    client = Client.remote(actors)

    def async_actor_async():
        ray.get(client.small_value_batch.remote(n))

    results += timeit("1:n async-actor calls async", async_actor_async,
                      n * len(actors))

    n = 5000
    m = 4
    n_cpu = multiprocessing.cpu_count() // 2
    a = [AsyncActor.remote() for _ in range(n_cpu)]

    @ray.remote
    def async_actor_work(actors):
        ray.get([actors[i % n_cpu].small_value.remote() for i in range(n)])

    def async_actor_multi():
        ray.get([async_actor_work.remote(a) for _ in range(m)])

    results += timeit("n:n async-actor calls async", async_actor_multi, m * n)
    ray.shutdown()

    # --- Placement groups, in a fresh session with a custom resource. ---
    NUM_PGS = 100
    NUM_BUNDLES = 1
    ray.init(resources={"custom": 100})

    def placement_group_create_removal(num_pgs):
        pgs = [
            ray.util.placement_group(
                bundles=[{"custom": 0.001} for _ in range(NUM_BUNDLES)])
            for _ in range(num_pgs)
        ]
        # Plain loop rather than a list comprehension: the calls are made
        # only for their effect and the return values are not used.
        for pg in pgs:
            pg.wait(timeout_seconds=30)
        # Include placement group removal here to clean up.
        # If we don't clean up placement groups, the whole performance
        # gets slower as it runs more.
        # Since timeit function runs multiple times without
        # the cleaning logic, we should have this method here.
        for pg in pgs:
            ray.util.remove_placement_group(pg)

    results += timeit("placement group create/removal",
                      lambda: placement_group_create_removal(NUM_PGS),
                      NUM_PGS)
    ray.shutdown()

    # Runs after Ray has been shut down; receives the same accumulator.
    client_microbenchmark_main(results)

    return results
def test_placement_group_perf(num_pgs, num_bundles, num_pending_pgs):
    """Benchmark placement group creation and report scheduling stats.

    Creates ``num_pgs`` placement groups of ``num_bundles`` bundles each
    (strategy ``"SPREAD"``), waits for them, removes them, and times that
    cycle with ``timeit``. It then reads per-group stats from
    ``ray.util.placement_group_table()`` and prints P50/P95/P99 values.

    Args:
        num_pgs: Number of placement groups created per timed call.
        num_bundles: Number of bundles in each placement group.
        num_pending_pgs: Only echoed in the printed summary here.

    Returns:
        A dict with ``pg_creation_per_second`` (``throughput[0][1]`` of the
        ``timeit`` output), ``p50_scheduling_latency_ms`` and
        ``p50_e2e_pg_creation_latency_ms``.

    Raises:
        AssertionError: If the cluster's ``"custom"`` resource is smaller
            than ``RESOURCES_VALUE * num_pgs * num_bundles``.
        IndexError: If the placement group table is empty, since the
            percentile lookup indexes into an empty list.
    """
    # Run the placement group performance benchmark given arguments.
    assert ray.cluster_resources()["custom"] >= (
        RESOURCES_VALUE * num_pgs * num_bundles)

    def placement_group_create(num_pgs):
        pgs = [
            ray.util.placement_group(
                bundles=[{"custom": 0.001} for _ in range(num_bundles)],
                strategy="SPREAD")
            for _ in range(num_pgs)
        ]
        # Plain loop rather than a list comprehension: the calls are made
        # only for their effect and the return values are not used.
        for pg in pgs:
            pg.wait(timeout_seconds=30)
        # Remove the groups inside the timed callable so repeated timeit
        # iterations do not accumulate placement groups.
        for pg in pgs:
            ray.util.remove_placement_group(pg)

    def percentile(sorted_values, fraction):
        # Element at index int(len * fraction) of an ascending list.
        return sorted_values[int(len(sorted_values) * fraction)]

    print(f"Num pending pgs: {num_pending_pgs}, "
          f"Num pgs: {num_pgs}, "
          f"Num bundles {num_bundles}")

    # Get the throughput.
    throughput = timeit("placement group create per second",
                        lambda: placement_group_create(num_pgs), num_pgs)

    # Get fine-grained scheduling stats.
    latencies = []
    e2e_latencies = []
    scheduling_attempts = []
    for entry in ray.util.placement_group_table().values():
        stats = entry["stats"]
        latencies.append(stats["scheduling_latency_ms"])
        e2e_latencies.append(stats["end_to_end_creation_latency_ms"])
        scheduling_attempts.append(stats["scheduling_attempt"])
    latencies.sort()
    e2e_latencies.sort()
    scheduling_attempts.sort()

    report = (
        # Pure scheduling latency without queuing time.
        ("scheduling latency ms", latencies),
        # Scheduling latency including queueing time.
        ("e2e scheduling latency ms", e2e_latencies),
        # Number of times scheduling was retried before it succeeded.
        ("scheduling attempts", scheduling_attempts),
    )
    for title, values in report:
        for label, fraction in (("P50", 0.5), ("P95", 0.95), ("P99", 0.99)):
            print(f"{label} {title}: {percentile(values, fraction)}")

    return {
        "pg_creation_per_second": throughput[0][1],
        "p50_scheduling_latency_ms": percentile(latencies, 0.5),
        "p50_e2e_pg_creation_latency_ms": percentile(e2e_latencies, 0.5),
    }
def benchmark_put_calls(ray):
    """Time individual ``put`` calls of a small value.

    Args:
        ray: Object providing ``put``.

    The value produced by ``timeit`` is not captured by this function.
    """

    def store_small():
        ray.put(0)

    timeit("client: put calls", store_small)