def test_multiple_waits_and_gets(shutdown_only):
    """Several tasks waiting on / getting the same object ref must all return."""
    # Three workers are needed so that the three tasks launched in each
    # round (one producer, two consumers) can run at the same time.
    ray.init(num_cpus=3)

    @ray.remote
    def sleep_then_return(delay):
        time.sleep(delay)
        return 1

    @ray.remote
    def wait_on(input_list):
        # input_list should be a list containing exactly one object ref.
        ray.wait([input_list[0]])

    @ray.remote
    def get_on(input_list):
        # input_list should be a list containing exactly one object ref.
        ray.get(input_list[0])

    # First round: multiple wait requests on the same object ref all return.
    # Second round: multiple get requests on the same object ref all return.
    for consumer in (wait_on, get_on):
        shared_ref = sleep_then_return.remote(1)
        # The ref is wrapped in a list before being handed to the consumers.
        ray.get([consumer.remote([shared_ref]), consumer.remote([shared_ref])])
def test_wait_makes_object_local(ray_start_cluster):
    """Both ``ray.get`` and ``ray.wait`` make a remote result exist locally."""
    cluster = ray_start_cluster
    # The first node offers no CPUs; the second node offers two.
    cluster.add_node(num_cpus=0)
    cluster.add_node(num_cpus=2)
    ray.init(address=cluster.address)

    @ray.remote
    class Foo:
        def method(self):
            return np.zeros(1024 * 1024)

    def exists_locally(ref):
        # Ask the driver's core worker whether it currently holds the object.
        return ray.worker.global_worker.core_worker.object_exists(ref)

    actor = Foo.remote()

    # ray.get makes the object local.
    got_ref = actor.method.remote()
    assert not exists_locally(got_ref)
    ray.get(got_ref)
    assert exists_locally(got_ref)

    # ray.wait makes the object local as well.
    waited_ref = actor.method.remote()
    assert not exists_locally(waited_ref)
    ready, _ = ray.wait([waited_ref])
    assert len(ready) == 1
    assert exists_locally(waited_ref)
def test_actor_pass_by_ref_order_optimization(shutdown_only):
    """An actor call with a ready argument is not stuck behind one still pending.

    A first runner submits an actor call whose argument takes 30 seconds to
    produce; a second runner then submits a call whose argument is available
    right away.  The second call must complete in under 10 seconds.
    """
    ray.init(num_cpus=4)

    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def f(self, x):
            pass

    a = Actor.remote()

    @ray.remote
    def fast_value():
        print("fast value")

    @ray.remote
    def slow_value():
        print("start sleep")
        time.sleep(30)

    @ray.remote
    def runner(f):
        print("runner", a, f)
        pending_argument = f.remote()
        return ray.get(a.f.remote(pending_argument))

    # Queue the slow call first and give it a head start.
    runner.remote(slow_value)
    time.sleep(1)

    fast_call = runner.remote(fast_value)
    started_at = time.time()
    ray.get(fast_call)
    elapsed = time.time() - started_at
    assert elapsed < 10, "did not skip slow value"
def test_actor_distribution_balance(ray_start_cluster, args):
    """Actors are spread evenly over the nodes of the cluster.

    ``args`` is a two-element sequence: ``(node_count, actor_count)``.
    """
    cluster = ray_start_cluster
    node_count, actor_count = args[0], args[1]

    for index in range(node_count):
        # Only the first node added carries the system config.
        system_config = (
            {"gcs_actor_scheduling_enabled": True} if index == 0 else {}
        )
        cluster.add_node(memory=1024**3, _system_config=system_config)
    ray.init(address=cluster.address)
    cluster.wait_for_nodes()

    @ray.remote(memory=100 * 1024**2, num_cpus=0.01)
    class Foo:
        def method(self):
            return ray.worker.global_worker.node.unique_id

    # Group the actor handles by the node id each actor reports.
    actors_by_node = {}
    handles = [Foo.remote() for _ in range(actor_count)]
    for handle in handles:
        hosting_node = ray.get(handle.method.remote())
        actors_by_node.setdefault(hosting_node, []).append(handle)

    if node_count >= actor_count:
        # Enough nodes for everyone: exactly one actor per used node.
        assert len(actors_by_node) == actor_count
        for placed in actors_by_node.values():
            assert len(placed) == 1
    else:
        # More actors than nodes: every node is used, none is overloaded.
        assert len(actors_by_node) == node_count
        per_node_limit = int(actor_count / node_count)
        for placed in actors_by_node.values():
            assert len(placed) <= per_node_limit
def test_task_arguments_inline_bytes_limit(ray_start_cluster):
    """A task with three ~8k array arguments succeeds under an 18k inline limit."""
    cluster = ray_start_cluster
    head_system_config = {
        "max_direct_call_object_size": 100 * 1024,
        # The test fails if task_rpc_inlined_bytes_limit is greater than
        # max_grpc_message_size.
        "task_rpc_inlined_bytes_limit": 18 * 1024,
        "max_grpc_message_size": 20 * 1024,
        "put_small_object_in_memory_store": True,
    }
    cluster.add_node(
        num_cpus=1,
        resources={"pin_head": 1},
        _system_config=head_system_config,
    )
    cluster.add_node(num_cpus=1, resources={"pin_worker": 1})
    ray.init(address=cluster.address)

    @ray.remote(resources={"pin_worker": 1})
    def foo(ref1, ref2, ref3):
        return ref1 == ref2 + ref3

    @ray.remote(resources={"pin_head": 1})
    def bar():
        # The arrays are passed by value.  An earlier variant that passed
        # ray.put() refs instead was disabled by the original author with
        # the note "if the refs are inlined, the test fails".
        payloads = [np.random.rand(1024) for _ in range(3)]  # 8k each
        return ray.get(foo.remote(payloads[0], payloads[1], payloads[2]))

    ray.get(bar.remote())
def test_future_resolution_skip_plasma(ray_start_cluster):
    """A borrowed small object is resolved inline, not through Plasma."""
    cluster = ray_start_cluster
    # Disable worker caching so that worker leases are not reused, set the
    # object inlining size threshold, and store small objects in the
    # in-memory object store so that the borrowed ref gets inlined.
    head_system_config = {
        "worker_lease_timeout_milliseconds": 0,
        "max_direct_call_object_size": 100 * 1024,
        "put_small_object_in_memory_store": True,
    }
    cluster.add_node(
        num_cpus=1,
        resources={"pin_head": 1},
        _system_config=head_system_config,
    )
    cluster.add_node(num_cpus=1, resources={"pin_worker": 1})
    ray.init(address=cluster.address)

    @ray.remote(resources={"pin_head": 1})
    def add_one(x):
        return x + 1

    @ray.remote(resources={"pin_worker": 1})
    def borrow_and_double(x):
        borrowed_ref = x[0]
        incremented_ref = add_one.remote(borrowed_ref)
        # borrowed_ref should be inlined on future resolution, so it must be
        # found in the memory store and shouldn't be in Plasma.
        core_worker = ray.worker.global_worker.core_worker
        assert core_worker.object_exists(borrowed_ref, memory_store_only=True)
        return ray.get(incremented_ref) * 2

    one = ray.put(1)
    # Wrapping the ref in a list hands the ref itself to the task.
    result_ref = borrow_and_double.remote([one])
    assert ray.get(result_ref) == 4
def test_internal_free(shutdown_only):
    """Objects released via ``ray.internal.free`` can no longer be fetched."""
    ray.init(num_cpus=1)

    @ray.remote
    class Sampler:
        def sample(self):
            return [1, 2, 3, 4, 5]

        def sample_big(self):
            return np.zeros(1024 * 1024)

    sampler = Sampler.remote()

    # Small result: free deletes it from the in-memory store.
    small_ref = sampler.sample.remote()
    ray.get(small_ref)
    ray.internal.free(small_ref)
    with pytest.raises(Exception):
        ray.get(small_ref)

    # Big result: free deletes it from the plasma store.
    big_ref = sampler.sample_big.remote()
    ray.get(big_ref)
    ray.internal.free(big_ref)
    # Give the delete RPC time to propagate before checking.
    time.sleep(1)
    with pytest.raises(Exception):
        ray.get(big_ref)
def test_redefining_remote_functions(shutdown_only):
    """Re-declaring a remote function under the same name takes effect.

    The function names ``f``, ``g`` and ``nonexistent`` and the order of the
    definitions are part of what is being tested, so they are kept as-is.
    """
    ray.init(num_cpus=1)

    # A remote function can be defined in the shell ...
    @ray.remote
    def f(x):
        return x + 1

    assert ray.get(f.remote(0)) == 1

    # ... and then redefined.
    @ray.remote
    def f(x):
        return x + 10

    # Keep calling until the new definition is the one that runs; only the
    # old result (1) or the new result (10) is ever acceptable.
    val = ray.get(f.remote(0))
    while val != 10:
        assert val in [1, 10]
        logger.info("Still using old definition of f, trying again.")
        val = ray.get(f.remote(0))

    # Redefinition must also work when the remote function's source does not
    # change (see https://github.com/ray-project/ray/issues/6130).
    @ray.remote
    def g():
        return nonexistent()

    # `nonexistent` has not been defined yet, so this call has to fail.
    with pytest.raises(RayTaskError, match="nonexistent"):
        ray.get(g.remote())

    def nonexistent():
        return 1

    # Same source text as before, but now the call must succeed.
    @ray.remote
    def g():
        return nonexistent()

    assert ray.get(g.remote()) == 1

    # The same check when the redefined function lives inside another task.
    @ray.remote
    def h(i):
        @ray.remote
        def j():
            return i

        return j.remote()

    for expected in range(20):
        inner_ref = ray.get(h.remote(expected))
        assert ray.get(inner_ref) == expected
def test_object_transfer_dump(ray_start_cluster_enabled):
    """The object transfer timeline covers sends and receives on all nodes."""
    cluster = ray_start_cluster_enabled

    num_nodes = 3
    # Each node gets a custom resource named after its index so that tasks
    # can be pinned to it.
    for node_index in range(num_nodes):
        cluster.add_node(
            resources={str(node_index): 1}, object_store_memory=10 ** 9
        )
    ray.init(address=cluster.address)

    @ray.remote
    def f(x):
        return

    def run_on_node(node_index, argument):
        # Submit `f` with a requirement on the node's custom resource.
        return f._remote(args=[argument], resources={str(node_index): 1})

    # These objects will live on different nodes.
    object_refs = [run_on_node(i, 1) for i in range(num_nodes)]

    # Broadcast each object from each machine to each other machine.
    for object_ref in object_refs:
        ray.get([run_on_node(i, object_ref) for i in range(num_nodes)])

    # The profiling information only flushes once every second.
    time.sleep(1.1)

    transfer_dump = ray.state.object_transfer_timeline()
    # The dump has to survive a JSON round trip.
    json.loads(json.dumps(transfer_dump))
    assert len(transfer_dump) >= num_nodes ** 2

    def distinct_pids(event_name):
        return {
            event["pid"]
            for event in transfer_dump
            if event["name"] == event_name
        }

    assert len(distinct_pids("transfer_receive")) == num_nodes
    assert len(distinct_pids("transfer_send")) == num_nodes
def test_use_dynamic_function_and_class():
    """Dynamically defined remote functions and actor classes are exported.

    Uses dynamically defined functions and classes for remote tasks and
    actors while a ``code_search_path`` is configured, and checks that both
    are still exported to GCS.
    See https://github.com/ray-project/ray/issues/12834.
    """
    ray.shutdown()
    current_path = os.path.dirname(__file__)
    job_config = ray.job_config.JobConfig(code_search_path=[current_path])
    ray.init(job_config=job_config)

    # This test takes no shutdown fixture, so it must tear Ray down itself;
    # otherwise the runtime started above (with its custom code search path)
    # stays alive for whichever test runs next.
    try:

        def foo1():
            @ray.remote
            def foo2():
                return "OK"

            return foo2

        @ray.remote
        class Foo:
            @ray.method(num_returns=1)
            def foo(self):
                return "OK"

        f = foo1()
        assert ray.get(f.remote()) == "OK"

        worker = ray._private.worker.global_worker

        # Check whether the dynamic function is exported to GCS.
        # Note, the key format should be kept
        # the same as in `FunctionActorManager.export`.
        key_func = (
            b"RemoteFunction:"
            + worker.current_job_id.hex().encode()
            + b":"
            + f._function_descriptor.function_id.binary()
        )
        assert worker.gcs_client.internal_kv_exists(
            key_func, namespace=KV_NAMESPACE_FUNCTION_TABLE
        )

        foo_actor = Foo.remote()
        assert ray.get(foo_actor.foo.remote()) == "OK"

        # Check whether the dynamic class is exported to GCS.
        # Note, the key format should be kept
        # the same as in `FunctionActorManager.export_actor_class`.
        actor_descriptor = foo_actor._ray_actor_creation_function_descriptor
        key_cls = (
            b"ActorClass:"
            + worker.current_job_id.hex().encode()
            + b":"
            + actor_descriptor.function_id.binary()
        )
        assert worker.gcs_client.internal_kv_exists(
            key_cls, namespace=KV_NAMESPACE_FUNCTION_TABLE
        )
    finally:
        ray.shutdown()
def test_defining_remote_functions(shutdown_only):
    """Remote functions may close over data, modules and other remote functions."""
    ray.init(num_cpus=3)

    # Test that we can close over plain old data.
    data = [
        np.zeros([3, 5]),
        (1, 2, "a"),
        [0.0, 1.0, 1 << 62],
        1 << 60,
        {"a": np.zeros(3)},
    ]

    @ray.remote
    def g():
        return data

    ray.get(g.remote())

    # Test that we can close over modules.
    @ray.remote
    def h():
        return np.zeros([3, 5])

    # np.alltrue was deprecated and then removed in NumPy 2.0;
    # np.array_equal performs the element-wise comparison (and also checks
    # that the shapes match).
    assert np.array_equal(ray.get(h.remote()), np.zeros([3, 5]))

    @ray.remote
    def j():
        return time.time()

    ray.get(j.remote())

    # Test that we can define remote functions that call other remote
    # functions.
    @ray.remote
    def k(x):
        return x + 1

    @ray.remote
    def k2(x):
        return ray.get(k.remote(x))

    @ray.remote
    def m(x):
        return ray.get(k2.remote(x))

    assert ray.get(k.remote(1)) == 2
    assert ray.get(k2.remote(1)) == 2
    assert ray.get(m.remote(1)) == 2
def client_mode_should_convert(*, auto_init: bool):
    """Decide whether a call should be routed through Ray client mode.

    When ``auto_init`` is true, Ray is initialized first (unless it already
    is, or the ``RAY_ENABLE_AUTO_CONNECT`` environment variable is set to
    ``"0"``).

    NOTE: the auto-initialization has to run before the decision between
    regular Ray and client code is taken, because initializing may result in
    either mode.

    Args:
        auto_init: Whether Ray should be auto-initialized before deciding.

    Returns:
        A truthy value if functions should be converted to client mode,
        a falsy value otherwise.
    """
    if auto_init:
        import ray

        auto_connect_allowed = (
            os.environ.get("RAY_ENABLE_AUTO_CONNECT", "") != "0"
        )
        if auto_connect_allowed and not ray.is_initialized():
            ray.init()

    # `is_client_mode_enabled_by_default` is used for testing with
    # `RAY_CLIENT_MODE=1`; that flag means all tests run with client mode.
    client_mode_requested = (
        is_client_mode_enabled or is_client_mode_enabled_by_default
    )
    # The per-thread hook status is only consulted if client mode was
    # requested in the first place.
    return client_mode_requested and _get_client_hook_status_on_thread()
def test_system_config_when_connecting(ray_start_cluster):
    """``_system_config`` is rejected on connect but honored from the cluster."""
    config = {"object_timeout_milliseconds": 200}
    # Use the cluster supplied by the fixture instead of building a second,
    # private one: the fixture is what tears the cluster down after the
    # test, so a cluster created here by hand would never be shut down.
    cluster = ray_start_cluster
    cluster.add_node(
        _system_config=config, object_store_memory=100 * 1024 * 1024
    )
    cluster.wait_for_nodes()

    # Specifying _system_config when connecting to a cluster is disallowed.
    with pytest.raises(ValueError):
        ray.init(address=cluster.address, _system_config=config)

    # Check that the config was picked up (object pinning is disabled).
    ray.init(address=cluster.address)
    obj_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))

    # Put several more 40 MiB objects into the 100 MiB store, dropping each
    # reference right away.
    for _ in range(5):
        put_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))
        del put_ref

    ray.get(obj_ref)
def test_variable_number_of_args(shutdown_only):
    """Remote functions accept ``*args``, required-plus-``*args``, and no args."""
    ray.init(num_cpus=1)

    @ray.remote
    def varargs_fct1(*a):
        return " ".join(map(str, a))

    @ray.remote
    def varargs_fct2(a, *b):
        return " ".join(map(str, b))

    x = varargs_fct1.remote(0, 1, 2)
    assert ray.get(x) == "0 1 2"
    x = varargs_fct2.remote(0, 1, 2)
    assert ray.get(x) == "1 2"

    @ray.remote
    def f1(*args):
        return args

    @ray.remote
    def f2(x, y, *args):
        return x, y, args

    assert ray.get(f1.remote()) == ()
    assert ray.get(f1.remote(1)) == (1,)
    assert ray.get(f1.remote(1, 2, 3)) == (1, 2, 3)

    # Missing required positional arguments are rejected at submission time.
    with pytest.raises(Exception):
        f2.remote()
    with pytest.raises(Exception):
        f2.remote(1)

    assert ray.get(f2.remote(1, 2)) == (1, 2, ())
    assert ray.get(f2.remote(1, 2, 3)) == (1, 2, (3,))
    assert ray.get(f2.remote(1, 2, 3, 4)) == (1, 2, (3, 4))

    # A remote function that takes no arguments at all.  This case used to
    # sit in a nested `testNoArgs(self)` helper that was never called (and
    # relied on a non-existent `self.ray_start()`), so it never ran.
    @ray.remote
    def no_op():
        pass

    assert ray.get(no_op.remote()) is None
def test_caching_functions_to_run(shutdown_only):
    """Functions registered before ``ray.init`` are cached and run on workers.

    Four functions are registered through ``run_function_on_all_workers``
    before the driver is connected.  The name ``f`` is deliberately reused
    for three of them.  Each appends a marker to ``sys.path``; a task then
    checks that the last four entries are the markers in registration order.
    """

    def f(worker_info):
        sys.path.append(1)

    ray.worker.global_worker.run_function_on_all_workers(f)

    def f(worker_info):
        sys.path.append(2)

    ray.worker.global_worker.run_function_on_all_workers(f)

    def g(worker_info):
        sys.path.append(3)

    ray.worker.global_worker.run_function_on_all_workers(g)

    def f(worker_info):
        sys.path.append(4)

    ray.worker.global_worker.run_function_on_all_workers(f)

    # Only now connect the driver.
    ray.init(num_cpus=1)

    @ray.remote
    def get_state():
        time.sleep(1)
        return sys.path[-4], sys.path[-3], sys.path[-2], sys.path[-1]

    first_ref = get_state.remote()
    second_ref = get_state.remote()
    expected_markers = (1, 2, 3, 4)
    assert ray.get(first_ref) == expected_markers
    assert ray.get(second_ref) == expected_markers

    # Clean up the path on the workers: drop the four markers again.
    def f(worker_info):
        for _ in range(4):
            sys.path.pop()

    ray.worker.global_worker.run_function_on_all_workers(f)
def test_schedule_actor_and_normal_task(ray_start_cluster):
    """An actor is only created once a running normal task frees its memory.

    The node has 1 GiB of memory; the normal task and the actor each ask for
    600 MiB, so they cannot both hold their resources at the same time.
    """
    cluster = ray_start_cluster
    cluster.add_node(
        memory=1024 ** 3,
        _system_config={"gcs_actor_scheduling_enabled": True},
    )
    ray.init(address=cluster.address)
    cluster.wait_for_nodes()

    @ray.remote(memory=600 * 1024 ** 2, num_cpus=0.01)
    class Foo:
        def method(self):
            return 2

    @ray.remote(memory=600 * 1024 ** 2, num_cpus=0.01)
    def fun(release_signal, started_signal):
        # Report that the task is running, then block until released.
        started_signal.send.remote()
        ray.get(release_signal.wait.remote())
        return 1

    release_signal = SignalActor.remote()
    started_signal = SignalActor.remote()

    task_ref = fun.remote(release_signal, started_signal)
    # Make sure the normal task is executing; from here on it is blocked.
    ray.get(started_signal.wait.remote())

    # Try to create the actor and make sure it is not created for the time
    # being.
    foo = Foo.remote()
    actor_ref = foo.method.remote()
    ready_list, remaining_list = ray.wait([actor_ref], timeout=2)
    assert len(ready_list) == 0 and len(remaining_list) == 1

    # Send a signal to unblock the normal task execution.
    ray.get(release_signal.send.remote())

    # Check the result of the normal task.
    assert ray.get(task_ref) == 1

    # Make sure the actor is created.
    assert ray.get(actor_ref) == 2
def test_actor_call_order(shutdown_only):
    """Actor calls run in submission order even if arguments resolve out of order.

    Each call depends on a task that sleeps for a random short time; the
    actor asserts that the calls arrive with consecutive counters.
    """
    ray.init(num_cpus=4)

    @ray.remote
    def small_value():
        time.sleep(0.01 * np.random.randint(0, 10))
        return 0

    @ray.remote
    class Actor:
        def __init__(self):
            self.count = 0

        def inc(self, count, dependency):
            # The caller's counter must match the number of calls seen so far.
            assert count == self.count
            self.count += 1
            return count

    a = Actor.remote()
    call_refs = [a.inc.remote(i, small_value.remote()) for i in range(100)]
    observed = ray.get(call_refs)
    assert observed == list(range(100))
def test_task_output_inline_bytes_limit(ray_start_cluster):
    """Only the first two of five task outputs are inlined under the limit."""
    cluster = ray_start_cluster
    # Disable worker caching so that worker leases are not reused, set the
    # object inlining size threshold, and store small objects in the
    # in-memory object store so that the borrowed ref is inlined.
    # task_output_inlined_bytes_limit only allows 20 bytes to be inlined.
    head_system_config = {
        "worker_lease_timeout_milliseconds": 0,
        "max_direct_call_object_size": 100 * 1024,
        "task_output_inlined_bytes_limit": 20,
        "put_small_object_in_memory_store": True,
    }
    cluster.add_node(
        num_cpus=1,
        resources={"pin_head": 1},
        _system_config=head_system_config,
    )
    cluster.add_node(num_cpus=1, resources={"pin_worker": 1})
    ray.init(address=cluster.address)

    @ray.remote(num_returns=5, resources={"pin_head": 1})
    def f():
        return list(range(5))

    @ray.remote(resources={"pin_worker": 1})
    def add_up(numbers):
        total = 0
        for position, ref in enumerate(numbers):
            total += ray.get(ref)
            inlined = ray.worker.global_worker.core_worker.object_exists(
                ref, memory_store_only=True
            )
            # Only the first two outputs are expected in the memory store.
            expected_inlined = position < 2
            assert bool(inlined) == expected_inlined
        return total

    results = f.remote()
    total_ref = add_up.remote(results)
    assert ray.get(total_ref) == 10
def test_worker_lease_reply_with_resources(ray_start_cluster_enabled):
    """An actor lands on the second node while a task occupies the first."""
    cluster = ray_start_cluster_enabled
    first_node_config = {
        "gcs_resource_report_poll_period_ms": 1000000,
        "gcs_actor_scheduling_enabled": True,
    }
    cluster.add_node(
        memory=2000 * 1024**2,
        num_cpus=1,
        _system_config=first_node_config,
    )
    node2 = cluster.add_node(memory=1000 * 1024**2, num_cpus=1)
    ray.init(address=cluster.address)
    cluster.wait_for_nodes()

    @ray.remote(memory=1500 * 1024**2, num_cpus=0.01)
    def fun(signal):
        # Announce that the task is running, then keep its resources busy.
        signal.send.remote()
        time.sleep(30)
        return 0

    signal = SignalActor.remote()
    fun.remote(signal)
    # Make sure that `fun` is running.
    ray.get(signal.wait.remote())

    @ray.remote(memory=800 * 1024**2, num_cpus=0.01)
    class Foo:
        def method(self):
            return ray.worker.global_worker.node.unique_id

    foo1 = Foo.remote()
    node_id_ref = foo1.method.remote()
    ready_list, remaining_list = ray.wait([node_id_ref], timeout=10)

    # If RequestWorkerLeaseReply carries normal task resources, GCS will
    # then schedule foo1 to node2.  Otherwise, GCS would keep trying to
    # schedule foo1 to node1 and getting rejected.
    assert len(ready_list) == 1 and len(remaining_list) == 0
    assert ray.get(node_id_ref) == node2.unique_id
def test_schedule_many_actors_and_normal_tasks(ray_start_cluster):
    """Many actors and many normal tasks submitted together all complete."""
    cluster = ray_start_cluster

    node_count = 10
    actor_count = 50
    each_actor_task_count = 50
    normal_task_count = 1000
    node_memory = 2 * 1024**3

    for index in range(node_count):
        # Only the first node added carries the system config.
        system_config = (
            {"gcs_actor_scheduling_enabled": True} if index == 0 else {}
        )
        cluster.add_node(memory=node_memory, _system_config=system_config)
    ray.init(address=cluster.address)
    cluster.wait_for_nodes()

    @ray.remote(memory=100 * 1024**2, num_cpus=0.01)
    class Foo:
        def method(self):
            return 2

    @ray.remote(memory=100 * 1024**2, num_cpus=0.01)
    def fun():
        return 1

    # Normal tasks are submitted first, then actors and their method calls.
    normal_task_refs = [fun.remote() for _ in range(normal_task_count)]
    actors = [Foo.remote() for _ in range(actor_count)]
    actor_task_refs = []
    for _ in range(each_actor_task_count):
        # One call per actor in every round.
        for actor in actors:
            actor_task_refs.append(actor.method.remote())

    for value in ray.get(actor_task_refs):
        assert value == 2

    for value in ray.get(normal_task_refs):
        assert value == 1
def test_wait_cluster(ray_start_cluster):
    """``ray.wait`` with ``timeout=0`` reports finished remote tasks as ready."""
    cluster = ray_start_cluster
    for _ in range(2):
        cluster.add_node(num_cpus=1, resources={"RemoteResource": 1})
    ray.init(address=cluster.address)

    @ray.remote(resources={"RemoteResource": 1})
    def f():
        return

    # Make sure we have enough workers on the remote nodes to execute some
    # tasks, and time how long a batch takes.
    warmup_refs = [f.remote() for _ in range(10)]
    started_at = time.time()
    ray.get(warmup_refs)
    batch_duration = time.time() - started_at

    # Submit some more tasks that can only be executed on the remote nodes.
    tasks = [f.remote() for _ in range(10)]
    # Sleep for twice the measured duration to let the tasks finish.
    time.sleep(batch_duration * 2)

    _, unready = ray.wait(tasks, num_returns=len(tasks), timeout=0)
    # All remote tasks should have finished.
    assert len(unready) == 0
def test_call_matrix(shutdown_only):
    """Pass values between every combination of task/actor source and sink.

    The matrix covers: producer is a task or an actor, consumer is a task or
    an actor, the value is small or large, and the ref is passed directly
    ("in band") or wrapped in a list ("out of band").
    """
    ray.init(object_store_memory=1000 * 1024 * 1024)

    @ray.remote
    class Actor:
        def small_value(self):
            return 0

        def large_value(self):
            return np.zeros(10 * 1024 * 1024)

        def echo(self, x):
            if isinstance(x, list):
                x = ray.get(x[0])
            return x

    @ray.remote
    def small_value():
        return 0

    @ray.remote
    def large_value():
        return np.zeros(10 * 1024 * 1024)

    @ray.remote
    def echo(x):
        if isinstance(x, list):
            x = ray.get(x[0])
        return x

    def check(source_actor, dest_actor, is_large, out_of_band):
        print(
            "CHECKING",
            "actor" if source_actor else "task",
            "to",
            "actor" if dest_actor else "task",
            "large_object" if is_large else "small_object",
            "out_of_band" if out_of_band else "in_band",
        )

        # Produce the value, either on a fresh actor or in a plain task.
        if source_actor:
            producer = Actor.remote()
            producing_call = (
                producer.large_value if is_large else producer.small_value
            )
        else:
            producing_call = large_value if is_large else small_value
        x_id = producing_call.remote()

        # Out of band: hide the ref inside a list.
        if out_of_band:
            x_id = [x_id]

        # Echo the value back through a fresh actor or a plain task.
        if dest_actor:
            consumer = Actor.remote()
            echoed_ref = consumer.echo.remote(x_id)
        else:
            echoed_ref = echo.remote(x_id)
        x = ray.get(echoed_ref)

        expected_type = np.ndarray if is_large else int
        assert isinstance(x, expected_type)

    choices = (False, True)
    for is_large in choices:
        for source_actor in choices:
            for dest_actor in choices:
                for out_of_band in choices:
                    check(source_actor, dest_actor, is_large, out_of_band)