class TrialRunnerTest(unittest.TestCase):
    def setUp(self):
        self.cluster = Cluster(
            initialize_head=True,
            connect=True,
            head_node_args={
                "num_cpus": 4,
                "num_gpus": 1,
            })
        self.cluster.add_node(num_cpus=2, num_gpus=1)
        self.cluster.wait_for_nodes()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()

    def testAvailableResources(self):
        assert len(nodes()) == 2
        assert default_device(refresh=True) == "GPU"

    def testOthersTakingResources(self):
        # Let someone occupy the head node.
        pg = placement_group([{"CPU": 4, "GPU": 1}])
        ray.get(pg.ready())
        # We are left with the second node.
        assert len(nodes()) == 1
        assert default_device(refresh=True) == "GPU"

        pg = placement_group([{"GPU": 1}])
        ray.get(pg.ready())
        # Default device should be CPU.
        assert default_device(refresh=True) == "CPU"
        assert len(nodes()) == 1
def test_namespace_client():
    cluster = Cluster()
    cluster.add_node(num_cpus=4, ray_client_server_port=8080)
    cluster.wait_for_nodes(1)

    template = """
import ray
ray.util.connect("{address}", namespace="{namespace}")

@ray.remote
class DetachedActor:
    def ping(self):
        return "pong from other job"

actor = DetachedActor.options(name="Pinger", lifetime="detached").remote()
ray.get(actor.ping.remote())
print("Done!!!")
"""

    print(
        run_string_as_driver(
            template.format(address="localhost:8080", namespace="test")))

    ray.util.connect("localhost:8080", namespace="test")
    pinger = ray.get_actor("Pinger")
    assert ray.get(pinger.ping.remote()) == "pong from other job"

    ray.util.disconnect()
    cluster.shutdown()
    # This piece of cleanup doesn't seem to happen automatically.
    ray._private.client_mode_hook._explicitly_disable_client_mode()
def build_cluster(num_nodes, num_cpus, object_store_memory):
    cluster = Cluster()
    for _ in range(num_nodes):
        cluster.add_node(
            num_cpus=num_cpus, object_store_memory=object_store_memory)
    cluster.wait_for_nodes()
    return cluster
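# A minimal usage sketch for build_cluster (an illustrative, hypothetical
# addition rather than part of the original suite): the helper only assembles
# the cluster, so the caller is expected to connect with ray.init and to shut
# everything down afterwards. Node counts and memory sizes are example values.
def _example_build_cluster_usage():
    cluster = build_cluster(
        num_nodes=2, num_cpus=4, object_store_memory=100 * 1024 * 1024)
    try:
        ray.init(address=cluster.address)
        # The aggregate resources of both nodes should be visible here.
        print(ray.cluster_resources())
    finally:
        ray.shutdown()
        cluster.shutdown()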
def test_pull_request_retry(shutdown_only):
    cluster = Cluster()
    cluster.add_node(num_cpus=0, num_gpus=1, object_store_memory=100 * 2**20)
    cluster.add_node(num_cpus=1, num_gpus=0, object_store_memory=100 * 2**20)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote
    def put():
        return np.zeros(64 * 2**20, dtype=np.int8)

    @ray.remote(num_cpus=0, num_gpus=1)
    def driver():
        local_ref = ray.put(np.zeros(64 * 2**20, dtype=np.int8))
        remote_ref = put.remote()

        ready, _ = ray.wait([remote_ref], timeout=30)
        assert len(ready) == 1

        del local_ref

        # This should always complete within 10 seconds.
        ready, _ = ray.wait([remote_ref], timeout=20)
        assert len(ready) > 0

    # Pretend the GPU node is the driver. We do this to force the placement of
    # the driver and `put` task on different nodes.
    ray.get(driver.remote())
def test_task_args(shutdown_only):
    cluster = Cluster()
    cluster.add_node(
        num_cpus=1,
        object_store_memory=80 * 1024 * 1024,
        _system_config={
            "local_fs_capacity_threshold": 0,
        },
        resources={"out_of_memory": 1},
    )
    cluster.add_node(
        num_cpus=1,
        object_store_memory=200 * 1024 * 1024,
        resources={"sufficient_memory": 1},
    )
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote
    def foo():
        return np.random.rand(20 * 1024 * 1024)  # 160 MB data

    @ray.remote
    def bar(obj):
        print(obj)

    ref = foo.options(resources={"sufficient_memory": 1}).remote()
    try:
        ray.get(bar.options(resources={"out_of_memory": 1}).remote(ref))
    except ray.exceptions.RayTaskError as e:
        assert isinstance(e.cause, ray.exceptions.OutOfDiskError)
def test_pull_bundles_admission_control(shutdown_only):
    cluster = Cluster()
    object_size = int(6e6)
    num_objects = 10
    num_tasks = 10
    # Head node can fit all of the objects at once.
    cluster.add_node(
        num_cpus=0,
        object_store_memory=2 * num_tasks * num_objects * object_size)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Worker node can only fit 1 task at a time.
    cluster.add_node(
        num_cpus=1, object_store_memory=1.5 * num_objects * object_size)
    cluster.wait_for_nodes()

    @ray.remote
    def foo(*args):
        return

    args = []
    for _ in range(num_tasks):
        task_args = [
            ray.put(np.zeros(object_size, dtype=np.uint8))
            for _ in range(num_objects)
        ]
        args.append(task_args)

    tasks = [foo.remote(*task_args) for task_args in args]
    ray.get(tasks)
def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled):
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds": -1,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1,
        resources={"node1": 1},
        object_store_memory=10**8,
        _internal_config=config)
    cluster.add_node(
        num_cpus=1,
        resources={"node2": 1},
        object_store_memory=10**8,
        _internal_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, _internal_config=config)

    @ray.remote(max_retries=1 if reconstruction_enabled else 0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x):
        return x

    obj = ray.put(np.zeros(10**7, dtype=np.uint8))
    result = dependent_task.options(resources={"node1": 1}).remote(obj)
    ray.get(result)
    del obj

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1,
        resources={"node1": 1},
        object_store_memory=10**8,
        _internal_config=config)

    for _ in range(20):
        ray.put(np.zeros(10**7, dtype=np.uint8))

    if reconstruction_enabled:
        ray.get(result)
    else:
        with pytest.raises(ray.exceptions.UnreconstructableError):
            ray.get(result)
def test_actor(shutdown_only):
    cluster = Cluster()
    cluster.add_node(
        num_cpus=1,
        object_store_memory=80 * 1024 * 1024,
        _system_config={
            "local_fs_capacity_threshold": 0,
        },
        resources={"out_of_memory": 1},
    )
    cluster.add_node(
        num_cpus=1,
        object_store_memory=200 * 1024 * 1024,
        resources={"sufficient_memory": 1},
    )
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote
    def foo():
        return np.random.rand(20 * 1024 * 1024)  # 160 MB data

    @ray.remote
    class Actor:
        def __init__(self, obj):
            self._obj = obj

        def foo(self):
            print(self._obj)

        def args_ood(self, obj):
            print(obj)

        def return_ood(self):
            return np.random.rand(20 * 1024 * 1024)

    ref = foo.options(resources={"sufficient_memory": 1}).remote()
    with pytest.raises(ray.exceptions.RayActorError):
        a = Actor.options(resources={"out_of_memory": 0.001}).remote(ref)
        ray.get(a.foo.remote())

    a = Actor.options(resources={"out_of_memory": 1}).remote(1)
    ray.get(a.foo.remote())

    try:
        ray.get(a.args_ood.remote(ref))
    except ray.exceptions.RayTaskError as e:
        assert isinstance(e.cause, ray.exceptions.OutOfDiskError)

    ray.get(a.foo.remote())
    try:
        ray.get(a.return_ood.remote())
    except ray.exceptions.RayTaskError as e:
        assert isinstance(e.cause, ray.exceptions.OutOfDiskError)
def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds": -1,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0, _internal_config=config, object_store_memory=10**8)
    node_to_kill = cluster.add_node(
        num_cpus=1, object_store_memory=10**8, _internal_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, _internal_config=config)

    @ray.remote(max_retries=1 if reconstruction_enabled else 0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def chain(x):
        return x

    @ray.remote
    def dependent_task(x):
        return x

    obj = large_object.remote()
    for _ in range(20):
        obj = chain.remote(obj)
    ray.get(dependent_task.remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1, object_store_memory=10**8, _internal_config=config)

    if reconstruction_enabled:
        ray.get(dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            ray.get(dependent_task.remote(obj))
        with pytest.raises(ray.exceptions.UnreconstructableError):
            raise e.as_instanceof_cause()
def test_namespace():
    """
    Most of the "checks" in this test case rely on the fact that
    `run_string_as_driver` will throw an exception if the driver string exits
    with a non-zero exit code (e.g. when the driver script throws an
    exception).

    Since all of these drivers start named, detached actors, the most likely
    failure case would be a collision of named actors if they're put in the
    same namespace.

    This test checks that:
    * When two drivers don't specify a namespace, they are placed in
      different anonymous namespaces.
    * When two drivers specify a namespace, they collide.
    * The namespace name (as provided by the runtime context) is correct.
    """
    cluster = Cluster()
    cluster.add_node(num_cpus=4, ray_client_server_port=50055)
    cluster.wait_for_nodes(1)

    template = """
import ray
ray.client("localhost:50055").namespace({namespace}).connect()

@ray.remote
class Foo:
    def ping(self):
        return "pong"

a = Foo.options(lifetime="detached", name="abc").remote()
ray.get(a.ping.remote())
print(ray.get_runtime_context().namespace)
"""

    anon_driver = template.format(namespace="None")
    run_string_as_driver(anon_driver)
    # This second run will fail if the actors don't run in separate anonymous
    # namespaces.
    run_string_as_driver(anon_driver)

    run_in_namespace = template.format(namespace="'namespace'")
    script_namespace = run_string_as_driver(run_in_namespace)
    # The second run fails because the actors are run in the same namespace.
    with pytest.raises(subprocess.CalledProcessError):
        run_string_as_driver(run_in_namespace)

    assert script_namespace.strip() == "namespace"
    subprocess.check_output("ray stop --force", shell=True)
def test_pull_bundles_admission_control_dynamic(shutdown_only):
    # This test is the same as test_pull_bundles_admission_control, except
    # that the object store's capacity starts off higher and is later
    # consumed dynamically by concurrent workers.
    cluster = Cluster()
    object_size = int(6e6)
    num_objects = 20
    num_tasks = 20
    # Head node can fit all of the objects at once.
    cluster.add_node(
        num_cpus=0,
        object_store_memory=2 * num_tasks * num_objects * object_size)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Worker node can fit 2 tasks at a time.
    cluster.add_node(
        num_cpus=1, object_store_memory=2.5 * num_objects * object_size)
    cluster.wait_for_nodes()

    @ray.remote
    def foo(i, *args):
        print("foo", i)
        return

    @ray.remote
    def allocate(i):
        print("allocate", i)
        return np.zeros(object_size, dtype=np.uint8)

    args = []
    for _ in range(num_tasks):
        task_args = [
            ray.put(np.zeros(object_size, dtype=np.uint8))
            for _ in range(num_objects)
        ]
        args.append(task_args)

    allocated = [allocate.remote(i) for i in range(num_objects)]
    ray.get(allocated)

    tasks = [foo.remote(i, *task_args) for i, task_args in enumerate(args)]
    ray.get(tasks)
    del allocated
def test_system_config_when_connecting(ray_start_cluster):
    config = {"object_timeout_milliseconds": 200}
    cluster = Cluster()
    cluster.add_node(
        _system_config=config, object_store_memory=100 * 1024 * 1024)
    cluster.wait_for_nodes()

    # Specifying _system_config when connecting to a cluster is disallowed.
    with pytest.raises(ValueError):
        ray.init(address=cluster.address, _system_config=config)

    # Check that the config was picked up (object pinning is disabled).
    ray.init(address=cluster.address)
    obj_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))

    for _ in range(5):
        put_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))
    del put_ref

    ray.get(obj_ref)
def test_cached_object(ray_start_cluster):
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
    cluster.add_node(
        num_cpus=1, resources={"node2": 1}, object_store_memory=10**8)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x):
        return

    obj = large_object.options(resources={"node1": 1}).remote()
    ray.get(dependent_task.options(resources={"node2": 1}).remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
    assert wait_for_condition(
        lambda: not all(node["Alive"] for node in ray.nodes()), timeout=10)

    for _ in range(20):
        large_object.options(resources={"node2": 1}).remote()

    ray.get(dependent_task.remote(obj))
def run_multi_nodes():
    c = Cluster()
    c.add_node(
        num_cpus=4,
        object_store_memory=object_store_size,
        _system_config=system_config)
    ray.init(address=c.address)
    for _ in range(num_nodes - 1):  # subtract a head node.
        c.add_node(num_cpus=4, object_store_memory=object_store_size)
    c.wait_for_nodes()

    # Run shuffle.
    print(
        f"\n\nTest streaming shuffle with {num_nodes} nodes.\n"
        f"Shuffle size: {partition_size * num_partitions / 1024 / 1024 / 1024}"
        "GB")
    run_shuffle()
    time.sleep(5)
    display_spilling_info(c.address)
    ray.shutdown()
    c.shutdown()
    time.sleep(5)
def test_pull_bundles_pinning(shutdown_only):
    cluster = Cluster()
    object_size = int(50e6)
    num_objects = 10
    # Head node can fit all of the objects at once.
    cluster.add_node(num_cpus=0, object_store_memory=1000e6)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Worker node cannot even fit a single task.
    cluster.add_node(num_cpus=1, object_store_memory=200e6)
    cluster.wait_for_nodes()

    @ray.remote(num_cpus=1)
    def foo(*args):
        return

    task_args = [
        ray.put(np.zeros(object_size, dtype=np.uint8))
        for _ in range(num_objects)
    ]
    ray.get(foo.remote(*task_args))
def test_ray_get_task_args_deadlock(shutdown_only):
    cluster = Cluster()
    object_size = int(6e6)
    num_objects = 10
    # Head node can fit all of the objects at once.
    cluster.add_node(
        num_cpus=0, object_store_memory=4 * num_objects * object_size)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Worker node can only fit 1 task at a time.
    cluster.add_node(
        num_cpus=1, object_store_memory=1.5 * num_objects * object_size)
    cluster.wait_for_nodes()

    @ray.remote
    def foo(*args):
        return

    @ray.remote
    def test_deadlock(get_args, task_args):
        foo.remote(*task_args)
        ray.get(get_args)

    for i in range(5):
        start = time.time()
        get_args = [
            ray.put(np.zeros(object_size, dtype=np.uint8))
            for _ in range(num_objects)
        ]
        task_args = [
            ray.put(np.zeros(object_size, dtype=np.uint8))
            for _ in range(num_objects)
        ]
        ray.get(test_deadlock.remote(get_args, task_args))
        print(f"round {i} finished in {time.time() - start}")
# This number should be divisible by 3.
resource_quantity = 999
num_nodes = 5
custom_resources = {"pg_custom": resource_quantity}
# Create pg that uses 1 resource of cpu & custom resource.
num_pg = resource_quantity

# TODO(sang): Cluster setup. Remove when running in real clusters.
cluster = Cluster()
nodes = []
for _ in range(num_nodes):
    nodes.append(
        cluster.add_node(
            num_cpus=3,
            num_gpus=resource_quantity,
            resources=custom_resources))
cluster.wait_for_nodes()

ray.init(address=cluster.address)
while not ray.is_initialized():
    time.sleep(0.1)

# Scenario 1: Create a bunch of placement groups and measure how long it
# takes.
total_creating_time = 0
total_removing_time = 0
repeat = 1
total_trial = repeat * num_pg
bundles = [{"GPU": 1, "pg_custom": 1}] * num_nodes

# Create and remove placement groups.
for _ in range(repeat):
    pgs = []
class RayExecutorQueueTest(unittest.TestCase):
    def setUp(self):
        self.cluster = Cluster(
            initialize_head=True,
            connect=True,
            head_node_args={
                "num_cpus": 1,
                "_system_config": {
                    "num_heartbeats_timeout": 10
                }
            })
        self.trial_executor = RayTrialExecutor(
            queue_trials=True, refresh_period=0)
        # Pytest doesn't play nicely with imports
        _register_all()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()
        _register_all()  # re-register the evicted objects

    def testQueueTrial(self):
        """Tests that a trial is queued when resources are not yet available."""

        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        cpu_only = create_trial(1, 0)
        self.assertTrue(self.trial_executor.has_resources_for_trial(cpu_only))
        self.trial_executor.start_trial(cpu_only)

        gpu_only = create_trial(0, 1)
        self.assertTrue(self.trial_executor.has_resources_for_trial(gpu_only))

    def testHeadBlocking(self):
        # Once resource requests are deprecated, remove this test.
        os.environ["TUNE_PLACEMENT_GROUP_AUTO_DISABLED"] = "1"

        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        gpu_trial = create_trial(1, 1)
        self.assertTrue(self.trial_executor.has_resources_for_trial(gpu_trial))
        self.trial_executor.start_trial(gpu_trial)

        # TODO(rliaw): This behavior is probably undesirable, but right now
        # trials with different resource requirements are not often used.
        cpu_only_trial = create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources_for_trial(cpu_only_trial))

        self.cluster.add_node(num_cpus=1, num_gpus=1)
        self.cluster.wait_for_nodes()

        self.assertTrue(
            self.trial_executor.has_resources_for_trial(cpu_only_trial))
        self.trial_executor.start_trial(cpu_only_trial)

        cpu_only_trial2 = create_trial(1, 0)
        self.assertTrue(
            self.trial_executor.has_resources_for_trial(cpu_only_trial2))
        self.trial_executor.start_trial(cpu_only_trial2)

        cpu_only_trial3 = create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources_for_trial(cpu_only_trial3))
2, "object_spilling_config": json.dumps( { "type": "filesystem", "params": { "directory_path": "/tmp/spill" } }, separators=(",", ":")) }) # Add fake 4 nodes cluster. for _ in range(num_nodes - 1): # subtract a head node. c.add_node(num_cpus=4, object_store_memory=object_store_size) c.wait_for_nodes() ray.init(address=c.address) @ray.remote class Counter: def __init__(self): self.num_map = 0 self.num_reduce = 0 def inc(self): self.num_map += 1 print("Num map tasks finished", self.num_map) def inc2(self): self.num_reduce += 1
def test_locality_aware_scheduling_for_dead_nodes(shutdown_only):
    """Test that locality-aware scheduling can handle dead nodes."""
    # Create a cluster with 4 nodes.
    config = {
        "num_heartbeats_timeout": 5,
        "raylet_heartbeat_period_milliseconds": 50,
    }
    cluster = Cluster()
    cluster.add_node(
        num_cpus=4, resources={"node1": 1}, _system_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    node2 = cluster.add_node(num_cpus=4, resources={"node2": 1})
    node3 = cluster.add_node(num_cpus=4, resources={"node3": 1})
    node4 = cluster.add_node(num_cpus=4, resources={"node4": 1})
    cluster.wait_for_nodes()

    # Create 2 objects on node 2.
    @ray.remote(resources={"node2": 0.1})
    def create_object():
        return np.zeros(10 * 1024 * 1024, dtype=np.uint8)

    obj1 = create_object.remote()
    obj2 = create_object.remote()

    # Push these 2 objects to other nodes.
    # node2 will have obj1 and obj2.
    # node3 will have obj1.
    # node4 will have obj2.
    @ray.remote
    class MyActor:
        def __init__(self, obj_refs):
            # Note: we need to keep obj_refs to prevent the objects from
            # being garbage collected.
            self.obj_refs = obj_refs
            self.obj = ray.get(obj_refs)

        def ready(self):
            return True

    actors = [
        MyActor.options(resources={"node2": 0.1}).remote([obj1, obj2]),
        MyActor.options(resources={"node3": 0.1}).remote([obj1]),
        MyActor.options(resources={"node4": 0.1}).remote([obj2]),
    ]
    assert all(ray.get(actor.ready.remote()) is True for actor in actors)

    # This function requires obj1 and obj2.
    @ray.remote
    def func(obj1, obj2):
        return ray.worker.global_worker.node.unique_id

    # This function should be scheduled to node2, as node2 has both objects.
    assert ray.get(func.remote(obj1, obj2)) == node2.unique_id

    # Kill node2, and re-schedule the function.
    # It should be scheduled to either node3 or node4.
    node2.kill_raylet()
    # Wait for the driver to receive the NodeRemoved notification.
    time.sleep(1)
    target_node = ray.get(func.remote(obj1, obj2))
    assert target_node == node3.unique_id or target_node == node4.unique_id
def test_fate_sharing(ray_start_cluster):
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the parent actor.
    node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
    # Node to place the child actor.
    cluster.add_node(num_cpus=1, resources={"child": 1})
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote
    def sleep():
        time.sleep(1000)

    @ray.remote(resources={"child": 1})
    def probe():
        return

    @ray.remote
    class Actor(object):
        def __init__(self):
            return

        def start_child(self, use_actors):
            if use_actors:
                child = Actor.options(resources={"child": 1}).remote()
                ray.get(child.sleep.remote())
            else:
                ray.get(sleep.options(resources={"child": 1}).remote())

        def sleep(self):
            time.sleep(1000)

        def get_pid(self):
            return os.getpid()

    # Returns whether the "child" resource is available.
    def child_resource_available():
        p = probe.remote()
        ready, _ = ray.wait([p], timeout=1)
        return len(ready) > 0

    # Test fate sharing if the parent process dies.
    def test_process_failure(use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        pid = ray.get(a.get_pid.remote())
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled.
        assert wait_for_condition(
            lambda: not child_resource_available(), timeout_ms=10000)
        # Kill the parent process.
        os.kill(pid, 9)
        assert wait_for_condition(child_resource_available, timeout_ms=10000)

    # Test fate sharing if the parent node dies.
    def test_node_failure(node_to_kill, use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled.
        assert wait_for_condition(
            lambda: not child_resource_available(), timeout_ms=10000)
        # Kill the parent node.
        cluster.remove_node(node_to_kill, allow_graceful=False)
        node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
        assert wait_for_condition(child_resource_available, timeout_ms=10000)
        return node_to_kill

    test_process_failure(use_actors=True)
    test_process_failure(use_actors=False)
    node_to_kill = test_node_failure(node_to_kill, use_actors=True)
    node_to_kill = test_node_failure(node_to_kill, use_actors=False)
def test_fate_sharing(ray_start_cluster, use_actors, node_failure):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_period_milliseconds": 100,
    }
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    # Node to place the parent actor.
    node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
    # Node to place the child actor.
    cluster.add_node(num_cpus=1, resources={"child": 1})
    cluster.wait_for_nodes()

    @ray.remote
    def sleep():
        time.sleep(1000)

    @ray.remote(resources={"child": 1})
    def probe():
        return

    # TODO(swang): This test does not pass if max_restarts > 0 for the
    # raylet codepath. Add this parameter once the GCS actor service is
    # enabled by default.
    @ray.remote
    class Actor(object):
        def __init__(self):
            return

        def start_child(self, use_actors):
            if use_actors:
                child = Actor.options(resources={"child": 1}).remote()
                ray.get(child.sleep.remote())
            else:
                ray.get(sleep.options(resources={"child": 1}).remote())

        def sleep(self):
            time.sleep(1000)

        def get_pid(self):
            return os.getpid()

    # Returns whether the "child" resource is available.
    def child_resource_available():
        p = probe.remote()
        ready, _ = ray.wait([p], timeout=1)
        return len(ready) > 0

    # Test fate sharing if the parent process dies.
    def test_process_failure(use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        pid = ray.get(a.get_pid.remote())
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled.
        wait_for_condition(lambda: not child_resource_available())
        # Kill the parent process.
        os.kill(pid, 9)
        wait_for_condition(child_resource_available)

    # Test fate sharing if the parent node dies.
    def test_node_failure(node_to_kill, use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled.
        wait_for_condition(lambda: not child_resource_available())
        # Kill the parent node.
        cluster.remove_node(node_to_kill, allow_graceful=False)
        node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
        wait_for_condition(child_resource_available)
        return node_to_kill

    if node_failure:
        test_node_failure(node_to_kill, use_actors)
    else:
        test_process_failure(use_actors)
def test_reconstruction_cached_dependency(ray_start_cluster,
                                          reconstruction_enabled):
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds": -1,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1,
        resources={"node1": 1},
        object_store_memory=10**8,
        _internal_config=config)
    cluster.add_node(
        num_cpus=1,
        resources={"node2": 1},
        object_store_memory=10**8,
        _internal_config=config)
    cluster.wait_for_nodes()

    @ray.remote(max_retries=0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def chain(x):
        return x

    @ray.remote
    def dependent_task(x):
        return

    obj = large_object.options(resources={"node2": 1}).remote()
    obj = chain.options(resources={"node1": 1}).remote(obj)
    ray.get(dependent_task.options(resources={"node1": 1}).remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1,
        resources={"node1": 1},
        object_store_memory=10**8,
        _internal_config=config)
    assert wait_for_condition(
        lambda: not all(node["Alive"] for node in ray.nodes()), timeout=10)

    for _ in range(20):
        large_object.options(resources={"node2": 1}).remote()

    if reconstruction_enabled:
        ray.get(dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            ray.get(dependent_task.remote(obj))
        with pytest.raises(ray.exceptions.UnreconstructableError):
            raise e.as_instanceof_cause()