def build_cluster(num_nodes, num_cpus, object_store_memory):
    cluster = Cluster()
    for _ in range(num_nodes):
        cluster.add_node(
            num_cpus=num_cpus, object_store_memory=object_store_memory)
    cluster.wait_for_nodes()
    return cluster
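
# A minimal sketch of how build_cluster might be used. The test name, node
# count, and memory size below are hypothetical and only illustrate the
# helper's contract; they are not taken from an existing test.
def test_build_cluster_example(shutdown_only):
    cluster = build_cluster(
        num_nodes=2, num_cpus=1, object_store_memory=75 * 1024 * 1024)
    ray.init(address=cluster.address)
    # Both nodes should have registered by the time wait_for_nodes() returns.
    assert len(ray.nodes()) == 2
    ray.shutdown()
    cluster.shutdown()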
def test_pull_bundles_admission_control(shutdown_only):
    cluster = Cluster()
    object_size = int(6e6)
    num_objects = 10
    num_tasks = 10
    # Head node can fit all of the objects at once.
    cluster.add_node(
        num_cpus=0,
        object_store_memory=2 * num_tasks * num_objects * object_size)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Worker node can only fit 1 task at a time. object_store_memory must be
    # an integer number of bytes, so round the fractional budget down.
    cluster.add_node(
        num_cpus=1, object_store_memory=int(1.5 * num_objects * object_size))
    cluster.wait_for_nodes()

    # No-op task; it exists only to force its arguments to be pulled to the
    # worker node.
    @ray.remote
    def foo(*args):
        return

    args = []
    for _ in range(num_tasks):
        task_args = [
            ray.put(np.zeros(object_size, dtype=np.uint8))
            for _ in range(num_objects)
        ]
        args.append(task_args)

    tasks = [foo.remote(*task_args) for task_args in args]
    ray.get(tasks)
def test_pull_request_retry(shutdown_only):
    cluster = Cluster()
    cluster.add_node(num_cpus=0, num_gpus=1, object_store_memory=100 * 2**20)
    cluster.add_node(num_cpus=1, num_gpus=0, object_store_memory=100 * 2**20)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote
    def put():
        return np.zeros(64 * 2**20, dtype=np.int8)

    @ray.remote(num_cpus=0, num_gpus=1)
    def driver():
        # Pin an object that fills most of the local object store.
        local_ref = ray.put(np.zeros(64 * 2**20, dtype=np.int8))
        remote_ref = put.remote()

        # The pull cannot be satisfied while the local object is pinned.
        ready, _ = ray.wait([remote_ref], timeout=1)
        assert len(ready) == 0

        del local_ref

        # Once the local object is freed, the pull should be retried and
        # complete well within the 20-second timeout.
        ready, _ = ray.wait([remote_ref], timeout=20)
        assert len(ready) > 0

    # Pretend the GPU node is the driver. We do this to force the placement of
    # the driver and `put` task on different nodes.
    ray.get(driver.remote())
def test_pull_bundles_admission_control_dynamic(shutdown_only):
    # This test is the same as test_pull_bundles_admission_control, except
    # that the worker node's object store capacity starts off higher and is
    # later consumed dynamically by concurrent workers.
    cluster = Cluster()
    object_size = int(6e6)
    num_objects = 10
    num_tasks = 10
    # Head node can fit all of the objects at once.
    cluster.add_node(
        num_cpus=0,
        object_store_memory=2 * num_tasks * num_objects * object_size)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Worker node can fit 2 tasks at a time. object_store_memory must be an
    # integer number of bytes, so round the fractional budget down.
    cluster.add_node(
        num_cpus=1, object_store_memory=int(2.5 * num_objects * object_size))
    cluster.wait_for_nodes()

    @ray.remote
    def foo(i, *args):
        print("foo", i)
        return

    # Consumes object store capacity on the worker node while the foo tasks
    # are trying to pull their argument bundles.
    @ray.remote
    def allocate(i):
        print("allocate", i)
        return np.zeros(object_size, dtype=np.uint8)

    args = []
    for _ in range(num_tasks):
        task_args = [
            ray.put(np.zeros(object_size, dtype=np.uint8))
            for _ in range(num_objects)
        ]
        args.append(task_args)

    tasks = [foo.remote(i, *task_args) for i, task_args in enumerate(args)]
    allocated = [allocate.remote(i) for i in range(num_objects)]
    ray.get(tasks)
    del allocated
def test_fate_sharing(ray_start_cluster, use_actors, node_failure):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_period_milliseconds": 100,
    }
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    # Node to place the parent actor.
    node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
    # Node to place the child actor.
    cluster.add_node(num_cpus=1, resources={"child": 1})
    cluster.wait_for_nodes()

    @ray.remote
    def sleep():
        time.sleep(1000)

    @ray.remote(resources={"child": 1})
    def probe():
        return

    # TODO(swang): This test does not pass if max_restarts > 0 for the
    # raylet codepath. Add this parameter once the GCS actor service is
    # enabled by default.
    @ray.remote
    class Actor(object):
        def __init__(self):
            return

        def start_child(self, use_actors):
            if use_actors:
                child = Actor.options(resources={"child": 1}).remote()
                ray.get(child.sleep.remote())
            else:
                ray.get(sleep.options(resources={"child": 1}).remote())

        def sleep(self):
            time.sleep(1000)

        def get_pid(self):
            return os.getpid()

    # Returns whether the "child" resource is available.
    def child_resource_available():
        p = probe.remote()
        ready, _ = ray.wait([p], timeout=1)
        return len(ready) > 0

    # Test fate sharing if the parent process dies.
    def test_process_failure(use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        pid = ray.get(a.get_pid.remote())
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled.
        wait_for_condition(lambda: not child_resource_available())
        # Kill the parent process.
        os.kill(pid, 9)
        wait_for_condition(child_resource_available)

    # Test fate sharing if the parent node dies.
    def test_node_failure(node_to_kill, use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled.
        wait_for_condition(lambda: not child_resource_available())
        # Kill the parent node and replace it so the parent can be
        # rescheduled on the next iteration.
        cluster.remove_node(node_to_kill, allow_graceful=False)
        node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
        wait_for_condition(child_resource_available)
        return node_to_kill

    if node_failure:
        test_node_failure(node_to_kill, use_actors)
    else:
        test_process_failure(use_actors)

    ray.state.state._check_connected()
    keys = [
        key for r in ray.state.state.redis_clients
        for key in r.keys("WORKER_FAILURE*")
    ]
    if node_failure:
        assert len(keys) <= 1, len(keys)
    else:
        assert len(keys) <= 2, len(keys)
class RayExecutorQueueTest(unittest.TestCase):
    def setUp(self):
        self.cluster = Cluster(
            initialize_head=True,
            connect=True,
            head_node_args={
                "num_cpus": 1,
                "_system_config": {
                    "num_heartbeats_timeout": 10
                }
            })
        self.trial_executor = RayTrialExecutor(
            queue_trials=True, refresh_period=0)
        # Pytest doesn't play nicely with imports
        _register_all()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()
        _register_all()  # re-register the evicted objects

    def testQueueTrial(self):
        """Tests that a trial can be queued even when the cluster lacks the
        resources to run it."""

        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        cpu_only = create_trial(1, 0)
        self.assertTrue(self.trial_executor.has_resources_for_trial(cpu_only))
        self.trial_executor.start_trial(cpu_only)

        # With queue_trials=True, a GPU trial can be queued even though the
        # cluster has no GPUs.
        gpu_only = create_trial(0, 1)
        self.assertTrue(self.trial_executor.has_resources_for_trial(gpu_only))

    def testHeadBlocking(self):
        # Once resource requests are deprecated, remove this test.
        os.environ["TUNE_PLACEMENT_GROUP_AUTO_DISABLED"] = "1"

        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        gpu_trial = create_trial(1, 1)
        self.assertTrue(self.trial_executor.has_resources_for_trial(gpu_trial))
        self.trial_executor.start_trial(gpu_trial)

        # TODO(rliaw): This behavior is probably undesirable, but right now
        # trials with different resource requirements are not often used.
        cpu_only_trial = create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources_for_trial(cpu_only_trial))

        self.cluster.add_node(num_cpus=1, num_gpus=1)
        self.cluster.wait_for_nodes()

        self.assertTrue(
            self.trial_executor.has_resources_for_trial(cpu_only_trial))
        self.trial_executor.start_trial(cpu_only_trial)

        cpu_only_trial2 = create_trial(1, 0)
        self.assertTrue(
            self.trial_executor.has_resources_for_trial(cpu_only_trial2))
        self.trial_executor.start_trial(cpu_only_trial2)

        cpu_only_trial3 = create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources_for_trial(cpu_only_trial3))