def ray_start_combination(request):
    num_nodes = request.param[0]
    num_workers_per_scheduler = request.param[1]
    # Start the Ray processes.
    cluster = Cluster(
        initialize_head=True,
        head_node_args={
            "num_cpus": 10,
            "redis_max_memory": 10**7,
        })
    for i in range(num_nodes - 1):
        cluster.add_node(num_cpus=10)
    ray.init(address=cluster.address)

    yield num_nodes, num_workers_per_scheduler, cluster

    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
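# Note: ray_start_combination above reads `request.param`, which pytest only
# supplies when the fixture is declared with `params=...`; the decorator is
# not shown in the snippet. A minimal sketch of how such a fixture is
# parametrized and consumed — the concrete
# (num_nodes, num_workers_per_scheduler) combinations here are assumptions,
# not taken from the original source.
import pytest


@pytest.fixture(params=[(1, 4), (4, 4)])  # assumed parameter combinations
def ray_start_combination_example(request):
    num_nodes, num_workers_per_scheduler = request.param
    yield num_nodes, num_workers_per_scheduler


def test_uses_combination(ray_start_combination_example):
    num_nodes, num_workers_per_scheduler = ray_start_combination_example
    assert num_nodes >= 1 and num_workers_per_scheduler >= 1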
def test_namespace():
    """
    Most of the "checks" in this test case rely on the fact that
    `run_string_as_driver` will throw an exception if the driver string exits
    with a non-zero exit code (e.g. when the driver script throws an
    exception).

    Since all of these drivers start named, detached actors, the most likely
    failure case would be a collision of named actors if they're put in the
    same namespace.

    This test checks that:
    * When two drivers don't specify a namespace, they are placed in
      different anonymous namespaces.
    * When two drivers specify a namespace, they collide.
    * The namespace name (as provided by the runtime context) is correct.
    """
    cluster = Cluster()
    cluster.add_node(num_cpus=4, ray_client_server_port=50055)
    cluster.wait_for_nodes(1)

    template = """
import ray

ray.client("localhost:50055").namespace({namespace}).connect()

@ray.remote
class Foo:
    def ping(self):
        return "pong"

a = Foo.options(lifetime="detached", name="abc").remote()
ray.get(a.ping.remote())
print(ray.get_runtime_context().namespace)
"""

    anon_driver = template.format(namespace="None")
    run_string_as_driver(anon_driver)
    # This second run will fail if the actors don't run in separate anonymous
    # namespaces.
    run_string_as_driver(anon_driver)

    run_in_namespace = template.format(namespace="'namespace'")
    script_namespace = run_string_as_driver(run_in_namespace)
    # The second run fails because the actors are run in the same namespace.
    with pytest.raises(subprocess.CalledProcessError):
        run_string_as_driver(run_in_namespace)

    assert script_namespace.strip() == "namespace"
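# The test above assumes a `run_string_as_driver` helper that executes a
# driver script in a fresh Python subprocess, returns its stdout, and raises
# subprocess.CalledProcessError on a non-zero exit code (the contract the
# docstring relies on). A minimal sketch under those assumptions:
import subprocess
import sys


def run_string_as_driver(driver_script: str) -> str:
    """Run a driver script in a separate Python process and return stdout.

    Raises subprocess.CalledProcessError if the driver exits non-zero.
    """
    proc = subprocess.run(
        [sys.executable, "-"],
        input=driver_script,
        capture_output=True,
        text=True,
        check=True,
    )
    return proc.stdout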
def setup_local_single_node_cluster(num_nodes):
    """Set up a Ray cluster locally via ray.init() and Cluster().

    Each actor is simulated by a local process on a single node, so the
    default scale is smaller.
    """
    cluster = Cluster()
    for i in range(num_nodes):
        cluster.add_node(
            redis_port=6379 if i == 0 else None,
            num_cpus=NUM_CPU_PER_NODE,
            num_gpus=0,
            resources={str(i): 2},
        )
    ray.init(address=cluster.address, dashboard_host="0.0.0.0")
    serve_client = serve.start(http_options=dict(location="EveryNode"))

    return serve_client
def test_temp_plasma_store_socket():
    ray.init(plasma_store_socket_name="/tmp/i_am_a_temp_socket")
    assert os.path.exists(
        "/tmp/i_am_a_temp_socket"), "Specified socket path not found."
    ray.shutdown()
    try:
        os.remove("/tmp/i_am_a_temp_socket")
    except OSError:
        pass  # It could have been removed by Ray.

    cluster = Cluster(True)
    cluster.add_node(plasma_store_socket_name="/tmp/i_am_a_temp_socket_2")
    assert os.path.exists(
        "/tmp/i_am_a_temp_socket_2"), "Specified socket path not found."
    cluster.shutdown()
    try:
        os.remove("/tmp/i_am_a_temp_socket_2")
    except OSError:
        pass  # It could have been removed by Ray.
def test_ray_status_multinode():
    from ray.cluster_utils import Cluster
    cluster = Cluster()
    for _ in range(4):
        cluster.add_node(num_cpus=2)
    runner = CliRunner()

    def output_ready():
        result = runner.invoke(scripts.status)
        result.stdout
        return not result.exception and "memory" in result.output

    wait_for_condition(output_ready)

    result = runner.invoke(scripts.status, [])
    _check_output_via_pattern("test_ray_status_multinode.txt", result)
    ray.shutdown()
    cluster.shutdown()
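# `wait_for_condition`, used throughout these tests, is Ray's test utility
# that polls a predicate until it returns truthy or a timeout expires. A
# minimal sketch of that polling contract (the default timeout in seconds and
# the retry interval are assumed values):
import time


def wait_for_condition(condition_predictor, timeout=10, retry_interval_ms=100):
    """Poll condition_predictor until truthy; raise if the timeout expires."""
    start = time.time()
    while time.time() - start <= timeout:
        if condition_predictor():
            return True
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("The condition wasn't met before the timeout expired.")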
def test_raylet_socket_name(shutdown_only):
    sock1 = unix_socket_create_path("i_am_a_temp_socket_1")
    ray.init(raylet_socket_name=sock1)
    unix_socket_verify(sock1)
    ray.shutdown()
    try:
        unix_socket_delete(sock1)
    except OSError:
        pass  # It could have been removed by Ray.

    cluster = Cluster(True)
    sock2 = unix_socket_create_path("i_am_a_temp_socket_2")
    cluster.add_node(raylet_socket_name=sock2)
    unix_socket_verify(sock2)
    cluster.shutdown()
    try:
        unix_socket_delete(sock2)
    except OSError:
        pass  # It could have been removed by Ray.
def test_pull_bundles_admission_control_dynamic(shutdown_only):
    # This test is the same as test_pull_bundles_admission_control, except
    # that the object store's capacity starts off higher and is later
    # consumed dynamically by concurrent workers.
    cluster = Cluster()
    object_size = int(6e6)
    num_objects = 20
    num_tasks = 20
    # Head node can fit all of the objects at once.
    cluster.add_node(
        num_cpus=0,
        object_store_memory=2 * num_tasks * num_objects * object_size)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Worker node can fit 2 tasks at a time.
    cluster.add_node(
        num_cpus=1, object_store_memory=2.5 * num_objects * object_size)
    cluster.wait_for_nodes()

    @ray.remote
    def foo(i, *args):
        print("foo", i)
        return

    @ray.remote
    def allocate(i):
        print("allocate", i)
        return np.zeros(object_size, dtype=np.uint8)

    args = []
    for _ in range(num_tasks):
        task_args = [
            ray.put(np.zeros(object_size, dtype=np.uint8))
            for _ in range(num_objects)
        ]
        args.append(task_args)

    allocated = [allocate.remote(i) for i in range(num_objects)]
    ray.get(allocated)

    tasks = [foo.remote(i, *task_args) for i, task_args in enumerate(args)]
    ray.get(tasks)
    del allocated
def test_system_config_when_connecting(ray_start_cluster):
    config = {"object_timeout_milliseconds": 200}
    cluster = Cluster()
    cluster.add_node(
        _system_config=config, object_store_memory=100 * 1024 * 1024)
    cluster.wait_for_nodes()

    # Specifying _system_config when connecting to a cluster is disallowed.
    with pytest.raises(ValueError):
        ray.init(address=cluster.address, _system_config=config)

    # Check that the config was picked up (object pinning is disabled).
    ray.init(address=cluster.address)
    obj_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))

    for _ in range(5):
        put_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))
    del put_ref

    ray.get(obj_ref)
def test_ray_status_multinode():
    cluster = Cluster()
    for _ in range(4):
        cluster.add_node(num_cpus=2)
    runner = CliRunner()

    def output_ready():
        result = runner.invoke(scripts.status)
        result.stdout
        if not result.exception and "memory" in result.output:
            return True
        raise RuntimeError(
            f"result.exception={result.exception} "
            f"result.output={result.output}")

    wait_for_condition(output_ready)

    result = runner.invoke(scripts.status, [])
    _check_output_via_pattern("test_ray_status_multinode.txt", result)
    ray.shutdown()
    cluster.shutdown()
def run_multi_nodes():
    c = Cluster()
    c.add_node(
        num_cpus=4,
        object_store_memory=object_store_size,
        _system_config=system_config)
    ray.init(address=c.address)
    for _ in range(num_nodes - 1):  # subtract a head node.
        c.add_node(num_cpus=4, object_store_memory=object_store_size)
    c.wait_for_nodes()

    # Run shuffle.
    print(
        f"\n\nTest streaming shuffle with {num_nodes} nodes.\n"
        f"Shuffle size: {partition_size * num_partitions / 1024 / 1024 / 1024}"
        "GB")
    run_shuffle()
    time.sleep(5)
    display_spilling_info(c.address)
    ray.shutdown()
    c.shutdown()
    time.sleep(5)
def test_spill_dir_cleanup_on_raylet_start(object_spilling_config):
    object_spilling_config, temp_folder = object_spilling_config
    cluster = Cluster()
    cluster.add_node(
        num_cpus=0,
        object_store_memory=75 * 1024 * 1024,
        _system_config={"object_spilling_config": object_spilling_config},
    )
    ray.init(address=cluster.address)
    node2 = cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)

    # This task will run on node 2 because node 1 has no CPU resource.
    @ray.remote(num_cpus=1)
    def run_workload():
        ids = []
        for _ in range(2):
            arr = np.random.rand(5 * 1024 * 1024)  # 40 MB
            ids.append(ray.put(arr))
        return ids

    ids = ray.get(run_workload.remote())
    assert not is_dir_empty(temp_folder)

    # Kill node 2.
    cluster.remove_node(node2)

    # Verify that the spill folder is not empty.
    assert not is_dir_empty(temp_folder)

    # Start a new node.
    cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)

    # Verify that the spill folder is now cleaned up.
    assert is_dir_empty(temp_folder)

    # We hold the object refs to prevent them from being deleted.
    del ids
    ray.shutdown()
    cluster.shutdown()
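# `is_dir_empty` is referenced but not defined in the snippet above. A
# plausible implementation, assuming file-based spilling writes objects into
# a subdirectory of the temp folder; the subdirectory name
# "ray_spilled_objects" is an assumption, not taken from the original source.
from pathlib import Path


def is_dir_empty(temp_folder, append_path="ray_spilled_objects"):
    """Return True if the spill directory under temp_folder has no entries."""
    spill_dir = Path(temp_folder) / append_path
    if not spill_dir.exists():
        return True
    return not any(spill_dir.iterdir())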
def test_pull_bundles_pinning(shutdown_only):
    cluster = Cluster()
    object_size = int(50e6)
    num_objects = 10
    # Head node can fit all of the objects at once.
    cluster.add_node(num_cpus=0, object_store_memory=1000e6)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Worker node cannot even fit a single task.
    cluster.add_node(num_cpus=1, object_store_memory=200e6)
    cluster.wait_for_nodes()

    @ray.remote(num_cpus=1)
    def foo(*args):
        return

    task_args = [
        ray.put(np.zeros(object_size, dtype=np.uint8))
        for _ in range(num_objects)
    ]
    ray.get(foo.remote(*task_args))
def setup_local_single_node_cluster(num_nodes):
    """Set up a Ray cluster locally via ray.init() and Cluster().

    Each actor is simulated by a local process on a single node, so the
    default scale is smaller.
    """
    cluster = Cluster()
    for i in range(num_nodes):
        cluster.add_node(
            redis_port=6379 if i == 0 else None,
            num_redis_shards=NUM_REDIS_SHARDS if i == 0 else None,
            num_cpus=NUM_CPU_PER_NODE,
            num_gpus=0,
            resources={str(i): 2},
            object_store_memory=OBJECT_STORE_MEMORY,
            redis_max_memory=REDIS_MAX_MEMORY,
            dashboard_host="0.0.0.0",
        )
    ray.init(address=cluster.address, dashboard_host="0.0.0.0")
    serve_client = serve.start()

    return serve_client
def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled):
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "lineage_pinning_enabled": 1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds": -1,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1,
        resources={"node1": 1},
        object_store_memory=10**8,
        _internal_config=config)
    cluster.add_node(
        num_cpus=1,
        resources={"node2": 1},
        object_store_memory=10**8,
        _internal_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, _internal_config=config)

    @ray.remote(max_retries=1 if reconstruction_enabled else 0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x):
        return x

    obj = ray.put(np.zeros(10**7, dtype=np.uint8))
    result = dependent_task.options(resources={"node1": 1}).remote(obj)
    ray.get(result)
    del obj

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1,
        resources={"node1": 1},
        object_store_memory=10**8,
        _internal_config=config)

    for _ in range(20):
        ray.put(np.zeros(10**7, dtype=np.uint8))

    if reconstruction_enabled:
        ray.get(result)
    else:
        with pytest.raises(ray.exceptions.UnreconstructableError):
            ray.get(result)
def ray_cluster():
    try:
        from ray.cluster_utils import Cluster
    except ModuleNotFoundError:
        from ray._private.cluster_utils import Cluster
    cluster = Cluster()
    remote_nodes = []
    num_nodes = 1
    for i in range(num_nodes):
        remote_nodes.append(cluster.add_node(num_cpus=10))
        if len(remote_nodes) == 1:
            ray.init(address=cluster.address)
    yield
    ray.shutdown()
def main():
    cluster = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={
            "object_store_memory": 20 * 1024 * 1024 * 1024,
            "num_cpus": 16,
        },
    )
    cluster.add_node(
        object_store_memory=20 * 1024 * 1024 * 1024, num_gpus=1, num_cpus=16
    )

    object_ref_list = []
    for i in range(0, 10):
        object_ref = ray.put(np.random.rand(1024 * 128, 1024))
        object_ref_list.append(object_ref)

    @ray.remote(num_gpus=1)
    def f(object_ref_list):
        diffs = []
        for object_ref in object_ref_list:
            before = time.time()
            ray.get(object_ref)
            after = time.time()
            diffs.append(after - before)
            time.sleep(1)
        return np.mean(diffs), np.std(diffs)

    time_diff, time_diff_std = ray.get(f.remote(object_ref_list))
    print(
        "latency to get a 1GB object over the network",
        round(time_diff, 2),
        "+-",
        round(time_diff_std, 2),
    )
    ray.shutdown()
    cluster.shutdown()
def test_cluster_handle_affinity():
    cluster = Cluster()
    # HACK: use two different IP addresses so the placement constraint for
    # the resource check later will work.
    head_node = cluster.add_node(node_ip_address="127.0.0.1", num_cpus=4)
    cluster.add_node(node_ip_address="0.0.0.0", num_cpus=4)

    ray.init(head_node.address)

    # Make sure we have two nodes.
    node_ids = [n["NodeID"] for n in ray.nodes()]
    assert len(node_ids) == 2

    # Start the backend.
    client = serve.start(http_port=randint(10000, 30000), detached=True)
    client.create_backend("hi:v0", lambda _: "hi")
    client.create_endpoint("hi", backend="hi:v0")

    # Try to retrieve the handle from both head and worker node, and check
    # the router's node id.
    @ray.remote
    def check_handle_router_id():
        client = serve.connect()
        handle = client.get_handle("hi")
        return get_node_id_for_actor(handle.router_handle)

    router_node_ids = ray.get([
        check_handle_router_id.options(resources={
            node_id: 0.01
        }).remote() for node_id in ray.state.node_ids()
    ])

    assert set(router_node_ids) == set(node_ids)

    # Clean up the nodes (otherwise Ray will segfault).
    ray.shutdown()
    cluster.shutdown()
def test_ray_get_task_args_deadlock(shutdown_only):
    cluster = Cluster()
    object_size = int(6e6)
    num_objects = 10
    # Head node can fit all of the objects at once.
    cluster.add_node(
        num_cpus=0, object_store_memory=4 * num_objects * object_size)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Worker node can only fit 1 task at a time.
    cluster.add_node(
        num_cpus=1, object_store_memory=1.5 * num_objects * object_size)
    cluster.wait_for_nodes()

    @ray.remote
    def foo(*args):
        return

    @ray.remote
    def test_deadlock(get_args, task_args):
        foo.remote(*task_args)
        ray.get(get_args)

    for i in range(5):
        start = time.time()
        get_args = [
            ray.put(np.zeros(object_size, dtype=np.uint8))
            for _ in range(num_objects)
        ]
        task_args = [
            ray.put(np.zeros(object_size, dtype=np.uint8))
            for _ in range(num_objects)
        ]
        ray.get(test_deadlock.remote(get_args, task_args))
        print(f"round {i} finished in {time.time() - start}")
def ray_cluster():
    try:
        from ray.cluster_utils import Cluster
    except ModuleNotFoundError:
        from ray._private.cluster_utils import Cluster
    cluster = Cluster()
    remote_nodes = []
    num_nodes = 3
    for i in range(num_nodes):
        remote_nodes.append(cluster.add_node(num_cpus=10))
        if len(remote_nodes) == 1:
            ray.init(address=cluster.address)
    mo.setup_cluster(address_to_resources=TEST_ADDRESS_TO_RESOURCES)
    yield
    RayActorDriver.stop_cluster()
    ray.shutdown()
    cluster.shutdown()
def ray_start_regular_shared():
    try:
        from ray.cluster_utils import Cluster
    except ModuleNotFoundError:
        from ray._private.cluster_utils import Cluster
    cluster = Cluster()
    remote_nodes = []
    num_nodes = 3
    for i in range(num_nodes):
        remote_nodes.append(cluster.add_node(num_cpus=10))
        if len(remote_nodes) == 1:
            # Connect to the multi-node cluster we just started, rather than
            # starting a separate local instance.
            ray.init(address=cluster.address)
    if hasattr(ray.util, "get_placement_group"):
        pg = ray.util.placement_group(
            name=pg_name, bundles=[{"CPU": n_process}], strategy="SPREAD")
        ray.get(pg.ready())
    yield
    ray.shutdown()
def test_cached_object(ray_start_cluster):
    config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
    })
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
    cluster.add_node(
        num_cpus=1, resources={"node2": 1}, object_store_memory=10**8)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x):
        return

    obj = large_object.options(resources={"node1": 1}).remote()
    ray.get(dependent_task.options(resources={"node2": 1}).remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
    assert wait_for_condition(
        lambda: not all(node["Alive"] for node in ray.nodes()), timeout=10)

    for _ in range(20):
        large_object.options(resources={"node2": 1}).remote()

    ray.get(dependent_task.remote(obj))
def ray_large_cluster():
    try:
        from ray.cluster_utils import Cluster
    except ModuleNotFoundError:
        from ray._private.cluster_utils import Cluster
    cluster = Cluster()
    remote_nodes = []
    num_nodes = 3
    for i in range(num_nodes):
        remote_nodes.append(cluster.add_node(num_cpus=10))
        if len(remote_nodes) == 1:
            ray.init(address=cluster.address)
    register_ray_serializers()
    try:
        yield
    finally:
        unregister_ray_serializers()
        Router.set_instance(None)
        RayServer.clear()
        ray.shutdown()
        cluster.shutdown()
        if "COV_CORE_SOURCE" in os.environ:
            # Remove this when
            # https://github.com/ray-project/ray/issues/16802 is fixed.
            subprocess.check_call(["ray", "stop", "--force"])
def _ray_start_cluster(**kwargs):
    init_kwargs = get_default_fixture_ray_kwargs()
    num_nodes = 0
    do_init = False
    # num_nodes & do_init are not arguments for ray.init, so delete them.
    if "num_nodes" in kwargs:
        num_nodes = kwargs["num_nodes"]
        del kwargs["num_nodes"]
    if "do_init" in kwargs:
        do_init = kwargs["do_init"]
        del kwargs["do_init"]
    elif num_nodes > 0:
        do_init = True
    init_kwargs.update(kwargs)
    cluster = Cluster()
    remote_nodes = []
    for _ in range(num_nodes):
        remote_nodes.append(cluster.add_node(**init_kwargs))
    if do_init:
        ray.init(address=cluster.address)

    yield cluster

    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
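# `_ray_start_cluster` is a generator, so it is typically wrapped in a pytest
# fixture that forwards indirect parameters into it. A minimal sketch of such
# a wrapper; the fixture name and the `request.param` plumbing are assumed,
# not taken from the original source.
import pytest


@pytest.fixture
def ray_start_cluster(request):
    param = getattr(request, "param", {})
    # Delegate setup and teardown to the generator above.
    for res in _ray_start_cluster(**param):
        yield res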
def test_fate_sharing(ray_start_cluster, use_actors, node_failure):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_period_milliseconds": 100,
    }
    cluster = Cluster()
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    # Node to place the parent actor.
    node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
    # Node to place the child actor.
    cluster.add_node(num_cpus=1, resources={"child": 1})
    cluster.wait_for_nodes()

    @ray.remote
    def sleep():
        time.sleep(1000)

    @ray.remote(resources={"child": 1})
    def probe():
        return

    # TODO(swang): This test does not pass if max_restarts > 0 for the
    # raylet codepath. Add this parameter once the GCS actor service is
    # enabled by default.
    @ray.remote
    class Actor(object):
        def __init__(self):
            return

        def start_child(self, use_actors):
            if use_actors:
                child = Actor.options(resources={"child": 1}).remote()
                ray.get(child.sleep.remote())
            else:
                ray.get(sleep.options(resources={"child": 1}).remote())

        def sleep(self):
            time.sleep(1000)

        def get_pid(self):
            return os.getpid()

    # Returns whether the "child" resource is available.
    def child_resource_available():
        p = probe.remote()
        ready, _ = ray.wait([p], timeout=1)
        return len(ready) > 0

    # Test fate sharing if the parent process dies.
    def test_process_failure(use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        pid = ray.get(a.get_pid.remote())
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled.
        wait_for_condition(lambda: not child_resource_available())
        # Kill the parent process.
        os.kill(pid, 9)
        wait_for_condition(child_resource_available)

    # Test fate sharing if the parent node dies.
    def test_node_failure(node_to_kill, use_actors):
        a = Actor.options(resources={"parent": 1}).remote()
        a.start_child.remote(use_actors=use_actors)
        # Wait for the child to be scheduled.
        wait_for_condition(lambda: not child_resource_available())
        # Kill the parent node.
        cluster.remove_node(node_to_kill, allow_graceful=False)
        node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
        wait_for_condition(child_resource_available)
        return node_to_kill

    if node_failure:
        test_node_failure(node_to_kill, use_actors)
    else:
        test_process_failure(use_actors)
object_store_memory = 10**8
num_nodes = 10

message = ("Make sure there is enough memory on this machine to run this "
           "workload. We divide the system memory by 2 to provide a buffer.")
assert (num_nodes * object_store_memory + num_redis_shards * redis_max_memory
        < ray.utils.get_system_memory() / 2), message

# Simulate a cluster on one machine.
cluster = Cluster()
for i in range(num_nodes):
    cluster.add_node(
        redis_port=6379 if i == 0 else None,
        num_redis_shards=num_redis_shards if i == 0 else None,
        num_cpus=2,
        num_gpus=0,
        resources={str(i): 2},
        object_store_memory=object_store_memory,
        redis_max_memory=redis_max_memory,
        webui_host="0.0.0.0")
ray.init(address=cluster.address)


# Run the workload.
@ray.remote
def f(*xs):
    return np.zeros(1024, dtype=np.uint8)


iteration = 0
ids = []
class RayExecutorQueueTest(unittest.TestCase):
    def setUp(self):
        self.cluster = Cluster(
            initialize_head=True,
            connect=True,
            head_node_args={
                "num_cpus": 1,
                "_system_config": {
                    "num_heartbeats_timeout": 10
                }
            })
        self.trial_executor = RayTrialExecutor(
            queue_trials=True, refresh_period=0)
        # Pytest doesn't play nicely with imports.
        _register_all()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()
        _register_all()  # re-register the evicted objects

    def testQueueTrial(self):
        """Tests that a trial can be queued even when the cluster does not
        currently have the resources to run it."""

        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        cpu_only = create_trial(1, 0)
        self.assertTrue(self.trial_executor.has_resources_for_trial(cpu_only))
        self.trial_executor.start_trial(cpu_only)

        gpu_only = create_trial(0, 1)
        self.assertTrue(self.trial_executor.has_resources_for_trial(gpu_only))

    def testHeadBlocking(self):
        # Once resource requests are deprecated, remove this test.
        os.environ["TUNE_PLACEMENT_GROUP_AUTO_DISABLED"] = "1"

        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        gpu_trial = create_trial(1, 1)
        self.assertTrue(self.trial_executor.has_resources_for_trial(gpu_trial))
        self.trial_executor.start_trial(gpu_trial)

        # TODO(rliaw): This behavior is probably undesirable, but right now
        # trials with different resource requirements are not often used.
        cpu_only_trial = create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources_for_trial(cpu_only_trial))

        self.cluster.add_node(num_cpus=1, num_gpus=1)
        self.cluster.wait_for_nodes()

        self.assertTrue(
            self.trial_executor.has_resources_for_trial(cpu_only_trial))
        self.trial_executor.start_trial(cpu_only_trial)

        cpu_only_trial2 = create_trial(1, 0)
        self.assertTrue(
            self.trial_executor.has_resources_for_trial(cpu_only_trial2))
        self.trial_executor.start_trial(cpu_only_trial2)

        cpu_only_trial3 = create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources_for_trial(cpu_only_trial3))
def test_shutdown():
    g = Cluster(initialize_head=False)
    node = g.add_node()
    node2 = g.add_node()
    g.shutdown()
    assert not any(n.any_processes_alive() for n in [node, node2])
def test_multiple_routers():
    cluster = Cluster()
    head_node = cluster.add_node()
    cluster.add_node()

    ray.init(head_node.address)
    node_ids = ray.state.node_ids()
    assert len(node_ids) == 2
    client = serve.start(http_port=8005)  # noqa: F841

    def get_proxy_names():
        proxy_names = []
        for node_id, _ in get_all_node_ids():
            proxy_names.append(
                format_actor_name(SERVE_PROXY_NAME, client._controller_name,
                                  node_id))
        return proxy_names

    wait_for_condition(lambda: len(get_proxy_names()) == 2)
    proxy_names = get_proxy_names()

    # Two actors should be started.
    def get_first_two_actors():
        try:
            ray.get_actor(proxy_names[0])
            ray.get_actor(proxy_names[1])
            return True
        except ValueError:
            return False

    wait_for_condition(get_first_two_actors)

    # Wait for the actors to come up.
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Kill one of the servers; the HTTP server should still function.
    ray.kill(ray.get_actor(get_proxy_names()[0]), no_restart=True)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Add a new node to the cluster. This should trigger a new router to get
    # started.
    new_node = cluster.add_node()

    wait_for_condition(lambda: len(get_proxy_names()) == 3)
    third_proxy = get_proxy_names()[2]

    def get_third_actor():
        try:
            ray.get_actor(third_proxy)
            return True
        # IndexError covers the case when cluster resources aren't updated
        # yet.
        except (IndexError, ValueError):
            return False

    wait_for_condition(get_third_actor)

    # Remove the newly-added node from the cluster. The corresponding actor
    # should be removed as well.
    cluster.remove_node(new_node)

    def third_actor_removed():
        try:
            ray.get_actor(third_proxy)
            return False
        except ValueError:
            return True

    # Check that the actor is gone and the HTTP server still functions.
    wait_for_condition(third_actor_removed)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Clean up the nodes (otherwise Ray will segfault).
    ray.shutdown()
    cluster.shutdown()
def run(args, parser):
    if args.config_file:
        with open(args.config_file) as f:
            experiments = yaml.safe_load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "keep_checkpoints_num": args.keep_checkpoints_num,
                "checkpoint_score_attr": args.checkpoint_score_attr,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
        }

    verbose = 1
    for exp in experiments.values():
        # Bazel makes it hard to find files specified in `args` (and `data`).
        # Look for them here.
        # NOTE: Some of our yaml files don't have a `config` section.
        if exp.get("config", {}).get("input") and \
                not os.path.exists(exp["config"]["input"]):
            # This script runs in the ray/rllib dir.
            rllib_dir = Path(__file__).parent
            input_file = rllib_dir.absolute().joinpath(exp["config"]["input"])
            exp["config"]["input"] = str(input_file)

        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")
        if args.eager:
            exp["config"]["eager"] = True
        if args.torch:
            exp["config"]["use_pytorch"] = True
        if args.v:
            exp["config"]["log_level"] = "INFO"
            verbose = 2
        if args.vv:
            exp["config"]["log_level"] = "DEBUG"
            verbose = 3
        if args.trace:
            if not exp["config"].get("eager"):
                raise ValueError("Must enable --eager to enable tracing.")
            exp["config"]["eager_tracing"] = True

        # Add custom callbacks.
        exp["config"]["callbacks"] = CustomCallbacks

    if args.ray_num_nodes:
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory,
                memory=args.ray_memory,
                redis_max_memory=args.ray_redis_max_memory)
        ray.init(address=cluster.address)
    else:
        ray.init(
            address=args.ray_address,
            object_store_memory=args.ray_object_store_memory,
            memory=args.ray_memory,
            redis_max_memory=args.ray_redis_max_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus)

    # NOTE: Attach custom loggers to each experiment.
    for exp in experiments.values():
        exp["loggers"] = make_loggers(args)

    # Launch training.
    run_experiments(
        experiments,
        scheduler=_make_scheduler(args),
        queue_trials=args.queue_trials,
        resume=args.resume,
        verbose=verbose,
        concurrent=True)
def run(args, parser):
    if args.config_file:
        with open(args.config_file) as f:
            experiments = yaml.safe_load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "checkpoint_at_end": args.checkpoint_at_end,
                "keep_checkpoints_num": args.keep_checkpoints_num,
                "checkpoint_score_attr": args.checkpoint_score_attr,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "sync_config": {
                    "upload_dir": args.upload_dir,
                },
            }
        }

    # Ray UI.
    if args.no_ray_ui:
        deprecation_warning(old="--no-ray-ui", new="--ray-ui", error=False)
        args.ray_ui = False

    verbose = 1
    for exp in experiments.values():
        # Bazel makes it hard to find files specified in `args` (and `data`).
        # Look for them here.
        # NOTE: Some of our yaml files don't have a `config` section.
        input_ = exp.get("config", {}).get("input")

        if input_ and input_ != "sampler":
            # This script runs in the ray/rllib dir.
            rllib_dir = Path(__file__).parent

            def patch_path(path):
                if isinstance(path, list):
                    return [patch_path(i) for i in path]
                elif isinstance(path, dict):
                    return {
                        patch_path(k): patch_path(v)
                        for k, v in path.items()
                    }
                elif isinstance(path, str):
                    if os.path.exists(path):
                        return path
                    else:
                        abs_path = str(rllib_dir.absolute().joinpath(path))
                        return abs_path if os.path.exists(abs_path) else path
                else:
                    return path

            exp["config"]["input"] = patch_path(input_)

        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")

        if args.torch:
            deprecation_warning("--torch", "--framework=torch")
            exp["config"]["framework"] = "torch"
        elif args.eager:
            deprecation_warning("--eager", "--framework=[tf2|tfe]")
            exp["config"]["framework"] = "tfe"
        elif args.framework is not None:
            exp["config"]["framework"] = args.framework

        if args.trace:
            if exp["config"]["framework"] not in ["tf2", "tfe"]:
                raise ValueError("Must enable --eager to enable tracing.")
            exp["config"]["eager_tracing"] = True

        if args.v:
            exp["config"]["log_level"] = "INFO"
            verbose = 3  # Print details on trial result
        if args.vv:
            exp["config"]["log_level"] = "DEBUG"
            verbose = 3  # Print details on trial result

    if args.ray_num_nodes:
        # Import this only here so that train.py also works with
        # older versions (and user doesn't use `--ray-num-nodes`).
        from ray.cluster_utils import Cluster
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory,
            )
        ray.init(address=cluster.address)
    else:
        ray.init(
            include_dashboard=args.ray_ui,
            address=args.ray_address,
            object_store_memory=args.ray_object_store_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus,
            local_mode=args.local_mode,
        )

    if IS_NOTEBOOK:
        progress_reporter = JupyterNotebookReporter(
            overwrite=verbose >= 3, print_intermediate_tables=verbose >= 1)
    else:
        progress_reporter = CLIReporter(
            print_intermediate_tables=verbose >= 1)

    run_experiments(
        experiments,
        scheduler=create_scheduler(args.scheduler, **args.scheduler_config),
        resume=args.resume,
        verbose=verbose,
        progress_reporter=progress_reporter,
        concurrent=True,
    )

    ray.shutdown()