import sys
import traceback

import yaml


def __do_spawn(self):
    try:
        create_or_update_cluster(
            self.config_file,
            no_restart=False,
            restart_only=False,
            no_config_cache=False,
        )
        # Re-load the config, as create_or_update_cluster() modifies it.
        with open(self.config_file) as inp:
            self.config = yaml.safe_load(inp.read())
        self.ready = True
    except BaseException as ex:
        # Stash the failure on the spawner so the launching thread can
        # re-raise it instead of silently losing the error.
        self.spawner.exc = CannotSpawnCluster(
            "Cannot spawn cluster", cause=ex, traceback=traceback.format_exc()
        )
        if not self.spawner.silent:
            sys.stderr.write(f"Cannot spawn cluster:\n{traceback.format_exc()}\n")
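
# CannotSpawnCluster is only referenced above, not defined. A minimal sketch
# of an exception matching that call site (the `cause` and `traceback`
# keyword names come from the call above; everything else is an assumption,
# not necessarily the real definition):
class CannotSpawnCluster(Exception):
    """Carries the original error and its formatted traceback across the
    spawner-thread boundary so the launching thread can re-raise it."""

    def __init__(self, message, cause=None, traceback=None):
        super().__init__(message)
        self.cause = cause          # the original exception instance
        self.traceback = traceback  # traceback.format_exc() at failure time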
import tempfile
import time


def test_up_and_down(self):
    """(1) Runs 'ray up' with a Kubernetes config that specifies
    min_workers=1.
    (2) Runs 'ray exec' to read monitor logs and confirm that worker and
    head are connected.
    (3) Rsyncs files up and down.
    (4) Runs 'ray down' and confirms that the cluster is gone."""
    # Get path to config.
    config = get_config()

    # Get a node provider.
    provider_config = config["provider"]
    cluster_name = config["cluster_name"]
    self.provider = KubernetesNodeProvider(provider_config, cluster_name)

    # ray up
    sdk.create_or_update_cluster(config, no_config_cache=True)

    # Check for two pods (worker and head).
    while True:
        nodes = self.provider.non_terminated_nodes({})
        if len(nodes) == 2:
            break
        else:
            time.sleep(1)

    # Read logs with ray exec and check that worker and head are connected.
    # (The config yaml is legacy-style, so the node types are named
    # ray-legacy-*-node_type; checking for the "head-node" and "worker-node"
    # substrings covers them.)
    log_cmd = "tail -n 100 /tmp/ray/session_latest/logs/monitor*"
    while True:
        monitor_output = sdk.run_on_cluster(
            config, cmd=log_cmd, with_output=True
        ).decode()
        if "head-node" in monitor_output and "worker-node" in monitor_output:
            break
        else:
            time.sleep(1)

    # rsync
    with tempfile.NamedTemporaryFile("w") as test_file:
        test_file.write("test")
        test_file.flush()
        sdk.rsync(config, source=test_file.name, target="~/in_pod", down=False)
    with tempfile.NamedTemporaryFile() as test_file:
        sdk.rsync(config, target=test_file.name, source="~/in_pod", down=True)
        with open(test_file.name) as f:
            contents = f.read()
        assert contents == "test"

    # ray down
    sdk.teardown_cluster(config)

    # Check that there are no pods left in namespace ray to confirm that
    # the cluster is gone.
    while True:
        nodes = self.provider.non_terminated_nodes({})
        if len(nodes) == 0:
            break
        else:
            time.sleep(1)
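
# The three `while True` polls in test_up_and_down never time out, so a
# cluster that fails to converge hangs the test run until CI kills it. A
# small helper like this (hypothetical, not part of the Ray SDK) would
# bound each wait:
import time


def wait_for(condition, timeout=180, interval=1):
    """Poll `condition` until it returns a truthy value, failing loudly
    after `timeout` seconds instead of spinning forever."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        result = condition()
        if result:
            return result
        time.sleep(interval)
    raise TimeoutError(f"Condition not met within {timeout} seconds")


# Example: the first poll above could become
#     wait_for(lambda: len(self.provider.non_terminated_nodes({})) == 2)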