import copy
import subprocess
import tempfile
import time

import ray
import yaml

# IMAGE, PULL_POLICY, and NAMESPACE, along with the helpers
# get_operator_config_path, wait_for_pods, wait_for_operator, and
# client_connect_to_k8s, are defined elsewhere in the test module.


def submit_scaling_job(num_tasks):
    @ray.remote(num_cpus=1)
    def f(i):
        time.sleep(60)
        return i

    print(">>>Submitting tasks with Ray client.")
    futures = [f.remote(i) for i in range(num_tasks)]

    print(">>>Verifying scale-up.")
    # Expect as many pods as tasks.
    # (each Ray pod has 1 CPU)
    wait_for_pods(num_tasks)

    print(">>>Waiting for task output.")
    task_output = ray.get(futures, timeout=360)
    assert task_output == list(range(num_tasks)), \
        "Tasks did not complete with expected output."

# Variant of submit_scaling_job used by the port-forwarding version of
# test_scaling below; it opens the Ray client connection itself.
def submit_scaling_job(client_port, num_tasks):
    @ray.remote(num_cpus=1)
    def f(i):
        time.sleep(60)
        return i

    print(">>>Submitting tasks with Ray client.")
    ray.util.connect(f"127.0.0.1:{client_port}")
    futures = [f.remote(i) for i in range(num_tasks)]

    print(">>>Verifying scale-up.")
    # Operator pod plus number of tasks
    # (each Ray pod has 1 CPU).
    wait_for_pods(num_tasks + 1)

    print(">>>Waiting for task output.")
    task_output = ray.get(futures, timeout=360)
    assert task_output == list(range(num_tasks)), \
        "Tasks did not complete with expected output."
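
# wait_for_pods, used by both versions of submit_scaling_job and of
# test_scaling, is defined elsewhere in the test module. A minimal sketch
# follows, assuming kubectl access; the "_sketch" name, timeout, and
# polling interval are illustrative assumptions, not the suite's actual
# helper.
def wait_for_pods_sketch(n, namespace=NAMESPACE, timeout=600):
    """Poll kubectl until exactly n pods exist in the namespace."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        cmd = f"kubectl -n {namespace} get pods --no-headers"
        out = subprocess.check_output(cmd, shell=True).decode()
        # Count non-empty lines; one line per pod.
        num_pods = len([line for line in out.splitlines() if line.strip()])
        if num_pods == n:
            return
        time.sleep(5)
    raise TimeoutError(f"Expected {n} pods in namespace {namespace}.")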

def test_scaling(self):
    with tempfile.NamedTemporaryFile("w+") as example_cluster_file, \
            tempfile.NamedTemporaryFile("w+") as example_cluster_file2, \
            tempfile.NamedTemporaryFile("w+") as operator_file:

        example_cluster_config_path = get_operator_config_path(
            "example_cluster.yaml")
        operator_config_path = get_operator_config_path(
            "operator_cluster_scoped.yaml")
        crd_path = get_operator_config_path("cluster_crd.yaml")

        operator_config = list(
            yaml.safe_load_all(open(operator_config_path).read()))
        example_cluster_config = yaml.safe_load(
            open(example_cluster_config_path).read())

        # Set image and pull policy
        podTypes = example_cluster_config["spec"]["podTypes"]
        pod_specs = [operator_config[-1]["spec"]["template"]["spec"]] + [
            podType["podConfig"]["spec"] for podType in podTypes
        ]
        for pod_spec in pod_specs:
            pod_spec["containers"][0]["image"] = IMAGE
            pod_spec["containers"][0]["imagePullPolicy"] = PULL_POLICY

        # Config set-up for this test.
        example_cluster_config["spec"]["maxWorkers"] = 100
        example_cluster_config["spec"]["idleTimeoutMinutes"] = 1
        worker_type = podTypes[1]
        # Make sure we have the right type
        assert "worker" in worker_type["name"]
        worker_type["maxWorkers"] = 100
        # Key for the first part of this test:
        worker_type["minWorkers"] = 30

        # Config for a small cluster with the same name to be launched
        # in another namespace.
        example_cluster_config2 = copy.deepcopy(example_cluster_config)
        example_cluster_config2["spec"]["podTypes"][1]["minWorkers"] = 1

        # Test overriding default client port.
        example_cluster_config["spec"]["headServicePorts"] = [{
            "name": "client",
            "port": 10002,
            "targetPort": 10001
        }]

        yaml.dump(example_cluster_config, example_cluster_file)
        yaml.dump(example_cluster_config2, example_cluster_file2)
        yaml.dump_all(operator_config, operator_file)
        # Flush all three files so kubectl sees their contents.
        files = [
            example_cluster_file, example_cluster_file2, operator_file
        ]
        for file in files:
            file.flush()

        print(">>>Creating operator.")
        cmd = f"kubectl apply -f {operator_file.name}"
        subprocess.check_call(cmd, shell=True)

        # Test creating operator before CRD.
        print(">>>Waiting for Ray operator to enter running state.")
        wait_for_operator()

        print(">>>Creating RayCluster CRD.")
        cmd = f"kubectl apply -f {crd_path}"
        subprocess.check_call(cmd, shell=True)
        # Takes a bit of time for CRD to register.
        time.sleep(10)

        # Start a 30-pod cluster.
        print(">>>Starting a cluster.")
        cd = f"kubectl -n {NAMESPACE} apply -f {example_cluster_file.name}"
        subprocess.check_call(cd, shell=True)

        print(">>>Starting a cluster with same name in another namespace")
        # Assumes a namespace called {NAMESPACE}2 has been created.
        cd = f"kubectl -n {NAMESPACE}2 apply -f "\
            f"{example_cluster_file2.name}"
        subprocess.check_call(cd, shell=True)

        # Check that autoscaling respects minWorkers by waiting for
        # 31 pods (30 workers plus a head) in one namespace and 2 pods
        # in the other.
        print(">>>Waiting for pods to join cluster.")
        wait_for_pods(31)
        wait_for_pods(2, namespace=f"{NAMESPACE}2")

        # Check scale-down.
        print(">>>Decreasing min workers to 0.")
        example_cluster_edit = copy.deepcopy(example_cluster_config)
        # Set minWorkers to 0:
        example_cluster_edit["spec"]["podTypes"][1]["minWorkers"] = 0
        yaml.dump(example_cluster_edit, example_cluster_file)
        example_cluster_file.flush()
        cm = f"kubectl -n {NAMESPACE} apply -f {example_cluster_file.name}"
        subprocess.check_call(cm, shell=True)

        print(">>>Sleeping for a minute while workers time-out.")
        time.sleep(60)
        print(">>>Verifying scale-down.")
        wait_for_pods(1)

        with client_connect_to_k8s(port="10002"):
            # Test scale up and scale down after task submission.
            submit_scaling_job(num_tasks=15)

        print(">>>Sleeping for a minute while workers time-out.")
        time.sleep(60)
        print(">>>Verifying scale-down.")
        wait_for_pods(1)
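
# client_connect_to_k8s, entered above with port="10002", is not defined
# in this section. A hedged sketch of such a context manager follows,
# assuming it port-forwards the head service and opens a Ray client
# connection; the service name, fixed sleep, and disconnect handling are
# assumptions rather than the original helper's implementation.
import contextlib


@contextlib.contextmanager
def client_connect_to_k8s_sketch(port="10001"):
    command = (f"kubectl -n {NAMESPACE} port-forward"
               f" service/example-cluster-ray-head {port}:{port}").split()
    proc = subprocess.Popen(command)
    try:
        # Give the port-forwarding connection time to be established.
        time.sleep(10)
        ray.util.connect(f"127.0.0.1:{port}")
        yield proc
    finally:
        ray.util.disconnect()
        proc.kill()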

# Variant of test_scaling that deploys the namespaced operator
# ("operator.yaml") and reaches the head node with a manual kubectl
# port-forward instead of client_connect_to_k8s.
def test_scaling(self):
    with tempfile.NamedTemporaryFile("w+") as example_cluster_file, \
            tempfile.NamedTemporaryFile("w+") as operator_file:

        example_cluster_config_path = get_operator_config_path(
            "example_cluster.yaml")
        operator_config_path = get_operator_config_path("operator.yaml")

        operator_config = list(
            yaml.safe_load_all(open(operator_config_path).read()))
        example_cluster_config = yaml.safe_load(
            open(example_cluster_config_path).read())

        # Set image and pull policy
        podTypes = example_cluster_config["spec"]["podTypes"]
        pod_specs = [operator_config[-1]["spec"]] + [
            podType["podConfig"]["spec"] for podType in podTypes
        ]
        for pod_spec in pod_specs:
            pod_spec["containers"][0]["image"] = IMAGE
            pod_spec["containers"][0]["imagePullPolicy"] = PULL_POLICY

        # Config set-up for this test.
        example_cluster_config["spec"]["maxWorkers"] = 100
        example_cluster_config["spec"]["idleTimeoutMinutes"] = 1
        worker_type = podTypes[1]
        # Make sure we have the right type
        assert "worker" in worker_type["name"]
        worker_type["maxWorkers"] = 100
        # Key for the first part of this test:
        worker_type["minWorkers"] = 30

        yaml.dump(example_cluster_config, example_cluster_file)
        yaml.dump_all(operator_config, operator_file)
        files = [example_cluster_file, operator_file]
        for file in files:
            file.flush()

        # Start operator and a 30-pod-cluster.
        print(">>>Starting operator and a cluster.")
        for file in files:
            cmd = f"kubectl -n {NAMESPACE} apply -f {file.name}"
            subprocess.check_call(cmd, shell=True)

        # Check that autoscaling respects minWorkers by waiting for
        # 32 pods in the namespace (operator, head, and 30 workers).
        print(">>>Waiting for pods to join cluster.")
        wait_for_pods(32)

        # Check scale-down.
        print(">>>Decreasing min workers to 0.")
        example_cluster_edit = copy.deepcopy(example_cluster_config)
        # Set minWorkers to 0:
        example_cluster_edit["spec"]["podTypes"][1]["minWorkers"] = 0
        yaml.dump(example_cluster_edit, example_cluster_file)
        example_cluster_file.flush()
        cm = f"kubectl -n {NAMESPACE} apply -f {example_cluster_file.name}"
        subprocess.check_call(cm, shell=True)

        print(">>>Sleeping for a minute while workers time-out.")
        time.sleep(60)
        print(">>>Verifying scale-down.")
        wait_for_pods(2)

        # Test scale up and scale down after task submission.
        command = f"kubectl -n {NAMESPACE}"\
            " port-forward service/example-cluster-ray-head 10001:10001"
        command = command.split()
        print(">>>Port-forwarding head service.")
        self.proc = subprocess.Popen(command)
        try:
            # Wait a bit for the port-forwarding connection to be
            # established.
            time.sleep(10)
            # Check that job submission works
            submit_scaling_job(client_port="10001", num_tasks=15)
            # Clean up
            self.proc.kill()
        except Exception:
            # Clean up on failure
            self.proc.kill()
            raise

        print(">>>Sleeping for a minute while workers time-out.")
        time.sleep(60)
        print(">>>Verifying scale-down.")
        wait_for_pods(2)
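
# The port-forward cleanup above calls self.proc.kill() on both the
# success and failure paths. A try/finally form, shown here as an
# alternative sketch rather than the original test's code, removes the
# duplication:
def run_job_with_port_forward_sketch(namespace, client_port="10001"):
    command = (f"kubectl -n {namespace} port-forward"
               f" service/example-cluster-ray-head"
               f" {client_port}:{client_port}").split()
    proc = subprocess.Popen(command)
    try:
        # Wait a bit for the port-forwarding connection to be established.
        time.sleep(10)
        submit_scaling_job(client_port=client_port, num_tasks=15)
    finally:
        # Runs on success and on failure alike.
        proc.kill()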