Example #1
    def testValidateDefaultConfig(self):
        for config_path in CONFIG_PATHS:
            try:
                if "aws/example-multi-node-type.yaml" in config_path:
                    # aws tested in testValidateDefaultConfigAWSMultiNodeTypes.
                    continue
                if "local" in config_path:
                    # local tested in testValidateLocal
                    continue
                if "fake_multi_node" in config_path:
                    # not supported with ray up
                    continue
                if "kuberay" in config_path:
                    # not supported with ray up
                    continue
                with open(config_path) as f:
                    config = yaml.safe_load(f)
                config = prepare_config(config)
                if config["provider"]["type"] == "kubernetes":
                    KubernetesNodeProvider.fillout_available_node_types_resources(
                        config)
                validate_config(config)
            except Exception:
                logging.exception("")
                self.fail(
                    f"Config {config_path} did not pass validation test!")
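The snippet above assumes a test module that has already imported yaml and logging, pulled in the autoscaler config helpers, and built a CONFIG_PATHS list of example configs. A minimal sketch of that setup, assuming the Ray-internal import paths and the glob below (both are illustrative and have moved between Ray releases):

import glob
import logging
import os

import yaml

import ray
# Assumed Ray-internal locations; adjust to the installed Ray version.
from ray.autoscaler._private.util import prepare_config, validate_config
from ray.autoscaler._private._kubernetes.node_provider import \
    KubernetesNodeProvider

# Hypothetical: validate every example cluster config bundled with Ray.
CONFIG_PATHS = glob.glob(
    os.path.join(os.path.dirname(ray.__file__), "autoscaler", "*", "*.yaml"))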
Example #2
    def test_examples(self):

        # Validate terminate_node error handling
        provider = KubernetesNodeProvider({"namespace": NAMESPACE},
                                          "default_cluster_name")
        # 404 caught, no error
        provider.terminate_node("no-such-node")

        with tempfile.NamedTemporaryFile("w+") as example_cluster_file, \
                tempfile.NamedTemporaryFile("w+") as example_cluster2_file,\
                tempfile.NamedTemporaryFile("w+") as operator_file,\
                tempfile.NamedTemporaryFile("w+") as job_file:

            # Get paths to operator configs
            example_cluster_config_path = get_operator_config_path(
                "example_cluster.yaml")
            example_cluster2_config_path = get_operator_config_path(
                "example_cluster2.yaml")
            operator_config_path = get_operator_config_path(
                "operator_namespaced.yaml")
            job_path = os.path.join(RAY_PATH,
                                    "doc/kubernetes/job-example.yaml")

            # Load operator configs
            example_cluster_config = yaml.safe_load(
                open(example_cluster_config_path).read())
            example_cluster2_config = yaml.safe_load(
                open(example_cluster2_config_path).read())
            operator_config = list(
                yaml.safe_load_all(open(operator_config_path).read()))
            job_config = yaml.safe_load(open(job_path).read())

            # Fill image fields
            podTypes = example_cluster_config["spec"]["podTypes"]
            podTypes2 = example_cluster2_config["spec"]["podTypes"]
            pod_specs = (
                [operator_config[-1]["spec"]["template"]["spec"]] +
                [job_config["spec"]["template"]["spec"]] +
                [podType["podConfig"]["spec"] for podType in podTypes] +
                [podType["podConfig"]["spec"] for podType in podTypes2])
            for pod_spec in pod_specs:
                pod_spec["containers"][0]["image"] = IMAGE
                pod_spec["containers"][0]["imagePullPolicy"] = PULL_POLICY

            # Use a custom Redis port for one of the clusters.
            example_cluster_config["spec"]["headStartRayCommands"][1] += \
                " --port 6400"
            example_cluster_config["spec"]["workerStartRayCommands"][1] = \
                " ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6400"

            # Dump to temporary files
            yaml.dump(example_cluster_config, example_cluster_file)
            yaml.dump(example_cluster2_config, example_cluster2_file)
            yaml.dump(job_config, job_file)
            yaml.dump_all(operator_config, operator_file)
            files = [
                example_cluster_file, example_cluster2_file, operator_file
            ]
            for file in files:
                file.flush()

            # Start operator and two clusters
            print(">>>Starting operator and two clusters.")
            for file in files:
                cmd = f"kubectl -n {NAMESPACE} apply -f {file.name}"
                subprocess.check_call(cmd, shell=True)

            # Check that autoscaling respects minWorkers by waiting for
            # six pods in the namespace.
            print(">>>Waiting for pods to join clusters.")
            wait_for_pods(6)
            # Check that head services are present.
            print(">>>Checking that head services are present.")
            assert num_services() == 2

            # Check that logging output looks normal (two workers connected to
            # ray cluster example-cluster.)
            operator_pod = [pod for pod in pods() if "operator" in pod].pop()
            wait_for_logs(operator_pod)

            print(">>>Confirming 'Running' status for second cluster.")
            wait_for_status("example-cluster2", "Running")
            print(">>>Deleting second cluster's head.")
            head_pod = [pod for pod in pods() if "2-ray-head" in pod].pop()
            cd = f"kubectl -n {NAMESPACE} delete pod {head_pod}"
            subprocess.check_call(cd, shell=True)
            print(">>>Waiting for 'Error' status to register.")
            wait_for_status("example-cluster2", "Error")

            # Delete the second cluster
            print(">>>Deleting example-cluster2.")
            cmd = f"kubectl -n {NAMESPACE} delete -f "\
                f"{example_cluster2_file.name}"
            subprocess.check_call(cmd, shell=True)

            # Four pods remain
            print(">>>Checking that example-cluster2 pods are gone.")
            wait_for_pods(4)
            # Cluster 2 service has been garbage-collected.
            print(">>>Checking that deleted cluster's service is gone.")
            assert num_services() == 1

            # Check job submission
            print(">>>Submitting a job to test Ray client connection.")
            cmd = f"kubectl -n {NAMESPACE} create -f {job_file.name}"
            subprocess.check_call(cmd, shell=True)
            job_pod = [pod for pod in pods() if "job" in pod].pop()
            time.sleep(10)
            wait_for_job(job_pod)
            cmd = f"kubectl -n {NAMESPACE} delete jobs --all"
            subprocess.check_call(cmd, shell=True)

            # Delete operator pod. Deployment controller should recover it,
            # allowing the rest of this test to succeed.
            print(">>>Deleting operator pod to test operator restart.")
            cmd = f"kubectl -n {NAMESPACE} delete pod {operator_pod}"
            subprocess.check_call(cmd, shell=True)
            # Check that cluster updates work: increase minWorkers to 3
            # and check that one worker is created.
            print(">>>Updating cluster size.")
            example_cluster_edit = copy.deepcopy(example_cluster_config)
            example_cluster_edit["spec"]["podTypes"][1]["minWorkers"] = 3
            yaml.dump(example_cluster_edit, example_cluster_file)
            example_cluster_file.flush()
            cm = f"kubectl -n {NAMESPACE} apply -f {example_cluster_file.name}"
            subprocess.check_call(cm, shell=True)
            print(">>>Checking that new cluster size is respected.")
            wait_for_pods(5)

            # Delete the first cluster
            print(">>>Deleting example-cluster.")
            cmd = f"kubectl -n {NAMESPACE} delete -f "\
                f"{example_cluster_file.name}"
            subprocess.check_call(cmd, shell=True)

            # Only operator pod remains.
            print(">>>Checking that all Ray cluster pods are gone.")
            wait_for_pods(1)

            # Cluster 1 service has been garbage-collected.
            print(">>>Checking that all Ray cluster services are gone.")
            assert num_services() == 0
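
Examples #2–#4 rely on kubectl-based helpers such as pods(), wait_for_pods(), and num_services() that are defined elsewhere in the test module. A rough sketch of what such helpers could look like, assuming they shell out to kubectl in the test namespace (the NAMESPACE value, polling interval, and timeout are placeholders, not the actual harness):

import subprocess
import time

NAMESPACE = "test-namespace"  # hypothetical; set to the namespace under test


def pods():
    # Names of all pods in the test namespace.
    cmd = (f"kubectl -n {NAMESPACE} get pods --no-headers "
           "-o custom-columns=\":metadata.name\"")
    return subprocess.check_output(cmd, shell=True).decode().split()


def num_services():
    # Number of services in the test namespace.
    cmd = (f"kubectl -n {NAMESPACE} get services --no-headers "
           "-o custom-columns=\":metadata.name\"")
    return len(subprocess.check_output(cmd, shell=True).decode().split())


def wait_for_pods(n, name_filter="", timeout=300):
    # Poll until exactly n pods matching name_filter exist.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if len([pod for pod in pods() if name_filter in pod]) == n:
            return
        time.sleep(5)
    raise TimeoutError(f"Never saw {n} pods matching {name_filter!r}.")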
Example #3
    def test_basic(self):
        # Validate terminate_node error handling
        provider = KubernetesNodeProvider({"namespace": NAMESPACE},
                                          "default_cluster_name")
        # 404 caught, no error
        provider.terminate_node("no-such-node")

        with tempfile.NamedTemporaryFile("w+") as example_cluster_file, \
                tempfile.NamedTemporaryFile("w+") as example_cluster2_file, \
                tempfile.NamedTemporaryFile("w+") as operator_file, \
                tempfile.NamedTemporaryFile("w+") as job_file:

            # Get paths to operator configs
            example_cluster_config_path = get_component_config_path(
                "example_cluster.yaml")
            operator_config_path = get_component_config_path(
                "operator_namespaced.yaml")
            job_path = os.path.join(RAY_PATH,
                                    "doc/kubernetes/job-example.yaml")

            # Load operator configs
            example_cluster_config = yaml.safe_load(
                open(example_cluster_config_path).read())
            example_cluster2_config = copy.deepcopy(example_cluster_config)
            # One worker for the second config
            example_cluster2_config["spec"]["podTypes"][1]["minWorkers"] = 1
            example_cluster2_config["metadata"]["name"] = "example-cluster2"
            operator_config = list(
                yaml.safe_load_all(open(operator_config_path).read()))
            job_config = yaml.safe_load(open(job_path).read())

            # Fill image fields
            podTypes = example_cluster_config["spec"]["podTypes"]
            podTypes2 = example_cluster2_config["spec"]["podTypes"]
            pod_specs = (
                [operator_config[-1]["spec"]["template"]["spec"]] +
                [job_config["spec"]["template"]["spec"]] +
                [podType["podConfig"]["spec"] for podType in podTypes] +
                [podType["podConfig"]["spec"] for podType in podTypes2])
            for pod_spec in pod_specs:
                pod_spec["containers"][0]["image"] = IMAGE
                pod_spec["containers"][0]["imagePullPolicy"] = PULL_POLICY

            # Use a custom Redis port for one of the clusters.
            example_cluster_config["spec"]["headStartRayCommands"][
                1] += " --port 6400"
            example_cluster_config["spec"]["workerStartRayCommands"][
                1] = " ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6400"

            # Dump to temporary files
            yaml.dump(example_cluster_config, example_cluster_file)
            yaml.dump(example_cluster2_config, example_cluster2_file)
            yaml.dump(job_config, job_file)
            yaml.dump_all(operator_config, operator_file)
            files = [
                example_cluster_file, example_cluster2_file, operator_file
            ]
            for file in files:
                file.flush()

            # Start operator and two clusters
            print("\n>>>Starting operator and two clusters.")
            for file in files:
                cmd = f"kubectl -n {NAMESPACE} apply -f {file.name}"
                subprocess.check_call(cmd, shell=True)

            # Check that autoscaling respects minWorkers by waiting for
            # six pods in the namespace.
            print(">>>Waiting for pods to join clusters.")
            wait_for_pods(6)
            # Check that head services are present.
            print(">>>Checking that head services are present.")
            wait_for_services(2)

            # Check that logging output looks normal (two workers connected to
            # ray cluster example-cluster.)
            operator_pod = [pod for pod in pods() if "operator" in pod].pop()
            wait_for_logs(operator_pod)

            print(">>>Checking that Ray client connection is uninterrupted by"
                  " operator restart.")
            with client_connect_to_k8s():

                @ray.remote
                class Test:
                    def method(self):
                        return "success"

                actor = Test.remote()
                print(">>>Restarting operator pod.")
                cmd = f"kubectl -n {NAMESPACE} delete pod {operator_pod}"
                subprocess.check_call(cmd, shell=True)
                wait_for_pods(6)
                operator_pod = [pod for pod in pods()
                                if "operator" in pod].pop()
                wait_for_pod_status(operator_pod, "Running")
                time.sleep(5)
                print(">>>Confirming Ray is uninterrupted.")
                assert ray.get(actor.method.remote()) == "success"

            # Delete head node of the first cluster. Recovery logic should
            # allow the rest of the test to pass.
            print(">>>Deleting cluster's head to test recovery.")
            head_pod = [pod for pod in pods() if "r-ray-head" in pod].pop()
            cd = f"kubectl -n {NAMESPACE} delete pod {head_pod}"
            subprocess.check_call(cd, shell=True)
            print(">>>Confirming recovery.")
            # Status marked "Running".
            wait_for_status("example-cluster", "Running")
            # Head pod recovered.
            wait_for_pods(6)

            stat_cmd = "kubectl -n {namespace} exec {head_pod} -- ray status"
            print(">>>Waiting for success of `ray status` on recovered head.")
            wait_for_command_to_succeed_on_head(stat_cmd,
                                                head_filter="r-ray-head",
                                                namespace=NAMESPACE)
            print(">>>Stopping ray on the head node to test recovery.")
            stop_cmd = "kubectl -n {namespace} exec {head_pod} -- ray stop"
            wait_for_command_to_succeed_on_head(stop_cmd,
                                                head_filter="r-ray-head",
                                                namespace=NAMESPACE)
            # `ray status` should fail when run right after `ray stop`.
            head_pod = [pod for pod in pods() if "r-ray-head" in pod].pop()
            with pytest.raises(subprocess.CalledProcessError):
                subprocess.check_call(
                    stat_cmd.format(namespace=NAMESPACE, head_pod=head_pod),
                    shell=True)
            print(">>>Waiting for success of `ray status` on recovered head.")
            wait_for_command_to_succeed_on_head(stat_cmd,
                                                head_filter="r-ray-head",
                                                namespace=NAMESPACE)

            # Delete the second cluster
            print(">>>Deleting example-cluster2.")
            cmd = (f"kubectl -n {NAMESPACE} delete -f "
                   f"{example_cluster2_file.name}")
            subprocess.check_call(cmd, shell=True)

            # Four pods remain
            print(">>>Checking that example-cluster2 pods are gone.")
            wait_for_pods(4)
            # Cluster 2 service has been garbage-collected.
            print(">>>Checking that deleted cluster's service is gone.")
            wait_for_services(1)

            # Check job submission
            print(">>>Submitting a job to test Ray client connection.")
            cmd = f"kubectl -n {NAMESPACE} create -f {job_file.name}"
            subprocess.check_call(cmd, shell=True)
            wait_for_pods(1, name_filter="job")
            job_pod = [pod for pod in pods() if "job" in pod].pop()
            time.sleep(10)
            wait_for_job(job_pod)
            cmd = f"kubectl -n {NAMESPACE} delete jobs --all"
            subprocess.check_call(cmd, shell=True)

            # Check that cluster updates work: increase minWorkers to 3
            # and check that one worker is created.
            print(">>>Updating cluster size.")
            example_cluster_edit = copy.deepcopy(example_cluster_config)
            example_cluster_edit["spec"]["podTypes"][1]["minWorkers"] = 3
            yaml.dump(example_cluster_edit, example_cluster_file)
            example_cluster_file.flush()
            cm = f"kubectl -n {NAMESPACE} apply -f {example_cluster_file.name}"
            subprocess.check_call(cm, shell=True)
            print(">>>Checking that new cluster size is respected.")
            wait_for_pods(5)

            # Delete the first cluster
            print(">>>Deleting example-cluster.")
            cmd = (f"kubectl -n {NAMESPACE} delete -f "
                   f"{example_cluster_file.name}")
            subprocess.check_call(cmd, shell=True)

            # Only operator pod remains.
            print(">>>Checking that all Ray cluster pods are gone.")
            wait_for_pods(1)

            # Cluster 1 service has been garbage-collected.
            print(">>>Checking that all Ray cluster services are gone.")
            wait_for_services(0)

            # Verify that cluster deletion earlier in this test did not break
            # the operator.
            print(">>>Checking cluster creation again.")
            for file in [example_cluster_file, example_cluster2_file]:
                cmd = f"kubectl -n {NAMESPACE} apply -f {file.name}"
                subprocess.check_call(cmd, shell=True)
            wait_for_pods(7)
            print(">>>Checking cluster deletion again.")
            for file in [example_cluster_file, example_cluster2_file]:
                cmd = f"kubectl -n {NAMESPACE} delete -f {file.name}"
                subprocess.check_call(cmd, shell=True)
            wait_for_pods(1)
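
Example #3 also opens a Ray client connection with a client_connect_to_k8s() context manager that is not shown. A plausible sketch, assuming the head node's service exposes the Ray client server on port 10001 and that a local kubectl port-forward is acceptable in the test environment (the service name, port, and sleep are assumptions):

import contextlib
import subprocess
import time

import ray

NAMESPACE = "test-namespace"  # hypothetical namespace used by the tests


@contextlib.contextmanager
def client_connect_to_k8s(service="example-cluster-ray-head", port=10001):
    # Forward the head service's Ray client port to localhost, then connect
    # with the Ray client API. Both the service name and port are assumptions.
    proc = subprocess.Popen([
        "kubectl", "-n", NAMESPACE, "port-forward", f"service/{service}",
        f"{port}:{port}"
    ])
    try:
        time.sleep(5)  # crude wait for the port-forward to come up
        ray.init(f"ray://127.0.0.1:{port}")
        yield
    finally:
        ray.shutdown()
        proc.terminate()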
Example #4
    def test_examples(self):

        # Validate terminate_node error handling
        provider = KubernetesNodeProvider({
            "namespace": NAMESPACE
        }, "default_cluster_name")
        # 404 caught, no error
        provider.terminate_node("no-such-node")

        with tempfile.NamedTemporaryFile("w+") as example_cluster_file, \
                tempfile.NamedTemporaryFile("w+") as example_cluster2_file,\
                tempfile.NamedTemporaryFile("w+") as operator_file,\
                tempfile.NamedTemporaryFile("w+") as job_file:

            # Get paths to operator configs
            example_cluster_config_path = get_operator_config_path(
                "example_cluster.yaml")
            example_cluster2_config_path = get_operator_config_path(
                "example_cluster2.yaml")
            operator_config_path = get_operator_config_path("operator.yaml")
            job_path = os.path.join(RAY_PATH,
                                    "doc/kubernetes/job-example.yaml")

            # Load operator configs
            example_cluster_config = yaml.safe_load(
                open(example_cluster_config_path).read())
            example_cluster2_config = yaml.safe_load(
                open(example_cluster2_config_path).read())
            operator_config = list(
                yaml.safe_load_all(open(operator_config_path).read()))
            job_config = yaml.safe_load(open(job_path).read())

            # Fill image fields
            podTypes = example_cluster_config["spec"]["podTypes"]
            podTypes2 = example_cluster2_config["spec"]["podTypes"]
            pod_specs = (
                [operator_config[-1]["spec"]] +
                [job_config["spec"]["template"]["spec"]] +
                [podType["podConfig"]["spec"] for podType in podTypes] +
                [podType["podConfig"]["spec"] for podType in podTypes2])
            for pod_spec in pod_specs:
                pod_spec["containers"][0]["image"] = IMAGE
                pod_spec["containers"][0]["imagePullPolicy"] = PULL_POLICY

            # Dump to temporary files
            yaml.dump(example_cluster_config, example_cluster_file)
            yaml.dump(example_cluster2_config, example_cluster2_file)
            yaml.dump(job_config, job_file)
            yaml.dump_all(operator_config, operator_file)
            files = [
                example_cluster_file, example_cluster2_file, operator_file
            ]
            for file in files:
                file.flush()

            # Start operator and two clusters
            print(">>>Starting operator and two clusters.")
            for file in files:
                cmd = f"kubectl -n {NAMESPACE} apply -f {file.name}"
                subprocess.check_call(cmd, shell=True)

            # Check that autoscaling respects minWorkers by waiting for
            # six pods in the namespace.
            print(">>>Waiting for pods to join clusters.")
            wait_for_pods(6)

            # Check that logging output looks normal (two workers connected to
            # ray cluster example-cluster.)
            print(">>>Checking monitor logs for head and workers.")
            wait_for_logs()

            # Delete the second cluster
            print(">>>Deleting example-cluster2.")
            cmd = f"kubectl -n {NAMESPACE} delete -f "\
                f"{example_cluster2_file.name}"
            subprocess.check_call(cmd, shell=True)

            # Four pods remain
            print(">>>Checking that example-cluster2 pods are gone.")
            wait_for_pods(4)

            # Check job submission
            print(">>>Submitting a job to test Ray client connection.")
            cmd = f"kubectl -n {NAMESPACE} create -f {job_file.name}"
            subprocess.check_call(cmd, shell=True)

            cmd = f"kubectl -n {NAMESPACE} get pods --no-headers -o"\
                " custom-columns=\":metadata.name\""
            pods = subprocess.check_output(cmd, shell=True).decode().split()
            job_pod = [pod for pod in pods if "job" in pod].pop()
            time.sleep(10)
            wait_for_job(job_pod)
            cmd = f"kubectl -n {NAMESPACE} delete jobs --all"
            subprocess.check_call(cmd, shell=True)

            # Check that cluster updates work: increase minWorkers to 3
            # and check that one worker is created.
            print(">>>Updating cluster size.")
            example_cluster_edit = copy.deepcopy(example_cluster_config)
            example_cluster_edit["spec"]["podTypes"][1]["minWorkers"] = 3
            yaml.dump(example_cluster_edit, example_cluster_file)
            example_cluster_file.flush()
            cm = f"kubectl -n {NAMESPACE} apply -f {example_cluster_file.name}"
            subprocess.check_call(cm, shell=True)
            print(">>>Checking that new cluster size is respected.")
            wait_for_pods(5)

            # Delete the first cluster
            print(">>>Deleting example-cluster.")
            cmd = f"kubectl -n {NAMESPACE} delete -f "\
                f"{example_cluster_file.name}"
            subprocess.check_call(cmd, shell=True)

            # Only operator pod remains.
            print(">>>Checking that all Ray cluster pods are gone.")
            wait_for_pods(1)
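
Examples #2–#4 finish the job check with wait_for_job(job_pod), another helper that is not shown. One possible implementation, assuming success is signalled by the job pod reaching the "Succeeded" phase (the real harness might instead grep the pod's logs):

import subprocess
import time

NAMESPACE = "test-namespace"  # hypothetical


def wait_for_job(job_pod, timeout=300):
    # Poll the job pod's phase until it succeeds, fails, or times out.
    cmd = (f"kubectl -n {NAMESPACE} get pod {job_pod} --no-headers "
           "-o custom-columns=\":status.phase\"")
    deadline = time.time() + timeout
    while time.time() < deadline:
        phase = subprocess.check_output(cmd, shell=True).decode().strip()
        if phase == "Succeeded":
            return
        if phase == "Failed":
            raise RuntimeError(f"Job pod {job_pod} failed.")
        time.sleep(5)
    raise TimeoutError(f"Job pod {job_pod} did not finish in time.")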
Example #5
    def test_up_and_down(self):
        """(1) Runs 'ray up' with a Kubernetes config that specifies
        min_workers=1.
        (2) Runs 'ray exec' to read monitor logs and confirm that worker and
        head are connected.
        (4) Rsyncs files up and down.
        (3) Runs 'ray down' and confirms that the cluster is gone."""

        # get path to config
        config = get_config()

        # get a node provider
        provider_config = config["provider"]
        cluster_name = config["cluster_name"]
        self.provider = KubernetesNodeProvider(provider_config, cluster_name)

        # ray up
        sdk.create_or_update_cluster(config, no_config_cache=True)

        # Check for two pods (worker and head).
        while True:
            nodes = self.provider.non_terminated_nodes({})
            if len(nodes) == 2:
                break
            else:
                time.sleep(1)

        # Read logs with ray exec and check that worker and head are connected.
        # (Since the config yaml is legacy-style, we check for
        # ray-legacy-*-node_type.)
        log_cmd = "tail -n 100 /tmp/ray/session_latest/logs/monitor*"
        while True:
            monitor_output = sdk.run_on_cluster(config,
                                                cmd=log_cmd,
                                                with_output=True).decode()
            if ("head-node" in monitor_output
                    and "worker-node" in monitor_output):
                break
            else:
                time.sleep(1)

        # rsync
        with tempfile.NamedTemporaryFile("w") as test_file:
            test_file.write("test")
            test_file.flush()
            sdk.rsync(config,
                      source=test_file.name,
                      target="~/in_pod",
                      down=False)
        with tempfile.NamedTemporaryFile() as test_file:
            sdk.rsync(config,
                      target=test_file.name,
                      source="~/in_pod",
                      down=True)
            contents = open(test_file.name).read()
        assert contents == "test"

        # ray down
        sdk.teardown_cluster(config)

        # Check that there are no pods left in namespace ray to confirm that
        # the cluster is gone.
        while True:
            nodes = self.provider.non_terminated_nodes({})
            if len(nodes) == 0:
                break
            else:
                time.sleep(1)
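
Example #5 drives the cluster through ray.autoscaler.sdk and a get_config() helper that is not shown. A minimal sketch of the assumed setup, loading one of Ray's bundled legacy-style Kubernetes example configs and pinning the cluster to a single worker (the config path and field names are assumptions about the harness, not part of the snippet above):

import os

import yaml

import ray

# Hypothetical path to a legacy-style Kubernetes example config bundled
# with the installed Ray package.
CONFIG_PATH = os.path.join(
    os.path.dirname(ray.__file__), "autoscaler", "kubernetes",
    "example-full.yaml")


def get_config():
    # Load the example config and force exactly one worker.
    with open(CONFIG_PATH) as f:
        config = yaml.safe_load(f)
    config["min_workers"] = 1
    config["max_workers"] = 1
    return config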