示例#1
0
    def testFindCreateClusterEnvSucceed(self):
        """Create a cluster env when the search returns no matching name.

        The search response only holds an environment named "no_match", and
        the create call succeeds, so each API is expected to be hit once.
        """
        manager = self.cluster_manager
        sdk = self.sdk

        manager.set_cluster_env(self.cluster_env)
        self.assertTrue(manager.cluster_env_name)
        self.assertFalse(manager.cluster_env_id)

        # Search result: a single environment under a different name.
        unrelated_env = APIDict(name="no_match", id="wrong")
        sdk.returns["search_cluster_environments"] = APIDict(
            metadata=APIDict(next_paging_token=None),
            results=[unrelated_env],
        )
        created_env = APIDict(id="correct")
        sdk.returns["create_cluster_environment"] = APIDict(result=created_env)

        manager.create_cluster_env()

        # Search and create were each called exactly once (no retry).
        self.assertEqual(manager.cluster_env_id, "correct")
        calls = sdk.call_counter
        self.assertEqual(calls["search_cluster_environments"], 1)
        self.assertEqual(calls["create_cluster_environment"], 1)
        self.assertEqual(len(calls), 2)
示例#2
0
 def testBuildClusterEnvPreBuildSucceeded(self):
     """Use the succeeded build from the listing; only the list API runs."""
     manager = self.cluster_manager
     manager.set_cluster_env(self.cluster_env)
     manager.cluster_env_id = "correct"
     manager.cluster_env_build_id = None

     sdk = self.sdk
     sdk.reset()
     # Listing: one failed build and a later (created_at=1) succeeded one.
     failed_build = APIDict(id="build_failed", status="failed", created_at=0)
     succeeded_build = APIDict(
         id="build_succeeded", status="succeeded", created_at=1)
     sdk.returns["list_cluster_environment_builds"] = APIDict(
         results=[failed_build, succeeded_build])

     manager.build_cluster_env(timeout=600)

     self.assertTrue(manager.cluster_env_build_id)
     self.assertEqual(manager.cluster_env_build_id, "build_succeeded")
     calls = sdk.call_counter
     self.assertEqual(calls["list_cluster_environment_builds"], 1)
     self.assertEqual(len(calls), 1)
示例#3
0
    def testFindCreateClusterEnvFailFail(self):
        """Raise ClusterEnvCreateError when env creation fails every time."""
        manager = self.cluster_manager
        sdk = self.sdk

        manager.set_cluster_env(self.cluster_env)
        self.assertTrue(manager.cluster_env_name)
        self.assertFalse(manager.cluster_env_id)

        # Search only returns an environment under a different name, and the
        # create call is wired to `fail_always`.
        unrelated_env = APIDict(name="no_match", id="wrong")
        sdk.returns["search_cluster_environments"] = APIDict(
            metadata=APIDict(next_paging_token=None),
            results=[unrelated_env],
        )
        sdk.returns["create_cluster_environment"] = fail_always

        with self.assertRaises(ClusterEnvCreateError):
            manager.create_cluster_env()

        # No cluster env ID was found or created.
        self.assertFalse(manager.cluster_env_id)
        # Search and create were each called twice (one retry after failing).
        calls = sdk.call_counter
        self.assertEqual(calls["search_cluster_environments"], 2)
        self.assertEqual(calls["create_cluster_environment"], 2)
        self.assertEqual(len(calls), 2)
示例#4
0
    def testFindCreateClusterComputeCreateFailSucceed(self):
        """Create a cluster compute whose creation fails once, then succeeds."""
        manager = self.cluster_manager
        sdk = self.sdk

        manager.set_cluster_compute(self.cluster_compute)
        self.assertTrue(manager.cluster_compute_name)
        self.assertFalse(manager.cluster_compute_id)

        # Search only returns a compute config under a different name.
        unrelated_compute = APIDict(name="no_match", id="wrong")
        sdk.returns["search_cluster_computes"] = APIDict(
            metadata=APIDict(next_paging_token=None),
            results=[unrelated_compute],
        )
        # `fail_once` is handed the response that carries the new compute ID.
        create_response = APIDict(result=APIDict(id="correct"))
        sdk.returns["create_cluster_compute"] = fail_once(
            result=create_response)

        manager.create_cluster_compute()

        # Search and create were each called twice (one retry after failing).
        self.assertEqual(manager.cluster_compute_id, "correct")
        calls = sdk.call_counter
        self.assertEqual(calls["search_cluster_computes"], 2)
        self.assertEqual(calls["create_cluster_compute"], 2)
        self.assertEqual(len(calls), 2)
示例#5
0
    def testSessionStartStartupError(self):
        """Expect ClusterStartupError when `start_cluster` is wired to `_fail`."""
        manager = self.cluster_manager
        manager.cluster_env_id = "correct"
        manager.cluster_compute_id = "correct"

        sdk = self.sdk
        created_cluster = APIDict(id="success")
        sdk.returns["create_cluster"] = APIDict(result=created_cluster)
        sdk.returns["start_cluster"] = _fail

        with self.assertRaises(ClusterStartupError):
            manager.start_cluster()
示例#6
0
    def setUp(self) -> None:
        """Create the mock SDK and the cluster manager used by each test."""
        sdk = MockSDK()
        self.sdk = sdk
        # Registered before the manager is constructed; the SDK is reset
        # again at the end of this method.
        project = APIDict(name="release_unit_tests")
        sdk.returns["get_project"] = APIDict(result=project)

        self.cluster_env = TEST_CLUSTER_ENV
        self.cluster_compute = TEST_CLUSTER_COMPUTE

        manager_kwargs = dict(
            project_id=UNIT_TEST_PROJECT_ID,
            sdk=sdk,
            test_name=f"unit_test__{self.__class__.__name__}",
        )
        self.cluster_manager = self.cls(**manager_kwargs)
        sdk.reset()
示例#7
0
 def testClusterName(self):
     """Check the cluster name pattern with and without `smoke_test`."""
     sdk = MockSDK()
     project = APIDict(name="release_unit_tests")
     sdk.returns["get_project"] = APIDict(result=project)

     # (smoke_test flag, regex the resulting cluster name must match)
     cases = (
         (False, r"^test_\d+$"),
         (True, r"^test-smoke-test_\d+$"),
     )
     for smoke_test, name_pattern in cases:
         cluster_manager = self.cls(
             test_name="test",
             project_id=UNIT_TEST_PROJECT_ID,
             smoke_test=smoke_test,
             sdk=sdk,
         )
         self.assertRegex(cluster_manager.cluster_name, name_pattern)
示例#8
0
    def testInvalidClusterIdOverride(self):
        """Check how a `cluster_env_id` override is handled by the run.

        While `get_cluster_environment` returns None, the run must raise
        ClusterEnvCreateError. Once it returns an environment, the run is
        still expected to fail, but with a different exception type.
        """
        result = Result()

        self._succeed_until("driver_setup")

        self.sdk.returns["get_cluster_environment"] = None

        with self.assertRaises(ClusterEnvCreateError):
            self._run(result, cluster_env_id="existing")

        self.sdk.returns["get_cluster_environment"] = APIDict(result=APIDict(
            config_json={"overridden": True}))

        with self.assertRaises(Exception) as cm:  # Fail somewhere else
            self._run(result, cluster_env_id="existing")
        # This check has to live outside the `with` block: statements placed
        # after the raising call inside the block never execute, and
        # `cm.exception` is only populated once the context manager exits.
        self.assertNotIsInstance(cm.exception, ClusterEnvCreateError)
示例#9
0
    def testBuildClusterEnvNotFound(self):
        """Raise ClusterEnvBuildError when the build listing is empty."""
        manager = self.cluster_manager
        manager.set_cluster_env(self.cluster_env)
        manager.cluster_env_id = "correct"

        # The listing contains no builds at all.
        no_builds = APIDict(results=[])
        self.sdk.returns["list_cluster_environment_builds"] = no_builds

        with self.assertRaisesRegex(ClusterEnvBuildError, "No build found"):
            manager.build_cluster_env(timeout=600)
示例#10
0
    def testBuildClusterBuildSucceed(self):
        """Poll a pending build until `get_build` reports it as succeeded."""
        manager = self.cluster_manager
        manager.set_cluster_env(self.cluster_env)
        manager.cluster_env_id = "correct"
        manager.cluster_env_build_id = None

        sdk = self.sdk
        sdk.reset()
        # Listing: one failed build and a later one that is still pending.
        failed_build = APIDict(id="build_failed", status="failed", created_at=0)
        pending_build = APIDict(
            id="build_succeeded", status="pending", created_at=1)
        sdk.returns["list_cluster_environment_builds"] = APIDict(
            results=[failed_build, pending_build])

        with freeze_time() as frozen_time:

            def advance_clock():
                return frozen_time.tick(delta=10)

            # Configured with finish_after=300, well inside the 600 timeout;
            # presumably the response flips from `before` to `after` at that
            # point -- confirm against _DelayedResponse.
            sdk.returns["get_build"] = _DelayedResponse(
                advance_clock,
                finish_after=300,
                before=APIDict(result=APIDict(status="in_progress")),
                after=APIDict(result=APIDict(status="succeeded")),
            )
            manager.build_cluster_env(timeout=600)

        self.assertTrue(manager.cluster_env_build_id)
        calls = sdk.call_counter
        self.assertEqual(calls["list_cluster_environment_builds"], 1)
        self.assertGreaterEqual(calls["get_build"], 9)
        self.assertEqual(len(calls), 2)
示例#11
0
    def testBuildClusterEnvPreBuildFailed(self):
        """Raise ClusterEnvBuildError when the only listed build has failed."""
        manager = self.cluster_manager
        manager.set_cluster_env(self.cluster_env)
        manager.cluster_env_id = "correct"
        manager.cluster_env_build_id = None

        sdk = self.sdk
        sdk.reset()
        # The listing holds a single build, and its status is "failed".
        failed_build = APIDict(id="build_failed", status="failed", created_at=0)
        sdk.returns["list_cluster_environment_builds"] = APIDict(
            results=[failed_build])

        expected_message = "Cluster env build failed"
        with self.assertRaisesRegex(ClusterEnvBuildError, expected_message):
            manager.build_cluster_env(timeout=600)

        self.assertFalse(manager.cluster_env_build_id)
        calls = sdk.call_counter
        self.assertEqual(calls["list_cluster_environment_builds"], 1)
        self.assertEqual(len(calls), 1)
示例#12
0
    def testFindCreateClusterComputeExisting(self):
        """Reuse the compute config from the search whose name matches."""
        manager = self.cluster_manager
        sdk = self.sdk

        manager.set_cluster_compute(self.cluster_compute)
        self.assertTrue(manager.cluster_compute_name)
        self.assertFalse(manager.cluster_compute_id)

        # Search returns one unrelated entry followed by one under the
        # manager's own compute name.
        unrelated_compute = APIDict(name="no_match", id="wrong")
        matching_compute = APIDict(
            name=manager.cluster_compute_name, id="correct")
        sdk.returns["search_cluster_computes"] = APIDict(
            metadata=APIDict(next_paging_token=None),
            results=[unrelated_compute, matching_compute],
        )

        manager.create_cluster_compute()

        # Only the search API was called.
        self.assertEqual(manager.cluster_compute_id, "correct")
        calls = sdk.call_counter
        self.assertEqual(calls["search_cluster_computes"], 1)
        self.assertEqual(len(calls), 1)
示例#13
0
    def testBuildClusterEnvPreBuildFailed(self):
        """A failed listed build leads to a new build, which fails as well."""
        manager = self.cluster_manager
        manager.set_cluster_env(self.cluster_env)
        manager.cluster_env_id = "correct"
        manager.cluster_env_build_id = None

        sdk = self.sdk
        sdk.reset()
        # The listing holds a single build, and its status is "failed".
        failed_build = APIDict(
            id="build_failed",
            status="failed",
            created_at=0,
            error_message=None,
            config_json={},
        )
        sdk.returns["list_cluster_environment_builds"] = APIDict(
            results=[failed_build])
        # Response for the build-creation call.
        new_build = APIDict(id="new_build_id")
        sdk.returns["create_cluster_environment_build"] = APIDict(
            result=new_build)
        # `get_build` also reports status "failed".
        polled_build = APIDict(
            id="build_now_succeeded",
            status="failed",
            created_at=0,
            error_message=None,
            config_json={},
        )
        sdk.returns["get_build"] = APIDict(result=polled_build)

        expected_message = "Cluster env build failed"
        with self.assertRaisesRegex(ClusterEnvBuildError, expected_message):
            manager.build_cluster_env(timeout=600)

        self.assertFalse(manager.cluster_env_build_id)
        calls = sdk.call_counter
        self.assertEqual(calls["list_cluster_environment_builds"], 1)
        self.assertEqual(calls["create_cluster_environment_build"], 1)
        self.assertEqual(len(calls), 3)
示例#14
0
    def testSessionStartStartupSuccess(self):
        """Start a cluster with a timeout (400) above `finish_after` (300)."""
        manager = self.cluster_manager
        manager.cluster_env_id = "correct"
        manager.cluster_compute_id = "correct"

        sdk = self.sdk
        sdk.returns["create_cluster"] = APIDict(result=APIDict(id="success"))
        pending_operation = APIDict(id="cop_id", completed=False)
        sdk.returns["start_cluster"] = APIDict(result=pending_operation)

        with freeze_time() as frozen_time:
            frozen_time.tick(delta=0.1)

            def advance_clock():
                return frozen_time.tick(delta=10)

            sdk.returns["get_cluster_operation"] = _DelayedResponse(
                advance_clock,
                finish_after=300,
                before=APIDict(result=APIDict(completed=False)),
                after=APIDict(result=APIDict(completed=True)),
            )
            running_cluster = APIDict(state="Running")
            sdk.returns["get_cluster"] = APIDict(result=running_cluster)

            # Timeout is long enough, so no exception is expected here.
            manager.start_cluster(timeout=400)
示例#15
0
 def testSetClusterEnv(self):
     """Check the RAY_USAGE_STATS_EXTRA_TAGS env var set by set_cluster_env."""
     sdk = MockSDK()
     project = APIDict(name="release_unit_tests")
     sdk.returns["get_project"] = APIDict(result=project)

     # (test_name, smoke_test flag, expected value of the tags env var)
     cases = (
         ("test", False, "test_name=test;smoke_test=False"),
         ("Test", True, "test_name=Test;smoke_test=True"),
     )
     for test_name, smoke_test, expected_tags in cases:
         cluster_manager = self.cls(
             test_name=test_name,
             project_id=UNIT_TEST_PROJECT_ID,
             smoke_test=smoke_test,
             sdk=sdk,
         )
         cluster_manager.set_cluster_env({})
         env_vars = cluster_manager.cluster_env["env_vars"]
         self.assertEqual(
             env_vars["RAY_USAGE_STATS_EXTRA_TAGS"],
             expected_tags,
         )
示例#16
0
    def testSessionStartStartupTimeout(self):
        """Raise ClusterStartupTimeout when timeout (200) < finish_after (300)."""
        manager = self.cluster_manager
        manager.cluster_env_id = "correct"
        manager.cluster_compute_id = "correct"

        sdk = self.sdk
        sdk.returns["create_cluster"] = APIDict(result=APIDict(id="success"))
        pending_operation = APIDict(id="cop_id", completed=False)
        sdk.returns["start_cluster"] = APIDict(result=pending_operation)

        with freeze_time() as frozen_time:
            with self.assertRaises(ClusterStartupTimeout):

                def advance_clock():
                    return frozen_time.tick(delta=10)

                sdk.returns["get_cluster_operation"] = _DelayedResponse(
                    advance_clock,
                    finish_after=300,
                    before=APIDict(result=APIDict(completed=False)),
                    after=APIDict(result=APIDict(completed=True)),
                )

                # Timeout before startup finishes
                manager.start_cluster(timeout=200)
示例#17
0
    def testBuildClusterEnvBuildTimeout(self):
        """Raise ClusterEnvBuildTimeout when timeout (100) < finish_after (300)."""
        manager = self.cluster_manager
        manager.set_cluster_env(self.cluster_env)
        manager.cluster_env_id = "correct"
        manager.cluster_env_build_id = None

        sdk = self.sdk
        sdk.reset()
        # Listing: one failed build and a later one that is still pending.
        failed_build = APIDict(
            id="build_failed",
            status="failed",
            created_at=0,
            error_message=None,
            config_json={},
        )
        pending_build = APIDict(
            id="build_succeeded",
            status="pending",
            created_at=1,
            error_message=None,
            config_json={},
        )
        sdk.returns["list_cluster_environment_builds"] = APIDict(
            results=[failed_build, pending_build])

        expected_message = "Time out when building cluster env"
        with freeze_time() as frozen_time:
            with self.assertRaisesRegex(ClusterEnvBuildTimeout,
                                        expected_message):

                def advance_clock():
                    return frozen_time.tick(delta=10)

                in_progress = APIDict(
                    status="in_progress", error_message=None, config_json={})
                succeeded = APIDict(
                    status="succeeded", error_message=None, config_json={})
                sdk.returns["get_build"] = _DelayedResponse(
                    advance_clock,
                    finish_after=300,
                    before=APIDict(result=in_progress),
                    after=APIDict(result=succeeded),
                )
                manager.build_cluster_env(timeout=100)

        self.assertFalse(manager.cluster_env_build_id)
        calls = sdk.call_counter
        self.assertEqual(calls["list_cluster_environment_builds"], 1)
        self.assertGreaterEqual(calls["get_build"], 9)
        self.assertEqual(len(calls), 2)
示例#18
0
    def setUp(self) -> None:
        """Prepare the fixture: temp dir, mock SDK, mock components, and Test.

        Creates a temporary working directory with cluster config files and
        two driver scripts, defines mock cluster manager / command runner /
        file manager classes bound to per-test return dictionaries, registers
        them (and a mock alerter) under the "unit_test" keys, and builds the
        `Test` definition stored on `self.test`.
        """
        self.tempdir = tempfile.mkdtemp()
        self.sdk = MockSDK()

        self.sdk.returns["get_project"] = APIDict(
            result=APIDict(name="unit_test_project")
        )

        # Helpers defined elsewhere on this class; presumably they write the
        # cluster_env.yaml / cluster_compute.yaml files referenced by the
        # Test below -- confirm against their definitions.
        self.writeClusterEnv("{'env': true}")
        self.writeClusterCompute("{'compute': true}")

        # Driver setup scripts: one that exits non-zero, one that exits zero.
        with open(os.path.join(self.tempdir, "driver_fail.sh"), "wt") as f:
            f.write("exit 1\n")

        with open(os.path.join(self.tempdir, "driver_succeed.sh"), "wt") as f:
            f.write("exit 0\n")

        # Plain local names so the nested classes below can close over them
        # (inside their methods, `self` refers to the mock instance instead).
        this_sdk = self.sdk
        this_tempdir = self.tempdir

        # Per-component return dictionaries; tests fill these in and the mock
        # instances pick them up through `return_dict`.
        self.cluster_manager_return = {}
        self.command_runner_return = {}
        self.file_manager_return = {}

        this_cluster_manager_return = self.cluster_manager_return
        this_command_runner_return = self.command_runner_return
        this_file_manager_return = self.file_manager_return

        class MockClusterManager(MockReturn, FullClusterManager):
            def __init__(
                self,
                test_name: str,
                project_id: str,
                sdk=None,
                smoke_test: bool = False,
            ):
                # The `sdk` argument is ignored; the fixture's mock SDK is
                # always passed to the parent constructor.
                super(MockClusterManager, self).__init__(
                    test_name, project_id, this_sdk, smoke_test=smoke_test
                )
                self.return_dict = this_cluster_manager_return

        class MockCommandRunner(MockReturn, CommandRunner):
            # NOTE(review): this class-level default points at the cluster
            # manager's dict, while __init__ below assigns the command
            # runner's dict on the instance. Possibly a copy-paste slip;
            # confirm whether anything relies on the class-level value
            # (e.g. lookups made before __init__ finishes).
            return_dict = self.cluster_manager_return

            def __init__(
                self,
                cluster_manager: ClusterManager,
                file_manager: FileManager,
                working_dir: str,
            ):
                # The `working_dir` argument is ignored; the fixture's temp
                # directory is passed to the parent constructor instead.
                super(MockCommandRunner, self).__init__(
                    cluster_manager, file_manager, this_tempdir
                )
                self.return_dict = this_command_runner_return

        class MockFileManager(MockReturn, FileManager):
            def __init__(self, cluster_manager: ClusterManager):
                super(MockFileManager, self).__init__(cluster_manager)
                self.return_dict = this_file_manager_return

        # Value handed back by the mock alerter; tests may overwrite it.
        self.mock_alert_return = None

        def mock_alerter(test: Test, result: Result):
            # Reads the attribute at call time, so later assignments to
            # `self.mock_alert_return` take effect.
            return self.mock_alert_return

        # Register the mocks in lookup tables defined outside this method.
        result_to_handle_map["unit_test_alerter"] = mock_alerter

        type_str_to_command_runner["unit_test"] = MockCommandRunner
        command_runner_to_cluster_manager[MockCommandRunner] = MockClusterManager
        command_runner_to_file_manager[MockCommandRunner] = MockFileManager

        # Test definition wired to the "unit_test" runner type and the
        # "unit_test_alerter" registered above; the driver setup script
        # defaults to the failing one.
        self.test = Test(
            name="unit_test_end_to_end",
            run=dict(
                type="unit_test",
                prepare="prepare_cmd",
                script="test_cmd",
                wait_for_nodes=dict(num_nodes=4, timeout=40),
            ),
            working_dir=self.tempdir,
            cluster=dict(
                cluster_env="cluster_env.yaml", cluster_compute="cluster_compute.yaml"
            ),
            alert="unit_test_alerter",
            driver_setup="driver_fail.sh",
        )
        self.anyscale_project = "prj_unit12345678"
        self.ray_wheels_url = "http://mock.wheels/"