def setUp(self) -> None:
    # Per-test fixtures: a mock SDK, the sample cluster configs and a
    # cluster manager of whatever type the test class stores in ``self.cls``.
    project_stub = APIDict(result=APIDict(name="release_unit_tests"))

    self.cluster_env = TEST_CLUSTER_ENV
    self.cluster_compute = TEST_CLUSTER_COMPUTE

    # The "get_project" return is registered before the manager is built.
    self.sdk = MockSDK()
    self.sdk.returns["get_project"] = project_stub

    manager_kwargs = dict(
        project_id=UNIT_TEST_PROJECT_ID,
        sdk=self.sdk,
        test_name=f"unit_test__{self.__class__.__name__}",
    )
    self.cluster_manager = self.cls(**manager_kwargs)

    # Reset the mock once construction is done.
    # NOTE(review): presumably this clears recorded calls/returns so each
    # test starts from a clean SDK - confirm against MockSDK.reset().
    self.sdk.reset()
def testClusterName(self):
    # The generated cluster name must be "<test_name>_<digits>", with a
    # "-smoke-test" infix when the manager is created for a smoke test.
    mock_sdk = MockSDK()
    mock_sdk.returns["get_project"] = APIDict(
        result=APIDict(name="release_unit_tests")
    )

    scenarios = (
        (False, r"^test_\d+$"),
        (True, r"^test-smoke-test_\d+$"),
    )
    for is_smoke_test, name_pattern in scenarios:
        manager = self.cls(
            test_name="test",
            project_id=UNIT_TEST_PROJECT_ID,
            smoke_test=is_smoke_test,
            sdk=mock_sdk,
        )
        self.assertRegex(manager.cluster_name, name_pattern)
def testSetClusterEnv(self):
    # set_cluster_env() must populate RAY_USAGE_STATS_EXTRA_TAGS in the
    # env's "env_vars" with the (case-preserved) test name and smoke flag.
    mock_sdk = MockSDK()
    mock_sdk.returns["get_project"] = APIDict(
        result=APIDict(name="release_unit_tests")
    )

    scenarios = (
        ("test", False, "test_name=test;smoke_test=False"),
        ("Test", True, "test_name=Test;smoke_test=True"),
    )
    for test_name, is_smoke_test, expected_tags in scenarios:
        manager = self.cls(
            test_name=test_name,
            project_id=UNIT_TEST_PROJECT_ID,
            smoke_test=is_smoke_test,
            sdk=mock_sdk,
        )
        manager.set_cluster_env({})

        env_vars = manager.cluster_env["env_vars"]
        self.assertEqual(env_vars["RAY_USAGE_STATS_EXTRA_TAGS"], expected_tags)
class MinimalSessionManagerTest(unittest.TestCase):
    # Exercises the find-or-create logic for cluster computes / cluster
    # environments and the cluster env build logic of MinimalClusterManager
    # against a mocked SDK.
    cls = MinimalClusterManager

    def setUp(self) -> None:
        # Fresh mock SDK, sample configs and cluster manager for every test.
        project_stub = APIDict(result=APIDict(name="release_unit_tests"))

        self.cluster_env = TEST_CLUSTER_ENV
        self.cluster_compute = TEST_CLUSTER_COMPUTE

        self.sdk = MockSDK()
        self.sdk.returns["get_project"] = project_stub

        self.cluster_manager = self.cls(
            project_id=UNIT_TEST_PROJECT_ID,
            sdk=self.sdk,
            test_name=f"unit_test__{self.__class__.__name__}",
        )
        self.sdk.reset()

    # ------------------------------------------------------------------
    # Private helpers (not collected as tests)
    # ------------------------------------------------------------------

    def _stub_search(self, api_name, matching_name=None):
        # Register a single-page search result for ``api_name``. It always
        # contains a non-matching entry; when ``matching_name`` is given, an
        # entry with that name and id "correct" is appended.
        found = [APIDict(name="no_match", id="wrong")]
        if matching_name is not None:
            found.append(APIDict(name=matching_name, id="correct"))
        self.sdk.returns[api_name] = APIDict(
            metadata=APIDict(next_paging_token=None),
            results=found,
        )

    def _stub_build_list(self, latest_status=None):
        # Register the env build listing: one failed build (created_at=0)
        # and, when ``latest_status`` is given, a newer build (created_at=1)
        # with id "build_succeeded" and that status.
        builds = [APIDict(id="build_failed", status="failed", created_at=0)]
        if latest_status is not None:
            builds.append(
                APIDict(id="build_succeeded", status=latest_status, created_at=1)
            )
        self.sdk.returns["list_cluster_environment_builds"] = APIDict(
            results=builds
        )

    def _stub_delayed_build(self, frozen_time, final_status):
        # Register a "get_build" response that reports "in_progress" first
        # and ``final_status`` afterwards (finish_after=300). The callback
        # advances the frozen clock by 10 each time it is invoked.
        self.sdk.returns["get_build"] = _DelayedResponse(
            lambda: frozen_time.tick(delta=10),
            finish_after=300,
            before=APIDict(result=APIDict(status="in_progress")),
            after=APIDict(result=APIDict(status=final_status)),
        )

    def _begin_env_build(self):
        # Common preamble of the build tests: env set, env id known, no
        # build id yet, mock SDK reset.
        self.cluster_manager.set_cluster_env(self.cluster_env)
        self.cluster_manager.cluster_env_id = "correct"
        self.cluster_manager.cluster_env_build_id = None
        self.sdk.reset()

    def _assert_calls(self, exact, at_least=None):
        # Check per-API call counts (exact ones first, then lower bounds)
        # and finally that no other API was recorded. The per-API lookups
        # deliberately happen before the length check, as in the inline
        # assertions this helper replaces.
        at_least = at_least or {}
        for api_name, count in exact.items():
            self.assertEqual(self.sdk.call_counter[api_name], count)
        for api_name, minimum in at_least.items():
            self.assertGreaterEqual(self.sdk.call_counter[api_name], minimum)
        self.assertEqual(len(self.sdk.call_counter), len(exact) + len(at_least))

    # ------------------------------------------------------------------
    # Cluster compute: find or create
    # ------------------------------------------------------------------

    @patch("time.sleep", lambda *a, **kw: None)
    def testFindCreateClusterComputeExisting(self):
        # A compute with the expected name already exists: it is found and
        # nothing is created.
        manager = self.cluster_manager
        manager.set_cluster_compute(self.cluster_compute)
        self.assertTrue(manager.cluster_compute_name)
        self.assertFalse(manager.cluster_compute_id)

        self._stub_search(
            "search_cluster_computes",
            matching_name=manager.cluster_compute_name,
        )
        manager.create_cluster_compute()

        self.assertEqual(manager.cluster_compute_id, "correct")
        self._assert_calls({"search_cluster_computes": 1})

    @patch("time.sleep", lambda *a, **kw: None)
    def testFindCreateClusterComputeCreateFailFail(self):
        # No existing compute; creation is attempted but fails every time.
        manager = self.cluster_manager
        manager.set_cluster_compute(self.cluster_compute)
        self.assertTrue(manager.cluster_compute_name)
        self.assertFalse(manager.cluster_compute_id)

        self._stub_search("search_cluster_computes")
        self.sdk.returns["create_cluster_compute"] = fail_always

        with self.assertRaises(ClusterComputeCreateError):
            manager.create_cluster_compute()

        # No compute ID was found or created
        self.assertFalse(manager.cluster_compute_id)
        # Search and create were each called twice (one retry after failure)
        self._assert_calls(
            {"search_cluster_computes": 2, "create_cluster_compute": 2}
        )

    @patch("time.sleep", lambda *a, **kw: None)
    def testFindCreateClusterComputeCreateFailSucceed(self):
        # No existing compute; creation fails once and succeeds on retry.
        manager = self.cluster_manager
        manager.set_cluster_compute(self.cluster_compute)
        self.assertTrue(manager.cluster_compute_name)
        self.assertFalse(manager.cluster_compute_id)

        self._stub_search("search_cluster_computes")
        self.sdk.returns["create_cluster_compute"] = fail_once(
            result=APIDict(result=APIDict(id="correct"))
        )
        manager.create_cluster_compute()

        self.assertEqual(manager.cluster_compute_id, "correct")
        # Search and create were each called twice (one retry after failure)
        self._assert_calls(
            {"search_cluster_computes": 2, "create_cluster_compute": 2}
        )

    @patch("time.sleep", lambda *a, **kw: None)
    def testFindCreateClusterComputeCreateSucceed(self):
        # No existing compute; creation succeeds at the first attempt.
        manager = self.cluster_manager
        manager.set_cluster_compute(self.cluster_compute)
        self.assertTrue(manager.cluster_compute_name)
        self.assertFalse(manager.cluster_compute_id)

        self._stub_search("search_cluster_computes")
        self.sdk.returns["create_cluster_compute"] = APIDict(
            result=APIDict(id="correct")
        )
        manager.create_cluster_compute()

        self.assertEqual(manager.cluster_compute_id, "correct")
        # Search and create were each called exactly once (no retry)
        self._assert_calls(
            {"search_cluster_computes": 1, "create_cluster_compute": 1}
        )

    # ------------------------------------------------------------------
    # Cluster environment: find or create
    # ------------------------------------------------------------------

    @patch("time.sleep", lambda *a, **kw: None)
    def testFindCreateClusterEnvExisting(self):
        # An env with the expected name already exists: it is found and
        # nothing is created.
        manager = self.cluster_manager
        manager.set_cluster_env(self.cluster_env)
        self.assertTrue(manager.cluster_env_name)
        self.assertFalse(manager.cluster_env_id)

        self._stub_search(
            "search_cluster_environments",
            matching_name=manager.cluster_env_name,
        )
        manager.create_cluster_env()

        self.assertEqual(manager.cluster_env_id, "correct")
        self._assert_calls({"search_cluster_environments": 1})

    @patch("time.sleep", lambda *a, **kw: None)
    def testFindCreateClusterEnvFailFail(self):
        # No existing env; creation is attempted but fails every time.
        manager = self.cluster_manager
        manager.set_cluster_env(self.cluster_env)
        self.assertTrue(manager.cluster_env_name)
        self.assertFalse(manager.cluster_env_id)

        self._stub_search("search_cluster_environments")
        self.sdk.returns["create_cluster_environment"] = fail_always

        with self.assertRaises(ClusterEnvCreateError):
            manager.create_cluster_env()

        # No env ID was found or created
        self.assertFalse(manager.cluster_env_id)
        # Search and create were each called twice (one retry after failure)
        self._assert_calls(
            {"search_cluster_environments": 2, "create_cluster_environment": 2}
        )

    @patch("time.sleep", lambda *a, **kw: None)
    def testFindCreateClusterEnvFailSucceed(self):
        # No existing env; creation fails once and succeeds on retry.
        manager = self.cluster_manager
        manager.set_cluster_env(self.cluster_env)
        self.assertTrue(manager.cluster_env_name)
        self.assertFalse(manager.cluster_env_id)

        # Explicitly start from "no env id" and a clean mock SDK.
        manager.cluster_env_id = None
        self.sdk.reset()

        self._stub_search("search_cluster_environments")
        self.sdk.returns["create_cluster_environment"] = fail_once(
            result=APIDict(result=APIDict(id="correct"))
        )
        manager.create_cluster_env()

        self.assertEqual(manager.cluster_env_id, "correct")
        # Search and create were each called twice (one retry after failure)
        self._assert_calls(
            {"search_cluster_environments": 2, "create_cluster_environment": 2}
        )

    @patch("time.sleep", lambda *a, **kw: None)
    def testFindCreateClusterEnvSucceed(self):
        # No existing env; creation succeeds at the first attempt.
        manager = self.cluster_manager
        manager.set_cluster_env(self.cluster_env)
        self.assertTrue(manager.cluster_env_name)
        self.assertFalse(manager.cluster_env_id)

        self._stub_search("search_cluster_environments")
        self.sdk.returns["create_cluster_environment"] = APIDict(
            result=APIDict(id="correct")
        )
        manager.create_cluster_env()

        self.assertEqual(manager.cluster_env_id, "correct")
        # Search and create were each called exactly once (no retry)
        self._assert_calls(
            {"search_cluster_environments": 1, "create_cluster_environment": 1}
        )

    # ------------------------------------------------------------------
    # Cluster environment: build
    # ------------------------------------------------------------------

    @patch("time.sleep", lambda *a, **kw: None)
    def testBuildClusterEnvNotFound(self):
        # The build listing is empty, so no build can be found.
        manager = self.cluster_manager
        manager.set_cluster_env(self.cluster_env)
        manager.cluster_env_id = "correct"

        self.sdk.returns["list_cluster_environment_builds"] = APIDict(results=[])

        with self.assertRaisesRegex(ClusterEnvBuildError, "No build found"):
            manager.build_cluster_env(timeout=600)

    @patch("time.sleep", lambda *a, **kw: None)
    def testBuildClusterEnvPreBuildFailed(self):
        # The only listed build has already failed at the first lookup.
        self._begin_env_build()
        self._stub_build_list()

        with self.assertRaisesRegex(
            ClusterEnvBuildError, "Cluster env build failed"
        ):
            self.cluster_manager.build_cluster_env(timeout=600)

        self.assertFalse(self.cluster_manager.cluster_env_build_id)
        self._assert_calls({"list_cluster_environment_builds": 1})

    @patch("time.sleep", lambda *a, **kw: None)
    def testBuildClusterEnvPreBuildSucceeded(self):
        # A failed build is listed, but a newer build already succeeded.
        self._begin_env_build()
        self._stub_build_list(latest_status="succeeded")

        self.cluster_manager.build_cluster_env(timeout=600)

        self.assertTrue(self.cluster_manager.cluster_env_build_id)
        self.assertEqual(
            self.cluster_manager.cluster_env_build_id, "build_succeeded"
        )
        self._assert_calls({"list_cluster_environment_builds": 1})

    @patch("time.sleep", lambda *a, **kw: None)
    def testBuildClusterBuildFails(self):
        # The newest build is pending and eventually reports "failed"
        # (delayed response with finish_after=300), within the 600 timeout.
        self._begin_env_build()
        self._stub_build_list(latest_status="pending")

        with freeze_time() as frozen_time, self.assertRaisesRegex(
            ClusterEnvBuildError, "Cluster env build failed"
        ):
            self._stub_delayed_build(frozen_time, final_status="failed")
            self.cluster_manager.build_cluster_env(timeout=600)

        self.assertFalse(self.cluster_manager.cluster_env_build_id)
        self._assert_calls(
            {"list_cluster_environment_builds": 1},
            at_least={"get_build": 9},
        )

    @patch("time.sleep", lambda *a, **kw: None)
    def testBuildClusterEnvBuildTimeout(self):
        # The newest build would only finish after finish_after=300, but the
        # build is given a timeout of 100, so it must time out first.
        self._begin_env_build()
        self._stub_build_list(latest_status="pending")

        with freeze_time() as frozen_time, self.assertRaisesRegex(
            ClusterEnvBuildTimeout, "Time out when building cluster env"
        ):
            self._stub_delayed_build(frozen_time, final_status="succeeded")
            self.cluster_manager.build_cluster_env(timeout=100)

        self.assertFalse(self.cluster_manager.cluster_env_build_id)
        self._assert_calls(
            {"list_cluster_environment_builds": 1},
            at_least={"get_build": 9},
        )

    @patch("time.sleep", lambda *a, **kw: None)
    def testBuildClusterBuildSucceed(self):
        # The newest build is pending and eventually reports "succeeded"
        # (delayed response with finish_after=300), within the 600 timeout.
        self._begin_env_build()
        self._stub_build_list(latest_status="pending")

        with freeze_time() as frozen_time:
            self._stub_delayed_build(frozen_time, final_status="succeeded")
            self.cluster_manager.build_cluster_env(timeout=600)

        self.assertTrue(self.cluster_manager.cluster_env_build_id)
        self._assert_calls(
            {"list_cluster_environment_builds": 1},
            at_least={"get_build": 9},
        )
def setUp(self) -> None:
    # End-to-end fixture: creates a temp working dir with config files and
    # driver scripts, defines mock cluster manager / command runner / file
    # manager classes wired to per-test return dicts, registers them (and a
    # mock alerter) in the module-level lookup tables, and builds the Test
    # definition used by the test methods.
    self.tempdir = tempfile.mkdtemp()
    self.sdk = MockSDK()
    self.sdk.returns["get_project"] = APIDict(
        result=APIDict(name="unit_test_project")
    )

    self.writeClusterEnv("{'env': true}")
    self.writeClusterCompute("{'compute': true}")

    # Driver setup scripts that exit non-zero / zero respectively.
    with open(os.path.join(self.tempdir, "driver_fail.sh"), "wt") as f:
        f.write("exit 1\n")

    with open(os.path.join(self.tempdir, "driver_succeed.sh"), "wt") as f:
        f.write("exit 0\n")

    # Plain local aliases: inside the nested classes below, ``self`` refers
    # to the mock instance, not to this test case.
    this_sdk = self.sdk
    this_tempdir = self.tempdir

    # One return dict per mocked component; tests fill these in and the
    # mock instances pick them up through their ``return_dict`` attribute.
    self.cluster_manager_return = {}
    self.command_runner_return = {}
    self.file_manager_return = {}

    this_cluster_manager_return = self.cluster_manager_return
    this_command_runner_return = self.command_runner_return
    this_file_manager_return = self.file_manager_return

    class MockClusterManager(MockReturn, FullClusterManager):
        def __init__(
            self,
            test_name: str,
            project_id: str,
            sdk=None,
            smoke_test: bool = False,
        ):
            # The ``sdk`` argument is ignored on purpose: the fixture's
            # mock SDK is always used.
            super(MockClusterManager, self).__init__(
                test_name, project_id, this_sdk, smoke_test=smoke_test
            )
            self.return_dict = this_cluster_manager_return

    class MockCommandRunner(MockReturn, CommandRunner):
        # Class-level default, in effect until ``__init__`` assigns the
        # instance attribute. It previously aliased the *cluster manager's*
        # return dict; it now points at the command runner's own dict so
        # both levels agree.
        return_dict = this_command_runner_return

        def __init__(
            self,
            cluster_manager: ClusterManager,
            file_manager: FileManager,
            working_dir: str,
        ):
            # The ``working_dir`` argument is ignored on purpose: the
            # fixture's temp dir is always used.
            super(MockCommandRunner, self).__init__(
                cluster_manager, file_manager, this_tempdir
            )
            self.return_dict = this_command_runner_return

    class MockFileManager(MockReturn, FileManager):
        def __init__(self, cluster_manager: ClusterManager):
            super(MockFileManager, self).__init__(cluster_manager)
            self.return_dict = this_file_manager_return

    # Value returned by the mock alerter; tests may overwrite it.
    self.mock_alert_return = None

    def mock_alerter(test: Test, result: Result):
        # Reads the attribute at call time so later changes are honoured.
        return self.mock_alert_return

    # Register the mocks under the keys referenced by the Test below.
    result_to_handle_map["unit_test_alerter"] = mock_alerter

    type_str_to_command_runner["unit_test"] = MockCommandRunner
    command_runner_to_cluster_manager[MockCommandRunner] = MockClusterManager
    command_runner_to_file_manager[MockCommandRunner] = MockFileManager

    self.test = Test(
        name="unit_test_end_to_end",
        run=dict(
            type="unit_test",
            prepare="prepare_cmd",
            script="test_cmd",
            wait_for_nodes=dict(num_nodes=4, timeout=40),
        ),
        working_dir=self.tempdir,
        cluster=dict(
            cluster_env="cluster_env.yaml", cluster_compute="cluster_compute.yaml"
        ),
        alert="unit_test_alerter",
        driver_setup="driver_fail.sh",
    )
    self.anyscale_project = "prj_unit12345678"
    self.ray_wheels_url = "http://mock.wheels/"