Exemplo n.º 1
0
    def testStartClusterFails(self):
        """Cluster creation and startup failures map to their exit codes.

        The first run is expected to raise ``ClusterCreationError`` before a
        cluster ID is configured; afterwards ``start_cluster`` is made to
        raise a startup error and then a startup timeout.
        """
        outcome = Result()

        self._succeed_until("cluster_env")

        # Fails because API response faulty
        with self.assertRaises(ClusterCreationError):
            self._run(outcome)
        self.assertEqual(outcome.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value)

        self.cluster_manager_return["cluster_id"] = "valid"

        # Each startup failure must surface unchanged, set its own exit code
        # and still lead to a cluster termination request.
        startup_failures = (
            (ClusterStartupError, ExitCode.CLUSTER_STARTUP_ERROR),
            (ClusterStartupTimeout, ExitCode.CLUSTER_STARTUP_TIMEOUT),
        )
        for error_cls, expected_code in startup_failures:
            self.cluster_manager_return["start_cluster"] = _fail_on_call(error_cls)
            with self.assertRaises(error_cls):
                self._run(outcome)
            self.assertEqual(outcome.return_code, expected_code.value)

            # Ensure cluster was terminated
            self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
Exemplo n.º 2
0
    def testInvalidClusterCompute(self):
        """Every way of breaking the cluster compute config is a config error.

        Covers a loader patched to raise, a missing file, an unterminated
        template expression and malformed file content.
        """
        outcome = Result()
        config_error = ExitCode.CONFIG_ERROR.value

        # Loader itself raises
        with patch(
            "ray_release.glue.load_test_cluster_compute",
            _fail_on_call(ReleaseTestConfigError),
        ), self.assertRaises(ReleaseTestConfigError):
            self._run(outcome)
        self.assertEqual(outcome.return_code, config_error)

        # Fails because file not found
        compute_path = os.path.join(self.tempdir, "cluster_compute.yaml")
        os.unlink(compute_path)
        with self.assertRaisesRegex(ReleaseTestConfigError, "Path not found"):
            self._run(outcome)
        self.assertEqual(outcome.return_code, config_error)

        # Fails because of an invalid jinja template, then because of
        # content that cannot be parsed.
        broken_contents = (
            ("{{ INVALID", "yaml template"),
            ("{'test': true, 'fail}", "quoted scalar"),
        )
        for content, message_pattern in broken_contents:
            self.writeClusterCompute(content)
            with self.assertRaisesRegex(ReleaseTestConfigError, message_pattern):
                self._run(outcome)
            self.assertEqual(outcome.return_code, config_error)
Exemplo n.º 3
0
    def testBuildConfigFailsClusterEnv(self):
        """Cluster env creation and build failures yield matching exit codes.

        Four consecutive runs share one ``Result``: the unmodified mock
        setup, an explicit create failure, a build error and a build timeout.
        """
        outcome = Result()

        def run_expecting(error_cls, exit_code, pattern=None):
            # Run the pipeline, require ``error_cls`` (optionally matching
            # ``pattern``) and check the exit code stored on the result.
            if pattern is None:
                raises = self.assertRaises(error_cls)
            else:
                raises = self.assertRaisesRegex(error_cls, pattern)
            with raises:
                self._run(outcome)
            self.assertEqual(outcome.return_code, exit_code.value)

        self._succeed_until("cluster_compute")

        # Fails because API response faulty
        run_expecting(
            ClusterEnvCreateError, ExitCode.CLUSTER_RESOURCE_ERROR, "Unexpected"
        )

        # Fails for random cluster env create reason
        self.cluster_manager_return["create_cluster_env"] = _fail_on_call(
            ClusterEnvCreateError, "Known"
        )
        run_expecting(ClusterEnvCreateError, ExitCode.CLUSTER_RESOURCE_ERROR, "Known")

        # Now, succeed creation but fail on cluster env build
        self.cluster_manager_return["cluster_env_id"] = "valid"
        self.cluster_manager_return["create_cluster_env"] = None
        self.cluster_manager_return["build_cluster_env"] = _fail_on_call(
            ClusterEnvBuildError
        )
        run_expecting(ClusterEnvBuildError, ExitCode.CLUSTER_ENV_BUILD_ERROR)

        # Now, fail on cluster env timeout
        self.cluster_manager_return["build_cluster_env"] = _fail_on_call(
            ClusterEnvBuildTimeout
        )
        run_expecting(ClusterEnvBuildTimeout, ExitCode.CLUSTER_ENV_BUILD_TIMEOUT)
Exemplo n.º 4
0
    def testDriverSetupFails(self):
        """A run configured via ``_succeed_until("local_env")`` must raise
        ``LocalEnvSetupError`` and record the local env setup exit code."""
        outcome = Result()

        self._succeed_until("local_env")

        with self.assertRaises(LocalEnvSetupError):
            self._run(outcome)

        expected_code = ExitCode.LOCAL_ENV_SETUP_ERROR.value
        self.assertEqual(outcome.return_code, expected_code)
Exemplo n.º 5
0
def main(
    test_name: str,
    test_collection_file: Optional[str] = None,
    smoke_test: bool = False,
    report: bool = False,
    ray_wheels: Optional[str] = None,
    cluster_id: Optional[str] = None,
    cluster_env_id: Optional[str] = None,
    no_terminate: bool = False,
):
    """Look up a release test by name, run it and exit the process.

    Args:
        test_name: Name of the test to find in the collection file.
        test_collection_file: Path to the test collection; defaults to
            ``release_tests.yaml`` two directories above this module.
        smoke_test: Convert the test with ``as_smoke_test`` and run it in
            smoke-test mode.
        report: Additionally report through ``LegacyRDSReporter``.
        ray_wheels: Passed to ``find_and_wait_for_ray_wheels_url``.
        cluster_id: Forwarded to ``run_release_test``.
        cluster_env_id: Forwarded to ``run_release_test``.
        no_terminate: Forwarded to ``run_release_test``.

    Raises:
        ReleaseTestCLIError: If the test is not found or the
            ``ANYSCALE_PROJECT`` environment variable is unset or empty.

    This function does not return normally: it always finishes with
    ``sys.exit(result.return_code)``.
    """
    if not test_collection_file:
        test_collection_file = os.path.join(
            os.path.dirname(__file__), "..", "..", "release_tests.yaml")

    collection = read_and_validate_release_test_collection(
        test_collection_file)
    test = find_test(collection, test_name)
    if not test:
        raise ReleaseTestCLIError(
            f"Test `{test_name}` not found in collection file: "
            f"{test_collection_file}")

    if smoke_test:
        test = as_smoke_test(test)

    wheels_url = find_and_wait_for_ray_wheels_url(
        ray_wheels, timeout=DEFAULT_WHEEL_WAIT_TIMEOUT)

    project = os.environ.get("ANYSCALE_PROJECT", None)
    if not project:
        raise ReleaseTestCLIError(
            "You have to set the ANYSCALE_PROJECT environment variable!")

    maybe_fetch_api_token()

    result = Result()

    # Results are always logged; the legacy reporter is opt-in.
    reporters = [LogReporter()]
    if report:
        reporters.append(LegacyRDSReporter())

    run_options = dict(
        anyscale_project=project,
        result=result,
        ray_wheels_url=wheels_url,
        reporters=reporters,
        smoke_test=smoke_test,
        cluster_id=cluster_id,
        cluster_env_id=cluster_env_id,
        no_terminate=no_terminate,
    )
    try:
        result = run_release_test(test, **run_options)
    except ReleaseTestError as e:
        # The exit code is read from ``result`` below, which was handed to
        # ``run_release_test``; here the failure is only logged.
        logger.exception(e)

    logger.info(f"Release test pipeline for test {test['name']} completed. "
                f"Returning with exit code = {result.return_code}")
    sys.exit(result.return_code)
Exemplo n.º 6
0
    def testInvalidPrepareLocalEnv(self):
        """A failing ``prepare_local_env`` surfaces as a local env setup error."""
        outcome = Result()

        injected_failure = _fail_on_call(LocalEnvSetupError)
        self.command_runner_return["prepare_local_env"] = injected_failure

        with self.assertRaises(LocalEnvSetupError):
            self._run(outcome)

        expected_code = ExitCode.LOCAL_ENV_SETUP_ERROR.value
        self.assertEqual(outcome.return_code, expected_code)
Exemplo n.º 7
0
    def testHandleAlert(self):
        """``handle.handle_result`` rejects unknown alerts and raises alerts.

        An ``alert`` value of ``"invalid"`` must raise
        ``ReleaseTestConfigError``; the ``"default"`` alert must raise
        ``ResultsAlert`` for an ``"unsuccessful"`` result and pass silently
        for a ``"finished"`` one.
        """

        def handle_with(alert, status):
            # Objects are built here so construction happens inside the
            # caller's ``assertRaises`` scope.
            handle.handle_result(
                Test(name="unit_alert_test", alert=alert), Result(status=status)
            )

        # Unknown test suite
        with self.assertRaises(ReleaseTestConfigError):
            handle_with("invalid", "finished")

        # Alert raised
        with self.assertRaises(ResultsAlert):
            handle_with("default", "unsuccessful")

        # Everything fine
        handle_with("default", "finished")
Exemplo n.º 8
0
    def testSmokeUnstableTest(self):
        """A smoke-test run of an unstable test records both flags on the result.

        The test config is marked ``stable = False`` and the run is started
        with ``smoke_test=True``; afterwards ``result.stable`` must be falsy
        and ``result.smoke_test`` truthy.
        """
        result = Result()

        self._succeed_until("complete")

        self.test["stable"] = False
        self._run(result, smoke_test=True)

        # Ensure stable and smoke_test are set correctly. unittest assertions
        # are used instead of bare ``assert`` so the checks still run under
        # ``python -O`` and match the rest of the test class.
        self.assertFalse(result.stable)
        self.assertTrue(result.smoke_test)
Exemplo n.º 9
0
    def testFetchResultFails(self):
        """A failure while fetching results is logged but does not fail the run."""
        outcome = Result()

        self._succeed_until("test_command")

        self.command_runner_return["fetch_results"] = _fail_on_call(ResultsError)
        with self.assertLogs(logger, "ERROR") as captured:
            self._run(outcome)
            fetch_errors = [
                line for line in captured.output if "Could not fetch results" in line
            ]
            self.assertTrue(fetch_errors)

        # The run itself still counts as a success.
        self.assertEqual(outcome.return_code, ExitCode.SUCCESS.value)
        self.assertEqual(outcome.status, "finished")

        # Ensure cluster was terminated
        terminate_calls = self.sdk.call_counter["terminate_cluster"]
        self.assertGreaterEqual(terminate_calls, 1)
Exemplo n.º 10
0
    def testPrepareRemoteEnvFails(self):
        """A failing ``prepare_remote_env`` propagates and sets its exit code."""
        outcome = Result()

        self._succeed_until("cluster_start")

        injected_failure = _fail_on_call(RemoteEnvSetupError)
        self.command_runner_return["prepare_remote_env"] = injected_failure

        with self.assertRaises(RemoteEnvSetupError):
            self._run(outcome)

        expected_code = ExitCode.REMOTE_ENV_SETUP_ERROR.value
        self.assertEqual(outcome.return_code, expected_code)

        # Ensure cluster was terminated
        terminate_calls = self.sdk.call_counter["terminate_cluster"]
        self.assertGreaterEqual(terminate_calls, 1)
Exemplo n.º 11
0
    def testWaitForNodesFails(self):
        """A timeout while waiting for nodes propagates and sets its exit code."""
        outcome = Result()

        self._succeed_until("remote_env")

        # Wait for nodes command fails
        injected_timeout = _fail_on_call(ClusterNodesWaitTimeout)
        self.command_runner_return["wait_for_nodes"] = injected_timeout

        with self.assertRaises(ClusterNodesWaitTimeout):
            self._run(outcome)

        expected_code = ExitCode.CLUSTER_WAIT_TIMEOUT.value
        self.assertEqual(outcome.return_code, expected_code)

        # Ensure cluster was terminated
        terminate_calls = self.sdk.call_counter["terminate_cluster"]
        self.assertGreaterEqual(terminate_calls, 1)
Exemplo n.º 12
0
    def testAlertFails(self):
        """With ``mock_alert_return`` set, the run raises ``ResultsAlert``.

        The result must then carry the command-alert exit code and the
        status ``"error"``.
        """
        outcome = Result()

        self._succeed_until("get_last_logs")

        self.mock_alert_return = "Alert raised"

        with self.assertRaises(ResultsAlert):
            self._run(outcome)

        expected_code = ExitCode.COMMAND_ALERT.value
        self.assertEqual(outcome.return_code, expected_code)
        self.assertEqual(outcome.status, "error")

        # Ensure cluster was terminated
        terminate_calls = self.sdk.call_counter["terminate_cluster"]
        self.assertGreaterEqual(terminate_calls, 1)
Exemplo n.º 13
0
    def testInvalidClusterIdOverride(self):
        """Overriding the cluster env ID fails only when the env is missing.

        With the SDK returning ``None`` for ``get_cluster_environment`` the
        run must raise ``ClusterEnvCreateError``. Once a cluster environment
        is returned, the run is still expected to fail, but with some other
        exception type.
        """
        result = Result()

        self._succeed_until("driver_setup")

        self.sdk.returns["get_cluster_environment"] = None

        with self.assertRaises(ClusterEnvCreateError):
            self._run(result, cluster_env_id="existing")

        self.sdk.returns["get_cluster_environment"] = APIDict(result=APIDict(
            config_json={"overridden": True}))

        with self.assertRaises(Exception) as cm:  # Fail somewhere else
            self._run(result, cluster_env_id="existing")
        # This check has to sit after the ``with`` block: ``_run`` raises, so
        # no statement following it inside the block is ever executed, and
        # ``cm.exception`` is only populated once the block has exited.
        self.assertNotIsInstance(cm.exception, ClusterEnvCreateError)
Exemplo n.º 14
0
    def testReportFails(self):
        """A reporter that raises is logged as an error without failing the run."""
        outcome = Result()

        self._succeed_until("complete")

        # Reporter whose ``report_result`` always raises.
        class FailReporter(Reporter):
            def report_result(self, test: Test, result: Result):
                raise RuntimeError

        broken_reporters = [FailReporter()]
        with self.assertLogs(logger, "ERROR") as captured:
            self._run(outcome, reporters=broken_reporters)
            report_errors = [
                line for line in captured.output if "Error reporting results" in line
            ]
            self.assertTrue(report_errors)

        # The run itself still counts as a success.
        self.assertEqual(outcome.return_code, ExitCode.SUCCESS.value)
        self.assertEqual(outcome.status, "finished")

        # Ensure cluster was terminated
        terminate_calls = self.sdk.call_counter["terminate_cluster"]
        self.assertGreaterEqual(terminate_calls, 1)
Exemplo n.º 15
0
    def testTestCommandTimeoutLongRunning(self):
        """A command timeout is an error unless the test is long running.

        The same injected ``CommandTimeout`` first produces a
        ``TestCommandTimeout``; after ``long_running`` is enabled in the test
        config the run completes and reports ``last_update_diff``.
        """
        outcome = Result()

        self._succeed_until("fetch_results")

        # Test command times out
        self.command_runner_return["run_command"] = _fail_on_call(CommandTimeout)
        with self.assertRaises(TestCommandTimeout):
            self._run(outcome)
        self.assertEqual(outcome.return_code, ExitCode.COMMAND_TIMEOUT.value)

        # But now set test to long running
        run_config = self.test["run"]
        run_config["long_running"] = True
        self._run(outcome)  # Will not fail this time

        seconds_since_update = outcome.results["last_update_diff"]
        self.assertGreaterEqual(seconds_since_update, 60.0)

        # Ensure cluster was terminated
        terminate_calls = self.sdk.call_counter["terminate_cluster"]
        self.assertGreaterEqual(terminate_calls, 1)
Exemplo n.º 16
0
    def testTestCommandFails(self):
        """Errors and timeouts of the test command are wrapped and coded.

        ``CommandError`` must surface as ``TestCommandError`` and
        ``CommandTimeout`` as ``TestCommandTimeout``, each with its own exit
        code.
        """
        outcome = Result()

        self._succeed_until("prepare_command")

        # (injected failure, expected wrapped exception, expected exit code)
        scenarios = (
            (CommandError, TestCommandError, ExitCode.COMMAND_ERROR),
            (CommandTimeout, TestCommandTimeout, ExitCode.COMMAND_TIMEOUT),
        )
        for injected, wrapped, expected_code in scenarios:
            self.command_runner_return["run_command"] = _fail_on_call(injected)
            with self.assertRaises(wrapped):
                self._run(outcome)
            self.assertEqual(outcome.return_code, expected_code.value)

        # Ensure cluster was terminated
        terminate_calls = self.sdk.call_counter["terminate_cluster"]
        self.assertGreaterEqual(terminate_calls, 1)
Exemplo n.º 17
0
    def testBuildConfigFailsClusterCompute(self):
        """Cluster compute creation failures map to the resource error code.

        The first run uses the unmodified mock setup and must fail with a
        message matching "Unexpected"; the second injects an explicit create
        failure with the message "Known".
        """
        outcome = Result()
        resource_error = ExitCode.CLUSTER_RESOURCE_ERROR.value

        self._succeed_until("driver_setup")

        # These commands should succeed
        self.command_runner_return["prepare_local_env"] = None

        # Fails because API response faulty
        with self.assertRaisesRegex(ClusterComputeCreateError, "Unexpected"):
            self._run(outcome)
        self.assertEqual(outcome.return_code, resource_error)

        # Fails for random cluster compute reason
        known_failure = _fail_on_call(ClusterComputeCreateError, "Known")
        self.cluster_manager_return["create_cluster_compute"] = known_failure
        with self.assertRaisesRegex(ClusterComputeCreateError, "Known"):
            self._run(outcome)
        self.assertEqual(outcome.return_code, resource_error)
Exemplo n.º 18
0
    def testPrepareCommandFails(self):
        """Errors and timeouts of the prepare command are wrapped and coded.

        ``CommandError`` must surface as ``PrepareCommandError`` with the
        prepare exit code; ``CommandTimeout`` must surface as
        ``PrepareCommandTimeout`` with the cluster wait timeout exit code.
        """
        outcome = Result()

        self._succeed_until("wait_for_nodes")

        # (injected failure, expected wrapped exception, expected exit code)
        # Special case for the timeout: Prepare commands are usually waiting
        # for nodes (this may change in the future!), hence the
        # CLUSTER_WAIT_TIMEOUT code.
        scenarios = (
            (CommandError, PrepareCommandError, ExitCode.PREPARE_ERROR),
            (CommandTimeout, PrepareCommandTimeout, ExitCode.CLUSTER_WAIT_TIMEOUT),
        )
        for injected, wrapped, expected_code in scenarios:
            self.command_runner_return["run_prepare_command"] = _fail_on_call(
                injected
            )
            with self.assertRaises(wrapped):
                self._run(outcome)
            self.assertEqual(outcome.return_code, expected_code.value)

        # Ensure cluster was terminated
        terminate_calls = self.sdk.call_counter["terminate_cluster"]
        self.assertGreaterEqual(terminate_calls, 1)
Exemplo n.º 19
0
def run_release_test(
    test: Test,
    anyscale_project: str,
    result: Result,
    ray_wheels_url: str,
    reporters: Optional[List[Reporter]] = None,
    smoke_test: bool = False,
    cluster_id: Optional[str] = None,
    cluster_env_id: Optional[str] = None,
    no_terminate: bool = False,
) -> Result:
    """Run one release test end to end and record the outcome on ``result``.

    The function changes into the test's working directory, sets up the
    cluster manager, file manager and command runner, runs the pipeline
    (config loading, local env setup, cluster start, remote env setup,
    optional node wait and prepare command, test command, result fetch),
    then fetches logs, terminates the cluster, restores the working
    directory, handles the result and calls every reporter.

    Args:
        test: Test configuration; read keys include ``"name"``,
            ``"working_dir"``, ``"run"`` and optionally ``"stable"`` and
            ``"driver_setup"``.
        anyscale_project: Passed to the cluster manager constructor.
        result: Result object that is mutated in place and returned.
        ray_wheels_url: Stored on the result and used for the cluster env
            and the local env setup.
        reporters: Reporters whose ``report_result`` is called at the end;
            exceptions they raise are logged and swallowed.
        smoke_test: Append ``--smoke-test`` to the command and set
            ``IS_SMOKE_TEST=1`` in its environment.
        cluster_id: Re-use this cluster instead of building configs and
            starting a new one.
        cluster_env_id: Use this existing cluster environment instead of the
            one loaded from the test configuration.
        no_terminate: Skip the cluster termination request.

    Returns:
        The same ``result`` object that was passed in.

    Raises:
        ReleaseTestConfigError: Unknown command runner type or file manager.
        ReleaseTestSetupError: The managers or command runner could not be
            instantiated.
        Exception: Any exception captured during the pipeline or result
            handling is re-raised after cleanup and reporting.
    """
    validate_test(test)

    result.wheels_url = ray_wheels_url
    result.stable = test.get("stable", True)

    buildkite_url = os.getenv("BUILDKITE_BUILD_URL", "")
    if buildkite_url:
        buildkite_url += "#" + os.getenv("BUILDKITE_JOB_ID", "")
    result.buildkite_url = buildkite_url

    working_dir = test["working_dir"]

    old_wd = os.getcwd()
    new_wd = os.path.join(RELEASE_PACKAGE_DIR, working_dir)
    os.chdir(new_wd)

    start_time = time.monotonic()

    # The steps below can raise before the main pipeline (whose cleanup
    # restores the working directory) has started. Restore the caller's
    # working directory on those early failures so it is not leaked.
    try:
        run_type = test["run"].get("type", "sdk_command")

        command_runner_cls = type_str_to_command_runner.get(run_type)
        if not command_runner_cls:
            raise ReleaseTestConfigError(
                f"Unknown command runner type: {run_type}. Must be one of "
                f"{list(type_str_to_command_runner.keys())}")

        cluster_manager_cls = command_runner_to_cluster_manager[
            command_runner_cls]

        file_manager_str = test["run"].get("file_manager", None)
        if file_manager_str:
            if file_manager_str not in file_manager_str_to_file_manager:
                raise ReleaseTestConfigError(
                    f"Unknown file manager: {file_manager_str}. Must be one "
                    f"of {list(file_manager_str_to_file_manager.keys())}")
            file_manager_cls = file_manager_str_to_file_manager[
                file_manager_str]
        else:
            file_manager_cls = command_runner_to_file_manager[
                command_runner_cls]

        # Instantiate managers and command runner
        try:
            cluster_manager = cluster_manager_cls(test["name"],
                                                  anyscale_project)
            file_manager = file_manager_cls(cluster_manager=cluster_manager)
            command_runner = command_runner_cls(cluster_manager, file_manager,
                                                working_dir)
        except Exception as e:
            raise ReleaseTestSetupError(
                f"Error setting up release test: {e}") from e
    except Exception:
        os.chdir(old_wd)
        raise

    # Failures inside the pipeline are captured rather than raised directly
    # so that logs, termination and reporting still happen afterwards.
    pipeline_exception = None
    try:
        # Load configs
        cluster_env = load_test_cluster_env(test,
                                            ray_wheels_url=ray_wheels_url)
        cluster_compute = load_test_cluster_compute(test)

        if cluster_env_id:
            try:
                cluster_manager.cluster_env_id = cluster_env_id
                cluster_manager.build_cluster_env()
                cluster_manager.fetch_build_info()
                logger.info("Using overridden cluster environment with ID "
                            f"{cluster_env_id} and build ID "
                            f"{cluster_manager.cluster_env_build_id}")
            except Exception as e:
                raise ClusterEnvCreateError(
                    f"Could not get existing overridden cluster environment "
                    f"{cluster_env_id}: {e}") from e
        else:
            cluster_manager.set_cluster_env(cluster_env)

        cluster_manager.set_cluster_compute(cluster_compute)

        driver_setup_script = test.get("driver_setup", None)
        if driver_setup_script:
            try:
                run_bash_script(driver_setup_script)
            except Exception as e:
                raise LocalEnvSetupError(
                    f"Driver setup script failed: {e}") from e

        # Install local dependencies
        command_runner.prepare_local_env(ray_wheels_url)

        # Start session
        if cluster_id:
            # Re-use existing cluster ID for development
            cluster_manager.cluster_id = cluster_id
            cluster_manager.cluster_name = get_cluster_name(cluster_id)
        else:
            build_timeout = test["run"].get("build_timeout",
                                            DEFAULT_BUILD_TIMEOUT)

            if cluster_env_id:
                cluster_manager.cluster_env_id = cluster_env_id

            cluster_manager.build_configs(timeout=build_timeout)

            cluster_timeout = test["run"].get("session_timeout",
                                              DEFAULT_CLUSTER_TIMEOUT)

            autosuspend_mins = test["run"].get("autosuspend_mins", None)
            if autosuspend_mins:
                cluster_manager.autosuspend_minutes = autosuspend_mins

            cluster_manager.start_cluster(timeout=cluster_timeout)

        result.cluster_url = cluster_manager.get_cluster_url()

        # Upload files
        command_runner.prepare_remote_env()

        command_timeout = test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)

        wait_for_nodes = test["run"].get("wait_for_nodes", None)
        if wait_for_nodes:
            num_nodes = test["run"]["wait_for_nodes"]["num_nodes"]
            wait_timeout = test["run"]["wait_for_nodes"]["timeout"]
            command_runner.wait_for_nodes(num_nodes, wait_timeout)

        prepare_cmd = test["run"].get("prepare", None)
        if prepare_cmd:
            # The prepare command falls back to the test command's timeout.
            prepare_timeout = test["run"].get("prepare_timeout",
                                              command_timeout)
            try:
                command_runner.run_prepare_command(prepare_cmd,
                                                   timeout=prepare_timeout)
            except CommandError as e:
                raise PrepareCommandError(e) from e
            except CommandTimeout as e:
                raise PrepareCommandTimeout(e) from e

        command = test["run"]["script"]
        command_env = {}

        if smoke_test:
            command = f"{command} --smoke-test"
            command_env["IS_SMOKE_TEST"] = "1"

        try:
            command_runner.run_command(command,
                                       env=command_env,
                                       timeout=command_timeout)
        except CommandError as e:
            raise TestCommandError(e) from e
        except CommandTimeout as e:
            raise TestCommandTimeout(e) from e

        # Missing results are tolerated: log and continue with an empty dict.
        try:
            command_results = command_runner.fetch_results()
        except Exception as e:
            logger.error(f"Could not fetch results for test command: {e}")
            command_results = {}

        # Postprocess result:
        if "last_update" in command_results:
            command_results["last_update_diff"] = time.time(
            ) - command_results.get("last_update", 0.0)
        if smoke_test:
            command_results["smoke_test"] = True

        result.results = command_results
        result.status = "finished"

    except Exception as e:
        pipeline_exception = e

    try:
        last_logs = command_runner.get_last_logs()
    except Exception as e:
        logger.error(f"Error fetching logs: {e}")
        last_logs = "No logs could be retrieved."

    result.last_logs = last_logs

    if not no_terminate:
        try:
            cluster_manager.terminate_cluster(wait=False)
        except Exception as e:
            logger.error(f"Could not terminate cluster: {e}")

    time_taken = time.monotonic() - start_time
    result.runtime = time_taken

    os.chdir(old_wd)

    if not pipeline_exception:
        # Only handle results if we didn't run into issues earlier
        try:
            handle_result(test, result)
        except Exception as e:
            pipeline_exception = e

    if pipeline_exception:
        exit_code, error_type, runtime = handle_exception(pipeline_exception)

        result.return_code = exit_code.value
        result.status = error_type
        if runtime is not None:
            result.runtime = runtime

    # Reporting happens for successful and failed runs alike; a failing
    # reporter must not mask the pipeline outcome.
    reporters = reporters or []
    for reporter in reporters:
        try:
            reporter.report_result(test, result)
        except Exception as e:
            logger.error(f"Error reporting results via {type(reporter)}: {e}")

    if pipeline_exception:
        raise pipeline_exception

    return result
Exemplo n.º 20
0
 def testDefaultAlert(self):
     """``default.handle_result`` is truthy for "timeout", falsy for "finished"."""
     timeout_verdict = default.handle_result(self.test, Result(status="timeout"))
     self.assertTrue(timeout_verdict)

     finished_verdict = default.handle_result(self.test, Result(status="finished"))
     self.assertFalse(finished_verdict)
Exemplo n.º 21
0
def run_release_test(
    test: Test,
    anyscale_project: str,
    result: Result,
    ray_wheels_url: str,
    reporters: Optional[List[Reporter]] = None,
    smoke_test: bool = False,
    cluster_id: Optional[str] = None,
    cluster_env_id: Optional[str] = None,
    no_terminate: bool = False,
) -> Result:
    """Run one release test end to end and record the outcome on ``result``.

    The function changes into the test's working directory, sets up the
    cluster manager, file manager and command runner, runs the pipeline
    (config loading, local env setup, cluster start, remote env setup,
    optional node wait and prepare command, test command, result fetch),
    then fetches logs, terminates the cluster, restores the working
    directory, handles the result and calls every reporter. Each stage is
    announced through ``buildkite_group``.

    Args:
        test: Test configuration; read keys include ``"name"``,
            ``"working_dir"``, ``"run"``, ``"cluster"`` and optionally
            ``"stable"`` and ``"driver_setup"``.
        anyscale_project: Passed to the cluster manager constructor.
        result: Result object that is mutated in place and returned.
        ray_wheels_url: Stored on the result and used for the cluster env
            and the local env setup.
        reporters: Reporters whose ``report_result`` is called at the end;
            exceptions they raise are logged and swallowed.
        smoke_test: Recorded on the result and passed to the cluster
            manager; also appends ``--smoke-test`` to the command and sets
            ``IS_SMOKE_TEST=1`` in its environment.
        cluster_id: Re-use this cluster instead of building configs and
            starting a new one.
        cluster_env_id: Use this existing cluster environment instead of the
            one loaded from the test configuration.
        no_terminate: Skip the cluster termination request.

    Returns:
        The same ``result`` object that was passed in.

    Raises:
        ReleaseTestConfigError: Unknown command runner type or file manager.
        ReleaseTestSetupError: The managers or command runner could not be
            instantiated.
        Exception: Any exception captured during the pipeline or result
            handling is re-raised after cleanup and reporting.
    """
    buildkite_group(":spiral_note_pad: Loading test configuration")

    validate_test(test)

    result.wheels_url = ray_wheels_url
    result.stable = test.get("stable", True)
    result.smoke_test = smoke_test

    buildkite_url = os.getenv("BUILDKITE_BUILD_URL", "")
    if buildkite_url:
        buildkite_url += "#" + os.getenv("BUILDKITE_JOB_ID", "")
    result.buildkite_url = buildkite_url

    working_dir = test["working_dir"]

    old_wd = os.getcwd()
    new_wd = os.path.join(RELEASE_PACKAGE_DIR, working_dir)
    os.chdir(new_wd)

    start_time = time.monotonic()

    # The steps below can raise before the main pipeline (whose cleanup
    # restores the working directory) has started. Restore the caller's
    # working directory on those early failures so it is not leaked.
    try:
        run_type = test["run"].get("type", "sdk_command")

        command_runner_cls = type_str_to_command_runner.get(run_type)
        if not command_runner_cls:
            raise ReleaseTestConfigError(
                f"Unknown command runner type: {run_type}. Must be one of "
                f"{list(type_str_to_command_runner.keys())}")

        cluster_manager_cls = command_runner_to_cluster_manager[
            command_runner_cls]

        file_manager_str = test["run"].get("file_manager", None)
        if file_manager_str:
            if file_manager_str not in file_manager_str_to_file_manager:
                raise ReleaseTestConfigError(
                    f"Unknown file manager: {file_manager_str}. Must be one "
                    f"of {list(file_manager_str_to_file_manager.keys())}")
            file_manager_cls = file_manager_str_to_file_manager[
                file_manager_str]
        else:
            file_manager_cls = command_runner_to_file_manager[
                command_runner_cls]

        # Instantiate managers and command runner
        try:
            cluster_manager = cluster_manager_cls(test["name"],
                                                  anyscale_project,
                                                  smoke_test=smoke_test)
            file_manager = file_manager_cls(cluster_manager=cluster_manager)
            command_runner = command_runner_cls(cluster_manager, file_manager,
                                                working_dir)
        except Exception as e:
            raise ReleaseTestSetupError(
                f"Error setting up release test: {e}") from e
    except Exception:
        os.chdir(old_wd)
        raise

    # Failures inside the pipeline are captured rather than raised directly
    # so that logs, termination and reporting still happen afterwards.
    pipeline_exception = None
    try:
        # Load configs
        cluster_env = load_test_cluster_env(test,
                                            ray_wheels_url=ray_wheels_url)
        cluster_compute = load_test_cluster_compute(test)

        if cluster_env_id:
            try:
                cluster_manager.cluster_env_id = cluster_env_id
                cluster_manager.build_cluster_env()
                cluster_manager.fetch_build_info()
                logger.info("Using overridden cluster environment with ID "
                            f"{cluster_env_id} and build ID "
                            f"{cluster_manager.cluster_env_build_id}")
            except Exception as e:
                raise ClusterEnvCreateError(
                    f"Could not get existing overridden cluster environment "
                    f"{cluster_env_id}: {e}") from e
        else:
            cluster_manager.set_cluster_env(cluster_env)

        cluster_manager.set_cluster_compute(cluster_compute)

        buildkite_group(":nut_and_bolt: Setting up local environment")
        driver_setup_script = test.get("driver_setup", None)
        if driver_setup_script:
            try:
                run_bash_script(driver_setup_script)
            except Exception as e:
                raise LocalEnvSetupError(
                    f"Driver setup script failed: {e}") from e

        # Install local dependencies
        command_runner.prepare_local_env(ray_wheels_url)
        command_timeout = test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)

        # Re-install anyscale package as local dependencies might have changed
        # from local env setup
        reinstall_anyscale_dependencies()

        # Print installed pip packages
        buildkite_group(":bulb: Local environment information")
        pip_packages = get_pip_packages()
        pip_package_string = "\n".join(pip_packages)
        logger.info(f"Installed python packages:\n{pip_package_string}")

        # Start cluster
        if cluster_id:
            buildkite_group(":rocket: Using existing cluster")
            # Re-use existing cluster ID for development
            cluster_manager.cluster_id = cluster_id
            cluster_manager.cluster_name = get_cluster_name(cluster_id)
        else:
            buildkite_group(":gear: Building cluster environment")
            build_timeout = test["run"].get("build_timeout",
                                            DEFAULT_BUILD_TIMEOUT)

            if cluster_env_id:
                cluster_manager.cluster_env_id = cluster_env_id

            cluster_manager.build_configs(timeout=build_timeout)

            cluster_timeout = test["run"].get("session_timeout",
                                              DEFAULT_CLUSTER_TIMEOUT)

            autosuspend_mins = test["cluster"].get("autosuspend_mins", None)
            if autosuspend_mins:
                cluster_manager.autosuspend_minutes = autosuspend_mins
            else:
                # No explicit setting: derive it from the command timeout
                # (in minutes, plus 10), capped at the default.
                cluster_manager.autosuspend_minutes = min(
                    DEFAULT_AUTOSUSPEND_MINS,
                    int(command_timeout / 60) + 10)

            buildkite_group(":rocket: Starting up cluster")
            cluster_manager.start_cluster(timeout=cluster_timeout)

        result.cluster_url = cluster_manager.get_cluster_url()

        # Upload files
        buildkite_group(":wrench: Preparing remote environment")
        command_runner.prepare_remote_env()

        wait_for_nodes = test["run"].get("wait_for_nodes", None)
        if wait_for_nodes:
            buildkite_group(":stopwatch: Waiting for nodes to come up")
            num_nodes = test["run"]["wait_for_nodes"]["num_nodes"]
            wait_timeout = test["run"]["wait_for_nodes"].get(
                "timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT)
            command_runner.wait_for_nodes(num_nodes, wait_timeout)

        prepare_cmd = test["run"].get("prepare", None)
        if prepare_cmd:
            # The prepare command falls back to the test command's timeout.
            prepare_timeout = test["run"].get("prepare_timeout",
                                              command_timeout)
            try:
                command_runner.run_prepare_command(prepare_cmd,
                                                   timeout=prepare_timeout)
            except CommandError as e:
                raise PrepareCommandError(e) from e
            except CommandTimeout as e:
                raise PrepareCommandTimeout(e) from e

        buildkite_group(":runner: Running test script")
        command = test["run"]["script"]
        command_env = {}

        if smoke_test:
            command = f"{command} --smoke-test"
            command_env["IS_SMOKE_TEST"] = "1"

        is_long_running = test["run"].get("long_running", False)

        try:
            command_runner.run_command(command,
                                       env=command_env,
                                       timeout=command_timeout)
        except CommandError as e:
            raise TestCommandError(e) from e
        except CommandTimeout as e:
            if not is_long_running:
                # Only raise error if command is not long running
                raise TestCommandTimeout(e) from e

        # Missing results are tolerated: log and continue with an empty dict.
        buildkite_group(":floppy_disk: Fetching results")
        try:
            command_results = command_runner.fetch_results()
        except Exception as e:
            logger.error("Could not fetch results for test command")
            logger.exception(e)
            command_results = {}

        # Postprocess result:
        if "last_update" in command_results:
            command_results["last_update_diff"] = time.time(
            ) - command_results.get("last_update", 0.0)
        if smoke_test:
            command_results["smoke_test"] = True

        result.results = command_results
        result.status = "finished"

    except Exception as e:
        logger.exception(e)
        buildkite_open_last()
        pipeline_exception = e

    try:
        last_logs = command_runner.get_last_logs()
    except Exception as e:
        logger.error(f"Error fetching logs: {e}")
        last_logs = "No logs could be retrieved."

    result.last_logs = last_logs

    if not no_terminate:
        buildkite_group(":earth_africa: Terminating cluster")
        try:
            cluster_manager.terminate_cluster(wait=False)
        except Exception as e:
            logger.error(f"Could not terminate cluster: {e}")

    time_taken = time.monotonic() - start_time
    result.runtime = time_taken

    os.chdir(old_wd)

    if not pipeline_exception:
        buildkite_group(":mag: Interpreting results")
        # Only handle results if we didn't run into issues earlier
        try:
            handle_result(test, result)
        except Exception as e:
            pipeline_exception = e

    if pipeline_exception:
        buildkite_group(":rotating_light: Handling errors")
        exit_code, error_type, runtime = handle_exception(pipeline_exception)

        result.return_code = exit_code.value
        result.status = error_type
        if runtime is not None:
            result.runtime = runtime

    # Reporting happens for successful and failed runs alike; a failing
    # reporter must not mask the pipeline outcome.
    buildkite_group(":memo: Reporting results", open=True)
    reporters = reporters or []
    for reporter in reporters:
        try:
            reporter.report_result(test, result)
        except Exception as e:
            logger.error(f"Error reporting results via {type(reporter)}: {e}")

    if pipeline_exception:
        raise pipeline_exception

    return result
Exemplo n.º 22
0
def main(
    test_name: str,
    test_collection_file: Optional[str] = None,
    smoke_test: bool = False,
    report: bool = False,
    ray_wheels: Optional[str] = None,
    cluster_id: Optional[str] = None,
    cluster_env_id: Optional[str] = None,
    env: Optional[str] = None,
    no_terminate: bool = False,
):
    """Look up a release test by name, run it and exit the process.

    Args:
        test_name: Name of the test to find in the collection file.
        test_collection_file: Path to the test collection; defaults to
            ``release_tests.yaml`` two directories above this module.
        smoke_test: Convert the test with ``as_smoke_test`` and run it in
            smoke-test mode.
        report: Additionally report through ``LegacyRDSReporter`` and
            ``DBReporter``.
        ray_wheels: Passed to ``find_and_wait_for_ray_wheels_url``.
        cluster_id: Forwarded to ``run_release_test``.
        cluster_env_id: Forwarded to ``run_release_test``.
        env: Environment name to load; falls back to the test's ``"env"``
            entry and then to ``DEFAULT_ENVIRONMENT``.
        no_terminate: Forwarded to ``run_release_test``.

    Raises:
        ReleaseTestCLIError: If the test is not found or the
            ``ANYSCALE_PROJECT`` environment variable is unset or empty.

    This function does not return normally: it always finishes with
    ``sys.exit`` using the exit code of the run.
    """
    test_collection_file = test_collection_file or os.path.join(
        os.path.dirname(__file__), "..", "..", "release_tests.yaml")
    test_collection = read_and_validate_release_test_collection(
        test_collection_file)
    test = find_test(test_collection, test_name)

    if not test:
        raise ReleaseTestCLIError(
            f"Test `{test_name}` not found in collection file: "
            f"{test_collection_file}")

    if smoke_test:
        test = as_smoke_test(test)

    # The environment is loaded into os.environ before ANYSCALE_PROJECT is
    # read below.
    env_to_use = env or test.get("env", DEFAULT_ENVIRONMENT)
    env_dict = load_environment(env_to_use)
    populate_os_env(env_dict)

    if "python" in test:
        python_version = parse_python_version(test["python"])
    else:
        python_version = DEFAULT_PYTHON_VERSION

    ray_wheels_url = find_and_wait_for_ray_wheels_url(
        ray_wheels,
        python_version=python_version,
        timeout=DEFAULT_WHEEL_WAIT_TIMEOUT)

    anyscale_project = os.environ.get("ANYSCALE_PROJECT", None)
    if not anyscale_project:
        raise ReleaseTestCLIError(
            "You have to set the ANYSCALE_PROJECT environment variable!")

    maybe_fetch_api_token()

    result = Result()

    reporters = [LogReporter()]

    if "BUILDKITE" in os.environ:
        reporters.append(ArtifactsReporter())

    if report:
        reporters.append(LegacyRDSReporter())
        reporters.append(DBReporter())

    try:
        result = run_release_test(
            test,
            anyscale_project=anyscale_project,
            result=result,
            ray_wheels_url=ray_wheels_url,
            reporters=reporters,
            smoke_test=smoke_test,
            cluster_id=cluster_id,
            cluster_env_id=cluster_env_id,
            no_terminate=no_terminate,
        )
        return_code = result.return_code
    except ReleaseTestError as e:
        logger.exception(e)
        # On failure the exit code comes from the exception itself.
        return_code = e.exit_code.value

    logger.info(f"Release test pipeline for test {test['name']} completed. "
                f"Returning with exit code = {return_code}")
    # Exit with the code that was just computed and logged; using
    # ``result.return_code`` here would ignore the exception's exit code.
    sys.exit(return_code)