def _wait_job(self, command_id: int, timeout: int):
    """Block until the job behind ``command_id`` reaches a terminal status.

    The status is polled once per second through
    ``self._get_job_status_with_retry`` and a progress line is logged
    every 30 seconds.

    Args:
        command_id: Key of the command; also used to look up the start
            timestamp in ``self.start_time``.
        timeout: Maximum number of seconds to wait.

    Returns:
        A ``(retcode, duration)`` tuple. ``retcode`` is 0 when the final
        status is ``JobStatus.SUCCEEDED`` and -1 otherwise. ``duration`` is
        ``time.time()`` minus ``self.start_time[command_id]``.

    Raises:
        CommandTimeout: If no terminal status was observed in time.
    """
    from ray.job_submission import JobStatus  # noqa: F811

    terminal_states = {
        JobStatus.SUCCEEDED,
        JobStatus.STOPPED,
        JobStatus.FAILED,
    }

    start_time = time.monotonic()
    timeout_at = start_time + timeout
    next_status = start_time + 30

    while True:
        now = time.monotonic()
        if now >= timeout_at:
            raise CommandTimeout(
                f"Cluster command timed out after {timeout} seconds.")

        if now >= next_status:
            logger.info(f"... command still running ..."
                        f"({int(now - start_time)} seconds) ...")
            next_status += 30

        status = self._get_job_status_with_retry(command_id)
        if status in terminal_states:
            break
        time.sleep(1)

    # The status that ended the loop is already terminal, so it is used
    # directly instead of issuing another status request.
    # TODO(sang): Propagate JobInfo.error_type
    retcode = 0 if status == JobStatus.SUCCEEDED else -1
    duration = time.time() - self.start_time[command_id]
    return retcode, duration
def wait_for_nodes(self, num_nodes: int, timeout: float = 900):
    """Connect to the cluster and wait until ``num_nodes`` nodes are listed.

    ``len(ray.nodes())`` is polled once per second and a progress line is
    logged every 30 seconds.

    Args:
        num_nodes: Number of nodes that must be reported by ``ray.nodes()``.
        timeout: Maximum number of seconds to wait.

    Raises:
        ClusterStartupError: On any exception raised while connecting or
            waiting. This includes the ``ClusterNodesWaitTimeout`` raised
            on timeout, which is wrapped as the cause.
    """
    ray_address = self.cluster_manager.get_cluster_address()
    try:
        # `ray.is_initialized` is a function; it has to be called, otherwise
        # the bare function object is always truthy.
        if ray.is_initialized():
            ray.shutdown()

        ray.init(address=ray_address)
        try:
            start_time = time.monotonic()
            timeout_at = start_time + timeout
            next_status = start_time + 30
            nodes_up = len(ray.nodes())
            while nodes_up < num_nodes:
                now = time.monotonic()
                if now >= timeout_at:
                    raise ClusterNodesWaitTimeout(
                        f"Only {nodes_up}/{num_nodes} are up after "
                        f"{timeout} seconds.")

                if now >= next_status:
                    logger.info(f"Waiting for nodes to come up: "
                                f"{nodes_up}/{num_nodes} "
                                f"({now - start_time:.2f} seconds, "
                                f"timeout: {timeout} seconds).")
                    next_status += 30

                time.sleep(1)
                nodes_up = len(ray.nodes())
        finally:
            # Disconnect on success as well as on timeout/error so the
            # connection opened above is not left behind.
            ray.shutdown()
    except Exception as e:
        raise ClusterStartupError(
            f"Exception when waiting for nodes: {e}") from e

    logger.info(f"All {num_nodes} nodes are up.")
def install_matching_ray_locally(ray_wheels: Optional[str]):
    """Install the Ray wheel matching ``ray_wheels`` into the local env.

    The wheel location must contain the ``manylinux2014_x86_64`` platform
    tag; that tag is swapped for the tag of the local platform before the
    existing ``ray`` package is uninstalled and the wheel is installed with
    pip. Afterwards every module named in ``RELOAD_MODULES`` that is already
    imported is reloaded.

    Args:
        ray_wheels: Wheel location. If falsy, a warning is logged and
            nothing is installed.
    """
    if not ray_wheels:
        logger.warning(
            "No Ray wheels found - can't install matching Ray wheels locally!")
        return

    linux_tag = "manylinux2014_x86_64"
    assert linux_tag in ray_wheels, ray_wheels

    # Platform tag to use for each non-Linux `sys.platform` value.
    tag_by_platform = {
        "darwin": "macosx_10_15_intel",
        "win32": "win_amd64",
    }
    local_tag = tag_by_platform.get(sys.platform, linux_tag)
    ray_wheels = ray_wheels.replace(linux_tag, local_tag)

    logger.info(f"Installing matching Ray wheels locally: {ray_wheels}")
    pip_commands = (
        "pip uninstall -y ray",
        f"pip install -U {shlex.quote(ray_wheels)}",
    )
    for pip_command in pip_commands:
        subprocess.check_output(
            pip_command,
            shell=True,
            env=os.environ,
            text=True,
        )

    # Only modules that are already loaded are reloaded.
    for module_name in RELOAD_MODULES:
        if module_name in sys.modules:
            importlib.reload(sys.modules[module_name])
def find_cloud_by_name(cloud_name: str,
                       sdk: Optional["AnyscaleSDK"] = None) -> Optional[str]:
    """Return the ID of the cloud called ``cloud_name``.

    Pages through ``sdk.search_clouds`` 50 entries at a time until a cloud
    with a matching name is found, no next paging token is returned, or a
    page comes back empty.

    Args:
        cloud_name: Name to match against each result's ``name``.
        sdk: SDK object to query; ``get_anyscale_sdk()`` is used if falsy.

    Returns:
        The ``id`` of the first matching result, or ``None`` if there is
        no match.
    """
    if not sdk:
        sdk = get_anyscale_sdk()

    logger.info(f"Looking up cloud with name `{cloud_name}`. ")

    found_id = None
    next_token = None
    while True:
        page = sdk.search_clouds(
            clouds_query={"paging": {
                "count": 50,
                "paging_token": next_token
            }})
        next_token = page.metadata.next_paging_token

        for cloud in page.results:
            if cloud.name == cloud_name:
                found_id = cloud.id
                logger.info(
                    f"Found cloud with name `{cloud_name}` as `{found_id}`")
                break

        # Stop on a hit, on the last page, or on an empty page.
        if found_id or not next_token or not len(page.results):
            return found_id
def report_result(self, test: Test, result: Result):
    """Send one result record to the ``ray-ci-results`` delivery stream.

    The record is serialized to JSON and passed to
    ``self.firehose.put_record``. A failure to deliver is logged and not
    re-raised.

    Args:
        test: Test configuration; ``name``, ``group``, ``team`` and
            ``frequency`` are read with an empty-string default.
        result: Result object whose fields are copied into the record.
    """
    logger.info("Persisting result to the databricks delta lake...")

    record = {
        "_table": "release_test_result",
        "report_timestamp_ms": int(time.time() * 1000),
        "status": result.status or "",
        "results": result.results or {},
    }
    # Descriptive fields from the test config, defaulting to "".
    for field in ("name", "group", "team", "frequency"):
        record[field] = test.get(field, "")
    record.update({
        "cluster_url": result.cluster_url or "",
        "wheel_url": result.wheels_url or "",
        "buildkite_url": result.buildkite_url or "",
        "runtime": result.runtime or -1.0,
        "stable": result.stable,
        "return_code": result.return_code,
    })

    payload = json.dumps(record)
    logger.debug(f"Result json: {payload}")

    try:
        self.firehose.put_record(
            DeliveryStreamName="ray-ci-results",
            Record={"Data": payload},
        )
    except Exception:
        logger.exception(
            "Failed to persist result to the databricks delta lake")
        return

    logger.info("Result has been persisted to the databricks delta lake")
def run_command(self,
                command: str,
                env: Optional[Dict] = None,
                timeout: float = 3600.0) -> float:
    """Run ``command`` on the cluster through ``self.job_manager``.

    The environment returned by ``self.get_full_command_env(env)`` is both
    prepended to the command as ``KEY=value`` pairs and passed to
    ``job_manager.run_and_wait``.

    Args:
        command: Shell command to run.
        env: Extra environment variables for the command.
        timeout: Timeout in seconds forwarded to ``run_and_wait``.

    Returns:
        The time value reported by ``run_and_wait``.

    Raises:
        CommandError: If the reported status code is not 0.
    """
    full_env = self.get_full_command_env(env)

    # Every KEY=value pair is followed by a single space.
    env_prefix = ""
    if full_env:
        env_prefix = "".join(
            f"{key}={value} " for key, value in full_env.items())
    full_command = f"{env_prefix}{command}"

    logger.info(
        f"Running command in cluster {self.cluster_manager.cluster_name}: "
        f"{full_command}")
    logger.info(f"Link to cluster: "
                f"{format_link(self.cluster_manager.get_cluster_url())}")

    status_code, time_taken = self.job_manager.run_and_wait(
        full_command, full_env, timeout=timeout)

    if status_code == 0:
        return time_taken
    raise CommandError(
        f"Command returned non-success status: {status_code}")
def reinstall_anyscale_dependencies() -> None:
    """Upgrade the ``anyscale`` package in the local environment via pip."""
    logger.info("Re-installing `anyscale` package")
    pip_command = "pip install -U anyscale"
    subprocess.check_output(pip_command, shell=True, text=True)
def main(
        test_name: str,
        test_collection_file: Optional[str] = None,
        smoke_test: bool = False,
        report: bool = False,
        ray_wheels: Optional[str] = None,
        cluster_id: Optional[str] = None,
        cluster_env_id: Optional[str] = None,
        no_terminate: bool = False,
):
    """Run a single release test and exit with its return code.

    Args:
        test_name: Name of the test to look up in the collection file.
        test_collection_file: Path of the collection file; defaults to
            ``release_tests.yaml`` two directories above this module.
        smoke_test: If true, the test is converted with ``as_smoke_test``.
        report: If true, a ``LegacyRDSReporter`` is used in addition to the
            ``LogReporter``.
        ray_wheels: Passed to ``find_and_wait_for_ray_wheels_url``.
        cluster_id: Forwarded to ``run_release_test``.
        cluster_env_id: Forwarded to ``run_release_test``.
        no_terminate: Forwarded to ``run_release_test``.

    Raises:
        ReleaseTestCLIError: If the test is not in the collection or the
            ``ANYSCALE_PROJECT`` environment variable is not set.
        SystemExit: Always, on completion, with ``result.return_code``.
    """
    if not test_collection_file:
        test_collection_file = os.path.join(
            os.path.dirname(__file__), "..", "..", "release_tests.yaml")

    test = find_test(
        read_and_validate_release_test_collection(test_collection_file),
        test_name)
    if not test:
        raise ReleaseTestCLIError(
            f"Test `{test_name}` not found in collection file: "
            f"{test_collection_file}")

    if smoke_test:
        test = as_smoke_test(test)

    ray_wheels_url = find_and_wait_for_ray_wheels_url(
        ray_wheels, timeout=DEFAULT_WHEEL_WAIT_TIMEOUT)

    anyscale_project = os.environ.get("ANYSCALE_PROJECT", None)
    if not anyscale_project:
        raise ReleaseTestCLIError(
            "You have to set the ANYSCALE_PROJECT environment variable!")

    maybe_fetch_api_token()

    # This object is handed to `run_release_test`; if that call raises, it
    # is the object whose return code is reported below.
    result = Result()

    reporters = [LogReporter()]
    if report:
        reporters.append(LegacyRDSReporter())

    run_kwargs = dict(
        anyscale_project=anyscale_project,
        result=result,
        ray_wheels_url=ray_wheels_url,
        reporters=reporters,
        smoke_test=smoke_test,
        cluster_id=cluster_id,
        cluster_env_id=cluster_env_id,
        no_terminate=no_terminate,
    )
    try:
        result = run_release_test(test, **run_kwargs)
    except ReleaseTestError as e:
        # Logged only; the exit code below carries the outcome.
        logger.exception(e)

    logger.info(f"Release test pipeline for test {test['name']} completed. "
                f"Returning with exit code = {result.return_code}")
    sys.exit(result.return_code)
def find_and_wait_for_ray_wheels_url(
        ray_wheels: Optional[str] = None,
        python_version: Tuple[int, int] = DEFAULT_PYTHON_VERSION,
        timeout: float = 3600.0,
) -> str:
    """Resolve the Ray wheels URL and wait until it is available.

    Args:
        ray_wheels: Passed through to ``find_ray_wheels_url``.
        python_version: Python version passed to ``find_ray_wheels_url``.
        timeout: Timeout in seconds passed to ``wait_for_url``.

    Returns:
        Whatever ``wait_for_url`` returns for the resolved URL.
    """
    wheels_url = find_ray_wheels_url(
        ray_wheels, python_version=python_version)
    logger.info(f"Using Ray wheels URL: {wheels_url}")
    available_url = wait_for_url(wheels_url, timeout=timeout)
    return available_url
def download(self, source: str, target: str):
    """Pull ``source`` from the cluster to ``target`` via the session controller.

    Args:
        source: Source path; logged as ``<cwd>`` when falsy.
        target: Target path; logged as ``<cwd>`` when falsy.
    """
    source_label = source or "<cwd>"
    target_label = target or "<cwd>"
    logger.info(f"Downloading {source_label} to {target_label} "
                f"using SessionController")

    pull_kwargs = dict(
        session_name=self.cluster_manager.cluster_name,
        source=source,
        target=target,
        config=None,
    )
    self.session_controller.pull(**pull_kwargs)
def install_cluster_env_packages(cluster_env: Dict[Any, Any]):
    """Install the cluster env's pip packages into the local environment.

    The entries of ``cluster_env["env_vars"]`` (if present) are first merged
    into ``os.environ``; then every entry of
    ``cluster_env["python"]["pip_packages"]`` is installed with
    ``pip install -U``.

    Args:
        cluster_env: Cluster environment dict; must contain
            ``["python"]["pip_packages"]``.

    Raises:
        KeyError: If the ``python`` / ``pip_packages`` keys are missing.
        subprocess.CalledProcessError: If a pip invocation fails.
    """
    import shlex

    os.environ.update(cluster_env.get("env_vars", {}))
    packages = cluster_env["python"]["pip_packages"]
    logger.info(f"Installing cluster env packages locally: {packages}")

    for package in packages:
        # Requirement specifiers often contain shell metacharacters
        # (e.g. `pkg>=1.0` would be parsed as an output redirect), so each
        # one is quoted before being placed into the shell command.
        subprocess.check_output(
            f"pip install -U {shlex.quote(str(package))}",
            shell=True,
            env=os.environ,
            text=True,
        )
def fetch_results(self) -> Dict[str, Any]:
    """Download the result JSON file and return its parsed content.

    ``self.result_output_json`` is downloaded through
    ``self.file_manager.download`` into a temporary ``.json`` file, which
    is parsed and then removed.

    Returns:
        The object decoded from the downloaded JSON file.

    Raises:
        ResultsError: If creating the temp file, downloading, or parsing
            fails; the original exception is attached as the cause.
    """
    try:
        fd, tmpfile = tempfile.mkstemp(suffix=".json")
        # mkstemp returns an already-open OS-level descriptor. Only the
        # path is needed here, so close it to avoid leaking the handle.
        os.close(fd)
        try:
            logger.info(tmpfile)
            self.file_manager.download(self.result_output_json, tmpfile)

            with open(tmpfile, "rt") as f:
                return json.load(f)
        finally:
            # Remove the temp file on failure as well as on success; a
            # file that is already gone must not mask the real error.
            try:
                os.unlink(tmpfile)
            except OSError:
                pass
    except Exception as e:
        raise ResultsError(
            f"Could not fetch results from session: {e}") from e
def create_cluster_env(self, _repeat: bool = True):
    """Look up the cluster env by name and create it if it does not exist.

    Nothing is done when ``self.cluster_env`` is falsy. Otherwise
    ``self.sdk.search_cluster_environments`` is paged through for an entry
    named ``self.cluster_env_name``; if none is found a new one is created
    from ``self.cluster_env``. On success ``self.cluster_env_id`` is set.

    Args:
        _repeat: If true, a failed creation is retried once after a
            10-second sleep by calling this method again.

    Raises:
        ClusterEnvCreateError: If creation fails and no retry is left.
    """
    assert self.cluster_env_id is None

    if not self.cluster_env:
        return

    assert self.cluster_env_name

    logger.info(
        f"Test uses a cluster env with name "
        f"{self.cluster_env_name}. Looking up existing "
        f"cluster envs with this name."
    )

    next_token = None
    while not self.cluster_env_id:
        page = self.sdk.search_cluster_environments(
            {
                "project_id": self.project_id,
                "name": {"equals": self.cluster_env_name},
                "paging": {"count": 50, "token": next_token},
            }
        )
        next_token = page.metadata.next_paging_token

        for candidate in page.results:
            if candidate.name == self.cluster_env_name:
                self.cluster_env_id = candidate.id
                logger.info(
                    f"Cluster env already exists with ID "
                    f"{self.cluster_env_id}"
                )
                break

        if self.cluster_env_id or not next_token:
            break

    if self.cluster_env_id:
        return

    logger.info("Cluster env not found. Creating new one.")
    try:
        created = self.sdk.create_cluster_environment(
            {
                "name": self.cluster_env_name,
                "project_id": self.project_id,
                "config_json": self.cluster_env,
            }
        )
        self.cluster_env_id = created.result.id
    except Exception as e:
        if not _repeat:
            raise ClusterEnvCreateError("Could not create cluster env.") from e

        logger.warning(
            f"Got exception when trying to create cluster "
            f"env: {e}. Sleeping for 10 seconds and then "
            f"try again once..."
        )
        time.sleep(10)
        return self.create_cluster_env(_repeat=False)

    logger.info(f"Cluster env created with ID {self.cluster_env_id}")
def _kill_after( proc: subprocess.Popen, timeout: int = 30, kill_event: Optional[threading.Event] = None, ): timeout_at = time.monotonic() + timeout while time.monotonic() < timeout_at: if proc.poll() is not None: return time.sleep(1) logger.info(f"Client command timed out after {timeout} seconds, " f"killing subprocess.") if kill_event: kill_event.set() proc.terminate()
def exponential_backoff_retry(f, retry_exceptions, initial_retry_delay_s: float,
                              max_retries: int):
    """Call ``f`` and retry with exponentially growing delays on failure.

    Args:
        f: Zero-argument callable to invoke.
        retry_exceptions: Exception class (or tuple of classes) that
            triggers a retry; any other exception propagates immediately.
        initial_retry_delay_s: Seconds to sleep before the first retry;
            the delay doubles after every retry.
        max_retries: Maximum number of retries after the first attempt.

    Returns:
        The value returned by the first successful call of ``f``.

    Raises:
        Exception: The last caught exception, re-raised once ``max_retries``
            retries have been used up.
    """
    retry_cnt = 0
    retry_delay_s = initial_retry_delay_s
    while True:
        try:
            return f()
        except retry_exceptions as e:
            retry_cnt += 1
            if retry_cnt > max_retries:
                raise
            logger.info(f"Function call failed due to {e}; "
                        f"retrying in {retry_delay_s} seconds...")
            time.sleep(retry_delay_s)
            retry_delay_s *= 2
def report_result(self, test: Test, result: Result):
    """Write the test config and the result as JSON into the artifacts dir.

    ``self.artifacts_dir`` is created if needed. ``test`` is dumped to
    ``ARTIFACT_TEST_CONFIG_FILE`` and ``result.__dict__`` to
    ``ARTIFACT_RESULT_FILE`` inside that directory.

    Args:
        test: Test configuration to serialize.
        result: Result object whose ``__dict__`` is serialized.
    """
    # `exist_ok=True` replaces a separate existence check, which could race
    # with another process creating the directory in between.
    os.makedirs(self.artifacts_dir, 0o755, exist_ok=True)

    test_config_file = os.path.join(self.artifacts_dir,
                                    ARTIFACT_TEST_CONFIG_FILE)
    with open(test_config_file, "wt") as fp:
        json.dump(test, fp, sort_keys=True, indent=4)

    result_file = os.path.join(self.artifacts_dir, ARTIFACT_RESULT_FILE)
    with open(result_file, "wt") as fp:
        json.dump(result.__dict__, fp, sort_keys=True, indent=4)

    logger.info(
        f"Wrote test config and result to artifacts directory: {self.artifacts_dir}"
    )
def create_cluster_compute(self, _repeat: bool = True):
    """Look up the cluster compute by name and create it if it is missing.

    Nothing is done when ``self.cluster_compute`` is falsy. Otherwise
    ``self.sdk.search_cluster_computes`` is paged through for an entry named
    ``self.cluster_compute_name``; if none is found a new one is created
    from ``self.cluster_compute``. On success ``self.cluster_compute_id``
    is set.

    Args:
        _repeat: If true, a failed creation is retried once after a
            10-second sleep by calling this method again.

    Raises:
        ClusterComputeCreateError: If creation fails and no retry is left.
    """
    assert self.cluster_compute_id is None

    if self.cluster_compute:
        # The lookup and the creation below are keyed on the name, so that
        # is what must be present here. (Re-asserting `self.cluster_compute`
        # inside this branch could never fail.)
        assert self.cluster_compute_name

        logger.info(f"Tests uses compute template "
                    f"with name {self.cluster_compute_name}. "
                    f"Looking up existing cluster computes.")

        paging_token = None
        while not self.cluster_compute_id:
            result = self.sdk.search_cluster_computes(
                dict(
                    project_id=self.project_id,
                    name=dict(equals=self.cluster_compute_name),
                    include_anonymous=True,
                    paging=dict(token=paging_token),
                ))
            paging_token = result.metadata.next_paging_token

            for res in result.results:
                if res.name == self.cluster_compute_name:
                    self.cluster_compute_id = res.id
                    logger.info(f"Cluster compute already exists "
                                f"with ID {self.cluster_compute_id}")
                    break

            if not paging_token:
                break

        if not self.cluster_compute_id:
            logger.info(f"Cluster compute not found. "
                        f"Creating with name {self.cluster_compute_name}.")
            try:
                result = self.sdk.create_cluster_compute(
                    dict(
                        name=self.cluster_compute_name,
                        project_id=self.project_id,
                        config=self.cluster_compute,
                    ))
                self.cluster_compute_id = result.result.id
            except Exception as e:
                if _repeat:
                    logger.warning(
                        f"Got exception when trying to create cluster "
                        f"compute: {e}. Sleeping for 10 seconds and then "
                        f"try again once...")
                    time.sleep(10)
                    return self.create_cluster_compute(_repeat=False)

                raise ClusterComputeCreateError(
                    "Could not create cluster compute") from e

            logger.info(f"Cluster compute template created with "
                        f"name {self.cluster_compute_name} and "
                        f"ID {self.cluster_compute_id}")
def maybe_fetch_api_token():
    """Make sure ``ANYSCALE_CLI_TOKEN`` is set in the environment.

    If the variable is already set to a non-empty value nothing happens.
    Otherwise locally stored credentials are tried first; if loading them
    fails for any reason, the token is read from AWS Secrets Manager.
    """
    if os.environ.get("ANYSCALE_CLI_TOKEN"):
        return

    try:
        token, _ = AuthenticationBlock._load_credentials()
        logger.info("Loaded anyscale credentials from local storage.")
        os.environ["ANYSCALE_CLI_TOKEN"] = token
        return
    except Exception:
        # Deliberately best-effort: fall through to the secrets store.
        pass

    logger.info(
        "Missing ANYSCALE_CLI_TOKEN, retrieving from AWS secrets store")
    # NOTE(simon) This should automatically retrieve
    # [email protected]'s anyscale token
    secrets_client = boto3.client("secretsmanager", region_name="us-west-2")
    secret = secrets_client.get_secret_value(
        SecretId=RELEASE_AWS_ANYSCALE_SECRET_ARN)
    os.environ["ANYSCALE_CLI_TOKEN"] = secret["SecretString"]
def handle_result(test: Test, result: Result):
    """Run the alert handler configured for ``test`` against ``result``.

    The handler is selected from ``result_to_handle_map`` by the test's
    ``alert`` entry (``"default"`` if absent).

    Args:
        test: Test configuration.
        result: Result to check.

    Raises:
        ReleaseTestConfigError: If the alert suite is not in the map.
        ResultsAlert: If the handler returns a truthy error.
    """
    suite_name = test.get("alert", "default")

    logger.info(
        f"Checking results for test {test['name']} using alerting suite "
        f"{suite_name}")

    if suite_name in result_to_handle_map:
        check = result_to_handle_map[suite_name]
    else:
        raise ReleaseTestConfigError(f"Alert suite {suite_name} not found.")

    alert = check(test, result)
    if alert:
        raise ResultsAlert(alert)

    logger.info("No alerts have been raised - test passed successfully!")
def upload(self, source: Optional[str] = None, target: Optional[str] = None):
    """Push ``source`` to ``target`` on the cluster via the session controller.

    Args:
        source: Local source path; logged as ``<cwd>`` when falsy.
        target: Remote target path; logged as ``<cwd>`` when falsy.
    """
    source_label = source or "<cwd>"
    target_label = target or "<cwd>"
    logger.info(f"Uploading {source_label} to {target_label} "
                f"using SessionController")

    if source and target and os.path.isdir(source):
        # Add trailing slashes
        source, target = (os.path.join(path, "") for path in (source, target))

    push_kwargs = dict(
        session_name=self.cluster_manager.cluster_name,
        source=source,
        target=target,
        config=None,
        all_nodes=False,
    )
    self.session_controller.push(**push_kwargs)
def _run_job(self, cmd_to_run, env_vars) -> int:
    """Submit ``cmd_to_run`` as a job and return a local command ID.

    The environment variables are prepended to the command as ``KEY=value``
    pairs and the resulting string is submitted as the job entrypoint via
    the client returned by ``self._get_job_client()``.

    Args:
        cmd_to_run: Shell command to execute.
        env_vars: Mapping of environment variables to prepend; may be
            empty or ``None``.

    Returns:
        The new command ID (``self.counter`` after incrementing). The job
        ID and the submission wall-clock time are stored under this key in
        ``self.job_id_pool`` and ``self.start_time``.
    """
    self.counter += 1
    command_id = self.counter

    # Join the assignments and the command with single spaces; with no
    # variables the entrypoint is just the command (no leading blank).
    assignments = [f"{k}={v}" for k, v in (env_vars or {}).items()]
    full_cmd = " ".join([*assignments, cmd_to_run])
    logger.info(f"Executing {cmd_to_run} with {env_vars} via ray job submit")

    job_client = self._get_job_client()
    job_id = job_client.submit_job(
        # Entrypoint shell command to execute
        entrypoint=full_cmd,
    )

    self.last_job_id = job_id
    self.job_id_pool[command_id] = job_id
    self.start_time[command_id] = time.time()
    return command_id
def wait_for_url(
    url, timeout: float = 300.0, check_time: float = 30.0, status_time: float = 60.0
) -> str:
    """Block until ``url_exists(url)`` is true, then return ``url``.

    Args:
        url: URL to check.
        timeout: Seconds after which waiting is given up.
        check_time: Seconds to sleep between two checks.
        status_time: Seconds between two progress log lines.

    Returns:
        The unchanged ``url``.

    Raises:
        RayWheelsTimeoutError: If the URL is still unavailable when the
            timeout is checked after a failed probe.
    """
    started = time.monotonic()
    deadline = started + timeout
    next_report = started + status_time

    logger.info(f"Waiting up to {timeout} seconds until URL is available "
                f"({url})")

    while True:
        if url_exists(url):
            break

        now = time.monotonic()
        if now >= deadline:
            raise RayWheelsTimeoutError(
                f"Time out when waiting for URL to be available: {url}"
            )

        if now >= next_report:
            logger.info(
                f"... still waiting for URL {url} "
                f"({int(now - started)} seconds) ..."
            )
            next_report += status_time

        # Pause for `check_time` seconds before probing again.
        time.sleep(check_time)

    logger.info(f"URL is now available: {url}")
    return url
def install_matching_ray(ray_wheels: Optional[str]):
    """Install the Ray wheel matching ``ray_wheels`` into the local env.

    The wheel location must contain the ``manylinux2014_x86_64`` platform
    tag; that tag is replaced by the tag of the local platform before the
    existing ``ray`` package is uninstalled and the wheel is installed
    with pip.

    Args:
        ray_wheels: Wheel location. If falsy, a warning is logged and
            nothing is installed.

    Raises:
        subprocess.CalledProcessError: If a pip invocation fails.
    """
    import shlex

    if not ray_wheels:
        logger.warning(
            "No Ray wheels found - can't install matching Ray wheels locally!")
        return

    assert "manylinux2014_x86_64" in ray_wheels, ray_wheels
    if sys.platform == "darwin":
        platform = "macosx_10_15_intel"
    elif sys.platform == "win32":
        platform = "win_amd64"
    else:
        platform = "manylinux2014_x86_64"
    ray_wheels = ray_wheels.replace("manylinux2014_x86_64", platform)

    logger.info(f"Installing matching Ray wheels locally: {ray_wheels}")
    subprocess.check_output(
        "pip uninstall -y ray", shell=True, env=os.environ, text=True)
    # The wheel location is placed into a shell command line, so it is
    # quoted to keep characters such as `&`, `?` or spaces from being
    # interpreted by the shell.
    subprocess.check_output(
        f"pip install -U {shlex.quote(ray_wheels)}",
        shell=True,
        env=os.environ,
        text=True,
    )
def report_result(self, test: Test, result: Result):
    """Log a summary of ``result``: last logs, metadata, and result values.

    Args:
        test: Test configuration; only ``test["name"]`` is read.
        result: Result whose fields are written to the log.
    """
    test_name = test["name"]

    logger.info(f"Test {test_name} finished after "
                f"{result.runtime:.2f} seconds. Last logs:\n\n"
                f"{result.last_logs}\n")

    logger.info(f"Got the following metadata: \n"
                f" name: {test_name}\n"
                f" status: {result.status}\n"
                f" runtime: {result.runtime:.2f}\n"
                f" stable: {result.stable}\n"
                f"\n"
                f" buildkite_url: {format_link(result.buildkite_url)}\n"
                f" wheels_url: {format_link(result.wheels_url)}\n"
                f" cluster_url: {format_link(result.cluster_url)}\n")

    observed = result.results
    if not observed:
        logger.info("Did not find any results.")
        return

    # One " key = value" line per result entry.
    body = "".join(f" {key} = {val}\n" for key, val in observed.items())
    logger.info("Observed the following results:\n\n" + body)
def run_release_test(
        test: Test,
        anyscale_project: str,
        result: Result,
        ray_wheels_url: str,
        reporters: Optional[List[Reporter]] = None,
        smoke_test: bool = False,
        cluster_id: Optional[str] = None,
        cluster_env_id: Optional[str] = None,
        no_terminate: bool = False,
) -> Result:
    """Run one release test end to end and fill in ``result``.

    The steps are: validate the test config, pick the command runner /
    cluster manager / file manager classes, set up the local environment,
    start (or re-use) a cluster, run the optional prepare command and the
    test script, fetch results, collect the last logs, terminate the
    cluster, interpret the results and report them.

    Any exception raised in the main pipeline section is stored, the
    cleanup and reporting steps still run, and the exception is re-raised
    at the very end.

    Args:
        test: Test configuration.
        anyscale_project: Project passed to the cluster manager.
        result: Result object that is mutated in place and returned.
        ray_wheels_url: Ray wheels URL; stored on the result and passed to
            the cluster env loader and the local env preparation.
        reporters: Reporters whose ``report_result`` is called at the end.
        smoke_test: If true, ``--smoke-test`` is appended to the script and
            ``IS_SMOKE_TEST=1`` is added to its environment.
        cluster_id: If set, this existing cluster is used instead of
            building configs and starting a new cluster.
        cluster_env_id: If set, this existing cluster env is used instead
            of the one loaded from the test config.
        no_terminate: If true, the cluster is not terminated afterwards.

    Returns:
        The ``result`` object that was passed in.

    Raises:
        ReleaseTestConfigError: For an unknown command runner type or file
            manager name.
        ReleaseTestSetupError: If instantiating the managers or the command
            runner fails.
        Exception: Whatever exception was recorded during the pipeline or
            while handling the results, re-raised after reporting.
    """
    buildkite_group(":spiral_note_pad: Loading test configuration")

    validate_test(test)

    result.wheels_url = ray_wheels_url
    result.stable = test.get("stable", True)
    result.smoke_test = smoke_test

    buildkite_url = os.getenv("BUILDKITE_BUILD_URL", "")
    if buildkite_url:
        # Append the job ID as a URL fragment.
        buildkite_url += "#" + os.getenv("BUILDKITE_JOB_ID", "")
    result.buildkite_url = buildkite_url

    working_dir = test["working_dir"]

    # The process working directory is switched for the duration of the
    # run and restored further below.
    # NOTE(review): the early `raise` statements before the main `try`
    # leave the working directory changed — confirm this is acceptable.
    old_wd = os.getcwd()
    new_wd = os.path.join(RELEASE_PACKAGE_DIR, working_dir)
    os.chdir(new_wd)

    start_time = time.monotonic()

    run_type = test["run"].get("type", "sdk_command")

    command_runner_cls = type_str_to_command_runner.get(run_type)
    if not command_runner_cls:
        raise ReleaseTestConfigError(
            f"Unknown command runner type: {run_type}. Must be one of "
            f"{list(type_str_to_command_runner.keys())}")

    cluster_manager_cls = command_runner_to_cluster_manager[command_runner_cls]

    # An explicitly configured file manager wins over the command runner's
    # default one.
    file_manager_str = test["run"].get("file_manager", None)
    if file_manager_str:
        if file_manager_str not in file_manager_str_to_file_manager:
            raise ReleaseTestConfigError(
                f"Unknown file manager: {file_manager_str}. Must be one of "
                f"{list(file_manager_str_to_file_manager.keys())}")
        file_manager_cls = file_manager_str_to_file_manager[file_manager_str]
    else:
        file_manager_cls = command_runner_to_file_manager[command_runner_cls]

    # Instantiate managers and command runner
    try:
        cluster_manager = cluster_manager_cls(test["name"],
                                              anyscale_project,
                                              smoke_test=smoke_test)
        file_manager = file_manager_cls(cluster_manager=cluster_manager)
        command_runner = command_runner_cls(cluster_manager, file_manager,
                                            working_dir)
    except Exception as e:
        raise ReleaseTestSetupError(
            f"Error setting up release test: {e}") from e

    # Holds the first exception of the pipeline (if any) so that cleanup
    # and reporting can still run before it is re-raised.
    pipeline_exception = None
    try:
        # Load configs
        cluster_env = load_test_cluster_env(test,
                                            ray_wheels_url=ray_wheels_url)
        cluster_compute = load_test_cluster_compute(test)

        if cluster_env_id:
            # Use the overridden cluster environment instead of the one
            # loaded from the test config.
            try:
                cluster_manager.cluster_env_id = cluster_env_id
                cluster_manager.build_cluster_env()
                cluster_manager.fetch_build_info()
                logger.info("Using overridden cluster environment with ID "
                            f"{cluster_env_id} and build ID "
                            f"{cluster_manager.cluster_env_build_id}")
            except Exception as e:
                raise ClusterEnvCreateError(
                    f"Could not get existing overridden cluster environment "
                    f"{cluster_env_id}: {e}") from e
        else:
            cluster_manager.set_cluster_env(cluster_env)

        cluster_manager.set_cluster_compute(cluster_compute)

        buildkite_group(":nut_and_bolt: Setting up local environment")
        driver_setup_script = test.get("driver_setup", None)
        if driver_setup_script:
            try:
                run_bash_script(driver_setup_script)
            except Exception as e:
                raise LocalEnvSetupError(
                    f"Driver setup script failed: {e}") from e

        # Install local dependencies
        command_runner.prepare_local_env(ray_wheels_url)
        command_timeout = test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)

        # Re-install anyscale package as local dependencies might have changed
        # from local env setup
        reinstall_anyscale_dependencies()

        # Print installed pip packages
        buildkite_group(":bulb: Local environment information")
        pip_packages = get_pip_packages()
        pip_package_string = "\n".join(pip_packages)
        logger.info(f"Installed python packages:\n{pip_package_string}")

        # Start cluster
        if cluster_id:
            buildkite_group(":rocket: Using existing cluster")
            # Re-use existing cluster ID for development
            cluster_manager.cluster_id = cluster_id
            cluster_manager.cluster_name = get_cluster_name(cluster_id)
        else:
            buildkite_group(":gear: Building cluster environment")
            build_timeout = test["run"].get("build_timeout",
                                            DEFAULT_BUILD_TIMEOUT)

            if cluster_env_id:
                cluster_manager.cluster_env_id = cluster_env_id

            cluster_manager.build_configs(timeout=build_timeout)

            cluster_timeout = test["run"].get("session_timeout",
                                              DEFAULT_CLUSTER_TIMEOUT)

            # Without an explicit setting the autosuspend time is the
            # command timeout in minutes plus 10, capped at
            # DEFAULT_AUTOSUSPEND_MINS.
            autosuspend_mins = test["cluster"].get("autosuspend_mins", None)
            if autosuspend_mins:
                cluster_manager.autosuspend_minutes = autosuspend_mins
            else:
                cluster_manager.autosuspend_minutes = min(
                    DEFAULT_AUTOSUSPEND_MINS, int(command_timeout / 60) + 10)

            buildkite_group(":rocket: Starting up cluster")
            cluster_manager.start_cluster(timeout=cluster_timeout)

        result.cluster_url = cluster_manager.get_cluster_url()

        # Upload files
        buildkite_group(":wrench: Preparing remote environment")
        command_runner.prepare_remote_env()

        wait_for_nodes = test["run"].get("wait_for_nodes", None)
        if wait_for_nodes:
            buildkite_group(":stopwatch: Waiting for nodes to come up")
            num_nodes = test["run"]["wait_for_nodes"]["num_nodes"]
            wait_timeout = test["run"]["wait_for_nodes"].get(
                "timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT)
            command_runner.wait_for_nodes(num_nodes, wait_timeout)

        prepare_cmd = test["run"].get("prepare", None)
        if prepare_cmd:
            # The prepare command falls back to the main command timeout.
            prepare_timeout = test["run"].get("prepare_timeout",
                                              command_timeout)
            try:
                command_runner.run_prepare_command(prepare_cmd,
                                                   timeout=prepare_timeout)
            except CommandError as e:
                raise PrepareCommandError(e)
            except CommandTimeout as e:
                raise PrepareCommandTimeout(e)

        buildkite_group(":runner: Running test script")
        command = test["run"]["script"]
        command_env = {}

        if smoke_test:
            command = f"{command} --smoke-test"
            command_env["IS_SMOKE_TEST"] = "1"

        is_long_running = test["run"].get("long_running", False)

        try:
            command_runner.run_command(command,
                                       env=command_env,
                                       timeout=command_timeout)
        except CommandError as e:
            raise TestCommandError(e)
        except CommandTimeout as e:
            if not is_long_running:
                # Only raise error if command is not long running
                raise TestCommandTimeout(e)

        buildkite_group(":floppy_disk: Fetching results")
        try:
            command_results = command_runner.fetch_results()
        except Exception as e:
            # Missing results are not fatal here; continue with an empty
            # result dict.
            logger.error("Could not fetch results for test command")
            logger.exception(e)
            command_results = {}

        # Postprocess result:
        if "last_update" in command_results:
            # Seconds between the reported last update and now.
            command_results["last_update_diff"] = time.time(
            ) - command_results.get("last_update", 0.0)
        if smoke_test:
            command_results["smoke_test"] = True

        result.results = command_results
        result.status = "finished"

    except Exception as e:
        logger.exception(e)
        buildkite_open_last()
        pipeline_exception = e

    # Logs are collected whether or not the pipeline succeeded.
    try:
        last_logs = command_runner.get_last_logs()
    except Exception as e:
        logger.error(f"Error fetching logs: {e}")
        last_logs = "No logs could be retrieved."

    result.last_logs = last_logs

    if not no_terminate:
        buildkite_group(":earth_africa: Terminating cluster")
        try:
            cluster_manager.terminate_cluster(wait=False)
        except Exception as e:
            logger.error(f"Could not terminate cluster: {e}")

    time_taken = time.monotonic() - start_time
    result.runtime = time_taken

    os.chdir(old_wd)

    if not pipeline_exception:
        buildkite_group(":mag: Interpreting results")
        # Only handle results if we didn't run into issues earlier
        try:
            handle_result(test, result)
        except Exception as e:
            pipeline_exception = e

    if pipeline_exception:
        buildkite_group(":rotating_light: Handling errors")
        exit_code, error_type, runtime = handle_exception(pipeline_exception)

        result.return_code = exit_code.value
        result.status = error_type
        # A runtime supplied by the exception handler replaces the
        # measured one.
        if runtime is not None:
            result.runtime = runtime

    buildkite_group(":memo: Reporting results", open=True)
    reporters = reporters or []
    for reporter in reporters:
        # A failing reporter must not keep the remaining ones from running.
        try:
            reporter.report_result(test, result)
        except Exception as e:
            logger.error(f"Error reporting results via {type(reporter)}: {e}")

    if pipeline_exception:
        raise pipeline_exception

    return result
def run_command(
    self, command: str, env: Optional[Dict] = None, timeout: float = 3600.0
) -> float:
    """Run ``command`` as a session command and wait for it to finish.

    The command is created through ``self.sdk.create_session_command`` and
    then polled once per second with ``self.sdk.get_session_command``
    (retried with exponential backoff) until ``finished_at`` is set.

    Args:
        command: Shell command to run.
        env: Extra environment variables; the full set from
            ``self.get_full_command_env`` is prepended as ``KEY=value``.
        timeout: Maximum number of seconds to wait.

    Returns:
        Seconds elapsed between creating the command and its completion.

    Raises:
        CommandTimeout: If the command does not finish in time.
        CommandError: If the final status code is not 0.
    """
    full_env = self.get_full_command_env(env)

    # Every KEY=value pair is followed by a single space.
    env_prefix = ""
    if full_env:
        env_prefix = "".join(
            f"{key}={value} " for key, value in full_env.items()
        )
    full_command = f"{env_prefix}{command}"

    logger.info(
        f"Running command in cluster {self.cluster_manager.cluster_name}: "
        f"{full_command}"
    )
    logger.info(
        f"Link to cluster: "
        f"{format_link(self.cluster_manager.get_cluster_url())}"
    )

    response = self.sdk.create_session_command(
        {
            "session_id": self.cluster_manager.cluster_id,
            "shell_command": full_command,
        }
    )
    scd_id = response.result.id
    self.last_command_scd_id = scd_id

    def _fetch_command():
        return self.sdk.get_session_command(session_command_id=scd_id)

    completed = response.result.finished_at is not None

    started = time.monotonic()
    deadline = started + timeout
    next_report = started + 30

    while not completed:
        now = time.monotonic()
        if now >= deadline:
            raise CommandTimeout(
                f"Cluster command timed out after {timeout} seconds."
            )

        if now >= next_report:
            logger.info(
                f"... command still running ..."
                f"({int(now - started)} seconds) ..."
            )
            next_report += 30

        # Wait one second between two status requests.
        time.sleep(1)

        response = exponential_backoff_retry(
            _fetch_command,
            retry_exceptions=Exception,
            initial_retry_delay_s=10,
            max_retries=3,
        )
        completed = response.result.finished_at

    status_code = response.result.status_code
    time_taken = time.monotonic() - started

    if status_code == 0:
        return time_taken
    raise CommandError(f"Command returned non-success status: {status_code}")
def run_command(
    self, command: str, env: Optional[Dict] = None, timeout: float = 3600.0
) -> float:
    """Run ``command`` in a local subprocess pointed at the cluster.

    The subprocess output is echoed to ``sys.stdout`` line by line, and the
    last ``LAST_LOGS_LENGTH`` lines are kept in ``self.last_logs``. A
    watchdog thread terminates the subprocess after ``timeout`` seconds.

    Args:
        command: Shell command to run.
        env: Extra environment variables layered over ``os.environ``.
        timeout: Seconds after which the subprocess is terminated.

    Returns:
        Seconds elapsed between starting the subprocess and its exit.

    Raises:
        CommandTimeout: If the subprocess was terminated by the watchdog
            or exited with return code 15 / -15.
        CommandError: If the subprocess exited with another non-zero code.
    """
    logger.info(
        f"Running command using Ray client on cluster "
        f"{self.cluster_manager.cluster_name}: {command}"
    )

    env = env or {}
    # Later entries override earlier ones: os.environ < env < fixed values.
    merged_env = dict(os.environ)
    merged_env.update(env)
    merged_env.update(
        {
            "RAY_ADDRESS": self.cluster_manager.get_cluster_address(),
            "RAY_JOB_NAME": "test_job",
            "PYTHONUNBUFFERED": "1",
        }
    )
    full_env = self.get_full_command_env(merged_env)

    kill_event = threading.Event()

    def _kill_after(
        proc: subprocess.Popen,
        timeout: int = 30,
        kill_event: Optional[threading.Event] = None,
    ):
        # Watchdog: poll once per second, terminate after the deadline.
        deadline = time.monotonic() + timeout
        while True:
            if time.monotonic() >= deadline:
                break
            if proc.poll() is not None:
                return
            time.sleep(1)
        logger.info(
            f"Client command timed out after {timeout} seconds, "
            f"killing subprocess."
        )
        if kill_event:
            kill_event.set()
        proc.terminate()

    started = time.monotonic()
    proc = subprocess.Popen(
        command,
        env=full_env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        text=True,
    )

    watchdog = threading.Thread(
        target=_kill_after, args=(proc, timeout, kill_event)
    )
    watchdog.start()

    # Line-buffer both ends while output is forwarded.
    proc.stdout.reconfigure(line_buffering=True)
    sys.stdout.reconfigure(line_buffering=True)
    tail = deque(maxlen=LAST_LOGS_LENGTH)
    for output_line in proc.stdout:
        tail.append(output_line)
        sys.stdout.write(output_line)
    proc.wait()
    sys.stdout.reconfigure(line_buffering=False)

    time_taken = time.monotonic() - started
    self.last_logs = "\n".join(tail)

    return_code = proc.poll()
    if return_code in (-15, 15) or kill_event.is_set():
        # Process has been terminated
        raise CommandTimeout(f"Cluster command timed out after {timeout} seconds.")

    if return_code != 0:
        raise CommandError(f"Command returned non-success status: {return_code}")

    logger.warning(f"WE GOT RETURN CODE {return_code} AFTER {time_taken}")

    return time_taken
def report_result(self, test: Test, result: Result):
    """Insert one result row into ``self.database_table`` via the RDS Data API.

    The statement is executed through ``exponential_backoff_retry``, which
    retries on the client's ``StatementTimeoutException``.

    Args:
        test: Test configuration. ``team`` is required; ``legacy`` (with
            ``test_name`` and ``test_suite``) and ``frequency`` are optional.
        result: Result whose fields populate the row.
    """
    logger.info("Persisting results to database...")

    result_dict = {
        "_runtime": result.runtime,
        # Keep session url for legacy support
        "_session_url": result.cluster_url,
        "_cluster_url": result.cluster_url,
        "_commit_url": result.wheels_url,
        "_stable": result.stable,
    }

    now = datetime.datetime.utcnow()
    rds_data_client = boto3.client("rds-data", region_name="us-west-2")

    # A "legacy" section overrides the name and supplies the suite.
    if "legacy" in test:
        legacy = test["legacy"]
        test_name, test_suite = legacy["test_name"], legacy["test_suite"]
    else:
        test_name, test_suite = test["name"], ""

    team = test["team"] or ""

    # Branch name
    category = get_test_env_var("RAY_BRANCH", "")

    status = result.status or "invalid"
    last_logs = result.last_logs or ""

    if result.results:
        result_dict.update(result.results)
    artifacts = {}

    def _text(name, value):
        return {"name": name, "value": {"stringValue": value}}

    def _hinted(name, hint, value):
        return {"name": name, "typeHint": hint, "value": {"stringValue": value}}

    parameters = [
        _hinted("created_on", "TIMESTAMP", now.strftime("%Y-%m-%d %H:%M:%S")),
        _text("test_suite", test_suite),
        _text("test_name", test_name),
        _text("status", status),
        _text("last_logs", last_logs),
        _hinted("results", "JSON", json.dumps(result_dict)),
        _hinted("artifacts", "JSON", json.dumps(artifacts)),
        _text("category", category),
        _text("team", team),
        _text("session_url", result.cluster_url or ""),
        _text("commit_url", result.wheels_url or ""),
        {"name": "runtime", "value": {"doubleValue": result.runtime or -1.0}},
        {"name": "stable", "value": {"booleanValue": result.stable}},
        _text("frequency", test.get("frequency", "")),
        {"name": "return_code", "value": {"longValue": result.return_code}},
    ]

    # Column list and the matching ":name" placeholders, in the same order.
    names = [param["name"] for param in parameters]
    column_str = ", ".join(names)
    value_str = ", ".join(f":{name}" for name in names)

    sql = (
        f"INSERT INTO {self.database_table} "
        f"({column_str}) "
        f"VALUES ({value_str})"
    )

    logger.debug(f"SQL query: {sql}")

    def _execute():
        return rds_data_client.execute_statement(
            database=self.database,
            parameters=parameters,
            secretArn=RELEASE_AWS_DB_SECRET_ARN,
            resourceArn=RELEASE_AWS_DB_RESOURCE_ARN,
            schema=self.database_table,
            sql=sql,
        )

    # Default boto3 call timeout is 45 seconds.
    retry_delay_s = 64
    MAX_RDS_RETRY = 3
    exponential_backoff_retry(
        _execute,
        retry_exceptions=rds_data_client.exceptions.StatementTimeoutException,
        initial_retry_delay_s=retry_delay_s,
        max_retries=MAX_RDS_RETRY,
    )
    logger.info("Result has been persisted to the database")
def build_cluster_env(self, timeout: float = 600.0):
    """Wait for a successful build of the cluster env and record its ID.

    The most recently created build of ``self.cluster_env_id`` is looked
    up. If it succeeded it is used directly. If it failed, a new build is
    started with the same ``config_json``. In every other case (or after
    starting the new build) the build is polled once per second until it
    succeeds.

    Args:
        timeout: Maximum number of seconds to wait for the build.

    Raises:
        ClusterEnvBuildError: If no build exists, the build being waited
            for fails, or it reports an unknown status.
        ClusterEnvBuildTimeout: If the build does not finish in time.
    """
    assert self.cluster_env_id
    assert self.cluster_env_build_id is None

    # Fetch build
    result = self.sdk.list_cluster_environment_builds(self.cluster_env_id)
    if not result or not result.results:
        raise ClusterEnvBuildError(f"No build found for cluster env: {result}")

    # Latest build by creation time.
    build = sorted(result.results, key=lambda b: b.created_at)[-1]
    build_id = build.id

    if build.status == "succeeded":
        logger.info(
            f"Link to succeeded cluster env build: "
            f"{format_link(anyscale_cluster_env_build_url(build_id))}"
        )
        self.cluster_env_build_id = build_id
        return

    if build.status == "failed":
        logger.info(f"Previous cluster env build failed: {build.error_message}")
        logger.info("Starting new cluster env build...")

        # Retry build
        result = self.sdk.create_cluster_environment_build(
            dict(
                cluster_environment_id=self.cluster_env_id,
                config_json=build.config_json,
            )
        )
        build_id = result.result.id

        logger.info(
            f"Link to created cluster env build: "
            f"{format_link(anyscale_cluster_env_build_url(build_id))}"
        )

    # Build found but not failed/finished yet.
    # One monotonic clock drives both the progress reports and the timeout,
    # so wall-clock adjustments cannot distort either of them.
    start_wait = time.monotonic()
    next_report = start_wait + REPORT_S
    timeout_at = start_wait + timeout
    logger.info(f"Waiting for build {build_id} to finish...")
    logger.info(
        f"Track progress here: "
        f"{format_link(anyscale_cluster_env_build_url(build_id))}"
    )
    # The loop is left only through `return` (success) or `raise`.
    while True:
        now = time.monotonic()
        if now > next_report:
            logger.info(
                f"... still waiting for build {build_id} to finish "
                f"({int(now - start_wait)} seconds) ..."
            )
            next_report = next_report + REPORT_S

        build = self.sdk.get_build(build_id).result

        if build.status == "failed":
            raise ClusterEnvBuildError(
                f"Cluster env build failed. Please see "
                f"{anyscale_cluster_env_build_url(build_id)} for details. "
                f"Error message: {build.error_message}"
            )

        if build.status == "succeeded":
            logger.info("Build succeeded.")
            self.cluster_env_build_id = build_id
            return

        if build.status not in ["in_progress", "pending"]:
            raise ClusterEnvBuildError(
                f"Unknown build status: {build.status}. Please see "
                f"{anyscale_cluster_env_build_url(build_id)} for details"
            )

        if time.monotonic() > timeout_at:
            raise ClusterEnvBuildTimeout(
                f"Time out when building cluster env {self.cluster_env_name}"
            )

        time.sleep(1)
def main(test_collection_file: Optional[str] = None):
    """Generate the Buildkite pipeline steps for the selected release tests.

    Reads the pipeline settings, loads and filters the release test
    collection, waits for the Ray wheels, builds one grouped step list and
    prints it as JSON to stdout. When ``BUILDKITE`` is set in the
    environment, the steps and the settings are additionally written to
    ``PIPELINE_ARTIFACT_PATH`` as ``pipeline.json`` and ``settings.json``.

    Args:
        test_collection_file: Path to the release test collection. Only
            used when no Ray test repo is configured in the settings; if
            it is empty as well, ``release_tests.yaml`` two directories
            above this file is used.

    Raises:
        ReleaseTestCLIError: If the configured test repository cannot be
            cloned or if the filters leave no tests to run.
    """
    settings = get_pipeline_settings()

    repo = settings["ray_test_repo"]
    branch = settings["ray_test_branch"]
    env = {}

    if repo:
        # If the Ray test repo is set, we clone that repo to fetch
        # the test configuration file. Otherwise we might be missing newly
        # added test.
        # mkdtemp creates the directory atomically (mktemp only returns a
        # name and is race-prone); git accepts an existing empty directory
        # as clone target.
        tmpdir = tempfile.mkdtemp()
        try:
            try:
                # Argument list instead of a shell string, so repo/branch
                # values are never interpreted by a shell.
                subprocess.check_output(
                    [
                        "git",
                        "clone",
                        "--depth",
                        "1",
                        "--branch",
                        branch,
                        repo,
                        tmpdir,
                    ]
                )
            except Exception as e:
                raise ReleaseTestCLIError(
                    f"Could not clone test repository "
                    f"{repo} (branch {branch}): {e}"
                ) from e
            test_collection = read_and_validate_release_test_collection(
                os.path.join(tmpdir, "release", "release_tests.yaml")
            )
        finally:
            # Remove the checkout even if cloning or validation failed.
            shutil.rmtree(tmpdir, ignore_errors=True)
        env = {
            "RAY_TEST_REPO": repo,
            "RAY_TEST_BRANCH": branch,
        }
    else:
        test_collection_file = test_collection_file or os.path.join(
            os.path.dirname(__file__), "..", "..", "release_tests.yaml"
        )
        test_collection = read_and_validate_release_test_collection(
            test_collection_file
        )

    frequency = settings["frequency"]
    prefer_smoke_tests = settings["prefer_smoke_tests"]
    test_attr_regex_filters = settings["test_attr_regex_filters"]
    ray_wheels = settings["ray_wheels"]
    priority = settings["priority"]

    logger.info(
        f"Found the following buildkite pipeline settings:\n\n"
        f" frequency = {settings['frequency']}\n"
        f" prefer_smoke_tests = {settings['prefer_smoke_tests']}\n"
        f" test_attr_regex_filters = {settings['test_attr_regex_filters']}\n"
        f" ray_wheels = {settings['ray_wheels']}\n"
        f" ray_test_repo = {settings['ray_test_repo']}\n"
        f" ray_test_branch = {settings['ray_test_branch']}\n"
        f" priority = {settings['priority']}\n"
        f" no_concurrency_limit = {settings['no_concurrency_limit']}\n"
    )

    filtered_tests = filter_tests(
        test_collection,
        frequency=frequency,
        test_attr_regex_filters=test_attr_regex_filters,
        prefer_smoke_tests=prefer_smoke_tests,
    )
    logger.info(f"Found {len(filtered_tests)} tests to run.")
    if not filtered_tests:
        raise ReleaseTestCLIError(
            "Empty test collection. The selected frequency or filter did "
            "not return any tests to run. Adjust your filters."
        )
    grouped_tests = group_tests(filtered_tests)

    # Human-readable overview of the groups and their tests for the log.
    group_lines = []
    for group, tests in grouped_tests.items():
        group_lines.append(f"\n{group}:\n")
        for test, smoke in tests:
            smoke_suffix = " [smoke test]" if smoke else ""
            group_lines.append(f" {test['name']}{smoke_suffix}\n")
    group_str = "".join(group_lines)

    logger.info(f"Tests to run:\n{group_str}")

    # Wait for wheels here so we have them ready before we kick off
    # the other workers
    ray_wheels_url = find_and_wait_for_ray_wheels_url(
        ray_wheels, timeout=DEFAULT_WHEEL_WAIT_TIMEOUT
    )
    logger.info(f"Starting pipeline for Ray wheel: {ray_wheels_url}")

    no_concurrency_limit = settings["no_concurrency_limit"]
    if no_concurrency_limit:
        logger.warning("Concurrency is not limited for this run!")

    # Report if REPORT=1 or BUILDKITE_SOURCE=schedule
    report = (
        bool(int(os.environ.get("REPORT", "0")))
        or os.environ.get("BUILDKITE_SOURCE", "manual") == "schedule"
    )

    steps = []
    for group in sorted(grouped_tests):
        group_steps = []
        for test, smoke_test in grouped_tests[group]:
            # If the python version is defined, we need a different Ray wheels URL
            if "python" in test:
                python_version = parse_python_version(test["python"])
                this_ray_wheels_url = find_ray_wheels_url(
                    ray_wheels, python_version=python_version
                )
            else:
                this_ray_wheels_url = ray_wheels_url

            step = get_step(
                test,
                report=report,
                smoke_test=smoke_test,
                ray_wheels=this_ray_wheels_url,
                env=env,
                priority_val=priority.value,
            )

            if no_concurrency_limit:
                step.pop("concurrency", None)
                step.pop("concurrency_group", None)

            group_steps.append(step)

        steps.append({"group": group, "steps": group_steps})

    if "BUILDKITE" in os.environ:
        # Start from a clean artifact directory.
        if os.path.exists(PIPELINE_ARTIFACT_PATH):
            shutil.rmtree(PIPELINE_ARTIFACT_PATH)

        os.makedirs(PIPELINE_ARTIFACT_PATH, exist_ok=True, mode=0o755)

        with open(os.path.join(PIPELINE_ARTIFACT_PATH, "pipeline.json"), "wt") as fp:
            json.dump(steps, fp)

        # The "frequency" and "priority" entries are dumped via their
        # ``.value``; a copy is used so the settings dict itself is left
        # untouched.
        serializable_settings = dict(settings)
        serializable_settings["frequency"] = settings["frequency"].value
        serializable_settings["priority"] = settings["priority"].value
        with open(os.path.join(PIPELINE_ARTIFACT_PATH, "settings.json"), "wt") as fp:
            json.dump(serializable_settings, fp)

    steps_str = json.dumps(steps)
    print(steps_str)