def install_matching_ray_locally(ray_wheels: Optional[str]):
    """Install the Ray wheel matching ``ray_wheels`` into the local environment.

    The wheel reference must contain the ``manylinux2014_x86_64`` platform
    tag. That tag is replaced by the one matching the local ``sys.platform``,
    the installed ``ray`` package is uninstalled, the rewritten wheel is
    installed with pip, and every module named in ``RELOAD_MODULES`` that is
    already imported is reloaded.

    Args:
        ray_wheels: Reference to the Linux Ray wheel. When falsy, a warning is
            logged and nothing is installed.

    Raises:
        AssertionError: If ``ray_wheels`` lacks the Linux platform tag.
        subprocess.CalledProcessError: If a pip command exits non-zero.
    """
    if not ray_wheels:
        logger.warning(
            "No Ray wheels found - can't install matching Ray wheels locally!")
        return

    linux_tag = "manylinux2014_x86_64"
    assert linux_tag in ray_wheels, ray_wheels

    # Anything that is neither macOS nor Windows keeps the Linux tag.
    local_tag = {
        "darwin": "macosx_10_15_intel",
        "win32": "win_amd64",
    }.get(sys.platform, linux_tag)
    ray_wheels = ray_wheels.replace(linux_tag, local_tag)

    logger.info(f"Installing matching Ray wheels locally: {ray_wheels}")

    pip_commands = (
        "pip uninstall -y ray",
        f"pip install -U {shlex.quote(ray_wheels)}",
    )
    for pip_command in pip_commands:
        subprocess.check_output(
            pip_command,
            shell=True,
            env=os.environ,
            text=True,
        )

    # Refresh modules that were imported before the reinstall.
    already_loaded = [name for name in RELOAD_MODULES if name in sys.modules]
    for name in already_loaded:
        importlib.reload(sys.modules[name])
def _push_local_dir(self):
    """Pack the current working directory and unpack it on the remote session.

    The working directory is archived as a gzipped tarball, uploaded to a
    temporary S3 key, then downloaded and extracted remotely through
    ``self.job_manager``. The temporary S3 object is deleted afterwards on a
    best-effort basis (a failure there is only logged).

    Raises:
        FileUploadError: If the remote download/extract command exits with a
            non-zero return code.
    """
    remote_upload_to = self._generate_tmp_s3_path()

    # pack local dir
    # mkstemp() hands back an *open* file descriptor. Only the unique path is
    # needed (as archive base name), so close the descriptor immediately
    # instead of leaking it.
    fd, local_path = tempfile.mkstemp()
    os.close(fd)
    # make_archive() appends the format suffix to the base name.
    archive_path = local_path + ".tar.gz"

    try:
        shutil.make_archive(local_path, "gztar", os.getcwd())

        # local source -> s3
        self._run_with_retry(lambda: self.s3_client.upload_file(
            Filename=archive_path,
            Bucket=self.bucket,
            Key=remote_upload_to,
        ))
    finally:
        # Remove both the placeholder file and the archive itself, even when
        # archiving or uploading failed.
        for stale_path in (local_path, archive_path):
            try:
                os.unlink(stale_path)
            except FileNotFoundError:
                pass

    bucket_address = f"s3://{self.bucket}/{remote_upload_to}"

    # s3 -> remote target
    retcode, _ = self.job_manager.run_and_wait(
        f"pip install -q awscli && "
        f"aws s3 cp {bucket_address} archive.tar.gz && "
        f"tar xf archive.tar.gz ",
        {},
    )
    if retcode != 0:
        raise FileUploadError(f"Error uploading local dir to session "
                              f"{self.cluster_manager.cluster_name}.")

    try:
        self._run_with_retry(
            lambda: self.s3_client.delete_object(Bucket=self.bucket,
                                                 Key=remote_upload_to),
            initial_retry_delay_s=2,
        )
    except Exception as e:
        # Best effort only: a leftover temporary object must not fail the upload.
        logger.warning(f"Could not remove temporary S3 object: {e}")
def maybe_rewrite_wheels_url(ray_wheels_url: str,
                             python_version: Tuple[int, int]) -> str:
    """Return a wheels URL adjusted to ``python_version`` where possible.

    The URL is first resolved with ``resolve_url``. If it already matches the
    requested python version it is returned as is. Otherwise the Ray and
    python versions are parsed from the URL and the wheel filename inside the
    URL is swapped for the one built from ``python_version``.

    Args:
        ray_wheels_url: Wheels URL as passed by the caller.
        python_version: Python version the wheel should be built for.

    Returns:
        The resolved URL, rewritten when a differing filename was found in it.
    """
    full_url = resolve_url(ray_wheels_url)

    # Nothing to do when the resolved URL already fits the python version.
    if is_wheels_url_matching_ray_verison(
            ray_wheels_url=full_url, python_version=python_version):
        return full_url

    # Otherwise try to recover the versions from the filename / URL.
    parsed_ray_version, parsed_python_version = parse_wheels_filename(full_url)

    # NOTE(review): this guard tests the requested ``python_version`` rather
    # than ``parsed_python_version`` - confirm that this is intended.
    rewrite_possible = parsed_ray_version and python_version
    if not rewrite_possible:
        # Without parsed versions we can only warn and hand back the URL.
        logger.warning(
            f"The passed Ray wheels URL may not work with the python version "
            f"used in this test! Got python version {python_version} and "
            f"wheels URL: {ray_wheels_url}.")
        return full_url

    # Swap the filename for the parsed python version with the one for the
    # requested python version.
    old_filename = get_wheels_filename(parsed_ray_version, parsed_python_version)
    wanted_filename = get_wheels_filename(parsed_ray_version, python_version)
    new_url = full_url.replace(old_filename, wanted_filename)

    url_changed = new_url != full_url
    if url_changed:
        logger.warning(
            f"The passed Ray wheels URL were for a different python version than "
            f"used in this test! Found python version {parsed_python_version} "
            f"but expected {python_version}. The wheels URL was re-written to "
            f"{new_url}.")
    return new_url
def create_cluster_env(self, _repeat: bool = True):
    """Look up the cluster env by name, creating it when none is found.

    Does nothing when ``self.cluster_env`` is falsy. Otherwise the SDK is
    searched page by page for an environment named ``self.cluster_env_name``;
    the ID of the first exact match is stored in ``self.cluster_env_id``. If no
    match exists, a new environment is created from ``self.cluster_env``.

    Args:
        _repeat: Whether a failed creation is retried once (after sleeping
            10 seconds) before giving up.

    Raises:
        AssertionError: If ``self.cluster_env_id`` is already set, or the
            cluster env has no name.
        ClusterEnvCreateError: If creation fails and no retry is left.
    """
    assert self.cluster_env_id is None

    # Without a cluster env config there is nothing to look up or create.
    if not self.cluster_env:
        return

    assert self.cluster_env_name

    logger.info(
        f"Test uses a cluster env with name "
        f"{self.cluster_env_name}. Looking up existing "
        f"cluster envs with this name."
    )

    next_token = None
    while not self.cluster_env_id:
        page = self.sdk.search_cluster_environments(
            {
                "project_id": self.project_id,
                "name": {"equals": self.cluster_env_name},
                "paging": {"count": 50, "token": next_token},
            }
        )
        next_token = page.metadata.next_paging_token

        # Only the first exact name match on this page is of interest.
        match = next(
            (env for env in page.results if env.name == self.cluster_env_name),
            None,
        )
        if match is not None:
            self.cluster_env_id = match.id
            logger.info(
                f"Cluster env already exists with ID "
                f"{self.cluster_env_id}"
            )

        if self.cluster_env_id or not next_token:
            break

    if self.cluster_env_id:
        return

    logger.info("Cluster env not found. Creating new one.")
    try:
        created = self.sdk.create_cluster_environment(
            {
                "name": self.cluster_env_name,
                "project_id": self.project_id,
                "config_json": self.cluster_env,
            }
        )
        self.cluster_env_id = created.result.id
    except Exception as exc:
        if not _repeat:
            raise ClusterEnvCreateError("Could not create cluster env.") from exc

        logger.warning(
            f"Got exception when trying to create cluster "
            f"env: {exc}. Sleeping for 10 seconds and then "
            f"try again once..."
        )
        time.sleep(10)
        # Second and last attempt: runs the lookup again before creating.
        return self.create_cluster_env(_repeat=False)

    logger.info(f"Cluster env created with ID {self.cluster_env_id}")
def create_cluster_compute(self, _repeat: bool = True):
    """Look up the cluster compute by name, creating it when none is found.

    Does nothing when ``self.cluster_compute`` is falsy. Otherwise the SDK is
    searched page by page (including anonymous computes) for a compute config
    named ``self.cluster_compute_name``; the ID of the first exact match is
    stored in ``self.cluster_compute_id``. If no match exists, a new compute
    config is created from ``self.cluster_compute``.

    Args:
        _repeat: Whether a failed creation is retried once (after sleeping
            10 seconds) before giving up.

    Raises:
        AssertionError: If ``self.cluster_compute_id`` is already set, or the
            cluster compute has no name.
        ClusterComputeCreateError: If creation fails and no retry is left.
    """
    assert self.cluster_compute_id is None

    if self.cluster_compute:
        # The lookup and the creation below both key on the name, so that is
        # what must be present here (asserting ``self.cluster_compute`` again
        # inside this branch could never fail).
        assert self.cluster_compute_name

        logger.info(f"Tests uses compute template "
                    f"with name {self.cluster_compute_name}. "
                    f"Looking up existing cluster computes.")

        paging_token = None
        while not self.cluster_compute_id:
            result = self.sdk.search_cluster_computes(
                dict(
                    project_id=self.project_id,
                    name=dict(equals=self.cluster_compute_name),
                    include_anonymous=True,
                    paging=dict(token=paging_token),
                ))
            paging_token = result.metadata.next_paging_token

            for res in result.results:
                if res.name == self.cluster_compute_name:
                    self.cluster_compute_id = res.id
                    logger.info(f"Cluster compute already exists "
                                f"with ID {self.cluster_compute_id}")
                    break

            # No further page to inspect. A found ID ends the loop through
            # the ``while`` condition.
            if not paging_token:
                break

        if not self.cluster_compute_id:
            logger.info(f"Cluster compute not found. "
                        f"Creating with name {self.cluster_compute_name}.")
            try:
                result = self.sdk.create_cluster_compute(
                    dict(
                        name=self.cluster_compute_name,
                        project_id=self.project_id,
                        config=self.cluster_compute,
                    ))
                self.cluster_compute_id = result.result.id
            except Exception as e:
                if _repeat:
                    logger.warning(
                        f"Got exception when trying to create cluster "
                        f"compute: {e}. Sleeping for 10 seconds and then "
                        f"try again once...")
                    time.sleep(10)
                    # Second and last attempt: runs the lookup again first.
                    return self.create_cluster_compute(_repeat=False)

                raise ClusterComputeCreateError(
                    "Could not create cluster compute") from e

            logger.info(f"Cluster compute template created with "
                        f"name {self.cluster_compute_name} and "
                        f"ID {self.cluster_compute_id}")
def get_buildkite_prompt_value(key: str) -> Optional[str]:
    """Read a Buildkite meta-data value through the ``buildkite-agent`` CLI.

    Args:
        key: Meta-data key to fetch.

    Returns:
        The command's text output, or ``None`` if running the command failed
        for any reason (the failure is logged as a warning).
    """
    agent_command = ["buildkite-agent", "meta-data", "get", key]

    try:
        value = subprocess.check_output(agent_command, text=True)
    except Exception as exc:
        # Best effort: a missing agent or key is not fatal for the caller.
        logger.warning(f"Could not fetch metadata for {key}: {exc}")
        return None

    logger.debug(f"Got Buildkite prompt value for {key}: {value}")
    return value
def as_smoke_test(test: Test) -> Test:
    """Return the smoke-test variant of ``test``.

    When ``test`` has a ``"smoke_test"`` entry, that entry is popped from
    ``test`` (the passed object is modified) and merged into it with
    ``deep_update``. Without such an entry a warning is logged and ``test`` is
    returned as passed.

    Args:
        test: Test configuration, optionally carrying a ``"smoke_test"`` entry.

    Returns:
        The result of ``deep_update``, or ``test`` itself if it has no smoke
        test configuration.
    """
    if "smoke_test" in test:
        overrides = test.pop("smoke_test")
        return deep_update(test, overrides)

    logger.warning(
        f"Requested smoke test, but test with name {test['name']} does "
        f"not have any smoke test configuration.")
    return test
def build_configs(self, timeout: float = 30.0):
    """Create the cluster compute and cluster env, then build the cluster env.

    The three steps run in order. In each step an ``AssertionError`` is only
    logged as a warning, the step's own error types propagate unchanged, and
    any other exception is wrapped in that step's error type.

    Args:
        timeout: Passed through to ``self.build_cluster_env``.

    Raises:
        ClusterComputeCreateError: If creating the cluster compute fails.
        ClusterEnvCreateError: If creating the cluster env fails.
        ClusterEnvBuildError: If building the cluster env fails.
        ClusterEnvBuildTimeout: If raised by ``self.build_cluster_env``.
    """
    # Step 1: cluster compute.
    try:
        self.create_cluster_compute()
    except AssertionError as already_there:
        # If already exists, ignore
        logger.warning(str(already_there))
    except ClusterComputeCreateError:
        raise
    except Exception as unexpected:
        raise ClusterComputeCreateError(
            f"Unexpected cluster compute build error: {unexpected}"
        ) from unexpected

    # Step 2: cluster env.
    try:
        self.create_cluster_env()
    except AssertionError as already_there:
        # If already exists, ignore
        logger.warning(str(already_there))
    except ClusterEnvCreateError:
        raise
    except Exception as unexpected:
        raise ClusterEnvCreateError(
            f"Unexpected cluster env create error: {unexpected}"
        ) from unexpected

    # Step 3: cluster env build.
    try:
        self.build_cluster_env(timeout=timeout)
    except AssertionError as already_there:
        # If already exists, ignore
        logger.warning(str(already_there))
    except (ClusterEnvBuildError, ClusterEnvBuildTimeout):
        raise
    except Exception as unexpected:
        raise ClusterEnvBuildError(
            f"Unexpected cluster env build error: {unexpected}"
        ) from unexpected
def get_concurrency_group(test: Test) -> Tuple[str, int]:
    """Select the concurrency group for ``test`` from its CPU/GPU usage.

    The entries of ``gpu_cpu_to_concurrency_groups`` are checked in order and
    the first one whose CPU and GPU bounds (both inclusive) contain the test's
    resources wins. If the resources cannot be determined, or no entry
    matches, the ``"small"`` group is used and a warning is logged.

    Args:
        test: Test configuration.

    Returns:
        Tuple of the group name and its ``CONCURRENY_GROUPS`` entry.
    """
    try:
        cpus, gpus = get_test_resources(test)
    except Exception as exc:
        logger.warning(
            f"Couldn't get test resources for test {test['name']}: {exc}")
        return "small", CONCURRENY_GROUPS["small"]

    unbounded_low = float("-inf")
    unbounded_high = float("inf")

    for candidate in gpu_cpu_to_concurrency_groups:
        gpu_low = parse_condition(candidate.min_gpu, unbounded_low)
        gpu_high = parse_condition(candidate.max_gpu, unbounded_high)
        cpu_low = parse_condition(candidate.min_cpu, unbounded_low)
        cpu_high = parse_condition(candidate.max_cpu, unbounded_high)

        # CPU bounds are checked first; GPU bounds only if those fit.
        if not cpu_low <= cpus <= cpu_high:
            continue
        if not gpu_low <= gpus <= gpu_high:
            continue

        name = candidate.group
        return name, CONCURRENY_GROUPS[name]

    # Return default
    logger.warning(f"Could not find concurrency group for test {test['name']} "
                   f"based on used resources.")
    return "small", CONCURRENY_GROUPS["small"]
def install_matching_ray(ray_wheels: Optional[str]):
    """Install the Ray wheel matching ``ray_wheels`` into the local environment.

    The wheel reference must contain the ``manylinux2014_x86_64`` platform
    tag. That tag is replaced by the one matching the local ``sys.platform``,
    the installed ``ray`` package is uninstalled and the rewritten wheel is
    installed with pip.

    Args:
        ray_wheels: Reference to the Linux Ray wheel. When falsy, a warning is
            logged and nothing is installed.

    Raises:
        AssertionError: If ``ray_wheels`` lacks the Linux platform tag.
        subprocess.CalledProcessError: If a pip command exits non-zero.
    """
    # Imported locally so this function does not depend on the module's
    # import block.
    import shlex

    if not ray_wheels:
        logger.warning(
            "No Ray wheels found - can't install matching Ray wheels locally!")
        return

    assert "manylinux2014_x86_64" in ray_wheels, ray_wheels
    if sys.platform == "darwin":
        platform = "macosx_10_15_intel"
    elif sys.platform == "win32":
        platform = "win_amd64"
    else:
        platform = "manylinux2014_x86_64"

    ray_wheels = ray_wheels.replace("manylinux2014_x86_64", platform)

    logger.info(f"Installing matching Ray wheels locally: {ray_wheels}")
    subprocess.check_output("pip uninstall -y ray",
                            shell=True,
                            env=os.environ,
                            text=True)
    # The wheel reference is spliced into a shell command line, so quote it:
    # characters such as ``&`` or ``?`` would otherwise be interpreted by the
    # shell. References made only of shell-safe characters pass through
    # unchanged.
    subprocess.check_output(f"pip install -U {shlex.quote(ray_wheels)}",
                            shell=True,
                            env=os.environ,
                            text=True)
def upload(self, source: Optional[str] = None, target: Optional[str] = None):
    """Copy a local file to the remote session via a temporary S3 object.

    When both ``source`` and ``target`` are ``None``, the whole local
    directory is pushed with ``self._push_local_dir()`` instead. Otherwise
    ``source`` is uploaded to a temporary S3 key, copied to ``target``
    remotely through ``self.job_manager``, and the temporary object is
    deleted on a best-effort basis.

    Args:
        source: Local file to upload.
        target: Destination given to ``aws s3 cp`` on the remote side.

    Raises:
        AssertionError: If only one of ``source`` / ``target`` is a string.
        FileUploadError: If the remote copy command exits non-zero.
    """
    push_whole_dir = source is None and target is None
    if push_whole_dir:
        self._push_local_dir()
        return

    assert isinstance(source, str)
    assert isinstance(target, str)

    s3_key = self._generate_tmp_s3_path()

    def _put_object():
        return self.s3_client.upload_file(
            Filename=source,
            Bucket=self.bucket,
            Key=s3_key,
        )

    def _drop_object():
        return self.s3_client.delete_object(Bucket=self.bucket, Key=s3_key)

    # local source -> s3
    self._run_with_retry(_put_object)

    # s3 -> remote target
    bucket_address = f"s3://{self.bucket}/{s3_key}"
    remote_command = (
        f"pip install -q awscli && aws s3 cp {bucket_address} {target}"
    )
    retcode, _ = self.job_manager.run_and_wait(remote_command, {})

    if retcode != 0:
        raise FileUploadError(f"Error uploading file {source} to {target}")

    try:
        self._run_with_retry(_drop_object, initial_retry_delay_s=2)
    except Exception as exc:
        # Best effort only: a leftover temporary object must not fail the upload.
        logger.warning(f"Could not remove temporary S3 object: {exc}")
def run_command(
    self, command: str, env: Optional[Dict] = None, timeout: float = 3600.0
) -> float:
    """Run ``command`` locally in a shell against the cluster, streaming its output.

    The command runs with the current environment, overlaid by ``env`` and
    the ``RAY_ADDRESS`` / ``RAY_JOB_NAME`` / ``PYTHONUNBUFFERED`` variables.
    Its combined stdout/stderr is echoed to ``sys.stdout`` line by line, and
    the last ``LAST_LOGS_LENGTH`` lines are kept in ``self.last_logs``. A
    watchdog thread terminates the process once ``timeout`` has elapsed.

    Args:
        command: Shell command line to execute.
        env: Extra environment variables for the command.
        timeout: Seconds after which the command is terminated.

    Returns:
        Wall-clock run time of the command in seconds.

    Raises:
        CommandTimeout: If the command was terminated by the watchdog (or
            exited with return code -15 / 15).
        CommandError: If the command exited with any other non-zero code.
    """
    logger.info(
        f"Running command using Ray client on cluster "
        f"{self.cluster_manager.cluster_name}: {command}"
    )

    env = env or {}
    full_env = self.get_full_command_env(
        {
            **os.environ,
            **env,
            "RAY_ADDRESS": self.cluster_manager.get_cluster_address(),
            "RAY_JOB_NAME": "test_job",
            "PYTHONUNBUFFERED": "1",
        }
    )

    kill_event = threading.Event()

    def _kill_after(
        proc: subprocess.Popen,
        timeout: float = 30,
        kill_event: Optional[threading.Event] = None,
    ):
        # Poll once per second so the thread ends soon after the process does.
        timeout_at = time.monotonic() + timeout
        while time.monotonic() < timeout_at:
            if proc.poll() is not None:
                return
            time.sleep(1)
        logger.info(
            f"Client command timed out after {timeout} seconds, "
            f"killing subprocess."
        )
        if kill_event:
            kill_event.set()
        proc.terminate()

    start_time = time.monotonic()
    proc = subprocess.Popen(
        command,
        env=full_env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        text=True,
    )

    kill_thread = threading.Thread(
        target=_kill_after, args=(proc, timeout, kill_event)
    )
    kill_thread.start()

    proc.stdout.reconfigure(line_buffering=True)
    sys.stdout.reconfigure(line_buffering=True)
    logs = deque(maxlen=LAST_LOGS_LENGTH)
    try:
        for line in proc.stdout:
            logs.append(line)
            sys.stdout.write(line)
        proc.wait()
    finally:
        # Undo the line buffering even if streaming was interrupted.
        sys.stdout.reconfigure(line_buffering=False)
    time_taken = time.monotonic() - start_time
    # Each captured line still carries its own newline; joining with "\n"
    # would insert an empty line after every log line.
    self.last_logs = "".join(logs)

    return_code = proc.poll()
    # A negative return code -N means the child was ended by signal N; 15 is
    # SIGTERM, which is what proc.terminate() sends on POSIX.
    if return_code == -15 or return_code == 15 or kill_event.is_set():
        # Process has been terminated
        raise CommandTimeout(f"Cluster command timed out after {timeout} seconds.")
    if return_code != 0:
        raise CommandError(f"Command returned non-success status: {return_code}")

    # NOTE(review): looks like leftover debug output at warning level - confirm.
    logger.warning(f"WE GOT RETURN CODE {return_code} AFTER {time_taken}")

    return time_taken
def main(test_collection_file: Optional[str] = None):
    """Build the Buildkite pipeline steps for the release tests and print them.

    Pipeline settings come from ``get_pipeline_settings()``. If a test
    repository is configured, it is cloned to a temporary directory and its
    ``release/release_tests.yaml`` is used; otherwise ``test_collection_file``
    (or the default file relative to this module) is read. The tests are
    filtered and grouped, one step is created per test, and the grouped steps
    are printed to stdout as JSON. When ``BUILDKITE`` is set in the
    environment, the steps and settings are also written as JSON files into a
    freshly created ``PIPELINE_ARTIFACT_PATH``.

    Args:
        test_collection_file: Test collection file to use when no test
            repository is configured.

    Raises:
        ReleaseTestCLIError: If the test repository cannot be cloned or no
            test is left after filtering.
    """
    settings = get_pipeline_settings()

    repo = settings["ray_test_repo"]
    branch = settings["ray_test_branch"]
    tmpdir = None

    env = {}
    try:
        if repo:
            # If the Ray test repo is set, we clone that repo to fetch
            # the test configuration file. Otherwise we might be missing newly
            # added test.
            # mkdtemp() creates the directory atomically and privately; git
            # accepts an existing directory as clone target if it is empty.
            tmpdir = tempfile.mkdtemp()

            # An argument list (no shell) keeps repo/branch values from being
            # interpreted as shell syntax.
            clone_cmd = [
                "git", "clone", "--depth", "1", "--branch", branch, repo, tmpdir
            ]
            try:
                subprocess.check_output(clone_cmd)
            except Exception as e:
                raise ReleaseTestCLIError(f"Could not clone test repository "
                                          f"{repo} (branch {branch}): {e}") from e
            test_collection_file = os.path.join(tmpdir, "release",
                                                "release_tests.yaml")
            env = {
                "RAY_TEST_REPO": repo,
                "RAY_TEST_BRANCH": branch,
            }
        else:
            test_collection_file = test_collection_file or os.path.join(
                os.path.dirname(__file__), "..", "..", "release_tests.yaml")
        test_collection = read_and_validate_release_test_collection(
            test_collection_file)
    finally:
        # Drop the checkout even if cloning or reading the collection failed.
        if tmpdir:
            shutil.rmtree(tmpdir, ignore_errors=True)

    frequency = settings["frequency"]
    prefer_smoke_tests = settings["prefer_smoke_tests"]
    test_attr_regex_filters = settings["test_attr_regex_filters"]
    ray_wheels = settings["ray_wheels"]
    priority = settings["priority"]

    logger.info(
        f"Found the following buildkite pipeline settings:\n\n"
        f" frequency = {settings['frequency']}\n"
        f" prefer_smoke_tests = {settings['prefer_smoke_tests']}\n"
        f" test_attr_regex_filters = {settings['test_attr_regex_filters']}\n"
        f" ray_wheels = {settings['ray_wheels']}\n"
        f" ray_test_repo = {settings['ray_test_repo']}\n"
        f" ray_test_branch = {settings['ray_test_branch']}\n"
        f" priority = {settings['priority']}\n"
        f" no_concurrency_limit = {settings['no_concurrency_limit']}\n")

    filtered_tests = filter_tests(
        test_collection,
        frequency=frequency,
        test_attr_regex_filters=test_attr_regex_filters,
        prefer_smoke_tests=prefer_smoke_tests,
    )
    logger.info(f"Found {len(filtered_tests)} tests to run.")
    if len(filtered_tests) == 0:
        raise ReleaseTestCLIError(
            "Empty test collection. The selected frequency or filter did "
            "not return any tests to run. Adjust your filters.")
    grouped_tests = group_tests(filtered_tests)

    group_str = ""
    for group, tests in grouped_tests.items():
        group_str += f"\n{group}:\n"
        for test, smoke in tests:
            group_str += f" {test['name']}"
            if smoke:
                group_str += " [smoke test]"
            group_str += "\n"
    logger.info(f"Tests to run:\n{group_str}")

    # Wait for wheels here so we have them ready before we kick off
    # the other workers
    ray_wheels_url = find_and_wait_for_ray_wheels_url(
        ray_wheels, timeout=DEFAULT_WHEEL_WAIT_TIMEOUT)
    logger.info(f"Starting pipeline for Ray wheel: {ray_wheels_url}")

    no_concurrency_limit = settings["no_concurrency_limit"]
    if no_concurrency_limit:
        logger.warning("Concurrency is not limited for this run!")

    # Report if REPORT=1 or BUILDKITE_SOURCE=schedule
    report = (bool(int(os.environ.get("REPORT", "0")))
              or os.environ.get("BUILDKITE_SOURCE", "manual") == "schedule")

    steps = []
    for group in sorted(grouped_tests):
        tests = grouped_tests[group]
        group_steps = []
        for test, smoke_test in tests:
            # If the python version is defined, we need a different Ray wheels URL
            if "python" in test:
                python_version = parse_python_version(test["python"])
                this_ray_wheels_url = find_ray_wheels_url(
                    ray_wheels, python_version=python_version)
            else:
                this_ray_wheels_url = ray_wheels_url

            step = get_step(
                test,
                report=report,
                smoke_test=smoke_test,
                ray_wheels=this_ray_wheels_url,
                env=env,
                priority_val=priority.value,
            )

            if no_concurrency_limit:
                step.pop("concurrency", None)
                step.pop("concurrency_group", None)

            group_steps.append(step)

        group_step = {"group": group, "steps": group_steps}
        steps.append(group_step)

    if "BUILDKITE" in os.environ:
        # Start from an empty artifact directory.
        if os.path.exists(PIPELINE_ARTIFACT_PATH):
            shutil.rmtree(PIPELINE_ARTIFACT_PATH)

        os.makedirs(PIPELINE_ARTIFACT_PATH, exist_ok=True, mode=0o755)

        with open(os.path.join(PIPELINE_ARTIFACT_PATH, "pipeline.json"),
                  "wt") as fp:
            json.dump(steps, fp)

        # Replace these two settings by their ``.value`` before dumping.
        settings["frequency"] = settings["frequency"].value
        settings["priority"] = settings["priority"].value
        with open(os.path.join(PIPELINE_ARTIFACT_PATH, "settings.json"),
                  "wt") as fp:
            json.dump(settings, fp)

    steps_str = json.dumps(steps)
    print(steps_str)