Пример #1
0
    def _wait_job(self, command_id: int, timeout: int):
        """Block until the job registered under ``command_id`` has finished.

        The job status is polled about once per second and a progress line is
        logged every 30 seconds.

        Args:
            command_id: Key used to query the job status and to look up the
                recorded start time in ``self.start_time``.
            timeout: Maximum number of seconds to keep polling.

        Returns:
            A ``(retcode, duration)`` tuple. ``retcode`` is 0 if the final
            status is ``SUCCEEDED`` and -1 otherwise. ``duration`` is the
            wall-clock time since ``self.start_time[command_id]``.

        Raises:
            CommandTimeout: If no terminal status was observed within
                ``timeout`` seconds.
        """
        from ray.job_submission import JobStatus  # noqa: F811

        terminal_states = {
            JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED
        }

        began = time.monotonic()
        deadline = began + timeout
        next_report = began + 30

        finished = False
        while not finished:
            current = time.monotonic()
            if current >= deadline:
                raise CommandTimeout(
                    f"Cluster command timed out after {timeout} seconds.")

            if current >= next_report:
                logger.info(f"... command still running ..."
                            f"({int(current - began)} seconds) ...")
                next_report += 30

            if self._get_job_status_with_retry(command_id) in terminal_states:
                finished = True
            else:
                time.sleep(1)

        # The status is queried once more after the loop; the return code is
        # derived from this final reading.
        final_status = self._get_job_status_with_retry(command_id)
        # TODO(sang): Propagate JobInfo.error_type
        retcode = 0 if final_status == JobStatus.SUCCEEDED else -1
        duration = time.time() - self.start_time[command_id]
        return retcode, duration
Пример #2
0
    def wait_for_nodes(self, num_nodes: int, timeout: float = 900):
        """Connect to the cluster and wait until ``num_nodes`` nodes are up.

        The number of nodes reported by ``ray.nodes()`` is polled once per
        second and a progress line is logged every 30 seconds.

        Args:
            num_nodes: Number of nodes that must be reported before returning.
            timeout: Maximum number of seconds to wait.

        Raises:
            ClusterStartupError: If connecting fails, or if fewer than
                ``num_nodes`` nodes are up after ``timeout`` seconds (the
                underlying ``ClusterNodesWaitTimeout`` is chained as cause).
        """
        ray_address = self.cluster_manager.get_cluster_address()
        try:
            # ``ray.is_initialized`` is a function: it must be called. The
            # bare attribute is always truthy.
            if ray.is_initialized():
                ray.shutdown()

            ray.init(address=ray_address)
            try:
                start_time = time.monotonic()
                timeout_at = start_time + timeout
                next_status = start_time + 30
                nodes_up = len(ray.nodes())
                while nodes_up < num_nodes:
                    now = time.monotonic()
                    if now >= timeout_at:
                        raise ClusterNodesWaitTimeout(
                            f"Only {len(ray.nodes())}/{num_nodes} are up "
                            f"after {timeout} seconds.")

                    if now >= next_status:
                        logger.info(f"Waiting for nodes to come up: "
                                    f"{len(ray.nodes())}/{num_nodes} "
                                    f"({now - start_time:.2f} seconds, "
                                    f"timeout: {timeout} seconds).")
                        next_status += 30

                    time.sleep(1)
                    nodes_up = len(ray.nodes())
            finally:
                # Always drop the connection, also when the wait timed out or
                # failed, so no Ray session is left open.
                ray.shutdown()
        except Exception as e:
            raise ClusterStartupError(
                f"Exception when waiting for nodes: {e}") from e

        logger.info(f"All {num_nodes} nodes are up.")
Пример #3
0
def install_matching_ray_locally(ray_wheels: Optional[str]):
    """Install the Ray wheel matching ``ray_wheels`` into the local env.

    The wheel location is expected to contain the ``manylinux2014_x86_64``
    platform tag, which is swapped for the tag of the local platform before
    installing. Afterwards every module listed in ``RELOAD_MODULES`` that is
    already imported is reloaded.

    Args:
        ray_wheels: Location of the Linux Ray wheel. If empty, a warning is
            logged and nothing is installed.
    """
    if not ray_wheels:
        logger.warning(
            "No Ray wheels found - can't install matching Ray wheels locally!")
        return

    linux_tag = "manylinux2014_x86_64"
    assert linux_tag in ray_wheels, ray_wheels

    # Platform tag for the local machine; Linux is the fallback.
    local_tags = {"darwin": "macosx_10_15_intel", "win32": "win_amd64"}
    ray_wheels = ray_wheels.replace(
        linux_tag, local_tags.get(sys.platform, linux_tag))

    logger.info(f"Installing matching Ray wheels locally: {ray_wheels}")

    pip_kwargs = dict(shell=True, env=os.environ, text=True)
    subprocess.check_output("pip uninstall -y ray", **pip_kwargs)
    subprocess.check_output(f"pip install -U {shlex.quote(ray_wheels)}",
                            **pip_kwargs)

    # Refresh modules that were imported before the reinstall.
    for name in RELOAD_MODULES:
        if name not in sys.modules:
            continue
        importlib.reload(sys.modules[name])
Пример #4
0
def find_cloud_by_name(cloud_name: str,
                       sdk: Optional["AnyscaleSDK"] = None) -> Optional[str]:
    """Return the ID of the cloud called ``cloud_name``.

    Clouds are fetched page by page (50 per page) until a match is found,
    there is no further page, or a page comes back empty.

    Args:
        cloud_name: Exact cloud name to look for.
        sdk: SDK client to query. Defaults to ``get_anyscale_sdk()``.

    Returns:
        The ID of the matching cloud, or ``None`` if none was found.
    """
    sdk = sdk or get_anyscale_sdk()

    logger.info(f"Looking up cloud with name `{cloud_name}`. ")

    found_id = None
    next_token = None
    while True:
        page = sdk.search_clouds(clouds_query=dict(
            paging=dict(count=50, paging_token=next_token)))
        next_token = page.metadata.next_paging_token

        for cloud in page.results:
            if cloud.name != cloud_name:
                continue
            found_id = cloud.id
            logger.info(
                f"Found cloud with name `{cloud_name}` as `{found_id}`")
            break

        # Stop on a hit, on the last page, or on an empty page.
        if found_id or not next_token or len(page.results) == 0:
            return found_id
Пример #5
0
    def report_result(self, test: Test, result: Result):
        """Send the test result as one JSON record to the firehose stream.

        Delivery is best effort: a failure is logged with its traceback and
        not re-raised.

        Args:
            test: Test configuration; name, group, team and frequency are read.
            result: Result object whose fields are copied into the record.
        """
        logger.info("Persisting result to the databricks delta lake...")

        record = dict(
            _table="release_test_result",
            report_timestamp_ms=int(time.time() * 1000),
            status=result.status or "",
            results=result.results or {},
            name=test.get("name", ""),
            group=test.get("group", ""),
            team=test.get("team", ""),
            frequency=test.get("frequency", ""),
            cluster_url=result.cluster_url or "",
            wheel_url=result.wheels_url or "",
            buildkite_url=result.buildkite_url or "",
            runtime=result.runtime or -1.0,
            stable=result.stable,
            return_code=result.return_code,
        )

        # Serialize once; the same payload is logged and delivered.
        payload = json.dumps(record)
        logger.debug(f"Result json: {payload}")

        try:
            self.firehose.put_record(
                DeliveryStreamName="ray-ci-results",
                Record={"Data": payload},
            )
        except Exception:
            logger.exception(
                "Failed to persist result to the databricks delta lake")
            return

        logger.info("Result has been persisted to the databricks delta lake")
Пример #6
0
    def run_command(self,
                    command: str,
                    env: Optional[Dict] = None,
                    timeout: float = 3600.0) -> float:
        """Run ``command`` on the cluster through the job manager.

        The full command environment is rendered as ``KEY=VALUE`` pairs in
        front of the command.

        Args:
            command: Shell command to execute.
            env: Extra environment variables, merged by
                ``get_full_command_env``.
            timeout: Timeout handed to the job manager.

        Returns:
            The time taken as reported by ``job_manager.run_and_wait``.

        Raises:
            CommandError: If the job manager reports a non-zero status code.
        """
        full_env = self.get_full_command_env(env)

        env_prefix = ""
        if full_env:
            pairs = (f"{key}={value}" for key, value in full_env.items())
            env_prefix = " ".join(pairs) + " "
        full_command = f"{env_prefix}{command}"

        logger.info(
            f"Running command in cluster {self.cluster_manager.cluster_name}: "
            f"{full_command}")

        cluster_link = format_link(self.cluster_manager.get_cluster_url())
        logger.info(f"Link to cluster: {cluster_link}")

        status_code, time_taken = self.job_manager.run_and_wait(
            full_command, full_env, timeout=timeout)

        if status_code == 0:
            return time_taken

        raise CommandError(
            f"Command returned non-success status: {status_code}")
Пример #7
0
def reinstall_anyscale_dependencies() -> None:
    """Upgrade the locally installed ``anyscale`` package with pip.

    Raises:
        subprocess.CalledProcessError: If the pip command exits non-zero.
    """
    logger.info("Re-installing `anyscale` package")

    pip_command = "pip install -U anyscale"
    subprocess.check_output(pip_command, shell=True, text=True)
Пример #8
0
def main(
    test_name: str,
    test_collection_file: Optional[str] = None,
    smoke_test: bool = False,
    report: bool = False,
    ray_wheels: Optional[str] = None,
    cluster_id: Optional[str] = None,
    cluster_env_id: Optional[str] = None,
    no_terminate: bool = False,
):
    """Look up one release test, run it, and exit with its return code.

    Args:
        test_name: Name of the test to look up in the collection file.
        test_collection_file: Path to the test collection. Defaults to
            ``release_tests.yaml`` two directories above this file.
        smoke_test: Run the smoke-test variant of the test.
        report: Additionally report through ``LegacyRDSReporter``.
        ray_wheels: Wheel specifier passed to the wheel lookup.
        cluster_id: Forwarded to ``run_release_test``.
        cluster_env_id: Forwarded to ``run_release_test``.
        no_terminate: Forwarded to ``run_release_test``.

    Raises:
        ReleaseTestCLIError: If the test is not in the collection or
            ``ANYSCALE_PROJECT`` is not set.
        SystemExit: Always, carrying ``result.return_code``.
    """
    if not test_collection_file:
        test_collection_file = os.path.join(
            os.path.dirname(__file__), "..", "..", "release_tests.yaml")

    collection = read_and_validate_release_test_collection(
        test_collection_file)
    test = find_test(collection, test_name)
    if not test:
        raise ReleaseTestCLIError(
            f"Test `{test_name}` not found in collection file: "
            f"{test_collection_file}")

    if smoke_test:
        test = as_smoke_test(test)

    wheels_url = find_and_wait_for_ray_wheels_url(
        ray_wheels, timeout=DEFAULT_WHEEL_WAIT_TIMEOUT)

    project = os.environ.get("ANYSCALE_PROJECT", None)
    if not project:
        raise ReleaseTestCLIError(
            "You have to set the ANYSCALE_PROJECT environment variable!")

    maybe_fetch_api_token()

    # Created up front and passed in, so it is still available for the exit
    # code when the run raises a ReleaseTestError.
    result = Result()

    reporters = [LogReporter()]
    if report:
        reporters.append(LegacyRDSReporter())

    run_kwargs = dict(
        anyscale_project=project,
        result=result,
        ray_wheels_url=wheels_url,
        reporters=reporters,
        smoke_test=smoke_test,
        cluster_id=cluster_id,
        cluster_env_id=cluster_env_id,
        no_terminate=no_terminate,
    )
    try:
        result = run_release_test(test, **run_kwargs)
    except ReleaseTestError as e:
        logger.exception(e)

    exit_code = result.return_code
    logger.info(f"Release test pipeline for test {test['name']} completed. "
                f"Returning with exit code = {exit_code}")
    sys.exit(exit_code)
Пример #9
0
def find_and_wait_for_ray_wheels_url(
    ray_wheels: Optional[str] = None,
    python_version: Tuple[int, int] = DEFAULT_PYTHON_VERSION,
    timeout: float = 3600.0,
) -> str:
    """Resolve the Ray wheels URL and wait until it is available.

    Args:
        ray_wheels: Wheel specifier forwarded to ``find_ray_wheels_url``.
        python_version: ``(major, minor)`` version the wheel is built for.
        timeout: Timeout forwarded to ``wait_for_url``.

    Returns:
        The value returned by ``wait_for_url`` for the resolved URL.
    """
    resolved_url = find_ray_wheels_url(
        ray_wheels, python_version=python_version)
    logger.info(f"Using Ray wheels URL: {resolved_url}")
    return wait_for_url(resolved_url, timeout=timeout)
Пример #10
0
 def download(self, source: str, target: str):
     """Pull ``source`` from the cluster session into the local ``target``.

     An empty ``source`` or ``target`` is shown as ``<cwd>`` in the log line.
     """
     logger.info(f"Downloading {source or '<cwd>'} to {target or '<cwd>'} "
                 f"using SessionController")
     session = self.cluster_manager.cluster_name
     self.session_controller.pull(
         session_name=session, source=source, target=target, config=None)
Пример #11
0
def install_cluster_env_packages(cluster_env: Dict[Any, Any]):
    """Install the cluster env's pip packages into the local environment.

    The ``env_vars`` of the cluster env (if any) are first merged into
    ``os.environ``; then each entry of ``cluster_env["python"]["pip_packages"]``
    is installed with ``pip install -U``.

    Args:
        cluster_env: Cluster environment configuration.

    Raises:
        KeyError: If ``cluster_env`` has no ``python``/``pip_packages`` entry.
        subprocess.CalledProcessError: If a pip invocation exits non-zero.
    """
    import shlex

    os.environ.update(cluster_env.get("env_vars", {}))
    packages = cluster_env["python"]["pip_packages"]
    logger.info(f"Installing cluster env packages locally: {packages}")

    for package in packages:
        # Run pip without a shell: version specifiers such as ``pkg>=1.0`` or
        # ``pkg<2`` would otherwise be parsed as shell redirections. The entry
        # is still split into words, so entries that carry extra pip options
        # keep working as before.
        subprocess.check_output(
            ["pip", "install", "-U", *shlex.split(package)],
            env=os.environ,
            text=True)
Пример #12
0
    def fetch_results(self) -> Dict[str, Any]:
        """Download the result JSON file from the cluster and parse it.

        The file at ``self.result_output_json`` is downloaded into a local
        temporary file, which is removed again whether or not loading works.

        Returns:
            The parsed JSON content of the result file.

        Raises:
            ResultsError: If the file cannot be downloaded or parsed.
        """
        try:
            fd, tmpfile = tempfile.mkstemp(suffix=".json")
            # mkstemp hands back an open OS-level descriptor; close it right
            # away since only the path is needed.
            os.close(fd)
            try:
                logger.info(tmpfile)
                self.file_manager.download(self.result_output_json, tmpfile)

                with open(tmpfile, "rt") as f:
                    return json.load(f)
            finally:
                # Clean up on failure too, so temp files do not pile up.
                if os.path.exists(tmpfile):
                    os.unlink(tmpfile)
        except Exception as e:
            raise ResultsError(
                f"Could not fetch results from session: {e}") from e
Пример #13
0
    def create_cluster_env(self, _repeat: bool = True):
        """Resolve ``self.cluster_env_id``, creating the cluster env if needed.

        Does nothing when ``self.cluster_env`` is not set. Otherwise existing
        cluster environments with the configured name are searched page by
        page; if none matches, a new one is created from ``self.cluster_env``.

        Args:
            _repeat: Internal flag. When true, a failed creation is retried
                once after a 10 second pause.

        Raises:
            ClusterEnvCreateError: If creation fails and no retry is left.
        """
        assert self.cluster_env_id is None

        if not self.cluster_env:
            return

        assert self.cluster_env_name

        logger.info(
            f"Test uses a cluster env with name "
            f"{self.cluster_env_name}. Looking up existing "
            f"cluster envs with this name."
        )

        next_token = None
        while not self.cluster_env_id:
            # NOTE(review): the paging key here is ``token`` while
            # find_cloud_by_name uses ``paging_token`` -- confirm against
            # the SDK which key is honored.
            query = dict(
                project_id=self.project_id,
                name=dict(equals=self.cluster_env_name),
                paging=dict(count=50, token=next_token),
            )
            page = self.sdk.search_cluster_environments(query)
            next_token = page.metadata.next_paging_token

            for candidate in page.results:
                if candidate.name != self.cluster_env_name:
                    continue
                self.cluster_env_id = candidate.id
                logger.info(
                    f"Cluster env already exists with ID "
                    f"{self.cluster_env_id}"
                )
                break

            if self.cluster_env_id or not next_token:
                break

        if self.cluster_env_id:
            return

        logger.info("Cluster env not found. Creating new one.")
        try:
            created = self.sdk.create_cluster_environment(
                dict(
                    name=self.cluster_env_name,
                    project_id=self.project_id,
                    config_json=self.cluster_env,
                )
            )
            self.cluster_env_id = created.result.id
        except Exception as e:
            if not _repeat:
                raise ClusterEnvCreateError("Could not create cluster env.") from e

            logger.warning(
                f"Got exception when trying to create cluster "
                f"env: {e}. Sleeping for 10 seconds and then "
                f"try again once..."
            )
            time.sleep(10)
            # The retry starts over with the lookup, then tries to create.
            return self.create_cluster_env(_repeat=False)

        logger.info(f"Cluster env created with ID {self.cluster_env_id}")
Пример #14
0
 def _kill_after(
     proc: subprocess.Popen,
     timeout: int = 30,
     kill_event: Optional[threading.Event] = None,
 ):
     """Terminate ``proc`` if it is still running after ``timeout`` seconds.

     The process is polled once per second. If it exits on its own before
     the deadline, nothing else happens.

     Args:
         proc: Process to watch.
         timeout: Seconds to wait before terminating the process.
         kill_event: Optional event that is set just before terminating.
     """
     deadline = time.monotonic() + timeout
     while True:
         if time.monotonic() >= deadline:
             break
         if proc.poll() is not None:
             # Process ended by itself; nothing to kill.
             return
         time.sleep(1)

     logger.info(f"Client command timed out after {timeout} seconds, "
                 f"killing subprocess.")
     if kill_event:
         kill_event.set()
     proc.terminate()
Пример #15
0
def exponential_backoff_retry(f, retry_exceptions, initial_retry_delay_s,
                              max_retries):
    """Call ``f`` until it succeeds, backing off exponentially on failure.

    Args:
        f: Zero-argument callable to invoke.
        retry_exceptions: Exception class (or tuple of classes) that triggers
            a retry. Any other exception propagates immediately.
        initial_retry_delay_s: Seconds to sleep before the first retry. The
            delay doubles after every failed attempt.
        max_retries: Number of retries allowed after the initial attempt.

    Returns:
        Whatever ``f()`` returns on its first successful call.

    Raises:
        The last exception raised by ``f`` once ``max_retries`` retries have
        been used up.
    """
    retry_cnt = 0
    retry_delay_s = initial_retry_delay_s
    while True:
        try:
            return f()
        except retry_exceptions as e:
            retry_cnt += 1
            if retry_cnt > max_retries:
                # Out of retries: surface the original exception unchanged.
                raise
            logger.info(f"Function call failed due to {e}, retrying "
                        f"in {retry_delay_s} seconds...")
            time.sleep(retry_delay_s)
            retry_delay_s *= 2
Пример #16
0
    def report_result(self, test: Test, result: Result):
        """Write the test config and the result as JSON artifact files.

        Both files are written into ``self.artifacts_dir``, which is created
        (mode ``0o755``) if it does not exist yet.

        Args:
            test: Test configuration, dumped to ``ARTIFACT_TEST_CONFIG_FILE``.
            result: Result object; its ``__dict__`` is dumped to
                ``ARTIFACT_RESULT_FILE``.
        """
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(self.artifacts_dir, 0o755, exist_ok=True)

        test_config_file = os.path.join(self.artifacts_dir,
                                        ARTIFACT_TEST_CONFIG_FILE)
        with open(test_config_file, "wt") as fp:
            json.dump(test, fp, sort_keys=True, indent=4)

        result_file = os.path.join(self.artifacts_dir, ARTIFACT_RESULT_FILE)
        with open(result_file, "wt") as fp:
            json.dump(result.__dict__, fp, sort_keys=True, indent=4)

        logger.info(
            f"Wrote test config and result to artifacts directory: {self.artifacts_dir}"
        )
Пример #17
0
    def create_cluster_compute(self, _repeat: bool = True):
        """Resolve ``self.cluster_compute_id``, creating the compute if needed.

        Does nothing when ``self.cluster_compute`` is not set. Otherwise
        existing cluster computes with the configured name are searched page
        by page; if none matches, a new one is created from
        ``self.cluster_compute``.

        Args:
            _repeat: Internal flag. When true, a failed creation is retried
                once after a 10 second pause.

        Raises:
            ClusterComputeCreateError: If creation fails and no retry is left.
        """
        assert self.cluster_compute_id is None

        if self.cluster_compute:
            # The lookup and the creation below both key on the name, so that
            # is what has to be present here (mirrors create_cluster_env).
            assert self.cluster_compute_name

            logger.info(f"Tests uses compute template "
                        f"with name {self.cluster_compute_name}. "
                        f"Looking up existing cluster computes.")

            paging_token = None
            while not self.cluster_compute_id:
                result = self.sdk.search_cluster_computes(
                    dict(
                        project_id=self.project_id,
                        name=dict(equals=self.cluster_compute_name),
                        include_anonymous=True,
                        paging=dict(token=paging_token),
                    ))
                paging_token = result.metadata.next_paging_token

                for res in result.results:
                    if res.name == self.cluster_compute_name:
                        self.cluster_compute_id = res.id
                        logger.info(f"Cluster compute already exists "
                                    f"with ID {self.cluster_compute_id}")
                        break

                # Stop on a hit or when there is no further page.
                if not paging_token or self.cluster_compute_id:
                    break

            if not self.cluster_compute_id:
                logger.info(f"Cluster compute not found. "
                            f"Creating with name {self.cluster_compute_name}.")
                try:
                    result = self.sdk.create_cluster_compute(
                        dict(
                            name=self.cluster_compute_name,
                            project_id=self.project_id,
                            config=self.cluster_compute,
                        ))
                    self.cluster_compute_id = result.result.id
                except Exception as e:
                    if _repeat:
                        logger.warning(
                            f"Got exception when trying to create cluster "
                            f"compute: {e}. Sleeping for 10 seconds and then "
                            f"try again once...")
                        time.sleep(10)
                        # The retry starts over with the lookup.
                        return self.create_cluster_compute(_repeat=False)

                    raise ClusterComputeCreateError(
                        "Could not create cluster compute") from e

                logger.info(f"Cluster compute template created with "
                            f"name {self.cluster_compute_name} and "
                            f"ID {self.cluster_compute_id}")
Пример #18
0
def maybe_fetch_api_token():
    """Make sure ``ANYSCALE_CLI_TOKEN`` is set in the environment.

    If the variable is already set, nothing happens. Otherwise locally stored
    anyscale credentials are tried first; if that fails for any reason, the
    token is read from AWS Secrets Manager (region ``us-west-2``).
    """
    if os.environ.get("ANYSCALE_CLI_TOKEN"):
        return

    try:
        token, _ = AuthenticationBlock._load_credentials()
        logger.info("Loaded anyscale credentials from local storage.")
        os.environ["ANYSCALE_CLI_TOKEN"] = token
        return
    except Exception:
        pass  # Ignore errors and fall back to the secrets store

    logger.info(
        "Missing ANYSCALE_CLI_TOKEN, retrieving from AWS secrets store")
    # NOTE(simon) This should automatically retrieve the anyscale token
    # stored under RELEASE_AWS_ANYSCALE_SECRET_ARN.
    secrets_client = boto3.client("secretsmanager", region_name="us-west-2")
    secret = secrets_client.get_secret_value(
        SecretId=RELEASE_AWS_ANYSCALE_SECRET_ARN)
    os.environ["ANYSCALE_CLI_TOKEN"] = secret["SecretString"]
Пример #19
0
def handle_result(test: Test, result: Result):
    """Run the alert handler configured for ``test`` against ``result``.

    The handler is chosen by the test's ``alert`` key (``"default"`` if
    absent) from ``result_to_handle_map``.

    Raises:
        ReleaseTestConfigError: If the alert suite is not registered.
        ResultsAlert: If the handler returns a truthy error.
    """
    suite_name = test.get("alert", "default")

    logger.info(
        f"Checking results for test {test['name']} using alerting suite "
        f"{suite_name}")

    if suite_name not in result_to_handle_map:
        raise ReleaseTestConfigError(f"Alert suite {suite_name} not found.")

    check = result_to_handle_map[suite_name]
    problem = check(test, result)
    if problem:
        raise ResultsAlert(problem)

    logger.info("No alerts have been raised - test passed successfully!")
Пример #20
0
    def upload(self,
               source: Optional[str] = None,
               target: Optional[str] = None):
        """Push ``source`` to ``target`` on the cluster session.

        When ``source`` is an existing directory and ``target`` is given,
        both paths get a trailing path separator before the push. An empty
        ``source`` or ``target`` is shown as ``<cwd>`` in the log line.
        """
        logger.info(f"Uploading {source or '<cwd>'} to {target or '<cwd>'} "
                    f"using SessionController")

        is_directory_push = source and os.path.isdir(source) and target
        if is_directory_push:
            # Add trailing slashes
            source, target = os.path.join(source, ""), os.path.join(target, "")

        session = self.cluster_manager.cluster_name
        self.session_controller.push(
            session_name=session,
            source=source,
            target=target,
            config=None,
            all_nodes=False,
        )
Пример #21
0
    def _run_job(self, cmd_to_run, env_vars) -> int:
        """Submit ``cmd_to_run`` as a job and return its command ID.

        The environment variables are rendered as ``KEY=VALUE`` pairs in
        front of the command to form the job entrypoint.

        Args:
            cmd_to_run: Shell command to execute.
            env_vars: Mapping of environment variables for the command.

        Returns:
            The command ID (a running counter) under which the job ID and the
            start time were recorded.
        """
        self.counter += 1
        command_id = self.counter

        # NOTE(review): this environment copy is populated but not handed to
        # anything within this method -- confirm whether it is still needed.
        local_env = os.environ.copy()
        local_env["RAY_ADDRESS"] = self.cluster_manager.get_cluster_address()
        local_env.setdefault("ANYSCALE_HOST", ANYSCALE_HOST)

        env_prefix = " ".join(
            f"{key}={value}" for key, value in env_vars.items())
        entrypoint = env_prefix + " " + cmd_to_run
        logger.info(f"Executing {cmd_to_run} with {env_vars} via ray job submit")

        job_id = self._get_job_client().submit_job(
            # Entrypoint shell command to execute
            entrypoint=entrypoint,
        )

        self.last_job_id = job_id
        self.job_id_pool[command_id] = job_id
        self.start_time[command_id] = time.time()
        return command_id
Пример #22
0
def wait_for_url(
    url, timeout: float = 300.0, check_time: float = 30.0, status_time: float = 60.0
) -> str:
    """Block until ``url_exists(url)`` is true, then return ``url``.

    Args:
        url: URL to wait for.
        timeout: Seconds after which waiting is given up.
        check_time: Seconds to sleep between two availability checks.
        status_time: Seconds between two progress log lines.

    Raises:
        RayWheelsTimeoutError: If the URL is still unavailable when a check
            happens ``timeout`` seconds or more after the start.
    """
    began = time.monotonic()
    deadline = began + timeout
    next_report = began + status_time
    logger.info(f"Waiting up to {timeout} seconds until URL is available ({url})")

    while True:
        if url_exists(url):
            break

        current = time.monotonic()
        if current >= deadline:
            raise RayWheelsTimeoutError(
                f"Time out when waiting for URL to be available: {url}"
            )

        if current >= next_report:
            elapsed = int(current - began)
            logger.info(f"... still waiting for URL {url} ({elapsed} seconds) ...")
            next_report += status_time

        # Pause for `check_time` seconds before probing again.
        time.sleep(check_time)

    logger.info(f"URL is now available: {url}")
    return url
Пример #23
0
def install_matching_ray(ray_wheels: Optional[str]):
    """Install the Ray wheel matching ``ray_wheels`` into the local env.

    The wheel location is expected to contain the ``manylinux2014_x86_64``
    platform tag, which is swapped for the tag of the local platform before
    installing.

    Args:
        ray_wheels: Location of the Linux Ray wheel. If empty, a warning is
            logged and nothing is installed.

    Raises:
        subprocess.CalledProcessError: If a pip invocation exits non-zero.
    """
    if not ray_wheels:
        logger.warning(
            "No Ray wheels found - can't install matching Ray wheels locally!")
        return
    assert "manylinux2014_x86_64" in ray_wheels, ray_wheels
    if sys.platform == "darwin":
        platform = "macosx_10_15_intel"
    elif sys.platform == "win32":
        platform = "win_amd64"
    else:
        platform = "manylinux2014_x86_64"
    ray_wheels = ray_wheels.replace("manylinux2014_x86_64", platform)
    logger.info(f"Installing matching Ray wheels locally: {ray_wheels}")
    # Run pip with an argument list instead of a shell string, so characters
    # in the wheel URL (e.g. `&`, `?`, spaces) are never interpreted by a
    # shell.
    subprocess.check_output(["pip", "uninstall", "-y", "ray"],
                            env=os.environ,
                            text=True)
    subprocess.check_output(["pip", "install", "-U", ray_wheels],
                            env=os.environ,
                            text=True)
Пример #24
0
    def report_result(self, test: Test, result: Result):
        """Log a human-readable summary of the test result.

        Three log records are emitted: the last logs with the runtime, the
        result metadata (including links), and the reported result values.
        """
        logger.info(f"Test {test['name']} finished after "
                    f"{result.runtime:.2f} seconds. Last logs:\n\n"
                    f"{result.last_logs}\n")

        # The empty entries yield the blank separator line and the trailing
        # newline of the message.
        metadata_lines = [
            "Got the following metadata: ",
            f"  name:    {test['name']}",
            f"  status:  {result.status}",
            f"  runtime: {result.runtime:.2f}",
            f"  stable:  {result.stable}",
            "",
            f"  buildkite_url: {format_link(result.buildkite_url)}",
            f"  wheels_url:    {format_link(result.wheels_url)}",
            f"  cluster_url:   {format_link(result.cluster_url)}",
            "",
        ]
        logger.info("\n".join(metadata_lines))

        reported = result.results
        if not reported:
            summary = "Did not find any results."
        else:
            entries = [f"  {key} = {val}\n" for key, val in reported.items()]
            summary = "Observed the following results:\n\n" + "".join(entries)
        logger.info(summary)
Пример #25
0
def run_release_test(
    test: Test,
    anyscale_project: str,
    result: Result,
    ray_wheels_url: str,
    reporters: Optional[List[Reporter]] = None,
    smoke_test: bool = False,
    cluster_id: Optional[str] = None,
    cluster_env_id: Optional[str] = None,
    no_terminate: bool = False,
) -> Result:
    """Run one release test end to end and report its outcome.

    The pipeline validates the test, sets up the cluster/file managers and the
    command runner, prepares the local and remote environments, starts (or
    re-uses) a cluster, runs the optional prepare command and the test script,
    fetches the results, and finally hands ``result`` to every reporter.

    Exceptions raised inside the main pipeline are captured, so that logs can
    still be fetched, the cluster terminated and the result reported; the
    captured exception is re-raised at the very end.

    Args:
        test: Test configuration.
        anyscale_project: Project passed to the cluster manager.
        result: Result object that is filled in place.
        ray_wheels_url: Wheels URL recorded on the result and used to load the
            cluster env and to prepare the local env.
        reporters: Reporters that receive ``(test, result)``; reporter errors
            are logged and ignored.
        smoke_test: If true, ``--smoke-test`` is appended to the script and
            ``IS_SMOKE_TEST=1`` is added to the command environment.
        cluster_id: If set, this existing cluster is used instead of building
            configs and starting a new one.
        cluster_env_id: If set, this cluster env is used instead of the one
            loaded from the test configuration.
        no_terminate: If true, the cluster is not terminated afterwards.

    Returns:
        The ``result`` object that was passed in.

    Raises:
        ReleaseTestConfigError: If the run type or file manager is unknown.
        ReleaseTestSetupError: If the managers or the command runner cannot
            be instantiated.
        Exception: Any exception captured during the pipeline or raised by
            ``handle_result`` is re-raised after reporting.
    """
    buildkite_group(":spiral_note_pad: Loading test configuration")

    validate_test(test)

    result.wheels_url = ray_wheels_url
    result.stable = test.get("stable", True)
    result.smoke_test = smoke_test

    buildkite_url = os.getenv("BUILDKITE_BUILD_URL", "")
    if buildkite_url:
        # Append the job ID as URL fragment.
        buildkite_url += "#" + os.getenv("BUILDKITE_JOB_ID", "")
    result.buildkite_url = buildkite_url

    working_dir = test["working_dir"]

    # Switch into the test's working directory; the previous directory is
    # restored further down, once the pipeline has finished.
    # NOTE(review): the setup errors raised before the main ``try`` below
    # leave the process in ``new_wd`` -- confirm this is acceptable.
    old_wd = os.getcwd()
    new_wd = os.path.join(RELEASE_PACKAGE_DIR, working_dir)
    os.chdir(new_wd)

    start_time = time.monotonic()

    run_type = test["run"].get("type", "sdk_command")

    command_runner_cls = type_str_to_command_runner.get(run_type)
    if not command_runner_cls:
        raise ReleaseTestConfigError(
            f"Unknown command runner type: {run_type}. Must be one of "
            f"{list(type_str_to_command_runner.keys())}")

    cluster_manager_cls = command_runner_to_cluster_manager[command_runner_cls]

    # An explicitly configured file manager takes precedence over the default
    # one associated with the command runner.
    file_manager_str = test["run"].get("file_manager", None)
    if file_manager_str:
        if file_manager_str not in file_manager_str_to_file_manager:
            raise ReleaseTestConfigError(
                f"Unknown file manager: {file_manager_str}. Must be one of "
                f"{list(file_manager_str_to_file_manager.keys())}")
        file_manager_cls = file_manager_str_to_file_manager[file_manager_str]
    else:
        file_manager_cls = command_runner_to_file_manager[command_runner_cls]

    # Instantiate managers and command runner
    try:
        cluster_manager = cluster_manager_cls(test["name"],
                                              anyscale_project,
                                              smoke_test=smoke_test)
        file_manager = file_manager_cls(cluster_manager=cluster_manager)
        command_runner = command_runner_cls(cluster_manager, file_manager,
                                            working_dir)
    except Exception as e:
        raise ReleaseTestSetupError(
            f"Error setting up release test: {e}") from e

    # Any exception from the pipeline below is stored here instead of being
    # propagated right away, so that cleanup and reporting still run.
    pipeline_exception = None
    try:
        # Load configs
        cluster_env = load_test_cluster_env(test,
                                            ray_wheels_url=ray_wheels_url)
        cluster_compute = load_test_cluster_compute(test)

        if cluster_env_id:
            try:
                cluster_manager.cluster_env_id = cluster_env_id
                cluster_manager.build_cluster_env()
                cluster_manager.fetch_build_info()
                logger.info("Using overridden cluster environment with ID "
                            f"{cluster_env_id} and build ID "
                            f"{cluster_manager.cluster_env_build_id}")
            except Exception as e:
                raise ClusterEnvCreateError(
                    f"Could not get existing overridden cluster environment "
                    f"{cluster_env_id}: {e}") from e
        else:
            cluster_manager.set_cluster_env(cluster_env)

        cluster_manager.set_cluster_compute(cluster_compute)

        buildkite_group(":nut_and_bolt: Setting up local environment")
        driver_setup_script = test.get("driver_setup", None)
        if driver_setup_script:
            try:
                run_bash_script(driver_setup_script)
            except Exception as e:
                raise LocalEnvSetupError(
                    f"Driver setup script failed: {e}") from e

        # Install local dependencies
        command_runner.prepare_local_env(ray_wheels_url)
        command_timeout = test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT)

        # Re-install anyscale package as local dependencies might have changed
        # from local env setup
        reinstall_anyscale_dependencies()

        # Print installed pip packages
        buildkite_group(":bulb: Local environment information")
        pip_packages = get_pip_packages()
        pip_package_string = "\n".join(pip_packages)
        logger.info(f"Installed python packages:\n{pip_package_string}")

        # Start cluster
        if cluster_id:
            buildkite_group(":rocket: Using existing cluster")
            # Re-use existing cluster ID for development
            cluster_manager.cluster_id = cluster_id
            cluster_manager.cluster_name = get_cluster_name(cluster_id)
        else:
            buildkite_group(":gear: Building cluster environment")
            build_timeout = test["run"].get("build_timeout",
                                            DEFAULT_BUILD_TIMEOUT)

            if cluster_env_id:
                cluster_manager.cluster_env_id = cluster_env_id

            cluster_manager.build_configs(timeout=build_timeout)

            cluster_timeout = test["run"].get("session_timeout",
                                              DEFAULT_CLUSTER_TIMEOUT)

            # Without an explicit setting, autosuspend is derived from the
            # command timeout (in minutes, plus 10), capped at the default.
            autosuspend_mins = test["cluster"].get("autosuspend_mins", None)
            if autosuspend_mins:
                cluster_manager.autosuspend_minutes = autosuspend_mins
            else:
                cluster_manager.autosuspend_minutes = min(
                    DEFAULT_AUTOSUSPEND_MINS,
                    int(command_timeout / 60) + 10)

            buildkite_group(":rocket: Starting up cluster")
            cluster_manager.start_cluster(timeout=cluster_timeout)

        result.cluster_url = cluster_manager.get_cluster_url()

        # Upload files
        buildkite_group(":wrench: Preparing remote environment")
        command_runner.prepare_remote_env()

        wait_for_nodes = test["run"].get("wait_for_nodes", None)
        if wait_for_nodes:
            buildkite_group(":stopwatch: Waiting for nodes to come up")
            num_nodes = test["run"]["wait_for_nodes"]["num_nodes"]
            wait_timeout = test["run"]["wait_for_nodes"].get(
                "timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT)
            command_runner.wait_for_nodes(num_nodes, wait_timeout)

        # Optional prepare command; its timeout defaults to the timeout of
        # the main command.
        prepare_cmd = test["run"].get("prepare", None)
        if prepare_cmd:
            prepare_timeout = test["run"].get("prepare_timeout",
                                              command_timeout)
            try:
                command_runner.run_prepare_command(prepare_cmd,
                                                   timeout=prepare_timeout)
            except CommandError as e:
                raise PrepareCommandError(e)
            except CommandTimeout as e:
                raise PrepareCommandTimeout(e)

        buildkite_group(":runner: Running test script")
        command = test["run"]["script"]
        command_env = {}

        if smoke_test:
            command = f"{command} --smoke-test"
            command_env["IS_SMOKE_TEST"] = "1"

        is_long_running = test["run"].get("long_running", False)

        try:
            command_runner.run_command(command,
                                       env=command_env,
                                       timeout=command_timeout)
        except CommandError as e:
            raise TestCommandError(e)
        except CommandTimeout as e:
            if not is_long_running:
                # Only raise error if command is not long running
                raise TestCommandTimeout(e)

        buildkite_group(":floppy_disk: Fetching results")
        try:
            command_results = command_runner.fetch_results()
        except Exception as e:
            # Missing results are not fatal at this point; continue with an
            # empty result dict.
            logger.error("Could not fetch results for test command")
            logger.exception(e)
            command_results = {}

        # Postprocess result:
        if "last_update" in command_results:
            command_results["last_update_diff"] = time.time(
            ) - command_results.get("last_update", 0.0)
        if smoke_test:
            command_results["smoke_test"] = True

        result.results = command_results
        result.status = "finished"

    except Exception as e:
        logger.exception(e)
        buildkite_open_last()
        pipeline_exception = e

    # Fetching logs is best effort and happens whether or not the pipeline
    # succeeded.
    try:
        last_logs = command_runner.get_last_logs()
    except Exception as e:
        logger.error(f"Error fetching logs: {e}")
        last_logs = "No logs could be retrieved."

    result.last_logs = last_logs

    if not no_terminate:
        buildkite_group(":earth_africa: Terminating cluster")
        try:
            cluster_manager.terminate_cluster(wait=False)
        except Exception as e:
            logger.error(f"Could not terminate cluster: {e}")

    time_taken = time.monotonic() - start_time
    result.runtime = time_taken

    os.chdir(old_wd)

    if not pipeline_exception:
        buildkite_group(":mag: Interpreting results")
        # Only handle results if we didn't run into issues earlier
        try:
            handle_result(test, result)
        except Exception as e:
            pipeline_exception = e

    if pipeline_exception:
        buildkite_group(":rotating_light: Handling errors")
        exit_code, error_type, runtime = handle_exception(pipeline_exception)

        result.return_code = exit_code.value
        result.status = error_type
        # A runtime supplied by the exception handler overrides the measured
        # one.
        if runtime is not None:
            result.runtime = runtime

    buildkite_group(":memo: Reporting results", open=True)
    reporters = reporters or []
    for reporter in reporters:
        try:
            reporter.report_result(test, result)
        except Exception as e:
            logger.error(f"Error reporting results via {type(reporter)}: {e}")

    # Surface the captured failure only after everything has been reported.
    if pipeline_exception:
        raise pipeline_exception

    return result
Пример #26
0
    def run_command(
        self, command: str, env: Optional[Dict] = None, timeout: float = 3600.0
    ) -> float:
        """Submit ``command`` as a session command and block until it is done.

        The mapping returned by ``self.get_full_command_env(env)`` is rendered
        as space-separated ``KEY=VALUE`` pairs and prepended to the shell
        command. After submission the command is polled about once per second
        and a progress line is logged every 30 seconds.

        Args:
            command: Shell command to run on the cluster.
            env: Optional extra environment variables for the command.
            timeout: Maximum number of seconds to wait for the command.

        Returns:
            Seconds spent waiting, measured from right after submission.

        Raises:
            CommandTimeout: If the command is still running after ``timeout``
                seconds.
            CommandError: If the finished command reports a non-zero status.
        """
        full_env = self.get_full_command_env(env)

        env_prefix = ""
        if full_env:
            assignments = [f"{key}={value}" for key, value in full_env.items()]
            env_prefix = " ".join(assignments) + " "

        full_command = f"{env_prefix}{command}"

        cluster_name = self.cluster_manager.cluster_name
        logger.info(f"Running command in cluster {cluster_name}: {full_command}")

        cluster_link = format_link(self.cluster_manager.get_cluster_url())
        logger.info(f"Link to cluster: {cluster_link}")

        request = dict(
            session_id=self.cluster_manager.cluster_id, shell_command=full_command
        )
        response = self.sdk.create_session_command(request)

        scd_id = response.result.id
        self.last_command_scd_id = scd_id

        def _fetch_command():
            # Re-read the state of the submitted session command.
            return self.sdk.get_session_command(session_command_id=scd_id)

        finished = response.result.finished_at is not None

        started = time.monotonic()
        deadline = started + timeout
        next_report_at = started + 30

        while not finished:
            now = time.monotonic()
            if now >= deadline:
                raise CommandTimeout(
                    f"Cluster command timed out after {timeout} seconds."
                )

            if now >= next_report_at:
                elapsed = int(now - started)
                logger.info(f"... command still running ...({elapsed} seconds) ...")
                next_report_at += 30

            # Pause for a second between polls.
            time.sleep(1)

            response = exponential_backoff_retry(
                _fetch_command,
                retry_exceptions=Exception,
                initial_retry_delay_s=10,
                max_retries=3,
            )
            # Any truthy ``finished_at`` ends the wait.
            finished = response.result.finished_at

        status_code = response.result.status_code
        time_taken = time.monotonic() - started

        if status_code != 0:
            raise CommandError(f"Command returned non-success status: {status_code}")

        return time_taken
Пример #27
0
    def run_command(
        self, command: str, env: Optional[Dict] = None, timeout: float = 3600.0
    ) -> float:
        """Run ``command`` in a local shell subprocess and stream its output.

        The subprocess environment is built from ``os.environ``, the given
        ``env`` and a few fixed variables (``RAY_ADDRESS``, ``RAY_JOB_NAME``,
        ``PYTHONUNBUFFERED``), passed through ``self.get_full_command_env``.
        Combined stdout/stderr is echoed to ``sys.stdout`` line by line and the
        last ``LAST_LOGS_LENGTH`` lines are stored in ``self.last_logs``.
        A watchdog thread terminates the subprocess once ``timeout`` seconds
        have passed.

        Args:
            command: Shell command to execute.
            env: Optional extra environment variables for the command.
            timeout: Seconds after which the subprocess is terminated.

        Returns:
            Wall time in seconds (monotonic clock) the command took.

        Raises:
            CommandTimeout: If the subprocess was terminated by the watchdog
                or exited with return code 15 / -15.
            CommandError: If the subprocess exited with any other non-zero
                return code.
        """
        logger.info(
            f"Running command using Ray client on cluster "
            f"{self.cluster_manager.cluster_name}: {command}"
        )

        env = env or {}
        full_env = self.get_full_command_env(
            {
                **os.environ,
                **env,
                "RAY_ADDRESS": self.cluster_manager.get_cluster_address(),
                "RAY_JOB_NAME": "test_job",
                "PYTHONUNBUFFERED": "1",
            }
        )

        kill_event = threading.Event()

        def _kill_after(
            proc: subprocess.Popen,
            timeout: int = 30,
            kill_event: Optional[threading.Event] = None,
        ):
            # Watchdog: poll once per second and return quietly as soon as the
            # process has exited on its own.
            timeout_at = time.monotonic() + timeout
            while time.monotonic() < timeout_at:
                if proc.poll() is not None:
                    return
                time.sleep(1)
            logger.info(
                f"Client command timed out after {timeout} seconds, "
                f"killing subprocess."
            )
            # Set the flag before terminating so the caller can tell a
            # watchdog kill apart from a regular exit.
            if kill_event:
                kill_event.set()
            proc.terminate()

        start_time = time.monotonic()
        proc = subprocess.Popen(
            command,
            env=full_env,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            shell=True,
            text=True,
        )

        kill_thread = threading.Thread(
            target=_kill_after, args=(proc, timeout, kill_event)
        )
        kill_thread.start()

        proc.stdout.reconfigure(line_buffering=True)
        sys.stdout.reconfigure(line_buffering=True)
        logs = deque(maxlen=LAST_LOGS_LENGTH)
        try:
            for line in proc.stdout:
                logs.append(line)
                sys.stdout.write(line)
            proc.wait()
        finally:
            # Always restore the previous buffering mode, even if streaming
            # the output was interrupted by an exception.
            sys.stdout.reconfigure(line_buffering=False)
        time_taken = time.monotonic() - start_time
        # Lines read from the pipe keep their trailing newline, so they are
        # concatenated directly; joining with "\n" would double-space the logs.
        self.last_logs = "".join(logs)

        return_code = proc.poll()
        if return_code in (-15, 15) or kill_event.is_set():
            # Process has been terminated
            raise CommandTimeout(f"Cluster command timed out after {timeout} seconds.")
        if return_code != 0:
            raise CommandError(f"Command returned non-success status: {return_code}")

        logger.warning(f"WE GOT RETURN CODE {return_code} AFTER {time_taken}")

        return time_taken
Пример #28
0
    def report_result(self, test: Test, result: Result):
        """Insert one row describing ``result`` into the results database.

        The row is written through the ``rds-data`` boto3 client with a
        parameterized ``INSERT`` into ``self.database_table``. The statement
        is retried with exponential backoff when it times out.

        Args:
            test: Test configuration; ``name`` (or ``legacy.test_name`` /
                ``legacy.test_suite``), ``team`` and ``frequency`` are read.
            result: Result object whose fields are persisted.
        """
        logger.info("Persisting results to database...")

        result_dict = {
            "_runtime": result.runtime,
            # Keep session url for legacy support
            "_session_url": result.cluster_url,
            "_cluster_url": result.cluster_url,
            "_commit_url": result.wheels_url,
            "_stable": result.stable,
        }

        # Timezone-aware replacement for the deprecated ``utcnow()``; the
        # formatted timestamp below is unchanged.
        now = datetime.datetime.now(datetime.timezone.utc)
        rds_data_client = boto3.client("rds-data", region_name="us-west-2")

        if "legacy" in test:
            test_name = test["legacy"]["test_name"]
            test_suite = test["legacy"]["test_suite"]
        else:
            test_name = test["name"]
            test_suite = ""

        # Tolerate a missing team the same way a missing frequency is
        # tolerated further down.
        team = test.get("team") or ""

        # Branch name
        category = get_test_env_var("RAY_BRANCH", "")

        status = result.status or "invalid"
        last_logs = result.last_logs or ""

        # Metrics reported by the test are merged over the underscore-prefixed
        # defaults above.
        if result.results:
            result_dict.update(result.results)
        artifacts = {}

        parameters = [
            {
                "name": "created_on",
                "typeHint": "TIMESTAMP",
                "value": {"stringValue": now.strftime("%Y-%m-%d %H:%M:%S")},
            },
            {"name": "test_suite", "value": {"stringValue": test_suite}},
            {"name": "test_name", "value": {"stringValue": test_name}},
            {"name": "status", "value": {"stringValue": status}},
            {"name": "last_logs", "value": {"stringValue": last_logs}},
            {
                "name": "results",
                "typeHint": "JSON",
                "value": {"stringValue": json.dumps(result_dict)},
            },
            {
                "name": "artifacts",
                "typeHint": "JSON",
                "value": {"stringValue": json.dumps(artifacts)},
            },
            {"name": "category", "value": {"stringValue": category}},
            {"name": "team", "value": {"stringValue": team}},
            {"name": "session_url", "value": {"stringValue": result.cluster_url or ""}},
            {"name": "commit_url", "value": {"stringValue": result.wheels_url or ""}},
            {"name": "runtime", "value": {"doubleValue": result.runtime or -1.0}},
            {"name": "stable", "value": {"booleanValue": result.stable}},
            {"name": "frequency", "value": {"stringValue": test.get("frequency", "")}},
            {"name": "return_code", "value": {"longValue": result.return_code}},
        ]

        # Column list and the matching ``:name`` placeholders are derived from
        # the same parameter list so they always line up.
        column_str = ", ".join(param["name"] for param in parameters)
        value_str = ", ".join(f":{param['name']}" for param in parameters)

        sql = (
            f"INSERT INTO {self.database_table} "
            f"({column_str}) "
            f"VALUES ({value_str})"
        )

        logger.debug(f"SQL query: {sql}")

        # Default boto3 call timeout is 45 seconds.
        retry_delay_s = 64
        MAX_RDS_RETRY = 3
        exponential_backoff_retry(
            lambda: rds_data_client.execute_statement(
                database=self.database,
                parameters=parameters,
                secretArn=RELEASE_AWS_DB_SECRET_ARN,
                resourceArn=RELEASE_AWS_DB_RESOURCE_ARN,
                # NOTE(review): the table name is passed as ``schema`` here;
                # confirm this is intended.
                schema=self.database_table,
                sql=sql,
            ),
            retry_exceptions=rds_data_client.exceptions.StatementTimeoutException,
            initial_retry_delay_s=retry_delay_s,
            max_retries=MAX_RDS_RETRY,
        )
        logger.info("Result has been persisted to the database")
Пример #29
0
    def build_cluster_env(self, timeout: float = 600.0):
        """Ensure a successful build exists for the current cluster env.

        The most recently created build of ``self.cluster_env_id`` is looked
        up. If it already succeeded its id is stored in
        ``self.cluster_env_build_id``. If it failed, a new build is started
        with the same ``config_json``. Otherwise (or after starting a new
        build) the build is polled about once per second until it finishes.

        Args:
            timeout: Maximum number of seconds to wait for a running build.

        Raises:
            ClusterEnvBuildError: If no build exists, the awaited build fails,
                or it reports a status other than ``in_progress``,
                ``pending``, ``succeeded`` or ``failed``.
            ClusterEnvBuildTimeout: If the build has not finished after
                ``timeout`` seconds.
        """
        assert self.cluster_env_id
        assert self.cluster_env_build_id is None

        # Fetch build
        result = self.sdk.list_cluster_environment_builds(self.cluster_env_id)
        if not result or not result.results:
            raise ClusterEnvBuildError(f"No build found for cluster env: {result}")

        # Only the most recently created build is considered.
        build = sorted(result.results, key=lambda b: b.created_at)[-1]
        build_id = build.id
        last_status = build.status
        error_message = build.error_message
        config_json = build.config_json

        if last_status == "succeeded":
            logger.info(
                f"Link to succeeded cluster env build: "
                f"{format_link(anyscale_cluster_env_build_url(build_id))}"
            )
            self.cluster_env_build_id = build_id
            return

        if last_status == "failed":
            logger.info(f"Previous cluster env build failed: {error_message}")
            logger.info("Starting new cluster env build...")

            # Retry build
            result = self.sdk.create_cluster_environment_build(
                dict(
                    cluster_environment_id=self.cluster_env_id, config_json=config_json
                )
            )
            build_id = result.result.id

            logger.info(
                f"Link to created cluster env build: "
                f"{format_link(anyscale_cluster_env_build_url(build_id))}"
            )

        # Build found but not failed/finished yet.
        # A single monotonic clock drives both progress reports and the
        # timeout, so wall-clock adjustments cannot skew either of them.
        start_wait = time.monotonic()
        next_report = start_wait + REPORT_S
        timeout_at = start_wait + timeout
        logger.info(f"Waiting for build {build_id} to finish...")
        logger.info(
            f"Track progress here: "
            f"{format_link(anyscale_cluster_env_build_url(build_id))}"
        )
        # The loop is only left through ``return`` or ``raise``.
        while True:
            now = time.monotonic()
            if now > next_report:
                logger.info(
                    f"... still waiting for build {build_id} to finish "
                    f"({int(now - start_wait)} seconds) ..."
                )
                next_report = next_report + REPORT_S

            result = self.sdk.get_build(build_id)
            build = result.result

            if build.status == "failed":
                raise ClusterEnvBuildError(
                    f"Cluster env build failed. Please see "
                    f"{anyscale_cluster_env_build_url(build_id)} for details. "
                    f"Error message: {build.error_message}"
                )

            if build.status == "succeeded":
                logger.info("Build succeeded.")
                self.cluster_env_build_id = build_id
                return

            if build.status not in ["in_progress", "pending"]:
                raise ClusterEnvBuildError(
                    f"Unknown build status: {build.status}. Please see "
                    f"{anyscale_cluster_env_build_url(build_id)} for details"
                )

            if time.monotonic() > timeout_at:
                raise ClusterEnvBuildTimeout(
                    f"Time out when building cluster env {self.cluster_env_name}"
                )

            time.sleep(1)
Пример #30
0
def main(test_collection_file: Optional[str] = None):
    """Generate the Buildkite pipeline steps for the selected release tests.

    The test collection is read either from a fresh shallow clone of the
    configured Ray test repository or from a local file. Tests are filtered
    and grouped according to the pipeline settings, one step is created per
    test, and the resulting step list is printed to stdout as JSON. When
    ``BUILDKITE`` is present in the environment, the steps and the settings
    are additionally written to ``PIPELINE_ARTIFACT_PATH``.

    Args:
        test_collection_file: Path to the test collection file. Only used
            when no test repository is configured; defaults to
            ``release_tests.yaml`` two directories above this module.

    Raises:
        ReleaseTestCLIError: If the test repository cannot be cloned or the
            filters select no tests.
    """
    settings = get_pipeline_settings()

    repo = settings["ray_test_repo"]
    branch = settings["ray_test_branch"]
    tmpdir = None

    env = {}
    try:
        if repo:
            # If the Ray test repo is set, we clone that repo to fetch
            # the test configuration file. Otherwise we might be missing newly
            # added test.
            # mkdtemp() creates the directory atomically (mktemp() is
            # deprecated and racy); git accepts an existing empty directory
            # as clone destination.
            tmpdir = tempfile.mkdtemp()

            # Argument list instead of a shell string, so repo and branch
            # are never interpreted by a shell.
            clone_cmd = [
                "git", "clone", "--depth", "1", "--branch", branch, repo,
                tmpdir
            ]
            try:
                subprocess.check_output(clone_cmd)
            except Exception as e:
                raise ReleaseTestCLIError(
                    f"Could not clone test repository "
                    f"{repo} (branch {branch}): {e}") from e
            test_collection_file = os.path.join(tmpdir, "release",
                                                "release_tests.yaml")
            env = {
                "RAY_TEST_REPO": repo,
                "RAY_TEST_BRANCH": branch,
            }
        else:
            test_collection_file = test_collection_file or os.path.join(
                os.path.dirname(__file__), "..", "..", "release_tests.yaml")
        test_collection = read_and_validate_release_test_collection(
            test_collection_file)
    finally:
        # Remove the temporary checkout even if cloning or validation failed.
        if tmpdir:
            shutil.rmtree(tmpdir, ignore_errors=True)

    frequency = settings["frequency"]
    prefer_smoke_tests = settings["prefer_smoke_tests"]
    test_attr_regex_filters = settings["test_attr_regex_filters"]
    ray_wheels = settings["ray_wheels"]
    priority = settings["priority"]

    logger.info(
        f"Found the following buildkite pipeline settings:\n\n"
        f"  frequency =               {settings['frequency']}\n"
        f"  prefer_smoke_tests =      {settings['prefer_smoke_tests']}\n"
        f"  test_attr_regex_filters = {settings['test_attr_regex_filters']}\n"
        f"  ray_wheels =              {settings['ray_wheels']}\n"
        f"  ray_test_repo =           {settings['ray_test_repo']}\n"
        f"  ray_test_branch =         {settings['ray_test_branch']}\n"
        f"  priority =                {settings['priority']}\n"
        f"  no_concurrency_limit =    {settings['no_concurrency_limit']}\n")

    filtered_tests = filter_tests(
        test_collection,
        frequency=frequency,
        test_attr_regex_filters=test_attr_regex_filters,
        prefer_smoke_tests=prefer_smoke_tests,
    )
    logger.info(f"Found {len(filtered_tests)} tests to run.")
    if len(filtered_tests) == 0:
        raise ReleaseTestCLIError(
            "Empty test collection. The selected frequency or filter did "
            "not return any tests to run. Adjust your filters.")
    grouped_tests = group_tests(filtered_tests)

    # Human-readable overview of the selected tests, one section per group.
    group_parts = []
    for group, tests in grouped_tests.items():
        group_parts.append(f"\n{group}:\n")
        for test, smoke in tests:
            marker = " [smoke test]" if smoke else ""
            group_parts.append(f"  {test['name']}{marker}\n")
    group_str = "".join(group_parts)

    logger.info(f"Tests to run:\n{group_str}")

    # Wait for wheels here so we have them ready before we kick off
    # the other workers
    ray_wheels_url = find_and_wait_for_ray_wheels_url(
        ray_wheels, timeout=DEFAULT_WHEEL_WAIT_TIMEOUT)
    logger.info(f"Starting pipeline for Ray wheel: {ray_wheels_url}")

    no_concurrency_limit = settings["no_concurrency_limit"]
    if no_concurrency_limit:
        logger.warning("Concurrency is not limited for this run!")

    # Report if REPORT=1 or BUILDKITE_SOURCE=schedule
    report = (bool(int(os.environ.get("REPORT", "0")))
              or os.environ.get("BUILDKITE_SOURCE", "manual") == "schedule")

    steps = []
    for group in sorted(grouped_tests):
        tests = grouped_tests[group]
        group_steps = []
        for test, smoke_test in tests:
            # If the python version is defined, we need a different Ray wheels URL
            if "python" in test:
                python_version = parse_python_version(test["python"])
                this_ray_wheels_url = find_ray_wheels_url(
                    ray_wheels, python_version=python_version)
            else:
                this_ray_wheels_url = ray_wheels_url

            step = get_step(
                test,
                report=report,
                smoke_test=smoke_test,
                ray_wheels=this_ray_wheels_url,
                env=env,
                priority_val=priority.value,
            )

            if no_concurrency_limit:
                step.pop("concurrency", None)
                step.pop("concurrency_group", None)

            group_steps.append(step)

        group_step = {"group": group, "steps": group_steps}
        steps.append(group_step)

    if "BUILDKITE" in os.environ:
        # Start from an empty artifact directory.
        if os.path.exists(PIPELINE_ARTIFACT_PATH):
            shutil.rmtree(PIPELINE_ARTIFACT_PATH)

        os.makedirs(PIPELINE_ARTIFACT_PATH, exist_ok=True, mode=0o755)

        with open(os.path.join(PIPELINE_ARTIFACT_PATH, "pipeline.json"),
                  "wt") as fp:
            json.dump(steps, fp)

        # Replace these two settings with their ``.value`` before dumping
        # the settings as JSON.
        settings["frequency"] = settings["frequency"].value
        settings["priority"] = settings["priority"].value
        with open(os.path.join(PIPELINE_ARTIFACT_PATH, "settings.json"),
                  "wt") as fp:
            json.dump(settings, fp)

    steps_str = json.dumps(steps)
    print(steps_str)