def build_configs(self, timeout: float = 30.0):
    """Create the cluster compute and cluster env, then build the env.

    The three steps run strictly in order. Every step follows the same
    error policy:

    * ``AssertionError`` is logged as a warning and swallowed (the steps
      use it to signal that the resource already exists).
    * The step's own error type(s) are re-raised unchanged.
    * Any other exception is wrapped in the step's error type, with the
      original exception attached as the cause.

    Args:
        timeout: Passed to ``build_cluster_env`` as its ``timeout``.

    Raises:
        ClusterComputeCreateError: The cluster compute step failed.
        ClusterEnvCreateError: The cluster env creation step failed.
        ClusterEnvBuildError: The cluster env build step failed.
        ClusterEnvBuildTimeout: Propagated unchanged from the build step.
    """
    # Step 1: cluster compute.
    try:
        self.create_cluster_compute()
    except AssertionError as already_exists:
        # If already exists, ignore
        logger.warning(str(already_exists))
    except Exception as exc:
        if isinstance(exc, ClusterComputeCreateError):
            raise exc
        raise ClusterComputeCreateError(
            f"Unexpected cluster compute build error: {exc}"
        ) from exc

    # Step 2: cluster env.
    try:
        self.create_cluster_env()
    except AssertionError as already_exists:
        # If already exists, ignore
        logger.warning(str(already_exists))
    except Exception as exc:
        if isinstance(exc, ClusterEnvCreateError):
            raise exc
        raise ClusterEnvCreateError(
            f"Unexpected cluster env create error: {exc}"
        ) from exc

    # Step 3: cluster env build.
    try:
        self.build_cluster_env(timeout=timeout)
    except AssertionError as already_exists:
        # If already exists, ignore
        logger.warning(str(already_exists))
    except Exception as exc:
        if isinstance(exc, (ClusterEnvBuildError, ClusterEnvBuildTimeout)):
            raise exc
        raise ClusterEnvBuildError(
            f"Unexpected cluster env build error: {exc}"
        ) from exc
def build_cluster_env(self, timeout: float = 600.0):
    """Wait for (or retry) the cluster env build and record its build ID.

    Picks the build of ``self.cluster_env_id`` with the latest
    ``created_at``. If that build already succeeded, its ID is stored in
    ``self.cluster_env_build_id`` and the method returns. If it failed, a
    new build is started with the same ``config_json``. The method then
    polls the build once per second until it succeeds, fails, reports an
    unexpected status, or the timeout elapses.

    Args:
        timeout: Maximum number of seconds to wait for the build while
            polling. The check runs after each poll, so the build status
            is always fetched at least once.

    Raises:
        AssertionError: ``self.cluster_env_id`` is not set, or
            ``self.cluster_env_build_id`` is already set.
        ClusterEnvBuildError: No build exists for the cluster env, the
            build failed, or the build reported an unknown status.
        ClusterEnvBuildTimeout: The build did not finish within
            ``timeout`` seconds.
    """
    # These preconditions must keep raising AssertionError: build_configs()
    # catches that type and logs str(e), so each carries a message. They are
    # raised explicitly so the checks are not stripped under ``python -O``.
    if not self.cluster_env_id:
        raise AssertionError(
            "Cluster env ID is not set; cannot look up cluster env builds."
        )
    if self.cluster_env_build_id is not None:
        raise AssertionError(
            f"Cluster env build already set: {self.cluster_env_build_id}"
        )

    # Fetch build
    result = self.sdk.list_cluster_environment_builds(self.cluster_env_id)
    if not result or not result.results:
        raise ClusterEnvBuildError(f"No build found for cluster env: {result}")

    # Latest build by creation time.
    build = sorted(result.results, key=lambda b: b.created_at)[-1]
    build_id = build.id

    if build.status == "succeeded":
        logger.info(
            f"Link to succeeded cluster env build: "
            f"{format_link(anyscale_cluster_env_build_url(build_id))}"
        )
        self.cluster_env_build_id = build_id
        return

    if build.status == "failed":
        logger.info(f"Previous cluster env build failed: {build.error_message}")
        logger.info("Starting new cluster env build...")

        # Retry build with the configuration of the failed one.
        result = self.sdk.create_cluster_environment_build(
            dict(
                cluster_environment_id=self.cluster_env_id,
                config_json=build.config_json,
            )
        )
        build_id = result.result.id

        logger.info(
            f"Link to created cluster env build: "
            f"{format_link(anyscale_cluster_env_build_url(build_id))}"
        )

    # Build found but not failed/finished yet.
    # One monotonic clock drives both the progress reports and the timeout,
    # so wall-clock adjustments cannot skew either of them.
    start_wait = time.monotonic()
    next_report = start_wait + REPORT_S
    timeout_at = start_wait + timeout

    logger.info(f"Waiting for build {build_id} to finish...")
    logger.info(
        f"Track progress here: "
        f"{format_link(anyscale_cluster_env_build_url(build_id))}"
    )

    # Every exit from this loop is a return or a raise.
    while True:
        now = time.monotonic()
        if now > next_report:
            logger.info(
                f"... still waiting for build {build_id} to finish "
                f"({int(now - start_wait)} seconds) ..."
            )
            next_report += REPORT_S

        build = self.sdk.get_build(build_id).result

        if build.status == "failed":
            raise ClusterEnvBuildError(
                f"Cluster env build failed. Please see "
                f"{anyscale_cluster_env_build_url(build_id)} for details. "
                f"Error message: {build.error_message}"
            )

        if build.status == "succeeded":
            logger.info("Build succeeded.")
            self.cluster_env_build_id = build_id
            return

        if build.status not in ("in_progress", "pending"):
            raise ClusterEnvBuildError(
                f"Unknown build status: {build.status}. Please see "
                f"{anyscale_cluster_env_build_url(build_id)} for details"
            )

        if time.monotonic() > timeout_at:
            raise ClusterEnvBuildTimeout(
                f"Time out when building cluster env {self.cluster_env_name}"
            )

        time.sleep(1)