def _run_with_retry(self, f, initial_retry_delay_s: int = 10):
    """Invoke ``f`` with exponential backoff on any exception.

    Args:
        f: Zero-argument callable to run.
        initial_retry_delay_s: Delay in seconds before the first retry.

    Returns:
        Whatever ``f`` returns on the first successful attempt.

    Raises:
        TypeError: If ``f`` is not callable.
    """
    # Validate explicitly instead of using ``assert``: asserts are
    # stripped when Python runs with -O, which would silently skip
    # this check.
    if not callable(f):
        raise TypeError(f"Expected a callable, got {type(f).__name__}")
    return exponential_backoff_retry(
        f,
        retry_exceptions=Exception,
        initial_retry_delay_s=initial_retry_delay_s,
        max_retries=3,
    )
def _get_job_status_with_retry(self, command_id):
    """Return the job status for ``command_id``, retrying transient errors.

    The pooled job id is looked up on every attempt, so updates to
    ``self.job_id_pool`` between retries are picked up.
    """
    job_client = self._get_job_client()

    def _fetch_status():
        # Re-read the pool on each attempt rather than binding the id once.
        return job_client.get_job_status(self.job_id_pool[command_id])

    return exponential_backoff_retry(
        _fetch_status,
        retry_exceptions=Exception,
        initial_retry_delay_s=1,
        max_retries=3,
    )
def run_command(
    self, command: str, env: Optional[Dict] = None, timeout: float = 3600.0
) -> float:
    """Run a shell command on the cluster and block until it finishes.

    Args:
        command: Shell command to execute.
        env: Extra environment variables, merged with the defaults from
            ``get_full_command_env``.
        timeout: Maximum seconds to wait for the command to complete.

    Returns:
        Wall-clock seconds from submission to completion (includes the
        1-second polling granularity).

    Raises:
        CommandTimeout: If the command is still running after ``timeout``.
        CommandError: If the command finishes with a non-zero status code.
    """
    # Interval (seconds) between "still running" progress log lines.
    status_interval_s = 30

    full_env = self.get_full_command_env(env)

    if full_env:
        env_str = " ".join(f"{k}={v}" for k, v in full_env.items()) + " "
    else:
        env_str = ""

    full_command = f"{env_str}{command}"
    logger.info(
        f"Running command in cluster {self.cluster_manager.cluster_name}: "
        f"{full_command}"
    )
    logger.info(
        f"Link to cluster: "
        f"{format_link(self.cluster_manager.get_cluster_url())}"
    )

    result = self.sdk.create_session_command(
        dict(session_id=self.cluster_manager.cluster_id, shell_command=full_command)
    )

    scd_id = result.result.id
    # Remember the command id so callers can fetch logs/status later.
    self.last_command_scd_id = scd_id

    completed = result.result.finished_at is not None

    start_time = time.monotonic()
    timeout_at = start_time + timeout
    next_status = start_time + status_interval_s

    while not completed:
        now = time.monotonic()
        if now >= timeout_at:
            raise CommandTimeout(
                f"Cluster command timed out after {timeout} seconds."
            )

        if now >= next_status:
            logger.info(
                f"... command still running ..."
                f"({int(now - start_time)} seconds) ..."
            )
            next_status += status_interval_s

        # Sleep 1 sec before next check.
        time.sleep(1)
        # The status poll itself can hit transient API errors; retry it.
        result = exponential_backoff_retry(
            lambda: self.sdk.get_session_command(session_command_id=scd_id),
            retry_exceptions=Exception,
            initial_retry_delay_s=10,
            max_retries=3,
        )
        # Fix: the original assigned the raw ``finished_at`` timestamp
        # here; normalize to a bool (as done above the loop) so
        # ``completed`` keeps a consistent type.
        completed = result.result.finished_at is not None

    status_code = result.result.status_code
    time_taken = time.monotonic() - start_time

    if status_code != 0:
        raise CommandError(f"Command returned non-success status: {status_code}")

    return time_taken
def report_result(self, test: Test, result: Result):
    """Persist a single test ``Result`` to the results RDS database.

    Builds a fully parameterized INSERT statement for
    ``self.database_table`` and executes it through the AWS RDS Data
    API, retrying on statement timeouts.

    Args:
        test: Test definition; supplies name/suite/team/frequency metadata.
        result: Outcome to persist (status, logs, runtime, URLs, ...).
    """
    logger.info("Persisting results to database...")

    # Underscore-prefixed keys are folded into the JSON "results" column
    # alongside the raw ``result.results`` payload.
    result_dict = {
        "_runtime": result.runtime,
        # Keep session url for legacy support
        "_session_url": result.cluster_url,
        "_cluster_url": result.cluster_url,
        "_commit_url": result.wheels_url,
        "_stable": result.stable,
    }

    # Aware UTC timestamp: formats below to the same wall-clock string as
    # the deprecated naive ``datetime.utcnow()`` did.
    now = datetime.datetime.now(datetime.timezone.utc)
    rds_data_client = boto3.client("rds-data", region_name="us-west-2")

    if "legacy" in test:
        # Legacy tests carry their reporting identity in a sub-dict.
        test_name = test["legacy"]["test_name"]
        test_suite = test["legacy"]["test_suite"]
    else:
        test_name = test["name"]
        test_suite = ""

    team = test["team"] or ""
    # Branch name
    category = get_test_env_var("RAY_BRANCH", "")

    status = result.status or "invalid"
    last_logs = result.last_logs or ""

    if result.results:
        result_dict.update(result.results)

    artifacts = {}

    parameters = [
        {
            "name": "created_on",
            "typeHint": "TIMESTAMP",
            "value": {"stringValue": now.strftime("%Y-%m-%d %H:%M:%S")},
        },
        {"name": "test_suite", "value": {"stringValue": test_suite}},
        {"name": "test_name", "value": {"stringValue": test_name}},
        {"name": "status", "value": {"stringValue": status}},
        {"name": "last_logs", "value": {"stringValue": last_logs}},
        {
            "name": "results",
            "typeHint": "JSON",
            "value": {"stringValue": json.dumps(result_dict)},
        },
        {
            "name": "artifacts",
            "typeHint": "JSON",
            "value": {"stringValue": json.dumps(artifacts)},
        },
        {"name": "category", "value": {"stringValue": category}},
        {"name": "team", "value": {"stringValue": team}},
        {"name": "session_url", "value": {"stringValue": result.cluster_url or ""}},
        {"name": "commit_url", "value": {"stringValue": result.wheels_url or ""}},
        {"name": "runtime", "value": {"doubleValue": result.runtime or -1.0}},
        {"name": "stable", "value": {"booleanValue": result.stable}},
        {"name": "frequency", "value": {"stringValue": test.get("frequency", "")}},
        {"name": "return_code", "value": {"longValue": result.return_code}},
    ]

    columns = [param["name"] for param in parameters]
    values = [f":{param['name']}" for param in parameters]

    # ``", ".join`` never produces leading/trailing separators, so the
    # original ``.strip(", ")`` calls were dead no-ops and are removed.
    column_str = ", ".join(columns)
    value_str = ", ".join(values)

    sql = (
        f"INSERT INTO {self.database_table} "
        f"({column_str}) "
        f"VALUES ({value_str})"
    )

    logger.debug(f"SQL query: {sql}")

    # Default boto3 call timeout is 45 seconds.
    retry_delay_s = 64
    MAX_RDS_RETRY = 3
    exponential_backoff_retry(
        lambda: rds_data_client.execute_statement(
            database=self.database,
            parameters=parameters,
            secretArn=RELEASE_AWS_DB_SECRET_ARN,
            resourceArn=RELEASE_AWS_DB_RESOURCE_ARN,
            schema=self.database_table,
            sql=sql,
        ),
        retry_exceptions=rds_data_client.exceptions.StatementTimeoutException,
        initial_retry_delay_s=retry_delay_s,
        max_retries=MAX_RDS_RETRY,
    )

    logger.info("Result has been persisted to the database")
def start_cluster(self, timeout: float = 600.0) -> None:
    """Create and start the Anyscale cluster, waiting until it is Running.

    Creates the cluster record, triggers the start operation, then polls
    the operation once per second until it completes or ``timeout``
    elapses. Stores the new cluster id in ``self.cluster_id``.

    Args:
        timeout: Maximum seconds to wait for the start operation.

    Raises:
        ClusterCreationError: If the create-cluster API call fails.
        ClusterStartupError: If the start-cluster API call fails.
        ClusterStartupTimeout: If startup does not complete in ``timeout``.
        ClusterStartupFailed: If the cluster ends in a non-"Running" state.
    """
    logger.info(f"Creating cluster {self.cluster_name}")
    logger.info(f"Autosuspend time: {self.autosuspend_minutes} minutes")
    try:
        result = self.sdk.create_cluster(
            dict(
                name=self.cluster_name,
                project_id=self.project_id,
                cluster_environment_build_id=self.cluster_env_build_id,
                cluster_compute_id=self.cluster_compute_id,
                idle_timeout_minutes=self.autosuspend_minutes,
            ))
        self.cluster_id = result.result.id
    except Exception as e:
        raise ClusterCreationError(f"Error creating cluster: {e}") from e

    # Trigger session start
    logger.info(
        f"Starting cluster {self.cluster_name} ({self.cluster_id})")
    cluster_url = anyscale_cluster_url(project_id=self.project_id,
                                       session_id=self.cluster_id)
    logger.info(f"Link to cluster: {format_link(cluster_url)}")

    try:
        result = self.sdk.start_cluster(self.cluster_id, start_cluster_options={})
        # ``cop_id`` identifies the async cluster operation we poll below;
        # the operation may already be completed when the call returns.
        cop_id = result.result.id
        completed = result.result.completed
    except Exception as e:
        raise ClusterStartupError(
            f"Error starting cluster with name "
            f"{self.cluster_name} and {self.cluster_id} ({cluster_url}): "
            f"{e}") from e

    # Wait for session
    logger.info(f"Waiting for cluster {self.cluster_name}...")
    start_time = time.monotonic()
    timeout_at = start_time + timeout
    # Log a progress line every 30 seconds while waiting.
    next_status = start_time + 30
    while not completed:
        now = time.monotonic()
        if now >= timeout_at:
            raise ClusterStartupTimeout(
                f"Time out when creating cluster {self.cluster_name}")

        if now >= next_status:
            logger.info(
                f"... still waiting for cluster {self.cluster_name} "
                f"({int(now - start_time)} seconds) ...")
            next_status += 30

        # Sleep 1 sec before next check.
        time.sleep(1)
        # The poll itself can hit transient API errors; retry it.
        result = exponential_backoff_retry(
            lambda: self.sdk.get_cluster_operation(cop_id, _request_timeout=30),
            retry_exceptions=Exception,
            initial_retry_delay_s=2,
            max_retries=3,
        )
        completed = result.result.completed

    # The operation finishing does not guarantee success; verify the
    # cluster actually reached the "Running" state.
    result = self.sdk.get_cluster(self.cluster_id)
    if result.result.state != "Running":
        raise ClusterStartupFailed(
            f"Cluster did not come up - most likely the nodes are currently "
            f"not available. Please check the cluster startup logs: "
            f"{cluster_url} (cluster state: {result.result.state})")