Example #1
 def _run_with_retry(self, f, initial_retry_delay_s: int = 10):
     """Run callable ``f``, retrying up to 3 times with exponential backoff.

     Any ``Exception`` raised by ``f`` triggers a retry; the first retry
     waits ``initial_retry_delay_s`` seconds.
     """
     assert callable(f)
     retry_kwargs = dict(
         retry_exceptions=Exception,
         initial_retry_delay_s=initial_retry_delay_s,
         max_retries=3,
     )
     return exponential_backoff_retry(f, **retry_kwargs)
Example #2
 def _get_job_status_with_retry(self, command_id):
     """Look up the status of the job backing ``command_id``, with retries.

     The status query (including the job-id lookup) is wrapped in
     exponential backoff because the job client may fail transiently.
     """
     job_client = self._get_job_client()

     def _query_status():
         # Keep the pool lookup inside the retried callable so a transient
         # failure re-evaluates it, exactly as the retry wrapper expects.
         return job_client.get_job_status(self.job_id_pool[command_id])

     return exponential_backoff_retry(
         _query_status,
         retry_exceptions=Exception,
         initial_retry_delay_s=1,
         max_retries=3,
     )
Example #3
    def run_command(
        self, command: str, env: Optional[Dict] = None, timeout: float = 3600.0
    ) -> float:
        """Run a shell command on the cluster and block until it finishes.

        Args:
            command: Shell command to execute.
            env: Extra environment variables; merged via
                ``get_full_command_env`` and prepended as ``K=V`` pairs.
            timeout: Maximum seconds to wait for the command to complete.

        Returns:
            Wall-clock duration of the command in seconds.

        Raises:
            CommandTimeout: If the command does not finish within ``timeout``.
            CommandError: If the command exits with a non-zero status code.
        """
        full_env = self.get_full_command_env(env)

        if full_env:
            env_str = " ".join(f"{k}={v}" for k, v in full_env.items()) + " "
        else:
            env_str = ""

        full_command = f"{env_str}{command}"
        logger.info(
            f"Running command in cluster {self.cluster_manager.cluster_name}: "
            f"{full_command}"
        )

        logger.info(
            f"Link to cluster: "
            f"{format_link(self.cluster_manager.get_cluster_url())}"
        )

        result = self.sdk.create_session_command(
            dict(session_id=self.cluster_manager.cluster_id, shell_command=full_command)
        )

        scd_id = result.result.id
        self.last_command_scd_id = scd_id

        completed = result.result.finished_at is not None

        start_time = time.monotonic()
        timeout_at = start_time + timeout
        next_status = start_time + 30

        while not completed:
            now = time.monotonic()
            if now >= timeout_at:
                raise CommandTimeout(
                    f"Cluster command timed out after {timeout} seconds."
                )

            if now >= next_status:
                logger.info(
                    f"... command still running ..."
                    f"({int(now - start_time)} seconds) ..."
                )
                next_status += 30

            # Sleep 1 sec before next check.
            time.sleep(1)

            # Only the status *poll* is retried with backoff; the command
            # itself is never re-submitted.
            result = exponential_backoff_retry(
                lambda: self.sdk.get_session_command(session_command_id=scd_id),
                retry_exceptions=Exception,
                initial_retry_delay_s=10,
                max_retries=3,
            )
            # Fix: compare against None explicitly (previously assigned the
            # raw timestamp), matching the initial computation above.
            completed = result.result.finished_at is not None

        status_code = result.result.status_code
        time_taken = time.monotonic() - start_time

        if status_code != 0:
            raise CommandError(f"Command returned non-success status: {status_code}")

        return time_taken
Example #4
    def report_result(self, test: Test, result: Result):
        """Persist a single test ``result`` as a row in the RDS results DB.

        Builds the column/value parameter list from the test metadata and
        the result object, then executes a parameterized INSERT via the
        RDS Data API, retrying on statement timeouts.

        Args:
            test: Test definition (provides name, suite, team, frequency).
            result: Outcome of the test run (status, logs, runtime, ...).
        """
        logger.info("Persisting results to database...")

        result_dict = {
            "_runtime": result.runtime,
            # Keep session url for legacy support
            "_session_url": result.cluster_url,
            "_cluster_url": result.cluster_url,
            "_commit_url": result.wheels_url,
            "_stable": result.stable,
        }

        # Timezone-aware UTC timestamp; ``utcnow()`` is deprecated and naive.
        # The strftime output below is byte-identical to the old behavior.
        now = datetime.datetime.now(datetime.timezone.utc)
        rds_data_client = boto3.client("rds-data", region_name="us-west-2")

        # Legacy tests carry their reporting identity under a "legacy" key.
        if "legacy" in test:
            test_name = test["legacy"]["test_name"]
            test_suite = test["legacy"]["test_suite"]
        else:
            test_name = test["name"]
            test_suite = ""

        team = test["team"] or ""

        # Branch name
        category = get_test_env_var("RAY_BRANCH", "")

        status = result.status or "invalid"
        last_logs = result.last_logs or ""

        if result.results:
            result_dict.update(result.results)
        artifacts = {}

        parameters = [
            {
                "name": "created_on",
                "typeHint": "TIMESTAMP",
                "value": {"stringValue": now.strftime("%Y-%m-%d %H:%M:%S")},
            },
            {"name": "test_suite", "value": {"stringValue": test_suite}},
            {"name": "test_name", "value": {"stringValue": test_name}},
            {"name": "status", "value": {"stringValue": status}},
            {"name": "last_logs", "value": {"stringValue": last_logs}},
            {
                "name": "results",
                "typeHint": "JSON",
                "value": {"stringValue": json.dumps(result_dict)},
            },
            {
                "name": "artifacts",
                "typeHint": "JSON",
                "value": {"stringValue": json.dumps(artifacts)},
            },
            {"name": "category", "value": {"stringValue": category}},
            {"name": "team", "value": {"stringValue": team}},
            {"name": "session_url", "value": {"stringValue": result.cluster_url or ""}},
            {"name": "commit_url", "value": {"stringValue": result.wheels_url or ""}},
            {"name": "runtime", "value": {"doubleValue": result.runtime or -1.0}},
            {"name": "stable", "value": {"booleanValue": result.stable}},
            {"name": "frequency", "value": {"stringValue": test.get("frequency", "")}},
            {"name": "return_code", "value": {"longValue": result.return_code}},
        ]

        # Only identifiers come from config here; all values are bound as
        # named parameters (":name") through the Data API.
        columns = [param["name"] for param in parameters]
        values = [f":{param['name']}" for param in parameters]
        # ``", ".join`` never produces leading/trailing separators, so the
        # previous ``.strip(", ")`` calls were redundant and are removed.
        column_str = ", ".join(columns)
        value_str = ", ".join(values)

        sql = (
            f"INSERT INTO {self.database_table} "
            f"({column_str}) "
            f"VALUES ({value_str})"
        )

        logger.debug(f"SQL query: {sql}")

        # Default boto3 call timeout is 45 seconds.
        retry_delay_s = 64
        MAX_RDS_RETRY = 3
        exponential_backoff_retry(
            lambda: rds_data_client.execute_statement(
                database=self.database,
                parameters=parameters,
                secretArn=RELEASE_AWS_DB_SECRET_ARN,
                resourceArn=RELEASE_AWS_DB_RESOURCE_ARN,
                schema=self.database_table,
                sql=sql,
            ),
            retry_exceptions=rds_data_client.exceptions.StatementTimeoutException,
            initial_retry_delay_s=retry_delay_s,
            max_retries=MAX_RDS_RETRY,
        )
        logger.info("Result has been persisted to the database")
Example #5
File: full.py — Project: vishalbelsare/ray
    def start_cluster(self, timeout: float = 600.0):
        """Create and start the Anyscale cluster, blocking until it is up.

        Creates a cluster from the configured project / cluster environment
        build / compute config, triggers startup, then polls the start
        operation until it completes or ``timeout`` seconds elapse.

        Args:
            timeout: Maximum seconds to wait for the cluster to come up.

        Raises:
            ClusterCreationError: If the create API call fails.
            ClusterStartupError: If the start API call fails.
            ClusterStartupTimeout: If startup exceeds ``timeout``.
            ClusterStartupFailed: If the cluster ends up in a state other
                than "Running".
        """
        logger.info(f"Creating cluster {self.cluster_name}")
        logger.info(f"Autosuspend time: {self.autosuspend_minutes} minutes")
        try:
            result = self.sdk.create_cluster(
                dict(
                    name=self.cluster_name,
                    project_id=self.project_id,
                    cluster_environment_build_id=self.cluster_env_build_id,
                    cluster_compute_id=self.cluster_compute_id,
                    idle_timeout_minutes=self.autosuspend_minutes,
                ))
            self.cluster_id = result.result.id
        except Exception as e:
            raise ClusterCreationError(f"Error creating cluster: {e}") from e

        # Trigger session start
        logger.info(
            f"Starting cluster {self.cluster_name} ({self.cluster_id})")
        cluster_url = anyscale_cluster_url(project_id=self.project_id,
                                           session_id=self.cluster_id)
        logger.info(f"Link to cluster: {format_link(cluster_url)}")

        try:
            # cop_id identifies the start *operation*, which is what the
            # polling loop below tracks (not the cluster itself).
            result = self.sdk.start_cluster(self.cluster_id,
                                            start_cluster_options={})
            cop_id = result.result.id
            completed = result.result.completed
        except Exception as e:
            raise ClusterStartupError(
                f"Error starting cluster with name "
                f"{self.cluster_name} and {self.cluster_id} ({cluster_url}): "
                f"{e}") from e

        # Wait for session
        logger.info(f"Waiting for cluster {self.cluster_name}...")

        start_time = time.monotonic()
        timeout_at = start_time + timeout
        next_status = start_time + 30
        while not completed:
            now = time.monotonic()
            if now >= timeout_at:
                raise ClusterStartupTimeout(
                    f"Time out when creating cluster {self.cluster_name}")

            # Log progress at most every 30 seconds.
            if now >= next_status:
                logger.info(
                    f"... still waiting for cluster {self.cluster_name} "
                    f"({int(now - start_time)} seconds) ...")
                next_status += 30

            # Sleep 1 sec before next check.
            time.sleep(1)

            # Only the status poll is retried with backoff; the cluster is
            # never re-created or re-started here.
            result = exponential_backoff_retry(
                lambda: self.sdk.get_cluster_operation(cop_id,
                                                       _request_timeout=30),
                retry_exceptions=Exception,
                initial_retry_delay_s=2,
                max_retries=3,
            )
            completed = result.result.completed

        # The operation finishing does not guarantee success: verify the
        # cluster actually reached the "Running" state.
        result = self.sdk.get_cluster(self.cluster_id)
        if result.result.state != "Running":
            raise ClusterStartupFailed(
                f"Cluster did not come up - most likely the nodes are currently "
                f"not available. Please check the cluster startup logs: "
                f"{cluster_url} (cluster state: {result.result.state})")