示例#1
0
文件: utils.py 项目: tchordia/ray
def ray_job_submit(
    script_name: str,
    head_service: str,
    k8s_namespace: str = "default",
    ray_dashboard_port: int = 8265,
) -> str:
    """Submits a Python script via the Ray Job Submission API, using the Python SDK.
    Waits for successful completion of the job and returns the job logs as a string.

    Uses `kubectl port-forward` to access the Ray head's dashboard port.

    Scripts live in `tests/kuberay/scripts`. This directory is used as the working
    dir for the job.

    Args:
        script_name: The name of the script to submit.
        head_service: The name of the Ray head K8s service.
        k8s_namespace: K8s namespace the Ray cluster belongs to.
        ray_dashboard_port: The port on which the Ray head is running the Ray dashboard.
    """
    with _kubectl_port_forward(service=head_service,
                               namespace=k8s_namespace,
                               target_port=ray_dashboard_port) as local_port:
        # It takes a bit of time to establish the connection.
        # Try a few times to instantiate the JobSubmissionClient, as the client's
        # instantiation does not retry on connection errors.
        for trie in range(1, 7):
            time.sleep(5)
            try:
                client = JobSubmissionClient(f"http://127.0.0.1:{local_port}")
            except ConnectionError as e:
                if trie < 6:
                    logger.info(
                        "Job client connection failed. Retrying in 5 seconds.")
                else:
                    raise e from None
        job_id = client.submit_job(
            entrypoint=f"python {script_name}",
            runtime_env={
                "working_dir": SCRIPTS_DIR,
                # Throw in some extra data for fun, to validate runtime envs.
                "pip": ["pytest==6.0.0"],
                "env_vars": {
                    "key_foo": "value_bar"
                },
            },
        )
        # Wait for the job to complete successfully.
        # This logic is copied from the Job Submission docs.
        start = time.time()
        timeout = 60
        while time.time() - start <= timeout:
            status = client.get_job_status(job_id)
            print(f"status: {status}")
            if status in {
                    JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED
            }:
                break
            time.sleep(5)

        assert status == JobStatus.SUCCEEDED
        return client.get_job_logs(job_id)
示例#2
0
def _check_job_stopped(client: JobSubmissionClient, job_id: str) -> bool:
    status = client.get_job_status(job_id)
    return status == JobStatus.STOPPED
示例#3
0
def _check_job_succeeded(client: JobSubmissionClient, job_id: str) -> bool:
    status = client.get_job_status(job_id)
    if status == JobStatus.FAILED:
        logs = client.get_job_logs(job_id)
        raise RuntimeError(f"Job failed\nlogs:\n{logs}")
    return status == JobStatus.SUCCEEDED
示例#4
0
def _check_job_failed(client: JobSubmissionClient, job_id: str) -> bool:
    status = client.get_job_status(job_id)
    return status == JobStatus.FAILED