예제 #1
0
def test_list_jobs(job_sdk_client: JobSubmissionClient, use_sdk: bool):
    client = job_sdk_client

    runtime_env = {"env_vars": {"TEST": "123"}}
    metadata = {"foo": "bar"}
    entrypoint = "echo hello"
    job_id = client.submit_job(entrypoint=entrypoint,
                               runtime_env=runtime_env,
                               metadata=metadata)

    wait_for_condition(_check_job_succeeded, client=client, job_id=job_id)
    if use_sdk:
        info: JobInfo = client.list_jobs()[job_id]
    else:
        r = client._do_request(
            "GET",
            "/api/jobs/",
        )

        assert r.status_code == 200
        jobs_info_json = json.loads(r.text)
        info_json = jobs_info_json[job_id]
        info = JobInfo(**info_json)

    assert info.entrypoint == entrypoint
    assert info.status == JobStatus.SUCCEEDED
    assert info.message is not None
    assert info.end_time >= info.start_time
    assert info.runtime_env == runtime_env
    assert info.metadata == metadata
예제 #2
0
파일: sdk.py 프로젝트: krfricke/ray
    def get_job_info(
        self,
        job_id: str,
    ) -> JobInfo:
        """Get the latest status and other information associated with a job.

        Example:
            >>> from ray.job_submission import JobSubmissionClient
            >>> client = JobSubmissionClient("http://127.0.0.1:8265") # doctest: +SKIP
            >>> job_id = client.submit_job(entrypoint="sleep 1") # doctest: +SKIP
            >>> job_submission_client.get_job_info(job_id) # doctest: +SKIP
            JobInfo(status='SUCCEEDED', message='Job finished successfully.',
            error_type=None, start_time=1647388711, end_time=1647388712,
            metadata={}, runtime_env={})

        Args:
            job_id: The ID of the job whose information is being requested.

        Returns:
            The JobInfo for the job.

        Raises:
            RuntimeError: If the job does not exist or if the request to the
            job server fails.
        """
        r = self._do_request("GET", f"/api/jobs/{job_id}")

        if r.status_code == 200:
            return JobInfo(**r.json())
        else:
            self._raise_error(r)
예제 #3
0
파일: sdk.py 프로젝트: wuisawesome/ray
    def get_job_info(
        self,
        job_id: str,
    ) -> JobInfo:
        r = self._do_request("GET", f"/api/jobs/{job_id}")

        if r.status_code == 200:
            return JobInfo(**r.json())
        else:
            self._raise_error(r)
예제 #4
0
파일: sdk.py 프로젝트: wuisawesome/ray
    def list_jobs(self) -> Dict[str, JobInfo]:
        r = self._do_request("GET", "/api/jobs/")

        if r.status_code == 200:
            jobs_info_json = r.json()
            jobs_info = {
                job_id: JobInfo(**job_info_json)
                for job_id, job_info_json in jobs_info_json.items()
            }
            return jobs_info
        else:
            self._raise_error(r)
예제 #5
0
파일: sdk.py 프로젝트: krfricke/ray
    def list_jobs(self) -> Dict[str, JobInfo]:
        """List all jobs along with their status and other information.

        Lists all jobs that have ever run on the cluster, including jobs that are
        currently running and jobs that are no longer running.

        Example:
            >>> from ray.job_submission import JobSubmissionClient
            >>> client = JobSubmissionClient("http://127.0.0.1:8265") # doctest: +SKIP
            >>> client.submit_job(entrypoint="echo hello") # doctest: +SKIP
            >>> client.submit_job(entrypoint="sleep 2") # doctest: +SKIP
            >>> client.list_jobs() # doctest: +SKIP
            {'raysubmit_4LamXRuQpYdSMg7J': JobInfo(status='SUCCEEDED',
            message='Job finished successfully.', error_type=None,
            start_time=1647388711, end_time=1647388712, metadata={}, runtime_env={}),
            'raysubmit_1dxCeNvG1fCMVNHG': JobInfo(status='RUNNING',
            message='Job is currently running.', error_type=None,
            start_time=1647454832, end_time=None, metadata={}, runtime_env={})}

        Returns:
            A dictionary mapping job_ids to their information.

        Raises:
            RuntimeError: If the request to the job server fails.
        """
        r = self._do_request("GET", "/api/jobs/")

        if r.status_code == 200:
            jobs_info_json = r.json()
            jobs_info = {
                job_id: JobInfo(**job_info_json)
                for job_id, job_info_json in jobs_info_json.items()
            }
            return jobs_info
        else:
            self._raise_error(r)
예제 #6
0
파일: job_manager.py 프로젝트: krfricke/ray
    def submit_job(
        self,
        *,
        entrypoint: str,
        job_id: Optional[str] = None,
        runtime_env: Optional[Dict[str, Any]] = None,
        metadata: Optional[Dict[str, str]] = None,
        _start_signal_actor: Optional[ActorHandle] = None,
    ) -> str:
        """
        Job execution happens asynchronously.

        1) Generate a new unique id for this job submission, each call of this
            method assumes they're independent submission with its own new
            ID, job supervisor actor, and child process.
        2) Create new detached actor with same runtime_env as job spec

        Actual setting up runtime_env, subprocess group, driver command
        execution, subprocess cleaning up and running status update to GCS
        is all handled by job supervisor actor.

        Args:
            entrypoint: Driver command to execute in subprocess shell.
                Represents the entrypoint to start user application.
            runtime_env: Runtime environment used to execute driver command,
                which could contain its own ray.init() to configure runtime
                env at ray cluster, task and actor level.
            metadata: Support passing arbitrary data to driver command in
                case needed.
            _start_signal_actor: Used in testing only to capture state
                transitions between PENDING -> RUNNING. Regular user shouldn't
                need this.

        Returns:
            job_id: Generated uuid for further job management. Only valid
                within the same ray cluster.
        """
        if job_id is None:
            job_id = generate_job_id()
        elif self._job_info_client.get_status(job_id) is not None:
            raise RuntimeError(f"Job {job_id} already exists.")

        logger.info(f"Starting job with job_id: {job_id}")
        job_info = JobInfo(
            entrypoint=entrypoint,
            status=JobStatus.PENDING,
            start_time=int(time.time() * 1000),
            metadata=metadata,
            runtime_env=runtime_env,
        )
        self._job_info_client.put_info(job_id, job_info)

        # Wait for the actor to start up asynchronously so this call always
        # returns immediately and we can catch errors with the actor starting
        # up.
        try:
            supervisor = self._supervisor_actor_cls.options(
                lifetime="detached",
                name=self.JOB_ACTOR_NAME.format(job_id=job_id),
                num_cpus=0,
                # Currently we assume JobManager is created by dashboard server
                # running on headnode, same for job supervisor actors scheduled
                resources={
                    self._get_current_node_resource_key(): 0.001,
                },
                runtime_env=runtime_env,
            ).remote(job_id, entrypoint, metadata or {})
            supervisor.run.remote(_start_signal_actor=_start_signal_actor)

            # Monitor the job in the background so we can detect errors without
            # requiring a client to poll.
            create_task(self._monitor_job(job_id, job_supervisor=supervisor))
        except Exception as e:
            self._job_info_client.put_status(
                job_id,
                JobStatus.FAILED,
                message=f"Failed to start job supervisor: {e}.",
            )

        return job_id