def test_list_jobs(job_sdk_client: JobSubmissionClient, use_sdk: bool):
    """Check that a finished job shows up correctly in the job listing.

    Submits a short ``echo`` job, waits until ``_check_job_succeeded`` holds,
    then fetches the listing either via ``client.list_jobs()`` (``use_sdk``
    true) or via a raw ``GET /api/jobs/`` request, and verifies the fields
    recorded for the submitted job.
    """
    client = job_sdk_client
    entrypoint = "echo hello"
    metadata = {"foo": "bar"}
    runtime_env = {"env_vars": {"TEST": "123"}}

    job_id = client.submit_job(
        entrypoint=entrypoint,
        runtime_env=runtime_env,
        metadata=metadata,
    )
    wait_for_condition(_check_job_succeeded, client=client, job_id=job_id)

    if not use_sdk:
        # Exercise the HTTP endpoint directly and rebuild JobInfo from JSON.
        response = client._do_request(
            "GET",
            "/api/jobs/",
        )
        assert response.status_code == 200
        listing = json.loads(response.text)
        info = JobInfo(**listing[job_id])
    else:
        info: JobInfo = client.list_jobs()[job_id]

    assert info.entrypoint == entrypoint
    assert info.status == JobStatus.SUCCEEDED
    assert info.message is not None
    assert info.end_time >= info.start_time
    assert info.runtime_env == runtime_env
    assert info.metadata == metadata
def get_job_info(
    self,
    job_id: str,
) -> JobInfo:
    """Fetch the latest status and other information recorded for a job.

    Example:
        >>> from ray.job_submission import JobSubmissionClient
        >>> client = JobSubmissionClient("http://127.0.0.1:8265") # doctest: +SKIP
        >>> job_id = client.submit_job(entrypoint="sleep 1") # doctest: +SKIP
        >>> client.get_job_info(job_id) # doctest: +SKIP
        JobInfo(status='SUCCEEDED', message='Job finished successfully.',
        error_type=None, start_time=1647388711, end_time=1647388712,
        metadata={}, runtime_env={})

    Args:
        job_id: The ID of the job whose information is being requested.

    Returns:
        The JobInfo built from the JSON body of the server's response.

    Raises:
        RuntimeError: If the job does not exist or if the request to the
            job server fails.
    """
    response = self._do_request("GET", f"/api/jobs/{job_id}")
    if response.status_code != 200:
        # Any non-200 answer is handed to the shared error reporter.
        self._raise_error(response)
    else:
        return JobInfo(**response.json())
def get_job_info(
    self,
    job_id: str,
) -> JobInfo:
    """Request the information stored for a single job.

    Sends ``GET /api/jobs/{job_id}`` through ``self._do_request``.

    Args:
        job_id: The ID of the job to look up.

    Returns:
        A JobInfo constructed from the JSON payload of a 200 response.

    Raises:
        Whatever ``self._raise_error`` raises for a non-200 response
        (presumably a RuntimeError; confirm against ``_raise_error``).
    """
    endpoint = f"/api/jobs/{job_id}"
    reply = self._do_request("GET", endpoint)
    succeeded = reply.status_code == 200
    if not succeeded:
        self._raise_error(reply)
    else:
        payload = reply.json()
        return JobInfo(**payload)
def list_jobs(self) -> Dict[str, JobInfo]:
    """Return the information for every job reported by the job server.

    Sends ``GET /api/jobs/`` through ``self._do_request`` and converts each
    entry of the JSON response into a JobInfo.

    Returns:
        A dictionary mapping each job ID to its JobInfo.

    Raises:
        Whatever ``self._raise_error`` raises for a non-200 response
        (presumably a RuntimeError; confirm against ``_raise_error``).
    """
    reply = self._do_request("GET", "/api/jobs/")
    if reply.status_code != 200:
        self._raise_error(reply)
    else:
        listing = {}
        for job_id, fields in reply.json().items():
            listing[job_id] = JobInfo(**fields)
        return listing
def list_jobs(self) -> Dict[str, JobInfo]:
    """List all jobs along with their status and other information.

    Covers every job that has ever run on the cluster: jobs that are
    currently running as well as jobs that are no longer running.

    Example:
        >>> from ray.job_submission import JobSubmissionClient
        >>> client = JobSubmissionClient("http://127.0.0.1:8265") # doctest: +SKIP
        >>> client.submit_job(entrypoint="echo hello") # doctest: +SKIP
        >>> client.submit_job(entrypoint="sleep 2") # doctest: +SKIP
        >>> client.list_jobs() # doctest: +SKIP
        {'raysubmit_4LamXRuQpYdSMg7J': JobInfo(status='SUCCEEDED',
        message='Job finished successfully.', error_type=None,
        start_time=1647388711, end_time=1647388712, metadata={},
        runtime_env={}), 'raysubmit_1dxCeNvG1fCMVNHG':
        JobInfo(status='RUNNING', message='Job is currently running.',
        error_type=None, start_time=1647454832, end_time=None,
        metadata={}, runtime_env={})}

    Returns:
        A dictionary mapping job_ids to their information.

    Raises:
        RuntimeError: If the request to the job server fails.
    """
    response = self._do_request("GET", "/api/jobs/")
    if response.status_code != 200:
        # Non-200 answers are delegated to the shared error reporter.
        self._raise_error(response)
    else:
        raw_jobs = response.json()
        return {
            job_id: JobInfo(**raw_info)
            for job_id, raw_info in raw_jobs.items()
        }
def submit_job(
    self,
    *,
    entrypoint: str,
    job_id: Optional[str] = None,
    runtime_env: Optional[Dict[str, Any]] = None,
    metadata: Optional[Dict[str, str]] = None,
    _start_signal_actor: Optional[ActorHandle] = None,
) -> str:
    """Register a job and start its supervisor actor; execution is asynchronous.

    Steps performed:

    1) Settle the job ID: use the caller-supplied one (rejecting it if a
       status is already recorded for it) or generate a fresh one.
    2) Record a PENDING JobInfo for the job.
    3) Create a detached supervisor actor with the same runtime_env as the
       job and kick off its ``run`` method, then start a background task
       that monitors the job.

    Setting up the runtime_env, the subprocess group, driver command
    execution, subprocess cleanup and status updates are left to the
    job supervisor actor.

    If creating or starting the supervisor raises, the exception is not
    propagated: the job is marked FAILED with an explanatory message and the
    job ID is still returned.

    Args:
        entrypoint: Driver command to execute in subprocess shell.
            Represents the entrypoint to start user application.
        job_id: Optional ID to use for this submission. When omitted, one
            is produced by ``generate_job_id()``.
        runtime_env: Runtime environment used to execute driver command,
            which could contain its own ray.init() to configure runtime
            env at ray cluster, task and actor level.
        metadata: Support passing arbitrary data to driver command in
            case needed.
        _start_signal_actor: Used in testing only to capture state
            transitions between PENDING -> RUNNING. Regular user shouldn't
            need this.

    Returns:
        job_id: The supplied or generated ID, for further job management.
            Only valid within the same ray cluster.

    Raises:
        RuntimeError: If ``job_id`` was supplied and a status is already
            recorded for it.
    """
    if job_id is not None:
        if self._job_info_client.get_status(job_id) is not None:
            raise RuntimeError(f"Job {job_id} already exists.")
    else:
        job_id = generate_job_id()

    logger.info(f"Starting job with job_id: {job_id}")
    # start_time is stored as integer milliseconds since the epoch.
    self._job_info_client.put_info(
        job_id,
        JobInfo(
            entrypoint=entrypoint,
            status=JobStatus.PENDING,
            start_time=int(time.time() * 1000),
            metadata=metadata,
            runtime_env=runtime_env,
        ),
    )

    # The actor starts up asynchronously, so this call returns immediately
    # while still letting us catch errors raised when launching it.
    try:
        actor_name = self.JOB_ACTOR_NAME.format(job_id=job_id)
        # Currently we assume JobManager is created by the dashboard server
        # running on the head node; supervisor actors are pinned to that same
        # node through a tiny slice of its node resource.
        node_resources = {self._get_current_node_resource_key(): 0.001}
        supervisor_factory = self._supervisor_actor_cls.options(
            lifetime="detached",
            name=actor_name,
            num_cpus=0,
            resources=node_resources,
            runtime_env=runtime_env,
        )
        supervisor = supervisor_factory.remote(job_id, entrypoint, metadata or {})
        supervisor.run.remote(_start_signal_actor=_start_signal_actor)

        # Monitor the job in the background so errors are detected without
        # requiring a client to poll.
        create_task(self._monitor_job(job_id, job_supervisor=supervisor))
    except Exception as e:
        # Best effort: record the failure on the job instead of raising.
        self._job_info_client.put_status(
            job_id,
            JobStatus.FAILED,
            message=f"Failed to start job supervisor: {e}.",
        )

    return job_id