def stop_job(self):
    """
    Stop the job, retrying until the request carries an accepted entity
    version.

    The entity version is re-read through get_job_info() before every
    attempt. A gRPC error is re-raised unless it is an ABORTED status whose
    details contain INVALID_ENTITY_VERSION_ERR_MESSAGE, in which case the
    call is attempted again.
    """
    stopped = False
    while not stopped:
        # refresh the entity version ahead of each attempt
        current_version = self.get_job_info().status.version.value
        request = stateless_svc.StopJobRequest(
            job_id=v1alpha_peloton.JobID(value=self.job_id),
            version=v1alpha_peloton.EntityVersion(value=current_version),
        )
        try:
            self.client.stateless_svc.StopJob(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=default_timeout,
            )
        except grpc.RpcError as err:
            if (
                err.code() != grpc.StatusCode.ABORTED
                or INVALID_ENTITY_VERSION_ERR_MESSAGE not in err.details()
            ):
                raise
            # stale entity version: go around the loop once more
        else:
            stopped = True
def resume(self, entity_version=None):
    """
    Resume the workflow of the job this update belongs to.

    :param entity_version: the entity version to send with the request.
        If provided, it is used as is and any error is raised to the
        caller. If not provided, the version cached on the job (or, failing
        that, the one read from the job status) is used, and the call is
        retried with a freshly read version whenever the server rejects the
        version as invalid.
    :raises grpc.RpcError: for any error that is not a retryable
        invalid-entity-version rejection
    """
    # The other entity-versioned calls in this file detect a stale version
    # through ABORTED; this one only looked at INVALID_ARGUMENT. Accept both
    # so that a stale version is retried whichever code carries the message.
    stale_version_codes = (
        grpc.StatusCode.ABORTED,
        grpc.StatusCode.INVALID_ARGUMENT,
    )

    job_entity_version = (
        entity_version
        or self.job.entity_version
        or self.job.get_status().version.value
    )

    while True:
        request = stateless_svc.ResumeJobWorkflowRequest(
            job_id=v1alpha_peloton.JobID(value=self.job.job_id),
            version=v1alpha_peloton.EntityVersion(value=job_entity_version),
        )
        try:
            resp = self.client.stateless_svc.ResumeJobWorkflow(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
        except grpc.RpcError as e:
            # Only retry when the version was rejected and the caller did
            # not pin a version; a pinned version must fail loudly.
            if (
                e.code() in stale_version_codes
                and INVALID_ENTITY_VERSION_ERR_MESSAGE in e.details()
                and entity_version is None
            ):
                job_entity_version = self.job.get_status().version.value
                continue
            raise
        break

    # remember the version returned by the server for subsequent calls
    self.job.entity_version = resp.version.value
    log.info('job workflow resumed: %s', self.job.entity_version)
def get_job_info(self):
    """
    Look the job up through the stateless GetJob API.

    :return: the job_info field of the GetJob response
    """
    job_id = v1alpha_peloton.JobID(value=self.job_id)
    response = self.client.stateless_svc.GetJob(
        stateless_svc.GetJobRequest(job_id=job_id),
        metadata=self.client.jobmgr_metadata,
        timeout=default_timeout,
    )
    return response.job_info
def create(self, in_place=False, entity_version=None):
    """
    Replace the job spec with the updated spec held by this object.

    :param in_place: forwarded as the in_place field of the update spec
    :param entity_version: the entity version to send with the request.
        If provided, it is used as is and an exception is raised when the
        version is wrong. If not provided, the version is taken from the
        job (cached value, else job status) and the call is retried with a
        freshly read version until it is accepted.
    """
    # wait for job manager leader
    self.job.wait_for_jobmgr_available()

    # make sure the resource pool exists and point the new spec at it
    self.updated_job_spec.respool_id.value = self.pool.ensure_exists()

    job_entity_version = entity_version
    if not job_entity_version:
        job_entity_version = self.job.entity_version
    if not job_entity_version:
        job_entity_version = self.job.get_status().version.value

    while True:
        update_spec = stateless.UpdateSpec(
            batch_size=self.batch_size,
            rollback_on_failure=self.roll_back_on_failure,
            max_instance_retries=self.max_instance_attempts,
            max_tolerable_instance_failures=self.max_failure_instances,
            start_paused=self.start_paused,
            in_place=in_place,
        )
        request = stateless_svc.ReplaceJobRequest(
            job_id=v1alpha_peloton.JobID(value=self.job.job_id),
            version=v1alpha_peloton.EntityVersion(value=job_entity_version),
            spec=self.updated_job_spec,
            update_spec=update_spec,
        )
        try:
            resp = self.client.stateless_svc.ReplaceJob(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
            break
        except grpc.RpcError as err:
            # Anything other than a rejected version on an unpinned call
            # goes straight back to the caller.
            if (
                err.code() != grpc.StatusCode.ABORTED
                or INVALID_ENTITY_VERSION_ERR_MESSAGE not in err.details()
                or entity_version is not None
            ):
                raise
            # re-read the version from the job status and try again
            job_entity_version = self.job.get_status().version.value

    self.job.entity_version = resp.version.value
    log.info(
        "job spec replaced with new entity version: %s",
        self.job.entity_version,
    )
def list_workflows(self):
    """
    Query the ListJobWorkflows API for this job.

    :return: the workflow_infos field of the response
    """
    job_id = v1alpha_peloton.JobID(value=self.job_id)
    response = self.client.stateless_svc.ListJobWorkflows(
        stateless_svc.ListJobWorkflowsRequest(job_id=job_id),
        metadata=self.client.jobmgr_metadata,
        timeout=self.config.rpc_timeout_sec,
    )
    return response.workflow_infos
def query_pods(self):
    """
    Query the QueryPods API with only the job id set on the request.

    :return: the pods field of the response
    """
    job_id = v1alpha_peloton.JobID(value=self.job_id)
    response = self.client.stateless_svc.QueryPods(
        stateless_svc.QueryPodsRequest(job_id=job_id),
        metadata=self.client.jobmgr_metadata,
        timeout=self.config.rpc_timeout_sec,
    )
    return response.pods
def get_job(self):
    """
    Call the GetJob API for this job.

    :return: the complete GetJob response
    """
    job_id = v1alpha_peloton.JobID(value=self.job_id)
    return self.client.stateless_svc.GetJob(
        stateless_svc.GetJobRequest(job_id=job_id),
        metadata=self.client.jobmgr_metadata,
        timeout=self.config.rpc_timeout_sec,
    )
def update_job(
    self,
    instance_inc,
    batch_size,
    use_instance_config,
    sleep_time,
    host_limit_1=False,
):
    """
    Replace the job with a new spec that grows the instance count.

    :param instance_inc: number added to the job's current instance count;
        also used in the command string and the "task_num" label
    :param batch_size: batch size of the update spec
    :param use_instance_config: accepted but not used by this method
    :param sleep_time: sleep duration used in the pod config, the command
        string and the "sleep_time" label
    :param host_limit_1: forwarded to create_pod_config
    :return: the ReplaceJob response
    """
    pod_config = self.create_pod_config(
        sleep_time, "static", host_limit_1=host_limit_1)
    labels = [
        v1alpha_peloton.Label(key="task_num", value=str(instance_inc)),
        v1alpha_peloton.Label(key="sleep_time", value=str(sleep_time)),
    ]
    command = "instance %s && sleep %s" % (instance_inc, sleep_time)
    job_spec = create_stateless_job_spec(
        command,
        labels,
        instance_inc,
        pod_config,
        self.respool_id,
    )
    update_spec = stateless.UpdateSpec(batch_size=batch_size)

    while True:
        # the entity version and instance count are re-read on every attempt
        current = self.get_job_info()
        job_spec.instance_count = current.spec.instance_count + instance_inc

        request = stateless_svc.ReplaceJobRequest(
            job_id=v1alpha_peloton.JobID(value=self.job_id),
            version=v1alpha_peloton.EntityVersion(
                value=current.status.version.value),
            spec=job_spec,
            update_spec=update_spec,
        )
        try:
            return self.client.stateless_svc.ReplaceJob(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=default_timeout,
            )
        except grpc.RpcError as err:
            if (
                err.code() != grpc.StatusCode.ABORTED
                or INVALID_ENTITY_VERSION_ERR_MESSAGE not in err.details()
            ):
                raise
            # stale entity version: retry with fresh job info
def list_pods(self):
    """
    Collect the pods of the job from the ListPods API.

    The call is iterated and the pods of every response are gathered into
    one list.

    :return: list of all pods found in the responses
    """
    job_id = v1alpha_peloton.JobID(value=self.job_id)
    responses = self.client.stateless_svc.ListPods(
        stateless_svc.ListPodsRequest(job_id=job_id),
        metadata=self.client.jobmgr_metadata,
        timeout=self.config.rpc_timeout_sec,
    )

    pod_summaries = []
    for response in responses:
        pod_summaries.extend(response.pods)
    return pod_summaries
def delete(self, entity_version=None, force_delete=False):
    """
    Delete the job.

    :param entity_version: the entity version of the job, for concurrency
        control. If provided, delete uses the provided value and raises an
        exception if the version is wrong. If not provided, delete takes
        the version from the job (cached value, else job status) and
        retries with a freshly read version until it is accepted.
    :param force_delete: sent as the force field of the delete request
    """
    job_entity_version = entity_version
    if not job_entity_version:
        job_entity_version = self.entity_version
    if not job_entity_version:
        job_entity_version = self.get_status().version.value

    deleted = False
    while not deleted:
        request = stateless_svc.DeleteJobRequest(
            job_id=v1alpha_peloton.JobID(value=self.job_id),
            version=v1alpha_peloton.EntityVersion(value=job_entity_version),
            force=force_delete,
        )
        try:
            self.client.stateless_svc.DeleteJob(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
        except grpc.RpcError as err:
            # Only a rejected version on an unpinned call is retried.
            if (
                err.code() != grpc.StatusCode.ABORTED
                or INVALID_ENTITY_VERSION_ERR_MESSAGE not in err.details()
                or entity_version is not None
            ):
                raise
            # pick up the entity version from the job status and retry
            job_entity_version = self.get_status().version.value
        else:
            deleted = True

    log.info("job %s deleted", self.job_id)
def restart(
    self, entity_version=None, batch_size=None, ranges=None, in_place=False
):
    """
    Issue a RestartJob request for the job.

    :param entity_version: the entity version to send. If provided it is
        used as is and an exception is raised when it is wrong; otherwise
        the version comes from the job (cached value, else job status) and
        the call is retried with a freshly read version until accepted.
    :param batch_size: batch_size field of the restart spec
    :param ranges: ranges field of the restart spec
    :param in_place: in_place field of the restart spec
    :return: restart response from the API
    """
    job_entity_version = entity_version
    if not job_entity_version:
        job_entity_version = self.entity_version
    if not job_entity_version:
        job_entity_version = self.get_status().version.value

    while True:
        restart_spec = stateless.RestartSpec(
            batch_size=batch_size, ranges=ranges, in_place=in_place
        )
        request = stateless_svc.RestartJobRequest(
            job_id=v1alpha_peloton.JobID(value=self.job_id),
            version=v1alpha_peloton.EntityVersion(value=job_entity_version),
            restart_spec=restart_spec,
        )
        try:
            resp = self.client.stateless_svc.RestartJob(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
            break
        except grpc.RpcError as err:
            # Only a rejected version on an unpinned call is retried.
            if (
                err.code() != grpc.StatusCode.ABORTED
                or INVALID_ENTITY_VERSION_ERR_MESSAGE not in err.details()
                or entity_version is not None
            ):
                raise
            # pick up the entity version from the job status and retry
            job_entity_version = self.get_status().version.value

    self.entity_version = resp.version.value
    log.info("job restarted, new entity version: %s", self.entity_version)
    return resp
def get_replace_job_diff(self, entity_version=None, job_spec=None):
    """
    Call the GetReplaceJobDiff API for this job.

    :param entity_version: entity version to send; when falsy, the version
        cached on the job is used, and failing that the one read from the
        job status
    :param job_spec: sent as the spec field of the request
    :return: the GetReplaceJobDiff response
    """
    version = entity_version
    if not version:
        version = self.entity_version
    if not version:
        version = self.get_status().version.value

    request = stateless_svc.GetReplaceJobDiffRequest(
        job_id=v1alpha_peloton.JobID(value=self.job_id),
        version=v1alpha_peloton.EntityVersion(value=version),
        spec=job_spec,
    )
    return self.client.stateless_svc.GetReplaceJobDiff(
        request,
        metadata=self.client.jobmgr_metadata,
        timeout=self.config.rpc_timeout_sec,
    )
def wait_for_jobmgr_available(self):
    """
    Utility method to wait for the job manager leader to come up.
    Good practice to call before all write APIs.

    A DeleteJob request carrying a dummy entity version is sent as a probe.
    The wait ends as soon as the probe fails with any gRPC status other
    than UNAVAILABLE, or after config.max_retry_attempts probes.
    """
    tries = 0
    while tries < self.config.max_retry_attempts:
        probe = stateless_svc.DeleteJobRequest(
            job_id=v1alpha_peloton.JobID(value=self.job_id),
            version=v1alpha_peloton.EntityVersion(
                value="dummy-entity-version"),
        )
        try:
            self.client.stateless_svc.DeleteJob(
                probe,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
        except grpc.RpcError as err:
            # any status other than UNAVAILABLE means the probe got an
            # answer, so there is nothing left to wait for
            if err.code() != grpc.StatusCode.UNAVAILABLE:
                return
        log.info("waiting for job manager leader")
        time.sleep(self.config.sleep_time_sec)
        tries += 1
def pause(self, entity_version=None):
    """
    Pause the workflow of the job this update belongs to.

    :param entity_version: the entity version to send. If provided it is
        used as is and an exception is raised when it is wrong; otherwise
        the version comes from the job (cached value, else job status) and
        the call is retried with a freshly read version until accepted.
    """
    # wait for job manager leader
    self.job.wait_for_jobmgr_available()

    job_entity_version = entity_version
    if not job_entity_version:
        job_entity_version = self.job.entity_version
    if not job_entity_version:
        job_entity_version = self.job.get_status().version.value

    while True:
        request = stateless_svc.PauseJobWorkflowRequest(
            job_id=v1alpha_peloton.JobID(value=self.job.job_id),
            version=v1alpha_peloton.EntityVersion(value=job_entity_version),
        )
        try:
            resp = self.client.stateless_svc.PauseJobWorkflow(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
            break
        except grpc.RpcError as err:
            # Only a rejected version on an unpinned call is retried.
            if (
                err.code() != grpc.StatusCode.ABORTED
                or INVALID_ENTITY_VERSION_ERR_MESSAGE not in err.details()
                or entity_version is not None
            ):
                raise
            # re-read the version from the job status and try again
            job_entity_version = self.job.get_status().version.value

    self.job.entity_version = resp.version.value
    log.info("job workflow paused: %s", self.job.entity_version)
def wait_for_workflow_state(self, goal_state="SUCCEEDED",
                            failed_state="FAILED"):
    """
    Waits for the job workflow to reach a particular state

    The job is polled through GetJob every config.sleep_time_sec seconds.
    Attempts are only counted while no progress is made: whenever the sum
    of completed and failed instances changes, the attempt counter is
    reset. A failed poll always counts as an attempt.

    :param goal_state: The state to reach
    :param failed_state: The failed state of the job
    :raises AssertionError: if the workflow reaches failed_state, or if
        the goal state is not reached within config.max_retry_attempts
        attempts
    """
    state = ""
    attempts = 0
    start = time.time()
    log.info("%s waiting for state workflow %s", self.job_id, goal_state)

    # convert the name from v0 state name to v1 alpha state name,
    # so the function signature can be shared between the apis
    goal_state = "WORKFLOW_STATE_" + goal_state
    failed_state = "WORKFLOW_STATE_" + failed_state

    instance_completed = 0
    while attempts < self.config.max_retry_attempts:
        try:
            request = stateless_svc.GetJobRequest(
                job_id=v1alpha_peloton.JobID(value=self.job_id))
            resp = self.client.stateless_svc.GetJob(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
            status = resp.workflow_info.status
            new_state = stateless.WorkflowState.Name(status.state)
        except Exception as e:
            # polling is best effort: log the failure, count it as an
            # attempt and try again after the sleep below
            log.warn(e)
            attempts += 1
        else:
            if state != new_state:
                log.info("%s transitioned to state %s",
                         self.job_id, new_state)
            state = new_state

            # leave right away on success, without a trailing sleep
            if state == goal_state:
                break

            # for workflow, we only begin to count attempts when no
            # progress is made
            instances_done = (status.num_instances_completed +
                              status.num_instances_failed)
            if instance_completed == instances_done:
                attempts += 1
            else:
                instance_completed = instances_done
                attempts = 0

            # The failure is raised after the loop rather than here, so it
            # is never swallowed by the broad except above.
            if state == failed_state:
                break
        time.sleep(self.config.sleep_time_sec)

    # Raise explicitly instead of `assert False`: assert statements are
    # removed under `python -O`, which would turn a failure into a silent
    # success.
    if state != goal_state:
        if state == failed_state:
            log.info(
                "goal_state:%s current_state:%s attempts: %s",
                goal_state,
                state,
                str(attempts),
            )
            raise AssertionError(
                "%s reached failed state %s while waiting for %s"
                % (self.job_id, state, goal_state))
        log.info("%s max attempts reached to wait for goal state",
                 self.job_id)
        log.info("goal_state:%s current_state:%s", goal_state, state)
        raise AssertionError(
            "%s did not reach %s, current state: %s"
            % (self.job_id, goal_state, state))

    end = time.time()
    elapsed = end - start
    log.info("%s state transition took %s seconds", self.job_id, elapsed)
def stop(self, ranges=None, entity_version=None):
    """
    Stops certain pods based on the ranges. If ranges is not provided
    then it stops the job.

    Job level stop does not support range, so the pod API is used for
    range operations. This is kept for backward compatibility of existing
    tests.

    :param ranges: the instance ranges to stop
    :param entity_version: the entity version of the job, for concurrency
        control. If provided, stop uses the provided value and raises an
        exception if the version is wrong. If not provided, stop takes the
        version from the job (cached value, else job status) and retries
        with a freshly read version until it is accepted.
    :return: stop response from the API
    """
    # wait for job manager leader
    self.wait_for_jobmgr_available()

    if ranges is not None:
        # range operation: stop the pods one by one through the pod API
        for pod_range in ranges:
            first_id = getattr(pod_range, "from")
            for pod_id in range(first_id, pod_range.to):
                pod_name = self.job_id + "-" + str(pod_id)
                self.client.pod_svc.StopPod(
                    pod_svc.StopPodRequest(
                        pod_name=v1alpha_peloton.PodName(value=pod_name)),
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
        log.info("stopping pods in job {0} with ranges {1}".format(
            self.job_id, ranges))
        return pod_svc.StopPodResponse()

    job_entity_version = entity_version
    if not job_entity_version:
        job_entity_version = self.entity_version
    if not job_entity_version:
        job_entity_version = self.get_status().version.value

    while True:
        request = stateless_svc.StopJobRequest(
            job_id=v1alpha_peloton.JobID(value=self.job_id),
            version=v1alpha_peloton.EntityVersion(value=job_entity_version),
        )
        try:
            resp = self.client.stateless_svc.StopJob(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
            break
        except grpc.RpcError as err:
            # Only a rejected version on an unpinned call is retried.
            if (
                err.code() != grpc.StatusCode.ABORTED
                or INVALID_ENTITY_VERSION_ERR_MESSAGE not in err.details()
                or entity_version is not None
            ):
                raise
            # pick up the entity version from the job status and retry
            job_entity_version = self.get_status().version.value

    self.entity_version = resp.version.value
    log.info("job stopped, new entity version: %s", self.entity_version)
    return resp
def wait_for_state(self, goal_state='SUCCEEDED', failed_state='FAILED'):
    """
    Waits for the job to reach a particular state

    The job is polled through GetJob every config.sleep_time_sec seconds,
    at most config.max_retry_attempts times. Errors raised while polling
    are logged and count as an attempt.

    :param goal_state: The state to reach
    :param failed_state: The failed state of the job
    :raises AssertionError: if the job reaches failed_state, or if the
        goal state is not reached within the allowed number of attempts
    """
    state = ''
    attempts = 0
    start = time.time()
    log.info('%s waiting for state %s', self.job_id, goal_state)

    # convert the name from v0 state name to v1 alpha state name,
    # so the function signature can be shared between the apis
    goal_state = 'JOB_STATE_' + goal_state
    failed_state = 'JOB_STATE_' + failed_state

    while attempts < self.config.max_retry_attempts:
        try:
            request = stateless_svc.GetJobRequest(
                job_id=v1alpha_peloton.JobID(value=self.job_id),
            )
            resp = self.client.stateless_svc.GetJob(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
            new_state = stateless.JobState.Name(resp.job_info.status.state)
        except Exception as e:
            # polling is best effort: log the failure and try again
            log.warn(e)
        else:
            if state != new_state:
                log.info('%s transitioned to state %s',
                         self.job_id, new_state)
            state = new_state
            # Leave the loop right away on a terminal outcome, without a
            # trailing sleep and without touching the attempt counter. The
            # failure is raised after the loop so the broad except above
            # can never swallow it.
            if state == goal_state or state == failed_state:
                break
        time.sleep(self.config.sleep_time_sec)
        attempts += 1

    # The outcome is decided from the observed state, not from the attempt
    # counter, so reaching the goal on the very last attempt is a success.
    # Raise explicitly instead of `assert False`: assert statements are
    # removed under `python -O`.
    if state != goal_state:
        if state == failed_state:
            log.info('goal_state:%s current_state:%s attempts: %s',
                     goal_state, state, str(attempts))
            raise AssertionError(
                '%s reached failed state %s while waiting for %s'
                % (self.job_id, state, goal_state))
        log.info('%s max attempts reached to wait for goal state',
                 self.job_id)
        log.info('goal_state:%s current_state:%s', goal_state, state)
        raise AssertionError(
            '%s did not reach %s, current state: %s'
            % (self.job_id, goal_state, state))

    end = time.time()
    elapsed = end - start
    log.info('%s state transition took %s seconds', self.job_id, elapsed)
def start(self, ranges=None, entity_version=None):
    """
    Starts certain pods based on the ranges. If ranges is not provided
    it starts all pods of the job.

    Job level start does not support range, so the pod API is used for
    range operations. This is kept for backward compatibility of existing
    tests.

    :param ranges: the instance ranges to start
    :param entity_version: the entity version of the job, for concurrency
        control. If provided, start uses the provided value and raises an
        exception if the version is wrong. If not provided, start takes the
        version from the job (cached value, else job status) and retries
        with a freshly read version until it is accepted.
    :return: start response from the API
    :raises grpc.RpcError: for any error that is not a retryable
        invalid-entity-version rejection
    """
    if ranges is None:
        # The other entity-versioned calls in this file detect a stale
        # version through ABORTED; this one only looked at
        # INVALID_ARGUMENT. Accept both so that a stale version is retried
        # whichever code carries the message.
        stale_version_codes = (
            grpc.StatusCode.ABORTED,
            grpc.StatusCode.INVALID_ARGUMENT,
        )

        job_entity_version = (
            entity_version
            or self.entity_version
            or self.get_status().version.value
        )

        while True:
            request = stateless_svc.StartJobRequest(
                job_id=v1alpha_peloton.JobID(value=self.job_id),
                version=v1alpha_peloton.EntityVersion(
                    value=job_entity_version),
            )
            try:
                resp = self.client.stateless_svc.StartJob(
                    request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
            except grpc.RpcError as e:
                # if entity version is incorrect and the caller did not
                # pin one, get entity version from job status and try
                # again.
                if (
                    e.code() in stale_version_codes
                    and INVALID_ENTITY_VERSION_ERR_MESSAGE in e.details()
                    and entity_version is None
                ):
                    job_entity_version = self.get_status().version.value
                    continue
                raise
            break

        # remember the version returned by the server for subsequent calls
        self.entity_version = resp.version.value
        log.info('job started, new entity version: %s',
                 self.entity_version)
        return resp

    # range operation: start the pods one by one through the pod API;
    # "from" is a Python keyword, hence the getattr
    for pod_range in ranges:
        for pod_id in range(getattr(pod_range, 'from'), pod_range.to):
            pod_name = self.job_id + '-' + str(pod_id)
            request = pod_svc.StartPodRequest(
                pod_name=v1alpha_peloton.PodName(value=pod_name),
            )
            self.client.pod_svc.StartPod(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )

    log.info('starting pods in job {0} with ranges {1}'.format(
        self.job_id, ranges))
    return pod_svc.StartPodResponse()