def stop_job(self):
    """
    Stop the job, retrying until the request carries an accepted entity
    version.

    The entity version is re-read through get_job_info() before every
    attempt. An ABORTED error whose details contain
    INVALID_ENTITY_VERSION_ERR_MESSAGE triggers another attempt; every other
    gRPC error is re-raised to the caller.
    """
    while True:
        # always refresh the version, it may have moved since the last try
        current_version = self.get_job_info().status.version.value
        stop_request = stateless_svc.StopJobRequest(
            job_id=v1alpha_peloton.JobID(value=self.job_id),
            version=v1alpha_peloton.EntityVersion(value=current_version),
        )
        try:
            self.client.stateless_svc.StopJob(
                stop_request,
                metadata=self.client.jobmgr_metadata,
                timeout=default_timeout,
            )
        except grpc.RpcError as err:
            stale_version = (
                err.code() == grpc.StatusCode.ABORTED
                and INVALID_ENTITY_VERSION_ERR_MESSAGE in err.details()
            )
            if not stale_version:
                raise
            # stale version: loop around and fetch a fresh one
        else:
            return
def resume(self, entity_version=None):
    """
    Resume the given update.

    :param entity_version: the entity version of the job, for concurrency
        control. If provided, it is sent as-is and a version mismatch is
        raised to the caller. If not provided, the version cached on the
        job (or, failing that, read from the job status) is used, and on a
        version mismatch the version is re-read from the job status and the
        call is retried.
    :raises grpc.RpcError: for any RPC failure that is not a retryable
        entity-version mismatch.
    """
    # Status codes treated as "stale entity version" when paired with the
    # invalid-entity-version message. The other write operations in this file
    # key on ABORTED; INVALID_ARGUMENT is kept so behavior against a server
    # that still answers with it is unchanged.
    stale_version_codes = (
        grpc.StatusCode.ABORTED,
        grpc.StatusCode.INVALID_ARGUMENT,
    )

    job_entity_version = (
        entity_version
        or self.job.entity_version
        or self.job.get_status().version.value
    )

    while True:
        request = stateless_svc.ResumeJobWorkflowRequest(
            job_id=v1alpha_peloton.JobID(value=self.job.job_id),
            version=v1alpha_peloton.EntityVersion(value=job_entity_version),
        )
        try:
            resp = self.client.stateless_svc.ResumeJobWorkflow(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
        except grpc.RpcError as e:
            # Only retry when the version was chosen by us; a caller-supplied
            # version that is wrong must surface as an error.
            if (
                e.code() in stale_version_codes
                and INVALID_ENTITY_VERSION_ERR_MESSAGE in e.details()
                and entity_version is None
            ):
                job_entity_version = self.job.get_status().version.value
                continue
            raise
        break

    # remember the version returned by the server for subsequent calls
    self.job.entity_version = resp.version.value
    log.info('job workflow resumed: %s', self.job.entity_version)
def create(self, in_place=False, entity_version=None):
    """
    Replace the job spec with the spec held by this StatelessUpdate.

    Waits for the job manager, makes sure the resource pool exists and
    stamps its id on the updated spec before issuing ReplaceJob.

    :param in_place: forwarded to the UpdateSpec as ``in_place``.
    :param entity_version: if provided, replace uses this value and raises
        when the version is wrong. If not provided, replace falls back to
        the version cached on the job (or read from the job status) and, on
        a version mismatch, re-reads it from the job status and retries
        until the version is accepted.
    :return: None; the entity version from the response is stored on
        ``self.job.entity_version``.
    """
    # wait for job manager leader
    self.job.wait_for_jobmgr_available()

    self.updated_job_spec.respool_id.value = self.pool.ensure_exists()

    version = (
        entity_version
        or self.job.entity_version
        or self.job.get_status().version.value
    )

    while True:
        replace_request = stateless_svc.ReplaceJobRequest(
            job_id=v1alpha_peloton.JobID(value=self.job.job_id),
            version=v1alpha_peloton.EntityVersion(value=version),
            spec=self.updated_job_spec,
            update_spec=stateless.UpdateSpec(
                batch_size=self.batch_size,
                rollback_on_failure=self.roll_back_on_failure,
                max_instance_retries=self.max_instance_attempts,
                max_tolerable_instance_failures=self.max_failure_instances,
                start_paused=self.start_paused,
                in_place=in_place,
            ),
        )
        try:
            response = self.client.stateless_svc.ReplaceJob(
                replace_request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
        except grpc.RpcError as err:
            version_rejected = (
                err.code() == grpc.StatusCode.ABORTED
                and INVALID_ENTITY_VERSION_ERR_MESSAGE in err.details()
            )
            # a caller-supplied version is never silently replaced
            if not version_rejected or entity_version is not None:
                raise
            version = self.job.get_status().version.value
        else:
            break

    self.job.entity_version = response.version.value
    log.info(
        "job spec replaced with new entity version: %s",
        self.job.entity_version,
    )
def update_job(
    self,
    instance_inc,
    batch_size,
    use_instance_config,
    sleep_time,
    host_limit_1=False,
):
    """
    Replace the job with a spec whose instance count is grown by
    ``instance_inc``, retrying on entity version mismatch.

    :param instance_inc: number added to the job's current instance count;
        also embedded in the command and the ``task_num`` label.
    :param batch_size: batch size of the UpdateSpec.
    :param use_instance_config: not used by this method.
    :param sleep_time: passed to create_pod_config and embedded in the
        command and the ``sleep_time`` label.
    :param host_limit_1: forwarded to create_pod_config.
    :return: the ReplaceJob response.
    """
    pod_config = self.create_pod_config(
        sleep_time, "static", host_limit_1=host_limit_1)
    command = "instance %s && sleep %s" % (instance_inc, sleep_time)
    labels = [
        v1alpha_peloton.Label(key="task_num", value=str(instance_inc)),
        v1alpha_peloton.Label(key="sleep_time", value=str(sleep_time)),
    ]
    new_spec = create_stateless_job_spec(
        command,
        labels,
        instance_inc,
        pod_config,
        self.respool_id,
    )
    rollout = stateless.UpdateSpec(batch_size=batch_size)

    while True:
        # version and instance count are both re-read on every attempt
        info = self.get_job_info()
        current_version = info.status.version.value
        new_spec.instance_count = info.spec.instance_count + instance_inc

        replace_request = stateless_svc.ReplaceJobRequest(
            job_id=v1alpha_peloton.JobID(value=self.job_id),
            version=v1alpha_peloton.EntityVersion(value=current_version),
            spec=new_spec,
            update_spec=rollout,
        )
        try:
            return self.client.stateless_svc.ReplaceJob(
                replace_request,
                metadata=self.client.jobmgr_metadata,
                timeout=default_timeout,
            )
        except grpc.RpcError as err:
            stale_version = (
                err.code() == grpc.StatusCode.ABORTED
                and INVALID_ENTITY_VERSION_ERR_MESSAGE in err.details()
            )
            if not stale_version:
                raise
            # entity version was stale, try again with fresh job info
def delete(self, entity_version=None, force_delete=False):
    """
    Delete the job.

    :param entity_version: the entity version of the job, for concurrency
        control. If provided, delete uses this value and raises when the
        version is wrong. If not provided, delete uses the cached version
        (or reads it from the job status) and, on a version mismatch,
        re-reads it from the job status and retries until it is accepted.
    :param force_delete: sent as the ``force`` field of the request. Per the
        API contract described by the original author: when true the job is
        deleted even if it is running (it is stopped first); this cannot be
        undone, and the job cannot be re-created with the same uuid until
        the delete completes.
    """
    version = (
        entity_version
        or self.entity_version
        or self.get_status().version.value
    )

    deleted = False
    while not deleted:
        delete_request = stateless_svc.DeleteJobRequest(
            job_id=v1alpha_peloton.JobID(value=self.job_id),
            version=v1alpha_peloton.EntityVersion(value=version),
            force=force_delete,
        )
        try:
            self.client.stateless_svc.DeleteJob(
                delete_request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
        except grpc.RpcError as err:
            version_rejected = (
                err.code() == grpc.StatusCode.ABORTED
                and INVALID_ENTITY_VERSION_ERR_MESSAGE in err.details()
            )
            # only a version we picked ourselves may be refreshed
            if not version_rejected or entity_version is not None:
                raise
            version = self.get_status().version.value
        else:
            deleted = True

    log.info("job %s deleted", self.job_id)
def restart(
    self, entity_version=None, batch_size=None, ranges=None, in_place=False
):
    """
    Restart pods of the job.

    :param entity_version: if provided, used as-is and a version mismatch is
        raised. If not provided, the cached version (or the one read from
        the job status) is used and refreshed from the job status on a
        version mismatch until the call is accepted.
    :param batch_size: forwarded to the RestartSpec.
    :param ranges: forwarded to the RestartSpec; per the original author,
        when not provided all pods of the job are restarted.
    :param in_place: forwarded to the RestartSpec.
    :return: restart response from the API
    """
    version = (
        entity_version
        or self.entity_version
        or self.get_status().version.value
    )

    while True:
        restart_request = stateless_svc.RestartJobRequest(
            job_id=v1alpha_peloton.JobID(value=self.job_id),
            version=v1alpha_peloton.EntityVersion(value=version),
            restart_spec=stateless.RestartSpec(
                batch_size=batch_size, ranges=ranges, in_place=in_place
            ),
        )
        try:
            response = self.client.stateless_svc.RestartJob(
                restart_request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
        except grpc.RpcError as err:
            version_rejected = (
                err.code() == grpc.StatusCode.ABORTED
                and INVALID_ENTITY_VERSION_ERR_MESSAGE in err.details()
            )
            if not version_rejected or entity_version is not None:
                raise
            # pick up the latest version from the job status and retry
            version = self.get_status().version.value
        else:
            # keep the version handed back by the server for later calls
            self.entity_version = response.version.value
            log.info(
                "job restarted, new entity version: %s", self.entity_version
            )
            return response
def get_replace_job_diff(self, entity_version=None, job_spec=None):
    """
    Issue a single GetReplaceJobDiff call for this job (no retry).

    :param entity_version: entity version to send; falls back to the cached
        version, then to the version read from the job status.
    :param job_spec: sent as the ``spec`` field of the request.
    :return: get replace job diff response.
    """
    version = (
        entity_version
        or self.entity_version
        or self.get_status().version.value
    )
    diff_request = stateless_svc.GetReplaceJobDiffRequest(
        job_id=v1alpha_peloton.JobID(value=self.job_id),
        version=v1alpha_peloton.EntityVersion(value=version),
        spec=job_spec,
    )
    return self.client.stateless_svc.GetReplaceJobDiff(
        diff_request,
        metadata=self.client.jobmgr_metadata,
        timeout=self.config.rpc_timeout_sec,
    )
def wait_for_jobmgr_available(self):
    """
    Utility method to wait for the job manager leader to come up.
    Good practice to call before all write APIs.

    Probes by sending a DeleteJob request with the placeholder entity
    version "dummy-entity-version". The wait ends as soon as the probe fails
    with any gRPC status other than UNAVAILABLE, or after
    ``config.max_retry_attempts`` probes, sleeping
    ``config.sleep_time_sec`` between them.
    """
    tries = 0
    while tries < self.config.max_retry_attempts:
        try:
            probe = stateless_svc.DeleteJobRequest(
                job_id=v1alpha_peloton.JobID(value=self.job_id),
                version=v1alpha_peloton.EntityVersion(
                    value="dummy-entity-version"),
            )
            self.client.stateless_svc.DeleteJob(
                probe,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
        except grpc.RpcError as err:
            still_unavailable = err.code() == grpc.StatusCode.UNAVAILABLE
            if not still_unavailable:
                # any other status code ends the wait
                break
        # NOTE(review): a probe that raises nothing also lands here and is
        # counted as one more attempt - confirm that is intended.
        log.info("waiting for job manager leader")
        time.sleep(self.config.sleep_time_sec)
        tries += 1
def pause(self, entity_version=None):
    """
    Pause the given update.

    :param entity_version: if provided, sent as-is and a version mismatch is
        raised to the caller. If not provided, the version cached on the job
        (or read from the job status) is used, and on a version mismatch it
        is re-read from the job status and the call is retried.
    """
    # wait for job manager leader
    self.job.wait_for_jobmgr_available()

    version = (
        entity_version
        or self.job.entity_version
        or self.job.get_status().version.value
    )

    while True:
        pause_request = stateless_svc.PauseJobWorkflowRequest(
            job_id=v1alpha_peloton.JobID(value=self.job.job_id),
            version=v1alpha_peloton.EntityVersion(value=version),
        )
        try:
            response = self.client.stateless_svc.PauseJobWorkflow(
                pause_request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
        except grpc.RpcError as err:
            version_rejected = (
                err.code() == grpc.StatusCode.ABORTED
                and INVALID_ENTITY_VERSION_ERR_MESSAGE in err.details()
            )
            # a version supplied by the caller is never refreshed behind
            # their back
            if not version_rejected or entity_version is not None:
                raise
            version = self.job.get_status().version.value
        else:
            break

    self.job.entity_version = response.version.value
    log.info("job workflow paused: %s", self.job.entity_version)
def stop(self, ranges=None, entity_version=None):
    """
    Stop the whole job, or only the pods inside ``ranges``.

    Job level stop does not support ranges, so range operations go through
    the pod API, one StopPod call per pod. This is kept for backward
    compatibility of existing tests.

    :param ranges: the instance ranges to stop; when None the job itself is
        stopped. Each range is read through its ``from`` (inclusive) and
        ``to`` (exclusive) attributes.
    :param entity_version: the entity version of the job, for concurrency
        control (job level stop only). If provided, stop uses this value and
        raises when the version is wrong. If not provided, stop uses the
        cached version (or the job status) and, on a version mismatch,
        re-reads it from the job status and retries until it is accepted.
    :return: stop response from the API; for range stops an empty
        StopPodResponse.
    """
    # wait for job manager leader
    self.wait_for_jobmgr_available()

    if ranges is not None:
        for pod_range in ranges:
            first = getattr(pod_range, "from")
            for instance_id in range(first, pod_range.to):
                pod_name = self.job_id + "-" + str(instance_id)
                pod_request = pod_svc.StopPodRequest(
                    pod_name=v1alpha_peloton.PodName(value=pod_name))
                self.client.pod_svc.StopPod(
                    pod_request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )

        log.info("stopping pods in job {0} with ranges {1}".format(
            self.job_id, ranges))
        return pod_svc.StopPodResponse()

    # job level stop
    version = (
        entity_version
        or self.entity_version
        or self.get_status().version.value
    )
    while True:
        stop_request = stateless_svc.StopJobRequest(
            job_id=v1alpha_peloton.JobID(value=self.job_id),
            version=v1alpha_peloton.EntityVersion(value=version),
        )
        try:
            response = self.client.stateless_svc.StopJob(
                stop_request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )
        except grpc.RpcError as err:
            version_rejected = (
                err.code() == grpc.StatusCode.ABORTED
                and INVALID_ENTITY_VERSION_ERR_MESSAGE in err.details()
            )
            if not version_rejected or entity_version is not None:
                raise
            # refresh the version from the job status and try again
            version = self.get_status().version.value
        else:
            self.entity_version = response.version.value
            log.info("job stopped, new entity version: %s",
                     self.entity_version)
            return response
def start(self, ranges=None, entity_version=None):
    """
    Start the whole job, or only the pods inside ``ranges``.

    Job level start does not support ranges, so range operations go through
    the pod API, one StartPod call per pod. This is kept for backward
    compatibility of existing tests.

    :param ranges: the instance ranges to start; when None the job itself is
        started. Each range is read through its ``from`` (inclusive) and
        ``to`` (exclusive) attributes.
    :param entity_version: the entity version of the job, for concurrency
        control (job level start only). If provided, start uses this value
        and raises when the version is wrong. If not provided, start uses
        the cached version (or the job status) and, on a version mismatch,
        re-reads it from the job status and retries until it is accepted.
    :return: start response from the API; for range starts an empty
        StartPodResponse.
    :raises grpc.RpcError: for any RPC failure that is not a retryable
        entity-version mismatch.
    """
    if ranges is None:
        # Status codes treated as "stale entity version" when paired with
        # the invalid-entity-version message. The other write operations in
        # this file (stop, delete, restart, ...) key on ABORTED;
        # INVALID_ARGUMENT is kept so behavior against a server that still
        # answers with it is unchanged.
        stale_version_codes = (
            grpc.StatusCode.ABORTED,
            grpc.StatusCode.INVALID_ARGUMENT,
        )

        job_entity_version = (
            entity_version
            or self.entity_version
            or self.get_status().version.value
        )

        while True:
            request = stateless_svc.StartJobRequest(
                job_id=v1alpha_peloton.JobID(value=self.job_id),
                version=v1alpha_peloton.EntityVersion(
                    value=job_entity_version),
            )
            try:
                resp = self.client.stateless_svc.StartJob(
                    request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
            except grpc.RpcError as e:
                # Only retry when the version was chosen by us; a
                # caller-supplied version that is wrong must surface.
                if (
                    e.code() in stale_version_codes
                    and INVALID_ENTITY_VERSION_ERR_MESSAGE in e.details()
                    and entity_version is None
                ):
                    job_entity_version = self.get_status().version.value
                    continue
                raise
            break

        # remember the version returned by the server for subsequent calls
        self.entity_version = resp.version.value
        log.info('job started, new entity version: %s',
                 self.entity_version)
        return resp

    # range start: one pod-level call per instance id in [from, to)
    for pod_range in ranges:
        for pod_id in range(getattr(pod_range, 'from'), pod_range.to):
            pod_name = self.job_id + '-' + str(pod_id)
            request = pod_svc.StartPodRequest(
                pod_name=v1alpha_peloton.PodName(value=pod_name),
            )
            self.client.pod_svc.StartPod(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=self.config.rpc_timeout_sec,
            )

    log.info('starting pods in job {0} with ranges {1}'.format(
        self.job_id, ranges))
    return pod_svc.StartPodResponse()