def stop_job(self):
        """
        Stop the job, retrying until the request carries an accepted version.

        Every attempt re-reads the entity version through get_job_info() and
        sends it with the StopJob request. An ABORTED error whose details
        contain INVALID_ENTITY_VERSION_ERR_MESSAGE triggers another attempt;
        any other grpc.RpcError propagates to the caller.
        """
        while True:
            # always start from a freshly read entity version
            current_version = self.get_job_info().status.version.value
            stop_request = stateless_svc.StopJobRequest(
                job_id=v1alpha_peloton.JobID(value=self.job_id),
                version=v1alpha_peloton.EntityVersion(value=current_version),
            )
            try:
                self.client.stateless_svc.StopJob(
                    stop_request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=default_timeout,
                )
            except grpc.RpcError as rpc_err:
                version_rejected = (
                    rpc_err.code() == grpc.StatusCode.ABORTED
                    and INVALID_ENTITY_VERSION_ERR_MESSAGE in rpc_err.details()
                )
                if not version_rejected:
                    raise
                # stale version: loop around and read it again
            else:
                return
示例#2
0
    def resume(self, entity_version=None):
        """
        Resume the given update.

        :param entity_version: entity version to send with the request. When
            not provided, the version cached on self.job is used, falling
            back to the version read from self.job.get_status().
        :raises grpc.RpcError: for any RPC failure other than a stale entity
            version on a call where entity_version was not provided; that
            case re-reads the version from the job status and retries.
        """
        job_entity_version = (
            entity_version
            or self.job.entity_version
            or self.job.get_status().version.value
        )
        # The other retry loops in this file detect a stale entity version via
        # ABORTED, while this one only matched INVALID_ARGUMENT. Accept both so
        # the retry works whichever code the job manager replies with; the
        # error-message check below still limits it to version mismatches.
        stale_version_codes = (
            grpc.StatusCode.ABORTED,
            grpc.StatusCode.INVALID_ARGUMENT,
        )

        while True:
            request = stateless_svc.ResumeJobWorkflowRequest(
                job_id=v1alpha_peloton.JobID(value=self.job.job_id),
                version=v1alpha_peloton.EntityVersion(value=job_entity_version),
            )
            try:
                resp = self.client.stateless_svc.ResumeJobWorkflow(
                    request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
            except grpc.RpcError as e:
                # if the entity version is incorrect and the caller did not
                # specify one, get the version from the job status and try
                # again.
                if (
                    e.code() in stale_version_codes
                    and INVALID_ENTITY_VERSION_ERR_MESSAGE in e.details()
                    and entity_version is None
                ):
                    # entity_version is None on this path, so the refreshed
                    # value always comes from the job status.
                    job_entity_version = self.job.get_status().version.value
                    continue
                raise
            break
        self.job.entity_version = resp.version.value
        log.info('job workflow resumed: %s', self.job.entity_version)
 def get_job_info(self):
     """
     Issue a GetJob request for self.job_id and return the job_info field
     of the response.
     """
     job_ref = v1alpha_peloton.JobID(value=self.job_id)
     get_request = stateless_svc.GetJobRequest(job_id=job_ref)
     reply = self.client.stateless_svc.GetJob(
         get_request,
         metadata=self.client.jobmgr_metadata,
         timeout=default_timeout,
     )
     return reply.job_info
    def create(self, in_place=False, entity_version=None):
        """
        Replace the job spec with the updated spec held by this object.

        If entity_version is provided, that value is sent and a version
        mismatch is raised to the caller. If it is not provided, the version
        cached on self.job (or read from the job status) is sent, and on a
        version mismatch the version is re-read from the job status and the
        request is retried until it is accepted.

        On success the entity version from the response is stored on
        self.job.entity_version and logged.

        :param in_place: forwarded as UpdateSpec.in_place
        :param entity_version: optional entity version to send
        """
        # wait for job manager leader
        self.job.wait_for_jobmgr_available()

        self.updated_job_spec.respool_id.value = self.pool.ensure_exists()

        version_to_send = (
            entity_version
            or self.job.entity_version
            or self.job.get_status().version.value
        )

        while True:
            rollout_spec = stateless.UpdateSpec(
                batch_size=self.batch_size,
                rollback_on_failure=self.roll_back_on_failure,
                max_instance_retries=self.max_instance_attempts,
                max_tolerable_instance_failures=self.max_failure_instances,
                start_paused=self.start_paused,
                in_place=in_place,
            )
            replace_request = stateless_svc.ReplaceJobRequest(
                job_id=v1alpha_peloton.JobID(value=self.job.job_id),
                version=v1alpha_peloton.EntityVersion(value=version_to_send),
                spec=self.updated_job_spec,
                update_spec=rollout_spec,
            )
            try:
                reply = self.client.stateless_svc.ReplaceJob(
                    replace_request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
            except grpc.RpcError as rpc_err:
                # Only a version mismatch on a call where the caller did not
                # pin a version is retried.
                retryable = (
                    rpc_err.code() == grpc.StatusCode.ABORTED
                    and INVALID_ENTITY_VERSION_ERR_MESSAGE in rpc_err.details()
                    and entity_version is None
                )
                if not retryable:
                    raise
                version_to_send = self.job.get_status().version.value
            else:
                break

        self.job.entity_version = reply.version.value
        log.info(
            "job spec replaced with new entity version: %s",
            self.job.entity_version,
        )
示例#5
0
 def list_workflows(self):
     """
     Issue a ListJobWorkflows request for self.job_id.

     :return: the workflow_infos field of the response.
     """
     job_ref = v1alpha_peloton.JobID(value=self.job_id)
     list_request = stateless_svc.ListJobWorkflowsRequest(job_id=job_ref)
     reply = self.client.stateless_svc.ListJobWorkflows(
         list_request,
         metadata=self.client.jobmgr_metadata,
         timeout=self.config.rpc_timeout_sec,
     )
     return reply.workflow_infos
示例#6
0
 def query_pods(self):
     """
     Issue a QueryPods request for self.job_id.

     :return: the pods field of the response.
     """
     job_ref = v1alpha_peloton.JobID(value=self.job_id)
     query_request = stateless_svc.QueryPodsRequest(job_id=job_ref)
     reply = self.client.stateless_svc.QueryPods(
         query_request,
         metadata=self.client.jobmgr_metadata,
         timeout=self.config.rpc_timeout_sec,
     )
     return reply.pods
示例#7
0
 def get_job(self):
     """
     Issue a GetJob request for self.job_id.

     :return: the complete GetJob response.
     """
     job_ref = v1alpha_peloton.JobID(value=self.job_id)
     get_request = stateless_svc.GetJobRequest(job_id=job_ref)
     return self.client.stateless_svc.GetJob(
         get_request,
         metadata=self.client.jobmgr_metadata,
         timeout=self.config.rpc_timeout_sec,
     )
    def update_job(
        self,
        instance_inc,
        batch_size,
        use_instance_config,
        sleep_time,
        host_limit_1=False,
    ):
        """
        Replace the job with a spec whose instance count is the current
        instance count plus instance_inc.

        Each attempt re-reads the job info to pick up the current entity
        version and instance count. An ABORTED error whose details contain
        INVALID_ENTITY_VERSION_ERR_MESSAGE triggers another attempt; any
        other grpc.RpcError propagates.

        :param instance_inc: number of instances to add
        :param batch_size: batch size placed in the UpdateSpec
        :param use_instance_config: not read by this method
        :param sleep_time: passed to create_pod_config and used in the labels
        :param host_limit_1: passed through to create_pod_config
        :return: the ReplaceJob response
        """
        pod_config = self.create_pod_config(
            sleep_time, "static", host_limit_1=host_limit_1)
        new_spec = create_stateless_job_spec(
            "instance %s && sleep %s" % (instance_inc, sleep_time),
            [
                v1alpha_peloton.Label(key="task_num", value=str(instance_inc)),
                v1alpha_peloton.Label(key="sleep_time", value=str(sleep_time)),
            ],
            instance_inc,
            pod_config,
            self.respool_id,
        )
        rollout_spec = stateless.UpdateSpec(batch_size=batch_size)

        while True:
            # refresh both the entity version and the current instance count
            current = self.get_job_info()
            current_version = current.status.version.value
            new_spec.instance_count = (
                current.spec.instance_count + instance_inc
            )

            replace_request = stateless_svc.ReplaceJobRequest(
                job_id=v1alpha_peloton.JobID(value=self.job_id),
                version=v1alpha_peloton.EntityVersion(value=current_version),
                spec=new_spec,
                update_spec=rollout_spec,
            )
            try:
                return self.client.stateless_svc.ReplaceJob(
                    replace_request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=default_timeout,
                )
            except grpc.RpcError as rpc_err:
                version_rejected = (
                    rpc_err.code() == grpc.StatusCode.ABORTED
                    and INVALID_ENTITY_VERSION_ERR_MESSAGE in rpc_err.details()
                )
                if not version_rejected:
                    raise
                # stale version: go around and read the job info again
示例#9
0
 def list_pods(self):
     """
     Return all pods in the job.

     Iterates over the responses produced by the ListPods call and gathers
     the entries of each response's pods field into one list, in order.
     """
     list_request = stateless_svc.ListPodsRequest(
         job_id=v1alpha_peloton.JobID(value=self.job_id), )
     pages = self.client.stateless_svc.ListPods(
         list_request,
         metadata=self.client.jobmgr_metadata,
         timeout=self.config.rpc_timeout_sec)
     return [pod for page in pages for pod in page.pods]
示例#10
0
    def delete(self, entity_version=None, force_delete=False):
        """
        Delete the job.

        :param entity_version: the entity version of the job, for concurrency
            control. If provided, delete uses that value and raises if the
            version is wrong. If not provided, delete uses the cached
            self.entity_version (or the version from the job status) and, on
            a version mismatch, re-reads the version from the job status and
            retries until it is accepted.
        :param force_delete: sent as the request's force field. Per the
            original notes: when true the job is deleted even if it is
            running (it is stopped first), the step cannot be undone, and
            the job cannot be re-created with the same uuid until the delete
            completes.
        """
        version_to_send = (
            entity_version
            or self.entity_version
            or self.get_status().version.value
        )

        while True:
            delete_request = stateless_svc.DeleteJobRequest(
                job_id=v1alpha_peloton.JobID(value=self.job_id),
                version=v1alpha_peloton.EntityVersion(value=version_to_send),
                force=force_delete,
            )
            try:
                self.client.stateless_svc.DeleteJob(
                    delete_request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
            except grpc.RpcError as rpc_err:
                # Retry only for a version mismatch on a call where the
                # caller did not pin a version.
                retryable = (
                    rpc_err.code() == grpc.StatusCode.ABORTED
                    and INVALID_ENTITY_VERSION_ERR_MESSAGE in rpc_err.details()
                    and entity_version is None
                )
                if not retryable:
                    raise
                version_to_send = self.get_status().version.value
            else:
                break

        log.info("job %s deleted", self.job_id)
示例#11
0
    def restart(
        self, entity_version=None, batch_size=None, ranges=None, in_place=False
    ):
        """
        Restart pods based on the ranges.
        If ranges is not provided then it restarts all pods of the job.

        batch_size, ranges and in_place are placed in the RestartSpec of the
        request. When entity_version is not provided, a version mismatch
        makes the method re-read the version from the job status and retry;
        with an explicit entity_version the error is raised instead. On
        success the new entity version is stored on self.entity_version.

        :return: restart response from the API
        """
        version_to_send = (
            entity_version
            or self.entity_version
            or self.get_status().version.value
        )

        while True:
            restart_spec = stateless.RestartSpec(
                batch_size=batch_size, ranges=ranges, in_place=in_place
            )
            restart_request = stateless_svc.RestartJobRequest(
                job_id=v1alpha_peloton.JobID(value=self.job_id),
                version=v1alpha_peloton.EntityVersion(value=version_to_send),
                restart_spec=restart_spec,
            )
            try:
                reply = self.client.stateless_svc.RestartJob(
                    restart_request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
            except grpc.RpcError as rpc_err:
                retryable = (
                    rpc_err.code() == grpc.StatusCode.ABORTED
                    and INVALID_ENTITY_VERSION_ERR_MESSAGE in rpc_err.details()
                    and entity_version is None
                )
                if not retryable:
                    raise
                # stale version: refresh it from the job status and go again
                version_to_send = self.get_status().version.value
            else:
                break

        self.entity_version = reply.version.value
        log.info("job restarted, new entity version: %s", self.entity_version)
        return reply
示例#12
0
    def get_replace_job_diff(self, entity_version=None, job_spec=None):
        """
        Issue a GetReplaceJobDiff request for this job.

        The version sent is entity_version when given, otherwise the cached
        self.entity_version, otherwise the version read from the job status.

        :param entity_version: optional entity version to send
        :param job_spec: sent as the request's spec field
        :return: get replace job diff response.
        """
        version_to_send = (
            entity_version
            or self.entity_version
            or self.get_status().version.value
        )

        diff_request = stateless_svc.GetReplaceJobDiffRequest(
            job_id=v1alpha_peloton.JobID(value=self.job_id),
            version=v1alpha_peloton.EntityVersion(value=version_to_send),
            spec=job_spec,
        )
        return self.client.stateless_svc.GetReplaceJobDiff(
            diff_request,
            metadata=self.client.jobmgr_metadata,
            timeout=self.config.rpc_timeout_sec,
        )
示例#13
0
 def wait_for_jobmgr_available(self):
     """
     Utility method to wait for the job manager leader to come up.
     Good practice to check before all write apis.

     Sends a DeleteJob request carrying a dummy entity version. The method
     returns as soon as that call fails with a status code other than
     UNAVAILABLE. Otherwise it logs, sleeps for
     self.config.sleep_time_sec and tries again, for at most
     self.config.max_retry_attempts attempts, after which it returns
     without raising.
     """
     tries = 0
     while tries < self.config.max_retry_attempts:
         try:
             probe = stateless_svc.DeleteJobRequest(
                 job_id=v1alpha_peloton.JobID(value=self.job_id),
                 version=v1alpha_peloton.EntityVersion(
                     value="dummy-entity-version"),
             )
             self.client.stateless_svc.DeleteJob(
                 probe,
                 metadata=self.client.jobmgr_metadata,
                 timeout=self.config.rpc_timeout_sec,
             )
         except grpc.RpcError as rpc_err:
             # any code other than UNAVAILABLE ends the wait
             if rpc_err.code() != grpc.StatusCode.UNAVAILABLE:
                 return
         log.info("waiting for job manager leader")
         time.sleep(self.config.sleep_time_sec)
         tries += 1
    def pause(self, entity_version=None):
        """
        Pause the given update.

        If entity_version is provided it is sent as is and a version
        mismatch is raised to the caller. Otherwise the version cached on
        self.job (or read from the job status) is sent, and on a version
        mismatch it is re-read from the job status and the request retried.
        On success the entity version from the response is stored on
        self.job.entity_version.

        :param entity_version: optional entity version to send
        """
        # wait for job manager leader
        self.job.wait_for_jobmgr_available()

        version_to_send = (
            entity_version
            or self.job.entity_version
            or self.job.get_status().version.value
        )

        while True:
            pause_request = stateless_svc.PauseJobWorkflowRequest(
                job_id=v1alpha_peloton.JobID(value=self.job.job_id),
                version=v1alpha_peloton.EntityVersion(value=version_to_send),
            )
            try:
                reply = self.client.stateless_svc.PauseJobWorkflow(
                    pause_request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
            except grpc.RpcError as rpc_err:
                # Retry only for a version mismatch on a call where the
                # caller did not pin a version.
                retryable = (
                    rpc_err.code() == grpc.StatusCode.ABORTED
                    and INVALID_ENTITY_VERSION_ERR_MESSAGE in rpc_err.details()
                    and entity_version is None
                )
                if not retryable:
                    raise
                version_to_send = self.job.get_status().version.value
            else:
                break

        self.job.entity_version = reply.version.value
        log.info("job workflow paused: %s", self.job.entity_version)
示例#15
0
    def wait_for_workflow_state(self,
                                goal_state="SUCCEEDED",
                                failed_state="FAILED"):
        """
        Waits for the job workflow to reach a particular state.

        Polls GetJob, sleeping self.config.sleep_time_sec between polls. The
        attempt counter grows on a poll that raised and on a poll in which
        the number of completed plus failed instances did not change; it is
        reset to zero whenever that number changes.

        :param goal_state: The state to reach
        :param failed_state: The failed state of the job
        :raises AssertionError: if the workflow reaches failed_state, or if
            self.config.max_retry_attempts is exhausted before goal_state is
            reached
        """
        state = ""
        attempts = 0
        start = time.time()
        log.info("%s waiting for state workflow %s", self.job_id, goal_state)
        # convert the name from v0 state name to v1 alpha state name,
        # so the function signature can be shared between the apis
        goal_state = "WORKFLOW_STATE_" + goal_state
        failed_state = "WORKFLOW_STATE_" + failed_state
        instance_completed = 0
        while attempts < self.config.max_retry_attempts:
            try:
                request = stateless_svc.GetJobRequest(
                    job_id=v1alpha_peloton.JobID(value=self.job_id))
                resp = self.client.stateless_svc.GetJob(
                    request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
                status = resp.workflow_info.status
                new_state = stateless.WorkflowState.Name(status.state)
                if state != new_state:
                    log.info("%s transitioned to state %s", self.job_id,
                             new_state)
                state = new_state
            except Exception as e:
                # Polling is best effort: log the error, count the attempt
                # and try again after the sleep below.
                log.warn(e)
                attempts += 1
            else:
                if state == goal_state:
                    # Leave right away; no trailing sleep is needed.
                    break
                # for workflow, we only begin to count attempts when no
                # progress is made
                instances_done = (status.num_instances_completed +
                                  status.num_instances_failed)
                if instance_completed == instances_done:
                    attempts += 1
                else:
                    instance_completed = instances_done
                    attempts = 0
                if state == failed_state:
                    log.info(
                        "goal_state:%s current_state:%s attempts: %s",
                        goal_state,
                        state,
                        str(attempts),
                    )
                    # Raised explicitly (not `assert False`) so the failure
                    # survives `python -O` and carries a message. This is
                    # outside the try body, so it is not swallowed by the
                    # broad except above.
                    raise AssertionError(
                        "%s workflow reached failed state %s while waiting "
                        "for %s" % (self.job_id, state, goal_state))
            time.sleep(self.config.sleep_time_sec)

        # The loop is only left early via the goal-state break, so anything
        # else here means the attempts ran out.
        if state != goal_state:
            log.info("%s max attempts reached to wait for goal state",
                     self.job_id)
            log.info("goal_state:%s current_state:%s", goal_state, state)
            raise AssertionError(
                "%s max attempts reached waiting for workflow state %s, "
                "current state %s" % (self.job_id, goal_state, state))

        end = time.time()
        elapsed = end - start
        log.info("%s state transition took %s seconds", self.job_id, elapsed)
示例#16
0
    def stop(self, ranges=None, entity_version=None):
        """
        Stops certain pods based on the ranges.
        If ranges is not provided then it stops the job.

        Job level stop does not support range, so the pod api is used for
        range operations (kept for backward compatibility of existing
        tests): one StopPod request is sent per pod id in each range, from
        the range's "from" attribute up to, but not including, its "to"
        attribute.

        :param ranges: the instance ranges to stop
        :param entity_version: the entity version of the job, for concurrency
            control. If provided, stop uses that value and raises if the
            version is wrong. If not provided, stop re-reads the version from
            the job status on a mismatch and retries until it is accepted.
            Only used for the job level stop.
        :return: the StopJob response for a job level stop, or an empty
            StopPodResponse for a range stop
        """
        # wait for job manager leader
        self.wait_for_jobmgr_available()

        if ranges is not None:
            for pod_range in ranges:
                first_id = getattr(pod_range, "from")
                for pod_id in range(first_id, pod_range.to):
                    pod_name = self.job_id + "-" + str(pod_id)
                    pod_request = pod_svc.StopPodRequest(
                        pod_name=v1alpha_peloton.PodName(value=pod_name))
                    self.client.pod_svc.StopPod(
                        pod_request,
                        metadata=self.client.jobmgr_metadata,
                        timeout=self.config.rpc_timeout_sec,
                    )

            log.info("stopping pods in job {0} with ranges {1}".format(
                self.job_id, ranges))
            return pod_svc.StopPodResponse()

        version_to_send = (
            entity_version
            or self.entity_version
            or self.get_status().version.value
        )

        while True:
            stop_request = stateless_svc.StopJobRequest(
                job_id=v1alpha_peloton.JobID(value=self.job_id),
                version=v1alpha_peloton.EntityVersion(value=version_to_send),
            )
            try:
                reply = self.client.stateless_svc.StopJob(
                    stop_request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
            except grpc.RpcError as rpc_err:
                # Retry only for a version mismatch on a call where the
                # caller did not pin a version.
                retryable = (
                    rpc_err.code() == grpc.StatusCode.ABORTED
                    and INVALID_ENTITY_VERSION_ERR_MESSAGE in rpc_err.details()
                    and entity_version is None
                )
                if not retryable:
                    raise
                version_to_send = self.get_status().version.value
            else:
                break

        self.entity_version = reply.version.value
        log.info("job stopped, new entity version: %s",
                 self.entity_version)
        return reply
示例#17
0
    def wait_for_state(self, goal_state='SUCCEEDED', failed_state='FAILED'):
        """
        Waits for the job to reach a particular state.

        Polls GetJob up to self.config.max_retry_attempts times, sleeping
        self.config.sleep_time_sec between polls. A poll that raises is
        logged and counted like any other attempt.

        :param goal_state: The state to reach
        :param failed_state: The failed state of the job
        :raises AssertionError: if the job reaches failed_state, or if the
            attempts are exhausted before goal_state is reached
        """
        state = ''
        attempts = 0
        start = time.time()
        log.info('%s waiting for state %s', self.job_id, goal_state)
        # convert the name from v0 state name to v1 alpha state name,
        # so the function signature can be shared between the apis
        goal_state = 'JOB_STATE_' + goal_state
        failed_state = 'JOB_STATE_' + failed_state
        while attempts < self.config.max_retry_attempts:
            try:
                request = stateless_svc.GetJobRequest(
                    job_id=v1alpha_peloton.JobID(value=self.job_id), )
                resp = self.client.stateless_svc.GetJob(
                    request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )
                status = resp.job_info.status
                new_state = stateless.JobState.Name(status.state)
                if state != new_state:
                    log.info('%s transitioned to state %s', self.job_id,
                             new_state)
                state = new_state
            except Exception as e:
                # Polling is best effort: log the error and try again after
                # the sleep below.
                log.warn(e)
            else:
                if state == goal_state:
                    # Leave before the sleep/increment below. Previously a
                    # `finally` block still bumped `attempts` after this
                    # break, so reaching the goal on the last permitted
                    # attempt was misreported as a timeout.
                    break
                if state == failed_state:
                    log.info('goal_state:%s current_state:%s attempts: %s',
                             goal_state, state, str(attempts))
                    # Raised explicitly (not `assert False`) so the failure
                    # survives `python -O` and carries a message. This is
                    # outside the try body, so it is not swallowed by the
                    # broad except above.
                    raise AssertionError(
                        '%s reached failed state %s while waiting for %s'
                        % (self.job_id, state, goal_state))
            time.sleep(self.config.sleep_time_sec)
            attempts += 1

        # The loop is only left early via the goal-state break, so anything
        # else here means the attempts ran out.
        if state != goal_state:
            log.info('%s max attempts reached to wait for goal state',
                     self.job_id)
            log.info('goal_state:%s current_state:%s', goal_state, state)
            raise AssertionError(
                '%s max attempts reached waiting for state %s, '
                'current state %s' % (self.job_id, goal_state, state))

        end = time.time()
        elapsed = end - start
        log.info('%s state transition took %s seconds', self.job_id, elapsed)
示例#18
0
    def start(self, ranges=None, entity_version=None):
        """
        Starts certain pods based on the ranges.
        If ranges is not provided it starts all pods of the job.

        Job level start does not support range.
        We are using pod api for range operation.
        We do this for backward compatibility of existing tests.

        :param ranges: the instance ranges to start; for each range one
            StartPod request is sent per pod id from the range's "from"
            attribute up to, but not including, its "to" attribute
        :param entity_version: the entity version of the job, for concurrency
            control. If entity_version is provided, start will use the
            provided value, and raise an exception if version is wrong.
            If entity_version is not provided, start will re-read the
            version from the job status on a mismatch and retry until the
            version is correct. Only used for the job level start.
        :return: the StartJob response for a job level start, or an empty
            StartPodResponse for a range start
        """
        if ranges is None:
            job_entity_version = (
                entity_version
                or self.entity_version
                or self.get_status().version.value
            )
            # The other retry loops in this file detect a stale entity
            # version via ABORTED, while this one only matched
            # INVALID_ARGUMENT. Accept both so the retry works whichever
            # code the job manager replies with; the error-message check
            # below still limits it to version mismatches.
            stale_version_codes = (
                grpc.StatusCode.ABORTED,
                grpc.StatusCode.INVALID_ARGUMENT,
            )

            while True:
                request = stateless_svc.StartJobRequest(
                    job_id=v1alpha_peloton.JobID(value=self.job_id),
                    version=v1alpha_peloton.EntityVersion(
                        value=job_entity_version),
                )
                try:
                    resp = self.client.stateless_svc.StartJob(
                        request,
                        metadata=self.client.jobmgr_metadata,
                        timeout=self.config.rpc_timeout_sec,
                    )
                except grpc.RpcError as e:
                    # if entity version is incorrect, get entity version
                    # from job status and try again.
                    if (
                        e.code() in stale_version_codes
                        and INVALID_ENTITY_VERSION_ERR_MESSAGE in e.details()
                        and entity_version is None
                    ):
                        # entity_version is None on this path, so the
                        # refreshed value always comes from the job status.
                        job_entity_version = self.get_status().version.value
                        continue
                    raise
                break
            self.entity_version = resp.version.value
            log.info('job started, new entity version: %s',
                     self.entity_version)
            return resp

        for pod_range in ranges:
            for pod_id in range(getattr(pod_range, 'from'), pod_range.to):
                pod_name = self.job_id + '-' + str(pod_id)
                request = pod_svc.StartPodRequest(
                    pod_name=v1alpha_peloton.PodName(value=pod_name), )
                self.client.pod_svc.StartPod(
                    request,
                    metadata=self.client.jobmgr_metadata,
                    timeout=self.config.rpc_timeout_sec,
                )

        log.info('starting pods in job {0} with ranges {1}'.format(
            self.job_id, ranges))
        return pod_svc.StartPodResponse()