def finish_job(self, job_state):
    """Finalize a completed Galaxy job and delete its Kubernetes Job.

    Runs the base-class finalization first, then looks up the k8s Job whose
    name/selector is the Galaxy job id and cleans it up.

    :param job_state: AsynchronousJobState for the finished job; its
        ``job_id`` is the unique k8s Job name used as the lookup selector.
    """
    super().finish_job(job_state)
    jobs = find_job_object_by_name(self._pykube_api, job_state.job_id, self.runner_params['k8s_namespace'])
    items = jobs.response['items']
    if not items:
        # No matching k8s Job: nothing to clean up. Bail out instead of
        # indexing items[0], which would raise IndexError.
        log.warning("No k8s job found which matches job id '%s'. Ignoring...", job_state.job_id)
        return
    if len(items) > 1:
        # The selector is expected to be unique; more than one match hints
        # at a configuration error. Still clean up the first match.
        log.warning(
            "More than one job matches selector. Possible configuration error"
            " in job id '%s'", job_state.job_id)
    job = Job(self._pykube_api, items[0])
    self.__cleanup_k8s_job(job)
def finish_job(self, job_state):
    """Finalize a completed Galaxy job and delete its Kubernetes Job.

    Handles external metadata resolution, runs the base-class finalization,
    then looks up the k8s Job by the Galaxy job id and cleans it up
    (including any guest-port ingress/service objects, if configured).

    :param job_state: AsynchronousJobState for the finished job; its
        ``job_id`` is the unique k8s Job name used as the lookup selector.
    """
    self._handle_metadata_externally(job_state.job_wrapper, resolve_requirements=True)
    super().finish_job(job_state)
    jobs = find_job_object_by_name(self._pykube_api, job_state.job_id, self.runner_params['k8s_namespace'])
    if len(jobs.response['items']) > 1:
        # Use lazy logger arguments (consistent with the branches below)
        # instead of eager %-formatting.
        log.warning(
            "More than one job matches selector: %s. Possible configuration error"
            " in job id '%s'", jobs.response['items'], job_state.job_id)
    elif len(jobs.response['items']) == 0:
        log.warning("No k8s job found which matches job id '%s'. Ignoring...", job_state.job_id)
    else:
        job = Job(self._pykube_api, jobs.response['items'][0])
        if self.__has_guest_ports(job_state.job_wrapper):
            # Interactive-tool jobs get extra k8s objects (service/ingress)
            # that must be torn down alongside the Job itself.
            self.__cleanup_k8s_guest_ports(job_state.job_wrapper, job)
        self.__cleanup_k8s_job(job)
def stop_job(self, job_wrapper):
    """Attempts to delete a dispatched job to the k8s cluster"""
    job = job_wrapper.get_job()
    try:
        # Rebuild the unique k8s Job name from the Galaxy job's id tag and
        # look the Job object up in the configured namespace.
        k8s_job_name = self.__produce_unique_k8s_job_name(job.get_id_tag())
        job_to_delete = find_job_object_by_name(
            self._pykube_api, k8s_job_name, self.runner_params['k8s_namespace'])
        if job_to_delete:
            self.__cleanup_k8s_job(job_to_delete)
        # TODO assert whether job parallelism == 0
        # assert not job_to_delete.exists(), "Could not delete job,"+job.job_runner_external_id+" it still exists"
        log.debug("(%s/%s) Terminated at user's request" % (job.id, job.job_runner_external_id))
    except Exception as e:
        # Best-effort termination: record the failure but do not re-raise.
        log.exception("(%s/%s) User killed running job, but error encountered during termination: %s" % (
            job.id, job.job_runner_external_id, e))
def stop_job(self, job_wrapper):
    """Attempts to delete a dispatched job to the k8s cluster"""
    job = job_wrapper.get_job()
    try:
        # Look the k8s Job up by the external runner id in the configured
        # namespace; only act when the query produced at least one match.
        query = find_job_object_by_name(
            self._pykube_api, job.get_job_runner_external_id(),
            self.runner_params['k8s_namespace'])
        matches = query.response['items'] if query else []
        if matches:
            k8s_job = Job(self._pykube_api, matches[0])
            if self.__has_guest_ports(job_wrapper):
                # Tear down interactive-tool port objects before the Job.
                self.__cleanup_k8s_guest_ports(job_wrapper, k8s_job)
            self.__cleanup_k8s_job(k8s_job)
        # TODO assert whether job parallelism == 0
        # assert not job_to_delete.exists(), "Could not delete job,"+job.job_runner_external_id+" it still exists"
        log.debug(f"({job.id}/{job.job_runner_external_id}) Terminated at user's request")
    except Exception as e:
        # Best-effort termination: record the failure but do not re-raise.
        log.exception("({}/{}) User killed running job, but error encountered during termination: {}".format(
            job.id, job.get_job_runner_external_id(), e))
def check_watched_item(self, job_state):
    """Checks the state of a job already submitted on k8s. Job state is an AsynchronousJobState.

    Returns ``job_state`` to keep watching, or ``None`` to drop the job from
    the watched list (finished, failed, or deleted).
    """
    jobs = find_job_object_by_name(self._pykube_api, job_state.job_id, self.runner_params['k8s_namespace'])
    if len(jobs.response['items']) == 1:
        job = Job(self._pykube_api, jobs.response['items'][0])
        job_destination = job_state.job_wrapper.job_destination
        succeeded = 0
        active = 0
        failed = 0

        # Per-destination retry limit takes precedence over the runner-wide
        # setting; default to a single attempt.
        if 'max_pod_retries' in job_destination.params:
            max_pod_retries = int(job_destination.params['max_pod_retries'])
        elif 'k8s_pod_retries' in self.runner_params:
            max_pod_retries = int(self.runner_params['k8s_pod_retries'])
        else:
            max_pod_retries = 1

        # Check if job.obj['status'] is empty,
        # return job_state unchanged if this is the case
        # as probably this means that the k8s API server hasn't
        # had time to fill in the object status since the
        # job was created only too recently.
        if len(job.obj['status']) == 0:
            return job_state
        if 'succeeded' in job.obj['status']:
            succeeded = job.obj['status']['succeeded']
        if 'active' in job.obj['status']:
            active = job.obj['status']['active']
        if 'failed' in job.obj['status']:
            failed = job.obj['status']['failed']

        job_persisted_state = job_state.job_wrapper.get_state()

        # This assumes jobs dependent on a single pod, single container.
        # BUGFIX: compare the persisted Galaxy state string (as the DELETED
        # branch below does), not the AsynchronousJobState object itself --
        # `job_state == model.Job.states.STOPPED` was always False, so
        # stopped jobs were never marked finished.
        if succeeded > 0 or job_persisted_state == model.Job.states.STOPPED:
            job_state.running = False
            self.mark_as_finished(job_state)
            return None
        elif active > 0 and failed <= max_pod_retries:
            if not job_state.running:
                if self.__job_pending_due_to_unschedulable_pod(job_state):
                    # Optionally give up on jobs whose pod has stayed
                    # unschedulable longer than the configured walltime.
                    if self.runner_params.get('k8s_unschedulable_walltime_limit'):
                        creation_time_str = job.obj['metadata'].get('creationTimestamp')
                        creation_time = datetime.strptime(creation_time_str, '%Y-%m-%dT%H:%M:%SZ')
                        elapsed_seconds = (datetime.utcnow() - creation_time).total_seconds()
                        if elapsed_seconds > self.runner_params['k8s_unschedulable_walltime_limit']:
                            return self._handle_unschedulable_job(job, job_state)
                else:
                    job_state.running = True
                    job_state.job_wrapper.change_state(model.Job.states.RUNNING)
            return job_state
        elif job_persisted_state == model.Job.states.DELETED:
            # Job has been deleted via stop_job and job has not been deleted,
            # remove from watched_jobs by returning `None`
            if job_state.job_wrapper.cleanup_job in ("always", "onsuccess"):
                job_state.job_wrapper.cleanup()
            return None
        else:
            return self._handle_job_failure(job, job_state)
    elif len(jobs.response['items']) == 0:
        if job_state.job_wrapper.get_job().state == model.Job.states.DELETED:
            # Job has been deleted via stop_job and job has been deleted,
            # cleanup and remove from watched_jobs by returning `None`
            if job_state.job_wrapper.cleanup_job in ("always", "onsuccess"):
                job_state.job_wrapper.cleanup()
            return None
        # there is no job responding to this job_id, it is either lost or something happened.
        log.error("No Jobs are available under expected selector app=%s", job_state.job_id)
        self.mark_as_failed(job_state)
        # job is no longer viable - remove from watched jobs
        return None
    else:
        # there is more than one job associated to the expected unique job id used as selector.
        log.error("More than one Kubernetes Job associated to job id '%s'", job_state.job_id)
        self.mark_as_failed(job_state)
        # job is no longer viable - remove from watched jobs
        return None