def stop_job(self, job): """Attempts to delete a dispatched job to the k8s cluster""" try: jobs = Job.objects(self._pykube_api).filter(selector="app=" + self.__produce_unique_k8s_job_name(job.get_id_tag())) if len(jobs.response['items']) >= 0: job_to_delete = Job(self._pykube_api, jobs.response['items'][0]) job_to_delete.scale(replicas=0) # TODO assert whether job parallelism == 0 # assert not job_to_delete.exists(), "Could not delete job,"+job.job_runner_external_id+" it still exists" log.debug("(%s/%s) Terminated at user's request" % (job.id, job.job_runner_external_id)) except Exception as e: log.debug("(%s/%s) User killed running job, but error encountered during termination: %s" % ( job.id, job.job_runner_external_id, e))
def stop_job(self, job_wrapper):
    """Attempts to delete a job dispatched to the k8s cluster"""
    job = job_wrapper.get_job()
    try:
        jobs = Job.objects(self._pykube_api).filter(
            selector="app=" + self.__produce_unique_k8s_job_name(job.get_id_tag()))
        # Scale the Job down only if the selector actually matched one.
        if len(jobs.response['items']) > 0:
            job_to_delete = Job(self._pykube_api, jobs.response['items'][0])
            job_to_delete.scale(replicas=0)
        # TODO assert whether job parallelism == 0
        # assert not job_to_delete.exists(), "Could not delete job,"+job.job_runner_external_id+" it still exists"
        log.debug("(%s/%s) Terminated at user's request" % (job.id, job.job_runner_external_id))
    except Exception as e:
        log.debug("(%s/%s) User killed running job, but error encountered during termination: %s" % (
            job.id, job.job_runner_external_id, e))
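
# Both stop_job variants above share the same pattern: locate the Kubernetes Job
# by the unique "app=<job name>" label attached at submission time, then scale it
# to zero replicas so its pods stop while the Job object (and its logs) remain.
# The sketch below exercises that pattern against pykube directly, outside the
# runner. The kubeconfig path and the example label value are assumptions for
# illustration only.
import pykube
from pykube.objects import Job

api = pykube.HTTPClient(pykube.KubeConfig.from_file("~/.kube/config"))

# Query by the same kind of label selector the runner would build for this job.
matching = Job.objects(api).filter(selector="app=galaxy-job-123")  # hypothetical label

if len(matching.response['items']) > 0:
    k8s_job = Job(api, matching.response['items'][0])
    # Scaling to zero stops the pods but keeps the Job around for log retrieval.
    k8s_job.scale(replicas=0)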
def __get_job_status(self):
    """Return the Kubernetes job status"""
    # Look for the required job
    jobs = Job.objects(self.__kube_api).filter(selector="luigi_task_id=" + self.job_uuid)
    # Raise an exception if no such job found
    if len(jobs.response["items"]) == 0:
        raise RuntimeError("Kubernetes job " + self.uu_name + " not found")
    # Figure out status and return it
    job = Job(self.__kube_api, jobs.response["items"][0])
    if "succeeded" in job.obj["status"] and job.obj["status"]["succeeded"] > 0:
        job.scale(replicas=0)  # Downscale the job, but keep it for logging
        return "succeeded"
    if "failed" in job.obj["status"]:
        failed_cnt = job.obj["status"]["failed"]
        self.__logger.debug("Kubernetes job " + self.uu_name
                            + " status.failed: " + str(failed_cnt))
        if failed_cnt > self.max_retrials:
            job.scale(replicas=0)  # avoid more retrials
            return "failed"
    return "running"
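
# The Luigi status check above reduces the Job's "status" block to one of three
# strings. A pure-function restatement of that mapping, handy for exercising the
# branching without a cluster, might look like the following. The function name
# and the sample dictionaries are illustrative only; the "succeeded"/"failed"
# keys are the ones __get_job_status actually reads.
def classify_job_status(status, max_retrials):
    """Map a Kubernetes Job status dict to 'succeeded', 'failed' or 'running'."""
    if status.get("succeeded", 0) > 0:
        return "succeeded"
    if status.get("failed", 0) > max_retrials:
        return "failed"
    return "running"

assert classify_job_status({"succeeded": 1}, max_retrials=0) == "succeeded"
assert classify_job_status({"failed": 3, "active": 1}, max_retrials=2) == "failed"
assert classify_job_status({"active": 1}, max_retrials=2) == "running"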
def check_watched_item(self, job_state):
    """Checks the state of a job already submitted on k8s. Job state is an AsynchronousJobState"""
    jobs = Job.objects(self._pykube_api).filter(selector="app=" + job_state.job_id)
    if len(jobs.response['items']) == 1:
        job = Job(self._pykube_api, jobs.response['items'][0])
        job_destination = job_state.job_wrapper.job_destination
        succeeded = 0
        active = 0
        failed = 0

        max_pod_retrials = 1
        if 'k8s_pod_retrials' in self.runner_params:
            max_pod_retrials = int(self.runner_params['k8s_pod_retrials'])
        if 'max_pod_retrials' in job_destination.params:
            max_pod_retrials = int(job_destination.params['max_pod_retrials'])

        if 'succeeded' in job.obj['status']:
            succeeded = job.obj['status']['succeeded']
        if 'active' in job.obj['status']:
            active = job.obj['status']['active']
        if 'failed' in job.obj['status']:
            failed = job.obj['status']['failed']

        # This assumes jobs dependent on a single pod, single container
        if succeeded > 0:
            self.__produce_log_file(job_state)
            error_file = open(job_state.error_file, 'w')
            error_file.write("")
            error_file.close()
            job_state.running = False
            self.mark_as_finished(job_state)
            return None
        elif active > 0 and failed <= max_pod_retrials:
            job_state.running = True
            return job_state
        elif failed > max_pod_retrials:
            self.__produce_log_file(job_state)
            error_file = open(job_state.error_file, 'w')
            error_file.write("Exceeded max number of Kubernetes pod retrials allowed for job\n")
            error_file.close()
            job_state.running = False
            job_state.fail_message = "More pods failed than allowed. See stdout for pods details."
            self.mark_as_failed(job_state)
            job.scale(replicas=0)
            return None
        # We should not get here
        log.debug("Reaching unexpected point for Kubernetes job, where it is not classified as succ., active nor failed.")
        return job_state
    elif len(jobs.response['items']) == 0:
        # there is no job responding to this job_id, it is either lost or something happened.
        log.error("No Jobs are available under expected selector app=" + job_state.job_id)
        error_file = open(job_state.error_file, 'w')
        error_file.write("No Kubernetes Jobs are available under expected selector app=" + job_state.job_id + "\n")
        error_file.close()
        self.mark_as_failed(job_state)
        return job_state
    else:
        # there is more than one job associated to the expected unique job id used as selector.
        log.error("There is more than one Kubernetes Job associated to job id " + job_state.job_id)
        self.__produce_log_file(job_state)
        error_file = open(job_state.error_file, 'w')
        error_file.write("There is more than one Kubernetes Job associated to job id " + job_state.job_id + "\n")
        error_file.close()
        self.mark_as_failed(job_state)
        return job_state
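
# check_watched_item above makes two decisions per poll: which retry limit applies
# (destination params override runner params, defaulting to 1) and which of the
# succeeded/active/failed counters wins. The helpers below restate those decisions
# as standalone functions under the same single-pod, single-container assumption;
# the function names and the 'finished'/'running'/'failed'/'unknown' labels are
# hypothetical and only mirror the branches above.
def resolve_max_pod_retrials(runner_params, destination_params):
    """Destination-level setting takes precedence over the runner-level one."""
    max_pod_retrials = 1
    if 'k8s_pod_retrials' in runner_params:
        max_pod_retrials = int(runner_params['k8s_pod_retrials'])
    if 'max_pod_retrials' in destination_params:
        max_pod_retrials = int(destination_params['max_pod_retrials'])
    return max_pod_retrials


def classify_galaxy_job(status, max_pod_retrials):
    """Mirror the succeeded/active/failed branching of check_watched_item."""
    succeeded = status.get('succeeded', 0)
    active = status.get('active', 0)
    failed = status.get('failed', 0)
    if succeeded > 0:
        return 'finished'
    if active > 0 and failed <= max_pod_retrials:
        return 'running'
    if failed > max_pod_retrials:
        return 'failed'
    return 'unknown'  # corresponds to the "we should not get here" branch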