def finish_job(self, job_state):
    """Run base finish handling, then delete the matching Kubernetes Job.

    The k8s Job is looked up by the unique ``app=<job_id>`` selector and
    is only cleaned up when exactly one Job matches.
    """
    super(KubernetesJobRunner, self).finish_job(job_state)
    selector = "app=" + job_state.job_id
    matching = Job.objects(self._pykube_api).filter(
        selector=selector,
        namespace=self.runner_params['k8s_namespace'])
    items = matching.response['items']
    # If more than one job matches selector, leave all jobs intact as it's a configuration error
    if len(items) == 1:
        self.__cleanup_k8s_job(Job(self._pykube_api, items[0]))
def finish_job(self, job_state):
    """Run base finish handling, then delete the matching Kubernetes Job.

    The k8s Job is looked up by the unique ``app=<job_id>`` selector.
    If no Job matches, there is nothing to clean up; if more than one
    matches, that is a configuration error, but the first match is
    still cleaned up (preserving previous behavior for that case).
    """
    super(KubernetesJobRunner, self).finish_job(job_state)
    jobs = Job.objects(self._pykube_api).filter(
        selector="app=" + job_state.job_id,
        namespace=self.runner_params['k8s_namespace'])
    items = jobs.response['items']
    if len(items) == 0:
        # Bug fix: the previous code warned and then indexed items[0]
        # unconditionally, raising IndexError when no Job matched.
        log.warning("No job matches selector. Possible configuration error"
                    " in job id '%s'", job_state.job_id)
        return
    if len(items) > 1:
        log.warning("More than one job matches selector. Possible configuration error"
                    " in job id '%s'", job_state.job_id)
    job = Job(self._pykube_api, items[0])
    self.__cleanup_k8s_job(job)
def _active_kubernetes_jobs(self):
    """Return the total 'active' count across k8s Jobs belonging to this instance.

    A Job is considered ours when this instance's id appears in its
    metadata name.
    """
    pykube_api = pykube_client_from_dict({})
    # TODO: namespace.
    all_jobs = Job.objects(pykube_api).filter()
    return sum(
        job.obj["status"].get("active", 0)
        for job in all_jobs
        if self.instance_id in job.obj["metadata"]["name"]
    )
def stop_job(self, job_wrapper):
    """Attempts to delete a dispatched job to the k8s cluster"""
    job = job_wrapper.get_job()
    try:
        # Look up the k8s Job by the unique name derived from the Galaxy job id.
        selector = "app=" + self.__produce_unique_k8s_job_name(job.get_id_tag())
        matching = Job.objects(self._pykube_api).filter(
            selector=selector,
            namespace=self.runner_params['k8s_namespace'])
        items = matching.response['items']
        if items:
            self.__cleanup_k8s_job(Job(self._pykube_api, items[0]))
        # TODO assert whether job parallelism == 0
        # assert not job_to_delete.exists(), "Could not delete job,"+job.job_runner_external_id+" it still exists"
        log.debug("(%s/%s) Terminated at user's request" % (job.id, job.job_runner_external_id))
    except Exception as e:
        # Best-effort termination: log and swallow any error so a failed
        # delete does not break the user's cancel request.
        log.debug("(%s/%s) User killed running job, but error encountered during termination: %s" % (
            job.id, job.job_runner_external_id, e))
def check_watched_item(self, job_state):
    """Check the state of a job already submitted on k8s.

    ``job_state`` is an AsynchronousJobState. Returns the (possibly
    updated) job state to keep watching it, or ``None`` to stop
    watching (job succeeded, failed terminally, or was deleted).
    """
    jobs = Job.objects(self._pykube_api).filter(
        selector="app=" + job_state.job_id,
        namespace=self.runner_params['k8s_namespace'])
    items = jobs.response['items']
    if len(items) == 1:
        job = Job(self._pykube_api, items[0])
        job_destination = job_state.job_wrapper.job_destination

        # Maximum pod failures tolerated before the Galaxy job is
        # failed. Destination params take precedence over runner
        # params; the "*retrials" spellings are kept for backward
        # compatibility.
        if 'max_pod_retries' in job_destination.params:
            max_pod_retries = int(job_destination.params['max_pod_retries'])
        elif 'k8s_pod_retries' in self.runner_params:
            max_pod_retries = int(self.runner_params['k8s_pod_retries'])
        elif 'max_pod_retrials' in job_destination.params:
            # For backward compatibility
            max_pod_retries = int(job_destination.params['max_pod_retrials'])
        elif 'k8s_pod_retrials' in self.runner_params:
            # For backward compatibility.
            # Bug fix: read the same key that was tested
            # ('k8s_pod_retrials'); the old code read
            # 'max_pod_retrials' here and raised KeyError.
            max_pod_retries = int(self.runner_params['k8s_pod_retrials'])
        else:
            max_pod_retries = 1

        # Check if job.obj['status'] is empty,
        # return job_state unchanged if this is the case
        # as probably this means that the k8s API server hasn't
        # had time to fill in the object status since the
        # job was created only too recently.
        status = job.obj['status']
        if len(status) == 0:
            return job_state
        succeeded = status.get('succeeded', 0)
        active = status.get('active', 0)
        failed = status.get('failed', 0)

        # This assumes jobs dependent on a single pod, single container
        if succeeded > 0:
            job_state.running = False
            self.mark_as_finished(job_state)
            return None
        elif active > 0 and failed <= max_pod_retries:
            if not job_state.running:
                job_state.running = True
                job_state.job_wrapper.change_state(model.Job.states.RUNNING)
            return job_state
        elif job_state.job_wrapper.get_job().state == model.Job.states.DELETED:
            # Job has been deleted via stop_job; remove it from
            # watched_jobs by returning `None`.
            if job_state.job_wrapper.cleanup_job in ("always", "onsuccess"):
                job_state.job_wrapper.cleanup()
            return None
        else:
            return self._handle_job_failure(job, job_state)
    elif len(items) == 0:
        if job_state.job_wrapper.get_job().state == model.Job.states.DELETED:
            # Job has been deleted via stop_job and the k8s Job is gone;
            # cleanup and remove from watched_jobs by returning `None`.
            if job_state.job_wrapper.cleanup_job in ("always", "onsuccess"):
                job_state.job_wrapper.cleanup()
            return None
        # there is no job responding to this job_id, it is either lost or something happened.
        log.error("No Jobs are available under expected selector app=%s", job_state.job_id)
        with open(job_state.error_file, 'w') as error_file:
            error_file.write(
                "No Kubernetes Jobs are available under expected selector app=%s\n" % job_state.job_id)
        self.mark_as_failed(job_state)
        return job_state
    else:
        # there is more than one job associated to the expected unique job id used as selector.
        log.error("More than one Kubernetes Job associated to job id '%s'", job_state.job_id)
        with open(job_state.error_file, 'w') as error_file:
            error_file.write(
                "More than one Kubernetes Job associated with job id '%s'\n" % job_state.job_id)
        self.mark_as_failed(job_state)
        return job_state