def finish_job(self, job_state):
    super(KubernetesJobRunner, self).finish_job(job_state)
    jobs = Job.objects(self._pykube_api).filter(
        selector="app=" + job_state.job_id,
        namespace=self.runner_params['k8s_namespace'])
    # If more than one job matches the selector, leave all jobs intact, as it's a configuration error
    if len(jobs.response['items']) == 1:
        job = Job(self._pykube_api, jobs.response['items'][0])
        self.__cleanup_k8s_job(job)

def finish_job(self, job_state):
    super(KubernetesJobRunner, self).finish_job(job_state)
    jobs = Job.objects(self._pykube_api).filter(
        selector="app=" + job_state.job_id,
        namespace=self.runner_params['k8s_namespace'])
    if len(jobs.response['items']) != 1:
        # Zero or multiple matches: indexing the first item would be wrong (or raise an
        # IndexError on zero matches), so log and leave the k8s jobs untouched.
        log.warning("Expected one k8s job but found %d. Possible configuration error"
                    " in job id '%s'", len(jobs.response['items']), job_state.job_id)
        return
    job = Job(self._pykube_api, jobs.response['items'][0])
    self.__cleanup_k8s_job(job)

def stop_job(self, job_wrapper):
    """Attempts to delete a job previously dispatched to the k8s cluster"""
    job = job_wrapper.get_job()
    try:
        jobs = Job.objects(self._pykube_api).filter(
            selector="app=" + self.__produce_unique_k8s_job_name(job.get_id_tag()),
            namespace=self.runner_params['k8s_namespace'])
        if len(jobs.response['items']) > 0:
            job_to_delete = Job(self._pykube_api, jobs.response['items'][0])
            self.__cleanup_k8s_job(job_to_delete)
        # TODO assert whether job parallelism == 0
        # assert not job_to_delete.exists(), "Could not delete job," + job.job_runner_external_id + " it still exists"
        log.debug("(%s/%s) Terminated at user's request" % (job.id, job.job_runner_external_id))
    except Exception as e:
        log.debug("(%s/%s) User killed running job, but error encountered during termination: %s" % (
            job.id, job.job_runner_external_id, e))

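# A minimal sketch of the __produce_unique_k8s_job_name helper referenced above and in
# the queue_job variants below. This is an assumption about its behavior, not the actual
# Galaxy implementation: it derives a k8s-legal, per-instance-unique job name from the
# Galaxy job id tag (k8s object names must be lowercase DNS-1123 labels).
def __produce_unique_k8s_job_name(self, galaxy_internal_job_id):
    instance_id = self._galaxy_instance_id or ''
    # e.g. "galaxy-<instance>-123", or "galaxy-123" when no instance id is configured
    parts = ("galaxy", instance_id, str(galaxy_internal_job_id))
    return "-".join(p for p in parts if p).lower()
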
def finish_job(self, job_state):
    super().finish_job(job_state)
    jobs = find_job_object_by_name(self._pykube_api, job_state.job_id, self.runner_params['k8s_namespace'])
    if len(jobs.response['items']) != 1:
        # Zero or multiple matches: don't guess which k8s job to clean up.
        log.warning("Expected one k8s job but found %d. Possible configuration error"
                    " in job id '%s'", len(jobs.response['items']), job_state.job_id)
        return
    job = Job(self._pykube_api, jobs.response['items'][0])
    self.__cleanup_k8s_job(job)

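# A hedged sketch of the find_job_object_by_name helper used above; assuming it simply
# wraps the same selector-based pykube query that earlier revisions of finish_job issued
# inline, returning the query object whose .response['items'] the callers inspect.
from pykube import Job

def find_job_object_by_name(pykube_api, job_name, namespace=None):
    # Query k8s Job objects labelled app=<job_name>, optionally scoped to a namespace.
    return Job.objects(pykube_api).filter(selector=f"app={job_name}", namespace=namespace)
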
def queue_job(self, job_wrapper):
    """Create job script and submit it to Kubernetes cluster"""
    # prepare the job
    # We currently don't need include_metadata or include_work_dir_outputs, as the working directory is the same
    # one where Galaxy will expect results.
    log.debug("Starting queue_job for job " + job_wrapper.get_id_tag())
    ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory,
                               job_wrapper=job_wrapper,
                               job_destination=job_wrapper.job_destination)

    if not self.prepare_job(job_wrapper,
                            include_metadata=False,
                            modify_command_for_container=False,
                            stdout_file=ajs.output_file,
                            stderr_file=ajs.error_file):
        return

    script = self.get_job_file(job_wrapper, exit_code_path=ajs.exit_code_file,
                               shell=job_wrapper.shell, galaxy_virtual_env=None)
    try:
        self.write_executable_script(ajs.job_file, script)
    except Exception:
        job_wrapper.fail("failure preparing job script", exception=True)
        log.exception("(%s) failure writing job script" % job_wrapper.get_id_tag())
        return

    # Construction of the Kubernetes Job object follows: https://kubernetes.io/docs/concepts/workloads/controllers/job/
    k8s_job_prefix = self.__produce_k8s_job_prefix()
    k8s_job_obj = job_object_dict(self.runner_params, k8s_job_prefix, self.__get_k8s_job_spec(ajs))
    job = Job(self._pykube_api, k8s_job_obj)
    job.create()
    job_id = job.metadata['name']

    # define job attributes in the AsynchronousJobState for follow-up
    ajs.job_id = job_id
    # store runner information for tracking if Galaxy restarts
    job_wrapper.set_external_id(job_id)
    self.monitor_queue.put(ajs)

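# For reference, a minimal sketch of what job_object_dict plausibly assembles -- a
# hypothetical reconstruction inferred from how it is called above, not the verbatim
# helper. Because queue_job passes only a prefix, the final name can be left to k8s via
# metadata.generateName, which is why the generated name is read back from
# job.metadata['name'] after job.create().
def job_object_dict(params, job_prefix, spec):
    return {
        "apiVersion": params.get('k8s_job_api_version', 'batch/v1'),
        "kind": "Job",
        "metadata": {
            # k8s appends a random suffix to generateName to build a unique job name
            "generateName": f"{job_prefix}-",
            "namespace": params.get('k8s_namespace', 'default'),
        },
        "spec": spec,
    }
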
def _active_kubernetes_jobs(self):
    pykube_api = pykube_client_from_dict({})
    # TODO: namespace.
    jobs = Job.objects(pykube_api).filter()
    active = 0
    for job in jobs:
        if self.instance_id not in job.obj["metadata"]["name"]:
            continue
        status = job.obj["status"]
        active += status.get("active", 0)
    return active

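# For reference, the `status` block consulted above is filled in by the k8s Job
# controller and typically looks like the following (a hypothetical example):
#
#   {"active": 1, "succeeded": 0, "failed": 0, "startTime": "2021-01-01T00:00:00Z"}
#
# so summing status.get("active", 0) over the jobs whose names contain this instance_id
# yields the number of k8s jobs still running for this Galaxy instance.
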
def finish_job(self, job_state):
    self._handle_metadata_externally(job_state.job_wrapper, resolve_requirements=True)
    super().finish_job(job_state)
    jobs = find_job_object_by_name(self._pykube_api, job_state.job_id, self.runner_params['k8s_namespace'])
    if len(jobs.response['items']) > 1:
        log.warning("More than one job matches selector: %s. Possible configuration error"
                    " in job id '%s'", jobs.response['items'], job_state.job_id)
    elif len(jobs.response['items']) == 0:
        log.warning("No k8s job found which matches job id '%s'. Ignoring...", job_state.job_id)
    else:
        job = Job(self._pykube_api, jobs.response['items'][0])
        if self.__has_guest_ports(job_state.job_wrapper):
            self.__cleanup_k8s_guest_ports(job_state.job_wrapper, job)
        self.__cleanup_k8s_job(job)

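# A minimal sketch of the __cleanup_k8s_job helper invoked throughout, assuming it
# honors a configurable cleanup policy (e.g. "always" / "onsuccess" / "never") before
# deleting the k8s Job object; the real helper may differ in detail.
def __cleanup_k8s_job(self, job):
    cleanup = self.runner_params.get('k8s_cleanup_job', 'always')
    job_failed = job.obj['status'].get('failed', 0) > 0
    if cleanup == "always" or (cleanup == "onsuccess" and not job_failed):
        # pykube's Job.delete() issues the DELETE call against the k8s API
        job.delete()
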
def stop_job(self, job_wrapper):
    """Attempts to delete a job previously dispatched to the k8s cluster"""
    job = job_wrapper.get_job()
    try:
        job_to_delete = find_job_object_by_name(
            self._pykube_api, job.get_job_runner_external_id(), self.runner_params['k8s_namespace'])
        if job_to_delete and len(job_to_delete.response['items']) > 0:
            k8s_job = Job(self._pykube_api, job_to_delete.response['items'][0])
            if self.__has_guest_ports(job_wrapper):
                self.__cleanup_k8s_guest_ports(job_wrapper, k8s_job)
            self.__cleanup_k8s_job(k8s_job)
        # TODO assert whether job parallelism == 0
        # assert not job_to_delete.exists(), "Could not delete job," + job.job_runner_external_id + " it still exists"
        log.debug(f"({job.id}/{job.job_runner_external_id}) Terminated at user's request")
    except Exception as e:
        log.exception("({}/{}) User killed running job, but error encountered during termination: {}".format(
            job.id, job.get_job_runner_external_id(), e))

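# A hedged sketch of the __has_guest_ports predicate used above; assuming it simply
# reports whether the tool requested any interactive ports to be exposed (the
# queue_job variant further below reads the same job_wrapper.guest_ports attribute).
def __has_guest_ports(self, job_wrapper):
    return bool(job_wrapper.guest_ports)
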
def check_watched_item(self, job_state):
    """Checks the state of a job already submitted on k8s. Job state is an AsynchronousJobState"""
    jobs = Job.objects(self._pykube_api).filter(
        selector="app=" + job_state.job_id,
        namespace=self.runner_params['k8s_namespace'])
    if len(jobs.response['items']) == 1:
        job = Job(self._pykube_api, jobs.response['items'][0])
        job_destination = job_state.job_wrapper.job_destination
        succeeded = 0
        active = 0
        failed = 0

        if 'max_pod_retries' in job_destination.params:
            max_pod_retries = int(job_destination.params['max_pod_retries'])
        elif 'k8s_pod_retries' in self.runner_params:
            max_pod_retries = int(self.runner_params['k8s_pod_retries'])
        elif 'max_pod_retrials' in job_destination.params:
            # For backward compatibility
            max_pod_retries = int(job_destination.params['max_pod_retrials'])
        elif 'k8s_pod_retrials' in self.runner_params:
            # For backward compatibility; read the same key that was checked
            max_pod_retries = int(self.runner_params['k8s_pod_retrials'])
        else:
            max_pod_retries = 1

        # Check if job.obj['status'] is empty,
        # return job_state unchanged if this is the case
        # as probably this means that the k8s API server hasn't
        # had time to fill in the object status since the
        # job was created only very recently.
        if len(job.obj['status']) == 0:
            return job_state
        if 'succeeded' in job.obj['status']:
            succeeded = job.obj['status']['succeeded']
        if 'active' in job.obj['status']:
            active = job.obj['status']['active']
        if 'failed' in job.obj['status']:
            failed = job.obj['status']['failed']

        # This assumes each job depends on a single pod with a single container
        if succeeded > 0:
            job_state.running = False
            self.mark_as_finished(job_state)
            return None
        elif active > 0 and failed <= max_pod_retries:
            if not job_state.running:
                job_state.running = True
                job_state.job_wrapper.change_state(model.Job.states.RUNNING)
            return job_state
        elif job_state.job_wrapper.get_job().state == model.Job.states.DELETED:
            # Galaxy job was deleted via stop_job, but the k8s job still exists;
            # remove it from watched_jobs by returning `None`
            if job_state.job_wrapper.cleanup_job in ("always", "onsuccess"):
                job_state.job_wrapper.cleanup()
            return None
        else:
            return self._handle_job_failure(job, job_state)

    elif len(jobs.response['items']) == 0:
        if job_state.job_wrapper.get_job().state == model.Job.states.DELETED:
            # Both the Galaxy job and the k8s job are gone;
            # clean up and remove from watched_jobs by returning `None`
            if job_state.job_wrapper.cleanup_job in ("always", "onsuccess"):
                job_state.job_wrapper.cleanup()
            return None
        # there is no job responding to this job_id, it is either lost or something happened.
        log.error("No Jobs are available under expected selector app=%s", job_state.job_id)
        with open(job_state.error_file, 'w') as error_file:
            error_file.write("No Kubernetes Jobs are available under expected selector app=%s\n" % job_state.job_id)
        self.mark_as_failed(job_state)
        return job_state
    else:
        # there is more than one job associated with the expected unique job id used as selector.
        log.error("More than one Kubernetes Job associated with job id '%s'", job_state.job_id)
        with open(job_state.error_file, 'w') as error_file:
            error_file.write("More than one Kubernetes Job associated with job id '%s'\n" % job_state.job_id)
        self.mark_as_failed(job_state)
        return job_state

def queue_job(self, job_wrapper):
    """Create job script and submit it to Kubernetes cluster"""
    # prepare the job
    # We currently don't need include_metadata or include_work_dir_outputs, as the working directory is the same
    # one where Galaxy will expect results.
    log.debug("Starting queue_job for job " + job_wrapper.get_id_tag())
    ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory,
                               job_wrapper=job_wrapper,
                               job_destination=job_wrapper.job_destination)

    if not self.prepare_job(job_wrapper,
                            include_metadata=False,
                            modify_command_for_container=False,
                            stdout_file=ajs.output_file,
                            stderr_file=ajs.error_file):
        return

    script = self.get_job_file(job_wrapper, exit_code_path=ajs.exit_code_file,
                               shell=job_wrapper.shell, galaxy_virtual_env=None)
    try:
        self.write_executable_script(ajs.job_file, script)
    except Exception:
        job_wrapper.fail("failure preparing job script", exception=True)
        log.exception("(%s) failure writing job script" % job_wrapper.get_id_tag())
        return

    # Construction of the Kubernetes Job object follows: https://kubernetes.io/docs/concepts/workloads/controllers/job/
    k8s_job_name = self.__produce_unique_k8s_job_name(job_wrapper.get_id_tag())
    k8s_job_obj = job_object_dict(self.runner_params, k8s_job_name, self.__get_k8s_job_spec(ajs))

    # Checks if the job exists and is trusted, or if it needs re-creation.
    job = Job(self._pykube_api, k8s_job_obj)
    if job.exists() and not self._galaxy_instance_id:
        # if the Galaxy instance id is not set, we don't trust matching jobs,
        # so we simply delete and re-create the job
        log.debug("Matching job exists, but Job is not trusted, so it will be deleted and a new one created.")
        job.delete()
        elapsed_seconds = 0
        while job.exists():
            sleep(3)
            elapsed_seconds += 3
            if elapsed_seconds > self.runner_params['k8s_timeout_seconds_job_deletion']:
                log.debug("Timed out before k8s could delete existing untrusted job " + k8s_job_name +
                          ", not queuing associated Galaxy job.")
                return
            log.debug("Waiting for job to be deleted " + k8s_job_name)
        Job(self._pykube_api, k8s_job_obj).create()
    elif job.exists() and self._galaxy_instance_id:
        # The job exists and we trust the identifier.
        log.debug("Matching job exists and is trusted, so we simply use the existing one for " + k8s_job_name)
        # We simply leave the k8s job to be handled later on by check_watched_item.
    else:
        # Creates the Kubernetes Job if it doesn't exist.
        job.create()

    # define job attributes in the AsynchronousJobState for follow-up
    ajs.job_id = k8s_job_name
    # store runner information for tracking if Galaxy restarts
    job_wrapper.set_job_destination(job_wrapper.job_destination, k8s_job_name)
    self.monitor_queue.put(ajs)

def check_watched_item(self, job_state):
    """Checks the state of a job already submitted on k8s. Job state is an AsynchronousJobState"""
    jobs = find_job_object_by_name(self._pykube_api, job_state.job_id, self.runner_params['k8s_namespace'])
    if len(jobs.response['items']) == 1:
        job = Job(self._pykube_api, jobs.response['items'][0])
        job_destination = job_state.job_wrapper.job_destination
        succeeded = 0
        active = 0
        failed = 0

        if 'max_pod_retries' in job_destination.params:
            max_pod_retries = int(job_destination.params['max_pod_retries'])
        elif 'k8s_pod_retries' in self.runner_params:
            max_pod_retries = int(self.runner_params['k8s_pod_retries'])
        else:
            max_pod_retries = 1

        # Check if job.obj['status'] is empty,
        # return job_state unchanged if this is the case
        # as probably this means that the k8s API server hasn't
        # had time to fill in the object status since the
        # job was created only very recently.
        if len(job.obj['status']) == 0:
            return job_state
        if 'succeeded' in job.obj['status']:
            succeeded = job.obj['status']['succeeded']
        if 'active' in job.obj['status']:
            active = job.obj['status']['active']
        if 'failed' in job.obj['status']:
            failed = job.obj['status']['failed']

        job_persisted_state = job_state.job_wrapper.get_state()

        # This assumes each job depends on a single pod with a single container
        if succeeded > 0 or job_persisted_state == model.Job.states.STOPPED:
            job_state.running = False
            self.mark_as_finished(job_state)
            return None
        elif active > 0 and failed <= max_pod_retries:
            if not job_state.running:
                if self.__job_pending_due_to_unschedulable_pod(job_state):
                    if self.runner_params.get('k8s_unschedulable_walltime_limit'):
                        creation_time_str = job.obj['metadata'].get('creationTimestamp')
                        creation_time = datetime.strptime(creation_time_str, '%Y-%m-%dT%H:%M:%SZ')
                        elapsed_seconds = (datetime.utcnow() - creation_time).total_seconds()
                        if elapsed_seconds > self.runner_params['k8s_unschedulable_walltime_limit']:
                            return self._handle_unschedulable_job(job, job_state)
                else:
                    job_state.running = True
                    job_state.job_wrapper.change_state(model.Job.states.RUNNING)
            return job_state
        elif job_persisted_state == model.Job.states.DELETED:
            # Galaxy job was deleted via stop_job, but the k8s job still exists;
            # remove it from watched_jobs by returning `None`
            if job_state.job_wrapper.cleanup_job in ("always", "onsuccess"):
                job_state.job_wrapper.cleanup()
            return None
        else:
            return self._handle_job_failure(job, job_state)

    elif len(jobs.response['items']) == 0:
        if job_state.job_wrapper.get_job().state == model.Job.states.DELETED:
            # Both the Galaxy job and the k8s job are gone;
            # clean up and remove from watched_jobs by returning `None`
            if job_state.job_wrapper.cleanup_job in ("always", "onsuccess"):
                job_state.job_wrapper.cleanup()
            return None
        # there is no job responding to this job_id, it is either lost or something happened.
        log.error("No Jobs are available under expected selector app=%s", job_state.job_id)
        self.mark_as_failed(job_state)
        # job is no longer viable - remove from watched jobs
        return None
    else:
        # there is more than one job associated with the expected unique job id used as selector.
        log.error("More than one Kubernetes Job associated with job id '%s'", job_state.job_id)
        self.mark_as_failed(job_state)
        # job is no longer viable - remove from watched jobs
        return None

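# A minimal sketch of the __job_pending_due_to_unschedulable_pod check used above; this
# is an assumption, not the verbatim Galaxy helper: it looks up the job's pod and reports
# whether the scheduler marked it Unschedulable (e.g. no node satisfies its resource
# requests), which is the condition the walltime limit above guards against.
from pykube import Pod

def __job_pending_due_to_unschedulable_pod(self, job_state):
    pods = Pod.objects(self._pykube_api).filter(
        selector=f"app={job_state.job_id}",
        namespace=self.runner_params['k8s_namespace'])
    if not pods.response['items']:
        return False
    pod_status = pods.response['items'][0]['status']
    return any(c.get('reason') == 'Unschedulable' for c in pod_status.get('conditions', []))
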
def queue_job(self, job_wrapper):
    """Create job script and submit it to Kubernetes cluster"""
    # prepare the job
    # We currently don't need include_metadata or include_work_dir_outputs, as the working directory is the same
    # one where Galaxy will expect results.
    log.debug(f"Starting queue_job for job {job_wrapper.get_id_tag()}")
    ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory,
                               job_wrapper=job_wrapper,
                               job_destination=job_wrapper.job_destination)

    if not self.prepare_job(job_wrapper,
                            include_metadata=False,
                            modify_command_for_container=False,
                            stdout_file=ajs.output_file,
                            stderr_file=ajs.error_file):
        return

    script = self.get_job_file(job_wrapper, exit_code_path=ajs.exit_code_file,
                               shell=job_wrapper.shell, galaxy_virtual_env=None)
    try:
        self.write_executable_script(ajs.job_file, script)
    except Exception:
        job_wrapper.fail("failure preparing job script", exception=True)
        log.exception(f"({job_wrapper.get_id_tag()}) failure writing job script")
        return

    # Construction of Kubernetes objects follows: https://kubernetes.io/docs/concepts/workloads/controllers/job/
    if self.__has_guest_ports(job_wrapper):
        try:
            self.__configure_port_routing(ajs)
        except HTTPError:
            log.exception("Kubernetes failed to expose tool ports as services, HTTP exception encountered")
            ajs.runner_state = JobState.runner_states.UNKNOWN_ERROR
            ajs.fail_message = "Kubernetes failed to expose tool ports as services."
            self.mark_as_failed(ajs)
            return

    k8s_job_prefix = self.__produce_k8s_job_prefix()
    k8s_job_obj = job_object_dict(self.runner_params, k8s_job_prefix, self.__get_k8s_job_spec(ajs))
    job = Job(self._pykube_api, k8s_job_obj)
    try:
        job.create()
    except HTTPError:
        log.exception("Kubernetes failed to create job, HTTP exception encountered")
        ajs.runner_state = JobState.runner_states.UNKNOWN_ERROR
        ajs.fail_message = "Kubernetes failed to create job."
        self.mark_as_failed(ajs)
        return
    if not job.name:
        # log.error rather than log.exception: there is no active exception here
        log.error(f"Kubernetes failed to create job, empty name encountered: [{job.obj}]")
        ajs.runner_state = JobState.runner_states.UNKNOWN_ERROR
        ajs.fail_message = "Kubernetes failed to create job."
        self.mark_as_failed(ajs)
        return
    job_id = job.name

    # define job attributes in the AsynchronousJobState for follow-up
    ajs.job_id = job_id
    # store runner information for tracking if Galaxy restarts
    job_wrapper.set_external_id(job_id)
    self.monitor_queue.put(ajs)

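# Note: __configure_port_routing is assumed to bundle the entry-point configuration and
# the Service/Ingress creation that the queue_job variant below performs inline
# (service_object_dict / ingress_object_dict plus configure_entry_points); this is an
# inference from the two variants shown here, not a statement about the actual helper.
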
def queue_job(self, job_wrapper):
    """Create job script and submit it to Kubernetes cluster"""
    # prepare the job
    # We currently don't need include_metadata or include_work_dir_outputs, as the working directory is the same
    # one where Galaxy will expect results.
    log.debug(f"Starting queue_job for job {job_wrapper.get_id_tag()}")
    ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory,
                               job_wrapper=job_wrapper,
                               job_destination=job_wrapper.job_destination)

    if not self.prepare_job(job_wrapper,
                            include_metadata=False,
                            modify_command_for_container=False,
                            stdout_file=ajs.output_file,
                            stderr_file=ajs.error_file):
        return

    script = self.get_job_file(job_wrapper, exit_code_path=ajs.exit_code_file,
                               shell=job_wrapper.shell, galaxy_virtual_env=None)
    try:
        self.write_executable_script(ajs.job_file, script)
    except Exception:
        job_wrapper.fail("failure preparing job script", exception=True)
        log.exception(f"({job_wrapper.get_id_tag()}) failure writing job script")
        return

    # Construction of the Kubernetes Job object follows: https://kubernetes.io/docs/concepts/workloads/controllers/job/
    k8s_job_prefix = self.__produce_k8s_job_prefix()

    guest_ports = ajs.job_wrapper.guest_ports
    ports_dict = {}
    for guest_port in guest_ports:
        ports_dict[str(guest_port)] = dict(host='manual', port=guest_port, protocol="https")

    eps = None
    if ajs.job_wrapper.guest_ports:
        k8s_job_name = self.__get_k8s_job_name(k8s_job_prefix, ajs.job_wrapper)
        log.debug(f"Configuring entry points and deploying service/ingress for job with ID {ajs.job_id}")
        k8s_service_obj = service_object_dict(self.runner_params, k8s_job_name, self.__get_k8s_service_spec(ajs))
        eps = self.app.interactivetool_manager.configure_entry_points(ajs.job_wrapper.get_job(), ports_dict)
        k8s_ingress_obj = ingress_object_dict(self.runner_params, k8s_job_name, self.__get_k8s_ingress_spec(ajs, eps))
        service = Service(self._pykube_api, k8s_service_obj)
        service.create()
        ingress = Ingress(self._pykube_api, k8s_ingress_obj)
        ingress.create()

    k8s_job_obj = job_object_dict(self.runner_params, k8s_job_prefix, self.__get_k8s_job_spec(ajs, eps))
    job = Job(self._pykube_api, k8s_job_obj)
    try:
        job.create()
    except HTTPError:
        log.exception("Kubernetes failed to create job, HTTP exception encountered")
        ajs.runner_state = JobState.runner_states.UNKNOWN_ERROR
        ajs.fail_message = "Kubernetes failed to create job."
        self.mark_as_failed(ajs)
        return
    if not job.name:
        # log.error rather than log.exception: there is no active exception here
        log.error(f"Kubernetes failed to create job, empty name encountered: [{job.obj}]")
        ajs.runner_state = JobState.runner_states.UNKNOWN_ERROR
        ajs.fail_message = "Kubernetes failed to create job."
        self.mark_as_failed(ajs)
        return
    job_id = job.name

    # define job attributes in the AsynchronousJobState for follow-up
    ajs.job_id = job_id
    # store runner information for tracking if Galaxy restarts
    job_wrapper.set_external_id(job_id)
    self.monitor_queue.put(ajs)

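# A hedged sketch of the __get_k8s_job_name helper used above (an assumption about its
# behavior, not the actual implementation): unlike the generateName path used for the
# Job object itself, the Service and Ingress need a concrete, predictable name, so the
# prefix is combined with the Galaxy job id tag.
def __get_k8s_job_name(self, prefix, job_wrapper):
    return f"{prefix}-{job_wrapper.get_id_tag()}"
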