Example #1
    def check_watched_item(self, job_state):
        """Checks the state of a job already submitted on k8s. Job state is a AsynchronousJobState"""
        jobs = Job.objects(self._pykube_api).filter(selector="app=" + job_state.job_id,
                                                    namespace=self.runner_params['k8s_namespace'])
        if len(jobs.response['items']) == 1:
            job = Job(self._pykube_api, jobs.response['items'][0])
            job_destination = job_state.job_wrapper.job_destination
            succeeded = 0
            active = 0
            failed = 0

            max_pod_retrials = 1
            if 'k8s_pod_retrials' in self.runner_params:
                max_pod_retrials = int(self.runner_params['k8s_pod_retrials'])
            if 'max_pod_retrials' in job_destination.params:
                max_pod_retrials = int(job_destination.params['max_pod_retrials'])

            if 'succeeded' in job.obj['status']:
                succeeded = job.obj['status']['succeeded']
            if 'active' in job.obj['status']:
                active = job.obj['status']['active']
            if 'failed' in job.obj['status']:
                failed = job.obj['status']['failed']

            # This assumes each job depends on a single pod with a single container
            if succeeded > 0:
                self.__produce_log_file(job_state)
                with open(job_state.error_file, 'w'):
                    pass
                job_state.running = False
                self.mark_as_finished(job_state)
                return None
            elif failed > 0 and self.__job_failed_due_to_low_memory(job_state):
                return self._handle_job_failure(job, job_state, reason="OOM")
            elif active > 0 and failed <= max_pod_retrials:
                job_state.running = True
                return job_state
            elif failed > max_pod_retrials:
                return self._handle_job_failure(job, job_state)
            # We should not get here
            log.debug(
                "Kubernetes job %s reached an unexpected point: not classified as succeeded, active, or failed.", job.name)
            log.debug("k8s full job object:\n%s", job.obj)
            return job_state

        elif len(jobs.response['items']) == 0:
            # there is no job matching this job_id; it is either lost or something happened.
            log.error("No Jobs are available under expected selector app=" + job_state.job_id)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write("No Kubernetes Jobs are available under expected selector app=" + job_state.job_id + "\n")
            self.mark_as_failed(job_state)
            return job_state
        else:
            # there is more than one job associated with the expected unique job id used as selector.
            log.error("There is more than one Kubernetes Job associated with job id " + job_state.job_id)
            self.__produce_log_file(job_state)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write("There is more than one Kubernetes Job associated to job id " + job_state.job_id + "\n")
            self.mark_as_failed(job_state)
            return job_state
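
This handler reads the Job's status counters to decide between finished, failed, and still running. A minimal sketch of the same pykube lookup-and-count pattern, assuming a kubeconfig at ~/.kube/config; the selector value is illustrative:

from pykube import HTTPClient, KubeConfig
from pykube.objects import Job

api = HTTPClient(KubeConfig.from_file("~/.kube/config"))
# Label selectors return a collection; the raw match list lives in .response
jobs = Job.objects(api).filter(selector="app=" + "some-job-id")  # hypothetical id
items = jobs.response["items"]
if len(items) == 1:
    job = Job(api, items[0])
    status = job.obj["status"]
    # Each counter is absent until at least one pod reaches that state,
    # so default missing keys to zero.
    succeeded = status.get("succeeded", 0)
    active = status.get("active", 0)
    failed = status.get("failed", 0)
    print(succeeded, active, failed)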
Example #2
    def __get_job(self):
        jobs = Job.objects(self.__kube_api) \
            .filter(namespace=self.job_namespace, selector="luigi_task_id=" + self.job_uuid) \
            .response['items']

        assert len(jobs) == 1, (
            "Kubernetes job " + self.job_namespace + "/" + self.uu_name +
            " not found (job_uuid=" + self.job_uuid + ")")
        return Job(self.__kube_api, jobs[0])
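
Unlike the handler above, this helper scopes the query to an explicit namespace and treats anything other than exactly one match as a hard error via assert, rather than branching on the match count.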
Example #3
 def test_fail_job(self):
     fail = FailJob()
     self.assertRaises(RuntimeError, fail.run)
     # Check for retrials
     kube_api = HTTPClient(KubeConfig.from_file("~/.kube/config"))  # assumes minikube
     jobs = Job.objects(kube_api).filter(selector="luigi_task_id="
                                                  + fail.job_uuid)
     self.assertEqual(len(jobs.response["items"]), 1)
     job = Job(kube_api, jobs.response["items"][0])
     self.assertTrue("failed" in job.obj["status"])
     self.assertTrue(job.obj["status"]["failed"] > fail.max_retrials)
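
The test first lets the task fail (assertRaises around fail.run), then queries the cluster directly: exactly one Job must match the luigi_task_id label, its status must carry a failed counter, and that counter must exceed max_retrials, confirming that Kubernetes actually performed the retries.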
Example #4
 def test_fail_job(self):
     fail = FailJob()
     self.assertRaises(RuntimeError, fail.run)
     # Check for retrials
     kube_api = HTTPClient(
         KubeConfig.from_file("~/.kube/config"))  # assumes minikube
     jobs = Job.objects(kube_api).filter(selector="luigi_task_id=" +
                                         fail.job_uuid)
     self.assertEqual(len(jobs.response["items"]), 1)
     job = Job(kube_api, jobs.response["items"][0])
     self.assertTrue("failed" in job.obj["status"])
     self.assertTrue(job.obj["status"]["failed"] > fail.max_retrials)
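
Example #4 is the same test as Example #3 with different line wrapping; both assume a local minikube cluster reachable through ~/.kube/config.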
Example #5
 def stop_job(self, job):
     """Attempts to delete a dispatched job to the k8s cluster"""
     try:
         jobs = Job.objects(self._pykube_api).filter(
             selector="app=" + self.__produce_unique_k8s_job_name(job.get_id_tag()))
         if len(jobs.response['items']) > 0:
             job_to_delete = Job(self._pykube_api, jobs.response['items'][0])
             job_to_delete.scale(replicas=0)
         # TODO assert whether job parallelism == 0
         # assert not job_to_delete.exists(), "Could not delete job,"+job.job_runner_external_id+" it still exists"
         log.debug("(%s/%s) Terminated at user's request" % (job.id, job.job_runner_external_id))
     except Exception as e:
         log.debug("(%s/%s) User killed running job, but error encountered during termination: %s" % (
             job.id, job.job_runner_external_id, e))
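
Note that termination here does not delete the Job object: scaling it to zero replicas (its parallelism) stops the pods while keeping the Job around, the same trick Example #7 uses to preserve it for logging.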
Example #6
 def stop_job(self, job_wrapper):
     """Attempts to delete a dispatched job to the k8s cluster"""
     job = job_wrapper.get_job()
     try:
         jobs = Job.objects(self._pykube_api).filter(
             selector="app=" + self.__produce_unique_k8s_job_name(job.get_id_tag()))
         if len(jobs.response['items']) > 0:
             job_to_delete = Job(self._pykube_api, jobs.response['items'][0])
             job_to_delete.scale(replicas=0)
         # TODO assert whether job parallelism == 0
         # assert not job_to_delete.exists(), "Could not delete job,"+job.job_runner_external_id+" it still exists"
         log.debug("(%s/%s) Terminated at user's request" % (job.id, job.job_runner_external_id))
     except Exception as e:
         log.debug("(%s/%s) User killed running job, but error encountered during termination: %s" % (
             job.id, job.job_runner_external_id, e))
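
This variant differs from Example #5 only in its entry point: it receives a job_wrapper and unwraps the underlying job before performing the same selector lookup and scale-to-zero termination.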
Example #7
 def __get_job_status(self):
     """Return the Kubernetes job status"""
     # Look for the required job
     jobs = Job.objects(self.__kube_api).filter(selector="luigi_task_id=" +
                                                self.job_uuid)
     # Raise an exception if no such job found
     if len(jobs.response["items"]) == 0:
         raise RuntimeError("Kubernetes job " + self.uu_name + " not found")
     # Figure out status and return it
     job = Job(self.__kube_api, jobs.response["items"][0])
     if ("succeeded" in job.obj["status"]
             and job.obj["status"]["succeeded"] > 0):
         job.scale(replicas=0)  # Downscale the job, but keep it for logging
         return "succeeded"
     if ("failed" in job.obj["status"]):
         failed_cnt = job.obj["status"]["failed"]
         self.__logger.debug("Kubernetes job " + self.uu_name +
                             " status.failed: " + str(failed_cnt))
         if failed_cnt > self.max_retrials:
             job.scale(replicas=0)  # avoid more retrials
             return "failed"
     return "running"
Example #8
    def check_watched_item(self, job_state):
        """Checks the state of a job already submitted on k8s. Job state is a AsynchronousJobState"""
        jobs = Job.objects(self._pykube_api).filter(selector="app=" + job_state.job_id)
        if len(jobs.response['items']) == 1:
            job = Job(self._pykube_api, jobs.response['items'][0])
            job_destination = job_state.job_wrapper.job_destination
            succeeded = 0
            active = 0
            failed = 0

            max_pod_retrials = 1
            if 'k8s_pod_retrials' in self.runner_params:
                max_pod_retrials = int(self.runner_params['k8s_pod_retrials'])
            if 'max_pod_retrials' in job_destination.params:
                max_pod_retrials = int(job_destination.params['max_pod_retrials'])

            if 'succeeded' in job.obj['status']:
                succeeded = job.obj['status']['succeeded']
            if 'active' in job.obj['status']:
                active = job.obj['status']['active']
            if 'failed' in job.obj['status']:
                failed = job.obj['status']['failed']

            # This assumes each job depends on a single pod with a single container
            if succeeded > 0:
                self.__produce_log_file(job_state)
                with open(job_state.error_file, 'w'):
                    pass
                job_state.running = False
                self.mark_as_finished(job_state)
                return None
            elif active > 0 and failed <= max_pod_retrials:
                job_state.running = True
                return job_state
            elif failed > max_pod_retrials:
                self.__produce_log_file(job_state)
                with open(job_state.error_file, 'w') as error_file:
                    error_file.write("Exceeded max number of Kubernetes pod retrials allowed for job\n")
                job_state.running = False
                job_state.fail_message = "More pods failed than allowed. See stdout for pods details."
                self.mark_as_failed(job_state)
                job.scale(replicas=0)
                return None

            # We should not get here
            log.debug(
                "Kubernetes job reached an unexpected point: not classified as succeeded, active, or failed.")
            return job_state

        elif len(jobs.response['items']) == 0:
            # there is no job matching this job_id; it is either lost or something happened.
            log.error("No Jobs are available under expected selector app=" + job_state.job_id)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write("No Kubernetes Jobs are available under expected selector app=" + job_state.job_id + "\n")
            self.mark_as_failed(job_state)
            return job_state
        else:
            # there is more than one job associated with the expected unique job id used as selector.
            log.error("There is more than one Kubernetes Job associated with job id " + job_state.job_id)
            self.__produce_log_file(job_state)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write("There is more than one Kubernetes Job associated with job id " + job_state.job_id + "\n")
            self.mark_as_failed(job_state)
            return job_state
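
This appears to be an earlier revision of the Example #1 handler: it lacks the namespace filter and the out-of-memory check, and the retrial-exceeded path is inlined here instead of delegated to a _handle_job_failure helper.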
Example #9
    def check_watched_item(self, job_state):
        """Checks the state of a job already submitted on k8s. Job state is a AsynchronousJobState"""
        jobs = Job.objects(self._pykube_api).filter(selector="app=" +
                                                    job_state.job_id)
        if len(jobs.response['items']) == 1:
            job = Job(self._pykube_api, jobs.response['items'][0])
            job_destination = job_state.job_wrapper.job_destination
            succeeded = 0
            active = 0
            failed = 0

            max_pod_retrials = 1
            if 'k8s_pod_retrials' in self.runner_params:
                max_pod_retrials = int(self.runner_params['k8s_pod_retrials'])
            if 'max_pod_retrials' in job_destination.params:
                max_pod_retrials = int(
                    job_destination.params['max_pod_retrials'])

            if 'succeeded' in job.obj['status']:
                succeeded = job.obj['status']['succeeded']
            if 'active' in job.obj['status']:
                active = job.obj['status']['active']
            if 'failed' in job.obj['status']:
                failed = job.obj['status']['failed']

            # This assumes each job depends on a single pod with a single container
            if succeeded > 0:
                self.__produce_log_file(job_state)
                with open(job_state.error_file, 'w'):
                    pass
                job_state.running = False
                self.mark_as_finished(job_state)
                return None
            elif active > 0 and failed <= max_pod_retrials:
                job_state.running = True
                return job_state
            elif failed > max_pod_retrials:
                self.__produce_log_file(job_state)
                with open(job_state.error_file, 'w') as error_file:
                    error_file.write(
                        "Exceeded max number of Kubernetes pod retrials allowed for job\n")
                job_state.running = False
                job_state.fail_message = "More pods failed than allowed. See stdout for pods details."
                self.mark_as_failed(job_state)
                job.scale(replicas=0)
                return None
            # We should not get here
            log.debug(
                "Kubernetes job reached an unexpected point: not classified as succeeded, active, or failed.")
            return job_state

        elif len(jobs.response['items']) == 0:
            # there is no job matching this job_id; it is either lost or something happened.
            log.error("No Jobs are available under expected selector app=" +
                      job_state.job_id)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write(
                    "No Kubernetes Jobs are available under expected selector app="
                    + job_state.job_id + "\n")
            self.mark_as_failed(job_state)
            return job_state
        else:
            # there is more than one job associated with the expected unique job id used as selector.
            log.error(
                "There is more than one Kubernetes Job associated with job id " +
                job_state.job_id)
            self.__produce_log_file(job_state)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write(
                    "There is more than one Kubernetes Job associated with job id " +
                    job_state.job_id + "\n")
            self.mark_as_failed(job_state)
            return job_state
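
Example #9 is logically identical to Example #8; only the line wrapping differs.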
Example #10
def find_job_object_by_name(pykube_api, job_name, namespace=None):
    if not job_name:
        raise ValueError("job name must not be empty")
    return Job.objects(pykube_api).filter(
        field_selector={"metadata.name": job_name}, namespace=namespace)
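
Here the lookup uses a field selector on metadata.name rather than a label selector, so it can match at most one Job per namespace. A hypothetical call site for the helper; the job name, namespace, and kubeconfig path are illustrative:

from pykube import HTTPClient, KubeConfig

api = HTTPClient(KubeConfig.from_file("~/.kube/config"))
query = find_job_object_by_name(api, "galaxy-job-123", namespace="default")
items = query.response["items"]  # empty list when no such Job exists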
Example #11
 def __get_job(self):
     jobs = Job.objects(self.__kube_api) \
         .filter(selector="luigi_task_id=" + self.job_uuid) \
         .response['items']
     assert len(jobs) == 1, "Kubernetes job " + self.uu_name + " not found"
     return Job(self.__kube_api, jobs[0])
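
Compared with Example #2, this version drops the explicit namespace (presumably falling back to the client's default) and keeps the assertion message shorter, without the job_uuid.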
Example #12
    def check_watched_item(self, job_state):
        """Checks the state of a job already submitted on k8s. Job state is a AsynchronousJobState"""
        jobs = Job.objects(self._pykube_api).filter(
            selector="app=" + job_state.job_id,
            namespace=self.runner_params['k8s_namespace'])
        if len(jobs.response['items']) == 1:
            job = Job(self._pykube_api, jobs.response['items'][0])
            job_destination = job_state.job_wrapper.job_destination
            succeeded = 0
            active = 0
            failed = 0

            max_pod_retrials = 1
            if 'k8s_pod_retrials' in self.runner_params:
                max_pod_retrials = int(self.runner_params['k8s_pod_retrials'])
            if 'max_pod_retrials' in job_destination.params:
                max_pod_retrials = int(
                    job_destination.params['max_pod_retrials'])

            if 'succeeded' in job.obj['status']:
                succeeded = job.obj['status']['succeeded']
            if 'active' in job.obj['status']:
                active = job.obj['status']['active']
            if 'failed' in job.obj['status']:
                failed = job.obj['status']['failed']

            # This assumes each job depends on a single pod with a single container
            if succeeded > 0:
                job_state.running = False
                self.mark_as_finished(job_state)
                return None
            elif failed > 0 and self.__job_failed_due_to_low_memory(job_state):
                return self._handle_job_failure(job, job_state, reason="OOM")
            elif active > 0 and failed <= max_pod_retrials:
                if not job_state.running:
                    job_state.running = True
                    job_state.job_wrapper.change_state(
                        model.Job.states.RUNNING)
                return job_state
            elif failed > max_pod_retrials:
                return self._handle_job_failure(job, job_state)
            elif job_state.job_wrapper.get_job().state == model.Job.states.DELETED:
                # Job has been deleted via stop_job; clean up and remove from watched_jobs by returning `None`
                if job_state.job_wrapper.cleanup_job in ("always", "onsuccess"):
                    job_state.job_wrapper.cleanup()
                return None
            else:
                # We really shouldn't reach this point, but we might if the job has been killed by the kubernetes admin
                log.info(
                    "Kubernetes job '%s' not classified as succeeded, active, or failed. Full Job object:\n%s",
                    job.name, job.obj)

        elif len(jobs.response['items']) == 0:
            # there is no job matching this job_id; it is either lost or something happened.
            log.error("No Jobs are available under expected selector app=%s",
                      job_state.job_id)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write(
                    "No Kubernetes Jobs are available under expected selector app=%s\n"
                    % job_state.job_id)
            self.mark_as_failed(job_state)
            return job_state
        else:
            # there is more than one job associated with the expected unique job id used as selector.
            log.error("More than one Kubernetes Job associated with job id '%s'",
                      job_state.job_id)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write(
                    "More than one Kubernetes Job associated with job id '%s'\n"
                    % job_state.job_id)
            self.mark_as_failed(job_state)
            return job_state
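
This is the most complete variant of check_watched_item shown here: on top of Example #1 it promotes the Galaxy job to RUNNING exactly once via change_state, recognizes jobs deleted through stop_job (returning None to drop them from the watch list, optionally running cleanup), and logs the full Job object when the state cannot be classified.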