Example #1
    def clean_up(self, job_name):
        """
        Deletes the job. Deleting the job deletes all related pods.
        """
        logging.info("KILLING job {}".format(str(job_name)))
        result = retryable_check_output(args=namespaced_kubectl() + [
            'delete',
            '--ignore-not-found=true',  # in case we hit an edge case on retry
            'job',
            job_name
        ])
        logging.info(result)
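
All of the examples on this page call two helpers that are not shown: namespaced_kubectl(), which builds the base kubectl argument list, and retryable_check_output(), which wraps subprocess.check_output in a retry loop. A minimal sketch of what they might look like; the environment variable, default namespace, and retry policy are assumptions:

import logging
import os
import subprocess
import time


def namespaced_kubectl():
    # Base kubectl invocation pinned to the namespace the operator works in.
    # Reading the namespace from an environment variable is an assumption.
    namespace = os.environ.get('KUBERNETES_NAMESPACE', 'default')
    return ['kubectl', '--namespace', namespace]


def retryable_check_output(args, retries=3, delay_seconds=5):
    # subprocess.check_output with a simple retry loop; the retry count and
    # fixed delay are assumptions.
    for attempt in range(retries):
        try:
            return subprocess.check_output(args=args)
        except subprocess.CalledProcessError:
            if attempt == retries - 1:
                raise
            logging.warning('kubectl call failed, retrying: %s', ' '.join(args))
            time.sleep(delay_seconds)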
Example #2
    def get_pods(self, job_name):
        """
        Return pods for a given job_name

        :param job_name: a unique job name
        :type job_name: string
        :return: result from kubectl command that contains job pod and container information
        :rtype: dict
        """
        return json.loads(
            retryable_check_output(
                args=namespaced_kubectl() +
                ['get', 'pods', '-o', 'json', '-l',
                 'job-name==%s' % job_name]))
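
The dict returned above is just the parsed output of kubectl get pods -o json: pods live under 'items', each with 'metadata', 'spec', and 'status' blocks. A hypothetical walk over the fields the later examples rely on (operator and the job name are placeholders):

pod_output = operator.get_pods('example-job-name')
for pod in pod_output['items']:
    print(pod['metadata']['name'])   # pod name, passed to `kubectl logs` / `kubectl exec`
    print(pod['status']['phase'])    # Pending / Running / Succeeded / Failed / Unknown
    for container in pod['spec']['containers']:
        print(container['name'])     # container names, one log stream each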
Example #3
    def log_container_logs(self, job_name, pod_output=None):
        """
        Reads the logs from each container in each pod in the job, re-logs them back

        :param job_name: job that owns the pods with the containers we want to log
        :param pod_output: Result of get_pods(job_name) call. If None, will be
                           requested. This is a convenience so we can share/
                           reuse the results of get_pods()
        :return:
        """
        pod_output = pod_output or self.get_pods(job_name)
        for pod in pod_output['items']:
            pod_name = pod['metadata']['name']
            for container in pod['spec']['containers']:
                container_name = container['name']
                extra = dict(pod=pod_name, container=container_name)
                logging.info('LOGGING OUTPUT FROM JOB [%s/%s]:' %
                             (pod_name, container_name),
                             extra=extra)
                output = retryable_check_output(
                    args=namespaced_kubectl() +
                    ['logs', pod_name, container_name])
                for line in output.splitlines():
                    logging.info(line, extra=extra)
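
The extra=dict(pod=..., container=...) arguments only appear in the log output if the active formatter references those record attributes. The operator's real logging setup is not shown here; a minimal, assumed configuration that would surface them:

import logging


class PodContextFilter(logging.Filter):
    # Assumed helper: give every record default pod/container attributes so the
    # formatter below also works for records logged without `extra`.
    def filter(self, record):
        record.pod = getattr(record, 'pod', '-')
        record.container = getattr(record, 'container', '-')
        return True


handler = logging.StreamHandler()
handler.addFilter(PodContextFilter())
handler.setFormatter(logging.Formatter('%(asctime)s [%(pod)s/%(container)s] %(message)s'))
logging.getLogger().addHandler(handler)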
Example #4
    def execute(self, context, session=None):

        if self.die_if_duplicate:

            current_task_instance = TaskInstance(self,
                                                 context['execution_date'])
            current_task_instance.refresh_from_db(include_queue_time=True)

            TI = TaskInstance
            instances_that_are_running = session.query(TI).filter(
                TI.dag_id == current_task_instance.dag_id,
                TI.task_id == current_task_instance.task_id,
                TI.state.in_([State.RUNNING, State.UP_FOR_RETRY,
                              State.QUEUED]),
            ).all()

            should_die = False
            for task_instance in instances_that_are_running:
                if task_instance.queued_dttm < current_task_instance.queued_dttm:
                    should_die = True
                    break

            if should_die:
                raise Exception(
                    "A prior execution of this task is already running!  Failing this execution."
                )

        job_name, job_yaml_string = self.create_job_yaml(context)

        try:
            # Set pod_output to None up front; this prevents a log_container_logs error
            # if polling fails and self.poll_job_completion is not able to return pod_output.
            pod_output = None
            logging.info(job_yaml_string)
            self.instance_names.append(
                job_name)  # should happen once, but safety first!
            self.xcom_push(context, "kubernetes_job_name", job_name)

            with tempfile.NamedTemporaryFile(suffix='.yaml') as f:
                f.write(job_yaml_string)
                f.flush()
                result = subprocess.check_output(args=namespaced_kubectl() +
                                                 ['apply', '-f', f.name])
                logging.info(result)

            pod_output = self.poll_job_completion(job_name)
            pod_output = pod_output or self.get_pods(
                job_name)  # if we didn't get it for some reason
            return None
        finally:
            try:
                # don't consider the job failed if this fails!
                self.log_container_logs(job_name, pod_output=pod_output)
                stacktrace = traceback.format_exc().rstrip()
                if stacktrace != "None":
                    logging.error(
                        "Got an exception during airflow worker execution!  Stack trace:\n{}"
                        .format(stacktrace))

                if pod_output:
                    # let's clean up all our old pods. we'll kill the entry point (PID 1) in each running container
                    for pod in pod_output.get('items', []):
                        # if we never got to running, there won't be containerStatuses
                        if 'containerStatuses' in pod['status']:
                            live_containers = [
                                cs['name']
                                for cs in pod['status']['containerStatuses']
                                if 'running' in cs['state']
                            ]
                            for cname in live_containers:
                                logging.info(
                                    'killing dependent live container %s' %
                                    cname)
                                # there is a race condition between reading the status and trying to kill the running
                                # container. ignore the return code to duck the issue.
                                subprocess.call(namespaced_kubectl() + [
                                    'exec', pod['metadata']['name'], '-c',
                                    cname, 'kill', '1'
                                ])

            except Exception as ex:
                logging.error("Failed to clean up kubernetes job:\n%s" %
                              traceback.format_exc(),
                              extra={'err': ex})

            self.clean_up(job_name)
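
create_job_yaml(context) is not included in these examples; the kubectl apply -f step above only needs it to return a job name together with a Kubernetes Job manifest as a YAML string. A hypothetical sketch of the shape the rest of the operator expects; the naming scheme, image, and backoffLimit are assumptions, and the cloudsql-proxy sidecar matches the dependent container that poll_job_completion ignores:

    def create_job_yaml(self, context):
        # Hypothetical sketch; the real method is not shown on this page.
        # The Job controller labels the pods it creates with `job-name=<name>`,
        # which is the label get_pods() selects on.
        job_name = 'example-job-{}'.format(context['ts_nodash'].lower())  # assumed naming scheme
        job_yaml_string = """
apiVersion: batch/v1
kind: Job
metadata:
  name: {job_name}
spec:
  backoffLimit: 0               # assumed: let Airflow, not Kubernetes, retry
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: main
          image: example-image:latest
        - name: cloudsql-proxy  # dependent sidecar that poll_job_completion ignores
          image: gcr.io/cloudsql-docker/gce-proxy:latest
""".format(job_name=job_name)
        return job_name, job_yaml_string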
Example #5
    def poll_job_completion(self,
                            job_name,
                            dependent_containers={'cloudsql-proxy'}):
        """
        Polls for completion of the created job.
        Sleeps for self.sleep_seconds_between_polling seconds between polls.
        Any failed pods will raise an error and fail the KubernetesJobOperator task.
        """
        logging.info('Polling for completion of job: %s' % job_name)
        pod_output = None  # kept out here so the caller (execute) can reuse it in its "finally" clause

        has_live_existed = False
        while True:
            time.sleep(self.sleep_seconds_between_polling)

            pod_output = self.get_pods(job_name)

            job_description = json.loads(
                retryable_check_output(namespaced_kubectl() +
                                       ['get', 'job', "-o", "json", job_name]))

            status_block = job_description['status']

            if 'succeeded' in status_block and 'failed' in status_block:
                raise Exception(
                    "Invalid status block containing both succeeded and failed: %s"
                    % json.dumps(status_block))

            if 'active' in status_block:
                status = 'running'
            elif 'failed' in status_block:
                status = "failed"
            elif 'succeeded' in status_block:
                status = 'complete'
            else:
                status = "pending"

            logging.info('Current status is: %s' % status)

            if "pending" == status:
                pass

            if "failed" == status:
                raise Exception('%s has failed pods, failing task.' % job_name)

            if "complete" == status:
                return pod_output

            # Determine if we have any containers left running in each pod of the job.
            # Dependent containers don't count.
            # If there are no pods left running anything, we are done here. Cleaning up
            # dependent containers is left to the top-level `finally` block in execute().
            has_live = False
            for pod in pod_output['items']:
                if 'Unknown' == pod['status']['phase']:
                    # we haven't run yet
                    has_live = True
                    break
                elif 'Pending' == pod['status']['phase']:
                    has_live = True
                    start_time_s = pod['status'].get('startTime')
                    if not start_time_s:
                        logging.info('Pod not yet started')
                        break
                    start_time = datetime.strptime(start_time_s,
                                                   "%Y-%m-%dT%H:%M:%SZ")
                    start_duration_secs = (datetime.utcnow() -
                                           start_time).total_seconds()
                    if start_duration_secs > 300:
                        raise Exception(
                            '%s has failed to start after %0.2f seconds' % (
                                job_name,
                                start_duration_secs,
                            ))
                elif 'Running' == pod['status']['phase']:
                    # get all of the independent containers that are still alive (running or waiting)
                    live_cnt = 0
                    for cs in pod['status']['containerStatuses']:
                        if cs['name'] in dependent_containers:
                            pass
                        elif 'terminated' in cs['state']:
                            has_live_existed = True
                            exit_code = int(cs['state']['terminated'].get(
                                'exitCode', 0))
                            if exit_code > 0:
                                raise Exception(
                                    '%s has failed pods, failing task.' %
                                    job_name)
                        else:
                            live_cnt += 1

                    if live_cnt > 0:
                        has_live = True
                        break
                elif 'Succeeded' == pod['status']['phase']:
                    # For us to end up in this block, the job has to be Running and the pod has to be Succeeded.
                    # This happens when (on a previous attempt) we successfully finished execution, killed dependent
                    # containers, and failed to delete the job.
                    # In this scenario, we want to immediately stop polling, and retry job deletion.
                    has_live_existed = True
                    has_live = False
                elif 'Failed' == pod['status']['phase']:
                    raise Exception("Containers failed!")
                else:
                    raise Exception(
                        "Encountered pod state {state} - no behavior has been prepared for pods in this state!"
                        .format(state=pod["status"]["phase"]))
            total_pods = len(pod_output['items'])
            logging.info(
                "total pods: {total_pods}".format(total_pods=total_pods))
            has_live_existed = has_live_existed or has_live
            # if we get to this point but for some reason there are no pods, log it and retry
            if not has_live_existed:
                logging.info('No pods have run. Retrying.')
            # we have no live pods, but live pods have existed.
            elif not has_live:
                logging.info('No live, independent pods left.')
                return pod_output
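
The job-level status mapping at the top of the polling loop can be read as a small pure function over the status block of kubectl get job -o json; a restatement for clarity (not part of the original operator):

import json


def classify_job_status(status_block):
    # 'active' is checked first, so a job that still has active pods reads as
    # 'running' even if some of its pods have already succeeded.
    if 'succeeded' in status_block and 'failed' in status_block:
        raise Exception('Invalid status block containing both succeeded and failed: %s'
                        % json.dumps(status_block))
    if 'active' in status_block:
        return 'running'
    if 'failed' in status_block:
        return 'failed'
    if 'succeeded' in status_block:
        return 'complete'
    return 'pending'


assert classify_job_status({'active': 1}) == 'running'
assert classify_job_status({'succeeded': 1}) == 'complete'
assert classify_job_status({'startTime': '2020-01-01T00:00:00Z'}) == 'pending'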
Example #6
    def get_pods(self, job_name):
        return json.loads(
            retryable_check_output(
                args=namespaced_kubectl() +
                ['get', 'pods', '-o', 'json', '-l',
                 'job-name==%s' % job_name]))

    def get_hostname(self, ti):
        return json.loads(
            subprocess.check_output(
                namespaced_kubectl() +
                ["get", "-o", "json", "pod/{}".format(ti.hostname)])
        )['status']['podIP']
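
get_hostname resolves the hostname recorded on an Airflow TaskInstance (inside a pod this defaults to the pod name) into that pod's IP by asking the API server for the pod object. A hypothetical usage, with operator and ti as placeholders:

pod_ip = operator.get_hostname(ti)
logging.info('task instance %s/%s is running in a pod reachable at %s',
             ti.dag_id, ti.task_id, pod_ip)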