Example #1
    def __init__(self, kube_config, task_queue, result_queue, kube_client,
                 worker_uuid, kube_dbnd):
        super(DbndKubernetesScheduler,
              self).__init__(kube_config, task_queue, result_queue,
                             kube_client, worker_uuid)
        self.kube_dbnd = kube_dbnd

        # PATCH manage watcher
        from multiprocessing.managers import SyncManager

        self._manager = SyncManager()
        self._manager.start(mgr_init)

        self.watcher_queue = self._manager.Queue()
        self.current_resource_version = 0
        self.kube_watcher = self._make_kube_watcher_dbnd()
        # will be used for low-level pod interactions
        self.failed_pods_to_ignore = []
        self.running_pods = {}
        self.pod_to_task = {}
        self.metrics_logger = KubernetesMetricsLogger()
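Note on the pattern in Example #1: the scheduler starts a SyncManager so that watcher_queue lives in a separate manager process reachable from both the scheduler and the watcher. Below is a minimal, self-contained sketch of that pattern; the mgr_init initializer shown here is only an assumption (a typical signal-ignoring initializer), not dbnd's actual implementation.

import signal
from multiprocessing.managers import SyncManager


def mgr_init():
    # assumed initializer for the manager process: ignore Ctrl-C so the parent handles it
    signal.signal(signal.SIGINT, signal.SIG_IGN)


manager = SyncManager()
manager.start(mgr_init)          # the manager runs in its own process
watcher_queue = manager.Queue()  # queue proxy shared between scheduler and watcher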
Example #2
    def __init__(
        self,
        kube_config,
        task_queue,
        result_queue,
        kube_client,
        scheduler_job_id,
        kube_dbnd: "DbndKubernetesClient",
    ):
        super().__init__(kube_config, task_queue, result_queue, kube_client,
                         scheduler_job_id)
        self.kube_dbnd = kube_dbnd

        # PATCH watcher communication manager
        # we want to wait for stop, instead of exiting in place, so we can receive all not-yet-delivered messages
        # from multiprocessing.managers import SyncManager

        # Scheduler <-> (via _manager) KubeWatcher
        # if _manager dies in place, we will not get any info from KubeWatcher until shutdown
        # self._manager = SyncManager()
        # self._manager.start(mgr_init)

        # self.watcher_queue = self._manager.Queue()
        self.current_resource_version = 0

        self.kube_config = kube_config
        self.kube_watcher = self._make_kube_watcher_dbnd()

        # pod_id to SubmittedPodState
        self.submitted_pods = {}  # type: Dict[str,SubmittedPodState]

        # sending data to databand tracker
        self.metrics_logger = KubernetesMetricsLogger()

        # disappeared pods mechanism
        self.last_disappeared_pods = {}
        self.current_iteration = 1
        # add `k8s-scheduler:` prefix to all log messages
        self._log = PrefixLoggerAdapter("k8s-scheduler", self.log)
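PrefixLoggerAdapter used at the end of Example #2 is dbnd's own helper; assuming it simply prepends a fixed prefix to every message, a minimal stand-in built on the stdlib logging.LoggerAdapter could look like this (illustrative only, not dbnd's implementation):

import logging


class PrefixLoggerAdapter(logging.LoggerAdapter):
    # illustrative stand-in: prepend a fixed prefix to every log message
    def __init__(self, prefix, logger):
        super().__init__(logger, extra={})
        self.prefix = prefix

    def process(self, msg, kwargs):
        return "%s: %s" % (self.prefix, msg), kwargs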
Example #3
    def __init__(self, kube_dbnd, **kwargs):
        super(DbndKubernetesJobWatcher, self).__init__(**kwargs)
        self.kube_dbnd = kube_dbnd  # type: DbndKubernetesClient
        self.processed_events = {}
        self.processed_pods = {}
        self.metrics_logger = KubernetesMetricsLogger()
Example #4
class DbndKubernetesJobWatcher(KubernetesJobWatcher):
    def __init__(self, kube_dbnd, **kwargs):
        super(DbndKubernetesJobWatcher, self).__init__(**kwargs)
        self.kube_dbnd = kube_dbnd  # type: DbndKubernetesClient
        self.processed_events = {}
        self.processed_pods = {}
        self.metrics_logger = KubernetesMetricsLogger()

    def run(self):
        """Performs watching"""

        # Must reset filesystem cache to avoid using out-of-cluster credentials within Kubernetes
        self.reset_fs_cache()

        kube_client = self.kube_dbnd.kube_client
        try:
            while True:
                try:
                    self.resource_version = self._run(
                        kube_client,
                        self.resource_version,
                        self.worker_uuid,
                        self.kube_config,
                    )
                except DatabandSigTermError:
                    break
                except Exception:
                    self.log.exception(
                        "Unknown error in KubernetesJobWatcher. Failing")
                    raise
                else:
                    self.log.warning(
                        "Watch died gracefully, starting back up with: "
                        "last resource_version: %s",
                        self.resource_version,
                    )
        except (KeyboardInterrupt, DatabandSigTermError):
            pass

    def _run(self, kube_client, resource_version, worker_uuid, kube_config):
        self.log.info(
            "Event: and now my watch begins starting at resource_version: %s",
            resource_version,
        )

        from kubernetes import watch

        watcher = watch.Watch()

        kwargs = {"label_selector": "airflow-worker={}".format(worker_uuid)}
        if resource_version:
            kwargs["resource_version"] = resource_version
        if kube_config.kube_client_request_args:
            for key, value in kube_config.kube_client_request_args.items():
                kwargs[key] = value

        last_resource_version = None
        for event in watcher.stream(kube_client.list_namespaced_pod,
                                    self.namespace, **kwargs):
            # DBND PATCH
            # we want to process the message
            task = event["object"]
            self.log.debug("Event: %s had an event of type %s",
                           task.metadata.name, event["type"])

            if event["type"] == "ERROR":
                return self.process_error(event)

            pod_data = event["object"]
            pod_name = pod_data.metadata.name
            phase = pod_data.status.phase

            if self.processed_events.get(pod_name):
                self.log.debug("Event: %s at %s - skipping as seen", phase,
                               pod_name)
                continue
            status = self.kube_dbnd.process_pod_event(event)

            self._update_node_name(pod_name, pod_data)

            if status in ["Succeeded", "Failed"]:
                self.processed_events[pod_name] = status

            self.process_status_quite(
                task.metadata.name,
                status,
                task.metadata.labels,
                task.metadata.resource_version,
            )
            self.resource_version = task.metadata.resource_version

        return self.resource_version

    def _update_node_name(self, pod_name, pod_data):
        if self.processed_pods.get(pod_name):
            self.log.debug(
                "Pod %s has already been logged to metrics - skipping", pod_name)
            return
        node_name = pod_data.spec.node_name
        if not node_name:
            return
        # Some events are missing the node name; it will appear in a later event
        try:
            task_id = pod_data.metadata.labels.get("task_id")
            if not task_id:
                return

            dr = try_get_databand_run()
            if not dr:
                return
            task_run = dr.get_task_run(task_id)
            if not task_run:
                return

            self.metrics_logger.log_pod_information(task_run.task, pod_name,
                                                    node_name)
        except Exception as ex:
            logger.info("Failed to gather node name for %s", pod_name)
        finally:
            self.processed_pods[pod_name] = True

    def process_status_quite(self, pod_id, status, labels, resource_version):
        """Process status response"""
        if status == "Pending":
            self.log.debug("Event: %s Pending", pod_id)
        elif status == "Failed":
            self.log.debug("Event: %s Failed", pod_id)
            self.watcher_queue.put(
                (pod_id, State.FAILED, labels, resource_version))
        elif status == "Succeeded":
            self.log.debug("Event: %s Succeeded", pod_id)
            self.watcher_queue.put(
                (pod_id, State.SUCCESS, labels, resource_version))
        elif status == "Running":
            self.log.debug("Event: %s is Running", pod_id)
        else:
            self.log.warning(
                "Event: Invalid state: %s on pod: %s with labels: %s with "
                "resource_version: %s",
                status,
                pod_id,
                labels,
                resource_version,
            )

    def reset_fs_cache(self):
        from targets.fs import reset_fs_cache

        reset_fs_cache()
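The watcher's _run in Example #4 is built on the kubernetes client's watch API, streaming pod events filtered by the airflow-worker label. A stripped-down sketch of that underlying call follows; the namespace, label value, and config loading below are illustrative assumptions:

from kubernetes import client, config, watch

config.load_kube_config()  # or config.load_incluster_config() when running inside a pod
v1 = client.CoreV1Api()

watcher = watch.Watch()
for event in watcher.stream(
    v1.list_namespaced_pod,
    "default",                                   # namespace - illustrative
    label_selector="airflow-worker=worker-123",  # illustrative worker uuid
    resource_version="0",                        # let the API server start from its cache
):
    pod = event["object"]
    print(event["type"], pod.metadata.name, pod.status.phase)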
Example #5
class DbndKubernetesScheduler(AirflowKubernetesScheduler):
    def __init__(self, kube_config, task_queue, result_queue, kube_client,
                 worker_uuid, kube_dbnd):
        super(DbndKubernetesScheduler,
              self).__init__(kube_config, task_queue, result_queue,
                             kube_client, worker_uuid)
        self.kube_dbnd = kube_dbnd

        # PATCH manage watcher
        from multiprocessing.managers import SyncManager

        self._manager = SyncManager()
        self._manager.start(mgr_init)

        self.watcher_queue = self._manager.Queue()
        self.current_resource_version = 0
        self.kube_watcher = self._make_kube_watcher_dbnd()
        # will be used for low-level pod interactions
        self.failed_pods_to_ignore = []
        self.running_pods = {}
        self.pod_to_task = {}
        self.metrics_logger = KubernetesMetricsLogger()

    def _make_kube_watcher(self):
        # prevent storing the kubernetes resource version in the db, because the kubernetes db model only stores a single value
        # of the resource version while we need to store a separate value for every kubernetes executor (because even in a basic flow
        # we can have two Kubernetes executors running at once, the one that launched the driver and the one inside the driver).
        #
        # the resource version is the position inside the event stream of the kubernetes cluster and is used by the watcher to poll
        # Kubernetes for events. It's probably fine not to store this because by default Kubernetes returns "the events currently in cache"
        # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/CoreV1Api.md#list_namespaced_pod
        return None

    def _make_kube_watcher_dbnd(self):
        watcher = DbndKubernetesJobWatcher(
            namespace=self.namespace,
            watcher_queue=self.watcher_queue,
            resource_version=self.current_resource_version,
            worker_uuid=self.worker_uuid,
            kube_config=self.kube_config,
            kube_dbnd=self.kube_dbnd,
        )
        watcher.start()
        return watcher

    @staticmethod
    def _create_pod_id(dag_id, task_id):
        task_run = try_get_databand_run().get_task_run(task_id)
        return task_run.job_id__dns1123

    def _health_check_kube_watcher(self):
        if self.kube_watcher.is_alive():
            pass
        else:
            self.log.error("Error while health checking kube watcher process. "
                           "Process died for unknown reasons")
            self.kube_watcher = self._make_kube_watcher_dbnd()

    def run_next(self, next_job):
        """

        The run_next command will check the task_queue for any un-run jobs.
        It will then create a unique job-id, launch that job in the cluster,
        and store relevant info in the current_jobs map so we can track the job's
        status
        """
        key, command, kube_executor_config = next_job
        dag_id, task_id, execution_date, try_number = key
        self.log.debug(
            "Kube POD to submit: image=%s with %s",
            self.kube_config.kube_image,
            str(next_job),
        )

        dr = try_get_databand_run()
        task_run = dr.get_task_run_by_af_id(task_id)
        pod_command = [str(c) for c in command]
        task_engine = task_run.task_engine  # type: KubernetesEngineConfig
        pod = task_engine.build_pod(
            task_run=task_run,
            cmds=pod_command,
            labels={
                "airflow-worker":
                self.worker_uuid,
                "dag_id":
                self._make_safe_label_value(dag_id),
                "task_id":
                self._make_safe_label_value(task_run.task_af_id),
                "execution_date":
                self._datetime_to_label_safe_datestring(execution_date),
                "try_number":
                str(try_number),
            },
            try_number=try_number,
            include_system_secrets=True,
        )

        pod_ctrl = self.kube_dbnd.get_pod_ctrl_for_pod(pod)
        self.running_pods[pod.name] = self.namespace
        self.pod_to_task[pod.name] = task_run.task

        pod_ctrl.run_pod(pod=pod, task_run=task_run, detach_run=True)
        self.metrics_logger.log_pod_started(task_run.task)

    def delete_pod(self, pod_id):
        if pod_id in self.failed_pods_to_ignore:
            logger.warning(
                "Received request to delete pod %s that is ignored! Ignoring...",
                pod_id)
            return
        try:
            found_pod = self.running_pods.pop(pod_id, None)
            if found_pod:
                result = self.kube_dbnd.delete_pod(pod_id, self.namespace)

                if pod_id in self.pod_to_task:
                    self.metrics_logger.log_pod_deleted(
                        self.pod_to_task[pod_id])
                    self.pod_to_task.pop(pod_id)  # Keep the cache clean

                return result
        except Exception as e:
            # Catch all exceptions to prevent any delete loops, best effort
            logger.warning(
                "Exception raised when trying to delete pod %s! Adding to ignored list...",
                pod_id,
            )
            self.failed_pods_to_ignore.append(pod_id)

    def terminate(self):
        pods_to_delete = sorted(self.running_pods.keys())
        if pods_to_delete:
            logger.info("Deleting %d submitted pods: %s", len(pods_to_delete),
                        pods_to_delete)
            for pod_name in pods_to_delete:
                try:
                    self.delete_pod(pod_name)
                except Exception:
                    logger.exception("Failed to terminate pod %s", pod_name)
        super(DbndKubernetesScheduler, self).terminate()
Example #6
class DbndKubernetesScheduler(AirflowKubernetesScheduler):
    """
    The scheduler is responsible for:
        * running the pods on kubernetes
        * getting pods' events from the watcher (DbndKubernetesJobWatcher)
        * handling the state of the task relative to the state of the pods

    A very serious override of AirflowKubernetesScheduler:
    1. better visibility on errors, so we process failures with much more info
    2. tracking of all events around "airflow run" -> Pod Crashes, Pod Submission errors
        a. in case of a crash (OOM, evicted pod) -> error propagation to databand and retry
    """
    def __init__(
        self,
        kube_config,
        task_queue,
        result_queue,
        kube_client,
        scheduler_job_id,
        kube_dbnd: "DbndKubernetesClient",
    ):
        super().__init__(kube_config, task_queue, result_queue, kube_client,
                         scheduler_job_id)
        self.kube_dbnd = kube_dbnd

        # PATCH watcher communication manager
        # we want to wait for stop, instead of exiting in place, so we can receive all not-yet-delivered messages
        # from multiprocessing.managers import SyncManager

        # Scheduler <-> (via _manager) KubeWatcher
        # if _manager dies in place, we will not get any info from KubeWatcher until shutdown
        # self._manager = SyncManager()
        # self._manager.start(mgr_init)

        # self.watcher_queue = self._manager.Queue()
        self.current_resource_version = 0

        self.kube_config = kube_config
        self.kube_watcher = self._make_kube_watcher_dbnd()

        # pod_id to SubmittedPodState
        self.submitted_pods = {}  # type: Dict[str,SubmittedPodState]

        # sending data to databand tracker
        self.metrics_logger = KubernetesMetricsLogger()

        # disappeared pods mechanism
        self.last_disappeared_pods = {}
        self.current_iteration = 1
        # add `k8s-scheduler:` prefix to all log messages
        self._log = PrefixLoggerAdapter("k8s-scheduler", self.log)

    def _make_kube_watcher(self):
        # prevent storing the kubernetes resource version in the db, because the kubernetes db model only stores a single value
        # of the resource version while we need to store a separate value for every kubernetes executor (because even in a basic flow
        # we can have two Kubernetes executors running at once, the one that launched the driver and the one inside the driver).
        #
        # the resource version is the position inside the event stream of the kubernetes cluster and is used by the watcher to poll
        # Kubernetes for events. It's probably fine not to store this because by default Kubernetes returns "the events currently in cache"
        # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/CoreV1Api.md#list_namespaced_pod
        return None

    def _make_kube_watcher_dbnd(self):
        watcher = DbndKubernetesJobWatcher(**get_job_watcher_kwargs(self))
        watcher.start()
        return watcher

    @staticmethod
    def _create_pod_id(dag_id, task_id):
        task_run = try_get_databand_run().get_task_run(task_id)
        return create_pod_id(task_run)

    def _health_check_kube_watcher(self):
        if self.kube_watcher.is_alive():
            pass
        else:
            self.log.error("Error while health checking kube watcher process. "
                           "Process died for unknown reasons")
            self.kube_watcher = self._make_kube_watcher_dbnd()

    def run_next(self, next_job: "KubernetesJobType"):
        """
        The run_next command will check the task_queue for any un-run jobs.
        It will then create a unique job-id, launch that job in the cluster,
        and store relevant info in the current_jobs map so we can track the job's
        status
        """
        if AIRFLOW_VERSION_2:
            # pod_template_file is included in Airflow 2, but not used by our executor.
            key, command, kube_executor_config, pod_template_file = next_job
        else:
            key, command, kube_executor_config = next_job

        self.run_next_kube_job(key, command)

    def run_next_kube_job(self, key, command):
        dag_id, task_id, execution_date, try_number = key
        self.log.debug(
            "Kube POD to submit: image=%s with %s [%s]",
            self.kube_config.kube_image,
            str(key),
            str(command),
        )

        databand_run = try_get_databand_run()
        task_run = databand_run.get_task_run_by_af_id(task_id)

        pod_command = [str(c) for c in command]
        task_engine = task_run.task_engine  # type: KubernetesEngineConfig
        pod: "k8s.V1Pod" = task_engine.build_pod(
            task_run=task_run,
            cmds=pod_command,
            labels={
                "airflow-worker":
                self._version_independent_worker_id(),
                "dag_id":
                make_safe_label_value(dag_id),
                "task_id":
                make_safe_label_value(task_run.task_af_id),
                "execution_date":
                self._datetime_to_label_safe_datestring(execution_date),
                "try_number":
                str(try_number),
            },
            try_number=try_number,
            include_system_secrets=True,
        )
        pod_ctrl = self.kube_dbnd.get_pod_ctrl(pod.metadata.name,
                                               pod.metadata.namespace,
                                               config=task_engine)
        self.submitted_pods[pod.metadata.name] = SubmittedPodState(
            pod_name=pod.metadata.name,
            task_run=task_run,
            scheduler_key=key,
            submitted_at=utcnow(),
        )

        pod_ctrl.run_pod(pod=pod, task_run=task_run, detach_run=True)
        self.metrics_logger.log_pod_submitted(task_run.task,
                                              pod_name=pod.metadata.name)

    def _version_independent_worker_id(self):
        if AIRFLOW_VERSION_2:
            return self.scheduler_job_id
        else:
            return self.worker_uuid

    # in airflow>1.10.10 delete_pod method takes additional "namespace" arg
    # we do not use it in our overridden method but still we need to adjust
    # method signature to avoid errors when we run code on airflow>1.10.10.
    def delete_pod(self, pod_id, *args):
        # we are going to delete the pod only once.
        # the moment it's removed from submitted_pods, we will not handle its events, nor delete it again
        submitted_pod = self.submitted_pods.pop(pod_id, None)
        if not submitted_pod:
            return

        try:
            self.metrics_logger.log_pod_finished(submitted_pod.task_run.task)
        except Exception:
            # Catch all exceptions to prevent any delete loops, best effort
            self.log.exception(
                "%s failed to save pod finish info: pod_name=%s.!",
                submitted_pod.task_run,
                pod_id,
            )

        try:
            result = self.kube_dbnd.delete_pod(pod_id, self.namespace)
            return result
        except Exception:
            # Catch all exceptions to prevent any delete loops, best effort
            self.log.exception(
                "%s: Exception raised when trying to delete pod: pod_name=%s.",
                submitted_pod.task_run,
                pod_id,
            )

    def terminate(self):
        # we kill the watcher and the communication channel first

        # prevent a watcher bug of getting stuck on termination during event processing
        try:
            self.kube_watcher.safe_terminate()
            super(DbndKubernetesScheduler, self).terminate()
        finally:
            self._terminate_all_running_pods()

    def _terminate_all_running_pods(self):
        """
        Clean up all running pods on terminate.
        """
        # now we need to clean up after the run
        pods_to_delete = sorted(list(self.submitted_pods.values()))
        if not pods_to_delete:
            return

        self.log.info(
            "Terminating run, deleting all %d submitted pods that are still running/not finalized",
            len(pods_to_delete),
        )
        for submitted_pod in pods_to_delete:
            try:
                self.delete_pod(submitted_pod.pod_name)
            except Exception:
                self.log.exception("Failed to terminate pod %s",
                                   submitted_pod.pod_name)

        # Wait for pods to be deleted and execute their own state management
        self.log.info(
            "Setting all running/not finalized pods to cancelled in 10 seconds..."
        )
        time.sleep(10)
        try:
            for submitted_pod in pods_to_delete:
                task_run = submitted_pod.task_run
                ti_state = get_airflow_task_instance_state(task_run)
                if is_task_instance_finished(ti_state):
                    if task_run.task_run_state not in TaskRunState.final_states(
                    ):
                        self.log.info(
                            "%s with pod %s is not finished: airflow state - %s and databand state - %s."
                            "Setting the task_run state to match airflow state",
                            task_run,
                            submitted_pod.pod_name,
                            ti_state,
                            task_run.task_run_state,
                        )

                        new_state = AIRFLOW_TO_DBND_STATE_MAP.get(
                            ti_state, TaskRunState.CANCELLED)
                        task_run.set_task_run_state(new_state)
                    else:
                        self.log.info(
                            "%s with pod %s is finished: airflow state - %s and databand state - %s.Skipping",
                            task_run,
                            submitted_pod.pod_name,
                            ti_state,
                            task_run.task_run_state,
                        )

                    continue
                task_run.set_task_run_state(TaskRunState.CANCELLED)
        except Exception:
            self.log.exception("Could not set pods to cancelled!")

    def process_watcher_task(self, task):
        """Process the task event sent by watcher."""
        pod_event = WatcherPodEvent.from_watcher_task(task)
        pod_name = pod_event.pod_id
        self.log.debug(
            "Attempting to process pod; pod_name: %s; state: %s; labels: %s",
            pod_event.pod_id,
            pod_event.state,
            pod_event.labels,
        )

        submitted_pod = self.submitted_pods.get(pod_name)
        if submitted_pod is None:
            # this is a deleted pod - on delete the watcher will send an event
            # 1. deleted by the scheduler - we skip it here
            # 2. external delete -> we continue to process the event
            return

        # DBND-AIRFLOW we have it precached, we don't need to go to DB
        # key = self._labels_to_key(labels=labels)
        # if not key:
        #     self.log.info(
        #         "Can't find a key for event from %s - %s from labels %s, skipping",
        #         pod_name,
        #         state,
        #         labels,
        #     )
        #     return

        self.log.debug(
            "Attempting to process pod; pod_name: %s; state: %s; labels: %s",
            pod_event.pod_id,
            pod_event.state,
            pod_event.labels,
        )

        # we are not looking for key
        result = PodResult.from_pod(submitted_pod, pod_event)
        if submitted_pod.processed:
            # we already processed this kind of event (e.g. we already have a failed status in this process)
            self.log.info(
                "Skipping pod '%s' event from %s - already processed",
                pod_event.state,
                pod_name,
            )
            return

        if result.state == State.RUNNING:
            # we should get here only once -> when the pod starts to run

            self._process_pod_running(submitted_pod)
            # we will not send the event to the executor (otherwise it would delete the running pod)
            return

        try:
            if result.state is None:
                # simple case, pod has succeeded - will be processed by the airflow main scheduler (Job)
                # the task can have failed or passed: Airflow exits with 0 even if the task failed in the regular way.
                self._process_pod_success(submitted_pod)
                self.result_queue.put(result.as_tuple())
            elif result.state == State.FAILED:
                # Pod crashed, was deleted, killed, evicted... we need to give it extra treatment
                self._process_pod_failed(submitted_pod)
                self.result_queue.put(result.as_tuple())
            else:
                self.log.debug(
                    "finishing job %s - %s (%s)",
                    result.key,
                    result.state,
                    result.pod_id,
                )
                self.result_queue.put(result.as_tuple())
        finally:
            submitted_pod.processed = True

    def _process_pod_running(self, submitted_pod):
        task_run = submitted_pod.task_run
        pod_name = submitted_pod.pod_name

        # node_name is coming from outside
        # if it's not None, we already got a Running phase event
        if submitted_pod.node_name:
            self.log.info(
                "%s: Zombie bug: Seeing pod event again. "
                "Probably something happening with pod and it's node: %s",
                submitted_pod.task_run,
                submitted_pod.pod_name,
            )
            return

        pod_data = self.kube_dbnd.get_pod_status(pod_name)
        if not pod_data or not pod_data.spec.node_name:
            self.log.error("%s: Failed to find pod data for %s", task_run, pod_name)
            node_name = "failed_to_find"
        else:
            node_name = pod_data.spec.node_name
            self.metrics_logger.log_pod_running(task_run.task,
                                                node_name=node_name)

        submitted_pod.node_name = node_name

        # only print this info to the console, track=False
        task_run.set_task_run_state(TaskRunState.RUNNING, track=False)

    def _process_pod_success(self, submitted_pod):
        # type: (SubmittedPodState) -> None

        task_run = submitted_pod.task_run
        pod_name = submitted_pod.pod_name

        if submitted_pod.processed:
            self.log.info(
                "%s Skipping pod 'success' event from %s: already processed",
                task_run,
                pod_name)
            return

        # get refreshed TI from Airflow DB
        ti = get_airflow_task_instance(task_run=task_run)

        # we print success message to the screen
        # we will not send it to databand tracking store

        if ti.state == State.SUCCESS:
            dbnd_state = TaskRunState.SUCCESS
        elif ti.state in {State.UP_FOR_RETRY, State.UP_FOR_RESCHEDULE}:
            dbnd_state = TaskRunState.UP_FOR_RETRY
        elif ti.state in {State.FAILED, State.SHUTDOWN}:
            dbnd_state = TaskRunState.FAILED
        else:
            # we got a corruption here, pod has finished, but the AF state is not "final" state
            # meaning: AF execution was interrupted in the middle
            self.log.error(
                "Pod %s has finished with SUCCESS, but task instance state is %s, failing the job."
                % (pod_name, ti.state))

            self._process_pod_failed(submitted_pod,
                                     PodFailureReason.err_pod_evicted)
            return

        # only print to console
        task_run.set_task_run_state(dbnd_state, track=False)
        self.log.info(
            "%s has been completed at pod '%s' with state %s try_number=%s!" %
            (task_run, pod_name, ti.state, ti._try_number))

    def _process_pod_failed(
        self,
        submitted_pod: SubmittedPodState,
        known_fail_reason: Optional[PodFailureReason] = None,
    ):
        task_run = submitted_pod.task_run
        pod_name = submitted_pod.pod_name

        task_id = task_run.task_af_id
        ti_state = get_airflow_task_instance_state(task_run=task_run)

        self.log.error("%s: pod %s has crashed, airflow state: %s", task_run,
                       pod_name, ti_state)

        pod_data = self.kube_dbnd.get_pod_status(pod_name)
        pod_ctrl = self.kube_dbnd.get_pod_ctrl(pod_name, self.namespace)

        pod_logs = []
        if pod_data:
            pod_status_log = _get_status_log_safe(pod_data)
            pod_phase = pod_data.status.phase
            if pod_phase != "Pending":
                pod_logs = pod_ctrl.get_pod_logs()
        else:
            pod_status_log = "POD NOT FOUND"

        error_msg = "Pod %s at %s has failed (task state=%s)!" % (
            pod_name,
            self.namespace,
            ti_state,
        )
        failure_reason, failure_message = self._find_pod_failure_reason(
            pod_data=pod_data, pod_name=pod_name)
        if failure_reason:
            error_msg += " Discovered reason for failure is %s: %s." % (
                failure_reason,
                failure_message,
            )
        error_help_msg = "Please see full pod log for more details."
        if pod_logs:
            error_help_msg += "\nPod logs:\n%s\n" % "\n".join(
                ["out: %s" % l for l in pod_logs[-20:]])

        from dbnd._core.task_run.task_run_error import TaskRunError

        task_run_error = TaskRunError.build_from_message(
            task_run=task_run, msg=error_msg, help_msg=error_help_msg)

        if is_task_instance_finished(ti_state):
            # Pod has failed, however, Airflow managed to update the state
            # that means all code (including dbnd) was executed
            # let's just report the error, so we can show it in the summary
            # we will not send it to the databand tracking store, only print it to the console
            dbnd_state = AIRFLOW_TO_DBND_STATE_MAP.get(ti_state, None)
            task_run.set_task_run_state(dbnd_state,
                                        track=False,
                                        error=task_run_error)

            if dbnd_state == TaskRunState.FAILED:
                color = "red"
            elif dbnd_state == TaskRunState.SUCCESS:
                color = "cyan"
            else:
                color = "yellow"

            self.log.info(
                "%s",
                task_run.task.ctrl.banner(
                    "Task %s(%s) - pod %s has failed, airflow state=%s!" %
                    (task_run.task.task_name, task_id, pod_name, ti_state),
                    color=color,
                    task_run=task_run,
                ),
            )
            return

        # we got State.FAILED from the watcher, but in the DB the airflow instance is in an unfinished state
        # that means the task has failed in the middle
        # (all kinds of errors and exit codes)
        task_run_log = error_msg
        task_run_log += pod_status_log
        if pod_logs:
            # let's upload its logs - we don't know what happened
            task_run_log += "\nPod logs:\n\n%s\n\n" % "\n".join(pod_logs)
        task_run.tracker.save_task_run_log(task_run_log)

        self._handle_crashed_task_instance(
            task_run=task_run,
            task_run_error=task_run_error,
            failure_reason=known_fail_reason or failure_reason,
        )

    def _find_pod_failure_reason(self, pod_name, pod_data):
        # type: (str, V1Pod) -> (Optional[PodFailureReason], Optional[str])
        if not pod_data:
            return (
                PodFailureReason.err_pod_deleted,
                "Pod %s is not found at cluster (deleted/spot/preemptible)" %
                pod_name,
            )

        pod_phase = pod_data.status.phase
        pod_ctrl = self.kube_dbnd.get_pod_ctrl(name=pod_name)

        if pod_phase == "Pending":
            self.log.error(
                "Got pod %s at Pending state which is failing: looking for the reason..",
                pod_name,
            )
            try:
                pod_ctrl.check_deploy_errors(pod_data)
            # we handle only known errors
            except KubernetesImageNotFoundError as ex:
                return PodFailureReason.err_image_pull, str(ex)
            except KubernetesPodConfigFailure as ex:  # pod config error
                return PodFailureReason.err_config_error, str(ex)
            except Exception:
                # we don't want to handle that
                pass
            return None, None

        if pod_data.metadata.deletion_timestamp:
            return (
                PodFailureReason.err_pod_deleted,
                "Pod %s has been deleted at %s" %
                (pod_name, pod_data.metadata.deletion_timestamp),
            )

        pod_exit_code, termination_reason = _try_get_pod_exit_code(pod_data)
        if pod_exit_code or termination_reason:
            self.log.info(
                "Found pod exit code %d for pod %s; Termination reason: %s",
                pod_exit_code,
                pod_name,
                termination_reason,
            )
            exit_code_message = ""
            if pod_exit_code:
                exit_code_message = "Pod exit code %s; "
            pod_exit_code = str(pod_exit_code)

            return (
                pod_exit_code,
                "%sTermination reason: %s" %
                (exit_code_message, termination_reason),
            )
        return None, None

    @provide_session
    def _handle_crashed_task_instance(self,
                                      task_run,
                                      task_run_error,
                                      failure_reason,
                                      session=None):
        # type: (TaskRun, TaskRunError, PodFailureReason, Optional[Session]) -> None

        task_instance = get_airflow_task_instance(task_run, session=session)
        task_instance.task = task_run.task.ctrl.airflow_op

        retry_config = self.kube_dbnd.engine_config.pod_retry_config
        retry_count = retry_config.get_retry_count(failure_reason)
        if retry_count is not None:
            # update retry for the latest values (we don't have
            task_run.task.task_retries = retry_count
            task_instance.task.retries = retry_count
            task_instance.max_tries = retry_count

        self.log.info(
            "Retry %s  task: max_retries=%s, task.retries=%s, current:%s state:%s",
            task_run,
            task_instance.max_tries,
            task_instance.task.retries,
            task_instance._try_number,
            task_instance.state,
        )

        if task_instance.state == State.QUEUED:
            # Special case - no airflow code has been run in the pod at all.
            # usually try_number is incremented the moment the state moves to Running, and while at Running state it keeps the same value
            # Must increment the try number ourselves,
            task_instance._try_number += 1
            session.merge(task_instance)
            session.commit()

        # Airflow decides whether to retry the task based on this condition:
        # >>> task_instance.task.retries and task_instance.try_number <= task_instance.max_tries
        task_instance.handle_failure(str(task_run_error.exception),
                                     session=session)

        # will be logged to help debug why we did or didn't retry the task
        retry_data = "task.retries={retries}, try_number={try_number}, max_tries={max_tries}.".format(
            retries=task_instance.task.retries,
            try_number=task_instance.try_number,
            max_tries=task_instance.max_tries,
        )

        if task_instance.state == State.UP_FOR_RETRY:
            self.log.info("Set the task {task_id} to retry: {data}".format(
                task_id=str(task_run), data=retry_data))
            task_run.set_task_run_state(TaskRunState.UP_FOR_RETRY,
                                        track=True,
                                        error=task_run_error)
        else:
            self.log.info(
                "Task {task_id} is not set up to retry: {data}".format(
                    task_id=str(task_run), data=retry_data))
            task_run.set_task_run_state(TaskRunState.FAILED,
                                        track=True,
                                        error=task_run_error)

    def handle_zombie_task_instance(self, zombie_task_instance):
        # type: (SimpleTaskInstance)-> Optional[SubmittedPodState]

        # find a relevant submitted pod based on TaskInstance.key (dag,task,execution_date,try_number)
        zombie_pod_state = [
            pod_state for pod_state in self.submitted_pods.values()
            if pod_state.scheduler_key == zombie_task_instance.key
        ]
        if not zombie_pod_state:
            self.log.info(
                "Zombie task instance %s is not found at running pods, skipping",
                zombie_task_instance.key,
            )
            return None

        self.log.info("Processing zombie task instance %s as failed",
                      zombie_task_instance)
        zombie_pod_state = zombie_pod_state[0]
        self._process_pod_failed(zombie_pod_state)
        return zombie_pod_state

    @staticmethod
    def _datetime_to_label_safe_datestring(datetime_obj):
        """
        Kubernetes doesn't allow ":" in label values. Since the ISO datetime format
        uses ":" but not "_", replace ":" with "_".

        :param datetime_obj: datetime.datetime object
        :return: ISO-like string representing the datetime
        """
        formatted = datetime_obj
        if isinstance(datetime_obj, datetime.datetime):
            # Airflow 1 passes a real datetime here, while Airflow 2.x passes only a str object
            formatted = datetime_obj.isoformat()
        return formatted.replace(":", "_").replace("+", "_plus_")
Example #7
class DbndKubernetesJobWatcher(KubernetesJobWatcher):
    def __init__(self, kube_dbnd, **kwargs):
        super(DbndKubernetesJobWatcher, self).__init__(**kwargs)
        self.kube_dbnd = kube_dbnd  # type: DbndKubernetesClient
        self.processed_events = {}
        self.processed_pods = {}
        self.metrics_logger = KubernetesMetricsLogger()

    def run(self):
        """Performs watching"""
        # we are in a different process than the Scheduler
        # 1. Must reset filesystem cache to avoid using out-of-cluster credentials within Kubernetes
        self.reset_fs_cache()
        # 2. Must reset signal handlers to avoid driver and watcher sharing signal handlers

        signal.signal(signal.SIGINT, watcher_sig_handler)
        signal.signal(signal.SIGTERM, watcher_sig_handler)
        signal.signal(signal.SIGQUIT, watcher_sig_handler)

        kube_client = self.kube_dbnd.kube_client
        try:
            while True:
                try:
                    self.resource_version = self._run(
                        kube_client,
                        self.resource_version,
                        self.worker_uuid,
                        self.kube_config,
                    )
                except DatabandSigTermError:
                    break
                except Exception:
                    self.log.exception("Unknown error in KubernetesJobWatcher. Failing")
                    raise
                else:
                    self.log.info(
                        "KubernetesWatcher restarting with resource_version: %s in %s seconds",
                        self.resource_version,
                        self.kube_dbnd.engine_config.watcher_recreation_interval_seconds,
                    )
                    time.sleep(
                        self.kube_dbnd.engine_config.watcher_recreation_interval_seconds
                    )
        except (KeyboardInterrupt, DatabandSigTermError):
            pass

    def _run(self, kube_client, resource_version, worker_uuid, kube_config):
        self.log.info(
            "Event: and now my watch begins starting at resource_version: %s",
            resource_version,
        )

        from kubernetes import watch

        watcher = watch.Watch()
        request_timeout = self.kube_dbnd.engine_config.watcher_request_timeout_seconds
        kwargs = {
            "label_selector": "airflow-worker={}".format(worker_uuid),
            "_request_timeout": (request_timeout, request_timeout),
            "timeout_seconds": self.kube_dbnd.engine_config.watcher_client_timeout_seconds,
        }
        if resource_version:
            kwargs["resource_version"] = resource_version
        if kube_config.kube_client_request_args:
            for key, value in kube_config.kube_client_request_args.items():
                kwargs[key] = value

        for event in watcher.stream(
            kube_client.list_namespaced_pod, self.namespace, **kwargs
        ):
            try:
                # DBND PATCH
                # we want to process the message
                task = event["object"]
                self.log.debug(
                    " %s had an event of type %s", task.metadata.name, event["type"],
                )

                if event["type"] == "ERROR":
                    return self.process_error(event)

                pod_data = event["object"]
                pod_name = pod_data.metadata.name
                phase = pod_data.status.phase

                if self.processed_events.get(pod_name):
                    self.log.debug(
                        "Event: %s at %s - skipping as seen", phase, pod_name
                    )
                    continue
                status = self.kube_dbnd.process_pod_event(event)

                self._update_node_name(pod_name, pod_data)

                if status in ["Succeeded", "Failed"]:
                    self.processed_events[pod_name] = status

                self.process_status_quite(
                    task.metadata.name,
                    status,
                    task.metadata.labels,
                    task.metadata.resource_version,
                )
                self.resource_version = task.metadata.resource_version

            except Exception as e:
                self.log.warning(
                    "Event: Exception raised on specific event: %s, Exception: %s",
                    event,
                    e,
                )
        return self.resource_version

    def process_error(self, event):
        # Overriding airflow's order of operation to prevent redundant error logs (no actual error, just reset
        # resource version)
        raw_object = event["raw_object"]
        if raw_object["code"] == 410:
            self.log.info(
                "Kubernetes resource version is too old, resetting to 0 => %s",
                (raw_object["message"],),
            )
            # Return resource version 0
            return "0"
        raise DatabandRuntimeError(
            "Kubernetes failure for %s with code %s and message: %s"
            % (raw_object["reason"], raw_object["code"], raw_object["message"])
        )

    def _update_node_name(self, pod_name, pod_data):
        if self.processed_pods.get(pod_name):
            self.log.debug("Pod %s has already been logged to metrics - skipping", pod_name)
            return
        node_name = pod_data.spec.node_name
        if not node_name:
            return
        # Some events are missing the node name; it will appear in a later event
        try:
            task_id = pod_data.metadata.labels.get("task_id")
            if not task_id:
                return

            dr = try_get_databand_run()
            if not dr:
                return
            task_run = dr.get_task_run(task_id)
            if not task_run:
                return

            self.metrics_logger.log_pod_information(task_run.task, pod_name, node_name)
        except Exception as ex:
            logger.info("Failed to gather node name for %s", pod_name)
        finally:
            self.processed_pods[pod_name] = True

    def process_status_quite(self, pod_id, status, labels, resource_version):
        """Process status response"""
        if status == "Pending":
            self.log.debug("Event: %s Pending", pod_id)
        elif status == "Failed":
            self.log.debug("Event: %s Failed", pod_id)
            self.watcher_queue.put((pod_id, State.FAILED, labels, resource_version))
        elif status == "Succeeded":
            self.log.debug("Event: %s Succeeded", pod_id)
            self.watcher_queue.put((pod_id, State.SUCCESS, labels, resource_version))
        elif status == "Running":
            self.log.debug("Event: %s is Running", pod_id)
        else:
            self.log.warning(
                "Event: Invalid state: %s on pod: %s with labels: %s with "
                "resource_version: %s",
                status,
                pod_id,
                labels,
                resource_version,
            )

    def reset_fs_cache(self):
        from targets.fs import reset_fs_cache

        reset_fs_cache()
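process_status_quite in Example #7 defines the queue contract between watcher and scheduler: each entry on watcher_queue is a (pod_id, state, labels, resource_version) tuple. A minimal sketch of a consumer loop on the scheduler side follows; the function name and the non-blocking drain strategy are illustrative assumptions, not dbnd's code:

from queue import Empty


def drain_watcher_queue(watcher_queue, process_watcher_task):
    # pop every pending (pod_id, state, labels, resource_version) tuple
    # produced by process_status_quite and hand it to the scheduler's handler
    while True:
        try:
            task = watcher_queue.get_nowait()
        except Empty:
            break
        process_watcher_task(task)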