Exemplo n.º 1
0
    def _process_backfill_task_instances(self,
                                         ti_status,
                                         executor,
                                         pickle_id,
                                         start_date=None,
                                         session=None):
        """
        Process a set of task instances from a set of dag runs. Special handling is done
        to account for different task instance states that could be present when running
        them in a backfill process.

        :param ti_status: the internal status of the job
        :type ti_status: BackfillJob._DagRunTaskStatus
        :param executor: the executor to run the task instances
        :type executor: BaseExecutor
        :param pickle_id: the pickle_id if dag is pickled, None otherwise
        :type pickle_id: int
        :param start_date: the start date of the backfill job
        :type start_date: datetime.datetime
        :param session: the current session object
        :type session: sqlalchemy.orm.session.Session
        :return: the list of execution_dates for the finished dag runs
        :rtype: list
        """

        executed_run_dates = []

        while ((len(ti_status.to_run) > 0 or len(ti_status.running) > 0)
               and len(ti_status.deadlocked) == 0):
            self.log.debug("*** Clearing out not_ready list ***")
            ti_status.not_ready.clear()

            # we need to execute the tasks bottom to top
            # or leaf to root, as otherwise tasks might be
            # determined deadlocked while they are actually
            # waiting for their upstream to finish
            @provide_session
            def _per_task_process(task, key, ti, session=None):
                ti.refresh_from_db()

                task = self.dag.get_task(ti.task_id)
                ti.task = task

                ignore_depends_on_past = (self.ignore_first_depends_on_past
                                          and ti.execution_date
                                          == (start_date or ti.start_date))
                self.log.debug("Task instance to run %s state %s", ti,
                               ti.state)

                # The task was already marked successful or skipped by a
                # different Job. Don't rerun it.
                if ti.state == State.SUCCESS:
                    ti_status.succeeded.add(key)
                    self.log.debug("Task instance %s succeeded. Don't rerun.",
                                   ti)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    return
                elif ti.state == State.SKIPPED:
                    ti_status.skipped.add(key)
                    self.log.debug("Task instance %s skipped. Don't rerun.",
                                   ti)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    return

                # guard against externally modified tasks instances or
                # in case max concurrency has been reached at task runtime
                elif ti.state == State.NONE:
                    self.log.warning(
                        "FIXME: task instance {} state was set to None "
                        "externally. This should not happen")
                    ti.set_state(State.SCHEDULED, session=session)
                if self.rerun_failed_tasks:
                    # Rerun failed tasks or upstreamed failed tasks
                    if ti.state in (State.FAILED, State.UPSTREAM_FAILED):
                        self.log.error("Task instance {ti} "
                                       "with state {state}".format(
                                           ti=ti, state=ti.state))
                        if key in ti_status.running:
                            ti_status.running.pop(key)
                        # Reset the failed task in backfill to scheduled state
                        ti.set_state(State.SCHEDULED, session=session)
                else:
                    # Default behaviour which works for subdag.
                    if ti.state in (State.FAILED, State.UPSTREAM_FAILED):
                        self.log.error("Task instance {ti} "
                                       "with {state} state".format(
                                           ti=ti, state=ti.state))
                        ti_status.failed.add(key)
                        ti_status.to_run.pop(key)
                        if key in ti_status.running:
                            ti_status.running.pop(key)
                        return

                backfill_context = DepContext(
                    deps=BACKFILL_QUEUED_DEPS,
                    ignore_depends_on_past=ignore_depends_on_past,
                    ignore_task_deps=self.ignore_task_deps,
                    flag_upstream_failed=True)

                ti.refresh_from_db(lock_for_update=True, session=session)
                # Is the task runnable? -- then run it
                # the dependency checker can change states of tis
                if ti.are_dependencies_met(dep_context=backfill_context,
                                           session=session,
                                           verbose=self.verbose):
                    if executor.has_task(ti):
                        self.log.debug(
                            "Task Instance %s already in executor "
                            "waiting for queue to clear", ti)
                    else:
                        self.log.debug('Sending %s to executor', ti)
                        # Skip scheduled state, we are executing immediately
                        ti.state = State.QUEUED
                        ti.queued_dttm = timezone.utcnow(
                        ) if not ti.queued_dttm else ti.queued_dttm
                        session.merge(ti)

                        cfg_path = None
                        if executor.__class__ in (
                                executors.LocalExecutor,
                                executors.SequentialExecutor):
                            cfg_path = tmp_configuration_copy()

                        executor.queue_task_instance(
                            ti,
                            mark_success=self.mark_success,
                            pickle_id=pickle_id,
                            ignore_task_deps=self.ignore_task_deps,
                            ignore_depends_on_past=ignore_depends_on_past,
                            pool=self.pool,
                            cfg_path=cfg_path)
                        ti_status.running[key] = ti
                        ti_status.to_run.pop(key)
                    session.commit()
                    return

                if ti.state == State.UPSTREAM_FAILED:
                    self.log.error("Task instance %s upstream failed", ti)
                    ti_status.failed.add(key)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    return

                # special case
                if ti.state == State.UP_FOR_RETRY:
                    self.log.debug(
                        "Task instance %s retry period not "
                        "expired yet", ti)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    ti_status.to_run[key] = ti
                    return

                # special case
                if ti.state == State.UP_FOR_RESCHEDULE:
                    self.log.debug(
                        "Task instance %s reschedule period not "
                        "expired yet", ti)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    ti_status.to_run[key] = ti
                    return

                # all remaining tasks
                self.log.debug('Adding %s to not_ready', ti)
                ti_status.not_ready.add(key)

            try:
                for task in self.dag.topological_sort():
                    for key, ti in list(ti_status.to_run.items()):
                        if task.task_id != ti.task_id:
                            continue

                        pool = session.query(models.Pool) \
                            .filter(models.Pool.pool == task.pool) \
                            .first()
                        if not pool:
                            raise PoolNotFound('Unknown pool: {}'.format(
                                task.pool))

                        open_slots = pool.open_slots(session=session)
                        if open_slots <= 0:
                            raise NoAvailablePoolSlot(
                                "Not scheduling since there are "
                                "%s open slots in pool %s".format(
                                    open_slots, task.pool))

                        num_running_task_instances_in_dag = DAG.get_num_task_instances(
                            self.dag_id,
                            states=self.STATES_COUNT_AS_RUNNING,
                        )

                        if num_running_task_instances_in_dag >= self.dag.concurrency:
                            raise DagConcurrencyLimitReached(
                                "Not scheduling since DAG concurrency limit "
                                "is reached.")

                        if task.task_concurrency:
                            num_running_task_instances_in_task = DAG.get_num_task_instances(
                                dag_id=self.dag_id,
                                task_ids=[task.task_id],
                                states=self.STATES_COUNT_AS_RUNNING,
                            )

                            if num_running_task_instances_in_task >= task.task_concurrency:
                                raise TaskConcurrencyLimitReached(
                                    "Not scheduling since Task concurrency limit "
                                    "is reached.")

                        _per_task_process(task, key, ti)
            except (NoAvailablePoolSlot, DagConcurrencyLimitReached,
                    TaskConcurrencyLimitReached) as e:
                self.log.debug(e)

            # execute the tasks in the queue
            self.heartbeat()
            executor.heartbeat()

            # If the set of tasks that aren't ready ever equals the set of
            # tasks to run and there are no running tasks then the backfill
            # is deadlocked
            if (ti_status.not_ready
                    and ti_status.not_ready == set(ti_status.to_run)
                    and len(ti_status.running) == 0):
                self.log.warning("Deadlock discovered for ti_status.to_run=%s",
                                 ti_status.to_run.values())
                ti_status.deadlocked.update(ti_status.to_run.values())
                ti_status.to_run.clear()

            # check executor state
            self._manage_executor_state(ti_status.running)

            # update the task counters
            self._update_counters(ti_status=ti_status)

            # update dag run state
            _dag_runs = ti_status.active_runs[:]
            for run in _dag_runs:
                run.update_state(session=session)
                if run.state in State.finished():
                    ti_status.finished_runs += 1
                    ti_status.active_runs.remove(run)
                    executed_run_dates.append(run.execution_date)

            self._log_progress(ti_status)

        # return updated status
        return executed_run_dates
Exemplo n.º 2
0
    def _process_backfill_task_instances(
        self,
        ti_status,
        executor,
        pickle_id,
        start_date=None,
        session=None,
    ):
        """
        Process a set of task instances from a set of dag runs. Special handling is done
        to account for different task instance states that could be present when running
        them in a backfill process.

        :param ti_status: the internal status of the job
        :param executor: the executor to run the task instances
        :param pickle_id: the pickle_id if dag is pickled, None otherwise
        :param start_date: the start date of the backfill job
        :param session: the current session object
        :return: the list of execution_dates for the finished dag runs
        :rtype: list
        """
        executed_run_dates = []

        is_unit_test = airflow_conf.getboolean('core', 'unit_test_mode')

        while (len(ti_status.to_run) > 0 or
               len(ti_status.running) > 0) and len(ti_status.deadlocked) == 0:
            self.log.debug("*** Clearing out not_ready list ***")
            ti_status.not_ready.clear()

            # we need to execute the tasks bottom to top
            # or leaf to root, as otherwise tasks might be
            # determined deadlocked while they are actually
            # waiting for their upstream to finish
            def _per_task_process(key, ti: TaskInstance, session=None):
                ti.refresh_from_db(lock_for_update=True, session=session)

                task = self.dag.get_task(ti.task_id, include_subdags=True)
                ti.task = task

                self.log.debug("Task instance to run %s state %s", ti,
                               ti.state)

                # The task was already marked successful or skipped by a
                # different Job. Don't rerun it.
                if ti.state == TaskInstanceState.SUCCESS:
                    ti_status.succeeded.add(key)
                    self.log.debug("Task instance %s succeeded. Don't rerun.",
                                   ti)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    return
                elif ti.state == TaskInstanceState.SKIPPED:
                    ti_status.skipped.add(key)
                    self.log.debug("Task instance %s skipped. Don't rerun.",
                                   ti)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    return

                # guard against externally modified tasks instances or
                # in case max concurrency has been reached at task runtime
                elif ti.state == State.NONE:
                    self.log.warning(
                        "FIXME: Task instance %s state was set to None externally. This should not happen",
                        ti)
                    ti.set_state(TaskInstanceState.SCHEDULED, session=session)
                if self.rerun_failed_tasks:
                    # Rerun failed tasks or upstreamed failed tasks
                    if ti.state in (TaskInstanceState.FAILED,
                                    TaskInstanceState.UPSTREAM_FAILED):
                        self.log.error("Task instance %s with state %s", ti,
                                       ti.state)
                        if key in ti_status.running:
                            ti_status.running.pop(key)
                        # Reset the failed task in backfill to scheduled state
                        ti.set_state(TaskInstanceState.SCHEDULED,
                                     session=session)
                else:
                    # Default behaviour which works for subdag.
                    if ti.state in (TaskInstanceState.FAILED,
                                    TaskInstanceState.UPSTREAM_FAILED):
                        self.log.error("Task instance %s with state %s", ti,
                                       ti.state)
                        ti_status.failed.add(key)
                        ti_status.to_run.pop(key)
                        if key in ti_status.running:
                            ti_status.running.pop(key)
                        return

                if self.ignore_first_depends_on_past:
                    dagrun = ti.get_dagrun(session=session)
                    ignore_depends_on_past = dagrun.execution_date == (
                        start_date or ti.start_date)
                else:
                    ignore_depends_on_past = False

                backfill_context = DepContext(
                    deps=BACKFILL_QUEUED_DEPS,
                    ignore_depends_on_past=ignore_depends_on_past,
                    ignore_task_deps=self.ignore_task_deps,
                    flag_upstream_failed=True,
                )

                # Is the task runnable? -- then run it
                # the dependency checker can change states of tis
                if ti.are_dependencies_met(dep_context=backfill_context,
                                           session=session,
                                           verbose=self.verbose):
                    if executor.has_task(ti):
                        self.log.debug(
                            "Task Instance %s already in executor waiting for queue to clear",
                            ti)
                    else:
                        self.log.debug('Sending %s to executor', ti)
                        # Skip scheduled state, we are executing immediately
                        ti.state = TaskInstanceState.QUEUED
                        ti.queued_by_job_id = self.id
                        ti.queued_dttm = timezone.utcnow()
                        session.merge(ti)
                        try:
                            session.commit()
                        except OperationalError:
                            self.log.exception(
                                "Failed to commit task state change due to operational error"
                            )
                            session.rollback()
                            # early exit so the outer loop can retry
                            return

                        cfg_path = None
                        if self.executor_class in (
                                executor_constants.LOCAL_EXECUTOR,
                                executor_constants.SEQUENTIAL_EXECUTOR,
                        ):
                            cfg_path = tmp_configuration_copy()

                        executor.queue_task_instance(
                            ti,
                            mark_success=self.mark_success,
                            pickle_id=pickle_id,
                            ignore_task_deps=self.ignore_task_deps,
                            ignore_depends_on_past=ignore_depends_on_past,
                            pool=self.pool,
                            cfg_path=cfg_path,
                        )
                        ti_status.running[key] = ti
                        ti_status.to_run.pop(key)
                    return

                if ti.state == TaskInstanceState.UPSTREAM_FAILED:
                    self.log.error("Task instance %s upstream failed", ti)
                    ti_status.failed.add(key)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    return

                # special case
                if ti.state == TaskInstanceState.UP_FOR_RETRY:
                    self.log.debug(
                        "Task instance %s retry period not expired yet", ti)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    ti_status.to_run[key] = ti
                    return

                # special case
                if ti.state == TaskInstanceState.UP_FOR_RESCHEDULE:
                    self.log.debug(
                        "Task instance %s reschedule period not expired yet",
                        ti)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    ti_status.to_run[key] = ti
                    return

                # all remaining tasks
                self.log.debug('Adding %s to not_ready', ti)
                ti_status.not_ready.add(key)

            try:
                for task in self.dag.topological_sort(
                        include_subdag_tasks=True):
                    for key, ti in list(ti_status.to_run.items()):
                        if task.task_id != ti.task_id:
                            continue

                        pool = session.query(models.Pool).filter(
                            models.Pool.pool == task.pool).first()
                        if not pool:
                            raise PoolNotFound(f'Unknown pool: {task.pool}')

                        open_slots = pool.open_slots(session=session)
                        if open_slots <= 0:
                            raise NoAvailablePoolSlot(
                                f"Not scheduling since there are {open_slots} open slots in pool {task.pool}"
                            )

                        num_running_task_instances_in_dag = DAG.get_num_task_instances(
                            self.dag_id,
                            states=self.STATES_COUNT_AS_RUNNING,
                            session=session,
                        )

                        if num_running_task_instances_in_dag >= self.dag.max_active_tasks:
                            raise DagConcurrencyLimitReached(
                                "Not scheduling since DAG max_active_tasks limit is reached."
                            )

                        if task.max_active_tis_per_dag:
                            num_running_task_instances_in_task = DAG.get_num_task_instances(
                                dag_id=self.dag_id,
                                task_ids=[task.task_id],
                                states=self.STATES_COUNT_AS_RUNNING,
                                session=session,
                            )

                            if num_running_task_instances_in_task >= task.max_active_tis_per_dag:
                                raise TaskConcurrencyLimitReached(
                                    "Not scheduling since Task concurrency limit is reached."
                                )

                        _per_task_process(key, ti, session)
                        session.commit()
            except (NoAvailablePoolSlot, DagConcurrencyLimitReached,
                    TaskConcurrencyLimitReached) as e:
                self.log.debug(e)

            self.heartbeat(only_if_necessary=is_unit_test)
            # execute the tasks in the queue
            executor.heartbeat()

            # If the set of tasks that aren't ready ever equals the set of
            # tasks to run and there are no running tasks then the backfill
            # is deadlocked
            if (ti_status.not_ready
                    and ti_status.not_ready == set(ti_status.to_run)
                    and len(ti_status.running) == 0):
                self.log.warning("Deadlock discovered for ti_status.to_run=%s",
                                 ti_status.to_run.values())
                ti_status.deadlocked.update(ti_status.to_run.values())
                ti_status.to_run.clear()

            # check executor state -- and expand any mapped TIs
            for node, run_id, new_mapped_tis, max_map_index in self._manage_executor_state(
                    ti_status.running, session):

                def to_keep(key: TaskInstanceKey) -> bool:
                    if key.dag_id != node.dag_id or key.task_id != node.task_id or key.run_id != run_id:
                        # For another Dag/Task/Run -- don't remove
                        return True
                    return 0 <= key.map_index <= max_map_index

                # remove the old unmapped TIs for node -- they have been replaced with the mapped TIs
                ti_status.to_run = {
                    key: ti
                    for (key, ti) in ti_status.to_run.items() if to_keep(key)
                }

                ti_status.to_run.update({ti.key: ti for ti in new_mapped_tis})

                for new_ti in new_mapped_tis:
                    new_ti.set_state(TaskInstanceState.SCHEDULED,
                                     session=session)

            # update the task counters
            self._update_counters(ti_status=ti_status, session=session)
            session.commit()

            # update dag run state
            _dag_runs = ti_status.active_runs[:]
            for run in _dag_runs:
                run.update_state(session=session)
                if run.state in State.finished:
                    ti_status.finished_runs += 1
                    ti_status.active_runs.remove(run)
                    executed_run_dates.append(run.execution_date)

            self._log_progress(ti_status)
            session.commit()

        # return updated status
        return executed_run_dates