def _process_backfill_task_instances(  # pylint: disable=too-many-statements
        self,
        ti_status,
        executor,
        pickle_id,
        start_date=None,
        session=None):
    """
    Process a set of task instances from a set of dag runs. Special handling is done
    to account for different task instance states that could be present when running
    them in a backfill process.

    :param ti_status: the internal status of the job
    :type ti_status: BackfillJob._DagRunTaskStatus
    :param executor: the executor to run the task instances
    :type executor: BaseExecutor
    :param pickle_id: the pickle_id if dag is pickled, None otherwise
    :type pickle_id: int
    :param start_date: the start date of the backfill job
    :type start_date: datetime.datetime
    :param session: the current session object
    :type session: sqlalchemy.orm.session.Session
    :return: the list of execution_dates for the finished dag runs
    :rtype: list
    """
    executed_run_dates = []

    while ((len(ti_status.to_run) > 0 or len(ti_status.running) > 0)
           and len(ti_status.deadlocked) == 0):
        self.log.debug("*** Clearing out not_ready list ***")
        ti_status.not_ready.clear()

        # we need to execute the tasks bottom to top
        # or leaf to root, as otherwise tasks might be
        # determined deadlocked while they are actually
        # waiting for their upstream to finish
        @provide_session
        def _per_task_process(task, key, ti, session=None):  # pylint: disable=too-many-return-statements
            ti.refresh_from_db(lock_for_update=True, session=session)

            task = self.dag.get_task(ti.task_id, include_subdags=True)
            ti.task = task

            ignore_depends_on_past = (
                self.ignore_first_depends_on_past
                and ti.execution_date == (start_date or ti.start_date)
            )
            self.log.debug("Task instance to run %s state %s", ti, ti.state)

            # The task was already marked successful or skipped by a
            # different Job. Don't rerun it.
            if ti.state == State.SUCCESS:
                ti_status.succeeded.add(key)
                self.log.debug("Task instance %s succeeded. Don't rerun.", ti)
                ti_status.to_run.pop(key)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                return
            elif ti.state == State.SKIPPED:
                ti_status.skipped.add(key)
                self.log.debug("Task instance %s skipped. Don't rerun.", ti)
                ti_status.to_run.pop(key)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                return

            # guard against externally modified tasks instances or
            # in case max concurrency has been reached at task runtime
            elif ti.state == State.NONE:
                self.log.warning(
                    "FIXME: task instance %s state was set to None "
                    "externally. This should not happen", ti)
                ti.set_state(State.SCHEDULED, session=session)

            if self.rerun_failed_tasks:
                # Rerun failed tasks or upstreamed failed tasks
                if ti.state in (State.FAILED, State.UPSTREAM_FAILED):
                    self.log.error("Task instance %s with state %s", ti, ti.state)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    # Reset the failed task in backfill to scheduled state
                    ti.set_state(State.SCHEDULED, session=session)
            else:
                # Default behaviour which works for subdag.
                if ti.state in (State.FAILED, State.UPSTREAM_FAILED):
                    self.log.error("Task instance %s with state %s", ti, ti.state)
                    ti_status.failed.add(key)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    return

            backfill_context = DepContext(
                deps=BACKFILL_QUEUED_DEPS,
                ignore_depends_on_past=ignore_depends_on_past,
                ignore_task_deps=self.ignore_task_deps,
                flag_upstream_failed=True)

            # Is the task runnable? -- then run it
            # the dependency checker can change states of tis
            if ti.are_dependencies_met(
                    dep_context=backfill_context,
                    session=session,
                    verbose=self.verbose):
                if executor.has_task(ti):
                    self.log.debug(
                        "Task Instance %s already in executor "
                        "waiting for queue to clear", ti)
                else:
                    self.log.debug('Sending %s to executor', ti)
                    # Skip scheduled state, we are executing immediately
                    ti.state = State.QUEUED
                    ti.queued_by_job_id = self.id
                    ti.queued_dttm = timezone.utcnow()
                    session.merge(ti)

                    cfg_path = None
                    if self.executor_class in (
                            ExecutorLoader.LOCAL_EXECUTOR,
                            ExecutorLoader.SEQUENTIAL_EXECUTOR):
                        cfg_path = tmp_configuration_copy()

                    executor.queue_task_instance(
                        ti,
                        mark_success=self.mark_success,
                        pickle_id=pickle_id,
                        ignore_task_deps=self.ignore_task_deps,
                        ignore_depends_on_past=ignore_depends_on_past,
                        pool=self.pool,
                        cfg_path=cfg_path)
                    ti_status.running[key] = ti
                    ti_status.to_run.pop(key)
                session.commit()
                return

            if ti.state == State.UPSTREAM_FAILED:
                self.log.error("Task instance %s upstream failed", ti)
                ti_status.failed.add(key)
                ti_status.to_run.pop(key)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                return

            # special case
            if ti.state == State.UP_FOR_RETRY:
                self.log.debug("Task instance %s retry period not expired yet", ti)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                ti_status.to_run[key] = ti
                return

            # special case
            if ti.state == State.UP_FOR_RESCHEDULE:
                self.log.debug("Task instance %s reschedule period not expired yet", ti)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                ti_status.to_run[key] = ti
                return

            # all remaining tasks
            self.log.debug('Adding %s to not_ready', ti)
            ti_status.not_ready.add(key)

        try:  # pylint: disable=too-many-nested-blocks
            for task in self.dag.topological_sort(include_subdag_tasks=True):
                for key, ti in list(ti_status.to_run.items()):
                    if task.task_id != ti.task_id:
                        continue

                    pool = session.query(models.Pool) \
                        .filter(models.Pool.pool == task.pool) \
                        .first()
                    if not pool:
                        raise PoolNotFound('Unknown pool: {}'.format(task.pool))

                    open_slots = pool.open_slots(session=session)
                    if open_slots <= 0:
                        raise NoAvailablePoolSlot(
                            "Not scheduling since there are "
                            "{0} open slots in pool {1}".format(open_slots, task.pool))

                    num_running_task_instances_in_dag = DAG.get_num_task_instances(
                        self.dag_id,
                        states=self.STATES_COUNT_AS_RUNNING,
                    )

                    if num_running_task_instances_in_dag >= self.dag.concurrency:
                        raise DagConcurrencyLimitReached(
                            "Not scheduling since DAG concurrency limit "
                            "is reached.")

                    if task.task_concurrency:
                        num_running_task_instances_in_task = DAG.get_num_task_instances(
                            dag_id=self.dag_id,
                            task_ids=[task.task_id],
                            states=self.STATES_COUNT_AS_RUNNING,
                        )

                        if num_running_task_instances_in_task >= task.task_concurrency:
                            raise TaskConcurrencyLimitReached(
                                "Not scheduling since Task concurrency limit "
                                "is reached.")

                    _per_task_process(task, key, ti)
        except (NoAvailablePoolSlot, DagConcurrencyLimitReached, TaskConcurrencyLimitReached) as e:
            self.log.debug(e)

        # execute the tasks in the queue
        self.heartbeat()
        executor.heartbeat()

        # If the set of tasks that aren't ready ever equals the set of
        # tasks to run and there are no running tasks then the backfill
        # is deadlocked
        if (ti_status.not_ready and
                ti_status.not_ready == set(ti_status.to_run) and
                len(ti_status.running) == 0):
            self.log.warning("Deadlock discovered for ti_status.to_run=%s", ti_status.to_run.values())
            ti_status.deadlocked.update(ti_status.to_run.values())
            ti_status.to_run.clear()

        # check executor state
        self._manage_executor_state(ti_status.running)

        # update the task counters
        self._update_counters(ti_status=ti_status)

        # update dag run state
        _dag_runs = ti_status.active_runs[:]
        for run in _dag_runs:
            run.update_state(session=session)
            if run.state in State.finished:
                ti_status.finished_runs += 1
                ti_status.active_runs.remove(run)
                executed_run_dates.append(run.execution_date)

        self._log_progress(ti_status)

    # return updated status
    return executed_run_dates
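
# A minimal, standalone sketch (not part of the Airflow codebase) of the deadlock check used in
# the while-loop above: if every task instance still waiting to run ended up on the not_ready set
# during this pass and nothing is currently running, no further progress is possible and the
# remaining task instances are declared deadlocked. `_detect_backfill_deadlock` is a hypothetical
# helper name used only for illustration.
def _detect_backfill_deadlock(to_run: dict, running: dict, not_ready: set) -> bool:
    """Return True when the backfill loop above could make no further progress.

    >>> key = ("example_dag", "wait_for_upstream", "2021-01-01")
    >>> _detect_backfill_deadlock({key: object()}, {}, {key})
    True
    """
    return bool(not_ready) and not_ready == set(to_run) and len(running) == 0
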
def _process_backfill_task_instances(
    self,
    ti_status,
    executor,
    pickle_id,
    start_date=None,
    session=None,
):
    """
    Process a set of task instances from a set of dag runs. Special handling is done
    to account for different task instance states that could be present when running
    them in a backfill process.

    :param ti_status: the internal status of the job
    :param executor: the executor to run the task instances
    :param pickle_id: the pickle_id if dag is pickled, None otherwise
    :param start_date: the start date of the backfill job
    :param session: the current session object
    :return: the list of execution_dates for the finished dag runs
    :rtype: list
    """
    executed_run_dates = []

    is_unit_test = airflow_conf.getboolean('core', 'unit_test_mode')

    while (len(ti_status.to_run) > 0 or len(ti_status.running) > 0) and len(ti_status.deadlocked) == 0:
        self.log.debug("*** Clearing out not_ready list ***")
        ti_status.not_ready.clear()

        # we need to execute the tasks bottom to top
        # or leaf to root, as otherwise tasks might be
        # determined deadlocked while they are actually
        # waiting for their upstream to finish
        def _per_task_process(key, ti: TaskInstance, session=None):
            ti.refresh_from_db(lock_for_update=True, session=session)

            task = self.dag.get_task(ti.task_id, include_subdags=True)
            ti.task = task

            self.log.debug("Task instance to run %s state %s", ti, ti.state)

            # The task was already marked successful or skipped by a
            # different Job. Don't rerun it.
            if ti.state == TaskInstanceState.SUCCESS:
                ti_status.succeeded.add(key)
                self.log.debug("Task instance %s succeeded. Don't rerun.", ti)
                ti_status.to_run.pop(key)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                return
            elif ti.state == TaskInstanceState.SKIPPED:
                ti_status.skipped.add(key)
                self.log.debug("Task instance %s skipped. Don't rerun.", ti)
                ti_status.to_run.pop(key)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                return

            # guard against externally modified tasks instances or
            # in case max concurrency has been reached at task runtime
            elif ti.state == State.NONE:
                self.log.warning(
                    "FIXME: Task instance %s state was set to None externally. This should not happen", ti
                )
                ti.set_state(TaskInstanceState.SCHEDULED, session=session)

            if self.rerun_failed_tasks:
                # Rerun failed tasks or upstreamed failed tasks
                if ti.state in (TaskInstanceState.FAILED, TaskInstanceState.UPSTREAM_FAILED):
                    self.log.error("Task instance %s with state %s", ti, ti.state)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    # Reset the failed task in backfill to scheduled state
                    ti.set_state(TaskInstanceState.SCHEDULED, session=session)
            else:
                # Default behaviour which works for subdag.
                if ti.state in (TaskInstanceState.FAILED, TaskInstanceState.UPSTREAM_FAILED):
                    self.log.error("Task instance %s with state %s", ti, ti.state)
                    ti_status.failed.add(key)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    return

            if self.ignore_first_depends_on_past:
                dagrun = ti.get_dagrun(session=session)
                ignore_depends_on_past = dagrun.execution_date == (start_date or ti.start_date)
            else:
                ignore_depends_on_past = False

            backfill_context = DepContext(
                deps=BACKFILL_QUEUED_DEPS,
                ignore_depends_on_past=ignore_depends_on_past,
                ignore_task_deps=self.ignore_task_deps,
                flag_upstream_failed=True,
            )

            # Is the task runnable? -- then run it
            # the dependency checker can change states of tis
            if ti.are_dependencies_met(
                dep_context=backfill_context, session=session, verbose=self.verbose
            ):
                if executor.has_task(ti):
                    self.log.debug("Task Instance %s already in executor waiting for queue to clear", ti)
                else:
                    self.log.debug('Sending %s to executor', ti)
                    # Skip scheduled state, we are executing immediately
                    ti.state = TaskInstanceState.QUEUED
                    ti.queued_by_job_id = self.id
                    ti.queued_dttm = timezone.utcnow()
                    session.merge(ti)

                    cfg_path = None
                    if self.executor_class in (
                        executor_constants.LOCAL_EXECUTOR,
                        executor_constants.SEQUENTIAL_EXECUTOR,
                    ):
                        cfg_path = tmp_configuration_copy()

                    executor.queue_task_instance(
                        ti,
                        mark_success=self.mark_success,
                        pickle_id=pickle_id,
                        ignore_task_deps=self.ignore_task_deps,
                        ignore_depends_on_past=ignore_depends_on_past,
                        pool=self.pool,
                        cfg_path=cfg_path,
                    )
                    ti_status.running[key] = ti
                    ti_status.to_run.pop(key)
                session.commit()
                return

            if ti.state == TaskInstanceState.UPSTREAM_FAILED:
                self.log.error("Task instance %s upstream failed", ti)
                ti_status.failed.add(key)
                ti_status.to_run.pop(key)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                return

            # special case
            if ti.state == TaskInstanceState.UP_FOR_RETRY:
                self.log.debug("Task instance %s retry period not expired yet", ti)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                ti_status.to_run[key] = ti
                return

            # special case
            if ti.state == TaskInstanceState.UP_FOR_RESCHEDULE:
                self.log.debug("Task instance %s reschedule period not expired yet", ti)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                ti_status.to_run[key] = ti
                return

            # all remaining tasks
            self.log.debug('Adding %s to not_ready', ti)
            ti_status.not_ready.add(key)

        try:
            for task in self.dag.topological_sort(include_subdag_tasks=True):
                for key, ti in list(ti_status.to_run.items()):
                    if task.task_id != ti.task_id:
                        continue

                    pool = session.query(models.Pool).filter(models.Pool.pool == task.pool).first()
                    if not pool:
                        raise PoolNotFound(f'Unknown pool: {task.pool}')

                    open_slots = pool.open_slots(session=session)
                    if open_slots <= 0:
                        raise NoAvailablePoolSlot(
                            f"Not scheduling since there are {open_slots} open slots in pool {task.pool}"
                        )

                    num_running_task_instances_in_dag = DAG.get_num_task_instances(
                        self.dag_id,
                        states=self.STATES_COUNT_AS_RUNNING,
                        session=session,
                    )

                    if num_running_task_instances_in_dag >= self.dag.max_active_tasks:
                        raise DagConcurrencyLimitReached(
                            "Not scheduling since DAG max_active_tasks limit is reached."
                        )

                    if task.max_active_tis_per_dag:
                        num_running_task_instances_in_task = DAG.get_num_task_instances(
                            dag_id=self.dag_id,
                            task_ids=[task.task_id],
                            states=self.STATES_COUNT_AS_RUNNING,
                            session=session,
                        )

                        if num_running_task_instances_in_task >= task.max_active_tis_per_dag:
                            raise TaskConcurrencyLimitReached(
                                "Not scheduling since Task concurrency limit is reached."
                            )

                    _per_task_process(key, ti, session)
                    session.commit()
        except (NoAvailablePoolSlot, DagConcurrencyLimitReached, TaskConcurrencyLimitReached) as e:
            self.log.debug(e)

        self.heartbeat(only_if_necessary=is_unit_test)
        # execute the tasks in the queue
        executor.heartbeat()

        # If the set of tasks that aren't ready ever equals the set of
        # tasks to run and there are no running tasks then the backfill
        # is deadlocked
        if (
            ti_status.not_ready
            and ti_status.not_ready == set(ti_status.to_run)
            and len(ti_status.running) == 0
        ):
            self.log.warning("Deadlock discovered for ti_status.to_run=%s", ti_status.to_run.values())
            ti_status.deadlocked.update(ti_status.to_run.values())
            ti_status.to_run.clear()

        # check executor state -- and expand any mapped TIs
        for node, run_id, mapped_tis in self._manage_executor_state(ti_status.running, session):

            def to_keep(key: TaskInstanceKey) -> bool:
                if key.dag_id != node.dag_id or key.task_id != node.task_id or key.run_id != run_id:
                    # For another Dag/Task/Run -- don't remove
                    return True
                return False

            # remove the old unmapped TIs for node -- they have been replaced with the mapped TIs
            ti_status.to_run = {key: ti for (key, ti) in ti_status.to_run.items() if to_keep(key)}

            ti_status.to_run.update({ti.key: ti for ti in mapped_tis})

        # update the task counters
        self._update_counters(ti_status=ti_status, session=session)
        session.commit()

        # update dag run state
        _dag_runs = ti_status.active_runs[:]
        for run in _dag_runs:
            run.update_state(session=session)
            if run.state in State.finished:
                ti_status.finished_runs += 1
                ti_status.active_runs.remove(run)
                executed_run_dates.append(run.execution_date)

        self._log_progress(ti_status)
        session.commit()

    # return updated status
    return executed_run_dates
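
# A minimal, standalone sketch (not part of the Airflow codebase) of the mapped-task-instance
# bookkeeping performed after `_manage_executor_state` above: once the executor reports that a
# node was expanded at runtime, the stale unmapped entry for that (dag_id, task_id, run_id) is
# dropped from `to_run` and replaced by the freshly created mapped task instances. `_FakeKey`
# and `_FakeTI` are hypothetical stand-ins for TaskInstanceKey/TaskInstance, used only to keep
# the example self-contained; they are not Airflow classes.
from typing import Dict, List, NamedTuple


class _FakeKey(NamedTuple):
    dag_id: str
    task_id: str
    run_id: str
    map_index: int = -1


class _FakeTI(NamedTuple):
    key: _FakeKey


def _replace_unmapped_with_mapped(
    to_run: Dict[_FakeKey, _FakeTI],
    node_dag_id: str,
    node_task_id: str,
    run_id: str,
    mapped_tis: List[_FakeTI],
) -> Dict[_FakeKey, _FakeTI]:
    """Drop the unmapped placeholder for the expanded node and insert its mapped TIs.

    >>> unmapped = _FakeKey("example_dag", "mapped_task", "backfill__2021-01-01")
    >>> mapped = [_FakeTI(unmapped._replace(map_index=i)) for i in range(2)]
    >>> result = _replace_unmapped_with_mapped(
    ...     {unmapped: _FakeTI(unmapped)}, "example_dag", "mapped_task", "backfill__2021-01-01", mapped
    ... )
    >>> sorted(key.map_index for key in result)
    [0, 1]
    """
    kept = {
        key: ti
        for key, ti in to_run.items()
        if (key.dag_id, key.task_id, key.run_id) != (node_dag_id, node_task_id, run_id)
    }
    kept.update({ti.key: ti for ti in mapped_tis})
    return kept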