Exemplo n.º 1
0
 def test_update_task_state(self):
     """Check that a changed ``task_state`` value is seen again after a reload."""
     self._create_task_state("1", "2", self.execution_date)
     lookup_key = (self.dag_id, self.task_id, self.execution_date)

     # Fetch the record, change its payload and call update_task_state().
     stored = TaskState.get_task_state(*lookup_key)
     stored.task_state = 100
     stored.update_task_state()

     # A fresh lookup must return the value that was just set.
     reloaded = TaskState.get_task_state(*lookup_key)
     assert reloaded.task_state == 100
Exemplo n.º 2
0
    def verify_integrity(self, session=None):
        """
        Verifies the DagRun by checking for removed tasks or tasks that are not in the
        database yet. It will set state to removed or add the task if required.

        For every existing task instance whose task can no longer be fetched
        from the DAG, the instance is marked ``State.REMOVED`` unless it is
        already removed, the run is in the RUNNING state, or the DAG is partial.
        A removed instance whose task is present in the DAG again is reset to
        ``State.NONE``. For every DAG task without a task instance, a
        ``TaskInstance`` and a matching ``TaskState`` are added to the session.

        :param session: database session used to add rows and to commit. It is
            used unconditionally, so it must not be ``None`` when the body runs
            (presumably injected by a decorator that is not visible here).
        :return: None
        """
        from airflow.models.taskinstance import TaskInstance  # Avoid circular import

        dag = self.get_dag()
        tis = self.get_task_instances(session=session)

        # check for removed or restored tasks
        # Only membership is tested on task_ids below, so a set is sufficient.
        task_ids = set()
        for ti in tis:
            task_ids.add(ti.task_id)
            task = None
            try:
                task = dag.get_task(ti.task_id)
            except AirflowException:
                if ti.state == State.REMOVED:
                    pass  # ti has already been removed, just ignore it
                # Compare state values with ``!=``: an identity test (``is not``)
                # depends on both sides being the very same object rather than
                # equal values.
                elif self.state != State.RUNNING and not dag.partial:
                    self.log.warning("Failed to get task '{}' for dag '{}'. "
                                     "Marking it as removed.".format(ti, dag))
                    Stats.incr(
                        "task_removed_from_dag.{}".format(dag.dag_id), 1, 1)
                    ti.state = State.REMOVED

            is_task_in_dag = task is not None
            should_restore_task = is_task_in_dag and ti.state == State.REMOVED
            if should_restore_task:
                self.log.info("Restoring task '{}' which was previously "
                              "removed from DAG '{}'".format(ti, dag))
                Stats.incr("task_restored_to_dag.{}".format(dag.dag_id), 1, 1)
                ti.state = State.NONE

        # check for missing tasks
        for task in six.itervalues(dag.task_dict):
            # Tasks starting after this run's execution date are skipped,
            # except for backfill runs.
            if task.start_date > self.execution_date and not self.is_backfill:
                continue

            if task.task_id not in task_ids:
                Stats.incr(
                    "task_instance_created-{}".format(task.__class__.__name__),
                    1, 1)
                # add TaskState to db
                ti = TaskInstance(task, self.execution_date)
                ts = TaskState(ti)
                # Ask the task for its handler once and reuse the result.
                event_handler = task.event_met_handler()
                if event_handler is not None:
                    ts.event_handler = event_handler
                session.add(ti)
                session.add(ts)

        session.commit()
    def _process_finished_ti(session, ti):
        """
        Handle a TaskInstance that has already finished.

        Nothing happens unless the event-scheduling dependencies are met and the
        TaskState of the instance carries an action other than ``None`` /
        ``TaskAction.NONE``. A ``RESTART`` action moves the instance back to
        ``State.SCHEDULED``; in every handled case the action is cleared
        afterwards and the changes are committed.

        :param session: database session used for the lookup, merge and commit.
        :param ti: the finished TaskInstance to examine.
        :return: None
        """
        from airflow.ti_deps.deps.runnable_exec_date_dep import RunnableExecDateDep
        from airflow.ti_deps.deps.valid_state_dep import ValidStateDep
        from airflow.ti_deps.deps.events_dep import EventTIDep

        finished_deps = {
            RunnableExecDateDep(),
            ValidStateDep(FINISHED_STATES),
            EventTIDep(),
        }
        context = DepContext(deps=finished_deps)
        if not ti.are_dependencies_met(dep_context=context, session=session):
            return

        ts = TaskState.query_task_state(ti, session=session)
        if ts.action is None:
            return
        pending = TaskAction(ts.action)
        if pending == TaskAction.NONE:
            return

        if pending == TaskAction.RESTART:
            log.debug('Queuing Finished task: %s', ti)
            ti.state = State.SCHEDULED
            log.info("Creating / updating %s in ORM", ti)
            session.merge(ti)

        # The action has been handled (or needs no handling): clear and persist.
        ts.action = None
        session.merge(ts)
        session.commit()
Exemplo n.º 4
0
 def _create_task_state(dag_id, task_id, execution_date):
     """Merge a TaskState built from the given key into a new session and commit."""
     key = dict(dag_id=dag_id,
                task_id=task_id,
                execution_date=execution_date)
     with create_session() as db:
         db.merge(TaskState(**key))
         db.commit()
Exemplo n.º 5
0
 def test_get_task_state(self):
     """The fetched TaskState reports the dag_id and task_id it was looked up by."""
     fetched = TaskState.get_task_state(self.dag_id, self.task_id,
                                        self.execution_date)
     assert fetched.dag_id == self.dag_id
     assert fetched.task_id == self.task_id
Exemplo n.º 6
0
 def _get_dep_statuses(self, ti, session, dep_context=None):
     """
     Yield one dependency status derived from the TaskState of ``ti``.

     Passes when the TaskState has no event handler. Otherwise it fails while
     the stored action is ``None`` or ``TaskAction.NONE`` and passes for any
     other action.

     :param ti: task instance whose TaskState is queried.
     :param session: database session handed to the TaskState query.
     :param dep_context: not used by this implementation.
     """
     from airflow.models.taskstate import TaskState, TaskAction
     record: TaskState = TaskState.query_task_state(ti, session)

     # No handler registered: there is nothing to wait for.
     if record.event_handler is None:
         yield self._passing_status(reason="handler is NULL")
         return

     no_action = record.action is None or record.action == TaskAction.NONE
     if not no_action:
         yield self._passing_status(
             reason="{0} handler pass status".format(ti))
     else:
         yield self._failing_status("{0} action is None".format(ti))
    def test_execute_event_handler(self):
        """Handling the same event twice yields START then STOP and flips the stored state."""
        event = BaseEvent("test_event", "test_event", namespace="default")

        self.create_task_state(dag_run=self._dag_run,
                               task_id='operator_toggle_handler')

        event_executor = DagRunEventExecutor(self._serialized_dag)

        # (expected scheduling action, expected stored task_state) per round.
        rounds = (
            (SchedulingAction.START, True),
            (SchedulingAction.STOP, False),
        )
        for expected_action, expected_state in rounds:
            actions = event_executor.execute_event_handler(self._dag_run,
                                                           event=event)
            assert actions['operator_toggle_handler'] == expected_action

            persisted = TaskState.get_task_state(
                dag_id="test_event_handler",
                task_id="operator_toggle_handler",
                executor_date=self._dag_run.execution_date)
            assert persisted.task_state is expected_state
Exemplo n.º 8
0
 def _operator_handle_event(event, operator,
                            execution_date) -> SchedulingAction:
     """
     Run the operator's event handler for ``event`` and return its scheduling action.

     When a TaskState exists for the operator and execution date, its current
     ``task_state`` is passed to the handler and replaced with the state the
     handler returns. Without a TaskState the handler is called with ``None``
     and the state it returns is discarded.
     """
     stored = TaskState.get_task_state(operator.dag_id,
                                       operator.task_id, execution_date)
     handler = operator.get_events_handler()

     if not stored:
         # No record to update: only the action is of interest.
         action, _ = handler.handle_event(event, None)
         return action

     action, new_state = handler.handle_event(event, stored.task_state)
     stored.task_state = new_state
     stored.update_task_state()
     return action
    def handle_event(self, event: Event, ti: TaskInstance, ts: TaskState, session=None) -> TaskAction:
        """
        Record ``event`` in the task's state and decide which action to take.

        The event is stored in a deep copy of ``ts.task_state`` (created first
        if missing), the copy is evaluated with ``self.met_sc`` and then written
        back to ``ts``, merged and committed in every case.

        :param event: incoming event, keyed by ``(event.key, event.event_type)``.
        :param ti: task instance the event belongs to; not used in this method.
        :param ts: TaskState whose ``task_state`` is updated and persisted.
        :param session: database session used for merge and commit.
        :return: ``TaskAction.NONE`` when ``met_sc`` reports false; otherwise
            ``TaskAction.START`` if ``self.configs`` is empty, else the action
            set on the wrapper.
        """
        if ts.task_state is None:
            ts.task_state = AiFlowTs()

        snapshot = copy.deepcopy(ts.task_state)
        snapshot.event_map[(event.key, event.event_type)] = event

        wrapper = ActionWrapper()
        met = self.met_sc(snapshot, wrapper)
        if met and wrapper.action in SCHEDULED_ACTION:
            snapshot.schedule_time = time.time_ns()

        # The updated state is persisted whether or not the condition was met.
        ts.task_state = snapshot
        session.merge(ts)
        session.commit()

        if not met:
            return TaskAction.NONE
        return TaskAction.START if len(self.configs) == 0 else wrapper.action
 def _operator_handle_event(event, operator,
                            execution_date) -> SchedulingAction:
     """
     Run the operator's event handler for ``event`` and store the resulting state.

     A TaskState is looked up for the operator and execution date; if none is
     found, a new one is added and committed first. The handler receives the
     current ``task_state``, and the state it returns is written back through
     ``update_task_state()``.

     :return: the scheduling action produced by the handler.
     """
     stored = TaskState.get_task_state(operator.dag_id,
                                       operator.task_id, execution_date)
     handler = operator.get_events_handler()

     if not stored:
         # First event for this task: create the record before handling it.
         stored = TaskState(task_id=operator.task_id,
                            dag_id=operator.dag_id,
                            execution_date=execution_date)
         with create_session() as db:
             db.add(stored)
             db.commit()

     action, new_state = handler.handle_event(event, stored.task_state)
     stored.task_state = new_state
     stored.update_task_state()
     return action
    def _set_task_instance_state(self,
                                 dag_run,
                                 dag_id,
                                 task_id,
                                 execution_date,
                                 state,
                                 try_number,
                                 session=None):
        """
        Set the task state to db and maybe set the dagrun object finished to db.

        For a reported FAILED / SUCCESS / SHUTDOWN state:

        * if the instance is still QUEUED for the same try number, it is treated
          as killed externally: failure handling is attempted and the instance
          is set to FAILED;
        * otherwise the pending action on the TaskState is consumed: RESTART
          puts the instance back to SCHEDULED, STOP only clears the action.

        Unless a restart happened, an instance that is RUNNING takes the
        reported state. Finally, if the dag run is not finished and is still
        known to the dag run route, ``dag_run_update_state`` is run through
        ``run_process_func``.

        :param dag_run: DagRun object
        :param dag_id: Dag identify
        :param task_id: task identify
        :param execution_date: the dag run execution date
        :param state: the task state should be set.
        :param try_number: the task try_number.
        :param session: database session; used unconditionally, so it must not
            be ``None`` when the body runs (presumably injected by a decorator
            that is not visible here).
        :return: None
        """
        TI = models.TaskInstance
        qry = session.query(TI).filter(TI.dag_id == dag_id,
                                       TI.task_id == task_id,
                                       TI.execution_date == execution_date)
        ti = qry.first()
        if not ti:
            # ``ti`` is None in this branch, so report the key that was looked
            # up rather than the empty query result.
            self.log.warning(
                "TaskInstance dag_id %s task_id %s execution_date %s went "
                "missing from the database", dag_id, task_id, execution_date)
            return
        ts = TaskState.query_task_state(ti, session)
        self.log.debug(
            "set task state dag_id {0} task_id {1} execution_date {2} try_number {3} "
            "current try_number {4} state {5} ack_id {6} action {7}.".format(
                dag_id, task_id, execution_date, try_number, ti.try_number,
                state, ts.ack_id, ts.action))
        is_restart = False
        if state in (State.FAILED, State.SUCCESS, State.SHUTDOWN):
            if ti.try_number == try_number and ti.state == State.QUEUED:
                # The executor says the task finished, but the database still
                # has it queued for the same try.
                msg = ("Executor reports task instance {} finished ({}) "
                       "although the task says its {}. Was the task "
                       "killed externally?".format(ti, state, ti.state))
                Stats.incr('scheduler.tasks.killed_externally')
                self.log.error(msg)
                try:
                    # NOTE(review): this uses ``self.task_route`` while the rest
                    # of the method uses ``self.dagrun_route`` - confirm intended.
                    dag = self.task_route.find_dagrun(dag_id, execution_date)
                    ti.task = dag.get_task(task_id)
                    ti.handle_failure(msg)
                except Exception:
                    # Best effort: fall through and mark the task FAILED anyway.
                    self.log.error(
                        "Cannot load the dag bag to handle failure for %s"
                        ". Setting task to FAILED without callbacks or "
                        "retries. Do you have enough resources?", ti)
                ti.state = State.FAILED
                session.merge(ti)
            else:
                if ts.action is None:
                    self.log.debug(
                        "task dag_id {0} task_id {1} execution_date {2} action is None."
                        .format(dag_id, task_id, execution_date))
                elif TaskAction(ts.action) == TaskAction.RESTART:
                    # Reschedule the task and reset the action bookkeeping.
                    ti.state = State.SCHEDULED
                    ts.action = None
                    ts.stop_flag = None
                    ts.ack_id = 0
                    session.merge(ti)
                    session.merge(ts)
                    self.log.debug(
                        "task dag_id {0} task_id {1} execution_date {2} try_number {3} restart action."
                        .format(dag_id, task_id, execution_date,
                                str(try_number)))
                    is_restart = True
                elif TaskAction(ts.action) == TaskAction.STOP:
                    # The stop has taken effect: only reset the bookkeeping.
                    ts.action = None
                    ts.stop_flag = None
                    ts.ack_id = 0
                    session.merge(ts)
                    self.log.debug(
                        "task dag_id {0} task_id {1} execution_date {2} try_number {3} stop action."
                        .format(dag_id, task_id, execution_date,
                                str(try_number)))
                else:
                    self.log.debug(
                        "task dag_id {0} task_id {1} execution_date {2} action {3}."
                        .format(dag_id, task_id, execution_date, ts.action))
            session.commit()

        # A restarted task keeps its SCHEDULED state; otherwise a RUNNING task
        # takes over the state reported by the executor.
        if not is_restart and ti.state == State.RUNNING:
            self.log.debug(
                "set task dag_id {0} task_id {1} execution_date {2} state {3}".
                format(dag_id, task_id, execution_date, state))
            ti.state = state
            session.merge(ti)
        session.commit()
        # update dagrun state
        sync_dag_run = session.query(DagRun).filter(
            DagRun.id == dag_run.id).first()
        if sync_dag_run.state not in FINISHED_STATES:
            if self.dagrun_route.find_dagrun_by_id(sync_dag_run.id) is None:
                self.log.error(
                    "DagRun lost dag_id {0} task_id {1} execution_date {2}".
                    format(dag_id, task_id, execution_date))
            else:
                run_process_func(target=dag_run_update_state,
                                 args=(
                                     dag_run,
                                     self.dagrun_route.find_simple_dag(
                                         dag_run.id),
                                 ))
    def _get_dag_runs(self, event, session):
        """
        Work out which dag runs are affected by ``event`` and process their tasks.

        Events whose type is a known ``EventType`` other than ``UNDEFINED`` are
        handled by type:

        * ``DAG_RUN_EXECUTABLE``: load the DagRun whose id is ``event.key``,
          register it with the dag run route and select it.
        * ``TASK_STATUS_CHANGED``: apply the reported task state; if the dag run
          is finished afterwards, remove it from the route and send a
          ``DagRunFinishedEvent``, otherwise select it.
        * ``DAG_RUN_FINISHED``: only logged.
        * ``STOP_SCHEDULER_CMD``: stop the loop in unit-test mode and return.

        Any other event is matched against the routed dag runs; for each task
        instance that depends on the event, the TaskState's event handler is
        invoked and the resulting action is stored.

        Every selected dag run is finally passed to ``process_tasks`` through
        ``run_process_func``.

        :param event: the event to dispatch.
        :param session: database session used for queries and commits.
        :return: list of the dag runs selected for this event (possibly empty).
        """
        dag_runs = []
        if EventType.is_in(event.event_type) and EventType(
                event.event_type) != EventType.UNDEFINED:
            if EventType(event.event_type) == EventType.DAG_RUN_EXECUTABLE:
                # The event key carries the DagRun id.
                dag_run_id = int(event.key)
                dag_run = session.query(DagRun).filter(
                    DagRun.id == dag_run_id).first()
                if dag_run is None:
                    self.log.error("DagRun is None id {0}".format(dag_run_id))
                    return dag_runs
                simple_dag = event.simple_dag
                dag_run.pickle_id = None
                # create route
                self.dagrun_route.add_dagrun(dag_run, simple_dag, session)
                dag_runs.append(dag_run)

            elif EventType(event.event_type) == EventType.TASK_STATUS_CHANGED:
                # Key and value of the event encode the task instance and its
                # new state / try number.
                dag_id, task_id, execution_date = TaskInstanceHelper.from_task_key(
                    event.key)
                state, try_num = TaskInstanceHelper.from_event_value(
                    event.value)
                dag_run = self.dagrun_route.find_dagrun(dag_id, execution_date)
                if dag_run is None:
                    return dag_runs
                # NOTE(review): ``session`` is not passed on here - presumably
                # the callee obtains its own session; confirm.
                self._set_task_instance_state(dag_run, dag_id, task_id,
                                              execution_date, state, try_num)

                # Re-read the dag run to see the state after the task update.
                sync_dag_run = session.query(DagRun).filter(
                    DagRun.id == dag_run.id).first()
                if sync_dag_run.state in State.finished():
                    self.log.info(
                        "DagRun finished dag_id {0} execution_date {1} state {2}"
                        .format(dag_run.dag_id, dag_run.execution_date,
                                sync_dag_run.state))
                    # Drop the route and announce completion only if the run is
                    # still routed.
                    if self.dagrun_route.find_dagrun_by_id(
                            sync_dag_run.id) is not None:
                        self.dagrun_route.remove_dagrun(dag_run, session)
                        self.log.debug("Route remove dag run {0}".format(
                            sync_dag_run.id))
                        self.mail_box.send_message(
                            DagRunFinishedEvent(dag_run.id,
                                                sync_dag_run.state))
                else:
                    dag_runs.append(dag_run)

            elif EventType(event.event_type) == EventType.DAG_RUN_FINISHED:
                self.log.debug("DagRun {0} finished".format(event.key))
            elif EventType(event.event_type) == EventType.STOP_SCHEDULER_CMD:
                if self.unit_test_mode:
                    self.running = False
                return dag_runs
        else:
            # Not one of the handled EventType values: find the dag runs
            # registered for this event key / type.
            runs = self.dagrun_route.find_dagruns_by_event(
                event_key=event.key, event_type=event.event_type)
            if runs is not None:
                for run in runs:
                    task_deps = load_task_dependencies(dag_id=run.dag_id,
                                                       session=session)
                    tis = run.get_task_instances(session=session)
                    for ti in tis:
                        if ti.task_id not in task_deps:
                            continue
                        # Only tasks that depend on this (key, type) pair react.
                        if (event.key,
                                event.event_type) in task_deps[ti.task_id]:
                            self.log.debug("{0} handle event {1}".format(
                                ti.task_id, event))
                            ts = TaskState.query_task_state(ti,
                                                            session=session)
                            handler = ts.event_handler
                            if handler is not None:
                                action = handler.handle_event(event,
                                                              ti=ti,
                                                              ts=ts,
                                                              session=session)
                                # Store the handler's decision on the TaskState.
                                ts.action = action
                                session.merge(ts)
                                session.commit()
                                self.log.debug(
                                    "set task action {0} {1}".format(
                                        ti.task_id, action))
                dag_runs.extend(runs)
                session.commit()

        # Hand every selected dag run to process_tasks via run_process_func.
        for dag_run in dag_runs:
            run_process_func(target=process_tasks,
                             args=(
                                 dag_run,
                                 self.dagrun_route.find_simple_dag(dag_run.id),
                                 self.log,
                             ))
        return dag_runs
 def create_task_state(dag_run: DagRun, task_id: str):
     """Build a TaskState for ``task_id`` in ``dag_run`` and call ``update_task_state()`` on it."""
     new_state = TaskState(dag_id=dag_run.dag_id,
                           task_id=task_id,
                           execution_date=dag_run.execution_date)
     new_state.update_task_state()