示例#1
0
    def _find_schedulable_tasks(
            self,
            dag_run: DagRun,
            session: Session,
            check_execution_date=False) -> Optional[List[TI]]:
        """
        Make scheduling decisions about an individual dag run

        ``currently_active_runs`` is passed in so that a batch query can be
        used to ask this for all dag runs in the batch, to avoid an n+1 query.

        :param dag_run: The DagRun to schedule
        :return: scheduled tasks
        """
        if not dag_run or dag_run.get_state() in State.finished:
            return
        try:
            dag = dag_run.dag = self.dagbag.get_dag(dag_run.dag_id,
                                                    session=session)
        except SerializedDagNotFound:
            self.log.exception("DAG '%s' not found in serialized_dag table",
                               dag_run.dag_id)
            return None

        if not dag:
            self.log.error("Couldn't find dag %s in DagBag/DB!",
                           dag_run.dag_id)
            return None

        currently_active_runs = session.query(TI.execution_date, ).filter(
            TI.dag_id == dag_run.dag_id,
            TI.state.notin_(list(State.finished)),
        ).all()

        if check_execution_date and dag_run.execution_date > timezone.utcnow(
        ) and not dag.allow_future_exec_dates:
            self.log.warning("Execution date is in future: %s",
                             dag_run.execution_date)
            return None

        if dag.max_active_runs:
            if (len(currently_active_runs) >= dag.max_active_runs
                    and dag_run.execution_date not in currently_active_runs):
                self.log.info(
                    "DAG %s already has %d active runs, not queuing any tasks for run %s",
                    dag.dag_id,
                    len(currently_active_runs),
                    dag_run.execution_date,
                )
                return None

        self._verify_integrity_if_dag_changed(dag_run=dag_run, session=session)

        schedulable_tis, callback_to_run = dag_run.update_state(
            session=session, execute_callbacks=False)
        dag_run.schedule_tis(schedulable_tis, session)

        query = (session.query(TI).outerjoin(TI.dag_run).filter(
            or_(DR.run_id.is_(None),
                DR.run_type != DagRunType.BACKFILL_JOB)).join(
                    TI.dag_model).filter(not_(DM.is_paused)).filter(
                        TI.state == State.SCHEDULED).options(
                            selectinload('dag_model')))
        scheduled_tis: List[TI] = with_row_locks(
            query,
            of=TI,
            **skip_locked(session=session),
        ).all()
        # filter need event tasks
        serialized_dag = session.query(SerializedDagModel).filter(
            SerializedDagModel.dag_id == dag_run.dag_id).first()
        dep: DagEventDependencies = DagEventDependencies.from_json(
            serialized_dag.event_relationships)
        event_task_set = dep.find_event_dependencies_tasks()
        final_scheduled_tis = []
        for ti in scheduled_tis:
            if ti.task_id not in event_task_set:
                final_scheduled_tis.append(ti)

        return final_scheduled_tis
示例#2
0
    def _schedule_dag_run(
        self,
        dag_run: DagRun,
        session: Session,
    ) -> Optional[DagCallbackRequest]:
        """
        Make scheduling decisions about an individual dag run

        :param dag_run: The DagRun to schedule
        :return: Callback that needs to be executed
        """
        dag = dag_run.dag = self.dagbag.get_dag(dag_run.dag_id, session=session)

        if not dag:
            self.log.error("Couldn't find dag %s in DagBag/DB!", dag_run.dag_id)
            return 0

        if (
            dag_run.start_date
            and dag.dagrun_timeout
            and dag_run.start_date < timezone.utcnow() - dag.dagrun_timeout
        ):
            dag_run.set_state(State.FAILED)
            unfinished_task_instances = (
                session.query(TI)
                .filter(TI.dag_id == dag_run.dag_id)
                .filter(TI.run_id == dag_run.run_id)
                .filter(TI.state.in_(State.unfinished))
            )
            for task_instance in unfinished_task_instances:
                task_instance.state = State.SKIPPED
                session.merge(task_instance)
            session.flush()
            self.log.info("Run %s of %s has timed-out", dag_run.run_id, dag_run.dag_id)

            callback_to_execute = DagCallbackRequest(
                full_filepath=dag.fileloc,
                dag_id=dag.dag_id,
                run_id=dag_run.run_id,
                is_failure_callback=True,
                msg='timed_out',
            )

            # Send SLA & DAG Success/Failure Callbacks to be executed
            self._send_dag_callbacks_to_processor(dag_run, callback_to_execute)

            return 0

        if dag_run.execution_date > timezone.utcnow() and not dag.allow_future_exec_dates:
            self.log.error("Execution date is in future: %s", dag_run.execution_date)
            return 0

        self._verify_integrity_if_dag_changed(dag_run=dag_run, session=session)
        # TODO[HA]: Rename update_state -> schedule_dag_run, ?? something else?
        schedulable_tis, callback_to_run = dag_run.update_state(session=session, execute_callbacks=False)

        # This will do one query per dag run. We "could" build up a complex
        # query to update all the TIs across all the execution dates and dag
        # IDs in a single query, but it turns out that can be _very very slow_
        # see #11147/commit ee90807ac for more details
        dag_run.schedule_tis(schedulable_tis, session)

        return callback_to_run