    def _get_dep_statuses(self, ti, session, dep_context):
        from dbnd_airflow.scheduler.single_dag_run_job import SingleDagRunJob

        if not SingleDagRunJob.has_instance():
            # if we are in the Scheduler or the Web Server
            # there is no current SingleDagRunJob,
            # so fall back to the standard, non-optimized implementation
            for d in super(TriggerRuleDepOptimizied, self)._get_dep_statuses(
                ti, session, dep_context
            ):
                yield d
            return

        from airflow.utils.state import State
        from airflow.utils.trigger_rule import TriggerRule

        TR = TriggerRule

        # Check the upstream dependencies; a task with no upstream tasks passes trivially
        if not ti.task.upstream_list:
            yield self._passing_status(
                reason="The task instance did not have any upstream tasks."
            )
            return

        if ti.task.trigger_rule == TR.DUMMY:
            yield self._passing_status(reason="The task had a dummy trigger rule set.")
            return

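        # The optimization: take aggregated upstream state counts from the job's
        # ti_state_manager instead of the DB query the stock TriggerRuleDep runs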
        status = (
            SingleDagRunJob.instance().ti_state_manager.get_aggregated_state_status(
                dag_id=ti.dag_id,
                execution_date=ti.execution_date,
                task_ids=ti.task.upstream_task_ids,
            )
        )

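        # per-state counts of the upstream task instances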
        successes = status[State.SUCCESS]
        skipped = status[State.SKIPPED]
        failed = status[State.FAILED]
        upstream_failed = status[State.UPSTREAM_FAILED]

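        # hand the precomputed counts to Airflow's standard trigger-rule evaluation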
        for dep_status in self._evaluate_trigger_rule(
            ti=ti,
            successes=successes,
            skipped=skipped,
            failed=failed,
            upstream_failed=upstream_failed,
            done=successes + skipped + failed + upstream_failed,
            flag_upstream_failed=dep_context.flag_upstream_failed,
            session=session,
        ):
            yield dep_status
Example #2
    def run_airflow_dag(self, dag, session=None):
        # type: (DAG, Session) -> None
        af_dag = dag
        databand_run = self.run
        databand_context = databand_run.context
        execution_date = databand_run.execution_date
        s = databand_context.settings  # type: DatabandSettings
        s_run = s.run  # type: RunConfig

        run_id = s_run.id
        if not run_id:
            # we need this backfill-style run_id, otherwise the Airflow scheduler
            # will try to manage our local jobs (zombie cleanup and so on)
            run_id = "backfill_{0}_{1}".format(
                databand_run.name, databand_run.execution_date.isoformat())

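        # optionally remove the "ping_connection" listener Airflow registers on
        # engine_connect, avoiding an extra DB round-trip per connection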
        if self.airflow_config.disable_db_ping_on_connect:
            from airflow import settings as airflow_settings

            try:
                remove_listener_by_name(airflow_settings.engine,
                                        "engine_connect", "ping_connection")
            except Exception as ex:
                logger.warning("Failed to optimize DB access: %s" % ex)

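        # no heartbeat is needed when tasks run in-process; otherwise use
        # Airflow's configured job heartbeat interval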
        if isinstance(self.airflow_task_executor, InProcessExecutor):
            heartrate = 0
        else:
            # we are in parallel mode
            heartrate = airflow_conf.getfloat("scheduler", "JOB_HEARTBEAT_SEC")

        # "Amount of time in seconds to wait when the limit "
        # "on maximum active dag runs (max_active_runs) has "
        # "been reached before trying to execute a dag run "
        # "again.
        delay_on_limit = 1.0

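        # persist the pickled DAG and register the DAG in the Airflow metadata DB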
        self._pickle_dag_and_save_pickle_id_for_versioned(af_dag,
                                                          session=session)
        af_dag.sync_to_db(session=session)

        # create the relevant TaskInstances so SingleDagRunJob can run them
        create_dagrun_from_dbnd_run(
            databand_run=databand_run,
            dag=af_dag,
            run_id=run_id,
            execution_date=execution_date,
            session=session,
            state=State.RUNNING,
            external_trigger=False,
        )

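        # propagate the run's fail-fast setting to the executor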
        self.airflow_task_executor.fail_fast = s_run.fail_fast
        # we don't want to be stopped by Airflow's dag concurrency limits
        airflow_conf.set("core", "dag_concurrency", str(10000))
        airflow_conf.set("core", "max_active_runs_per_dag", str(10000))

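        # SingleDagRunJob will execute all task instances of the DagRun created above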
        job = SingleDagRunJob(
            dag=af_dag,
            execution_date=databand_run.execution_date,
            mark_success=s_run.mark_success,
            executor=self.airflow_task_executor,
            donot_pickle=(s_run.donot_pickle
                          or airflow_conf.getboolean("core", "donot_pickle")),
            ignore_first_depends_on_past=s_run.ignore_first_depends_on_past,
            ignore_task_deps=s_run.ignore_dependencies,
            fail_fast=s_run.fail_fast,
            pool=s_run.pool,
            delay_on_limit_secs=delay_on_limit,
            verbose=s.system.verbose,
            heartrate=heartrate,
            airflow_config=self.airflow_config,
        )

        # we need the SingleDagRunJob to be available from "internal" functions
        # because they use its ti_state_manager
        from dbnd._core.current import is_verbose

        with SingleDagRunJob.new_context(_context=job,
                                         allow_override=True,
                                         verbose=is_verbose()):
            job.run()