def ensure_plugins_loaded(): """ Load plugins from plugins directory and entrypoints. Plugins are only loaded if they have not been previously loaded. """ from airflow.stats import Stats global plugins, registered_hooks if plugins is not None: log.debug("Plugins are already loaded. Skipping.") return if not settings.PLUGINS_FOLDER: raise ValueError("Plugins folder is not set") log.debug("Loading plugins") with Stats.timer() as timer: plugins = [] registered_hooks = [] load_plugins_from_plugin_directory() load_entrypoint_plugins() # We don't do anything with these for now, but we want to keep track of # them so we can integrate them in to the UI's Connection screens for plugin in plugins: registered_hooks.extend(plugin.hooks) num_loaded = len(plugins) if num_loaded > 0: log.debug("Loading %d plugin(s) took %.2f seconds", num_loaded, timer.duration)
def collect_dags_from_db(self): """Collects DAGs from database.""" from airflow.models.serialized_dag import SerializedDagModel with Stats.timer('collect_db_dags'): self.log.info("Filling up the DagBag from database") # The dagbag contains all rows in serialized_dag table. Deleted DAGs are deleted # from the table by the scheduler job. self.dags = SerializedDagModel.read_all_dags() # Adds subdags. # DAG post-processing steps such as self.bag_dag and croniter are not needed as # they are done by scheduler before serialization. subdags = {} for dag in self.dags.values(): for subdag in dag.subdags: subdags[subdag.dag_id] = subdag self.dags.update(subdags)
def _load_sensor_works(self, session=None): """ Refresh sensor instances need to be handled by this operator. Create smart sensor internal object based on the information persisted in the sensor_instance table. """ SI = SensorInstance with Stats.timer() as timer: query = (session.query(SI).filter( SI.state == State.SENSING).filter( SI.shardcode < self.shard_max, SI.shardcode >= self.shard_min)) tis = query.all() self.log.info("Performance query %s tis, time: %.3f", len(tis), timer.duration) # Query without checking dagrun state might keep some failed dag_run tasks alive. # Join with DagRun table will be very slow based on the number of sensor tasks we # need to handle. We query all smart tasks in this operator # and expect scheduler correct the states in _change_state_for_tis_without_dagrun() sensor_works = [] for ti in tis: try: sensor_works.append(SensorWork(ti)) except Exception as e: # pylint: disable=broad-except self.log.exception( "Exception at creating sensor work for ti %s", ti.key) self.log.exception(e, exc_info=True) self.log.info("%d tasks detected.", len(sensor_works)) new_sensor_works = [ x for x in sensor_works if x not in self.sensor_works ] self._update_ti_hostname(new_sensor_works) self.sensor_works = sensor_works
def _run_file_processor( result_channel: MultiprocessingConnection, parent_channel: MultiprocessingConnection, file_path: str, pickle_dags: bool, dag_ids: Optional[List[str]], thread_name: str, callback_requests: List[CallbackRequest], ) -> None: """ Process the given file. :param result_channel: the connection to use for passing back the result :type result_channel: multiprocessing.Connection :param parent_channel: the parent end of the channel to close in the child :type parent_channel: multiprocessing.Connection :param file_path: the file to process :type file_path: str :param pickle_dags: whether to pickle the DAGs found in the file and save them to the DB :type pickle_dags: bool :param dag_ids: if specified, only examine DAG ID's that are in this list :type dag_ids: list[str] :param thread_name: the name to use for the process that is launched :type thread_name: str :param callback_requests: failure callback to execute :type callback_requests: List[airflow.utils.callback_requests.CallbackRequest] :return: the process that was launched :rtype: multiprocessing.Process """ # This helper runs in the newly created process log: logging.Logger = logging.getLogger("airflow.processor") # Since we share all open FDs from the parent, we need to close the parent side of the pipe here in # the child, else it won't get closed properly until we exit. log.info("Closing parent pipe") parent_channel.close() del parent_channel set_context(log, file_path) setproctitle(f"airflow scheduler - DagFileProcessor {file_path}") try: # redirect stdout/stderr to log with redirect_stdout(StreamLogWriter( log, logging.INFO)), redirect_stderr( StreamLogWriter(log, logging.WARN)), Stats.timer() as timer: # Re-configure the ORM engine as there are issues with multiple processes settings.configure_orm() # Change the thread name to differentiate log lines. This is # really a separate process, but changing the name of the # process doesn't work, so changing the thread name instead. threading.current_thread().name = thread_name log.info("Started process (PID=%s) to work on %s", os.getpid(), file_path) dag_file_processor = DagFileProcessor(dag_ids=dag_ids, log=log) result: Tuple[int, int] = dag_file_processor.process_file( file_path=file_path, pickle_dags=pickle_dags, callback_requests=callback_requests, ) result_channel.send(result) log.info("Processing %s took %.3f seconds", file_path, timer.duration) except Exception: # pylint: disable=broad-except # Log exceptions through the logging framework. log.exception("Got an exception! Propagating...") raise finally: # We re-initialized the ORM within this Process above so we need to # tear it down manually here settings.dispose_orm() result_channel.close()
def update_state( self, session: Session = None, execute_callbacks: bool = True ) -> Tuple[List[TI], Optional[callback_requests.DagCallbackRequest]]: """ Determines the overall state of the DagRun based on the state of its TaskInstances. :param session: Sqlalchemy ORM Session :type session: Session :param execute_callbacks: Should dag callbacks (success/failure, SLA etc) be invoked directly (default: true) or recorded as a pending request in the ``callback`` property :type execute_callbacks: bool :return: Tuple containing tis that can be scheduled in the current loop & `callback` that needs to be executed """ # Callback to execute in case of Task Failures callback: Optional[callback_requests.DagCallbackRequest] = None start_dttm = timezone.utcnow() self.last_scheduling_decision = start_dttm with Stats.timer(f"dagrun.dependency-check.{self.dag_id}"): dag = self.get_dag() info = self.task_instance_scheduling_decisions(session) tis = info.tis schedulable_tis = info.schedulable_tis changed_tis = info.changed_tis finished_tasks = info.finished_tasks unfinished_tasks = info.unfinished_tasks none_depends_on_past = all(not t.task.depends_on_past for t in unfinished_tasks) none_task_concurrency = all(t.task.max_active_tis_per_dag is None for t in unfinished_tasks) none_deferred = all(t.state != State.DEFERRED for t in unfinished_tasks) if unfinished_tasks and none_depends_on_past and none_task_concurrency and none_deferred: # small speed up are_runnable_tasks = (schedulable_tis or self._are_premature_tis( unfinished_tasks, finished_tasks, session) or changed_tis) leaf_task_ids = {t.task_id for t in dag.leaves} leaf_tis = [ti for ti in tis if ti.task_id in leaf_task_ids] # if all roots finished and at least one failed, the run failed if not unfinished_tasks and any(leaf_ti.state in State.failed_states for leaf_ti in leaf_tis): self.log.error('Marking run %s failed', self) self.set_state(State.FAILED) if execute_callbacks: dag.handle_callback(self, success=False, reason='task_failure', session=session) elif dag.has_on_failure_callback: callback = callback_requests.DagCallbackRequest( full_filepath=dag.fileloc, dag_id=self.dag_id, execution_date=self.execution_date, is_failure_callback=True, msg='task_failure', ) # if all leaves succeeded and no unfinished tasks, the run succeeded elif not unfinished_tasks and all(leaf_ti.state in State.success_states for leaf_ti in leaf_tis): self.log.info('Marking run %s successful', self) self.set_state(State.SUCCESS) if execute_callbacks: dag.handle_callback(self, success=True, reason='success', session=session) elif dag.has_on_success_callback: callback = callback_requests.DagCallbackRequest( full_filepath=dag.fileloc, dag_id=self.dag_id, execution_date=self.execution_date, is_failure_callback=False, msg='success', ) # if *all tasks* are deadlocked, the run failed elif (unfinished_tasks and none_depends_on_past and none_task_concurrency and none_deferred and not are_runnable_tasks): self.log.error('Deadlock; marking run %s failed', self) self.set_state(State.FAILED) if execute_callbacks: dag.handle_callback(self, success=False, reason='all_tasks_deadlocked', session=session) elif dag.has_on_failure_callback: callback = callback_requests.DagCallbackRequest( full_filepath=dag.fileloc, dag_id=self.dag_id, execution_date=self.execution_date, is_failure_callback=True, msg='all_tasks_deadlocked', ) # finally, if the roots aren't done, the dag is still running else: self.set_state(State.RUNNING) self._emit_true_scheduling_delay_stats_for_finished_state( finished_tasks) self._emit_duration_stats_for_finished_state() session.merge(self) return schedulable_tis, callback
def _do_scheduling(self, session) -> int: """ This function is where the main scheduling decisions take places. It: - Creates any necessary DAG runs by examining the next_dagrun_create_after column of DagModel Since creating Dag Runs is a relatively time consuming process, we select only 10 dags by default (configurable via ``scheduler.max_dagruns_to_create_per_loop`` setting) - putting this higher will mean one scheduler could spend a chunk of time creating dag runs, and not ever get around to scheduling tasks. - Finds the "next n oldest" running DAG Runs to examine for scheduling (n=20 by default, configurable via ``scheduler.max_dagruns_per_loop_to_schedule`` config setting) and tries to progress state (TIs to SCHEDULED, or DagRuns to SUCCESS/FAILURE etc) By "next oldest", we mean hasn't been examined/scheduled in the most time. The reason we don't select all dagruns at once because the rows are selected with row locks, meaning that only one scheduler can "process them", even it is waiting behind other dags. Increasing this limit will allow more throughput for smaller DAGs but will likely slow down throughput for larger (>500 tasks.) DAGs - Then, via a Critical Section (locking the rows of the Pool model) we queue tasks, and then send them to the executor. See docs of _critical_section_execute_task_instances for more. :return: Number of TIs enqueued in this iteration :rtype: int """ # Put a check in place to make sure we don't commit unexpectedly with prohibit_commit(session) as guard: if settings.USE_JOB_SCHEDULE: self._create_dagruns_for_dags(guard, session) self._start_queued_dagruns(session) guard.commit() dag_runs = self._get_next_dagruns_to_examine(State.RUNNING, session) # Bulk fetch the currently active dag runs for the dags we are # examining, rather than making one query per DagRun callback_tuples = [] for dag_run in dag_runs: callback_to_run = self._schedule_dag_run(dag_run, session) callback_tuples.append((dag_run, callback_to_run)) guard.commit() # Send the callbacks after we commit to ensure the context is up to date when it gets run for dag_run, callback_to_run in callback_tuples: self._send_dag_callbacks_to_processor(dag_run, callback_to_run) # Without this, the session has an invalid view of the DB session.expunge_all() # END: schedule TIs try: if self.executor.slots_available <= 0: # We know we can't do anything here, so don't even try! self.log.debug("Executor full, skipping critical section") return 0 timer = Stats.timer('scheduler.critical_section_duration') timer.start() # Find anything TIs in state SCHEDULED, try to QUEUE it (send it to the executor) num_queued_tis = self._critical_section_execute_task_instances(session=session) # Make sure we only sent this metric if we obtained the lock, otherwise we'll skew the # metric, way down timer.stop(send=True) except OperationalError as e: timer.stop(send=False) if is_lock_not_available_error(error=e): self.log.debug("Critical section lock held by another Scheduler") Stats.incr('scheduler.critical_section_busy') session.rollback() return 0 raise guard.commit() return num_queued_tis
def _run_scheduler_loop(self) -> None: """ The actual scheduler loop. The main steps in the loop are: #. Harvest DAG parsing results through DagFileProcessorAgent #. Find and queue executable tasks #. Change task instance state in DB #. Queue tasks in executor #. Heartbeat executor #. Execute queued tasks in executor asynchronously #. Sync on the states of running tasks Following is a graphic representation of these steps. .. image:: ../docs/apache-airflow/img/scheduler_loop.jpg :rtype: None """ if not self.processor_agent: raise ValueError("Processor agent is not started.") is_unit_test: bool = conf.getboolean('core', 'unit_test_mode') timers = EventScheduler() # Check on start up, then every configured interval self.adopt_or_reset_orphaned_tasks() timers.call_regular_interval( conf.getfloat('scheduler', 'orphaned_tasks_check_interval', fallback=300.0), self.adopt_or_reset_orphaned_tasks, ) timers.call_regular_interval( conf.getfloat('scheduler', 'trigger_timeout_check_interval', fallback=15.0), self.check_trigger_timeouts, ) timers.call_regular_interval( conf.getfloat('scheduler', 'pool_metrics_interval', fallback=5.0), self._emit_pool_metrics, ) for loop_count in itertools.count(start=1): with Stats.timer() as timer: if self.using_sqlite: self.processor_agent.run_single_parsing_loop() # For the sqlite case w/ 1 thread, wait until the processor # is finished to avoid concurrent access to the DB. self.log.debug("Waiting for processors to finish since we're using sqlite") self.processor_agent.wait_until_finished() with create_session() as session: num_queued_tis = self._do_scheduling(session) self.executor.heartbeat() session.expunge_all() num_finished_events = self._process_executor_events(session=session) self.processor_agent.heartbeat() # Heartbeat the scheduler periodically self.heartbeat(only_if_necessary=True) # Run any pending timed events next_event = timers.run(blocking=False) self.log.debug("Next timed event is in %f", next_event) self.log.debug("Ran scheduling loop in %.2f seconds", timer.duration) if not is_unit_test and not num_queued_tis and not num_finished_events: # If the scheduler is doing things, don't sleep. This means when there is work to do, the # scheduler will run "as quick as possible", but when it's stopped, it can sleep, dropping CPU # usage when "idle" time.sleep(min(self._processor_poll_interval, next_event)) if loop_count >= self.num_runs > 0: self.log.info( "Exiting scheduler loop as requested number of runs (%d - got to %d) has been reached", self.num_runs, loop_count, ) break if self.processor_agent.done: self.log.info( "Exiting scheduler loop as requested DAG parse count (%d) has been reached after %d" " scheduler loops", self.num_times_parse_dags, loop_count, ) break