def _refresh_dag_dir(self): """ Refresh file paths from dag dir if we haven't done it for too long. """ airflow_home = os.getenv("AIRFLOW_HOME", "/usr/lib/airflow") now = timezone.utcnow() elapsed_time_since_refresh = ( now - self.last_dag_dir_refresh_time).total_seconds() if elapsed_time_since_refresh > self.dag_dir_list_interval or os.path.exists( airflow_home + "/scheduler.refresh"): # Build up a list of Python files that could contain DAGs self.log.info("Searching for files in %s", self._dag_directory) self._file_paths = list_py_file_paths(self._dag_directory) self.last_dag_dir_refresh_time = now self.log.info("There are %s files in %s", len(self._file_paths), self._dag_directory) self.set_file_paths(self._file_paths) # noinspection PyBroadException try: self.log.debug("Removing old import errors") self.clear_nonexistent_import_errors() # pylint: disable=no-value-for-parameter os.remove(airflow_home + "/scheduler.refresh") self.logger.info("Refresh Dags of scheduler triggered") except Exception: # pylint: disable=broad-except self.log.exception("Error removing old import errors") if STORE_SERIALIZED_DAGS: from airflow.models.serialized_dag import SerializedDagModel from airflow.models.dag import DagModel SerializedDagModel.remove_deleted_dags(self._file_paths) DagModel.deactivate_deleted_dags(self._file_paths)
def _refresh_dag_dir(self): """ Refresh file paths from dag dir if we haven't done it for too long. """ now = timezone.utcnow() elapsed_time_since_refresh = (now - self.last_dag_dir_refresh_time).total_seconds() if elapsed_time_since_refresh > self.dag_dir_list_interval: # Build up a list of Python files that could contain DAGs self.log.info("Searching for files in %s", self._dag_directory) self._file_paths = list_py_file_paths(self._dag_directory) self.last_dag_dir_refresh_time = now self.log.info("There are %s files in %s", len(self._file_paths), self._dag_directory) self.set_file_paths(self._file_paths) # noinspection PyBroadException try: self.log.debug("Removing old import errors") self.clear_nonexistent_import_errors() # pylint: disable=no-value-for-parameter except Exception: # pylint: disable=broad-except self.log.exception("Error removing old import errors") if STORE_SERIALIZED_DAGS: from airflow.models.serialized_dag import SerializedDagModel from airflow.models.dag import DagModel SerializedDagModel.remove_deleted_dags(self._file_paths) DagModel.deactivate_deleted_dags(self._file_paths) if self.store_dag_code: from airflow.models.dagcode import DagCode DagCode.remove_deleted_code(self._file_paths)
def get_dag(self, dag_id): """ Gets the DAG out of the dictionary, and refreshes it if expired :param dag_id: DAG Id :type dag_id: str """ # Avoid circular import from airflow.models.dag import DagModel if self.read_dags_from_db: # Import here so that serialized dag is only imported when serialization is enabled from airflow.models.serialized_dag import SerializedDagModel if dag_id not in self.dags: # Load from DB if not (yet) in the bag self._add_dag_from_db(dag_id=dag_id) return self.dags.get(dag_id) # If DAG is in the DagBag, check the following # 1. if time has come to check if DAG is updated (controlled by min_serialized_dag_fetch_secs) # 2. check the last_updated column in SerializedDag table to see if Serialized DAG is updated # 3. if (2) is yes, fetch the Serialized DAG. min_serialized_dag_fetch_secs = timedelta(seconds=settings.MIN_SERIALIZED_DAG_FETCH_INTERVAL) if ( dag_id in self.dags_last_fetched and timezone.utcnow() > self.dags_last_fetched[dag_id] + min_serialized_dag_fetch_secs ): sd_last_updated_datetime = SerializedDagModel.get_last_updated_datetime(dag_id=dag_id) if sd_last_updated_datetime > self.dags_last_fetched[dag_id]: self._add_dag_from_db(dag_id=dag_id) return self.dags.get(dag_id) # If asking for a known subdag, we want to refresh the parent dag = None root_dag_id = dag_id if dag_id in self.dags: dag = self.dags[dag_id] if dag.is_subdag: root_dag_id = dag.parent_dag.dag_id # If DAG Model is absent, we can't check last_expired property. Is the DAG not yet synchronized? orm_dag = DagModel.get_current(root_dag_id) if not orm_dag: return self.dags.get(dag_id) # If the dag corresponding to root_dag_id is absent or expired is_missing = root_dag_id not in self.dags is_expired = (orm_dag.last_expired and dag.last_loaded < orm_dag.last_expired) if is_missing or is_expired: # Reprocess source file found_dags = self.process_file( filepath=correct_maybe_zipped(orm_dag.fileloc), only_if_updated=False) # If the source file no longer exports `dag_id`, delete it from self.dags if found_dags and dag_id in [found_dag.dag_id for found_dag in found_dags]: return self.dags[dag_id] elif dag_id in self.dags: del self.dags[dag_id] return self.dags.get(dag_id)
def get_dag(self, dag_id): """ Gets the DAG out of the dictionary, and refreshes it if expired """ from airflow.models.dag import DagModel # Avoid circular import # If asking for a known subdag, we want to refresh the parent root_dag_id = dag_id if dag_id in self.dags: dag = self.dags[dag_id] if dag.is_subdag: root_dag_id = dag.parent_dag.dag_id # If the dag corresponding to root_dag_id is absent or expired orm_dag = DagModel.get_current(root_dag_id) if orm_dag and (root_dag_id not in self.dags or (orm_dag.last_expired and dag.last_loaded < orm_dag.last_expired)): # Reprocess source file found_dags = self.process_file(filepath=orm_dag.fileloc, only_if_updated=False) # If the source file no longer exports `dag_id`, delete it from self.dags if found_dags and dag_id in [ found_dag.dag_id for found_dag in found_dags ]: return self.dags[dag_id] elif dag_id in self.dags: del self.dags[dag_id] return self.dags.get(dag_id)
def get_dag(self, dag_id): """ Gets the DAG out of the dictionary, and refreshes it if expired :param dag_id: DAG Id :type dag_id: str """ # Avoid circular import from airflow.models.dag import DagModel # Only read DAGs from DB if this dagbag is store_serialized_dags. if self.store_serialized_dags: # Import here so that serialized dag is only imported when serialization is enabled from airflow.models.serialized_dag import SerializedDagModel if dag_id not in self.dags: # Load from DB if not (yet) in the bag row = SerializedDagModel.get(dag_id) if not row: return None dag = row.dag for subdag in dag.subdags: self.dags[subdag.dag_id] = subdag self.dags[dag.dag_id] = dag return self.dags.get(dag_id) # If asking for a known subdag, we want to refresh the parent dag = None root_dag_id = dag_id if dag_id in self.dags: dag = self.dags[dag_id] if dag.is_subdag: root_dag_id = dag.parent_dag.dag_id # Needs to load from file for a store_serialized_dags dagbag. enforce_from_file = False if self.store_serialized_dags and dag is not None: from airflow.serialization.serialized_objects import SerializedDAG enforce_from_file = isinstance(dag, SerializedDAG) # If the dag corresponding to root_dag_id is absent or expired orm_dag = DagModel.get_current(root_dag_id) if (orm_dag and (root_dag_id not in self.dags or (orm_dag.last_expired and dag.last_loaded < orm_dag.last_expired)) ) or enforce_from_file: # Reprocess source file found_dags = self.process_file(filepath=correct_maybe_zipped( orm_dag.fileloc), only_if_updated=False) # If the source file no longer exports `dag_id`, delete it from self.dags if found_dags and dag_id in [ found_dag.dag_id for found_dag in found_dags ]: return self.dags[dag_id] elif dag_id in self.dags: del self.dags[dag_id] return self.dags.get(dag_id)
def _create_dagruns_for_dags(self, guard, session): """Find Dag Models needing DagRuns and Create Dag Runs with retries in case of OperationalError""" query = DagModel.dags_needing_dagruns(session) self._create_dag_runs(query.all(), session) # commit the session - Release the write lock on DagModel table. guard.commit()
def get_dag(self, dag_id): """ Gets the DAG out of the dictionary, and refreshes it if expired :param dag_id: DAG Id :type dag_id: str """ # Avoid circular import from airflow.models.dag import DagModel # Only read DAGs from DB if this dagbag is read_dags_from_db. if self.read_dags_from_db: # Import here so that serialized dag is only imported when serialization is enabled from airflow.models.serialized_dag import SerializedDagModel if dag_id not in self.dags: # Load from DB if not (yet) in the bag row = SerializedDagModel.get(dag_id) if not row: return None dag = row.dag for subdag in dag.subdags: self.dags[subdag.dag_id] = subdag self.dags[dag.dag_id] = dag return self.dags.get(dag_id) # If asking for a known subdag, we want to refresh the parent dag = None root_dag_id = dag_id if dag_id in self.dags: dag = self.dags[dag_id] if dag.is_subdag: root_dag_id = dag.parent_dag.dag_id # If DAG Model is absent, we can't check last_expired property. Is the DAG not yet synchronized? orm_dag = DagModel.get_current(root_dag_id) if not orm_dag: return self.dags.get(dag_id) # If the dag corresponding to root_dag_id is absent or expired is_missing = root_dag_id not in self.dags is_expired = (orm_dag.last_expired and dag.last_loaded < orm_dag.last_expired) if is_missing or is_expired: # Reprocess source file found_dags = self.process_file(filepath=correct_maybe_zipped( orm_dag.fileloc), only_if_updated=False) # If the source file no longer exports `dag_id`, delete it from self.dags if found_dags and dag_id in [ found_dag.dag_id for found_dag in found_dags ]: return self.dags[dag_id] elif dag_id in self.dags: del self.dags[dag_id] return self.dags.get(dag_id)
def _list_dags(self): dagbag = DagBag() dags = [] for dag_id in dagbag.dags: orm_dag = DagModel.get_current(dag_id) # inactive DAGs can't be backfilled.... is_active = ( not orm_dag.is_paused) if orm_dag is not None else False if is_active: dags.append(dag_id) return dags
def process_file( self, file_path: str, callback_requests: List[CallbackRequest], pickle_dags: bool = False, session: Session = None, ) -> Tuple[int, int]: """ Process a Python file containing Airflow DAGs. This includes: 1. Execute the file and look for DAG objects in the namespace. 2. Execute any Callbacks if passed to this method. 3. Serialize the DAGs and save it to DB (or update existing record in the DB). 4. Pickle the DAG and save it to the DB (if necessary). 5. Mark any DAGs which are no longer present as inactive 6. Record any errors importing the file into ORM :param file_path: the path to the Python file that should be executed :param callback_requests: failure callback to execute :param pickle_dags: whether serialize the DAGs found in the file and save them to the db :param session: Sqlalchemy ORM Session :return: number of dags found, count of import errors :rtype: Tuple[int, int] """ self.log.info("Processing file %s for tasks to queue", file_path) try: dagbag = DagBag(file_path, include_examples=False, include_smart_sensor=False) except Exception: self.log.exception("Failed at reloading the DAG file %s", file_path) Stats.incr('dag_file_refresh_error', 1, 1) return 0, 0 self._deactivate_missing_dags(session, dagbag, file_path) if len(dagbag.dags) > 0: self.log.info("DAG(s) %s retrieved from %s", dagbag.dags.keys(), file_path) else: self.log.warning("No viable dags retrieved from %s", file_path) self.update_import_errors(session, dagbag) return 0, len(dagbag.import_errors) self.execute_callbacks(dagbag, callback_requests) # Save individual DAGs in the ORM dagbag.sync_to_db() if pickle_dags: paused_dag_ids = DagModel.get_paused_dag_ids( dag_ids=dagbag.dag_ids) unpaused_dags: List[DAG] = [ dag for dag_id, dag in dagbag.dags.items() if dag_id not in paused_dag_ids ] for dag in unpaused_dags: dag.pickle(session) # Record import errors into the ORM try: self.update_import_errors(session, dagbag) except Exception: self.log.exception("Error logging import errors!") return len(dagbag.dags), len(dagbag.import_errors)
def get_dag(self, dag_id, from_file_only=False): """ Gets the DAG out of the dictionary, and refreshes it if expired :param dag_id: DAG Id :type dag_id: str :param from_file_only: returns a DAG loaded from file. :type from_file_only: bool """ # Avoid circular import from airflow.models.dag import DagModel # Only read DAGs from DB if this dagbag is store_serialized_dags. # from_file_only is an exception, currently it is for renderring templates # in UI only. Because functions are gone in serialized DAGs, DAGs must be # imported from files. # FIXME: this exception should be removed in future, then webserver can be # decoupled from DAG files. if self.store_serialized_dags and not from_file_only: # Import here so that serialized dag is only imported when serialization is enabled from airflow.models.serialized_dag import SerializedDagModel if dag_id not in self.dags: # Load from DB if not (yet) in the bag row = SerializedDagModel.get(dag_id) if not row: return None dag = row.dag for subdag in dag.subdags: self.dags[subdag.dag_id] = subdag self.dags[dag.dag_id] = dag return self.dags.get(dag_id) # If asking for a known subdag, we want to refresh the parent dag = None root_dag_id = dag_id if dag_id in self.dags: dag = self.dags[dag_id] if dag.is_subdag: root_dag_id = dag.parent_dag.dag_id # Needs to load from file for a store_serialized_dags dagbag. enforce_from_file = False if self.store_serialized_dags and dag is not None: from airflow.serialization.serialized_dag import SerializedDAG enforce_from_file = isinstance(dag, SerializedDAG) # If the dag corresponding to root_dag_id is absent or expired orm_dag = DagModel.get_current(root_dag_id) if (orm_dag and ( root_dag_id not in self.dags or ( orm_dag.last_expired and dag.last_loaded < orm_dag.last_expired ) )) or enforce_from_file: # Reprocess source file found_dags = self.process_file( filepath=correct_maybe_zipped(orm_dag.fileloc), only_if_updated=False) # If the source file no longer exports `dag_id`, delete it from self.dags if found_dags and dag_id in [found_dag.dag_id for found_dag in found_dags]: return self.dags[dag_id] elif dag_id in self.dags: del self.dags[dag_id] return self.dags.get(dag_id)
def test_scheduler_task(self): TEST_DAG_FOLDER = os.environ['AIRFLOW__CORE__DAGS_FOLDER'] DEFAULT_DATE = timezone.datetime(2020, 1, 1) dag_id = 'test_event_based_dag' task_id = 'sleep_1000_secs' with create_session() as session: dag_bag = DagBag( dag_folder=TEST_DAG_FOLDER, include_examples=False, ) dag = dag_bag.get_dag(dag_id) task = dag.get_task(task_id) dag.create_dagrun( run_id="sleep_1000_secs_run", state=State.RUNNING, execution_date=DEFAULT_DATE, start_date=DEFAULT_DATE, session=session, ) ti = TaskInstance(task=task, execution_date=DEFAULT_DATE) ti.state = State.SCHEDULED dag_model = DagModel( dag_id=dag_id, is_paused=False, concurrency=5, has_task_concurrency_limits=False, ) session.merge(dag_model) session.merge(ti) session.commit() executor = LocalExecutor(2) executor.start() executor.heartbeat() executor.schedule_task(ti.key, SchedulingAction.START) executor.heartbeat() time.sleep(30) # wait for task instance started ti.refresh_from_db() self.assertEqual(ti.state, State.RUNNING) process = psutil.Process(ti.pid) self.assertIsNotNone(process) child = process.children(recursive=False) self.assertEqual(1, len(child)) grandchild = child[0].children(recursive=False) self.assertEqual(1, len(grandchild)) tes = self._check_task_execution(ti) self.assertEqual(1, len(tes)) # restart the task instance executor.schedule_task(ti.key, SchedulingAction.RESTART) executor.heartbeat() time.sleep(30) self.assertFalse(self._check_process_exist(process.pid)) self.assertFalse(self._check_process_exist(child[0].pid)) self.assertFalse(self._check_process_exist(grandchild[0].pid)) ti.refresh_from_db() self.assertEqual(ti.state, State.RUNNING) process = psutil.Process(ti.pid) self.assertIsNotNone(process) child = process.children(recursive=False) self.assertEqual(1, len(child)) grandchild = child[0].children(recursive=False) self.assertEqual(1, len(grandchild)) tes = self._check_task_execution(ti) self.assertEqual(2, len(tes)) self.assertEqual(2, tes[0].seq_num) executor.schedule_task(ti.key, SchedulingAction.STOP) ti.refresh_from_db() time.sleep(10) self.assertEqual(State.KILLED, ti.state) self.assertFalse(self._check_process_exist(process.pid)) self.assertFalse(self._check_process_exist(child[0].pid)) self.assertFalse(self._check_process_exist(grandchild[0].pid)) self._check_task_execution(ti) executor.end()
def execute(self, context: Context): if isinstance(self.execution_date, datetime.datetime): parsed_execution_date = self.execution_date elif isinstance(self.execution_date, str): parsed_execution_date = timezone.parse(self.execution_date) else: parsed_execution_date = timezone.utcnow() if self.trigger_run_id: run_id = self.trigger_run_id else: run_id = DagRun.generate_run_id(DagRunType.MANUAL, parsed_execution_date) try: dag_run = trigger_dag( dag_id=self.trigger_dag_id, run_id=run_id, conf=self.conf, execution_date=parsed_execution_date, replace_microseconds=False, ) except DagRunAlreadyExists as e: if self.reset_dag_run: self.log.info("Clearing %s on %s", self.trigger_dag_id, parsed_execution_date) # Get target dag object and call clear() dag_model = DagModel.get_current(self.trigger_dag_id) if dag_model is None: raise DagNotFound( f"Dag id {self.trigger_dag_id} not found in DagModel") dag_bag = DagBag(dag_folder=dag_model.fileloc, read_dags_from_db=True) dag = dag_bag.get_dag(self.trigger_dag_id) dag.clear(start_date=parsed_execution_date, end_date=parsed_execution_date) dag_run = DagRun.find(dag_id=dag.dag_id, run_id=run_id)[0] else: raise e if dag_run is None: raise RuntimeError("The dag_run should be set here!") # Store the execution date from the dag run (either created or found above) to # be used when creating the extra link on the webserver. ti = context['task_instance'] ti.xcom_push(key=XCOM_EXECUTION_DATE_ISO, value=dag_run.execution_date.isoformat()) ti.xcom_push(key=XCOM_RUN_ID, value=dag_run.run_id) if self.wait_for_completion: # wait for dag to complete while True: self.log.info( 'Waiting for %s on %s to become allowed state %s ...', self.trigger_dag_id, dag_run.execution_date, self.allowed_states, ) time.sleep(self.poke_interval) dag_run.refresh_from_db() state = dag_run.state if state in self.failed_states: raise AirflowException( f"{self.trigger_dag_id} failed with failed states {state}" ) if state in self.allowed_states: self.log.info("%s finished with allowed state %s", self.trigger_dag_id, state) return
def resume_workflow_scheduling(self, project_name: Text, workflow_name: Text) -> WorkflowInfo: dag_id = self.airflow_dag_id(project_name, workflow_name) DagModel.get_dagmodel(dag_id=dag_id).set_is_paused(is_paused=False) return WorkflowInfo(namespace=project_name, workflow_name=workflow_name)