Example #1
    def _refresh_dag_dir(self):
        """
        Refresh file paths from the dag directory if we haven't done so recently.
        """
        airflow_home = os.getenv("AIRFLOW_HOME", "/usr/lib/airflow")
        now = timezone.utcnow()
        elapsed_time_since_refresh = (
            now - self.last_dag_dir_refresh_time).total_seconds()
        if elapsed_time_since_refresh > self.dag_dir_list_interval or os.path.exists(
                airflow_home + "/scheduler.refresh"):
            # Build up a list of Python files that could contain DAGs
            self.log.info("Searching for files in %s", self._dag_directory)
            self._file_paths = list_py_file_paths(self._dag_directory)
            self.last_dag_dir_refresh_time = now
            self.log.info("There are %s files in %s", len(self._file_paths),
                          self._dag_directory)
            self.set_file_paths(self._file_paths)

            # noinspection PyBroadException
            try:
                self.log.debug("Removing old import errors")
                self.clear_nonexistent_import_errors()  # pylint: disable=no-value-for-parameter
                refresh_file = airflow_home + "/scheduler.refresh"
                # Only remove the sentinel file when it exists; on interval-driven
                # refreshes os.remove would otherwise raise and log an exception.
                if os.path.exists(refresh_file):
                    os.remove(refresh_file)
                    self.log.info("Refresh of scheduler DAGs triggered by sentinel file")
            except Exception:  # pylint: disable=broad-except
                self.log.exception("Error removing old import errors")

            if STORE_SERIALIZED_DAGS:
                from airflow.models.serialized_dag import SerializedDagModel
                from airflow.models.dag import DagModel
                SerializedDagModel.remove_deleted_dags(self._file_paths)
                DagModel.deactivate_deleted_dags(self._file_paths)
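This variant layers a sentinel-file trigger on top of the stock interval check: touching scheduler.refresh under AIRFLOW_HOME forces a rescan on the next loop, and the file is deleted once the refresh runs. A minimal sketch of how an external process could use it, assuming the same AIRFLOW_HOME resolution as above:

    import os

    # Touch the sentinel file checked by the patched _refresh_dag_dir; the next
    # scheduler loop rescans the DAG directory and removes the file again.
    airflow_home = os.getenv("AIRFLOW_HOME", "/usr/lib/airflow")
    open(os.path.join(airflow_home, "scheduler.refresh"), "a").close()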
Example #2
    def _refresh_dag_dir(self):
        """
        Refresh file paths from the dag directory if we haven't done so recently.
        """
        now = timezone.utcnow()
        elapsed_time_since_refresh = (now - self.last_dag_dir_refresh_time).total_seconds()
        if elapsed_time_since_refresh > self.dag_dir_list_interval:
            # Build up a list of Python files that could contain DAGs
            self.log.info("Searching for files in %s", self._dag_directory)
            self._file_paths = list_py_file_paths(self._dag_directory)
            self.last_dag_dir_refresh_time = now
            self.log.info("There are %s files in %s", len(self._file_paths), self._dag_directory)
            self.set_file_paths(self._file_paths)

            # noinspection PyBroadException
            try:
                self.log.debug("Removing old import errors")
                self.clear_nonexistent_import_errors()  # pylint: disable=no-value-for-parameter
            except Exception:  # pylint: disable=broad-except
                self.log.exception("Error removing old import errors")

            if STORE_SERIALIZED_DAGS:
                from airflow.models.serialized_dag import SerializedDagModel
                from airflow.models.dag import DagModel
                SerializedDagModel.remove_deleted_dags(self._file_paths)
                DagModel.deactivate_deleted_dags(self._file_paths)

            if self.store_dag_code:
                from airflow.models.dagcode import DagCode
                DagCode.remove_deleted_code(self._file_paths)
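This is the stock version of the same method: the rescan is purely time-driven. A small sketch of the interval check in isolation (the 300-second value for [scheduler] dag_dir_list_interval is the documented default, but a deployment may override it):

    from datetime import timedelta

    from airflow.utils import timezone

    # Nothing happens until dag_dir_list_interval seconds have elapsed since
    # the last refresh; only then is the DAG directory listed again.
    last_refresh = timezone.utcnow() - timedelta(seconds=400)
    interval = 300  # [scheduler] dag_dir_list_interval default
    if (timezone.utcnow() - last_refresh).total_seconds() > interval:
        print("rescan would run here")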
Example #3
    def get_dag(self, dag_id):
        """
        Gets the DAG out of the dictionary, and refreshes it if expired

        :param dag_id: DAG Id
        :type dag_id: str
        """
        # Avoid circular import
        from airflow.models.dag import DagModel

        if self.read_dags_from_db:
            # Import here so that serialized dag is only imported when serialization is enabled
            from airflow.models.serialized_dag import SerializedDagModel
            if dag_id not in self.dags:
                # Load from DB if not (yet) in the bag
                self._add_dag_from_db(dag_id=dag_id)
                return self.dags.get(dag_id)

            # If DAG is in the DagBag, check the following
            # 1. if time has come to check if DAG is updated (controlled by min_serialized_dag_fetch_secs)
            # 2. check the last_updated column in SerializedDag table to see if Serialized DAG is updated
            # 3. if (2) is yes, fetch the Serialized DAG.
            min_serialized_dag_fetch_secs = timedelta(seconds=settings.MIN_SERIALIZED_DAG_FETCH_INTERVAL)
            if (
                dag_id in self.dags_last_fetched and
                timezone.utcnow() > self.dags_last_fetched[dag_id] + min_serialized_dag_fetch_secs
            ):
                sd_last_updated_datetime = SerializedDagModel.get_last_updated_datetime(dag_id=dag_id)
                if sd_last_updated_datetime > self.dags_last_fetched[dag_id]:
                    self._add_dag_from_db(dag_id=dag_id)

            return self.dags.get(dag_id)

        # If asking for a known subdag, we want to refresh the parent
        dag = None
        root_dag_id = dag_id
        if dag_id in self.dags:
            dag = self.dags[dag_id]
            if dag.is_subdag:
                root_dag_id = dag.parent_dag.dag_id

        # If DAG Model is absent, we can't check last_expired property. Is the DAG not yet synchronized?
        orm_dag = DagModel.get_current(root_dag_id)
        if not orm_dag:
            return self.dags.get(dag_id)

        # If the dag corresponding to root_dag_id is absent or expired
        is_missing = root_dag_id not in self.dags
        is_expired = (orm_dag.last_expired and dag is not None
                      and dag.last_loaded < orm_dag.last_expired)
        if is_missing or is_expired:
            # Reprocess source file
            found_dags = self.process_file(
                filepath=correct_maybe_zipped(orm_dag.fileloc), only_if_updated=False)

            # If the source file no longer exports `dag_id`, delete it from self.dags
            if found_dags and dag_id in [found_dag.dag_id for found_dag in found_dags]:
                return self.dags[dag_id]
            elif dag_id in self.dags:
                del self.dags[dag_id]
        return self.dags.get(dag_id)
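The read_dags_from_db branch above serves serialized DAGs and rate-limits DB lookups via min_serialized_dag_fetch_interval. A hypothetical usage sketch (the dag_id is made up):

    from airflow.models.dagbag import DagBag

    # The first call loads the serialized DAG from the DB; repeated calls within
    # min_serialized_dag_fetch_interval are served from the in-memory bag.
    bag = DagBag(read_dags_from_db=True)
    dag = bag.get_dag("example_dag_id")  # hypothetical dag_id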
Example #4
    def get_dag(self, dag_id):
        """
        Gets the DAG out of the dictionary, and refreshes it if expired
        """
        from airflow.models.dag import DagModel  # Avoid circular import

        # If asking for a known subdag, we want to refresh the parent
        root_dag_id = dag_id
        if dag_id in self.dags:
            dag = self.dags[dag_id]
            if dag.is_subdag:
                root_dag_id = dag.parent_dag.dag_id

        # If the dag corresponding to root_dag_id is absent or expired
        orm_dag = DagModel.get_current(root_dag_id)
        if orm_dag and (root_dag_id not in self.dags or
                        (orm_dag.last_expired
                         and dag.last_loaded < orm_dag.last_expired)):
            # Reprocess source file
            found_dags = self.process_file(filepath=orm_dag.fileloc,
                                           only_if_updated=False)

            # If the source file no longer exports `dag_id`, delete it from self.dags
            if found_dags and dag_id in [
                    found_dag.dag_id for found_dag in found_dags
            ]:
                return self.dags[dag_id]
            elif dag_id in self.dags:
                del self.dags[dag_id]
        return self.dags.get(dag_id)
Example #5
    def get_dag(self, dag_id):
        """
        Gets the DAG out of the dictionary, and refreshes it if expired

        :param dag_id: DAG Id
        :type dag_id: str
        """
        # Avoid circular import
        from airflow.models.dag import DagModel

        # Only read DAGs from DB if this dagbag is store_serialized_dags.
        if self.store_serialized_dags:
            # Import here so that serialized dag is only imported when serialization is enabled
            from airflow.models.serialized_dag import SerializedDagModel
            if dag_id not in self.dags:
                # Load from DB if not (yet) in the bag
                row = SerializedDagModel.get(dag_id)
                if not row:
                    return None

                dag = row.dag
                for subdag in dag.subdags:
                    self.dags[subdag.dag_id] = subdag
                self.dags[dag.dag_id] = dag

            return self.dags.get(dag_id)

        # If asking for a known subdag, we want to refresh the parent
        dag = None
        root_dag_id = dag_id
        if dag_id in self.dags:
            dag = self.dags[dag_id]
            if dag.is_subdag:
                root_dag_id = dag.parent_dag.dag_id

        # Needs to load from file for a store_serialized_dags dagbag.
        enforce_from_file = False
        if self.store_serialized_dags and dag is not None:
            from airflow.serialization.serialized_objects import SerializedDAG
            enforce_from_file = isinstance(dag, SerializedDAG)

        # If the dag corresponding to root_dag_id is absent or expired
        orm_dag = DagModel.get_current(root_dag_id)
        if (orm_dag and
            (root_dag_id not in self.dags or
             (orm_dag.last_expired and dag.last_loaded < orm_dag.last_expired))
            ) or enforce_from_file:
            # Reprocess source file
            found_dags = self.process_file(filepath=correct_maybe_zipped(
                orm_dag.fileloc),
                                           only_if_updated=False)

            # If the source file no longer exports `dag_id`, delete it from self.dags
            if found_dags and dag_id in [
                    found_dag.dag_id for found_dag in found_dags
            ]:
                return self.dags[dag_id]
            elif dag_id in self.dags:
                del self.dags[dag_id]
        return self.dags.get(dag_id)
Example #6
    def _create_dagruns_for_dags(self, guard, session):
        """Find Dag Models needing DagRuns and Create Dag Runs with retries in case of OperationalError"""
        query = DagModel.dags_needing_dagruns(session)
        self._create_dag_runs(query.all(), session)

        # commit the session - Release the write lock on DagModel table.
        guard.commit()
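The guard argument suggests this helper is called inside Airflow's prohibit_commit context, which blocks implicit commits until guard.commit() releases the locks taken by DagModel.dags_needing_dagruns. A sketch of the assumed calling pattern, where scheduler and session stand in for a SchedulerJob instance and an open ORM session:

    from airflow.utils.sqlalchemy import prohibit_commit

    # Commits inside the block must go through the guard, so the write lock on
    # the DagModel rows is held exactly until guard.commit() runs.
    with prohibit_commit(session) as guard:
        scheduler._create_dagruns_for_dags(guard, session)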
Example #7
    def get_dag(self, dag_id):
        """
        Gets the DAG out of the dictionary, and refreshes it if expired

        :param dag_id: DAG Id
        :type dag_id: str
        """
        # Avoid circular import
        from airflow.models.dag import DagModel

        # Only read DAGs from DB if this dagbag is read_dags_from_db.
        if self.read_dags_from_db:
            # Import here so that serialized dag is only imported when serialization is enabled
            from airflow.models.serialized_dag import SerializedDagModel
            if dag_id not in self.dags:
                # Load from DB if not (yet) in the bag
                row = SerializedDagModel.get(dag_id)
                if not row:
                    return None

                dag = row.dag
                for subdag in dag.subdags:
                    self.dags[subdag.dag_id] = subdag
                self.dags[dag.dag_id] = dag

            return self.dags.get(dag_id)

        # If asking for a known subdag, we want to refresh the parent
        dag = None
        root_dag_id = dag_id
        if dag_id in self.dags:
            dag = self.dags[dag_id]
            if dag.is_subdag:
                root_dag_id = dag.parent_dag.dag_id

        # If DAG Model is absent, we can't check last_expired property. Is the DAG not yet synchronized?
        orm_dag = DagModel.get_current(root_dag_id)
        if not orm_dag:
            return self.dags.get(dag_id)

        # If the dag corresponding to root_dag_id is absent or expired
        is_missing = root_dag_id not in self.dags
        is_expired = (orm_dag.last_expired and dag is not None
                      and dag.last_loaded < orm_dag.last_expired)
        if is_missing or is_expired:
            # Reprocess source file
            found_dags = self.process_file(filepath=correct_maybe_zipped(
                orm_dag.fileloc),
                                           only_if_updated=False)

            # If the source file no longer exports `dag_id`, delete it from self.dags
            if found_dags and dag_id in [
                    found_dag.dag_id for found_dag in found_dags
            ]:
                return self.dags[dag_id]
            elif dag_id in self.dags:
                del self.dags[dag_id]
        return self.dags.get(dag_id)
Example #8
    def _list_dags(self):
        dagbag = DagBag()
        dags = []

        for dag_id in dagbag.dags:
            orm_dag = DagModel.get_current(dag_id)
            # inactive DAGs can't be backfilled...
            is_active = orm_dag is not None and not orm_dag.is_paused

            if is_active:
                dags.append(dag_id)

        return dags
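The same filter can be written in one pass. A condensed sketch, not the original code (requires Python 3.8+ for the walrus operator):

    from airflow.models import DagBag, DagModel

    dagbag = DagBag()
    # Keep only dag_ids whose DagModel row exists and is not paused.
    dags = [
        dag_id
        for dag_id in dagbag.dags
        if (orm := DagModel.get_current(dag_id)) is not None and not orm.is_paused
    ]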
Example #9
    def process_file(
        self,
        file_path: str,
        callback_requests: List[CallbackRequest],
        pickle_dags: bool = False,
        session: Session = None,
    ) -> Tuple[int, int]:
        """
        Process a Python file containing Airflow DAGs.

        This includes:

        1. Execute the file and look for DAG objects in the namespace.
        2. Execute any Callbacks if passed to this method.
        3. Serialize the DAGs and save them to the DB (or update the existing records).
        4. Pickle the DAG and save it to the DB (if necessary).
        5. Mark any DAGs which are no longer present as inactive.
        6. Record any errors importing the file into the ORM.

        :param file_path: the path to the Python file that should be executed
        :param callback_requests: failure callbacks to execute
        :param pickle_dags: whether to pickle the DAGs found in the file and
            save them to the db
        :param session: Sqlalchemy ORM Session
        :return: number of dags found, count of import errors
        :rtype: Tuple[int, int]
        """
        self.log.info("Processing file %s for tasks to queue", file_path)

        try:
            dagbag = DagBag(file_path,
                            include_examples=False,
                            include_smart_sensor=False)
        except Exception:
            self.log.exception("Failed at reloading the DAG file %s",
                               file_path)
            Stats.incr('dag_file_refresh_error', 1, 1)
            return 0, 0

        self._deactivate_missing_dags(session, dagbag, file_path)

        if len(dagbag.dags) > 0:
            self.log.info("DAG(s) %s retrieved from %s", dagbag.dags.keys(),
                          file_path)
        else:
            self.log.warning("No viable dags retrieved from %s", file_path)
            self.update_import_errors(session, dagbag)
            return 0, len(dagbag.import_errors)

        self.execute_callbacks(dagbag, callback_requests)

        # Save individual DAGs in the ORM
        dagbag.sync_to_db()

        if pickle_dags:
            paused_dag_ids = DagModel.get_paused_dag_ids(
                dag_ids=dagbag.dag_ids)

            unpaused_dags: List[DAG] = [
                dag for dag_id, dag in dagbag.dags.items()
                if dag_id not in paused_dag_ids
            ]

            for dag in unpaused_dags:
                dag.pickle(session)

        # Record import errors into the ORM
        try:
            self.update_import_errors(session, dagbag)
        except Exception:
            self.log.exception("Error logging import errors!")

        return len(dagbag.dags), len(dagbag.import_errors)
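A hypothetical invocation of the method above (the processor instance and file path are made up, and the session is assumed to be injected by a @provide_session-style default):

    # Returns (number of DAGs found, number of import errors).
    dag_count, error_count = processor.process_file(
        file_path="/path/to/dags/my_dag.py",  # hypothetical path
        callback_requests=[],                 # no callbacks to run this pass
        pickle_dags=False,
    )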
Example #10
    def get_dag(self, dag_id, from_file_only=False):
        """
        Gets the DAG out of the dictionary, and refreshes it if expired

        :param dag_id: DAG Id
        :type dag_id: str
        :param from_file_only: returns a DAG loaded from file.
        :type from_file_only: bool
        """
        # Avoid circular import
        from airflow.models.dag import DagModel

        # Only read DAGs from DB if this dagbag is store_serialized_dags.
        # from_file_only is an exception, currently it is for renderring templates
        # in UI only. Because functions are gone in serialized DAGs, DAGs must be
        # imported from files.
        # FIXME: this exception should be removed in future, then webserver can be
        # decoupled from DAG files.
        if self.store_serialized_dags and not from_file_only:
            # Import here so that serialized dag is only imported when serialization is enabled
            from airflow.models.serialized_dag import SerializedDagModel
            if dag_id not in self.dags:
                # Load from DB if not (yet) in the bag
                row = SerializedDagModel.get(dag_id)
                if not row:
                    return None

                dag = row.dag
                for subdag in dag.subdags:
                    self.dags[subdag.dag_id] = subdag
                self.dags[dag.dag_id] = dag

            return self.dags.get(dag_id)

        # If asking for a known subdag, we want to refresh the parent
        dag = None
        root_dag_id = dag_id
        if dag_id in self.dags:
            dag = self.dags[dag_id]
            if dag.is_subdag:
                root_dag_id = dag.parent_dag.dag_id

        # Needs to load from file for a store_serialized_dags dagbag.
        enforce_from_file = False
        if self.store_serialized_dags and dag is not None:
            from airflow.serialization.serialized_dag import SerializedDAG
            enforce_from_file = isinstance(dag, SerializedDAG)

        # If the dag corresponding to root_dag_id is absent or expired
        orm_dag = DagModel.get_current(root_dag_id)
        if (orm_dag and (
                root_dag_id not in self.dags or
                (
                    orm_dag.last_expired and
                    dag.last_loaded < orm_dag.last_expired
                )
        )) or enforce_from_file:
            # Reprocess source file. When enforce_from_file is set the ORM row
            # may be absent, so fall back to the fileloc on the serialized DAG.
            fileloc = orm_dag.fileloc if orm_dag else dag.fileloc
            found_dags = self.process_file(
                filepath=correct_maybe_zipped(fileloc), only_if_updated=False)

            # If the source file no longer exports `dag_id`, delete it from self.dags
            if found_dags and dag_id in [found_dag.dag_id for found_dag in found_dags]:
                return self.dags[dag_id]
            elif dag_id in self.dags:
                del self.dags[dag_id]
        return self.dags.get(dag_id)
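Per the FIXME in the docstring, from_file_only exists so the webserver can render templates: callables on template fields do not survive serialization, so rendering must bypass the serialized store. A hypothetical call (the dag_id is made up):

    # Forces a load from the source file even when store_serialized_dags is on.
    dag = dagbag.get_dag("example_dag_id", from_file_only=True)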
Example #11
    def test_scheduler_task(self):
        TEST_DAG_FOLDER = os.environ['AIRFLOW__CORE__DAGS_FOLDER']
        DEFAULT_DATE = timezone.datetime(2020, 1, 1)
        dag_id = 'test_event_based_dag'
        task_id = 'sleep_1000_secs'

        with create_session() as session:
            dag_bag = DagBag(
                dag_folder=TEST_DAG_FOLDER,
                include_examples=False,
            )

            dag = dag_bag.get_dag(dag_id)
            task = dag.get_task(task_id)
            dag.create_dagrun(
                run_id="sleep_1000_secs_run",
                state=State.RUNNING,
                execution_date=DEFAULT_DATE,
                start_date=DEFAULT_DATE,
                session=session,
            )
            ti = TaskInstance(task=task, execution_date=DEFAULT_DATE)
            ti.state = State.SCHEDULED
            dag_model = DagModel(
                dag_id=dag_id,
                is_paused=False,
                concurrency=5,
                has_task_concurrency_limits=False,
            )
            session.merge(dag_model)
            session.merge(ti)
            session.commit()
        executor = LocalExecutor(2)
        executor.start()
        executor.heartbeat()
        executor.schedule_task(ti.key, SchedulingAction.START)
        executor.heartbeat()
        time.sleep(30)

        #  wait for the task instance to start
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.RUNNING)
        process = psutil.Process(ti.pid)
        self.assertIsNotNone(process)
        child = process.children(recursive=False)
        self.assertEqual(1, len(child))
        grandchild = child[0].children(recursive=False)
        self.assertEqual(1, len(grandchild))
        tes = self._check_task_execution(ti)
        self.assertEqual(1, len(tes))

        #  restart the task instance
        executor.schedule_task(ti.key, SchedulingAction.RESTART)
        executor.heartbeat()
        time.sleep(30)

        self.assertFalse(self._check_process_exist(process.pid))
        self.assertFalse(self._check_process_exist(child[0].pid))
        self.assertFalse(self._check_process_exist(grandchild[0].pid))

        ti.refresh_from_db()
        self.assertEqual(ti.state, State.RUNNING)
        process = psutil.Process(ti.pid)
        self.assertIsNotNone(process)
        child = process.children(recursive=False)
        self.assertEqual(1, len(child))
        grandchild = child[0].children(recursive=False)
        self.assertEqual(1, len(grandchild))
        tes = self._check_task_execution(ti)
        self.assertEqual(2, len(tes))
        self.assertEqual(2, tes[0].seq_num)

        executor.schedule_task(ti.key, SchedulingAction.STOP)
        ti.refresh_from_db()
        time.sleep(10)
        self.assertEqual(State.KILLED, ti.state)
        self.assertFalse(self._check_process_exist(process.pid))
        self.assertFalse(self._check_process_exist(child[0].pid))
        self.assertFalse(self._check_process_exist(grandchild[0].pid))
        self._check_task_execution(ti)

        executor.end()
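The test leans on a _check_process_exist helper that is not shown. A minimal sketch, assuming it only needs to ask the operating system whether the pid is still alive via psutil:

    import psutil

    def _check_process_exist(self, pid):
        # True while the operating system still knows this pid.
        return psutil.pid_exists(pid)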
Example #12
    def execute(self, context: Context):
        if isinstance(self.execution_date, datetime.datetime):
            parsed_execution_date = self.execution_date
        elif isinstance(self.execution_date, str):
            parsed_execution_date = timezone.parse(self.execution_date)
        else:
            parsed_execution_date = timezone.utcnow()

        if self.trigger_run_id:
            run_id = self.trigger_run_id
        else:
            run_id = DagRun.generate_run_id(DagRunType.MANUAL,
                                            parsed_execution_date)
        try:
            dag_run = trigger_dag(
                dag_id=self.trigger_dag_id,
                run_id=run_id,
                conf=self.conf,
                execution_date=parsed_execution_date,
                replace_microseconds=False,
            )

        except DagRunAlreadyExists as e:
            if self.reset_dag_run:
                self.log.info("Clearing %s on %s", self.trigger_dag_id,
                              parsed_execution_date)

                # Get target dag object and call clear()

                dag_model = DagModel.get_current(self.trigger_dag_id)
                if dag_model is None:
                    raise DagNotFound(
                        f"Dag id {self.trigger_dag_id} not found in DagModel")

                dag_bag = DagBag(dag_folder=dag_model.fileloc,
                                 read_dags_from_db=True)
                dag = dag_bag.get_dag(self.trigger_dag_id)
                dag.clear(start_date=parsed_execution_date,
                          end_date=parsed_execution_date)
                dag_run = DagRun.find(dag_id=dag.dag_id, run_id=run_id)[0]
            else:
                raise e
        if dag_run is None:
            raise RuntimeError("The dag_run should be set here!")
        # Store the execution date from the dag run (either created or found above) to
        # be used when creating the extra link on the webserver.
        ti = context['task_instance']
        ti.xcom_push(key=XCOM_EXECUTION_DATE_ISO,
                     value=dag_run.execution_date.isoformat())
        ti.xcom_push(key=XCOM_RUN_ID, value=dag_run.run_id)

        if self.wait_for_completion:
            # wait for dag to complete
            while True:
                self.log.info(
                    'Waiting for %s on %s to become allowed state %s ...',
                    self.trigger_dag_id,
                    dag_run.execution_date,
                    self.allowed_states,
                )
                time.sleep(self.poke_interval)

                dag_run.refresh_from_db()
                state = dag_run.state
                if state in self.failed_states:
                    raise AirflowException(
                        f"{self.trigger_dag_id} failed with failed states {state}"
                    )
                if state in self.allowed_states:
                    self.log.info("%s finished with allowed state %s",
                                  self.trigger_dag_id, state)
                    return
Example #13
    def resume_workflow_scheduling(self, project_name: Text,
                                   workflow_name: Text) -> WorkflowInfo:
        dag_id = self.airflow_dag_id(project_name, workflow_name)
        DagModel.get_dagmodel(dag_id=dag_id).set_is_paused(is_paused=False)
        return WorkflowInfo(namespace=project_name,
                            workflow_name=workflow_name)
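A hypothetical pause counterpart to the method above, mirroring the same DagModel toggle with is_paused=True:

    def pause_workflow_scheduling(self, project_name: Text,
                                  workflow_name: Text) -> WorkflowInfo:
        # Same lookup as resume, but suspends scheduling for the workflow's DAG.
        dag_id = self.airflow_dag_id(project_name, workflow_name)
        DagModel.get_dagmodel(dag_id=dag_id).set_is_paused(is_paused=True)
        return WorkflowInfo(namespace=project_name,
                            workflow_name=workflow_name)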