Example #1
    def test_correct_maybe_zipped_normal_file_with_zip_in_name(self, mocked_is_zipfile):
        path = '/path/to/fakearchive.zip.other/file.txt'
        mocked_is_zipfile.return_value = False

        dag_folder = correct_maybe_zipped(path)

        assert dag_folder == path
Example #2
    def get_dag(self, dag_id):
        """
        Gets the DAG out of the dictionary, and refreshes it if expired

        :param dag_id: DAG Id
        :type dag_id: str
        """
        # Avoid circular import
        from airflow.models.dag import DagModel

        if self.read_dags_from_db:
            # Import here so that serialized dag is only imported when serialization is enabled
            from airflow.models.serialized_dag import SerializedDagModel
            if dag_id not in self.dags:
                # Load from DB if not (yet) in the bag
                self._add_dag_from_db(dag_id=dag_id)
                return self.dags.get(dag_id)

            # If DAG is in the DagBag, check the following
            # 1. if time has come to check if DAG is updated (controlled by min_serialized_dag_fetch_secs)
            # 2. check the last_updated column in SerializedDag table to see if Serialized DAG is updated
            # 3. if (2) is yes, fetch the Serialized DAG.
            min_serialized_dag_fetch_secs = timedelta(seconds=settings.MIN_SERIALIZED_DAG_FETCH_INTERVAL)
            if (
                dag_id in self.dags_last_fetched and
                timezone.utcnow() > self.dags_last_fetched[dag_id] + min_serialized_dag_fetch_secs
            ):
                sd_last_updated_datetime = SerializedDagModel.get_last_updated_datetime(dag_id=dag_id)
                if sd_last_updated_datetime > self.dags_last_fetched[dag_id]:
                    self._add_dag_from_db(dag_id=dag_id)

            return self.dags.get(dag_id)

        # If asking for a known subdag, we want to refresh the parent
        dag = None
        root_dag_id = dag_id
        if dag_id in self.dags:
            dag = self.dags[dag_id]
            if dag.is_subdag:
                root_dag_id = dag.parent_dag.dag_id

        # If DAG Model is absent, we can't check last_expired property. Is the DAG not yet synchronized?
        orm_dag = DagModel.get_current(root_dag_id)
        if not orm_dag:
            return self.dags.get(dag_id)

        # If the dag corresponding to root_dag_id is absent or expired
        is_missing = root_dag_id not in self.dags
        is_expired = (
            orm_dag.last_expired and dag and dag.last_loaded < orm_dag.last_expired
        )
        if is_missing or is_expired:
            # Reprocess source file
            found_dags = self.process_file(
                filepath=correct_maybe_zipped(orm_dag.fileloc), only_if_updated=False)

            # If the source file no longer exports `dag_id`, delete it from self.dags
            if found_dags and dag_id in [found_dag.dag_id for found_dag in found_dags]:
                return self.dags[dag_id]
            elif dag_id in self.dags:
                del self.dags[dag_id]
        return self.dags.get(dag_id)
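
A minimal usage sketch for the method above, assuming an Airflow 2.x DagBag created with read_dags_from_db=True (the dag_id below is hypothetical):

from airflow.models.dagbag import DagBag

# Serve DAGs from the serialized_dag table instead of parsing DAG files.
bag = DagBag(read_dags_from_db=True)

# The first access loads the DAG from the DB; later calls re-fetch it only after
# settings.MIN_SERIALIZED_DAG_FETCH_INTERVAL has passed and the serialized copy is newer.
dag = bag.get_dag("example_dag_id")  # hypothetical dag_id
if dag is not None:
    print(dag.dag_id, len(dag.tasks))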
Example #3
    def test_correct_maybe_zipped_normal_file(self, mocked_is_zipfile):
        path = '/path/to/some/file.txt'
        mocked_is_zipfile.return_value = False

        dag_folder = correct_maybe_zipped(path)

        assert dag_folder == path
Example #4
    def get_dag(self, dag_id):
        """
        Gets the DAG out of the dictionary, and refreshes it if expired

        :param dag_id: DAG Id
        :type dag_id: str
        """
        # Avoid circular import
        from airflow.models.dag import DagModel

        # Only read DAGs from the DB if store_serialized_dags is enabled for this DagBag.
        if self.store_serialized_dags:
            # Import here so that serialized dag is only imported when serialization is enabled
            from airflow.models.serialized_dag import SerializedDagModel
            if dag_id not in self.dags:
                # Load from DB if not (yet) in the bag
                row = SerializedDagModel.get(dag_id)
                if not row:
                    return None

                dag = row.dag
                for subdag in dag.subdags:
                    self.dags[subdag.dag_id] = subdag
                self.dags[dag.dag_id] = dag

            return self.dags.get(dag_id)

        # If asking for a known subdag, we want to refresh the parent
        dag = None
        root_dag_id = dag_id
        if dag_id in self.dags:
            dag = self.dags[dag_id]
            if dag.is_subdag:
                root_dag_id = dag.parent_dag.dag_id

        # A store_serialized_dags DagBag needs to reload this DAG from its file
        # if it currently holds a serialized copy.
        enforce_from_file = False
        if self.store_serialized_dags and dag is not None:
            from airflow.serialization.serialized_objects import SerializedDAG
            enforce_from_file = isinstance(dag, SerializedDAG)

        # If the dag corresponding to root_dag_id is absent or expired
        orm_dag = DagModel.get_current(root_dag_id)
        if (orm_dag and
            (root_dag_id not in self.dags or
             (orm_dag.last_expired and dag.last_loaded < orm_dag.last_expired))
            ) or enforce_from_file:
            # Reprocess source file
            found_dags = self.process_file(
                filepath=correct_maybe_zipped(orm_dag.fileloc), only_if_updated=False)

            # If the source file no longer exports `dag_id`, delete it from self.dags
            if found_dags and dag_id in [found_dag.dag_id for found_dag in found_dags]:
                return self.dags[dag_id]
            elif dag_id in self.dags:
                del self.dags[dag_id]
        return self.dags.get(dag_id)
Example #5
    def collect_dags(
        self,
        dag_folder: Union[str, "pathlib.Path", None] = None,
        only_if_updated: bool = True,
        include_examples: bool = conf.getboolean('core', 'LOAD_EXAMPLES'),
        include_smart_sensor: bool = conf.getboolean('smart_sensor',
                                                     'USE_SMART_SENSOR'),
        safe_mode: bool = conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE'),
    ):
        """
        Given a file path or a folder, this method looks for python modules,
        imports them and adds them to the dagbag collection.

        Note that if a ``.airflowignore`` file is found while processing
        the directory, it will behave much like a ``.gitignore``,
        ignoring files that match any of the patterns specified
        in the file.

        **Note**: The patterns in ``.airflowignore`` are interpreted as either
        un-anchored regexes or gitignore-like glob expressions, depending on
        the ``DAG_IGNORE_FILE_SYNTAX`` configuration parameter.
        """
        if self.read_dags_from_db:
            return

        self.log.info("Filling up the DagBag from %s", dag_folder)
        dag_folder = dag_folder or self.dag_folder
        # Used to store stats around DagBag processing
        stats = []

        # Ensure dag_folder is a str -- it may have been a pathlib.Path
        dag_folder = correct_maybe_zipped(str(dag_folder))
        for filepath in list_py_file_paths(
                dag_folder,
                safe_mode=safe_mode,
                include_examples=include_examples,
                include_smart_sensor=include_smart_sensor,
        ):
            try:
                file_parse_start_dttm = timezone.utcnow()
                found_dags = self.process_file(filepath,
                                               only_if_updated=only_if_updated,
                                               safe_mode=safe_mode)

                file_parse_end_dttm = timezone.utcnow()
                stats.append(
                    FileLoadStat(
                        file=filepath.replace(settings.DAGS_FOLDER, ''),
                        duration=file_parse_end_dttm - file_parse_start_dttm,
                        dag_num=len(found_dags),
                        task_num=sum(len(dag.tasks) for dag in found_dags),
                        dags=str([dag.dag_id for dag in found_dags]),
                    ))
            except Exception as e:
                self.log.exception(e)

        self.dagbag_stats = sorted(stats,
                                   key=lambda x: x.duration,
                                   reverse=True)
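
A short usage sketch for collect_dags, assuming a file-based DagBag (the folder path is hypothetical); when read_dags_from_db is True the method returns immediately, as the first lines above show:

from airflow.models.dagbag import DagBag

bag = DagBag(dag_folder="/path/to/dags", include_examples=False)  # hypothetical path
bag.collect_dags(only_if_updated=False, safe_mode=True)

# Per-file parse statistics gathered above, sorted by duration (slowest first).
for file_stat in bag.dagbag_stats:
    print(file_stat.file, file_stat.duration, file_stat.dag_num)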
Example #6
    def get_dag(self, dag_id):
        """
        Gets the DAG out of the dictionary, and refreshes it if expired

        :param dag_id: DAG Id
        :type dag_id: str
        """
        # Avoid circular import
        from airflow.models.dag import DagModel

        # Only read DAGs from the DB if read_dags_from_db is enabled for this DagBag.
        if self.read_dags_from_db:
            # Import here so that serialized dag is only imported when serialization is enabled
            from airflow.models.serialized_dag import SerializedDagModel
            if dag_id not in self.dags:
                # Load from DB if not (yet) in the bag
                row = SerializedDagModel.get(dag_id)
                if not row:
                    return None

                dag = row.dag
                for subdag in dag.subdags:
                    self.dags[subdag.dag_id] = subdag
                self.dags[dag.dag_id] = dag

            return self.dags.get(dag_id)

        # If asking for a known subdag, we want to refresh the parent
        dag = None
        root_dag_id = dag_id
        if dag_id in self.dags:
            dag = self.dags[dag_id]
            if dag.is_subdag:
                root_dag_id = dag.parent_dag.dag_id

        # If DAG Model is absent, we can't check last_expired property. Is the DAG not yet synchronized?
        orm_dag = DagModel.get_current(root_dag_id)
        if not orm_dag:
            return self.dags.get(dag_id)

        # If the dag corresponding to root_dag_id is absent or expired
        is_missing = root_dag_id not in self.dags
        is_expired = (orm_dag.last_expired
                      and dag and dag.last_loaded < orm_dag.last_expired)
        if is_missing or is_expired:
            # Reprocess source file
            found_dags = self.process_file(
                filepath=correct_maybe_zipped(orm_dag.fileloc), only_if_updated=False)

            # If the source file no longer exports `dag_id`, delete it from self.dags
            if found_dags and dag_id in [found_dag.dag_id for found_dag in found_dags]:
                return self.dags[dag_id]
            elif dag_id in self.dags:
                del self.dags[dag_id]
        return self.dags.get(dag_id)
Example #7
    def bulk_sync_to_db(cls, filelocs: Iterable[str], session=None):
        """Writes code in bulk into database.

        :param filelocs: file paths of DAGs to sync
        :param session: ORM Session
        """
        filelocs = set(filelocs)
        filelocs_to_hashes = {
            fileloc: DagCode.dag_fileloc_hash(fileloc) for fileloc in filelocs
        }
        existing_orm_dag_codes = (
            session
            .query(DagCode)
            .filter(DagCode.fileloc_hash.in_(filelocs_to_hashes.values()))
            .with_for_update(of=DagCode)
            .all()
        )
        existing_orm_dag_codes_by_fileloc_hashes = {
            orm.fileloc_hash: orm for orm in existing_orm_dag_codes
        }
        existing_orm_filelocs = {
            orm.fileloc for orm in existing_orm_dag_codes_by_fileloc_hashes.values()
        }
        if not existing_orm_filelocs.issubset(filelocs):
            conflicting_filelocs = existing_orm_filelocs.difference(filelocs)
            hashes_to_filelocs = {
                DagCode.dag_fileloc_hash(fileloc): fileloc for fileloc in filelocs
            }
            message = ""
            for fileloc in conflicting_filelocs:
                message += ("Filename '{}' causes a hash collision in the " +
                            "database with '{}'. Please rename the file.")\
                    .format(
                        hashes_to_filelocs[DagCode.dag_fileloc_hash(fileloc)],
                        fileloc)
            raise AirflowException(message)

        existing_filelocs = {
            dag_code.fileloc for dag_code in existing_orm_dag_codes
        }
        missing_filelocs = filelocs.difference(existing_filelocs)

        for fileloc in missing_filelocs:
            orm_dag_code = DagCode(fileloc)
            session.add(orm_dag_code)

        for fileloc in existing_filelocs:
            old_version = existing_orm_dag_codes_by_fileloc_hashes[
                filelocs_to_hashes[fileloc]
            ]
            file_modified = datetime.fromtimestamp(
                os.path.getmtime(correct_maybe_zipped(fileloc)), tz=timezone.utc)

            if (file_modified - timedelta(seconds=120)) > old_version.last_updated:
                # Refresh the stored source for the file that changed on disk.
                old_version.last_updated = timezone.utcnow()
                old_version.source_code = DagCode._read_code(old_version.fileloc)
                session.merge(old_version)
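
A hedged usage sketch for bulk_sync_to_db: syncing the file locations of the DAGs currently held in a DagBag. In Airflow this classmethod is normally decorated with @provide_session, so the session argument can be omitted (an assumption here):

from airflow.models.dagbag import DagBag
from airflow.models.dagcode import DagCode

bag = DagBag()
# Write (or refresh) the source of every collected DAG file in the dag_code table.
DagCode.bulk_sync_to_db({dag.fileloc for dag in bag.dags.values()})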
Example #8
    def test_correct_maybe_zipped_archive(self, mocked_is_zipfile):
        path = '/path/to/archive.zip/deep/path/to/file.txt'
        mocked_is_zipfile.return_value = True

        dag_folder = correct_maybe_zipped(path)

        assert mocked_is_zipfile.call_count == 1
        (args, kwargs) = mocked_is_zipfile.call_args_list[0]
        assert '/path/to/archive.zip' == args[0]

        assert dag_folder == '/path/to/archive.zip'
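
The tests in Examples #1, #3 and #8 pin down the contract of correct_maybe_zipped: if the path points inside a real ``.zip`` archive, return the path of the archive itself; otherwise return the path unchanged. A minimal sketch consistent with those tests (not necessarily the exact upstream implementation):

import re
import zipfile


def correct_maybe_zipped(fileloc):
    """Return the enclosing .zip archive path if fileloc points inside one,
    otherwise return fileloc unchanged."""
    match = re.search(r'.*\.zip', fileloc)  # everything up to and including '.zip'
    if not match:
        return fileloc
    archive = match.group(0)
    if zipfile.is_zipfile(archive):
        return archive
    return fileloc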
Example #9
    def collect_dags(self,
                     dag_folder=None,
                     only_if_updated=True,
                     include_examples=conf.getboolean('core', 'LOAD_EXAMPLES'),
                     safe_mode=conf.getboolean('core',
                                               'DAG_DISCOVERY_SAFE_MODE')):
        """
        Given a file path or a folder, this method looks for python modules,
        imports them and adds them to the dagbag collection.

        Note that if a ``.airflowignore`` file is found while processing
        the directory, it will behave much like a ``.gitignore``,
        ignoring files that match any of the regex patterns specified
        in the file.

        **Note**: The patterns in .airflowignore are treated as
        un-anchored regexes, not shell-like glob patterns.
        """
        if self.store_serialized_dags:
            return

        self.log.info("Filling up the DagBag from %s", dag_folder)
        start_dttm = timezone.utcnow()
        dag_folder = dag_folder or self.dag_folder
        # Used to store stats around DagBag processing
        stats = []

        dag_folder = correct_maybe_zipped(dag_folder)
        for filepath in list_py_file_paths(dag_folder,
                                           safe_mode=safe_mode,
                                           include_examples=include_examples):
            try:
                file_parse_start_dttm = timezone.utcnow()
                found_dags = self.process_file(filepath,
                                               only_if_updated=only_if_updated,
                                               safe_mode=safe_mode)

                file_parse_end_dttm = timezone.utcnow()
                stats.append(
                    FileLoadStat(
                        file=filepath.replace(settings.DAGS_FOLDER, ''),
                        duration=file_parse_end_dttm - file_parse_start_dttm,
                        dag_num=len(found_dags),
                        task_num=sum([len(dag.tasks) for dag in found_dags]),
                        dags=str([dag.dag_id for dag in found_dags]),
                    ))
            except Exception as e:  # pylint: disable=broad-except
                self.log.exception(e)

        end_dttm = timezone.utcnow()
        durations = (end_dttm - start_dttm).total_seconds()
        Stats.gauge('collect_dags', durations, 1)
        Stats.gauge('dagbag_size', len(self.dags), 1)
        Stats.gauge('dagbag_import_errors', len(self.import_errors), 1)
        self.dagbag_stats = sorted(stats,
                                   key=lambda x: x.duration,
                                   reverse=True)
        for file_stat in self.dagbag_stats:
            # file_stat.file similar format: /subdir/dag_name.py
            # TODO: Remove for Airflow 2.0
            filename = file_stat.file.split('/')[-1].replace('.py', '')
            Stats.timing('dag.loading-duration.{}'.format(filename),
                         file_stat.duration)
Example #10
    def bulk_sync_to_db(cls, filelocs: Iterable[str], session=None):
        """Writes code in bulk into database.

        :param filelocs: file paths of DAGs to sync
        :param session: ORM Session
        """
        filelocs = set(filelocs)
        filelocs_to_hashes = {
            fileloc: DagCode.dag_fileloc_hash(fileloc)
            for fileloc in filelocs
        }
        existing_orm_dag_codes = (
            session.query(DagCode)
            .filter(DagCode.fileloc_hash.in_(filelocs_to_hashes.values()))
            .with_for_update(of=DagCode)
            .all()
        )

        if existing_orm_dag_codes:
            existing_orm_dag_codes_map = {
                orm_dag_code.fileloc: orm_dag_code
                for orm_dag_code in existing_orm_dag_codes
            }
        else:
            existing_orm_dag_codes_map = {}

        existing_orm_dag_codes_by_fileloc_hashes = {
            orm.fileloc_hash: orm
            for orm in existing_orm_dag_codes
        }
        existing_orm_filelocs = {
            orm.fileloc
            for orm in existing_orm_dag_codes_by_fileloc_hashes.values()
        }
        if not existing_orm_filelocs.issubset(filelocs):
            conflicting_filelocs = existing_orm_filelocs.difference(filelocs)
            hashes_to_filelocs = {
                DagCode.dag_fileloc_hash(fileloc): fileloc
                for fileloc in filelocs
            }
            message = ""
            for fileloc in conflicting_filelocs:
                filename = hashes_to_filelocs[DagCode.dag_fileloc_hash(fileloc)]
                message += (
                    f"Filename '{filename}' causes a hash collision in the "
                    f"database with '{fileloc}'. Please rename the file.")
            raise AirflowException(message)

        existing_filelocs = {
            dag_code.fileloc
            for dag_code in existing_orm_dag_codes
        }
        missing_filelocs = filelocs.difference(existing_filelocs)

        for fileloc in missing_filelocs:
            orm_dag_code = DagCode(fileloc, cls._get_code_from_file(fileloc))
            session.add(orm_dag_code)

        for fileloc in existing_filelocs:
            current_version = existing_orm_dag_codes_by_fileloc_hashes[
                filelocs_to_hashes[fileloc]]
            file_mod_time = datetime.fromtimestamp(
                os.path.getmtime(correct_maybe_zipped(fileloc)), tz=timezone.utc)

            if file_mod_time > current_version.last_updated:
                orm_dag_code = existing_orm_dag_codes_map[fileloc]
                orm_dag_code.last_updated = file_mod_time
                orm_dag_code.source_code = cls._get_code_from_file(
                    orm_dag_code.fileloc)
                session.merge(orm_dag_code)
Example #11
File: dagbag.py  Project: ddwelle/airflow
    def get_dag(self, dag_id, from_file_only=False):
        """
        Gets the DAG out of the dictionary, and refreshes it if expired

        :param dag_id: DAG Id
        :type dag_id: str
        :param from_file_only: returns a DAG loaded from file.
        :type from_file_only: bool
        """
        # Avoid circular import
        from airflow.models.dag import DagModel

        # Only read DAGs from the DB if store_serialized_dags is enabled for this DagBag.
        # from_file_only is an exception: it is currently only used for rendering
        # templates in the UI. Because callables are stripped from serialized DAGs,
        # such DAGs must be imported from their files.
        # FIXME: this exception should be removed in the future, so the webserver can
        # be decoupled from DAG files.
        if self.store_serialized_dags and not from_file_only:
            # Import here so that serialized dag is only imported when serialization is enabled
            from airflow.models.serialized_dag import SerializedDagModel
            if dag_id not in self.dags:
                # Load from DB if not (yet) in the bag
                row = SerializedDagModel.get(dag_id)
                if not row:
                    return None

                dag = row.dag
                for subdag in dag.subdags:
                    self.dags[subdag.dag_id] = subdag
                self.dags[dag.dag_id] = dag

            return self.dags.get(dag_id)

        # If asking for a known subdag, we want to refresh the parent
        dag = None
        root_dag_id = dag_id
        if dag_id in self.dags:
            dag = self.dags[dag_id]
            if dag.is_subdag:
                root_dag_id = dag.parent_dag.dag_id

        # A store_serialized_dags DagBag needs to reload this DAG from its file
        # if it currently holds a serialized copy.
        enforce_from_file = False
        if self.store_serialized_dags and dag is not None:
            from airflow.serialization.serialized_dag import SerializedDAG
            enforce_from_file = isinstance(dag, SerializedDAG)

        # If the dag corresponding to root_dag_id is absent or expired
        orm_dag = DagModel.get_current(root_dag_id)
        if (orm_dag and (
                root_dag_id not in self.dags or
                (
                    orm_dag.last_expired and
                    dag.last_loaded < orm_dag.last_expired
                )
        )) or enforce_from_file:
            # Reprocess source file
            found_dags = self.process_file(
                filepath=correct_maybe_zipped(orm_dag.fileloc), only_if_updated=False)

            # If the source file no longer exports `dag_id`, delete it from self.dags
            if found_dags and dag_id in [found_dag.dag_id for found_dag in found_dags]:
                return self.dags[dag_id]
            elif dag_id in self.dags:
                del self.dags[dag_id]
        return self.dags.get(dag_id)
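
A brief usage note for the from_file_only flag in Example #11: a store_serialized_dags DagBag normally serves serialized DAGs, but, as the comment above explains, template rendering in the UI still needs the real Python objects, so the webserver forces a file load (a sketch, assuming an existing dagbag and a hypothetical dag_id):

# Bypass the serialized copy and reparse the DAG file so callables are available.
dag = dagbag.get_dag("example_dag_id", from_file_only=True)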