Exemplo n.º 1
0
    def _load_modules_from_file(self, filepath, safe_mode):
        if not might_contain_dag(filepath, safe_mode):
            # Don't want to spam user with skip messages
            if not self.has_logged:
                self.has_logged = True
                self.log.info("File %s assumed to contain no DAGs. Skipping.", filepath)
            return []

        self.log.debug("Importing %s", filepath)
        org_mod_name, _ = os.path.splitext(os.path.split(filepath)[-1])
        path_hash = hashlib.sha1(filepath.encode('utf-8')).hexdigest()
        mod_name = f'unusual_prefix_{path_hash}_{org_mod_name}'

        if mod_name in sys.modules:
            del sys.modules[mod_name]

        timeout_msg = f"DagBag import timeout for {filepath} after {self.DAGBAG_IMPORT_TIMEOUT}s"
        with timeout(self.DAGBAG_IMPORT_TIMEOUT, error_message=timeout_msg):
            try:
                loader = importlib.machinery.SourceFileLoader(mod_name, filepath)
                spec = importlib.util.spec_from_loader(mod_name, loader)
                new_module = importlib.util.module_from_spec(spec)
                sys.modules[spec.name] = new_module
                loader.exec_module(new_module)
                return [new_module]
            except Exception as e:  # pylint: disable=broad-except
                self.log.exception("Failed to import: %s", filepath)
                if self.dagbag_import_error_tracebacks:
                    self.import_errors[filepath] = traceback.format_exc(
                        limit=-self.dagbag_import_error_traceback_depth
                    )
                else:
                    self.import_errors[filepath] = str(e)
        return []
Exemplo n.º 2
0
    def _load_modules_from_file(self, filepath, safe_mode):
        if not might_contain_dag(filepath, safe_mode):
            # Don't want to spam user with skip messages
            if not self.has_logged:
                self.has_logged = True
                self.log.info("File %s assumed to contain no DAGs. Skipping.",
                              filepath)
            return []

        self.log.debug("Importing %s", filepath)
        org_mod_name, _ = os.path.splitext(os.path.split(filepath)[-1])
        path_hash = hashlib.sha1(filepath.encode('utf-8')).hexdigest()
        mod_name = f'unusual_prefix_{path_hash}_{org_mod_name}'

        if mod_name in sys.modules:
            del sys.modules[mod_name]

        def parse(mod_name, filepath):
            try:
                loader = importlib.machinery.SourceFileLoader(
                    mod_name, filepath)
                spec = importlib.util.spec_from_loader(mod_name, loader)
                new_module = importlib.util.module_from_spec(spec)
                sys.modules[spec.name] = new_module
                loader.exec_module(new_module)
                return [new_module]
            except Exception as e:
                self.log.exception("Failed to import: %s", filepath)
                if self.dagbag_import_error_tracebacks:
                    self.import_errors[filepath] = traceback.format_exc(
                        limit=-self.dagbag_import_error_traceback_depth)
                else:
                    self.import_errors[filepath] = str(e)
                return []

        dagbag_import_timeout = settings.get_dagbag_import_timeout(filepath)

        if not isinstance(dagbag_import_timeout, (int, float)):
            raise TypeError(
                f'Value ({dagbag_import_timeout}) from get_dagbag_import_timeout must be int or float'
            )

        if dagbag_import_timeout <= 0:  # no parsing timeout
            return parse(mod_name, filepath)

        timeout_msg = (
            f"DagBag import timeout for {filepath} after {dagbag_import_timeout}s.\n"
            "Please take a look at these docs to improve your DAG import time:\n"
            f"* {get_docs_url('best-practices.html#top-level-python-code')}\n"
            f"* {get_docs_url('best-practices.html#reducing-dag-complexity')}")
        with timeout(dagbag_import_timeout, error_message=timeout_msg):
            return parse(mod_name, filepath)
Exemplo n.º 3
0
    def _load_modules_from_zip(self, filepath, safe_mode):
        mods = []
        with zipfile.ZipFile(filepath) as current_zip_file:
            for zip_info in current_zip_file.infolist():
                head, _ = os.path.split(zip_info.filename)
                mod_name, ext = os.path.splitext(zip_info.filename)
                if ext not in [".py", ".pyc"]:
                    continue
                if head:
                    continue

                if mod_name == '__init__':
                    self.log.warning("Found __init__.%s at root of %s", ext,
                                     filepath)

                self.log.debug("Reading %s from %s", zip_info.filename,
                               filepath)

                if not might_contain_dag(zip_info.filename, safe_mode,
                                         current_zip_file):
                    # todo: create ignore list
                    # Don't want to spam user with skip messages
                    if not self.has_logged:
                        self.has_logged = True
                        self.log.info(
                            "File %s:%s assumed to contain no DAGs. Skipping.",
                            filepath, zip_info.filename)
                    continue

                if mod_name in sys.modules:
                    del sys.modules[mod_name]

                try:
                    sys.path.insert(0, filepath)
                    current_module = importlib.import_module(mod_name)
                    mods.append(current_module)
                except Exception as e:
                    fileloc = os.path.join(filepath, zip_info.filename)
                    self.log.exception("Failed to import: %s", fileloc)
                    if self.dagbag_import_error_tracebacks:
                        self.import_errors[fileloc] = traceback.format_exc(
                            limit=-self.dagbag_import_error_traceback_depth)
                    else:
                        self.import_errors[fileloc] = str(e)
                finally:
                    if sys.path[0] == filepath:
                        del sys.path[0]
        return mods
Exemplo n.º 4
0
    def _refresh_dag_dir(self):
        """Refresh file paths from dag dir if we haven't done it for too long."""
        now = timezone.utcnow()
        elapsed_time_since_refresh = (
            now - self.last_dag_dir_refresh_time).total_seconds()
        if elapsed_time_since_refresh > self.dag_dir_list_interval:
            # Build up a list of Python files that could contain DAGs
            self.log.info("Searching for files in %s", self._dag_directory)
            self._file_paths = list_py_file_paths(self._dag_directory)
            self.last_dag_dir_refresh_time = now
            self.log.info("There are %s files in %s", len(self._file_paths),
                          self._dag_directory)
            self.set_file_paths(self._file_paths)

            try:
                self.log.debug("Removing old import errors")
                self.clear_nonexistent_import_errors()
            except Exception:
                self.log.exception("Error removing old import errors")

            # Check if file path is a zipfile and get the full path of the python file.
            # Without this, SerializedDagModel.remove_deleted_files would delete zipped dags.
            # Likewise DagCode.remove_deleted_code
            dag_filelocs = []
            for fileloc in self._file_paths:
                if not fileloc.endswith(".py") and zipfile.is_zipfile(fileloc):
                    with zipfile.ZipFile(fileloc) as z:
                        dag_filelocs.extend([
                            os.path.join(fileloc, info.filename)
                            for info in z.infolist()
                            if might_contain_dag(info.filename, True, z)
                        ])
                else:
                    dag_filelocs.append(fileloc)

            SerializedDagModel.remove_deleted_dags(dag_filelocs)
            DagModel.deactivate_deleted_dags(self._file_paths)

            from airflow.models.dagcode import DagCode

            DagCode.remove_deleted_code(dag_filelocs)
Exemplo n.º 5
0
    def _load_modules_from_zip(self, filepath, safe_mode):
        mods = []
        current_zip_file = zipfile.ZipFile(filepath)
        for zip_info in current_zip_file.infolist():
            head, _ = os.path.split(zip_info.filename)
            mod_name, ext = os.path.splitext(zip_info.filename)
            if ext not in [".py", ".pyc"]:
                continue
            if head:
                continue

            if mod_name == '__init__':
                self.log.warning("Found __init__.%s at root of %s", ext,
                                 filepath)

            self.log.debug("Reading %s from %s", zip_info.filename, filepath)

            if not might_contain_dag(zip_info.filename, safe_mode,
                                     current_zip_file):
                # todo: create ignore list
                # Don't want to spam user with skip messages
                if not self.has_logged or True:
                    self.has_logged = True
                    self.log.info(
                        "File %s:%s assumed to contain no DAGs. Skipping.",
                        filepath, zip_info.filename)
                continue

            if mod_name in sys.modules:
                del sys.modules[mod_name]

            try:
                sys.path.insert(0, filepath)
                current_module = importlib.import_module(mod_name)
                mods.append(current_module)
            except Exception as e:  # pylint: disable=broad-except
                self.log.exception("Failed to import: %s", filepath)
                self.import_errors[filepath] = str(e)
        return mods