Example #1
def test_add_to_sys_path_with_none():
    original = copy.copy(sys.path)

    with add_to_sys_path(None, chdir=False):
        assert sys.path == original

    assert sys.path == original
Example #2
def add():
    """Add scaffold templates for tasks whose source does not exist
    """
    # setting lazy_import to true causes sources to be returned as paths,
    # instead of placeholders
    spec, path_to_spec = DAGSpec._auto_load(to_dag=False, lazy_import=True)
    loader = ScaffoldLoader('ploomber_add')

    # TODO: when the dag has a source loader, the argument passed to
    # ploomber_add should take that into account to place the new file
    # in the appropriate location (instead of doing it relative to
    # pipeline.yaml)

    # TODO: raise an error if the location is inside the site-packages folder

    # NOTE: lazy loading from the source loader will give errors because
    # initializing a source with a path only loses the information from the
    # jinja environment needed to make macros work. I have to test this. The
    # best solution is to add a lazy_load param to Placeholder, so it can be
    # initialized with a path to a file that does not exist

    if path_to_spec:
        print('Found spec at {}'.format(path_to_spec))

        # make sure the current working directory is in sys.path, otherwise we
        # might not be able to import the PythonCallable functions, which we
        # need to do to locate the modules
        with add_to_sys_path(path_to_spec, chdir=False):
            for task in spec['tasks']:
                loader.create(source=task['source'],
                              params=spec['meta'],
                              class_=task['class'])
    else:
        print('Error: No pipeline.yaml spec found...')
Example #3
def test_add_to_sys_path():
    path = str(Path('/path/to/add').resolve())

    with add_to_sys_path(path, chdir=False):
        assert path in sys.path

    assert path not in sys.path
Example #4
def test_add_to_sys_path_with_exception():
    path = str(Path('/path/to/add').resolve())

    with pytest.raises(Exception):
        with add_to_sys_path(path, chdir=False):
            assert path in sys.path
            raise Exception

    assert path not in sys.path
Example #5
def test_add_to_sys_path_with_chdir(tmp_directory):
    path = Path('.').resolve() / 'some_directory'
    path.mkdir()
    path = str(path)
    old_dir = os.getcwd()

    with add_to_sys_path(path, chdir=True):
        assert path in sys.path
        assert path == os.getcwd()

    assert path not in sys.path
    assert old_dir == os.getcwd()
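A quick note on the contract these tests pin down: passing None is a no-op, the path is prepended to sys.path only while the with block runs, chdir=True additionally switches the working directory, and cleanup happens even when the body raises. A minimal sketch consistent with that contract (an illustration only, not necessarily ploomber's actual implementation) could look like this:

import os
import sys
from contextlib import contextmanager


@contextmanager
def add_to_sys_path(path, chdir):
    # remember the working directory so chdir=True can be undone on exit
    cwd_old = os.getcwd()

    # None is a no-op: sys.path and the working directory stay untouched
    # (see Example #1)
    if path is not None:
        path = os.path.abspath(path)
        sys.path.insert(0, path)

        if chdir:
            os.chdir(path)

    try:
        yield
    finally:
        # the finally block guarantees cleanup even when the body raises,
        # so the temporary entry never leaks (see Example #4)
        if path is not None:
            sys.path.remove(path)

            if chdir:
                os.chdir(cwd_old)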
Example #6
def add(spec, path_to_spec):
    """Add scaffold templates for tasks whose source does not exist

    Parameters
    ----------
    spec : DAGSpec
        The spec to inspect to create missing task.source

    path_to_spec : str
        Path to the spec, only used to emit messages to the console
    """
    loader = ScaffoldLoader()

    # TODO: when the dag has a source loader, the argument passed to
    # ploomber_add should take that into account to place the new file
    # in the appropriate location (instead of doing it relative to
    # pipeline.yaml)

    # TODO: raise an error if the location is inside the site-packages folder

    # NOTE: lazy loading from the source loader will give errors because
    # initializing a source with a path only loses the information from the
    # jinja environment needed to make macros work. I have to test this. The
    # best solution is to add a lazy_load param to Placeholder, so it can be
    # initialized with a path to a file that does not exist

    if path_to_spec:
        click.echo(f'Found spec at {str(path_to_spec)!r}')

        n = 0

        # make sure the current working directory is in sys.path, otherwise we
        # might not be able to import the PythonCallable functions, which we
        # need to do to locate the modules
        with add_to_sys_path(path_to_spec, chdir=False):
            for task in spec['tasks']:
                did_create = loader.create(source=task['source'],
                                           params=spec['meta'],
                                           class_=task['class'])
                n += int(did_create)

        if not n:
            click.echo(f'All task sources declared in {str(path_to_spec)!r} '
                       'exist, nothing was created.')
        else:
            click.echo(f'Created {n} new task sources.')

    else:
        click.echo('Error: No pipeline.yaml spec found...')
Example #7
    def to_dag(self):
        """Converts the DAG spec to a DAG object
        """
        # when initializing DAGs from pipeline.yaml files, we have to ensure
        # that the folder where pipeline.yaml is located is in sys.path for
        # imports to work (for dag clients); this happens most of the time,
        # but for some (unknown) reason, it doesn't happen when initializing
        # PloomberContentsManager. Paths in pipeline.yaml are written relative
        # to that file, so for source scripts to be located we temporarily
        # change the current working directory
        with add_to_sys_path(self._parent_path, chdir=True):
            dag = self._to_dag()

        return dag
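For reference, a typical call site for to_dag might look like the following; the spec filename is hypothetical and dag.build() assumes the usual ploomber DAG API:

from ploomber.spec import DAGSpec

# load the spec and convert it; to_dag() temporarily switches to the
# folder containing pipeline.yaml so relative source paths resolve
dag = DAGSpec('pipeline.yaml').to_dag()
dag.build()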
Example #8
    def __init__(self,
                 data,
                 env=None,
                 lazy_import=False,
                 reload=False,
                 parent_path=None):
        if isinstance(data, (str, Path)):
            if parent_path is not None:
                raise ValueError('parent_path must be None when '
                                 f'initializing {type(self).__name__} with '
                                 'a path to a YAML spec')
            # this is only used to display an error message with the path
            # to the loaded file
            path_for_errors = data
            # resolve the parent path to make sources and products unambiguous
            # even if the current working directory changes
            path_to_entry_point = Path(data).resolve()
            self._parent_path = str(path_to_entry_point.parent)

            content = Path(data).read_text()

            try:
                data = yaml.safe_load(content)
            except (yaml.parser.ParserError,
                    yaml.constructor.ConstructorError) as e:
                error = e
            else:
                error = None

            if error:
                if '{{' in content or '}}' in content:
                    raise DAGSpecInitializationError(
                        'Failed to initialize spec. It looks like '
                        'you\'re using placeholders (i.e. {{placeholder}}). '
                        'Make sure values are enclosed in quotes '
                        '(e.g. key: "{{placeholder}}"). Original '
                        'parser error:\n\n'
                        f'{error}')
                else:
                    raise error

        else:
            path_for_errors = None
            # FIXME: add test cases, some of those features won't work if
            # _parent_path is None. We should make sure that we either raise
            # an error if _parent_path is needed or use the current working
            # directory if it's appropriate - this is mostly to make relative
            # paths consistent: they should be relative to the file that
            # contains them
            self._parent_path = (None if not parent_path else str(
                Path(parent_path).resolve()))

        # try to look for env.yaml in default locations
        env_default_path = default.path_to_env(self._parent_path)

        self.data = data

        if isinstance(self.data, list):
            self.data = {'tasks': self.data}

        # validate keys defined at the top (nested keys are not validated here)
        self._validate_top_keys(self.data, path_for_errors)

        logger.debug('DAGSpec environment:\n%s', pp.pformat(env))

        env = env or dict()

        # NOTE: when loading from a path, EnvDict recursively looks
        # at parent folders, this is useful when loading envs
        # in nested directories where scripts/functions need the env
        # but here, since we just need this for the spec, we might
        # want to turn it off. should we add a parameter to EnvDict
        # to control this?
        if env_default_path:
            defaults = yaml.safe_load(Path(env_default_path).read_text())
            self.env = EnvDict(env,
                               path_to_here=self._parent_path,
                               defaults=defaults)
        else:
            self.env = EnvDict(env, path_to_here=self._parent_path)

        self.data = expand_raw_dictionary(self.data, self.env)

        logger.debug('Expanded DAGSpec:\n%s', pp.pformat(data))

        # if there is a "location" top key, we don't have to do anything else
        # as we will just load the dotted path when .to_dag() is called
        if 'location' not in self.data:

            Meta.initialize_inplace(self.data)

            import_tasks_from = self.data['meta']['import_tasks_from']

            if import_tasks_from is not None:
                # when using a relative path in "import_tasks_from", we must
                # make it absolute...
                if not Path(import_tasks_from).is_absolute():
                    # use _parent_path if there is one
                    if self._parent_path:
                        self.data['meta']['import_tasks_from'] = str(
                            Path(self._parent_path, import_tasks_from))
                    # otherwise just make it absolute
                    else:
                        self.data['meta']['import_tasks_from'] = str(
                            Path(import_tasks_from).resolve())

                imported = yaml.safe_load(
                    Path(self.data['meta']['import_tasks_from']).read_text())

                if self.env is not None:
                    imported = expand_raw_dictionaries(imported, self.env)

                # relative paths here are relative to the file where they
                # are declared
                base_path = Path(self.data['meta']['import_tasks_from']).parent

                for task in imported:
                    add_base_path_to_source_if_relative(task,
                                                        base_path=base_path)

                self.data['tasks'].extend(imported)

            self.data['tasks'] = [
                normalize_task(task) for task in self.data['tasks']
            ]

            # make sure the folder where the pipeline is located is in sys.path
            # otherwise dynamic imports needed by TaskSpec will fail
            with add_to_sys_path(self._parent_path, chdir=False):
                self.data['tasks'] = [
                    TaskSpec(t,
                             self.data['meta'],
                             project_root=self._parent_path,
                             lazy_import=lazy_import,
                             reload=reload) for t in self.data['tasks']
                ]
        else:
            self.data['meta'] = Meta.empty()
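To illustrate the two initialization modes handled above (assuming this is DAGSpec's __init__; the task entry is hypothetical):

from ploomber.spec import DAGSpec

# from a path: _parent_path is inferred from the file's location, and
# passing parent_path as well raises ValueError
spec = DAGSpec('pipeline.yaml')

# from a dictionary: parent_path may be given explicitly so relative
# sources and products resolve consistently
spec = DAGSpec(
    {'tasks': [{'source': 'tasks/load.py', 'product': 'output/load.ipynb'}]},
    parent_path='.',
)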
Example #9
    def _init(self, data, env, lazy_import, reload, parent_path,
              look_up_project_root_recursively):
        self._lazy_import = lazy_import

        # initialized with a path to a yaml file...
        if isinstance(data, (str, Path)):
            # TODO: test this
            if parent_path is not None:
                raise ValueError('parent_path must be None when '
                                 f'initializing {type(self).__name__} with '
                                 'a path to a YAML spec')
            # resolve the parent path to make sources and products unambiguous
            # even if the current working directory changes
            self._path = Path(data).resolve()
            self._parent_path = str(self._path.parent)

            if not Path(data).is_file():
                raise FileNotFoundError(
                    'Error initializing DAGSpec with argument '
                    f'{data!r}: Expected it to be a path to a YAML file, but '
                    'such file does not exist')

            content = Path(data).read_text()

            try:
                data = yaml.safe_load(content)
            except (yaml.parser.ParserError,
                    yaml.constructor.ConstructorError) as e:
                error = e
            else:
                error = None

            if error:
                if '{{' in content or '}}' in content:
                    raise DAGSpecInitializationError(
                        'Failed to initialize spec. It looks like '
                        'you\'re using placeholders (i.e. {{placeholder}}). '
                        'Make sure values are enclosed in quotes '
                        '(e.g. key: "{{placeholder}}"). Original '
                        'parser error:\n\n'
                        f'{error}')
                else:
                    raise error

        # initialized with a dictionary...
        else:
            self._path = None
            # FIXME: add test cases, some of those features won't work if
            # _parent_path is None. We should make sure that we either raise
            # an error if _parent_path is needed or use the current working
            # directory if it's appropriate - this is mostly to make relative
            # paths consistent: they should be relative to the file that
            # contains them
            self._parent_path = (None if not parent_path else str(
                Path(parent_path).resolve()))

        self.data = data

        if isinstance(self.data, list):
            self.data = {'tasks': self.data}

        # validate keys defined at the top (nested keys are not validated here)
        self._validate_top_keys(self.data, self._path)

        logger.debug('DAGSpec environment:\n%s', pp.pformat(env))

        env = env or dict()
        path_to_defaults = default.path_to_env_from_spec(
            path_to_spec=self._path)

        if path_to_defaults:
            defaults = yaml.safe_load(Path(path_to_defaults).read_text())
            self.env = EnvDict(env,
                               path_to_here=self._parent_path,
                               defaults=defaults)
        else:
            self.env = EnvDict(env, path_to_here=self._parent_path)

        self.data, tags = expand_raw_dictionary_and_extract_tags(
            self.data, self.env)

        logger.debug('Expanded DAGSpec:\n%s', pp.pformat(data))

        # if there is a "location" top key, we don't have to do anything else
        # as we will just load the dotted path when .to_dag() is called
        if 'location' not in self.data:

            Meta.initialize_inplace(self.data)

            import_tasks_from = self.data['meta']['import_tasks_from']

            if import_tasks_from is not None:
                # when using a relative path in "import_tasks_from", we must
                # make it absolute...
                if not Path(import_tasks_from).is_absolute():
                    # use _parent_path if there is one
                    if self._parent_path:
                        self.data['meta']['import_tasks_from'] = str(
                            Path(self._parent_path, import_tasks_from))
                    # otherwise just make it absolute
                    else:
                        self.data['meta']['import_tasks_from'] = str(
                            Path(import_tasks_from).resolve())

                imported = yaml.safe_load(
                    Path(self.data['meta']['import_tasks_from']).read_text())

                if self.env is not None:
                    (imported,
                     tags_other) = expand_raw_dictionaries_and_extract_tags(
                         imported, self.env)
                    tags = tags | tags_other

                # relative paths here are relative to the file where they
                # are declared
                base_path = Path(self.data['meta']['import_tasks_from']).parent

                for task in imported:
                    add_base_path_to_source_if_relative(task,
                                                        base_path=base_path)

                self.data['tasks'].extend(imported)

            # check if there are any params declared in env but not used in
            # the pipeline
            extra = set(self.env) - self.env.default_keys - tags

            if extra:
                warnings.warn('The following placeholders are declared in the '
                              'environment but '
                              f'unused in the spec: {extra}')

            self.data['tasks'] = [
                normalize_task(task) for task in self.data['tasks']
            ]

            # NOTE: for simple projects, the project root is the parent folder
            # of pipeline.yaml; for package projects, it is the parent folder
            # of setup.py
            if look_up_project_root_recursively:
                project_root = (
                    None if not self._parent_path else
                    default.find_root_recursively(
                        starting_dir=self._parent_path,
                        filename=None if not self._path else self._path.name))
            else:
                project_root = self._parent_path

            # make sure the folder where the pipeline is located is in sys.path
            # otherwise dynamic imports needed by TaskSpec will fail
            with add_to_sys_path(self._parent_path, chdir=False):
                self.data['tasks'] = [
                    TaskSpec(t,
                             self.data['meta'],
                             project_root=project_root,
                             lazy_import=lazy_import,
                             reload=reload) for t in self.data['tasks']
                ]
        else:
            self.data['meta'] = Meta.empty()
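As a usage note on the unused-placeholder warning above (the env key is hypothetical, and this assumes the public constructor forwards env to _init):

from ploomber.spec import DAGSpec

# 'unused_key' never appears as {{unused_key}} in the spec, so loading
# it would emit the "declared in the environment but unused" warning
spec = DAGSpec('pipeline.yaml', env={'unused_key': 42})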