def test_add_to_sys_path_with_none():
    """Passing None must leave sys.path untouched, inside and after."""
    snapshot = list(sys.path)

    with add_to_sys_path(None, chdir=False):
        assert snapshot == sys.path

    assert snapshot == sys.path
def add(): """Add scaffold templates for tasks whose source does not exist """ # setting lazy_import to true causes sources to be returned as paths, # instead of placeholders spec, path_to_spec = DAGSpec._auto_load(to_dag=False, lazy_import=True) loader = ScaffoldLoader('ploomber_add') # TODO: when the dag has a source loader, the argument passed to # ploomber_add should take that into account to place the new file # in the appropriate location (instead of doing it relative to # pipeline.yaml) # TODO: raise an error if the location is inside the site-packages folder # NOTE: lazy loading freom source loader will giev errors because # initializing a source with a path only, loses the information from the # jinja environment to make macros workj. I have to test this. the best # solution is to add a lazy_load param to Placeholder, so it can be # initialized with a path for a file that does not exist if path_to_spec: print('Found spec at {}'.format(path_to_spec)) # make sure current working dir is in the path, otherwise we might not # be able to import the PythonCallable functions, which we need to do # to locate the modules with add_to_sys_path(path_to_spec, chdir=False): for task in spec['tasks']: loader.create(source=task['source'], params=spec['meta'], class_=task['class']) else: print('Error: No pipeline.yaml spec found...')
def test_add_to_sys_path():
    """The given path is on sys.path inside the context, removed afterwards."""
    target = str(Path('/path/to/add').resolve())

    with add_to_sys_path(target, chdir=False):
        assert target in sys.path

    assert target not in sys.path
def test_add_to_sys_path_with_exception():
    """The path is removed from sys.path even when the body raises."""
    target = str(Path('/path/to/add').resolve())

    # a single with-statement with two managers is equivalent to nesting
    with pytest.raises(Exception), add_to_sys_path(target, chdir=False):
        assert target in sys.path
        raise Exception

    assert target not in sys.path
def test_add_to_sys_path_with_chdir(tmp_directory):
    """chdir=True switches the cwd inside the context and restores it after."""
    previous_cwd = os.getcwd()
    target_dir = Path('.').resolve() / 'some_directory'
    target_dir.mkdir()
    target = str(target_dir)

    with add_to_sys_path(target, chdir=True):
        assert target in sys.path
        assert os.getcwd() == target

    assert target not in sys.path
    assert os.getcwd() == previous_cwd
def add(spec, path_to_spec):
    """Add scaffold templates for tasks whose source does not exist

    Parameters
    ----------
    spec : DAGSpec
        The spec to inspect to create missing task.source

    path_to_spec : str
        Path to the spec, only used to emit messages to the console
    """
    loader = ScaffoldLoader()

    # TODO: when the dag has a source loader, the argument passed to
    # ploomber_add should take that into account to place the new file
    # in the appropriate location (instead of doing it relative to
    # pipeline.yaml)

    # TODO: raise an error if the location is inside the site-packages folder

    # NOTE: lazy loading from the source loader will give errors because
    # initializing a source with a path only loses the information from the
    # jinja environment needed to make macros work. I have to test this. The
    # best solution is to add a lazy_load param to Placeholder, so it can be
    # initialized with a path for a file that does not exist

    if path_to_spec:
        click.echo(f'Found spec at {str(path_to_spec)!r}')

        # number of task sources actually created
        n = 0

        # make sure current working dir is in the path, otherwise we might not
        # be able to import the PythonCallable functions, which we need to do
        # to locate the modules
        with add_to_sys_path(path_to_spec, chdir=False):
            for task in spec['tasks']:
                did_create = loader.create(source=task['source'],
                                           params=spec['meta'],
                                           class_=task['class'])
                n += int(did_create)

        if not n:
            click.echo(f'All tasks sources declared in {str(path_to_spec)!r} '
                       'exist, nothing was created.')
        else:
            click.echo(f'Created {n} new task sources.')
    else:
        click.echo('Error: No pipeline.yaml spec found...')
def to_dag(self):
    """Converts the DAG spec to a DAG object
    """
    # The folder containing pipeline.yaml must be on sys.path so dynamic
    # imports (e.g. for dag clients) resolve; this usually holds already,
    # but for some (unknown) reason it does not when initializing
    # PloomberContentsManager. Paths inside pipeline.yaml are relative to
    # that file, so we also temporarily change the working directory so
    # source scripts can be located.
    with add_to_sys_path(self._parent_path, chdir=True):
        built = self._to_dag()

    return built
def __init__(self,
             data,
             env=None,
             lazy_import=False,
             reload=False,
             parent_path=None):
    """Initialize the spec from a path to a YAML file or a dict/list.

    When ``data`` is a str/Path, the file is read and parsed as YAML and
    ``parent_path`` must be None (it is derived from the file location);
    otherwise ``data`` is used as-is and ``parent_path`` anchors relative
    paths.
    """
    if isinstance(data, (str, Path)):
        if parent_path is not None:
            raise ValueError('parent_path must be None when '
                             f'initializing {type(self).__name__} with '
                             'a path to a YAML spec')

        # this is only used to display an error message with the path
        # to the loaded file
        path_for_errors = data
        # resolve the parent path to make sources and products unambiguous
        # even if the current working directory changes
        path_to_entry_point = Path(data).resolve()
        self._parent_path = str(path_to_entry_point.parent)

        content = Path(data).read_text()

        try:
            data = yaml.safe_load(content)
        except (yaml.parser.ParserError,
                yaml.constructor.ConstructorError) as e:
            error = e
        else:
            error = None

        if error:
            # a parse failure with {{ }} in the file usually means an
            # unquoted jinja placeholder broke the YAML syntax
            # NOTE(review): the message says "parentheses" but the example
            # shows quotes — confirm intended wording
            if '{{' in content or '}}' in content:
                raise DAGSpecInitializationError(
                    'Failed to initialize spec. It looks like '
                    'you\'re using placeholders (i.e. {{placeholder}}). '
                    'Make sure values are enclosed in parentheses '
                    '(e.g. key: "{{placeholder}}"). Original '
                    'parser error:\n\n'
                    f'{error}')
            else:
                raise error
    else:
        path_for_errors = None

        # FIXME: add test cases, some of those features won't work if
        # _parent_path is None. We should make sure that we either raise
        # an error if _parent_path is needed or use the current working
        # directory if it's appropriate - this is mostly to make relative
        # paths consistent: they should be relative to the file that
        # contains them
        self._parent_path = (None if not parent_path else str(
            Path(parent_path).resolve()))

    # try to look for env.yaml in default locations
    env_default_path = default.path_to_env(self._parent_path)

    self.data = data

    # a bare list of tasks is shorthand for {'tasks': [...]}
    if isinstance(self.data, list):
        self.data = {'tasks': self.data}

    # validate keys defined at the top (nested keys are not validated here)
    self._validate_top_keys(self.data, path_for_errors)

    logger.debug('DAGSpec enviroment:\n%s', pp.pformat(env))

    env = env or dict()

    # NOTE: when loading from a path, EnvDict recursively looks
    # at parent folders, this is useful when loading envs
    # in nested directories where scripts/functions need the env
    # but here, since we just need this for the spec, we might
    # want to turn it off. should we add a parameter to EnvDict
    # to control this?
    if env_default_path:
        defaults = yaml.safe_load(Path(env_default_path).read_text())
        self.env = EnvDict(env,
                           path_to_here=self._parent_path,
                           defaults=defaults)
    else:
        self.env = EnvDict(env, path_to_here=self._parent_path)

    # substitute {{placeholders}} using the environment
    self.data = expand_raw_dictionary(self.data, self.env)

    logger.debug('Expanded DAGSpec:\n%s', pp.pformat(data))

    # if there is a "location" top key, we don't have to do anything else
    # as we will just load the dotted path when .to_dag() is called
    if 'location' not in self.data:

        Meta.initialize_inplace(self.data)

        import_tasks_from = self.data['meta']['import_tasks_from']

        if import_tasks_from is not None:
            # when using a relative path in "import_tasks_from", we must
            # make it absolute...
            if not Path(import_tasks_from).is_absolute():
                # use _parent_path if there is one
                if self._parent_path:
                    self.data['meta']['import_tasks_from'] = str(
                        Path(self._parent_path, import_tasks_from))
                # otherwise just make it absolute
                else:
                    self.data['meta']['import_tasks_from'] = str(
                        Path(import_tasks_from).resolve())

            imported = yaml.safe_load(
                Path(self.data['meta']['import_tasks_from']).read_text())

            if self.env is not None:
                imported = expand_raw_dictionaries(imported, self.env)

            # relative paths here are relative to the file where they
            # are declared
            base_path = Path(self.data['meta']['import_tasks_from']).parent

            for task in imported:
                add_base_path_to_source_if_relative(task,
                                                    base_path=base_path)

            self.data['tasks'].extend(imported)

        self.data['tasks'] = [
            normalize_task(task) for task in self.data['tasks']
        ]

        # make sure the folder where the pipeline is located is in sys.path
        # otherwise dynamic imports needed by TaskSpec will fail
        with add_to_sys_path(self._parent_path, chdir=False):
            self.data['tasks'] = [
                TaskSpec(t,
                         self.data['meta'],
                         project_root=self._parent_path,
                         lazy_import=lazy_import,
                         reload=reload) for t in self.data['tasks']
            ]
    else:
        self.data['meta'] = Meta.empty()
def _init(self, data, env, lazy_import, reload, parent_path,
          look_up_project_root_recursively):
    """Shared initialization logic for the spec.

    Accepts either a path to a YAML file (str/Path) or an already-parsed
    dict/list; when a path is given, ``parent_path`` must be None since it
    is derived from the file location.
    """
    self._lazy_import = lazy_import

    # initialized with a path to a yaml file...
    if isinstance(data, (str, Path)):
        # TODO: test this
        if parent_path is not None:
            raise ValueError('parent_path must be None when '
                             f'initializing {type(self).__name__} with '
                             'a path to a YAML spec')

        # resolve the parent path to make sources and products unambiguous
        # even if the current working directory changes
        self._path = Path(data).resolve()
        self._parent_path = str(self._path.parent)

        if not Path(data).is_file():
            raise FileNotFoundError(
                'Error initializing DAGSpec with argument '
                f'{data!r}: Expected it to be a path to a YAML file, but '
                'such file does not exist')

        content = Path(data).read_text()

        try:
            data = yaml.safe_load(content)
        except (yaml.parser.ParserError,
                yaml.constructor.ConstructorError) as e:
            error = e
        else:
            error = None

        if error:
            # a parse failure with {{ }} in the file usually means an
            # unquoted jinja placeholder broke the YAML syntax
            # NOTE(review): the message says "parentheses" but the example
            # shows quotes — confirm intended wording
            if '{{' in content or '}}' in content:
                raise DAGSpecInitializationError(
                    'Failed to initialize spec. It looks like '
                    'you\'re using placeholders (i.e. {{placeholder}}). '
                    'Make sure values are enclosed in parentheses '
                    '(e.g. key: "{{placeholder}}"). Original '
                    'parser error:\n\n'
                    f'{error}')
            else:
                raise error
    # initialized with a dictionary...
    else:
        self._path = None
        # FIXME: add test cases, some of those features won't work if
        # _parent_path is None. We should make sure that we either raise
        # an error if _parent_path is needed or use the current working
        # directory if it's appropriate - this is mostly to make relative
        # paths consistent: they should be relative to the file that
        # contains them
        self._parent_path = (None if not parent_path else str(
            Path(parent_path).resolve()))

    self.data = data

    # a bare list of tasks is shorthand for {'tasks': [...]}
    if isinstance(self.data, list):
        self.data = {'tasks': self.data}

    # validate keys defined at the top (nested keys are not validated here)
    self._validate_top_keys(self.data, self._path)

    logger.debug('DAGSpec enviroment:\n%s', pp.pformat(env))

    env = env or dict()

    path_to_defaults = default.path_to_env_from_spec(
        path_to_spec=self._path)

    if path_to_defaults:
        defaults = yaml.safe_load(Path(path_to_defaults).read_text())
        self.env = EnvDict(env,
                           path_to_here=self._parent_path,
                           defaults=defaults)
    else:
        self.env = EnvDict(env, path_to_here=self._parent_path)

    # substitute {{placeholders}} and record which tags were used so we
    # can warn about unused env entries later
    self.data, tags = expand_raw_dictionary_and_extract_tags(
        self.data, self.env)

    logger.debug('Expanded DAGSpec:\n%s', pp.pformat(data))

    # if there is a "location" top key, we don't have to do anything else
    # as we will just load the dotted path when .to_dag() is called
    if 'location' not in self.data:
        Meta.initialize_inplace(self.data)

        import_tasks_from = self.data['meta']['import_tasks_from']

        if import_tasks_from is not None:
            # when using a relative path in "import_tasks_from", we must
            # make it absolute...
            if not Path(import_tasks_from).is_absolute():
                # use _parent_path if there is one
                if self._parent_path:
                    self.data['meta']['import_tasks_from'] = str(
                        Path(self._parent_path, import_tasks_from))
                # otherwise just make it absolute
                else:
                    self.data['meta']['import_tasks_from'] = str(
                        Path(import_tasks_from).resolve())

            imported = yaml.safe_load(
                Path(self.data['meta']['import_tasks_from']).read_text())

            if self.env is not None:
                (imported,
                 tags_other) = expand_raw_dictionaries_and_extract_tags(
                     imported, self.env)
                # accumulate tags used by imported tasks as well
                tags = tags | tags_other

            # relative paths here are relative to the file where they
            # are declared
            base_path = Path(self.data['meta']['import_tasks_from']).parent

            for task in imported:
                add_base_path_to_source_if_relative(task,
                                                    base_path=base_path)

            self.data['tasks'].extend(imported)

        # check if there are any params declared in env, not used
        # in the pipeline
        extra = set(self.env) - self.env.default_keys - tags

        if extra:
            warnings.warn('The following placeholders are declared in the '
                          'environment but '
                          f'unused in the spec: {extra}')

        self.data['tasks'] = [
            normalize_task(task) for task in self.data['tasks']
        ]

        # NOTE: for simple projects, project root is the parent folder
        # of pipeline.yaml, for package projects is the parent folder
        # of setup.py
        if look_up_project_root_recursively:
            project_root = (
                None if not self._parent_path else
                default.find_root_recursively(
                    starting_dir=self._parent_path,
                    filename=None if not self._path else self._path.name))
        else:
            project_root = self._parent_path

        # make sure the folder where the pipeline is located is in sys.path
        # otherwise dynamic imports needed by TaskSpec will fail
        with add_to_sys_path(self._parent_path, chdir=False):
            self.data['tasks'] = [
                TaskSpec(t,
                         self.data['meta'],
                         project_root=project_root,
                         lazy_import=lazy_import,
                         reload=reload) for t in self.data['tasks']
            ]
    else:
        self.data['meta'] = Meta.empty()