def test_initialization(spec, expected, tmp_sample_tasks, tmp_imports):
    """TaskSpec resolves the task class and source path at construction."""
    task_meta = Meta.default_meta({
        'extract_upstream': True,
        'extract_product': False,
    })
    spec = TaskSpec(spec, meta=task_meta, project_root='.')

    # values resolved during initialization
    assert spec['class'] == expected
    assert isinstance(spec['source'], Path)

    # the spec can be materialized into an actual Task
    spec.to_task(dag=DAG())
def test_error_if_client_dotted_path_returns_none(tmp_sample_tasks,
                                                  add_current_to_sys_path,
                                                  no_sys_modules_cache, key):
    """A client dotted path that evaluates to None raises TypeError."""
    Path('client_dotted_path_returns_none.py').write_text("""
def get():
    return None
""")

    task_dict = {
        'source': 'sample.sql',
        'product': ['name', 'table'],
        key: 'client_dotted_path_returns_none.get',
    }
    task_meta = Meta.default_meta({
        'extract_product': False,
        'extract_upstream': True,
    })

    with pytest.raises(TypeError) as excinfo:
        TaskSpec(task_dict, meta=task_meta,
                 project_root='.').to_task(dag=DAG())

    expected = ("Error calling dotted path "
                "'client_dotted_path_returns_none.get'. "
                "Expected a value but got None")
    assert expected in str(excinfo.value)
def test_add_hook(tmp_directory, add_current_to_sys_path):
    """Hooks declared as dotted paths end up attached to the built task."""
    Path('source.py').write_text("""
# + tags=["parameters"]
# some code
""")
    Path('hooks.py').write_text("""
def some_hook():
    pass
""")

    hook_path = 'hooks.some_hook'
    task_dict = {
        'product': 'notebook.ipynb',
        'source': 'source.py',
        'on_finish': hook_path,
        'on_render': hook_path,
        'on_failure': hook_path,
    }

    task_meta = Meta.default_meta()
    task_meta['extract_product'] = False

    task, _ = TaskSpec(task_dict, task_meta, project_root='.').to_task(DAG())

    assert task.on_finish
    assert task.on_render
    assert task.on_failure
def test_validate_missing_source(key):
    """Constructing a TaskSpec without a source raises KeyError."""
    # NOTE(review): meta is a plain dict here while sibling tests use
    # Meta.default_meta; validation fails on the missing source before meta
    # matters, so this works — confirm whether this is intentional
    raw_meta = {'extract_product': False, 'extract_upstream': False}

    with pytest.raises(KeyError):
        TaskSpec({key: None}, raw_meta, project_root='.')
def test_grid_with_hook_lazy_import(backup_spec_with_functions_flat,
                                    tmp_imports):
    """With lazy_import=True, hook dotted paths are stored but not loaded."""
    spec_dict = {
        'source': 'my_tasks_flat.raw.function',
        'name': 'function-',
        'product': 'some_file.txt',
        'grid': {
            'a': [1, 2],
            'b': [3, 4],
        },
        'on_render': 'hooks.on_render',
        'on_finish': 'hooks.on_finish',
        'on_failure': 'hooks.on_failure',
    }

    dag = DAG()
    TaskSpec(spec_dict, Meta.default_meta(), project_root='.',
             lazy_import=True).to_task(dag=dag)

    tasks = list(dag.values())

    for hook in ('on_render', 'on_finish', 'on_failure'):
        # the callable is not resolved yet...
        assert all(getattr(t, hook).callable is None for t in tasks)
        # ...but the dotted path is recorded for later resolution
        assert all(
            getattr(t, hook)._spec.dotted_path == f'hooks.{hook}'
            for t in tasks)
def test_error_if_dotted_path_does_not_return_a_callable(
        backup_spec_with_functions_flat, add_current_to_sys_path,
        no_sys_modules_cache, key):
    """A hook dotted path resolving to a non-callable raises TypeError."""
    module_name = 'test_error_if_dotted_path_does_not_return_a_callable'
    Path(f'{module_name}.py').write_text("""
some_non_function = 1
""")

    spec_dict = {
        'source': 'my_tasks_flat.raw.function',
        'product': 'some_file.txt',
        key: module_name + '.some_non_function',
    }

    with pytest.raises(TypeError) as excinfo:
        TaskSpec(spec_dict,
                 meta=Meta.default_meta({'extract_product': False}),
                 project_root='.').to_task(dag=DAG())

    expected = ("Error loading dotted path 'test_error_if_dotted_path"
                "_does_not_return_a_callable.some_non_function'. Expected a "
                "callable object (i.e., some kind of function). Got "
                "1 (an object of type: int)")
    assert str(excinfo.value) == expected
def test_error_on_invalid_value_for_file_product(backup_online, tmp_imports):
    """A non-path product value raises TypeError when building the task."""
    task_meta = Meta.default_meta()
    task_meta['extract_product'] = False

    task_spec = TaskSpec(
        {
            'source': 'online_tasks.square',
            'product': 1,
        },
        meta=task_meta,
        project_root='.')

    with pytest.raises(TypeError) as excinfo:
        task_spec.to_task(dag=DAG())

    expected = ('Error initializing File with argument 1 '
                '(expected str, bytes or os.PathLike object, not int)')
    assert str(excinfo.value) == expected
def test_grid_with_missing_name_key_error(backup_spec_with_functions_flat,
                                          add_current_to_sys_path, spec):
    """Removing 'name' from a grid spec raises KeyError at task init.

    NOTE(review): this was named test_grid_with_missing_name, identical to
    a later test in this module; pytest only collects the last definition,
    so this test was silently shadowed and never ran. Renamed so both are
    collected.
    """
    del spec['name']

    with pytest.raises(KeyError) as excinfo:
        TaskSpec(spec, Meta.default_meta(),
                 project_root='.').to_task(dag=DAG())

    assert 'Error initializing task with spec' in str(excinfo.value)
def test_grid_and_params(backup_spec_with_functions_flat, tmp_imports,
                         grid_spec):
    """Declaring both 'grid' and 'params' on one task is rejected."""
    grid_spec['params'] = {'a': 1}

    with pytest.raises(DAGSpecInitializationError) as excinfo:
        TaskSpec(grid_spec, Meta.default_meta(),
                 project_root='.').to_task(dag=DAG())

    expected = "'params' is not allowed when using 'grid'"
    assert expected in str(excinfo.value)
def test_grid_with_missing_name(backup_spec_with_functions_flat, tmp_imports,
                                grid_spec):
    """A grid spec without 'name' fails with a descriptive error."""
    grid_spec.pop('name')

    with pytest.raises(DAGSpecInitializationError) as excinfo:
        TaskSpec(grid_spec, Meta.default_meta(),
                 project_root='.').to_task(dag=DAG())

    assert 'Error initializing task with source' in str(excinfo.value)
def test_error_when_failing_to_init(spec, tmp_sample_tasks, tmp_imports):
    """Failures while building the product surface as DAGSpecInitializationError."""
    task_meta = Meta.default_meta({
        'extract_upstream': True,
        'extract_product': False,
    })

    with pytest.raises(DAGSpecInitializationError) as excinfo:
        TaskSpec(spec, meta=task_meta, project_root='.').to_task(dag=DAG())

    assert 'Error initializing SQLRelation' in str(excinfo.value)
def test_loads_serializer_and_unserializer(backup_online, tmp_imports):
    """serializer/unserializer dotted paths resolve to the real functions."""
    task_meta = Meta.default_meta()
    task_meta['extract_product'] = False

    spec_dict = {
        'source': 'online_tasks.square',
        'product': 'output/square.parquet',
        'serializer': 'online_io.serialize',
        'unserializer': 'online_io.unserialize',
    }

    task, _ = TaskSpec(spec_dict, meta=task_meta,
                       project_root='.').to_task(dag=DAG())

    from online_io import serialize, unserialize

    # identity check: the stored callables are the imported functions
    assert task._serializer.callable is serialize
    assert task._unserializer.callable is unserialize
def test_lazy_load(tmp_directory, tmp_imports):
    """lazy_import=True defers dotted paths, so missing modules don't break."""
    Path('my_module.py').write_text("""
def fn():
    pass
""")

    # every hook/serializer points to a module that does not exist; with
    # lazy_import=True this must still produce a task
    missing = 'not_a_module.not_a_function'
    spec = TaskSpec(
        {
            'source': 'my_module.fn',
            'product': 'report.ipynb',
            'on_finish': missing,
            'on_render': missing,
            'on_failure': missing,
            'serializer': missing,
            'unserializer': missing,
        },
        Meta.default_meta(),
        '.',
        lazy_import=True)

    assert spec.to_task(dag=DAG())
def test_grid(backup_spec_with_functions_flat, add_current_to_sys_path, spec):
    """A grid with two 2-value parameters expands into four tasks."""
    dag = DAG()
    group, _ = TaskSpec(spec, Meta.default_meta(),
                        project_root='.').to_task(dag=dag)

    assert len(group) == 4

    # each generated task gets a matching indexed product
    for i in range(4):
        expected = str(Path(f'some_file-{i}.txt').resolve())
        assert str(dag[f'function-{i}'].product) == expected
def test_error_on_invalid_class(backup_spec_with_functions_flat, tmp_imports):
    """An unknown 'class' value is rejected with a validation error."""
    spec_dict = {
        'source': 'my_tasks_flat.raw.function',
        'product': 'some_file.txt',
        'class': 'unknown_class',
    }

    with pytest.raises(ValueError) as excinfo:
        TaskSpec(spec_dict,
                 meta=Meta.default_meta({'extract_product': False}),
                 project_root='.').to_task(dag=DAG())

    expected = ("Error validating Task spec (class field): "
                "'unknown_class' is not a valid Task class name")
    assert str(excinfo.value) == expected
def test_constructor_deep_copies_spec_and_meta(tmp_directory, tmp_imports):
    """TaskSpec must deep-copy its inputs so callers' dicts stay untouched."""
    prod_default_class = {'SQLScript': 'SQLRelation'}
    # NOTE(review): the {'params': {...}} nesting looks accidental, but the
    # identity assertions below still exercise the deep copy — confirm
    params = {'params': {'a': 1}}

    meta = Meta.default_meta({
        'extract_product': False,
        'product_default_class': prod_default_class,
    })
    spec = {
        'source': 'sample.sql',
        'product': 'some_file.txt',
        'params': params,
    }

    task_spec = TaskSpec(data=spec, meta=meta, project_root='.')

    # none of the inputs may be stored by reference
    assert task_spec.data is not spec
    assert task_spec.meta is not meta
    assert task_spec.data['params'] is not params
    assert task_spec.meta['product_default_class'] is not prod_default_class
def test_skips_source_loader_if_absolute_path(tmp_sample_tasks, tmp_imports):
    """Absolute source paths bypass the configured source_loader."""
    Path('templates').mkdir()

    task_meta = Meta.default_meta({
        'extract_product': False,
        'extract_upstream': True,
        'source_loader': {
            'path': 'templates'
        },
    })

    spec_dict = {
        # absolute path: must not be looked up inside 'templates'
        'source': str(Path(tmp_sample_tasks, 'sample.sql')),
        'product': ['name', 'table'],
        'client': 'db.get_client',
    }

    assert TaskSpec(spec_dict, meta=task_meta,
                    project_root='.').to_task(dag=DAG())
def _init(self, data, env, lazy_import, reload, parent_path,
          look_up_project_root_recursively):
    """Initialize the spec from a YAML file path or a parsed structure.

    Parameters
    ----------
    data : str, pathlib.Path, dict or list
        Path to a YAML spec, or the spec itself; a bare list is treated
        as the 'tasks' section
    env : dict or None
        Values used to expand {{placeholders}} in the spec
    lazy_import : bool
        Forwarded to each TaskSpec (defers resolving dotted paths)
    reload : bool
        Forwarded to each TaskSpec
    parent_path : str, pathlib.Path or None
        Directory that relative paths are resolved against; must be None
        when data is a path (it is derived from the path instead)
    look_up_project_root_recursively : bool
        Whether to search parent directories for the project root

    Raises
    ------
    ValueError
        If parent_path is passed together with a path to a YAML file
    FileNotFoundError
        If data is a path but no such file exists
    DAGSpecInitializationError
        If YAML parsing fails and the content appears to contain
        unquoted {{placeholders}}
    """
    self._lazy_import = lazy_import

    # initialized with a path to a yaml file...
    if isinstance(data, (str, Path)):
        # TODO: test this
        if parent_path is not None:
            raise ValueError('parent_path must be None when '
                             f'initializing {type(self).__name__} with '
                             'a path to a YAML spec')

        # resolve the parent path to make sources and products unambiguous
        # even if the current working directory changes
        self._path = Path(data).resolve()
        self._parent_path = str(self._path.parent)

        if not Path(data).is_file():
            raise FileNotFoundError(
                'Error initializing DAGSpec with argument '
                f'{data!r}: Expected it to be a path to a YAML file, but '
                'such file does not exist')

        content = Path(data).read_text()

        # capture the parse error instead of raising immediately so we can
        # produce a friendlier message for the placeholder case below
        try:
            data = yaml.safe_load(content)
        except (yaml.parser.ParserError,
                yaml.constructor.ConstructorError) as e:
            error = e
        else:
            error = None

        if error:
            # a parse error plus curly braces usually means the user wrote
            # an unquoted {{placeholder}} value
            if '{{' in content or '}}' in content:
                raise DAGSpecInitializationError(
                    'Failed to initialize spec. It looks like '
                    'you\'re using placeholders (i.e. {{placeholder}}). '
                    'Make sure values are enclosed in parentheses '
                    '(e.g. key: "{{placeholder}}"). Original '
                    'parser error:\n\n'
                    f'{error}')
            else:
                raise error

    # initialized with a dictionary...
    else:
        self._path = None
        # FIXME: add test cases, some of those features wont work if
        # _parent_path is None. We should make sure that we either raise
        # an error if _parent_path is needed or use the current working
        # directory if it's appropriate - this is mostly to make relative
        # paths consistent: they should be relative to the file that
        # contains them
        self._parent_path = (None if not parent_path else str(
            Path(parent_path).resolve()))

    self.data = data

    # a bare list is shorthand for a spec with only a 'tasks' section
    if isinstance(self.data, list):
        self.data = {'tasks': self.data}

    # validate keys defined at the top (nested keys are not validated here)
    self._validate_top_keys(self.data, self._path)

    # NOTE(review): 'enviroment' typo in this log message — runtime string,
    # left untouched here
    logger.debug('DAGSpec enviroment:\n%s', pp.pformat(env))

    env = env or dict()

    # look for a defaults file located relative to the spec (if any)
    path_to_defaults = default.path_to_env_from_spec(
        path_to_spec=self._path)

    if path_to_defaults:
        defaults = yaml.safe_load(Path(path_to_defaults).read_text())
        self.env = EnvDict(env,
                           path_to_here=self._parent_path,
                           defaults=defaults)
    else:
        self.env = EnvDict(env, path_to_here=self._parent_path)

    # expand {{placeholders}} and remember which tags were actually used
    self.data, tags = expand_raw_dictionary_and_extract_tags(
        self.data, self.env)

    logger.debug('Expanded DAGSpec:\n%s', pp.pformat(data))

    # if there is a "location" top key, we don't have to do anything else
    # as we will just load the dotted path when .to_dag() is called
    if 'location' not in self.data:
        Meta.initialize_inplace(self.data)

        import_tasks_from = self.data['meta']['import_tasks_from']

        if import_tasks_from is not None:
            # when using a relative path in "import_tasks_from", we must
            # make it absolute...
            if not Path(import_tasks_from).is_absolute():
                # use _parent_path if there is one
                if self._parent_path:
                    self.data['meta']['import_tasks_from'] = str(
                        Path(self._parent_path, import_tasks_from))
                # otherwise just make it absolute
                else:
                    self.data['meta']['import_tasks_from'] = str(
                        Path(import_tasks_from).resolve())

            imported = yaml.safe_load(
                Path(self.data['meta']['import_tasks_from']).read_text())

            # imported tasks get the same placeholder expansion as the
            # main spec; merge their used tags with ours
            if self.env is not None:
                (imported,
                 tags_other) = expand_raw_dictionaries_and_extract_tags(
                     imported, self.env)
                tags = tags | tags_other

            # relative paths here are relative to the file where they
            # are declared
            base_path = Path(
                self.data['meta']['import_tasks_from']).parent

            for task in imported:
                add_base_path_to_source_if_relative(task,
                                                    base_path=base_path)

            self.data['tasks'].extend(imported)

        # check if there are any params declared in env but not used in
        # the pipeline
        extra = set(self.env) - self.env.default_keys - tags

        if extra:
            warnings.warn('The following placeholders are declared in the '
                          'environment but '
                          f'unused in the spec: {extra}')

        self.data['tasks'] = [
            normalize_task(task) for task in self.data['tasks']
        ]

        # NOTE: for simple projects, project root is the parent folder
        # of pipeline.yaml, for package projects is the parent folder
        # of setup.py
        if look_up_project_root_recursively:
            project_root = (
                None if not self._parent_path else
                default.find_root_recursively(
                    starting_dir=self._parent_path,
                    filename=None if not self._path else self._path.name))
        else:
            project_root = self._parent_path

        # make sure the folder where the pipeline is located is in sys.path
        # otherwise dynamic imports needed by TaskSpec will fail
        with add_to_sys_path(self._parent_path, chdir=False):
            self.data['tasks'] = [
                TaskSpec(t,
                         self.data['meta'],
                         project_root=project_root,
                         lazy_import=lazy_import,
                         reload=reload) for t in self.data['tasks']
            ]
    else:
        # dotted-path pipelines are loaded later by .to_dag(); only an
        # empty meta is needed here
        self.data['meta'] = Meta.empty()
def test_error_if_extract_but_keys_declared(task, meta):
    """Construction itself must fail (not just to_task) when keys that
    should be extracted are declared explicitly."""
    with pytest.raises(DAGSpecInitializationError):
        TaskSpec(task, meta, project_root='.')
def test_error_if_extract_but_keys_declared_value_error(task, meta):
    """Same scenario as the test above, for parametrizations that raise a
    plain ValueError.

    NOTE(review): renamed — this was a duplicate of
    test_error_if_extract_but_keys_declared, so pytest collected only one
    of the two definitions and silently skipped the other.
    """
    with pytest.raises(ValueError):
        TaskSpec(task, meta, project_root='.')