def test_error_invalid_dag_level_client_dotted_path(tmp_sample_tasks,
                                                    add_current_to_sys_path,
                                                    no_sys_modules_cache,
                                                    code, expected_error):
    Path('dag_level_client_dotted_path.py').write_text(code)

    spec = DAGSpec({
        'meta': {
            'extract_product': False,
            'extract_upstream': True,
        },
        'tasks': [
            {
                'source': 'sample.sql',
                'product': ['name', 'table']
            },
        ],
        'clients': {
            'SQLScript': 'dag_level_client_dotted_path.get'
        }
    })

    with pytest.raises(TypeError) as excinfo:
        spec.to_dag()

    assert expected_error in str(excinfo.value)

def test_sets_clients(tmp_sample_tasks, add_current_to_sys_path,
                      no_sys_modules_cache, dotted_path_spec):
    Path('test_sets_clients.py').write_text("""
from unittest.mock import Mock

def get(a=None):
    return Mock()
""")

    spec = DAGSpec({
        'meta': {
            'extract_product': False,
            'extract_upstream': True,
        },
        'tasks': [
            {
                'source': 'sample.sql',
                'product': ['name', 'table']
            },
        ],
        'clients': {
            'SQLScript': dotted_path_spec
        }
    })

    dag = spec.to_dag()

    assert isinstance(dag.clients[SQLScript], Mock)

def test_lazy_load_product_level_client(tmp_directory, tmp_imports,
                                        my_testing_module, client_spec):
    Path('script.sql').write_text("""
CREATE TABLE {{product}} AS SELECT * FROM my_table
""")

    with sqlite3.connect('my.db') as conn:
        pd.DataFrame({'x': range(5)}).to_sql('my_table', conn)

    tasks = [
        {
            'source': 'script.sql',
            'product': [None, 'name', 'table'],
            'client': client_spec,
            'product_client': client_spec,
            'product_class': 'GenericSQLRelation',
        },
    ]

    data = {'tasks': tasks}

    spec = DAGSpec(data, lazy_import=True)

    dag = spec.to_dag()
    dag.executor = Serial(build_in_subprocess=False)

    # since lazy_import=True, creating the dag should not import
    # my_testing_module
    assert 'my_testing_module' not in sys.modules

    dag.build()

    # should be imported now
    assert 'my_testing_module' in sys.modules

def test_lazy_load_dag_level_client(tmp_directory, tmp_imports,
                                    my_testing_module, client_spec):
    tasks = [
        {
            'source': 'my_testing_module.task',
            'product': 'output.csv'
        },
    ]

    data = {
        'tasks': tasks,
        'clients': {
            'File': client_spec
        },
    }

    spec = DAGSpec(data, lazy_import=True)

    dag = spec.to_dag()
    dag.executor = Serial(build_in_subprocess=False)

    # since lazy_import=True, creating the dag should not import
    # my_testing_module
    assert 'my_testing_module' not in sys.modules

    dag.build()

    # should be imported now
    assert 'my_testing_module' in sys.modules
    assert Path('backup', 'output.csv').exists()

def test_lazy_load(tmp_directory, tmp_imports):
    Path('my_module.py').write_text("""
def fn():
    pass
""")

    tasks = [
        {
            'source': 'my_module.fn',
            'product': 'report.ipynb',
            'on_finish': 'not_a_module.not_a_function',
            'on_render': 'not_a_module.not_a_function',
            'on_failure': 'not_a_module.not_a_function',
            'serializer': 'not_a_module.not_a_function',
            'unserializer': 'not_a_module.not_a_function',
            'product_client': 'not_a_module.not_a_function'
        },
    ]

    data = {
        'tasks': tasks,
        'serializer': 'not_a_module.not_a_function',
        'unserializer': 'not_a_module.not_a_function',
    }

    spec = DAGSpec(data, lazy_import=True)

    assert spec.to_dag()

def test_python_callables_with_extract_upstream(tmp_directory):
    spec = DAGSpec({
        'tasks': [
            {
                'source': 'test_pkg.callables.root',
                'product': 'root.csv'
            },
            {
                'source': 'test_pkg.callables.a',
                'product': 'a.csv'
            },
            {
                'source': 'test_pkg.callables.b',
                'product': 'b.csv'
            },
        ],
        'meta': {
            'extract_product': False,
            'extract_upstream': True
        }
    })

    dag = spec.to_dag()

    dag.build()

    assert set(dag) == {'a', 'b', 'root'}
    assert not dag['root'].upstream
    assert set(dag['a'].upstream) == {'root'}
    assert set(dag['b'].upstream) == {'root'}

def test_add_upstream_modifies_signature(backup_spec_with_functions):
    dag = DAGSpec('pipeline.yaml').to_dag()
    dag.render()
    fn = dag['raw'].source.primitive
    params = dag['raw'].params.to_json_serializable()
    dev = CallableInteractiveDeveloper(fn, params)

    # add an upstream reference...
    nb = dev.to_nb()
    nb.cells[-1]['source'] += '\nupstream["some_task"]'
    dev.overwrite(nb)

    # source must be updated...
    source = Path('my_tasks', 'raw', 'functions.py').read_text()
    top_lines = '\n'.join(source.splitlines()[:5])

    expected = (
        'from pathlib import Path\n\n\n'
        'def function(product, upstream):\n    Path(str(product)).touch()')

    assert expected == top_lines

    # if we save again, nothing should change
    dev.overwrite(nb)

    source = Path('my_tasks', 'raw', 'functions.py').read_text()
    top_lines = '\n'.join(source.splitlines()[:5])

    assert expected == top_lines

def test_import_tasks_from_loads_relative_to_pipeline_spec(tmp_nbs):
    some_tasks = [{'source': 'extra_task.py', 'product': 'extra.ipynb'}]
    Path('some_tasks.yaml').write_text(yaml.dump(some_tasks))
    Path('extra_task.py').write_text("""
# + tags=["parameters"]
# -
""")

    spec_d = yaml.safe_load(Path('pipeline.yaml').read_text())
    spec_d['meta']['import_tasks_from'] = 'some_tasks.yaml'

    Path('pipeline.yaml').write_text(yaml.dump(spec_d))

    # move to another dir to make sure we can still load the spec
    Path('subdir').mkdir()
    os.chdir('subdir')

    spec = DAGSpec('../pipeline.yaml')

    dag = spec.to_dag()
    dag.render()

    assert spec['meta']['import_tasks_from'] == str(
        Path('..', 'some_tasks.yaml').resolve())
    assert str(Path('..', 'extra_task.py').resolve()) in [
        str(t['source']) for t in spec['tasks']
    ]

def test_grid_and_upstream_wildcard_callables(spec_raw, tmp_directory,
                                              add_current_to_sys_path,
                                              no_sys_modules_cache):
    Path('sample_source_callables.py').write_text("""
from pathlib import Path

def unserializer(product):
    return Path(product).read_text()

def upstream(product, param):
    Path(product).touch()

def downstream(product, upstream):
    up = upstream['upstream-*']
    one = up['upstream-0']
    another = up['upstream-1']
    Path(product).touch()
""")

    spec = DAGSpec(spec_raw)

    dag = spec.to_dag().render()

    # to build faster
    dag.executor = Serial(build_in_subprocess=False)

    # make sure unserializing works correctly
    dag.build()

    assert set(dag) == {'upstream-1', 'upstream-0', 'downstream'}
    assert set(dag['downstream'].params['upstream']['upstream-*']) == {
        'upstream-1', 'upstream-0'
    }

def test_import_tasks_from_with_non_empty_env(tmp_nbs):
    some_tasks = [{
        'source': 'extra_task.py',
        'name': 'extra_task',
        'product': 'extra.ipynb',
        'params': {
            'some_param': '{{some_param}}'
        }
    }]
    Path('some_tasks.yaml').write_text(yaml.dump(some_tasks))
    Path('extra_task.py').write_text("""
# + tags=["parameters"]
# -
""")

    spec_d = yaml.safe_load(Path('pipeline.yaml').read_text())
    spec_d['meta']['import_tasks_from'] = 'some_tasks.yaml'

    spec = DAGSpec(spec_d, env={'some_param': 'some_value'})

    dag = spec.to_dag()
    dag.render()

    assert dag['extra_task'].params['some_param'] == 'some_value'
    assert str(Path('extra_task.py').resolve()) in [
        str(t['source']) for t in spec['tasks']
    ]

def test_loads_serializer_and_unserializer(backup_online,
                                           add_current_to_sys_path):
    spec = DAGSpec({
        'tasks': [{
            'source': 'online_tasks.get',
            'product': 'output/get.parquet',
        }, {
            'source': 'online_tasks.square',
            'product': 'output/square.parquet',
        }],
        'meta': {
            'extract_product': False
        },
        'serializer': 'online_io.serialize',
        'unserializer': 'online_io.unserialize',
    })

    dag = spec.to_dag()

    from online_io import serialize, unserialize

    assert dag['get']._serializer is serialize
    assert dag['get']._unserializer is unserialize
    assert dag['square']._serializer is serialize
    assert dag['square']._unserializer is unserialize

def test_mixed_db_sql_spec(tmp_pipeline_sql, add_current_to_sys_path,
                           pg_client_and_schema, monkeypatch):
    _, schema = pg_client_and_schema

    with open('pipeline-multiple-dbs.yaml') as f:
        dag_spec = yaml.load(f, Loader=yaml.SafeLoader)

    # clients for this pipeline are initialized without custom create_engine
    # args but we need to set the default schema, mock the call so it
    # includes that info
    monkeypatch.setattr(db, 'create_engine',
                        create_engine_with_schema(schema))

    dates = _random_date_from(datetime(2016, 1, 1), 365, 100)
    df = pd.DataFrame({
        'customer_id': np.random.randint(0, 5, 100),
        'value': np.random.rand(100),
        'purchase_date': dates
    })

    # make sales data for pg
    loader = load_dotted_path(dag_spec['clients']['PostgresRelation'])
    client = loader()
    df.to_sql('sales', client.engine, if_exists='replace')
    client.engine.dispose()

    # make sales data for sqlite
    loader = load_dotted_path(dag_spec['clients']['SQLiteRelation'])
    client = loader()
    df.to_sql('sales', client.engine)
    client.engine.dispose()

    dag = DAGSpec(dag_spec).to_dag()

    # FIXME: this does not show the custom "upstream key missing" error
    dag.build()

def test_postgres_sql_spec(tmp_pipeline_sql, pg_client_and_schema,
                           add_current_to_sys_path, monkeypatch):
    _, schema = pg_client_and_schema

    with open('pipeline-postgres.yaml') as f:
        dag_spec = yaml.load(f, Loader=yaml.SafeLoader)

    # clients for this pipeline are initialized without custom create_engine
    # args but we need to set the default schema, mock the call so it
    # includes that info
    monkeypatch.setattr(db, 'create_engine',
                        create_engine_with_schema(schema))

    dates = _random_date_from(datetime(2016, 1, 1), 365, 100)
    df = pd.DataFrame({
        'customer_id': np.random.randint(0, 5, 100),
        'value': np.random.rand(100),
        'purchase_date': dates
    })

    loader = load_dotted_path(dag_spec['clients']['SQLScript'])
    client = loader()
    df.to_sql('sales', client.engine, if_exists='replace')
    client.engine.dispose()

    dag = DAGSpec(dag_spec).to_dag()

    # FIXME: this does not show the custom "upstream key missing" error
    dag.build()

    assert not dag['load'].upstream
    assert list(dag['filter'].upstream.keys()) == ['load']
    assert list(dag['transform'].upstream.keys()) == ['filter']

def test_import_tasks_from_paths_are_relative_to_the_yaml_spec(
        tmp_nbs, tmp_path):
    tasks_yaml = tmp_path / 'some_tasks.yaml'

    # source is a relative path
    some_tasks = [{'source': 'extra_task.py', 'product': 'extra.ipynb'}]
    tasks_yaml.write_text(yaml.dump(some_tasks))

    # write the source code in the same folder as some_tasks.yaml
    Path(tmp_path, 'extra_task.py').write_text("""
# + tags=["parameters"]
# -
""")

    spec_d = yaml.safe_load(Path('pipeline.yaml').read_text())

    # set an absolute path
    spec_d['meta']['import_tasks_from'] = str(tasks_yaml.resolve())
    Path('pipeline.yaml').write_text(yaml.dump(spec_d))

    spec = DAGSpec('pipeline.yaml')

    dag = spec.to_dag()
    dag.render()

    # paths must be interpreted as relative to some_tasks.yaml, not to the
    # current working directory
    assert str(Path(tmp_path, 'extra_task.py').resolve()) in [
        str(t['source']) for t in spec['tasks']
    ]

def test_remove_upstream_modifies_signature(backup_spec_with_functions):
    # by the time we reach this test, my_tasks.raw.functions has already been
    # loaded (previous test), so we force reload to avoid wrongfully reading
    # the modified source code in the raw task
    from my_tasks.raw import functions
    importlib.reload(functions)

    dag = DAGSpec('pipeline.yaml').to_dag()
    dag.render()
    fn = dag['clean'].source.primitive
    params = dag['clean'].params.to_json_serializable()
    dev = CallableInteractiveDeveloper(fn, params)

    nb = dev.to_nb()
    # delete upstream reference
    del nb.cells[-2]
    dev.overwrite(nb)

    source = Path('my_tasks', 'clean', 'functions.py').read_text()
    top_lines = '\n'.join(source.splitlines()[:5])

    expected = ('# adding this to make sure relative imports work '
                'fine\nfrom .util import util_touch\n\n\n'
                'def function(product):')

    assert top_lines == expected

def test_spec_with_functions(lazy_import, backup_spec_with_functions,
                             add_current_to_sys_path):
    """
    Check we can create a pipeline where the task is a function defined in a
    local file
    """
    spec = DAGSpec('pipeline.yaml', lazy_import=lazy_import)
    spec.to_dag().build()

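# For reference, a minimal sketch of the spec shape exercised above (the
# names are hypothetical): a pipeline.yaml whose task sources are dotted
# paths to functions defined in a local module.
#
#     tasks:
#       - source: my_functions.my_task   # function in a local my_functions.py
#         product: output/result.csv
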
def test_pipeline_r(tmp_pipeline_r):
    Path('output').mkdir()

    with open('pipeline.yaml') as f:
        dag_spec = yaml.load(f, Loader=yaml.SafeLoader)

    dag = DAGSpec(dag_spec).to_dag()
    dag.build()

def test_spec_invalid_glob_pattern(tmp_nbs_no_yaml):
    Path('some_invalid_script.sh').touch()

    with pytest.raises(ValueError) as excinfo:
        DAGSpec.from_files('*')

    assert ('Cannot instantiate DAGSpec from files with invalid extensions'
            in str(excinfo.value))

def _process_file_dir_or_glob(parser, dagspec_arg=None):
    """
    Process an entry point (a file, directory or glob-like pattern) and
    return the initialized dag and parsed args

    Parameters
    ----------
    parser : CustomParser
        CLI arg parser

    dagspec_arg : str, default=None
        Entry point value; if None, it is obtained from the parser
    """
    # NOTE: we must use parser.parse_entry_point_value() instead of
    # args.parse_args because calling the latter won't allow us to add more
    # cli parameters, but we want that to expose params from env
    entry_point_value = dagspec_arg or parser.parse_entry_point_value()
    entry = EntryPoint(entry_point_value)

    if entry.type in {EntryPoint.Directory, EntryPoint.Pattern}:
        # pipelines initialized from directories or patterns cannot be
        # parametrized
        path_to_env = None
    # file
    else:
        path_to_env = default.path_to_env_from_spec(entry_point_value)

    if path_to_env:
        env_dict = EnvDict(path_to_env,
                           path_to_here=Path(entry_point_value).parent
                           if entry.type == EntryPoint.File else None)
        _add_cli_args_from_env_dict_keys(parser, env_dict)

    args = parser.parse_args()
    dagspec_arg = dagspec_arg or args.entry_point

    if hasattr(args, 'log'):
        if args.log is not None:
            logging.basicConfig(level=args.log.upper())

    entry_point = EntryPoint(dagspec_arg)

    # directory
    if entry_point.type == EntryPoint.Directory:
        dag = DAGSpec.from_directory(dagspec_arg).to_dag()
    # pattern
    elif entry_point.type == EntryPoint.Pattern:
        dag = DAGSpec.from_files(dagspec_arg).to_dag()
    # file
    else:
        if path_to_env:
            # and replace keys depending on passed cli args
            replaced = _env_keys_to_override(args, parser.static_args)
            env = env_dict._replace_flatten_keys(replaced)
            dag = DAGSpec(dagspec_arg, env=env).to_dag()
        else:
            dag = DAGSpec(dagspec_arg).to_dag()

    return dag, args

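# A minimal sketch of the three entry point shapes handled above, using only
# the DAGSpec constructors exercised elsewhere in this module (the paths are
# hypothetical):
#
#     DAGSpec.from_directory('pipeline_dir').to_dag()   # directory
#     DAGSpec.from_files('tasks/*.py').to_dag()         # glob-like pattern
#     DAGSpec('pipeline.yaml', env=env).to_dag()        # file, optionally
#                                                       # parametrized via env
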
def test_notebook_spec(processor, tmp_nbs):
    Path('output').mkdir()

    with open('pipeline.yaml') as f:
        dag_spec = yaml.load(f, Loader=yaml.SafeLoader)

    dag_spec = processor(dag_spec)

    dag = DAGSpec(dag_spec).to_dag()
    dag.build()

def test_find_searches_in_default_locations(monkeypatch, tmp_nbs, root_path):
    root_path = Path(root_path).resolve()
    Path('subdir').mkdir()

    mock = Mock(wraps=dagspec.default.entry_point_with_name)
    monkeypatch.setattr(dagspec.default, 'entry_point_with_name', mock)

    DAGSpec.find(starting_dir=root_path)

    mock.assert_called_once_with(root_path=root_path, name=None)

def test_searches_in_default_locations(monkeypatch, tmp_nbs, root_path):
    root_path = Path(root_path).resolve()
    Path('subdir').mkdir()

    mock = Mock(wraps=dagspec.entry_point)
    monkeypatch.setattr(dagspec, 'entry_point', mock)

    DAGSpec._auto_load(starting_dir=root_path)

    mock.assert_called_once_with(root_path=root_path)

def test_find(tmp_nbs, monkeypatch):
    mock = Mock(return_value=[None, None])
    monkeypatch.setattr(dagspec.DAGSpec, '_auto_load', mock)

    env = {'a': 1}

    DAGSpec.find(env=env)

    mock.assert_called_once_with(to_dag=False,
                                 starting_dir=None,
                                 env={'a': 1},
                                 lazy_import=False,
                                 reload=False)

def load_entry_point(entry_point):
    type_ = find_entry_point_type(entry_point)

    if type_ == EntryPoint.Directory:
        spec = DAGSpec.from_directory(entry_point)
        path = Path(entry_point)
    elif type_ == EntryPoint.File:
        spec = DAGSpec(entry_point)
        path = Path(entry_point).parent
    else:
        raise NotImplementedError(
            f'loading entry point type {type_!r} is unsupported')

    return spec, spec.to_dag(), path

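# Hedged usage sketch (the 'pipeline.yaml' path is hypothetical): for a file
# entry point, load_entry_point returns the spec, the initialized DAG, and
# the directory containing the spec.
#
#     spec, dag, path = load_entry_point('pipeline.yaml')
#     assert path == Path('pipeline.yaml').parent
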
def test_infer_dependencies_sql(tmp_pipeline_sql, add_current_to_sys_path):
    expected = {'filter': {'load'}, 'transform': {'filter'}, 'load': set()}

    with open('pipeline-postgres.yaml') as f:
        d = yaml.safe_load(f)

    d['meta']['extract_upstream'] = True

    for t in d['tasks']:
        t.pop('upstream', None)

    dag = DAGSpec(d).to_dag()

    deps = {name: set(task.upstream) for name, task in dag.items()}
    assert deps == expected

def test_meta_defaults(raw):
    spec = DAGSpec(raw)
    meta = spec['meta']
    assert meta['extract_upstream']
    assert not meta['extract_product']
    assert not meta['product_relative_to_source']
    assert not meta['jupyter_hot_reload']

def add():
    """Add scaffold templates for tasks whose source does not exist
    """
    # setting lazy_import to true causes sources to be returned as paths,
    # instead of placeholders
    spec, path_to_spec = DAGSpec._auto_load(to_dag=False, lazy_import=True)
    loader = ScaffoldLoader('ploomber_add')

    # TODO: when the dag has a source loader, the argument passed to
    # ploomber_add should take that into account to place the new file
    # in the appropriate location (instead of doing it relative to
    # pipeline.yaml)

    # TODO: raise an error if the location is inside the site-packages folder

    # NOTE: lazy loading from the source loader will give errors because
    # initializing a source with a path only loses the information from the
    # jinja environment needed to make macros work. I have to test this. The
    # best solution is to add a lazy_load param to Placeholder, so it can be
    # initialized with a path for a file that does not exist
    if path_to_spec:
        print('Found spec at {}'.format(path_to_spec))

        # make sure the current working dir is in the path, otherwise we
        # might not be able to import the PythonCallable functions, which we
        # need to do to locate the modules
        with add_to_sys_path(path_to_spec, chdir=False):
            for task in spec['tasks']:
                loader.create(source=task['source'],
                              params=spec['meta'],
                              class_=task['class'])
    else:
        print('Error: No pipeline.yaml spec found...')

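# Sketch of the flow add() implements, assuming a pipeline.yaml like the
# following (task and product names are hypothetical):
#
#     tasks:
#       - source: tasks/new_task.py   # this file does not exist yet
#         product: output/new.ipynb
#
# calling add() would locate pipeline.yaml via DAGSpec._auto_load and create
# tasks/new_task.py from the corresponding scaffold template.
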
def test_spec_from_directory(chdir, dir_, tmp_nbs_no_yaml):
    os.chdir(chdir)

    Path('output').mkdir()

    dag = DAGSpec.from_directory(dir_).to_dag()
    assert list(dag) == ['load', 'clean', 'plot']

def test_spec_glob_pattern(tmp_nbs_no_yaml):
    # directory should be ignored
    Path('output').mkdir()

    # if passed a string, it's interpreted as a glob-like pattern
    dag = DAGSpec.from_files('load.py').to_dag()

    assert list(dag) == ['load']

def test_error_invalid_yaml_displays_error_line(tmp_directory):
    Path('pipeline.yaml').write_text('key: [')

    with pytest.raises(yaml.parser.ParserError) as excinfo:
        DAGSpec('pipeline.yaml')

    assert 'key: [' in str(excinfo.value)