def dag():
    """Return a two-task DAG (``root`` -> ``task``) with a local File client.

    The DAG uses a ``Serial`` executor with ``build_in_subprocess=False`` and
    registers a ``LocalStorageClient`` pointing at ``remote`` for ``File``
    products.
    """
    pipeline = DAG(executor=Serial(build_in_subprocess=False))
    pipeline.clients[File] = LocalStorageClient('remote',
                                                path_to_project_root='.')

    upstream_task = PythonCallable(_touch,
                                   File('root'),
                                   dag=pipeline,
                                   name='root')
    downstream_task = PythonCallable(_touch_upstream,
                                     File('file'),
                                     dag=pipeline,
                                     name='task')
    upstream_task >> downstream_task

    return pipeline
def add_features(dag):
    """Attach the feature engineering tasks to an existing DAG.

    The DAG is expected to already contain a task named "get" that returns
    the input data; the new tasks are wired downstream of it. The same DAG
    object is returned.
    """
    source_task = dag['get']
    out_dir = Path('output')

    # every feature task shares the same (un)serialization functions
    io_kwargs = dict(serializer=serializer, unserializer=unserializer)

    # create the tasks
    first_feature = PythonCallable(a_feature,
                                   File(out_dir / 'a_feature.csv'), dag,
                                   **io_kwargs)
    second_feature = PythonCallable(another, File(out_dir / 'another.csv'),
                                    dag, **io_kwargs)
    joined = PythonCallable(join, File(out_dir / 'join.csv'), dag,
                            **io_kwargs)

    # wire the dependencies: both features read from "get", and the join
    # consumes the raw data plus both features
    for feature in (first_feature, second_feature):
        source_task >> feature

    (source_task + first_feature + second_feature) >> joined

    return dag
def test_load_error_if_task_has_metaproduct_and_no_key():
    """Calling load() without a key on a multi-product task raises."""
    products = {'a': File('a'), 'b': File('b')}
    task = PythonCallable(touch_meta, products, DAG())

    with pytest.raises(ValueError):
        task.load()
def test_copy(copy):
    """Check whether upstream objects are shared or copied by InMemoryDAG.

    With ``copy`` disabled, the downstream task must receive the very same
    object that the ``root`` task returned; with ``copy`` enabled it must be
    a different object.
    """
    def _assign_upstream(upstream):
        _assign_upstream.obj = upstream
        return 42

    source_dag = DAG()
    root_params = {'input_data': {'x': [0, 0, 0]}}

    root = PythonCallable(_root,
                          File('root.parquet'),
                          source_dag,
                          name='root',
                          serializer=serializer,
                          params=root_params)
    task = PythonCallable(_assign_upstream,
                          File('task.parquet'),
                          source_dag,
                          name='task',
                          unserializer=unserializer,
                          serializer=serializer)
    root >> task

    in_memory = InMemoryDAG(source_dag)
    out = in_memory.build({'root': {'x': [1]}}, copy=copy)

    # identity (not equality) is what distinguishes a copy from a pass-through
    received_same_object = _assign_upstream.obj['root'] is out['root']
    assert received_same_object is (not copy)
def make_training():
    """Build and return the training DAG: get -> features -> join -> fit."""
    # build_in_subprocess=False: Python does not like multiprocessing with
    # functions defined in the main module (it works when they are defined
    # in a different module)
    training_dag = DAG(executor=Serial(build_in_subprocess=False))
    out_dir = Path('output')

    # "get" task: returns the training data
    PythonCallable(get,
                   File(out_dir / 'get.csv'),
                   training_dag,
                   serializer=serializer,
                   unserializer=unserializer)

    # feature engineering tasks
    add_features(training_dag)

    # "fit" task: trains the model once the features have been joined
    model_task = PythonCallable(fit, File(out_dir / 'model.pickle'),
                                training_dag)
    training_dag['join'] >> model_task

    return training_dag
def test_cycle_exception():
    """Building a DAG whose tasks depend on each other raises DAGCycle."""
    dag = DAG()
    first = PythonCallable(touch_root, File(Path('a.txt')), dag, 'ta')
    second = PythonCallable(touch, File(Path('b.txt')), dag, 'tb')

    # ta -> tb -> ta closes the loop
    first >> second >> first

    with pytest.raises(DAGCycle):
        dag.build()
def test_on_finish(tmp_directory):
    """Build a single-task DAG that has an on_finish hook attached."""
    dag = DAG()
    touch_task = PythonCallable(touch, File('file'), dag, name='touch')
    touch_task.on_finish = on_finish

    dag.build()
def test_executor_keeps_running_until_no_more_tasks_can_run(
        executor, tmp_directory):
    """An independent task still produces its file when another chain fails.

    The chain t_fail -> t_fail_downstream -> t_touch_aborted starts with a
    failing task, while t_ok has no dependencies.
    """
    dag = DAG(executor=executor)

    failing_head = PythonCallable(failing_root,
                                  File('t_fail'),
                                  dag,
                                  name='t_fail')
    failing_middle = PythonCallable(failing,
                                    File('t_fail_downstream'),
                                    dag,
                                    name='t_fail_downstream')
    aborted_tail = PythonCallable(touch,
                                  File('t_touch_aborted'),
                                  dag,
                                  name='t_touch_aborted')
    failing_head >> failing_middle >> aborted_tail

    # independent task, not connected to the failing chain
    PythonCallable(touch_root, File('t_ok'), dag, name='t_ok')

    # a build error is tolerated here; only the files on disk are checked
    try:
        dag.build(force=True)
    except DAGBuildError:
        pass

    assert not Path('t_fail').exists()
    assert not Path('t_fail_downstream').exists()
    assert Path('t_ok').exists()
def test_runs_on_finish(executor, tmp_directory):
    """Each on_finish hook runs once on success; on_failure never runs."""
    # reset the call counters stored on the hook functions
    for counted_hook in (hook, hook_2, hook_3, hook_4):
        counted_hook.count = 0

    dag = DAG(executor=executor)

    first = PythonCallable(fn, File('file1.txt'), dag, 't')
    first.on_finish = hook
    first.on_failure = hook_4

    second = PythonCallable(touch_w_upstream, File('file2'), dag, 't2')
    second.on_finish = hook_2

    third = PythonCallable(fn, File('file3'), dag, 't3')
    third.on_finish = hook_3

    first >> second

    dag.build()

    assert hook.count == 1
    assert hook_2.count == 1
    assert hook_3.count == 1
    assert hook_4.count == 0
def test_sucessful_execution(executor, tmp_directory):
    """A first build executes every task; a second build skips them all."""
    dag = DAG(executor=executor)

    t1 = PythonCallable(touch_root, File('ok.txt'), dag, name='t1')
    t2 = PythonCallable(touch, File('a_file.txt'), dag, name='t2')
    t3 = PythonCallable(touch, File('another_file.txt'), dag, name='t3')
    t4 = PythonCallable(touch, File('yet_another_file.txt'), dag, name='t4')
    PythonCallable(touch_root, File('file.txt'), dag, name='t5')

    # diamond: t1 fans out to t2 and t3, which both feed t4
    t1 >> t2
    t1 >> t3
    (t2 + t3) >> t4

    dag.build()

    assert Path('ok.txt').exists()
    assert Path('a_file.txt').exists()
    assert Path('another_file.txt').exists()
    assert Path('yet_another_file.txt').exists()
    assert Path('file.txt').exists()

    statuses = {task.exec_status for task in dag.values()}
    assert statuses == {TaskStatus.Executed}

    outdated_flags = {task.product._is_outdated() for task in dag.values()}
    assert outdated_flags == {False}

    # everything is up-to-date, so the second build must not execute anything
    dag.build()

    statuses_after_rebuild = {task.exec_status for task in dag.values()}
    assert statuses_after_rebuild == {TaskStatus.Skipped}
def test_hot_reload(backup_test_pkg, tmp_directory):
    """With hot_reload enabled, the build uses the rewritten function source.

    The tasks are created from the functions module as originally imported;
    the module file is then overwritten before building, and the products
    must contain the text written by the new definitions.
    """
    configurator = DAGConfigurator()
    configurator.params.hot_reload = True
    dag = configurator.create()

    root_task = PythonCallable(functions.touch_root, File('file1.txt'), dag)
    child_task = PythonCallable(functions.touch_upstream, File('file2.txt'),
                                dag)
    root_task >> child_task

    # replacement module contents, written after the tasks were declared
    source_new = """
from pathlib import Path

def touch_root(product):
    Path(str(product)).write_text("hi")

def touch_upstream(product, upstream):
    Path(str(product)).write_text("hello")
"""
    Path(backup_test_pkg, 'functions.py').write_text(source_new)

    dag.build()

    assert Path('file1.txt').read_text() == 'hi'
    assert Path('file2.txt').read_text() == 'hello'
def test_upload_after_task_build(tmp_directory):
    """Building a task calls the product's upload method exactly once."""
    product = File('file.txt')
    # wrap (rather than replace) upload so the real method still runs
    product.upload = Mock(wraps=product.upload)

    PythonCallable(_touch, product, dag=DAG()).build()

    product.upload.assert_called_once()
def test_params_are_copied_upon_initialization():
    """Two tasks created from the same params dict do not share params."""
    dag = DAG()
    shared_params = {'a': 1}

    first = PythonCallable(touch,
                           File('file'),
                           dag,
                           name='t1',
                           params=shared_params)
    second = PythonCallable(touch,
                            File('file'),
                            dag,
                            name='t2',
                            params=shared_params)

    assert first.params is not second.params
def test_runs_on_finish(tmp_directory, capsys):
    """Building the DAG prints the on_finish hook's message to stdout."""
    dag = DAG()
    task = PythonCallable(fn1, File('file1.txt'), dag, name='fn1')
    task.on_finish = on_finish

    dag.build()

    captured = capsys.readouterr()
    assert captured.out == 'running on finish\n'
def test_python_callable_with_file():
    """After rendering, product and source have the expected string forms."""
    task = PythonCallable(touch, File('file.txt'), DAG(), name='name')
    task.render()

    expected_source = ('def touch(product):\n '
                       'Path(str(product)).touch()\n')

    assert str(task.product) == 'file.txt'
    assert str(task.source) == expected_source
def _make_dag_with_upstream():
    """Return a DAG with tasks ``root``, ``2`` and ``3`` where root -> 3.

    Task ``2`` has no dependencies. ``File`` products use a
    ``LocalStorageClient`` pointing at ``remote``.
    """
    # build in the same process so that mock objects register their calls
    in_process = Serial(build_in_subprocess=False)
    dag = DAG(executor=in_process)
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    root = PythonCallable(_touch, File('1.txt'), dag=dag, name='root')
    PythonCallable(_touch, File('2.txt'), dag=dag, name=2)
    downstream = PythonCallable(_touch_upstream,
                                File('3.txt'),
                                dag=dag,
                                name=3)
    root >> downstream

    return dag
def test_building_a_single_task_when_rendered_upstream():
    """A downstream task can be built directly once the DAG is rendered."""
    dag = DAG()
    upstream_task = PythonCallable(touch, File('1.txt'), dag, name=1)
    downstream_task = PythonCallable(touch_w_upstream,
                                     File('2.txt'),
                                     dag,
                                     name=2)
    upstream_task >> downstream_task

    dag.render()

    downstream_task.build()
def test_on_failure_exceptions_are_logged(executor, caplog):
    """An exception raised inside an on_failure hook is logged as an error.

    The DAG is built with the ``executor`` fixture so that the check runs
    against every executor the fixture provides, instead of a hard-coded one.
    The task itself fails, which triggers the (crashing) on_failure hook; the
    build is expected to raise ``DAGBuildError``.
    """
    dag = DAG(executor=executor)
    task = PythonCallable(fn_that_fails, File('file.txt'), dag, name='t')
    task.on_failure = hook_crashing

    with caplog.at_level(logging.ERROR), pytest.raises(DAGBuildError):
        dag.build()

    assert 'Exception when running on_failure for task "t"' in caplog.text
def test_on_finish_exceptions_are_logged(executor, tmp_directory, caplog):
    """An exception raised inside an on_finish hook is logged as an error."""
    dag = DAG(executor=executor)
    task = PythonCallable(fn, File('file.txt'), dag, name='t')
    task.on_finish = hook_crashing

    with caplog.at_level(logging.ERROR), pytest.raises(DAGBuildError):
        dag.build()

    expected_message = 'Exception when running on_finish for task "t"'
    assert expected_message in caplog.text
def test_on_render_exceptions_are_logged(executor, caplog):
    """An exception raised inside an on_render hook is logged as an error."""
    dag = DAG(executor=executor)
    task = PythonCallable(fn, File('file.txt'), dag, name='t')
    task.on_render = hook_crashing

    with caplog.at_level(logging.ERROR), pytest.raises(DAGRenderError):
        dag.render()

    expected_message = 'Exception when running on_render for task "t"'
    assert expected_message in caplog.text
def test_load_from_metaproduct(tmp_directory):
    """load(key=...) returns something for one product of a multi-product task."""
    Path('a.csv').write_text('a,b\n1,2')

    products = {'a': File('a.csv'), 'b': File('b')}
    task = PythonCallable(touch_meta, products, DAG())

    loaded = task.load(key='a')
    assert loaded is not None
def test_pythoncallable(tmp_directory, product, kwargs):
    """task.load(**kwargs) returns a frame equal to the one saved to disk."""
    expected = pd.DataFrame({'a': [1, 2, 3]})
    expected.to_csv('my_file.csv', index=False)

    def callable_(product):
        pass

    task = PythonCallable(callable_, product, DAG(), name='task')

    assert expected.equals(task.load(**kwargs))
def test_build_partially_with_wildcard(tmp_directory):
    """build_partially('a-*') only produces the files of the matching tasks."""
    dag = DAG(executor=Serial(build_in_subprocess=False))

    # three independent tasks; each one writes <name>.txt
    for task_name in ('a-1', 'a-2', 'b'):
        PythonCallable(touch_root,
                       File(task_name + '.txt'),
                       dag,
                       name=task_name)

    dag.build_partially('a-*')

    assert Path('a-1.txt').exists()
    assert Path('a-2.txt').exists()
    assert not Path('b.txt').exists()
def test_python_callable_with_file():
    """Rendering fills the ``{{name}}`` placeholder in the product path."""
    task = PythonCallable(my_fn,
                          File('/path/to/{{name}}'),
                          DAG(),
                          name='name',
                          params=dict(name='file'))
    task.render()

    rendered_product = str(task.product)
    rendered_source = str(task.source)

    assert rendered_product == '/path/to/file'
    assert rendered_source == 'def my_fn(product, upstream):\n pass\n'
def test_task_report_after_building(tmp_directory):
    """The report returned by task.build() has the expected entries."""
    task = PythonCallable(touch_root,
                          File('some_file.txt'),
                          DAG(),
                          name='task')
    task.render()

    build_report = task.build()

    assert build_report['Ran?']
    assert build_report['Elapsed (s)']
    assert build_report['name'] == 'task'
def test_parallel_execution(tmp_directory):
    """Build a small DAG, (a1 + a2) -> b -> c, with the parallel executor."""
    dag = DAG('dag', executor='parallel')

    root_one = PythonCallable(touch_root, File('a1.txt'), dag, 'a1')
    root_two = PythonCallable(touch_root, File('a2.txt'), dag, 'a2')
    middle = PythonCallable(touch, File('b.txt'), dag, 'b')
    last = PythonCallable(touch, File('c.txt'), dag, 'c')

    (root_one + root_two) >> middle >> last

    dag.build()
def test_warnings_are_shown(tmp_directory):
    """Warnings from both tasks surface as one warning after the build.

    Exactly one warning must be recorded during ``dag.build()``, and its
    message must contain the text emitted by each of the two tasks.
    """
    # function-scope import: keeps this test self-contained
    import warnings

    dag = DAG(executor=Serial(build_in_subprocess=False))
    t1 = PythonCallable(touch_root_w_warning, File('file.txt'), dag)
    t2 = PythonCallable(touch_w_warning, File('file2.txt'), dag)
    t1 >> t2

    # pytest.warns(None) is deprecated since pytest 7 and rejected by
    # pytest 8, so record warnings with the standard library instead
    with warnings.catch_warnings(record=True) as record:
        # make sure no warning filter hides what the build emits
        warnings.simplefilter('always')
        dag.build()

    assert len(record) == 1

    message = str(record[0].message)
    assert 'This is a warning' in message
    assert 'This is another warning' in message
def test_keeps_folder_layout(tmp_directory):
    """Products and metadata land in ``backup`` under the same relative paths."""
    dag = DAG(executor=Serial(build_in_subprocess=False))
    dag.clients[File] = LocalStorageClient('backup', path_to_project_root='.')

    Path('dir').mkdir()

    PythonCallable(_touch, File('file'), dag, name='task')
    PythonCallable(_touch, File('dir/nested'), dag, name='nested')

    dag.build()

    expected_in_backup = [
        ('dir', 'nested'),
        ('dir', '.nested.metadata'),
        ('file', ),
        ('.file.metadata', ),
    ]

    for parts in expected_in_backup:
        assert Path('backup', *parts).is_file()
def test_tracebacks_are_shown_for_all_on_build_failing_tasks(executor):
    """The build error text mentions every task that failed."""
    dag = DAG(executor=executor)
    PythonCallable(failing_root, File('a_file.txt'), dag, name='t1')
    PythonCallable(failing_root, File('another_file.txt'), dag, name='t2')

    with pytest.raises(DAGBuildError) as excinfo:
        dag.build()

    # getrepr() gives the full text, including chained exceptions
    full_text = str(excinfo.getrepr())

    assert "PythonCallable: t1 -> File('a_file.txt')" in full_text
    assert "PythonCallable: t2 -> File('another_file.txt')" in full_text
def _make_dag_with_two_upstream():
    """Return a DAG where ``task`` depends on both ``root`` and ``another``.

    The DAG builds in-process with a ``Serial`` executor and uses a
    ``LocalStorageClient`` pointing at ``remote`` for ``File`` products.
    """
    in_process = Serial(build_in_subprocess=False)
    dag = DAG(executor=in_process)
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    first_parent = PythonCallable(_touch, File('root'), dag=dag, name='root')
    second_parent = PythonCallable(_touch,
                                   File('another'),
                                   dag=dag,
                                   name='another')
    child = PythonCallable(_touch_upstream,
                           File('file.txt'),
                           dag=dag,
                           name='task')

    (first_parent + second_parent) >> child

    return dag