Exemplo n.º 1
0
def dag():
    """Return a DAG with two tasks, 'root' >> 'task', and a File client.

    The DAG uses a ``Serial`` executor created with
    ``build_in_subprocess=False`` and registers a ``LocalStorageClient``
    pointing to 'remote' as the client for ``File`` products.
    """
    pipeline = DAG(executor=Serial(build_in_subprocess=False))
    pipeline.clients[File] = LocalStorageClient('remote',
                                                path_to_project_root='.')

    upstream_task = PythonCallable(_touch,
                                   File('root'),
                                   dag=pipeline,
                                   name='root')
    downstream_task = PythonCallable(_touch_upstream,
                                     File('file'),
                                     dag=pipeline,
                                     name='task')

    # 'task' depends on 'root'
    upstream_task >> downstream_task
    return pipeline
Exemplo n.º 2
0
def add_features(dag):
    """Attach the feature engineering tasks to an existing DAG.

    The DAG must already contain a task named "get" (it is looked up with
    ``dag['get']``); that task is set as upstream of the new feature tasks.

    Returns the same ``dag`` object that was passed in.
    """
    source_task = dag['get']
    out_dir = Path('output')

    def _new_task(fn, filename):
        # every feature task shares the DAG and the (un)serializer pair
        return PythonCallable(fn,
                              File(out_dir / filename),
                              dag,
                              serializer=serializer,
                              unserializer=unserializer)

    # create the tasks
    feature_a = _new_task(a_feature, 'a_feature.csv')
    feature_b = _new_task(another, 'another.csv')
    joined = _new_task(join, 'join.csv')

    # wire dependencies: both features depend on "get", and the join
    # depends on "get" plus both features
    source_task >> feature_a
    source_task >> feature_b
    (source_task + feature_a + feature_b) >> joined

    return dag
Exemplo n.º 3
0
def test_load_error_if_task_has_metaproduct_and_no_key():
    """Calling load() with no key on a task whose product is a dict of
    Files must raise ValueError.
    """
    dag = DAG()
    products = {'a': File('a'), 'b': File('b')}
    meta_task = PythonCallable(touch_meta, products, dag)

    with pytest.raises(ValueError):
        meta_task.load()
Exemplo n.º 4
0
def test_copy(copy):
    """Check whether InMemoryDAG.build hands downstream tasks the same
    object returned by upstream tasks, depending on the ``copy`` flag.
    """
    def _assign_upstream(upstream):
        # keep a reference to what the task received so it can be
        # compared against the build output afterwards
        _assign_upstream.obj = upstream
        return 42

    source_dag = DAG()

    root_task = PythonCallable(
        _root,
        File('root.parquet'),
        source_dag,
        name='root',
        serializer=serializer,
        params={'input_data': {'x': [0, 0, 0]}},
    )
    child_task = PythonCallable(
        _assign_upstream,
        File('task.parquet'),
        source_dag,
        name='task',
        unserializer=unserializer,
        serializer=serializer,
    )
    root_task >> child_task

    in_memory = InMemoryDAG(source_dag)
    result = in_memory.build({'root': {'x': [1]}}, copy=copy)

    # without copying, the downstream task must have received the very
    # object that "root" returned; with copying, it must be a different one
    received = _assign_upstream.obj['root']
    is_same_object = received is result['root']
    assert is_same_object is (not copy)
Exemplo n.º 5
0
def make_training():
    """Create and return the training DAG.

    The DAG contains a "get" task, the feature tasks added by
    ``add_features`` and a "fit" task that runs after the "join" task.
    """
    # build_in_subprocess=False: Python does not like multiprocessing with
    # functions defined in the main module (it works when they live in a
    # different module)
    executor = Serial(build_in_subprocess=False)
    training_dag = DAG(executor=executor)

    out_dir = Path('output')

    # "get" task: returns the training data
    PythonCallable(get,
                   File(out_dir / 'get.csv'),
                   training_dag,
                   serializer=serializer,
                   unserializer=unserializer)

    # feature engineering tasks
    add_features(training_dag)

    # "fit" task: trains the model once the features are joined
    fit_task = PythonCallable(fit, File(out_dir / 'model.pickle'),
                              training_dag)
    training_dag['join'] >> fit_task

    return training_dag
Exemplo n.º 6
0
def test_cycle_exception():
    """Building a DAG whose two tasks depend on each other raises DAGCycle."""
    dag = DAG()
    first = PythonCallable(touch_root, File(Path("a.txt")), dag, name="ta")
    second = PythonCallable(touch, File(Path("b.txt")), dag, name="tb")

    # ta -> tb -> ta closes the loop
    first >> second >> first

    with pytest.raises(DAGCycle):
        dag.build()
Exemplo n.º 7
0
def test_on_finish(tmp_directory):
    """Build a single-task DAG that has an on_finish hook attached."""
    dag = DAG()
    touch_task = PythonCallable(touch, File('file'), dag, name='touch')
    touch_task.on_finish = on_finish

    dag.build()
Exemplo n.º 8
0
def test_executor_keeps_running_until_no_more_tasks_can_run(
        executor, tmp_directory):
    """A failing chain must not prevent an independent task from running.

    The DAG has the chain t_fail >> t_fail_downstream >> t_touch_aborted and
    an unrelated task t_ok. After building, none of the products in the
    chain should exist, while the product of t_ok should.
    """
    dag = DAG(executor=executor)
    t_fail = PythonCallable(failing_root, File('t_fail'), dag, name='t_fail')
    t_fail_downstream = PythonCallable(failing,
                                       File('t_fail_downstream'),
                                       dag,
                                       name='t_fail_downstream')
    t_touch_aborted = PythonCallable(touch,
                                     File('t_touch_aborted'),
                                     dag,
                                     name='t_touch_aborted')

    t_fail >> t_fail_downstream >> t_touch_aborted

    # independent task, has no relationship with the failing chain
    PythonCallable(touch_root, File('t_ok'), dag, name='t_ok')

    try:
        dag.build(force=True)
    except DAGBuildError:
        # a build error is tolerated here: this test only cares about
        # which products exist once the build is over
        pass

    assert not Path('t_fail').exists()
    assert not Path('t_fail_downstream').exists()
    # the last task in the failing chain must not have produced anything
    assert not Path('t_touch_aborted').exists()
    assert Path('t_ok').exists()
Exemplo n.º 9
0
def test_runs_on_finish(executor, tmp_directory):
    """on_finish hooks run once per task after a build and the on_failure
    hook is not run.
    """
    # reset the call counters kept on the hook functions
    for counted_hook in (hook, hook_2, hook_3, hook_4):
        counted_hook.count = 0

    dag = DAG(executor=executor)

    first = PythonCallable(fn, File('file1.txt'), dag, name='t')
    first.on_finish = hook
    first.on_failure = hook_4

    second = PythonCallable(touch_w_upstream, File('file2'), dag, name='t2')
    second.on_finish = hook_2

    third = PythonCallable(fn, File('file3'), dag, name='t3')
    third.on_finish = hook_3

    first >> second

    dag.build()

    for finish_hook in (hook, hook_2, hook_3):
        assert finish_hook.count == 1
    assert hook_4.count == 0
Exemplo n.º 10
0
def test_sucessful_execution(executor, tmp_directory):
    """A first build executes every task and creates every product; a
    second build skips every task.
    """
    dag = DAG(executor=executor)
    root = PythonCallable(touch_root, File('ok.txt'), dag, name='t1')
    left = PythonCallable(touch, File('a_file.txt'), dag, name='t2')
    right = PythonCallable(touch, File('another_file.txt'), dag, name='t3')
    last = PythonCallable(touch,
                          File('yet_another_file.txt'),
                          dag,
                          name='t4')
    PythonCallable(touch_root, File('file.txt'), dag, name='t5')

    root >> left
    root >> right
    (left + right) >> last

    dag.build()

    expected_files = (
        'ok.txt',
        'a_file.txt',
        'another_file.txt',
        'yet_another_file.txt',
        'file.txt',
    )
    for filename in expected_files:
        assert Path(filename).exists()

    statuses = {t.exec_status for t in dag.values()}
    assert statuses == {TaskStatus.Executed}
    outdated = {t.product._is_outdated() for t in dag.values()}
    assert outdated == {False}

    # second build: everything is up-to-date, so nothing is executed
    dag.build()

    statuses_after_rebuild = {t.exec_status for t in dag.values()}
    assert statuses_after_rebuild == {TaskStatus.Skipped}
Exemplo n.º 11
0
def test_hot_reload(backup_test_pkg, tmp_directory):
    """With hot_reload enabled, the build uses the function sources written
    to functions.py after the tasks were created.
    """
    configurator = DAGConfigurator()
    configurator.params.hot_reload = True
    dag = configurator.create()

    root_task = PythonCallable(functions.touch_root, File('file1.txt'), dag)
    downstream_task = PythonCallable(functions.touch_upstream,
                                     File('file2.txt'), dag)
    root_task >> downstream_task

    # overwrite the module source once the tasks already exist
    replacement_source = """
from pathlib import Path

def touch_root(product):
    Path(str(product)).write_text("hi")

def touch_upstream(product, upstream):
    Path(str(product)).write_text("hello")
    """
    functions_file = Path(backup_test_pkg, 'functions.py')
    functions_file.write_text(replacement_source)

    dag.build()

    # products must hold what the *new* function bodies write
    assert Path('file1.txt').read_text() == 'hi'
    assert Path('file2.txt').read_text() == 'hello'
Exemplo n.º 12
0
def test_upload_after_task_build(tmp_directory):
    """Building a task calls its product's upload() exactly once."""
    dag = DAG()

    file_product = File('file.txt')
    # wrap the real method so calls are recorded but still forwarded
    file_product.upload = Mock(wraps=file_product.upload)

    PythonCallable(_touch, file_product, dag=dag).build()

    file_product.upload.assert_called_once()
Exemplo n.º 13
0
def test_params_are_copied_upon_initialization():
    """Two tasks created from the same params dict must not share the same
    params object.
    """
    dag = DAG()
    shared_params = {'a': 1}

    first = PythonCallable(touch,
                           File('file'),
                           dag,
                           name='t1',
                           params=shared_params)
    second = PythonCallable(touch,
                            File('file'),
                            dag,
                            name='t2',
                            params=shared_params)

    assert first.params is not second.params
Exemplo n.º 14
0
def test_runs_on_finish(tmp_directory, capsys):
    """After a build, captured stdout is exactly 'running on finish\\n'."""
    dag = DAG()
    task = PythonCallable(fn1, File('file1.txt'), dag, name='fn1')
    task.on_finish = on_finish

    dag.build()

    captured = capsys.readouterr()
    assert captured.out == 'running on finish\n'
Exemplo n.º 15
0
def test_python_callable_with_file():
    """After rendering, the product and source of a PythonCallable convert
    to the expected strings.
    """
    dag = DAG()
    task = PythonCallable(touch, File('file.txt'), dag, name='name')
    task.render()

    expected_source = 'def touch(product):\n    Path(str(product)).touch()\n'

    assert str(task.product) == 'file.txt'
    assert str(task.source) == expected_source
Exemplo n.º 16
0
def _make_dag_with_upstream():
    """Return a DAG with three tasks ('root', 2 and 3) where 3 depends on
    'root', using a local storage client for File products.
    """
    # build in the same process, to ensure the mock object is called
    executor = Serial(build_in_subprocess=False)
    pipeline = DAG(executor=executor)
    pipeline.clients[File] = LocalStorageClient('remote',
                                                path_to_project_root='.')

    root_task = PythonCallable(_touch,
                               File('1.txt'),
                               dag=pipeline,
                               name='root')
    # task 2 has no dependencies
    PythonCallable(_touch, File('2.txt'), dag=pipeline, name=2)
    dependent_task = PythonCallable(_touch_upstream,
                                    File('3.txt'),
                                    dag=pipeline,
                                    name=3)

    root_task >> dependent_task
    return pipeline
Exemplo n.º 17
0
def test_building_a_single_task_when_rendered_upstream():
    """Render the whole DAG, then build only the downstream task."""
    dag = DAG()
    upstream_task = PythonCallable(touch, File('1.txt'), dag, name=1)
    downstream_task = PythonCallable(touch_w_upstream,
                                     File('2.txt'),
                                     dag,
                                     name=2)
    upstream_task >> downstream_task

    dag.render()
    # build the downstream task directly instead of calling dag.build()
    downstream_task.build()
Exemplo n.º 18
0
def test_on_failure_exceptions_are_logged(executor, caplog):
    """When a task fails and its on_failure hook is set to hook_crashing,
    the build raises DAGBuildError and the hook error shows up in the log.
    """
    # NOTE(review): the ``executor`` fixture is requested but the DAG is
    # created with executor='serial' - confirm whether this is intentional
    dag = DAG(executor='serial')
    failing_task = PythonCallable(fn_that_fails,
                                  File('file.txt'),
                                  dag,
                                  name='t')
    failing_task.on_failure = hook_crashing

    with caplog.at_level(logging.ERROR), pytest.raises(DAGBuildError):
        dag.build()

    expected = 'Exception when running on_failure for task "t"'
    assert expected in caplog.text
Exemplo n.º 19
0
def test_on_finish_exceptions_are_logged(executor, tmp_directory, caplog):
    """With on_finish set to hook_crashing, the build raises DAGBuildError
    and the hook error shows up in the log.
    """
    dag = DAG(executor=executor)
    task = PythonCallable(fn, File('file.txt'), dag, name='t')
    task.on_finish = hook_crashing

    with caplog.at_level(logging.ERROR), pytest.raises(DAGBuildError):
        dag.build()

    expected = 'Exception when running on_finish for task "t"'
    assert expected in caplog.text
Exemplo n.º 20
0
def test_on_render_exceptions_are_logged(executor, caplog):
    """With on_render set to hook_crashing, rendering raises DAGRenderError
    and the hook error shows up in the log.
    """
    dag = DAG(executor=executor)
    task = PythonCallable(fn, File('file.txt'), dag, name='t')
    task.on_render = hook_crashing

    with caplog.at_level(logging.ERROR), pytest.raises(DAGRenderError):
        dag.render()

    expected = 'Exception when running on_render for task "t"'
    assert expected in caplog.text
Exemplo n.º 21
0
def test_load_from_metaproduct(tmp_directory):
    """load(key='a') returns something other than None for a task whose
    product is a dict of Files, given that 'a.csv' exists.
    """
    # the file behind key 'a' must exist before loading
    Path('a.csv').write_text('a,b\n1,2')

    dag = DAG()
    products = {'a': File('a.csv'), 'b': File('b')}
    meta_task = PythonCallable(touch_meta, products, dag)

    loaded = meta_task.load(key='a')
    assert loaded is not None
Exemplo n.º 22
0
def test_pythoncallable(tmp_directory, product, kwargs):
    """task.load(**kwargs) returns a data frame equal to the one written to
    'my_file.csv'.
    """
    expected = pd.DataFrame({'a': [1, 2, 3]})
    expected.to_csv('my_file.csv', index=False)

    def callable_(product):
        pass

    dag = DAG()
    task = PythonCallable(callable_, product, dag, name='task')

    assert expected.equals(task.load(**kwargs))
Exemplo n.º 23
0
def test_build_partially_with_wildcard(tmp_directory):
    """build_partially('a-*') creates the products of 'a-1' and 'a-2' but
    not the product of 'b'.
    """
    dag = DAG(executor=Serial(build_in_subprocess=False))

    # each task is named after its product (minus the extension)
    for task_name in ('a-1', 'a-2', 'b'):
        PythonCallable(touch_root,
                       File(task_name + '.txt'),
                       dag,
                       name=task_name)

    dag.build_partially('a-*')

    assert Path('a-1.txt').exists()
    assert Path('a-2.txt').exists()
    assert not Path('b.txt').exists()
Exemplo n.º 24
0
def test_python_callable_with_file():
    """After rendering, the '{{name}}' placeholder in the product path is
    replaced with the value from params and the source converts to the
    expected string.
    """
    dag = DAG()
    task = PythonCallable(
        my_fn,
        File('/path/to/{{name}}'),
        dag,
        name='name',
        params={'name': 'file'},
    )
    task.render()

    expected_source = 'def my_fn(product, upstream):\n    pass\n'

    assert str(task.product) == '/path/to/file'
    assert str(task.source) == expected_source
Exemplo n.º 25
0
def test_task_report_after_building(tmp_directory):
    """The report returned by task.build() has truthy 'Ran?' and
    'Elapsed (s)' entries and the task name under 'name'.
    """
    dag = DAG()
    task = PythonCallable(touch_root,
                          File('some_file.txt'),
                          dag,
                          name='task')

    task.render()
    build_report = task.build()

    for truthy_key in ('Ran?', 'Elapsed (s)'):
        assert build_report[truthy_key]
    assert build_report['name'] == 'task'
Exemplo n.º 26
0
def test_parallel_execution(tmp_directory):
    """Build a four-task DAG, (a1 + a2) >> b >> c, with executor='parallel'."""
    dag = DAG('dag', executor='parallel')

    root_one = PythonCallable(touch_root, File('a1.txt'), dag, name='a1')
    root_two = PythonCallable(touch_root, File('a2.txt'), dag, name='a2')
    middle = PythonCallable(touch, File('b.txt'), dag, name='b')
    final = PythonCallable(touch, File('c.txt'), dag, name='c')

    # both roots feed "b", which in turn feeds "c"
    (root_one + root_two) >> middle >> final

    dag.build()
Exemplo n.º 27
0
def test_warnings_are_shown(tmp_directory):
    """Building the DAG emits exactly one warning whose message contains the
    text of the warnings from both tasks.
    """
    dag = DAG(executor=Serial(build_in_subprocess=False))
    t1 = PythonCallable(touch_root_w_warning, File('file.txt'), dag)
    t2 = PythonCallable(touch_w_warning, File('file2.txt'), dag)
    t1 >> t2

    # pytest.warns(None) is deprecated since pytest 7 and removed in
    # pytest 8; matching on the base Warning class records any warning
    with pytest.warns(Warning) as record:
        dag.build()

    assert len(record) == 1
    message = str(record[0].message)
    assert 'This is a warning' in message
    assert 'This is another warning' in message
Exemplo n.º 28
0
def test_keeps_folder_layout(tmp_directory):
    """After a build with a LocalStorageClient pointing to 'backup', the
    products and their metadata files exist under 'backup' with the same
    relative paths.
    """
    dag = DAG(executor=Serial(build_in_subprocess=False))
    dag.clients[File] = LocalStorageClient('backup',
                                           path_to_project_root='.')

    Path('dir').mkdir()
    PythonCallable(_touch, File('file'), dag, name='task')
    PythonCallable(_touch, File('dir/nested'), dag, name='nested')

    dag.build()

    backup = Path('backup')
    assert (backup / 'dir' / 'nested').is_file()
    assert (backup / 'dir' / '.nested.metadata').is_file()
    assert (backup / 'file').is_file()
    assert (backup / '.file.metadata').is_file()
Exemplo n.º 29
0
def test_tracebacks_are_shown_for_all_on_build_failing_tasks(executor):
    """When two tasks fail during a build, the DAGBuildError representation
    mentions both of them.
    """
    dag = DAG(executor=executor)
    PythonCallable(failing_root, File('a_file.txt'), dag, name='t1')
    PythonCallable(failing_root, File('another_file.txt'), dag, name='t2')

    with pytest.raises(DAGBuildError) as excinfo:
        dag.build()

    # excinfo.getrepr() returns full text of chained exceptions
    full_text = str(excinfo.getrepr())

    assert "PythonCallable: t1 -> File('a_file.txt')" in full_text
    assert "PythonCallable: t2 -> File('another_file.txt')" in full_text
Exemplo n.º 30
0
def _make_dag_with_two_upstream():
    """Return a DAG where 'task' depends on both 'root' and 'another',
    using a local storage client for File products.
    """
    executor = Serial(build_in_subprocess=False)
    pipeline = DAG(executor=executor)
    pipeline.clients[File] = LocalStorageClient('remote',
                                                path_to_project_root='.')

    first_upstream = PythonCallable(_touch,
                                    File('root'),
                                    dag=pipeline,
                                    name='root')
    second_upstream = PythonCallable(_touch,
                                     File('another'),
                                     dag=pipeline,
                                     name='another')
    downstream = PythonCallable(_touch_upstream,
                                File('file.txt'),
                                dag=pipeline,
                                name='task')

    (first_upstream + second_upstream) >> downstream
    return pipeline