예제 #1
0
def test_metadata_is_synced_when_executing_in_subprocess(tmp_directory):
    """Check product metadata is available after a subprocess build.

    The task runs with ``build_in_subprocess=True``; once the build
    finishes, the metadata object held by the task's product in this
    process must have its ``_data`` populated.
    """
    executor = Serial(build_in_subprocess=True)
    dag = DAG(executor=executor)
    task = PythonCallable(touch_root, File('file.txt'), dag)

    dag.build()

    metadata = task.product.metadata
    assert metadata._data is not None
예제 #2
0
def test_from_params_resolves_paths_in_metaproduct(tmp_directory):
    """Check paths of a dict product in a group created with from_params.

    Two tasks are generated from ``params_array``; each product entry
    must resolve to the same location as the template file name with a
    ``-<index>`` suffix before the extension.
    """
    def touch(product, param):
        for key in ('one', 'another'):
            Path(product[key]).touch()

    dag = DAG(executor=Serial(build_in_subprocess=False))
    product_templates = {'one': 'one.txt', 'another': 'another.txt'}
    TaskGroup.from_params(PythonCallable,
                          File,
                          product_templates, {'source': touch},
                          dag,
                          name='task_group',
                          params_array=[{'param': 1}, {'param': 2}],
                          resolve_relative_to='')

    # Windows cannot resolve a path whose file does not exist yet, so the
    # pipeline is executed first to create every product
    dag.build()

    expected = [
        ('task_group0', 'one', 'one-0.txt'),
        ('task_group0', 'another', 'another-0.txt'),
        ('task_group1', 'one', 'one-1.txt'),
        ('task_group1', 'another', 'another-1.txt'),
    ]

    for task_name, key, filename in expected:
        actual = Path(dag[task_name].product[key]).resolve()
        assert actual == Path(filename).resolve()
예제 #3
0
def make_training():
    """Build and return the training DAG.

    The pipeline has a "get" task that writes ``output/get.csv``, the
    tasks registered by ``add_features`` and a "fit" task that writes
    ``output/model.pickle`` and runs after the task named ``join``.
    """
    out_dir = Path('output')

    # build_in_subprocess=False: multiprocessing does not play well with
    # functions defined in the main module (it works when they live in a
    # separate module)
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)

    # "get" task: produces the training data
    PythonCallable(get,
                   File(out_dir / 'get.csv'),
                   dag,
                   serializer=serializer,
                   unserializer=unserializer)

    # feature tasks
    add_features(dag)

    # "fit" task: trains the model once features are joined
    model_task = PythonCallable(fit, File(out_dir / 'model.pickle'), dag)
    dag['join'] >> model_task

    return dag
예제 #4
0
def test_lazy_load_dag_level_client(tmp_directory, tmp_imports,
                                    my_testing_module, client_spec):
    """Check lazy import with a client declared at the DAG level.

    With ``lazy_import=True`` the module holding the task must only be
    imported when the DAG is built, and the build must leave a copy of
    the product under ``backup/``.
    """
    module_name = 'my_testing_module'

    task_spec = {
        'source': 'my_testing_module.task',
        'product': 'output.csv',
    }
    spec = DAGSpec({
        'tasks': [task_spec],
        'clients': {
            'File': client_spec
        },
    },
                   lazy_import=True)

    dag = spec.to_dag()
    dag.executor = Serial(build_in_subprocess=False)

    # lazy_import=True: converting the spec to a dag must not import
    # the module
    assert module_name not in sys.modules

    dag.build()

    # building imports it
    assert module_name in sys.modules
    assert Path('backup', 'output.csv').exists()
예제 #5
0
def test_lazy_load_product_level_client(tmp_directory, tmp_imports,
                                        my_testing_module, client_spec):
    """Check lazy import with clients declared at the task level.

    A SQL script task declares both ``client`` and ``product_client``;
    with ``lazy_import=True`` the testing module must only be imported
    when the DAG is built.
    """
    module_name = 'my_testing_module'

    sql_source = """
CREATE TABLE {{product}} AS SELECT * FROM my_table
"""
    Path('script.sql').write_text(sql_source)

    # source table read by the script
    with sqlite3.connect('my.db') as conn:
        pd.DataFrame({'x': range(5)}).to_sql('my_table', conn)

    task_spec = {
        'source': 'script.sql',
        'product': [None, 'name', 'table'],
        'client': client_spec,
        'product_client': client_spec,
        'product_class': 'GenericSQLRelation',
    }
    spec = DAGSpec({'tasks': [task_spec]}, lazy_import=True)

    dag = spec.to_dag()
    dag.executor = Serial(build_in_subprocess=False)

    # lazy_import=True: converting the spec to a dag must not import
    # the module
    assert module_name not in sys.modules

    dag.build()

    # building imports it
    assert module_name in sys.modules
예제 #6
0
def test_grid_and_upstream_wildcard_callables(spec_raw, tmp_directory,
                                              add_current_to_sys_path,
                                              no_sys_modules_cache):
    """Check a callable can reach its upstream tasks through a wildcard.

    Writes a module with the callables, builds the DAG described by
    ``spec_raw`` and verifies the task names and the keys stored under
    ``upstream['upstream-*']`` in the downstream task.
    """
    callables_source = """
from pathlib import Path

def unserializer(product):
    return Path(product).read_text()

def upstream(product, param):
    Path(product).touch()

def downstream(product, upstream):
    up = upstream['upstream-*']
    one = up['upstream-0']
    another = up['upstream-1']
    Path(product).touch()
"""
    Path('sample_source_callables.py').write_text(callables_source)

    dag = DAGSpec(spec_raw).to_dag().render()
    # running in the same process is faster
    dag.executor = Serial(build_in_subprocess=False)

    # the build also exercises unserialization
    dag.build()

    upstream_names = {'upstream-0', 'upstream-1'}
    wildcard = dag['downstream'].params['upstream']['upstream-*']

    assert set(dag) == upstream_names | {'downstream'}
    assert set(wildcard) == upstream_names
예제 #7
0
def dag():
    """Return a two-task DAG (``root`` >> ``task``) with a File client.

    The client is a ``LocalStorageClient`` pointing to ``remote`` and the
    DAG runs tasks in the current process.
    """
    pipeline = DAG(executor=Serial(build_in_subprocess=False))
    pipeline.clients[File] = LocalStorageClient('remote',
                                                path_to_project_root='.')

    upstream_task = PythonCallable(_touch,
                                   File('root'),
                                   dag=pipeline,
                                   name='root')
    downstream_task = PythonCallable(_touch_upstream,
                                     File('file'),
                                     dag=pipeline,
                                     name='task')
    upstream_task >> downstream_task

    return pipeline
예제 #8
0
def make_dag(env, params):
    """Return a DAG with a dump task followed by an upload task.

    Both ``SQLUpload`` and ``SQLiteRelation`` get their own
    ``SQLAlchemyClient`` created from ``env.db_uri``. ``params`` is
    accepted but not used in this function.
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)

    # one independent client per class, both pointing to the same URI
    for class_ in (SQLUpload, SQLiteRelation):
        dag.clients[class_] = SQLAlchemyClient(env.db_uri)

    dump_task = make_task_dump(dag)
    upload_task = make_task_upload(dag)
    dump_task >> upload_task

    return dag
예제 #9
0
def test_creates_parent_dirs(tmp_directory):
    """Build a task whose File product lives in a nested directory.

    The product's parent directories are not created beforehand, so the
    build is expected to succeed without them existing. Returns the DAG.
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)

    nested_product = File('some/nested/product.txt')
    PythonCallable(touch, nested_product, dag=dag)

    dag.build()

    return dag
예제 #10
0
def test_dag_on_render_with_params(tmp_directory, tmp_imports,
                                   write_dag_hooks_spec):
    """Check that rendering the DAG leaves 'on render' in the hook file."""
    spec = DAGSpec('pipeline.yaml')
    dag = spec.to_dag()
    dag.executor = Serial(build_in_subprocess=False)

    dag.render()

    hook_output = Path('hook').read_text()
    assert hook_output == 'on render'
예제 #11
0
 def make():
     """Return a DAG with one task that receives a fresh object() param."""
     executor = Serial(build_in_subprocess=False)
     dag = DAG(executor=executor)

     # a new object() is created on every call to make()
     task_params = {'some_param': object()}
     PythonCallable(touch_root_w_param,
                    File('1.txt'),
                    dag,
                    name='first',
                    params=task_params)

     return dag
예제 #12
0
def _make_dag_with_upstream():
    """Return a three-task DAG where task ``3`` depends on ``root``.

    Task ``2`` has no dependencies. A ``LocalStorageClient`` pointing to
    ``remote`` is registered for File products.
    """
    # tasks must run in this process so calls reach the mock object
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    root_task = PythonCallable(_touch, File('1.txt'), dag=dag, name='root')
    PythonCallable(_touch, File('2.txt'), dag=dag, name=2)
    dependent = PythonCallable(_touch_upstream,
                               File('3.txt'),
                               dag=dag,
                               name=3)
    root_task >> dependent

    return dag
예제 #13
0
    def make():
        """Return a DAG with one task that declares a file resource."""
        executor = Serial(build_in_subprocess=False)
        dag = DAG(executor=executor)

        task_params = {'resources_': {'file': 'resource.txt'}}
        PythonCallable(task_with_resource,
                       File('output'),
                       dag,
                       params=task_params)

        return dag
예제 #14
0
def test_build_partially_with_wildcard(tmp_directory):
    """Check build_partially('a-*') only builds the tasks matching it."""
    dag = DAG(executor=Serial(build_in_subprocess=False))

    # each task writes a file named after the task
    for task_name in ('a-1', 'a-2', 'b'):
        PythonCallable(touch_root,
                       File(task_name + '.txt'),
                       dag,
                       name=task_name)

    dag.build_partially('a-*')

    for matched in ('a-1.txt', 'a-2.txt'):
        assert Path(matched).exists()

    assert not Path('b.txt').exists()
예제 #15
0
def test_pipeline(test_env, force):
    """Build the pipeline created from ``test_env`` (a data sample)."""
    # Executor tailored for testing: with the default settings the
    # debugger does not start at the line that raised the exception; with
    # these settings it does. To see it in action, raise an exception in
    # any PythonCallable task and run pytest --pdb
    debug_executor = Serial(build_in_subprocess=False,
                            catch_exceptions=False)

    dag = pipeline._make(test_env)
    dag.executor = debug_executor

    dag.build(force=force)
예제 #16
0
def test_creates_parent_dirs_meta_product(tmp_directory):
    """Build a task whose dict product has files in nested directories.

    Neither parent directory is created beforehand, so the build is
    expected to succeed without them existing. Returns the DAG.
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)

    products = {
        'one': File('some/nested/product.txt'),
        'another': File('some/another/product.txt'),
    }
    PythonCallable(touch_meta, products, dag=dag)

    dag.build()

    return dag
예제 #17
0
def test_keeps_folder_layout(tmp_directory):
    """Check the ``backup`` folder mirrors the layout of the products.

    After the build, every product and its ``.<name>.metadata`` file must
    exist under ``backup`` at the same relative location.
    """
    dag = DAG(executor=Serial(build_in_subprocess=False))
    dag.clients[File] = LocalStorageClient('backup', path_to_project_root='.')

    Path('dir').mkdir()
    PythonCallable(_touch, File('file'), dag, name='task')
    PythonCallable(_touch, File('dir/nested'), dag, name='nested')

    dag.build()

    expected_in_backup = [
        ('dir', 'nested'),
        ('dir', '.nested.metadata'),
        ('file', ),
        ('.file.metadata', ),
    ]

    for parts in expected_in_backup:
        assert Path('backup', *parts).is_file()
예제 #18
0
def _make_dag_with_two_upstream():
    """Return a DAG where ``task`` depends on ``root`` and ``another``.

    A ``LocalStorageClient`` pointing to ``remote`` is registered for
    File products and tasks run in the current process.
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    first = PythonCallable(_touch, File('root'), dag=dag, name='root')
    second = PythonCallable(_touch, File('another'), dag=dag, name='another')
    downstream = PythonCallable(_touch_upstream,
                                File('file.txt'),
                                dag=dag,
                                name='task')

    # both tasks are upstream dependencies of the last one
    (first + second) >> downstream

    return dag
예제 #19
0
def test_warnings_are_shown(tmp_directory):
    """Check warnings emitted by tasks are shown as a single warning.

    Two chained tasks are built; the build must produce exactly one
    recorded warning whose message contains the text of both task
    warnings.
    """
    dag = DAG(executor=Serial(build_in_subprocess=False))
    t1 = PythonCallable(touch_root_w_warning, File('file.txt'), dag)
    t2 = PythonCallable(touch_w_warning, File('file2.txt'), dag)
    t1 >> t2

    # pytest.warns(None) was deprecated in pytest 7 and removed in
    # pytest 8. pytest.warns(Warning) still records warnings of any
    # category and works across pytest versions; it fails by itself when
    # nothing is emitted, which the length check below required anyway
    with pytest.warns(Warning) as record:
        dag.build()

    assert len(record) == 1

    message = str(record[0].message)
    assert 'This is a warning' in message
    assert 'This is another warning' in message
예제 #20
0
def test_unserializes_upstream_metaproduct(tmp_directory):
    """Build a DAG whose downstream task has an upstream dict product.

    The DAG-level unserializer is ``metaproduct_unserializer``; the test
    passes if the build finishes without raising.
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    dag.unserializer = metaproduct_unserializer

    first_products = {'one': File('one'), 'another': File('another')}
    first = PythonCallable(touch_meta, first_products, dag=dag, name='first')
    last = PythonCallable(touch_with_first_as_upstream,
                          File('last'),
                          dag=dag)
    first >> last

    dag.build()
예제 #21
0
def make_larger_dag_with_client():
    """Return a DAG with a three-task chain and a File client.

    The chain is ``root`` >> ``task`` >> ``another``; every product is
    under ``out/`` and the client is a ``LocalStorageClient`` pointing to
    ``remote``.
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    first = PythonCallable(touch_root, File('out/root'), dag=dag, name='root')
    second = PythonCallable(touch, File('out/file'), dag=dag, name='task')
    third = PythonCallable(touch,
                           File('out/another'),
                           dag=dag,
                           name='another')
    first >> second >> third

    return dag
예제 #22
0
def test_build_partially_with_wildcard_skip_upstream(tmp_directory):
    """Check build_partially with a wildcard and skip_upstream=True.

    ``a-1`` depends on ``root``; with ``skip_upstream=True`` only the
    tasks matching ``a-*`` must be executed, so ``root`` and ``b`` must
    not produce their files.
    """
    dag = DAG(executor=Serial(build_in_subprocess=False))

    upstream_task = PythonCallable(touch_root,
                                   File('root.txt'),
                                   dag,
                                   name='root')
    matched_task = PythonCallable(touch, File('a-1.txt'), dag, name='a-1')
    upstream_task >> matched_task

    # independent tasks
    PythonCallable(touch_root, File('a-2.txt'), dag, name='a-2')
    PythonCallable(touch_root, File('b.txt'), dag, name='b')

    dag.build_partially('a-*', skip_upstream=True)

    # the upstream dependency is skipped
    assert not Path('root.txt').exists()
    # matching tasks are executed
    assert Path('a-1.txt').exists()
    assert Path('a-2.txt').exists()
    # non-matching task is not executed
    assert not Path('b.txt').exists()
예제 #23
0
def make_dag_with_client_and_metaproduct():
    """Return a DAG with a File client whose root task has a dict product.

    ``root`` produces ``out/root`` and ``out/another`` and is the
    upstream dependency of ``task``.
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    dag.clients[File] = LocalStorageClient('remote', path_to_project_root='.')

    root_products = {
        'root': File('out/root'),
        'another': File('out/another'),
    }
    upstream_task = PythonCallable(touch_root_with_metaproduct,
                                   root_products,
                                   dag=dag,
                                   name='root')
    downstream_task = PythonCallable(touch,
                                     File('file'),
                                     dag=dag,
                                     name='task')
    upstream_task >> downstream_task

    return dag
예제 #24
0
def test_dag_on_failure_with_params(tmp_directory, tmp_imports,
                                    write_dag_hooks_spec):
    """Check that a failed build leaves 'on failure' in the hook file.

    ``my_module.touch`` is overwritten with a function that always
    raises, so ``dag.build()`` must fail with ``DAGBuildError``.
    """
    failing_source = """
def touch(product):
    raise Exception
"""
    Path('my_module.py').write_text(failing_source)

    spec = DAGSpec('pipeline.yaml')
    dag = spec.to_dag()
    dag.executor = Serial(build_in_subprocess=False)

    with pytest.raises(DAGBuildError):
        dag.build()

    hook_output = Path('hook').read_text()
    assert hook_output == 'on failure'
예제 #25
0
def test_change_static_analysis(tmp_sample_tasks):
    """Check static analysis can be turned off after creating the task.

    The test passes if ``dag.render()`` does not raise once
    ``static_analysis`` is set to ``False``.
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)

    # with the default (static_analysis=True), these params are expected
    # to make rendering fail
    task = NotebookRunner(Path('sample.ipynb'),
                          File('out.ipynb'),
                          dag,
                          params={
                              'a': 1,
                              'b': 2
                          })

    # turn it off...
    task.static_analysis = False

    # ...so rendering works
    dag.render()
예제 #26
0
def test_warns_if_export_args_but_ipynb_output(tmp_sample_tasks):
    """Check a warning is shown when export kwargs target an .ipynb output.

    At least one ``UserWarning`` recorded during the build must mention
    that the output is a notebook file.
    """
    expected_text = "Output 'out.ipynb' is a notebook file"

    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)

    NotebookRunner(Path('sample.ipynb'),
                   File('out.ipynb'),
                   dag,
                   nbconvert_export_kwargs={'exclude_input': True})

    with pytest.warns(UserWarning) as records:
        dag.build()

    # NOTE: sometimes two records show up (possibly a warning from another
    # library), so look for the message in any of them
    first_args = (record.message.args[0] for record in records)
    assert any(expected_text in arg for arg in first_args)
예제 #27
0
def _make_dag_with_metaproduct(with_client=True):
    """Return a DAG where ``task`` (dict product) depends on ``root``.

    Parameters
    ----------
    with_client : bool, default=True
        If True, registers a ``LocalStorageClient`` pointing to
        ``remote`` for File products
    """
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)

    if with_client:
        client = LocalStorageClient('remote', path_to_project_root='.')
        dag.clients[File] = client

    upstream_task = PythonCallable(_touch,
                                   File('root'),
                                   dag=dag,
                                   name='root')
    products = {'one': File('file.txt'), 'another': File('another.txt')}
    downstream_task = PythonCallable(_touch_upstream,
                                     products,
                                     dag=dag,
                                     name='task')
    upstream_task >> downstream_task

    return dag
예제 #28
0
 def _make():
     """Return a subprocess-built DAG: ``task`` (two files) >> ``another``.

     A ``LocalStorageClient`` pointing to ``backup`` is registered for
     File products.
     """
     executor = Serial(build_in_subprocess=True)
     dag = DAG(executor=executor)
     dag.clients[File] = LocalStorageClient('backup',
                                            path_to_project_root='.')

     products = {'one': File('one'), 'two': File('two')}
     upstream_task = PythonCallable(_touch_many, products, dag, name='task')
     downstream_task = PythonCallable(_touch_upstream,
                                      File('three'),
                                      dag,
                                      name='another')
     upstream_task >> downstream_task

     return dag
def test_dag_without_client(monkeypatch, tmp_directory):
    """Check rendering a DAG without clients does not fetch remote data.

    ``fetch_remote_metadata_in_parallel`` must still be called once with
    the DAG, but ``_RemoteFile._fetch_remote_metadata`` must never be
    called.
    """
    # wrap the real function to record calls while keeping its behavior
    fetch_spy = Mock(wraps=dag_module.fetch_remote_metadata_in_parallel)
    monkeypatch.setattr(dag_module, 'fetch_remote_metadata_in_parallel',
                        fetch_spy)

    remote_fetch = Mock()
    monkeypatch.setattr(file._RemoteFile, '_fetch_remote_metadata',
                        remote_fetch)

    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    PythonCallable(touch_root, File('one'), dag=dag)

    dag.render()

    # the parallel fetch is triggered...
    fetch_spy.assert_called_once_with(dag)
    # ...but no remote is contacted
    remote_fetch.assert_not_called()
예제 #30
0
def test_attempts_to_download_on_each_build(tmp_directory, monkeypatch):
    """Check every build calls the product's download method once more."""
    # tasks must run in this process, otherwise calls to the mock object
    # cannot be observed
    executor = Serial(build_in_subprocess=False)
    dag = DAG(executor=executor)
    product = File('file.txt')
    PythonCallable(_touch, product, dag=dag)

    download_spy = Mock(wraps=product.download)
    monkeypatch.setattr(File, 'download', download_spy)

    # download happens on each dag.render(), which dag.build() calls. The
    # second build must attempt it again because the remote files could
    # have been modified in the meantime
    for expected_calls in (1, 2):
        dag.build()
        assert product.download.call_count == expected_calls