CREATE TABLE {{product}} AS SELECT *, pH > AVG(pH) AS high_pH FROM {{upstream['upload']}} """ features = SQLScript(sql, product=SQLiteRelation((None, 'features', 'table')), dag=dag, name='features') red_task >> concat_task white_task >> concat_task concat_task >> upload_task >> features ############################################################################### # render will pass all parameters so you can see exactly which SQL code # will be executed dag.render() ############################################################################### # print source code for task "features" print(dag['features'].source_code) dag.plot(output='matplotlib') dag.build()
df = pd.read_csv(str(upstream['dump'])) df['a'] = df['a'] + 1 df.to_csv(str(product), index=False) # we convert the Python function into a Task task_add_one = PythonCallable(_add_one, File(tmp_dir / 'add_one.csv'), dag, name='add_one') # declare how tasks relate to each other: first dump then add one task_dump >> task_add_one # plot the workflow, pending tasks are shown in red dag.plot(output='matplotlib', clear_cached_status=True) # run our sample pipeline dag.build() ############################################################################### # Each time the DAG is run it will save the current timestamp and the # source code of each task, next time we run it it will only run the # necessary tasks to get everything up-to-date, there is a simple rule to # that: a task will run if its code (or the code from any dependency) has # changed since the last time it ran. # Data processing pipelines consist on many small long-running tasks which # depend on each other. During early development phases things are expected to
# Register the same client for both products and scripts so every SQL task
# talks to the one SQLite database
dag.clients[SQLiteRelation] = client
dag.clients[SQLScript] = client

# Load SQL sources from disk instead of inlining them
source_loader = SourceLoader(tmp_dir)

transfer = SQLTransfer(source_loader['data_select.sql'],
                       product=SQLiteRelation((None, 'data2', 'table')),
                       dag=dag,
                       name='transfer')

subset = SQLScript(source_loader['subset_create.sql'],
                   product=SQLiteRelation((None, 'subset', 'table')),
                   dag=dag,
                   name='subset')

# transfer the data first, then derive the subset from it
transfer >> subset

dag.render()

###############################################################################
# Our macro is correctly rendered:
print(dag['subset'].source)

###############################################################################
# Plot and execute pipeline:
dag.plot()

dag.build()
# build training pipeline dag_fit = DAG() get = PythonCallable(_get, File(tmp_dir / 'data.parquet'), dag_fit, name='get') dag_fit = add_fts(dag_fit) fit = PythonCallable(_fit, { 'report': File(tmp_dir / 'report.txt'), 'model': File(tmp_dir / 'model.joblib') }, dag_fit, name='fit') dag_fit['join'] >> fit ############################################################################### # Fit pipeline plot dag_fit.plot(output='matplotlib') dag_fit.build() # build prediction pipeline - pass a new observation with values [1, 0, 10, 2] dag_pred = DAG() get = PythonCallable(_new_obs, File(tmp_dir / 'obs.parquet'), dag_pred, name='get', params={'values': [1, 0, 10, 2]}) dag_pred = add_fts(dag_pred) pred = PythonCallable(_pred, File(tmp_dir / 'pred.csv'), dag_pred,