from pathlib import Path

from ploomber import DAG
from ploomber.tasks import PythonCallable, SQLScript
from ploomber.products import File, SQLiteRelation
from ploomber.constants import TaskStatus


def touch_root(product):
    # helper task used below: it just creates the product file
    Path(str(product)).touch()


def test_dag_reports_sub_select_cols(sqlite_client_and_tmp_dir):
    client, _ = sqlite_client_and_tmp_dir

    dag = DAG()
    dag.clients[SQLScript] = client
    dag.clients[SQLiteRelation] = client

    PythonCallable(touch_root, File('some_file.txt'), dag, name='task')
    sql = 'CREATE TABLE {{product}} AS SELECT * FROM data'
    SQLScript(sql, SQLiteRelation(('data2', 'table')), dag, name='task2')

    # status and build reports are table-like objects: indexing with a list
    # of column names sub-selects those columns
    assert dag.status()[['name', 'Last run']]
    assert dag.build()[['Ran?', 'Elapsed (s)']]
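# The assertions above rely on the report objects returned by dag.status()
# and dag.build() accepting a list of column names. A minimal sketch of that
# sub-selection pattern (the Report class here is hypothetical, not
# ploomber's actual implementation):


class Report:
    """A tiny table-like object that supports column sub-selection"""
    def __init__(self, rows):
        # rows: list of dicts mapping column name -> value
        self.rows = rows

    def __getitem__(self, cols):
        # indexing with a list of column names returns a narrower Report
        return Report([{c: row[c] for c in cols} for row in self.rows])


# usage: narrow a report down to two columns
report = Report([{'name': 't1', 'Ran?': True, 'Elapsed (s)': 0.1}])
print(report[['name', 'Ran?']].rows)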
def test_status_cleared_after_reporting_status(executor, tmp_directory):
    # this is a pesky scenario: we try to avoid retrieving metadata when we
    # don't have to because it's slow, so we keep a local copy, but this
    # means we have to keep an eye on conditions where we must retrieve
    # again; here's one edge case
    dag = DAG(executor=executor)
    PythonCallable(touch_root, File('ok.txt'), dag, name='t1')

    # dag status requires retrieving metadata, we have a local copy now...
    dag.status()

    # building a task means saving metadata again; if the task was executed
    # in the process where the dag lives, metadata is still up-to-date
    # because to save metadata, we first have to override the local copy.
    # the edge case happens when the task is executed in a child process,
    # which means the local copy in the DAG process is now outdated and
    # should be cleared
    dag.build()

    # this should not trigger any execution, because we just built
    dag.build()

    assert set(t.exec_status for t in dag.values()) == {TaskStatus.Skipped}
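# A minimal sketch of the caching pattern the comments above describe (the
# CachedMetadata class and the fetch callable are hypothetical, not
# ploomber's actual implementation): keep a local copy because fetching
# metadata is slow, and clear it whenever a task may have run in a child
# process, since the child saves new metadata the parent's copy won't
# reflect.


class CachedMetadata:
    def __init__(self, fetch):
        self._fetch = fetch  # slow call that retrieves metadata from storage
        self._local = None   # local copy; None means "must fetch again"

    def get(self):
        # return the local copy, fetching only if it was cleared
        if self._local is None:
            self._local = self._fetch()
        return self._local

    def clear(self):
        # call this after building in a subprocess: the child process saved
        # new metadata, so the copy in this process is now outdated
        self._local = None


# usage: fetch once, reuse the copy, then invalidate after a subprocess build
cache = CachedMetadata(fetch=lambda: {'timestamp': 1})
cache.get()    # fetches
cache.get()    # uses the local copy
cache.clear()  # next get() will fetch again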
###############################################################################
# Inspecting a pipeline
# *********************
# A lot of data pipelines start as experimental projects (e.g. developing a
# Machine Learning model), which causes them to grow unpredictably. As the
# pipeline evolves, it spans dozens of files whose intent is unclear. The DAG
# object serves as the primary reference for anyone seeking to understand the
# pipeline.
# Making a pipeline transparent helps others understand it quickly without
# going through the code details, and it eases debugging for developers.

# status returns a summary of each task's status
dag.status()

###############################################################################
# Inspecting the `DAG` object
# ---------------------------
# A lot of data work is done interactively using Jupyter or similar tools;
# being able to interact with a pipeline in the same way is an effective way
# of experimenting with new methods.

# say you are adding a new method to the add_one task, you can run your code
# with all upstream dependencies taken care of like this:

# run your task
dag['add_one'].build(force=True)
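###############################################################################
# After building a single task, you can keep inspecting it interactively. A
# short sketch (it assumes the `add_one` task used above; `product` and
# `source` are attributes that ploomber tasks expose, shown here as printable
# objects):

# where the task's output lives
print(dag['add_one'].product)

# the source code the task executes
print(dag['add_one'].source)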