def test_simple_map(monkeypatch):
    """Run a flow with a single mapped task through CloudFlowRunner.

    The flow run and the mapped task's run are registered with a mocked
    client; the test then checks the final flow state, the recorded states
    and the number of task runs recorded for the mapped task.
    """
    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())

    with prefect.Flow(name="test", result=PrefectResult()) as flow:
        t1 = plus_one.map([0, 1, 2])

    mapped_slug = flow.slugs[t1]

    known_flow_runs = [FlowRun(id=flow_run_id)]
    known_task_runs = [
        TaskRun(id=task_run_id_1, task_slug=mapped_slug, flow_run_id=flow_run_id)
    ]
    # Any task other than the mapped one gets a run with a throwaway id.
    for other in flow.tasks:
        if other is t1:
            continue
        known_task_runs.append(
            TaskRun(
                id=str(uuid.uuid4()),
                task_slug=flow.slugs[other],
                flow_run_id=flow_run_id,
            )
        )

    client = MockedCloudClient(
        flow_runs=known_flow_runs,
        task_runs=known_task_runs,
        monkeypatch=monkeypatch,
    )

    with prefect.context(flow_run_id=flow_run_id):
        runner = CloudFlowRunner(flow=flow)
        state = runner.run(return_tasks=flow.tasks, executor=LocalExecutor())

    assert state.is_successful()
    assert client.flow_runs[flow_run_id].state.is_successful()
    assert client.task_runs[task_run_id_1].state.is_mapped()

    # The mapped task must account for exactly 4 recorded task runs.
    runs_of_mapped_task = [
        tr for tr in client.task_runs.values() if tr.task_slug == mapped_slug
    ]
    assert len(runs_of_mapped_task) == 4
def test_running_state_finishes(self):
    """A Running state with no task states is resolved to a successful state."""
    runner = FlowRunner(flow=Flow(name="test", tasks=[Task()]))
    call_kwargs = {
        "state": Running(),
        "task_states": {},
        "task_contexts": {},
        "return_tasks": set(),
        "task_runner_state_handlers": [],
        "executor": LocalExecutor(),
    }

    final_state = runner.get_flow_run_state(**call_kwargs)

    assert final_state.is_successful()
def test_e2e_pipeline():
    """Smoke test: `e2e_pipeline` runs to a successful state on a LocalExecutor."""
    run_params = dict(
        url='https://vincentarelbundock.github.io/Rdatasets/csv/stevedata/fakeTSD.csv',
        cat_cols=['year'],
        endog='y',
        exog=['x1', 'x2'],
    )

    final_state = e2e_pipeline.run(executor=LocalExecutor(), **run_params)

    assert final_state.is_successful()
def test_determine_final_state_preserves_running_states_when_tasks_still_running(
    self,
):
    """The very same Running state object comes back while a task is still Retrying."""
    task = Task()
    runner = FlowRunner(flow=Flow(name="test", tasks=[task]))
    old_state = Running()

    # The retry is scheduled one day ahead, so the task is not finished yet.
    retry_at = pendulum.now("utc").add(days=1)
    states_by_task = {task: Retrying(start_time=retry_at)}

    new_state = runner.get_flow_run_state(
        state=old_state,
        task_states=states_by_task,
        task_contexts={},
        return_tasks=set(),
        task_runner_state_handlers=[],
        executor=LocalExecutor(),
    )

    assert new_state is old_state
def test_determine_final_state_has_final_say(self):
    """Whatever an overridden `determine_final_state` returns is the state handed back."""
    expected_message = "Very specific error message"

    class MyFlowRunner(FlowRunner):
        def determine_final_state(self, *args, **kwargs):
            # Ignore every input and always report the same failure.
            return Failed(expected_message)

    runner = MyFlowRunner(flow=Flow(name="test", tasks=[Task()]))
    outcome = runner.get_flow_run_state(
        state=Running(),
        task_states={},
        task_contexts={},
        return_tasks=set(),
        task_runner_state_handlers=[],
        executor=LocalExecutor(),
    )

    assert outcome.is_failed()
    assert outcome.message == expected_message
def test_can_queue_successfully_and_run(monkeypatch):
    """Run a one-task flow against a client configured with `num_times_in_queue=6`.

    The flow run must still finish successfully, and the number of
    `set_flow_run_state` calls recorded by the client must equal
    `2 + client.num_times_in_queue`.
    """

    @prefect.task
    def return_one():
        return 1

    with prefect.Flow("test-queues-work!") as flow:
        t1 = return_one()

    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())

    client = QueueingMockCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(
                id=task_run_id_1, task_slug=flow.slugs[t1], flow_run_id=flow_run_id
            ),
        ]
        + [
            # Each remaining task is registered under its OWN slug; the
            # previous code indexed `flow.slugs[t1]` here, which would have
            # given every extra task the slug of `t1`.
            TaskRun(
                id=str(uuid.uuid4()), task_slug=flow.slugs[t], flow_run_id=flow_run_id
            )
            for t in flow.tasks
            if t not in [t1]
        ],
        monkeypatch=monkeypatch,
        num_times_in_queue=6,
    )

    with prefect.context(flow_run_id=flow_run_id):
        run_state = CloudFlowRunner(flow=flow).run(
            executor=LocalExecutor(), return_tasks=flow.tasks
        )

    assert run_state.is_successful()

    # Pending -> Running -> Queued (num_times_in_queue times) -> Success
    # State transitions that result in `set_flow_run_state` calls are from
    # Pending -> Running and Running -> Success, all others
    # are from Running -> Queued or Queued -> Queued
    assert client.call_count["set_flow_run_state"] == 2 + client.num_times_in_queue
def test_wait(self):
    """LocalExecutor.wait() hands back exactly the object it was given."""
    waited_int = LocalExecutor().wait(1)
    assert waited_int == 1

    waited_module = LocalExecutor().wait(prefect)
    assert waited_module is prefect
def test_submit(self):
    """LocalExecutor.submit() calls the function and returns its result."""

    def one():
        return 1

    def echo(x):
        return x

    # No arguments, one positional argument, one keyword argument.
    assert LocalExecutor().submit(one) == 1
    assert LocalExecutor().submit(echo, 1) == 1
    assert LocalExecutor().submit(echo, x=1) == 1

    # The returned object is the very object the function produced.
    submitted = LocalExecutor().submit(lambda: prefect)
    assert submitted is prefect
def local():
    """Yield a LocalExecutor (local, immediate execution)."""
    executor = LocalExecutor()
    yield executor
# FLOWS CONFIGURATION

# Run configuration applied to the flows below.
run_config = KubernetesRun()

# Storage settings.
FLOWS_DIR_PATH = '/opt/server/src/flows'
storage_kwargs = dict(
    dockerfile='server/Dockerfile',
    registry_url=REGISTRY_URL,
    stored_as_script=True,
)

# Executors.
local_executor = LocalExecutor()
dask_executor = DaskExecutor(address=DASK_SCHEDULER_ADDR)

# Result backend: 'azure' and 's3' select the matching result class; any
# other value of RESULT_SUBCLASS falls back to a local result directory.
result = (
    AzureResult(container=AZURE_RESULT_CONTAINER)
    if RESULT_SUBCLASS == 'azure'
    else S3Result(bucket=S3_RESULT_BUCKET)
    if RESULT_SUBCLASS == 's3'
    else LocalResult(dir=LOCAL_RESULT_DIR)
)

# Attach the run configuration to the flow.
mapreduce_wordcount.run_config = run_config
def test_states_are_hydrated_correctly_with_retries(monkeypatch, tmpdir):
    """
    Check that retries with a delay longer than 10 minutes properly "hydrate"
    upstream states, so that a mapped task can be retried correctly on a
    later run of the same flow run.
    """
    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())
    task_run_id_2 = str(uuid.uuid4())

    with prefect.Flow(name="test-retries", result=LocalResult(dir=tmpdir)) as flow:
        t1 = plus_one.map([-1, 0, 1])
        t2 = invert_fail_once.map(t1)

    t2.max_retries = 1
    t2.retry_delay = datetime.timedelta(minutes=100)

    for target in ("requests.Session", "requests.post"):
        monkeypatch.setattr(target, MagicMock())

    registered_runs = [
        TaskRun(id=run_id, task_slug=flow.slugs[task], flow_run_id=flow_run_id)
        for run_id, task in ((task_run_id_1, t1), (task_run_id_2, t2))
    ]
    registered_runs += [
        TaskRun(id=str(uuid.uuid4()), task_slug=flow.slugs[other], flow_run_id=flow_run_id)
        for other in flow.tasks
        if other not in [t1, t2]
    ]
    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=registered_runs,
        monkeypatch=monkeypatch,
    )

    def runs_of(task):
        # All recorded task runs carrying the slug of `task`.
        slug = flow.slugs[task]
        return [tr for tr in client.task_runs.values() if tr.task_slug == slug]

    def first_child_of(task):
        # The recorded run of `task` with map index 0.
        slug = flow.slugs[task]
        return next(
            tr
            for tr in client.task_runs.values()
            if tr.task_slug == slug and tr.map_index == 0
        )

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert client.flow_runs[flow_run_id].state.is_running()
    assert client.task_runs[task_run_id_1].state.is_mapped()
    assert client.task_runs[task_run_id_2].state.is_mapped()

    # Each mapped task must account for exactly 4 recorded task runs.
    for mapped_task in (t1, t2):
        assert len(runs_of(mapped_task)) == 4

    # After the first run, t2's first child is waiting for its retry.
    assert isinstance(first_child_of(t2).state, Retrying)

    # RUN A SECOND TIME with an artificially updated start time
    # and remove all in-memory data
    t2_slug = flow.slugs[t2]
    matching_ids = [
        run_id
        for run_id, tr in client.task_runs.items()
        if tr.task_slug == t2_slug and tr.map_index == 0
    ]
    failed_id = matching_ids.pop()
    client.task_runs[failed_id].state.start_time = pendulum.now("UTC")

    for tr in client.task_runs.values():
        tr.state._result.value = None

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    # On the second run, t2's first child must have succeeded.
    assert first_child_of(t2).state.is_successful()
def test_deep_map_with_a_retry(monkeypatch):
    """
    Build a deeply-mapped Flow in which one of the middle layers hits a
    one-time error. Running the flow a second time should resolve the error.

    DOES NOT WORK WITH DASK EXECUTORS because the second run needs shared
    state.
    """
    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())
    task_run_id_2 = str(uuid.uuid4())
    task_run_id_3 = str(uuid.uuid4())

    with prefect.Flow(name="test", result=PrefectResult()) as flow:
        t1 = plus_one.map([-1, 0, 1])
        t2 = invert_fail_once.map(t1)
        t3 = plus_one.map(t2)

    t2.max_retries = 1
    t2.retry_delay = datetime.timedelta(minutes=100)

    for target in ("requests.Session", "requests.post"):
        monkeypatch.setattr(target, MagicMock())

    id_and_task = (
        (task_run_id_1, t1),
        (task_run_id_2, t2),
        (task_run_id_3, t3),
    )
    registered_runs = [
        TaskRun(id=run_id, task_slug=flow.slugs[task], flow_run_id=flow_run_id)
        for run_id, task in id_and_task
    ]
    registered_runs += [
        TaskRun(id=str(uuid.uuid4()), task_slug=flow.slugs[other], flow_run_id=flow_run_id)
        for other in flow.tasks
        if other not in [t1, t2, t3]
    ]
    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=registered_runs,
        monkeypatch=monkeypatch,
    )

    def runs_of(task):
        # All recorded task runs carrying the slug of `task`.
        slug = flow.slugs[task]
        return [tr for tr in client.task_runs.values() if tr.task_slug == slug]

    def first_child_of(task):
        # The recorded run of `task` with map index 0.
        slug = flow.slugs[task]
        return next(
            tr
            for tr in client.task_runs.values()
            if tr.task_slug == slug and tr.map_index == 0
        )

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert client.flow_runs[flow_run_id].state.is_running()
    assert client.task_runs[task_run_id_1].state.is_mapped()
    assert client.task_runs[task_run_id_2].state.is_mapped()
    assert client.task_runs[task_run_id_3].state.is_mapped()

    # Each mapped task must account for exactly 4 recorded task runs.
    for mapped_task in (t1, t2, t3):
        assert len(runs_of(mapped_task)) == 4

    # After the first run, t2's first child is waiting for its retry ...
    assert isinstance(first_child_of(t2).state, Retrying)

    # ... and t3's first child is still pending.
    assert first_child_of(t3).state.is_pending()

    # RUN A SECOND TIME with an artificially updated start time
    t2_slug = flow.slugs[t2]
    matching_ids = [
        run_id
        for run_id, tr in client.task_runs.items()
        if tr.task_slug == t2_slug and tr.map_index == 0
    ]
    failed_id = matching_ids.pop()
    client.task_runs[failed_id].state.start_time = pendulum.now("UTC")

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    # On the second run the first children of both t2 and t3 must succeed.
    assert first_child_of(t2).state.is_successful()
    assert first_child_of(t3).state.is_successful()
def executor(self) -> Executor:
    """Return a newly created LocalExecutor."""
    local_executor = LocalExecutor()
    return local_executor
def test_prefect_executors(train_data, grid_search, parallel_columns):
    """Run model selection once per Prefect executor and check the results.

    For every executor both `run_model_selection` and `select_model_general`
    are called with the same settings. The flow state must be successful,
    and the number of results must match the number of distinct
    `parallel_columns + ["Product"]` combinations in `train_data`.

    The dask client created here is shut down in a `finally` block, so a
    failing assertion no longer leaves it running.
    """
    from dask.distributed import Client
    from prefect.executors import DaskExecutor
    from prefect.executors import LocalDaskExecutor
    from prefect.executors import LocalExecutor

    client = Client()
    try:
        executors = {
            "dask_already_running": DaskExecutor(address=client.scheduler.address),
            "local": LocalExecutor(),
            "local_dask": LocalDaskExecutor(),
            # No address is given here; per the original note this spins up
            # a local dask setup on call and is kept only to check the
            # interface.
            "dask_create_on_call": DaskExecutor(),
        }

        # Keyword arguments shared by both entry points.
        common_kwargs = dict(
            df=train_data,
            grid_search=grid_search,
            target_col_name="Quantity",
            frequency="D",
            partition_columns=["Product"],
            parallel_over_columns=parallel_columns,
            include_rules=None,
            exclude_rules=None,
            country_code_column="Holidays_code",
            output_path="",
            persist_cv_results=False,
            persist_cv_data=False,
            persist_model_reprs=False,
            persist_best_model=False,
            persist_partition=False,
            persist_model_selector_results=False,
        )
        expected_count = len(
            train_data[parallel_columns + ["Product"]].drop_duplicates()
        )

        for executor_name, executor in executors.items():
            _flow, state = run_model_selection(
                visualize_success=False,
                executor=executor,
                **common_kwargs,
            )
            assert state.is_successful()

            results = select_model_general(executor=executor, **common_kwargs)
            assert len(results) == expected_count
            assert isinstance(results[0], ModelSelectorResult)

            # Only this executor uses the externally created client; shut
            # the client down as soon as that executor is done.
            if executor_name == "dask_already_running":
                client.shutdown()
    finally:
        # Safety net: also reached when an assertion above fails.
        if client.status != "closed":
            client.shutdown()
def test_result(data, serializer):
    """Write `data` through an XpersistResult and check it round-trips.

    After the write, the value read back from the new result's location
    must equal `data`, the cache store must hold `data` under that
    location, and `exists` must report the location as present.
    """
    r = XpersistResult(
        CacheStore(),
        serializer=serializer,
    )
    new = r.write(data)
    assert new.read(new.location).value == data
    assert r.cache_store.get(new.location) == data
    assert r.exists(new.location)


# Parametrized over a LocalExecutor and a DaskExecutor configured with
# threads only (processes=False).
@pytest.mark.parametrize(
    'executor',
    [
        LocalExecutor(),
        DaskExecutor(cluster_kwargs={
            'processes': False,
            'threads_per_worker': 8
        }, debug=True),
    ],
)
def test_result_flow(executor):
    # Sets the checkpointing environment variable for the process; it is
    # not reset in the part of the test that is visible here.
    os.environ['PREFECT__FLOWS__CHECKPOINTING'] = 'True'
    r = XpersistResult(
        CacheStore(),
        serializer='xarray.netcdf',
    )

    # NOTE(review): the source is truncated here - the task this decorator
    # applies to and the rest of the test are not visible.
    @task(target='testing.nc', result=r)
def test_is_pickleable(self):
    """A LocalExecutor survives a cloudpickle dump/load round trip."""
    original = LocalExecutor()
    payload = cloudpickle.dumps(original)
    restored = cloudpickle.loads(payload)
    assert isinstance(restored, LocalExecutor)
def test_is_pickleable_after_start(self):
    """A LocalExecutor can be cloudpickled while inside its `start()` context."""
    original = LocalExecutor()
    with original.start():
        payload = cloudpickle.dumps(original)
        restored = cloudpickle.loads(payload)
        assert isinstance(restored, LocalExecutor)
def test_non_keyed_states_are_hydrated_correctly_with_retries(
    monkeypatch, tmpdir
):
    """
    Check that retries with a delay longer than 10 minutes properly "hydrate"
    upstream states so that mapped tasks retry correctly - for mapped tasks,
    even non-data dependencies can affect the number of children spawned.
    """

    @prefect.task
    def return_list():
        return [1, 2, 3]

    @prefect.task(max_retries=1, retry_delay=datetime.timedelta(minutes=20))
    def fail_once():
        # Fail while the run count is below 2, succeed afterwards.
        if prefect.context.get("task_run_count", 0) < 2:
            raise SyntaxError("bad")
        return 100

    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())
    task_run_id_2 = str(uuid.uuid4())

    with prefect.Flow(name="test-retries", result=LocalResult(dir=tmpdir)) as flow:
        t1 = fail_once.map(upstream_tasks=[return_list])

    for target in ("requests.Session", "requests.post"):
        monkeypatch.setattr(target, MagicMock())

    registered_runs = [
        TaskRun(id=run_id, task_slug=flow.slugs[task], flow_run_id=flow_run_id)
        for run_id, task in ((task_run_id_1, t1), (task_run_id_2, return_list))
    ]
    registered_runs += [
        TaskRun(id=str(uuid.uuid4()), task_slug=flow.slugs[other], flow_run_id=flow_run_id)
        for other in flow.tasks
        if other not in [t1, return_list]
    ]
    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=registered_runs,
        monkeypatch=monkeypatch,
    )

    t1_slug = flow.slugs[t1]

    def runs_of_t1():
        # All recorded task runs carrying t1's slug.
        return [tr for tr in client.task_runs.values() if tr.task_slug == t1_slug]

    def is_t1_child(tr):
        # A run of t1 whose map index is not -1.
        return tr.task_slug == t1_slug and tr.map_index != -1

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert client.flow_runs[flow_run_id].state.is_running()
    assert client.task_runs[task_run_id_1].state.is_mapped()
    assert client.task_runs[task_run_id_2].state.is_successful()

    # The mapped task must account for exactly 4 recorded task runs.
    assert len(runs_of_t1()) == 4

    # Every child run of t1 should be waiting for its retry.
    child_is_retrying = [
        isinstance(tr.state, Retrying)
        for tr in client.task_runs.values()
        if is_t1_child(tr)
    ]
    assert all(child_is_retrying)

    # RUN A SECOND TIME with an artificially updated start time
    # and remove all in-memory data
    for tr in client.task_runs.values():
        if is_t1_child(tr):
            tr.state.start_time = pendulum.now("UTC")

    for tr in client.task_runs.values():
        tr.state._result.value = None

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert len(runs_of_t1()) == 4
    assert all(tr.state.is_successful() for tr in client.task_runs.values())