def test_case_with_parameters(self):
    with Flow("test") as flow:
        x = Parameter("x")
        cond = identity(True)
        with case(cond, True):
            y1 = x + 1
        with case(cond, False):
            y2 = x - 1

    state = flow.run(x=1)
    assert state.result[y1].result == 2
    assert state.result[y2].is_skipped()
def test_deserialize_flow(self):
    f = Flow(name="test")
    f.add_task(Task())
    f.add_task(Parameter("x"))

    env = LocalEnvironment()
    serialized = env.serialize_flow_to_bytes(f)
    deserialized = env.deserialize_flow_from_bytes(serialized)

    assert isinstance(deserialized, Flow)
    assert len(deserialized.tasks) == 2
    assert {p.name for p in deserialized.parameters()} == {"x"}
def main(): with Flow("reprocess-purpleair-single") as flow: environment = Parameter("environment", default="staging") dt = DateTimeParameter("dt") client = create_purpleair_archive_client(environment) all_sensors_raw = extract_warehouse_purpleair(dt=dt, purpleair_client=client) all_sensors_processed = transform_all_sensors_raw(all_sensors_raw) blob_client = create_hour_blob_client(environment=environment, dt=dt) load_all_sensors_processed(all_sensors_processed, blob_client) # Registers flow to server, which we can then deploy and run in background agents. flow.register(project_name="caqi-flows")
def run_flow(data):
    site = websiteIdentifier(data)[0]
    upload_dispatcher = {
        'yelp': upload_yelp_data,
        'indeed': upload_indeed_data,
    }

    with Flow('ScrapeData') as flow:
        instance = Parameter("instance")
        get_type = get_website(instance=instance)
        scrape = gather_data(get_type)
        upload_dispatcher.get(site)(scrape)

    # now attach our custom handler to Task B's logger
    flow.run(instance=data)
def test_query_connect_fail():
    query_task = Query(url='https://localhost/not/a/real/url',
                       username='******', password='******', insecure=True)

    with Flow('test_query_connect_fail flow') as flow:
        sql = Parameter('input_sql')
        query_task(query=sql)

    parameters = dict(input_sql='SELECT * FROM base')
    state = flow.run(parameters=parameters)
    assert state.is_failed()
def main(): with Flow("mean-aqi") as flow: environment = Parameter("environment", default="staging") start = DateTimeParameter("start") interval_hour = Parameter("interval_hour", default=1) end = DateTimeParameter("end_inclusive") dts = datetime_range(start, interval_hour, end) client = create_purpleair_archive_client(environment) maybe_all_sensors_processed = extract_warehouse_purpleair_processed.map(dt=dts, purpleair_client=unmapped(client)) all_sensors_processed = filter_failed(maybe_all_sensors_processed) mean_aqi = transform_processed_mean.map(all_sensors_processed) combined_mean_aqi = combine_mean_aqis(mean_aqi) blob_client = create_mean_aqi_blob_client(environment) load_mean_aqi(combined_mean_aqi, blob_client) # blob_client = create_hour_blob_client.map(environment=unmapped(environment), dt=dts) # load_all_sensors_processed.map(all_sensors_processed, blob_client) # Registers flow to server, which we can then deploy and run in background agents. flow.register(project_name="caqi-flows")
def test_workflow_config_schema_defaults():
    """
    Test that WorkflowConfigSchema loads input data if 'parameters',
    'earliest_date' and 'date_stencil' are not specified.
    """
    # Create storage object to pass in context
    dummy_workflow = Flow(name="DUMMY_WORKFLOW")
    for parameter in ["reference_date", "date_ranges"]:
        dummy_workflow.add_task(Parameter(parameter))
    workflow_storage = storage.Memory()
    workflow_storage.add_flow(dummy_workflow)

    input_dict = dict(workflow_name="DUMMY_WORKFLOW")
    workflow_config = WorkflowConfigSchema(
        context={"workflow_storage": workflow_storage}
    ).load(input_dict)

    assert isinstance(workflow_config, WorkflowConfig)
    assert workflow_config == WorkflowConfig(workflow_name="DUMMY_WORKFLOW")
def test_workflow_config_schema_invalid_earliest_date():
    """
    Test that WorkflowConfigSchema raises a ValidationError if the
    'earliest_date' field is not a date.
    """
    # Create storage object to pass in context
    dummy_workflow = Flow(name="DUMMY_WORKFLOW")
    for parameter in ["reference_date", "date_ranges"]:
        dummy_workflow.add_task(Parameter(parameter))
    workflow_storage = storage.Memory()
    workflow_storage.add_flow(dummy_workflow)

    input_dict = dict(workflow_name="DUMMY_WORKFLOW", earliest_date=datetime.time(11))
    with pytest.raises(ValidationError) as exc_info:
        workflow_config = WorkflowConfigSchema(
            context={"workflow_storage": workflow_storage}
        ).load(input_dict)

    assert "Not a valid date." in exc_info.value.messages["earliest_date"]
def test_query_success(mocker, fake_files):
    query_mock = mocker.patch('quetzal.client.helpers.query')
    query_mock.return_value = fake_files, len(fake_files)
    query_task = Query(url='https://localhost/api/v1',
                       username='******', password='******', insecure=True)

    with Flow('test_query_success flow') as flow:
        sql = Parameter('input_sql')
        query_task(query=sql)

    parameters = dict(input_sql='SELECT * FROM base')
    state = flow.run(parameters=parameters)
    assert state.is_successful()
    query_mock.assert_called()
def test_workflow_config_schema_invalid_parameter_names(key):
    """
    Test that WorkflowConfigSchema raises a ValidationError if the 'parameters'
    dict keys contain 'reference_date' or 'date_ranges'.
    """
    # Create storage object to pass in context
    dummy_workflow = Flow(name="DUMMY_WORKFLOW")
    for parameter in ["reference_date", "date_ranges"]:
        dummy_workflow.add_task(Parameter(parameter))
    workflow_storage = storage.Memory()
    workflow_storage.add_flow(dummy_workflow)

    input_dict = dict(workflow_name="DUMMY_WORKFLOW", parameters={key: "DUMMY_VALUE"})
    with pytest.raises(ValidationError) as exc_info:
        workflow_config = WorkflowConfigSchema(
            context={"workflow_storage": workflow_storage}
        ).load(input_dict)

    assert "Invalid input." in exc_info.value.messages["parameters"][key]["key"]
def test_workflow_config_schema_workflow_not_found(tmpdir):
    """
    Test that WorkflowConfigSchema raises a ValidationError if the named
    workflow does not exist.
    """
    # Create storage object to pass in context
    dummy_workflow = Flow(name="DUMMY_WORKFLOW")
    for parameter in ["reference_date", "date_ranges"]:
        dummy_workflow.add_task(Parameter(parameter))
    workflow_storage = storage.Local(tmpdir)
    workflow_storage.add_flow(dummy_workflow)

    with pytest.raises(ValidationError) as exc_info:
        workflow_config = WorkflowConfigSchema(
            context={"workflow_storage": workflow_storage}
        ).load({"workflow_name": "NONEXISTENT_WORKFLOW"})

    assert (
        "Workflow does not exist in this storage."
        in exc_info.value.messages["workflow_name"]
    )
def test_workflow_config_schema_workflow_does_not_accept_automatic_parameters(
    missing_parameter, tmpdir
):
    """
    Test that WorkflowConfigSchema raises a ValidationError if the named workflow
    does not accept parameters 'reference_date' and 'date_ranges'.
    """
    # Create storage object to pass in context
    dummy_workflow = Flow(name="DUMMY_WORKFLOW")
    for parameter in {"reference_date", "date_ranges"} - {missing_parameter}:
        dummy_workflow.add_task(Parameter(parameter))
    workflow_storage = storage.Local(tmpdir)
    workflow_storage.add_flow(dummy_workflow)

    with pytest.raises(ValidationError) as exc_info:
        workflow_config = WorkflowConfigSchema(
            context={"workflow_storage": workflow_storage}
        ).load({"workflow_name": "DUMMY_WORKFLOW"})

    assert (
        f"Workflow does not accept parameters {{'{missing_parameter}'}}."
        in exc_info.value.messages["workflow_name"]
    )
def build_feature_pipeline_flow():
    with Flow("feature-pipeline") as feature_pipeline:
        dataset = Parameter("dataset")
        start_date = Parameter("start_date")
        end_date = Parameter("end_date")
        window_size = Parameter("window_size")
        window_offset = Parameter("window_offset")
        bots = Parameter("bots")

        feature_store = create_feature_store(dataset, window_size, window_offset)
        df = create_features(feature_store, start_date, end_date)
        add_label_column(df, bots, task_args=dict(slug="output"))

    return feature_pipeline
def _build(self, *, base_dir: str = None, pattern: str = '*/**.hdf5',
           limit: Optional[int] = None, **kwargs):
    # Manage parameters
    # ... not needed for this flow ...

    # Instantiate tasks
    list_files = ListFiles(as_file_adapter=True, pattern=pattern, limit=limit)

    with self:
        directory = Parameter('base_dir', default=base_dir, required=False)
        trigger = AlwaysSucceed(name='trigger')
        dataset = list_files(directory, upstream_tasks=[trigger])

    self.set_reference_tasks([dataset])

    logger.debug('Built flow %s with tasks %s', self, self.tasks)
    },
    python_dependencies=[
        "python-dotenv",
        "boto3",
        "botocore",
    ],
    ignore_healthchecks=True,
    # only an extreme poweruser should use this ^
)

run_config = DockerRun(
    env={"sample_key": "sample_value"},
    labels=["docker"],
)

with Flow("Upload to S3", storage=storage, run_config=run_config) as flow:
    files_to_download = Parameter(
        name="File List",
        default=["data/test_data.csv", "data/user_data.csv", "data/event_data.csv"],
    )
    conn = connect_to_s3()
    upload_to_s3.map(
        s3_client=unmapped(conn),
        file_path=create_filepath.map(files_to_download),
    )

# flow.run()
flow.register(project_name="AWS")
def print_me(val):
    print(val)


@task(trigger=any_failed, log_stdout=True)
def handle_failure():
    print("FAILURE")


@task(trigger=any_successful, log_stdout=True)
def the_end():
    print("We are done.")


with Flow("data-retrieval") as flow:
    key = Parameter("key")
    data = pull_data()
    value = get_value(data=data, key=key)

    printm = print_me
    handlef = handle_failure
    handle_failure.set_upstream(value)
    printm.set_upstream(value, key="val")

    end = the_end
    end.set_upstream(printm)
    end.set_upstream(handlef)

# print(flow.serialize())
    worker_cpu=256,
    worker_mem=512,
    scheduler_timeout="15 minutes",
)
# Be aware of scheduler_timeout. In this case, if no Dask client (e.g. Prefect
# Dask Executor) has connected to the Dask scheduler in 15 minutes, the Dask
# cluster will terminate. For development, you may want to increase this timeout.


@task
def times_two(x):
    return x * 2


@task
def get_sum(x_list):
    return sum(x_list)


with Flow("Dask Cloud Provider Test") as flow:
    x = Parameter("x", default=[1, 2, 3])
    y = times_two.map(x)
    results = get_sum(y)

flow.run(
    executor=DaskExecutor(cluster.scheduler.address),
    parameters={"x": list(range(10))},
)

# Tear down the Dask cluster. If you're developing and testing your flow you would
# not do this after each Flow run, but when you're done developing and testing.
cluster.close()
import prefect
import requests
from datetime import timedelta

from prefect import task, Flow, Parameter
from prefect.engine.signals import LOOP


@task(max_retries=5, retry_delay=timedelta(seconds=2))
def compute_large_fibonacci(M):
    # we extract the accumulated task loop result from context
    loop_payload = prefect.context.get("task_loop_result", {})

    n = loop_payload.get("n", 1)
    fib = loop_payload.get("fib", 1)

    next_fib = requests.post(
        "https://nemo.api.stdlib.com/fibonacci@0.0.1/", data={"nth": n}
    ).json()

    if next_fib > M:
        return fib  # return statements end the loop

    raise LOOP(message=f"Fib {n}={next_fib}", result=dict(n=n + 1, fib=next_fib))


with Flow("fibonacci") as flow:
    M = Parameter("M")
    fib_num = compute_large_fibonacci(M)

flow_state = flow.run(M=100)
print(flow_state.result[fib_num].result)  # 89
    # you must have a run method in the class to make it a Task
    def run(self, x: int, y: int = None) -> int:
        self.x = x
        self.y = y
        self.print_it(x)
        print(self.x + self.y)
        print('run function running')
        return

    def print_it(self, x):
        print('printing ')
        print(x)


# this line has to happen for the Task super class to properly call the run within the class
a = AddTask(default=1)

with Flow("My Flow") as f:
    x = Parameter('x')
    y = Parameter('y')
    # because a is already defined with a default, we only have to pass in the
    # variables we actually want here
    addition = a(x, y)  # t2 != a

x = 1
y = 2
with raise_on_exception():
    f.run(parameters={'x': x, 'y': y})
    Given a list of ordered images represented as bytes, combines them into a
    single GIF stored in the present working directory.
    """
    prefect.context.get("logger")
    images = [imageio.imread(BytesIO(image)) for image in image_bytes]
    imageio.mimsave(gif_file, images)


with Flow("Image ETL") as flow:
    Path("src/pipeline/temp").touch()
    image_path = Path("src/pipeline/temp/image-data.img")
    gif_path = Path("src/pipeline/temp/comb.gif")

    DATA_URL = Parameter(
        "DATA_URL",
        default="https://github.com/cicdw/image-data/blob/master/all-images.img?raw=true",
    )
    DATA_FILE = Parameter("DATA_FILE", default=image_path)

    # Extract
    command = curl_cmd(DATA_URL, DATA_FILE)
    curl = download(command=command)

    # Transform
    # we use the `upstream_tasks` keyword to specify non-data dependencies
    images = load_and_split(fname=DATA_FILE, upstream_tasks=[curl])

    # Load
    frames = write_to_disk.map(images)
    combine_to_gif(frames, gif_path)
    TODO: Add this to a SQL database to save on memory
    """
    if bool_mrd:
        df.to_csv(os.path.join(dir_proc, f"mrd_{filename}.csv"), index=False)
    else:
        df.to_csv(os.path.join(dir_proc, f"full_{filename}.csv"), index=False)


# Create an output folder
dir_proc_today = make_dir_proc()
dir_proc = config.get("dir", "proc")

with Flow("Process data") as flow_proc:
    filename = Parameter("filename")
    nrow = Parameter("nrows")

    fp_data = fp_from_dir_raw(filename=filename)
    df = read_in_data(fp_data, nrows=nrow)
    df2 = basic_features(df)
    df3 = time_features(df2)
    df4 = lagged_features(df3)
    df5 = rolling_features(df4)

    # PLACEHOLDER - doesn't do anything yet
    df6 = reshape_to_home(df5)

    # Save out
    for _dir in [dir_proc, dir_proc_today]:
        save_full_df = save_df(df5, dir_proc=_dir, filename=filename, bool_mrd=False)
        save_mrd = save_df(df6,
import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)

# For other template options see https://docs.prefect.io/api/latest/utilities/context.html#context-2
s3_result = S3Result(bucket="results-prefect-tst", location="{flow_name}-{today}/results.prefect")
# lcl_result = LocalResult(dir="~/prefect_guide/results/", location="{flow_name}/{today}")
result_h = s3_result

# A flow has no particular order unless the data is bound (shown) or explicitly set (not shown).
with Flow(name="Test-Get-Imbalances", result=result_h) as tsx_imb_fl:
    tsx_url = Parameter("tsx_url", default="https://api.tmxmoney.com/mocimbalance/en/TSX/moc.html")
    imb_tbl_nm = Parameter("imb_tbl_nm", default="moc_tst")
    n_conn = Parameter("n_conn", default=1)

    # Scrape the website
    tsx_imb_df = get_tsx_moc_imb(tsx_url)

    # Get the connection string from Prefect Cloud
    conn_str = PrefectSecret("moc_pgdb_conn")

    # Partition the df across the connections
    tsx_imb_df_lst = partition_df(tsx_imb_df, n_conn)

    df_shape = df_to_db.map(
        tsx_imb_df_lst,
        tbl_name=unmapped(imb_tbl_nm),
        conn_str=unmapped(conn_str),
    )

if __name__ == "__main__":
def _define_model_selection_flow():
    """Define flow that runs model selection.

    Specifically data filtering, partitioning and model selection and
    optional persistence on a given dataset.

    Returns
    -------
    prefect.Flow
    """
    from prefect import task, Flow, Parameter, unmapped

    with Flow("model selection") as flow:
        df = Parameter("data")
        grid_search = Parameter("grid_search")
        target_col_name = Parameter("target_col_name")
        country_code_column = Parameter("country_code_column")
        include_rules = Parameter("include_rules")
        exclude_rules = Parameter("exclude_rules")
        parallel_over_columns = Parameter("parallel_over_columns")
        partition_columns = Parameter("partition_columns")
        frequency = Parameter("frequency")
        output_path = Parameter("output_path")
        persist_cv_data = Parameter("persist_cv_data")
        persist_cv_results = Parameter("persist_cv_results")
        persist_model_reprs = Parameter("persist_model_reprs")
        persist_best_model = Parameter("persist_best_model")
        persist_partition = Parameter("persist_partition")
        persist_model_selector_results = Parameter("persist_model_selector_results")

        df_filtered = task(filter_data)(
            df=df, include_rules=include_rules, exclude_rules=exclude_rules
        )

        partitions = task(partition_data)(
            df=df_filtered, partition_by=parallel_over_columns
        )

        parallel_over_dicts, partition_dfs = partitions["labels"], partitions["data"]

        train_data = task(prepare_data_for_training).map(
            df=partition_dfs,
            frequency=unmapped(frequency),
            partition_columns=unmapped(partition_columns),
            parallel_over_columns=unmapped(parallel_over_columns),
            country_code_column=unmapped(country_code_column),
        )

        results = task(select_model).map(
            df=train_data,
            target_col_name=unmapped(target_col_name),
            grid_search=unmapped(grid_search),
            partition_columns=unmapped(partition_columns),
            parallel_over_dict=parallel_over_dicts,
            frequency=unmapped(frequency),
            country_code_column=unmapped(country_code_column),
        )

        write_ok = task(persist_experts_in_physical_partition).map(
            results=results,
            folder_path=unmapped(output_path),
            persist_cv_results=unmapped(persist_cv_results),
            persist_cv_data=unmapped(persist_cv_data),
            persist_model_reprs=unmapped(persist_model_reprs),
            persist_best_model=unmapped(persist_best_model),
            persist_partition=unmapped(persist_partition),
            persist_model_selector_results=unmapped(persist_model_selector_results),
        )

    flow.set_reference_tasks([write_ok])

    return flow
    Args:
        - catalog (str, str): tuple with name of table and url
    """
    bq = bigquery.Client(project=GCP.project)
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = "WRITE_TRUNCATE"

    df = pd.DataFrame(requests.get(catalog[1]).json()["value"])
    job = bq.load_table_from_dataframe(
        dataframe=df,
        destination=f"{schema}.{catalog[0]}",
        project=GCP.project,
        location=GCP.location,
    )
    return job


gcp = Parameter("gcp", required=True)

with Flow("CBS catalogs") as flow:
    odatav3 = odatav3_catalog_to_gbq.map(catalog=list(CATALOGS.items()), GCP=unmapped(gcp))


def main(config):
    """Executes vektis.agb.flow in DaskExecutor."""
    flow.run(parameters={"gcp": config.gcp})


if __name__ == "__main__":
    config = get_config("dataverbinders")
    main(config=config)
@task
def return_list(val):
    return list(range(0, val))


@task
def parse_value(val):
    if val % 2 != 0:
        raise ValueError("Value is not even!")
    return val


@task(trigger=prefect.triggers.any_failed)
def catch_error(val):
    print(f"Do something with this value error: {val}")


schedule = IntervalSchedule(interval=timedelta(minutes=1))

with Flow("Raise error on Odd", schedule=schedule) as flow:
    my_param = Parameter("param")
    my_list = return_list(val=my_param)
    def_list = parse_value.map(my_list)
    catch_error.map(def_list)

params = [1, 2, 3, 4, 5]
for p in params:
    FlowRunner(flow=flow).initialize_run(parameters={"param": p})
"Meta Data": meta_data, "Time Series (15min)": data }) @task def persist_data_in_influx(injector: Injector, av_response: InterdayResponseModel, secrets: Dict[str, str]): influx_v2_client = injector.get(InfluxDBClient) influx_v2_client.write_api(SYNCHRONOUS).write( secrets['INFLUX_V2_BUCKET'], record=interday_response_model_to_points(av_response)) schedule = IntervalSchedule(interval=timedelta(hours=24)) with Flow("scrap-stock", schedule) as flow: injector = create_secret_injector_task() token_renewal_result = renew_token_task(injector) secrets = fetch_secret_task('common', 'kv', injector) stocks = Parameter("stocks", default=["GOOGL", "MSFT"]) av_response = scrap_stock.map(stocks, secrets=unmapped(secrets)) persist_data_in_influx.map(injector=unmapped(injector), av_response=av_response, secrets=unmapped(secrets)) flow.storage = GitHub(repo="piokra/prefect-tutorial", path="scrap_stock.py") flow.run()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 4 09:17:15 2020

@author: felipe
"""

from prefect import task, Parameter, Flow


# NOTE: the original snippet does not define say_hello; this is an assumed
# minimal task definition so the flow below can actually run.
@task
def say_hello(name):
    print(f"Hello, {name}!")


with Flow("Say hi!") as flow:
    name = Parameter("name")
    say_hello(name)

flow.run(name="Marvin")
from prefect import task, Flow, Parameter

## initialize the Parameter outside of any
## Flow context
add_num = Parameter("add_num", default=10)


@task
def add_one(x):
    return x + 1


with Flow("Flow 1") as flow_1:
    new_num1 = add_one(add_num)


@task
def add_two(y):
    return y + 1


with Flow("Flow 2") as flow_2:
    new_num2 = add_one(add_num)


combo_fl = Flow("Add Numbers")
combo_fl.update(flow_1)
combo_fl.update(flow_2, validate=True)
combo_fl.visualize()
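# A minimal sketch, not part of the original snippet: after the two update()
# calls the merged flow holds a single shared "add_num" Parameter, so one run
# supplies it to the task graphs copied in from both flow_1 and flow_2.
state = combo_fl.run(parameters={"add_num": 5})
print(state.result[new_num1].result)  # add_one(5) -> 6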
def check_if_even(value):
    return value % 2 == 0


@task
def print_odd(value):
    print("{} is odd!".format(value))


@task
def print_even(value):
    print("{} is even!".format(value))


with Flow("Check Even/Odd") as f:
    value = Parameter("value")
    is_even = check_if_even(value)

    even = print_even(value)
    odd = print_odd(value)

    ifelse(is_even, even, odd)


# Prints '2 is even!'
f.run(value=2)

# Prints '1 is odd!'
f.run(value=1)
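# A minimal sketch, not part of the original example: the branch that `ifelse`
# does not select finishes in a Skipped state, which the returned run state exposes.
state = f.run(value=2)
assert state.result[even].is_successful()
assert state.result[odd].is_skipped()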
@task(max_retries=5, retry_delay=datetime.timedelta(minutes=10))
def root_task():
    pass


@task(
    cache_for=datetime.timedelta(days=10),
    cache_validator=partial_parameters_only(["x"]),
    result_handler=JSONResultHandler(),
)
def cached_task(x, y):
    pass


x = Parameter("x")
y = Parameter("y", default=42)


@task(name="Big Name", checkpoint=True, result_handler=S3ResultHandler(bucket="blob"))
def terminal_task():
    pass


env = RemoteEnvironment(
    executor="prefect.engine.executors.DaskExecutor",
    executor_kwargs={"scheduler_address": "tcp://"},
)

storage = Docker(