Example #1
    def test_case_with_parameters(self):
        with Flow("test") as flow:
            x = Parameter("x")
            cond = identity(True)
            with case(cond, True):
                y1 = x + 1
            with case(cond, False):
                y2 = x - 1

        state = flow.run(x=1)
        assert state.result[y1].result == 2
        assert state.result[y2].is_skipped()
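A minimal sketch of what the test above assumes at module level (not part of the original): a pass-through identity task plus the standard Prefect 1.x imports used by the test module.

from prefect import Flow, Parameter, case, task

@task
def identity(x):
    return x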
Example #2
    def test_deserialize_flow(self):
        f = Flow(name="test")
        f.add_task(Task())
        f.add_task(Parameter("x"))

        env = LocalEnvironment()
        serialized = env.serialize_flow_to_bytes(f)
        deserialized = env.deserialize_flow_from_bytes(serialized)

        assert isinstance(deserialized, Flow)
        assert len(deserialized.tasks) == 2
        assert {p.name for p in deserialized.parameters()} == {"x"}
Example #3
def main():

    with Flow("reprocess-purpleair-single") as flow:
        environment = Parameter("environment", default="staging")
        dt = DateTimeParameter("dt")
        client = create_purpleair_archive_client(environment)
        all_sensors_raw = extract_warehouse_purpleair(dt=dt, purpleair_client=client)
        all_sensors_processed = transform_all_sensors_raw(all_sensors_raw)
        blob_client = create_hour_blob_client(environment=environment, dt=dt)
        load_all_sensors_processed(all_sensors_processed, blob_client)

    # Register the flow with the server so it can be deployed and run by background agents.
    flow.register(project_name="caqi-flows")
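A hedged follow-up sketch (not in the original): flow.register() returns the id of the registered flow version, which the Prefect 1.x Client can use to start a run from code. The helper name and parameter values below are placeholders for illustration only.

from prefect import Client

def trigger_reprocess_run(flow):  # hypothetical helper, not from the original
    flow_id = flow.register(project_name="caqi-flows")
    Client().create_flow_run(
        flow_id=flow_id,
        parameters={"environment": "staging", "dt": "2021-01-01T00:00:00+00:00"},
    )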
Example #4
def run_flow(data):
    site = websiteIdentifier(data)[0]
    upload_dispatcher = {
        'yelp': upload_yelp_data,
        'indeed': upload_indeed_data
    }
    with Flow('ScrapeData') as flow:
        instance = Parameter("instance")
        get_type = get_website(instance=instance)
        scrape = gather_data(get_type)
        upload_dispatcher.get(site)(scrape)

    # run the flow, supplying the scraped input as the "instance" parameter
    flow.run(instance=data)
Example #5
def test_query_connect_fail():
    query_task = Query(url='https://localhost/not/a/real/url',
                       username='******',
                       password='******',
                       insecure=True)

    with Flow('test_query_connect_fail flow') as flow:
        sql = Parameter('input_sql')
        query_task(query=sql)

    parameters = dict(input_sql='SELECT * FROM base')
    state = flow.run(parameters=parameters)

    assert state.is_failed()
Example #6
def main():

    with Flow("mean-aqi") as flow:
        environment = Parameter("environment", default="staging")
        start = DateTimeParameter("start")
        interval_hour = Parameter("interval_hour", default=1)
        end = DateTimeParameter("end_inclusive")

        dts = datetime_range(start, interval_hour, end)
        client = create_purpleair_archive_client(environment)

        maybe_all_sensors_processed = extract_warehouse_purpleair_processed.map(dt=dts, purpleair_client=unmapped(client))
        all_sensors_processed = filter_failed(maybe_all_sensors_processed)
        
        mean_aqi = transform_processed_mean.map(all_sensors_processed)
        combined_mean_aqi = combine_mean_aqis(mean_aqi)
        blob_client = create_mean_aqi_blob_client(environment)
        load_mean_aqi(combined_mean_aqi, blob_client)
        # blob_client = create_hour_blob_client.map(environment=unmapped(environment), dt=dts)
        # load_all_sensors_processed.map(all_sensors_processed, blob_client)

    # Register the flow with the server so it can be deployed and run by background agents.
    flow.register(project_name="caqi-flows")
Example #7
def test_workflow_config_schema_defaults():
    """
    Test that WorkflowConfigSchema loads input data if 'parameters',
    'earliest_date' and 'date_stencil' are not specified.
    """
    # Create storage object to pass in context
    dummy_workflow = Flow(name="DUMMY_WORKFLOW")
    for parameter in ["reference_date", "date_ranges"]:
        dummy_workflow.add_task(Parameter(parameter))
    workflow_storage = storage.Memory()
    workflow_storage.add_flow(dummy_workflow)

    input_dict = dict(workflow_name="DUMMY_WORKFLOW")
    workflow_config = WorkflowConfigSchema(
        context={"workflow_storage": workflow_storage}
    ).load(input_dict)
    assert isinstance(workflow_config, WorkflowConfig)
    assert workflow_config == WorkflowConfig(workflow_name="DUMMY_WORKFLOW")
Example #8
def test_workflow_config_schema_invalid_earliest_date():
    """
    Test that WorkflowConfigSchema raises a ValidationError if the
    'earliest_date' field is not a date.
    """
    # Create storage object to pass in context
    dummy_workflow = Flow(name="DUMMY_WORKFLOW")
    for parameter in ["reference_date", "date_ranges"]:
        dummy_workflow.add_task(Parameter(parameter))
    workflow_storage = storage.Memory()
    workflow_storage.add_flow(dummy_workflow)

    input_dict = dict(workflow_name="DUMMY_WORKFLOW", earliest_date=datetime.time(11))
    with pytest.raises(ValidationError) as exc_info:
        workflow_config = WorkflowConfigSchema(
            context={"workflow_storage": workflow_storage}
        ).load(input_dict)
    assert "Not a valid date." in exc_info.value.messages["earliest_date"]
Example #9
def test_query_success(mocker, fake_files):
    query_mock = mocker.patch('quetzal.client.helpers.query')
    query_mock.return_value = fake_files, len(fake_files)

    query_task = Query(url='https://localhost/api/v1',
                       username='******',
                       password='******',
                       insecure=True)

    with Flow('test_query_success flow') as flow:
        sql = Parameter('input_sql')
        query_task(query=sql)

    parameters = dict(input_sql='SELECT * FROM base')
    state = flow.run(parameters=parameters)

    assert state.is_successful()
    query_mock.assert_called()
Example #10
def test_workflow_config_schema_invalid_parameter_names(key):
    """
    Test that WorkflowConfigSchema raises a ValidationError if the 'parameters'
    dict keys contain 'reference_date' or 'date_ranges'.
    """
    # Create storage object to pass in context
    dummy_workflow = Flow(name="DUMMY_WORKFLOW")
    for parameter in ["reference_date", "date_ranges"]:
        dummy_workflow.add_task(Parameter(parameter))
    workflow_storage = storage.Memory()
    workflow_storage.add_flow(dummy_workflow)

    input_dict = dict(workflow_name="DUMMY_WORKFLOW", parameters={key: "DUMMY_VALUE"})
    with pytest.raises(ValidationError) as exc_info:
        workflow_config = WorkflowConfigSchema(
            context={"workflow_storage": workflow_storage}
        ).load(input_dict)
    assert "Invalid input." in exc_info.value.messages["parameters"][key]["key"]
Example #11
def test_workflow_config_schema_workflow_not_found(tmpdir):
    """
    Test that WorkflowConfigSchema raises a ValidationError if the named
    workflow does not exist.
    """
    # Create storage object to pass in context
    dummy_workflow = Flow(name="DUMMY_WORKFLOW")
    for parameter in ["reference_date", "date_ranges"]:
        dummy_workflow.add_task(Parameter(parameter))
    workflow_storage = storage.Local(tmpdir)
    workflow_storage.add_flow(dummy_workflow)

    with pytest.raises(ValidationError) as exc_info:
        workflow_config = WorkflowConfigSchema(
            context={
                "workflow_storage": workflow_storage
            }).load({"workflow_name": "NONEXISTENT_WORKFLOW"})
    assert ("Workflow does not exist in this storage."
            in exc_info.value.messages["workflow_name"])
Example #12
def test_workflow_config_schema_workflow_does_not_accept_automatic_parameters(
        missing_parameter, tmpdir):
    """
    Test that WorkflowConfigSchema raises a ValidationError if the named
    workflow does not accept parameters 'reference_date' and 'date_ranges'.
    """
    # Create storage object to pass in context
    dummy_workflow = Flow(name="DUMMY_WORKFLOW")
    for parameter in {"reference_date", "date_ranges"} - {missing_parameter}:
        dummy_workflow.add_task(Parameter(parameter))
    workflow_storage = storage.Local(tmpdir)
    workflow_storage.add_flow(dummy_workflow)

    with pytest.raises(ValidationError) as exc_info:
        workflow_config = WorkflowConfigSchema(
            context={
                "workflow_storage": workflow_storage
            }).load({"workflow_name": "DUMMY_WORKFLOW"})
    assert (f"Workflow does not accept parameters {{'{missing_parameter}'}}."
            in exc_info.value.messages["workflow_name"])
Example #13
def build_feature_pipeline_flow():
    with Flow("feature-pipeline") as feature_pipeline:
        dataset = Parameter("dataset")

        start_date = Parameter("start_date")
        end_date = Parameter("end_date")
        window_size = Parameter("window_size")
        window_offset = Parameter("window_offset")
        bots = Parameter("bots")

        feature_store = create_feature_store(dataset, window_size,
                                             window_offset)

        df = create_features(feature_store, start_date, end_date)

        add_label_column(df, bots, task_args=dict(slug="output"))
    return feature_pipeline
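A hedged usage sketch (not in the original): running the flow built above with illustrative parameter values; the tasks it wires together (create_feature_store, create_features, add_label_column) are defined elsewhere in the source project.

feature_pipeline = build_feature_pipeline_flow()
state = feature_pipeline.run(parameters=dict(
    dataset="events",          # placeholder values, for illustration only
    start_date="2021-01-01",
    end_date="2021-01-31",
    window_size=7,
    window_offset=1,
    bots=["bot_a", "bot_b"],
))
assert state.is_successful()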
Example #14
    def _build(self,
               *,
               base_dir: str = None,
               pattern: str = '*/**.hdf5',
               limit: Optional[int] = None,
               **kwargs):

        # Manage parameters
        # ... not needed for this flow ...

        # Instantiate tasks
        list_files = ListFiles(as_file_adapter=True,
                               pattern=pattern,
                               limit=limit)

        with self:
            directory = Parameter('base_dir', default=base_dir, required=False)
            trigger = AlwaysSucceed(name='trigger')
            dataset = list_files(directory, upstream_tasks=[trigger])

            self.set_reference_tasks([dataset])

        logger.debug('Built flow %s with tasks %s', self, self.tasks)
Example #15
    },
    python_dependencies=[
        "python-dotenv",
        "boto3",
        "botocore",
    ],
    ignore_healthchecks=True,
    # only an extreme poweruser should use this ^
)
run_config = DockerRun(
    env={"sample_key": "sample_value"},
    labels=["docker"],
)

with Flow(
    "Upload to S3", 
    storage=storage, 
    run_config=run_config
) as flow:
    files_to_download = Parameter(
        name="File List", 
        default=["data/test_data.csv", "data/user_data.csv", "data/event_data.csv"]
    )
    conn = connect_to_s3()
    upload_to_s3.map(
        s3_client=unmapped(conn), 
        file_path=create_filepath.map(files_to_download)
    )

# flow.run()
flow.register(project_name="AWS")
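The example begins mid-call: the python_dependencies / ignore_healthchecks block at the top is the tail of a prefect.storage.Docker definition. A hedged sketch of what that head typically looks like; the registry and image names are placeholders, not from the original.

from prefect.storage import Docker

storage = Docker(
    registry_url="registry.example.com",   # placeholder
    image_name="upload-to-s3",             # placeholder
    python_dependencies=["python-dotenv", "boto3", "botocore"],
    ignore_healthchecks=True,
)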
Example #16
@task
def print_me(val):
    print(val)


@task(trigger=any_failed, log_stdout=True)
def handle_failure():
    print("FAILURE")


@task(trigger=any_successful, log_stdout=True)
def the_end():
    print("We are done.")


with Flow("data-retrieval") as flow:
    key = Parameter("key")
    data = pull_data()
    value = get_value(data=data, key=key)

    printm = print_me

    handlef = handle_failure
    handle_failure.set_upstream(value)

    printm.set_upstream(value, key="val")

    end = the_end
    end.set_upstream(printm)
    end.set_upstream(handlef)

# print(flow.serialize())
Example #17
    worker_cpu=256,
    worker_mem=512,
    scheduler_timeout="15 minutes",
)
# Be aware of scheduler_timeout. In this case, if no Dask client (e.g. Prefect
# Dask Executor) has connected to the Dask scheduler in 15 minutes, the Dask
# cluster will terminate. For development, you may want to increase this timeout.


@task
def times_two(x):
    return x * 2


@task
def get_sum(x_list):
    return sum(x_list)


with Flow("Dask Cloud Provider Test") as flow:
    x = Parameter("x", default=[1, 2, 3])
    y = times_two.map(x)
    results = get_sum(y)

flow.run(executor=DaskExecutor(cluster.scheduler.address),
         parameters={"x": list(range(10))})

# Tear down the Dask cluster. If you're developing and testing your flow you would
# not do this after each Flow run, but when you're done developing and testing.
cluster.close()
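This example also begins mid-call: the worker_cpu / worker_mem / scheduler_timeout block is the tail of a Dask Cloud Provider cluster definition, and DaskExecutor is imported elsewhere (prefect.executors in recent 1.x releases). A hedged sketch of the assumed cluster setup; the image name and worker count are placeholders.

# Sketch only: assumes dask_cloudprovider's FargateCluster is the object whose
# trailing arguments open the example; values mirror those shown above.
from dask_cloudprovider.aws import FargateCluster  # older releases: from dask_cloudprovider import FargateCluster

cluster = FargateCluster(
    image="prefecthq/prefect:latest",  # placeholder image
    n_workers=2,
    worker_cpu=256,
    worker_mem=512,
    scheduler_timeout="15 minutes",
)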
Example #18
import requests
from datetime import timedelta

import prefect
from prefect import task, Flow, Parameter
from prefect.engine.signals import LOOP


@task(max_retries=5, retry_delay=timedelta(seconds=2))
def compute_large_fibonacci(M):
    # we extract the accumulated task loop result from context
    loop_payload = prefect.context.get("task_loop_result", {})

    n = loop_payload.get("n", 1)
    fib = loop_payload.get("fib", 1)

    next_fib = requests.post("https://nemo.api.stdlib.com/fibonacci@0.0.1/",
                             data={
                                 "nth": n
                             }).json()

    if next_fib > M:
        return fib  # return statements end the loop

    raise LOOP(message=f"Fib {n}={next_fib}",
               result=dict(n=n + 1, fib=next_fib))


with Flow("fibonacci") as flow:
    M = Parameter("M")
    fib_num = compute_large_fibonacci(M)

flow_state = flow.run(M=100)
print(flow_state.result[fib_num].result)  # 89
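A hedged, offline variant of the same looping pattern (not in the original): it computes the next Fibonacci number locally instead of calling the external stdlib API, so the loop can run without network access.

@task
def compute_large_fibonacci_local(M):
    loop_payload = prefect.context.get("task_loop_result", {})
    prev = loop_payload.get("prev", 0)
    fib = loop_payload.get("fib", 1)
    next_fib = prev + fib
    if next_fib > M:
        return fib  # stop once the next value would exceed M
    raise LOOP(message=f"Fib={next_fib}", result=dict(prev=fib, fib=next_fib))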
Example #19
    # you must have a run function in the class to make it a Task
    def run(self, x: int, y: int = None) -> int:
        self.x = x
        self.y = y
        self.print_it(x)
        print(self.x + self.y)
        print('run function running')
        return

    def print_it(self, x):
        print('printing  ')
        print(x)


#the class has to be instantiated so the Task base class is initialized before the instance is called inside a flow
a = AddTask(default=1)

with Flow("My Flow") as f:
    x = Parameter('x')
    y = Parameter('y')
    #because a is already defined with a default, we only have to pass in the
    #arguments we actually want here
    addition = a(x, y)  # calling the task returns a copy, so addition is not a

x = 1
y = 2

with raise_on_exception():
    f.run(parameters={'x': x, 'y': y})
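The snippet above starts in the middle of the class definition. A hedged sketch of the part that is cut off; the constructor is a guess based on AddTask(default=1), and the imports cover the names the snippet uses.

from prefect import Flow, Parameter, Task
from prefect.utilities.debug import raise_on_exception

class AddTask(Task):
    # hypothetical reconstruction, for illustration only
    def __init__(self, default: int = None, **kwargs):
        self.default = default
        super().__init__(**kwargs)

    # ... run() and print_it() as shown above ...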
Example #20
    Given a list of ordered images represented as bytes,
    combines them into a single GIF stored in the present working directory.
    """
    prefect.context.get("logger")
    images = [imageio.imread(BytesIO(image)) for image in image_bytes]
    imageio.mimsave(gif_file, images)


with Flow("Image ETL") as flow:
    Path("src/pipeline/temp").touch()
    image_path = Path("src/pipeline/temp/image-data.img")
    gif_path = Path("src/pipeline/temp/comb.gif")

    DATA_URL = Parameter(
        "DATA_URL",
        default=
        "https://github.com/cicdw/image-data/blob/master/all-images.img?raw=true",
    )
    DATA_FILE = Parameter("DATA_FILE", default=image_path)

    # Extract
    command = curl_cmd(DATA_URL, DATA_FILE)
    curl = download(command=command)

    # Transform
    # we use the `upstream_tasks` keyword to specify non-data dependencies
    images = load_and_split(fname=DATA_FILE, upstream_tasks=[curl])

    # Load
    frames = write_to_disk.map(images)
    combine_to_gif(frames, gif_path)
Example #21
    TODO: Add this to a SQL database to save on memory
    """
    if bool_mrd:
        df.to_csv(os.path.join(dir_proc, f"mrd_{filename}.csv"), index=False)
    else:
        df.to_csv(os.path.join(dir_proc, f"full_{filename}.csv"), index=False)


# Create an output folder
dir_proc_today = make_dir_proc()
dir_proc = config.get("dir", "proc")

with Flow("Process data") as flow_proc:

    filename = Parameter("filename")
    nrow = Parameter("nrows")
    fp_data = fp_from_dir_raw(filename=filename)
    df = read_in_data(fp_data, nrows=nrow)
    df2 = basic_features(df)
    df3 = time_features(df2)
    df4 = lagged_features(df3)
    df5 = rolling_features(df4)  # PLACEHOLDER - doesn't do anything yet
    df6 = reshape_to_home(df5)
    # Save out
    for _dir in [dir_proc, dir_proc_today]:
        save_full_df = save_df(df5,
                               dir_proc=_dir,
                               filename=filename,
                               bool_mrd=False)
        save_mrd = save_df(df6,
                           dir_proc=_dir,
                           filename=filename,
                           bool_mrd=True)
Example #22
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)


# For other template options https://docs.prefect.io/api/latest/utilities/context.html#context-2  
s3_result = S3Result(bucket="results-prefect-tst", location="{flow_name}-{today}/results.prefect")

#lcl_result = LocalResult(dir="~/prefect_guide/results/", location="{flow_name}/{today}")

result_h = s3_result

# A flow has no particular order unless the data is bound (shown) or explicitly set (not shown).
with Flow(name="Test-Get-Imbalances", result=result_h) as tsx_imb_fl:
    
    tsx_url = Parameter("tsx_url", default="https://api.tmxmoney.com/mocimbalance/en/TSX/moc.html")
    imb_tbl_nm = Parameter("imb_tbl_nm", default="moc_tst")
    n_conn = Parameter("n_conn", default=1) 
    
    # Scrape the website
    tsx_imb_df = get_tsx_moc_imb(tsx_url)

    # Get the connection string from prefect cloud 
    conn_str = PrefectSecret("moc_pgdb_conn")
    
    # Partition the df so it can be loaded over n_conn connections
    tsx_imb_df_lst = partition_df(tsx_imb_df, n_conn)

    df_shape = df_to_db.map(tsx_imb_df_lst, tbl_name=unmapped(imb_tbl_nm), conn_str=unmapped(conn_str))

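A hedged illustration of the "explicitly set" ordering mentioned in the comment above (standalone, not part of the imbalance flow): a task can be forced to wait on another even when no data passes between them.

from prefect import Flow, task

@task
def create_table():
    pass

@task
def load_rows():
    pass

with Flow("explicit-order-demo") as demo_fl:
    load_rows(upstream_tasks=[create_table()])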
if __name__ == "__main__":
Example #23
def _define_model_selection_flow():
    """Define flow that runs model selection.

    Specifically data filtering, partitioning and model selection
    and optional persistence on a given dataset

    Returns
    -------
    prefect.Flow
    """

    from prefect import task, Flow, Parameter, unmapped

    with Flow("model selection") as flow:
        df = Parameter("data")
        grid_search = Parameter("grid_search")
        target_col_name = Parameter("target_col_name")
        country_code_column = Parameter("country_code_column")
        include_rules = Parameter("include_rules")
        exclude_rules = Parameter("exclude_rules")
        parallel_over_columns = Parameter("parallel_over_columns")
        partition_columns = Parameter("partition_columns")
        frequency = Parameter("frequency")
        output_path = Parameter("output_path")
        persist_cv_data = Parameter("persist_cv_data")
        persist_cv_results = Parameter("persist_cv_results")
        persist_model_reprs = Parameter("persist_model_reprs")
        persist_best_model = Parameter("persist_best_model")
        persist_partition = Parameter("persist_partition")
        persist_model_selector_results = Parameter(
            "persist_model_selector_results")
        df_filtered = task(filter_data)(df=df,
                                        include_rules=include_rules,
                                        exclude_rules=exclude_rules)

        partitions = task(partition_data)(df=df_filtered,
                                          partition_by=parallel_over_columns)

        parallel_over_dicts, partition_dfs = partitions["labels"], partitions[
            "data"]

        train_data = task(prepare_data_for_training).map(
            df=partition_dfs,
            frequency=unmapped(frequency),
            partition_columns=unmapped(partition_columns),
            parallel_over_columns=unmapped(parallel_over_columns),
            country_code_column=unmapped(country_code_column),
        )
        results = task(select_model).map(
            df=train_data,
            target_col_name=unmapped(target_col_name),
            grid_search=unmapped(grid_search),
            partition_columns=unmapped(partition_columns),
            parallel_over_dict=parallel_over_dicts,
            frequency=unmapped(frequency),
            country_code_column=unmapped(country_code_column),
        )

        write_ok = task(persist_experts_in_physical_partition).map(
            results=results,
            folder_path=unmapped(output_path),
            persist_cv_results=unmapped(persist_cv_results),
            persist_cv_data=unmapped(persist_cv_data),
            persist_model_reprs=unmapped(persist_model_reprs),
            persist_best_model=unmapped(persist_best_model),
            persist_partition=unmapped(persist_partition),
            persist_model_selector_results=unmapped(
                persist_model_selector_results),
        )

    flow.set_reference_tasks([write_ok])

    return flow
Example #24
    Args:
        - catalog (str, str): tuple with name of table and url
    """
    bq = bigquery.Client(project=GCP.project)
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = "WRITE_TRUNCATE"
    df = pd.DataFrame(requests.get(catalog[1]).json()["value"])
    job = bq.load_table_from_dataframe(
        dataframe=df,
        destination=f"{schema}.{catalog[0]}",
        project=GCP.project,
        location=GCP.location,
    )
    return job


gcp = Parameter("gcp", required=True)
with Flow("CBS catalogs") as flow:
    odatav3 = odatav3_catalog_to_gbq.map(catalog=list(CATALOGS.items()), GCP=unmapped(gcp))


def main(config):
    """Executes vektis.agb.flow in DaskExecutor.
    """
    flow.run(parameters={"gcp": config.gcp})


if __name__ == "__main__":
    config = get_config("dataverbinders")
    main(config=config)
Example #25
@task
def return_list(val):
    return list(range(0, val))


@task
def parse_value(val):
    if val % 2 != 0:
        raise ValueError("Value is not even!")

    return val


@task(trigger=prefect.triggers.any_failed)
def catch_error(val):
    print(f"Do something with this value error: {val}")


schedule = IntervalSchedule(interval=timedelta(minutes=1))
with Flow("Raise error on Odd", schedule=schedule) as flow:
    my_param = Parameter("param")
    my_list = return_list(val=my_param)

    def_list = parse_value.map(my_list)

    catch_error.map(def_list)

params = [1, 2, 3, 4, 5]

for p in params:
    FlowRunner(flow=flow).initialize_run(parameters={"param": p})
Example #26
        "Meta Data": meta_data,
        "Time Series (15min)": data
    })


@task
def persist_data_in_influx(injector: Injector,
                           av_response: InterdayResponseModel,
                           secrets: Dict[str, str]):
    influx_v2_client = injector.get(InfluxDBClient)
    influx_v2_client.write_api(SYNCHRONOUS).write(
        secrets['INFLUX_V2_BUCKET'],
        record=interday_response_model_to_points(av_response))


schedule = IntervalSchedule(interval=timedelta(hours=24))

with Flow("scrap-stock", schedule) as flow:
    injector = create_secret_injector_task()
    token_renewal_result = renew_token_task(injector)
    secrets = fetch_secret_task('common', 'kv', injector)
    stocks = Parameter("stocks", default=["GOOGL", "MSFT"])
    av_response = scrap_stock.map(stocks, secrets=unmapped(secrets))
    persist_data_in_influx.map(injector=unmapped(injector),
                               av_response=av_response,
                               secrets=unmapped(secrets))

flow.storage = GitHub(repo="piokra/prefect-tutorial", path="scrap_stock.py")

flow.run()
Example #27
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Feb  4 09:17:15 2020

@author: felipe
"""

from prefect import Parameter, Flow

with Flow("Say hi!") as flow:
    name = Parameter("name")
    say_hello(name)

flow.run(name="Marvin")
Example #28
from prefect import task, Flow, Parameter

## initialize the Parameter outside of any
## Flow context

add_num = Parameter("add_num", default=10)


@task
def add_one(x):
    return x + 1


with Flow("Flow 1") as flow_1:
    new_num1 = add_one(add_num)


@task
def add_two(y):
    return y + 2


with Flow("Flow 2") as flow_2:
    new_num2 = add_two(add_num)

combo_fl = Flow("Add Numbers")

combo_fl.update(flow_1)
combo_fl.update(flow_2, validate=True)

combo_fl.visualize()
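A hedged usage note (not in the original): because both flows reference the same Parameter object, the merged flow exposes a single add_num parameter.

state = combo_fl.run(parameters={"add_num": 5})
print(state.result[new_num1].result)  # 6
print(state.result[new_num2].result)  # 7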
Example #29
@task
def check_if_even(value):
    return value % 2 == 0


@task
def print_odd(value):
    print("{} is odd!".format(value))


@task
def print_even(value):
    print("{} is even!".format(value))


with Flow("Check Even/Odd") as f:
    value = Parameter("value")
    is_even = check_if_even(value)

    even = print_even(value)
    odd = print_odd(value)

    ifelse(is_even, even, odd)


# Prints '2 is even!'
f.run(value=2)


# Prints '1 is odd!'
f.run(value=1)
Example #30
@task(max_retries=5, retry_delay=datetime.timedelta(minutes=10))
def root_task():
    pass


@task(
    cache_for=datetime.timedelta(days=10),
    cache_validator=partial_parameters_only(["x"]),
    result_handler=JSONResultHandler(),
)
def cached_task(x, y):
    pass


x = Parameter("x")
y = Parameter("y", default=42)


@task(name="Big Name",
      checkpoint=True,
      result_handler=S3ResultHandler(bucket="blob"))
def terminal_task():
    pass


env = RemoteEnvironment(
    executor="prefect.engine.executors.DaskExecutor",
    executor_kwargs={"scheduler_address": "tcp://"},
)
storage = Docker(