Example #1
def test_simple_map(monkeypatch):

    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())

    with prefect.Flow(name="test", result=PrefectResult()) as flow:
        t1 = plus_one.map([0, 1, 2])

    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(id=task_run_id_1,
                    task_slug=flow.slugs[t1],
                    flow_run_id=flow_run_id)
        ] + [
            TaskRun(id=str(uuid.uuid4()),
                    task_slug=flow.slugs[t],
                    flow_run_id=flow_run_id) for t in flow.tasks if t is not t1
        ],
        monkeypatch=monkeypatch,
    )

    with prefect.context(flow_run_id=flow_run_id):
        state = CloudFlowRunner(flow=flow).run(return_tasks=flow.tasks,
                                               executor=LocalExecutor())

    assert state.is_successful()
    assert client.flow_runs[flow_run_id].state.is_successful()
    assert client.task_runs[task_run_id_1].state.is_mapped()
    # there should be a total of 4 task runs for the mapped task
    # (one parent run plus three mapped children)
    assert (len([
        tr for tr in client.task_runs.values()
        if tr.task_slug == flow.slugs[t1]
    ]) == 4)
Example #2
def test_running_state_finishes(self):
    flow = Flow(name="test", tasks=[Task()])
    new_state = FlowRunner(flow=flow).get_flow_run_state(
        state=Running(),
        task_states={},
        task_contexts={},
        return_tasks=set(),
        task_runner_state_handlers=[],
        executor=LocalExecutor(),
    )
    assert new_state.is_successful()
Example #3
def test_e2e_pipeline():
    """Smoke test. Flow successfully executes using a local executor.
    """

    kwargs = {
        'url': 'https://vincentarelbundock.github.io/Rdatasets/csv/stevedata/fakeTSD.csv',
        'cat_cols': ['year'],
        'endog': 'y',
        'exog': ['x1', 'x2'],
    }

    state = e2e_pipeline.run(**kwargs, executor=LocalExecutor())
    assert state.is_successful()
Example #4
def test_determine_final_state_preserves_running_states_when_tasks_still_running(
    self,
):
    task = Task()
    flow = Flow(name="test", tasks=[task])
    old_state = Running()
    new_state = FlowRunner(flow=flow).get_flow_run_state(
        state=old_state,
        task_states={task: Retrying(start_time=pendulum.now("utc").add(days=1))},
        task_contexts={},
        return_tasks=set(),
        task_runner_state_handlers=[],
        executor=LocalExecutor(),
    )
    assert new_state is old_state
Example #5
def test_determine_final_state_has_final_say(self):
    class MyFlowRunner(FlowRunner):
        def determine_final_state(self, *args, **kwargs):
            return Failed("Very specific error message")

    flow = Flow(name="test", tasks=[Task()])
    new_state = MyFlowRunner(flow=flow).get_flow_run_state(
        state=Running(),
        task_states={},
        task_contexts={},
        return_tasks=set(),
        task_runner_state_handlers=[],
        executor=LocalExecutor(),
    )
    assert new_state.is_failed()
    assert new_state.message == "Very specific error message"
Example #6
def test_can_queue_successfully_and_run(monkeypatch):
    @prefect.task
    def return_one():
        return 1

    with prefect.Flow("test-queues-work!") as flow:
        t1 = return_one()

    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())

    client = QueueingMockCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(
                id=task_run_id_1, task_slug=flow.slugs[t1], flow_run_id=flow_run_id
            ),
        ]
        + [
            TaskRun(
                id=str(uuid.uuid4()), task_slug=flow.slugs[t], flow_run_id=flow_run_id
            )
            for t in flow.tasks
            if t is not t1
        ],
        monkeypatch=monkeypatch,
        num_times_in_queue=6,
    )

    with prefect.context(flow_run_id=flow_run_id):
        run_state = CloudFlowRunner(flow=flow).run(
            executor=LocalExecutor(), return_tasks=flow.tasks
        )

    assert run_state.is_successful()

    # Pending -> Running -> Queued (6x) -> Success
    # Two of the `set_flow_run_state` calls come from Pending -> Running and
    # Running -> Success; the rest come from Running -> Queued or
    # Queued -> Queued transitions, one per pass through the queue.
    assert client.call_count["set_flow_run_state"] == 2 + client.num_times_in_queue
Example #7
def test_wait(self):
    """LocalExecutor's wait() method just returns its input"""
    assert LocalExecutor().wait(1) == 1
    assert LocalExecutor().wait(prefect) is prefect
Example #8
def test_submit(self):
    """LocalExecutor directly executes the function"""
    assert LocalExecutor().submit(lambda: 1) == 1
    assert LocalExecutor().submit(lambda x: x, 1) == 1
    assert LocalExecutor().submit(lambda x: x, x=1) == 1
    assert LocalExecutor().submit(lambda: prefect) is prefect
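Taken together, test_submit and test_wait pin down the contract these tests expect of LocalExecutor: submit executes the callable eagerly and returns its value, and wait is the identity. A minimal sketch of that contract, illustrative only and not Prefect's actual implementation:

# Minimal sketch of the eager-execution contract exercised by the two
# tests above -- illustrative only, not Prefect's actual LocalExecutor.
class EagerExecutor:
    def submit(self, fn, *args, **kwargs):
        # Execute immediately and return the concrete value.
        return fn(*args, **kwargs)

    def wait(self, futures):
        # Values are already concrete; there is nothing to wait on.
        return futures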
Example #9
@pytest.fixture
def local():
    "Local, immediate execution executor"
    yield LocalExecutor()
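A test receives a fresh LocalExecutor by naming the fixture as a parameter. A hypothetical consumer, where the test name, task, and flow are illustrative rather than from the original suite:

def test_flow_runs_with_local_executor(local):
    # 'local' is injected by pytest from the fixture above; the flow and
    # task here are illustrative only.
    @prefect.task
    def say_hi():
        return "hi"

    with prefect.Flow("demo") as flow:
        say_hi()

    assert flow.run(executor=local).is_successful()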
Example #10
# FLOWS CONFIGURATION

# Run config
run_config = KubernetesRun()

# Storage
FLOWS_DIR_PATH = '/opt/server/src/flows'
storage_kwargs = {
    'dockerfile': 'server/Dockerfile',
    'registry_url': REGISTRY_URL,
    'stored_as_script': True,
}

# Executor
local_executor = LocalExecutor()
dask_executor = DaskExecutor(address=DASK_SCHEDULER_ADDR)

# Result
if RESULT_SUBCLASS == 'azure':
    result = AzureResult(container=AZURE_RESULT_CONTAINER)
elif RESULT_SUBCLASS == 's3':
    result = S3Result(bucket=S3_RESULT_BUCKET)
else:
    result = LocalResult(dir=LOCAL_RESULT_DIR)


# Set flow run configs
mapreduce_wordcount.run_config = run_config
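The snippet defines storage arguments, two executors, and a result, but only attaches the run config to the flow. A hypothetical continuation, assuming Prefect 1.x flow attributes and Docker storage; neither the storage class nor the executor-selection logic is confirmed by the snippet:

# Hypothetical continuation -- not part of the original snippet.
# Assumes Docker storage and that a set DASK_SCHEDULER_ADDR selects Dask.
mapreduce_wordcount.storage = Docker(
    path=f'{FLOWS_DIR_PATH}/mapreduce_wordcount.py',
    **storage_kwargs,
)
mapreduce_wordcount.executor = (
    dask_executor if DASK_SCHEDULER_ADDR else local_executor
)
mapreduce_wordcount.result = result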

Example #11
def test_states_are_hydrated_correctly_with_retries(monkeypatch, tmpdir):
    """
    Ensures that retries longer than 10 minutes properly "hydrate" upstream states
    so that mapped tasks retry correctly.
    """

    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())
    task_run_id_2 = str(uuid.uuid4())

    with prefect.Flow(name="test-retries",
                      result=LocalResult(dir=tmpdir)) as flow:
        t1 = plus_one.map([-1, 0, 1])
        t2 = invert_fail_once.map(t1)

    t2.max_retries = 1
    t2.retry_delay = datetime.timedelta(minutes=100)

    monkeypatch.setattr("requests.Session", MagicMock())
    monkeypatch.setattr("requests.post", MagicMock())

    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(id=task_run_id_1,
                    task_slug=flow.slugs[t1],
                    flow_run_id=flow_run_id),
            TaskRun(id=task_run_id_2,
                    task_slug=flow.slugs[t2],
                    flow_run_id=flow_run_id),
        ] + [
            TaskRun(id=str(uuid.uuid4()),
                    task_slug=flow.slugs[t],
                    flow_run_id=flow_run_id)
            for t in flow.tasks if t not in [t1, t2]
        ],
        monkeypatch=monkeypatch,
    )

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert client.flow_runs[flow_run_id].state.is_running()
    assert client.task_runs[task_run_id_1].state.is_mapped()
    assert client.task_runs[task_run_id_2].state.is_mapped()

    # there should be a total of 4 task runs corresponding to each mapped task
    for t in [t1, t2]:
        assert (len([
            tr for tr in client.task_runs.values()
            if tr.task_slug == flow.slugs[t]
        ]) == 4)

    # t2's first child task should be retrying
    t2_0 = next(tr for tr in client.task_runs.values()
                if tr.task_slug == flow.slugs[t2] and tr.map_index == 0)
    assert isinstance(t2_0.state, Retrying)

    # RUN A SECOND TIME with an artificially updated start time
    # and remove all in-memory data
    failed_id = [
        t_id for t_id, tr in client.task_runs.items()
        if tr.task_slug == flow.slugs[t2] and tr.map_index == 0
    ].pop()
    client.task_runs[failed_id].state.start_time = pendulum.now("UTC")

    for idx, tr in client.task_runs.items():
        tr.state._result.value = None

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    # t2's first child task should be successful
    t2_0 = next(tr for tr in client.task_runs.values()
                if tr.task_slug == flow.slugs[t2] and tr.map_index == 0)
    assert t2_0.state.is_successful()
Example #12
def test_deep_map_with_a_retry(monkeypatch):
    """
    Creates a situation in which a deeply-mapped Flow encounters a one-time error in one
    of the middle layers. Running the flow a second time should resolve the error.

    DOES NOT WORK WITH DASK EXECUTORS because of the need for shared state on second run
    """

    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())
    task_run_id_2 = str(uuid.uuid4())
    task_run_id_3 = str(uuid.uuid4())

    with prefect.Flow(name="test", result=PrefectResult()) as flow:
        t1 = plus_one.map([-1, 0, 1])
        t2 = invert_fail_once.map(t1)
        t3 = plus_one.map(t2)

    t2.max_retries = 1
    t2.retry_delay = datetime.timedelta(minutes=100)

    monkeypatch.setattr("requests.Session", MagicMock())
    monkeypatch.setattr("requests.post", MagicMock())

    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(id=task_run_id_1,
                    task_slug=flow.slugs[t1],
                    flow_run_id=flow_run_id),
            TaskRun(id=task_run_id_2,
                    task_slug=flow.slugs[t2],
                    flow_run_id=flow_run_id),
            TaskRun(id=task_run_id_3,
                    task_slug=flow.slugs[t3],
                    flow_run_id=flow_run_id),
        ] + [
            TaskRun(id=str(uuid.uuid4()),
                    task_slug=flow.slugs[t],
                    flow_run_id=flow_run_id)
            for t in flow.tasks if t not in [t1, t2, t3]
        ],
        monkeypatch=monkeypatch,
    )

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert client.flow_runs[flow_run_id].state.is_running()
    assert client.task_runs[task_run_id_1].state.is_mapped()
    assert client.task_runs[task_run_id_2].state.is_mapped()
    assert client.task_runs[task_run_id_3].state.is_mapped()

    # there should be a total of 4 task runs corresponding to each mapped task
    for t in [t1, t2, t3]:
        assert (len([
            tr for tr in client.task_runs.values()
            if tr.task_slug == flow.slugs[t]
        ]) == 4)

    # t2's first child task should be retrying
    t2_0 = next(tr for tr in client.task_runs.values()
                if tr.task_slug == flow.slugs[t2] and tr.map_index == 0)
    assert isinstance(t2_0.state, Retrying)

    # t3's first child task should be pending
    t3_0 = next(tr for tr in client.task_runs.values()
                if tr.task_slug == flow.slugs[t3] and tr.map_index == 0)
    assert t3_0.state.is_pending()

    # RUN A SECOND TIME with an artificially updated start time
    failed_id = [
        t_id for t_id, tr in client.task_runs.items()
        if tr.task_slug == flow.slugs[t2] and tr.map_index == 0
    ].pop()
    client.task_runs[failed_id].state.start_time = pendulum.now("UTC")

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    # t2's first child task should be successful
    t2_0 = next(tr for tr in client.task_runs.values()
                if tr.task_slug == flow.slugs[t2] and tr.map_index == 0)
    assert t2_0.state.is_successful()

    # t3's first child task should be successful
    t3_0 = next(tr for tr in client.task_runs.values()
                if tr.task_slug == flow.slugs[t3] and tr.map_index == 0)
    assert t3_0.state.is_successful()
Example #13
def executor(self) -> Executor:
    return LocalExecutor()
Example #14
def test_prefect_executors(train_data, grid_search, parallel_columns):
    from dask.distributed import Client
    from prefect.executors import DaskExecutor
    from prefect.executors import LocalDaskExecutor
    from prefect.executors import LocalExecutor

    client = Client()

    executors = {
        "dask_already_running": DaskExecutor(address=client.scheduler.address),
        "local": LocalExecutor(),
        "local_dask": LocalDaskExecutor(),
        # DaskExecutor() with no address spins up a temporary local Dask
        # cluster when the flow runs; included just to check the interface
        "dask_create_on_call": DaskExecutor(),
    }

    for executor_name, executor in executors.items():
        flow, state = run_model_selection(
            df=train_data,
            grid_search=grid_search,
            target_col_name="Quantity",
            frequency="D",
            partition_columns=["Product"],
            parallel_over_columns=parallel_columns,
            include_rules=None,
            exclude_rules=None,
            country_code_column="Holidays_code",
            output_path="",
            persist_cv_results=False,
            persist_cv_data=False,
            persist_model_reprs=False,
            persist_best_model=False,
            persist_partition=False,
            persist_model_selector_results=False,
            visualize_success=False,
            executor=executor,
        )
        assert state.is_successful()

        results = select_model_general(
            df=train_data,
            grid_search=grid_search,
            target_col_name="Quantity",
            frequency="D",
            partition_columns=["Product"],
            parallel_over_columns=parallel_columns,
            executor=executor,
            include_rules=None,
            exclude_rules=None,
            country_code_column="Holidays_code",
            output_path="",
            persist_cv_results=False,
            persist_cv_data=False,
            persist_model_reprs=False,
            persist_best_model=False,
            persist_partition=False,
            persist_model_selector_results=False,
        )

        assert len(results) == len(train_data[parallel_columns +
                                              ["Product"]].drop_duplicates())
        assert isinstance(results[0], ModelSelectorResult)

        if executor_name == "dask_already_running":
            client.shutdown()

    if client.status != "closed":
        client.shutdown()
Example #15
def test_result(data, serializer):
    r = XpersistResult(
        CacheStore(),
        serializer=serializer,
    )

    new = r.write(data)
    assert new.read(new.location).value == data
    assert r.cache_store.get(new.location) == data
    assert r.exists(new.location)


@pytest.mark.parametrize(
    'executor',
    [
        LocalExecutor(),
        DaskExecutor(
            cluster_kwargs={'processes': False, 'threads_per_worker': 8},
            debug=True,
        ),
    ],
)
def test_result_flow(executor):
    os.environ['PREFECT__FLOWS__CHECKPOINTING'] = 'True'
    r = XpersistResult(
        CacheStore(),
        serializer='xarray.netcdf',
    )

    @task(target='testing.nc', result=r)
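    def make_dataset():
        # Hypothetical completion: the original snippet is truncated right
        # after the decorator above, so this task body, the flow, and the
        # assertion below are illustrative only.
        import xarray as xr
        return xr.Dataset({'a': ('x', [1, 2, 3])})

    with Flow('test-xpersist-result') as flow:
        make_dataset()

    state = flow.run(executor=executor)
    assert state.is_successful()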
Example #16
def test_is_pickleable(self):
    e = LocalExecutor()
    post = cloudpickle.loads(cloudpickle.dumps(e))
    assert isinstance(post, LocalExecutor)
Example #17
def test_is_pickleable_after_start(self):
    e = LocalExecutor()
    with e.start():
        post = cloudpickle.loads(cloudpickle.dumps(e))
        assert isinstance(post, LocalExecutor)
Example #18
def test_non_keyed_states_are_hydrated_correctly_with_retries(
        monkeypatch, tmpdir):
    """
    Ensures that retries longer than 10 minutes properly "hydrate" upstream states
    so that mapped tasks retry correctly - for mapped tasks, even non-data dependencies
    can affect the number of children spawned.
    """
    @prefect.task
    def return_list():
        return [1, 2, 3]

    @prefect.task(max_retries=1, retry_delay=datetime.timedelta(minutes=20))
    def fail_once():
        if prefect.context.get("task_run_count", 0) < 2:
            raise SyntaxError("bad")
        else:
            return 100

    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())
    task_run_id_2 = str(uuid.uuid4())

    with prefect.Flow(name="test-retries",
                      result=LocalResult(dir=tmpdir)) as flow:
        t1 = fail_once.map(upstream_tasks=[return_list])

    monkeypatch.setattr("requests.Session", MagicMock())
    monkeypatch.setattr("requests.post", MagicMock())

    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(id=task_run_id_1,
                    task_slug=flow.slugs[t1],
                    flow_run_id=flow_run_id),
            TaskRun(
                id=task_run_id_2,
                task_slug=flow.slugs[return_list],
                flow_run_id=flow_run_id,
            ),
        ] + [
            TaskRun(id=str(uuid.uuid4()),
                    task_slug=flow.slugs[t],
                    flow_run_id=flow_run_id)
            for t in flow.tasks if t not in [t1, return_list]
        ],
        monkeypatch=monkeypatch,
    )

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert client.flow_runs[flow_run_id].state.is_running()
    assert client.task_runs[task_run_id_1].state.is_mapped()
    assert client.task_runs[task_run_id_2].state.is_successful()

    # there should be a total of 4 task runs for the mapped task
    # (one parent run plus three mapped children)
    assert (len([
        tr for tr in client.task_runs.values()
        if tr.task_slug == flow.slugs[t1]
    ]) == 4)

    # all of t1's child task runs should be retrying
    assert all([
        isinstance(tr.state, Retrying) for tr in client.task_runs.values()
        if (tr.task_slug == flow.slugs[t1] and tr.map_index != -1)
    ])

    # RUN A SECOND TIME with an artificially updated start time
    # and remove all in-memory data
    for idx, tr in client.task_runs.items():
        if tr.task_slug == flow.slugs[t1] and tr.map_index != -1:
            tr.state.start_time = pendulum.now("UTC")

    for idx, tr in client.task_runs.items():
        tr.state._result.value = None

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert (len([
        tr for tr in client.task_runs.values()
        if tr.task_slug == flow.slugs[t1]
    ]) == 4)
    assert all(tr.state.is_successful() for tr in client.task_runs.values())