def test_graphql_repr_falls_back_to_dict_repr():
    gql = {"flow_run": Pending("test")}
    res = GraphQLResult(gql)
    assert repr(res) == """{'flow_run': <Pending: "test">}"""
def get_flow_run_state(
    self,
    state: State,
    task_states: Dict[Task, State],
    task_contexts: Dict[Task, Dict[str, Any]],
    return_tasks: Set[Task],
    task_runner_state_handlers: Iterable[Callable],
    executor: "prefect.engine.executors.base.Executor",
) -> State:
    """
    Runs the flow.

    Args:
        - state (State): starting state for the Flow. Defaults to `Pending`
        - task_states (dict): dictionary of task states to begin
            computation with, with keys being Tasks and values their corresponding state
        - task_contexts (Dict[Task, Dict[str, Any]]): contexts that will be provided to each task
        - return_tasks ([Task], optional): list of Tasks to include in the
            final returned Flow state. Defaults to `None`
        - task_runner_state_handlers (Iterable[Callable]): A list of state change
            handlers that will be provided to the task_runner, and called whenever a task
            changes state.
        - executor (Executor): executor to use when performing
            computation; defaults to the executor provided in your prefect configuration

    Returns:
        - State: `State` representing the final post-run state of the `Flow`.
    """
    # this dictionary is used for tracking the states of "children" mapped tasks;
    # when running on Dask, we want to avoid serializing futures, so instead
    # of storing child task states in the `map_states` attribute we store them
    # in this dictionary, and only after they are resolved do we attach them
    # to the Mapped state
    mapped_children = dict()  # type: Dict[Task, list]

    if not state.is_running():
        self.logger.info("Flow is not in a Running state.")
        raise ENDRUN(state)

    if return_tasks is None:
        return_tasks = set()
    if set(return_tasks).difference(self.flow.tasks):
        raise ValueError("Some tasks in return_tasks were not found in the flow.")

    def extra_context(task: Task, task_index: int = None) -> dict:
        return {
            "task_name": task.name,
            "task_tags": task.tags,
            "task_index": task_index,
        }

    # -- process each task in order

    with executor.start():

        for task in self.flow.sorted_tasks():
            task_state = task_states.get(task)

            # if a task is a constant task, we already know its return value
            # no need to use up resources by running it through a task runner
            if task_state is None and isinstance(
                task, prefect.tasks.core.constants.Constant
            ):
                task_states[task] = task_state = Success(result=task.value)

            # if the state is finished, don't run the task, just use the provided state; if
            # the state is cached / mapped, we still want to run the task runner pipeline
            # steps to either ensure the cache is still valid or to recreate the mapped
            # pipeline for possible retries
            if (
                isinstance(task_state, State)
                and task_state.is_finished()
                and not task_state.is_cached()
                and not task_state.is_mapped()
            ):
                continue

            upstream_states = {}  # type: Dict[Edge, State]

            # this dictionary is used exclusively for "reduce" tasks; in particular we store
            # the states / futures corresponding to the upstream children, and if running
            # on Dask, let Dask resolve them at the appropriate time.
            # Note: this is an optimization that allows Dask to resolve the mapped
            # dependencies by "elevating" them to a function argument.
            upstream_mapped_states = {}  # type: Dict[Edge, list]

            # -- process each edge to the task
            for edge in self.flow.edges_to(task):

                # load the upstream task states (supplying Pending as a default)
                upstream_states[edge] = task_states.get(
                    edge.upstream_task, Pending(message="Task state not available.")
                )

                # if the edge is flattened and not the result of a map, then we
                # preprocess the upstream states. If it IS the result of a
                # map, it will be handled in `prepare_upstream_states_for_mapping`
                if edge.flattened:
                    if not isinstance(upstream_states[edge], Mapped):
                        upstream_states[edge] = executor.submit(
                            executors.flatten_upstream_state, upstream_states[edge]
                        )

                # this checks whether the task is a "reduce" task for a mapped pipeline
                # and if so, collects the appropriate upstream children
                if not edge.mapped and isinstance(upstream_states[edge], Mapped):
                    children = mapped_children.get(edge.upstream_task, [])

                    # if the edge is flattened, then we need to wait for the mapped children
                    # to complete and then flatten them
                    if edge.flattened:
                        children = executors.flatten_mapped_children(
                            mapped_children=children, executor=executor
                        )

                    upstream_mapped_states[edge] = children

            # augment edges with upstream constants
            for key, val in self.flow.constants[task].items():
                edge = Edge(
                    upstream_task=prefect.tasks.core.constants.Constant(val),
                    downstream_task=task,
                    key=key,
                )
                upstream_states[edge] = Success(
                    "Auto-generated constant value",
                    result=ConstantResult(value=val),
                )

            # handle mapped tasks
            if any([edge.mapped for edge in upstream_states.keys()]):

                # wait on upstream states to determine the width of the pipeline
                # this is the key to depth-first execution
                upstream_states = executor.wait(
                    {e: state for e, state in upstream_states.items()}
                )

                # we submit the task to the task runner to determine if
                # we can proceed with mapping - if the new task state is not a Mapped
                # state then we don't proceed
                task_states[task] = executor.wait(
                    executor.submit(
                        run_task,
                        task=task,
                        state=task_state,  # original state
                        upstream_states=upstream_states,
                        context=dict(prefect.context, **task_contexts.get(task, {})),
                        flow_result=self.flow.result,
                        task_runner_cls=self.task_runner_cls,
                        task_runner_state_handlers=task_runner_state_handlers,
                        upstream_mapped_states=upstream_mapped_states,
                        is_mapped_parent=True,
                        extra_context=extra_context(task),
                    )
                )

                # either way, we should now have enough resolved states to restructure
                # the upstream states into a list of upstream state dictionaries to
                # iterate over
                list_of_upstream_states = executors.prepare_upstream_states_for_mapping(
                    task_states[task],
                    upstream_states,
                    mapped_children,
                    executor=executor,
                )

                submitted_states = []

                for idx, states in enumerate(list_of_upstream_states):
                    # if we are on a future rerun of a partially complete flow run,
                    # there might be mapped children in a retrying state; this check
                    # looks into the current task state's map_states for such info
                    if (
                        isinstance(task_state, Mapped)
                        and len(task_state.map_states) >= idx + 1
                    ):
                        current_state = task_state.map_states[idx]  # type: Optional[State]
                    elif isinstance(task_state, Mapped):
                        current_state = None
                    else:
                        current_state = task_state

                    # this is where each child is submitted for actual work
                    submitted_states.append(
                        executor.submit(
                            run_task,
                            task=task,
                            state=current_state,
                            upstream_states=states,
                            context=dict(
                                prefect.context,
                                **task_contexts.get(task, {}),
                                map_index=idx,
                            ),
                            flow_result=self.flow.result,
                            task_runner_cls=self.task_runner_cls,
                            task_runner_state_handlers=task_runner_state_handlers,
                            upstream_mapped_states=upstream_mapped_states,
                            extra_context=extra_context(task, task_index=idx),
                        )
                    )

                if isinstance(task_states.get(task), Mapped):
                    mapped_children[task] = submitted_states  # type: ignore

            else:
                task_states[task] = executor.submit(
                    run_task,
                    task=task,
                    state=task_state,
                    upstream_states=upstream_states,
                    context=dict(prefect.context, **task_contexts.get(task, {})),
                    flow_result=self.flow.result,
                    task_runner_cls=self.task_runner_cls,
                    task_runner_state_handlers=task_runner_state_handlers,
                    upstream_mapped_states=upstream_mapped_states,
                    extra_context=extra_context(task),
                )

        # ---------------------------------------------
        # Collect results
        # ---------------------------------------------

        # terminal tasks determine if the flow is finished
        terminal_tasks = self.flow.terminal_tasks()

        # reference tasks determine flow state
        reference_tasks = self.flow.reference_tasks()

        # wait until all terminal tasks are finished
        final_tasks = terminal_tasks.union(reference_tasks).union(return_tasks)
        final_states = executor.wait(
            {
                t: task_states.get(t, Pending("Task not evaluated by FlowRunner."))
                for t in final_tasks
            }
        )

        # also wait for any children of Mapped tasks to finish, and add them
        # to the dictionary to determine flow state
        all_final_states = final_states.copy()
        for t, s in list(final_states.items()):
            if s.is_mapped():
                # ensure we wait for any mapped children to complete
                if t in mapped_children:
                    s.map_states = executor.wait(mapped_children[t])
                s.result = [ms.result for ms in s.map_states]
                all_final_states[t] = s.map_states

        assert isinstance(final_states, dict)

    key_states = set(flatten_seq([all_final_states[t] for t in reference_tasks]))
    terminal_states = set(flatten_seq([all_final_states[t] for t in terminal_tasks]))
    return_states = {t: final_states[t] for t in return_tasks}

    state = self.determine_final_state(
        state=state,
        key_states=key_states,
        return_states=return_states,
        terminal_states=terminal_states,
    )

    return state
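# A minimal usage sketch (not part of the runner module) illustrating the mapped
# pipeline that `get_flow_run_state` executes depth-first: once the upstream width
# is known, each mapped child of `plus_one` is submitted individually, and the
# "reduce" task `get_sum` receives the resolved children. The flow and task names
# here are illustrative only.

import prefect
from prefect import Flow
from prefect.engine.flow_runner import FlowRunner


@prefect.task
def numbers():
    return [1, 2, 3]


@prefect.task
def plus_one(x):
    return x + 1


@prefect.task
def get_sum(x):
    return sum(x)


with Flow(name="mapped-example") as flow:
    n = numbers()
    mapped = plus_one.map(n)  # one child task run per element of `n`
    total = get_sum(mapped)   # reduce task; receives the resolved children

# return_tasks must be a subset of flow.tasks, otherwise a ValueError is raised
state = FlowRunner(flow=flow).run(return_tasks=[total])
assert state.result[total].result == 9  # sum of [2, 3, 4]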
class FlowRunner(Runner):
    """
    FlowRunners handle the execution of Flows and determine the State of a Flow
    before, during and after the Flow is run.

    In particular, through the FlowRunner you can specify which tasks should be
    the first tasks to run, which tasks should be returned after the Flow is finished,
    and what states each task should be initialized with.

    Args:
        - flow (Flow): the `Flow` to be run
        - task_runner_cls (TaskRunner, optional): The class used for running
            individual Tasks. Defaults to [TaskRunner](task_runner.html)
        - state_handlers (Iterable[Callable], optional): A list of state change handlers
            that will be called whenever the flow changes state, providing an
            opportunity to inspect or modify the new state. The handler
            will be passed the flow runner instance, the old (prior) state, and the new
            (current) state, with the following signature:
                `state_handler(fr: FlowRunner, old_state: State, new_state: State) -> Optional[State]`
            If multiple functions are passed, then the `new_state` argument will be the
            result of the previous handler.

    Note: new FlowRunners are initialized within the call to `Flow.run()` and in general,
    this is the endpoint through which FlowRunners will be interacted with most frequently.

    Example:
    ```python
    @task
    def say_hello():
        print('hello')

    with Flow("My Flow") as f:
        say_hello()

    fr = FlowRunner(flow=f)
    flow_state = fr.run()
    ```
    """

    def __init__(
        self,
        flow: Flow,
        task_runner_cls: type = None,
        state_handlers: Iterable[Callable] = None,
    ):
        self.flow = flow
        if task_runner_cls is None:
            task_runner_cls = prefect.engine.get_default_task_runner_class()
        self.task_runner_cls = task_runner_cls
        super().__init__(state_handlers=state_handlers)

    def __repr__(self) -> str:
        return "<{}: {}>".format(type(self).__name__, self.flow.name)

    def call_runner_target_handlers(self, old_state: State, new_state: State) -> State:
        """
        A special state handler that the FlowRunner uses to call its flow's state handlers.
        This method is called as part of the base Runner's `handle_state_change()` method.

        Args:
            - old_state (State): the old (previous) state
            - new_state (State): the new (current) state

        Returns:
            - State: the new state
        """
        self.logger.debug(
            "Flow '{name}': Handling state change from {old} to {new}".format(
                name=self.flow.name,
                old=type(old_state).__name__,
                new=type(new_state).__name__,
            )
        )
        for handler in self.flow.state_handlers:
            new_state = handler(self.flow, old_state, new_state) or new_state

        return new_state

    def initialize_run(  # type: ignore
        self,
        state: Optional[State],
        task_states: Dict[Task, State],
        context: Dict[str, Any],
        task_contexts: Dict[Task, Dict[str, Any]],
        parameters: Dict[str, Any],
    ) -> FlowRunnerInitializeResult:
        """
        Initializes the Task run by initializing state and context appropriately.

        If the provided state is a Submitted state, the state it wraps is extracted.

        Args:
            - state (Optional[State]): the initial state of the run
            - task_states (Dict[Task, State]): a dictionary of any initial task states
            - context (Dict[str, Any], optional): prefect.Context to use for execution
                for each Task run
            - task_contexts (Dict[Task, Dict[str, Any]], optional): contexts that will be
                provided to each task
            - parameters (dict): the parameter values for the run

        Returns:
            - NamedTuple: a tuple of initialized objects:
                `(state, task_states, context, task_contexts)`
        """

        # overwrite context parameters one-by-one
        context_params = context.setdefault("parameters", {})
        for p in self.flow.parameters():
            if not p.required:
                context_params.setdefault(p.name, p.default)
        for param, value in (parameters or {}).items():
            context_params[param] = value

        context.update(flow_name=self.flow.name)
        context.setdefault("scheduled_start_time", pendulum.now("utc"))

        # add various formatted dates to context
        now = pendulum.now("utc")
        dates = {
            "date": now,
            "today": now.strftime("%Y-%m-%d"),
            "yesterday": now.add(days=-1).strftime("%Y-%m-%d"),
            "tomorrow": now.add(days=1).strftime("%Y-%m-%d"),
            "today_nodash": now.strftime("%Y%m%d"),
            "yesterday_nodash": now.add(days=-1).strftime("%Y%m%d"),
            "tomorrow_nodash": now.add(days=1).strftime("%Y%m%d"),
        }
        for key, val in dates.items():
            context.setdefault(key, val)

        for task in self.flow.tasks:
            task_contexts.setdefault(task, {}).update(
                task_name=task.name, task_slug=self.flow.slugs[task]
            )

        state, context = super().initialize_run(state=state, context=context)

        return FlowRunnerInitializeResult(
            state=state,
            task_states=task_states,
            context=context,
            task_contexts=task_contexts,
        )

    def run(
        self,
        state: State = None,
        task_states: Dict[Task, State] = None,
        return_tasks: Iterable[Task] = None,
        parameters: Dict[str, Any] = None,
        task_runner_state_handlers: Iterable[Callable] = None,
        executor: "prefect.engine.executors.Executor" = None,
        context: Dict[str, Any] = None,
        task_contexts: Dict[Task, Dict[str, Any]] = None,
    ) -> State:
        """
        The main endpoint for FlowRunners. Calling this method will perform all
        computations contained within the Flow and return the final state of the Flow.

        Args:
            - state (State, optional): starting state for the Flow. Defaults to `Pending`
            - task_states (dict, optional): dictionary of task states to begin
                computation with, with keys being Tasks and values their corresponding state
            - return_tasks ([Task], optional): list of Tasks to include in the
                final returned Flow state. Defaults to `None`
            - parameters (dict, optional): dictionary of any needed Parameter
                values, with keys being strings representing Parameter names and values
                being their corresponding values
            - task_runner_state_handlers (Iterable[Callable], optional): A list of state
                change handlers that will be provided to the task_runner, and called
                whenever a task changes state.
            - executor (Executor, optional): executor to use when performing
                computation; defaults to the executor specified in your prefect configuration
            - context (Dict[str, Any], optional): prefect.Context to use for execution
                for each Task run
            - task_contexts (Dict[Task, Dict[str, Any]], optional): contexts that will be
                provided to each task

        Returns:
            - State: `State` representing the final post-run state of the `Flow`.
        """
        self.logger.info("Beginning Flow run for '{}'".format(self.flow.name))

        # make copies to avoid modifying user inputs
        task_states = dict(task_states or {})
        context = dict(context or {})
        task_contexts = dict(task_contexts or {})
        parameters = dict(parameters or {})

        if executor is None:
            # Use the executor on the flow, if configured
            executor = getattr(self.flow, "executor", None)
            if executor is None:
                executor = prefect.engine.get_default_executor_class()()

        self.logger.debug("Using executor type %s", type(executor).__name__)

        try:
            state, task_states, context, task_contexts = self.initialize_run(
                state=state,
                task_states=task_states,
                context=context,
                task_contexts=task_contexts,
                parameters=parameters,
            )

            with prefect.context(context):
                state = self.check_flow_is_pending_or_running(state)
                state = self.check_flow_reached_start_time(state)
                state = self.set_flow_to_running(state)
                state = self.get_flow_run_state(
                    state,
                    task_states=task_states,
                    task_contexts=task_contexts,
                    return_tasks=return_tasks,
                    task_runner_state_handlers=task_runner_state_handlers,
                    executor=executor,
                )

        except ENDRUN as exc:
            state = exc.state

        # All other exceptions are trapped and turned into Failed states
        except Exception as exc:
            self.logger.exception(
                "Unexpected error while running flow: {}".format(repr(exc))
            )
            if prefect.context.get("raise_on_exception"):
                raise exc
            new_state = Failed(
                message="Unexpected error while running flow: {}".format(repr(exc)),
                result=exc,
            )
            state = self.handle_state_change(state or Pending(), new_state)

        return state
async def _create_flow_run(
    flow_id: str = None,
    parameters: dict = None,
    context: dict = None,
    scheduled_start_time: datetime.datetime = None,
    flow_run_name: str = None,
    version_group_id: str = None,
) -> Any:
    """
    Creates a new flow run for an existing flow.

    Args:
        - flow_id (str): A string representing the current flow id
        - parameters (dict, optional): A dictionary of parameters that were specified for
            the flow
        - context (dict, optional): A dictionary of context values
        - scheduled_start_time (datetime.datetime): When the flow_run should be scheduled
            to run. If `None`, defaults to right now. Must be UTC.
        - flow_run_name (str, optional): An optional string representing this flow run
        - version_group_id (str, optional): An optional version group ID; if provided,
            will run the most recent unarchived version of the group
    """

    if flow_id is None and version_group_id is None:
        raise ValueError("One of flow_id or version_group_id must be provided.")

    scheduled_start_time = scheduled_start_time or pendulum.now()

    if flow_id:
        where_clause = {"id": {"_eq": flow_id}}
    elif version_group_id:
        where_clause = {
            "version_group_id": {"_eq": version_group_id},
            "archived": {"_eq": False},
        }

    flow = await models.Flow.where(where=where_clause).first(
        {
            "id": True,
            "archived": True,
            "tenant_id": True,
            "parameters": True,
            "flow_group_id": True,
            "flow_group": {"default_parameters": True},
        },
        order_by={"version": EnumValue("desc")},
    )  # type: Any

    if not flow:
        msg = (
            f"Flow {flow_id} not found"
            if flow_id
            else f"Version group {version_group_id} has no unarchived flows."
        )
        raise exceptions.NotFound(msg)
    elif flow.archived:
        raise ValueError(f"Flow {flow.id} is archived.")

    # check parameters
    run_parameters = flow.flow_group.default_parameters
    run_parameters.update((parameters or {}))
    required_parameters = [p["name"] for p in flow.parameters if p["required"]]
    missing = set(required_parameters).difference(run_parameters)
    if missing:
        raise ValueError(f"Required parameters were not supplied: {missing}")

    state = Scheduled(message="Flow run scheduled.", start_time=scheduled_start_time)

    run = models.FlowRun(
        tenant_id=flow.tenant_id,
        flow_id=flow_id or flow.id,
        parameters=run_parameters,
        context=context or {},
        scheduled_start_time=scheduled_start_time,
        name=flow_run_name or names.generate_slug(2),
        states=[
            models.FlowRunState(
                tenant_id=flow.tenant_id,
                **models.FlowRunState.fields_from_state(
                    Pending(message="Flow run created")
                ),
            )
        ],
    )

    flow_run_id = await run.insert()

    # apply the flow run's initial state via `set_flow_run_state`
    await api.states.set_flow_run_state(flow_run_id=flow_run_id, state=state)

    return flow_run_id
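# A hedged sketch of how `_create_flow_run` might be invoked from other
# server-side code; the flow id is a placeholder, not a real identifier.
# Supplying neither flow_id nor version_group_id raises ValueError, and
# missing required parameters are rejected before any rows are inserted.

async def schedule_example_run() -> str:
    flow_run_id = await _create_flow_run(
        flow_id="<flow-id>",          # placeholder id
        parameters={"x": 1},          # merged over flow_group default_parameters
        context={"key": "value"},
        flow_run_name="example-run",  # otherwise a two-word slug is generated
    )
    return flow_run_id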
@pytest.mark.parametrize(
    "state_check",
    [
        dict(state=Cancelled(), assert_true={"is_finished"}),
        dict(state=Cached(), assert_true={"is_cached", "is_finished", "is_successful"}),
        dict(state=ClientFailed(), assert_true={"is_meta_state"}),
        dict(state=Failed(), assert_true={"is_finished", "is_failed"}),
        dict(state=Finished(), assert_true={"is_finished"}),
        dict(state=Looped(), assert_true={"is_finished", "is_looped"}),
        dict(state=Mapped(), assert_true={"is_finished", "is_mapped", "is_successful"}),
        dict(state=Paused(), assert_true={"is_pending", "is_scheduled"}),
        dict(state=Pending(), assert_true={"is_pending"}),
        dict(state=Queued(), assert_true={"is_meta_state", "is_queued"}),
        dict(state=Resume(), assert_true={"is_pending", "is_scheduled"}),
        dict(state=Retrying(), assert_true={"is_pending", "is_scheduled", "is_retrying"}),
        dict(state=Running(), assert_true={"is_running"}),
        dict(state=Scheduled(), assert_true={"is_pending", "is_scheduled"}),
        dict(state=Skipped(), assert_true={"is_finished", "is_successful", "is_skipped"}),
        dict(state=Submitted(), assert_true={"is_meta_state", "is_submitted"}),
        dict(state=Success(), assert_true={"is_finished", "is_successful"}),
        dict(state=TimedOut(), assert_true={"is_finished", "is_failed"}),
        dict(state=TriggerFailed(), assert_true={"is_finished", "is_failed"}),
    ],
)
def test_state_is_methods(state_check):
    # body assumed from the parametrization above: every predicate named in
    # assert_true should return True for the given state
    state = state_check["state"]
    for predicate in state_check["assert_true"]:
        assert getattr(state, predicate)()
def test_states_with_mutable_attrs_are_hashable():
    assert {State(result=[1]), Pending(cached_inputs=dict(a=1))}
def initialize_run(self, *args, **kwargs):
    raise ENDRUN(state=Pending())
class TestTaskRunStates:
    async def test_set_task_run_state(self, task_run_id):
        result = await api.states.set_task_run_state(
            task_run_id=task_run_id, state=Failed()
        )

        assert result.task_run_id == task_run_id

        query = await models.TaskRun.where(id=task_run_id).first(
            {"version", "state", "serialized_state"}
        )

        assert query.version == 2
        assert query.state == "Failed"
        assert query.serialized_state["type"] == "Failed"

    @pytest.mark.parametrize("state", [Failed(), Success()])
    async def test_set_task_run_state_fails_with_wrong_task_run_id(self, state):
        with pytest.raises(ValueError, match="State update failed"):
            await api.states.set_task_run_state(
                task_run_id=str(uuid.uuid4()), state=state
            )

    @pytest.mark.parametrize(
        "state", [s() for s in State.children() if not s().is_running()]
    )
    async def test_state_does_not_set_heartbeat_unless_running(
        self, state, task_run_id
    ):
        task_run = await models.TaskRun.where(id=task_run_id).first({"heartbeat"})
        assert task_run.heartbeat is None

        await api.states.set_task_run_state(task_run_id=task_run_id, state=state)

        task_run = await models.TaskRun.where(id=task_run_id).first({"heartbeat"})
        assert task_run.heartbeat is None

    async def test_running_state_sets_heartbeat(
        self, task_run_id, running_flow_run_id
    ):
        task_run = await models.TaskRun.where(id=task_run_id).first({"heartbeat"})
        assert task_run.heartbeat is None

        dt = pendulum.now("UTC")
        await api.states.set_task_run_state(task_run_id=task_run_id, state=Running())

        task_run = await models.TaskRun.where(id=task_run_id).first({"heartbeat"})
        assert task_run.heartbeat > dt

    async def test_trigger_failed_state_does_not_set_end_time(self, task_run_id):
        await api.states.set_task_run_state(
            task_run_id=task_run_id, state=TriggerFailed()
        )
        task_run_info = await models.TaskRun.where(id=task_run_id).first(
            {"id", "start_time", "end_time"}
        )
        assert not task_run_info.start_time
        assert not task_run_info.end_time

    @pytest.mark.parametrize(
        "state",
        [s() for s in State.children() if s not in _MetaState.children()],
        ids=[s.__name__ for s in State.children() if s not in _MetaState.children()],
    )
    async def test_setting_a_task_run_state_pulls_cached_inputs_if_possible(
        self, task_run_id, state, running_flow_run_id
    ):
        res1 = SafeResult(1, result_handler=JSONResultHandler())
        res2 = SafeResult({"z": 2}, result_handler=JSONResultHandler())
        complex_result = {"x": res1, "y": res2}
        cached_state = Failed(cached_inputs=complex_result)
        await models.TaskRun.where(id=task_run_id).update(
            set=dict(serialized_state=cached_state.serialize())
        )

        # try to schedule the task run to scheduled
        await api.states.set_task_run_state(task_run_id=task_run_id, state=state)

        task_run = await models.TaskRun.where(id=task_run_id).first(
            {"serialized_state"}
        )

        # ensure the state change took place
        assert task_run.serialized_state["type"] == type(state).__name__
        assert task_run.serialized_state["cached_inputs"]["x"]["value"] == 1
        assert task_run.serialized_state["cached_inputs"]["y"]["value"] == {"z": 2}

    @pytest.mark.parametrize(
        "state",
        [
            s(cached_inputs=None)
            for s in State.children()
            if s not in _MetaState.children()
        ],
        ids=[s.__name__ for s in State.children() if s not in _MetaState.children()],
    )
    async def test_task_runs_with_null_cached_inputs_do_not_overwrite_cache(
        self, state, task_run_id, running_flow_run_id
    ):
        await api.states.set_task_run_state(task_run_id=task_run_id, state=state)

        # set up a Retrying state with non-null cached inputs
        res1 = SafeResult(1, result_handler=JSONResultHandler())
        res2 = SafeResult({"z": 2}, result_handler=JSONResultHandler())
        complex_result = {"x": res1, "y": res2}
        cached_state = Retrying(cached_inputs=complex_result)
        await api.states.set_task_run_state(
            task_run_id=task_run_id, state=cached_state
        )

        run = await models.TaskRun.where(id=task_run_id).first({"serialized_state"})

        assert run.serialized_state["cached_inputs"]["x"]["value"] == 1
        assert run.serialized_state["cached_inputs"]["y"]["value"] == {"z": 2}

    @pytest.mark.parametrize(
        "state_cls", [s for s in State.children() if s not in _MetaState.children()]
    )
    async def test_task_runs_cached_inputs_give_preference_to_new_cached_inputs(
        self, state_cls, task_run_id, running_flow_run_id
    ):
        # set up an initial state with cached inputs
        res1 = SafeResult(1, result_handler=JSONResultHandler())
        res2 = SafeResult({"a": 2}, result_handler=JSONResultHandler())
        complex_result = {"b": res1, "c": res2}
        cached_state = state_cls(cached_inputs=complex_result)
        await api.states.set_task_run_state(
            task_run_id=task_run_id, state=cached_state
        )

        # set up a Retrying state with non-null cached inputs
        res1 = SafeResult(1, result_handler=JSONResultHandler())
        res2 = SafeResult({"z": 2}, result_handler=JSONResultHandler())
        complex_result = {"x": res1, "y": res2}
        cached_state = Retrying(cached_inputs=complex_result)
        await api.states.set_task_run_state(
            task_run_id=task_run_id, state=cached_state
        )

        run = Box(
            await models.TaskRun.where(id=task_run_id).first({"serialized_state"})
        )

        # verify that we have cached inputs, and that preference has been given to
        # the new cached inputs
        assert run.serialized_state.cached_inputs
        assert run.serialized_state.cached_inputs.x.value == 1
        assert run.serialized_state.cached_inputs.y.value == {"z": 2}

    @pytest.mark.parametrize(
        "flow_run_state", [Pending(), Running(), Failed(), Success()]
    )
    async def test_running_states_can_not_be_set_if_flow_run_is_not_running(
        self, flow_run_id, task_run_id, flow_run_state
    ):
        await api.states.set_flow_run_state(
            flow_run_id=flow_run_id, state=flow_run_state
        )

        set_running_coroutine = api.states.set_task_run_state(
            task_run_id=task_run_id, state=Running()
        )

        if flow_run_state.is_running():
            assert await set_running_coroutine
            assert (
                await models.TaskRun.where(id=task_run_id).first({"state"})
            ).state == "Running"
        else:
            with pytest.raises(ValueError, match="is not in a running state"):
                await set_running_coroutine
            assert (
                await models.TaskRun.where(id=task_run_id).first({"state"})
            ).state != "Running"
def test_state_type_methods_with_pending_state(self):
    state = Pending()
    assert state.is_pending()
    assert not state.is_retrying()
    assert not state.is_cached()
    assert not state.is_running()
    assert not state.is_finished()
    assert not state.is_skipped()
    assert not state.is_scheduled()
    assert not state.is_successful()
    assert not state.is_failed()
    assert not state.is_mapped()
    assert not state.is_meta_state()
def test_flow_run_handles_error_states_when_initial_state_is_provided():
    with Flow(name="test") as f:
        res = AddTask()("5", 5)
    state = f.run(state=Pending())
    assert state.is_failed()
def test_graphql_repr_falls_back_to_dict_repr():
    gql = {"flow_run": Pending("test")}
    res = as_nested_dict(gql, GraphQLResult)
    assert repr(res) == """{'flow_run': Pending("test")}"""
async def get_or_create_mapped_task_run_children(
    flow_run_id: str, task_id: str, max_map_index: int
) -> List[str]:
    """
    Creates and/or retrieves mapped child task runs for a given flow run and task.

    Args:
        - flow_run_id (str): the flow run associated with the parent task run
        - task_id (str): the task ID to create and/or retrieve
        - max_map_index (int): the number of mapped children e.g., a value of 2
            yields 3 mapped children
    """
    # grab task info
    task = await models.Task.where(id=task_id).first({"cache_key", "tenant_id"})

    # generate task runs to upsert
    task_runs = [
        models.TaskRun(
            tenant_id=task.tenant_id,
            flow_run_id=flow_run_id,
            task_id=task_id,
            map_index=i,
            cache_key=task.cache_key,
        )
        for i in range(max_map_index + 1)
    ]

    # upsert the mapped children
    task_runs = (
        await models.TaskRun().insert_many(
            objects=task_runs,
            on_conflict=dict(
                constraint="task_run_unique_identifier_key",
                update_columns=["cache_key"],
            ),
            selection_set={"returning": {"id", "map_index"}},
        )
    )["returning"]
    task_runs.sort(key=lambda task_run: task_run.map_index)

    # get task runs without states
    stateless_runs = await models.TaskRun.where(
        {
            "_and": [
                {"flow_run_id": {"_eq": flow_run_id}},
                {"task_id": {"_eq": task_id}},
                {"state_id": {"_is_null": True}},
            ]
        }
    ).get({"id", "map_index", "version"})

    # create and insert states for stateless task runs
    task_run_states = [
        models.TaskRunState(
            tenant_id=task.tenant_id,
            task_run_id=task_run.id,
            **models.TaskRunState.fields_from_state(
                Pending(message="Task run created")
            ),
        )
        for task_run in stateless_runs
    ]
    await models.TaskRunState().insert_many(task_run_states)

    # return the task run ids
    return [task_run.id for task_run in task_runs]
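# Hedged usage sketch: per the docstring above, max_map_index is inclusive, so a
# value of 2 upserts three children (map_index 0, 1, 2) and backfills a Pending
# state for any child that does not yet have one. The ids are placeholders.

async def mapped_children_example() -> None:
    child_ids = await get_or_create_mapped_task_run_children(
        flow_run_id="<flow-run-id>",
        task_id="<task-id>",
        max_map_index=2,
    )
    assert len(child_ids) == 3  # ids come back sorted by map_index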
def test_preparing_state_for_cloud_replaces_cached_inputs_with_safe():
    xres = Result(3, result_handler=JSONResultHandler())
    state = prepare_state_for_cloud(Pending(cached_inputs=dict(x=xres)))
    assert state.is_pending()
    assert state.result == NoResult
    assert state.cached_inputs == dict(x=xres)
def run(
    self,
    state: State = None,
    task_states: Dict[Task, State] = None,
    return_tasks: Iterable[Task] = None,
    parameters: Dict[str, Any] = None,
    task_runner_state_handlers: Iterable[Callable] = None,
    executor: "prefect.engine.executors.Executor" = None,
    context: Dict[str, Any] = None,
    task_contexts: Dict[Task, Dict[str, Any]] = None,
) -> State:
    """
    The main endpoint for FlowRunners. Calling this method will perform all
    computations contained within the Flow and return the final state of the Flow.

    Args:
        - state (State, optional): starting state for the Flow. Defaults to `Pending`
        - task_states (dict, optional): dictionary of task states to begin
            computation with, with keys being Tasks and values their corresponding state
        - return_tasks ([Task], optional): list of Tasks to include in the
            final returned Flow state. Defaults to `None`
        - parameters (dict, optional): dictionary of any needed Parameter
            values, with keys being strings representing Parameter names and values
            being their corresponding values
        - task_runner_state_handlers (Iterable[Callable], optional): A list of state
            change handlers that will be provided to the task_runner, and called
            whenever a task changes state.
        - executor (Executor, optional): executor to use when performing
            computation; defaults to the executor specified in your prefect configuration
        - context (Dict[str, Any], optional): prefect.Context to use for execution
            for each Task run
        - task_contexts (Dict[Task, Dict[str, Any]], optional): contexts that will be
            provided to each task

    Returns:
        - State: `State` representing the final post-run state of the `Flow`.
    """
    self.logger.info("Beginning Flow run for '{}'".format(self.flow.name))

    # make copies to avoid modifying user inputs
    task_states = dict(task_states or {})
    context = dict(context or {})
    task_contexts = dict(task_contexts or {})
    parameters = dict(parameters or {})

    if executor is None:
        executor = prefect.engine.get_default_executor_class()()

    try:
        state, task_states, context, task_contexts = self.initialize_run(
            state=state,
            task_states=task_states,
            context=context,
            task_contexts=task_contexts,
            parameters=parameters,
        )

        with prefect.context(context):
            state = self.check_flow_is_pending_or_running(state)
            state = self.check_flow_reached_start_time(state)
            state = self.set_flow_to_running(state)
            state = self.get_flow_run_state(
                state,
                task_states=task_states,
                task_contexts=task_contexts,
                return_tasks=return_tasks,
                task_runner_state_handlers=task_runner_state_handlers,
                executor=executor,
            )

    except ENDRUN as exc:
        state = exc.state

    # All other exceptions are trapped and turned into Failed states
    except Exception as exc:
        self.logger.exception(
            "Unexpected error while running flow: {}".format(repr(exc))
        )
        if prefect.context.get("raise_on_exception"):
            raise exc
        new_state = Failed(
            message="Unexpected error while running flow: {}".format(repr(exc)),
            result=exc,
        )
        state = self.handle_state_change(state or Pending(), new_state)

    return state
class TestRunFlowStep:
    def test_running_state_finishes(self):
        flow = Flow(name="test", tasks=[Task()])
        new_state = FlowRunner(flow=flow).get_flow_run_state(
            state=Running(),
            task_states={},
            task_contexts={},
            return_tasks=set(),
            task_runner_state_handlers=[],
            executor=LocalExecutor(),
        )
        assert new_state.is_successful()

    @pytest.mark.parametrize(
        "state", [Pending(), Retrying(), Finished(), Success(), Failed(), Skipped()]
    )
    def test_other_states_raise_endrun(self, state):
        flow = Flow(name="test", tasks=[Task()])
        with pytest.raises(ENDRUN):
            FlowRunner(flow=flow).get_flow_run_state(
                state=state,
                task_states={},
                task_contexts={},
                return_tasks=set(),
                task_runner_state_handlers=[],
                executor=Executor(),
            )

    def test_determine_final_state_has_final_say(self):
        class MyFlowRunner(FlowRunner):
            def determine_final_state(self, *args, **kwargs):
                return Failed("Very specific error message")

        flow = Flow(name="test", tasks=[Task()])
        new_state = MyFlowRunner(flow=flow).get_flow_run_state(
            state=Running(),
            task_states={},
            task_contexts={},
            return_tasks=set(),
            task_runner_state_handlers=[],
            executor=LocalExecutor(),
        )
        assert new_state.is_failed()
        assert new_state.message == "Very specific error message"

    def test_determine_final_state_preserves_running_states_when_tasks_still_running(
        self,
    ):
        task = Task()
        flow = Flow(name="test", tasks=[task])
        old_state = Running()
        new_state = FlowRunner(flow=flow).get_flow_run_state(
            state=old_state,
            task_states={
                task: Retrying(start_time=pendulum.now("utc").add(days=1))
            },
            task_contexts={},
            return_tasks=set(),
            task_runner_state_handlers=[],
            executor=LocalExecutor(),
        )
        assert new_state is old_state
def get_flow_run_state(
    self,
    state: State,
    task_states: Dict[Task, State],
    task_contexts: Dict[Task, Dict[str, Any]],
    return_tasks: Set[Task],
    task_runner_state_handlers: Iterable[Callable],
    executor: "prefect.engine.executors.base.Executor",
) -> State:
    """
    Runs the flow.

    Args:
        - state (State): starting state for the Flow. Defaults to `Pending`
        - task_states (dict): dictionary of task states to begin
            computation with, with keys being Tasks and values their corresponding state
        - task_contexts (Dict[Task, Dict[str, Any]]): contexts that will be provided to each task
        - return_tasks ([Task], optional): list of Tasks to include in the
            final returned Flow state. Defaults to `None`
        - task_runner_state_handlers (Iterable[Callable]): A list of state change
            handlers that will be provided to the task_runner, and called whenever a task
            changes state.
        - executor (Executor): executor to use when performing
            computation; defaults to the executor provided in your prefect configuration

    Returns:
        - State: `State` representing the final post-run state of the `Flow`.
    """

    if not state.is_running():
        self.logger.info("Flow is not in a Running state.")
        raise ENDRUN(state)

    if return_tasks is None:
        return_tasks = set()
    if set(return_tasks).difference(self.flow.tasks):
        raise ValueError("Some tasks in return_tasks were not found in the flow.")

    # -- process each task in order

    with executor.start():

        for task in self.flow.sorted_tasks():
            task_state = task_states.get(task)
            if task_state is None and isinstance(
                task, prefect.tasks.core.constants.Constant
            ):
                task_states[task] = task_state = Success(result=task.value)

            # if the state is finished, don't run the task, just use the provided state
            if (
                isinstance(task_state, State)
                and task_state.is_finished()
                and not task_state.is_cached()
                and not task_state.is_mapped()
            ):
                continue

            upstream_states = {}  # type: Dict[Edge, Union[State, Iterable]]

            # -- process each edge to the task
            for edge in self.flow.edges_to(task):
                upstream_states[edge] = task_states.get(
                    edge.upstream_task, Pending(message="Task state not available.")
                )

            # -- run the task
            with prefect.context(task_full_name=task.name, task_tags=task.tags):
                task_states[task] = executor.submit(
                    self.run_task,
                    task=task,
                    state=task_state,
                    upstream_states=upstream_states,
                    context=dict(prefect.context, **task_contexts.get(task, {})),
                    task_runner_state_handlers=task_runner_state_handlers,
                    executor=executor,
                )

        # ---------------------------------------------
        # Collect results
        # ---------------------------------------------

        # terminal tasks determine if the flow is finished
        terminal_tasks = self.flow.terminal_tasks()

        # reference tasks determine flow state
        reference_tasks = self.flow.reference_tasks()

        # wait until all terminal tasks are finished
        final_tasks = terminal_tasks.union(reference_tasks).union(return_tasks)
        final_states = executor.wait(
            {
                t: task_states.get(t, Pending("Task not evaluated by FlowRunner."))
                for t in final_tasks
            }
        )

        # also wait for any children of Mapped tasks to finish, and add them
        # to the dictionary to determine flow state
        all_final_states = final_states.copy()
        for t, s in list(final_states.items()):
            if s.is_mapped():
                s.map_states = executor.wait(s.map_states)
                s.result = [ms.result for ms in s.map_states]
                all_final_states[t] = s.map_states

        assert isinstance(final_states, dict)

    key_states = set(flatten_seq([all_final_states[t] for t in reference_tasks]))
    terminal_states = set(flatten_seq([all_final_states[t] for t in terminal_tasks]))
    return_states = {t: final_states[t] for t in return_tasks}

    state = self.determine_final_state(
        state=state,
        key_states=key_states,
        return_states=return_states,
        terminal_states=terminal_states,
    )

    return state
class TestInitializeRun:
    def test_initialize_sets_none_to_pending(self):
        result = FlowRunner(Flow(name="test")).initialize_run(
            state=None, task_states={}, context={}, task_contexts={}, parameters={}
        )
        assert result.state.is_pending()

    @pytest.mark.parametrize("state", [Pending(), Running()])
    def test_initialize_returns_state_if_provided(self, state):
        result = FlowRunner(Flow(name="test")).initialize_run(
            state=state, task_states={}, context={}, task_contexts={}, parameters={}
        )
        assert result.state is state

    def test_initialize_sets_task_contexts(self):
        t1 = Task(name="t1")
        t2 = Parameter(name="x")
        flow = Flow(name="test", tasks=[t1, t2])

        result = FlowRunner(flow).initialize_run(
            state=Pending(), task_states={}, context={}, task_contexts={}, parameters={}
        )
        assert result.task_contexts == {
            t: dict(task_name=t.name, task_slug=flow.slugs[t]) for t in flow.tasks
        }

    def test_initialize_puts_parameters_in_context(self):
        x = Parameter(name="x")
        flow = Flow(name="test", tasks=[x])

        result = FlowRunner(flow).initialize_run(
            state=Pending(),
            task_states={},
            context={},
            task_contexts={},
            parameters={"x": 1},
        )
        assert result.context["parameters"] == {"x": 1}

    def test_parameter_precedence(self):
        x = Parameter(name="x")
        flow = Flow(name="test", tasks=[x])

        result = FlowRunner(flow).initialize_run(
            state=Pending(),
            task_states={},
            context={"parameters": {"x": 2, "y": 1}},
            task_contexts={},
            parameters={"x": 1},
        )
        assert result.context["parameters"] == {"x": 1, "y": 1}
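# A standalone sketch of the precedence rule exercised by test_parameter_precedence
# above, mirroring the merge order in FlowRunner.initialize_run: defaults for
# non-required Parameters are only set if absent, context-supplied values come
# next, and explicitly passed `parameters` always win. `merge_parameters` is a
# hypothetical helper for illustration, not part of the library.

def merge_parameters(defaults: dict, context_params: dict, passed: dict) -> dict:
    merged = dict(context_params)          # start from values already in context
    for name, default in defaults.items():
        merged.setdefault(name, default)   # defaults never overwrite context values
    merged.update(passed)                  # explicitly passed parameters win
    return merged


assert merge_parameters(
    defaults={}, context_params={"x": 2, "y": 1}, passed={"x": 1}
) == {"x": 1, "y": 1}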
def test_task_map_with_no_upstream_results_and_a_mapped_state(executor):
    """
    This test makes sure that mapped tasks properly generate children tasks even when
    run multiple times and without available upstream results. In this test, we run
    the pipeline from a variety of starting points, ensuring that some upstream
    results are unavailable and checking that children pipelines are properly
    regenerated.
    """

    @prefect.task
    def numbers():
        return [1, 2, 3]

    @prefect.task
    def plus_one(x):
        return x + 1

    @prefect.task
    def get_sum(x):
        return sum(x)

    with Flow(name="test") as f:
        n = numbers()
        x = plus_one.map(n)
        y = plus_one.map(x)
        s = get_sum(y)

    # first run with a missing result from `n` but map_states for `x`
    state = FlowRunner(flow=f).run(
        executor=executor,
        task_states={
            n: Success(),
            x: Mapped(
                map_states=[
                    Pending(cached_inputs={"x": Result(i)}) for i in range(1, 4)
                ]
            ),
        },
        return_tasks=f.tasks,
    )

    assert state.is_successful()
    assert state.result[s].result == 12

    # next run with missing results for n and x
    state = FlowRunner(flow=f).run(
        executor=executor,
        task_states={
            n: Success(),
            x: Mapped(map_states=[Success(), Success(), Success()]),
            y: Mapped(
                map_states=[
                    Success(result=3),
                    Success(result=4),
                    Retrying(cached_inputs={"x": Result(4)}),
                ]
            ),
        },
        return_tasks=f.tasks,
    )

    assert state.is_successful()
    assert state.result[s].result == 12

    # next run with missing results for n, x, and y
    state = FlowRunner(flow=f).run(
        executor=executor,
        task_states={
            n: Success(),
            x: Mapped(map_states=[Success(), Success(), Success()]),
            y: Mapped(
                map_states=[Success(result=3), Success(result=4), Success(result=5)]
            ),
        },
        return_tasks=f.tasks,
    )

    assert state.is_successful()
    assert state.result[s].result == 12
class TestTaskRunStates:
    async def test_set_task_run_state(self, task_run_id):
        result = await api.states.set_task_run_state(
            task_run_id=task_run_id, state=Failed()
        )

        assert result.task_run_id == task_run_id

        query = await models.TaskRun.where(id=task_run_id).first(
            {"version", "state", "serialized_state"}
        )

        assert query.version == 2
        assert query.state == "Failed"
        assert query.serialized_state["type"] == "Failed"

    @pytest.mark.parametrize("state", [Failed(), Success()])
    async def test_set_task_run_state_fails_with_wrong_task_run_id(self, state):
        with pytest.raises(ValueError, match="State update failed"):
            await api.states.set_task_run_state(
                task_run_id=str(uuid.uuid4()), state=state
            )

    @pytest.mark.parametrize(
        "state", [s() for s in State.children() if not s().is_running()]
    )
    async def test_state_does_not_set_heartbeat_unless_running(
        self, state, task_run_id
    ):
        task_run = await models.TaskRun.where(id=task_run_id).first({"heartbeat"})
        assert task_run.heartbeat is None

        await api.states.set_task_run_state(task_run_id=task_run_id, state=state)

        task_run = await models.TaskRun.where(id=task_run_id).first({"heartbeat"})
        assert task_run.heartbeat is None

    async def test_running_state_sets_heartbeat(
        self, task_run_id, running_flow_run_id
    ):
        task_run = await models.TaskRun.where(id=task_run_id).first({"heartbeat"})
        assert task_run.heartbeat is None

        dt = pendulum.now("UTC")
        await api.states.set_task_run_state(task_run_id=task_run_id, state=Running())

        task_run = await models.TaskRun.where(id=task_run_id).first({"heartbeat"})
        assert task_run.heartbeat > dt

    async def test_trigger_failed_state_does_not_set_end_time(self, task_run_id):
        await api.states.set_task_run_state(
            task_run_id=task_run_id, state=TriggerFailed()
        )
        task_run_info = await models.TaskRun.where(id=task_run_id).first(
            {"id", "start_time", "end_time"}
        )
        assert not task_run_info.start_time
        assert not task_run_info.end_time

    @pytest.mark.parametrize(
        "flow_run_state", [Pending(), Running(), Failed(), Success()]
    )
    async def test_running_states_can_not_be_set_if_flow_run_is_not_running(
        self, flow_run_id, task_run_id, flow_run_state
    ):
        await api.states.set_flow_run_state(
            flow_run_id=flow_run_id, state=flow_run_state
        )

        set_running_coroutine = api.states.set_task_run_state(
            task_run_id=task_run_id, state=Running()
        )

        if flow_run_state.is_running():
            assert await set_running_coroutine
            assert (
                await models.TaskRun.where(id=task_run_id).first({"state"})
            ).state == "Running"
        else:
            with pytest.raises(ValueError, match="is not in a running state"):
                await set_running_coroutine
            assert (
                await models.TaskRun.where(id=task_run_id).first({"state"})
            ).state != "Running"
def test_states_are_hashable():
    assert {State(), Pending(), Success()}
async def get_or_create_task_run_info(
    flow_run_id: str, task_id: str, map_index: int = None
) -> dict:
    """
    Given a flow_run_id, task_id, and map_index, return details about the
    corresponding task run. If the task run doesn't exist, it will be created.

    Returns:
        - dict: a dict of details about the task run, including its id, version,
            and state.
    """

    if map_index is None:
        map_index = -1

    task_run = await models.TaskRun.where(
        {
            "flow_run_id": {"_eq": flow_run_id},
            "task_id": {"_eq": task_id},
            "map_index": {"_eq": map_index},
        }
    ).first({"id", "version", "state", "serialized_state"})

    if task_run:
        return dict(
            id=task_run.id,
            version=task_run.version,
            state=task_run.state,
            serialized_state=task_run.serialized_state,
        )

    # if it isn't found, add it to the DB
    task = await models.Task.where(id=task_id).first({"cache_key", "tenant_id"})
    if not task:
        raise ValueError("Invalid task ID")

    db_task_run = models.TaskRun(
        tenant_id=task.tenant_id,
        flow_run_id=flow_run_id,
        task_id=task_id,
        map_index=map_index,
        cache_key=task.cache_key,
        version=0,
    )

    db_task_run_state = models.TaskRunState(
        tenant_id=task.tenant_id,
        state="Pending",
        timestamp=pendulum.now(),
        message="Task run created",
        serialized_state=Pending(message="Task run created").serialize(),
    )

    db_task_run.states = [db_task_run_state]
    run = await db_task_run.insert(
        on_conflict=dict(
            constraint="task_run_unique_identifier_key",
            update_columns=["cache_key"],
        ),
        selection_set={"returning": {"id"}},
    )

    return dict(
        id=run.returning.id,
        version=db_task_run.version,
        state="Pending",
        serialized_state=db_task_run_state.serialized_state,
    )
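# Hedged sketch of retrieving (or lazily creating) task run details via the
# coroutine above; the ids are placeholders. A map_index of None is normalized
# to -1, the sentinel for an unmapped task run, and a brand-new run comes back
# in a Pending state.

async def task_run_info_example() -> dict:
    info = await get_or_create_task_run_info(
        flow_run_id="<flow-run-id>",
        task_id="<task-id>",
        map_index=None,  # normalized to -1 (unmapped)
    )
    # keys: "id", "version", "state", "serialized_state"
    return info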
def __init__(self, id, state=None, version=None):
    self.id = id
    self.state = state or Pending()
    self.version = version or 0
# TODO: we only need to rerun these tasks if any pending
# downstream tasks depend on them.
if (
    isinstance(
        task,
        (
            prefect.tasks.core.resource_manager.ResourceSetupTask,
            prefect.tasks.core.resource_manager.ResourceCleanupTask,
            prefect.tasks.secrets.SecretBase,
        ),
    )
    and task_state is not None
    and task_state.is_finished()
    and not task_state.is_cached()
):
    task_states[task] = task_state = Pending()

# if the state is finished, don't run the task, just use the provided state; if
# the state is cached / mapped, we still want to run the task runner pipeline
# steps to either ensure the cache is still valid or to recreate the mapped
# pipeline for possible retries
if (
    isinstance(task_state, State)
    and task_state.is_finished()
    and not task_state.is_cached()
    and not task_state.is_mapped()
):
    continue