def _schedule_refresh_task_state(task_ex_id, delay=0):
    """Schedules a task preconditions check.

    This method provides transactional decoupling of the task preconditions
    check from events that can potentially satisfy those preconditions.

    It's needed in the non-locking model in order to avoid the 'phantom
    read' phenomenon when reading the state of multiple tasks to see if a
    task that depends on them can start. Just starting a separate
    transaction without using the scheduler is not safe due to the
    concurrency window we'd have in that case (the time between
    transactions), whereas the scheduler is a special component that is
    designed to be resistant to failures.

    :param task_ex_id: Task execution ID.
    :param delay: Delay.
    """
    sched = sched_base.get_system_scheduler()

    job = sched_base.SchedulerJob(
        run_after=delay,
        func_name=_REFRESH_TASK_STATE_PATH,
        func_args={'task_ex_id': task_ex_id},
        key=_get_refresh_state_job_key(task_ex_id)
    )

    sched.schedule(job)

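# The job key helper referenced above (and by _schedule_if_needed() below)
# isn't shown in this section. A minimal sketch, assuming the key only
# needs to be deterministic per task execution so that duplicate refresh
# jobs can be detected; the exact prefix is an assumption modeled on the
# 'th_on_a_u-%s' key used in schedule_on_action_update() below.
def _get_refresh_state_job_key(task_ex_id):
    # Hypothetical key format: a short prefix plus the task execution ID.
    return 'th_r_t_s-%s' % task_ex_id
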
def schedule_on_action_update(action_ex, delay=0):
    """Schedules a task update check.

    This method provides transactional decoupling of an action update from
    the task update check.

    It's needed in the non-locking model in order to avoid the 'phantom
    read' phenomenon when reading the state of multiple actions to see if
    a task is updated. Just starting a separate transaction without using
    the scheduler is not safe due to the concurrency window we'd have in
    that case (the time between transactions), whereas the scheduler is a
    special component that is designed to be resistant to failures.

    :param action_ex: Action execution.
    :param delay: Minimum amount of time before the task update check
        should be made.
    """
    # Optimization to avoid opening a new transaction if it's not needed.
    if not action_ex.task_execution.spec.get('with-items'):
        _on_action_update(action_ex)

        return

    sched = sched_base.get_system_scheduler()

    job = sched_base.SchedulerJob(
        run_after=delay,
        func_name=_SCHEDULED_ON_ACTION_UPDATE_PATH,
        func_args={
            'action_ex_id': action_ex.id,
            'wf_action': isinstance(action_ex, models.WorkflowExecution)
        },
        key='th_on_a_u-%s' % action_ex.task_execution_id
    )

    sched.schedule(job)

def start(self):
    super(EngineServer, self).start()

    db_api.setup_db()

    self._scheduler = sched_base.get_system_scheduler()
    self._scheduler.start()

    self._expiration_policy_tg = expiration_policy.setup()

    action_heartbeat_checker.start()

    # If the current engine instance uses a local action executor
    # then we also need to initialize a heartbeat reporter for it.
    # Heartbeats will be sent to the engine tier in the same way as
    # with a remote executor. So if the current cluster node crashes
    # in the middle of executing an action then one of the remaining
    # engine instances will expire the action in a configured period
    # of time.
    if cfg.CONF.executor.type == 'local':
        action_heartbeat_sender.start()

    if self._setup_profiler:
        profiler_utils.setup('mistral-engine', cfg.CONF.engine.host)

    # Initialize and start RPC server.
    self._rpc_server = rpc.get_rpc_server_driver()(cfg.CONF.engine)
    self._rpc_server.register_endpoint(self)

    self._rpc_server.run(executor=cfg.CONF.oslo_rpc_executor)

    self._notify_started('Engine server started.')

def before_task_start(self, task):
    super(WaitBeforePolicy, self).before_task_start(task)

    # No need to wait for a task if the delay is 0.
    if self.delay == 0:
        return

    ctx_key = 'wait_before_policy'

    policy_ctx = task.get_policy_context(ctx_key)

    if policy_ctx.get('skip'):
        # Unset state 'RUNNING_DELAYED'.
        task.set_state(states.RUNNING, None)

        return

    if task.get_state() != states.IDLE:
        policy_ctx.update({'skip': True})

        task.set_state(
            states.RUNNING_DELAYED,
            "Delayed by 'wait-before' policy [delay=%s]" % self.delay
        )

        sched = sched_base.get_system_scheduler()

        job = sched_base.SchedulerJob(
            run_after=self.delay,
            func_name=_CONTINUE_TASK_PATH,
            func_args={'task_ex_id': task.get_id()}
        )

        sched.schedule(job)

def before_task_start(self, task_ex, task_spec):
    super(TimeoutPolicy, self).before_task_start(task_ex, task_spec)

    # No timeout if delay is 0.
    if self.delay == 0:
        return

    sched = sched_base.get_system_scheduler()

    job = sched_base.SchedulerJob(
        run_after=self.delay,
        func_name=_FAIL_IF_INCOMPLETE_TASK_PATH,
        func_args={
            'task_ex_id': task_ex.id,
            'timeout': self.delay
        }
    )

    sched.schedule(job)

    wf_trace.info(
        task_ex,
        "Timeout check scheduled [task=%s, timeout(s)=%s]." %
        (task_ex.id, self.delay)
    )

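# For reference, a minimal Mistral v2 workflow snippet (illustrative, not
# taken from the source) that would activate TimeoutPolicy: the task below
# never completes on its own, so the scheduled job fails it if it is still
# incomplete once the 60-second timeout elapses.
WF_WITH_TIMEOUT = """---
version: '2.0'

wf:
  tasks:
    slow_task:
      action: std.async_noop
      timeout: 60
"""
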
def after_task_complete(self, task_ex, task_spec):
    super(WaitAfterPolicy, self).after_task_complete(task_ex, task_spec)

    # No need to postpone a task if the delay is 0.
    if self.delay == 0:
        return

    context_key = 'wait_after_policy'

    runtime_context = _ensure_context_has_key(
        task_ex.runtime_context,
        context_key
    )

    task_ex.runtime_context = runtime_context

    policy_context = runtime_context[context_key]

    if policy_context.get('skip'):
        # Skip, already processed.
        return

    policy_context.update({'skip': True})

    _log_task_delay(task_ex, self.delay)

    end_state = task_ex.state
    end_state_info = task_ex.state_info

    # TODO(rakhmerov): Policies probably need to have the tasks.Task
    # interface in order to manage task state safely.
    # Set task state to 'RUNNING_DELAYED'.
    task_ex.state = states.RUNNING_DELAYED
    task_ex.state_info = (
        'Suspended by wait-after policy for %s seconds' % self.delay
    )

    # Schedule a job to change the task state to RUNNING again.
    sched = sched_base.get_system_scheduler()

    job = sched_base.SchedulerJob(
        run_after=self.delay,
        func_name=_COMPLETE_TASK_PATH,
        func_args={
            'task_ex_id': task_ex.id,
            'state': end_state,
            'state_info': end_state_info
        }
    )

    sched.schedule(job)

def _schedule_if_needed(t_ex_id):
    # NOTE(rakhmerov): we need to minimize the number of scheduled jobs
    # that refresh the state of "join" tasks, so we check whether the
    # corresponding jobs are already scheduled. Note that we must ignore
    # scheduled jobs that are currently being processed because of a
    # possible race with the transaction that deletes scheduled jobs,
    # i.e. the job may still exist in the DB (the deleting transaction
    # hasn't committed yet) but it has already been processed and the
    # task state hasn't changed.
    sched = sched_base.get_system_scheduler()

    jobs_exist = sched.has_scheduled_jobs(
        key=_get_refresh_state_job_key(t_ex_id),
        processing=False
    )

    if not jobs_exist:
        _schedule_refresh_task_state(t_ex_id)

def before_task_start(self, task_ex, task_spec):
    super(WaitBeforePolicy, self).before_task_start(task_ex, task_spec)

    # No need to wait for a task if the delay is 0.
    if self.delay == 0:
        return

    context_key = 'wait_before_policy'

    runtime_context = _ensure_context_has_key(
        task_ex.runtime_context,
        context_key
    )

    task_ex.runtime_context = runtime_context

    policy_context = runtime_context[context_key]

    if policy_context.get('skip'):
        # Unset state 'RUNNING_DELAYED'.
        wf_trace.info(
            task_ex,
            "Task '%s' [%s -> %s]"
            % (task_ex.name, states.RUNNING_DELAYED, states.RUNNING)
        )

        task_ex.state = states.RUNNING

        return

    if task_ex.state != states.IDLE:
        policy_context.update({'skip': True})

        _log_task_delay(task_ex, self.delay)

        task_ex.state = states.RUNNING_DELAYED

        sched = sched_base.get_system_scheduler()

        job = sched_base.SchedulerJob(
            run_after=self.delay,
            func_name=_CONTINUE_TASK_PATH,
            func_args={'task_ex_id': task_ex.id}
        )

        sched.schedule(job)

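# A minimal workflow snippet (illustrative) that activates WaitBeforePolicy:
# the task is put into RUNNING_DELAYED, and the scheduled
# _CONTINUE_TASK_PATH job resumes it after 10 seconds.
WF_WITH_WAIT_BEFORE = """---
version: '2.0'

wf:
  tasks:
    delayed_task:
      action: std.noop
      wait-before: 10
"""
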
def _schedule_check_and_fix_integrity(wf_ex, delay=0):
    """Schedules a workflow integrity check.

    :param wf_ex: Workflow execution.
    :param delay: Minimum amount of time before the check should be made.
    """
    if CONF.engine.execution_integrity_check_delay < 0:
        # A negative value means the integrity check is disabled.
        return

    sched = sched_base.get_system_scheduler()

    job = sched_base.SchedulerJob(
        run_after=delay,
        func_name=_CHECK_AND_FIX_INTEGRITY_PATH,
        func_args={'wf_ex_id': wf_ex.id},
        key=_get_integrity_check_key(wf_ex)
    )

    sched.schedule(job)

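# The integrity check can be turned off entirely via configuration. An
# illustrative mistral.conf snippet (the option name comes from the code
# above; the value shown is just an example, not the documented default):
#
#   [engine]
#   # A negative value disables the check (see the guard above).
#   execution_integrity_check_delay = -1
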
def test_delete_workflow_integrity_check_on_execution_delete(self):
    wf_text = """---
    version: '2.0'

    wf:
      tasks:
        async_task:
          action: std.async_noop
    """

    wf_service.create_workflows(wf_text)

    wf_ex = self.engine.start_workflow('wf')

    db_api.delete_workflow_execution(wf_ex.id)

    sched = sched_base.get_system_scheduler()

    self._await(lambda: not sched.has_scheduled_jobs())

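# The test above relies on an _await() helper from the test base class,
# which isn't shown in this section. A minimal polling sketch, written as
# a free function for brevity; the timeout and poll-interval defaults are
# illustrative assumptions:
import time


def _await(predicate, timeout=60, delay=0.5):
    # Poll the predicate until it becomes True or the timeout expires.
    end = time.time() + timeout

    while time.time() < end:
        if predicate():
            return

        time.sleep(delay)

    raise AssertionError(
        'Predicate is still False after %s seconds.' % timeout
    )
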
def after_task_complete(self, task):
    super(WaitAfterPolicy, self).after_task_complete(task)

    # No need to postpone a task if the delay is 0.
    if self.delay == 0:
        return

    ctx_key = 'wait_after_policy'

    policy_ctx = task.get_policy_context(ctx_key)

    if policy_ctx.get('skip'):
        # Skip, already processed.
        return

    policy_ctx.update({'skip': True})

    end_state = task.get_state()
    end_state_info = task.get_state_info()

    # Set task state to 'RUNNING_DELAYED'.
    task.set_state(
        states.RUNNING_DELAYED,
        "Delayed by 'wait-after' policy [delay=%s]" % self.delay
    )

    # Schedule a job to change the task state to RUNNING again.
    sched = sched_base.get_system_scheduler()

    job = sched_base.SchedulerJob(
        run_after=self.delay,
        func_name=_COMPLETE_TASK_PATH,
        func_args={
            'task_ex_id': task.get_id(),
            'state': end_state,
            'state_info': end_state_info
        }
    )

    sched.schedule(job)

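# The wait-after counterpart (illustrative): after 'task1' completes, its
# real end state is stashed, the task sits in RUNNING_DELAYED for 5
# seconds, and the scheduled _COMPLETE_TASK_PATH job then restores the
# stashed state so downstream tasks can run.
WF_WITH_WAIT_AFTER = """---
version: '2.0'

wf:
  tasks:
    task1:
      action: std.noop
      wait-after: 5
      on-success:
        - task2

    task2:
      action: std.noop
"""
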
def start(self):
    super(EngineServer, self).start()

    db_api.setup_db()

    self._scheduler = sched_base.get_system_scheduler()
    self._scheduler.start()

    self._expiration_policy_tg = expiration_policy.setup()

    action_execution_checker.start()

    if self._setup_profiler:
        profiler_utils.setup('mistral-engine', cfg.CONF.engine.host)

    # Initialize and start RPC server.
    self._rpc_server = rpc.get_rpc_server_driver()(cfg.CONF.engine)
    self._rpc_server.register_endpoint(self)

    self._rpc_server.run(executor=cfg.CONF.oslo_rpc_executor)

    self._notify_started('Engine server started.')

def after_task_complete(self, task_ex, task_spec):
    """Possible cases:

    1. state = SUCCESS
       If continue-on is not specified, there is no need to move to the
       next iteration. If current:count reaches retry:count, the policy
       breaks the loop (regardless of the continue-on condition);
       otherwise the continue-on condition is checked, and if it is True
       the next iteration is scheduled, otherwise the policy breaks the
       loop.
    2. retry:count = 5, current:count = 2, state = ERROR,
       state = IDLE/DELAYED, current:count = 3
    3. retry:count = 5, current:count = 4, state = ERROR
       Iterations are complete, therefore state = #{state},
       current:count = 4.
    """
    super(RetryPolicy, self).after_task_complete(task_ex, task_spec)

    # There is nothing to repeat.
    if self.count == 0:
        return

    # TODO(m4dcoder): If the task_ex.action_executions and
    # task_ex.workflow_executions collections are not accessed,
    # then the retry_no in the runtime_context of the task_ex will not
    # be updated accurately. To be exact, the retry_no will be one
    # iteration behind.
    ex = task_ex.executions  # noqa

    context_key = 'retry_task_policy'

    runtime_context = _ensure_context_has_key(
        task_ex.runtime_context,
        context_key
    )

    wf_ex = task_ex.workflow_execution

    ctx_view = data_flow.ContextView(
        data_flow.get_current_task_dict(task_ex),
        data_flow.evaluate_task_outbound_context(task_ex),
        wf_ex.context,
        wf_ex.input
    )

    continue_on_evaluation = expressions.evaluate(
        self._continue_on_clause,
        ctx_view
    )

    break_on_evaluation = expressions.evaluate(
        self._break_on_clause,
        ctx_view
    )

    task_ex.runtime_context = runtime_context

    state = task_ex.state

    if not states.is_completed(state) or states.is_cancelled(state):
        return

    policy_context = runtime_context[context_key]

    retry_no = 0

    if 'retry_no' in policy_context:
        retry_no = policy_context['retry_no']

        del policy_context['retry_no']

    retries_remain = retry_no < self.count

    stop_continue_flag = (
        task_ex.state == states.SUCCESS and
        not self._continue_on_clause
    )

    stop_continue_flag = (
        stop_continue_flag or
        (self._continue_on_clause and not continue_on_evaluation)
    )

    break_triggered = (
        task_ex.state == states.ERROR and
        break_on_evaluation
    )

    if not retries_remain or break_triggered or stop_continue_flag:
        return

    data_flow.invalidate_task_execution_result(task_ex)

    policy_context['retry_no'] = retry_no + 1

    runtime_context[context_key] = policy_context

    # NOTE(vgvoleg): join tasks in direct workflows can't be
    # retried as-is, because these tasks can't start without
    # a correct logical state.
    if hasattr(task_spec, "get_join") and task_spec.get_join():
        from mistral.engine import task_handler as t_h

        _log_task_delay(task_ex, self.delay, states.WAITING)

        task_ex.state = states.WAITING

        t_h._schedule_refresh_task_state(task_ex.id, self.delay)

        return

    _log_task_delay(task_ex, self.delay)

    task_ex.state = states.RUNNING_DELAYED

    sched = sched_base.get_system_scheduler()

    job = sched_base.SchedulerJob(
        run_after=self.delay,
        func_name=_CONTINUE_TASK_PATH,
        func_args={'task_ex_id': task_ex.id}
    )

    sched.schedule(job)

def after_task_complete(self, task):
    """Possible cases:

    1. state = SUCCESS
       If continue-on is not specified, there is no need to move to the
       next iteration. If current:count reaches retry:count, the policy
       breaks the loop (regardless of the continue-on condition);
       otherwise the continue-on condition is checked, and if it is True
       the next iteration is scheduled, otherwise the policy breaks the
       loop.
    2. retry:count = 5, current:count = 2, state = ERROR,
       state = IDLE/DELAYED, current:count = 3
    3. retry:count = 5, current:count = 4, state = ERROR
       Iterations are complete, therefore state = #{state},
       current:count = 4.
    """
    super(RetryPolicy, self).after_task_complete(task)

    # There is nothing to repeat.
    if self.count == 0:
        return

    # TODO(m4dcoder): If the task_ex.action_executions and
    # task_ex.workflow_executions collections are not accessed,
    # then the retry_no in the runtime_context of the task_ex will not
    # be updated accurately. To be exact, the retry_no will be one
    # iteration behind.
    ex = task.task_ex.executions  # noqa

    ctx_key = 'retry_task_policy'

    expr_ctx = task.get_expression_context(
        ctx=data_flow.evaluate_task_outbound_context(task.task_ex)
    )

    continue_on_evaluation = expressions.evaluate(
        self._continue_on_clause,
        expr_ctx
    )

    break_on_evaluation = expressions.evaluate(
        self._break_on_clause,
        expr_ctx
    )

    state = task.get_state()

    if not states.is_completed(state) or states.is_cancelled(state):
        return

    policy_ctx = task.get_policy_context(ctx_key)

    retry_no = 0

    if 'retry_no' in policy_ctx:
        retry_no = policy_ctx['retry_no']

        del policy_ctx['retry_no']

    retries_remain = retry_no < self.count

    stop_continue_flag = (
        task.get_state() == states.SUCCESS and
        not self._continue_on_clause
    )

    stop_continue_flag = (
        stop_continue_flag or
        (self._continue_on_clause and not continue_on_evaluation)
    )

    break_triggered = (
        task.get_state() == states.ERROR and
        break_on_evaluation
    )

    if not retries_remain or break_triggered or stop_continue_flag:
        return

    task.invalidate_result()

    policy_ctx['retry_no'] = retry_no + 1

    task.touch_runtime_context()

    # NOTE(vgvoleg): join tasks in direct workflows can't be
    # retried as-is, because these tasks can't start without
    # a correct logical state.
    if hasattr(task.task_spec, "get_join") and task.task_spec.get_join():
        # TODO(rakhmerov): This is an example of broken encapsulation.
        # Control over such operations should belong to the Task class.
        # Once that's done, from outside the class there will be just
        # one visible operation, "continue_task()" or something similar.
        from mistral.engine import task_handler as t_h

        task.set_state(
            states.WAITING,
            "Delayed by 'retry' policy [delay=%s]" % self.delay
        )

        t_h._schedule_refresh_task_state(task.get_id(), self.delay)

        return

    task.set_state(
        states.RUNNING_DELAYED,
        "Delayed by 'retry' policy [delay=%s]" % self.delay
    )

    sched = sched_base.get_system_scheduler()

    job = sched_base.SchedulerJob(
        run_after=self.delay,
        func_name=_CONTINUE_TASK_PATH,
        func_args={'task_ex_id': task.get_id()}
    )

    sched.schedule(job)

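# A workflow snippet (illustrative) exercising the retry cases from the
# docstring above. The action, its parameters and the YAQL expressions are
# examples, not taken from the source:
WF_WITH_RETRY = """---
version: '2.0'

wf:
  tasks:
    flaky_task:
      action: std.http url="http://example.com"
      retry:
        count: 5
        delay: 10
        # Keep retrying a successful run while this evaluates to true.
        continue-on: <% $.status_code = 503 %>
        # Stop retrying a failed run as soon as this evaluates to true.
        break-on: <% $.fatal_error = true %>
"""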