def _fail_workflow(wf_ex_id, err, action_ex_id=None):
    """Private helper to fail workflow on exceptions."""
    err_msg = str(err)

    with db_api.transaction():
        wf_ex = db_api.load_workflow_execution(wf_ex_id)

        if wf_ex is None:
            LOG.error(
                "Can't fail workflow execution with id='%s': not found.",
                wf_ex_id
            )
            return

        wf_handler.set_execution_state(wf_ex, states.ERROR, err_msg)

        if action_ex_id:
            # Note(dzimine): Don't call self.engine_client:
            # 1) to avoid computing and triggering next tasks
            # 2) to avoid a loop in case of error in transport
            action_ex = db_api.get_action_execution(action_ex_id)

            task_handler.on_action_complete(
                action_ex,
                wf_utils.Result(error=err_msg)
            )

        return wf_ex
def _fail_workflow(wf_ex_id, err, action_ex_id=None):
    """Private helper to fail workflow on exceptions."""
    with db_api.transaction():
        err_msg = str(err)

        wf_ex = db_api.load_workflow_execution(wf_ex_id)

        if wf_ex is None:
            LOG.error(
                "Can't fail workflow execution with id='%s': not found.",
                wf_ex_id
            )
            return

        wf_handler.set_execution_state(wf_ex, states.ERROR, err_msg)

        if action_ex_id:
            # Note(dzimine): Don't call self.engine_client:
            # 1) to avoid computing and triggering next tasks
            # 2) to avoid a loop in case of error in transport
            action_ex = db_api.get_action_execution(action_ex_id)

            task_handler.on_action_complete(
                action_ex,
                wf_utils.Result(error=err_msg)
            )
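# Hedged usage sketch (not from the source): a call site along these lines
# could wrap an engine operation so that any unexpected exception marks the
# workflow execution as failed instead of leaving it RUNNING forever. The
# name "_run_with_failure_guard" and the bare re-raise are assumptions for
# illustration only.
def _run_with_failure_guard(wf_ex_id, operation):
    try:
        operation()
    except Exception as e:
        _fail_workflow(wf_ex_id, e)
        raise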
def _check_and_complete(wf_ex_id):
    # Note: This method can only be called via scheduler.
    with db_api.transaction():
        wf_ex = db_api.load_workflow_execution(wf_ex_id)

        if not wf_ex or states.is_completed(wf_ex.state):
            return

        wf = workflows.Workflow(wf_ex=wf_ex)

        try:
            incomplete_tasks_count = wf.check_and_complete()
        except exc.MistralException as e:
            msg = (
                "Failed to check and complete [wf_ex=%s]:"
                " %s\n%s" % (wf_ex, e, tb.format_exc())
            )

            LOG.error(msg)

            force_fail_workflow(wf.wf_ex, msg)

            return

        if not states.is_completed(wf_ex.state):
            # Let's assume that a task takes 0.01 sec on average to complete
            # and, based on this assumption, calculate the time of the next
            # check. The estimation is very rough, but the delay decreases
            # as tasks complete, which gives a decent approximation.
            # For example, if a workflow has 100 incomplete tasks then the
            # next check call will happen in 1 second. For 500 tasks it will
            # be 5 seconds. The larger the workflow is, the more beneficial
            # this mechanism will be.
            delay = int(incomplete_tasks_count * 0.01)

            _schedule_check_and_complete(wf_ex, delay)
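# Quick sanity check of the delay estimation above (standalone sketch, not
# part of the engine): with the 0.01 sec/task assumption the next check is
# scheduled almost immediately for small workflows and backs off linearly
# for large ones.
for count in (100, 500, 10000):
    print(count, int(count * 0.01))  # -> 1, 5 and 100 seconds respectively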
def _get_workflow_execution(id, must_exist=True):
    with db_api.transaction():
        if must_exist:
            wf_ex = db_api.get_workflow_execution(id)
        else:
            wf_ex = db_api.load_workflow_execution(id)

        return _load_deferred_output_field(wf_ex)
def _scheduled_on_action_update(action_ex_id, wf_action):
    with db_api.transaction():
        if wf_action:
            # A workflow action: the given id refers to a workflow execution.
            action_ex = db_api.load_workflow_execution(action_ex_id)
        else:
            action_ex = db_api.load_action_execution(action_ex_id)

        if action_ex:
            _on_action_update(action_ex)
def _get_workflow_execution(id, must_exist=True):
    with db_api.transaction():
        if must_exist:
            wf_ex = db_api.get_workflow_execution(id)
        else:
            wf_ex = db_api.load_workflow_execution(id)

        return rest_utils.load_deferred_fields(
            wf_ex,
            ['params', 'input', 'output', 'context', 'spec']
        )
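# Hedged usage sketch: the "get_*" DB call is expected to raise when the id
# is unknown, while the "load_*" variant returns None, so must_exist=False
# lets callers handle a missing execution themselves. The id value below is
# a made-up placeholder.
wf_ex = _get_workflow_execution('not-a-real-id', must_exist=False)

if wf_ex is None:
    print("No such workflow execution.")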
def check_and_complete(wf_ex_id):
    wf_ex = db_api.load_workflow_execution(wf_ex_id)

    if not wf_ex or states.is_completed(wf_ex.state):
        return

    wf = workflows.Workflow(wf_ex=wf_ex)

    try:
        wf.check_and_complete()
    except exc.MistralException as e:
        msg = (
            "Failed to check and complete [wf_ex_id=%s, wf_name=%s]:"
            " %s\n%s" % (wf_ex_id, wf_ex.name, e, tb.format_exc())
        )

        LOG.error(msg)

        force_fail_workflow(wf.wf_ex, msg)
def _fail_workflow(wf_ex_id, exc):
    """Private helper to fail workflow on exceptions."""
    with db_api.transaction():
        wf_ex = db_api.load_workflow_execution(wf_ex_id)

        if wf_ex is None:
            LOG.error(
                "Can't fail workflow execution with id='%s': not found.",
                wf_ex_id
            )
            return None

        wf_ex = wf_handler.lock_workflow_execution(wf_ex_id)

        if not states.is_paused_or_completed(wf_ex.state):
            wf_handler.set_execution_state(wf_ex, states.ERROR, str(exc))

        return wf_ex
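# Hedged illustration of the guard above: the execution is only moved to
# ERROR while the workflow is still active. A standalone stand-in for
# states.is_paused_or_completed might look like this (the real predicate
# lives in mistral.workflow.states; the state names are Mistral's):
def is_paused_or_completed(state):
    return state in ('PAUSED', 'SUCCESS', 'ERROR', 'CANCELLED')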
def _check_and_complete(wf_ex_id):
    # Note: This method can only be called via scheduler.
    with db_api.transaction():
        wf_ex = db_api.load_workflow_execution(wf_ex_id)

        if not wf_ex or states.is_completed(wf_ex.state):
            return

        wf = workflows.Workflow(wf_ex=wf_ex)

        try:
            check_and_fix_integrity(wf_ex)

            num_incomplete_tasks = wf.check_and_complete()

            if not states.is_completed(wf_ex.state):
                delay = (
                    2 + int(num_incomplete_tasks * 0.1)
                    if num_incomplete_tasks else 4
                )

                # Rescheduling this check may not happen if errors are
                # raised in the business logic. If the error is DB related
                # and not considered fatal (e.g. disconnect, deadlock), the
                # retry annotation around the method will ensure that the
                # whole method is retried in a new transaction. On fatal
                # errors, the check should not be rescheduled as it could
                # result in undesired consequences. In case there are some
                # errors that should not be considered fatal, those should
                # be handled explicitly.
                _schedule_check_and_complete(wf_ex, delay)
        except exc.MistralException as e:
            msg = (
                "Failed to check and complete [wf_ex_id=%s, wf_name=%s]:"
                " %s\n%s" % (wf_ex_id, wf_ex.name, e, tb.format_exc())
            )

            LOG.error(msg)

            force_fail_workflow(wf.wf_ex, msg)
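# Quick sanity check of the rescheduling delay above (standalone sketch):
# a workflow with no known incomplete tasks is re-checked in 4 seconds,
# small workflows in roughly 2-3 seconds, and a 100-task workflow in 12,
# so the check backs off as workflows grow.
for n in (0, 10, 100):
    print(n, 2 + int(n * 0.1) if n else 4)  # -> 4, 3, 12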
def _check_and_complete(wf_ex_id):
    # Note: This method can only be called via scheduler.
    with db_api.transaction():
        wf_ex = db_api.load_workflow_execution(wf_ex_id)

        if not wf_ex or states.is_completed(wf_ex.state):
            return

        wf = workflows.Workflow(
            db_api.get_workflow_definition(wf_ex.workflow_id),
            wf_ex=wf_ex
        )

        try:
            incomplete_tasks_count = wf.check_and_complete()
        except exc.MistralException as e:
            msg = (
                "Failed to check and complete [wf_ex=%s]:"
                " %s\n%s" % (wf_ex, e, tb.format_exc())
            )

            LOG.error(msg)

            force_fail_workflow(wf.wf_ex, msg)

            return

        if not states.is_completed(wf_ex.state):
            # Let's assume that a task takes 0.01 sec on average to complete
            # and, based on this assumption, calculate the time of the next
            # check. The estimation is very rough, but the delay decreases
            # as tasks complete, which gives a decent approximation.
            # For example, if a workflow has 100 incomplete tasks then the
            # next check call will happen in 1 second. For 500 tasks it will
            # be 5 seconds. The larger the workflow is, the more beneficial
            # this mechanism will be.
            delay = int(incomplete_tasks_count * 0.01)

            _schedule_check_and_complete(wf_ex, delay)
def _check_and_fix_integrity(wf_ex_id):
    check_after_seconds = CONF.engine.execution_integrity_check_delay

    if check_after_seconds < 0:
        # Never check integrity if it's a negative value.
        return

    # To break cyclic dependency.
    from mistral.engine import task_handler

    with db_api.transaction():
        wf_ex = db_api.load_workflow_execution(wf_ex_id)

        if not wf_ex:
            return

        if states.is_completed(wf_ex.state):
            return

        _schedule_check_and_fix_integrity(wf_ex, delay=120)

        running_task_execs = db_api.get_task_executions(
            workflow_execution_id=wf_ex.id,
            state=states.RUNNING,
            limit=CONF.engine.execution_integrity_check_batch_size
        )

        for t_ex in running_task_execs:
            # The idea is that we take the latest known timestamp of the task
            # execution and consider it eligible for checking and fixing only
            # if some minimum period of time elapsed since the last update.
            timestamp = t_ex.updated_at or t_ex.created_at

            delta = timeutils.delta_seconds(timestamp, timeutils.utcnow())

            if delta < check_after_seconds:
                continue

            child_executions = t_ex.executions

            if not child_executions:
                continue

            all_finished = all(
                [states.is_completed(c_ex.state) for c_ex in child_executions]
            )

            if all_finished:
                # Find the timestamp of the most recently finished child.
                most_recent_child_timestamp = max([
                    c_ex.updated_at or c_ex.created_at
                    for c_ex in child_executions
                ])

                interval = timeutils.delta_seconds(
                    most_recent_child_timestamp,
                    timeutils.utcnow()
                )

                if interval > check_after_seconds:
                    # We found a task execution in RUNNING state for which all
                    # child executions are finished. We need to call
                    # "schedule_on_action_complete" on the task handler for
                    # any of the child executions so that the task state is
                    # calculated and updated properly.
                    LOG.warning(
                        "Found a task execution that is likely stuck in"
                        " RUNNING state because all child executions are"
                        " finished, will try to recover [task_execution=%s]",
                        t_ex.id
                    )

                    task_handler.schedule_on_action_complete(
                        child_executions[-1]
                    )
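# Hedged standalone sketch of the staleness test used above, built on the
# same oslo.utils API: an object is considered "stale" only when its last
# known timestamp is older than the configured threshold. The helper name
# is an assumption for illustration.
from oslo_utils import timeutils

def _is_stale(last_update, check_after_seconds):
    return timeutils.delta_seconds(
        last_update, timeutils.utcnow()
    ) > check_after_seconds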