def _process_message_failed(self, itask, event_time, message): """Helper for process_message, handle a failed message.""" if event_time is None: event_time = get_current_time_string() itask.set_summary_time('finished', event_time) job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num) self.job_pool.set_job_time(job_d, 'finished', event_time) self.suite_db_mgr.put_update_task_jobs(itask, { "run_status": 1, "time_run_exit": event_time, }) if (TASK_STATUS_RETRYING not in itask.try_timers or itask.try_timers[TASK_STATUS_RETRYING].next() is None): # No retry lined up: definitive failure. self.pflag = True if itask.state.reset(TASK_STATUS_FAILED): self.setup_event_handlers(itask, "failed", message) self.job_pool.set_job_state(job_d, TASK_STATUS_FAILED) LOG.critical("[%s] -job(%02d) %s", itask, itask.submit_num, "failed") elif itask.state.reset(TASK_STATUS_RETRYING): delay_msg = "retrying in %s" % ( itask.try_timers[TASK_STATUS_RETRYING].delay_timeout_as_str()) if itask.state.is_held: delay_msg = "held (%s)" % delay_msg msg = "failed, %s" % (delay_msg) LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg) itask.set_summary_message(msg) self.setup_event_handlers(itask, "retry", "%s, %s" % (self.JOB_FAILED, delay_msg)) self._reset_job_timers(itask)
def _can_auto_restart(): """Determine whether this workflow can safely auto stop-restart.""" # Check whether there is currently an available host to restart on. try: select_workflow_host(cached=False) except HostSelectException: LOG.critical('Workflow cannot automatically restart because:\n' + 'No alternative host to restart workflow on.') return False except Exception: # Any unexpected error in host selection shouldn't be able to take # down the workflow. LOG.critical('Workflow cannot automatically restart because:\n' + 'Error in host selection:\n' + traceback.format_exc()) return False else: return True
def _process_message_failed(self, itask, event_time, message): """Helper for process_message, handle a failed message. Return True if no retries (hence go to the failed state). """ no_retries = False if event_time is None: event_time = get_current_time_string() itask.set_summary_time('finished', event_time) job_d = get_task_job_id( itask.point, itask.tdef.name, itask.submit_num) self.job_pool.set_job_time(job_d, 'finished', event_time) self.job_pool.set_job_state(job_d, TASK_STATUS_FAILED) self.suite_db_mgr.put_update_task_jobs(itask, { "run_status": 1, "time_run_exit": event_time, }) self.pflag = True if ( TimerFlags.EXECUTION_RETRY not in itask.try_timers or itask.try_timers[TimerFlags.EXECUTION_RETRY].next() is None ): # No retry lined up: definitive failure. if itask.state.reset(TASK_STATUS_FAILED): self.setup_event_handlers(itask, self.EVENT_FAILED, message) LOG.critical( "[%s] -job(%02d) %s", itask, itask.submit_num, "failed") no_retries = True else: # There is an execution retry lined up. timer = itask.try_timers[TimerFlags.EXECUTION_RETRY] self._retry_task(itask, timer.timeout) delay_msg = f"retrying in {timer.delay_timeout_as_str()}" if itask.state.is_held: delay_msg = "held (%s)" % delay_msg msg = "failed, %s" % (delay_msg) LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg) itask.set_summary_message(msg) self.setup_event_handlers( itask, self.EVENT_RETRY, f"{self.JOB_FAILED}, {delay_msg}") self._reset_job_timers(itask) return no_retries
def _set_auto_restart(
        scheduler, restart_delay=None, mode=AutoRestartMode.RESTART_NORMAL):
    """Configure the workflow to automatically stop and restart.

    Restart handled by `workflow_auto_restart`.

    Args:
        scheduler (cylc.flow.scheduler.Scheduler):
            Scheduler instance of the running workflow.
        restart_delay (cylc.flow.parsec.DurationFloat):
            Workflow will wait a random period between 0 and `restart_delay`
            seconds before attempting to stop/restart in order to avoid
            multiple workflows restarting simultaneously.
        mode (str):
            Auto stop-restart mode.

    Return:
        bool: False if it is not possible to automatically stop/restart
        the workflow due to its configuration/runtime state.

    """
    # Check that the workflow isn't already shutting down.
    if scheduler.stop_mode:
        return True

    # Force mode, stop the workflow now, don't restart it.
    if mode == AutoRestartMode.FORCE_STOP:
        LOG.critical(
            'This workflow will be shutdown as the workflow '
            'host is unable to continue running it.\n'
            'When another workflow host becomes available '
            'the workflow can be restarted by:\n'
            f' $ cylc play {scheduler.workflow}')
        if scheduler.auto_restart_time:
            LOG.info('Scheduled automatic restart canceled')
        scheduler.auto_restart_time = time()
        scheduler.auto_restart_mode = mode
        return True

    # Check workflow isn't already scheduled to auto-stop.
    if scheduler.auto_restart_time is not None:
        return True

    # Workflow host is condemned and workflow running in no detach mode.
    # Raise an error to cause the workflow to abort.
    # This should raise an "abort" event and return a non-zero code to the
    # caller still attached to the workflow process.
    if scheduler.options.no_detach:
        raise RuntimeError('Workflow host condemned in no detach mode')

    # Check workflow is able to be safely restarted.
    if not _can_auto_restart():
        return False

    LOG.info('Workflow will automatically restart on a new host.')
    if restart_delay is not None and restart_delay != 0:
        if restart_delay > 0:
            # Delay shutdown by a random interval to avoid many
            # workflows restarting simultaneously.
            shutdown_delay = int(random() * restart_delay)  # nosec
        else:
            # Un-documented feature, schedule exact restart interval for
            # testing purposes.
            shutdown_delay = abs(int(restart_delay))
        shutdown_time = time() + shutdown_delay
        LOG.info(
            'Workflow will restart in %ss (at %s)',
            shutdown_delay, time2str(shutdown_time))
        scheduler.auto_restart_time = shutdown_time
    else:
        scheduler.auto_restart_time = time()

    scheduler.auto_restart_mode = AutoRestartMode.RESTART_NORMAL
    return True
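
# Hedged usage sketch (an assumption, not Cylc source): a periodic
# health-check step could call _set_auto_restart when the workflow's run host
# has been condemned. `condemned_hosts` (a mapping of host name to
# 'force'/'normal'), RESTART_DELAY and `scheduler.host` are illustrative
# assumptions.
RESTART_DELAY = 60  # seconds; spreads restarts over a random 0-60s window


def check_condemned_host(scheduler, condemned_hosts):
    """Trigger auto stop-restart if the current run host is condemned."""
    if scheduler.host not in condemned_hosts:
        return True  # current host is fine, nothing to do
    if condemned_hosts[scheduler.host] == 'force':
        # Host must be vacated immediately: stop now, do not restart.
        return _set_auto_restart(scheduler, mode=AutoRestartMode.FORCE_STOP)
    # Otherwise schedule a randomly delayed stop/restart on a new host.
    return _set_auto_restart(scheduler, restart_delay=RESTART_DELAY)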