def _process_message_submit_failed(self, itask, event_time):
    """Helper for process_message, handle a submit-failed message."""
    LOG.error(self.EVENT_SUBMIT_FAILED, itask=itask)
    if event_time is None:
        event_time = get_current_time_string()
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_submit_exit": get_current_time_string(),
        "submit_status": 1,
    })
    itask.summary['submit_method_id'] = None
    if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
        # No submission retry lined up: definitive failure.
        self.pflag = True
        # See github #476.
        if itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_FAILED,
                'job %s' % self.EVENT_SUBMIT_FAILED)
    else:
        # There is a submission retry lined up.
        timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
        delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
        msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
        LOG.info("job(%02d) %s" % (itask.submit_num, msg), itask=itask)
        itask.summary['latest_message'] = msg
        if itask.state.reset_state(TASK_STATUS_SUBMIT_RETRYING):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_RETRY,
                "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
    self._reset_job_timers(itask)
def _send_by_remote_port(self, messages):
    """Send message by talking to the daemon (remote?) port."""
    from cylc.network.httpclient import (
        SuiteRuntimeServiceClient, ClientError, ClientInfoError)

    # Convert time/duration into appropriate units
    retry_intvl = float(self.env_map.get(
        SuiteSrvFilesManager.KEY_TASK_MSG_RETRY_INTVL,
        self.MSG_RETRY_INTVL))
    max_tries = int(self.env_map.get(
        SuiteSrvFilesManager.KEY_TASK_MSG_MAX_TRIES,
        self.MSG_MAX_TRIES))

    client = SuiteRuntimeServiceClient(
        self.suite,
        owner=self.env_map.get(SuiteSrvFilesManager.KEY_OWNER),
        host=self.env_map.get(SuiteSrvFilesManager.KEY_HOST),
        port=self.env_map.get(SuiteSrvFilesManager.KEY_PORT),
        timeout=float(self.env_map.get(
            SuiteSrvFilesManager.KEY_TASK_MSG_TIMEOUT, self.MSG_TIMEOUT)),
        comms_protocol=self.env_map.get(
            SuiteSrvFilesManager.KEY_COMMS_PROTOCOL))
    for i in range(1, max_tries + 1):  # 1..max_tries inclusive
        try:
            for message in messages:
                client.put_message(self.task_id, self.severity, message)
        except ClientError as exc:
            sys.stderr.write(
                "%s WARNING - Send message: try %s of %s failed: %s\n" % (
                    get_current_time_string(), i, max_tries, exc))
            # Break if:
            # * Exhausted number of tries.
            # * Contact info file not found, suite probably not running.
            #   Don't bother with retry, suite restart will poll any way.
            if i >= max_tries or isinstance(exc, ClientInfoError):
                # Issue a warning and let the task carry on
                sys.stderr.write("%s WARNING - MESSAGE SEND FAILED\n" % (
                    get_current_time_string()))
            else:
                sys.stderr.write(
                    " retry in %s seconds, timeout is %s\n" % (
                        retry_intvl, client.timeout))
                sleep(retry_intvl)
                # Reset in case contact info or passphrase change
                client.host = None
                client.port = None
                client.auth = None
        else:
            if i > 1:
                # Continue to write to STDERR, so users can easily see that
                # it has recovered from previous failures.
                sys.stderr.write(
                    "%s INFO - Send message: try %s of %s succeeded\n" % (
                        get_current_time_string(), i, max_tries))
            break
def _run_command(ctx):
    """Execute a shell command and capture its output and exit status."""
    if cylc.flags.debug:
        if ctx.cmd_kwargs.get('shell'):
            OUT.debug(ctx.cmd)
        else:
            OUT.debug("%s\n" % ' '.join(
                [quote(cmd_str) for cmd_str in ctx.cmd]))

    if (SuiteProcPool.STOP_JOB_SUBMISSION.value and
            ctx.cmd_key == SuiteProcPool.JOBS_SUBMIT):
        ctx.err = "job submission skipped (suite stopping)"
        ctx.ret_code = SuiteProcPool.JOB_SKIPPED_FLAG
        ctx.timestamp = get_current_time_string()
        return ctx

    try:
        stdin_file = None
        if ctx.cmd_kwargs.get('stdin_file_paths'):
            stdin_file = TemporaryFile()
            for file_path in ctx.cmd_kwargs['stdin_file_paths']:
                for line in open(file_path):
                    stdin_file.write(line)
            stdin_file.seek(0)
        elif ctx.cmd_kwargs.get('stdin_str'):
            stdin_file = PIPE
        proc = Popen(
            ctx.cmd, stdin=stdin_file, stdout=PIPE, stderr=PIPE,
            env=ctx.cmd_kwargs.get('env'),
            shell=ctx.cmd_kwargs.get('shell'))
    except IOError as exc:
        if cylc.flags.debug:
            traceback.print_exc()
        ctx.ret_code = 1
        ctx.err = str(exc)
    except OSError as exc:
        if exc.filename is None:
            exc.filename = ctx.cmd[0]
        if cylc.flags.debug:
            traceback.print_exc()
        ctx.ret_code = 1
        ctx.err = str(exc)
    else:
        ctx.out, ctx.err = proc.communicate(ctx.cmd_kwargs.get('stdin_str'))
        ctx.ret_code = proc.wait()

    ctx.timestamp = get_current_time_string()
    return ctx
def _process_message_failed(self, itask, event_time, message):
    """Helper for process_message, handle a failed message."""
    if event_time is None:
        event_time = get_current_time_string()
    itask.set_summary_time('finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 1,
        "time_run_exit": event_time,
    })
    if (TASK_STATUS_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_RETRYING].next() is None):
        # No retry lined up: definitive failure.
        self.pflag = True
        if itask.state.reset_state(TASK_STATUS_FAILED):
            self.setup_event_handlers(itask, "failed", message)
        LOG.critical(
            "[%s] -job(%02d) %s", itask, itask.submit_num, "failed")
    else:
        # There is a retry lined up
        delay_msg = "retrying in %s" % (
            itask.try_timers[TASK_STATUS_RETRYING].delay_timeout_as_str())
        msg = "failed, %s" % (delay_msg)
        LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
        itask.set_summary_message(msg)
        if itask.state.reset_state(TASK_STATUS_RETRYING):
            self.setup_event_handlers(
                itask, "retry", "%s, %s" % (self.JOB_FAILED, delay_msg))
    self._reset_job_timers(itask)
def _filter_submit_output(self, st_file_path, batch_sys, out, err):
    """Filter submit command output, if relevant."""
    job_id = None
    if hasattr(batch_sys, "REC_ID_FROM_SUBMIT_ERR"):
        text = err
        rec_id = batch_sys.REC_ID_FROM_SUBMIT_ERR
    elif hasattr(batch_sys, "REC_ID_FROM_SUBMIT_OUT"):
        text = out
        rec_id = batch_sys.REC_ID_FROM_SUBMIT_OUT
    if rec_id:
        for line in str(text).splitlines():
            match = rec_id.match(line)
            if match:
                job_id = match.group("id")
                job_status_file = open(st_file_path, "a")
                job_status_file.write("%s=%s\n" % (
                    self.CYLC_BATCH_SYS_JOB_ID, job_id))
                job_status_file.write("%s=%s\n" % (
                    self.CYLC_BATCH_SYS_JOB_SUBMIT_TIME,
                    get_current_time_string()))
                job_status_file.close()
                break
    if hasattr(batch_sys, "filter_submit_output"):
        out, err = batch_sys.filter_submit_output(out, err)
    return out, err, job_id
def _run_command_exit(cls, ctx, callback=None, callback_args=None):
    """Process command completion."""
    ctx.timestamp = get_current_time_string()
    if callable(callback):
        if not callback_args:
            callback_args = []
        callback(ctx, *callback_args)
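# --- Illustrative sketch only, not from the Cylc source tree ---
# A minimal example of the callback contract used by _run_command_exit
# above: the callback receives the finished command context (with
# ctx.timestamp freshly set) followed by any extra callback_args.
# The function name and the "label" argument below are hypothetical.
def _report_command_done(ctx, label):
    """Example callback: report a finished command context."""
    print("[%s] %s finished at %s (ret_code=%s)" % (
        label, ctx.cmd_key, ctx.timestamp, ctx.ret_code))

# Hypothetical usage:
#   _run_command_exit(ctx, callback=_report_command_done,
#                     callback_args=["job-submit"])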
def _process_message_failed(self, itask, event_time, message):
    """Helper for process_message, handle a failed message."""
    if event_time is None:
        event_time = get_current_time_string()
    itask.set_summary_time('finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 1,
        "time_run_exit": event_time,
    })
    if (TASK_STATUS_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_RETRYING].next() is None):
        # No retry lined up: definitive failure.
        self.pflag = True
        if itask.state.reset_state(TASK_STATUS_FAILED):
            self.setup_event_handlers(itask, "failed", message)
        LOG.critical("job(%02d) %s" % (itask.submit_num, "failed"),
                     itask=itask)
    else:
        # There is a retry lined up
        delay_msg = "retrying in %s" % (
            itask.try_timers[TASK_STATUS_RETRYING].delay_timeout_as_str())
        msg = "failed, %s" % (delay_msg)
        LOG.info("job(%02d) %s" % (itask.submit_num, msg), itask=itask)
        itask.summary['latest_message'] = msg
        if itask.state.reset_state(TASK_STATUS_RETRYING):
            self.setup_event_handlers(
                itask, "retry", "%s, %s" % (self.JOB_FAILED, delay_msg))
    self._reset_job_timers(itask)
def take_checkpoints(self, event, other_daos=None):
    """Add insert items to *_checkpoints tables.

    Select items in suite_params, broadcast_states and task_pool and
    prepare them for insert into the relevant *_checkpoints tables, and
    prepare an insert of the event and the current time into the
    checkpoint_id table.

    If other_daos is specified, it should be a list of CylcSuiteDAO
    objects. The logic will prepare insertion of the same items into the
    *_checkpoints tables of these DAOs as well.
    """
    id_ = 1
    for max_id, in self.connect().execute(
            "SELECT MAX(id) FROM %(table)s" % {
                "table": self.TABLE_CHECKPOINT_ID}):
        if max_id >= id_:
            id_ = max_id + 1
    daos = [self]
    if other_daos:
        daos.extend(other_daos)
    for dao in daos:
        dao.tables[self.TABLE_CHECKPOINT_ID].add_insert_item(
            [id_, get_current_time_string(), event])
    for table_name in [
            self.TABLE_SUITE_PARAMS,
            self.TABLE_BROADCAST_STATES,
            self.TABLE_TASK_POOL]:
        for row in self.connect().execute("SELECT * FROM %s" % table_name):
            for dao in daos:
                dao.tables[table_name + "_checkpoints"].add_insert_item(
                    [id_] + list(row))
def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
    """Helper for self._prep_submit_task_job. On error."""
    LOG.debug("submit_num %s" % itask.submit_num)
    LOG.debug(traceback.format_exc())
    LOG.error(exc)
    log_task_job_activity(
        SubProcContext(self.JOBS_SUBMIT, action, err=exc, ret_code=1),
        suite, itask.point, itask.tdef.name, submit_num=itask.submit_num)
    if not dry_run:
        # Persist
        self.suite_db_mgr.put_insert_task_jobs(itask, {
            'is_manual_submit': itask.is_manual_submit,
            'try_num': itask.get_try_num(),
            'time_submit': get_current_time_string(),
            'batch_sys_name': itask.summary.get('batch_sys_name'),
        })
        itask.is_manual_submit = False
        self.task_events_mgr.process_message(
            itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)
def _poll_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _poll_task_jobs_callback, on one task job."""
    ctx = SuiteProcContext(self.JOBS_POLL, None)
    ctx.out = line
    ctx.ret_code = 0

    items = line.split("|")
    # See cylc.batch_sys_manager.JobPollContext
    try:
        (
            batch_sys_exit_polled, run_status, run_signal,
            time_submit_exit, time_run, time_run_exit
        ) = items[4:10]
    except IndexError:
        itask.summary['latest_message'] = 'poll failed'
        cylc.flags.iflag = True
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
        return
    finally:
        log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

    flag = self.task_events_mgr.POLLED_FLAG
    if run_status == "1" and run_signal in ["ERR", "EXIT"]:
        # Failed normally
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, time_run_exit, flag)
    elif run_status == "1" and batch_sys_exit_polled == "1":
        # Failed by a signal, and no longer in batch system
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, time_run_exit, flag)
        self.task_events_mgr.process_message(
            itask, INFO, FAIL_MESSAGE_PREFIX + run_signal, time_run_exit,
            flag)
    elif run_status == "1":
        # The job has terminated, but is still managed by batch system.
        # Some batch system may restart a job in this state, so don't
        # mark as failed yet.
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_STARTED, time_run, flag)
    elif run_status == "0":
        # The job succeeded
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_SUCCEEDED, time_run_exit, flag)
    elif time_run and batch_sys_exit_polled == "1":
        # The job has terminated without executing the error trap
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, get_current_time_string(),
            flag)
    elif time_run:
        # The job has started, and is still managed by batch system
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_STARTED, time_run, flag)
    elif batch_sys_exit_polled == "1":
        # The job never ran, and no longer in batch system
        self.task_events_mgr.process_message(
            itask, INFO, self.task_events_mgr.EVENT_SUBMIT_FAILED,
            time_submit_exit, flag)
    else:
        # The job never ran, and is in batch system
        self.task_events_mgr.process_message(
            itask, INFO, TASK_STATUS_SUBMITTED, time_submit_exit, flag)
def jobs_kill(self, job_log_root, job_log_dirs):
    """Kill multiple jobs.

    job_log_root -- The log/job/ sub-directory of the suite.
    job_log_dirs -- A list containing point/name/submit_num for task jobs.

    """
    # Note: The more efficient way to do this is to group the jobs by their
    # batch systems, and call the kill command for each batch system once.
    # However, this will make it more difficult to determine if the kill
    # command for a particular job is successful or not.
    if "$" in job_log_root:
        job_log_root = os.path.expandvars(job_log_root)
    self.configure_suite_run_dir(job_log_root.rsplit(os.sep, 2)[0])
    now = get_current_time_string()
    for job_log_dir in job_log_dirs:
        ret_code, err = self.job_kill(
            os.path.join(job_log_root, job_log_dir, JOB_LOG_STATUS))
        sys.stdout.write("%s%s|%s|%d\n" % (
            self.OUT_PREFIX_SUMMARY, now, job_log_dir, ret_code))
        # Note: Print STDERR to STDOUT may look a bit strange, but it
        # requires less logic for the suite to parse the output.
        if err.strip():
            for line in err.splitlines(True):
                if not line.endswith("\n"):
                    line += "\n"
                sys.stdout.write("%s%s|%s|%s" % (
                    self.OUT_PREFIX_CMD_ERR, now, job_log_dir, line))
def put_broadcast(self, modified_settings, is_cancel=False):
    """Put or clear broadcasts in runtime database."""
    now = get_current_time_string(display_sub_seconds=True)
    for broadcast_change in (
            get_broadcast_change_iter(modified_settings, is_cancel)):
        broadcast_change["time"] = now
        self.db_inserts_map[self.TABLE_BROADCAST_EVENTS].append(
            broadcast_change)
        if is_cancel:
            self.db_deletes_map[self.TABLE_BROADCAST_STATES].append({
                "point": broadcast_change["point"],
                "namespace": broadcast_change["namespace"],
                "key": broadcast_change["key"]})
            # Delete statements are currently executed before insert
            # statements, so we should clear out any insert statements that
            # are deleted here.
            # (Not the most efficient logic here, but unless we have a
            # large number of inserts, then this should not be a big
            # concern.)
            inserts = []
            for insert in self.db_inserts_map[self.TABLE_BROADCAST_STATES]:
                if any(insert[key] != broadcast_change[key]
                       for key in ["point", "namespace", "key"]):
                    inserts.append(insert)
            self.db_inserts_map[self.TABLE_BROADCAST_STATES] = inserts
        else:
            self.db_inserts_map[self.TABLE_BROADCAST_STATES].append({
                "point": broadcast_change["point"],
                "namespace": broadcast_change["namespace"],
                "key": broadcast_change["key"],
                "value": broadcast_change["value"]})
def _update_db_queue(self):
    """Update the queue to the runtime DB."""
    this_dump = self._get_dump()
    if this_dump != self.prev_dump:
        now = get_current_time_string(display_sub_seconds=True)
        self.settings_queue.append(RecordBroadcastObject(now, this_dump))
        self.prev_dump = this_dump
def print_msg(self, msg):
    now = get_current_time_string(override_use_utc=self.utc)
    prefix = 'cylc (' + self.mode + ' - ' + now + '): '
    if self.priority == 'NORMAL':
        print prefix + msg
    else:
        print >> sys.stderr, prefix + self.priority + ' ' + msg
def jobs_submit(self, job_log_root, job_log_dirs, remote_mode=False):
    """Submit multiple jobs.

    job_log_root -- The log/job/ sub-directory of the suite.
    job_log_dirs -- A list containing point/name/submit_num for task jobs.

    """
    if "$" in job_log_root:
        job_log_root = os.path.expandvars(job_log_root)
    self.configure_suite_run_dir(job_log_root.rsplit(os.sep, 2)[0])

    if remote_mode:
        items = self._jobs_submit_prep_by_stdin(job_log_root, job_log_dirs)
    else:
        items = self._jobs_submit_prep_by_args(job_log_root, job_log_dirs)
    now = get_current_time_string()
    for job_log_dir, batch_sys_name, submit_opts in items:
        job_file_path = os.path.join(
            job_log_root, job_log_dir, JOB_LOG_JOB)
        if not batch_sys_name:
            sys.stdout.write("%s%s|%s|1|\n" % (
                self.OUT_PREFIX_SUMMARY, now, job_log_dir))
            continue
        ret_code, out, err, job_id = self._job_submit_impl(
            job_file_path, batch_sys_name, submit_opts)
        sys.stdout.write("%s%s|%s|%d|%s\n" % (
            self.OUT_PREFIX_SUMMARY, now, job_log_dir, ret_code, job_id))
        for key, value in [("STDERR", err), ("STDOUT", out)]:
            if value is None or not value.strip():
                continue
            for line in value.splitlines(True):
                if not line.endswith("\n"):
                    line += "\n"
                sys.stdout.write("%s%s|%s|[%s] %s" % (
                    self.OUT_PREFIX_COMMAND, now, job_log_dir, key, line))
def _set_state(self, status):
    """Set, log and record task status (normal change, not forced - don't
    update task_events table)."""
    if self.status == self.hold_swap:
        self.hold_swap = None
    if status == self.status and self.hold_swap is None:
        return
    o_status, o_hold_swap = self.status, self.hold_swap
    if status == TASK_STATUS_HELD:
        self.hold_swap = self.status
    elif (self.hold_swap == TASK_STATUS_HELD and
            status not in TASK_STATUSES_FINAL):
        self.hold_swap = status
        status = TASK_STATUS_HELD
    elif self.hold_swap:
        self.hold_swap = None
    self.status = status
    self.time_updated = get_current_time_string()
    flags.iflag = True
    # Log
    message = str(o_status)
    if o_hold_swap:
        message += " (%s)" % o_hold_swap
    message += " => %s" % self.status
    if self.hold_swap:
        message += " (%s)" % self.hold_swap
    LOG.debug(message, itask=self.identity)
def _set_state(self, status):
    """Set, log and record task status (normal change, not forced - don't
    update task_events table)."""
    if self.status == self.hold_swap:
        self.hold_swap = None
    if status == self.status and self.hold_swap is None:
        return
    prev_status, prev_hold_swap = self.status, self.hold_swap
    if status == TASK_STATUS_HELD:
        self.hold_swap = self.status
    elif status in TASK_STATUSES_ACTIVE:
        if self.status == TASK_STATUS_HELD:
            self.hold_swap = TASK_STATUS_HELD
    elif (self.hold_swap == TASK_STATUS_HELD and
            status not in TASK_STATUSES_FINAL):
        self.hold_swap = status
        status = TASK_STATUS_HELD
    elif self.hold_swap:
        self.hold_swap = None
    self.status = status
    self.time_updated = get_current_time_string()
    self.is_updated = True
    # Log
    message = str(prev_status)
    if prev_hold_swap:
        message += " (%s)" % prev_hold_swap
    message += " => %s" % self.status
    if self.hold_swap:
        message += " (%s)" % self.hold_swap
    LOG.debug("[%s] -%s", self.identity, message)
    return (prev_status, prev_hold_swap)
def jobs_kill(self, job_log_root, job_log_dirs):
    """Kill multiple jobs.

    job_log_root -- The log/job/ sub-directory of the suite.
    job_log_dirs -- A list containing point/name/submit_num for task jobs.

    """
    # Note: The more efficient way to do this is to group the jobs by their
    # batch systems, and call the kill command for each batch system once.
    # However, this will make it more difficult to determine if the kill
    # command for a particular job is successful or not.
    if "$" in job_log_root:
        job_log_root = os.path.expandvars(job_log_root)
    self.configure_suite_run_dir(job_log_root.rsplit(os.sep, 2)[0])
    now = get_current_time_string()
    for job_log_dir in job_log_dirs:
        ret_code, err = self.job_kill(
            os.path.join(job_log_root, job_log_dir, "job.status"))
        sys.stdout.write("%s%s|%s|%d\n" % (
            self.OUT_PREFIX_SUMMARY, now, job_log_dir, ret_code))
        # Note: Print STDERR to STDOUT may look a bit strange, but it
        # requires less logic for the suite to parse the output.
        if err.strip():
            for line in err.splitlines(True):
                if not line.endswith("\n"):
                    line += "\n"
                sys.stdout.write("%s%s|%s|%s" % (
                    self.OUT_PREFIX_CMD_ERR, now, job_log_dir, line))
def _append_db_queue(self, modified_settings, is_cancel=False):
    """Update the queue to the runtime DB."""
    now = get_current_time_string(display_sub_seconds=True)
    for broadcast_change in (
            get_broadcast_change_iter(modified_settings, is_cancel)):
        broadcast_change["time"] = now
        self.db_inserts_map[self.TABLE_BROADCAST_EVENTS].append(
            broadcast_change)
        if is_cancel:
            self.db_deletes_map[self.TABLE_BROADCAST_STATES].append({
                "point": broadcast_change["point"],
                "namespace": broadcast_change["namespace"],
                "key": broadcast_change["key"]})
            # Delete statements are currently executed before insert
            # statements, so we should clear out any insert statements that
            # are deleted here.
            # (Not the most efficient logic here, but unless we have a
            # large number of inserts, then this should not be a big
            # concern.)
            inserts = []
            for insert in self.db_inserts_map[self.TABLE_BROADCAST_STATES]:
                if any([insert[key] != broadcast_change[key]
                        for key in ["point", "namespace", "key"]]):
                    inserts.append(insert)
            self.db_inserts_map[self.TABLE_BROADCAST_STATES] = inserts
        else:
            self.db_inserts_map[self.TABLE_BROADCAST_STATES].append({
                "point": broadcast_change["point"],
                "namespace": broadcast_change["namespace"],
                "key": broadcast_change["key"],
                "value": broadcast_change["value"]})
def jobs_submit(self, job_log_root, job_log_dirs, remote_mode=False):
    """Submit multiple jobs.

    job_log_root -- The log/job/ sub-directory of the suite.
    job_log_dirs -- A list containing point/name/submit_num for task jobs.

    """
    if "$" in job_log_root:
        job_log_root = os.path.expandvars(job_log_root)
    self.configure_suite_run_dir(job_log_root.rsplit(os.sep, 2)[0])

    if remote_mode:
        items = self._jobs_submit_prep_by_stdin(job_log_root, job_log_dirs)
    else:
        items = self._jobs_submit_prep_by_args(job_log_root, job_log_dirs)
    now = get_current_time_string()
    for job_log_dir, batch_sys_name, submit_opts in items:
        job_file_path = os.path.join(
            job_log_root, job_log_dir, self.JOB_FILE_BASE)
        if not batch_sys_name:
            sys.stdout.write("%s%s|%s|1|\n" % (
                self.OUT_PREFIX_SUMMARY, now, job_log_dir))
            continue
        ret_code, out, err, job_id = self._job_submit_impl(
            job_file_path, batch_sys_name, submit_opts)
        sys.stdout.write("%s%s|%s|%d|%s\n" % (
            self.OUT_PREFIX_SUMMARY, now, job_log_dir, ret_code, job_id))
        for key, value in [("STDERR", err), ("STDOUT", out)]:
            if value is None or not value.strip():
                continue
            for line in value.splitlines(True):
                if not line.endswith("\n"):
                    line += "\n"
                sys.stdout.write("%s%s|%s|[%s] %s" % (
                    self.OUT_PREFIX_COMMAND, now, job_log_dir, key, line))
def _db_events_insert(self, itask, event="", message=""): """Record an event to the DB.""" self.suite_db_mgr.put_insert_task_events( itask, { "time": get_current_time_string(), "event": event, "message": message })
def __init__(self, suite, test_params=None):
    if SuiteLog.__INSTANCE:
        raise Exception("Attempting to initiate a second singleton "
                        "instance.")
    self._group = None
    if not test_params:
        self.is_test = False
        self.max_bytes = GLOBAL_CFG.get(
            ['suite logging', 'maximum size in bytes'])
        self.roll_at_startup = GLOBAL_CFG.get(
            ['suite logging', 'roll over at start-up'])
        self.archive_length = GLOBAL_CFG.get(
            ['suite logging', 'rolling archive length'])
    else:
        self.is_test = True
        self.max_bytes = test_params['max_bytes']
        self.roll_at_startup = test_params['roll_at_startup']
        self.archive_length = 4

    # Log paths.
    if test_params:
        self.ldir = test_params['ldir']
    else:
        self.ldir = GLOBAL_CFG.get_derived_host_item(
            suite, 'suite log directory')
    self.log_paths = {}
    self.log_paths[self.LOG] = os.path.join(self.ldir, self.LOG)
    self.log_paths[self.OUT] = os.path.join(self.ldir, self.OUT)
    self.log_paths[self.ERR] = os.path.join(self.ldir, self.ERR)

    # The loggers.
    self.loggers = {}
    self.loggers[self.LOG] = None
    self.loggers[self.OUT] = None
    self.loggers[self.ERR] = None

    # Filename stamp functions.
    if self.is_test:
        self.stamp = lambda: get_current_time_string(
            True, True, True).replace('.', '-')
    else:
        self.stamp = lambda: get_current_time_string(False, True, True)

    SuiteLog.__INSTANCE = self
def __init__(self, cmd_key, cmd, **cmd_kwargs):
    self.timestamp = get_current_time_string()
    self.cmd_key = cmd_key
    self.cmd = cmd
    self.cmd_kwargs = cmd_kwargs

    self.err = cmd_kwargs.get('err')
    self.ret_code = cmd_kwargs.get('ret_code')
    self.out = cmd_kwargs.get('out')
def put_task_pool(self, pool):
    """Put statements to update the task_pool table in runtime database.

    Update the task_pool table and the task_action_timers table.
    Queue delete (everything) statements to wipe the tables, and queue the
    relevant insert statements for the current tasks in the pool.
    """
    self.db_deletes_map[self.TABLE_TASK_POOL].append({})
    # No need to do:
    # self.db_deletes_map[self.TABLE_TASK_ACTION_TIMERS].append({})
    # Should already be done by self.put_task_event_timers above.
    self.db_deletes_map[self.TABLE_TASK_TIMEOUT_TIMERS].append({})
    for itask in pool.get_all_tasks():
        self.db_inserts_map[self.TABLE_TASK_POOL].append({
            "name": itask.tdef.name,
            "cycle": str(itask.point),
            "spawned": int(itask.has_spawned),
            "status": itask.state.status,
            "hold_swap": itask.state.hold_swap})
        if itask.state.status in itask.timeout_timers:
            self.db_inserts_map[self.TABLE_TASK_TIMEOUT_TIMERS].append({
                "name": itask.tdef.name,
                "cycle": str(itask.point),
                "timeout": itask.timeout_timers[itask.state.status]})
        for ctx_key_0 in ["poll_timers", "try_timers"]:
            for ctx_key_1, timer in getattr(itask, ctx_key_0).items():
                if timer is None:
                    continue
                self.db_inserts_map[self.TABLE_TASK_ACTION_TIMERS].append({
                    "name": itask.tdef.name,
                    "cycle": str(itask.point),
                    "ctx_key": json.dumps((ctx_key_0, ctx_key_1)),
                    "ctx": self._namedtuple2json(timer.ctx),
                    "delays": json.dumps(timer.delays),
                    "num": timer.num,
                    "delay": timer.delay,
                    "timeout": timer.timeout})
        if itask.state.time_updated:
            set_args = {
                "time_updated": itask.state.time_updated,
                "submit_num": itask.submit_num,
                "try_num": itask.get_try_num(),
                "status": itask.state.status}
            where_args = {
                "cycle": str(itask.point),
                "name": itask.tdef.name,
            }
            self.db_updates_map.setdefault(self.TABLE_TASK_STATES, [])
            self.db_updates_map[self.TABLE_TASK_STATES].append(
                (set_args, where_args))
            itask.state.time_updated = None

    self.db_inserts_map[self.TABLE_CHECKPOINT_ID].append({
        # id = -1 for latest
        "id": CylcSuiteDAO.CHECKPOINT_LATEST_ID,
        "time": get_current_time_string(),
        "event": CylcSuiteDAO.CHECKPOINT_LATEST_EVENT})
def __init__(self, msg=None, priority='NORMAL'):
    self.msg = msg
    if priority in ['NORMAL', 'WARNING', 'CRITICAL']:
        self.priority = priority
    else:
        raise Exception('Illegal message priority ' + priority)

    # load the environment
    self.env_map = dict(os.environ)

    # set some instance variables
    for attr, key, default in (
            ('suite', 'CYLC_SUITE_NAME', '(CYLC_SUITE_NAME)'),
            ('task_id', 'CYLC_TASK_ID', '(CYLC_TASK_ID)'),
            ('retry_seconds', 'CYLC_TASK_MSG_RETRY_INTVL',
             '(CYLC_TASK_MSG_RETRY_INTVL)'),
            ('max_tries', 'CYLC_TASK_MSG_MAX_TRIES',
             '(CYLC_TASK_MSG_MAX_TRIES)'),
            ('try_timeout', 'CYLC_TASK_MSG_TIMEOUT',
             '(CYLC_TASK_MSG_TIMEOUT)'),
            ('owner', 'CYLC_SUITE_OWNER', None),
            ('host', 'CYLC_SUITE_HOST', '(CYLC_SUITE_HOST)'),
            ('port', 'CYLC_SUITE_PORT', '(CYLC_SUITE_PORT)')):
        value = self.env_map.get(key, default)
        setattr(self, attr, value)

    # conversions from string:
    if self.try_timeout == 'None':
        self.try_timeout = None
    try:
        self.retry_seconds = float(self.retry_seconds)
        self.max_tries = int(self.max_tries)
    except:
        pass

    cylc.flags.verbose = (
        cylc.flags.verbose or self.env_map.get('CYLC_VERBOSE') == 'True')

    # 'scheduler' or 'submit', (or 'raw' if job script run manually)
    self.mode = self.env_map.get('CYLC_MODE', 'raw')

    rd = self.env_map.get('CYLC_SUITE_RUN_DIR', '.')
    self.env_file_path = os.path.join(rd, 'cylc-suite-env')

    self.utc = self.env_map.get('CYLC_UTC') == 'True'
    # Record the time the messaging system was called and append it
    # to the message, in case the message is delayed in some way.
    self.true_event_time = get_current_time_string(
        override_use_utc=self.utc)

    self.ssh_messaging = (
        self.env_map.get('CYLC_TASK_COMMS_METHOD') == 'ssh')
    self.polling = (
        self.env_map.get('CYLC_TASK_COMMS_METHOD') == 'poll')
    self.ssh_login_shell = (
        self.env_map.get('CYLC_TASK_SSH_LOGIN_SHELL') != 'False')
def append_to_log(self, submit_num, log_type, out=None, err=None):
    """Write new command output to the appropriate log file."""
    sub_num = "%02d" % int(submit_num)
    dir_ = os.path.join(self.base_path, sub_num)
    mkdir_p(dir_)
    job_log_handle = open(os.path.join(dir_, "job-activity.log"), "a")
    timestamp = get_current_time_string()
    self._write_to_log(job_log_handle, timestamp, log_type + "-OUT", out)
    self._write_to_log(job_log_handle, timestamp, log_type + "-ERR", err)
    job_log_handle.close()
def jobs_poll(self, job_log_root, job_log_dirs):
    """Poll multiple jobs.

    job_log_root -- The log/job/ sub-directory of the suite.
    job_log_dirs -- A list containing point/name/submit_num for task jobs.

    """
    if "$" in job_log_root:
        job_log_root = os.path.expandvars(job_log_root)
    self.configure_suite_run_dir(job_log_root.rsplit(os.sep, 2)[0])

    ctx_list = []  # Contexts for all relevant jobs
    ctx_list_by_batch_sys = {}  # {batch_sys_name1: [ctx1, ...], ...}

    for job_log_dir in job_log_dirs:
        ctx = self._jobs_poll_status_files(job_log_root, job_log_dir)
        if ctx is None:
            continue
        ctx_list.append(ctx)

        if not ctx.batch_sys_name or not ctx.batch_sys_job_id:
            # Lost batch system information for some reason.
            # Mark the job as if it is no longer in the batch system.
            ctx.batch_sys_exit_polled = 1
            sys.stderr.write(
                "%s/job.status: incomplete batch system info\n" % (
                    ctx.job_log_dir))

        # We can trust:
        # * Jobs previously polled to have exited the batch system.
        # * Jobs succeeded or failed with ERR/EXIT.
        if (ctx.batch_sys_exit_polled or ctx.run_status == 0 or
                ctx.run_signal in ["ERR", "EXIT"]):
            continue

        if ctx.batch_sys_name not in ctx_list_by_batch_sys:
            ctx_list_by_batch_sys[ctx.batch_sys_name] = []
        ctx_list_by_batch_sys[ctx.batch_sys_name].append(ctx)

    for batch_sys_name, my_ctx_list in ctx_list_by_batch_sys.items():
        self._jobs_poll_batch_sys(
            job_log_root, batch_sys_name, my_ctx_list)

    cur_time_str = get_current_time_string()
    for ctx in ctx_list:
        for message in ctx.messages:
            sys.stdout.write("%s%s|%s|%s\n" % (
                self.OUT_PREFIX_MESSAGE, cur_time_str, ctx.job_log_dir,
                message))
        sys.stdout.write("%s%s|%s\n" % (
            self.OUT_PREFIX_SUMMARY, cur_time_str, ctx.get_summary_str()))
def jobs_poll(self, job_log_root, job_log_dirs):
    """Poll multiple jobs.

    job_log_root -- The log/job/ sub-directory of the suite.
    job_log_dirs -- A list containing point/name/submit_num for task jobs.

    """
    if "$" in job_log_root:
        job_log_root = os.path.expandvars(job_log_root)
    self.configure_suite_run_dir(job_log_root.rsplit(os.sep, 2)[0])

    ctx_list = []  # Contexts for all relevant jobs
    ctx_list_by_batch_sys = {}  # {batch_sys_name1: [ctx1, ...], ...}

    for job_log_dir in job_log_dirs:
        ctx = self._jobs_poll_status_files(job_log_root, job_log_dir)
        if ctx is None:
            continue
        ctx_list.append(ctx)

        if not ctx.batch_sys_name or not ctx.batch_sys_job_id:
            # Lost batch system information for some reason.
            # Mark the job as if it is no longer in the batch system.
            ctx.batch_sys_exit_polled = 1
            sys.stderr.write(
                "%s/%s: incomplete batch system info\n" % (
                    ctx.job_log_dir, JOB_LOG_STATUS))

        # We can trust:
        # * Jobs previously polled to have exited the batch system.
        # * Jobs succeeded or failed with ERR/EXIT.
        if (ctx.batch_sys_exit_polled or ctx.run_status == 0 or
                ctx.run_signal in ["ERR", "EXIT"]):
            continue

        if ctx.batch_sys_name not in ctx_list_by_batch_sys:
            ctx_list_by_batch_sys[ctx.batch_sys_name] = []
        ctx_list_by_batch_sys[ctx.batch_sys_name].append(ctx)

    for batch_sys_name, my_ctx_list in ctx_list_by_batch_sys.items():
        self._jobs_poll_batch_sys(
            job_log_root, batch_sys_name, my_ctx_list)

    cur_time_str = get_current_time_string()
    for ctx in ctx_list:
        for message in ctx.messages:
            sys.stdout.write("%s%s|%s|%s\n" % (
                self.OUT_PREFIX_MESSAGE, cur_time_str, ctx.job_log_dir,
                message))
        sys.stdout.write("%s%s|%s\n" % (
            self.OUT_PREFIX_SUMMARY, cur_time_str, ctx.get_summary_str()))
def __init__(self, name, cycle, time_created_string=None,
             time_updated_string=None, submit_num=None,
             is_manual_submit=None, try_num=None, host=None,
             submit_method=None, submit_method_id=None, status=None):
    """Insert a new row into the states table"""
    if time_created_string is None:
        time_created_string = get_current_time_string()
    self.s_fmt = (
        "INSERT INTO task_states VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
    self.args = [name, cycle, time_created_string, time_updated_string,
                 submit_num, is_manual_submit, try_num, host,
                 submit_method, submit_method_id, status]
def __init__(self, priority=NORMAL):
    if priority in self.PRIORITIES:
        self.priority = priority
    else:
        raise Exception('Illegal message priority ' + priority)

    # load the environment
    self.env_map = dict(os.environ)

    # set some instance variables
    self.suite = None
    self.task_id = None
    self.retry_seconds = None
    self.max_tries = None
    self.try_timeout = None
    self.owner = None
    self.host = None
    self.port = None
    for attr, key, default in self.ATTRS:
        value = self.env_map.get(key, default)
        setattr(self, attr, value)

    # conversions from string:
    if self.try_timeout == 'None':
        self.try_timeout = None
    try:
        self.retry_seconds = float(self.retry_seconds)
        self.max_tries = int(self.max_tries)
    except ValueError:
        pass

    cylc.flags.verbose = (
        cylc.flags.verbose or self.env_map.get('CYLC_VERBOSE') == 'True')

    # 'scheduler' or 'submit', (or 'raw' if job script run manually)
    self.mode = self.env_map.get('CYLC_MODE', 'raw')

    suite_run_dir = self.env_map.get('CYLC_SUITE_RUN_DIR', '.')
    self.env_file_path = os.path.join(suite_run_dir, 'cylc-suite-env')

    self.utc = self.env_map.get('CYLC_UTC') == 'True'
    # Record the time the messaging system was called and append it
    # to the message, in case the message is delayed in some way.
    self.true_event_time = get_current_time_string(
        override_use_utc=self.utc)

    self.ssh_messaging = (
        self.env_map.get('CYLC_TASK_COMMS_METHOD') == 'ssh')
    self.polling = (
        self.env_map.get('CYLC_TASK_COMMS_METHOD') == 'poll')
    self.ssh_login_shell = (
        self.env_map.get('CYLC_TASK_SSH_LOGIN_SHELL') != 'False')
def log(self, level, *args, **kwargs):
    if self.logger.handlers:
        # If this logger has file handlers write out to it.
        self.logger.log(level, *args, **kwargs)
    else:
        # No file handlers, write out to stdout/stderr.
        msg = (get_current_time_string() + ' ' +
               logging._levelNames[level] + ' - ' + str(args[0]),
               ) + tuple(*args[1:])
        if self.log_ in [SuiteLog.OUT, SuiteLog.LOG]:
            print(*msg)
        else:
            print(*msg, file=sys.stderr)
def __init__(self, name, cycle, time_created_string=None,
             time_updated_string=None, submit_num=None,
             is_manual_submit=None, try_num=None, host=None,
             submit_method=None, submit_method_id=None, status=None):
    """Insert a new row into the states table"""
    if time_created_string is None:
        time_created_string = get_current_time_string()
    self.s_fmt = (
        "INSERT INTO task_states VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
    self.args = [name, cycle, time_created_string, time_updated_string,
                 submit_num, is_manual_submit, try_num, host,
                 submit_method, submit_method_id, status]
    self.to_run = True
def update(self):
    """Try and connect and do an update."""
    if self._no_update_event.is_set():
        return False
    if not self.connect_schd.ready():
        self.info_bar.set_update_time(
            None,
            get_seconds_as_interval_string(
                round(self.connect_schd.dt_next)))
        return False
    if cylc.flags.debug:
        print >> sys.stderr, "UPDATE %s" % get_current_time_string()
    if not self.connected:
        # Only reconnect via self.reconnect().
        self.reconnect()
    if not self.connected:
        self.set_stopped()
        if cylc.flags.debug:
            print >> sys.stderr, "(not connected)"
        return False
    if cylc.flags.debug:
        print >> sys.stderr, "(connected)"
    try:
        err_log_changed = self.retrieve_err_log()
        summaries_changed = self.retrieve_summary_update_time()
        if self.summary_update_time is not None and summaries_changed:
            self.retrieve_state_summaries()
    except Exception as exc:
        if self.status == SUITE_STATUS_STOPPING:
            # Expected stop: prevent the reconnection warning dialog.
            self.connect_fail_warned = True
        if cylc.flags.debug:
            print >> sys.stderr, " CONNECTION LOST", str(exc)
        self.set_stopped()
        if self.info_bar.prog_bar_active():
            gobject.idle_add(self.info_bar.prog_bar_stop)
        self.reconnect()
        return False
    else:
        # Got suite data.
        self.version_mismatch_warned = False
        status_str = None
        if self.status in [SUITE_STATUS_INITIALISING,
                           SUITE_STATUS_STOPPING]:
            status_str = self.status
        elif self.is_reloading:
            status_str = "reloading"
        if status_str is None:
            gobject.idle_add(self.info_bar.prog_bar_stop)
        elif self.info_bar.prog_bar_can_start():
            gobject.idle_add(self.info_bar.prog_bar_start, status_str)
        return summaries_changed or err_log_changed
def _jobs_poll_batch_sys(self, job_log_root, batch_sys_name, my_ctx_list):
    """Helper 2 for self.jobs_poll(job_log_root, job_log_dirs)."""
    batch_sys = self.get_inst(batch_sys_name)
    all_job_ids = [ctx.batch_sys_job_id for ctx in my_ctx_list]
    if hasattr(batch_sys, "get_poll_many_cmd"):
        # Some poll commands may not be as simple
        cmd = batch_sys.get_poll_many_cmd(all_job_ids)
    else:  # if hasattr(batch_sys, "POLL_CMD"):
        # Simple poll command that takes a list of job IDs
        cmd = [batch_sys.POLL_CMD] + all_job_ids
    try:
        proc = Popen(cmd, stderr=PIPE, stdout=PIPE)
    except OSError as exc:
        # subprocess.Popen has a bad habit of not setting the
        # filename of the executable when it raises an OSError.
        if not exc.filename:
            exc.filename = cmd[0]
        sys.stderr.write(str(exc) + "\n")
        return
    proc.wait()
    out, err = proc.communicate()
    sys.stderr.write(err)
    if hasattr(batch_sys, "filter_poll_many_output"):
        # Allow custom filter
        job_ids = batch_sys.filter_poll_many_output(out)
    else:
        # Just about all poll commands return a table, with column 1
        # being the job ID. The logic here should be sufficient to
        # ensure that any table header is ignored.
        job_ids = []
        for line in out.splitlines():
            try:
                head = line.split(None, 1)[0]
            except IndexError:
                continue
            if head in all_job_ids:
                job_ids.append(head)
    for ctx in my_ctx_list:
        ctx.batch_sys_exit_polled = int(
            ctx.batch_sys_job_id not in job_ids)
        # Add information to "job.status"
        if ctx.batch_sys_exit_polled:
            try:
                handle = open(os.path.join(
                    job_log_root, ctx.job_log_dir, "job.status"), "a")
                handle.write("%s=%s\n" % (
                    self.CYLC_BATCH_SYS_EXIT_POLLED,
                    get_current_time_string()))
                handle.close()
            except IOError as exc:
                sys.stderr.write(str(exc) + "\n")
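# --- Illustrative sketch only, not from the Cylc source tree ---
# Standalone rendering of the default poll-output filtering used in
# _jobs_poll_batch_sys above: column 1 of each line is treated as a job
# ID, and anything not in the known ID list (e.g. a table header) is
# dropped. The function name here is hypothetical.
def filter_poll_table(out, all_job_ids):
    """Return the known job IDs still present in a poll command's table."""
    job_ids = []
    for line in out.splitlines():
        try:
            head = line.split(None, 1)[0]
        except IndexError:
            continue
        if head in all_job_ids:
            job_ids.append(head)
    return job_ids

# Hypothetical usage:
#   filter_poll_table("JOBID STATE\n1234 R\n5678 Q\n", ["1234"]) == ["1234"]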
def __init__(self, priority=NORMAL):
    if priority in self.PRIORITIES:
        self.priority = priority
    else:
        raise ValueError('Illegal message priority ' + priority)

    # load the environment
    self.env_map = dict(os.environ)

    # set some instance variables
    self.suite = self.env_map.get(SuiteSrvFilesManager.KEY_NAME)
    self.task_id = self.env_map.get('CYLC_TASK_ID')

    # Record the time the messaging system was called and append it
    # to the message, in case the message is delayed in some way.
    self.true_event_time = get_current_time_string(
        override_use_utc=self.env_map.get('CYLC_UTC') == 'True')
def __init__(self, severity=NORMAL):
    if severity in self.SEVERITIES:
        self.severity = severity
    else:
        raise ValueError('Illegal message severity ' + severity)

    # load the environment
    self.env_map = dict(os.environ)

    # set some instance variables
    self.suite = self.env_map.get(SuiteSrvFilesManager.KEY_NAME)
    self.task_id = self.env_map.get('CYLC_TASK_ID')

    # Record the time the messaging system was called and append it
    # to the message, in case the message is delayed in some way.
    self.true_event_time = get_current_time_string(
        override_use_utc=self.env_map.get('CYLC_UTC') == 'True')
def __init__(self, table, name, cycle, **kwargs):
    """Update a row in a table."""
    kwargs["time_updated"] = get_current_time_string()
    s_fmt = "UPDATE %(table)s SET %(cols)s WHERE name==? AND cycle==?"
    cols = ""
    args = []
    not_first = False
    for k, v in kwargs.items():
        if not_first:
            cols += ", "
        not_first = True
        cols += k + "=?"
        args.append(v)
    args.append(name)
    args.append(cycle)
    self.s_fmt = s_fmt % {"table": table, "cols": cols}
    self.args = args
def __init__(self, table, name, cycle, **kwargs):
    """Update a row in a table."""
    kwargs["time_updated"] = get_current_time_string()
    s_fmt = "UPDATE %(table)s SET %(cols)s WHERE name==? AND cycle==?"
    cols = ""
    args = []
    not_first = False
    for k, v in kwargs.items():
        if not_first:
            cols += ", "
        not_first = True
        cols += k + "=?"
        args.append(v)
    args.append(name)
    args.append(cycle)
    self.s_fmt = s_fmt % {"table": table, "cols": cols}
    self.args = args
    self.to_run = True
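# --- Illustrative sketch only, not from the Cylc source tree ---
# Standalone demonstration of the statement the updater above composes:
# one "col=?" fragment per keyword argument plus a WHERE clause on name
# and cycle. build_update() and the sample values are hypothetical; the
# column order simply follows dict iteration order, and the timestamp is
# passed in here to keep the example deterministic.
def build_update(table, name, cycle, time_updated, **kwargs):
    """Compose an UPDATE statement and its parameter list."""
    kwargs["time_updated"] = time_updated
    cols = ", ".join("%s=?" % key for key in kwargs)
    args = list(kwargs.values()) + [name, cycle]
    return ("UPDATE %(table)s SET %(cols)s WHERE name==? AND cycle==?" % {
        "table": table, "cols": cols}, args)

# Hypothetical usage:
#   build_update("task_states", "foo", "20200101T00Z",
#                "2020-01-01T00:05:00Z", status="running")
#   -> ("UPDATE task_states SET status=?, time_updated=? "
#       "WHERE name==? AND cycle==?",
#       ["running", "2020-01-01T00:05:00Z", "foo", "20200101T00Z"])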
def set_stopped(self):
    """Reset data and clients when suite is stopped."""
    if cylc.flags.debug:
        sys.stderr.write("%s NOT CONNECTED\n" % get_current_time_string())
    self.full_mode = True
    self.connected = False
    self.set_status(SUITE_STATUS_STOPPED)
    self.update_interval += 1.0
    if self.update_interval > self.max_update_interval:
        self.update_interval = self.max_update_interval
    self.state_summary = {}
    self.full_state_summary = {}
    self.fam_state_summary = {}
    self.full_fam_state_summary = {}
    self.all_families = {}
    self.global_summary = {}
    self.cfg.port = None
    self.client = None
    gobject.idle_add(self.app_window.set_title, str(self.cfg.suite))

    # Use info bar to display stop summary if available.
    # Otherwise, just display the reconnect count down.
    if self.cfg.suite and self.stop_summary is None:
        stop_summary = get_stop_state_summary(
            cat_state(self.cfg.suite, self.cfg.host, self.cfg.owner))
        if stop_summary != self.stop_summary:
            self.stop_summary = stop_summary
            self.status = SUITE_STATUS_STOPPED
            gobject.idle_add(
                self.info_bar.set_stop_summary, stop_summary)
            self.last_update_time = time()
    try:
        update_time_str = time2str(self.stop_summary[0]["last_updated"])
    except (AttributeError, IndexError, KeyError, TypeError):
        update_time_str = None
    gobject.idle_add(
        self.info_bar.set_update_time,
        update_time_str, self.info_bar.DISCONNECTED_TEXT)
    gobject.idle_add(self.info_bar.prog_bar_stop)
def record_messages(suite, task_job, messages):
    """Record task job messages.

    Print the messages according to their severity.
    Write the messages in the job status file.
    Send the messages to the suite, if possible.

    Arguments:
        suite (str): Suite name.
        task_job (str): Task job identifier "CYCLE/TASK_NAME/SUBMIT_NUM".
        messages (list): List of messages "[[severity, message], ...]".
    """
    # Record the event time, in case the message is delayed in some way.
    event_time = get_current_time_string(
        override_use_utc=(os.getenv('CYLC_UTC') == 'True'))
    # Print to stdout/stderr
    for severity, message in messages:
        if severity in STDERR_LEVELS:
            handle = sys.stderr
        else:
            handle = sys.stdout
        handle.write('%s %s - %s\n' % (event_time, severity, message))
        handle.flush()
    # Write to job.status
    _append_job_status_file(suite, task_job, event_time, messages)
    # Send messages
    try:
        pclient = SuiteRuntimeClient(suite)
    except Exception:
        # Backward communication not possible
        if cylc.flags.debug:
            import traceback
            traceback.print_exc()
        return
    pclient('put_messages', {
        'task_job': task_job,
        'event_time': event_time,
        'messages': messages,
    })
def do_rollover(self):
    """Create and rollover log file if necessary."""
    # Generate new file name
    self.stamp = get_current_time_string(use_basic_format=True)
    filename = self.baseFilename + '.' + self.stamp
    mkdir_p(os.path.dirname(filename))
    # Touch file
    with open(filename, 'w+'):
        os.utime(filename, None)
    # Update symlink
    if (os.path.exists(self.baseFilename) or
            os.path.lexists(self.baseFilename)):
        os.unlink(self.baseFilename)
    os.symlink(os.path.basename(filename), self.baseFilename)
    # Housekeep log files
    arch_len = glbl_cfg().get([self.GLBL_KEY, 'rolling archive length'])
    if arch_len:
        log_files = glob(self.baseFilename + '.*')
        log_files.sort()
        while len(log_files) > arch_len:
            os.unlink(log_files.pop(0))
    # Reopen stream, redirect STDOUT and STDERR to log
    if self.stream:
        self.stream.close()
        self.stream = None
    self.stream = self._open()
    # Dup STDOUT and STDERR in detach mode
    if not self.no_detach:
        os.dup2(self.stream.fileno(), sys.stdout.fileno())
        os.dup2(self.stream.fileno(), sys.stderr.fileno())
    # Emit header records (should only do this for subsequent log files)
    for header_record in self.header_records:
        if self.FILE_NUM in header_record.__dict__:
            # Increment log file number
            header_record.__dict__[self.FILE_NUM] += 1
            header_record.args = header_record.args[0:-1] + (
                header_record.__dict__[self.FILE_NUM],)
        logging.FileHandler.emit(self, header_record)
def do_rollover(self):
    """Create and rollover log file if necessary."""
    # Generate new file name
    self.stamp = get_current_time_string(use_basic_format=True)
    filename = self.baseFilename + '.' + self.stamp
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    # Touch file
    with open(filename, 'w+'):
        os.utime(filename, None)
    # Update symlink
    if (os.path.exists(self.baseFilename) or
            os.path.lexists(self.baseFilename)):
        os.unlink(self.baseFilename)
    os.symlink(os.path.basename(filename), self.baseFilename)
    # Housekeep log files
    arch_len = glbl_cfg().get([self.GLBL_KEY, 'rolling archive length'])
    if arch_len:
        log_files = glob(self.baseFilename + '.*')
        log_files.sort()
        while len(log_files) > arch_len:
            os.unlink(log_files.pop(0))
    # Reopen stream, redirect STDOUT and STDERR to log
    if self.stream:
        self.stream.close()
        self.stream = None
    self.stream = self._open()
    # Dup STDOUT and STDERR in detach mode
    if not self.no_detach:
        os.dup2(self.stream.fileno(), sys.stdout.fileno())
        os.dup2(self.stream.fileno(), sys.stderr.fileno())
    # Emit header records (should only do this for subsequent log files)
    for header_record in self.header_records:
        if self.FILE_NUM in header_record.__dict__:
            # Increment log file number
            header_record.__dict__[self.FILE_NUM] += 1
            header_record.args = header_record.args[0:-1] + (
                header_record.__dict__[self.FILE_NUM],)
        logging.FileHandler.emit(self, header_record)
def log(self, level, *args, **kwargs):
    try:
        itask = kwargs.pop("itask")
    except KeyError:
        pass
    else:
        try:
            args = ("[%s] -%s" % (itask.identity, args[0]),) + args[1:]
        except AttributeError:
            args = ("[%s] -%s" % (itask, args[0]),) + args[1:]
        args = tuple(args)
    if self.logger.handlers:
        # If this logger has file handlers write out to it.
        self.logger.log(level, *args, **kwargs)
    else:
        # No file handlers, write out to stdout/stderr.
        msg = (get_current_time_string() + ' ' +
               logging._levelNames[level] + ' - ' + str(args[0]),
               ) + tuple(*args[1:])
        if self.log_ in [SuiteLog.OUT, SuiteLog.LOG]:
            print(*msg)
        else:
            print(*msg, file=sys.stderr)
def log(self, level, *args, **kwargs):
    try:
        itask = kwargs.pop("itask")
    except KeyError:
        pass
    else:
        try:
            args = ("[%s] -%s" % (itask.identity, args[0]),) + args[1:]
        except AttributeError:
            args = ("[%s] -%s" % (itask, args[0]),) + args[1:]
        args = tuple(args)
    self.update_time = time()
    if self.logger.handlers:
        # If this logger has file handlers write out to it.
        self.logger.log(level, *args, **kwargs)
    else:
        # No file handlers, write out to stdout/stderr.
        msg = (get_current_time_string() + ' ' +
               logging._levelNames[level] + ' - ' + str(args[0]),
               ) + tuple(*args[1:])
        if self.log_ in [SuiteLog.OUT, SuiteLog.LOG]:
            print(*msg)
        else:
            print(*msg, file=sys.stderr)
def put_messages(self, payload):
    """Send task messages to suite server program.

    Arguments:
        payload (dict):
            task_job (str): Task job as "CYCLE/TASK_NAME/SUBMIT_NUM".
            event_time (str): Event time as string.
            messages (list): List in the form [[severity, message], ...].
    """
    retry_intvl = float(self.comms1.get(
        self.srv_files_mgr.KEY_TASK_MSG_RETRY_INTVL,
        self.MSG_RETRY_INTVL))
    max_tries = int(self.comms1.get(
        self.srv_files_mgr.KEY_TASK_MSG_MAX_TRIES,
        self.MSG_MAX_TRIES))
    for i in range(1, max_tries + 1):  # 1..max_tries inclusive
        orig_timeout = self.timeout
        if self.timeout is None:
            self.timeout = self.MSG_TIMEOUT
        try:
            results = []
            func_name = self._compat('put_messages')
            if func_name == 'put_messages':
                results = self._call_server(func_name, payload=payload)
            elif func_name == 'put_message':  # API 1, 7.5.0 compat
                cycle, name = payload['task_job'].split('/')[0:2]
                for severity, message in payload['messages']:
                    results.append(self._call_server(
                        func_name, task_id='%s.%s' % (name, cycle),
                        severity=severity, message=message))
            else:  # API 0, pre-7.5.0 compat, priority instead of severity
                cycle, name = payload['task_job'].split('/')[0:2]
                for severity, message in payload['messages']:
                    results.append(self._call_server(
                        func_name, task_id='%s.%s' % (name, cycle),
                        priority=severity, message=message))
        except ClientInfoError:
            # Contact info file not found, suite probably not running.
            # Don't bother with retry, suite restart will poll any way.
            raise
        except ClientError as exc:
            now = get_current_time_string()
            sys.stderr.write(
                "%s WARNING - Message send failed, try %s of %s: %s\n" % (
                    now, i, max_tries, exc))
            if i < max_tries:
                sys.stderr.write(
                    " retry in %s seconds, timeout is %s\n" % (
                        retry_intvl, self.timeout))
                sleep(retry_intvl)
                # Reset in case contact info or passphrase change
                self.comms1 = {}
                self.host = None
                self.port = None
                self.auth = None
        else:
            if i > 1:
                # Continue to write to STDERR, so users can easily see that
                # it has recovered from previous failures.
                sys.stderr.write(
                    "%s INFO - Send message: try %s of %s succeeded\n" % (
                        get_current_time_string(), i, max_tries))
            return results
        finally:
            self.timeout = orig_timeout
def process_message(self, itask, severity, message, event_time=None,
                    flag='', submit_num=None):
    """Parse an incoming task message and update task state.

    Incoming messages, e.g. "succeeded at <TIME>", may come from the task
    job or from polling.

    It is possible for my current state to be inconsistent with an
    incoming message (whether normal or polled) e.g. due to a late poll
    result, or a network outage, or manual state reset. To handle this,
    if a message would take the task state backward, issue a poll to
    confirm instead of changing state - then always believe the next
    message. Note that the next message might not be the result of this
    confirmation poll, in the unlikely event that a job emits a
    succession of messages very quickly, but this is the best we can do
    without somehow uniquely associating each poll with its result
    message.

    Arguments:
        itask (cylc.task_proxy.TaskProxy):
            The task proxy object relevant for the message.
        severity (str or int):
            Message severity, should be a recognised logging level.
        message (str):
            Message content.
        event_time (str):
            Event time stamp. Expect ISO8601 date time string.
            If not specified, use current time.
        flag (str):
            If specified, can be INCOMING_FLAG to indicate an incoming
            message, POLLED_FLAG to indicate a message resulted from a
            poll. Otherwise, the message is assumed to be generated by
            the logic in the suite server program.
        submit_num (int):
            The submit number of the task relevant for the message.
            If not specified, use latest submit number.

    Return:
        None: in normal circumstances.
        True: if polling is required to confirm a reversal of status.

    """
    # Log incoming messages
    if event_time is None:
        event_time = get_current_time_string()
    if submit_num is None:
        submit_num = itask.submit_num
    if flag == self.INCOMING_FLAG and submit_num != itask.submit_num:
        flag = self.IGNORED_INCOMING_FLAG
    log_message = '(current:%s)%s %s at %s' % (
        itask.state.status, flag, message, event_time)
    LOG.log(self.LEVELS.get(severity, INFO), log_message, itask=itask)
    if flag == self.IGNORED_INCOMING_FLAG:
        LOG.warning(
            'submit-num=%d: ignore message from job submit-num=%d' % (
                itask.submit_num, submit_num),
            itask=itask)
        return

    # always update the suite state summary for latest message
    itask.summary['latest_message'] = message
    if flag == self.POLLED_FLAG:
        itask.summary['latest_message'] += ' %s' % self.POLLED_FLAG
    cylc.flags.iflag = True

    # Satisfy my output, if possible, and record the result.
    an_output_was_satisfied = itask.state.outputs.set_msg_trg_completion(
        message=message, is_completed=True)

    if message == TASK_OUTPUT_STARTED:
        if (flag == self.INCOMING_FLAG and
                itask.state.is_gt(TASK_STATUS_RUNNING)):
            return True
        self._process_message_started(itask, event_time)
    elif message == TASK_OUTPUT_SUCCEEDED:
        self._process_message_succeeded(itask, event_time)
    elif message == TASK_OUTPUT_FAILED:
        if (flag == self.INCOMING_FLAG and
                itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message == self.EVENT_SUBMIT_FAILED:
        if (flag == self.INCOMING_FLAG and
                itask.state.is_gt(TASK_STATUS_SUBMIT_FAILED)):
            return True
        self._process_message_submit_failed(itask, event_time)
    elif message == TASK_OUTPUT_SUBMITTED:
        if (flag == self.INCOMING_FLAG and
                itask.state.is_gt(TASK_STATUS_SUBMITTED)):
            return True
        self._process_message_submitted(itask, event_time)
    elif message.startswith(FAIL_MESSAGE_PREFIX):
        # Task received signal.
        if (flag == self.INCOMING_FLAG and
                itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        signal = message[len(FAIL_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "signaled", signal)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": signal})
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message.startswith(ABORT_MESSAGE_PREFIX):
        # Task aborted with message
        if (flag == self.INCOMING_FLAG and
                itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "aborted", message)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": aborted_with})
        self._process_message_failed(itask, event_time, aborted_with)
    elif message.startswith(VACATION_MESSAGE_PREFIX):
        # Task job pre-empted into a vacation state
        self._db_events_insert(itask, "vacated", message)
        itask.set_summary_time('started')  # unset
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
        itask.job_vacated = True
        # Believe this and change state without polling (could poll?).
        self.pflag = True
        itask.state.reset_state(TASK_STATUS_SUBMITTED)
        self._reset_job_timers(itask)
    elif an_output_was_satisfied:
        # Message of an as-yet unreported custom task output.
        # No state change.
        self.pflag = True
        self.suite_db_mgr.put_update_task_outputs(itask)
    else:
        # Unhandled messages. These include:
        #  * general non-output/progress messages
        #  * poll messages that repeat previous results
        # Note that all messages are logged already at the top.
        # No state change.
        LOG.debug(
            '(current:%s) unhandled: %s' % (itask.state.status, message),
            itask=itask)
        if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
            severity = getLevelName(severity)
        self._db_events_insert(
            itask, ("message %s" % str(severity).lower()), message)
        if severity in ['WARNING', 'CRITICAL', 'CUSTOM']:
            self.setup_event_handlers(itask, severity.lower(), message)
        return (out, err)


if __name__ == "__main__":
    commands = []
    for i in range(1, 5):
        if i == 4:
            command = ("echoX hello from " + str(i) +
                       "; sleep 10; echo bye from " + str(i))
        else:
            command = ("echo hello from " + str(i) +
                       "; sleep 10; echo bye from " + str(i))
        commands.append(command)

    print 'SRT:', get_current_time_string(display_sub_seconds=True)
    mp = multisubprocess(commands)
    out, err = mp.execute()

    count = 1
    for item in out:
        print count
        print item
        count += 1

    count = 1
    for item in err:
        print count
        print item
        count += 1