def run(self):
  """Launch the runner script in a subprocess and wait for it to exit.

  Writes RUN_JOB_SCRIPT (interpolated with this harness's settings) to a fresh
  temporary file, executes it under the current interpreter, then reloads
  self.state from state_filename and replays the runner checkpoint into
  self.reconstructed_state.

  Returns the subprocess's return code.
  """
  self._run_count += 1
  atexit.register(self.cleanup)

  # Each invocation writes a brand new script file; remove the previous one.
  if self.script_filename:
    os.unlink(self.script_filename)

  with temporary_file(cleanup=False) as fp:
    self.script_filename = fp.name
    fp.write(self.RUN_JOB_SCRIPT % {
      'filename': self.job_filename,
      'sandbox': self.sandbox,
      'root': self.tempdir,
      'task_id': self.task_id,
      'state_filename': self.state_filename,
      'success_rate': self.success_rate,
      # NOTE(review): the seed is offset by the attempt count — presumably so
      # each rerun exercises a different random sequence; confirm intent.
      'random_seed': self.random_seed + self._run_count,
      'extra_task_runner_args': self.extra_task_runner_args,
    })

  # Expose our sys.path to the child so it can import the same modules.
  with environment_as(PYTHONPATH=os.pathsep.join(sys.path)):
    self.po = subprocess.Popen([sys.executable, self.script_filename],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    try:
      so, se = self.po.communicate()
    except OSError as e:
      # ECHILD means the child was already reaped (e.g. killed externally);
      # substitute placeholder output rather than failing the harness.
      if e.errno == errno.ECHILD:
        so = se = 'Killed'
      else:
        raise

  rc = self.po.returncode
  if rc != 0:
    # On failure, capture the job config for the debug dump below.
    if os.path.exists(self.job_filename):
      with open(self.job_filename) as fp:
        config = fp.read()
    else:
      config = 'Nonexistent!'
    if 'THERMOS_DEBUG' in os.environ:
      print("Runner failed!\n\n\nconfig:%s\n\n\nstdout:%s\n\n\nstderr:%s\n\n\n" % (
          config, so, se))

  # Load the serialized RunnerState the child wrote; fall back to an empty
  # RunnerState if it is missing or unreadable (best-effort by design).
  try:
    with open(self.state_filename, 'r') as fp:
      self.state = thrift_deserialize(RunnerState(), fp.read())
  except Exception as e:
    if 'THERMOS_DEBUG' in os.environ:
      print('Failed to load Runner state: %s' % e, file=sys.stderr)
    self.state = RunnerState()

  # Independently rebuild state by replaying the checkpoint stream so tests
  # can cross-check it against the deserialized state above.
  try:
    self.reconstructed_state = CheckpointDispatcher.from_file(
        self.pathspec.getpath('runner_checkpoint'))
  except Exception as e:
    print('Failed to replay checkpoint: %s' % e, file=sys.stderr)
    self.reconstructed_state = None

  self.initialized = True
  return rc
def read(args, options):
  """Replay a thermos checkpoint.

  Usage: thermos read [options] checkpoint_filename

  Options:
    --simple	Do not replay the full task state machine.  Only print out the contents of
		each checkpoint log message.
  """
  # Exactly one checkpoint file must be supplied, and it must exist.
  if len(args) != 1:
    app.error('Expected one checkpoint file, got %s' % len(args))
  if not os.path.exists(args[0]):
    app.error('Could not find %s' % args[0])

  dispatcher = CheckpointDispatcher()
  state = RunnerState(processes={})

  # Walk the thrift record stream: either dump raw records (--simple) or
  # feed them through the dispatcher to rebuild the task state machine.
  with open(args[0], 'r') as fp:
    try:
      for record in ThriftRecordReader(fp, RunnerCkpt):
        if options.simple:
          print('CKPT: %s' % record)
        else:
          dispatcher.dispatch(state, record)
    except RecordIO.Error as err:
      print("Failed to recover from %s: %s" % (fp.name, err))
      return

  if options.simple:
    return

  if state is None or state.header is None:
    print('Checkpoint stream CORRUPT or outdated format')
    return

  header = state.header
  print('Recovered Task Header:')
  print(' id: %s' % header.task_id)
  print(' user: %s' % header.user)
  print(' host: %s' % header.hostname)
  print(' sandbox: %s' % header.sandbox)
  if header.ports:
    bindings = ['%s->%s' % (name, port) for (name, port) in header.ports.items()]
    print(' ports: %s' % ' '.join(bindings))

  print('Recovered Task States:')
  for status in state.statuses:
    stamp = time.asctime(time.localtime(status.timestamp_ms / 1000.0))
    print(' %s [pid: %d] => %s' % (
        stamp, status.runner_pid, TaskState._VALUES_TO_NAMES[status.state]))

  print('Recovered Processes:')
  for name, history in state.processes.items():
    print(' %s runs: %s' % (name, len(history)))
    # Most recent run first.
    for idx in range(len(history) - 1, -1, -1):
      run = history[idx]
      return_code = run.return_code if run.return_code is not None else ''
      finish = time.asctime(time.localtime(run.stop_time)) if run.stop_time else 'None'
      print(' %2d: pid=%d, rc=%s, finish:%s, state:%s' % (
          idx, run.pid, return_code, finish,
          ProcessState._VALUES_TO_NAMES.get(run.state, 'Unknown')))
def from_file(cls, filename, truncate=False):
  """Rebuild a RunnerState by replaying the checkpoint stream stored in `filename`.

  Returns the hydrated RunnerState, or None if replay fails for any reason.
  """
  dispatcher = cls()
  hydrated = RunnerState(processes={})
  try:
    for record in cls.iter_updates(filename):
      dispatcher.dispatch(hydrated, record, truncate=truncate)
  except cls.Error as e:
    log.error('Failed to recover from %s: %s' % (filename, e))
    return None
  return hydrated
def __init__(self, pathspec, task_id):
  """Bind to `task_id`'s checkpoint paths and run an initial _apply_states() pass."""
  self._task_id = task_id
  self._ckpt_head = 0
  self._dispatcher = CheckpointDispatcher()
  self._runnerstate = RunnerState(processes={})
  self._runner_ckpt = pathspec.given(task_id=task_id).getpath('runner_checkpoint')
  # Paths for the 'active' and 'finished' markers of this task.
  markers = [pathspec.given(task_id=task_id, state=phase).getpath('task_path')
             for phase in ('active', 'finished')]
  self._active_file, self._finished_file = markers
  self._apply_states()
  self._lock = threading.Lock()
def make_runner_state(cpid=COORDINATOR_PID, pid=PID, user=USER1, pname=PROCESS_NAME):
  """Build a RunnerState containing one process (`pname`) with a single recorded run."""
  run = ProcessStatus(
      fork_time=CREATE_TIME,
      start_time=CREATE_TIME,
      pid=pid,
      coordinator_pid=cpid,
      process=pname)
  return RunnerState(header=RunnerHeader(user=user), processes={pname: [run]})
def __init__(self, root, task_id):
  """Construct a TaskMonitor.

  :param root: The checkpoint root of the task.
  :param task_id: The task id of the task.
  """
  pathspec = TaskPath(root=root, task_id=task_id)
  self._runner_ckpt = pathspec.getpath('runner_checkpoint')
  # Paths for the 'active' and 'finished' markers of this task.
  markers = [pathspec.given(state=phase).getpath('task_path')
             for phase in ('active', 'finished')]
  self._active_file, self._finished_file = markers
  self._dispatcher = CheckpointDispatcher()
  self._runnerstate = RunnerState(processes={})
  self._ckpt_head = 0
  self._apply_states()
  self._lock = threading.Lock()
def main(args):
  """Replay (and with --assemble, reconstruct) a thermos runner checkpoint.

  Takes no positional arguments; the checkpoint file is supplied via
  --checkpoint. Exits with status 1 on usage errors.
  """
  values = app.get_options()

  if len(args) > 0:
    print("ERROR: unrecognized arguments: %s\n" % (" ".join(args)), file=sys.stderr)
    app.help()
    sys.exit(1)

  if not values.ckpt:
    print("ERROR: must supply --checkpoint", file=sys.stderr)
    app.help()
    sys.exit(1)

  wrs = RunnerState(processes={})
  dispatcher = CheckpointDispatcher()

  # FIX: the original used the Python-2-only `file()` builtin and never closed
  # the handle; `with open(...)` is portable and releases the descriptor.
  with open(values.ckpt, "r") as fp:
    rr = ThriftRecordReader(fp, RunnerCkpt)
    try:
      for wts in rr:
        print('Recovering: %s' % wts)
        if values.assemble:  # idiom fix: dropped unnecessary `is True` comparison
          dispatcher.dispatch(wrs, wts)
    except RecordIO.Error as err:
      print('Error recovering checkpoint stream: %s' % err, file=sys.stderr)
      return

  print('\n\n\n')
  if values.assemble:
    print('Recovered Task Header')
    pprint.pprint(wrs.header, indent=4)

    print('\nRecovered Task States')
    for task_status in wrs.statuses:
      print(' %s [pid: %d] => %s' % (
          time.asctime(time.localtime(task_status.timestamp_ms / 1000.0)),
          task_status.runner_pid,
          TaskState._VALUES_TO_NAMES[task_status.state]))

    print('\nRecovered Processes')
    pprint.pprint(wrs.processes, indent=4)
def __init__(self, task, checkpoint_root, sandbox, log_dir=None, task_id=None,
             portmap=None, user=None, chroot=False, clock=time,
             universal_handler=None, planner_class=TaskPlanner, hostname=None,
             process_logger_destination=None, process_logger_mode=None,
             rotate_log_size_mb=None, rotate_log_backups=None, preserve_env=False):
  """
    required:
      task (config.Task) = the task to run
      checkpoint_root (path) = the checkpoint root
      sandbox (path) = the sandbox in which the path will be run
                       [if None, cwd will be assumed, but garbage collection will be
                        disabled for this task.]

    optional:
      log_dir (string) = directory to house stdout/stderr logs. If not specified, logs will be
                         written into the sandbox directory under .logs/
      task_id (string) = bind to this task id.  if not specified, will synthesize an id based
                         upon task.name()
      portmap (dict) = a map (string => integer) from name to port, e.g. { 'http': 80 }
      user (string) = the user to run the task as.  if not current user, requires setuid
                      privileges.
      chroot (boolean) = whether or not to chroot into the sandbox prior to exec.
      clock (time interface) = the clock to use throughout
      universal_handler = checkpoint record handler (only used for testing)
      planner_class (TaskPlanner class) = TaskPlanner class to use for constructing the task
                      planning policy.
      process_logger_destination (string) = The destination of logger to use for all processes.
      process_logger_mode (string) = The mode of logger to use for all processes.
      rotate_log_size_mb (integer) = The maximum size of the rotated stdout/stderr logs in MiB.
      rotate_log_backups (integer) = The maximum number of rotated stdout/stderr log backups.
      preserve_env (boolean) = whether or not env variables for the runner should be in the
                               env for the task being run
  """
  if not issubclass(planner_class, TaskPlanner):
    raise TypeError('planner_class must be a TaskPlanner.')
  self._clock = clock
  launch_time = self._clock.time()
  # Microsecond component of the launch time, used to disambiguate task ids
  # synthesized within the same second.
  launch_time_ms = '%06d' % int((launch_time - int(launch_time)) * (10 ** 6))
  if not task_id:
    self._task_id = '%s-%s.%s' % (
        task.name(),
        time.strftime('%Y%m%d-%H%M%S', time.localtime(launch_time)),
        launch_time_ms)
  else:
    self._task_id = task_id
  current_user = TaskRunnerHelper.get_actual_user()
  self._user = user or current_user
  # TODO(wickman) This should be delegated to the ProcessPlatform / Helper
  if self._user != current_user:
    # Switching users requires root (setuid) privileges.
    if os.geteuid() != 0:
      raise ValueError('task specifies user as %s, but %s does not have setuid permission!' % (
          self._user, current_user))
  self._portmap = portmap or {}
  self._launch_time = launch_time
  self._log_dir = log_dir or os.path.join(sandbox, '.logs')
  self._process_logger_destination = process_logger_destination
  self._process_logger_mode = process_logger_mode
  self._rotate_log_size_mb = rotate_log_size_mb
  self._rotate_log_backups = rotate_log_backups
  self._pathspec = TaskPath(root=checkpoint_root, task_id=self._task_id, log_dir=self._log_dir)
  self._hostname = hostname or socket.gethostname()
  # Validate the task and its port bindings before doing any interpolation.
  try:
    ThermosTaskValidator.assert_valid_task(task)
    ThermosTaskValidator.assert_valid_ports(task, self._portmap)
  except ThermosTaskValidator.InvalidTaskError as e:
    raise self.InvalidTask('Invalid task: %s' % e)
  # Interpolate {{thermos.*}} references; any remaining unbound refs are fatal.
  context = ThermosContext(task_id=self._task_id, ports=self._portmap, user=self._user)
  self._task, uninterp = (task % Environment(thermos=context)).interpolate()
  if len(uninterp) > 0:
    raise self.InvalidTask('Failed to interpolate task, missing: %s' %
        ', '.join(str(ref) for ref in uninterp))
  # Guard against binding to a task_id whose checkpointed task differs.
  try:
    ThermosTaskValidator.assert_same_task(self._pathspec, self._task)
  except ThermosTaskValidator.InvalidTaskError as e:
    raise self.InvalidTask('Invalid task: %s' % e)
  self._plan = None  # plan currently being executed (updated by Handlers)
  # Two planners: one for regular processes, one for finalizing (final=True).
  self._regular_plan = planner_class(
      self._task, clock=clock, process_filter=lambda proc: proc.final().get() is False)
  self._finalizing_plan = planner_class(
      self._task, clock=clock, process_filter=lambda proc: proc.final().get() is True)
  self._chroot = chroot
  self._sandbox = sandbox
  self._terminal_state = None
  self._ckpt = None
  self._process_map = dict((p.name().get(), p) for p in self._task.processes())
  self._task_processes = {}
  self._stages = dict((state, stage(self)) for state, stage in self.STAGES.items())
  self._finalization_start = None
  self._preemption_deadline = None
  self._watcher = ProcessMuxer(self._pathspec)
  self._state = RunnerState(processes={})
  self._preserve_env = preserve_env

  # create runner state
  universal_handler = universal_handler or TaskRunnerUniversalHandler
  self._dispatcher = CheckpointDispatcher()
  self._dispatcher.register_handler(universal_handler(self))
  self._dispatcher.register_handler(TaskRunnerProcessHandler(self))
  self._dispatcher.register_handler(TaskRunnerTaskHandler(self))

  # recover checkpointed runner state and update plan
  self._recovery = True
  self._replay_runner_ckpt()
def state(self):
  """Return the final Task state (RunnerState), loading from disk on first access.

  The loaded state is cached; callers receive a deep copy so mutations cannot
  corrupt the cache. An empty RunnerState is returned when nothing was loaded.
  """
  if self._state is None:
    checkpoint = self._pathspec.getpath('runner_checkpoint')
    self._state = CheckpointDispatcher.from_file(checkpoint)
  if self._state:
    return copy.deepcopy(self._state)
  return RunnerState(processes={})
def inspect(self, task_id):
  """
    Reconstructs the checkpoint stream and returns a CheckpointInspection.

    Replays the runner checkpoint for `task_id` to collect the pids/uids/start
    times of runner, coordinator (FORKED) and running (RUNNING) processes, plus
    the latest update timestamps seen in the runner and per-process streams.
    Returns None if the runner checkpoint cannot be read.
  """
  dispatcher = CheckpointDispatcher()
  state = RunnerState(processes={})
  muxer = ProcessMuxer(self._path.given(task_id=task_id))

  runner_processes = []
  coordinator_processes = set()
  processes = set()

  # Closure over `state`/`coordinator_processes`/`processes`: classifies a
  # process_status record by its state. Requires state.header to already be
  # populated (i.e. dispatched) so the owning user can be resolved to a uid.
  def consume_process_record(record):
    if not record.process_status:
      return
    try:
      user_uid = pwd.getpwnam(state.header.user).pw_uid
    except KeyError:
      log.error('Could not find user: %s' % state.header.user)
      return
    if record.process_status.state == ProcessState.FORKED:
      coordinator_processes.add((record.process_status.coordinator_pid, user_uid,
                                 record.process_status.fork_time))
    elif record.process_status.state == ProcessState.RUNNING:
      processes.add((record.process_status.pid, user_uid,
                     record.process_status.start_time))

  # replay runner checkpoint
  runner_pid = None
  runner_latest_update = 0
  try:
    with open(self._path.given(task_id=task_id).getpath('runner_checkpoint')) as fp:
      with closing(ThriftRecordReader(fp, RunnerCkpt)) as ckpt:
        for record in ckpt:
          dispatcher.dispatch(state, record)
          # NOTE(review): process_status may be None here — presumably
          # get_timestamp tolerates that; confirm.
          runner_latest_update = max(runner_latest_update,
              self.get_timestamp(record.process_status))
          # collect all bound runners
          if record.task_status:
            # Record each distinct runner pid only once, in encounter order.
            if record.task_status.runner_pid != runner_pid:
              runner_processes.append((record.task_status.runner_pid,
                                       record.task_status.runner_uid or 0,
                                       record.task_status.timestamp_ms))
              runner_pid = record.task_status.runner_pid
          elif record.process_status:
            consume_process_record(record)
  except (IOError, OSError, RecordIO.Error) as err:
    log.debug('Error inspecting task runner checkpoint: %s' % err)
    return

  # register existing processes in muxer
  for process_name in state.processes:
    muxer.register(process_name)

  # read process checkpoints
  process_latest_update = runner_latest_update
  for record in muxer.select():
    process_latest_update = max(process_latest_update,
        self.get_timestamp(record.process_status))
    consume_process_record(record)

  return CheckpointInspection(
      runner_latest_update=runner_latest_update,
      process_latest_update=process_latest_update,
      runner_processes=runner_processes,
      coordinator_processes=coordinator_processes,
      processes=processes)