def open_checkpoint(cls, filename, force=False, state=None): """ Acquire a locked checkpoint stream. """ safe_mkdir(os.path.dirname(filename)) fp = lock_file(filename, "a+") if fp in (None, False): if force: log.info('Found existing runner, forcing leadership forfeit.') state = state or CheckpointDispatcher.from_file(filename) if cls.kill_runner(state): log.info('Successfully killed leader.') # TODO(wickman) Blocking may not be the best idea here. Perhaps block up to # a maximum timeout. But blocking is necessary because os.kill does not immediately # release the lock if we're in force mode. fp = lock_file(filename, "a+", blocking=True) else: log.error('Found existing runner, cannot take control.') if fp in (None, False): raise cls.PermissionError( 'Could not open locked checkpoint: %s, lock_file = %s' % (filename, fp)) ckpt = ThriftRecordWriter(fp) ckpt.set_sync(True) return ckpt
def state(self): """Return final state of Task (RunnerState, read from disk and cached for future access)""" if self._state is None: path = self._pathspec.given( task_id=self._task_id).getpath('runner_checkpoint') self._state = CheckpointDispatcher.from_file(path) return self._state
def run(self): self._run_count += 1 atexit.register(self.cleanup) if self.script_filename: os.unlink(self.script_filename) with temporary_file(cleanup=False) as fp: self.script_filename = fp.name fp.write( self.RUN_JOB_SCRIPT % { 'filename': self.job_filename, 'sandbox': self.sandbox, 'root': self.tempdir, 'task_id': self.task_id, 'state_filename': self.state_filename, 'portmap': repr(self.portmap), 'success_rate': self.success_rate, 'random_seed': self.random_seed + self._run_count, }) with environment_as(PYTHONPATH=os.pathsep.join(sys.path)): self.po = subprocess.Popen([sys.executable, self.script_filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE) try: so, se = self.po.communicate() except OSError as e: if e.errno == errno.ECHILD: so = se = 'Killed' else: raise rc = self.po.returncode if rc != 0: if os.path.exists(self.job_filename): config = open(self.job_filename).read() else: config = 'Nonexistent!' if 'THERMOS_DEBUG' in os.environ: print( "Runner failed!\n\n\nconfig:%s\n\n\nstdout:%s\n\n\nstderr:%s\n\n\n" % (config, so, se)) try: with open(self.state_filename, 'r') as fp: self.state = thrift_deserialize(RunnerState(), fp.read()) except Exception as e: if 'THERMOS_DEBUG' in os.environ: print('Failed to load Runner state: %s' % e, file=sys.stderr) self.state = RunnerState() try: self.reconstructed_state = CheckpointDispatcher.from_file( self.pathspec.getpath('runner_checkpoint')) except: self.reconstructed_state = None self.initialized = True return rc
def tail(args, options): """Tail the logs of a task process. Usage: thermos tail task_name [process_name] """ if len(args) == 0: app.error('Expected a task to tail, got nothing!') if len(args) not in (1, 2): app.error('Expected at most two arguments (task and optional process), got %d' % len(args)) task_id = args[0] detector = TaskDetector(root=options.root) checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id)) log_dir = checkpoint.header.log_dir process_runs = [(process, run) for (process, run) in detector.get_process_runs(task_id, log_dir)] if len(args) == 2: process_runs = [(process, run) for (process, run) in process_runs if process == args[1]] if len(process_runs) == 0: print('ERROR: No processes found.', file=sys.stderr) sys.exit(1) processes = set([process for process, _ in process_runs]) if len(processes) != 1: print('ERROR: More than one process matches query.', file=sys.stderr) sys.exit(1) process = processes.pop() run = max([run for _, run in process_runs]) logdir = TaskPath(root=options.root, task_id=args[0], process=process, run=run, log_dir=log_dir).getpath('process_logdir') logfile = os.path.join(logdir, 'stderr' if options.use_stderr else 'stdout') monitor = TaskMonitor(TaskPath(root = options.root), args[0]) def log_is_active(): active_processes = monitor.get_active_processes() for process_status, process_run in active_processes: if process_status.process == process and process_run == run: return True return False if not log_is_active(): print('Tail of terminal log %s' % logfile) for line in tail_closed(logfile): print(line.rstrip()) return now = time.time() next_check = now + 5.0 print('Tail of active log %s' % logfile) for line in tail_f(logfile, include_last=True, forever=False): print(line.rstrip()) if time.time() > next_check: if not log_is_active(): break else: next_check = time.time() + 5.0
def run(self): self._run_count += 1 atexit.register(self.cleanup) if self.script_filename: os.unlink(self.script_filename) with temporary_file(cleanup=False) as fp: self.script_filename = fp.name fp.write(self.RUN_JOB_SCRIPT % { 'filename': self.job_filename, 'sandbox': self.sandbox, 'root': self.tempdir, 'task_id': self.task_id, 'state_filename': self.state_filename, 'portmap': repr(self.portmap), 'success_rate': self.success_rate, 'random_seed': self.random_seed + self._run_count, }) with environment_as(PYTHONPATH=os.pathsep.join(sys.path)): self.po = subprocess.Popen([sys.executable, self.script_filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE) try: so, se = self.po.communicate() except OSError as e: if e.errno == errno.ECHILD: so = se = 'Killed' else: raise rc = self.po.returncode if rc != 0: if os.path.exists(self.job_filename): config = open(self.job_filename).read() else: config = 'Nonexistent!' if 'THERMOS_DEBUG' in os.environ: print("Runner failed!\n\n\nconfig:%s\n\n\nstdout:%s\n\n\nstderr:%s\n\n\n" % ( config, so, se)) try: with open(self.state_filename, 'r') as fp: self.state = thrift_deserialize(RunnerState(), fp.read()) except Exception as e: if 'THERMOS_DEBUG' in os.environ: print('Failed to load Runner state: %s' % e, file=sys.stderr) self.state = RunnerState() try: self.reconstructed_state = CheckpointDispatcher.from_file( self.pathspec.getpath('runner_checkpoint')) except: self.reconstructed_state = None self.initialized = True return rc
def format_task(task_id): checkpoint_filename = detector.get_checkpoint(task_id) checkpoint_stat = os.stat(checkpoint_filename) try: checkpoint_owner = pwd.getpwuid(checkpoint_stat.st_uid).pw_name except: checkpoint_owner = 'uid:%s' % checkpoint_stat.st_uid print(' %-20s [owner: %8s]' % (task_id, checkpoint_owner), end='') if options.verbose == 0: print() if options.verbose > 0: state = CheckpointDispatcher.from_file(checkpoint_filename) if state is None or state.header is None: print(' - checkpoint stream CORRUPT or outdated format') return print(' state: %8s' % TaskState._VALUES_TO_NAMES.get( state.statuses[-1].state, 'Unknown'), end='') print(' start: %25s' % time.asctime( time.localtime(state.header.launch_time_ms / 1000.0))) if options.verbose > 1: print(' user: %s' % state.header.user, end='') if state.header.ports: print(' ports: %s' % ' '.join('%s -> %s' % (key, val) for key, val in state.header.ports.items())) else: print(' ports: None') print(' sandbox: %s' % state.header.sandbox) if options.verbose > 2: print(' process table:') for process, process_history in state.processes.items(): print(' - %s runs: %s' % (process, len(process_history)), end='') last_run = process_history[-1] print(' last: pid=%s, rc=%s, finish:%s, state:%s' % (last_run.pid or 'None', last_run.return_code if last_run.return_code is not None else '', time.asctime(time.localtime(last_run.stop_time)) if last_run.stop_time else 'None', ProcessState._VALUES_TO_NAMES.get( last_run.state, 'Unknown'))) print()
def open_checkpoint(cls, filename, force=False, state=None): """ Acquire a locked checkpoint stream. """ safe_mkdir(os.path.dirname(filename)) fp = lock_file(filename, "a+") if fp in (None, False): if force: log.info("Found existing runner, forcing leadership forfeit.") state = state or CheckpointDispatcher.from_file(filename) if cls.kill_runner(state): log.info("Successfully killed leader.") # TODO(wickman) Blocking may not be the best idea here. Perhaps block up to # a maximum timeout. But blocking is necessary because os.kill does not immediately # release the lock if we're in force mode. fp = lock_file(filename, "a+", blocking=True) else: log.error("Found existing runner, cannot take control.") if fp in (None, False): raise cls.PermissionError("Could not open locked checkpoint: %s, lock_file = %s" % (filename, fp)) ckpt = ThriftRecordWriter(fp) ckpt.set_sync(True) return ckpt
def format_task(task_id): checkpoint_filename = detector.get_checkpoint(task_id) checkpoint_stat = os.stat(checkpoint_filename) try: checkpoint_owner = pwd.getpwuid(checkpoint_stat.st_uid).pw_name except: checkpoint_owner = 'uid:%s' % checkpoint_stat.st_uid print(' %-20s [owner: %8s]' % (task_id, checkpoint_owner), end='') if options.verbose == 0: print() if options.verbose > 0: state = CheckpointDispatcher.from_file(checkpoint_filename) if state is None or state.header is None: print(' - checkpoint stream CORRUPT or outdated format') return print(' state: %8s' % TaskState._VALUES_TO_NAMES.get(state.statuses[-1].state, 'Unknown'), end='') print(' start: %25s' % time.asctime(time.localtime(state.header.launch_time_ms/1000.0))) if options.verbose > 1: print(' user: %s' % state.header.user, end='') if state.header.ports: print(' ports: %s' % ' '.join('%s -> %s' % (key, val) for key, val in state.header.ports.items())) else: print(' ports: None') print(' sandbox: %s' % state.header.sandbox) if options.verbose > 2: print(' process table:') for process, process_history in state.processes.items(): print(' - %s runs: %s' % (process, len(process_history)), end='') last_run = process_history[-1] print(' last: pid=%s, rc=%s, finish:%s, state:%s' % ( last_run.pid or 'None', last_run.return_code if last_run.return_code is not None else '', time.asctime(time.localtime(last_run.stop_time)) if last_run.stop_time else 'None', ProcessState._VALUES_TO_NAMES.get(last_run.state, 'Unknown'))) print()
def get(cls, task_id, checkpoint_root): """ Get a TaskRunner bound to the task_id in checkpoint_root. """ path = TaskPath(root=checkpoint_root, task_id=task_id, state='active') task_json = path.getpath('task_path') task_checkpoint = path.getpath('runner_checkpoint') if not os.path.exists(task_json): return None task = ThermosConfigLoader.load_json(task_json) if task is None: return None if len(task.tasks()) == 0: return None try: checkpoint = CheckpointDispatcher.from_file(task_checkpoint) if checkpoint is None or checkpoint.header is None: return None return cls(task.tasks()[0].task(), checkpoint_root, checkpoint.header.sandbox, log_dir=checkpoint.header.log_dir, task_id=task_id, portmap=checkpoint.header.ports) except Exception as e: log.error('Failed to reconstitute checkpoint in TaskRunner.get: %s' % e, exc_info=True) return None
def kill(cls, task_id, checkpoint_root, force=False, terminal_status=TaskState.KILLED, clock=time): """ An implementation of Task killing that doesn't require a fully hydrated TaskRunner object. Terminal status must be either KILLED or LOST state. """ if terminal_status not in (TaskState.KILLED, TaskState.LOST): raise cls.Error( "terminal_status must be KILLED or LOST (got %s)" % TaskState._VALUES_TO_NAMES.get(terminal_status) or terminal_status ) pathspec = TaskPath(root=checkpoint_root, task_id=task_id) checkpoint = pathspec.getpath("runner_checkpoint") state = CheckpointDispatcher.from_file(checkpoint) if state is None or state.header is None or state.statuses is None: if force: log.error("Task has uninitialized TaskState - forcibly finalizing") cls.finalize_task(pathspec) return else: log.error("Cannot update states in uninitialized TaskState!") return ckpt = cls.open_checkpoint(checkpoint, force=force, state=state) def write_task_state(state): update = TaskStatus( state=state, timestamp_ms=int(clock.time() * 1000), runner_pid=os.getpid(), runner_uid=os.getuid() ) ckpt.write(RunnerCkpt(task_status=update)) def write_process_status(status): ckpt.write(RunnerCkpt(process_status=status)) if cls.is_task_terminal(state.statuses[-1].state): log.info("Task is already in terminal state! Finalizing.") cls.finalize_task(pathspec) return with closing(ckpt): write_task_state(TaskState.ACTIVE) for process, history in state.processes.items(): process_status = history[-1] if not cls.is_process_terminal(process_status.state): if cls.kill_process(state, process): write_process_status( ProcessStatus( process=process, state=ProcessState.KILLED, seq=process_status.seq + 1, return_code=-9, stop_time=clock.time(), ) ) else: if process_status.state is not ProcessState.WAITING: write_process_status( ProcessStatus(process=process, state=ProcessState.LOST, seq=process_status.seq + 1) ) write_task_state(terminal_status) cls.finalize_task(pathspec)
def state(self, task_id): if task_id not in self._states: self._states[task_id] = CheckpointDispatcher.from_file(self._detector.get_checkpoint(task_id)) return self._states[task_id]
def kill(cls, task_id, checkpoint_root, force=False, terminal_status=TaskState.KILLED, clock=time): """ An implementation of Task killing that doesn't require a fully hydrated TaskRunner object. Terminal status must be either KILLED or LOST state. """ if terminal_status not in (TaskState.KILLED, TaskState.LOST): raise cls.Error('terminal_status must be KILLED or LOST (got %s)' % TaskState._VALUES_TO_NAMES.get(terminal_status) or terminal_status) pathspec = TaskPath(root=checkpoint_root, task_id=task_id) checkpoint = pathspec.getpath('runner_checkpoint') state = CheckpointDispatcher.from_file(checkpoint) if state is None or state.header is None or state.statuses is None: if force: log.error( 'Task has uninitialized TaskState - forcibly finalizing') cls.finalize_task(pathspec) return else: log.error('Cannot update states in uninitialized TaskState!') return ckpt = cls.open_checkpoint(checkpoint, force=force, state=state) def write_task_state(state): update = TaskStatus(state=state, timestamp_ms=int(clock.time() * 1000), runner_pid=os.getpid(), runner_uid=os.getuid()) ckpt.write(RunnerCkpt(task_status=update)) def write_process_status(status): ckpt.write(RunnerCkpt(process_status=status)) if cls.is_task_terminal(state.statuses[-1].state): log.info('Task is already in terminal state! Finalizing.') cls.finalize_task(pathspec) return with closing(ckpt): write_task_state(TaskState.ACTIVE) for process, history in state.processes.items(): process_status = history[-1] if not cls.is_process_terminal(process_status.state): if cls.kill_process(state, process): write_process_status( ProcessStatus(process=process, state=ProcessState.KILLED, seq=process_status.seq + 1, return_code=-9, stop_time=clock.time())) else: if process_status.state is not ProcessState.WAITING: write_process_status( ProcessStatus(process=process, state=ProcessState.LOST, seq=process_status.seq + 1)) write_task_state(terminal_status) cls.finalize_task(pathspec)
def tail(args, options): """Tail the logs of a task process. Usage: thermos tail task_name [process_name] """ if len(args) == 0: app.error('Expected a task to tail, got nothing!') if len(args) not in (1, 2): app.error( 'Expected at most two arguments (task and optional process), got %d' % len(args)) task_id = args[0] detector = TaskDetector(root=options.root) checkpoint = CheckpointDispatcher.from_file( detector.get_checkpoint(task_id)) log_dir = checkpoint.header.log_dir process_runs = [(process, run) for (process, run) in detector.get_process_runs(task_id, log_dir)] if len(args) == 2: process_runs = [(process, run) for (process, run) in process_runs if process == args[1]] if len(process_runs) == 0: print('ERROR: No processes found.', file=sys.stderr) sys.exit(1) processes = set([process for process, _ in process_runs]) if len(processes) != 1: print('ERROR: More than one process matches query.', file=sys.stderr) sys.exit(1) process = processes.pop() run = max([run for _, run in process_runs]) logdir = TaskPath(root=options.root, task_id=args[0], process=process, run=run, log_dir=log_dir).getpath('process_logdir') logfile = os.path.join(logdir, 'stderr' if options.use_stderr else 'stdout') monitor = TaskMonitor(TaskPath(root=options.root), args[0]) def log_is_active(): active_processes = monitor.get_active_processes() for process_status, process_run in active_processes: if process_status.process == process and process_run == run: return True return False if not log_is_active(): print('Tail of terminal log %s' % logfile) for line in tail_closed(logfile): print(line.rstrip()) return now = time.time() next_check = now + 5.0 print('Tail of active log %s' % logfile) for line in tail_f(logfile, include_last=True, forever=False): print(line.rstrip()) if time.time() > next_check: if not log_is_active(): break else: next_check = time.time() + 5.0
def state(self): """Return final state of Task (RunnerState, read from disk and cached for future access)""" if self._state is None: path = self._pathspec.given(task_id=self._task_id).getpath('runner_checkpoint') self._state = CheckpointDispatcher.from_file(path) return self._state