def _fast_forward_stream(self, process_name):
  """Consume checkpoint records from process_name's stream up to its high watermark.

  Stops early if the stream runs dry.  If a record beyond the watermark is
  encountered, the stream is rewound so that record can be re-read later.
  """
  high_watermark = self._watermarks[process_name]
  log.debug('Fast forwarding %s stream to seq=%s' % (process_name, high_watermark))
  stream = self._processes.get(process_name)
  assert stream is not None
  reader = ThriftRecordReader(stream, RunnerCkpt)
  seq = -1
  consumed = 0
  while seq < high_watermark:
    rewind_point = stream.tell()
    ckpt = reader.try_read()
    if ckpt is None:
      # Stream exhausted before reaching the watermark.
      break
    next_seq = ckpt.process_status.seq
    if next_seq > high_watermark:
      # Read one record too far -- put it back for the next consumer.
      log.debug('Over-seeked %s [watermark = %s, high watermark = %s], rewinding.' % (
          process_name, next_seq, high_watermark))
      stream.seek(rewind_point)
      break
    seq = next_seq
    consumed += 1
  if seq < high_watermark:
    log.warning('Only able to fast forward to %s@sequence=%s, high watermark is %s' % (
        process_name, seq, high_watermark))
  if consumed:
    log.debug('Fast forwarded %s %s record(s) to seq=%s.' % (process_name, consumed, seq))
def _wait_for_control(self):
  """Wait for control of the checkpoint stream: must be run in the child.

  Polls the checkpoint file until a record appears past our head offset,
  then validates that the record is our own FORKED process status (same
  process name, fork time and coordinator pid).  On success, claims the
  stream by advancing self._seq past the record's sequence number.

  Returns True once control is taken.

  Raises self.CheckpointError if the first record is malformed, belongs to
  a different coordinator, or nothing appears within MAXIMUM_CONTROL_WAIT.
  """
  total_wait_time = Amount(0, Time.SECONDS)

  with open(self.ckpt_file(), 'r') as fp:
    fp.seek(self._ckpt_head)
    rr = ThriftRecordReader(fp, RunnerCkpt)
    while total_wait_time < self.MAXIMUM_CONTROL_WAIT:
      ckpt_tail = os.path.getsize(self.ckpt_file())
      if ckpt_tail == self._ckpt_head:
        # Nothing new in the stream yet: sleep for one check interval and retry.
        self._platform.clock().sleep(self.CONTROL_WAIT_CHECK_INTERVAL.as_(Time.SECONDS))
        total_wait_time += self.CONTROL_WAIT_CHECK_INTERVAL
        continue
      checkpoint = rr.try_read()
      if checkpoint:
        if not checkpoint.process_status:
          raise self.CheckpointError('No process status in checkpoint!')
        # The record must match our own fork exactly; any mismatch means some
        # other writer owns the stream and we must bail out.
        if (checkpoint.process_status.process != self.name() or
            checkpoint.process_status.state != ProcessState.FORKED or
            checkpoint.process_status.fork_time != self._fork_time or
            checkpoint.process_status.coordinator_pid != self._pid):
          self._log('Losing control of the checkpoint stream:')
          self._log(' fork_time [%s] vs self._fork_time [%s]' % (
              checkpoint.process_status.fork_time, self._fork_time))
          self._log(' coordinator_pid [%s] vs self._pid [%s]' % (
              checkpoint.process_status.coordinator_pid, self._pid))
          raise self.CheckpointError('Lost control of the checkpoint stream!')
        self._log('Taking control of the checkpoint stream at record: %s' %
            checkpoint.process_status)
        # Resume sequence numbering after the FORKED record we just consumed.
        self._seq = checkpoint.process_status.seq + 1
        return True
  raise self.CheckpointError('Timed out waiting for checkpoint stream!')
def select(self):
  """Read and multiplex checkpoint records from all the forked off process coordinators.

  Checkpoint records can come from one of two places:
    in-process: checkpoint records synthesized for FORKED and LOST events
    out-of-process: checkpoint records from file descriptors of forked coordinators

  Returns a list of RunnerCkpt objects that were successfully read, or an empty
  list if none were read.
  """
  self._bind_processes()
  updates = []
  for fp in filter(None, self._processes.values()):
    try:
      stream_size = os.fstat(fp.fileno()).st_size
    except OSError:
      log.error('Unable to fstat %s!' % fp.name)
      continue
    position = fp.tell()
    if position > stream_size:
      # Our read offset is past the end of the file -- the file shrank under us.
      log.error('Truncated checkpoint record detected on %s!' % fp.name)
    elif position < stream_size:
      # Unread bytes are available; drain every complete record.
      reader = ThriftRecordReader(fp, RunnerCkpt)
      record = reader.try_read()
      while record:
        updates.append(record)
        record = reader.try_read()
  if updates:
    log.debug('select() returning %s updates:' % len(updates))
    for update in updates:
      log.debug(' = %s' % update)
  return updates
def _apply_states(self):
  """os.stat() the corresponding checkpoint stream of this task and determine if
  there are new ckpt records.  Attempt to read those records and update the
  high watermark for that stream.

  Returns True if new states were applied, False otherwise.
  """
  try:
    ckpt_size = os.stat(self._runner_ckpt).st_size
    applied = False
    if self._ckpt_head < ckpt_size:
      with open(self._runner_ckpt, 'r') as ckpt_fp:
        ckpt_fp.seek(self._ckpt_head)
        reader = ThriftRecordReader(ckpt_fp, RunnerCkpt)
        record = reader.try_read()
        while record:
          try:
            self._dispatcher.dispatch(self._runnerstate, record)
          except CheckpointDispatcher.InvalidSequenceNumber as e:
            # Stop replaying on corruption but still advance to what we read.
            log.error('Checkpoint stream is corrupt: %s' % e)
            break
          record = reader.try_read()
        head = ckpt_fp.tell()
        applied = (self._ckpt_head != head)
        self._ckpt_head = head
    return applied
  except OSError as e:
    if e.errno != errno.ENOENT:
      raise
    # The log doesn't yet exist, will retry later.
    log.warning('Could not read from checkpoint %s' % self._runner_ckpt)
    return False
def iter_updates(cls, filename):
  """Generate every RunnerCkpt record stored in filename.

  Read failures are re-raised as cls.ErrorRecoveringState.
  """
  try:
    with open(filename) as fp:
      for record in ThriftRecordReader(fp, RunnerCkpt):
        yield record
  except (IOError, OSError, RecordIO.Error) as err:
    raise cls.ErrorRecoveringState(err)
def wait_for_rc(checkpoint, timeout=5.0):
  """Poll a checkpoint stream until a process return code appears.

  checkpoint: path to a RunnerCkpt thrift record stream.
  timeout:    maximum number of seconds to poll.

  Returns the return code of the first process_status record carrying one,
  or None if the timeout elapses first.
  """
  start = time.time()
  # Fix: the original opened the checkpoint file inline and never closed it,
  # leaking the descriptor on every call.  Hold it in a with-block instead.
  with open(checkpoint) as fp:
    trr = ThriftRecordReader(fp, RunnerCkpt)
    while time.time() < start + timeout:
      record = trr.read()
      if record and record.process_status and record.process_status.return_code is not None:
        return record.process_status.return_code
      else:
        time.sleep(0.1)
def read(args, options):
  """Replay a thermos checkpoint.

  Usage: thermos read [options] checkpoint_filename

  Options:
    --simple	Do not replay the full task state machine.  Only print out the contents of
                each checkpoint log message.
  """
  # Exactly one positional argument (the checkpoint path) is required.
  if len(args) != 1:
    app.error('Expected one checkpoint file, got %s' % len(args))
  if not os.path.exists(args[0]):
    app.error('Could not find %s' % args[0])

  dispatcher = CheckpointDispatcher()
  state = RunnerState(processes={})
  with open(args[0], 'r') as fp:
    try:
      for record in ThriftRecordReader(fp, RunnerCkpt):
        if not options.simple:
          # Full replay: run each record through the task state machine.
          dispatcher.dispatch(state, record)
        else:
          # --simple: just dump the raw record.
          print('CKPT: %s' % record)
    except RecordIO.Error as err:
      print("Failed to recover from %s: %s" % (fp.name, err))
      return

  if not options.simple:
    # A missing header after a full replay means the stream was unusable.
    if state is None or state.header is None:
      print('Checkpoint stream CORRUPT or outdated format')
      return
    print('Recovered Task Header:')
    print(' id: %s' % state.header.task_id)
    print(' user: %s' % state.header.user)
    print(' host: %s' % state.header.hostname)
    print(' sandbox: %s' % state.header.sandbox)
    if state.header.ports:
      print(' ports: %s' % ' '.join('%s->%s' % (name, port)
          for (name, port) in state.header.ports.items()))
    print('Recovered Task States:')
    for task_status in state.statuses:
      print(' %s [pid: %d] => %s' % (
          time.asctime(time.localtime(task_status.timestamp_ms / 1000.0)),
          task_status.runner_pid,
          TaskState._VALUES_TO_NAMES[task_status.state]))
    print('Recovered Processes:')
    # Most recent run of each process is printed first.
    for process, process_history in state.processes.items():
      print(' %s runs: %s' % (process, len(process_history)))
      for k in reversed(range(len(process_history))):
        run = process_history[k]
        print(' %2d: pid=%d, rc=%s, finish:%s, state:%s' % (
            k,
            run.pid,
            run.return_code if run.return_code is not None else '',
            time.asctime(time.localtime(run.stop_time)) if run.stop_time else 'None',
            ProcessState._VALUES_TO_NAMES.get(run.state, 'Unknown')))
def _replay_runner_ckpt(self):
  """Replay the checkpoint stream associated with this task."""
  ckpt_file = self._pathspec.getpath('runner_checkpoint')
  if not os.path.exists(ckpt_file):
    # No stream recorded yet -- nothing to replay.
    return
  with open(ckpt_file, 'r') as fp:
    for record in ThriftRecordReader(fp, RunnerCkpt):
      log.debug('Replaying runner checkpoint record: %s' % record)
      self._dispatcher.dispatch(self._state, record, recovery=True)
def test_thrift_garbage():
  # A frame header advertising 2 bytes followed by a non-thrift payload
  # must surface as PrematureEndOfStream rather than a bogus record.
  with EphemeralFile('w') as fp:
    path = fp.name
    fp.write(struct.pack('>L', 2))
    fp.write('ab')
    fp.close()
    with open(path) as read_fp:
      reader = ThriftRecordReader(read_fp, StringType)
      with pytest.raises(RecordIO.PrematureEndOfStream):
        reader.read()
def test_basic_thriftrecordwriter_write():
  # Round-trip a single record through the writer and back out of the reader.
  message = StringType("hello world")
  with EphemeralFile('w') as fp:
    path = fp.name
    writer = ThriftRecordWriter(fp)
    writer.write(message)
    writer.close()
    with open(path) as read_fp:
      reader = ThriftRecordReader(read_fp, StringType)
      assert reader.read() == message
def test_paranoid_thrift_append_framing():
  # Two records written via the standalone append() helper must frame
  # correctly and read back in order.
  first = StringType("hello world")
  second = StringType("ahoy ahoy, bonjour")
  with EphemeralFile('w') as fp:
    path = fp.name
    ThriftRecordWriter.append(path, first)
    ThriftRecordWriter.append(path, second)
    with open(path) as read_fp:
      reader = ThriftRecordReader(read_fp, StringType)
      assert reader.read() == first
      assert reader.read() == second
def test_thrift_recordwriter_type_mismatch():
  # Write a StringType, then read the stream back as IntType.
  message = StringType("hello world")
  with EphemeralFile('w') as fp:
    path = fp.name
    writer = ThriftRecordWriter(fp)
    writer.write(message)
    writer.close()
    with open(path) as read_fp:
      reader = ThriftRecordReader(read_fp, IntType)
      # This is a peculiar behavior of Thrift in that it just returns
      # ThriftType() with no serialization applied
      assert reader.read() == IntType()
def test_thriftrecordreader_iteration():
  # The reader must be iterable, yielding records in write order.
  first = StringType("hello world")
  second = StringType("ahoy ahoy, bonjour")
  with EphemeralFile('w') as fp:
    path = fp.name
    writer = ThriftRecordWriter(fp)
    writer.write(first)
    writer.write(second)
    writer.close()
    with open(path) as read_fp:
      reader = ThriftRecordReader(read_fp, StringType)
      assert list(reader) == [first, second]
def test_thriftrecordwriter_framing():
  # Records appended by a second writer instance must frame compatibly
  # with those written by the first.
  first = StringType("hello world")
  second = StringType("ahoy ahoy, bonjour")
  with EphemeralFile('w') as fp:
    path = fp.name
    writer = ThriftRecordWriter(fp)
    writer.write(first)
    writer.close()
    with open(path, 'a') as append_fp:
      appender = ThriftRecordWriter(append_fp)
      appender.write(second)
    with open(path) as read_fp:
      reader = ThriftRecordReader(read_fp, StringType)
      assert reader.read() == first
      assert reader.read() == second
def has_data(self, process):
  """Return true if we think that there are updates available from the
  supplied process.
  """
  self._bind_processes()
  # TODO(wickman) Should this raise ProcessNotFound?
  fp = self._processes.get(process)
  if fp is None:
    return False
  reader = ThriftRecordReader(fp, RunnerCkpt)
  position = fp.tell()
  try:
    os.fstat(fp.fileno()).st_size
  except OSError:
    log.debug('ProcessMuxer could not fstat for process %s' % process)
    return False
  if reader.try_read():
    # A full record was available -- rewind so select() can consume it.
    fp.seek(position)
    return True
  return False
def main(args): values = app.get_options() if len(args) > 0: print >> sys.stderr, "ERROR: unrecognized arguments: %s\n" % ( " ".join(args)) app.help() sys.exit(1) if not values.ckpt: print >> sys.stderr, "ERROR: must supply --checkpoint" app.help() sys.exit(1) fp = file(values.ckpt, "r") rr = ThriftRecordReader(fp, RunnerCkpt) wrs = RunnerState(processes={}) dispatcher = CheckpointDispatcher() try: for wts in rr: print 'Recovering: ', wts if values.assemble is True: dispatcher.dispatch(wrs, wts) except RecordIO.Error as err: print 'Error recovering checkpoint stream: %s' % err return print '\n\n\n' if values.assemble: print 'Recovered Task Header' pprint.pprint(wrs.header, indent=4) print '\nRecovered Task States' for task_status in wrs.statuses: print ' %s [pid: %d] => %s' % ( time.asctime(time.localtime(task_status.timestamp_ms / 1000.0)), task_status.runner_pid, TaskState._VALUES_TO_NAMES[task_status.state]) print '\nRecovered Processes' pprint.pprint(wrs.processes, indent=4)
def inspect(self, task_id):
  """Reconstructs the checkpoint stream and returns a CheckpointInspection.

  Replays the runner checkpoint for task_id, then drains any per-process
  checkpoint streams through a ProcessMuxer, collecting:
    - runner processes (pid, uid, timestamp) for each bound runner
    - coordinator processes from FORKED records
    - worker processes from RUNNING records
  Returns None if the runner checkpoint cannot be read.
  """
  dispatcher = CheckpointDispatcher()
  state = RunnerState(processes={})
  muxer = ProcessMuxer(self._path.given(task_id=task_id))

  runner_processes = []
  coordinator_processes = set()
  processes = set()

  def consume_process_record(record):
    # Accumulate (pid, uid, start-time) tuples from a process_status record.
    # Records are dropped if the task header's user cannot be resolved.
    if not record.process_status:
      return
    try:
      user_uid = pwd.getpwnam(state.header.user).pw_uid
    except KeyError:
      log.error('Could not find user: %s' % state.header.user)
      return
    if record.process_status.state == ProcessState.FORKED:
      coordinator_processes.add((record.process_status.coordinator_pid, user_uid,
          record.process_status.fork_time))
    elif record.process_status.state == ProcessState.RUNNING:
      processes.add((record.process_status.pid, user_uid,
          record.process_status.start_time))

  # replay runner checkpoint
  runner_pid = None
  runner_latest_update = 0
  try:
    with open(self._path.given(task_id=task_id).getpath('runner_checkpoint')) as fp:
      with closing(ThriftRecordReader(fp, RunnerCkpt)) as ckpt:
        for record in ckpt:
          dispatcher.dispatch(state, record)
          runner_latest_update = max(runner_latest_update,
              self.get_timestamp(record.process_status))
          # collect all bound runners
          if record.task_status:
            # Only record a runner when its pid changes from the previous one.
            if record.task_status.runner_pid != runner_pid:
              runner_processes.append((record.task_status.runner_pid,
                  record.task_status.runner_uid or 0,
                  record.task_status.timestamp_ms))
              runner_pid = record.task_status.runner_pid
          elif record.process_status:
            consume_process_record(record)
  except (IOError, OSError, RecordIO.Error) as err:
    # Unreadable runner checkpoint: nothing can be inspected.
    log.debug('Error inspecting task runner checkpoint: %s' % err)
    return

  # register existing processes in muxer
  for process_name in state.processes:
    muxer.register(process_name)

  # read process checkpoints
  process_latest_update = runner_latest_update
  for record in muxer.select():
    process_latest_update = max(process_latest_update,
        self.get_timestamp(record.process_status))
    consume_process_record(record)

  return CheckpointInspection(
      runner_latest_update=runner_latest_update,
      process_latest_update=process_latest_update,
      runner_processes=runner_processes,
      coordinator_processes=coordinator_processes,
      processes=processes)
def test_thrift_invalid_codec_with_nonclass():
  # A thrift base that is not a class at all must be rejected at construction.
  bogus_thrift_base = 5
  with EphemeralFile('w') as fp:
    with pytest.raises(ThriftRecordIO.InvalidThriftException):
      ThriftRecordReader(fp, bogus_thrift_base)
def test_thrift_invalid_codec_with_object_instead_of_class():
  # Passing an instance instead of the type itself must also be rejected.
  instance_not_class = StringType()
  with EphemeralFile('w') as fp:
    with pytest.raises(ThriftRecordIO.InvalidThriftException):
      ThriftRecordReader(fp, instance_not_class)