def _wait_for_control(self):
    """Wait for control of the checkpoint stream: must be run in the child.

    Polls the checkpoint file, starting at ``self._ckpt_head``, until one record can be
    read or ``self.MAXIMUM_CONTROL_WAIT`` has been spent waiting.  The record that is
    read must describe this process in the FORKED state with a matching fork time and
    coordinator pid; on success ``self._seq`` is set to the sequence number that follows
    that record.

    Returns:
      True once control of the stream has been taken.

    Raises:
      CheckpointError: if the record carries no process status, if it does not match
        this process (control was lost), or if no record could be read before the wait
        limit was reached.
    """
    total_wait_time = Amount(0, Time.SECONDS)

    with open(self.ckpt_file(), 'r') as fp:
        fp.seek(self._ckpt_head)
        rr = ThriftRecordReader(fp, RunnerCkpt)
        while total_wait_time < self.MAXIMUM_CONTROL_WAIT:
            checkpoint = None
            # Only attempt a read once the file size differs from where we started.
            if os.path.getsize(self.ckpt_file()) != self._ckpt_head:
                checkpoint = rr.try_read()

            if not checkpoint:
                # Either nothing has been appended yet, or try_read() could not produce
                # a record (presumably one that is only partially written).  Back off
                # and charge the interval against the wait budget in both cases, so
                # that a record which never completes cannot turn this loop into an
                # unbounded busy-spin.
                self._platform.clock().sleep(self.CONTROL_WAIT_CHECK_INTERVAL.as_(Time.SECONDS))
                total_wait_time += self.CONTROL_WAIT_CHECK_INTERVAL
                continue

            status = checkpoint.process_status
            if not status:
                raise self.CheckpointError('No process status in checkpoint!')

            if (status.process != self.name() or
                    status.state != ProcessState.FORKED or
                    status.fork_time != self._fork_time or
                    status.coordinator_pid != self._pid):
                # The record belongs to somebody else: log the identifying fields
                # before giving up.
                self._log('Losing control of the checkpoint stream:')
                self._log(' fork_time [%s] vs self._fork_time [%s]' % (
                    status.fork_time, self._fork_time))
                self._log(' coordinator_pid [%s] vs self._pid [%s]' % (
                    status.coordinator_pid, self._pid))
                raise self.CheckpointError('Lost control of the checkpoint stream!')

            self._log('Taking control of the checkpoint stream at record: %s' % status)
            self._seq = status.seq + 1
            return True

    raise self.CheckpointError('Timed out waiting for checkpoint stream!')
def _fast_forward_stream(self, process_name):
    """Skip records of an already-bound process stream up to its stored watermark.

    Reads records from the file handle registered for `process_name` until a record
    whose sequence number equals the watermark in ``self._watermarks`` has been
    consumed.  If a record beyond the watermark is encountered, the handle is moved
    back to the start of that record so it can be read again later.  A warning is
    logged when the stream ends before the watermark is reached.

    The handle for `process_name` must already be bound (asserted).
    """
    target = self._watermarks[process_name]
    log.debug('Fast forwarding %s stream to seq=%s' % (process_name, target))
    assert self._processes.get(process_name) is not None

    stream = self._processes[process_name]
    reader = ThriftRecordReader(stream, RunnerCkpt)
    reached = -1
    consumed = 0

    while reached < target:
        # Remember where this record starts in case it has to be un-read.
        rewind_to = stream.tell()
        entry = reader.try_read()
        if entry is None:
            break
        seq = entry.process_status.seq
        if seq > target:
            log.debug('Over-seeked %s [watermark = %s, high watermark = %s], rewinding.' % (
                process_name, seq, target))
            stream.seek(rewind_to)
            break
        reached = seq
        consumed += 1

    if reached < target:
        log.warning('Only able to fast forward to %s@sequence=%s, high watermark is %s' % (
            process_name, reached, target))

    if consumed:
        log.debug('Fast forwarded %s %s record(s) to seq=%s.' % (
            process_name, consumed, reached))
def _apply_states(self): """ os.stat() the corresponding checkpoint stream of this task and determine if there are new ckpt records. Attempt to read those records and update the high watermark for that stream. Returns True if new states were applied, False otherwise. """ ckpt_offset = None try: ckpt_offset = os.stat(self._runner_ckpt).st_size updated = False if self._ckpt_head < ckpt_offset: with open(self._runner_ckpt, 'r') as fp: fp.seek(self._ckpt_head) rr = ThriftRecordReader(fp, RunnerCkpt) while True: runner_update = rr.try_read() if not runner_update: break try: self._dispatcher.dispatch(self._runnerstate, runner_update) except CheckpointDispatcher.InvalidSequenceNumber as e: log.error('Checkpoint stream is corrupt: %s' % e) break new_ckpt_head = fp.tell() updated = self._ckpt_head != new_ckpt_head self._ckpt_head = new_ckpt_head return updated except OSError as e: if e.errno == errno.ENOENT: # The log doesn't yet exist, will retry later. log.warning('Could not read from checkpoint %s' % self._runner_ckpt) return False else: raise
def _apply_states(self): """ os.stat() the corresponding checkpoint stream of this task and determine if there are new ckpt records. Attempt to read those records and update the high watermark for that stream. Returns True if new states were applied, False otherwise. """ ckpt_offset = None try: ckpt_offset = os.stat(self._runner_ckpt).st_size updated = False if self._ckpt_head < ckpt_offset: with open(self._runner_ckpt, "r") as fp: fp.seek(self._ckpt_head) rr = ThriftRecordReader(fp, RunnerCkpt) while True: runner_update = rr.try_read() if not runner_update: break try: self._dispatcher.dispatch(self._runnerstate, runner_update) except CheckpointDispatcher.InvalidSequenceNumber as e: log.error("Checkpoint stream is corrupt: %s" % e) break new_ckpt_head = fp.tell() updated = self._ckpt_head != new_ckpt_head self._ckpt_head = new_ckpt_head return updated except OSError as e: if e.errno == errno.ENOENT: # The log doesn't yet exist, will retry later. log.warning("Could not read from checkpoint %s" % self._runner_ckpt) return False else: raise
def select(self):
    """
    Read and multiplex checkpoint records from all the forked off process coordinators.

    Every bound file handle in self._processes whose current position lies before the
    end of its file is drained with a ThriftRecordReader; handles that are unset,
    cannot be fstat'ed, or are already positioned past the end of the file (logged as
    truncation) contribute nothing.

    Returns a list of the RunnerCkpt records that were read, in the order they were
    read, or an empty list if none were read.
    """
    self._bind_processes()

    collected = []
    for fh in self._processes.values():
        if not fh:
            continue

        try:
            size_on_disk = os.fstat(fh.fileno()).st_size
        except OSError:
            log.error('Unable to fstat %s!' % fh.name)
            continue

        position = fh.tell()
        if position > size_on_disk:
            log.error('Truncated checkpoint record detected on %s!' % fh.name)
            continue
        if position == size_on_disk:
            # Already at the end of the file: nothing new to read.
            continue

        reader = ThriftRecordReader(fh, RunnerCkpt)
        record = reader.try_read()
        while record:
            collected.append(record)
            record = reader.try_read()

    if collected:
        log.debug('select() returning %s updates:' % len(collected))
        for record in collected:
            log.debug(' = %s' % record)

    return collected
def _fast_forward_stream(self, process_name):
    """Consume records from a bound process stream until its watermark is reached.

    The watermark is the value stored for `process_name` in ``self._watermarks``.
    Records are read one at a time; a record with a higher sequence number than the
    watermark is pushed back by seeking to where it started.  Falling short of the
    watermark (the stream ran out of records, or jumped past it) is logged as a
    warning.

    The handle for `process_name` must already be bound (asserted).
    """
    high_watermark = self._watermarks[process_name]
    log.debug('Fast forwarding %s stream to seq=%s' % (process_name, high_watermark))
    assert self._processes.get(process_name) is not None

    handle = self._processes[process_name]
    reader = ThriftRecordReader(handle, RunnerCkpt)
    seen_seq = -1
    skipped = 0

    while True:
        if seen_seq >= high_watermark:
            break
        record_start = handle.tell()
        item = reader.try_read()
        if item is None:
            break
        item_seq = item.process_status.seq
        if item_seq > high_watermark:
            log.debug(
                'Over-seeked %s [watermark = %s, high watermark = %s], rewinding.'
                % (process_name, item_seq, high_watermark))
            # Put the record back so a later read sees it again.
            handle.seek(record_start)
            break
        seen_seq = item_seq
        skipped += 1

    if seen_seq < high_watermark:
        log.warning(
            'Only able to fast forward to %s@sequence=%s, high watermark is %s'
            % (process_name, seen_seq, high_watermark))

    if skipped:
        log.debug('Fast forwarded %s %s record(s) to seq=%s.'
                  % (process_name, skipped, seen_seq))
def select(self):
    """
    Gather pending checkpoint records from every bound process stream.

    After (re)binding process streams, each non-empty handle in self._processes is
    compared against the size of its file: a handle positioned before the end of the
    file is read until no further record is available, a handle positioned past the
    end is reported as truncated, and a handle that cannot be fstat'ed is skipped with
    an error.

    Returns the list of RunnerCkpt records read (possibly empty).
    """
    self._bind_processes()

    gathered = []
    live_handles = [candidate for candidate in self._processes.values() if candidate]
    for stream in live_handles:
        try:
            end_of_file = os.fstat(stream.fileno()).st_size
        except OSError:
            log.error('Unable to fstat %s!' % stream.name)
            continue

        offset = stream.tell()
        if offset > end_of_file:
            log.error('Truncated checkpoint record detected on %s!' % stream.name)
        elif offset < end_of_file:
            reader = ThriftRecordReader(stream, RunnerCkpt)
            while True:
                item = reader.try_read()
                if not item:
                    break
                gathered.append(item)

    if gathered:
        log.debug('select() returning %s updates:' % len(gathered))
        for item in gathered:
            log.debug(' = %s' % item)

    return gathered
def has_data(self, process):
    """
    Return true if we think that there are updates available from the supplied process.

    This is a peek: when a record can be read, the file handle is moved back to where
    it was before the read so the record is still available to the next reader.

    Returns False when `process` is not in self._processes, when its entry holds no
    file handle, when the handle cannot be fstat'ed, or when no record could be read.
    """
    self._bind_processes()
    # TODO(wickman) Should this raise ProcessNotFound?
    fp = self._processes.get(process)
    if fp is None:
        # Either an unknown process, or an entry that holds no file handle (presumably
        # a process whose checkpoint is not bound yet -- confirm against
        # _bind_processes).  Nothing can be read in either case.
        return False

    rr = ThriftRecordReader(fp, RunnerCkpt)
    old_pos = fp.tell()
    try:
        # Only used as a probe that the descriptor is still usable.
        os.fstat(fp.fileno())
    except OSError:
        log.debug('ProcessMuxer could not fstat for process %s' % process)
        return False

    update = rr.try_read()
    if update:
        # Un-read the record we just peeked at.
        fp.seek(old_pos)
        return True
    return False
def has_data(self, process):
    """
    Return true if we think that there are updates available from the supplied process.

    This is a peek: when a record can be read, the file handle is moved back to where
    it was before the read so the record is still available to the next reader.

    Returns False when `process` is not in self._processes, when its entry holds no
    file handle, when the handle cannot be fstat'ed, or when no record could be read.
    """
    self._bind_processes()
    # TODO(wickman) Should this raise ProcessNotFound?
    fp = self._processes.get(process)
    if fp is None:
        # Either an unknown process, or an entry that holds no file handle (presumably
        # a process whose checkpoint is not bound yet -- confirm against
        # _bind_processes).  Nothing can be read in either case.
        return False

    rr = ThriftRecordReader(fp, RunnerCkpt)
    old_pos = fp.tell()
    try:
        # Only used as a probe that the descriptor is still usable; the size itself
        # is not needed.
        os.fstat(fp.fileno())
    except OSError:
        log.debug('ProcessMuxer could not fstat for process %s' % process)
        return False

    update = rr.try_read()
    if update:
        # Un-read the record we just peeked at.
        fp.seek(old_pos)
        return True
    return False