Пример #1
0
 def open_checkpoint(cls, filename, force=False, state=None):
     """
     Acquire a locked checkpoint stream.

     The parent directory of `filename` is created if needed and the file is locked in "a+"
     mode.  If the lock cannot be taken and `force` is set, the current runner is killed via
     cls.kill_runner (using `state`, or the state loaded from `filename` when none is given)
     and the lock is re-requested in blocking mode.

     Returns a ThriftRecordWriter over the locked file with sync enabled.
     Raises cls.PermissionError if no lock could be obtained.
     """
     not_locked = (None, False)

     safe_mkdir(os.path.dirname(filename))
     stream = lock_file(filename, "a+")

     if stream in not_locked:
         if not force:
             log.error('Found existing runner, cannot take control.')
         else:
             log.info('Found existing runner, forcing leadership forfeit.')
             runner_state = state or CheckpointDispatcher.from_file(filename)
             if cls.kill_runner(runner_state):
                 log.info('Successfully killed leader.')
                 # TODO(wickman)  Blocking may not be the best idea here.  Perhaps block up to
                 # a maximum timeout.  But blocking is necessary because os.kill does not
                 # immediately release the lock if we're in force mode.
                 stream = lock_file(filename, "a+", blocking=True)

     if stream in not_locked:
         details = (filename, stream)
         raise cls.PermissionError(
             'Could not open locked checkpoint: %s, lock_file = %s' % details)

     writer = ThriftRecordWriter(stream)
     writer.set_sync(True)
     return writer
Пример #2
0
 def _setup_ckpt(self):
   """
   Lock the checkpoint file and attach a synced writer to it (must be run on the parent).

   Records the file size at lock time in self._ckpt_head and stores the writer in self._ckpt.
   Raises self.CheckpointError when lock_file does not hand back a file object.
   """
   self._log('initializing checkpoint file: %s' % self.ckpt_file())

   locked_fp = lock_file(self.ckpt_file(), "a+")
   lock_failed = locked_fp in (None, False)
   if lock_failed:
     message = 'Could not acquire checkpoint permission or lock for %s!' % self.ckpt_file()
     raise self.CheckpointError(message)

   # Remember where the existing content ends and position the stream there.
   head = os.path.getsize(self.ckpt_file())
   self._ckpt_head = head
   locked_fp.seek(head)

   writer = ThriftRecordWriter(locked_fp)
   self._ckpt = writer
   writer.set_sync(True)
Пример #3
0
def test_basic_thriftrecordwriter_write():
  """A record written with ThriftRecordWriter reads back equal via ThriftRecordReader."""
  expected = StringType("hello world")

  with EphemeralFile('w') as ephemeral:
    path = ephemeral.name

    writer = ThriftRecordWriter(ephemeral)
    writer.write(expected)
    writer.close()

    # Re-open the same path for reading while the ephemeral file still exists.
    with open(path) as source:
      reader = ThriftRecordReader(source, StringType)
      actual = reader.read()
      assert actual == expected
Пример #4
0
def test_thrift_recordwriter_type_mismatch():
  """Reading a StringType record back as IntType yields an empty IntType()."""
  written = StringType("hello world")

  with EphemeralFile('w') as ephemeral:
    path = ephemeral.name

    writer = ThriftRecordWriter(ephemeral)
    writer.write(written)
    writer.close()

    with open(path) as source:
      mismatched_reader = ThriftRecordReader(source, IntType)
      # This is a peculiar behavior of Thrift in that it just returns
      # ThriftType() with no serialization applied
      decoded = mismatched_reader.read()
      assert decoded == IntType()
Пример #5
0
def test_paranoid_thrift_append_framing():
  """Two ThriftRecordWriter.append calls on one path yield two records readable in order."""
  first = StringType("hello world")
  second = StringType("ahoy ahoy, bonjour")

  with EphemeralFile('w') as ephemeral:
    path = ephemeral.name

    # Each append is given the path only, not the open ephemeral file object.
    for record in (first, second):
      ThriftRecordWriter.append(path, record)

    with open(path) as source:
      reader = ThriftRecordReader(source, StringType)
      assert reader.read() == first
      assert reader.read() == second
Пример #6
0
def test_paranoid_thrift_append_framing():
  """Records appended one at a time by path are framed so each reads back separately."""
  greeting = StringType("hello world")
  salutation = StringType("ahoy ahoy, bonjour")

  with EphemeralFile('w') as scratch:
    target = scratch.name

    ThriftRecordWriter.append(target, greeting)
    ThriftRecordWriter.append(target, salutation)

    with open(target) as handle:
      reader = ThriftRecordReader(handle, StringType)
      first_read = reader.read()
      assert first_read == greeting
      second_read = reader.read()
      assert second_read == salutation
Пример #7
0
def test_thriftrecordreader_iteration():
  """Iterating a ThriftRecordReader yields every written record in write order."""
  first = StringType("hello world")
  second = StringType("ahoy ahoy, bonjour")

  with EphemeralFile('w') as ephemeral:
    path = ephemeral.name

    writer = ThriftRecordWriter(ephemeral)
    for item in (first, second):
      writer.write(item)
    writer.close()

    with open(path) as source:
      reader = ThriftRecordReader(source, StringType)
      # Drive the reader purely through the iterator protocol.
      seen = [record for record in reader]
      assert seen == [first, second]
Пример #8
0
def test_thriftrecordwriter_framing():
  """A record written through a second writer opened in append mode reads back after the first."""
  first = StringType("hello world")
  second = StringType("ahoy ahoy, bonjour")

  with EphemeralFile('w') as ephemeral:
    path = ephemeral.name

    initial_writer = ThriftRecordWriter(ephemeral)
    initial_writer.write(first)
    initial_writer.close()

    # The second record goes through a brand new writer on a separately opened handle.
    with open(path, 'a') as appendable:
      appending_writer = ThriftRecordWriter(appendable)
      appending_writer.write(second)

    with open(path) as source:
      reader = ThriftRecordReader(source, StringType)
      assert reader.read() == first
      assert reader.read() == second
Пример #9
0
 def _setup_ckpt(self):
   """
   Set up the checkpoint: must be run on the parent.

   Locks the checkpoint file in "a+" mode, stores its current size in self._ckpt_head, seeks
   the locked stream to that offset and installs a synced ThriftRecordWriter as self._ckpt.
   Raises self.CheckpointError if lock_file returns None or False.
   """
   self._log('initializing checkpoint file: %s' % self.ckpt_file())

   stream = lock_file(self.ckpt_file(), "a+")
   if stream not in (None, False):
     size_at_lock = os.path.getsize(self.ckpt_file())
     self._ckpt_head = size_at_lock
     stream.seek(size_at_lock)
     self._ckpt = ThriftRecordWriter(stream)
     self._ckpt.set_sync(True)
     return

   reason = 'Could not acquire checkpoint permission or lock for %s!' % self.ckpt_file()
   raise self.CheckpointError(reason)
Пример #10
0
 def open_checkpoint(cls, filename, force=False, state=None):
     """
     Acquire a locked checkpoint stream.

     Creates the directory holding `filename`, then tries to lock the file in "a+" mode.  When
     the lock is unavailable and `force` is true, cls.kill_runner is asked to kill the existing
     runner (described by `state`, or by the state read from `filename` if `state` is falsy) and
     on success the lock is requested again with blocking=True.

     Returns a ThriftRecordWriter with sync enabled; raises cls.PermissionError when the file
     could not be locked.
     """
     safe_mkdir(os.path.dirname(filename))
     fp = lock_file(filename, "a+")

     lock_missing = fp in (None, False)
     if lock_missing and force:
         log.info("Found existing runner, forcing leadership forfeit.")
         leader_state = state or CheckpointDispatcher.from_file(filename)
         if cls.kill_runner(leader_state):
             log.info("Successfully killed leader.")
             # TODO(wickman)  Blocking may not be the best idea here.  Perhaps block up to
             # a maximum timeout.  But blocking is necessary because os.kill does not
             # immediately release the lock if we're in force mode.
             fp = lock_file(filename, "a+", blocking=True)
     elif lock_missing:
         log.error("Found existing runner, cannot take control.")

     # Re-check: the forced path may or may not have produced a usable file object.
     if fp in (None, False):
         failure = "Could not open locked checkpoint: %s, lock_file = %s" % (filename, fp)
         raise cls.PermissionError(failure)

     checkpoint_writer = ThriftRecordWriter(fp)
     checkpoint_writer.set_sync(True)
     return checkpoint_writer
Пример #11
0
def test_thriftrecordwriter_framing():
  """Records written by two separate writers to one file are both readable, in order."""
  hello = StringType("hello world")
  ahoy = StringType("ahoy ahoy, bonjour")

  with EphemeralFile('w') as scratch:
    target = scratch.name

    writer = ThriftRecordWriter(scratch)
    writer.write(hello)
    writer.close()

    with open(target, 'a') as tail:
      # A fresh writer on an append-mode handle adds the second record.
      ThriftRecordWriter(tail).write(ahoy)

    with open(target) as handle:
      reader = ThriftRecordReader(handle, StringType)
      first_read = reader.read()
      assert first_read == hello
      second_read = reader.read()
      assert second_read == ahoy
Пример #12
0
def test_basic_thriftrecordwriter_write():
  """Write one StringType record, close the writer, and read the same value back."""
  payload = StringType("hello world")

  with EphemeralFile('w') as scratch:
    target = scratch.name

    record_writer = ThriftRecordWriter(scratch)
    record_writer.write(payload)
    record_writer.close()

    with open(target) as handle:
      record_reader = ThriftRecordReader(handle, StringType)
      assert record_reader.read() == payload
Пример #13
0
def test_thrift_recordwriter_type_mismatch():
  """A record read with a different thrift type than it was written with compares equal to
  a default-constructed instance of the reader's type."""
  payload = StringType("hello world")

  with EphemeralFile('w') as scratch:
    target = scratch.name

    record_writer = ThriftRecordWriter(scratch)
    record_writer.write(payload)
    record_writer.close()

    with open(target) as handle:
      record_reader = ThriftRecordReader(handle, IntType)
      # This is a peculiar behavior of Thrift in that it just returns
      # ThriftType() with no serialization applied
      assert record_reader.read() == IntType()
Пример #14
0
def test_thriftrecordreader_iteration():
  """A for-loop over ThriftRecordReader returns the written records in order."""
  hello = StringType("hello world")
  ahoy = StringType("ahoy ahoy, bonjour")
  expected = [hello, ahoy]

  with EphemeralFile('w') as scratch:
    target = scratch.name

    record_writer = ThriftRecordWriter(scratch)
    record_writer.write(hello)
    record_writer.write(ahoy)
    record_writer.close()

    with open(target) as handle:
      record_reader = ThriftRecordReader(handle, StringType)
      collected = [entry for entry in record_reader]
      assert collected == expected
Пример #15
0
class ProcessBase(object):
  """
    Encapsulate a running process for a task.

    start() forks a coordinator through the supplied platform.  The parent records the fork in
    the process checkpoint stream; the child waits until it can read that record back before
    calling execute().  Subclasses implement execute() and may override finish().
  """
  class Error(Exception): pass
  class UnknownUserError(Error): pass
  class CheckpointError(Error): pass
  class UnspecifiedSandbox(Error): pass
  class PermissionError(Error): pass

  # Sleep between polls of the checkpoint file in _wait_for_control().
  CONTROL_WAIT_CHECK_INTERVAL = Amount(100, Time.MILLISECONDS)
  # Total time _wait_for_control() may spend polling before it gives up.
  MAXIMUM_CONTROL_WAIT = Amount(1, Time.MINUTES)

  def __init__(self, name, cmdline, sequence, pathspec, sandbox_dir, user=None, platform=None,
               logger_destination=LoggerDestination.FILE, logger_mode=LoggerMode.STANDARD,
               rotate_log_size=None, rotate_log_backups=None):
    """
      required:
        name        = name of the process
        cmdline     = cmdline of the process
        sequence    = the next available sequence number for state updates
        pathspec    = TaskPath object for synthesizing path names
        sandbox_dir = the sandbox in which to run the process
        platform    = Platform providing fork, clock, getpid

      optional:
        user               = the user to run as (if unspecified, will default to current user.)
                             if specified to a user that is not the current user, you must have root
                             access
        logger_destination = The destination for logs output.
        logger_mode        = The type of logger to use for the process.
        rotate_log_size    = The maximum size of the rotated stdout/stderr logs.
        rotate_log_backups = The maximum number of rotated stdout/stderr log backups.

      raises:
        ValueError if platform is None, if the logger destination or mode is invalid, or if
        logger_mode is ROTATE and the rotation size/backups are missing or not positive.
    """
    self._name = name
    self._cmdline = cmdline
    self._pathspec = pathspec
    self._seq = sequence
    self._sandbox = sandbox_dir
    if self._sandbox:
      safe_mkdir(self._sandbox)
    self._pid = None
    self._fork_time = None
    self._user = user
    self._ckpt = None
    # -1 until _setup_ckpt() records the real size of the checkpoint file.
    self._ckpt_head = -1
    if platform is None:
      raise ValueError("Platform must be specified")
    self._platform = platform
    self._logger_destination = logger_destination
    self._logger_mode = logger_mode
    self._rotate_log_size = rotate_log_size
    self._rotate_log_backups = rotate_log_backups

    if not LoggerDestination.is_valid(self._logger_destination):
      raise ValueError("Logger destination %s is invalid." % self._logger_destination)

    if not LoggerMode.is_valid(self._logger_mode):
      raise ValueError("Logger mode %s is invalid." % self._logger_mode)

    if self._logger_mode == LoggerMode.ROTATE:
      # Both rotation settings default to None.  Reject that explicitly so the caller gets the
      # same ValueError as for any other bad setting instead of an AttributeError/TypeError.
      if self._rotate_log_size is None or self._rotate_log_size.as_(Data.BYTES) <= 0:
        raise ValueError('Log size cannot be less than one byte.')
      if self._rotate_log_backups is None or self._rotate_log_backups <= 0:
        raise ValueError('Log backups cannot be less than one.')

  def _log(self, msg, exc_info=None):
    """Emit a debug log line prefixed with this process's pid and name."""
    log.debug('[process:%5s=%s]: %s' % (self._pid, self.name(), msg),
            exc_info=exc_info)

  def _getpwuid(self):
    """Returns a tuple of the user (i.e. --user) and current user.

    Both elements are pwd entries.  Raises UnknownUserError if either lookup fails.
    """
    uid = os.getuid()
    try:
      current_user = pwd.getpwuid(uid)
    except KeyError:
      raise self.UnknownUserError('Unknown uid %s!' % uid)
    try:
      user = pwd.getpwnam(self._user) if self._user is not None else current_user
    except KeyError:
      raise self.UnknownUserError('Unable to get pwent information!')
    return user, current_user

  def _ckpt_write(self, msg):
    """Write msg to the checkpoint stream, opening the stream first if it is not open."""
    self._init_ckpt_if_necessary()
    self._log("child state transition [%s] <= %s" % (self.ckpt_file(), msg))
    self._ckpt.write(msg)

  def _write_process_update(self, **kw):
    """Write a process update to the coordinator's checkpoint stream."""
    process_status = ProcessStatus(**kw)
    process_status.seq = self._seq
    process_status.process = self.name()
    self._ckpt_write(RunnerCkpt(process_status=process_status))
    # Only advance the sequence number once the write has gone through.
    self._seq += 1

  def _write_initial_update(self):
    """Write the FORKED record that the child looks for in _wait_for_control()."""
    self._write_process_update(state=ProcessState.FORKED,
                               fork_time=self._fork_time,
                               coordinator_pid=self._pid)

  def cmdline(self):
    """cmdline of the process, as passed to the constructor"""
    return self._cmdline

  def name(self):
    """name of the process, as passed to the constructor"""
    return self._name

  def pid(self):
    """pid of the coordinator"""
    return self._pid

  def rebind(self, pid, fork_time):
    """rebind Process to an existing coordinator pid without forking"""
    self._pid = pid
    self._fork_time = fork_time

  def ckpt_file(self):
    """Path of the process checkpoint file, as synthesized by the pathspec."""
    return self._pathspec.getpath('process_checkpoint')

  def process_logdir(self):
    """Path of the process log directory, as synthesized by the pathspec."""
    return self._pathspec.getpath('process_logdir')

  def _setup_ckpt(self):
    """Set up the checkpoint: must be run on the parent.

    Locks the checkpoint file, records its current size in self._ckpt_head and installs a
    synced ThriftRecordWriter positioned at that offset.  Raises CheckpointError if the file
    cannot be locked.
    """
    self._log('initializing checkpoint file: %s' % self.ckpt_file())
    ckpt_fp = lock_file(self.ckpt_file(), "a+")
    if ckpt_fp in (None, False):
      raise self.CheckpointError('Could not acquire checkpoint permission or lock for %s!' %
        self.ckpt_file())
    self._ckpt_head = os.path.getsize(self.ckpt_file())
    ckpt_fp.seek(self._ckpt_head)
    self._ckpt = ThriftRecordWriter(ckpt_fp)
    self._ckpt.set_sync(True)

  def _init_ckpt_if_necessary(self):
    """Open the checkpoint stream unless one is already open."""
    if self._ckpt is None:
      self._setup_ckpt()

  def _wait_for_control(self):
    """Wait for control of the checkpoint stream: must be run in the child.

    Polls the checkpoint file from the offset recorded in self._ckpt_head until a record can be
    read.  Returns True and adopts the record's sequence number when it is this process's own
    FORKED record.  Raises CheckpointError if the record has no process status, belongs to
    someone else, or no record shows up within MAXIMUM_CONTROL_WAIT.
    """
    total_wait_time = Amount(0, Time.SECONDS)

    with open(self.ckpt_file(), 'r') as fp:
      fp.seek(self._ckpt_head)
      rr = ThriftRecordReader(fp, RunnerCkpt)
      while total_wait_time < self.MAXIMUM_CONTROL_WAIT:
        ckpt_tail = os.path.getsize(self.ckpt_file())
        # Only attempt a read once the file has grown past the offset the parent recorded.
        checkpoint = rr.try_read() if ckpt_tail != self._ckpt_head else None
        if not checkpoint:
          # Either the file has not grown or try_read() produced nothing.  Back off and charge
          # the wait against the timeout in both cases, so the loop can neither busy-spin nor
          # run forever.
          self._platform.clock().sleep(self.CONTROL_WAIT_CHECK_INTERVAL.as_(Time.SECONDS))
          total_wait_time += self.CONTROL_WAIT_CHECK_INTERVAL
          continue
        if not checkpoint.process_status:
          raise self.CheckpointError('No process status in checkpoint!')
        if (checkpoint.process_status.process != self.name() or
            checkpoint.process_status.state != ProcessState.FORKED or
            checkpoint.process_status.fork_time != self._fork_time or
            checkpoint.process_status.coordinator_pid != self._pid):
          self._log('Losing control of the checkpoint stream:')
          self._log('   fork_time [%s] vs self._fork_time [%s]' % (
              checkpoint.process_status.fork_time, self._fork_time))
          self._log('   coordinator_pid [%s] vs self._pid [%s]' % (
              checkpoint.process_status.coordinator_pid, self._pid))
          raise self.CheckpointError('Lost control of the checkpoint stream!')
        self._log('Taking control of the checkpoint stream at record: %s' %
          checkpoint.process_status)
        self._seq = checkpoint.process_status.seq + 1
        return True
    raise self.CheckpointError('Timed out waiting for checkpoint stream!')

  def _prepare_fork(self):
    """Validate the target user, stamp the fork time and open the checkpoint before forking.

    Raises UnknownUserError (from _getpwuid), PermissionError when a different user is
    requested without an effective uid of 0, and CheckpointError (from _setup_ckpt).
    """
    user, current_user = self._getpwuid()
    if self._user:
      if user != current_user and os.geteuid() != 0:
        raise self.PermissionError('Must be root to run processes as other users!')
    self._fork_time = self._platform.clock().time()
    self._setup_ckpt()
    # Since the forked process is responsible for creating log files, it needs to own the log dir.
    safe_mkdir(self.process_logdir())
    os.chown(self.process_logdir(), user.pw_uid, user.pw_gid)

  def _finalize_fork(self):
    """Parent side of the fork: write the FORKED record, then close and drop the writer.

    Resetting self._ckpt to None makes a later _ckpt_write() reopen the stream.
    """
    self._write_initial_update()
    self._ckpt.close()
    self._ckpt = None

  def start(self):
    """
      This is the main call point from the runner, and forks a co-ordinator process to run the
      target process (i.e. self.cmdline())

      The parent returns immediately and populates information about the pid of the co-ordinator.
      The child (co-ordinator) will launch the target process in a subprocess.
    """
    self._prepare_fork()  # can raise CheckpointError (via _setup_ckpt),
                          # UnknownUserError (via _getpwuid) and
                          # PermissionError
    self._pid = self._platform.fork()
    if self._pid == 0:
      self._pid = self._platform.getpid()
      self._wait_for_control()  # can raise CheckpointError
      try:
        self.execute()
      except Exception as e:
        self._log('Error trying to execute %s: %s' % (self._name, e))
        # Bare raise keeps the original traceback intact.
        raise
      finally:
        self._ckpt.close()
        self.finish()
    else:
      self._finalize_fork()  # can raise CheckpointError

  def execute(self):
    """Run the target process; called in the forked child.  Subclasses must implement this."""
    raise NotImplementedError

  def finish(self):
    """Hook called in the child after execute() returns or raises; no-op by default."""
    pass
Пример #16
0
class ProcessBase(object):
    """
    Encapsulate a running process for a task.

    start() forks a coordinator through the supplied platform.  The parent records the fork in
    the process checkpoint stream; the child waits until it can read that record back before
    calling execute().  Subclasses implement execute() and may override finish().
    """
    class Error(Exception):
        pass

    class UnknownUserError(Error):
        pass

    class CheckpointError(Error):
        pass

    class UnspecifiedSandbox(Error):
        pass

    class PermissionError(Error):
        pass

    # Sleep between polls of the checkpoint file in _wait_for_control().
    CONTROL_WAIT_CHECK_INTERVAL = Amount(100, Time.MILLISECONDS)
    # Total time _wait_for_control() may spend polling before it gives up.
    MAXIMUM_CONTROL_WAIT = Amount(1, Time.MINUTES)

    def __init__(self,
                 name,
                 cmdline,
                 sequence,
                 pathspec,
                 sandbox_dir,
                 user=None,
                 platform=None,
                 logger_destination=LoggerDestination.FILE,
                 logger_mode=LoggerMode.STANDARD,
                 rotate_log_size=None,
                 rotate_log_backups=None):
        """
        required:
          name        = name of the process
          cmdline     = cmdline of the process
          sequence    = the next available sequence number for state updates
          pathspec    = TaskPath object for synthesizing path names
          sandbox_dir = the sandbox in which to run the process
          platform    = Platform providing fork, clock, getpid

        optional:
          user               = the user to run as (if unspecified, will default to current user.)
                               if specified to a user that is not the current user, you must have
                               root access
          logger_destination = The destination for logs output.
          logger_mode        = The type of logger to use for the process.
          rotate_log_size    = The maximum size of the rotated stdout/stderr logs.
          rotate_log_backups = The maximum number of rotated stdout/stderr log backups.

        raises:
          ValueError if platform is None, if the logger destination or mode is invalid, or if
          logger_mode is ROTATE and the rotation size/backups are missing or not positive.
        """
        self._name = name
        self._cmdline = cmdline
        self._pathspec = pathspec
        self._seq = sequence
        self._sandbox = sandbox_dir
        if self._sandbox:
            safe_mkdir(self._sandbox)
        self._pid = None
        self._fork_time = None
        self._user = user
        self._ckpt = None
        # -1 until _setup_ckpt() records the real size of the checkpoint file.
        self._ckpt_head = -1
        if platform is None:
            raise ValueError("Platform must be specified")
        self._platform = platform
        self._logger_destination = logger_destination
        self._logger_mode = logger_mode
        self._rotate_log_size = rotate_log_size
        self._rotate_log_backups = rotate_log_backups

        if not LoggerDestination.is_valid(self._logger_destination):
            raise ValueError("Logger destination %s is invalid." %
                             self._logger_destination)

        if not LoggerMode.is_valid(self._logger_mode):
            raise ValueError("Logger mode %s is invalid." % self._logger_mode)

        if self._logger_mode == LoggerMode.ROTATE:
            # Both rotation settings default to None.  Reject that explicitly so the caller
            # gets the same ValueError as for any other bad setting instead of an
            # AttributeError/TypeError.
            if (self._rotate_log_size is None
                    or self._rotate_log_size.as_(Data.BYTES) <= 0):
                raise ValueError('Log size cannot be less than one byte.')
            if (self._rotate_log_backups is None
                    or self._rotate_log_backups <= 0):
                raise ValueError('Log backups cannot be less than one.')

    def _log(self, msg, exc_info=None):
        """Emit a debug log line prefixed with this process's pid and name."""
        log.debug('[process:%5s=%s]: %s' % (self._pid, self.name(), msg),
                  exc_info=exc_info)

    def _getpwuid(self):
        """Returns a tuple of the user (i.e. --user) and current user.

        Both elements are pwd entries.  Raises UnknownUserError if either lookup fails.
        """
        uid = os.getuid()
        try:
            current_user = pwd.getpwuid(uid)
        except KeyError:
            raise self.UnknownUserError('Unknown uid %s!' % uid)
        try:
            user = pwd.getpwnam(
                self._user) if self._user is not None else current_user
        except KeyError:
            raise self.UnknownUserError('Unable to get pwent information!')
        return user, current_user

    def _ckpt_write(self, msg):
        """Write msg to the checkpoint stream, opening the stream first if it is not open."""
        self._init_ckpt_if_necessary()
        self._log("child state transition [%s] <= %s" %
                  (self.ckpt_file(), msg))
        self._ckpt.write(msg)

    def _write_process_update(self, **kw):
        """Write a process update to the coordinator's checkpoint stream."""
        process_status = ProcessStatus(**kw)
        process_status.seq = self._seq
        process_status.process = self.name()
        self._ckpt_write(RunnerCkpt(process_status=process_status))
        # Only advance the sequence number once the write has gone through.
        self._seq += 1

    def _write_initial_update(self):
        """Write the FORKED record that the child looks for in _wait_for_control()."""
        self._write_process_update(state=ProcessState.FORKED,
                                   fork_time=self._fork_time,
                                   coordinator_pid=self._pid)

    def cmdline(self):
        """cmdline of the process, as passed to the constructor"""
        return self._cmdline

    def name(self):
        """name of the process, as passed to the constructor"""
        return self._name

    def pid(self):
        """pid of the coordinator"""
        return self._pid

    def rebind(self, pid, fork_time):
        """rebind Process to an existing coordinator pid without forking"""
        self._pid = pid
        self._fork_time = fork_time

    def ckpt_file(self):
        """Path of the process checkpoint file, as synthesized by the pathspec."""
        return self._pathspec.getpath('process_checkpoint')

    def process_logdir(self):
        """Path of the process log directory, as synthesized by the pathspec."""
        return self._pathspec.getpath('process_logdir')

    def _setup_ckpt(self):
        """Set up the checkpoint: must be run on the parent.

        Locks the checkpoint file, records its current size in self._ckpt_head and installs a
        synced ThriftRecordWriter positioned at that offset.  Raises CheckpointError if the
        file cannot be locked.
        """
        self._log('initializing checkpoint file: %s' % self.ckpt_file())
        ckpt_fp = lock_file(self.ckpt_file(), "a+")
        if ckpt_fp in (None, False):
            raise self.CheckpointError(
                'Could not acquire checkpoint permission or lock for %s!' %
                self.ckpt_file())
        self._ckpt_head = os.path.getsize(self.ckpt_file())
        ckpt_fp.seek(self._ckpt_head)
        self._ckpt = ThriftRecordWriter(ckpt_fp)
        self._ckpt.set_sync(True)

    def _init_ckpt_if_necessary(self):
        """Open the checkpoint stream unless one is already open."""
        if self._ckpt is None:
            self._setup_ckpt()

    def _wait_for_control(self):
        """Wait for control of the checkpoint stream: must be run in the child.

        Polls the checkpoint file from the offset recorded in self._ckpt_head until a record
        can be read.  Returns True and adopts the record's sequence number when it is this
        process's own FORKED record.  Raises CheckpointError if the record has no process
        status, belongs to someone else, or no record shows up within MAXIMUM_CONTROL_WAIT.
        """
        total_wait_time = Amount(0, Time.SECONDS)

        with open(self.ckpt_file(), 'r') as fp:
            fp.seek(self._ckpt_head)
            rr = ThriftRecordReader(fp, RunnerCkpt)
            while total_wait_time < self.MAXIMUM_CONTROL_WAIT:
                ckpt_tail = os.path.getsize(self.ckpt_file())
                # Only attempt a read once the file has grown past the offset the parent
                # recorded.
                if ckpt_tail != self._ckpt_head:
                    checkpoint = rr.try_read()
                else:
                    checkpoint = None
                if not checkpoint:
                    # Either the file has not grown or try_read() produced nothing.  Back off
                    # and charge the wait against the timeout in both cases, so the loop can
                    # neither busy-spin nor run forever.
                    self._platform.clock().sleep(
                        self.CONTROL_WAIT_CHECK_INTERVAL.as_(Time.SECONDS))
                    total_wait_time += self.CONTROL_WAIT_CHECK_INTERVAL
                    continue
                if not checkpoint.process_status:
                    raise self.CheckpointError(
                        'No process status in checkpoint!')
                if (checkpoint.process_status.process != self.name()
                        or checkpoint.process_status.state !=
                        ProcessState.FORKED
                        or checkpoint.process_status.fork_time !=
                        self._fork_time
                        or checkpoint.process_status.coordinator_pid !=
                        self._pid):
                    self._log('Losing control of the checkpoint stream:')
                    self._log('   fork_time [%s] vs self._fork_time [%s]' %
                              (checkpoint.process_status.fork_time,
                               self._fork_time))
                    self._log('   coordinator_pid [%s] vs self._pid [%s]' %
                              (checkpoint.process_status.coordinator_pid,
                               self._pid))
                    raise self.CheckpointError(
                        'Lost control of the checkpoint stream!')
                self._log(
                    'Taking control of the checkpoint stream at record: %s'
                    % checkpoint.process_status)
                self._seq = checkpoint.process_status.seq + 1
                return True
        raise self.CheckpointError('Timed out waiting for checkpoint stream!')

    def _prepare_fork(self):
        """Validate the target user, stamp the fork time and open the checkpoint before forking.

        Raises UnknownUserError (from _getpwuid), PermissionError when a different user is
        requested without an effective uid of 0, and CheckpointError (from _setup_ckpt).
        """
        user, current_user = self._getpwuid()
        if self._user:
            if user != current_user and os.geteuid() != 0:
                raise self.PermissionError(
                    'Must be root to run processes as other users!')
        self._fork_time = self._platform.clock().time()
        self._setup_ckpt()
        # Since the forked process is responsible for creating log files, it needs to own the log dir.
        safe_mkdir(self.process_logdir())
        os.chown(self.process_logdir(), user.pw_uid, user.pw_gid)

    def _finalize_fork(self):
        """Parent side of the fork: write the FORKED record, then close and drop the writer.

        Resetting self._ckpt to None makes a later _ckpt_write() reopen the stream.
        """
        self._write_initial_update()
        self._ckpt.close()
        self._ckpt = None

    def start(self):
        """
        This is the main call point from the runner, and forks a co-ordinator process to run the
        target process (i.e. self.cmdline())

        The parent returns immediately and populates information about the pid of the
        co-ordinator.  The child (co-ordinator) will launch the target process in a subprocess.
        """
        # _prepare_fork can raise CheckpointError (via _setup_ckpt), UnknownUserError
        # (via _getpwuid) and PermissionError.
        self._prepare_fork()
        self._pid = self._platform.fork()
        if self._pid == 0:
            self._pid = self._platform.getpid()
            self._wait_for_control()  # can raise CheckpointError
            try:
                self.execute()
            except Exception as e:
                self._log('Error trying to execute %s: %s' % (self._name, e))
                # Bare raise keeps the original traceback intact.
                raise
            finally:
                self._ckpt.close()
                self.finish()
        else:
            self._finalize_fork()  # can raise CheckpointError

    def execute(self):
        """Run the target process; called in the forked child.  Subclasses must implement this."""
        raise NotImplementedError

    def finish(self):
        """Hook called in the child after execute() returns or raises; no-op by default."""
        pass