Пример #1
0
def proxy_main(args, opts):
  """Build a TaskRunner from the parsed options and run it.

  The options must carry an existing ``thermos_json`` path, a ``sandbox`` and
  a ``checkpoint_root``.  Ports declared by the task but absent from
  ``opts.prebound_ports`` are reported through ``app.error``, as are the
  runner's InternalError, InvalidTask and StateError exceptions.  SIGUSR1,
  SIGUSR2 and KeyboardInterrupt all route to ``runner_teardown``.

  ``args`` is accepted but not used by this function.
  """
  assert opts.thermos_json and os.path.exists(opts.thermos_json)
  assert opts.sandbox
  assert opts.checkpoint_root

  thermos_task = get_task_from_options(opts)
  portmap = opts.prebound_ports

  # Every port the task declares must already have a binding.
  unbound = set(thermos_task.ports()).difference(portmap)
  if unbound:
    app.error('ERROR!  Unbound ports: %s' % ' '.join(unbound))

  runner_options = dict(
      task_id=opts.task_id,
      user=opts.setuid,
      portmap=portmap,
      chroot=opts.chroot,
      planner_class=CappedTaskPlanner,
  )
  task_runner = TaskRunner(
      thermos_task.task, opts.checkpoint_root, opts.sandbox, **runner_options)

  # Both user signals share one teardown handler bound to this runner.
  teardown_handler = functools.partial(runner_teardown, task_runner)
  signal.signal(signal.SIGUSR1, teardown_handler)
  signal.signal(signal.SIGUSR2, teardown_handler)

  try:
    task_runner.run()
  except TaskRunner.InternalError as exc:
    app.error('Internal error: %s' % exc)
  except TaskRunner.InvalidTask as exc:
    app.error(str(exc))
  except TaskRunner.StateError:
    app.error('Task appears to already be in a terminal state.')
  except KeyboardInterrupt:
    runner_teardown(task_runner)
Пример #2
0
def make_executor(
    proxy_driver,
    checkpoint_root,
    task,
    ports=None,
    fast_status=False,
    runner_class=ThermosTaskRunner,
    status_providers=None,
    assert_task_is_running=True,
    stop_timeout_in_secs=120):
  """Launch `task` on a FastThermosExecutor and wait for its first two status updates.

  Args:
    proxy_driver: driver handed to the executor; its recorded
      `method_calls['sendStatusUpdate']` entries are inspected here.
    checkpoint_root: checkpoint root given to `make_provider` and `TaskRunner.get`.
    task: task passed to `make_task`.
    ports: mapping forwarded to `make_task` as `assigned_ports`; defaults to a
      fresh empty dict on every call.
    fast_status: when true, use FastStatusManager instead of StatusManager.
    runner_class: runner class given to `make_provider`.
    status_providers: providers forwarded to the executor; defaults to a fresh
      `[HealthCheckerProvider()]` on every call.
    assert_task_is_running: when true, assert the second update is TASK_RUNNING
      and poll until `TaskRunner.get` returns a runner.
    stop_timeout_in_secs: forwarded to the executor.

  Returns:
    A `(runner, executor)` tuple; `runner` is None when
    `assert_task_is_running` is false.
  """
  # Defaults are created per call: a `{}` / `[...]` literal in the signature
  # is evaluated once at definition time and would be shared by every caller.
  if ports is None:
    ports = {}
  if status_providers is None:
    status_providers = [HealthCheckerProvider()]

  status_manager_class = FastStatusManager if fast_status else StatusManager
  runner_provider = make_provider(checkpoint_root, runner_class)
  te = FastThermosExecutor(
      runner_provider=runner_provider,
      status_manager_class=status_manager_class,
      sandbox_provider=DefaultTestSandboxProvider(),
      status_providers=status_providers,
      stop_timeout_in_secs=stop_timeout_in_secs
  )

  ExecutorTimeout(te.launched, proxy_driver, timeout=Amount(100, Time.MILLISECONDS)).start()
  task_description = make_task(task, assigned_ports=ports, instanceId=0)
  te.launchTask(proxy_driver, task_description)

  te.status_manager_started.wait()

  # Poll until the driver has recorded two status updates.
  while len(proxy_driver.method_calls['sendStatusUpdate']) < 2:
    time.sleep(0.1)

  # make sure startup was kosher
  updates = proxy_driver.method_calls['sendStatusUpdate']
  assert len(updates) == 2
  # Each recorded call is indexed [0][0] to reach the status update object.
  status_updates = [arg_tuple[0][0] for arg_tuple in updates]
  assert status_updates[0].state == mesos_pb2.TASK_STARTING

  runner = None
  if assert_task_is_running:
    assert status_updates[1].state == mesos_pb2.TASK_RUNNING
    # wait for the runner to bind to a task
    while True:
      runner = TaskRunner.get(task_description.task_id.value, checkpoint_root)
      if runner:
        break
      time.sleep(0.1)

  assert te.launched.is_set()
  return runner, te
def make_executor(
    proxy_driver,
    checkpoint_root,
    task,
    ports=None,
    fast_status=False,
    runner_class=ThermosTaskRunner,
    status_providers=()):
  """Launch `task` on a FastThermosExecutor and return once it is running.

  Args:
    proxy_driver: driver handed to the executor; its recorded
      `method_calls['sendStatusUpdate']` entries are inspected here.
    checkpoint_root: checkpoint root given to `make_provider` and `TaskRunner.get`.
    task: task passed to `make_task`.
    ports: mapping forwarded to `make_task` as `assigned_ports`; defaults to a
      fresh empty dict on every call.
    fast_status: when true, use FastStatusManager instead of StatusManager.
    runner_class: runner class given to `make_provider`.
    status_providers: providers forwarded to the executor.

  Returns:
    A `(runner, executor)` tuple.
  """
  # A `{}` default in the signature is evaluated once at definition time and
  # would be shared by every caller, so the default dict is built per call.
  if ports is None:
    ports = {}

  status_manager_class = FastStatusManager if fast_status else StatusManager
  runner_provider = make_provider(checkpoint_root, runner_class)
  te = FastThermosExecutor(
      runner_provider=runner_provider,
      status_manager_class=status_manager_class,
      sandbox_provider=DefaultTestSandboxProvider,
      status_providers=status_providers,
  )

  ExecutorTimeout(te.launched, proxy_driver, timeout=Amount(100, Time.MILLISECONDS)).start()
  task_description = make_task(task, assigned_ports=ports, instanceId=0)
  te.launchTask(proxy_driver, task_description)

  te.status_manager_started.wait()
  # The kill manager and every chained status checker must expose an
  # '<name>.enabled' metric in the sampled metrics.
  sampled_metrics = te.metrics.sample()
  assert 'kill_manager.enabled' in sampled_metrics
  for checker in te._chained_checker._status_checkers:  # hacky
    assert ('%s.enabled' % checker.name()) in sampled_metrics

  # Poll until the driver has recorded two status updates.
  while len(proxy_driver.method_calls['sendStatusUpdate']) < 2:
    time.sleep(0.1)

  # make sure startup was kosher
  updates = proxy_driver.method_calls['sendStatusUpdate']
  assert len(updates) == 2
  # Each recorded call is indexed [0][0] to reach the status update object.
  status_updates = [arg_tuple[0][0] for arg_tuple in updates]
  assert status_updates[0].state == mesos_pb2.TASK_STARTING
  assert status_updates[1].state == mesos_pb2.TASK_RUNNING

  # wait for the runner to bind to a task
  while True:
    runner = TaskRunner.get(task_description.task_id.value, checkpoint_root)
    if runner:
      break
    time.sleep(0.1)

  assert te.launched.is_set()
  return runner, te
Пример #4
0
  def test_coordinator_dead_kill(self):
    """SIGKILL runner, coordinator and process; a forced kill must then record LOST."""
    runner = self.start_runner()
    monitor = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(monitor)

    active = monitor.get_active_processes()
    process_state, run_number = active[0]
    assert process_state.process == 'ignorant_process'
    assert run_number == 0

    # Order matters: the runner goes first, then the coordinator, then the process.
    for pid in (runner.po.pid, process_state.coordinator_pid, process_state.pid):
      os.kill(pid, signal.SIGKILL)

    killer = TaskRunner.get(runner.task_id, runner.root)
    assert killer is not None
    killer.kill(force=True)

    # Exactly one run of the process should exist, and it should be LOST.
    runs = monitor.get_state().processes['ignorant_process']
    assert len(runs) == 1
    assert runs[0].state == ProcessState.LOST
Пример #5
0
    def test_coordinator_dead_kill(self):
        """SIGKILL runner, coordinator and process; a forced kill must
        then record the process as LOST."""
        runner = self.start_runner()
        monitor = TaskMonitor(runner.tempdir, runner.task_id)
        self.wait_until_running(monitor)

        active = monitor.get_active_processes()
        process_state, run_number = active[0]
        assert process_state.process == 'ignorant_process'
        assert run_number == 0

        # Order matters: runner first, then coordinator, then the process.
        doomed_pids = (
            runner.po.pid,
            process_state.coordinator_pid,
            process_state.pid,
        )
        for pid in doomed_pids:
            os.kill(pid, signal.SIGKILL)

        killer = TaskRunner.get(runner.task_id, runner.root)
        assert killer is not None
        killer.kill(force=True)

        # Exactly one run of the process should exist, and it should be LOST.
        runs = monitor.get_state().processes['ignorant_process']
        assert len(runs) == 1
        assert runs[0].state == ProcessState.LOST
Пример #6
0
def _really_run(task, root, sandbox, task_id=None, user=None, prebound_ports=None, chroot=None,
                daemon=False):
  """Create a TaskRunner for `task` and run it, optionally daemonizing first.

  A falsy `prebound_ports` is treated as an empty mapping.  Ports declared by
  the task but missing from that mapping are reported through `app.error`.
  When `daemon` is true the process daemonizes before running; a failure to
  do so prints a message and exits with status 1.  A KeyboardInterrupt during
  the run closes the runner's checkpoint and kills the task.
  """
  if not prebound_ports:
    prebound_ports = {}

  unbound = set(task.ports()).difference(prebound_ports.keys())
  if unbound:
    app.error('ERROR!  Unbound ports: %s' % ' '.join(unbound))

  task_runner = TaskRunner(
      task.task,
      root,
      sandbox,
      task_id=task_id,
      user=user,
      portmap=prebound_ports,
      chroot=chroot)

  if daemon:
    print('Daemonizing and starting runner.')
    try:
      # stderr logging is torn down before the process daemonizes.
      log.teardown_stderr_logging()
      daemonize()
    except Exception as exc:
      print("Failed to daemonize: %s" % exc)
      sys.exit(1)

  try:
    task_runner.run()
  except KeyboardInterrupt:
    print('Got keyboard interrupt, killing job!')
    task_runner.close_ckpt()
    task_runner.kill()
Пример #7
0
  def test_preemption_wait(self):
    """A forced kill with a one-second preemption wait takes about a second and ends KILLED."""
    runner = self.start_runner()
    monitor = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(monitor)

    active = monitor.get_active_processes()
    process_state, run_number = active[0]
    assert process_state.process == 'ignorant_process'
    assert run_number == 0

    preempter = TaskRunner.get(runner.task_id, runner.root)
    assert preempter is not None

    # Time the kill call itself.
    started = time.time()
    preempter.kill(force=True, preemption_wait=Amount(1, Time.SECONDS))
    elapsed = time.time() - started

    # This is arbitrary, but make sure we finish within half a second of
    # requested preemption wait.
    assert abs(elapsed - 1.0) < 0.5

    last_status = preempter.state.statuses[-1]
    assert last_status.state == TaskState.KILLED
    last_run = preempter.state.processes['ignorant_process'][-1]
    assert last_run.state == ProcessState.KILLED
Пример #8
0
    def test_preemption_wait(self):
        """A forced kill with a one-second preemption wait takes about a
        second and leaves both task and process KILLED."""
        runner = self.start_runner()
        monitor = TaskMonitor(runner.tempdir, runner.task_id)
        self.wait_until_running(monitor)

        active = monitor.get_active_processes()
        process_state, run_number = active[0]
        assert process_state.process == 'ignorant_process'
        assert run_number == 0

        preempter = TaskRunner.get(runner.task_id, runner.root)
        assert preempter is not None

        # Time the kill call itself.
        started = time.time()
        preempter.kill(force=True, preemption_wait=Amount(1, Time.SECONDS))
        elapsed = time.time() - started

        # This is arbitrary, but make sure we finish within half a second of
        # requested preemption wait.
        assert abs(elapsed - 1.0) < 0.5

        last_status = preempter.state.statuses[-1]
        assert last_status.state == TaskState.KILLED
        last_run = preempter.state.processes['ignorant_process'][-1]
        assert last_run.state == ProcessState.KILLED
Пример #9
0
def proxy_main(args, opts):
  """Build a TaskRunner from the parsed options, run it, and map failures to exit codes.

  The options must carry an existing ``thermos_json`` path, a ``sandbox`` and
  a ``checkpoint_root``.  Exit codes used:

    INTERNAL_ERROR  unbound ports, or TaskRunner.InternalError
    UNKNOWN_USER    user missing from the password database, or
                    Process.UnknownUserError raised during the run
    INVALID_TASK    TaskRunner.InvalidTask
    TERMINAL_TASK   TaskRunner.StateError
    UNKNOWN_ERROR   any other Exception (its traceback is logged line by line)

  SIGUSR1, SIGUSR2 and KeyboardInterrupt all route to ``runner_teardown``.
  ``args`` is accepted but not used by this function.
  """
  assert opts.thermos_json and os.path.exists(opts.thermos_json)
  assert opts.sandbox
  assert opts.checkpoint_root

  thermos_task = get_task_from_options(opts)
  portmap = opts.prebound_ports

  # Every port the task declares must already have a binding.
  unbound = set(thermos_task.ports()).difference(portmap)
  if unbound:
    log.error('ERROR!  Unbound ports: %s' % ' '.join(unbound))
    sys.exit(INTERNAL_ERROR)

  # Fall back to the invoking user when no setuid target was requested.
  user = opts.setuid or getpass.getuser()

  # if we cannot get the uid, this is an unknown user and we should fail
  try:
    pwd.getpwnam(user).pw_uid
  except KeyError:
    log.error('Unknown user: %s' % user)
    sys.exit(UNKNOWN_USER)

  # Note: the runner receives opts.setuid as given, not the resolved `user`.
  runner_options = dict(
      task_id=opts.task_id,
      user=opts.setuid,
      portmap=portmap,
      chroot=opts.chroot,
      planner_class=CappedTaskPlanner,
  )
  task_runner = TaskRunner(
      thermos_task.task, opts.checkpoint_root, opts.sandbox, **runner_options)

  # Both user signals share one teardown handler bound to this runner.
  teardown_handler = functools.partial(runner_teardown, task_runner)
  signal.signal(signal.SIGUSR1, teardown_handler)
  signal.signal(signal.SIGUSR2, teardown_handler)

  try:
    task_runner.run()
  except TaskRunner.InternalError as exc:
    log.error('Internal error: %s' % exc)
    sys.exit(INTERNAL_ERROR)
  except TaskRunner.InvalidTask as exc:
    log.error('Invalid task: %s' % exc)
    sys.exit(INVALID_TASK)
  except TaskRunner.StateError as exc:
    log.error('Checkpoint error: %s' % exc)
    sys.exit(TERMINAL_TASK)
  except Process.UnknownUserError as exc:
    log.error('User ceased to exist: %s' % exc)
    sys.exit(UNKNOWN_USER)
  except KeyboardInterrupt:
    log.info('Caught ^C, tearing down runner.')
    runner_teardown(task_runner)
  except Exception as exc:
    log.error('Unknown exception: %s' % exc)
    for tb_line in traceback.format_exc().splitlines():
      log.error(tb_line)
    sys.exit(UNKNOWN_ERROR)
Пример #10
0
def proxy_main(args, opts):
    """Build a TaskRunner from the parsed options and run it.

    The options must carry an existing ``thermos_json`` path, a
    ``sandbox`` and a ``checkpoint_root``.  Exit codes used:

      INTERNAL_ERROR  unbound ports, or TaskRunner.InternalError
      UNKNOWN_USER    user missing from the password database, or
                      Process.UnknownUserError raised during the run
      INVALID_TASK    TaskRunner.InvalidTask
      TERMINAL_TASK   TaskRunner.StateError
      UNKNOWN_ERROR   any other Exception (traceback logged per line)

    SIGUSR1, SIGUSR2 and KeyboardInterrupt all route to
    ``runner_teardown``.  ``args`` is accepted but not used here.
    """
    assert opts.thermos_json and os.path.exists(opts.thermos_json)
    assert opts.sandbox
    assert opts.checkpoint_root

    thermos_task = get_task_from_options(opts)
    portmap = opts.prebound_ports

    # Every port the task declares must already have a binding.
    unbound = set(thermos_task.ports()).difference(portmap)
    if unbound:
        log.error('ERROR!  Unbound ports: %s' % ' '.join(unbound))
        sys.exit(INTERNAL_ERROR)

    # Fall back to the invoking user when no setuid target was requested.
    user = opts.setuid or getpass.getuser()

    # if we cannot get the uid, this is an unknown user and we should fail
    try:
        pwd.getpwnam(user).pw_uid
    except KeyError:
        log.error('Unknown user: %s' % user)
        sys.exit(UNKNOWN_USER)

    # Note: the runner receives opts.setuid as given, not the resolved
    # `user`.
    runner_options = dict(
        task_id=opts.task_id,
        user=opts.setuid,
        portmap=portmap,
        chroot=opts.chroot,
        planner_class=CappedTaskPlanner,
        hostname=opts.hostname,
        process_logger_destination=opts.process_logger_destination,
        process_logger_mode=opts.process_logger_mode,
        rotate_log_size_mb=opts.rotate_log_size_mb,
        rotate_log_backups=opts.rotate_log_backups,
        preserve_env=opts.preserve_env,
        mesos_containerizer_path=opts.mesos_containerizer_path,
        container_sandbox=opts.container_sandbox,
    )
    task_runner = TaskRunner(
        thermos_task.task,
        opts.checkpoint_root,
        opts.sandbox,
        **runner_options)

    # Both user signals share one teardown handler bound to this runner.
    teardown_handler = functools.partial(runner_teardown, task_runner)
    signal.signal(signal.SIGUSR1, teardown_handler)
    signal.signal(signal.SIGUSR2, teardown_handler)

    try:
        task_runner.run()
    except TaskRunner.InternalError as exc:
        log.error('Internal error: %s' % exc)
        sys.exit(INTERNAL_ERROR)
    except TaskRunner.InvalidTask as exc:
        log.error('Invalid task: %s' % exc)
        sys.exit(INVALID_TASK)
    except TaskRunner.StateError as exc:
        log.error('Checkpoint error: %s' % exc)
        sys.exit(TERMINAL_TASK)
    except Process.UnknownUserError as exc:
        log.error('User ceased to exist: %s' % exc)
        sys.exit(UNKNOWN_USER)
    except KeyboardInterrupt:
        log.info('Caught ^C, tearing down runner.')
        runner_teardown(task_runner)
    except Exception as exc:
        log.error('Unknown exception: %s' % exc)
        for tb_line in traceback.format_exc().splitlines():
            log.error(tb_line)
        sys.exit(UNKNOWN_ERROR)