def proxy_main(args, opts):
    """Build a TaskRunner from the parsed options and run it.

    Args:
        args: positional command-line arguments; not used by this function.
        opts: parsed options object. ``thermos_json`` (an existing path),
            ``sandbox`` and ``checkpoint_root`` must be set; ``prebound_ports``,
            ``task_id``, ``setuid`` and ``chroot`` are read as well.

    Every failure is reported through ``app.error``. A KeyboardInterrupt
    tears the runner down via ``runner_teardown``.
    """
    assert opts.thermos_json and os.path.exists(opts.thermos_json)
    assert opts.sandbox
    assert opts.checkpoint_root

    thermos_task = get_task_from_options(opts)
    portmap = opts.prebound_ports

    # Every port the task asks for must already have a binding.
    unbound = set(thermos_task.ports()).difference(portmap)
    if unbound:
        app.error('ERROR! Unbound ports: %s' % ' '.join(unbound))

    task_runner = TaskRunner(
        thermos_task.task,
        opts.checkpoint_root,
        opts.sandbox,
        task_id=opts.task_id,
        user=opts.setuid,
        portmap=portmap,
        chroot=opts.chroot,
        planner_class=CappedTaskPlanner,
    )

    # SIGUSR1 and SIGUSR2 both trigger a teardown of this runner.
    on_signal = functools.partial(runner_teardown, task_runner)
    for signum in (signal.SIGUSR1, signal.SIGUSR2):
        signal.signal(signum, on_signal)

    try:
        task_runner.run()
    except TaskRunner.InternalError as internal_err:
        app.error('Internal error: %s' % internal_err)
    except TaskRunner.InvalidTask as invalid_err:
        app.error(str(invalid_err))
    except TaskRunner.StateError:
        app.error('Task appears to already be in a terminal state.')
    except KeyboardInterrupt:
        runner_teardown(task_runner)
def make_executor(
        proxy_driver,
        checkpoint_root,
        task,
        ports=None,
        fast_status=False,
        runner_class=ThermosTaskRunner,
        status_providers=None,
        assert_task_is_running=True,
        stop_timeout_in_secs=120):
    """Launch ``task`` on a FastThermosExecutor and wait for startup.

    Args:
        proxy_driver: driver whose ``method_calls['sendStatusUpdate']`` record
            is polled for status updates.
        checkpoint_root: checkpoint root given to the runner provider and used
            to look the runner up afterwards.
        task: task passed to ``make_task``.
        ports: assigned ports for the task; defaults to a fresh empty dict.
        fast_status: use FastStatusManager instead of StatusManager.
        runner_class: runner class handed to ``make_provider``.
        status_providers: status providers for the executor; defaults to a
            fresh ``[HealthCheckerProvider()]``.
        assert_task_is_running: when true, assert that the second status
            update is TASK_RUNNING and wait until a TaskRunner can be found.
        stop_timeout_in_secs: forwarded to the executor.

    Returns:
        ``(runner, te)``; ``runner`` is None when ``assert_task_is_running``
        is false.
    """
    # Build the defaults per call. A mutable default in the signature would be
    # created once at definition time and shared by every invocation.
    if ports is None:
        ports = {}
    if status_providers is None:
        status_providers = [HealthCheckerProvider()]

    status_manager_class = FastStatusManager if fast_status else StatusManager
    runner_provider = make_provider(checkpoint_root, runner_class)
    te = FastThermosExecutor(
        runner_provider=runner_provider,
        status_manager_class=status_manager_class,
        sandbox_provider=DefaultTestSandboxProvider(),
        status_providers=status_providers,
        stop_timeout_in_secs=stop_timeout_in_secs,
    )

    ExecutorTimeout(te.launched, proxy_driver, timeout=Amount(100, Time.MILLISECONDS)).start()
    task_description = make_task(task, assigned_ports=ports, instanceId=0)
    te.launchTask(proxy_driver, task_description)

    te.status_manager_started.wait()

    # Poll until the driver has recorded two status updates.
    while len(proxy_driver.method_calls['sendStatusUpdate']) < 2:
        time.sleep(0.1)

    # make sure startup was kosher
    updates = proxy_driver.method_calls['sendStatusUpdate']
    assert len(updates) == 2
    status_updates = [arg_tuple[0][0] for arg_tuple in updates]
    assert status_updates[0].state == mesos_pb2.TASK_STARTING

    runner = None
    if assert_task_is_running:
        assert status_updates[1].state == mesos_pb2.TASK_RUNNING

        # wait for the runner to bind to a task
        while True:
            runner = TaskRunner.get(task_description.task_id.value, checkpoint_root)
            if runner:
                break
            time.sleep(0.1)

    assert te.launched.is_set()
    return runner, te
def make_executor(
        proxy_driver,
        checkpoint_root,
        task,
        ports=None,
        fast_status=False,
        runner_class=ThermosTaskRunner,
        status_providers=()):
    """Launch ``task`` on a FastThermosExecutor and wait until it is running.

    Args:
        proxy_driver: driver whose ``method_calls['sendStatusUpdate']`` record
            is polled for status updates.
        checkpoint_root: checkpoint root given to the runner provider and used
            to look the runner up afterwards.
        task: task passed to ``make_task``.
        ports: assigned ports for the task; defaults to a fresh empty dict.
        fast_status: use FastStatusManager instead of StatusManager.
        runner_class: runner class handed to ``make_provider``.
        status_providers: status providers for the executor.

    Returns:
        ``(runner, te)``: the TaskRunner found for the launched task and the
        executor.
    """
    # Build the dict per call. A ``{}`` default in the signature would be
    # created once at definition time and shared by every invocation.
    if ports is None:
        ports = {}

    status_manager_class = FastStatusManager if fast_status else StatusManager
    runner_provider = make_provider(checkpoint_root, runner_class)
    te = FastThermosExecutor(
        runner_provider=runner_provider,
        status_manager_class=status_manager_class,
        sandbox_provider=DefaultTestSandboxProvider,
        status_providers=status_providers,
    )

    ExecutorTimeout(te.launched, proxy_driver, timeout=Amount(100, Time.MILLISECONDS)).start()
    task_description = make_task(task, assigned_ports=ports, instanceId=0)
    te.launchTask(proxy_driver, task_description)

    te.status_manager_started.wait()

    # The kill manager and every chained status checker must expose an
    # "<name>.enabled" metric.
    sampled_metrics = te.metrics.sample()
    assert 'kill_manager.enabled' in sampled_metrics
    for checker in te._chained_checker._status_checkers:  # hacky
        assert ('%s.enabled' % checker.name()) in sampled_metrics

    # Poll until the driver has recorded two status updates.
    while len(proxy_driver.method_calls['sendStatusUpdate']) < 2:
        time.sleep(0.1)

    # make sure startup was kosher
    updates = proxy_driver.method_calls['sendStatusUpdate']
    assert len(updates) == 2
    status_updates = [arg_tuple[0][0] for arg_tuple in updates]
    assert status_updates[0].state == mesos_pb2.TASK_STARTING
    assert status_updates[1].state == mesos_pb2.TASK_RUNNING

    # wait for the runner to bind to a task
    while True:
        runner = TaskRunner.get(task_description.task_id.value, checkpoint_root)
        if runner:
            break
        time.sleep(0.1)

    assert te.launched.is_set()
    return runner, te
def test_coordinator_dead_kill(self):
    """SIGKILL the runner, coordinator and process, then force-kill the task.

    After a forced kill through a TaskRunner obtained from the checkpoint,
    the single 'ignorant_process' run must be recorded as LOST.
    """
    runner = self.start_runner()
    monitor = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(monitor)

    active = monitor.get_active_processes()
    process_state, run_number = active[0]
    assert process_state.process == 'ignorant_process'
    assert run_number == 0

    # Take down the runner first, then the coordinator, then the process.
    os.kill(runner.po.pid, signal.SIGKILL)
    os.kill(process_state.coordinator_pid, signal.SIGKILL)
    os.kill(process_state.pid, signal.SIGKILL)

    killer = TaskRunner.get(runner.task_id, runner.root)
    assert killer is not None
    killer.kill(force=True)

    runs = monitor.get_state().processes['ignorant_process']
    assert len(runs) == 1
    assert runs[0].state == ProcessState.LOST
def test_coordinator_dead_kill(self):
    """SIGKILL the runner, coordinator and process, then force-kill the task.

    After a forced kill through a TaskRunner obtained from the checkpoint,
    the single 'ignorant_process' run must be recorded as LOST.
    """
    runner = self.start_runner()
    monitor = TaskMonitor(runner.tempdir, runner.task_id)
    self.wait_until_running(monitor)

    active = monitor.get_active_processes()
    process_state, run_number = active[0]
    assert process_state.process == 'ignorant_process'
    assert run_number == 0

    # Take down the runner first, then the coordinator, then the process.
    os.kill(runner.po.pid, signal.SIGKILL)
    os.kill(process_state.coordinator_pid, signal.SIGKILL)
    os.kill(process_state.pid, signal.SIGKILL)

    killer = TaskRunner.get(runner.task_id, runner.root)
    assert killer is not None
    killer.kill(force=True)

    runs = monitor.get_state().processes['ignorant_process']
    assert len(runs) == 1
    assert runs[0].state == ProcessState.LOST
def _really_run(task, root, sandbox, task_id=None, user=None, prebound_ports=None,
                chroot=None, daemon=False):
    """Create a TaskRunner for ``task`` and run it, optionally daemonized.

    Args:
        task: object providing ``ports()`` and a ``task`` attribute.
        root: checkpoint root passed to TaskRunner.
        sandbox: sandbox passed to TaskRunner.
        task_id: forwarded to TaskRunner.
        user: forwarded to TaskRunner.
        prebound_ports: mapping used as the port map; a falsy value is
            treated as an empty dict.
        chroot: forwarded to TaskRunner.
        daemon: when true, daemonize before running. The process exits with
            status 1 if daemonizing fails.

    Ports required by the task but absent from the mapping are reported via
    ``app.error``. A KeyboardInterrupt during the run closes the checkpoint
    and kills the task.
    """
    portmap = prebound_ports or {}

    unbound = set(task.ports()).difference(portmap.keys())
    if unbound:
        app.error('ERROR! Unbound ports: %s' % ' '.join(unbound))

    task_runner = TaskRunner(
        task.task,
        root,
        sandbox,
        task_id=task_id,
        user=user,
        portmap=portmap,
        chroot=chroot,
    )

    if daemon:
        print('Daemonizing and starting runner.')
        try:
            log.teardown_stderr_logging()
            daemonize()
        except Exception as daemon_err:
            print("Failed to daemonize: %s" % daemon_err)
            sys.exit(1)

    try:
        task_runner.run()
    except KeyboardInterrupt:
        print('Got keyboard interrupt, killing job!')
        task_runner.close_ckpt()
        task_runner.kill()
def test_preemption_wait(self):
    """A forced kill with a one-second preemption wait takes about a second.

    Afterwards the last task status and the last 'ignorant_process' run must
    both be KILLED.
    """
    runner = self.start_runner()
    monitor = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(monitor)

    active = monitor.get_active_processes()
    process_state, run_number = active[0]
    assert process_state.process == 'ignorant_process'
    assert run_number == 0

    preempter = TaskRunner.get(runner.task_id, runner.root)
    assert preempter is not None

    started = time.time()
    preempter.kill(force=True, preemption_wait=Amount(1, Time.SECONDS))
    duration = time.time() - started

    # This is arbitrary, but make sure we finish within half a second of
    # requested preemption wait.
    assert abs(duration - 1.0) < 0.5

    final_state = preempter.state
    assert final_state.statuses[-1].state == TaskState.KILLED
    assert final_state.processes['ignorant_process'][-1].state == ProcessState.KILLED
def test_preemption_wait(self):
    """A forced kill with a one-second preemption wait takes about a second.

    Afterwards the last task status and the last 'ignorant_process' run must
    both be KILLED.
    """
    runner = self.start_runner()
    monitor = TaskMonitor(runner.tempdir, runner.task_id)
    self.wait_until_running(monitor)

    active = monitor.get_active_processes()
    process_state, run_number = active[0]
    assert process_state.process == 'ignorant_process'
    assert run_number == 0

    preempter = TaskRunner.get(runner.task_id, runner.root)
    assert preempter is not None

    started = time.time()
    preempter.kill(force=True, preemption_wait=Amount(1, Time.SECONDS))
    duration = time.time() - started

    # This is arbitrary, but make sure we finish within half a second of
    # requested preemption wait.
    assert abs(duration - 1.0) < 0.5

    final_state = preempter.state
    assert final_state.statuses[-1].state == TaskState.KILLED
    assert final_state.processes['ignorant_process'][-1].state == ProcessState.KILLED
def proxy_main(args, opts):
    """Build a TaskRunner from the parsed options and run it.

    Args:
        args: positional command-line arguments; not used by this function.
        opts: parsed options object. ``thermos_json`` (an existing path),
            ``sandbox`` and ``checkpoint_root`` must be set; ``prebound_ports``,
            ``setuid``, ``task_id`` and ``chroot`` are read as well.

    Each failure mode is logged and ends the process through ``sys.exit``
    with INTERNAL_ERROR, UNKNOWN_USER, INVALID_TASK, TERMINAL_TASK or
    UNKNOWN_ERROR. A KeyboardInterrupt tears the runner down instead.
    """
    assert opts.thermos_json and os.path.exists(opts.thermos_json)
    assert opts.sandbox
    assert opts.checkpoint_root

    thermos_task = get_task_from_options(opts)
    portmap = opts.prebound_ports

    # Every port the task asks for must already have a binding.
    unbound = set(thermos_task.ports()).difference(portmap)
    if unbound:
        log.error('ERROR! Unbound ports: %s' % ' '.join(unbound))
        sys.exit(INTERNAL_ERROR)

    user = opts.setuid or getpass.getuser()

    # if we cannot get the uid, this is an unknown user and we should fail
    try:
        pwd.getpwnam(user).pw_uid
    except KeyError:
        log.error('Unknown user: %s' % user)
        sys.exit(UNKNOWN_USER)

    # NOTE(review): the runner receives opts.setuid, not the validated `user`.
    task_runner = TaskRunner(
        thermos_task.task,
        opts.checkpoint_root,
        opts.sandbox,
        task_id=opts.task_id,
        user=opts.setuid,
        portmap=portmap,
        chroot=opts.chroot,
        planner_class=CappedTaskPlanner,
    )

    # SIGUSR1 and SIGUSR2 both trigger a teardown of this runner.
    on_signal = functools.partial(runner_teardown, task_runner)
    for signum in (signal.SIGUSR1, signal.SIGUSR2):
        signal.signal(signum, on_signal)

    try:
        task_runner.run()
    except TaskRunner.InternalError as internal_err:
        log.error('Internal error: %s' % internal_err)
        sys.exit(INTERNAL_ERROR)
    except TaskRunner.InvalidTask as invalid_err:
        log.error('Invalid task: %s' % invalid_err)
        sys.exit(INVALID_TASK)
    except TaskRunner.StateError as state_err:
        log.error('Checkpoint error: %s' % state_err)
        sys.exit(TERMINAL_TASK)
    except Process.UnknownUserError as user_err:
        log.error('User ceased to exist: %s' % user_err)
        sys.exit(UNKNOWN_USER)
    except KeyboardInterrupt:
        log.info('Caught ^C, tearing down runner.')
        runner_teardown(task_runner)
    except Exception as unknown_err:
        # Last-resort handler: log the message, then the traceback line by line.
        log.error('Unknown exception: %s' % unknown_err)
        for tb_line in traceback.format_exc().splitlines():
            log.error(tb_line)
        sys.exit(UNKNOWN_ERROR)
def proxy_main(args, opts):
    """Build a TaskRunner from the parsed options and run it.

    Args:
        args: positional command-line arguments; not used by this function.
        opts: parsed options object. ``thermos_json`` (an existing path),
            ``sandbox`` and ``checkpoint_root`` must be set. The remaining
            runner settings (ports, user, chroot, hostname, process logging,
            environment and container options) are read from it as well.

    Each failure mode is logged and ends the process through ``sys.exit``
    with INTERNAL_ERROR, UNKNOWN_USER, INVALID_TASK, TERMINAL_TASK or
    UNKNOWN_ERROR. A KeyboardInterrupt tears the runner down instead.
    """
    assert opts.thermos_json and os.path.exists(opts.thermos_json)
    assert opts.sandbox
    assert opts.checkpoint_root

    thermos_task = get_task_from_options(opts)
    portmap = opts.prebound_ports

    # Every port the task asks for must already have a binding.
    unbound = set(thermos_task.ports()).difference(portmap)
    if unbound:
        log.error('ERROR! Unbound ports: %s' % ' '.join(unbound))
        sys.exit(INTERNAL_ERROR)

    user = opts.setuid or getpass.getuser()

    # if we cannot get the uid, this is an unknown user and we should fail
    try:
        pwd.getpwnam(user).pw_uid
    except KeyError:
        log.error('Unknown user: %s' % user)
        sys.exit(UNKNOWN_USER)

    # NOTE(review): the runner receives opts.setuid, not the validated `user`.
    task_runner = TaskRunner(
        thermos_task.task,
        opts.checkpoint_root,
        opts.sandbox,
        task_id=opts.task_id,
        user=opts.setuid,
        portmap=portmap,
        chroot=opts.chroot,
        planner_class=CappedTaskPlanner,
        hostname=opts.hostname,
        process_logger_destination=opts.process_logger_destination,
        process_logger_mode=opts.process_logger_mode,
        rotate_log_size_mb=opts.rotate_log_size_mb,
        rotate_log_backups=opts.rotate_log_backups,
        preserve_env=opts.preserve_env,
        mesos_containerizer_path=opts.mesos_containerizer_path,
        container_sandbox=opts.container_sandbox,
    )

    # SIGUSR1 and SIGUSR2 both trigger a teardown of this runner.
    on_signal = functools.partial(runner_teardown, task_runner)
    for signum in (signal.SIGUSR1, signal.SIGUSR2):
        signal.signal(signum, on_signal)

    try:
        task_runner.run()
    except TaskRunner.InternalError as internal_err:
        log.error('Internal error: %s' % internal_err)
        sys.exit(INTERNAL_ERROR)
    except TaskRunner.InvalidTask as invalid_err:
        log.error('Invalid task: %s' % invalid_err)
        sys.exit(INVALID_TASK)
    except TaskRunner.StateError as state_err:
        log.error('Checkpoint error: %s' % state_err)
        sys.exit(TERMINAL_TASK)
    except Process.UnknownUserError as user_err:
        log.error('User ceased to exist: %s' % user_err)
        sys.exit(UNKNOWN_USER)
    except KeyboardInterrupt:
        log.info('Caught ^C, tearing down runner.')
        runner_teardown(task_runner)
    except Exception as unknown_err:
        # Last-resort handler: log the message, then the traceback line by line.
        log.error('Unknown exception: %s' % unknown_err)
        for tb_line in traceback.format_exc().splitlines():
            log.error(tb_line)
        sys.exit(UNKNOWN_ERROR)