def proxy_main(args, opts):
    """Build a TaskRunner from the command-line options and run it.

    Requires ``opts.thermos_json`` (an existing path), ``opts.sandbox`` and
    ``opts.checkpoint_root`` to be set.  Any port requested by the task that
    is absent from ``opts.prebound_ports`` is reported through ``app.error``.
    SIGUSR1 and SIGUSR2 are wired to ``runner_teardown`` for the new runner.
    """
    assert opts.thermos_json and os.path.exists(opts.thermos_json)
    assert opts.sandbox
    assert opts.checkpoint_root

    task_wrapper = get_task_from_options(opts)
    portmap = opts.prebound_ports

    # Every port the task asks for must already have a binding.
    unbound = set(task_wrapper.ports()) - set(portmap)
    if unbound:
        app.error('ERROR! Unbound ports: %s' % ' '.join(unbound))

    runner = TaskRunner(
        task_wrapper.task,
        opts.checkpoint_root,
        opts.sandbox,
        task_id=opts.task_id,
        user=opts.setuid,
        portmap=portmap,
        chroot=opts.chroot,
        planner_class=CappedTaskPlanner,
    )

    # Both user signals trigger the same teardown of this runner.
    teardown_handler = functools.partial(runner_teardown, runner)
    signal.signal(signal.SIGUSR1, teardown_handler)
    signal.signal(signal.SIGUSR2, teardown_handler)

    try:
        runner.run()
    except TaskRunner.InternalError as internal_error:
        app.error('Internal error: %s' % internal_error)
    except TaskRunner.InvalidTask as invalid_task:
        app.error(str(invalid_task))
    except TaskRunner.StateError:
        app.error('Task appears to already be in a terminal state.')
    except KeyboardInterrupt:
        runner_teardown(runner)
def _really_run(task, root, sandbox, task_id=None, user=None,
                prebound_ports=None, chroot=None, daemon=False):
    """Create a TaskRunner for ``task`` and run it, optionally daemonized.

    Ports requested by the task but missing from ``prebound_ports`` are
    reported through ``app.error``.  When ``daemon`` is true, stderr logging
    is torn down and the process is daemonized before the runner starts; a
    failure there prints a message and exits with status 1.  A keyboard
    interrupt during the run closes the runner's checkpoint and kills it.
    """
    prebound_ports = prebound_ports or {}

    unbound = set(task.ports()) - set(prebound_ports.keys())
    if unbound:
        app.error('ERROR! Unbound ports: %s' % ' '.join(unbound))

    runner = TaskRunner(
        task.task,
        root,
        sandbox,
        task_id=task_id,
        user=user,
        portmap=prebound_ports,
        chroot=chroot,
    )

    if daemon:
        print('Daemonizing and starting runner.')
        try:
            log.teardown_stderr_logging()
            daemonize()
        except Exception as daemon_error:
            print("Failed to daemonize: %s" % daemon_error)
            sys.exit(1)

    try:
        runner.run()
    except KeyboardInterrupt:
        print('Got keyboard interrupt, killing job!')
        runner.close_ckpt()
        runner.kill()
def proxy_main(args, opts):
    """Entry point: construct a TaskRunner from ``opts`` and run it.

    ``opts.thermos_json`` must name an existing path, and ``opts.sandbox``
    and ``opts.checkpoint_root`` must be set.  Task ports that are not in
    ``opts.prebound_ports`` are reported through ``app.error``, as are the
    TaskRunner errors raised while running.
    """
    assert opts.thermos_json and os.path.exists(opts.thermos_json)
    assert opts.sandbox
    assert opts.checkpoint_root

    loaded_task = get_task_from_options(opts)
    bound_ports = opts.prebound_ports

    # Ports declared by the task that have no pre-bound value.
    ports_without_binding = set(loaded_task.ports()) - set(bound_ports)
    if ports_without_binding:
        app.error(
            'ERROR! Unbound ports: %s' % ' '.join(ports_without_binding))

    runner_options = dict(
        task_id=opts.task_id,
        user=opts.setuid,
        portmap=bound_ports,
        chroot=opts.chroot,
        planner_class=CappedTaskPlanner,
    )
    the_runner = TaskRunner(
        loaded_task.task, opts.checkpoint_root, opts.sandbox,
        **runner_options)

    # SIGUSR1 and SIGUSR2 each get a teardown handler bound to this runner.
    for signum in (signal.SIGUSR1, signal.SIGUSR2):
        on_signal = functools.partial(runner_teardown, the_runner)
        signal.signal(signum, on_signal)

    try:
        the_runner.run()
    except TaskRunner.InternalError as exc:
        app.error('Internal error: %s' % exc)
    except TaskRunner.InvalidTask as exc:
        app.error(str(exc))
    except TaskRunner.StateError:
        app.error('Task appears to already be in a terminal state.')
    except KeyboardInterrupt:
        runner_teardown(the_runner)
def make_executor(proxy_driver, checkpoint_root, task, ports=None,
                  fast_status=False, runner_class=ThermosTaskRunner,
                  status_providers=()):
    """Launch ``task`` on a FastThermosExecutor and wait for its runner.

    Args:
        proxy_driver: driver object whose ``method_calls['sendStatusUpdate']``
            records the status updates sent by the executor.
        checkpoint_root: checkpoint directory handed to the runner provider
            and used to look the runner up afterwards.
        task: task passed to ``make_task``.
        ports: mapping passed to ``make_task`` as ``assigned_ports``.
            Defaults to a fresh empty dict on every call.
        fast_status: selects FastStatusManager instead of StatusManager.
        runner_class: runner class given to ``make_provider``.
        status_providers: forwarded to the executor unchanged.

    Returns:
        A ``(runner, executor)`` tuple.

    The function polls without an upper bound, first for two status updates
    and then for ``TaskRunner.get`` to return a runner.
    """
    # A literal ``{}`` default would be one dict shared by every call; build
    # a new one per call so nothing downstream can leak state between tests.
    if ports is None:
        ports = {}

    status_manager_class = FastStatusManager if fast_status else StatusManager
    runner_provider = make_provider(checkpoint_root, runner_class)
    te = FastThermosExecutor(
        runner_provider=runner_provider,
        status_manager_class=status_manager_class,
        sandbox_provider=DefaultTestSandboxProvider,
        status_providers=status_providers,
    )

    ExecutorTimeout(
        te.launched, proxy_driver,
        timeout=Amount(100, Time.MILLISECONDS)).start()
    task_description = make_task(task, assigned_ports=ports, instanceId=0)
    te.launchTask(proxy_driver, task_description)

    te.runner_started.wait()

    while len(proxy_driver.method_calls['sendStatusUpdate']) < 2:
        time.sleep(0.1)

    # make sure startup was kosher
    updates = proxy_driver.method_calls['sendStatusUpdate']
    assert len(updates) == 2
    status_updates = [arg_tuple[0][0] for arg_tuple in updates]
    assert status_updates[0].state == mesos_pb.TASK_STARTING
    assert status_updates[1].state == mesos_pb.TASK_RUNNING

    # wait for the runner to bind to a task
    while True:
        runner = TaskRunner.get(task_description.task_id.value, checkpoint_root)
        if runner:
            break
        time.sleep(0.1)

    assert te.launched.is_set()
    return runner, te
def make_executor(
        proxy_driver,
        checkpoint_root,
        task,
        ports=None,
        fast_status=False,
        runner_class=ThermosTaskRunner,
        status_providers=()):
    """Start a FastThermosExecutor for ``task`` and return it with its runner.

    ``ports`` is forwarded to ``make_task`` as ``assigned_ports``; when it is
    omitted a new empty dict is created for this call.  ``fast_status``
    switches the status manager class from StatusManager to
    FastStatusManager.  The remaining arguments are passed through to
    ``make_provider`` and the executor constructor.

    The call blocks until the executor signals ``runner_started``, two
    status updates (TASK_STARTING then TASK_RUNNING) have been recorded on
    ``proxy_driver``, and ``TaskRunner.get`` yields a runner.  Neither
    polling loop has a timeout of its own.

    Returns:
        ``(runner, executor)``.
    """
    # Avoid a shared mutable default: each call gets its own port mapping.
    ports = {} if ports is None else ports

    status_manager_class = FastStatusManager if fast_status else StatusManager
    runner_provider = make_provider(checkpoint_root, runner_class)
    te = FastThermosExecutor(
        runner_provider=runner_provider,
        status_manager_class=status_manager_class,
        sandbox_provider=DefaultTestSandboxProvider,
        status_providers=status_providers,
    )

    launch_timeout = Amount(100, Time.MILLISECONDS)
    ExecutorTimeout(te.launched, proxy_driver, timeout=launch_timeout).start()
    task_description = make_task(task, assigned_ports=ports, instanceId=0)
    te.launchTask(proxy_driver, task_description)

    te.runner_started.wait()

    while len(proxy_driver.method_calls['sendStatusUpdate']) < 2:
        time.sleep(0.1)

    # make sure startup was kosher
    updates = proxy_driver.method_calls['sendStatusUpdate']
    assert len(updates) == 2
    status_updates = [arg_tuple[0][0] for arg_tuple in updates]
    assert status_updates[0].state == mesos_pb.TASK_STARTING
    assert status_updates[1].state == mesos_pb.TASK_RUNNING

    # wait for the runner to bind to a task
    while True:
        runner = TaskRunner.get(task_description.task_id.value, checkpoint_root)
        if runner:
            break
        time.sleep(0.1)

    assert te.launched.is_set()
    return runner, te
def test_coordinator_dead_kill(self):
    """A force-kill after runner, coordinator and process die marks it LOST."""
    runner = self.start_runner()
    monitor = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(monitor)

    first_active = monitor.get_active_processes()[0]
    active_process, run_index = first_active
    assert active_process.process == "ignorant_process"
    assert run_index == 0

    # SIGKILL the runner first, then the coordinator, then the process.
    os.kill(runner.po.pid, signal.SIGKILL)
    os.kill(active_process.coordinator_pid, signal.SIGKILL)
    os.kill(active_process.pid, signal.SIGKILL)

    killer = TaskRunner.get(runner.task_id, runner.root)
    assert killer is not None
    killer.kill(force=True)

    final_state = monitor.get_state()
    runs = final_state.processes["ignorant_process"]
    assert len(runs) == 1
    assert runs[0].state == ProcessState.LOST
def test_coordinator_dead_kill(self):
    """Kill runner, coordinator and process, then expect a LOST process."""
    started = self.start_runner()
    task_monitor = TaskMonitor(started.pathspec, started.task_id)
    self.wait_until_running(task_monitor)

    proc, run = task_monitor.get_active_processes()[0]
    assert proc.process == 'ignorant_process'
    assert run == 0

    # Order matters: runner, then coordinator, then the process itself.
    os.kill(started.po.pid, signal.SIGKILL)
    os.kill(proc.coordinator_pid, signal.SIGKILL)
    os.kill(proc.pid, signal.SIGKILL)

    # A fresh runner bound to the same task performs the forced kill.
    reaper = TaskRunner.get(started.task_id, started.root)
    assert reaper is not None
    reaper.kill(force=True)

    history = task_monitor.get_state().processes['ignorant_process']
    assert len(history) == 1
    assert history[0].state == ProcessState.LOST
def test_preemption_wait(self):
    """A forced kill with a 1 s preemption wait takes about one second."""
    runner = self.start_runner()
    monitor = TaskMonitor(runner.pathspec, runner.task_id)
    self.wait_until_running(monitor)

    active_process, run_index = monitor.get_active_processes()[0]
    assert active_process.process == "ignorant_process"
    assert run_index == 0

    preempter = TaskRunner.get(runner.task_id, runner.root)
    assert preempter is not None

    wait = Amount(1, Time.SECONDS)
    started_at = time.time()
    preempter.kill(force=True, preemption_wait=wait)
    elapsed = time.time() - started_at

    # The tolerance is arbitrary: the kill must complete within half a
    # second of the requested preemption wait.
    assert abs(elapsed - 1.0) < 0.5

    assert preempter.state.statuses[-1].state == TaskState.KILLED
    last_run = preempter.state.processes["ignorant_process"][-1]
    assert last_run.state == ProcessState.KILLED
def test_preemption_wait(self):
    """Check that kill() honours the requested preemption wait."""
    started = self.start_runner()
    task_monitor = TaskMonitor(started.pathspec, started.task_id)
    self.wait_until_running(task_monitor)

    proc, run = task_monitor.get_active_processes()[0]
    assert proc.process == 'ignorant_process'
    assert run == 0

    preempter = TaskRunner.get(started.task_id, started.root)
    assert preempter is not None

    t0 = time.time()
    preempter.kill(force=True, preemption_wait=Amount(1, Time.SECONDS))
    duration = time.time() - t0

    # Arbitrary bound: finish within half a second of the one-second
    # preemption wait that was requested.
    assert abs(duration - 1.0) < 0.5

    task_status = preempter.state.statuses[-1]
    assert task_status.state == TaskState.KILLED
    process_status = preempter.state.processes['ignorant_process'][-1]
    assert process_status.state == ProcessState.KILLED