logger = logging.getLogger('ochopod') web = Flask(__name__) if __name__ == '__main__': try: # # - parse our ochopod hints # - enable CLI logging # - pass down the ZK ensemble coordinate as $OCHOPOD_ZK (all tools use that to perform their queries) # env = os.environ hints = json.loads(env['ochopod']) ochopod.enable_cli_log(debug=hints['debug'] == 'true') env['OCHOPOD_ZK'] = hints['zk'] @web.route('/shell', methods=['POST']) def _from_curl(): out = [] ok = False ts = time.time() tmp = tempfile.mkdtemp() try: # # - retrieve the command line # assert 'X-Shell' in request.headers, 'X-Shell header missing'
web = Flask(__name__) if __name__ == "__main__": try: # # - parse our ochopod hints # - enable CLI logging # blocked = {} env = os.environ hints = json.loads(env["ochopod"]) ochopod.enable_cli_log(debug=hints["debug"] == "true") @web.route("/callback/<token>", methods=["POST"]) @web.route("/callback/<token>/<tag>", methods=["POST"]) def _set_callback(token, tag="callback.raw"): if token not in blocked: return "", 404 # # - dump the incoming payload under the temp directory # - use the specified filename # with open(path.join(blocked[token], tag), "w") as f: logger.info("callback received for %s (%d B)" % (token, len(request.data))) f.write(request.data)
#: our ochopod logger logger = logging.getLogger('ochopod') #: our falcon endpoint endpoint = falcon.API() # # - load our pod configuration settings # - this little json payload is packaged by the marathon toolset upon a push # - is it passed down to the container as the $pod environment variable # - parse our ochopod hints # - enable CLI logging # cfg = json.loads(os.environ['pod']) hints = json.loads(os.environ['ochopod']) ochopod.enable_cli_log(debug=hints['debug'] == 'true') class _Accumulator(FSM): """ Simple state-machine bundling incoming messages together. This acts as a small accumulator and will help avoid being throttled by the API. """ def __init__(self): super(_Accumulator, self).__init__() self.pending = deque() def initial(self, data): return 'spin', data, 0
def boot(self, lifecycle, model=Reactive, local=0):
    """
    Boot a pod using the marathon/EC2 bindings: assemble the pod hints from the environment, start the
    life-cycle & coordinator actors, serve the HTTP endpoints (/reset, /info, /log, /control and /terminate)
    and then block until the process is interrupted. Any failure is logged as fatal and swallowed.

    :param lifecycle: class deriving from :class:`LifeCycle`, started as the executor actor
    :param model: class deriving from :class:`Model`, handed over to the coordinator
    :param local: if truthy run in local mode (everything is defaulted to localhost, no EC2/mesos lookup)
    """

    #
    # - quick check to make sure we get the right implementations
    #
    assert issubclass(model, Model), 'model must derive from ochopod.api.Model'
    assert issubclass(lifecycle, LifeCycle), 'lifecycle must derive from ochopod.api.LifeCycle'

    #
    # - start logging to /var/log/ochopod.log
    #
    logger.info('EC2 marathon bindings started')
    web = Flask(__name__)

    #
    # - default presets in case we run outside of marathon (local vm testing)
    # - any environment variable prefixed with "ochopod_" is of interest for us (e.g this is what the user puts
    #   in the marathon application configuration for instance)
    # - the other settings come from marathon (namely the port bindings & application/task identifiers)
    # - the MESOS_TASK_ID is important to keep around to enable task deletion via the marathon REST API
    #
    env = \
        {
            'ochopod_application': '',
            'ochopod_cluster': 'default',
            'ochopod_debug': 'true',
            'ochopod_local': 'false',
            'ochopod_namespace': 'marathon',
            'ochopod_port': '8080',
            'ochopod_start': 'true',
            'ochopod_task': '',
            'PORT_8080': '8080'
        }

    env.update(os.environ)
    ochopod.enable_cli_log(debug=env['ochopod_debug'] == 'true')
    try:

        #
        # - grab our environment variables (which are set by the marathon executor)
        # - extract the mesos PORT_* bindings and construct a small remapping dict
        #
        logger.debug('environment ->\n%s' % '\n'.join(['\t%s -> %s' % (k, v) for k, v in env.items()]))
        ports = {key[5:]: int(val) for key, val in env.items() if key.startswith('PORT_')}

        #
        # - keep any "ochopod_" environment variable & trim its prefix
        # - default all our settings, especially the mandatory ones
        # - the ip and zookeeper are defaulted to localhost to enable easy testing
        #
        hints = {k[8:]: v for k, v in env.items() if k.startswith('ochopod_')}
        if local or hints['local'] == 'true':

            #
            # - we are running in local mode (e.g on a dev workstation)
            # - default everything to localhost
            #
            logger.info('running in local mode (make sure you run a standalone zookeeper)')
            hints.update(
                {
                    'fwk': 'marathon-ec2',
                    'ip': '127.0.0.1',
                    'node': 'local',
                    'ports': ports,
                    'public': '127.0.0.1',
                    'zk': '127.0.0.1:2181'
                })
        else:

            #
            # - we are (assuming to be) deployed on EC2
            # - get our underlying metadata using curl
            # - a failed non-strict lookup yields an empty string instead of blowing up on an empty output
            #
            def _peek(token, strict=True):
                code, lines = shell('curl -f http://169.254.169.254/latest/meta-data/%s' % token)
                ok = code == 0 and bool(lines)
                assert ok or not strict, 'unable to lookup EC2 metadata for %s (are you running on EC2 ?)' % token
                return lines[0] if ok else ''

            #
            # - get our local and public IPV4 addresses
            # - the "node" will show up as the EC2 instance ID
            # - note we allow the public IPv4 lookup to fail (in case we run in VPC)
            #
            hints.update(
                {
                    'application': env['MARATHON_APP_ID'][1:],
                    'fwk': 'marathon-ec2',
                    'ip': _peek('local-ipv4'),
                    'node': _peek('instance-id'),
                    'ports': ports,
                    'public': _peek('public-ipv4', strict=False),
                    'task': env['MESOS_TASK_ID'],
                    'zk': ''
                })

            def _install_from_package():

                #
                # - a regular package install will write the slave settings under /etc/mesos/zk
                # - the snippet in there looks like zk://10.0.0.56:2181/mesos
                #
                code, lines = shell("cat /etc/mesos/zk")
                assert code == 0 and lines and lines[0], 'unable to retrieve the zk connection string'
                return lines[0][5:].split('/')[0]

            def _dcos_deployment():

                #
                # - a DCOS slave is setup slightly differently with the settings being environment
                #   variables set in /opt/mesosphere/etc/mesos-slave
                # - the snippet in there is prefixed by MESOS_MASTER= and uses an alias
                # - it looks like MESOS_MASTER=zk://leader.mesos:2181/mesos
                #
                code, lines = shell("grep MASTER /opt/mesosphere/etc/mesos-slave")
                assert code == 0 and lines and lines[0], 'unable to retrieve the zk connection string'
                return lines[0][18:].split('/')[0]

            #
            # - depending on how the slave has been installed we might have to look in various places
            #   to find out what our zookeeper connection string is
            # - warning, a URL like format such as zk://<ip:port>,..,<ip:port>/mesos is used
            # - just keep the ip & port part and discard the rest
            # - a lookup that fails or comes back empty is logged and the next one is attempted
            #
            for method in [_install_from_package, _dcos_deployment]:
                try:
                    found = method()
                except Exception as failure:
                    logger.debug('%s() failed -> %s' % (method.__name__, failure))
                    continue

                if found:
                    hints['zk'] = found
                    break

            assert hints['zk'], 'unable to determine where zookeeper is located (unsupported/bogus setup ?)'

        #
        # - the cluster must be fully qualified with a namespace (which is defaulted anyway)
        #
        assert hints['cluster'] and hints['namespace'], 'no cluster and/or namespace defined (user error ?)'

        #
        # - start the life-cycle actor which will pass our hints (as a json object) to its underlying sub-process
        # - start our coordinator which will connect to zookeeper and attempt to lead the cluster
        # - upon grabbing the lock the model actor will start and implement the configuration process
        # - the hints are a convenient bag for any data that may change at runtime and needs to be returned (via
        #   the HTTP POST /info request)
        # - what's being registered in zookeeper is immutable though and decorated with additional details by
        #   the coordinator (especially the pod index which is derived from zookeeper)
        #
        latch = ThreadingFuture()
        logger.info('starting %s.%s (marathon/ec2) @ %s' % (hints['namespace'], hints['cluster'], hints['node']))
        breadcrumbs = deepcopy(hints)
        hints['metrics'] = {}
        env.update({'ochopod': json.dumps(hints)})
        executor = lifecycle.start(env, latch, hints)
        coordinator = Coordinator.start(
            hints['zk'].split(','),
            hints['namespace'],
            hints['cluster'],
            int(hints['port']),
            breadcrumbs,
            model,
            hints)

        #
        # - external hook forcing a coordinator reset
        # - this will force a re-connection to zookeeper and pod registration
        # - please note this will not impact the pod lifecycle (e.g the underlying sub-process will be
        #   left running)
        #
        @web.route('/reset', methods=['POST'])
        def _reset():
            coordinator.tell({'request': 'reset'})
            return '{}', 200

        #
        # - external hook exposing information about our pod
        # - this is a subset of what's registered in zookeeper at boot-time
        # - the data is dynamic and updated from time to time by the model and executor actors
        # - items() is used on purpose to iterate over a snapshot under python 2 (the actors may update the
        #   hints while we serialize them)
        #
        @web.route('/info', methods=['POST'])
        def _info():
            keys = \
                (
                    'application',
                    'ip',
                    'metrics',
                    'node',
                    'port',
                    'ports',
                    'process',
                    'public',
                    'state',
                    'status',
                    'task'
                )

            subset = {k: v for k, v in hints.items() if k in keys}
            return json.dumps(subset), 200

        #
        # - external hook exposing our circular log
        # - dump ochopod.log as a json array (one item per line, in the order found in the file)
        # - the file is only read, no need to request write access
        #
        @web.route('/log', methods=['POST'])
        def _log():
            with open(ochopod.LOG, 'r') as log:
                lines = log.readlines()
                return json.dumps(lines), 200

        #
        # - web-hook used to receive requests from the leader or the CLI tools
        # - those requests are passed down to the executor actor
        # - any non HTTP 200 response is a failure
        # - a timeout which is not an integer will result in a HTTP 400 (BAD REQUEST)
        # - failure to acknowledge within the specified timeout will result in a HTTP 408 (REQUEST TIMEOUT)
        # - attempting to send a control request to a dead pod will result in a HTTP 410 (GONE)
        #
        @web.route('/control/<task>', methods=['POST'])
        @web.route('/control/<task>/<timeout>', methods=['POST'])
        def _control(task, timeout='60'):
            try:
                seconds = int(timeout)

            except ValueError:
                return '{}', 400

            try:
                ts = time.time()
                logger.debug('http in -> /control/%s' % task)
                ack = ThreadingFuture()
                executor.tell({'request': task, 'latch': ack, 'data': request.data})
                js, code = ack.get(timeout=seconds)

                #
                # - time.time() is in seconds, scale to report actual milliseconds
                #
                ms = 1000.0 * (time.time() - ts)
                logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                return json.dumps(js), code

            except Timeout:

                #
                # - we failed to match the specified timeout
                # - gracefully fail on a HTTP 408
                #
                return '{}', 408

            except ActorDeadError:

                #
                # - the executor has been shutdown (probably after a /control/kill)
                # - gracefully fail on a HTTP 410
                #
                return '{}', 410

        #
        # - internal hook required to shutdown the web-server
        # - it's not possible to do it outside of a request handler
        # - make sure this calls only comes from localhost (todo)
        #
        @web.route('/terminate', methods=['POST'])
        def _terminate():
            request.environ.get('werkzeug.server.shutdown')()
            return '{}', 200

        class _Runner(threading.Thread):
            """
            Run werkzeug from a separate thread to avoid blocking the main one. We'll have to shut it down
            using a dedicated HTTP POST.
            """

            def run(self):
                web.run(host='0.0.0.0', port=int(hints['port']), threaded=True)

        try:

            #
            # - block on the lifecycle actor until it goes down (usually after a /control/kill request)
            #
            _Runner().start()
            spin_lock(latch)
            logger.debug('pod is dead, idling')

            #
            # - simply idle forever (since the framework would restart any container that terminates)
            # - /log and /hints HTTP requests will succeed (and show the pod as being killed)
            # - any control request will now fail
            #
            while True:
                time.sleep(60.0)

        finally:

            #
            # - when we exit the block first shutdown our executor (which may probably be already down)
            # - then shutdown the coordinator to un-register from zookeeper
            # - finally ask werkzeug to shutdown via a REST call
            #
            shutdown(executor)
            shutdown(coordinator)
            post('http://127.0.0.1:%s/terminate' % env['ochopod_port'])

    except KeyboardInterrupt:
        logger.fatal('CTRL-C pressed')

    except Exception as failure:
        logger.fatal('unexpected condition -> %s' % diagnostic(failure))
def boot(self, lifecycle, model=Reactive, tools=None, local=False):
    """
    Boot a pod using the marathon bindings: assemble the pod hints from the environment (plus whatever
    self.get_node_details() reports), locate zookeeper, start the life-cycle & coordinator actors, serve the
    HTTP endpoints (/reset, /info, /log, /exec, /control and /terminate) and then block until the process is
    interrupted. Any failure is logged as fatal and swallowed.

    :param lifecycle: class deriving from :class:`LifeCycle`, started as the executor actor
    :param model: class deriving from :class:`Model`, handed over to the coordinator
    :param tools: optional iterable of :class:`Tool` subclasses that can be invoked via HTTP POST /exec
    :param local: if truthy run in local mode (everything is defaulted to localhost, no mesos lookup)
    """

    #
    # - quick check to make sure we get the right implementations
    #
    assert issubclass(model, Model), 'model must derive from ochopod.api.Model'
    assert issubclass(lifecycle, LifeCycle), 'lifecycle must derive from ochopod.api.LifeCycle'

    #
    # - all our responses are tagged as json
    #
    as_json = {'Content-Type': 'application/json; charset=utf-8'}

    #
    # - instantiate our flask endpoint
    # - default to a json handler for all HTTP errors (including an unexpected 500)
    #
    def _handler(error):
        http = error.code if isinstance(error, HTTPException) else 500
        return '{}', http, as_json

    web = Flask(__name__)
    for code in default_exceptions:
        web.error_handler_spec[None][code] = _handler

    #
    # - default presets in case we run outside of marathon (local vm testing)
    # - any environment variable prefixed with "ochopod_" is of interest for us (e.g this is what the user puts
    #   in the marathon application configuration for instance)
    # - the other settings come from marathon (namely the port bindings & application/task identifiers)
    # - the MESOS_TASK_ID is important to keep around to enable task deletion via the marathon REST API
    #
    env = \
        {
            'ochopod_application': '',
            'ochopod_cluster': 'default',
            'ochopod_debug': 'true',
            'ochopod_local': 'false',
            'ochopod_namespace': 'marathon',
            'ochopod_port': '8080',
            'ochopod_start': 'true',
            'ochopod_task': '',
            'ochopod_zk': '',
            'PORT_8080': '8080'
        }

    env.update(os.environ)
    ochopod.enable_cli_log(debug=env['ochopod_debug'] == 'true')
    try:

        #
        # - grab our environment variables (which are set by the marathon executor)
        # - extract the mesos PORT_* bindings and construct a small remapping dict
        #
        logger.debug('environment ->\n%s' % '\n'.join(['\t%s -> %s' % (k, v) for k, v in env.items()]))
        ports = {key[5:]: int(val) for key, val in env.items() if key.startswith('PORT_')}

        #
        # - keep any "ochopod_" environment variable & trim its prefix
        # - default all our settings, especially the mandatory ones
        # - the ip and zookeeper are defaulted to localhost to enable easy testing
        #
        hints = {k[8:]: v for k, v in env.items() if k.startswith('ochopod_')}
        if local or hints['local'] == 'true':

            #
            # - we are running in local mode (e.g on a dev workstation)
            # - default everything to localhost
            #
            logger.info('running in local mode (make sure you run a standalone zookeeper)')
            hints.update(
                {
                    'fwk': 'marathon (debug)',
                    'ip': '127.0.0.1',
                    'node': 'local',
                    'ports': ports,
                    'public': '127.0.0.1',
                    'zk': '127.0.0.1:2181'
                })
        else:

            #
            # - extend our hints
            # - add the application + task
            #
            hints.update(
                {
                    'application': env['MARATHON_APP_ID'][1:],
                    'fwk': 'marathon',
                    'ip': '',
                    'node': '',
                    'ports': ports,
                    'public': '',
                    'task': env['MESOS_TASK_ID'],
                    'zk': ''
                })

            #
            # - use whatever subclass is implementing us to infer 'ip', 'node' and 'public'
            #
            hints.update(self.get_node_details())

            #
            # - lookup for the zookeeper connection string from environment variable or on disk
            # - we have to look into different places depending on how mesos was installed
            #
            def _1():

                #
                # - most recent DCOS release
                # - $MESOS_MASTER is located in /opt/mesosphere/etc/mesos-slave-common
                # - the snippet in there is prefixed by MESOS_MASTER=zk://<ip:port>/mesos
                #
                logger.debug('checking /opt/mesosphere/etc/mesos-slave-common...')
                _, lines = shell("grep MESOS_MASTER /opt/mesosphere/etc/mesos-slave-common")
                return lines[0][13:]

            def _2():

                #
                # - same as above except for slightly older DCOS releases
                # - $MESOS_MASTER is located in /opt/mesosphere/etc/mesos-slave
                #
                logger.debug('checking /opt/mesosphere/etc/mesos-slave...')
                _, lines = shell("grep MESOS_MASTER /opt/mesosphere/etc/mesos-slave")
                return lines[0][13:]

            def _3():

                #
                # - a regular package install will write the slave settings under /etc/mesos/zk (the snippet in
                #   there looks like zk://10.0.0.56:2181/mesos)
                #
                logger.debug('checking /etc/mesos/zk...')
                _, lines = shell("cat /etc/mesos/zk")
                return lines[0]

            def _4():

                #
                # - look for ZK from environment variables
                # - user can pass down ZK using $ochopod_zk
                # - this last-resort situation is used mostly for debugging
                #
                logger.debug('checking $ochopod_zk environment variable...')
                return env['ochopod_zk']

            #
            # - depending on how the slave has been installed we might have to look in various places
            #   to find out what our zookeeper connection string is
            # - use urlparse to keep the host:port part of the URL (possibly including a login+password)
            # - a lookup that fails or yields nothing is logged and skipped so that the next one gets its chance
            #
            for method in [_1, _2, _3, _4]:
                try:
                    found = urlparse(method()).netloc

                except Exception as failure:
                    logger.debug('zookeeper lookup failed -> %s' % failure)
                    continue

                if found:
                    hints['zk'] = found
                    break

        #
        # - the cluster must be fully qualified with a namespace (which is defaulted anyway)
        #
        assert hints['zk'], 'unable to determine where zookeeper is located (unsupported/bogus mesos setup ?)'
        assert hints['cluster'] and hints['namespace'], 'no cluster and/or namespace defined (user error ?)'

        #
        # - load the tools
        # - only keep Tool subclasses whose instance defines a tag & index them by tag
        #
        if tools:
            tools = {tool.tag: tool for tool in [clz() for clz in tools if issubclass(clz, Tool)] if tool.tag}
            logger.info('supporting tools %s' % ', '.join(tools.keys()))

        #
        # - start the life-cycle actor which will pass our hints (as a json object) to its underlying sub-process
        # - start our coordinator which will connect to zookeeper and attempt to lead the cluster
        # - upon grabbing the lock the model actor will start and implement the configuration process
        # - the hints are a convenient bag for any data that may change at runtime and needs to be returned (via
        #   the HTTP POST /info request)
        # - what's being registered in zookeeper is immutable though and decorated with additional details by
        #   the coordinator (especially the pod index which is derived from zookeeper)
        #
        latch = ThreadingFuture()
        logger.info('starting %s.%s (marathon) @ %s' % (hints['namespace'], hints['cluster'], hints['node']))
        breadcrumbs = deepcopy(hints)
        hints['metrics'] = {}
        hints['dependencies'] = model.depends_on
        env.update({'ochopod': json.dumps(hints)})
        executor = lifecycle.start(env, latch, hints)
        coordinator = Coordinator.start(
            hints['zk'].split(','),
            hints['namespace'],
            hints['cluster'],
            int(hints['port']),
            breadcrumbs,
            model,
            hints)

        #
        # - external hook forcing a coordinator reset
        # - this will force a re-connection to zookeeper and pod registration
        # - please note this will not impact the pod lifecycle (e.g the underlying sub-process will be
        #   left running)
        #
        @web.route('/reset', methods=['POST'])
        def _reset():
            logger.debug('http in -> /reset')
            coordinator.tell({'request': 'reset'})
            return '{}', 200, as_json

        #
        # - external hook exposing information about our pod
        # - this is a subset of what's registered in zookeeper at boot-time
        # - the data is dynamic and updated from time to time by the model and executor actors
        # - from @pferro -> the pod's dependencies defined in the model are now added as well
        # - items() is used on purpose to iterate over a snapshot under python 2 (the actors may update the
        #   hints while we serialize them)
        #
        @web.route('/info', methods=['POST'])
        def _info():
            logger.debug('http in -> /info')
            keys = \
                (
                    'application',
                    'dependencies',
                    'ip',
                    'metrics',
                    'node',
                    'port',
                    'ports',
                    'process',
                    'public',
                    'state',
                    'status',
                    'task'
                )

            subset = {k: v for k, v in hints.items() if k in keys}
            return json.dumps(subset), 200, as_json

        #
        # - external hook exposing our circular log
        # - dump ochopod.log as a json array (one item per line, in the order found in the file)
        # - the file is only read, no need to request write access
        #
        @web.route('/log', methods=['POST'])
        def _log():
            logger.debug('http in -> /log')
            with open(ochopod.LOG, 'r') as log:
                lines = log.readlines()
                return json.dumps(lines), 200, as_json

        #
        # - RPC call to run a custom tool within the pod
        #
        @web.route('/exec', methods=['POST'])
        def _exec():
            logger.debug('http in -> /exec')

            #
            # - make sure the command (first token in the X-Shell header) maps to a tool
            # - if no match abort on a 404
            #
            line = request.headers['X-Shell']
            tokens = line.split(' ')
            cmd = tokens[0]
            if not tools or cmd not in tools:
                return '{}', 404, as_json

            code = 1
            tool = tools[cmd]

            #
            # - make sure the parser does not sys.exit()
            #
            class _Parser(ArgumentParser):
                def exit(self, status=0, message=None):
                    raise ValueError(message)

            #
            # - prep a temporary directory
            # - invoke define_cmdline_parsing()
            # - switch off parsing if NotImplementedError is raised
            #
            use_parser = 1
            parser = _Parser(prog=tool.tag)
            try:
                tool.define_cmdline_parsing(parser)

            except NotImplementedError:
                use_parser = 0

            tmp = tempfile.mkdtemp()
            try:

                #
                # - parse the command line
                # - upload any attachment
                # - the multipart field name comes from the client: only keep its base name to make sure
                #   nothing can be written outside of our temporary directory
                #
                args = parser.parse_args(tokens[1:]) if use_parser else ' '.join(tokens[1:])
                for tag, upload in request.files.items():
                    where = path.join(tmp, path.basename(tag))
                    logger.debug('uploading %s @ %s' % (tag, tmp))
                    upload.save(where)

                #
                # - run the tool method
                # - pass the temporary directory as well
                #
                logger.info('invoking "%s"' % line)
                code, lines = tool.body(args, tmp)

            except ValueError as failure:

                #
                # - same semantics as the deprecated (and python 2 only) BaseException.message attribute
                # - None means the parser exited without any message (typically -h), display the help
                #
                message = failure.args[0] if len(failure.args) == 1 else ''
                lines = [parser.format_help() if message is None else message]

            except Exception as failure:
                lines = ['unexpected failure -> %s' % failure]

            finally:

                #
                # - make sure to cleanup our temporary directory
                #
                shutil.rmtree(tmp)

            out = \
                {
                    'code': code,
                    'stdout': lines
                }

            return json.dumps(out), 200, as_json

        #
        # - web-hook used to receive requests from the leader or the CLI tools
        # - those requests are passed down to the executor actor
        # - any non HTTP 200 response is a failure
        # - failure to acknowledge within the specified timeout will result in a HTTP 408 (REQUEST TIMEOUT)
        # - attempting to send a control request to a dead pod will result in a HTTP 410 (GONE)
        #
        @web.route('/control/<task>', methods=['POST'])
        @web.route('/control/<task>/<timeout>', methods=['POST'])
        def _control(task, timeout='60'):
            logger.debug('http in -> /control/%s' % task)
            if task not in ['check', 'on', 'off', 'ok', 'kill', 'signal']:

                #
                # - fail on a HTTP 400 if the request is not supported
                #
                return '{}', 400, as_json

            try:
                seconds = int(timeout)

            except ValueError:

                #
                # - same thing if the timeout is not an integer
                #
                return '{}', 400, as_json

            try:
                ts = time.time()
                ack = ThreadingFuture()
                executor.tell({'request': task, 'latch': ack, 'data': request.data})
                js, code = ack.get(timeout=seconds)

                #
                # - time.time() is in seconds, scale to report actual milliseconds
                #
                ms = 1000.0 * (time.time() - ts)
                logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                return json.dumps(js), code, as_json

            except Timeout:

                #
                # - we failed to match the specified timeout
                # - gracefully fail on a HTTP 408
                #
                return '{}', 408, as_json

            except ActorDeadError:

                #
                # - the executor has been shutdown (probably after a /control/kill)
                # - gracefully fail on a HTTP 410
                #
                return '{}', 410, as_json

        #
        # - internal hook required to shutdown the web-server
        # - it's not possible to do it outside of a request handler
        # - make sure this calls only comes from localhost (todo)
        #
        @web.route('/terminate', methods=['POST'])
        def _terminate():
            request.environ.get('werkzeug.server.shutdown')()
            return '{}', 200, as_json

        #
        # - run werkzeug from a separate thread to avoid blocking the main one
        # - we'll have to shut it down using a dedicated HTTP POST
        #
        class _Runner(threading.Thread):
            def run(self):
                web.run(host='0.0.0.0', port=int(hints['port']), threaded=True)

        try:

            #
            # - block on the lifecycle actor until it goes down (usually after a /control/kill request)
            #
            _Runner().start()
            spin_lock(latch)
            logger.debug('pod is dead, idling')
            while True:

                #
                # - simply idle forever (since the framework would restart any container that terminates)
                # - /log and /hints HTTP requests will succeed (and show the pod as being killed)
                # - any control request will now fail
                #
                time.sleep(60.0)

        finally:

            #
            # - when we exit the block first shutdown our executor (which may probably be already down)
            # - then shutdown the coordinator to un-register from zookeeper
            # - finally ask werkzeug to shutdown via a REST call
            #
            shutdown(executor)
            shutdown(coordinator)
            post('http://127.0.0.1:%s/terminate' % env['ochopod_port'])

    except KeyboardInterrupt:
        logger.fatal('CTRL-C pressed')

    except Exception as failure:
        logger.fatal('unexpected condition -> %s' % diagnostic(failure))
def boot(self, lifecycle, model=Reactive, tools=None, local=False):
    """
    Boot a pod using the marathon bindings: assemble the pod hints from the environment (plus whatever
    self.get_node_details() reports), locate zookeeper, start the life-cycle & coordinator actors, serve the
    HTTP endpoints (/reset, /info, /log, /exec, /control and /terminate) and then block until the process is
    interrupted. Any failure is logged as fatal and swallowed.

    :param lifecycle: class deriving from :class:`LifeCycle`, started as the executor actor
    :param model: class deriving from :class:`Model`, handed over to the coordinator
    :param tools: optional iterable of :class:`Tool` subclasses that can be invoked via HTTP POST /exec
    :param local: if truthy run in local mode (everything is defaulted to localhost, no mesos lookup)
    """

    #
    # - quick check to make sure we get the right implementations
    #
    assert issubclass(model, Model), 'model must derive from ochopod.api.Model'
    assert issubclass(lifecycle, LifeCycle), 'lifecycle must derive from ochopod.api.LifeCycle'

    #
    # - all our responses are tagged as json
    #
    as_json = {'Content-Type': 'application/json; charset=utf-8'}

    #
    # - instantiate our flask endpoint
    # - default to a json handler for all HTTP errors (including an unexpected 500)
    #
    def _handler(error):
        http = error.code if isinstance(error, HTTPException) else 500
        return '{}', http, as_json

    web = Flask(__name__)
    for code in default_exceptions:
        web.error_handler_spec[None][code] = _handler

    #
    # - default presets in case we run outside of marathon (local vm testing)
    # - any environment variable prefixed with "ochopod_" is of interest for us (e.g this is what the user puts
    #   in the marathon application configuration for instance)
    # - the other settings come from marathon (namely the port bindings & application/task identifiers)
    # - the MESOS_TASK_ID is important to keep around to enable task deletion via the marathon REST API
    #
    env = \
        {
            'ochopod_application': '',
            'ochopod_cluster': 'default',
            'ochopod_debug': 'true',
            'ochopod_local': 'false',
            'ochopod_namespace': 'marathon',
            'ochopod_port': '8080',
            'ochopod_start': 'true',
            'ochopod_task': '',
            'ochopod_zk': '',
            'PORT_8080': '8080'
        }

    env.update(os.environ)
    ochopod.enable_cli_log(debug=env['ochopod_debug'] == 'true')
    try:

        #
        # - grab our environment variables (which are set by the marathon executor)
        # - extract the mesos PORT_* bindings and construct a small remapping dict
        #
        logger.debug('environment ->\n%s' % '\n'.join(['\t%s -> %s' % (k, v) for k, v in env.items()]))
        ports = {key[5:]: int(val) for key, val in env.items() if key.startswith('PORT_')}

        #
        # - keep any "ochopod_" environment variable & trim its prefix
        # - default all our settings, especially the mandatory ones
        # - the ip and zookeeper are defaulted to localhost to enable easy testing
        #
        hints = {k[8:]: v for k, v in env.items() if k.startswith('ochopod_')}
        if local or hints['local'] == 'true':

            #
            # - we are running in local mode (e.g on a dev workstation)
            # - default everything to localhost
            #
            logger.info('running in local mode (make sure you run a standalone zookeeper)')
            hints.update(
                {
                    'fwk': 'marathon (debug)',
                    'ip': '127.0.0.1',
                    'node': 'local',
                    'ports': ports,
                    'public': '127.0.0.1',
                    'zk': '127.0.0.1:2181'
                })
        else:

            #
            # - extend our hints
            # - add the application + task
            #
            hints.update(
                {
                    'application': env['MARATHON_APP_ID'][1:],
                    'fwk': 'marathon',
                    'ip': '',
                    'node': '',
                    'ports': ports,
                    'public': '',
                    'task': env['MESOS_TASK_ID'],
                    'zk': ''
                })

            #
            # - use whatever subclass is implementing us to infer 'ip', 'node' and 'public'
            #
            hints.update(self.get_node_details())

            #
            # - lookup for the zookeeper connection string from environment variable or on disk
            # - we have to look into different places depending on how mesos was installed
            # - the slicing below drops either the "MESOS_MASTER=zk://" (18 characters) or the "zk://"
            #   (5 characters) prefix
            #
            def _1():

                #
                # - most recent DCOS release
                # - $MESOS_MASTER is located in /opt/mesosphere/etc/mesos-slave-common
                # - the snippet in there is prefixed by MESOS_MASTER=zk://<ip:port>/mesos
                #
                logger.debug('checking /opt/mesosphere/etc/mesos-slave-common...')
                _, lines = shell("grep MESOS_MASTER /opt/mesosphere/etc/mesos-slave-common")
                return lines[0][18:].split('/')[0]

            def _2():

                #
                # - same as above except for slightly older DCOS releases
                # - $MESOS_MASTER is located in /opt/mesosphere/etc/mesos-slave
                #
                logger.debug('checking /opt/mesosphere/etc/mesos-slave...')
                _, lines = shell("grep MESOS_MASTER /opt/mesosphere/etc/mesos-slave")
                return lines[0][18:].split('/')[0]

            def _3():

                #
                # - a regular package install will write the slave settings under /etc/mesos/zk (the snippet in
                #   there looks like zk://10.0.0.56:2181/mesos)
                #
                logger.debug('checking /etc/mesos/zk...')
                _, lines = shell("cat /etc/mesos/zk")
                return lines[0][5:].split('/')[0]

            def _4():

                #
                # - look for ZK from environment variables
                # - user can pass down ZK using $ochopod_zk
                # - nothing is returned if the variable is not set
                #
                logger.debug('checking $ochopod_zk environment variable...')
                if env['ochopod_zk']:
                    logger.debug('found $ochopod_zk environment variable...')
                    return env['ochopod_zk'][5:].split('/')[0]

                return None

            #
            # - depending on how the slave has been installed we might have to look in various places
            #   to find out what our zookeeper connection string is
            # - warning, a URL like format such as zk://<ip:port>,..,<ip:port>/mesos is used
            # - just keep the ip & port part and discard the rest
            # - a lookup that fails or yields nothing is logged and skipped so that the next one gets its chance
            #
            for method in [_1, _2, _3, _4]:
                try:
                    found = method()

                except Exception as failure:
                    logger.debug('zookeeper lookup failed -> %s' % failure)
                    continue

                if found:
                    hints['zk'] = found
                    break

        #
        # - the cluster must be fully qualified with a namespace (which is defaulted anyway)
        #
        assert hints['zk'], 'unable to determine where zookeeper is located (unsupported/bogus mesos setup ?)'
        assert hints['cluster'] and hints['namespace'], 'no cluster and/or namespace defined (user error ?)'

        #
        # - load the tools
        # - only keep Tool subclasses whose instance defines a tag & index them by tag
        #
        if tools:
            tools = {tool.tag: tool for tool in [clz() for clz in tools if issubclass(clz, Tool)] if tool.tag}
            logger.info('supporting tools %s' % ', '.join(tools.keys()))

        #
        # - start the life-cycle actor which will pass our hints (as a json object) to its underlying sub-process
        # - start our coordinator which will connect to zookeeper and attempt to lead the cluster
        # - upon grabbing the lock the model actor will start and implement the configuration process
        # - the hints are a convenient bag for any data that may change at runtime and needs to be returned (via
        #   the HTTP POST /info request)
        # - what's being registered in zookeeper is immutable though and decorated with additional details by
        #   the coordinator (especially the pod index which is derived from zookeeper)
        #
        latch = ThreadingFuture()
        logger.info('starting %s.%s (marathon) @ %s' % (hints['namespace'], hints['cluster'], hints['node']))
        breadcrumbs = deepcopy(hints)
        hints['metrics'] = {}
        hints['dependencies'] = model.depends_on
        env.update({'ochopod': json.dumps(hints)})
        executor = lifecycle.start(env, latch, hints)
        coordinator = Coordinator.start(
            hints['zk'].split(','),
            hints['namespace'],
            hints['cluster'],
            int(hints['port']),
            breadcrumbs,
            model,
            hints)

        #
        # - external hook forcing a coordinator reset
        # - this will force a re-connection to zookeeper and pod registration
        # - please note this will not impact the pod lifecycle (e.g the underlying sub-process will be
        #   left running)
        #
        @web.route('/reset', methods=['POST'])
        def _reset():
            logger.debug('http in -> /reset')
            coordinator.tell({'request': 'reset'})
            return '{}', 200, as_json

        #
        # - external hook exposing information about our pod
        # - this is a subset of what's registered in zookeeper at boot-time
        # - the data is dynamic and updated from time to time by the model and executor actors
        # - from @pferro -> the pod's dependencies defined in the model are now added as well
        # - items() is used on purpose to iterate over a snapshot under python 2 (the actors may update the
        #   hints while we serialize them)
        #
        @web.route('/info', methods=['POST'])
        def _info():
            logger.debug('http in -> /info')
            keys = \
                (
                    'application',
                    'dependencies',
                    'ip',
                    'metrics',
                    'node',
                    'port',
                    'ports',
                    'process',
                    'public',
                    'state',
                    'status',
                    'task'
                )

            subset = {k: v for k, v in hints.items() if k in keys}
            return json.dumps(subset), 200, as_json

        #
        # - external hook exposing our circular log
        # - dump ochopod.log as a json array (one item per line, in the order found in the file)
        # - the file is only read, no need to request write access
        #
        @web.route('/log', methods=['POST'])
        def _log():
            logger.debug('http in -> /log')
            with open(ochopod.LOG, 'r') as log:
                lines = log.readlines()
                return json.dumps(lines), 200, as_json

        #
        # - RPC call to run a custom tool within the pod
        #
        @web.route('/exec', methods=['POST'])
        def _exec():
            logger.debug('http in -> /exec')

            #
            # - make sure the command (first token in the X-Shell header) maps to a tool
            # - if no match abort on a 404
            #
            line = request.headers['X-Shell']
            tokens = line.split(' ')
            cmd = tokens[0]
            if not tools or cmd not in tools:
                return '{}', 404, as_json

            code = 1
            tool = tools[cmd]

            #
            # - make sure the parser does not sys.exit()
            #
            class _Parser(ArgumentParser):
                def exit(self, status=0, message=None):
                    raise ValueError(message)

            #
            # - prep a temporary directory
            # - invoke define_cmdline_parsing()
            # - switch off parsing if NotImplementedError is raised
            #
            use_parser = 1
            parser = _Parser(prog=tool.tag)
            try:
                tool.define_cmdline_parsing(parser)

            except NotImplementedError:
                use_parser = 0

            tmp = tempfile.mkdtemp()
            try:

                #
                # - parse the command line
                # - upload any attachment
                # - the multipart field name comes from the client: only keep its base name to make sure
                #   nothing can be written outside of our temporary directory
                #
                args = parser.parse_args(tokens[1:]) if use_parser else ' '.join(tokens[1:])
                for tag, upload in request.files.items():
                    where = path.join(tmp, path.basename(tag))
                    logger.debug('uploading %s @ %s' % (tag, tmp))
                    upload.save(where)

                #
                # - run the tool method
                # - pass the temporary directory as well
                #
                logger.info('invoking "%s"' % line)
                code, lines = tool.body(args, tmp)

            except ValueError as failure:

                #
                # - same semantics as the deprecated (and python 2 only) BaseException.message attribute
                # - None means the parser exited without any message (typically -h), display the help
                #
                message = failure.args[0] if len(failure.args) == 1 else ''
                lines = [parser.format_help() if message is None else message]

            except Exception as failure:
                lines = ['unexpected failure -> %s' % failure]

            finally:

                #
                # - make sure to cleanup our temporary directory
                #
                shutil.rmtree(tmp)

            out = \
                {
                    'code': code,
                    'stdout': lines
                }

            return json.dumps(out), 200, as_json

        #
        # - web-hook used to receive requests from the leader or the CLI tools
        # - those requests are passed down to the executor actor
        # - any non HTTP 200 response is a failure
        # - failure to acknowledge within the specified timeout will result in a HTTP 408 (REQUEST TIMEOUT)
        # - attempting to send a control request to a dead pod will result in a HTTP 410 (GONE)
        #
        @web.route('/control/<task>', methods=['POST'])
        @web.route('/control/<task>/<timeout>', methods=['POST'])
        def _control(task, timeout='60'):
            logger.debug('http in -> /control/%s' % task)
            if task not in ['check', 'on', 'off', 'ok', 'kill', 'signal']:

                #
                # - fail on a HTTP 400 if the request is not supported
                #
                return '{}', 400, as_json

            try:
                seconds = int(timeout)

            except ValueError:

                #
                # - same thing if the timeout is not an integer
                #
                return '{}', 400, as_json

            try:
                ts = time.time()
                ack = ThreadingFuture()
                executor.tell({'request': task, 'latch': ack, 'data': request.data})
                js, code = ack.get(timeout=seconds)

                #
                # - time.time() is in seconds, scale to report actual milliseconds
                #
                ms = 1000.0 * (time.time() - ts)
                logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                return json.dumps(js), code, as_json

            except Timeout:

                #
                # - we failed to match the specified timeout
                # - gracefully fail on a HTTP 408
                #
                return '{}', 408, as_json

            except ActorDeadError:

                #
                # - the executor has been shutdown (probably after a /control/kill)
                # - gracefully fail on a HTTP 410
                #
                return '{}', 410, as_json

        #
        # - internal hook required to shutdown the web-server
        # - it's not possible to do it outside of a request handler
        # - make sure this calls only comes from localhost (todo)
        #
        @web.route('/terminate', methods=['POST'])
        def _terminate():
            request.environ.get('werkzeug.server.shutdown')()
            return '{}', 200, as_json

        #
        # - run werkzeug from a separate thread to avoid blocking the main one
        # - we'll have to shut it down using a dedicated HTTP POST
        #
        class _Runner(threading.Thread):
            def run(self):
                web.run(host='0.0.0.0', port=int(hints['port']), threaded=True)

        try:

            #
            # - block on the lifecycle actor until it goes down (usually after a /control/kill request)
            #
            _Runner().start()
            spin_lock(latch)
            logger.debug('pod is dead, idling')
            while True:

                #
                # - simply idle forever (since the framework would restart any container that terminates)
                # - /log and /hints HTTP requests will succeed (and show the pod as being killed)
                # - any control request will now fail
                #
                time.sleep(60.0)

        finally:

            #
            # - when we exit the block first shutdown our executor (which may probably be already down)
            # - then shutdown the coordinator to un-register from zookeeper
            # - finally ask werkzeug to shutdown via a REST call
            #
            shutdown(executor)
            shutdown(coordinator)
            post('http://127.0.0.1:%s/terminate' % env['ochopod_port'])

    except KeyboardInterrupt:
        logger.fatal('CTRL-C pressed')

    except Exception as failure:
        logger.fatal('unexpected condition -> %s' % diagnostic(failure))
def boot(self, lifecycle, model=Reactive, local=0):
    """
    Boot the pod using the EC2/kubernetes bindings and serve its HTTP control endpoints.

    The hints are assembled from the "ochopod_" environment variables and, unless running in local mode,
    from the EC2 metadata endpoint and the kubernetes master. The life-cycle and coordinator actors are then
    started and the flask endpoints (/reset, /info, /log, /control and /terminate) are served from a
    dedicated thread.

    :param lifecycle: class deriving from ochopod.api.LifeCycle (started with the environment, a latch and
        the hints)
    :param model: class deriving from ochopod.api.Model which is handed to the coordinator
    :param local: any truthy value forces the local mode (same effect as setting $ochopod_local to "true")

    This method is not expected to return: it blocks while the pod is up, idles once it is dead and calls
    exit(1) after a failure or a CTRL-C.
    """

    #
    # - quick check to make sure we get the right implementations
    #
    assert issubclass(model, Model), 'model must derive from ochopod.api.Model'
    assert issubclass(lifecycle, LifeCycle), 'lifecycle must derive from ochopod.api.LifeCycle'

    #
    # - start logging to /var/log/ochopod.log
    #
    logger.info('EC2 kubernetes bindings started')
    web = Flask(__name__)

    #
    # - default presets in case some settings are not passed via the environment (local vm testing)
    # - any environment variable prefixed with "ochopod_" is of interest for us (e.g this is what the user puts
    #   in the pod configuration yaml/json for instance)
    #
    env = \
        {
            'ochopod_application': '',
            'ochopod_cluster': '',
            'ochopod_debug': 'true',
            'ochopod_local': 'false',
            'ochopod_namespace': 'default',
            'ochopod_port': '8080',
            'ochopod_start': 'true',
            'ochopod_task': ''
        }

    env.update(os.environ)
    ochopod.enable_cli_log(debug=env['ochopod_debug'] == 'true')
    try:

        #
        # - grab our environment variables
        # - isolate the ones prefixed with ochopod_ (the 8 characters prefix is trimmed)
        #
        logger.debug('environment ->\n%s' % '\n'.join(['\t%s -> %s' % (k, v) for k, v in env.items()]))
        hints = {k[8:]: v for k, v in env.items() if k.startswith('ochopod_')}
        if local or hints['local'] == 'true':

            #
            # - we are running in local mode (e.g on a dev workstation)
            # - default everything to localhost
            #
            logger.info('running in local mode (make sure you run a standalone zookeeper)')
            hints.update(
                {
                    'fwk': 'kubernetes',
                    'ip': '127.0.0.1',
                    'node': 'localhost',
                    'public': '127.0.0.1',
                    'zk': '127.0.0.1:2181'
                })
        else:

            #
            # - we are (assuming to be) deployed on EC2
            # - we'll retrieve the underlying metadata using curl
            # - the exit code is compared by value (an identity check against a literal is unreliable)
            #
            def _aws(token):
                code, lines = shell('curl -f http://169.254.169.254/latest/meta-data/%s' % token)
                assert code == 0, 'unable to lookup EC2 metadata for %s (are you running on EC2 ?)' % token
                return lines[0]

            #
            # - workaround to fetch the master IP and credentials as there does not seem to be a way to
            #   use 10.0.0.2 from within the pod yet
            # - curl to the master to retrieve info about our cluster
            # - don't forget to merge the resulting output
            # - NOTE(review): the credentials are interpolated unquoted into a shell command line and curl
            #   runs with -k, this should be revisited
            #
            def _k8s(token):
                code, lines = shell(
                    'curl -f -u %s:%s -k https://%s/api/v1beta3/namespaces/default/%s' %
                    (env['KUBERNETES_USER'], env['KUBERNETES_PWD'], env['KUBERNETES_MASTER'], token))
                assert code == 0, 'unable to look the RO service up (is the master running ?)'
                return json.loads(''.join(lines))

            #
            # - look our local k8s pod up
            # - get our container ip
            # - extract the port bindings
            #
            @retry(timeout=60, pause=1)
            def _spin():

                #
                # - wait til the k8s pod is running and publishing its IP
                #
                cfg = _k8s('pods/%s' % env['HOSTNAME'])
                assert 'podIP' in cfg['status'], 'pod not ready yet -> %s' % cfg['status']['phase']
                return cfg

            this_pod = _spin()
            hints['ip'] = this_pod['status']['podIP']

            #
            # - revert to the k8s pod name if no cluster is specified
            #
            if not hints['cluster']:
                hints['cluster'] = this_pod['metadata']['name']

            #
            # - consider the 1st pod container
            # - grab the exposed ports (no remapping required)
            #
            ports = {}
            container = this_pod['spec']['containers'][0]
            for binding in container['ports']:
                port = binding['containerPort']
                ports[str(port)] = port

            #
            # - set 'task' to $HOSTNAME (the container is named after the k8s pod)
            # - get our public IPV4 address
            # - the "node" will show up as the EC2 instance ID
            #
            hints.update(
                {
                    'fwk': 'k8s-ec2',
                    'node': _aws('instance-id'),
                    'ports': ports,
                    'public': _aws('public-ipv4'),
                    'task': env['HOSTNAME']
                })

            #
            # - look the k8s "ocho-proxy" pod up
            # - it is expected to run our synchronization zookeeper
            # - re-use the payload we just validated instead of querying the master a second time
            #
            proxy = _k8s('pods/ocho-proxy')
            assert 'podIP' in proxy['status'], 'proxy not ready ?'
            hints['zk'] = proxy['status']['podIP']

        #
        # - the cluster must be fully qualified with a namespace (which is defaulted anyway)
        #
        assert hints['namespace'], 'no namespace defined (user error ?)'

        #
        # - start the life-cycle actor which will pass our hints (as a json object) to its underlying sub-process
        # - start our coordinator which will connect to zookeeper and attempt to lead the cluster
        # - upon grabbing the lock the model actor will start and implement the configuration process
        # - the hints are a convenient bag for any data that may change at runtime and needs to be returned (via
        #   the HTTP POST /info request)
        # - what's being registered in zookeeper is immutable though and decorated with additional details by
        #   the coordinator (especially the pod index which is derived from zookeeper)
        #
        latch = ThreadingFuture()
        logger.info('starting %s.%s (kubernetes/ec2) @ %s' % (hints['namespace'], hints['cluster'], hints['node']))
        breadcrumbs = deepcopy(hints)
        env.update({'ochopod': json.dumps(hints)})
        executor = lifecycle.start(env, latch, hints)
        coordinator = Coordinator.start(
            hints['zk'].split(','),
            hints['namespace'],
            hints['cluster'],
            int(hints['port']),
            breadcrumbs,
            model,
            hints)

        #
        # - external hook forcing a coordinator reset
        # - this will force a re-connection to zookeeper and pod registration
        # - please note this will not impact the pod lifecycle (e.g the underlying sub-process will be
        #   left running)
        #
        @web.route('/reset', methods=['POST'])
        def _reset():
            coordinator.tell({'request': 'reset'})
            return '{}', 200

        #
        # - external hook exposing information about our pod
        # - this is a subset of what's registered in zookeeper at boot-time
        # - the data is dynamic and updated from time to time by the model and executor actors
        # - items() is used (as opposed to iteritems()) to remain portable across python 2 and 3
        #
        @web.route('/info', methods=['POST'])
        def _info():
            keys = \
                [
                    'application',
                    'ip',
                    'node',
                    'port',
                    'ports',
                    'process',
                    'public',
                    'state',
                    'status',
                    'task'
                ]

            subset = {key: value for key, value in hints.items() if key in keys}
            return json.dumps(subset), 200

        #
        # - external hook exposing our circular log
        # - dump ochopod.log as a json array (one entry per line, in the order found in the file)
        # - the file is only read, hence the plain read-only mode
        #
        @web.route('/log', methods=['POST'])
        def _log():
            with open(ochopod.LOG, 'r') as log:
                lines = log.readlines()
                return json.dumps(lines), 200

        #
        # - web-hook used to receive requests from the leader or the CLI tools
        # - those requests are passed down to the executor actor
        # - any non HTTP 200 response is a failure
        # - failure to acknowledge within the specified timeout will result in a HTTP 408 (REQUEST TIMEOUT)
        # - attempting to send a control request to a dead pod will result in a HTTP 410 (GONE)
        #
        @web.route('/control/<task>', methods=['POST'])
        @web.route('/control/<task>/<timeout>', methods=['POST'])
        def _control(task, timeout='60'):
            try:

                ts = time.time()
                logger.debug('http in -> /control/%s' % task)
                latch = ThreadingFuture()
                executor.tell({'request': task, 'latch': latch, 'data': request.data})
                js, code = latch.get(timeout=int(timeout))

                #
                # - time.time() is expressed in seconds, scale it to match the unit we log
                #
                ms = 1000.0 * (time.time() - ts)
                logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                return json.dumps(js), code

            except Timeout:

                #
                # - we failed to match the specified timeout
                # - gracefully fail on a HTTP 408
                #
                return '{}', 408

            except ActorDeadError:

                #
                # - the executor has been shutdown (probably after a /control/kill)
                # - gracefully fail on a HTTP 410
                #
                return '{}', 410

        #
        # - internal hook required to shutdown the web-server
        # - it's not possible to do it outside of a request handler
        # - make sure this calls only comes from localhost (todo)
        #
        @web.route('/terminate', methods=['POST'])
        def _terminate():
            request.environ.get('werkzeug.server.shutdown')()
            return '{}', 200

        class _Runner(threading.Thread):
            """
            Run werkzeug from a separate thread to avoid blocking the main one. We'll have to shut it down
            using a dedicated HTTP POST.
            """

            def run(self):
                web.run(host='0.0.0.0', port=int(hints['port']), threaded=True)

        try:

            #
            # - block on the lifecycle actor until it goes down (usually after a /control/kill request)
            #
            _Runner().start()
            spin_lock(latch)
            logger.debug('pod is dead, idling')

            #
            # - simply idle forever (since the framework would restart any container that terminates)
            # - /log and /info HTTP requests will succeed (and show the pod as being killed)
            # - any control request will now fail
            #
            while True:
                time.sleep(60.0)

        finally:

            #
            # - when we exit the block first shutdown our executor (which may probably be already down)
            # - then shutdown the coordinator to un-register from zookeeper
            # - finally ask werkzeug to shutdown via a REST call
            #
            shutdown(executor)
            shutdown(coordinator)
            post('http://127.0.0.1:%s/terminate' % env['ochopod_port'])

    except KeyboardInterrupt:

        logger.fatal('CTRL-C pressed')

    except Exception as failure:

        logger.fatal('unexpected condition -> %s' % diagnostic(failure))

    exit(1)
def boot(self, lifecycle, model=Reactive, local=0):
    """
    Boot the pod using the EC2/kubernetes bindings and serve its HTTP control endpoints.

    The hints are assembled from the "ochopod_" environment variables and, unless running in local mode,
    from the EC2 metadata endpoint and the kubernetes master. The life-cycle and coordinator actors are then
    started and the flask endpoints (/reset, /info, /log, /control and /terminate) are served from a
    dedicated thread.

    :param lifecycle: class deriving from ochopod.api.LifeCycle (started with the environment, a latch and
        the hints)
    :param model: class deriving from ochopod.api.Model which is handed to the coordinator
    :param local: any truthy value forces the local mode (same effect as setting $ochopod_local to "true")

    This method is not expected to return: it blocks while the pod is up, idles once it is dead and calls
    exit(1) after a failure or a CTRL-C.
    """

    #
    # - quick check to make sure we get the right implementations
    #
    assert issubclass(model, Model), 'model must derive from ochopod.api.Model'
    assert issubclass(lifecycle, LifeCycle), 'lifecycle must derive from ochopod.api.LifeCycle'

    #
    # - start logging to /var/log/ochopod.log
    #
    logger.info('EC2 kubernetes bindings started')
    web = Flask(__name__)

    #
    # - default presets in case some settings are not passed via the environment (local vm testing)
    # - any environment variable prefixed with "ochopod_" is of interest for us (e.g this is what the user puts
    #   in the pod configuration yaml/json for instance)
    #
    env = \
        {
            'ochopod_application': '',
            'ochopod_cluster': '',
            'ochopod_debug': 'true',
            'ochopod_local': 'false',
            'ochopod_namespace': 'default',
            'ochopod_port': '8080',
            'ochopod_start': 'true',
            'ochopod_task': ''
        }

    env.update(os.environ)
    ochopod.enable_cli_log(debug=env['ochopod_debug'] == 'true')
    try:

        #
        # - grab our environment variables
        # - isolate the ones prefixed with ochopod_ (the 8 characters prefix is trimmed)
        #
        logger.debug('environment ->\n%s' % '\n'.join(['\t%s -> %s' % (k, v) for k, v in env.items()]))
        hints = {k[8:]: v for k, v in env.items() if k.startswith('ochopod_')}
        if local or hints['local'] == 'true':

            #
            # - we are running in local mode (e.g on a dev workstation)
            # - default everything to localhost
            #
            logger.info('running in local mode (make sure you run a standalone zookeeper)')
            hints.update(
                {
                    'fwk': 'kubernetes',
                    'ip': '127.0.0.1',
                    'node': 'localhost',
                    'public': '127.0.0.1',
                    'zk': '127.0.0.1:2181'
                })
        else:

            #
            # - we are (assuming to be) deployed on EC2
            # - we'll retrieve the underlying metadata using curl
            # - the exit code is compared by value (an identity check against a literal is unreliable)
            #
            def _aws(token):
                code, lines = shell('curl -f http://169.254.169.254/latest/meta-data/%s' % token)
                assert code == 0, 'unable to lookup EC2 metadata for %s (are you running on EC2 ?)' % token
                return lines[0]

            #
            # - workaround to fetch the master IP and credentials as there does not seem to be a way to
            #   use 10.0.0.2 from within the pod yet
            # - curl to the master to retrieve info about our cluster
            # - don't forget to merge the resulting output
            # - NOTE(review): the credentials are interpolated unquoted into a shell command line and curl
            #   runs with -k, this should be revisited
            #
            def _k8s(token):
                code, lines = shell(
                    'curl -f -u %s:%s -k https://%s/api/v1beta3/namespaces/default/%s' %
                    (env['KUBERNETES_USER'], env['KUBERNETES_PWD'], env['KUBERNETES_MASTER'], token))
                assert code == 0, 'unable to look the RO service up (is the master running ?)'
                return json.loads(''.join(lines))

            #
            # - look our local k8s pod up
            # - get our container ip
            # - extract the port bindings
            #
            @retry(timeout=60, pause=1)
            def _spin():

                #
                # - wait til the k8s pod is running and publishing its IP
                #
                cfg = _k8s('pods/%s' % env['HOSTNAME'])
                assert 'podIP' in cfg['status'], 'pod not ready yet -> %s' % cfg['status']['phase']
                return cfg

            this_pod = _spin()
            hints['ip'] = this_pod['status']['podIP']

            #
            # - revert to the k8s pod name if no cluster is specified
            #
            if not hints['cluster']:
                hints['cluster'] = this_pod['metadata']['name']

            #
            # - consider the 1st pod container
            # - grab the exposed ports (no remapping required)
            #
            ports = {}
            container = this_pod['spec']['containers'][0]
            for binding in container['ports']:
                port = binding['containerPort']
                ports[str(port)] = port

            #
            # - set 'task' to $HOSTNAME (the container is named after the k8s pod)
            # - get our public IPV4 address
            # - the "node" will show up as the EC2 instance ID
            #
            hints.update(
                {
                    'fwk': 'k8s-ec2',
                    'node': _aws('instance-id'),
                    'ports': ports,
                    'public': _aws('public-ipv4'),
                    'task': env['HOSTNAME']
                })

            #
            # - look the k8s "ocho-proxy" pod up
            # - it is expected to run our synchronization zookeeper
            # - re-use the payload we just validated instead of querying the master a second time
            #
            proxy = _k8s('pods/ocho-proxy')
            assert 'podIP' in proxy['status'], 'proxy not ready ?'
            hints['zk'] = proxy['status']['podIP']

        #
        # - the cluster must be fully qualified with a namespace (which is defaulted anyway)
        #
        assert hints['namespace'], 'no namespace defined (user error ?)'

        #
        # - start the life-cycle actor which will pass our hints (as a json object) to its underlying sub-process
        # - start our coordinator which will connect to zookeeper and attempt to lead the cluster
        # - upon grabbing the lock the model actor will start and implement the configuration process
        # - the hints are a convenient bag for any data that may change at runtime and needs to be returned (via
        #   the HTTP POST /info request)
        # - what's being registered in zookeeper is immutable though and decorated with additional details by
        #   the coordinator (especially the pod index which is derived from zookeeper)
        #
        latch = ThreadingFuture()
        logger.info('starting %s.%s (kubernetes/ec2) @ %s' % (hints['namespace'], hints['cluster'], hints['node']))
        breadcrumbs = deepcopy(hints)
        env.update({'ochopod': json.dumps(hints)})
        executor = lifecycle.start(env, latch, hints)
        coordinator = Coordinator.start(
            hints['zk'].split(','),
            hints['namespace'],
            hints['cluster'],
            int(hints['port']),
            breadcrumbs,
            model,
            hints)

        #
        # - external hook forcing a coordinator reset
        # - this will force a re-connection to zookeeper and pod registration
        # - please note this will not impact the pod lifecycle (e.g the underlying sub-process will be
        #   left running)
        #
        @web.route('/reset', methods=['POST'])
        def _reset():
            coordinator.tell({'request': 'reset'})
            return '{}', 200

        #
        # - external hook exposing information about our pod
        # - this is a subset of what's registered in zookeeper at boot-time
        # - the data is dynamic and updated from time to time by the model and executor actors
        # - items() is used (as opposed to iteritems()) to remain portable across python 2 and 3
        #
        @web.route('/info', methods=['POST'])
        def _info():
            keys = \
                [
                    'application',
                    'ip',
                    'node',
                    'port',
                    'ports',
                    'process',
                    'public',
                    'state',
                    'status',
                    'task'
                ]

            subset = {key: value for key, value in hints.items() if key in keys}
            return json.dumps(subset), 200

        #
        # - external hook exposing our circular log
        # - dump ochopod.log as a json array (one entry per line, in the order found in the file)
        # - the file is only read, hence the plain read-only mode
        #
        @web.route('/log', methods=['POST'])
        def _log():
            with open(ochopod.LOG, 'r') as log:
                lines = log.readlines()
                return json.dumps(lines), 200

        #
        # - web-hook used to receive requests from the leader or the CLI tools
        # - those requests are passed down to the executor actor
        # - any non HTTP 200 response is a failure
        # - failure to acknowledge within the specified timeout will result in a HTTP 408 (REQUEST TIMEOUT)
        # - attempting to send a control request to a dead pod will result in a HTTP 410 (GONE)
        #
        @web.route('/control/<task>', methods=['POST'])
        @web.route('/control/<task>/<timeout>', methods=['POST'])
        def _control(task, timeout='60'):
            try:

                ts = time.time()
                logger.debug('http in -> /control/%s' % task)
                latch = ThreadingFuture()
                executor.tell({'request': task, 'latch': latch, 'data': request.data})
                js, code = latch.get(timeout=int(timeout))

                #
                # - time.time() is expressed in seconds, scale it to match the unit we log
                #
                ms = 1000.0 * (time.time() - ts)
                logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                return json.dumps(js), code

            except Timeout:

                #
                # - we failed to match the specified timeout
                # - gracefully fail on a HTTP 408
                #
                return '{}', 408

            except ActorDeadError:

                #
                # - the executor has been shutdown (probably after a /control/kill)
                # - gracefully fail on a HTTP 410
                #
                return '{}', 410

        #
        # - internal hook required to shutdown the web-server
        # - it's not possible to do it outside of a request handler
        # - make sure this calls only comes from localhost (todo)
        #
        @web.route('/terminate', methods=['POST'])
        def _terminate():
            request.environ.get('werkzeug.server.shutdown')()
            return '{}', 200

        class _Runner(threading.Thread):
            """
            Run werkzeug from a separate thread to avoid blocking the main one. We'll have to shut it down
            using a dedicated HTTP POST.
            """

            def run(self):
                web.run(host='0.0.0.0', port=int(hints['port']), threaded=True)

        try:

            #
            # - block on the lifecycle actor until it goes down (usually after a /control/kill request)
            #
            _Runner().start()
            spin_lock(latch)
            logger.debug('pod is dead, idling')

            #
            # - simply idle forever (since the framework would restart any container that terminates)
            # - /log and /info HTTP requests will succeed (and show the pod as being killed)
            # - any control request will now fail
            #
            while True:
                time.sleep(60.0)

        finally:

            #
            # - when we exit the block first shutdown our executor (which may probably be already down)
            # - then shutdown the coordinator to un-register from zookeeper
            # - finally ask werkzeug to shutdown via a REST call
            #
            shutdown(executor)
            shutdown(coordinator)
            post('http://127.0.0.1:%s/terminate' % env['ochopod_port'])

    except KeyboardInterrupt:

        logger.fatal('CTRL-C pressed')

    except Exception as failure:

        logger.fatal('unexpected condition -> %s' % diagnostic(failure))

    exit(1)