def _control(task, timeout='60'):
    """
    Web-hook forwarding a control request to the executor actor and blocking on a latch until
    the executor replies.

    :type task: str
    :param task: the control request, one of check, on, off, ok, kill or signal
    :type timeout: str
    :param timeout: maximum wait in seconds (turned into an integer using int())
    :rtype: a (body, HTTP code, headers) tuple, the body being serialized json
    """
    headers = {'Content-Type': 'application/json; charset=utf-8'}
    logger.debug('http in -> /control/%s' % task)
    if task not in ['check', 'on', 'off', 'ok', 'kill', 'signal']:

        #
        # - fail on a HTTP 400 if the request is not supported
        #
        return '{}', 400, headers

    try:

        ts = time.time()
        latch = ThreadingFuture()
        executor.tell({'request': task, 'latch': latch, 'data': request.data})
        js, code = latch.get(timeout=int(timeout))

        #
        # - time.time() is expressed in seconds
        # - scale the lapse so that it matches the 'ms' unit we log
        #
        ms = 1000.0 * (time.time() - ts)
        logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
        return json.dumps(js), code, headers

    except Timeout:

        #
        # - we failed to match the specified timeout
        # - gracefully fail on a HTTP 408
        #
        return '{}', 408, headers

    except ActorDeadError:

        #
        # - the executor has been shutdown (probably after a /control/kill)
        # - gracefully fail on a HTTP 410
        #
        return '{}', 410, headers
def _control(task, timeout='60'):
    """
    Web-hook forwarding a control request to the executor actor and blocking on a latch until
    the executor replies.

    :type task: str
    :param task: the control request passed down to the executor
    :type timeout: str
    :param timeout: maximum wait in seconds (turned into an integer using int())
    :rtype: a (body, HTTP code) tuple, the body being serialized json
    """
    try:

        ts = time.time()
        logger.debug('http in -> /control/%s' % task)
        latch = ThreadingFuture()
        executor.tell({'request': task, 'latch': latch, 'data': request.data})
        js, code = latch.get(timeout=int(timeout))

        #
        # - time.time() is expressed in seconds
        # - scale the lapse so that it matches the 'ms' unit we log
        #
        ms = 1000.0 * (time.time() - ts)
        logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
        return json.dumps(js), code

    except Timeout:

        #
        # - we failed to match the specified timeout
        # - gracefully fail on a HTTP 408
        #
        return '{}', 408

    except ActorDeadError:

        #
        # - the executor has been shutdown (probably after a /control/kill)
        # - gracefully fail on a HTTP 410
        #
        return '{}', 410
def generate_sync_link(display_name, storage_id, capacity=1000):
    """Build a mocked synchronization link whose metrics future is already resolved."""
    # the metrics are exposed through a future which is set right away
    resolved_metrics = ThreadingFuture()
    resolved_metrics.set(
        StorageMetrics(storage_id, capacity, display_name=display_name))

    # mocked engine, whatever query() hands back will answer get() with an empty dict
    engine = mock.MagicMock(SyncEngine)
    engine.query().get.return_value = {}

    # assemble the link itself: two mocked storages, only the remote one carries the id
    link = mock.Mock(SynchronizationLink)
    link.local = mock.Mock(BasicStorage)
    link.remote = mock.Mock(BasicStorage)
    link.remote.storage_id = storage_id
    link.metrics = resolved_metrics
    link.sync_engine = engine
    return link
def invoke(self, raw):
    """
    RPC API: shell invocation on behalf of the master. The code is run by the script actor and its stdout
    returned back to the caller.

    :type raw: str
    :param raw: serialized json payload (its 'cmd' entry is what gets passed to the script actor)
    :rtype: the shell script stdout upon success, None upon failure
    """
    try:

        js = json.loads(raw)
        logger.debug('RPC invoke() <- "%s"' % js['cmd'])
        msg = MSG({'request': 'invoke'})
        msg.cmd = js['cmd']
        msg.env = {'INPUT': json.dumps(js)}
        msg.latch = ThreadingFuture()

        #
        # - block on a latch and reply with whatever the shell script
        #   wrote to its standard output
        #
        actors['script'].tell(msg)
        return msg.latch.get(timeout=60)

    except Exception as failure:

        #
        # - any failure is reported back to the caller as None
        # - leave a trace in the log (otherwise there is no way to tell why the invocation failed)
        #
        logger.warning('RPC invoke() failed -> %s' % failure)
        return None
def _control(task, timeout='60'):
    """
    Web-hook forwarding a control request to the executor actor and blocking on a latch until
    the executor replies.

    :type task: str
    :param task: the control request, one of check, on, off, ok, kill or signal
    :type timeout: str
    :param timeout: maximum wait in seconds (turned into an integer using int())
    :rtype: a (body, HTTP code, headers) tuple, the body being serialized json
    """
    headers = {
        'Content-Type': 'application/json; charset=utf-8'
    }

    logger.debug('http in -> /control/%s' % task)
    if task not in ['check', 'on', 'off', 'ok', 'kill', 'signal']:

        #
        # - fail on a HTTP 400 if the request is not supported
        #
        return '{}', 400, headers

    try:

        ts = time.time()
        latch = ThreadingFuture()
        executor.tell({
            'request': task,
            'latch': latch,
            'data': request.data
        })
        js, code = latch.get(timeout=int(timeout))

        #
        # - time.time() is expressed in seconds
        # - scale the lapse so that it matches the 'ms' unit we log
        #
        ms = 1000.0 * (time.time() - ts)
        logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
        return json.dumps(js), code, headers

    except Timeout:

        #
        # - we failed to match the specified timeout
        # - gracefully fail on a HTTP 408
        #
        return '{}', 408, headers

    except ActorDeadError:

        #
        # - the executor has been shutdown (probably after a /control/kill)
        # - gracefully fail on a HTTP 410
        #
        return '{}', 410, headers
def _request(self, tokens):
    """
    Internal helper queuing one command per token (mostly used to switch the pod on/off).

    :type tokens: list
    :param tokens: the command tokens to schedule, in order
    """

    #
    # - each entry is a (token, empty dict, fresh latch) tuple
    #
    enqueue = self.commands.append
    for name in tokens:
        entry = (name, {}, ThreadingFuture())
        enqueue(entry)
def shutdown(actor_ref, timeout=None):
    """
    Shuts a state-machine down and waits for it to acknowledge it is down using a latch. A timeout
    or a dead actor are both silently ignored.

    :type actor_ref: :class:`pykka.ActorRef`
    :param actor_ref: a pykka actor reference (nothing happens if it is None or evaluates to false)
    :type timeout: float
    :param timeout: optional timeout in seconds
    """

    #
    # - nothing to shutdown, skip
    #
    if not actor_ref:
        return

    try:

        #
        # - send the shutdown request along with a latch
        # - block until the latch is set or the timeout elapses
        #
        latch = ThreadingFuture()
        actor_ref.tell({'request': 'shutdown', 'latch': latch})
        latch.get(timeout=timeout)

    except Timeout:
        pass

    except ActorDeadError:
        pass
def spin(self, data):
    """
    Attempt to grab the lock found at <prefix>/coordinator using a short timeout and start the
    controller actor once the lock is held.

    :param data: state bag, read for its zk client and updated with lock, latch and controller
    :rtype: a ('lock', data, 0) tuple once the lock is acquired, ('spin', data, 0) if acquiring timed out
    :raises Aborted: if either force_reset or terminate is set
    """

    #
    # - if the termination trigger is set, abort immediately
    #
    if self.force_reset or self.terminate:
        raise Aborted('resetting')

    #
    # - attempt to fetch the lock
    #
    lock = data.zk.Lock('%s/coordinator' % self.prefix)
    try:

        #
        # - the kazoo lock recipe seems to be sensitive if being switched to SUSPENDED .. in order to
        #   avoid stalling on the lock (which is the default behavior), attempt to lock multiple time
        #   with a short timeout (e.g spin-lock)
        # - any lock left over from a previous pass is released first (a closed connection is ignored)
        #
        if hasattr(data, 'lock') and data.lock:
            try:
                data.lock.release()
            except ConnectionClosedError:
                pass

        data.lock = None
        lock.acquire(timeout=SAMPLING)
        logger.debug('%s : lock acquired @ %s, now leading' % (self.path, self.prefix))
        data.lock = lock

        #
        # - we have the lock (e.g we are the leader)
        # - start the controller actor
        #
        data.latch = ThreadingFuture()
        data.controller = self.model.start(data.zk, self.hints, self.scope, self.tag, self.port, data.latch)
        return 'lock', data, 0

    except LockTimeout:
        pass

    #
    # - we could not obtain the lock
    # - blindly loop back and attempt to get it again
    #
    return 'spin', data, 0
def block(creator, strict=1, spin=0.5, collect=None):
    """
    Create a latch, hand it over to the specified lambda and then spin-lock on that latch.

    :type creator: lambda
    :param creator: lambda taking a :class:`pykka.ThreadingFuture` as parameter
    :type strict: bool
    :param strict: passed down to spin_lock() (if true the method will raise if ever the future outcome is
                   an exception)
    :type spin: float
    :param spin: wait timeout in seconds, passed down to spin_lock()
    :type collect: list
    :param collect: receives the lambda result if specified
    """
    future = ThreadingFuture()
    outcome = creator(future)

    #
    # - only record what the lambda returned if the caller asked for it
    #
    if collect is not None:
        collect.append(outcome)

    spin_lock(future, strict, spin)
def start_controller(self, data):
    """
    Start the controller actor now that we hold the lock.

    :param data: state bag, read for its zk client and updated with the latch and the controller
    :rtype: a ('lock', data, 0) tuple
    :raises Aborted: if either force_reset or terminate is set
    """

    #
    # - bail out right away if the termination trigger is set
    # - this matters since the lock may somehow be obtained after a suspend (acquire() returns
    #   true in that case which is misleading)
    #
    aborting = self.force_reset or self.terminate
    if aborting:
        raise Aborted('resetting')

    #
    # - we are leading: create the latch and start the controller actor with it
    #
    latch = ThreadingFuture()
    data.latch = latch
    logger.debug('%s : lock acquired @ %s, now leading' % (self.path, self.prefix))
    data.controller = self.model.start(
        data.zk,
        self.id,
        self.hints,
        self.scope,
        self.tag,
        self.port,
        latch)

    return 'lock', data, 0
def boot(self, lifecycle, model=Reactive, local=0):
    """
    Boot the pod (kubernetes/EC2 bindings): gather the hints, start the life-cycle and coordinator
    actors, expose the HTTP endpoints from a separate thread and block until the life-cycle actor
    goes down. The process exits with code 1 once the boot sequence is over.

    :type lifecycle: class
    :param lifecycle: the life-cycle class to start, must derive from ochopod.api.LifeCycle
    :type model: class
    :param model: the model class passed to the coordinator, must derive from ochopod.api.Model
    :type local: bool
    :param local: if true run in local mode (e.g default everything to localhost)
    """

    #
    # - quick check to make sure we get the right implementations
    #
    assert issubclass(model, Model), 'model must derive from ochopod.api.Model'
    assert issubclass(lifecycle, LifeCycle), 'lifecycle must derive from ochopod.api.LifeCycle'

    #
    # - start logging to /var/log/ochopod.log
    #
    logger.info('EC2 kubernetes bindings started')
    web = Flask(__name__)

    #
    # - default presets in case we run outside of marathon (local vm testing)
    # - any environment variable prefixed with "ochopod." is of interest for us (e.g this is what the user puts
    #   in the pod configuration yaml/json for instance)
    #
    env = \
        {
            'ochopod_application': '',
            'ochopod_cluster': '',
            'ochopod_debug': 'true',
            'ochopod_local': 'false',
            'ochopod_namespace': 'default',
            'ochopod_port': '8080',
            'ochopod_start': 'true',
            'ochopod_task': ''
        }

    env.update(os.environ)
    ochopod.enable_cli_log(debug=env['ochopod_debug'] == 'true')
    try:

        #
        # - grab our environment variables
        # - isolate the ones prefixed with ochopod_
        #
        logger.debug('environment ->\n%s' % '\n'.join(['\t%s -> %s' % (k, v) for k, v in env.items()]))
        hints = {k[8:]: v for k, v in env.items() if k.startswith('ochopod_')}
        if local or hints['local'] == 'true':

            #
            # - we are running in local mode (e.g on a dev workstation)
            # - default everything to localhost
            #
            logger.info('running in local mode (make sure you run a standalone zookeeper)')
            hints.update(
                {
                    'fwk': 'kubernetes',
                    'ip': '127.0.0.1',
                    'node': 'localhost',
                    'public': '127.0.0.1',
                    'zk': '127.0.0.1:2181'
                })
        else:

            #
            # - we are (assuming to be) deployed on EC2
            # - we'll retrieve the underlying metadata using curl
            # - the exit code is compared by value (an identity test against an int literal is not reliable)
            #
            def _aws(token):
                code, lines = shell('curl -f http://169.254.169.254/latest/meta-data/%s' % token)
                assert code == 0, 'unable to lookup EC2 metadata for %s (are you running on EC2 ?)' % token
                return lines[0]

            #
            # - lame workaround to fetch the master IP and credentials as there does not seem to be a way to
            #   use 10.0.0.2 from within the pod yet
            # - curl to the master to retrieve info about our cluster
            # - don't forget to merge the resulting output
            #
            def _k8s(token):
                code, lines = shell(
                    'curl -f -u %s:%s -k https://%s/api/v1beta3/namespaces/default/%s' %
                    (env['KUBERNETES_USER'], env['KUBERNETES_PWD'], env['KUBERNETES_MASTER'], token))
                assert code == 0, 'unable to look the RO service up (is the master running ?)'
                return json.loads(''.join(lines))

            #
            # - look our local k8s pod up
            # - get our container ip
            # - extract the port bindings
            # - keep any "ochopod_" environment variable & trim its prefix
            #
            @retry(timeout=60, pause=1)
            def _spin():

                #
                # - wait til the k8s pod is running and publishing its IP
                #
                cfg = _k8s('pods/%s' % env['HOSTNAME'])
                assert 'podIP' in cfg['status'], 'pod not ready yet -> %s' % cfg['status']['phase']
                return cfg

            this_pod = _spin()
            hints['ip'] = this_pod['status']['podIP']

            #
            # - revert to the k8s pod name if no cluster is specified
            #
            if not hints['cluster']:
                hints['cluster'] = this_pod['metadata']['name']

            #
            # - consider the 1st pod container
            # - grab the exposed ports (no remapping required)
            #
            ports = {}
            container = this_pod['spec']['containers'][0]
            for binding in container['ports']:
                port = binding['containerPort']
                ports[str(port)] = port

            #
            # - set 'task' to $HOSTNAME (the container is named after the k8s pod)
            # - get our public IPV4 address
            # - the "node" will show up as the EC2 instance ID
            #
            hints.update(
                {
                    'fwk': 'k8s-ec2',
                    'node': _aws('instance-id'),
                    'ports': ports,
                    'public': _aws('public-ipv4'),
                    'task': env['HOSTNAME']
                })

            #
            # - look the k8s "ocho-proxy" pod up
            # - it should be running our synchronization zookeeper
            # - use the response we just validated instead of querying the master a second time
            #
            proxy = _k8s('pods/ocho-proxy')
            assert 'podIP' in proxy['status'], 'proxy not ready ?'
            hints['zk'] = proxy['status']['podIP']

        #
        # - the cluster must be fully qualified with a namespace (which is defaulted anyway)
        #
        assert hints['namespace'], 'no namespace defined (user error ?)'

        #
        # - start the life-cycle actor which will pass our hints (as a json object) to its underlying sub-process
        # - start our coordinator which will connect to zookeeper and attempt to lead the cluster
        # - upon grabbing the lock the model actor will start and implement the configuration process
        # - the hints are a convenient bag for any data that may change at runtime and needs to be returned (via
        #   the HTTP POST /info request)
        # - what's being registered in zookeeper is immutable though and decorated with additional details by
        #   the coordinator (especially the pod index which is derived from zookeeper)
        #
        latch = ThreadingFuture()
        logger.info('starting %s.%s (kubernetes/ec2) @ %s' % (hints['namespace'], hints['cluster'], hints['node']))
        breadcrumbs = deepcopy(hints)
        env.update({'ochopod': json.dumps(hints)})
        executor = lifecycle.start(env, latch, hints)
        coordinator = Coordinator.start(
            hints['zk'].split(','),
            hints['namespace'],
            hints['cluster'],
            int(hints['port']),
            breadcrumbs,
            model,
            hints)

        #
        # - external hook forcing a coordinator reset
        # - this will force a re-connection to zookeeper and pod registration
        # - please note this will not impact the pod lifecycle (e.g the underlying sub-process will be
        #   left running)
        #
        @web.route('/reset', methods=['POST'])
        def _reset():
            coordinator.tell({'request': 'reset'})
            return '{}', 200

        #
        # - external hook exposing information about our pod
        # - this is a subset of what's registered in zookeeper at boot-time
        # - the data is dynamic and updated from time to time by the model and executor actors
        # - items() is used (as opposed to iteritems()) as it exists on both python 2 and 3
        #
        @web.route('/info', methods=['POST'])
        def _info():
            keys = \
                [
                    'application',
                    'ip',
                    'node',
                    'port',
                    'ports',
                    'process',
                    'public',
                    'state',
                    'status',
                    'task'
                ]

            subset = {k: v for k, v in hints.items() if k in keys}
            return json.dumps(subset), 200

        #
        # - external hook exposing our circular log
        # - dump ochopod.log as a json array (one entry per line, in the order they appear in the file)
        #
        @web.route('/log', methods=['POST'])
        def _log():
            with open(ochopod.LOG, 'r+') as log:
                lines = [line for line in log]
                return json.dumps(lines), 200

        #
        # - web-hook used to receive requests from the leader or the CLI tools
        # - those requests are passed down to the executor actor
        # - any non HTTP 200 response is a failure
        # - failure to acknowledge within the specified timeout will result in a HTTP 408 (REQUEST TIMEOUT)
        # - attempting to send a control request to a dead pod will result in a HTTP 410 (GONE)
        #
        @web.route('/control/<task>', methods=['POST'])
        @web.route('/control/<task>/<timeout>', methods=['POST'])
        def _control(task, timeout='60'):
            try:

                ts = time.time()
                logger.debug('http in -> /control/%s' % task)
                latch = ThreadingFuture()
                executor.tell({'request': task, 'latch': latch, 'data': request.data})
                js, code = latch.get(timeout=int(timeout))

                #
                # - time.time() is expressed in seconds
                # - scale the lapse so that it matches the 'ms' unit we log
                #
                ms = 1000.0 * (time.time() - ts)
                logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                return json.dumps(js), code

            except Timeout:

                #
                # - we failed to match the specified timeout
                # - gracefully fail on a HTTP 408
                #
                return '{}', 408

            except ActorDeadError:

                #
                # - the executor has been shutdown (probably after a /control/kill)
                # - gracefully fail on a HTTP 410
                #
                return '{}', 410

        #
        # - internal hook required to shutdown the web-server
        # - it's not possible to do it outside of a request handler
        # - make sure this calls only comes from localhost (todo)
        #
        @web.route('/terminate', methods=['POST'])
        def _terminate():
            request.environ.get('werkzeug.server.shutdown')()
            return '{}', 200

        class _Runner(threading.Thread):
            """
            Run werkzeug from a separate thread to avoid blocking the main one. We'll have to shut it down
            using a dedicated HTTP POST.
            """

            def run(self):
                web.run(host='0.0.0.0', port=int(hints['port']), threaded=True)

        try:

            #
            # - block on the lifecycle actor until it goes down (usually after a /control/kill request)
            #
            _Runner().start()
            spin_lock(latch)
            logger.debug('pod is dead, idling')

            #
            # - simply idle forever (since the framework would restart any container that terminates)
            # - /log and /info HTTP requests will succeed (and show the pod as being killed)
            # - any control request will now fail
            #
            while 1:
                time.sleep(60.0)

        finally:

            #
            # - when we exit the block first shutdown our executor (which may probably be already down)
            # - then shutdown the coordinator to un-register from zookeeper
            # - finally ask werkzeug to shutdown via a REST call
            #
            shutdown(executor)
            shutdown(coordinator)
            post('http://127.0.0.1:%s/terminate' % env['ochopod_port'])

    except KeyboardInterrupt:

        logger.fatal('CTRL-C pressed')

    except Exception as failure:

        logger.fatal('unexpected condition -> %s' % diagnostic(failure))

    exit(1)
def boot(self, lifecycle, model=Reactive, local=0):
    """
    Boot the pod (marathon/EC2 bindings): gather the hints, start the life-cycle and coordinator
    actors, expose the HTTP endpoints from a separate thread and block until the life-cycle actor
    goes down.

    :type lifecycle: class
    :param lifecycle: the life-cycle class to start, must derive from ochopod.api.LifeCycle
    :type model: class
    :param model: the model class passed to the coordinator, must derive from ochopod.api.Model
    :type local: bool
    :param local: if true run in local mode (e.g default everything to localhost)
    """

    #
    # - quick check to make sure we get the right implementations
    #
    assert issubclass(model, Model), 'model must derive from ochopod.api.Model'
    assert issubclass(lifecycle, LifeCycle), 'lifecycle must derive from ochopod.api.LifeCycle'

    #
    # - start logging to /var/log/ochopod.log
    #
    logger.info('EC2 marathon bindings started')
    web = Flask(__name__)

    #
    # - default presets in case we run outside of marathon (local vm testing)
    # - any environment variable prefixed with "ochopod." is of interest for us (e.g this is what the user puts
    #   in the marathon application configuration for instance)
    # - the other settings come from marathon (namely the port bindings & application/task identifiers)
    # - the MESOS_TASK_ID is important to keep around to enable task deletion via the marathon REST API
    #
    env = \
        {
            'ochopod_application': '',
            'ochopod_cluster': 'default',
            'ochopod_debug': 'true',
            'ochopod_local': 'false',
            'ochopod_namespace': 'marathon',
            'ochopod_port': '8080',
            'ochopod_start': 'true',
            'ochopod_task': '',
            'PORT_8080': '8080'
        }

    env.update(os.environ)
    ochopod.enable_cli_log(debug=env['ochopod_debug'] == 'true')
    try:

        #
        # - grab our environment variables (which are set by the marathon executor)
        # - extract the mesos PORT_* bindings and construct a small remapping dict
        #
        ports = {}
        logger.debug('environment ->\n%s' % '\n'.join(['\t%s -> %s' % (k, v) for k, v in env.items()]))
        for key, val in env.items():
            if key.startswith('PORT_'):
                ports[key[5:]] = int(val)

        #
        # - keep any "ochopod_" environment variable & trim its prefix
        # - default all our settings, especially the mandatory ones
        # - the ip and zookeeper are defaulted to localhost to enable easy testing
        #
        hints = {k[8:]: v for k, v in env.items() if k.startswith('ochopod_')}
        if local or hints['local'] == 'true':

            #
            # - we are running in local mode (e.g on a dev workstation)
            # - default everything to localhost
            #
            logger.info('running in local mode (make sure you run a standalone zookeeper)')
            hints.update(
                {
                    'fwk': 'marathon-ec2',
                    'ip': '127.0.0.1',
                    'node': 'local',
                    'ports': ports,
                    'public': '127.0.0.1',
                    'zk': '127.0.0.1:2181'
                })
        else:

            #
            # - we are (assuming to be) deployed on EC2
            # - get our underlying metadata using curl
            # - the exit code is compared by value (an identity test against an int literal is not reliable)
            #
            def _peek(token, strict=True):
                code, lines = shell('curl -f http://169.254.169.254/latest/meta-data/%s' % token)
                assert not strict or code == 0, 'unable to lookup EC2 metadata for %s (are you running on EC2 ?)' % token

                #
                # - a lookup we tolerate to fail may leave us without any output to read
                # - default to an empty string in that case rather than failing on the index
                #
                if not strict and not lines:
                    return ''

                return lines[0]

            #
            # - get our local and public IPV4 addresses
            # - the "node" will show up as the EC2 instance ID
            # - note we allow the public IPv4 lookup to fail (in case we run in VPC)
            #
            hints.update(
                {
                    'application': env['MARATHON_APP_ID'][1:],
                    'fwk': 'marathon-ec2',
                    'ip': _peek('local-ipv4'),
                    'node': _peek('instance-id'),
                    'ports': ports,
                    'public': _peek('public-ipv4', strict=False),
                    'task': env['MESOS_TASK_ID'],
                    'zk': ''
                })

            def _install_from_package():

                #
                # - a regular package install will write the slave settings under /etc/mesos/zk
                # - the snippet in there looks like zk://10.0.0.56:2181/mesos
                #
                code, lines = shell("cat /etc/mesos/zk")
                assert code == 0 and lines[0], 'unable to retrieve the zk connection string'
                return lines[0][5:].split('/')[0]

            def _dcos_deployment():

                #
                # - a DCOS slave is setup slightly differently with the settings being environment
                #   variables set in /opt/mesosphere/etc/mesos-slave
                # - the snippet in there is prefixed by MESOS_MASTER= and uses an alias
                # - it looks like MESOS_MASTER=zk://leader.mesos:2181/mesos
                #
                code, lines = shell("grep MASTER /opt/mesosphere/etc/mesos-slave")
                assert code == 0 and lines[0], 'unable to retrieve the zk connection string'
                return lines[0][18:].split('/')[0]

            #
            # - depending on how the slave has been installed we might have to look in various places
            #   to find out what our zookeeper connection string is
            # - warning, a URL like format such as zk://<ip:port>,..,<ip:port>/mesos is used
            # - just keep the ip & port part and discard the rest
            # - a lookup that fails is skipped, only trap Exception so that a CTRL-C is not swallowed
            #
            for method in [_install_from_package, _dcos_deployment]:
                try:
                    hints['zk'] = method()
                    break

                except Exception:
                    pass

            assert hints['zk'], 'unable to determine where zookeeper is located (unsupported/bogus setup ?)'

        #
        # - the cluster must be fully qualified with a namespace (which is defaulted anyway)
        #
        assert hints['cluster'] and hints['namespace'], 'no cluster and/or namespace defined (user error ?)'

        #
        # - start the life-cycle actor which will pass our hints (as a json object) to its underlying sub-process
        # - start our coordinator which will connect to zookeeper and attempt to lead the cluster
        # - upon grabbing the lock the model actor will start and implement the configuration process
        # - the hints are a convenient bag for any data that may change at runtime and needs to be returned (via
        #   the HTTP POST /info request)
        # - what's being registered in zookeeper is immutable though and decorated with additional details by
        #   the coordinator (especially the pod index which is derived from zookeeper)
        #
        latch = ThreadingFuture()
        logger.info('starting %s.%s (marathon/ec2) @ %s' % (hints['namespace'], hints['cluster'], hints['node']))
        breadcrumbs = deepcopy(hints)
        hints['metrics'] = {}
        env.update({'ochopod': json.dumps(hints)})
        executor = lifecycle.start(env, latch, hints)
        coordinator = Coordinator.start(
            hints['zk'].split(','),
            hints['namespace'],
            hints['cluster'],
            int(hints['port']),
            breadcrumbs,
            model,
            hints)

        #
        # - external hook forcing a coordinator reset
        # - this will force a re-connection to zookeeper and pod registration
        # - please note this will not impact the pod lifecycle (e.g the underlying sub-process will be
        #   left running)
        #
        @web.route('/reset', methods=['POST'])
        def _reset():
            coordinator.tell({'request': 'reset'})
            return '{}', 200

        #
        # - external hook exposing information about our pod
        # - this is a subset of what's registered in zookeeper at boot-time
        # - the data is dynamic and updated from time to time by the model and executor actors
        # - items() is used (as opposed to iteritems()) as it exists on both python 2 and 3
        #
        @web.route('/info', methods=['POST'])
        def _info():
            keys = \
                [
                    'application',
                    'ip',
                    'metrics',
                    'node',
                    'port',
                    'ports',
                    'process',
                    'public',
                    'state',
                    'status',
                    'task'
                ]

            subset = {k: v for k, v in hints.items() if k in keys}
            return json.dumps(subset), 200

        #
        # - external hook exposing our circular log
        # - dump ochopod.log as a json array (one entry per line, in the order they appear in the file)
        #
        @web.route('/log', methods=['POST'])
        def _log():
            with open(ochopod.LOG, 'r+') as log:
                lines = [line for line in log]
                return json.dumps(lines), 200

        #
        # - web-hook used to receive requests from the leader or the CLI tools
        # - those requests are passed down to the executor actor
        # - any non HTTP 200 response is a failure
        # - failure to acknowledge within the specified timeout will result in a HTTP 408 (REQUEST TIMEOUT)
        # - attempting to send a control request to a dead pod will result in a HTTP 410 (GONE)
        #
        @web.route('/control/<task>', methods=['POST'])
        @web.route('/control/<task>/<timeout>', methods=['POST'])
        def _control(task, timeout='60'):
            try:

                ts = time.time()
                logger.debug('http in -> /control/%s' % task)
                latch = ThreadingFuture()
                executor.tell({'request': task, 'latch': latch, 'data': request.data})
                js, code = latch.get(timeout=int(timeout))

                #
                # - time.time() is expressed in seconds
                # - scale the lapse so that it matches the 'ms' unit we log
                #
                ms = 1000.0 * (time.time() - ts)
                logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                return json.dumps(js), code

            except Timeout:

                #
                # - we failed to match the specified timeout
                # - gracefully fail on a HTTP 408
                #
                return '{}', 408

            except ActorDeadError:

                #
                # - the executor has been shutdown (probably after a /control/kill)
                # - gracefully fail on a HTTP 410
                #
                return '{}', 410

        #
        # - internal hook required to shutdown the web-server
        # - it's not possible to do it outside of a request handler
        # - make sure this calls only comes from localhost (todo)
        #
        @web.route('/terminate', methods=['POST'])
        def _terminate():
            request.environ.get('werkzeug.server.shutdown')()
            return '{}', 200

        class _Runner(threading.Thread):
            """
            Run werkzeug from a separate thread to avoid blocking the main one. We'll have to shut it down
            using a dedicated HTTP POST.
            """

            def run(self):
                web.run(host='0.0.0.0', port=int(hints['port']), threaded=True)

        try:

            #
            # - block on the lifecycle actor until it goes down (usually after a /control/kill request)
            #
            _Runner().start()
            spin_lock(latch)
            logger.debug('pod is dead, idling')

            #
            # - simply idle forever (since the framework would restart any container that terminates)
            # - /log and /info HTTP requests will succeed (and show the pod as being killed)
            # - any control request will now fail
            #
            while 1:
                time.sleep(60.0)

        finally:

            #
            # - when we exit the block first shutdown our executor (which may probably be already down)
            # - then shutdown the coordinator to un-register from zookeeper
            # - finally ask werkzeug to shutdown via a REST call
            #
            shutdown(executor)
            shutdown(coordinator)
            post('http://127.0.0.1:%s/terminate' % env['ochopod_port'])

    except KeyboardInterrupt:

        logger.fatal('CTRL-C pressed')

    except Exception as failure:

        logger.fatal('unexpected condition -> %s' % diagnostic(failure))
def test_future_nested_future(future):
    """A future set with another future hands that inner future back untouched."""
    # innermost future, resolved with a plain value
    wrapped = ThreadingFuture()
    wrapped.set("foo")

    # outermost future, resolved with the inner future itself
    wrapper = ThreadingFuture()
    wrapper.set(wrapped)

    # unwrap one level at a time
    unwrapped = wrapper.get()
    assert unwrapped.get() == "foo"
def futures():
    """Return a list holding three fresh ThreadingFuture instances."""
    created = []
    for _ in range(3):
        created.append(ThreadingFuture())
    return created
def future():
    """Return a fresh ThreadingFuture instance."""
    fresh = ThreadingFuture()
    return fresh
def _start():
    """
    Create a latch, pass it to creator() and return it. Whatever creator() returned is appended
    to collect when collect is specified (both names come from the enclosing scope).
    """
    future = ThreadingFuture()
    outcome = creator(future)
    if collect is not None:
        collect.append(outcome)

    return future
def boot(self, lifecycle, model=Reactive, tools=None, local=False):
    """
    Boot the pod: assemble its hints from the environment, start the lifecycle & coordinator actors, expose
    the HTTP endpoints and then block.

    Once the lifecycle latch is released the call idles forever; it only unwinds on an exception (CTRL-C
    included), in which case the actors and the web-server are shut down and the condition is logged.

    :param lifecycle: class deriving from ochopod.api.LifeCycle, started as the executor actor
    :param model: class deriving from ochopod.api.Model (its depends_on is published in the hints)
    :param tools: optional iterable of classes, those deriving from Tool and defining a tag are served by /exec
    :param local: if True run in local mode (everything defaulted to 127.0.0.1, zookeeper on 127.0.0.1:2181)
    :raises AssertionError: if model or lifecycle do not derive from the expected base classes
    """

    #
    # - quick check to make sure we get the right implementations
    #
    assert issubclass(model, Model), 'model must derive from ochopod.api.Model'
    assert issubclass(lifecycle, LifeCycle), 'lifecycle must derive from ochopod.api.LifeCycle'

    #
    # - instantiate our flask endpoint
    # - default to a json handler for all HTTP errors (including an unexpected 500)
    #
    def _handler(error):
        http = error.code if isinstance(error, HTTPException) else 500
        return '{}', http, {'Content-Type': 'application/json; charset=utf-8'}

    web = Flask(__name__)
    for code in default_exceptions.iterkeys():
        web.error_handler_spec[None][code] = _handler

    #
    # - default presets in case we run outside of marathon (local vm testing)
    # - any environment variable prefixed with "ochopod." is of interest for us (e.g this is what the user puts
    #   in the marathon application configuration for instance)
    # - the other settings come from marathon (namely the port bindings & application/task identifiers)
    # - the MESOS_TASK_ID is important to keep around to enable task deletion via the marathon REST API
    #
    env = \
        {
            'ochopod_application': '',
            'ochopod_cluster': 'default',
            'ochopod_debug': 'true',
            'ochopod_local': 'false',
            'ochopod_namespace': 'marathon',
            'ochopod_port': '8080',
            'ochopod_start': 'true',
            'ochopod_task': '',
            'ochopod_zk': '',
            'PORT_8080': '8080'
        }

    env.update(os.environ)
    ochopod.enable_cli_log(debug=env['ochopod_debug'] == 'true')
    try:

        #
        # - grab our environment variables (which are set by the marathon executor)
        # - extract the mesos PORT_* bindings and construct a small remapping dict
        #
        ports = {}
        logger.debug('environment ->\n%s' % '\n'.join(['\t%s -> %s' % (k, v) for k, v in env.items()]))
        for key, val in env.items():
            if key.startswith('PORT_'):
                ports[key[5:]] = int(val)

        #
        # - keep any "ochopod_" environment variable & trim its prefix
        # - default all our settings, especially the mandatory ones
        # - the ip and zookeeper are defaulted to localhost to enable easy testing
        #
        hints = {k[8:]: v for k, v in env.items() if k.startswith('ochopod_')}
        if local or hints['local'] == 'true':

            #
            # - we are running in local mode (e.g on a dev workstation)
            # - default everything to localhost
            #
            logger.info('running in local mode (make sure you run a standalone zookeeper)')
            hints.update(
                {
                    'fwk': 'marathon (debug)',
                    'ip': '127.0.0.1',
                    'node': 'local',
                    'ports': ports,
                    'public': '127.0.0.1',
                    'zk': '127.0.0.1:2181'
                })

        else:

            #
            # - extend our hints
            # - add the application + task
            #
            hints.update(
                {
                    'application': env['MARATHON_APP_ID'][1:],
                    'fwk': 'marathon',
                    'ip': '',
                    'node': '',
                    'ports': ports,
                    'public': '',
                    'task': env['MESOS_TASK_ID'],
                    'zk': ''
                })

            #
            # - use whatever subclass is implementing us to infer 'ip', 'node' and 'public'
            #
            hints.update(self.get_node_details())

            #
            # - lookup for the zookeeper connection string from environment variable or on disk
            # - we have to look into different places depending on how mesos was installed
            #
            def _1():

                #
                # - most recent DCOS release
                # - $MESOS_MASTER is located in /opt/mesosphere/etc/mesos-slave-common
                # - the snippet in there is prefixed by MESOS_MASTER=zk://<ip:port>/mesos
                #
                logger.debug('checking /opt/mesosphere/etc/mesos-slave-common...')
                _, lines = shell("grep MESOS_MASTER /opt/mesosphere/etc/mesos-slave-common")
                return lines[0][13:]

            def _2():

                #
                # - same as above except for slightly older DCOS releases
                # - $MESOS_MASTER is located in /opt/mesosphere/etc/mesos-slave
                #
                logger.debug('checking /opt/mesosphere/etc/mesos-slave...')
                _, lines = shell("grep MESOS_MASTER /opt/mesosphere/etc/mesos-slave")
                return lines[0][13:]

            def _3():

                #
                # - a regular package install will write the slave settings under /etc/mesos/zk (the snippet in
                #   there looks like zk://10.0.0.56:2181/mesos)
                #
                logger.debug('checking /etc/mesos/zk...')
                _, lines = shell("cat /etc/mesos/zk")
                return lines[0]

            def _4():

                #
                # - look for ZK from environment variables
                # - user can pass down ZK using $ochopod_zk
                # - this last-resort situation is used mostly for debugging
                #
                logger.debug('checking $ochopod_zk environment variable...')
                return env['ochopod_zk']

            #
            # - depending on how the slave has been installed we might have to look in various places
            #   to find out what our zookeeper connection string is
            # - use urlparse to keep the host:port part of the URL (possibly including a login+password)
            # - a failing lookup is deliberately ignored: we simply move on to the next one and let the
            #   assert below catch the case where none of them worked
            #
            for method in [_1, _2, _3, _4]:
                try:
                    hints['zk'] = urlparse(method()).netloc
                    break

                except Exception:
                    pass

        #
        # - the cluster must be fully qualified with a namespace (which is defaulted anyway)
        #
        assert hints['zk'], 'unable to determine where zookeeper is located (unsupported/bogus mesos setup ?)'
        assert hints['cluster'] and hints['namespace'], 'no cluster and/or namespace defined (user error ?)'

        #
        # - load the tools
        # - only keep the classes deriving from Tool whose instance defines a tag
        #
        if tools:
            tools = {tool.tag: tool for tool in [clz() for clz in tools if issubclass(clz, Tool)] if tool.tag}
            logger.info('supporting tools %s' % ', '.join(tools.keys()))

        #
        # - start the life-cycle actor which will pass our hints (as a json object) to its underlying sub-process
        # - start our coordinator which will connect to zookeeper and attempt to lead the cluster
        # - upon grabbing the lock the model actor will start and implement the configuration process
        # - the hints are a convenient bag for any data that may change at runtime and needs to be returned (via
        #   the HTTP POST /info request)
        # - what's being registered in zookeeper is immutable though and decorated with additional details by
        #   the coordinator (especially the pod index which is derived from zookeeper)
        #
        latch = ThreadingFuture()
        logger.info('starting %s.%s (marathon) @ %s' % (hints['namespace'], hints['cluster'], hints['node']))
        breadcrumbs = deepcopy(hints)
        hints['metrics'] = {}
        hints['dependencies'] = model.depends_on
        env.update({'ochopod': json.dumps(hints)})
        executor = lifecycle.start(env, latch, hints)
        coordinator = Coordinator.start(
            hints['zk'].split(','),
            hints['namespace'],
            hints['cluster'],
            int(hints['port']),
            breadcrumbs,
            model,
            hints)

        #
        # - external hook forcing a coordinator reset
        # - this will force a re-connection to zookeeper and pod registration
        # - please note this will not impact the pod lifecycle (e.g the underlying sub-process will be
        #   left running)
        #
        @web.route('/reset', methods=['POST'])
        def _reset():

            logger.debug('http in -> /reset')
            coordinator.tell({'request': 'reset'})
            return '{}', 200, {'Content-Type': 'application/json; charset=utf-8'}

        #
        # - external hook exposing information about our pod
        # - this is a subset of what's registered in zookeeper at boot-time
        # - the data is dynamic and updated from time to time by the model and executor actors
        # - from @pferro -> the pod's dependencies defined in the model are now added as well
        #
        @web.route('/info', methods=['POST'])
        def _info():

            logger.debug('http in -> /info')
            keys = \
                [
                    'application',
                    'dependencies',
                    'ip',
                    'metrics',
                    'node',
                    'port',
                    'ports',
                    'process',
                    'public',
                    'state',
                    'status',
                    'task'
                ]

            subset = dict(filter(lambda i: i[0] in keys, hints.iteritems()))
            return json.dumps(subset), 200, {'Content-Type': 'application/json; charset=utf-8'}

        #
        # - external hook exposing our circular log
        # - dump ochopod.log as a json array (one entry per line, in file order)
        # - the file is only read, there is no need to ask for write access
        #
        @web.route('/log', methods=['POST'])
        def _log():

            logger.debug('http in -> /log')
            with open(ochopod.LOG, 'r') as log:
                lines = [line for line in log]
                return json.dumps(lines), 200, {'Content-Type': 'application/json; charset=utf-8'}

        #
        # - RPC call to run a custom tool within the pod
        #
        @web.route('/exec', methods=['POST'])
        def _exec():

            logger.debug('http in -> /exec')

            #
            # - make sure the command (first token in the X-Shell header) maps to a tool
            # - if no match abort on a 404
            #
            line = request.headers['X-Shell']
            tokens = line.split(' ')
            cmd = tokens[0]
            if not tools or cmd not in tools:
                return '{}', 404, {'Content-Type': 'application/json; charset=utf-8'}

            #
            # - the exit code defaults to 1 and is only overwritten if the tool runs to completion
            #
            code = 1
            tool = tools[cmd]

            #
            # - make sure the parser does not sys.exit()
            #
            class _Parser(ArgumentParser):
                def exit(self, status=0, message=None):
                    raise ValueError(message)

            #
            # - prep a temporary directory
            # - invoke define_cmdline_parsing()
            # - switch off parsing if NotImplementedError is raised
            #
            use_parser = 1
            parser = _Parser(prog=tool.tag)
            try:
                tool.define_cmdline_parsing(parser)

            except NotImplementedError:
                use_parser = 0

            tmp = tempfile.mkdtemp()
            try:

                #
                # - parse the command line
                # - upload any attachment
                # - the tag is supplied by the client: only keep its basename to make sure the attachment
                #   cannot be written outside of our temporary directory
                #
                args = parser.parse_args(tokens[1:]) if use_parser else ' '.join(tokens[1:])
                for tag, upload in request.files.items():
                    where = path.join(tmp, path.basename(tag))
                    logger.debug('uploading %s @ %s' % (tag, tmp))
                    upload.save(where)

                #
                # - run the tool method
                # - pass the temporary directory as well
                #
                logger.info('invoking "%s"' % line)
                code, lines = tool.body(args, tmp)

            except ValueError as failure:

                #
                # - raised by our parser override above (or by the tool itself)
                # - default to the parser help if there is no message
                #
                lines = [parser.format_help() if failure.message is None else failure.message]

            except Exception as failure:
                lines = ['unexpected failure -> %s' % failure]

            finally:

                #
                # - make sure to cleanup our temporary directory
                #
                shutil.rmtree(tmp)

            out = \
                {
                    'code': code,
                    'stdout': lines
                }

            return json.dumps(out), 200, {'Content-Type': 'application/json; charset=utf-8'}

        #
        # - web-hook used to receive requests from the leader or the CLI tools
        # - those requests are passed down to the executor actor
        # - any non HTTP 200 response is a failure
        # - an unsupported request or a malformed timeout will result in a HTTP 400 (BAD REQUEST)
        # - failure to acknowledge within the specified timeout will result in a HTTP 408 (REQUEST TIMEOUT)
        # - attempting to send a control request to a dead pod will result in a HTTP 410 (GONE)
        #
        @web.route('/control/<task>', methods=['POST'])
        @web.route('/control/<task>/<timeout>', methods=['POST'])
        def _control(task, timeout='60'):

            logger.debug('http in -> /control/%s' % task)
            if task not in ['check', 'on', 'off', 'ok', 'kill', 'signal']:

                #
                # - fail on a HTTP 400 if the request is not supported
                #
                return '{}', 400, {'Content-Type': 'application/json; charset=utf-8'}

            try:

                #
                # - validate the timeout *before* posting anything to the executor
                # - otherwise a malformed value would dispatch the request and then fail the HTTP call
                #
                seconds = int(timeout)

            except ValueError:
                return '{}', 400, {'Content-Type': 'application/json; charset=utf-8'}

            try:

                ts = time.time()
                latch = ThreadingFuture()
                executor.tell({'request': task, 'latch': latch, 'data': request.data})
                js, code = latch.get(timeout=seconds)

                #
                # - time.time() is expressed in seconds, scale it to report actual milliseconds
                #
                ms = 1000.0 * (time.time() - ts)
                logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                return json.dumps(js), code, {'Content-Type': 'application/json; charset=utf-8'}

            except Timeout:

                #
                # - we failed to match the specified timeout
                # - gracefully fail on a HTTP 408
                #
                return '{}', 408, {'Content-Type': 'application/json; charset=utf-8'}

            except ActorDeadError:

                #
                # - the executor has been shutdown (probably after a /control/kill)
                # - gracefully fail on a HTTP 410
                #
                return '{}', 410, {'Content-Type': 'application/json; charset=utf-8'}

        #
        # - internal hook required to shutdown the web-server
        # - it's not possible to do it outside of a request handler
        # - make sure this calls only comes from localhost (todo)
        #
        @web.route('/terminate', methods=['POST'])
        def _terminate():

            request.environ.get('werkzeug.server.shutdown')()
            return '{}', 200, {'Content-Type': 'application/json; charset=utf-8'}

        #
        # - run werkzeug from a separate thread to avoid blocking the main one
        # - we'll have to shut it down using a dedicated HTTP POST
        #
        class _Runner(threading.Thread):
            def run(self):
                web.run(host='0.0.0.0', port=int(hints['port']), threaded=True)

        try:

            #
            # - block on the lifecycle actor until it goes down (usually after a /control/kill request)
            #
            _Runner().start()
            spin_lock(latch)
            logger.debug('pod is dead, idling')
            while 1:

                #
                # - simply idle forever (since the framework would restart any container that terminates)
                # - /log and /info HTTP requests will succeed (and show the pod as being killed)
                # - any control request will now fail
                #
                time.sleep(60.0)

        finally:

            #
            # - when we exit the block first shutdown our executor (which may probably be already down)
            # - then shutdown the coordinator to un-register from zookeeper
            # - finally ask werkzeug to shutdown via a REST call
            #
            shutdown(executor)
            shutdown(coordinator)
            post('http://127.0.0.1:%s/terminate' % env['ochopod_port'])

    except KeyboardInterrupt:
        logger.fatal('CTRL-C pressed')

    except Exception as failure:
        logger.fatal('unexpected condition -> %s' % diagnostic(failure))