Example #1
            def _control(task, timeout='60'):

                logger.debug('http in -> /control/%s' % task)
                if task not in ['check', 'on', 'off', 'ok', 'kill', 'signal']:

                    #
                    # - fail on an HTTP 400 if the request is not supported
                    #
                    return '{}', 400, {'Content-Type': 'application/json; charset=utf-8'}

                try:

                    ts = time.time()
                    latch = ThreadingFuture()
                    executor.tell({'request': task, 'latch': latch, 'data': request.data})
                    js, code = latch.get(timeout=int(timeout))
                    ms = 1000 * (time.time() - ts)
                    logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                    return json.dumps(js), code, {'Content-Type': 'application/json; charset=utf-8'}

                except Timeout:

                    #
                    # - the executor did not reply within the specified timeout
                    # - gracefully fail on an HTTP 408
                    #
                    return '{}', 408, {'Content-Type': 'application/json; charset=utf-8'}

                except ActorDeadError:

                    #
                    # - the executor has been shut down (probably after a /control/kill)
                    # - gracefully fail on an HTTP 410
                    #
                    return '{}', 410, {'Content-Type': 'application/json; charset=utf-8'}
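The executor actor that answers these requests is not part of the excerpt. A minimal sketch of the reply side of this latch protocol, with a hypothetical Executor stand-in, could look like this:

from pykka import ThreadingActor, ThreadingFuture

class Executor(ThreadingActor):
    """Hypothetical stand-in for the executor actor used above."""
    def on_receive(self, message):
        # resolve the caller's future with the (payload, HTTP code) pair
        # that the handler unpacks via js, code = latch.get(...)
        message['latch'].set(({'request': message['request']}, 200))

executor = Executor.start()
latch = ThreadingFuture()
executor.tell({'request': 'check', 'latch': latch, 'data': ''})
print(latch.get(timeout=5))   # -> ({'request': 'check'}, 200)
executor.stop()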
Example #2
            def _control(task, timeout='60'):
                try:

                    ts = time.time()
                    logger.debug('http in -> /control/%s' % task)
                    latch = ThreadingFuture()
                    executor.tell({'request': task, 'latch': latch, 'data': request.data})
                    js, code = latch.get(timeout=int(timeout))
                    ms = 1000 * (time.time() - ts)
                    logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                    return json.dumps(js), code

                except Timeout:

                    #
                    # - the executor did not reply within the specified timeout
                    # - gracefully fail on an HTTP 408
                    #
                    return '{}', 408

                except ActorDeadError:

                    #
                    # - the executor has been shut down (probably after a /control/kill)
                    # - gracefully fail on an HTTP 410
                    #
                    return '{}', 410
Example #3
def generate_sync_link(display_name, storage_id, capacity=1000):
    """Return a dummy sync link."""
    sync_link = mock.Mock(SynchronizationLink)
    sync_link.local = mock.Mock(BasicStorage)
    sync_link.remote = mock.Mock(BasicStorage)
    sync_link.remote.storage_id = storage_id
    metrics = ThreadingFuture()
    metrics.set(StorageMetrics(storage_id, capacity,
                               display_name=display_name))
    sync_link.metrics = metrics
    sync_engine = mock.MagicMock(SyncEngine)
    sync_engine.query().get.return_value = {}
    sync_link.sync_engine = sync_engine
    return sync_link
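Note how the metrics future is resolved up front with set(): any test code that later calls sync_link.metrics.get() returns immediately instead of blocking. In isolation:

from pykka import ThreadingFuture

metrics = ThreadingFuture()
metrics.set('fake-metrics')                      # resolve before anyone waits
assert metrics.get(timeout=1) == 'fake-metrics'  # returns at once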
Example #4
    def invoke(self, raw):

        """
        RPC API: shell invocation on behalf of the master. The code is run by
        the script actor and its stdout returned back to the caller.

        :type raw: str
        :param raw: serialized json payload
        :rtype: the shell script stdout upon success, None upon failure
        """

        try:
            js = json.loads(raw)

            logger.debug('RPC invoke() <- "%s"' % js['cmd'])
            msg = MSG({'request': 'invoke'})
            msg.cmd = js['cmd']
            msg.env = {'INPUT': json.dumps(js)}
            msg.latch = ThreadingFuture()     

            #
            # - block on a latch and reply with whatever the shell script
            #   wrote to its standard output
            #
            actors['script'].tell(msg)
            return msg.latch.get(timeout=60)
            
        except Exception as failure:
            logger.debug('invoke() failed -> %s' % failure)
            return None
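The 'script' actor receiving this message is not shown. A rough, runnable sketch of its side of the exchange follows; the MSG helper (ochopod's dict with attribute access) is re-created here as an assumption:

import json
import subprocess

from pykka import ThreadingActor, ThreadingFuture

class MSG(dict):
    """Stand-in for ochopod's attribute-style message dict (assumption)."""
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__

class Script(ThreadingActor):
    def on_receive(self, msg):
        # run the shell snippet and hand its stdout back through the latch
        msg.latch.set(subprocess.check_output(msg.cmd, shell=True, env=msg.env))

script = Script.start()
msg = MSG({'request': 'invoke'})
msg.cmd = 'echo hello'
msg.env = {'INPUT': json.dumps({'cmd': 'echo hello'})}
msg.latch = ThreadingFuture()
script.tell(msg)
print(msg.latch.get(timeout=60))   # prints the script's captured stdout
script.stop()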
Example #5
            def _control(task, timeout='60'):

                logger.debug('http in -> /control/%s' % task)
                if task not in ['check', 'on', 'off', 'ok', 'kill', 'signal']:

                    #
                    # - fail on an HTTP 400 if the request is not supported
                    #
                    return '{}', 400, {
                        'Content-Type': 'application/json; charset=utf-8'
                    }

                try:

                    ts = time.time()
                    latch = ThreadingFuture()
                    executor.tell({
                        'request': task,
                        'latch': latch,
                        'data': request.data
                    })
                    js, code = latch.get(timeout=int(timeout))
                    ms = 1000 * (time.time() - ts)
                    logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                    return json.dumps(js), code, {
                        'Content-Type': 'application/json; charset=utf-8'
                    }

                except Timeout:

                    #
                    # - the executor did not reply within the specified timeout
                    # - gracefully fail on an HTTP 408
                    #
                    return '{}', 408, {
                        'Content-Type': 'application/json; charset=utf-8'
                    }

                except ActorDeadError:

                    #
                    # - the executor has been shut down (probably after a /control/kill)
                    # - gracefully fail on an HTTP 410
                    #
                    return '{}', 410, {
                        'Content-Type': 'application/json; charset=utf-8'
                    }
Example #6
    def _request(self, tokens):

        #
        # - we use this helper to schedule commands internally (mostly used to switch
        #   the pod on/off)
        #
        for token in tokens:
            self.commands.append((token, {}, ThreadingFuture()))
Example #7
File: fsm.py Project: d2e/ochopod
def shutdown(actor_ref, timeout=None):
    """
    Shuts a state-machine down and waits for it to acknowledge it's down using a latch.

    :type actor_ref: :class:`pykka.ActorRef`
    :param actor_ref: a pykka actor reference
    :type timeout: float
    :param timeout: optional timeout in seconds
    """
    try:
        if not actor_ref:
            return

        latch = ThreadingFuture()
        actor_ref.tell({'request': 'shutdown', 'latch': latch})
        latch.get(timeout=timeout)

    except Timeout:
        pass

    except ActorDeadError:
        pass
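For completeness, a toy actor honouring this shutdown protocol (the real state machines live elsewhere in ochopod; this class is illustrative only):

from pykka import ThreadingActor

class Machine(ThreadingActor):
    def on_receive(self, message):
        if message.get('request') == 'shutdown':
            message['latch'].set(None)   # acknowledge through the latch
            self.stop()

ref = Machine.start()
shutdown(ref, timeout=5.0)   # returns once the actor has acknowledged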
Example #8
    def spin(self, data):

        #
        # - if the termination trigger is set, abort immediately
        #
        if self.force_reset or self.terminate:
            raise Aborted('resetting')

        #
        # - attempt to fetch the lock
        #
        lock = data.zk.Lock('%s/coordinator' % self.prefix)
        try:

            #
            # - the kazoo lock recipe seems to be sensitive to being switched to SUSPENDED .. in order to
            #   avoid stalling on the lock (which is the default behavior), attempt to lock multiple times
            #   with a short timeout (e.g. a spin-lock)
            #
            if hasattr(data, 'lock') and data.lock:
                try:
                    data.lock.release()
                except ConnectionClosedError:
                    pass

            data.lock = None
            lock.acquire(timeout=SAMPLING)
            logger.debug('%s : lock acquired @ %s, now leading' %
                         (self.path, self.prefix))
            data.lock = lock

            #
            # - we have the lock (e.g we are the leader)
            # - start the controller actor
            #
            data.latch = ThreadingFuture()
            data.controller = self.model.start(data.zk, self.hints, self.scope,
                                               self.tag, self.port, data.latch)
            return 'lock', data, 0

        except LockTimeout:
            pass

        #
        # - we could not obtain the lock
        # - blindly loop back and attempt to get it again
        #
        return 'spin', data, 0
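The 'spin-lock' mentioned in the comment simply means retrying acquisition with a short timeout instead of parking on acquire() indefinitely, so the state machine keeps cycling between attempts. Reduced to its core (a sketch, assuming kazoo's LockTimeout):

from kazoo.exceptions import LockTimeout

def spin_acquire(lock, pause):
    # keep retrying with a short timeout; each LockTimeout is the
    # equivalent of spin() returning 'spin' and being invoked again
    while True:
        try:
            lock.acquire(timeout=pause)
            return                        # equivalent to moving to 'lock'
        except LockTimeout:
            continue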
Example #9
def block(creator, strict=1, spin=0.5, collect=None):
    """
    Compound spin-lock creating a latch, passing it to a lambda and then blocking.

    :type creator: lambda
    :param creator: lambda taking a :class:`pykka.ThreadingFuture` as parameter
    :type strict: bool
    :param strict: if true, the method raises if the future's outcome is an exception
    :type spin: float
    :param spin: wait timeout in seconds
    :type collect: list
    :param collect: receives the lambda result if specified
    """

    latch = ThreadingFuture()
    ref = creator(latch)
    if collect is not None:
        collect.append(ref)

    spin_lock(latch, strict, spin)
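The spin_lock() helper itself is not shown. Based on how it is used here, a minimal version polls the latch with a short timeout until it resolves (strict-mode handling omitted; pykka's get() re-raises an exception outcome anyway):

from pykka import Timeout

def spin_lock(latch, strict=1, spin=0.5):
    # poll the future until it resolves; get() re-raises if the
    # outcome set on the latch was an exception
    while True:
        try:
            return latch.get(timeout=spin)
        except Timeout:
            continue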
Example #10
    def start_controller(self, data):

        #
        # - if the termination trigger is set, abort immediately
        # - this is important as it is possible to somehow get the lock after a suspend (acquire() returns
        #   true in that case which is misleading)
        #
        if self.force_reset or self.terminate:
            raise Aborted('resetting')

        #
        # - we have the lock (e.g we are the leader)
        # - start the controller actor
        #
        data.latch = ThreadingFuture()
        logger.debug('%s : lock acquired @ %s, now leading' %
                     (self.path, self.prefix))
        data.controller = self.model.start(data.zk, self.id, self.hints,
                                           self.scope, self.tag, self.port,
                                           data.latch)

        return 'lock', data, 0
Example #11
    def boot(self, lifecycle, model=Reactive, local=0):

        #
        # - quick check to make sure we get the right implementations
        #
        assert issubclass(model,
                          Model), 'model must derive from ochopod.api.Model'
        assert issubclass(
            lifecycle,
            LifeCycle), 'lifecycle must derive from ochopod.api.LifeCycle'

        #
        # - start logging to /var/log/ochopod.log
        #
        logger.info('EC2 kubernetes bindings started')
        web = Flask(__name__)

        #
        # - default presets in case we run outside of kubernetes (local vm testing)
        # - any environment variable prefixed with "ochopod." is of interest for us (e.g this is what the user puts
        #   in the pod configuration yaml/json for instance)
        #
        env = \
            {
                'ochopod_application': '',
                'ochopod_cluster': '',
                'ochopod_debug': 'true',
                'ochopod_local': 'false',
                'ochopod_namespace': 'default',
                'ochopod_port': '8080',
                'ochopod_start': 'true',
                'ochopod_task': ''
            }

        env.update(os.environ)
        ochopod.enable_cli_log(debug=env['ochopod_debug'] == 'true')
        try:

            #
            # - grab our environment variables
            # - isolate the ones prefixed with ochopod_
            #
            logger.debug(
                'environment ->\n%s' %
                '\n'.join(['\t%s -> %s' % (k, v) for k, v in env.items()]))
            hints = {
                k[8:]: v
                for k, v in env.items() if k.startswith('ochopod_')
            }
            if local or hints['local'] == 'true':

                #
                # - we are running in local mode (e.g on a dev workstation)
                # - default everything to localhost
                #
                logger.info(
                    'running in local mode (make sure you run a standalone zookeeper)'
                )
                hints.update({
                    'fwk': 'kubernetes',
                    'ip': '127.0.0.1',
                    'node': 'localhost',
                    'public': '127.0.0.1',
                    'zk': '127.0.0.1:2181'
                })
            else:

                #
                # - we are (assuming to be) deployed on EC2
                # - we'll retrieve the underlying metadata using curl
                #
                def _aws(token):
                    code, lines = shell(
                        'curl -f http://169.254.169.254/latest/meta-data/%s' %
                        token)
                    assert code == 0, 'unable to lookup EC2 metadata for %s (are you running on EC2 ?)' % token
                    return lines[0]

                #
                # - lame workaround to fetch the master IP and credentials as there does not seem to be a way to
                #   use 10.0.0.2 from within the pod yet (or i'm too stupid to find out)
                # - curl to the master to retrieve info about our cluster
                # - don't forget to merge the resulting output
                #
                def _k8s(token):
                    code, lines = shell(
                        'curl -f -u %s:%s -k https://%s/api/v1beta3/namespaces/default/%s'
                        % (env['KUBERNETES_USER'], env['KUBERNETES_PWD'],
                           env['KUBERNETES_MASTER'], token))
                    assert code == 0, 'unable to look the RO service up (is the master running ?)'
                    return json.loads(''.join(lines))

                #
                # - look our local k8s pod up
                # - get our container ip
                # - extract the port bindings
                # - keep any "ochopod_" environment variable & trim its prefix
                #
                @retry(timeout=60, pause=1)
                def _spin():

                    #
                    # - wait until the k8s pod is running and publishing its IP
                    #
                    cfg = _k8s('pods/%s' % env['HOSTNAME'])
                    assert 'podIP' in cfg[
                        'status'], 'pod not ready yet -> %s' % cfg['status'][
                            'phase']
                    return cfg

                this_pod = _spin()
                hints['ip'] = this_pod['status']['podIP']

                #
                # - revert to the k8s pod name if no cluster is specified
                #
                if not hints['cluster']:
                    hints['cluster'] = this_pod['metadata']['name']

                #
                # - consider the 1st pod container
                # - grab the exposed ports (no remapping required)
                #
                ports = {}
                container = this_pod['spec']['containers'][0]
                for binding in container['ports']:
                    port = binding['containerPort']
                    ports[str(port)] = port

                #
                # - set 'task' to $HOSTNAME (the container is named after the k8s pod)
                # - get our public IPV4 address
                # - the "node" will show up as the EC2 instance ID
                #
                hints.update({
                    'fwk': 'k8s-ec2',
                    'node': _aws('instance-id'),
                    'ports': ports,
                    'public': _aws('public-ipv4'),
                    'task': env['HOSTNAME']
                })

                #
                # - look the k8s "ocho-proxy" pod up
                # - it should by design be running our synchronization zookeeper
                #
                proxy = _k8s('pods/ocho-proxy')
                assert 'podIP' in proxy['status'], 'proxy not ready ?'
                hints['zk'] = proxy['status']['podIP']

            #
            # - the cluster must be fully qualified with a namespace (which is defaulted anyway)
            #
            assert hints['namespace'], 'no namespace defined (user error ?)'

            #
            # - start the life-cycle actor which will pass our hints (as a json object) to its underlying sub-process
            # - start our coordinator which will connect to zookeeper and attempt to lead the cluster
            # - upon grabbing the lock the model actor will start and implement the configuration process
            # - the hints are a convenient bag for any data that may change at runtime and needs to be returned (via
            #   the HTTP POST /info request)
            # - what's being registered in zookeeper is immutable though and decorated with additional details by
            #   the coordinator (especially the pod index which is derived from zookeeper)
            #
            latch = ThreadingFuture()
            logger.info('starting %s.%s (kubernetes/ec2) @ %s' %
                        (hints['namespace'], hints['cluster'], hints['node']))
            breadcrumbs = deepcopy(hints)
            env.update({'ochopod': json.dumps(hints)})
            executor = lifecycle.start(env, latch, hints)
            coordinator = Coordinator.start(hints['zk'].split(','),
                                            hints['namespace'],
                                            hints['cluster'],
                                            int(hints['port']), breadcrumbs,
                                            model, hints)

            #
            # - external hook forcing a coordinator reset
            # - this will force a re-connection to zookeeper and pod registration
            # - please note this will not impact the pod lifecycle (e.g the underlying sub-process will be
            #   left running)
            #
            @web.route('/reset', methods=['POST'])
            def _reset():
                coordinator.tell({'request': 'reset'})
                return '{}', 200

            #
            # - external hook exposing information about our pod
            # - this is a subset of what's registered in zookeeper at boot-time
            # - the data is dynamic and updated from time to time by the model and executor actors
            #
            @web.route('/info', methods=['POST'])
            def _info():
                keys = \
                    [
                        'application',
                        'ip',
                        'node',
                        'port',
                        'ports',
                        'process',
                        'public',
                        'state',
                        'status',
                        'task'
                    ]

                subset = dict(filter(lambda i: i[0] in keys,
                                     hints.iteritems()))
                return json.dumps(subset), 200

            #
            # - external hook exposing our circular log
            # - reverse and dump ochopod.log as a json array
            #
            @web.route('/log', methods=['POST'])
            def _log():
                with open(ochopod.LOG, 'r+') as log:
                    lines = [line for line in log]
                    return json.dumps(lines), 200

            #
            # - web-hook used to receive requests from the leader or the CLI tools
            # - those requests are passed down to the executor actor
            # - any non HTTP 200 response is a failure
            # - failure to acknowledge within the specified timeout will result in an HTTP 408 (REQUEST TIMEOUT)
            # - attempting to send a control request to a dead pod will result in an HTTP 410 (GONE)
            #
            @web.route('/control/<task>', methods=['POST'])
            @web.route('/control/<task>/<timeout>', methods=['POST'])
            def _control(task, timeout='60'):
                try:

                    ts = time.time()
                    logger.debug('http in -> /control/%s' % task)
                    latch = ThreadingFuture()
                    executor.tell({
                        'request': task,
                        'latch': latch,
                        'data': request.data
                    })
                    js, code = latch.get(timeout=int(timeout))
                    ms = 1000 * (time.time() - ts)
                    logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                    return json.dumps(js), code

                except Timeout:

                    #
                    # - the executor did not reply within the specified timeout
                    # - gracefully fail on an HTTP 408
                    #
                    return '{}', 408

                except ActorDeadError:

                    #
                    # - the executor has been shut down (probably after a /control/kill)
                    # - gracefully fail on an HTTP 410
                    #
                    return '{}', 410

            #
            # - internal hook required to shutdown the web-server
            # - it's not possible to do it outside of a request handler
            # - make sure this call only comes from localhost (todo)
            #
            @web.route('/terminate', methods=['POST'])
            def _terminate():
                request.environ.get('werkzeug.server.shutdown')()
                return '{}', 200

            class _Runner(threading.Thread):
                """
                Run werkzeug from a separate thread to avoid blocking the main one. We'll have to shut it down
                using a dedicated HTTP POST.
                """
                def run(self):
                    web.run(host='0.0.0.0',
                            port=int(hints['port']),
                            threaded=True)

            try:

                #
                # - block on the lifecycle actor until it goes down (usually after a /control/kill request)
                #
                _Runner().start()
                spin_lock(latch)
                logger.debug('pod is dead, idling')

                #
                # - simply idle forever (since the framework would restart any container that terminates)
                # - /log and /info HTTP requests will succeed (and show the pod as being killed)
                # - any control request will now fail
                #
                while 1:
                    time.sleep(60.0)

            finally:

                #
                # - when we exit the block, first shutdown our executor (which is probably already down)
                # - then shutdown the coordinator to un-register from zookeeper
                # - finally ask werkzeug to shutdown via a REST call
                #
                shutdown(executor)
                shutdown(coordinator)
                post('http://127.0.0.1:%s/terminate' % env['ochopod_port'])

        except KeyboardInterrupt:

            logger.fatal('CTRL-C pressed')

        except Exception as failure:

            logger.fatal('unexpected condition -> %s' % diagnostic(failure))

        exit(1)
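The @retry decorator used by _spin() above is an ochopod helper that is not shown. A plausible reconstruction, under the assumption that it keeps invoking the wrapped callable until it stops raising or the time budget runs out:

import time
from functools import wraps

def retry(timeout=60, pause=1):
    """Assumed behaviour: retry until success or until `timeout` seconds
    have elapsed, sleeping `pause` seconds between attempts."""
    def decorator(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            deadline = time.time() + timeout
            while True:
                try:
                    return fn(*args, **kwargs)
                except Exception:
                    if time.time() + pause > deadline:
                        raise
                    time.sleep(pause)
        return wrapper
    return decorator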
Example #12
    def boot(self, lifecycle, model=Reactive, local=0):

        #
        # - quick check to make sure we get the right implementations
        #
        assert issubclass(model, Model), 'model must derive from ochopod.api.Model'
        assert issubclass(lifecycle, LifeCycle), 'lifecycle must derive from ochopod.api.LifeCycle'

        #
        # - start logging to /var/log/ochopod.log
        #
        logger.info('EC2 marathon bindings started')
        web = Flask(__name__)

        #
        # - default presets in case we run outside of marathon (local vm testing)
        # - any environment variable prefixed with "ochopod." is of interest for us (e.g this is what the user puts
        #   in the marathon application configuration for instance)
        # - the other settings come from marathon (namely the port bindings & application/task identifiers)
        # - the MESOS_TASK_ID is important to keep around to enable task deletion via the marathon REST API
        #
        env = \
            {
                'ochopod_application': '',
                'ochopod_cluster': 'default',
                'ochopod_debug': 'true',
                'ochopod_local': 'false',
                'ochopod_namespace': 'marathon',
                'ochopod_port': '8080',
                'ochopod_start': 'true',
                'ochopod_task': '',
                'PORT_8080': '8080'
            }

        env.update(os.environ)
        ochopod.enable_cli_log(debug=env['ochopod_debug'] == 'true')
        try:

            #
            # - grab our environment variables (which are set by the marathon executor)
            # - extract the mesos PORT_* bindings and construct a small remapping dict
            #
            ports = {}
            logger.debug('environment ->\n%s' % '\n'.join(['\t%s -> %s' % (k, v) for k, v in env.items()]))
            for key, val in env.items():
                if key.startswith('PORT_'):
                    ports[key[5:]] = int(val)

            #
            # - keep any "ochopod_" environment variable & trim its prefix
            # - default all our settings, especially the mandatory ones
            # - the ip and zookeeper are defaulted to localhost to enable easy testing
            #
            hints = {k[8:]: v for k, v in env.items() if k.startswith('ochopod_')}
            if local or hints['local'] == 'true':

                #
                # - we are running in local mode (e.g on a dev workstation)
                # - default everything to localhost
                #
                logger.info('running in local mode (make sure you run a standalone zookeeper)')
                hints.update(
                    {
                        'fwk': 'marathon-ec2',
                        'ip': '127.0.0.1',
                        'node': 'local',
                        'ports': ports,
                        'public': '127.0.0.1',
                        'zk': '127.0.0.1:2181'
                    })
            else:

                #
                # - we are (assuming to be) deployed on EC2
                # - get our underlying metadata using curl
                #
                def _peek(token, strict=True):
                    code, lines = shell('curl -f http://169.254.169.254/latest/meta-data/%s' % token)
                    assert not strict or code == 0, 'unable to lookup EC2 metadata for %s (are you running on EC2 ?)' % token
                    return lines[0]

                #
                # - get our local and public IPV4 addresses
                # - the "node" will show up as the EC2 instance ID
                # - note we allow the public IPv4 lookup to fail (in case we run in VPC)
                #
                hints.update(
                    {
                        'application': env['MARATHON_APP_ID'][1:],
                        'fwk': 'marathon-ec2',
                        'ip': _peek('local-ipv4'),
                        'node': _peek('instance-id'),
                        'ports': ports,
                        'public': _peek('public-ipv4', strict=False),
                        'task': env['MESOS_TASK_ID'],
                        'zk': ''
                    })

                def _install_from_package():

                    #
                    # - a regular package install will write the slave settings under /etc/mesos/zk
                    # - the snippet in there looks like zk://10.0.0.56:2181/mesos
                    #
                    code, lines = shell("cat /etc/mesos/zk")
                    assert code == 0 and lines[0], 'unable to retrieve the zk connection string'
                    return lines[0][5:].split('/')[0]

                def _dcos_deployment():

                    #
                    # - a DCOS slave is setup slightly differently with the settings being environment
                    #   variables set in /opt/mesosphere/etc/mesos-slave
                    # - the snippet in there is prefixed by MESOS_MASTER= and uses an alias
                    # - it looks like MESOS_MASTER=zk://leader.mesos:2181/mesos
                    #
                    code, lines = shell("grep MASTER /opt/mesosphere/etc/mesos-slave")
                    assert code == 0 and lines[0], 'unable to retrieve the zk connection string'
                    return lines[0][18:].split('/')[0]

                #
                # - depending on how the slave has been installed we might have to look in various places
                #   to find out what our zookeeper connection string is
                # - warning: a URL-like format such as zk://<ip:port>,..,<ip:port>/mesos is used
                # - just keep the ip & port part and discard the rest
                #
                for method in [_install_from_package, _dcos_deployment]:
                    try:
                        hints['zk'] = method()
                        break

                    except Exception:
                        pass

                assert hints['zk'], 'unable to determine where zookeeper is located (unsupported/bogus setup ?)'

            #
            # - the cluster must be fully qualified with a namespace (which is defaulted anyway)
            #
            assert hints['cluster'] and hints['namespace'], 'no cluster and/or namespace defined (user error ?)'

            #
            # - start the life-cycle actor which will pass our hints (as a json object) to its underlying sub-process
            # - start our coordinator which will connect to zookeeper and attempt to lead the cluster
            # - upon grabbing the lock the model actor will start and implement the configuration process
            # - the hints are a convenient bag for any data that may change at runtime and needs to be returned (via
            #   the HTTP POST /info request)
            # - what's being registered in zookeeper is immutable though and decorated with additional details by
            #   the coordinator (especially the pod index which is derived from zookeeper)
            #
            latch = ThreadingFuture()
            logger.info('starting %s.%s (marathon/ec2) @ %s' % (hints['namespace'], hints['cluster'], hints['node']))
            breadcrumbs = deepcopy(hints)
            hints['metrics'] = {}
            env.update({'ochopod': json.dumps(hints)})
            executor = lifecycle.start(env, latch, hints)
            coordinator = Coordinator.start(
                hints['zk'].split(','),
                hints['namespace'],
                hints['cluster'],
                int(hints['port']),
                breadcrumbs,
                model,
                hints)

            #
            # - external hook forcing a coordinator reset
            # - this will force a re-connection to zookeeper and pod registration
            # - please note this will not impact the pod lifecycle (e.g the underlying sub-process will be
            #   left running)
            #
            @web.route('/reset', methods=['POST'])
            def _reset():
                coordinator.tell({'request': 'reset'})
                return '{}', 200

            #
            # - external hook exposing information about our pod
            # - this is a subset of what's registered in zookeeper at boot-time
            # - the data is dynamic and updated from time to time by the model and executor actors
            #
            @web.route('/info', methods=['POST'])
            def _info():
                keys = \
                    [
                        'application',
                        'ip',
                        'metrics',
                        'node',
                        'port',
                        'ports',
                        'process',
                        'public',
                        'state',
                        'status',
                        'task'
                    ]

                subset = dict(filter(lambda i: i[0] in keys, hints.iteritems()))
                return json.dumps(subset), 200

            #
            # - external hook exposing our circular log
            # - reverse and dump ochopod.log as a json array
            #
            @web.route('/log', methods=['POST'])
            def _log():
                with open(ochopod.LOG, 'r+') as log:
                    lines = [line for line in log]
                    return json.dumps(lines), 200

            #
            # - web-hook used to receive requests from the leader or the CLI tools
            # - those requests are passed down to the executor actor
            # - any non HTTP 200 response is a failure
            # - failure to acknowledge within the specified timeout will result in an HTTP 408 (REQUEST TIMEOUT)
            # - attempting to send a control request to a dead pod will result in an HTTP 410 (GONE)
            #
            @web.route('/control/<task>', methods=['POST'])
            @web.route('/control/<task>/<timeout>', methods=['POST'])
            def _control(task, timeout='60'):
                try:

                    ts = time.time()
                    logger.debug('http in -> /control/%s' % task)
                    latch = ThreadingFuture()
                    executor.tell({'request': task, 'latch': latch, 'data': request.data})
                    js, code = latch.get(timeout=int(timeout))
                    ms = 1000 * (time.time() - ts)
                    logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                    return json.dumps(js), code

                except Timeout:

                    #
                    # - the executor did not reply within the specified timeout
                    # - gracefully fail on an HTTP 408
                    #
                    return '{}', 408

                except ActorDeadError:

                    #
                    # - the executor has been shut down (probably after a /control/kill)
                    # - gracefully fail on an HTTP 410
                    #
                    return '{}', 410

            #
            # - internal hook required to shutdown the web-server
            # - it's not possible to do it outside of a request handler
            # - make sure this call only comes from localhost (todo)
            #
            @web.route('/terminate', methods=['POST'])
            def _terminate():
                request.environ.get('werkzeug.server.shutdown')()
                return '{}', 200

            class _Runner(threading.Thread):
                """
                Run werkzeug from a separate thread to avoid blocking the main one. We'll have to shut it down
                using a dedicated HTTP POST.
                """

                def run(self):
                    web.run(host='0.0.0.0', port=int(hints['port']), threaded=True)

            try:

                #
                # - block on the lifecycle actor until it goes down (usually after a /control/kill request)
                #
                _Runner().start()
                spin_lock(latch)
                logger.debug('pod is dead, idling')

                #
                # - simply idle forever (since the framework would restart any container that terminates)
                # - /log and /info HTTP requests will succeed (and show the pod as being killed)
                # - any control request will now fail
                #
                while 1:
                    time.sleep(60.0)

            finally:

                #
                # - when we exit the block, first shutdown our executor (which is probably already down)
                # - then shutdown the coordinator to un-register from zookeeper
                # - finally ask werkzeug to shutdown via a REST call
                #
                shutdown(executor)
                shutdown(coordinator)
                post('http://127.0.0.1:%s/terminate' % env['ochopod_port'])

        except KeyboardInterrupt:

            logger.fatal('CTRL-C pressed')

        except Exception as failure:

            logger.fatal('unexpected condition -> %s' % diagnostic(failure))
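Both zookeeper-discovery paths above lean on a shell() helper that returns an exit code plus the captured stdout lines. A minimal approximation (not ochopod's exact implementation):

import subprocess

def shell(snippet):
    # run the snippet through the shell, capture stdout, and hand back
    # (exit code, list of output lines)
    proc = subprocess.Popen(snippet, shell=True, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    out, _ = proc.communicate()
    return proc.returncode, out.decode('utf-8').splitlines()

code, lines = shell('cat /etc/mesos/zk')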
Example #13
def test_future_nested_future(future):
    inner_future = ThreadingFuture()
    inner_future.set("foo")
    outer_future = ThreadingFuture()
    outer_future.set(inner_future)
    assert outer_future.get().get() == "foo"
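Note that ThreadingFuture.get() does not flatten nested futures: as the assertion shows, the outer get() hands back the inner future, which needs its own get() to yield the value.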
Example #14
def futures():
    return [ThreadingFuture() for _ in range(3)]
Example #15
def future():
    return ThreadingFuture()
Example #16
def _start():
    latch = ThreadingFuture()
    ref = creator(latch)
    if collect is not None:
        collect.append(ref)
    return latch
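This _start() helper mirrors the first half of the block() helper shown earlier: create the latch, hand it to the creator lambda, optionally record the actor reference, and return the latch for the caller to wait on.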
Example #17
    def boot(self, lifecycle, model=Reactive, tools=None, local=False):

        #
        # - quick check to make sure we get the right implementations
        #
        assert issubclass(model,
                          Model), 'model must derive from ochopod.api.Model'
        assert issubclass(
            lifecycle,
            LifeCycle), 'lifecycle must derive from ochopod.api.LifeCycle'

        #
        # - instantiate our flask endpoint
        # - default to a json handler for all HTTP errors (including an unexpected 500)
        #
        def _handler(error):
            http = error.code if isinstance(error, HTTPException) else 500
            return '{}', http, {
                'Content-Type': 'application/json; charset=utf-8'
            }

        web = Flask(__name__)
        for code in default_exceptions.iterkeys():
            web.error_handler_spec[None][code] = _handler

        #
        # - default presets in case we run outside of marathon (local vm testing)
        # - any environment variable prefixed with "ochopod." is of interest for us (e.g this is what the user puts
        #   in the marathon application configuration for instance)
        # - the other settings come from marathon (namely the port bindings & application/task identifiers)
        # - the MESOS_TASK_ID is important to keep around to enable task deletion via the marathon REST API
        #
        env = \
            {
                'ochopod_application':  '',
                'ochopod_cluster':      'default',
                'ochopod_debug':        'true',
                'ochopod_local':        'false',
                'ochopod_namespace':    'marathon',
                'ochopod_port':         '8080',
                'ochopod_start':        'true',
                'ochopod_task':         '',
                'ochopod_zk':           '',
                'PORT_8080':            '8080'
            }

        env.update(os.environ)
        ochopod.enable_cli_log(debug=env['ochopod_debug'] == 'true')
        try:

            #
            # - grab our environment variables (which are set by the marathon executor)
            # - extract the mesos PORT_* bindings and construct a small remapping dict
            #
            ports = {}
            logger.debug(
                'environment ->\n%s' %
                '\n'.join(['\t%s -> %s' % (k, v) for k, v in env.items()]))
            for key, val in env.items():
                if key.startswith('PORT_'):
                    ports[key[5:]] = int(val)

            #
            # - keep any "ochopod_" environment variable & trim its prefix
            # - default all our settings, especially the mandatory ones
            # - the ip and zookeeper are defaulted to localhost to enable easy testing
            #
            hints = {
                k[8:]: v
                for k, v in env.items() if k.startswith('ochopod_')
            }
            if local or hints['local'] == 'true':

                #
                # - we are running in local mode (e.g on a dev workstation)
                # - default everything to localhost
                #
                logger.info(
                    'running in local mode (make sure you run a standalone zookeeper)'
                )
                hints.update({
                    'fwk': 'marathon (debug)',
                    'ip': '127.0.0.1',
                    'node': 'local',
                    'ports': ports,
                    'public': '127.0.0.1',
                    'zk': '127.0.0.1:2181'
                })
            else:

                #
                # - extend our hints
                # - add the application + task
                #
                hints.update({
                    'application': env['MARATHON_APP_ID'][1:],
                    'fwk': 'marathon',
                    'ip': '',
                    'node': '',
                    'ports': ports,
                    'public': '',
                    'task': env['MESOS_TASK_ID'],
                    'zk': ''
                })

                #
                # - use whatever subclass is implementing us to infer 'ip', 'node' and 'public'
                #
                hints.update(self.get_node_details())

                #
                # - look up the zookeeper connection string from an environment variable or on disk
                # - we have to look into different places depending on how mesos was installed
                #
                def _1():

                    #
                    # - most recent DCOS release
                    # - $MESOS_MASTER is located in /opt/mesosphere/etc/mesos-slave-common
                    # - the snippet in there is prefixed by MESOS_MASTER=zk://<ip:port>/mesos
                    #
                    logger.debug(
                        'checking /opt/mesosphere/etc/mesos-slave-common...')
                    _, lines = shell(
                        "grep MESOS_MASTER /opt/mesosphere/etc/mesos-slave-common"
                    )
                    return lines[0][13:]

                def _2():

                    #
                    # - same as above except for slightly older DCOS releases
                    # - $MESOS_MASTER is located in /opt/mesosphere/etc/mesos-slave
                    #
                    logger.debug('checking /opt/mesosphere/etc/mesos-slave...')
                    _, lines = shell(
                        "grep MESOS_MASTER /opt/mesosphere/etc/mesos-slave")
                    return lines[0][13:]

                def _3():

                    #
                    # - a regular package install will write the slave settings under /etc/mesos/zk (the snippet in
                    #   there looks like zk://10.0.0.56:2181/mesos)
                    #
                    logger.debug('checking /etc/mesos/zk...')
                    _, lines = shell("cat /etc/mesos/zk")
                    return lines[0]

                def _4():

                    #
                    # - look for ZK from environment variables
                    # - user can pass down ZK using $ochopod_zk
                    # - this last-resort situation is used mostly for debugging
                    #
                    logger.debug(
                        'checking $ochopod_zk environment variable...')
                    return env['ochopod_zk']

                #
                # - depending on how the slave has been installed we might have to look in various places
                #   to find out what our zookeeper connection string is
                # - use urlparse to keep the host:port part of the URL (possibly including a login+password)
                #
                for method in [_1, _2, _3, _4]:
                    try:
                        hints['zk'] = urlparse(method()).netloc
                        break

                    except Exception:
                        pass

            #
            # - the cluster must be fully qualified with a namespace (which is defaulted anyway)
            #
            assert hints[
                'zk'], 'unable to determine where zookeeper is located (unsupported/bogus mesos setup ?)'
            assert hints['cluster'] and hints[
                'namespace'], 'no cluster and/or namespace defined (user error ?)'

            #
            # - load the tools
            #
            if tools:
                tools = {
                    tool.tag: tool
                    for tool in
                    [clz() for clz in tools if issubclass(clz, Tool)]
                    if tool.tag
                }
                logger.info('supporting tools %s' % ', '.join(tools.keys()))

            #
            # - start the life-cycle actor which will pass our hints (as a json object) to its underlying sub-process
            # - start our coordinator which will connect to zookeeper and attempt to lead the cluster
            # - upon grabbing the lock the model actor will start and implement the configuration process
            # - the hints are a convenient bag for any data that may change at runtime and needs to be returned (via
            #   the HTTP POST /info request)
            # - what's being registered in zookeeper is immutable though and decorated with additional details by
            #   the coordinator (especially the pod index which is derived from zookeeper)
            #
            latch = ThreadingFuture()
            logger.info('starting %s.%s (marathon) @ %s' %
                        (hints['namespace'], hints['cluster'], hints['node']))
            breadcrumbs = deepcopy(hints)
            hints['metrics'] = {}
            hints['dependencies'] = model.depends_on
            env.update({'ochopod': json.dumps(hints)})
            executor = lifecycle.start(env, latch, hints)
            coordinator = Coordinator.start(hints['zk'].split(','),
                                            hints['namespace'],
                                            hints['cluster'],
                                            int(hints['port']), breadcrumbs,
                                            model, hints)

            #
            # - external hook forcing a coordinator reset
            # - this will force a re-connection to zookeeper and pod registration
            # - please note this will not impact the pod lifecycle (e.g the underlying sub-process will be
            #   left running)
            #
            @web.route('/reset', methods=['POST'])
            def _reset():

                logger.debug('http in -> /reset')
                coordinator.tell({'request': 'reset'})
                return '{}', 200, {
                    'Content-Type': 'application/json; charset=utf-8'
                }

            #
            # - external hook exposing information about our pod
            # - this is a subset of what's registered in zookeeper at boot-time
            # - the data is dynamic and updated from time to time by the model and executor actors
            # - from @pferro -> the pod's dependencies defined in the model are now added as well
            #
            @web.route('/info', methods=['POST'])
            def _info():

                logger.debug('http in -> /info')
                keys = \
                    [
                        'application',
                        'dependencies',
                        'ip',
                        'metrics',
                        'node',
                        'port',
                        'ports',
                        'process',
                        'public',
                        'state',
                        'status',
                        'task'
                    ]

                subset = dict(filter(lambda i: i[0] in keys,
                                     hints.iteritems()))
                return json.dumps(subset), 200, {
                    'Content-Type': 'application/json; charset=utf-8'
                }

            #
            # - external hook exposing our circular log
            # - reverse and dump ochopod.log as a json array
            #
            @web.route('/log', methods=['POST'])
            def _log():

                logger.debug('http in -> /log')
                with open(ochopod.LOG, 'r+') as log:
                    lines = [line for line in log]
                    return json.dumps(lines), 200, {
                        'Content-Type': 'application/json; charset=utf-8'
                    }

            #
            # - RPC call to run a custom tool within the pod
            #
            @web.route('/exec', methods=['POST'])
            def _exec():

                logger.debug('http in -> /exec')

                #
                # - make sure the command (first token in the X-Shell header) maps to a tool
                # - if no match abort on a 404
                #
                line = request.headers['X-Shell']
                tokens = line.split(' ')
                cmd = tokens[0]
                if not tools or cmd not in tools:
                    return '{}', 404, {
                        'Content-Type': 'application/json; charset=utf-8'
                    }

                code = 1
                tool = tools[cmd]

                #
                # - make sure the parser does not sys.exit()
                #
                class _Parser(ArgumentParser):
                    def exit(self, status=0, message=None):
                        raise ValueError(message)

                #
                # - prep a temporary directory
                # - invoke define_cmdline_parsing()
                # - switch off parsing if NotImplementedError is raised
                #
                use_parser = 1
                parser = _Parser(prog=tool.tag)
                try:
                    tool.define_cmdline_parsing(parser)

                except NotImplementedError:
                    use_parser = 0

                tmp = tempfile.mkdtemp()
                try:

                    #
                    # - parse the command line
                    # - upload any attachment
                    #
                    args = parser.parse_args(
                        tokens[1:]) if use_parser else ' '.join(tokens[1:])
                    for tag, upload in request.files.items():
                        where = path.join(tmp, tag)
                        logger.debug('uploading %s @ %s' % (tag, tmp))
                        upload.save(where)

                    #
                    # - run the tool method
                    # - pass the temporary directory as well
                    #
                    logger.info('invoking "%s"' % line)
                    code, lines = tool.body(args, tmp)

                except ValueError as failure:

                    lines = [
                        parser.format_help()
                        if failure.message is None else failure.message
                    ]

                except Exception as failure:

                    lines = ['unexpected failure -> %s' % failure]

                finally:

                    #
                    # - make sure to cleanup our temporary directory
                    #
                    shutil.rmtree(tmp)

                out = \
                    {
                        'code': code,
                        'stdout': lines
                    }

                return json.dumps(out), 200, {
                    'Content-Type': 'application/json; charset=utf-8'
                }

            #
            # - web-hook used to receive requests from the leader or the CLI tools
            # - those requests are passed down to the executor actor
            # - any non HTTP 200 response is a failure
            # - failure to acknowledge within the specified timeout will result in an HTTP 408 (REQUEST TIMEOUT)
            # - attempting to send a control request to a dead pod will result in an HTTP 410 (GONE)
            #
            @web.route('/control/<task>', methods=['POST'])
            @web.route('/control/<task>/<timeout>', methods=['POST'])
            def _control(task, timeout='60'):

                logger.debug('http in -> /control/%s' % task)
                if task not in ['check', 'on', 'off', 'ok', 'kill', 'signal']:

                    #
                    # - fail on an HTTP 400 if the request is not supported
                    #
                    return '{}', 400, {
                        'Content-Type': 'application/json; charset=utf-8'
                    }

                try:

                    ts = time.time()
                    latch = ThreadingFuture()
                    executor.tell({
                        'request': task,
                        'latch': latch,
                        'data': request.data
                    })
                    js, code = latch.get(timeout=int(timeout))
                    ms = 1000 * (time.time() - ts)
                    logger.debug('http out -> HTTP %s (%d ms)' % (code, ms))
                    return json.dumps(js), code, {
                        'Content-Type': 'application/json; charset=utf-8'
                    }

                except Timeout:

                    #
                    # - the executor did not reply within the specified timeout
                    # - gracefully fail on an HTTP 408
                    #
                    return '{}', 408, {
                        'Content-Type': 'application/json; charset=utf-8'
                    }

                except ActorDeadError:

                    #
                    # - the executor has been shut down (probably after a /control/kill)
                    # - gracefully fail on an HTTP 410
                    #
                    return '{}', 410, {
                        'Content-Type': 'application/json; charset=utf-8'
                    }

            #
            # - internal hook required to shutdown the web-server
            # - it's not possible to do it outside of a request handler
            # - make sure this call only comes from localhost (todo)
            #
            @web.route('/terminate', methods=['POST'])
            def _terminate():

                request.environ.get('werkzeug.server.shutdown')()
                return '{}', 200, {
                    'Content-Type': 'application/json; charset=utf-8'
                }

            #
            # - run werkzeug from a separate thread to avoid blocking the main one
            # - we'll have to shut it down using a dedicated HTTP POST
            #
            class _Runner(threading.Thread):
                def run(self):
                    web.run(host='0.0.0.0',
                            port=int(hints['port']),
                            threaded=True)

            try:

                #
                # - block on the lifecycle actor until it goes down (usually after a /control/kill request)
                #
                _Runner().start()
                spin_lock(latch)
                logger.debug('pod is dead, idling')
                while 1:

                    #
                    # - simply idle forever (since the framework would restart any container that terminates)
                    # - /log and /info HTTP requests will succeed (and show the pod as being killed)
                    # - any control request will now fail
                    #
                    time.sleep(60.0)

            finally:

                #
                # - when we exit the block, first shutdown our executor (which is probably already down)
                # - then shutdown the coordinator to un-register from zookeeper
                # - finally ask werkzeug to shutdown via a REST call
                #
                shutdown(executor)
                shutdown(coordinator)
                post('http://127.0.0.1:%s/terminate' % env['ochopod_port'])

        except KeyboardInterrupt:

            logger.fatal('CTRL-C pressed')

        except Exception as failure:

            logger.fatal('unexpected condition -> %s' % diagnostic(failure))
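A tool dispatched through the /exec hook above would subclass the Tool API with a tag, optional command-line parsing, and a body returning (exit code, stdout lines). A hypothetical example; the import path and the exact base-class contract are assumptions:

from ochopod.api import Tool   # assumed location of the Tool base class

class Echo(Tool):
    """Hypothetical tool, invoked via POST /exec with header 'X-Shell: echo ...'."""
    tag = 'echo'

    def define_cmdline_parsing(self, parser):
        parser.add_argument('words', nargs='*')

    def body(self, args, cwd):
        # the (code, stdout lines) pair that _exec() serializes back as json
        return 0, [' '.join(args.words)]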