Example #1
File: io.py Project: inthecloud247/ochopod
    def spin(self, data):

        if self.terminate:
            raise Aborted('terminating')

        while len(self.pending) > 0:

            out = None
            msg = self.pending.popleft()
            try:

                #
                # - run the specified closure
                # - assign the latch to whatever is returned
                #
                out = msg['function'](data.zk)

            except Exception as failure:

                #
                # - in case of exception simply pass it upwards via the latch
                # - this will allow for finer-grained error handling
                #
                out = failure

            msg['latch'].set(out)

        return 'spin', data, 0.25
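
The spin() loop above drains a deque of queued messages, each carrying a closure and a latch, and hands back either the closure's return value or the exception it raised. A minimal, self-contained sketch of that hand-off is shown below, with concurrent.futures.Future standing in for the project's latch object (same set-the-result, block-on-get contract); the submit() and drain() helpers are illustrative names, not part of ochopod.

from collections import deque
from concurrent.futures import Future

pending = deque()

def submit(func):
    # - queue a closure together with a latch the caller can block on
    latch = Future()
    pending.append({'function': func, 'latch': latch})
    return latch

def drain(context):
    # - run every queued closure against the shared context
    # - pass the return value (or the exception itself) back via the latch
    while pending:
        msg = pending.popleft()
        try:
            out = msg['function'](context)
        except Exception as failure:
            out = failure
        msg['latch'].set_result(out)

latch = submit(lambda ctx: ctx.upper())
drain('fake zookeeper handle')
print(latch.result())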
Example #2
    def spin(self, data):

        #
        # - if the termination trigger is set, abort immediately
        #
        if self.force_reset or self.terminate:
            raise Aborted('resetting')

        #
        # - attempt to fetch the lock
        # - allocate it if not already done
        # - it is *important* to just allocate one lock as there is a leak in kazoo
        #
        if not hasattr(data, 'lock'):
            data.lock = data.zk.Lock('%s/coordinator' % self.prefix)

        try:

            #
            # - attempt to lock within a 5 seconds timeout to avoid stalling in some cases
            #
            if data.lock.acquire(timeout=5.0 * SAMPLING):
                return 'start_controller', data, 0

        except LockTimeout:
            pass

        return 'spin', data, 0
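
Example #2 relies on the kazoo lock recipe with a bounded acquire so the state machine never stalls on a suspended session. The sketch below shows that pattern in isolation, assuming a ZooKeeper server reachable at 127.0.0.1:2181; the /demo/coordinator path and the 5-second timeout are illustrative.

from kazoo.client import KazooClient
from kazoo.exceptions import LockTimeout

zk = KazooClient(hosts='127.0.0.1:2181')
zk.start()

# - allocate the lock once and re-use it (the code above keeps it on data
#   for the same reason)
lock = zk.Lock('/demo/coordinator')
try:
    # - bounded acquire so a stalled session does not block the caller
    if lock.acquire(timeout=5.0):
        print('lock held, this node leads')
except LockTimeout:
    print('somebody else holds the lock, retry later')
finally:
    if lock.is_acquired:
        lock.release()
    zk.stop()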
Example #3
    def kill(self, data):

        #
        # - the /kill request will first guarantee we terminate the process
        #
        if data.sub:
            raise Aborted('resetting to terminate pid %s' % data.sub.pid)

        try:

            #
            # - invoke the optional finalize() callback
            #
            logger.info('%s : finalizing pod' % self.path)
            self.finalize()

        except Exception as failure:

            #
            # - log something if for some reason finalize() failed as we can't really recover
            # - don't bother responding with a 406
            #
            logger.warning('%s : failed to finalize -> %s' %
                           (self.path, diagnostic(failure)))

        #
        # - in any case request a termination and tag the pod as 'dead'
        #
        reply = {}, 200
        self.terminate = 1
        self.hints['process'] = 'dead'
        data.latch.set(reply)
        self.commands.popleft()
        return 'spin', data, 0
Example #4
File: io.py Project: inthecloud247/ochopod
    def wait_for_cnx(self, data):

        if self.terminate:
            raise Aborted('terminating')

        if not self.connected:
            return 'wait_for_cnx', data, 1.0

        return 'spin', data, 0
Example #5
    def wait_for_cnx(self, data):

        if self.force_reset or self.terminate:
            raise Aborted('resetting')

        #
        # - loop back if we haven't received a CONNECTED event from the driver
        #
        if not self.connected:
            return 'wait_for_cnx', data, SAMPLING

        #
        # - the /pods node holds all our ephemeral per-container data (one container == one child node)
        # - the /hash node stores the last recorded md5 hash (local pods + dependencies), which we use to
        #   flag any change amongst the pods or their dependencies
        #
        data.zk.ensure_path('%s/pods' % self.prefix)
        data.zk.ensure_path('%s/hash' % self.prefix)
        try:

            #
            # - register ourselves by creating an ephemeral
            # - this is where we can store arbitrary information (e.g our breadcrumbs)
            # - we ask for a sequence counter as well which we then keep (e.g in case of connection loss or reset
            #   we guarantee the pod won't get assigned a new index)
            # - this is *critical* for some use-cases (e.g Kafka where the broker index must remain the same)
            #
            path = data.zk.create('%s/pods/%s.' % (self.prefix, self.id),
                                  ephemeral=True,
                                  sequence=True)
            tokens = path.split('.')
            if self.seq is None:
                self.seq = int(tokens[-1])
            self.breadcrumbs['seq'] = self.seq
            js = json.dumps(self.breadcrumbs)
            data.zk.set(path, js)

        except NodeExistsError:

            #
            # - if the node is already there we just recovered from a zookeeper connection loss
            #   and /snapshot has not been phased out yet .. this is not an issue, simply pause a bit
            #   to re-attempt later
            #
            logger.debug(
                '%s : pod %s is already there (probably a zk reconnect)' %
                (self.path, self.id))
            return 'wait_for_cnx', data, 5.0 * SAMPLING

        logger.debug('%s : registered as %s (#%d)' %
                     (self.path, self.id, self.seq))
        data.connected_at = time.time()
        return 'spin', data, 0
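
The registration step in example #5 boils down to creating an ephemeral, sequential znode and remembering its sequence suffix as a stable pod index. Below is a condensed sketch of just that step, again assuming a ZooKeeper server at 127.0.0.1:2181; the /demo prefix and the 'my-pod' id are made up.

import json
from kazoo.client import KazooClient

zk = KazooClient(hosts='127.0.0.1:2181')
zk.start()

prefix = '/demo'
zk.ensure_path('%s/pods' % prefix)

# - the trailing '.' separates the pod id from the sequence suffix appended
#   by zookeeper, mirroring the '%s/pods/%s.' naming used above
path = zk.create('%s/pods/%s.' % (prefix, 'my-pod'),
                 ephemeral=True,
                 sequence=True)
seq = int(path.split('.')[-1])

# - whatever gets stored here (breadcrumbs, seq, ...) disappears with the session
zk.set(path, json.dumps({'seq': seq}).encode('utf-8'))
print('registered as %s (#%d)' % (path, seq))
zk.stop()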
Example #6
    def off(self, data):

        #
        # - the /stop request does basically nothing
        # - it only guarantees we terminate the process
        #
        if data.forked:
            raise Aborted('resetting to terminate pid %s' % data.forked.pid)

        data.latch.set(200)
        self.commands.popleft()
        return 'spin', data, 0
Example #7
    def spin(self, data):

        #
        # - if the termination trigger is set, abort immediately
        #
        if self.force_reset or self.terminate:
            raise Aborted('resetting')

        #
        # - attempt to fetch the lock
        #
        lock = data.zk.Lock('%s/coordinator' % self.prefix)
        try:

            #
            # - the kazoo lock recipe seems to be sensitive to being switched to SUSPENDED .. in order to
            #   avoid stalling on the lock (which is the default behavior), attempt to lock multiple times
            #   with a short timeout (e.g a spin-lock)
            #
            if hasattr(data, 'lock') and data.lock:
                try:
                    data.lock.release()
                except ConnectionClosedError:
                    pass

            data.lock = None
            lock.acquire(timeout=SAMPLING)
            logger.debug('%s : lock acquired @ %s, now leading' %
                         (self.path, self.prefix))
            data.lock = lock

            #
            # - we have the lock (e.g we are the leader)
            # - start the controller actor
            #
            data.latch = ThreadingFuture()
            data.controller = self.model.start(data.zk, self.hints, self.scope,
                                               self.tag, self.port, data.latch)
            return 'lock', data, 0

        except LockTimeout:
            pass

        #
        # - we could not obtain the lock
        # - blindly loop back and attempt to get it again
        #
        return 'spin', data, 0
Example #8
    def lock(self, data):

        #
        # - if the termination trigger is set, abort immediately
        #
        if self.force_reset or self.terminate:
            raise Aborted('resetting')

        #
        # - spin-lock on the controller latch
        # - any catastrophic plug failure will be trapped that way
        #
        try:
            Event()
            out = data.latch.get(SAMPLING)
            if isinstance(out, Exception):
                raise out

        except Timeout:
            pass

        return 'lock', data, 0
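
The lock() state above spin-waits on the controller latch and re-raises anything the controller reports. Here is a small sketch of that polling loop, assuming a pykka-style ThreadingFuture (set()/get(timeout) plus a Timeout exception when nothing arrives in time); the 1-second timeout and the simulated controller thread are illustrative.

import time
from threading import Thread

from pykka import ThreadingFuture, Timeout

latch = ThreadingFuture()

def controller():
    # - simulate a controller that blows up after a short while
    time.sleep(2.0)
    latch.set(RuntimeError('controller blew up'))

Thread(target=controller, daemon=True).start()

while True:
    try:
        out = latch.get(timeout=1.0)
    except Timeout:
        # - nothing reported yet, keep spinning (this mirrors the 'lock' state)
        continue
    if isinstance(out, Exception):
        print('controller failed -> %s' % out)
    break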
Example #9
    def start_controller(self, data):

        #
        # - if the termination trigger is set, abort immediately
        # - this is important as it is possible to somehow get the lock after a suspend (acquire() returns
        #   true in that case which is misleading)
        #
        if self.force_reset or self.terminate:
            raise Aborted('resetting')

        #
        # - we have the lock (e.g we are the leader)
        # - start the controller actor
        #
        data.latch = ThreadingFuture()
        logger.debug('%s : lock acquired @ %s, now leading' %
                     (self.path, self.prefix))
        data.controller = self.model.start(data.zk, self.id, self.hints,
                                           self.scope, self.tag, self.port,
                                           data.latch)

        return 'lock', data, 0
Example #10
    def on(self, data):

        if data.sub and data.js and (self.strict or data.js['dependencies'] !=
                                     self.last['dependencies']):

            #
            # - if we already have a process, we want to re-configure -> force a reset first
            # - this will go through a graceful termination process
            # - we'll come back here afterwards (with data.sub set to None)
            #
            raise Aborted('resetting to terminate pid %s first' % data.sub.pid)

        elif data.sub:

            #
            # - the process is already running, fail gracefully on a 200
            # - this is the code-path used, for instance, upon a leader request when strict is false
            #
            reply = {}, 200
            logger.debug('%s : skipping /control/on request' % self.path)
            data.latch.set(reply)

        else:

            #
            # - no more process running, go on with the configuration
            #
            try:

                if not self.initialized:

                    #
                    # - if this is the 1st time the pod is running invoke the initialize() callback
                    # - this is typically used to run once-only stuff such as attaching storage volumes, etc.
                    #
                    logger.info('%s : initializing pod' % self.path)
                    self.initialize()
                    self.initialized = 1

                if data.js:

                    #
                    # - run the configuration procedure if we have some json
                    # - we'll use whatever it returns to popen() a new process
                    # - keep track of the shell command line returned by configure() for later
                    # - make sure the optional overrides set by configure() are strings
                    #
                    cluster = _Cluster(data.js)
                    logger.info('%s : configuring pod %d/%d' %
                                (self.path, 1 + cluster.index, cluster.size))
                    data.command, overrides = self.configure(cluster)
                    data.env = {
                        key: str(value)
                        for key, value in overrides.items()
                    }
                    self.last = data.js

                assert data.command, 'request to start process while not yet configured (user error ?)'

                #
                # - spawn a new sub-process if the auto-start flag is on OR if we already ran at least once
                # - the start flag comes from the $ochopod_start environment variable
                #
                if not data.js or self.start or data.pids > 0:

                    #
                    # - combine our environment variables with the overrides from configure()
                    # - popen() the new process and log stdout/stderr in a separate thread if required
                    # - make sure to set close_fds in order to avoid sharing the flask socket with the subprocess
                    # - reset the sanity check counter
                    # - keep track of its pid to kill it later on
                    #
                    env = deepcopy(self.env)
                    env.update(data.env)
                    tokens = data.command if self.shell else data.command.split(
                        ' ')

                    if self.pipe_subprocess:

                        #
                        # - set the popen call to use piping if required
                        # - spawn an ancillary thread to forward the lines to our logger
                        # - this thread will go down automatically when the sub-process does
                        #
                        data.sub = Popen(tokens,
                                         close_fds=True,
                                         cwd=self.cwd,
                                         env=env,
                                         shell=self.shell,
                                         stderr=STDOUT,
                                         stdout=PIPE)

                        def _pipe(process):

                            while True:

                                line = process.stdout.readline().rstrip('\n')
                                code = process.poll()
                                if line == '' and code is not None:
                                    break

                                logger.info('pid %s : %s' %
                                            (process.pid, line))

                        out = Thread(target=_pipe, args=(data.sub, ))
                        out.daemon = True
                        out.start()

                    else:

                        #
                        # - default popen call without piping
                        #
                        data.sub = Popen(tokens,
                                         close_fds=True,
                                         cwd=self.cwd,
                                         env=env,
                                         shell=self.shell)

                    data.pids += 1
                    self.hints['process'] = 'running'
                    logger.info(
                        '%s : popen() #%d -> started <%s> as pid %s' %
                        (self.path, data.pids, data.command, data.sub.pid))
                    if data.env:
                        unrolled = '\n'.join([
                            '\t%s -> %s' % (k, v) for k, v in data.env.items()
                        ])
                        logger.debug(
                            '%s : extra environment for pid %s ->\n%s' %
                            (self.path, data.sub.pid, unrolled))

                reply = {}, 200
                data.latch.set(reply)

            except Exception as failure:

                #
                # - any failure trapped during the configuration -> HTTP 406
                # - the pod will shutdown automatically as well
                #
                reply = {}, 406
                logger.warning(
                    '%s : failed to configure -> %s, shutting down' %
                    (self.path, diagnostic(failure)))
                self._request(['kill'])
                data.latch.set(reply)

        self.commands.popleft()
        return 'spin', data, 0
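
The piping branch of example #10 forwards the child's combined stdout/stderr to the logger from a daemon thread that winds down with the sub-process. Below is a runnable Python 3 sketch of just that mechanism; the command being run is an arbitrary stand-in.

import logging
from subprocess import Popen, PIPE, STDOUT
from threading import Thread

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('pod')

sub = Popen(['python3', '-c', 'print("hello"); print("world")'],
            close_fds=True,
            stderr=STDOUT,
            stdout=PIPE,
            universal_newlines=True)

def _pipe(process):
    # - readline() returns '' only once the stream is closed; combined with
    #   poll() this lets the thread exit together with the sub-process
    while True:
        line = process.stdout.readline().rstrip('\n')
        if line == '' and process.poll() is not None:
            break
        if line:
            logger.info('pid %s : %s', process.pid, line)

out = Thread(target=_pipe, args=(sub,))
out.daemon = True
out.start()
sub.wait()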
Example #11
    def spin(self, data):

        if self.terminate:

            if not data.sub:

                #
                # - kill the actor (which will release the latch and unlock the main loop)
                #
                self.exitcode()

            else:

                #
                # - this will force a reset and make sure we kill the process
                # - we'll loop back to spin() in any case and exitcode() this time
                #
                raise Aborted('terminating')

        elif self.commands:

            #
            # - we have at least one request pending
            # - pop the next command and run it (e.g switch the state-machine to it)
            #
            req, js, latch = self.commands[0]
            data.js = js
            data.latch = latch
            return req, data, 0

        elif data.sub:

            #
            # - check if the process is still running
            #
            now = time.time()
            if data.sub.poll() is None:

                if now >= data.next_sanity_check:

                    #
                    # - schedule the next sanity check
                    # - assert if the process aborted since the last one
                    #
                    data.next_sanity_check = now + self.check_every

                    try:
                        assert not data.failed, \
                            '%s : too many process failures (%d since last check)' % (self.path, data.failed)

                        js = self.sanity_check(data.sub.pid)
                        self.hints['metrics'] = {} if js is None else js
                        data.checks = self.checks
                        data.failed = 0

                    except Exception as failure:

                        #
                        # - any failure trapped during the sanity check will decrement our counter
                        # - eventually the process is stopped (up to the user to decide what to do)
                        #
                        data.checks -= 1
                        data.failed = 0
                        logger.warning(
                            '%s : sanity check (%d/%d) failed -> %s' %
                            (self.path, self.checks - data.checks, self.checks,
                             diagnostic(failure)))

                        if not data.checks:
                            logger.warning('%s : turning pod off' % self.path)
                            data.checks = self.checks
                            self._request(['off'])

            else:

                code = data.sub.returncode
                if not code:

                    #
                    # - a successful exit code (0) will automatically force a shutdown
                    # - this is a convenient way for pods to go down automatically once their task is done
                    #
                    logger.error('%s : pid %s exited, shutting down' %
                                 (self.path, data.sub.pid))
                    self._request(['kill'])

                else:

                    #
                    # - the process died on a non zero exit code
                    # - increment the failure counter (too many failures in a row will fail the sanity check)
                    # - restart it gracefully
                    #
                    data.failed += 1
                    logger.error('%s : pid %s died (code %d), re-running' %
                                 (self.path, data.sub.pid, code))
                    self._request(['off', 'on'])

        else:

            #
            # - by default, reset the metrics if the sub-process is not running
            #
            self.hints['metrics'] = {}

        return 'spin', data, SAMPLING
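
The main loop in example #11 boils down to polling the sub-process: a clean exit (code 0) is treated as a request to shut the pod down while any other code counts as a failure and triggers a restart. A stripped-down sketch of that supervision logic, with an illustrative child command and sampling interval:

import time
from subprocess import Popen

SAMPLING = 1.0

sub = Popen(['python3', '-c', 'import time; time.sleep(2); raise SystemExit(1)'])

while True:
    if sub.poll() is None:
        # - still running, this is where the periodic sanity check would go
        time.sleep(SAMPLING)
        continue

    code = sub.returncode
    if not code:
        print('pid %s exited cleanly, shutting down' % sub.pid)
        break

    # - non zero exit code, re-run (the real code requests 'off' then 'on');
    #   this sketch simply restarts an illustrative child that exits cleanly
    print('pid %s died (code %d), re-running' % (sub.pid, code))
    sub = Popen(['python3', '-c', 'raise SystemExit(0)'])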
Example #12
    def on(self, data):

        if data.forked and data.js and (self.strict or data.js['dependencies']
                                        != self.last['dependencies']):

            #
            # - if we already have a process, we want to re-configure -> force a reset first
            # - this will go through a graceful termination process
            # - we'll come back here afterwards (with data.forked set to None)
            #
            raise Aborted('resetting to terminate pid %s first' %
                          data.forked.pid)

        elif data.forked:

            #
            # - the process is already running, fail gracefully on a 200
            # - this is the code-path used, for instance, upon a leader request when strict is false
            #
            logger.debug('%s : skipping /control/on request' % self.path)
            data.latch.set(200)

        else:

            #
            # - no more process running, go on with the configuration
            #
            try:

                if not self.initialized:

                    #
                    # - if this is the 1st time the pod is running invoke the initialize() callback
                    # - this is typically used to run once-only stuff such as attaching storage volumes, etc.
                    #
                    logger.info('%s : initializing pod' % self.path)
                    self.initialize()
                    self.initialized = 1

                if data.js:

                    #
                    # - run the configuration procedure if we have some json
                    # - we'll use whatever it returns to popen() a new process
                    # - keep track of the shell command line returned by configure() for later
                    #
                    cluster = _Cluster(data.js)
                    logger.info('%s : configuring pod %d/%d' %
                                (self.path, 1 + cluster.index, cluster.size))
                    data.command, data.env = self.configure(cluster)
                    self.last = data.js

                assert data.command, 'request to start process while not yet configured (user error ?)'

                #
                # - combine our environment variables with the overrides from configure()
                # - popen() the new process
                # - reset the sanity check counter
                # - keep track of its pid to kill it later on
                #
                now = time.time()
                env = deepcopy(self.env)
                env.update(data.env)
                tokens = data.command if self.shell else data.command.split(
                    ' ')
                data.forked = Popen(tokens,
                                    cwd=self.cwd,
                                    env=env,
                                    shell=self.shell)
                data.checks = self.checks
                self.hints['process'] = 'running'
                logger.info('%s : started <%s> as pid %s' %
                            (self.path, data.command, data.forked.pid))
                if data.env:
                    unrolled = '\n'.join(
                        ['\t%s -> %s' % (k, v) for k, v in data.env.items()])
                    logger.debug('%s : extra environment for pid %s ->\n%s' %
                                 (self.path, data.forked.pid, unrolled))

                data.next_sanity_check = now + SANITY
                data.latch.set(200)

            except Exception as failure:

                #
                # - any failure trapped during the configuration -> HTTP 406
                # - the pod will shutdown automatically as well
                #
                logger.warning(
                    '%s : failed to configure -> %s, shutting down' %
                    (self.path, diagnostic(failure)))
                self._request(['kill'])
                data.latch.set(406)

        self.commands.popleft()
        return 'spin', data, 0
Example #13
    def spin(self, data):

        if self.terminate:
            if not data.forked:

                #
                # - kill the actor (which will release the latch and unlock the main loop)
                #
                self.exitcode()

            else:

                #
                # - this will force a reset and make sure we kill the process
                # - we'll loop back to spin() in any case and exitcode() this time
                #
                raise Aborted('terminating')

        if self.commands:

            #
            # - we have at least one request pending
            # - pop the next command and run it (e.g switch the state-machine to it)
            #
            req, js, latch = self.commands[0]
            data.js = js
            data.latch = latch
            return req, data, 0

        if data.forked:

            #
            # - no request to run
            # - check if the process is still running and run the user-defined sanity check once in a while
            #
            now = time.time()
            if data.forked.poll() is not None:
                code = data.forked.returncode
                if not code:

                    #
                    # - a successful exit code (0) will automatically force a shutdown
                    # - this is a convenient way for pods to go down automatically once their task is done
                    #
                    logger.error('%s : pid %s exited, shutting down' %
                                 (self.path, data.forked.pid))
                    self._request(['kill'])

                else:

                    #
                    # - the process died on a non zero exit code
                    # - restart it gracefully
                    #
                    logger.info('%s : pid %s died (code %d), re-running' %
                                (self.path, data.forked.pid, code))
                    self._request(['off', 'on'])

            elif now >= data.next_sanity_check:
                try:

                    #
                    # - run the sanity check and schedule the next one
                    # - reset it each time
                    #
                    data.next_sanity_check = now + SANITY
                    self.sanity_check(data.forked.pid)
                    data.checks = self.checks

                except Exception as failure:

                    #
                    # - any failure trapped during the sanity check will decrement our counter
                    # - eventually the process is stopped (up to the user to decide what to do)
                    #
                    data.checks -= 1
                    if not data.checks:
                        self._request(['off'])

                    logger.warning('%s : sanity check (%d/%d) failed -> %s' %
                                   (self.path, self.checks - data.checks,
                                    self.checks, diagnostic(failure)))

        return 'spin', data, SAMPLING
Example #14
    def spin(self, data):

        #
        # - if the termination trigger is set or if we lost our connection, abort immediately
        # - this will free the lock and another controller will take the lead
        #
        if self.terminate:
            raise Aborted('terminating')

        now = time.time()
        if self.updated:

            #
            # - the update trigger is on
            # - unset it and query the last recorded hash
            # - any difference with what we have means we need to schedule a configuration
            #
            self.updated = 0
            last, stats = self.zk.get('%s/%s.%s/hash' %
                                      (ROOT, self.scope, self.tag))
            latest = self._md5()
            bad = latest != last
            if bad and not data.dirty:

                #
                # - the hash changed, switch the dirty trigger on
                # - this will start the countdown to configuration (which can be aborted if we fall back
                #   on the same hash again, typically after a transient zookeeper connection loss)
                #
                logger.info(
                    '%s : hash changed, configuration in %2.1f seconds' %
                    (self.path, self.damper))
                logger.debug('%s : hash -> %s' % (self.path, latest))
                data.next = now + self.damper
                data.dirty = 1

            elif not bad:

                #
                # - this case would typically map to a pod losing cnx to zk and joining again later
                # - based on how much damper we allow we can bridge transient idempotent changes
                # - very important -> make sure we set the snapshot (which could have been reset to {})
                #
                data.dirty = 0
                pods = self.snapshots['local']
                self.zk.set('%s/%s.%s/snapshot' % (ROOT, self.scope, self.tag),
                            json.dumps(pods))
                logger.debug(
                    '%s : pod update with no hash impact (did we just reconnect to zk ?)'
                    % self.path)

        if not data.dirty:

            #
            # - all cool, the cluster is configured
            # - set the state as 'leader'
            #
            self.hints['state'] = 'leader'

        else:

            #
            # - trigger the configuration procedure
            #
            self.hints['state'] = 'leader (configuration pending)'
            remaining = max(0, data.next - now)
            if not remaining:
                return 'config', data, 0

            #
            # - print some cool countdown
            #
            else:
                logger.debug('%s : configuration in %2.1f seconds' %
                             (self.path, remaining))

        return 'spin', data, SAMPLING
Example #15
    def spin(self, data):

        #
        # - if the termination trigger is set or if we lost our connection, abort immediately
        # - this will free the lock and another controller will take the lead
        #
        if self.terminate:
            raise Aborted('terminating')
        
        #
        # - if it is time to run the probe callback do it now
        # - schedule the next one
        #
        now = time.time()
        if self.updated:

            #
            # - the update trigger is on
            # - unset it and query the last recorded hash
            # - any difference with what we have means we need to schedule a configuration
            #
            self.updated = 0
            last, stats = self.zk.get('%s/%s.%s/hash' % (ROOT, self.scope, self.tag))
            latest = self._md5()
            bad = latest != last
            if bad and not data.dirty:

                #
                # - the hash changed, switch the dirty trigger on
                # - this will start the countdown to configuration (which can be aborted if we fall back
                #   on the same hash again, typically after a transient zookeeper connection loss)
                #
                logger.info('%s : hash changed, configuration in %2.1f seconds' % (self.path, self.damper))
                logger.debug('%s : hash -> %s' % (self.path, latest))
                data.next = now + self.damper
                data.dirty = 1

            elif not bad:

                #
                # - this case would typically map to a pod losing cnx to zk and joining again later
                # - based on how much damper we allow we can bridge transient idempotent changes
                # - very important -> make sure we set the snapshot (which could have been reset to {})
                # - don't also forget to set data.last to enable probing
                #
                data.dirty = 0
                pods = self.snapshots['local']
                js = \
                    {
                        'pods': pods,
                        'dependencies': {k: v for k, v in self.snapshots.items() if k != 'local'}
                    }

                data.last = js
                data.last['key'] = str(self.id)
                self.zk.set('%s/%s.%s/snapshot' % (ROOT, self.scope, self.tag), json.dumps(pods))
                logger.debug('%s : pod update with no hash impact (did we just reconnect to zk ?)' % self.path)

        if not data.dirty:

            #
            # - all cool, the cluster is configured
            # - set the state as 'leader'
            # - fire a probe() if it is time to do so
            #
            self.hints['state'] = 'leader'
            if data.last and now > data.next_probe:
                try:

                    #
                    # - pass the latest cluster data to the probe() call
                    # - if successful (e.g did not assert) set the status to whatever the callable returned
                    # - unset if nothing was returned
                    #
                    snippet = self.probe(_Cluster(data.last))
                    self.hints['status'] = str(snippet) if snippet else ''

                except AssertionError as failure:

                    #
                    # - set the status to the assert message
                    #
                    self.hints['status'] = '* %s' % failure

                except Exception as failure:

                    #
                    # - something blew up in probe(), set the status accordingly
                    #
                    self.hints['status'] = '* probe() failed (check the code)'
                    logger.warning('%s : probe() failed -> %s' % (self.path, diagnostic(failure)))

                data.next_probe = now + self.probe_every
                if self.hints['status']:
                    logger.debug('%s : probe() -> "%s"' % (self.path, self.hints['status']))

        else:

            #
            # - trigger the configuration procedure
            #
            self.hints['state'] = 'leader (configuration pending)'
            remaining = max(0, data.next - now)
            self.hints['status'] = '* configuration in %2.1f seconds' % remaining
            if not remaining:
                return 'config', data, 0

            #
            # - print some cool countdown
            #
            else:
                logger.debug('%s : configuration in %2.1f seconds' % (self.path, remaining))

        return 'spin', data, SAMPLING
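
Examples #14 and #15 gate reconfiguration on an MD5 digest of the cluster snapshot plus a damper delay, which absorbs transient flapping (e.g. a brief ZooKeeper reconnect). The _md5() helper below is an assumption about how such a digest could be computed, not the project's actual implementation; the snapshots and the 10-second damper are illustrative.

import hashlib
import json
import time

DAMPER = 10.0

def _md5(snapshot):
    # - hash a canonical JSON dump so key ordering does not flip the digest
    #   (hedged stand-in for the controller's _md5())
    return hashlib.md5(json.dumps(snapshot, sort_keys=True).encode('utf-8')).hexdigest()

last = _md5({'pods': {'web.0': '10.0.0.1'}})
latest = _md5({'pods': {'web.0': '10.0.0.1', 'web.1': '10.0.0.2'}})

now = time.time()
if latest != last:
    # - hash changed, arm the countdown to a configuration pass
    next_config = now + DAMPER
    print('hash changed, configuration in %2.1f seconds' % (next_config - now))
else:
    # - same hash again (e.g. a transient zk reconnect), nothing to do
    print('no hash impact')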