def _watch_version(_data, _stat, event):
    """Force exit if the server node is deleted."""
    # If the node is deleted, we exit to pick up the new version code.
    if event is not None and event.type == 'DELETED':
        # The version info is not present; restart services and register
        # a new checksum.
        _LOGGER.info('Upgrade requested, running: %s', cli_cmd)
        if cli_cmd:
            try:
                subproc.check_call(cli_cmd)
                # Record successful upgrade.
            except subprocess.CalledProcessError:
                _LOGGER.exception('Upgrade failed.')
                # Immediately trigger a watchdog timeout.
                watchdogs.create(
                    name='version_monitor',
                    timeout='0s',
                    content='Upgrade to {code!r}({digest}) failed'.format(
                        code=codepath, digest=digest),
                ).heartbeat()
                del info['digest']
        _LOGGER.info('Upgrade complete.')
        utils.sys_exit(0)
    return True
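
# A minimal usage sketch, not from the original source: the three-argument
# callback signature matches kazoo's DataWatch recipe, so registration
# presumably looks something like the following. The node path, `zkclient`
# and `hostname` names are illustrative assumptions.
from kazoo.recipe.watchers import DataWatch

DataWatch(zkclient, '/version/' + hostname, _watch_version)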
def on_deleted(path):
    """Invoked when a network rule is deleted."""
    # Edge case: if the directory where the rules are kept gets removed,
    # abort.
    if path == rulemgr.path:
        _LOGGER.critical('Network rules directory was removed: %r', path)
        utils.sys_exit(1)

    # The rule is the filename.
    rule_file = os.path.basename(path)
    _LOGGER.info('Removing %r', rule_file)
    chain_rule = rulemgr.get_rule(rule_file)
    if chain_rule is not None:
        chain, rule = chain_rule
        iptables.delete_rule(rule, chain=chain)
        if isinstance(rule, fw.PassThroughRule):
            if passthrough[rule.src_ip] == 1:
                # Last rule for this IP: remove it from the passthrough set.
                passthrough.pop(rule.src_ip)
                _LOGGER.info('Removing passthrough %r', rule.src_ip)
                iptables.rm_ip_set(iptables.SET_PASSTHROUGHS, rule.src_ip)
                iptables.flush_pt_conntrack_table(rule.src_ip)
            else:
                passthrough[rule.src_ip] -= 1
    else:
        _LOGGER.warning('Ignoring unparseable file %r', rule_file)
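
# A minimal usage sketch, assuming a treadmill dirwatch-style watcher;
# this wiring is an assumption, not taken from the original source.
watcher = dirwatch.DirWatcher(rulemgr.path)
watcher.on_deleted = on_deleted

while True:
    if watcher.wait_for_events(timeout=60):
        watcher.process_events()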
def _wrap(*args, **kwargs):
    """Wraps function to exit on unhandled exception."""
    try:
        return func(*args, **kwargs)
    except Exception:  # pylint: disable=W0703
        _LOGGER.exception('Unhandled exception - exiting.')
        utils.sys_exit(-1)
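
# A minimal sketch of the enclosing decorator this inner function
# presumably belongs to (treadmill exposes a similar helper as
# utils.exit_on_unhandled); the exact name and wrapping are assumptions.
import functools

def exit_on_unhandled(func):
    """Decorator to exit the process on any unhandled exception."""
    @functools.wraps(func)
    def _wrap(*args, **kwargs):
        """Wraps function to exit on unhandled exception."""
        try:
            return func(*args, **kwargs)
        except Exception:  # pylint: disable=W0703
            _LOGGER.exception('Unhandled exception - exiting.')
            utils.sys_exit(-1)
    return _wrap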
def _stop_on_lost(tm_env, state):
    """Stop the node when the ZK connection is lost."""
    _LOGGER.debug('ZK connection state: %s', state)
    if state == zkutils.states.KazooState.LOST:
        _LOGGER.info('ZK connection lost, stopping node.')
        _LOGGER.info('Terminating svscan in %s', tm_env.init_dir)
        supervisor.control_svscan(tm_env.init_dir,
                                  supervisor.SvscanControlAction.quit)
        # server_init should be terminated at this point, but exit just
        # in case.
        utils.sys_exit(-1)
def run(self, name, image, entrypoint, cmd, **args):
    """Pull the image and run the container."""
    client = self._get_client()

    if 'volumes' in args:
        args['volumes'] = _transform_volumes(args['volumes'])

    if 'envdirs' in args:
        args['environment'] = _read_environ(args.pop('envdirs'))

    # Simulate docker pull logic: if no tag is provided, assume latest.
    if ':' not in image:
        image += ':latest'

    try:
        image_meta = _pull_image(client, image)
    except docker.errors.ImageNotFound:
        raise exc.ContainerSetupError(
            'Failed to pull {}, check image name or disk size'.format(image),
            app_abort.AbortedReason.IMAGE)

    container = _create_container(client, name, image, image_meta,
                                  entrypoint, cmd, **args)

    # TODO: start docker container event
    container.start()
    container.reload()
    logs_gen = container.logs(stdout=True, stderr=True,
                              stream=True, follow=True)

    _LOGGER.info('Container %s is running', name)
    while container.status == 'running':
        try:
            for log_lines in logs_gen:
                sys.stderr.write(log_lines)
        except socket.error:
            pass

        container.reload()

    # container.wait() returns a dict with the key 'StatusCode'.
    rc = container.wait()['StatusCode']

    if os.WIFSIGNALED(rc):
        # Process died with a signal in docker.
        sig = os.WTERMSIG(rc)
        os.kill(os.getpid(), sig)
    else:
        utils.sys_exit(os.WEXITSTATUS(rc))
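
# A hedged sketch, not the original implementation: a transform like
# `_transform_volumes` presumably maps (host, container, mode) entries
# into the dict form docker-py expects for its `volumes` argument.
def _transform_volumes(volumes):
    """Transform [(src, dst, mode), ...] into docker-py volume bindings."""
    return {
        src: {'bind': dst, 'mode': mode}
        for (src, dst, mode) in volumes
    }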
def run(self, name, image, entrypoint, cmd, **args):
    """Load the Docker image and run the container."""
    client = self._get_client()

    if 'volumes' in args:
        args['volumes'] = _transform_volumes(args['volumes'])

    if 'envdirs' in args:
        args['environment'] = _read_environ(args.pop('envdirs'))

    ulimit = _get_ulimits(args.pop('ulimit'))

    image_meta = _fetch_image(client, image)
    container = _create_container(client, name, image_meta,
                                  entrypoint, cmd, ulimit, **args)

    # TODO: start docker container event
    container.start()
    container.reload()
    logs_gen = container.logs(stdout=True, stderr=True,
                              stream=True, follow=True)

    _LOGGER.info('Container %s is running', name)
    while container.status == 'running':
        try:
            for log_lines in logs_gen:
                print(log_lines.decode(), file=sys.stderr, end='',
                      flush=True)
        except socket.error:
            pass

        container.reload()

    # container.wait() returns a dict with the key 'StatusCode'.
    rc = container.wait()['StatusCode']

    if os.WIFSIGNALED(rc):
        # Process died with a signal in docker.
        sig = os.WTERMSIG(rc)
        os.kill(os.getpid(), sig)
    else:
        utils.sys_exit(os.WEXITSTATUS(rc))
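
# A hedged sketch (assumption, not the original helper): docker-py models
# ulimits as docker.types.Ulimit objects, so `_get_ulimits` presumably
# builds them from 'name=soft:hard' style specs.
import docker.types

def _get_ulimits(specs):
    """Map ['nofile=1024:2048', ...] specs to docker-py Ulimit objects."""
    ulimits = []
    for spec in specs or []:
        name, _, limits = spec.partition('=')
        soft, _, hard = limits.partition(':')
        ulimits.append(docker.types.Ulimit(name=name,
                                           soft=int(soft),
                                           hard=int(hard or soft)))
    return ulimits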
def run(self, name, image, entrypoint, cmd, **args):
    """Run the container."""
    client = self._get_client()

    try:
        if 'volumes' in args:
            args['volumes'] = _transform_volumes(args['volumes'])

        if 'envdirs' in args:
            args['environment'] = _read_environ(args.pop('envdirs'))

        container = _create_container(client, name, image,
                                      entrypoint, cmd, **args)
    except docker.errors.ImageNotFound:
        raise exc.ContainerSetupError(
            'Image {0} was not found'.format(image),
            app_abort.AbortedReason.IMAGE)

    container.start()
    container.reload()
    logs_gen = container.logs(stdout=True, stderr=True,
                              stream=True, follow=True)

    _LOGGER.info('Container %s is running', name)
    while container.status == 'running':
        try:
            for log_lines in logs_gen:
                sys.stderr.write(log_lines)
        except socket.error:
            pass

        container.reload()

    # Older docker-py versions return the exit code from wait() directly;
    # docker-py >= 3.0 returns a dict with the key 'StatusCode' instead.
    rc = container.wait()

    if os.WIFSIGNALED(rc):
        # Process died with a signal in docker.
        sig = os.WTERMSIG(rc)
        os.kill(os.getpid(), sig)
    else:
        utils.sys_exit(os.WEXITSTATUS(rc))
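
# A hedged compatibility sketch (not from the original source): normalize
# the wait() result so the same code works across docker-py versions.
result = container.wait()
rc = result['StatusCode'] if isinstance(result, dict) else result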
def top(ctx, exit_on_fail, zkid, notification_fd, approot, runtime):
    """Run treadmill init process."""
    _LOGGER.info('Initializing Treadmill: %s (%s)', approot, runtime)

    tm_env = appenv.AppEnvironment(approot)
    stop_on_lost = functools.partial(_stop_on_lost, tm_env)
    zkclient = zkutils.connect(context.GLOBAL.zk.url,
                               idpath=zkid,
                               listener=stop_on_lost)

    while not zkclient.exists(z.SERVER_PRESENCE):
        _LOGGER.warning('namespace not ready.')
        time.sleep(30)

    hostname = sysinfo.hostname()

    zk_blackout_path = z.path.blackedout_server(hostname)
    zk_server_path = z.path.server(hostname)
    zk_presence_path = z.path.server_presence(hostname)

    while not zkclient.exists(zk_server_path):
        _LOGGER.warning('server %s not defined in the cell.', hostname)
        time.sleep(30)

    _LOGGER.info('Checking blackout list.')
    blacklisted = bool(zkclient.exists(zk_blackout_path))

    root_cgroup = ctx.obj['ROOT_CGROUP']
    os_args = {}
    if os.name == 'posix':
        os_args['cgroup_prefix'] = root_cgroup

    if not blacklisted:
        # Node startup.
        _node_start(tm_env, runtime, zkclient, hostname, zk_server_path,
                    zk_presence_path, os_args)

        utils.report_ready(notification_fd)

        _init_network()
        _start_init1(tm_env)
        _LOGGER.info('Ready.')

        down_reason = _main_loop(tm_env, zkclient, zk_presence_path)

        if down_reason is not None:
            _LOGGER.warning('Shutting down: %s', down_reason)
            # Blackout the server.
            zkutils.ensure_exists(
                zkclient,
                zk_blackout_path,
                acl=[zkclient.make_host_acl(hostname, 'rwcda')],
                data=down_reason)
            trigger_postmortem = True
        else:
            # Blacked out manually.
            trigger_postmortem = bool(zkclient.exists(zk_blackout_path))

        if trigger_postmortem:
            postmortem.run(approot, root_cgroup)
    else:
        # Node was already blacked out.
        _LOGGER.warning('Shutting down blacked out node.')

    # This is the shutdown phase.

    # Delete the node.
    if zk_presence_path:
        zkutils.ensure_deleted(zkclient, zk_presence_path)
    zkclient.remove_listener(stop_on_lost)
    zkclient.stop()
    zkclient.close()

    _cleanup_network()

    # Terminate all the running apps.
    _blackout_terminate(tm_env)

    if exit_on_fail:
        utils.sys_exit(-1)
    else:
        # Sit forever in a broken state.
        while True:
            time.sleep(1000000)
def exit_on_disconnect(state):
    """Watch for connection events and exit if disconnected."""
    _LOGGER.debug('ZK connection state: %s', state)
    if state != states.KazooState.CONNECTED:
        _LOGGER.info('Exiting on ZK connection lost.')
        utils.sys_exit(-1)
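
# A minimal usage sketch, assuming a standard kazoo client; the connect
# parameters below are illustrative, not from the original source.
from kazoo.client import KazooClient

zkclient = KazooClient(hosts='127.0.0.1:2181')
zkclient.add_listener(exit_on_disconnect)
zkclient.start()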
def top(exit_on_fail, zkid, approot):
    """Run treadmill init process."""
    _LOGGER.info('Initializing Treadmill: %s', approot)

    tm_env = appenv.AppEnvironment(approot)
    zkclient = zkutils.connect(context.GLOBAL.zk.url,
                               idpath=zkid,
                               listener=_exit_clear_watchdog_on_lost)

    utils.report_ready()

    while not zkclient.exists(z.SERVER_PRESENCE):
        _LOGGER.warning('namespace not ready.')
        time.sleep(30)

    hostname = sysinfo.hostname()

    zk_blackout_path = z.path.blackedout_server(hostname)
    zk_presence_path = z.path.server_presence(hostname)
    zk_server_path = z.path.server(hostname)

    while not zkclient.exists(zk_server_path):
        _LOGGER.warning('server %s not defined in the cell.', hostname)
        time.sleep(30)

    _LOGGER.info('Checking blackout list.')
    blacklisted = bool(zkclient.exists(zk_blackout_path))

    if not blacklisted:
        # Node startup.
        _node_start(tm_env, zkclient, hostname, zk_server_path,
                    zk_presence_path)

        # Cleanup the watchdog directory.
        tm_env.watchdogs.initialize()

        _init_network()

        _LOGGER.info('Ready.')

        down_reason = _main_loop(tm_env, zkclient, zk_presence_path)

        if down_reason is not None:
            _LOGGER.warning('Shutting down: %s', down_reason)
            # Blackout the server.
            zkutils.ensure_exists(
                zkclient,
                zk_blackout_path,
                acl=[zkutils.make_host_acl(hostname, 'rwcda')],
                data=down_reason)
    else:
        # Node was already blacked out.
        _LOGGER.warning('Shutting down blacked out node.')

    # This is the shutdown phase.

    # Delete the node.
    zkutils.ensure_deleted(zkclient, zk_presence_path)
    zkclient.remove_listener(_exit_clear_watchdog_on_lost)
    zkclient.stop()
    zkclient.close()

    _cleanup_network()

    # Terminate all the running apps.
    _blackout_terminate(tm_env)

    if exit_on_fail:
        utils.sys_exit(-1)
    else:
        # Sit forever in a broken state.
        while True:
            time.sleep(1000000)
def _exit_clear_watchdog_on_lost(state):
    """Exit when the ZK connection is lost."""
    _LOGGER.debug('ZK connection state: %s', state)
    if state == zkutils.states.KazooState.LOST:
        _LOGGER.info('Exiting on ZK connection lost.')
        utils.sys_exit(-1)