def _node_initialize(tm_env, runtime, zkclient, hostname,
                     zk_server_path, zk_presence_path):
    """Node initialization. Should only be done on a cold start.

    Steps performed, in order:

    1. Collect the local node information and attach the traits detected
       from the trait definitions stored in Zookeeper.
    2. Merge that information on top of the data already present at
       ``zk_server_path`` and write the result back.
    3. Create the ephemeral presence node at ``zk_presence_path``.
    4. On POSIX hosts, initialize iptables with the node external IP.

    Any exception raised along the way is logged and the Zookeeper client
    is stopped; nothing is re-raised to the caller.
    """
    try:
        local_info = sysinfo.node_info(tm_env, runtime)

        trait_defs = zkutils.get(zkclient, z.path.traits())
        local_info['traits'] = traits.detect(trait_defs)

        # Merging scheduler data with node_info data
        merged_info = zkutils.get(zkclient, zk_server_path)
        merged_info.update(local_info)
        _LOGGER.info('Registering node: %s: %s, %r',
                     zk_server_path, hostname, merged_info)
        zkutils.update(zkclient, zk_server_path, merged_info)

        presence_acl = zkutils.make_host_acl(hostname, 'rwcda')
        _LOGGER.debug('host_acl: %r', presence_acl)
        zkutils.put(
            zkclient,
            zk_presence_path,
            {'seen': False},
            acl=[presence_acl],
            ephemeral=True
        )

        # TODO: Fix the network initialization. Then the below can be part of
        # appenv.initialize()
        if os.name == 'posix':
            # Flush all rules in iptables nat and mangle tables (it is assumed
            # that none but Treadmill manages these tables) and bulk load all
            # the Treadmill static rules
            external_ip = merged_info['network']['external_ip']
            iptables.initialize(external_ip)

    except Exception:  # pylint: disable=W0703
        _LOGGER.exception('Node initialization failed')
        zkclient.stop()
def _node_initialize(tm_env, runtime, zkclient, hostname,
                     zk_server_path, zk_presence_path):
    """Node initialization. Should only be done on a cold start.

    Merges the locally collected node information on top of the data
    already stored at ``zk_server_path``, writes it back, creates the
    ephemeral presence node at ``zk_presence_path`` and finally hands the
    merged data to ``tm_env.initialize``.

    Any exception raised along the way is logged and the Zookeeper client
    is stopped; nothing is re-raised to the caller.
    """
    try:
        local_info = sysinfo.node_info(tm_env, runtime)

        # Merging scheduler data with node_info data
        merged_info = zkutils.get(zkclient, zk_server_path)
        merged_info.update(local_info)
        _LOGGER.info('Registering node: %s: %s, %r',
                     zk_server_path, hostname, merged_info)
        zkutils.update(zkclient, zk_server_path, merged_info)

        presence_acl = zkutils.make_host_acl(hostname, 'rwcda')
        _LOGGER.debug('host_acl: %r', presence_acl)
        zkutils.put(
            zkclient,
            zk_presence_path,
            {'seen': False},
            acl=[presence_acl],
            ephemeral=True
        )

        # Invoke the local node initialization
        tm_env.initialize(merged_info)

    except Exception:  # pylint: disable=W0703
        _LOGGER.exception('Node initialization failed')
        zkclient.stop()
def create_server(zkclient, server_id, parent_id):
    """Creates server definition in Zookeeper.

    Ensures the server node exists with a host ACL ('rwcd') for
    ``server_id``. When ``parent_id`` is truthy it is recorded under the
    ``'parent'`` key of the node data. A 'servers' event is created only
    when the final ``zkutils.put`` returns a truthy value.
    """
    node_path = z.path.server(server_id)
    acl = zkutils.make_host_acl(server_id, 'rwcd')

    zkutils.ensure_exists(zkclient, node_path, acl=[acl])

    # zkutils.get return dict/tuple if need_metadata is true.
    #
    # pylint: disable=R0204
    data = zkutils.get(zkclient, node_path)
    if parent_id:
        if data:
            data['parent'] = parent_id
        else:
            # Nothing usable stored yet: start from a fresh mapping.
            data = {'parent': parent_id}

    _LOGGER.info('Creating server node %s with data %r and ACL %r',
                 node_path, data, acl)

    written = zkutils.put(zkclient, node_path, data,
                          acl=[acl], check_content=True)
    if written:
        create_event(zkclient, 0, 'servers', [server_id])
def __init__(self, endpoints_dir, zkclient, instance):
    """Store the collaborators and set up the initial bookkeeping.

    The local hostname is looked up once and used to build the node ACL
    with 'rwcd' permissions.
    """
    local_host = sysinfo.hostname()

    # Values handed in by the caller.
    self.endpoints_dir = endpoints_dir
    self.zkclient = zkclient
    self.instance = instance

    # Identity of this host and the ACL derived from it.
    self.hostname = local_host
    self.node_acl = zkutils.make_host_acl(local_host, 'rwcd')

    # Initial bookkeeping: flagged up to date, with an empty state set.
    self.up_to_date = True
    self.state = set()
def __init__(self, endpoints_dir, zkclient, scan_interval, instance=None):
    """Store the collaborators and set up the initial bookkeeping.

    The local hostname is looked up once and used to build the node ACL
    with 'rwcd' permissions. ``instance`` is optional and defaults to
    ``None``.
    """
    local_host = sysinfo.hostname()

    # Values handed in by the caller.
    self.endpoints_dir = endpoints_dir
    self.zkclient = zkclient
    self.scan_interval = scan_interval
    self.instance = instance

    # Identity of this host and the ACL derived from it.
    self.hostname = local_host
    self.node_acl = zkutils.make_host_acl(local_host, 'rwcd')

    # State starts empty; missing keys materialize as empty dicts.
    self.state = collections.defaultdict(dict)
def _blackout_server(zkclient, server, reason):
    """Blackout server.

    Ensures the blackout node for ``server`` exists, carrying
    ``str(reason)`` as data and a host ACL ('rwcda'), then kills the
    server presence node.

    Raises ``click.UsageError`` when ``reason`` is empty or missing.
    """
    if not reason:
        raise click.UsageError('--reason is required.')

    blackout_path = z.path.blackedout_server(server)
    server_acl = zkutils.make_host_acl(server, 'rwcda')
    zkutils.ensure_exists(
        zkclient,
        blackout_path,
        acl=[server_acl],
        data=str(reason)
    )

    presence.kill_node(zkclient, server)
def register_server(zkclient, hostname, node_info):
    """Register server.

    Merges ``node_info`` on top of the data already stored in the server
    node, writes the merged data back, and then creates an ephemeral,
    sequential presence node (path prefix ``hostname + '#'``) protected by
    a host ACL ('rwcda').

    Returns whatever ``zkutils.put`` returns for the presence node.
    """
    node_path = z.path.server(hostname)

    merged = zkutils.get(zkclient, node_path)
    merged.update(node_info)

    _LOGGER.info('Registering server %s: %r', hostname, merged)
    zkutils.update(zkclient, node_path, merged)

    acl = zkutils.make_host_acl(hostname, 'rwcda')
    presence_prefix = z.path.server_presence(hostname + '#')
    return zkutils.put(
        zkclient,
        presence_prefix,
        {'seen': False},
        acl=[acl],
        ephemeral=True,
        sequence=True
    )
def _node_initialize(tm_env, zkclient, hostname,
                     zk_server_path, zk_presence_path):
    """Node initialization. Should only be done on a cold start.

    Initializes the local environment first, then merges the locally
    collected node information on top of the data stored at
    ``zk_server_path``, writes it back and creates the ephemeral presence
    node at ``zk_presence_path``. Exceptions are not handled here.
    """
    tm_env.initialize()

    local_info = sysinfo.node_info(tm_env)

    # XXX: Why a get/update dance instead of set
    merged_info = zkutils.get(zkclient, zk_server_path)
    merged_info.update(local_info)
    _LOGGER.info('Registering node: %s: %s, %r',
                 zk_server_path, hostname, merged_info)
    zkutils.update(zkclient, zk_server_path, merged_info)

    presence_acl = zkutils.make_host_acl(hostname, 'rwcda')
    _LOGGER.debug('host_acl: %r', presence_acl)
    zkutils.put(
        zkclient,
        zk_presence_path,
        {'seen': False},
        acl=[presence_acl],
        ephemeral=True
    )
def create_server(zkclient, server_id, parent_id, partition):
    """Creates server definition in Zookeeper.

    Ensures the server node exists with a host ACL ('rwcd') for
    ``server_id``, then records ``parent_id`` and ``partition`` under the
    ``'parent'`` and ``'partition'`` keys of the node data. A 'servers'
    event is created only when the final ``zkutils.put`` returns a truthy
    value.
    """
    server_node = z.path.server(server_id)
    server_acl = zkutils.make_host_acl(server_id, 'rwcd')

    zkutils.ensure_exists(zkclient, server_node, acl=[server_acl])

    # The node may have just been created by ensure_exists() without any
    # payload, in which case there is nothing to merge into. Fall back to
    # an empty mapping instead of failing on ``.update`` of an empty value.
    data = zkutils.get(zkclient, server_node) or {}
    data.update({
        'parent': parent_id,
        'partition': partition,
    })

    _LOGGER.info('Creating server node %s with data %r and ACL %r',
                 server_node, data, server_acl)
    if zkutils.put(zkclient, server_node, data,
                   acl=[server_acl], check_content=True):
        create_event(zkclient, 0, 'servers', [server_id])
def top(exit_on_fail, zkid, approot):
    """Run treadmill init process.

    :param exit_on_fail:
        When truthy, the process exits with ``-1`` (via ``utils.sys_exit``)
        once the shutdown phase is complete; otherwise it sleeps forever.
    :param zkid:
        Passed as ``idpath`` to ``zkutils.connect``.
    :param approot:
        Root used to build the ``appenv.AppEnvironment``.

    The function blocks until both the server presence namespace and this
    host's server node exist in Zookeeper. If the host is not blacked out
    it starts the node and runs the main loop; a non-``None`` result of
    the main loop is recorded as the blackout reason. The shutdown phase
    (presence removal, Zookeeper disconnect, network cleanup, application
    termination) runs in every case.
    """
    _LOGGER.info('Initializing Treadmill: %s', approot)

    tm_env = appenv.AppEnvironment(approot)
    zkclient = zkutils.connect(context.GLOBAL.zk.url,
                               idpath=zkid,
                               listener=_exit_clear_watchdog_on_lost)

    utils.report_ready()

    # Poll until the presence namespace shows up.
    while not zkclient.exists(z.SERVER_PRESENCE):
        _LOGGER.warning('namespace not ready.')
        time.sleep(30)

    hostname = sysinfo.hostname()

    zk_blackout_path = z.path.blackedout_server(hostname)
    zk_presence_path = z.path.server_presence(hostname)
    zk_server_path = z.path.server(hostname)

    # Poll until this host has a server node.
    while not zkclient.exists(zk_server_path):
        _LOGGER.warning('server %s not defined in the cell.', hostname)
        time.sleep(30)

    _LOGGER.info('Checking blackout list.')
    blacklisted = bool(zkclient.exists(zk_blackout_path))

    if not blacklisted:
        # Node startup.
        _node_start(tm_env, zkclient, hostname,
                    zk_server_path, zk_presence_path)

        # Cleanup the watchdog directory
        tm_env.watchdogs.initialize()

        _init_network()

        _LOGGER.info('Ready.')

        down_reason = _main_loop(tm_env, zkclient, zk_presence_path)

        if down_reason is not None:
            _LOGGER.warning('Shutting down: %s', down_reason)

            # Blackout the server.
            zkutils.ensure_exists(
                zkclient,
                zk_blackout_path,
                acl=[zkutils.make_host_acl(hostname, 'rwcda')],
                data=down_reason
            )

    else:
        # Node was already blacked out.
        _LOGGER.warning('Shutting down blacked out node.')

    # This is the shutdown phase.

    # Delete the node
    zkutils.ensure_deleted(zkclient, zk_presence_path)
    # Drop the listener before stopping the client.
    zkclient.remove_listener(_exit_clear_watchdog_on_lost)
    zkclient.stop()
    zkclient.close()

    _cleanup_network()

    # Terminate all the running apps.
    _blackout_terminate(tm_env)

    if exit_on_fail:
        utils.sys_exit(-1)
    else:
        # Sit forever in a broken state
        while True:
            time.sleep(1000000)
def reboot_monitor(command):
    """Runs node reboot monitor.

    Waits for the reboots namespace to exist, then watches this host's
    reboot node. A reboot node created after the host came up triggers the
    reboot procedure; one created before is treated as a completed reboot
    and deleted.

    Once triggered, the function waits while the host is blacked out and
    then either executes ``command`` or, when the host uptime does not
    exceed ``_MIN_UPTIME_BEFORE_REBOOT``, blacks the node out instead.
    """
    argv = list(command)
    _LOGGER.info('Initializing reboot monitor: %r', argv)

    zkclient = context.GLOBAL.zk.conn
    zkclient.add_listener(zkutils.exit_on_lost)

    while not zkclient.exists(z.REBOOTS):
        _LOGGER.warning('%r node not created yet. Cell masters running?',
                        z.REBOOTS)
        time.sleep(30)

    hostname = sysinfo.hostname()
    up_since = sysinfo.up_since()
    _LOGGER.info('Server: %s, up since: %s', hostname, up_since)

    reboot_path = z.path.reboot(hostname)

    reboot_requested = zkclient.handler.event_object()
    reboot_requested.clear()

    @zkclient.DataWatch(reboot_path)
    @utils.exit_on_unhandled
    def _watch_reboot(data, stat, event):
        """Watch reboot node."""
        if data is None and event is None:
            _LOGGER.info('Reboot node does not exist, ignore.')
            return True

        if event is not None and event.type == 'DELETED':
            _LOGGER.info('Reboot Node deleted, ignore.')
            return True

        # We have a reboot request node
        if stat.created > up_since:
            _LOGGER.info('Reboot requested at: %s, up since: %s',
                         time.ctime(stat.created),
                         time.ctime(up_since))
            reboot_requested.set()
            return True

        # The request predates the current boot: the reboot already happened.
        _LOGGER.info('Reboot success, requested at %s, up since: %s',
                     time.ctime(stat.created),
                     time.ctime(up_since))
        _LOGGER.info('Deleting zknode: %r', reboot_path)
        zkutils.ensure_deleted(zkclient, reboot_path)
        return True

    # We now wait for the reboot trigger
    reboot_requested.wait()

    # Actual reboot procedure below

    _LOGGER.info('service shutdown.')
    # Strictly speaking this is not enough for graceful shutdown.
    #
    # We need a proper shutdown procedure developed.

    _LOGGER.info('Checking blackout list.')
    zk_blackout_path = z.path.blackedout_server(hostname)
    while zkclient.exists(zk_blackout_path):
        _LOGGER.info('Node blacked out - will wait.')
        time.sleep(60)

    uptime = time.time() - up_since
    if uptime > _MIN_UPTIME_BEFORE_REBOOT:
        _LOGGER.info('exec: %r', argv)
        utils.sane_execvp(argv[0], argv)
    else:
        _LOGGER.info('Possible reboot loop detected, blackout the node.')
        blackout_acl = zkutils.make_host_acl(hostname, 'rwcda')
        zkutils.ensure_exists(
            zkclient,
            zk_blackout_path,
            acl=[blackout_acl],
            data='Possible reboot loop detected.'
        )