def load_schedule(self):
    """Run the scheduler for the first time and sync placement nodes.

    For every cell member, the children of its placement node are compared
    with the apps assigned to that server: children without a matching app
    are deleted, apps without a child get a placement node written and
    their task updated. Finally the placement returned by the scheduler is
    stored under the root placement path and ``up_to_date`` is set.
    """
    placement = self.cell.schedule()

    for servername, server in self.cell.members().items():
        server_placement = z.path.placement(servername)
        zkutils.ensure_exists(
            self.zkclient,
            server_placement,
            acl=[_SERVERS_ACL]
        )

        in_zk = set(self.zkclient.get_children(server_placement))
        assigned = set(server.apps.keys())

        # Nodes present in Zookeeper that are no longer assigned here.
        for app in in_zk - assigned:
            _LOGGER.info('Unscheduling: %s - %s', servername, app)
            stale_node = os.path.join(server_placement, app)
            zkutils.ensure_deleted(self.zkclient, stale_node)

        # Assigned apps that do not have a placement node yet.
        for app in assigned - in_zk:
            _LOGGER.info(
                'Scheduling: %s - %s,%s',
                servername, app, self.cell.apps[app].identity
            )
            app_placement = self._placement_data(app)
            zkutils.put(
                self.zkclient,
                os.path.join(server_placement, app),
                app_placement,
                acl=[_SERVERS_ACL]
            )
            self._update_task(app, servername, why=None)

    # Store latest placement as reference.
    zkutils.put(self.zkclient, z.path.placement(), placement)
    self.up_to_date = True
def create_endpoint_file(approot, port, appname, endpoint):
    """Register a tcp endpoint in Zookeeper and create its local spec file.

    :param approot: application root used to build the AppEnvironment.
    :param port: port published together with the local hostname.
    :param appname: application name the endpoint belongs to.
    :param endpoint: endpoint name.
    """
    hostport = '%s:%s' % (sysinfo.hostname(), port)
    zkclient = context.GLOBAL.zk.conn

    proid_path = z.path.endpoint_proid(appname)
    servers_acl = zkclient.make_servers_acl()
    _LOGGER.info('Ensuring %s exists with ACL %r', proid_path, servers_acl)
    zkutils.ensure_exists(zkclient, proid_path, acl=[servers_acl])

    endpoint_path = z.path.endpoint(appname, 'tcp', endpoint)
    _LOGGER.info('Registering %s %s', endpoint_path, hostport)

    # Need to delete/create endpoints for the discovery to pick it up in
    # case of master restart.
    zkutils.ensure_deleted(zkclient, endpoint_path)
    time.sleep(5)
    zkutils.put(zkclient, endpoint_path, hostport)

    # Replace any previous local spec for this endpoint with a fresh one.
    tm_env = appenv.AppEnvironment(approot)
    endpoints_mgr = endpoints.EndpointsMgr(tm_env.endpoints_dir)
    endpoints_mgr.unlink_all(
        appname=appname,
        endpoint=endpoint,
        proto='tcp'
    )
    endpoints_mgr.create_spec(
        appname=appname,
        endpoint=endpoint,
        proto='tcp',
        real_port=port,
        pid=os.getpid(),
        port=port,
        owner='/proc/{}'.format(os.getpid()),
    )
def create_server(zkclient, server_id, parent_id):
    """Create the server definition in Zookeeper.

    The server node is ensured to exist with a host ACL, its current data
    is read back and, when ``parent_id`` is truthy, the ``parent`` key is
    set. A 'servers' event is created only if ``zkutils.put`` returns a
    truthy value.
    """
    node = z.path.server(server_id)
    host_acl = zkutils.make_host_acl(server_id, 'rwcd')

    zkutils.ensure_exists(zkclient, node, acl=[host_acl])

    # zkutils.get return dict/tuple if need_metadata is true.
    #
    # pylint: disable=R0204
    data = zkutils.get(zkclient, node)
    if parent_id:
        if data:
            data['parent'] = parent_id
        else:
            data = {'parent': parent_id}

    _LOGGER.info('Creating server node %s with data %r and ACL %r',
                 node, data, host_acl)
    updated = zkutils.put(zkclient, node, data,
                          acl=[host_acl], check_content=True)
    if updated:
        create_event(zkclient, 0, 'servers', [server_id])
def _blackout_app(zkclient, app, clear):
    """Blackout app, or clear its blackout when ``clear`` is truthy."""
    node = z.path.blackedout_app(app)

    # Clearing removes the blackout node; otherwise make sure it exists.
    action = zkutils.ensure_deleted if clear else zkutils.ensure_exists
    action(zkclient, node)
def sync_servers():
    """Sync global servers list.

    Lists all servers through the admin API and stores their ``_id``
    values as the data of the global 'servers' node.
    """
    _LOGGER.info('Sync servers.')

    global_servers = context.GLOBAL.admin.server().list({})
    server_ids = [server['_id'] for server in global_servers]

    zkutils.ensure_exists(
        context.GLOBAL.zk.conn,
        z.path.globals('servers'),
        data=server_ids
    )
def sync_traits():
    """Sync cell traits.

    Reads the ``traits`` entry of the current cell through the admin API
    and stores it as the data of the traits node.
    """
    _LOGGER.info('Sync traits.')

    cell = context.GLOBAL.admin.cell().get(context.GLOBAL.cell)
    traits = cell['traits']

    zkutils.ensure_exists(
        context.GLOBAL.zk.conn,
        z.path.traits(),
        data=traits
    )
def _blackout_server(zkclient, server, reason):
    """Blackout server, recording ``reason`` in the blackout node.

    :raises click.UsageError: when ``reason`` is empty.
    """
    if not reason:
        raise click.UsageError('--reason is required.')

    blackout_node = z.path.blackedout_server(server)
    host_acl = zkutils.make_host_acl(server, 'rwcda')
    zkutils.ensure_exists(
        zkclient,
        blackout_node,
        acl=[host_acl],
        data=str(reason)
    )

    presence.kill_node(zkclient, server)
def test_ensure_exists(self):
    """Tests that ensure_exists creates the node with encoded content."""
    zkclient = kazoo.client.KazooClient()

    # Explicit data is handed to create() as bytes.
    zkutils.ensure_exists(zkclient, '/foo/bar', data='foo')
    kazoo.client.KazooClient.create.assert_called_with(
        '/foo/bar',
        b'foo',
        acl=mock.ANY,
        makepath=True,
        sequence=False
    )

    # Without data, create() receives an empty payload.
    zkutils.ensure_exists(zkclient, '/foo/bar')
    kazoo.client.KazooClient.create.assert_called_with(
        '/foo/bar',
        b'',
        acl=mock.ANY,
        makepath=True,
        sequence=False
    )
def test_ensure_exists_existing(self):
    """Test ensure_exists against a node that already exists."""

    def _already_exists(*_args, **_kwargs):
        """zk.create side effect, simulating an existing node."""
        raise kazoo.client.NodeExistsError()

    zkclient = treadmill.zkutils.ZkClient()
    treadmill.zkutils.ZkClient.create.side_effect = _already_exists

    # Without data only the ACLs are expected to be (re)applied.
    zkutils.ensure_exists(zkclient, '/foo/bar')
    treadmill.zkutils.ZkClient.set_acls.assert_called_with(
        '/foo/bar', mock.ANY
    )

    # With data the content is set as well as the ACLs.
    zkutils.ensure_exists(zkclient, '/foo/bar', data='foo')
    treadmill.zkutils.ZkClient.set.assert_called_with('/foo/bar', b'foo')
    treadmill.zkutils.ZkClient.set_acls.assert_called_with(
        '/foo/bar', mock.ANY
    )
def load_server(self, servername, readonly=False):
    """Load individual server.

    Reads the server node, builds a ``scheduler.Server`` from its data and
    attaches it to its parent bucket. Nothing is loaded when the node has
    no data, when the parent bucket is unknown, or when the node does not
    exist (``NoNodeError`` is logged and swallowed).

    :param servername: name of the server to load.
    :param readonly: when falsy, the server placement node is ensured to
        exist; the flag is also forwarded to ``adjust_server_state``.
    """
    try:
        server_node = z.path.server(servername)
        data = zkutils.get(self.zkclient, server_node)
        if not data:
            # The server is configured, but never reported its capacity.
            _LOGGER.info('No capacity detected: %s', server_node)
            return

        assert 'parent' in data
        parentname = data['parent']

        label = data.get('partition')
        if not label:
            # TODO: it will be better to have separate module for constants
            #       and avoid unnecessary cross imports.
            label = admin.DEFAULT_PARTITION

        up_since = data.get('up_since', int(time.time()))
        partition = self.cell.partitions[label]

        server = scheduler.Server(
            servername,
            resources(data),
            valid_until=partition.valid_until(up_since),
            label=label,
            traits=data.get('traits', 0)
        )

        parent = self.buckets.get(parentname)
        if not parent:
            # Logger.warn is a deprecated alias; use warning().
            _LOGGER.warning('Server parent does not exist: %s/%s',
                            servername, parentname)
            return

        self.buckets[parentname].add_node(server)
        self.servers[servername] = server
        assert server.parent == self.buckets[parentname]

        if not readonly:
            zkutils.ensure_exists(
                self.zkclient,
                z.path.placement(servername),
                acl=[_SERVERS_ACL]
            )

        self.adjust_server_state(servername, readonly)

    except kazoo.client.NoNodeError:
        _LOGGER.warning('Server node not found: %s', servername)
def accept(tkt_spool_dir, port, appname, endpoint, use_v2):
    """Run ticket locker acceptor.

    Registers ``hostname:port`` as a tcp endpoint in Zookeeper, stops the
    Zookeeper connection and execs into the ticket receiver.

    :param tkt_spool_dir: tickets spool directory passed to the receiver.
    :param port: port to listen on; ``0`` picks a free port from the OS.
    :param appname: application name the endpoint is registered under.
    :param endpoint: endpoint name.
    :param use_v2: exec ``tkt_recv_v2`` instead of ``tkt_recv`` when truthy.
    """
    if port == 0:
        # Bind a throwaway socket to learn a free port. The socket is
        # closed even if bind/getsockname fails so it does not leak.
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            sock.bind(('0.0.0.0', 0))
            port = sock.getsockname()[1]
        finally:
            sock.close()

    hostname = sysinfo.hostname()
    hostport = '%s:%s' % (hostname, port)

    endpoint_proid_path = z.path.endpoint_proid(appname)
    _LOGGER.info('Ensuring %s exists with ACL %r',
                 endpoint_proid_path, _SERVERS_ACL)
    zkutils.ensure_exists(context.GLOBAL.zk.conn, endpoint_proid_path,
                          acl=[_SERVERS_ACL])

    endpoint_path = z.path.endpoint(appname, 'tcp', endpoint)
    _LOGGER.info('Registering %s %s', endpoint_path, hostport)

    # Need to delete/create endpoints for the discovery to pick it up in
    # case of master restart.
    #
    # Unlike typical endpoint, we cannot make the node ephemeral as we
    # exec into tkt-recv.
    zkutils.ensure_deleted(context.GLOBAL.zk.conn, endpoint_path)
    time.sleep(5)
    zkutils.put(context.GLOBAL.zk.conn, endpoint_path, hostport)

    context.GLOBAL.zk.conn.stop()

    # Exec into tickets acceptor. If race condition will not allow it to
    # bind to the provided port, it will exit and registration will
    # happen again.
    if use_v2:
        subproc.safe_exec([
            'tkt_recv_v2',
            '-p{}'.format(port),
            '-d{}'.format(tkt_spool_dir)
        ])
    else:
        subproc.safe_exec(
            ['tkt_recv', 'tcp://*:{}'.format(port), tkt_spool_dir]
        )
def publish_tickets(self, realms, once=False):
    """Publish list of all tickets present on the locker.

    Publishes every file currently in the spool directory, prunes stale
    tickets, and - unless ``once`` is truthy - keeps watching the spool
    directory forever, publishing newly created files and pruning
    periodically.

    :param realms: suffixes a ticket file name must end with to be
        published.
    :param once: return after the initial publish/prune pass.
    """
    zkutils.ensure_exists(self.zkclient, z.TICKETS)
    watcher = dirwatch.DirWatcher(self.tkt_spool_dir)

    def _publish_ticket(tkt_file):
        """Publish ticket details."""
        if tkt_file.startswith('.'):
            return

        realm_matches = any(tkt_file.endswith(realm) for realm in realms)
        if not realm_matches:
            _LOGGER.info('Ignore tkt_file: %s', tkt_file)
            return

        try:
            klist_cmd = ['klist', '-5', '-e', '-f', tkt_file]
            tkt_details = subproc.check_output(klist_cmd)
            tkt_node = z.path.tickets(
                os.path.basename(tkt_file),
                self.hostname
            )
            zkutils.put(
                self.zkclient,
                tkt_node,
                tkt_details,
                ephemeral=True
            )
        except subproc.CalledProcessError:
            _LOGGER.warning('Unable to get tickets details.')

    # Initial pass over whatever is already in the spool directory.
    spool_pattern = os.path.join(self.tkt_spool_dir, '*')
    for existing in glob.glob(spool_pattern):
        _publish_ticket(existing)

    self.prune_tickets()
    last_prune = time.time()

    if once:
        return

    watcher.on_created = _publish_ticket
    while True:
        since_prune = time.time() - last_prune
        if since_prune > _STALE_TKTS_PRUNE_INTERVAL:
            self.prune_tickets()
            last_prune = time.time()

        have_events = watcher.wait_for_events(
            timeout=_STALE_TKTS_PRUNE_INTERVAL
        )
        if have_events:
            watcher.process_events(max_events=_DIRWATCH_EVENTS_COUNT)
def create_server(zkclient, server_id, parent_id, partition):
    """Create the server definition in Zookeeper.

    The server node is ensured to exist with a host ACL, then its data is
    read back and the ``parent`` and ``partition`` keys are set. A
    'servers' event is created only if ``zkutils.put`` returns a truthy
    value.
    """
    node = z.path.server(server_id)
    host_acl = zkclient.make_host_acl(server_id, 'rwcd')

    zkutils.ensure_exists(zkclient, node, acl=[host_acl])

    data = zkutils.get(zkclient, node)
    if not data:
        data = {}
    data.update({'parent': parent_id, 'partition': partition})

    _LOGGER.info('Creating server node %s with data %r and ACL %r',
                 node, data, host_acl)
    updated = zkutils.put(zkclient, node, data,
                          acl=[host_acl], check_content=True)
    if updated:
        create_event(zkclient, 0, 'servers', [server_id])
def _run_sync():
    """Sync Zookeeper with LDAP, runs with lock held.

    Loops forever: syncs app groups, partitions, allocations and the
    global servers list, runs the optional servers plugin, then sleeps
    for 60 seconds before the next pass.
    """
    while True:
        # Sync app groups
        app_groups = admin.AppGroup(context.GLOBAL.ldap.conn).list({})
        _sync_collection(
            context.GLOBAL.zk.conn,
            app_groups,
            z.path.appgroup(),
            _match_appgroup
        )

        # Sync partitions
        cell_admin = admin.Cell(context.GLOBAL.ldap.conn)
        partitions = cell_admin.partitions(context.GLOBAL.cell)
        _sync_partitions(context.GLOBAL.zk.conn, partitions)

        # Sync allocations.
        alloc_admin = admin.CellAllocation(context.GLOBAL.ldap.conn)
        allocations = alloc_admin.list({'cell': context.GLOBAL.cell})
        _sync_allocations(context.GLOBAL.zk.conn, allocations)

        # Global servers
        global_servers = admin.Server(context.GLOBAL.ldap.conn).list({})
        zkutils.ensure_exists(
            context.GLOBAL.zk.conn,
            z.path.globals('servers'),
            data=[server['_id'] for server in global_servers]
        )

        # Servers - because they can have custom topology - are loaded
        # from the plugin.
        try:
            plugin = importlib.import_module(
                'treadmill.plugins.sproc.servers'
            )
            plugin.init()
        except ImportError as err:
            _LOGGER.warning(
                'Unable to load treadmill.plugins.sproc.servers: %s', err
            )

        time.sleep(60)
def create_rootns(self):
    """Create root nodes and set appropriate acls.

    Each root path is ensured to exist with the ACL list mapped to it
    (``None`` where no explicit ACL is given); every trace shard path is
    then ensured to exist with the servers ACL.
    """
    servers_acl = [_SERVERS_ACL]

    # Path -> ACL list passed to ensure_exists.
    root_acls = {
        '/': None,
        z.ALLOCATIONS: None,
        z.APPMONITORS: None,
        z.BUCKETS: None,
        z.CELL: None,
        z.IDENTITY_GROUPS: None,
        z.PLACEMENT: None,
        z.PARTITIONS: None,
        z.SCHEDULED: [_SERVERS_ACL_DEL],
        z.SCHEDULER: None,
        z.SERVERS: None,
        z.STRATEGIES: None,
        z.FINISHED: [_SERVERS_ACL],
        z.FINISHED_HISTORY: None,
        z.TRACE: None,
        z.TRACE_HISTORY: None,
        z.VERSION_ID: None,
        z.ZOOKEEPER: None,
        z.BLACKEDOUT_SERVERS: [_SERVERS_ACL],
        z.ENDPOINTS: [_SERVERS_ACL],
        z.path.endpoint_proid('root'): [_SERVERS_ACL],
        z.EVENTS: [_SERVERS_ACL],
        z.RUNNING: [_SERVERS_ACL],
        z.SERVER_PRESENCE: [_SERVERS_ACL],
        z.VERSION: [_SERVERS_ACL],
        z.REBOOTS: [_SERVERS_ACL],
    }

    for root_path, root_acl in root_acls.items():
        zkutils.ensure_exists(self.zkclient, root_path, root_acl)

    for shard_path in z.trace_shards():
        zkutils.ensure_exists(self.zkclient, shard_path, acl=servers_acl)
def top(ctx, exit_on_fail, zkid, notification_fd, approot, runtime):
    """Run treadmill init process.

    Waits until the presence namespace and this server's node exist in
    Zookeeper, then - unless the server is already blacked out - starts
    the node and runs the main loop. When the main loop returns, the node
    is blacked out if a reason was reported, postmortem is optionally
    run, and the shutdown phase removes presence, stops Zookeeper, cleans
    up the network and terminates running apps.
    """
    _LOGGER.info('Initializing Treadmill: %s (%s)', approot, runtime)

    tm_env = appenv.AppEnvironment(approot)
    stop_on_lost = functools.partial(_stop_on_lost, tm_env)
    zkclient = zkutils.connect(
        context.GLOBAL.zk.url,
        idpath=zkid,
        listener=stop_on_lost
    )

    while not zkclient.exists(z.SERVER_PRESENCE):
        _LOGGER.warning('namespace not ready.')
        time.sleep(30)

    hostname = sysinfo.hostname()

    zk_blackout_path = z.path.blackedout_server(hostname)
    zk_server_path = z.path.server(hostname)
    zk_presence_path = z.path.server_presence(hostname)

    while not zkclient.exists(zk_server_path):
        _LOGGER.warning('server %s not defined in the cell.', hostname)
        time.sleep(30)

    _LOGGER.info('Checking blackout list.')
    blacklisted = bool(zkclient.exists(zk_blackout_path))

    root_cgroup = ctx.obj['ROOT_CGROUP']
    os_args = {}
    if os.name == 'posix':
        os_args['cgroup_prefix'] = root_cgroup

    if blacklisted:
        # Node was already blacked out.
        _LOGGER.warning('Shutting down blacked out node.')
    else:
        # Node startup.
        _node_start(
            tm_env, runtime, zkclient, hostname,
            zk_server_path, zk_presence_path, os_args
        )

        utils.report_ready(notification_fd)

        _init_network()

        _start_init1(tm_env)
        _LOGGER.info('Ready.')

        down_reason = _main_loop(tm_env, zkclient, zk_presence_path)

        if down_reason is None:
            # Blacked out manually
            trigger_postmortem = bool(zkclient.exists(zk_blackout_path))
        else:
            _LOGGER.warning('Shutting down: %s', down_reason)
            # Blackout the server.
            zkutils.ensure_exists(
                zkclient,
                zk_blackout_path,
                acl=[zkclient.make_host_acl(hostname, 'rwcda')],
                data=down_reason
            )
            trigger_postmortem = True

        if trigger_postmortem:
            postmortem.run(approot, root_cgroup)

    # This is the shutdown phase.

    # Delete the node
    if zk_presence_path:
        zkutils.ensure_deleted(zkclient, zk_presence_path)
    zkclient.remove_listener(stop_on_lost)
    zkclient.stop()
    zkclient.close()

    _cleanup_network()

    # to terminate all the running apps
    _blackout_terminate(tm_env)

    if exit_on_fail:
        utils.sys_exit(-1)
    else:
        # Sit forever in a broken state
        while True:
            time.sleep(1000000)
def reboot_monitor(command):
    """Runs node reboot monitor.

    Waits for the reboots namespace, then watches this host's reboot node.
    A reboot node created after the host came up triggers the reboot
    procedure; one created before is treated as a completed reboot and
    deleted. Before rebooting, the monitor waits while the node is
    blacked out. If the host has not been up longer than
    ``_MIN_UPTIME_BEFORE_REBOOT`` the node is blacked out instead of
    rebooted.

    :param command: reboot command and its arguments.
    """
    reboot_cmd = list(command)
    _LOGGER.info('Initializing reboot monitor: %r', reboot_cmd)

    zkclient = context.GLOBAL.zk.conn
    zkclient.add_listener(zkutils.exit_on_lost)

    while not zkclient.exists(z.REBOOTS):
        _LOGGER.warning(
            '%r node not created yet. Cell masters running?', z.REBOOTS
        )
        time.sleep(30)

    hostname = sysinfo.hostname()
    up_since = sysinfo.up_since()

    _LOGGER.info('Server: %s, up since: %s', hostname, up_since)
    reboot_path = z.path.reboot(hostname)

    reboot_trigger = zkclient.handler.event_object()
    reboot_trigger.clear()

    @zkclient.DataWatch(reboot_path)
    @utils.exit_on_unhandled
    def _watch_reboot(data, stat, event):
        """Watch reboot node."""
        if data is None and event is None:
            _LOGGER.info('Reboot node does not exist, ignore.')
            return True

        if event is not None and event.type == 'DELETED':
            _LOGGER.info('Reboot Node deleted, ignore.')
            return True

        # We have a reboot request node
        requested_at = time.ctime(stat.created)
        if stat.created > up_since:
            _LOGGER.info('Reboot requested at: %s, up since: %s',
                         requested_at, time.ctime(up_since))
            reboot_trigger.set()
        else:
            _LOGGER.info('Reboot success, requested at %s, up since: %s',
                         requested_at, time.ctime(up_since))
            _LOGGER.info('Deleting zknode: %r', reboot_path)
            zkutils.ensure_deleted(zkclient, reboot_path)

        return True

    # We now wait for the reboot trigger
    reboot_trigger.wait()

    # Actual reboot procedure below

    _LOGGER.info('service shutdown.')
    # Strictly speaking this is not enough for graceful shutdown.
    #
    # We need a proper shutdown procedure developed.

    _LOGGER.info('Checking blackout list.')
    zk_blackout_path = z.path.blackedout_server(hostname)
    while zkclient.exists(zk_blackout_path):
        _LOGGER.info('Node blacked out - will wait.')
        time.sleep(60)

    uptime = time.time() - up_since
    if uptime > _MIN_UPTIME_BEFORE_REBOOT:
        _LOGGER.info('exec: %r', reboot_cmd)
        utils.sane_execvp(reboot_cmd[0], reboot_cmd)
    else:
        _LOGGER.info('Possible reboot loop detected, blackout the node.')
        zkutils.ensure_exists(
            zkclient,
            zk_blackout_path,
            acl=[zkclient.make_host_acl(hostname, 'rwcda')],
            data='Possible reboot loop detected.'
        )
def reboot_server(zkclient, server_id):
    """Create server reboot event.

    Ensures the reboot node of ``server_id`` exists, using
    ``_SERVERS_ACL_DEL`` as its ACL.
    """
    reboot_node = z.path.reboot(server_id)
    zkutils.ensure_exists(zkclient, reboot_node, acl=[_SERVERS_ACL_DEL])
def accept_cmd(tkt_spool_dir, approot, port, appname, endpoint, keytab):
    """Run ticket locker acceptor.

    Registers ``hostname:port`` as a tcp endpoint in Zookeeper, writes the
    local endpoint spec and execs into ``tkt_recv_v2``.

    :param tkt_spool_dir: tickets spool directory passed to the receiver.
    :param approot: application root used to build the AppEnvironment.
    :param port: port to listen on; ``0`` picks a free port from the OS.
    :param appname: application name the endpoint is registered under.
    :param endpoint: endpoint name.
    :param keytab: when truthy, passed to ``_construct_keytab`` first.
    """
    if keytab:
        _construct_keytab(keytab)

    if port == 0:
        # Bind a throwaway socket to learn a free port. The socket is
        # closed even if bind/getsockname fails so it does not leak.
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            sock.bind(('0.0.0.0', 0))
            port = sock.getsockname()[1]
        finally:
            sock.close()

    hostname = sysinfo.hostname()
    hostport = '%s:%s' % (hostname, port)

    endpoint_proid_path = z.path.endpoint_proid(appname)
    acl = context.GLOBAL.zk.conn.make_servers_acl()
    _LOGGER.info(
        'Ensuring %s exists with ACL %r',
        endpoint_proid_path,
        acl
    )
    zkutils.ensure_exists(
        context.GLOBAL.zk.conn,
        endpoint_proid_path,
        acl=[acl]
    )

    endpoint_path = z.path.endpoint(appname, 'tcp', endpoint)
    _LOGGER.info('Registering %s %s', endpoint_path, hostport)

    # Need to delete/create endpoints for the discovery to pick it up in
    # case of master restart.
    #
    # Unlike typical endpoint, we cannot make the node ephemeral as we
    # exec into tkt-recv.
    zkutils.ensure_deleted(context.GLOBAL.zk.conn, endpoint_path)
    time.sleep(5)
    zkutils.put(context.GLOBAL.zk.conn, endpoint_path, hostport)

    context.GLOBAL.zk.conn.stop()

    # TODO: this will publish information about the endpoint state
    #       under /discovery. Once discovery is refactored (if it will be)
    #       we can remove the "manual" zookeeper manipulation.
    tm_env = appenv.AppEnvironment(approot)
    endpoints_mgr = endpoints.EndpointsMgr(tm_env.endpoints_dir)
    endpoints_mgr.unlink_all(
        appname=appname,
        endpoint=endpoint,
        proto='tcp'
    )
    endpoints_mgr.create_spec(
        appname=appname,
        endpoint=endpoint,
        proto='tcp',
        real_port=port,
        pid=os.getpid(),
        port=port,
        owner='/proc/{}'.format(os.getpid()),
    )

    subproc.safe_exec([
        'tkt_recv_v2',
        '-p{}'.format(port),
        '-d{}'.format(tkt_spool_dir)
    ])
def cell_insert_bucket(zkclient, bucket_id):
    """Add bucket to the cell.

    Nothing happens when the bucket node already exists; otherwise the
    node is created and a 'cell' event is emitted.
    """
    bucket_node = z.path.cell(bucket_id)
    if zkclient.exists(bucket_node):
        return

    zkutils.ensure_exists(zkclient, bucket_node)
    create_event(zkclient, 0, 'cell', None)
def reboot_server(zkclient, server_id):
    """Create server reboot event.

    Ensures the reboot node of ``server_id`` exists, with the ACL
    returned by ``zkclient.make_servers_del_acl()``.
    """
    reboot_node = z.path.reboot(server_id)
    zkutils.ensure_exists(
        zkclient,
        reboot_node,
        acl=[zkclient.make_servers_del_acl()]
    )
def top(exit_on_fail, zkid, approot):
    """Run treadmill init process.

    Waits until the presence namespace and this server's node exist in
    Zookeeper, then - unless the server is already blacked out - starts
    the node and runs the main loop. When the main loop reports a reason,
    the server is blacked out. The shutdown phase then removes presence,
    stops Zookeeper, cleans up the network and terminates running apps.

    :param exit_on_fail: exit with -1 after shutdown instead of sleeping
        forever.
    :param zkid: passed to ``zkutils.connect`` as ``idpath``.
    :param approot: application root used to build the AppEnvironment.
    """
    _LOGGER.info('Initializing Treadmill: %s', approot)

    tm_env = appenv.AppEnvironment(approot)
    zkclient = zkutils.connect(
        context.GLOBAL.zk.url,
        idpath=zkid,
        listener=_exit_clear_watchdog_on_lost
    )

    utils.report_ready()

    # Logger.warn is a deprecated alias; use warning() throughout.
    while not zkclient.exists(z.SERVER_PRESENCE):
        _LOGGER.warning('namespace not ready.')
        time.sleep(30)

    hostname = sysinfo.hostname()

    zk_blackout_path = z.path.blackedout_server(hostname)
    zk_presence_path = z.path.server_presence(hostname)
    zk_server_path = z.path.server(hostname)

    while not zkclient.exists(zk_server_path):
        _LOGGER.warning('server %s not defined in the cell.', hostname)
        time.sleep(30)

    _LOGGER.info('Checking blackout list.')
    blacklisted = bool(zkclient.exists(zk_blackout_path))

    if not blacklisted:
        # Node startup.
        _node_start(tm_env, zkclient, hostname,
                    zk_server_path, zk_presence_path)

        # Cleanup the watchdog directory
        tm_env.watchdogs.initialize()

        _init_network()

        _LOGGER.info('Ready.')

        down_reason = _main_loop(tm_env, zkclient, zk_presence_path)

        if down_reason is not None:
            _LOGGER.warning('Shutting down: %s', down_reason)

            # Blackout the server.
            zkutils.ensure_exists(
                zkclient,
                zk_blackout_path,
                acl=[zkutils.make_host_acl(hostname, 'rwcda')],
                data=down_reason
            )
    else:
        # Node was already blacked out.
        _LOGGER.warning('Shutting down blacked out node.')

    # This is the shutdown phase.

    # Delete the node
    zkutils.ensure_deleted(zkclient, zk_presence_path)
    zkclient.remove_listener(_exit_clear_watchdog_on_lost)
    zkclient.stop()
    zkclient.close()

    _cleanup_network()

    # to terminate all the running apps
    _blackout_terminate(tm_env)

    if exit_on_fail:
        utils.sys_exit(-1)
    else:
        # Sit forever in a broken state
        while True:
            time.sleep(1000000)
def _schedule_reboot(self, servername):
    """Schedule server reboot.

    Ensures the reboot node of ``servername`` exists, using
    ``_SERVERS_ACL_DEL`` as its ACL.
    """
    reboot_node = z.path.reboot(servername)
    zkutils.ensure_exists(
        self.zkclient,
        reboot_node,
        acl=[_SERVERS_ACL_DEL]
    )
def ensure_exists(self, path):
    """Ensure storage path exists, using the ACL computed for ``path``.

    Returns whatever ``zkutils.ensure_exists`` returns.
    """
    path_acl = self._acl(path)
    return zkutils.ensure_exists(self.zkclient, path, acl=path_acl)
def accept(tkt_spool_dir, approot, port, appname, endpoint, use_v2, keytab):
    """Run ticket locker acceptor.

    Registers ``hostname:port`` as a tcp endpoint in Zookeeper, writes the
    local endpoint spec and execs into the ticket receiver.

    :param tkt_spool_dir: tickets spool directory passed to the receiver.
    :param approot: application root used to build the AppEnvironment.
    :param port: port to listen on; ``0`` picks a free port from the OS.
    :param appname: application name the endpoint is registered under.
    :param endpoint: endpoint name.
    :param use_v2: exec ``tkt_recv_v2`` instead of ``tkt_recv`` when truthy.
    :param keytab: when truthy, passed to ``_construct_keytab`` first.
    """
    if keytab:
        _construct_keytab(keytab)

    if port == 0:
        # Bind a throwaway socket to learn a free port. The socket is
        # closed even if bind/getsockname fails so it does not leak.
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            sock.bind(('0.0.0.0', 0))
            port = sock.getsockname()[1]
        finally:
            sock.close()

    hostname = sysinfo.hostname()
    hostport = '%s:%s' % (hostname, port)

    endpoint_proid_path = z.path.endpoint_proid(appname)
    acl = context.GLOBAL.zk.conn.make_servers_acl()
    _LOGGER.info(
        'Ensuring %s exists with ACL %r',
        endpoint_proid_path,
        acl
    )
    zkutils.ensure_exists(
        context.GLOBAL.zk.conn,
        endpoint_proid_path,
        acl=[acl]
    )

    endpoint_path = z.path.endpoint(appname, 'tcp', endpoint)
    _LOGGER.info('Registering %s %s', endpoint_path, hostport)

    # Need to delete/create endpoints for the discovery to pick it up in
    # case of master restart.
    #
    # Unlike typical endpoint, we cannot make the node ephemeral as we
    # exec into tkt-recv.
    zkutils.ensure_deleted(context.GLOBAL.zk.conn, endpoint_path)
    time.sleep(5)
    zkutils.put(context.GLOBAL.zk.conn, endpoint_path, hostport)

    context.GLOBAL.zk.conn.stop()

    tm_env = appenv.AppEnvironment(approot)
    endpoints_mgr = endpoints.EndpointsMgr(tm_env.endpoints_dir)
    endpoints_mgr.unlink_all(
        appname=appname,
        endpoint=endpoint,
        proto='tcp'
    )
    endpoints_mgr.create_spec(
        appname=appname,
        endpoint=endpoint,
        proto='tcp',
        real_port=port,
        pid=os.getpid(),
        port=port,
        owner='/proc/{}'.format(os.getpid()),
    )

    # Exec into tickets acceptor. If race condition will not allow it to
    # bind to the provided port, it will exit and registration will
    # happen again.
    if use_v2:
        subproc.safe_exec([
            'tkt_recv_v2',
            '-p{}'.format(port),
            '-d{}'.format(tkt_spool_dir)
        ])
    else:
        subproc.safe_exec([
            'tkt_recv',
            'tcp://*:{}'.format(port),
            tkt_spool_dir
        ])