def get_server(zkclient, server_id, placement=False):
    """Return the server object, optionally merged with placement data.

    :param zkclient: Zookeeper client passed through to ``zkutils``.
    :param server_id: identifier used to build the server/placement paths.
    :param placement: when true, the placement node is read as well and its
        content (if any) is merged over the server data.
    :returns: data read from the server node (``{}`` is supplied as the
        default), updated in place with the placement data when requested.
    """
    server_data = zkutils.get_default(zkclient, z.path.server(server_id), {})
    if not placement:
        return server_data

    overlay = zkutils.get_default(zkclient, z.path.placement(server_id))
    if overlay:
        server_data.update(overlay)
    return server_data
def _watch_task_instances(instance_ids):
    """Set up a watch or load data for each instance of the enclosing task.

    Relies on names from the enclosing scope: ``task``, ``cell_state``,
    ``scheduled``, ``zkclient`` and ``watch_task_instance``.

    :param instance_ids: iterable of instance ids; each one is joined to
        ``task`` with ``#`` to form the full instance name.
    :returns: always ``True``.
    """
    for instance_id in instance_ids:
        fullname = '#'.join([task, instance_id])

        # Either watch is established or data is acquired.
        if fullname in cell_state.tasks:
            continue

        # On first load, optimize lookup by preloading state
        # of all scheduled instances.
        #
        # Once initial load is done, scheduled will be cleared.
        if scheduled:
            still_scheduled = fullname in scheduled
        else:
            still_scheduled = zkclient.exists(z.path.scheduled(fullname))

        if not still_scheduled:
            # Not scheduled: read the task data once instead of watching it.
            cell_state.tasks[fullname] = zkutils.get_default(
                zkclient, z.path.task(fullname)
            )
            continue

        watch_task_instance(zkclient, cell_state, fullname)

    return True
def load_allocations(self):
    """Load allocations and the assignments map.

    Reads the allocation list from ``z.ALLOCATIONS``. For every entry the
    matching allocation is located under the entry's partition by walking
    the ``/`` or ``:`` separated name, its capacity, rank and
    max-utilization are updated, and each assignment pattern is recorded in
    ``self.assignments`` as ``(priority, allocation)``.

    Returns ``None`` without doing anything when nothing is stored.
    """
    stored = zkutils.get_default(self.zkclient, z.ALLOCATIONS, default={})
    if not stored:
        return

    for spec in stored:
        label = spec.get('partition')
        name = spec['name']
        _LOGGER.info('Loading allocation: %s, label: %s', name, label)

        # Start at the partition's root allocation and descend one level per
        # name component, stamping the partition label on every level.
        node = self.cell.partitions[label].allocation
        node.label = label
        for component in re.split('[/:]', name):
            node = node.get_sub_alloc(component)
            node.label = label

        node.update(
            resources(spec),
            spec['rank'],
            spec.get('max-utilization'),
        )

        for assignment in spec.get('assignments', []):
            # Pattern is extended to match "<pattern>#<10 digits>".
            pattern = assignment['pattern'] + '[#]' + ('[0-9]' * 10)
            priority = assignment['priority']
            _LOGGER.info('Assignment: %s - %s', pattern, priority)
            self.assignments[pattern] = (priority, node)
def process_events(self, events):
    """Callback invoked on state change/admin event.

    Event names have the form ``<prio>-<resource>-<seq #>``. Names matching
    that form are handled in sorted ``(prio, seq, resource)`` order; the
    components are compared as strings. Names that do not match are not
    handled. After handling, every node named in ``events`` is deleted,
    whether or not it matched.

    :param events: collection of event node names; it is iterated twice.
    """
    # Events are sequential nodes in the form <prio>-<event>-<seq #>
    #
    # They are processed in order of (prio, seq_num, event)
    ordered = sorted(
        tuple(event.split('-')[i] for i in (0, 2, 1))
        for event in events
        if re.match(r'\d+\-\w+\-\d+$', event)
    )

    for prio, seq, resource in ordered:
        _LOGGER.info('event: %s %s %s', prio, seq, resource)
        # Rebuild the original node name from the reordered tuple.
        node_name = '-'.join([prio, resource, seq])

        if resource == 'allocations':
            # TODO: changing allocations has potential of complete
            #       reshuffle, so while inefficient, reload
            #       all apps as well.
            self.load_allocations()
            self.load_apps()
        elif resource == 'apps':
            # The event node contains list of apps to be re-evaluated.
            apps = zkutils.get_default(self.zkclient,
                                       z.path.event(node_name),
                                       default=[])
            for app in apps:
                self.load_app(app)
        elif resource == 'cell':
            self.load_cell()
        elif resource == 'servers':
            servers = zkutils.get_default(self.zkclient,
                                          z.path.event(node_name),
                                          default=[])
            if not servers:
                # If not specified, reload all. Use union of servers in
                # the model and in zookeeper. This must be ``|``: a
                # symmetric difference (``^``) would skip every server
                # that is present on both sides.
                servers = (set(self.servers.keys()) |
                           set(self.zkclient.get_children(z.SERVERS)))
            self.reload_servers(servers)
        elif resource == 'identity_groups':
            self.load_identity_groups()
        else:
            _LOGGER.warning('Unsupported event resource: %s', resource)

    for node in events:
        _LOGGER.info('Deleting event: %s', z.path.event(node))
        zkutils.ensure_deleted(self.zkclient, z.path.event(node))
def _save_version(zkclient, hostname, version):
    """Save server version data to ZK.

    ``version`` is inserted at the front of the host's stored version
    history and the first ``_MAX_VERSIONS`` entries are written back.
    """
    history_path = z.path.version_history(hostname)

    # A missing or empty history starts a fresh list.
    history = zkutils.get_default(zkclient, history_path) or []
    history.insert(0, version)

    zkutils.put(zkclient, history_path, history[:_MAX_VERSIONS])
def load_placement_data(self):
    """Restore app identities.

    For every app in the cell that has a server, the app's placement node
    under that server is read. When data is stored there, the app's identity
    is forced to the stored ``identity`` and its ``placement_expiry`` is set
    to the stored ``expires`` value (0 when absent).
    """
    for appname, app in self.cell.apps.items():
        if not app.server:
            continue

        stored = zkutils.get_default(
            self.zkclient,
            z.path.placement(app.server, appname),
        )
        if stored is None:
            continue

        app.force_set_identity(stored.get('identity'))
        app.placement_expiry = stored.get('expires', 0)
def _watch_finished(finished):
    """Watch /finished nodes.

    Loads the data of each finished instance that is not yet present in
    ``cell_state.finished`` (``{}`` is used when nothing is stored).
    ``cell_state`` and ``zkclient`` come from the enclosing scope.
    """
    for instance in finished:
        if instance not in cell_state.finished:
            cell_state.finished[instance] = zkutils.get_default(
                zkclient, z.path.finished(instance), {}
            )
def _watch_finished(finished):
    """Watch /finished nodes.

    Synchronizes ``cell_state.finished`` with ``finished``: instances that
    are new are loaded (``{}`` when nothing is stored), instances that are
    no longer listed are removed. ``cell_state`` and ``zkclient`` come from
    the enclosing scope.
    """
    known = set(cell_state.finished)
    wanted = set(finished)

    added = wanted.difference(known)
    removed = known.difference(wanted)

    for instance in added:
        cell_state.finished[instance] = zkutils.get_default(
            zkclient, z.path.finished(instance), {}
        )

    for instance in removed:
        del cell_state.finished[instance]
def _list_server_blackouts(zkclient, fmt):
    """List server blackouts.

    Prints one line per blacked-out server, ordered by descending
    ``(created, node)``. Partition and version are looked up only when the
    format string contains ``%p`` / ``%v``; missing values print as ``-``.

    :param zkclient: Zookeeper client.
    :param fmt: output format using the ``%t %h %p %v %r`` placeholders.
    """
    show_partition = '%p' in fmt
    show_version = '%v' in fmt

    def _stored_field(path, key):
        """Return the truthy value stored under key at path, else None."""
        node_data = zkutils.get_default(zkclient, path)
        if node_data and node_data.get(key):
            return node_data[key]
        return None

    rows = []
    for node in zkclient.get_children(z.BLACKEDOUT_SERVERS):
        try:
            reason, metadata = zkutils.get_with_metadata(
                zkclient, z.path.blackedout_server(node)
            )
        except kazoo.client.NoNodeError:
            # Blackout node is gone; nothing to report for this server.
            continue

        partition = None
        if show_partition:
            partition = _stored_field(z.path.server(node), 'partition')

        version = None
        if show_version:
            version = _stored_field(z.path.version(node), 'codepath')

        rows.append((metadata.created, node, partition, version, reason))

    # [%t] %h %r will be printed as below
    # [Thu, 05 May 2016 02:59:58 +0000] <hostname> -
    formatter = _gen_formatter(('t', 'h', 'p', 'v', 'r'), fmt)

    rows.sort()
    for when, node, partition, version, reason in reversed(rows):
        line = formatter.format(
            utils.strftime_utc(when),
            node,
            partition or '-',
            version or '-',
            reason or '-',
        )
        cli.out(line)
def load_identity_groups(self):
    """Load identity groups.

    Identity groups known to the cell but no longer listed under
    ``z.IDENTITY_GROUPS`` are removed. Every listed group that has stored
    data is then (re)configured with its ``count`` (0 when absent).
    """
    listed = set(self.zkclient.get_children(z.IDENTITY_GROUPS))

    stale = set(self.cell.identity_groups.keys()).difference(listed)
    _LOGGER.info('Removing identities: %r', stale)
    for name in stale:
        self.cell.remove_identity_group(name)

    for name in listed:
        ident = zkutils.get_default(self.zkclient,
                                    z.path.identity_group(name))
        if not ident:
            continue

        count = ident.get('count', 0)
        _LOGGER.info('Configuring identity: %s, %s', name, count)
        self.cell.configure_identity_group(name, count)
def _create_ephemeral_with_retry(zkclient, path, data):
    """Create ephemeral node with retry.

    Makes up to ``_EPHEMERAL_RETRY_COUNT`` attempts. When the node already
    exists, its current data is logged and the call sleeps for
    ``_EPHEMERAL_RETRY_INTERVAL`` before the next attempt.

    :returns: the result of ``zkutils.create`` on success.
    :raises exc.ContainerSetupError: when every attempt found the node
        already present.
    """
    existing = None
    for _attempt in range(_EPHEMERAL_RETRY_COUNT):
        try:
            created = zkutils.create(
                zkclient,
                path,
                data,
                acl=[_SERVERS_ACL],
                ephemeral=True,
            )
        except kazoo.client.NodeExistsError:
            existing = zkutils.get_default(zkclient, path)
            _LOGGER.warning('Node exists, will retry: %s, data: %r',
                            path, existing)
            time.sleep(_EPHEMERAL_RETRY_INTERVAL)
        else:
            return created

    raise exc.ContainerSetupError('%s:%s' % (path, existing),
                                  app_abort.AbortedReason.PRESENCE)
def _create_ephemeral_with_retry(zkclient, path, data):
    """Create ephemeral node with retry.

    Makes up to 5 attempts. When the node already exists, its current data
    and metadata are logged and the call sleeps for
    ``_EPHEMERAL_RETRY_INTERVAL`` before the next attempt.

    :returns: the result of ``zkutils.create`` on success.
    :raises exc.ContainerSetupError: when every attempt found the node
        already present; the message carries the path and the last data
        seen on the existing node.
    """
    prev_data = None
    for _ in range(5):
        try:
            return zkutils.create(zkclient, path, data,
                                  acl=[_SERVERS_ACL],
                                  ephemeral=True)
        except kazoo.client.NodeExistsError:
            prev_data, metadata = zkutils.get_default(zkclient, path,
                                                      need_metadata=True)
            # Logger.warn is a deprecated alias; use warning().
            _LOGGER.warning(
                'Node exists, will retry: %s, data: %r, metadata: %r',
                path, prev_data, metadata
            )
            time.sleep(_EPHEMERAL_RETRY_INTERVAL)

    raise exc.ContainerSetupError('presence.%s:%s' % (path, prev_data))
def load_bucket(self, bucketname):
    """Load bucket info, assume parent is already created.

    The bucket is built from the data stored for ``bucketname`` (``traits``
    defaults to 0, ``level`` to the part of the name before the first
    ``:``), cached in ``self.buckets`` and, when the data names a parent,
    attached to that parent, which is loaded recursively.

    :returns: the cached or newly created bucket.
    """
    # Do not load twice.
    if bucketname in self.buckets:
        return self.buckets[bucketname]

    _LOGGER.info('loading bucket: %s', bucketname)
    info = zkutils.get_default(self.zkclient,
                               z.path.bucket(bucketname),
                               default={})

    fallback_level = bucketname.split(':')[0]
    bucket = scheduler.Bucket(
        bucketname,
        traits=info.get('traits', 0),
        level=info.get('level', fallback_level),
    )
    self.buckets[bucketname] = bucket

    parent_name = info.get('parent')
    if parent_name:
        self.load_bucket(parent_name).add_node(bucket)

    return bucket
def app_cmd(app, reason, clear):
    """Manage app blackouts.

    With ``app`` given, the app is either removed from the blacklist
    (``clear``) or added with ``reason`` and the current time; the updated
    blacklist is stored and an ``apps_blacklist`` event is created. The
    resulting blacklist is always printed, sorted by app.

    :raises click.UsageError: when adding an app without a reason.
    """
    zkclient = context.GLOBAL.zk.conn

    # Missing or empty node is treated as an empty blacklist.
    blacklist = zkutils.get_default(zkclient, z.BLACKEDOUT_APPS) or {}

    if app:
        if not clear:
            if not reason:
                raise click.UsageError('--reason is required.')
            blacklist[app] = {'reason': reason, 'when': time.time()}
        else:
            blacklist.pop(app, None)

        zkutils.put(zkclient, z.BLACKEDOUT_APPS, data=blacklist)
        masterapi.create_event(zkclient, 0, 'apps_blacklist', None)

    for blacklisted, details in sorted(blacklist.items()):
        cli.out(
            '[%s] %s %s',
            utils.strftime_utc(details['when']),
            blacklisted,
            details['reason'],
        )
def load_app(self, appname):
    """Load single application data.

    When no manifest is stored under the scheduled path, the app is removed
    from the cell. Otherwise the app already known to the cell has its
    priority and data retention timeout refreshed, or a new
    ``scheduler.Application`` is built from the manifest; in both cases the
    app is then added to the allocation found for it.
    """
    # TODO: need to check if app is blacklisted.
    manifest = zkutils.get_default(self.zkclient, z.path.scheduled(appname))
    if not manifest:
        self.cell.remove_app(appname)
        return

    priority, allocation = self.find_assignment(appname)
    # A manifest priority other than -1 overrides the assignment priority.
    if 'priority' in manifest:
        requested = int(manifest['priority'])
        if requested != -1:
            priority = requested

    # TODO: From scheduler perspective it is theoretically
    #       possible to update data retention timeout.
    data_retention = get_data_retention(manifest)
    lease = get_lease(manifest)

    app = self.cell.apps.get(appname, None)
    if not app:
        app = scheduler.Application(
            appname,
            priority,
            resources(manifest),
            affinity=manifest.get('affinity'),
            affinity_limits=manifest.get('affinity_limits', None),
            identity_group=manifest.get('identity_group'),
            schedule_once=manifest.get('schedule_once'),
            data_retention_timeout=data_retention,
            lease=lease,
        )
    else:
        app.priority = priority
        app.data_retention_timeout = data_retention

    self.cell.add_app(allocation, app)
def adjust_server_state(self, servername, readonly=False):
    """Set server state.

    Restores the state stored in the server placement node (``down`` as of
    now when nothing is stored), then corrects it from presence: no presence
    node forces ``down``; presence turns anything but ``frozen`` into
    ``up``. Unless ``readonly`` is set, the resulting state is written back
    to the placement node.

    Does nothing when ``servername`` is not in ``self.servers``.
    """
    server = self.servers.get(servername)
    if not server:
        return

    is_up = self.zkclient.exists(z.path.server_presence(servername))
    placement_node = z.path.placement(servername)

    # Restore state as it was stored in server placement node.
    #
    # zkutils.get_default return tuple if need_metadata is True, default it
    # is False, so it will return dict. pylint complains about it,
    # and it should be fixed in zkutils.
    #
    # pylint: disable=R0204
    stored = zkutils.get_default(self.zkclient, placement_node)
    if not stored:
        stored = {'state': 'down', 'since': time.time()}

    server.set_state(scheduler.State(stored['state']), stored['since'])

    # If presence does not exist - adjust state to down.
    if not is_up:
        server.state = scheduler.State.down
    elif server.state is not scheduler.State.frozen:
        server.state = scheduler.State.up

    # Record server state:
    state, since = server.get_state()
    if readonly:
        return

    zkutils.put(
        self.zkclient,
        placement_node,
        {'state': state.value, 'since': since},
    )
def get_default(self, path, default=None):
    """Return stored object or default if not found.

    Forwards to ``zkutils.get_default`` using this object's ``zkclient``.
    """
    stored = zkutils.get_default(self.zkclient, path, default=default)
    return stored
def get_scheduled_stats(zkclient):
    """Return count of scheduled apps by proid.

    Reads ``z.SCHEDULED_STATS``; ``{}`` is supplied as the default.
    """
    stats = zkutils.get_default(zkclient, z.SCHEDULED_STATS, {})
    return stats
def get_app(zkclient, app_id):
    """Return scheduled app details by app_id.

    The node path is built with ``_app_node(app_id)``; no explicit default
    is passed to ``zkutils.get_default``.
    """
    app_path = _app_node(app_id)
    return zkutils.get_default(zkclient, app_path)