def __init__(self, configuration, zoo_keeper, web_socket_clients,
             time_estimate_cache):
    """
    Store the collaborators and create an empty dependency message plus a
    message throttle for the given web socket clients.

    :type configuration: zoom.config.configuration.Configuration
    :type zoo_keeper: kazoo.client.KazooClient
    :type web_socket_clients: list
    :param time_estimate_cache: kept on the instance as
        ``_time_estimate_cache``
    """
    # Starts out as an empty message.
    dependencies_message = ApplicationDependenciesMessage()
    self._cache = dependencies_message

    # Plain references handed in by the caller.
    self._configuration = configuration
    self._zoo_keeper = zoo_keeper
    self._web_socket_clients = web_socket_clients
    self._time_estimate_cache = time_estimate_cache

    throttle = MessageThrottle(configuration, web_socket_clients)
    self._message_throttle = throttle
def __init__(self, configuration, web_socket_clients):
    """
    Store the collaborators, build the message throttle and the graphite
    availability checker, and start with empty caches.

    :type configuration: zoom.www.config.configuration.Configuration
    :type web_socket_clients: list
    """
    self.configuration = configuration
    self._web_socket_clients = web_socket_clients

    throttle = MessageThrottle(configuration, web_socket_clients)
    self._message_throttle = throttle

    # Availability checker is configured from the graphite host and the
    # recheck setting found on the configuration object.
    availability = GraphiteAvailability(
        configuration.graphite_host,
        recheck=configuration.graphite_recheck)
    self.graphite = availability

    # Three independent, initially empty dictionaries.
    self.graphite_cache = dict()
    self.dependencies = dict()
    self.states = dict()
def __init__(self, configuration, zoo_keeper, web_socket_clients,
             time_estimate_cache):
    """
    Store the collaborators, create the application states message tagged
    with the configured environment, and build the message throttle.

    :type configuration: zoom.config.configuration.Configuration
    :type zoo_keeper: zoom.zoo_keeper.ZooKeeper
    :type web_socket_clients: list
    :type time_estimate_cache:
        zoom.www.cache.time_estimate_cache.TimeEstimateCache
    """
    self._path_to_host_mapping = {}
    self._configuration = configuration

    # The cached message carries the environment from the configuration.
    states_message = ApplicationStatesMessage()
    self._cache = states_message
    states_message.set_environment(configuration.environment)

    self._zoo_keeper = zoo_keeper
    self._web_socket_clients = web_socket_clients
    self._time_estimate_cache = time_estimate_cache

    throttle = MessageThrottle(configuration, web_socket_clients)
    self._message_throttle = throttle
def __init__(self, configuration, web_socket_clients):
    """
    Store the collaborators, build the message throttle and start with
    empty graphite, dependency and state dictionaries.

    :type configuration: zoom.www.config.configuration.Configuration
    :type web_socket_clients: list
    """
    self.configuration = configuration
    self._web_socket_clients = web_socket_clients

    throttle = MessageThrottle(configuration, web_socket_clients)
    self._message_throttle = throttle

    # Three independent, initially empty dictionaries.
    self.graphite_cache = dict()
    self.dependencies = dict()
    self.states = dict()
def __init__(self, configuration, zoo_keeper, web_socket_clients,
             time_estimate_cache):
    """
    Store the collaborators and create an empty dependency message plus a
    message throttle for the given web socket clients.

    :type configuration: zoom.config.configuration.Configuration
    :type zoo_keeper: zoom.www.zoo_keeper.ZooKeeper
    :type web_socket_clients: list
    :param time_estimate_cache: kept on the instance as
        ``_time_estimate_cache``
    """
    # Starts out as an empty message.
    dependencies_message = ApplicationDependenciesMessage()
    self._cache = dependencies_message

    # Plain references handed in by the caller.
    self._configuration = configuration
    self._zoo_keeper = zoo_keeper
    self._web_socket_clients = web_socket_clients
    self._time_estimate_cache = time_estimate_cache

    throttle = MessageThrottle(configuration, web_socket_clients)
    self._message_throttle = throttle
class TimeEstimateCache(object):
    """
    Compute startup time estimates from the known application states, the
    dependency graph and per-application startup times read from graphite,
    and push the result to web socket clients through a message throttle.
    """

    def __init__(self, configuration, web_socket_clients):
        """
        :type configuration: zoom.www.config.configuration.Configuration
        :type web_socket_clients: list
        """
        self.configuration = configuration
        self._web_socket_clients = web_socket_clients
        self._message_throttle = MessageThrottle(configuration,
                                                 web_socket_clients)
        self.graphite = GraphiteAvailability(
            configuration.graphite_host,
            recheck=configuration.graphite_recheck)
        # path -> {'ave': .., 'max': .., 'min': ..} as read from graphite
        self.graphite_cache = {}
        # path -> dependency data (expects a 'dependencies' list per entry)
        self.dependencies = {}
        # path -> application state dictionary
        self.states = {}

    def start(self):
        """Start the underlying message throttle."""
        self._message_throttle.start()

    def stop(self):
        """Stop the underlying message throttle."""
        self._message_throttle.stop()

    def reload(self):
        """Drop all cached graphite data and recompute the estimates."""
        self.graphite_cache.clear()
        self.load()

    def update_states(self, states):
        """
        Merge new application states and recompute (and send) estimates.

        :type states: dict
        """
        self.states.update(states)
        self.load(send=True)

    def update_dependencies(self, deps):
        """
        Merge new dependency data and recompute (and send) estimates.

        :type deps: dict
        """
        self.dependencies.update(deps)
        self.load(send=True)

    def _send_if_ready(self, message, send):
        """
        Queue ``message`` on the throttle, but only when sending was
        requested and both dependencies and states are non-empty.
        """
        if all((send, self.dependencies, self.states)):
            self._message_throttle.add_message(message)

    @TimeThis(__file__)
    def load(self, send=False):
        """
        Recompute the time estimates over every known dependency path.

        A ``RuntimeError`` raised while walking the dependencies is
        reported as a likely circular dependency in an ``error_msg``
        message. Any other exception is logged and ``None`` is returned.

        :type send: bool
            Whether to send messages to clients.
        :rtype: zoom.www.messages.global_mode_message.TimeEstimateMessage
        """
        logging.debug("Recomputing Timing Estimates...")
        # Pre-define path in case we except out before it's declared
        path = None
        try:
            message = TimeEstimateMessage()
            cost = self._get_default_data()
            maxpath = "None"
            # memo shared by all _get_max_cost calls of this run
            searchdata = {}

            if self.states and self.graphite.available:
                for path in self.dependencies.iterkeys():
                    data = self._get_max_cost(path, searchdata)
                    # remember which path currently has the largest 'max'
                    if data['max'] > cost['max']:
                        maxpath = path
                    self._get_greatest_cost(cost, data)

            message.update({
                'maxtime': cost['max'],
                'mintime': cost['min'],
                'avetime': cost['ave'],
                'maxpath': maxpath
            })
            self._send_if_ready(message, send)
            return message

        except RuntimeError as e:
            message = TimeEstimateMessage()
            logging.exception(e)
            # format() rather than '+': path is still None when the error
            # happens before the loop binds it, and str + None would raise
            # a TypeError from inside this handler.
            message.update({
                'error_msg':
                    'Likely circular dependency on {0}'.format(path)
            })
            self._send_if_ready(message, send)
            return message

        except Exception as e:
            logging.exception(e)

    def _get_max_cost(self, path, searchdata):
        """
        Return the greatest cost among ``path``'s dependencies, plus the
        path's own graphite startup cost when it is not running.

        Results are memoised in ``searchdata`` only after the recursion for
        a path has finished.

        :type path: str
        :type searchdata: dict
        :rtype: dict
            Example: {'ave': 0, 'max': 0, 'min': 0}
        """
        # init internal search data
        cached_cost = searchdata.get(path, None)
        if cached_cost is not None:
            return cached_cost

        dep_data = self.dependencies.get(path, None)
        greatest_cost = self._get_default_data()

        # take greatest_cost and record the largest cost from all a path's
        # dependencies.
        if dep_data:
            for i in dep_data['dependencies']:
                # expecting: i = {'path': '/foo/bar', 'type': 'baz'}
                dep_type = i.get('type').lower()
                if dep_type == PredicateType.ZOOKEEPERHASCHILDREN:
                    cached_cost = self._get_max_cost(i.get("path", None),
                                                     searchdata)
                    self._get_greatest_cost(greatest_cost, cached_cost)
                elif dep_type == PredicateType.ZOOKEEPERHASGRANDCHILDREN:
                    grand_path = i.get("path")
                    # every known path strictly below grand_path counts
                    for key in self.dependencies.iterkeys():
                        lowered = key.lower()
                        if (lowered.startswith(grand_path) and
                                lowered != grand_path):
                            cached_cost = self._get_max_cost(key,
                                                             searchdata)
                            self._get_greatest_cost(greatest_cost,
                                                    cached_cost)

        # if application is not running, add its cost (from graphite) to the
        # greatest cost of its dependencies
        state = self.states.get(path, None)
        if (state is not None and
                state.get('application_status', None) != "running"):
            graphite_data = self.get_graphite_data(path)
            self._add_data(greatest_cost, graphite_data)

        searchdata.update({path: greatest_cost})
        return greatest_cost

    def _get_default_data(self):
        """Return a fresh zeroed cost dictionary."""
        return {'ave': 0, 'max': 0, 'min': 0}

    def _get_greatest_cost(self, d1, d2):
        """
        Update greatest cost values with the greater between the current
        value and the cached value. ``d1`` is modified in place.
        """
        for i in ['ave', 'min', 'max']:
            d1[i] = max(d1[i], d2[i])

    def _add_data(self, d1, d2):
        """
        Add an apps own startup time (from graphite) to greatest cost if it
        is not running. ``d1`` is modified in place.
        """
        for i in ['ave', 'min', 'max']:
            d1[i] += d2[i]

    def get_graphite_data(self, path):
        """
        Grab startup times from graphite for a path.

        The answer is cached per path. A zeroed entry is cached as soon as
        the HTTP request returns, so a non-OK response is not retried until
        the cache is cleared. On any exception the error is logged and
        zeroed data is returned.

        :type path: str
        :rtype: dict
            Example: {'min': 0, 'max': 0, 'ave': 0}
        """
        if self.graphite_cache.get(path, None) is not None:
            return self.graphite_cache[path]

        try:
            # graphite metric names use '.' where the state path uses '/'
            app_path = path.split('/spot/software/state/')[1]
            app_path = app_path.replace('/', '.')
            url = ("http://{0}/render?format=json&from=-7d"
                   "&target=alias(aggregateLine(Infrastructure.startup.{1}.runtime,'max'),'max')"
                   "&target=alias(aggregateLine(Infrastructure.startup.{1}.runtime,'min'),'min')"
                   "&target=alias(aggregateLine(Infrastructure.startup.{1}.runtime,'avg'),'avg')"
                   .format(self.configuration.graphite_host, app_path))

            response = requests.get(url, timeout=.5)

            self.graphite_cache[path] = self._get_default_data()

            if response.status_code == httplib.OK:
                entry = self.graphite_cache[path]
                for data in response.json():
                    target = data['target']
                    if "avg" in target:
                        entry['ave'] = data['datapoints'][0][0]
                    elif "max" in target:
                        entry['max'] = data['datapoints'][-1][0]
                    elif "min" in target:
                        entry['min'] = data['datapoints'][0][0]
                    else:
                        logging.warning(
                            "Received graphite data {} with unknown "
                            "target".format(data))

            return self.graphite_cache[path]

        except Exception:
            logging.exception('Error getting startup data from graphite for '
                              'path: {0}'.format(path))
            return self._get_default_data()
class ApplicationStateCache(object):
    """
    Cache of application states read from ZooKeeper. Changes are pushed to
    web socket clients through a message throttle and forwarded to the
    time estimate cache.
    """

    def __init__(self, configuration, zoo_keeper, web_socket_clients,
                 time_estimate_cache):
        """
        :type configuration: zoom.config.configuration.Configuration
        :type zoo_keeper: zoom.zoo_keeper.ZooKeeper
        :type web_socket_clients: list
        :type time_estimate_cache:
            zoom.www.cache.time_estimate_cache.TimeEstimateCache
        """
        # host -> {path: bool}, maintained by _update_mapping
        self._path_to_host_mapping = dict()
        self._configuration = configuration
        self._cache = ApplicationStatesMessage()
        self._cache.set_environment(self._configuration.environment)
        self._zoo_keeper = zoo_keeper
        self._web_socket_clients = web_socket_clients
        self._time_estimate_cache = time_estimate_cache
        self._message_throttle = MessageThrottle(configuration,
                                                 web_socket_clients)

    @property
    def host_mapping(self):
        """The host -> {path: bool} mapping built while walking states."""
        return self._path_to_host_mapping

    def start(self):
        """Start the underlying message throttle."""
        self._message_throttle.start()

    def stop(self):
        """Stop the underlying message throttle."""
        self._message_throttle.stop()

    def load(self):
        """
        Return the cached states, loading them first if the cache is empty.

        :rtype: zoom.messages.application_states.ApplicationStatesMessage
        """
        if not len(self._cache):
            self._load()
        return self._cache

    def reload(self):
        """Clear the cache and re-walk the application state path."""
        self._cache.clear()
        self._on_update_path(self._configuration.application_state_path)

    @TimeThis(__file__)
    def _load(self):
        """Rebuild the cache from scratch by walking the state path."""
        self._cache.clear()
        self._walk(self._configuration.application_state_path, self._cache)
        logging.info(
            "Application state cache loaded from ZooKeeper {0}".format(
                self._configuration.application_state_path))
        self._time_estimate_cache.update_states(
            self._cache.application_states)
        self._cache.remove_deletes()

    def _build_default_override_store(self):
        """Create the override node holding an empty JSON object."""
        logging.debug('Override storage node not found, creating')
        _template = {}
        self._zoo_keeper.create(self._configuration.override_node,
                                json.dumps(_template),
                                makepath=True)

    def _update_override_info(self, path, override_key, override_value):
        """
        Update zookeeper with override values passed in.

        If the override node does not exist it is created and the update is
        attempted again.
        """
        logging.debug(
            "_update_override_info: path: {0}, override_key: {1} override_value: {2}"
            .format(path, override_key, override_value))
        try:
            override_str, stat = self._zoo_keeper.get(
                self._configuration.override_node)
            override_dict = json.loads(override_str)
            update = {override_key: override_value}
            state = override_dict.get(path, {})
            state.update(update)
            override_dict.update({path: state})
            self._zoo_keeper.set(self._configuration.override_node,
                                 json.dumps(override_dict))
        except NoNodeError as err:
            logging.debug('Unable to find {0}, {1}'.format(
                self._configuration.override_node, err))
            self._build_default_override_store()
            self._update_override_info(path, override_key, override_value)

    def manual_update(self, path, key, value):
        """
        Manual override from client of specific value. Paths that are not
        in the cache are ignored.

        :type path: str
        :type key: str
        :type value: str
        """
        state = self._cache.application_states.get(path, None)
        if state is not None:
            self._update_override_info(path, key, value)
            message = ApplicationStatesMessage()
            state[key] = value
            message.update({path: state})
            self._message_throttle.add_message(message)

    @connected_with_return(None)
    def _walk(self, path, result):
        """
        Recursively walk ``path``; each node without children is recorded
        in ``result``. A node that no longer exists is recorded as a
        delete.

        :type path: str
        :type result:
            zoom.www.messages.application_states.ApplicationStatesMessage
        """
        try:
            children = self._zoo_keeper.get_children(path,
                                                     watch=self._on_update)
            if children:
                for child in children:
                    self._walk(zk_path_join(path, child), result)
            else:
                app_state = self._get_application_state(path)
                result.update(
                    {app_state.configuration_path: app_state.to_dictionary()})
        except NoNodeError:
            result.update({
                path: ApplicationState(configuration_path=path,
                                       delete=True).to_dictionary(),
            })
        except Exception:
            logging.exception('An unhandled Exception has occurred while '
                              'running ApplicationStateCache.walk.')

    def _get_app_details(self, path):
        """
        Read and JSON-decode the node at ``path``. Empty or undecodable
        data yields an empty dictionary.

        :type path: str
        :rtype: dict, kazoo.protocol.states.ZnodeStat
        """
        raw_data, stat = self._zoo_keeper.get(path, watch=self._on_update)
        data = {}
        if raw_data:
            try:
                data = json.loads(raw_data)
            except ValueError:
                pass
        return data, stat

    def _build_application_state(self, name, config_path, status, host,
                                 stat, data):
        """
        Assemble an ApplicationState from node data.

        :type name: str
        :type config_path: str
        :param status: an ApplicationStatus value
        :type host: str
        :param stat: stat whose ``last_modified`` becomes ``last_update``
        :type data: dict
        :rtype: zoom.entities.application_state.ApplicationState
        """
        return ApplicationState(
            application_name=name,
            configuration_path=config_path,
            application_status=status,
            application_host=host,
            last_update=stat.last_modified,
            start_stop_time=data.get('start_stop_time', ''),
            error_state=data.get('state', 'unknown'),
            local_mode=data.get('mode', 'unknown'),
            login_user=data.get('login_user', 'Zoom'),
            read_only=data.get('read_only', False),
            last_command=self._get_last_command(data),
            pd_disabled=self._get_existing_attribute(config_path,
                                                     'pd_disabled'),
            grayed=self._get_existing_attribute(config_path, 'grayed'),
            platform=data.get('platform', 'unknown'),
            restart_count=data.get('restart_count', 0),
            load_times=self._time_estimate_cache.get_graphite_data(
                config_path))

    def _get_application_state(self, path):
        """
        Build the state for the node at ``path``. A persistent node is
        reported as stopped; an ephemeral node is reported as running on
        the host named by its last path component.

        :type path: str
        :rtype: zoom.entities.application_state.ApplicationState
        """
        data, stat = self._get_app_details(path)

        # persistent node
        if stat.ephemeralOwner == 0:
            # watch node to see if children are created
            self._zoo_keeper.get_children(path, watch=self._on_update)
            host = data.get('host', 'Unknown')
            name = data.get('name', os.path.basename(path))
            agent_path = zk_path_join(self._configuration.agent_state_path,
                                      host)

            # if the agent is down, update state and mode with unknown
            agent_up = bool(
                self._zoo_keeper.exists(agent_path,
                                        watch=self._on_agent_state_update))

            valid = True
            if host in (None, 'Unknown'):
                data['state'] = 'invalid'
                data['mode'] = 'unknown'
                valid = False
            elif not agent_up:
                data['state'] = 'unknown'
                data['mode'] = 'unknown'
                valid = False
            else:
                # agent is up: the app must be one of its components
                d, s = self._zoo_keeper.get(agent_path)
                registered_comps = json.loads(d).get('components', [])
                if name not in registered_comps:
                    data['state'] = 'invalid'
                    data['mode'] = 'unknown'
                    valid = False

            self._update_mapping(host, {path: valid})

            return self._build_application_state(
                name, path, ApplicationStatus.STOPPED, host, stat, data)

        # ephemeral node
        # watch node to see if it goes away
        self._zoo_keeper.get_children(os.path.dirname(path),
                                      watch=self._on_update)
        host = os.path.basename(path)
        # if it is running, path = /app/path/HOSTNAME
        # need to convert to /app/path to get the app_details
        config_path = os.path.dirname(path)
        parent_data, parent_stat = self._get_app_details(config_path)

        self._update_mapping(host, {config_path: True})

        # last_update comes from the ephemeral node's own stat
        return self._build_application_state(
            parent_data.get('name', os.path.basename(config_path)),
            config_path, ApplicationStatus.RUNNING, host, stat, parent_data)

    def _update_mapping(self, host, data):
        """
        Merge ``data`` into the mapping kept for ``host``.

        :type host: str
        :type data: dict
            {'/some/path', bool}
        """
        d = self._path_to_host_mapping.get(host, {})
        d.update(data)
        self._path_to_host_mapping[host] = d

    def _on_update(self, event):
        """
        Callback to send updates via websocket on application state
        changes.

        :type event: kazoo.protocol.states.WatchedEvent
        """
        self._on_update_path(event.path)

    def _on_update_path(self, path):
        """
        Re-walk ``path``, merge the result into the cache, queue it for
        clients and forward the states to the time estimate cache.

        :type path: str
        """
        try:
            message = ApplicationStatesMessage()
            self._walk(path, message)
            self._cache.update(message.application_states)
            self._cache.remove_deletes()
            self._message_throttle.add_message(message)
            self._time_estimate_cache.update_states(
                self._cache.application_states)
        except Exception:
            logging.exception('An unhandled Exception has occurred')

    def _on_agent_state_update(self, event):
        """
        This is to capture when an agent goes down.

        :type event: kazoo.protocol.states.WatchedEvent
        """
        host = os.path.basename(event.path)
        logging.info(
            'Agent on host {0} has changed up/down state.'.format(host))
        paths = self._path_to_host_mapping.get(host, {})
        # Iterate a snapshot: _on_update_path ends up in _update_mapping,
        # which updates this very dictionary.
        for p in list(paths):
            self._on_update_path(p)

    def _get_last_command(self, data):
        """
        Derive the last command from the 'state' entry of ``data``.

        :type data: dict
        :rtype: str
        """
        state = data.get('state', 'Unknown')
        if state in ['starting', 'started']:
            return "Start"
        elif state in ['stopping', 'stopped']:
            return "Stop"
        else:
            return ''

    def _get_existing_attribute(self, path, attr):
        """
        Look for existing value for some value in the app state cache.
        If there is an override value, use that. Else use the existing
        state. A missing or unreadable override node means "no override".

        :type path: str
        :type attr: str
        """
        state = self._cache.application_states.get(path, None)

        override = {}
        try:
            data, stat = self._zoo_keeper.get(
                self._configuration.override_node)
            override = json.loads(data)
        except NoNodeError:
            # Must not escape: _walk treats NoNodeError as "this application
            # node was deleted", which is wrong when only the override node
            # is missing.
            logging.debug('Override node {0} does not exist; no overrides '
                          'applied'.format(
                              self._configuration.override_node))
        except (TypeError, ValueError) as err:
            logging.critical('There was a problem returning values from the '
                             'override cache: {0}'.format(err))

        setting = override.get(path, {}).get(attr, None)
        if setting is not None:
            return setting
        elif state is None:
            return None
        else:
            return state.get(attr)
class ApplicationDependencyCache(object):
    """
    Cache of application dependencies parsed from the agent configuration
    XML stored in ZooKeeper. Changes are pushed to web socket clients
    through a message throttle and forwarded to the time estimate cache.
    """

    def __init__(self, configuration, zoo_keeper, web_socket_clients,
                 time_estimate_cache):
        """
        :type configuration: zoom.config.configuration.Configuration
        :type zoo_keeper: kazoo.client.KazooClient
        :type web_socket_clients: list
        :param time_estimate_cache: receives the dependency data through
            its ``update_dependencies`` method
        """
        self._cache = ApplicationDependenciesMessage()
        self._configuration = configuration
        self._zoo_keeper = zoo_keeper
        self._web_socket_clients = web_socket_clients
        self._time_estimate_cache = time_estimate_cache
        self._message_throttle = MessageThrottle(configuration,
                                                 web_socket_clients)

    def start(self):
        """Start the underlying message throttle."""
        self._message_throttle.start()

    def stop(self):
        """Stop the underlying message throttle."""
        self._message_throttle.stop()

    def load(self):
        """
        Return the cached dependencies, loading them first when empty.
        On any exception the error is logged and ``None`` is returned.

        :rtype: ApplicationDependenciesMessage
        """
        try:
            if not self._cache.application_dependencies:
                self._load()
            return self._cache
        except Exception:
            logging.exception('An unhandled Exception has occurred')

    def reload(self):
        """
        Clear cache, and re-walk agent config path.
        """
        self._cache.clear()
        logging.info("Application dependency cache cleared")
        self._on_update_path(self._configuration.agent_configuration_path)

    @TimeThis(__file__)
    def _load(self):
        """
        Walk full agent config path to get data. Load self._cache object
        """
        self._cache.clear()
        self._walk(self._configuration.agent_configuration_path, self._cache)
        logging.info(
            "Application dependency cache loaded from ZooKeeper {0}".format(
                self._configuration.agent_configuration_path))
        self._recalc_downstream_dependencies()
        self._time_estimate_cache.update_dependencies(
            self._cache.application_dependencies)

    @connected_with_return(None)
    def _walk(self, path, result):
        """
        Recursively walk ``path``; each node without children is parsed for
        dependencies into ``result``.

        :type path: str
        :type result: ApplicationDependenciesMessage
        """
        try:
            children = self._zoo_keeper.get_children(path,
                                                     watch=self._on_update)
            if children:
                for child in children:
                    self._walk(zk_path_join(path, child), result)
            else:
                self._get_application_dependency(path, result)
        except NoNodeError:
            logging.debug('Node at {0} no longer exists.'.format(path))

    def _get_application_dependency(self, path, result):
        """
        Load result object with application dependencies

        One entry is added per ``Automation/Component`` element, keyed by
        its registration path. Parse errors are logged, not raised.

        :type path: str
        :type result: ApplicationDependenciesMessage
        """
        if not self._zoo_keeper.exists(path):
            logging.warning("config path does not exist: {0}".format(path))
            return

        raw_config, stat = self._zoo_keeper.get(path, watch=self._on_update)
        if not raw_config:
            return

        try:
            root = ElementTree.fromstring(raw_config)

            for node in root.findall('Automation/Component'):
                app_id = node.attrib.get('id')
                registrationpath = node.attrib.get('registrationpath', None)

                # default registration path lives under the state path
                if registrationpath is None:
                    registrationpath = zk_path_join(
                        self._configuration.application_state_path, app_id)

                start_action = node.find('Actions/Action[@id="start"]')

                if start_action is None:
                    logging.warning("No Start Action Found for {0}".format(
                        registrationpath))
                    dependencies = list()
                else:
                    dependencies = self._parse_dependencies(start_action)

                entry = {
                    "configuration_path": registrationpath,
                    "dependencies": dependencies,
                    "downstream": list()
                }

                result.update({registrationpath: entry})

        except Exception:
            logging.exception('An unhandled exception occurred')

    def _parse_dependencies(self, action):
        """
        Parse dependencies out of XML

        A predicate without a ``type`` attribute matches no branch and is
        skipped. A good-until-time predicate without a ``path`` is logged
        as an invalid GUT path.

        :type action: xml.etree.ElementTree.Element
        :rtype: list
        """
        # TODO: rename 'path' when it really isn't a path. this is a hack...
        # prev_was_not keeps track of whether the outer class was 'not'
        dependencies = []
        prev_was_not = False

        for predicate in action.iter('Predicate'):
            # tolerate a missing type rather than raising AttributeError
            pred_type = (predicate.get('type') or '').lower()
            pred_path = predicate.get('path', None)
            pred_oper = bool(
                verify_attribute(predicate, 'operational',
                                 none_allowed=True))

            if pred_type in (PredicateType.ZOOKEEPERHASCHILDREN,
                             PredicateType.ZOOKEEPERHASGRANDCHILDREN):
                dependencies.append({
                    'type': pred_type,
                    'path': pred_path,
                    'operational': pred_oper
                })
                prev_was_not = False

            elif pred_type == PredicateType.ZOOKEEPERGOODUNTILTIME:
                # a missing path counts as an invalid GUT path
                gut_parts = pred_path.split('gut/') if pred_path else []
                if len(gut_parts) > 1:
                    dependencies.append({
                        'type': pred_type,
                        'path': ("I should be up between: {0}".format(
                            gut_parts[1])),
                        'operational': pred_oper
                    })
                else:
                    logging.debug('Invalid GUT path: {0}'.format(pred_path))
                prev_was_not = False

            elif pred_type == PredicateType.HOLIDAY:
                dependencies.append({
                    'type': pred_type,
                    'path': ("Does NOT run on holidays"
                             if prev_was_not else "Runs on holidays"),
                    'operational': pred_oper
                })
                prev_was_not = False

            elif pred_type == PredicateType.WEEKEND:
                dependencies.append({
                    'type': pred_type,
                    'path': ("Does NOT run on weekends"
                             if prev_was_not else "Runs on weekends"),
                    'operational': pred_oper
                })
                prev_was_not = False

            elif pred_type == PredicateType.TIMEWINDOW:
                begin = predicate.get('begin', None)
                end = predicate.get('end', None)
                weekdays = predicate.get('weekdays', None)
                msg = 'I should be up '
                if begin is not None:
                    msg += 'after: {0} '.format(begin)
                if end is not None:
                    msg += 'until: {0}'.format(end)

                # only send dependency if there is something to send
                if begin is not None or end is not None:
                    dependencies.append({
                        'type': pred_type,
                        'path': msg,
                        'operational': pred_oper
                    })

                # pretend this is a weekend predicate for convenience
                if weekdays is not None:
                    day_range = TimeWindow.parse_range(weekdays)
                    if (Weekdays.SATURDAY in day_range or
                            Weekdays.SUNDAY in day_range):
                        wk_msg = 'Runs on weekends'
                    else:
                        wk_msg = 'Does NOT run on weekends'

                    dependencies.append({
                        'type': PredicateType.WEEKEND,
                        'path': wk_msg,
                        'operational': pred_oper
                    })

            elif pred_type == PredicateType.NOT:
                prev_was_not = True

        return dependencies

    @TimeThis(__file__)
    def _recalc_downstream_dependencies(self, tries=0):
        """
        Loop over existing cache and link upstream with downstream elements

        On ``RuntimeError`` the method sleeps one second and tries again,
        giving up after three attempts.

        :type tries: int
        """
        try:
            all_deps = self._cache.application_dependencies

            # clear existing downstream
            for data in all_deps.itervalues():
                downstream = data.get('downstream')
                del downstream[:]

            dep_copy = all_deps.copy()
            for path, data in dep_copy.iteritems():
                for key, value in all_deps.iteritems():
                    for dep in data.get('dependencies'):
                        dep_type = dep['type']
                        if dep_type == PredicateType.ZOOKEEPERHASGRANDCHILDREN:
                            if key.startswith(dep['path']):
                                value.get('downstream').append(path)
                        elif dep_type == PredicateType.ZOOKEEPERHASCHILDREN:
                            if dep['path'] == key:
                                value.get('downstream').append(path)

        except RuntimeError:
            time.sleep(1)
            tries += 1
            if tries < 3:
                self._recalc_downstream_dependencies(tries=tries)

    def _on_update(self, event):
        """
        Callback to send updates via websocket on application state
        changes.

        :type event: kazoo.protocol.states.WatchedEvent
        """
        self._on_update_path(event.path)

    def _on_update_path(self, path):
        """
        Re-walk ``path``, merge the result into the cache, relink
        downstream entries, queue the update for clients and forward the
        dependencies to the time estimate cache.

        :type path: str
        """
        try:
            message = ApplicationDependenciesMessage()
            self._walk(path, message)
            self._cache.update(message.application_dependencies)
            self._recalc_downstream_dependencies()
            self._message_throttle.add_message(message)
            self._time_estimate_cache.update_dependencies(
                self._cache.application_dependencies)
        except Exception:
            logging.exception('An unhandled Exception has occurred for path: '
                              '{0}'.format(path))
class ApplicationStateCache(object):
    """
    Cache of application states read from ZooKeeper. Changes are pushed to
    web socket clients through a message throttle and forwarded to the
    time estimate cache.
    """

    def __init__(self, configuration, zoo_keeper, web_socket_clients,
                 time_estimate_cache):
        """
        :type configuration: zoom.config.configuration.Configuration
        :type zoo_keeper: zoom.zoo_keeper.ZooKeeper
        :type web_socket_clients: list
        :type time_estimate_cache:
            zoom.www.cache.time_estimate_cache.TimeEstimateCache
        """
        # host -> {path: bool}, maintained by _update_mapping
        self._path_to_host_mapping = dict()
        self._configuration = configuration
        self._cache = ApplicationStatesMessage()
        self._cache.set_environment(self._configuration.environment)
        self._zoo_keeper = zoo_keeper
        self._web_socket_clients = web_socket_clients
        self._time_estimate_cache = time_estimate_cache
        self._message_throttle = MessageThrottle(configuration,
                                                 web_socket_clients)

    @property
    def host_mapping(self):
        """The host -> {path: bool} mapping built while walking states."""
        return self._path_to_host_mapping

    def start(self):
        """Start the underlying message throttle."""
        self._message_throttle.start()

    def stop(self):
        """Stop the underlying message throttle."""
        self._message_throttle.stop()

    def load(self):
        """
        Return the cached states, loading them first if the cache is empty.

        :rtype: zoom.messages.application_states.ApplicationStatesMessage
        """
        if not len(self._cache):
            self._load()
        return self._cache

    def reload(self):
        """Clear the cache and re-walk the application state path."""
        self._cache.clear()
        self._on_update_path(self._configuration.application_state_path)

    @TimeThis(__file__)
    def _load(self):
        """Rebuild the cache from scratch by walking the state path."""
        self._cache.clear()
        self._walk(self._configuration.application_state_path, self._cache)
        logging.info("Application state cache loaded from ZooKeeper {0}"
                     .format(self._configuration.application_state_path))
        self._time_estimate_cache.update_states(
            self._cache.application_states)
        self._cache.remove_deletes()

    def _build_default_override_store(self):
        """Create the override node holding an empty JSON object."""
        logging.debug('Override storage node not found, creating')
        _template = {}
        self._zoo_keeper.create(self._configuration.override_node,
                                json.dumps(_template),
                                makepath=True)

    def _update_override_info(self, path, override_key, override_value):
        """
        Update zookeeper with override values passed in.

        If the override node does not exist it is created and the update is
        attempted again.
        """
        logging.debug("_update_override_info: path: {0}, override_key: {1} override_value: {2}"
                      .format(path, override_key, override_value))
        try:
            override_str, stat = self._zoo_keeper.get(
                self._configuration.override_node)
            override_dict = json.loads(override_str)
            update = {override_key: override_value}
            state = override_dict.get(path, {})
            state.update(update)
            override_dict.update({path: state})
            self._zoo_keeper.set(self._configuration.override_node,
                                 json.dumps(override_dict))
        except NoNodeError as err:
            logging.debug('Unable to find {0}, {1}'
                          .format(self._configuration.override_node, err))
            self._build_default_override_store()
            self._update_override_info(path, override_key, override_value)

    def manual_update(self, path, key, value):
        """
        Manual override from client of specific value. Paths that are not
        in the cache are ignored.

        :type path: str
        :type key: str
        :type value: str
        """
        state = self._cache.application_states.get(path, None)
        if state is not None:
            self._update_override_info(path, key, value)
            message = ApplicationStatesMessage()
            state[key] = value
            message.update({path: state})
            self._message_throttle.add_message(message)

    @connected_with_return(None)
    def _walk(self, path, result):
        """
        Recursively walk ``path``; each node without children is recorded
        in ``result``. A node that no longer exists is recorded as a
        delete.

        :type path: str
        :type result:
            zoom.www.messages.application_states.ApplicationStatesMessage
        """
        try:
            children = self._zoo_keeper.get_children(path,
                                                     watch=self._on_update)
            if children:
                for child in children:
                    self._walk(zk_path_join(path, child), result)
            else:
                app_state = self._get_application_state(path)
                result.update(
                    {app_state.configuration_path: app_state.to_dictionary()}
                )
        except NoNodeError:
            result.update({path: ApplicationState(configuration_path=path,
                                                  delete=True).to_dictionary(),
                           })
        except Exception:
            logging.exception('An unhandled Exception has occurred while '
                              'running ApplicationStateCache.walk.')

    def _get_app_details(self, path):
        """
        Read and JSON-decode the node at ``path``. Empty or undecodable
        data yields an empty dictionary.

        :type path: str
        :rtype: dict, kazoo.protocol.states.ZnodeStat
        """
        raw_data, stat = self._zoo_keeper.get(path, watch=self._on_update)
        data = {}
        if raw_data:
            try:
                data = json.loads(raw_data)
            except ValueError:
                pass
        return data, stat

    def _build_application_state(self, name, config_path, status, host,
                                 stat, data):
        """
        Assemble an ApplicationState from node data.

        :type name: str
        :type config_path: str
        :param status: an ApplicationStatus value
        :type host: str
        :param stat: stat whose ``last_modified`` becomes ``last_update``
        :type data: dict
        :rtype: zoom.entities.application_state.ApplicationState
        """
        return ApplicationState(
            application_name=name,
            configuration_path=config_path,
            application_status=status,
            application_host=host,
            last_update=stat.last_modified,
            start_stop_time=data.get('start_stop_time', ''),
            error_state=data.get('state', 'unknown'),
            local_mode=data.get('mode', 'unknown'),
            login_user=data.get('login_user', 'Zoom'),
            read_only=data.get('read_only', False),
            last_command=self._get_last_command(data),
            pd_disabled=self._get_existing_attribute(config_path,
                                                     'pd_disabled'),
            grayed=self._get_existing_attribute(config_path, 'grayed'),
            platform=data.get('platform', 'unknown'),
            restart_count=data.get('restart_count', 0),
            load_times=self._time_estimate_cache.get_graphite_data(
                config_path)
        )

    def _get_application_state(self, path):
        """
        Build the state for the node at ``path``. A persistent node is
        reported as stopped; an ephemeral node is reported as running on
        the host named by its last path component.

        :type path: str
        :rtype: zoom.entities.application_state.ApplicationState
        """
        data, stat = self._get_app_details(path)

        # persistent node
        if stat.ephemeralOwner == 0:
            # watch node to see if children are created
            self._zoo_keeper.get_children(path, watch=self._on_update)
            host = data.get('host', 'Unknown')
            name = data.get('name', os.path.basename(path))
            agent_path = zk_path_join(self._configuration.agent_state_path,
                                      host)

            # if the agent is down, update state and mode with unknown
            agent_up = bool(self._zoo_keeper.exists(
                agent_path,
                watch=self._on_agent_state_update))

            valid = True
            if host in (None, 'Unknown'):
                data['state'] = 'invalid'
                data['mode'] = 'unknown'
                valid = False
            elif not agent_up:
                data['state'] = 'unknown'
                data['mode'] = 'unknown'
                valid = False
            else:
                # agent is up: the app must be one of its components
                d, s = self._zoo_keeper.get(agent_path)
                registered_comps = json.loads(d).get('components', [])
                if name not in registered_comps:
                    data['state'] = 'invalid'
                    data['mode'] = 'unknown'
                    valid = False

            self._update_mapping(host, {path: valid})

            application_state = self._build_application_state(
                name, path, ApplicationStatus.STOPPED, host, stat, data)

        # ephemeral node
        else:
            # watch node to see if it goes away
            self._zoo_keeper.get_children(os.path.dirname(path),
                                          watch=self._on_update)
            host = os.path.basename(path)
            # if it is running, path = /app/path/HOSTNAME
            # need to convert to /app/path to get the app_details
            config_path = os.path.dirname(path)
            parent_data, parent_stat = self._get_app_details(config_path)

            self._update_mapping(host, {config_path: True})

            # last_update comes from the ephemeral node's own stat
            application_state = self._build_application_state(
                parent_data.get('name', os.path.basename(config_path)),
                config_path, ApplicationStatus.RUNNING, host, stat,
                parent_data)

        return application_state

    def _update_mapping(self, host, data):
        """
        Merge ``data`` into the mapping kept for ``host``.

        :type host: str
        :type data: dict
            {'/some/path', bool}
        """
        d = self._path_to_host_mapping.get(host, {})
        d.update(data)
        self._path_to_host_mapping[host] = d

    def _on_update(self, event):
        """
        Callback to send updates via websocket on application state
        changes.

        :type event: kazoo.protocol.states.WatchedEvent
        """
        self._on_update_path(event.path)

    def _on_update_path(self, path):
        """
        Re-walk ``path``, merge the result into the cache, queue it for
        clients and forward the states to the time estimate cache.

        :type path: str
        """
        try:
            message = ApplicationStatesMessage()
            self._walk(path, message)
            self._cache.update(message.application_states)
            self._cache.remove_deletes()
            self._message_throttle.add_message(message)
            self._time_estimate_cache.update_states(
                self._cache.application_states)
        except Exception:
            logging.exception('An unhandled Exception has occurred')

    def _on_agent_state_update(self, event):
        """
        This is to capture when an agent goes down.

        :type event: kazoo.protocol.states.WatchedEvent
        """
        host = os.path.basename(event.path)
        logging.info('Agent on host {0} has changed up/down state.'.format(host))
        paths = self._path_to_host_mapping.get(host, {})
        # Iterate a snapshot: _on_update_path ends up in _update_mapping,
        # which updates this very dictionary.
        for p in list(paths):
            self._on_update_path(p)

    def _get_last_command(self, data):
        """
        Derive the last command from the 'state' entry of ``data``.

        :type data: dict
        :rtype: str
        """
        state = data.get('state', 'Unknown')
        if state in ['starting', 'started']:
            return "Start"
        elif state in ['stopping', 'stopped']:
            return "Stop"
        else:
            return ''

    def _get_existing_attribute(self, path, attr):
        """
        Look for existing value for some value in the app state cache.
        If there is an override value, use that. Else use the existing
        state. A missing or unreadable override node means "no override".

        :type path: str
        :type attr: str
        """
        state = self._cache.application_states.get(path, None)

        override = {}
        try:
            data, stat = self._zoo_keeper.get(
                self._configuration.override_node)
            override = json.loads(data)
        except NoNodeError:
            # Must not escape: _walk treats NoNodeError as "this application
            # node was deleted", which is wrong when only the override node
            # is missing.
            logging.debug('Override node {0} does not exist; no overrides '
                          'applied'.format(
                              self._configuration.override_node))
        except (TypeError, ValueError) as err:
            logging.critical('There was a problem returning values from the '
                             'override cache: {0}'.format(err))

        setting = override.get(path, {}).get(attr, None)
        if setting is not None:
            return setting
        elif state is None:
            return None
        else:
            return state.get(attr)
class TimeEstimateCache(object):
    """
    Estimate how long it takes to bring up everything that is not running.

    The estimate is recomputed from three inputs: application states
    (``update_states``), application dependencies (``update_dependencies``)
    and per-application startup times read from graphite
    (``get_graphite_data``). Results are handed to a MessageThrottle built
    over the web socket clients.
    """

    def __init__(self, configuration, web_socket_clients):
        """
        :type configuration: zoom.www.config.configuration.Configuration
        :type web_socket_clients: list
        """
        self.configuration = configuration
        self._web_socket_clients = web_socket_clients
        self._message_throttle = MessageThrottle(configuration,
                                                 web_socket_clients)
        self.graphite = GraphiteAvailability(
            configuration.graphite_host,
            recheck=configuration.graphite_recheck)
        # path -> {'ave': .., 'max': .., 'min': ..} as read from graphite
        self.graphite_cache = {}
        # path -> dependency data, merged in by update_dependencies()
        self.dependencies = {}
        # path -> application state, merged in by update_states()
        self.states = {}

    def start(self):
        """Start the message throttle."""
        self._message_throttle.start()

    def stop(self):
        """Stop the message throttle."""
        self._message_throttle.stop()

    def reload(self):
        """Drop the cached graphite data and recompute the estimate."""
        self.graphite_cache.clear()
        self.load()

    def update_states(self, states):
        """
        Merge ``states`` into the known states and recompute (and send) the
        estimate.

        :type states: dict
        """
        self.states.update(states)
        self.load(send=True)

    def update_dependencies(self, deps):
        """
        Merge ``deps`` into the known dependencies and recompute (and send)
        the estimate.

        :type deps: dict
        """
        self.dependencies.update(deps)
        self.load(send=True)

    @TimeThis(__file__)
    def load(self, send=False):
        """
        Recompute the time estimate over every known dependency path.

        A RuntimeError (which is what runaway recursion in ``_get_max_cost``
        raises) produces a message carrying an 'error_msg' instead of the
        timings. Any other exception is logged and None is returned.

        :type send: bool
            Whether to send messages to clients.
        :rtype: zoom.www.messages.global_mode_message.TimeEstimateMessage
        """
        logging.debug("Recomputing Timing Estimates...")
        # Pre-define path in case we except out before it's declared
        path = None
        try:
            message = TimeEstimateMessage()
            cost = self._get_default_data()
            maxpath = "None"
            # memo shared by every _get_max_cost call of this computation
            searchdata = {}
            if self.states and self.graphite.available:
                for path in self.dependencies:
                    data = self._get_max_cost(path, searchdata)
                    # compare before folding, otherwise cost already holds
                    # this path's maximum
                    if data['max'] > cost['max']:
                        maxpath = path
                    self._get_greatest_cost(cost, data)

            message.update({
                'maxtime': cost['max'],
                'mintime': cost['min'],
                'avetime': cost['ave'],
                'maxpath': maxpath
            })
            self._send_if_ready(message, send)
            return message

        except RuntimeError as e:
            message = TimeEstimateMessage()
            logging.exception(e)
            # format() instead of '+': path is still None when the error
            # happened before the loop, and str + None would raise here
            message.update(
                {'error_msg':
                 'Likely circular dependency on {0}'.format(path)})
            self._send_if_ready(message, send)
            return message

        except Exception as e:
            logging.exception(e)
            return None

    def _send_if_ready(self, message, send):
        """
        Queue ``message`` only when sending was requested and both
        dependencies and states are non-empty.

        :type send: bool
        """
        if all((send, self.dependencies, self.states)):
            self._message_throttle.add_message(message)

    def _get_max_cost(self, path, searchdata):
        """
        Return the greatest cost among the dependencies of ``path``, plus the
        cost of ``path`` itself when it is not running.

        :type path: str
        :type searchdata: dict
            Memo of already computed costs, keyed by path.
        :rtype: dict
            Example: {'ave': 0, 'max': 0, 'min': 0}
        """
        # init internal search data
        cached_cost = searchdata.get(path, None)
        if cached_cost is not None:
            return cached_cost

        dep_data = self.dependencies.get(path, None)
        greatest_cost = self._get_default_data()

        # take greatest_cost and record the largest cost from all a path's
        # dependencies.
        if dep_data:
            for dep in dep_data['dependencies']:
                # expecting: dep = {'path': '/foo/bar', 'type': 'baz'}
                dep_type = (dep.get('type') or '').lower()
                if dep_type == PredicateType.ZOOKEEPERHASCHILDREN:
                    child_cost = self._get_max_cost(dep.get("path", None),
                                                    searchdata)
                    self._get_greatest_cost(greatest_cost, child_cost)
                elif dep_type == PredicateType.ZOOKEEPERHASGRANDCHILDREN:
                    # keys are compared lowercased, so the prefix has to be
                    # lowercased as well or a mixed-case prefix never matches
                    grand_path = (dep.get("path") or '').lower()
                    if not grand_path:
                        continue
                    for key in self.dependencies:
                        lowered = key.lower()
                        if (lowered.startswith(grand_path) and
                                lowered != grand_path):
                            child_cost = self._get_max_cost(key, searchdata)
                            self._get_greatest_cost(greatest_cost,
                                                    child_cost)

        # if application is not running, add its cost (from graphite) to the
        # greatest cost of its dependencies
        state = self.states.get(path, None)
        if (state is not None and
                state.get('application_status', None) != "running"):
            graphite_data = self.get_graphite_data(path)
            self._add_data(greatest_cost, graphite_data)

        searchdata.update({path: greatest_cost})
        return greatest_cost

    def _get_default_data(self):
        """
        :rtype: dict
            A fresh all-zero cost: {'ave': 0, 'max': 0, 'min': 0}
        """
        return {'ave': 0, 'max': 0, 'min': 0}

    def _get_greatest_cost(self, d1, d2):
        """
        Update greatest cost values with the greater between the current
        value and the cached value. ``d1`` is modified in place.

        :type d1: dict
        :type d2: dict
        """
        for i in ['ave', 'min', 'max']:
            d1[i] = max(d1[i], d2[i])

    def _add_data(self, d1, d2):
        """
        Add an apps own startup time (from graphite) to greatest cost if it
        is not running. ``d1`` is modified in place.

        :type d1: dict
        :type d2: dict
        """
        for i in ['ave', 'min', 'max']:
            d1[i] += d2[i]

    @staticmethod
    def _datapoint_value(series, index):
        """
        Return the value of datapoint ``index`` of a graphite series.

        A null (None) value is reported as 0 so the result can be compared
        and summed with the other costs.

        :type series: dict
        :type index: int
        """
        value = series['datapoints'][index][0]
        return 0 if value is None else value

    def get_graphite_data(self, path):
        """
        Grab startup times from graphite for a path.

        The result is cached per path as soon as graphite answers; an
        answer with a non-OK status is cached as all zeroes. When the
        request or the parsing fails, the error is logged and zeroes are
        returned.

        :type path: str
        :rtype: dict
            Example: {'min': 0, 'max': 0, 'ave': 0}
        """
        cached = self.graphite_cache.get(path, None)
        if cached is not None:
            return cached

        try:
            app_path = path.split('/spot/software/state/')[1]
            app_path = app_path.replace('/', '.')
            url = (
                "http://{0}/render?format=json&from=-7d"
                "&target=alias(aggregateLine(Infrastructure.startup.{1}.runtime,'max'),'max')"
                "&target=alias(aggregateLine(Infrastructure.startup.{1}.runtime,'min'),'min')"
                "&target=alias(aggregateLine(Infrastructure.startup.{1}.runtime,'avg'),'avg')"
                .format(self.configuration.graphite_host, app_path))

            response = requests.get(url, timeout=.5)

            # cached before the status check: a non-OK answer is remembered
            # as zeroes until reload() clears the cache
            costs = self._get_default_data()
            self.graphite_cache[path] = costs

            if response.status_code == httplib.OK:
                for series in response.json():
                    target = series['target']
                    if "avg" in target:
                        costs['ave'] = self._datapoint_value(series, 0)
                    elif "max" in target:
                        costs['max'] = self._datapoint_value(series, -1)
                    elif "min" in target:
                        costs['min'] = self._datapoint_value(series, 0)
                    else:
                        logging.warning(
                            "Received graphite data {} with unknown "
                            "target".format(series))

            return costs

        except Exception:
            logging.exception('Error getting startup data from graphite for '
                              'path: {0}'.format(path))
            return self._get_default_data()
class ApplicationStateCache(object):
    """
    Cache of application states read from ZooKeeper.

    The tree under ``configuration.application_state_path`` is walked with
    watches set on every node that is read; watch callbacks re-walk the
    affected path, update the cache and queue the change on a
    MessageThrottle built over the web socket clients.
    """

    def __init__(self, configuration, zoo_keeper, web_socket_clients,
                 time_estimate_cache):
        """
        :type configuration: zoom.config.configuration.Configuration
        :type zoo_keeper: zoom.zoo_keeper.ZooKeeper
        :type web_socket_clients: list
        :type time_estimate_cache: zoom.www.cache.time_estimate_cache.TimeEstimateCache
        """
        # host -> set of application paths seen on that host
        self._path_to_host_mapping = dict()
        self._configuration = configuration
        self._cache = ApplicationStatesMessage()
        self._cache.set_environment(self._configuration.environment)
        self._zoo_keeper = zoo_keeper
        self._web_socket_clients = web_socket_clients
        self._time_estimate_cache = time_estimate_cache
        self._message_throttle = MessageThrottle(configuration,
                                                 web_socket_clients)
        # only used when _get_last_command() is called without a path
        self._last_command = None
        # configuration path -> last command seen for that application
        self._last_commands = {}

    def start(self):
        """Start the message throttle."""
        self._message_throttle.start()

    def stop(self):
        """Stop the message throttle."""
        self._message_throttle.stop()

    def load(self):
        """
        Return the cache, filling it from ZooKeeper first when it is empty.

        :rtype: zoom.messages.application_states.ApplicationStatesMessage
        """
        if not len(self._cache):
            self._load()
        return self._cache

    def reload(self):
        """Clear the cache and re-walk the whole application state path."""
        self._cache.clear()
        self._on_update_path(self._configuration.application_state_path)

    def _load(self):
        """Rebuild the cache from the application state path."""
        self._cache.clear()

        self._walk(self._configuration.application_state_path, self._cache)
        logging.info("Application state cache loaded from ZooKeeper {0}"
                     .format(self._configuration.application_state_path))

        # same order as _on_update_path: drop the entries flagged for
        # deletion before the states are handed to the time estimate cache
        self._cache.remove_deletes()
        self._time_estimate_cache.update_states(
            self._cache.application_states)

    @connected_with_return(None)
    def _walk(self, path, result):
        """
        Recursively collect the application state of every leaf node under
        ``path`` into ``result``.

        A node that no longer exists is recorded as a delete entry; any
        other exception is logged and the node is skipped.

        :type path: str
        :type result: zoom.www.messages.application_states.ApplicationStatesMessage
        """
        try:
            children = self._zoo_keeper.get_children(path,
                                                     watch=self._on_update)

            if children:
                for child in children:
                    self._walk(os.path.join(path, child), result)
            else:
                app_state = self._get_application_state(path)
                result.update(
                    {app_state.configuration_path: app_state.to_dictionary()}
                )

        except NoNodeError:
            logging.debug('Node at {0} no longer exists.'.format(path))
            result.update({path: ApplicationState(
                configuration_path=path, delete=True).to_dictionary()})

        except Exception:
            logging.exception('An unhandled Exception has occurred while '
                              'running ApplicationStateCache.walk.')

    def _get_app_details(self, path):
        """
        Read the node at ``path`` (setting a watch) and decode its JSON data.

        Empty, undecodable or non-dict data yields an empty dict, so callers
        can always use ``data.get``.

        :type path: str
        :rtype: dict, kazoo.protocol.states.ZnodeStat
        """
        raw_data, stat = self._zoo_keeper.get(path, watch=self._on_update)

        data = {}
        if raw_data:
            try:
                data = json.loads(raw_data)
            except (TypeError, ValueError):
                # best effort: a node without usable JSON is treated as
                # having no details
                logging.debug('Node at {0} does not hold valid JSON.'
                              .format(path))
                data = {}

        if not isinstance(data, dict):
            data = {}

        return data, stat

    def _get_application_state(self, path):
        """
        Build the ApplicationState for the leaf node at ``path``.

        A persistent node is reported as STOPPED under its own path. An
        ephemeral node is reported as RUNNING under its parent path, with
        the node name used as the host.

        :type path: str
        :rtype: zoom.entities.application_state.ApplicationState
        """
        data, stat = self._get_app_details(path)

        # persistent node
        if stat.ephemeralOwner == 0:
            # watch node to see if children are created
            self._zoo_keeper.get_children(path, watch=self._on_update)
            host = data.get('host', 'Unknown')

            if self._agent_is_up(host):
                self._remember_path(host, path)
            else:
                # if the agent is down, update state and mode with unknown
                data['state'] = 'unknown'
                data['mode'] = 'unknown'

            return self._build_state(path, ApplicationStatus.STOPPED, host,
                                     stat, data)

        # ephemeral node
        # watch node to see if it goes away
        self._zoo_keeper.get_children(os.path.dirname(path),
                                      watch=self._on_update)

        host = os.path.basename(path)
        # if it is running, path = /app/path/HOSTNAME
        # need to convert to /app/path to get the app_details
        config_path = os.path.dirname(path)
        parent_data, parent_stat = self._get_app_details(config_path)

        self._remember_path(host, config_path)

        return self._build_state(config_path, ApplicationStatus.RUNNING,
                                 host, stat, parent_data)

    def _agent_is_up(self, host):
        """
        Tell whether the agent node for ``host`` exists, leaving a watch on
        it. A None host counts as down and is never joined into a path.

        :rtype: bool
        """
        if host is None:
            return False
        agent_path = os.path.join(self._configuration.agent_state_path, host)
        return bool(self._zoo_keeper.exists(
            agent_path, watch=self._on_agent_state_update))

    def _remember_path(self, host, path):
        """
        Record that ``path`` belongs to ``host``. A host can serve several
        applications, so every path is kept.
        """
        self._path_to_host_mapping.setdefault(host, set()).add(path)

    def _build_state(self, config_path, status, host, stat, data):
        """
        Assemble an ApplicationState from node details.

        :type config_path: str
        :type host: str
        :type stat: kazoo.protocol.states.ZnodeStat
        :type data: dict
        :rtype: zoom.entities.application_state.ApplicationState
        """
        return ApplicationState(
            application_name=data.get('name',
                                      os.path.basename(config_path)),
            configuration_path=config_path,
            application_status=status,
            application_host=host,
            completion_time=stat.last_modified,
            trigger_time=data.get('trigger_time', ''),
            error_state=data.get('state', 'unknown'),
            local_mode=data.get('mode', 'unknown'),
            login_user=data.get('login_user', 'Zoom'),
            fqdn=data.get('fqdn', host),
            last_command=self._get_last_command(data, config_path)
        )

    def _on_update(self, event):
        """
        Callback to send updates via websocket on application state changes.

        :type event: kazoo.protocol.states.WatchedEvent
        """
        self._on_update_path(event.path)

    def _on_update_path(self, path):
        """
        Re-walk ``path``, merge the result into the cache, queue the change
        for the clients and hand the states to the time estimate cache.
        Exceptions are logged, not raised.

        :type path: str
        """
        try:
            message = ApplicationStatesMessage()

            self._walk(path, message)

            self._cache.update(message.application_states)
            self._cache.remove_deletes()

            self._message_throttle.add_message(message)
            self._time_estimate_cache.update_states(
                self._cache.application_states)

        except Exception:
            logging.exception('An unhandled Exception has occurred')

    def _on_agent_state_update(self, event):
        """
        This is to capture when an agent goes down. Every application path
        recorded for the agent's host is refreshed.

        :type event: kazoo.protocol.states.WatchedEvent
        """
        host = os.path.basename(event.path)
        paths = self._path_to_host_mapping.get(host, None)
        if not paths:
            return
        # sorted() takes a snapshot: refreshing a path can add to the set
        for path in sorted(paths):
            self._on_update_path(path)

    def _get_last_command(self, data, path=None):
        """
        Return the last command known for an application.

        The 'starting' state maps to "Start" and 'stopping' to "Stop"; any
        other state returns the command remembered from an earlier call
        (None when there is none).

        :type data: dict
        :type path: str or None
            Configuration path used to remember the command per
            application. Without it a single memory shared by all callers
            is used, as before.
        :rtype: str or None
        """
        state = data.get('state', 'Unknown')
        if state == 'starting':
            command = "Start"
        elif state == 'stopping':
            command = "Stop"
        else:
            command = None

        if path is None:
            if command is not None:
                self._last_command = command
            return self._last_command

        # remembered per application so that one application's command is
        # never reported for another one
        if command is not None:
            self._last_commands[path] = command
        return self._last_commands.get(path, None)
class ApplicationDependencyCache(object):
    """
    Cache of application dependencies parsed from the agent configuration
    stored in ZooKeeper.

    Each configuration node holds XML; the predicates of every component's
    start action become that component's dependencies. Changes are picked
    up through watches, queued on a MessageThrottle and forwarded to the
    time estimate cache.
    """

    def __init__(self, configuration, zoo_keeper, web_socket_clients,
                 time_estimate_cache):
        """
        :type configuration: zoom.config.configuration.Configuration
        :type zoo_keeper: zoom.www.zoo_keeper.ZooKeeper
        :type web_socket_clients: list
        :type time_estimate_cache: zoom.www.cache.time_estimate_cache.TimeEstimateCache
        """
        self._cache = ApplicationDependenciesMessage()
        self._configuration = configuration
        self._zoo_keeper = zoo_keeper
        self._web_socket_clients = web_socket_clients
        self._time_estimate_cache = time_estimate_cache
        self._message_throttle = MessageThrottle(configuration,
                                                 web_socket_clients)

    def start(self):
        """Start the message throttle."""
        self._message_throttle.start()

    def stop(self):
        """Stop the message throttle."""
        self._message_throttle.stop()

    def load(self):
        """
        Return the cache, filling it from ZooKeeper first when it is empty.
        When that fails the exception is logged and None is returned.

        :rtype: ApplicationDependenciesMessage
        """
        try:
            if not self._cache.application_dependencies:
                self._load()

            return self._cache
        except Exception:
            logging.exception('An unhandled Exception has occurred')
            return None

    def reload(self):
        """
        Clear cache, and re-walk agent config path.
        """
        self._cache.clear()
        logging.info("Application dependency cache cleared")
        self._on_update_path(self._configuration.agent_configuration_path)

    def _load(self):
        """
        Walk full agent config path to get data. Load self._cache object
        """
        self._cache.clear()

        self._walk(self._configuration.agent_configuration_path, self._cache)
        logging.info("Application dependency cache loaded from ZooKeeper {0}"
                     .format(self._configuration.agent_configuration_path))

        self._recalc_downstream_dependencies()

        self._time_estimate_cache.update_dependencies(
            self._cache.application_dependencies)

    @connected_with_return(None)
    def _walk(self, path, result):
        """
        Recursively collect the dependencies of every leaf node under
        ``path`` into ``result``. Nodes that no longer exist are skipped.

        :type path: str
        :type result: ApplicationDependenciesMessage
        """
        try:
            children = self._zoo_keeper.get_children(path,
                                                     watch=self._on_update)

            if children:
                for child in children:
                    self._walk(os.path.join(path, child), result)
            else:
                self._get_application_dependency(path, result)
        except NoNodeError:
            logging.debug('Node at {0} no longer exists.'.format(path))

    def _get_application_dependency(self, path, result):
        """
        Load result object with application dependencies

        One entry is added per 'Automation/Component' element that has a
        start action. Components without one, or without any way to derive
        a registration path, are skipped with a warning.

        :type path: str
        :type result: ApplicationDependenciesMessage
        """
        if not self._zoo_keeper.exists(path):
            logging.warning("config path does not exist: {0}".format(path))
            return

        data, stat = self._zoo_keeper.get(path, watch=self._on_update)
        if not data:
            return

        try:
            root = ElementTree.fromstring(data)

            for node in root.findall('Automation/Component'):
                app_id = node.attrib.get('id')
                registrationpath = node.attrib.get('registrationpath', None)

                if registrationpath is None:
                    if app_id is None:
                        # nothing to build a path from; skip this
                        # component only, not the rest of the document
                        logging.warning(
                            "Component without id or registrationpath "
                            "in {0}".format(path))
                        continue
                    registrationpath = os.path.join(
                        self._configuration.application_state_path, app_id)

                start_action = node.find('Actions/Action[@id="start"]')

                if start_action is None:
                    logging.warning("No Start Action Found for {0}"
                                    .format(registrationpath))
                    continue

                entry = {
                    "configuration_path": registrationpath,
                    "dependencies": self._parse_dependencies(start_action),
                    "downstream": list()
                }

                result.update({registrationpath: entry})

        except Exception:
            logging.exception('An unhandled exception occurred')

    def _parse_dependencies(self, action):
        """
        Parse dependencies out of XML

        Predicates are visited in document order. A 'not' predicate only
        sets a flag that the next holiday/weekend predicate turns into a
        "Does NOT run ..." description; every recognised predicate clears
        the flag again, unrecognised ones leave it alone.

        :type action: xml.etree.ElementTree.Element
        :rtype: list
        """
        # TODO: rename 'path' when it really isn't a path. this is a hack...
        # prev_was_not keeps track of whether the outer class was 'not'
        dependencies = []
        prev_was_not = False

        for predicate in action.iter('Predicate'):
            # a predicate without a type matches none of the branches below
            pred_type = (predicate.get('type') or '').lower()
            pred_path = predicate.get('path', None)

            if pred_type == PredicateType.NOT:
                prev_was_not = True
                continue

            if pred_type in (PredicateType.ZOOKEEPERHASCHILDREN,
                             PredicateType.ZOOKEEPERHASGRANDCHILDREN):
                dependencies.append({'type': pred_type, 'path': pred_path})

            elif pred_type == PredicateType.ZOOKEEPERGOODUNTILTIME:
                gut_parts = (pred_path or '').split('gut/')
                if len(gut_parts) > 1:
                    dependencies.append(
                        {'type': pred_type,
                         'path': ("I should be up between: {0}"
                                  .format(gut_parts[1]))})
                else:
                    logging.debug('Invalid GUT path: {0}'.format(pred_path))

            elif pred_type == PredicateType.HOLIDAY:
                dependencies.append(
                    {'type': pred_type,
                     'path': ("Does NOT run on holidays" if prev_was_not
                              else "Runs on holidays")})

            elif pred_type == PredicateType.WEEKEND:
                dependencies.append(
                    {'type': pred_type,
                     'path': ("Does NOT run on weekends" if prev_was_not
                              else "Runs on weekends")})

            elif pred_type == PredicateType.TIME:
                start = predicate.get('start', None)
                stop = predicate.get('stop', None)
                msg = 'I should be up '
                if start is not None:
                    msg += 'after: {0} '.format(start)
                if stop is not None:
                    msg += 'until: {0}'.format(stop)
                if not start and not stop:
                    msg += '?'
                dependencies.append({'type': pred_type, 'path': msg})

            else:
                # unrecognised predicate: keep the 'not' flag as it is
                continue

            # the flag is consumed by every recognised predicate (TIME
            # included), so it cannot leak into a later, unrelated one
            prev_was_not = False

        return dependencies

    def _recalc_downstream_dependencies(self):
        """
        Loop over existing cache and link upstream with downstream elements

        For every application, its path is added to the 'downstream' list
        of each cached application it depends on.
        """
        dependencies = self._cache.application_dependencies

        # clear existing downstream
        for data in dependencies.values():
            downstream = data.get('downstream')
            del downstream[:]

        for path, data in list(dependencies.items()):
            for dep in data.get('dependencies'):
                dep_type = dep['type']
                dep_path = dep.get('path')
                if dep_path is None:
                    # nothing to match against
                    continue

                if dep_type == PredicateType.ZOOKEEPERHASGRANDCHILDREN:
                    for key, value in dependencies.items():
                        if key.startswith(dep_path):
                            self._add_downstream(value, path)
                elif dep_type == PredicateType.ZOOKEEPERHASCHILDREN:
                    # exact match, so a direct lookup replaces the scan
                    value = dependencies.get(dep_path)
                    if value is not None:
                        self._add_downstream(value, path)

    @staticmethod
    def _add_downstream(upstream, path):
        """
        Append ``path`` to the 'downstream' list of ``upstream`` unless it
        is already there.

        :type upstream: dict
        :type path: str
        """
        downstream = upstream.get('downstream')
        if path not in downstream:
            downstream.append(path)

    def _on_update(self, event):
        """
        Callback to send updates via websocket on application state changes.

        :type event: kazoo.protocol.states.WatchedEvent
        """
        self._on_update_path(event.path)

    def _on_update_path(self, path):
        """
        Re-walk ``path``, merge the result into the cache, relink the
        downstream lists, queue the change for the clients and hand the
        dependencies to the time estimate cache. Exceptions are logged,
        not raised.

        :type path: str
        """
        try:
            message = ApplicationDependenciesMessage()

            self._walk(path, message)

            # merge first: the downstream links must be computed from the
            # data that was just read, not from the previous cache content
            self._cache.update(message.application_dependencies)
            self._recalc_downstream_dependencies()

            self._message_throttle.add_message(message)

            self._time_estimate_cache.update_dependencies(
                self._cache.application_dependencies)
        except Exception:
            logging.exception('An unhandled Exception has occurred')