def manage_initial_host_status_brok(self, b): """Prepare the known hosts cache""" host_name = b.data['host_name'] logger.debug("got initial host status: %s", host_name) self.hosts_cache[host_name] = { 'realm_name': sanitize_name(b.data.get('realm_name', b.data.get('realm', 'All'))), } if 'customs' in b.data: self.hosts_cache[host_name]['_GRAPHITE_PRE'] = \ sanitize_name(b.data['customs'].get('_GRAPHITE_PRE', None)) self.hosts_cache[host_name]['_GRAPHITE_GROUP'] = \ sanitize_name(b.data['customs'].get('_GRAPHITE_GROUP', None)) logger.debug("initial host status received: %s", host_name)
def manage_initial_service_status_brok(self, b): """Prepare the known services cache""" host_name = b.data['host_name'] service_description = b.data['service_description'] service_id = host_name + "/" + service_description logger.debug("got initial service status: %s", service_id) if host_name not in self.hosts_cache: logger.error("initial service status, host is unknown: %s.", service_id) return self.services_cache[service_id] = {} if 'customs' in b.data: self.services_cache[service_id]['_GRAPHITE_POST'] = \ sanitize_name(b.data['customs'].get('_GRAPHITE_POST', None)) logger.debug("initial service status received: %s", service_id)
def get_metrics_from_perfdata(self, service, perf_data): """Decode the performance data to build a metrics list""" result = [] metrics = PerfDatas(perf_data) for metric in metrics: logger.debug("service: %s, metric: %s (%s)", service, metric, metric.__dict__) if metric.name in ['time']: metric.name = "duration" name = sanitize_name(metric.name) name = self.multiple_values.sub(r'.\1', name) if not name: continue # get metric value and its thresholds values if they exist name_value = {name: metric.value, 'uom_' + name: metric.uom} # Get or ignore extra values depending upon module configuration if metric.warning and self.send_warning: name_value[name + '_warn'] = metric.warning if metric.critical and self.send_critical: name_value[name + '_crit'] = metric.critical if metric.min and self.send_min: name_value[name + '_min'] = metric.min if metric.max and self.send_max: name_value[name + '_max'] = metric.max for key, value in name_value.items(): result.append((key, value, metric.uom)) logger.debug("Metrics: %s - %s", service, result) return result
def __init__(self, mod_conf): # pylint: disable=too-many-branches """Module initialization mod_conf is a dictionary that contains: - all the variables declared in the module configuration - a 'properties' value that is the module properties as defined globally in this file :param mod_conf: module configuration file as a dictionary """ BaseModule.__init__(self, mod_conf) # pylint: disable=global-statement global logger logger = logging.getLogger('fusionsupervision.module.%s' % self.alias) logger.setLevel(getattr(mod_conf, 'log_level', logging.INFO)) logger.debug("inner properties: %s", self.__dict__) logger.debug("received configuration: %s", mod_conf.__dict__) logger.info("loaded by the %s '%s'", self.my_daemon.type, self.my_daemon.name) # Output file target self.output_file = getattr(mod_conf, 'output_file', '') if 'ALIGNAK_HOSTS_STATS_FILE' in os.environ: self.output_file = os.environ['ALIGNAK_HOSTS_STATS_FILE'] # Graphite / InfluxDB targets self.graphite_enabled = (getattr(mod_conf, 'graphite_enabled', '0') != '0') if isinstance(getattr(mod_conf, 'graphite_enabled', '0'), bool): self.graphite_enabled = getattr(mod_conf, 'graphite_enabled') self.influxdb_enabled = (getattr(mod_conf, 'influxdb_enabled', '0') != '0') if isinstance(getattr(mod_conf, 'influxdb_enabled', '0'), bool): self.influxdb_enabled = getattr(mod_conf, 'influxdb_enabled') if self.influxdb_enabled and not influxdb_lib: logger.info( "Sending metrics to InfluxDB is enabled but the influxdb Python " "library is not installed. You should 'pip install influxdb'! " "As of now, sending to influxdb is disabled.") self.influxdb_enabled = False logger.info( "targets configuration: graphite: %s, influxdb: %s, file: %s", self.graphite_enabled, self.influxdb_enabled, self.output_file) if self.output_file: logger.warning( "Storing metrics in an output file is configured. Do not forget " "to regularly clean this file to avoid important disk usage!") self.enabled = getattr(mod_conf, 'enabled', '0') != '0' if isinstance(getattr(mod_conf, 'enabled', '0'), bool): self.enabled = getattr(mod_conf, 'enabled') if not self.output_file and not self.graphite_enabled and not self.influxdb_enabled: logger.warning( "The metrics sending module is enabled but no target is defined. You " "should set one of the 'output_file', or 'graphite_enabled' or " "'influxdb_enabled' parameter to specify where the metrics " "must be pushed! As of now, the module is disabled.") self.enabled = False # Hosts and services internal cache # - contain the hosts and services names and specific parameters # - updated with the initial hosts/services status broks self.hosts_cache = {} self.services_cache = {} # Do not ignore unknown hosts/services. If set, this parameter will make the module # ignore the provided broks until the initial status broks are received # Then the module will only manage metrics if hosts/services are known in the internal cache self.ignore_unknown = getattr(mod_conf, 'ignore_unknown', '1') == '1' if isinstance(getattr(mod_conf, 'ignore_unknown', '0'), bool): self.ignore_unknown = getattr(mod_conf, 'ignore_unknown') logger.info("ignoring unknown: %s", self.ignore_unknown) # Separate performance data multiple values self.multiple_values = re.compile(r'_(\d+)$') # Internal metrics cache self.my_metrics = [] self.metrics_flush_count = int( getattr(mod_conf, 'metrics_flush_count', '256')) self.last_failure = 0 self.metrics_flush_pause = int( os.getenv('ALIGNAK_STATS_FLUSH_PAUSE', '10')) self.log_metrics_flush_pause = False # Specific filter for host and services names for Graphite self.illegal_char_hostname = re.compile(r'[^a-zA-Z0-9_\-]') # Graphite target self.graphite_host = getattr(mod_conf, 'graphite_host', 'localhost') self.graphite_port = int(getattr(mod_conf, 'graphite_port', '2004')) self.carbon = None logger.info("graphite host/port: %s:%d", self.graphite_host, self.graphite_port) # optional prefix / suffix in graphite for FusionSupervision Engine data source self.graphite_data_source = \ sanitize_name(getattr(mod_conf, 'graphite_data_source', '')) self.graphite_prefix = getattr(mod_conf, 'graphite_prefix', '') self.realms_prefix = (getattr(mod_conf, 'realms_prefix', '0') != '0') if isinstance(getattr(mod_conf, 'realms_prefix', '0'), bool): self.realms_prefix = getattr(mod_conf, 'realms_prefix') logger.info("graphite prefix: %s, realm prefix: %s data source: %s", self.graphite_prefix, self.realms_prefix, self.graphite_data_source) if self.graphite_enabled and not self.graphite_host: logger.warning( "Graphite host name is not set, no metrics will be sent to Graphite!" ) self.graphite_enabled = False # InfluxDB target self.influxdb_host = getattr(mod_conf, 'influxdb_host', 'localhost') self.influxdb_port = int(getattr(mod_conf, 'influxdb_port', '8086')) self.influxdb_database = getattr(mod_conf, 'influxdb_database', 'fusionsupervision') # Default is empty - do not used authenticated connection self.influxdb_username = getattr(mod_conf, 'influxdb_username', '') self.influxdb_password = getattr(mod_conf, 'influxdb_password', '') # Default is empty - do not use a specific retention self.influxdb_retention_name = \ getattr(mod_conf, 'influxdb_retention_name', '') self.influxdb_retention_duration = \ getattr(mod_conf, 'influxdb_retention_duration', 'INF') self.influxdb_retention_replication = \ getattr(mod_conf, 'influxdb_retention_replication', '1') self.influx = None logger.info("influxdb host/port: %s:%d", self.influxdb_host, self.influxdb_port) logger.info("influxdb database: %s, retention: %s:%s:%s", self.influxdb_database, self.influxdb_retention_name, self.influxdb_retention_duration, self.influxdb_retention_replication) # optional tags list in influxdb for FusionSupervision Engine data source self.influxdb_tags = getattr(mod_conf, 'influxdb_tags', None) if self.influxdb_tags: tags_list = {} tags = self.influxdb_tags.split(',') for tag in tags: if '=' in tag: tag = tag.split('=') tags_list[tag[0]] = tag[1] if tags_list: self.influxdb_tags = tags_list logger.info("influxdb tags: %s", self.influxdb_tags) if self.influxdb_enabled and not self.influxdb_host: logger.warning( "InfluxDB host name is not set, no metrics will be sent to InfluxDB!" ) self.influxdb_enabled = False # Used to reset check time into the scheduled time. # Carbon/graphite does not like latency data and creates blanks in graphs # Every data with "small" latency will be considered create at scheduled time self.ignore_latency_limit = int( getattr(mod_conf, 'ignore_latency_limit', '0')) if self.ignore_latency_limit < 0: self.ignore_latency_limit = 0 # service name to use for host check self.hostcheck = sanitize_name( getattr(mod_conf, 'host_check', 'hostcheck')) # Send warning, critical, min, max self.send_warning = bool(getattr(mod_conf, 'send_warning', False)) logger.info("send warning metrics: %d", self.send_warning) self.send_critical = bool(getattr(mod_conf, 'send_critical', False)) logger.info("send critical metrics: %d", self.send_critical) self.send_min = bool(getattr(mod_conf, 'send_min', False)) logger.info("send min metrics: %d", self.send_min) self.send_max = bool(getattr(mod_conf, 'send_max', False)) logger.info("send max metrics: %d", self.send_max) if not self.enabled: logger.warning( "inner metrics module is loaded but is not enabled.") return logger.info("metrics module is loaded and enabled")
def manage_host_check_result_brok(self, b): # pylint: disable=too-many-branches """An host check result brok has just arrived...""" host_name = b.data.get('host_name', None) if not host_name: return logger.debug("host check result: %s", host_name) # If host initial status brok has not been received, ignore ... if host_name not in self.hosts_cache and not self.ignore_unknown: logger.warning( "received host check result for an unknown host: %s", host_name) return # Decode received metrics metrics = self.get_metrics_from_perfdata('host_check', b.data['perf_data']) if not metrics: logger.debug("no metrics to send ...") return # If checks latency is ignored if self.ignore_latency_limit >= b.data['latency'] > 0: check_time = int(b.data['last_chk']) - int(b.data['latency']) else: check_time = int(b.data['last_chk']) # Custom hosts variables hname = sanitize_name(host_name) if host_name in self.hosts_cache: if self.hosts_cache[host_name].get('_GRAPHITE_GROUP', None): hname = ".".join( (self.hosts_cache[host_name].get('_GRAPHITE_GROUP'), hname)) if self.hosts_cache[host_name].get('_GRAPHITE_PRE', None): hname = ".".join( (self.hosts_cache[host_name].get('_GRAPHITE_PRE'), hname)) # Graphite data source if self.graphite_data_source: path = '.'.join((hname, self.graphite_data_source)) if self.hostcheck: path = '.'.join( (hname, self.graphite_data_source, self.hostcheck)) else: path = '.'.join((hname, self.hostcheck)) # Realm as a prefix if self.realms_prefix and self.hosts_cache[host_name].get( 'realm_name', None): path = '.'.join( (self.hosts_cache[host_name].get('realm_name'), path)) # Graphite prefix if self.graphite_prefix: path = '.'.join((self.graphite_prefix, path)) realm_name = None if host_name in self.hosts_cache: realm_name = self.hosts_cache[host_name].get('realm_name', None) # Send metrics self.send_to_tsdb(realm_name, host_name, self.hostcheck, metrics, check_time, path)
def manage_service_check_result_brok(self, b): # pylint: disable=too-many-branches """A service check result brok has just arrived ...""" host_name = b.data.get('host_name', None) service_description = b.data.get('service_description', None) if not host_name or not service_description: return service_id = host_name + "/" + service_description logger.debug("service check result: %s", service_id) # If host and service initial status broks have not been received, ignore ... if not self.ignore_unknown and host_name not in self.hosts_cache: logger.warning( "received service check result for an unknown host: %s", service_id) return if service_id not in self.services_cache and not self.ignore_unknown: logger.warning( "received service check result for an unknown service: %s", service_id) return # Decode received metrics metrics = self.get_metrics_from_perfdata(service_description, b.data['perf_data']) if not metrics: logger.debug("no metrics to send ...") return # If checks latency is ignored if self.ignore_latency_limit >= b.data['latency'] > 0: check_time = int(b.data['last_chk']) - int(b.data['latency']) else: check_time = int(b.data['last_chk']) # Custom hosts variables hname = sanitize_name(host_name) if host_name in self.hosts_cache: if self.hosts_cache[host_name].get('_GRAPHITE_GROUP', None): hname = ".".join( (self.hosts_cache[host_name].get('_GRAPHITE_GROUP'), hname)) if self.hosts_cache[host_name].get('_GRAPHITE_PRE', None): hname = ".".join( (self.hosts_cache[host_name].get('_GRAPHITE_PRE'), hname)) # Custom services variables desc = sanitize_name(service_description) if service_id in self.services_cache: if self.services_cache[service_id].get('_GRAPHITE_POST', None): desc = ".".join((desc, self.services_cache[service_id].get( '_GRAPHITE_POST', None))) # Graphite data source if self.graphite_data_source: path = '.'.join((hname, self.graphite_data_source, desc)) else: path = '.'.join((hname, desc)) realm_name = None if host_name in self.hosts_cache: realm_name = self.hosts_cache[host_name].get('realm_name', None) # Send metrics self.send_to_tsdb(realm_name, host_name, service_description, metrics, check_time, path)