def frequency(self):
    ''' Return True/False if allowed/not allowed to run
        - based on frequency configured or not (allow if no frequency)
        - based on previous run (persisted) if frequency configured
        Update PERSIST() object for next run, if run allowed this time '''

    # get lastrun timestamp for this check, or 0
    freqpersist = cmt.PERSIST.get_key("frequency", {})
    id = self.get_id()
    freqlastrun = freqpersist.get(id, 0)

    # frequency configured for this specific check ?
    f = self.conf.get('frequency', -1)
    if f == -1:
        debug2("Frequency: no frequency at check level")
        # allowed to run again
        return True

    # frequency configured : compare delta to value ; update persist if run allowed now
    now = int(time.time())
    delta = int(now - freqlastrun)
    if delta > f:
        freqpersist[id] = now
        cmt.PERSIST.set_key("frequency", freqpersist)
        debug("Frequency: allowed {} > {} (f={}, delta={})".format(
            now, freqlastrun, f, delta))
        return True

    # too early
    debug("Frequency: not allowed : f={}, delta={}".format(f, delta))
    return False
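
# For reference, the gating above reduces to one time comparison against a
# persisted last-run timestamp. A minimal, self-contained sketch of that idea,
# using a plain dict in place of cmt.PERSIST (frequency_gate and `store` are
# illustrative names, not part of the codebase):

import time

def frequency_gate(store, check_id, min_interval):
    # no frequency configured : always allowed, nothing persisted
    if min_interval < 0:
        return True
    now = int(time.time())
    if (now - store.get(check_id, 0)) > min_interval:
        store[check_id] = now
        return True
    return False

# store = {}
# frequency_gate(store, "load.c1", 300)   # True  : first run is recorded
# frequency_gate(store, "load.c1", 300)   # False : 300 s have not elapsed yet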
def conf_add_top_entries(conf):
    ''' complete conf with optional/default top-level entries.'''

    # add missing top entries
    for item in cmt.DEFAULT_CONF_TOP_ENTRIES:
        if item not in conf:
            debug2("No {} entry in config ; added automatically.".format(item))
            conf[item] = {}

    # new conf format Ver. 1.2.0
    for item in cmt.GLOBAL_MODULE_MAP:
        if item not in conf:
            debug2("No {} entry in config ; added automatically.".format(item))
            conf[item] = {}
def load_conf_remote(conf):
    ''' fetch a remote URL with additional conf ; merge it into conf '''

    debug("Load remote conf")

    # load remote config from conf_url parameter
    if 'conf_url' in conf['global']:
        url = conf['global']['conf_url']
        gro = conf['global'].get("cmt_group", "nogroup")
        nod = conf['global'].get("cmt_node", "nonode")
        if url.endswith("/txt/"):
            url = url + "{}_{}.txt".format(gro, nod)
        url = url + "?group={}&node={}".format(gro, nod)
        debug("Remote config URL : ", url)

        remote_txt = conf_load_http(url)
        remote_conf = None
        if remote_txt is not None:
            remote_conf = yaml.safe_load(remote_txt)
            if not type(remote_conf) is dict:
                remote_conf = None

        # cached copy as fallback, unless older than one day
        cachedconf = cmt.PERSIST.get_key("remote_conf_cache", None)
        cachedconf_age = cmt.PERSIST.get_key("remote_conf_cache_age", 0)
        if int(time.time() - cachedconf_age) > 86400:
            # too old
            cachedconf = None

        if remote_conf is None:
            remote_conf = cachedconf

        if remote_conf is not None:
            cmt.PERSIST.set_key("remote_conf_cache", remote_conf)
            cmt.PERSIST.set_key("remote_conf_cache_age", int(time.time()))
            cmt.PERSIST.save()
            conf_add_top_entries(remote_conf)
            conf_merge(conf, remote_conf)
            debug("Configuration loaded")
            debug2(json.dumps(conf, indent=2))
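
# The fetch-then-fallback policy above is worth stating on its own : prefer a
# freshly fetched dict, otherwise reuse a cached copy at most one day old.
# A minimal sketch under those assumptions (pick_conf is a hypothetical helper,
# not a function the codebase uses) :

import time

MAX_CACHE_AGE = 86400  # one day, same threshold as load_conf_remote()

def pick_conf(fetched, cached, cached_at):
    # a freshly fetched, well-formed dict wins
    if isinstance(fetched, dict):
        return fetched
    # otherwise the cached copy, if recent enough
    if cached is not None and (time.time() - cached_at) <= MAX_CACHE_AGE:
        return cached
    return None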
def is_module_alone_in_args(name):

    modules = cmt.ARGS['modules'][0]
    if name in modules and len(modules) == 1:
        return True
    debug2(name, "module not alone or not in ARGS")
    return False
def is_module_allowed_in_args(name):

    modules = cmt.ARGS['modules'][0]
    if name in modules or len(modules) == 0:
        return True
    debug2(name, "module not in ARGS")
    return False
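
# Both helpers read the first value of cmt.ARGS['modules']. A standalone
# restatement of the "allowed" rule with its three expected cases (the
# `allowed` function is illustrative only) :

def allowed(modules, name):
    # an empty selection allows every module
    return name in modules or len(modules) == 0

assert allowed([], 'mysql')             # no filter : all modules run
assert allowed(['mysql'], 'mysql')      # explicitly requested
assert not allowed(['disk'], 'mysql')   # filtered out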
def check(c):

    global defaults_file

    # get conf
    defaults_file = c.conf.get('defaults_file', '/opt/cmt/mysql.cnf')
    # is_master = c.conf.get("is_master", False) is True
    is_slave = c.conf.get("is_slave", False) is True
    max_behind = c.conf.get('max_behind', 900)

    try:
        # db = _mysql.connect(host=host, user=user, passwd=password)
        db = _mysql.connect(read_default_file=defaults_file)
    except Exception as e:
        c.severity = cmt.SEVERITY_CRITICAL
        c.add_message("mysql - can't connect with conf {}".format(defaults_file))
        debug("Error {}".format(e))
        return c

    # -------------------------------------
    # get global CONF
    vars = {}
    db.query("show variables;")
    lines = db.store_result().fetch_row(maxrows=0, how=0)
    for (k, v) in lines:
        k = k.decode()
        v = v.decode()
        vars[k] = v
        debug2("mysql-conf : ", c.check, ':', k, "=", v)

    version = vars.get('version', 'n/a')
    c.add_item(checkitem.CheckItem('mysql_version', version))

    # -------------------------------------
    # get global VARS
    vars = {}
    db.query("show global status;")
    lines = db.store_result().fetch_row(maxrows=0, how=0)
    for (k, v) in lines:
        k = k.decode()
        v = v.decode()
        vars[k] = v
        debug2("mysql-status : ", c.check, ':', k, "=", v)

    # counters of interest :
    # Com_select / Com_insert / Com_update / Com_delete
    # Connections, Memory_used, Queries
    # Threads_cached / Threads_connected / Threads_created / Threads_running

    thread_c = int(vars.get('Threads_connected', 0))
    c.add_item(checkitem.CheckItem('mysql_connection', thread_c))

    thread_r = int(vars.get('Threads_running', 0))
    c.add_item(checkitem.CheckItem('mysql_runner', thread_r))

    mem = int(vars.get('Memory_used', 0))
    c.add_item(checkitem.CheckItem('mysql_memory', mem, unit='bytes'))

    lastrun = cmt.PERSIST.get_key("cmt_last_run", 0)
    delta = int(time.time()) - int(lastrun)

    xconn = 0
    xsel = 0
    xwri = 0
    xqu = 0

    if delta < 900:
        xsel = get_derivative(c, vars, 'Com_select')
        c.add_item(checkitem.CheckItem('mysql_read_rate', xsel, 'r/sec'))

        x1 = get_derivative(c, vars, 'Com_insert')
        x2 = get_derivative(c, vars, 'Com_update')
        x3 = get_derivative(c, vars, 'Com_delete')
        xwri = x1 + x2 + x3
        c.add_item(checkitem.CheckItem('mysql_write_rate', xwri, 'w/sec'))

        xqu = get_derivative(c, vars, 'Queries')
        c.add_item(checkitem.CheckItem('mysql_query_rate', xqu, 'q/sec'))

        xconn = get_derivative(c, vars, 'Connections')
        c.add_item(checkitem.CheckItem('mysql_cx_rate', xconn, 'connection/sec'))

    # -------------------------------------
    # SLAVE INFO
    if is_slave:
        debug2("vars_slave query")
        # q = 'select 1;'
        # q = 'select host,user from mysql.user;'
        q = 'show slave status\G'
        r = subprocess_query(q).decode()

        # cut on newlines ; split each line on ':' and strip spaces to get k, v
        vars_slave = {}
        lines = r.split('\n')
        for line in lines:
            akv = line.split(':')
            if len(akv) < 2:
                continue
            k = akv[0].strip()
            v = akv[1].strip()
            debug2("mysql-slave : ", c.check, ':', k, "=", v)
            vars_slave[k] = v

        io_running = vars_slave.get('Slave_IO_Running', 'No')
        c.add_item(checkitem.CheckItem('mysql_slave_io_run', io_running,
                                       "Slave_IO_Running"))

        sql_running = vars_slave.get('Slave_SQL_Running', 'No')
        c.add_item(checkitem.CheckItem('mysql_slave_sql_run', sql_running,
                                       "Slave_SQL_Running"))

        master_log = vars_slave.get('Master_Log_File', 'n/a')
        c.add_item(checkitem.CheckItem('mysql_slave_mpos', master_log,
                                       "Master_Log_File"))

        relay_log = vars_slave.get('Relay_Master_Log_File', 'n/a')
        c.add_item(checkitem.CheckItem('mysql_slave_rpos', relay_log,
                                       "Relay_Master_Log_File"))

        behind_str = vars_slave.get('Seconds_Behind_Master', "999999999")
        try:
            behind = int(behind_str)
        except ValueError:
            behind = 999999999
        c.add_item(checkitem.CheckItem('mysql_slave_behind', behind,
                                       "Seconds_Behind_Master"))

        if io_running != "Yes":
            c.severity = cmt.SEVERITY_CRITICAL
            c.add_message("{} - slave IO not running".format(c.check))
            return c

        if sql_running != "Yes":
            c.severity = cmt.SEVERITY_CRITICAL
            c.add_message("{} - slave SQL not running".format(c.check))
            return c

        if behind > max_behind:
            c.severity = cmt.SEVERITY_CRITICAL
            c.add_message("{} - slave too far behind master {} > {} secs".format(
                c.check, behind, max_behind))
            return c

        c.add_message(
            "{} - slave {} sec. behind (limit = {}) - cx={} cx/s={} r/s={} w/s={} q/s={} mem={}"
            .format(c.check, behind, max_behind, thread_c, xconn, xsel, xwri, xqu, mem))
        return c

    # all OK
    c.add_message("{} - cx={} cx/s={} r/s={} w/s={} q/s={} mem={}".format(
        c.check, thread_c, xconn, xsel, xwri, xqu, mem))
    return c
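
# get_derivative() (defined elsewhere in this module) turns two samples of a
# monotonic counter into a per-second rate. A minimal sketch of that usual
# counter-delta computation (counter_rate is an illustrative name ; the real
# implementation may differ in detail) :

def counter_rate(current, previous, elapsed):
    # guard against counter resets and zero/negative intervals
    if elapsed <= 0 or current < previous:
        return 0
    return (current - previous) / elapsed

# e.g. Com_select going from 1500000 to 1503000 over 60 s :
# counter_rate(1503000, 1500000, 60)  -> 50.0 q/s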
def build_json_message(check, index=None, rawdata_prefix='raw'):
    '''Prepare a JSON message suitable to be sent to an Elastic server.'''

    json_data = ''

    # common values
    json_data += '"cmt_group":"{}"'.format(check.group)
    json_data += ',"cmt_node":"{}"'.format(check.node)
    json_data += ',"cmt_node_env":"{}"'.format(check.node_env)
    json_data += ',"cmt_node_role":"{}"'.format(check.node_role)
    json_data += ',"cmt_node_location":"{}"'.format(check.node_location)
    json_data += ',"cmt_version":"{}"'.format(check.version)
    json_data += ',"cmt_module":"{}"'.format(check.module)
    json_data += ',"cmt_check":"{}"'.format(check.check)   # deprecated
    json_data += ',"cmt_id":"{}"'.format(check.get_id())

    # rawdata / multi-event part
    if index is not None:
        json_data += ',"cmt_raw_id":{}'.format(index)
        event = check.multievent[index]
        m = "{} ".format(check.check)
        for k, v in event.items():
            m = m + "; {}={}".format(k, v)
        # QUOTES BUG : m = m + json.dumps(event)
        json_data += ',"short_message":"{}"'.format(m)
        json_data += ',"cmt_message":"{}"'.format(m)
        for k, v in event.items():
            try:
                float(v)
                json_data += ',"{}_{}_{}":{}'.format(rawdata_prefix, check.check, k, v)
            except Exception:
                json_data += ',"{}_{}_{}":"{}"'.format(rawdata_prefix, check.check, k, v)
            debug2("Build json data rawdata multievent: ", str(k), str(v))

    # main / standard event
    else:
        # cmt_message : check name + check.message + all items.alert_message
        m = "{} - ".format(check.module)
        m = m + check.get_message_as_str()
        json_data += ',"cmt_message":"{}"'.format(m)

        # check items key/values
        for item in check.checkitems:
            value2 = item.value
            if item.multiline:
                # multiline values are not suitable for the Elastic line protocol
                value2 = value2.replace('\n', ' ; ')
                value2 = value2.replace('\r', ' ')
            try:
                float(value2)
                json_data += ',"cmt_{}":{}'.format(item.name, value2)
            except ValueError:
                json_data += ',"cmt_{}":"{}"'.format(item.name, value2)
            debug2("Build json data : ", str(item.name), str(value2))

    json_data += ',"cmt_alert":{}'.format(check.alert)
    json_data += ',"cmt_severity":{}'.format(check.severity)
    json_data += ',"alert":"{}"'.format(cmt.get_alert_label(check.alert))
    json_data += ',"severity":"{}"'.format(cmt.get_severity_label(check.severity))

    json_data = '{' + json_data + '}'
    return json_data
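
# The string-concatenation approach above has to guess at quoting (see the
# "QUOTES BUG" note). For contrast, a sketch that accumulates a dict and
# serializes once with json.dumps, so escaping is always correct
# (build_json_message_safe is a hypothetical alternative, not the function
# the codebase uses) :

import json

def build_json_message_safe(common_fields, items):
    doc = dict(common_fields)                   # cmt_group, cmt_node, ...
    for name, value in items:
        try:
            doc["cmt_" + name] = float(value)   # numeric values stay numeric
        except (TypeError, ValueError):
            doc["cmt_" + name] = str(value)     # everything else is quoted
    return json.dumps(doc)

# build_json_message_safe({"cmt_group": "g1", "cmt_node": "n1"},
#                         [("mysql_memory", "277515936"),
#                          ("mysql_slave_io_run", "Yes")])
# -> '{"cmt_group": "g1", "cmt_node": "n1",
#      "cmt_mysql_memory": 277515936.0, "cmt_mysql_slave_io_run": "Yes"}'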
def send_metrology(mycheck):
    ''' Send Check results (event, multiple CheckItems) to metrology servers,
        or add to batch for batch sending at the end of the run. '''

    # avoid batching multiple times (if multiple influxdb metrology servers)
    influxdb_already_batched = False

    for metro in cmt.CONF['metrology_servers']:
        metroconf = cmt.CONF['metrology_servers'][metro]
        metrotype = metroconf.get('type', 'unknown')
        send_rawdata = metroconf.get("send_rawdata", True) is True
        rawdata_prefix = metroconf.get("rawdata_prefix", "raw_")

        timerange = metroconf.get("enable", "yes")
        if not conf.is_timeswitch_on(timerange):
            debug("Metrology server disabled in conf : ", metro)
            continue

        if metrotype == "graylog_udp_gelf":
            gelf_data = build_gelf_message(mycheck)
            graylog_send_udp_gelf(metroconf=metroconf, data=gelf_data)
            debug("Data sent to metrology server ", metro)
            if send_rawdata:
                for index, val in enumerate(mycheck.multievent):
                    gelf_data_multi = build_gelf_message(mycheck, index,
                                                         rawdata_prefix=rawdata_prefix)
                    graylog_send_udp_gelf(metroconf=metroconf, data=gelf_data_multi)
                    debug2("Data sent to metrology server (multievent)", metro)

        elif metrotype == "graylog_http_gelf":
            gelf_data = build_gelf_message(mycheck)
            graylog_send_http_gelf(metroconf=metroconf, data=gelf_data)
            debug("Data sent to metrology server ", metro)
            if send_rawdata:
                for index, val in enumerate(mycheck.multievent):
                    gelf_data_multi = build_gelf_message(mycheck, index,
                                                         rawdata_prefix=rawdata_prefix)
                    graylog_send_http_gelf(metroconf=metroconf, data=gelf_data_multi)
                    debug2("Data sent to metrology server (multievent)", metro)

        elif metrotype == "elastic_http_json":
            json_data = build_json_message(mycheck)
            elastic_send_http_json(metroconf=metroconf, data=json_data)
            debug("Data sent to metrology server ", metro)
            if send_rawdata:
                for index, val in enumerate(mycheck.multievent):
                    json_data_multi = build_json_message(mycheck, index,
                                                         rawdata_prefix=rawdata_prefix)
                    elastic_send_http_json(metroconf=metroconf, data=json_data_multi)

        elif metrotype == "influxdb":
            influxdb_data = build_influxdb_message(mycheck, metroconf)
            batch = metroconf.get("batch", True) is True
            if batch:
                if not influxdb_already_batched:
                    influxdb_add_to_batch(influxdb_data)
                    influxdb_already_batched = True
                    debug("Data batched for influx servers")
                    if send_rawdata:
                        for index, val in enumerate(mycheck.multievent):
                            influxdb_data_multi = build_influxdb_message(
                                mycheck, metroconf, index=index)
                            influxdb_add_to_batch(influxdb_data_multi)
                else:
                    # already batched by another influx server
                    pass
            else:
                # immediate send
                influxdb_send_http(metroconf=metroconf, data=influxdb_data)
                debug("Data sent to influx server ", metro)
                if send_rawdata:
                    for index, val in enumerate(mycheck.multievent):
                        influxdb_data_multi = build_influxdb_message(
                            mycheck, metroconf, index=index)
                        influxdb_send_http(metroconf=metroconf,
                                           data=influxdb_data_multi)

        else:
            debug("Unknown metrology server type in conf.")
def build_gelf_message(check, index=None, rawdata_prefix='raw'):
    '''Prepare a GELF JSON message suitable to be sent to a Graylog GELF server.'''

    graylog_data = '"version":"1.1"'
    graylog_data += ',"host":"{}_{}"'.format(check.group, check.node)

    # common values
    graylog_data += ',"cmt_group":"{}"'.format(check.group)
    graylog_data += ',"cmt_node":"{}"'.format(check.node)
    graylog_data += ',"cmt_node_env":"{}"'.format(check.node_env)
    graylog_data += ',"cmt_node_role":"{}"'.format(check.node_role)
    graylog_data += ',"cmt_node_location":"{}"'.format(check.node_location)
    graylog_data += ',"cmt_version":"{}"'.format(check.version)
    graylog_data += ',"cmt_module":"{}"'.format(check.module)
    graylog_data += ',"cmt_check":"{}"'.format(check.check)   # deprecated
    graylog_data += ',"cmt_id":"{}"'.format(check.get_id())

    # rawdata / multi-event part
    if index is not None:
        graylog_data += ',"cmt_raw_id":{}'.format(index)
        event = check.multievent[index]
        m = "{}".format(check.check)
        for k, v in event.items():
            m = m + " ; {}={}".format(k, v)
        # QUOTES BUG : m = m + json.dumps(event)
        graylog_data += ',"short_message":"{}"'.format(m)
        graylog_data += ',"cmt_message":"{}"'.format(m)
        for k, v in event.items():
            try:
                float(v)
                graylog_data += ',"{}_{}_{}":{}'.format(rawdata_prefix, check.check, k, v)
            except Exception:
                graylog_data += ',"{}_{}_{}":"{}"'.format(rawdata_prefix, check.check, k, v)
            debug2("Build gelf rawdata multievent: ", str(k), str(v))

    # main / standard event
    else:
        # cmt_message : check name + check.message + all items.alert_message
        m = "{} - ".format(check.module)
        m = m + check.get_message_as_str()
        graylog_data += ',"short_message":"{}"'.format(m)
        graylog_data += ',"cmt_message":"{}"'.format(m)

        # check items key/values
        for item in check.checkitems:
            try:
                float(item.value)
                graylog_data += ',"cmt_{}":{}'.format(item.name, item.value)
            except ValueError:
                graylog_data += ',"cmt_{}":"{}"'.format(item.name, item.value)
            debug2("Build gelf data : ", str(item.name), str(item.value))

    graylog_data += ',"cmt_alert":{}'.format(check.alert)
    graylog_data += ',"cmt_severity":{}'.format(check.severity)
    graylog_data += ',"alert":"{}"'.format(cmt.get_alert_label(check.alert))
    graylog_data += ',"severity":"{}"'.format(cmt.get_severity_label(check.severity))

    # all messages
    graylog_data = '{' + graylog_data + '}'
    return graylog_data
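
# graylog_send_udp_gelf() lives elsewhere ; for orientation, plain
# (uncompressed) GELF over UDP is simply the JSON document sent as a single
# datagram to the GELF input port. A minimal sketch with placeholder
# host/port (send_gelf_udp is illustrative, not the project's sender) :

import socket

def send_gelf_udp(gelf_json, host="graylog.example.com", port=12201):
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        sock.sendto(gelf_json.encode("utf-8"), (host, port))
    finally:
        sock.close()

# note : large messages require GELF chunking, and zlib/gzip compression is
# optional ; a bare JSON datagram suffices for small, uncompressed payloads.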
def perform_check(checkname, modulename):

    debug2("Starting check : ", checkname)

    # Is module in GLOBAL MAP ?
    if modulename not in cmt.GLOBAL_MODULE_MAP:
        logit("Unknown module in configuration: ", modulename)
        return "continue"

    # get configuration for this check
    checkconf = cmt.CONF[modulename][checkname]

    # check enabled ?
    ts_check = checkconf.get('enable', 'yes')
    if not conf.is_timeswitch_on(ts_check):
        debug("check disabled by conf ", checkname)
        return "continue"

    # check if module is filtered in ARGS
    if not args.is_module_allowed_in_args(modulename):
        # check_result.result = "skip"
        # check_result.result_info = "module not requested (args)"
        return "continue"

    # prepare options sent to Module code
    my_opt = {}

    # some checks are exclusive / standalone :
    # check if this is a standalone run (one module, one check)
    if args.is_module_alone_in_args(modulename):
        my_opt["single_module_run"] = True
    else:
        my_opt["single_module_run"] = False

    # particular checkname requested ? (--check option)
    # NB : several modules can match the same checkname, which is not a PK across the full config
    my_opt["specific_checkname_run"] = False
    if cmt.ARGS["check"]:
        if cmt.ARGS["check"] == checkname:
            debug2(" specific check name : match %s" % checkname)
            my_opt["specific_checkname_run"] = True
        else:
            debug2(" specific check name : skip %s" % checkname)
            return "continue"

    # create check object
    check_result = Check(module=modulename, check=checkname, conf=checkconf, opt=my_opt)

    # Add tags/kv
    check_result.add_tags()

    # print header to CLI
    if cmt.ARGS['cron'] or cmt.ARGS['short']:
        pass
    else:
        check_result.print_to_cli_detail_head()

    # check if root privilege is required
    conf_rootreq = checkconf.get('root_required', False) is True
    if conf_rootreq:
        if os.getuid() != 0:
            debug("check %s must run as root." % checkname)
            check_result.result = "skip"
            check_result.result_info = "must run as root"
            check_result.print_to_cli_skipped()
            return "continue"

    # verify frequency in cron mode
    if cmt.ARGS['cron']:
        if not check_result.frequency():
            check_result.result = "skip"
            check_result.result_info = "check skipped (frequency)"
            check_result.print_to_cli_skipped()
            return "continue"

    # HERE / Future : give check_result the needed Module Conf, Global Conf ...
    # TODO : if --available, call a different function

    # *********************************************************
    # ****          ACTUAL CHECK IS DONE HERE              ****
    # *********************************************************
    check_result = cmt.GLOBAL_MODULE_MAP[modulename]['check'](check_result)

    # ---------------
    # process results
    # ---------------

    # option = available => not a real run ; just display discovered targets and quit
    if cmt.ARGS["available"]:
        return "break"

    # if check skipped by module itself
    if check_result.result == "skip":
        check_result.result_info = check_result.message
        check_result.print_to_cli_skipped()
        debug(" skipped in module")
        return "continue"

    # adjust severity to severity_max for this check
    check_result.adjust_severity()

    # compute alert transition : NONE, NEW, ACTIVE, DOWN ; hysteresis
    check_result.compute_alert()

    # If pager enabled (at check level) and an alert exists : set pager True
    check_result.compute_pager()

    # Print to CLI
    if cmt.ARGS['cron'] or cmt.ARGS['short']:
        check_result.print_to_cli_short()
    else:
        check_result.print_to_cli_detail()

    # keep returned Persist structure in check_result
    cmt.PERSIST.set_key(check_result.get_id(), check_result.persist)

    # Metrology
    if cmt.ARGS['cron'] or cmt.ARGS["report"]:
        metrology.send_metrology(check_result)

    # add Check to report
    return check_result
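
# The dispatch line above is the heart of the run loop : every module registers
# its check() callable in cmt.GLOBAL_MODULE_MAP and perform_check() looks it up
# by module name. A standalone sketch of that registry pattern (MODULE_MAP and
# check_load are illustrative stand-ins for the real map and modules) :

def check_load(check_result):
    # a real module would fill the Check object with items and messages
    check_result.append("load: ok")
    return check_result

MODULE_MAP = {"load": {"check": check_load}}

result = MODULE_MAP["load"]["check"]([])   # dispatch by module name
# result == ['load: ok']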
def readPropertiesFile(fname):
    """
    Uses ConfigParser to read all sections into a dictionary.
    Each section's options are kept as a nested dictionary under the section key, e.g.:
    {
      'WHITELIST': {
          '<j2eetype-1>': { '<value-1>', '<value-2>', ... '<value-n>' }
          '<j2eetype-2>': { '<value-1>', '<value-2>', ... '<value-n>' }
      }
    }
    """
    l.debug("looking for file: '%s'", fname)
    if not os.path.isfile(fname):
        l.debug("file not found: '%s'", fname)
        return {}

    # discover comments per regex
    re_comment = re.compile(r'^\s*#', re.IGNORECASE)
    # discover enumeration on first level, e.g. "DATASOURCE3"
    re_enumeration = re.compile(r'^([A-Za-z]+)(\d+)\.', re.IGNORECASE)

    reader = ConfigParser.ConfigParser()
    l.debug("reading file: '%s'", fname)
    reader.read(fname)

    # read all sections and the items therein
    allSectionsMap = {}
    sectionNames = reader.sections()
    sectionNames.sort()
    for sectionName in sectionNames:
        if l.isDebugEnabled():
            l.logLF()
            l.debug("found section === %s === ", sectionName)
        sectionMap = {}
        allSectionsMap[sectionName.upper()] = sectionMap
        lastBlockNumber = -1
        expectedBlockNumber = 0
        configBlockNumber = 0

        # read all option lines from the current section into the sectionMap dictionary,
        # e.g.: "datasource1.connectionpool.reaptime = 7"
        sectionOptions = reader.options(sectionName)

        # in Python 2.1 (WAS 8.x) the options() of a ConfigParser section may be unsorted!
        # The default sort is not enough, because "datasource10" would sort before "datasource2".
        if l.isHellLevelEnabled():
            for optionKey in sectionOptions:
                l.debug666("read original-ordered section/line: %-20s : %s", sectionName, optionKey)
        sortedSectionOptions = sorted(sectionOptions, key=configReaderOptionSort)
        if l.isHellLevelEnabled():
            for optionKey in sortedSectionOptions:
                l.debug666("read num.sorted section/line: %s :\t%s", sectionName, optionKey)

        for optionKey in sortedSectionOptions:
            if re_comment.search(optionKey) is not None:
                l.debug666("skipping comment: %s", optionKey)
                continue
            optionValue = reader.get(sectionName, optionKey)

            # Now fix the *1st* level of enumeration if it is not sequential,
            # e.g. when somebody declares 20 datasources but deletes the 3rd.
            # This way, one does not need to re-enumerate all the
            # other datasource entries in the config file.
            match = re_enumeration.search(optionKey)
            if match is not None:
                configBlockNumber = int(match.group(2))
                if configBlockNumber > lastBlockNumber:
                    # new block found -> increase block counter
                    if l.isDebugEnabled():
                        l.logLF()
                        l.debug("=== read new config block # %d ===", configBlockNumber)
                    expectedBlockNumber = expectedBlockNumber + 1
                    lastBlockNumber = configBlockNumber
                l.debug2("config #:%2d expected #:%2d", configBlockNumber, expectedBlockNumber)

                # check for non-sequential block numbering!
                if configBlockNumber != expectedBlockNumber:
                    l.debug("FIX block numbering: %d -> %d in option: '%s'",
                            configBlockNumber, expectedBlockNumber, optionKey)
                    optionKey = re_enumeration.sub(
                        lambda m, num=expectedBlockNumber: "%s%s." % (m.group(1), num),
                        optionKey)
                    # and store the original block number under a special hash key:
                    originalNumberKey = "%s%d.%s" % (match.group(1), expectedBlockNumber,
                                                     "__ORIGINAL_NUMBER")
                    sectionMap[originalNumberKey.upper()] = configBlockNumber

            l.debug("data: %s = %s", optionKey, optionValue)
            # finally, add the key to the sectionMap hash:
            sectionMap[optionKey.upper()] = optionValue

    return allSectionsMap
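
# The renumbering trick above is easiest to see in isolation : re_enumeration
# captures the prefix and the block number, and sub() rewrites the number to
# the expected sequential value. A standalone demo (renumber is an
# illustrative wrapper, not part of the module) :

import re

re_enum_demo = re.compile(r'^([A-Za-z]+)(\d+)\.', re.IGNORECASE)

def renumber(optionKey, expected):
    return re_enum_demo.sub(lambda m: "%s%d." % (m.group(1), expected), optionKey)

# the 3rd declared datasource becomes block #1 if blocks 1 and 2 were deleted:
# renumber("datasource3.connectionpool.reaptime", 1)
# -> 'datasource1.connectionpool.reaptime'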