def check_firewall_if(item, params, data):
    """Yield one (state, infotext, perfdata) result per firewall counter.

    For every counter in ``data`` a per-second packet rate is computed.
    If ``params["averaging"]`` is set (backlog in minutes), levels are
    applied to the averaged rate; the perfdata always carries the raw rate.
    """
    # Human readable prefix for each known counter key.
    labels = {
        "ip4_in_blocked": "Incoming IPv4 packets blocked: ",
    }
    now = time.time()
    for name, raw_counter in data.items():
        countername = "firewall_if-%s.%s" % (name, item)
        pkt_rate = get_rate(countername, now, raw_counter, onwrap=RAISE)

        # With averaging configured, the levels are checked against the
        # averaged rate rather than the momentary one.
        avg_minutes = params.get("averaging")
        if avg_minutes:
            value = get_average(countername, now, pkt_rate, avg_minutes)
        else:
            value = pkt_rate

        state, text, extraperf = check_levels(
            value,
            name,
            params.get(name),
            human_readable_func=lambda x: "%.2f pkts/s" % x,
            infoname=labels[name],
        )
        # Perfdata: raw (non-averaged) rate plus at most one extra metric
        # from check_levels (e.g. the predictive reference curve).
        perfdata: List[Any] = [(name, pkt_rate)] + extraperf[:1]  # type: ignore[operator]
        yield state, text, perfdata
def check_cpu_util(util, params, this_time=None, cores=None, perf_max=100):
    """Check total CPU utilization against configured levels.

    Yields one main (state, infotext, perfdata) result and, depending on
    ``params``, additional results for high-utilization duration and for
    each single core.

    Args:
        util: Total CPU utilization in percent.
        params: Rule parameters; ``None`` or a legacy ``(warn, crit)``
            tuple is converted to the dict style first.
        this_time: Timestamp used for rate/average computation; defaults
            to ``time.time()``.
        cores: Optional iterable of ``(core_name, total_percent)`` pairs
            for per-core results.
        perf_max: Upper bound written into the ``util`` perfdata entry.
    """
    # Convert legacy param style to new dict style
    if params is None:
        params = {}
    elif isinstance(params, tuple):
        params = {"util": params}

    if this_time is None:
        this_time = time.time()

    # Old/mixed config may look like:
    # {'util': (80.0, 90.0), 'levels': None}
    # 'levels is None' means: Do not impose levels
    # 'util' from default levels
    if "levels" in params and "util" in params:
        levels = params.get('levels')
    else:
        levels = params.get("util")
        if levels is None:  # legacy rules before 1.6
            levels = params.get("levels")

    # warn/crit only decorate the perfdata; check_levels does the real check.
    warn, crit = levels if isinstance(levels, tuple) else (None, None)  # only for perfdata
    perfdata = [("util", util, warn, crit, 0, perf_max)]

    # Averaging: when configured, levels are applied to the averaged value,
    # but the raw utilization perfdata entry is kept as well.
    if "average" in params:
        util_avg = get_average("cpu_utilization.avg", this_time, util, params["average"])
        perfdata.append(("util_average", util_avg, warn, crit, 0, perf_max))
        state, infotext, extraperf = check_levels(
            util_avg,
            "util_average",
            levels,
            human_readable_func=get_percent_human_readable,
            infoname="Total CPU (%dmin average)" % params["average"])
    else:
        state, infotext, extraperf = check_levels(
            util,
            "util",
            levels,
            human_readable_func=get_percent_human_readable,
            infoname="Total CPU")

    perfdata += extraperf[1:]  # reference curve for predictive levels
    yield state, infotext, perfdata

    # Optional: alert when utilization stays above a threshold for too long.
    if "core_util_time_total" in params:
        threshold, warn, crit = params["core_util_time_total"]
        yield cpu_util_time(this_time, "total", util, threshold, warn, crit)

    # Per-core results are only produced when any single-core option is set.
    if cores and any(x in params for x in [
            "average_single",
            "core_util_graph",
            "core_util_time",
            "levels_single",
    ]):
        for core_index, (core, total_perc) in enumerate(cores):
            yield from _util_perfdata(core, total_perc, core_index, this_time, params)
def handle_graylog_messages(messages, params):
    """Yield check results for the Graylog message counter.

    Produces three results: the absolute total, the averaged per-second
    rate, and the change of the counter over a configurable timespan.
    """
    # 1) Absolute number of messages.
    upper = params.get("msgs_upper", (None, None))
    lower = params.get("msgs_lower", (None, None))
    yield check_levels(
        messages,
        "messages",
        upper + lower,
        human_readable_func=int,
        infoname="Total number of messages",
    )

    # 2) Averaged message rate; backlog in minutes (default 30).
    avg_minutes = params.get("msgs_avg", 30)
    avg_upper = params.get("msgs_avg_upper", (None, None))
    avg_lower = params.get("msgs_avg_lower", (None, None))
    now = time.time()
    msg_rate = get_rate("graylog_msgs_avg.rate", now, messages)
    averaged = get_average("graylog_msgs_avg.avg", now, msg_rate, avg_minutes)
    yield check_levels(
        averaged,
        "msgs_avg",
        avg_upper + avg_lower,
        infoname="Average number of messages (%s)" % get_age_human_readable(avg_minutes * 60),
    )

    # 3) Counter difference over a timespan in seconds (default 1800).
    window = params.get("msgs_diff", 1800)
    diff_upper = params.get("msgs_diff_upper", (None, None))
    diff_lower = params.get("msgs_diff_lower", (None, None))
    delta = _get_value_diff("graylog_msgs_diff", messages, window)
    yield check_levels(
        delta,
        "graylog_diff",
        diff_upper + diff_lower,
        infoname="Total number of messages last %s" % get_age_human_readable(window),
    )
def _util_perfdata(core, total_perc, core_index, this_time, params):
    """Yield utilization results for one CPU core (raw and averaged)."""
    # Optional: time spent above a utilization threshold for this core.
    if "core_util_time" in params:
        threshold, warn, crit = params["core_util_time"]
        yield cpu_util_time(this_time, core, total_perc, threshold, warn, crit)

    avg_cfg = params.get('average_single', {})
    name_raw, name_avg = cpu_util_core_name(core, core_index)

    # Only emit a metric when its graph is enabled in the rule.
    if not params.get("core_util_graph"):
        name_raw = None
    if not avg_cfg.get('show_graph'):
        name_avg = None

    # Single-core levels go to either the averaged or the raw value,
    # never to both.
    single_levels = params.get('levels_single')
    if avg_cfg.get('apply_levels'):
        raw_levels, avg_levels = None, single_levels
    else:
        raw_levels, avg_levels = single_levels, None

    yield from _check_single_core_util(
        total_perc,
        name_raw,
        raw_levels,
        "Core %s" % core,
    )

    backlog = avg_cfg.get('time_average')
    if backlog:
        averaged = get_average(
            "cpu_utilization_%d.avg" % core_index,
            this_time,
            total_perc,
            backlog,
        )
        yield from _check_single_core_util(
            averaged,
            name_avg,
            avg_levels,
            "Core %s (%d-min average)" % (core, backlog),
        )
def size_trend(
    check,
    item,
    resource,
    levels,
    used_mb,
    size_mb: float,
    timestamp=None,
):  # pylint: disable=function-redefined
    """Trend computation for size related checks of disks, ram, etc.

    Trends are computed in two steps. In the first step the delta to
    the last check is computed, using a normal check_mk counter.
    In the second step an average over that counter is computed to
    make a long-term prediction.

    Note:
      This function is experimental and may change in future releases.
      Use at your own risk!

    Args:
      check (str): The name of the check, e.g. "df".
      item (str): The name of the item, e.g. the mountpoint "/" for df.
      resource (str): The resource in question, e.g. "disk", "ram", "swap".
      levels (dict): Level parameters for the trend computation. Items:
          "trend_range"          : 24,       # interval for the trend in hours
          "trend_perfdata"       : True      # generate perfomance data for trends
          "trend_bytes"          : (10, 20), # change during trend_range
          "trend_shrinking_bytes": (16, 32), # Bytes of shrinking during trend_range
          "trend_perc"           : (1, 2),   # percent change during trend_range
          "trend_shrinking_perc" : (1, 2),   # percent decreasing change during trend_range
          "trend_timeleft"       : (72, 48)  # time left in hours until full
          "trend_showtimeleft    : True      # display time left in infotext
        The item "trend_range" is required. All other items are optional.
      timestamp (float, optional): Time in secs used to calculate the rate
        and average. Defaults to "None".
      used_mb (float): Used space in MB.
      size_mb (float): Max. available space in MB.

    Returns:
      A tuple of (state, infotext, perfdata) for the trend computation.
      If a MKCounterWrapped occurs (i.e. there is not enough data
      present for the trend computation) the tuple (0, '', []) is
      returned.
    """
    # Perfdata entries are either (name, value) pairs or full
    # (name, value, warn, crit, min, max) tuples.
    perfdata: List[
        Union[  #
            Tuple[str, float],  #
            Tuple[str, float, Optional[float], Optional[float], Optional[float], Optional[float]],
        ]
    ]
    state, infotext, perfdata, problems = 0, "", [], []

    MB = 1024.0 * 1024.0
    H24 = 60 * 60 * 24

    range_hours = levels["trend_range"]
    range_sec = range_hours * 3600.0
    if not timestamp:
        timestamp = time.time()

    # compute current rate in MB/s by computing delta since last check
    try:
        rate = get_rate(
            "%s.%s.delta" % (check, item), timestamp, used_mb, allow_negative=True, onwrap=RAISE
        )
    except MKCounterWrapped:
        # need more data for computing a trend
        return 0, "", []

    if levels.get("trend_perfdata"):
        # growth is scaled from MB/s to MB per 24h for the graph.
        perfdata.append(("growth", rate * H24))

    # average trend in MB/s, initialized with zero (by default)
    rate_avg = get_average("%s.%s.trend" % (check, item), timestamp, rate, range_sec / 60.0)

    # trend: predicted change (in MB) over one trend_range interval.
    trend = rate_avg * range_sec
    sign = "+" if trend > 0 else ""
    infotext += ", trend: %s%s / %g hours" % (
        sign,
        get_bytes_human_readable(trend * MB),
        range_hours,
    )

    # levels for performance data
    warn_perf: Optional[float] = None
    crit_perf: Optional[float] = None

    # apply levels for absolute growth / interval
    trend_bytes = levels.get("trend_bytes")
    if trend_bytes:
        wa, cr = trend_bytes
        warn_perf, crit_perf = wa / MB, cr / MB
        if trend * MB >= wa:
            # "(!" is opened here; "!" may be added for crit, then ")" closes it.
            problems.append(
                "growing too fast (warn/crit at %s/%s per %.1f h)(!" % (
                    get_bytes_human_readable(wa),
                    get_bytes_human_readable(cr),
                    range_hours,
                )
            )
            state = max(1, state)
            if trend * MB >= cr:
                state = 2
                problems[-1] += "!"
            problems[-1] += ")"

    # shrinking check on the absolute trend (bytes per trend_range)
    tmp_state, tmp_problem = _check_shrinking(
        trend * MB,
        levels.get("trend_shrinking_bytes"),
        range_hours,
        get_bytes_human_readable,
    )
    if tmp_state > 0:
        state = max(state, tmp_state)
        problems.append(tmp_problem)

    # apply levels for growth relative to filesystem size
    trend_perc: Optional[Tuple[float, float]] = levels.get("trend_perc")
    if trend_perc:
        wa_perc, cr_perc = trend_perc
        wa = wa_perc / 100.0 * size_mb
        cr = cr_perc / 100.0 * size_mb
        if warn_perf is not None:
            # Both absolute and percentage levels given: the stricter
            # (smaller) one is used for the perfdata thresholds.
            assert crit_perf is not None
            warn_perf = min(warn_perf, wa)
            crit_perf = min(crit_perf, cr)
        else:
            warn_perf, crit_perf = wa, cr
        if trend >= wa:
            problems.append(
                "growing too fast (warn/crit at %s/%s per %.1f h)(!" % (
                    get_percent_human_readable(wa_perc),
                    get_percent_human_readable(cr_perc),
                    range_hours,
                )
            )
            state = max(1, state)
            if trend >= cr:
                state = 2
                problems[-1] += "!"
            problems[-1] += ")"

    # shrinking check on the relative trend (percent per trend_range)
    tmp_state, tmp_problem = _check_shrinking(
        100 * trend / size_mb,
        levels.get("trend_shrinking_perc"),
        range_hours,
        get_percent_human_readable,
    )
    if tmp_state > 0:
        state = max(state, tmp_state)
        problems.append(tmp_problem)

    # compute time until filesystem is full (only for positive trend, of course)
    # The start value of hours_left is negative. The pnp graph and the perfometer
    # will interpret this as inifinite -> not growing
    hours_left = -1
    if trend > 0:

        def format_hours(hours):
            # Coarser units the further away the event is.
            if hours > 365 * 24:
                return "more than a year"
            elif hours > 90 * 24:
                return "%0d months" % (hours / (30 * 24))  # fixed: true-division
            elif hours > 4 * 7 * 24:  # 4 weeks
                return "%0d weeks" % (hours / (7 * 24))  # fixed: true-division
            elif hours > 7 * 24:  # 1 week
                return "%0.1f weeks" % (hours / (7 * 24))  # fixed: true-division
            elif hours > 2 * 24:  # 2 days
                return "%0.1f days" % (hours / 24)  # fixed: true-division
            return "%d hours" % hours

        hours_left = (size_mb - used_mb) / trend * range_hours
        hours_txt = format_hours(hours_left)

        timeleft = levels.get("trend_timeleft")
        if timeleft:
            wa, cr = timeleft
            if hours_left <= cr:
                state = 2
                problems.append("only %s until %s full(!!)" % (hours_txt, resource))
            elif hours_left <= wa:
                state = max(state, 1)
                problems.append("only %s until %s full(!)" % (hours_txt, resource))
            elif hours_left <= wa * 2 or levels.get("trend_showtimeleft"):
                problems.append("time left until %s full: %s" % (resource, hours_txt))
        elif levels.get("trend_showtimeleft"):
            # No timeleft levels configured, but the user wants to see the ETA.
            problems.append("time left until %s full: %s" % (resource, hours_txt))

    if levels.get("trend_perfdata"):
        perfdata.append(
            (
                "trend",
                rate_avg * H24,
                # thresholds scaled from MB/trend_range to MB per 24h
                (warn_perf / range_sec * H24) if warn_perf is not None else None,
                (crit_perf / range_sec * H24) if crit_perf is not None else None,
                0,
                1.0 * size_mb / range_hours,
            )
        )

    if levels.get("trend_showtimeleft"):
        # hours_left stays -1 when the trend is not positive (see above).
        perfdata.append(("trend_hoursleft", hours_left))

    if problems:
        infotext += " - %s" % ", ".join(problems)

    return state, infotext, perfdata
def check_temperature_trend(temp, params, output_unit, crit, crit_lower, unique_name):
    """Check the temperature trend over a configured period.

    Computes the averaged temperature change per ``params["period"]``
    minutes, checks it against upper/lower trend levels and - if
    configured - estimates the time left until the crit limit is hit.

    Returns:
        A ``(status, infotext)`` tuple accumulated over all sub-checks.
    """
    # Accumulator implemented via attributes on the closure itself:
    # worst state wins, infotexts are comma-joined.
    def combiner(status, infotext):
        if "status" in dir(combiner):
            combiner.status = max(combiner.status, status)
        else:
            combiner.status = status

        if "infotext" in dir(combiner):
            combiner.infotext += ", " + infotext
        else:
            combiner.infotext = infotext

    try:
        trend_range_min = params["period"]
        this_time = time.time()

        # first compute current rate in C/s by computing delta since last check
        rate = get_rate("temp.%s.delta" % unique_name, this_time, temp, allow_negative=True)

        # average trend, initialize with zero (by default), rate_avg is in C/s
        rate_avg = get_average("temp.%s.trend" % unique_name, this_time, rate, trend_range_min)

        # rate_avg is growth in C/s, trend is in C per trend range minutes
        trend = float(rate_avg * trend_range_min * 60.0)
        sign = "+" if trend > 0 else ""
        combiner(
            0,
            "rate: %s%s/%g min" % (sign, render_temp(trend, output_unit, True), trend_range_min))

        if "trend_levels" in params:
            warn_upper_trend, crit_upper_trend = params["trend_levels"]
        else:
            warn_upper_trend = crit_upper_trend = None
        # it may be unclear to the user if he should specify temperature decrease as a negative
        # number or positive. This works either way. Having a positive lower bound makes no
        # sense anyway.
        if "trend_levels_lower" in params:
            warn_lower_trend, crit_lower_trend = [
                abs(x) * -1 for x in params["trend_levels_lower"]
            ]
        else:
            warn_lower_trend = crit_lower_trend = None

        if crit_upper_trend is not None and trend > crit_upper_trend:
            combiner(
                2,
                "rising faster than %s/%g min(!!)" % (render_temp(
                    crit_upper_trend, output_unit, True), trend_range_min),
            )
        elif warn_upper_trend is not None and trend > warn_upper_trend:
            combiner(
                1,
                "rising faster than %s/%g min(!)" % (render_temp(
                    warn_upper_trend, output_unit, True), trend_range_min),
            )
        elif crit_lower_trend is not None and trend < crit_lower_trend:
            combiner(
                2,
                "falling faster than %s/%g min(!!)" % (render_temp(
                    crit_lower_trend, output_unit, True), trend_range_min),
            )
        elif warn_lower_trend is not None and trend < warn_lower_trend:
            combiner(
                1,
                "falling faster than %s/%g min(!)" % (render_temp(
                    warn_lower_trend, output_unit, True), trend_range_min),
            )

        if "trend_timeleft" in params:
            # compute time until temperature limit is reached
            # The start value of minutes_left is negative. The pnp graph and the perfometer
            # will interpret this as infinite -> not growing
            limit = crit if trend > 0 else crit_lower

            if limit:  # crit levels may not be set, especially lower level
                diff_to_limit = limit - temp
                if rate_avg != 0.0:
                    minutes_left = (diff_to_limit / rate_avg) / 60.0  # fixed: true-division
                else:
                    minutes_left = float("inf")

                def format_minutes(minutes):
                    if minutes > 60:  # hours
                        hours = int(minutes / 60.0)
                        minutes += -int(hours) * 60
                        return "%dh %02dm" % (hours, minutes)
                    return "%d minutes" % minutes

                # NOTE(review): this rebinding shadows the 'crit' parameter,
                # which is no longer needed at this point.
                warn, crit = params["trend_timeleft"]
                if minutes_left <= crit:
                    combiner(
                        2,
                        "%s until temp limit reached(!!)" % format_minutes(minutes_left))
                elif minutes_left <= warn:
                    combiner(
                        1,
                        "%s until temp limit reached(!)" % format_minutes(minutes_left))
    except MKCounterWrapped:
        # Not enough counter data yet; presumably combiner has been called
        # at least once before this can happen - TODO confirm get_rate's
        # wrap behavior here.
        pass

    return combiner.status, combiner.infotext
def check_diskstat_dict(item, params, disks):
    """Check one disk (selected by ``item``) from a diskstat dict.

    Yields one result per known metric (utilization, throughput, waits,
    latencies, queue lengths, IOs) and a final perfdata-only result for
    all remaining numeric metrics. Known metrics are popped from the
    (possibly averaged copy of the) disk dict as they are processed.
    """
    # Take care of previously discovered services
    if item in ("read", "write"):
        yield 3, "Sorry, the new version of this check does not " \
                 "support one service for read and one for write anymore."
        return

    this_time = time.time()
    disk = diskstat_select_disk(disks, item)
    if not disk:
        return

    # Averaging
    # Note: this check uses a simple method of averaging: As soon as averaging
    # is turned on the actual metrics are *replaced* by the averaged ones. No
    # duplication of performance data or check output here. This is because we
    # have so many metrics...
    prefix = ""
    averaging = params.get("average")  # in seconds here!
    if averaging:
        avg_disk = {}  # Do not modify our arguments!!
        for key, value in disk.items():
            if isinstance(value, (int, float)):
                # get_average expects the backlog in minutes, hence / 60.0
                avg_disk[key] = get_average("diskstat.%s.%s.avg" % (item, key), this_time, value,
                                            averaging / 60.0)
            else:
                avg_disk[key] = value
        disk = avg_disk
        prefix = "%s average: " % get_age_human_readable(averaging)

    # Utilization
    if "utilization" in disk:
        util = disk.pop("utilization")
        yield check_levels(util,
                           "disk_utilization",
                           params.get("utilization"),
                           human_readable_func=lambda x: get_percent_human_readable(x * 100.0),
                           scale=0.01,
                           statemarkers=False,
                           infoname=prefix + "Utilization")

    # Throughput
    for what in "read", "write":
        if what + "_throughput" in disk:
            throughput = disk.pop(what + "_throughput")
            yield check_levels(throughput,
                               "disk_" + what + "_throughput",
                               params.get(what),
                               unit="/s",
                               scale=1048576,
                               statemarkers=False,
                               human_readable_func=get_bytes_human_readable,
                               infoname=what.title())

    # Average wait from end to end
    for what in ["wait", "read_wait", "write_wait"]:
        if "average_" + what in disk:
            wait = disk.pop("average_" + what)
            yield check_levels(wait,
                               "disk_average_" + what,
                               params.get(what),
                               unit="ms",
                               scale=0.001,
                               statemarkers=False,
                               infoname="Average %s" % what.title().replace("_", " "))

    # Average disk latency
    if "latency" in disk:
        latency = disk.pop("latency")
        yield check_levels(latency,
                           "disk_latency",
                           params.get("latency"),
                           unit="ms",
                           scale=0.001,
                           statemarkers=False,
                           infoname='Latency')

    # Read/write disk latency
    for what in ["read", "write"]:
        latency_key = "%s_latency" % what
        if latency_key not in disk:
            continue
        latency = disk.pop(latency_key)
        if latency is not None:
            yield check_levels(latency,
                               "disk_%s" % latency_key,
                               params.get(latency_key),
                               unit="ms",
                               scale=0.001,
                               statemarkers=False,
                               infoname='%s latency' % what.title())

    # Queue lengths
    for what, plugin_text in [
        ("queue_length", "Queue Length"),
        ("read_ql", "Read Queue Length"),
        ("write_ql", "Write Queue Length"),
    ]:
        if what in disk:
            ql = disk.pop(what)
            yield check_levels(ql,
                               "disk_" + what,
                               params.get(what),
                               statemarkers=False,
                               infoname="Average %s" % plugin_text)

    # I/O operations
    for what in "read", "write":
        if what + "_ios" in disk:
            ios = disk.pop(what + "_ios")
            yield check_levels(
                ios,
                "disk_" + what + "_ios",
                params.get(what + "_ios"),
                unit="1/s",
                statemarkers=False,
                infoname="%s operations" % what.title(),
            )

    # All the other metrics are currently not output in the plugin output - simply because
    # of their amount. They are present as performance data and will shown in graphs.

    # Send everything as performance data now. Sort keys alphabetically
    perfdata = []
    for key in sorted(disk):
        value = disk[key]
        if isinstance(value, (int, float)):
            # Currently the levels are not shown in the perfdata
            perfdata.append(("disk_" + key, value))

    if perfdata:
        yield 0, '', perfdata
def check_diskstat_line(this_time, item, params, line, mode='sectors'):
    """Check one line of legacy diskstat data.

    Args:
        this_time: Timestamp for rate computation.
        item: The disk item name.
        params: Check parameters (levels, averaging, perfdata switches).
        line: Raw data row: [node, _, read_ctr, write_ctr, (read_ios,
            write_ios, (time_ms, (read_ql_ctr, write_ql_ctr)))] with the
            trailing fields optional.
        mode: 'sectors' (counter * 512 = bytes) or 'bytes'.

    Returns:
        A ``(status, infotext, perfdata)`` tuple.
    """
    average_range = params.get("average")
    if average_range == 0:
        average_range = None  # disable averaging when 0 is set

    perfdata = []
    infos = []
    status = 0
    node = line[0]
    if node is not None and node != "":
        infos.append("Node %s" % node)

    for what, ctr in [("read", line[2]), ("write", line[3])]:
        if node:
            countername = "diskstat.%s.%s.%s" % (node, item, what)
        else:
            countername = "diskstat.%s.%s" % (item, what)

        # unpack levels now, need also for perfdata
        levels = params.get(what)
        if isinstance(levels, tuple):
            warn, crit = levels
        else:
            warn, crit = None, None

        per_sec = get_rate(countername, this_time, int(ctr))
        if mode == 'sectors':
            # compute IO rate in bytes/sec
            bytes_per_sec = per_sec * 512
        elif mode == 'bytes':
            bytes_per_sec = per_sec

        dsname = what

        # compute average of the rate over ___ minutes
        if average_range is not None:
            perfdata.append((dsname, bytes_per_sec, warn, crit))
            bytes_per_sec = get_average(countername + ".avg", this_time, bytes_per_sec,
                                        average_range)
            dsname += ".avg"

        # check levels
        state, text, extraperf = check_levels(bytes_per_sec,
                                              dsname,
                                              levels,
                                              scale=1048576,
                                              statemarkers=True,
                                              unit='/s',
                                              human_readable_func=get_bytes_human_readable,
                                              infoname=what)
        if text:
            infos.append(text)
        status = max(state, status)
        perfdata += extraperf

    # Add performance data for averaged IO
    # NOTE(review): this positional reshuffle pairs each raw metric with its
    # averaged counterpart; it assumes exactly two perfdata entries per loop
    # iteration - verify when check_levels returns additional perfdata.
    if average_range is not None:
        perfdata = [perfdata[0], perfdata[2], perfdata[1], perfdata[3]]

    # Process IOs when available
    # NOTE(review): 'countername' below deliberately reuses the value left
    # over from the last loop iteration (the "write" counter name).
    ios_per_sec = None
    if len(line) >= 6 and line[4] >= 0 and line[5] > 0:
        reads, writes = map(int, line[4:6])
        if "read_ios" in params:
            warn, crit = params["read_ios"]
            if reads >= crit:
                infos.append('Read operations: %d (!!)' % (reads))
                status = 2
            elif reads >= warn:
                infos.append('Read operations: %d (!)' % (reads))
                status = max(status, 1)
        else:
            warn, crit = None, None
        if "write_ios" in params:
            warn, crit = params["write_ios"]
            if writes >= crit:
                infos.append('Write operations: %d (!!)' % (writes))
                status = 2
            elif writes >= warn:
                infos.append('Write operations: %d (!)' % (writes))
                status = max(status, 1)
        else:
            warn, crit = None, None
        ios = reads + writes
        ios_per_sec = get_rate(countername + ".ios", this_time, ios)
        infos.append("IOs: %.2f/sec" % ios_per_sec)

        if params.get("latency_perfdata"):
            perfdata.append(("ios", ios_per_sec))

    # Do Latency computation if this information is available:
    if len(line) >= 7 and line[6] >= 0:
        timems = int(line[6])
        timems_per_sec = get_rate(countername + ".time", this_time, timems)
        if not ios_per_sec:
            latency = 0.0
        else:
            latency = timems_per_sec / ios_per_sec  # fixed: true-division
        infos.append("Latency: %.2fms" % latency)
        if "latency" in params:
            warn, crit = params["latency"]
            if latency >= crit:
                status = 2
                infos[-1] += "(!!)"
            elif latency >= warn:
                status = max(status, 1)
                infos[-1] += "(!)"
        else:
            warn, crit = None, None

        if params.get("latency_perfdata"):
            perfdata.append(("latency", latency, warn, crit))

    # Queue Lengths (currently only Windows). Windows uses counters here.
    # I have not understood, why....
    if len(line) >= 9:
        for what, ctr in [("read", line[7]), ("write", line[8])]:
            countername = "diskstat.%s.ql.%s" % (item, what)
            levels = params.get(what + "_ql")
            if levels:
                warn, crit = levels
            else:
                warn, crit = None, None

            qlx = get_rate(countername, this_time, int(ctr))
            ql = qlx / 10000000.0
            infos.append(what.title() + " Queue: %.2f" % ql)

            # check levels
            if levels is not None:
                if ql >= crit:
                    status = 2
                    infos[-1] += "(!!)"
                elif ql >= warn:
                    status = max(status, 1)
                    infos[-1] += "(!)"

            if params.get("ql_perfdata"):
                perfdata.append((what + "_ql", ql))

    return (status, ", ".join(infos), perfdata)
def check_memory(params, meminfo):
    """Check memory usage based on a /proc/meminfo-style dict.

    Yields the main total-usage result plus additional results for RAM,
    swap, pagetables and - when present - further Linux memory metrics.

    Args:
        params: Check parameters; a legacy ``(warn, crit)`` tuple is
            converted to ``{"levels": ...}`` first.
        meminfo: Mapping of meminfo keys (in kB) to values, e.g.
            'MemTotal', 'MemFree', 'SwapTotal', ...
    """
    if isinstance(params, tuple):
        params = {"levels": params}

    memtotal = MemBytes(meminfo['MemTotal'])
    memused = MemBytes(memtotal.kb - meminfo['MemFree'])

    if "SwapFree" in meminfo:
        swaptotal = MemBytes(meminfo['SwapTotal'])
        swapused = MemBytes(swaptotal.kb - meminfo['SwapFree'])
        perfdata = [('swap_used', swapused.bytes, None, None, 0, swaptotal.bytes)]
    else:
        swaptotal = None
        swapused = None
        perfdata = []

    # Size of Pagetable on Linux can be relevant e.g. on ORACLE
    # servers with much memory, that do not use HugeTables. We account
    # that for used
    if 'PageTables' in meminfo:
        pagetables = MemBytes(meminfo['PageTables'])
        perfdata.append(('mem_lnx_page_tables', pagetables.bytes))
    else:
        pagetables = None

    # Buffers and Cached are optional. On Linux both mean basically the same.
    caches = MemBytes(meminfo.get('Buffers', 0) + meminfo.get('Cached', 0))

    ramused = MemBytes(memused.kb - caches.kb)
    perfdata.append(('mem_used', ramused.bytes, None, None, 0, memtotal.bytes))
    perfdata.append(
        ('mem_used_percent', 100. * ramused.bytes / memtotal.bytes, None, None, 0, 100.))

    totalused, totalused_descr = _get_total_usage(ramused, swapused, pagetables)

    infotext = check_memory_element(
        totalused_descr,
        totalused.bytes,
        memtotal.bytes,
        None,
        label_total="RAM" if totalused_descr != "RAM" else "",
    )[1]

    # Take into account averaging
    average_min = params.get("average")
    if average_min:
        totalused_mb_avg = get_average("mem.used.total",
                                       time.time(),
                                       totalused.mb,
                                       average_min,
                                       initialize_zero=False)
        totalused_perc_avg = totalused_mb_avg / memtotal.mb * 100
        infotext += ", %d min average %.1f%%" % (average_min, totalused_perc_avg)
        perfdata.append(('memusedavg', totalused_mb_avg))
        comp_mb = totalused_mb_avg
    else:
        comp_mb = totalused.mb

    # Normalize levels and check them
    totalvirt = MemBytes((swaptotal.kb if swaptotal is not None else 0) + memtotal.kb)
    warn, crit = params.get("levels", (None, None))
    mode = get_levels_mode_from_value(warn)
    # NOTE(review): abs(warn)/abs(crit) presumes numeric levels here;
    # behavior with absent levels (None) depends on
    # get_levels_mode_from_value / normalize_mem_levels - TODO confirm.
    warn_mb, crit_mb, levels_text = normalize_mem_levels(
        mode,
        abs(warn),
        abs(crit),
        totalvirt.mb,
        _perc_total=memtotal.mb,
        render_unit=1024**2,
    )
    perfdata.append(('mem_lnx_total_used', totalused.bytes, warn_mb * 1024**2, crit_mb * 1024**2,
                     0, totalvirt.bytes))

    # Check levels
    state = _compute_state(comp_mb, warn_mb, crit_mb)
    if state and levels_text:
        infotext = "%s (%s)" % (infotext, levels_text)
    yield state, infotext, perfdata

    # Only emit a separate RAM result when the main result covered more
    # than plain RAM (e.g. RAM + swap + pagetables).
    if totalused_descr != "RAM":
        yield check_memory_element(
            "RAM",
            ramused.bytes,  # <- caches subtracted
            memtotal.bytes,
            None,
        )

    if swaptotal and swaptotal.bytes:
        yield check_memory_element(
            "Swap",
            swapused.bytes,
            swaptotal.bytes,
            None,
        )

    if pagetables:
        yield 0, "Pagetables: %s" % pagetables.render(), []

    # Add additional metrics, provided by Linux.
    if meminfo.get('Mapped'):
        for key, label, metric in (
            ('Mapped', 'Mapped', 'mem_lnx_mapped'),
            ('Committed_AS', 'Committed', 'mem_lnx_committed_as'),
            ('Shmem', 'Shared', 'mem_lnx_shmem'),
        ):
            value = MemBytes(meminfo.get(key, 0))
            yield 0, "%s: %s" % (label, value.render()), [(metric, value.bytes)]