def check_cpu_util(util, params, this_time=None, cores=None, perf_max=100):
    # Convert legacy param style to new dict style
    if params is None:
        params = {}
    elif isinstance(params, tuple):
        params = {"util": params}

    if this_time is None:
        this_time = time.time()

    # Old/mixed config may look like:
    # {'util': (80.0, 90.0), 'levels': None}
    # 'levels is None' means: Do not impose levels
    # 'util' from default levels
    if "levels" in params and "util" in params:
        levels = params.get('levels')
    else:
        levels = params.get("util")
        if levels is None:  # legacy rules before 1.6
            levels = params.get("levels")

    warn, crit = levels if isinstance(levels, tuple) else (None, None)  # only for perfdata
    perfdata = [("util", util, warn, crit, 0, perf_max)]

    # Averaging
    if "average" in params:
        util_avg = get_average("cpu_utilization.avg", this_time, util, params["average"])
        perfdata.append(("util_average", util_avg, warn, crit, 0, perf_max))
        state, infotext, extraperf = check_levels(
            util_avg,
            "util_average",
            levels,
            human_readable_func=get_percent_human_readable,
            infoname="Total CPU (%dmin average)" % params["average"])
    else:
        state, infotext, extraperf = check_levels(
            util,
            "util",
            levels,
            human_readable_func=get_percent_human_readable,
            infoname="Total CPU")

    perfdata += extraperf[1:]  # reference curve for predictive levels
    yield state, infotext, perfdata

    if "core_util_time_total" in params:
        threshold, warn, crit = params["core_util_time_total"]
        yield cpu_util_time(this_time, "total", util, threshold, warn, crit)

    if cores and any(x in params for x in [
            "average_single",
            "core_util_graph",
            "core_util_time",
            "levels_single",
    ]):
        for core_index, (core, total_perc) in enumerate(cores):
            yield from _util_perfdata(core, total_perc, core_index, this_time, params)
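
# Usage sketch (hypothetical values; the keys are the ones read by check_cpu_util above).
# With these settings the total utilization is averaged over 5 minutes before the 80/90 %
# levels apply, and per-core results are produced for the `cores` pairs of (name, percent).
_example_cpu_util_params = {
    "util": (80.0, 90.0),      # warn/crit on total CPU in percent
    "average": 5,              # minutes of averaging before applying levels
    "core_util_graph": True,   # emit one metric per core
}
# for result in check_cpu_util(42.0, _example_cpu_util_params,
#                              cores=[("core0", 40.0), ("core1", 44.0)]):
#     print(result)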
def wmi_yield_raw_counter(
    table: WMITable,
    row: Union[str, int],
    column: Union[str, int],
    infoname: Optional[str],
    perfvar: Optional[str],
    levels=None,
    unit: str = "",
):
    if row == "":
        row = 0

    try:
        value = table.get(row, column)
        assert value
    except KeyError:
        return 3, "counter %r not present anymore" % ((row, column),), []

    return check_levels(
        int(value),
        perfvar,
        get_levels_quadruple(levels),
        infoname=infoname,
        unit=unit,
        human_readable_func=str,
    )
def check_cpu_load_generic(params, load, num_cpus=1, processor_type=ProcessorType.unspecified):
    # Prepare performance data
    levels = params.get("levels")
    if isinstance(levels, tuple):
        # fixed levels
        warn, crit = [p * num_cpus for p in levels]
    else:
        # predictive levels
        warn, crit = None, None

    perfdata = [('load' + str(z), l, warn, crit, 0, num_cpus)
                for (z, l) in [(1, load[0]), (5, load[1]), (15, load[2])]]

    state, infotext, perf = check_levels(load[2],
                                         'load15',
                                         levels,
                                         factor=num_cpus,
                                         infoname="15 min load")
    perfdata += perf[1:]

    if num_cpus > 1:
        infotext += _format_cores_info(num_cpus, processor_type, load[2] / num_cpus)
    return state, infotext, perfdata
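
# Usage sketch (illustrative numbers): with fixed levels the thresholds are meant per CPU
# and are scaled via factor=num_cpus, so on a 4-core box (5.0, 10.0) is effectively applied
# as (20.0, 40.0) against the 15-minute load.
_example_cpu_load_params = {"levels": (5.0, 10.0)}
# state, infotext, perfdata = check_cpu_load_generic(
#     _example_cpu_load_params, (3.2, 2.9, 2.5), num_cpus=4)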
def check_aws_metrics(
    metric_infos: List[Dict[str, Union[float, Optional[str], Optional[Tuple], Optional[Callable]]]]
) -> Iterable[ServiceCheckResult]:
    go_stale = True

    for metric_info in metric_infos:
        metric_val = metric_info["metric_val"]
        if metric_val is None:
            continue
        go_stale = False

        yield check_levels(
            metric_val,  # type: ignore[arg-type]
            metric_info.get("metric_name"),  # type: ignore[arg-type]
            metric_info.get("levels"),
            human_readable_func=metric_info.get("human_readable_func"),  # type: ignore[arg-type]
            infoname=metric_info.get("info_name"),  # type: ignore[arg-type]
        )

    if go_stale:
        raise MKCounterWrapped("Currently no data from AWS")
def tolerance_check(
    *,
    set_sync_time: Optional[float],
    levels: Optional[Tuple[float, float]],
    notice_only: bool = False,
) -> Generator[Tuple[int, str], None, None]:
    if set_sync_time is not None:
        set_item_state("time_server", set_sync_time)
        return

    last_sync = get_item_state("time_server")
    now = time.time()
    pot_newline = "\n" if notice_only else ""
    label = "Time since last sync"

    if last_sync is None:
        set_item_state("time_server", now)
        yield 0, f"{pot_newline}{label}: N/A (started monitoring)"
        return

    state, text, _metric = check_levels(
        now - last_sync,
        None,
        levels,
        human_readable_func=render.timespan,
        infoname=label,
    )
    yield state, text if state else f"{pot_newline}{text}"
def check_poe_data(params, poe_data):
    # data sanity-check
    if poe_data.poe_max < 0 or poe_data.poe_used < 0 or poe_data.poe_status not in range(1, 4):
        return 3, "Device returned faulty data: nominal power: %s, power consumption: %s, operational status: %s" % (
            str(poe_data.poe_max),
            str(poe_data.poe_used),
            str(poe_data.poe_status),
        )

    # PoE on device is turned ON
    if poe_data.poe_status == PoeStatus.ON:
        # calculate percentage of power consumption
        poe_used_percentage = ((float(poe_data.poe_used) / float(poe_data.poe_max)) *
                               100) if poe_data.poe_max > 0 else 0

        return check_levels(poe_used_percentage,
                            "power_usage_percentage",
                            params.get("levels", poe_default_levels),
                            human_readable_func=get_percent_human_readable,
                            infoname="POE usage (%sW/%sW): " %
                            (poe_data.poe_used, poe_data.poe_max))

    # PoE on device is turned OFF
    if poe_data.poe_status == PoeStatus.OFF:
        return 0, "Operational status of the PSE is OFF"

    # PoE on device is FAULTY
    if poe_data.poe_status == PoeStatus.FAULTY:
        fault_detail = ""
        if poe_data.poe_status_detail:
            # optionally concat fault detail string
            fault_detail = " (%s)" % poe_data.poe_status_detail
        return 2, "Operational status of the PSE is FAULTY" + fault_detail
def check_netstat_generic(item, params, connections):
    found = 0
    for proto, (local_ip, local_port), (remote_ip, remote_port), connstate in connections:
        # Beware: port numbers are strings here.
        match = True
        for k, v in [
            ("local_ip", local_ip),
            ("local_port", local_port),
            ("remote_ip", remote_ip),
            ("remote_port", remote_port),
            ("proto", proto),
            ("state", connstate),
        ]:
            if k in params and str(params[k]) != v:
                match = False
                break
        if match:
            found += 1

    warn_lower, crit_lower = params.get("min_states", (None, None))
    warn_upper, crit_upper = params.get("max_states", (None, None))

    yield check_levels(found,
                       "connections", (warn_upper, crit_upper, warn_lower, crit_lower),
                       infoname="Matching entries found",
                       human_readable_func=lambda x: "%d" % x)
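
# Usage sketch (made-up values): count established TCP connections on local port 443 and
# alert when fewer than 2 or more than 80 matching entries are found. Note that the agent
# delivers port numbers as strings, which is why the comparison above uses str().
_example_netstat_params = {
    "proto": "TCP",
    "state": "ESTABLISHED",
    "local_port": 443,
    "min_states": (2, 1),     # warn/crit lower
    "max_states": (80, 100),  # warn/crit upper
}
# _connections = [("TCP", ("10.0.0.1", "443"), ("10.0.0.2", "52312"), "ESTABLISHED")]
# list(check_netstat_generic(None, _example_netstat_params, _connections))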
def wmi_yield_raw_persec(
    table: WMITable,
    row: Union[str, int],
    column: Union[str, int],
    infoname: Optional[str],
    perfvar: Optional[str],
    levels=None,
):
    if table is None:
        # This can happen when a check was discovered with a table which subsequently
        # disappeared again. We expect to get None in this case and return a
        # "nothing happened" result.
        return 0, "", []

    if row == "":
        row = 0

    try:
        value = table.get(row, column)
        assert value
    except KeyError:
        return 3, "Item not present anymore", []

    value_per_sec = get_rate("%s_%s" % (column, table.name), get_wmi_time(table, row), int(value))

    return check_levels(
        value_per_sec,
        perfvar,
        get_levels_quadruple(levels),
        infoname=infoname,
    )
def check_firewall_if(item, params, data):
    infotext_names = {
        "ip4_in_blocked": "Incoming IPv4 packets blocked: ",
    }

    this_time = time.time()

    for what, counter in data.items():
        rate = get_rate("firewall_if-%s.%s" % (what, item), this_time, counter, onwrap=RAISE)

        if params.get("averaging"):
            backlog_minutes = params["averaging"]
            avgrate = get_average("firewall_if-%s.%s" % (what, item), this_time, rate,
                                  backlog_minutes)
            check_against = avgrate
        else:
            check_against = rate

        status, infotext, extraperf = check_levels(
            check_against,
            what,
            params.get(what),
            human_readable_func=lambda x: "%.2f pkts/s" % x,
            infoname=infotext_names[what],
        )

        perfdata: List[Any]
        perfdata = [(what, rate)] + extraperf[:1]  # type: ignore[operator]

        yield status, infotext, perfdata
def check_aws_limits(aws_service, params, parsed_region_data):
    """
    Generic check for the limits of an AWS resource.
    - levels: use plain resource_key
    - performance data: "aws_%s_%s" % (aws_service, resource_key)
    """
    long_output = []
    levels_reached = set()
    max_state = 0
    perfdata = []

    for resource_key, resource_title, limit, amount, human_readable_func in parsed_region_data:
        try:
            p_limit, warn, crit = params[resource_key]
        except KeyError:
            yield 1, "Unknown resource %r" % str(resource_key)
            continue

        if p_limit is None:
            limit_ref = limit
        else:
            limit_ref = p_limit

        infotext = "%s: %s (of max. %s)" % (
            resource_title,
            human_readable_func(amount),
            human_readable_func(limit_ref),
        )
        perfvar = "aws_%s_%s" % (aws_service, resource_key)
        if _is_valid_aws_limits_perf_data(resource_key):
            perfdata.append((perfvar, amount))

        if not limit_ref:
            continue

        state, extrainfo, _perfdata = check_levels(
            100.0 * amount / limit_ref,
            None,
            (warn, crit),
            human_readable_func=get_percent_human_readable,
            infoname="Usage",
        )

        max_state = max(state, max_state)
        if state:
            levels_reached.add(resource_title)
            infotext += ", %s%s" % (extrainfo, state_markers[state])
        long_output.append(infotext)

    if levels_reached:
        yield max_state, "Levels reached: %s" % ", ".join(sorted(levels_reached)), perfdata
    else:
        yield 0, "No levels reached", perfdata

    if long_output:
        yield 0, "\n%s" % "\n".join(sorted(long_output))
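
# Usage sketch (hypothetical resource key and numbers): each row of parsed_region_data
# carries the key used to look up the levels, a title, the limit reported by AWS, the
# current amount and a rendering function. A params entry of (None, 80.0, 90.0) means
# "use the AWS limit as reference, warn/crit at 80/90 % usage".
_example_limits_params = {"running_instances": (None, 80.0, 90.0)}
_example_region_rows = [
    ("running_instances", "Running instances", 20, 17, str),
]
# list(check_aws_limits("ec2", _example_limits_params, _example_region_rows))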
def check_ups_capacity(_item, params, info):
    # To support inventories with the old version
    if isinstance(params, tuple):  # old format with 2 params in tuple
        warn, crit = params
        cap_warn, cap_crit = (95, 90)
    elif isinstance(params, dict):  # new dict format
        warn, crit = params.get('battime', (0, 0))
        cap_warn, cap_crit = params.get('capacity', (95, 90))
    else:
        warn, crit = (0, 0)
        cap_warn, cap_crit = (95, 90)

    minutes_on_bat, minutes_left, percent_fuel = (
        int(num) if num.strip() else None for num in info[0])

    on_battery = minutes_left is not None and minutes_on_bat

    # Check time left on battery.
    # `minutes_left` can be 0, which does not always mean that there is no time left;
    # the device might just be running on mains power.
    if on_battery:
        yield check_levels(
            minutes_left * 60,
            "capacity",
            (None, None, warn * 60, crit * 60),
            human_readable_func=get_age_human_readable,
            infoname="Minutes left",
        )
    else:
        yield 0, "on mains"

    # Check capacity in percent; note that capacity is only checked while on battery
    if percent_fuel is not None:
        yield check_levels(
            percent_fuel,
            "percent",
            (None, None, cap_warn, cap_crit) if on_battery else None,
            human_readable_func=get_percent_human_readable,
            infoname="Percent",
        )

    # Output time on battery
    if minutes_on_bat is not None and minutes_on_bat > 0:
        yield 0, "Time running on battery: %s" % get_age_human_readable(minutes_on_bat * 60)
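
# Usage sketch (made-up agent data): info[0] holds minutes on battery, minutes left and the
# remaining capacity in percent, all as strings; empty strings become None. With 0 minutes
# on battery this yields "on mains" plus the capacity result.
_example_ups_params = {"battime": (10, 5), "capacity": (95, 90)}
_example_ups_info = [["0", "30", "100"]]
# list(check_ups_capacity(None, _example_ups_params, _example_ups_info))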
def handle_graylog_messages(messages, params):
    msgs_levels_upper = params.get("msgs_upper", (None, None))
    msgs_levels_lower = params.get("msgs_lower", (None, None))

    yield check_levels(
        messages,
        "messages",
        msgs_levels_upper + msgs_levels_lower,
        human_readable_func=int,
        infoname="Total number of messages",
    )

    avg_key = "msgs_avg"
    avg = params.get(avg_key, 30)
    msgs_avg_levels_upper = params.get("msgs_avg_upper", (None, None))
    msgs_avg_levels_lower = params.get("msgs_avg_lower", (None, None))

    this_time = time.time()
    rate = get_rate("graylog_%s.rate" % avg_key, this_time, messages)
    avg_rate = get_average("graylog_%s.avg" % avg_key, this_time, rate, avg)

    yield check_levels(
        avg_rate,
        avg_key,
        msgs_avg_levels_upper + msgs_avg_levels_lower,
        infoname="Average number of messages (%s)" % get_age_human_readable(avg * 60),
    )

    diff_key = "msgs_diff"
    timespan = params.get(diff_key, 1800)
    diff_levels_upper = params.get("%s_upper" % diff_key, (None, None))
    diff_levels_lower = params.get("%s_lower" % diff_key, (None, None))

    diff = _get_value_diff("graylog_%s" % diff_key, messages, timespan)

    yield check_levels(
        diff,
        "graylog_diff",
        diff_levels_upper + diff_levels_lower,
        infoname="Total number of messages last %s" % get_age_human_readable(timespan),
    )
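
# Usage sketch (illustrative thresholds): upper and lower level pairs are concatenated into
# the quadruples check_levels expects; "msgs_avg" is the averaging horizon in minutes and
# "msgs_diff" the comparison timespan in seconds.
_example_graylog_params = {
    "msgs_upper": (100000, 200000),
    "msgs_avg": 30,
    "msgs_diff": 1800,
}
# list(handle_graylog_messages(54321, _example_graylog_params))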
def ipmi_common_check_levels(sensorname, val, params, unit=""):
    for this_sensorname, levels in params.get("numerical_sensor_levels", []):
        if this_sensorname == sensorname and levels:
            levels_tuple = levels.get('upper', (None, None)) + levels.get('lower', (None, None))
            yield check_levels(val, None, levels_tuple, unit=unit, infoname=sensorname)
            break
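
# Usage sketch (illustrative values): "numerical_sensor_levels" maps a sensor name to a dict
# of optional 'upper'/'lower' warn/crit pairs, which are concatenated into the quadruple
# understood by check_levels.
_example_ipmi_sensor_params = {
    "numerical_sensor_levels": [
        ("Ambient", {"upper": (40.0, 50.0), "lower": (5.0, 1.0)}),
    ],
}
# list(ipmi_common_check_levels("Ambient", 23.0, _example_ipmi_sensor_params, unit="C"))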
def check_fan(rpm, params):
    if isinstance(params, tuple):
        params = {"lower": params}

    levels = params.get("upper", (None, None)) + params["lower"]
    return check_levels(rpm,
                        "fan" if params.get("output_metrics") else None,
                        levels,
                        unit="RPM",
                        human_readable_func=int,
                        infoname="Speed")
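
# Usage sketch (illustrative thresholds): a legacy rule may still hand over a bare
# (warn_lower, crit_lower) tuple, which is converted to the dict form above.
# check_fan(1200, (1000, 500))  # legacy lower levels only
# check_fan(1200, {"lower": (1000, 500), "upper": (8000, 10000), "output_metrics": True})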
def check_uptime_seconds(params, uptime_sec):
    if params is None:  # legacy: support older versions of parameters
        params = {}

    params = params.get("max", (None, None)) + params.get("min", (None, None))

    return check_levels(uptime_sec,
                        "uptime",
                        params,
                        human_readable_func=lambda x: timedelta(seconds=int(x)),
                        infoname="Up since %s, uptime" %
                        time.strftime("%c", time.localtime(time.time() - uptime_sec)))
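
# Usage sketch (illustrative values): levels are given as separate upper ("max") and lower
# ("min") warn/crit pairs in seconds and concatenated into the quadruple for check_levels.
# check_uptime_seconds({"min": (3600, 600), "max": (86400 * 90, 86400 * 180)}, 123456)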
def _check_single_core_util(util, metric, levels, infoname):
    state, infotext, perfdata = check_levels(
        util,
        metric,
        levels,
        human_readable_func=get_percent_human_readable,
        infoname=infoname,
    )
    if not state:
        infotext = ""
    if infotext or perfdata:
        yield state, infotext, perfdata
def wmi_yield_raw_fraction(table, row, column, infoname, perfvar, levels=None):
    try:
        average = wmi_calculate_raw_average(table, row, column, 100)
    except KeyError:
        return 3, "item not present anymore", []

    return check_levels(
        average,
        perfvar,
        _get_levels_quadruple(levels),
        infoname=infoname,
        human_readable_func=get_percent_human_readable,
        boundaries=(0, 100),
    )
def check_printer_io(item, params, parsed: Section, what):
    tray = parsed.get(item)
    if tray is None:
        return

    yield 0, tray.description

    if tray.states.offline:
        yield 2, "Offline"

    if tray.states.transitioning:
        yield 0, "Transitioning"

    yield (
        _STATES_MAP[tray.states.availability],
        "Status: %s" % tray.states.availability.name.replace("_", " ").capitalize(),
    )
    yield (
        tray.states.alerts,
        "Alerts: %s" % ["None", "Non-Critical", "Critical"][tray.states.alerts],
    )

    if tray.level in [-1, -2] or tray.level < -3:
        return  # totally skip this info when level is unknown or not limited

    if tray.capacity_max in (-2, -1, 0):
        # -2: unknown, -1: no restriction, 0: due to saveint
        yield 0, 'Capacity: %s%s' % (tray.level, tray.capacity_unit)
        return

    yield 0, f"Maximal capacity: {tray.capacity_max}{tray.capacity_unit}"

    how = 'remaining' if what == 'input' else 'filled'
    if tray.level == -3:
        yield 0, f"At least one {how}"
        return

    yield check_levels(
        100.0 * tray.level / tray.capacity_max,  # to percent
        None,  # no metric
        # levels[0], levels[1]: warn/crit output (upper)
        # levels[2], levels[3]: warn/crit input (lower)
        ((None, None) if what == 'input' else ()) + params["capacity_levels"],
        infoname=how.capitalize(),
        human_readable_func=get_percent_human_readable,
    )
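
# Usage sketch (hypothetical parsed section): "capacity_levels" is a (warn, crit) pair in
# percent. For output trays it acts as upper levels on the fill level; for input trays the
# (None, None) prefix shifts it into the lower-level slots, so it triggers on the remaining
# capacity instead.
_example_printer_params = {"capacity_levels": (80.0, 90.0)}
# list(check_printer_io("Tray 1", _example_printer_params, parsed_section, "output"))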
def check_aws_error_rate(error_rate, request_rate, metric_name_rate, metric_name_perc, levels,
                         display_text):
    yield (0, '%s: %s' % (display_text, aws_get_counts_rate_human_readable(error_rate)),
           [(metric_name_rate, error_rate)])

    try:
        errors_perc = 100.0 * error_rate / request_rate
    except ZeroDivisionError:
        errors_perc = 0

    yield check_levels(errors_perc,
                       metric_name_perc,
                       levels,
                       human_readable_func=get_percent_human_readable,
                       infoname="%s of total requests" % display_text)
def _check_inodes(levels, inodes_total, inodes_avail):
    if not inodes_total:
        return

    inodes_warn_variant, inodes_crit_variant = levels["inodes_levels"]

    inodes_warn_abs, inodes_crit_abs, human_readable_func = (
        # Levels in absolute numbers
        (
            inodes_total - inodes_warn_variant,
            inodes_total - inodes_crit_variant,
            get_number_with_precision,
        ) if isinstance(inodes_warn_variant, int) else
        # Levels in percent
        (
            (100 - inodes_warn_variant) / 100.0 * inodes_total,
            (100 - inodes_crit_variant) / 100.0 * inodes_total,
            lambda x: get_percent_human_readable(100.0 * x / inodes_total),
        ) if isinstance(inodes_warn_variant, float) else
        # No levels configured
        (None, None, get_number_with_precision))

    inode_status, inode_text, inode_perf = check_levels(
        inodes_total - inodes_avail,
        'inodes_used',
        (inodes_warn_abs, inodes_crit_abs),
        boundaries=(0, inodes_total),
        human_readable_func=human_readable_func,
        infoname="Inodes Used",
    )

    # Only show inodes if they are at less than 50%
    show_inodes = levels["show_inodes"]
    inodes_avail_perc = 100.0 * inodes_avail / inodes_total

    infotext = (
        "%s, inodes available: %s/%s" % (
            inode_text,
            get_number_with_precision(inodes_avail),
            get_percent_human_readable(inodes_avail_perc),
        ) if any((
            show_inodes == "always",
            show_inodes == "onlow" and (inode_status or inodes_avail_perc < 50),
            show_inodes == "onproblem" and inode_status,
        )) else "")

    yield inode_status, infotext, inode_perf
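
# Usage sketch (illustrative thresholds): the type of the configured pair decides its
# meaning above. Floats are percentages of free inodes, ints are absolute numbers of free
# inodes; both are converted into absolute "inodes used" levels.
_example_inode_levels_pct = {"inodes_levels": (10.0, 5.0), "show_inodes": "onlow"}
_example_inode_levels_abs = {"inodes_levels": (10000, 5000), "show_inodes": "always"}
# list(_check_inodes(_example_inode_levels_pct, inodes_total=1000000, inodes_avail=40000))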
def check_humidity(humidity, params):
    if isinstance(params, dict):
        levels = (params.get("levels") or (None, None)) + (params.get("levels_lower") or
                                                           (None, None))
    elif isinstance(params, (list, tuple)):
        # old params = (crit_low, warn_low, warn, crit)
        levels = (params[2], params[3], params[1], params[0])
    else:
        levels = None

    return check_levels(
        humidity,
        "humidity",
        levels,
        human_readable_func=get_percent_human_readable,
        boundaries=(0, 100),
    )
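
# Usage sketch (made-up thresholds) covering both parameter styles handled above:
# check_humidity(55.0, {"levels": (60.0, 65.0), "levels_lower": (30.0, 25.0)})  # dict style
# check_humidity(55.0, (25, 30, 60, 65))  # legacy order: (crit_low, warn_low, warn, crit)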
def wmi_yield_raw_average_timer(table, row, column, infoname, perfvar, levels=None):
    try:
        average = wmi_calculate_raw_average_time(
            table, row, column) / table.frequency()  # fixed: true-division
    except KeyError:
        return 3, "item not present anymore", []

    return check_levels(
        average,
        perfvar,
        _get_levels_quadruple(levels),
        infoname=infoname,
    )
def wmi_yield_raw_average(table, row, column, infoname, perfvar, levels=None, perfscale=1.0):
    try:
        average = wmi_calculate_raw_average(table, row, column, 1) * perfscale
    except KeyError:
        return 3, "item not present anymore", []

    return check_levels(
        average,
        perfvar,
        _get_levels_quadruple(levels),
        infoname=infoname,
        human_readable_func=get_age_human_readable,
    )
def cpu_util_time(this_time, core, perc, threshold, warn_core, crit_core):
    core_state_name = "cpu.util.core.high.%s" % core
    if perc > threshold:
        timestamp = get_item_state(core_state_name, 0)
        high_load_duration = (this_time - timestamp)
        state, infotext, _ = check_levels(
            high_load_duration,
            "%s_is_under_high_load_for" % core,  # Not used
            (warn_core, crit_core),
            human_readable_func=get_age_human_readable,
            infoname="%s is under high load for" % core)
        if timestamp == 0:
            set_item_state(core_state_name, this_time)
        elif state:
            return state, infotext, []
        return 0, "", []

    clear_item_state(core_state_name)
    return 0, "", []
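
# Usage sketch (illustrative values): callers such as check_cpu_util above pass the
# "core_util_time_total" rule as (threshold percent, warn seconds, crit seconds), e.g.
# alert once total utilization has stayed above 90 % for more than 5/10 minutes.
# cpu_util_time(time.time(), "total", 97.0, 90.0, 300, 600)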
def check_azure_metric(  # pylint: disable=too-many-locals
        resource, metric_key, cmk_key, display_name, levels=None, levels_lower=None,
        use_rate=False):
    metric = resource.get('metrics', {}).get(metric_key)
    if metric is None:
        return None

    if use_rate:
        countername = "%s.%s" % (resource['id'], metric_key)
        value = get_rate(countername, time.time(), metric.value)
        unit = "%s_rate" % metric.unit
    else:
        value = metric.value
        unit = metric.unit

    if value is None:
        return 3, "Metric %s is 'None'" % display_name, []

    # convert to SI-unit
    if unit == "milli_seconds":
        value /= 1000.
    elif unit == "seconds_rate":
        # We got seconds, but we computed the rate -> seconds per second:
        # how long something happened / time period = percentage of the time.
        # E.g. CPU time: what percentage of the time was the CPU busy.
        value *= 100.
        unit = "percent"

    return check_levels(
        value,
        cmk_key,
        (levels or (None, None)) + (levels_lower or (None, None)),
        infoname=display_name,
        human_readable_func=_AZURE_METRIC_FMT.get(unit, str),  # type: ignore[arg-type]
        boundaries=(0, None),
    )
def check_aws_metrics(
    metric_infos: List[Dict[str, Union[float, Optional[str], Optional[Tuple], Optional[Callable]]]]
) -> Iterable[ServiceCheckResult]:
    go_stale = True

    for metric_info in metric_infos:
        metric_val = metric_info['metric_val']
        if metric_val is None:
            continue
        go_stale = False

        yield check_levels(metric_val,
                           metric_info.get('metric_name'),
                           metric_info.get('levels'),
                           human_readable_func=metric_info.get('human_readable_func'),
                           infoname=metric_info.get('info_name'))

    if go_stale:
        raise MKCounterWrapped("Currently no data from AWS")
def wmi_yield_raw_average(
    table: WMITable,
    row: Union[str, int],
    column: str,
    infoname: Optional[str],
    perfvar: Optional[str],
    levels=None,
    perfscale: float = 1.0,
):
    try:
        average = wmi_calculate_raw_average(table, row, column, 1) * perfscale
    except KeyError:
        return 3, "item not present anymore", []

    return check_levels(
        average,
        perfvar,
        get_levels_quadruple(levels),
        infoname=infoname,
        human_readable_func=get_age_human_readable,
    )
def wmi_yield_raw_counter(table, row, column, infoname, perfvar, levels=None, unit=""):
    if row == "":
        row = 0

    try:
        value = int(table.get(row, column))
    except KeyError:
        return 3, "counter %r not present anymore" % ((row, column),), []

    return check_levels(
        value,
        perfvar,
        _get_levels_quadruple(levels),
        infoname=infoname,
        unit=unit,
        human_readable_func=str,
    )
def check_ipmi_common_detailed(item, params, data, what, status_txt_mapping):
    val = data["value"]
    unit = data["unit"] or ""
    status_txt = data["status_txt"]
    crit_low = data["crit_low"]
    warn_low = data["warn_low"]
    warn_high = data["warn_high"]
    crit_high = data["crit_high"]

    status = status_txt_mapping(status_txt)
    for wato_status_txt, wato_status in params.get("sensor_states", []):
        if status_txt.startswith(wato_status_txt):
            status = wato_status
            break
    yield status, "Status: %s" % status_txt

    perfdata = []
    if val is not None:
        if what == "ipmitool":
            old_perf_val = str(val) + unit
            perfdata = [(item, old_perf_val, warn_high, crit_high)]
        elif what == "freeipmi" and \
                ("temperature" in item.lower() or "temp" in item.lower() or unit == 'C'):
            # Do not save performance data for FANs. This produces
            # much data and is - in my opinion - useless.
            perfdata = [("value", val, None, crit_high)]

        status, infotext, _ = check_levels(val, None, (warn_high, crit_high, warn_low, crit_low),
                                           unit)
        yield status, infotext, perfdata

        yield from ipmi_common_check_levels(item, val, params, unit)

    # Sensor reports 'nc' ('non critical'), so we set the state to WARNING
    if status_txt.startswith('nc'):
        yield 1, ""
def wmi_yield_raw_average_timer(
    table: WMITable,
    row: Union[str, int],
    column: str,
    infoname: Optional[str],
    perfvar: Optional[str],
    levels=None,
):
    assert table.frequency
    try:
        average = (wmi_calculate_raw_average_time(
            table,
            row,
            column,
        ) / table.frequency)  # fixed: true-division
    except KeyError:
        return 3, "item not present anymore", []

    return check_levels(
        average,
        perfvar,
        get_levels_quadruple(levels),
        infoname=infoname,
    )