def __init__(self, hostname, namespace, diag_hostname):
    """Prepare the memory-usage diagnostic and kick off the first check.

    Args:
        hostname: Stored as the ``hardware_id`` of the usage status.
        namespace: Prefix of the updater name (``namespace + 'mem'``).
        diag_hostname: Prefix of the updater display name.
    """
    self._diag_updater = DiagnosticUpdater(
        name=namespace + 'mem',
        display_name=diag_hostname + ' memory',
    )

    # Plain bookkeeping state.
    self._namespace = namespace
    self._mutex = threading.Lock()
    self._usage_timer = None
    self._last_usage_time = 0

    # Thresholds are read from private ROS parameters; the fallbacks are the
    # `mem_level_warn` / `mem_level_error` names defined outside this method.
    self._mem_level_warn = rospy.get_param('~mem_level_warn', mem_level_warn)
    self._mem_level_error = rospy.get_param('~mem_level_error',
                                            mem_level_error)

    # Placeholder status that is held until a real measurement replaces it.
    placeholder = DiagnosticStatus()
    placeholder.name = 'Memory Usage'
    # NOTE(review): 1 presumably corresponds to DiagnosticStatus.WARN --
    # confirm against the message definition.
    placeholder.level = 1
    placeholder.hardware_id = hostname
    placeholder.message = 'No Data'
    placeholder.values = [
        KeyValue(key='Update Status', value='No Data'),
        KeyValue(key='Time Since Last Update', value='N/A'),
    ]
    self._usage_stat = placeholder

    self._usage_diagnostic = GenericDiagnostic('/usage')
    self._usage_diagnostic.add_to_updater(self._diag_updater)

    # Start checking everything
    self.check_usage()
def __init__(self, hostname, namespace, diag_hostname):
    """Set up HDD temperature/usage diagnostics and start the checks.

    All attributes are assigned *before* any check is started.  The checks
    may call ``cancel_timers()``, which reads both ``_temp_timer`` and
    ``_usage_timer``; starting ``check_temps()`` before ``_usage_timer``
    existed could raise ``AttributeError`` if ROS was shutting down.

    Args:
        hostname: Stored on the instance and used as ``hardware_id`` of the
            statuses.
        namespace: Prefix of the updater name (``namespace + 'hdd'``).
        diag_hostname: Prefix of the updater display name.
    """
    self._mutex = threading.Lock()

    self._hostname = hostname
    self._namespace = namespace

    # Configuration from private ROS parameters; the threshold fallbacks are
    # the `hdd_*` names defined outside this method.
    self._no_temp = rospy.get_param('~no_hdd_temp', False)
    self._no_temp_warn = rospy.get_param('~no_hdd_temp_warn', False)
    self._hdd_level_warn = rospy.get_param('~hdd_level_warn', hdd_level_warn)
    self._hdd_level_error = rospy.get_param('~hdd_level_error',
                                            hdd_level_error)
    self._hdd_temp_warn = rospy.get_param('~hdd_temp_warn', hdd_temp_warn)
    self._hdd_temp_error = rospy.get_param('~hdd_temp_error', hdd_temp_error)

    # Timer bookkeeping for both checks must exist before either check runs.
    self._last_temp_time = 0
    self._temp_timer = None
    self._last_usage_time = 0
    self._usage_timer = None

    self._diag_updater = DiagnosticUpdater(
        name=namespace + 'hdd',
        display_name=diag_hostname + ' HDD',
    )

    # Temperature diagnostics are optional (disabled by ~no_hdd_temp).
    self._temp_stat = None
    self._temp_diagnostic = None
    if not self._no_temp:
        self._temp_stat = DiagnosticStatus()
        self._temp_stat.name = "HDD Temperature"
        self._temp_stat.level = DiagnosticStatus.ERROR
        self._temp_stat.hardware_id = hostname
        self._temp_stat.message = 'No Data'
        self._temp_stat.values = [
            KeyValue(key='Update Status', value='No Data'),
            KeyValue(key='Time Since Last Update', value='N/A'),
        ]
        self._temp_diagnostic = GenericDiagnostic('/temp')
        self._temp_diagnostic.add_to_updater(self._diag_updater)

    self._usage_stat = DiagnosticStatus()
    self._usage_stat.level = DiagnosticStatus.ERROR
    self._usage_stat.hardware_id = hostname
    self._usage_stat.name = 'HDD Usage'
    self._usage_stat.values = [
        KeyValue(key='Update Status', value='No Data'),
        KeyValue(key='Time Since Last Update', value='N/A'),
    ]
    self._usage_diagnostic = GenericDiagnostic('/usage')
    self._usage_diagnostic.add_to_updater(self._diag_updater)

    # Start the checks only now that every attribute they touch exists.
    if not self._no_temp:
        self.check_temps()
    self.check_disk_usage()
def __init__(self, hostname, namespace, diag_hostname):
    """Prepare the network-usage diagnostic and kick off the first check.

    Args:
        hostname: Stored as the ``hardware_id`` of the usage status.
        namespace: Prefix of the updater name (``namespace + 'net'``).
        diag_hostname: Prefix of the updater display name.
    """
    self._diag_updater = DiagnosticUpdater(
        name=namespace + 'net',
        display_name=diag_hostname + ' network',
    )

    # Plain bookkeeping state.
    self._namespace = namespace
    self._mutex = threading.Lock()
    self._usage_timer = None
    self._last_usage_time = 0

    # Settings are read from private ROS parameters; the fallbacks are the
    # `net_level_warn` / `net_capacity` names defined outside this method.
    self._net_level_warn = rospy.get_param('~net_level_warn', net_level_warn)
    self._net_capacity = rospy.get_param('~net_capacity', net_capacity)

    # Placeholder status that is held until a real measurement replaces it.
    placeholder = DiagnosticStatus()
    placeholder.name = 'Network Usage'
    # NOTE(review): 1 presumably corresponds to DiagnosticStatus.WARN --
    # confirm against the message definition.
    placeholder.level = 1
    placeholder.hardware_id = hostname
    placeholder.message = 'No Data'
    placeholder.values = [
        KeyValue(key='Update Status', value='No Data'),
        KeyValue(key='Time Since Last Update', value='N/A'),
    ]
    self._usage_stat = placeholder

    self._usage_diagnostic = GenericDiagnostic('/usage')
    self._usage_diagnostic.add_to_updater(self._diag_updater)

    self.check_usage()
class MemMonitor():
    """Reports memory usage diagnostics based on the output of ``free -tm``."""

    def __init__(self, hostname, namespace, diag_hostname):
        """Set up the memory-usage diagnostic and start the first check.

        Args:
            hostname: Stored as the ``hardware_id`` of the usage status.
            namespace: Prefix of the updater name (``namespace + 'mem'``).
            diag_hostname: Prefix of the updater display name.
        """
        self._diag_updater = DiagnosticUpdater(
            name=namespace + 'mem',
            display_name=diag_hostname + ' memory',
        )
        self._namespace = namespace

        self._mutex = threading.Lock()

        self._mem_level_warn = rospy.get_param('~mem_level_warn',
                                               mem_level_warn)
        self._mem_level_error = rospy.get_param('~mem_level_error',
                                                mem_level_error)

        self._usage_timer = None

        # Placeholder status held until the first measurement is available.
        self._usage_stat = DiagnosticStatus()
        self._usage_stat.name = 'Memory Usage'
        self._usage_stat.level = 1
        self._usage_stat.hardware_id = hostname
        self._usage_stat.message = 'No Data'
        self._usage_stat.values = [
            KeyValue(key='Update Status', value='No Data'),
            KeyValue(key='Time Since Last Update', value='N/A'),
        ]
        self._usage_diagnostic = GenericDiagnostic('/usage')
        self._usage_diagnostic.add_to_updater(self._diag_updater)

        self._last_usage_time = 0

        # Start checking everything
        self.check_usage()

    ## Must have the lock to cancel everything
    def cancel_timers(self):
        """Cancel the pending usage timer, if one is set."""
        if self._usage_timer:
            self._usage_timer.cancel()

    def check_memory(self):
        """Run ``free -tm`` and classify the physical memory usage.

        Usage is computed as ``(total - available) / total`` from the second
        output row and compared against ``_mem_level_warn`` and
        ``_mem_level_error``.

        Returns:
            A ``(level, msg, values)`` tuple on every path: the diagnostic
            level, a short message, and a list of ``KeyValue`` details.
            Failures (non-zero exit status or any exception while parsing)
            are reported as ``DiagnosticStatus.ERROR`` instead of raising.
        """
        values = []
        level = DiagnosticStatus.OK
        msg = ''

        mem_dict = {0: 'OK', 1: 'Low memory', 2: 'Very low memory'}

        try:
            p = subprocess.Popen('free -tm',
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 shell=True)
            stdout, stderr = p.communicate()
            retcode = p.returncode

            if retcode != 0:
                values.append(
                    KeyValue(key="\"free -tm\" Call Error",
                             value=str(retcode)))
                # Keep the same 3-tuple shape as every other return path so
                # callers can always unpack (level, msg, values).
                return (DiagnosticStatus.ERROR,
                        'Memory usage check error',
                        values)

            rows = stdout.split('\n')

            # Second row holds the physical memory figures; only the columns
            # that are actually reported below are parsed.
            data = rows[1].split()
            total_mem_physical = int(data[1])
            used_mem_physical = int(data[2])
            buff_cache_mem = int(data[5])
            available_mem = int(data[6])
            total_mem_used = total_mem_physical - available_mem

            mem_usage = float(total_mem_used) / float(total_mem_physical)
            if mem_usage < self._mem_level_warn:
                level = DiagnosticStatus.OK
            elif mem_usage < self._mem_level_error:
                level = DiagnosticStatus.WARN
            else:
                level = DiagnosticStatus.ERROR

            values.append(KeyValue(key='Memory Status',
                                   value=mem_dict[level]))
            values.append(
                KeyValue(key='Total Memory (Physical)',
                         value="%sM" % total_mem_physical))
            values.append(
                KeyValue(key='Used Memory (Physical)',
                         value="%sM" % used_mem_physical))
            values.append(
                KeyValue(key='Buff/Cache Memory (Used)',
                         value="%sM" % buff_cache_mem))
            values.append(
                KeyValue(key='Available Memory',
                         value="%sM" % available_mem))
            values.append(
                KeyValue(key='Percent Used',
                         value="%s%%" % int(mem_usage * 100)))

            msg = mem_dict[level]
        except Exception as e:
            rospy.logerr(traceback.format_exc())
            msg = 'Memory usage check error'
            values.append(KeyValue(key=msg, value=str(e)))
            level = DiagnosticStatus.ERROR

        return level, msg, values
class NetMonitor():
    """Reports per-interface network diagnostics based on ``ifstat``."""

    def __init__(self, hostname, namespace, diag_hostname):
        """Set up the network-usage diagnostic and start the first check.

        Args:
            hostname: Stored as the ``hardware_id`` of the usage status.
            namespace: Prefix of the updater name (``namespace + 'net'``).
            diag_hostname: Prefix of the updater display name.
        """
        self._diag_updater = DiagnosticUpdater(
            name=namespace + 'net',
            display_name=diag_hostname + ' network',
        )
        self._namespace = namespace

        self._mutex = threading.Lock()

        self._net_level_warn = rospy.get_param('~net_level_warn',
                                               net_level_warn)
        self._net_capacity = rospy.get_param('~net_capacity', net_capacity)

        self._usage_timer = None

        # Placeholder status held until the first measurement is available.
        self._usage_stat = DiagnosticStatus()
        self._usage_stat.name = 'Network Usage'
        self._usage_stat.level = 1
        self._usage_stat.hardware_id = hostname
        self._usage_stat.message = 'No Data'
        self._usage_stat.values = [
            KeyValue(key='Update Status', value='No Data'),
            KeyValue(key='Time Since Last Update', value='N/A'),
        ]
        self._usage_diagnostic = GenericDiagnostic('/usage')
        self._usage_diagnostic.add_to_updater(self._diag_updater)

        self._last_usage_time = 0

        self.check_usage()

    def cancel_timers(self):
        """Cancel the pending usage timer, if one is set."""
        if self._usage_timer:
            self._usage_timer.cancel()

    def check_network(self):
        """Run ``ifstat -q -S 1 1`` and build per-interface diagnostics.

        The first output row is read as the interface names and the third
        row as alternating in/out traffic figures (one pair per interface).
        Traffic divided by 1024 is reported with an "(MB/s)" suffix and,
        divided by ``_net_capacity``, compared against ``_net_level_warn``.

        The overall level is the most severe level seen across all
        interfaces: a high-usage warning never downgrades an earlier
        "interface down" error.

        Returns:
            A ``(level, message, values)`` tuple.  Failures (non-zero exit
            status or any exception) yield ``DiagnosticStatus.ERROR`` with
            the 'Network check error' message instead of raising.
        """
        values = []
        net_dict = {0: 'OK', 1: 'High network usage', 2: 'Network down',
                    3: 'Network check error'}
        try:
            p = subprocess.Popen('ifstat -q -S 1 1',
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 shell=True)
            stdout, stderr = p.communicate()
            retcode = p.returncode
            if retcode != 0:
                values.append(
                    KeyValue(key="\"ifstat -q -S 1 1\" Call Error",
                             value=str(retcode)))
                return DiagnosticStatus.ERROR, net_dict[3], values

            rows = stdout.split('\n')
            ifaces = rows[0].split()

            # Explicit pair-wise loop: an odd number of fields is malformed
            # output and must fail (IndexError) rather than be truncated.
            data = rows[2].split()
            kb_in = []
            kb_out = []
            for i in range(0, len(data), 2):
                kb_in.append(data[i])
                kb_out.append(data[i + 1])

            level = DiagnosticStatus.OK
            for i, iface in enumerate(ifaces):
                values.append(KeyValue(key='Interface Name', value=iface))

                (retcode, cmd_out) = get_sys_net(iface, 'operstate')
                if retcode == 0:
                    values.append(KeyValue(key='State', value=cmd_out))
                    ifacematch = re.match('eth[0-9]+', iface)
                    if ifacematch and cmd_out in ('down', 'dormant'):
                        level = DiagnosticStatus.ERROR

                mb_in = float(kb_in[i]) / 1024
                values.append(KeyValue(key='Input Traffic',
                                       value=str(mb_in) + " (MB/s)"))
                mb_out = float(kb_out[i]) / 1024
                values.append(KeyValue(key='Output Traffic',
                                       value=str(mb_out) + " (MB/s)"))

                net_usage_in = mb_in / self._net_capacity
                net_usage_out = mb_out / self._net_capacity
                if (net_usage_in > self._net_level_warn or
                        net_usage_out > self._net_level_warn):
                    # Escalate only: keep an ERROR raised by any interface.
                    level = max(level, DiagnosticStatus.WARN)

                (retcode, cmd_out) = get_sys_net(iface, 'mtu')
                if retcode == 0:
                    values.append(KeyValue(key='MTU', value=cmd_out))

                (retcode, cmd_out) = get_sys_net_stat(iface, 'rx_bytes')
                if retcode == 0:
                    values.append(
                        KeyValue(key='Total received MB',
                                 value=str(float(cmd_out) / 1024 / 1024)))

                (retcode, cmd_out) = get_sys_net_stat(iface, 'tx_bytes')
                if retcode == 0:
                    values.append(
                        KeyValue(key='Total transmitted MB',
                                 value=str(float(cmd_out) / 1024 / 1024)))

                (retcode, cmd_out) = get_sys_net_stat(iface, 'collisions')
                if retcode == 0:
                    values.append(KeyValue(key='Collisions', value=cmd_out))

                (retcode, cmd_out) = get_sys_net_stat(iface, 'rx_errors')
                if retcode == 0:
                    values.append(KeyValue(key='Rx Errors', value=cmd_out))

                (retcode, cmd_out) = get_sys_net_stat(iface, 'tx_errors')
                if retcode == 0:
                    values.append(KeyValue(key='Tx Errors', value=cmd_out))
        except Exception as e:
            rospy.logerr(traceback.format_exc())
            msg = 'Network Usage Check Error'
            values.append(KeyValue(key=msg, value=str(e)))
            # Report a check failure, matching the ifstat failure path above,
            # rather than the misleading 'Network down'.
            return DiagnosticStatus.ERROR, net_dict[3], values

        return level, net_dict[level], values
def ntp_monitor(namespace, offset=500, self_offset=500, diag_hostname=None,
                error_offset=5000000):
    """Poll ``ntpdate -q`` once per second and publish the NTP offset status.

    Runs until ROS shuts down.  The measured offset is reported in
    microseconds; exceeding the tolerance yields WARN and exceeding the error
    tolerance yields ERROR.  A failing ``ntpdate`` call is reported as WARN.

    Args:
        namespace: Prefix of the updater name (``namespace + 'ntp'``).
        offset: Fallback offset tolerance (us) when the
            ``~offset_tolerance`` parameter is not set.
        self_offset: Unused; kept for interface compatibility.
        diag_hostname: Prefix of the updater display name.  Defaults to the
            local hostname when ``None``.
        error_offset: Fallback error tolerance (us) when the
            ``~error_offset_tolerance`` parameter is not set.

    Raises:
        OSError: If running ``ntpdate`` fails for a reason other than an
            interrupted system call (errno 4).
    """
    rospy.init_node(NAME, anonymous=True)

    # Resolve the hostname default *before* it is used to build the display
    # name; concatenating None with a string would raise TypeError.
    hostname = socket.gethostname()
    if diag_hostname is None:
        diag_hostname = hostname

    diag_updater = DiagnosticUpdater(
        name=namespace + 'ntp',
        display_name=diag_hostname + ' NTP',
    )

    ntp_hostname = rospy.get_param('~reference_host', 'ntp.ubuntu.com')
    # The function arguments act as defaults for the ROS parameters instead
    # of being silently discarded.
    offset = rospy.get_param('~offset_tolerance', offset)
    error_offset = rospy.get_param('~error_offset_tolerance', error_offset)

    stat = DiagnosticStatus()
    stat.level = 0
    stat.name = "NTP Offset"
    stat.message = "OK"
    stat.hardware_id = hostname
    stat.values = []
    stat_diagnostic = GenericDiagnostic('/offset')
    stat_diagnostic.add_to_updater(diag_updater)

    while not rospy.is_shutdown():
        for st, host, off in [(stat, ntp_hostname, offset)]:
            try:
                p = Popen(["ntpdate", "-q", host],
                          stdout=PIPE, stdin=PIPE, stderr=PIPE)
                # communicate() drains the pipes while waiting; calling
                # wait() first can deadlock once a pipe buffer fills up.
                (o, e) = p.communicate()
                res = p.returncode
            except OSError as os_err:
                if os_err.errno == 4:
                    break  # ctrl-c interrupt
                raise

            if res == 0:
                measured_offset = float(
                    re.search("offset (.*),", o).group(1)) * 1000000

                st.level = DiagnosticStatus.OK
                st.message = "OK"
                st.values = [
                    KeyValue("Offset (us)", str(measured_offset)),
                    KeyValue("Offset tolerance (us)", str(off)),
                    KeyValue("Offset tolerance (us) for Error",
                             str(error_offset)),
                ]

                if abs(measured_offset) > off:
                    st.level = DiagnosticStatus.WARN
                    st.message = "NTP offset too high"
                if abs(measured_offset) > error_offset:
                    st.level = DiagnosticStatus.ERROR
                    st.message = "NTP offset too high"
            else:
                # Warning (not error), since this is a non-critical failure.
                st.level = DiagnosticStatus.WARN
                st.message = "Error running ntpdate (returned %d)" % res
                st.values = [
                    KeyValue("Offset (us)", "N/A"),
                    KeyValue("Offset tolerance (us)", str(off)),
                    KeyValue("Offset tolerance (us) for Error",
                             str(error_offset)),
                    KeyValue("Output", o),
                    KeyValue("Errors", e),
                ]

        # Convert from ROS diagnostics to mbot_diagnostics for publishing.
        stat_diagnostic.set_status(Status(stat.level), stat.message)
        for diag_val in stat.values:
            stat_diagnostic.set_metric(diag_val.key, diag_val.value)

        time.sleep(1)
class hdd_monitor():
    """Reports HDD temperature and disk-usage diagnostics on timers."""

    def __init__(self, hostname, namespace, diag_hostname):
        """Set up HDD temperature/usage diagnostics and start the checks.

        All attributes are assigned before any check is started, because
        the checks may call ``cancel_timers()``, which reads both timer
        attributes.

        Args:
            hostname: Stored on the instance and used as ``hardware_id`` of
                the statuses.
            namespace: Prefix of the updater name (``namespace + 'hdd'``).
            diag_hostname: Prefix of the updater display name.
        """
        self._mutex = threading.Lock()

        self._hostname = hostname
        self._namespace = namespace

        self._no_temp = rospy.get_param('~no_hdd_temp', False)
        self._no_temp_warn = rospy.get_param('~no_hdd_temp_warn', False)
        self._hdd_level_warn = rospy.get_param('~hdd_level_warn',
                                               hdd_level_warn)
        self._hdd_level_error = rospy.get_param('~hdd_level_error',
                                                hdd_level_error)
        self._hdd_temp_warn = rospy.get_param('~hdd_temp_warn', hdd_temp_warn)
        self._hdd_temp_error = rospy.get_param('~hdd_temp_error',
                                               hdd_temp_error)

        # Timer bookkeeping for both checks must exist before either runs.
        self._last_temp_time = 0
        self._temp_timer = None
        self._last_usage_time = 0
        self._usage_timer = None

        self._diag_updater = DiagnosticUpdater(
            name=namespace + 'hdd',
            display_name=diag_hostname + ' HDD',
        )

        # Temperature diagnostics are optional (disabled by ~no_hdd_temp).
        self._temp_stat = None
        self._temp_diagnostic = None
        if not self._no_temp:
            self._temp_stat = DiagnosticStatus()
            self._temp_stat.name = "HDD Temperature"
            self._temp_stat.level = DiagnosticStatus.ERROR
            self._temp_stat.hardware_id = hostname
            self._temp_stat.message = 'No Data'
            self._temp_stat.values = [
                KeyValue(key='Update Status', value='No Data'),
                KeyValue(key='Time Since Last Update', value='N/A'),
            ]
            self._temp_diagnostic = GenericDiagnostic('/temp')
            self._temp_diagnostic.add_to_updater(self._diag_updater)

        self._usage_stat = DiagnosticStatus()
        self._usage_stat.level = DiagnosticStatus.ERROR
        self._usage_stat.hardware_id = hostname
        self._usage_stat.name = 'HDD Usage'
        self._usage_stat.values = [
            KeyValue(key='Update Status', value='No Data'),
            KeyValue(key='Time Since Last Update', value='N/A'),
        ]
        self._usage_diagnostic = GenericDiagnostic('/usage')
        self._usage_diagnostic.add_to_updater(self._diag_updater)

        # Start the checks only now that every attribute they touch exists.
        if not self._no_temp:
            self.check_temps()
        self.check_disk_usage()

    ## Must have the lock to cancel everything
    def cancel_timers(self):
        """Cancel and clear both timers, if set."""
        if self._temp_timer:
            self._temp_timer.cancel()
            self._temp_timer = None

        if self._usage_timer:
            self._usage_timer.cancel()
            self._usage_timer = None

    def check_temps(self):
        """Read drive temperatures, update the status, and re-arm the timer.

        Non-numeric readings are an error unless the drive is listed in
        ``REMOVABLE``, in which case it is reported as "Removed".  The next
        check is scheduled 10 seconds later unless ROS is shutting down.
        """
        if rospy.is_shutdown():
            with self._mutex:
                self.cancel_timers()
            return

        diag_strs = [
            KeyValue(key='Update Status', value='OK'),
            KeyValue(key='Time Since Last Update', value='0'),
        ]
        diag_level = DiagnosticStatus.OK

        temp_ok, drives, makes, temps = get_hddtemp_data()

        for index, drive in enumerate(drives):
            temp = temps[index]
            is_numeric = unicode(temp).isnumeric()

            if not is_numeric:
                if drive in REMOVABLE:
                    temp_level = DiagnosticStatus.OK
                    temp = "Removed"
                else:
                    temp_level = DiagnosticStatus.ERROR
                    temp_ok = False
            else:
                temp_level = DiagnosticStatus.OK
                if float(temp) >= self._hdd_temp_warn:
                    temp_level = DiagnosticStatus.WARN
                if float(temp) >= self._hdd_temp_error:
                    temp_level = DiagnosticStatus.ERROR

            diag_level = max(diag_level, temp_level)

            diag_strs.append(
                KeyValue(key='Disk %d Temperature Status' % index,
                         value=temp_dict[temp_level]))
            diag_strs.append(
                KeyValue(key='Disk %d Mount Pt.' % index, value=drive))
            diag_strs.append(
                KeyValue(key='Disk %d Device ID' % index,
                         value=makes[index]))
            diag_strs.append(
                KeyValue(key='Disk %d Temperature' % index,
                         value=str(temp) + "DegC"))

        if not temp_ok:
            diag_level = DiagnosticStatus.ERROR

        with self._mutex:
            self._last_temp_time = rospy.get_time()
            self._temp_stat.values = diag_strs
            self._temp_stat.level = diag_level

            # Give No Data message if we have no reading
            self._temp_stat.message = temp_dict[diag_level]
            if not temp_ok:
                self._temp_stat.message = 'Error'

            # With ~no_hdd_temp_warn set, valid readings never raise the
            # level; only the message reflects the temperature state.
            if self._no_temp_warn and temp_ok:
                self._temp_stat.level = DiagnosticStatus.OK

            if not rospy.is_shutdown():
                self._temp_timer = threading.Timer(10.0, self.check_temps)
                self._temp_timer.start()
            else:
                self.cancel_timers()

    def check_disk_usage(self):
        """Run ``df -Pht ext4``, update the status, and re-arm the timer.

        Each data row's use percentage is compared against
        ``_hdd_level_warn`` / ``_hdd_level_error``; the overall level is the
        most severe row.  Exit codes 0 and 1 are both parsed.  Any exception
        is logged and reported as an ERROR status.  The next check is
        scheduled 5 seconds later unless ROS is shutting down.
        """
        if rospy.is_shutdown():
            with self._mutex:
                self.cancel_timers()
            return

        diag_vals = [
            KeyValue(key='Update Status', value='OK'),
            KeyValue(key='Time Since Last Update', value='0'),
        ]
        diag_level = DiagnosticStatus.OK
        diag_message = 'OK'

        try:
            p = subprocess.Popen(["df", "-Pht", "ext4"],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            stdout, stderr = p.communicate()
            retcode = p.returncode

            if retcode == 0 or retcode == 1:
                diag_vals.append(KeyValue(key='Disk Space Reading',
                                          value='OK'))
                # Skip the header row.
                rows = stdout.split('\n')[1:]
                row_count = 0

                for row in rows:
                    # Split once per row instead of once per field.
                    fields = row.split()
                    if len(fields) < 2:
                        continue

                    if fields[0] == "none":
                        continue

                    row_count += 1
                    name = fields[0]
                    size = fields[1]
                    g_available = fields[-3]
                    g_use = fields[-2]
                    mount_pt = fields[-1]

                    hdd_usage = float(g_use.replace("%", "")) * 1e-2
                    if hdd_usage < self._hdd_level_warn:
                        level = DiagnosticStatus.OK
                    elif hdd_usage < self._hdd_level_error:
                        level = DiagnosticStatus.WARN
                    else:
                        level = DiagnosticStatus.ERROR

                    diag_vals.append(KeyValue(
                        key='Disk %d Name' % row_count, value=name))
                    diag_vals.append(KeyValue(
                        key='Disk %d Size' % row_count, value=size))
                    diag_vals.append(KeyValue(
                        key='Disk %d Available' % row_count,
                        value=g_available))
                    diag_vals.append(KeyValue(
                        key='Disk %d Use' % row_count, value=g_use))
                    diag_vals.append(KeyValue(
                        key='Disk %d Status' % row_count,
                        value=stat_dict[level]))
                    diag_vals.append(KeyValue(
                        key='Disk %d Mount Point' % row_count,
                        value=mount_pt))

                    diag_level = max(diag_level, level)
                    diag_message = usage_dict[diag_level]
            else:
                diag_vals.append(KeyValue(key='Disk Space Reading',
                                          value='Failed'))
                diag_level = DiagnosticStatus.ERROR
                diag_message = stat_dict[diag_level]
        except Exception:
            # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
            rospy.logerr(traceback.format_exc())
            diag_vals.append(KeyValue(key='Disk Space Reading',
                                      value='Exception'))
            diag_vals.append(KeyValue(key='Disk Space Ex',
                                      value=traceback.format_exc()))
            diag_level = DiagnosticStatus.ERROR
            diag_message = stat_dict[diag_level]

        # Update status
        with self._mutex:
            self._last_usage_time = rospy.get_time()
            self._usage_stat.values = diag_vals
            self._usage_stat.message = diag_message
            self._usage_stat.level = diag_level

            if not rospy.is_shutdown():
                self._usage_timer = threading.Timer(5.0,
                                                    self.check_disk_usage)
                self._usage_timer.start()
            else:
                self.cancel_timers()

    def publish_stats(self):
        """Push the current temperature and usage statuses to the diagnostics.

        Each status is first passed to ``update_status_stale`` together with
        its last update time, then copied (status, message and every
        key/value metric) into the matching ``GenericDiagnostic``.
        """
        with self._mutex:
            # Convert from ROS diagnostics to mbot_diagnostics for publishing.
            if not self._no_temp:
                update_status_stale(self._temp_stat, self._last_temp_time)
                self._temp_diagnostic.set_status(
                    Status(self._temp_stat.level),
                    self._temp_stat.message,
                )
                for diag_val in self._temp_stat.values:
                    self._temp_diagnostic.set_metric(diag_val.key,
                                                     diag_val.value)

            update_status_stale(self._usage_stat, self._last_usage_time)
            self._usage_diagnostic.set_status(
                Status(self._usage_stat.level),
                self._usage_stat.message,
            )
            for diag_val in self._usage_stat.values:
                self._usage_diagnostic.set_metric(diag_val.key,
                                                  diag_val.value)