예제 #1
0
    def __init__(self, hostname, namespace, diag_hostname):
        """Set up memory-usage diagnostics state and run the first check.

        Args:
            hostname: Stored as the ``hardware_id`` of the usage status.
            namespace: Prefix of the diagnostic updater name; also kept on
                the instance.
            diag_hostname: Prefix of the updater's display name.
        """
        updater = DiagnosticUpdater(
            name=namespace + 'mem',
            display_name=diag_hostname + ' memory',
        )
        self._diag_updater = updater
        self._namespace = namespace
        self._mutex = threading.Lock()

        # Thresholds are read from private ROS parameters; the fallbacks are
        # the names ``mem_level_warn`` / ``mem_level_error`` defined elsewhere.
        warn_threshold = rospy.get_param('~mem_level_warn', mem_level_warn)
        error_threshold = rospy.get_param('~mem_level_error', mem_level_error)
        self._mem_level_warn = warn_threshold
        self._mem_level_error = error_threshold

        self._usage_timer = None

        # Placeholder status that is in place until a check supplies data.
        status = DiagnosticStatus()
        status.name = 'Memory Usage'
        status.level = 1
        status.hardware_id = hostname
        status.message = 'No Data'
        no_data = KeyValue(key='Update Status', value='No Data')
        never_updated = KeyValue(key='Time Since Last Update', value='N/A')
        status.values = [no_data, never_updated]
        self._usage_stat = status

        usage_diag = GenericDiagnostic('/usage')
        usage_diag.add_to_updater(updater)
        self._usage_diagnostic = usage_diag

        self._last_usage_time = 0

        # Everything is initialised; kick off the first usage check.
        self.check_usage()
예제 #2
0
    def __init__(self, hostname, namespace, diag_hostname):
        """Set up HDD temperature/usage diagnostics and run the first checks.

        All instance state is initialised *before* any check is started, so
        that a check which ends up cancelling timers never touches an
        attribute that has not been assigned yet.

        Args:
            hostname: Stored on the instance and used as the ``hardware_id``
                of both statuses.
            namespace: Prefix of the diagnostic updater name; also kept on
                the instance.
            diag_hostname: Prefix of the updater's display name.
        """
        self._mutex = threading.Lock()

        self._hostname = hostname
        self._namespace = namespace
        self._no_temp = rospy.get_param('~no_hdd_temp', False)
        self._no_temp_warn = rospy.get_param('~no_hdd_temp_warn', False)
        self._hdd_level_warn = rospy.get_param('~hdd_level_warn', hdd_level_warn)
        self._hdd_level_error = rospy.get_param('~hdd_level_error', hdd_level_error)
        self._hdd_temp_warn = rospy.get_param('~hdd_temp_warn', hdd_temp_warn)
        self._hdd_temp_error = rospy.get_param('~hdd_temp_error', hdd_temp_error)

        # Timer bookkeeping for BOTH checks must exist before either check
        # runs: cancelling timers reads _temp_timer and _usage_timer.
        self._last_temp_time = 0
        self._temp_timer = None
        self._last_usage_time = 0
        self._usage_timer = None

        self._diag_updater = DiagnosticUpdater(
            name=namespace + 'hdd',
            display_name=diag_hostname + ' HDD',
        )

        # Temperature status is only created when temperature monitoring is
        # enabled; otherwise both attributes stay None.
        self._temp_stat = None
        self._temp_diagnostic = None
        if not self._no_temp:
            self._temp_stat = DiagnosticStatus()
            self._temp_stat.name = "HDD Temperature"
            self._temp_stat.level = DiagnosticStatus.ERROR
            self._temp_stat.hardware_id = hostname
            self._temp_stat.message = 'No Data'
            self._temp_stat.values = [
                KeyValue(key = 'Update Status', value = 'No Data'),
                KeyValue(key = 'Time Since Last Update', value = 'N/A'),
            ]
            self._temp_diagnostic = GenericDiagnostic('/temp')
            self._temp_diagnostic.add_to_updater(self._diag_updater)

        self._usage_stat = DiagnosticStatus()
        self._usage_stat.level = DiagnosticStatus.ERROR
        self._usage_stat.hardware_id = hostname
        self._usage_stat.name = 'HDD Usage'
        # Same placeholder message as the temperature status.
        self._usage_stat.message = 'No Data'
        self._usage_stat.values = [
            KeyValue(key = 'Update Status', value = 'No Data' ),
            KeyValue(key = 'Time Since Last Update', value = 'N/A'),
        ]
        self._usage_diagnostic = GenericDiagnostic('/usage')
        self._usage_diagnostic.add_to_updater(self._diag_updater)

        # Start the checks last, in the original order (temperature first).
        if not self._no_temp:
            self.check_temps()
        self.check_disk_usage()
예제 #3
0
  def __init__(self, hostname, namespace, diag_hostname):
    """Set up network-usage diagnostics state and run the first check.

    Args:
      hostname: Stored as the ``hardware_id`` of the usage status.
      namespace: Prefix of the diagnostic updater name; also kept on the
        instance.
      diag_hostname: Prefix of the updater's display name.
    """
    updater = DiagnosticUpdater(
      name=namespace + 'net',
      display_name=diag_hostname + ' network',
    )
    self._diag_updater = updater

    self._namespace = namespace
    self._mutex = threading.Lock()

    # Both settings come from private ROS parameters; the fallbacks are the
    # names ``net_level_warn`` / ``net_capacity`` defined elsewhere.
    warn_threshold = rospy.get_param('~net_level_warn', net_level_warn)
    capacity = rospy.get_param('~net_capacity', net_capacity)
    self._net_level_warn = warn_threshold
    self._net_capacity = capacity

    self._usage_timer = None

    # Placeholder status that is in place until a check supplies data.
    status = DiagnosticStatus()
    status.name = 'Network Usage'
    status.level = 1
    status.hardware_id = hostname
    status.message = 'No Data'
    no_data = KeyValue(key = 'Update Status', value = 'No Data')
    never_updated = KeyValue(key = 'Time Since Last Update', value = 'N/A')
    status.values = [no_data, never_updated]
    self._usage_stat = status

    usage_diag = GenericDiagnostic('/usage')
    usage_diag.add_to_updater(updater)
    self._usage_diagnostic = usage_diag

    self._last_usage_time = 0

    # Everything is initialised; kick off the first usage check.
    self.check_usage()
예제 #4
0
class MemMonitor():
    """Memory monitor that derives a diagnostic level from ``free -tm``."""

    def __init__(self, hostname, namespace, diag_hostname):
        """Set up memory-usage diagnostics state and run the first check.

        Args:
            hostname: Stored as the ``hardware_id`` of the usage status.
            namespace: Prefix of the diagnostic updater name; also kept on
                the instance.
            diag_hostname: Prefix of the updater's display name.
        """
        self._diag_updater = DiagnosticUpdater(
            name=namespace + 'mem',
            display_name=diag_hostname + ' memory',
        )

        self._namespace = namespace

        self._mutex = threading.Lock()

        # Usage fractions at which the level becomes WARN / ERROR.
        self._mem_level_warn = rospy.get_param('~mem_level_warn',
                                               mem_level_warn)
        self._mem_level_error = rospy.get_param('~mem_level_error',
                                                mem_level_error)

        self._usage_timer = None

        # Placeholder status that is in place until a check supplies data.
        self._usage_stat = DiagnosticStatus()
        self._usage_stat.name = 'Memory Usage'
        self._usage_stat.level = 1
        self._usage_stat.hardware_id = hostname
        self._usage_stat.message = 'No Data'
        self._usage_stat.values = [
            KeyValue(key='Update Status', value='No Data'),
            KeyValue(key='Time Since Last Update', value='N/A')
        ]
        self._usage_diagnostic = GenericDiagnostic('/usage')
        self._usage_diagnostic.add_to_updater(self._diag_updater)

        self._last_usage_time = 0

        # Start checking everything
        self.check_usage()

    ## Must have the lock to cancel everything
    def cancel_timers(self):
        """Cancel the pending usage timer, if one is set."""
        if self._usage_timer:
            self._usage_timer.cancel()

    def check_memory(self):
        """Run ``free -tm`` and classify the current memory usage.

        Usage is ``(total - available) / total`` of physical memory and is
        compared against ``_mem_level_warn`` and ``_mem_level_error``.

        Returns:
            A ``(level, message, values)`` tuple on every path: the
            diagnostic level, a short message and a list of ``KeyValue``
            entries. Failures (non-zero exit status or any exception) are
            reported as ``DiagnosticStatus.ERROR`` instead of being raised.
        """
        values = []
        level = DiagnosticStatus.OK
        msg = ''

        mem_dict = {0: 'OK', 1: 'Low memory', 2: 'Very low memory'}

        try:
            p = subprocess.Popen('free -tm',
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 shell=True)
            stdout, stderr = p.communicate()
            retcode = p.returncode

            if retcode != 0:
                # Keep the 3-tuple shape of the other return paths so that
                # callers can always unpack (level, msg, values).
                msg = 'Memory usage check error'
                values.append(
                    KeyValue(key="\"free -tm\" Call Error",
                             value=str(retcode)))
                return DiagnosticStatus.ERROR, msg, values

            rows = stdout.split('\n')
            # The second output line is read as the physical-memory row with
            # columns: label, total, used, free, shared, buff/cache,
            # available (assumed layout of `free` -- TODO confirm for the
            # procps version in use).
            data = rows[1].split()
            total_mem_physical = int(data[1])
            used_mem_physical = int(data[2])
            buff_cache_mem = int(data[5])
            available_mem = int(data[6])
            total_mem_used = total_mem_physical - available_mem

            mem_usage = float(total_mem_used) / float(total_mem_physical)
            if mem_usage < self._mem_level_warn:
                level = DiagnosticStatus.OK
            elif mem_usage < self._mem_level_error:
                level = DiagnosticStatus.WARN
            else:
                level = DiagnosticStatus.ERROR

            values.append(KeyValue(key='Memory Status', value=mem_dict[level]))
            values.append(
                KeyValue(key='Total Memory (Physical)',
                         value="%sM" % total_mem_physical))
            values.append(
                KeyValue(key='Used Memory (Physical)',
                         value="%sM" % used_mem_physical))
            values.append(
                KeyValue(key='Buff/Cache Memory (Used)',
                         value="%sM" % buff_cache_mem))
            values.append(
                KeyValue(key='Available Memory', value="%sM" % available_mem))
            values.append(
                KeyValue(key='Percent Used',
                         value="%s%%" % int(mem_usage * 100)))

            msg = mem_dict[level]
        except Exception as e:
            # Best effort: log the traceback and report the failure as an
            # ERROR status rather than letting the check raise.
            rospy.logerr(traceback.format_exc())
            msg = 'Memory usage check error'
            values.append(KeyValue(key=msg, value=str(e)))
            level = DiagnosticStatus.ERROR

        return level, msg, values
예제 #5
0
class NetMonitor():
  """Network monitor that derives a diagnostic level from ``ifstat``."""

  def __init__(self, hostname, namespace, diag_hostname):
    """Set up network-usage diagnostics state and run the first check.

    Args:
      hostname: Stored as the ``hardware_id`` of the usage status.
      namespace: Prefix of the diagnostic updater name; also kept on the
        instance.
      diag_hostname: Prefix of the updater's display name.
    """
    self._diag_updater = DiagnosticUpdater(
      name=namespace + 'net',
      display_name=diag_hostname + ' network',
    )

    self._namespace = namespace
    self._mutex = threading.Lock()
    self._net_level_warn = rospy.get_param('~net_level_warn', net_level_warn)
    self._net_capacity = rospy.get_param('~net_capacity', net_capacity)
    self._usage_timer = None
    # Placeholder status that is in place until a check supplies data.
    self._usage_stat = DiagnosticStatus()
    self._usage_stat.name = 'Network Usage'
    self._usage_stat.level = 1
    self._usage_stat.hardware_id = hostname
    self._usage_stat.message = 'No Data'
    self._usage_stat.values = [KeyValue(key = 'Update Status',
                               value = 'No Data' ),
                               KeyValue(key = 'Time Since Last Update',
                               value = 'N/A') ]
    self._usage_diagnostic = GenericDiagnostic('/usage')
    self._usage_diagnostic.add_to_updater(self._diag_updater)
    self._last_usage_time = 0
    self.check_usage()

  def cancel_timers(self):
    """Cancel the pending usage timer, if one is set."""
    if self._usage_timer:
      self._usage_timer.cancel()

  def check_network(self):
    """Run ``ifstat`` once and classify traffic and state per interface.

    For every interface listed by ``ifstat`` the traffic is divided by
    ``_net_capacity`` and compared with ``_net_level_warn``; interfaces
    named ``eth<N>`` whose operstate is ``down`` or ``dormant`` raise the
    level to ERROR. The overall level is the worst one seen.

    Returns:
      A ``(level, message, values)`` tuple. If ``ifstat`` exits non-zero
      or any exception occurs, the level is ``DiagnosticStatus.ERROR`` and
      the message is ``'Network check error'``.
    """
    values = []
    net_dict = {0: 'OK', 1: 'High network usage', 2: 'Network down', 3: 'Network check error'}
    try:
      p = subprocess.Popen('ifstat -q -S 1 1',
                           stdout = subprocess.PIPE,
                           stderr = subprocess.PIPE, shell = True)
      stdout, stderr = p.communicate()
      retcode = p.returncode
      if retcode != 0:
        values.append(KeyValue(key = "\"ifstat -q -S 1 1\" Call Error",
          value = str(retcode)))
        return DiagnosticStatus.ERROR, net_dict[3], values
      rows = stdout.split('\n')
      # First row: interface names. Third row: alternating in/out figures,
      # one pair per interface (layout implied by the parsing below).
      ifaces = rows[0].split()
      data = rows[2].split()
      kb_in = data[0::2]
      kb_out = data[1::2]
      level = DiagnosticStatus.OK
      for i, iface in enumerate(ifaces):
        values.append(KeyValue(key = 'Interface Name', value = iface))
        (retcode, cmd_out) = get_sys_net(iface, 'operstate')
        if retcode == 0:
          values.append(KeyValue(key = 'State', value = cmd_out))
          ifacematch = re.match('eth[0-9]+', iface)
          if ifacematch and (cmd_out == 'down' or cmd_out == 'dormant'):
            level = DiagnosticStatus.ERROR
        mb_in = float(kb_in[i]) / 1024
        values.append(KeyValue(key = 'Input Traffic',
          value = str(mb_in) + " (MB/s)"))
        mb_out = float(kb_out[i]) / 1024
        values.append(KeyValue(key = 'Output Traffic',
          value = str(mb_out) + " (MB/s)"))
        net_usage_in = mb_in / self._net_capacity
        net_usage_out = mb_out / self._net_capacity
        if net_usage_in > self._net_level_warn or\
          net_usage_out > self._net_level_warn:
          # Only ever raise the severity: a busy interface must not hide an
          # ERROR already recorded for a down interface.
          level = max(level, DiagnosticStatus.WARN)
        (retcode, cmd_out) = get_sys_net(iface, 'mtu')
        if retcode == 0:
          values.append(KeyValue(key = 'MTU', value = cmd_out))
        # Byte counters are reported in MB.
        for stat_name, label in (('rx_bytes', 'Total received MB'),
                                 ('tx_bytes', 'Total transmitted MB')):
          (retcode, cmd_out) = get_sys_net_stat(iface, stat_name)
          if retcode == 0:
            values.append(KeyValue(key = label,
              value = str(float(cmd_out) / 1024 / 1024)))
        # Remaining counters are passed through unchanged.
        for stat_name, label in (('collisions', 'Collisions'),
                                 ('rx_errors', 'Rx Errors'),
                                 ('tx_errors', 'Tx Errors')):
          (retcode, cmd_out) = get_sys_net_stat(iface, stat_name)
          if retcode == 0:
            values.append(KeyValue(key = label, value = cmd_out))
    except Exception as e:
      # Best effort: log the traceback and report a check error. The message
      # matches the non-zero-exit path above instead of 'Network down',
      # because the failure is in the check, not necessarily the network.
      rospy.logerr(traceback.format_exc())
      values.append(KeyValue(key = 'Network Usage Check Error',
        value = str(e)))
      return DiagnosticStatus.ERROR, net_dict[3], values
    return level, net_dict[level], values
예제 #6
0
def ntp_monitor(namespace,
                offset=500,
                self_offset=500,
                diag_hostname=None,
                error_offset=5000000):
    """Poll ``ntpdate -q`` once per second and publish the clock offset.

    Runs until ROS shuts down. Each cycle queries the reference host, turns
    the reported offset into microseconds and sets the status to WARN when
    its magnitude exceeds the offset tolerance, or ERROR when it exceeds the
    error tolerance. A failed or unparseable ``ntpdate`` run is reported as
    WARN.

    Args:
        namespace: Prefix of the diagnostic updater name.
        offset: Fallback offset tolerance (us) when the
            ``~offset_tolerance`` parameter is not set.
        self_offset: Accepted for interface compatibility; not used here.
        diag_hostname: Prefix of the updater's display name; defaults to the
            local hostname when ``None``.
        error_offset: Fallback error tolerance (us) when the
            ``~error_offset_tolerance`` parameter is not set.
    """
    import errno

    rospy.init_node(NAME, anonymous=True)

    # Resolve the display hostname before it is concatenated below; with the
    # default diag_hostname=None the concatenation would raise TypeError.
    hostname = socket.gethostname()
    if diag_hostname is None:
        diag_hostname = hostname

    diag_updater = DiagnosticUpdater(
        name=namespace + 'ntp',
        display_name=diag_hostname + ' NTP',
    )

    ntp_hostname = rospy.get_param('~reference_host', 'ntp.ubuntu.com')
    # The function arguments act as defaults for the ROS parameters instead
    # of being silently overridden by hard-coded values.
    offset = rospy.get_param('~offset_tolerance', offset)
    error_offset = rospy.get_param('~error_offset_tolerance', error_offset)

    stat = DiagnosticStatus()
    stat.level = 0
    stat.name = "NTP Offset"
    stat.message = "OK"
    stat.hardware_id = hostname
    stat.values = []
    stat_diagnostic = GenericDiagnostic('/offset')
    stat_diagnostic.add_to_updater(diag_updater)

    while not rospy.is_shutdown():
        for st, host, off in [(stat, ntp_hostname, offset)]:
            try:
                p = Popen(["ntpdate", "-q", host],
                          stdout=PIPE,
                          stdin=PIPE,
                          stderr=PIPE)
                # communicate() drains both pipes while waiting; calling
                # wait() first can deadlock once a pipe buffer fills up.
                (o, e) = p.communicate()
                res = p.returncode
            except OSError as err:
                if err.errno == errno.EINTR:
                    break  #ctrl-c interrupt
                raise

            match = re.search("offset (.*),", o) if res == 0 else None
            if match is not None:
                measured_offset = float(match.group(1)) * 1000000
                st.level = DiagnosticStatus.OK
                st.message = "OK"
                st.values = [
                    KeyValue("Offset (us)", str(measured_offset)),
                    KeyValue("Offset tolerance (us)", str(off)),
                    KeyValue("Offset tolerance (us) for Error",
                             str(error_offset))
                ]

                if abs(measured_offset) > off:
                    st.level = DiagnosticStatus.WARN
                    st.message = "NTP offset too high"
                if abs(measured_offset) > error_offset:
                    st.level = DiagnosticStatus.ERROR
                    st.message = "NTP offset too high"

            else:
                # Warning (not error), since this is a non-critical failure.
                st.level = DiagnosticStatus.WARN
                if res == 0:
                    # ntpdate succeeded but printed no recognisable offset.
                    st.message = "Error parsing ntpdate output"
                else:
                    st.message = "Error running ntpdate (returned %d)" % res
                st.values = [
                    KeyValue("Offset (us)", "N/A"),
                    KeyValue("Offset tolerance (us)", str(off)),
                    KeyValue("Offset tolerance (us) for Error",
                             str(error_offset)),
                    KeyValue("Output", o),
                    KeyValue("Errors", e)
                ]

        # Convert from ROS diagnostics to mbot_diagnostics for publishing.
        stat_diagnostic.set_status(Status(stat.level), stat.message)
        for diag_val in stat.values:
            stat_diagnostic.set_metric(diag_val.key, diag_val.value)

        time.sleep(1)
예제 #7
0
class hdd_monitor():
    """HDD monitor tracking drive temperatures and ext4 disk usage.

    Each check re-arms itself with a ``threading.Timer`` (10 s for
    temperatures, 5 s for usage) until ROS shuts down; ``publish_stats``
    forwards the latest results to the diagnostics objects.
    """

    def __init__(self, hostname, namespace, diag_hostname):
        """Set up HDD temperature/usage diagnostics and run the first checks.

        All instance state is initialised *before* any check is started,
        because ``check_temps`` may call ``cancel_timers``, which reads both
        ``_temp_timer`` and ``_usage_timer``.

        Args:
            hostname: Stored on the instance and used as the ``hardware_id``
                of both statuses.
            namespace: Prefix of the diagnostic updater name; also kept on
                the instance.
            diag_hostname: Prefix of the updater's display name.
        """
        self._mutex = threading.Lock()

        self._hostname = hostname
        self._namespace = namespace
        self._no_temp = rospy.get_param('~no_hdd_temp', False)
        self._no_temp_warn = rospy.get_param('~no_hdd_temp_warn', False)
        self._hdd_level_warn = rospy.get_param('~hdd_level_warn', hdd_level_warn)
        self._hdd_level_error = rospy.get_param('~hdd_level_error', hdd_level_error)
        self._hdd_temp_warn = rospy.get_param('~hdd_temp_warn', hdd_temp_warn)
        self._hdd_temp_error = rospy.get_param('~hdd_temp_error', hdd_temp_error)

        # Timer bookkeeping for BOTH checks must exist before either runs.
        self._last_temp_time = 0
        self._temp_timer = None
        self._last_usage_time = 0
        self._usage_timer = None

        self._diag_updater = DiagnosticUpdater(
            name=namespace + 'hdd',
            display_name=diag_hostname + ' HDD',
        )

        # Temperature status is only created when temperature monitoring is
        # enabled; otherwise both attributes stay None.
        self._temp_stat = None
        self._temp_diagnostic = None
        if not self._no_temp:
            self._temp_stat = DiagnosticStatus()
            self._temp_stat.name = "HDD Temperature"
            self._temp_stat.level = DiagnosticStatus.ERROR
            self._temp_stat.hardware_id = hostname
            self._temp_stat.message = 'No Data'
            self._temp_stat.values = [
                KeyValue(key = 'Update Status', value = 'No Data'),
                KeyValue(key = 'Time Since Last Update', value = 'N/A'),
            ]
            self._temp_diagnostic = GenericDiagnostic('/temp')
            self._temp_diagnostic.add_to_updater(self._diag_updater)

        self._usage_stat = DiagnosticStatus()
        self._usage_stat.level = DiagnosticStatus.ERROR
        self._usage_stat.hardware_id = hostname
        self._usage_stat.name = 'HDD Usage'
        # Same placeholder message as the temperature status.
        self._usage_stat.message = 'No Data'
        self._usage_stat.values = [
            KeyValue(key = 'Update Status', value = 'No Data' ),
            KeyValue(key = 'Time Since Last Update', value = 'N/A'),
        ]
        self._usage_diagnostic = GenericDiagnostic('/usage')
        self._usage_diagnostic.add_to_updater(self._diag_updater)

        # Start the checks last, in the original order (temperature first).
        if not self._no_temp:
            self.check_temps()
        self.check_disk_usage()

    ## Must have the lock to cancel everything
    def cancel_timers(self):
        """Cancel and clear both re-check timers, if they are set."""
        if self._temp_timer:
            self._temp_timer.cancel()
            self._temp_timer = None

        if self._usage_timer:
            self._usage_timer.cancel()
            self._usage_timer = None

    def check_temps(self):
        """Read drive temperatures, update the status and re-arm the timer.

        A non-numeric reading is an ERROR unless the drive is listed in
        ``REMOVABLE``, in which case it is reported as "Removed" with level
        OK. Numeric readings are compared against ``_hdd_temp_warn`` and
        ``_hdd_temp_error``. When ROS is shutting down the timers are
        cancelled instead of re-armed.
        """
        if rospy.is_shutdown():
            with self._mutex:
                self.cancel_timers()
            return

        diag_strs = [ KeyValue(key = 'Update Status', value = 'OK' ) ,
                      KeyValue(key = 'Time Since Last Update', value = '0' ) ]
        diag_level = DiagnosticStatus.OK

        temp_ok, drives, makes, temps = get_hddtemp_data()

        for index, drive in enumerate(drives):
            temp = temps[index]

            if unicode(temp).isnumeric():
                temp_level = DiagnosticStatus.OK
                if float(temp) >= self._hdd_temp_warn:
                    temp_level = DiagnosticStatus.WARN
                if float(temp) >= self._hdd_temp_error:
                    temp_level = DiagnosticStatus.ERROR
            elif drive in REMOVABLE:
                # A removable drive without a reading is not a fault.
                temp_level = DiagnosticStatus.OK
                temp = "Removed"
            else:
                temp_level = DiagnosticStatus.ERROR
                temp_ok = False

            diag_level = max(diag_level, temp_level)

            diag_strs.append(KeyValue(key = 'Disk %d Temperature Status' % index, value = temp_dict[temp_level]))
            diag_strs.append(KeyValue(key = 'Disk %d Mount Pt.' % index, value = drive))
            diag_strs.append(KeyValue(key = 'Disk %d Device ID' % index, value = makes[index]))
            diag_strs.append(KeyValue(key = 'Disk %d Temperature' % index, value = str(temp)+"DegC"))

        if not temp_ok:
            diag_level = DiagnosticStatus.ERROR

        with self._mutex:
            self._last_temp_time = rospy.get_time()
            self._temp_stat.values = diag_strs
            self._temp_stat.level = diag_level

            # Give No Data message if we have no reading
            self._temp_stat.message = temp_dict[diag_level]
            if not temp_ok:
                self._temp_stat.message = 'Error'

            # With ~no_hdd_temp_warn set, a successful reading never raises
            # the published level above OK.
            if self._no_temp_warn and temp_ok:
                self._temp_stat.level = DiagnosticStatus.OK

            if not rospy.is_shutdown():
                self._temp_timer = threading.Timer(10.0, self.check_temps)
                self._temp_timer.start()
            else:
                self.cancel_timers()

    def check_disk_usage(self):
        """Run ``df`` on ext4 filesystems, update the status, re-arm the timer.

        Each reported filesystem is classified by its use percentage against
        ``_hdd_level_warn`` and ``_hdd_level_error``; the overall level is
        the worst one seen. A ``df`` exit status other than 0 or 1, or any
        exception, results in ``DiagnosticStatus.ERROR``. When ROS is
        shutting down the timers are cancelled instead of re-armed.
        """
        if rospy.is_shutdown():
            with self._mutex:
                self.cancel_timers()
            return

        diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
                      KeyValue(key = 'Time Since Last Update', value = '0' ) ]
        diag_level = DiagnosticStatus.OK
        diag_message = 'OK'

        try:
            p = subprocess.Popen(["df", "-Pht", "ext4"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = p.communicate()
            retcode = p.returncode

            if (retcode == 0 or retcode == 1):
                diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'OK'))
                # Skip the header line of the df output.
                rows = stdout.split('\n')[1:]
                row_count = 0

                for row in rows:
                    # Split each row once and reuse the fields.
                    fields = row.split()
                    if len(fields) < 2:
                        continue
                    if unicode(fields[0]) == "none":
                        continue

                    row_count += 1
                    name = fields[0]
                    size = fields[1]
                    # The last three columns are indexed from the end:
                    # available, use%, mount point.
                    g_available = fields[-3]
                    g_use = fields[-2]
                    mount_pt = fields[-1]

                    hdd_usage = float(g_use.replace("%", ""))*1e-2
                    if (hdd_usage < self._hdd_level_warn):
                        level = DiagnosticStatus.OK
                    elif (hdd_usage < self._hdd_level_error):
                        level = DiagnosticStatus.WARN
                    else:
                        level = DiagnosticStatus.ERROR

                    diag_vals.append(KeyValue(
                            key = 'Disk %d Name' % row_count, value = name))
                    diag_vals.append(KeyValue(
                            key = 'Disk %d Size' % row_count, value = size))
                    diag_vals.append(KeyValue(
                            key = 'Disk %d Available' % row_count, value = g_available))
                    diag_vals.append(KeyValue(
                            key = 'Disk %d Use' % row_count, value = g_use))
                    diag_vals.append(KeyValue(
                            key = 'Disk %d Status' % row_count, value = stat_dict[level]))
                    diag_vals.append(KeyValue(
                            key = 'Disk %d Mount Point' % row_count, value = mount_pt))

                    diag_level = max(diag_level, level)
                    diag_message = usage_dict[diag_level]

            else:
                diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Failed'))
                diag_level = DiagnosticStatus.ERROR
                diag_message = stat_dict[diag_level]

        except Exception:
            # Best effort: report the failure as an ERROR status. Narrowed
            # from a bare except so that KeyboardInterrupt/SystemExit are
            # not swallowed.
            rospy.logerr(traceback.format_exc())

            diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Exception'))
            diag_vals.append(KeyValue(key = 'Disk Space Ex', value = traceback.format_exc()))

            diag_level = DiagnosticStatus.ERROR
            diag_message = stat_dict[diag_level]

        # Update status
        with self._mutex:
            self._last_usage_time = rospy.get_time()
            self._usage_stat.values = diag_vals
            self._usage_stat.message = diag_message
            self._usage_stat.level = diag_level

            if not rospy.is_shutdown():
                self._usage_timer = threading.Timer(5.0, self.check_disk_usage)
                self._usage_timer.start()
            else:
                self.cancel_timers()


    def publish_stats(self):
        """Push the latest temperature and usage results to the diagnostics.

        Holds the mutex while reading the statuses. Each status is first
        passed to ``update_status_stale`` together with its last update time
        (presumably to flag stale data -- helper is defined elsewhere).
        The temperature part is skipped when temperature monitoring is off.
        """
        with self._mutex:
            # Convert from ROS diagnostics to mbot_diagnostics for publishing.
            if not self._no_temp:
                update_status_stale(self._temp_stat, self._last_temp_time)
                self._temp_diagnostic.set_status(
                    Status(self._temp_stat.level),
                    self._temp_stat.message,
                )
                for diag_val in self._temp_stat.values:
                    self._temp_diagnostic.set_metric(diag_val.key, diag_val.value)

            update_status_stale(self._usage_stat, self._last_usage_time)
            self._usage_diagnostic.set_status(
                Status(self._usage_stat.level),
                self._usage_stat.message,
            )
            for diag_val in self._usage_stat.values:
                self._usage_diagnostic.set_metric(diag_val.key, diag_val.value)