Example #1
0
class HenStatusChecker(Daemon):
    """
    \brief Implements the StatusDaemon external interface
    This class contains the methods called when requests are received by the
    Daemon (inherited).
    """
    __version = "Hen Status Daemon v0.1"
    __checker_timer = None
    __checker_lock = None

    __stoppedDaemons = None
    __runningDaemons = None
    __checkerThreads = None
    __doneList = None
    
    __cli_commands_xml = None
    __cli_commands = None

    def __init__(self):
        """\brief Registers remote methods and starts update thread (timer)
        """
        Daemon.__init__(self)
        self.__registerMethods()
        self.__checker_lock = threading.Lock()

        self.__stoppedDaemons = []
        self.__runningDaemons = []
        self.__checkerThreads = {}
        self.__doneList = []
        self.__cli_commands = {}

    def getCLICommandXML(self,prot,seq,ln,payload):
        """\brief Returns the complete XML interpretation of the CLI commands
        available from all the running daemons, plus the standard CLI functions
        such as "exit" and "help".
        """
        if not self.__cli_commands_xml:
            # This should never happen
            prot.sendReply(500, seq, "No commands found by daemon!")
            return
        prot.sendReply(200, seq, self.__cli_commands_xml)

    def getHenStatus(self,prot,seq,ln,payload):
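        """\brief Returns an XML summary of which daemons are running and stopped."""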
        log.debug("getHenStatus() called.")
        self.__checker_lock.acquire()
        results = "Content-type: text/xml\n"
        results += "Cache-Control: no-store, no-cache, must-revalidate\n\n"
        results += "<processmanagement>\n"
        results += "\t<running>\n"
        for daemon in self.__runningDaemons:
            results += "\t\t<process name=\"%s\" />\n" % str(daemon)
        results += "\t</running>\n"
        results += "\t<stopped>\n"
        for daemon in self.__stoppedDaemons:
            results += "\t\t<process name=\"%s\" />\n" % str(daemon)
        results += "\t</stopped>\n"
        results += "</processmanagement>\n"
        self.__checker_lock.release()
        prot.sendReply(200, seq, results)

    def killDaemon(self,prot,seq,ln,payload):
        prot.sendReply(200, seq, "Killing Daemon!")
        os.abort()

    def getVersion(self,prot,seq,ln,payload):
        """\brief Returns version"""
        payload = self.__version
        prot.sendReply(200, seq, payload)
    
    def stopDaemon(self,prot,seq,ln,payload):
        """\brief Stops the daemon and all threads
        This method will first stop any more incoming queries, then wait for
        any update tasks to complete, before stopping itself.
        """
        log.debug("stopDaemon called.")
        prot.sendReply(200, seq, "Accepted stop request.")
        log.debug("Stopping Checker Timer")
        self.__checker_timer.stop()
        self.acceptConnections(False)
        log.debug("Stopping Hen Status Daemon (self)")
        Daemon.stop(self)
    
    def startCheckerTimer(self):
        self.__checker_timer = GracefulTimer(CHECKER_INTERVAL, \
                                    self.checkHenStatus, True)
        self.__checker_timer.start()
    
    def stopCheckerTimer(self):
        self.__checker_timer.stop()
        
    def checkerTimerIsRunning(self):
        if self.__checker_timer:
            if self.__checker_timer.isAlive():
                return True
        return False
    
    def __registerMethods(self):
        log.debug("Registering method handlers...")
        self.registerMethodHandler("get_version", self.getVersion)
        #self.registerMethodHandler("stop_daemon", self.stopDaemon)
        #self.registerMethodHandler("kill_daemon", self.killDaemon)
        self.registerMethodHandler("get_henstatus", self.getHenStatus)
        self.registerMethodHandler("get_cli_command_xml", self.getCLICommandXML)
    
    def __createStatusThreads(self):
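        # One DaemonStatusChecker thread per known daemon; each is handed an Event
        # which it presumably sets once its status check completes (see
        # __waitForResults below).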
        for (daemon, method) in DaemonStatus().getAllDaemonStatusMethods():
            doneEvent = threading.Event()
            self.__checkerThreads[daemon] = \
                DaemonStatusChecker(method, doneEvent, STATUS_TIMEOUT)
            self.__checkerThreads[daemon].start()
            self.__doneList.append(doneEvent)

    def __waitForResults(self):
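        # Poll every 2 seconds until every checker thread has signalled completion.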
        while 1:
            done = True
            for doneEvent in self.__doneList:
                if not doneEvent.isSet():
                    done = False
            if done:
                break
            time.sleep(2)
    
    def __collectResults(self):
        for daemon in self.__checkerThreads.keys():
            if self.__checkerThreads[daemon].isOnline():
                self.__runningDaemons.append(daemon)
            else:
                self.__stoppedDaemons.append(daemon)

    def __generateCommandXML(self):
        self.__cli_commands_xml = "<testbedcommands>"
        
        # TODO: !!!

        for daemon in self.__runningDaemons:
            pass
        
    def checkHenStatus(self):
        log.debug("checkHenStatus() called.")
        self.__checker_lock.acquire()
        self.__stoppedDaemons = []
        self.__runningDaemons = []
        self.__checkerThreads = {}
        self.__doneList = []
        self.__createStatusThreads()
        self.__waitForResults()
        self.__collectResults()
        self.__generateCommandXML()
        self.__checker_lock.release()
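
# A minimal, self-contained sketch (not part of the daemon) of how a client might
# consume the reply that getHenStatus() builds above: two header lines, a blank
# line, then the <processmanagement> XML. The daemon names are sample values only.
import xml.dom.minidom

sample_reply = ("Content-type: text/xml\n"
                "Cache-Control: no-store, no-cache, must-revalidate\n\n"
                "<processmanagement>\n"
                "\t<running>\n"
                "\t\t<process name=\"monitord\" />\n"
                "\t</running>\n"
                "\t<stopped>\n"
                "\t\t<process name=\"emergencyd\" />\n"
                "\t</stopped>\n"
                "</processmanagement>\n")

xml_body = sample_reply.split("\n\n", 1)[1]
dom = xml.dom.minidom.parseString(xml_body)
for section in ("running", "stopped"):
    processes = dom.getElementsByTagName(section)[0].getElementsByTagName("process")
    print section, [p.attributes["name"].value for p in processes]
dom.unlink()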
Example #2
0
class MonitorDB:
    HOSTONLINE = 1
    HOSTOFFLINE = 0

    __db_dir = None
    __status_lock = None
    __flush_lock = None
    __update_lock = None
    __writebuf_lock = None
    __open_file_lock = None
    __writes_since_flush = None
    __node_fds = None
    __node_locks = None
    __write_buf = None
    __flush_timer = None
    # history format: {nodeid:{sensorid:[(type, time, val, status),..],..},..}
    # newest data at the front of the list (inserted at index 0)
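    #   (illustrative example: {"computer1": {"cpu0.temp": [("temperature", 1168342852, 32.5, 0)]}})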
    __sensor_history = None
    __sensor_max = None

    __sensor_check_interval = None

    # host status dictionary. Format: {nodeid:status, ...}
    #        Where status is: 0 = OK, 1 = OFFLINE
    __host_status = None

    def __init__(self):
        self.__writes_since_flush = 0
        self.__status_lock = threading.RLock()
        self.__flush_lock = threading.RLock()
        self.__update_lock = threading.RLock()
        self.__writebuf_lock = threading.RLock()
        self.__open_file_lock = threading.RLock()
        self.__node_fds = {}
        self.__node_locks = {}
        self.__write_buf = {}
        self.__sensor_history = {}
        self.__sensor_max = {}
        self.__host_status = {}

    def getStorageStats(self):
        stats = {}
        stats["__sensor_history length"] = len(self.__sensor_history)
        histCount = 0
        for sensorDict in self.__sensor_history.values():
            histCount += len(sensorDict.keys())
        stats["Total history entries"] = histCount
        stats["__sensor_max length"] = len(self.__sensor_max)
        stats["__write_buf length"] = len(self.__write_buf)
        stats["__host_status length"] = len(self.__host_status)
        stats["__node_locks length"] = len(self.__node_locks)
        stats["__node_fds length"] = len(self.__node_fds)
        stats["__writes_since_flush"] = self.__writes_since_flush
        return stats

    def startTimer(self):
        log.debug("MonitorDB: Starting flush timer")
        self.__flush_timer = GracefulTimer(AUTO_FLUSH_TIME, self.flushWrites)
        self.__flush_timer.start()

    def stopTimer(self):
        """\brief Stop the write flush timer.
        """
        log.debug("MonitorDB: stopping flush timer")
        self.__flush_timer.stop()
        self.flushWrites()

    def setSensorCheckInterval(self, interval):
        log.debug("Setting sensor check interval to %s seconds" %
                  str(interval))
        self.__sensor_check_interval = interval

    def setDBDirectory(self, dbDir):
        if len(dbDir) <= 0:
            log.critical(
                "setDBDirectory(): No directory provided, using \"/\"")
            dbDir = "/"
        if dbDir[len(dbDir) - 1] != "/":
            dbDir = dbDir + "/"
        log.debug("setDBDirectory(): Creating dir [" + dbDir + "]")
        if not os.path.exists(dbDir):
            os.mkdir(dbDir)
        log.info("MonitorDB: Using DB directory [" + dbDir + "]")
        self.__db_dir = dbDir

    def setHostStatus(self, nodeid, status):
        self.__status_lock.acquire()
        self.__host_status[nodeid] = status
        self.__status_lock.release()

    def getHostStatus(self, nodeid):
        if self.__host_status.has_key(nodeid):
            return self.__host_status[nodeid]
        else:
            return -1

    def getHostStatuses(self):
        return self.__host_status

    def getHostHistory(self, nodeid):
        if self.__sensor_history.has_key(nodeid):
            return str(self.__sensor_history[nodeid])
        return None

    def getAllCurrentSensorReadings(self):
        """\brief Returns the current/max values of sensors for all nodes.
        The results are in the form:
            {nodeid:{sensorid:(type,time,val,maxval,status),..},..}
        """
        results = {}
        for nodeid in self.__sensor_history.keys():
            results[nodeid] = {}
            for sensorid in self.__sensor_history[nodeid].keys():
                if len(self.__sensor_history[nodeid][sensorid]) < 1:
                    continue
                (htype,htime,hval,hstat) = \
                    self.__sensor_history[nodeid][sensorid][0]
                if (self.__sensor_max[nodeid]).has_key(sensorid):
                    results[nodeid][sensorid] = (htype, htime, hval, \
                         (self.__sensor_max[nodeid])[sensorid], hstat)
                else:
                    results[nodeid][sensorid] = \
                            (htype, htime, hval, -1, hstat)
        return results

    def getLastNumSensorReadings(self, nodeid, sensorid, n_readings):
        """\brief Returns the last 'n_readings' readings for the given sensor of
        the given node
        The results are sorted, most recent first, in the form:
            [(time,value), (time,value), ...]
        """
        results = []
        # Check for node history.
        if self.__sensor_history.has_key(nodeid):
            if self.__sensor_history[nodeid].has_key(sensorid):
                readings = (self.__sensor_history[nodeid])[sensorid]
                # Is the history in memory sufficient for the query ?
                if len(readings) >= n_readings:
                    return self.__historyToTimeValPairs(
                                        readings[0:n_readings])
        # If not, resort to files
        log.debug("getLastNumSensorReadings(): resorting to file lookup")
        files = self.__getOrderedFileNames(self.__db_dir + nodeid)
        for file in files:
            file_readings = self.__readFromFile(nodeid, file, sensorid)
            if file_readings:
                results += file_readings
                if len(results) >= n_readings:
                    return results[0:n_readings]
        # if we're here, we didn't get enough readings! return what we've got.
        return results

    def getSensorReadingsSinceTime(self, nodeid, sensorid, sinceTime):
        """\brief Returns all the sensor readings since the given time.
        The results are sorted, most recent first, in the form:
            [(time,value), (time,value), ...]
        """
        results = []
        # Check for node history.
        if self.__sensor_history.has_key(nodeid):
            if self.__sensor_history[nodeid].has_key(sensorid):
                readings = (self.__sensor_history[nodeid])[sensorid]
                # Is the history in memory sufficient for the query ?
                if len(readings) > 0:
                    (type, time, val, status) = readings[0]
                    if time < sinceTime:
                        return self.__filterTimePairs( \
                           self.__historyToTimeValPairs(readings), sinceTime)
        # If we're here, we need to read from file.
        log.debug("getSensorReadingsSinceTime(): resorting to file lookup")
        files = self.__getOrderedFileNames(self.__db_dir + nodeid)
        for file in files:
            file_readings = self.__readFromFile(nodeid, file, sensorid)
            if file_readings:
                results += file_readings
                results.sort()
                results.reverse()
                (time, val) = file_readings[len(file_readings) - 1]
                if time < sinceTime:
                    return self.__filterTimePairs(results, sinceTime)
        # if we're here, we didn't get enough readings! return what we've got.
        results.sort()
        results.reverse()
        return self.__filterTimePairs(results, sinceTime)

    def __historyToTimeValPairs(self, readings):
        """\brief Converts a list in history format (type, time, val, status) to
            a list in (time,val) format.
        """
        def map_function(reading):
            (rtype, rtime, rval, rstatus) = reading
            return (rtime, rval)

        return map(map_function, readings)
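
# Illustrative sketch (sample values, not real sensor data) of the conversion that
# __historyToTimeValPairs() performs: history tuples -> (time, val) pairs, newest first.
sample_history = [("temperature", 1168342852, 32.5, 0),
                  ("temperature", 1168342792, 31.9, 0)]
print [(t, v) for (_stype, t, v, _status) in sample_history]
# -> [(1168342852, 32.5), (1168342792, 31.9)]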
Example #3
0
class SwitchDB:
    HOSTONLINE = 1
    HOSTOFFLINE = 0

    __hm = None
    __db_dir = None
    __status_lock = None
    __flush_lock = None
    __update_lock = None
    __writebuf_lock = None
    __open_file_lock = None
    __writes_since_flush = None
    __node_fds = None
    __node_locks = None
    __write_buf = None
    __flush_timer = None
    # history format: {nodeid:{sensorid:[(type, time, val, status),..],..},..}
    __sensor_history = None
    __sensor_max = None

    # fdb format: {mac:(name,{(nodeid,port):[trunk,time],..}),..}
    __fdb = None
    # links format :
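    #   (link[0] appears to be the link type, e.g. "external", and link[2] a list
    #    of (switch, port) members - inferred from isExternal() below)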
    __links = None
    __links_map_lock = None
    __links_map = None
    #
    __graph = None
    
    # host status dictionary. Format: {nodeid:status, ...}
    #        Where status is: 0 = OK, 1 = OFFLINE
    __host_status = None

    def __init__(self):
        self.__writes_since_flush = 0
        self.__status_lock = threading.RLock()
        self.__flush_lock = threading.RLock()
        self.__update_lock = threading.RLock()
        self.__writebuf_lock = threading.RLock()
        self.__open_file_lock = threading.RLock()
        self.__links_map_lock = threading.RLock()
        self.__node_fds = {}
        self.__node_locks = {}
        self.__write_buf = {}
        self.__sensor_history = {}
        self.__sensor_max = {}
        self.__fdb = {}
        self.__links = {}
        self.__graph = None
        self.__host_status = {}

    def setHenManager(self,hm):
        self.__hm = hm

    def getStorageStats(self):
        stats = {}
        stats["__sensor_history length"] = len(self.__sensor_history)
        histCount = 0
        for sensorDict in self.__sensor_history.values():
            histCount += len(sensorDict.keys())
        stats["Total history entries"] = histCount
        stats["__sensor_max length"] = len(self.__sensor_max)
        stats["__write_buf length"] = len(self.__write_buf)
        stats["__host_status length"] = len(self.__host_status)
        stats["__node_locks length"] = len(self.__node_locks)
        stats["__node_fds length"] = len(self.__node_fds)
        stats["__writes_since_flush"] = self.__writes_since_flush
        return stats

    def startTimer(self):
        log.debug("SwitchDB: Starting flush timer")
        self.__flush_timer = GracefulTimer(AUTO_FLUSH_TIME, self.flushWrites)
        self.__flush_timer.start()

    def stopTimer(self):
        """\brief Stop the write flush timer.
        """
        log.debug("SwitchDB: stopping flush timer")
        self.__flush_timer.stop()
        self.flushWrites()

    def locateMac(self,m,unique=False):
        mac = m.upper()
        if self.__fdb.has_key(mac):
            res = []  
            for i in self.__fdb[mac][1]:
                if unique:
                    if self.__fdb[mac][1][i][0] == False:
                        res.append(i)
                else:
                    res.append(i)
            if unique:
                if (len(res) == 1):
                    return res
                else:
                    return None
            else:
                return res
        return None

    def setDBDirectory(self, dbDir):
        if len(dbDir) <= 0:
            log.critical("setDBDirectory(): No directory provided, using \"/\"")
            dbDir = "/"
        if dbDir[len(dbDir) - 1] != "/":
            dbDir = dbDir + "/"
        log.debug("setDBDirectory(): Creating dir [" + dbDir + "]")
        if not os.path.exists(dbDir):
            os.mkdir(dbDir)
        log.info("SwitchDB: Using DB directory [" + dbDir + "]")
        self.__db_dir = dbDir

    def setHostStatus(self, nodeid, status):
        self.__status_lock.acquire()
        self.__host_status[nodeid] = status
        self.__status_lock.release()

    def getHostStatus(self, nodeid):
        if self.__host_status.has_key(nodeid):
            return self.__host_status[nodeid]
        else:
            return -1

    def getHostStatuses(self):
        return self.__host_status

    def getHostHistory(self, nodeid):
        if self.__sensor_history.has_key(nodeid):
            return str(self.__sensor_history[nodeid])
        return None

    def getLinks(self):
        return self.__links

    def fromMacGetId(self,mac):
        if self.__hm == None:
            log.critical("HM not set in switchdb")
            return None
        try:
            nodes_dict = self.__hm.getNodes("all","all")
        except:
            log.critical("Unable to get nodes from hm")
            return None
        for nodetype in nodes_dict:
            for node in nodes_dict[nodetype].values():
                interface_dict = node.getInterfaces()
                for interface_type in interface_dict:
                    if interface_dict[interface_type] != None :
                        for interface in interface_dict[interface_type]:
                            if (str(interface.getMAC().upper().strip()) == str(mac.upper().strip())):
                                return (node.getNodeID(),interface.getInterfaceID())
        return None #("unknown","unknown")

    def isExternal(self,switch,port):
        if self.__links == None:
            return False
        for link in self.__links.values():
            if link[0] != "external":
                continue
            try:
                for member in link[2]:
                    if ((str(switch) == str(member[0])) and (str(port) == str(member[1]))):
                        return True
            except Exception, e:
                print "error in isExternal ",e
                traceback.print_exc(file=sys.stdout)
                return False
        return False
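
# A minimal standalone sketch (assumed link structure, hypothetical values) of the
# membership test that isExternal() performs: a link whose first element is the
# string "external" and whose third element lists its (switch, port) members.
sample_links = {
    "link1": ("external", None, [("switch10", 5), ("switch11", 2)]),
    "link2": ("internal", None, [("switch10", 7)]),
}

def is_external(links, switch, port):
    for link in links.values():
        if link[0] != "external":
            continue
        for member in link[2]:
            if str(switch) == str(member[0]) and str(port) == str(member[1]):
                return True
    return False

print is_external(sample_links, "switch10", 5)   # True
print is_external(sample_links, "switch10", 7)   # False (port 7 is on an internal link)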
Example #4
0
class MonitorDB:
    HOSTONLINE = 1
    HOSTOFFLINE = 0

    __db_dir = None
    __status_lock = None
    __flush_lock = None
    __update_lock = None
    __writebuf_lock = None
    __open_file_lock = None
    __writes_since_flush = None
    __node_fds = None
    __node_locks = None
    __write_buf = None
    __flush_timer = None
    # history format: {nodeid:{sensorid:[(type, time, val, status),..],..},..}
    # newest data at the front of the list (inserted at index 0)
    __sensor_history = None
    __sensor_max = None
    
    __sensor_check_interval = None

    # host status dictionary. Format: {nodeid:status, ...}
    #        Where status is: 0 = OK, 1 = OFFLINE
    __host_status = None

    def __init__(self):
        self.__writes_since_flush = 0
        self.__status_lock = threading.RLock()
        self.__flush_lock = threading.RLock()
        self.__update_lock = threading.RLock()
        self.__writebuf_lock = threading.RLock()
        self.__open_file_lock = threading.RLock()
        self.__node_fds = {}
        self.__node_locks = {}
        self.__write_buf = {}
        self.__sensor_history = {}
        self.__sensor_max = {}
        self.__host_status = {}

    def getStorageStats(self):
        stats = {}
        stats["__sensor_history length"] = len(self.__sensor_history)
        histCount = 0
        for sensorDict in self.__sensor_history.values():
            histCount += len(sensorDict.keys())
        stats["Total history entries"] = histCount
        stats["__sensor_max length"] = len(self.__sensor_max)
        stats["__write_buf length"] = len(self.__write_buf)
        stats["__host_status length"] = len(self.__host_status)
        stats["__node_locks length"] = len(self.__node_locks)
        stats["__node_fds length"] = len(self.__node_fds)
        stats["__writes_since_flush"] = self.__writes_since_flush
        return stats

    def startTimer(self):
        log.debug("MonitorDB: Starting flush timer")
        self.__flush_timer = GracefulTimer(AUTO_FLUSH_TIME, self.flushWrites)
        self.__flush_timer.start()

    def stopTimer(self):
        """\brief Stop the write flush timer.
        """
        log.debug("MonitorDB: stopping flush timer")
        self.__flush_timer.stop()
        self.flushWrites()

    def setSensorCheckInterval(self, interval):
        log.debug("Setting sensor check interval to %s seconds" % str(interval))
        self.__sensor_check_interval = interval

    def setDBDirectory(self, dbDir):
        if len(dbDir) <= 0:
            log.critical("setDBDirectory(): No directory provided, using \"/\"")
            dbDir = "/"
        if dbDir[len(dbDir) - 1] != "/":
            dbDir = dbDir + "/"
        log.debug("setDBDirectory(): Creating dir [" + dbDir + "]")
        if not os.path.exists(dbDir):
            os.mkdir(dbDir)
        log.info("MonitorDB: Using DB directory [" + dbDir + "]")
        self.__db_dir = dbDir

    def setHostStatus(self, nodeid, status):
        self.__status_lock.acquire()
        self.__host_status[nodeid] = status
        self.__status_lock.release()

    def getHostStatus(self, nodeid):
        if self.__host_status.has_key(nodeid):
            return self.__host_status[nodeid]
        else:
            return -1

    def getHostStatuses(self):
        return self.__host_status

    def getHostHistory(self, nodeid):
        if self.__sensor_history.has_key(nodeid):
            return str(self.__sensor_history[nodeid])
        return None

    def getAllCurrentSensorReadings(self):
        """\brief Returns the current/max values of sensors for all nodes.
        The results are in the form:
            {nodeid:{sensorid:(type,time,val,maxval,status),..},..}
        """
        results = {}
        for nodeid in self.__sensor_history.keys():
            results[nodeid] = {}
            for sensorid in self.__sensor_history[nodeid].keys():
                if len(self.__sensor_history[nodeid][sensorid]) < 1:
                    continue
                (htype,htime,hval,hstat) = \
                    self.__sensor_history[nodeid][sensorid][0]
                if (self.__sensor_max[nodeid]).has_key(sensorid):
                    results[nodeid][sensorid] = (htype, htime, hval, \
                         (self.__sensor_max[nodeid])[sensorid], hstat)
                else:
                    results[nodeid][sensorid] = \
                            (htype, htime, hval, -1, hstat)
        return results

    def getLastNumSensorReadings(self, nodeid, sensorid, n_readings):
        """\brief Returns the last 'n_readings' readings for the given sensor of
        the given node
        The results are sorted, most recent first, in the form:
            [(time,value), (time,value), ...]
        """
        results = []
        # Check for node history.
        if self.__sensor_history.has_key(nodeid):
            if self.__sensor_history[nodeid].has_key(sensorid):
                readings = (self.__sensor_history[nodeid])[sensorid]
                # Is the history in memory sufficient for the query ?
                if len(readings) >= n_readings:
                    return self.__historyToTimeValPairs(
                                        readings[0:n_readings])
        # If not, resort to files
        log.debug("getLastNumSensorReadings(): resorting to file lookup")
        files = self.__getOrderedFileNames(self.__db_dir + nodeid)
        for file in files:
            file_readings = self.__readFromFile(nodeid, file, sensorid)
            if file_readings:
                results += file_readings
                if len(results) >= n_readings:
                    return results[0:n_readings]
        # if we're here, we didn't get enough readings! return what we've got.
        return results

    def getSensorReadingsSinceTime(self, nodeid, sensorid, sinceTime):
        """\brief Returns all the sensor readings since the given time.
        The results are sorted, most recent first, in the form:
            [(time,value), (time,value), ...]
        """
        results = []
        # Check for node history.
        if self.__sensor_history.has_key(nodeid):
            if self.__sensor_history[nodeid].has_key(sensorid):
                readings = (self.__sensor_history[nodeid])[sensorid]
                # Is the history in memory sufficient for the query ?
                if len(readings) > 0:
                    (type,time,val,status) = readings[0]
                    if time < sinceTime:
                        return self.__filterTimePairs( \
                           self.__historyToTimeValPairs(readings), sinceTime)
        # If we're here, we need to read from file.
        log.debug("getSensorReadingsSinceTime(): resorting to file lookup")
        files = self.__getOrderedFileNames(self.__db_dir + nodeid)
        for file in files:
            file_readings = self.__readFromFile(nodeid, file, sensorid)
            if file_readings:
                results += file_readings
                results.sort()
                results.reverse()
                (time,val) = file_readings[len(file_readings) - 1]
                if time < sinceTime:
                    return self.__filterTimePairs(results, sinceTime)
        # if we're here, we didn't get enough readings! return what we've got.
        results.sort()
        results.reverse()
        return self.__filterTimePairs(results, sinceTime)

    def __historyToTimeValPairs(self, readings):
        """\brief Converts a list in history format (type, time, val, status) to
            a list in (time,val) format.
        """
        def map_function(reading):
            (rtype, rtime, rval, rstatus) = reading
            return (rtime, rval)
        return map(map_function, readings)
Example #5
0
class EmergencyChecker(Daemon):
    """
    \brief Implements the EmergencyD external interface
    This class contains the methods called when requests are received by the
    Daemon (inherited).
    """
    __version = "Emergency Daemon v0.1"
    __checker_timer = None
    __checker_lock = None
    __config = None
    __config_path = None
    
    __warning_email_addresses = None
    __critical_email_addresses = None
    __run_level = None
    
    def __init__(self, config=CONFIG_FILE):
        """\brief Registers remote methods and starts update thread (timer)
        """
        Daemon.__init__(self)
        self.__config_path = config
        self.__parseConfig(self.__config_path)
        self.__registerMethods()
        self.__checker_lock = threading.Lock()
    
    def testMethod(self,prot,seq,ln,payload):
        log.debug("testMethod() called.")
        sdata = (2, "cpu.w00t", "temperature", 1168342852, 32.5, 
                    32.6)
        self.__handleCriticalStatus("computer32", sdata)
        prot.sendReply(200, seq, "Test method called.")
    
    def manualRunChecker(self,prot,seq,ln,payload):
        log.debug("manualRunChecker() called.")
        prot.sendReply(200, seq, "Running runChecker.")
        self.runChecker()
        
    def manualStopUpdateTimer(self,prot,seq,ln,payload):
        log.debug("manualStopUpdateTimer() called.")
        if not self.updateTimerIsRunning():
            prot.sendReply(400, seq, "The update timer isn't running!") 
        else:
            self.stopUpdateTimer()
            prot.sendReply(200, seq, "Update Timer stopped.")
        
    def manualStartUpdateTimer(self,prot,seq,ln,payload):
        log.debug("manualStartUpdateTimer() called.")
        if self.updateTimerIsRunning():
            prot.sendReply(400, seq, "The update timer is already running!") 
        else:
            self.startUpdateTimer()
            prot.sendReply(200, seq, "Update Timer started.")
            
    def setRunLevel(self,prot,seq,ln,payload):
        log.debug("setRunLevel() called.")
        new_run_level = int(payload)
        if not self.__validRunLevel(new_run_level):
            log.debug("Invalid run_level (%d) given" % new_run_level)
            prot.sendReply(400, seq, "Invalid run_level given.")
            return
        if new_run_level < self.__run_level:
            payload = "Lowering run_level from %d to %d." % \
                      (self.__run_level, new_run_level)
        elif new_run_level > self.__run_level:
            payload = "Raising run_level from %d to %d." % \
                      (self.__run_level, new_run_level)
        else:
            payload = "No change in run_level."
        log.info(payload)
        prot.sendReply(200, seq, payload)
        self.__run_level = new_run_level
    
    def getRunLevel(self,prot,seq,ln,payload):
        log.debug("getRunLevel() called.")
        prot.sendReply(200, seq, str(self.__run_level))

    def reloadConfig(self,prot,seq,ln,payload):
        log.debug("reloadConfig() called.")
        self.__parseConfig(self.__config_path)
        prot.sendReply(200, seq, "Reload of config file completed.")        

    def killDaemon(self,prot,seq,ln,payload):
        prot.sendReply(200, seq, "Killing Daemon!")
        os.abort()

    def getVersion(self,prot,seq,ln,payload):
        """\brief Returns version"""
        payload = self.__version
        prot.sendReply(200, seq, payload)
    
    def stopDaemon(self,prot,seq,ln,payload):
        """\brief Stops the daemon and all threads
        This method will first stop any more incoming queries, then wait for
        any update tasks to complete, before stopping itself.
        """
        log.debug("stopDaemon called.")
        prot.sendReply(200, seq, "Accepted stop request.")
        log.debug("Stopping Checker Timer")
        self.__checker_timer.stop()
        self.acceptConnections(False)
        log.debug("Stopping Emergency Daemon (self)")
        Daemon.stop(self)
    
    def startUpdateTimer(self):
        self.__checker_timer = GracefulTimer(CHECKER_INTERVAL, \
                                    self.runChecker, True)
        self.__checker_timer.start()
    
    def stopUpdateTimer(self):
        self.__checker_timer.stop()
        
    def updateTimerIsRunning(self):
        if self.__checker_timer:
            if self.__checker_timer.isAlive():
                return True
        return False
    
    def runChecker(self):
        log.debug("runChecker() called.")
        log.debug("Acquiring checker lock.")
        self.__checker_lock.acquire()
        p = Protocol(None)
        if DaemonStatus().monitorDaemonIsOnline(5):
            p.open(MONITORD_HOST, MONITORD_PORT)
            p.sendRequest("get_currentsensorreadings","",self.nodeStatusHandler)
            p.readAndProcess()
        else:
            log.info("Monitor Daemon is not online!") # TODO: Email
        self.__checker_lock.release()
        log.debug("Released checker lock.")
        
    def nodeStatusHandler(self,code,seq,size,payload):
        if (code != 200) or (len(payload) == 0):
            # TODO: Warn someone that monitord isn't working properly
            log.critical("Incorrect payload received from monitor daemon!")
        sensor_dom = xml.dom.minidom.parseString(payload)
        node_readings = sensor_dom.getElementsByTagName("nodereading")
        for nodereading in node_readings:
            self.checkNodeReadings(nodereading)
        sensor_dom.unlink()

    def checkNodeReadings(self, nodereading):
        nodeid = nodereading.attributes["id"].value
        overallstatus = nodereading.attributes["overallstatus"].value
        # the attribute value is a string, so normalise it before comparing
        if int(overallstatus) == 0:
            return
        readings = nodereading.getElementsByTagName("reading")
        for reading in readings:
            # (status, sensorid, sensortype, timeinsecs, sensorvalue, 
            #        sensormaxvalue)
            sdata = self.__parseXMLReading(reading)
            if sdata[0] == 0:
                continue
            elif sdata[0] == 1:
                log.critical("ALERT: [%s][%s] has WARNING status with "
                             "curval=[%f], highval=[%f]" %
                             (nodeid, sdata[1], sdata[4], sdata[5]))
                self.__handleWarningStatus(nodeid, sdata)
            elif sdata[0] >= 2:
                log.critical("ALERT: [%s][%s] has CRITICAL status with "
                             "curval=[%f], highval=[%f]" %
                             (nodeid, sdata[1], sdata[4], sdata[5]))
                self.__handleCriticalStatus(nodeid, sdata)
            else:
                log.critical("WARNING: [%s][%s] has UNKNOWN status %d!" \
                      % (nodeid, sdata[1], sdata[0]))
                self.__handleUnknownStatus(nodeid, sdata)
   
    def __makeEmailNodeMessage(self, status, nodeid, sdata, action):
        return (EMAIL_NODE_DETAILS % (status, nodeid, sdata[1], sdata[2], 
                status, datetime.datetime.fromtimestamp(float(sdata[3])).\
                strftime("%Y-%m-%d-%H:%M:%S"), sdata[4], sdata[5], \
                self.__run_level, action))
    
    def __handleWarningStatus(self, nodeid, sdata):
        """\brief Handles a sensor warning status, by sending out a warning 
        email to the warning_email_addresses recipients.
        """
        message = self.__makeEmailNodeMessage("WARNING", nodeid, sdata, action)

        if self.__run_level == 0:
            action = "No action - currently running in dry-run mode."
            message = self.__makeEmailNodeMessage("WARNING", nodeid, sdata, 
                                                  action)
            log.warning(message)
        elif self.__run_level >= 1:
            action = "Email warning, but no direct action."
            message = self.__makeEmailNodeMessage("WARNING", nodeid, sdata, 
                                                  action)
            log.warning(message)
            self.__sendEmail(self.__warning_email_addresses,
                     "EMERGENCYD: Node Sensor [%s][%s] in WARNING state!"\
                     % (nodeid, sdata[1]), message)
    
    def __handleCriticalStatus(self, nodeid, sdata):
        """\brief Handles a critical warning status, by attempting to shut down
        the node, and then send out an email to the critical_email_addresses 
        recipients, with the results and details.
        """
        if self.__run_level == 0:
            action = "No action - currently running in dry-run mode."
            message = self.__makeEmailNodeMessage("CRITICAL", nodeid, sdata, 
                                                  action)
            log.critical(message)
        elif self.__run_level == 1:
            action = "Email warning, but no direct action."
            message = self.__makeEmailNodeMessage("CRITICAL", nodeid, sdata, 
                                                  action)
            log.critical(message)
            self.__sendEmail(self.__critical_email_addresses,
                     "EMERGENCYD: Node Sensor [%s][%s] in CRITICAL state!"\
                     % (nodeid, sdata[1]), message)
        else:
            action = "Email warning, and attempted powerdown of node."
            message = self.__makeEmailNodeMessage("CRITICAL", nodeid, sdata, 
                                                  action)
            powerdown_status = self.__attemptPowerDown(nodeid)
            message += "Output from powerdown attempt:\n" + powerdown_status
            log.critical(message)
            self.__sendEmail(self.__critical_email_addresses,
                     "EMERGENCYD: Node Sensor [%s][%s] in CRITICAL state!"\
                     % (nodeid, sdata[1]), message)
    
    def __handleUnknownStatus(self, nodeid, sdata):
        """\brief Handles an unknown sensor status, by sending out an email to
        the warning_email_addresses recipients with details.
        """
        unknown_state_message = "A node in an UNKNOWN state indicates a system "\
                            + "error. Please notify the author immediately."
        if self.__run_level == 0:
            action = "No action - currently running in dry-run mode."
            message = self.__makeEmailNodeMessage("UNKNOWN", nodeid, sdata, 
                                                  action)
            message += unknown_state_message
            log.critical(message)
        elif self.__run_level >= 1:
            action = "Email warning, but no direct action."
            message = self.__makeEmailNodeMessage("UNKNOWN", nodeid, sdata, 
                                                  action)
            message += unknown_state_message
            log.critical(message)
            self.__sendEmail(self.__critical_email_addresses,
                     "EMERGENCYD: Node Sensor [%s][%s] in UNKNOWN state!"\
                     % (nodeid, sdata[1]), message)
            
    def __attemptPowerDown(self, nodeid):
        """\brief Attempts to power down the node specified by nodeid, and
        returns a string of the results
        \p nodeid - id of node that needs powering down
        """
        (status, output) = commands.getstatusoutput(\
                            "%s %s" % (POWERDOWN_COMMAND, nodeid))
        return output   
    
    def __sendEmail(self, recipients, subject, message):
        """\brief Sends an email with the provided message to the provided list
        of recipients
        \p recipients - list of recipients.
        \p subject - message subject
        \p message - string message to be sent.
        """
        if len(recipients) == 0:
            log.critical("__sendEmail(): Error: No recipients given.")
            return
        message = ("To: %s\r\nSubject: %s\r\n\r\n"
                   % (", ".join(recipients), subject)) + message
        try:
            server = smtplib.SMTP('localhost')
            server.sendmail("", recipients, message)
            server.quit()
        except Exception, e:
            log.critical("__sendEmail() exception: %s" % str(e))
Example #6
0
class EmergencyChecker(Daemon):
    """
    \brief Implements the EmergencyD external interface
    This class contains the methods called when requests are received by the
    Daemon (inherited).
    """
    __version = "Emergency Daemon v0.1"
    __checker_timer = None
    __checker_lock = None
    __config = None
    __config_path = None

    __warning_email_addresses = None
    __critical_email_addresses = None
    __run_level = None

    def __init__(self, config=CONFIG_FILE):
        """\brief Registers remote methods and starts update thread (timer)
        """
        Daemon.__init__(self)
        self.__config_path = config
        self.__parseConfig(self.__config_path)
        self.__registerMethods()
        self.__checker_lock = threading.Lock()

    def testMethod(self, prot, seq, ln, payload):
        log.debug("testMethod() called.")
        sdata = (2, "cpu.w00t", "temperature", 1168342852, 32.5, 32.6)
        self.__handleCriticalStatus("computer32", sdata)
        prot.sendReply(200, seq, "Test method called.")

    def manualRunChecker(self, prot, seq, ln, payload):
        log.debug("manualRunChecker() called.")
        prot.sendReply(200, seq, "Running runChecker.")
        self.runChecker()

    def manualStopUpdateTimer(self, prot, seq, ln, payload):
        log.debug("manualStopUpdateTimer() called.")
        if not self.updateTimerIsRunning():
            prot.sendReply(400, seq, "The update timer isn't running!")
        else:
            self.stopUpdateTimer()
            prot.sendReply(200, seq, "Update Timer stopped.")

    def manualStartUpdateTimer(self, prot, seq, ln, payload):
        log.debug("manualStartUpdateTimer() called.")
        if self.updateTimerIsRunning():
            prot.sendReply(400, seq, "The update timer is already running!")
        else:
            self.startUpdateTimer()
            prot.sendReply(200, seq, "Update Timer started.")

    def setRunLevel(self, prot, seq, ln, payload):
        log.debug("setRunLevel() called.")
        new_run_level = int(payload)
        if not self.__validRunLevel(new_run_level):
            log.debug("Invalid run_level (%d) given" % new_run_level)
            prot.sendReply(400, seq, "Invalid run_level given.")
            return
        if new_run_level < self.__run_level:
            payload = "Lowering run_level from %d to %d." % \
                      (self.__run_level, new_run_level)
        elif new_run_level > self.__run_level:
            payload = "Raising run_level from %d to %d." % \
                      (self.__run_level, new_run_level)
        else:
            payload = "No change in run_level."
        log.info(payload)
        prot.sendReply(200, seq, payload)
        self.__run_level = new_run_level

    def getRunLevel(self, prot, seq, ln, payload):
        log.debug("getRunLevel() called.")
        prot.sendReply(200, seq, str(self.__run_level))

    def reloadConfig(self, prot, seq, ln, payload):
        log.debug("reloadConfig() called.")
        self.__parseConfig(self.__config_path)
        prot.sendReply(200, seq, "Reload of config file completed.")

    def killDaemon(self, prot, seq, ln, payload):
        prot.sendReply(200, seq, "Killing Daemon!")
        os.abort()

    def getVersion(self, prot, seq, ln, payload):
        """\brief Returns version"""
        payload = self.__version
        prot.sendReply(200, seq, payload)

    def stopDaemon(self, prot, seq, ln, payload):
        """\brief Stops the daemon and all threads
        This method will first stop any more incoming queries, then wait for
        any update tasks to complete, before stopping itself.
        """
        log.debug("stopDaemon called.")
        prot.sendReply(200, seq, "Accepted stop request.")
        log.debug("Stopping Checker Timer")
        self.__checker_timer.stop()
        self.acceptConnections(False)
        log.debug("Stopping Emergency Daemon (self)")
        Daemon.stop(self)

    def startUpdateTimer(self):
        self.__checker_timer = GracefulTimer(CHECKER_INTERVAL, \
                                    self.runChecker, True)
        self.__checker_timer.start()

    def stopUpdateTimer(self):
        self.__checker_timer.stop()

    def updateTimerIsRunning(self):
        if self.__checker_timer:
            if self.__checker_timer.isAlive():
                return True
        return False

    def runChecker(self):
        log.debug("runChecker() called.")
        log.debug("Acquiring checker lock.")
        self.__checker_lock.acquire()
        p = Protocol(None)
        if DaemonStatus().monitorDaemonIsOnline(5):
            p.open(MONITORD_HOST, MONITORD_PORT)
            p.sendRequest("get_currentsensorreadings", "",
                          self.nodeStatusHandler)
            p.readAndProcess()
        else:
            log.info("Monitor Daemon is not online!")  # TODO: Email
        self.__checker_lock.release()
        log.debug("Released checker lock.")

    def nodeStatusHandler(self, code, seq, size, payload):
        if (code != 200) or (len(payload) == 0):
            # TODO: Warn someone that monitord isn't working properly
            log.critical("Incorrect payload received from monitor daemon!")
        sensor_dom = xml.dom.minidom.parseString(payload)
        node_readings = sensor_dom.getElementsByTagName("nodereading")
        for nodereading in node_readings:
            self.checkNodeReadings(nodereading)
        sensor_dom.unlink()

    def checkNodeReadings(self, nodereading):
        nodeid = nodereading.attributes["id"].value
        overallstatus = nodereading.attributes["overallstatus"].value
        # the attribute value is a string, so normalise it before comparing
        if int(overallstatus) == 0:
            return
        readings = nodereading.getElementsByTagName("reading")
        for reading in readings:
            # (status, sensorid, sensortype, timeinsecs, sensorvalue,
            #        sensormaxvalue)
            sdata = self.__parseXMLReading(reading)
            if sdata[0] == 0:
                continue
            elif sdata[0] == 1:
                log.critical("ALERT: [%s][%s] has WARNING status with "
                             "curval=[%f], highval=[%f]" %
                             (nodeid, sdata[1], sdata[4], sdata[5]))
                self.__handleWarningStatus(nodeid, sdata)
            elif sdata[0] >= 2:
                log.critical("ALERT: [%s][%s] has CRITICAL status with "
                             "curval=[%f], highval=[%f]" %
                             (nodeid, sdata[1], sdata[4], sdata[5]))
                self.__handleCriticalStatus(nodeid, sdata)
            else:
                log.critical("WARNING: [%s][%s] has UNKNOWN status %d!" \
                      % (nodeid, sdata[1], sdata[0]))
                self.__handleUnknownStatus(nodeid, sdata)

    def __makeEmailNodeMessage(self, status, nodeid, sdata, action):
        return (EMAIL_NODE_DETAILS % (status, nodeid, sdata[1], sdata[2],
                status, datetime.datetime.fromtimestamp(float(sdata[3])).\
                strftime("%Y-%m-%d-%H:%M:%S"), sdata[4], sdata[5], \
                self.__run_level, action))

    def __handleWarningStatus(self, nodeid, sdata):
        """\brief Handles a sensor warning status, by sending out a warning 
        email to the warning_email_addresses recipients.
        """
        message = self.__makeEmailNodeMessage("WARNING", nodeid, sdata, action)

        if self.__run_level == 0:
            action = "No action - currently running in dry-run mode."
            message = self.__makeEmailNodeMessage("WARNING", nodeid, sdata,
                                                  action)
            log.warning(message)
        elif self.__run_level >= 1:
            action = "Email warning, but no direct action."
            message = self.__makeEmailNodeMessage("WARNING", nodeid, sdata,
                                                  action)
            log.warning(message)
            self.__sendEmail(self.__warning_email_addresses,
                     "EMERGENCYD: Node Sensor [%s][%s] in WARNING state!"\
                     % (nodeid, sdata[1]), message)

    def __handleCriticalStatus(self, nodeid, sdata):
        """\brief Handles a critical warning status, by attempting to shut down
        the node, and then send out an email to the critical_email_addresses 
        recipients, with the results and details.
        """
        if self.__run_level == 0:
            action = "No action - currently running in dry-run mode."
            message = self.__makeEmailNodeMessage("CRITICAL", nodeid, sdata,
                                                  action)
            log.critical(message)
        elif self.__run_level == 1:
            action = "Email warning, but no direct action."
            message = self.__makeEmailNodeMessage("CRITICAL", nodeid, sdata,
                                                  action)
            log.critical(message)
            self.__sendEmail(self.__critical_email_addresses,
                     "EMERGENCYD: Node Sensor [%s][%s] in CRITICAL state!"\
                     % (nodeid, sdata[1]), message)
        else:
            action = "Email warning, and attempted powerdown of node."
            message = self.__makeEmailNodeMessage("CRITICAL", nodeid, sdata,
                                                  action)
            powerdown_status = self.__attemptPowerDown(nodeid)
            message += "Output from powerdown attempt:\n" + powerdown_status
            log.critical(message)
            self.__sendEmail(self.__critical_email_addresses,
                     "EMERGENCYD: Node Sensor [%s][%s] in CRITICAL state!"\
                     % (nodeid, sdata[1]), message)

    def __handleUnknownStatus(self, nodeid, sdata):
        """\brief Handles an unknown sensor status, by sending out an email to
        the warning_email_addresses recipients with details.
        """
        unknown_state_message = "A node in an UNKNOWN state indicates a system "\
                            + "error. Please notify the author immediately."
        if self.__run_level == 0:
            action = "No action - currently running in dry-run mode."
            message = self.__makeEmailNodeMessage("UNKNOWN", nodeid, sdata,
                                                  action)
            message += unknown_state_message
            log.critical(message)
        elif self.__run_level >= 1:
            action = "Email warning, but no direct action."
            message = self.__makeEmailNodeMessage("UNKNOWN", nodeid, sdata,
                                                  action)
            message += unknown_state_message
            log.critical(message)
            self.__sendEmail(self.__critical_email_addresses,
                     "EMERGENCYD: Node Sensor [%s][%s] in UNKNOWN state!"\
                     % (nodeid, sdata[1]), message)

    def __attemptPowerDown(self, nodeid):
        """\brief Attempts to power down the node specified by nodeid, and
        returns a string of the results
        \p nodeid - id of node that needs powering down
        """
        (status, output) = commands.getstatusoutput(\
                            "%s %s" % (POWERDOWN_COMMAND, nodeid))
        return output

    def __sendEmail(self, recipients, subject, message):
        """\brief Sends an email with the provided message to the provided list
        of recipients
        \p recipients - list of recipients.
        \p subject - message subject
        \p message - string message to be sent.
        """
        if len(recipients) == 0:
            log.critical("__sendEmail(): Error: No recipients given.")
            return
        message = ("To: %s\r\nSubject: %s\r\n\r\n" %
                   (", ".join(recipients), subject)) + message
        try:
            server = smtplib.SMTP('localhost')
            server.sendmail("", recipients, message)
            server.quit()
        except Exception, e:
            log.critical("__sendEmail() exception: %s" % str(e))