class HenStatusChecker(Daemon):
    """\brief Implements the StatusDaemon external interface
    This class contains the methods called when requests are received by the
    Daemon (inherited).
    """
    __version = "Hen Status Daemon v0.1"
    __checker_timer = None
    __checker_lock = None
    __stoppedDaemons = None
    __runningDaemons = None
    __checkerThreads = None
    __doneList = None
    __cli_commands_xml = None
    __cli_commands = None

    def __init__(self):
        """\brief Registers remote methods and starts update thread (timer)
        """
        Daemon.__init__(self)
        self.__registerMethods()
        self.__checker_lock = threading.Lock()
        self.__stoppedDaemons = []
        self.__runningDaemons = []
        self.__checkerThreads = {}
        self.__doneList = []
        self.__cli_commands = {}

    def getCLICommandXML(self, prot, seq, ln, payload):
        """\brief Returns the complete XML interpretation of the CLI commands
        available from all the running daemons, plus the standard CLI
        functions such as "exit" and "help".
        """
        if not self.__cli_commands_xml:
            # This should never happen
            prot.sendReply(500, seq, "No commands found by daemon!")
            return
        # Reply with the generated command XML (the error path above returns early).
        prot.sendReply(200, seq, self.__cli_commands_xml)

    def getHenStatus(self, prot, seq, ln, payload):
        log.debug("getHenStatus() called.")
        self.__checker_lock.acquire()
        results = "Content-type: text/xml\n"
        results += "Cache-Control: no-store, no-cache, must-revalidate\n\n"
        results += "<processmanagement>\n"
        results += "\t<running>\n"
        for daemon in self.__runningDaemons:
            results += "\t\t<process name=\"%s\" />\n" % str(daemon)
        results += "\t</running>\n"
        results += "\t<stopped>\n"
        for daemon in self.__stoppedDaemons:
            results += "\t\t<process name=\"%s\" />\n" % str(daemon)
        results += "\t</stopped>\n"
        results += "</processmanagement>\n"
        self.__checker_lock.release()
        prot.sendReply(200, seq, results)

    def killDaemon(self, prot, seq, ln, payload):
        prot.sendReply(200, seq, "Killing Daemon!")
        os.abort()

    def getVersion(self, prot, seq, ln, payload):
        """\brief Returns version"""
        payload = self.__version
        prot.sendReply(200, seq, payload)

    def stopDaemon(self, prot, seq, ln, payload):
        """\brief Stops the daemon and all threads
        This method will first stop any more incoming queries, then wait for
        any update tasks to complete, before stopping itself.
        """
        log.debug("stopDaemon called.")
        prot.sendReply(200, seq, "Accepted stop request.")
        log.debug("Stopping Checker Timer")
        self.__checker_timer.stop()
        self.acceptConnections(False)
        log.debug("Stopping Hen Status Daemon (self)")
        Daemon.stop(self)

    def startCheckerTimer(self):
        self.__checker_timer = GracefulTimer(CHECKER_INTERVAL, \
                                             self.checkHenStatus, True)
        self.__checker_timer.start()

    def stopCheckerTimer(self):
        self.__checker_timer.stop()

    def checkerTimerIsRunning(self):
        if self.__checker_timer:
            if self.__checker_timer.isAlive():
                return True
        return False

    def __registerMethods(self):
        log.debug("Registering method handlers...")
        self.registerMethodHandler("get_version", self.getVersion)
        #self.registerMethodHandler("stop_daemon", self.stopDaemon)
        #self.registerMethodHandler("kill_daemon", self.killDaemon)
        self.registerMethodHandler("get_henstatus", self.getHenStatus)
        self.registerMethodHandler("get_cli_command_xml", self.getCLICommandXML)

    def __createStatusThreads(self):
        for (daemon, method) in DaemonStatus().getAllDaemonStatusMethods():
            doneEvent = threading.Event()
            self.__checkerThreads[daemon] = \
                DaemonStatusChecker(method, doneEvent, STATUS_TIMEOUT)
            self.__checkerThreads[daemon].start()
            self.__doneList.append(doneEvent)

    def __waitForResults(self):
        while 1:
            done = True
            for doneEvent in self.__doneList:
                if not doneEvent.isSet():
                    done = False
            if done:
                break
            time.sleep(2)

    def __collectResults(self):
        for daemon in self.__checkerThreads.keys():
            if self.__checkerThreads[daemon].isOnline():
                self.__runningDaemons.append(daemon)
            else:
                self.__stoppedDaemons.append(daemon)

    def __generateCommandXML(self):
        self.__cli_commands_xml = "<testbedcommands>"
        # TODO: !!!
        for daemon in self.__runningDaemons:
            pass

    def checkHenStatus(self):
        log.debug("checkHenStatus() called.")
        self.__checker_lock.acquire()
        self.__stoppedDaemons = []
        self.__runningDaemons = []
        self.__checkerThreads = {}
        self.__doneList = []
        self.__createStatusThreads()
        self.__waitForResults()
        self.__collectResults()
        self.__generateCommandXML()
        self.__checker_lock.release()
class MonitorDB:
    HOSTONLINE = 1
    HOSTOFFLINE = 0

    __db_dir = None
    __status_lock = None
    __flush_lock = None
    __update_lock = None
    __writebuf_lock = None
    __open_file_lock = None
    __writes_since_flush = None
    __node_fds = None
    __node_locks = None
    __write_buf = None
    __flush_timer = None
    # history format: {nodeid:{sensorid:[(type, time, val, status),..],..},..}
    # newest data at the front of the list (inserted at index 0)
    __sensor_history = None
    __sensor_max = None
    __sensor_check_interval = None
    # host status dictionary. Format: {nodeid:status, ...}
    # Where status is: 0 = OK, 1 = OFFLINE
    __host_status = None

    def __init__(self):
        self.__writes_since_flush = 0
        self.__status_lock = threading.RLock()
        self.__flush_lock = threading.RLock()
        self.__update_lock = threading.RLock()
        self.__writebuf_lock = threading.RLock()
        self.__open_file_lock = threading.RLock()
        self.__node_fds = {}
        self.__node_locks = {}
        self.__write_buf = {}
        self.__sensor_history = {}
        self.__sensor_max = {}
        self.__host_status = {}

    def getStorageStats(self):
        stats = {}
        stats["__sensor_history length"] = len(self.__sensor_history)
        histCount = 0
        for sensorDict in self.__sensor_history.values():
            histCount += len(sensorDict.keys())
        stats["Total history entries"] = histCount
        stats["__sensor_max length"] = len(self.__sensor_max)
        stats["__write_buf length"] = len(self.__write_buf)
        stats["__host_status length"] = len(self.__host_status)
        stats["__node_locks length"] = len(self.__node_locks)
        stats["__node_fds length"] = len(self.__node_fds)
        stats["__writes_since_flush"] = self.__writes_since_flush
        return stats

    def startTimer(self):
        log.debug("MonitorDB: Starting flush timer")
        self.__flush_timer = GracefulTimer(AUTO_FLUSH_TIME, self.flushWrites)
        self.__flush_timer.start()

    def stopTimer(self):
        """\brief Stop the write flush timer.
        """
        log.debug("MonitorDB: stopping flush timer")
        self.__flush_timer.stop()
        self.flushWrites()

    def setSensorCheckInterval(self, interval):
        log.debug("Setting sensor check interval to %s seconds" % str(interval))
        self.__sensor_check_interval = interval

    def setDBDirectory(self, dbDir):
        if len(dbDir) <= 0:
            log.critical("setDBDirectory(): No directory provided, using \"/\"")
            dbDir = "/"
        if dbDir[len(dbDir) - 1] != "/":
            dbDir = dbDir + "/"
        log.debug("setDBDirectory(): Creating dir [" + dbDir + "]")
        if not os.path.exists(dbDir):
            os.mkdir(dbDir)
        log.info("MonitorDB: Using DB directory [" + dbDir + "]")
        self.__db_dir = dbDir

    def setHostStatus(self, nodeid, status):
        self.__status_lock.acquire()
        self.__host_status[nodeid] = status
        self.__status_lock.release()

    def getHostStatus(self, nodeid):
        if self.__host_status.has_key(nodeid):
            return self.__host_status[nodeid]
        else:
            return -1

    def getHostStatuses(self):
        return self.__host_status

    def getHostHistory(self, nodeid):
        if self.__sensor_history.has_key(nodeid):
            return str(self.__sensor_history[nodeid])

    def getAllCurrentSensorReadings(self):
        """\brief Returns the current/max values of sensors for all nodes.
        The results are in the form:
        {nodeid:{sensorid:(type,time,val,maxval,status),..},..}
        """
        results = {}
        for nodeid in self.__sensor_history.keys():
            results[nodeid] = {}
            for sensorid in self.__sensor_history[nodeid].keys():
                if len(self.__sensor_history[nodeid][sensorid]) < 1:
                    # No readings yet for this sensor; skip it rather than
                    # abandoning the whole scan.
                    continue
                (htype, htime, hval, hstat) = \
                    self.__sensor_history[nodeid][sensorid][0]
                if (self.__sensor_max[nodeid]).has_key(sensorid):
                    results[nodeid][sensorid] = (htype, htime, hval, \
                        (self.__sensor_max[nodeid])[sensorid], hstat)
                else:
                    results[nodeid][sensorid] = \
                        (htype, htime, hval, -1, hstat)
        return results

    def getLastNumSensorReadings(self, nodeid, sensorid, n_readings):
        """\brief Returns the last 'n_readings' readings for the given sensor
        of the given node
        The results are sorted, most recent first, in the form:
        [(time,value), (time,value), ...]
        """
        results = []
        # Check for node history.
        if self.__sensor_history.has_key(nodeid):
            if self.__sensor_history[nodeid].has_key(sensorid):
                readings = (self.__sensor_history[nodeid])[sensorid]
                # Is the history in memory sufficient for the query ?
                # (Slice end indices are exclusive, so [0:n_readings] yields
                # exactly n_readings items.)
                if len(readings) >= n_readings:
                    return self.__historyToTimeValPairs( \
                        readings[0:n_readings])
        # If not, resort to files
        log.debug("getLastNumSensorReadings(): resorting to file lookup")
        files = self.__getOrderedFileNames(self.__db_dir + nodeid)
        for file in files:
            file_readings = self.__readFromFile(nodeid, file, sensorid)
            if file_readings:
                results += file_readings
            if len(results) >= n_readings:
                return results[0:n_readings]
        # if we're here, we didn't get enough readings! return what we've got.
        return results

    def getSensorReadingsSinceTime(self, nodeid, sensorid, sinceTime):
        """\brief Returns all the sensor readings since the given time.
        The results are sorted, most recent first, in the form:
        [(time,value), (time,value), ...]
        """
        results = []
        # Check for node history.
        if self.__sensor_history.has_key(nodeid):
            if self.__sensor_history[nodeid].has_key(sensorid):
                readings = (self.__sensor_history[nodeid])[sensorid]
                # Is the history in memory sufficient for the query ?
                if len(readings) > 0:
                    (type, time, val, status) = readings[0]
                    if time < sinceTime:
                        return self.__filterTimePairs( \
                            self.__historyToTimeValPairs(readings), sinceTime)
        # If we're here, we need to read from file.
        log.debug("getSensorReadingsSinceTime(): resorting to file lookup")
        files = self.__getOrderedFileNames(self.__db_dir + nodeid)
        for file in files:
            file_readings = self.__readFromFile(nodeid, file, sensorid)
            if file_readings:
                results += file_readings
                results.sort()
                results.reverse()
                (time, val) = file_readings[len(file_readings) - 1]
                if time < sinceTime:
                    return self.__filterTimePairs(results, sinceTime)
        # if we're here, we didn't get enough readings! return what we've got.
        results.sort()
        results.reverse()
        return self.__filterTimePairs(results, sinceTime)

    def __historyToTimeValPairs(self, readings):
        """\brief Converts a list in history format (type, time, val, status)
        to a list in (time,val) format.
        """
        def map_function((type, time, val, status)):
            # Nested helper must take a single reading tuple (no 'self'):
            # map() passes one argument per list element.
            return (time, val)
        return map(map_function, readings)
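
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of MonitorDB): the in-memory history used
# above is a nested dict, {nodeid: {sensorid: [(type, time, val, status),..]}},
# with the newest reading inserted at index 0. The toy data below (made-up
# node, sensor and values) shows how a "last N readings" query reduces to a
# list slice followed by the (time, value) projection that
# __historyToTimeValPairs() performs.
# ---------------------------------------------------------------------------
def _example_history_slice():
    history = {
        "computer1": {
            "cpu.temp": [
                ("temperature", 1168342912, 33.0, 0),   # newest first
                ("temperature", 1168342852, 32.5, 0),
                ("temperature", 1168342792, 31.9, 0),
            ],
        },
    }
    n_readings = 2
    readings = history["computer1"]["cpu.temp"]
    # Slice end indices are exclusive, so [0:n_readings] yields n_readings items.
    latest = readings[0:n_readings]
    # -> [(1168342912, 33.0), (1168342852, 32.5)]
    return [(t, v) for (rtype, t, v, status) in latest]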
class SwitchDB:
    HOSTONLINE = 1
    HOSTOFFLINE = 0

    __hm = None
    __db_dir = None
    __status_lock = None
    __flush_lock = None
    __update_lock = None
    __writebuf_lock = None
    __open_file_lock = None
    __writes_since_flush = None
    __node_fds = None
    __node_locks = None
    __write_buf = None
    __flush_timer = None
    # history format: {nodeid:{sensorid:[(type, time, val, status),..],..},..}
    __sensor_history = None
    __sensor_max = None
    # fdb format: {mac:(name,{(nodeid,port):[trunk,time]}),..}
    __fdb = None
    # links format :
    __links = None
    __links_map_lock = None
    __links_map = None
    #
    __graph = None
    # host status dictionary. Format: {nodeid:status, ...}
    # Where status is: 0 = OK, 1 = OFFLINE
    __host_status = None

    def __init__(self):
        self.__writes_since_flush = 0
        self.__status_lock = threading.RLock()
        self.__flush_lock = threading.RLock()
        self.__update_lock = threading.RLock()
        self.__writebuf_lock = threading.RLock()
        self.__open_file_lock = threading.RLock()
        self.__links_map_lock = threading.RLock()
        self.__node_fds = {}
        self.__node_locks = {}
        self.__write_buf = {}
        self.__sensor_history = {}
        self.__sensor_max = {}
        self.__fdb = {}
        self.__links = {}
        self.__graph = None
        self.__host_status = {}

    def setHenManager(self, hm):
        self.__hm = hm

    def getStorageStats(self):
        stats = {}
        stats["__sensor_history length"] = len(self.__sensor_history)
        histCount = 0
        for sensorDict in self.__sensor_history.values():
            histCount += len(sensorDict.keys())
        stats["Total history entries"] = histCount
        stats["__sensor_max length"] = len(self.__sensor_max)
        stats["__write_buf length"] = len(self.__write_buf)
        stats["__host_status length"] = len(self.__host_status)
        stats["__node_locks length"] = len(self.__node_locks)
        stats["__node_fds length"] = len(self.__node_fds)
        stats["__writes_since_flush"] = self.__writes_since_flush
        return stats

    def startTimer(self):
        log.debug("SwitchDB: Starting flush timer")
        self.__flush_timer = GracefulTimer(AUTO_FLUSH_TIME, self.flushWrites)
        self.__flush_timer.start()

    def stopTimer(self):
        """\brief Stop the write flush timer.
        """
        log.debug("SwitchDB: stopping flush timer")
        self.__flush_timer.stop()
        self.flushWrites()

    def locateMac(self, m, unique=False):
        mac = m.upper()
        if self.__fdb.has_key(mac):
            res = []
            for i in self.__fdb[mac][1]:
                if unique:
                    if self.__fdb[mac][1][i][0] == False:
                        res.append(i)
                else:
                    res.append(i)
            if unique:
                if (len(res) == 1):
                    return res
                else:
                    return None
            else:
                return res
        return None

    def setDBDirectory(self, dbDir):
        if len(dbDir) <= 0:
            log.critical("setDBDirectory(): No directory provided, using \"/\"")
            dbDir = "/"
        if dbDir[len(dbDir) - 1] != "/":
            dbDir = dbDir + "/"
        log.debug("setDBDirectory(): Creating dir [" + dbDir + "]")
        if not os.path.exists(dbDir):
            os.mkdir(dbDir)
        log.info("SwitchDB: Using DB directory [" + dbDir + "]")
        self.__db_dir = dbDir

    def setHostStatus(self, nodeid, status):
        self.__status_lock.acquire()
        self.__host_status[nodeid] = status
        self.__status_lock.release()

    def getHostStatus(self, nodeid):
        if self.__host_status.has_key(nodeid):
            return self.__host_status[nodeid]
        else:
            return -1

    def getHostStatuses(self):
        return self.__host_status

    def getHostHistory(self, nodeid):
        if self.__sensor_history.has_key(nodeid):
            return str(self.__sensor_history[nodeid])

    def getLinks(self):
        return self.__links

    def fromMacGetId(self, mac):
        if self.__hm == None:
            log.critical("HM not set in switchdb")
            return None
        try:
            nodes_dict = self.__hm.getNodes("all", "all")
        except:
            log.critical("Unable to get nodes from hm")
            return None
        for nodetype in nodes_dict:
            for node in nodes_dict[nodetype].values():
                interface_dict = node.getInterfaces()
                for interface_type in interface_dict:
                    if interface_dict[interface_type] != None:
                        for interface in interface_dict[interface_type]:
                            if (str(interface.getMAC().upper().strip()) == \
                                    str(mac.upper().strip())):
                                return (node.getNodeID(), \
                                        interface.getInterfaceID())
        return None  # ("unknown","unknown")

    def isExternal(self, switch, port):
        # fix this
        #return False
        if self.__links == None:
            return False
        for link in self.__links.values():
            if link[0] != "external":
                continue
            try:
                for member in link[2]:
                    #if switch == "switch10":
                    #    log.debug("SWITCH10 "+str((member,switch,port)))
                    if ((str(switch) == str(member[0])) and \
                            (str(port) == str(member[1]))):
                        return True
                return False
            except Exception, e:
                print "error in isExternal ", e
                traceback.print_exc(file=sys.stdout)
        return False
class EmergencyChecker(Daemon):
    """\brief Implements the EmergencyD external interface
    This class contains the methods called when requests are received by the
    Daemon (inherited).
    """
    __version = "Emergency Daemon v0.1"
    __checker_timer = None
    __checker_lock = None
    __config = None
    __config_path = None
    __warning_email_addresses = None
    __critical_email_addresses = None
    __run_level = None

    def __init__(self, config=CONFIG_FILE):
        """\brief Registers remote methods and starts update thread (timer)
        """
        Daemon.__init__(self)
        self.__config_path = config
        self.__parseConfig(self.__config_path)
        self.__registerMethods()
        self.__checker_lock = threading.Lock()

    def testMethod(self, prot, seq, ln, payload):
        log.debug("testMethod() called.")
        sdata = (2, "cpu.w00t", "temperature", 1168342852, 32.5, 32.6)
        self.__handleCriticalStatus("computer32", sdata)
        prot.sendReply(200, seq, "Test method called.")

    def manualRunChecker(self, prot, seq, ln, payload):
        log.debug("manualRunChecker() called.")
        prot.sendReply(200, seq, "Running runChecker.")
        self.runChecker()

    def manualStopUpdateTimer(self, prot, seq, ln, payload):
        log.debug("manualStopUpdateTimer() called.")
        if not self.updateTimerIsRunning():
            prot.sendReply(400, seq, "The update timer isn't running!")
        else:
            self.stopUpdateTimer()
            prot.sendReply(200, seq, "Update Timer stopped.")

    def manualStartUpdateTimer(self, prot, seq, ln, payload):
        log.debug("manualStartUpdateTimer() called.")
        if self.updateTimerIsRunning():
            prot.sendReply(400, seq, "The update timer is already running!")
        else:
            self.startUpdateTimer()
            prot.sendReply(200, seq, "Update Timer started.")

    def setRunLevel(self, prot, seq, ln, payload):
        log.debug("setRunLevel() called.")
        new_run_level = int(payload)
        if not self.__validRunLevel(new_run_level):
            log.debug("Invalid run_level (%d) given" % new_run_level)
            prot.sendReply(400, seq, "Invalid run_level given.")
            return
        if new_run_level < self.__run_level:
            payload = "Lowering run_level from %d to %d." % \
                      (self.__run_level, new_run_level)
        elif new_run_level > self.__run_level:
            payload = "Raising run_level from %d to %d." % \
                      (self.__run_level, new_run_level)
        else:
            payload = "No change in run_level."
        log.info(payload)
        prot.sendReply(200, seq, payload)
        self.__run_level = new_run_level

    def getRunLevel(self, prot, seq, ln, payload):
        log.debug("getRunLevel() called.")
        prot.sendReply(200, seq, str(self.__run_level))

    def reloadConfig(self, prot, seq, ln, payload):
        log.debug("reloadConfig() called.")
        # Re-read the config file given at construction time.
        self.__parseConfig(self.__config_path)
        prot.sendReply(200, seq, "Reload of config file completed.")

    def killDaemon(self, prot, seq, ln, payload):
        prot.sendReply(200, seq, "Killing Daemon!")
        os.abort()

    def getVersion(self, prot, seq, ln, payload):
        """\brief Returns version"""
        payload = self.__version
        prot.sendReply(200, seq, payload)

    def stopDaemon(self, prot, seq, ln, payload):
        """\brief Stops the daemon and all threads
        This method will first stop any more incoming queries, then wait for
        any update tasks to complete, before stopping itself.
        """
        log.debug("stopDaemon called.")
        prot.sendReply(200, seq, "Accepted stop request.")
        log.debug("Stopping Checker Timer")
        self.__checker_timer.stop()
        self.acceptConnections(False)
        log.debug("Stopping Emergency Daemon (self)")
        Daemon.stop(self)

    def startUpdateTimer(self):
        self.__checker_timer = GracefulTimer(CHECKER_INTERVAL, \
                                             self.runChecker, True)
        self.__checker_timer.start()

    def stopUpdateTimer(self):
        self.__checker_timer.stop()

    def updateTimerIsRunning(self):
        if self.__checker_timer:
            if self.__checker_timer.isAlive():
                return True
        return False

    def runChecker(self):
        log.debug("runChecker() called.")
        log.debug("Acquiring checker lock.")
        self.__checker_lock.acquire()
        p = Protocol(None)
        if DaemonStatus().monitorDaemonIsOnline(5):
            p.open(MONITORD_HOST, MONITORD_PORT)
            p.sendRequest("get_currentsensorreadings", "",
                          self.nodeStatusHandler)
            p.readAndProcess()
        else:
            log.info("Monitor Daemon is not online!")
            # TODO: Email
        self.__checker_lock.release()
        log.debug("Released checker lock.")

    def nodeStatusHandler(self, code, seq, size, payload):
        if (code != 200) or (len(payload) == 0):
            # TODO: Warn someone that monitord isn't working properly
            log.critical("Incorrect payload received from monitor daemon!")
            return
        sensor_dom = xml.dom.minidom.parseString(payload)
        node_readings = sensor_dom.getElementsByTagName("nodereading")
        for nodereading in node_readings:
            self.checkNodeReadings(nodereading)
        sensor_dom.unlink()

    def checkNodeReadings(self, nodereading):
        nodeid = nodereading.attributes["id"].value
        # Attribute values are strings, so convert before comparing.
        overallstatus = nodereading.attributes["overallstatus"].value
        if int(overallstatus) == 0:
            return
        readings = nodereading.getElementsByTagName("reading")
        for reading in readings:
            # (status, sensorid, sensortype, timeinsecs, sensorvalue,
            #  sensormaxvalue)
            sdata = self.__parseXMLReading(reading)
            if sdata[0] == 0:
                continue
            elif sdata[0] == 1:
                log.critical(("ALERT: [%s][%s] has WARNING status with " + \
                              "curval=[%f], highval=[%f]") % \
                             (nodeid, sdata[1], sdata[4], sdata[5]))
                self.__handleWarningStatus(nodeid, sdata)
            elif sdata[0] >= 2:
                log.critical(("ALERT: [%s][%s] has CRITICAL status with " + \
                              "curval=[%f], highval=[%f]") % \
                             (nodeid, sdata[1], sdata[4], sdata[5]))
                self.__handleCriticalStatus(nodeid, sdata)
            else:
                log.critical("WARNING: [%s][%s] has UNKNOWN status %d!" \
                             % (nodeid, sdata[1], sdata[0]))
                self.__handleUnknownStatus(nodeid, sdata)

    def __makeEmailNodeMessage(self, status, nodeid, sdata, action):
        return (EMAIL_NODE_DETAILS % (status, nodeid, sdata[1], sdata[2],
                status, datetime.datetime.fromtimestamp(float(sdata[3])).\
                strftime("%Y-%m-%d-%H:%M:%S"), sdata[4], sdata[5], \
                self.__run_level, action))

    def __handleWarningStatus(self, nodeid, sdata):
        """\brief Handles a sensor warning status, by sending out a warning
        email to the warning_email_addresses recipients.
        """
        if self.__run_level == 0:
            action = "No action - currently running in dry-run mode."
            message = self.__makeEmailNodeMessage("WARNING", nodeid, sdata,
                                                  action)
            log.warning(message)
        elif self.__run_level >= 1:
            action = "Email warning, but no direct action."
            message = self.__makeEmailNodeMessage("WARNING", nodeid, sdata,
                                                  action)
            log.warning(message)
            self.__sendEmail(self.__warning_email_addresses,
                "EMERGENCYD: Node Sensor [%s][%s] in WARNING state!" \
                % (nodeid, sdata[1]), message)

    def __handleCriticalStatus(self, nodeid, sdata):
        """\brief Handles a critical warning status, by attempting to shut
        down the node, and then send out an email to the
        critical_email_addresses recipients, with the results and details.
        """
        if self.__run_level == 0:
            action = "No action - currently running in dry-run mode."
            message = self.__makeEmailNodeMessage("CRITICAL", nodeid, sdata,
                                                  action)
            log.critical(message)
        elif self.__run_level == 1:
            action = "Email warning, but no direct action."
            message = self.__makeEmailNodeMessage("CRITICAL", nodeid, sdata,
                                                  action)
            log.critical(message)
            self.__sendEmail(self.__critical_email_addresses,
                "EMERGENCYD: Node Sensor [%s][%s] in CRITICAL state!" \
                % (nodeid, sdata[1]), message)
        else:
            action = "Email warning, and attempted powerdown of node."
            message = self.__makeEmailNodeMessage("CRITICAL", nodeid, sdata,
                                                  action)
            powerdown_status = self.__attemptPowerDown(nodeid)
            message += "Output from powerdown attempt:\n" + powerdown_status
            log.critical(message)
            self.__sendEmail(self.__critical_email_addresses,
                "EMERGENCYD: Node Sensor [%s][%s] in CRITICAL state!" \
                % (nodeid, sdata[1]), message)

    def __handleUnknownStatus(self, nodeid, sdata):
        """\brief Handles an unknown sensor status, by sending out an email to
        the warning_email_addresses recipients with details.
        """
        unknown_state_message = "A node in an UNKNOWN state indicates a " \
            + "system error. Please notify the author immediately."
        if self.__run_level == 0:
            action = "No action - currently running in dry-run mode."
            message = self.__makeEmailNodeMessage("UNKNOWN", nodeid, sdata,
                                                  action)
            message += unknown_state_message
            log.critical(message)
        elif self.__run_level >= 1:
            action = "Email warning, but no direct action."
            message = self.__makeEmailNodeMessage("UNKNOWN", nodeid, sdata,
                                                  action)
            message += unknown_state_message
            log.critical(message)
            self.__sendEmail(self.__critical_email_addresses,
                "EMERGENCYD: Node Sensor [%s][%s] in UNKNOWN state!" \
                % (nodeid, sdata[1]), message)

    def __attemptPowerDown(self, nodeid):
        """\brief Attempts to power down the node specified by nodeid, and
        returns a string of the results
        \p nodeid - id of node that needs powering down
        """
        (status, output) = commands.getstatusoutput( \
            "%s %s" % (POWERDOWN_COMMAND, nodeid))
        return output

    def __sendEmail(self, recipients, subject, message):
        """\brief Sends an email with the provided message to the provided
        list of recipients
        \p recipients - list of recipients.
        \p subject - message subject
        \p message - string message to be sent.
        """
        if len(recipients) == 0:
            log.critical("__sendEmail(): Error: No recipients given.")
            return
        message = ("To: %s\r\nSubject: %s\r\n\r\n" %
                   (", ".join(recipients), subject)) + message
        try:
            server = smtplib.SMTP('localhost')
            server.sendmail("", recipients, message)
            server.quit()
        except Exception, e:
            log.critical("__sendEmail() exception: %s" % str(e))
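
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of EmergencyChecker): the status handlers
# above all branch on __run_level in the same way, so the escalation policy
# can be summarised as a small lookup. The function below is a hypothetical
# condensation of that policy, not the daemon's actual dispatch code.
# ---------------------------------------------------------------------------
def _example_run_level_policy(run_level, status):
    # status is the first field of an sdata tuple:
    # 0 = OK, 1 = WARNING, >= 2 = CRITICAL, anything else = UNKNOWN.
    if status == 0:
        return "no action"
    if run_level == 0:
        return "dry run: log only"
    if status == 1:
        return "log and email warning recipients"
    if status >= 2 and run_level >= 2:
        return "log, email critical recipients and attempt node powerdown"
    # CRITICAL at run_level 1, or UNKNOWN at any run_level >= 1
    return "log and email critical recipients, no direct action"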