class Worker(threading.Thread):
    """Daemon thread object that checks a Qumulo cluster and sends alerts.

    Every check compares what the cluster reports now with what was last
    reported, and calls notify() only when that changes.  A fault therefore
    produces one alert when it appears and one "clear" message when it goes
    away, instead of one message per check.
    """

    def __init__(self, agent, mib, cfg):
        """Store collaborators and reset the notification latches.

        agent -- object whose sendTrap() is used to emit SNMP traps
        mib   -- MIB object; stored but not used by the methods of this class
        cfg   -- configuration object; reads clusters[0], snmp.enabled,
                 email.enabled and ipmi.enabled
        """
        threading.Thread.__init__(self)
        self._agent = agent
        self._mib = mib
        self._cfg = cfg
        self.daemon = True
        self.client = QumuloClient(cfg.clusters[0])  # only one cluster for now

        # Latches: True while the corresponding fault has been reported and
        # not yet cleared.
        self.notified_offline = False
        self.notified_dead_drives = False
        self.notified_power_supply_failure = False

        self.snmp_enabled = cfg.snmp.enabled
        self.email_enabled = cfg.email.enabled
        self.ipmi_enabled = cfg.ipmi.enabled

        if self.email_enabled:
            # SMTP credentials come from the environment, not the config.
            self.email_acct = os.getenv('SNMP_AGENT_EMAIL_ACCT')
            self.email_pwd = os.getenv('SNMP_AGENT_EMAIL_PWD')

    def check_nodes(self):
        """Refresh the cluster state and report offline / recovered nodes."""
        self.client.get_cluster_state()
        offline = self.client.offline_nodes

        if len(offline) > 0:
            if not self.notified_offline:
                msg = "There are currently %d nodes offline:" % len(offline)
                msg += "".join(
                    "\tNode %s is currently offline." % n["node_name"]
                    for n in offline)
                self.notify("Qumulo Nodes Offline", msg, "nodeDownTrap")
                self.notified_offline = True
        elif self.notified_offline:
            self.notified_offline = False
            self.notify("Qumulo Nodes Back Online", "All nodes back online",
                        "nodesClearTrap")

    def check_drives(self):
        """Refresh the drive states and report dead / recovered drives."""
        self.client.get_drive_states()
        dead = self.client.dead_drives

        if len(dead) > 0:
            if not self.notified_dead_drives:
                msg = "There are currently %d drives offline:" % len(dead)
                # %s formatting keeps working when the drive id is an int.
                msg += "".join(
                    "\t%s Drive%s is offline." % (d["disk_type"], d["id"])
                    for d in dead)
                self.notify("Qumulo Drives Offline", msg, "driveFailureTrap")
                self.notified_dead_drives = True
        elif self.notified_dead_drives:
            self.notified_dead_drives = False
            # The body used to say "All nodes back online" (copy-paste from
            # check_nodes); this message is about drives.
            self.notify("Qumulo Drives Back Online", "All drives back online",
                        "nodesClearTrap")

    def check_power(self, ipmi_server):
        """Query power state through IPMI and report failure / recovery.

        ipmi_server -- IPMI endpoint handed to client.get_power_state()

        The first element of the returned sequence is treated as the status
        text; it counts as a failure when it contains "Failure".
        """
        power_state = self.client.get_power_state(ipmi_server)
        status = power_state[0]

        if "Failure" in status:
            if not self.notified_power_supply_failure:
                self.notify("Qumulo Power Supply Failure", status,
                            "powerSupplyFailureTrap")
                self.notified_power_supply_failure = True
        elif self.notified_power_supply_failure:
            # we're back to normal
            self.notified_power_supply_failure = False
            self.notify("Qumulo Cluster Back Online",
                        "Qumulo Cluster power back to normal",
                        "nodesClearTrap")

    def notify(self, subject, message, snmp_trap_name=None):
        """Print *message* and forward it to every enabled channel.

        subject        -- e-mail subject line
        message        -- text used as trap payload and e-mail body
        snmp_trap_name -- trap name passed to the agent's sendTrap()
        """
        print(message)

        if self.snmp_enabled:
            print("Sending trap")
            self._agent.sendTrap(message, snmp_trap_name, ())

        if self.email_enabled:
            print("Sending email")
            self.send_email(subject, message)

    def check_cluster_status(self):
        """Run the IPMI, node and drive checks once.

        When the REST client has no credentials the cluster is reported as
        offline once; later calls retry the login instead.
        """
        # Check IPMI
        if self.ipmi_enabled:
            # check_power takes only the server; notify() decides the channels.
            self.check_power(self._cfg.clusters[0].ipmi.ipmi_server)

        if self.client.credentials is not None:
            self.check_nodes()
            self.check_drives()
        elif not self.notified_offline:
            # we're offline
            print("Error connecting to Qumulo Cluster REST Server")
            self.notify("Qumulo Cluster offline",
                        "Error connecting to Qumulo Cluster REST Server",
                        "nodeDownTrap")
            self.notified_offline = True
        else:
            # retry login
            self.client.login()

    def send_email(self, subject, body):
        """Send a plain-text e-mail using the addresses in cfg.email.

        Failures are printed rather than raised, so an unreachable mail
        server does not propagate out of the calling check.
        """
        try:
            # Create a text/plain message
            msg = MIMEMultipart()
            msg['From'] = self._cfg.email.address_from
            msg['To'] = self._cfg.email.address_to
            msg['Subject'] = subject
            msg.attach(MIMEText(body, 'plain'))

            server = smtplib.SMTP(self._cfg.email.server,
                                  self._cfg.email.tls_port)
            try:
                server.starttls()
                server.login(self.email_acct, self.email_pwd)
                server.sendmail(msg['From'], msg['To'], msg.as_string())
            finally:
                # Close the connection even when TLS, login or send fails.
                server.quit()
        except Exception as excpt:
            print("Failed to send email (Subject: %s) (%s)" % (subject, excpt))
class Worker(threading.Thread):
    """Daemon thread object that checks a Qumulo cluster and sends alerts.

    Node, drive and per-node power-supply states are compared with what was
    last reported, and self.notify() is called only when that changes.
    """

    def __init__(self, agent, mib, cfg):
        """Store collaborators and reset the notification latches.

        agent -- SNMP agent object (stored for use when sending traps)
        mib   -- MIB object; stored but not used by the methods shown here
        cfg   -- configuration object; reads clusters[0] (including its
                 ipmi.ipmi_servers and ipmi.enabled), snmp.enabled and
                 email.enabled
        """
        threading.Thread.__init__(self)
        self.logger = logging.getLogger('agent.Worker')
        self._agent = agent
        self._mib = mib
        self._cfg = cfg
        self.daemon = True
        self.client = QumuloClient(cfg.clusters[0])  # only one cluster for now

        # Latches: True while the corresponding fault has been reported and
        # not yet cleared.
        self.notified_offline = False
        self.notified_dead_drives = False
        # One dict per configured IPMI server tracks per-node PS notify states.
        self.notified_power_supply_failure = [
            {'PS1': False, 'PS2': False}
            for _ in cfg.clusters[0].ipmi.ipmi_servers
        ]

        self.snmp_enabled = cfg.snmp.enabled
        self.email_enabled = cfg.email.enabled
        self.ipmi_enabled = cfg.clusters[0].ipmi.enabled

        if self.email_enabled:
            # SMTP credentials come from the environment, not the config.
            self.email_acct = os.getenv('SNMP_AGENT_EMAIL_ACCT')
            self.email_pwd = os.getenv('SNMP_AGENT_EMAIL_PWD')

    def check_nodes(self):
        """Refresh the cluster state and report offline / recovered nodes."""
        self.client.get_cluster_state()
        offline = self.client.offline_nodes

        if len(offline) > 0:
            if not self.notified_offline:
                msg = "There are currently %d nodes offline:" % len(offline)
                msg += "".join(
                    "\tNode %s is currently offline." % n["node_name"]
                    for n in offline)
                self.notify("Qumulo Nodes Offline", msg, "nodeDownTrap")
                self.notified_offline = True
        elif self.notified_offline:
            self.notified_offline = False
            self.notify("Qumulo Nodes Back Online", "All nodes back online",
                        "nodesClearTrap")

    def check_drives(self):
        """Refresh the drive states and report dead / recovered drives."""
        self.client.get_drive_states()
        dead = self.client.dead_drives

        if len(dead) > 0:
            if not self.notified_dead_drives:
                msg = "There are currently %d drives offline:" % len(dead)
                # %s formatting keeps working when the drive id is an int.
                msg += "".join(
                    "\t%s Drive%s is offline." % (d["disk_type"], d["id"])
                    for d in dead)
                self.notify("Qumulo Drives Offline", msg, "driveFailureTrap")
                self.notified_dead_drives = True
        elif self.notified_dead_drives:
            self.notified_dead_drives = False
            # The body used to say "All nodes back online" (copy-paste from
            # check_nodes); this message is about drives.
            self.notify("Qumulo Drives Back Online", "All drives back online",
                        "nodesClearTrap")

    def _warn_ipmi(self, err):
        """Log the shared warning used when IPMI data cannot be processed."""
        self.logger.warning(
            "WARNING: IPMI Exception, please verify IPMI config. (%s)", err)

    def check_power(self, ipmi_server, node_id):
        """Report power-supply failures and recoveries for one node.

        ipmi_server -- IPMI endpoint handed to client.get_power_state()
        node_id     -- zero-based index into notified_power_supply_failure;
                       the node is named "<cluster name>-<node_id + 1>"

        The result of get_power_state() is indexed with 'FAIL' and 'GOOD' and
        each entry is iterated as supply names that key the per-node latch
        dict.  A TypeError while processing either list is logged as an IPMI
        configuration warning and that list is abandoned.
        NOTE(review): presumably get_power_state() returns None when IPMI is
        unreachable, which is what surfaces as the TypeError -- confirm.
        """
        power_states = self.client.get_power_state(ipmi_server)
        cluster_name = self._cfg.clusters[0].name
        node_name = cluster_name + '-' + str(node_id + 1)

        # notify on every failed PS we find and set notified state to True
        try:
            for supply in power_states['FAIL']:
                if self.notified_power_supply_failure[node_id][supply]:
                    continue
                message = supply + " in " + node_name + " failed"
                subject = "[ALERT] Qumulo Power Supply Failure " + node_name
                self.notify(
                    subject, message, "powerSupplyFailureTrap",
                    [(rfc1902.ObjectName('1.3.6.1.4.1.47017.8'),
                      rfc1902.OctetString(node_name)),
                     (rfc1902.ObjectName('1.3.6.1.4.1.47017.11'),
                      rfc1902.OctetString(supply))])
                self.notified_power_supply_failure[node_id][supply] = True
        except TypeError as err:
            self._warn_ipmi(err)

        # notify on every good PS we find and set those notified states to False
        try:
            for supply in power_states['GOOD']:
                if not self.notified_power_supply_failure[node_id][supply]:
                    continue
                message = supply + " in " + node_name + " power back to normal"
                self.notify("Qumulo Power Supply Normal", message,
                            "nodesClearTrap")
                self.notified_power_supply_failure[node_id][supply] = False
        except TypeError as err:
            self._warn_ipmi(err)