async def check(self):
    if not await self.enabled():
        return

    alerts = []
    for pool in await self.middleware.call("pool.query"):
        if not pool["is_decrypted"]:
            continue

        if not pool["healthy"]:
            if await self.middleware.call("system.is_enterprise"):
                try:
                    await self.middleware.call("enclosure.sync_zpool", pool["name"])
                except Exception:
                    pass

            bad_vdevs = []
            if pool["topology"]:
                for vdev in await self.middleware.call("pool.flatten_topology", pool["topology"]):
                    if vdev["type"] == "DISK" and vdev["status"] != "ONLINE":
                        name = vdev["guid"]
                        if vdev.get("unavail_disk"):
                            name = f'{vdev["unavail_disk"]["model"]} {vdev["unavail_disk"]["serial"]}'
                        bad_vdevs.append(f"Disk {name} is {vdev['status']}")

            if bad_vdevs:
                devices = (f"<br>The following devices are not healthy:"
                           f"<ul><li>{'</li><li>'.join(bad_vdevs)}</li></ul>")
            else:
                devices = ""

            alerts.append(Alert(
                VolumeStatusAlertClass,
                {
                    "volume": pool["name"],
                    "state": pool["status"],
                    "status": pool["status_detail"],
                    "devices": devices,
                }
            ))

    return alerts
async def initialize(self, load=True):
    is_freenas = await self.middleware.call("system.is_freenas")

    self.node = "A"
    if not is_freenas:
        if await self.middleware.call("failover.node") == "B":
            self.node = "B"

    self.alerts = []
    if load:
        for alert in await self.middleware.call("datastore.query", "system.alert"):
            del alert["id"]

            try:
                alert["klass"] = AlertClass.class_by_name[alert["klass"]]
            except KeyError:
                self.logger.info("Alert class %r is no longer present", alert["klass"])
                continue

            alert["_uuid"] = alert.pop("uuid")
            alert["_source"] = alert.pop("source")
            alert["_key"] = alert.pop("key")
            alert["_text"] = alert.pop("text")

            alert = Alert(**alert)

            if not any(a.uuid == alert.uuid for a in self.alerts):
                self.alerts.append(alert)

    self.alert_source_last_run = defaultdict(lambda: datetime.min)

    self.policies = {
        "IMMEDIATELY": AlertPolicy(),
        "HOURLY": AlertPolicy(lambda d: (d.date(), d.hour)),
        "DAILY": AlertPolicy(lambda d: (d.date())),
        "NEVER": AlertPolicy(lambda d: None),
    }
    for policy in self.policies.values():
        policy.receive_alerts(datetime.utcnow(), self.alerts)
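# A minimal standalone sketch of the key-function idea behind the policies
# table above (hypothetical code, not middlewared's actual AlertPolicy):
# each policy buckets datetimes with a key function, and notifications are
# re-sent only when the bucket changes, so e.g. HOURLY keys on (date, hour)
# and fires at most once per alert per hour.
from datetime import datetime

def hourly_key(d: datetime):
    return (d.date(), d.hour)

a = datetime(2023, 5, 1, 10, 5)
b = datetime(2023, 5, 1, 10, 55)
c = datetime(2023, 5, 1, 11, 0)
assert hourly_key(a) == hourly_key(b)   # same bucket: no re-notification
assert hourly_key(b) != hourly_key(c)   # new hour: the policy may fire again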
async def check(self):
    corefiles = []
    for coredump in filter(lambda c: c["corefile"] == "present",
                           await self.middleware.call("system.coredumps")):
        if coredump["unit"] in self.ignore:
            # Unit "syslog-ng.service" has been core dumping for, literally, years
            # on FreeBSD and now also on Linux. The fix is non-trivial and seems
            # to be very specific to how we implemented our system dataset. Anyway,
            # the crash isn't harmful, so we ignore it.
            # Unit "containerd.service" is related to k3s. Users are free to run
            # whatever they would like to in containers, and we don't officially
            # support all the apps themselves, so we ignore those core dumps.
            continue

        corefiles.append(f"{coredump['exe']} ({coredump['time']})")

    if corefiles:
        return Alert(CoreFilesArePresentAlertClass, {"corefiles": ', '.join(corefiles)})
def check_sync(self):
    rrd_size_alert_threshold = 1610611911  # bytes

    try:
        used = shutil.disk_usage('/var/db/collectd/rrd').used
    except FileNotFoundError:
        raise UnavailableException()

    if used > rrd_size_alert_threshold:
        # `zfs list` reports in kibi/mebi/gibi(bytes) but format_size()
        # calculates in kilo/mega/giga by default, so the report that we send
        # the user needs to match up with what `zfs list` reports as to not
        # confuse anyone
        used = format_size(used, binary=True)
        threshold = format_size(rrd_size_alert_threshold, binary=True)
        return Alert(ReportingDbAlertClass, {
            'used': used,
            'threshold': threshold,
        }, key=None)
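# A quick illustration of why binary=True matters above, assuming format_size
# comes from the humanfriendly package (which TrueNAS middleware has used);
# the two renderings of the same byte count differ, and the binary one is the
# one that matches `zfs list`. Outputs shown are approximate:
from humanfriendly import format_size

n = 1610611911
print(format_size(n))               # decimal units: ~"1.61 GB"
print(format_size(n, binary=True))  # binary units: ~"1.5 GiB"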
def check_sync(self):
    try:
        self.middleware.call_sync("datastore.query", "system.update", None, {"get": True})
    except IndexError:
        self.middleware.call_sync("datastore.insert", "system.update", {
            "upd_autocheck": True,
            "upd_train": "",
        })

    path = self.middleware.call_sync("update.get_update_location")
    if not path:
        return

    try:
        updates = PendingUpdates(path)
    except Exception:
        updates = None

    if updates:
        return Alert(HasUpdateAlertClass)
def check_sync(self):
    try:
        with LockFile(VMWARESNAPDELETE_FAILS):
            with open(VMWARESNAPDELETE_FAILS, "rb") as f:
                fails = pickle.load(f)
    except Exception:
        return

    alerts = []
    for snapname, vms in list(fails.items()):
        for vm in vms:
            alerts.append(
                Alert(
                    VMWareSnapshotDeleteFailedAlertClass,
                    {
                        "snapshot": snapname,
                        "vm": vm,
                        "hostname": "<hostname>",
                        "error": "Error",
                    },
                )
            )

    return alerts
async def check(self):
    interfaces = await self.middleware.call("datastore.query", "network.interfaces")

    alerts = []
    node = await self.middleware.call("failover.node")
    for interface in interfaces:
        if interface["int_critical"]:
            missing_ip_fields = []

            if not interface["int_ipv4address"] and not interface["int_dhcp"]:
                if node == 'A':
                    missing_ip_fields.append('IPv4 Address (This Storage Controller)')
                else:
                    missing_ip_fields.append('IPv4 Address (Storage Controller 1)')

            if not interface["int_ipv4address_b"] and not interface["int_dhcp"]:
                if node == 'B':
                    missing_ip_fields.append('IPv4 Address (This Storage Controller)')
                else:
                    missing_ip_fields.append('IPv4 Address (Storage Controller 2)')

            if not interface["int_vip"]:
                missing_ip_fields.append('Virtual IP Address')

            if missing_ip_fields:
                alerts.append(
                    Alert(
                        FailoverIpAlertClass,
                        {
                            "interface": interface["int_name"],
                            "addresses": " ".join(missing_ip_fields),
                        },
                    )
                )

    return alerts
async def _produce_alerts_for_ipmitool_output(self, output):
    alerts = []

    records = parse_ipmitool_output(output)
    if records:
        if await self.middleware.call("keyvalue.has_key", self.dismissed_datetime_kv_key):
            dismissed_datetime = (
                await self.middleware.call("keyvalue.get", self.dismissed_datetime_kv_key)
            ).replace(tzinfo=None)
        else:
            # Prevent notifying about existing alerts on first install/upgrade
            dismissed_datetime = max(record.datetime for record in records)
            await self.middleware.call("keyvalue.set", self.dismissed_datetime_kv_key,
                                       dismissed_datetime)

        for record in records:
            if record.datetime <= dismissed_datetime:
                continue

            title = "%(sensor)s %(direction)s %(event)s"
            if record.verbose is not None:
                title += ": %(verbose)s"

            args = dict(record._asdict())
            args.pop("id")
            args.pop("datetime")

            alerts.append(
                Alert(
                    title=title,
                    args=args,
                    key=[title, args, record.datetime.isoformat()],
                    datetime=record.datetime,
                )
            )

    return alerts
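# A minimal standalone sketch of the "dismissed watermark" pattern used above
# (names here are hypothetical, not middlewared APIs): on the first run the
# newest event timestamp is stored instead of alerting, and later runs only
# surface events strictly newer than the stored watermark. In the real code
# the watermark is advanced elsewhere, when the user dismisses the alerts.
from datetime import datetime

store = {}  # stands in for the keyvalue service

def new_events(events, key="watermark"):
    if key not in store:
        store[key] = max(events)  # first run: swallow pre-existing history
        return []
    return [e for e in events if e > store[key]]

assert new_events([datetime(2023, 1, 1), datetime(2023, 1, 2)]) == []
assert new_events([datetime(2023, 1, 2), datetime(2023, 1, 3)]) == [datetime(2023, 1, 3)]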
def check_sync(self):
    try:
        used = shutil.disk_usage('/var/db/collectd/rrd').used
    except FileNotFoundError:
        raise UnavailableException()

    threshold = 1073741824 + len(self.middleware.call_sync('disk.query')) * 1024 * 1024
    if used > threshold:
        # `zfs list` reports in kibi/mebi/gibi(bytes) but format_size()
        # calculates in kilo/mega/giga by default, so the report that we send
        # the user needs to match up with what `zfs list` reports as to not
        # confuse anyone
        used = format_size(used, binary=True)
        threshold = format_size(threshold, binary=True)
        return Alert(ReportingDbAlertClass, {
            'used': used,
            'threshold': threshold,
        }, key=None)
async def __run_source(self, source_name):
    alert_source = ALERT_SOURCES[source_name]

    try:
        alerts = (await alert_source.check()) or []
    except Exception:
        alerts = [
            Alert(
                title="Unable to run alert source %(source_name)r\n%(traceback)s",
                args={
                    "source_name": alert_source.name,
                    "traceback": traceback.format_exc(),
                },
                key="__unhandled_exception__",
                level=AlertLevel.CRITICAL,
            )
        ]
    else:
        if not isinstance(alerts, list):
            alerts = [alerts]

    return alerts
async def check(self):
    alerts = []
    for replication in await self.middleware.call("replication.query", [["enabled", "=", True]]):
        if replication["state"]["state"] == "ERROR":
            alerts.append(
                Alert(
                    "Replication %(replication)s failed: %(message)s",
                    {
                        "replication": "%s -> %s:%s" % (
                            ", ".join(replication["source_datasets"]),
                            (replication["ssh_credentials"] or {}).get("name", "localhost"),
                            replication["target_dataset"],
                        ),
                        "message": replication["state"]["error"],
                    },
                )
            )

    return alerts
async def check(self):
    alerts = []
    for cert_id, service, type_c, datastore in (
        ((await self.middleware.call('ftp.config'))['ssltls_certificate'],
         'FTP', 'certificate', 'certificate'),
        ((await self.middleware.call('s3.config'))['certificate'],
         'S3', 'certificate', 'certificate'),
        ((await self.middleware.call('webdav.config'))['certssl'],
         'Webdav', 'certificate', 'certificate'),
        ((await self.middleware.call('openvpn.server.config'))['server_certificate'],
         'OpenVPN server', 'certificate', 'certificate'),
        ((await self.middleware.call('openvpn.client.config'))['client_certificate'],
         'OpenVPN client', 'certificate', 'certificate'),
        ((await self.middleware.call('system.general.config'))['ui_certificate']['id'],
         'Web UI', 'certificate', 'certificate'),
        ((await self.middleware.call('system.advanced.config'))['syslog_tls_certificate'],
         'Syslog', 'certificate', 'certificate'),
        ((await self.middleware.call('openvpn.server.config'))['root_ca'],
         'OpenVPN server', 'root certificate authority', 'certificateauthority'),
        ((await self.middleware.call('openvpn.client.config'))['root_ca'],
         'OpenVPN client', 'root certificate authority', 'certificateauthority'),
    ):
        if cert_id and (await self.middleware.call(
            f'{datastore}.query', [['id', '=', cert_id]], {'get': True}
        ))['revoked']:
            alerts.append(
                Alert(CertificateRevokedAlertClass, {
                    'service': service,
                    'type': type_c,
                })
            )

    return alerts
def check_sync(self):
    if self.middleware.call_sync("datastore.query", "services.services",
                                 [("srv_service", "=", "smartd"), ("srv_enable", "=", True)]):
        # sysctl kern.vm_guest will return a hypervisor name, or the string "none"
        # if FreeNAS is running on bare iron.
        p0 = subprocess.Popen(["/sbin/sysctl", "-n", "kern.vm_guest"],
                              stdin=subprocess.PIPE, stdout=subprocess.PIPE, encoding="utf8")
        status = p0.communicate()[0].strip()
        # This really shouldn't be confused with Python's None
        if status != "none":
            # We got something other than "none", maybe "vmware", "xen", "vbox".
            # Regardless, smartd not running in these environments isn't a huge
            # deal, so we'll skip alerting.
            return

        if not self.middleware.call_sync("system.is_freenas"):
            if self.middleware.call_sync("failover.status") != "MASTER":
                return

        if not self.middleware.call_sync("service.started", "smartd"):
            return Alert("smartd is not running")
async def __run_source(self, source_name):
    alert_source = ALERT_SOURCES[source_name]

    try:
        alerts = (await alert_source.check()) or []
    except UnavailableException:
        raise
    except Exception:
        alerts = [
            Alert(AlertSourceRunFailedAlertClass, args={
                "source_name": alert_source.name,
                "traceback": traceback.format_exc(),
            })
        ]
    else:
        if not isinstance(alerts, list):
            alerts = [alerts]

    for alert in alerts:
        alert.source = source_name

    return alerts
async def check(self):
    alerts = []
    for cert in (
        await self.middleware.call('certificate.query', [['certificate', '!=', None]]) +
        await self.middleware.call('certificateauthority.query')
    ):
        if cert['parsed']:
            diff = (datetime.strptime(cert['until'], '%a %b %d %H:%M:%S %Y') -
                    datetime.utcnow()).days
            if diff < 10:
                alerts.append(
                    Alert(
                        CertificateIsExpiringSoonAlertClass if diff <= 2
                        else CertificateIsExpiringAlertClass,
                        {
                            "name": cert["name"],
                            "days": diff,
                        },
                        key=[cert["name"]],
                    )
                )

    return alerts
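# The expiry window above comes from parsing the certificate's human-readable
# "until" field and diffing it against the current time; a quick sanity check
# of that format string with made-up dates:
from datetime import datetime

until = datetime.strptime("Sat Mar 01 12:00:00 2025", '%a %b %d %H:%M:%S %Y')
days = (until - datetime(2025, 2, 25)).days
assert days == 4   # < 10 alerts as "expiring"; <= 2 escalates to "expiring soon"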
def check_sync(self):
    j = journal.Reader()
    j.add_match("SYSLOG_IDENTIFIER=sshd")
    j.seek_realtime(datetime.now() - timedelta(days=1))

    count = 0
    last_messages = deque([], 4)
    for record in j:
        if record["MESSAGE"].startswith("Failed password for"):
            count += 1
            last_messages.append(
                f"{record['__REALTIME_TIMESTAMP'].strftime('%d %b %H:%M:%S')}: {record['MESSAGE']}"
            )

    if count > 0:
        return Alert(
            SSHLoginFailuresAlertClass,
            {
                "count": count,
                "failures": "\n".join(
                    ([f"... first {count - len(last_messages)} messages skipped ..."]
                     if count > len(last_messages) else []) +
                    list(last_messages)
                ),
            },
            key=list(last_messages),
        )
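# The deque([], 4) above is a bounded buffer: the second positional argument
# is maxlen, so the deque silently drops its oldest entry on overflow, which
# is how only the most recent failure messages are kept while count still
# reflects the total. A minimal illustration:
from collections import deque

d = deque([], 4)
for i in range(6):
    d.append(i)
print(list(d))   # [2, 3, 4, 5] -- the two oldest entries fell off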
def check_sync(self):
    try:
        self.middleware.call_sync("datastore.query", "system.update", None, {"get": True})
    except IndexError:
        self.middleware.call_sync("datastore.insert", "system.update", {
            "upd_autocheck": True,
            "upd_train": "",
        })

    path = self.middleware.call_sync("notifier.get_update_location")
    if not path:
        return

    try:
        updates = PendingUpdates(path)
    except Exception:
        updates = None

    if updates:
        return Alert("There is a new update available! Apply it in System -> Update tab.")
def check_sync(self):
    if not os.path.exists(COLLECTD_FILE):
        return

    lock = LockFile(COLLECTD_FILE)
    while not lock.i_am_locking():
        try:
            lock.acquire(timeout=5)
        except LockTimeout:
            return

    with open(COLLECTD_FILE, "rb") as f:
        try:
            data = pickle.loads(f.read())
        except Exception:
            data = {}

    lock.release()

    alerts = []
    for k, v in list(data.items()):
        if k == "ctl-ha/disk_octets":
            text = (
                "Storage Controller HA link is in use. Please check that all iSCSI and FC "
                "initiators support ALUA and are able to connect to the active node."
            )
        else:
            text = k

        if v["Severity"] == "WARNING":
            klass = CollectdWarningAlertClass
        else:
            klass = CollectdCriticalAlertClass

        alerts.append(Alert(klass, text))

    return alerts
async def check(self):
    if not await self.middleware.call('service.started', 'iscsitarget'):
        return

    in_use_ips = {
        i['address']
        for i in await self.middleware.call('interface.ip_in_use', {'any': True})
    }
    portals = {
        p['id']: p
        for p in await self.middleware.call('iscsi.portal.query')
    }
    ips = []
    for target in await self.middleware.call('iscsi.target.query'):
        for group in target['groups']:
            ips.extend(
                map(
                    lambda ip: ip['ip'],
                    filter(lambda a: a['ip'] not in in_use_ips,
                           portals[group['portal']]['listen'])
                )
            )

    if ips:
        return Alert(ISCSIPortalIPAlertClass, ', '.join(ips))
async def create(self, args):
    return Alert(CloudSyncTaskFailedAlertClass, args, key=args["id"])
def check_sync(self):
    alerts = []

    datasets = self.middleware.call_sync("zfs.dataset.query_for_quota_alert")

    pool_sizes = {}
    for d in datasets:
        d["name"] = d["name"]["rawvalue"]

        if "/" not in d["name"]:
            pool_sizes[d["name"]] = int(d["available"]["rawvalue"]) + int(d["used"]["rawvalue"])

        for k, default in [
            ("org.freenas:quota_warning", 80),
            ("org.freenas:quota_critical", 95),
            ("org.freenas:refquota_warning", 80),
            ("org.freenas:refquota_critical", 95),
        ]:
            try:
                d[k] = int(d[k]["rawvalue"])
            except (KeyError, ValueError):
                d[k] = default

    # Call this outside the for loop since we don't need to fetch it
    # for every dataset that could potentially be out of quota...
    hostname = self.middleware.call_sync("system.hostname")

    datasets = sorted(datasets, key=lambda ds: ds["name"])

    for dataset in datasets:
        for quota_property in ["quota", "refquota"]:
            try:
                quota_value = int(dataset[quota_property]["rawvalue"])
            except (AttributeError, KeyError, ValueError):
                continue

            if quota_value == 0:
                continue

            if quota_property == "quota":
                # We can't use the "used" property since it includes refreservation.
                # But if "refquota" is smaller than "quota", then "available" will be
                # reported with regard to that smaller value, and we would get a
                # false positive
                try:
                    refquota_value = int(dataset["refquota"]["rawvalue"])
                except (AttributeError, KeyError, ValueError):
                    continue
                else:
                    if refquota_value and refquota_value < quota_value:
                        continue

                # A quota larger than the dataset's available size will never be
                # exceeded, but it would break our logic
                if quota_value > pool_sizes[dataset["name"].split("/")[0]]:
                    continue

                used = quota_value - int(dataset["available"]["rawvalue"])
            elif quota_property == "refquota":
                used = int(dataset["usedbydataset"]["rawvalue"])
            else:
                raise RuntimeError()

            used_fraction = 100 * used / quota_value

            critical_threshold = dataset[f"org.freenas:{quota_property}_critical"]
            warning_threshold = dataset[f"org.freenas:{quota_property}_warning"]
            if critical_threshold != 0 and used_fraction >= critical_threshold:
                klass = QuotaCriticalAlertClass
            elif warning_threshold != 0 and used_fraction >= warning_threshold:
                klass = QuotaWarningAlertClass
            else:
                continue

            quota_name = quota_property[0].upper() + quota_property[1:]

            args = {
                "name": quota_name,
                "dataset": dataset["name"],
                "used_fraction": used_fraction,
                "used": format_size(used),
                "quota_value": format_size(quota_value),
            }

            mail = None
            owner = self._get_owner(dataset)
            if owner != 0:
                try:
                    self.middleware.call_sync('user.get_user_obj', {'uid': owner})
                    user_exists = True
                except KeyError:
                    user_exists = False
                    to = None
                    logger.debug("Unable to query bsduser with uid %r", owner)

                if user_exists:
                    try:
                        bsduser = self.middleware.call_sync(
                            "datastore.query",
                            "account.bsdusers",
                            [["bsdusr_uid", "=", owner]],
                            {"get": True},
                        )
                        to = bsduser["bsdusr_email"] or None
                    except IndexError:
                        to = None

                if to is not None:
                    mail = {
                        "to": [to],
                        "subject": f"{hostname}: {quota_name} exceeded on dataset {dataset['name']}",
                        "text": klass.text % args,
                    }

            alerts.append(
                Alert(
                    klass,
                    args=args,
                    key=[dataset["name"], quota_property],
                    mail=mail,
                )
            )

    return alerts
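# A small worked example of the threshold math above, with hypothetical
# numbers: for the "quota" property, used space is derived as quota minus
# "available" (rather than the "used" property, which includes
# refreservation), and the percentage is compared against the per-dataset
# warning/critical properties, defaulting to 80/95.
quota_value = 100 * 1024**3                 # 100 GiB quota
available = 12 * 1024**3                    # 12 GiB left under that quota
used = quota_value - available              # 88 GiB consumed
used_fraction = 100 * used / quota_value    # 88.0

warning_threshold, critical_threshold = 80, 95
if critical_threshold and used_fraction >= critical_threshold:
    level = "critical"
elif warning_threshold and used_fraction >= warning_threshold:
    level = "warning"                       # fires here: 88 >= 80
else:
    level = None
assert level == "warning"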
async def check(self):
    baseboard_manufacturer = (
        (await run(["dmidecode", "-s", "baseboard-manufacturer"], check=False))
        .stdout.decode(errors="ignore")
    ).strip()

    failover_hardware = await self.middleware.call("failover.hardware")

    is_gigabyte = baseboard_manufacturer == "GIGABYTE"
    is_m_series = baseboard_manufacturer == "Supermicro" and failover_hardware == "ECHOWARP"

    alerts = []
    for sensor in await self.middleware.call("sensor.query"):
        if is_gigabyte:
            if sensor["value"] is None:
                continue

            if not (RE_CPUTEMP.match(sensor["name"]) or RE_SYSFAN.match(sensor["name"])):
                continue

            if sensor["lowarn"] and sensor["value"] < sensor["lowarn"]:
                relative = "below"
                if sensor["value"] < sensor["locrit"]:
                    level = "critical"
                else:
                    level = "recommended"
            elif sensor["hiwarn"] and sensor["value"] > sensor["hiwarn"]:
                relative = "above"
                if sensor["value"] > sensor["hicrit"]:
                    level = "critical"
                else:
                    level = "recommended"
            else:
                continue

            alerts.append(
                Alert(
                    SensorAlertClass,
                    {
                        "name": sensor["name"],
                        "relative": relative,
                        "level": level,
                        "value": sensor["value"],
                        "desc": sensor["desc"],
                    },
                    key=[sensor["name"], relative, level],
                )
            )

        if is_m_series:
            ps_match = re.match("(PS[0-9]+) Status", sensor["name"])
            if ps_match:
                ps = ps_match.group(1)

                if sensor["notes"]:
                    alerts.append(
                        Alert(
                            PowerSupplyAlertClass,
                            {
                                "number": ps,
                                "errors": ", ".join(sensor["notes"]),
                            },
                        )
                    )

    return alerts
async def create(self, args):
    return Alert(VMWareSnapshotDeleteFailedAlertClass, args)
async def __run_alerts(self):
    master_node = "A"
    backup_node = "B"
    run_on_backup_node = False
    if not await self.middleware.call("system.is_freenas"):
        if await self.middleware.call("notifier.failover_licensed"):
            master_node = await self.middleware.call("failover.node")
            try:
                backup_node = await self.middleware.call("failover.call_remote", "failover.node")
                remote_version = await self.middleware.call("failover.call_remote", "system.version")
                remote_failover_status = await self.middleware.call("failover.call_remote",
                                                                    "notifier.failover_status")
            except Exception:
                pass
            else:
                if remote_version == await self.middleware.call("system.version"):
                    if remote_failover_status == "BACKUP":
                        run_on_backup_node = True

    for alert_source in ALERT_SOURCES.values():
        if not alert_source.schedule.should_run(datetime.utcnow(),
                                                self.alert_source_last_run[alert_source.name]):
            continue

        self.alert_source_last_run[alert_source.name] = datetime.utcnow()

        self.logger.trace("Running alert source: %r", alert_source.name)
        try:
            alerts_a = await self.__run_source(alert_source.name)
        except UnavailableException:
            alerts_a = list(self.alerts["A"][alert_source.name].values())
        for alert in alerts_a:
            alert.node = master_node

        alerts_b = []
        if run_on_backup_node and alert_source.run_on_backup_node:
            try:
                try:
                    alerts_b = await self.middleware.call("failover.call_remote",
                                                          "alert.run_source",
                                                          [alert_source.name])
                except CallError as e:
                    if e.errno == CallError.EALERTCHECKERUNAVAILABLE:
                        alerts_b = list(self.alerts["B"][alert_source.name].values())
                    else:
                        raise
                else:
                    alerts_b = [
                        Alert(**dict(alert,
                                     level=(AlertLevel(alert["level"])
                                            if alert["level"] is not None
                                            else alert["level"])))
                        for alert in alerts_b
                    ]
            except Exception:
                alerts_b = [
                    Alert(
                        title="Unable to run alert source %(source_name)r on backup node\n%(traceback)s",
                        args={
                            "source_name": alert_source.name,
                            "traceback": traceback.format_exc(),
                        },
                        key="__remote_call_exception__",
                        level=AlertLevel.CRITICAL,
                    )
                ]
        for alert in alerts_b:
            alert.node = backup_node

        for alert in alerts_a + alerts_b:
            existing_alert = self.alerts[alert.node][alert_source.name].get(alert.key)

            alert.source = alert_source.name
            if existing_alert is None:
                alert.datetime = datetime.utcnow()
            else:
                alert.datetime = existing_alert.datetime
            alert.level = alert.level or alert_source.level
            alert.title = alert.title or alert_source.title
            if existing_alert is None:
                alert.dismissed = False
            else:
                alert.dismissed = existing_alert.dismissed

        self.alerts["A"][alert_source.name] = {alert.key: alert for alert in alerts_a}
        self.alerts["B"][alert_source.name] = {alert.key: alert for alert in alerts_b}
def check_sync(self):
    alerts = []

    if not self.middleware.call_sync('failover.licensed'):
        return alerts

    if not self.middleware.call_sync('failover.internal_interfaces'):
        alerts.append(Alert(FailoverInterfaceNotFoundAlertClass))
        return alerts

    try:
        self.middleware.call_sync('failover.call_remote', 'core.ping')

        local_version = self.middleware.call_sync('system.version')
        remote_version = self.middleware.call_sync('failover.call_remote', 'system.version')
        if local_version != remote_version:
            return [Alert(TrueNASVersionsMismatchAlertClass)]

        if not self.middleware.call_sync('failover.call_remote', 'system.ready'):
            raise UnavailableException()

        local = self.middleware.call_sync('failover.vip.get_states')
        remote = self.middleware.call_sync('failover.call_remote', 'failover.vip.get_states')
        errors = self.middleware.call_sync('failover.vip.check_states', local, remote)
        for error in errors:
            alerts.append(Alert(
                CARPStatesDoNotAgreeAlertClass,
                {"error": error},
            ))
    except CallError as e:
        if e.errno != errno.ECONNREFUSED:
            return [Alert(FailoverStatusCheckFailedAlertClass, [str(e)])]

    status = self.middleware.call_sync('failover.status')

    if status == 'ERROR':
        errmsg = None
        if os.path.exists('/tmp/.failover_failed'):
            with open('/tmp/.failover_failed', 'r') as fh:
                errmsg = fh.read()
        if not errmsg:
            errmsg = 'Unknown error'
        alerts.append(Alert(FailoverFailedAlertClass, [errmsg]))
    elif status not in ('MASTER', 'BACKUP', 'SINGLE'):
        alerts.append(Alert(ExternalFailoverLinkStatusAlertClass))

    internal_ifaces = self.middleware.call_sync('failover.internal_interfaces')
    if internal_ifaces:
        p1 = subprocess.Popen(
            "/sbin/ifconfig %s|grep -E 'vhid (10|20) '|grep 'carp:'" % internal_ifaces[0],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            shell=True,
            encoding='utf8',
        )
        stdout = p1.communicate()[0].strip()
        if status != "SINGLE" and stdout.count("\n") != 1:
            alerts.append(Alert(InternalFailoverLinkStatusAlertClass))

    if status != "SINGLE":
        try:
            if sysctl.filter('kern.cam.ctl.ha_link')[0].value == 1:
                alerts.append(Alert(CTLHALinkAlertClass))
        except Exception:
            pass

    if status == 'BACKUP':
        fobj = None
        try:
            with open(FAILOVER_JSON, 'r') as f:
                fobj = json.loads(f.read())
        except Exception:
            pass
        try:
            if len(fobj['phrasedvolumes']) > 0:
                keys = self.middleware.call_sync('failover.encryption_keys')['geli']
                not_found = False
                for pool in fobj['phrasedvolumes']:
                    if pool not in keys:
                        not_found = True
                        alerts.append(Alert(NoFailoverPassphraseKeysAlertClass, {'pool': pool}))
                if not_found:
                    # Kick a sync from the peer if we don't have the keys.
                    self.middleware.call_sync('failover.call_remote',
                                              'failover.sync_keys_to_remote_node')
        except Exception:
            pass

    return alerts
async def create(self, args):
    return Alert(RsyncFailedAlertClass, args, key=args['id'])
async def test(self, data):
    """
    Send a test alert using `type` of Alert Service.

    .. examples(websocket)::

      Send a test alert using Alert Service of Mail `type`.

        :::javascript
        {
            "id": "6841f242-840a-11e6-a437-00e04d680384",
            "msg": "method",
            "method": "alertservice.test",
            "params": [{
                "name": "Test Email Alert",
                "enabled": true,
                "type": "Mail",
                "attributes": {
                    "email": "*****@*****.**"
                },
                "settings": {}
            }]
        }
    """
    await self._validate(data, "alert_service_test")

    factory = ALERT_SERVICES_FACTORIES.get(data["type"])
    if factory is None:
        self.logger.error("Alert service %r does not exist", data["type"])
        return False

    try:
        alert_service = factory(self.middleware, data["attributes"])
    except Exception:
        self.logger.error("Error creating alert service %r with parameters=%r",
                          data["type"], data["attributes"], exc_info=True)
        return False

    master_node = "A"
    if not await self.middleware.call("system.is_freenas"):
        if await self.middleware.call("failover.licensed"):
            master_node = await self.middleware.call("failover.node")

    test_alert = Alert(
        TestAlertClass,
        node=master_node,
        datetime=datetime.utcnow(),
        last_occurrence=datetime.utcnow(),
        _uuid="test",
    )

    try:
        await alert_service.send([test_alert], [], [test_alert])
    except Exception:
        self.logger.error("Error in alert service %r", data["type"], exc_info=True)
        return False

    return True
async def create(self, args):
    return Alert(CatalogSyncFailedAlertClass, args, _key=args['catalog'])
async def __run_alerts(self):
    master_node = "A"
    backup_node = "B"
    product_type = await self.middleware.call("alert.product_type")
    run_on_backup_node = False
    run_failover_related = False
    if product_type == "ENTERPRISE":
        if await self.middleware.call("failover.licensed"):
            if await self.middleware.call("failover.node") == "B":
                master_node = "B"
                backup_node = "A"

            try:
                remote_version = await self.middleware.call("failover.call_remote",
                                                            "system.version")
                remote_system_state = await self.middleware.call("failover.call_remote",
                                                                 "system.state")
                remote_failover_status = await self.middleware.call("failover.call_remote",
                                                                    "failover.status")
            except Exception:
                pass
            else:
                if remote_version == await self.middleware.call("system.version"):
                    if remote_system_state == "READY" and remote_failover_status == "BACKUP":
                        run_on_backup_node = True

        run_failover_related = time.monotonic() > self.blocked_failover_alerts_until

    for k, source_lock in list(self.sources_locks.items()):
        if source_lock.expires_at <= time.monotonic():
            await self.unblock_source(k)

    for alert_source in ALERT_SOURCES.values():
        if product_type not in alert_source.products:
            continue

        if alert_source.failover_related and not run_failover_related:
            continue

        if not alert_source.schedule.should_run(datetime.utcnow(),
                                                self.alert_source_last_run[alert_source.name]):
            continue

        self.alert_source_last_run[alert_source.name] = datetime.utcnow()

        alerts_a = [
            alert for alert in self.alerts
            if alert.node == master_node and alert.source == alert_source.name
        ]
        locked = False
        if self.blocked_sources[alert_source.name]:
            self.logger.debug("Not running alert source %r because it is blocked",
                              alert_source.name)
            locked = True
        else:
            self.logger.trace("Running alert source: %r", alert_source.name)
            try:
                alerts_a = await self.__run_source(alert_source.name)
            except UnavailableException:
                pass
        for alert in alerts_a:
            alert.node = master_node

        alerts_b = []
        if run_on_backup_node and alert_source.run_on_backup_node:
            try:
                alerts_b = [
                    alert for alert in self.alerts
                    if alert.node == backup_node and alert.source == alert_source.name
                ]
                try:
                    if not locked:
                        alerts_b = await self.middleware.call("failover.call_remote",
                                                              "alert.run_source",
                                                              [alert_source.name])

                        alerts_b = [
                            Alert(**dict(
                                {
                                    k: v for k, v in alert.items()
                                    if k in ["args", "datetime", "last_occurrence",
                                             "dismissed", "mail"]
                                },
                                klass=AlertClass.class_by_name[alert["klass"]],
                                _source=alert["source"],
                                _key=alert["key"],
                            ))
                            for alert in alerts_b
                        ]
                except CallError as e:
                    if e.errno in [
                        errno.ECONNABORTED, errno.ECONNREFUSED, errno.ECONNRESET,
                        errno.EHOSTDOWN, errno.ETIMEDOUT,
                        CallError.EALERTCHECKERUNAVAILABLE,
                    ]:
                        pass
                    else:
                        raise
            except ReserveFDException:
                self.logger.debug('Failed to reserve a privileged port')
            except Exception:
                alerts_b = [
                    Alert(
                        AlertSourceRunFailedOnBackupNodeAlertClass,
                        args={
                            "source_name": alert_source.name,
                            "traceback": traceback.format_exc(),
                        },
                        _source=alert_source.name,
                    )
                ]
        for alert in alerts_b:
            alert.node = backup_node

        for alert in alerts_a + alerts_b:
            self.__handle_alert(alert)

        self.alerts = (
            [a for a in self.alerts if a.source != alert_source.name] +
            alerts_a + alerts_b
        )
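# The final assignment above refreshes a single source's alerts in place:
# drop everything previously produced by that source, keep every other
# source's alerts untouched, and append the fresh results. A minimal
# standalone illustration with a hypothetical Alert stand-in:
from dataclasses import dataclass

@dataclass
class A:
    source: str
    key: str

alerts = [A("disk", "a"), A("smart", "b"), A("disk", "c")]
new_disk = [A("disk", "d")]
alerts = [a for a in alerts if a.source != "disk"] + new_disk
assert [a.key for a in alerts] == ["b", "d"]   # "smart" alert survived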
async def create(self, args):
    return Alert(RsyncSuccessAlertClass, args, key=args['id'])