Example #1
    async def check(self):
        if not await self.enabled():
            return

        alerts = []
        for pool in await self.middleware.call("pool.query"):
            if not pool["is_decrypted"]:
                continue

            if not pool["healthy"]:
                if await self.middleware.call("system.is_enterprise"):
                    try:
                        await self.middleware.call("enclosure.sync_zpool", pool["name"])
                    except Exception:
                        pass

                bad_vdevs = []
                if pool["topology"]:
                    for vdev in await self.middleware.call("pool.flatten_topology", pool["topology"]):
                        if vdev["type"] == "DISK" and vdev["status"] != "ONLINE":
                            name = vdev["guid"]
                            if vdev.get("unavail_disk"):
                                name = f'{vdev["unavail_disk"]["model"]} {vdev["unavail_disk"]["serial"]}'
                            bad_vdevs.append(f"Disk {name} is {vdev['status']}")
                if bad_vdevs:
                    devices = (f"<br>The following devices are not healthy:"
                               f"<ul><li>{'</li><li>'.join(bad_vdevs)}</li></ul>")
                else:
                    devices = ""

                alerts.append(Alert(
                    VolumeStatusAlertClass,
                    {
                        "volume": pool["name"],
                        "state": pool["status"],
                        "status": pool["status_detail"],
                        "devices": devices,
                    }
                ))

        return alerts
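
These check() methods are implemented on alert source classes in the TrueNAS middleware. A minimal sketch of the overall pattern, assuming middlewared's AlertSource base class (the class name PoolHealthAlertSource below is a hypothetical illustration, not code from the project):

    # Minimal sketch, assuming middlewared's alert framework; VolumeStatusAlertClass
    # is defined alongside the real source.
    from middlewared.alert.base import Alert, AlertSource

    class PoolHealthAlertSource(AlertSource):  # hypothetical name
        async def check(self):
            alerts = []
            for pool in await self.middleware.call("pool.query"):
                if not pool["healthy"]:
                    alerts.append(Alert(VolumeStatusAlertClass, {
                        "volume": pool["name"],
                        "state": pool["status"],
                        "status": pool["status_detail"],
                        "devices": "",
                    }))
            return alerts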
Example #2
    async def initialize(self, load=True):
        is_freenas = await self.middleware.call("system.is_freenas")

        self.node = "A"
        if not is_freenas:
            if await self.middleware.call("failover.node") == "B":
                self.node = "B"

        self.alerts = []
        if load:
            for alert in await self.middleware.call("datastore.query",
                                                    "system.alert"):
                del alert["id"]

                try:
                    alert["klass"] = AlertClass.class_by_name[alert["klass"]]
                except KeyError:
                    self.logger.info("Alert class %r is no longer present",
                                     alert["klass"])
                    continue

                alert["_uuid"] = alert.pop("uuid")
                alert["_source"] = alert.pop("source")
                alert["_key"] = alert.pop("key")
                alert["_text"] = alert.pop("text")

                alert = Alert(**alert)

                if not any(a.uuid == alert.uuid for a in self.alerts):
                    self.alerts.append(alert)

        self.alert_source_last_run = defaultdict(lambda: datetime.min)

        self.policies = {
            "IMMEDIATELY": AlertPolicy(),
            "HOURLY": AlertPolicy(lambda d: (d.date(), d.hour)),
            "DAILY": AlertPolicy(lambda d: (d.date())),
            "NEVER": AlertPolicy(lambda d: None),
        }
        for policy in self.policies.values():
            policy.receive_alerts(datetime.utcnow(), self.alerts)
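
The lambdas given to AlertPolicy are bucketing key functions: two datetimes that map to the same key fall into the same delivery window. A standard-library illustration:

    from datetime import datetime

    hourly = lambda d: (d.date(), d.hour)   # key used by the HOURLY policy
    daily = lambda d: d.date()              # key used by the DAILY policy

    a = datetime(2021, 6, 1, 14, 5)
    b = datetime(2021, 6, 1, 14, 55)
    assert hourly(a) == hourly(b)                             # same hour -> same bucket
    assert hourly(a) != hourly(datetime(2021, 6, 1, 15, 0))   # next hour -> new bucket
    assert daily(a) == daily(b)                               # same day -> same bucket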
Example #3
    async def check(self):
        corefiles = []
        coredumps = await self.middleware.call("system.coredumps")
        for coredump in filter(lambda c: c["corefile"] == "present", coredumps):
            if coredump["unit"] in self.ignore:
                # Unit: "syslog-ng.service" has been core dumping for, literally, years
                # on freeBSD and now also on linux. The fix is non-trivial and it seems
                # to be very specific to how we implemented our system dataset. Anyways,
                # the crash isn't harmful so we ignore it.

                # Unit: "containerd.service" is related to k3s.
                # users are free to run whatever they would like to in containers
                # and we don't officially support all the apps themselves so we
                # ignore those core dumps
                continue

            corefiles.append(f"{coredump['exe']} ({coredump['time']})")

        if corefiles:
            return Alert(CoreFilesArePresentAlertClass,
                         {"corefiles": ', '.join(corefiles)})
Example #4
    def check_sync(self):
        rrd_size_alert_threshold = 1610611911  # bytes (~1.5 GiB)

        try:
            used = shutil.disk_usage('/var/db/collectd/rrd').used
        except FileNotFoundError:
            raise UnavailableException()

        if used > rrd_size_alert_threshold:
            # zfs list reports in kibi/mebi/gibi(bytes) but
            # format_size() calculates in kilo/mega/giga by default
            # so the report that we send the user needs to match
            # up with what zfs list reports as to not confuse anyone
            used = format_size(used, binary=True)
            threshold = format_size(rrd_size_alert_threshold, binary=True)

            return Alert(
                ReportingDbAlertClass,
                {'used': used, 'threshold': threshold},
                key=None,
            )
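
The binary=True flag is the point of the comment above. With humanfriendly's format_size (which matches the signature used here), the same byte count renders differently in decimal and binary units:

    from humanfriendly import format_size

    n = 1610611911
    print(format_size(n))               # decimal units: ~'1.61 GB'
    print(format_size(n, binary=True))  # binary units, as zfs reports: ~'1.5 GiB'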
Example #5
    def check_sync(self):
        try:
            self.middleware.call_sync("datastore.query", "system.update", None,
                                      {"get": True})
        except IndexError:
            self.middleware.call_sync("datastore.insert", "system.update", {
                "upd_autocheck": True,
                "upd_train": "",
            })

        path = self.middleware.call_sync("update.get_update_location")
        if not path:
            return

        try:
            updates = PendingUpdates(path)
        except Exception:
            updates = None

        if updates:
            return Alert(HasUpdateAlertClass)
Example #6
    def check_sync(self):
        try:
            with LockFile(VMWARESNAPDELETE_FAILS):
                with open(VMWARESNAPDELETE_FAILS, "rb") as f:
                    fails = pickle.load(f)
        except Exception:
            return

        alerts = []
        for snapname, vms in list(fails.items()):
            for vm in vms:
                alerts.append(
                    Alert(
                        VMWareSnapshotDeleteFailedAlertClass, {
                            "snapshot": snapname,
                            "vm": vm,
                            "hostname": "<hostname>",
                            "error": "Error",
                        }))

        return alerts
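
The fails file read above is just a pickled dict mapping snapshot names to lists of VM names. An illustrative round-trip (the path is hypothetical, not the real VMWARESNAPDELETE_FAILS location):

    import pickle

    fails = {"tank@auto-2021-01-01": ["vm01", "vm02"]}
    with open("/tmp/vmware_snapdelete_fails.pickle", "wb") as f:  # hypothetical path
        pickle.dump(fails, f)
    with open("/tmp/vmware_snapdelete_fails.pickle", "rb") as f:
        assert pickle.load(f) == fails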
Example #7
    async def check(self):
        interfaces = await self.middleware.call("datastore.query",
                                                "network.interfaces")
        alerts = []
        node = await self.middleware.call("failover.node")

        for interface in interfaces:
            if interface["int_critical"]:
                missing_ip_fields = []

                if not interface["int_ipv4address"] and not interface[
                        "int_dhcp"]:
                    if node == 'A':
                        missing_ip_fields.append(
                            'IPv4 Address (This Storage Controller)')
                    else:
                        missing_ip_fields.append(
                            'IPv4 Address (Storage Controller 1)')

                if not interface["int_ipv4address_b"] and not interface[
                        "int_dhcp"]:
                    if node == 'B':
                        missing_ip_fields.append(
                            'IPv4 Address (This Storage Controller)')
                    else:
                        missing_ip_fields.append(
                            'IPv4 Address (Storage Controller 2)')

                if not interface["int_vip"]:
                    missing_ip_fields.append('Virtual IP Address')

                if missing_ip_fields:
                    alerts.append(
                        Alert(
                            FailoverIpAlertClass, {
                                "interface": interface["int_name"],
                                "addresses": " ".join(missing_ip_fields),
                            }))

        return alerts
Example #8
    async def _produce_alerts_for_ipmitool_output(self, output):
        alerts = []

        records = parse_ipmitool_output(output)

        if records:
            if await self.middleware.call("keyvalue.has_key",
                                          self.dismissed_datetime_kv_key):
                dismissed_datetime = ((await self.middleware.call(
                    "keyvalue.get",
                    self.dismissed_datetime_kv_key)).replace(tzinfo=None))
            else:
                # Prevent notifying about existing alerts on first install/upgrade
                dismissed_datetime = max(record.datetime for record in records)
                await self.middleware.call("keyvalue.set",
                                           self.dismissed_datetime_kv_key,
                                           dismissed_datetime)

            for record in records:
                if record.datetime <= dismissed_datetime:
                    continue

                title = "%(sensor)s %(direction)s %(event)s"
                if record.verbose is not None:
                    title += ": %(verbose)s"

                args = dict(record._asdict())
                args.pop("id")
                args.pop("datetime")

                alerts.append(
                    Alert(
                        title=title,
                        args=args,
                        key=[title, args,
                             record.datetime.isoformat()],
                        datetime=record.datetime,
                    ))

        return alerts
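
The title is kept as a %-style template and the arguments are stored separately, so rendering can be deferred until the alert is displayed. The substitution itself is plain Python (the field values below are illustrative):

    title = "%(sensor)s %(direction)s %(event)s"
    args = {"sensor": "CPU1 Temp", "direction": "Upper", "event": "Non-recoverable going high"}
    print(title % args)  # 'CPU1 Temp Upper Non-recoverable going high'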
Example #9
    def check_sync(self):
        try:
            used = shutil.disk_usage('/var/db/collectd/rrd').used
        except FileNotFoundError:
            raise UnavailableException()

        threshold = 1073741824 + len(
            self.middleware.call_sync('disk.query')) * 1024 * 1024

        if used > threshold:
            # zfs list reports in kibi/mebi/gibi(bytes) but
            # format_size() calculates in kilo/mega/giga by default
            # so the report that we send the user needs to match
            # up with what zfs list reports as to not confuse anyone
            used = format_size(used, binary=True)
            threshold = format_size(threshold, binary=True)

            return Alert(
                ReportingDbAlertClass,
                {'used': used, 'threshold': threshold},
                key=None,
            )
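
Unlike Example #4's fixed threshold, this one scales with the machine: a 1 GiB base plus 1 MiB per disk. Worked through for a 12-disk system:

    base = 1073741824                  # 1 GiB
    per_disk = 1024 * 1024             # 1 MiB per disk
    threshold = base + 12 * per_disk   # 1086324736 bytes, ~1.01 GiB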
Example #10
    async def __run_source(self, source_name):
        alert_source = ALERT_SOURCES[source_name]

        try:
            alerts = (await alert_source.check()) or []
        except Exception:
            alerts = [
                Alert(
                    title="Unable to run alert source %(source_name)r\n%(traceback)s",
                    args={
                        "source_name": alert_source.name,
                        "traceback": traceback.format_exc(),
                    },
                    key="__unhandled_exception__",
                    level=AlertLevel.CRITICAL,
                )
            ]
        else:
            if not isinstance(alerts, list):
                alerts = [alerts]

        return alerts
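
traceback.format_exc() captures the in-flight exception as a string, which is how the full traceback can travel inside the alert's args:

    import traceback

    try:
        1 / 0
    except Exception:
        tb = traceback.format_exc()
    # tb is a multi-line string ending in 'ZeroDivisionError: division by zero'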
Example #11
    async def check(self):
        alerts = []
        for replication in await self.middleware.call(
            "replication.query", [["enabled", "=", True]],
        ):
            if replication["state"]["state"] == "ERROR":
                alerts.append(Alert(
                    "Replication %(replication)s failed: %(message)s",
                    {
                        "replication": "%s -> %s:%s" % (
                            ". ".join(replication["source_datasets"]),
                            (replication["ssh_credentials"] or {}).get("name", "localhost"),
                            replication["target_dataset"],
                        ),
                        "message": replication["state"]["error"],
                    },
                ))

        return alerts
Example #12
    async def check(self):
        alerts = []

        for cert_id, service, type_c, datastore in (
            ((await self.middleware.call('ftp.config'))['ssltls_certificate'],
             'FTP', 'certificate', 'certificate'),
            ((await self.middleware.call('s3.config'))['certificate'],
             'S3', 'certificate', 'certificate'),
            ((await self.middleware.call('webdav.config'))['certssl'],
             'Webdav', 'certificate', 'certificate'),
            ((await self.middleware.call('openvpn.server.config'))['server_certificate'],
             'OpenVPN server', 'certificate', 'certificate'),
            ((await self.middleware.call('openvpn.client.config'))['client_certificate'],
             'OpenVPN client', 'certificate', 'certificate'),
            ((await self.middleware.call('system.general.config'))['ui_certificate']['id'],
             'Web UI', 'certificate', 'certificate'),
            ((await self.middleware.call('system.advanced.config'))['syslog_tls_certificate'],
             'Syslog', 'certificate', 'certificate'),
            ((await self.middleware.call('openvpn.server.config'))['root_ca'],
             'OpenVPN server', 'root certificate authority', 'certificateauthority'),
            ((await self.middleware.call('openvpn.client.config'))['root_ca'],
             'OpenVPN client', 'root certificate authority', 'certificateauthority'),
        ):
            if cert_id and (await self.middleware.call(
                f'{datastore}.query', [['id', '=', cert_id]], {'get': True},
            ))['revoked']:
                alerts.append(Alert(CertificateRevokedAlertClass, {
                    'service': service,
                    'type': type_c,
                }))

        return alerts
Example #13
    def check_sync(self):
        if self.middleware.call_sync("datastore.query", "services.services",
                                     [("srv_service", "=", "smartd"),
                                      ("srv_enable", "=", True)]):
            # sysctl kern.vm_guest will return a hypervisor name, or the string "none"
            # if FreeNAS is running on bare iron.
            p0 = subprocess.Popen(["/sbin/sysctl", "-n", "kern.vm_guest"],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  encoding="utf8")
            status = p0.communicate()[0].strip()
            # This really isn"t confused with python None
            if status != "none":
                # We got something other than "none", maybe "vmware", "xen", "vbox".  Regardless, smartd not running
                # in these environments isn"t a huge deal.  So we"ll skip alerting.
                return

            if not self.middleware.call_sync("system.is_freenas"):
                if self.middleware.call_sync("failover.status") != "MASTER":
                    return

            if not self.middleware.call_sync("service.started", "smartd"):
                return Alert("smartd is not running")
Example #14
    async def __run_source(self, source_name):
        alert_source = ALERT_SOURCES[source_name]

        try:
            alerts = (await alert_source.check()) or []
        except UnavailableException:
            raise
        except Exception:
            alerts = [
                Alert(AlertSourceRunFailedAlertClass,
                      args={
                          "source_name": alert_source.name,
                          "traceback": traceback.format_exc(),
                      })
            ]
        else:
            if not isinstance(alerts, list):
                alerts = [alerts]

        for alert in alerts:
            alert.source = source_name

        return alerts
Example #15
    async def check(self):
        alerts = []

        certs = await self.middleware.call(
            'certificate.query', [['certificate', '!=', None]])
        cas = await self.middleware.call('certificateauthority.query')
        for cert in certs + cas:
            if cert['parsed']:
                diff = (datetime.strptime(cert['until'], '%a %b %d %H:%M:%S %Y') -
                        datetime.utcnow()).days
                if diff < 10:
                    alerts.append(
                        Alert(
                            CertificateIsExpiringSoonAlertClass
                            if diff <= 2 else CertificateIsExpiringAlertClass,
                            {
                                "name": cert["name"],
                                "days": diff,
                            },
                            key=[cert["name"]],
                        ))

        return alerts
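
The 'until' field is parsed with a ctime-style format string. A quick check of the arithmetic (the date below is illustrative):

    from datetime import datetime

    until = "Sat Jan  1 00:00:00 2028"
    diff = (datetime.strptime(until, "%a %b %d %H:%M:%S %Y") - datetime.utcnow()).days
    # diff < 10 triggers CertificateIsExpiringAlertClass;
    # diff <= 2 escalates to CertificateIsExpiringSoonAlertClass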
Example #16
    def check_sync(self):
        j = journal.Reader()
        j.add_match("SYSLOG_IDENTIFIER=sshd")
        j.seek_realtime(datetime.now() - timedelta(days=1))
        count = 0
        last_messages = deque([], 4)
        for record in j:
            if record["MESSAGE"].startswith("Failed password for"):
                count += 1
                last_messages.append(
                    f"{record['__REALTIME_TIMESTAMP'].strftime('%d %b %H:%M:%S')}: {record['MESSAGE']}"
                )

        if count > 0:
            return Alert(
                SSHLoginFailuresAlertClass,
                {
                    "count": count,
                    "failures": "\n".join(
                        ([f"... first {count - len(last_messages)} messages skipped ..."]
                         if count > len(last_messages) else []) +
                        list(last_messages)
                    ),
                },
                key=list(last_messages),
            )
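
deque([], 4) is a bounded buffer: once four items are held, each append silently evicts the oldest entry, so only the last four failure messages survive:

    from collections import deque

    last_messages = deque([], 4)
    for i in range(10):
        last_messages.append(f"failure {i}")
    print(list(last_messages))  # ['failure 6', 'failure 7', 'failure 8', 'failure 9']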
Example #17
    def check_sync(self):
        try:
            self.middleware.call_sync("datastore.query", "system.update", None,
                                      {"get": True})
        except IndexError:
            self.middleware.call_sync("datastore.insert", "system.update", {
                "upd_autocheck": True,
                "upd_train": "",
            })

        path = self.middleware.call_sync("notifier.get_update_location")
        if not path:
            return

        try:
            updates = PendingUpdates(path)
        except Exception:
            updates = None

        if updates:
            return Alert(
                "There is a new update available! Apply it in System -> Update tab."
            )
Example #18
    def check_sync(self):
        if not os.path.exists(COLLECTD_FILE):
            return

        lock = LockFile(COLLECTD_FILE)

        while not lock.i_am_locking():
            try:
                lock.acquire(timeout=5)
            except LockTimeout:
                return

        with open(COLLECTD_FILE, "rb") as f:
            try:
                data = pickle.loads(f.read())
            except Exception:
                data = {}

        lock.release()

        alerts = []
        for k, v in list(data.items()):
            if k == "ctl-ha/disk_octets":
                text = (
                    "Storage Controller HA link is in use. Please check that all iSCSI and FC initiators support ALUA "
                    "and are able to connect to the active node.")
            else:
                text = k

            if v["Severity"] == "WARNING":
                klass = CollectdWarningAlertClass
            else:
                klass = CollectdCriticalAlertClass

            alerts.append(Alert(klass, text))

        return alerts
Example #19
    async def check(self):
        if not await self.middleware.call('service.started', 'iscsitarget'):
            return

        in_use_ips = {
            i['address']
            for i in await self.middleware.call('interface.ip_in_use',
                                                {'any': True})
        }
        portals = {
            p['id']: p
            for p in await self.middleware.call('iscsi.portal.query')
        }
        ips = []
        for target in await self.middleware.call('iscsi.target.query'):
            for group in target['groups']:
                ips.extend(
                    map(
                        lambda ip: ip['ip'],
                        filter(lambda a: a['ip'] not in in_use_ips,
                               portals[group['portal']]['listen'])))

        if ips:
            return Alert(ISCSIPortalIPAlertClass, ', '.join(ips))
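
The map/filter chain above is easy to misread because the inner lambda's parameter is named ip yet receives a whole listen entry. A behavior-equivalent comprehension over sample data:

    in_use_ips = {"192.168.0.10"}
    listen = [{"ip": "192.168.0.10", "port": 3260}, {"ip": "10.0.0.5", "port": 3260}]
    ips = []
    ips.extend(a["ip"] for a in listen if a["ip"] not in in_use_ips)
    print(ips)  # ['10.0.0.5']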
Example #20
    async def create(self, args):
        return Alert(CloudSyncTaskFailedAlertClass, args, key=args["id"])
Example #21
    def check_sync(self):
        alerts = []

        datasets = self.middleware.call_sync(
            "zfs.dataset.query_for_quota_alert")

        pool_sizes = {}
        for d in datasets:
            d["name"] = d["name"]["rawvalue"]

            if "/" not in d["name"]:
                pool_sizes[d["name"]] = int(d["available"]["rawvalue"]) + int(
                    d["used"]["rawvalue"])

            for k, default in [("org.freenas:quota_warning", 80),
                               ("org.freenas:quota_critical", 95),
                               ("org.freenas:refquota_warning", 80),
                               ("org.freenas:refquota_critical", 95)]:
                try:
                    d[k] = int(d[k]["rawvalue"])
                except (KeyError, ValueError):
                    d[k] = default

        # call this outside the for loop since we don't need to check
        # for every dataset that could potentially be out of quota...
        hostname = self.middleware.call_sync("system.hostname")
        datasets = sorted(datasets, key=lambda ds: ds["name"])
        for dataset in datasets:
            for quota_property in ["quota", "refquota"]:
                try:
                    quota_value = int(dataset[quota_property]["rawvalue"])
                except (AttributeError, KeyError, ValueError):
                    continue

                if quota_value == 0:
                    continue

                if quota_property == "quota":
                    # We can't use "used" property since it includes refreservation

                    # But if "refquota" is smaller than "quota", then "available" will be reported with regards to
                    # that smaller value, and we will get false positive
                    try:
                        refquota_value = int(dataset["refquota"]["rawvalue"])
                    except (AttributeError, KeyError, ValueError):
                        continue
                    else:
                        if refquota_value and refquota_value < quota_value:
                            continue

                    # A quota larger than the dataset's available size will never be
                    # exceeded, but it would break our logic
                    if quota_value > pool_sizes[dataset["name"].split("/")[0]]:
                        continue

                    used = quota_value - int(dataset["available"]["rawvalue"])
                elif quota_property == "refquota":
                    used = int(dataset["usedbydataset"]["rawvalue"])
                else:
                    raise RuntimeError()

                used_fraction = 100 * used / quota_value

                critical_threshold = dataset[
                    f"org.freenas:{quota_property}_critical"]
                warning_threshold = dataset[
                    f"org.freenas:{quota_property}_warning"]
                if critical_threshold != 0 and used_fraction >= critical_threshold:
                    klass = QuotaCriticalAlertClass
                elif warning_threshold != 0 and used_fraction >= warning_threshold:
                    klass = QuotaWarningAlertClass
                else:
                    continue

                quota_name = quota_property[0].upper() + quota_property[1:]
                args = {
                    "name": quota_name,
                    "dataset": dataset["name"],
                    "used_fraction": used_fraction,
                    "used": format_size(used),
                    "quota_value": format_size(quota_value),
                }

                mail = None
                owner = self._get_owner(dataset)
                if owner != 0:
                    try:
                        self.middleware.call_sync('user.get_user_obj',
                                                  {'uid': owner})
                        user_exists = True
                    except KeyError:
                        user_exists = False
                        to = None
                        logger.debug("Unable to query bsduser with uid %r",
                                     owner)

                    if user_exists:
                        try:
                            bsduser = self.middleware.call_sync(
                                "datastore.query",
                                "account.bsdusers",
                                [["bsdusr_uid", "=", owner]],
                                {"get": True},
                            )
                            to = bsduser["bsdusr_email"] or None
                        except IndexError:
                            to = None

                    if to is not None:
                        mail = {
                            "to": [to],
                            "subject":
                            f"{hostname}: {quota_name} exceeded on dataset {dataset['name']}",
                            "text": klass.text % args
                        }

                alerts.append(
                    Alert(
                        klass,
                        args=args,
                        key=[dataset["name"], quota_property],
                        mail=mail,
                    ))

        return alerts
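
The warning/critical thresholds come from the org.freenas:* dataset properties, defaulting to 80 and 95 percent. The fraction check, worked through with round numbers:

    quota_value = 100 * 1024**3               # 100 GiB quota
    available = 15 * 1024**3                  # 15 GiB still free
    used = quota_value - available            # 85 GiB
    used_fraction = 100 * used / quota_value  # 85.0
    # 85.0 >= 80 (warning) but < 95 (critical) -> QuotaWarningAlertClass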
Example #22
    async def check(self):
        baseboard_manufacturer = ((await run(
            ["dmidecode", "-s", "baseboard-manufacturer"],
            check=False)).stdout.decode(errors="ignore")).strip()

        failover_hardware = await self.middleware.call("failover.hardware")

        is_gigabyte = baseboard_manufacturer == "GIGABYTE"
        is_m_series = baseboard_manufacturer == "Supermicro" and failover_hardware == "ECHOWARP"

        alerts = []
        for sensor in await self.middleware.call("sensor.query"):
            if is_gigabyte:
                if sensor["value"] is None:
                    continue

                if not (RE_CPUTEMP.match(sensor["name"])
                        or RE_SYSFAN.match(sensor["name"])):
                    continue

                if sensor["lowarn"] and sensor["value"] < sensor["lowarn"]:
                    relative = "below"
                    if sensor["value"] < sensor["locrit"]:
                        level = "critical"
                    else:
                        level = "recommended"
                elif sensor["hiwarn"] and sensor["value"] > sensor["hiwarn"]:
                    relative = "above"
                    if sensor["value"] > sensor["hicrit"]:
                        level = "critical"
                    else:
                        level = "recommended"
                else:
                    continue

                alerts.append(
                    Alert(
                        SensorAlertClass,
                        {
                            "name": sensor["name"],
                            "relative": relative,
                            "level": level,
                            "value": sensor["value"],
                            "desc": sensor["desc"],
                        },
                        key=[sensor["name"], relative, level],
                    ))

            if is_m_series:
                ps_match = re.match("(PS[0-9]+) Status", sensor["name"])
                if ps_match:
                    ps = ps_match.group(1)

                    if sensor["notes"]:
                        alerts.append(
                            Alert(
                                PowerSupplyAlertClass, {
                                    "number": ps,
                                    "errors": ", ".join(sensor["notes"]),
                                }))

        return alerts
Example #23
    async def create(self, args):
        return Alert(VMWareSnapshotDeleteFailedAlertClass, args)
Example #24
    async def __run_alerts(self):
        master_node = "A"
        backup_node = "B"
        run_on_backup_node = False
        if not await self.middleware.call("system.is_freenas"):
            if await self.middleware.call("notifier.failover_licensed"):
                master_node = await self.middleware.call("failover.node")
                try:
                    backup_node = await self.middleware.call(
                        "failover.call_remote", "failover.node")
                    remote_version = await self.middleware.call(
                        "failover.call_remote", "system.version")
                    remote_failover_status = await self.middleware.call(
                        "failover.call_remote", "notifier.failover_status")
                except Exception:
                    pass
                else:
                    if remote_version == await self.middleware.call(
                            "system.version"):
                        if remote_failover_status == "BACKUP":
                            run_on_backup_node = True

        for alert_source in ALERT_SOURCES.values():
            if not alert_source.schedule.should_run(
                    datetime.utcnow(),
                    self.alert_source_last_run[alert_source.name]):
                continue

            self.alert_source_last_run[alert_source.name] = datetime.utcnow()

            self.logger.trace("Running alert source: %r", alert_source.name)

            try:
                alerts_a = await self.__run_source(alert_source.name)
            except UnavailableException:
                alerts_a = list(self.alerts["A"][alert_source.name].values())
            for alert in alerts_a:
                alert.node = master_node

            alerts_b = []
            if run_on_backup_node and alert_source.run_on_backup_node:
                try:
                    try:
                        alerts_b = await self.middleware.call(
                            "failover.call_remote", "alert.run_source",
                            [alert_source.name])
                    except CallError as e:
                        if e.errno == CallError.EALERTCHECKERUNAVAILABLE:
                            alerts_b = list(
                                self.alerts["B"][alert_source.name].values())
                        else:
                            raise
                    else:
                        alerts_b = [
                            Alert(**dict(
                                alert,
                                level=(AlertLevel(alert["level"])
                                       if alert["level"] is not None else None),
                            ))
                            for alert in alerts_b
                        ]
                except Exception:
                    alerts_b = [
                        Alert(
                            title="Unable to run alert source %(source_name)r on backup node\n%(traceback)s",
                            args={
                                "source_name": alert_source.name,
                                "traceback": traceback.format_exc(),
                            },
                            key="__remote_call_exception__",
                            level=AlertLevel.CRITICAL,
                        )
                    ]
            for alert in alerts_b:
                alert.node = backup_node

            for alert in alerts_a + alerts_b:
                existing_alert = self.alerts[alert.node][
                    alert_source.name].get(alert.key)

                alert.source = alert_source.name
                if existing_alert is None:
                    alert.datetime = datetime.utcnow()
                else:
                    alert.datetime = existing_alert.datetime
                alert.level = alert.level or alert_source.level
                alert.title = alert.title or alert_source.title
                if existing_alert is None:
                    alert.dismissed = False
                else:
                    alert.dismissed = existing_alert.dismissed

            self.alerts["A"][alert_source.name] = {
                alert.key: alert
                for alert in alerts_a
            }
            self.alerts["B"][alert_source.name] = {
                alert.key: alert
                for alert in alerts_b
            }
Example #25
    def check_sync(self):
        alerts = []

        if not self.middleware.call_sync('failover.licensed'):
            return alerts

        if not self.middleware.call_sync('failover.internal_interfaces'):
            alerts.append(Alert(FailoverInterfaceNotFoundAlertClass))
            return alerts

        try:
            self.middleware.call_sync('failover.call_remote', 'core.ping')

            local_version = self.middleware.call_sync('system.version')
            remote_version = self.middleware.call_sync('failover.call_remote',
                                                       'system.version')
            if local_version != remote_version:
                return [Alert(TrueNASVersionsMismatchAlertClass)]

            if not self.middleware.call_sync('failover.call_remote',
                                             'system.ready'):
                raise UnavailableException()

            local = self.middleware.call_sync('failover.vip.get_states')
            remote = self.middleware.call_sync('failover.call_remote',
                                               'failover.vip.get_states')

            errors = self.middleware.call_sync('failover.vip.check_states',
                                               local, remote)
            for error in errors:
                alerts.append(
                    Alert(
                        CARPStatesDoNotAgreeAlertClass,
                        {"error": error},
                    ))

        except CallError as e:
            if e.errno != errno.ECONNREFUSED:
                return [Alert(FailoverStatusCheckFailedAlertClass, [str(e)])]

        status = self.middleware.call_sync('failover.status')

        if status == 'ERROR':
            errmsg = None
            if os.path.exists('/tmp/.failover_failed'):
                with open('/tmp/.failover_failed', 'r') as fh:
                    errmsg = fh.read()
            if not errmsg:
                errmsg = 'Unknown error'

            alerts.append(Alert(FailoverFailedAlertClass, [errmsg]))

        elif status not in ('MASTER', 'BACKUP', 'SINGLE'):
            alerts.append(Alert(ExternalFailoverLinkStatusAlertClass))

        internal_ifaces = self.middleware.call_sync(
            'failover.internal_interfaces')
        if internal_ifaces:
            p1 = subprocess.Popen(
                "/sbin/ifconfig %s|grep -E 'vhid (10|20) '|grep 'carp:'" %
                internal_ifaces[0],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                shell=True,
                encoding='utf8',
            )
            stdout = p1.communicate()[0].strip()
            if status != "SINGLE" and stdout.count("\n") != 1:
                alerts.append(Alert(InternalFailoverLinkStatusAlertClass))

        if status != "SINGLE":
            try:
                if sysctl.filter('kern.cam.ctl.ha_link')[0].value == 1:
                    alerts.append(Alert(CTLHALinkAlertClass))
            except Exception:
                pass

        if status == 'BACKUP':
            fobj = None
            try:
                with open(FAILOVER_JSON, 'r') as f:
                    fobj = json.loads(f.read())
            except Exception:
                pass
            try:
                if len(fobj['phrasedvolumes']) > 0:
                    keys = self.middleware.call_sync(
                        'failover.encryption_keys')['geli']
                    not_found = False
                    for pool in fobj['phrasedvolumes']:
                        if pool not in keys:
                            not_found = True
                            alerts.append(
                                Alert(NoFailoverPassphraseKeysAlertClass,
                                      {'pool': pool}))
                    if not_found:
                        # Kick off a key sync from the peer if any passphrase key is missing.
                        self.middleware.call_sync(
                            'failover.call_remote',
                            'failover.sync_keys_to_remote_node')
            except Exception:
                pass

        return alerts
Example #26
    async def create(self, args):
        return Alert(RsyncFailedAlertClass, args, key=args['id'])
Example #27
    async def test(self, data):
        """
        Send a test alert using `type` of Alert Service.

        .. examples(websocket)::

          Send a test alert using Alert Service of Mail `type`.

            :::javascript
            {
                "id": "6841f242-840a-11e6-a437-00e04d680384",
                "msg": "method",
                "method": "alertservice.test",
                "params": [{
                    "name": "Test Email Alert",
                    "enabled": true,
                    "type": "Mail",
                    "attributes": {
                        "email": "*****@*****.**"
                    },
                    "settings": {}
                }]
            }
        """
        await self._validate(data, "alert_service_test")

        factory = ALERT_SERVICES_FACTORIES.get(data["type"])
        if factory is None:
            self.logger.error("Alert service %r does not exist", data["type"])
            return False

        try:
            alert_service = factory(self.middleware, data["attributes"])
        except Exception:
            self.logger.error(
                "Error creating alert service %r with parameters=%r",
                data["type"],
                data["attributes"],
                exc_info=True)
            return False

        master_node = "A"
        if not await self.middleware.call("system.is_freenas"):
            if await self.middleware.call("failover.licensed"):
                master_node = await self.middleware.call("failover.node")

        test_alert = Alert(
            TestAlertClass,
            node=master_node,
            datetime=datetime.utcnow(),
            last_occurrence=datetime.utcnow(),
            _uuid="test",
        )

        try:
            await alert_service.send([test_alert], [], [test_alert])
        except Exception:
            self.logger.error("Error in alert service %r",
                              data["type"],
                              exc_info=True)
            return False

        return True
Example #28
    async def create(self, args):
        return Alert(CatalogSyncFailedAlertClass, args, _key=args['catalog'])
Example #29
    async def __run_alerts(self):
        master_node = "A"
        backup_node = "B"
        product_type = await self.middleware.call("alert.product_type")
        run_on_backup_node = False
        run_failover_related = False
        if product_type == "ENTERPRISE":
            if await self.middleware.call("failover.licensed"):
                if await self.middleware.call("failover.node") == "B":
                    master_node = "B"
                    backup_node = "A"
                try:
                    remote_version = await self.middleware.call(
                        "failover.call_remote", "system.version")
                    remote_system_state = await self.middleware.call(
                        "failover.call_remote", "system.state")
                    remote_failover_status = await self.middleware.call(
                        "failover.call_remote", "failover.status")
                except Exception:
                    pass
                else:
                    if remote_version == await self.middleware.call(
                            "system.version"):
                        if remote_system_state == "READY" and remote_failover_status == "BACKUP":
                            run_on_backup_node = True

            run_failover_related = (
                time.monotonic() > self.blocked_failover_alerts_until)

        for k, source_lock in list(self.sources_locks.items()):
            if source_lock.expires_at <= time.monotonic():
                await self.unblock_source(k)

        for alert_source in ALERT_SOURCES.values():
            if product_type not in alert_source.products:
                continue

            if alert_source.failover_related and not run_failover_related:
                continue

            if not alert_source.schedule.should_run(
                    datetime.utcnow(),
                    self.alert_source_last_run[alert_source.name]):
                continue

            self.alert_source_last_run[alert_source.name] = datetime.utcnow()

            alerts_a = [
                alert for alert in self.alerts if alert.node == master_node
                and alert.source == alert_source.name
            ]
            locked = False
            if self.blocked_sources[alert_source.name]:
                self.logger.debug(
                    "Not running alert source %r because it is blocked",
                    alert_source.name)
                locked = True
            else:
                self.logger.trace("Running alert source: %r",
                                  alert_source.name)

                try:
                    alerts_a = await self.__run_source(alert_source.name)
                except UnavailableException:
                    pass
            for alert in alerts_a:
                alert.node = master_node

            alerts_b = []
            if run_on_backup_node and alert_source.run_on_backup_node:
                try:
                    alerts_b = [
                        alert for alert in self.alerts
                        if alert.node == backup_node
                        and alert.source == alert_source.name
                    ]
                    try:
                        if not locked:
                            alerts_b = await self.middleware.call(
                                "failover.call_remote", "alert.run_source",
                                [alert_source.name])

                            alerts_b = [
                                Alert(
                                    **{k: v for k, v in alert.items()
                                       if k in ["args", "datetime", "last_occurrence",
                                                "dismissed", "mail"]},
                                    klass=AlertClass.class_by_name[alert["klass"]],
                                    _source=alert["source"],
                                    _key=alert["key"],
                                )
                                for alert in alerts_b
                            ]
                    except CallError as e:
                        if e.errno in [
                                errno.ECONNABORTED, errno.ECONNREFUSED,
                                errno.ECONNRESET, errno.EHOSTDOWN,
                                errno.ETIMEDOUT,
                                CallError.EALERTCHECKERUNAVAILABLE
                        ]:
                            pass
                        else:
                            raise
                except ReserveFDException:
                    self.logger.debug('Failed to reserve a privileged port')
                except Exception:
                    alerts_b = [
                        Alert(AlertSourceRunFailedOnBackupNodeAlertClass,
                              args={
                                  "source_name": alert_source.name,
                                  "traceback": traceback.format_exc(),
                              },
                              _source=alert_source.name)
                    ]

            for alert in alerts_b:
                alert.node = backup_node

            for alert in alerts_a + alerts_b:
                self.__handle_alert(alert)

            self.alerts = (
                [a for a in self.alerts if a.source != alert_source.name] +
                alerts_a + alerts_b)
Example #30
    async def create(self, args):
        return Alert(RsyncSuccessAlertClass, args, key=args['id'])