Example #1
def check_running_firmware(client: Client) -> State:
    """current running firmware"""
    # Verify only one version from firmwareRunning and firmwareCtrlrRunning
    versions = set()
    for record in client.get_class("firmwareRunning"):
        versions.add(record["peVer"])
    for record in client.get_class("fimrwareCtrlrRunning"):
        versions.add(record["version"])
    if len(versions) > 1:
        log.warning("Multiple firmware versions found",
                    versions=list(versions))
    elif client.args["debug"] and len(versions) > 0:
        log.debug("Firmware:", version=versions.pop())
    return State.OK
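
These snippets share a few names defined elsewhere in the project: State (the check result), log (a structlog-style keyword logger), and Client (a thin wrapper over the APIC REST API). A minimal sketch of the assumed shapes, for reading purposes only:

from enum import Enum
from typing import Dict, List

class State(Enum):
    OK = "ok"
    FAIL = "fail"

class Client:
    """Assumed interface: get_class() returns the flattened attribute
    dict of every MO of the given class."""

    def get_class(self, class_name: str, **kwargs) -> List[Dict[str, str]]:
        ...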
Example #2
def check_maintenance_groups(client: Client) -> State:
    """switches are in maintenance groups"""
    # Verify all switches from topSystem are also in maintUpgJob objects
    job_dns = []
    for job in client.get_class("maintUpgJob"):
        if job.get("maintGrp", "") != "" and job["dn"].startswith("topology"):
            job_dns.append(get_node_dn(job["dn"]))
    for device in client.get_class("topSystem"):
        if device["role"] == "spine" or device["role"] == "leaf":
            if get_node_dn(device["dn"]) not in job_dns:
                log.warning("Device not in maintenance group",
                            name=device["name"])
                return State.FAIL
    log.debug("All devices in maintenance groups")
    return State.OK
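
Several checks call get_node_dn() to key records by switch. The helper isn't shown on this page; a plausible sketch, assuming ACI's usual topology/pod-N/node-N DN prefix:

import re

def get_node_dn(dn: str) -> str:
    # Reduce a full object DN to its node prefix, e.g.
    # "topology/pod-1/node-101/sys/..." -> "topology/pod-1/node-101"
    match = re.match(r"topology/pod-\d+/node-\d+", dn)
    return match.group(0) if match else dn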
Example #3
def check_apic_cluster(client: Client) -> State:
    """APIC cluster state"""
    # Verify health == 'fully-fit' in infraWiNode
    for controller in client.get_class("infraWiNode"):
        if controller.get("health") != "fully-fit":
            log.warning("not fully-fit")
            return State.FAIL
    return State.OK
Example #4
def check_switch_scale(client: Client) -> State:
    """per-switch scale"""
    # Verify counts from ctxClassCnt are < limits from fvcapRule
    from collections import defaultdict

    metrics = defaultdict(lambda: defaultdict(dict))
    # map ctxClassCnt counts to fvcapRule limits
    count_to_limit = {"l2BD": "fvBD", "fvEpP": "fvCEp", "l3Dom": "fvCtx"}
    # Build dict with device/mo/metric
    counts = client.get_class("ctxClassCnt",
                              params={"rsp-subtree-class": "l2BD,fvEpP,l3Dom"})
    for record in counts:
        node_dn = get_node_dn(record["dn"])
        key = count_to_limit.get(record["name"])
        if key:
            metrics[node_dn][key]["count"] = get_path(int, record, "count")

    # Add limits to the metrics dict
    limits = client.get_class("fvcapRule", cache=True)
    for record in limits:
        if record["dn"].startswith("topology"):
            node_dn = get_node_dn(record["dn"])
            subj = record["subj"]
            if node_dn in metrics and subj in count_to_limit.values():
                limit = get_path(int, record, "constraint")
                metrics[node_dn][subj]["limit"] = limit

    # Validate metrics
    over_limit = False
    for node_dn, by_mo in metrics.items():
        for mo, metric in by_mo.items():
            count = metric.get("count", 0)
            limit = metric.get("limit", 0)
            if count > 0 and count >= limit:
                over_limit = True
                log.warning(f"Over scale limit on {node_dn}",
                            mo=mo,
                            count=count,
                            limit=limit)
            if client.args["debug"]:
                log.debug(f"Scale metric on {node_dn}:",
                          mo=mo,
                          count=count,
                          limit=limit)
    return State.FAIL if over_limit else State.OK
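
check_switch_scale also leans on get_path(), which safely walks a nested response and casts the leaf; it is called as get_path(int, record, "count") here and with a list index in check_fabric_scale below. It isn't shown either; a minimal sketch consistent with those call sites:

def get_path(cast, obj, *keys):
    # Walk nested dicts/lists by key or index; on any miss, fall back to
    # the cast's zero value (e.g. int() == 0).
    try:
        for key in keys:
            obj = obj[key]
        return cast(obj)
    except (KeyError, IndexError, TypeError, ValueError):
        return cast()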
Example #5
def check_vpc_health(client: Client) -> State:
    """vPC health"""
    # Verify peerSt == 'up' for vpcDom
    for vpc in client.get_class("vpcDom"):
        if vpc["peerSt"] != "up":
            log.warning("vPC not up", id=vpc["id"], state=vpc["peerSt"])
            return State.FAIL
    log.debug("All vPCs are up")
    return State.OK
Example #6
def get_interpod_routes(client: Client) -> List[Dict[str, str]]:
    """Get current inter-pod routes"""
    tep_queries = []
    for pod in client.get_class("fabricSetupP"):
        if pod.get("podType") == "physical":
            tep_pool = pod.get("tepPool")
            if tep_pool:
                tep_queries.append(f'eq(isisRoute.pfx,"{tep_pool}")')
    if tep_queries:
        tep_query = ",".join(tep_queries)
        return client.get_class(
            "isisRoute",
            params={
                "rsp-subtree-include": "relations",
                "query-target-filter": f"or({tep_query})",
            },
        )
    return []
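
A hypothetical call site; with TEP pools 10.0.0.0/16 and 10.1.0.0/16 (values invented for illustration), the generated query-target-filter would be or(eq(isisRoute.pfx,"10.0.0.0/16"),eq(isisRoute.pfx,"10.1.0.0/16")).

for route in get_interpod_routes(client):
    log.debug("Inter-pod route", prefix=route.get("pfx"))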
Example #7
def check_vcenter(client: Client) -> State:
    """VMware vCenter state"""
    # Verify operSt == 'online' in compCtrlr
    for ctrlr in client.get_class("compCtrlr"):
        if ctrlr.get("operSt", "") != "online":
            log.warning("vCenter offline", name=ctrlr["name"])
            return State.FAIL
    log.debug("All vCenter(s) online")
    return State.OK
Example #8
def is_firmware_downloaded(self, client: Client) -> bool:
    """Is the firmware downloaded for this version?"""
    for record in client.get_class("firmwareFirmware"):
        is_target_ver = record.get("fullVersion") == self.version_str
        is_downloaded = record.get("dnldStatus") == "downloaded"
        if is_target_ver and is_downloaded:
            return True
    return False
Example #9
def check_fabric_scale(client: Client) -> State:
    """fabric-wide scale"""
    # Verify fabric-wide MO counts are < limits from fvcapRule
    over_scale = False
    metrics = {
        "fvCEp": {"name": "endpoints"},
        "fvAEPg": {"name": "EPGs"},
        "fvBD": {"name": "BDs"},
        "fvCtx": {"name": "VRFs"},
        "fvTenant": {"name": "tenants"},
        # The API doesn't provide these limits
        "vzBrCP": {"name": "contracts", "limit": 10000},
        "vzFilter": {"name": "filters", "limit": 10000},
    }
    for record in client.get_class("fvcapRule", cache=True):
        subj = record.get("subj")
        if subj in metrics and record["dn"].startswith("uni"):
            metrics[subj]["limit"] = int(record.get("constraint", 0))

    def get_count(class_name):
        res = client.get(f"/api/class/{class_name}",
                         params={"rsp-subtree-include": "count"})
        return get_path(int, res, 0, "moCount", "attributes", "count")

    for class_name in metrics:
        metrics[class_name]["count"] = get_count(class_name)

    for class_name, metric in metrics.items():
        # TODO validate scenario where limit isn't found
        if "limit" in metric and metric["count"] > metric["limit"]:
            over_scale = True
            log.warning(f"Over scale limit for {class_name}:", **metric)
        elif "limit" in metric and client.args["debug"]:
            log.debug(
                f'Scale for {metric["name"]}:',
                count=metric["count"],
                limit=metric["limit"],
                mo=class_name,
            )
    return State.FAIL if over_scale else State.OK
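
get_count() depends on the rsp-subtree-include=count query returning a single moCount object; that response shape is implied by the get_path() call. A toy check of the parsing, with an invented count value, using the get_path() sketch above:

sample = [{"moCount": {"attributes": {"count": "12345"}}}]
assert get_path(int, sample, 0, "moCount", "attributes", "count") == 12345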
Example #10
def check_dvs(client: Client) -> State:
    """VMware DVS state"""
    # Verify state == 'poweredOn' in compHv
    for dvs in client.get_class("compHv"):
        if dvs.get("state", "") != "poweredOn":
            log.warning("vSwitch offline", name=dvs["name"])
            return State.FAIL
    log.debug("All vSwitch(s) online")
    return State.OK
Example #11
def check_ntp_state(client: Client) -> State:
    """NTP sync"""
    # Verify srvStatus == 'synced' in datetimeClkPol
    synced_peers = set()
    for ntp in client.get_class("datetimeClkPol"):
        if "synced" in ntp.get("srvStatus", ""):
            synced_peers.add(ntp["dn"])
    if not synced_peers:
        log.warning("NTP not synced to at least 1 peer")
        return State.FAIL
    log.debug("NTP synced.")
    return State.OK
Example #12
def check_apic_interfaces(client: Client) -> State:
    """APIC interfaces state"""
    # Verify operSt == 'up' for at least 2 ints in cnwPhysIf
    apic_ints = defaultdict(set)
    for record in client.get_class("cnwPhysIf"):
        node_dn = get_node_dn(record["dn"])
        if record.get("operSt", "") == "up":
            apic_ints[node_dn].add(record["id"])
    for dn, ints in apic_ints.items():
        if len(ints) < 2:
            log.warning("APIC {dn} has < 2 active interfaces")
            return State.FAIL
    return State.OK
Example #13
def check_tcam_scale(client: Client) -> State:
    """per-leaf TCAM scale"""
    # Verify polUsageCum <= polUsageCapCum for eqptcapacityPolUsage5min
    over_limit = False
    for record in client.get_class("eqptcapacityPolUsage5min"):
        node_dn = get_node_dn(record["dn"])
        count = get_path(int, record, "polUsageCum")
        limit = get_path(int, record, "polUsageCapCum")
        if count > 0 and count >= limit:
            over_limit = True
            log.warning(f"Over TCAM scale on {node_dn}",
                        count=count,
                        limit=limit)
        if client.args["debug"]:
            log.debug(f"TCAM scale on {node_dn}", count=count, limit=limit)
    return State.FAIL if over_limit else State.OK
Example #14
def check_backup(client: Client) -> State:
    """last backup status"""
    # Verify executeTime is within last 24hrs for configJob
    recent_backup = False
    latest_backup = None
    last_24hrs = datetime.now() - timedelta(hours=24)
    for backup in client.get_class("configJob"):
        iso_backup_str = backup["executeTime"][:19]
        this_backup_time = datetime.strptime(iso_backup_str,
                                             "%Y-%m-%dT%H:%M:%S")
        if latest_backup is None or this_backup_time > latest_backup:
            latest_backup = this_backup_time
        if this_backup_time >= last_24hrs and backup["operSt"] == "success":
            recent_backup = True
    latest = "None" if latest_backup is None else latest_backup.isoformat()
    if not recent_backup:
        log.warning("Backup not performed within 24 hours", last_backup=latest)
        return State.FAIL
    elif client.args["debug"]:
        log.debug("Last backup performed within 24 hours", last_backup=latest)
    return State.OK
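
The [:19] slice keeps only the YYYY-MM-DDTHH:MM:SS portion of the APIC timestamp, dropping fractional seconds and the UTC offset so strptime can parse it. A standalone illustration with an invented timestamp:

from datetime import datetime

execute_time = "2024-06-01T03:15:00.000+00:00"  # hypothetical value
parsed = datetime.strptime(execute_time[:19], "%Y-%m-%dT%H:%M:%S")
assert parsed.isoformat() == "2024-06-01T03:15:00"

Note that both sides of the comparison in check_backup are naive datetimes, so the check assumes the APIC timestamps and the local clock share a timezone.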
Example #15
def check_firmware_download(client: Client) -> State:
    """firmware download status"""
    # Verify dnldStatus == 'downloaded' in firmwareFirmware
    # Note: this checks all firmware downloads; not just target code
    for record in client.get_class("firmwareFirmware"):
        if "fullVersion" not in record:
            continue
        status = record.get("dnldStatus", "")
        if client.args["debug"]:
            log.debug(
                "Firmware download status:",
                name=record.get("name"),
                status=record.get("dnldStatus"),
            )
        if status != "downloaded":
            log.warning(
                "Failed firmware download",
                name=record.get("name"),
                description=record.get("description"),
                status=record.get("dnldStatus"),
            )
            return State.FAIL
    return State.OK
Example #16
def get_faults(client: Client) -> List[Dict[str, str]]:
    """Get current fault list"""
    return client.get_class("faultInst")
Example #17
def get_maint_job(client: Client, group: str) -> List[Dict[str, str]]:
    """Fetch maintenance job for current group"""
    return client.get_class(
        "maintUpgJob",
        params={"query-target-filter": f'eq(maintUpgJob.maintGrp,"{group}")'},
    )
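
A hypothetical call site (group name invented), logging only attributes the examples above already rely on:

for job in get_maint_job(client, "group-a"):
    log.debug("Maintenance job", dn=job["dn"], group=job.get("maintGrp"))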
Example #18
def get_devices(client: Client) -> List[Dict[str, str]]:
    """Get current device list"""
    return client.get_class("topSystem")