def check_running_firmware(client: Client) -> State:
    """current running firmware"""
    # Verify only one version from firmwareRunning and firmwareCtrlrRunning
    versions = set()
    for record in client.get_class("firmwareRunning"):
        versions.add(record["peVer"])
    for record in client.get_class("firmwareCtrlrRunning"):
        versions.add(record["version"])
    if len(versions) > 1:
        log.warning("Multiple firmware versions found", versions=list(versions))
    elif client.args["debug"] and len(versions) > 0:
        log.debug("Firmware:", version=versions.pop())
    return State.OK
def check_maintenance_groups(client: Client) -> State:
    """switches are in maintenance groups"""
    # Verify all switches from topSystem are also in maintUpgJob objects
    job_dns = []
    for job in client.get_class("maintUpgJob"):
        if job.get("maintGrp", "") != "" and job["dn"].startswith("topology"):
            job_dns.append(get_node_dn(job["dn"]))
    for device in client.get_class("topSystem"):
        if device["role"] in ("spine", "leaf"):
            if get_node_dn(device["dn"]) not in job_dns:
                log.warning("Device not in maintenance group", name=device["name"])
                return State.FAIL
    log.debug("All devices in maintenance groups")
    return State.OK
def check_apic_cluster(client: Client) -> State:
    """APIC cluster state"""
    # Verify health == 'fully-fit' in infraWiNode
    for controller in client.get_class("infraWiNode"):
        if controller.get("health") != "fully-fit":
            log.warning(
                "APIC cluster node not fully-fit",
                dn=controller.get("dn"),
                health=controller.get("health"),
            )
            return State.FAIL
    return State.OK
def check_switch_scale(client: Client) -> State:
    """per-switch scale"""
    # Verify counts from ctxClassCnt are < limits from fvcapRule
    from collections import defaultdict

    metrics = defaultdict(lambda: defaultdict(dict))
    # Map ctxClassCnt counts to fvcapRule limits
    count_to_limit = {"l2BD": "fvBD", "fvEpP": "fvCEp", "l3Dom": "fvCtx"}
    # Build dict with device/mo/metric
    counts = client.get_class("ctxClassCnt", params={"rsp-subtree-class": "l2BD,fvEpP,l3Dom"})
    for record in counts:
        node_dn = get_node_dn(record["dn"])
        key = count_to_limit.get(record["name"])
        if key:
            metrics[node_dn][key]["count"] = get_path(int, record, "count")
    # Add limits to the metrics dict
    limits = client.get_class("fvcapRule", cache=True)
    for record in limits:
        if record["dn"].startswith("topology"):
            node_dn = get_node_dn(record["dn"])
            subj = record["subj"]
            if node_dn in metrics and subj in count_to_limit.values():
                limit = get_path(int, record, "constraint")
                metrics[node_dn][subj]["limit"] = limit
    # Validate metrics
    over_limit = False
    for node_dn, by_mo in metrics.items():
        for mo, metric in by_mo.items():
            count = metric.get("count", 0)
            limit = metric.get("limit", 0)
            if count > 0 and count >= limit:
                over_limit = True
                log.warning(f"Over scale limit on {node_dn}", mo=mo, count=count, limit=limit)
            if client.args["debug"]:
                log.debug(f"Scale metric on {node_dn}:", mo=mo, count=count, limit=limit)
    return State.FAIL if over_limit else State.OK
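# Note: get_node_dn() and get_path() are helpers defined elsewhere in this
# module. A minimal sketch of the assumed behaviour, for illustration only
# (the real implementations may differ):
#
#   def get_node_dn(dn: str) -> str:
#       """Assumed: trims a full object DN to its topology/pod-x/node-y prefix."""
#       return "/".join(dn.split("/")[:3])
#
#   def get_path(cast, obj, *keys, default=0):
#       """Assumed: safely walks nested dicts/lists by key/index, then casts."""
#       for key in keys:
#           try:
#               obj = obj[key]
#           except (KeyError, IndexError, TypeError):
#               return default
#       try:
#           return cast(obj)
#       except (TypeError, ValueError):
#           return default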
def check_vpc_health(client: Client) -> State:
    """vPC health"""
    # Verify peerSt == 'up' for vpcDom
    for vpc in client.get_class("vpcDom"):
        if vpc["peerSt"] != "up":
            log.warning("vPC not up", id=vpc["id"], state=vpc["peerSt"])
            return State.FAIL
    log.debug("All vPCs are up")
    return State.OK
def get_interpod_routes(client: Client) -> List[Dict[str, str]]:
    """Get current inter-pod routes"""
    tep_queries = []
    for pod in client.get_class("fabricSetupP"):
        if pod.get("podType") == "physical":
            tep_pool = pod.get("tepPool")
            if tep_pool:
                tep_queries.append(f'eq(isisRoute.pfx,"{tep_pool}")')
    if len(tep_queries) > 0:
        tep_query = ",".join(tep_queries)
        return client.get_class(
            "isisRoute",
            params={
                "rsp-subtree-include": "relations",
                "query-target-filter": f"or({tep_query})",
            },
        )
    return []
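# For illustration: with two physical pods whose TEP pools are, say,
# 10.0.0.0/16 and 10.1.0.0/16, the filter built above would be
#   or(eq(isisRoute.pfx,"10.0.0.0/16"),eq(isisRoute.pfx,"10.1.0.0/16"))
# (example pool values are hypothetical).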
def check_vcenter(client: Client) -> State:
    """VMware vCenter state"""
    # Verify operSt == 'online' in compCtrlr
    for ctrlr in client.get_class("compCtrlr"):
        if ctrlr.get("operSt", "") != "online":
            log.warning("vCenter offline", name=ctrlr["name"])
            return State.FAIL
    log.debug("All vCenter(s) online")
    return State.OK
def is_firmware_downloaded(self, client: Client) -> bool:
    """Is the firmware downloaded for this version?"""
    for record in client.get_class("firmwareFirmware"):
        is_target_ver = record.get("fullVersion") == self.version_str
        is_downloaded = record.get("dnldStatus") == "downloaded"
        if is_target_ver and is_downloaded:
            return True
    return False
def check_fabric_scale(client: Client) -> State:
    """fabric-wide scale"""
    # Verify fabric-wide MO counts are < limits from fvcapRule
    over_scale = False
    metrics = {
        "fvCEp": {"name": "endpoints"},
        "fvAEPg": {"name": "EPGs"},
        "fvBD": {"name": "BDs"},
        "fvCtx": {"name": "VRFs"},
        "fvTenant": {"name": "tenants"},
        # API doesn't provide these limits
        "vzBrCP": {"name": "contracts", "limit": 10000},
        "vzFilter": {"name": "filters", "limit": 10000},
    }
    for record in client.get_class("fvcapRule", cache=True):
        subj = record.get("subj")
        if subj in metrics and record["dn"].startswith("uni"):
            metrics[subj]["limit"] = int(record.get("constraint", 0))

    def get_count(class_name):
        res = client.get(f"/api/class/{class_name}", params={"rsp-subtree-include": "count"})
        return get_path(int, res, 0, "moCount", "attributes", "count")

    for class_name in metrics:
        metrics[class_name]["count"] = get_count(class_name)
    for class_name, metric in metrics.items():
        # TODO validate scenario where limit isn't found
        if "limit" in metric and metric["count"] > metric["limit"]:
            over_scale = True
            log.warning(f"Over scale limit for {class_name}:", **metric)
        elif "limit" in metric and client.args["debug"]:
            log.debug(
                f'Scale for {metric["name"]}:',
                count=metric["count"],
                limit=metric["limit"],
                mo=class_name,
            )
    return State.FAIL if over_scale else State.OK
def check_dvs(client: Client) -> State:
    """VMware DVS state"""
    # Verify state == 'poweredOn' in compHv
    for dvs in client.get_class("compHv"):
        if dvs.get("state", "") != "poweredOn":
            log.warning("vSwitch offline", name=dvs["name"])
            return State.FAIL
    log.debug("All vSwitch(s) online")
    return State.OK
def check_ntp_state(client: Client) -> State:
    """NTP sync"""
    # Verify srvStatus == 'synced' in datetimeClkPol
    synced_peers = set()
    for ntp in client.get_class("datetimeClkPol"):
        if "synced" in ntp.get("srvStatus", ""):
            synced_peers.add(ntp["dn"])
    if len(synced_peers) == 0:
        log.warning("NTP not synced to at least 1 peer")
        return State.FAIL
    log.debug("NTP synced.")
    return State.OK
def check_apic_interfaces(client: Client) -> State:
    """APIC interfaces state"""
    # Verify operSt == 'up' for at least 2 interfaces in cnwPhysIf
    apic_ints = defaultdict(set)
    for record in client.get_class("cnwPhysIf"):
        node_dn = get_node_dn(record["dn"])
        if record.get("operSt", "") == "up":
            apic_ints[node_dn].add(record["id"])
    for dn, ints in apic_ints.items():
        if len(ints) < 2:
            log.warning(f"APIC {dn} has < 2 active interfaces")
            return State.FAIL
    return State.OK
def check_tcam_scale(client: Client) -> State:
    """per-leaf TCAM scale"""
    # Verify polUsageCum < polUsageCapCum for eqptcapacityPolUsage5min
    over_limit = False
    for record in client.get_class("eqptcapacityPolUsage5min"):
        node_dn = get_node_dn(record["dn"])
        count = get_path(int, record, "polUsageCum")
        limit = get_path(int, record, "polUsageCapCum")
        if count > 0 and count >= limit:
            over_limit = True
            log.warning(f"Over TCAM scale on {node_dn}", count=count, limit=limit)
        if client.args["debug"]:
            log.debug(f"TCAM scale on {node_dn}", count=count, limit=limit)
    return State.FAIL if over_limit else State.OK
def check_backup(client: Client) -> State:
    """last backup status"""
    # Verify executeTime is within last 24hrs for configJob
    recent_backup = False
    latest_backup = None
    last_24hrs = datetime.now() - timedelta(hours=24)
    for backup in client.get_class("configJob"):
        iso_backup_str = backup["executeTime"][:19]
        this_backup_time = datetime.strptime(iso_backup_str, "%Y-%m-%dT%H:%M:%S")
        if latest_backup is None or this_backup_time > latest_backup:
            latest_backup = this_backup_time
        if this_backup_time >= last_24hrs and backup["operSt"] == "success":
            recent_backup = True
    latest = "None" if latest_backup is None else latest_backup.isoformat()
    if not recent_backup:
        log.warning("Backup not performed within 24 hours", last_backup=latest)
        return State.FAIL
    elif client.args["debug"]:
        log.debug("Last backup performed within 24 hours", last_backup=latest)
    return State.OK
def check_firmware_download(client: Client) -> State:
    """firmware download status"""
    # Verify dnldStatus == 'downloaded' in firmwareFirmware
    # Note: this checks all firmware downloads, not just the target code
    for record in client.get_class("firmwareFirmware"):
        if "fullVersion" not in record:
            continue
        status = record.get("dnldStatus", "")
        if client.args["debug"]:
            log.debug(
                "Firmware download status:",
                name=record.get("name"),
                status=record.get("dnldStatus"),
            )
        if status != "downloaded":
            log.warning(
                "Failed firmware download",
                name=record.get("name"),
                description=record.get("description"),
                status=record.get("dnldStatus"),
            )
            return State.FAIL
    return State.OK
def get_faults(client: Client) -> List[Dict[str, str]]:
    """Get current fault list"""
    return client.get_class("faultInst")
def get_maint_job(client: Client, group: str) -> List[Dict[str, str]]:
    """Fetch maintenance job for current group"""
    return client.get_class(
        "maintUpgJob",
        params={"query-target-filter": f'eq(maintUpgJob.maintGrp,"{group}")'},
    )
def get_devices(client: Client) -> List[Dict[str, str]]:
    """Get current device list"""
    return client.get_class("topSystem")
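# Illustrative only: a minimal sketch of how these checks might be wired
# together. The real entry point, Client construction, and argument handling
# live elsewhere in this repo; the names below (PRE_CHECKS, run_checks) are
# hypothetical.
PRE_CHECKS = [
    check_apic_cluster,
    check_ntp_state,
    check_running_firmware,
    check_backup,
]


def run_checks(client: Client) -> State:
    """Hypothetical runner: execute each check and report FAIL if any check fails."""
    worst = State.OK
    for check in PRE_CHECKS:
        if check(client) == State.FAIL:
            worst = State.FAIL
    return worst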