def get_summaries(self, items, item_type, get_hour_data_function):
    """Summarise one hour of status records per item and log SLA violations.

    For every item, the status records of ``self.current_hour`` are fetched
    through ``get_hour_data_function`` and reduced to an availability
    percentage and a mean response time. A "WARNING" event is logged when a
    threshold configured on the item is violated.

    Args:
        items: Iterable of dicts. Each needs "id" and "name"; the optional
            keys "sla_response_time" and "sla_availability" enable the
            corresponding SLA checks.
        item_type: Label used in console messages and as the event source.
        get_hour_data_function: Callable ``(item_id, hour)`` returning an
            iterable of records with an "availability" flag and, for
            available records, a "response_time".

    Returns:
        dict mapping each item id to its summary dict with the keys "id",
        "hour", "availability" (0-100) and "response_time" (mean over the
        available records, 0 when there were none).
    """
    log_console(f"Calculating {item_type} summaries for {self.current_hour}")
    hourly_summaries = {}

    for item in items:
        records = get_hour_data_function(item["id"], self.current_hour)

        total_records = 0
        available_records = 0
        response_time_sum = 0
        for record in records:
            total_records += 1
            # Only records that were available carry a usable response time.
            if record["availability"]:
                available_records += 1
                response_time_sum += record["response_time"]

        hourly_summary = {
            "id": item["id"],
            "hour": str(datetime.fromisoformat(self.current_hour)),
            "availability": (
                100 * available_records / total_records if total_records else 0
            ),
            "response_time": (
                response_time_sum / available_records if available_records else 0
            ),
        }

        log_console(
            f"Summary: {item_type} hourly summary for {item['name']}: {hourly_summary}"
        )
        hourly_summaries[item["id"]] = hourly_summary

        # The SLA threshold is compared against response_time / 1000
        # (presumably milliseconds -> seconds; confirm against the recorder).
        rsp_time_in_seconds = hourly_summary["response_time"] / 1000
        if (
            "sla_response_time" in item
            and rsp_time_in_seconds > item["sla_response_time"]
        ):
            info = f"SLA response time violation, {rsp_time_in_seconds:.2f} > {item['sla_response_time']}"
            # isoformat(timespec="milliseconds") always emits ".mmm";
            # str(datetime)[:-3] would cut off ":SS" when microseconds are 0.
            log_event(
                datetime.now().isoformat(sep=" ", timespec="milliseconds"),
                item_type,
                item["name"],
                "WARNING",
                info,
            )

        if (
            "sla_availability" in item
            and hourly_summary["availability"] < item["sla_availability"]
        ):
            info = f"SLA availability violation, {hourly_summary['availability']:.2f} < {item['sla_availability']}"
            log_event(
                datetime.now().isoformat(sep=" ", timespec="milliseconds"),
                item_type,
                item["name"],
                "WARNING",
                info,
            )

    return hourly_summaries
def monitor(self, interval):
    """Periodically fetch and store the running configuration of all devices.

    Loops until ``self.terminate`` becomes true. Each pass re-reads the list
    of device ids, retrieves the live configuration of every device and
    stores it; devices that cannot be read are logged and skipped.

    Args:
        interval: Number of seconds to wait between two passes.
    """
    while not self.terminate:
        device_ids = get_all_device_ids()
        log_console(
            f"Monitor: Beginning Configuration monitoring for {len(device_ids)} devices"
        )

        for device_id in device_ids:
            if self.terminate:
                break

            # re-retrieve device as it may have been changed
            result, device = get_device(device_id=device_id)
            if result != "success":
                log_console(
                    f"Configuration Monitor: Error retrieving device from DB. id: {device_id}, error: {device}"
                )
                continue

            try:
                result, config = get_device_info(device, "config", get_live_info=True)
            except Exception as e:
                # Exception (not BaseException) so KeyboardInterrupt and
                # SystemExit are not swallowed by the monitor.
                log_console(
                    f"!!! Exception getting device info in configuration monitoring for {device['name']}: {repr(e)}"
                )
                continue

            if result != "success":
                log_console(
                    f"!!! Unable to get device info (config) for {device['name']}"
                )
                continue

            # If we made it here, we got the configuration, so store it in the DB
            record_device_config(device_id, config["config"]["running"])
            log_event(
                # Always yields ".mmm"; str(datetime)[:-3] loses ":SS" when
                # the microsecond field happens to be 0.
                datetime.now().isoformat(sep=" ", timespec="milliseconds"),
                "configuration",
                device['name'],
                "INFO",
                f"Stored configuration for: {device['name']}",
            )

        # Wait in slices of at most 10 seconds so termination is noticed
        # quickly, while still honouring intervals that are shorter than, or
        # not a multiple of, 10 seconds.
        remaining = interval
        while remaining > 0 and not self.terminate:
            nap = min(10, remaining)
            sleep(nap)
            remaining -= nap

    log_console("...gracefully exiting monitor:configuration")
def monitor(self, interval):
    """Periodically ping every known host and record its status.

    Loops until ``self.terminate`` becomes true. Each host is pinged three
    times; on success its availability, response time and last-heard
    timestamp are updated, on failure it is marked unavailable and an event
    is logged. The host status is recorded and stored in both cases.

    Args:
        interval: Number of seconds to wait between two passes.
    """
    while not self.terminate:
        hosts = get_all_hosts()
        log_console(f"monitor:host Beginning monitoring for {len(hosts)} hosts")

        for host in hosts:
            if self.terminate:
                break

            log_console(f"--- monitor:host pinging {host['ip_address']}")
            ping_command = [
                "ping",
                "-c3",
                "-n",
                "-i0.5",
                "-W2",
                str(host["ip_address"]),
            ]
            try:
                # check_output raises CalledProcessError on a non-zero exit.
                ping_output = subprocess.check_output(ping_command)
            except subprocess.CalledProcessError:
                host["availability"] = False
                log_event(
                    datetime.now().isoformat(sep=" ", timespec="milliseconds"),
                    "host monitor",
                    host["name"],
                    "INFO",
                    f"Availability failed for host: {host['name']}",
                )
            else:
                host["availability"] = True
                host["response_time"] = get_response_time(str(ping_output))
                # Always yields ".mmm"; str(datetime)[:-3] loses ":SS" when
                # the microsecond field happens to be 0.
                host["last_heard"] = datetime.now().isoformat(
                    sep=" ", timespec="milliseconds"
                )

            record_host_status(host)
            set_host(host)

        # Wait in slices of at most 10 seconds so termination is noticed
        # quickly, while still honouring intervals that are shorter than, or
        # not a multiple of, 10 seconds.
        remaining = interval
        while remaining > 0 and not self.terminate:
            nap = min(10, remaining)
            time.sleep(nap)
            remaining -= nap

    log_console("...gracefully exiting monitor:host")
def monitor(self, interval):
    """Periodically check every known service and record its status.

    Loops until ``self.terminate`` becomes true. For each service the
    availability and response time are obtained from
    ``get_avail_and_rsp_time``; the service status is recorded and stored,
    and a "WARNING" event is logged when the service is unavailable.

    Args:
        interval: Number of seconds to wait between two passes.
    """
    log_console(f"Service monitoring starting, interval={interval}")

    while not self.terminate:
        services = get_all_services()
        log_console(f"Monitor: Beginning monitoring for {len(services)} services")

        for service in services:
            if self.terminate:
                break

            log_console(f"--- service monitor for {service['name']}")
            availability, response_time = get_avail_and_rsp_time(service)
            service["availability"] = availability

            if availability:
                # Stored as whole milliseconds (response_time * 1000).
                service["response_time"] = int(response_time * 1000)
                # Always yields ".mmm"; str(datetime)[:-3] loses ":SS" when
                # the microsecond field happens to be 0.
                service["last_heard"] = datetime.now().isoformat(
                    sep=" ", timespec="milliseconds"
                )

            record_service_status(service)
            set_service(service)

            if not availability:
                log_event(
                    datetime.now().isoformat(sep=" ", timespec="milliseconds"),
                    "service monitor",
                    service["name"],
                    "WARNING",
                    f"Availability failed for service: {service['name']}",
                )

        # Wait in slices of at most 10 seconds so termination is noticed
        # quickly, while still honouring intervals that are shorter than, or
        # not a multiple of, 10 seconds.
        remaining = interval
        while remaining > 0 and not self.terminate:
            nap = min(10, remaining)
            time.sleep(nap)
            remaining -= nap

    log_console("...gracefully exiting monitor:service")
def get_device_status(device):
    """Poll a device once and return its current status.

    Devices using the "napalm" transport with an OS of "ios", "iosxe" or
    "nxos-ssh" are asked for their "environment" info, from which CPU and
    memory figures are derived. All other devices are asked for live "facts",
    which only establishes availability and response time.

    Args:
        device: Device dict; reads the keys "os", "transport" and "name".

    Returns:
        dict with the keys "availability" (bool), "response_time" (whole
        milliseconds or None), "cpu", "memory" and "last_heard" (timestamp
        string or None). Everything except "availability" stays None when the
        device could not be reached.
    """
    device_status = {
        "availability": False,
        "response_time": None,
        "cpu": None,
        "memory": None,
        "last_heard": None,
    }

    wants_environment = (
        device["os"] in {"ios", "iosxe", "nxos-ssh"}
        and device["transport"] == "napalm"
    )
    if wants_environment:
        requested, extra_args = "environment", {}
    else:
        requested, extra_args = "facts", {"get_live_info": True}

    env = None
    response_time = None
    try:
        # monotonic() cannot go backwards, unlike the wall clock, so the
        # measured duration is never negative.
        time_start = time.monotonic()
        result, payload = get_device_info(device, requested, **extra_args)
        response_time = time.monotonic() - time_start
        if wants_environment:
            env = payload
    except Exception as e:
        # Exception (not BaseException) so KeyboardInterrupt and SystemExit
        # are not reported as a device failure.
        info = f"!!! Exception in monitoring device, get {requested}: {repr(e)}"
        log_console(info)
        log_event(
            datetime.now().isoformat(sep=" ", timespec="milliseconds"),
            "device",
            device["name"],
            "SEVERE",
            info,
        )
        result = "failed"

    if result != "success":
        log_event(
            datetime.now().isoformat(sep=" ", timespec="milliseconds"),
            "device monitor",
            device["name"],
            "SEVERE",
            f"Availability failed for device: {device['name']}",
        )
        return device_status

    device_status["availability"] = True
    # "is not None" so that a measured duration of 0 is still reported.
    if response_time is not None:
        device_status["response_time"] = int(response_time * 1000)
    # Always yields ".mmm"; str(datetime)[:-3] loses ":SS" when the
    # microsecond field happens to be 0.
    device_status["last_heard"] = datetime.now().isoformat(
        sep=" ", timespec="milliseconds"
    )

    if env:
        device_status["cpu"] = calculate_cpu(env["environment"]["cpu"])
        device_status["memory"] = calculate_memory(env["environment"]["memory"])

    return device_status
def monitor(self, interval):
    """Periodically refresh and record the status of every device.

    Loops until ``self.terminate`` becomes true. "HTTP-REST" devices are not
    polled; they are only marked unavailable once their last-heard timestamp
    is older than ``MAX_NOT_HEARD_SECONDS``. Every other device has its
    hostname resolved and is polled through ``get_device_status``; the result
    is copied onto the device, recorded and stored.

    Args:
        interval: Number of seconds to wait between two passes.
    """
    while not self.terminate:
        # We get device IDs every time through, so that we can then re-retrieve the device object.
        # The reason for this is because other entities may have changed device (e.g. SDWAN heartbeats)
        device_ids = get_all_device_ids()
        log_console(f"Monitor: Beginning monitoring for {len(device_ids)} devices")

        for device_id in device_ids:
            # Checked before any work so a shutdown request is not delayed by
            # a DB read and a DNS lookup.
            if self.terminate:
                break

            # re-retrieve device as it may have been changed
            result, device = get_device(device_id=device_id)
            if result != "success":
                log_console(
                    f"Device Monitor: Error retrieving device from DB. id: {device_id}, error: {device}"
                )
                continue

            if device["transport"] == "HTTP-REST":
                # HTTP-REST devices (e.g. sdwan) communicate to us, we don't poll them
                if not device["last_heard"]:
                    continue

                try:
                    last_heard_time = datetime.strptime(
                        device["last_heard"], "%Y-%m-%d %H:%M:%S.%f"
                    )
                except ValueError:
                    # One malformed timestamp must not stop monitoring of
                    # every other device.
                    log_console(
                        f"Device Monitor: Unparseable last_heard {device['last_heard']!r} for {device['name']}, skipping"
                    )
                    continue

                print(f"now: {datetime.now()}, last_heard: {last_heard_time}")
                if (datetime.now() - last_heard_time) > timedelta(
                    seconds=MAX_NOT_HEARD_SECONDS
                ):
                    device["availability"] = False
                    record_device_status(device)
                    set_device(device)
                continue

            try:
                ip_address = socket.gethostbyname(device["hostname"])
            except OSError as e:
                # socket.error is an alias of OSError and socket.gaierror is a
                # subclass of it, so this covers both.
                # NOTE: despite the message text, the device is still polled
                # below, just without a resolved IP address.
                info = f"!!! Caught socket error {repr(e)}, continuing to next device"
                log_console(info)
                log_event(
                    datetime.now().isoformat(sep=" ", timespec="milliseconds"),
                    "device",
                    device['name'],
                    "SEVERE",
                    info,
                )
                ip_address = None

            log_console(f"--- monitor:device get environment {device['name']}")
            device_status = get_device_status(device)

            device["ip_address"] = ip_address
            for field in ("availability", "response_time", "cpu", "memory"):
                device[field] = device_status[field]
            # Keep the previous last_heard when this poll did not produce one.
            if device_status["last_heard"]:
                device["last_heard"] = device_status["last_heard"]

            record_device_status(device)
            set_device(device)

        # Wait in slices of at most 10 seconds so termination is noticed
        # quickly, while still honouring intervals that are shorter than, or
        # not a multiple of, 10 seconds.
        remaining = interval
        while remaining > 0 and not self.terminate:
            nap = min(10, remaining)
            sleep(nap)
            remaining -= nap

    log_console("...gracefully exiting monitor:device")