def import_services(filename=None):
    """Replace the contents of the Service table with entries from a YAML file.

    The file is read from ``intuitiveNMS/data/<filename>`` and is expected to
    hold a list of mappings; each mapping is passed as keyword arguments to
    ``Service``. An entry whose ``id`` was already seen in the same file is
    reported through ``log_event`` and skipped.

    If no filename is given, or the file does not exist, the problem is logged
    and the existing table is left untouched.

    Args:
        filename: Name of the YAML file inside ``intuitiveNMS/data/``.
    """
    if filename is None:
        log_console("Could not import services file: no filename provided")
        return

    try:
        with open("intuitiveNMS/data/" + filename, "r") as import_file:
            services = yaml.safe_load(import_file.read())
    except FileNotFoundError as e:
        log_console(f"Could not import services file: {repr(e)}")
        # Nothing was loaded: stop here instead of iterating an unbound name.
        return

    # Clear the table only once we know there is data to replace it with.
    db.session.query(Service).delete()

    # validate services: make sure no duplicate ids
    ids = set()
    for service in services or []:  # safe_load returns None for an empty file
        if service["id"] in ids:
            log_event(
                str(datetime.now())[:-3],
                "importing services",
                filename,
                "ERROR",
                f"Duplicate service id: {service['id']}",
            )
            continue

        ids.add(service["id"])
        service_obj = Service(**service)
        db.session.add(service_obj)

    db.session.commit()
    return
def portscan_store():
    """Validate a posted portscan result and hand it to ``record_portscan``.

    Reads the JSON body of the current request, checks that all mandatory
    fields are present (in a fixed order, reporting the first one missing)
    and stores the result.

    Returns:
        ``(message, 400)`` when the body is empty or a field is missing,
        otherwise ``({}, 200)``.
    """
    payload = request.get_json()
    if not payload:
        return "Must provide portscan information in JSON body", 400

    # (field, verb) pairs; the verb reproduces the wording of each error text.
    mandatory_fields = (
        ("source", "provide"),
        ("serial", "provide"),
        ("host_ip", "provide"),
        ("host_name", "provide"),
        ("timestamp", "provide"),
        ("scan_output", "include"),
    )
    for field, verb in mandatory_fields:
        if field not in payload:
            return f"Must {verb} '{field}' in portscan information", 400

    record_portscan(payload)
    log_console(
        f"Received portscan store request from {payload['source']} for host {payload['host_name']}"
    )
    return {}, 200
def record_traceroute(traceroute_info):
    """Persist one traceroute result as a ``Traceroute`` row.

    The incoming mapping must contain ``source``, ``target``, ``token``,
    ``timestamp`` and ``traceroute_img``. If any of them is missing, the
    first missing one is logged and nothing is stored. Keys other than these
    five are ignored.

    Args:
        traceroute_info: Mapping with the traceroute result fields.
    """
    # Field -> message logged when that field is absent (checked in this order).
    missing_field_messages = {
        "source": "record_traceroute: missing 'source' in traceroute info",
        "target": "record_traceroute: missing 'target' in traceroute info",
        "token": "record_traceroute: missing 'token' in traceroute_info",
        "timestamp": "record_traceroute: missing 'timestamp' in traceroute info",
        "traceroute_img": "record_traceroute: missing 'traceroute_img' in traceroute info",
    }
    for field, message in missing_field_messages.items():
        if field not in traceroute_info:
            log_console(message)
            return

    # Copy only the known fields so unexpected keys never reach the model.
    row = {field: traceroute_info[field] for field in missing_field_messages}

    db.session.add(Traceroute(**row))
    db.session.commit()
def monitor(self, interval):
    """Periodically flag workers that have not been heard from recently.

    On every pass all workers are fetched; a worker whose ``last_heard``
    timestamp is older than ``MAX_NOT_HEARD_SECONDS`` is marked unavailable,
    and that status is recorded and saved. Workers without a ``last_heard``
    value are skipped. The loop runs until ``self.terminate`` becomes true.

    Args:
        interval: Pause between passes in seconds. It is slept in 10-second
            slices so a terminate request is noticed promptly.
    """
    while not self.terminate:

        workers = get_all_workers()
        log_console(
            f"monitor:worker Beginning monitoring for {len(workers)} workers"
        )

        for worker in workers:

            if self.terminate:
                break

            if not worker["last_heard"]:
                continue

            try:
                last_heard_time = datetime.strptime(
                    worker["last_heard"], "%Y-%m-%d %H:%M:%S.%f"
                )
            except ValueError as e:
                # A single malformed timestamp must not take down the whole
                # monitoring thread; report it and move on to the next worker.
                log_console(
                    f"monitor:worker Unable to parse last_heard '{worker['last_heard']}': {repr(e)}"
                )
                continue

            print(f"now: {datetime.now()}, last_heard: {last_heard_time}")
            if (datetime.now() - last_heard_time) > timedelta(
                seconds=MAX_NOT_HEARD_SECONDS
            ):
                worker["availability"] = False
                record_worker_status(worker)
                set_worker(worker)

        # Sleep in short slices so shutdown does not wait a full interval.
        for _ in range(0, int(interval / 10)):
            time.sleep(10)
            if self.terminate:
                break

    log_console("...gracefully exiting monitor:worker")
def on_portscan_worker_reply(ch, method_frame, properties, portscan_results_str):
    """Handle a portscan reply message and close the channel it arrived on.

    The payload is logged and parsed as JSON. The parsed value is not used
    here, so parsing only validates the payload. The channel is closed even
    when logging or parsing raises; the exception still propagates.

    Args:
        ch: Object exposing ``close()`` (presumably the messaging channel the
            reply arrived on — confirm against the consumer setup).
        method_frame: Unused.
        properties: Unused.
        portscan_results_str: JSON text (``str`` or ``bytes``) of the result.

    Raises:
        json.JSONDecodeError: If the payload is not valid JSON.
    """
    try:
        log_console(f"portscan: received reply: {portscan_results_str}")
        json.loads(portscan_results_str)
    finally:
        # Always release the channel, including on a malformed payload.
        ch.close()
def get_summaries(self, items, item_type, get_hour_data_function):
    """Build per-item summaries for ``self.current_hour`` and log SLA violations.

    For every item the status records of the hour are fetched through
    ``get_hour_data_function(item["id"], self.current_hour)``. Availability is
    the percentage of records flagged available; response time is the mean
    over the available records only. Both stay ``0`` when there is nothing to
    average. A ``WARNING`` event is logged when the item defines
    ``sla_response_time`` / ``sla_availability`` and the summary violates it.

    Args:
        items: Iterable of mappings with at least ``id`` and ``name``.
        item_type: Label used in log output and as the event source.
        get_hour_data_function: Callable ``(item_id, hour)`` returning an
            iterable of records with ``availability`` and ``response_time``.

    Returns:
        dict mapping item id to its summary (``id``, ``hour``,
        ``availability``, ``response_time``).
    """
    log_console(f"Calculating {item_type} summaries for {self.current_hour}")

    summaries_by_id = {}
    for item in items:
        records = get_hour_data_function(item["id"], self.current_hour)

        summary = {
            "id": item["id"],
            "hour": str(datetime.fromisoformat(self.current_hour)),
            "availability": 0,
            "response_time": 0,
        }

        # Single pass: count everything, collect response times of "up" records.
        record_count = 0
        up_response_times = []
        for record in records:
            record_count += 1
            if record["availability"]:
                up_response_times.append(record["response_time"])

        if up_response_times:
            summary["response_time"] = sum(up_response_times) / len(up_response_times)
        if record_count > 0:
            summary["availability"] = (100 * len(up_response_times)) / record_count

        log_console(
            f"Summary: {item_type} hourly summary for {item['name']}: {summary}"
        )
        summaries_by_id[item["id"]] = summary

        # The summary is divided by 1000 before comparing with the SLA value
        # (presumably milliseconds vs. seconds — confirm against the producers).
        rsp_time_in_seconds = summary["response_time"] / 1000
        if (
            "sla_response_time" in item
            and rsp_time_in_seconds > item["sla_response_time"]
        ):
            info = f"SLA response time violation, {rsp_time_in_seconds:.2f} > {item['sla_response_time']}"
            log_event(
                str(datetime.now())[:-3], item_type, item["name"], "WARNING", info
            )

        if (
            "sla_availability" in item
            and summary["availability"] < item["sla_availability"]
        ):
            info = f"SLA availability violation, {summary['availability']:.2f} < {item['sla_availability']}"
            log_event(
                str(datetime.now())[:-3], item_type, item["name"], "WARNING", info
            )

    return summaries_by_id
def monitor(self, interval):
    """Periodically fetch and store the running configuration of every device.

    Each pass retrieves all device ids, re-reads every device, asks
    ``get_device_info`` for its live ``config`` and stores the ``running``
    part through ``record_device_config``, followed by an ``INFO`` event.
    Devices that cannot be read, or whose configuration cannot be fetched,
    are logged and skipped. The loop ends when ``self.terminate`` is set.

    Args:
        interval: Pause between passes in seconds, slept in 10-second slices.
    """
    while not self.terminate:

        all_ids = get_all_device_ids()
        log_console(
            f"Monitor: Beginning Configuration monitoring for {len(all_ids)} devices"
        )

        for device_id in all_ids:

            if self.terminate:
                break

            # re-retrieve device as it may have been changed
            status, device = get_device(device_id=device_id)
            if status != "success":
                log_console(
                    f"Configuration Monitor: Error retrieving device from DB. id: {device_id}, error: {device}"
                )
                continue

            try:
                status, config = get_device_info(
                    device, "config", get_live_info=True
                )
                if status != "success":
                    log_console(
                        f"!!! Unable to get device info (config) for {device['name']}"
                    )
                    continue
            except BaseException as e:
                log_console(
                    f"!!! Exception getting device info in configuration monitoring for {device['name']}: {repr(e)}"
                )
                continue

            # Reaching this point means the configuration was retrieved.
            running_config = config["config"]["running"]
            record_device_config(device_id, running_config)
            log_event(
                str(datetime.now())[:-3],
                "configuration",
                device["name"],
                "INFO",
                f"Stored configuration for: {device['name']}",
            )

        # Short sleep slices keep shutdown responsive.
        for _ in range(int(interval / 10)):
            sleep(10)
            if self.terminate:
                break

    log_console("...gracefully exiting monitor:configuration")
def stop_discovery_thread():
    """Ask the discovery task to terminate, wait for its thread, clear both.

    Nothing is signalled or joined unless both the task and the thread are
    set; the two ``ThreadManager`` references are reset to ``None`` either way.
    """
    log_console("--- ---> Shutting down discovery thread")

    task = ThreadManager.discovery_task
    if task and ThreadManager.discovery_thread:
        task.set_terminate()
        ThreadManager.discovery_thread.join()

    ThreadManager.discovery_task = None
    ThreadManager.discovery_thread = None
def stop_db_maintenance_thread():
    """Ask the DB maintenance task to terminate, wait for its thread, clear both.

    Nothing is signalled or joined unless both the task and the thread are
    set; the two ``ThreadManager`` references are reset to ``None`` either way.
    """
    log_console("--- ---> Shutting down dbmaintenance thread")

    task = ThreadManager.db_maintenance_task
    if task and ThreadManager.db_maintenance_thread:
        task.set_terminate()
        ThreadManager.db_maintenance_thread.join()

    ThreadManager.db_maintenance_task = None
    ThreadManager.db_maintenance_thread = None
def stop_host_thread():
    """Ask the host monitor task to terminate, wait for its thread, clear both.

    Nothing is signalled or joined unless both the task and the thread are
    set; the two ``ThreadManager`` references are reset to ``None`` either way.
    """
    log_console("--- ---> Shutting down host monitoring thread")

    task = ThreadManager.host_monitor_task
    if task and ThreadManager.host_monitor_thread:
        task.set_terminate()
        ThreadManager.host_monitor_thread.join()

    ThreadManager.host_monitor_task = None
    ThreadManager.host_monitor_thread = None
def stop_summaries_thread():
    """Ask the summaries task to terminate, wait for its thread, clear both.

    Nothing is signalled or joined unless both the task and the thread are
    set; the two ``ThreadManager`` references are reset to ``None`` either way.
    """
    log_console("--- ---> Shutting down summaries thread")

    task = ThreadManager.summaries_task
    if task and ThreadManager.summaries_thread:
        task.set_terminate()
        ThreadManager.summaries_thread.join()

    ThreadManager.summaries_task = None
    ThreadManager.summaries_thread = None
def initiate_capture(ip, protocol, port, count):
    """Ask the capture worker responsible for ``ip`` to start a packet capture.

    The monitor for the address is looked up, the matching worker is fetched,
    and the capture request is delivered according to the worker's
    ``connection_type``: published on ``capture_queue`` for ``rabbitmq``, or
    stored as a pending command via ``set_command`` for ``http``. Any other
    connection type results in no action.

    Args:
        ip: Address whose traffic should be captured.
        protocol: Protocol filter; when truthy it is normalised together with
            ``port`` through ``CaptureManager.translate_protocol_and_port``.
        port: Port filter.
        count: Passed through to the worker as ``count``.
    """
    monitor = CaptureManager.find_monitor(ip)
    worker = get_worker(host=monitor, worker_type=CaptureManager.worker_type)
    if worker is None:
        log_console(
            f"Capture Manager: could not find worker, host={monitor}, worker_type={CaptureManager.worker_type} in DB"
        )
        return

    # Translate port and protocol if necessary, e.g. 'http' must become 'tcp', '80'
    if protocol:
        protocol, port = CaptureManager.translate_protocol_and_port(protocol, port)

    # NOTE(review): `interface` is not defined inside this function; it is
    # presumably a module-level name — verify.
    capture_info_json = json.dumps(
        {
            "intuitiveNMS": get_this_ip(),
            "interface": interface,
            "ip": ip,
            "protocol": protocol,
            "port": port,
            "count": count,
        }
    )

    connection_type = worker["connection_type"]
    if connection_type == "rabbitmq":
        channel = CaptureManager.get_channel(monitor)
        channel.basic_publish(
            exchange="", routing_key="capture_queue", body=capture_info_json
        )
        log_console(
            f"Capture Manager: starting capture: ip:{ip} protocol:{protocol} port:{port} count:{count}"
        )

    elif connection_type == "http":
        # Workers on http receive the request later as a stored command.
        set_command(
            {
                "host": worker["host"],
                "serial": worker["serial"],
                "worker_type": CaptureManager.worker_type,
                "command": "start-capture",
                "command_info": capture_info_json,
                "delivered": False,
            }
        )
def shutdown():
    """Run the shutdown sequence: signal all threads, then stop each group.

    ``ThreadManager.initiate_terminate_all_threads`` is called first, followed
    by the individual ``stop_*`` helpers in a fixed order.
    """
    log_console("\n\n\n---> Entering shutdown sequence")

    # Order matters only in that termination is requested before any stop.
    shutdown_steps = (
        ThreadManager.initiate_terminate_all_threads,
        ThreadManager.stop_discovery_thread,
        ThreadManager.stop_host_thread,
        ThreadManager.stop_service_thread,
        ThreadManager.stop_summaries_thread,
        ThreadManager.stop_worker_thread,
        ThreadManager.stop_device_threads,
        ThreadManager.stop_db_maintenance_thread,
    )
    for step in shutdown_steps:
        step()

    log_console("\n---> all threads shut down, terminating.")
def import_compliance(filename=None):
    """Replace the contents of the Compliance table with entries from a YAML file.

    The file is read from ``intuitiveNMS/data/<filename>`` and is expected to
    hold a list of mappings; each mapping is passed as keyword arguments to
    ``Compliance``.

    If no filename is given, or the file does not exist, the problem is logged
    and the existing table is left untouched.

    Args:
        filename: Name of the YAML file inside ``intuitiveNMS/data/``.
    """
    if filename is None:
        log_console("Could not import compliance file: no filename provided")
        return

    try:
        with open("intuitiveNMS/data/" + filename, "r") as import_file:
            standards = yaml.safe_load(import_file.read())
    except FileNotFoundError as e:
        log_console(f"Could not import compliance file: {repr(e)}")
        # Nothing was loaded: stop here instead of iterating an unbound name.
        return

    # Clear the table only once we know there is data to replace it with.
    db.session.query(Compliance).delete()

    for standard in standards or []:  # safe_load returns None for an empty file
        standard_obj = Compliance(**standard)
        db.session.add(standard_obj)

    db.session.commit()
    return
def device_heartbeat():
    """Process a device heartbeat posted as JSON.

    The body must contain ``serial`` and ``name``; the name must resolve to a
    known device whose serial matches. On success the device is marked
    available, ``last_heard`` is refreshed, any optional attributes present in
    the heartbeat are copied onto the device, and the device status is
    recorded and saved.

    Returns:
        ``(message, 400)`` on a validation failure, otherwise ``({}, 200)``.
    """
    heartbeat_info = request.get_json()
    if not heartbeat_info:
        return "Must provide heartbeat information in JSON body", 400

    for required in ("serial", "name"):
        if required not in heartbeat_info:
            return f"Must provide '{required}' in heartbeat information", 400

    result, device = get_device(device_name=heartbeat_info["name"])
    if result != "success":
        return "Unknown device name in heartbeat information", 400
    if heartbeat_info["serial"] != device["serial"]:
        return "Serial number in heartbeat information does not match device serial", 400

    device["availability"] = True
    device["last_heard"] = str(datetime.now())[:-3]

    # Optional attributes: copied only when the heartbeat supplies them.
    optional_fields = (
        "vendor",
        "model",
        "os",
        "version",
        "response_time",
        "cpu",
        "memory",
        "uptime",
    )
    for field in optional_fields:
        if field in heartbeat_info:
            device[field] = heartbeat_info[field]

    record_device_status(device)
    set_device(device)

    log_console(
        f"Received heartbeat from {heartbeat_info['name']}, info={heartbeat_info}"
    )
    return {}, 200
def check_config_compliance(device):
    """Compare a device's configuration with the standard for its vendor/OS.

    The compliance record matching the device's ``vendor`` and ``os`` names a
    standard configuration file, which is diffed against the device through
    ``config_diff``. A non-empty diff is written next to the standard file as
    ``<standard file>.diff.<device name>``.

    Args:
        device: Mapping with at least ``vendor``, ``os`` and ``name``.

    Returns:
        ``True`` only when the diff succeeds and is empty; ``False`` when no
        compliance record exists, the diff fails, or differences were found.
    """
    standard = Compliance.query.filter_by(
        vendor=device["vendor"], os=device["os"]
    ).one_or_none()
    if standard is None:
        log_console(f"!!! Error retrieving compliance record for this device {device['name']}")
        return False

    # NOTE(review): this path is rooted at "quokka/data/", not the
    # "intuitiveNMS/data/" seen elsewhere — confirm it is intentional.
    standard_filename = "quokka/data/" + standard.standard_config_file

    result, diff = config_diff(device, standard_filename)
    if result != "success":
        return False

    if len(diff) == 0:
        return True

    # Keep the differences on disk for later inspection.
    diff_path = standard_filename + ".diff." + device["name"]
    with open(diff_path, "w") as config_out:
        config_out.write(diff)
    return False
def initiate_traceroute(target, token):
    """Ask the traceroute worker responsible for ``target`` to run a traceroute.

    A target given as an ``http://`` or ``https://`` URL is reduced to its
    network location first. The request is then delivered according to the
    worker's ``connection_type``: published on ``traceroute_queue`` for
    ``rabbitmq``, or stored as a pending command via ``set_command`` for
    ``http``. Any other connection type results in no action.

    Args:
        target: Host name, address or http(s) URL to trace.
        token: Passed through to the worker as ``token``.
    """
    # Target could be a URL; if so, use urlparse to extract the network location (hostname)
    if target.startswith(("http://", "https://")):
        target = urlparse(target).netloc

    monitor = TracerouteManager.find_monitor(target)
    worker = get_worker(host=monitor, worker_type=TracerouteManager.worker_type)
    if worker is None:
        log_console(
            f"Traceroute Manager: could not find worker, host={monitor}, worker_type={TracerouteManager.worker_type} in DB"
        )
        return

    traceroute_info_json = json.dumps(
        {
            "intuitiveNMS": get_this_ip(),
            "target": target,
            "token": token,
        }
    )

    connection_type = worker["connection_type"]
    if connection_type == "rabbitmq":
        channel = TracerouteManager.get_channel(monitor)
        channel.basic_publish(
            exchange="", routing_key="traceroute_queue", body=traceroute_info_json
        )
        log_console(f"Traceroute Manager: starting traceroute: target : {target}")

    elif connection_type == "http":
        # NOTE(review): the command name is "start-capture" although this is
        # the traceroute path — confirm that is what the worker expects.
        set_command(
            {
                "host": worker["host"],
                "serial": worker["serial"],
                "worker_type": TracerouteManager.worker_type,
                "command": "start-capture",
                "command_info": traceroute_info_json,
                "delivered": False,
            }
        )
def traceroute_register():
    """Validate a registration request posted as JSON and acknowledge it.

    The body must contain ``serial`` and ``name``; the name must resolve to a
    known device whose serial matches. No state is changed: a valid request
    is only logged.

    Returns:
        ``(message, 400)`` on a validation failure, otherwise ``({}, 200)``.
    """
    registration_info = request.get_json()
    if not registration_info:
        return "Must provide registration information in JSON body", 400

    for required in ("serial", "name"):
        if required not in registration_info:
            return f"Must provide '{required}' in registration information", 400

    name = registration_info["name"]
    result, device = get_device(device_name=name)
    if result != "success":
        return "Unknown device name in registration information", 400

    serial = registration_info["serial"]
    if serial != device["serial"]:
        return "Serial number in registration information does not match device serial", 400

    log_console(f"Received registration request from {name}, serial no: {serial}")
    return {}, 200
def config_diff(device, config_to_diff):
    """Diff a configuration file against a device's live configuration.

    Only devices whose ``transport`` is ``napalm`` are supported. The file is
    loaded as a merge candidate and compared with the device configuration.
    The device connection is closed before returning, on success and on
    failure alike.

    Args:
        device: Mapping with at least ``transport``, plus whatever
            ``get_napalm_device`` needs.
        config_to_diff: Path of the configuration file to compare.

    Returns:
        ``("success", diff)`` with the comparison output, or
        ``("failure", message)`` when the transport is unsupported or any
        step raises.
    """
    if device["transport"] != "napalm":
        log_console(
            f"!!! Unable to compare configurations, no live config to compare")
        return "failure", "Unable to compare configurations"

    napalm_device = get_napalm_device(device)
    try:
        napalm_device.open()
        napalm_device.load_merge_candidate(filename=config_to_diff)
        # NOTE(review): the loaded candidate is never explicitly discarded;
        # confirm whether a discard_config() call is wanted here.
        return "success", napalm_device.compare_config()

    except BaseException as e:
        # Deliberately broad: callers rely on the ("failure", message) tuple.
        log_console(
            f"!!! Exception in doing load_merge_candidate: {repr(e)}")
        return "failure", repr(e)

    finally:
        # Release the device session; previously it was left open forever.
        try:
            napalm_device.close()
        except Exception as e:
            # close() can fail when open() never succeeded; the diff result
            # (or the original error) is what the caller needs to see.
            log_console(f"!!! Exception closing device connection: {repr(e)}")
def monitor(self, interval):
    """Periodically run OS and configuration compliance checks on all devices.

    Each pass re-reads every device by id. For devices flagged available, the
    results of ``check_os_compliance`` and ``check_config_compliance`` and the
    check time are stored on the device; the device is then saved regardless
    of availability. The loop ends when ``self.terminate`` is set.

    Args:
        interval: Pause between passes in seconds, slept in 10-second slices.
    """
    while not self.terminate:

        # Ids are fetched on every pass so each device can be re-read below:
        # other entities may have changed it in the meantime
        # (e.g. SDWAN heartbeats).
        ids_to_check = get_all_device_ids()
        log_console(f"Monitor: Beginning compliance monitoring for {len(ids_to_check)} devices")

        for device_id in ids_to_check:

            if self.terminate:
                break

            # re-retrieve device as it may have been changed
            status, device = get_device(device_id=device_id)
            if status != "success":
                log_console(f"Compliance Monitor: Error retrieving device from DB. id: {device_id}, error: {device}")
                continue

            if device["availability"]:
                device["os_compliance"] = check_os_compliance(device)
                device["config_compliance"] = check_config_compliance(device)
                device["last_compliance_check"] = str(datetime.now())[:-3]

            set_device(device)

        # Short sleep slices keep shutdown responsive.
        for _ in range(int(interval / 10)):
            sleep(10)
            if self.terminate:
                break

    log_console("...gracefully exiting monitor:compliance")
def start(self, interval):
    """Compute hourly summaries once per clock hour until asked to terminate.

    The loop wakes every 60 seconds. When the wall-clock hour differs from
    ``self.current_hour``, summaries for ``self.current_hour`` are computed
    for services, hosts and devices; the service and host summaries are
    recorded. ``self.current_hour`` is then advanced to the new hour.

    Args:
        interval: Unused; the polling period is fixed at 60 seconds.
    """
    while not self.terminate:

        # "YYYY-MM-DD HH". Built with strftime rather than by slicing
        # str(datetime.now()), whose length changes when microseconds are 0.
        this_hour = datetime.now().strftime("%Y-%m-%d %H")
        if this_hour == self.current_hour:
            time.sleep(60)
            continue

        service_hourly_summaries = self.get_summaries(
            get_all_services(), "services", get_service_status_data_for_hour)
        record_service_hourly_summaries(service_hourly_summaries)

        host_hourly_summaries = self.get_summaries(
            get_all_hosts(), "hosts", get_host_status_data_for_hour)
        record_host_hourly_summaries(host_hourly_summaries)

        # Device summaries are computed (which logs any SLA events) but the
        # result is not recorded here.
        self.get_summaries(get_all_devices(), "devices",
                           get_device_status_data_for_hour)

        self.current_hour = this_hour

    log_console("...gracefully exiting summaries task")
def stop_device_threads():
    """Stop the device, compliance and configuration monitor threads.

    For each of the three monitors, in that order, the task is asked to
    terminate and its thread is joined, provided both are set. Afterwards all
    six ``ThreadManager`` references are reset to ``None``.
    """
    log_console(
        "--- ---> Shutting down device monitoring threads (device, configuration and compliance)"
    )

    # Attribute prefixes on ThreadManager: "<prefix>_task" / "<prefix>_thread".
    monitor_prefixes = (
        "device_monitor",
        "compliance_monitor",
        "configuration_monitor",
    )

    for prefix in monitor_prefixes:
        task = getattr(ThreadManager, f"{prefix}_task")
        if task and getattr(ThreadManager, f"{prefix}_thread"):
            task.set_terminate()
            getattr(ThreadManager, f"{prefix}_thread").join()

    # Clear the references only after every thread has been joined.
    for prefix in monitor_prefixes:
        setattr(ThreadManager, f"{prefix}_task", None)
        setattr(ThreadManager, f"{prefix}_thread", None)
def get_device_info_napalm(device, requested_info, get_live_info=False):
    """Retrieve one category of information from a device through NAPALM.

    ``facts`` are served from the database when available, unless
    ``get_live_info`` is true. Everything else is fetched live. Freshly
    fetched facts are stored with ``set_facts`` and the same object is
    returned, so the device is queried once. The device connection is closed
    before returning.

    Args:
        device: Mapping describing the device (``name`` is used for the facts
            lookup; the rest is consumed by ``get_napalm_device``).
        requested_info: One of ``facts``, ``environment``, ``interfaces``,
            ``arp``, ``mac``, ``config`` or ``counters``.
        get_live_info: Skip the stored facts and always query the device.

    Returns:
        ``("success", {requested_info: data})`` or ``("failure", message)``
        when the category is unknown or any step raises.
    """
    # Try to get the info from the DB first
    if requested_info == "facts" and not get_live_info:
        result, facts = get_facts(device["name"])
        if result == "success":
            return "success", {"facts": facts}

    napalm_device = get_napalm_device(device)

    try:
        napalm_device.open()

        if requested_info == "facts":
            facts = napalm_device.get_facts()
            set_facts(device, {"facts": facts})
            # Reuse what was just fetched instead of querying a second time.
            return "success", {"facts": facts}

        elif requested_info == "environment":
            return "success", {"environment": napalm_device.get_environment()}

        elif requested_info == "interfaces":
            return "success", {"interfaces": napalm_device.get_interfaces()}

        elif requested_info == "arp":
            return "success", {"arp": napalm_device.get_arp_table()}

        elif requested_info == "mac":
            return "success", {"mac": napalm_device.get_mac_address_table()}

        elif requested_info == "config":
            return "success", {"config": napalm_device.get_config()}

        elif requested_info == "counters":
            return "success", {
                "counters": napalm_device.get_interfaces_counters()
            }

        else:
            return "failure", "Unknown requested info"

    except BaseException as e:
        # Deliberately broad: callers rely on the ("failure", message) tuple.
        log_console(f"!!! Exception in get device info: {repr(e)}")
        return "failure", repr(e)

    finally:
        # Release the device session; previously it was left open forever.
        try:
            napalm_device.close()
        except Exception as e:
            # close() can fail when open() never succeeded; do not let that
            # mask the result or the original error.
            log_console(f"!!! Exception closing device connection: {repr(e)}")
def start(self, interval):
    """Purge old rows from the database once per clock hour.

    The loop wakes every 60 seconds. When the wall-clock hour differs from
    ``self.current_hour``, status tables are trimmed to the last 2 hours and
    capture/portscan/traceroute/command tables to the last 24 hours, all in
    one commit. A failure is logged and the session is rolled back; the next
    attempt happens in the following hour.

    Args:
        interval: Unused; the polling period is fixed at 60 seconds.
    """
    while not self.terminate:

        # "YYYY-MM-DD HH". Built with strftime rather than by slicing
        # str(datetime.now()), whose length changes when microseconds are 0.
        this_hour = datetime.now().strftime("%Y-%m-%d %H")
        if this_hour == self.current_hour:
            time.sleep(60)
            continue

        now = datetime.now()
        now_minus_24_hours = now - timedelta(hours=24)
        now_minus_2_hours = now - timedelta(hours=2)

        try:
            # Clean up time-series status data older than 2 hours.
            # NOTE(review): the previous comment said 24 hours while the code
            # used the 2-hour cutoff; the code is kept — confirm the intent.
            for table in [DeviceStatus, HostStatus, ServiceStatus, WorkerStatus]:
                count = table.query.filter(table.timestamp < str(now_minus_2_hours)).delete()
                log_console(f"DbMaintenanceTask: deleted {count} records from {table}")

            # Clean up packet capture data, which we allow to hang around for 24 hours
            for table in [Capture, Portscan, Traceroute]:
                count = table.query.filter(table.timestamp < str(now_minus_24_hours)).delete()
                log_console(f"DbMaintenanceTask: deleted {count} records from {table}")

            # Clean up commands greater than 24 hours old
            count = Command.query.filter(Command.timestamp < str(now_minus_24_hours)).delete()
            log_console(f"DbMaintenanceTask: deleted {count} records from Command")

            db.session.commit()

        except BaseException as e:
            log_console(f"!!! uh-oh, exception in DbMaintenance thread: {e}")
            # Discard the failed transaction so later work on this session
            # does not inherit its half-applied state.
            db.session.rollback()

        self.current_hour = this_hour

    log_console("...gracefully exiting db maintenance task")
def traceroute_store():
    """Validate a posted traceroute result and hand it to ``record_traceroute``.

    Reads the JSON body of the current request and checks the mandatory
    fields in a fixed order, reporting the first one missing. ``token`` is
    validated here because ``record_traceroute`` refuses to store a result
    without it; otherwise the request would be acknowledged with 200 while
    the record was silently dropped.

    Returns:
        ``(message, 400)`` when the body is empty or a field is missing,
        otherwise ``({}, 200)``.
    """
    traceroute_info = request.get_json()
    if not traceroute_info:
        return "Must provide traceroute information in JSON body", 400

    if "source" not in traceroute_info:
        return "Must provide 'source' in traceroute information", 400
    if "serial" not in traceroute_info:
        return "Must provide 'serial' in traceroute information", 400
    if "target" not in traceroute_info:
        return "Must provide 'target' in traceroute information", 400
    if "token" not in traceroute_info:
        return "Must provide 'token' in traceroute information", 400
    if "timestamp" not in traceroute_info:
        return "Must provide 'timestamp' in traceroute information", 400
    if "traceroute_img" not in traceroute_info:
        return "Must include 'traceroute_img' in traceroute information", 400

    record_traceroute(traceroute_info)

    log_console(
        f"Received traceroute store request from {traceroute_info['source']} for target {traceroute_info['target']}"
    )
    return {}, 200
def worker_register():
    """Register a worker from a JSON request and mark it available.

    The body must contain ``serial``, ``name`` and ``worker_type``. The
    worker is looked up by host name and type and its serial must match. On
    success the worker is marked available and ``last_heard`` is refreshed.

    Returns:
        ``(message, 400)`` on a validation failure, otherwise ``({}, 200)``.
    """
    registration_info = request.get_json()
    if not registration_info:
        return "Must provide registration information in JSON body", 400

    if "serial" not in registration_info:
        return "Must provide 'serial' in registration information", 400
    if "name" not in registration_info:
        return "Must provide 'name' in registration information", 400
    # worker_type is used for the lookup below; without this check a missing
    # key surfaced as an unhandled KeyError instead of a 400.
    if "worker_type" not in registration_info:
        return "Must provide 'worker_type' in registration information", 400

    worker = get_worker(
        host=registration_info["name"],
        worker_type=registration_info["worker_type"],
    )
    if worker is None:
        return "Unknown worker name in registration information", 400
    if registration_info["serial"] != worker["serial"]:
        return "Serial number in registration information does not match worker serial", 400

    log_console(
        f"Received registration request from {registration_info['name']}, serial no: {registration_info['serial']}"
    )

    worker["availability"] = True
    # Always "YYYY-MM-DD HH:MM:SS.mmm". Slicing str(datetime.now()) drops the
    # seconds when microseconds happen to be 0, and the result no longer
    # parses with a "%Y-%m-%d %H:%M:%S.%f" format.
    worker["last_heard"] = datetime.now().isoformat(sep=" ", timespec="milliseconds")
    set_worker(worker)

    return {}, 200
def device_register():
    """Register a device from a JSON request and mark it available.

    The body must contain ``serial`` and ``name``; the name must resolve to a
    known device whose serial matches. On success the device is marked
    available, ``last_heard`` is refreshed and the device is saved.

    Returns:
        ``(message, 400)`` on a validation failure, otherwise ``({}, 200)``.
    """
    registration_info = request.get_json()
    if not registration_info:
        return "Must provide registration information in JSON body", 400

    for required in ("serial", "name"):
        if required not in registration_info:
            return f"Must provide '{required}' in registration information", 400

    lookup_result, device = get_device(device_name=registration_info["name"])
    if lookup_result != "success":
        return "Unknown device name in registration information", 400
    if device["serial"] != registration_info["serial"]:
        return "Serial number in registration information does not match device serial", 400

    log_console(
        f"Received registration request from {registration_info['name']}, serial no: {registration_info['serial']}"
    )

    # A successful registration doubles as proof of life.
    device["availability"] = True
    device["last_heard"] = str(datetime.now())[:-3]
    set_device(device)

    return {}, 200
def monitor(self, interval):
    """Periodically probe every service and record its status.

    Each pass probes all services with ``get_avail_and_rsp_time``. The
    availability is always stored. For an available service the response
    time (multiplied by 1000 and truncated to an int) and ``last_heard`` are
    updated too; for an unavailable one a ``WARNING`` event is logged after
    the status is recorded. The loop ends when ``self.terminate`` is set.

    Args:
        interval: Pause between passes in seconds, slept in 10-second slices.
    """
    log_console(f"Service monitoring starting, interval={interval}")

    while not self.terminate:

        all_services = get_all_services()
        log_console(
            f"Monitor: Beginning monitoring for {len(all_services)} services")

        for service in all_services:

            if self.terminate:
                break

            log_console(f"--- service monitor for {service['name']}")
            is_available, response_time = get_avail_and_rsp_time(service)
            service["availability"] = is_available

            if is_available:
                service["response_time"] = int(response_time * 1000)
                service["last_heard"] = str(datetime.now())[:-3]

            # Status is recorded and saved for both outcomes.
            record_service_status(service)
            set_service(service)

            if not is_available:
                log_event(
                    str(datetime.now())[:-3],
                    "service monitor",
                    service["name"],
                    "WARNING",
                    f"Availability failed for service: {service['name']}",
                )

        # Short sleep slices keep shutdown responsive.
        for _ in range(int(interval / 10)):
            time.sleep(10)
            if self.terminate:
                break

    log_console("...gracefully exiting monitor:service")
def worker_heartbeat():
    """Process a worker heartbeat posted as JSON and return pending commands.

    The body must contain ``serial`` and ``worker_type``, which identify the
    worker. On success the worker is marked available, ``last_heard`` is
    refreshed, optional metrics present in the heartbeat are copied onto the
    worker, and its status is recorded and saved. Commands queued for the
    worker are then fetched, marked as delivered, and returned.

    Returns:
        ``(message, 400)`` on a validation failure, otherwise
        ``({"commands": commands}, 200)``.
    """
    heartbeat_info = request.get_json()
    if not heartbeat_info:
        return "Must provide heartbeat information in JSON body", 400

    if "serial" not in heartbeat_info:
        return "Must provide 'serial' in heartbeat information", 400
    # worker_type is used for both lookups below; without this check a
    # missing key surfaced as an unhandled KeyError instead of a 400.
    if "worker_type" not in heartbeat_info:
        return "Must provide 'worker_type' in heartbeat information", 400

    worker = get_worker(
        serial=heartbeat_info["serial"],
        worker_type=heartbeat_info["worker_type"],
    )
    if worker is None:
        return "Unknown worker serial number in heartbeat information", 400

    worker["availability"] = True
    # Always "YYYY-MM-DD HH:MM:SS.mmm". Slicing str(datetime.now()) drops the
    # seconds when microseconds happen to be 0, and the result no longer
    # parses with a "%Y-%m-%d %H:%M:%S.%f" format.
    worker["last_heard"] = datetime.now().isoformat(sep=" ", timespec="milliseconds")

    # Optional metrics: copied only when the heartbeat supplies them.
    for field in ("response_time", "cpu", "memory", "uptime"):
        if field in heartbeat_info:
            worker[field] = heartbeat_info[field]

    record_worker_status(worker)
    set_worker(worker)

    # 'name' is not a required field, so fall back to the serial rather than
    # failing with a KeyError after the worker state was already written.
    sender = heartbeat_info.get("name", heartbeat_info["serial"])
    log_console(f"Received heartbeat from {sender}, info={heartbeat_info}")

    result, commands = get_commands(
        serial=heartbeat_info["serial"],
        worker_type=heartbeat_info["worker_type"],
        set_delivered=True,
    )
    if result != "success":
        # NOTE(review): on failure `commands` is still returned to the worker
        # as-is; confirm what get_commands yields in that case.
        log_console(
            f"Failed to retrieve commands for {heartbeat_info['serial']}")
    else:
        log_console(
            f"Delivered commands to {heartbeat_info['serial']}, commands={commands}"
        )

    return {"commands": commands}, 200
def check_os_compliance(device):
    """Check a device's OS version against the standard for its vendor/OS.

    The compliance record matching the device's ``vendor`` and ``os``
    supplies the expected version; the actual version is read from the
    device's live facts (``facts["facts"]["os_version"]``) and both are
    handed to ``check_version``.

    Args:
        device: Mapping with at least ``vendor``, ``os`` and ``name``.

    Returns:
        The result of ``check_version``, or ``False`` when there is no
        compliance record or the version could not be retrieved.
    """
    standard = Compliance.query.filter_by(**{"vendor": device["vendor"], "os": device["os"]}).one_or_none()
    if standard is None:
        log_console(f"!!! Error retrieving compliance record for this device {device['name']}")
        return False

    try:
        result, facts = get_device_info(device, "facts", get_live_info=True)
    except BaseException as e:
        log_console(f"!!! Exception getting device info in compliance monitoring for {device['name']}: {repr(e)}")
        result, facts = "failure", None

    # Accept only an explicit success with the expected structure. Comparing
    # against "failed" missed "failure" results, whose payload is an error
    # string; that string could then be indexed like a dict.
    if (
        result != "success"
        or not isinstance(facts, dict)
        or "facts" not in facts
        or "os_version" not in facts["facts"]
    ):
        log_console(f"!!! Error retrieving version info for this device {device['name']}")
        return False

    return check_version(device, standard=standard.standard_version, actual=facts["facts"]["os_version"])