# Standard-library imports this module relies on. The Contrail/sandesh names
# used below (ProcessStat, ProcessInfo, NodeStatus, NodeStatusUVE,
# SystemCpuInfo, DiskPartitionUsageStats, SandeshLogger, SandeshLevel,
# MemCpuUsageData, EventListenerProtocolNodeMgr, NodeMgrUtils, client,
# build_info, INSTANCE_ID_DEFAULT) are assumed to come from the surrounding
# package imports, which are not reproduced here.
import hashlib
import json
import os
import random
import socket
import subprocess
import sys
import time
import ConfigParser
import xmlrpclib
from subprocess import Popen, PIPE

import psutil
import supervisor.xmlrpc
from supervisor import childutils


class EventManager(object):
    rules_data = []
    group_names = []
    process_state_db = {}
    third_party_process_state_db = {}
    FAIL_STATUS_DUMMY = 0x1
    FAIL_STATUS_DISK_SPACE = 0x2
    FAIL_STATUS_SERVER_PORT = 0x4
    FAIL_STATUS_NTP_SYNC = 0x8
    FAIL_STATUS_DISK_SPACE_NA = 0x10

    def __init__(self, rule_file, discovery_server, discovery_port,
                 collector_addr, sandesh_global):
        self.stdin = sys.stdin
        self.stdout = sys.stdout
        self.stderr = sys.stderr
        self.rule_file = rule_file
        self.rules_data = ''
        self.max_cores = 4
        self.max_old_cores = 3
        self.max_new_cores = 1
        self.all_core_file_list = []
        self.core_dir_modified_time = 0
        self.tick_count = 0
        self.fail_status_bits = 0
        self.prev_fail_status_bits = 1
        self.instance_id = INSTANCE_ID_DEFAULT
        self.discovery_server = discovery_server
        self.discovery_port = discovery_port
        self.collector_addr = collector_addr
        self.listener_nodemgr = EventListenerProtocolNodeMgr()
        self.sandesh_global = sandesh_global
        self.curr_build_info = None
        self.new_build_info = None
        self.last_cpu = None
        self.last_time = 0
        self.installed_package_version = None

    # Get all the current processes in the node
    def get_current_process(self):
        proxy = xmlrpclib.ServerProxy(
            'http://127.0.0.1',
            transport=supervisor.xmlrpc.SupervisorTransport(
                None, None, serverurl=self.supervisor_serverurl))
        # Add all current processes to make sure nothing misses the radar
        process_state_db = {}
        # the list of all processes on the node is built here
        for proc_info in proxy.supervisor.getAllProcessInfo():
            if proc_info['name'] != proc_info['group']:
                proc_name = proc_info['group'] + ":" + proc_info['name']
            else:
                proc_name = proc_info['name']
            proc_pid = proc_info['pid']
            process_stat_ent = self.get_process_stat_object(proc_name)
            process_stat_ent.process_state = "PROCESS_STATE_" + \
                proc_info['statename']
            if process_stat_ent.process_state == 'PROCESS_STATE_RUNNING':
                process_stat_ent.start_time = str(proc_info['start'] * 1000000)
                process_stat_ent.start_count += 1
            process_stat_ent.pid = proc_pid
            process_state_db[proc_name] = process_stat_ent
        return process_state_db
    # end get_current_process

    # Add the current processes in the node to db
    def add_current_process(self):
        self.process_state_db = self.get_current_process()
    # end add_current_process

    # In case the processes in the node change, update current processes
    def update_current_process(self):
        process_state_db = self.get_current_process()
        old_process_set = set(self.process_state_db.keys())
        new_process_set = set(process_state_db.keys())
        common_process_set = new_process_set.intersection(old_process_set)
        added_process_set = new_process_set - common_process_set
        deleted_process_set = old_process_set - common_process_set
        for deleted_process in deleted_process_set:
            self.delete_process_handler(deleted_process)
        for added_process in added_process_set:
            self.add_process_handler(
                added_process, process_state_db[added_process])
    # end update_current_process

    # process is deleted, send state & remove it from db
    def delete_process_handler(self, deleted_process):
        self.process_state_db[deleted_process].deleted = True
        group_val = self.process_state_db[deleted_process].group
        self.send_process_state_db([group_val])
        del self.process_state_db[deleted_process]
    # end delete_process_handler

    # new process added, update db & send state
    def add_process_handler(self, added_process, process_info):
        self.process_state_db[added_process] = process_info
        group_val = self.process_state_db[added_process].group
        self.send_process_state_db([group_val])
    # end add_process_handler

    def get_discovery_client(self):
        return client.DiscoveryClient(
            self.discovery_server, self.discovery_port, self.module_id)

    def check_ntp_status(self):
        ntp_status_cmd = 'ntpq -n -c pe | grep "^*"'
        proc = Popen(ntp_status_cmd, shell=True, stdout=PIPE, stderr=PIPE)
        (output, errout) = proc.communicate()
        if proc.returncode != 0:
            self.fail_status_bits |= self.FAIL_STATUS_NTP_SYNC
        else:
            self.fail_status_bits &= ~self.FAIL_STATUS_NTP_SYNC
        self.send_nodemgr_process_status()

    def get_build_info(self):
        # Retrieve build_info from the package/rpm and cache it
        if self.curr_build_info is None:
            command = "contrail-version contrail-nodemgr | grep contrail-nodemgr"
            version = os.popen(command).read()
            version_partials = version.split()
            if len(version_partials) < 3:
                sys.stderr.write('Not enough values to parse package version %s\n'
                                 % version)
                return ""
            _, rpm_version, build_num = version_partials
            self.new_build_info = build_info + '"build-id" : "' + \
                rpm_version + '", "build-number" : "' + \
                build_num + '"}]}'
            if self.new_build_info != self.curr_build_info:
                self.curr_build_info = self.new_build_info
        return self.curr_build_info

    def update_process_core_file_list(self):
        #LOG_DEBUG sys.stderr.write('update_process_core_file_list: begin:')
        ret_value = False
        try:
            ls_command = "ls -1 /var/crashes"
            (corenames, stderr) = Popen(
                ls_command.split(), stdout=PIPE).communicate()
            process_state_db_tmp = {}
            for key in self.process_state_db:
                #LOG_DEBUG sys.stderr.write('update_process_core_file_list: key: '+key+'\n')
                proc_stat = self.get_process_stat_object(key)
                process_state_db_tmp[key] = proc_stat
            #LOG_DEBUG sys.stderr.write('update_process_core_file_list: corenames: '+corenames+'\n')
            # core file names look like core.<exec-name>.<pid>...
            for corename in corenames.split():
                exec_name = corename.split('.')[1]
                for key in self.process_state_db:
                    if key.startswith(exec_name):
                        #LOG_DEBUG sys.stderr.write('update_process_core_file_list: startswith: '+exec_name+'\n')
                        process_state_db_tmp[key].core_file_list.append(
                            corename.rstrip())
            for key in self.process_state_db:
                if set(process_state_db_tmp[key].core_file_list) != \
                        set(self.process_state_db[key].core_file_list):
                    self.process_state_db[key].core_file_list = \
                        process_state_db_tmp[key].core_file_list
                    ret_value = True
        except Exception as e:
            sys.stderr.write('update_process_core_file_list: exception: ' +
                             str(e))
        #LOG_DEBUG sys.stderr.write('update_process_core_file_list: ret_value: '+str(ret_value)+'\n')
        return ret_value
    # end update_process_core_file_list

    def send_process_state_db_base(self, group_names, ProcessInfo):
        name = socket.gethostname()
        for group in group_names:
            process_infos = []
            delete_status = True
            for key in self.process_state_db:
                pstat = self.process_state_db[key]
                if pstat.group != group:
                    continue
                process_info = ProcessInfo()
                process_info.process_name = key
                process_info.process_state = pstat.process_state
                process_info.start_count = pstat.start_count
                process_info.stop_count = pstat.stop_count
                process_info.exit_count = pstat.exit_count
                process_info.last_start_time = pstat.start_time
                process_info.last_stop_time = pstat.stop_time
                process_info.last_exit_time = pstat.exit_time
                process_info.core_file_list = pstat.core_file_list
                process_infos.append(process_info)
                # in the tor-agent case, use the tor-agent name as the uve key
                name = pstat.name
                if not pstat.deleted:
                    delete_status = False
            if not process_infos:
                continue
            # send node UVE
            node_status = NodeStatus()
            node_status.name = name
            node_status.deleted = delete_status
            node_status.process_info = process_infos
            node_status.build_info = self.get_build_info()
            node_status_uve = NodeStatusUVE(table=self.table,
                                            data=node_status)
            msg = ('send_process_state_db_base: Sending UVE:' +
                   str(node_status_uve))
            self.sandesh_global.logger().log(SandeshLogger.get_py_logger_level(
                SandeshLevel.SYS_INFO), msg)
            node_status_uve.send()

    def update_all_core_file(self):
        stat_command_option = "stat --printf=%Y /var/crashes"
        modified_time = Popen(
            stat_command_option.split(), stdout=PIPE).communicate()
        if modified_time[0] == self.core_dir_modified_time:
            return False
        self.core_dir_modified_time = modified_time[0]
        ls_command_option = "ls /var/crashes"
        (corename, stderr) = Popen(
            ls_command_option.split(), stdout=PIPE).communicate()
        self.all_core_file_list = corename.split('\n')[0:-1]
        self.send_process_state_db(self.group_names)
        return True

    def get_process_stat_object(self, pname):
        return ProcessStat(pname)

    def send_process_state(self, pname, pstate, pheaders):
        # update process stats
        if pname in self.process_state_db.keys():
            proc_stat = self.process_state_db[pname]
        else:
            proc_stat = self.get_process_stat_object(pname)
            if proc_stat.group not in self.group_names:
                self.group_names.append(proc_stat.group)
        proc_stat.process_state = pstate
        send_uve = False
        if pstate == 'PROCESS_STATE_RUNNING':
            proc_stat.start_count += 1
            proc_stat.start_time = str(int(time.time() * 1000000))
            send_uve = True
            proc_stat.pid = int(pheaders['pid'])
        if pstate == 'PROCESS_STATE_STOPPED':
            proc_stat.stop_count += 1
            send_uve = True
            proc_stat.stop_time = str(int(time.time() * 1000000))
            proc_stat.last_exit_unexpected = False
        if pstate == 'PROCESS_STATE_EXITED':
            proc_stat.exit_count += 1
            send_uve = True
            proc_stat.exit_time = str(int(time.time() * 1000000))
            if not int(pheaders['expected']):
                self.stderr.write(pname + " with pid:" + pheaders['pid'] +
                                  " exited abnormally\n")
                proc_stat.last_exit_unexpected = True
                # check for a core file for this exit
                find_command_option = \
                    "find /var/crashes -name core.[A-Za-z]*." + \
                    pheaders['pid'] + "*"
                self.stderr.write("find command option for cores:" +
                                  find_command_option + "\n")
                (corename, stderr) = Popen(
                    find_command_option.split(), stdout=PIPE).communicate()
                self.stderr.write("core file: " + corename + "\n")
                if corename is not None and len(corename.rstrip()) >= 1:
                    # before adding to the core file list make
                    # sure that we do not have too many cores
                    sys.stderr.write(
                        'core_file_list:' + str(proc_stat.core_file_list) +
                        ", self.max_cores:" + str(self.max_cores) + "\n")
                    if len(proc_stat.core_file_list) == self.max_cores:
                        # get rid of old cores
                        sys.stderr.write('max # of cores reached:' +
                                         str(self.max_cores) + "\n")
                        val = self.max_cores - self.max_new_cores + 1
                        core_files_to_be_deleted = \
                            proc_stat.core_file_list[self.max_old_cores:val]
                        sys.stderr.write('deleting core file list:' +
                                         str(core_files_to_be_deleted) + "\n")
                        for core_file in core_files_to_be_deleted:
                            sys.stderr.write('deleting core file:' +
                                             core_file + "\n")
                            try:
                                os.remove(core_file)
                            except OSError as e:
                                sys.stderr.write('ERROR: ' + str(e) + '\n')
                        # now delete the stale entries from the list as well
                        del proc_stat.core_file_list[self.max_old_cores:val]
                    # now add the new core to the core file list
                    proc_stat.core_file_list.append(corename.rstrip())
                    sys.stderr.write("# of cores for " + pname + ":" +
                                     str(len(proc_stat.core_file_list)) + "\n")
        # update process state database
        self.process_state_db[pname] = proc_stat
        with open('/var/log/contrail/process_state' +
                  self.node_type + ".json", 'w') as f:
            f.write(json.dumps(self.process_state_db,
                               default=lambda obj: obj.__dict__))
        if send_uve:
            self.send_process_state_db([proc_stat.group])

    def send_nodemgr_process_status_base(self, ProcessStateNames,
                                         ProcessState, ProcessStatus):
        if self.prev_fail_status_bits != self.fail_status_bits:
            self.prev_fail_status_bits = self.fail_status_bits
            fail_status_bits = self.fail_status_bits
            state, description = self.get_process_state(fail_status_bits)
            process_status = ProcessStatus(
                module_id=self.module_id, instance_id=self.instance_id,
                state=state, description=description)
            process_status_list = [process_status]
            node_status = NodeStatus(name=socket.gethostname(),
                                     process_status=process_status_list)
            node_status_uve = NodeStatusUVE(table=self.table,
                                            data=node_status)
            msg = ('send_nodemgr_process_status_base: Sending UVE:' +
                   str(node_status_uve))
            self.sandesh_global.logger().log(SandeshLogger.get_py_logger_level(
                SandeshLevel.SYS_INFO), msg)
            node_status_uve.send()

    def send_init_info(self):
        # system_cpu_info
        mem_cpu_usage_data = MemCpuUsageData(os.getpid(), self.last_cpu,
                                             self.last_time)
        sys_cpu = SystemCpuInfo()
        sys_cpu.num_socket = mem_cpu_usage_data.get_num_socket()
        sys_cpu.num_cpu = mem_cpu_usage_data.get_num_cpu()
        sys_cpu.num_core_per_socket = \
            mem_cpu_usage_data.get_num_core_per_socket()
        sys_cpu.num_thread_per_core = \
            mem_cpu_usage_data.get_num_thread_per_core()
        node_status = NodeStatus(name=socket.gethostname(),
                                 system_cpu_info=sys_cpu,
                                 build_info=self.get_build_info())
        # installed/running package version
        installed_package_version = \
            NodeMgrUtils.get_package_version(self.get_package_name())
        if installed_package_version is None:
            sys.stderr.write("Error getting %s package version\n"
                             % (self.get_package_name()))
            exit(-1)
        else:
            self.installed_package_version = installed_package_version
            node_status.installed_package_version = installed_package_version
            node_status.running_package_version = installed_package_version
        node_status_uve = NodeStatusUVE(table=self.table, data=node_status)
        node_status_uve.send()

    def get_all_processes_mem_cpu_usage(self):
        process_mem_cpu_usage = {}
        for key in self.process_state_db:
            pstat = self.process_state_db[key]
            if pstat.process_state == 'PROCESS_STATE_RUNNING':
                try:
                    mem_cpu_usage_data = MemCpuUsageData(
                        pstat.pid, pstat.last_cpu, pstat.last_time)
                    process_mem_cpu = \
                        mem_cpu_usage_data.get_process_mem_cpu_info()
                except psutil.NoSuchProcess:
                    sys.stderr.write("NoSuchProcess: process name:%s pid:%d\n"
                                     % (pstat.pname, pstat.pid))
                else:
                    # note: __key is name-mangled, but it is set and read
                    # only within this class, so the mangling is consistent
                    process_mem_cpu.__key = pstat.pname
                    process_mem_cpu_usage[process_mem_cpu.__key] = \
                        process_mem_cpu
                    pstat.last_cpu = mem_cpu_usage_data.last_cpu
                    pstat.last_time = mem_cpu_usage_data.last_time

        # walk through all processes being monitored by nodemgr,
        # not spawned by supervisord
        third_party_process_dict = self.get_node_third_party_process_dict()
        for pname in third_party_process_dict:
            pattern = third_party_process_dict[pname]
            cmd = ("ps -aux | grep " + pattern +
                   " | awk '{print $2}' | head -n1")
            proc = subprocess.Popen(cmd, shell=True,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            stdout, stderr = proc.communicate()
            if stdout != '':
                pid = int(stdout.strip('\n'))
                if pname in self.third_party_process_state_db:
                    pstat = self.third_party_process_state_db[pname]
                else:
                    pstat = self.get_process_stat_object(pname)
                    pstat.pid = pid
                    self.third_party_process_state_db[pname] = pstat
                try:
                    mem_cpu_usage_data = MemCpuUsageData(
                        pstat.pid, pstat.last_cpu, pstat.last_time)
                    process_mem_cpu = \
                        mem_cpu_usage_data.get_process_mem_cpu_info()
                except psutil.NoSuchProcess:
                    sys.stderr.write("NoSuchProcess: process name:%s pid:%d\n"
                                     % (pstat.pname, pstat.pid))
                    self.third_party_process_state_db.pop(pstat.pname)
                else:
                    process_mem_cpu.__key = pname
                    process_mem_cpu_usage[process_mem_cpu.__key] = \
                        process_mem_cpu
                    pstat.last_cpu = mem_cpu_usage_data.last_cpu
                    pstat.last_time = mem_cpu_usage_data.last_time
        return process_mem_cpu_usage

    def get_disk_usage(self):
        disk_usage_info = {}
        partition = subprocess.Popen(
            "df -PT -t ext2 -t ext3 -t ext4 -t xfs",
            shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        for line in partition.stdout:
            if 'Filesystem' in line:
                continue
            partition_name = line.rsplit()[0]
            partition_type = line.rsplit()[1]
            partition_space_used_1k = line.rsplit()[3]
            partition_space_available_1k = line.rsplit()[4]
            disk_usage_stat = DiskPartitionUsageStats()
            try:
                disk_usage_stat.partition_type = str(partition_type)
                disk_usage_stat.__key = str(partition_name)
                disk_usage_stat.partition_space_used_1k = \
                    int(partition_space_used_1k)
                disk_usage_stat.partition_space_available_1k = \
                    int(partition_space_available_1k)
                total_disk_space = \
                    disk_usage_stat.partition_space_used_1k + \
                    disk_usage_stat.partition_space_available_1k
                disk_usage_stat.percentage_partition_space_used = \
                    int(round((float(disk_usage_stat.partition_space_used_1k) /
                               float(total_disk_space)) * 100))
            except ValueError:
                sys.stderr.write("Failed to get local disk space usage\n")
            else:
                disk_usage_info[partition_name] = disk_usage_stat
        return disk_usage_info
    # end get_disk_usage

    def get_process_state_base(self, fail_status_bits,
                               ProcessStateNames, ProcessState):
        if fail_status_bits:
            state = ProcessStateNames[ProcessState.NON_FUNCTIONAL]
            description = self.get_failbits_nodespecific_desc(fail_status_bits)
            if description == "":
                if fail_status_bits & self.FAIL_STATUS_NTP_SYNC:
                    description += "NTP state unsynchronized."
        else:
            state = ProcessStateNames[ProcessState.FUNCTIONAL]
            description = ''
        return state, description

    def get_failbits_nodespecific_desc(self, fail_status_bits):
        return ""

    def event_process_state(self, pheaders, headers):
        msg = ("process:" + pheaders['processname'] + "," +
               "groupname:" + pheaders['groupname'] + "," +
               "eventname:" + headers['eventname'])
        self.sandesh_global.logger().log(
            SandeshLogger.get_py_logger_level(SandeshLevel.SYS_DEBUG), msg)
        pname = pheaders['processname']
        if pheaders['processname'] != pheaders['groupname']:
            pname = pheaders['groupname'] + ":" + pheaders['processname']
        self.send_process_state(pname, headers['eventname'], pheaders)
        for rules in self.rules_data['Rules']:
            if 'processname' in rules:
                if (rules['processname'] == pheaders['groupname'] and
                        rules['process_state'] == headers['eventname']):
                    msg = "got a hit with:" + str(rules)
                    self.sandesh_global.logger().log(
                        SandeshLogger.get_py_logger_level(
                            SandeshLevel.SYS_DEBUG), msg)
                    # do not make async calls
                    try:
                        ret_code = subprocess.call(
                            [rules['action']], shell=True,
                            stdout=self.stderr, stderr=self.stderr)
                    except Exception as e:
                        msg = ('Failed to execute action: ' +
                               rules['action'] + ' with err ' + str(e))
                        self.sandesh_global.logger().log(
                            SandeshLogger.get_py_logger_level(
                                SandeshLevel.SYS_ERR), msg)
                    else:
                        if ret_code:
                            msg = ('Execution of action ' + rules['action'] +
                                   ' returned err ' + str(ret_code))
                            self.sandesh_global.logger().log(
                                SandeshLogger.get_py_logger_level(
                                    SandeshLevel.SYS_ERR), msg)

    def event_process_communication(self, pdata):
        flag_and_value = pdata.partition(":")
        msg = "Flag:" + flag_and_value[0] + " Value:" + flag_and_value[2]
        self.sandesh_global.logger().log(
            SandeshLogger.get_py_logger_level(SandeshLevel.SYS_DEBUG), msg)
        for rules in self.rules_data['Rules']:
            if 'flag_name' in rules:
                if (rules['flag_name'] == flag_and_value[0] and
                        rules['flag_value'].strip() ==
                        flag_and_value[2].strip()):
                    msg = "got a hit with:" + str(rules)
                    self.sandesh_global.logger().log(
                        SandeshLogger.get_py_logger_level(
                            SandeshLevel.SYS_DEBUG), msg)
                    cmd_and_args = ['/usr/bin/bash', '-c', rules['action']]
                    subprocess.Popen(cmd_and_args)

    def event_tick_60(self):
        self.tick_count += 1
        # get disk usage info periodically
        disk_usage_info = self.get_disk_usage()
        # typical ntp sync time is about 5 min - first time,
        # we scan only after 10 min
        if self.tick_count >= 10:
            self.check_ntp_status()
        if self.update_process_core_file_list():
            self.send_process_state_db(['default'])
        process_mem_cpu_usage = self.get_all_processes_mem_cpu_usage()
        # get system mem/cpu usage
        system_mem_cpu_usage_data = MemCpuUsageData(
            os.getpid(), self.last_cpu, self.last_time)
        system_mem_usage = system_mem_cpu_usage_data.get_sys_mem_info(
            self.uve_node_type)
        system_cpu_usage = system_mem_cpu_usage_data.get_sys_cpu_info(
            self.uve_node_type)
        # update last_cpu/time after all processing is complete
        self.last_cpu = system_mem_cpu_usage_data.last_cpu
        self.last_time = system_mem_cpu_usage_data.last_time
        # send the buffer encoded above
        node_status = NodeStatus(name=socket.gethostname(),
                                 disk_usage_info=disk_usage_info,
                                 system_mem_usage=system_mem_usage,
                                 system_cpu_usage=system_cpu_usage,
                                 process_mem_cpu_usage=process_mem_cpu_usage)
        # encode other core files
        if self.update_all_core_file():
            node_status.all_core_file_list = self.all_core_file_list
        installed_package_version = \
            NodeMgrUtils.get_package_version(self.get_package_name())
        if installed_package_version is None:
            sys.stderr.write("Error getting %s package version\n"
                             % (self.get_package_name()))
            installed_package_version = "package-version-unknown"
        if installed_package_version != self.installed_package_version:
            self.installed_package_version = installed_package_version
            node_status.installed_package_version = installed_package_version
        node_status_uve = NodeStatusUVE(table=self.table, data=node_status)
        node_status_uve.send()

        current_time = int(time.time())
        if abs(current_time - self.prev_current_time) > 300:
            # A clock jump was detected. Compute each timestamp's elapsed
            # time and subtract it from the new current time to get
            # updated values.
            sys.stderr.write("Time lapse detected " +
                             str(abs(current_time - self.prev_current_time)) +
                             "\n")
            for key in self.process_state_db:
                pstat = self.process_state_db[key]
                if pstat.start_time != '':
                    pstat.start_time = str(
                        int(current_time - (self.prev_current_time -
                            int(pstat.start_time) / 1000000)) * 1000000)
                if pstat.process_state == 'PROCESS_STATE_STOPPED':
                    if pstat.stop_time != '':
                        pstat.stop_time = str(
                            int(current_time - (self.prev_current_time -
                                int(pstat.stop_time) / 1000000)) * 1000000)
                if pstat.process_state == 'PROCESS_STATE_EXITED':
                    if pstat.exit_time != '':
                        pstat.exit_time = str(
                            int(current_time - (self.prev_current_time -
                                int(pstat.exit_time) / 1000000)) * 1000000)
                # update process state database
                self.process_state_db[key] = pstat
            try:
                json_file = ('/var/log/contrail/process_state' +
                             self.node_type + ".json")
                with open(json_file, 'w') as f:
                    f.write(json.dumps(self.process_state_db,
                                       default=lambda obj: obj.__dict__))
            except Exception:
                sys.stderr.write("Unable to write json\n")
            self.send_process_state_db(self.group_names)
        self.prev_current_time = int(time.time())

    def do_periodic_events(self):
        self.event_tick_60()

    def runforever(self, test=False):
        self.prev_current_time = int(time.time())
        while True:
            # we explicitly use self.stdin, self.stdout, and self.stderr
            # instead of sys.* so we can unit test this code
            headers, payload = self.listener_nodemgr.wait(
                self.stdin, self.stdout)
            pheaders, pdata = childutils.eventdata(payload + '\n')
            # check for process state change events
            if headers['eventname'].startswith("PROCESS_STATE"):
                self.event_process_state(pheaders, headers)
            # check for flag value change events
            if headers['eventname'].startswith("PROCESS_COMMUNICATION"):
                self.event_process_communication(pdata)
            # do periodic events
            if headers['eventname'].startswith("TICK_60"):
                self.do_periodic_events()
            self.listener_nodemgr.ok(self.stdout)

    def nodemgr_sighup_handler(self):
        config = ConfigParser.SafeConfigParser()
        config.read(self.config_file)
        collector_list = None
        if 'COLLECTOR' in config.sections():
            try:
                collector = config.get('COLLECTOR', 'server_list')
                collector_list = collector.split()
            except ConfigParser.NoOptionError:
                pass
        if collector_list:
            new_chksum = hashlib.md5("".join(collector_list)).hexdigest()
            if new_chksum != self.collector_chksum:
                self.collector_chksum = new_chksum
                random_collectors = random.sample(collector_list,
                                                  len(collector_list))
                self.sandesh_global.reconfig_collectors(random_collectors)
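
# EventManager above is effectively an abstract base: it refers to attributes
# and hooks that each node-type manager must supply (module_id, node_type,
# uve_node_type, table, supervisor_serverurl, config_file, collector_chksum,
# send_process_state_db, send_nodemgr_process_status, get_process_state,
# send_disk_usage_info, get_package_name, get_node_third_party_process_dict).
# The subclass below is a minimal hypothetical sketch for illustration only;
# every concrete name it fills in (ExampleEventManager, 'contrail-example',
# the table name, the socket path) is an assumption, not part of this module.
# ProcessInfo, ProcessStatus, ProcessState, and ProcessStateNames are the
# sandesh-generated types the *_base helpers expect to be handed.
class ExampleEventManager(EventManager):  # hypothetical subclass
    def __init__(self, rule_file, discovery_server, discovery_port,
                 collector_addr, sandesh_global):
        EventManager.__init__(self, rule_file, discovery_server,
                              discovery_port, collector_addr, sandesh_global)
        self.module_id = 'contrail-example-nodemgr'   # assumed module name
        self.node_type = 'example'     # suffix of the process_state json file
        self.uve_node_type = 'example-node'
        self.table = 'ObjectExampleNode'              # assumed UVE table
        self.supervisor_serverurl = 'unix:///tmp/supervisord_example.sock'

    # forward the concrete sandesh types into the shared *_base helpers
    def send_process_state_db(self, group_names):
        self.send_process_state_db_base(group_names, ProcessInfo)

    def send_nodemgr_process_status(self):
        self.send_nodemgr_process_status_base(
            ProcessStateNames, ProcessState, ProcessStatus)

    def get_process_state(self, fail_status_bits):
        return self.get_process_state_base(
            fail_status_bits, ProcessStateNames, ProcessState)

    def get_package_name(self):
        return 'contrail-example'                     # assumed package name

    def get_node_third_party_process_dict(self):
        return {}   # no processes outside supervisord on this node type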
class EventManager(object):
    rules_data = []
    group_names = []
    process_state_db = {}
    FAIL_STATUS_DUMMY = 0x1
    FAIL_STATUS_DISK_SPACE = 0x2
    FAIL_STATUS_SERVER_PORT = 0x4
    FAIL_STATUS_NTP_SYNC = 0x8
    FAIL_STATUS_DISK_SPACE_NA = 0x10

    def __init__(self, rule_file, discovery_server, discovery_port,
                 collector_addr, sandesh_global, send_build_info=False):
        self.stdin = sys.stdin
        self.stdout = sys.stdout
        self.stderr = sys.stderr
        self.rule_file = rule_file
        self.rules_data = ''
        self.max_cores = 4
        self.max_old_cores = 3
        self.max_new_cores = 1
        self.all_core_file_list = []
        self.core_dir_modified_time = 0
        self.tick_count = 0
        self.fail_status_bits = 0
        self.prev_fail_status_bits = 1
        self.instance_id = INSTANCE_ID_DEFAULT
        self.discovery_server = discovery_server
        self.discovery_port = discovery_port
        self.collector_addr = collector_addr
        self.listener_nodemgr = EventListenerProtocolNodeMgr()
        self.sandesh_global = sandesh_global
        self.curr_build_info = None
        self.new_build_info = None
        self.send_build_info = send_build_info

    # Get all the current processes in the node
    def get_current_process(self):
        proxy = xmlrpclib.ServerProxy(
            'http://127.0.0.1',
            transport=supervisor.xmlrpc.SupervisorTransport(
                None, None, serverurl=self.supervisor_serverurl))
        # Add all current processes to make sure nothing misses the radar
        process_state_db = {}
        for proc_info in proxy.supervisor.getAllProcessInfo():
            if proc_info['name'] != proc_info['group']:
                proc_name = proc_info['group'] + ":" + proc_info['name']
            else:
                proc_name = proc_info['name']
            process_stat_ent = self.get_process_stat_object(proc_name)
            process_stat_ent.process_state = "PROCESS_STATE_" + \
                proc_info['statename']
            if process_stat_ent.process_state == 'PROCESS_STATE_RUNNING':
                process_stat_ent.start_time = str(proc_info['start'] * 1000000)
                process_stat_ent.start_count += 1
            process_state_db[proc_name] = process_stat_ent
        return process_state_db
    # end get_current_process

    # Add the current processes in the node to db
    def add_current_process(self):
        self.process_state_db = self.get_current_process()
    # end add_current_process

    # In case the processes in the node change, update current processes
    def update_current_process(self):
        process_state_db = self.get_current_process()
        old_process_set = set(self.process_state_db.keys())
        new_process_set = set(process_state_db.keys())
        common_process_set = new_process_set.intersection(old_process_set)
        added_process_set = new_process_set - common_process_set
        deleted_process_set = old_process_set - common_process_set
        for deleted_process in deleted_process_set:
            self.delete_process_handler(deleted_process)
        for added_process in added_process_set:
            self.add_process_handler(
                added_process, process_state_db[added_process])
    # end update_current_process

    # process is deleted, send state & remove it from db
    def delete_process_handler(self, deleted_process):
        self.process_state_db[deleted_process].deleted = True
        group_val = self.process_state_db[deleted_process].group
        self.send_process_state_db([group_val])
        del self.process_state_db[deleted_process]
    # end delete_process_handler

    # new process added, update db & send state
    def add_process_handler(self, added_process, process_info):
        self.process_state_db[added_process] = process_info
        group_val = self.process_state_db[added_process].group
        self.send_process_state_db([group_val])
    # end add_process_handler

    def get_discovery_client(self):
        return client.DiscoveryClient(
            self.discovery_server, self.discovery_port, self.module_id)

    def check_ntp_status(self):
        ntp_status_cmd = 'ntpq -n -c pe | grep "^*"'
        proc = Popen(ntp_status_cmd, shell=True, stdout=PIPE, stderr=PIPE)
        (output, errout) = proc.communicate()
        if proc.returncode != 0:
            self.fail_status_bits |= self.FAIL_STATUS_NTP_SYNC
        else:
            self.fail_status_bits &= ~self.FAIL_STATUS_NTP_SYNC
        self.send_nodemgr_process_status()

    def _add_build_info(self, node_status):
        # Retrieve build_info from the package/rpm and cache it
        if self.curr_build_info is None:
            command = "contrail-version contrail-nodemgr | grep contrail-nodemgr"
            version = os.popen(command).read()
            version_partials = version.split()
            # guard against a short/empty result so the unpack cannot raise
            if len(version_partials) < 3:
                sys.stderr.write('Not enough values to parse package version %s\n'
                                 % version)
                return
            _, rpm_version, build_num = version_partials
            self.new_build_info = build_info + '"build-id" : "' + \
                rpm_version + '", "build-number" : "' + \
                build_num + '"}]}'
            if self.new_build_info != self.curr_build_info:
                self.curr_build_info = self.new_build_info
        node_status.build_info = self.curr_build_info

    def update_process_core_file_list(self):
        #LOG_DEBUG sys.stderr.write('update_process_core_file_list: begin:')
        ret_value = False
        try:
            ls_command = "ls -1 /var/crashes"
            (corenames, stderr) = Popen(
                ls_command.split(), stdout=PIPE).communicate()
            process_state_db_tmp = {}
            for key in self.process_state_db:
                #LOG_DEBUG sys.stderr.write('update_process_core_file_list: key: '+key+'\n')
                proc_stat = self.get_process_stat_object(key)
                process_state_db_tmp[key] = proc_stat
            #LOG_DEBUG sys.stderr.write('update_process_core_file_list: corenames: '+corenames+'\n')
            # core file names look like core.<exec-name>.<pid>...
            for corename in corenames.split():
                exec_name = corename.split('.')[1]
                for key in self.process_state_db:
                    if key.startswith(exec_name):
                        #LOG_DEBUG sys.stderr.write('update_process_core_file_list: startswith: '+exec_name+'\n')
                        process_state_db_tmp[key].core_file_list.append(
                            corename.rstrip())
            for key in self.process_state_db:
                if set(process_state_db_tmp[key].core_file_list) != \
                        set(self.process_state_db[key].core_file_list):
                    self.process_state_db[key].core_file_list = \
                        process_state_db_tmp[key].core_file_list
                    ret_value = True
        except Exception as e:
            sys.stderr.write('update_process_core_file_list: exception: ' +
                             str(e))
        #LOG_DEBUG sys.stderr.write('update_process_core_file_list: ret_value: '+str(ret_value)+'\n')
        return ret_value
    # end update_process_core_file_list

    def send_process_state_db_base(self, group_names, ProcessInfo,
                                   NodeStatus, NodeStatusUVE):
        for group in group_names:
            process_infos = []
            delete_status = True
            for key in self.process_state_db:
                pstat = self.process_state_db[key]
                if pstat.group != group:
                    continue
                process_info = ProcessInfo()
                process_info.process_name = key
                process_info.process_state = pstat.process_state
                process_info.start_count = pstat.start_count
                process_info.stop_count = pstat.stop_count
                process_info.exit_count = pstat.exit_count
                process_info.last_start_time = pstat.start_time
                process_info.last_stop_time = pstat.stop_time
                process_info.last_exit_time = pstat.exit_time
                process_info.core_file_list = pstat.core_file_list
                process_infos.append(process_info)
                if not pstat.deleted:
                    delete_status = False
            if not process_infos:
                continue
            # send node UVE
            node_status = NodeStatus()
            node_status.name = socket.gethostname()
            node_status.deleted = delete_status
            node_status.process_info = process_infos
            if self.send_build_info:
                self._add_build_info(node_status)
            node_status_uve = NodeStatusUVE(data=node_status)
            msg = ('send_process_state_db_base: Sending UVE:' +
                   str(node_status_uve))
            self.sandesh_global.logger().log(SandeshLogger.get_py_logger_level(
                SandeshLevel.SYS_INFO), msg)
            node_status_uve.send()

    def update_all_core_file(self):
        stat_command_option = "stat --printf=%Y /var/crashes"
        modified_time = Popen(
            stat_command_option.split(), stdout=PIPE).communicate()
        if modified_time[0] == self.core_dir_modified_time:
            return False
        self.core_dir_modified_time = modified_time[0]
        ls_command_option = "ls /var/crashes"
        (corename, stderr) = Popen(
            ls_command_option.split(), stdout=PIPE).communicate()
        self.all_core_file_list = corename.split('\n')[0:-1]
        self.send_process_state_db(self.group_names)
        return True

    def get_process_stat_object(self, pname):
        return ProcessStat(pname)

    def send_process_state(self, pname, pstate, pheaders):
        # update process stats
        if pname in self.process_state_db.keys():
            proc_stat = self.process_state_db[pname]
        else:
            proc_stat = self.get_process_stat_object(pname)
            if proc_stat.group not in self.group_names:
                self.group_names.append(proc_stat.group)
        proc_stat.process_state = pstate
        send_uve = False
        if pstate == 'PROCESS_STATE_RUNNING':
            proc_stat.start_count += 1
            proc_stat.start_time = str(int(time.time() * 1000000))
            send_uve = True
        if pstate == 'PROCESS_STATE_STOPPED':
            proc_stat.stop_count += 1
            send_uve = True
            proc_stat.stop_time = str(int(time.time() * 1000000))
            proc_stat.last_exit_unexpected = False
        if pstate == 'PROCESS_STATE_EXITED':
            proc_stat.exit_count += 1
            send_uve = True
            proc_stat.exit_time = str(int(time.time() * 1000000))
            if not int(pheaders['expected']):
                self.stderr.write(pname + " with pid:" + pheaders['pid'] +
                                  " exited abnormally\n")
                proc_stat.last_exit_unexpected = True
                # check for a core file for this exit
                find_command_option = \
                    "find /var/crashes -name core.[A-Za-z]*." + \
                    pheaders['pid'] + "*"
                self.stderr.write("find command option for cores:" +
                                  find_command_option + "\n")
                (corename, stderr) = Popen(
                    find_command_option.split(), stdout=PIPE).communicate()
                self.stderr.write("core file: " + corename + "\n")
                if corename is not None and len(corename.rstrip()) >= 1:
                    # before adding to the core file list make
                    # sure that we do not have too many cores
                    sys.stderr.write(
                        'core_file_list:' + str(proc_stat.core_file_list) +
                        ", self.max_cores:" + str(self.max_cores) + "\n")
                    if len(proc_stat.core_file_list) == self.max_cores:
                        # get rid of old cores
                        sys.stderr.write('max # of cores reached:' +
                                         str(self.max_cores) + "\n")
                        val = self.max_cores - self.max_new_cores + 1
                        core_files_to_be_deleted = \
                            proc_stat.core_file_list[self.max_old_cores:val]
                        sys.stderr.write('deleting core file list:' +
                                         str(core_files_to_be_deleted) + "\n")
                        for core_file in core_files_to_be_deleted:
                            sys.stderr.write('deleting core file:' +
                                             core_file + "\n")
                            try:
                                os.remove(core_file)
                            except OSError as e:
                                sys.stderr.write('ERROR: ' + str(e) + '\n')
                        # now delete the stale entries from the list as well
                        del proc_stat.core_file_list[self.max_old_cores:val]
                    # now add the new core to the core file list
                    proc_stat.core_file_list.append(corename.rstrip())
                    sys.stderr.write("# of cores for " + pname + ":" +
                                     str(len(proc_stat.core_file_list)) + "\n")
        # update process state database
        self.process_state_db[pname] = proc_stat
        with open('/var/log/contrail/process_state' +
                  self.node_type + ".json", 'w') as f:
            f.write(json.dumps(self.process_state_db,
                               default=lambda obj: obj.__dict__))
        if send_uve:
            self.send_process_state_db([proc_stat.group])

    def send_nodemgr_process_status_base(self, ProcessStateNames,
                                         ProcessState, ProcessStatus,
                                         NodeStatus, NodeStatusUVE):
        if self.prev_fail_status_bits != self.fail_status_bits:
            self.prev_fail_status_bits = self.fail_status_bits
            fail_status_bits = self.fail_status_bits
            state, description = self.get_process_state(fail_status_bits)
            process_status = ProcessStatus(
                module_id=self.module_id, instance_id=self.instance_id,
                state=state, description=description)
            process_status_list = [process_status]
            node_status = NodeStatus(name=socket.gethostname(),
                                     process_status=process_status_list)
            if self.send_build_info:
                self._add_build_info(node_status)
            node_status_uve = NodeStatusUVE(data=node_status)
            msg = ('send_nodemgr_process_status_base: Sending UVE:' +
                   str(node_status_uve))
            self.sandesh_global.logger().log(SandeshLogger.get_py_logger_level(
                SandeshLevel.SYS_INFO), msg)
            node_status_uve.send()

    def send_disk_usage_info_base(self, NodeStatusUVE, NodeStatus,
                                  DiskPartitionUsageStats):
        partition = subprocess.Popen(
            "df -T -t ext2 -t ext3 -t ext4 -t xfs",
            shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        disk_usage_infos = []
        for line in partition.stdout:
            if 'Filesystem' in line:
                continue
            partition_name = line.rsplit()[0]
            partition_type = line.rsplit()[1]
            partition_space_used_1k = line.rsplit()[3]
            partition_space_available_1k = line.rsplit()[4]
            disk_usage_stat = DiskPartitionUsageStats()
            try:
                disk_usage_stat.partition_type = str(partition_type)
                disk_usage_stat.partition_name = str(partition_name)
                disk_usage_stat.partition_space_used_1k = \
                    int(partition_space_used_1k)
                disk_usage_stat.partition_space_available_1k = \
                    int(partition_space_available_1k)
                total_disk_space = \
                    disk_usage_stat.partition_space_used_1k + \
                    disk_usage_stat.partition_space_available_1k
                disk_usage_stat.percentage_partition_space_used = \
                    int(round((float(disk_usage_stat.partition_space_used_1k) /
                               float(total_disk_space)) * 100))
            except ValueError:
                sys.stderr.write("Failed to get local disk space usage\n")
            else:
                disk_usage_infos.append(disk_usage_stat)
        # send node UVE
        node_status = NodeStatus(name=socket.gethostname(),
                                 disk_usage_info=disk_usage_infos)
        # send other core files
        if self.update_all_core_file():
            node_status.all_core_file_list = self.all_core_file_list
        if self.send_build_info:
            self._add_build_info(node_status)
        node_status_uve = NodeStatusUVE(data=node_status)
        msg = ('send_disk_usage_info_base: Sending UVE:' +
               str(node_status_uve))
        self.sandesh_global.logger().log(SandeshLogger.get_py_logger_level(
            SandeshLevel.SYS_INFO), msg)
        node_status_uve.send()
    # end send_disk_usage_info

    def get_process_state_base(self, fail_status_bits,
                               ProcessStateNames, ProcessState):
        if fail_status_bits:
            state = ProcessStateNames[ProcessState.NON_FUNCTIONAL]
            description = self.get_failbits_nodespecific_desc(fail_status_bits)
            if description == "":
                if fail_status_bits & self.FAIL_STATUS_NTP_SYNC:
                    description += "NTP state unsynchronized."
        else:
            state = ProcessStateNames[ProcessState.FUNCTIONAL]
            description = ''
        return state, description

    def get_failbits_nodespecific_desc(self, fail_status_bits):
        return ""

    def event_process_state(self, pheaders, headers):
        msg = ("process:" + pheaders['processname'] + "," +
               "groupname:" + pheaders['groupname'] + "," +
               "eventname:" + headers['eventname'])
        self.sandesh_global.logger().log(
            SandeshLogger.get_py_logger_level(SandeshLevel.SYS_DEBUG), msg)
        pname = pheaders['processname']
        if pheaders['processname'] != pheaders['groupname']:
            pname = pheaders['groupname'] + ":" + pheaders['processname']
        self.send_process_state(pname, headers['eventname'], pheaders)
        for rules in self.rules_data['Rules']:
            if 'processname' in rules:
                if (rules['processname'] == pheaders['groupname'] and
                        rules['process_state'] == headers['eventname']):
                    msg = "got a hit with:" + str(rules)
                    self.sandesh_global.logger().log(
                        SandeshLogger.get_py_logger_level(
                            SandeshLevel.SYS_DEBUG), msg)
                    # do not make async calls
                    try:
                        ret_code = subprocess.call(
                            [rules['action']], shell=True,
                            stdout=self.stderr, stderr=self.stderr)
                    except Exception as e:
                        msg = ('Failed to execute action: ' +
                               rules['action'] + ' with err ' + str(e))
                        self.sandesh_global.logger().log(
                            SandeshLogger.get_py_logger_level(
                                SandeshLevel.SYS_ERR), msg)
                    else:
                        if ret_code:
                            msg = ('Execution of action ' + rules['action'] +
                                   ' returned err ' + str(ret_code))
                            self.sandesh_global.logger().log(
                                SandeshLogger.get_py_logger_level(
                                    SandeshLevel.SYS_ERR), msg)

    def event_process_communication(self, pdata):
        flag_and_value = pdata.partition(":")
        msg = "Flag:" + flag_and_value[0] + " Value:" + flag_and_value[2]
        self.sandesh_global.logger().log(
            SandeshLogger.get_py_logger_level(SandeshLevel.SYS_DEBUG), msg)
        for rules in self.rules_data['Rules']:
            if 'flag_name' in rules:
                if (rules['flag_name'] == flag_and_value[0] and
                        rules['flag_value'].strip() ==
                        flag_and_value[2].strip()):
                    msg = "got a hit with:" + str(rules)
                    self.sandesh_global.logger().log(
                        SandeshLogger.get_py_logger_level(
                            SandeshLevel.SYS_DEBUG), msg)
                    cmd_and_args = ['/usr/bin/bash', '-c', rules['action']]
                    subprocess.Popen(cmd_and_args)

    def event_tick_60(self, prev_current_time):
        self.tick_count += 1
        # send disk usage info periodically
        self.send_disk_usage_info()
        # typical ntp sync time is about 5 min - first time,
        # we scan only after 10 min
        if self.tick_count >= 10:
            self.check_ntp_status()
        if self.update_process_core_file_list():
            self.send_process_state_db(['default'])

        current_time = int(time.time())
        if abs(current_time - prev_current_time) > 300:
            # A clock jump was detected. Compute each timestamp's elapsed
            # time and subtract it from the new current time to get
            # updated values.
            sys.stderr.write("Time lapse detected " +
                             str(abs(current_time - prev_current_time)) +
                             "\n")
            for key in self.process_state_db:
                pstat = self.process_state_db[key]
                if pstat.start_time != '':
                    pstat.start_time = str(
                        int(current_time - (prev_current_time -
                            int(pstat.start_time) / 1000000)) * 1000000)
                if pstat.process_state == 'PROCESS_STATE_STOPPED':
                    if pstat.stop_time != '':
                        pstat.stop_time = str(
                            int(current_time - (prev_current_time -
                                int(pstat.stop_time) / 1000000)) * 1000000)
                if pstat.process_state == 'PROCESS_STATE_EXITED':
                    if pstat.exit_time != '':
                        pstat.exit_time = str(
                            int(current_time - (prev_current_time -
                                int(pstat.exit_time) / 1000000)) * 1000000)
                # update process state database
                self.process_state_db[key] = pstat
            try:
                json_file = ('/var/log/contrail/process_state' +
                             self.node_type + ".json")
                with open(json_file, 'w') as f:
                    f.write(json.dumps(self.process_state_db,
                                       default=lambda obj: obj.__dict__))
            except Exception:
                sys.stderr.write("Unable to write json\n")
            self.send_process_state_db(self.group_names)
        prev_current_time = int(time.time())
        return prev_current_time

    def runforever(self, test=False):
        prev_current_time = int(time.time())
        while True:
            # we explicitly use self.stdin, self.stdout, and self.stderr
            # instead of sys.* so we can unit test this code
            headers, payload = self.listener_nodemgr.wait(
                self.stdin, self.stdout)
            pheaders, pdata = childutils.eventdata(payload + '\n')
            # check for process state change events
            if headers['eventname'].startswith("PROCESS_STATE"):
                self.event_process_state(pheaders, headers)
            # check for flag value change events
            if headers['eventname'].startswith("PROCESS_COMMUNICATION"):
                self.event_process_communication(pdata)
            # do periodic events
            if headers['eventname'].startswith("TICK_60"):
                prev_current_time = self.event_tick_60(prev_current_time)
            self.listener_nodemgr.ok(self.stdout)
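
# runforever() above implements supervisord's event-listener protocol: wait()
# writes READY, blocks until supervisord pushes an event, and ok() answers
# RESULT 2 OK. For PROCESS_STATE_* events the payload is itself one header
# line; childutils.eventdata splits on the first newline, which is why the
# caller appends '\n'. A standalone sketch (the process names and pid below
# are made up for illustration):
#
#     from supervisor import childutils
#
#     payload = ('processname:contrail-collector groupname:analytics '
#                'from_state:RUNNING expected:0 pid:2766')
#     pheaders, pdata = childutils.eventdata(payload + '\n')
#     assert pheaders['processname'] == 'contrail-collector'
#     assert pheaders['expected'] == '0'  # '0' => unexpected exit, core scan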
class EventManager(object):
    rules_data = []
    group_names = []
    process_state_db = {}
    FAIL_STATUS_DUMMY = 0x1
    FAIL_STATUS_DISK_SPACE = 0x2
    FAIL_STATUS_SERVER_PORT = 0x4
    FAIL_STATUS_NTP_SYNC = 0x8
    FAIL_STATUS_DISK_SPACE_NA = 0x10

    def __init__(self, rule_file, discovery_server, discovery_port,
                 collector_addr):
        self.stdin = sys.stdin
        self.stdout = sys.stdout
        self.stderr = sys.stderr
        self.rule_file = rule_file
        self.rules_data = ''
        self.max_cores = 4
        self.max_old_cores = 3
        self.max_new_cores = 1
        self.all_core_file_list = []
        self.core_dir_modified_time = 0
        self.tick_count = 0
        self.fail_status_bits = 0
        self.prev_fail_status_bits = 1
        self.instance_id = INSTANCE_ID_DEFAULT
        self.discovery_server = discovery_server
        self.discovery_port = discovery_port
        self.collector_addr = collector_addr
        self.listener_nodemgr = EventListenerProtocolNodeMgr()
        self.sandesh_global = None

    # Get all the current processes in the node
    def get_current_process(self):
        proxy = xmlrpclib.ServerProxy(
            'http://127.0.0.1',
            transport=supervisor.xmlrpc.SupervisorTransport(
                None, None, serverurl=self.supervisor_serverurl))
        # Add all current processes to make sure nothing misses the radar
        process_state_db = {}
        for proc_info in proxy.supervisor.getAllProcessInfo():
            if proc_info['name'] != proc_info['group']:
                proc_name = proc_info['group'] + ":" + proc_info['name']
            else:
                proc_name = proc_info['name']
            process_stat_ent = self.get_process_stat_object(proc_name)
            process_stat_ent.process_state = "PROCESS_STATE_" + \
                proc_info['statename']
            if process_stat_ent.process_state == 'PROCESS_STATE_RUNNING':
                process_stat_ent.start_time = str(proc_info['start'] * 1000000)
                process_stat_ent.start_count += 1
            process_state_db[proc_name] = process_stat_ent
        return process_state_db
    # end get_current_process

    # Add the current processes in the node to db
    def add_current_process(self):
        self.process_state_db = self.get_current_process()
    # end add_current_process

    # In case the processes in the node change, update current processes
    def update_current_process(self):
        process_state_db = self.get_current_process()
        old_process_set = set(self.process_state_db.keys())
        new_process_set = set(process_state_db.keys())
        common_process_set = new_process_set.intersection(old_process_set)
        added_process_set = new_process_set - common_process_set
        deleted_process_set = old_process_set - common_process_set
        for deleted_process in deleted_process_set:
            self.delete_process_handler(deleted_process)
        for added_process in added_process_set:
            self.add_process_handler(
                added_process, process_state_db[added_process])
    # end update_current_process

    # process is deleted, send state & remove it from db
    def delete_process_handler(self, deleted_process):
        self.process_state_db[deleted_process].deleted = True
        group_val = self.process_state_db[deleted_process].group
        self.send_process_state_db([group_val])
        del self.process_state_db[deleted_process]
    # end delete_process_handler

    # new process added, update db & send state
    def add_process_handler(self, added_process, process_info):
        self.process_state_db[added_process] = process_info
        group_val = self.process_state_db[added_process].group
        self.send_process_state_db([group_val])
    # end add_process_handler

    def get_discovery_client(self):
        return client.DiscoveryClient(
            self.discovery_server, self.discovery_port, self.module_id)

    def check_ntp_status(self):
        ntp_status_cmd = 'ntpq -n -c pe | grep "^*"'
        proc = Popen(ntp_status_cmd, shell=True, stdout=PIPE, stderr=PIPE)
        (output, errout) = proc.communicate()
        if proc.returncode != 0:
            self.fail_status_bits |= self.FAIL_STATUS_NTP_SYNC
        else:
            self.fail_status_bits &= ~self.FAIL_STATUS_NTP_SYNC
        self.send_nodemgr_process_status()

    def send_process_state_db_base(self, group_names, ProcessInfo,
                                   NodeStatus, NodeStatusUVE):
        name = socket.gethostname()
        for group in group_names:
            process_infos = []
            delete_status = True
            for key in self.process_state_db:
                pstat = self.process_state_db[key]
                if pstat.group != group:
                    continue
                process_info = ProcessInfo()
                process_info.process_name = key
                process_info.process_state = pstat.process_state
                process_info.start_count = pstat.start_count
                process_info.stop_count = pstat.stop_count
                process_info.exit_count = pstat.exit_count
                process_info.last_start_time = pstat.start_time
                process_info.last_stop_time = pstat.stop_time
                process_info.last_exit_time = pstat.exit_time
                process_info.core_file_list = pstat.core_file_list
                process_infos.append(process_info)
                # in the tor-agent case, use the tor-agent name as the uve key
                name = pstat.name
                if not pstat.deleted:
                    delete_status = False
            if not process_infos:
                continue
            # send node UVE
            node_status = NodeStatus()
            node_status.name = name
            node_status.deleted = delete_status
            node_status.process_info = process_infos
            node_status.all_core_file_list = self.all_core_file_list
            node_status_uve = NodeStatusUVE(data=node_status)
            sys.stderr.write('Sending UVE:' + str(node_status_uve))
            node_status_uve.send()

    def send_all_core_file(self):
        stat_command_option = "stat --printf=%Y /var/crashes"
        modified_time = Popen(
            stat_command_option.split(), stdout=PIPE).communicate()
        if modified_time[0] == self.core_dir_modified_time:
            return
        self.core_dir_modified_time = modified_time[0]
        ls_command_option = "ls /var/crashes"
        (corename, stderr) = Popen(
            ls_command_option.split(), stdout=PIPE).communicate()
        self.all_core_file_list = corename.split('\n')[0:-1]
        self.send_process_state_db(self.group_names)

    def get_process_stat_object(self, pname):
        return ProcessStat(pname)

    def send_process_state(self, pname, pstate, pheaders):
        # update process stats
        if pname in self.process_state_db.keys():
            proc_stat = self.process_state_db[pname]
        else:
            proc_stat = self.get_process_stat_object(pname)
            if proc_stat.group not in self.group_names:
                self.group_names.append(proc_stat.group)
        proc_stat.process_state = pstate
        send_uve = False
        if pstate == 'PROCESS_STATE_RUNNING':
            proc_stat.start_count += 1
            proc_stat.start_time = str(int(time.time() * 1000000))
            send_uve = True
        if pstate == 'PROCESS_STATE_STOPPED':
            proc_stat.stop_count += 1
            send_uve = True
            proc_stat.stop_time = str(int(time.time() * 1000000))
            proc_stat.last_exit_unexpected = False
        if pstate == 'PROCESS_STATE_EXITED':
            proc_stat.exit_count += 1
            send_uve = True
            proc_stat.exit_time = str(int(time.time() * 1000000))
            if not int(pheaders['expected']):
                self.stderr.write(pname + " with pid:" + pheaders['pid'] +
                                  " exited abnormally\n")
                proc_stat.last_exit_unexpected = True
                # check for a core file for this exit
                find_command_option = \
                    "find /var/crashes -name core.[A-Za-z]*." + \
                    pheaders['pid'] + "*"
                self.stderr.write("find command option for cores:" +
                                  find_command_option + "\n")
                (corename, stderr) = Popen(
                    find_command_option.split(), stdout=PIPE).communicate()
                self.stderr.write("core file: " + corename + "\n")
                if corename is not None and len(corename.rstrip()) >= 1:
                    # before adding to the core file list make
                    # sure that we do not have too many cores
                    sys.stderr.write(
                        'core_file_list:' + str(proc_stat.core_file_list) +
                        ", self.max_cores:" + str(self.max_cores) + "\n")
                    if len(proc_stat.core_file_list) == self.max_cores:
                        # get rid of old cores
                        sys.stderr.write('max # of cores reached:' +
                                         str(self.max_cores) + "\n")
                        val = self.max_cores - self.max_new_cores + 1
                        core_files_to_be_deleted = \
                            proc_stat.core_file_list[self.max_old_cores:val]
                        sys.stderr.write('deleting core file list:' +
                                         str(core_files_to_be_deleted) + "\n")
                        for core_file in core_files_to_be_deleted:
                            sys.stderr.write('deleting core file:' +
                                             core_file + "\n")
                            try:
                                os.remove(core_file)
                            except OSError as e:
                                sys.stderr.write('ERROR: ' + str(e) + '\n')
                        # now delete the stale entries from the list as well
                        del proc_stat.core_file_list[self.max_old_cores:val]
                    # now add the new core to the core file list
                    proc_stat.core_file_list.append(corename.rstrip())
                    sys.stderr.write("# of cores for " + pname + ":" +
                                     str(len(proc_stat.core_file_list)) + "\n")
        # update process state database
        self.process_state_db[pname] = proc_stat
        with open('/var/log/contrail/process_state' +
                  self.node_type + ".json", 'w') as f:
            f.write(json.dumps(self.process_state_db,
                               default=lambda obj: obj.__dict__))
        if send_uve:
            self.send_process_state_db([proc_stat.group])

    def send_nodemgr_process_status_base(self, ProcessStateNames,
                                         ProcessState, ProcessStatus,
                                         NodeStatus, NodeStatusUVE):
        if self.prev_fail_status_bits != self.fail_status_bits:
            self.prev_fail_status_bits = self.fail_status_bits
            fail_status_bits = self.fail_status_bits
            state, description = self.get_process_state(fail_status_bits)
            process_status = ProcessStatus(
                module_id=self.module_id, instance_id=self.instance_id,
                state=state, description=description)
            process_status_list = [process_status]
            node_status = NodeStatus(name=socket.gethostname(),
                                     process_status=process_status_list)
            node_status_uve = NodeStatusUVE(data=node_status)
            sys.stderr.write('Sending UVE:' + str(node_status_uve))
            node_status_uve.send()

    def send_disk_usage_info_base(self, NodeStatusUVE, NodeStatus,
                                  DiskPartitionUsageStats):
        partition = subprocess.Popen(
            "df -T -t ext2 -t ext3 -t ext4 -t xfs",
            shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        disk_usage_infos = []
        for line in partition.stdout:
            if 'Filesystem' in line:
                continue
            partition_name = line.rsplit()[0]
            partition_type = line.rsplit()[1]
            partition_space_used_1k = line.rsplit()[3]
            partition_space_available_1k = line.rsplit()[4]
            disk_usage_stat = DiskPartitionUsageStats()
            try:
                disk_usage_stat.partition_type = str(partition_type)
                disk_usage_stat.partition_name = str(partition_name)
                disk_usage_stat.partition_space_used_1k = \
                    int(partition_space_used_1k)
                disk_usage_stat.partition_space_available_1k = \
                    int(partition_space_available_1k)
            except ValueError:
                sys.stderr.write("Failed to get local disk space usage\n")
            else:
                disk_usage_infos.append(disk_usage_stat)
        # send node UVE
        node_status = NodeStatus(name=socket.gethostname(),
                                 disk_usage_info=disk_usage_infos)
        node_status_uve = NodeStatusUVE(data=node_status)
        sys.stderr.write('Sending UVE:' + str(node_status_uve))
        node_status_uve.send()
    # end send_disk_usage_info

    def get_process_state_base(self, fail_status_bits,
                               ProcessStateNames, ProcessState):
        if fail_status_bits:
            state = ProcessStateNames[ProcessState.NON_FUNCTIONAL]
            description = self.get_failbits_nodespecific_desc(fail_status_bits)
            if description == "":
                if fail_status_bits & self.FAIL_STATUS_NTP_SYNC:
                    description += "NTP state unsynchronized."
        else:
            state = ProcessStateNames[ProcessState.FUNCTIONAL]
            description = ''
        return state, description

    def get_failbits_nodespecific_desc(self, fail_status_bits):
        return ""

    def event_process_state(self, pheaders, headers):
        self.stderr.write("process:" + pheaders['processname'] + "," +
                          "groupname:" + pheaders['groupname'] + "," +
                          "eventname:" + headers['eventname'] + '\n')
        pname = pheaders['processname']
        if pheaders['processname'] != pheaders['groupname']:
            pname = pheaders['groupname'] + ":" + pheaders['processname']
        self.send_process_state(pname, headers['eventname'], pheaders)
        for rules in self.rules_data['Rules']:
            if 'processname' in rules:
                if (rules['processname'] == pheaders['groupname'] and
                        rules['process_state'] == headers['eventname']):
                    self.stderr.write("got a hit with:" + str(rules) + '\n')
                    # do not make async calls
                    try:
                        ret_code = subprocess.call(
                            [rules['action']], shell=True,
                            stdout=self.stderr, stderr=self.stderr)
                    except Exception as e:
                        self.stderr.write(
                            'Failed to execute action: ' + rules['action'] +
                            ' with err ' + str(e) + '\n')
                    else:
                        if ret_code:
                            self.stderr.write(
                                'Execution of action ' + rules['action'] +
                                ' returned err ' + str(ret_code) + '\n')

    def event_process_communication(self, pdata):
        flag_and_value = pdata.partition(":")
        self.stderr.write("Flag:" + flag_and_value[0] +
                          " Value:" + flag_and_value[2] + "\n")
        for rules in self.rules_data['Rules']:
            if 'flag_name' in rules:
                if (rules['flag_name'] == flag_and_value[0] and
                        rules['flag_value'].strip() ==
                        flag_and_value[2].strip()):
                    self.stderr.write("got a hit with:" + str(rules) + '\n')
                    cmd_and_args = ['/usr/bin/bash', '-c', rules['action']]
                    subprocess.Popen(cmd_and_args)

    def event_tick_60(self, prev_current_time):
        self.tick_count += 1
        # send other core files
        self.send_all_core_file()
        # send disk usage info periodically
        self.send_disk_usage_info()
        # typical ntp sync time is about 5 min - first time,
        # we scan only after 10 min
        if self.tick_count >= 10:
            self.check_ntp_status()

        current_time = int(time.time())
        if abs(current_time - prev_current_time) > 300:
            # A clock jump was detected. Compute each timestamp's elapsed
            # time and subtract it from the new current time to get
            # updated values.
            sys.stderr.write("Time lapse detected " +
                             str(abs(current_time - prev_current_time)) +
                             "\n")
            for key in self.process_state_db:
                pstat = self.process_state_db[key]
                if pstat.start_time != '':
                    pstat.start_time = str(
                        int(current_time - (prev_current_time -
                            int(pstat.start_time) / 1000000)) * 1000000)
                if pstat.process_state == 'PROCESS_STATE_STOPPED':
                    if pstat.stop_time != '':
                        pstat.stop_time = str(
                            int(current_time - (prev_current_time -
                                int(pstat.stop_time) / 1000000)) * 1000000)
                if pstat.process_state == 'PROCESS_STATE_EXITED':
                    if pstat.exit_time != '':
                        pstat.exit_time = str(
                            int(current_time - (prev_current_time -
                                int(pstat.exit_time) / 1000000)) * 1000000)
                # update process state database
                self.process_state_db[key] = pstat
            try:
                json_file = ('/var/log/contrail/process_state' +
                             self.node_type + ".json")
                with open(json_file, 'w') as f:
                    f.write(json.dumps(self.process_state_db,
                                       default=lambda obj: obj.__dict__))
            except Exception:
                sys.stderr.write("Unable to write json\n")
            self.send_process_state_db(self.group_names)
        prev_current_time = int(time.time())
        return prev_current_time

    def runforever(self, test=False):
        prev_current_time = int(time.time())
        while True:
            # we explicitly use self.stdin, self.stdout, and self.stderr
            # instead of sys.* so we can unit test this code
            headers, payload = self.listener_nodemgr.wait(
                self.stdin, self.stdout)
            pheaders, pdata = childutils.eventdata(payload + '\n')
            # check for process state change events
            if headers['eventname'].startswith("PROCESS_STATE"):
                self.event_process_state(pheaders, headers)
            # check for flag value change events
            if headers['eventname'].startswith("PROCESS_COMMUNICATION"):
                self.event_process_communication(pdata)
            # do periodic events
            if headers['eventname'].startswith("TICK_60"):
                prev_current_time = self.event_tick_60(prev_current_time)
            self.listener_nodemgr.ok(self.stdout)
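
# A manager like the ones above is wired to supervisord as an event listener
# subscribed to process-state, communication, and TICK_60 events. The stanza
# and entry point below are a hypothetical sketch (the program name, paths,
# and a subclass analogous to the ExampleEventManager sketched earlier are
# assumptions, not shipped config); the constructor call matches the last
# variant's signature:
#
#     [eventlistener:example-nodemgr]
#     command=/usr/bin/example-nodemgr --rules /etc/contrail/example_rules.json
#     events=PROCESS_STATE,PROCESS_COMMUNICATION,TICK_60
#
#     if __name__ == '__main__':
#         manager = ExampleEventManager('/etc/contrail/example_rules.json',
#                                       '127.0.0.1', 5998, [])
#         manager.add_current_process()  # seed the db before events arrive
#         manager.runforever()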