def __operate(self, consumer):
    """Poll action messages from the consumer and dispatch each action.

    Each message value is a mapping of the form
    {action: {Mc.FIELD_SERVER_FULL_NAME: server_name, Mc.FIELD_SID: sid,
              Mc.FIELD_USER_NAME: user_name}}; the (integer) action id selects
    the handler from self.switcher. Unknown actions are silently skipped.
    A failing handler is logged and does not abort the remaining actions.

    :param consumer: kafka consumer delivering action messages
    """
    # removed: an old commented-out implementation that iterated the consumer
    # directly instead of using poll()
    app_opp_msg_pack = consumer.poll(update_offsets=True)
    if not app_opp_msg_pack:
        return
    for tp, messages in app_opp_msg_pack.items():
        for msg in messages:
            for action, info in msg.value.items():
                # action keys arrive as strings; switcher is keyed by int
                action_type = int(action)
                if action_type not in self.switcher:
                    continue
                Mu.log_info(self.__logger,
                            "Trying to perform action: {0}...".format(action))
                try:
                    Mu.log_debug(self.__logger,
                                 "Action detail: {0}".format(info))
                    self.switcher[action_type].operate(info)
                except Exception as ex:
                    # keep processing the rest of the batch on failure
                    Mu.log_warning_exc(
                        self.__logger,
                        "Perform action failed with {0}, action detail is {1}"
                        .format(ex, info))
                Mu.log_info(self.__logger,
                            "Action: {0} is done.".format(action))
def __analyze(self, consumer): """ process all the un-aggregated data from agent, produce aggregated data to db-operator :param consumer: kafka consumer """ # info structure is { info_type : {server_id : info_detail}} # eg : { "memory" : {1: info_detail, 2; info_detail} , "disk" : {3: info_detail : 7: info_detail} } info = {} # start flag for recording all statuses for different resources and server Ids # eg : { "memory" : { 1 : True, 8 : False} , "disk" : { 1 : True, 7 : False } } start_flag = {} for msg in consumer: try: if msg and msg.value: message = msg.value server_id = message[Mc.FIELD_SERVER_ID] info_analyzer = self.__get_info_analyzer(message) if server_id is not None and info_analyzer is not None: Mu.log_debug(self.__logger, message) info_type = info_analyzer.type() if DataAnalyzer.__is_header(message): # init the value, if previous ending is lost, all the previous messages with # same type and server id will be abandoned if info_type not in start_flag: start_flag[info_type] = {} start_flag[info_type][server_id] = True # init the info for the specific type if info_type.value not in info: info[info_type.value] = {} info[info_type.value][server_id] = { Mc.MSG_TYPE: info_type.value, Mc.MSG_INFO: {} } elif start_flag.get(info_type, {}).get( server_id, False) and DataAnalyzer.__is_ending(message): # done, analyze the message and put filtered message to queue self.__producer.send( self.__topic, info[info_type.value][server_id]) self.__producer.flush() Mu.log_debug( self.__logger, "Filtered message {0} was sent".format( info[info_type.value][server_id])) # reset the start flag and value start_flag[info_type][server_id] = False info[info_type.value][server_id] = {} elif start_flag.get(info_type, {}).get(server_id, False): # only process after get the header DataAnalyzer.__process( info_analyzer, message, info[info_type.value][server_id]) except Exception as ex: Mu.log_warning_exc( self.__logger, "Error occurred when analyzing message, Error: {0}".format( ex))
def shutdown_hana(self, ssh):
    """Fire an asynchronous "HDB stop" on the remote host over *ssh*.

    Does nothing in test mode. The command is detached via nohup so the
    ssh call returns immediately.
    """
    if Mu.is_test_mod():
        Mu.log_debug(self.__logger, "It's in test mode, skip shutting down hana.")
        return
    stop_cmd = 'nohup bash -lc "HDB stop" >/dev/null 2>&1 &'
    cmd_output = self.__ssh_exec_command(stop_cmd, ssh)
    Mu.log_debug(self.__logger, "shutting down hana, output:{0}".format(cmd_output))
def delivery(self, info_list, info_type, server_id):
    """Send a heartbeat plus one framed batch of monitoring info to the queue.

    A heartbeat record is always sent first. If info_list is non-empty the
    batch is framed as header / body / ending; the consumer discards any
    batch missing either marker (see the issue links below).

    :param info_list: list of info dicts to deliver (mutated in place: each
                      gets the type and server id added)
    :param info_type: resource type tag attached to every message
    :param server_id: id of the server the info belongs to
    """
    try:
        # send heartbeat data
        self.__producer.send(
            self.__topic_heartbeat, {
                Mc.FIELD_SERVER_ID: server_id,
                Mc.MSG_TYPE: info_type,
                Mc.MSG_TIME: Mu.generate_check_id()
            })
        self.__producer.flush()
        if info_list:
            # as producer doesn't support transaction
            # https://github.com/dpkp/kafka-python/issues/1396
            # https://github.com/dpkp/kafka-python/issues/1063
            # add message begin and end part, consumer will abandon all messages if missing begin or end
            # header
            Mu.log_debug(
                self.__logger,
                "Sending {0} message header to queue...".format(info_type))
            self.__producer.send(
                self.__topic,
                MsgProducerService.__get_message_header(info_type, server_id))
            # body
            for info in info_list:
                # for all messages, add type and server id
                info[Mc.MSG_TYPE] = info_type
                info[Mc.FIELD_SERVER_ID] = server_id
                Mu.log_debug(
                    self.__logger,
                    "Sending {0} info {1} to queue...".format(info_type, info))
                self.__producer.send(self.__topic, info)
                Mu.log_debug(
                    self.__logger,
                    "{0} info {1} is sent to queue...".format(info_type, info))
            # ending
            Mu.log_debug(
                self.__logger,
                "Sending {0} message ending to queue...".format(info_type))
            self.__producer.send(
                self.__topic,
                MsgProducerService.__get_message_ending(info_type, server_id))
            # single flush after the whole frame is queued
            self.__producer.flush()
            Mu.log_debug(
                self.__logger,
                "Sending {0} message to queue is finished...".format(info_type))
    except Exception as ex:
        Mu.log_error(
            self.__logger,
            "Some thing wrong when delivering, error: {0}".format(ex))
def clean_log_backup(self, ssh, sid):
    """Delete stale HANA log backup files for *sid* on the remote host.

    Skipped entirely in test mode. Only plain files older than ten days
    (mtime) under the instance backup directories are removed.
    """
    if Mu.is_test_mod():
        Mu.log_debug(
            self.__logger,
            "It's in test mode, skip cleaning log backup for {0}.".format(sid))
        return
    # -mtime +10 / -type f: only regular files untouched for >10 days
    self.__ssh_exec_command(
        'find /usr/sap/{0}/HDB[0-9][0-9]/backup -name "log_backup_*.*" -mtime +10 -type f -delete'.format(sid),
        ssh)
    Mu.log_debug(self.__logger, "cleaned log backup for {0}.".format(sid))
def __query_insert_batch(self, query, param_list):
    """Run *query* once per parameter tuple via executemany.

    Errors are logged and swallowed (best-effort write); the cursor is
    always closed in the finally block.
    """
    cur = None
    try:
        cur = self.connection.cursor()
        Mu.log_debug(self.__logger,
                     "query:{0}, param:{1}".format(query, param_list))
        cur.executemany(query, param_list)
    except Exception as ex:
        Mu.log_error(self.__logger,
                     "Query:{0} failed with error:{1}".format(query, ex))
        Mu.log_exception(self.__logger, traceback.format_exc())
    finally:
        if cur is not None:
            cur.close()
            Mu.log_debug(self.__logger, "Cursor closed.")
def __send_shutdown_message(self, server_name, sid, user_name):
    """Publish a SHUTDOWN action message for *sid* on *server_name*."""
    Mu.log_debug(
        self.__logger,
        "Sending shutdown message of {0} on {1} ...".format(sid, server_name))
    # build the action payload, then hand it to the producer
    message = AlarmOperator.__generate_action_message(
        server_name, sid, user_name, ActionType.SHUTDOWN.value)
    self.__producer.send(self.__topic, message)
    Mu.log_debug(
        self.__logger,
        "Shutdown message of {0} on {1} is sent".format(sid, server_name))
def monitoring(self, check_id):
    """Collect HANA instance info for this server and dispatch it.

    :param check_id: correlation id prepended to the collected batch
    """
    try:
        server_id = self.__server_id
        server_name = self._server_name
        Mu.log_debug(self.__logger,
                     "[{0}]Instance Monitoring begin...".format(check_id))
        Mu.log_debug(self.__logger,
                     "Trying to get instance info of {0}".format(server_name))
        instance_info = self._os_operator.get_all_hana_instance_info(server_id)
        Mu.log_debug(
            self.__logger,
            "Instance information of {0} is {1}".format(server_name, instance_info))
        if not instance_info:
            # nothing collected -> nothing to send
            Mu.log_debug(
                self.__logger,
                "Instance information for {0} is empty, skipped sending this info."
                .format(server_name))
        else:
            # the check id record rides at the head of the batch
            instance_info.insert(0, {Mc.FIELD_CHECK_ID: check_id})
            self.accept(MonitorResourceDispatcher(instance_info, server_id))
        Mu.log_info(
            self.__logger,
            "Instance Monitoring is done for {0}.".format(server_name))
    except Exception as ex:
        Mu.log_warning_exc(
            self.__logger,
            "Error Occurred when monitoring Instance, ERROR: {0}".format(ex))
def __restart_agent(self, server, server_id, mount_point, agent_path,
                    mem_interval, cpu_interval, disk_interval, instance_interval):
    """Open an ssh session to *server* and restart its monitoring agent."""
    user = Mc.get_ssh_default_user()
    password = Mc.get_ssh_default_password()
    with Mu.open_ssh_connection(self.__logger, self.__os_operator, server,
                                user, password) as ssh:
        Mu.log_debug(self.__logger, "Restarting {0}".format(server))
        self.__os_operator.restart_agent(
            ssh, server_id, mount_point, agent_path,
            mem_interval, cpu_interval, disk_interval, instance_interval)
        Mu.log_debug(self.__logger, "Restarting of {0} is done".format(server))
def __send_cleaning_message(self, server_name, sid, user_name):
    """Publish a CLEAN_LOG_BACKUP action message for *sid* on *server_name*."""
    Mu.log_debug(
        self.__logger,
        "Sending log backup cleaning message of {0} on {1} for {2} ...".format(
            sid, server_name, user_name))
    # build the action payload, then hand it to the producer
    message = AlarmOperator.__generate_action_message(
        server_name, sid, user_name, ActionType.CLEAN_LOG_BACKUP.value)
    self.__producer.send(self.__topic, message)
    Mu.log_debug(
        self.__logger,
        "Log backup cleaning message of {0} on {1} for {2} is sent".format(
            sid, server_name, user_name))
def __operate_disk(self, info):
    """Build the top-5 disk consumer report when free disk falls below threshold.

    :param info: aggregated disk message; Mc.MSG_INFO maps folder ->
                 {user_name: usage}, e.g. {"folder1": {"user1": 3245}, ...}
    :return: the enriched top-5 consumer dict, or None when disk usage is
             healthy or the input is invalid
    """
    server_id = info[Mc.FIELD_SERVER_ID]
    check_id = info[Mc.FIELD_CHECK_ID]
    disk_free = info[Mc.FIELD_DISK_FREE]
    disk_total = info[Mc.FIELD_DISK_TOTAL]
    # Validate BEFORE computing the threshold: disk_total/disk_free may be
    # None, which would crash the arithmetic below. This mirrors the
    # validation-first order used by __operate_memory.
    if disk_free is None or disk_free < 0 or disk_total is None or disk_total <= 0 or self.disk_threshold <= 0:
        return
    free_disk_threshold = (
        (100 - self.disk_threshold) * disk_total) / 100.0
    Mu.log_debug(
        self.__logger,
        "Server:{0}, check_id:{1}, free disk:{2}, threshold:{3}".format(
            server_id, check_id, disk_free, free_disk_threshold))
    # prepare all info if size of free disk < threshold
    if disk_free < free_disk_threshold:
        # {"folder": {"user1":3245}, "folder2":{"user2":222}, "folder3":{"user3":99999}}
        disk_consumers = list(info[Mc.MSG_INFO].items())
        disk_consumers.sort(key=lambda v: next(iter(v[1].values())),
                            reverse=True)  # sort by desc
        del disk_consumers[5:]  # only keep the top 5
        users = [next(iter(folder[1].keys())) for folder in disk_consumers]
        top_5_consumers = self.__get_users_info(server_id, check_id, users,
                                                InfoType.DISK, disk_free,
                                                disk_total)
        # combine usage info: join the folder usage with the user details
        folders_info = []
        for folder in disk_consumers:
            folder_info = {
                Mc.FIELD_FOLDER: folder[0],
                Mc.FIELD_USER_NAME: next(iter(folder[1].keys())),
                Mc.FIELD_USAGE: next(iter(folder[1].values()))
            }
            for user_info in top_5_consumers.get(Mc.INFO_USAGE, []):
                if folder_info[Mc.FIELD_USER_NAME] == user_info[Mc.FIELD_USER_NAME]:
                    folder_info.update(user_info)
            folders_info.append(folder_info)
        top_5_consumers[Mc.INFO_USAGE] = folders_info
        return top_5_consumers
def __query_select(self, query):
    """Execute a SELECT statement and return all rows.

    :param query: SQL text to execute
    :return: list of result rows; [] when the query fails (errors are
             logged, never raised)
    """
    cursor = None
    try:
        cursor = self.connection.cursor()
        # route through the shared Mu logging helper for consistency with
        # the other query methods (was: self.__logger.debug(...))
        Mu.log_debug(self.__logger, "query:{0}".format(query))
        cursor.execute(query)
        ret = cursor.fetchall()
        Mu.log_debug(self.__logger, "Result record count {0}".format(len(ret)))
        return ret
    except Exception as ex:
        Mu.log_error(self.__logger,
                     "Query:{0} failed with error:{1}".format(query, ex))
        Mu.log_exception(self.__logger, traceback.format_exc())
        return []
    finally:
        if cursor is not None:
            cursor.close()
            Mu.log_debug(self.__logger, "Cursor closed.")
def operate(self, parameter):
    """Log in to the target server as the instance user and stop HANA.

    :param parameter: action payload holding the server full name and the
                      OS user to log in as
    """
    server = parameter[Mc.FIELD_SERVER_FULL_NAME]
    user = parameter[Mc.FIELD_USER_NAME]
    with Mu.open_ssh_connection(self._logger, self._os_operator, server,
                                user, Mc.get_ssh_default_password()) as ssh:
        if ssh is None:
            # login failed (e.g. non-default password)
            # TODO: notify alarm operator because of the non-standard password ??
            Mu.log_warning(
                self._logger,
                "Failed to log in {0} with user {1}".format(server, user))
            return
        Mu.log_debug(
            self._logger,
            "Trying shutdown HANA on {0} for user {1}".format(server, user))
        self._os_operator.shutdown_hana(ssh)
def __operate_memory(self, info):
    """Build the top-5 memory consumer report when free memory is below threshold.

    :param info: aggregated memory message; Mc.MSG_INFO maps user -> usage,
                 e.g. {"user1": 12.2, "user2": 13.2}
    :return: enriched top-5 consumer dict, or None when memory is healthy
             or the input is invalid
    """
    mem_free = info[Mc.FIELD_MEM_FREE]
    mem_total = info[Mc.FIELD_MEM_TOTAL]
    server_id = info[Mc.FIELD_SERVER_ID]
    check_id = info[Mc.FIELD_CHECK_ID]
    # reject missing/negative figures and a disabled threshold up front
    if mem_free is None or mem_free < 0 or mem_total is None or mem_total <= 0 or self.mem_threshold <= 0:
        return
    free_mem_threshold = ((100 - self.mem_threshold) * mem_total) / 100.0
    Mu.log_debug(
        self.__logger,
        "Server:{0}, check_id:{1}, free Memory:{2}, threshold:{3}".format(
            server_id, check_id, mem_free, free_mem_threshold))
    if mem_free >= free_mem_threshold:
        return
    # keep only the five heaviest consumers, largest first
    heaviest = sorted(info[Mc.MSG_INFO].items(),
                      key=lambda item: item[1], reverse=True)[:5]
    users = [name for name, _ in heaviest]
    top_5_consumers = self.__get_users_info(server_id, check_id, users,
                                            InfoType.MEMORY, mem_free,
                                            mem_total)
    # attach each user's usage figure taken from the raw message
    usage_entries = top_5_consumers.get(Mc.INFO_USAGE, [])
    for user_info in usage_entries:
        user_info[Mc.FIELD_USAGE] = info[Mc.MSG_INFO][user_info[Mc.FIELD_USER_NAME]]
    usage_entries.sort(key=lambda v: v[Mc.FIELD_USAGE], reverse=True)
    return top_5_consumers
def __coordinating_monitors(self, consumer):
    """ Coordinating (start/stop/restart) all the agents
    :param consumer: kafka consumer
    """
    Mu.log_debug(self.__logger,
                 "Coordinator is listening on topic for configurations.")
    for msg in consumer:
        try:
            Mu.log_debug(self.__logger, "New configs are coming...")
            if self.__update_configs(msg.value):
                # any config change triggers a restart of every known agent
                intervals = (
                    self.__configs.get("CHECK_INTERVAL_MEM_INT", 60),
                    self.__configs.get("CHECK_INTERVAL_CPU_INT", 300),
                    self.__configs.get("CHECK_INTERVAL_DISK_INT", 3600),
                    self.__configs.get("CHECK_INTERVAL_INSTANCE_INT", 300),
                )
                for server in self.__configs.get(Mc.DB_CONFIGURATION_SERVER, []):
                    self.__restart_agent(
                        server[Mc.FIELD_SERVER_FULL_NAME],
                        server[Mc.FIELD_SERVER_ID],
                        server[Mc.FIELD_MOUNT_POINT],
                        Mc.get_agent_path(),
                        *intervals)
                if self.__check_configuration() and not self.__heartbeat_flag:
                    # spawn the heartbeat monitor exactly once
                    self.__heartbeat_flag = True
                    heartbeat_thread = threading.Thread(
                        target=self.__process_heartbeat)
                    heartbeat_thread.start()
        except Exception as ex:
            Mu.log_warning_exc(
                self.__logger,
                "Error occurred when coordinating the monitors, Err: {0}".
                format(ex))
def __operate_cpu(self, info):
    """Build the top-5 CPU consumer report when utilization exceeds threshold.

    :param info: aggregated CPU message; Mc.MSG_INFO maps user -> usage
    :return: enriched top-5 consumer dict, or None when CPU usage is
             healthy or the input is invalid
    """
    cpu_usage = info[Mc.FIELD_CPU_UTILIZATION]
    cpu_num = info[Mc.FIELD_CPU_NUMBER]
    server_id = info[Mc.FIELD_SERVER_ID]
    check_id = info[Mc.FIELD_CHECK_ID]
    # also validate cpu_num: it is used as a divisor below, so None or 0
    # would crash the usage calculation (mirrors the guards of the
    # memory/disk operators)
    if cpu_usage is None or cpu_usage < 0 or cpu_num is None or cpu_num <= 0 or self.cpu_threshold <= 0:
        return
    Mu.log_debug(
        self.__logger,
        "Server:{0}, check_id:{1}, cpu usage:{2}, threshold:{3}".format(
            server_id, check_id, cpu_usage, self.cpu_threshold))
    # prepare all info if cpu usage >= threshold
    if cpu_usage >= self.cpu_threshold:
        cpu_consumers = list(info[Mc.MSG_INFO].items())
        cpu_consumers.sort(key=lambda v: v[1], reverse=True)  # sort by desc
        del cpu_consumers[5:]  # only keep the top 5
        users = [user[0] for user in cpu_consumers]
        top_5_consumers = self.__get_users_info(server_id, check_id, users,
                                                InfoType.CPU,
                                                100 - cpu_usage, -1)
        # combine usage info
        for user_info in top_5_consumers.get(Mc.INFO_USAGE, []):
            # set the usage {"user1": 12.2, "user2": 13.2};
            # normalize raw per-user figures by the CPU count
            user_info[Mc.FIELD_USAGE] = info[Mc.MSG_INFO][user_info[
                Mc.FIELD_USER_NAME]] / float(cpu_num)
        top_5_consumers.get(Mc.INFO_USAGE,
                            []).sort(key=lambda v: v[Mc.FIELD_USAGE],
                                     reverse=True)
        return top_5_consumers
def monitoring(self, check_id):
    """Collect memory totals plus per-consumer usage and dispatch them.

    :param check_id: correlation id included in the overview record
    """
    try:
        server_id = self.__server_id
        server_name = self._server_name
        Mu.log_debug(self.__logger,
                     "[{0}]Memory Monitoring begin...".format(check_id))
        Mu.log_debug(
            self.__logger,
            "Trying to get memory overview of {0}".format(server_name))
        # overview figures first: total and free memory
        total_mem, free_mem = self._os_operator.collect_mem_info(server_name)
        Mu.log_debug(
            self.__logger,
            "Memory overview of {0} is (total:{1}, free:{2})".format(
                server_name, total_mem, free_mem))
        consumers = self._os_operator.get_mem_consumers(server_name)
        Mu.log_debug(
            self.__logger,
            "memory consuming information for {0}:{1}".format(
                server_name, consumers))
        # the overview record rides at the head of the per-consumer list
        consumers.insert(0, {
            Mc.FIELD_CHECK_ID: check_id,
            Mc.FIELD_MEM_TOTAL: total_mem,
            Mc.FIELD_MEM_FREE: free_mem
        })
        self.accept(MonitorResourceDispatcher(consumers, server_id))
        Mu.log_info(
            self.__logger,
            "Memory Monitoring is done for {0}.".format(server_name))
    except Exception as ex:
        Mu.log_warning_exc(
            self.__logger,
            "Error Occurred when performing Memory monitoring, ERROR: {0}".
            format(ex))
def monitoring(self, check_id):
    """Collect disk totals plus per-consumer usage and dispatch them.

    :param check_id: correlation id included in the overview record
    """
    try:
        server_id = self.__server_id
        server_name = self._server_name
        mount_point = self.__mount_point
        Mu.log_debug(self.__logger,
                     "[{0}]Disk Monitoring begin...".format(check_id))
        Mu.log_debug(
            self.__logger,
            "Trying to get disk overview of {0}".format(server_name))
        # overview figures first: total and free disk space on the mount
        total_disk, free_disk = self._os_operator.collect_disk_info(
            server_name, mount_point)
        Mu.log_debug(
            self.__logger,
            "Disk overview of {0} is (total:{1}, free:{2})".format(
                server_name, total_disk, free_disk))
        consumers = self._os_operator.get_disk_consumers(
            server_name, mount_point)
        Mu.log_debug(
            self.__logger,
            "Disk consuming information for {0}:{1}".format(
                server_name, consumers))
        # the overview record rides at the head of the per-consumer list
        consumers.insert(0, {
            Mc.FIELD_CHECK_ID: check_id,
            Mc.FIELD_DISK_TOTAL: total_disk,
            Mc.FIELD_DISK_FREE: free_disk
        })
        self.accept(MonitorResourceDispatcher(consumers, server_id))
        Mu.log_info(self.__logger,
                    "Disk Monitoring is done for {0}.".format(server_name))
    except Exception as ex:
        Mu.log_warning_exc(
            self.__logger,
            "Error Occurred when performing Disk monitoring, ERROR: {0}".
            format(ex))
def monitoring(self, check_id):
    """Collect CPU overview plus per-consumer usage and dispatch them.

    :param check_id: correlation id included in the overview record
    """
    try:
        server_id = self.__server_id
        server_name = self._server_name
        Mu.log_debug(self.__logger,
                     "[{0}]CPU Monitoring begin...".format(check_id))
        Mu.log_debug(
            self.__logger,
            "Trying to get CPU overview of {0}".format(server_name))
        # overview figures first: core count and overall utilization
        num_cores, utilization = self._os_operator.collect_cpu_info(
            server_name)
        Mu.log_debug(
            self.__logger,
            "CPU overview of {0} is (num:{1}, usage:{2})".format(
                server_name, num_cores, utilization))
        consumers = self._os_operator.get_cpu_consumers(server_name)
        Mu.log_debug(
            self.__logger,
            "CPU consuming information for {0}:{1}".format(
                server_name, consumers))
        # the overview record rides at the head of the per-consumer list
        consumers.insert(0, {
            Mc.FIELD_CHECK_ID: check_id,
            Mc.FIELD_CPU_NUMBER: num_cores,
            Mc.FIELD_CPU_UTILIZATION: utilization
        })
        self.accept(MonitorResourceDispatcher(consumers, server_id))
        Mu.log_info(self.__logger,
                    "CPU Monitoring is done for {0}.".format(server_name))
    except Exception as ex:
        Mu.log_warning_exc(
            self.__logger,
            "Error Occurred when performing CPU monitoring, ERROR: {0}".
            format(ex))
def __monitoring_configurations(self, operator):
    """Diff the latest configs against the cached ones; publish any changes.

    :param operator: source used by __get_configurations to fetch configs
    """
    latest = self.__get_configurations(operator)
    # collect only the entries whose value differs from the cache
    updated_configs = {
        key: value
        for key, value in latest.items()
        if value != self.__configs.get(key, None)
    }
    self.__configs.update(updated_configs)
    if not updated_configs:
        Mu.log_debug(self.__logger, "No update for configurations ...")
        return
    Mu.log_debug(
        self.__logger,
        "Sending updated configs {0} to queue...".format(updated_configs))
    self.__producer.send(self.__topic, updated_configs)
    # block until all async messages are sent
    self.__producer.flush()
    Mu.log_debug(self.__logger, "Sent updated configs to queue.")
def publish(self):
    """Upload the agent files to every remote server (the local host is skipped)."""
    user = Mc.get_ssh_default_user()
    password = Mc.get_ssh_default_password()
    host_name = os.uname()[1]
    for server in self.servers:
        if host_name in server:
            Mu.log_debug(None, "Skipping local server on {0}".format(server))
            continue
        with Mu.open_ssh_connection(None, self.__os_operator, server,
                                    user, password) as ssh:
            Mu.log_debug(None, "Publishing agent on {0}".format(server))
            for file_name in self.files:
                # Currently, path for source and target is the same
                agent_file = self.path + file_name
                self.__os_operator.upload_file(ssh, agent_file, agent_file)
            Mu.log_debug(None,
                         "Publishing agent on {0} is done".format(server))
def __operate(self, consumer):
    """Main alarm loop: consume filtered resource messages and react.

    For each message: configuration messages update local config (and, once
    the config is complete, switch the consumer assignment and start the
    heartbeat thread); resource messages are fed to the matching
    __operate_* handler, and the resulting top-5 consumer report drives
    warning emails, escalation emails, and shutdown / cleanup actions.
    """
    # dispatch table: resource type -> report builder
    operators = {
        InfoType.MEMORY.value: self.__operate_memory,
        InfoType.CPU.value: self.__operate_cpu,
        InfoType.DISK.value: self.__operate_disk
    }
    # alarm: server_id -> msg_type -> {last alarm time, alarm count}
    alarm = {}
    # emergency_alarm: server_id -> time of last emergency shutdown
    emergency_alarm = {}
    Mu.log_info(self.__logger, "Start processing alarm.")
    for msg in consumer:
        if not msg or not msg.value:
            continue
        try:
            # process filtered message
            if Mc.MSG_TYPE not in msg.value or msg.value[
                    Mc.MSG_TYPE] not in operators:
                # update configuration
                self.__update_configuration(msg.value)
                # if configuration is ready, update subscription (all previous filtered info will be skipped)
                if self.__check_configuration() and len(
                        consumer.assignment()) < 2:
                    # start heartbeat checking
                    # use assign instead subscribe because the error:
                    # https://github.com/dpkp/kafka-python/issues/601
                    Ku.assign_and_seek_to_end(
                        consumer, Mc.TOPIC_FILTERED_INFO,
                        *[Mc.TOPIC_FILTERED_INFO, Mc.TOPIC_CONFIGURATION])
                    heartbeat_thread = threading.Thread(
                        target=self.__process_heartbeat)
                    heartbeat_thread.start()
            else:
                # if configuration is not initialized, all data will be ignored
                top5_consumers = operators[msg.value[Mc.MSG_TYPE]](msg.value)
                server_id = msg.value[Mc.FIELD_SERVER_ID]
                msg_type = msg.value[Mc.MSG_TYPE]
                if top5_consumers:
                    server_name = top5_consumers[Mc.FIELD_SERVER_FULL_NAME]
                    # calculate emergency status
                    if msg_type == InfoType.MEMORY.value:
                        mem_free = top5_consumers[Mc.INFO_FREE]
                        mem_total = top5_consumers[Mc.INFO_TOTAL]
                        # calculate emergency status
                        if float(
                                mem_free
                        ) / mem_total * 100 <= 100 - self.mem_emergency_threshold:
                            cur_time = datetime.now()
                            pre_time = emergency_alarm.get(
                                server_id, cur_time)
                            if cur_time != pre_time and (
                                    cur_time - pre_time
                            ).total_seconds() < self.check_interval:
                                # only perform emergency shutdown every configured interval
                                continue
                            emergency_alarm[server_id] = cur_time
                            try:
                                # NOTE(review): unlike the escalation path
                                # below, no msg_type argument is passed here
                                # — presumably the parameter has a default;
                                # confirm against the helper's signature
                                email, employee_name, user_name, sid, mem_usage = \
                                    AlarmOperator.__get_highest_consumption_info(top5_consumers[Mc.INFO_USAGE])
                            except Exception as ex:
                                Mu.log_warning(
                                    self.__logger,
                                    "Call "
                                    "__get_highest_consumption_info for {0} failed with exception {1}."
                                    .format(top5_consumers[Mc.INFO_USAGE], ex))
                                continue
                            # trigger the emergency shutdown
                            admin = self.__db_operator.get_email_admin(
                                server_id)
                            Mu.log_info(
                                self.__logger,
                                "Try to sending emergency shutdown email for {0} on {1}, because server "
                                "is running out of memory and {2} is consuming highest "
                                "({3}%) memory.".format(
                                    sid, server_name, user_name, mem_usage))
                            # sending email to the owner of the instance
                            Email.send_emergency_shutdown_email(
                                self.email_sender, email, sid, server_name,
                                employee_name, admin, mem_usage,
                                InfoType.MEMORY)
                            self.__send_shutdown_message(
                                server_name, sid, user_name)
                            # no need to check further
                            continue
                        else:
                            # reset the emergency alarm for the server if it is not in emergency status
                            emergency_alarm.pop(server_id, None)
                    # If it's not working time, skip sending email and shutdown
                    if not Mu.is_current_time_working_time(
                            self.operation_time):
                        Mu.log_info(
                            self.__logger,
                            "Skip alarm operations because of the non-working time."
                        )
                        continue
                    email_flag = 0
                    # update alarm info
                    if server_id not in alarm:
                        alarm[server_id] = {
                            msg_type: {
                                Mc.INFO_ALARM_TIME: datetime.now(),
                                Mc.INFO_ALARM_NUM: 0
                            }
                        }
                    elif msg_type not in alarm[server_id]:
                        alarm[server_id][msg_type] = {
                            Mc.INFO_ALARM_TIME: datetime.now(),
                            Mc.INFO_ALARM_NUM: 0
                        }
                    if alarm[server_id][msg_type][Mc.INFO_ALARM_NUM] == 0:
                        # first alarm for this server/type: always email
                        alarm[server_id][msg_type][Mc.INFO_ALARM_NUM] = 1
                        alarm[server_id][msg_type][
                            Mc.INFO_ALARM_TIME] = datetime.now()
                        # send email
                        email_flag = 1
                    else:
                        pre_time = alarm[server_id][msg_type][
                            Mc.INFO_ALARM_TIME]
                        cur_time = datetime.now()
                        # every checking interval sending next alarm mail
                        if (cur_time - pre_time
                                ).total_seconds() >= self.check_interval:
                            alarm[server_id][msg_type][
                                Mc.INFO_ALARM_NUM] += 1
                            alarm[server_id][msg_type][
                                Mc.INFO_ALARM_TIME] = cur_time
                            # after too many repeated alarms, escalate (2)
                            if alarm[server_id][msg_type][
                                    Mc.INFO_ALARM_NUM] > self.max_failure_times:
                                email_flag = 2
                            else:
                                email_flag = 1
                    if email_flag >= 1:
                        # sending email
                        Mu.log_debug(
                            self.__logger,
                            "Top 5 Consumers of server {1} ({2}): {0}".
                            format(top5_consumers, server_id, msg_type))
                        email_to = [
                            c[Mc.FIELD_EMAIL]
                            for c in top5_consumers[Mc.INFO_USAGE]
                            if c.get(Mc.FIELD_EMAIL, None)
                        ]
                        admin = self.__db_operator.get_email_admin(
                            server_id)
                        Mu.log_debug(
                            self.__logger,
                            "Server {0}:{1} Sending email to:{2}".format(
                                server_id, msg_type, email_to))
                        Email.send_warning_email(
                            self.email_sender, email_to,
                            top5_consumers[Mc.MSG_TYPE], server_name,
                            top5_consumers, admin)
                    if email_flag == 2:
                        # escalation: notify the top consumer's owner and act
                        try:
                            email, employee_name, user_name, sid, usage = AlarmOperator\
                                .__get_highest_consumption_info(top5_consumers[Mc.INFO_USAGE], msg_type)
                            admin = self.__db_operator.get_email_admin(
                                server_id)
                        except Exception as ex:
                            Mu.log_warning(
                                self.__logger,
                                "Call __get_highest_consumption_info for {0} failed with exception {1}."
                                .format(top5_consumers[Mc.INFO_USAGE], ex))
                            continue
                        if msg_type == InfoType.MEMORY.value:
                            # sending email to the owner of the instance
                            Mu.log_info(
                                self.__logger,
                                "Try to sending shutdown email for {0} on {1}, because server "
                                "is running out of memory and {2} is consuming highest "
                                "({3}%) memory.".format(
                                    sid, server_name, user_name, usage))
                            Email.send_shutdown_email(
                                self.email_sender, email, sid, server_name,
                                employee_name, admin, usage,
                                InfoType.MEMORY)
                            # trigger the shutdown --> send shutdown message
                            self.__send_shutdown_message(
                                server_name, sid, user_name)
                        elif msg_type == InfoType.DISK.value:
                            # sending email to the owner of the instance
                            Mu.log_info(
                                self.__logger,
                                "Try to sending email for {0} on {1}, because server "
                                "is running out of Disk and {2} is consuming highest "
                                "({3}K) disk space.".format(
                                    sid, server_name, user_name, usage))
                            Email.send_cleaning_disk_email(
                                self.email_sender, email, sid, server_name,
                                employee_name, admin, usage, InfoType.DISK)
                            # trigger the cleanup --> send cleaning message
                            self.__send_cleaning_message(
                                server_name, sid, user_name)
                else:
                    # everything is good, reset the alarm for server_id and msg type
                    if server_id in alarm and msg_type in alarm[server_id]:
                        alarm[server_id][msg_type][Mc.INFO_ALARM_NUM] = 0
        except Exception as ex:
            # never let one bad message kill the alarm loop
            Mu.log_warning_exc(
                self.__logger,
                "Processing alarm failed with {0}.".format(ex))
def close_connection(self):
    """Close the database connection if one is open (safe to call twice)."""
    if hasattr(self, "connection") and self.connection is not None:
        self.connection.close()
        # drop the reference so a second call is a no-op
        self.connection = None
        # fixed log message typo (was: "Connecting is closed.")
        Mu.log_debug(self.__logger, "Connection is closed.")