def __operate(self, consumer):
        """Poll the consumer once and perform the action of every message.

        Each message value is iterated as a mapping of the form
        ``{action: {Mc.FIELD_SERVER_FULL_NAME: server_name,
        Mc.FIELD_SID: sid, Mc.FIELD_USER_NAME: user_name}}``. The action key
        is converted to ``int`` and looked up in ``self.switcher``; actions
        without a registered operator are ignored.

        The poll is issued with ``update_offsets=True``, so an exception that
        escapes this method would lose the rest of the polled batch. For that
        reason an action key that is not an integer is logged and skipped
        instead of being raised, and a failing operator is only logged.

        :param consumer: kafka consumer to poll from
        """
        app_opp_msg_pack = consumer.poll(update_offsets=True)
        if not app_opp_msg_pack:
            return

        # only the message lists are needed, not the topic-partition keys
        for messages in app_opp_msg_pack.values():
            for msg in messages:
                for action, info in msg.value.items():
                    try:
                        action_type = int(action)
                    except (TypeError, ValueError):
                        Mu.log_warning(
                            self.__logger,
                            "Skipping action with invalid type: {0}".format(
                                action))
                        continue

                    if action_type not in self.switcher:
                        continue

                    Mu.log_info(
                        self.__logger,
                        "Trying to perform action: {0}...".format(action))
                    try:
                        Mu.log_debug(self.__logger,
                                     "Action detail: {0}".format(info))
                        self.switcher[action_type].operate(info)
                    except Exception as ex:
                        Mu.log_warning_exc(
                            self.__logger,
                            "Perform action failed with {0}, action detail is {1}"
                            .format(ex, info))
                    else:
                        # report completion only when the operator succeeded
                        Mu.log_info(self.__logger,
                                    "Action: {0} is done.".format(action))
예제 #2
0
    def __analyze(self, consumer):
        """
        Aggregate the raw messages coming from the agents and forward one
        aggregated message per (info type, server id) to the producer topic.

        A group of messages is framed by a header and an ending message.
        Messages are only accumulated after the header was seen, and the
        accumulated data is sent when the ending arrives.

        :param consumer: kafka consumer
        """
        # aggregated data: { info_type_value : { server_id : info_detail } }
        # eg : { "memory" : {1: info_detail, 2: info_detail}, "disk" : {3: info_detail, 7: info_detail} }
        info = {}
        # whether a header was received: { info_type : { server_id : bool } }
        # eg : { "memory" : {1: True, 8: False}, "disk" : {1: True, 7: False} }
        start_flag = {}
        for msg in consumer:
            try:
                if not (msg and msg.value):
                    continue

                message = msg.value
                server_id = message[Mc.FIELD_SERVER_ID]
                info_analyzer = self.__get_info_analyzer(message)
                if server_id is None or info_analyzer is None:
                    continue

                Mu.log_debug(self.__logger, message)
                info_type = info_analyzer.type()

                if DataAnalyzer.__is_header(message):
                    # (re)start the group; if a previous ending was lost, the
                    # data collected so far for this type/server is discarded
                    start_flag.setdefault(info_type, {})[server_id] = True
                    info.setdefault(info_type.value, {})[server_id] = {
                        Mc.MSG_TYPE: info_type.value,
                        Mc.MSG_INFO: {}
                    }
                    continue

                # everything else is only handled after the header was seen
                if not start_flag.get(info_type, {}).get(server_id, False):
                    continue

                if DataAnalyzer.__is_ending(message):
                    # group is complete: publish the aggregated message
                    aggregated = info[info_type.value][server_id]
                    self.__producer.send(self.__topic, aggregated)
                    self.__producer.flush()
                    Mu.log_debug(
                        self.__logger,
                        "Filtered message {0} was sent".format(aggregated))

                    # reset the start flag and the collected value
                    start_flag[info_type][server_id] = False
                    info[info_type.value][server_id] = {}
                else:
                    DataAnalyzer.__process(info_analyzer, message,
                                           info[info_type.value][server_id])
            except Exception as ex:
                Mu.log_warning_exc(
                    self.__logger,
                    "Error occurred when analyzing message, Error: {0}".format(
                        ex))
    def shutdown_hana(self, ssh):
        """Issue ``HDB stop`` through the given ssh connection.

        The command is started in the background (``nohup ... &``). In test
        mode nothing is executed and only a debug message is written.

        :param ssh: ssh connection handed to the command executor
        """
        if not Mu.is_test_mod():
            stop_command = 'nohup bash -lc "HDB stop" >/dev/null 2>&1 &'
            cmd_output = self.__ssh_exec_command(stop_command, ssh)
            Mu.log_debug(self.__logger,
                         "shutting down hana, output:{0}".format(cmd_output))
        else:
            Mu.log_debug(self.__logger,
                         "It's in test mode, skip shutting down hana.")
예제 #4
0
 def delivery(self, info_list, info_type, server_id):
     """Send a heartbeat and, if present, the collected info to the queue.

     The heartbeat is always sent to the heartbeat topic. When ``info_list``
     is not empty, its entries are sent to the data topic framed by a header
     and an ending message. Every entry of ``info_list`` is updated in place
     with the message type and the server id before it is sent.

     Any exception is logged as an error and not propagated.

     :param info_list: list of info dicts to deliver (may be empty)
     :param info_type: type of the delivered info
     :param server_id: id of the server the info belongs to
     """
     try:
         # heartbeat first, independent of the payload
         heartbeat = {
             Mc.FIELD_SERVER_ID: server_id,
             Mc.MSG_TYPE: info_type,
             Mc.MSG_TIME: Mu.generate_check_id()
         }
         self.__producer.send(self.__topic_heartbeat, heartbeat)
         self.__producer.flush()

         if not info_list:
             return

         # The producer doesn't support transactions, see
         # https://github.com/dpkp/kafka-python/issues/1396
         # https://github.com/dpkp/kafka-python/issues/1063
         # so the messages are framed with a begin and an end part; the
         # consumer abandons all messages if begin or end is missing.

         # header
         Mu.log_debug(
             self.__logger,
             "Sending {0} message header to queue...".format(info_type))
         header = MsgProducerService.__get_message_header(
             info_type, server_id)
         self.__producer.send(self.__topic, header)

         # body: every message carries its type and server id
         for entry in info_list:
             entry[Mc.MSG_TYPE] = info_type
             entry[Mc.FIELD_SERVER_ID] = server_id
             Mu.log_debug(
                 self.__logger,
                 "Sending {0} info {1} to queue...".format(info_type, entry))
             self.__producer.send(self.__topic, entry)
             Mu.log_debug(
                 self.__logger,
                 "{0} info {1} is sent to queue...".format(info_type, entry))

         # ending
         Mu.log_debug(
             self.__logger,
             "Sending {0} message ending to queue...".format(info_type))
         ending = MsgProducerService.__get_message_ending(
             info_type, server_id)
         self.__producer.send(self.__topic, ending)
         self.__producer.flush()
         Mu.log_debug(
             self.__logger,
             "Sending {0} message to queue is finished...".format(info_type))
     except Exception as ex:
         Mu.log_error(
             self.__logger,
             "Some thing wrong when delivering, error: {0}".format(ex))
    def clean_log_backup(self, ssh, sid):
        """Delete old log backup files of the given SID.

        Runs ``find ... -name "log_backup_*.*" -mtime +10 -type f -delete``
        below ``/usr/sap/<sid>/HDB<nn>/backup`` through the ssh connection.
        In test mode nothing is executed and only a debug message is written.

        :param ssh: ssh connection handed to the command executor
        :param sid: system id used to build the backup path
        """
        if Mu.is_test_mod():
            Mu.log_debug(
                self.__logger,
                "It's in test mode, skip cleaning log backup for {0}.".format(
                    sid))
            return

        # function-scope import keeps the file's import block untouched
        import shlex

        # The sid ends up inside a command line that deletes files, so it is
        # quoted to keep shell metacharacters from being interpreted. A plain
        # alphanumeric sid is left unchanged by shlex.quote.
        # NOTE(review): assumes __ssh_exec_command runs the string through a
        # POSIX shell on the remote host - confirm.
        quoted_sid = shlex.quote(str(sid))
        self.__ssh_exec_command(
            'find /usr/sap/{0}/HDB[0-9][0-9]/backup -name "log_backup_*.*" -mtime +10 -type f -delete'
            .format(quoted_sid), ssh)
        Mu.log_debug(self.__logger, "cleaned log backup for {0}.".format(sid))
예제 #6
0
 def __query_insert_batch(self, query, param_list):
     """Run ``query`` with ``cursor.executemany`` for all parameter sets.

     Errors are logged (message and traceback) and not propagated; the
     method returns nothing. The cursor is closed in every case in which it
     could be opened.

     :param query: statement to execute
     :param param_list: parameter sets handed to ``executemany``
     """
     cursor = None
     try:
         cursor = self.connection.cursor()
         Mu.log_debug(self.__logger, "query:{0}, param:{1}".format(query, param_list))
         cursor.executemany(query, param_list)
     except Exception as ex:
         Mu.log_error(self.__logger, "Query:{0} failed with error:{1}".format(query, ex))
         Mu.log_exception(self.__logger, traceback.format_exc())
     finally:
         if cursor is not None:
             cursor.close()
             # only report the close when a cursor was really opened
             Mu.log_debug(self.__logger, "Cursor closed.")
    def __send_shutdown_message(self, server_name, sid, user_name):
        """Put a SHUTDOWN action message for the given instance on the topic.

        :param server_name: name of the server hosting the instance
        :param sid: system id of the instance
        :param user_name: user the action is generated for
        """
        Mu.log_debug(
            self.__logger, "Sending shutdown message of {0} on {1} ...".format(
                sid, server_name))

        # build the action message, then hand it to the producer
        shutdown_message = AlarmOperator.__generate_action_message(
            server_name, sid, user_name, ActionType.SHUTDOWN.value)
        self.__producer.send(self.__topic, shutdown_message)

        Mu.log_debug(
            self.__logger,
            "Shutdown message of {0} on {1} is sent".format(sid, server_name))
예제 #8
0
    def monitoring(self, check_id):
        """Collect the HANA instance info of this server and dispatch it.

        The check id is inserted as first element of the collected list. An
        empty result is not dispatched. Any exception is logged as a warning
        and not propagated.

        :param check_id: id of the current check run
        """
        try:
            server_id = self.__server_id
            server_name = self._server_name
            Mu.log_debug(self.__logger,
                         "[{0}]Instance Monitoring begin...".format(check_id))
            Mu.log_debug(
                self.__logger,
                "Trying to get instance info of {0}".format(server_name))

            # instance info of one server, looked up by its server id
            instance_info = self._os_operator.get_all_hana_instance_info(
                server_id)
            Mu.log_debug(
                self.__logger, "Instance information of {0} is {1}".format(
                    server_name, instance_info))

            if not instance_info:
                # nothing collected: do not send anything
                Mu.log_debug(
                    self.__logger,
                    "Instance information for {0} is empty, skipped sending this info."
                    .format(server_name))
            else:
                check_header = {Mc.FIELD_CHECK_ID: check_id}
                instance_info.insert(0, check_header)
                dispatcher = MonitorResourceDispatcher(instance_info,
                                                       server_id)
                self.accept(dispatcher)

            Mu.log_info(
                self.__logger,
                "Instance Monitoring is done for {0}.".format(server_name))
        except Exception as ex:
            Mu.log_warning_exc(
                self.__logger,
                "Error Occurred when monitoring Instance, ERROR: {0}".format(
                    ex))
예제 #9
0
    def __restart_agent(self, server, server_id, mount_point, agent_path,
                        mem_interval, cpu_interval, disk_interval,
                        instance_interval):
        """Restart the agent on one server over ssh.

        Opens an ssh connection with the default ssh user/password and
        delegates the restart to the OS operator.

        :param server: server to connect to
        :param server_id: id of the server
        :param mount_point: mount point passed to the agent
        :param agent_path: path of the agent
        :param mem_interval: memory check interval
        :param cpu_interval: cpu check interval
        :param disk_interval: disk check interval
        :param instance_interval: instance check interval
        """
        ssh_user = Mc.get_ssh_default_user()
        ssh_password = Mc.get_ssh_default_password()
        connection = Mu.open_ssh_connection(self.__logger, self.__os_operator,
                                            server, ssh_user, ssh_password)
        with connection as ssh:
            Mu.log_debug(self.__logger, "Restarting {0}".format(server))
            self.__os_operator.restart_agent(
                ssh, server_id, mount_point, agent_path, mem_interval,
                cpu_interval, disk_interval, instance_interval)
            Mu.log_debug(self.__logger,
                         "Restarting of {0} is done".format(server))
    def __send_cleaning_message(self, server_name, sid, user_name):
        """Put a CLEAN_LOG_BACKUP action message for the instance on the topic.

        :param server_name: name of the server hosting the instance
        :param sid: system id of the instance
        :param user_name: user the action is generated for
        """
        Mu.log_debug(
            self.__logger,
            "Sending log backup cleaning message of {0} on {1} for {2} ...".
            format(sid, server_name, user_name))

        # build the log backup cleaning message, then hand it to the producer
        cleaning_message = AlarmOperator.__generate_action_message(
            server_name, sid, user_name, ActionType.CLEAN_LOG_BACKUP.value)
        self.__producer.send(self.__topic, cleaning_message)

        Mu.log_debug(
            self.__logger,
            "Log backup cleaning message of {0} on {1} for {2} is sent".format(
                sid, server_name, user_name))
    def __operate_disk(self, info):
        """
        Check the free disk space of one server against the disk threshold.

        :param info: disk message; the keys Mc.FIELD_SERVER_ID,
                     Mc.FIELD_CHECK_ID, Mc.FIELD_DISK_FREE,
                     Mc.FIELD_DISK_TOTAL and Mc.MSG_INFO are read.
                     Mc.MSG_INFO maps folder -> {user: usage}
        :return: None if the disk values are missing/invalid, if
                 self.disk_threshold is <= 0 or if free disk is not below the
                 threshold; otherwise the result of __get_users_info with
                 Mc.INFO_USAGE replaced by the usage list of the top 5 folders
        """
        server_id = info[Mc.FIELD_SERVER_ID]
        check_id = info[Mc.FIELD_CHECK_ID]

        disk_free = info[Mc.FIELD_DISK_FREE]
        disk_total = info[Mc.FIELD_DISK_TOTAL]

        # Validate before doing any arithmetic: computing the threshold with a
        # missing (None) total would raise instead of skipping the message.
        if disk_free is None or disk_free < 0 or disk_total is None or disk_total <= 0 or self.disk_threshold <= 0:
            return None

        free_disk_threshold = (
            (100 - self.disk_threshold) * disk_total) / 100.0

        Mu.log_debug(
            self.__logger,
            "Server:{0}, check_id:{1}, free disk:{2}, threshold:{3}".format(
                server_id, check_id, disk_free, free_disk_threshold))

        # nothing to report while enough disk space is free
        if not disk_free < free_disk_threshold:
            return None

        # {"folder": {"user1":3245}, "folder2":{"user2":222}, "folder3":{"user3":99999}}
        disk_consumers = list(info[Mc.MSG_INFO].items())
        disk_consumers.sort(key=lambda v: next(iter(v[1].values())),
                            reverse=True)  # sort by usage, descending
        del disk_consumers[5:]  # only keep the top 5

        users = [next(iter(folder[1].keys())) for folder in disk_consumers]
        top_5_consumers = self.__get_users_info(server_id, check_id, users,
                                                InfoType.DISK, disk_free,
                                                disk_total)

        # combine the folder usage with the user details
        folders_info = []
        for folder in disk_consumers:
            folder_info = {
                Mc.FIELD_FOLDER: folder[0],
                Mc.FIELD_USER_NAME: next(iter(folder[1].keys())),
                Mc.FIELD_USAGE: next(iter(folder[1].values()))
            }
            for user_info in top_5_consumers.get(Mc.INFO_USAGE, []):
                if folder_info[Mc.FIELD_USER_NAME] == user_info[
                        Mc.FIELD_USER_NAME]:
                    folder_info.update(user_info)
            folders_info.append(folder_info)

        top_5_consumers[Mc.INFO_USAGE] = folders_info
        return top_5_consumers
예제 #12
0
 def __query_select(self, query):
     """Execute ``query`` and return all fetched rows.

     Errors are logged (message and traceback) and not propagated. The
     cursor is closed in every case in which it could be opened.

     :param query: select statement to execute
     :return: result of ``cursor.fetchall()``, or an empty list on failure
     """
     cursor = None
     try:
         cursor = self.connection.cursor()
         # log through Mu like the rest of this method
         Mu.log_debug(self.__logger, "query:{0}".format(query))
         cursor.execute(query)
         ret = cursor.fetchall()
         Mu.log_debug(self.__logger, "Result record count {0}".format(len(ret)))
         return ret
     except Exception as ex:
         Mu.log_error(self.__logger, "Query:{0} failed with error:{1}".format(query, ex))
         Mu.log_exception(self.__logger, traceback.format_exc())
         return []
     finally:
         if cursor is not None:
             cursor.close()
             # only report the close when a cursor was really opened
             Mu.log_debug(self.__logger, "Cursor closed.")
 def operate(self, parameter):
     """Shut down HANA on the server named in ``parameter``.

     Logs in with the user from ``parameter`` and the default ssh password.
     If no connection is obtained, a warning is logged and nothing else is
     done.

     :param parameter: mapping providing Mc.FIELD_SERVER_FULL_NAME and
                       Mc.FIELD_USER_NAME
     """
     server = parameter[Mc.FIELD_SERVER_FULL_NAME]
     user = parameter[Mc.FIELD_USER_NAME]
     password = Mc.get_ssh_default_password()
     with Mu.open_ssh_connection(self._logger, self._os_operator, server,
                                 user, password) as ssh:
         if ssh is not None:
             Mu.log_debug(
                 self._logger,
                 "Trying shutdown HANA on {0} for user {1}".format(
                     server, user))
             self._os_operator.shutdown_hana(ssh)
         else:
             # TODO: notify alarm operator because of the non-standard password ??
             Mu.log_warning(
                 self._logger,
                 "Failed to log in {0} with user {1}".format(server, user))
    def __operate_memory(self, info):
        """
        Check the free memory of one server against the memory threshold.

        :param info: memory message; the keys Mc.FIELD_MEM_FREE,
                     Mc.FIELD_MEM_TOTAL, Mc.FIELD_SERVER_ID,
                     Mc.FIELD_CHECK_ID and Mc.MSG_INFO are read.
                     Mc.MSG_INFO maps user -> usage,
                     eg: {"user1": 12.2, "user2": 13.2}
        :return: None if the memory values are missing/invalid, if
                 self.mem_threshold is <= 0 or if free memory is not below the
                 threshold; otherwise the result of __get_users_info for the
                 top 5 users, with the usage set and sorted descending
        """
        mem_free = info[Mc.FIELD_MEM_FREE]
        mem_total = info[Mc.FIELD_MEM_TOTAL]
        server_id = info[Mc.FIELD_SERVER_ID]
        check_id = info[Mc.FIELD_CHECK_ID]

        if (mem_free is None or mem_free < 0
                or mem_total is None or mem_total <= 0
                or self.mem_threshold <= 0):
            return None

        free_mem_threshold = ((100 - self.mem_threshold) * mem_total) / 100.0
        Mu.log_debug(
            self.__logger,
            "Server:{0}, check_id:{1}, free Memory:{2}, threshold:{3}".format(
                server_id, check_id, mem_free, free_mem_threshold))

        # nothing to report while enough memory is free
        if not mem_free < free_mem_threshold:
            return None

        usage_by_user = info[Mc.MSG_INFO]
        # top 5 (user, usage) pairs, highest usage first
        top_pairs = sorted(usage_by_user.items(),
                           key=lambda pair: pair[1],
                           reverse=True)[:5]
        users = [pair[0] for pair in top_pairs]

        top_5_consumers = self.__get_users_info(server_id, check_id, users,
                                                InfoType.MEMORY, mem_free,
                                                mem_total)

        # attach the usage to every returned user entry
        for user_info in top_5_consumers.get(Mc.INFO_USAGE, []):
            user_name = user_info[Mc.FIELD_USER_NAME]
            user_info[Mc.FIELD_USAGE] = info[Mc.MSG_INFO][user_name]

        top_5_consumers.get(Mc.INFO_USAGE, []).sort(
            key=lambda entry: entry[Mc.FIELD_USAGE], reverse=True)
        return top_5_consumers
예제 #15
0
    def __coordinating_monitors(self, consumer):
        """
        Coordinating (start/stop/restart) all the agents.

        For every configuration message: if __update_configs reports a
        change, the agent of every configured server is restarted. A failed
        restart of one server is logged and does not prevent the restart of
        the remaining servers. Once __check_configuration succeeds, the
        heartbeat thread is started (only once, guarded by the heartbeat
        flag).

        :param consumer: kafka consumer
        """
        Mu.log_debug(self.__logger,
                     "Coordinator is listening on topic for configurations.")
        for msg in consumer:
            try:
                Mu.log_debug(self.__logger, "New configs are coming...")
                if self.__update_configs(msg.value):
                    # start/restart all agents, current design is restart all agents if any config is changed
                    servers = self.__configs.get(Mc.DB_CONFIGURATION_SERVER,
                                                 [])
                    # these values are the same for every server
                    agent_path = Mc.get_agent_path()
                    mem_interval = self.__configs.get(
                        "CHECK_INTERVAL_MEM_INT", 60)
                    cpu_interval = self.__configs.get(
                        "CHECK_INTERVAL_CPU_INT", 300)
                    disk_interval = self.__configs.get(
                        "CHECK_INTERVAL_DISK_INT", 3600)
                    instance_interval = self.__configs.get(
                        "CHECK_INTERVAL_INSTANCE_INT", 300)

                    for server in servers:
                        try:
                            self.__restart_agent(
                                server[Mc.FIELD_SERVER_FULL_NAME],
                                server[Mc.FIELD_SERVER_ID],
                                server[Mc.FIELD_MOUNT_POINT], agent_path,
                                mem_interval, cpu_interval, disk_interval,
                                instance_interval)
                        except Exception as ex:
                            # keep going: one broken server must not block
                            # the restart of the other agents
                            Mu.log_warning_exc(
                                self.__logger,
                                "Error occurred when restarting agent for {0}, Err: {1}"
                                .format(server, ex))

                if self.__check_configuration() and not self.__heartbeat_flag:
                    self.__heartbeat_flag = True
                    # start heart beat thread
                    heartbeat_thread = threading.Thread(
                        target=self.__process_heartbeat)
                    heartbeat_thread.start()
            except Exception as ex:
                Mu.log_warning_exc(
                    self.__logger,
                    "Error occurred when coordinating the monitors, Err: {0}".
                    format(ex))
    def __operate_cpu(self, info):
        """
        Check the cpu utilization of one server against the cpu threshold.

        :param info: cpu message; the keys Mc.FIELD_CPU_UTILIZATION,
                     Mc.FIELD_CPU_NUMBER, Mc.FIELD_SERVER_ID,
                     Mc.FIELD_CHECK_ID and Mc.MSG_INFO are read.
                     Mc.MSG_INFO maps user -> usage,
                     eg: {"user1": 12.2, "user2": 13.2}
        :return: None if the cpu usage is missing/negative, if
                 self.cpu_threshold is <= 0 or if the usage is below the
                 threshold; otherwise the result of __get_users_info for the
                 top 5 users, with the usage (divided by the number of cpus)
                 set and sorted descending
        """
        cpu_usage = info[Mc.FIELD_CPU_UTILIZATION]
        cpu_num = info[Mc.FIELD_CPU_NUMBER]
        server_id = info[Mc.FIELD_SERVER_ID]
        check_id = info[Mc.FIELD_CHECK_ID]

        if (cpu_usage is None or cpu_usage < 0
                or self.cpu_threshold <= 0):
            return None

        Mu.log_debug(
            self.__logger,
            "Server:{0}, check_id:{1}, cpu usage:{2}, threshold:{3}".format(
                server_id, check_id, cpu_usage, self.cpu_threshold))

        # nothing to report while the usage stays below the threshold
        if not cpu_usage >= self.cpu_threshold:
            return None

        usage_by_user = info[Mc.MSG_INFO]
        # top 5 (user, usage) pairs, highest usage first
        top_pairs = sorted(usage_by_user.items(),
                           key=lambda pair: pair[1],
                           reverse=True)[:5]
        users = [pair[0] for pair in top_pairs]

        top_5_consumers = self.__get_users_info(server_id, check_id, users,
                                                InfoType.CPU,
                                                100 - cpu_usage, -1)

        # attach the usage, divided by the number of cpus, to every user entry
        for user_info in top_5_consumers.get(Mc.INFO_USAGE, []):
            user_name = user_info[Mc.FIELD_USER_NAME]
            user_info[Mc.FIELD_USAGE] = info[Mc.MSG_INFO][user_name] / float(
                cpu_num)

        top_5_consumers.get(Mc.INFO_USAGE, []).sort(
            key=lambda entry: entry[Mc.FIELD_USAGE], reverse=True)
        return top_5_consumers
예제 #17
0
    def monitoring(self, check_id):
        """Collect the memory info of this server and dispatch it.

        The overview (check id, total and free memory) is inserted as first
        element of the consumer list before it is dispatched. Any exception
        is logged as a warning and not propagated.

        :param check_id: id of the current check run
        """
        try:
            server_id = self.__server_id
            server_name = self._server_name

            Mu.log_debug(self.__logger,
                         "[{0}]Memory Monitoring begin...".format(check_id))
            Mu.log_debug(
                self.__logger,
                "Trying to get memory overview of {0}".format(server_name))

            # overview: total memory and free memory
            mem_total, mem_free = self._os_operator.collect_mem_info(
                server_name)
            Mu.log_debug(
                self.__logger,
                "Memory overview of {0} is (total:{1}, free:{2})".format(
                    server_name, mem_total, mem_free))

            # memory consumers of the server
            mem_info = self._os_operator.get_mem_consumers(server_name)
            Mu.log_debug(
                self.__logger,
                "memory consuming information for {0}:{1}".format(
                    server_name, mem_info))

            # the overview goes in front of the consumer entries
            overview = {
                Mc.FIELD_CHECK_ID: check_id,
                Mc.FIELD_MEM_TOTAL: mem_total,
                Mc.FIELD_MEM_FREE: mem_free
            }
            mem_info.insert(0, overview)

            # send the info
            dispatcher = MonitorResourceDispatcher(mem_info, server_id)
            self.accept(dispatcher)

            Mu.log_info(
                self.__logger,
                "Memory Monitoring is done for {0}.".format(server_name))
        except Exception as ex:
            Mu.log_warning_exc(
                self.__logger,
                "Error Occurred when performing Memory monitoring, ERROR: {0}".
                format(ex))
예제 #18
0
    def monitoring(self, check_id):
        """Collect the disk info of this server's mount point and dispatch it.

        The overview (check id, total and free disk) is inserted as first
        element of the consumer list before it is dispatched. Any exception
        is logged as a warning and not propagated.

        :param check_id: id of the current check run
        """
        try:
            server_id = self.__server_id
            server_name = self._server_name
            mount_point = self.__mount_point

            Mu.log_debug(self.__logger,
                         "[{0}]Disk Monitoring begin...".format(check_id))
            Mu.log_debug(
                self.__logger,
                "Trying to get disk overview of {0}".format(server_name))

            # overview: total disk and free disk
            disk_total, disk_free = self._os_operator.collect_disk_info(
                server_name, mount_point)
            Mu.log_debug(
                self.__logger,
                "Disk overview of {0} is (total:{1}, free:{2})".format(
                    server_name, disk_total, disk_free))

            # disk consumers below the mount point
            disk_info = self._os_operator.get_disk_consumers(
                server_name, mount_point)
            Mu.log_debug(
                self.__logger, "Disk consuming information for {0}:{1}".format(
                    server_name, disk_info))

            # the overview goes in front of the consumer entries
            overview = {
                Mc.FIELD_CHECK_ID: check_id,
                Mc.FIELD_DISK_TOTAL: disk_total,
                Mc.FIELD_DISK_FREE: disk_free
            }
            disk_info.insert(0, overview)

            dispatcher = MonitorResourceDispatcher(disk_info, server_id)
            self.accept(dispatcher)
            Mu.log_info(self.__logger,
                        "Disk Monitoring is done for {0}.".format(server_name))
        except Exception as ex:
            Mu.log_warning_exc(
                self.__logger,
                "Error Occurred when performing Disk monitoring, ERROR: {0}".
                format(ex))
예제 #19
0
    def monitoring(self, check_id):
        """Collect the cpu info of this server and dispatch it.

        The overview (check id, number of cpus and cpu utilization) is
        inserted as first element of the consumer list before it is
        dispatched. Any exception is logged as a warning and not propagated.

        :param check_id: id of the current check run
        """
        try:
            server_id = self.__server_id
            server_name = self._server_name

            Mu.log_debug(self.__logger,
                         "[{0}]CPU Monitoring begin...".format(check_id))
            Mu.log_debug(
                self.__logger,
                "Trying to get CPU overview of {0}".format(server_name))

            # overview: number of cpus and cpu usage
            cpu_num, cpu_usage = self._os_operator.collect_cpu_info(
                server_name)
            Mu.log_debug(
                self.__logger,
                "CPU overview of {0} is (num:{1}, usage:{2})".format(
                    server_name, cpu_num, cpu_usage))

            # cpu consumers of the server
            cpu_info = self._os_operator.get_cpu_consumers(server_name)
            Mu.log_debug(
                self.__logger, "CPU consuming information for {0}:{1}".format(
                    server_name, cpu_info))

            # the overview goes in front of the consumer entries
            overview = {
                Mc.FIELD_CHECK_ID: check_id,
                Mc.FIELD_CPU_NUMBER: cpu_num,
                Mc.FIELD_CPU_UTILIZATION: cpu_usage
            }
            cpu_info.insert(0, overview)

            dispatcher = MonitorResourceDispatcher(cpu_info, server_id)
            self.accept(dispatcher)

            Mu.log_info(self.__logger,
                        "CPU Monitoring is done for {0}.".format(server_name))
        except Exception as ex:
            Mu.log_warning_exc(
                self.__logger,
                "Error Occurred when performing CPU monitoring, ERROR: {0}".
                format(ex))
    def __monitoring_configurations(self, operator):
        """Publish the configuration entries that changed since the last call.

        Every entry returned by __get_configurations whose value differs from
        the cached one is stored in the cache and collected; the collected
        entries are sent as one message. Nothing is sent when there is no
        change.

        :param operator: passed through to __get_configurations
        """
        current_configs = self.__get_configurations(operator)
        updated_configs = {
            key: value
            for key, value in current_configs.items()
            if value != self.__configs.get(key, None)
        }
        # remember the new values for the next comparison
        for key, value in updated_configs.items():
            self.__configs[key] = value

        if not updated_configs:
            Mu.log_debug(self.__logger, "No update for configurations ...")
            return

        Mu.log_debug(
            self.__logger,
            "Sending updated configs {0} to queue...".format(updated_configs))
        self.__producer.send(self.__topic, updated_configs)
        # block until all async messages are sent
        self.__producer.flush()
        Mu.log_debug(self.__logger, "Sent updated configs to queue.")
 def publish(self):
     """Upload the agent files to every configured server except this host.

     A server is treated as the local one, and skipped, when the local host
     name is contained in the server name. Connections use the default ssh
     user and password.
     """
     ssh_user = Mc.get_ssh_default_user()
     ssh_password = Mc.get_ssh_default_password()
     local_host = os.uname()[1]
     for server in self.servers:
         if local_host in server:
             Mu.log_debug(None,
                          "Skipping local server on {0}".format(server))
         else:
             with Mu.open_ssh_connection(None, self.__os_operator, server,
                                         ssh_user, ssh_password) as ssh:
                 Mu.log_debug(None,
                              "Publishing agent on {0}".format(server))
                 for file_name in self.files:
                     # Currently, path for source and target is the same
                     source = self.path + file_name
                     target = self.path + file_name
                     self.__os_operator.upload_file(ssh, source, target)
                 Mu.log_debug(
                     None, "Publishing agent on {0} is done".format(server))
    def __operate(self, consumer):
        operators = {
            InfoType.MEMORY.value: self.__operate_memory,
            InfoType.CPU.value: self.__operate_cpu,
            InfoType.DISK.value: self.__operate_disk
        }
        alarm = {}
        emergency_alarm = {}
        Mu.log_info(self.__logger, "Start processing alarm.")

        for msg in consumer:
            if not msg or not msg.value:
                continue
            try:
                # process filtered message
                if Mc.MSG_TYPE not in msg.value or msg.value[
                        Mc.MSG_TYPE] not in operators:
                    # update configuration
                    self.__update_configuration(msg.value)
                    # if configuration is ready, update subscription (all previous filtered info will be skipped)
                    if self.__check_configuration() and len(
                            consumer.assignment()) < 2:
                        # start heartbeat checking
                        # use assign instead subscribe because the error:
                        # https://github.com/dpkp/kafka-python/issues/601
                        Ku.assign_and_seek_to_end(
                            consumer, Mc.TOPIC_FILTERED_INFO,
                            *[Mc.TOPIC_FILTERED_INFO, Mc.TOPIC_CONFIGURATION])

                        heartbeat_thread = threading.Thread(
                            target=self.__process_heartbeat)
                        heartbeat_thread.start()
                else:
                    # if configuration is not initialized, all data will be ignored
                    top5_consumers = operators[msg.value[Mc.MSG_TYPE]](
                        msg.value)
                    server_id = msg.value[Mc.FIELD_SERVER_ID]
                    msg_type = msg.value[Mc.MSG_TYPE]
                    if top5_consumers:
                        server_name = top5_consumers[Mc.FIELD_SERVER_FULL_NAME]

                        # calculate emergency status
                        if msg_type == InfoType.MEMORY.value:
                            mem_free = top5_consumers[Mc.INFO_FREE]
                            mem_total = top5_consumers[Mc.INFO_TOTAL]
                            # calculate emergency status
                            if float(
                                    mem_free
                            ) / mem_total * 100 <= 100 - self.mem_emergency_threshold:
                                cur_time = datetime.now()
                                pre_time = emergency_alarm.get(
                                    server_id, cur_time)
                                if cur_time != pre_time and (
                                        cur_time - pre_time
                                ).total_seconds() < self.check_interval:
                                    # only perform emergency shutdown every configured interval
                                    continue
                                emergency_alarm[server_id] = cur_time

                                try:
                                    email, employee_name, user_name, sid, mem_usage = \
                                        AlarmOperator.__get_highest_consumption_info(top5_consumers[Mc.INFO_USAGE])
                                except Exception as ex:
                                    Mu.log_warning(
                                        self.__logger,
                                        "Call __get_highest_consumption_info for {0} failed with exception {1}."
                                        .format(top5_consumers[Mc.INFO_USAGE],
                                                ex))
                                    continue

                                # trigger the emergency shutdown
                                admin = self.__db_operator.get_email_admin(
                                    server_id)

                                Mu.log_info(
                                    self.__logger,
                                    "Try to sending emergency shutdown email for {0} on {1}, because server "
                                    "is running out of memory and {2} is consuming highest "
                                    "({3}%) memory.".format(
                                        sid, server_name, user_name,
                                        mem_usage))
                                # sending email to the owner of the instance
                                Email.send_emergency_shutdown_email(
                                    self.email_sender, email, sid, server_name,
                                    employee_name, admin, mem_usage,
                                    InfoType.MEMORY)

                                self.__send_shutdown_message(
                                    server_name, sid, user_name)
                                # no need to check further
                                continue
                            else:
                                # reset the emergency alarm for the server if it is not in emergency status
                                emergency_alarm.pop(server_id, None)

                        # If it's not working time, skip sending email and shutdown )
                        if not Mu.is_current_time_working_time(
                                self.operation_time):
                            Mu.log_info(
                                self.__logger,
                                "Skip alarm operations because of the non-working time."
                            )
                            continue

                        email_flag = 0
                        # update alarm info
                        if server_id not in alarm:
                            alarm[server_id] = {
                                msg_type: {
                                    Mc.INFO_ALARM_TIME: datetime.now(),
                                    Mc.INFO_ALARM_NUM: 0
                                }
                            }
                        elif msg_type not in alarm[server_id]:
                            alarm[server_id][msg_type] = {
                                Mc.INFO_ALARM_TIME: datetime.now(),
                                Mc.INFO_ALARM_NUM: 0
                            }

                        if alarm[server_id][msg_type][Mc.INFO_ALARM_NUM] == 0:
                            alarm[server_id][msg_type][Mc.INFO_ALARM_NUM] = 1
                            alarm[server_id][msg_type][
                                Mc.INFO_ALARM_TIME] = datetime.now()
                            # send email
                            email_flag = 1
                        else:
                            pre_time = alarm[server_id][msg_type][
                                Mc.INFO_ALARM_TIME]
                            cur_time = datetime.now()
                            # every checking interval sending next alarm mail
                            if (cur_time - pre_time
                                ).total_seconds() >= self.check_interval:
                                alarm[server_id][msg_type][
                                    Mc.INFO_ALARM_NUM] += 1
                                alarm[server_id][msg_type][
                                    Mc.INFO_ALARM_TIME] = cur_time
                                if alarm[server_id][msg_type][
                                        Mc.
                                        INFO_ALARM_NUM] > self.max_failure_times:
                                    email_flag = 2
                                else:
                                    email_flag = 1
                        if email_flag >= 1:
                            # sending email
                            Mu.log_debug(
                                self.__logger,
                                "Top 5 Consumers of server {1} ({2}): {0}".
                                format(top5_consumers, server_id, msg_type))
                            email_to = [
                                c[Mc.FIELD_EMAIL]
                                for c in top5_consumers[Mc.INFO_USAGE]
                                if c.get(Mc.FIELD_EMAIL, None)
                            ]
                            admin = self.__db_operator.get_email_admin(
                                server_id)
                            Mu.log_debug(
                                self.__logger,
                                "Server {0}:{1} Sending email to:{2}".format(
                                    server_id, msg_type, email_to))

                            Email.send_warning_email(
                                self.email_sender, email_to,
                                top5_consumers[Mc.MSG_TYPE], server_name,
                                top5_consumers, admin)

                        if email_flag == 2:

                            try:
                                email, employee_name, user_name, sid, usage = AlarmOperator\
                                    .__get_highest_consumption_info(top5_consumers[Mc.INFO_USAGE], msg_type)
                                admin = self.__db_operator.get_email_admin(
                                    server_id)
                            except Exception as ex:
                                Mu.log_warning(
                                    self.__logger,
                                    "Call __get_highest_consumption_info for {0} failed with exception {1}."
                                    .format(top5_consumers[Mc.INFO_USAGE], ex))
                                continue
                            if msg_type == InfoType.MEMORY.value:
                                # sending email to the owner of the instance
                                Mu.log_info(
                                    self.__logger,
                                    "Try to sending shutdown email for {0} on {1}, because server "
                                    "is running out of memory and {2} is consuming highest "
                                    "({3}%) memory.".format(
                                        sid, server_name, user_name, usage))
                                Email.send_shutdown_email(
                                    self.email_sender, email, sid, server_name,
                                    employee_name, admin, usage,
                                    InfoType.MEMORY)
                                # trigger the shutdown --> send shutdown message
                                self.__send_shutdown_message(
                                    server_name, sid, user_name)
                            elif msg_type == InfoType.DISK.value:
                                # sending email to the owner of the instance
                                Mu.log_info(
                                    self.__logger,
                                    "Try to sending email for {0} on {1}, because server "
                                    "is running out of Disk and {2} is consuming highest "
                                    "({3}K) disk space.".format(
                                        sid, server_name, user_name, usage))
                                Email.send_cleaning_disk_email(
                                    self.email_sender, email, sid, server_name,
                                    employee_name, admin, usage, InfoType.DISK)

                                # trigger the shutdown --> send shutdown message
                                self.__send_cleaning_message(
                                    server_name, sid, user_name)

                    else:
                        # everything is good, reset the alarm for server_id and msg type
                        if server_id in alarm and msg_type in alarm[server_id]:
                            alarm[server_id][msg_type][Mc.INFO_ALARM_NUM] = 0
            except Exception as ex:
                Mu.log_warning_exc(
                    self.__logger,
                    "Processing alarm failed with {0}.".format(ex))
예제 #23
0
 def close_connection(self):
     """Close the connection held by this object, if there is one.

     Does nothing when the ``connection`` attribute is missing or is
     ``None``. Otherwise the connection's ``close()`` is called, the
     attribute is reset to ``None`` and a debug message is logged.
     """
     conn = getattr(self, "connection", None)
     if conn is None:
         # No connection attribute, or it was already cleared: nothing to do.
         return

     conn.close()
     # Drop the reference so a later call sees there is nothing left to close.
     self.connection = None
     Mu.log_debug(self.__logger, "Connecting is closed.")