def __init__(self):
     """Create the HANAMonitorDAO instance and the alarm-operator logger.

     The DAO is constructed from the server, port, user and password
     returned by the ``Mc.get_hana_*`` accessors.
     """
     # NOTE(review): the earlier comment here mentioned a singleton, but no
     # single-instance guard is visible in this constructor -- confirm.
     hana_server = Mc.get_hana_server()
     hana_port = Mc.get_hana_port()
     hana_user = Mc.get_hana_user()
     hana_password = Mc.get_hana_password()
     self.__monitor_dao = HANAMonitorDAO(hana_server, hana_port, hana_user,
                                         hana_password)
     self.__logger = Mu.get_logger(Mc.LOGGER_MONITOR_OPERATOR_ALARM)
Exemplo n.º 2
0
 def __init__(self):
     """Set up the coordinator logger, the OS operator and heartbeat state."""
     super().__init__()
     self.__logger = Mu.get_logger(Mc.LOGGER_MONITOR_COORDINATOR)
     self.__os_operator = LinuxOperator()

     # Heartbeat timing values, read once from the Mc accessors.
     self.__heartbeat_interval = Mc.get_heartbeat_check_interval()
     self.__heartbeat_timeout = Mc.get_heartbeat_timeout()
     restart_interval = Mc.get_heartbeat_operation_interval()
     self.__heartbeat_restart_agent_interval = restart_interval

     # Mutable state: starts empty / switched off.
     self.__configs = {}
     self.__heartbeat_flag = False
     self.__heartbeat_agent_restart_info = {}
 def __init__(self):
     """Load the server list and define the agent path and file list.

     ``self.servers`` is whatever ``__get_servers`` returns for a freshly
     created HANAMonitorDAO.  ``self.path`` is the agent path with its last
     ``len('agent.py')`` characters cut off, and ``self.files`` holds file
     names relative to that path.
     """
     self.__os_operator = LinuxOperator()
     monitor_dao = HANAMonitorDAO(Mc.get_hana_server(), Mc.get_hana_port(),
                                  Mc.get_hana_user(), Mc.get_hana_password())
     self.servers = self.__get_servers(monitor_dao)

     # TODO: consider moving the path and file list below into configuration.
     agent_path = Mc.get_agent_path()
     suffix_length = len('agent.py')
     self.path = agent_path[:-suffix_length]
     self.files = [
         'agent.py',
         'util.py',
         'errors.py',
         'config/configuration.ini',
         'config/logging.ini',
     ]
 def __init__(self):
     """Register this object as the single instance and create its DAO/logger.

     Raises:
         MonitorDBOpError: if ``HANAOperatorService.__instance`` is already
             set when the constructor runs.
     """
     if HANAOperatorService.__instance is not None:
         # NOTE(review): the message names HANAServerDBOperatorService while
         # the guard checks HANAOperatorService -- confirm which is intended.
         raise MonitorDBOpError(
             "This class is a singleton, use HANAServerDBOperatorService.instance() instead"
         )

     # First construction: remember the instance, then build its resources.
     HANAOperatorService.__instance = self
     self.__monitor_dao = HANAMonitorDAO(Mc.get_hana_server(),
                                         Mc.get_hana_port(),
                                         Mc.get_hana_user(),
                                         Mc.get_hana_password())
     self.__logger = Mu.get_logger(Mc.LOGGER_MONITOR_OPERATOR_DB)
    def run(self):
        """Monitor configurations in an endless loop.

        A HANAMonitorDAO is created once up front.  Each iteration passes it
        to ``__monitoring_configurations``, logs (instead of propagating) any
        ``Exception`` raised by that call, and then sleeps for the value
        stored under ``CHECK_INTERVAL_CONFIG_INT`` in ``self.__configs``
        (300 when the key is absent).
        """
        monitor_dao = HANAMonitorDAO(Mc.get_hana_server(), Mc.get_hana_port(),
                                     Mc.get_hana_user(), Mc.get_hana_password())
        while True:
            try:
                self.__monitoring_configurations(monitor_dao)
            except Exception as ex:
                message = (
                    "Error occurred when monitoring configuration, "
                    "Error: {0}"
                ).format(ex)
                Mu.log_warning_exc(self.__logger, message)

            # The pause is re-read every round so a changed config is honoured.
            pause = self.__configs.get("CHECK_INTERVAL_CONFIG_INT", 300)
            time.sleep(pause)
Exemplo n.º 6
0
    def __restart_agent(self, server, server_id, mount_point, agent_path,
                        mem_interval, cpu_interval, disk_interval,
                        instance_interval):
        """Open an SSH connection to ``server`` and restart its agent.

        The connection uses the default SSH user and password from ``Mc``.
        All remaining arguments are forwarded unchanged, together with the
        connection, to ``self.__os_operator.restart_agent``.

        :param server: server passed to ``Mu.open_ssh_connection``
        :param server_id: forwarded to ``restart_agent``
        :param mount_point: forwarded to ``restart_agent``
        :param agent_path: forwarded to ``restart_agent``
        :param mem_interval: forwarded to ``restart_agent``
        :param cpu_interval: forwarded to ``restart_agent``
        :param disk_interval: forwarded to ``restart_agent``
        :param instance_interval: forwarded to ``restart_agent``
        """
        ssh_user = Mc.get_ssh_default_user()
        ssh_password = Mc.get_ssh_default_password()
        connection = Mu.open_ssh_connection(self.__logger, self.__os_operator,
                                            server, ssh_user, ssh_password)
        with connection as ssh:
            Mu.log_debug(self.__logger, "Restarting {0}".format(server))
            self.__os_operator.restart_agent(
                ssh, server_id, mount_point, agent_path,
                mem_interval, cpu_interval, disk_interval, instance_interval)
            Mu.log_debug(self.__logger,
                         "Restarting of {0} is done".format(server))
    def get_failed_servers(self, check_id, location_id):
        """Return the result of the failed-server query for ``location_id``.

        The query selects SERVER_FULL_NAME for servers that have, for the
        given LOCATION_ID and for CHECK_IDs up to ``check_id``, at least 3
        'ERROR' catalog rows per check whose END_TIME is at or after today's
        first operation hour, that have no later non-'ERROR' check, and whose
        summed failure count is at least 3 and below 6.

        History (2018/09/05): an earlier, simpler query was replaced so the
        failing alert is only sent the first time after the start of the
        current day's operation hours.

        :param check_id: upper bound (inclusive) for CHECK_ID
        :param location_id: LOCATION_ID to filter on
        :return: whatever ``__query_select`` returns for the built query
        """
        # Zero-pad the first operation hour to two characters (e.g. 8 -> "08").
        first_hour = Mc.get_operation_hours(self.__logger)[0]
        if first_hour < 10:
            hour_text = "0{0}".format(first_hour)
        else:
            hour_text = "{0}".format(first_hour)

        # NOTE(review): values are interpolated directly into the SQL text;
        # confirm callers never pass untrusted input here.
        template = "".join((
            "SELECT SERVER_FULL_NAME FROM ( SELECT B.SERVER_FULL_NAME, A.SERVER_ID, A.CHECK_ID, COUNT(1) ",
            "AS FAILED_NUM FROM HANA_OS_MONITOR.M_MONITOR_CATALOG A ",
            "INNER JOIN HANA_OS_MONITOR.SERVER_INFO B ON A.SERVER_ID = B.SERVER_ID ",
            "WHERE A.END_TIME >= TO_TIMESTAMP(TO_NVARCHAR(CURRENT_TIMESTAMP, 'YYYY-MM-DD') ",
            "|| ' {0}:00:00', 'YYYY-MM-DD HH24:MI:SS') ",
            "AND A.STATUS = 'ERROR' AND A.CHECK_ID <= '{1}' AND A.LOCATION_ID = {2} ",
            "GROUP BY B.SERVER_FULL_NAME, A.SERVER_ID, A.CHECK_ID HAVING COUNT(1) >= 3 ) C ",
            "WHERE NOT EXISTS (SELECT 1 FROM HANA_OS_MONITOR.M_MONITOR_CATALOG D ",
            "WHERE D.CHECK_ID > C.CHECK_ID AND D.STATUS <> 'ERROR' AND D.SERVER_ID = C.SERVER_ID)",
            "GROUP BY SERVER_FULL_NAME ",
            "HAVING SUM(FAILED_NUM) >= 3 AND SUM(FAILED_NUM) <6",
        ))
        query = template.format(hour_text, check_id, location_id)

        return self.__query_select(query)
 def publish(self):
     """Upload every entry of ``self.files`` to each non-local server.

     A server is skipped when the local host name (``os.uname()[1]``) occurs
     anywhere inside its name.  For every other server an SSH connection is
     opened with the default SSH user and password, and each file is uploaded
     to the same path it has locally.
     """
     ssh_user = Mc.get_ssh_default_user()
     ssh_password = Mc.get_ssh_default_password()
     local_host = os.uname()[1]
     for server in self.servers:
         if local_host in server:
             Mu.log_debug(None,
                          "Skipping local server on {0}".format(server))
             continue

         connection = Mu.open_ssh_connection(None, self.__os_operator, server,
                                             ssh_user, ssh_password)
         with connection as ssh:
             Mu.log_debug(None, "Publishing agent on {0}".format(server))
             for relative_name in self.files:
                 # Source and target currently share the same path.
                 full_path = self.path + relative_name
                 self.__os_operator.upload_file(ssh, full_path, full_path)
             Mu.log_debug(None,
                          "Publishing agent on {0} is done".format(server))
 def __init__(self):
     """Create the app-operator logger and the action dispatch table.

     ``self.switcher`` maps an ``ActionType`` value to the handler object
     created for that action.
     """
     super().__init__()
     self.__logger = Mu.get_logger(Mc.LOGGER_MONITOR_OPERATOR_APP)

     # Only shutdown and log-backup cleaning are supported at the moment.
     shutdown_handler = AppOperator.__HANACloser(self.__logger)
     log_clean_handler = AppOperator.__HANALogCleaner(self.__logger)
     self.switcher = {
         ActionType.SHUTDOWN.value: shutdown_handler,
         ActionType.CLEAN_LOG_BACKUP.value: log_clean_handler,
     }
     self.__app_operation_interval = Mc.get_app_operation_check_interval()
Exemplo n.º 10
0
    def test_cleaning(self):
        # Arrange: the consumer's poll() yields a single cleaning message.
        app_operator, os_operator_mock, consumer_mock = self.__get_mock_operator()
        polled = {"test_topic": self.__get_mock_msg_list([self.cleaning_msg])}
        consumer_mock.poll.return_value = polled

        # Act: run one operate pass over the mocked consumer.
        app_operator._AppOperator__operate(consumer_mock)

        # Assert: exactly one SSH connection for the expected server/user, and
        # exactly one log-backup clean-up.
        expected_password = Mc.get_ssh_default_password()
        Mu.open_ssh_connection.assert_called_once_with(
            ANY, ANY, self.server_name, self.user_name, expected_password)
        os_operator_mock.clean_log_backup.assert_called_once()
    def __init__(self):
        """Set up the alarm operator's collaborators, settings and state."""
        super().__init__()

        self.__db_operator = AlarmOperator.__HANAOperatorService()
        self.__logger = Mu.get_logger(Mc.LOGGER_MONITOR_OPERATOR_ALARM)

        # Heartbeat timings come from the Mc accessors.
        self.__heartbeat_interval = Mc.get_heartbeat_check_interval()
        self.__heartbeat_timeout = Mc.get_heartbeat_timeout()
        self.__heartbeat_email_interval = Mc.get_heartbeat_email_interval()

        # Initial placeholder values; this constructor loads no real settings.
        self.cpu_threshold = self.mem_threshold = self.disk_threshold = 0
        self.mem_emergency_threshold = 0
        self.email_sender = self.operation_time = ""
        self.max_failure_times = 3
        # Interval for sending email / performing emergency shutdown.
        self.check_interval = 0
        self.servers = []

        self.__producer = Ku.get_producer()
        self.__topic = Mc.TOPIC_APP_OPERATION
        self.__heartbeat_email_info = {}
    def test_initialize_of_coordinator(self):
        coordinator, os_operator_mock, _ = self.__get_mock_objects()
        # Replace the heartbeat routine with a mock returning None.
        coordinator._MonitorCoordinator__process_heartbeat = MagicMock(
            return_value=None)

        # Run the coordinator over a single configuration message.
        messages = self.__get_mock_msg_list([self.config_msg])
        coordinator._MonitorCoordinator__coordinating_monitors(messages)

        # An SSH connection should have been attempted for both servers.
        expected_connections = [
            call(ANY, ANY, server, Mc.get_ssh_default_user(),
                 Mc.get_ssh_default_password())
            for server in (self.server1, self.server2)
        ]
        Mu.open_ssh_connection.assert_has_calls(expected_connections)

        # An agent restart should have been attempted on both servers.
        expected_restarts = [
            call(ANY, server_id, "/usr/sap", Mc.get_agent_path(),
                 self.config_msg[self.mem_interval],
                 self.config_msg[self.cpu_interval],
                 self.config_msg[self.disk_interval],
                 self.config_msg[self.instance_interval])
            for server_id in (1, 2)
        ]
        os_operator_mock.restart_agent.assert_has_calls(expected_restarts)
 def operate(self, parameter):
     """Shut down HANA over SSH on the server described by ``parameter``.

     The server and user names are read from ``parameter`` under
     ``Mc.FIELD_SERVER_FULL_NAME`` and ``Mc.FIELD_USER_NAME``; the default
     SSH password is used.  When the connection yields ``None`` only a
     warning is logged and no shutdown is attempted.

     :param parameter: mapping holding the server full name and user name
     """
     server_name = parameter[Mc.FIELD_SERVER_FULL_NAME]
     user_name = parameter[Mc.FIELD_USER_NAME]
     password = Mc.get_ssh_default_password()
     connection = Mu.open_ssh_connection(self._logger, self._os_operator,
                                         server_name, user_name, password)
     with connection as ssh:
         if ssh is None:
             # TODO: notify alarm operator because of the non-standard password ??
             Mu.log_warning(
                 self._logger,
                 "Failed to log in {0} with user {1}".format(
                     server_name, user_name))
             return

         Mu.log_debug(
             self._logger,
             "Trying shutdown HANA on {0} for user {1}".format(
                 server_name, user_name))
         self._os_operator.shutdown_hana(ssh)
Exemplo n.º 14
0
    def __restart_agent_via_server_id(self, server_id):
        """Restart the agent(s) configured for ``server_id``, rate-limited.

        A restart is attempted only when at least
        ``self.__heartbeat_restart_agent_interval`` seconds have elapsed since
        the restart time recorded for this id (``datetime.min`` when nothing
        is recorded).  Otherwise the skipped attempt is only logged.

        :param server_id: id compared with ``Mc.FIELD_SERVER_ID`` of every
            entry stored under ``Mc.DB_CONFIGURATION_SERVER`` in the configs
        """
        last_restart = self.__heartbeat_agent_restart_info.get(
            server_id, datetime.min)
        now = datetime.now()
        elapsed_seconds = (now - last_restart).total_seconds()

        if elapsed_seconds >= self.__heartbeat_restart_agent_interval:
            configured = self.__configs.get(Mc.DB_CONFIGURATION_SERVER, [])
            matching = [
                entry for entry in configured
                if entry[Mc.FIELD_SERVER_ID] == server_id
            ]
            for entry in matching:
                # Record the restart time before restarting, because a restart
                # takes more than 2 minutes if the server is not responding.
                self.__heartbeat_agent_restart_info[server_id] = datetime.now()

                full_name = entry[Mc.FIELD_SERVER_FULL_NAME]
                Mu.log_info(self.__logger,
                            "Restarting agent on {0}.".format(full_name))
                self.__restart_agent(
                    full_name,
                    entry[Mc.FIELD_SERVER_ID],
                    entry[Mc.FIELD_MOUNT_POINT],
                    Mc.get_agent_path(),
                    self.__configs.get("CHECK_INTERVAL_MEM_INT", 60),
                    self.__configs.get("CHECK_INTERVAL_CPU_INT", 300),
                    self.__configs.get("CHECK_INTERVAL_DISK_INT", 3600),
                    self.__configs.get("CHECK_INTERVAL_INSTANCE_INT", 300))
                Mu.log_info(
                    self.__logger,
                    "Restarting agent on {0} is finished.".format(full_name))
            return

        # Too soon since the recorded restart: report and do nothing.
        skipped_template = (
            "heartbeat failed for {0}, but did not try to restart agent due to the "
            "configured operation interval time ({1}). (pre: {2}, cur: {3})"
        )
        Mu.log_info(
            self.__logger,
            skipped_template.format(
                server_id, self.__heartbeat_restart_agent_interval,
                last_restart.strftime("%Y-%m-%d %H:%M:%S"),
                now.strftime("%Y-%m-%d %H:%M:%S")))
Exemplo n.º 15
0
    def __coordinating_monitors(self, consumer):
        """
        Coordinate (start/stop/restart) all the agents.

        For every message taken from ``consumer`` the configs are updated
        from ``msg.value``; when that update reports a change, the agent is
        restarted on every configured server.  Afterwards, if the
        configuration check passes and the heartbeat flag is still unset,
        the flag is set and a heartbeat thread is started.  Any ``Exception``
        raised while handling a message is logged and the loop continues.

        :param consumer: kafka consumer
        """
        Mu.log_debug(self.__logger,
                     "Coordinator is listening on topic for configurations.")
        for msg in consumer:
            try:
                Mu.log_debug(self.__logger, "New configs are coming...")
                configs_changed = self.__update_configs(msg.value)
                if configs_changed:
                    # Current design: restart every agent whenever any config
                    # has changed.
                    for server in self.__configs.get(
                            Mc.DB_CONFIGURATION_SERVER, []):
                        self.__restart_agent(
                            server[Mc.FIELD_SERVER_FULL_NAME],
                            server[Mc.FIELD_SERVER_ID],
                            server[Mc.FIELD_MOUNT_POINT],
                            Mc.get_agent_path(),
                            self.__configs.get("CHECK_INTERVAL_MEM_INT", 60),
                            self.__configs.get("CHECK_INTERVAL_CPU_INT", 300),
                            self.__configs.get("CHECK_INTERVAL_DISK_INT", 3600),
                            self.__configs.get("CHECK_INTERVAL_INSTANCE_INT", 300))

                if self.__check_configuration() and not self.__heartbeat_flag:
                    # Set the flag first, then launch the heartbeat thread.
                    self.__heartbeat_flag = True
                    threading.Thread(target=self.__process_heartbeat).start()
            except Exception as ex:
                message = (
                    "Error occurred when coordinating the monitors, "
                    "Err: {0}"
                ).format(ex)
                Mu.log_warning_exc(self.__logger, message)
Exemplo n.º 16
0
 def initialize(self):
     """Execute the script named by ``Mc.get_init_sql_file()`` via the DB operator."""
     init_script = Mc.get_init_sql_file()
     self._db_operator.execute_from_script(init_script)