Пример #1
0
    def daemon(self):
        """
        Main loop of the monitor thread.

        Creates the protocol objects and the periodic operations, then keeps running the operations
        until the thread is stopped. Any exception escaping the loop is logged and ends the thread.
        """
        try:
            # Each thread uses its own protocol instance (SingletonPerThread model), so the protocol has to be
            # created here, inside the monitor thread, to avoid concurrency issues with the protocol object.
            protocol_util = get_protocol_util()
            protocol = protocol_util.get_protocol()
            health_service = HealthService(protocol.get_endpoint())

            operations = [
                ResetPeriodicLogMessages(),
                ReportNetworkErrors(),
                PollResourceUsage(),
                SendHostPluginHeartbeat(protocol, health_service),
                SendImdsHeartbeat(protocol_util, health_service)
            ]

            network_changes = ReportNetworkConfigurationChanges()
            if not conf.get_monitor_network_configuration_changes():
                # Monitoring is disabled: log the network configuration once instead of scheduling the operation.
                logger.info("Monitor.NetworkConfigurationChanges is disabled.")
                network_changes.log_network_configuration()
            else:
                operations.append(network_changes)

            while not self.stopped():
                try:
                    for operation in operations:
                        operation.run()
                finally:
                    # Always pause before the next pass, even if an operation raised.
                    PeriodicOperation.sleep_until_next_operation(operations)
        except Exception as error:
            logger.error("An error occurred in the monitor thread; will exit the thread.\n{0}", ustr(error))
Пример #2
0
    def __init__(self):
        """
        Set up the handler state: OS utilities, the list of periodic operations, and the error-state
        trackers used by the HostGAPlugin and IMDS heartbeats. Protocol objects and the IMDS client
        start as None.
        """
        self.osutil = get_osutil()

        # Populated later; nothing is connected at construction time.
        self.imds_client = None
        self.event_thread = None
        self.protocol = None
        self.protocol_util = None
        self.health_service = None

        # Operations executed by the monitor loop, in this order.
        reset_log_messages = ResetPeriodicLogMessagesOperation()
        collect_events = PeriodicOperation(
            "collect_and_send_events", self.collect_and_send_events, self.EVENT_COLLECTION_PERIOD)
        report_network_errors = ReportNetworkErrorsOperation()
        poll_resource_usage = PollResourceUsageOperation()
        host_plugin_heartbeat = PeriodicOperation(
            "send_host_plugin_heartbeat", self.send_host_plugin_heartbeat, self.HOST_PLUGIN_HEARTBEAT_PERIOD)
        imds_heartbeat = PeriodicOperation(
            "send_imds_heartbeat", self.send_imds_heartbeat, self.IMDS_HEARTBEAT_PERIOD)
        report_network_changes = ReportNetworkConfigurationChangesOperation()
        self._periodic_operations = [
            reset_log_messages,
            collect_events,
            report_network_errors,
            poll_resource_usage,
            host_plugin_heartbeat,
            imds_heartbeat,
            report_network_changes,
        ]

        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)
    def daemon(self):
        """
        Main loop of the telemetry thread.

        Always collects and enqueues events; additionally processes extension events when the
        Extension Telemetry Pipeline feature is reported as supported. A failure in one pass is
        logged as a warning and the loop continues with the next pass.
        """
        operations = [
            _CollectAndEnqueueEvents(self._send_telemetry_events_handler)
        ]

        etp_feature = get_supported_feature_by_name(SupportedFeatureNames.ExtensionTelemetryPipeline)
        is_etp_enabled = etp_feature.is_supported
        logger.info("Extension Telemetry pipeline enabled: {0}".format(is_etp_enabled))
        if is_etp_enabled:
            extension_events = _ProcessExtensionEvents(self._send_telemetry_events_handler)
            operations.append(extension_events)

        thread_name = self.get_thread_name()
        logger.info("Successfully started the {0} thread".format(thread_name))

        while not self.stopped():
            try:
                for operation in operations:
                    operation.run()
            except Exception as loop_error:
                logger.warn(
                    "An error occurred in the Telemetry Extension thread main loop; will skip the current iteration.\n{0}",
                    ustr(loop_error))
            finally:
                # Pause until the next operation is due, whether or not this pass succeeded.
                PeriodicOperation.sleep_until_next_operation(operations)
Пример #4
0
    def daemon(self):
        """
        Main loop of the monitor thread.

        Lazily initializes the protocol objects and the IMDS client if they are missing, then runs
        the periodic operations until the thread is stopped. Errors inside one pass are logged and
        the pass is skipped; errors outside the loop are logged and end the thread.
        """
        try:
            protocols_ready = self.protocol_util is not None and self.protocol is not None
            if not protocols_ready:
                self.init_protocols()

            if self.imds_client is None:
                self.init_imds_client()

            while not self.stopped():
                try:
                    self.protocol.update_host_plugin_from_goal_state()

                    for operation in self._periodic_operations:
                        operation.run()

                except Exception as loop_error:
                    logger.error(
                        "An error occurred in the monitor thread main loop; will skip the current iteration.\n{0}",
                        ustr(loop_error))
                finally:
                    # Pause until the next operation is due, whether or not this pass succeeded.
                    PeriodicOperation.sleep_until_next_operation(self._periodic_operations)
        except Exception as fatal_error:
            logger.error(
                "An error occurred in the monitor thread; will exit the thread.\n{0}",
                ustr(fatal_error))
Пример #5
0
 def monitor(self):
     """
     Main loop of the environment thread.

     Creates the thread's own protocol objects and runs the periodic operations while
     ``self.stopped`` is false. Errors inside one pass are logged and the pass is skipped;
     errors outside the loop are logged and end the thread.
     """
     try:
         # ProtocolUtil follows the SingletonPerThread model, so it is created here, within the Environment
         # thread itself, instead of in the ExtHandler thread; this avoids concurrency issues because every
         # thread ends up with its own ProtocolUtil object.
         protocol_util = get_protocol_util()
         self.protocol_util = protocol_util
         self._protocol = protocol_util.get_protocol()

         while not self.stopped:
             try:
                 for operation in self._periodic_operations:
                     operation.run()
             except Exception as loop_error:
                 logger.error("An error occurred in the environment thread main loop; will skip the current iteration.\n{0}", ustr(loop_error))
             finally:
                 # Pause until the next operation is due, whether or not this pass succeeded.
                 PeriodicOperation.sleep_until_next_operation(self._periodic_operations)
     except Exception as fatal_error:
         logger.error("An error occurred in the environment thread; will exit the thread.\n{0}", ustr(fatal_error))
Пример #6
0
    def daemon(self, init_data=False):
        """
        Main loop of the log collection thread.

        :param init_data: when true, initialize the protocol objects before entering the loop.

        Errors inside one pass are logged and the pass is skipped; errors outside the loop are
        logged and end the thread.
        """
        try:
            if init_data:
                self.init_protocols()

            while not self.stopped():
                try:
                    for operation in self._periodic_operations:
                        operation.run()
                except Exception as loop_error:
                    logger.error(
                        "An error occurred in the log collection thread main loop; will skip the current iteration.\n{0}",
                        ustr(loop_error))
                finally:
                    # Pause until the next operation is due, whether or not this pass succeeded.
                    PeriodicOperation.sleep_until_next_operation(self._periodic_operations)
        except Exception as fatal_error:
            logger.error(
                "An error occurred in the log collection thread; will exit the thread.\n{0}",
                ustr(fatal_error))
Пример #7
0
    def __init__(self):
        """
        Set up the handler state. Protocol objects, the worker thread and the last state start as
        None; the only periodic operation is log collection, scheduled with the configured period.
        """
        self.protocol = None
        self.protocol_util = None
        self.event_thread = None
        self.should_run = True
        self.last_state = None

        collect_logs = PeriodicOperation(
            "collect_and_send_logs", self.collect_and_send_logs, conf.get_collect_logs_period())
        self._periodic_operations = [collect_logs]
Пример #8
0
    def daemon(self, init_data=False):
        """
        Main loop of the monitor thread.

        :param init_data: when true, initialize the protocol objects and the IMDS client before
                          entering the loop.

        Errors inside one pass are logged and the pass is skipped; errors outside the loop are
        logged and end the thread.
        """
        try:
            if init_data:
                self.init_protocols()
                self.init_imds_client()

            while not self.stopped():
                try:
                    self.protocol.update_host_plugin_from_goal_state()

                    for operation in self._periodic_operations:
                        operation.run()

                except Exception as loop_error:
                    logger.error(
                        "An error occurred in the monitor thread main loop; will skip the current iteration.\n{0}",
                        ustr(loop_error))
                finally:
                    # Pause until the next operation is due, whether or not this pass succeeded.
                    PeriodicOperation.sleep_until_next_operation(self._periodic_operations)
        except Exception as fatal_error:
            logger.error(
                "An error occurred in the monitor thread; will exit the thread.\n{0}",
                ustr(fatal_error))
Пример #9
0
    def daemon(self):
        """
        Main loop of the environment thread.

        Creates the thread's own protocol objects, builds the list of periodic operations (some of
        them only when enabled in the configuration) and runs them while ``self.stopped`` is false.
        Errors inside one pass are logged and the pass is skipped; errors outside the loop are
        logged and end the thread.
        """
        try:
            # The protocol follows the SingletonPerThread model, so it is created here, within the environment
            # thread itself, instead of in the ExtHandler thread; this avoids concurrency issues because every
            # thread ends up with its own ProtocolUtil object.
            protocol_util = get_protocol_util()
            protocol = protocol_util.get_protocol()
            osutil = get_osutil()

            # Operations that always run
            operations = [
                RemovePersistentNetworkRules(osutil),
                MonitorDhcpClientRestart(osutil),
                CleanupGoalStateHistory()
            ]

            # Operations controlled by the configuration
            if conf.enable_firewall():
                operations.append(EnableFirewall(osutil, protocol))
            if conf.get_root_device_scsi_timeout() is not None:
                operations.append(SetRootDeviceScsiTimeout(osutil))
            if conf.get_monitor_hostname():
                operations.append(MonitorHostNameChanges(osutil))

            while not self.stopped:
                try:
                    for operation in operations:
                        operation.run()
                except Exception as loop_error:
                    logger.error(
                        "An error occurred in the environment thread main loop; will skip the current iteration.\n{0}",
                        ustr(loop_error))
                finally:
                    # Pause until the next operation is due, whether or not this pass succeeded.
                    PeriodicOperation.sleep_until_next_operation(operations)
        except Exception as fatal_error:
            logger.error(
                "An error occurred in the environment thread; will exit the thread.\n{0}",
                ustr(fatal_error))
Пример #10
0
    def __init__(self):
        """
        Set up the handler state and the list of periodic operations. Three operations are always
        scheduled; the firewall, root-device SCSI timeout and hostname operations are added only
        when the corresponding configuration setting enables them.
        """
        self.osutil = get_osutil()
        self.dhcp_handler = get_dhcp_handler()
        self.protocol_util = None
        self._protocol = None
        self.stopped = True
        self.hostname = None
        self.dhcp_id_list = []
        self.server_thread = None
        self.dhcp_warning_enabled = True
        self.archiver = StateArchiver(conf.get_lib_dir())
        self._reset_firewall_rules = False

        # Operations that always run
        operations = [
            PeriodicOperation(
                "_remove_persistent_net_rules",
                self._remove_persistent_net_rules_period,
                conf.get_remove_persistent_net_rules_period()),
            PeriodicOperation(
                "_monitor_dhcp_client_restart",
                self._monitor_dhcp_client_restart,
                conf.get_monitor_dhcp_client_restart_period()),
            PeriodicOperation(
                "_cleanup_goal_state_history",
                self._cleanup_goal_state_history,
                conf.get_goal_state_history_cleanup_period())
        ]

        # Operations controlled by the configuration
        if conf.enable_firewall():
            operations.append(PeriodicOperation(
                "_enable_firewall",
                self._enable_firewall,
                conf.get_enable_firewall_period()))
        if conf.get_root_device_scsi_timeout() is not None:
            operations.append(PeriodicOperation(
                "_set_root_device_scsi_timeout",
                self._set_root_device_scsi_timeout,
                conf.get_root_device_scsi_timeout_period()))
        if conf.get_monitor_hostname():
            operations.append(PeriodicOperation(
                "_monitor_hostname",
                self._monitor_hostname_changes,
                conf.get_monitor_hostname_period()))

        self._periodic_operations = operations
Пример #11
0
    def __init__(self):
        """
        Set up the handler state: OS utilities, one PeriodicOperation per monitoring task, the
        cached network state, and the error-state trackers used by the HostGAPlugin and IMDS
        heartbeats. Protocol objects and the IMDS client start as None.
        """
        self.osutil = get_osutil()
        self.imds_client = None

        self.event_thread = None

        # One periodic operation per task; each wraps a method of this handler and its period.
        self._reset_loggers_op = PeriodicOperation(
            "reset_loggers",
            self.reset_loggers,
            self.RESET_LOGGERS_PERIOD)
        self._collect_and_send_events_op = PeriodicOperation(
            "collect_and_send_events",
            self.collect_and_send_events,
            self.EVENT_COLLECTION_PERIOD)
        self._send_telemetry_heartbeat_op = PeriodicOperation(
            "send_telemetry_heartbeat",
            self.send_telemetry_heartbeat,
            self.TELEMETRY_HEARTBEAT_PERIOD)
        self._poll_telemetry_metrics_op = PeriodicOperation(
            "poll_telemetry_metrics usage",
            self.poll_telemetry_metrics,
            self.CGROUP_TELEMETRY_POLLING_PERIOD)
        self._send_telemetry_metrics_op = PeriodicOperation(
            "send_telemetry_metrics usage",
            self.send_telemetry_metrics,
            self.CGROUP_TELEMETRY_REPORTING_PERIOD)
        self._send_host_plugin_heartbeat_op = PeriodicOperation(
            "send_host_plugin_heartbeat",
            self.send_host_plugin_heartbeat,
            self.HOST_PLUGIN_HEARTBEAT_PERIOD)
        self._send_imds_heartbeat_op = PeriodicOperation(
            "send_imds_heartbeat",
            self.send_imds_heartbeat,
            self.IMDS_HEARTBEAT_PERIOD)
        self._log_altered_network_configuration_op = PeriodicOperation(
            "log_altered_network_configuration",
            self.log_altered_network_configuration,
            self.LOG_NETWORK_CONFIGURATION_PERIOD)

        # Populated later; nothing is connected at construction time.
        self.protocol = None
        self.protocol_util = None
        self.health_service = None

        # Last observed network state, used to detect changes between checks.
        self.last_route_table_hash = b''
        self.last_nic_state = {}

        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)
Пример #12
0
class MonitorHandler(object):
    """
    Runs periodic monitoring operations (event collection, telemetry and cgroup metrics,
    HostGAPlugin and IMDS heartbeats, network-configuration logging, logger resets) on a
    background thread.
    """
    # telemetry: how often collect_and_send_events runs
    EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1)
    # telemetry heartbeat: how often send_telemetry_heartbeat runs
    TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30)
    # cgroup data: how often metrics are polled (poll_telemetry_metrics) and reported (send_telemetry_metrics)
    CGROUP_TELEMETRY_POLLING_PERIOD = datetime.timedelta(minutes=5)
    CGROUP_TELEMETRY_REPORTING_PERIOD = datetime.timedelta(minutes=30)
    # host plugin: heartbeat frequency, and the min_timedelta given to the host plugin ErrorState
    HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)
    # imds: heartbeat frequency, and the min_timedelta given to the IMDS ErrorState
    IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3)
    # log network configuration: how often log_altered_network_configuration runs
    LOG_NETWORK_CONFIGURATION_PERIOD = datetime.timedelta(minutes=1)
    # Resetting loggers period: how often reset_loggers runs
    RESET_LOGGERS_PERIOD = datetime.timedelta(hours=12)

    def __init__(self):
        """
        Create the handler: fetch the OS utilities, wrap every monitoring method in a
        PeriodicOperation with its class-level period, and initialize the network-state cache and
        the heartbeat error-state trackers. Protocol objects and the IMDS client are left as None.
        """
        self.osutil = get_osutil()
        self.imds_client = None

        self.event_thread = None

        # Housekeeping and telemetry operations
        self._reset_loggers_op = PeriodicOperation(
            "reset_loggers", self.reset_loggers, self.RESET_LOGGERS_PERIOD)
        self._collect_and_send_events_op = PeriodicOperation(
            "collect_and_send_events", self.collect_and_send_events, self.EVENT_COLLECTION_PERIOD)
        self._send_telemetry_heartbeat_op = PeriodicOperation(
            "send_telemetry_heartbeat", self.send_telemetry_heartbeat, self.TELEMETRY_HEARTBEAT_PERIOD)

        # cgroup metrics operations
        self._poll_telemetry_metrics_op = PeriodicOperation(
            "poll_telemetry_metrics usage", self.poll_telemetry_metrics, self.CGROUP_TELEMETRY_POLLING_PERIOD)
        self._send_telemetry_metrics_op = PeriodicOperation(
            "send_telemetry_metrics usage", self.send_telemetry_metrics, self.CGROUP_TELEMETRY_REPORTING_PERIOD)

        # Health heartbeat operations
        self._send_host_plugin_heartbeat_op = PeriodicOperation(
            "send_host_plugin_heartbeat", self.send_host_plugin_heartbeat, self.HOST_PLUGIN_HEARTBEAT_PERIOD)
        self._send_imds_heartbeat_op = PeriodicOperation(
            "send_imds_heartbeat", self.send_imds_heartbeat, self.IMDS_HEARTBEAT_PERIOD)

        # Network configuration logging operation
        self._log_altered_network_configuration_op = PeriodicOperation(
            "log_altered_network_configuration",
            self.log_altered_network_configuration,
            self.LOG_NETWORK_CONFIGURATION_PERIOD)

        # Set by init_protocols()
        self.protocol = None
        self.protocol_util = None
        self.health_service = None

        # Last observed network state, compared against on every check
        self.last_route_table_hash = b''
        self.last_nic_state = {}

        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)

    def run(self):
        """
        Start the monitor thread, asking it to initialize its protocol objects and IMDS client.
        """
        needs_initialization = True
        self.start(init_data=needs_initialization)

    def stop(self):
        self.should_run = False
        if self.is_alive():
            self.join()

    def join(self):
        self.event_thread.join()

    def stopped(self):
        return not self.should_run

    def init_protocols(self):
        """
        Create the protocol utility, the protocol and the health service used by this handler.
        """
        # ProtocolUtil follows the SingletonPerThread model, so it is initialized from within the Monitor
        # thread itself instead of the ExtHandler thread; this avoids concurrency issues because every
        # thread ends up with its own ProtocolUtil object.
        protocol_util = get_protocol_util()
        self.protocol_util = protocol_util

        protocol = protocol_util.get_protocol()
        self.protocol = protocol

        endpoint = protocol.get_endpoint()
        self.health_service = HealthService(endpoint)

    def init_imds_client(self):
        """
        Create the IMDS client from the wireserver endpoint reported by the protocol utility.
        Requires ``self.protocol_util`` to be set already.
        """
        endpoint = self.protocol_util.get_wireserver_endpoint()
        client = get_imds_client(endpoint)
        self.imds_client = client

    def is_alive(self):
        return self.event_thread is not None and self.event_thread.is_alive()

    def start(self, init_data=False):
        self.event_thread = threading.Thread(target=self.daemon,
                                             args=(init_data, ))
        self.event_thread.setDaemon(True)
        self.event_thread.setName("MonitorHandler")
        self.event_thread.start()

    def collect_and_send_events(self):
        """
        Collect the pending events and, when there are any, report them through the protocol.
        """
        collected = collect_events()

        if len(collected.events) == 0:
            # Nothing to report
            return

        self.protocol.report_event(collected)

    def daemon(self, init_data=False):

        if init_data:
            self.init_protocols()
            self.init_imds_client()

        min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD,
                        MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD,
                        MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD,
                        MonitorHandler.EVENT_COLLECTION_PERIOD,
                        MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD,
                        MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds
        while not self.stopped():
            try:
                self.protocol.update_host_plugin_from_goal_state()
                self._send_telemetry_heartbeat_op.run()
                self._poll_telemetry_metrics_op.run()
                # This will be removed in favor of poll_telemetry_metrics() and it'll directly send the perf data for
                # each cgroup.
                self._send_telemetry_metrics_op.run()
                self._collect_and_send_events_op.run()
                self._send_host_plugin_heartbeat_op.run()
                self._send_imds_heartbeat_op.run()
                self._log_altered_network_configuration_op.run()
                self._reset_loggers_op.run()
            except Exception as e:
                logger.warn(
                    "An error occurred in the monitor thread main loop; will skip the current iteration.\n{0}",
                    ustr(e))
            time.sleep(min_delta)

    def reset_loggers(self):
        """
        Reset the periodic-message state of the logger.

        The loggers keep hash-tables in memory that must be cleaned up from time to time; see
        azurelinuxagent.common.logger.Logger and azurelinuxagent.common.event.EventLogger.
        """
        logger.reset_periodic()

    def send_imds_heartbeat(self):
        """
        Validate IMDS and report the resulting health signal to the health service.

        Intended to run every IMDS_HEARTBEAT_PERIOD. The reported signal is 'Healthy' as long as
        the IMDS error state has not been triggered, i.e. a response was successfully validated
        within the last IMDS_HEALTH_PERIOD. Any exception is reported as a failed ImdsHeartbeat
        event instead of being raised.
        """
        try:
            validated, response = self.imds_client.validate()

            # A successful validation clears the error state; a failed one counts against it.
            update_error_state = self.imds_errorstate.reset if validated else self.imds_errorstate.incr
            update_error_state()

            healthy = self.imds_errorstate.is_triggered() is False
            logger.verbose("IMDS health: {0} [{1}]", healthy, response)

            self.health_service.report_imds_status(healthy, response)

        except Exception as error:
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ImdsHeartbeat,
                      is_success=False,
                      message="Exception sending imds heartbeat: {0}".format(ustr(error)),
                      log_event=False)

    def send_host_plugin_heartbeat(self):
        """
        Check the HostGAPlugin health and report the resulting signal to the health service.

        Intended to run every HOST_PLUGIN_HEARTBEAT_PERIOD. The reported signal is 'Healthy' as
        long as the host plugin error state has not been triggered, i.e. the HostGAPlugin was
        reachable at least once within the last HOST_PLUGIN_HEALTH_PERIOD. When unhealthy, an
        additional HostPluginHeartbeatExtended event is emitted. Any exception is reported as a
        failed HostPluginHeartbeat event instead of being raised.
        """
        try:
            plugin = self.protocol.client.get_host_plugin()
            plugin.ensure_initialized()
            reachable = plugin.get_health()

            # A successful check clears the error state; a failed one counts against it.
            update_error_state = self.host_plugin_errorstate.reset if reachable else self.host_plugin_errorstate.incr
            update_error_state()

            healthy = self.host_plugin_errorstate.is_triggered() is False
            logger.verbose("HostGAPlugin health: {0}", healthy)

            self.health_service.report_host_plugin_heartbeat(healthy)

            if not healthy:
                failure_message = '{0} since successful heartbeat'.format(self.host_plugin_errorstate.fail_time)
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HostPluginHeartbeatExtended,
                          is_success=False,
                          message=failure_message,
                          log_event=False)

        except Exception as error:
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.HostPluginHeartbeat,
                      is_success=False,
                      message="Exception sending host plugin heartbeat: {0}".format(ustr(error)),
                      log_event=False)

    def send_telemetry_heartbeat(self):
        """
        Fetch and reset the I/O error counters and, if any counter is above zero, emit an
        HttpErrors event summarizing the hostplugin, protocol and other error counts.
        """
        counters = IOErrorCounter.get_and_reset()
        hostplugin_count = counters.get("hostplugin")
        protocol_count = counters.get("protocol")
        other_count = counters.get("other")

        if not (hostplugin_count > 0 or protocol_count > 0 or other_count > 0):
            # No errors were counted; nothing to report
            return

        summary = "hostplugin:{0};protocol:{1};other:{2}".format(hostplugin_count, protocol_count, other_count)
        add_event(name=AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.HttpErrors,
                  is_success=True,
                  message=summary,
                  log_event=False)

    def poll_telemetry_metrics(self):
        """
        Poll all tracked cgroups and report every metric obtained directly via report_metric().
        Nothing is returned.
        """
        polled = CGroupsTelemetry.poll_all_tracked()

        if not polled:
            # No metrics available in this pass
            return

        for item in polled:
            report_metric(item.category, item.counter, item.instance, item.value)

    def send_telemetry_metrics(self):
        """
        Report the aggregated metrics of all tracked cgroups as an ExtensionMetricsData event.

        This method is expected to be removed soon in favor of sending performance metrics directly.
        """
        aggregated = CGroupsTelemetry.report_all_tracked()

        if not aggregated:
            # Nothing was tracked; no event is sent
            return

        payload = generate_extension_metrics_telemetry_dictionary(schema_version=1.0,
                                                                  performance_metrics=aggregated)
        add_event(name=AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.ExtensionMetricsData,
                  is_success=True,
                  message=ustr(payload),
                  log_event=False)

    def log_altered_network_configuration(self):
        """
        Log the route table and the NIC state whenever they differ from what the previous check saw.

        The route table is compared through a hash of its raw lines; the NIC state is compared
        directly. The remembered values are updated when a change is logged.
        """
        # Route table
        raw_routes = self.osutil.read_route_table()
        routes_hash = hash_strings(raw_routes)
        if routes_hash != self.last_route_table_hash:
            self.last_route_table_hash = routes_hash
            routes = self.osutil.get_list_of_routes(raw_routes)
            routes_json = ",".join(networkutil.RouteEntry.to_json(route) for route in routes)
            logger.info("Route table: [{0}]".format(routes_json))

        # NIC state
        current_nics = self.osutil.get_nic_state()
        if current_nics != self.last_nic_state:
            # An empty remembered state means this is the first time the NICs are logged
            if self.last_nic_state == {}:
                label = "Initial"
            else:
                label = "Updated"
            nics_text = ", ".join(str(nic) for nic in current_nics.values())
            logger.info("{0} NIC state: [{1}]".format(label, nics_text))
            self.last_nic_state = current_nics