Exemplo n.º 1
0
    def __init__(self):
        self.osutil = get_osutil()
        self.imds_client = None

        self.event_thread = None
        self._periodic_operations = [
            ResetPeriodicLogMessagesOperation(),
            PeriodicOperation("collect_and_send_events",
                              self.collect_and_send_events,
                              self.EVENT_COLLECTION_PERIOD),
            ReportNetworkErrorsOperation(),
            PollResourceUsageOperation(),
            PeriodicOperation("send_host_plugin_heartbeat",
                              self.send_host_plugin_heartbeat,
                              self.HOST_PLUGIN_HEARTBEAT_PERIOD),
            PeriodicOperation("send_imds_heartbeat", self.send_imds_heartbeat,
                              self.IMDS_HEARTBEAT_PERIOD),
            ReportNetworkConfigurationChangesOperation(),
        ]
        self.protocol = None
        self.protocol_util = None
        self.health_service = None

        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(
            min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(
            min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)
    def __init__(self):
        self.osutil = get_osutil()
        self.protocol_util = get_protocol_util()
        self.imds_client = get_imds_client()

        self.event_thread = None
        self.last_reset_loggers_time = None
        self.last_event_collection = None
        self.last_telemetry_heartbeat = None
        self.last_cgroup_polling_telemetry = None
        self.last_cgroup_report_telemetry = None
        self.last_host_plugin_heartbeat = None
        self.last_imds_heartbeat = None
        self.protocol = None
        self.health_service = None
        self.last_route_table_hash = b''
        self.last_nic_state = {}

        self.counter = 0
        self.sysinfo = []
        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(
            min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(
            min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)
Exemplo n.º 3
0
 def test_errorstate00(self):
     """
     If ErrorState is never incremented, it will never trigger.
     """
     test_subject = ErrorState(timedelta(seconds=10000))
     self.assertFalse(test_subject.is_triggered())
     self.assertEqual(0, test_subject.count)
     self.assertEqual('unknown', test_subject.fail_time)
Exemplo n.º 4
0
    def __init__(self):
        self.protocol_util = get_protocol_util()
        self.protocol = None
        self.ext_handlers = None
        self.last_etag = None
        self.last_upgrade_guids = {}
        self.log_report = False
        self.log_etag = True

        self.report_status_error_state = ErrorState()
Exemplo n.º 5
0
    def __init__(self):
        self.protocol_util = get_protocol_util()
        self.protocol = None
        self.ext_handlers = None
        self.last_etag = None
        self.log_report = False
        self.log_etag = True
        self.log_process = False

        self.report_status_error_state = ErrorState()
        self.get_artifact_error_state = ErrorState(min_timedelta=ERROR_STATE_DELTA_INSTALL)
Exemplo n.º 6
0
    def test_errorstate02(self):
        """
        If ErrorState is triggered, and the current time is within timedelta
        of now it will trigger.
        """
        test_subject = ErrorState(timedelta(seconds=0))
        test_subject.incr()

        self.assertTrue(test_subject.is_triggered())
        self.assertEqual(1, test_subject.count)
        self.assertEqual('0.0 min', test_subject.fail_time)
Exemplo n.º 7
0
    def __init__(self):
        self.protocol_util = get_protocol_util()
        self.protocol = None
        self.ext_handlers = None
        self.last_etag = None
        self.log_report = False
        self.log_etag = True
        self.log_process = False

        self.report_status_error_state = ErrorState()
        self.get_artifact_error_state = ErrorState(min_timedelta=ERROR_STATE_DELTA_INSTALL)
Exemplo n.º 8
0
class SendHostPluginHeartbeat(PeriodicOperation):
    """
    Periodic operation for reporting the HostGAPlugin's health. The signal is 'Healthy' when we have been able to communicate with
    plugin at least once in the last _HOST_PLUGIN_HEALTH_PERIOD.
    """
    def __init__(self, protocol, health_service):
        super(SendHostPluginHeartbeat, self).__init__(SendHostPluginHeartbeat._HOST_PLUGIN_HEARTBEAT_PERIOD)
        self.protocol = protocol
        self.health_service = health_service
        self.host_plugin_error_state = ErrorState(min_timedelta=SendHostPluginHeartbeat._HOST_PLUGIN_HEALTH_PERIOD)

    _HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    _HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)

    def _operation(self):
        try:
            host_plugin = self.protocol.client.get_host_plugin()
            host_plugin.ensure_initialized()
            self.protocol.update_host_plugin_from_goal_state()

            is_currently_healthy = host_plugin.get_health()

            if is_currently_healthy:
                self.host_plugin_error_state.reset()
            else:
                self.host_plugin_error_state.incr()

            is_healthy = self.host_plugin_error_state.is_triggered() is False
            logger.verbose("HostGAPlugin health: {0}", is_healthy)

            self.health_service.report_host_plugin_heartbeat(is_healthy)

            if not is_healthy:
                add_event(
                    name=AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.HostPluginHeartbeatExtended,
                    is_success=False,
                    message='{0} since successful heartbeat'.format(self.host_plugin_error_state.fail_time),
                    log_event=False)

        except Exception as e:
            msg = "Exception sending host plugin heartbeat: {0}".format(ustr(e))
            add_event(
                name=AGENT_NAME,
                version=CURRENT_VERSION,
                op=WALAEventOperation.HostPluginHeartbeat,
                is_success=False,
                message=msg,
                log_event=False)
            raise
Exemplo n.º 9
0
 def __init__(self, endpoint, container_id, role_config_name):
     if endpoint is None:
         raise ProtocolError("HostGAPlugin: Endpoint not provided")
     self.is_initialized = False
     self.is_available = False
     self.api_versions = None
     self.endpoint = endpoint
     self.container_id = container_id
     self.deployment_id = None
     self.role_config_name = role_config_name
     self.manifest_uri = None
     self.health_service = HealthService(endpoint)
     self.fetch_error_state = ErrorState(min_timedelta=ERROR_STATE_HOST_PLUGIN_FAILURE)
     self.status_error_state = ErrorState(min_timedelta=ERROR_STATE_HOST_PLUGIN_FAILURE)
     self.fetch_last_timestamp = None
     self.status_last_timestamp = None
Exemplo n.º 10
0
 def __init__(self, endpoint, container_id, role_config_name):
     if endpoint is None:
         raise ProtocolError("HostGAPlugin: Endpoint not provided")
     self.is_initialized = False
     self.is_available = False
     self.api_versions = None
     self.endpoint = endpoint
     self.container_id = container_id
     self.deployment_id = None
     self.role_config_name = role_config_name
     self.manifest_uri = None
     self.health_service = HealthService(endpoint)
     self.fetch_error_state = ErrorState(min_timedelta=ERROR_STATE_HOST_PLUGIN_FAILURE)
     self.status_error_state = ErrorState(min_timedelta=ERROR_STATE_HOST_PLUGIN_FAILURE)
     self.fetch_last_timestamp = None
     self.status_last_timestamp = None
Exemplo n.º 11
0
    def __init__(self):
        self.osutil = get_osutil()
        self.protocol_util = get_protocol_util()
        self.imds_client = get_imds_client()

        self.event_thread = None
        self.last_event_collection = None
        self.last_telemetry_heartbeat = None
        self.last_host_plugin_heartbeat = None
        self.protocol = None
        self.health_service = None

        self.counter = 0
        self.sysinfo = []
        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
Exemplo n.º 12
0
    def __init__(self):
        self.osutil = get_osutil()
        self.imds_client = None

        self.event_thread = None
        self._reset_loggers_op = PeriodicOperation("reset_loggers",
                                                   self.reset_loggers,
                                                   self.RESET_LOGGERS_PERIOD)
        self._collect_and_send_events_op = PeriodicOperation(
            "collect_and_send_events", self.collect_and_send_events,
            self.EVENT_COLLECTION_PERIOD)
        self._send_telemetry_heartbeat_op = PeriodicOperation(
            "send_telemetry_heartbeat", self.send_telemetry_heartbeat,
            self.TELEMETRY_HEARTBEAT_PERIOD)
        self._poll_telemetry_metrics_op = PeriodicOperation(
            "poll_telemetry_metrics usage", self.poll_telemetry_metrics,
            self.CGROUP_TELEMETRY_POLLING_PERIOD)
        self._send_telemetry_metrics_op = PeriodicOperation(
            "send_telemetry_metrics usage", self.send_telemetry_metrics,
            self.CGROUP_TELEMETRY_REPORTING_PERIOD)
        self._send_host_plugin_heartbeat_op = PeriodicOperation(
            "send_host_plugin_heartbeat", self.send_host_plugin_heartbeat,
            self.HOST_PLUGIN_HEARTBEAT_PERIOD)
        self._send_imds_heartbeat_op = PeriodicOperation(
            "send_imds_heartbeat", self.send_imds_heartbeat,
            self.IMDS_HEARTBEAT_PERIOD)
        self._log_altered_network_configuration_op = PeriodicOperation(
            "log_altered_network_configuration",
            self.log_altered_network_configuration,
            self.LOG_NETWORK_CONFIGURATION_PERIOD)
        self.protocol = None
        self.protocol_util = None
        self.health_service = None
        self.last_route_table_hash = b''
        self.last_nic_state = {}

        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(
            min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(
            min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)
Exemplo n.º 13
0
    def __init__(self):
        self.osutil = get_osutil()
        self.protocol_util = get_protocol_util()
        self.imds_client = get_imds_client()

        self.event_thread = None
        self.last_event_collection = None
        self.last_telemetry_heartbeat = None
        self.last_cgroup_telemetry = None
        self.last_host_plugin_heartbeat = None
        self.last_imds_heartbeat = None
        self.protocol = None
        self.health_service = None
        self.last_route_table_hash = b''

        self.counter = 0
        self.sysinfo = []
        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)
Exemplo n.º 14
0
class SendImdsHeartbeat(PeriodicOperation):
    """
    Periodic operation to report the IDMS's health. The signal is 'Healthy' when we have successfully called and validated
    a response in the last _IMDS_HEALTH_PERIOD.
    """
    def __init__(self, protocol_util, health_service):
        super(SendImdsHeartbeat, self).__init__(SendImdsHeartbeat._IMDS_HEARTBEAT_PERIOD)
        self.health_service = health_service
        self.imds_client = get_imds_client(protocol_util.get_wireserver_endpoint())
        self.imds_error_state = ErrorState(min_timedelta=SendImdsHeartbeat._IMDS_HEALTH_PERIOD)

    _IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    _IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3)

    def _operation(self):
        try:
            is_currently_healthy, response = self.imds_client.validate()

            if is_currently_healthy:
                self.imds_error_state.reset()
            else:
                self.imds_error_state.incr()

            is_healthy = self.imds_error_state.is_triggered() is False
            logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)

            self.health_service.report_imds_status(is_healthy, response)

        except Exception as e:
            msg = "Exception sending imds heartbeat: {0}".format(ustr(e))
            add_event(
                name=AGENT_NAME,
                version=CURRENT_VERSION,
                op=WALAEventOperation.ImdsHeartbeat,
                is_success=False,
                message=msg,
                log_event=False)
            raise
Exemplo n.º 15
0
    def test_should_report(self):
        host_plugin = self._init_host()
        error_state = ErrorState(min_timedelta=datetime.timedelta(minutes=5))
        period = datetime.timedelta(minutes=1)
        last_timestamp = None

        # first measurement at 0s, should report
        is_healthy = True
        actual = host_plugin.should_report(is_healthy,
                                           error_state,
                                           last_timestamp,
                                           period)
        self.assertEqual(True, actual)

        # second measurement at 30s, should not report
        last_timestamp = datetime.datetime.utcnow() - datetime.timedelta(seconds=30)
        actual = host_plugin.should_report(is_healthy,
                                           error_state,
                                           last_timestamp,
                                           period)
        self.assertEqual(False, actual)

        # third measurement at 60s, should report
        last_timestamp = datetime.datetime.utcnow() - datetime.timedelta(seconds=60)
        actual = host_plugin.should_report(is_healthy,
                                           error_state,
                                           last_timestamp,
                                           period)
        self.assertEqual(True, actual)

        # fourth measurement unhealthy, should report and increment counter
        is_healthy = False
        self.assertEqual(0, error_state.count)
        actual = host_plugin.should_report(is_healthy,
                                           error_state,
                                           last_timestamp,
                                           period)
        self.assertEqual(1, error_state.count)
        self.assertEqual(True, actual)

        # fifth measurement, should not report and reset counter
        is_healthy = True
        last_timestamp = datetime.datetime.utcnow() - datetime.timedelta(seconds=30)
        self.assertEqual(1, error_state.count)
        actual = host_plugin.should_report(is_healthy,
                                           error_state,
                                           last_timestamp,
                                           period)
        self.assertEqual(0, error_state.count)
        self.assertEqual(False, actual)
Exemplo n.º 16
0
    def test_errorstate03(self, mock_time):
        """
        ErrorState will not trigger until
         1. ErrorState has been incr() at least once.
         2. The timedelta from the first incr() has elapsed.
        """
        test_subject = ErrorState(timedelta(minutes=15))

        for x in range(1, 10): # pylint: disable=invalid-name
            mock_time.utcnow = Mock(return_value=datetime.utcnow() + timedelta(minutes=x))

            test_subject.incr()
            self.assertFalse(test_subject.is_triggered())

        mock_time.utcnow = Mock(return_value=datetime.utcnow() + timedelta(minutes=30))
        test_subject.incr()
        self.assertTrue(test_subject.is_triggered())
        self.assertEqual('29.0 min', test_subject.fail_time)
Exemplo n.º 17
0
    def test_errorstate04(self):
        """
        If ErrorState is reset the timestamp of the last incr() is reset to
        None.
        """

        test_subject = ErrorState(timedelta(minutes=15))
        self.assertTrue(test_subject.timestamp is None)

        test_subject.incr()
        self.assertTrue(test_subject.timestamp is not None)

        test_subject.reset()
        self.assertTrue(test_subject.timestamp is None)
class ExtHandlersHandler(object):
    def __init__(self):
        self.protocol_util = get_protocol_util()
        self.protocol = None
        self.ext_handlers = None
        self.last_etag = None
        self.log_report = False
        self.log_etag = True
        self.log_process = False

        self.report_status_error_state = ErrorState()
        self.get_artifact_error_state = ErrorState(
            min_timedelta=ERROR_STATE_DELTA_INSTALL)

    def run(self):
        self.ext_handlers, etag = None, None
        try:
            self.protocol = self.protocol_util.get_protocol()
            self.ext_handlers, etag = self.protocol.get_ext_handlers()
            self.get_artifact_error_state.reset()
        except Exception as e:
            msg = u"Exception retrieving extension handlers: {0}".format(
                ustr(e))
            detailed_msg = '{0} {1}'.format(msg, traceback.format_exc())

            self.get_artifact_error_state.incr()

            if self.get_artifact_error_state.is_triggered():
                add_event(AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.GetArtifactExtended,
                          is_success=False,
                          message="Failed to get extension artifact for over "
                          "{0): {1}".format(
                              self.get_artifact_error_state.min_timedelta,
                              msg))
                self.get_artifact_error_state.reset()
            else:
                logger.warn(msg)

            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=detailed_msg)
            return

        try:
            msg = u"Handle extensions updates for incarnation {0}".format(etag)
            logger.verbose(msg)
            # Log status report success on new config
            self.log_report = True
            self.handle_ext_handlers(etag)
            self.last_etag = etag

            self.report_ext_handlers_status()
            self.cleanup_outdated_handlers()
        except Exception as e:
            msg = u"Exception processing extension handlers: {0}".format(
                ustr(e))
            logger.warn(msg)
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=msg)
            return

    def cleanup_outdated_handlers(self):
        handlers = []
        pkgs = []

        # Build a collection of uninstalled handlers and orphaned packages
        # Note:
        # -- An orphaned package is one without a corresponding handler
        #    directory
        for item in os.listdir(conf.get_lib_dir()):
            path = os.path.join(conf.get_lib_dir(), item)

            if version.is_agent_package(path) or version.is_agent_path(path):
                continue

            if os.path.isdir(path):
                if re.match(HANDLER_NAME_PATTERN, item) is None:
                    continue
                try:
                    eh = ExtHandler()

                    separator = item.rfind('-')

                    eh.name = item[0:separator]
                    eh.properties.version = str(
                        FlexibleVersion(item[separator + 1:]))

                    handler = ExtHandlerInstance(eh, self.protocol)
                except Exception:
                    continue
                if handler.get_handler_state() != ExtHandlerState.NotInstalled:
                    continue
                handlers.append(handler)

            elif os.path.isfile(path) and \
                    not os.path.isdir(path[0:-len(HANDLER_PKG_EXT)]):
                if not re.match(HANDLER_PKG_PATTERN, item):
                    continue
                pkgs.append(path)

        # Then, remove the orphaned packages
        for pkg in pkgs:
            try:
                os.remove(pkg)
                logger.verbose(
                    "Removed orphaned extension package {0}".format(pkg))
            except OSError as e:
                logger.warn(
                    "Failed to remove orphaned package {0}: {1}".format(
                        pkg, e.strerror))

        # Finally, remove the directories and packages of the
        # uninstalled handlers
        for handler in handlers:
            handler.rm_ext_handler_dir()
            pkg = os.path.join(conf.get_lib_dir(),
                               handler.get_full_name() + HANDLER_PKG_EXT)
            if os.path.isfile(pkg):
                try:
                    os.remove(pkg)
                    logger.verbose("Removed extension package {0}".format(pkg))
                except OSError as e:
                    logger.warn(
                        "Failed to remove extension package {0}: {1}".format(
                            pkg, e.strerror))

    def handle_ext_handlers(self, etag=None):
        if not conf.get_extensions_enabled():
            logger.verbose("Extension handling is disabled")
            return

        if self.ext_handlers.extHandlers is None or \
                len(self.ext_handlers.extHandlers) == 0:
            logger.verbose("No extension handler config found")
            return

        if conf.get_enable_overprovisioning():
            artifacts_profile = self.protocol.get_artifacts_profile()
            if artifacts_profile and artifacts_profile.is_on_hold():
                logger.info("Extension handling is on hold")
                return

        self.ext_handlers.extHandlers.sort(
            key=operator.methodcaller('sort_key'))
        for ext_handler in self.ext_handlers.extHandlers:
            # TODO: handle install in sequence, enable in parallel
            self.handle_ext_handler(ext_handler, etag)

    def handle_ext_handler(self, ext_handler, etag):
        ext_handler_i = ExtHandlerInstance(ext_handler, self.protocol)

        try:
            state = ext_handler.properties.state
            if ext_handler_i.decide_version(target_state=state) is None:
                version = ext_handler_i.ext_handler.properties.version
                name = ext_handler_i.ext_handler.name
                err_msg = "Unable to find version {0} in manifest for extension {1}".format(
                    version, name)
                ext_handler_i.set_operation(WALAEventOperation.Download)
                ext_handler_i.set_handler_status(message=ustr(err_msg),
                                                 code=-1)
                ext_handler_i.report_event(message=ustr(err_msg),
                                           is_success=False)
                return

            self.get_artifact_error_state.reset()
            if not ext_handler_i.is_upgrade and self.last_etag == etag:
                if self.log_etag:
                    ext_handler_i.logger.verbose(
                        "Version {0} is current for etag {1}",
                        ext_handler_i.pkg.version, etag)
                    self.log_etag = False
                return

            self.log_etag = True

            ext_handler_i.logger.info("Target handler state: {0}", state)
            if state == u"enabled":
                self.handle_enable(ext_handler_i)
            elif state == u"disabled":
                self.handle_disable(ext_handler_i)
            elif state == u"uninstall":
                self.handle_uninstall(ext_handler_i)
            else:
                message = u"Unknown ext handler state:{0}".format(state)
                raise ExtensionError(message)
        except ExtensionError as e:
            self.handle_handle_ext_handler_error(ext_handler_i, e, e.code)
        except Exception as e:
            self.handle_handle_ext_handler_error(ext_handler_i, e)

    def handle_handle_ext_handler_error(self, ext_handler_i, e, code=-1):
        msg = ustr(e)
        ext_handler_i.set_handler_status(message=msg, code=code)

        self.get_artifact_error_state.incr()
        if self.get_artifact_error_state.is_triggered():
            report_event(op=WALAEventOperation.GetArtifactExtended,
                         message="Failed to get artifact for over "
                         "{0}: {1}".format(
                             self.get_artifact_error_state.min_timedelta, msg),
                         is_success=False)
            self.get_artifact_error_state.reset()
        else:
            ext_handler_i.logger.warn(msg)

    def handle_enable(self, ext_handler_i):
        self.log_process = True
        old_ext_handler_i = ext_handler_i.get_installed_ext_handler()

        handler_state = ext_handler_i.get_handler_state()
        ext_handler_i.logger.info("[Enable] current handler state is: {0}",
                                  handler_state.lower())
        if handler_state == ExtHandlerState.NotInstalled:
            ext_handler_i.set_handler_state(ExtHandlerState.NotInstalled)
            ext_handler_i.download()
            ext_handler_i.update_settings()
            if old_ext_handler_i is None:
                ext_handler_i.install()
            elif ext_handler_i.version_ne(old_ext_handler_i):
                old_ext_handler_i.disable()
                ext_handler_i.copy_status_files(old_ext_handler_i)
                if ext_handler_i.version_gt(old_ext_handler_i):
                    ext_handler_i.update()
                else:
                    old_ext_handler_i.update(
                        version=ext_handler_i.ext_handler.properties.version)
                old_ext_handler_i.uninstall()
                old_ext_handler_i.rm_ext_handler_dir()
                ext_handler_i.update_with_install()
        else:
            ext_handler_i.update_settings()

        ext_handler_i.enable()

    def handle_disable(self, ext_handler_i):
        self.log_process = True
        handler_state = ext_handler_i.get_handler_state()
        ext_handler_i.logger.info("[Disable] current handler state is: {0}",
                                  handler_state.lower())
        if handler_state == ExtHandlerState.Enabled:
            ext_handler_i.disable()

    def handle_uninstall(self, ext_handler_i):
        self.log_process = True
        handler_state = ext_handler_i.get_handler_state()
        ext_handler_i.logger.info("[Uninstall] current handler state is: {0}",
                                  handler_state.lower())
        if handler_state != ExtHandlerState.NotInstalled:
            if handler_state == ExtHandlerState.Enabled:
                ext_handler_i.disable()
            ext_handler_i.uninstall()
        ext_handler_i.rm_ext_handler_dir()

    def report_ext_handlers_status(self):
        """
        Go through handler_state dir, collect and report status
        """
        vm_status = VMStatus(status="Ready", message="Guest Agent is running")
        if self.ext_handlers is not None:
            for ext_handler in self.ext_handlers.extHandlers:
                try:
                    self.report_ext_handler_status(vm_status, ext_handler)
                except ExtensionError as e:
                    add_event(AGENT_NAME,
                              version=CURRENT_VERSION,
                              op=WALAEventOperation.ExtensionProcessing,
                              is_success=False,
                              message=ustr(e))

        logger.verbose("Report vm agent status")
        try:
            self.protocol.report_vm_status(vm_status)
            if self.log_report:
                logger.verbose("Completed vm agent status report")
            self.report_status_error_state.reset()
        except ProtocolNotFoundError as e:
            self.report_status_error_state.incr()
            message = "Failed to report vm agent status: {0}".format(e)
            logger.verbose(message)
        except ProtocolError as e:
            self.report_status_error_state.incr()
            message = "Failed to report vm agent status: {0}".format(e)
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=message)

        if self.report_status_error_state.is_triggered():
            message = "Failed to report vm agent status for more than {0}"\
                .format(self.report_status_error_state.min_timedelta)

            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ReportStatusExtended,
                      is_success=False,
                      message=message)

            self.report_status_error_state.reset()

    def report_ext_handler_status(self, vm_status, ext_handler):
        ext_handler_i = ExtHandlerInstance(ext_handler, self.protocol)

        handler_status = ext_handler_i.get_handler_status()
        if handler_status is None:
            return

        handler_state = ext_handler_i.get_handler_state()
        if handler_state != ExtHandlerState.NotInstalled:
            try:
                active_exts = ext_handler_i.report_ext_status()
                handler_status.extensions.extend(active_exts)
            except ExtensionError as e:
                ext_handler_i.set_handler_status(message=ustr(e), code=e.code)

            try:
                heartbeat = ext_handler_i.collect_heartbeat()
                if heartbeat is not None:
                    handler_status.status = heartbeat.get('status')
            except ExtensionError as e:
                ext_handler_i.set_handler_status(message=ustr(e), code=e.code)

        vm_status.vmAgent.extensionHandlers.append(handler_status)
Exemplo n.º 19
0
class MonitorHandler(object):

    EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1)
    TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30)
    HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)

    def __init__(self):
        self.osutil = get_osutil()
        self.protocol_util = get_protocol_util()
        self.imds_client = get_imds_client()

        self.event_thread = None
        self.last_event_collection = None
        self.last_telemetry_heartbeat = None
        self.last_host_plugin_heartbeat = None
        self.protocol = None
        self.health_service = None

        self.counter = 0
        self.sysinfo = []
        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)

    def run(self):
        self.init_protocols()
        self.init_sysinfo()
        self.start()

    def stop(self):
        self.should_run = False
        if self.is_alive():
            self.event_thread.join()

    def init_protocols(self):
        self.protocol = self.protocol_util.get_protocol()
        self.health_service = HealthService(self.protocol.endpoint)

    def is_alive(self):
        return self.event_thread is not None and self.event_thread.is_alive()

    def start(self):
        self.event_thread = threading.Thread(target=self.daemon)
        self.event_thread.setDaemon(True)
        self.event_thread.start()

    def init_sysinfo(self):
        osversion = "{0}:{1}-{2}-{3}:{4}".format(platform.system(),
                                                 DISTRO_NAME,
                                                 DISTRO_VERSION,
                                                 DISTRO_CODE_NAME,
                                                 platform.release())
        self.sysinfo.append(TelemetryEventParam("OSVersion", osversion))
        self.sysinfo.append(
            TelemetryEventParam("GAVersion", CURRENT_AGENT))

        try:
            ram = self.osutil.get_total_mem()
            processors = self.osutil.get_processor_cores()
            self.sysinfo.append(TelemetryEventParam("RAM", ram))
            self.sysinfo.append(TelemetryEventParam("Processors", processors))
        except OSUtilError as e:
            logger.warn("Failed to get system info: {0}", e)

        try:
            vminfo = self.protocol.get_vminfo()
            self.sysinfo.append(TelemetryEventParam("VMName",
                                                    vminfo.vmName))
            self.sysinfo.append(TelemetryEventParam("TenantName",
                                                    vminfo.tenantName))
            self.sysinfo.append(TelemetryEventParam("RoleName",
                                                    vminfo.roleName))
            self.sysinfo.append(TelemetryEventParam("RoleInstanceName",
                                                    vminfo.roleInstanceName))
            self.sysinfo.append(TelemetryEventParam("ContainerId",
                                                    vminfo.containerId))
        except ProtocolError as e:
            logger.warn("Failed to get system info: {0}", e)

        try:
            vminfo = self.imds_client.get_compute()
            self.sysinfo.append(TelemetryEventParam('Location',
                                                    vminfo.location))
            self.sysinfo.append(TelemetryEventParam('SubscriptionId',
                                                    vminfo.subscriptionId))
            self.sysinfo.append(TelemetryEventParam('ResourceGroupName',
                                                    vminfo.resourceGroupName))
            self.sysinfo.append(TelemetryEventParam('VMId',
                                                    vminfo.vmId))
            self.sysinfo.append(TelemetryEventParam('ImageOrigin',
                                                    vminfo.image_origin))
        except (HttpError, ValueError) as e:
            logger.warn("failed to get IMDS info: {0}", e)

    def collect_event(self, evt_file_name):
        try:
            logger.verbose("Found event file: {0}", evt_file_name)
            with open(evt_file_name, "rb") as evt_file:
                # if fail to open or delete the file, throw exception
                data_str = evt_file.read().decode("utf-8", 'ignore')
            logger.verbose("Processed event file: {0}", evt_file_name)
            os.remove(evt_file_name)
            return data_str
        except IOError as e:
            msg = "Failed to process {0}, {1}".format(evt_file_name, e)
            raise EventError(msg)

    def collect_and_send_events(self):
        if self.last_event_collection is None:
            self.last_event_collection = datetime.datetime.utcnow() - MonitorHandler.EVENT_COLLECTION_PERIOD

        if datetime.datetime.utcnow() >= (self.last_event_collection + MonitorHandler.EVENT_COLLECTION_PERIOD):
            try:
                event_list = TelemetryEventList()
                event_dir = os.path.join(conf.get_lib_dir(), "events")
                event_files = os.listdir(event_dir)
                for event_file in event_files:
                    if not event_file.endswith(".tld"):
                        continue
                    event_file_path = os.path.join(event_dir, event_file)
                    try:
                        data_str = self.collect_event(event_file_path)
                    except EventError as e:
                        logger.error("{0}", e)
                        continue

                    try:
                        event = parse_event(data_str)
                        self.add_sysinfo(event)
                        event_list.events.append(event)
                    except (ValueError, ProtocolError) as e:
                        logger.warn("Failed to decode event file: {0}", e)
                        continue

                if len(event_list.events) == 0:
                    return

                try:
                    self.protocol.report_event(event_list)
                except ProtocolError as e:
                    logger.error("{0}", e)
            except Exception as e:
                logger.warn("Failed to send events: {0}", e)

            self.last_event_collection = datetime.datetime.utcnow()

    def daemon(self):
        min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD,
                        MonitorHandler.EVENT_COLLECTION_PERIOD,
                        MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD).seconds
        while self.should_run:
            self.send_telemetry_heartbeat()
            self.collect_and_send_events()
            self.send_host_plugin_heartbeat()
            time.sleep(min_delta)

    def add_sysinfo(self, event):
        sysinfo_names = [v.name for v in self.sysinfo]
        for param in event.parameters:
            if param.name in sysinfo_names:
                logger.verbose("Remove existing event parameter: [{0}:{1}]",
                               param.name,
                               param.value)
                event.parameters.remove(param)
        event.parameters.extend(self.sysinfo)

    def send_host_plugin_heartbeat(self):
        """
        Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to
        communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD.
        """
        if self.last_host_plugin_heartbeat is None:
            self.last_host_plugin_heartbeat = datetime.datetime.utcnow() - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (self.last_host_plugin_heartbeat + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD):
            try:
                host_plugin = self.protocol.client.get_host_plugin()
                host_plugin.ensure_initialized()
                is_currently_healthy = host_plugin.get_health()

                if is_currently_healthy:
                    self.host_plugin_errorstate.reset()
                else:
                    self.host_plugin_errorstate.incr()

                is_healthy = self.host_plugin_errorstate.is_triggered() is False
                logger.verbose("HostGAPlugin health: {0}", is_healthy)

                self.health_service.report_host_plugin_heartbeat(is_healthy)

            except Exception as e:
                msg = "Exception sending host plugin heartbeat: {0}".format(ustr(e))
                add_event(
                    name=AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.HostPluginHeartbeat,
                    is_success=False,
                    message=msg,
                    log_event=False)

            self.last_host_plugin_heartbeat = datetime.datetime.utcnow()

    def send_telemetry_heartbeat(self):

        if self.last_telemetry_heartbeat is None:
            self.last_telemetry_heartbeat = datetime.datetime.utcnow() - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (self.last_telemetry_heartbeat + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD):
            try:
                incarnation = self.protocol.get_incarnation()
                dropped_packets = self.osutil.get_firewall_dropped_packets(self.protocol.endpoint)
                msg = "{0};{1};{2};{3}".format(incarnation, self.counter, self.heartbeat_id, dropped_packets)

                add_event(
                    name=AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.HeartBeat,
                    is_success=True,
                    message=msg,
                    log_event=False)

                self.counter += 1

                io_errors = IOErrorCounter.get_and_reset()
                hostplugin_errors = io_errors.get("hostplugin")
                protocol_errors = io_errors.get("protocol")
                other_errors = io_errors.get("other")

                if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0:
                    msg = "hostplugin:{0};protocol:{1};other:{2}".format(hostplugin_errors,
                                                                         protocol_errors,
                                                                         other_errors)
                    add_event(
                        name=AGENT_NAME,
                        version=CURRENT_VERSION,
                        op=WALAEventOperation.HttpErrors,
                        is_success=True,
                        message=msg,
                        log_event=False)
            except Exception as e:
                logger.warn("Failed to send heartbeat: {0}", e)

            self.last_telemetry_heartbeat = datetime.datetime.utcnow()
Exemplo n.º 20
0
class MonitorHandler(object):
    EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1)
    TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30)
    CGROUP_TELEMETRY_PERIOD = datetime.timedelta(minutes=5)
    # host plugin
    HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)
    # imds
    IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3)

    def __init__(self):
        self.osutil = get_osutil()
        self.protocol_util = get_protocol_util()
        self.imds_client = get_imds_client()

        self.event_thread = None
        self.last_event_collection = None
        self.last_telemetry_heartbeat = None
        self.last_cgroup_telemetry = None
        self.last_host_plugin_heartbeat = None
        self.last_imds_heartbeat = None
        self.protocol = None
        self.health_service = None
        self.last_route_table_hash = b''
        self.last_nic_state = {}

        self.counter = 0
        self.sysinfo = []
        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)

    def run(self):
        self.init_protocols()
        self.init_sysinfo()
        self.init_cgroups()
        self.start()

    def stop(self):
        self.should_run = False
        if self.is_alive():
            self.event_thread.join()

    def init_protocols(self):
        self.protocol = self.protocol_util.get_protocol()
        self.health_service = HealthService(self.protocol.endpoint)

    def is_alive(self):
        return self.event_thread is not None and self.event_thread.is_alive()

    def start(self):
        self.event_thread = threading.Thread(target=self.daemon)
        self.event_thread.setDaemon(True)
        self.event_thread.start()

    def init_sysinfo(self):
        osversion = "{0}:{1}-{2}-{3}:{4}".format(platform.system(),
                                                 DISTRO_NAME,
                                                 DISTRO_VERSION,
                                                 DISTRO_CODE_NAME,
                                                 platform.release())
        self.sysinfo.append(TelemetryEventParam("OSVersion", osversion))
        self.sysinfo.append(
            TelemetryEventParam("GAVersion", CURRENT_AGENT))

        try:
            ram = self.osutil.get_total_mem()
            processors = self.osutil.get_processor_cores()
            self.sysinfo.append(TelemetryEventParam("RAM", ram))
            self.sysinfo.append(TelemetryEventParam("Processors", processors))
        except OSUtilError as e:
            logger.warn("Failed to get system info: {0}", e)

        try:
            vminfo = self.protocol.get_vminfo()
            self.sysinfo.append(TelemetryEventParam("VMName",
                                                    vminfo.vmName))
            self.sysinfo.append(TelemetryEventParam("TenantName",
                                                    vminfo.tenantName))
            self.sysinfo.append(TelemetryEventParam("RoleName",
                                                    vminfo.roleName))
            self.sysinfo.append(TelemetryEventParam("RoleInstanceName",
                                                    vminfo.roleInstanceName))
            self.sysinfo.append(TelemetryEventParam("ContainerId",
                                                    vminfo.containerId))
        except ProtocolError as e:
            logger.warn("Failed to get system info: {0}", e)

        try:
            vminfo = self.imds_client.get_compute()
            self.sysinfo.append(TelemetryEventParam('Location',
                                                    vminfo.location))
            self.sysinfo.append(TelemetryEventParam('SubscriptionId',
                                                    vminfo.subscriptionId))
            self.sysinfo.append(TelemetryEventParam('ResourceGroupName',
                                                    vminfo.resourceGroupName))
            self.sysinfo.append(TelemetryEventParam('VMId',
                                                    vminfo.vmId))
            self.sysinfo.append(TelemetryEventParam('ImageOrigin',
                                                    vminfo.image_origin))
        except (HttpError, ValueError) as e:
            logger.warn("failed to get IMDS info: {0}", e)

    def collect_event(self, evt_file_name):
        try:
            logger.verbose("Found event file: {0}", evt_file_name)
            with open(evt_file_name, "rb") as evt_file:
                # if fail to open or delete the file, throw exception
                data_str = evt_file.read().decode("utf-8", 'ignore')
            logger.verbose("Processed event file: {0}", evt_file_name)
            os.remove(evt_file_name)
            return data_str
        except IOError as e:
            msg = "Failed to process {0}, {1}".format(evt_file_name, e)
            raise EventError(msg)

    def collect_and_send_events(self):
        if self.last_event_collection is None:
            self.last_event_collection = datetime.datetime.utcnow() - MonitorHandler.EVENT_COLLECTION_PERIOD

        if datetime.datetime.utcnow() >= (self.last_event_collection + MonitorHandler.EVENT_COLLECTION_PERIOD):
            try:
                event_list = TelemetryEventList()
                event_dir = os.path.join(conf.get_lib_dir(), "events")
                event_files = os.listdir(event_dir)
                for event_file in event_files:
                    if not event_file.endswith(".tld"):
                        continue
                    event_file_path = os.path.join(event_dir, event_file)
                    try:
                        data_str = self.collect_event(event_file_path)
                    except EventError as e:
                        logger.error("{0}", e)
                        continue

                    try:
                        event = parse_event(data_str)
                        self.add_sysinfo(event)
                        event_list.events.append(event)
                    except (ValueError, ProtocolError) as e:
                        logger.warn("Failed to decode event file: {0}", e)
                        continue

                if len(event_list.events) == 0:
                    return

                try:
                    self.protocol.report_event(event_list)
                except ProtocolError as e:
                    logger.error("{0}", e)
            except Exception as e:
                logger.warn("Failed to send events: {0}", e)

            self.last_event_collection = datetime.datetime.utcnow()

    def daemon(self):
        min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD,
                        MonitorHandler.CGROUP_TELEMETRY_PERIOD,
                        MonitorHandler.EVENT_COLLECTION_PERIOD,
                        MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD,
                        MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds
        while self.should_run:
            self.send_telemetry_heartbeat()
            self.send_cgroup_telemetry()
            self.collect_and_send_events()
            self.send_host_plugin_heartbeat()
            self.send_imds_heartbeat()
            self.log_altered_network_configuration()
            time.sleep(min_delta)

    def add_sysinfo(self, event):
        sysinfo_names = [v.name for v in self.sysinfo]
        for param in event.parameters:
            if param.name in sysinfo_names:
                logger.verbose("Remove existing event parameter: [{0}:{1}]",
                               param.name,
                               param.value)
                event.parameters.remove(param)
        event.parameters.extend(self.sysinfo)

    def send_imds_heartbeat(self):
        """
        Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have
        successfully called and validated a response in the last IMDS_HEALTH_PERIOD.
        """

        if self.last_imds_heartbeat is None:
            self.last_imds_heartbeat = datetime.datetime.utcnow() - MonitorHandler.IMDS_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (self.last_imds_heartbeat + MonitorHandler.IMDS_HEARTBEAT_PERIOD):
            try:
                is_currently_healthy, response = self.imds_client.validate()

                if is_currently_healthy:
                    self.imds_errorstate.reset()
                else:
                    self.imds_errorstate.incr()

                is_healthy = self.imds_errorstate.is_triggered() is False
                logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)

                self.health_service.report_imds_status(is_healthy, response)

            except Exception as e:
                msg = "Exception sending imds heartbeat: {0}".format(ustr(e))
                add_event(
                    name=AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.ImdsHeartbeat,
                    is_success=False,
                    message=msg,
                    log_event=False)

            self.last_imds_heartbeat = datetime.datetime.utcnow()

    def send_host_plugin_heartbeat(self):
        """
        Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to
        communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD.
        """
        if self.last_host_plugin_heartbeat is None:
            self.last_host_plugin_heartbeat = datetime.datetime.utcnow() - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_host_plugin_heartbeat + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD):
            try:
                host_plugin = self.protocol.client.get_host_plugin()
                host_plugin.ensure_initialized()
                is_currently_healthy = host_plugin.get_health()

                if is_currently_healthy:
                    self.host_plugin_errorstate.reset()
                else:
                    self.host_plugin_errorstate.incr()

                is_healthy = self.host_plugin_errorstate.is_triggered() is False
                logger.verbose("HostGAPlugin health: {0}", is_healthy)

                self.health_service.report_host_plugin_heartbeat(is_healthy)

                if not is_healthy:
                    add_event(
                        name=AGENT_NAME,
                        version=CURRENT_VERSION,
                        op=WALAEventOperation.HostPluginHeartbeatExtended,
                        is_success=False,
                        message='{0} since successful heartbeat'.format(self.host_plugin_errorstate.fail_time),
                        log_event=False)

            except Exception as e:
                msg = "Exception sending host plugin heartbeat: {0}".format(ustr(e))
                add_event(
                    name=AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.HostPluginHeartbeat,
                    is_success=False,
                    message=msg,
                    log_event=False)

            self.last_host_plugin_heartbeat = datetime.datetime.utcnow()

    def send_telemetry_heartbeat(self):

        if self.last_telemetry_heartbeat is None:
            self.last_telemetry_heartbeat = datetime.datetime.utcnow() - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (self.last_telemetry_heartbeat + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD):
            try:
                incarnation = self.protocol.get_incarnation()
                dropped_packets = self.osutil.get_firewall_dropped_packets(self.protocol.endpoint)
                msg = "{0};{1};{2};{3}".format(incarnation, self.counter, self.heartbeat_id, dropped_packets)

                add_event(
                    name=AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.HeartBeat,
                    is_success=True,
                    message=msg,
                    log_event=False)

                self.counter += 1

                io_errors = IOErrorCounter.get_and_reset()
                hostplugin_errors = io_errors.get("hostplugin")
                protocol_errors = io_errors.get("protocol")
                other_errors = io_errors.get("other")

                if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0:
                    msg = "hostplugin:{0};protocol:{1};other:{2}".format(hostplugin_errors,
                                                                         protocol_errors,
                                                                         other_errors)
                    add_event(
                        name=AGENT_NAME,
                        version=CURRENT_VERSION,
                        op=WALAEventOperation.HttpErrors,
                        is_success=True,
                        message=msg,
                        log_event=False)
            except Exception as e:
                logger.warn("Failed to send heartbeat: {0}", e)

            self.last_telemetry_heartbeat = datetime.datetime.utcnow()

    @staticmethod
    def init_cgroups():
        # Track metrics for the roll-up cgroup and for the agent cgroup
        try:
            CGroupsTelemetry.track_cgroup(CGroups.for_extension(""))
            CGroupsTelemetry.track_agent()
        except Exception as e:
            # when a hierarchy is not mounted, we raise an exception
            # and we should therefore only issue a warning, since this
            # is not unexpected
            logger.warn("Monitor: cgroups not initialized: {0}", ustr(e))
            logger.verbose(traceback.format_exc())

    def send_cgroup_telemetry(self):
        if self.last_cgroup_telemetry is None:
            self.last_cgroup_telemetry = datetime.datetime.utcnow()

        if datetime.datetime.utcnow() >= (self.last_telemetry_heartbeat + MonitorHandler.CGROUP_TELEMETRY_PERIOD):
            try:
                metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked()
                for cgroup_name, metrics in metric_reported.items():
                    thresholds = metric_threshold[cgroup_name]

                    for metric_group, metric_name, value in metrics:
                        if value > 0:
                            report_metric(metric_group, metric_name, cgroup_name, value)

                        if metric_group == "Memory":
                            if value >= thresholds["memory"]:
                                msg = "CGroup {0}: Crossed the Memory Threshold. Current Value:{1}, Threshold:{2}.".format(
                                    cgroup_name, value, thresholds["memory"])
                                add_event(name=AGENT_NAME,
                                          version=CURRENT_VERSION,
                                          op=WALAEventOperation.CGroupsLimitsCrossed,
                                          is_success=True,
                                          message=msg,
                                          log_event=True)

                        if metric_group == "Process":
                            if value >= thresholds["cpu"]:
                                msg = "CGroup {0}: Crossed the Processor Threshold. Current Value:{1}, Threshold:{2}.".format(
                                    cgroup_name, value, thresholds["cpu"])
                                add_event(name=AGENT_NAME,
                                          version=CURRENT_VERSION,
                                          op=WALAEventOperation.CGroupsLimitsCrossed,
                                          is_success=True,
                                          message=msg,
                                          log_event=True)

            except Exception as e:
                logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e))
                logger.verbose(traceback.format_exc())

            # Look for extension cgroups we're not already tracking and track them
            try:
                CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
            except Exception as e:
                logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e))
                logger.verbose(traceback.format_exc())

            self.last_cgroup_telemetry = datetime.datetime.utcnow()

    def log_altered_network_configuration(self):
        """
        Check various pieces of network configuration and, if altered since the last check, log the new state.
        """
        raw_route_list = self.osutil.read_route_table()
        digest = hash_strings(raw_route_list)
        if digest != self.last_route_table_hash:
            self.last_route_table_hash = digest
            route_list = self.osutil.get_list_of_routes(raw_route_list)
            logger.info("Route table: [{0}]".format(",".join(map(networkutil.RouteEntry.to_json, route_list))))

        nic_state = self.osutil.get_nic_state()
        if nic_state != self.last_nic_state:
            description = "Initial" if self.last_nic_state == {} else "Updated"
            logger.info("{0} NIC state: [{1}]".format(description, ", ".join(map(str, nic_state.values()))))
            self.last_nic_state = nic_state
Exemplo n.º 21
0
 def __init__(self, protocol_util, health_service):
     super(SendImdsHeartbeat, self).__init__(SendImdsHeartbeat._IMDS_HEARTBEAT_PERIOD)
     self.health_service = health_service
     self.imds_client = get_imds_client(protocol_util.get_wireserver_endpoint())
     self.imds_error_state = ErrorState(min_timedelta=SendImdsHeartbeat._IMDS_HEALTH_PERIOD)
Exemplo n.º 22
0
class HostPluginProtocol(object):
    _is_default_channel = False

    FETCH_REPORTING_PERIOD = datetime.timedelta(minutes=1)
    STATUS_REPORTING_PERIOD = datetime.timedelta(minutes=1)

    def __init__(self, endpoint, container_id, role_config_name):
        if endpoint is None:
            raise ProtocolError("HostGAPlugin: Endpoint not provided")
        self.is_initialized = False
        self.is_available = False
        self.api_versions = None
        self.endpoint = endpoint
        self.container_id = container_id
        self.deployment_id = None
        self.role_config_name = role_config_name
        self.manifest_uri = None
        self.health_service = HealthService(endpoint)
        self.fetch_error_state = ErrorState(
            min_timedelta=ERROR_STATE_HOST_PLUGIN_FAILURE)
        self.status_error_state = ErrorState(
            min_timedelta=ERROR_STATE_HOST_PLUGIN_FAILURE)
        self.fetch_last_timestamp = None
        self.status_last_timestamp = None

    @staticmethod
    def is_default_channel():
        return HostPluginProtocol._is_default_channel

    @staticmethod
    def set_default_channel(is_default):
        HostPluginProtocol._is_default_channel = is_default

    def ensure_initialized(self):
        if not self.is_initialized:
            self.api_versions = self.get_api_versions()
            self.is_available = API_VERSION in self.api_versions
            self.is_initialized = self.is_available
            from azurelinuxagent.common.event import WALAEventOperation, report_event
            report_event(WALAEventOperation.InitializeHostPlugin,
                         is_success=self.is_available)
        return self.is_available

    def get_health(self):
        """
        Call the /health endpoint
        :return: True if 200 received, False otherwise
        """
        url = URI_FORMAT_HEALTH.format(self.endpoint, HOST_PLUGIN_PORT)
        logger.verbose("HostGAPlugin: Getting health from [{0}]", url)
        status_ok = False
        try:
            response = restutil.http_get(url, max_retry=1)
            status_ok = restutil.request_succeeded(response)
        except HttpError as e:
            logger.verbose("HostGAPlugin: Exception getting health", ustr(e))
        return status_ok

    def get_api_versions(self):
        url = URI_FORMAT_GET_API_VERSIONS.format(self.endpoint,
                                                 HOST_PLUGIN_PORT)
        logger.verbose(
            "HostGAPlugin: Getting API versions at [{0}]".format(url))
        return_val = []
        error_response = ''
        is_healthy = False
        try:
            headers = {HEADER_CONTAINER_ID: self.container_id}
            response = restutil.http_get(url, headers)
            if restutil.request_failed(response):
                error_response = restutil.read_response_error(response)
                logger.error(
                    "HostGAPlugin: Failed Get API versions: {0}".format(
                        error_response))
            else:
                return_val = ustr(remove_bom(response.read()),
                                  encoding='utf-8')
                is_healthy = True
        except HttpError as e:
            logger.error(
                "HostGAPlugin: Exception Get API versions: {0}".format(e))

        self.health_service.report_host_plugin_versions(
            is_healthy=is_healthy, response=error_response)

        return return_val

    def get_artifact_request(self, artifact_url, artifact_manifest_url=None):
        if not self.ensure_initialized():
            raise ProtocolError(
                "HostGAPlugin: Host plugin channel is not available")

        if textutil.is_str_none_or_whitespace(artifact_url):
            raise ProtocolError(
                "HostGAPlugin: No extension artifact url was provided")

        url = URI_FORMAT_GET_EXTENSION_ARTIFACT.format(self.endpoint,
                                                       HOST_PLUGIN_PORT)
        headers = {
            HEADER_VERSION: API_VERSION,
            HEADER_CONTAINER_ID: self.container_id,
            HEADER_HOST_CONFIG_NAME: self.role_config_name,
            HEADER_ARTIFACT_LOCATION: artifact_url
        }

        if artifact_manifest_url is not None:
            headers[HEADER_ARTIFACT_MANIFEST_LOCATION] = artifact_manifest_url

        return url, headers

    def report_fetch_health(self,
                            uri,
                            is_healthy=True,
                            source='',
                            response=''):

        if uri != URI_FORMAT_GET_EXTENSION_ARTIFACT.format(
                self.endpoint, HOST_PLUGIN_PORT):
            return

        if self.should_report(is_healthy, self.fetch_error_state,
                              self.fetch_last_timestamp,
                              HostPluginProtocol.FETCH_REPORTING_PERIOD):
            self.fetch_last_timestamp = datetime.datetime.utcnow()
            health_signal = self.fetch_error_state.is_triggered() is False
            self.health_service.report_host_plugin_extension_artifact(
                is_healthy=health_signal, source=source, response=response)

    def report_status_health(self, is_healthy, response=''):
        if self.should_report(is_healthy, self.status_error_state,
                              self.status_last_timestamp,
                              HostPluginProtocol.STATUS_REPORTING_PERIOD):
            self.status_last_timestamp = datetime.datetime.utcnow()
            health_signal = self.status_error_state.is_triggered() is False
            self.health_service.report_host_plugin_status(
                is_healthy=health_signal, response=response)

    @staticmethod
    def should_report(is_healthy, error_state, last_timestamp, period):
        """
        Determine whether a health signal should be reported
        :param is_healthy: whether the current measurement is healthy
        :param error_state: the error state which is tracking time since failure
        :param last_timestamp: the last measurement time stamp
        :param period: the reporting period
        :return: True if the signal should be reported, False otherwise
        """

        if is_healthy:
            # we only reset the error state upon success, since we want to keep
            # reporting the failure; this is different to other uses of error states
            # which do not have a separate periodicity
            error_state.reset()
        else:
            error_state.incr()

        if last_timestamp is None:
            last_timestamp = datetime.datetime.utcnow() - period

        return datetime.datetime.utcnow() >= (last_timestamp + period)

    def put_vm_log(self, content):
        raise NotImplementedError("Unimplemented")

    def put_vm_status(self, status_blob, sas_url, config_blob_type=None):
        """
        Try to upload the VM status via the host plugin /status channel
        :param sas_url: the blob SAS url to pass to the host plugin
        :param config_blob_type: the blob type from the extension config
        :type status_blob: StatusBlob
        """
        if not self.ensure_initialized():
            raise ProtocolError("HostGAPlugin: HostGAPlugin is not available")

        if status_blob is None or status_blob.vm_status is None:
            raise ProtocolError("HostGAPlugin: Status blob was not provided")

        logger.verbose("HostGAPlugin: Posting VM status")
        blob_type = status_blob.type if status_blob.type else config_blob_type

        if blob_type == "BlockBlob":
            self._put_block_blob_status(sas_url, status_blob)
        else:
            self._put_page_blob_status(sas_url, status_blob)

    def _put_block_blob_status(self, sas_url, status_blob):
        url = URI_FORMAT_PUT_VM_STATUS.format(self.endpoint, HOST_PLUGIN_PORT)

        response = restutil.http_put(url,
                                     data=self._build_status_data(
                                         sas_url,
                                         status_blob.get_block_blob_headers(
                                             len(status_blob.data)),
                                         bytearray(status_blob.data,
                                                   encoding='utf-8')),
                                     headers=self._build_status_headers())

        if restutil.request_failed(response):
            error_response = restutil.read_response_error(response)
            is_healthy = not restutil.request_failed_at_hostplugin(response)
            self.report_status_health(is_healthy=is_healthy,
                                      response=error_response)
            raise HttpError("HostGAPlugin: Put BlockBlob failed: {0}".format(
                error_response))
        else:
            self.report_status_health(is_healthy=True)
            logger.verbose("HostGAPlugin: Put BlockBlob status succeeded")

    def _put_page_blob_status(self, sas_url, status_blob):
        url = URI_FORMAT_PUT_VM_STATUS.format(self.endpoint, HOST_PLUGIN_PORT)

        # Convert the status into a blank-padded string whose length is modulo 512
        status = bytearray(status_blob.data, encoding='utf-8')
        status_size = int((len(status) + 511) / 512) * 512
        status = bytearray(status_blob.data.ljust(status_size),
                           encoding='utf-8')

        # First, initialize an empty blob
        response = restutil.http_put(
            url,
            data=self._build_status_data(
                sas_url,
                status_blob.get_page_blob_create_headers(status_size)),
            headers=self._build_status_headers())

        if restutil.request_failed(response):
            error_response = restutil.read_response_error(response)
            is_healthy = not restutil.request_failed_at_hostplugin(response)
            self.report_status_health(is_healthy=is_healthy,
                                      response=error_response)
            raise HttpError(
                "HostGAPlugin: Failed PageBlob clean-up: {0}".format(
                    error_response))
        else:
            self.report_status_health(is_healthy=True)
            logger.verbose("HostGAPlugin: PageBlob clean-up succeeded")

        # Then, upload the blob in pages
        if sas_url.count("?") <= 0:
            sas_url = "{0}?comp=page".format(sas_url)
        else:
            sas_url = "{0}&comp=page".format(sas_url)

        start = 0
        end = 0
        while start < len(status):
            # Create the next page
            end = start + min(len(status) - start, MAXIMUM_PAGEBLOB_PAGE_SIZE)
            page_size = int((end - start + 511) / 512) * 512
            buf = bytearray(page_size)
            buf[0:end - start] = status[start:end]

            # Send the page
            response = restutil.http_put(
                url,
                data=self._build_status_data(
                    sas_url,
                    status_blob.get_page_blob_page_headers(start, end), buf),
                headers=self._build_status_headers())

            if restutil.request_failed(response):
                error_response = restutil.read_response_error(response)
                is_healthy = not restutil.request_failed_at_hostplugin(
                    response)
                self.report_status_health(is_healthy=is_healthy,
                                          response=error_response)
                raise HttpError("HostGAPlugin Error: Put PageBlob bytes "
                                "[{0},{1}]: {2}".format(
                                    start, end, error_response))

            # Advance to the next page (if any)
            start = end

    def _build_status_data(self, sas_url, blob_headers, content=None):
        headers = []
        for name in iter(blob_headers.keys()):
            headers.append({
                'headerName': name,
                'headerValue': blob_headers[name]
            })

        data = {'requestUri': sas_url, 'headers': headers}
        if not content is None:
            data['content'] = self._base64_encode(content)
        return json.dumps(data, sort_keys=True)

    def _build_status_headers(self):
        return {
            HEADER_VERSION: API_VERSION,
            "Content-type": "application/json",
            HEADER_CONTAINER_ID: self.container_id,
            HEADER_HOST_CONFIG_NAME: self.role_config_name
        }

    def _base64_encode(self, data):
        s = base64.b64encode(bytes(data))
        if PY_VERSION_MAJOR > 2:
            return s.decode('utf-8')
        return s
Exemplo n.º 23
0
class MonitorHandler(object):
    # telemetry
    EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1)
    # host plugin
    HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)
    # imds
    IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3)

    def __init__(self):
        self.osutil = get_osutil()
        self.imds_client = None

        self.event_thread = None
        self._periodic_operations = [
            ResetPeriodicLogMessagesOperation(),
            PeriodicOperation("collect_and_send_events",
                              self.collect_and_send_events,
                              self.EVENT_COLLECTION_PERIOD),
            ReportNetworkErrorsOperation(),
            PollResourceUsageOperation(),
            PeriodicOperation("send_host_plugin_heartbeat",
                              self.send_host_plugin_heartbeat,
                              self.HOST_PLUGIN_HEARTBEAT_PERIOD),
            PeriodicOperation("send_imds_heartbeat", self.send_imds_heartbeat,
                              self.IMDS_HEARTBEAT_PERIOD),
            ReportNetworkConfigurationChangesOperation(),
        ]
        self.protocol = None
        self.protocol_util = None
        self.health_service = None

        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(
            min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(
            min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)

    def run(self):
        self.start(init_data=True)

    def stop(self):
        self.should_run = False
        if self.is_alive():
            self.join()

    def join(self):
        self.event_thread.join()

    def stopped(self):
        return not self.should_run

    def init_protocols(self):
        # The initialization of ProtocolUtil for the Monitor thread should be done within the thread itself rather
        # than initializing it in the ExtHandler thread. This is done to avoid any concurrency issues as each
        # thread would now have its own ProtocolUtil object as per the SingletonPerThread model.
        self.protocol_util = get_protocol_util()
        self.protocol = self.protocol_util.get_protocol()
        self.health_service = HealthService(self.protocol.get_endpoint())

    def init_imds_client(self):
        wireserver_endpoint = self.protocol_util.get_wireserver_endpoint()
        self.imds_client = get_imds_client(wireserver_endpoint)

    def is_alive(self):
        return self.event_thread is not None and self.event_thread.is_alive()

    def start(self, init_data=False):
        self.event_thread = threading.Thread(target=self.daemon,
                                             args=(init_data, ))
        self.event_thread.setDaemon(True)
        self.event_thread.setName("MonitorHandler")
        self.event_thread.start()

    def daemon(self, init_data=False):
        try:
            if init_data:
                self.init_protocols()
                self.init_imds_client()

            while not self.stopped():
                try:
                    self.protocol.update_host_plugin_from_goal_state()

                    for op in self._periodic_operations:
                        op.run()

                except Exception as e:
                    logger.error(
                        "An error occurred in the monitor thread main loop; will skip the current iteration.\n{0}",
                        ustr(e))
                finally:
                    PeriodicOperation.sleep_until_next_operation(
                        self._periodic_operations)
        except Exception as e:
            logger.error(
                "An error occurred in the monitor thread; will exit the thread.\n{0}",
                ustr(e))

    def collect_and_send_events(self):
        """
        Periodically send any events located in the events folder
        """
        event_list = collect_events()

        if len(event_list.events) > 0:
            self.protocol.report_event(event_list)

    def send_imds_heartbeat(self):
        """
        Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have
        successfully called and validated a response in the last IMDS_HEALTH_PERIOD.
        """
        try:
            is_currently_healthy, response = self.imds_client.validate()

            if is_currently_healthy:
                self.imds_errorstate.reset()
            else:
                self.imds_errorstate.incr()

            is_healthy = self.imds_errorstate.is_triggered() is False
            logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)

            self.health_service.report_imds_status(is_healthy, response)

        except Exception as e:
            msg = "Exception sending imds heartbeat: {0}".format(ustr(e))
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ImdsHeartbeat,
                      is_success=False,
                      message=msg,
                      log_event=False)

    def send_host_plugin_heartbeat(self):
        """
        Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to
        communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD.
        """
        try:
            host_plugin = self.protocol.client.get_host_plugin()
            host_plugin.ensure_initialized()
            is_currently_healthy = host_plugin.get_health()

            if is_currently_healthy:
                self.host_plugin_errorstate.reset()
            else:
                self.host_plugin_errorstate.incr()

            is_healthy = self.host_plugin_errorstate.is_triggered() is False
            logger.verbose("HostGAPlugin health: {0}", is_healthy)

            self.health_service.report_host_plugin_heartbeat(is_healthy)

            if not is_healthy:
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HostPluginHeartbeatExtended,
                          is_success=False,
                          message='{0} since successful heartbeat'.format(
                              self.host_plugin_errorstate.fail_time),
                          log_event=False)

        except Exception as e:
            msg = "Exception sending host plugin heartbeat: {0}".format(
                ustr(e))
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.HostPluginHeartbeat,
                      is_success=False,
                      message=msg,
                      log_event=False)
Exemplo n.º 24
0
class ExtHandlersHandler(object):
    def __init__(self):
        self.protocol_util = get_protocol_util()
        self.protocol = None
        self.ext_handlers = None
        self.last_etag = None
        self.log_report = False
        self.log_etag = True
        self.log_process = False

        self.report_status_error_state = ErrorState()
        self.get_artifact_error_state = ErrorState(min_timedelta=ERROR_STATE_DELTA_INSTALL)

    def run(self):
        self.ext_handlers, etag = None, None
        try:
            self.protocol = self.protocol_util.get_protocol()
            self.ext_handlers, etag = self.protocol.get_ext_handlers()
            self.get_artifact_error_state.reset()
        except Exception as e:
            msg = u"Exception retrieving extension handlers: {0}".format(ustr(e))
            detailed_msg = '{0} {1}'.format(msg, traceback.format_exc())

            self.get_artifact_error_state.incr()

            if self.get_artifact_error_state.is_triggered():
                add_event(AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.GetArtifactExtended,
                          is_success=False,
                          message="Failed to get extension artifact for over "
                                  "{0): {1}".format(self.get_artifact_error_state.min_timedelta, msg))
                self.get_artifact_error_state.reset()
            else:
                logger.warn(msg)

            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=detailed_msg)
            return

        try:
            msg = u"Handle extensions updates for incarnation {0}".format(etag)
            logger.verbose(msg)
            # Log status report success on new config
            self.log_report = True
            self.handle_ext_handlers(etag)
            self.last_etag = etag

            self.report_ext_handlers_status()
            self.cleanup_outdated_handlers()
        except Exception as e:
            msg = u"Exception processing extension handlers: {0}".format(
                ustr(e))
            logger.warn(msg)
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=msg)
            return

    def cleanup_outdated_handlers(self):
        handlers = []
        pkgs = []

        # Build a collection of uninstalled handlers and orphaned packages
        # Note:
        # -- An orphaned package is one without a corresponding handler
        #    directory
        for item in os.listdir(conf.get_lib_dir()):
            path = os.path.join(conf.get_lib_dir(), item)

            if version.is_agent_package(path) or version.is_agent_path(path):
                continue

            if os.path.isdir(path):
                if re.match(HANDLER_NAME_PATTERN, item) is None:
                    continue
                try:
                    eh = ExtHandler()

                    separator = item.rfind('-')

                    eh.name = item[0:separator]
                    eh.properties.version = str(FlexibleVersion(item[separator+1:]))

                    handler = ExtHandlerInstance(eh, self.protocol)
                except Exception:
                    continue
                if handler.get_handler_state() != ExtHandlerState.NotInstalled:
                    continue
                handlers.append(handler)

            elif os.path.isfile(path) and \
                    not os.path.isdir(path[0:-len(HANDLER_PKG_EXT)]):
                if not re.match(HANDLER_PKG_PATTERN, item):
                    continue
                pkgs.append(path)

        # Then, remove the orphaned packages
        for pkg in pkgs:
            try:
                os.remove(pkg)
                logger.verbose("Removed orphaned extension package {0}".format(pkg))
            except OSError as e:
                logger.warn("Failed to remove orphaned package {0}: {1}".format(pkg, e.strerror))

        # Finally, remove the directories and packages of the
        # uninstalled handlers
        for handler in handlers:
            handler.rm_ext_handler_dir()
            pkg = os.path.join(conf.get_lib_dir(), handler.get_full_name() + HANDLER_PKG_EXT)
            if os.path.isfile(pkg):
                try:
                    os.remove(pkg)
                    logger.verbose("Removed extension package {0}".format(pkg))
                except OSError as e:
                    logger.warn("Failed to remove extension package {0}: {1}".format(pkg, e.strerror))
   
    def handle_ext_handlers(self, etag=None):
        if not conf.get_extensions_enabled():
            logger.verbose("Extension handling is disabled")
            return

        if self.ext_handlers.extHandlers is None or \
                len(self.ext_handlers.extHandlers) == 0:
            logger.verbose("No extension handler config found")
            return

        if conf.get_enable_overprovisioning():
            artifacts_profile = self.protocol.get_artifacts_profile()
            if artifacts_profile and artifacts_profile.is_on_hold():
                logger.info("Extension handling is on hold")
                return

        self.ext_handlers.extHandlers.sort(key=operator.methodcaller('sort_key'))
        for ext_handler in self.ext_handlers.extHandlers:
            # TODO: handle install in sequence, enable in parallel
            self.handle_ext_handler(ext_handler, etag)
    
    def handle_ext_handler(self, ext_handler, etag):
        ext_handler_i = ExtHandlerInstance(ext_handler, self.protocol)

        try:
            state = ext_handler.properties.state
            if ext_handler_i.decide_version(target_state=state) is None:
                version = ext_handler_i.ext_handler.properties.version
                name = ext_handler_i.ext_handler.name
                err_msg = "Unable to find version {0} in manifest for extension {1}".format(version, name)
                ext_handler_i.set_operation(WALAEventOperation.Download)
                ext_handler_i.set_handler_status(message=ustr(err_msg), code=-1)
                ext_handler_i.report_event(message=ustr(err_msg), is_success=False)
                return

            self.get_artifact_error_state.reset()
            if not ext_handler_i.is_upgrade and self.last_etag == etag:
                if self.log_etag:
                    ext_handler_i.logger.verbose("Version {0} is current for etag {1}",
                                                 ext_handler_i.pkg.version,
                                                 etag)
                    self.log_etag = False
                return

            self.log_etag = True

            ext_handler_i.logger.info("Target handler state: {0}", state)
            if state == u"enabled":
                self.handle_enable(ext_handler_i)
            elif state == u"disabled":
                self.handle_disable(ext_handler_i)
            elif state == u"uninstall":
                self.handle_uninstall(ext_handler_i)
            else:
                message = u"Unknown ext handler state:{0}".format(state)
                raise ExtensionError(message)
        except ExtensionError as e:
            self.handle_handle_ext_handler_error(ext_handler_i, e, e.code)
        except Exception as e:
            self.handle_handle_ext_handler_error(ext_handler_i, e)

    def handle_handle_ext_handler_error(self, ext_handler_i, e, code=-1):
        msg = ustr(e)
        ext_handler_i.set_handler_status(message=msg, code=code)

        self.get_artifact_error_state.incr()
        if self.get_artifact_error_state.is_triggered():
            report_event(op=WALAEventOperation.GetArtifactExtended,
                         message="Failed to get artifact for over "
                                 "{0}: {1}".format(self.get_artifact_error_state.min_timedelta, msg),
                         is_success=False)
            self.get_artifact_error_state.reset()
        else:
            ext_handler_i.logger.warn(msg)
    
    def handle_enable(self, ext_handler_i):
        self.log_process = True
        old_ext_handler_i = ext_handler_i.get_installed_ext_handler()

        handler_state = ext_handler_i.get_handler_state()
        ext_handler_i.logger.info("[Enable] current handler state is: {0}",
                                  handler_state.lower())
        if handler_state == ExtHandlerState.NotInstalled:
            ext_handler_i.set_handler_state(ExtHandlerState.NotInstalled)
            ext_handler_i.download()
            ext_handler_i.update_settings()
            if old_ext_handler_i is None:
                ext_handler_i.install()
            elif ext_handler_i.version_ne(old_ext_handler_i):
                old_ext_handler_i.disable()
                ext_handler_i.copy_status_files(old_ext_handler_i)
                if ext_handler_i.version_gt(old_ext_handler_i):
                    ext_handler_i.update()
                else:
                    old_ext_handler_i.update(version=ext_handler_i.ext_handler.properties.version)
                old_ext_handler_i.uninstall()
                old_ext_handler_i.rm_ext_handler_dir()
                ext_handler_i.update_with_install()
        else:
            ext_handler_i.update_settings()

        ext_handler_i.enable() 

    def handle_disable(self, ext_handler_i):
        self.log_process = True
        handler_state = ext_handler_i.get_handler_state()
        ext_handler_i.logger.info("[Disable] current handler state is: {0}",
                                  handler_state.lower())
        if handler_state == ExtHandlerState.Enabled:
            ext_handler_i.disable()

    def handle_uninstall(self, ext_handler_i):
        self.log_process = True
        handler_state = ext_handler_i.get_handler_state()
        ext_handler_i.logger.info("[Uninstall] current handler state is: {0}",
                                  handler_state.lower())
        if handler_state != ExtHandlerState.NotInstalled:
            if handler_state == ExtHandlerState.Enabled:
                ext_handler_i.disable()
            ext_handler_i.uninstall()
        ext_handler_i.rm_ext_handler_dir()

    def report_ext_handlers_status(self):
        """
        Go through handler_state dir, collect and report status
        """
        vm_status = VMStatus(status="Ready", message="Guest Agent is running")
        if self.ext_handlers is not None:
            for ext_handler in self.ext_handlers.extHandlers:
                try:
                    self.report_ext_handler_status(vm_status, ext_handler)
                except ExtensionError as e:
                    add_event(
                        AGENT_NAME,
                        version=CURRENT_VERSION,
                        op=WALAEventOperation.ExtensionProcessing,
                        is_success=False,
                        message=ustr(e))

        logger.verbose("Report vm agent status")
        try:
            self.protocol.report_vm_status(vm_status)
            if self.log_report:
                logger.verbose("Completed vm agent status report")
            self.report_status_error_state.reset()
        except ProtocolNotFoundError as e:
            self.report_status_error_state.incr()
            message = "Failed to report vm agent status: {0}".format(e)
            logger.verbose(message)
        except ProtocolError as e:
            self.report_status_error_state.incr()
            message = "Failed to report vm agent status: {0}".format(e)
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=message)

        if self.report_status_error_state.is_triggered():
            message = "Failed to report vm agent status for more than {0}"\
                .format(self.report_status_error_state.min_timedelta)

            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ReportStatusExtended,
                      is_success=False,
                      message=message)

            self.report_status_error_state.reset()

    def report_ext_handler_status(self, vm_status, ext_handler):
        ext_handler_i = ExtHandlerInstance(ext_handler, self.protocol)
        
        handler_status = ext_handler_i.get_handler_status() 
        if handler_status is None:
            return

        handler_state = ext_handler_i.get_handler_state()
        if handler_state != ExtHandlerState.NotInstalled:
            try:
                active_exts = ext_handler_i.report_ext_status()
                handler_status.extensions.extend(active_exts)
            except ExtensionError as e:
                ext_handler_i.set_handler_status(message=ustr(e), code=e.code)

            try:
                heartbeat = ext_handler_i.collect_heartbeat()
                if heartbeat is not None:
                    handler_status.status = heartbeat.get('status')
            except ExtensionError as e:
                ext_handler_i.set_handler_status(message=ustr(e), code=e.code)

        vm_status.vmAgent.extensionHandlers.append(handler_status)
class MonitorHandler(object):
    EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1)
    TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30)
    # extension metrics period
    CGROUP_TELEMETRY_POLLING_PERIOD = datetime.timedelta(minutes=5)
    CGROUP_TELEMETRY_REPORTING_PERIOD = datetime.timedelta(minutes=30)
    # host plugin
    HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)
    # imds
    IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3)

    # Resetting loggers period
    RESET_LOGGERS_PERIOD = datetime.timedelta(hours=12)

    def __init__(self):
        self.osutil = get_osutil()
        self.protocol_util = get_protocol_util()
        self.imds_client = get_imds_client()

        self.event_thread = None
        self.last_reset_loggers_time = None
        self.last_event_collection = None
        self.last_telemetry_heartbeat = None
        self.last_cgroup_polling_telemetry = None
        self.last_cgroup_report_telemetry = None
        self.last_host_plugin_heartbeat = None
        self.last_imds_heartbeat = None
        self.protocol = None
        self.health_service = None
        self.last_route_table_hash = b''
        self.last_nic_state = {}

        self.counter = 0
        self.sysinfo = []
        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(
            min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(
            min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)

    def run(self):
        self.init_protocols()
        self.init_sysinfo()
        self.start()

    def stop(self):
        self.should_run = False
        if self.is_alive():
            self.event_thread.join()

    def init_protocols(self):
        self.protocol = self.protocol_util.get_protocol()
        self.health_service = HealthService(self.protocol.endpoint)

    def is_alive(self):
        return self.event_thread is not None and self.event_thread.is_alive()

    def start(self):
        self.event_thread = threading.Thread(target=self.daemon)
        self.event_thread.setDaemon(True)
        self.event_thread.setName("MonitorHandler")
        self.event_thread.start()

    def init_sysinfo(self):
        osversion = "{0}:{1}-{2}-{3}:{4}".format(platform.system(),
                                                 DISTRO_NAME, DISTRO_VERSION,
                                                 DISTRO_CODE_NAME,
                                                 platform.release())
        self.sysinfo.append(TelemetryEventParam("OSVersion", osversion))
        self.sysinfo.append(
            TelemetryEventParam("ExecutionMode", AGENT_EXECUTION_MODE))

        try:
            ram = self.osutil.get_total_mem()
            processors = self.osutil.get_processor_cores()
            self.sysinfo.append(TelemetryEventParam("RAM", ram))
            self.sysinfo.append(TelemetryEventParam("Processors", processors))
        except OSUtilError as e:
            logger.warn("Failed to get system info: {0}", ustr(e))

        try:
            vminfo = self.protocol.get_vminfo()
            self.sysinfo.append(TelemetryEventParam("VMName", vminfo.vmName))
            self.sysinfo.append(
                TelemetryEventParam("TenantName", vminfo.tenantName))
            self.sysinfo.append(
                TelemetryEventParam("RoleName", vminfo.roleName))
            self.sysinfo.append(
                TelemetryEventParam("RoleInstanceName",
                                    vminfo.roleInstanceName))
        except ProtocolError as e:
            logger.warn("Failed to get system info: {0}", ustr(e))

        try:
            vminfo = self.imds_client.get_compute()
            self.sysinfo.append(
                TelemetryEventParam('Location', vminfo.location))
            self.sysinfo.append(
                TelemetryEventParam('SubscriptionId', vminfo.subscriptionId))
            self.sysinfo.append(
                TelemetryEventParam('ResourceGroupName',
                                    vminfo.resourceGroupName))
            self.sysinfo.append(TelemetryEventParam('VMId', vminfo.vmId))
            self.sysinfo.append(
                TelemetryEventParam('ImageOrigin', vminfo.image_origin))
        except (HttpError, ValueError) as e:
            logger.warn("failed to get IMDS info: {0}", ustr(e))

    @staticmethod
    def collect_event(evt_file_name):
        try:
            logger.verbose("Found event file: {0}", evt_file_name)
            with open(evt_file_name, "rb") as evt_file:
                # if fail to open or delete the file, throw exception
                data_str = evt_file.read().decode("utf-8")
            logger.verbose("Processed event file: {0}", evt_file_name)
            os.remove(evt_file_name)
            return data_str
        except (IOError, UnicodeDecodeError) as e:
            os.remove(evt_file_name)
            msg = "Failed to process {0}, {1}".format(evt_file_name, e)
            raise EventError(msg)

    def collect_and_send_events(self):
        """
        Periodically read, parse, and send events located in the events folder. Currently, this is done every minute.
        Any .tld file dropped in the events folder will be emitted. These event files can be created either by the
        agent or the extensions. We don't have control over extension's events parameters, but we will override
        any values they might have set for sys_info parameters.
        """
        if self.last_event_collection is None:
            self.last_event_collection = datetime.datetime.utcnow(
            ) - MonitorHandler.EVENT_COLLECTION_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_event_collection +
                MonitorHandler.EVENT_COLLECTION_PERIOD):
            try:
                event_list = TelemetryEventList()
                event_dir = os.path.join(conf.get_lib_dir(), "events")
                event_files = os.listdir(event_dir)
                for event_file in event_files:
                    if not event_file.endswith(".tld"):
                        continue
                    event_file_path = os.path.join(event_dir, event_file)
                    try:
                        data_str = self.collect_event(event_file_path)
                    except EventError as e:
                        logger.error("{0}", ustr(e))
                        continue

                    try:
                        event = parse_event(data_str)
                        self.add_sysinfo(event)
                        event_list.events.append(event)
                    except (ValueError, ProtocolError) as e:
                        logger.warn("Failed to decode event file: {0}",
                                    ustr(e))
                        continue

                if len(event_list.events) == 0:
                    return

                try:
                    self.protocol.report_event(event_list)
                except ProtocolError as e:
                    logger.error("{0}", ustr(e))
            except Exception as e:
                logger.warn("Failed to send events: {0}", ustr(e))

            self.last_event_collection = datetime.datetime.utcnow()

    def daemon(self):
        min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD,
                        MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD,
                        MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD,
                        MonitorHandler.EVENT_COLLECTION_PERIOD,
                        MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD,
                        MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds
        while self.should_run:
            self.send_telemetry_heartbeat()
            self.poll_telemetry_metrics()
            self.send_telemetry_metrics()
            self.collect_and_send_events()
            self.send_host_plugin_heartbeat()
            self.send_imds_heartbeat()
            self.log_altered_network_configuration()
            self.reset_loggers()
            time.sleep(min_delta)

    def reset_loggers(self):
        """
        The loggers maintain hash-tables in memory and they need to be cleaned up from time to time.
        For reference, please check azurelinuxagent.common.logger.Logger and
        azurelinuxagent.common.event.EventLogger classes
        """
        time_now = datetime.datetime.utcnow()
        if not self.last_reset_loggers_time:
            self.last_reset_loggers_time = time_now

        if time_now >= (self.last_reset_loggers_time +
                        MonitorHandler.RESET_LOGGERS_PERIOD):
            try:
                logger.reset_periodic()
            finally:
                self.last_reset_loggers_time = time_now

    def add_sysinfo(self, event):
        """
        This method is called after parsing the event file in the events folder and before emitting it. This means
        all events, either coming from the agent or from the extensions, are passed through this method. The purpose
        is to add a static list of sys_info parameters such as VMName, Region, RAM, etc. If the sys_info parameters
        are already populated in the event, they will be overwritten by the sys_info values obtained from the agent.
        Since the ContainerId parameter is only populated on the fly for the agent events because it is not a static
        sys_info parameter, an event coming from an extension will not have it, so we explicitly add it.
        :param event: Event to be enriched with sys_info parameters
        :return: Event with all parameters added, ready to be reported
        """
        sysinfo_names = [v.name for v in self.sysinfo]
        final_parameters = []

        # Refer: azurelinuxagent.common.event.EventLogger.add_default_parameters_to_event for agent specific values.
        #
        # Default fields are only populated by Agent and not the extension. Agent will fill up any event if they don't
        # have the default params. Example: GAVersion and ContainerId are populated for agent events on the fly,
        # but not for extension events. Add it if it's missing.
        default_values = [("ContainerId", get_container_id_from_env()),
                          ("GAVersion", CURRENT_AGENT), ("OpcodeName", ""),
                          ("EventTid", 0), ("EventPid", 0), ("TaskName", ""),
                          ("KeywordName", "")]

        for param in event.parameters:
            # Discard any sys_info parameters already in the event, since they will be overwritten
            if param.name in sysinfo_names:
                continue
            final_parameters.append(param)

        # Add sys_info params populated by the agent
        final_parameters.extend(self.sysinfo)

        for default_value in default_values:
            if default_value[0] not in event:
                final_parameters.append(
                    TelemetryEventParam(default_value[0], default_value[1]))

        event.parameters = final_parameters

    def send_imds_heartbeat(self):
        """
        Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have
        successfully called and validated a response in the last IMDS_HEALTH_PERIOD.
        """

        if self.last_imds_heartbeat is None:
            self.last_imds_heartbeat = datetime.datetime.utcnow(
            ) - MonitorHandler.IMDS_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_imds_heartbeat +
                MonitorHandler.IMDS_HEARTBEAT_PERIOD):
            try:
                is_currently_healthy, response = self.imds_client.validate()

                if is_currently_healthy:
                    self.imds_errorstate.reset()
                else:
                    self.imds_errorstate.incr()

                is_healthy = self.imds_errorstate.is_triggered() is False
                logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)

                self.health_service.report_imds_status(is_healthy, response)

            except Exception as e:
                msg = "Exception sending imds heartbeat: {0}".format(ustr(e))
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.ImdsHeartbeat,
                          is_success=False,
                          message=msg,
                          log_event=False)

            self.last_imds_heartbeat = datetime.datetime.utcnow()

    def send_host_plugin_heartbeat(self):
        """
        Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to
        communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD.
        """
        if self.last_host_plugin_heartbeat is None:
            self.last_host_plugin_heartbeat = datetime.datetime.utcnow(
            ) - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_host_plugin_heartbeat +
                MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD):
            try:
                host_plugin = self.protocol.client.get_host_plugin()
                host_plugin.ensure_initialized()
                is_currently_healthy = host_plugin.get_health()

                if is_currently_healthy:
                    self.host_plugin_errorstate.reset()
                else:
                    self.host_plugin_errorstate.incr()

                is_healthy = self.host_plugin_errorstate.is_triggered(
                ) is False
                logger.verbose("HostGAPlugin health: {0}", is_healthy)

                self.health_service.report_host_plugin_heartbeat(is_healthy)

                if not is_healthy:
                    add_event(
                        name=AGENT_NAME,
                        version=CURRENT_VERSION,
                        op=WALAEventOperation.HostPluginHeartbeatExtended,
                        is_success=False,
                        message='{0} since successful heartbeat'.format(
                            self.host_plugin_errorstate.fail_time),
                        log_event=False)

            except Exception as e:
                msg = "Exception sending host plugin heartbeat: {0}".format(
                    ustr(e))
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HostPluginHeartbeat,
                          is_success=False,
                          message=msg,
                          log_event=False)

            self.last_host_plugin_heartbeat = datetime.datetime.utcnow()

    def send_telemetry_heartbeat(self):

        if self.last_telemetry_heartbeat is None:
            self.last_telemetry_heartbeat = datetime.datetime.utcnow(
            ) - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_telemetry_heartbeat +
                MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD):
            try:
                incarnation = self.protocol.get_incarnation()
                dropped_packets = self.osutil.get_firewall_dropped_packets(
                    self.protocol.endpoint)
                msg = "{0};{1};{2};{3}".format(incarnation, self.counter,
                                               self.heartbeat_id,
                                               dropped_packets)

                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HeartBeat,
                          is_success=True,
                          message=msg,
                          log_event=False)

                self.counter += 1

                io_errors = IOErrorCounter.get_and_reset()
                hostplugin_errors = io_errors.get("hostplugin")
                protocol_errors = io_errors.get("protocol")
                other_errors = io_errors.get("other")

                if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0:
                    msg = "hostplugin:{0};protocol:{1};other:{2}".format(
                        hostplugin_errors, protocol_errors, other_errors)
                    add_event(name=AGENT_NAME,
                              version=CURRENT_VERSION,
                              op=WALAEventOperation.HttpErrors,
                              is_success=True,
                              message=msg,
                              log_event=False)
            except Exception as e:
                logger.warn("Failed to send heartbeat: {0}", ustr(e))

            self.last_telemetry_heartbeat = datetime.datetime.utcnow()

    def poll_telemetry_metrics(self):
        time_now = datetime.datetime.utcnow()
        if not self.last_cgroup_polling_telemetry:
            self.last_cgroup_polling_telemetry = time_now

        if time_now >= (self.last_cgroup_polling_telemetry +
                        MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD):
            CGroupsTelemetry.poll_all_tracked()
            self.last_cgroup_polling_telemetry = time_now

    def send_telemetry_metrics(self):
        time_now = datetime.datetime.utcnow()

        if not self.last_cgroup_report_telemetry:
            self.last_cgroup_report_telemetry = time_now

        if time_now >= (self.last_cgroup_report_telemetry +
                        MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD):
            performance_metrics = CGroupsTelemetry.report_all_tracked()
            self.last_cgroup_report_telemetry = time_now

            if performance_metrics:
                message = generate_extension_metrics_telemetry_dictionary(
                    schema_version=1.0,
                    performance_metrics=performance_metrics)
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.ExtensionMetricsData,
                          is_success=True,
                          message=ustr(message),
                          log_event=False)

    def log_altered_network_configuration(self):
        """
        Check various pieces of network configuration and, if altered since the last check, log the new state.
        """
        raw_route_list = self.osutil.read_route_table()
        digest = hash_strings(raw_route_list)
        if digest != self.last_route_table_hash:
            self.last_route_table_hash = digest
            route_list = self.osutil.get_list_of_routes(raw_route_list)
            logger.info("Route table: [{0}]".format(",".join(
                map(networkutil.RouteEntry.to_json, route_list))))

        nic_state = self.osutil.get_nic_state()
        if nic_state != self.last_nic_state:
            description = "Initial" if self.last_nic_state == {} else "Updated"
            logger.info("{0} NIC state: [{1}]".format(
                description, ", ".join(map(str, nic_state.values()))))
            self.last_nic_state = nic_state
Exemplo n.º 26
0
    def test_errorstate05(self):
        """
        Test the fail_time for various scenarios
        """

        test_subject = ErrorState(timedelta(minutes=15))
        self.assertEqual('unknown', test_subject.fail_time)

        test_subject.incr()
        self.assertEqual('0.0 min', test_subject.fail_time)

        test_subject.timestamp = datetime.utcnow() - timedelta(seconds=60)
        self.assertEqual('1.0 min', test_subject.fail_time)

        test_subject.timestamp = datetime.utcnow() - timedelta(seconds=73)
        self.assertEqual('1.22 min', test_subject.fail_time)

        test_subject.timestamp = datetime.utcnow() - timedelta(seconds=120)
        self.assertEqual('2.0 min', test_subject.fail_time)

        test_subject.timestamp = datetime.utcnow() - timedelta(seconds=60 * 59)
        self.assertEqual('59.0 min', test_subject.fail_time)

        test_subject.timestamp = datetime.utcnow() - timedelta(seconds=60 * 60)
        self.assertEqual('1.0 hr', test_subject.fail_time)

        test_subject.timestamp = datetime.utcnow() - timedelta(seconds=60 * 95)
        self.assertEqual('1.58 hr', test_subject.fail_time)

        test_subject.timestamp = datetime.utcnow() - timedelta(seconds=60 * 60 * 3)
        self.assertEqual('3.0 hr', test_subject.fail_time)
Exemplo n.º 27
0
class MonitorHandler(object):
    # telemetry
    EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1)
    # host health
    TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30)
    # cgroup data period
    CGROUP_TELEMETRY_POLLING_PERIOD = datetime.timedelta(minutes=5)
    CGROUP_TELEMETRY_REPORTING_PERIOD = datetime.timedelta(minutes=30)
    # host plugin
    HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)
    # imds
    IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3)
    # log network configuration
    LOG_NETWORK_CONFIGURATION_PERIOD = datetime.timedelta(minutes=1)
    # Resetting loggers period
    RESET_LOGGERS_PERIOD = datetime.timedelta(hours=12)

    def __init__(self):
        self.osutil = get_osutil()
        self.imds_client = None

        self.event_thread = None
        self._reset_loggers_op = PeriodicOperation("reset_loggers",
                                                   self.reset_loggers,
                                                   self.RESET_LOGGERS_PERIOD)
        self._collect_and_send_events_op = PeriodicOperation(
            "collect_and_send_events", self.collect_and_send_events,
            self.EVENT_COLLECTION_PERIOD)
        self._send_telemetry_heartbeat_op = PeriodicOperation(
            "send_telemetry_heartbeat", self.send_telemetry_heartbeat,
            self.TELEMETRY_HEARTBEAT_PERIOD)
        self._poll_telemetry_metrics_op = PeriodicOperation(
            "poll_telemetry_metrics usage", self.poll_telemetry_metrics,
            self.CGROUP_TELEMETRY_POLLING_PERIOD)
        self._send_telemetry_metrics_op = PeriodicOperation(
            "send_telemetry_metrics usage", self.send_telemetry_metrics,
            self.CGROUP_TELEMETRY_REPORTING_PERIOD)
        self._send_host_plugin_heartbeat_op = PeriodicOperation(
            "send_host_plugin_heartbeat", self.send_host_plugin_heartbeat,
            self.HOST_PLUGIN_HEARTBEAT_PERIOD)
        self._send_imds_heartbeat_op = PeriodicOperation(
            "send_imds_heartbeat", self.send_imds_heartbeat,
            self.IMDS_HEARTBEAT_PERIOD)
        self._log_altered_network_configuration_op = PeriodicOperation(
            "log_altered_network_configuration",
            self.log_altered_network_configuration,
            self.LOG_NETWORK_CONFIGURATION_PERIOD)
        self.protocol = None
        self.protocol_util = None
        self.health_service = None
        self.last_route_table_hash = b''
        self.last_nic_state = {}

        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(
            min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(
            min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)

    def run(self):
        self.start(init_data=True)

    def stop(self):
        self.should_run = False
        if self.is_alive():
            self.join()

    def join(self):
        self.event_thread.join()

    def stopped(self):
        return not self.should_run

    def init_protocols(self):
        # The initialization of ProtocolUtil for the Monitor thread should be done within the thread itself rather
        # than initializing it in the ExtHandler thread. This is done to avoid any concurrency issues as each
        # thread would now have its own ProtocolUtil object as per the SingletonPerThread model.
        self.protocol_util = get_protocol_util()
        self.protocol = self.protocol_util.get_protocol()
        self.health_service = HealthService(self.protocol.get_endpoint())

    def init_imds_client(self):
        wireserver_endpoint = self.protocol_util.get_wireserver_endpoint()
        self.imds_client = get_imds_client(wireserver_endpoint)

    def is_alive(self):
        return self.event_thread is not None and self.event_thread.is_alive()

    def start(self, init_data=False):
        self.event_thread = threading.Thread(target=self.daemon,
                                             args=(init_data, ))
        self.event_thread.setDaemon(True)
        self.event_thread.setName("MonitorHandler")
        self.event_thread.start()

    def collect_and_send_events(self):
        """
        Periodically send any events located in the events folder
        """
        event_list = collect_events()

        if len(event_list.events) > 0:
            self.protocol.report_event(event_list)

    def daemon(self, init_data=False):

        if init_data:
            self.init_protocols()
            self.init_imds_client()

        min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD,
                        MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD,
                        MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD,
                        MonitorHandler.EVENT_COLLECTION_PERIOD,
                        MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD,
                        MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds
        while not self.stopped():
            try:
                self.protocol.update_host_plugin_from_goal_state()
                self._send_telemetry_heartbeat_op.run()
                self._poll_telemetry_metrics_op.run()
                # This will be removed in favor of poll_telemetry_metrics() and it'll directly send the perf data for
                # each cgroup.
                self._send_telemetry_metrics_op.run()
                self._collect_and_send_events_op.run()
                self._send_host_plugin_heartbeat_op.run()
                self._send_imds_heartbeat_op.run()
                self._log_altered_network_configuration_op.run()
                self._reset_loggers_op.run()
            except Exception as e:
                logger.warn(
                    "An error occurred in the monitor thread main loop; will skip the current iteration.\n{0}",
                    ustr(e))
            time.sleep(min_delta)

    def reset_loggers(self):
        """
        The loggers maintain hash-tables in memory and they need to be cleaned up from time to time.
        For reference, please check azurelinuxagent.common.logger.Logger and
        azurelinuxagent.common.event.EventLogger classes
        """
        logger.reset_periodic()

    def send_imds_heartbeat(self):
        """
        Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have
        successfully called and validated a response in the last IMDS_HEALTH_PERIOD.
        """
        try:
            is_currently_healthy, response = self.imds_client.validate()

            if is_currently_healthy:
                self.imds_errorstate.reset()
            else:
                self.imds_errorstate.incr()

            is_healthy = self.imds_errorstate.is_triggered() is False
            logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)

            self.health_service.report_imds_status(is_healthy, response)

        except Exception as e:
            msg = "Exception sending imds heartbeat: {0}".format(ustr(e))
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ImdsHeartbeat,
                      is_success=False,
                      message=msg,
                      log_event=False)

    def send_host_plugin_heartbeat(self):
        """
        Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to
        communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD.
        """
        try:
            host_plugin = self.protocol.client.get_host_plugin()
            host_plugin.ensure_initialized()
            is_currently_healthy = host_plugin.get_health()

            if is_currently_healthy:
                self.host_plugin_errorstate.reset()
            else:
                self.host_plugin_errorstate.incr()

            is_healthy = self.host_plugin_errorstate.is_triggered() is False
            logger.verbose("HostGAPlugin health: {0}", is_healthy)

            self.health_service.report_host_plugin_heartbeat(is_healthy)

            if not is_healthy:
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HostPluginHeartbeatExtended,
                          is_success=False,
                          message='{0} since successful heartbeat'.format(
                              self.host_plugin_errorstate.fail_time),
                          log_event=False)

        except Exception as e:
            msg = "Exception sending host plugin heartbeat: {0}".format(
                ustr(e))
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.HostPluginHeartbeat,
                      is_success=False,
                      message=msg,
                      log_event=False)

    def send_telemetry_heartbeat(self):
        io_errors = IOErrorCounter.get_and_reset()
        hostplugin_errors = io_errors.get("hostplugin")
        protocol_errors = io_errors.get("protocol")
        other_errors = io_errors.get("other")

        if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0:
            msg = "hostplugin:{0};protocol:{1};other:{2}".format(
                hostplugin_errors, protocol_errors, other_errors)
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.HttpErrors,
                      is_success=True,
                      message=msg,
                      log_event=False)

    def poll_telemetry_metrics(self):
        """
        This method polls the tracked cgroups to get data from the cgroups filesystem and send the data directly.

        :return: List of Metrics (which would be sent to PerfCounterMetrics directly.
        """
        metrics = CGroupsTelemetry.poll_all_tracked()

        if metrics:
            for metric in metrics:
                report_metric(metric.category, metric.counter, metric.instance,
                              metric.value)

    def send_telemetry_metrics(self):
        """
        The send_telemetry_metrics would soon be removed in favor of sending performance metrics directly.
        """
        performance_metrics = CGroupsTelemetry.report_all_tracked()

        if performance_metrics:
            message = generate_extension_metrics_telemetry_dictionary(
                schema_version=1.0, performance_metrics=performance_metrics)
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionMetricsData,
                      is_success=True,
                      message=ustr(message),
                      log_event=False)

    def log_altered_network_configuration(self):
        """
        Check various pieces of network configuration and, if altered since the last check, log the new state.
        """
        raw_route_list = self.osutil.read_route_table()
        digest = hash_strings(raw_route_list)
        if digest != self.last_route_table_hash:
            self.last_route_table_hash = digest
            route_list = self.osutil.get_list_of_routes(raw_route_list)
            logger.info("Route table: [{0}]".format(",".join(
                map(networkutil.RouteEntry.to_json, route_list))))

        nic_state = self.osutil.get_nic_state()
        if nic_state != self.last_nic_state:
            description = "Initial" if self.last_nic_state == {} else "Updated"
            logger.info("{0} NIC state: [{1}]".format(
                description, ", ".join(map(str, nic_state.values()))))
            self.last_nic_state = nic_state
Exemplo n.º 28
0
class MonitorHandler(object):

    EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1)
    TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30)
    CGROUP_TELEMETRY_PERIOD = datetime.timedelta(minutes=5)
    # host plugin
    HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5)
    # imds
    IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1)
    IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3)

    def __init__(self):
        self.osutil = get_osutil()
        self.protocol_util = get_protocol_util()
        self.imds_client = get_imds_client()

        self.event_thread = None
        self.last_event_collection = None
        self.last_telemetry_heartbeat = None
        self.last_cgroup_telemetry = None
        self.last_host_plugin_heartbeat = None
        self.last_imds_heartbeat = None
        self.protocol = None
        self.health_service = None
        self.last_route_table_hash = b''
        self.last_nic_state = {}

        self.counter = 0
        self.sysinfo = []
        self.should_run = True
        self.heartbeat_id = str(uuid.uuid4()).upper()
        self.host_plugin_errorstate = ErrorState(
            min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD)
        self.imds_errorstate = ErrorState(
            min_timedelta=MonitorHandler.IMDS_HEALTH_PERIOD)

    def run(self):
        self.init_protocols()
        self.init_sysinfo()
        self.init_cgroups()
        self.start()

    def stop(self):
        self.should_run = False
        if self.is_alive():
            self.event_thread.join()

    def init_protocols(self):
        self.protocol = self.protocol_util.get_protocol()
        self.health_service = HealthService(self.protocol.endpoint)

    def is_alive(self):
        return self.event_thread is not None and self.event_thread.is_alive()

    def start(self):
        self.event_thread = threading.Thread(target=self.daemon)
        self.event_thread.setDaemon(True)
        self.event_thread.start()

    def init_sysinfo(self):
        osversion = "{0}:{1}-{2}-{3}:{4}".format(platform.system(),
                                                 DISTRO_NAME, DISTRO_VERSION,
                                                 DISTRO_CODE_NAME,
                                                 platform.release())
        self.sysinfo.append(TelemetryEventParam("OSVersion", osversion))
        self.sysinfo.append(TelemetryEventParam("GAVersion", CURRENT_AGENT))

        try:
            ram = self.osutil.get_total_mem()
            processors = self.osutil.get_processor_cores()
            self.sysinfo.append(TelemetryEventParam("RAM", ram))
            self.sysinfo.append(TelemetryEventParam("Processors", processors))
        except OSUtilError as e:
            logger.warn("Failed to get system info: {0}", e)

        try:
            vminfo = self.protocol.get_vminfo()
            self.sysinfo.append(TelemetryEventParam("VMName", vminfo.vmName))
            self.sysinfo.append(
                TelemetryEventParam("TenantName", vminfo.tenantName))
            self.sysinfo.append(
                TelemetryEventParam("RoleName", vminfo.roleName))
            self.sysinfo.append(
                TelemetryEventParam("RoleInstanceName",
                                    vminfo.roleInstanceName))
            self.sysinfo.append(
                TelemetryEventParam("ContainerId", vminfo.containerId))
        except ProtocolError as e:
            logger.warn("Failed to get system info: {0}", e)

        try:
            vminfo = self.imds_client.get_compute()
            self.sysinfo.append(
                TelemetryEventParam('Location', vminfo.location))
            self.sysinfo.append(
                TelemetryEventParam('SubscriptionId', vminfo.subscriptionId))
            self.sysinfo.append(
                TelemetryEventParam('ResourceGroupName',
                                    vminfo.resourceGroupName))
            self.sysinfo.append(TelemetryEventParam('VMId', vminfo.vmId))
            self.sysinfo.append(
                TelemetryEventParam('ImageOrigin', vminfo.image_origin))
        except (HttpError, ValueError) as e:
            logger.warn("failed to get IMDS info: {0}", e)

    def collect_event(self, evt_file_name):
        try:
            logger.verbose("Found event file: {0}", evt_file_name)
            with open(evt_file_name, "rb") as evt_file:
                # if fail to open or delete the file, throw exception
                data_str = evt_file.read().decode("utf-8", 'ignore')
            logger.verbose("Processed event file: {0}", evt_file_name)
            os.remove(evt_file_name)
            return data_str
        except IOError as e:
            msg = "Failed to process {0}, {1}".format(evt_file_name, e)
            raise EventError(msg)

    def collect_and_send_events(self):
        if self.last_event_collection is None:
            self.last_event_collection = datetime.datetime.utcnow(
            ) - MonitorHandler.EVENT_COLLECTION_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_event_collection +
                MonitorHandler.EVENT_COLLECTION_PERIOD):
            try:
                event_list = TelemetryEventList()
                event_dir = os.path.join(conf.get_lib_dir(), "events")
                event_files = os.listdir(event_dir)
                for event_file in event_files:
                    if not event_file.endswith(".tld"):
                        continue
                    event_file_path = os.path.join(event_dir, event_file)
                    try:
                        data_str = self.collect_event(event_file_path)
                    except EventError as e:
                        logger.error("{0}", e)
                        continue

                    try:
                        event = parse_event(data_str)
                        self.add_sysinfo(event)
                        event_list.events.append(event)
                    except (ValueError, ProtocolError) as e:
                        logger.warn("Failed to decode event file: {0}", e)
                        continue

                if len(event_list.events) == 0:
                    return

                try:
                    self.protocol.report_event(event_list)
                except ProtocolError as e:
                    logger.error("{0}", e)
            except Exception as e:
                logger.warn("Failed to send events: {0}", e)

            self.last_event_collection = datetime.datetime.utcnow()

    def daemon(self):
        min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD,
                        MonitorHandler.CGROUP_TELEMETRY_PERIOD,
                        MonitorHandler.EVENT_COLLECTION_PERIOD,
                        MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD,
                        MonitorHandler.IMDS_HEARTBEAT_PERIOD).seconds
        while self.should_run:
            self.send_telemetry_heartbeat()
            self.send_cgroup_telemetry()
            self.collect_and_send_events()
            self.send_host_plugin_heartbeat()
            self.send_imds_heartbeat()
            self.log_altered_network_configuration()
            time.sleep(min_delta)

    def add_sysinfo(self, event):
        sysinfo_names = [v.name for v in self.sysinfo]
        for param in event.parameters:
            if param.name in sysinfo_names:
                logger.verbose("Remove existing event parameter: [{0}:{1}]",
                               param.name, param.value)
                event.parameters.remove(param)
        event.parameters.extend(self.sysinfo)

    def send_imds_heartbeat(self):
        """
        Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have
        successfully called and validated a response in the last IMDS_HEALTH_PERIOD.
        """

        if self.last_imds_heartbeat is None:
            self.last_imds_heartbeat = datetime.datetime.utcnow(
            ) - MonitorHandler.IMDS_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_imds_heartbeat +
                MonitorHandler.IMDS_HEARTBEAT_PERIOD):
            try:
                is_currently_healthy, response = self.imds_client.validate()

                if is_currently_healthy:
                    self.imds_errorstate.reset()
                else:
                    self.imds_errorstate.incr()

                is_healthy = self.imds_errorstate.is_triggered() is False
                logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)

                self.health_service.report_imds_status(is_healthy, response)

            except Exception as e:
                msg = "Exception sending imds heartbeat: {0}".format(ustr(e))
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.ImdsHeartbeat,
                          is_success=False,
                          message=msg,
                          log_event=False)

            self.last_imds_heartbeat = datetime.datetime.utcnow()

    def send_host_plugin_heartbeat(self):
        """
        Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to
        communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD.
        """
        if self.last_host_plugin_heartbeat is None:
            self.last_host_plugin_heartbeat = datetime.datetime.utcnow(
            ) - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_host_plugin_heartbeat +
                MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD):
            try:
                host_plugin = self.protocol.client.get_host_plugin()
                host_plugin.ensure_initialized()
                is_currently_healthy = host_plugin.get_health()

                if is_currently_healthy:
                    self.host_plugin_errorstate.reset()
                else:
                    self.host_plugin_errorstate.incr()

                is_healthy = self.host_plugin_errorstate.is_triggered(
                ) is False
                logger.verbose("HostGAPlugin health: {0}", is_healthy)

                self.health_service.report_host_plugin_heartbeat(is_healthy)

            except Exception as e:
                msg = "Exception sending host plugin heartbeat: {0}".format(
                    ustr(e))
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HostPluginHeartbeat,
                          is_success=False,
                          message=msg,
                          log_event=False)

            self.last_host_plugin_heartbeat = datetime.datetime.utcnow()

    def send_telemetry_heartbeat(self):

        if self.last_telemetry_heartbeat is None:
            self.last_telemetry_heartbeat = datetime.datetime.utcnow(
            ) - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD

        if datetime.datetime.utcnow() >= (
                self.last_telemetry_heartbeat +
                MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD):
            try:
                incarnation = self.protocol.get_incarnation()
                dropped_packets = self.osutil.get_firewall_dropped_packets(
                    self.protocol.endpoint)
                msg = "{0};{1};{2};{3}".format(incarnation, self.counter,
                                               self.heartbeat_id,
                                               dropped_packets)

                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HeartBeat,
                          is_success=True,
                          message=msg,
                          log_event=False)

                self.counter += 1

                io_errors = IOErrorCounter.get_and_reset()
                hostplugin_errors = io_errors.get("hostplugin")
                protocol_errors = io_errors.get("protocol")
                other_errors = io_errors.get("other")

                if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0:
                    msg = "hostplugin:{0};protocol:{1};other:{2}".format(
                        hostplugin_errors, protocol_errors, other_errors)
                    add_event(name=AGENT_NAME,
                              version=CURRENT_VERSION,
                              op=WALAEventOperation.HttpErrors,
                              is_success=True,
                              message=msg,
                              log_event=False)
            except Exception as e:
                logger.warn("Failed to send heartbeat: {0}", e)

            self.last_telemetry_heartbeat = datetime.datetime.utcnow()

    @staticmethod
    def init_cgroups():
        # Track metrics for the roll-up cgroup and for the agent cgroup
        try:
            CGroupsTelemetry.track_cgroup(CGroups.for_extension(""))
            CGroupsTelemetry.track_agent()
        except Exception as e:
            logger.error(
                "monitor: Exception tracking wrapper and agent: {0} [{1}]", e,
                traceback.format_exc())

    def send_cgroup_telemetry(self):
        if self.last_cgroup_telemetry is None:
            self.last_cgroup_telemetry = datetime.datetime.utcnow()

        if datetime.datetime.utcnow() >= (
                self.last_telemetry_heartbeat +
                MonitorHandler.CGROUP_TELEMETRY_PERIOD):
            try:
                for cgroup_name, metrics in CGroupsTelemetry.collect_all_tracked(
                ).items():
                    for metric_group, metric_name, value in metrics:
                        if value > 0:
                            report_metric(metric_group, metric_name,
                                          cgroup_name, value)
            except Exception as e:
                logger.warn("Failed to collect performance metrics: {0} [{1}]",
                            e, traceback.format_exc())

            # Look for extension cgroups we're not already tracking and track them
            try:
                CGroupsTelemetry.update_tracked(
                    self.protocol.client.get_current_handlers())
            except Exception as e:
                logger.warn(
                    "Monitor: updating tracked extensions raised {0}: {1}", e,
                    traceback.format_exc())

            self.last_cgroup_telemetry = datetime.datetime.utcnow()

    def log_altered_network_configuration(self):
        """
        Check various pieces of network configuration and, if altered since the last check, log the new state.
        """
        raw_route_list = self.osutil.read_route_table()
        digest = hash_strings(raw_route_list)
        if digest != self.last_route_table_hash:
            self.last_route_table_hash = digest
            route_list = self.osutil.get_list_of_routes(raw_route_list)
            logger.info("Route table: [{0}]".format(",".join(
                map(networkutil.RouteEntry.to_json, route_list))))

        nic_state = self.osutil.get_nic_state()
        if nic_state != self.last_nic_state:
            description = "Initial" if self.last_nic_state == {} else "Updated"
            logger.info("{0} NIC state: [{1}]".format(
                description, ", ".join(map(str, nic_state.values()))))
            self.last_nic_state = nic_state
Exemplo n.º 29
0
class HostPluginProtocol(object):
    _is_default_channel = False

    FETCH_REPORTING_PERIOD = datetime.timedelta(minutes=1)
    STATUS_REPORTING_PERIOD = datetime.timedelta(minutes=1)

    def __init__(self, endpoint, container_id, role_config_name):
        if endpoint is None:
            raise ProtocolError("HostGAPlugin: Endpoint not provided")
        self.is_initialized = False
        self.is_available = False
        self.api_versions = None
        self.endpoint = endpoint
        self.container_id = container_id
        self.deployment_id = None
        self.role_config_name = role_config_name
        self.manifest_uri = None
        self.health_service = HealthService(endpoint)
        self.fetch_error_state = ErrorState(min_timedelta=ERROR_STATE_HOST_PLUGIN_FAILURE)
        self.status_error_state = ErrorState(min_timedelta=ERROR_STATE_HOST_PLUGIN_FAILURE)
        self.fetch_last_timestamp = None
        self.status_last_timestamp = None

    @staticmethod
    def is_default_channel():
        return HostPluginProtocol._is_default_channel

    @staticmethod
    def set_default_channel(is_default):
        HostPluginProtocol._is_default_channel = is_default

    def ensure_initialized(self):
        if not self.is_initialized:
            self.api_versions = self.get_api_versions()
            self.is_available = API_VERSION in self.api_versions
            self.is_initialized = self.is_available
            from azurelinuxagent.common.event import WALAEventOperation, report_event
            report_event(WALAEventOperation.InitializeHostPlugin,
                         is_success=self.is_available)
        return self.is_available

    def get_health(self):
        """
        Call the /health endpoint
        :return: True if 200 received, False otherwise
        """
        url = URI_FORMAT_HEALTH.format(self.endpoint,
                                       HOST_PLUGIN_PORT)
        logger.verbose("HostGAPlugin: Getting health from [{0}]", url)
        response = restutil.http_get(url, max_retry=1)
        return restutil.request_succeeded(response)

    def get_api_versions(self):
        url = URI_FORMAT_GET_API_VERSIONS.format(self.endpoint,
                                                 HOST_PLUGIN_PORT)
        logger.verbose("HostGAPlugin: Getting API versions at [{0}]"
                       .format(url))
        return_val = []
        error_response = ''
        is_healthy = False
        try:
            headers = {HEADER_CONTAINER_ID: self.container_id}
            response = restutil.http_get(url, headers)
            if restutil.request_failed(response):
                error_response = restutil.read_response_error(response)
                logger.error("HostGAPlugin: Failed Get API versions: {0}".format(error_response))
                is_healthy = not restutil.request_failed_at_hostplugin(response)
            else:
                return_val = ustr(remove_bom(response.read()), encoding='utf-8')
                is_healthy = True
        except HttpError as e:
            logger.error("HostGAPlugin: Exception Get API versions: {0}".format(e))

        self.health_service.report_host_plugin_versions(is_healthy=is_healthy, response=error_response)

        return return_val

    def get_artifact_request(self, artifact_url, artifact_manifest_url=None):
        if not self.ensure_initialized():
            raise ProtocolError("HostGAPlugin: Host plugin channel is not available")

        if textutil.is_str_none_or_whitespace(artifact_url):
            raise ProtocolError("HostGAPlugin: No extension artifact url was provided")

        url = URI_FORMAT_GET_EXTENSION_ARTIFACT.format(self.endpoint,
                                                       HOST_PLUGIN_PORT)
        headers = {HEADER_VERSION: API_VERSION,
                   HEADER_CONTAINER_ID: self.container_id,
                   HEADER_HOST_CONFIG_NAME: self.role_config_name,
                   HEADER_ARTIFACT_LOCATION: artifact_url}

        if artifact_manifest_url is not None:
            headers[HEADER_ARTIFACT_MANIFEST_LOCATION] = artifact_manifest_url

        return url, headers

    def report_fetch_health(self, uri, is_healthy=True, source='', response=''):

        if uri != URI_FORMAT_GET_EXTENSION_ARTIFACT.format(self.endpoint, HOST_PLUGIN_PORT):
            return

        if self.should_report(is_healthy,
                              self.fetch_error_state,
                              self.fetch_last_timestamp,
                              HostPluginProtocol.FETCH_REPORTING_PERIOD):
            self.fetch_last_timestamp = datetime.datetime.utcnow()
            health_signal = self.fetch_error_state.is_triggered() is False
            self.health_service.report_host_plugin_extension_artifact(is_healthy=health_signal,
                                                                      source=source,
                                                                      response=response)

    def report_status_health(self, is_healthy, response=''):
        if self.should_report(is_healthy,
                              self.status_error_state,
                              self.status_last_timestamp,
                              HostPluginProtocol.STATUS_REPORTING_PERIOD):
            self.status_last_timestamp = datetime.datetime.utcnow()
            health_signal = self.status_error_state.is_triggered() is False
            self.health_service.report_host_plugin_status(is_healthy=health_signal,
                                                          response=response)

    @staticmethod
    def should_report(is_healthy, error_state, last_timestamp, period):
        """
        Determine whether a health signal should be reported
        :param is_healthy: whether the current measurement is healthy
        :param error_state: the error state which is tracking time since failure
        :param last_timestamp: the last measurement time stamp
        :param period: the reporting period
        :return: True if the signal should be reported, False otherwise
        """

        if is_healthy:
            # we only reset the error state upon success, since we want to keep
            # reporting the failure; this is different to other uses of error states
            # which do not have a separate periodicity
            error_state.reset()
        else:
            error_state.incr()

        if last_timestamp is None:
            last_timestamp = datetime.datetime.utcnow() - period

        return datetime.datetime.utcnow() >= (last_timestamp + period)

    def put_vm_log(self, content):
        raise NotImplementedError("Unimplemented")

    def put_vm_status(self, status_blob, sas_url, config_blob_type=None):
        """
        Try to upload the VM status via the host plugin /status channel
        :param sas_url: the blob SAS url to pass to the host plugin
        :param config_blob_type: the blob type from the extension config
        :type status_blob: StatusBlob
        """
        if not self.ensure_initialized():
            raise ProtocolError("HostGAPlugin: HostGAPlugin is not available")

        if status_blob is None or status_blob.vm_status is None:
            raise ProtocolError("HostGAPlugin: Status blob was not provided")

        logger.verbose("HostGAPlugin: Posting VM status")
        blob_type = status_blob.type if status_blob.type else config_blob_type

        if blob_type == "BlockBlob":
            self._put_block_blob_status(sas_url, status_blob)
        else:
            self._put_page_blob_status(sas_url, status_blob)

    def _put_block_blob_status(self, sas_url, status_blob):
        url = URI_FORMAT_PUT_VM_STATUS.format(self.endpoint, HOST_PLUGIN_PORT)

        response = restutil.http_put(url,
                                     data=self._build_status_data(
                                         sas_url,
                                         status_blob.get_block_blob_headers(len(status_blob.data)),
                                         bytearray(status_blob.data, encoding='utf-8')),
                                     headers=self._build_status_headers())

        if restutil.request_failed(response):
            error_response = restutil.read_response_error(response)
            is_healthy = not restutil.request_failed_at_hostplugin(response)
            self.report_status_health(is_healthy=is_healthy, response=error_response)
            raise HttpError("HostGAPlugin: Put BlockBlob failed: {0}"
                            .format(error_response))
        else:
            self.report_status_health(is_healthy=True)
            logger.verbose("HostGAPlugin: Put BlockBlob status succeeded")

    def _put_page_blob_status(self, sas_url, status_blob):
        url = URI_FORMAT_PUT_VM_STATUS.format(self.endpoint, HOST_PLUGIN_PORT)

        # Convert the status into a blank-padded string whose length is modulo 512
        status = bytearray(status_blob.data, encoding='utf-8')
        status_size = int((len(status) + 511) / 512) * 512
        status = bytearray(status_blob.data.ljust(status_size), encoding='utf-8')

        # First, initialize an empty blob
        response = restutil.http_put(url,
                                     data=self._build_status_data(
                                         sas_url,
                                         status_blob.get_page_blob_create_headers(status_size)),
                                     headers=self._build_status_headers())

        if restutil.request_failed(response):
            error_response = restutil.read_response_error(response)
            is_healthy = not restutil.request_failed_at_hostplugin(response)
            self.report_status_health(is_healthy=is_healthy, response=error_response)
            raise HttpError("HostGAPlugin: Failed PageBlob clean-up: {0}"
                            .format(error_response))
        else:
            self.report_status_health(is_healthy=True)
            logger.verbose("HostGAPlugin: PageBlob clean-up succeeded")
        
        # Then, upload the blob in pages
        if sas_url.count("?") <= 0:
            sas_url = "{0}?comp=page".format(sas_url)
        else:
            sas_url = "{0}&comp=page".format(sas_url)

        start = 0
        end = 0
        while start < len(status):
            # Create the next page
            end = start + min(len(status) - start, MAXIMUM_PAGEBLOB_PAGE_SIZE)
            page_size = int((end - start + 511) / 512) * 512
            buf = bytearray(page_size)
            buf[0: end - start] = status[start: end]

            # Send the page
            response = restutil.http_put(url,
                                         data=self._build_status_data(
                                             sas_url,
                                             status_blob.get_page_blob_page_headers(start, end),
                                             buf),
                                         headers=self._build_status_headers())

            if restutil.request_failed(response):
                error_response = restutil.read_response_error(response)
                is_healthy = not restutil.request_failed_at_hostplugin(response)
                self.report_status_health(is_healthy=is_healthy, response=error_response)
                raise HttpError(
                    "HostGAPlugin Error: Put PageBlob bytes "
                    "[{0},{1}]: {2}".format(start, end, error_response))

            # Advance to the next page (if any)
            start = end
        
    def _build_status_data(self, sas_url, blob_headers, content=None):
        headers = []
        for name in iter(blob_headers.keys()):
            headers.append({
                'headerName': name,
                'headerValue': blob_headers[name]
            })

        data = {
            'requestUri': sas_url,
            'headers': headers
        }
        if not content is None:
            data['content'] = self._base64_encode(content)
        return json.dumps(data, sort_keys=True)
    
    def _build_status_headers(self):
        return {
            HEADER_VERSION: API_VERSION,
            "Content-type": "application/json",
            HEADER_CONTAINER_ID: self.container_id,
            HEADER_HOST_CONFIG_NAME: self.role_config_name
        }
    
    def _base64_encode(self, data):
        s = base64.b64encode(bytes(data))
        if PY_VERSION_MAJOR > 2:
            return s.decode('utf-8')
        return s
Exemplo n.º 30
0
class ExtHandlersHandler(object):
    def __init__(self):
        self.protocol_util = get_protocol_util()
        self.protocol = None
        self.ext_handlers = None
        self.last_etag = None
        self.last_upgrade_guids = {}
        self.log_report = False
        self.log_etag = True
        self.log_process = False

        self.report_status_error_state = ErrorState()

    def run(self):
        self.ext_handlers, etag = None, None
        try:
            self.protocol = self.protocol_util.get_protocol()
            self.ext_handlers, etag = self.protocol.get_ext_handlers()
        except Exception as e:
            msg = u"Exception retrieving extension handlers: {0}".format(
                ustr(e))
            logger.warn(msg)
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=msg)
            return

        try:
            msg = u"Handle extensions updates for incarnation {0}".format(etag)
            logger.verbose(msg)
            # Log status report success on new config
            self.log_report = True
            self.handle_ext_handlers(etag)
            self.last_etag = etag

            self.report_ext_handlers_status()
            self.cleanup_outdated_handlers()
        except RestartError:
            raise
        except Exception as e:
            msg = u"Exception processing extension handlers: {0}".format(
                ustr(e))
            logger.warn(msg)
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=msg)
            return

    def run_status(self):
        self.report_ext_handlers_status()
        return

    def get_upgrade_guid(self, name):
        return self.last_upgrade_guids.get(name, (None, False))[0]

    def get_log_upgrade_guid(self, ext_handler):
        return self.last_upgrade_guids.get(ext_handler.name, (None, False))[1]

    def set_log_upgrade_guid(self, ext_handler, log_val):
        guid = self.get_upgrade_guid(ext_handler.name)
        if guid is not None:
            self.last_upgrade_guids[ext_handler.name] = (guid, log_val)

    def is_new_guid(self, ext_handler):
        last_guid = self.get_upgrade_guid(ext_handler.name)
        if last_guid is None:
            return True
        return last_guid != ext_handler.properties.upgradeGuid

    def cleanup_outdated_handlers(self):
        handlers = []
        pkgs = []

        # Build a collection of uninstalled handlers and orphaned packages
        # Note:
        # -- An orphaned package is one without a corresponding handler
        #    directory
        for item in os.listdir(conf.get_lib_dir()):
            path = os.path.join(conf.get_lib_dir(), item)

            if version.is_agent_package(path) or version.is_agent_path(path):
                continue

            if os.path.isdir(path):
                if re.match(HANDLER_NAME_PATTERN, item) is None:
                    continue
                try:
                    eh = ExtHandler()

                    separator = item.rfind('-')

                    eh.name = item[0:separator]
                    eh.properties.version = str(FlexibleVersion(item[separator+1:]))

                    handler = ExtHandlerInstance(eh, self.protocol)
                except Exception as e:
                    continue
                if handler.get_handler_state() != ExtHandlerState.NotInstalled:
                    continue
                handlers.append(handler)

            elif os.path.isfile(path) and \
                    not os.path.isdir(path[0:-len(HANDLER_PKG_EXT)]):
                if not re.match(HANDLER_PKG_PATTERN, item):
                    continue
                pkgs.append(path)

        # Then, remove the orphaned packages
        for pkg in pkgs:
            try:
                os.remove(pkg)
                logger.verbose("Removed orphaned extension package "
                            "{0}".format(pkg))
            except Exception as e:
                logger.warn("Failed to remove orphaned package: {0}".format(
                                pkg))

        # Finally, remove the directories and packages of the
        # uninstalled handlers
        for handler in handlers:
            handler.rm_ext_handler_dir()
            pkg = os.path.join(conf.get_lib_dir(),
                    handler.get_full_name() + HANDLER_PKG_EXT)
            if os.path.isfile(pkg):
                try:
                    os.remove(pkg)
                    logger.verbose("Removed extension package "
                                "{0}".format(pkg))
                except Exception as e:
                    logger.warn("Failed to remove extension package: "
                                "{0}".format(pkg))
   
    def handle_ext_handlers(self, etag=None):
        if self.ext_handlers.extHandlers is None or \
                len(self.ext_handlers.extHandlers) == 0:
            logger.verbose("No extension handler config found")
            return

        if conf.get_enable_overprovisioning():
            artifacts_profile = self.protocol.get_artifacts_profile()
            if artifacts_profile and artifacts_profile.is_on_hold():
                logger.info("Extension handling is on hold")
                return

        self.ext_handlers.extHandlers.sort(key=operator.methodcaller('sort_key'))
        for ext_handler in self.ext_handlers.extHandlers:
            # TODO: handle install in sequence, enable in parallel
            self.handle_ext_handler(ext_handler, etag)
    
    def handle_ext_handler(self, ext_handler, etag):
        ext_handler_i = ExtHandlerInstance(ext_handler, self.protocol)

        try:
            state = ext_handler.properties.state
            # The extension is to be enabled, there is an upgrade GUID
            # and the GUID is NOT new
            if state == u"enabled" and \
                    ext_handler.properties.upgradeGuid is not None and \
                    not self.is_new_guid(ext_handler):
                ext_handler_i.ext_handler.properties.version = ext_handler_i.get_installed_version()
                ext_handler_i.set_logger()
                if self.last_etag != etag:
                    self.set_log_upgrade_guid(ext_handler, True)

                msg = "New GUID is the same as the old GUID. Exiting without upgrading."
                if self.get_log_upgrade_guid(ext_handler):
                    ext_handler_i.logger.info(msg)
                    self.set_log_upgrade_guid(ext_handler, False)
                ext_handler_i.set_handler_state(ExtHandlerState.Enabled)
                ext_handler_i.set_handler_status(status="Ready", message="No change")
                ext_handler_i.set_operation(WALAEventOperation.SkipUpdate)
                ext_handler_i.report_event(message=ustr(msg), is_success=True)
                return

            self.set_log_upgrade_guid(ext_handler, True)
            ext_handler_i.decide_version(etag=etag, target_state=state)
            if not ext_handler_i.is_upgrade and self.last_etag == etag:
                if self.log_etag:
                    ext_handler_i.logger.verbose("Version {0} is current for etag {1}",
                                                 ext_handler_i.pkg.version,
                                                 etag)
                    self.log_etag = False
                return

            self.log_etag = True

            ext_handler_i.logger.info("Target handler state: {0}", state)
            if state == u"enabled":
                self.handle_enable(ext_handler_i)
                if ext_handler.properties.upgradeGuid is not None:
                    ext_handler_i.logger.info("New Upgrade GUID: {0}", ext_handler.properties.upgradeGuid)
                    self.last_upgrade_guids[ext_handler.name] = (ext_handler.properties.upgradeGuid, True)
            elif state == u"disabled":
                self.handle_disable(ext_handler_i)
                # Remove the GUID from the dictionary so that it is upgraded upon re-enable
                self.last_upgrade_guids.pop(ext_handler.name, None)
            elif state == u"uninstall":
                self.handle_uninstall(ext_handler_i)
                # Remove the GUID from the dictionary so that it is upgraded upon re-install
                self.last_upgrade_guids.pop(ext_handler.name, None)
            else:
                message = u"Unknown ext handler state:{0}".format(state)
                raise ExtensionError(message)
        except RestartError:
            ext_handler_i.logger.info("GoalState became stale during "
                                      "processing. Restarting with new "
                                      "GoalState")
            raise
        except Exception as e:
            ext_handler_i.set_handler_status(message=ustr(e), code=-1)
            ext_handler_i.report_event(message=ustr(e), is_success=False)
    
    def handle_enable(self, ext_handler_i):
        self.log_process = True
        old_ext_handler_i = ext_handler_i.get_installed_ext_handler()
        if old_ext_handler_i is not None and \
           old_ext_handler_i.version_gt(ext_handler_i):
            msg = "Downgrade is not allowed. Skipping install and enable."
            ext_handler_i.logger.error(msg)
            ext_handler_i.set_operation(WALAEventOperation.Downgrade)
            ext_handler_i.report_event(message=ustr(msg), is_success=True)
            return

        handler_state = ext_handler_i.get_handler_state()
        ext_handler_i.logger.info("[Enable] current handler state is: {0}",
                                  handler_state.lower())
        if handler_state == ExtHandlerState.NotInstalled:
            ext_handler_i.set_handler_state(ExtHandlerState.NotInstalled)
            ext_handler_i.download()
            ext_handler_i.update_settings()
            if old_ext_handler_i is None:
                ext_handler_i.install()
            elif ext_handler_i.version_gt(old_ext_handler_i):
                old_ext_handler_i.disable()
                ext_handler_i.copy_status_files(old_ext_handler_i)
                ext_handler_i.update()
                old_ext_handler_i.uninstall()
                old_ext_handler_i.rm_ext_handler_dir()
                ext_handler_i.update_with_install()
        else:
            ext_handler_i.update_settings()

        ext_handler_i.enable() 

    def handle_disable(self, ext_handler_i):
        self.log_process = True
        handler_state = ext_handler_i.get_handler_state()
        ext_handler_i.logger.info("[Disable] current handler state is: {0}",
                                  handler_state.lower())
        if handler_state == ExtHandlerState.Enabled:
            ext_handler_i.disable()

    def handle_uninstall(self, ext_handler_i):
        self.log_process = True
        handler_state = ext_handler_i.get_handler_state()
        ext_handler_i.logger.info("[Uninstall] current handler state is: {0}",
                                  handler_state.lower())
        if handler_state != ExtHandlerState.NotInstalled:
            if handler_state == ExtHandlerState.Enabled:
                ext_handler_i.disable()
            ext_handler_i.uninstall()
        ext_handler_i.rm_ext_handler_dir()

    def report_ext_handlers_status(self):
        """Go through handler_state dir, collect and report status"""
        vm_status = VMStatus(status="Ready", message="Guest Agent is running")
        if self.ext_handlers is not None:
            for ext_handler in self.ext_handlers.extHandlers:
                try:
                    self.report_ext_handler_status(vm_status, ext_handler)
                except ExtensionError as e:
                    add_event(
                        AGENT_NAME,
                        version=CURRENT_VERSION,
                        op=WALAEventOperation.ExtensionProcessing,
                        is_success=False,
                        message=ustr(e))

        logger.verbose("Report vm agent status")
        try:
            self.protocol.report_vm_status(vm_status)
            if self.log_report:
                logger.verbose("Completed vm agent status report")
            self.report_status_error_state.reset()
        except ProtocolError as e:
            self.report_status_error_state.incr()
            message = "Failed to report vm agent status: {0}".format(e)
            add_event(AGENT_NAME,
                version=CURRENT_VERSION,
                op=WALAEventOperation.ExtensionProcessing,
                is_success=False,
                message=message)

        if self.report_status_error_state.is_triggered():
            message = "Failed to report vm agent status for more than {0}"\
                .format(ERROR_STATE_DELTA)

            add_event(AGENT_NAME,
                version=CURRENT_VERSION,
                op=WALAEventOperation.ExtensionProcessing,
                is_success=False,
                message=message)

            self.report_status_error_state.reset()

    def report_ext_handler_status(self, vm_status, ext_handler):
        ext_handler_i = ExtHandlerInstance(ext_handler, self.protocol)
        
        handler_status = ext_handler_i.get_handler_status() 
        if handler_status is None:
            return
        guid = self.get_upgrade_guid(ext_handler.name)
        if guid is not None:
            handler_status.upgradeGuid = guid

        handler_state = ext_handler_i.get_handler_state()
        if handler_state != ExtHandlerState.NotInstalled:
            try:
                active_exts = ext_handler_i.report_ext_status()
                handler_status.extensions.extend(active_exts)
            except ExtensionError as e:
                ext_handler_i.set_handler_status(message=ustr(e), code=-1)

            try:
                heartbeat = ext_handler_i.collect_heartbeat()
                if heartbeat is not None:
                    handler_status.status = heartbeat.get('status')
            except ExtensionError as e:
                ext_handler_i.set_handler_status(message=ustr(e), code=-1)

        vm_status.vmAgent.extensionHandlers.append(handler_status)
Exemplo n.º 31
0
 def __init__(self, protocol, health_service):
     super(SendHostPluginHeartbeat, self).__init__(SendHostPluginHeartbeat._HOST_PLUGIN_HEARTBEAT_PERIOD)
     self.protocol = protocol
     self.health_service = health_service
     self.host_plugin_error_state = ErrorState(min_timedelta=SendHostPluginHeartbeat._HOST_PLUGIN_HEALTH_PERIOD)