Пример #1
0
    def send_host_plugin_heartbeat(self):
        """
        Report HostGAPlugin health to the health service, at most once per HOST_PLUGIN_HEARTBEAT_PERIOD.

        Each probe resets the host plugin error state when get_health() is truthy and increments it
        otherwise; the reported signal is healthy while the error state is not triggered. Any exception
        raised while probing or reporting is sent as a failed HostPluginHeartbeat event instead of
        propagating.
        """
        period = MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD

        # First call: back-date the timestamp by one period so the heartbeat is due right away.
        if self.last_host_plugin_heartbeat is None:
            self.last_host_plugin_heartbeat = datetime.datetime.utcnow() - period

        if datetime.datetime.utcnow() < (self.last_host_plugin_heartbeat + period):
            return

        try:
            plugin = self.protocol.client.get_host_plugin()
            plugin.ensure_initialized()

            if plugin.get_health():
                self.host_plugin_errorstate.reset()
            else:
                self.host_plugin_errorstate.incr()

            is_healthy = self.host_plugin_errorstate.is_triggered() is False
            logger.verbose("HostGAPlugin health: {0}", is_healthy)
            self.health_service.report_host_plugin_heartbeat(is_healthy)
        except Exception as error:
            failure_text = "Exception sending host plugin heartbeat: {0}".format(ustr(error))
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.HostPluginHeartbeat,
                      is_success=False,
                      message=failure_text,
                      log_event=False)

        # The timestamp advances whether or not the probe succeeded.
        self.last_host_plugin_heartbeat = datetime.datetime.utcnow()
Пример #2
0
def is_log_collection_allowed():
    """
    Decide whether periodic log collection may run, then log and report the decision.

    Three conditions must all hold:
      1) log collection is enabled in the configuration;
      2) cgroups are enabled (needed to limit the resources used by the collection);
      3) the interpreter is Python 2.7+ or Python 3 (needed for the ZipFile support used when collecting).

    :return: the combined result of the three conditions.
    """
    conf_enabled = conf.get_collect_logs()
    cgroups_enabled = CGroupConfigurator.get_instance().enabled()

    if PY_VERSION_MAJOR == 2:
        supported_python = PY_VERSION_MINOR >= 7
    else:
        supported_python = PY_VERSION_MAJOR == 3

    is_allowed = conf_enabled and cgroups_enabled and supported_python

    template = ("Checking if log collection is allowed at this time [{0}]. All three conditions must be met: "
                "configuration enabled [{1}], cgroups enabled [{2}], python supported: [{3}]")
    msg = template.format(is_allowed, conf_enabled, cgroups_enabled, supported_python)

    logger.info(msg)
    add_event(name=AGENT_NAME,
              version=CURRENT_VERSION,
              op=WALAEventOperation.LogCollection,
              is_success=is_allowed,
              message=msg,
              log_event=False)

    return is_allowed
Пример #3
0
    def test_save_event_cleanup(self):
        """
        Saving one event into a directory that already holds 2000 event files leaves 1000 files.

        The test pre-populates a scratch directory with 2000 '.tld' files named by increasing
        timestamps, adds one more event through add_event(), and then checks that the oldest
        surviving file is 'test event 1001' and the newest one is the event just added.
        """
        import shutil  # local import: only needed for the scratch-directory cleanup below

        tmp_evt = tempfile.mkdtemp()
        # Remove the scratch directory (and its event files) when the test ends, pass or fail.
        # NOTE(review): assumes the enclosing class is a unittest.TestCase (addCleanup) - confirm.
        self.addCleanup(shutil.rmtree, tmp_evt, ignore_errors=True)
        init_event_logger(tmp_evt)

        for i in range(0, 2000):
            evt = os.path.join(tmp_evt, '{0}.tld'.format(ustr(1491004920536531 + i)))
            with open(evt, 'w') as fh:
                fh.write('test event {0}'.format(i))

        events = os.listdir(tmp_evt)
        self.assertEqual(2000, len(events), "{0} events found, 2000 expected".format(len(events)))
        add_event('test', message='last event')

        # File names are timestamps, so lexical order is oldest-to-newest.
        events = sorted(os.listdir(tmp_evt))
        self.assertEqual(1000, len(events), "{0} events found, 1000 expected".format(len(events)))

        first_event = os.path.join(tmp_evt, events[0])
        with open(first_event) as first_fh:
            first_event_text = first_fh.read()
        self.assertIn('test event 1001', first_event_text)

        last_event = os.path.join(tmp_evt, events[-1])
        with open(last_event) as last_fh:
            last_event_text = last_fh.read()
        self.assertIn('last event', last_event_text)
Пример #4
0
    def send_imds_heartbeat(self):
        """
        Report IMDS health to the health service, at most once per IMDS_HEARTBEAT_PERIOD.

        The IMDS client's validate() result drives the error state: a healthy result resets it, an
        unhealthy one increments it. The reported signal is healthy while the error state is not
        triggered. Exceptions are converted into a failed ImdsHeartbeat event.
        """
        period = MonitorHandler.IMDS_HEARTBEAT_PERIOD

        # First call: back-date the timestamp by one period so the heartbeat is due right away.
        if self.last_imds_heartbeat is None:
            self.last_imds_heartbeat = datetime.datetime.utcnow() - period

        if datetime.datetime.utcnow() < (self.last_imds_heartbeat + period):
            return

        try:
            currently_healthy, response = self.imds_client.validate()

            if currently_healthy:
                self.imds_errorstate.reset()
            else:
                self.imds_errorstate.incr()

            is_healthy = self.imds_errorstate.is_triggered() is False
            logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)
            self.health_service.report_imds_status(is_healthy, response)
        except Exception as error:
            failure_text = "Exception sending imds heartbeat: {0}".format(ustr(error))
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ImdsHeartbeat,
                      is_success=False,
                      message=failure_text,
                      log_event=False)

        # The timestamp advances whether or not validation succeeded.
        self.last_imds_heartbeat = datetime.datetime.utcnow()
Пример #5
0
    def _send_heartbeat_telemetry(self, protocol):
        """
        Emit the agent HeartBeat telemetry event at most once per TELEMETRY_HEARTBEAT_PERIOD.

        The event message is "counter;id;dropped_packets;goal_state_errors;auto_update". After the
        event is sent the heartbeat counter is incremented and the goal state error count is reset.
        The accompanying log line is built from the same snapshot of values as the event, so it
        reports the error count that was actually sent rather than the freshly reset zero.

        :param protocol: object whose get_endpoint() value is passed to
                         osutil.get_firewall_dropped_packets().
        """
        period = UpdateHandler.TELEMETRY_HEARTBEAT_PERIOD

        # First call: back-date the timestamp by one period so the heartbeat is due right away.
        if self._last_telemetry_heartbeat is None:
            self._last_telemetry_heartbeat = datetime.utcnow() - period

        if datetime.utcnow() < (self._last_telemetry_heartbeat + period):
            return

        dropped_packets = self.osutil.get_firewall_dropped_packets(protocol.get_endpoint())
        auto_update_enabled = 1 if conf.get_autoupdate_enabled() else 0

        # Snapshot the values once so the telemetry event and the log line cannot disagree.
        heartbeat_fields = (self._heartbeat_counter,
                            self._heartbeat_id,
                            dropped_packets,
                            self._heartbeat_update_goal_state_error_count,
                            auto_update_enabled)
        telemetry_msg = "{0};{1};{2};{3};{4}".format(*heartbeat_fields)
        debug_log_msg = "[DEBUG HeartbeatCounter: {0};HeartbeatId: {1};DroppedPackets: {2};" \
                        "UpdateGSErrors: {3};AutoUpdate: {4}]".format(*heartbeat_fields)

        add_event(name=AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.HeartBeat,
                  is_success=True,
                  message=telemetry_msg,
                  log_event=False)
        self._heartbeat_counter += 1
        self._heartbeat_update_goal_state_error_count = 0

        logger.info(
            u"[HEARTBEAT] Agent {0} is running as the goal state agent {1}",
            CURRENT_AGENT, debug_log_msg)
        self._last_telemetry_heartbeat = datetime.utcnow()
Пример #6
0
    def report_ext_handlers_status(self):
        """
        Build the VM status, add the status of each extension handler, and send it via the protocol.

        An ExtensionError from one handler, or a ProtocolError from the final report, is recorded
        as a failed telemetry event rather than propagated.
        """
        vm_status = VMStatus()
        vm_status.vmAgent.version = str(CURRENT_VERSION)
        vm_status.vmAgent.status = "Ready"
        vm_status.vmAgent.message = "Guest Agent is running"

        handlers = [] if self.ext_handlers is None else self.ext_handlers.extHandlers
        for handler in handlers:
            try:
                self.report_ext_handler_status(vm_status, handler)
            except ExtensionError as error:
                add_event(AGENT_NAME, version=CURRENT_VERSION, is_success=False, message=ustr(error))

        logger.verbose("Report vm agent status")
        try:
            self.protocol.report_vm_status(vm_status)
            if self.log_report:
                logger.verbose("Successfully reported vm agent status")
        except ProtocolError as error:
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      is_success=False,
                      message="Failed to report vm agent status: {0}".format(error))
Пример #7
0
 def disable(self, reason):
     """
     Turn off cgroup resource monitoring and record why.

     Clears the enabled flag, logs and reports a CGroupsDisabled event carrying `reason`,
     then resets the CPU quota and the cgroups telemetry.
     """
     self._cgroups_enabled = False

     text = "[CGW] Disabling resource usage monitoring. Reason: {0}".format(reason)
     # Logged as INFO for now; in the future it should be logged as WARNING.
     logger.info(text)
     add_event(op=WALAEventOperation.CGroupsDisabled,
               message=text,
               is_success=False,
               log_event=False)

     self.__reset_cpu_quota()
     CGroupsTelemetry.reset()
Пример #8
0
 def log_cgroup_warn(format_string, *args):
     """
     Format a cgroups warning, log it, and report it as a failed CGroupsInfo event.

     :param format_string: str.format() template.
     :param args: positional values substituted into the template.
     """
     warning_text = format_string.format(*args)
     logger.warn(warning_text)
     add_event(op=WALAEventOperation.CGroupsInfo, message=warning_text, is_success=False, log_event=False)
Пример #9
0
    def test_save_event_cleanup(self):
        """
        Adding an event to a directory holding 2000 event files must leave 1000 files behind.

        The oldest surviving file is expected to be 'test event 1002' and the newest one the
        event added by this test.
        """
        base_timestamp = 1491004920536531
        for index in range(2000):
            event_path = os.path.join(self.tmp_dir, '{0}.tld'.format(ustr(base_timestamp + index)))
            with open(event_path, 'w') as handle:
                handle.write('test event {0}'.format(index))

        file_count = len(os.listdir(self.tmp_dir))
        self.assertTrue(file_count == 2000, "{0} events found, 2000 expected".format(file_count))
        add_event('test', message='last event')

        # File names are timestamps, so sorting them orders the events oldest-first.
        remaining = sorted(os.listdir(self.tmp_dir))
        self.assertTrue(len(remaining) == 1000, "{0} events found, 1000 expected".format(len(remaining)))

        with open(os.path.join(self.tmp_dir, remaining[0])) as oldest:
            oldest_text = oldest.read()
            self.assertTrue('test event 1002' in oldest_text)

        with open(os.path.join(self.tmp_dir, remaining[-1])) as newest:
            newest_text = newest.read()
            self.assertTrue('last event' in newest_text)
Пример #10
0
        def __collect_azure_unit_telemetry():
            """
            Report the systemd units whose names start with "azure", together with their slice.

            When `systemctl list-units` yields no such unit, fall back to scanning the output of
            `systemd-cgls` for a line naming azure.slice. A shellutil.CommandError from either
            command is reported as a warning; a failure to query a unit's Slice property is also
            reported as a warning and the slice is shown as "Unknown".
            """
            azure_units = []

            try:
                units = shellutil.run_command(['systemctl', 'list-units', 'azure*', '-all'])
                for line in units.split('\n'):
                    match = re.match(r'\s?(azure[^\s]*)\s?', line, re.IGNORECASE)
                    if match is not None:
                        # Keep the whole line as the unit description for the report below.
                        azure_units.append((match.group(1), line))
            except shellutil.CommandError as command_error:
                _log_cgroup_warning("Failed to list systemd units: {0}", ustr(command_error))

            for unit_name, unit_description in azure_units:
                unit_slice = "Unknown"
                try:
                    unit_slice = systemd.get_unit_property(unit_name, "Slice")
                except Exception as exception:
                    _log_cgroup_warning("Failed to query Slice for {0}: {1}", unit_name, ustr(exception))

                _log_cgroup_info("Found an Azure unit under slice {0}: {1}", unit_slice, unit_description)

            if not azure_units:
                try:
                    cgroups = shellutil.run_command('systemd-cgls')
                    for line in cgroups.split('\n'):
                        # The line must start with one or more characters outside \x00-\xff,
                        # immediately followed by "azure.slice".
                        if re.match(r'[^\x00-\xff]+azure\.slice\s*', line, re.UNICODE):
                            logger.info(ustr("Found a cgroup for azure.slice\n{0}").format(cgroups))
                            # Don't add the output of systemd-cgls to the telemetry, since currently it does not support Unicode
                            add_event(op=WALAEventOperation.CGroupsInfo, message="Found a cgroup for azure.slice")
                except shellutil.CommandError as command_error:
                    # Name the command that actually failed (this is systemd-cgls, not the unit listing).
                    _log_cgroup_warning("Failed to list cgroups with systemd-cgls: {0}", ustr(command_error))
Пример #11
0
    def send_telemetry_metrics(self):
        """
        Report the tracked cgroup performance metrics once per CGROUP_TELEMETRY_REPORTING_PERIOD.

        The send_telemetry_metrics would soon be removed in favor of sending performance metrics directly.

        Any exception is logged as a warning so that a reporting problem cannot take down the
        whole monitor thread.
        """
        now = datetime.datetime.utcnow()

        try:
            # First call: start the reporting period now.
            if not self.last_cgroup_report_telemetry:
                self.last_cgroup_report_telemetry = now

            next_report = self.last_cgroup_report_telemetry + MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD
            if now < next_report:
                return

            metrics = CGroupsTelemetry.report_all_tracked()
            self.last_cgroup_report_telemetry = now
            if not metrics:
                return

            payload = generate_extension_metrics_telemetry_dictionary(schema_version=1.0,
                                                                      performance_metrics=metrics)
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionMetricsData,
                      is_success=True,
                      message=ustr(payload),
                      log_event=False)
        except Exception as error:
            logger.warn("Could not report all the tracked telemetry due to {0}", ustr(error))
Пример #12
0
    def _download(self):
        for uri in self.pkg.uris:
            if self._fetch(uri.uri):
                break
            else:
                if self.host is not None and self.host.ensure_initialized():
                    logger.warn(
                        "Download unsuccessful, falling back to host plugin")
                    uri, headers = self.host.get_artifact_request(
                        uri.uri, self.host.manifest_uri)
                    if uri is not None \
                            and headers is not None \
                            and self._fetch(uri, headers=headers):
                        break
                else:
                    logger.warn(
                        "Download unsuccessful, host plugin not available")

        if not os.path.isfile(self.get_agent_pkg_path()):
            msg = u"Unable to download Agent {0} from any URI".format(
                self.name)
            add_event(AGENT_NAME,
                      op=WALAEventOperation.Download,
                      version=CURRENT_VERSION,
                      is_success=False,
                      message=msg)
            raise UpdateError(msg)
        return
Пример #13
0
    def send_extension_healthstore_heartbeat(self):
        """
        Send the extension health observations to the health store, at most once per
        EXTENSION_HEALTHSTORE_HEARTBEAT_PERIOD.

        An exception raised while reporting is converted into a failed
        ExtensionHeathstoreHeartbeat event instead of propagating.
        """
        period = MonitorHandler.EXTENSION_HEALTHSTORE_HEARTBEAT_PERIOD

        # First call: back-date the timestamp by one period so the heartbeat is due right away.
        if self.last_extension_healthstore_heartbeat is None:
            self.last_extension_healthstore_heartbeat = datetime.datetime.utcnow() - period

        if datetime.datetime.utcnow() < (self.last_extension_healthstore_heartbeat + period):
            return

        try:
            self.health_service.report_extension_health_observations()
        except Exception as error:
            failure_text = "Exception sending extension healthstore heartbeat: {0}".format(ustr(error))
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionHeathstoreHeartbeat,
                      is_success=False,
                      message=failure_text,
                      log_event=False)

        # The timestamp advances whether or not the report succeeded.
        self.last_extension_healthstore_heartbeat = datetime.datetime.utcnow()
    def get_status_file_path(self, extension=None):
        """
        Return the sequence number to use and the path of the matching status file.

        The largest sequence number found by get_largest_seq_no() is the default. Issue 1116: when
        `extension` carries a sequenceNumber that parses as an integer, that value is used instead,
        and a SequenceNumberMismatch event is reported if the two differ. A sequenceNumber that
        raises ValueError when parsed is logged as an error and the default is kept.

        :param extension: optional object with a `sequenceNumber` attribute.
        :return: (seq_no, path); path is None when seq_no is not greater than -1.
        """
        seq_no = self.get_largest_seq_no()

        use_goal_state = extension is not None and extension.sequenceNumber is not None
        if use_goal_state:
            try:
                goal_state_seq_no = int(extension.sequenceNumber)

                if goal_state_seq_no != seq_no:
                    mismatch = "Goal state: {0}, disk: {1}".format(goal_state_seq_no, seq_no)
                    add_event(AGENT_NAME,
                              version=CURRENT_VERSION,
                              op=WALAEventOperation.SequenceNumberMismatch,
                              is_success=False,
                              message=mismatch,
                              log_event=False)

                seq_no = goal_state_seq_no
            except ValueError:
                logger.error('Sequence number [{0}] does not appear to be valid'.format(extension.sequenceNumber))

        if seq_no > -1:
            status_path = os.path.join(self.get_status_dir(), "{0}.status".format(seq_no))
        else:
            status_path = None

        return seq_no, status_path
Пример #15
0
    def _ensure_downloaded(self):
        """
        Download and unpack this agent unless it is already marked as downloaded.

        Raises UpdateError when the agent has no package to download from. On success an
        Install event is reported with this agent's version.
        """
        agent_name = self.name
        logger.verbose(u"Ensuring Agent {0} is downloaded", agent_name)

        if self.is_downloaded:
            logger.verbose(u"Agent {0} was previously downloaded - skipping download", agent_name)
            return

        if self.pkg is None:
            raise UpdateError(u"Agent {0} is missing package and download URIs".format(self.name))

        self._download()
        self._unpack()

        success_text = u"Agent {0} downloaded successfully".format(self.name)
        logger.verbose(success_text)
        add_event(AGENT_NAME,
                  version=self.version,
                  op=WALAEventOperation.Install,
                  is_success=True,
                  message=success_text)
Пример #16
0
    def test_save_event(self):
        """Adding one event creates exactly one file in tmp_dir, and its name ends in '.tld'."""
        add_event('test', message='test event')

        created = os.listdir(self.tmp_dir)
        self.assertTrue(len(created) == 1)

        # Check the extension of every file that was created.
        for event_file in created:
            self.assertEqual(".tld", event_file[-4:])
Пример #17
0
    def send_host_plugin_heartbeat(self):
        """
        Probe HostGAPlugin and report its health, at most once per HOST_PLUGIN_HEARTBEAT_PERIOD.

        A truthy get_health() resets the host plugin error state and anything else increments it;
        the signal sent to the health service is healthy while the error state is not triggered.
        Exceptions are reported as a failed HostPluginHeartbeat event and do not propagate.
        """
        heartbeat_period = MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD

        if self.last_host_plugin_heartbeat is None:
            # Pretend the previous heartbeat happened one period ago so this call sends one.
            self.last_host_plugin_heartbeat = datetime.datetime.utcnow() - heartbeat_period

        heartbeat_due = datetime.datetime.utcnow() >= (self.last_host_plugin_heartbeat + heartbeat_period)
        if not heartbeat_due:
            return

        try:
            host_plugin = self.protocol.client.get_host_plugin()
            host_plugin.ensure_initialized()

            errorstate = self.host_plugin_errorstate
            if host_plugin.get_health():
                errorstate.reset()
            else:
                errorstate.incr()

            is_healthy = self.host_plugin_errorstate.is_triggered() is False
            logger.verbose("HostGAPlugin health: {0}", is_healthy)
            self.health_service.report_host_plugin_heartbeat(is_healthy)
        except Exception as exc:
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.HostPluginHeartbeat,
                      is_success=False,
                      message="Exception sending host plugin heartbeat: {0}".format(ustr(exc)),
                      log_event=False)

        # Recorded on both the success and the failure path.
        self.last_host_plugin_heartbeat = datetime.datetime.utcnow()
Пример #18
0
    def report_ext_handlers_status(self):
        """
        Collect the status of every extension handler into a VMStatus and report it.

        ExtensionError from a handler and ProtocolError from the report are both turned into
        failed ExtensionProcessing events instead of propagating.
        """
        vm_status = VMStatus(status="Ready", message="Guest Agent is running")

        handlers = [] if self.ext_handlers is None else self.ext_handlers.extHandlers
        for handler in handlers:
            try:
                self.report_ext_handler_status(vm_status, handler)
            except ExtensionError as error:
                add_event(AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.ExtensionProcessing,
                          is_success=False,
                          message=ustr(error))

        logger.verbose("Report vm agent status")
        try:
            self.protocol.report_vm_status(vm_status)
            if self.log_report:
                logger.verbose("Completed vm agent status report")
        except ProtocolError as error:
            failure_text = "Failed to report vm agent status: {0}".format(error)
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=failure_text)
Пример #19
0
    def send_imds_heartbeat(self):
        """
        Validate IMDS once and report the resulting health signal to the health service.

        A healthy validation resets the IMDS error state and an unhealthy one increments it; the
        reported signal is healthy while the error state is not triggered. This method performs
        no period check of its own. Exceptions are reported as a failed ImdsHeartbeat event.
        """
        try:
            currently_healthy, response = self.imds_client.validate()

            errorstate = self.imds_errorstate
            if currently_healthy:
                errorstate.reset()
            else:
                errorstate.incr()

            is_healthy = self.imds_errorstate.is_triggered() is False
            logger.verbose("IMDS health: {0} [{1}]", is_healthy, response)
            self.health_service.report_imds_status(is_healthy, response)
        except Exception as error:
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ImdsHeartbeat,
                      is_success=False,
                      message="Exception sending imds heartbeat: {0}".format(ustr(error)),
                      log_event=False)
Пример #20
0
    def test_collect_and_send_with_call_wireserver_returns_http_error(
            self, mock_lib_dir, *args):
        """
        When the telemetry POST yields an HttpError, exactly one warning is logged and the
        event directory ends up empty.
        """
        mock_lib_dir.return_value = self.lib_dir
        fileutil.mkdir(self.event_dir)
        add_event(name="MonitorTests", op=WALAEventOperation.HeartBeat, is_success=True, message="Test heartbeat")

        def fail_telemetry_posts(url, _, **__):
            # The HttpError is returned, not raised.
            # NOTE(review): presumably the mock wire protocol treats a returned exception as a
            # failed request - confirm against its implementation.
            return HttpError("A test exception") if self.is_telemetry_request(url) else None

        with _create_monitor_handler(enabled_operations=["collect_and_send_events"]) as handler:
            handler.get_mock_wire_protocol().set_http_handlers(http_post_handler=fail_telemetry_posts)

            with patch("azurelinuxagent.common.logger.warn") as warn_spy:
                handler.run_and_wait()

                self.assertEqual(1, warn_spy.call_count)
                self.assertEqual(0, len(os.listdir(self.event_dir)))
Пример #21
0
    def _download(self):
        for uri in self.pkg.uris:
            if not HostPluginProtocol.is_default_channel() and self._fetch(
                    uri.uri):
                break
            elif self.host is not None and self.host.ensure_initialized():
                if not HostPluginProtocol.is_default_channel():
                    logger.warn(
                        "Download unsuccessful, falling back to host plugin")
                else:
                    logger.verbose("Using host plugin as default channel")

                uri, headers = self.host.get_artifact_request(
                    uri.uri, self.host.manifest_uri)
                if self._fetch(uri, headers=headers):
                    if not HostPluginProtocol.is_default_channel():
                        logger.verbose(
                            "Setting host plugin as default channel")
                        HostPluginProtocol.set_default_channel(True)
                    break
                else:
                    logger.warn("Host plugin download unsuccessful")
            else:
                logger.error("No download channels available")

        if not os.path.isfile(self.get_agent_pkg_path()):
            msg = u"Unable to download Agent {0} from any URI".format(
                self.name)
            add_event(AGENT_NAME,
                      op=WALAEventOperation.Download,
                      version=CURRENT_VERSION,
                      is_success=False,
                      message=msg)
            raise UpdateError(msg)
        return
Пример #22
0
    def run(self, child_args=None):
        """
        Log version information, prepare the environment and cgroups, then keep the daemon running.

        While self.running is set, any exception that escapes daemon() is reported as an
        UnhandledError event (with its traceback) and the daemon is restarted after 15 seconds.

        :param child_args: passed through to daemon().
        """
        logger.info("{0} Version:{1}", AGENT_LONG_NAME, AGENT_VERSION)
        logger.info("OS: {0} {1}", DISTRO_NAME, DISTRO_VERSION)
        logger.info("Python: {0}.{1}.{2}", PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO)

        self.check_pid()
        self.initialize_environment()

        CGroups.setup()

        # If FIPS is enabled, set the OpenSSL environment variable.
        # Subprocesses inherit the current environment.
        if conf.get_fips_enabled():
            os.environ[OPENSSL_FIPS_ENVIRONMENT] = '1'

        while self.running:
            try:
                self.daemon(child_args)
            except Exception:
                trace_text = traceback.format_exc()
                add_event(name=AGENT_NAME,
                          is_success=False,
                          message=ustr(trace_text),
                          op=WALAEventOperation.UnhandledError)
                logger.warn("Daemon ended with exception -- Sleep 15 seconds and restart daemon")
                time.sleep(15)
Пример #23
0
 def report_event(self, message="", is_success=True):
     """
     Report a telemetry event for this extension handler's current operation.

     :param message: event text.
     :param is_success: whether the operation succeeded.
     """
     handler_version = self.ext_handler.properties.version
     handler_name = self.ext_handler.name
     add_event(name=handler_name,
               version=handler_version,
               message=message,
               op=self.operation,
               is_success=is_success)
Пример #24
0
    def report_ext_handlers_status(self):
        """
        Assemble the VM status (agent plus each extension handler) and report it via the protocol.

        Failures are recorded as unsuccessful telemetry events: ExtensionError per handler,
        ProtocolError for the final report.
        """
        vm_status = VMStatus()
        vm_status.vmAgent.version = str(CURRENT_VERSION)
        vm_status.vmAgent.status = "Ready"
        vm_status.vmAgent.message = "Guest Agent is running"

        if self.ext_handlers is None:
            known_handlers = []
        else:
            known_handlers = self.ext_handlers.extHandlers

        for known_handler in known_handlers:
            try:
                self.report_ext_handler_status(vm_status, known_handler)
            except ExtensionError as exc:
                add_event(AGENT_NAME, version=CURRENT_VERSION, is_success=False, message=ustr(exc))

        logger.verbose("Report vm agent status")
        try:
            self.protocol.report_vm_status(vm_status)
            if self.log_report:
                logger.verbose("Successfully reported vm agent status")
        except ProtocolError as exc:
            failure_text = "Failed to report vm agent status: {0}".format(exc)
            add_event(AGENT_NAME, version=CURRENT_VERSION, is_success=False, message=failure_text)
Пример #25
0
    def run(self, child_args=None):
        """
        Start the agent: log version details, initialize the environment and cgroups, then loop
        on daemon() for as long as self.running is set.

        An exception escaping daemon() is reported as an UnhandledError event carrying the
        traceback; the loop then sleeps 15 seconds before restarting the daemon.

        :param child_args: forwarded to daemon().
        """
        version_lines = (
            ("{0} Version:{1}", (AGENT_LONG_NAME, AGENT_VERSION)),
            ("OS: {0} {1}", (DISTRO_NAME, DISTRO_VERSION)),
            ("Python: {0}.{1}.{2}", (PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO)),
        )
        for template, values in version_lines:
            logger.info(template, *values)

        self.check_pid()
        self.initialize_environment()

        CGroups.setup()

        # With FIPS enabled, set the OpenSSL environment variable; subprocesses inherit it.
        if conf.get_fips_enabled():
            os.environ[OPENSSL_FIPS_ENVIRONMENT] = '1'

        while self.running:
            try:
                self.daemon(child_args)
            except Exception:
                add_event(name=AGENT_NAME,
                          is_success=False,
                          message=ustr(traceback.format_exc()),
                          op=WALAEventOperation.UnhandledError)
                logger.warn("Daemon ended with exception -- Sleep 15 seconds and restart daemon")
                time.sleep(15)
Пример #26
0
 def report_event(self, message, is_success=False, duration=0,
                  operation=WALAEventOperation.Provision):
     """
     Report an agent telemetry event; the operation defaults to Provision.

     :param message: event text.
     :param is_success: whether the operation succeeded.
     :param duration: value passed through as the event's duration.
     :param operation: value passed as the event's `op`.
     """
     event_fields = {
         "name": AGENT_NAME,
         "message": message,
         "duration": duration,
         "is_success": is_success,
         "op": operation,
     }
     add_event(**event_fields)
Пример #27
0
    def _evaluate_deployments(self):
        agents = []
        fSuccess = False
        msg = ""
        try:
            agents = [
                a for a in self.agents
                if a.in_safe_deployment_mode and not a.safe_deploy.is_deployed
            ]

            self._enable_agents(agents)

            for blacklist in [a.safe_deploy.blacklisted for a in agents]:
                self._blacklist_agents(blacklist)

            for agent in agents:
                agent.mark_deployed()

            fSuccess = True

        except Exception as e:
            msg = "Exception evaluating agents for safe deployment: {0}".format(
                e)
            logger.warn(msg)

        if len(agents) > 0:
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.Deploy,
                      is_success=fSuccess,
                      message=msg)
Пример #28
0
    def get_status_file_path(self, extension=None):
        """
        Pick the sequence number for the status file and build the file's path.

        Starts from get_largest_seq_no(). Issue 1116: if `extension` has a sequenceNumber that
        int() accepts, that number wins, and a SequenceNumberMismatch event is reported when it
        differs from the starting value. A ValueError while parsing is logged and the starting
        value is kept.

        :param extension: optional object with a `sequenceNumber` attribute.
        :return: tuple (seq_no, path); path stays None unless seq_no is greater than -1.
        """
        status_path = None
        seq_no = self.get_largest_seq_no()

        if extension is not None and extension.sequenceNumber is not None:
            try:
                from_goal_state = int(extension.sequenceNumber)

                if from_goal_state != seq_no:
                    add_event(AGENT_NAME,
                              version=CURRENT_VERSION,
                              op=WALAEventOperation.SequenceNumberMismatch,
                              is_success=False,
                              message="Goal state: {0}, disk: {1}".format(from_goal_state, seq_no),
                              log_event=False)

                seq_no = from_goal_state
            except ValueError:
                invalid_text = 'Sequence number [{0}] does not appear to be valid'.format(extension.sequenceNumber)
                logger.error(invalid_text)

        if seq_no > -1:
            file_name = "{0}.status".format(seq_no)
            status_path = os.path.join(self.get_status_dir(), file_name)

        return seq_no, status_path
Пример #29
0
 def report_event(self, message, is_success=False, duration=0,
                  operation=WALAEventOperation.Provision):
     """
     Send a telemetry event under the agent's name (operation defaults to Provision).

     :param message: event text.
     :param is_success: whether the operation succeeded.
     :param duration: value passed through as the event's duration.
     :param operation: value passed as the event's `op`.
     """
     add_event(name=AGENT_NAME, message=message, duration=duration, is_success=is_success, op=operation)
Пример #30
0
    def _download(self):
        package = None

        for uri in self.pkg.uris:
            try:
                resp = restutil.http_get(uri.uri, chk_proxy=True)
                if resp.status == restutil.httpclient.OK:
                    package = resp.read()
                    fileutil.write_file(self.get_agent_pkg_path(),
                                        bytearray(package),
                                        asbin=True)
                    logger.info(u"Agent {0} downloaded from {1}", self.name,
                                uri.uri)
                    break
            except restutil.HttpError as e:
                logger.warn(u"Agent {0} download from {1} failed", self.name,
                            uri.uri)

        if not os.path.isfile(self.get_agent_pkg_path()):
            msg = u"Unable to download Agent {0} from any URI".format(
                self.name)
            add_event(AGENT_NAME,
                      op=WALAEventOperation.Download,
                      version=CURRENT_VERSION,
                      is_success=False,
                      message=msg)
            raise UpdateError(msg)
        return
Пример #31
0
    def run(self, child_args=None):
        """
        Log version information, prepare the environment and keep the daemon running.

        While self.running is set, daemon(child_args) is invoked; any exception it raises is
        reported as an UnhandledError event and the daemon is restarted after 15 seconds.
        """
        #
        # The Container ID in telemetry events is retrieved from the goal state. We can fetch the goal state
        # only after protocol detection, which is done during provisioning.
        #
        # Be aware that telemetry events emitted before that will not include the Container ID.
        #
        logger.info("{0} Version:{1}", AGENT_LONG_NAME, AGENT_VERSION)
        logger.info("OS: {0} {1}", DISTRO_NAME, DISTRO_VERSION)
        logger.info("Python: {0}.{1}.{2}", PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO)

        self.check_pid()
        self.initialize_environment()

        # If FIPS is enabled, set the OpenSSL environment variable
        # Note:
        # -- Subprocesses inherit the current environment
        if conf.get_fips_enabled():
            os.environ[OPENSSL_FIPS_ENVIRONMENT] = '1'

        while self.running:
            try:
                self.daemon(child_args)
            except Exception:
                stack_trace = traceback.format_exc()
                add_event(name=AGENT_NAME,
                          is_success=False,
                          message=ustr(stack_trace),
                          op=WALAEventOperation.UnhandledError)
                logger.warn("Daemon ended with exception -- Sleep 15 seconds and restart daemon")
                time.sleep(15)
Пример #32
0
    def test_should_log_errors_if_failed_operation_and_not_empty_event_dir(
            self, mock_logger_info, mock_logger_warn, mock_logger_error,
            mock_reporter):
        """
        A failed add_event with an event directory set must log exactly one error.

        The info and warn loggers must stay untouched, and the arguments after the
        format string must carry the event name, operation, message and duration.
        """
        mock_reporter.event_dir = "dummy"

        with patch("azurelinuxagent.common.event.should_emit_event",
                   return_value=True) as mock_should_emit_event:
            with patch("azurelinuxagent.common.event.mark_event_status"):
                with patch(
                        "azurelinuxagent.common.event.EventLogger._add_event"):
                    add_event("dummy name",
                              version=CURRENT_VERSION,
                              op=WALAEventOperation.Download,
                              is_success=False,
                              message="dummy event message")

                    # assertEqual instead of the deprecated assertEquals alias,
                    # which no longer exists in Python 3.12
                    self.assertEqual(1, mock_should_emit_event.call_count)
                    self.assertEqual(1, mock_logger_error.call_count)
                    self.assertEqual(0, mock_logger_warn.call_count)
                    self.assertEqual(0, mock_logger_info.call_count)

                    # args[0] is skipped; only the values that follow it are compared
                    args = mock_logger_error.call_args[0]
                    self.assertEqual(
                        ('dummy name', 'Download', 'dummy event message', 0),
                        args[1:])
Пример #33
0
    def test_save_event_message_with_non_ascii_characters(self):
        """
        An event whose message is built from non-ASCII stdout/stderr fixtures must be
        collected with 8 parameters and with its Name and Message intact.
        """
        fixture_dir = os.path.join(
            data_dir, "events", "collect_and_send_extension_stdout_stderror")
        stdout_fixture = os.path.join(fixture_dir, "dummy_stdout_with_non_ascii_characters")
        stderr_fixture = os.path.join(fixture_dir, "dummy_stderr_with_non_ascii_characters")

        with open(stdout_fixture, mode="r+b") as stdout:
            with open(stderr_fixture, mode="r+b") as stderr:
                msg = read_output(stdout, stderr)

        duration = elapsed_milliseconds(datetime.utcnow())

        # Blank lines are dropped from the captured output before it is sent
        non_empty_lines = [line for line in msg.split('\n') if line != ""]
        log_msg = "{0}\n{1}".format("DummyCmd", "\n".join(non_empty_lines))

        add_event('test_extension', message=log_msg, duration=duration)

        expected_values = {"Name": "test_extension", "Message": log_msg}

        for tld_file in os.listdir(self.tmp_dir):
            event_path = os.path.join(self.tmp_dir, tld_file)
            event_json = json.loads(MonitorHandler.collect_event(event_path))

            self.assertEqual(len(event_json["parameters"]), 8)

            for parameter in event_json["parameters"]:
                if parameter["name"] in expected_values:
                    self.assertEqual(parameter["value"], expected_values[parameter["name"]])
Пример #34
0
    def _download(self):
        for uri in self.pkg.uris:
            if not HostPluginProtocol.is_default_channel() and self._fetch(uri.uri):
                break
            elif self.host is not None and self.host.ensure_initialized():
                if not HostPluginProtocol.is_default_channel():
                    logger.warn("Download unsuccessful, falling back to host plugin")
                else:
                    logger.verbose("Using host plugin as default channel")

                uri, headers = self.host.get_artifact_request(uri.uri, self.host.manifest_uri)
                if self._fetch(uri, headers=headers):
                    if not HostPluginProtocol.is_default_channel():
                        logger.verbose("Setting host plugin as default channel")
                        HostPluginProtocol.set_default_channel(True)
                    break
                else:
                    logger.warn("Host plugin download unsuccessful")
            else:
                logger.error("No download channels available")

        if not os.path.isfile(self.get_agent_pkg_path()):
            msg = u"Unable to download Agent {0} from any URI".format(self.name)
            add_event(
                AGENT_NAME,
                op=WALAEventOperation.Download,
                version=CURRENT_VERSION,
                is_success=False,
                message=msg)
            raise UpdateError(msg)
        return
Пример #35
0
    def __log_network_setup_service_logs(self):
        """
        Log the journal of the network setup service for the current boot and report it.

        Whatever was logged (the journal output, or the reason it could not be fetched) is
        sent as a PersistFirewallRules event whose success flag reflects whether the service
        was found to have failed.
        """
        # Get logs from journalctl - https://www.freedesktop.org/software/systemd/man/journalctl.html
        journalctl_cmd = ["journalctl", "-u", self._network_setup_service_name, "-b", "--utc"]
        service_failed = self.__verify_network_setup_service_failed()

        try:
            journal = shellutil.run_command(journalctl_cmd)
            report = ustr("Logs from the {0} since system boot:\n {1}").format(
                self._network_setup_service_name, journal)
            logger.info(report)
        except CommandError as error:
            report = "Unable to fetch service logs, Command: {0} failed with ExitCode: {1}\nStdout: {2}\nStderr: {3}".format(
                ' '.join(journalctl_cmd), error.returncode, error.stdout, error.stderr)
            logger.warn(report)
        except Exception as e:
            report = "Ran into unexpected error when getting logs for {0} service. Error: {1}".format(
                self._network_setup_service_name, textutil.format_exception(e))
            logger.warn(report)

        # Log service status and logs if we can fetch them from journalctl and send it to Kusto,
        # else just log the error of the failure of fetching logs
        add_event(op=WALAEventOperation.PersistFirewallRules,
                  is_success=(not service_failed),
                  message=report,
                  log_event=False)
Пример #36
0
        def __init__(self):
            """
            Ensures the cgroups file system is mounted and selects the correct API to interact with it.

            After construction, _enabled tells whether cgroups can be used and _cgroups_api is
            either the selected API or None. The outcome is logged and reported as an
            InitializeCGroups event.
            """
            osutil = get_osutil()

            self._cgroups_supported = osutil.is_cgroups_supported()

            # Assign the attribute up front: if mounting or API creation fails below it would
            # otherwise never be set, and later reads would raise AttributeError.
            self._cgroups_api = None

            if self._cgroups_supported:
                self._enabled = True
                try:
                    osutil.mount_cgroups()
                    self._cgroups_api = CGroupsApi.create()
                    status = "The cgroup filesystem is ready to use"
                except Exception as e:
                    status = ustr(e)
                    self._enabled = False
            else:
                self._enabled = False
                status = "Cgroups are not supported by the platform"

            logger.info("CGroups Status: {0}".format(status))

            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.InitializeCGroups,
                      is_success=self._enabled,
                      message=status,
                      log_event=False)
Пример #37
0
    def test_save_event_message_with_non_ascii_characters(self):
        """
        An event built from non-ASCII stdout/stderr fixtures must be collected with exactly
        the 15 expected parameters, each holding either the supplied or the default value.
        """
        test_data_dir = os.path.join(data_dir, "events", "collect_and_send_extension_stdout_stderror")
        stdout_fixture = os.path.join(test_data_dir, "dummy_stdout_with_non_ascii_characters")
        stderr_fixture = os.path.join(test_data_dir, "dummy_stderr_with_non_ascii_characters")

        with open(stdout_fixture, mode="r+b") as stdout:
            with open(stderr_fixture, mode="r+b") as stderr:
                msg = read_output(stdout, stderr)

        duration = elapsed_milliseconds(datetime.utcnow())
        non_empty_lines = [line for line in msg.split('\n') if line != ""]
        log_msg = "{0}\n{1}".format("DummyCmd", "\n".join(non_empty_lines))

        # Pin the timestamp, process id and thread name so the emitted values are predictable
        with patch("azurelinuxagent.common.event.datetime") as patch_datetime:
            patch_datetime.utcnow = Mock(return_value=datetime.strptime("2019-01-01 01:30:00",
                                                                        '%Y-%m-%d %H:%M:%S'))
            with patch('os.getpid', return_value=42):
                with patch("threading.Thread.getName", return_value="HelloWorldTask"):
                    add_event('test_extension', message=log_msg, duration=duration)

        # The contents passed above, plus the default values of everything that was not passed in.
        expected_values = {
            "Name": "test_extension",
            "Message": log_msg,
            "Version": str(CURRENT_VERSION),
            "IsInternal": False,
            "Operation": 'Unknown',
            "OperationSuccess": True,
            "Duration": 0,
            "ExtensionType": '',
            "ContainerId": 'UNINITIALIZED',
            "OpcodeName": '2019-01-01 01:30:00',
            "EventTid": threading.current_thread().ident,
            "EventPid": 42,
            "TaskName": 'HelloWorldTask',
            "KeywordName": '',
            "GAVersion": str(CURRENT_AGENT),
        }

        for tld_file in os.listdir(self.tmp_dir):
            event_str = MonitorHandler.collect_event(os.path.join(self.tmp_dir, tld_file))
            event_json = json.loads(event_str)

            self.assertEqual(len(event_json["parameters"]), 15)

            for parameter in event_json["parameters"]:
                field_name = parameter["name"]
                if field_name in expected_values:
                    self.assertEqual(parameter["value"], expected_values[field_name])
                else:
                    self.assertFalse(True, "Contains a field outside the defaults expected. Field Name: {0}".
                                     format(field_name))
Пример #38
0
    def send_cgroup_telemetry(self):
        """
        Report the metrics of all tracked cgroups, at most once every CGROUP_TELEMETRY_PERIOD.

        Metrics with a positive value are sent with report_metric; a "Memory" or "Process"
        metric that reaches its threshold additionally produces a CGroupsLimitsCrossed event.
        Afterwards the tracked cgroups are updated from the current extension handlers.
        Failures in either step are logged and do not propagate.
        """
        if self.last_cgroup_telemetry is None:
            self.last_cgroup_telemetry = datetime.datetime.utcnow()

        # The period is measured from the previous cgroup report. This check used to read
        # last_telemetry_heartbeat, an attribute this method neither initializes nor updates.
        if datetime.datetime.utcnow() >= (
                self.last_cgroup_telemetry +
                MonitorHandler.CGROUP_TELEMETRY_PERIOD):
            try:
                metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked(
                )
                for cgroup_name, metrics in metric_reported.items():
                    thresholds = metric_threshold[cgroup_name]

                    for metric_group, metric_name, value in metrics:
                        if value > 0:
                            report_metric(metric_group, metric_name,
                                          cgroup_name, value)

                        if metric_group == "Memory":
                            if value >= thresholds["memory"]:
                                msg = "CGroup {0}: Crossed the Memory Threshold. Current Value:{1}, Threshold:{2}.".format(
                                    cgroup_name, value, thresholds["memory"])
                                add_event(
                                    name=AGENT_NAME,
                                    version=CURRENT_VERSION,
                                    op=WALAEventOperation.CGroupsLimitsCrossed,
                                    is_success=True,
                                    message=msg,
                                    log_event=True)

                        if metric_group == "Process":
                            if value >= thresholds["cpu"]:
                                msg = "CGroup {0}: Crossed the Processor Threshold. Current Value:{1}, Threshold:{2}.".format(
                                    cgroup_name, value, thresholds["cpu"])
                                add_event(
                                    name=AGENT_NAME,
                                    version=CURRENT_VERSION,
                                    op=WALAEventOperation.CGroupsLimitsCrossed,
                                    is_success=True,
                                    message=msg,
                                    log_event=True)

            except Exception as e:
                logger.warn(
                    "Monitor: failed to collect cgroups performance metrics: {0}",
                    ustr(e))
                logger.verbose(traceback.format_exc())

            # Look for extension cgroups we're not already tracking and track them
            try:
                CGroupsTelemetry.update_tracked(
                    self.protocol.client.get_current_handlers())
            except Exception as e:
                logger.warn(
                    "Monitor: failed to update cgroups tracked extensions: {0}",
                    ustr(e))
                logger.verbose(traceback.format_exc())

            # Restart the period even when collection or tracking failed
            self.last_cgroup_telemetry = datetime.datetime.utcnow()
Пример #39
0
    def send_cgroup_telemetry(self):
        """
        Report the metrics of all tracked cgroups, at most once every CGROUP_TELEMETRY_PERIOD.

        Metrics with a positive value are sent with report_metric; a "Memory" or "Process"
        metric that reaches its limit is additionally logged as a warning and reported as a
        CGroupsLimitsCrossed event. Afterwards the tracked cgroups are updated from the
        current extension handlers. Failures in either step are logged and do not propagate.
        """
        if self.last_cgroup_telemetry is None:
            self.last_cgroup_telemetry = datetime.datetime.utcnow()

        # The period is measured from the previous cgroup report. This check used to read
        # last_telemetry_heartbeat, an attribute this method neither initializes nor updates.
        if datetime.datetime.utcnow() >= (self.last_cgroup_telemetry + MonitorHandler.CGROUP_TELEMETRY_PERIOD):
            try:
                metric_reported, metric_threshold = CGroupsTelemetry.collect_all_tracked()
                for cgroup_name, metrics in metric_reported.items():
                    thresholds = metric_threshold[cgroup_name]

                    for metric_group, metric_name, value in metrics:
                        if value > 0:
                            report_metric(metric_group, metric_name, cgroup_name, value)

                        if metric_group == "Memory":
                            # Memory is collected in bytes, and limit is set in megabytes.
                            if value >= CGroups._format_memory_value('megabytes', thresholds.memory_limit):
                                msg = "CGroup {0}: Crossed the Memory Threshold. " \
                                      "Current Value: {1} bytes, Threshold: {2} megabytes." \
                                       .format(cgroup_name, value, thresholds.memory_limit)

                                logger.warn(msg)
                                add_event(name=AGENT_NAME,
                                          version=CURRENT_VERSION,
                                          op=WALAEventOperation.CGroupsLimitsCrossed,
                                          is_success=True,
                                          message=msg,
                                          log_event=True)

                        if metric_group == "Process":
                            if value >= thresholds.cpu_limit:
                                msg = "CGroup {0}: Crossed the Processor Threshold. " \
                                      "Current Value: {1}, Threshold: {2}." \
                                       .format(cgroup_name, value, thresholds.cpu_limit)

                                logger.warn(msg)
                                add_event(name=AGENT_NAME,
                                          version=CURRENT_VERSION,
                                          op=WALAEventOperation.CGroupsLimitsCrossed,
                                          is_success=True,
                                          message=msg,
                                          log_event=True)

            except Exception as e:
                logger.warn("Monitor: failed to collect cgroups performance metrics: {0}", ustr(e))
                logger.verbose(traceback.format_exc())

            # Look for extension cgroups we're not already tracking and track them
            try:
                CGroupsTelemetry.update_tracked(self.protocol.client.get_current_handlers())
            except Exception as e:
                logger.warn("Monitor: failed to update cgroups tracked extensions: {0}", ustr(e))
                logger.verbose(traceback.format_exc())

            # Restart the period even when collection or tracking failed
            self.last_cgroup_telemetry = datetime.datetime.utcnow()
Пример #40
0
 def daemon(self):
     """
     Loop forever: emit a HeartBeat event every 30 minutes and, once a minute,
     collect and send events. A failure while sending is logged and the loop continues.
     """
     heartbeat_interval = datetime.timedelta(minutes=30)
     # datetime.min makes the first iteration emit a heartbeat
     previous_heartbeat = datetime.datetime.min

     while True:
         since_heartbeat = datetime.datetime.now() - previous_heartbeat
         if since_heartbeat > heartbeat_interval:
             previous_heartbeat = datetime.datetime.now()
             add_event(op=WALAEventOperation.HeartBeat, name=CURRENT_AGENT, version=CURRENT_VERSION, is_success=True)

         try:
             self.collect_and_send_events()
         except Exception as e:
             logger.warn("Failed to send events: {0}", e)

         time.sleep(60)
Пример #41
0
 def _report_failures(self):
     """
     Send one failed HealthObservation event per unhealthy observation.

     The event message is the JSON form of the observation. Any error raised while
     reporting is logged at verbose level and not propagated.
     """
     try:
         logger.verbose("HealthService: report failures as telemetry")
         from azurelinuxagent.common.event import add_event, WALAEventOperation

         for observation in self.observations:
             if observation.is_healthy:
                 continue

             add_event(AGENT_NAME,
                       version=CURRENT_VERSION,
                       op=WALAEventOperation.HealthObservation,
                       is_success=False,
                       message=json.dumps(observation.as_obj))
     except Exception as e:
         logger.verbose("HealthService: could not report failures: {0}".format(ustr(e)))
Пример #42
0
    def _emit_restart_event(self):
        if not self._is_clean_start:
            msg = u"{0} did not terminate cleanly".format(CURRENT_AGENT)
            logger.info(msg)
            add_event(
                AGENT_NAME,
                version=CURRENT_VERSION,
                op=WALAEventOperation.Restart,
                is_success=False,
                message=msg)

        self._set_sentinal() 
        return
Пример #43
0
    def setup(suppress_process_add=False):
        """
        Prepare cgroups for the agent. Only needs to be called once, and should be called
        from the -daemon instance of the agent:
            Mount the cgroup fs if necessary
            Create wrapper cgroups for agent-plus-extensions and set limits on them
            Add this process to the "agent" cgroup, if required
        Actual collection of metrics from cgroups happens in the -run-exthandlers instance.

        Any failure disables cgroups. The outcome is logged and reported as an
        InitializeCGroups event.
        """
        if not CGroups.enabled():
            status = "not supported by platform"
            CGroups.disable()
        else:
            try:
                CGroups._osutil.mount_cgroups()
                if not suppress_process_add:
                    # Creates /sys/fs/cgroup/{cpu,memory}/WALinuxAgent wrapper cgroup
                    CGroups._setup_wrapper_groups()
                    daemon_pid = int(os.getpid())
                    if not CGroups.is_systemd_manager():
                        # Creates /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent cgroup
                        agent_cgroup = CGroups.for_extension(AGENT_CGROUP_NAME)
                        logger.info("Daemon process id {0} is tracked in cgroup {1}".format(daemon_pid, agent_cgroup.name))
                        agent_cgroup.add(daemon_pid)
                        agent_cgroup.set_limits()
                    else:
                        # When daemon is running as a service, it's called walinuxagent.service
                        # and is created and tracked by systemd, so we don't explicitly add the PID ourselves,
                        # just track it for our reporting purposes
                        agent_cgroup = CGroups.for_systemd_service(AGENT_CGROUP_NAME.lower() + ".service")
                        logger.info("Daemon process id {0} is tracked in systemd cgroup {1}".format(daemon_pid, agent_cgroup.name))
                        # systemd sets limits; any limits we write would be overwritten

                status = "successfully set up agent cgroup"
            except CGroupsException as cge:
                status = cge.msg
                CGroups.disable()
            except Exception as ge:
                status = ustr(ge)
                CGroups.disable()

        logger.info("CGroups: {0}".format(status))

        from azurelinuxagent.common.event import add_event, WALAEventOperation
        add_event(AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.InitializeCGroups,
                  is_success=CGroups.enabled(),
                  message=status,
                  log_event=False)
Пример #44
0
 def _ensure_partition_assigned(self):
     """
     Assign the VM to a partition (0 - 99). Downloaded updates may be configured
     to run on only some VMs; the assigned partition determines eligibility.
     """
     if not os.path.exists(self._partition_file):
         partition = ustr(int(datetime.utcnow().microsecond / 10000))
         fileutil.write_file(self._partition_file, partition)
         add_event(
             AGENT_NAME,
             version=CURRENT_VERSION,
             op=WALAEventOperation.Partition,
             is_success=True,
             message=partition)
Пример #45
0
    def __init__(self, path=None, pkg=None, host=None):
        """
        Load an agent from an on-disk directory or from a package description.

        The version is taken from the directory name when path is given, otherwise from pkg.
        The agent is then downloaded (if needed) and loaded. A failure of that step is recorded
        with mark_failure and reported as a failed Install event instead of being raised,
        except for ResourceGoneError and IOError, which propagate.

        Raises UpdateError when path does not match AGENT_DIR_PATTERN or when no version
        can be determined.
        """
        self.pkg = pkg
        self.host = host
        version = None
        if path is not None:
            m = AGENT_DIR_PATTERN.match(path)
            # Identity test: None is a singleton, and "is" cannot be affected by a custom __eq__
            if m is None:
                raise UpdateError(u"Illegal agent directory: {0}".format(path))
            version = m.group(1)
        elif self.pkg is not None:
            version = pkg.version

        if version is None:
            raise UpdateError(u"Illegal agent version: {0}".format(version))
        self.version = FlexibleVersion(version)

        location = u"disk" if path is not None else u"package"
        logger.verbose(u"Loading Agent {0} from {1}", self.name, location)

        self.error = GuestAgentError(self.get_agent_error_file())
        self.error.load()

        try:
            self._ensure_downloaded()
            self._ensure_loaded()
        except Exception as e:
            if isinstance(e, ResourceGoneError):
                raise

            # The agent was improperly blacklisting versions due to a timeout
            # encountered while downloading a later version. Errors of type
            # socket.error are IOError, so this should provide sufficient
            # protection against a large class of I/O operation failures.
            if isinstance(e, IOError):
                raise

            # Note the failure, blacklist the agent if the package downloaded
            # - An exception with a downloaded package indicates the package
            #   is corrupt (e.g., missing the HandlerManifest.json file)
            self.mark_failure(is_fatal=os.path.isfile(self.get_agent_pkg_path()))

            msg = u"Agent {0} install failed with exception: {1}".format(
                        self.name, ustr(e))
            logger.warn(msg)
            add_event(
                AGENT_NAME,
                version=self.version,
                op=WALAEventOperation.Install,
                is_success=False,
                message=msg)
Пример #46
0
 def activate_resource_disk(self):
     """
     Mount the resource disk and place the data loss warning file on it.

     Returns the mount point reported by mount_resource_disk. A failure to write the
     warning file is only logged. When mounting raises ResourceDiskError, the error is
     logged and reported as an event, and the method returns None.
     """
     logger.info("Activate resource disk")
     try:
         configured_mount_point = conf.get_resourcedisk_mountpoint()
         filesystem = conf.get_resourcedisk_filesystem()
         mounted_at = self.mount_resource_disk(configured_mount_point, filesystem)

         warning_path = os.path.join(mounted_at, DATALOSS_WARNING_FILE_NAME)
         try:
             fileutil.write_file(warning_path, DATA_LOSS_WARNING)
         except IOError as e:
             logger.warn("Failed to write data loss warnning:{0}", e)

         return mounted_at
     except ResourceDiskError as e:
         logger.error("Failed to mount resource disk {0}", e)
         add_event(name="WALA",
                   is_success=False,
                   message=ustr(e),
                   op=WALAEventOperation.ActivateResourceDisk)
Пример #47
0
    def _emit_restart_event(self):
        try:
            if not self._is_clean_start:
                msg = u"Agent did not terminate cleanly: {0}".format(
                            fileutil.read_file(self._sentinel_file_path()))
                logger.info(msg)
                add_event(
                    AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.Restart,
                    is_success=False,
                    message=msg)
        except Exception:
            pass

        return
Пример #48
0
    def _ensure_downloaded(self):
        """
        Make sure the agent package is downloaded, unpacked and its metadata loaded.

        Blacklisted agents are skipped; previously downloaded agents only have their manifest
        loaded. Success and failure are both reported as Install events. A failure is recorded
        with mark_failure and is not raised to the caller.
        """
        try:
            logger.verbose(u"Ensuring Agent {0} is downloaded", self.name)

            if self.is_blacklisted:
                logger.info(u"Agent {0} is blacklisted - skipping download", self.name)
                return

            if self.is_downloaded:
                logger.verbose(u"Agent {0} was previously downloaded - skipping download", self.name)
                self._load_manifest()
                return

            if self.pkg is None:
                raise UpdateError(u"Agent {0} is missing package and download URIs".format(self.name))

            self._download()
            self._unpack()
            self._load_manifest()
            self._load_error()
            self._load_supported()

            success_msg = u"Agent {0} downloaded successfully".format(self.name)
            logger.verbose(success_msg)
            add_event(AGENT_NAME,
                      version=self.version,
                      op=WALAEventOperation.Install,
                      is_success=True,
                      message=success_msg)

        except Exception as e:
            # Note the failure, blacklist the agent if the package downloaded
            # - An exception with a downloaded package indicates the package
            #   is corrupt (e.g., missing the HandlerManifest.json file)
            package_on_disk = os.path.isfile(self.get_agent_pkg_path())
            self.mark_failure(is_fatal=package_on_disk)

            failure_msg = u"Agent {0} download failed with exception: {1}".format(self.name, ustr(e))
            logger.warn(failure_msg)
            add_event(AGENT_NAME,
                      version=self.version,
                      op=WALAEventOperation.Install,
                      is_success=False,
                      message=failure_msg)
Пример #49
0
    def run(self):
        """
        Log version information, check the pid and keep the daemon running.

        While self.running is set, daemon() is invoked; any exception it raises is reported
        as an UnhandledError event and the daemon is restarted after 15 seconds.
        """
        logger.info("{0} Version:{1}", AGENT_LONG_NAME, AGENT_VERSION)
        logger.info("OS: {0} {1}", DISTRO_NAME, DISTRO_VERSION)
        logger.info("Python: {0}.{1}.{2}", PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO)

        self.check_pid()

        while self.running:
            try:
                self.daemon()
            except Exception:
                stack_trace = traceback.format_exc()
                add_event("WALA",
                          is_success=False,
                          message=ustr(stack_trace),
                          op=WALAEventOperation.UnhandledError)
                logger.info("Sleep 15 seconds and restart daemon")
                time.sleep(15)
Пример #50
0
    def set_limits(self):
        """
        Set per-hierarchy limits based on the cgroup name (agent or particular extension).

        Nothing is done when limit enforcement is turned off in the configuration, when the
        cgroup has no name, or when the name contains one of the excluded entries. The outcome
        is reported as a SetCGroupsLimits event; an error while setting a limit is re-raised
        after being reported.
        """
        if not conf.get_cgroups_enforce_limits() or self.name is None:
            return

        lowered_name = self.name.lower()

        if any(excluded in lowered_name for excluded in conf.get_cgroups_excluded()):
            logger.info('No cgroups limits for {0}'.format(self.name))
            return

        # default values
        cpu_limit = DEFAULT_CPU_LIMIT_EXT
        memory_share = round(self._osutil.get_total_mem() * DEFAULT_MEM_LIMIT_PCT / 100, 0)
        mem_limit = max(DEFAULT_MEM_LIMIT_MIN_MB, memory_share)

        # agent values
        if AGENT_NAME.lower() in lowered_name:
            cpu_limit = DEFAULT_CPU_LIMIT_AGENT
            mem_limit = min(DEFAULT_MEM_LIMIT_MAX_MB, mem_limit)

        msg = '{0}: {1}% {2}mb'.format(self.name, cpu_limit, mem_limit)
        logger.info("Setting cgroups limits for {0}".format(msg))

        limits_applied = False
        try:
            self.set_cpu_limit(cpu_limit)
            self.set_memory_limit(mem_limit)
            limits_applied = True
        except Exception as ge:
            # The event sent in the finally clause carries the error text
            msg = '[{0}] {1}'.format(msg, ustr(ge))
            raise
        finally:
            from azurelinuxagent.common.event import add_event, WALAEventOperation
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.SetCGroupsLimits,
                      is_success=limits_applied,
                      message=msg,
                      log_event=False)
Пример #51
0
 def run(self):
     """
     Handle remote access whenever the incarnation has changed.

     Does nothing unless os_util.jit_enabled is set. Otherwise the protocol is fetched and,
     if its incarnation differs from the stored one, the new incarnation and remote access
     data are stored and handle_remote_access() is called. Any exception is logged and
     reported as a failed RemoteAccessHandling event instead of being raised.
     """
     try:
         if not self.os_util.jit_enabled:
             return

         self.protocol = self.protocol_util.get_protocol()
         latest_incarnation = self.protocol.get_incarnation()
         if self.incarnation != latest_incarnation:
             # something changed. Handle remote access if any.
             self.incarnation = latest_incarnation
             self.remote_access = self.protocol.client.get_remote_access()
             self.handle_remote_access()
     except Exception as e:
         msg = u"Exception processing remote access handler: {0} {1}".format(ustr(e), traceback.format_exc())
         logger.error(msg)
         add_event(AGENT_NAME,
                   version=CURRENT_VERSION,
                   op=WALAEventOperation.RemoteAccessHandling,
                   is_success=False,
                   message=msg)
Пример #52
0
    def setup(suppress_process_add=False):
        """
        Prepare cgroups for the agent. Only needs to be called once, and should be called
        from the -daemon instance of the agent:
            Mount the cgroup fs if necessary
            Create wrapper cgroups for agent-plus-extensions and set limits on them
            Add this process to the "agent" cgroup, if required
        Actual collection of metrics from cgroups happens in the -run-exthandlers instance.

        Any failure disables cgroups. The outcome is logged and reported as an
        InitializeCGroups event.
        """
        if not CGroups.enabled():
            status = "not supported by platform"
            CGroups.disable()
        else:
            try:
                CGroups._osutil.mount_cgroups()
                if not suppress_process_add:
                    CGroups._setup_wrapper_groups()
                    daemon_pid = int(os.getpid())
                    if CGroups.is_systemd_manager():
                        agent_cgroup = CGroups.for_systemd_service(AGENT_NAME)
                        logger.info("Add daemon process pid {0} to {1} systemd cgroup".format(daemon_pid, agent_cgroup.name))
                        # systemd sets limits; any limits we write would be overwritten
                    else:
                        agent_cgroup = CGroups.for_extension(AGENT_NAME)
                        logger.info("Add daemon process pid {0} to {1} cgroup".format(daemon_pid, agent_cgroup.name))
                        agent_cgroup.add(daemon_pid)
                        agent_cgroup.set_limits()
                status = "ok"
            except CGroupsException as cge:
                status = cge.msg
                CGroups.disable()
            except Exception as ge:
                status = ustr(ge)
                CGroups.disable()

        logger.info("CGroups: {0}".format(status))

        from azurelinuxagent.common.event import add_event, WALAEventOperation
        add_event(AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.InitializeCGroups,
                  is_success=CGroups.enabled(),
                  message=status,
                  log_event=False)
Пример #53
0
    def run(self):
        """
        Fetch the extension handlers from the protocol, process them and report their status.

        If retrieval raises ProtocolError, the failure is logged as a warning, reported as
        a failed event, and the method returns without processing anything.
        """
        self.ext_handlers = None
        etag = None
        try:
            self.protocol = self.protocol_util.get_protocol()
            self.ext_handlers, etag = self.protocol.get_ext_handlers()
        except ProtocolError as protocol_error:
            warning = u"Exception retrieving extension handlers: {0}".format(ustr(protocol_error))
            logger.warn(warning)
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      is_success=False,
                      message=warning)
            return

        logger.verbose(u"Handle extensions updates for incarnation {0}".format(etag))

        # Log status report success on new config
        self.log_report = True
        self.handle_ext_handlers(etag)
        self.last_etag = etag

        self.report_ext_handlers_status()
Пример #54
0
    def set_limits(self):
        """
        Apply CPU and memory limits to this cgroup, chosen by its name (agent or a particular extension).

        Does nothing when limit enforcement is turned off in the configuration, when the
        cgroup has no name, or when the name contains one of the configured exclusions.
        A SetCGroupsLimits event is always reported, whether the limits were applied or
        not; a failure to apply them is re-raised after the message is annotated with the
        error. On success the applied values are stored in ``self.threshold``.
        """
        if not conf.get_cgroups_enforce_limits() or self.name is None:
            return

        if any(excluded in self.name.lower() for excluded in conf.get_cgroups_excluded()):
            logger.info('No cgroups limits for {0}'.format(self.name))
            return

        # default values
        cpu_limit = self.get_cpu_limits()
        mem_limit = self.get_memory_limits()

        summary = '{0}: {1}% {2}mb'.format(self.name, cpu_limit, mem_limit)
        logger.info("Setting cgroups limits for {0}".format(summary))
        applied = False

        try:
            self.set_cpu_limit(cpu_limit)
            self.set_memory_limit(mem_limit)
            applied = True
        except Exception as limit_error:
            # Prefix the reported message with the failure before propagating it.
            summary = '[{0}] {1}'.format(summary, ustr(limit_error))
            raise
        finally:
            from azurelinuxagent.common.event import add_event, WALAEventOperation
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.SetCGroupsLimits,
                      is_success=applied,
                      message=summary,
                      log_event=False)

        # Keep the limits that were just applied available on the instance.
        self.threshold = {"cpu": cpu_limit, "memory": mem_limit}
Пример #55
0
    def run(self):
        """
        Fetch the extension handlers from the protocol, process them and report their status.

        Retrieval failures are counted in ``self.get_artifact_error_state``. While that
        error state is not triggered, a failure is only logged as a warning; once it is
        triggered, a failed GetArtifactExtended event is reported and the error state is
        reset. In either case the method returns without processing.

        Failures while processing the handlers are logged as a warning and reported as a
        failed ExtensionProcessing event. No exception caught here is re-raised.
        """
        self.ext_handlers, etag = None, None
        try:
            self.protocol = self.protocol_util.get_protocol()
            self.ext_handlers, etag = self.protocol.get_ext_handlers()
            self.get_artifact_error_state.reset()
        except Exception as e:
            msg = u"Exception retrieving extension handlers: {0}".format(ustr(e))
            self.get_artifact_error_state.incr()

            if self.get_artifact_error_state.is_triggered():
                # The placeholder used to be written "{0)", which made str.format raise
                # ValueError here, so the event was never sent and the error escaped run().
                add_event(AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.GetArtifactExtended,
                          is_success=False,
                          message="Failed to get extension artifact for over "
                                  "{0}: {1}".format(self.get_artifact_error_state.min_timedelta, msg))
                self.get_artifact_error_state.reset()
            else:
                logger.warn(msg)
            return

        try:
            msg = u"Handle extensions updates for incarnation {0}".format(etag)
            logger.verbose(msg)
            # Log status report success on new config
            self.log_report = True
            self.handle_ext_handlers(etag)
            self.last_etag = etag

            self.report_ext_handlers_status()
            self.cleanup_outdated_handlers()
        except Exception as e:
            msg = u"Exception processing extension handlers: {0}".format(
                ustr(e))
            logger.warn(msg)
            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.ExtensionProcessing,
                      is_success=False,
                      message=msg)
            return
Пример #56
0
    def _download(self):
        """
        Download the agent package, trying the package URIs in random order.

        For each URI a direct fetch is attempted first, unless the host plugin is already
        the default channel. If that is skipped or fails and a host plugin is available
        and initialized, the download is retried through the host plugin; a successful
        host plugin download makes it the default channel.

        :raises ResourceGoneError: re-raised from a host plugin fetch, after the host
            plugin has been set as the default channel.
        :raises UpdateError: if no package file exists at ``get_agent_pkg_path()`` once
            every URI has been tried.
        """
        # Shuffle a copy: shuffling self.pkg.uris directly would reorder the package's
        # own URI list as a side effect of downloading.
        uris_shuffled = list(self.pkg.uris)
        random.shuffle(uris_shuffled)
        for uri in uris_shuffled:
            if not HostPluginProtocol.is_default_channel() and self._fetch(uri.uri):
                break

            elif self.host is not None and self.host.ensure_initialized():
                if not HostPluginProtocol.is_default_channel():
                    logger.warn("Download failed, switching to host plugin")
                else:
                    logger.verbose("Using host plugin as default channel")

                # Use a separate name so the loop variable is not rebound to the host plugin URI.
                host_uri, headers = self.host.get_artifact_request(uri.uri, self.host.manifest_uri)
                try:
                    if self._fetch(host_uri, headers=headers, use_proxy=False):
                        if not HostPluginProtocol.is_default_channel():
                            logger.verbose("Setting host plugin as default channel")
                            HostPluginProtocol.set_default_channel(True)
                        break
                    else:
                        logger.warn("Host plugin download failed")

                # If the HostPlugin rejects the request,
                # let the error continue, but set to use the HostPlugin
                except ResourceGoneError:
                    HostPluginProtocol.set_default_channel(True)
                    raise

            else:
                logger.error("No download channels available")

        if not os.path.isfile(self.get_agent_pkg_path()):
            msg = u"Unable to download Agent {0} from any URI".format(self.name)
            add_event(
                AGENT_NAME,
                op=WALAEventOperation.Download,
                version=CURRENT_VERSION,
                is_success=False,
                message=msg)
            raise UpdateError(msg)
Пример #57
0
    def send_telemetry_heartbeat(self):
        """
        Emit a HeartBeat telemetry event once per TELEMETRY_HEARTBEAT_PERIOD.

        The event message is "incarnation;counter;heartbeat_id;dropped_packets". After it
        is sent, the counter is incremented and the IO error counters are read and reset;
        if any of them is positive an HttpErrors event is emitted as well. Failures are
        logged as a warning, and the timestamp of the last heartbeat is updated whether
        or not sending succeeded.
        """
        period = MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD

        if self.last_telemetry_heartbeat is None:
            # First call: backdate the timestamp so the heartbeat is due immediately.
            self.last_telemetry_heartbeat = datetime.datetime.utcnow() - period

        next_due = self.last_telemetry_heartbeat + period
        if datetime.datetime.utcnow() < next_due:
            return

        try:
            incarnation = self.protocol.get_incarnation()
            dropped_packets = self.osutil.get_firewall_dropped_packets(self.protocol.endpoint)
            heartbeat_msg = "{0};{1};{2};{3}".format(incarnation, self.counter, self.heartbeat_id, dropped_packets)

            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.HeartBeat,
                      is_success=True,
                      message=heartbeat_msg,
                      log_event=False)

            self.counter += 1

            io_errors = IOErrorCounter.get_and_reset()
            hostplugin_errors = io_errors.get("hostplugin")
            protocol_errors = io_errors.get("protocol")
            other_errors = io_errors.get("other")

            error_counts = (hostplugin_errors, protocol_errors, other_errors)
            if any(count > 0 for count in error_counts):
                errors_msg = "hostplugin:{0};protocol:{1};other:{2}".format(*error_counts)
                add_event(name=AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.HttpErrors,
                          is_success=True,
                          message=errors_msg,
                          log_event=False)
        except Exception as err:
            logger.warn("Failed to send heartbeat: {0}", err)

        self.last_telemetry_heartbeat = datetime.datetime.utcnow()
Пример #58
0
    def _download(self):
        """
        Download the agent package from the first URI that works.

        Each package URI is fetched directly; when that fails and a host plugin is
        configured, the same URI is retried through the host plugin.

        :raises UpdateError: if no package file exists at ``get_agent_pkg_path()`` after
            all URIs have been tried (a failed Download event is reported first).
        """
        for pkg_uri in self.pkg.uris:
            if self._fetch(pkg_uri.uri):
                break

            if self.host is None:
                # No fallback channel; move on to the next URI.
                continue

            logger.info("Download unsuccessful, falling back to host plugin")
            host_uri, headers = self.host.get_artifact_request(pkg_uri.uri, self.host.manifest_uri)
            if self._fetch(host_uri, headers=headers):
                break

        if os.path.isfile(self.get_agent_pkg_path()):
            return

        failure = u"Unable to download Agent {0} from any URI".format(self.name)
        add_event(AGENT_NAME,
                  op=WALAEventOperation.Download,
                  version=CURRENT_VERSION,
                  is_success=False,
                  message=failure)
        raise UpdateError(failure)
Пример #59
0
    def test_save_event_rollover(self):
        """
        Check that the event directory rolls over: after 1000 events have been saved,
        adding one more keeps the file count at 1000 and drops the oldest event.
        """
        tmp_evt = tempfile.mkdtemp()
        try:
            init_event_logger(tmp_evt)
            add_event('test', message='first event')
            for i in range(0, 999):
                add_event('test', message='test event {0}'.format(i))

            events = os.listdir(tmp_evt)
            events.sort()
            self.assertEqual(len(events), 1000, "{0} events found, 1000 expected".format(len(events)))

            first_event = os.path.join(tmp_evt, events[0])
            with open(first_event) as first_fh:
                first_event_text = first_fh.read()
                self.assertTrue('first event' in first_event_text)

            # One event past the limit: the oldest file is expected to be removed.
            add_event('test', message='last event')
            events = os.listdir(tmp_evt)
            events.sort()
            self.assertEqual(len(events), 1000, "{0} events found, 1000 expected".format(len(events)))

            first_event = os.path.join(tmp_evt, events[0])
            with open(first_event) as first_fh:
                first_event_text = first_fh.read()
                self.assertFalse('first event' in first_event_text)
                self.assertTrue('test event 0' in first_event_text)

            last_event = os.path.join(tmp_evt, events[-1])
            with open(last_event) as last_fh:
                last_event_text = last_fh.read()
                self.assertTrue('last event' in last_event_text)
        finally:
            # Remove the scratch directory even when an assertion above fails, so failed
            # runs do not leave a thousand event files behind.
            shutil.rmtree(tmp_evt)
Пример #60
0
    def _ensure_downloaded(self):
        """
        Download and unpack this agent unless it has already been downloaded.

        A successful download is logged and reported as an Install event.

        :raises UpdateError: if the agent is not downloaded and has no package to
            download from.
        """
        logger.verbose(u"Ensuring Agent {0} is downloaded", self.name)

        if self.is_downloaded:
            logger.verbose(u"Agent {0} was previously downloaded - skipping download", self.name)
            return

        if self.pkg is None:
            missing_pkg = u"Agent {0} is missing package and download URIs".format(self.name)
            raise UpdateError(missing_pkg)

        self._download()
        self._unpack()

        success_msg = u"Agent {0} downloaded successfully".format(self.name)
        logger.verbose(success_msg)
        add_event(AGENT_NAME,
                  version=self.version,
                  op=WALAEventOperation.Install,
                  is_success=True,
                  message=success_msg)