Пример #1
0
    def test_foreach_controller_should_handle_errors_in_individual_controllers(
            self):
        successful_controllers = []

        def controller_operation(controller):
            if controller == 'cpu':
                raise Exception('A test exception')

            successful_controllers.append(controller)

        with patch("azurelinuxagent.common.cgroupapi.logger.warn"
                   ) as mock_logger_warn:
            CGroupsApi._foreach_controller(controller_operation,
                                           'A dummy message')

            self.assertIn(
                'memory', successful_controllers,
                'The operation was not executed on the memory controller')
            self.assertEqual(
                len(successful_controllers), 1,
                'The operation was not executed on unexpected controllers: {0}'
                .format(successful_controllers))

            args, kwargs = mock_logger_warn.call_args
            (message_format, controller, error, message) = args
            self.assertEquals(message_format,
                              'Error in cgroup controller "{0}": {1}. {2}')
            self.assertEquals(controller, 'cpu')
            self.assertEquals(error, 'A test exception')
            self.assertEquals(message, 'A dummy message')
        def __init__(self):
            """
            Ensures the cgroups file system is mounted and selects the correct API to interact with it
            """
            osutil = get_osutil()

            self._cgroups_supported = osutil.is_cgroups_supported()

            if self._cgroups_supported:
                self._enabled = True
                try:
                    osutil.mount_cgroups()
                    self._cgroups_api = CGroupsApi.create()
                    status = "The cgroup filesystem is ready to use"
                except Exception as e:
                    status = ustr(e)
                    self._enabled = False
            else:
                self._enabled = False
                self._cgroups_api = None
                status = "Cgroups are not supported by the platform"

            logger.info("CGroups Status: {0}".format(status))

            add_event(AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.InitializeCGroups,
                      is_success=self._enabled,
                      message=status,
                      log_event=False)
Пример #3
0
    def test_create_should_return_a_FileSystemCgroupsApi_on_non_systemd_platforms(
            self):
        with patch("azurelinuxagent.common.cgroupapi.CGroupsApi._is_systemd",
                   return_value=False):
            api = CGroupsApi.create()

        self.assertTrue(type(api) == FileSystemCgroupsApi)
Пример #4
0
    def test_is_systemd_should_return_false_when_systemd_does_not_manage_current_process(self):
        fileutil_read_file = fileutil.read_file

        def mock_read_file(filepath, asbin=False, remove_bom=False, encoding='utf-8'):
            if filepath == "/proc/cgroups":
                return """
#subsys_name	hierarchy	num_cgroups	enabled
cpuset	11	1	1
cpu	3	77	1
cpuacct	3	77	1
blkio	10	70	1
memory	12	124	1
devices	9	70	1
freezer	4	1	1
net_cls	2	1	1
perf_event	7	1	1
net_prio	2	1	1
hugetlb	8	1	1
pids	5	76	1
rdma	6	1	1
"""
            if filepath == "/proc/self/cgroup":
                return """
3:name=systemd:/
2:memory:/walinuxagent.service
1:cpu,cpuacct:/walinuxagent.service
"""
            return fileutil_read_file(filepath, asbin=asbin, remove_bom=remove_bom, encoding=encoding)

        with patch("azurelinuxagent.common.cgroupapi.fileutil.read_file", mock_read_file):
            is_systemd = CGroupsApi._is_systemd()

        self.assertFalse(is_systemd)
Пример #5
0
    def test_foreach_controller_should_execute_operation_on_all_mounted_controllers(
            self):
        executed_controllers = []

        def controller_operation(controller):
            executed_controllers.append(controller)

        CGroupsApi._foreach_controller(controller_operation, 'A dummy message')

        # The setUp method mocks azurelinuxagent.common.cgroupapi.CGROUPS_FILE_SYSTEM_ROOT to have the cpu and memory controllers mounted
        self.assertIn('cpu', executed_controllers,
                      'The operation was not executed on the cpu controller')
        self.assertIn(
            'memory', executed_controllers,
            'The operation was not executed on the memory controller')
        self.assertEqual(
            len(executed_controllers), 2,
            'The operation was not executed on unexpected controllers: {0}'.
            format(executed_controllers))
Пример #6
0
        def initialize(self):
            try:
                if self._initialized:
                    return

                # check whether cgroup monitoring is supported on the current distro
                self._cgroups_supported = CGroupsApi.cgroups_supported()
                if not self._cgroups_supported:
                    logger.info("Cgroup monitoring is not supported on {0}", get_distro())
                    return

                # check that systemd is detected correctly
                self._cgroups_api = SystemdCgroupsApi()
                if not systemd.is_systemd():
                    _log_cgroup_warning("systemd was not detected on {0}", get_distro())
                    return

                _log_cgroup_info("systemd version: {0}", systemd.get_version())

                # This is temporarily disabled while we analyze telemetry. Likely it will be removed.
                # self.__collect_azure_unit_telemetry()
                # self.__collect_agent_unit_files_telemetry()

                if not self.__check_no_legacy_cgroups():
                    return

                agent_unit_name = systemd.get_agent_unit_name()
                agent_slice = systemd.get_unit_property(agent_unit_name, "Slice")
                if agent_slice not in (_AZURE_SLICE, "system.slice"):
                    _log_cgroup_warning("The agent is within an unexpected slice: {0}", agent_slice)
                    return

                self.__setup_azure_slice()

                cpu_controller_root, memory_controller_root = self.__get_cgroup_controllers()
                self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path = self.__get_agent_cgroups(agent_slice, cpu_controller_root, memory_controller_root)

                if self._agent_cpu_cgroup_path is not None:
                    _log_cgroup_info("Agent CPU cgroup: {0}", self._agent_cpu_cgroup_path)
                    self.enable()
                    CGroupsTelemetry.track_cgroup(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path))

                _log_cgroup_info('Cgroups enabled: {0}', self._cgroups_enabled)

            except Exception as exception:
                _log_cgroup_warning("Error initializing cgroups: {0}", ustr(exception))
            finally:
                self._initialized = True
Пример #7
0
        def _check_processes_in_agent_cgroup(self):
            """
            Verifies that the agent's cgroup includes only the current process, its parent, commands started using shellutil and instances of systemd-run
            (those processes correspond, respectively, to the extension handler, the daemon, commands started by the extension handler, and the systemd-run
            commands used to start extensions on their own cgroup).
            Other processes started by the agent (e.g. extensions) and processes not started by the agent (e.g. services installed by extensions) are reported
            as unexpected, since they should belong to their own cgroup.

            Raises a CGroupsException if the check fails
            """
            unexpected = []

            try:
                daemon = os.getppid()
                extension_handler = os.getpid()
                agent_commands = set()
                agent_commands.update(shellutil.get_running_commands())
                systemd_run_commands = set()
                systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands())
                agent_cgroup = CGroupsApi.get_processes_in_cgroup(self._agent_cpu_cgroup_path)
                # get the running commands again in case new commands started or completed while we were fetching the processes in the cgroup;
                agent_commands.update(shellutil.get_running_commands())
                systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands())

                for process in agent_cgroup:
                    # Note that the agent uses systemd-run to start extensions; systemd-run belongs to the agent cgroup, though the extensions don't.
                    if process in (daemon, extension_handler) or process in systemd_run_commands:
                        continue
                    # systemd_run_commands contains the shell that started systemd-run, so we also need to check for the parent
                    if self._get_parent(process) in systemd_run_commands and self._get_command(process) == 'systemd-run':
                        continue
                    # check if the process is a command started by the agent or a descendant of one of those commands
                    current = process
                    while current != 0 and current not in agent_commands:
                        current = self._get_parent(current)
                    if current == 0:
                        unexpected.append(self.__format_process(process))
                        if len(unexpected) >= 5:  # collect just a small sample
                            break
            except Exception as exception:
                _log_cgroup_warning("Error checking the processes in the agent's cgroup: {0}".format(ustr(exception)))

            if len(unexpected) > 0:
                raise CGroupsException("The agent's cgroup includes unexpected processes: {0}".format(unexpected))
Пример #8
0
    def test_cgroups_should_be_supported_only_on_ubuntu_16_and_later(self):
        test_cases = [
            (['ubuntu', '16.04', 'xenial'], True),
            (['ubuntu', '16.10', 'yakkety'], True),
            (['ubuntu', '18.04', 'bionic'], True),
            (['ubuntu', '18.10', 'cosmic'], True),
            (['ubuntu', '20.04', 'focal'], True),
            (['ubuntu', '20.10', 'groovy'], True),
            (['centos', '7.5', 'Source'], False),
            (['redhat', '7.7', 'Maipo'], False),
            (['redhat', '7.7.1908', 'Core'], False),
            (['bigip', '15.0.1', 'Final'], False),
            (['gaia', '273.562', 'R80.30'], False),
            (['debian', '9.1', ''], False),
        ]

        for (distro, supported) in test_cases:
            with patch("azurelinuxagent.common.cgroupapi.get_distro", return_value=distro):
                self.assertEqual(CGroupsApi.cgroups_supported(), supported, "cgroups_supported() failed on {0}".format(distro))
Пример #9
0
    def test_is_systemd_should_return_false_when_systemd_does_not_manage_current_process(
            self):
        path_exists = os.path.exists

        def mock_path_exists(path):
            if path == "/run/systemd/system/":
                mock_path_exists.path_tested = True
                return False
            return path_exists(path)

        mock_path_exists.path_tested = False

        with patch("azurelinuxagent.common.cgroupapi.os.path.exists",
                   mock_path_exists):
            is_systemd = CGroupsApi._is_systemd()

        self.assertFalse(is_systemd)

        self.assertTrue(
            mock_path_exists.path_tested,
            'The expected path was not tested; the implementation of CGroupsApi._is_systemd() may have changed.'
        )
Пример #10
0
def is_log_collection_allowed():
    # There are three conditions that need to be met in order to allow periodic log collection:
    # 1) It should be enabled in the configuration.
    # 2) The system must be using systemd to manage services. Needed for resource limiting of the log collection.
    # 3) The python version must be greater than 2.6 in order to support the ZipFile library used when collecting.
    conf_enabled = conf.get_collect_logs()
    systemd_present = CGroupsApi.is_systemd()
    supported_python = PY_VERSION_MINOR >= 7 if PY_VERSION_MAJOR == 2 else PY_VERSION_MAJOR == 3
    is_allowed = conf_enabled and systemd_present and supported_python

    msg = "Checking if log collection is allowed at this time [{0}]. All three conditions must be met: " \
          "configuration enabled [{1}], systemd present [{2}], python supported: [{3}]".format(is_allowed,
                                                                                               conf_enabled,
                                                                                               systemd_present,
                                                                                               supported_python)
    logger.info(msg)
    add_event(name=AGENT_NAME,
              version=CURRENT_VERSION,
              op=WALAEventOperation.LogCollection,
              is_success=is_allowed,
              message=msg,
              log_event=False)

    return is_allowed
Пример #11
0
        def initialize(self):
            try:
                if self._initialized:
                    return

                #
                # check whether cgroup monitoring is supported on the current distro
                #
                self._cgroups_supported = CGroupsApi.cgroups_supported()
                if not self._cgroups_supported:
                    logger.info("Cgroup monitoring is not supported on {0}",
                                get_distro())
                    return

                #
                # check systemd
                #
                self._cgroups_api = CGroupsApi.create()

                if not isinstance(self._cgroups_api, SystemdCgroupsApi):
                    message = "systemd was not detected on {0}".format(
                        get_distro())
                    logger.warn(message)
                    add_event(op=WALAEventOperation.CGroupsInitialize,
                              is_success=False,
                              message=message,
                              log_event=False)
                    return

                def log_cgroup_info(format_string, *args):
                    message = format_string.format(*args)
                    logger.info(message)
                    add_event(op=WALAEventOperation.CGroupsInfo,
                              message=message)

                def log_cgroup_warn(format_string, *args):
                    message = format_string.format(*args)
                    logger.warn(message)
                    add_event(op=WALAEventOperation.CGroupsInfo,
                              message=message,
                              is_success=False,
                              log_event=False)

                log_cgroup_info("systemd version: {0}",
                                self._cgroups_api.get_systemd_version())

                #
                # Older versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent.  When running
                # under systemd this could produce invalid resource usage data. Do not enable cgroups under this condition.
                #
                legacy_cgroups = self._cgroups_api.cleanup_legacy_cgroups()

                if legacy_cgroups > 0:
                    log_cgroup_warn(
                        "The daemon's PID was added to a legacy cgroup; will not monitor resource usage."
                    )
                    return

                #
                # check v1 controllers
                #
                cpu_controller_root, memory_controller_root = self._cgroups_api.get_cgroup_mount_points(
                )

                if cpu_controller_root is not None:
                    logger.info("The CPU cgroup controller is mounted at {0}",
                                cpu_controller_root)
                else:
                    log_cgroup_warn("The CPU cgroup controller is not mounted")

                if memory_controller_root is not None:
                    logger.info(
                        "The memory cgroup controller is mounted at {0}",
                        memory_controller_root)
                else:
                    log_cgroup_warn(
                        "The memory cgroup controller is not mounted")

                #
                # check v2 controllers
                #
                cgroup2_mountpoint, cgroup2_controllers = self._cgroups_api.get_cgroup2_controllers(
                )
                if cgroup2_mountpoint is not None:
                    log_cgroup_warn(
                        "cgroups v2 mounted at {0}.  Controllers: [{1}]",
                        cgroup2_mountpoint, cgroup2_controllers)

                #
                # check the cgroups for the agent
                #
                agent_unit_name = self._cgroups_api.get_agent_unit_name()
                cpu_cgroup_relative_path, memory_cgroup_relative_path = self._cgroups_api.get_process_cgroup_relative_paths(
                    "self")
                if cpu_cgroup_relative_path is None:
                    log_cgroup_warn(
                        "The agent's process is not within a CPU cgroup")
                else:
                    cpu_accounting = self._cgroups_api.get_unit_property(
                        agent_unit_name, "CPUAccounting")
                    log_cgroup_info('CPUAccounting: {0}', cpu_accounting)

                if memory_cgroup_relative_path is None:
                    log_cgroup_warn(
                        "The agent's process is not within a memory cgroup")
                else:
                    memory_accounting = self._cgroups_api.get_unit_property(
                        agent_unit_name, "MemoryAccounting")
                    log_cgroup_info('MemoryAccounting: {0}', memory_accounting)

                #
                # All good, enable cgroups and start monitoring the agent
                #
                self._cgroups_enabled = True

                if cpu_controller_root is None or cpu_cgroup_relative_path is None:
                    logger.info("Will not track CPU for the agent's cgroup")
                else:
                    self._agent_cpu_cgroup_path = os.path.join(
                        cpu_controller_root, cpu_cgroup_relative_path)
                    CGroupsTelemetry.track_cgroup(
                        CpuCgroup(agent_unit_name,
                                  self._agent_cpu_cgroup_path))

                if memory_controller_root is None or memory_cgroup_relative_path is None:
                    logger.info("Will not track memory for the agent's cgroup")
                else:
                    self._agent_memory_cgroup_path = os.path.join(
                        memory_controller_root, memory_cgroup_relative_path)
                    CGroupsTelemetry.track_cgroup(
                        MemoryCgroup(agent_unit_name,
                                     self._agent_memory_cgroup_path))

                log_cgroup_info("Agent cgroups: CPU: {0} -- MEMORY: {1}",
                                self._agent_cpu_cgroup_path,
                                self._agent_memory_cgroup_path)

            except Exception as e:
                message = "Error initializing cgroups: {0}".format(ustr(e))
                logger.warn(message)
                add_event(op=WALAEventOperation.CGroupsInitialize,
                          is_success=False,
                          message=message,
                          log_event=False)
            finally:
                self._initialized = True
Пример #12
0
    def run(self, debug=False):  # pylint: disable=R0912
        """
        This is the main loop which watches for agent and extension updates.
        """

        try:
            logger.info(u"Agent {0} is running as the goal state agent",
                        CURRENT_AGENT)

            #
            # Fetch the goal state one time; some components depend on information provided by the goal state and this
            # call ensures the required info is initialized (e.g telemetry depends on the container ID.)
            #
            protocol = self.protocol_util.get_protocol()
            protocol.update_goal_state()

            # Initialize the common parameters for telemetry events
            initialize_event_logger_vminfo_common_parameters(protocol)

            # Log OS-specific info.
            os_info_msg = u"Distro: {dist_name}-{dist_ver}; "\
                u"OSUtil: {util_name}; AgentService: {service_name}; "\
                u"Python: {py_major}.{py_minor}.{py_micro}; "\
                u"systemd: {systemd}; "\
                u"LISDrivers: {lis_ver}; "\
                u"logrotate: {has_logrotate};".format(
                    dist_name=DISTRO_NAME, dist_ver=DISTRO_VERSION,
                    util_name=type(self.osutil).__name__,
                    service_name=self.osutil.service_name,
                    py_major=PY_VERSION_MAJOR, py_minor=PY_VERSION_MINOR,
                    py_micro=PY_VERSION_MICRO, systemd=CGroupsApi.is_systemd(),
                    lis_ver=get_lis_version(), has_logrotate=has_logrotate()
            )

            logger.info(os_info_msg)
            add_event(AGENT_NAME,
                      op=WALAEventOperation.OSInfo,
                      message=os_info_msg)

            #
            # Perform initialization tasks
            #
            from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
            exthandlers_handler = get_exthandlers_handler(protocol)
            migrate_handler_state()

            from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
            remote_access_handler = get_remote_access_handler(protocol)

            self._ensure_no_orphans()
            self._emit_restart_event()
            self._emit_changes_in_default_configuration()
            self._ensure_partition_assigned()
            self._ensure_readonly_files()
            self._ensure_cgroups_initialized()
            self._ensure_extension_telemetry_state_configured_properly(
                protocol)

            # Get all thread handlers
            telemetry_handler = get_send_telemetry_events_handler(
                self.protocol_util)
            all_thread_handlers = [
                get_monitor_handler(),
                get_env_handler(), telemetry_handler,
                get_collect_telemetry_events_handler(telemetry_handler)
            ]

            if is_log_collection_allowed():
                all_thread_handlers.append(get_collect_logs_handler())

            # Launch all monitoring threads
            for thread_handler in all_thread_handlers:
                thread_handler.run()

            goal_state_interval = conf.get_goal_state_period(
            ) if conf.get_extensions_enabled(
            ) else GOAL_STATE_INTERVAL_DISABLED

            while self.running:
                #
                # Check that the parent process (the agent's daemon) is still running
                #
                if not debug and self._is_orphaned:
                    logger.info("Agent {0} is an orphan -- exiting",
                                CURRENT_AGENT)
                    break

                #
                # Check that all the threads are still running
                #
                for thread_handler in all_thread_handlers:
                    if not thread_handler.is_alive():
                        logger.warn("{0} thread died, restarting".format(
                            thread_handler.get_thread_name()))
                        thread_handler.start()

                #
                # Process the goal state
                #
                if not protocol.try_update_goal_state():
                    self._heartbeat_update_goal_state_error_count += 1
                else:
                    if self._upgrade_available(protocol):
                        available_agent = self.get_latest_agent()
                        if available_agent is None:
                            logger.info(
                                "Agent {0} is reverting to the installed agent -- exiting",
                                CURRENT_AGENT)
                        else:
                            logger.info(
                                u"Agent {0} discovered update {1} -- exiting",
                                CURRENT_AGENT, available_agent.name)
                        break

                    utc_start = datetime.utcnow()

                    last_etag = exthandlers_handler.last_etag
                    exthandlers_handler.run()

                    remote_access_handler.run()

                    if last_etag != exthandlers_handler.last_etag:
                        self._ensure_readonly_files()
                        duration = elapsed_milliseconds(utc_start)
                        logger.info(
                            'ProcessGoalState completed [incarnation {0}; {1} ms]',
                            exthandlers_handler.last_etag, duration)
                        add_event(AGENT_NAME,
                                  op=WALAEventOperation.ProcessGoalState,
                                  duration=duration,
                                  message="Incarnation {0}".format(
                                      exthandlers_handler.last_etag))

                self._send_heartbeat_telemetry(protocol)
                time.sleep(goal_state_interval)

        except Exception as e:  # pylint: disable=C0103
            msg = u"Agent {0} failed with exception: {1}".format(
                CURRENT_AGENT, ustr(e))
            self._set_sentinel(msg=msg)
            logger.warn(msg)
            logger.warn(traceback.format_exc())
            sys.exit(1)
            # additional return here because sys.exit is mocked in unit tests
            return

        self._shutdown()
        sys.exit(0)