示例#1
0
    def _get_cpu_ticks(self, allow_no_such_file_or_directory_error=False):
        """
        Reads cpuacct.stat and returns the number of USER_HZ of CPU time (user plus system) consumed by this cgroup.

        When cpuacct.stat does not exist (ENOENT) and allow_no_such_file_or_directory_error is True, 0 is returned
        instead of raising; this lets callers invoke the function before the cgroup has been created. With the
        flag set to False the original ENOENT error is re-raised. Any other read failure, and any content that
        does not match the expected format, is reported as a CGroupsException.
        """
        try:
            contents = self._get_file_contents('cpuacct.stat')
        except Exception as e:  # pylint: disable=C0103
            file_not_found = isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT  # pylint: disable=E1101
            if not file_not_found:
                raise CGroupsException(
                    "Failed to read cpuacct.stat: {0}".format(ustr(e)))
            if not allow_no_such_file_or_directory_error:
                raise e
            return 0

        if contents is None:
            return 0

        parsed = re_user_system_times.match(contents)
        if not parsed:
            raise CGroupsException(
                "The contents of {0} are invalid: {1}".format(
                    self._get_cgroup_file('cpuacct.stat'), contents))

        first_counter, second_counter = parsed.groups()[0], parsed.groups()[1]
        return int(first_counter) + int(second_counter)
示例#2
0
    def _get_parameters(self, parameter_name, first_line_only=False):
        """
        Read a parameter file of this controller and return its contents split into lines.

        :param str parameter_name: Name of file within that metric controller
        :param bool first_line_only: when True, return only the first line (a str) instead of the list of lines
        :return: the lines of the file without line terminators, or only the first of them if first_line_only is set
        :raises CGroupsException: when an IndexError occurs (e.g. the first line of an empty file was requested)
            or when reading fails for any reason other than the file not existing
        :raises IOError, OSError: re-raised unchanged when errno is ENOENT
        """
        try:
            lines = self._get_file_contents(parameter_name).splitlines()
            if first_line_only:
                return lines[0]
            return lines
        except IndexError:
            message = "File {0} is empty but should not be".format(
                self._get_cgroup_file(parameter_name))
            logger.error(message)
            raise CGroupsException(message)
        except Exception as e:
            # A missing file is passed through to the caller untouched
            if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:
                raise e
            file_path = self._get_cgroup_file(parameter_name)
            logger.error("Exception while attempting to read {0}: {1}".format(
                file_path, ustr(e)))
            raise CGroupsException(
                "Exception while attempting to read {0}".format(file_path), e)
示例#3
0
    def _try_mkdir(path):
        """
        Create the directory (and any missing parents) with mode 0o755 unless it already exists as a directory.

        Losing a race with another creator of the same directory is not an error. A CGroupsException is raised
        when a non-directory already exists with that name or when permission is denied; any other OSError is
        re-raised as is.

        :param path: str
        """
        if os.path.isdir(path):
            return

        try:
            os.makedirs(path, 0o755)
        except OSError as e:  # pylint: disable=C0103
            if e.errno == errno.EACCES:
                # This is unexpected, as the agent runs as root
                raise CGroupsException(
                    "Create directory for cgroup {0}: permission denied".
                    format(path))
            if e.errno != errno.EEXIST:
                raise
            if not os.path.isdir(path):
                raise CGroupsException(
                    "Create directory for cgroup {0}: normal file already exists with that name"
                    .format(path))
            # Otherwise there was a race to create the directory, but it's there now, and that's fine
示例#4
0
        def enable(self):
            """
            Marks cgroups as enabled.

            Raises a CGroupsException, leaving the state untouched, when self._cgroups_supported is false.
            """
            if self._cgroups_supported:
                self._cgroups_enabled = True
                return

            raise CGroupsException(
                "Attempted to enable cgroups, but they are not supported on the current platform"
            )
示例#5
0
    def create_agent_cgroups(self):
        """
        Creates a cgroup for the VM Agent in each of the controllers we are tracking; returns the created cgroups.

        For every controller handed to the callback, the agent's cgroup directory is created if needed, the current
        process is added to it and a CGroup object is collected. A failure for one controller is logged as a
        warning and does not stop the others; a CGroupsException is raised only when no cgroup was created at all.
        """
        agent_pid = int(os.getpid())
        created = []

        def create_cgroup(controller):
            try:
                cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller,
                                           VM_AGENT_CGROUP_NAME)

                already_exists = os.path.isdir(cgroup_path)
                if not already_exists:
                    FileSystemCgroupsApi._try_mkdir(cgroup_path)
                    logger.info("Created cgroup {0}".format(cgroup_path))

                self._add_process_to_cgroup(agent_pid, cgroup_path)

                agent_cgroup = CGroup.create(cgroup_path, controller,
                                             VM_AGENT_CGROUP_NAME)
                created.append(agent_cgroup)

            except Exception as e:
                logger.warn(
                    'Cannot create "{0}" cgroup for the agent. Error: {1}'.
                    format(controller, ustr(e)))

        self._foreach_controller(create_cgroup,
                                 'Will not create a cgroup for the VM Agent')

        if not created:
            raise CGroupsException(
                "Failed to create any cgroup for the VM Agent")

        return created
示例#6
0
    def create_agent_cgroups(self):
        """
        Creates a cgroup for the VM Agent in each of the controllers we are tracking; returns the created cgroups.

        The per-controller callback creates the agent's cgroup directory if it is missing, adds the current
        process to it and collects the resulting CGroup. Errors raised by the callback are not handled here;
        they propagate to _foreach_controller. A CGroupsException is raised when no cgroup was created.
        """
        agent_pid = int(os.getpid())
        created = []

        def create_cgroup(controller):
            cgroup_path = FileSystemCgroupsApi._get_agent_cgroup_path(controller)

            already_exists = os.path.isdir(cgroup_path)
            if not already_exists:
                FileSystemCgroupsApi._try_mkdir(cgroup_path)
                logger.info("Created cgroup {0}".format(cgroup_path))

            self._add_process_to_cgroup(agent_pid, cgroup_path)

            agent_cgroup = CGroup.create(cgroup_path, controller, VM_AGENT_CGROUP_NAME)
            created.append(agent_cgroup)

        self._foreach_controller(
            create_cgroup,
            'Failed to create a cgroup for the VM Agent; resource usage will not be tracked')

        if not created:
            raise CGroupsException("Failed to create any cgroup for the VM Agent")

        return created
示例#7
0
    def get_cpu_usage(self):
        """
        Computes the CPU used by the cgroup since the last call to this function.

        The result is the cgroup's tick delta divided by the system's tick delta, expressed as a percentage
        rounded to 3 decimals; i.e. a percentage of utilization of all cores in the system. For example,
        using 1 core at 100% on a 4-core system would be reported as 25%.

        NOTE: initialize_cpu_usage() must be invoked before calling get_cpu_usage(); otherwise a
        CGroupsException is raised.
        """
        if not self._cpu_usage_initialized():
            raise CGroupsException(
                "initialize_cpu_usage() must be invoked before the first call to get_cpu_usage()"
            )

        # The samples taken by the previous call become the baseline for this one
        self._previous_cgroup_cpu = self._current_cgroup_cpu
        self._previous_system_cpu = self._current_system_cpu
        self._current_cgroup_cpu = self._get_cpu_ticks()
        self._current_system_cpu = self._osutil.get_total_cpu_ticks_since_boot()

        cgroup_ticks = self._current_cgroup_cpu - self._previous_cgroup_cpu
        system_ticks = self._current_system_cpu - self._previous_system_cpu
        if system_ticks < 1:
            # never divide by zero (or by a negative delta)
            system_ticks = 1

        percentage = 100.0 * float(cgroup_ticks) / float(system_ticks)
        return round(percentage, 3)
示例#8
0
 def create_and_start_unit(unit_filename, unit_contents):
     """
     Writes unit_contents to unit_filename under UNIT_FILES_FILE_SYSTEM_PATH, then runs
     "systemctl daemon-reload" followed by "systemctl start <unit_filename>".

     Any exception raised along the way is reported as a CGroupsException.
     """
     try:
         fileutil.write_file(os.path.join(UNIT_FILES_FILE_SYSTEM_PATH, unit_filename), unit_contents)
         for systemctl_arguments in (["daemon-reload"], ["start", unit_filename]):
             shellutil.run_command(["systemctl"] + systemctl_arguments)
     except Exception as e:
         error = "Failed to create and start {0}. Error: {1}".format(unit_filename, ustr(e))
         raise CGroupsException(error)
 def create(cgroup_path, controller, extension_name):
     """
     Factory method to create the correct CGroup.

     Returns a CpuCgroup for the "cpu" controller and a MemoryCgroup for the "memory" controller, each built
     from (extension_name, cgroup_path). Any other controller raises a CGroupsException.
     """
     if controller == "cpu":
         cgroup_class = CpuCgroup
     elif controller == "memory":
         cgroup_class = MemoryCgroup
     else:
         raise CGroupsException('CGroup controller {0} is not supported'.format(controller))

     return cgroup_class(extension_name, cgroup_path)
示例#10
0
 def initialize_cpu_usage(self):
     """
     Takes the first samples of the cgroup's and the system's CPU ticks. This function must be invoked
     before calling get_cpu_usage(), and only once: a second invocation raises a CGroupsException.

     The cgroup sample tolerates a missing cpuacct.stat (see _get_cpu_ticks).
     """
     already_initialized = self._cpu_usage_initialized()
     if already_initialized:
         raise CGroupsException(
             "initialize_cpu_usage() should be invoked only once")

     self._current_cgroup_cpu = self._get_cpu_ticks(allow_no_such_file_or_directory_error=True)
     self._current_system_cpu = self._osutil.get_total_cpu_ticks_since_boot()
示例#11
0
 def _get_controller_id(controller):
     """
     Get the ID for a given cgroup controller

     Scans /proc/cgroups for the line whose first tab-separated field equals the controller name and returns
     that line's second field. Raises a CGroupsException when no line matches.
     """
     for line in fileutil.read_file("/proc/cgroups").splitlines():
         columns = line.split('\t')
         if columns[0] != controller:
             continue
         return columns[1]

     raise CGroupsException(
         "Cgroup controller {0} not found in /proc/cgroups".format(
             controller))
示例#12
0
 def _get_current_process_cgroup_relative_path(controller_id):
     """
     Get the cgroup path "suffix" for this process for the given controller. The leading "/" is always stripped,
     so the suffix is suitable for passing to os.path.join(). (If the process is in the root cgroup, an empty
     string is returned, and os.path.join() will still do the right thing.)

     Each line of /proc/self/cgroup is expected to have the form "hierarchy-ID:controller-list:cgroup-path".
     The line is split at most twice so that a cgroup path that itself contains ':' is returned whole, and
     lines that do not have all three fields are skipped.

     Raises a CGroupsException when no entry matches controller_id.
     """
     cgroup_paths = fileutil.read_file("/proc/self/cgroup")
     for entry in cgroup_paths.splitlines():
         # maxsplit=2: only the first two ':' are field separators, the rest belongs to the path
         fields = entry.split(':', 2)
         if len(fields) == 3 and fields[0] == controller_id:
             return fields[2].lstrip(os.path.sep)
     raise CGroupsException(
         "This process belongs to no cgroup for controller ID {0}".format(
             controller_id))
示例#13
0
    def get_throttled_time(self):
        """
        Computes the throttled time (in seconds) since the last call to this function.

        The previously sampled counter becomes the baseline, a new sample is taken with _get_throttled_time()
        and the difference is divided by 1E9.

        NOTE: initialize_cpu_usage() must be invoked before calling this function; otherwise a
        CGroupsException is raised.
        """
        if not self._cpu_usage_initialized():
            raise CGroupsException(
                "initialize_cpu_usage() must be invoked before the first call to get_throttled_time()"
            )

        self._previous_throttled_time = self._current_throttled_time
        self._current_throttled_time = self._get_throttled_time()

        throttled_delta = self._current_throttled_time - self._previous_throttled_time
        return float(throttled_delta) / 1E9
示例#14
0
 def _get_throttled_time(self):
     """
     Returns the value of the throttled_time entry in this cgroup's cpu.stat file.

     Returns 0 when cpu.stat does not exist (ENOENT). Any other failure, including a cpu.stat without a
     throttled_time entry, is reported as a CGroupsException.
     """
     try:
         throttled_time = None
         with open(os.path.join(self.path, 'cpu.stat')) as cpu_stat:
             #
             # Sample file:
             #
             #   # cat /sys/fs/cgroup/cpuacct/azure.slice/walinuxagent.service/cpu.stat
             #   nr_periods  51660
             #   nr_throttled 19461
             #   throttled_time 1529590856339
             #
             for line in cpu_stat:
                 found = re.match(r'throttled_time\s+(\d+)', line)
                 if found is not None:
                     throttled_time = int(found.group(1))
                     break
         if throttled_time is None:
             raise Exception("Cannot find throttled_time")
         return throttled_time
     except Exception as e:
         # A missing file simply means there is nothing to report
         if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:
             return 0
         raise CGroupsException("Failed to read cpu.stat: {0}".format(
             ustr(e)))
    def test_check_cgroups_should_disable_cgroups_when_a_check_fails(self):
        """
        For each of the configurator's checks in turn: patch that check to raise a CGroupsException and the
        other one to do nothing, enable cgroups, run check_cgroups() and verify that cgroups end up disabled
        and that exactly one CGroupsDisabled event carrying the error was emitted.
        """
        with self._get_cgroup_configurator() as configurator:
            checks = ["_check_processes_in_agent_cgroup", "_check_agent_throttled_time"]

            for method_to_fail in checks:
                active_patchers = []
                try:
                    # mock 'method_to_fail' to raise an exception and the rest to do nothing
                    for method_to_mock in checks:
                        if method_to_mock == method_to_fail:
                            side_effect = CGroupsException(method_to_fail)
                        else:
                            side_effect = lambda *_: None
                        patcher = patch.object(configurator, method_to_mock, side_effect=side_effect)
                        active_patchers.append(patcher)
                        patcher.start()

                    with patch("azurelinuxagent.common.cgroupconfigurator.add_event") as add_event:
                        configurator.enable()

                        configurator.check_cgroups([])

                        self.assertFalse(
                            configurator.enabled(),
                            "An error in {0} should have disabled cgroups".format(method_to_fail))

                        disable_events = []
                        for _, kwargs in add_event.call_args_list:
                            if kwargs["op"] == WALAEventOperation.CGroupsDisabled:
                                disable_events.append(kwargs)

                        self.assertTrue(
                            len(disable_events) == 1,
                            "Exactly 1 event should have been emitted when {0} fails. Got: {1}".format(
                                method_to_fail, disable_events))
                        self.assertIn(
                            "[CGroupsException] {0}".format(method_to_fail),
                            disable_events[0]["message"],
                            "The error message is not correct when {0} failed".format(method_to_fail))
                finally:
                    # undo every patch that was registered, even if an assertion failed
                    for patcher in active_patchers:
                        patcher.stop()
示例#16
0
 def remove_extension_cgroups(self, extension_name):
     """
     Stops the slice unit of the given extension, deletes its unit file from UNIT_FILES_FILE_SYSTEM_PATH and
     runs "systemctl daemon-reload". Any failure in those steps is reported as a CGroupsException.
     """
     # For transient units, cgroups are released automatically when the unit stops, so it is sufficient
     # to call stop on them. Persistent cgroups are released when the unit is disabled and its configuration
     # file is deleted.
     # The assumption is that this method is called after the extension has been uninstalled. For now, since
     # we're running extensions within transient scopes which clean up after they finish running, no removal
     # of units is needed. In the future, when the extension is running under its own slice,
     # the following clean up is needed.
     slice_name = self._get_extension_slice_name(extension_name)
     try:
         slice_file = os.path.join(UNIT_FILES_FILE_SYSTEM_PATH, slice_name)
         shellutil.run_command(["systemctl", "stop", slice_name])
         fileutil.rm_files(slice_file)
         shellutil.run_command(["systemctl", "daemon-reload"])
     except Exception as e:
         error = "Failed to remove {0}. Error: {1}".format(slice_name, ustr(e))
         raise CGroupsException(error)
示例#17
0
    def assert_limits(self,
                      _,
                      patch_set_cpu,
                      patch_set_memory_limit,
                      patch_get_enforce,
                      patch_add_event,
                      ext_name,
                      expected_cpu_limit,
                      limits_enforced=True,
                      exception_raised=False):
        """
        Helper that runs CGroupConfigurator.for_extension(ext_name).set_limits() against the given patches and
        verifies the outcome.

        Limits are expected to be set (and an event emitted) only when expected_cpu_limit is greater than 0.
        When exception_raised is True, set_memory_limit is made to raise a CGroupsException and the call is
        expected to fail with it; otherwise no CGroupsException is expected.
        """
        patch_get_enforce.return_value = limits_enforced
        if exception_raised:
            patch_set_memory_limit.side_effect = CGroupsException(
                'set_memory_limit error')

        try:
            CGroupConfigurator.for_extension(ext_name).set_limits()
        except CGroupsException:
            if not exception_raised:
                self.fail('exception not expected')
        else:
            if exception_raised:
                self.fail('exception expected')

        limits_expected = expected_cpu_limit > 0
        for patched in (patch_set_cpu, patch_set_memory_limit, patch_add_event):
            self.assertEqual(limits_expected, patched.called)

        if not limits_expected:
            return

        cpu_limit = patch_set_cpu.call_args[0][0]
        memory_limit = patch_set_memory_limit.call_args[0][0]
        event = patch_add_event.call_args[1]
        message = event['message']

        self.assertEqual(expected_cpu_limit, cpu_limit)
        self.assertTrue(memory_limit >= DEFAULT_MEM_LIMIT_MIN_MB)
        self.assertEqual(event['op'], 'SetCGroupsLimits')
        self.assertEqual(event['is_success'], not exception_raised)
        self.assertTrue('{0}%'.format(expected_cpu_limit) in message)
        self.assertTrue(ext_name in message)
        # the error text must appear in the event if, and only if, the failure was injected
        self.assertEqual(exception_raised, 'set_memory_limit error' in message)
示例#18
0
        def _check_processes_in_agent_cgroup(self):
            """
            Verifies that the agent's cgroup includes only the current process, its parent, commands started using shellutil and instances of systemd-run
            (those processes correspond, respectively, to the extension handler, the daemon, commands started by the extension handler, and the systemd-run
            commands used to start extensions on their own cgroup).
            Other processes started by the agent (e.g. extensions) and processes not started by the agent (e.g. services installed by extensions) are reported
            as unexpected, since they should belong to their own cgroup.

            Errors raised while gathering or inspecting the processes are logged as a warning and do not fail the check by themselves; whatever unexpected
            processes were collected before the error are still reported.

            Raises a CGroupsException if the check fails
            """
            # formatted descriptions of the offending processes (at most 5 are collected)
            unexpected = []

            try:
                daemon = os.getppid()
                extension_handler = os.getpid()
                # snapshot of the commands started by the agent, taken before reading the cgroup...
                agent_commands = set()
                agent_commands.update(shellutil.get_running_commands())
                systemd_run_commands = set()
                systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands())
                agent_cgroup = CGroupsApi.get_processes_in_cgroup(self._agent_cpu_cgroup_path)
                # get the running commands again in case new commands started or completed while we were fetching the processes in the cgroup;
                # (the sets are only added to, so they hold the union of both snapshots)
                agent_commands.update(shellutil.get_running_commands())
                systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands())

                for process in agent_cgroup:
                    # Note that the agent uses systemd-run to start extensions; systemd-run belongs to the agent cgroup, though the extensions don't.
                    if process in (daemon, extension_handler) or process in systemd_run_commands:
                        continue
                    # systemd_run_commands contains the shell that started systemd-run, so we also need to check for the parent
                    if self._get_parent(process) in systemd_run_commands and self._get_command(process) == 'systemd-run':
                        continue
                    # check if the process is a command started by the agent or a descendant of one of those commands
                    # (walk up the chain of parents until reaching an agent command or a parent of 0)
                    current = process
                    while current != 0 and current not in agent_commands:
                        current = self._get_parent(current)
                    # reaching 0 means that no ancestor is a command started by the agent
                    if current == 0:
                        unexpected.append(self.__format_process(process))
                        if len(unexpected) >= 5:  # collect just a small sample
                            break
            except Exception as exception:
                _log_cgroup_warning("Error checking the processes in the agent's cgroup: {0}".format(ustr(exception)))

            if len(unexpected) > 0:
                raise CGroupsException("The agent's cgroup includes unexpected processes: {0}".format(unexpected))
示例#19
0
 def collect_data(self, cgroup):
     """
     Samples the given cgroup and appends the values to this object's metric lists.

     For a "cpu" cgroup the CPU usage is appended to self._cpu_usage; for a "memory" cgroup the memory usage
     and the max memory usage are appended to self._memory_usage and self._max_memory_usage.

     This method does not raise: an unsupported controller, or any error while sampling, is reported with a
     periodic warning, except for IOError/OSError with errno ENOENT, which are ignored silently.
     """
     # noinspection PyBroadException
     try:
         if cgroup.controller == "cpu":
             self._cpu_usage.append(cgroup.get_cpu_usage())
         elif cgroup.controller == "memory":
             self._memory_usage.append(cgroup.get_memory_usage())
             self._max_memory_usage.append(cgroup.get_max_memory_usage())
         else:
             # Use cgroup.controller: there is no local named 'controller', so referencing it would
             # raise a NameError and hide the real reason in the warning below
             raise CGroupsException(
                 'CGroup controller {0} is not supported'.format(
                     cgroup.controller))
     except Exception as e:
         if not isinstance(e,
                           (IOError, OSError)) or e.errno != errno.ENOENT:
             logger.periodic_warn(
                 logger.EVERY_HALF_HOUR,
                 'Could not collect metrics for cgroup {0}. Error : {1}'.
                 format(cgroup.path, ustr(e)))
示例#20
0
    def get_max_memory_usage(self):
        """
        Collect memory.max_usage_in_bytes from the cgroup.

        :return: the first line of memory.max_usage_in_bytes converted to int
        :rtype: int
        :raises IOError, OSError: re-raised unchanged when the file does not exist (errno ENOENT)
        :raises CGroupsException: for any other failure while reading the file
        """
        try:
            usage = self._get_parameters('memory.max_usage_in_bytes',
                                         first_line_only=True)
        except Exception as e:  # pylint: disable=C0103
            if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:  # pylint: disable=E1101
                raise
            # Report the file that was actually read (memory.max_usage_in_bytes, not memory.usage_in_bytes)
            raise CGroupsException(
                "Exception while attempting to read {0}".format(
                    "memory.max_usage_in_bytes"), e)

        return int(usage)
示例#21
0
    def _get_current_cpu_total(self):
        """
        Compute the number of USER_HZ of CPU time (user and system) consumed by this cgroup since boot.

        Returns 0 when cpuacct.stat is empty or its contents do not match the expected format. An
        IOError/OSError with errno ENOENT is re-raised unchanged; any other read failure is reported as a
        CGroupsException.

        :return: int
        """
        try:
            contents = self._get_file_contents('cpuacct.stat')
        except Exception as e:
            if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:
                raise e
            raise CGroupsException("Exception while attempting to read {0}".format("cpuacct.stat"), e)

        if not contents:
            return 0

        parsed = re_user_system_times.match(contents)
        if not parsed:
            return 0

        first_counter, second_counter = parsed.groups()[0], parsed.groups()[1]
        return int(first_counter) + int(second_counter)
示例#22
0
    def create_agent_cgroups(self):
        """
        Returns the cpu and memory CGroup objects for the agent.

        The cgroup's relative path is taken from the "name=systemd" entry of /proc/self/cgroup (with the leading
        path separator stripped) and appended to the 'cpu' and 'memory' directories under
        CGROUPS_FILE_SYSTEM_ROOT; the same relative path is used as the name of both cgroups.

        Each line of /proc/self/cgroup is expected to have the form "hierarchy-ID:controller-list:cgroup-path";
        it is split at most twice so that a path containing ':' is kept whole, and lines without all three
        fields are ignored.

        Raises a CGroupsException on any failure, including the absence of a "name=systemd" entry.
        """
        try:
            cgroup_unit = None
            cgroup_paths = fileutil.read_file("/proc/self/cgroup")
            for entry in cgroup_paths.splitlines():
                fields = entry.split(':', 2)
                if len(fields) == 3 and fields[1] == "name=systemd":
                    cgroup_unit = fields[2].lstrip(os.path.sep)

            if cgroup_unit is None:
                # Fail with an explicit reason instead of letting os.path.join() choke on None
                raise Exception('Cannot find the "name=systemd" entry in /proc/self/cgroup')

            cpu_cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, 'cpu',
                                           cgroup_unit)
            memory_cgroup_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT,
                                              'memory', cgroup_unit)

            return [
                CGroup.create(cpu_cgroup_path, 'cpu', cgroup_unit),
                CGroup.create(memory_cgroup_path, 'memory', cgroup_unit)
            ]
        except Exception as e:
            raise CGroupsException(
                "Failed to get paths of agent's cgroups. Error: {0}".format(
                    ustr(e)))
示例#23
0
 def enable(self):
     """
     Enables cgroups for both the agent and the extensions and applies the CPU quota returned by
     conf.get_agent_cpu_quota().

     Raises a CGroupsException, without changing any state, when self.supported() is false.
     """
     if self.supported():
         self._agent_cgroups_enabled = True
         self._extensions_cgroups_enabled = True
         self.__set_cpu_quota(conf.get_agent_cpu_quota())
         return

     raise CGroupsException("Attempted to enable cgroups, but they are not supported on the current platform")
        def enable(self):
            """
            Sets the enabled flag.

            Raises a CGroupsException, leaving the flag untouched, when self._cgroups_supported is false.
            """
            if self._cgroups_supported:
                self._enabled = True
                return

            raise CGroupsException(
                "cgroups are not supported on the current platform")
示例#25
0
    def poll_all_tracked():
        """
        Samples every tracked cgroup and returns the collected values as a list of MetricValue.

        While holding CGroupsTelemetry._rlock, each cgroup in CGroupsTelemetry._tracked is polled according to
        its controller: CPU cgroups report their CPU usage; MEMORY cgroups report their memory usage, their max
        memory usage and the proc statm memory usage of each of their tracked processes. Every value is also
        added to the CgroupMetrics entry kept for the cgroup's name in CGroupsTelemetry._cgroup_metrics.

        Errors are not raised to the caller: they are reported with a periodic warning, except for
        IOError/OSError with errno ENOENT, which are ignored. Cgroups that are no longer active are removed
        from tracking and their metrics entry is marked for deletion.
        """
        metrics = []

        with CGroupsTelemetry._rlock:
            # iterate over a slice of the tracked cgroups, since stop_tracking() is called within the loop
            for cgroup in CGroupsTelemetry._tracked[:]:
                # make sure there is an entry to accumulate this cgroup's metrics
                if cgroup.name not in CGroupsTelemetry._cgroup_metrics:
                    CGroupsTelemetry._cgroup_metrics[
                        cgroup.name] = CgroupMetrics()
                try:
                    if cgroup.controller == CGroupContollers.CPU:
                        current_cpu_usage = cgroup.get_cpu_usage()
                        CGroupsTelemetry._cgroup_metrics[
                            cgroup.name].add_cpu_usage(current_cpu_usage)
                        metrics.append(
                            MetricValue(MetricsCategory.PROCESS_CATEGORY,
                                        MetricsCounter.PROCESSOR_PERCENT_TIME,
                                        cgroup.name, current_cpu_usage))
                    elif cgroup.controller == CGroupContollers.MEMORY:
                        current_memory_usage = cgroup.get_memory_usage()
                        CGroupsTelemetry._cgroup_metrics[
                            cgroup.name].add_memory_usage(current_memory_usage)
                        metrics.append(
                            MetricValue(MetricsCategory.MEMORY_CATEGORY,
                                        MetricsCounter.TOTAL_MEM_USAGE,
                                        cgroup.name, current_memory_usage))

                        max_memory_usage = cgroup.get_max_memory_usage()
                        CGroupsTelemetry._cgroup_metrics[
                            cgroup.name].add_max_memory_usage(max_memory_usage)
                        metrics.append(
                            MetricValue(MetricsCategory.MEMORY_CATEGORY,
                                        MetricsCounter.MAX_MEM_USAGE,
                                        cgroup.name, max_memory_usage))

                        # per-process memory; a failure for one pid does not prevent sampling the others
                        pids = cgroup.get_tracked_processes()
                        for pid in pids:
                            try:
                                mem_usage_from_procstatm = MemoryResourceUsage.get_memory_usage_from_proc_statm(
                                    pid)
                                metrics.append(
                                    MetricValue(
                                        MetricsCategory.MEMORY_CATEGORY,
                                        MetricsCounter.MEM_USED_BY_PROCESS,
                                        CGroupsTelemetry.
                                        get_process_info_summary(pid),
                                        mem_usage_from_procstatm))
                                CGroupsTelemetry._cgroup_metrics[
                                    cgroup.name].add_proc_statm_memory(
                                        CGroupsTelemetry.
                                        get_process_info_summary(pid),
                                        mem_usage_from_procstatm)
                            except Exception as e:
                                # ENOENT is not logged; everything else gets a periodic warning
                                if not isinstance(
                                        e,
                                    (IOError,
                                     OSError)) or e.errno != errno.ENOENT:
                                    logger.periodic_warn(
                                        logger.EVERY_HOUR,
                                        "[PERIODIC] Could not collect proc_statm "
                                        "for pid {0}. Error : {1}", pid,
                                        ustr(e))
                    else:
                        # caught by the handler below, so it ends up as a periodic warning
                        raise CGroupsException(
                            'CGroup controller {0} is not supported for cgroup {1}'
                            .format(cgroup.controller, cgroup.name))
                except Exception as e:
                    # There can be scenarios when the CGroup has been deleted by the time we are fetching the values
                    # from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log
                    # every occurrences of such case as it would be very verbose. We do want to log all the other
                    # exceptions which could occur, which is why we do a periodic log for all the other errors.
                    if not isinstance(
                            e, (IOError, OSError)) or e.errno != errno.ENOENT:
                        logger.periodic_warn(
                            logger.EVERY_HOUR,
                            '[PERIODIC] Could not collect metrics for cgroup '
                            '{0}. Error : {1}'.format(cgroup.name, ustr(e)))
                # inactive cgroups are dropped from tracking and their metrics are flagged for deletion
                if not cgroup.is_active():
                    CGroupsTelemetry.stop_tracking(cgroup)
                    CGroupsTelemetry._cgroup_metrics[
                        cgroup.name].marked_for_delete = True

        return metrics
示例#26
0
 def enable(self):
     """
     Enables cgroups and applies _AGENT_CPU_QUOTA as the CPU quota.

     Raises a CGroupsException, without changing any state, when self.supported() is false.
     """
     if self.supported():
         self._cgroups_enabled = True
         self.__set_cpu_quota(_AGENT_CPU_QUOTA)
         return

     raise CGroupsException("Attempted to enable cgroups, but they are not supported on the current platform")
示例#27
0
 def _check_agent_throttled_time(cgroup_metrics):
     """
     Raises a CGroupsException for the first metric whose instance is AGENT_NAME_TELEMETRY, whose counter is
     MetricsCounter.THROTTLED_TIME and whose value exceeds _AGENT_THROTTLED_TIME_THRESHOLD.
     """
     for metric in cgroup_metrics:
         is_agent_throttled_time = metric.instance == AGENT_NAME_TELEMETRY and metric.counter == MetricsCounter.THROTTLED_TIME
         if is_agent_throttled_time and metric.value > _AGENT_THROTTLED_TIME_THRESHOLD:
             raise CGroupsException("The agent has been throttled for {0} seconds".format(metric.value))
 def report_error(_, daemon_pid):
     """
     Always raises a CGroupsException stating that the daemon's PID was already added to the legacy cgroup.
     The first argument is ignored.
     """
     message = "The daemon's PID ({0}) was already added to the legacy cgroup; this invalidates resource usage data.".format(daemon_pid)
     raise CGroupsException(message)