def test_get_current_cpu_total_exception_handling(self):
        test_cpu_cg = CpuCgroup("test_extension", "dummy_path")
        self.assertRaises(IOError, test_cpu_cg._get_current_cpu_total)

        # Trigger ERRNO 20 (ENOTDIR, "Not a directory") by pointing the cgroup path at a regular file.
        test_cpu_cg = CpuCgroup("test_extension", os.path.join(data_dir, "cgroups", "cpu_mount", "cpuacct.stat"))
        self.assertRaises(CGroupsException, test_cpu_cg._get_current_cpu_total)
Example #2
    def test_is_active(self):
        test_cgroup = CpuCgroup("test_extension", self.tmp_dir)
        self.assertEqual(False, test_cgroup.is_active())

        with open(os.path.join(self.tmp_dir, "tasks"), mode="wb") as tasks:
            tasks.write(str(1000).encode())

        self.assertEqual(True, test_cgroup.is_active())
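
A minimal sketch of the check this test exercises, included here for reference only (an assumption about the behavior, not the agent's implementation): a cgroup counts as active when the "tasks" file under its path exists and lists at least one PID.

import os

def cgroup_is_active(cgroup_path):
    # Hypothetical helper mirroring the test above: active means the "tasks" file
    # is present and non-empty.
    tasks_file = os.path.join(cgroup_path, "tasks")
    try:
        with open(tasks_file, "rb") as tasks:
            return len(tasks.read().strip()) > 0
    except (IOError, OSError):
        return False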
Example #3
    def test_is_active_file_not_present(self, patch_periodic_warn):
        test_cgroup = CpuCgroup("test_extension", self.tmp_dir)
        self.assertEqual(False, test_cgroup.is_active())

        test_cgroup = MemoryCgroup(
            "test_extension",
            os.path.join(self.tmp_dir, "this_cgroup_does_not_exist"))
        self.assertEqual(False, test_cgroup.is_active())

        self.assertEqual(0, patch_periodic_warn.call_count)
    def test_get_cpu_usage(self, patch_get_proc_stat, *args):
        patch_get_proc_stat.return_value = fileutil.read_file(os.path.join(data_dir, "cgroups", "dummy_proc_stat"))
        test_cpu_cg = CpuCgroup("test_extension", os.path.join(data_dir, "cgroups", "cpu_mount"))

        # Mocking CPU consumption
        patch_get_proc_stat.return_value = fileutil.read_file(os.path.join(data_dir, "cgroups",
                                                                           "dummy_proc_stat_updated"))

        cpu_usage = test_cpu_cg.get_cpu_usage()

        self.assertEqual(5.114, cpu_usage)
Example #5
    def test_collect(self, patch_get_proc_stat, *args):
        patch_get_proc_stat.return_value = fileutil.read_file(os.path.join(data_dir, "cgroups", "dummy_proc_stat"))
        test_cpu_cg = CpuCgroup("test_extension", os.path.join(data_dir, "cgroups", "cpu_mount"))

        # Mocking CPU consumption
        patch_get_proc_stat.return_value = fileutil.read_file(os.path.join(data_dir, "cgroups",
                                                                           "dummy_proc_stat_updated"))

        collected_metric = test_cpu_cg.collect()[0]

        self.assertEqual("cpu", collected_metric.controller)
        self.assertEqual("% Processor Time", collected_metric.metric_name)
        self.assertEqual(5.114, collected_metric.value)
Example #6
    def test_initialize_cpu_usage_should_set_current_cpu_usage(self):
        cgroup = CpuCgroup("test", "/sys/fs/cgroup/cpu/system.slice/test")

        TestCpuCgroup.mock_read_file_map = {
            "/proc/stat":
            os.path.join(data_dir, "cgroups", "proc_stat_t0"),
            os.path.join(cgroup.path, "cpuacct.stat"):
            os.path.join(data_dir, "cgroups", "cpuacct.stat_t0")
        }

        cgroup.initialize_cpu_usage()

        self.assertEqual(cgroup._current_cgroup_cpu, 63763)  # pylint: disable=protected-access
        self.assertEqual(cgroup._current_system_cpu, 5496872)  # pylint: disable=protected-access
Example #7
    def test_get_throttled_time_should_return_the_value_since_its_last_invocation(
            self):
        test_file = os.path.join(self.tmp_dir, "cpu.stat")
        shutil.copyfile(os.path.join(data_dir, "cgroups", "cpu.stat_t0"),
                        test_file)  # throttled_time = 50
        cgroup = CpuCgroup("test", self.tmp_dir)
        cgroup.initialize_cpu_usage()
        shutil.copyfile(os.path.join(data_dir, "cgroups", "cpu.stat_t1"),
                        test_file)  # throttled_time = 2075541442327

        throttled_time = cgroup.get_throttled_time()

        self.assertEqual(throttled_time,
                         float(2075541442327 - 50) / 1E9,
                         "The value of throttled_time is incorrect")
Example #8
    def test_initialize_cpu_usage_should_set_the_cgroup_usage_to_0_when_the_cgroup_does_not_exist(self):
        cgroup = CpuCgroup("test", "/sys/fs/cgroup/cpu/system.slice/test")

        io_error_2 = IOError()
        io_error_2.errno = errno.ENOENT  # "No such file or directory"

        TestCpuCgroup.mock_read_file_map = {
            "/proc/stat": os.path.join(data_dir, "cgroups", "proc_stat_t0"),
            os.path.join(cgroup.path, "cpuacct.stat"): io_error_2
        }

        cgroup.initialize_cpu_usage()

        self.assertEqual(cgroup._current_cgroup_cpu, 0)  # pylint: disable=protected-access
        self.assertEqual(cgroup._current_system_cpu, 5496872)  # check the system usage just for test sanity # pylint: disable=protected-access
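
A minimal sketch of the tolerant read this test depends on, assuming the standard cgroup v1 cpuacct.stat layout ("user NNN" / "system NNN" lines); this is an illustration, not the agent's code. A missing file is treated as zero usage, while any other I/O error propagates.

import errno

def read_cpuacct_total_ticks(stat_path):
    # Sum the "user" and "system" tick counts from cpuacct.stat; a missing file
    # (errno.ENOENT) counts as zero usage, as the test above expects.
    try:
        with open(stat_path) as stat_file:
            fields = dict(line.split() for line in stat_file if line.strip())
            return int(fields.get("user", 0)) + int(fields.get("system", 0))
    except (IOError, OSError) as e:
        if e.errno == errno.ENOENT:
            return 0
        raise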
Example #9
    def test_generate_extension_metrics_telemetry_dictionary(self, *args):  # pylint: disable=unused-argument
        num_polls = 10
        num_extensions = 1

        cpu_percent_values = [random.randint(0, 100) for _ in range(num_polls)]

        # only verifying calculations and not validity of the values.
        memory_usage_values = [random.randint(0, 8 * 1024 ** 3) for _ in range(num_polls)]
        max_memory_usage_values = [random.randint(0, 8 * 1024 ** 3) for _ in range(num_polls)]

        # no need to initialize the CPU usage, since we mock get_cpu_usage() below
        with patch("azurelinuxagent.common.cgroup.CpuCgroup.initialize_cpu_usage"):
            for i in range(num_extensions):
                dummy_cpu_cgroup = CpuCgroup("dummy_extension_{0}".format(i), "dummy_cpu_path_{0}".format(i))
                CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup)

                dummy_memory_cgroup = MemoryCgroup("dummy_extension_{0}".format(i), "dummy_memory_path_{0}".format(i))
                CGroupsTelemetry.track_cgroup(dummy_memory_cgroup)

        self.assertEqual(2 * num_extensions, len(CGroupsTelemetry._tracked))

        with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage") as patch_get_memory_max_usage:
            with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage") as patch_get_memory_usage:
                with patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage") as patch_get_cpu_usage:
                    with patch("azurelinuxagent.common.cgroup.CGroup.is_active") as patch_is_active:
                        for i in range(num_polls):
                            patch_is_active.return_value = True
                            patch_get_cpu_usage.return_value = cpu_percent_values[i]
                            patch_get_memory_usage.return_value = memory_usage_values[i]  # example 200 MB
                            patch_get_memory_max_usage.return_value = max_memory_usage_values[i]  # example 450 MB
                            CGroupsTelemetry.poll_all_tracked()
Example #10
    def test_telemetry_in_place_leaf_cgroup(self):
        """
        Ensure this leaf (i.e. not root of cgroup tree) cgroup has distinct metrics from the root cgroup.
        """
        # Does nothing on systems where the default cgroup for a randomly-created process (like this test invocation)
        # is the root cgroup.
        cg = make_self_cgroups()
        root = make_root_cgroups()
        if cg.cgroups['cpu'] != root.cgroups['cpu']:
            ct = CGroupsTelemetry("test", cg)
            cpu = CpuCgroup(ct)
            self.assertLess(cpu._current_cpu_total, cpu._current_system_cpu)

            consume_cpu_time()  # Eat some CPU
            time.sleep(1)  # Generate some idle time
            cpu._update_cpu_data()
            self.assertLess(cpu._current_cpu_total, cpu._current_system_cpu)
Example #11
    def _track_new_extension_cgroups(num_extensions):
        for i in range(num_extensions):
            dummy_cpu_cgroup = CpuCgroup("dummy_extension_{0}".format(i),
                                         "dummy_cpu_path_{0}".format(i))
            CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup)

            dummy_memory_cgroup = MemoryCgroup(
                "dummy_extension_{0}".format(i),
                "dummy_memory_path_{0}".format(i))
            CGroupsTelemetry.track_cgroup(dummy_memory_cgroup)
Example #12
    def test_telemetry_inplace(self):
        """
        Test raw measures and basic statistics for the cgroup in which this process is currently running.
        """
        cg = make_self_cgroups()
        self.assertIn('cpu', cg.cgroups)
        self.assertIn('memory', cg.cgroups)
        ct = CGroupsTelemetry("test", cg)
        cpu = CpuCgroup(ct)
        self.assertGreater(cpu._current_system_cpu, 0)

        consume_cpu_time()  # Eat some CPU
        cpu._update_cpu_data()

        self.assertGreater(cpu._current_cpu_total, cpu._previous_cpu_total)
        self.assertGreater(cpu._current_system_cpu, cpu._previous_system_cpu)

        percent_used = cpu._get_cpu_percent()
        self.assertGreater(percent_used, 0)
    def test_cpu_cgroup_create(self, patch_get_proc_stat):
        patch_get_proc_stat.return_value = fileutil.read_file(os.path.join(data_dir, "cgroups", "dummy_proc_stat"))
        test_cpu_cg = CpuCgroup("test_extension", "dummy_path")

        self.assertEqual(398488, test_cpu_cg._current_system_cpu)
        self.assertEqual(0, test_cpu_cg._current_cpu_total)
        self.assertEqual(0, test_cpu_cg._previous_cpu_total)
        self.assertEqual(0, test_cpu_cg._previous_system_cpu)

        self.assertEqual("cpu", test_cpu_cg.controller)
Example #14
    def test_send_extension_metrics_telemetry_handling_cpu_cgroup_exceptions_errno2(self, patch_periodic_warn,  # pylint: disable=unused-argument
                                                                                    patch_cpu_usage, patch_add_metric,
                                                                                    *args):
        ioerror = IOError()
        ioerror.errno = 2
        patch_cpu_usage.side_effect = ioerror

        CGroupsTelemetry._tracked.append(CpuCgroup("cgroup_name", "/test/path"))

        PollResourceUsage().run()
        self.assertEqual(0, patch_periodic_warn.call_count)
        self.assertEqual(0, patch_add_metric.call_count)  # No metrics should be sent.
Example #15
        def stop_tracking_unit_cgroups(self, unit_name):
            """
            TODO: remove Memory cgroups from tracked list.
            """
            try:
                cpu_cgroup_path, _ = self._cgroups_api.get_unit_cgroup_paths(unit_name)

                if cpu_cgroup_path is not None:
                    CGroupsTelemetry.stop_tracking(CpuCgroup(unit_name, cpu_cgroup_path))

            except Exception as exception:
                logger.info("Failed to stop tracking resource usage for the extension service: {0}", ustr(exception))
Example #16
    def test_cpu_telemetry(self):
        """
        Test Cpu telemetry class
        """
        cg = make_self_cgroups()
        self.assertIn('cpu', cg.cgroups)
        ct = CGroupsTelemetry('test', cg)
        self.assertIs(cg, ct.cgroup)
        cpu = CpuCgroup(ct)
        self.assertIs(cg, cpu.cgt.cgroup)
        ticks_before = cpu._current_cpu_total
        consume_cpu_time()
        time.sleep(1)
        cpu._update_cpu_data()
        ticks_after = cpu._current_cpu_total
        self.assertGreater(ticks_after, ticks_before)
        p2 = cpu._get_cpu_percent()
        self.assertGreater(p2, 0)
        # when running under PyCharm, this is often > 100
        # on a multi-core machine
        self.assertLess(p2, 200)
Example #17
    def test_get_cpu_usage_should_return_the_cpu_usage_since_its_last_invocation(
            self):
        cgroup = CpuCgroup("test", "/sys/fs/cgroup/cpu/system.slice/test")

        TestCpuCgroup.mock_read_file_map = {
            "/proc/stat":
            os.path.join(data_dir, "cgroups", "proc_stat_t0"),
            os.path.join(cgroup.path, "cpuacct.stat"):
            os.path.join(data_dir, "cgroups", "cpuacct.stat_t0")
        }

        cgroup.initialize_cpu_usage()

        TestCpuCgroup.mock_read_file_map = {
            "/proc/stat":
            os.path.join(data_dir, "cgroups", "proc_stat_t1"),
            os.path.join(cgroup.path, "cpuacct.stat"):
            os.path.join(data_dir, "cgroups", "cpuacct.stat_t1")
        }

        cpu_usage = cgroup.get_cpu_usage()

        self.assertEqual(cpu_usage, 0.031)

        TestCpuCgroup.mock_read_file_map = {
            "/proc/stat":
            os.path.join(data_dir, "cgroups", "proc_stat_t2"),
            os.path.join(cgroup.path, "cpuacct.stat"):
            os.path.join(data_dir, "cgroups", "cpuacct.stat_t2")
        }

        cpu_usage = cgroup.get_cpu_usage()

        self.assertEqual(cpu_usage, 0.045)
Example #18
        def start_tracking_unit_cgroups(self, unit_name):
            """
            TODO: Start tracking Memory Cgroups
            """
            try:
                cpu_cgroup_path, _ = self._cgroups_api.get_unit_cgroup_paths(unit_name)

                if cpu_cgroup_path is None:
                    logger.info("The CPU controller is not mounted; will not track resource usage")
                else:
                    CGroupsTelemetry.track_cgroup(CpuCgroup(unit_name, cpu_cgroup_path))

            except Exception as exception:
                logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(exception))
Example #19
        def initialize(self):
            try:
                if self._initialized:
                    return

                # check whether cgroup monitoring is supported on the current distro
                self._cgroups_supported = CGroupsApi.cgroups_supported()
                if not self._cgroups_supported:
                    logger.info("Cgroup monitoring is not supported on {0}", get_distro())
                    return

                # check that systemd is detected correctly
                self._cgroups_api = SystemdCgroupsApi()
                if not systemd.is_systemd():
                    _log_cgroup_warning("systemd was not detected on {0}", get_distro())
                    return

                _log_cgroup_info("systemd version: {0}", systemd.get_version())

                # This is temporarily disabled while we analyze telemetry. Likely it will be removed.
                # self.__collect_azure_unit_telemetry()
                # self.__collect_agent_unit_files_telemetry()

                if not self.__check_no_legacy_cgroups():
                    return

                agent_unit_name = systemd.get_agent_unit_name()
                agent_slice = systemd.get_unit_property(agent_unit_name, "Slice")
                if agent_slice not in (_AZURE_SLICE, "system.slice"):
                    _log_cgroup_warning("The agent is within an unexpected slice: {0}", agent_slice)
                    return

                self.__setup_azure_slice()

                cpu_controller_root, memory_controller_root = self.__get_cgroup_controllers()
                self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path = self.__get_agent_cgroups(agent_slice, cpu_controller_root, memory_controller_root)

                if self._agent_cpu_cgroup_path is not None:
                    _log_cgroup_info("Agent CPU cgroup: {0}", self._agent_cpu_cgroup_path)
                    self.enable()
                    CGroupsTelemetry.track_cgroup(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path))

                _log_cgroup_info('Cgroups enabled: {0}', self._cgroups_enabled)

            except Exception as exception:
                _log_cgroup_warning("Error initializing cgroups: {0}", ustr(exception))
            finally:
                self._initialized = True
Example #20
        def disable(self, reason, disableCgroups):
            # Todo: disable/reset extension when ext quotas introduced
            if disableCgroups == DisableCgroups.ALL:                 # disable all
                self._agent_cgroups_enabled = False
                self._extensions_cgroups_enabled = False
                self.__reset_agent_cpu_quota()
                CGroupsTelemetry.reset()
            elif disableCgroups == DisableCgroups.AGENT: # disable agent
                self._agent_cgroups_enabled = False
                self.__reset_agent_cpu_quota()
                CGroupsTelemetry.stop_tracking(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path))
            elif disableCgroups == DisableCgroups.EXTENSIONS: # disable extensions
                self._extensions_cgroups_enabled = False

            message = "[CGW] Disabling resource usage monitoring. Reason: {0}".format(reason)
            logger.info(message)  # log as INFO for now, in the future it should be logged as WARNING
            add_event(op=WALAEventOperation.CGroupsDisabled, message=message, is_success=False, log_event=False)
Example #21
        def stop_tracking_extension_cgroups(self, extension_name):
            """
            TODO: remove extension Memory cgroups from tracked list
            """
            try:
                extension_slice_name = SystemdCgroupsApi.get_extension_slice_name(extension_name)
                cgroup_relative_path = os.path.join(_AZURE_VMEXTENSIONS_SLICE,
                                                    extension_slice_name)

                cpu_cgroup_mountpoint, _ = self._cgroups_api.get_cgroup_mount_points()
                cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path)

                if cpu_cgroup_path is not None:
                    CGroupsTelemetry.stop_tracking(CpuCgroup(extension_name, cpu_cgroup_path))

            except Exception as exception:
                logger.info("Failed to stop tracking resource usage for the extension service: {0}", ustr(exception))
Example #22
    def test_initialize_cpu_usage_should_raise_an_exception_when_called_more_than_once(
            self):
        cgroup = CpuCgroup("test", "/sys/fs/cgroup/cpu/system.slice/test")

        TestCpuCgroup.mock_read_file_map = {
            "/proc/stat":
            os.path.join(data_dir, "cgroups", "proc_stat_t0"),
            os.path.join(cgroup.path, "cpuacct.stat"):
            os.path.join(data_dir, "cgroups", "cpuacct.stat_t0")
        }

        cgroup.initialize_cpu_usage()

        with self.assertRaises(CGroupsException):
            cgroup.initialize_cpu_usage()
Example #23
    def test_send_extension_metrics_telemetry_handling_cpu_cgroup_exceptions_errno2(
            self, patch_periodic_warn, patch_cpu_usage, patch_add_metric,
            *args):
        ioerror = IOError()
        ioerror.errno = 2
        patch_cpu_usage.side_effect = ioerror

        CGroupsTelemetry._tracked.append(CpuCgroup("cgroup_name",
                                                   "/test/path"))

        monitor_handler = get_monitor_handler()
        monitor_handler.init_protocols()
        monitor_handler.last_cgroup_polling_telemetry = datetime.datetime.utcnow() - timedelta(hours=1)
        monitor_handler.last_cgroup_report_telemetry = datetime.datetime.utcnow() - timedelta(hours=1)
        monitor_handler.poll_telemetry_metrics()
        self.assertEqual(0, patch_periodic_warn.call_count)
        self.assertEqual(0, patch_add_metric.call_count)  # No metrics should be sent.
        monitor_handler.stop()
Example #24
    def test_get_cpu_usage_should_return_the_cpu_usage_since_its_last_invocation(
            self):
        osutil = get_osutil()

        cgroup = CpuCgroup("test", "/sys/fs/cgroup/cpu/system.slice/test")

        TestCpuCgroup.mock_read_file_map = {
            "/proc/stat":
            os.path.join(data_dir, "cgroups", "proc_stat_t0"),
            os.path.join(cgroup.path, "cpuacct.stat"):
            os.path.join(data_dir, "cgroups", "cpuacct.stat_t0")
        }

        cgroup.initialize_cpu_usage()

        TestCpuCgroup.mock_read_file_map = {
            "/proc/stat":
            os.path.join(data_dir, "cgroups", "proc_stat_t1"),
            os.path.join(cgroup.path, "cpuacct.stat"):
            os.path.join(data_dir, "cgroups", "cpuacct.stat_t1")
        }

        cpu_usage = cgroup.get_cpu_usage()

        self.assertEqual(
            cpu_usage,
            round(100.0 * 0.000307697876885 * osutil.get_processor_cores(), 3))

        TestCpuCgroup.mock_read_file_map = {
            "/proc/stat":
            os.path.join(data_dir, "cgroups", "proc_stat_t2"),
            os.path.join(cgroup.path, "cpuacct.stat"):
            os.path.join(data_dir, "cgroups", "cpuacct.stat_t2")
        }

        cpu_usage = cgroup.get_cpu_usage()

        self.assertEqual(
            cpu_usage,
            round(100.0 * 0.000445181085968 * osutil.get_processor_cores(), 3))
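
The expected values above suggest the underlying formula: the cgroup's CPU percentage is its tick delta divided by the system-wide tick delta, scaled by the number of processor cores and rounded to three decimals. A hedged sketch of that calculation, inferred from the assertions rather than copied from the implementation:

def cpu_percentage(cgroup_ticks_delta, system_ticks_delta, processor_cores):
    # Fraction of system CPU time consumed by the cgroup over the sampling
    # interval, expressed as a percentage and scaled by the core count.
    return round(100.0 * (float(cgroup_ticks_delta) / system_ticks_delta) * processor_cores, 3)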
Example #25
    def test_get_tracked_metrics_should_return_the_throttled_time(self):
        cgroup = CpuCgroup("test", os.path.join(data_dir, "cgroups"))
        cgroup.initialize_cpu_usage()

        def find_throttled_time(metrics):
            return [
                m for m in metrics
                if m.counter == MetricsCounter.THROTTLED_TIME
            ]

        found = find_throttled_time(cgroup.get_tracked_metrics())
        self.assertTrue(
            len(found) == 0,
            "get_tracked_metrics should not fetch the throttled time by default. Found: {0}"
            .format(found))

        found = find_throttled_time(
            cgroup.get_tracked_metrics(track_throttled_time=True))
        self.assertTrue(
            len(found) == 1,
            "get_tracked_metrics should have fetched the throttled time when track_throttled_time=True. Found: {0}"
            .format(found))
Example #26
    def start_extension_command(
            self,
            extension_name,
            command,
            cmd_name,
            timeout,
            shell,
            cwd,
            env,
            stdout,
            stderr,
            error_code=ExtensionErrorCodes.PluginUnknownFailure):
        scope = "{0}_{1}".format(cmd_name, uuid.uuid4())
        extension_slice_name = self.get_extension_slice_name(extension_name)
        with self._systemd_run_commands_lock:
            process = subprocess.Popen(  # pylint: disable=W1509
                "systemd-run --unit={0} --scope --slice={1} {2}".format(
                    scope, extension_slice_name, command),
                shell=shell,
                cwd=cwd,
                stdout=stdout,
                stderr=stderr,
                env=env,
                preexec_fn=os.setsid)

            # We start systemd-run with shell == True so process.pid is the shell's pid, not the pid for systemd-run
            self._systemd_run_commands.append(process.pid)

        scope_name = scope + '.scope'

        logger.info("Started extension in unit '{0}'", scope_name)

        try:
            cgroup_relative_path = os.path.join(
                'azure.slice/azure-vmextensions.slice', extension_slice_name)

            cpu_cgroup_mountpoint, _ = self.get_cgroup_mount_points()

            if cpu_cgroup_mountpoint is None:
                logger.info(
                    "The CPU controller is not mounted; will not track resource usage"
                )
            else:
                cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint,
                                               cgroup_relative_path)
                CGroupsTelemetry.track_cgroup(
                    CpuCgroup(extension_name, cpu_cgroup_path))

        except IOError as e:
            if e.errno == 2:  # 'No such file or directory'
                logger.info(
                    "The extension command already completed; will not track resource usage"
                )
            logger.info(
                "Failed to start tracking resource usage for the extension: {0}",
                ustr(e))
        except Exception as e:
            logger.info(
                "Failed to start tracking resource usage for the extension: {0}",
                ustr(e))

        # Wait for process completion or timeout
        try:
            return handle_process_completion(process=process,
                                             command=command,
                                             timeout=timeout,
                                             stdout=stdout,
                                             stderr=stderr,
                                             error_code=error_code)
        except ExtensionError as e:
            # The extension didn't terminate successfully. Determine whether it was due to systemd errors or
            # extension errors.
            if not self._is_systemd_failure(scope, stderr):
                # There was an extension error; it either timed out or returned a non-zero exit code. Re-raise the error
                raise

            # There was an issue with systemd-run. We need to log it and retry the extension without systemd.
            process_output = read_output(stdout, stderr)
            # Reset the stdout and stderr
            stdout.truncate(0)
            stderr.truncate(0)

            if isinstance(e, ExtensionOperationError):
                # no-member: Instance of 'ExtensionError' has no 'exit_code' member (no-member) - Disabled: e is actually an ExtensionOperationError
                err_msg = 'Systemd process exited with code %s and output %s' % (
                    e.exit_code, process_output)  # pylint: disable=no-member
            else:
                err_msg = "Systemd timed-out, output: %s" % process_output
            raise SystemdRunError(err_msg)
        finally:
            with self._systemd_run_commands_lock:
                self._systemd_run_commands.remove(process.pid)
Example #27
        def initialize(self):
            try:
                if self._initialized:
                    return

                #
                # check whether cgroup monitoring is supported on the current distro
                #
                self._cgroups_supported = CGroupsApi.cgroups_supported()
                if not self._cgroups_supported:
                    logger.info("Cgroup monitoring is not supported on {0}",
                                get_distro())
                    return

                #
                # check systemd
                #
                self._cgroups_api = CGroupsApi.create()

                if not isinstance(self._cgroups_api, SystemdCgroupsApi):
                    message = "systemd was not detected on {0}".format(
                        get_distro())
                    logger.warn(message)
                    add_event(op=WALAEventOperation.CGroupsInitialize,
                              is_success=False,
                              message=message,
                              log_event=False)
                    return

                def log_cgroup_info(format_string, *args):
                    message = format_string.format(*args)
                    logger.info(message)
                    add_event(op=WALAEventOperation.CGroupsInfo,
                              message=message)

                def log_cgroup_warn(format_string, *args):
                    message = format_string.format(*args)
                    logger.warn(message)
                    add_event(op=WALAEventOperation.CGroupsInfo,
                              message=message,
                              is_success=False,
                              log_event=False)

                log_cgroup_info("systemd version: {0}",
                                self._cgroups_api.get_systemd_version())

                #
                # Older versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent.  When running
                # under systemd this could produce invalid resource usage data. Do not enable cgroups under this condition.
                #
                legacy_cgroups = self._cgroups_api.cleanup_legacy_cgroups()

                if legacy_cgroups > 0:
                    log_cgroup_warn(
                        "The daemon's PID was added to a legacy cgroup; will not monitor resource usage."
                    )
                    return

                #
                # check v1 controllers
                #
                cpu_controller_root, memory_controller_root = self._cgroups_api.get_cgroup_mount_points()

                if cpu_controller_root is not None:
                    logger.info("The CPU cgroup controller is mounted at {0}",
                                cpu_controller_root)
                else:
                    log_cgroup_warn("The CPU cgroup controller is not mounted")

                if memory_controller_root is not None:
                    logger.info(
                        "The memory cgroup controller is mounted at {0}",
                        memory_controller_root)
                else:
                    log_cgroup_warn(
                        "The memory cgroup controller is not mounted")

                #
                # check v2 controllers
                #
                cgroup2_mountpoint, cgroup2_controllers = self._cgroups_api.get_cgroup2_controllers()
                if cgroup2_mountpoint is not None:
                    log_cgroup_warn(
                        "cgroups v2 mounted at {0}.  Controllers: [{1}]",
                        cgroup2_mountpoint, cgroup2_controllers)

                #
                # check the cgroups for the agent
                #
                agent_unit_name = self._cgroups_api.get_agent_unit_name()
                cpu_cgroup_relative_path, memory_cgroup_relative_path = self._cgroups_api.get_process_cgroup_relative_paths(
                    "self")
                if cpu_cgroup_relative_path is None:
                    log_cgroup_warn(
                        "The agent's process is not within a CPU cgroup")
                else:
                    cpu_accounting = self._cgroups_api.get_unit_property(
                        agent_unit_name, "CPUAccounting")
                    log_cgroup_info('CPUAccounting: {0}', cpu_accounting)

                if memory_cgroup_relative_path is None:
                    log_cgroup_warn(
                        "The agent's process is not within a memory cgroup")
                else:
                    memory_accounting = self._cgroups_api.get_unit_property(
                        agent_unit_name, "MemoryAccounting")
                    log_cgroup_info('MemoryAccounting: {0}', memory_accounting)

                #
                # All good, enable cgroups and start monitoring the agent
                #
                self._cgroups_enabled = True

                if cpu_controller_root is None or cpu_cgroup_relative_path is None:
                    logger.info("Will not track CPU for the agent's cgroup")
                else:
                    self._agent_cpu_cgroup_path = os.path.join(
                        cpu_controller_root, cpu_cgroup_relative_path)
                    CGroupsTelemetry.track_cgroup(
                        CpuCgroup(agent_unit_name,
                                  self._agent_cpu_cgroup_path))

                if memory_controller_root is None or memory_cgroup_relative_path is None:
                    logger.info("Will not track memory for the agent's cgroup")
                else:
                    self._agent_memory_cgroup_path = os.path.join(
                        memory_controller_root, memory_cgroup_relative_path)
                    CGroupsTelemetry.track_cgroup(
                        MemoryCgroup(agent_unit_name,
                                     self._agent_memory_cgroup_path))

                log_cgroup_info("Agent cgroups: CPU: {0} -- MEMORY: {1}",
                                self._agent_cpu_cgroup_path,
                                self._agent_memory_cgroup_path)

            except Exception as e:
                message = "Error initializing cgroups: {0}".format(ustr(e))
                logger.warn(message)
                add_event(op=WALAEventOperation.CGroupsInitialize,
                          is_success=False,
                          message=message,
                          log_event=False)
            finally:
                self._initialized = True
Example #28
    def test_is_active_incorrect_file(self, patch_periodic_warn):
        open(os.path.join(self.tmp_dir, "tasks"), mode="wb").close()
        test_cgroup = CpuCgroup("test_extension", os.path.join(self.tmp_dir, "tasks"))
        self.assertEqual(False, test_cgroup.is_active())
        self.assertEqual(1, patch_periodic_warn.call_count)
Example #29
    def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr,
                                error_code=ExtensionErrorCodes.PluginUnknownFailure):
        scope = "{0}_{1}".format(self._get_extension_cgroup_name(extension_name), uuid.uuid4())

        process = subprocess.Popen(
            "systemd-run --unit={0} --scope {1}".format(scope, command),
            shell=shell,
            cwd=cwd,
            stdout=stdout,
            stderr=stderr,
            env=env,
            preexec_fn=os.setsid)

        scope_name = scope + '.scope'

        logger.info("Started extension in unit '{0}'", scope_name)

        try:
            # systemd-run creates the scope under the system slice by default
            cgroup_relative_path = os.path.join('system.slice', scope_name)

            cpu_cgroup_mountpoint, memory_cgroup_mountpoint = self.get_cgroup_mount_points()

            if cpu_cgroup_mountpoint is None:
                logger.info("The CPU controller is not mounted; will not track resource usage")
            else:
                cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path)
                CGroupsTelemetry.track_cgroup(CpuCgroup(extension_name, cpu_cgroup_path))

            if memory_cgroup_mountpoint is None:
                logger.info("The memory controller is not mounted; will not track resource usage")
            else:
                memory_cgroup_path = os.path.join(memory_cgroup_mountpoint, cgroup_relative_path)
                CGroupsTelemetry.track_cgroup(MemoryCgroup(extension_name, memory_cgroup_path))

        except IOError as e:
            if e.errno == 2:  # 'No such file or directory'
                logger.info("The extension command already completed; will not track resource usage")
            logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(e))
        except Exception as e:
            logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(e))

        # Wait for process completion or timeout
        try:
            process_output = handle_process_completion(process=process,
                                                       command=command,
                                                       timeout=timeout,
                                                       stdout=stdout,
                                                       stderr=stderr,
                                                       error_code=error_code)
        except ExtensionError as e:
            # The extension didn't terminate successfully. Determine whether it was due to systemd errors or
            # extension errors.
            process_output = read_output(stdout, stderr)
            systemd_failure = self._is_systemd_failure(scope, process_output)

            if not systemd_failure:
                # There was an extension error; it either timed out or returned a non-zero exit code. Re-raise the error
                raise
            else:
                # There was an issue with systemd-run. We need to log it and retry the extension without systemd.
                err_msg = 'Systemd process exited with code %s and output %s' % (e.exit_code, process_output) \
                    if isinstance(e, ExtensionOperationError) else "Systemd timed-out, output: %s" % process_output
                event_msg = 'Failed to run systemd-run for unit {0}.scope. ' \
                            'Will retry invoking the extension without systemd. ' \
                            'Systemd-run error: {1}'.format(scope, err_msg)
                add_event(op=WALAEventOperation.InvokeCommandUsingSystemd, is_success=False, log_event=False, message=event_msg)
                logger.warn(event_msg)

                # Reset the stdout and stderr
                stdout.truncate(0)
                stderr.truncate(0)

                # Try invoking the process again, this time without systemd-run
                logger.info('Extension invocation using systemd failed, falling back to regular invocation '
                            'without cgroups tracking.')
                process = subprocess.Popen(command,
                                           shell=shell,
                                           cwd=cwd,
                                           env=env,
                                           stdout=stdout,
                                           stderr=stderr,
                                           preexec_fn=os.setsid)

                process_output = handle_process_completion(process=process,
                                                           command=command,
                                                           timeout=timeout,
                                                           stdout=stdout,
                                                           stderr=stderr,
                                                           error_code=error_code)

                return process_output

        # The process terminated in time and successfully
        return process_output
Example #30
    def test_get_cpu_usage_should_raise_an_exception_when_initialize_cpu_usage_has_not_been_invoked(
            self):
        cgroup = CpuCgroup("test", "/sys/fs/cgroup/cpu/system.slice/test")

        with self.assertRaises(CGroupsException):
            cpu_usage = cgroup.get_cpu_usage()  # pylint: disable=unused-variable