def test_start_extension_command_should_start_tracking_the_extension_cgroups(self):
    # After an extension command is started, both the cpu and the memory cgroup of the
    # extension must be reported as tracked by CGroupsTelemetry.
    #
    # CPU usage is initialized when we begin tracking a CPU cgroup; since this test does not retrieve the
    # CPU usage, there is no need for initialization
    with patch("azurelinuxagent.common.cgroup.CpuCgroup.initialize_cpu_usage"):
        configurator = CGroupConfigurator.get_instance()
        configurator.start_extension_command(
            extension_name="Microsoft.Compute.TestExtension-1.2.3",
            command="date",
            timeout=300,
            shell=False,
            cwd=self.tmp_dir,
            env={},
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)

    # Same relative cgroup path under each controller
    extension_cgroup = "walinuxagent.extensions/Microsoft.Compute.TestExtension_1.2.3"
    for controller in ("cpu", "memory"):
        cgroup_path = os.path.join(self.cgroups_file_system_root, controller, extension_cgroup)
        self.assertTrue(CGroupsTelemetry.is_tracked(cgroup_path))
def run(self, child_args=None):
    """
    Logs the agent/OS/Python versions, prepares the process (pid check, environment,
    agent cgroups, optional FIPS variable) and then keeps calling self.daemon() while
    self.running is set. Any exception raised by the daemon is reported as an
    UnhandledError event and the daemon is restarted after a 15 second pause.

    :param child_args: passed through unchanged to self.daemon()
    """
    logger.info("{0} Version:{1}", AGENT_LONG_NAME, AGENT_VERSION)
    logger.info("OS: {0} {1}", DISTRO_NAME, DISTRO_VERSION)
    logger.info("Python: {0}.{1}.{2}", PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO)

    self.check_pid()
    self.initialize_environment()

    cgroups = CGroupConfigurator.get_instance()
    cgroups.create_agent_cgroups(track_cgroups=False)

    # If FIPS is enabled, set the OpenSSL environment variable
    # Note:
    # -- Subprocesses inherit the current environment
    fips_enabled = conf.get_fips_enabled()
    if fips_enabled:
        os.environ[OPENSSL_FIPS_ENVIRONMENT] = '1'

    while self.running:
        try:
            self.daemon(child_args)
        except Exception:
            # Capture the full traceback for telemetry before pausing
            failure = ustr(traceback.format_exc())
            add_event(name=AGENT_NAME,
                      is_success=False,
                      message=failure,
                      op=WALAEventOperation.UnhandledError)
            logger.warn("Daemon ended with exception -- Sleep 15 seconds and restart daemon")
            time.sleep(15)
def test_init_should_mount_the_cgroups_file_system(self):
    # Obtaining the configurator instance is expected to invoke mount_cgroups exactly once.
    mount_target = "azurelinuxagent.common.osutil.default.DefaultOSUtil.mount_cgroups"

    with patch(mount_target) as mock_mount_cgroups:
        CGroupConfigurator.get_instance()

        mount_calls = mock_mount_cgroups.call_count
        self.assertEqual(mount_calls, 1)
def _operation(self):
    """
    Polls every tracked cgroup, reports each resulting metric and finally hands the
    collected metrics to CGroupConfigurator.check_cgroups().
    """
    tracked_metrics = CGroupsTelemetry.poll_all_tracked()

    for sample in tracked_metrics:
        report_metric(
            sample.category,
            sample.counter,
            sample.instance,
            sample.value,
            log_event=self.__log_metrics)

    configurator = CGroupConfigurator.get_instance()
    configurator.check_cgroups(tracked_metrics)
def setUp(self):
    # Builds an ExtHandlerInstance for handler 'foo' version 1.2.3, redirects its base and log
    # directories into self.tmp_dir, shortens time.sleep and disables cgroups (remembering
    # whether they were enabled beforehand).
    AgentTestCase.setUp(self)

    properties = ExtHandlerProperties()
    properties.version = "1.2.3"

    handler = ExtHandler(name='foo')
    handler.properties = properties
    self.ext_handler = handler
    self.ext_handler_instance = ExtHandlerInstance(ext_handler=handler, protocol=None)

    # The base directory of the handler instance is the test's temporary directory
    self.mock_get_base_dir = patch(
        "azurelinuxagent.ga.exthandlers.ExtHandlerInstance.get_base_dir",
        lambda *_: self.tmp_dir)
    self.mock_get_base_dir.start()

    # Logs go to a "log" directory below the temporary directory
    self.log_dir = os.path.join(self.tmp_dir, "log")
    self.mock_get_log_dir = patch(
        "azurelinuxagent.ga.exthandlers.ExtHandlerInstance.get_log_dir",
        lambda *_: self.log_dir)
    self.mock_get_log_dir.start()

    # Every sleep request is replaced by a 0.01 call to mock_sleep
    self.mock_sleep = patch("time.sleep", lambda *_: mock_sleep(0.01))
    self.mock_sleep.start()

    # Remember the current cgroups state, then run the tests with cgroups disabled
    configurator = CGroupConfigurator.get_instance()
    self.cgroups_enabled = configurator.enabled()
    configurator.disable()
def daemon(self, child_args=None):
    """
    Creates the handlers, runs the optional SCVMM and resource-disk handlers, provisions,
    initializes telemetry and the agent cgroups, optionally sets up RDMA and then loops
    on update_handler.run_latest() while self.running is set.

    :param child_args: passed through to update_handler.run_latest()
    """
    logger.info("Run daemon")

    self.protocol_util = get_protocol_util()
    self.scvmm_handler = get_scvmm_handler()
    self.resourcedisk_handler = get_resourcedisk_handler()
    self.rdma_handler = get_rdma_handler()
    self.provision_handler = get_provision_handler()
    self.update_handler = get_update_handler()

    if conf.get_detect_scvmm_env():
        self.scvmm_handler.run()

    if conf.get_resourcedisk_format():
        self.resourcedisk_handler.run()

    # Always redetermine the protocol start (e.g., wireserver vs.
    # on-premise) since a VHD can move between environments
    self.protocol_util.clear_protocol()

    self.provision_handler.run()

    # Once we have the protocol, complete initialization of the telemetry fields
    # that require the goal state and IMDS
    self._initialize_telemetry()

    # Initialize the agent cgroup
    cgroups = CGroupConfigurator.get_instance()
    cgroups.create_agent_cgroups(track_cgroups=False)

    # Enable RDMA, continue in errors
    if not conf.enable_rdma():
        logger.info("RDMA capabilities are not enabled, skipping")
    else:
        rdma_version = self.rdma_handler.get_rdma_version()
        self.rdma_handler.install_driver_if_needed()

        logger.info("RDMA capabilities are enabled in configuration")
        try:
            # Ensure the most recent SharedConfig is available
            # - Changes to RDMA state may not increment the goal state
            #   incarnation number. A forced update ensures the most
            #   current values.
            protocol = self.protocol_util.get_protocol()
            if type(protocol) is not WireProtocol:
                raise Exception("Attempt to setup RDMA without Wireserver")

            protocol.client.update_goal_state(forced=True)
            shared_conf = protocol.client.get_shared_conf()
            setup_rdma_device(rdma_version, shared_conf)
        except Exception as e:
            logger.error("Error setting up rdma device: %s" % e)

    self.sleep_if_disabled()

    while self.running:
        self.update_handler.run_latest(child_args=child_args)
def test_enable_should_raise_CGroupsException_when_cgroups_are_not_supported(self):
    # With is_cgroups_supported reporting False, enable() must raise a CGroupsException
    # whose text mentions that cgroups are not supported.
    supported_target = "azurelinuxagent.common.osutil.default.DefaultOSUtil.is_cgroups_supported"

    with patch(supported_target, return_value=False):
        configurator = CGroupConfigurator.get_instance()
        with self.assertRaises(CGroupsException) as context_manager:
            configurator.enable()

        error_text = str(context_manager.exception)
        self.assertIn("cgroups are not supported", error_text)
def tearDown(self):
    # Restores the cgroups state recorded in self.cgroups_enabled and stops the patchers
    # started by the test fixture before delegating to AgentTestCase.tearDown.
    configurator = CGroupConfigurator.get_instance()
    if self.cgroups_enabled:
        configurator.enable()
    else:
        configurator.disable()

    self.mock_get_log_dir.stop()
    self.mock_get_base_dir.stop()

    # The setUp that defines cgroups_enabled/mock_get_log_dir/mock_get_base_dir also starts a
    # time.sleep patcher as self.mock_sleep; without stopping it the patched time.sleep would
    # leak into tests that run afterwards. The lookup is guarded in case the fixture that
    # pairs with this tearDown does not create that patcher.
    sleep_patcher = getattr(self, "mock_sleep", None)
    if sleep_patcher is not None:
        sleep_patcher.stop()

    AgentTestCase.tearDown(self)
def test_cleanup_legacy_cgroups_should_disable_cgroups_when_the_daemon_was_added_to_the_legacy_cgroup_on_systemd(self, _):
    # When the daemon's PID is already present in the legacy cgroups, cleanup_legacy_cgroups()
    # is expected to emit one failed 'CGroupsCleanUp' event, leave cgroups disabled and clear
    # the tracked cgroups. The second positional argument is not used by the test body.

    # Set up a mock /var/run/waagent.pid file
    daemon_pid = "42"
    daemon_pid_file = os.path.join(self.tmp_dir, "waagent.pid")
    fileutil.write_file(daemon_pid_file, daemon_pid + "\n")

    # Set up old controller cgroups and add the daemon PID to them
    CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "cpu", daemon_pid)
    CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "memory", daemon_pid)

    # Start tracking a couple of dummy cgroups
    CGroupsTelemetry.track_cgroup(CGroup("dummy", "/sys/fs/cgroup/memory/system.slice/dummy.service", "cpu"))
    CGroupsTelemetry.track_cgroup(CGroup("dummy", "/sys/fs/cgroup/memory/system.slice/dummy.service", "memory"))

    cgroup_configurator = CGroupConfigurator.get_instance()

    with patch("azurelinuxagent.common.cgroupconfigurator.add_event") as mock_add_event:
        with patch("azurelinuxagent.common.cgroupapi.get_agent_pid_file_path", return_value=daemon_pid_file):
            cgroup_configurator.cleanup_legacy_cgroups()

    # assertEqual is used rather than the assertEquals alias, which is deprecated and no
    # longer exists in Python 3.12+
    self.assertEqual(len(mock_add_event.call_args_list), 1)
    _, kwargs = mock_add_event.call_args_list[0]
    self.assertEqual(kwargs['op'], 'CGroupsCleanUp')
    self.assertFalse(kwargs['is_success'])
    self.assertEqual(
        kwargs['message'],
        ("Failed to process legacy cgroups. Collection of resource usage data will be disabled. "
         "[CGroupsException] The daemon's PID ({0}) was already added to the legacy cgroup; "
         "this invalidates resource usage data.").format(daemon_pid))

    self.assertFalse(cgroup_configurator.enabled())
    self.assertEqual(len(CGroupsTelemetry._tracked), 0)
def test_cgroup_operations_should_log_a_warning_when_the_cgroup_api_raises_an_exception(self):
    # Each configurator operation is run while the corresponding FileSystemCgroupsApi method
    # is replaced by a function that raises; the configurator must log exactly one warning
    # containing the exception text.
    configurator = CGroupConfigurator.get_instance()

    # cleanup_legacy_cgroups disables cgroups on error, so make disable() a no-op
    with patch.object(configurator, "disable"):
        # List of operations to test, and the functions to mock in order to raise exceptions
        operations = [
            [lambda: configurator.create_agent_cgroups(track_cgroups=False),
             "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.create_agent_cgroups"],
            [lambda: configurator.cleanup_legacy_cgroups(),
             "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.cleanup_legacy_cgroups"],
            [lambda: configurator.create_extension_cgroups_root(),
             "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.create_extension_cgroups_root"],
            [lambda: configurator.create_extension_cgroups("A.B.C-1.0.0"),
             "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.create_extension_cgroups"],
            [lambda: configurator.remove_extension_cgroups("A.B.C-1.0.0"),
             "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.remove_extension_cgroups"],
        ]

        def raise_exception(*_):
            raise Exception("A TEST EXCEPTION")

        for operation, api_target in operations:
            with patch("azurelinuxagent.common.cgroupconfigurator.logger.warn") as mock_logger_warn:
                with patch(api_target, raise_exception):
                    operation()

                # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12)
                self.assertEqual(mock_logger_warn.call_count, 1)

                # Only the positional arguments of the warning are inspected
                args, _ = mock_logger_warn.call_args
                message = args[0]
                self.assertIn("A TEST EXCEPTION", message)
def __init__(self):
    """
    Sets up the handler's initial state and its list of periodic operations; the
    resource-usage polling operation is included only when cgroups are enabled.
    """
    self.osutil = get_osutil()
    self.imds_client = None
    self.event_thread = None

    operations = [
        ResetPeriodicLogMessagesOperation(),
        ReportNetworkErrorsOperation(),
        PeriodicOperation(
            "send_host_plugin_heartbeat",
            self.send_host_plugin_heartbeat,
            self.HOST_PLUGIN_HEARTBEAT_PERIOD),
        PeriodicOperation(
            "send_imds_heartbeat",
            self.send_imds_heartbeat,
            self.IMDS_HEARTBEAT_PERIOD),
        ReportNetworkConfigurationChangesOperation(),
    ]
    cgroups = CGroupConfigurator.get_instance()
    if cgroups.enabled():
        operations.append(PollResourceUsageOperation())
    self._periodic_operations = operations

    self.protocol = None
    self.protocol_util = None
    self.health_service = None

    self.should_run = True
    # Upper-cased random UUID identifying this handler's heartbeats
    self.heartbeat_id = str(uuid.uuid4()).upper()

    host_plugin_period = MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD
    self.host_plugin_errorstate = ErrorState(min_timedelta=host_plugin_period)
    imds_period = MonitorHandler.IMDS_HEALTH_PERIOD
    self.imds_errorstate = ErrorState(min_timedelta=imds_period)
def _get_new_cgroup_configurator_instance(initialize=True):
    """
    Discards the current CGroupConfigurator singleton and returns a new instance.

    :param initialize: when true, initialize() is called on the new instance while
                       mock_cgroup_commands() is active
    :return: the new configurator instance
    """
    CGroupConfigurator._instance = None
    fresh_instance = CGroupConfigurator.get_instance()

    if not initialize:
        return fresh_instance

    with mock_cgroup_commands():
        fresh_instance.initialize()
    return fresh_instance
def is_log_collection_allowed():
    """
    Evaluates whether periodic log collection may run, logs the outcome and emits a
    LogCollection event carrying the same message.

    :return: the combined result of the three conditions described below
    """
    # There are three conditions that need to be met in order to allow periodic log collection:
    # 1) It should be enabled in the configuration.
    # 2) The system must be using cgroups to manage services. Needed for resource limiting of the log collection.
    # 3) The python version must be greater than 2.6 in order to support the ZipFile library used when collecting.
    conf_enabled = conf.get_collect_logs()
    cgroups_enabled = CGroupConfigurator.get_instance().enabled()
    if PY_VERSION_MAJOR == 2:
        supported_python = PY_VERSION_MINOR >= 7
    else:
        supported_python = PY_VERSION_MAJOR == 3

    is_allowed = conf_enabled and cgroups_enabled and supported_python

    template = ("Checking if log collection is allowed at this time [{0}]. All three conditions must be met: "
                "configuration enabled [{1}], cgroups enabled [{2}], python supported: [{3}]")
    msg = template.format(is_allowed, conf_enabled, cgroups_enabled, supported_python)

    logger.info(msg)
    # The event carries the same text; log_event=False is passed through to add_event
    add_event(name=AGENT_NAME,
              version=CURRENT_VERSION,
              op=WALAEventOperation.LogCollection,
              is_success=is_allowed,
              message=msg,
              log_event=False)

    return is_allowed
def test_cgroup_operations_should_not_invoke_the_cgroup_api_when_cgroups_are_not_enabled(self):
    # With cgroups disabled, none of the configurator operations may reach the
    # corresponding FileSystemCgroupsApi method.
    configurator = CGroupConfigurator.get_instance()
    configurator.disable()

    # List of operations to test, and the functions to mock used in order to do verifications
    operations = [
        (lambda: configurator.create_agent_cgroups(track_cgroups=False),
         "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.create_agent_cgroups"),
        (lambda: configurator.cleanup_legacy_cgroups(),
         "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.cleanup_legacy_cgroups"),
        (lambda: configurator.create_extension_cgroups_root(),
         "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.create_extension_cgroups_root"),
        (lambda: configurator.create_extension_cgroups("A.B.C-1.0.0"),
         "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.create_extension_cgroups"),
        (lambda: configurator.remove_extension_cgroups("A.B.C-1.0.0"),
         "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.remove_extension_cgroups"),
    ]

    for invoke, api_target in operations:
        with patch(api_target) as mock_cgroup_api_operation:
            invoke()

            self.assertEqual(mock_cgroup_api_operation.call_count, 0)
def test_start_extension_command_should_forward_to_subprocess_popen_when_groups_are_not_enabled(self):
    # With cgroups disabled, start_extension_command() must not go through either cgroups API
    # and must call handle_process_completion exactly once.
    configurator = CGroupConfigurator.get_instance()
    configurator.disable()

    fs_target = "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.start_extension_command"
    systemd_target = "azurelinuxagent.common.cgroupapi.SystemdCgroupsApi.start_extension_command"
    completion_target = "azurelinuxagent.common.cgroupconfigurator.handle_process_completion"

    with patch(fs_target) as mock_fs:
        with patch(systemd_target) as mock_systemd:
            with patch(completion_target) as mock_popen:
                configurator.start_extension_command(
                    extension_name="Microsoft.Compute.TestExtension-1.2.3",
                    command="date",
                    timeout=300,
                    shell=False,
                    cwd=self.tmp_dir,
                    env={},
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE)

                self.assertEqual(mock_popen.call_count, 1)
                # Neither cgroups API may have been used
                self.assertEqual(mock_fs.call_count, 0)
                self.assertEqual(mock_systemd.call_count, 0)
def test_enable_and_disable_should_change_the_enabled_state_of_cgroups(self):
    # The configurator starts enabled; disable() and enable() must each flip enabled().
    configurator = CGroupConfigurator.get_instance()

    self.assertTrue(configurator.enabled())

    transitions = (
        (configurator.disable, self.assertFalse),
        (configurator.enable, self.assertTrue),
    )
    for toggle, verify in transitions:
        toggle()
        verify(configurator.enabled())
def are_cgroups_enabled():
    """
    Reports whether the CGroupConfigurator singleton has cgroups enabled.

    :return: the value returned by CGroupConfigurator.get_instance().enabled()
    """
    # We use a function decorator to check if cgroups are enabled in multiple tests, which at some point calls
    # get_osutil. The global mock for that function doesn't get executed before the function decorators are imported,
    # so we need to specifically mock it beforehand.
    mock__get_osutil = patch("azurelinuxagent.common.osutil.factory._get_osutil", mock_get_osutil)
    mock__get_osutil.start()
    try:
        # enabled is a method: it must be called. Returning the bound method itself would
        # always evaluate as true, regardless of the actual cgroups state.
        return CGroupConfigurator.get_instance().enabled()
    finally:
        # Always remove the temporary patch, even if the configurator raises
        mock__get_osutil.stop()
def test_cleanup_legacy_cgroups_should_disable_cgroups_when_it_fails_to_process_legacy_cgroups(self):
    # When appending to a cpu cgroup.procs file fails with ENOSPC, cleanup_legacy_cgroups()
    # is expected to emit one failed 'CGroupsCleanUp' event, leave cgroups disabled and clear
    # the tracked cgroups.

    # Set up a mock /var/run/waagent.pid file
    daemon_pid = "42"
    daemon_pid_file = os.path.join(self.tmp_dir, "waagent.pid")
    fileutil.write_file(daemon_pid_file, daemon_pid + "\n")

    # Set up old controller cgroups and add the daemon PID to them
    CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "cpu", daemon_pid)
    CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "memory", daemon_pid)

    # Set up new controller cgroups and add extension handler's PID to them
    CGroupsTools.create_agent_cgroup(self.cgroups_file_system_root, "cpu", "999")
    CGroupsTools.create_agent_cgroup(self.cgroups_file_system_root, "memory", "999")

    def mock_append_file(filepath, contents, **kwargs):
        # Fail only for cgroup.procs files below a cpu controller; everything else is appended normally
        if re.match(r'/.*/cpu/.*/cgroup.procs', filepath):
            raise OSError(errno.ENOSPC, os.strerror(errno.ENOSPC))
        fileutil.append_file(filepath, contents, **kwargs)

    # Start tracking a couple of dummy cgroups
    CGroupsTelemetry.track_cgroup(CGroup("dummy", "/sys/fs/cgroup/memory/system.slice/dummy.service", "cpu"))
    CGroupsTelemetry.track_cgroup(CGroup("dummy", "/sys/fs/cgroup/memory/system.slice/dummy.service", "memory"))

    cgroup_configurator = CGroupConfigurator.get_instance()

    with patch("azurelinuxagent.common.cgroupconfigurator.add_event") as mock_add_event:
        with patch("azurelinuxagent.common.cgroupapi.get_agent_pid_file_path", return_value=daemon_pid_file):
            with patch("azurelinuxagent.common.cgroupapi.fileutil.append_file", side_effect=mock_append_file):
                cgroup_configurator.cleanup_legacy_cgroups()

    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12)
    self.assertEqual(len(mock_add_event.call_args_list), 1)
    _, kwargs = mock_add_event.call_args_list[0]
    self.assertEqual(kwargs['op'], 'CGroupsCleanUp')
    self.assertFalse(kwargs['is_success'])
    self.assertEqual(
        kwargs['message'],
        'Failed to process legacy cgroups. Collection of resource usage data will be disabled. '
        '[Errno 28] No space left on device')

    self.assertFalse(cgroup_configurator.enabled())
    self.assertEqual(len(CGroupsTelemetry._tracked), 0)
def test_disable_should_reset_tracked_cgroups(self):
    # disable() must leave the configurator disabled and CGroupsTelemetry without tracked cgroups.
    configurator = CGroupConfigurator.get_instance()

    # Start tracking a couple of dummy cgroups
    CGroupsTelemetry.track_cgroup(CGroup("dummy", "/sys/fs/cgroup/memory/system.slice/dummy.service", "cpu"))
    CGroupsTelemetry.track_cgroup(CGroup("dummy", "/sys/fs/cgroup/memory/system.slice/dummy.service", "memory"))

    configurator.disable()

    self.assertFalse(configurator.enabled())
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12)
    self.assertEqual(len(CGroupsTelemetry._tracked), 0)
def test_start_extension_command_should_start_tracking_the_extension_cgroups(self):
    # After an extension command is started, both the cpu and the memory cgroup of the
    # extension must be reported as tracked by CGroupsTelemetry.
    configurator = CGroupConfigurator.get_instance()
    configurator.start_extension_command(
        extension_name="Microsoft.Compute.TestExtension-1.2.3",
        command="date",
        shell=False,
        cwd=self.tmp_dir,
        env={},
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)

    # Same relative cgroup path under each controller
    extension_cgroup = "walinuxagent.extensions/Microsoft.Compute.TestExtension_1.2.3"
    for controller in ("cpu", "memory"):
        cgroup_path = os.path.join(self.cgroups_file_system_root, controller, extension_cgroup)
        self.assertTrue(CGroupsTelemetry.is_tracked(cgroup_path))
def test_start_extension_command_should_forward_to_cgroups_api_when_groups_are_enabled(self):
    # start_extension_command() must call FileSystemCgroupsApi.start_extension_command exactly once.
    configurator = CGroupConfigurator.get_instance()
    api_target = "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.start_extension_command"

    with patch(api_target, return_value=[[], None]) as mock_start_extension_command:
        command_arguments = dict(
            extension_name="Microsoft.Compute.TestExtension-1.2.3",
            command="date",
            timeout=300,
            shell=False,
            cwd=self.tmp_dir,
            env={},
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        configurator.start_extension_command(**command_arguments)

        self.assertEqual(mock_start_extension_command.call_count, 1)
def test_start_extension_command_should_raise_an_exception_when_the_command_cannot_be_started(self):
    # An exception raised by FileSystemCgroupsApi.start_extension_command must surface to the
    # caller of the configurator with its original text.
    configurator = CGroupConfigurator.get_instance()
    api_target = "azurelinuxagent.common.cgroupapi.FileSystemCgroupsApi.start_extension_command"

    def raise_exception(*_, **__):
        raise Exception("A TEST EXCEPTION")

    with patch(api_target, raise_exception):
        with self.assertRaises(Exception) as context_manager:
            configurator.start_extension_command(
                extension_name="Microsoft.Compute.TestExtension-1.2.3",
                command="date",
                timeout=300,
                shell=False,
                cwd=self.tmp_dir,
                env={},
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)

        error_text = str(context_manager.exception)
        self.assertIn("A TEST EXCEPTION", error_text)
def _get_cgroup_configurator(self, initialize=True, enable=True, mock_commands=None):
    """
    Generator that replaces the CGroupConfigurator singleton, resets CGroupsTelemetry and
    yields the new configurator while a mock cgroup environment for self.tmp_dir is active.
    (Presumably used through a context-manager decorator that is outside this view.)

    :param initialize: when true, initialize() is called on the configurator
    :param enable: when false, the configurator's enable() is patched out during initialize()
    :param mock_commands: optional commands added to the mock environment
    """
    CGroupConfigurator._instance = None
    configurator = CGroupConfigurator.get_instance()
    CGroupsTelemetry.reset()

    with mock_cgroup_environment(self.tmp_dir) as mock_environment:
        for command in (mock_commands if mock_commands is not None else ()):
            mock_environment.add_command(command)

        # Expose the mock environment to the caller through the configurator
        configurator.mocks = mock_environment

        if initialize and enable:
            configurator.initialize()
        elif initialize:
            with patch.object(configurator, "enable"):
                configurator.initialize()

        yield configurator
def _create_collect_logs_handler(iterations=1, cgroups_enabled=True, collect_logs_conf=True):
    """
    Yields a CollectLogsHandler that
        * uses a mock_wire_protocol for network requests,
        * reports 'stopped' as False for 'iterations' checks and True afterwards, and
        * runs with time.sleep patched out

    The configurator's enabled() and conf.get_collect_logs() are patched to return
    'cgroups_enabled' and 'collect_logs_conf' respectively.

    The yielded handler is augmented with 2 methods:
        * get_mock_wire_protocol() - returns the mock protocol
        * run_and_wait() - invokes run() and join() on the handler
    """
    with mock_wire_protocol(DATA_FILE) as protocol:
        protocol_util = MagicMock()
        protocol_util.get_protocol = Mock(return_value=protocol)

        # 'stopped' answers False once per requested iteration, then True
        stopped_answers = [False] * iterations + [True]

        with patch("azurelinuxagent.ga.collect_logs.get_protocol_util", return_value=protocol_util):
            with patch("azurelinuxagent.ga.collect_logs.CollectLogsHandler.stopped", side_effect=stopped_answers):
                with patch("time.sleep"):
                    # Grab the singleton to patch it
                    cgroups_configurator_singleton = CGroupConfigurator.get_instance()
                    with patch.object(cgroups_configurator_singleton, "enabled", return_value=cgroups_enabled):
                        with patch("azurelinuxagent.ga.collect_logs.conf.get_collect_logs",
                                   return_value=collect_logs_conf):
                            handler = get_collect_logs_handler()

                            def run_and_wait():
                                handler.run()
                                handler.join()

                            handler.get_mock_wire_protocol = lambda: protocol
                            handler.run_and_wait = run_and_wait
                            yield handler
def _operation_impl(self):
    """
    Checks the processes in the agent's cgroup, reports a limited number of changes in the
    outcome of that check, and then reports the metrics of all tracked cgroups.
    """
    #
    # Check the processes in the agent cgroup
    #
    processes_check_error = None
    try:
        processes = CGroupConfigurator.get_instance().get_processes_in_agent_cgroup()

        if processes is not None:
            unexpected_processes = sorted(
                command_line
                for (_, command_line) in processes
                if not CGroupConfigurator.is_agent_process(command_line))

            if unexpected_processes:
                processes_check_error = "The agent's cgroup includes unexpected processes: {0}".format(
                    ustr(unexpected_processes))
    except Exception as e:
        processes_check_error = "Failed to check the processes in the agent's cgroup: {0}".format(ustr(e))

    # Report a small sample of errors
    outcome_changed = processes_check_error != self._last_error
    if outcome_changed and self._error_count < 5:
        self._error_count += 1
        self._last_error = processes_check_error
        logger.info(processes_check_error)
        add_event(op=WALAEventOperation.CGroupsDebug, message=processes_check_error)

    #
    # Report metrics
    #
    for sample in CGroupsTelemetry.poll_all_tracked():
        report_metric(sample.category, sample.counter, sample.instance, sample.value)
def setUpClass(cls):
    # Runs the base class set-up, then replaces the CGroupConfigurator singleton with a
    # new instance initialized while mock_cgroup_commands() is active.
    AgentTestCase.setUpClass()

    # ensure cgroups are enabled by forcing a new instance
    CGroupConfigurator._instance = None
    with mock_cgroup_commands():
        new_configurator = CGroupConfigurator.get_instance()
        new_configurator.initialize()
def _ensure_cgroups_initialized(self):
    """
    Creates (and tracks) the agent cgroups, cleans up the legacy cgroups and creates the
    root of the extension cgroups, in that order.
    """
    cgroups = CGroupConfigurator.get_instance()

    cgroups.create_agent_cgroups(track_cgroups=True)
    cgroups.cleanup_legacy_cgroups()
    cgroups.create_extension_cgroups_root()
def test_init_should_disable_cgroups_when_they_are_not_supported(self):
    # With is_cgroups_supported reporting False, the configurator instance must be disabled.
    supported_target = "azurelinuxagent.common.osutil.default.DefaultOSUtil.is_cgroups_supported"

    with patch(supported_target, return_value=False):
        configurator = CGroupConfigurator.get_instance()
        self.assertFalse(configurator.enabled())
def make_new_cgroup(name="test-cgroup"):
    """
    Creates the extension cgroups for 'name' through the CGroupConfigurator singleton.

    :param name: name passed to create_extension_cgroups (defaults to "test-cgroup")
    :return: whatever create_extension_cgroups returns
    """
    configurator = CGroupConfigurator.get_instance()
    new_cgroups = configurator.create_extension_cgroups(name)
    return new_cgroups
def setUpClass(cls):
    # Obtain the CGroupConfigurator singleton (the returned instance itself is not needed
    # here) before running the set-up of the class that follows AgentTestCase in the MRO.
    CGroupConfigurator.get_instance()

    parent = super(AgentTestCase, cls)
    parent.setUpClass()