def _collect_logs():
    logger.info("Starting log collection...")

    # Invoke the command line tool in the agent to collect logs, with resource
    # limits on CPU and memory (RAM).
    scope_name = "collect-logs-{0}.scope".format(ustr(int(time.time() * 1000000)))
    systemd_cmd = ["systemd-run", "--unit={0}".format(scope_name), "--scope"]

    # More info on resource limits properties in systemd here:
    # https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/resource_management_guide/sec-modifying_control_groups
    cpu_limit, memory_limit = CollectLogsHandler._get_resource_limits()
    resource_limits = [
        "--property=CPUAccounting=1",
        "--property=CPUQuota={0}".format(cpu_limit),
        "--property=MemoryAccounting=1",
        "--property=MemoryLimit={0}".format(memory_limit)
    ]

    # The log tool is invoked from the current agent's egg with the command line option
    collect_logs_cmd = [sys.executable, "-u", sys.argv[0], "-collect-logs"]
    final_command = systemd_cmd + resource_limits + collect_logs_cmd

    start_time = datetime.datetime.utcnow()
    success = False
    msg = None
    try:
        shellutil.run_command(final_command, log_error=True)
        duration = elapsed_milliseconds(start_time)
        archive_size = os.path.getsize(COMPRESSED_ARCHIVE_PATH)

        msg = "Successfully collected logs. Archive size: {0} b, elapsed time: {1} ms.".format(archive_size, duration)
        logger.info(msg)
        success = True

        return True
    except Exception as e:  # pylint: disable=C0103
        duration = elapsed_milliseconds(start_time)

        if isinstance(e, CommandError):
            exception_message = "[stderr] {0}".format(e.stderr)  # pylint: disable=no-member
        else:
            exception_message = ustr(e)

        msg = "Failed to collect logs. Elapsed time: {0} ms. Error: {1}".format(duration, exception_message)
        # No need to log to the local log since we ran run_command with logging errors as enabled
        return False
    finally:
        add_event(
            name=AGENT_NAME,
            version=CURRENT_VERSION,
            op=WALAEventOperation.LogCollection,
            is_success=success,
            message=msg,
            log_event=False)

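# For illustration only: assuming _get_resource_limits() returned, say, ("20%", "1G")
# (the actual values are whatever CollectLogsHandler provides), the assembled
# final_command above would resemble:
#
#   systemd-run --unit=collect-logs-1600000000000000.scope --scope \
#       --property=CPUAccounting=1 --property=CPUQuota=20% \
#       --property=MemoryAccounting=1 --property=MemoryLimit=1G \
#       <python> -u <agent-script> -collect-logs
#
# systemd runs the command in a transient scope unit with its own cgroup, so the
# CPU and memory limits apply to the log collector process but not to the agent.
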
def test_save_event_message_with_non_ascii_characters(self):
    test_data_dir = os.path.join(data_dir, "events", "collect_and_send_extension_stdout_stderror")
    msg = ""

    with open(os.path.join(test_data_dir, "dummy_stdout_with_non_ascii_characters"), mode="r+b") as stdout:
        with open(os.path.join(test_data_dir, "dummy_stderr_with_non_ascii_characters"), mode="r+b") as stderr:
            msg = read_output(stdout, stderr)

    duration = elapsed_milliseconds(datetime.utcnow())
    log_msg = "{0}\n{1}".format("DummyCmd", "\n".join([line for line in msg.split('\n') if line != ""]))

    add_event('test_extension', message=log_msg, duration=duration)

    for tld_file in os.listdir(self.tmp_dir):
        event_str = MonitorHandler.collect_event(os.path.join(self.tmp_dir, tld_file))
        event_json = json.loads(event_str)

        self.assertEqual(len(event_json["parameters"]), 8)

        for i in event_json["parameters"]:
            if i["name"] == "Name":
                self.assertEqual(i["value"], "test_extension")
            if i["name"] == "Message":
                self.assertEqual(i["value"], log_msg)

def launch_command(self, cmd, timeout=300):
    begin_utc = datetime.datetime.utcnow()
    self.logger.verbose("Launch command: [{0}]", cmd)
    base_dir = self.get_base_dir()

    try:
        # This should be .run(), but due to the wide variety
        # of Python versions we must support we must use .communicate().
        # Some extensions erroneously begin cmd with a slash; don't interpret those
        # as root-relative. (Issue #1170)
        full_path = os.path.join(base_dir, cmd.lstrip(os.sep))
        process = subprocess.Popen(full_path,
                                   shell=True,
                                   cwd=base_dir,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   env=os.environ,
                                   preexec_fn=os.setsid)
    except OSError as e:
        raise ExtensionError("Failed to launch '{0}': {1}".format(full_path, e.strerror))

    msg = capture_from_process(process, cmd, timeout)

    ret = process.poll()
    if ret is None or ret != 0:
        raise ExtensionError("Non-zero exit code: {0}, {1}\n{2}".format(ret, cmd, msg))

    duration = elapsed_milliseconds(begin_utc)
    self.report_event(message="{0}\n{1}".format(cmd, msg), duration=duration, log_event=False)

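# capture_from_process is called above but not defined in this section. The
# sketch below is an assumption, not the agent's actual helper: it matches the
# call site's signature and illustrates one plausible behavior -- poll until a
# deadline, kill the process group on timeout (the child called os.setsid, so
# it leads its own group), and return the combined output otherwise.
import os
import signal
import time


def _capture_from_process_sketch(process, cmd, timeout):
    # Hypothetical illustration only; names and behavior are assumptions.
    deadline = time.time() + timeout
    while process.poll() is None and time.time() < deadline:
        time.sleep(1)

    if process.poll() is None:
        # Kill the whole process group so children spawned by the shell do not survive.
        os.killpg(os.getpgid(process.pid), signal.SIGKILL)
        raise ExtensionError("Timeout({0}): {1}".format(timeout, cmd))

    stdout, stderr = process.communicate()
    return "[stdout]\n{0}\n\n[stderr]\n{1}".format(
        ustr(stdout, encoding='utf-8', errors='backslashreplace'),
        ustr(stderr, encoding='utf-8', errors='backslashreplace'))
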
def run(self):
    # If provision is enabled, run default provision handler
    if conf.get_provision_enabled():
        logger.warn("Provisioning flag is enabled, which overrides using "
                    "cloud-init; running the default provisioning code")
        super(CloudInitProvisionHandler, self).run()
        return

    try:
        if super(CloudInitProvisionHandler, self).is_provisioned():
            logger.info("Provisioning already completed, skipping.")
            return

        utc_start = datetime.utcnow()
        logger.info("Running CloudInit provisioning handler")
        self.wait_for_ovfenv()
        self.protocol_util.get_protocol()
        self.report_not_ready("Provisioning", "Starting")

        thumbprint = self.wait_for_ssh_host_key()
        self.write_provisioned()
        logger.info("Finished provisioning")

        self.report_ready(thumbprint)
        self.report_event("Provisioning with cloud-init succeeded ({0}s)".format(self._get_uptime_seconds()),
                          is_success=True,
                          duration=elapsed_milliseconds(utc_start))

    except ProvisionError as e:
        msg = "Provisioning with cloud-init failed: {0} ({1}s)".format(ustr(e), self._get_uptime_seconds())
        logger.error(msg)
        self.report_not_ready("ProvisioningFailed", ustr(e))
        self.report_event(msg)
        return

def run(self):
    try:
        if super(CloudInitProvisionHandler, self).check_provisioned_file():
            logger.info("Provisioning already completed, skipping.")
            return

        utc_start = datetime.utcnow()
        logger.info("Running CloudInit provisioning handler")
        self.wait_for_ovfenv()
        self.protocol_util.get_protocol()
        self.report_not_ready("Provisioning", "Starting")

        thumbprint = self.wait_for_ssh_host_key()  # pylint: disable=W0612
        self.write_provisioned()
        logger.info("Finished provisioning")

        self.report_ready()
        self.report_event("Provisioning with cloud-init succeeded ({0}s)".format(self._get_uptime_seconds()),
                          is_success=True,
                          duration=elapsed_milliseconds(utc_start))

    except ProvisionError as e:
        msg = "Provisioning with cloud-init failed: {0} ({1}s)".format(ustr(e), self._get_uptime_seconds())
        logger.error(msg)
        self.report_not_ready("ProvisioningFailed", ustr(e))
        self.report_event(msg)
        return

def launch_command(self, cmd, timeout=300):
    begin_utc = datetime.datetime.utcnow()
    self.logger.verbose("Launch command: [{0}]", cmd)
    base_dir = self.get_base_dir()
    try:
        devnull = open(os.devnull, 'w')
        child = subprocess.Popen(base_dir + "/" + cmd,
                                 shell=True,
                                 cwd=base_dir,
                                 stdout=devnull,
                                 env=os.environ)
    except Exception as e:  # TODO: do not catch all exceptions
        raise ExtensionError("Failed to launch: {0}, {1}".format(cmd, e))

    retry = timeout
    while retry > 0 and child.poll() is None:
        time.sleep(1)
        retry -= 1
    if retry == 0:
        os.kill(child.pid, 9)  # 9 == SIGKILL
        raise ExtensionError("Timeout({0}): {1}".format(timeout, cmd))

    ret = child.wait()
    if ret is None or ret != 0:
        raise ExtensionError("Non-zero exit code: {0}, {1}".format(ret, cmd))

    duration = elapsed_milliseconds(begin_utc)
    self.report_event(message="Launch command succeeded: {0}".format(cmd), duration=duration)

def launch_command(self, cmd, timeout=300):
    begin_utc = datetime.datetime.utcnow()
    self.logger.verbose("Launch command: [{0}]", cmd)
    base_dir = self.get_base_dir()

    def sanitize(s):
        return ustr(s, encoding='utf-8', errors='backslashreplace')

    try:
        # This should be .run(), but due to the wide variety
        # of Python versions we must support we must use .communicate().
        process = subprocess.Popen(os.path.join(base_dir, cmd),
                                   shell=True,
                                   cwd=base_dir,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   env=os.environ)
        # communicate's first positional parameter is input, so the timeout must
        # be passed by keyword. (The timeout parameter requires Python 3.3+.)
        stdout, stderr = process.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        process.kill()
        stdout, stderr = process.communicate()
        msg = format_stdout_stderr(sanitize(stdout), sanitize(stderr))
        raise ExtensionError("Timeout({0}): {1}\n{2}".format(timeout, cmd, msg))
    except Exception as e:
        process.kill()
        process.wait()
        raise ExtensionError("Failed to launch: {0}, {1}".format(cmd, e))

    ret = process.poll()
    if ret is None or ret != 0:
        raise ExtensionError("Non-zero exit code: {0}, {1}".format(ret, cmd))

    duration = elapsed_milliseconds(begin_utc)
    msg = format_stdout_stderr(sanitize(stdout), sanitize(stderr))
    self.report_event(message="{0}\n{1}".format(cmd, msg), duration=duration)

def test_save_event_message_with_non_ascii_characters(self):
    test_data_dir = os.path.join(data_dir, "events", "collect_and_send_extension_stdout_stderror")
    msg = ""

    with open(os.path.join(test_data_dir, "dummy_stdout_with_non_ascii_characters"), mode="r+b") as stdout:
        with open(os.path.join(test_data_dir, "dummy_stderr_with_non_ascii_characters"), mode="r+b") as stderr:
            msg = read_output(stdout, stderr)

    duration = elapsed_milliseconds(datetime.utcnow())
    log_msg = "{0}\n{1}".format("DummyCmd", "\n".join([line for line in msg.split('\n') if line != ""]))

    with patch("azurelinuxagent.common.event.datetime") as patch_datetime:
        patch_datetime.utcnow = Mock(return_value=datetime.strptime("2019-01-01 01:30:00", '%Y-%m-%d %H:%M:%S'))
        with patch('os.getpid', return_value=42):
            with patch("threading.Thread.getName", return_value="HelloWorldTask"):
                add_event('test_extension', message=log_msg, duration=duration)

    for tld_file in os.listdir(self.tmp_dir):
        event_str = MonitorHandler.collect_event(os.path.join(self.tmp_dir, tld_file))
        event_json = json.loads(event_str)

        self.assertEqual(len(event_json["parameters"]), 15)

        # Checking the contents passed above, and also validating the default values that were passed in.
        for i in event_json["parameters"]:
            if i["name"] == "Name":
                self.assertEqual(i["value"], "test_extension")
            elif i["name"] == "Message":
                self.assertEqual(i["value"], log_msg)
            elif i["name"] == "Version":
                self.assertEqual(i["value"], str(CURRENT_VERSION))
            elif i['name'] == 'IsInternal':
                self.assertEqual(i['value'], False)
            elif i['name'] == 'Operation':
                self.assertEqual(i['value'], 'Unknown')
            elif i['name'] == 'OperationSuccess':
                self.assertEqual(i['value'], True)
            elif i['name'] == 'Duration':
                self.assertEqual(i['value'], 0)
            elif i['name'] == 'ExtensionType':
                self.assertEqual(i['value'], '')
            elif i['name'] == 'ContainerId':
                self.assertEqual(i['value'], 'UNINITIALIZED')
            elif i['name'] == 'OpcodeName':
                self.assertEqual(i['value'], '2019-01-01 01:30:00')
            elif i['name'] == 'EventTid':
                self.assertEqual(i['value'], threading.current_thread().ident)
            elif i['name'] == 'EventPid':
                self.assertEqual(i['value'], 42)
            elif i['name'] == 'TaskName':
                self.assertEqual(i['value'], 'HelloWorldTask')
            elif i['name'] == 'KeywordName':
                self.assertEqual(i['value'], '')
            elif i['name'] == 'GAVersion':
                self.assertEqual(i['value'], str(CURRENT_AGENT))
            else:
                self.assertFalse(True, "Contains a field outside the defaults expected. Field Name: {0}".format(i['name']))

def exec_command(output_file):
    # Note: final_command and self are provided by the enclosing scope; this
    # function is intended to run as a nested helper.
    start_time = datetime.datetime.utcnow()
    success = False
    msg = None
    try:
        shellutil.run_command(final_command, log_error=False, stdout=output_file, stderr=output_file)
        duration = elapsed_milliseconds(start_time)
        archive_size = os.path.getsize(COMPRESSED_ARCHIVE_PATH)

        msg = "Successfully collected logs. Archive size: {0} b, elapsed time: {1} ms.".format(archive_size, duration)
        logger.info(msg)
        success = True

        return True
    except Exception as e:
        duration = elapsed_milliseconds(start_time)
        err_msg = ustr(e)

        if isinstance(e, CommandError):
            # pylint has limited (i.e. no) awareness of control flow w.r.t. typing. we disable=no-member
            # here because we know e must be a CommandError but pylint still considers the case where
            # e is a different type of exception.
            err_msg = ustr("Log Collector exited with code {0}").format(e.returncode)  # pylint: disable=no-member

            if e.returncode == logcollector.INVALID_CGROUPS_ERRCODE:  # pylint: disable=no-member
                logger.info("Disabling periodic log collection until service restart due to process error.")
                self.stop()
        else:
            logger.info(err_msg)

        msg = "Failed to collect logs. Elapsed time: {0} ms. Error: {1}".format(duration, err_msg)
        # No need to log to the local log since we logged stdout, stderr from the process.
        return False
    finally:
        add_event(
            name=AGENT_NAME,
            version=CURRENT_VERSION,
            op=WALAEventOperation.LogCollection,
            is_success=success,
            message=msg,
            log_event=False)

def run(self):
    if not conf.get_provision_enabled():
        logger.info("Provisioning is disabled, skipping.")
        self.write_provisioned()
        self.report_ready()
        return

    try:
        utc_start = datetime.utcnow()
        thumbprint = None

        if self.is_provisioned():
            logger.info("Provisioning already completed, skipping.")
            return

        logger.info("Running default provisioning handler")

        if not self.validate_cloud_init(is_expected=False):
            raise ProvisionError("cloud-init appears to be running, "
                                 "this is not expected, cannot continue")

        logger.info("Copying ovf-env.xml")
        ovf_env = self.protocol_util.copy_ovf_env()

        self.protocol_util.get_protocol(by_file=True)
        self.report_not_ready("Provisioning", "Starting")
        logger.info("Starting provisioning")

        self.provision(ovf_env)

        thumbprint = self.reg_ssh_host_key()
        self.osutil.restart_ssh_service()

        self.write_provisioned()

        self.report_event("Provisioning succeeded ({0}s)".format(self._get_uptime_seconds()),
                          is_success=True,
                          duration=elapsed_milliseconds(utc_start))

        self.report_event(message=ovf_env.provision_guest_agent,
                          is_success=True,
                          duration=0,
                          operation=WALAEventOperation.ProvisionGuestAgent)

        self.report_ready(thumbprint)
        logger.info("Provisioning complete")

    except (ProtocolError, ProvisionError) as e:
        self.report_not_ready("ProvisioningFailed", ustr(e))
        self.report_event(ustr(e))
        logger.error("Provisioning failed: {0}", ustr(e))
        return

def run(self):
    if not conf.get_provision_enabled():
        logger.info("Provisioning is disabled, skipping.")
        self.write_provisioned()
        self.report_ready()
        return

    try:
        utc_start = datetime.utcnow()
        thumbprint = None  # pylint: disable=W0612

        if self.check_provisioned_file():
            logger.info("Provisioning already completed, skipping.")
            return

        logger.info("Running default provisioning handler")

        if cloud_init_is_enabled():
            raise ProvisionError("cloud-init appears to be installed and enabled, "
                                 "this is not expected, cannot continue")

        logger.info("Copying ovf-env.xml")
        ovf_env = self.protocol_util.copy_ovf_env()

        self.protocol_util.get_protocol()  # Trigger protocol detection
        self.report_not_ready("Provisioning", "Starting")
        logger.info("Starting provisioning")

        self.provision(ovf_env)

        thumbprint = self.reg_ssh_host_key()
        self.osutil.restart_ssh_service()

        self.write_provisioned()

        self.report_event("Provisioning succeeded ({0}s)".format(self._get_uptime_seconds()),
                          is_success=True,
                          duration=elapsed_milliseconds(utc_start))

        self.handle_provision_guest_agent(ovf_env.provision_guest_agent)

        self.report_ready()
        logger.info("Provisioning complete")

    except (ProtocolError, ProvisionError) as e:
        msg = "Provisioning failed: {0} ({1}s)".format(ustr(e), self._get_uptime_seconds())
        logger.error(msg)
        self.report_not_ready("ProvisioningFailed", ustr(e))
        self.report_event(msg, is_success=False)
        return

def launch_command(self, cmd, timeout=300, extension_error_code=1000, env=None):
    begin_utc = datetime.datetime.utcnow()
    self.logger.verbose("Launch command: [{0}]", cmd)
    base_dir = self.get_base_dir()

    if env is None:
        env = {}
    env.update(os.environ)

    try:
        # This should be .run(), but due to the wide variety
        # of Python versions we must support we must use .communicate().
        # Some extensions erroneously begin cmd with a slash; don't interpret those
        # as root-relative. (Issue #1170)
        full_path = os.path.join(base_dir, cmd.lstrip(os.path.sep))

        def pre_exec_function():
            """
            Change process state before the actual target process is started. Effectively, this runs between
            the fork() and the exec() of sub-process creation.
            :return:
            """
            os.setsid()
            CGroups.add_to_extension_cgroup(self.ext_handler.name)

        process = subprocess.Popen(full_path,
                                   shell=True,
                                   cwd=base_dir,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   env=env,
                                   preexec_fn=pre_exec_function)
    except OSError as e:
        raise ExtensionError("Failed to launch '{0}': {1}".format(full_path, e.strerror),
                             code=extension_error_code)

    cg = CGroups.for_extension(self.ext_handler.name)
    CGroupsTelemetry.track_extension(self.ext_handler.name, cg)

    msg = capture_from_process(process, cmd, timeout, extension_error_code)

    ret = process.poll()
    if ret is None:
        raise ExtensionError("Process {0} was not terminated: {1}\n{2}".format(process.pid, cmd, msg),
                             code=extension_error_code)
    if ret != 0:
        raise ExtensionError("Non-zero exit code: {0}, {1}\n{2}".format(ret, cmd, msg),
                             code=extension_error_code)

    duration = elapsed_milliseconds(begin_utc)
    log_msg = "{0}\n{1}".format(cmd, "\n".join([line for line in msg.split('\n') if line != ""]))
    self.logger.verbose(log_msg)
    self.report_event(message=log_msg, duration=duration, log_event=False)

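# A cautionary note on the pattern above (general CPython behavior, not specific
# to this code): the subprocess documentation warns that preexec_fn runs in the
# child between fork() and exec() and is not safe to use in the presence of
# threads, where the child can deadlock before exec is called. Keeping
# pre_exec_function minimal, as done here (os.setsid() plus the cgroup
# assignment), limits that risk.
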
def run(self):
    # If provisioning is not enabled, skip it and return
    if not conf.get_provision_enabled():
        logger.info("Provisioning is disabled, skipping.")
        return

    try:
        utc_start = datetime.utcnow()
        thumbprint = None

        # If provisioning is already done, return
        if self.is_provisioned():
            logger.info("Provisioning already completed, skipping.")
            return

        logger.info("Running default provisioning handler")

        if not self.validate_cloud_init(is_expected=False):
            raise ProvisionError("cloud-init appears to be running, "
                                 "this is not expected, cannot continue")

        logger.info("Copying ovf-env.xml")
        ovf_env = self.protocol_util.copy_ovf_env()

        self.protocol_util.get_protocol_by_file()
        self.report_not_ready("Provisioning", "Starting")
        logger.info("Starting provisioning")

        self.provision(ovf_env)

        thumbprint = self.reg_ssh_host_key()
        self.osutil.restart_ssh_service()

        # Write out the provisioned file and report Ready
        self.write_provisioned()
        self.report_event("Provision succeeded", is_success=True, duration=elapsed_milliseconds(utc_start))
        self.report_ready(thumbprint)
        logger.info("Provisioning complete")

    except (ProtocolError, ProvisionError) as e:
        self.report_not_ready("ProvisioningFailed", ustr(e))
        self.report_event(ustr(e))
        logger.error("Provisioning failed: {0}", ustr(e))
        return

def run(self):
    # If provision is enabled, run default provision handler
    if conf.get_provision_enabled():
        logger.warn("Provisioning flag is enabled, which overrides using "
                    "cloud-init; running the default provisioning code")
        super(CloudInitProvisionHandler, self).run()
        return

    try:
        if super(CloudInitProvisionHandler, self).is_provisioned():
            logger.info("Provisioning already completed, skipping.")
            return

        utc_start = datetime.utcnow()
        logger.info("Running CloudInit provisioning handler")
        self.wait_for_ovfenv()
        self.protocol_util.get_protocol()
        self.report_not_ready("Provisioning", "Starting")

        thumbprint = self.wait_for_ssh_host_key()
        self.write_provisioned()
        logger.info("Finished provisioning")

        self.report_ready(thumbprint)
        self.report_event("Provision succeeded", is_success=True, duration=elapsed_milliseconds(utc_start))
        self.report_event(self.create_guest_state_telemetry_messsage(),
                          is_success=True,
                          operation=WALAEventOperation.GuestState)

    except ProvisionError as e:
        logger.error("Provisioning failed: {0}", ustr(e))
        self.report_not_ready("ProvisioningFailed", ustr(e))
        self.report_event(ustr(e))
        return

def download(self):
    begin_utc = datetime.datetime.utcnow()
    self.logger.verbose("Download extension package")
    self.set_operation(WALAEventOperation.Download)
    if self.pkg is None:
        raise ExtensionError("No package uri found")

    package = None
    for uri in self.pkg.uris:
        try:
            package = self.protocol.download_ext_handler_pkg(uri.uri)
            if package is not None:
                break
        except Exception as e:
            logger.warn("Error while downloading extension: {0}", e)

    if package is None:
        raise ExtensionError("Failed to download extension")

    self.logger.verbose("Unpack extension package")
    self.pkg_file = os.path.join(conf.get_lib_dir(), os.path.basename(uri.uri) + ".zip")
    try:
        fileutil.write_file(self.pkg_file, bytearray(package), asbin=True)
        zipfile.ZipFile(self.pkg_file).extractall(self.get_base_dir())
    except IOError as e:
        fileutil.clean_ioerror(e, paths=[self.get_base_dir(), self.pkg_file])
        raise ExtensionError(u"Failed to write and unzip plugin", e)

    # Add user execute permission to all files under the base dir
    for file in fileutil.get_all_files(self.get_base_dir()):
        fileutil.chmod(file, os.stat(file).st_mode | stat.S_IXUSR)

    duration = elapsed_milliseconds(begin_utc)
    self.report_event(message="Download succeeded", duration=duration)

    self.logger.info("Initialize extension directory")
    # Save HandlerManifest.json
    man_file = fileutil.search_file(self.get_base_dir(), 'HandlerManifest.json')
    if man_file is None:
        raise ExtensionError("HandlerManifest.json not found")

    try:
        man = fileutil.read_file(man_file, remove_bom=True)
        fileutil.write_file(self.get_manifest_file(), man)
    except IOError as e:
        fileutil.clean_ioerror(e, paths=[self.get_base_dir(), self.pkg_file])
        raise ExtensionError(u"Failed to save HandlerManifest.json", e)

    # Create status and config dir
    try:
        status_dir = self.get_status_dir()
        fileutil.mkdir(status_dir, mode=0o700)

        seq_no, status_path = self.get_status_file_path()
        if seq_no > -1:
            now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
            status = {
                "version": 1.0,
                "timestampUTC": now,
                "status": {
                    "name": self.ext_handler.name,
                    "operation": "Enabling Handler",
                    "status": "transitioning",
                    "code": 0
                }
            }
            # write_file takes the destination path first, then the contents.
            fileutil.write_file(status_path, json.dumps(status))

        conf_dir = self.get_conf_dir()
        fileutil.mkdir(conf_dir, mode=0o700)
    except IOError as e:
        fileutil.clean_ioerror(e, paths=[self.get_base_dir(), self.pkg_file])
        raise ExtensionError(u"Failed to create status or config dir", e)

    # Save HandlerEnvironment.json
    self.create_handler_env()

def run(self):
    """
    This is the main loop which watches for agent and extension updates.
    """

    logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

    # Launch monitoring threads
    from azurelinuxagent.ga.monitor import get_monitor_handler
    get_monitor_handler().run()

    from azurelinuxagent.ga.env import get_env_handler
    get_env_handler().run()

    from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
    exthandlers_handler = get_exthandlers_handler()
    migrate_handler_state()

    try:
        send_event_time = datetime.utcnow()

        self._ensure_no_orphans()
        self._emit_restart_event()

        while self.running:
            if self._is_orphaned:
                logger.info("Goal state agent {0} was orphaned -- exiting", CURRENT_AGENT)
                break

            if self._upgrade_available():
                if len(self.agents) > 0:
                    logger.info(
                        u"Agent {0} discovered {1} as an update and will exit",
                        CURRENT_AGENT,
                        self.agents[0].name)
                break

            utc_start = datetime.utcnow()

            last_etag = exthandlers_handler.last_etag
            exthandlers_handler.run()

            log_event = last_etag != exthandlers_handler.last_etag or \
                        (datetime.utcnow() >= send_event_time)
            add_event(
                AGENT_NAME,
                version=CURRENT_VERSION,
                op=WALAEventOperation.ProcessGoalState,
                is_success=True,
                duration=elapsed_milliseconds(utc_start),
                log_event=log_event)
            if log_event:
                send_event_time += timedelta(minutes=REPORT_STATUS_INTERVAL)

            test_agent = self.get_test_agent()
            if test_agent is not None and test_agent.in_slice:
                test_agent.enable()
                logger.info(u"Enabled Agent {0} as test agent", test_agent.name)
                break

            time.sleep(GOAL_STATE_INTERVAL)

    except Exception as e:
        logger.warn(u"Agent {0} failed with exception: {1}", CURRENT_AGENT, ustr(e))
        logger.warn(traceback.format_exc())
        sys.exit(1)
        return

    self._shutdown()
    sys.exit(0)
    return

def run(self, debug=False):
    """
    This is the main loop which watches for agent and extension updates.
    """

    try:
        logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

        #
        # Fetch the goal state one time; some components depend on information provided by the goal state and this
        # call ensures the required info is initialized (e.g telemetry depends on the container ID.)
        #
        protocol = self.protocol_util.get_protocol()
        protocol.update_goal_state()

        # Initialize the common parameters for telemetry events
        initialize_event_logger_vminfo_common_parameters(protocol)

        # Log OS-specific info.
        os_info_msg = u"Distro: {dist_name}-{dist_ver}; "\
            u"OSUtil: {util_name}; AgentService: {service_name}; "\
            u"Python: {py_major}.{py_minor}.{py_micro}; "\
            u"systemd: {systemd}; "\
            u"LISDrivers: {lis_ver}; "\
            u"logrotate: {has_logrotate};".format(
                dist_name=DISTRO_NAME, dist_ver=DISTRO_VERSION,
                util_name=type(self.osutil).__name__,
                service_name=self.osutil.service_name,
                py_major=PY_VERSION_MAJOR, py_minor=PY_VERSION_MINOR,
                py_micro=PY_VERSION_MICRO, systemd=systemd.is_systemd(),
                lis_ver=get_lis_version(), has_logrotate=has_logrotate()
            )

        logger.info(os_info_msg)
        add_event(AGENT_NAME, op=WALAEventOperation.OSInfo, message=os_info_msg)

        #
        # Perform initialization tasks
        #
        from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
        exthandlers_handler = get_exthandlers_handler(protocol)
        migrate_handler_state()

        from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
        remote_access_handler = get_remote_access_handler(protocol)

        self._ensure_no_orphans()
        self._emit_restart_event()
        self._emit_changes_in_default_configuration()
        self._ensure_partition_assigned()
        self._ensure_readonly_files()
        self._ensure_cgroups_initialized()
        self._ensure_extension_telemetry_state_configured_properly(protocol)
        self._ensure_firewall_rules_persisted(dst_ip=protocol.get_endpoint())

        # Get all thread handlers
        telemetry_handler = get_send_telemetry_events_handler(self.protocol_util)
        all_thread_handlers = [
            get_monitor_handler(),
            get_env_handler(),
            telemetry_handler,
            get_collect_telemetry_events_handler(telemetry_handler)
        ]

        if is_log_collection_allowed():
            all_thread_handlers.append(get_collect_logs_handler())

        # Launch all monitoring threads
        for thread_handler in all_thread_handlers:
            thread_handler.run()

        goal_state_interval = conf.get_goal_state_period() if conf.get_extensions_enabled() else GOAL_STATE_INTERVAL_DISABLED

        while self.running:
            #
            # Check that the parent process (the agent's daemon) is still running
            #
            if not debug and self._is_orphaned:
                logger.info("Agent {0} is an orphan -- exiting", CURRENT_AGENT)
                break

            #
            # Check that all the threads are still running
            #
            for thread_handler in all_thread_handlers:
                if not thread_handler.is_alive():
                    logger.warn("{0} thread died, restarting".format(thread_handler.get_thread_name()))
                    thread_handler.start()

            #
            # Process the goal state
            #
            if not protocol.try_update_goal_state():
                self._heartbeat_update_goal_state_error_count += 1
            else:
                if self._upgrade_available(protocol):
                    available_agent = self.get_latest_agent()
                    if available_agent is None:
                        logger.info("Agent {0} is reverting to the installed agent -- exiting", CURRENT_AGENT)
                    else:
                        logger.info(u"Agent {0} discovered update {1} -- exiting", CURRENT_AGENT, available_agent.name)
                    break

                utc_start = datetime.utcnow()

                last_etag = exthandlers_handler.last_etag
                exthandlers_handler.run()
                remote_access_handler.run()

                if last_etag != exthandlers_handler.last_etag:
                    self._ensure_readonly_files()
                    duration = elapsed_milliseconds(utc_start)
                    activity_id, correlation_id, gs_creation_time = exthandlers_handler.get_goal_state_debug_metadata()
                    msg = 'ProcessGoalState completed [Incarnation: {0}; {1} ms; Activity Id: {2}; Correlation Id: {3}; GS Creation Time: {4}]'.format(
                        exthandlers_handler.last_etag, duration, activity_id, correlation_id, gs_creation_time)
                    logger.info(msg)
                    add_event(AGENT_NAME, op=WALAEventOperation.ProcessGoalState, duration=duration, message=msg)

            self._send_heartbeat_telemetry(protocol)
            time.sleep(goal_state_interval)

    except Exception as error:
        msg = u"Agent {0} failed with exception: {1}".format(CURRENT_AGENT, ustr(error))
        self._set_sentinel(msg=msg)
        logger.warn(msg)
        logger.warn(traceback.format_exc())
        sys.exit(1)
        # additional return here because sys.exit is mocked in unit tests
        return

    self._shutdown()
    sys.exit(0)

def run(self):
    """
    This is the main loop which watches for agent and extension updates.
    """

    logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

    # Launch monitoring threads
    from azurelinuxagent.ga.monitor import get_monitor_handler
    get_monitor_handler().run()

    from azurelinuxagent.ga.env import get_env_handler
    get_env_handler().run()

    from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
    exthandlers_handler = get_exthandlers_handler()
    migrate_handler_state()

    try:
        self._ensure_no_orphans()
        self._emit_restart_event()
        self._ensure_partition_assigned()

        while self.running:
            if self._is_orphaned:
                logger.info("Agent {0} is an orphan -- exiting", CURRENT_AGENT)
                break

            if self._upgrade_available():
                available_agent = self.get_latest_agent()
                if available_agent is None:
                    logger.info("Agent {0} is reverting to the installed agent -- exiting", CURRENT_AGENT)
                else:
                    logger.info(u"Agent {0} discovered update {1} -- exiting", CURRENT_AGENT, available_agent.name)
                break

            utc_start = datetime.utcnow()

            last_etag = exthandlers_handler.last_etag
            exthandlers_handler.run()

            if last_etag != exthandlers_handler.last_etag:
                add_event(AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.ProcessGoalState,
                          is_success=True,
                          duration=elapsed_milliseconds(utc_start),
                          message="Incarnation {0}".format(exthandlers_handler.last_etag),
                          log_event=True)

            time.sleep(GOAL_STATE_INTERVAL)

    except Exception as e:
        logger.warn(u"Agent {0} failed with exception: {1}", CURRENT_AGENT, ustr(e))
        logger.warn(traceback.format_exc())
        sys.exit(1)
        # additional return here because sys.exit is mocked in unit tests
        return

    self._shutdown()
    sys.exit(0)

def download(self):
    begin_utc = datetime.datetime.utcnow()
    self.logger.verbose("Download extension package")
    self.set_operation(WALAEventOperation.Download)

    if self.pkg is None:
        raise ExtensionError("No package uri found")

    uris_shuffled = self.pkg.uris
    random.shuffle(uris_shuffled)
    file_downloaded = False

    for uri in uris_shuffled:
        try:
            destination = os.path.join(conf.get_lib_dir(), os.path.basename(uri.uri) + ".zip")
            file_downloaded = self.protocol.download_ext_handler_pkg(uri.uri, destination)

            if file_downloaded and os.path.exists(destination):
                self.pkg_file = destination
                break

        except Exception as e:
            logger.warn("Error while downloading extension: {0}", ustr(e))

    if not file_downloaded:
        raise ExtensionError("Failed to download extension", code=1001)

    self.logger.verbose("Unzip extension package")
    try:
        zipfile.ZipFile(self.pkg_file).extractall(self.get_base_dir())
        os.remove(self.pkg_file)
    except IOError as e:
        fileutil.clean_ioerror(e, paths=[self.get_base_dir(), self.pkg_file])
        raise ExtensionError(u"Failed to unzip extension package", e, code=1001)

    # Add user execute permission to all files under the base dir
    for file in fileutil.get_all_files(self.get_base_dir()):
        fileutil.chmod(file, os.stat(file).st_mode | stat.S_IXUSR)

    duration = elapsed_milliseconds(begin_utc)
    self.report_event(message="Download succeeded", duration=duration)

    self.logger.info("Initialize extension directory")
    # Save HandlerManifest.json
    man_file = fileutil.search_file(self.get_base_dir(), 'HandlerManifest.json')
    if man_file is None:
        raise ExtensionError("HandlerManifest.json not found")

    try:
        man = fileutil.read_file(man_file, remove_bom=True)
        fileutil.write_file(self.get_manifest_file(), man)
    except IOError as e:
        fileutil.clean_ioerror(e, paths=[self.get_base_dir(), self.pkg_file])
        raise ExtensionError(u"Failed to save HandlerManifest.json", e)

    # Create status and config dir
    try:
        status_dir = self.get_status_dir()
        fileutil.mkdir(status_dir, mode=0o700)

        seq_no, status_path = self.get_status_file_path()
        if status_path is not None:
            now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
            status = {
                "version": 1.0,
                "timestampUTC": now,
                "status": {
                    "name": self.ext_handler.name,
                    "operation": "Enabling Handler",
                    "status": "transitioning",
                    "code": 0
                }
            }
            fileutil.write_file(status_path, json.dumps(status))

        conf_dir = self.get_conf_dir()
        fileutil.mkdir(conf_dir, mode=0o700)
    except IOError as e:
        fileutil.clean_ioerror(e, paths=[self.get_base_dir(), self.pkg_file])
        raise ExtensionError(u"Failed to create status or config dir", e)

    # Save HandlerEnvironment.json
    self.create_handler_env()

def run(self):
    """
    This is the main loop which watches for agent and extension updates.
    """

    try:
        logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

        # Launch monitoring threads
        from azurelinuxagent.ga.monitor import get_monitor_handler
        monitor_thread = get_monitor_handler()
        monitor_thread.run()

        from azurelinuxagent.ga.env import get_env_handler
        env_thread = get_env_handler()
        env_thread.run()

        from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
        exthandlers_handler = get_exthandlers_handler()
        migrate_handler_state()

        from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
        remote_access_handler = get_remote_access_handler()

        self._ensure_no_orphans()
        self._emit_restart_event()
        self._ensure_partition_assigned()
        self._ensure_readonly_files()

        while self.running:
            if self._is_orphaned:
                logger.info("Agent {0} is an orphan -- exiting", CURRENT_AGENT)
                break

            if not monitor_thread.is_alive():
                logger.warn(u"Monitor thread died, restarting")
                monitor_thread.start()

            if not env_thread.is_alive():
                logger.warn(u"Environment thread died, restarting")
                env_thread.start()

            if self._upgrade_available():
                available_agent = self.get_latest_agent()
                if available_agent is None:
                    logger.info("Agent {0} is reverting to the installed agent -- exiting", CURRENT_AGENT)
                else:
                    logger.info(u"Agent {0} discovered update {1} -- exiting", CURRENT_AGENT, available_agent.name)
                break

            utc_start = datetime.utcnow()

            last_etag = exthandlers_handler.last_etag
            exthandlers_handler.run()
            remote_access_handler.run()

            if last_etag != exthandlers_handler.last_etag:
                self._ensure_readonly_files()
                duration = elapsed_milliseconds(utc_start)
                logger.info('ProcessGoalState completed [incarnation {0}; {1} ms]',
                            exthandlers_handler.last_etag, duration)
                add_event(
                    AGENT_NAME,
                    op=WALAEventOperation.ProcessGoalState,
                    duration=duration,
                    message="Incarnation {0}".format(exthandlers_handler.last_etag))

            time.sleep(GOAL_STATE_INTERVAL)

    except Exception as e:
        msg = u"Agent {0} failed with exception: {1}".format(CURRENT_AGENT, ustr(e))
        self._set_sentinel(msg=msg)
        logger.warn(msg)
        logger.warn(traceback.format_exc())
        sys.exit(1)
        # additional return here because sys.exit is mocked in unit tests
        return

    self._shutdown()
    sys.exit(0)

def test_elapsed_milliseconds(self):
    utc_start = datetime.utcnow() + timedelta(days=1)
    self.assertEqual(0, elapsed_milliseconds(utc_start))

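# elapsed_milliseconds is exercised throughout this section but not defined in
# it. Below is a minimal sketch consistent with the test above (a start time in
# the future yields 0); the exact rounding is an assumption:
def _elapsed_milliseconds_sketch(utc_start):
    now = datetime.utcnow()
    if now < utc_start:
        # Clamp to zero when the clock moved backwards or the start is in the future.
        return 0
    delta = now - utc_start
    return int(((delta.days * 24 * 60 * 60 + delta.seconds) * 1000) + (delta.microseconds / 1000.0))
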
def run(self, debug=False):
    """
    This is the main loop which watches for agent and extension updates.
    """

    try:
        # NOTE: Do not add any telemetry events until after the monitoring handler has been started with the
        # call to 'monitor_thread.run()'. That method call initializes the protocol, which is needed in order to
        # load the goal state and update the container id in memory. Any telemetry events sent before this happens
        # will result in an uninitialized container id value.

        logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

        # Log OS-specific info locally.
        os_info_msg = u"Distro info: {0} {1}, osutil class being used: {2}, " \
                      u"agent service name: {3}".format(DISTRO_NAME, DISTRO_VERSION,
                                                        type(self.osutil).__name__, self.osutil.service_name)
        logger.info(os_info_msg)

        # Launch monitoring threads
        from azurelinuxagent.ga.monitor import get_monitor_handler
        monitor_thread = get_monitor_handler()
        monitor_thread.run()

        # NOTE: Any telemetry events added from this point on will be properly populated with the container id.

        from azurelinuxagent.ga.env import get_env_handler
        env_thread = get_env_handler()
        env_thread.run()

        from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
        exthandlers_handler = get_exthandlers_handler()
        migrate_handler_state()

        from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
        remote_access_handler = get_remote_access_handler()

        self._ensure_no_orphans()
        self._emit_restart_event()
        self._ensure_partition_assigned()
        self._ensure_readonly_files()
        self._ensure_cgroups_initialized()

        # Send OS-specific info as a telemetry event after the monitoring thread has been initialized, and with
        # it the container id too.
        add_event(AGENT_NAME, op=WALAEventOperation.OSInfo, message=os_info_msg)

        goal_state_interval = GOAL_STATE_INTERVAL \
            if conf.get_extensions_enabled() \
            else GOAL_STATE_INTERVAL_DISABLED

        while self.running:
            if not debug and self._is_orphaned:
                logger.info("Agent {0} is an orphan -- exiting", CURRENT_AGENT)
                break

            if not monitor_thread.is_alive():
                logger.warn(u"Monitor thread died, restarting")
                monitor_thread.start()

            if not env_thread.is_alive():
                logger.warn(u"Environment thread died, restarting")
                env_thread.start()

            if self._upgrade_available():
                available_agent = self.get_latest_agent()
                if available_agent is None:
                    logger.info("Agent {0} is reverting to the installed agent -- exiting", CURRENT_AGENT)
                else:
                    logger.info(u"Agent {0} discovered update {1} -- exiting", CURRENT_AGENT, available_agent.name)
                break

            utc_start = datetime.utcnow()

            last_etag = exthandlers_handler.last_etag
            exthandlers_handler.run()
            remote_access_handler.run()

            if last_etag != exthandlers_handler.last_etag:
                self._ensure_readonly_files()
                duration = elapsed_milliseconds(utc_start)
                logger.info('ProcessGoalState completed [incarnation {0}; {1} ms]',
                            exthandlers_handler.last_etag, duration)
                add_event(AGENT_NAME,
                          op=WALAEventOperation.ProcessGoalState,
                          duration=duration,
                          message="Incarnation {0}".format(exthandlers_handler.last_etag))

            time.sleep(goal_state_interval)

    except Exception as e:
        msg = u"Agent {0} failed with exception: {1}".format(CURRENT_AGENT, ustr(e))
        self._set_sentinel(msg=msg)
        logger.warn(msg)
        logger.warn(traceback.format_exc())
        sys.exit(1)
        # additional return here because sys.exit is mocked in unit tests
        return

    self._shutdown()
    sys.exit(0)

def run(self):
    """
    This is the main loop which watches for agent and extension updates.
    """

    try:
        logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

        # Launch monitoring threads
        from azurelinuxagent.ga.monitor import get_monitor_handler
        monitor_thread = get_monitor_handler()
        monitor_thread.run()

        from azurelinuxagent.ga.env import get_env_handler
        env_thread = get_env_handler()
        env_thread.run()

        from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
        exthandlers_handler = get_exthandlers_handler()
        migrate_handler_state()

        from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
        remote_access_handler = get_remote_access_handler()

        self._ensure_no_orphans()
        self._emit_restart_event()
        self._ensure_partition_assigned()
        self._ensure_readonly_files()

        while self.running:
            if self._is_orphaned:
                logger.info("Agent {0} is an orphan -- exiting", CURRENT_AGENT)
                break

            if not monitor_thread.is_alive():
                logger.warn(u"Monitor thread died, restarting")
                monitor_thread.start()

            if not env_thread.is_alive():
                logger.warn(u"Environment thread died, restarting")
                env_thread.start()

            if self._upgrade_available():
                available_agent = self.get_latest_agent()
                if available_agent is None:
                    logger.info("Agent {0} is reverting to the installed agent -- exiting", CURRENT_AGENT)
                else:
                    logger.info(u"Agent {0} discovered update {1} -- exiting", CURRENT_AGENT, available_agent.name)
                break

            utc_start = datetime.utcnow()

            last_etag = exthandlers_handler.last_etag
            exthandlers_handler.run()
            remote_access_handler.run()

            if last_etag != exthandlers_handler.last_etag:
                self._ensure_readonly_files()
                add_event(
                    AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.ProcessGoalState,
                    is_success=True,
                    duration=elapsed_milliseconds(utc_start),
                    message="Incarnation {0}".format(exthandlers_handler.last_etag),
                    log_event=True)

            time.sleep(GOAL_STATE_INTERVAL)

    except Exception as e:
        msg = u"Agent {0} failed with exception: {1}".format(CURRENT_AGENT, ustr(e))
        self._set_sentinel(msg=msg)
        logger.warn(msg)
        logger.warn(traceback.format_exc())
        sys.exit(1)
        # additional return here because sys.exit is mocked in unit tests
        return

    self._shutdown()
    sys.exit(0)

def run(self, debug=False):
    """
    This is the main loop which watches for agent and extension updates.
    """

    try:
        logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

        #
        # Fetch the goal state one time; some components depend on information provided by the goal state and this
        # call ensures the required info is initialized (e.g telemetry depends on the container ID.)
        #
        protocol = self.protocol_util.get_protocol()
        protocol.update_goal_state()

        initialize_event_logger_vminfo_common_parameters(protocol)

        # Log OS-specific info.
        os_info_msg = u"Distro: {0}-{1}; OSUtil: {2}; AgentService: {3}; Python: {4}.{5}.{6}".format(
            DISTRO_NAME, DISTRO_VERSION, type(self.osutil).__name__, self.osutil.service_name,
            PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO)
        logger.info(os_info_msg)
        add_event(AGENT_NAME, op=WALAEventOperation.OSInfo, message=os_info_msg)

        # Launch monitoring threads
        from azurelinuxagent.ga.monitor import get_monitor_handler
        monitor_thread = get_monitor_handler()
        monitor_thread.run()

        from azurelinuxagent.ga.env import get_env_handler
        env_thread = get_env_handler()
        env_thread.run()

        from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
        exthandlers_handler = get_exthandlers_handler(protocol)
        migrate_handler_state()

        from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
        remote_access_handler = get_remote_access_handler(protocol)

        self._ensure_no_orphans()
        self._emit_restart_event()
        self._emit_changes_in_default_configuration()
        self._ensure_partition_assigned()
        self._ensure_readonly_files()
        self._ensure_cgroups_initialized()

        goal_state_interval = conf.get_goal_state_period() if conf.get_extensions_enabled() else GOAL_STATE_INTERVAL_DISABLED

        while self.running:
            #
            # Check that the parent process (the agent's daemon) is still running
            #
            if not debug and self._is_orphaned:
                logger.info("Agent {0} is an orphan -- exiting", CURRENT_AGENT)
                break

            #
            # Check that all the threads are still running
            #
            if not monitor_thread.is_alive():
                logger.warn(u"Monitor thread died, restarting")
                monitor_thread.start()

            if not env_thread.is_alive():
                logger.warn(u"Environment thread died, restarting")
                env_thread.start()

            #
            # Process the goal state
            #
            if not protocol.try_update_goal_state():
                self._heartbeat_update_goal_state_error_count += 1
            else:
                if self._upgrade_available(protocol):
                    available_agent = self.get_latest_agent()
                    if available_agent is None:
                        logger.info("Agent {0} is reverting to the installed agent -- exiting", CURRENT_AGENT)
                    else:
                        logger.info(u"Agent {0} discovered update {1} -- exiting", CURRENT_AGENT, available_agent.name)
                    break

                utc_start = datetime.utcnow()

                last_etag = exthandlers_handler.last_etag
                exthandlers_handler.run()
                remote_access_handler.run()

                if last_etag != exthandlers_handler.last_etag:
                    self._ensure_readonly_files()
                    duration = elapsed_milliseconds(utc_start)
                    logger.info('ProcessGoalState completed [incarnation {0}; {1} ms]',
                                exthandlers_handler.last_etag, duration)
                    add_event(AGENT_NAME,
                              op=WALAEventOperation.ProcessGoalState,
                              duration=duration,
                              message="Incarnation {0}".format(exthandlers_handler.last_etag))

            self._send_heartbeat_telemetry(protocol)
            time.sleep(goal_state_interval)

    except Exception as e:
        msg = u"Agent {0} failed with exception: {1}".format(CURRENT_AGENT, ustr(e))
        self._set_sentinel(msg=msg)
        logger.warn(msg)
        logger.warn(traceback.format_exc())
        sys.exit(1)
        # additional return here because sys.exit is mocked in unit tests
        return

    self._shutdown()
    sys.exit(0)