Example #1
    def _collect_logs():
        logger.info("Starting log collection...")

        # Invoke the command line tool in the agent to collect logs, with resource limits on CPU and memory (RAM).
        scope_name = "collect-logs-{0}.scope".format(
            ustr(int(time.time() * 1000000)))
        systemd_cmd = [
            "systemd-run", "--unit={0}".format(scope_name), "--scope"
        ]

        # More info on resource limits properties in systemd here:
        # https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/resource_management_guide/sec-modifying_control_groups
        cpu_limit, memory_limit = CollectLogsHandler._get_resource_limits()
        resource_limits = [
            "--property=CPUAccounting=1",
            "--property=CPUQuota={0}".format(cpu_limit),
            "--property=MemoryAccounting=1",
            "--property=MemoryLimit={0}".format(memory_limit)
        ]

        # The log tool is invoked from the current agent's egg with the command line option
        collect_logs_cmd = [sys.executable, "-u", sys.argv[0], "-collect-logs"]
        final_command = systemd_cmd + resource_limits + collect_logs_cmd

        start_time = datetime.datetime.utcnow()
        success = False
        msg = None
        try:
            shellutil.run_command(final_command, log_error=True)
            duration = elapsed_milliseconds(start_time)
            archive_size = os.path.getsize(COMPRESSED_ARCHIVE_PATH)

            msg = "Successfully collected logs. Archive size: {0} b, elapsed time: {1} ms.".format(
                archive_size, duration)
            logger.info(msg)
            success = True

            return True
        except Exception as e:  # pylint: disable=C0103
            duration = elapsed_milliseconds(start_time)

            if isinstance(e, CommandError):
                exception_message = "[stderr] {0}".format(e.stderr)  # pylint: disable=no-member
            else:
                exception_message = ustr(e)

            msg = "Failed to collect logs. Elapsed time: {0} ms. Error: {1}".format(
                duration, exception_message)
            # No need to log to the local log since we ran run_command with logging errors as enabled

            return False
        finally:
            add_event(name=AGENT_NAME,
                      version=CURRENT_VERSION,
                      op=WALAEventOperation.LogCollection,
                      is_success=success,
                      message=msg,
                      log_event=False)
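
The `systemd-run --scope` invocation above wraps the log collector in a transient scope so that CPU and memory accounting and limits apply to it. Below is a minimal standalone sketch of the assembled command line; the quota values are made-up placeholders, the real ones come from `CollectLogsHandler._get_resource_limits()`.

import time

# Hypothetical limits standing in for CollectLogsHandler._get_resource_limits().
cpu_quota, memory_limit = "5%", "30M"

scope_name = "collect-logs-{0}.scope".format(int(time.time() * 1000000))
command = [
    "systemd-run", "--unit={0}".format(scope_name), "--scope",
    "--property=CPUAccounting=1", "--property=CPUQuota={0}".format(cpu_quota),
    "--property=MemoryAccounting=1", "--property=MemoryLimit={0}".format(memory_limit),
    "echo", "log-collection-placeholder",  # stand-in for the real "-collect-logs" invocation
]
print(" ".join(command))  # inspect the command line that would run under the transient scope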
Example #2
    def test_save_event_message_with_non_ascii_characters(self):
        test_data_dir = os.path.join(
            data_dir, "events", "collect_and_send_extension_stdout_stderror")
        msg = ""

        with open(os.path.join(test_data_dir,
                               "dummy_stdout_with_non_ascii_characters"),
                  mode="r+b") as stdout:
            with open(os.path.join(test_data_dir,
                                   "dummy_stderr_with_non_ascii_characters"),
                      mode="r+b") as stderr:
                msg = read_output(stdout, stderr)

        duration = elapsed_milliseconds(datetime.utcnow())
        log_msg = "{0}\n{1}".format(
            "DummyCmd",
            "\n".join([line for line in msg.split('\n') if line != ""]))

        add_event('test_extension', message=log_msg, duration=duration)

        for tld_file in os.listdir(self.tmp_dir):
            event_str = MonitorHandler.collect_event(
                os.path.join(self.tmp_dir, tld_file))
            event_json = json.loads(event_str)

            self.assertEqual(len(event_json["parameters"]), 8)

            for i in event_json["parameters"]:
                if i["name"] == "Name":
                    self.assertEqual(i["value"], "test_extension")
                if i["name"] == "Message":
                    self.assertEqual(i["value"], log_msg)
Example #3
    def launch_command(self, cmd, timeout=300):
        begin_utc = datetime.datetime.utcnow()
        self.logger.verbose("Launch command: [{0}]", cmd)
        base_dir = self.get_base_dir()

        try:
            # This should be .run(), but due to the wide variety
            # of Python versions we must support we must use .communicate().
            # Some extensions erroneously begin cmd with a slash; don't interpret those
            # as root-relative. (Issue #1170)
            full_path = os.path.join(base_dir, cmd.lstrip(os.sep))
            process = subprocess.Popen(full_path,
                                       shell=True,
                                       cwd=base_dir,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       env=os.environ,
                                       preexec_fn=os.setsid)
        except OSError as e:
            raise ExtensionError("Failed to launch '{0}': {1}".format(
                full_path, e.strerror))

        msg = capture_from_process(process, cmd, timeout)

        ret = process.poll()
        if ret is None or ret != 0:
            raise ExtensionError("Non-zero exit code: {0}, {1}\n{2}".format(
                ret, cmd, msg))

        duration = elapsed_milliseconds(begin_utc)
        self.report_event(message="{0}\n{1}".format(cmd, msg),
                          duration=duration,
                          log_event=False)
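
The comment about commands that erroneously start with a slash (Issue #1170) refers to a property of `os.path.join`: an absolute component discards everything joined before it. A short illustration of why `lstrip(os.sep)` is applied first (the paths are made up):

import os

base_dir = "/var/lib/waagent/Some.Extension-1.0"
cmd = "/bin/install.sh"   # hypothetical command that wrongly begins with a slash

print(os.path.join(base_dir, cmd))                 # -> /bin/install.sh (base_dir is discarded)
print(os.path.join(base_dir, cmd.lstrip(os.sep)))  # -> /var/lib/waagent/Some.Extension-1.0/bin/install.sh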
Example #4
    def run(self):
        # If provision is enabled, run default provision handler
        if conf.get_provision_enabled():
            logger.warn("Provisioning flag is enabled, which overrides using "
                        "cloud-init; running the default provisioning code")
            super(CloudInitProvisionHandler, self).run()
            return

        try:
            if super(CloudInitProvisionHandler, self).is_provisioned():
                logger.info("Provisioning already completed, skipping.")
                return

            utc_start = datetime.utcnow()
            logger.info("Running CloudInit provisioning handler")
            self.wait_for_ovfenv()
            self.protocol_util.get_protocol()
            self.report_not_ready("Provisioning", "Starting")

            thumbprint = self.wait_for_ssh_host_key()
            self.write_provisioned()
            logger.info("Finished provisioning")

            self.report_ready(thumbprint)
            self.report_event("Provisioning with cloud-init succeeded ({0}s)".format(self._get_uptime_seconds()),
                is_success=True,
                duration=elapsed_milliseconds(utc_start))

        except ProvisionError as e:
            msg = "Provisioning with cloud-init failed: {0} ({1}s)".format(ustr(e), self._get_uptime_seconds())
            logger.error(msg)
            self.report_not_ready("ProvisioningFailed", ustr(e))
            self.report_event(msg)
            return
Example #5
    def run(self):
        try:
            if super(CloudInitProvisionHandler, self).check_provisioned_file():
                logger.info("Provisioning already completed, skipping.")
                return

            utc_start = datetime.utcnow()
            logger.info("Running CloudInit provisioning handler")
            self.wait_for_ovfenv()
            self.protocol_util.get_protocol()
            self.report_not_ready("Provisioning", "Starting")

            thumbprint = self.wait_for_ssh_host_key()  # pylint: disable=W0612
            self.write_provisioned()
            logger.info("Finished provisioning")

            self.report_ready()
            self.report_event(
                "Provisioning with cloud-init succeeded ({0}s)".format(
                    self._get_uptime_seconds()),
                is_success=True,
                duration=elapsed_milliseconds(utc_start))

        except ProvisionError as e:
            msg = "Provisioning with cloud-init failed: {0} ({1}s)".format(
                ustr(e), self._get_uptime_seconds())
            logger.error(msg)
            self.report_not_ready("ProvisioningFailed", ustr(e))
            self.report_event(msg)
            return
Example #6
    def launch_command(self, cmd, timeout=300):
        begin_utc = datetime.datetime.utcnow()
        self.logger.verbose("Launch command: [{0}]", cmd)
        base_dir = self.get_base_dir()
        try:
            devnull = open(os.devnull, 'w')
            child = subprocess.Popen(base_dir + "/" + cmd,
                                     shell=True,
                                     cwd=base_dir,
                                     stdout=devnull,
                                     env=os.environ)
        except Exception as e:
            # TODO: do not catch all exceptions
            raise ExtensionError("Failed to launch: {0}, {1}".format(cmd, e))

        retry = timeout
        while retry > 0 and child.poll() is None:
            time.sleep(1)
            retry -= 1
        if retry == 0:
            os.kill(child.pid, 9)
            raise ExtensionError("Timeout({0}): {1}".format(timeout, cmd))

        ret = child.wait()
        if ret is None or ret != 0:
            raise ExtensionError("Non-zero exit code: {0}, {1}".format(
                ret, cmd))

        duration = elapsed_milliseconds(begin_utc)
        self.report_event(message="Launch command succeeded: {0}".format(cmd),
                          duration=duration)
Example #7
    def launch_command(self, cmd, timeout=300):
        begin_utc = datetime.datetime.utcnow()
        self.logger.verbose("Launch command: [{0}]", cmd)
        base_dir = self.get_base_dir()

        def sanitize(s):
            return ustr(s, encoding='utf-8', errors='backslashreplace')

        try:
            # This should be .run(), but due to the wide variety
            # of Python versions we must support we must use .communicate().
            process = subprocess.Popen(os.path.join(base_dir, cmd),
                                       shell=True,
                                       cwd=base_dir,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       env=os.environ)
            # timeout must be passed by keyword; the first positional argument of communicate() is stdin input.
            stdout, stderr = process.communicate(timeout=timeout)
        except subprocess.TimeoutExpired:
            process.kill()
            stdout, stderr = process.communicate()
            msg = format_stdout_stderr(sanitize(stdout), sanitize(stderr))
            raise ExtensionError("Timeout({0}): {1}\n{2}".format(timeout, cmd, msg))
        except Exception as e:
            process.kill()
            process.wait()
            raise ExtensionError("Failed to launch: {0}, {1}".format(cmd, e))

        ret = process.poll()
        if ret is None or ret != 0:
            raise ExtensionError("Non-zero exit code: {0}, {1}".format(ret, cmd))

        duration = elapsed_milliseconds(begin_utc)
        msg = format_stdout_stderr(sanitize(stdout), sanitize(stderr))
        self.report_event(message="{0}\n{1}".format(cmd, msg), duration=duration)
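
`sanitize` above decodes raw stdout/stderr bytes without ever raising on malformed UTF-8 (`ustr` is the agent's alias for `unicode`/`str` depending on the Python major version). The behaviour it relies on is the standard `backslashreplace` error handler, which supports decoding on Python 3.5+:

raw = b"ok \xe9 done"   # made-up output containing a byte that is not valid UTF-8

# Strict decoding would raise UnicodeDecodeError; backslashreplace keeps the rest of
# the output and renders the offending byte as an escape sequence instead.
print(raw.decode("utf-8", errors="backslashreplace"))   # -> ok \xe9 done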
Example #8
    def test_save_event_message_with_non_ascii_characters(self):
        test_data_dir = os.path.join(data_dir, "events", "collect_and_send_extension_stdout_stderror")
        msg = ""

        with open(os.path.join(test_data_dir, "dummy_stdout_with_non_ascii_characters"), mode="r+b") as stdout:
            with open(os.path.join(test_data_dir, "dummy_stderr_with_non_ascii_characters"), mode="r+b") as stderr:
                msg = read_output(stdout, stderr)

        duration = elapsed_milliseconds(datetime.utcnow())
        log_msg = "{0}\n{1}".format("DummyCmd", "\n".join([line for line in msg.split('\n') if line != ""]))

        with patch("azurelinuxagent.common.event.datetime") as patch_datetime:
            patch_datetime.utcnow = Mock(return_value=datetime.strptime("2019-01-01 01:30:00",
                                                                        '%Y-%m-%d %H:%M:%S'))
            with patch('os.getpid', return_value=42):
                with patch("threading.Thread.getName", return_value="HelloWorldTask"):
                    add_event('test_extension', message=log_msg, duration=duration)

        for tld_file in os.listdir(self.tmp_dir):
            event_str = MonitorHandler.collect_event(os.path.join(self.tmp_dir, tld_file))
            event_json = json.loads(event_str)

            self.assertEqual(len(event_json["parameters"]), 15)

            # Checking the contents passed above, and also validating the default values that were passed in.
            for i in event_json["parameters"]:
                if i["name"] == "Name":
                    self.assertEqual(i["value"], "test_extension")
                elif i["name"] == "Message":
                    self.assertEqual(i["value"], log_msg)
                elif i["name"] == "Version":
                    self.assertEqual(i["value"], str(CURRENT_VERSION))
                elif i['name'] == 'IsInternal':
                    self.assertEqual(i['value'], False)
                elif i['name'] == 'Operation':
                    self.assertEqual(i['value'], 'Unknown')
                elif i['name'] == 'OperationSuccess':
                    self.assertEqual(i['value'], True)
                elif i['name'] == 'Duration':
                    self.assertEqual(i['value'], 0)
                elif i['name'] == 'ExtensionType':
                    self.assertEqual(i['value'], '')
                elif i['name'] == 'ContainerId':
                    self.assertEqual(i['value'], 'UNINITIALIZED')
                elif i['name'] == 'OpcodeName':
                    self.assertEqual(i['value'], '2019-01-01 01:30:00')
                elif i['name'] == 'EventTid':
                    self.assertEqual(i['value'], threading.current_thread().ident)
                elif i['name'] == 'EventPid':
                    self.assertEqual(i['value'], 42)
                elif i['name'] == 'TaskName':
                    self.assertEqual(i['value'], 'HelloWorldTask')
                elif i['name'] == 'KeywordName':
                    self.assertEqual(i['value'], '')
                elif i['name'] == 'GAVersion':
                    self.assertEqual(i['value'], str(CURRENT_AGENT))
                else:
                    self.fail("Contains a field outside the defaults expected. Field Name: {0}".format(i['name']))
Example #9
        def exec_command(output_file):
            start_time = datetime.datetime.utcnow()
            success = False
            msg = None
            try:
                shellutil.run_command(final_command, log_error=False, stdout=output_file, stderr=output_file)
                duration = elapsed_milliseconds(start_time)
                archive_size = os.path.getsize(COMPRESSED_ARCHIVE_PATH)

                msg = "Successfully collected logs. Archive size: {0} b, elapsed time: {1} ms.".format(archive_size,
                                                                                                    duration)
                logger.info(msg)
                success = True

                return True
            except Exception as e:
                duration = elapsed_milliseconds(start_time)
                err_msg = ustr(e)

                if isinstance(e, CommandError):
                    # pylint has limited (i.e. no) awareness of control flow w.r.t. typing. we disable=no-member
                    # here because we know e must be a CommandError but pylint still considers the case where
                    # e is a different type of exception.
                    err_msg = ustr("Log Collector exited with code {0}").format(e.returncode) # pylint: disable=no-member

                    if e.returncode == logcollector.INVALID_CGROUPS_ERRCODE: # pylint: disable=no-member
                        logger.info("Disabling periodic log collection until service restart due to process error.")
                        self.stop()
                    else:
                        logger.info(err_msg)

                msg = "Failed to collect logs. Elapsed time: {0} ms. Error: {1}".format(duration, err_msg)
                # No need to log to the local log since we logged stdout, stderr from the process.

                return False
            finally:
                add_event(
                    name=AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.LogCollection,
                    is_success=success,
                    message=msg,
                    log_event=False)
Example #10
    def run(self):
        if not conf.get_provision_enabled():
            logger.info("Provisioning is disabled, skipping.")
            self.write_provisioned()
            self.report_ready()
            return

        try:
            utc_start = datetime.utcnow()
            thumbprint = None

            if self.is_provisioned():
                logger.info("Provisioning already completed, skipping.")
                return

            logger.info("Running default provisioning handler")

            if not self.validate_cloud_init(is_expected=False):
                raise ProvisionError("cloud-init appears to be running, "
                                     "this is not expected, cannot continue")

            logger.info("Copying ovf-env.xml")
            ovf_env = self.protocol_util.copy_ovf_env()

            self.protocol_util.get_protocol(by_file=True)
            self.report_not_ready("Provisioning", "Starting")
            logger.info("Starting provisioning")

            self.provision(ovf_env)

            thumbprint = self.reg_ssh_host_key()
            self.osutil.restart_ssh_service()

            self.write_provisioned()

            self.report_event("Provisioning succeeded ({0}s)".format(
                self._get_uptime_seconds()),
                              is_success=True,
                              duration=elapsed_milliseconds(utc_start))

            self.report_event(message=ovf_env.provision_guest_agent,
                              is_success=True,
                              duration=0,
                              operation=WALAEventOperation.ProvisionGuestAgent)

            self.report_ready(thumbprint)
            logger.info("Provisioning complete")

        except (ProtocolError, ProvisionError) as e:
            self.report_not_ready("ProvisioningFailed", ustr(e))
            self.report_event(ustr(e))
            logger.error("Provisioning failed: {0}", ustr(e))
            return
Example #11
    def run(self):
        if not conf.get_provision_enabled():
            logger.info("Provisioning is disabled, skipping.")
            self.write_provisioned()
            self.report_ready()
            return

        try:
            utc_start = datetime.utcnow()
            thumbprint = None  # pylint: disable=W0612

            if self.check_provisioned_file():
                logger.info("Provisioning already completed, skipping.")
                return

            logger.info("Running default provisioning handler")

            if cloud_init_is_enabled():
                raise ProvisionError(
                    "cloud-init appears to be installed and enabled, "
                    "this is not expected, cannot continue")

            logger.info("Copying ovf-env.xml")
            ovf_env = self.protocol_util.copy_ovf_env()

            self.protocol_util.get_protocol()  # Trigger protocol detection
            self.report_not_ready("Provisioning", "Starting")
            logger.info("Starting provisioning")

            self.provision(ovf_env)

            thumbprint = self.reg_ssh_host_key()
            self.osutil.restart_ssh_service()

            self.write_provisioned()

            self.report_event("Provisioning succeeded ({0}s)".format(
                self._get_uptime_seconds()),
                              is_success=True,
                              duration=elapsed_milliseconds(utc_start))

            self.handle_provision_guest_agent(ovf_env.provision_guest_agent)

            self.report_ready()
            logger.info("Provisioning complete")

        except (ProtocolError, ProvisionError) as e:
            msg = "Provisioning failed: {0} ({1}s)".format(
                ustr(e), self._get_uptime_seconds())
            logger.error(msg)
            self.report_not_ready("ProvisioningFailed", ustr(e))
            self.report_event(msg, is_success=False)
            return
Example #12
    def launch_command(self, cmd, timeout=300, extension_error_code=1000, env=None):
        begin_utc = datetime.datetime.utcnow()
        self.logger.verbose("Launch command: [{0}]", cmd)
        base_dir = self.get_base_dir()

        if env is None:
            env = {}
        env.update(os.environ)

        try:
            # This should be .run(), but due to the wide variety
            # of Python versions we must support we must use .communicate().
            # Some extensions erroneously begin cmd with a slash; don't interpret those
            # as root-relative. (Issue #1170)
            full_path = os.path.join(base_dir, cmd.lstrip(os.path.sep))

            def pre_exec_function():
                """
                Change process state before the actual target process is started. Effectively, this runs between
                the fork() and the exec() of sub-process creation.
                :return:
                """
                os.setsid()
                CGroups.add_to_extension_cgroup(self.ext_handler.name)

            process = subprocess.Popen(full_path,
                                       shell=True,
                                       cwd=base_dir,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       env=env,
                                       preexec_fn=pre_exec_function)
        except OSError as e:
            raise ExtensionError("Failed to launch '{0}': {1}".format(full_path, e.strerror),
                                 code=extension_error_code)

        cg = CGroups.for_extension(self.ext_handler.name)
        CGroupsTelemetry.track_extension(self.ext_handler.name, cg)
        msg = capture_from_process(process, cmd, timeout, extension_error_code)

        ret = process.poll()
        if ret is None:
            raise ExtensionError("Process {0} was not terminated: {1}\n{2}".format(process.pid, cmd, msg),
                                 code=extension_error_code)
        if ret != 0:
            raise ExtensionError("Non-zero exit code: {0}, {1}\n{2}".format(ret, cmd, msg),
                                 code=extension_error_code)

        duration = elapsed_milliseconds(begin_utc)
        log_msg = "{0}\n{1}".format(cmd, "\n".join([line for line in msg.split('\n') if line != ""]))
        self.logger.verbose(log_msg)
        self.report_event(message=log_msg, duration=duration, log_event=False)
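
`pre_exec_function` runs in the forked child before `exec()`, which is why it can move the extension into its own session and cgroup before the command starts. A stripped-down sketch of the same mechanism without the agent-specific `CGroups` call (POSIX only; `preexec_fn` is documented as unsafe in multithreaded parents):

import os
import subprocess

def child_setup():
    # Executed between fork() and exec() in the child: start a new session so the
    # whole command tree can later be signalled as a single process group.
    os.setsid()

process = subprocess.Popen("echo running in pid $$",   # placeholder command
                           shell=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           preexec_fn=child_setup)
stdout, _ = process.communicate()
print(stdout.decode().strip())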
Example #13
    def launch_command(self, cmd, timeout=300, extension_error_code=1000, env=None):
        begin_utc = datetime.datetime.utcnow()
        self.logger.verbose("Launch command: [{0}]", cmd)
        base_dir = self.get_base_dir()

        if env is None:
            env = {}
        env.update(os.environ)

        try:
            # This should be .run(), but due to the wide variety
            # of Python versions we must support we must use .communicate().
            # Some extensions erroneously begin cmd with a slash; don't interpret those
            # as root-relative. (Issue #1170)
            full_path = os.path.join(base_dir, cmd.lstrip(os.path.sep))

            def pre_exec_function():
                """
                Change process state before the actual target process is started. Effectively, this runs between
                the fork() and the exec() of sub-process creation.
                :return:
                """
                os.setsid()
                CGroups.add_to_extension_cgroup(self.ext_handler.name)

            process = subprocess.Popen(full_path,
                                       shell=True,
                                       cwd=base_dir,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       env=env,
                                       preexec_fn=pre_exec_function)
        except OSError as e:
            raise ExtensionError("Failed to launch '{0}': {1}".format(full_path, e.strerror),
                                 code=extension_error_code)

        cg = CGroups.for_extension(self.ext_handler.name)
        CGroupsTelemetry.track_extension(self.ext_handler.name, cg)
        msg = capture_from_process(process, cmd, timeout, extension_error_code)

        ret = process.poll()
        if ret is None:
            raise ExtensionError("Process {0} was not terminated: {1}\n{2}".format(process.pid, cmd, msg),
                                 code=extension_error_code)
        if ret != 0:
            raise ExtensionError("Non-zero exit code: {0}, {1}\n{2}".format(ret, cmd, msg),
                                 code=extension_error_code)

        duration = elapsed_milliseconds(begin_utc)
        log_msg = "{0}\n{1}".format(cmd, "\n".join([line for line in msg.split('\n') if line != ""]))
        self.logger.verbose(log_msg)
        self.report_event(message=log_msg, duration=duration, log_event=False)
Example #14
    def run(self):
        if not conf.get_provision_enabled():
            logger.info("Provisioning is disabled, skipping.")
            self.write_provisioned()
            self.report_ready()
            return

        try:
            utc_start = datetime.utcnow()
            thumbprint = None

            if self.is_provisioned():
                logger.info("Provisioning already completed, skipping.")
                return

            logger.info("Running default provisioning handler")

            if not self.validate_cloud_init(is_expected=False):
                raise ProvisionError("cloud-init appears to be running, "
                                        "this is not expected, cannot continue")

            logger.info("Copying ovf-env.xml")
            ovf_env = self.protocol_util.copy_ovf_env()

            self.protocol_util.get_protocol(by_file=True)
            self.report_not_ready("Provisioning", "Starting")
            logger.info("Starting provisioning")

            self.provision(ovf_env)

            thumbprint = self.reg_ssh_host_key()
            self.osutil.restart_ssh_service()

            self.write_provisioned()

            self.report_event("Provisioning succeeded ({0}s)".format(self._get_uptime_seconds()),
                is_success=True,
                duration=elapsed_milliseconds(utc_start))

            self.report_event(message=ovf_env.provision_guest_agent,
                              is_success=True,
                              duration=0,
                              operation=WALAEventOperation.ProvisionGuestAgent)

            self.report_ready(thumbprint)
            logger.info("Provisioning complete")

        except (ProtocolError, ProvisionError) as e:
            self.report_not_ready("ProvisioningFailed", ustr(e))
            self.report_event(ustr(e))
            logger.error("Provisioning failed: {0}", ustr(e))
            return
Example #15
    def run(self):
        # If provision is not enabled, report ready and then return
        if not conf.get_provision_enabled():
            logger.info("Provisioning is disabled, skipping.")
            return

        try:
            utc_start = datetime.utcnow()
            thumbprint = None

            # if provisioning is already done, return
            if self.is_provisioned():
                logger.info("Provisioning already completed, skipping.")
                return

            logger.info("Running default provisioning handler")

            if not self.validate_cloud_init(is_expected=False):
                raise ProvisionError("cloud-init appears to be running, "
                                     "this is not expected, cannot continue")

            logger.info("Copying ovf-env.xml")
            ovf_env = self.protocol_util.copy_ovf_env()

            self.protocol_util.get_protocol_by_file()
            self.report_not_ready("Provisioning", "Starting")
            logger.info("Starting provisioning")

            self.provision(ovf_env)

            thumbprint = self.reg_ssh_host_key()
            self.osutil.restart_ssh_service()

            # write out provisioned file and report Ready
            self.write_provisioned()

            self.report_event("Provision succeed",
                              is_success=True,
                              duration=elapsed_milliseconds(utc_start))

            self.report_ready(thumbprint)
            logger.info("Provisioning complete")

        except (ProtocolError, ProvisionError) as e:
            self.report_not_ready("ProvisioningFailed", ustr(e))
            self.report_event(ustr(e))
            logger.error("Provisioning failed: {0}", ustr(e))
            return
Example #16
    def run(self):
        # If provision is not enabled, report ready and then return
        if not conf.get_provision_enabled():
            logger.info("Provisioning is disabled, skipping.")
            return

        try:
            utc_start = datetime.utcnow()
            thumbprint = None

            # if provisioning is already done, return
            if self.is_provisioned():
                logger.info("Provisioning already completed, skipping.")
                return

            logger.info("Running default provisioning handler")

            if not self.validate_cloud_init(is_expected=False):
                raise ProvisionError("cloud-init appears to be running, "
                                        "this is not expected, cannot continue")

            logger.info("Copying ovf-env.xml")
            ovf_env = self.protocol_util.copy_ovf_env()

            self.protocol_util.get_protocol_by_file()
            self.report_not_ready("Provisioning", "Starting")
            logger.info("Starting provisioning")

            self.provision(ovf_env)

            thumbprint = self.reg_ssh_host_key()
            self.osutil.restart_ssh_service()

            # write out provisioned file and report Ready
            self.write_provisioned()

            self.report_event("Provision succeed",
                is_success=True,
                duration=elapsed_milliseconds(utc_start))

            self.report_ready(thumbprint)
            logger.info("Provisioning complete")

        except (ProtocolError, ProvisionError) as e:
            self.report_not_ready("ProvisioningFailed", ustr(e))
            self.report_event(ustr(e))
            logger.error("Provisioning failed: {0}", ustr(e))
            return
Example #17
    def run(self):
        # If provision is enabled, run default provision handler
        if conf.get_provision_enabled():
            logger.warn("Provisioning flag is enabled, which overrides using "
                        "cloud-init; running the default provisioning code")
            super(CloudInitProvisionHandler, self).run()
            return

        try:
            if super(CloudInitProvisionHandler, self).is_provisioned():
                logger.info("Provisioning already completed, skipping.")
                return

            utc_start = datetime.utcnow()
            logger.info("Running CloudInit provisioning handler")
            self.wait_for_ovfenv()
            self.protocol_util.get_protocol()
            self.report_not_ready("Provisioning", "Starting")

            thumbprint = self.wait_for_ssh_host_key()
            self.write_provisioned()
            logger.info("Finished provisioning")

            self.report_ready(thumbprint)
            self.report_event("Provision succeed",
                              is_success=True,
                              duration=elapsed_milliseconds(utc_start))
            self.report_event(self.create_guest_state_telemetry_messsage(),
                              is_success=True,
                              operation=WALAEventOperation.GuestState)

        except ProvisionError as e:
            logger.error("Provisioning failed: {0}", ustr(e))
            self.report_not_ready("ProvisioningFailed", ustr(e))
            self.report_event(ustr(e))
            return
Example #18
    def download(self):
        begin_utc = datetime.datetime.utcnow()
        self.logger.verbose("Download extension package")
        self.set_operation(WALAEventOperation.Download)
        if self.pkg is None:
            raise ExtensionError("No package uri found")

        package = None
        for uri in self.pkg.uris:
            try:
                package = self.protocol.download_ext_handler_pkg(uri.uri)
                if package is not None:
                    break
            except Exception as e:
                logger.warn("Error while downloading extension: {0}", e)

        if package is None:
            raise ExtensionError("Failed to download extension")

        self.logger.verbose("Unpack extension package")
        self.pkg_file = os.path.join(conf.get_lib_dir(),
                                     os.path.basename(uri.uri) + ".zip")
        try:
            fileutil.write_file(self.pkg_file, bytearray(package), asbin=True)
            zipfile.ZipFile(self.pkg_file).extractall(self.get_base_dir())
        except IOError as e:
            fileutil.clean_ioerror(e,
                                   paths=[self.get_base_dir(), self.pkg_file])
            raise ExtensionError(u"Failed to write and unzip plugin", e)

        #Add user execute permission to all files under the base dir
        for file in fileutil.get_all_files(self.get_base_dir()):
            fileutil.chmod(file, os.stat(file).st_mode | stat.S_IXUSR)

        duration = elapsed_milliseconds(begin_utc)
        self.report_event(message="Download succeeded", duration=duration)

        self.logger.info("Initialize extension directory")
        #Save HandlerManifest.json
        man_file = fileutil.search_file(self.get_base_dir(),
                                        'HandlerManifest.json')

        if man_file is None:
            raise ExtensionError("HandlerManifest.json not found")

        try:
            man = fileutil.read_file(man_file, remove_bom=True)
            fileutil.write_file(self.get_manifest_file(), man)
        except IOError as e:
            fileutil.clean_ioerror(e,
                                   paths=[self.get_base_dir(), self.pkg_file])
            raise ExtensionError(u"Failed to save HandlerManifest.json", e)

        #Create status and config dir
        try:
            status_dir = self.get_status_dir()
            fileutil.mkdir(status_dir, mode=0o700)

            seq_no, status_path = self.get_status_file_path()
            if seq_no > -1:
                now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
                status = {
                    "version": 1.0,
                    "timestampUTC": now,
                    "status": {
                        "name": self.ext_handler.name,
                        "operation": "Enabling Handler",
                        "status": "transitioning",
                        "code": 0
                    }
                }
                fileutil.write_file(status_path, json.dumps(status))

            conf_dir = self.get_conf_dir()
            fileutil.mkdir(conf_dir, mode=0o700)

        except IOError as e:
            fileutil.clean_ioerror(e,
                                   paths=[self.get_base_dir(), self.pkg_file])
            raise ExtensionError(u"Failed to create status or config dir", e)

        #Save HandlerEnvironment.json
        self.create_handler_env()
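
The permission fix-up above is a read-modify-write of the current mode bits; outside the agent's `fileutil` wrapper the same operation looks like this (a throwaway temp file is used purely for illustration):

import os
import stat
import tempfile

fd, path = tempfile.mkstemp()          # throwaway file standing in for an extension file
os.close(fd)

mode = os.stat(path).st_mode
os.chmod(path, mode | stat.S_IXUSR)    # keep the existing bits, add user-execute
print(oct(os.stat(path).st_mode & 0o777))
os.remove(path)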
Example #19
    def run(self):
        """
        This is the main loop which watches for agent and extension updates.
        """

        logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT)

        # Launch monitoring threads
        from azurelinuxagent.ga.monitor import get_monitor_handler
        get_monitor_handler().run()

        from azurelinuxagent.ga.env import get_env_handler
        get_env_handler().run()

        from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
        exthandlers_handler = get_exthandlers_handler()
        migrate_handler_state()

        try:
            send_event_time = datetime.utcnow()

            self._ensure_no_orphans()
            self._emit_restart_event()

            while self.running:
                if self._is_orphaned:
                    logger.info("Goal state agent {0} was orphaned -- exiting", CURRENT_AGENT)
                    break

                if self._upgrade_available():
                    if len(self.agents) > 0:
                        logger.info(
                            u"Agent {0} discovered {1} as an update and will exit",
                            CURRENT_AGENT,
                            self.agents[0].name)
                    break

                utc_start = datetime.utcnow()

                last_etag = exthandlers_handler.last_etag
                exthandlers_handler.run()

                log_event = last_etag != exthandlers_handler.last_etag or \
                            (datetime.utcnow() >= send_event_time)
                add_event(
                    AGENT_NAME,
                    version=CURRENT_VERSION,
                    op=WALAEventOperation.ProcessGoalState,
                    is_success=True,
                    duration=elapsed_milliseconds(utc_start),
                    log_event=log_event)
                if log_event:
                    send_event_time += timedelta(minutes=REPORT_STATUS_INTERVAL)

                test_agent = self.get_test_agent()
                if test_agent is not None and test_agent.in_slice:
                    test_agent.enable()
                    logger.info(u"Enabled Agent {0} as test agent", test_agent.name)
                    break

                time.sleep(GOAL_STATE_INTERVAL)

        except Exception as e:
            logger.warn(u"Agent {0} failed with exception: {1}", CURRENT_AGENT, ustr(e))
            logger.warn(traceback.format_exc())
            sys.exit(1)
            return

        self._shutdown()
        sys.exit(0)
        return
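
The `send_event_time` bookkeeping above throttles the `ProcessGoalState` event: it is emitted to telemetry when the etag changes or when the reporting deadline has passed, and the deadline is then pushed forward. The same pattern in isolation (the 30-minute value is an assumption standing in for `REPORT_STATUS_INTERVAL`):

from datetime import datetime, timedelta

REPORT_STATUS_INTERVAL_MINUTES = 30   # assumed placeholder for REPORT_STATUS_INTERVAL

send_event_time = datetime.utcnow()

def should_log_event(etag_changed):
    """Log when the goal state changed, otherwise at most once per reporting interval."""
    global send_event_time
    log_event = etag_changed or datetime.utcnow() >= send_event_time
    if log_event:
        send_event_time += timedelta(minutes=REPORT_STATUS_INTERVAL_MINUTES)
    return log_event

print(should_log_event(etag_changed=False))   # True on the first pass, then throttled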
Example #20
    def run(self, debug=False):
        """
        This is the main loop which watches for agent and extension updates.
        """

        try:
            logger.info(u"Agent {0} is running as the goal state agent",
                        CURRENT_AGENT)

            #
            # Fetch the goal state one time; some components depend on information provided by the goal state and this
            # call ensures the required info is initialized (e.g telemetry depends on the container ID.)
            #
            protocol = self.protocol_util.get_protocol()
            protocol.update_goal_state()

            # Initialize the common parameters for telemetry events
            initialize_event_logger_vminfo_common_parameters(protocol)

            # Log OS-specific info.
            os_info_msg = u"Distro: {dist_name}-{dist_ver}; "\
                u"OSUtil: {util_name}; AgentService: {service_name}; "\
                u"Python: {py_major}.{py_minor}.{py_micro}; "\
                u"systemd: {systemd}; "\
                u"LISDrivers: {lis_ver}; "\
                u"logrotate: {has_logrotate};".format(
                    dist_name=DISTRO_NAME, dist_ver=DISTRO_VERSION,
                    util_name=type(self.osutil).__name__,
                    service_name=self.osutil.service_name,
                    py_major=PY_VERSION_MAJOR, py_minor=PY_VERSION_MINOR,
                    py_micro=PY_VERSION_MICRO, systemd=systemd.is_systemd(),
                    lis_ver=get_lis_version(), has_logrotate=has_logrotate()
            )

            logger.info(os_info_msg)
            add_event(AGENT_NAME,
                      op=WALAEventOperation.OSInfo,
                      message=os_info_msg)

            #
            # Perform initialization tasks
            #
            from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
            exthandlers_handler = get_exthandlers_handler(protocol)
            migrate_handler_state()

            from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
            remote_access_handler = get_remote_access_handler(protocol)

            self._ensure_no_orphans()
            self._emit_restart_event()
            self._emit_changes_in_default_configuration()
            self._ensure_partition_assigned()
            self._ensure_readonly_files()
            self._ensure_cgroups_initialized()
            self._ensure_extension_telemetry_state_configured_properly(
                protocol)
            self._ensure_firewall_rules_persisted(
                dst_ip=protocol.get_endpoint())

            # Get all thread handlers
            telemetry_handler = get_send_telemetry_events_handler(
                self.protocol_util)
            all_thread_handlers = [
                get_monitor_handler(),
                get_env_handler(), telemetry_handler,
                get_collect_telemetry_events_handler(telemetry_handler)
            ]

            if is_log_collection_allowed():
                all_thread_handlers.append(get_collect_logs_handler())

            # Launch all monitoring threads
            for thread_handler in all_thread_handlers:
                thread_handler.run()

            goal_state_interval = conf.get_goal_state_period() \
                if conf.get_extensions_enabled() else GOAL_STATE_INTERVAL_DISABLED

            while self.running:
                #
                # Check that the parent process (the agent's daemon) is still running
                #
                if not debug and self._is_orphaned:
                    logger.info("Agent {0} is an orphan -- exiting",
                                CURRENT_AGENT)
                    break

                #
                # Check that all the threads are still running
                #
                for thread_handler in all_thread_handlers:
                    if not thread_handler.is_alive():
                        logger.warn("{0} thread died, restarting".format(
                            thread_handler.get_thread_name()))
                        thread_handler.start()

                #
                # Process the goal state
                #
                if not protocol.try_update_goal_state():
                    self._heartbeat_update_goal_state_error_count += 1
                else:
                    if self._upgrade_available(protocol):
                        available_agent = self.get_latest_agent()
                        if available_agent is None:
                            logger.info(
                                "Agent {0} is reverting to the installed agent -- exiting",
                                CURRENT_AGENT)
                        else:
                            logger.info(
                                u"Agent {0} discovered update {1} -- exiting",
                                CURRENT_AGENT, available_agent.name)
                        break

                    utc_start = datetime.utcnow()

                    last_etag = exthandlers_handler.last_etag
                    exthandlers_handler.run()

                    remote_access_handler.run()

                    if last_etag != exthandlers_handler.last_etag:
                        self._ensure_readonly_files()
                        duration = elapsed_milliseconds(utc_start)
                        activity_id, correlation_id, gs_creation_time = \
                            exthandlers_handler.get_goal_state_debug_metadata()
                        msg = 'ProcessGoalState completed [Incarnation: {0}; {1} ms; Activity Id: {2}; Correlation Id: {3}; GS Creation Time: {4}]'.format(
                            exthandlers_handler.last_etag, duration,
                            activity_id, correlation_id, gs_creation_time)
                        logger.info(msg)
                        add_event(AGENT_NAME,
                                  op=WALAEventOperation.ProcessGoalState,
                                  duration=duration,
                                  message=msg)

                self._send_heartbeat_telemetry(protocol)
                time.sleep(goal_state_interval)

        except Exception as error:
            msg = u"Agent {0} failed with exception: {1}".format(
                CURRENT_AGENT, ustr(error))
            self._set_sentinel(msg=msg)
            logger.warn(msg)
            logger.warn(traceback.format_exc())
            sys.exit(1)
            # additional return here because sys.exit is mocked in unit tests
            return

        self._shutdown()
        sys.exit(0)
Example #21
    def run(self):
        """
        This is the main loop which watches for agent and extension updates.
        """

        logger.info(u"Agent {0} is running as the goal state agent",
                    CURRENT_AGENT)

        # Launch monitoring threads
        from azurelinuxagent.ga.monitor import get_monitor_handler
        get_monitor_handler().run()

        from azurelinuxagent.ga.env import get_env_handler
        get_env_handler().run()

        from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
        exthandlers_handler = get_exthandlers_handler()
        migrate_handler_state()

        try:
            send_event_time = datetime.utcnow()

            self._ensure_no_orphans()
            self._emit_restart_event()

            while self.running:
                if self._is_orphaned:
                    logger.info("Goal state agent {0} was orphaned -- exiting",
                                CURRENT_AGENT)
                    break

                if self._upgrade_available():
                    if len(self.agents) > 0:
                        logger.info(
                            u"Agent {0} discovered {1} as an update and will exit",
                            CURRENT_AGENT, self.agents[0].name)
                    break

                utc_start = datetime.utcnow()

                last_etag = exthandlers_handler.last_etag
                exthandlers_handler.run()

                log_event = last_etag != exthandlers_handler.last_etag or \
                            (datetime.utcnow() >= send_event_time)
                add_event(AGENT_NAME,
                          version=CURRENT_VERSION,
                          op=WALAEventOperation.ProcessGoalState,
                          is_success=True,
                          duration=elapsed_milliseconds(utc_start),
                          log_event=log_event)
                if log_event:
                    send_event_time += timedelta(
                        minutes=REPORT_STATUS_INTERVAL)

                test_agent = self.get_test_agent()
                if test_agent is not None and test_agent.in_slice:
                    test_agent.enable()
                    logger.info(u"Enabled Agent {0} as test agent",
                                test_agent.name)
                    break

                time.sleep(GOAL_STATE_INTERVAL)

        except Exception as e:
            logger.warn(u"Agent {0} failed with exception: {1}", CURRENT_AGENT,
                        ustr(e))
            logger.warn(traceback.format_exc())
            sys.exit(1)
            return

        self._shutdown()
        sys.exit(0)
        return
Example #22
    def run(self):
        """
        This is the main loop which watches for agent and extension updates.
        """

        logger.info(u"Agent {0} is running as the goal state agent",
                    CURRENT_AGENT)

        # Launch monitoring threads
        from azurelinuxagent.ga.monitor import get_monitor_handler
        get_monitor_handler().run()

        from azurelinuxagent.ga.env import get_env_handler
        get_env_handler().run()

        from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
        exthandlers_handler = get_exthandlers_handler()
        migrate_handler_state()

        try:
            self._ensure_no_orphans()
            self._emit_restart_event()
            self._ensure_partition_assigned()

            while self.running:
                if self._is_orphaned:
                    logger.info("Agent {0} is an orphan -- exiting",
                                CURRENT_AGENT)
                    break

                if self._upgrade_available():
                    available_agent = self.get_latest_agent()
                    if available_agent is None:
                        logger.info(
                            "Agent {0} is reverting to the installed agent -- exiting",
                            CURRENT_AGENT)
                    else:
                        logger.info(
                            u"Agent {0} discovered update {1} -- exiting",
                            CURRENT_AGENT, available_agent.name)
                    break

                utc_start = datetime.utcnow()

                last_etag = exthandlers_handler.last_etag
                exthandlers_handler.run()

                if last_etag != exthandlers_handler.last_etag:
                    add_event(AGENT_NAME,
                              version=CURRENT_VERSION,
                              op=WALAEventOperation.ProcessGoalState,
                              is_success=True,
                              duration=elapsed_milliseconds(utc_start),
                              message="Incarnation {0}".format(
                                  exthandlers_handler.last_etag),
                              log_event=True)

                time.sleep(GOAL_STATE_INTERVAL)

        except Exception as e:
            logger.warn(u"Agent {0} failed with exception: {1}", CURRENT_AGENT,
                        ustr(e))
            logger.warn(traceback.format_exc())
            sys.exit(1)
            # additional return here because sys.exit is mocked in unit tests
            return

        self._shutdown()
        sys.exit(0)
Example #23
    def download(self):
        begin_utc = datetime.datetime.utcnow()
        self.logger.verbose("Download extension package")
        self.set_operation(WALAEventOperation.Download)

        if self.pkg is None:
            raise ExtensionError("No package uri found")
        
        uris_shuffled = self.pkg.uris
        random.shuffle(uris_shuffled)
        file_downloaded = False

        for uri in uris_shuffled:
            try:
                destination = os.path.join(conf.get_lib_dir(), os.path.basename(uri.uri) + ".zip")
                file_downloaded = self.protocol.download_ext_handler_pkg(uri.uri, destination)

                if file_downloaded and os.path.exists(destination):
                    self.pkg_file = destination
                    break

            except Exception as e:
                logger.warn("Error while downloading extension: {0}", ustr(e))
        
        if not file_downloaded:
            raise ExtensionError("Failed to download extension", code=1001)

        self.logger.verbose("Unzip extension package")
        try:
            zipfile.ZipFile(self.pkg_file).extractall(self.get_base_dir())
            os.remove(self.pkg_file)
        except IOError as e:
            fileutil.clean_ioerror(e, paths=[self.get_base_dir(), self.pkg_file])
            raise ExtensionError(u"Failed to unzip extension package", e, code=1001)

        # Add user execute permission to all files under the base dir
        for file in fileutil.get_all_files(self.get_base_dir()):
            fileutil.chmod(file, os.stat(file).st_mode | stat.S_IXUSR)

        duration = elapsed_milliseconds(begin_utc)
        self.report_event(message="Download succeeded", duration=duration)

        self.logger.info("Initialize extension directory")
        # Save HandlerManifest.json
        man_file = fileutil.search_file(self.get_base_dir(), 'HandlerManifest.json')

        if man_file is None:
            raise ExtensionError("HandlerManifest.json not found")
        
        try:
            man = fileutil.read_file(man_file, remove_bom=True)
            fileutil.write_file(self.get_manifest_file(), man)
        except IOError as e:
            fileutil.clean_ioerror(e, paths=[self.get_base_dir(), self.pkg_file])
            raise ExtensionError(u"Failed to save HandlerManifest.json", e)

        # Create status and config dir
        try:
            status_dir = self.get_status_dir()
            fileutil.mkdir(status_dir, mode=0o700)

            seq_no, status_path = self.get_status_file_path()
            if status_path is not None:
                now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
                status = {
                    "version": 1.0,
                    "timestampUTC": now,
                    "status": {
                        "name": self.ext_handler.name,
                        "operation": "Enabling Handler",
                        "status": "transitioning",
                        "code": 0
                    }
                }
                fileutil.write_file(status_path, json.dumps(status))

            conf_dir = self.get_conf_dir()
            fileutil.mkdir(conf_dir, mode=0o700)

        except IOError as e:
            fileutil.clean_ioerror(e, paths=[self.get_base_dir(), self.pkg_file])
            raise ExtensionError(u"Failed to create status or config dir", e)

        # Save HandlerEnvironment.json
        self.create_handler_env()
Example #24
    def run(self):
        """
        This is the main loop which watches for agent and extension updates.
        """

        try:
            logger.info(u"Agent {0} is running as the goal state agent",
                        CURRENT_AGENT)

            # Launch monitoring threads
            from azurelinuxagent.ga.monitor import get_monitor_handler
            monitor_thread = get_monitor_handler()
            monitor_thread.run()

            from azurelinuxagent.ga.env import get_env_handler
            env_thread = get_env_handler()
            env_thread.run()

            from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
            exthandlers_handler = get_exthandlers_handler()
            migrate_handler_state()

            from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
            remote_access_handler = get_remote_access_handler()

            self._ensure_no_orphans()
            self._emit_restart_event()
            self._ensure_partition_assigned()
            self._ensure_readonly_files()

            while self.running:
                if self._is_orphaned:
                    logger.info("Agent {0} is an orphan -- exiting",
                                CURRENT_AGENT)
                    break

                if not monitor_thread.is_alive():
                    logger.warn(u"Monitor thread died, restarting")
                    monitor_thread.start()

                if not env_thread.is_alive():
                    logger.warn(u"Environment thread died, restarting")
                    env_thread.start()

                if self._upgrade_available():
                    available_agent = self.get_latest_agent()
                    if available_agent is None:
                        logger.info(
                            "Agent {0} is reverting to the installed agent -- exiting",
                            CURRENT_AGENT)
                    else:
                        logger.info(
                            u"Agent {0} discovered update {1} -- exiting",
                            CURRENT_AGENT,
                            available_agent.name)
                    break

                utc_start = datetime.utcnow()

                last_etag = exthandlers_handler.last_etag
                exthandlers_handler.run()

                remote_access_handler.run()

                if last_etag != exthandlers_handler.last_etag:
                    self._ensure_readonly_files()
                    duration = elapsed_milliseconds(utc_start)
                    logger.info('ProcessGoalState completed [incarnation {0}; {1} ms]',
                                exthandlers_handler.last_etag,
                                duration)
                    add_event(
                        AGENT_NAME,
                        op=WALAEventOperation.ProcessGoalState,
                        duration=duration,
                        message="Incarnation {0}".format(exthandlers_handler.last_etag))

                time.sleep(GOAL_STATE_INTERVAL)

        except Exception as e:
            msg = u"Agent {0} failed with exception: {1}".format(CURRENT_AGENT, ustr(e))
            self._set_sentinel(msg=msg)
            logger.warn(msg)
            logger.warn(traceback.format_exc())
            sys.exit(1)
            # additional return here because sys.exit is mocked in unit tests
            return

        self._shutdown()
        sys.exit(0)
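
The loop in Example #24 periodically checks is_alive() on the monitor and environment handlers and calls start() again if either has stopped. A minimal, standalone sketch of that watchdog pattern follows; the names are hypothetical and this is not the agent's API.

    import threading
    import time

    def _poll_forever():
        # Placeholder worker; the real handlers poll monitoring/environment state.
        while True:
            time.sleep(1)

    def keep_alive(thread, factory):
        """Return a running thread, creating a fresh one if the old one died."""
        if thread is None or not thread.is_alive():
            thread = factory()
            thread.start()
        return thread

    def _make_worker():
        worker = threading.Thread(target=_poll_forever)
        worker.daemon = True  # do not block interpreter shutdown
        return worker

    # Usage sketch: call keep_alive on every iteration of a supervision loop.
    worker_thread = keep_alive(None, _make_worker)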
Example #25
0
    def test_elapsed_milliseconds(self):
        utc_start = datetime.utcnow() + timedelta(days=1)
        self.assertEqual(0, elapsed_milliseconds(utc_start))
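
The test only makes sense if elapsed_milliseconds() clamps a start time that lies in the future to zero. A hedged re-implementation consistent with that behavior (a sketch, not the agent's actual code):

    from datetime import datetime

    def elapsed_milliseconds_sketch(utc_start):
        # A start time in the future yields 0 rather than a negative duration.
        now = datetime.utcnow()
        if now < utc_start:
            return 0
        delta = now - utc_start
        return int(delta.total_seconds() * 1000)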
Example #26
0
    def run(self, debug=False):
        """
        This is the main loop which watches for agent and extension updates.
        """

        try:
            # NOTE: Do not add any telemetry events until after the monitoring handler has been started with the
            # call to 'monitor_thread.run()'. That method call initializes the protocol, which is needed in order to
            # load the goal state and update the container id in memory. Any telemetry events sent before this happens
            # will result in an uninitialized container id value.

            logger.info(u"Agent {0} is running as the goal state agent",
                        CURRENT_AGENT)

            # Log OS-specific info locally.
            os_info_msg = u"Distro info: {0} {1}, osutil class being used: {2}, " \
                          u"agent service name: {3}".format(DISTRO_NAME, DISTRO_VERSION,
                                                            type(self.osutil).__name__, self.osutil.service_name)
            logger.info(os_info_msg)

            # Launch monitoring threads
            from azurelinuxagent.ga.monitor import get_monitor_handler
            monitor_thread = get_monitor_handler()
            monitor_thread.run()

            # NOTE: Any telemetry events added from this point on will be properly populated with the container id.

            from azurelinuxagent.ga.env import get_env_handler
            env_thread = get_env_handler()
            env_thread.run()

            from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
            exthandlers_handler = get_exthandlers_handler()
            migrate_handler_state()

            from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
            remote_access_handler = get_remote_access_handler()

            self._ensure_no_orphans()
            self._emit_restart_event()
            self._ensure_partition_assigned()
            self._ensure_readonly_files()
            self._ensure_cgroups_initialized()

            # Send OS-specific info as a telemetry event after the monitoring thread has been initialized, and with
            # it the container id too.
            add_event(AGENT_NAME,
                      op=WALAEventOperation.OSInfo,
                      message=os_info_msg)

            goal_state_interval = GOAL_STATE_INTERVAL \
                if conf.get_extensions_enabled() \
                else GOAL_STATE_INTERVAL_DISABLED

            while self.running:
                if not debug and self._is_orphaned:
                    logger.info("Agent {0} is an orphan -- exiting",
                                CURRENT_AGENT)
                    break

                if not monitor_thread.is_alive():
                    logger.warn(u"Monitor thread died, restarting")
                    monitor_thread.start()

                if not env_thread.is_alive():
                    logger.warn(u"Environment thread died, restarting")
                    env_thread.start()

                if self._upgrade_available():
                    available_agent = self.get_latest_agent()
                    if available_agent is None:
                        logger.info(
                            "Agent {0} is reverting to the installed agent -- exiting",
                            CURRENT_AGENT)
                    else:
                        logger.info(
                            u"Agent {0} discovered update {1} -- exiting",
                            CURRENT_AGENT, available_agent.name)
                    break

                utc_start = datetime.utcnow()

                last_etag = exthandlers_handler.last_etag
                exthandlers_handler.run()

                remote_access_handler.run()

                if last_etag != exthandlers_handler.last_etag:
                    self._ensure_readonly_files()
                    duration = elapsed_milliseconds(utc_start)
                    logger.info(
                        'ProcessGoalState completed [incarnation {0}; {1} ms]',
                        exthandlers_handler.last_etag, duration)
                    add_event(AGENT_NAME,
                              op=WALAEventOperation.ProcessGoalState,
                              duration=duration,
                              message="Incarnation {0}".format(
                                  exthandlers_handler.last_etag))

                time.sleep(goal_state_interval)

        except Exception as e:
            msg = u"Agent {0} failed with exception: {1}".format(
                CURRENT_AGENT, ustr(e))
            self._set_sentinel(msg=msg)
            logger.warn(msg)
            logger.warn(traceback.format_exc())
            sys.exit(1)
            # additional return here because sys.exit is mocked in unit tests
            return

        self._shutdown()
        sys.exit(0)
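
Compared with Example #24, this variant backs off to a slower polling interval when extensions are disabled. A small sketch of that selection, using placeholder values rather than the agent's real constants:

    GOAL_STATE_POLL_SECONDS = 6             # placeholder value
    GOAL_STATE_POLL_DISABLED_SECONDS = 300  # placeholder value

    def select_goal_state_interval(extensions_enabled):
        """Poll frequently while extensions are enabled, back off otherwise."""
        if extensions_enabled:
            return GOAL_STATE_POLL_SECONDS
        return GOAL_STATE_POLL_DISABLED_SECONDS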
Example #28
0
    def run(self):
        """
        This is the main loop which watches for agent and extension updates.
        """

        try:
            logger.info(u"Agent {0} is running as the goal state agent",
                        CURRENT_AGENT)

            # Launch monitoring threads
            from azurelinuxagent.ga.monitor import get_monitor_handler
            monitor_thread = get_monitor_handler()
            monitor_thread.run()

            from azurelinuxagent.ga.env import get_env_handler
            env_thread = get_env_handler()
            env_thread.run()

            from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
            exthandlers_handler = get_exthandlers_handler()
            migrate_handler_state()

            from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
            remote_access_handler = get_remote_access_handler()

            self._ensure_no_orphans()
            self._emit_restart_event()
            self._ensure_partition_assigned()
            self._ensure_readonly_files()

            while self.running:
                if self._is_orphaned:
                    logger.info("Agent {0} is an orphan -- exiting",
                                CURRENT_AGENT)
                    break

                if not monitor_thread.is_alive():
                    logger.warn(u"Monitor thread died, restarting")
                    monitor_thread.start()

                if not env_thread.is_alive():
                    logger.warn(u"Environment thread died, restarting")
                    env_thread.start()

                if self._upgrade_available():
                    available_agent = self.get_latest_agent()
                    if available_agent is None:
                        logger.info(
                            "Agent {0} is reverting to the installed agent -- exiting",
                            CURRENT_AGENT)
                    else:
                        logger.info(
                            u"Agent {0} discovered update {1} -- exiting",
                            CURRENT_AGENT,
                            available_agent.name)
                    break

                utc_start = datetime.utcnow()

                last_etag = exthandlers_handler.last_etag
                exthandlers_handler.run()

                remote_access_handler.run()

                if last_etag != exthandlers_handler.last_etag:
                    self._ensure_readonly_files()
                    add_event(
                        AGENT_NAME,
                        version=CURRENT_VERSION,
                        op=WALAEventOperation.ProcessGoalState,
                        is_success=True,
                        duration=elapsed_milliseconds(utc_start),
                        message="Incarnation {0}".format(
                                            exthandlers_handler.last_etag),
                        log_event=True)

                time.sleep(GOAL_STATE_INTERVAL)

        except Exception as e:
            msg = u"Agent {0} failed with exception: {1}".format(
                CURRENT_AGENT, ustr(e))
            self._set_sentinel(msg=msg)
            logger.warn(msg)
            logger.warn(traceback.format_exc())
            sys.exit(1)
            # additional return here because sys.exit is mocked in unit tests
            return

        self._shutdown()
        sys.exit(0)
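
Example #28 reports ProcessGoalState telemetry inline whenever the incarnation (etag) changes. The same check and report could be factored into a helper such as the sketch below, which assumes the usual azurelinuxagent import locations for add_event and the version constants.

    from azurelinuxagent.common.event import add_event, WALAEventOperation
    from azurelinuxagent.common.version import AGENT_NAME, CURRENT_VERSION

    def report_goal_state_processed(last_etag, current_etag, duration_ms):
        """Illustrative helper, not part of the agent: emit ProcessGoalState
        telemetry only when the incarnation actually changed."""
        if last_etag == current_etag:
            return
        add_event(AGENT_NAME,
                  version=CURRENT_VERSION,
                  op=WALAEventOperation.ProcessGoalState,
                  is_success=True,
                  duration=duration_ms,
                  message="Incarnation {0}".format(current_etag),
                  log_event=True)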
Example #29
0
    def run(self, debug=False):
        """
        This is the main loop which watches for agent and extension updates.
        """

        try:
            logger.info(u"Agent {0} is running as the goal state agent",
                        CURRENT_AGENT)

            #
            # Fetch the goal state one time; some components depend on information provided by the goal state and this
            # call ensures the required info is initialized (e.g telemetry depends on the container ID.)
            #
            protocol = self.protocol_util.get_protocol()
            protocol.update_goal_state()

            initialize_event_logger_vminfo_common_parameters(protocol)

            # Log OS-specific info.
            os_info_msg = u"Distro: {0}-{1}; OSUtil: {2}; AgentService: {3}; Python: {4}.{5}.{6}".format(
                DISTRO_NAME, DISTRO_VERSION,
                type(self.osutil).__name__, self.osutil.service_name,
                PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO)
            logger.info(os_info_msg)
            add_event(AGENT_NAME,
                      op=WALAEventOperation.OSInfo,
                      message=os_info_msg)

            # Launch monitoring threads
            from azurelinuxagent.ga.monitor import get_monitor_handler
            monitor_thread = get_monitor_handler()
            monitor_thread.run()

            from azurelinuxagent.ga.env import get_env_handler
            env_thread = get_env_handler()
            env_thread.run()

            from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
            exthandlers_handler = get_exthandlers_handler(protocol)
            migrate_handler_state()

            from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
            remote_access_handler = get_remote_access_handler(protocol)

            self._ensure_no_orphans()
            self._emit_restart_event()
            self._emit_changes_in_default_configuration()
            self._ensure_partition_assigned()
            self._ensure_readonly_files()
            self._ensure_cgroups_initialized()

            goal_state_interval = conf.get_goal_state_period() \
                if conf.get_extensions_enabled() \
                else GOAL_STATE_INTERVAL_DISABLED

            while self.running:
                #
                # Check that the parent process (the agent's daemon) is still running
                #
                if not debug and self._is_orphaned:
                    logger.info("Agent {0} is an orphan -- exiting",
                                CURRENT_AGENT)
                    break

                #
                # Check that all the threads are still running
                #
                if not monitor_thread.is_alive():
                    logger.warn(u"Monitor thread died, restarting")
                    monitor_thread.start()

                if not env_thread.is_alive():
                    logger.warn(u"Environment thread died, restarting")
                    env_thread.start()

                #
                # Process the goal state
                #
                if not protocol.try_update_goal_state():
                    self._heartbeat_update_goal_state_error_count += 1
                else:
                    if self._upgrade_available(protocol):
                        available_agent = self.get_latest_agent()
                        if available_agent is None:
                            logger.info(
                                "Agent {0} is reverting to the installed agent -- exiting",
                                CURRENT_AGENT)
                        else:
                            logger.info(
                                u"Agent {0} discovered update {1} -- exiting",
                                CURRENT_AGENT, available_agent.name)
                        break

                    utc_start = datetime.utcnow()

                    last_etag = exthandlers_handler.last_etag
                    exthandlers_handler.run()

                    remote_access_handler.run()

                    if last_etag != exthandlers_handler.last_etag:
                        self._ensure_readonly_files()
                        duration = elapsed_milliseconds(utc_start)
                        logger.info(
                            'ProcessGoalState completed [incarnation {0}; {1} ms]',
                            exthandlers_handler.last_etag, duration)
                        add_event(AGENT_NAME,
                                  op=WALAEventOperation.ProcessGoalState,
                                  duration=duration,
                                  message="Incarnation {0}".format(
                                      exthandlers_handler.last_etag))

                self._send_heartbeat_telemetry(protocol)
                time.sleep(goal_state_interval)

        except Exception as e:
            msg = u"Agent {0} failed with exception: {1}".format(
                CURRENT_AGENT, ustr(e))
            self._set_sentinel(msg=msg)
            logger.warn(msg)
            logger.warn(traceback.format_exc())
            sys.exit(1)
            # additional return here because sys.exit is mocked in unit tests
            return

        self._shutdown()
        sys.exit(0)
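
Unlike the earlier variants, Example #29 refreshes the goal state explicitly on every iteration and counts failed refreshes for heartbeat telemetry instead of raising. A simplified sketch of that refresh-then-process shape (names other than try_update_goal_state and run are illustrative, and the upgrade and remote-access steps are omitted):

    def process_goal_state_once(protocol, exthandlers_handler, failure_count):
        """Run one iteration of the goal-state loop: skip extension processing
        when the refresh fails and record the failure for the heartbeat."""
        if not protocol.try_update_goal_state():
            return failure_count + 1
        exthandlers_handler.run()
        return failure_count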