Example #1
0
    def get_single_system_state(self):
        """Get the current homogeneous DAOS system state.

        Raises:
            ServerFailed: if a single state for all servers is not detected

        Returns:
            str: the current DAOS system state

        """
        data = self.get_current_state()
        if not data:
            # The regex failed to get the rank and state
            raise ServerFailed("Error obtaining {} output: {}".format(
                self.dmg, data))
        try:
            # Set comprehension collapses duplicate per-rank states
            states = list({data[rank]["state"] for rank in data})
        except KeyError as error:
            raise ServerFailed(
                "Unexpected result from {} - missing 'state' key: {}".format(
                    self.dmg, data)) from error
        if len(states) > 1:
            # Multiple states for different ranks detected
            raise ServerFailed(
                "Multiple system states ({}) detected:\n  {}".format(
                    states, data))
        return states[0]
Example #2
0
File: server_utils.py  Project: liw/daos
    def stop_random_rank(self, daos_log, force=False, exclude_ranks=None):
        """Kill/Stop a random server rank that is expected to be running.

        Args:
            daos_log (DaosLog): object for logging messages
            force (bool, optional): whether to use --force option to dmg system
                stop. Defaults to False.
            exclude_ranks (list, optional): ranks to exclude from the random selection.
                Default is None.

        Raises:
            avocado.core.exceptions.TestFail: if there is an issue stopping the server ranks.
            ServerFailed: if there are no available ranks to stop.

        """
        # Only consider ranks whose expected state matches a "running" state
        rank_state = self.get_expected_states()
        candidate_ranks = []
        for rank, state in rank_state.items():
            for running_state in self._states["running"]:
                if running_state in state:
                    candidate_ranks.append(rank)
                    # Stop at the first match; the original 'continue' was a
                    # no-op and could append the same rank more than once
                    break

        # Exclude specified ranks
        for rank in exclude_ranks or []:
            if rank in candidate_ranks:
                candidate_ranks.remove(rank)

        if not candidate_ranks:
            raise ServerFailed("No available candidate ranks to stop.")

        # Stop a random rank
        random_rank = random.choice(candidate_ranks)  # nosec
        return self.stop_ranks([random_rank], daos_log=daos_log, force=force)
Example #3
0
File: server_utils.py  Project: liw/daos
    def check_rank_state(self, rank, valid_state, max_checks=1):
        """Check the state of a single rank in the DAOS system.

        Args:
            rank (int): daos rank whose state needs to be checked
            valid_state (str): expected state for the rank
            max_checks (int, optional): number of times to check the state.
                Defaults to 1.

        Raises:
            ServerFailed: if there was an error obtaining the data for the
                daos system query

        Returns:
            bool: True if there is a match for the checked state, else False.

        """
        for check in range(max_checks):
            if check > 0:
                # Pause between re-checks to give the rank time to settle
                time.sleep(1)
            data = self.get_current_state()
            if not data:
                # The regex failed to get the rank and state
                raise ServerFailed("Error obtaining {} output: {}".format(self.dmg, data))
            if data[rank]["state"] == valid_state:
                return True

        return False
Example #4
0
File: server_utils.py  Project: liw/daos
    def get_environment_value(self, name):
        """Get the server config value associated with the env variable name.

        Args:
            name (str): environment variable name for which to get a daos_server
                configuration value

        Raises:
            ServerFailed: Unable to find a daos_server configuration value for
                the specified environment variable name

        Returns:
            str: the daos_server configuration value for the specified
                environment variable name

        """
        try:
            setting = self.ENVIRONMENT_VARIABLE_MAPPING[name]

        # A dict lookup raises KeyError, not IndexError; catching IndexError
        # meant an unknown name escaped as a raw KeyError instead of the
        # intended ServerFailed
        except KeyError as error:
            raise ServerFailed(
                "Unknown server config setting mapping for the {} environment "
                "variable!".format(name)) from error

        return self.get_config_value(setting)
Example #5
0
File: server_utils.py  Project: liw/daos
    def stop(self):
        """Stop the server through the runner."""
        self.log.info("<SERVER> Stopping server %s command", self.manager.command)

        # Collect every failure so all stop actions run before reporting
        errors = []

        # Stop the subprocess running the job manager command
        try:
            super().stop()
        except CommandFailure as error:
            errors.append(
                "Error stopping the {} subprocess: {}".format(self.manager.command, error))

        # Kill any leftover processes that may not have been stopped correctly
        self.manager.kill()

        if self.manager.job.using_nvme:
            # Reset the storage
            try:
                self.reset_storage()
            except ServerFailed as error:
                errors.append(str(error))

            # Make sure the mount directory belongs to non-root user
            self.set_scm_mount_ownership()

        # Report any errors after all stop actions have been attempted
        if errors:
            raise ServerFailed("Failed to stop servers:\n  {}".format("\n  ".join(errors)))
Example #6
0
File: server_utils.py  Project: liw/daos
    def detect_engine_start(self, hosts_qty=None):
        """Detect when all the engines have started.

        Args:
            hosts_qty (int): number of servers expected to have been started.

        Raises:
            ServerFailed: if there was an error starting the servers after formatting.

        """
        # Default to expecting every configured host
        expected = len(self._hosts) if hosts_qty is None else hosts_qty

        if self.detect_start_via_dmg:
            self.log.info("<SERVER> Waiting for the daos_engine to start via dmg system query")
            self.manager.job.update_pattern("dmg", expected)
            detected = self.get_detected_engine_count(self.manager.process)
        else:
            self.log.info("<SERVER> Waiting for the daos_engine to start")
            self.manager.job.update_pattern("normal", expected)
            detected = self.manager.check_subprocess_status(self.manager.process)

        if not detected:
            # Clean up any partially started processes before failing
            self.manager.kill()
            raise ServerFailed("Failed to start servers after format")

        # Update the dmg command host list to work with pool create/destroy
        self._prepare_dmg_hostlist()

        # Define the expected states for each rank
        self._expected_states = self.get_current_state()
Example #7
0
File: server_utils.py  Project: liw/daos
    def prepare_storage(self, user, using_dcpm=None, using_nvme=None):
        """Prepare the server storage.

        Args:
            user (str): username
            using_dcpm (bool, optional): override option to prepare scm storage.
                Defaults to None, which uses the configuration file to determine
                if scm storage should be formatted.
            using_nvme (bool, optional): override option to prepare nvme
                storage. Defaults to None, which uses the configuration file to
                determine if nvme storage should be formatted.

        Raises:
            ServerFailed: if there was an error preparing the storage

        """
        command = DaosServerCommand(self.manager.job.command_path)
        command.sudo = False
        command.debug.value = False
        command.set_sub_command("storage")
        command.sub_command_class.set_sub_command("prepare")
        command.sub_command_class.sub_command_class.target_user.value = user
        command.sub_command_class.sub_command_class.force.value = True

        # Fall back to the configuration file settings when not overridden
        if using_dcpm is None:
            using_dcpm = self.manager.job.using_dcpm
        if using_nvme is None:
            using_nvme = self.manager.job.using_nvme

        if using_dcpm and not using_nvme:
            command.sub_command_class.sub_command_class.scm_only.value = True
        elif using_nvme and not using_dcpm:
            command.sub_command_class.sub_command_class.nvme_only.value = True

        self.log.info("Preparing DAOS server storage: %s", str(command))
        results = run_pcmd(self._hosts, str(command), timeout=self.storage_prepare_timeout.value)

        # Group hosts by exit status and collect all stdout
        # (gratuitously lifted from pcmd() and get_current_state())
        hosts_by_status = {}
        stdouts = ""
        for res in results:
            stdouts += '\n'.join(res["stdout"] + [''])
            hosts_by_status.setdefault(res["exit_status"], NodeSet()).add(res["hosts"])

        # Fail on any non-zero exit status, mixed statuses, or a skipped SCM prepare
        scm_skipped = using_dcpm and "No SCM modules detected; skipping operation" in stdouts
        if len(hosts_by_status) > 1 or 0 not in hosts_by_status or scm_skipped:
            if using_dcpm and using_nvme:
                dev_type = "dcpm & nvme"
            elif using_dcpm:
                dev_type = "dcpm"
            else:
                dev_type = "nvme"
            pcmd(self._hosts, "sudo -n ipmctl show -v -dimm")
            pcmd(self._hosts, "ndctl list ")
            raise ServerFailed("Error preparing {} storage".format(dev_type))
Example #8
0
File: server_utils.py  Project: liw/daos
    def system_start(self):
        """Start the DAOS I/O Engines.

        Raises:
            ServerFailed: if there was an error starting the servers

        """
        self.log.info("Starting DAOS I/O Engines")
        # ("stopped") is just a parenthesized string, not a tuple; pass a real
        # sequence so check_system_state() does list membership rather than
        # substring matching against the string "stopped"
        self.check_system_state(["stopped"])
        self.dmg.system_start()
        if self.dmg.result.exit_status != 0:
            raise ServerFailed("Error starting DAOS:\n{}".format(self.dmg.result))
Example #9
0
File: server_utils.py  Project: liw/daos
    def system_stop(self, extra_states=None):
        """Stop the DAOS I/O Engines.

        Args:
            extra_states (list, optional): a list of DAOS system states in
                addition to "started" and "joined" that are verified prior to
                issuing the stop. Defaults to None.

        Raises:
            ServerFailed: if there was an error stopping the servers

        """
        # Always accept the base states plus any caller-supplied extras
        valid_states = ["started", "joined"] + list(extra_states or [])
        self.log.info("Stopping DAOS I/O Engines")
        self.check_system_state(valid_states)
        self.dmg.system_stop(force=True)
        if self.dmg.result.exit_status != 0:
            raise ServerFailed("Error stopping DAOS:\n{}".format(self.dmg.result))
Example #10
0
File: server_utils.py  Project: liw/daos
    def reset_storage(self):
        """Reset the server storage.

        Raises:
            ServerFailed: if there was an error resetting the storage

        """
        command = DaosServerCommand(self.manager.job.command_path)
        command.sudo = False
        command.debug.value = False
        command.set_sub_command("storage")
        command.sub_command_class.set_sub_command("prepare")
        prepare = command.sub_command_class.sub_command_class
        prepare.nvme_only.value = True
        prepare.reset.value = True
        prepare.force.value = True

        self.log.info("Resetting DAOS server storage: %s", str(command))
        results = pcmd(self._hosts, str(command), timeout=120)
        # A single entry keyed by exit status 0 means every host succeeded
        if len(results) > 1 or 0 not in results:
            raise ServerFailed("Error resetting NVMe storage")
Example #11
0
File: server_utils.py  Project: liw/daos
    def detect_format_ready(self, reformat=False):
        """Detect when all the daos_servers are ready for storage format.

        Args:
            reformat (bool, optional): whether or detect reformat (True) or
                format (False) messages. Defaults to False.

        Raises:
            ServerFailed: if there was an error starting the servers.

        """
        f_type = "reformat" if reformat else "format"
        self.log.info("<SERVER> Waiting for servers to be ready for %s", f_type)
        self.manager.job.update_pattern(f_type, len(self._hosts))
        try:
            self.manager.run()
        except CommandFailure as error:
            # Clean up any partially started processes before failing
            self.manager.kill()
            raise ServerFailed(
                "Failed to start servers before format: {}".format(error)) from error
Example #12
0
    def check_system_state(self, valid_states, max_checks=1):
        """Check that the DAOS system state is one of the provided states.

        Fail the test if the current state does not match one of the specified
        valid states.  Optionally the state check can loop multiple times,
        sleeping one second between checks, by increasing the number of maximum
        checks.

        Args:
            valid_states (list): expected DAOS system states as a list of
                lowercase strings
            max_checks (int, optional): number of times to check the state.
                Defaults to 1.

        Raises:
            ServerFailed: if there was an error detecting the server state or
                the detected state did not match one of the valid states

        Returns:
            str: the matching valid detected state

        """
        checks = 0
        daos_state = "????"
        while daos_state not in valid_states and checks < max_checks:
            if checks > 0:
                time.sleep(1)
            # A ServerFailed from get_single_system_state() propagates as-is;
            # the previous try/except that immediately re-raised it was a no-op
            daos_state = self.get_single_system_state().lower()
            checks += 1
            self.log.info("System state check (%s): %s", checks, daos_state)
        if daos_state not in valid_states:
            raise ServerFailed(
                "Error checking DAOS state, currently neither {} after "
                "{} state check(s)!".format(valid_states, checks))
        return daos_state
Example #13
0
    def reset_storage(self):
        """Reset the server storage.

        Raises:
            ServerFailed: if there was an error resetting the storage

        """
        cmd = DaosServerCommand(self.manager.job.command_path)
        cmd.sudo = False
        cmd.debug.value = False
        cmd.set_sub_command("storage")
        cmd.sub_command_class.set_sub_command("prepare")
        cmd.sub_command_class.sub_command_class.nvme_only.value = True
        cmd.sub_command_class.sub_command_class.reset.value = True
        cmd.sub_command_class.sub_command_class.force.value = True

        # Use VMD option when resetting storage if it's prepared with VMD.
        # Use .get() so an unset DAOS_ENABLE_VMD does not raise KeyError.
        if "True" in os.environ.get("DAOS_ENABLE_VMD", ""):
            cmd.sub_command_class.sub_command_class.enable_vmd.value = True

        self.log.info("Resetting DAOS server storage: %s", str(cmd))
        result = pcmd(self._hosts, str(cmd), timeout=120)
        if len(result) > 1 or 0 not in result:
            raise ServerFailed("Error resetting NVMe storage")
Example #14
0
File: server_utils.py  Project: liw/daos
    def autosize_pool_params(self, size, tier_ratio, scm_size, nvme_size,
                             min_targets=1, quantity=1):
        """Update any pool size parameter ending in a %.

        Use the current NVMe and SCM storage sizes to assign values to the size,
        scm_size, and or nvme_size dmg pool create arguments which end in "%".
        The numerical part of these arguments will be used to assign a value
        that is X% of the available storage capacity.  The updated size and
        nvme_size arguments will be assigned values that are multiples of 1GiB
        times the number of targets assigned to each server engine.  If needed
        the number of targets will be reduced (to not exceed min_targets) in
        order to support the requested size.  An optional number of expected
        pools (quantity) can also be specified to divide the available storage
        capacity.

        Note: depending upon the inputs this method may return dmg pool create
            parameter combinations that are not supported, e.g. tier_ratio +
            nvme_size.  This is intended to allow testing of these combinations.

        Args:
            size (object): the str, int, or None value for the dmg pool create size parameter.
            tier_ratio (object): the int or None value for the dmg pool create size parameter.
            scm_size (object): the str, int, or None value for the dmg pool
                create scm_size parameter.
            nvme_size (object): the str, int, or None value for the dmg pool
                create nvme_size parameter.
            min_targets (int, optional): the minimum number of targets per
                engine that can be configured. Defaults to 1.
            quantity (int, optional): Number of pools to account for in the size
                calculations. The pool size returned is only for a single pool.
                Defaults to 1.

        Raises:
            ServerFailed: if there was a error obtaining auto-sized TestPool parameters.
            AutosizeCancel: if a valid pool parameter size could not be obtained

        Returns:
            dict: the parameters for a TestPool object.

        """
        # Adjust any pool size parameter by the requested percentage
        params = {"tier_ratio": tier_ratio}
        adjusted = {"size": size, "scm_size": scm_size, "nvme_size": nvme_size}
        keys = [
            key for key in ("size", "scm_size", "nvme_size")
            if adjusted[key] is not None and str(adjusted[key]).endswith("%")]
        if keys:
            # Verify the minimum number of targets configured per engine
            targets = min(self.manager.job.get_engine_values("targets"))
            if targets < min_targets:
                raise ServerFailed(
                    "Minimum target quantity ({}) exceeds current target "
                    "quantity ({})".format(min_targets, targets))

            self.log.info("-" * 100)
            pool_msg = "{} pool{}".format(quantity, "s" if quantity > 1 else "")
            self.log.info(
                "Autosizing TestPool parameters ending with a \"%%\" for %s:",
                pool_msg)
            for key in ("size", "scm_size", "nvme_size"):
                self.log.info("  - %-9s : %s (%s)", key, adjusted[key], key in keys)

            # Determine the largest SCM and NVMe pool sizes can be used with
            # this server configuration with an optionally applied ratio.
            try:
                available_storage = self.get_available_storage()
            except ServerFailed as error:
                raise ServerFailed("Error obtaining available storage") from error

            # Determine the SCM and NVMe size limits for the size and tier_ratio
            # arguments for the total number of engines
            if tier_ratio is None:
                # Use the default value if not provided
                tier_ratio = 6
            engine_qty = len(self.manager.job.engine_params) * len(self._hosts)
            available_storage["size"] = min(
                engine_qty * available_storage["nvme"],
                (engine_qty * available_storage["scm"]) / float(tier_ratio / 100)
            )
            available_storage["tier_ratio"] = available_storage["size"] * float(tier_ratio / 100)
            self.log.info(
                "Largest storage size available for %s engines with a %.2f%% "
                "tier_ratio:", engine_qty, tier_ratio)
            self.log.info(
                "  - NVME     : %s",
                get_display_size(available_storage["size"]))
            self.log.info(
                "  - SCM      : %s",
                get_display_size(available_storage["tier_ratio"]))
            self.log.info(
                "  - COMBINED : %s",
                get_display_size(available_storage["size"] + available_storage["tier_ratio"]))

            # Apply any requested percentages to the pool parameters
            available = {
                "size": {"size": available_storage["size"], "type": "NVMe"},
                "scm_size": {"size": available_storage["scm"], "type": "SCM"},
                "nvme_size": {"size": available_storage["nvme"], "type": "NVMe"}
            }
            self.log.info("Adjusted pool sizes for %s:", pool_msg)
            for key in keys:
                try:
                    ratio = int(str(adjusted[key]).replace("%", ""))
                # int() raises ValueError for a malformed number; the previous
                # 'except NameError' could never trigger, so a bad percentage
                # escaped as an unhandled ValueError instead of ServerFailed
                except ValueError as error:
                    raise ServerFailed(
                        "Invalid '{}' format: {}".format(key, adjusted[key])) from error
                adjusted[key] = (available[key]["size"] * float(ratio / 100)) / quantity
                self.log.info(
                    "  - %-9s : %-4s storage adjusted by %.2f%%: %s",
                    key, available[key]["type"], ratio,
                    get_display_size(adjusted[key]))

            # Display the pool size increment value for each size argument
            increment = {
                "size": human_to_bytes("1GiB"),
                "scm_size": human_to_bytes("16MiB"),
                "nvme_size": human_to_bytes("1GiB")}
            self.log.info("Increment sizes per target:")
            for key in keys:
                self.log.info("  - %-9s : %s", key, get_display_size(increment[key]))

            # Adjust the size to use a SCM/NVMe target multiplier
            self.log.info("Pool sizes adjusted to fit by increment sizes:")
            adjusted_targets = targets
            for key in keys:
                multiplier = math.floor(adjusted[key] / increment[key])
                params[key] = multiplier * increment[key]
                self.log.info(
                    "  - %-9s : %s * %s = %s",
                    key, multiplier, increment[key],
                    get_display_size(params[key]))
                if multiplier < adjusted_targets:
                    adjusted_targets = multiplier
                    if adjusted_targets < min_targets:
                        raise AutosizeCancel(
                            "Unable to autosize the {} pool parameter due to "
                            "exceeding the minimum of {} targets: {}".format(
                                key, min_targets, adjusted_targets))
                if key == "size":
                    tier_ratio_size = params[key] * float(tier_ratio / 100)
                    self.log.info(
                        "  - %-9s : %.2f%% tier_ratio = %s",
                        key, tier_ratio, get_display_size(tier_ratio_size))
                    params[key] += tier_ratio_size
                    self.log.info(
                        "  - %-9s : NVMe + SCM = %s",
                        key, get_display_size(params[key]))
                params[key] = bytes_to_human(params[key], binary=True)

            # Reboot the servers if a reduced number of targets is required
            if adjusted_targets < targets:
                self.log.info(
                    "Updating targets per server engine: %s -> %s",
                    targets, adjusted_targets)
                self.set_config_value("targets", adjusted_targets)
                self.stop()
                self.start()

            self.log.info("-" * 100)

        return params