class DaosServerManager(SubprocessManager): """Manages the daos_server execution on one or more hosts.""" # Mapping of environment variable names to daos_server config param names ENVIRONMENT_VARIABLE_MAPPING = { "CRT_PHY_ADDR_STR": "provider", "OFI_INTERFACE": "fabric_iface", "OFI_PORT": "fabric_iface_port", } def __init__(self, server_command, manager="Orterun", dmg_cfg=None): """Initialize a DaosServerManager object. Args: server_command (ServerCommand): server command object manager (str, optional): the name of the JobManager class used to manage the YamlCommand defined through the "job" attribute. Defaults to "OpenMpi". dmg_cfg (DmgYamlParameters, optional): The dmg configuration file parameters used to connect to this group of servers. """ super(DaosServerManager, self).__init__(server_command, manager) self.manager.job.sub_command_override = "start" # Dmg command to access this group of servers which will be configured # to access the daos_servers when they are started self.dmg = DmgCommand(self.manager.job.command_path, dmg_cfg) def get_params(self, test): """Get values for all of the command params from the yaml file. Use the yaml file parameter values to assign the server command and orterun command parameters. Args: test (Test): avocado Test object """ super(DaosServerManager, self).get_params(test) # Get the values for the dmg parameters self.dmg.get_params(test) def prepare(self, storage=True): """Prepare to start daos_server. Args: storage (bool, optional): whether or not to prepare dspm/nvme storage. Defaults to True. """ self.log.info("<SERVER> Preparing to start daos_server on %s with %s", self._hosts, self.manager.command) # Create the daos_server yaml file self.manager.job.create_yaml_file() # Copy certificates self.manager.job.copy_certificates(get_log_file("daosCA/certs"), self._hosts) local_host = socket.gethostname().split('.', 1)[0] self.dmg.copy_certificates(get_log_file("daosCA/certs"), local_host.split()) # Prepare dmg for running storage format on all server hosts self.dmg.hostlist = self._hosts if not self.dmg.yaml: # If using a dmg config file, transport security was # already configured. self.dmg.insecure.update(self.get_config_value("allow_insecure"), "dmg.insecure") # Kill any daos servers running on the hosts self.kill() # Clean up any files that exist on the hosts self.clean_files() # Make sure log file has been created for ownership change if self.manager.job.using_nvme: cmd_list = [] for server_params in self.manager.job.yaml.server_params: log_file = server_params.log_file.value if log_file is not None: self.log.info("Creating log file: %s", log_file) cmd_list.append("touch {}".format(log_file)) if cmd_list: pcmd(self._hosts, "; ".join(cmd_list), False) if storage: # Prepare server storage if self.manager.job.using_nvme or self.manager.job.using_dcpm: self.log.info("Preparing storage in <format> mode") self.prepare_storage("root") if hasattr(self.manager, "mca"): self.manager.mca.update({"plm_rsh_args": "-l root"}, "orterun.mca", True) def clean_files(self, verbose=True): """Clean up the daos server files. Args: verbose (bool, optional): display clean commands. Defaults to True. """ clean_cmds = [] for server_params in self.manager.job.yaml.server_params: scm_mount = server_params.get_value("scm_mount") self.log.info("Cleaning up the %s directory.", str(scm_mount)) # Remove the superblocks cmd = "sudo rm -fr {}/*".format(scm_mount) if cmd not in clean_cmds: clean_cmds.append(cmd) # Dismount the scm mount point cmd = "while sudo umount {}; do continue; done".format(scm_mount) if cmd not in clean_cmds: clean_cmds.append(cmd) if self.manager.job.using_dcpm: scm_list = server_params.get_value("scm_list") if isinstance(scm_list, list): self.log.info("Cleaning up the following device(s): %s.", ", ".join(scm_list)) # Umount and wipefs the dcpm device cmd_list = [ "for dev in {}".format(" ".join(scm_list)), "do mount=$(lsblk $dev -n -o MOUNTPOINT)", "if [ ! -z $mount ]", "then while sudo umount $mount", "do continue", "done", "fi", "sudo wipefs -a $dev", "done" ] cmd = "; ".join(cmd_list) if cmd not in clean_cmds: clean_cmds.append(cmd) pcmd(self._hosts, "; ".join(clean_cmds), verbose) def prepare_storage(self, user, using_dcpm=None, using_nvme=None): """Prepare the server storage. Args: user (str): username using_dcpm (bool, optional): override option to prepare scm storage. Defaults to None, which uses the configuration file to determine if scm storage should be formatted. using_nvme (bool, optional): override option to prepare nvme storage. Defaults to None, which uses the configuration file to determine if nvme storage should be formatted. Raises: ServerFailed: if there was an error preparing the storage """ cmd = DaosServerCommand(self.manager.job.command_path) cmd.sudo = False cmd.debug.value = False cmd.set_sub_command("storage") cmd.sub_command_class.set_sub_command("prepare") cmd.sub_command_class.sub_command_class.target_user.value = user cmd.sub_command_class.sub_command_class.force.value = True # Use the configuration file settings if no overrides specified if using_dcpm is None: using_dcpm = self.manager.job.using_dcpm if using_nvme is None: using_nvme = self.manager.job.using_nvme if using_dcpm and not using_nvme: cmd.sub_command_class.sub_command_class.scm_only.value = True elif not using_dcpm and using_nvme: cmd.sub_command_class.sub_command_class.nvme_only.value = True if using_nvme: cmd.sub_command_class.sub_command_class.hugepages.value = 4096 self.log.info("Preparing DAOS server storage: %s", str(cmd)) result = pcmd(self._hosts, str(cmd), timeout=40) if len(result) > 1 or 0 not in result: dev_type = "nvme" if using_dcpm and using_nvme: dev_type = "dcpm & nvme" elif using_dcpm: dev_type = "dcpm" raise ServerFailed("Error preparing {} storage".format(dev_type)) def detect_format_ready(self, reformat=False): """Detect when all the daos_servers are ready for storage format.""" f_type = "format" if not reformat else "reformat" self.log.info("<SERVER> Waiting for servers to be ready for format") self.manager.job.update_pattern(f_type, len(self._hosts)) try: self.manager.run() except CommandFailure as error: self.kill() raise ServerFailed( "Failed to start servers before format: {}".format(error)) def detect_io_server_start(self, host_qty=None): """Detect when all the daos_io_servers have started. Args: host_qty (int): number of servers expected to have been started. Raises: ServerFailed: if there was an error starting the servers after formatting. """ if host_qty is None: hosts_qty = len(self._hosts) self.log.info("<SERVER> Waiting for the daos_io_servers to start") self.manager.job.update_pattern("normal", hosts_qty) if not self.manager.job.check_subprocess_status(self.manager.process): self.kill() raise ServerFailed("Failed to start servers after format") # Update the dmg command host list to work with pool create/destroy self.dmg.hostlist = self.get_config_value("access_points") def reset_storage(self): """Reset the server storage. Raises: ServerFailed: if there was an error resetting the storage """ cmd = DaosServerCommand(self.manager.job.command_path) cmd.sudo = False cmd.debug.value = False cmd.set_sub_command("storage") cmd.sub_command_class.set_sub_command("prepare") cmd.sub_command_class.sub_command_class.nvme_only.value = True cmd.sub_command_class.sub_command_class.reset.value = True cmd.sub_command_class.sub_command_class.force.value = True self.log.info("Resetting DAOS server storage: %s", str(cmd)) result = pcmd(self._hosts, str(cmd), timeout=120) if len(result) > 1 or 0 not in result: raise ServerFailed("Error resetting NVMe storage") def set_scm_mount_ownership(self, user=None, verbose=False): """Set the ownership to the specified user for each scm mount. Args: user (str, optional): user name. Defaults to None - current user. verbose (bool, optional): display commands. Defaults to False. """ user = getpass.getuser() if user is None else user cmd_list = set() for server_params in self.manager.job.yaml.server_params: scm_mount = server_params.scm_mount.value # Support single or multiple scm_mount points if not isinstance(scm_mount, list): scm_mount = [scm_mount] self.log.info("Changing ownership to %s for: %s", user, scm_mount) cmd_list.add("sudo chown -R {0}:{0} {1}".format( user, " ".join(scm_mount))) if cmd_list: pcmd(self._hosts, "; ".join(cmd_list), verbose) def start(self): """Start the server through the job manager.""" # Prepare the servers self.prepare() # Start the servers and wait for them to be ready for storage format self.detect_format_ready() # Format storage and wait for server to change ownership self.log.info("<SERVER> Formatting hosts: <%s>", self.dmg.hostlist) # Temporarily increasing timeout to avoid CI errors until DAOS-5764 can # be further investigated. self.dmg.storage_format(timeout=40) # Wait for all the daos_io_servers to start self.detect_io_server_start() return True def stop(self): """Stop the server through the runner.""" self.log.info("<SERVER> Stopping server %s command", self.manager.command) # Maintain a running list of errors detected trying to stop messages = [] # Stop the subprocess running the job manager command try: super(DaosServerManager, self).stop() except CommandFailure as error: messages.append("Error stopping the {} subprocess: {}".format( self.manager.command, error)) # Kill any leftover processes that may not have been stopped correctly self.kill() if self.manager.job.using_nvme: # Reset the storage try: self.reset_storage() except ServerFailed as error: messages.append(str(error)) # Make sure the mount directory belongs to non-root user self.set_scm_mount_ownership() # Report any errors after all stop actions have been attempted if messages: raise ServerFailed("Failed to stop servers:\n {}".format( "\n ".join(messages))) def get_environment_value(self, name): """Get the server config value associated with the env variable name. Args: name (str): environment variable name for which to get a daos_server configuration value Raises: ServerFailed: Unable to find a daos_server configuration value for the specified environment variable name Returns: str: the daos_server configuration value for the specified environment variable name """ try: setting = self.ENVIRONMENT_VARIABLE_MAPPING[name] except IndexError: raise ServerFailed( "Unknown server config setting mapping for the {} environment " "variable!".format(name)) return self.get_config_value(setting) def get_single_system_state(self): """Get the current homogeneous DAOS system state. Raises: ServerFailed: if a single state for all servers is not detected Returns: str: the current DAOS system state """ data = self.dmg.system_query() if not data: # The regex failed to get the rank and state raise ServerFailed("Error obtaining {} output: {}".format( self.dmg, data)) try: states = list(set([data[rank]["state"] for rank in data])) except KeyError: raise ServerFailed( "Unexpected result from {} - missing 'state' key: {}".format( self.dmg, data)) if len(states) > 1: # Multiple states for different ranks detected raise ServerFailed( "Multiple system states ({}) detected:\n {}".format( states, data)) return states[0] def check_system_state(self, valid_states, max_checks=1): """Check that the DAOS system state is one of the provided states. Fail the test if the current state does not match one of the specified valid states. Optionally the state check can loop multiple times, sleeping one second between checks, by increasing the number of maximum checks. Args: valid_states (list): expected DAOS system states as a list of lowercase strings max_checks (int, optional): number of times to check the state. Defaults to 1. Raises: ServerFailed: if there was an error detecting the server state or the detected state did not match one of the valid states Returns: str: the matching valid detected state """ checks = 0 daos_state = "????" while daos_state not in valid_states and checks < max_checks: if checks > 0: time.sleep(1) try: daos_state = self.get_single_system_state().lower() except ServerFailed as error: raise error checks += 1 self.log.info("System state check (%s): %s", checks, daos_state) if daos_state not in valid_states: raise ServerFailed( "Error checking DAOS state, currently neither {} after " "{} state check(s)!".format(valid_states, checks)) return daos_state def system_start(self): """Start the DAOS IO servers. Raises: ServerFailed: if there was an error starting the servers """ self.log.info("Starting DAOS IO servers") self.check_system_state(("stopped")) self.dmg.system_start() if self.dmg.result.exit_status != 0: raise ServerFailed("Error starting DAOS:\n{}".format( self.dmg.result)) def system_stop(self, extra_states=None): """Stop the DAOS IO servers. Args: extra_states (list, optional): a list of DAOS system states in addition to "started" and "joined" that are verified prior to issuing the stop. Defaults to None. Raises: ServerFailed: if there was an error stopping the servers """ valid_states = ["started", "joined"] if extra_states: valid_states.extend(extra_states) self.log.info("Stopping DAOS IO servers") self.check_system_state(valid_states) self.dmg.system_stop(force=True) if self.dmg.result.exit_status != 0: raise ServerFailed("Error stopping DAOS:\n{}".format( self.dmg.result)) def get_available_storage(self): """Get the available SCM and NVMe storage. Raises: ServerFailed: if there was an error stopping the servers Returns: list: a list of the maximum available SCM and NVMe sizes in bytes """ def get_host_capacity(key, device_names): """Get the total storage capacity per host rank. Args: key (str): the capacity type, e.g. "scm" or "nvme" device_names (list): the device names of this capacity type Returns: dict: a dictionary of total storage capacity per host rank """ host_capacity = {} for host in data: device_sizes = [] for device in data[host][key]: if device in device_names: device_sizes.append( human_to_bytes( data[host][key][device]["capacity"])) host_capacity[host] = sum(device_sizes) return host_capacity # Default maximum bytes for SCM and NVMe storage = [0, 0] using_dcpm = self.manager.job.using_dcpm using_nvme = self.manager.job.using_nvme if using_dcpm or using_nvme: # Stop the DAOS IO servers in order to be able to scan the storage self.system_stop() # Scan all of the hosts for their SCM and NVMe storage self.dmg.hostlist = self._hosts data = self.dmg.storage_scan(verbose=True) self.dmg.hostlist = self.get_config_value("access_points") if self.dmg.result.exit_status != 0: raise ServerFailed("Error obtaining DAOS storage:\n{}".format( self.dmg.result)) # Restart the DAOS IO servers self.system_start() if using_dcpm: # Find the sizes of the configured SCM storage scm_devices = [ os.path.basename(path) for path in self.get_config_value("scm_list") if path ] capacity = get_host_capacity("scm", scm_devices) for host in sorted(capacity): self.log.info("SCM capacity for %s: %s", host, capacity[host]) # Use the minimum SCM storage across all servers storage[0] = capacity[min(capacity, key=capacity.get)] else: # Use the assigned scm_size scm_size = self.get_config_value("scm_size") storage[0] = human_to_bytes("{}GB".format(scm_size)) if using_nvme: # Find the sizes of the configured NVMe storage capacity = get_host_capacity("nvme", self.get_config_value("bdev_list")) for host in sorted(capacity): self.log.info("NVMe capacity for %s: %s", host, capacity[host]) # Use the minimum SCM storage across all servers storage[1] = capacity[min(capacity, key=capacity.get)] self.log.info( "Total available storage:\n SCM: %s (%s)\n NVMe: %s (%s)", str(storage[0]), bytes_to_human(storage[0], binary=False), str(storage[1]), bytes_to_human(storage[1], binary=False)) return storage
class DaosServerManager(SubprocessManager): """Manages the daos_server execution on one or more hosts.""" # Mapping of environment variable names to daos_server config param names ENVIRONMENT_VARIABLE_MAPPING = { "CRT_PHY_ADDR_STR": "provider", "OFI_INTERFACE": "fabric_iface", "OFI_PORT": "fabric_iface_port", } def __init__(self, server_command, manager="Orterun", dmg_cfg=None): """Initialize a DaosServerManager object. Args: server_command (ServerCommand): server command object manager (str, optional): the name of the JobManager class used to manage the YamlCommand defined through the "job" attribute. Defaults to "OpenMpi". dmg_cfg (DmgYamlParameters, optional): The dmg configuration file parameters used to connect to this group of servers. """ super(DaosServerManager, self).__init__(server_command, manager) self.manager.job.sub_command_override = "start" # Dmg command to access this group of servers which will be configured # to access the doas_servers when they are started self.dmg = DmgCommand(self.manager.job.command_path, dmg_cfg) def get_params(self, test): """Get values for all of the command params from the yaml file. Use the yaml file parameter values to assign the server command and orterun command parameters. Args: test (Test): avocado Test object """ super(DaosServerManager, self).get_params(test) # Get the values for the dmg parameters self.dmg.get_params(test) def prepare(self, storage=True): """Prepare to start daos_server. Args: storage (bool, optional): whether or not to prepare dspm/nvme storage. Defaults to True. """ self.log.info( "<SERVER> Preparing to start daos_server on %s with %s", self._hosts, self.manager.command) # Create the daos_server yaml file self.manager.job.create_yaml_file() # Copy certificates self.manager.job.copy_certificates( get_log_file("daosCA/certs"), self._hosts) local_host = socket.gethostname().split('.', 1)[0] self.dmg.copy_certificates( get_log_file("daosCA/certs"), local_host.split()) # Prepare dmg for running storage format on all server hosts self.dmg.hostlist = self._hosts if not self.dmg.yaml: # If using a dmg config file, transport security was # already configured. self.dmg.insecure.update( self.get_config_value("allow_insecure"), "dmg.insecure") # Kill any daos servers running on the hosts self.kill() # Clean up any files that exist on the hosts self.clean_files() # Make sure log file has been created for ownership change if self.manager.job.using_nvme: cmd_list = [] for server_params in self.manager.job.yaml.server_params: log_file = server_params.log_file.value if log_file is not None: self.log.info("Creating log file: %s", log_file) cmd_list.append("touch {}".format(log_file)) if cmd_list: pcmd(self._hosts, "; ".join(cmd_list), False) if storage: # Prepare server storage if self.manager.job.using_nvme or self.manager.job.using_dcpm: self.log.info("Preparing storage in <format> mode") self.prepare_storage("root") if hasattr(self.manager, "mca"): self.manager.mca.update( {"plm_rsh_args": "-l root"}, "orterun.mca", True) def clean_files(self, verbose=True): """Clean up the daos server files. Args: verbose (bool, optional): display clean commands. Defaults to True. """ clean_cmds = [] for server_params in self.manager.job.yaml.server_params: scm_mount = server_params.get_value("scm_mount") self.log.info("Cleaning up the %s directory.", str(scm_mount)) # Remove the superblocks cmd = "rm -fr {}/*".format(scm_mount) if cmd not in clean_cmds: clean_cmds.append(cmd) # Dismount the scm mount point cmd = "while sudo umount {}; do continue; done".format(scm_mount) if cmd not in clean_cmds: clean_cmds.append(cmd) if self.manager.job.using_dcpm: scm_list = server_params.get_value("scm_list") if isinstance(scm_list, list): self.log.info( "Cleaning up the following device(s): %s.", ", ".join(scm_list)) # Umount and wipefs the dcpm device cmd_list = [ "for dev in {}".format(" ".join(scm_list)), "do mount=$(lsblk $dev -n -o MOUNTPOINT)", "if [ ! -z $mount ]", "then while sudo umount $mount", "do continue", "done", "fi", "sudo wipefs -a $dev", "done" ] cmd = "; ".join(cmd_list) if cmd not in clean_cmds: clean_cmds.append(cmd) pcmd(self._hosts, "; ".join(clean_cmds), verbose) def prepare_storage(self, user, using_dcpm=None, using_nvme=None): """Prepare the server storage. Args: user (str): username using_dcpm (bool, optional): override option to prepare scm storage. Defaults to None, which uses the configuration file to determine if scm storage should be formatted. using_nvme (bool, optional): override option to prepare nvme storage. Defaults to None, which uses the configuration file to determine if nvme storage should be formatted. Raises: ServerFailed: if there was an error preparing the storage """ cmd = DaosServerCommand(self.manager.job.command_path) cmd.sudo = False cmd.debug.value = False cmd.set_sub_command("storage") cmd.sub_command_class.set_sub_command("prepare") cmd.sub_command_class.sub_command_class.target_user.value = user cmd.sub_command_class.sub_command_class.force.value = True # Use the configuration file settings if no overrides specified if using_dcpm is None: using_dcpm = self.manager.job.using_dcpm if using_nvme is None: using_nvme = self.manager.job.using_nvme if using_dcpm and not using_nvme: cmd.sub_command_class.sub_command_class.scm_only.value = True elif not using_dcpm and using_nvme: cmd.sub_command_class.sub_command_class.nvme_only.value = True if using_nvme: cmd.sub_command_class.sub_command_class.hugepages.value = 4096 self.log.info("Preparing DAOS server storage: %s", str(cmd)) result = pcmd(self._hosts, str(cmd), timeout=120) if len(result) > 1 or 0 not in result: dev_type = "nvme" if using_dcpm and using_nvme: dev_type = "dcpm & nvme" elif using_dcpm: dev_type = "dcpm" raise ServerFailed("Error preparing {} storage".format(dev_type)) def detect_format_ready(self, reformat=False): """Detect when all the daos_servers are ready for storage format.""" f_type = "format" if not reformat else "reformat" self.log.info("<SERVER> Waiting for servers to be ready for format") self.manager.job.update_pattern(f_type, len(self._hosts)) try: self.manager.run() except CommandFailure as error: self.kill() raise ServerFailed( "Failed to start servers before format: {}".format(error)) def detect_io_server_start(self): """Detect when all the daos_io_servers have started.""" self.log.info("<SERVER> Waiting for the daos_io_servers to start") self.manager.job.update_pattern("normal", len(self._hosts)) if not self.manager.job.check_subprocess_status(self.manager.process): self.kill() raise ServerFailed("Failed to start servers after format") # Update the dmg command host list to work with pool create/destroy self.dmg.hostlist = self.get_config_value("access_points") def reset_storage(self): """Reset the server storage. Raises: ServerFailed: if there was an error resetting the storage """ cmd = DaosServerCommand(self.manager.job.command_path) cmd.sudo = False cmd.debug.value = False cmd.set_sub_command("storage") cmd.sub_command_class.set_sub_command("prepare") cmd.sub_command_class.sub_command_class.nvme_only.value = True cmd.sub_command_class.sub_command_class.reset.value = True cmd.sub_command_class.sub_command_class.force.value = True self.log.info("Resetting DAOS server storage: %s", str(cmd)) result = pcmd(self._hosts, str(cmd), timeout=120) if len(result) > 1 or 0 not in result: raise ServerFailed("Error resetting NVMe storage") def set_scm_mount_ownership(self, user=None, verbose=False): """Set the ownership to the specified user for each scm mount. Args: user (str, optional): user name. Defaults to None - current user. verbose (bool, optional): display commands. Defaults to False. """ user = getpass.getuser() if user is None else user cmd_list = set() for server_params in self.manager.job.yaml.server_params: scm_mount = server_params.scm_mount.value # Support single or multiple scm_mount points if not isinstance(scm_mount, list): scm_mount = [scm_mount] self.log.info("Changing ownership to %s for: %s", user, scm_mount) cmd_list.add( "sudo chown -R {0}:{0} {1}".format(user, " ".join(scm_mount))) if cmd_list: pcmd(self._hosts, "; ".join(cmd_list), verbose) def start(self): """Start the server through the job manager.""" # Prepare the servers self.prepare() # Start the servers and wait for them to be ready for storage format self.detect_format_ready() # Format storage and wait for server to change ownership self.log.info( "<SERVER> Formatting hosts: <%s>", self.dmg.hostlist) self.dmg.storage_format() # Wait for all the doas_io_servers to start self.detect_io_server_start() return True def stop(self): """Stop the server through the runner.""" self.log.info( "<SERVER> Stopping server %s command", self.manager.command) # Maintain a running list of errors detected trying to stop messages = [] # Stop the subprocess running the job manager command try: super(DaosServerManager, self).stop() except CommandFailure as error: messages.append( "Error stopping the {} subprocess: {}".format( self.manager.command, error)) # Kill any leftover processes that may not have been stopped correctly self.kill() if self.manager.job.using_nvme: # Reset the storage try: self.reset_storage() except ServerFailed as error: messages.append(str(error)) # Make sure the mount directory belongs to non-root user self.set_scm_mount_ownership() # Report any errors after all stop actions have been attempted if messages: raise ServerFailed( "Failed to stop servers:\n {}".format("\n ".join(messages))) def get_environment_value(self, name): """Get the server config value associated with the env variable name. Args: name (str): environment variable name for which to get a daos_server configuration value Raises: ServerFailed: Unable to find a daos_server configuration value for the specified environment variable name Returns: str: the daos_server configuration value for the specified environment variable name """ try: setting = self.ENVIRONMENT_VARIABLE_MAPPING[name] except IndexError: raise ServerFailed( "Unknown server config setting mapping for the {} environment " "variable!".format(name)) return self.get_config_value(setting)