class NvmeFault(ServerFillUp):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: To validate IO works fine when NVMe fault generated
    on single or multiple servers with single drive.
    :avocado: recursive
    """
    def setUp(self):
        """Set up for test case."""
        super(NvmeFault, self).setUp()
        # Pull the fault-test parameters from the yaml file
        self.capacity = self.params.get(
            "percentage", '/run/faulttests/pool_capacity/*')
        self.no_of_servers = self.params.get(
            "count", '/run/faulttests/no_of_servers/*/')
        self.no_of_drives = self.params.get(
            "count", '/run/faulttests/no_of_drives/*/')
        self.no_of_pools = self.params.get("number_of_pools", '/run/pool/*', 1)

        # Build a dmg command configured to talk to the running servers
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")

        # Request NVMe fault injection while the IO load is running
        self.set_faulty_device = True

    @skipForTicket("DAOS-5497")
    def test_nvme_fault(self):
        """Jira ID: DAOS-4722.

        Test Description: Test NVMe disk fault.
        Use Case: Create the large size of pool and start filling up the pool.
                  while IO is in progress remove single disks from
                  single/multiple servers.

        :avocado: tags=all,hw,medium,nvme,ib2,nvme_fault,full_regression
        """
        # Build a pool using all of the available NVMe capacity
        self.create_pool_max_size(nvme=True)

        # Run IOR; the NVMe fault is generated while this load is active
        self.start_ior_load(precent=self.capacity)
        print("pool_percentage_used -- After -- {}".format(
            self.pool.pool_percentage_used()))

        # Confirm dmg can still report NVMe health after the fault
        self.dmg.hostlist = self.hostlist_servers
        try:
            self.dmg.storage_query_nvme_health()
        except CommandFailure as _error:
            self.fail("dmg nvme-health failed")
def test_dmg_nvme_scan_basic(self): """ JIRA ID: DAOS-2485 Test Description: Test basic dmg functionality to scan the nvme storage. on the system. :avocado: tags=all,tiny,pr,dmg,nvme_scan,basic """ # Create dmg command dmg = DmgCommand(os.path.join(self.prefix, "bin")) dmg.get_params(self) # Update hostlist value for dmg command port = self.params.get("port", "/run/server_config/*") servers_with_ports = [ "{}:{}".format(host, port) for host in self.hostlist_servers] dmg.hostlist = servers_with_ports try: dmg.run() except process.CmdError as details: self.fail("dmg command failed: {}".format(details))
class DaosServerManager(SubprocessManager):
    """Manages the daos_server execution on one or more hosts."""

    # Mapping of environment variable names to daos_server config param names
    ENVIRONMENT_VARIABLE_MAPPING = {
        "CRT_PHY_ADDR_STR": "provider",
        "OFI_INTERFACE": "fabric_iface",
        "OFI_PORT": "fabric_iface_port",
    }

    def __init__(self, server_command, manager="Orterun", dmg_cfg=None):
        """Initialize a DaosServerManager object.

        Args:
            server_command (ServerCommand): server command object
            manager (str, optional): the name of the JobManager class used to
                manage the YamlCommand defined through the "job" attribute.
                Defaults to "Orterun".
            dmg_cfg (DmgYamlParameters, optional): The dmg configuration file
                parameters used to connect to this group of servers.
        """
        super(DaosServerManager, self).__init__(server_command, manager)
        self.manager.job.sub_command_override = "start"
        # Dmg command to access this group of servers which will be configured
        # to access the daos_servers when they are started
        self.dmg = DmgCommand(self.manager.job.command_path, dmg_cfg)

    def get_params(self, test):
        """Get values for all of the command params from the yaml file.

        Use the yaml file parameter values to assign the server command and
        orterun command parameters.

        Args:
            test (Test): avocado Test object
        """
        super(DaosServerManager, self).get_params(test)
        # Get the values for the dmg parameters
        self.dmg.get_params(test)

    def prepare(self, storage=True):
        """Prepare to start daos_server.

        Args:
            storage (bool, optional): whether or not to prepare dcpm/nvme
                storage. Defaults to True.
        """
        self.log.info(
            "<SERVER> Preparing to start daos_server on %s with %s",
            self._hosts, self.manager.command)

        # Create the daos_server yaml file
        self.manager.job.create_yaml_file()

        # Copy certificates to the servers and to the local dmg host
        self.manager.job.copy_certificates(
            get_log_file("daosCA/certs"), self._hosts)
        local_host = socket.gethostname().split('.', 1)[0]
        self.dmg.copy_certificates(
            get_log_file("daosCA/certs"), local_host.split())

        # Prepare dmg for running storage format on all server hosts
        self.dmg.hostlist = self._hosts
        if not self.dmg.yaml:
            # If using a dmg config file, transport security was
            # already configured.
            self.dmg.insecure.update(
                self.get_config_value("allow_insecure"), "dmg.insecure")

        # Kill any daos servers running on the hosts
        self.kill()

        # Clean up any files that exist on the hosts
        self.clean_files()

        # Make sure log file has been created for ownership change
        if self.manager.job.using_nvme:
            cmd_list = []
            for server_params in self.manager.job.yaml.server_params:
                log_file = server_params.log_file.value
                if log_file is not None:
                    self.log.info("Creating log file: %s", log_file)
                    cmd_list.append("touch {}".format(log_file))
            if cmd_list:
                pcmd(self._hosts, "; ".join(cmd_list), False)

        if storage:
            # Prepare server storage
            if self.manager.job.using_nvme or self.manager.job.using_dcpm:
                self.log.info("Preparing storage in <format> mode")
                self.prepare_storage("root")
                # Root access is required after preparing storage as root
                if hasattr(self.manager, "mca"):
                    self.manager.mca.update(
                        {"plm_rsh_args": "-l root"}, "orterun.mca", True)

    def clean_files(self, verbose=True):
        """Clean up the daos server files.

        Args:
            verbose (bool, optional): display clean commands. Defaults to True.
        """
        clean_cmds = []
        for server_params in self.manager.job.yaml.server_params:
            scm_mount = server_params.get_value("scm_mount")
            self.log.info("Cleaning up the %s directory.", str(scm_mount))

            # Remove the superblocks
            cmd = "sudo rm -fr {}/*".format(scm_mount)
            if cmd not in clean_cmds:
                clean_cmds.append(cmd)

            # Dismount the scm mount point
            cmd = "while sudo umount {}; do continue; done".format(scm_mount)
            if cmd not in clean_cmds:
                clean_cmds.append(cmd)

            if self.manager.job.using_dcpm:
                scm_list = server_params.get_value("scm_list")
                if isinstance(scm_list, list):
                    self.log.info(
                        "Cleaning up the following device(s): %s.",
                        ", ".join(scm_list))
                    # Umount and wipefs the dcpm device
                    cmd_list = [
                        "for dev in {}".format(" ".join(scm_list)),
                        "do mount=$(lsblk $dev -n -o MOUNTPOINT)",
                        "if [ ! -z $mount ]",
                        "then while sudo umount $mount",
                        "do continue",
                        "done",
                        "fi",
                        "sudo wipefs -a $dev",
                        "done"
                    ]
                    cmd = "; ".join(cmd_list)
                    if cmd not in clean_cmds:
                        clean_cmds.append(cmd)

        pcmd(self._hosts, "; ".join(clean_cmds), verbose)

    def prepare_storage(self, user, using_dcpm=None, using_nvme=None):
        """Prepare the server storage.

        Args:
            user (str): username
            using_dcpm (bool, optional): override option to prepare scm
                storage. Defaults to None, which uses the configuration file
                to determine if scm storage should be formatted.
            using_nvme (bool, optional): override option to prepare nvme
                storage. Defaults to None, which uses the configuration file
                to determine if nvme storage should be formatted.

        Raises:
            ServerFailed: if there was an error preparing the storage
        """
        cmd = DaosServerCommand(self.manager.job.command_path)
        cmd.sudo = False
        cmd.debug.value = False
        cmd.set_sub_command("storage")
        cmd.sub_command_class.set_sub_command("prepare")
        cmd.sub_command_class.sub_command_class.target_user.value = user
        cmd.sub_command_class.sub_command_class.force.value = True

        # Use the configuration file settings if no overrides specified
        if using_dcpm is None:
            using_dcpm = self.manager.job.using_dcpm
        if using_nvme is None:
            using_nvme = self.manager.job.using_nvme

        if using_dcpm and not using_nvme:
            cmd.sub_command_class.sub_command_class.scm_only.value = True
        elif not using_dcpm and using_nvme:
            cmd.sub_command_class.sub_command_class.nvme_only.value = True

        if using_nvme:
            cmd.sub_command_class.sub_command_class.hugepages.value = 4096

        self.log.info("Preparing DAOS server storage: %s", str(cmd))
        result = pcmd(self._hosts, str(cmd), timeout=40)
        if len(result) > 1 or 0 not in result:
            dev_type = "nvme"
            if using_dcpm and using_nvme:
                dev_type = "dcpm & nvme"
            elif using_dcpm:
                dev_type = "dcpm"
            raise ServerFailed("Error preparing {} storage".format(dev_type))

    def detect_format_ready(self, reformat=False):
        """Detect when all the daos_servers are ready for storage format."""
        f_type = "format" if not reformat else "reformat"
        self.log.info("<SERVER> Waiting for servers to be ready for format")
        self.manager.job.update_pattern(f_type, len(self._hosts))
        try:
            self.manager.run()
        except CommandFailure as error:
            self.kill()
            raise ServerFailed(
                "Failed to start servers before format: {}".format(error))

    def detect_io_server_start(self, host_qty=None):
        """Detect when all the daos_io_servers have started.

        Args:
            host_qty (int, optional): number of servers expected to have been
                started. Defaults to None, which expects a daos_io_server on
                every managed host.

        Raises:
            ServerFailed: if there was an error starting the servers after
                formatting.
        """
        # Fix: the original assigned a misspelled 'hosts_qty' local, leaving
        # an undefined name when host_qty was provided by the caller.
        if host_qty is None:
            host_qty = len(self._hosts)
        self.log.info("<SERVER> Waiting for the daos_io_servers to start")
        self.manager.job.update_pattern("normal", host_qty)
        if not self.manager.job.check_subprocess_status(self.manager.process):
            self.kill()
            raise ServerFailed("Failed to start servers after format")

        # Update the dmg command host list to work with pool create/destroy
        self.dmg.hostlist = self.get_config_value("access_points")

    def reset_storage(self):
        """Reset the server storage.

        Raises:
            ServerFailed: if there was an error resetting the storage
        """
        cmd = DaosServerCommand(self.manager.job.command_path)
        cmd.sudo = False
        cmd.debug.value = False
        cmd.set_sub_command("storage")
        cmd.sub_command_class.set_sub_command("prepare")
        cmd.sub_command_class.sub_command_class.nvme_only.value = True
        cmd.sub_command_class.sub_command_class.reset.value = True
        cmd.sub_command_class.sub_command_class.force.value = True

        self.log.info("Resetting DAOS server storage: %s", str(cmd))
        result = pcmd(self._hosts, str(cmd), timeout=120)
        if len(result) > 1 or 0 not in result:
            raise ServerFailed("Error resetting NVMe storage")

    def set_scm_mount_ownership(self, user=None, verbose=False):
        """Set the ownership to the specified user for each scm mount.

        Args:
            user (str, optional): user name. Defaults to None - current user.
            verbose (bool, optional): display commands. Defaults to False.
        """
        user = getpass.getuser() if user is None else user

        cmd_list = set()
        for server_params in self.manager.job.yaml.server_params:
            scm_mount = server_params.scm_mount.value

            # Support single or multiple scm_mount points
            if not isinstance(scm_mount, list):
                scm_mount = [scm_mount]

            self.log.info("Changing ownership to %s for: %s", user, scm_mount)
            cmd_list.add(
                "sudo chown -R {0}:{0} {1}".format(user, " ".join(scm_mount)))

        if cmd_list:
            pcmd(self._hosts, "; ".join(cmd_list), verbose)

    def start(self):
        """Start the server through the job manager."""
        # Prepare the servers
        self.prepare()

        # Start the servers and wait for them to be ready for storage format
        self.detect_format_ready()

        # Format storage and wait for server to change ownership
        self.log.info(
            "<SERVER> Formatting hosts: <%s>", self.dmg.hostlist)
        # Temporarily increasing timeout to avoid CI errors until DAOS-5764 can
        # be further investigated.
        self.dmg.storage_format(timeout=40)

        # Wait for all the daos_io_servers to start
        self.detect_io_server_start()

        return True

    def stop(self):
        """Stop the server through the runner."""
        self.log.info(
            "<SERVER> Stopping server %s command", self.manager.command)

        # Maintain a running list of errors detected trying to stop
        messages = []

        # Stop the subprocess running the job manager command
        try:
            super(DaosServerManager, self).stop()
        except CommandFailure as error:
            messages.append(
                "Error stopping the {} subprocess: {}".format(
                    self.manager.command, error))

        # Kill any leftover processes that may not have been stopped correctly
        self.kill()

        if self.manager.job.using_nvme:
            # Reset the storage
            try:
                self.reset_storage()
            except ServerFailed as error:
                messages.append(str(error))

            # Make sure the mount directory belongs to non-root user
            self.set_scm_mount_ownership()

        # Report any errors after all stop actions have been attempted
        if messages:
            raise ServerFailed(
                "Failed to stop servers:\n  {}".format("\n  ".join(messages)))

    def get_environment_value(self, name):
        """Get the server config value associated with the env variable name.

        Args:
            name (str): environment variable name for which to get a
                daos_server configuration value

        Raises:
            ServerFailed: Unable to find a daos_server configuration value for
                the specified environment variable name

        Returns:
            str: the daos_server configuration value for the specified
                environment variable name
        """
        try:
            setting = self.ENVIRONMENT_VARIABLE_MAPPING[name]
        # Fix: a missing dict key raises KeyError; the original caught
        # IndexError and would never convert the error into a ServerFailed.
        except KeyError:
            raise ServerFailed(
                "Unknown server config setting mapping for the {} environment "
                "variable!".format(name))

        return self.get_config_value(setting)

    def get_single_system_state(self):
        """Get the current homogeneous DAOS system state.

        Raises:
            ServerFailed: if a single state for all servers is not detected

        Returns:
            str: the current DAOS system state
        """
        data = self.dmg.system_query()
        if not data:
            # The regex failed to get the rank and state
            raise ServerFailed(
                "Error obtaining {} output: {}".format(self.dmg, data))
        try:
            states = list(set([data[rank]["state"] for rank in data]))
        except KeyError:
            raise ServerFailed(
                "Unexpected result from {} - missing 'state' key: {}".format(
                    self.dmg, data))
        if len(states) > 1:
            # Multiple states for different ranks detected
            raise ServerFailed(
                "Multiple system states ({}) detected:\n  {}".format(
                    states, data))
        return states[0]

    def check_system_state(self, valid_states, max_checks=1):
        """Check that the DAOS system state is one of the provided states.

        Fail the test if the current state does not match one of the specified
        valid states.  Optionally the state check can loop multiple times,
        sleeping one second between checks, by increasing the number of maximum
        checks.

        Args:
            valid_states (list): expected DAOS system states as a list of
                lowercase strings
            max_checks (int, optional): number of times to check the state.
                Defaults to 1.

        Raises:
            ServerFailed: if there was an error detecting the server state or
                the detected state did not match one of the valid states

        Returns:
            str: the matching valid detected state
        """
        checks = 0
        daos_state = "????"
        while daos_state not in valid_states and checks < max_checks:
            if checks > 0:
                time.sleep(1)
            # Let any ServerFailed from the query propagate directly; the
            # original caught it only to immediately re-raise it unchanged.
            daos_state = self.get_single_system_state().lower()
            checks += 1
            self.log.info("System state check (%s): %s", checks, daos_state)
        if daos_state not in valid_states:
            raise ServerFailed(
                "Error checking DAOS state, currently neither {} after "
                "{} state check(s)!".format(valid_states, checks))
        return daos_state

    def system_start(self):
        """Start the DAOS IO servers.

        Raises:
            ServerFailed: if there was an error starting the servers
        """
        self.log.info("Starting DAOS IO servers")
        # Fix: ("stopped") is just a parenthesized string; membership tests
        # against it did substring matching.  Use a real one-element tuple.
        self.check_system_state(("stopped",))
        self.dmg.system_start()
        if self.dmg.result.exit_status != 0:
            raise ServerFailed("Error starting DAOS:\n{}".format(
                self.dmg.result))

    def system_stop(self, extra_states=None):
        """Stop the DAOS IO servers.

        Args:
            extra_states (list, optional): a list of DAOS system states in
                addition to "started" and "joined" that are verified prior to
                issuing the stop. Defaults to None.

        Raises:
            ServerFailed: if there was an error stopping the servers
        """
        valid_states = ["started", "joined"]
        if extra_states:
            valid_states.extend(extra_states)
        self.log.info("Stopping DAOS IO servers")
        self.check_system_state(valid_states)
        self.dmg.system_stop(force=True)
        if self.dmg.result.exit_status != 0:
            raise ServerFailed("Error stopping DAOS:\n{}".format(
                self.dmg.result))

    def get_available_storage(self):
        """Get the available SCM and NVMe storage.

        Raises:
            ServerFailed: if there was an error stopping the servers

        Returns:
            list: a list of the maximum available SCM and NVMe sizes in bytes

        """
        def get_host_capacity(key, device_names):
            """Get the total storage capacity per host rank.

            Args:
                key (str): the capacity type, e.g. "scm" or "nvme"
                device_names (list): the device names of this capacity type

            Returns:
                dict: a dictionary of total storage capacity per host rank

            """
            host_capacity = {}
            for host in data:
                device_sizes = []
                for device in data[host][key]:
                    if device in device_names:
                        device_sizes.append(
                            human_to_bytes(
                                data[host][key][device]["capacity"]))
                host_capacity[host] = sum(device_sizes)
            return host_capacity

        # Default maximum bytes for SCM and NVMe
        storage = [0, 0]

        using_dcpm = self.manager.job.using_dcpm
        using_nvme = self.manager.job.using_nvme

        if using_dcpm or using_nvme:
            # Stop the DAOS IO servers in order to be able to scan the storage
            self.system_stop()

            # Scan all of the hosts for their SCM and NVMe storage
            self.dmg.hostlist = self._hosts
            data = self.dmg.storage_scan(verbose=True)
            self.dmg.hostlist = self.get_config_value("access_points")
            if self.dmg.result.exit_status != 0:
                raise ServerFailed("Error obtaining DAOS storage:\n{}".format(
                    self.dmg.result))

            # Restart the DAOS IO servers
            self.system_start()

        if using_dcpm:
            # Find the sizes of the configured SCM storage
            scm_devices = [
                os.path.basename(path)
                for path in self.get_config_value("scm_list") if path]
            capacity = get_host_capacity("scm", scm_devices)
            for host in sorted(capacity):
                self.log.info("SCM capacity for %s: %s", host, capacity[host])
            # Use the minimum SCM storage across all servers
            storage[0] = capacity[min(capacity, key=capacity.get)]
        else:
            # Use the assigned scm_size
            scm_size = self.get_config_value("scm_size")
            storage[0] = human_to_bytes("{}GB".format(scm_size))

        if using_nvme:
            # Find the sizes of the configured NVMe storage
            capacity = get_host_capacity(
                "nvme", self.get_config_value("bdev_list"))
            for host in sorted(capacity):
                self.log.info("NVMe capacity for %s: %s", host, capacity[host])
            # Use the minimum NVMe storage across all servers
            storage[1] = capacity[min(capacity, key=capacity.get)]

        self.log.info(
            "Total available storage:\n  SCM:  %s (%s)\n  NVMe: %s (%s)",
            str(storage[0]), bytes_to_human(storage[0], binary=False),
            str(storage[1]), bytes_to_human(storage[1], binary=False))
        return storage
class NvmeHealth(ServerFillUp):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: To validate NVMe health test cases
    :avocado: recursive
    """
    def test_monitor_for_large_pools(self):
        """Jira ID: DAOS-4722.

        Test Description: Test Health monitor for large number of pools.
        Use Case: This tests will create the 40 number of pools and verify
        the dmg list-pools, device-health and nvme-health works for all pools.

        :avocado: tags=all,full_regression
        :avocado: tags=hw,medium
        :avocado: tags=nvme
        :avocado: tags=nvme_health
        """
        # pylint: disable=attribute-defined-outside-init
        # pylint: disable=too-many-branches
        no_of_pools = self.params.get("number_of_pools", '/run/pool/*')
        # Fraction of total storage to consume across all pools
        pool_capacity = self.params.get("pool_used_percentage", '/run/pool/*')
        pool_capacity = pool_capacity / 100
        storage = self.get_max_storage_sizes()

        #Create the pool from available of storage space
        # storage[0] is SCM capacity, storage[1] is NVMe capacity; divide the
        # requested fraction evenly across the pools
        single_pool_nvme_size = int((storage[1] * pool_capacity) / no_of_pools)
        single_pool_scm_size = int((storage[0] * pool_capacity) / no_of_pools)

        self.pool = []
        # Create the Large number of pools
        for _pool in range(no_of_pools):
            self.log.info("-- Creating pool number = %s", _pool)
            # Build the pool object first, then override the sizes computed
            # above before issuing the actual create
            self.pool.append(self.get_pool(create=False))
            self.pool[-1].scm_size.update(single_pool_scm_size, "scm_size")
            self.pool[-1].nvme_size.update(single_pool_nvme_size, "nvme_size")
            self.pool[-1].create()

        # initialize the dmg command
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")

        # List all pools
        # Chain the sub-commands so self.dmg runs "storage query list-pools";
        # the order of these calls matters (each configures the next level)
        self.dmg.set_sub_command("storage")
        self.dmg.sub_command_class.set_sub_command("query")
        self.dmg.sub_command_class.sub_command_class.set_sub_command(
            "list-pools")
        for host in self.hostlist_servers:
            # Query each server host individually
            self.dmg.hostlist = host
            try:
                result = self.dmg.run()
            except CommandFailure as error:
                self.fail("dmg command failed: {}".format(error))
            #Verify all pools UUID listed as part of query
            for pool in self.pool:
                if pool.uuid.lower() not in result.stdout_text:
                    self.fail('Pool uuid {} not found in smd query'.format(
                        pool.uuid.lower()))

        # Get the device ID from all the servers.
        device_ids = get_device_ids(self.dmg, self.hostlist_servers)

        # Get the device health
        # Every device on every host is expected to report State:NORMAL
        for host in device_ids:
            self.dmg.hostlist = host
            for _dev in device_ids[host]:
                try:
                    result = self.dmg.storage_query_device_health(_dev)
                except CommandFailure as error:
                    self.fail("dmg get device states failed {}".format(error))
                if 'State:NORMAL' not in result.stdout_text:
                    self.fail("device {} on host {} is not NORMAL".format(
                        _dev, host))

        # Get the nvme-health
        # A command failure here means the health scan itself broke; the
        # scan output is not further validated
        try:
            self.dmg.storage_scan_nvme_health()
        except CommandFailure as error:
            self.fail("dmg storage scan --nvme-health failed {}".format(error))
def pool_acl_verification(self, current_user_acl, read, write): ''' Deascription: Daos pool security verification with acl file. Steps: (1)Setup dmg tool for creating a pool (2)Generate acl file with permissions (3)Create a pool with acl (4)Verify the pool create status (5)Get the pool's acl list (6)Verify pool read operation (7)Verify pool write operation (8)Cleanup user and destroy pool Args: current_user_acl: acl with read write access credential. read: expecting read permission. write: expecting write permission. Return: pass to continue. fail to report the testlog and stop. ''' # (1)Create daos_shell command dmg = DmgCommand(os.path.join(self.prefix, "bin")) dmg.get_params(self) port = self.params.get("port", "/run/server_config/*", 10001) get_acl_file = self.params.get("acl_file", "/run/pool_acl/*", "acl_test.txt") acl_file = os.path.join(self.tmp, get_acl_file) num_user = self.params.get("num_user", "/run/pool_acl/*") num_group = self.params.get("num_group", "/run/pool_acl/*") servers_with_ports = [ "{}:{}".format(host, port) for host in self.hostlist_servers ] dmg.hostlist.update(",".join(servers_with_ports), "dmg.hostlist") self.log.info(" (1)dmg= %s", dmg) # (2)Generate acl file with permissions self.log.info(" (2)Generate acl file with user/group permissions") permission_list = self.create_pool_acl(num_user, num_group, current_user_acl, acl_file) # (3)Create a pool with acl self.log.info(" (3)Create a pool with acl") dmg.action_command.acl_file.value = acl_file dmg.exit_status_exception = False result = dmg.run() # (4)Verify the pool create status self.log.info(" (4)dmg.run() result=\n%s", result) if result.stderr == "": uuid, svc = dmg_utils.get_pool_uuid_service_replicas_from_stdout( result.stdout) else: self.fail("##(4)Unable to parse pool uuid and svc.") # (5)Get the pool's acl list # dmg pool get-acl --pool <UUID> self.log.info(" (5)Get a pool's acl list by: " "dmg pool get-acl --pool --hostlist") pool_acl_list = self.get_pool_acl_list(uuid) 
self.log.info(" pool original permission_list: %s", permission_list) self.log.info(" pool get_acl permission_list: %s", pool_acl_list) # (6)Verify pool read operation # daos pool query --pool <uuid> self.log.info(" (6)Verify pool read by: daos pool query --pool") self.verify_pool_readwrite(svc, uuid, "read", expect=read) # (7)Verify pool write operation # daos continer create --pool <uuid> self.log.info(" (7)Verify pool write by: daos continer create --pool") self.verify_pool_readwrite(svc, uuid, "write", expect=write) # (8)Cleanup user and destroy pool self.log.info(" (8)Cleanup user and destroy pool") self.cleanup_user_group(num_user, num_group) dmg = DmgCommand(os.path.join(self.prefix, "bin")) dmg.request.value = "pool" dmg.action.value = "destroy --pool={}".format(uuid) dmg.hostlist.update(",".join(servers_with_ports), "dmg.hostlist") result = dmg.run() return
class DaosServerManager(SubprocessManager):
    """Manages the daos_server execution on one or more hosts."""

    # Mapping of environment variable names to daos_server config param names
    ENVIRONMENT_VARIABLE_MAPPING = {
        "CRT_PHY_ADDR_STR": "provider",
        "OFI_INTERFACE": "fabric_iface",
        "OFI_PORT": "fabric_iface_port",
    }

    def __init__(self, server_command, manager="Orterun", dmg_cfg=None):
        """Initialize a DaosServerManager object.

        Args:
            server_command (ServerCommand): server command object
            manager (str, optional): the name of the JobManager class used to
                manage the YamlCommand defined through the "job" attribute.
                Defaults to "Orterun".
            dmg_cfg (DmgYamlParameters, optional): The dmg configuration file
                parameters used to connect to this group of servers.
        """
        super(DaosServerManager, self).__init__(server_command, manager)
        self.manager.job.sub_command_override = "start"
        # Dmg command to access this group of servers which will be configured
        # to access the daos_servers when they are started
        self.dmg = DmgCommand(self.manager.job.command_path, dmg_cfg)

    def get_params(self, test):
        """Get values for all of the command params from the yaml file.

        Use the yaml file parameter values to assign the server command and
        orterun command parameters.

        Args:
            test (Test): avocado Test object
        """
        super(DaosServerManager, self).get_params(test)
        # Get the values for the dmg parameters
        self.dmg.get_params(test)

    def prepare(self, storage=True):
        """Prepare to start daos_server.

        Args:
            storage (bool, optional): whether or not to prepare dcpm/nvme
                storage. Defaults to True.
        """
        self.log.info(
            "<SERVER> Preparing to start daos_server on %s with %s",
            self._hosts, self.manager.command)

        # Create the daos_server yaml file
        self.manager.job.create_yaml_file()

        # Copy certificates to the servers and to the local dmg host
        self.manager.job.copy_certificates(
            get_log_file("daosCA/certs"), self._hosts)
        local_host = socket.gethostname().split('.', 1)[0]
        self.dmg.copy_certificates(
            get_log_file("daosCA/certs"), local_host.split())

        # Prepare dmg for running storage format on all server hosts
        self.dmg.hostlist = self._hosts
        if not self.dmg.yaml:
            # If using a dmg config file, transport security was
            # already configured.
            self.dmg.insecure.update(
                self.get_config_value("allow_insecure"), "dmg.insecure")

        # Kill any daos servers running on the hosts
        self.kill()

        # Clean up any files that exist on the hosts
        self.clean_files()

        # Make sure log file has been created for ownership change
        if self.manager.job.using_nvme:
            cmd_list = []
            for server_params in self.manager.job.yaml.server_params:
                log_file = server_params.log_file.value
                if log_file is not None:
                    self.log.info("Creating log file: %s", log_file)
                    cmd_list.append("touch {}".format(log_file))
            if cmd_list:
                pcmd(self._hosts, "; ".join(cmd_list), False)

        if storage:
            # Prepare server storage
            if self.manager.job.using_nvme or self.manager.job.using_dcpm:
                self.log.info("Preparing storage in <format> mode")
                self.prepare_storage("root")
                # Root access is required after preparing storage as root
                if hasattr(self.manager, "mca"):
                    self.manager.mca.update(
                        {"plm_rsh_args": "-l root"}, "orterun.mca", True)

    def clean_files(self, verbose=True):
        """Clean up the daos server files.

        Args:
            verbose (bool, optional): display clean commands. Defaults to True.
        """
        clean_cmds = []
        for server_params in self.manager.job.yaml.server_params:
            scm_mount = server_params.get_value("scm_mount")
            self.log.info("Cleaning up the %s directory.", str(scm_mount))

            # Remove the superblocks
            cmd = "rm -fr {}/*".format(scm_mount)
            if cmd not in clean_cmds:
                clean_cmds.append(cmd)

            # Dismount the scm mount point
            cmd = "while sudo umount {}; do continue; done".format(scm_mount)
            if cmd not in clean_cmds:
                clean_cmds.append(cmd)

            if self.manager.job.using_dcpm:
                scm_list = server_params.get_value("scm_list")
                if isinstance(scm_list, list):
                    self.log.info(
                        "Cleaning up the following device(s): %s.",
                        ", ".join(scm_list))
                    # Umount and wipefs the dcpm device
                    cmd_list = [
                        "for dev in {}".format(" ".join(scm_list)),
                        "do mount=$(lsblk $dev -n -o MOUNTPOINT)",
                        "if [ ! -z $mount ]",
                        "then while sudo umount $mount",
                        "do continue",
                        "done",
                        "fi",
                        "sudo wipefs -a $dev",
                        "done"
                    ]
                    cmd = "; ".join(cmd_list)
                    if cmd not in clean_cmds:
                        clean_cmds.append(cmd)

        pcmd(self._hosts, "; ".join(clean_cmds), verbose)

    def prepare_storage(self, user, using_dcpm=None, using_nvme=None):
        """Prepare the server storage.

        Args:
            user (str): username
            using_dcpm (bool, optional): override option to prepare scm
                storage. Defaults to None, which uses the configuration file
                to determine if scm storage should be formatted.
            using_nvme (bool, optional): override option to prepare nvme
                storage. Defaults to None, which uses the configuration file
                to determine if nvme storage should be formatted.

        Raises:
            ServerFailed: if there was an error preparing the storage
        """
        cmd = DaosServerCommand(self.manager.job.command_path)
        cmd.sudo = False
        cmd.debug.value = False
        cmd.set_sub_command("storage")
        cmd.sub_command_class.set_sub_command("prepare")
        cmd.sub_command_class.sub_command_class.target_user.value = user
        cmd.sub_command_class.sub_command_class.force.value = True

        # Use the configuration file settings if no overrides specified
        if using_dcpm is None:
            using_dcpm = self.manager.job.using_dcpm
        if using_nvme is None:
            using_nvme = self.manager.job.using_nvme

        if using_dcpm and not using_nvme:
            cmd.sub_command_class.sub_command_class.scm_only.value = True
        elif not using_dcpm and using_nvme:
            cmd.sub_command_class.sub_command_class.nvme_only.value = True

        if using_nvme:
            cmd.sub_command_class.sub_command_class.hugepages.value = 4096

        self.log.info("Preparing DAOS server storage: %s", str(cmd))
        result = pcmd(self._hosts, str(cmd), timeout=120)
        if len(result) > 1 or 0 not in result:
            dev_type = "nvme"
            if using_dcpm and using_nvme:
                dev_type = "dcpm & nvme"
            elif using_dcpm:
                dev_type = "dcpm"
            raise ServerFailed("Error preparing {} storage".format(dev_type))

    def detect_format_ready(self, reformat=False):
        """Detect when all the daos_servers are ready for storage format."""
        f_type = "format" if not reformat else "reformat"
        self.log.info("<SERVER> Waiting for servers to be ready for format")
        self.manager.job.update_pattern(f_type, len(self._hosts))
        try:
            self.manager.run()
        except CommandFailure as error:
            self.kill()
            raise ServerFailed(
                "Failed to start servers before format: {}".format(error))

    def detect_io_server_start(self):
        """Detect when all the daos_io_servers have started."""
        self.log.info("<SERVER> Waiting for the daos_io_servers to start")
        self.manager.job.update_pattern("normal", len(self._hosts))
        if not self.manager.job.check_subprocess_status(self.manager.process):
            self.kill()
            raise ServerFailed("Failed to start servers after format")

        # Update the dmg command host list to work with pool create/destroy
        self.dmg.hostlist = self.get_config_value("access_points")

    def reset_storage(self):
        """Reset the server storage.

        Raises:
            ServerFailed: if there was an error resetting the storage
        """
        cmd = DaosServerCommand(self.manager.job.command_path)
        cmd.sudo = False
        cmd.debug.value = False
        cmd.set_sub_command("storage")
        cmd.sub_command_class.set_sub_command("prepare")
        cmd.sub_command_class.sub_command_class.nvme_only.value = True
        cmd.sub_command_class.sub_command_class.reset.value = True
        cmd.sub_command_class.sub_command_class.force.value = True

        self.log.info("Resetting DAOS server storage: %s", str(cmd))
        result = pcmd(self._hosts, str(cmd), timeout=120)
        if len(result) > 1 or 0 not in result:
            raise ServerFailed("Error resetting NVMe storage")

    def set_scm_mount_ownership(self, user=None, verbose=False):
        """Set the ownership to the specified user for each scm mount.

        Args:
            user (str, optional): user name. Defaults to None - current user.
            verbose (bool, optional): display commands. Defaults to False.
        """
        user = getpass.getuser() if user is None else user

        cmd_list = set()
        for server_params in self.manager.job.yaml.server_params:
            scm_mount = server_params.scm_mount.value

            # Support single or multiple scm_mount points
            if not isinstance(scm_mount, list):
                scm_mount = [scm_mount]

            self.log.info("Changing ownership to %s for: %s", user, scm_mount)
            cmd_list.add(
                "sudo chown -R {0}:{0} {1}".format(user, " ".join(scm_mount)))

        if cmd_list:
            pcmd(self._hosts, "; ".join(cmd_list), verbose)

    def start(self):
        """Start the server through the job manager."""
        # Prepare the servers
        self.prepare()

        # Start the servers and wait for them to be ready for storage format
        self.detect_format_ready()

        # Format storage and wait for server to change ownership
        self.log.info(
            "<SERVER> Formatting hosts: <%s>", self.dmg.hostlist)
        self.dmg.storage_format()

        # Wait for all the daos_io_servers to start
        self.detect_io_server_start()

        return True

    def stop(self):
        """Stop the server through the runner."""
        self.log.info(
            "<SERVER> Stopping server %s command", self.manager.command)

        # Maintain a running list of errors detected trying to stop
        messages = []

        # Stop the subprocess running the job manager command
        try:
            super(DaosServerManager, self).stop()
        except CommandFailure as error:
            messages.append(
                "Error stopping the {} subprocess: {}".format(
                    self.manager.command, error))

        # Kill any leftover processes that may not have been stopped correctly
        self.kill()

        if self.manager.job.using_nvme:
            # Reset the storage
            try:
                self.reset_storage()
            except ServerFailed as error:
                messages.append(str(error))

            # Make sure the mount directory belongs to non-root user
            self.set_scm_mount_ownership()

        # Report any errors after all stop actions have been attempted
        if messages:
            raise ServerFailed(
                "Failed to stop servers:\n  {}".format("\n  ".join(messages)))

    def get_environment_value(self, name):
        """Get the server config value associated with the env variable name.

        Args:
            name (str): environment variable name for which to get a
                daos_server configuration value

        Raises:
            ServerFailed: Unable to find a daos_server configuration value for
                the specified environment variable name

        Returns:
            str: the daos_server configuration value for the specified
                environment variable name
        """
        try:
            setting = self.ENVIRONMENT_VARIABLE_MAPPING[name]
        # Fix: a missing dict key raises KeyError; the original caught
        # IndexError and would never convert the error into a ServerFailed.
        except KeyError:
            raise ServerFailed(
                "Unknown server config setting mapping for the {} environment "
                "variable!".format(name))

        return self.get_config_value(setting)
class NvmeHealth(ServerFillUp):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: To validate NVMe health test cases
    :avocado: recursive
    """
    @skipForTicket("DAOS-7011")
    def test_monitor_for_large_pools(self):
        """Jira ID: DAOS-4722.

        Test Description: Test Health monitor for large number of pools.
        Use Case: This tests will create the 40 number of pools and verify
        the dmg list-pools, device-health and nvme-health works for all pools.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=nvme_health
        """
        # pylint: disable=attribute-defined-outside-init
        # pylint: disable=too-many-branches
        # Number of pools to create comes from the test yaml
        no_of_pools = self.params.get("number_of_pools", '/run/pool/*')

        # Stop the servers to run SPDK too to get the server capacity
        self.stop_servers()
        storage = self.get_nvme_max_capacity()
        self.start_servers()

        # Use 80% of the available NVMe capacity, split evenly across pools
        single_pool_nvme_size = int((storage * 0.80) / no_of_pools)

        self.pool = []
        # Create the Large number of pools
        for _pool in range(no_of_pools):
            pool = TestPool(self.context, self.get_dmg_command())
            pool.get_params(self)
            # SCM size is 10% of NVMe
            pool.scm_size.update('{}'.format(int(single_pool_nvme_size * 0.10)))
            pool.nvme_size.update('{}'.format(single_pool_nvme_size))
            pool.create()
            self.pool.append(pool)

        # Initialize the dmg command used for the storage queries below
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")

        # List all pools: prime dmg for "storage query list-pools"
        self.dmg.set_sub_command("storage")
        self.dmg.sub_command_class.set_sub_command("query")
        self.dmg.sub_command_class.sub_command_class.set_sub_command(
            "list-pools")
        # Run the query against each server host individually
        for host in self.hostlist_servers:
            self.dmg.hostlist = host
            try:
                result = self.dmg.run()
            except CommandFailure as error:
                self.fail("dmg command failed: {}".format(error))
            # Verify every created pool UUID is listed as part of the query
            for pool in self.pool:
                if pool.uuid.lower() not in result.stdout_text:
                    self.fail('Pool uuid {} not found in smd query'.format(
                        pool.uuid.lower()))

        # Get the device ID from all the servers.
        device_ids = get_device_ids(self.dmg, self.hostlist_servers)

        # Get the device health and verify each device reports State:NORMAL
        for host in device_ids:
            self.dmg.hostlist = host
            for _dev in device_ids[host]:
                try:
                    result = self.dmg.storage_query_device_health(_dev)
                except CommandFailure as error:
                    self.fail("dmg get device states failed {}".format(error))
                if 'State:NORMAL' not in result.stdout_text:
                    self.fail("device {} on host {} is not NORMAL".format(
                        _dev, host))

        # Get the nvme-health; failure of the scan itself fails the test
        # NOTE(review): last host assigned in the loop above remains the dmg
        # hostlist target here -- confirm that is the intent.
        try:
            self.dmg.storage_scan_nvme_health()
        except CommandFailure as error:
            self.fail("dmg storage scan --nvme-health failed {}".format(error))
class CSumErrorLog(DaosCoreBase):
    """Verify NVMe checksum error counters.

    Test Class Description: This test runs daos_test -z (Checksum tests)
    and verifies whether Checksum Error Counters are incremented in the
    NVME device due to checksum fault injection.

    :avocado: recursive
    """
    # pylint: disable=too-many-instance-attributes

    def setUp(self):
        """Set up a dmg command primed with the 'storage query' subcommand."""
        super(CSumErrorLog, self).setUp()
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.hostlist = self.hostlist_servers[0]
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")
        self.dmg.set_sub_command("storage")
        self.dmg.sub_command_class.set_sub_command("query")

    def get_nvme_device_id(self):
        """Return the UUID of the first device reported by 'storage query smd'.

        Returns:
            str: device UUID, or None if no UUID line was found

        """
        # Narrow the prepared "storage query" command down to "smd" and
        # request both device and pool information.
        query = self.dmg.sub_command_class.sub_command_class
        query.set_sub_command("smd")
        smd = query.sub_command_class
        smd.devices.value = True
        smd.pools.value = True
        try:
            result = self.dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))
        # The device id is the second token of the first "UUID: ..." line.
        for raw_line in result.stdout.splitlines():
            text = raw_line.strip()
            if text.startswith("UUID:"):
                return text.split()[1]
        return None

    def get_checksum_error_value(self, device_id=None):
        """Return the checksum error count for the given device UUID.

        Args:
            device_id (str, optional): NVMe device UUID. Defaults to None.

        Returns:
            int: checksum error count, or None if no Checksum line was found

        """
        if device_id is None:
            self.fail("No device id provided")
            return
        # Target the blobstore-health query at the requested device.
        health = self.dmg.sub_command_class.sub_command_class
        health.set_sub_command("blobstore-health")
        health.sub_command_class.devuuid.value = "{}".format(device_id)
        try:
            result = self.dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))
        # The counter is the third token of the first "Checksum ..." line.
        for raw_line in result.stdout.splitlines():
            text = raw_line.strip()
            if text.startswith("Checksum"):
                return int(text.split()[2])
        return None

    def test_csum_error_logging(self):
        """Verify the checksum error counter grows across the subtest run.

        Test ID: DAOS-3927
        Test Description: Write Avocado Test to verify single data after
        pool/container disconnect/reconnect.

        :avocado: tags=all,pr,hw,medium,ib2,csum_error_log
        """
        dev_id = self.get_nvme_device_id()
        self.log.info("%s", dev_id)
        before = self.get_checksum_error_value(dev_id)
        self.log.info("Checksum Errors : %d", before)
        DaosCoreBase.run_subtest(self)
        after = self.get_checksum_error_value(dev_id)
        self.log.info("Checksum Errors : %d", after)
        self.assertTrue(after > before, "Checksum Error Log not incremented")
        self.log.info("Checksum Error Logging Test Passed")
class CSumErrorLog(DaosCoreBase):
    """Verify NVMe checksum error counters via dmg JSON output.

    Test Class Description: This test runs daos_test -z (Checksum tests)
    and verifies whether Checksum Error Counters are incremented in the
    NVME device due to checksum fault injection.

    :avocado: recursive
    """
    # pylint: disable=too-many-instance-attributes

    def setUp(self):
        """Set up a dmg command primed with the 'storage query' subcommand."""
        super(CSumErrorLog, self).setUp()
        self.dmg = DmgCommand(os.path.join(self.prefix, "bin"))
        self.dmg.get_params(self)
        self.dmg.hostlist = self.hostlist_servers[0]
        self.dmg.insecure.update(
            self.server_managers[0].get_config_value("allow_insecure"),
            "dmg.insecure")
        self.dmg.set_sub_command("storage")
        self.dmg.sub_command_class.set_sub_command("query")

    def get_nvme_device_id(self):
        """Return the UUID of the first SMD device from 'query list-devices'.

        Returns:
            str: UUID of the first device found on any host, or None

        """
        self.dmg.json.value = True
        query = self.dmg.sub_command_class.sub_command_class
        query.set_sub_command("list-devices")
        try:
            result = self.dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))
        data = json.loads(result.stdout)
        # Any per-host error reported in the JSON payload fails the test.
        if data['host_errors']:
            self.fail("dmg command failed: {}".format(data['host_errors']))
        for host_info in data['host_storage_map'].values():
            devices = host_info['storage']['smd_info']['devices']
            if devices:
                return devices[0]['uuid']
        return None

    def get_checksum_error_value(self, device_id=None):
        """Return the checksum error counter for the given device UUID.

        Args:
            device_id (str, optional): NVMe device UUID. Defaults to None.

        Returns:
            int: checksum error count, or None if no device was reported

        """
        if device_id is None:
            self.fail("No device id provided")
            return
        self.dmg.json.value = True
        query = self.dmg.sub_command_class.sub_command_class
        query.set_sub_command("device-health")
        query.sub_command_class.uuid.value = device_id
        try:
            result = self.dmg.run()
        except process.CmdError as details:
            self.fail("dmg command failed: {}".format(details))
        data = json.loads(result.stdout)
        # Any per-host error reported in the JSON payload fails the test.
        if data['host_errors']:
            self.fail("dmg command failed: {}".format(data['host_errors']))
        for host_info in data['host_storage_map'].values():
            devices = host_info['storage']['smd_info']['devices']
            if devices:
                return devices[0]['health']['checksum_errors']
        return None

    def test_csum_error_logging(self):
        """Verify the checksum error counter grows across the subtest run.

        Test ID: DAOS-3927
        Test Description: Write Avocado Test to verify single data after
        pool/container disconnect/reconnect.

        :avocado: tags=all,pr,hw,medium,ib2,csum_error_log
        """
        dev_id = self.get_nvme_device_id()
        self.log.info("%s", dev_id)
        before = self.get_checksum_error_value(dev_id)
        self.log.info("Checksum Errors : %d", before)
        DaosCoreBase.run_subtest(self)
        after = self.get_checksum_error_value(dev_id)
        self.log.info("Checksum Errors : %d", after)
        self.assertTrue(after > before, "Checksum Error Log not incremented")
        self.log.info("Checksum Error Logging Test Passed")
def test_create(self): """Test dmg pool create and destroy with various parameters. Create a pool and verify that the pool was created by comparing the UUID returned from the dmg command against the directory name in /mnt/daos Destroy the pool and verify that the directory is deleted. :avocado: tags=all,pool,full_regression,small,multitarget """ # Create a dmg command object dmg = DmgCommand(self.bin) dmg.get_params(self) dmg.hostlist.update( self.server_managers[0].runner.job.yaml_params.access_points.value, "dmg.hostlist") # Disable raising an exception if the dmg command fails dmg.exit_status_exception = False # Accumulate a list of pass/fail indicators representing what is # expected for each parameter then "and" them to determine the # expected result of the test expected_for_param = [] userlist = self.params.get("user", '/run/tests/users/*') user = os.getlogin() if userlist[0] == 'valid' else userlist[0] expected_for_param.append(userlist[1]) grouplist = self.params.get("group", '/run/tests/groups/*') group = os.getlogin() if grouplist[0] == 'valid' else grouplist[0] expected_for_param.append(grouplist[1]) systemnamelist = self.params.get("systemname", '/run/tests/systemnames/*') system_name = systemnamelist[0] expected_for_param.append(systemnamelist[1]) tgtlistlist = self.params.get("tgt", '/run/tests/tgtlist/*') tgtlist = tgtlistlist[0] expected_for_param.append(tgtlistlist[1]) # if any parameter is FAIL then the test should FAIL expected_result = RESULT_PASS if RESULT_FAIL in expected_for_param: expected_result = RESULT_FAIL host1 = self.hostlist_servers[0] host2 = self.hostlist_servers[1] test_destroy = True create_result = dmg.pool_create("1GB", user, group, None, tgtlist, None, system_name) if create_result.exit_status == 0: if expected_result == RESULT_FAIL: self.fail( "Test was expected to fail but it passed at pool create.") uuid, _ = get_pool_uuid_service_replicas_from_stdout( create_result.stdout) if '0' in tgtlist: # check_for_pool checks if the 
uuid directory exists in host1 exists = check_for_pool.check_for_pool(host1, uuid) if exists != 0: self.fail("Pool {0} not found on host {1}.\n".format( uuid, host1)) if '1' in tgtlist: exists = check_for_pool.check_for_pool(host2, uuid) if exists != 0: self.fail("Pool {0} not found on host {1}.\n".format( uuid, host2)) else: test_destroy = False if expected_result == RESULT_PASS: self.fail("Test was expected to pass but it failed at pool " + "create.") if test_destroy: destroy_result = dmg.pool_destroy(uuid) if destroy_result.exit_status == 0: if expected_result == RESULT_FAIL: self.fail("Test was expected to fail but it passed at " + "pool create.") if '0' in tgtlist: exists = check_for_pool.check_for_pool(host1, uuid) if exists == 0: self.fail( "Pool {0} found on host {1} after destroy.\n". format(uuid, host1)) if '1' in tgtlist: exists = check_for_pool.check_for_pool(host2, uuid) if exists == 0: self.fail( "Pool {0} found on host {1} after destroy.\n". format(uuid, host2)) else: if expected_result == RESULT_PASS: self.fail("Test was expected to pass but it failed at " + "pool destroy.")