def _run_unit_command(self, command):
    """Run the systemctl command.

    Args:
        command (str): systemctl unit command

    Raises:
        CommandFailure: if there is an issue running the command

    Returns:
        dict: a dictionary of return code keys and accompanying NodeSet
            values indicating which hosts yielded the return code.

    """
    self._systemctl.unit_command.value = command
    self.timestamps[command] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    result = pcmd(self._hosts, self.__str__(), self.verbose, self.timeout)
    if 255 in result:
        raise CommandFailure(
            "Timeout detected running '{}' with a {}s timeout on {}".format(
                self.__str__(), self.timeout, NodeSet.fromlist(result[255])))
    if 0 not in result or len(result) > 1:
        failed = []
        for item, value in list(result.items()):
            if item != 0:
                failed.extend(value)
        raise CommandFailure("Error occurred running '{}' on {}".format(
            self.__str__(), NodeSet.fromlist(failed)))
    return result
def create_mount_point(self):
    """Create dfuse directory.

    Raises:
        CommandFailure: In case of error creating directory

    """
    # Raise exception if mount point not specified
    if self.mount_dir.value is None:
        raise CommandFailure("Mount point not specified, "
                             "check test yaml file")

    # Create the mount point on any host without dfuse already mounted
    state = self.check_mount_state()
    if state["nodirectory"]:
        command = "mkdir -p {}".format(self.mount_dir.value)
        ret_code = pcmd(state["nodirectory"], command, timeout=30)
        if len(ret_code) > 1 or 0 not in ret_code:
            failed_nodes = [
                str(node_set) for code, node_set in list(ret_code.items())
                if code != 0
            ]
            error_hosts = NodeSet(",".join(failed_nodes))
            raise CommandFailure(
                "Error creating the {} dfuse mount point on the "
                "following hosts: {}".format(self.mount_dir.value,
                                             error_hosts))
def update_config_file_from_file(self, dst_hosts, test_dir, generated_yaml):
    """Update config file and object.

    Create and place the new config file in /etc/daos/daos_server.yml
    Then update SCM-related data in engine_params so that those disks will
    be wiped.

    Args:
        dst_hosts (list): Destination server hostnames to place the new
            config file.
        test_dir (str): Directory where the server config data from
            generated_yaml will be written.
        generated_yaml (YAMLObject): New server config data.

    """
    # Create a temporary file in test_dir and write the generated config.
    temp_file_path = os.path.join(test_dir, "temp_server.yml")
    try:
        with open(temp_file_path, 'w') as write_file:
            yaml.dump(generated_yaml, write_file, default_flow_style=False)
    except Exception as error:
        raise CommandFailure(
            "Error writing the yaml file! {}: {}".format(
                temp_file_path, error)) from error

    # Copy the config from temp dir to /etc/daos of the server node.
    default_server_config = get_default_config_file("server")
    try:
        distribute_files(
            dst_hosts, temp_file_path, default_server_config,
            verbose=False, sudo=True)
    except DaosTestError as error:
        raise CommandFailure(
            "ERROR: Copying yaml configuration file to {}: "
            "{}".format(dst_hosts, error)) from error

    # Before restarting daos_server, we need to clear SCM. Unmount the
    # mount point, wipefs the disks, etc. This clearing step is built into
    # the server start steps. It'll look at the engine_params of the
    # server_manager and clear the SCM set there, so we need to overwrite
    # it with the values from the generated config before starting.
    self.log.info("Resetting engine_params")
    self.manager.job.yaml.engine_params = []
    engines = generated_yaml["engines"]
    for i, engine in enumerate(engines):
        self.log.info("engine %d", i)
        for storage_tier in engine["storage"]:
            if storage_tier["class"] != "dcpm":
                continue
            self.log.info("scm_mount = %s", storage_tier["scm_mount"])
            self.log.info("class = %s", storage_tier["class"])
            self.log.info("scm_list = %s", storage_tier["scm_list"])

            per_engine_yaml_parameters = \
                DaosServerYamlParameters.PerEngineYamlParameters(i)
            per_engine_yaml_parameters.scm_mount.update(
                storage_tier["scm_mount"])
            per_engine_yaml_parameters.scm_class.update(
                storage_tier["class"])
            per_engine_yaml_parameters.scm_size.update(None)
            per_engine_yaml_parameters.scm_list.update(
                storage_tier["scm_list"])
            per_engine_yaml_parameters.reset_yaml_data_updated()

            self.manager.job.yaml.engine_params.append(
                per_engine_yaml_parameters)
def run(self, check=True, bind_cores=None):
    # pylint: disable=arguments-differ
    """Run the dfuse command.

    Args:
        check (bool): Check if dfuse mounted properly after mount is
            executed.
        bind_cores (str): List of CPU cores to pass to taskset

    Raises:
        CommandFailure: In case dfuse run command fails

    """
    self.log.info('Starting dfuse at %s', self.mount_dir.value)

    # A log file must be defined to ensure logs are captured
    if "D_LOG_FILE" not in self.env:
        raise CommandFailure(
            "Dfuse missing environment variables for D_LOG_FILE")

    if 'D_LOG_MASK' not in self.env:
        self.env['D_LOG_MASK'] = 'INFO'

    # create the dfuse dir if it does not exist
    self.create_mount_point()

    # run dfuse command
    cmd = self.env.get_export_str()
    if bind_cores:
        cmd += 'taskset -c {} '.format(bind_cores)
    cmd += str(self)
    self.log.info("Command is '%s'", cmd)
    ret_code = pcmd(self.hosts, cmd, timeout=30)

    if 0 in ret_code:
        self.running_hosts.add(ret_code[0])
        del ret_code[0]

    if ret_code:
        error_hosts = NodeSet(",".join([
            str(node_set) for code, node_set in list(ret_code.items())
            if code != 0
        ]))
        raise CommandFailure(
            "Error starting dfuse on the following hosts: {}".format(
                error_hosts))

    if check:
        # Dfuse will block in the command for the mount to complete, even
        # if run in background mode, so it should be possible to start
        # using it immediately after the command returns.
        if not self.check_running(fail_on_error=False):
            self.log.info('Waiting two seconds for dfuse to start')
            time.sleep(2)
            if not self.check_running(fail_on_error=False):
                self.log.info('Waiting five seconds for dfuse to start')
                time.sleep(5)
                self.check_running()
def remove_mount_point(self, fail=True):
    """Remove dfuse directory.

    Try once with a simple rmdir, which should succeed; if it does not,
    try again with rm -rf, but still raise an error.

    Raises:
        CommandFailure: In case of error deleting directory

    """
    # raise exception if mount point not specified
    if self.mount_dir.value is None:
        raise CommandFailure("Mount point not specified, "
                             "check test yaml file")

    dir_exists, clean_nodes = check_file_exists(
        self.hosts, self.mount_dir.value, directory=True)
    if dir_exists:
        target_nodes = list(self.hosts)
        if clean_nodes:
            target_nodes.remove(clean_nodes)

        self.log.info("Removing the %s dfuse mount point on %s",
                      self.mount_dir.value, target_nodes)

        cmd = "rmdir {}".format(self.mount_dir.value)
        ret_code = pcmd(target_nodes, cmd, timeout=30)
        if len(ret_code) == 1 and 0 in ret_code:
            return

        failed_nodes = NodeSet(",".join([
            str(node_set) for code, node_set in list(ret_code.items())
            if code != 0
        ]))

        cmd = "rm -rf {}".format(self.mount_dir.value)
        ret_code = pcmd(failed_nodes, cmd, timeout=30)
        if len(ret_code) > 1 or 0 not in ret_code:
            error_hosts = NodeSet(",".join([
                str(node_set) for code, node_set in list(ret_code.items())
                if code != 0
            ]))
            if fail:
                raise CommandFailure(
                    "Error removing the {} dfuse mount point with rm on "
                    "the following hosts: {}".format(
                        self.mount_dir.value, error_hosts))
        if fail:
            raise CommandFailure(
                "Error removing the {} dfuse mount point with rmdir on "
                "the following hosts: {}".format(self.mount_dir.value,
                                                 failed_nodes))
    else:
        self.log.info("No %s dfuse mount point directory found on %s",
                      self.mount_dir.value, self.hosts)
def get_aggregate_total(self, processes):
    """Get the total bytes expected to be written by ior.

    Args:
        processes (int): number of processes running the ior command

    Returns:
        int: total number of bytes written

    Raises:
        CommandFailure: if there is an error obtaining the aggregate total

    """
    power = {"k": 1, "m": 2, "g": 3, "t": 4}
    total = processes
    for name in ("block_size", "segment_count"):
        item = getattr(self, name).value
        if item:
            sub_item = re.split(r"([^\d])", str(item))
            if int(sub_item[0]) > 0:
                total *= int(sub_item[0])
                if len(sub_item) > 1:
                    key = sub_item[1].lower()
                    if key in power:
                        total *= 1024**power[key]
                    else:
                        raise CommandFailure(
                            "Error obtaining the IOR aggregate total from "
                            "the {} - bad key: value: {}, split: {}, "
                            "key: {}".format(name, item, sub_item, key))
            else:
                raise CommandFailure(
                    "Error obtaining the IOR aggregate total from the {}: "
                    "value: {}, split: {}".format(name, item, sub_item))

    # Account for any replicas, except for the ones with no replication
    # i.e all object classes starting with "S". Eg: S1,S2,...,SX.
    if not self.dfs_oclass.value.startswith("S"):
        replica_qty = 1
        try:
            # Extract the replica quantity from the object class string
            replica_qty = int(re.findall(r"\d+", self.dfs_oclass.value)[0])
        except (TypeError, IndexError):
            # If the daos object class is undefined (TypeError) or it does
            # not contain any numbers (IndexError) then there is only one
            # replica.
            pass
        finally:
            total *= replica_qty

    return total
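# Illustrative, self-contained sketch of the aggregate-total arithmetic used
# by get_aggregate_total() above.  The ior parameter values below are
# hypothetical and not taken from any test yaml.
def _example_aggregate_total():
    import re

    processes = 8            # hypothetical ior process count
    block_size = "1m"        # 1 MiB blocks
    segment_count = 16
    dfs_oclass = "RP_2G1"    # 2-way replicated object class

    power = {"k": 1, "m": 2, "g": 3, "t": 4}
    total = processes
    for item in (block_size, segment_count):
        sub_item = re.split(r"([^\d])", str(item))
        total *= int(sub_item[0])
        if len(sub_item) > 1 and sub_item[1].lower() in power:
            total *= 1024 ** power[sub_item[1].lower()]

    # Object classes not starting with "S" carry a replica count, e.g. the
    # "2" in "RP_2G1", which multiplies the expected aggregate total.
    if not dfs_oclass.startswith("S"):
        total *= int(re.findall(r"\d+", dfs_oclass)[0])

    return total  # 8 * 1 MiB * 16 * 2 replicas = 268435456 bytes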
def run(self):
    """Start the job's service via the systemctl command.

    Enable the service, start the service, and report the status of the
    service.  If an error occurs with any of these commands, also display
    the journalctl output for the service.

    Raises:
        CommandFailure: if unable to enable or start the service

    Returns:
        dict: a dictionary of return code keys and accompanying NodeSet
            values indicating which hosts yielded the return code.

    """
    # Start the daos_server.service
    self.service_enable()
    result = self.service_start()

    # result = self.service_status()

    # Determine if the command has launched correctly using its
    # check_subprocess_status() method.
    if not self.check_subprocess_status(None):
        msg = "Command '{}' did not launch correctly".format(self)
        self.log.error(msg)
        raise CommandFailure(msg)

    return result
def check_running(self, fail_on_error=True):
    """Check dfuse is running.

    Run a command to verify dfuse is running on hosts where it is supposed
    to be.  Use grep -v and rc=1 here so that if it isn't, then we can see
    what is being used instead.

    Args:
        fail_on_error (bool, optional): should an exception be raised if
            an error is detected. Defaults to True.

    Raises:
        CommandFailure: raised if dfuse is found not running on any
            expected nodes and fail_on_error is set.

    Returns:
        bool: whether or not dfuse is running

    """
    status = True
    state = self.check_mount_state(self.running_hosts)
    if state["unmounted"] or state["nodirectory"]:
        self.log.error(
            "Error: dfuse not running on %s",
            str(state["unmounted"].union(state["nodirectory"])))
        status = False
        if fail_on_error:
            raise CommandFailure("dfuse not running")
    return status
def _execute_cmd(self, cmd):
    """Execute command on the host clients.

    Args:
        cmd (str): Command to run

    Returns:
        dict: a dictionary of return code keys and accompanying NodeSet
            values indicating which hosts yielded the return code.

    """
    try:
        # execute bash cmds
        ret = pcmd(self.dfuse_hosts, cmd, verbose=True, timeout=30)
        if 0 not in ret:
            error_hosts = NodeSet(",".join([
                str(node_set) for code, node_set in list(ret.items())
                if code != 0
            ]))
            raise CommandFailure("Error running '{}' on the following "
                                 "hosts: {}".format(cmd, error_hosts))
    # report error if any command fails
    except CommandFailure as error:
        self.log.error("DfuseSparseFile Test Failed: %s", str(error))
        self.fail("Test was expected to pass but it failed.\n")
    return ret
def get_dmg_network_information(dmg_network_scan):
    """Get the network device information from the dmg network scan output.

    Args:
        dmg_network_scan (dict): the dmg network scan json command output

    Raises:
        CommandFailure: if there was an error processing the dmg network
            scan output

    Returns:
        list: a list of NetworkDevice objects identifying the network
            devices on each host

    """
    network_devices = []

    try:
        for host_fabric in dmg_network_scan["response"]["HostFabrics"].values():
            for host in NodeSet(host_fabric["HostSet"].split(":")[0]):
                for interface in host_fabric["HostFabric"]["Interfaces"]:
                    network_devices.append(
                        NetworkDevice(host, interface["Device"], None, 1,
                                      interface["Provider"],
                                      interface["NumaNode"]))
    except KeyError as error:
        raise CommandFailure(
            f"Error processing dmg network scan json output: {dmg_network_scan}"
        ) from error

    return network_devices
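# A minimal, hypothetical usage sketch for get_dmg_network_information().
# The input dict is shaped only to match the keys read by the parser above
# ("response", "HostFabrics", "HostSet", "HostFabric", "Interfaces",
# "Device", "Provider", "NumaNode"); real dmg network scan json output
# contains additional fields, and the host/provider values here are made up.
def _example_get_dmg_network_information():
    sample_scan = {
        "response": {
            "HostFabrics": {
                "fabric-0": {
                    "HostSet": "host-[1-2]:10001",
                    "HostFabric": {
                        "Interfaces": [
                            {"Device": "ib0",
                             "Provider": "ofi+verbs",
                             "NumaNode": 0},
                        ],
                    },
                },
            },
        },
    }
    # Yields one NetworkDevice per host in host-[1-2] for interface ib0.
    return get_dmg_network_information(sample_scan)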
def _run_process(self):
    """Run the command as a foreground process.

    Raises:
        CommandFailure: if there is an error running the command

    """
    if self._hosts is None:
        # Run fio locally
        self.log.debug("Running: %s", self.__str__())
        super()._run_process()
    else:
        # Run fio remotely
        self.log.debug("Running: %s", self.__str__())
        ret_codes = pcmd(self._hosts, self.__str__())

        # Report any failures
        if len(ret_codes) > 1 or 0 not in ret_codes:
            failed = [
                "{}: rc={}".format(val, key)
                for key, val in list(ret_codes.items()) if key != 0
            ]
            raise CommandFailure(
                "Error running fio on the following hosts: {}".format(
                    ", ".join(failed)))
def create_yaml(self, filename=None):
    """Create a yaml file from the parameter values.

    A yaml file will only be created if at least one of its parameter
    values has been updated (BasicParameter.updated = True).

    Args:
        filename (str, optional): the yaml file to generate with the
            parameters. Defaults to None, which uses self.filename.

    Raises:
        CommandFailure: if there is an error creating the yaml file

    Returns:
        bool: whether or not an updated yaml file was created

    """
    create_yaml = self.is_yaml_data_updated()
    if create_yaml:
        # Write a new yaml file if any of the parameters have been updated
        if filename is None:
            filename = self.filename
        yaml_data = self.get_yaml_data()
        self.log.info("Writing yaml configuration file %s", filename)
        try:
            with open(filename, 'w') as write_file:
                yaml.dump(yaml_data, write_file, default_flow_style=False)
        except Exception as error:
            raise CommandFailure(
                "Error writing the yaml file {}: {}".format(
                    filename, error)) from error
        self.reset_yaml_data_updated()
    return create_yaml
def get_device_ids(dmg, servers):
    """Get the NVMe device IDs from the servers.

    Args:
        dmg (DmgCommand): DmgCommand class instance.
        servers (list): list of server hosts.

    Returns:
        dict: device UUIDs per server host.

    """
    devices = {}
    dmg.set_sub_command("storage")
    dmg.sub_command_class.set_sub_command("query")
    dmg.sub_command_class.sub_command_class.set_sub_command("list-devices")
    for host in servers:
        dmg.hostlist = host
        try:
            result = dmg.run()
        except CommandFailure as _error:
            raise CommandFailure(
                "dmg list-devices failed with error {}".format(
                    _error)) from _error
        drive_list = []
        for line in result.stdout_text.split('\n'):
            if 'UUID' in line:
                drive_list.append(line.split('UUID:')[1].split(' ')[0])
        devices[host] = drive_list
    return devices
def stop(self):
    """Stop the agent through the job manager.

    Raises:
        CommandFailure: if there was an error stopping the agents.

    """
    self.log.info("<AGENT> Stopping agent %s command", self.manager.command)

    # Maintain a running list of errors detected trying to stop
    messages = []

    # Stop the subprocess running the manager command
    try:
        super().stop()
    except CommandFailure as error:
        messages.append(
            "Error stopping the {} subprocess: {}".format(
                self.manager.command, error))

    # Kill any leftover processes that may not have been stopped correctly
    self.manager.kill()

    # Report any errors after all stop actions have been attempted
    if messages:
        raise CommandFailure(
            "Failed to stop agents:\n {}".format("\n ".join(messages)))
def _run_cmd(self, cmd):
    """Run the command on the client hosts.

    Args:
        cmd (str): command to run

    Raises:
        CommandFailure: if the command fails on any of the client hosts

    """
    ret_code = general_utils.pcmd(self.hostlist_clients, cmd, timeout=180)
    if 0 not in ret_code:
        error_hosts = NodeSet(",".join(
            [str(v) for k, v in list(ret_code.items()) if k != 0]))
        raise CommandFailure(
            "Error running '{}' on the following hosts: {}".format(
                cmd, error_hosts))
def update_params(self, **params):
    """Update each of the provided parameter name and value pairs.

    Args:
        params (dict): parameter name and value pairs to update

    Raises:
        CommandFailure: if a parameter name does not match an attribute

    """
    for name, value in params.items():
        try:
            getattr(self, name).update(value, name)
        except AttributeError as error:
            raise CommandFailure(
                "Unknown parameter: {}".format(name)) from error
def set_query_data(self, show_enabled=False, show_disabled=False):
    """Execute dmg pool query and store the results.

    Args:
        show_enabled (bool, optional): Display enabled ranks.
        show_disabled (bool, optional): Display disabled ranks.

    Only supported with the dmg control method.

    """
    self.query_data = {}
    if self.pool:
        if self.dmg:
            end_time = None
            if self.pool_query_timeout.value is not None:
                self.log.info(
                    "Waiting for pool %s query to be responsive with a %s "
                    "second timeout",
                    self.identifier, self.pool_query_timeout.value)
                end_time = time() + self.pool_query_timeout.value

            while True:
                try:
                    self.query_data = self.dmg.pool_query(
                        self.identifier, show_enabled, show_disabled)
                    break
                except CommandFailure as error:
                    if end_time is not None:
                        self.log.info(
                            "Pool %s query still non-responsive: %s",
                            self.identifier, str(error))
                        if time() > end_time:
                            raise CommandFailure(
                                "TIMEOUT detected after {} seconds while "
                                "waiting for pool {} query response. This "
                                "timeout can be adjusted via the "
                                "'pool/pool_query_timeout' test yaml "
                                "parameter.".format(
                                    self.pool_query_timeout.value,
                                    self.identifier)) from error
                    else:
                        raise CommandFailure(error) from error
        else:
            self.log.error("Error: Undefined dmg command")
def get_params(self, test):
    """Get values for all of the command params from the yaml file.

    Autosize any size/scm_size/nvme_size parameter whose value ends in "%".
    Also create a unique label by adding the incremented number prefix.

    Args:
        test (Test): avocado Test object

    """
    super().get_params(test)

    # Autosize any size/scm_size/nvme_size parameters
    # pylint: disable=too-many-boolean-expressions
    if ((self.scm_size.value is not None
         and str(self.scm_size.value).endswith("%"))
            or (self.nvme_size.value is not None
                and str(self.nvme_size.value).endswith("%"))):
        index = self.server_index.value
        try:
            params = test.server_managers[index].autosize_pool_params(
                size=None,
                tier_ratio=None,
                scm_size=self.scm_size.value,
                nvme_size=self.nvme_size.value,
                min_targets=self.min_targets.value,
                quantity=self.quantity.value)
        except ServerFailed as error:
            test.fail(
                "Failure autosizing pool parameters: {}".format(error))
        except AutosizeCancel as error:
            test.cancel(error)

        # Update the pool parameters with any autosized values
        for name in params:
            test_pool_param = getattr(self, name)
            test_pool_param.update(params[name], name)

            # Cache the autosized value so we do not calculate it again
            # pylint: disable=protected-access
            cache_id = (name, self.namespace, test_pool_param._default)
            test.params._cache[cache_id] = params[name]

    # Use a unique pool label if using pool labels
    if self.label.value is not None:
        if not isinstance(self.label_generator, LabelGenerator):
            raise CommandFailure(
                "Unable to create a unique pool label; Undefined "
                "label_generator")
        self.label.update(self.label_generator.get_label(self.label.value))
def assert_on_exception(self, out_queue=None):
    """Assert on an exception raised while executing an application.

    Args:
        out_queue (queue): Check whether the queue is empty. If empty,
            the app (ior, mdtest) didn't encounter an error.

    Raises:
        CommandFailure: if the queue contains an exception from the app

    """
    if out_queue is None:
        out_queue = self.out_queue
    if not out_queue.empty():
        exc = out_queue.get(block=False)
        # Put the exception back on the queue so other consumers can
        # still inspect it
        out_queue.put(exc)
        raise CommandFailure(exc)
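# A minimal, hypothetical usage sketch for assert_on_exception() above.
# 'job' stands for any object providing assert_on_exception() (e.g. the test
# class this method belongs to), and the queued RuntimeError simulates an
# error that a worker thread running ior/mdtest would normally report.
def _example_assert_on_exception(job):
    import queue

    out_queue = queue.Queue()
    out_queue.put(RuntimeError("simulated ior failure"))

    try:
        job.assert_on_exception(out_queue)
    except CommandFailure as error:
        # The original exception was put back on the queue before the
        # raise, so it can still be inspected by other consumers.
        assert not out_queue.empty()
        return error
    return None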
def _report_unit_command(self, command):
    """Run the systemctl command and report the log data on an error.

    Args:
        command (str): systemctl unit command

    Raises:
        CommandFailure: if there is an issue running the command

    Returns:
        dict: a dictionary of return code keys and accompanying NodeSet
            values indicating which hosts yielded the return code.

    """
    try:
        return self._run_unit_command(command)
    except CommandFailure as error:
        self.log.info(error)
        self.display_log_data(
            self.get_log_data(self._hosts, self.timestamps[command]))
        raise CommandFailure(error) from error
def stop(self, pool=None):
    """Stop the ior command when the job manager was run as a subprocess.

    Args:
        pool (TestPool, optional): if provided the pool space will be
            displayed after attempting to stop the ior command.
            Defaults to None.

    Raises:
        CommandFailure: if there is an error stopping the ior subprocess

    """
    if self.manager.run_as_subprocess:
        error_message = None
        try:
            self.manager.stop()
        except CommandFailure as error:
            error_message = "IOR Failed: {}".format(error)
        finally:
            if pool:
                self.display_pool_space(pool)
        if error_message:
            raise CommandFailure(error_message)
def run(self):
    """Run the daos_racer command remotely.

    Raises:
        CommandFailure: if there is an error running the command

    """
    # Run daos_racer on the specified host
    self.log.info(
        "Running %s on %s with %s timeout", self.__str__(), self.host,
        "no" if self.clush_timeout.value is None
        else "a {}s".format(self.clush_timeout.value))
    return_codes = pcmd(
        [self.host], self.__str__(), True, self.clush_timeout.value)
    if 0 not in return_codes or len(return_codes) > 1:
        # Kill the daos_racer process if the remote command timed out
        if 255 in return_codes:
            self.log.info(
                "Stopping timed out daos_racer process on %s", self.host)
            pcmd([self.host], "pkill daos_racer", True)
        raise CommandFailure("Error running '{}'".format(self._command))

    self.log.info("Test passed!")
def _execute_command(self, command, fail_on_err=True, display_output=True,
                     hosts=None):
    """Execute the command on all client hosts.

    Optionally verify if the command returns a non-zero return code.

    Args:
        command (str): the command to execute on the client hosts
        fail_on_err (bool, optional): whether or not to fail the test if
            the command returns a non-zero return code. Defaults to True.
        display_output (bool, optional): whether or not to display output.
            Defaults to True.
        hosts (list, optional): hosts on which to run the command.
            Defaults to None, which uses the client host list.

    Raises:
        CommandFailure: if 'fail_on_err' is set and the command fails on
            at least one of the client hosts

    Returns:
        dict: a dictionary of return code keys and accompanying NodeSet
            values indicating which hosts yielded the return code.

    """
    if hosts is None:
        hosts = self.hostlist_clients
    result = pcmd(hosts, command, verbose=display_output, timeout=300)
    if 0 not in result and fail_on_err:
        hosts = [
            str(nodes) for code, nodes in list(result.items())
            if code != 0
        ]
        raise CommandFailure(
            "Error running '{}' on the following hosts: {}".format(
                command, NodeSet(",".join(hosts))))
    return result
def run_ior_loop(manager, uuids, tmpdir_base):
    """Run ior for each container UUID provided.

    Args:
        manager (str): mpi job manager command
        uuids (list): list of container UUIDs
        tmpdir_base (str): base directory for the mpi orte_tmpdir_base mca
            parameter

    Raises:
        CommandFailure: if any of the ior commands fail

    Returns:
        list: a list of CmdResults from each ior command run

    """
    results = []
    errors = []
    for index, cont_uuid in enumerate(uuids):
        manager.job.dfs_cont.update(cont_uuid, "ior.cont_uuid")

        # Create a unique temporary directory for the manager command
        tmp_dir = mkdtemp(dir=tmpdir_base)
        manager.tmpdir_base.update(tmp_dir, "tmpdir_base")

        try:
            results.append(manager.run())
        except CommandFailure as error:
            ior_mode = "read" if "-r" in manager.job.flags.value else "write"
            errors.append(
                "IOR {} Loop {}/{} failed for container {}: {}".format(
                    ior_mode, index, len(uuids), cont_uuid, error))
        finally:
            # Remove the unique temporary directory and its contents to
            # avoid conflicts
            shutil.rmtree(tmp_dir, ignore_errors=True)

    if errors:
        raise CommandFailure(
            "IOR failed in {}/{} loops: {}".format(
                len(errors), len(uuids), "\n".join(errors)))
    return results
def test_parallelio(self):
    """Jira ID: DAOS-3775.

    Test Description:
        Purpose of this test is to mount dfuse and verify multiple
        containers using fio.

    Use cases:
        Mount dfuse using pool uuid.
        Create multiple containers under that dfuse mount point.
        Check those containers are accessible from that mount point.
        Perform io to those containers using FIO.
        Delete one of the containers.
        Check if dfuse is still running. If not, fail the test and exit.
        Otherwise, try accessing the deleted container.
        This should fail.
        Check dfuse again.

    :avocado: tags=all,full_regression
    :avocado: tags=hw,medium,ib2
    :avocado: tags=daosio,tx,dfuse
    :avocado: tags=parallelio
    """
    # get test params for cont and pool count
    self.cont_count = self.params.get("cont_count", '/run/container/*')

    threads = []

    # Create a pool and start dfuse.
    self.create_pool()
    self.start_dfuse(self.hostlist_clients, self.pool[0], None)
    # create multiple containers
    self.add_container_qty(self.cont_count, self.pool[0])

    # check if all the created containers can be accessed and perform
    # io on each container using fio in parallel
    for _, cont in enumerate(self.container):
        dfuse_cont_dir = self.dfuse.mount_dir.value + "/" + cont.uuid
        cmd = "ls -a {}".format(dfuse_cont_dir)
        try:
            # execute bash cmds
            ret_code = general_utils.pcmd(
                self.hostlist_clients, cmd, timeout=30)
            if 0 not in ret_code:
                error_hosts = NodeSet(",".join([
                    str(node_set) for code, node_set in
                    list(ret_code.items()) if code != 0
                ]))
                raise CommandFailure(
                    "Error running '{}' on the following "
                    "hosts: {}".format(cmd, error_hosts))
        # report error if any command fails
        except CommandFailure as error:
            self.log.error("ParallelIo Test Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")

        # run fio on all containers
        thread = threading.Thread(
            target=self.execute_fio,
            args=(self.dfuse.mount_dir.value + "/" + cont.uuid, False))
        threads.append(thread)
        thread.start()

    # wait for all fio jobs to be finished
    for job in threads:
        job.join()

    # destroy first container
    container_to_destroy = self.container[0].uuid
    self.container[0].destroy(1)

    # check dfuse if it is running fine
    self.dfuse.check_running()

    # try accessing destroyed container, it should fail
    try:
        self.execute_fio(
            self.dfuse.mount_dir.value + "/" + container_to_destroy, False)
        self.fail(
            "Fio was able to access destroyed container: {}".format(
                self.container[0].uuid))
    except CommandFailure as error:
        self.log.info("This run is expected to fail: %s", error)

        # check dfuse is still running after attempting to access deleted
        # container.
        self.dfuse.check_running()
def test_config_generate_run(self):
    """Run daos_server with generated server config file.

    1. Start daos_server.
    2. Call dmg config generate with different parameters.
    3. Store the generated output to a temporary directory - self.test_dir
    4. Copy the generated output from the temp dir to /etc/daos of the
       server node.
    5. Stop daos_server.
    6. Restart daos_server.

    See yaml for the test cases.

    Note: When running locally, use 50 sec timeout in
    DaosServerCommand.__init__()

    :avocado: tags=all,full_regression
    :avocado: tags=hw,small
    :avocado: tags=control,config_generate_entries,config_generate_run
    """
    num_engines = self.params.get(
        "num_engines", "/run/config_generate_params/*/")
    min_ssds = self.params.get(
        "min_ssds", "/run/config_generate_params/*/")
    net_class = self.params.get(
        "net_class", "/run/config_generate_params/*/")

    # Call dmg config generate. AP is always the first server host.
    server_host = self.hostlist_servers[0]
    result = self.get_dmg_command().config_generate(
        access_points=server_host, num_engines=num_engines,
        min_ssds=min_ssds, net_class=net_class)

    try:
        generated_yaml = yaml.safe_load(result.stdout)
    except yaml.YAMLError as error:
        raise CommandFailure(
            "Error loading dmg generated config!") from error

    # Stop and restart daos_server. self.start_server_managers() has the
    # server startup check built into it, so if there's something wrong,
    # it'll throw an error.
    self.log.info("Stopping servers")
    self.stop_servers()

    # Create a new server config from generated_yaml and update SCM-related
    # data in engine_params so that the cleanup before the server start
    # works.
    self.log.info("Copy config to /etc/daos and update engine_params")
    self.server_managers[0].update_config_file_from_file(
        self.hostlist_servers, self.test_dir, generated_yaml)

    # Start server with the generated config.
    self.log.info("Restarting server with the generated config")
    try:
        agent_force = self.start_server_managers(force=True)
    except ServerFailed as error:
        self.fail("Restarting server failed! {}".format(error))

    # We don't need agent for this test. However, when we stop the server,
    # agent is also stopped. Then the harness checks that the agent is
    # running during the teardown. If agent isn't running at that point,
    # it would cause an error, so start it here.
    self.log.info("Restarting agents")
    self.start_agent_managers(force=agent_force)
def test_bashcmd(self):
    """Jira ID: DAOS-3508.

    Test Description:
        Purpose of this test is to mount different mount points of dfuse
        for different container and pool sizes and perform basic bash
        commands.

    Use cases:
        Following list of bash commands have been incorporated as part of
        this test: mkdir, touch, ls, chmod, rm, dd, stat, cp, cmp, mv,
        rmdir.
            Create a directory.
            Create a file under that directory.
            List the created file.
            Remove the file.
            Write a file to the dfuse mounted location using dd.
            List the written file to verify if it's created.
            Verify the file created is of right size as desired.
            Copy the file.
            Compare the copied file with original to verify the content
            is same.
            Remove copied file.
            Rename file.
            Verify renamed file exists using list.
            Verify dfuse support for '.'
            Verify dfuse support for '..'
            Remove renamed file.
            Remove a directory.

    :avocado: tags=all,daily_regression,pr
    :avocado: tags=hw,small
    :avocado: tags=dfuse
    :avocado: tags=bashcmd
    """
    dir_name = self.params.get("dirname", '/run/bashcmd/*')
    file_name1 = self.params.get("filename1", '/run/bashcmd/*')
    file_name2 = self.params.get("filename2", '/run/bashcmd/*')
    dd_count = self.params.get("dd_count", '/run/bashcmd/*')
    dd_blocksize = self.params.get("dd_blocksize", '/run/bashcmd/*')
    pool_count = self.params.get("pool_count", '/run/pool/*')
    cont_count = self.params.get("cont_count", '/run/container/*')

    # Create a pool if one does not already exist.
    for _ in range(pool_count):
        self.add_pool(connect=False)

        # perform test for multiple containers.
        for count in range(cont_count):
            self.add_container(self.pool)
            mount_dir = "/tmp/{}_daos_dfuse{}".format(self.pool.uuid, count)
            self.start_dfuse(
                self.hostlist_clients, self.pool, self.container, mount_dir)
            abs_dir_path = os.path.join(
                self.dfuse.mount_dir.value, dir_name)
            abs_file_path1 = os.path.join(abs_dir_path, file_name1)
            abs_file_path2 = os.path.join(abs_dir_path, file_name2)

            # list of commands to be executed.
            commands = [
                "mkdir -p {}".format(abs_dir_path),
                "touch {}".format(abs_file_path1),
                "ls -a {}".format(abs_file_path1),
                "rm {}".format(abs_file_path1),
                "dd if=/dev/zero of={} count={} bs={}".format(
                    abs_file_path1, dd_count, dd_blocksize),
                "ls -al {}".format(abs_file_path1),
                "filesize=$(stat -c%s '{}');\
                if (( filesize != {}*{} )); then exit 1;\
                fi".format(abs_file_path1, dd_count, dd_blocksize),
                "cp -r {} {}".format(abs_file_path1, abs_file_path2),
                "cmp --silent {} {}".format(abs_file_path1, abs_file_path2),
                "rm {}".format(abs_file_path2),
                "mv {} {}".format(abs_file_path1, abs_file_path2),
                "ls -al {}".format(abs_file_path2),
                "ls -al {}/.".format(abs_dir_path),
                "ls -al {}/..".format(abs_dir_path),
                "rm {}".format(abs_file_path2),
                "rmdir {}".format(abs_dir_path)
            ]
            for cmd in commands:
                try:
                    # execute bash cmds
                    ret_code = general_utils.pcmd(
                        self.hostlist_clients, cmd, timeout=30)
                    if 0 not in ret_code:
                        error_hosts = NodeSet(",".join([
                            str(node_set) for code, node_set in
                            list(ret_code.items()) if code != 0
                        ]))
                        raise CommandFailure(
                            "Error running '{}' on the following "
                            "hosts: {}".format(cmd, error_hosts))
                # report error if any command fails
                except CommandFailure as error:
                    self.log.error("BashCmd Test Failed: %s", str(error))
                    self.fail("Test was expected to pass but it failed.\n")

            # stop dfuse
            self.stop_dfuse()
            # destroy container
            self.container.destroy()

        # destroy pool
        self.pool.destroy()
def stop(self):
    """Stop dfuse.

    Try to stop dfuse.  Try once nicely by using fusermount, then if that
    fails try to pkill it to see if that works.  Abort based on the result
    of the fusermount, as if pkill is necessary then dfuse itself has not
    worked correctly.

    Finally, try and remove the mount point, and that itself should work.

    Raises:
        CommandFailure: In case dfuse stop fails

    """
    # Include all hosts when stopping to ensure all mount points in any
    # state are properly removed
    self.running_hosts.add(NodeSet.fromlist(self.hosts))

    self.log.info(
        "Stopping dfuse at %s on %s",
        self.mount_dir.value, self.running_hosts)

    if self.mount_dir.value and self.running_hosts:
        error_list = []

        # Loop until all fuseblk mounted devices are unmounted
        counter = 0
        while self.running_hosts and counter < 3:
            # Attempt to kill dfuse after the first unmount attempt fails
            if self.running_hosts and counter > 1:
                kill_command = "pkill dfuse --signal KILL"
                pcmd(self.running_hosts, kill_command, timeout=30)

            # Attempt to unmount any fuseblk mounted devices after detection
            if self.running_hosts and counter > 0:
                pcmd(self.running_hosts,
                     self.get_umount_command(counter > 1), expect_rc=None)
                time.sleep(2)

            # Detect which hosts have fuseblk mounted devices and remove any
            # hosts which no longer have the dfuse mount point mounted
            state = self.check_mount_state(self.running_hosts)
            for host in state["unmounted"].union(state["nodirectory"]):
                self.running_hosts.remove(host)

            # Increment the loop counter
            counter += 1

        if self.running_hosts:
            error_list.append(
                "Error stopping dfuse on {}".format(self.running_hosts))

        # Remove mount points
        try:
            self.remove_mount_point()
        except CommandFailure as error:
            error_list.append(str(error))

        # Report any errors
        if error_list:
            raise CommandFailure("\n".join(error_list))

    elif self.mount_dir.value is None:
        self.log.info("No dfuse mount directory defined - nothing to stop")

    else:
        self.log.info("No hosts running dfuse - nothing to stop")
def test_daos_build(self):
    """Jira ID: DAOS-8937.

    Test Description:
        This test builds DAOS on a dfuse filesystem.

    Use cases:
        Create Pool
        Create Posix container
        Mount dfuse
        Checkout and build DAOS sources.

    :avocado: tags=all,full_regression
    :avocado: tags=vm
    :avocado: tags=daosio,dfuse
    :avocado: tags=dfusedaosbuild
    """
    # Create a pool, container and start dfuse.
    self.add_pool(connect=False)
    self.add_container(self.pool)

    daos_cmd = self.get_daos_command()

    cont_attrs = OrderedDict()

    cache_mode = self.params.get('name', '/run/dfuse/*')
    intercept = self.params.get(
        'use_intercept', '/run/intercept/*', default=False)

    # How long to cache things for, if caching is enabled.
    cache_time = '30m'

    build_time = 15

    if cache_mode == 'writeback':
        cont_attrs['dfuse-data-cache'] = 'on'
        cont_attrs['dfuse-attr-time'] = cache_time
        cont_attrs['dfuse-dentry-time'] = cache_time
        cont_attrs['dfuse-ndentry-time'] = cache_time
    elif cache_mode == 'writethrough':
        cont_attrs['dfuse-data-cache'] = 'on'
        cont_attrs['dfuse-attr-time'] = cache_time
        cont_attrs['dfuse-dentry-time'] = cache_time
        cont_attrs['dfuse-ndentry-time'] = cache_time
        if intercept:
            build_time = 120
    elif cache_mode == 'metadata':
        cont_attrs['dfuse-data-cache'] = 'off'
        cont_attrs['dfuse-attr-time'] = cache_time
        cont_attrs['dfuse-dentry-time'] = cache_time
        cont_attrs['dfuse-ndentry-time'] = cache_time
    elif cache_mode == 'nocache':
        build_time = 210
        cont_attrs['dfuse-data-cache'] = 'off'
        cont_attrs['dfuse-attr-time'] = '0'
        cont_attrs['dfuse-dentry-time'] = '0'
        cont_attrs['dfuse-ndentry-time'] = '0'
    else:
        self.fail('Invalid cache_mode: {}'.format(cache_mode))

    for key, value in cont_attrs.items():
        daos_cmd.container_set_attr(pool=self.pool.uuid,
                                    cont=self.container.uuid,
                                    attr=key, val=value)

    self.start_dfuse(self.hostlist_clients, self.pool, self.container)

    mount_dir = self.dfuse.mount_dir.value
    build_dir = os.path.join(mount_dir, 'daos')

    remote_env = OrderedDict()
    remote_env['PATH'] = '{}:$PATH'.format(
        os.path.join(mount_dir, 'venv', 'bin'))
    remote_env['VIRTUAL_ENV'] = os.path.join(mount_dir, 'venv')

    if intercept:
        remote_env['LD_PRELOAD'] = os.path.join(
            self.prefix, 'lib64', 'libioil.so')
        remote_env['D_LOG_FILE'] = '/var/tmp/daos_testing/daos-il.log'
        remote_env['DD_MASK'] = 'all'
        remote_env['DD_SUBSYS'] = 'all'
        remote_env['D_LOG_MASK'] = 'INFO'

    envs = [
        'export {}={}'.format(env, value)
        for env, value in remote_env.items()
    ]

    preload_cmd = ';'.join(envs)

    # Run the deps build in parallel for speed/coverage however the daos
    # build itself does not yet work, so run this part in serial.
    cmds = [
        'python3 -m venv {}/venv'.format(mount_dir),
        'git clone https://github.com/daos-stack/daos.git {}'.format(
            build_dir),
        'git -C {} submodule init'.format(build_dir),
        'git -C {} submodule update'.format(build_dir),
        'python3 -m pip install pip --upgrade',
        'python3 -m pip install -r {}/requirements.txt'.format(build_dir),
        'scons -C {} --jobs 50 build --build-deps=yes --deps-only'.format(
            build_dir),
        'scons -C {} build'.format(build_dir)
    ]
    for cmd in cmds:
        try:
            command = '{};{}'.format(preload_cmd, cmd)

            # Use a 10 minute timeout for most commands, but vary the
            # build timeout based on the dfuse mode.
            timeout = 10 * 60
            if cmd.startswith('scons'):
                timeout = build_time * 60

            ret_code = general_utils.pcmd(
                self.hostlist_clients, command, timeout=timeout)

            if 0 in ret_code:
                continue

            self.log.info(ret_code)
            raise CommandFailure("Error running '{}'".format(cmd))
        except CommandFailure as error:
            self.log.error('BuildDaos Test Failed: %s', str(error))
            self.fail(
                'Unable to build daos over dfuse in mode {}.\n'.format(
                    cache_mode))
def get_log_data(self, hosts, since, until=None, timeout=60):
    """Gather log output for the command running on each host.

    Note (from journalctl man page):
        Date specifications should be of the format "2012-10-30 18:17:16".
        If the time part is omitted, "00:00:00" is assumed.  If only the
        seconds component is omitted, ":00" is assumed.  If the date
        component is omitted, the current day is assumed.  Alternatively
        the strings "yesterday", "today", "tomorrow" are understood, which
        refer to 00:00:00 of the day before the current day, the current
        day, or the day after the current day, respectively.  "now" refers
        to the current time.  Finally, relative times may be specified,
        prefixed with "-" or "+", referring to times before or after the
        current time, respectively.

    Args:
        hosts (list): list of hosts from which to gather log data.
        since (str): show log entries from this date.
        until (str, optional): show log entries up to this date. Defaults
            to None, in which case it is not utilized.
        timeout (int, optional): timeout for issuing the command. Defaults
            to 60 seconds.

    Returns:
        list: a list of dictionaries including:
            "hosts": <NodeSet() of hosts with this data>
            "data": <journalctl output>

    """
    # Setup the journalctl command to capture all unit activity from the
    # specified start date to now or a specified end date
    #   --output=json?
    command = self.get_journalctl_command(since, until)
    self.log.info("Gathering log data on %s: %s", str(hosts), command)

    # Gather the log information per host
    results = run_pcmd(hosts, command, False, timeout, None)

    # Determine if the command completed successfully without a timeout
    status = True
    for result in results:
        if result["interrupted"]:
            self.log.info(" Errors detected running \"%s\":", command)
            self.log.info(
                " %s: timeout detected after %s seconds",
                str(result["hosts"]), timeout)
            status = False
        elif result["exit_status"] != 0:
            self.log.info(" Errors detected running \"%s\":", command)
            status = False
        if not status:
            break

    # Display/return the command output
    log_data = []
    for result in results:
        if result["exit_status"] == 0 and not result["interrupted"]:
            # Add the successful output from each node to the dictionary
            log_data.append(
                {"hosts": result["hosts"], "data": result["stdout"]})
        else:
            # Display all of the results in the case of an error
            if len(result["stdout"]) > 1:
                self.log.info(
                    " %s: rc=%s, output:",
                    str(result["hosts"]), result["exit_status"])
                for line in result["stdout"]:
                    self.log.info(" %s", line)
            else:
                self.log.info(
                    " %s: rc=%s, output: %s",
                    str(result["hosts"]), result["exit_status"],
                    result["stdout"][0])

    # Report any errors through an exception
    if not status:
        raise CommandFailure(
            "Error(s) detected gathering {} log data on {}".format(
                self._systemctl.service.value, NodeSet.fromlist(hosts)))

    # Return the successful command output per set of hosts
    return log_data