def prepare(self, storage=True):
    """Prepare to start daos_server.

    Creates the server yaml file, configures dmg transport security,
    kills any stale servers, removes leftover files, pre-creates the
    engine log files, and optionally prepares the storage.

    Args:
        storage (bool, optional): whether or not to prepare dspm/nvme
            storage. Defaults to True.
    """
    self.log.info(
        "<SERVER> Preparing to start daos_server on %s with %s",
        self._hosts, self.manager.command)

    # Create the daos_server yaml file
    self.manager.job.create_yaml_file()

    # Prepare dmg for running storage format on all server hosts
    self.dmg.hostlist = self._hosts
    if not self.dmg.yaml:
        # If using a dmg config file, transport security was
        # already configured.
        self.dmg.insecure.update(
            self.get_config_value("allow_insecure"), "dmg.insecure")

    # Kill any daos servers running on the hosts
    self.kill()

    # Clean up any files that exist on the hosts
    self.clean_files()

    # Make sure log file has been created for ownership change
    # (touch is only needed for nvme runs, where the server runs as root
    # and the log files would otherwise be root-owned -- presumably; the
    # ownership change itself happens elsewhere. TODO confirm)
    if self.manager.job.using_nvme:
        cmd_list = []
        for server_params in self.manager.job.yaml.server_params:
            log_file = server_params.log_file.value
            if log_file is not None:
                self.log.info("Creating log file: %s", log_file)
                cmd_list.append("touch {}".format(log_file))
        if cmd_list:
            pcmd(self._hosts, "; ".join(cmd_list), False)

    if storage:
        # Prepare server storage
        if self.manager.job.using_nvme or self.manager.job.using_dcpm:
            self.log.info("Preparing storage in <format> mode")
            self.prepare_storage("root")
            # Storage prepare ran as root, so orterun (when used) must
            # also ssh as root
            if hasattr(self.manager, "mca"):
                self.manager.mca.update(
                    {"plm_rsh_args": "-l root"}, "orterun.mca", True)
def _run_cmd(self, cmd):
    """Run a command in parallel on all of the client hosts.

    Args:
        cmd (str): command line to execute on each client host

    Raises:
        CommandFailure: if the command did not succeed on any host

    """
    results = general_utils.pcmd(self.hostlist_clients, cmd, timeout=180)
    if 0 in results:
        return
    failed_hosts = NodeSet(",".join(
        str(nodes) for code, nodes in results.items() if code != 0))
    raise CommandFailure(
        "Error running '{}' on the following hosts: {}".format(
            cmd, failed_hosts))
def check_mount_state(self, nodes=None):
    """Check the dfuse mount point mounted state on the hosts.

    Args:
        nodes (NodeSet, optional): hosts on which to check if dfuse is
            mounted. Defaults to None, which will use all of the hosts.

    Returns:
        dict: a dictionary of NodeSets of hosts with the dfuse mount point
            either "mounted" or "unmounted"

    """
    state = {
        "mounted": NodeSet(),
        "unmounted": NodeSet(),
        "nodirectory": NodeSet()
    }
    if not nodes:
        nodes = NodeSet.fromlist(self.hosts)

    # Detect which hosts have mount point directories defined
    dir_check = "test -d {0} -a ! -L {0}".format(self.mount_dir.value)
    results = pcmd(nodes, dir_check, expect_rc=None)
    have_directory = NodeSet()
    for code, hosts in list(results.items()):
        bucket = have_directory if code == 0 else state["nodirectory"]
        for host in hosts:
            bucket.add(host)

    if have_directory:
        # Detect which hosts with mount point directories have it mounted
        # as a fuseblk device; grep -v means rc=1 when only fuseblk lines
        # were emitted, i.e. the mount is present.
        fuse_check = "stat -c %T -f {0} | grep -v fuseblk".format(
            self.mount_dir.value)
        results = pcmd(have_directory, fuse_check, expect_rc=None)
        for code, hosts in list(results.items()):
            key = "mounted" if code == 1 else "unmounted"
            for host in hosts:
                state[key].add(host)

    return state
def stop(self):
    """Stop dfuse.

    Try to stop dfuse. Try once nicely by using fusermount, then if that
    fails try to pkill it to see if that works. Abort based on the result
    of the fusermount, as if pkill is necessary then dfuse itself has not
    worked correctly.

    Finally, try and remove the mount point, and that itself should work.

    Raises:
        CommandFailure: In case dfuse stop fails

    """
    self.log.info(
        'Stopping dfuse at %s on %s', self.mount_dir.value,
        self.running_hosts)

    if self.mount_dir.value is None:
        return

    if not len(self.running_hosts):
        return

    self.check_running()
    umount_cmd = [
        "if [ -x '$(command -v fusermount)' ]",
        "then fusermount -u {0}".format(self.mount_dir.value),
        "else fusermount3 -u {0}".format(self.mount_dir.value),
        "fi"
    ]
    ret_code = pcmd(self.running_hosts, "; ".join(umount_cmd), timeout=30)

    if 0 in ret_code:
        self.running_hosts.remove(ret_code[0])
        del ret_code[0]

    if len(self.running_hosts):
        cmd = "pkill dfuse --signal KILL"
        pcmd(self.running_hosts, cmd, timeout=30)
        # BUG FIX: the original passed the umount_cmd *list* to pcmd here
        # instead of the joined command string used for the first attempt.
        pcmd(self.running_hosts, "; ".join(umount_cmd), timeout=30)
        self.remove_mount_point(fail=False)
        raise CommandFailure(
            "Error stopping dfuse on the following hosts: {}".format(
                self.running_hosts))

    # Brief settle time before removing the now-unmounted directory
    time.sleep(2)
    self.remove_mount_point()
def clean_files(self, verbose=True):
    """Clean up the daos server files.

    Builds one combined shell command that removes the superblocks,
    removes each engine's shared memory segment, unmounts the scm mount
    points, and (for dcpm) wipes the pmem devices, then runs it once on
    every host.

    Args:
        verbose (bool, optional): display clean commands. Defaults to True.
    """
    clean_commands = []
    for index, engine_params in \
            enumerate(self.manager.job.yaml.engine_params):
        scm_mount = engine_params.get_value("scm_mount")
        self.log.info("Cleaning up the %s directory.", str(scm_mount))

        # Remove the superblocks
        cmd = "sudo rm -fr {}/*".format(scm_mount)
        if cmd not in clean_commands:
            clean_commands.append(cmd)

        # Remove the shared memory segment associated with this io server.
        # Not de-duplicated: the key is unique per engine index.
        cmd = "sudo ipcrm -M {}".format(self.D_TM_SHARED_MEMORY_KEY + index)
        clean_commands.append(cmd)

        # Dismount the scm mount point; loop until umount fails, which
        # handles stacked/repeated mounts of the same path
        cmd = "while sudo umount {}; do continue; done".format(scm_mount)
        if cmd not in clean_commands:
            clean_commands.append(cmd)

        if self.manager.job.using_dcpm:
            scm_list = engine_params.get_value("scm_list")
            if isinstance(scm_list, list):
                self.log.info(
                    "Cleaning up the following device(s): %s.",
                    ", ".join(scm_list))
                # Umount and wipefs the dcpm device
                cmd_list = [
                    "for dev in {}".format(" ".join(scm_list)),
                    "do mount=$(lsblk $dev -n -o MOUNTPOINT)",
                    "if [ ! -z $mount ]",
                    "then while sudo umount $mount",
                    "do continue",
                    "done",
                    "fi",
                    "sudo wipefs -a $dev",
                    "done"
                ]
                cmd = "; ".join(cmd_list)
                if cmd not in clean_commands:
                    clean_commands.append(cmd)

    pcmd(self._hosts, "; ".join(clean_commands), verbose)
def run(self, check=True, bind_cores=None):
    # pylint: disable=arguments-differ
    """Run the dfuse command.

    Args:
        check (bool): Check if dfuse mounted properly after
            mount is executed.
        bind_cores (str): List of CPU cores to pass to taskset

    Raises:
        CommandFailure: In case dfuse run command fails

    """
    self.log.info('Starting dfuse at %s', self.mount_dir.value)

    # A log file must be defined to ensure logs are captured
    if "D_LOG_FILE" not in self.env:
        raise CommandFailure(
            "Dfuse missing environment variables for D_LOG_FILE")

    # Default the log mask when the caller has not set one
    if 'D_LOG_MASK' not in self.env:
        self.env['D_LOG_MASK'] = 'INFO'

    # create dfuse dir if does not exist
    self.create_mount_point()

    # Assemble the remote command: environment exports, optional taskset
    # core binding, then the dfuse command itself
    parts = [self.env.get_export_str()]
    if bind_cores:
        parts.append('taskset -c {} '.format(bind_cores))
    parts.append(str(self))
    cmd = "".join(parts)
    self.log.info("Command is '%s'", cmd)
    ret_code = pcmd(self.hosts, cmd, timeout=30)

    # Hosts that returned 0 are now running dfuse
    if 0 in ret_code:
        self.running_hosts.add(ret_code.pop(0))

    if ret_code:
        error_hosts = NodeSet(",".join(
            str(node_set) for code, node_set in ret_code.items()
            if code != 0))
        raise CommandFailure(
            "Error starting dfuse on the following hosts: {}".format(
                error_hosts))

    if check and not self.check_running(fail_on_error=False):
        # Dfuse will block in the command for the mount to complete, even
        # if run in background mode so it should be possible to start using
        # it immediately after the command returns.
        self.log.info('Waiting two seconds for dfuse to start')
        time.sleep(2)
        if not self.check_running(fail_on_error=False):
            self.log.info('Waiting five seconds for dfuse to start')
            time.sleep(5)
            self.check_running()
def add_del_user(hosts, ba_cmd, user):
    """Add or delete the daos user and group on host by sudo command.

    Args:
        hosts (list): list of host.
        ba_cmd (str): linux bash command to create user or group
            (e.g. useradd, userdel, groupadd, groupdel, usermod).
        user (str): user or group name to be created or cleaned.
    """
    bash_cmd = os.path.join("/usr/sbin", ba_cmd)
    # Pass "-r" only to useradd/userdel (not usermod or group commands);
    # presumably to create/remove a system account and its files -- confirm
    homedir = ""
    if "usermod" not in ba_cmd and "user" in ba_cmd:
        homedir = "-r"
    cmd = " ".join(("sudo", bash_cmd, homedir, user))
    print(" =Clients/hosts {0}, exec cmd: {1}".format(hosts, cmd))
    pcmd(hosts, cmd, False)
def clean_files(self):
    """Clean the tmpfs on the servers."""
    # Only the last server instance's settings are used; multiple server
    # instances are not yet supported (see the note below)
    scm_mount = self.runner.job.yaml_params.server_params[-1].scm_mount
    scm_list = self.runner.job.yaml_params.server_params[-1].scm_list.value
    # Always clear the default mount point contents
    clean_cmds = [
        "find /mnt/daos -mindepth 1 -maxdepth 1 -print0 | xargs -0r rm -rf"
    ]
    if self.runner.job.yaml_params.is_nvme():
        clean_cmds.append("sudo rm -rf {0}; \
            sudo umount {0}".format(scm_mount))
    # scm_mount can be /mnt/daos0 or /mnt/daos1 for two daos_server
    # instances. Presently, not supported in DAOS. The for loop needs
    # to be updated in future to handle it. Single instance pmem
    # device should work now.
    if self.runner.job.yaml_params.is_scm():
        for value in scm_list:
            clean_cmds.append("sudo umount {}; \
                sudo wipefs -a {}".format(scm_mount, value))
    self.log.info("Cleanup of %s directory.", str(scm_mount))
    pcmd(self._hosts, "; ".join(clean_cmds), False)
def stop_agent(sessions, client_list=None): """Kill ssh and the agent. This is temporary; presuming the agent will deamonize at somepoint and can be started and killed more appropriately. Args: sessions (dict): set of subprocess sessions returned by run_agent() client_list (list, optional): lists of hosts running the daos agent. Defaults to None. Raises: AgentFailed: if the daos agents failed to stop """ # if empty client list, 'self' is effectively client if client_list is None: client_list = [socket.gethostname().split('.', 1)[0]] # Kill the agents processes pcmd(client_list, "pkill daos_agent", False) # Kill any processes running in the sessions for client in sessions: if sessions[client].poll() is None: sessions[client].kill() sessions[client].wait() # Check to make sure all the daos agents are dead # pgrep exit status: # 0 - One or more processes matched the criteria. # 1 - No processes matched. # 2 - Syntax error in the command line. # 3 - Fatal error: out of memory etc. time.sleep(5) result = pcmd(client_list, "pgrep 'daos_agent'", False, expect_rc=1) if len(result) > 1 or 1 not in result: raise AgentFailed( "DAOS agent processes detected after attempted stop on {}".format( ", ".join([str(result[key]) for key in result if key != 1])))
def storage_reset(hosts):
    """Reset the Storage on servers using the DAOS server's yaml settings file.

    Args:
        hosts (str): a string of comma-separated host names

    Raises:
        ServerFailed: if server failed to reset storage

    """
    daos_srv_bin = get_file_path("bin/daos_server")
    reset_cmd = "sudo {} storage prepare -n --reset -f".format(
        daos_srv_bin[0])
    result = pcmd(hosts, reset_cmd)
    # Anything other than a single {0: <all hosts>} entry is a failure
    if 0 not in result or len(result) > 1:
        raise ServerFailed("Error resetting NVMe storage")
def storage_prepare(hosts, user):
    """Prepare the storage on servers using the DAOS server's yaml settings file.

    Args:
        hosts (str): a string of comma-separated host names
        user (str): user account given ownership of the prepared storage

    Raises:
        ServerFailed: if server failed to prepare storage

    """
    daos_srv_bin = get_file_path("bin/daos_server")
    cmd = ("sudo {} storage prepare -n -u \"{}\" --hugepages=4096 -f"
           .format(daos_srv_bin[0], user))
    result = pcmd(hosts, cmd, timeout=120)
    # pcmd returns {exit_code: NodeSet}; a single {0: ...} entry means
    # every host succeeded
    if len(result) > 1 or 0 not in result:
        raise ServerFailed("Error preparing NVMe storage")
def set_scm_mount_ownership(self, user=None, verbose=False):
    """Set the ownership to the specified user for each scm mount.

    Args:
        user (str, optional): user name. Defaults to None - current user.
        verbose (bool, optional): display commands. Defaults to False.

    """
    owner = user if user is not None else getpass.getuser()
    chown_cmds = set()
    for server_params in self.manager.job.yaml.server_params:
        mounts = server_params.scm_mount.value
        # Support single or multiple scm_mount points
        if not isinstance(mounts, list):
            mounts = [mounts]
        self.log.info("Changing ownership to %s for: %s", owner, mounts)
        # A set de-duplicates identical chown commands across servers
        chown_cmds.add(
            "sudo chown -R {0}:{0} {1}".format(owner, " ".join(mounts)))
    if chown_cmds:
        pcmd(self._hosts, "; ".join(chown_cmds), verbose)
def run(self):
    """Run the daos_racer command remotely.

    Raises:
        CommandFailure: if there is an error running the command

    """
    # Run daos_racer on the specified host
    timeout = self.clush_timeout.value
    timeout_text = "no" if timeout is None else "a {}s".format(timeout)
    self.log.info(
        "Running %s on %s with %s timeout", str(self), self.host,
        timeout_text)
    return_codes = pcmd([self.host], str(self), True, timeout)
    if 0 not in return_codes or len(return_codes) > 1:
        # Kill the daos_racer process if the remote command timed out
        if 255 in return_codes:
            self.log.info(
                "Stopping timed out daos_racer process on %s", self.host)
            pcmd([self.host], "pkill daos_racer", True)
        raise CommandFailure("Error running '{}'".format(self._command))
    self.log.info("Test passed!")
def storage_reset(self):
    """Reset the servers' storage.

    NOTE: Don't enhance this method to reset SCM. SCM will not be in a
    useful state for running next tests.

    Raises:
        ServerFailed: if server failed to reset storage

    """
    server_binary = os.path.join(self.daosbinpath, "daos_server")
    result = pcmd(
        self._hosts,
        "{} storage prepare -n --reset -f".format(server_binary))
    # Any non-zero exit code on any host is a failure
    if 0 not in result or len(result) > 1:
        raise ServerFailed("Error resetting NVMe storage")
def setUp(self):
    """Define test setup to be done."""
    print("<<setUp Started>> at {}".format(time.ctime()))
    # Modernized from super(Soak, self) to match the py3-style super()
    # used elsewhere in this file
    super().setUp()
    # Initialize loop param for all tests
    self.loop = 1
    self.failed_job_id_list = []
    # Fail if slurm partition daos_client is not defined
    if not self.partition_clients:
        raise SoakTestError(
            "<<FAILED: Partition is not correctly setup for daos "
            "slurm partition>>")
    # Check if the server nodes are in the client list;
    # this will happen when only one partition is specified
    for host_server in self.hostlist_servers:
        if host_server in self.hostlist_clients:
            self.hostlist_clients.remove(host_server)
    self.log.info(
        "<<Updated hostlist_clients %s >>", self.hostlist_clients)
    # include test node for log cleanup; remove from client list
    self.test_node = [socket.gethostname().split('.', 1)[0]]
    if self.test_node[0] in self.hostlist_clients:
        self.hostlist_clients.remove(self.test_node[0])
    self.log.info(
        "<<Updated hostlist_clients %s >>", self.hostlist_clients)
    self.node_list = self.hostlist_clients + self.test_node
    # self.node_list = self.hostlist_clients
    # Setup logging directories for soak logfiles
    # self.output dir is an avocado directory .../data/
    self.log_dir = "/tmp/soak"
    self.outputsoakdir = self.outputdir + "/soak"
    # Create the remote log directories on all client nodes
    self.rem_pass_dir = self.log_dir + "/pass" + str(self.loop)
    self.local_pass_dir = self.outputsoakdir + "/pass" + str(self.loop)
    # cleanup soak log directories before test on all nodes
    result = pcmd(
        NodeSet.fromlist(self.node_list),
        "rm -rf {}".format(self.log_dir),
        verbose=False)
    if len(result) > 1 or 0 not in result:
        # BUG FIX: original message concatenated to "removedfrom clients"
        raise SoakTestError(
            "<<FAILED: Soak directories not removed "
            "from clients>>: {}".format(", ".join(
                [str(result[key]) for key in result if key != 0])))
def prepare_storage(self, user, using_dcpm=None, using_nvme=None):
    """Prepare the server storage.

    Args:
        user (str): username
        using_dcpm (bool, optional): override option to prepare scm
            storage. Defaults to None, which uses the configuration file
            to determine if scm storage should be formatted.
        using_nvme (bool, optional): override option to prepare nvme
            storage. Defaults to None, which uses the configuration file
            to determine if nvme storage should be formatted.

    Raises:
        ServerFailed: if there was an error preparing the storage

    """
    # Build: daos_server storage prepare -u <user> -f [...]
    cmd = DaosServerCommand(self.manager.job.command_path)
    cmd.sudo = False
    cmd.debug.value = False
    cmd.set_sub_command("storage")
    cmd.sub_command_class.set_sub_command("prepare")
    cmd.sub_command_class.sub_command_class.target_user.value = user
    cmd.sub_command_class.sub_command_class.force.value = True

    # Use the configuration file settings if no overrides specified
    if using_dcpm is None:
        using_dcpm = self.manager.job.using_dcpm
    if using_nvme is None:
        using_nvme = self.manager.job.using_nvme

    # Narrow the prepare to one storage type when only one is in use;
    # when both are in use neither *_only flag is set
    if using_dcpm and not using_nvme:
        cmd.sub_command_class.sub_command_class.scm_only.value = True
    elif not using_dcpm and using_nvme:
        cmd.sub_command_class.sub_command_class.nvme_only.value = True

    # nvme prepare requires hugepages to be configured
    if using_nvme:
        hugepages = self.get_config_value("nr_hugepages")
        cmd.sub_command_class.sub_command_class.hugepages.value = hugepages

    self.log.info("Preparing DAOS server storage: %s", str(cmd))
    result = pcmd(self._hosts, str(cmd), timeout=40)
    # pcmd maps exit code -> NodeSet; success is a single {0: hosts} entry
    if len(result) > 1 or 0 not in result:
        dev_type = "nvme"
        if using_dcpm and using_nvme:
            dev_type = "dcpm & nvme"
        elif using_dcpm:
            dev_type = "dcpm"
        raise ServerFailed("Error preparing {} storage".format(dev_type))
def run(self, check=True):
    """Run the dfuse command.

    Args:
        check (bool): Check if dfuse mounted properly after
            mount is executed.

    Raises:
        CommandFailure: In case dfuse run command fails

    """
    self.log.info('Starting dfuse at %s', self.mount_dir.value)

    # A log file must be defined to ensure logs are captured
    if "D_LOG_FILE" not in self.env:
        raise CommandFailure(
            "Dfuse missing environment variables for D_LOG_FILE")

    # create dfuse dir if does not exist
    self.create_mount_point()

    # Launch dfuse on every host with the environment exports prepended
    launch_cmd = "".join([self.env.get_export_str(), str(self)])
    ret_code = pcmd(self.hosts, launch_cmd, timeout=30)

    # Hosts that returned 0 are now running dfuse
    if 0 in ret_code:
        self.running_hosts.add(ret_code.pop(0))

    if ret_code:
        error_hosts = NodeSet(",".join(
            str(node_set) for code, node_set in ret_code.items()
            if code != 0))
        raise CommandFailure(
            "Error starting dfuse on the following hosts: {}".format(
                error_hosts))

    if check and not self.check_running(fail_on_error=False):
        # Dfuse will block in the command for the mount to complete, even
        # if run in background mode so it should be possible to start using
        # it immediately after the command returns.
        self.log.info('Waiting two seconds for dfuse to start')
        time.sleep(2)
        if not self.check_running(fail_on_error=False):
            self.log.info('Waiting five seconds for dfuse to start')
            time.sleep(5)
            self.check_running()
def start(self, yamlfile):
    """Start the server through the runner.

    Args:
        yamlfile (str): path to the server yaml configuration file

    Returns:
        bool: True when the server has started (and storage has been
            formatted when nvme/scm is configured)

    Raises:
        ServerFailed: if the server failed to start

    """
    self.runner.job.set_config(yamlfile)
    self.server_clean()

    # Prepare nvme storage in servers
    if self.runner.job.yaml_params.is_nvme():
        self.log.info("Performing nvme storage prepare in <format> mode")
        storage_prepare(self._hosts, "root")
        # Storage prepare ran as root, so orterun must also ssh as root
        self.runner.mca.value = {"plm_rsh_args": "-l root"}

        # Make sure log file has been created for ownership change
        lfile = self.runner.job.yaml_params.server_params[
            -1].log_file.value
        if lfile is not None:
            self.log.info("Creating log file")
            cmd_touch_log = "touch {}".format(lfile)
            pcmd(self._hosts, cmd_touch_log, False)

    # Change ownership of attach info directory
    chmod_attach = "chmod 777 -R {}".format(self.attach.value)
    pcmd(self._hosts, chmod_attach, False)

    try:
        self.run()
    except CommandFailure as details:
        self.log.info("<SERVER> Exception occurred: %s", str(details))
        # Kill the subprocess, anything that might have started
        self.kill()
        raise ServerFailed("Failed to start server in {} mode.".format(
            self.runner.job.mode))

    if self.runner.job.yaml_params.is_nvme() or \
            self.runner.job.yaml_params.is_scm():
        # Setup the hostlist to pass to dmg command
        servers_with_ports = [
            "{}:{}".format(host, self.runner.job.yaml_params.port)
            for host in self._hosts
        ]

        # Format storage and wait for server to change ownership
        self.log.info("Formatting hosts: <%s>", self._hosts)
        storage_format(self.daosbinpath, ",".join(servers_with_ports))
        self.runner.job.mode = "normal"
        try:
            self.runner.job.check_subprocess_status(self.runner.process)
        except CommandFailure as error:
            # NOTE(review): failure after format is only logged, not
            # raised -- presumably intentional best-effort; confirm
            self.log.info("Failed to start after format: %s", str(error))

    # Change ownership shared attach info file
    chmod_cmds = "sudo chmod 777 {}/daos_server.attach_info_tmp".format(
        self.attach.value)
    pcmd(self._hosts, chmod_cmds, False)

    return True
def storage_reset(hosts):
    """Reset the Storage on servers using the DAOS server's yaml settings file.

    NOTE: Don't enhance this method to reset SCM. SCM will not be in a useful
    state for running next tests.

    Args:
        hosts (str): a string of comma-separated host names

    Raises:
        ServerFailed: if server failed to reset storage

    """
    binary_path = get_file_path("bin/daos_server")[0]
    outcome = pcmd(
        hosts, "sudo {} storage prepare -n --reset -f".format(binary_path))
    # A single {0: <all hosts>} result means success everywhere
    if 0 not in outcome or len(outcome) > 1:
        raise ServerFailed("Error resetting NVMe storage")
def check_running(self, fail_on_error=True):
    """Check dfuse is running.

    Run a command to verify dfuse is running on hosts where it is
    supposed to be.  Use grep -v and rc=1 here so that if it isn't,
    then we can see what is being used instead.

    Args:
        fail_on_error (bool, optional): raise instead of returning False
            when dfuse is not running. Defaults to True.

    Returns:
        bool: True if dfuse is running on all expected hosts

    """
    fuse_check = "stat -c %T -f {0} | grep -v fuseblk".format(
        self.mount_dir.value)
    retcodes = pcmd(self.running_hosts, fuse_check, expect_rc=1)
    # rc=1 (grep matched nothing) is the expected "mounted" result
    retcodes.pop(1, None)
    if retcodes:
        self.log.error('Errors checking running: %s', retcodes)
        if fail_on_error:
            raise CommandFailure('dfuse not running')
        return False
    return True
def stop(self):
    """Stop dfuse.

    Unmounts the dfuse mount point on every host, removes the mount
    point, and reports any hosts where the unmount failed.

    Raises:
        CommandFailure: In case dfuse stop fails

    """
    unmount = (
        "if [ -x '$(command -v fusermount)' ]; "
        "then fusermount -u {0}; else fusermount3 -u {0}; fi").format(
            self.mount_dir.value)
    ret_code = general_utils.pcmd(self.hosts, unmount, timeout=30)
    self.remove_mount_point()
    if 0 not in ret_code:
        failed_nodes = NodeSet(",".join(
            str(node_set) for code, node_set in ret_code.items()
            if code != 0))
        raise CommandFailure(
            "Error stopping dfuse on the following hosts: {}".format(
                failed_nodes))
def reset_storage(self):
    """Reset the server storage.

    Raises:
        ServerFailed: if there was an error resetting the storage

    """
    # Build: daos_server storage prepare --nvme-only --reset -f
    command = DaosServerCommand(self.manager.job.command_path)
    command.sudo = False
    command.debug.value = False
    command.set_sub_command("storage")
    command.sub_command_class.set_sub_command("prepare")
    prepare = command.sub_command_class.sub_command_class
    prepare.nvme_only.value = True
    prepare.reset.value = True
    prepare.force.value = True

    self.log.info("Resetting DAOS server storage: %s", str(command))
    outcome = pcmd(self._hosts, str(command), timeout=120)
    # A single {0: <all hosts>} result means success everywhere
    if 0 not in outcome or len(outcome) > 1:
        raise ServerFailed("Error resetting NVMe storage")
def run(self, host):
    """Execute Fio from the host passed on to the method.

    Args:
        host: hostname from where to launch fio

    Raises:
        CommandFailure: if fio fails on any of the hosts

    """
    # BUG FIX: the original did self.run_cmd.replace("fio", ' ') and
    # stored the result back on self.run_cmd, which (a) replaced *every*
    # occurrence of the substring "fio" (e.g. in file or job names), and
    # (b) kept mutating the stored command on repeated run() calls.
    # Strip only the first "fio" token (the program name) into a local.
    run_args = self.run_cmd.replace("fio", " ", 1).replace(" POSIX", "")
    cmd = "fio" + run_args
    print("Running: {}".format(cmd))
    ret_code = general_utils.pcmd(host, cmd)

    # check for any failures
    if 0 not in ret_code:
        error_hosts = NodeSet(
            ",".join(
                [str(node_set) for code, node_set in ret_code.items()
                 if code != 0]))
        raise CommandFailure(
            "Error starting fio on the following hosts: {}".format(
                error_hosts))
def run(self):
    """Run the dfuse command.

    Raises:
        CommandFailure: In case dfuse run command fails

    """
    self.log.info('Starting dfuse at %s', self.mount_dir.value)

    # A log file must be defined to ensure logs are captured
    if "D_LOG_FILE" not in self.env:
        # BUG FIX: corrected "varaibles" typo in the error message
        raise CommandFailure(
            "Dfuse missing environment variables for D_LOG_FILE")

    # create dfuse dir if does not exist
    self.create_mount_point()

    # run dfuse command with the environment exports prepended
    cmd = "".join([self.env.get_export_str(), self.__str__()])
    ret_code = pcmd(self.hosts, cmd, timeout=30)

    # Hosts that returned 0 are now running dfuse
    if 0 in ret_code:
        self.running_hosts.add(ret_code[0])
        del ret_code[0]

    if len(ret_code):
        error_hosts = NodeSet(
            ",".join(
                [str(node_set) for code, node_set in ret_code.items()
                 if code != 0]))
        raise CommandFailure(
            "Error starting dfuse on the following hosts: {}".format(
                error_hosts))

    # Give dfuse time to establish the mount before verifying it
    if not self.check_running(fail_on_error=False):
        self.log.info('Waiting five seconds for dfuse to start')
        time.sleep(5)
        if not self.check_running(fail_on_error=False):
            self.log.info('Waiting twenty five seconds for dfuse to start')
            time.sleep(25)
            self.check_running()
def execute_cluster_cmds(nodes, cmdlist, sudo=False):
    """Execute the list of cmds on hostlist nodes.

    Args:
        nodes (list): list of nodes
        cmdlist (list): list of cmdlines to execute
        sudo (bool, optional): execute cmd with sudo privs.
            Defaults to False.

    Returns:
        int: 0 if every command succeeded on every node; 1 at the first
            command that failed on at least one node.

    """
    for cmd in cmdlist:
        if sudo:
            cmd = "sudo {}".format(cmd)
        result = pcmd(nodes, cmd, True, None, 0)
        # if at least one node failed or all nodes failed
        # return on first failure
        if len(result) > 1 or 0 not in result:
            return 1
    return 0
def tearDown(self):
    """Tear down after each test case."""
    super().tearDown()

    # force test status !!
    # use mangling trick described at
    # https://stackoverflow.com/questions/3385317/private-variables-and-methods-in-python
    # to do so
    self._Test__status = 'PASS'

    # DAOS-1452 may need to check for one file per engine...
    ret_codes = pcmd(self.hostlist_servers, r"ls /tmp/daos_dump*.txt")

    # Report any failures
    if len(ret_codes) > 1 or 0 not in ret_codes:
        missing = [
            "{}: rc={}".format(nodes, code)
            for code, nodes in ret_codes.items() if code != 0
        ]
        print("no ULT stacks dump found on following hosts: {}".format(
            ", ".join(missing)))
        self._Test__status = 'FAIL'
def test_super_block_version_basic(self):
    """JIRA ID: DAOS-3648.

    Test Description: Basic test to verify that superblock file is
    versioned.

    :avocado: tags=all,tiny,pr,ds_versioning,basic
    """
    # Check that the superblock file exists under the scm_mount dir.
    scm_mount = self.server_managers[0].get_config_value("scm_mount")
    fname = os.path.join(scm_mount, "superblock")
    check_result = check_file_exists(self.hostlist_servers, fname)
    if not check_result[0]:
        self.fail("{}: {} not found".format(check_result[1], fname))

    # Make sure that 'version' is in the file, run task to check.
    # FIX: run grep directly on the file instead of the useless
    # "cat | grep" pipeline; grep's exit status (0 when the pattern is
    # found) is unchanged.
    cmd = "grep -F \"version\" {}".format(fname)
    result = pcmd(self.hostlist_servers, cmd, timeout=20)

    # Determine if the command completed successfully across all the hosts
    if len(result) > 1 or 0 not in result:
        self.fail("Was not able to find version in {} file".format(fname))
def remote_copy(self, hostlist, remote_dir, local_dir):
    """Copy files from remote dir to local dir.

    Args:
        hostlist (list): list of remote nodes
        remote_dir (str): remote directory of files
        local_dir (str): local directory

    Raises:
        SoakTestError: if there is an error with the remote copy

    """
    localhost = socket.gethostname()
    # Copy only when the remote dir is non-empty, then clear it out
    copy_cmd = (
        "if [ ! -z '$(ls -A {0})' ]; then "
        "scp -p -r {0}/ \"{1}:'{2}/'\" && rm -rf {0}/*; fi").format(
            remote_dir, localhost, local_dir)
    result = pcmd(NodeSet.fromlist(hostlist), copy_cmd, verbose=False)
    if len(result) > 1 or 0 not in result:
        failed = ", ".join(
            str(nodes) for code, nodes in result.items() if code != 0)
        raise SoakTestError(
            "Error executing remote copy: {}".format(failed))
def run(self):
    """Run the dfuse command.

    Raises:
        CommandFailure: In case dfuse run command fails

    """
    # create dfuse dir if does not exist
    self.create_mount_point()

    # obtain env export string and launch dfuse on every host
    env = self.get_default_env()
    ret_code = general_utils.pcmd(self.hosts, env + str(self), timeout=30)

    # check for any failures
    if 0 not in ret_code:
        failed_nodes = NodeSet(",".join(
            str(node_set) for code, node_set in ret_code.items()
            if code != 0))
        raise CommandFailure(
            "Error starting dfuse on the following hosts: {}".format(
                failed_nodes))
def get_subprocess_state(self, message=None):
    """Display the state of the subprocess.

    Args:
        message (str, optional): additional text to include in output.
            Defaults to None.

    Returns:
        list: a list of states for the process found. If the local job
            manager command is running its state will be the first in the
            list. Additional states in the list can typically indicate
            that remote processes were also found to be active.  Active
            remote processes will be indicated by a 'R' state at the end
            of the list.

    """
    # Get/display the state of the local job manager process.
    # Modernized from super(JobManager, self) to match the py3-style
    # super() used elsewhere in this file.
    state = super().get_subprocess_state(message)
    if self._process is not None and self._hosts:
        # Display the status of the remote job processes on each host
        command = "/usr/bin/pgrep -a {}".format(self.job.command_regex)
        self.log.debug(
            "%s processes still running remotely%s:", self.command,
            " {}".format(message) if message else "")
        self.log.debug("Running (on %s): %s", self._hosts, command)
        results = pcmd(self._hosts, command, True, 10, None)

        # Add a running state to the list of process states if any remote
        # process was found to be active.  The pcmd method will return a
        # dictionary with a single key, e.g. {1: <NodeSet>}, if there are
        # no remote processes running on any of the hosts.  If this value
        # is not returned, indicate there are processes running by adding
        # the "R" state to the process state list.
        if 1 not in results or len(results) > 1:
            if not state:
                state = ["?"]
            state.append("R")
    return state