def start_dfuse(self, pool):
    """Create a DfuseCommand object to start dfuse.

    Args:
        pool (obj): TestPool obj
    """
    # Get dfuse params
    self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
    self.dfuse.get_params(self)
    # update dfuse params
    self.dfuse.set_dfuse_params(pool)
    self.dfuse.set_dfuse_cont_param(self.create_dfuse_cont(pool))
    self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)
    # create dfuse mount point
    cmd = "mkdir -p {}".format(self.dfuse.mount_dir.value)
    params = self.srun_params
    params["export"] = "all"
    params["ntasks-per-node"] = 1
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients), cmd, params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Dfuse mountpoint {} not created>>".format(
                self.dfuse.mount_dir.value))
    cmd = str(self.dfuse)
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients), cmd, params)
    if result.exit_status > 0:
        raise SoakTestError("<<FAILED: Dfuse failed to start>>")
def get_remote_logs(self):
    """Copy files from remote dir to local dir.

    Raises:
        SoakTestError: if there is an error with the remote copy
    """
    # copy the files from the remote
    # TO-DO: change scp
    this_host = socket.gethostname()
    command = "/usr/bin/rsync -avtr --min-size=1B {0} {1}:{0}/..".format(
        self.test_log_dir, this_host)
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients), command, self.srun_params)
    if result.exit_status == 0:
        command = "/usr/bin/cp -R -p {0}/ \'{1}\'".format(
            self.test_log_dir, self.outputsoakdir)
        try:
            run_command(command, timeout=30)
        except DaosTestError as error:
            raise SoakTestError(
                "<<FAILED: Soak remote logfiles not copied to avocado data "
                "dir {} - check /tmp/soak on nodes {}>>".format(
                    error, self.hostlist_clients))
        command = "/usr/bin/rm -rf {0}/*".format(self.test_log_dir)
        slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients), command,
            self.srun_params)
        run_command(command)
    else:
        raise SoakTestError(
            "<<FAILED: Soak remote logfiles not copied from clients>>: "
            "{}".format(self.hostlist_clients))
def cleanup_dfuse(self):
    """Cleanup and remove any dfuse mount points.

    Args:
        self (obj): soak obj
    """
    # kill any dfuse processes still running on the clients
    cmd = [
        "/usr/bin/bash -c 'for pid in $(pgrep dfuse)",
        "do sudo kill $pid",
        "done'"
    ]
    # unmount and remove any dfuse mount points left behind
    cmd2 = [
        "/usr/bin/bash -c 'for dir in $(find /tmp/daos_dfuse/)",
        "do fusermount3 -uz $dir",
        "rm -rf $dir",
        "done'"
    ]
    try:
        slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients),
            ";".join(cmd), self.srun_params, timeout=180)
    except slurm_utils.SlurmFailed as error:
        self.log.info("Dfuse processes not stopped: %s", error)
    try:
        slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients),
            ";".join(cmd2), self.srun_params, timeout=180)
    except slurm_utils.SlurmFailed as error:
        self.log.info("Dfuse mountpoints not deleted: %s", error)
def cleanup_dfuse(self):
    """Cleanup and remove any dfuse mount points."""
    cmd = [
        "/usr/bin/bash -c 'pkill dfuse",
        "for dir in /tmp/daos_dfuse*",
        "do fusermount3 -uz $dir",
        "rm -rf $dir",
        "done'"
    ]
    try:
        slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients),
            ";".join(cmd), self.srun_params)
    except slurm_utils.SlurmFailed as error:
        self.log.info(
            "<<FAILED: Dfuse directories not deleted %s >>", error)
def get_remote_logs(self):
    """Copy files from remote dir to local dir.

    Raises:
        SoakTestError: if there is an error with the remote copy
    """
    # copy the files from the remote
    # TO-DO: change scp
    this_host = socket.gethostname()
    rsync_str = "rsync -avtr --min-size=1B"
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "bash -c \"{0} {1} {2}:{1}/.. && rm -rf {1}/*\"".format(
            rsync_str, self.test_log_dir, this_host),
        self.srun_params)
    if result.exit_status == 0:
        cmd = "cp -R -p {0}/ \'{1}\'; rm -rf {0}/*".format(
            self.test_log_dir, self.outputsoakdir)
        try:
            result = process.run(cmd, shell=True, timeout=30)
        except process.CmdError as error:
            raise SoakTestError(
                "<<FAILED: Soak remote logfiles not copied "
                "to avocado data dir {} - check /tmp/soak "
                "on nodes {}>>".format(error, self.hostlist_clients))
    else:
        raise SoakTestError(
            "<<FAILED: Soak remote logfiles not copied "
            "from clients>>: {}".format(self.hostlist_clients))
def execute_jobs(self, jobs, pools):
    """Execute the overall soak test.

    Args:
        jobs (list): list of jobs to run
        pools (list): list of TestPool obj - self.pool[1:]

    Raises:
        SoakTestError
    """
    cmdlist = []
    # unique numbers per pass
    self.used = []
    # Update the remote log directories from new loop/pass
    self.sharedsoakdir = self.sharedlog_dir + "/pass" + str(self.loop)
    self.test_log_dir = self.log_dir + "/pass" + str(self.loop)
    local_pass_dir = self.outputsoakdir + "/pass" + str(self.loop)
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "mkdir -p {}".format(self.test_log_dir), self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: logfile directory not "
            "created on clients>>: {}".format(self.hostlist_clients))
    # Create local log directory
    os.makedirs(local_pass_dir)
    os.makedirs(self.sharedsoakdir)
    # Setup cmdlines for job with specified pool
    if len(pools) < len(jobs):
        raise SoakTestError(
            "<<FAILED: There are not enough pools to run this test>>")
    for index, job in enumerate(jobs):
        cmdlist.extend(self.job_setup(job, pools[index]))
    # Gather the job_ids
    job_id_list = self.job_startup(cmdlist)
    # Initialize the failed_job_list to job_list so that any
    # unexpected failures will clear the squeue in tearDown
    self.failed_job_id_list = job_id_list
    # launch harassers if defined and enabled
    if self.h_list and self.loop > 1:
        self.log.info("<<Harassers are enabled>>")
        self.launch_harassers(self.h_list, pools)
        if not self.harasser_completion(self.harasser_timeout):
            raise SoakTestError("<<FAILED: Harassers failed>>")
        # rebuild can only run once for now
        if self.is_harasser("rebuild"):
            self.h_list.remove("rebuild")
    # Wait for jobs to finish and cancel/kill jobs if necessary
    self.failed_job_id_list = self.job_completion(job_id_list)
    # Log the failing job ID
    if self.failed_job_id_list:
        self.log.info(
            "<<FAILED: The following jobs failed %s >>",
            ", ".join(str(j_id) for j_id in self.failed_job_id_list))
        # accumulate failing job IDs
        self.all_failed_jobs.extend(self.failed_job_id_list)
        # clear out the failed jobs for this pass
        self.failed_job_id_list = []
def get_remote_logs(self):
    """Copy files from remote dir to local dir.

    Args:
        self (obj): soak obj

    Raises:
        SoakTestError: if there is an error with the remote copy
    """
    # copy the files from the client nodes to a shared directory
    command = "/usr/bin/rsync -avtr --min-size=1B {0} {1}/..".format(
        self.test_log_dir, self.sharedsoakdir)
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients), command, self.srun_params)
    if result.exit_status == 0:
        # copy the local logs and the logs in the shared dir to avocado dir
        for directory in [self.test_log_dir, self.sharedsoakdir]:
            command = "/usr/bin/cp -R -p {0}/ \'{1}\'".format(
                directory, self.outputsoakdir)
            try:
                run_command(command, timeout=30)
            except DaosTestError as error:
                raise SoakTestError(
                    "<<FAILED: job logs failed to copy {}>>".format(error))
        # remove the remote soak logs for this pass
        command = "/usr/bin/rm -rf {0}".format(self.test_log_dir)
        slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients), command,
            self.srun_params)
        # remove the local log for this pass
        for directory in [self.test_log_dir, self.sharedsoakdir]:
            command = "/usr/bin/rm -rf {0}".format(directory)
            try:
                run_command(command)
            except DaosTestError as error:
                raise SoakTestError(
                    "<<FAILED: job logs failed to delete {}>>".format(error))
    else:
        raise SoakTestError(
            "<<FAILED: Soak remote logfiles not copied "
            "from clients>>: {}".format(self.hostlist_clients))
def execute_jobs(self, jobs, pools):
    """Execute the overall soak test.

    Args:
        jobs (list): list of jobs to run
        pools (list): list of TestPool obj - self.pool[1:]

    Raises:
        SoakTestError
    """
    # Update the remote log directories from new loop/pass
    self.sharedsoaktest_dir = self.sharedsoak_dir + "/pass" + str(self.loop)
    self.soaktest_dir = self.soak_dir + "/pass" + str(self.loop)
    outputsoaktest_dir = self.outputsoak_dir + "/pass" + str(self.loop)
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "mkdir -p {}".format(self.soaktest_dir), self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: logfile directory not "
            "created on clients>>: {}".format(self.hostlist_clients))
    # Create local avocado log directory for this pass
    os.makedirs(outputsoaktest_dir)
    # Create shared log directory for this pass
    os.makedirs(self.sharedsoaktest_dir)
    # Create local test log directory for this pass
    os.makedirs(self.soaktest_dir)
    # create the batch scripts
    job_script_list = self.job_setup(jobs, pools)
    # randomize job list
    random.seed(4)
    random.shuffle(job_script_list)
    # Gather the job_ids
    job_id_list = self.job_startup(job_script_list)
    # Initialize the failed_job_list to job_list so that any
    # unexpected failures will clear the squeue in tearDown
    self.failed_job_id_list = job_id_list
    # Wait for jobs to finish and cancel/kill jobs if necessary
    self.failed_job_id_list = self.job_completion(job_id_list)
    # Log the failing job ID
    if self.failed_job_id_list:
        self.log.info(
            "<<FAILED: The following jobs failed %s >>",
            ", ".join(str(j_id) for j_id in self.failed_job_id_list))
        # accumulate failing job IDs
        self.all_failed_jobs.extend(self.failed_job_id_list)
        # clear out the failed jobs for this pass
        self.failed_job_id_list = []
def run_soak(self, test_param):
    """Run the soak test specified by the test params.

    Args:
        test_param (str): test_params from yaml file
    """
    self.soak_results = {}
    self.pool = []
    self.container = []
    self.harasser_results = {}
    self.harasser_args = {}
    run_harasser = False
    self.all_failed_jobs = []
    self.all_failed_harassers = []
    self.soak_errors = []
    self.check_errors = []
    self.used = []
    test_to = self.params.get("test_timeout", test_param + "*")
    self.test_name = self.params.get("name", test_param + "*")
    single_test_pool = self.params.get(
        "single_test_pool", test_param + "*", True)
    self.dmg_command.copy_certificates(
        get_log_file("daosCA/certs"), self.hostlist_clients)
    self.dmg_command.copy_configuration(self.hostlist_clients)
    harassers = self.params.get("harasserlist", test_param + "*")
    job_list = self.params.get("joblist", test_param + "*")
    if harassers:
        run_harasser = True
        self.log.info("<< Initial harasser list = %s>>", harassers)
        harasserlist = harassers[:]
    # Create the reserved pool with data
    # self.pool is a list of all the pools used in soak
    # self.pool[0] will always be the reserved pool
    add_pools(self, ["pool_reserved"])
    # Create the reserved container
    resv_cont = self.get_container(
        self.pool[0], "/run/container_reserved/*", True)
    # populate reserved container with a 500MB file
    initial_resv_file = os.path.join(
        os.environ["DAOS_TEST_LOG_DIR"], "initial", "resv_file")
    try:
        reserved_file_copy(self, initial_resv_file, self.pool[0], resv_cont,
                           num_bytes=500000000, cmd="write")
    except CommandFailure as error:
        raise SoakTestError(
            "<<FAILED: Soak reserved container write failed>>") from error
    # Create pool for jobs
    if single_test_pool:
        add_pools(self, ["pool_jobs"])
        self.log.info("Current pools: %s",
                      " ".join([pool.uuid for pool in self.pool]))
    # cleanup soak log directories before test on all nodes
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "rm -rf {}".format(self.log_dir), self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Soak directories not removed "
            "from clients>>: {}".format(self.hostlist_clients))
    # cleanup test_node
    for log_dir in [self.log_dir, self.sharedlog_dir]:
        cmd = "rm -rf {}".format(log_dir)
        try:
            result = run_command(cmd, timeout=30)
        except DaosTestError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory {} was not removed>>".format(
                    log_dir)) from error
    # Initialize time
    start_time = time.time()
    self.test_timeout = int(3600 * test_to)
    self.end_time = start_time + self.test_timeout
    self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
    while time.time() < self.end_time:
        # Start new pass
        start_loop_time = time.time()
        self.log.info(
            "<<SOAK LOOP %s: time until done %s>>", self.loop,
            DDHHMMSS_format(self.end_time - time.time()))
        if not single_test_pool:
            # Create pool for jobs
            add_pools(self, ["pool_jobs"])
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
        # Initialize harassers
        if run_harasser:
            if not harasserlist:
                harasserlist = harassers[:]
            harasser = harasserlist.pop(0)
            self.harasser_args = {}
            self.harasser_results = {}
            self.harassers, self.offline_harassers = get_harassers(harasser)
        try:
            self.execute_jobs(job_list, self.pool[1])
        except SoakTestError as error:
            self.fail(error)
        # Check space after jobs done
        for pool in self.pool:
            self.dmg_command.pool_query(pool.uuid)
        self.soak_errors.extend(self.destroy_containers(self.container))
        self.container = []
        # Remove the test pools from self.pool; preserving reserved pool
        if not single_test_pool:
            self.soak_errors.extend(self.destroy_pools(self.pool[1]))
            self.pool = [self.pool[0]]
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
        # Fail if the pool/containers did not clean up correctly
        self.assertEqual(
            len(self.soak_errors), 0, "\n".join(self.soak_errors))
        # Break out of loop if smoke
        if "smoke" in self.test_name:
            break
        loop_time = time.time() - start_loop_time
        self.log.info(
            "<<LOOP %s completed in %s at %s>>", self.loop,
            DDHHMMSS_format(loop_time), time.ctime())
        # Initialize harasser loop time from first pass loop time
        if self.loop == 1 and run_harasser:
            self.harasser_loop_time = loop_time
        self.loop += 1
    # verify reserved container data
    final_resv_file = os.path.join(
        os.environ["DAOS_TEST_LOG_DIR"], "final", "resv_file")
    try:
        reserved_file_copy(self, final_resv_file, self.pool[0], resv_cont)
    except CommandFailure as error:
        raise SoakTestError(
            "<<FAILED: Soak reserved container read failed>>") from error
    if not cmp(initial_resv_file, final_resv_file):
        self.soak_errors.append(
            "Data verification error on reserved pool after SOAK completed")
    for file in [initial_resv_file, final_resv_file]:
        if os.path.isfile(file):
            file_name = os.path.split(os.path.dirname(file))[-1]
            # save a copy of the POSIX file in self.outputsoakdir
            copy_cmd = "cp -p {} {}/{}_resv_file".format(
                file, self.outputsoakdir, file_name)
            try:
                run_command(copy_cmd, timeout=30)
            except DaosTestError as error:
                self.soak_errors.append(
                    "Reserved data file {} failed to archive".format(file))
            os.remove(file)
    self.container.append(resv_cont)
    # Gather the daos logs from the client nodes
    self.log.info(
        "<<<<SOAK TOTAL TEST TIME = %s>>>>",
        DDHHMMSS_format(time.time() - start_time))
def run_soak(self, test_param):
    """Run the soak test specified by the test params.

    Args:
        test_param (str): test_params from yaml file
    """
    self.soak_results = {}
    self.pool = []
    self.container = []
    self.harasser_results = {}
    self.harasser_args = {}
    run_harasser = False
    self.all_failed_jobs = []
    self.all_failed_harassers = []
    self.soak_errors = []
    test_to = self.params.get("test_timeout", test_param + "*")
    self.job_timeout = self.params.get("job_timeout", test_param + "*")
    self.test_name = self.params.get("name", test_param + "*")
    self.nodesperjob = self.params.get("nodesperjob", test_param + "*")
    self.taskspernode = self.params.get("taskspernode", test_param + "*")
    harassers = self.params.get("harasserlist", test_param + "*")
    job_list = self.params.get("joblist", test_param + "*")
    rank = self.params.get("rank", "/run/container_reserved/*")
    obj_class = self.params.get("oclass", "/run/container_reserved/*")
    if harassers:
        harasserlist = get_harassers(harassers)
        self.harassers = harasserlist[:]
        run_harasser = True
        self.log.info("<< Initial harasser list = %s>>",
                      " ".join([harasser for harasser in self.harassers]))
    # Create the reserved pool with data
    # self.pool is a list of all the pools used in soak
    # self.pool[0] will always be the reserved pool
    add_pools(self, ["pool_reserved"])
    self.pool[0].connect()
    # Create the container and populate with a known data
    # TO-DO: use IOR to write and later read verify the data
    resv_cont = self.get_container(
        self.pool[0], "/run/container_reserved/*", True)
    resv_cont.write_objects(rank, obj_class)
    # cleanup soak log directories before test on all nodes
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "rm -rf {}".format(self.log_dir), self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Soak directories not removed "
            "from clients>>: {}".format(self.hostlist_clients))
    # cleanup test_node
    for log_dir in [self.log_dir, self.sharedlog_dir]:
        cmd = "rm -rf {}".format(log_dir)
        try:
            result = run_command(cmd, timeout=30)
        except DaosTestError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory {} was not removed {}>>".format(
                    log_dir, error))
    # Initialize time
    start_time = time.time()
    self.test_timeout = int(3600 * test_to)
    self.end_time = start_time + self.test_timeout
    self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
    while time.time() < self.end_time:
        # Start new pass
        start_loop_time = time.time()
        self.log.info(
            "<<SOAK LOOP %s: time until done %s>>", self.loop,
            DDHHMMSS_format(self.end_time - time.time()))
        # Create pool for jobs
        add_pools(self, ["pool_jobs"])
        self.log.info("Current pools: %s",
                      " ".join([pool.uuid for pool in self.pool]))
        # Initialize harassers
        if run_harasser and not self.harassers:
            self.harasser_results = {}
            self.harasser_args = {}
            self.harassers = harasserlist[:]
        try:
            self.execute_jobs(job_list, self.pool[1])
        except SoakTestError as error:
            self.fail(error)
        # Check space after jobs done
        for pool in self.pool:
            self.dmg_command.pool_query(pool.uuid)
        self.soak_errors.extend(self.destroy_containers(self.container))
        self.soak_errors.extend(self.destroy_pools(self.pool[1]))
        # remove the test pools from self.pool; preserving reserved pool
        self.container = []
        self.pool = [self.pool[0]]
        self.log.info("Current pools: %s",
                      " ".join([pool.uuid for pool in self.pool]))
        # fail if the pool/containers did not clean up correctly
        self.assertEqual(
            len(self.soak_errors), 0, "\n".join(self.soak_errors))
        # Break out of loop if smoke
        if "smoke" in self.test_name:
            break
        loop_time = time.time() - start_loop_time
        self.log.info(
            "<<LOOP %s completed in %s at %s>>", self.loop,
            DDHHMMSS_format(loop_time), time.ctime())
        # Initialize harasser loop time from first pass loop time
        if self.loop == 1 and self.harassers:
            self.harasser_loop_time = loop_time
        self.loop += 1
    # TO-DO: use IOR
    if not resv_cont.read_objects():
        self.soak_errors.append(
            "Data verification error on reserved pool after SOAK completed")
    self.container.append(resv_cont)
    # gather the daos logs from the client nodes
    self.log.info(
        "<<<<SOAK TOTAL TEST TIME = %s>>>>",
        DDHHMMSS_format(time.time() - start_time))
def run_soak(self, test_param):
    """Run the soak test specified by the test params.

    Args:
        test_param (str): test_params from yaml file
    """
    self.soak_results = {}
    self.pool = []
    self.harasser_joblist = []
    self.harasser_results = {}
    test_to = self.params.get("test_timeout", test_param)
    self.job_timeout = self.params.get("job_timeout", test_param)
    self.harasser_timeout = self.params.get("harasser_timeout", test_param)
    self.test_name = self.params.get("name", test_param)
    self.nodesperjob = self.params.get("nodesperjob", test_param)
    self.test_iteration = self.params.get("iteration", test_param)
    self.task_list = self.params.get("taskspernode", test_param + "*")
    self.h_list = self.params.get("harasserlist", test_param + "*")
    job_list = self.params.get("joblist", test_param + "*")
    pool_list = self.params.get("poollist", test_param + "*")
    rank = self.params.get("rank", "/run/container_reserved/*")
    if self.is_harasser("rebuild"):
        obj_class = "_".join(["OC", str(
            self.params.get("daos_oclass", "/run/rebuild/*")[0])])
    else:
        obj_class = self.params.get(
            "object_class", "/run/container_reserved/*")
    # Create the reserved pool with data
    # self.pool is a list of all the pools used in soak
    # self.pool[0] will always be the reserved pool
    self.add_pools(["pool_reserved"])
    self.pool[0].connect()
    # Create the container and populate with a known data
    # TO-DO: use IOR to write and later read verify the data
    self.container = TestContainer(self.pool[0])
    self.container.namespace = "/run/container_reserved/*"
    self.container.get_params(self)
    self.container.create()
    self.container.write_objects(rank, obj_class)
    self.all_failed_jobs = []
    # cleanup soak log directories before test on all nodes
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "rm -rf {}".format(self.log_dir), self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Soak directories not removed "
            "from clients>>: {}".format(self.hostlist_clients))
    # cleanup test_node
    for log_dir in [self.log_dir, self.sharedlog_dir]:
        cmd = "rm -rf {}".format(log_dir)
        try:
            result = run_command(cmd, timeout=30)
        except DaosTestError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory {} was not removed {}>>".format(
                    log_dir, error))
    # Initialize time
    start_time = time.time()
    self.test_timeout = int(3600 * test_to)
    self.end_time = start_time + self.test_timeout
    self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
    while time.time() < self.end_time:
        # Start new pass
        start_loop_time = time.time()
        self.log.info(
            "<<Soak1 PASS %s: time until done %s>>", self.loop,
            DDHHMMSS_format(self.end_time - time.time()))
        # Create all specified pools
        self.add_pools(pool_list)
        self.log.info(
            "Current pools: %s",
            " ".join([pool.uuid for pool in self.pool]))
        try:
            self.execute_jobs(job_list, self.pool[1:])
        except SoakTestError as error:
            self.fail(error)
        errors = self.destroy_pools(self.pool[1:])
        # remove the test pools from self.pool; preserving reserved pool
        self.pool = [self.pool[0]]
        self.log.info(
            "Current pools: %s",
            " ".join([pool.uuid for pool in self.pool]))
        self.assertEqual(len(errors), 0, "\n".join(errors))
        # Break out of loop if smoke
        if "smoke" in self.test_name:
            break
        loop_time = time.time() - start_loop_time
        self.log.info(
            "<<PASS %s completed in %s >>", self.loop,
            DDHHMMSS_format(loop_time))
        self.loop += 1
    # TO-DO: use IOR
    self.assertTrue(
        self.container.read_objects(),
        "Data verification error on reserved pool after SOAK completed")
    # gather the daos logs from the client nodes
    self.log.info(
        "<<<<SOAK TOTAL TEST TIME = %s>>>>",
        DDHHMMSS_format(time.time() - start_time))
def run_soak(self, test_param):
    """Run the soak test specified by the test params.

    Args:
        test_param (str): test_params from yaml file
    """
    self.soak_results = {}
    self.pool = []
    self.container = []
    self.harasser_results = {}
    self.harasser_args = {}
    run_harasser = False
    self.all_failed_jobs = []
    self.all_failed_harassers = []
    self.soak_errors = []
    self.check_errors = []
    self.used = []
    self.mpi_module = self.params.get(
        "mpi_module", "/run/*", default="mpi/mpich-x86_64")
    enable_sudo = self.params.get("enable_sudo", "/run/*", default=True)
    test_to = self.params.get("test_timeout", test_param + "*")
    self.test_name = self.params.get("name", test_param + "*")
    single_test_pool = self.params.get(
        "single_test_pool", test_param + "*", True)
    harassers = self.params.get("harasserlist", test_param + "*")
    job_list = self.params.get("joblist", test_param + "*")
    resv_bytes = self.params.get("resv_bytes", test_param + "*", 500000000)
    ignore_soak_errors = self.params.get(
        "ignore_soak_errors", test_param + "*", False)
    self.sudo_cmd = "sudo" if enable_sudo else ""
    if harassers:
        run_harasser = True
        self.log.info("<< Initial harasser list = %s>>", harassers)
        harasserlist = harassers[:]
    # Create the reserved pool with data
    # self.pool is a list of all the pools used in soak
    # self.pool[0] will always be the reserved pool
    add_pools(self, ["pool_reserved"])
    # Create the reserved container
    self.resv_cont = self.get_container(
        self.pool[0], "/run/container_reserved/*", True)
    # populate reserved container with a 500MB file unless test is smoke
    self.initial_resv_file = os.path.join(
        self.test_dir, "initial", "resv_file")
    try:
        reserved_file_copy(self, self.initial_resv_file, self.pool[0],
                           self.resv_cont, num_bytes=resv_bytes, cmd="write")
    except CommandFailure as error:
        self.fail(error)
    # Create pool for jobs
    if single_test_pool:
        add_pools(self, ["pool_jobs"])
        self.log.info("Current pools: %s",
                      " ".join([pool.uuid for pool in self.pool]))
    # cleanup soak log directories before test on all nodes
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "rm -rf {}".format(self.soak_dir), self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Soak directories not removed "
            "from clients>>: {}".format(self.hostlist_clients))
    # cleanup test_node
    for log_dir in [self.soak_dir, self.sharedsoak_dir]:
        cmd = "rm -rf {}".format(log_dir)
        try:
            result = run_command(cmd, timeout=30)
        except DaosTestError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory {} was not removed>>".format(
                    log_dir)) from error
    # Baseline metrics data
    run_metrics_check(self, prefix="initial")
    # Initialize time
    self.start_time = time.time()
    self.test_timeout = int(3600 * test_to)
    self.end_time = self.start_time + self.test_timeout
    self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
    while time.time() < self.end_time:
        # Start new pass
        start_loop_time = time.time()
        self.log.info(
            "<<SOAK LOOP %s: time until done %s>>", self.loop,
            DDHHMMSS_format(self.end_time - time.time()))
        if not single_test_pool:
            # Create pool for jobs
            add_pools(self, ["pool_jobs"])
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
        # Initialize harassers
        if run_harasser:
            if not harasserlist:
                harasserlist = harassers[:]
            harasser = harasserlist.pop(0)
            self.harasser_args = {}
            self.harasser_results = {}
            self.harassers, self.offline_harassers = get_harassers(harasser)
        try:
            self.execute_jobs(job_list, self.pool[1])
        except SoakTestError as error:
            self.fail(error)
        # Check space after jobs done
        for pool in self.pool:
            self.dmg_command.pool_query(pool.uuid)
        # Cleanup any dfuse mounts before destroying containers
        cleanup_dfuse(self)
        self.soak_errors.extend(self.destroy_containers(self.container))
        self.container = []
        # Remove the test pools from self.pool; preserving reserved pool
        if not single_test_pool:
            self.soak_errors.extend(self.destroy_pools(self.pool[1:]))
            self.pool = [self.pool[0]]
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
        # Gather metrics data after jobs complete
        run_metrics_check(self)
        # Fail if the pool/containers did not clean up correctly
        if not ignore_soak_errors:
            self.assertEqual(
                len(self.soak_errors), 0, "\n".join(self.soak_errors))
        # Break out of loop if smoke
        if "smoke" in self.test_name:
            break
        loop_time = time.time() - start_loop_time
        self.log.info(
            "<<LOOP %s completed in %s at %s>>", self.loop,
            DDHHMMSS_format(loop_time), time.ctime())
        # Initialize harasser loop time from first pass loop time
        if self.loop == 1 and run_harasser:
            self.harasser_loop_time = loop_time
        self.loop += 1
    self.log.info(
        "<<<<SOAK TOTAL TEST TIME = %s>>>>",
        DDHHMMSS_format(time.time() - self.start_time))