def pre_tear_down(self):
    """Run test-specific cleanup steps before tearDown().

    Cancels any of this test's jobs still queued in slurm, reports the
    jobs that failed during the run, unmounts leftover dfuse mount
    points, and restores the client host list when agents were started
    locally.

    Returns:
        list: error strings collected while attempting each cleanup
            step; empty when every step succeeded

    """
    cleanup_errors = []
    # Cancel any of this test's jobs still sitting in the slurm queue.
    if self.failed_job_id_list:
        ids = " ".join(map(str, self.failed_job_id_list))
        self.log.info("<<Cancel jobs in queue with ids %s >>", ids)
        scancel = "scancel --partition {} -u {} {}".format(
            self.client_partition, self.username, ids)
        try:
            run_command(scancel)
        except DaosTestError as err:
            # scancel exited with a non-zero status; report, don't raise
            cleanup_errors.append("Failed to cancel jobs {}: {}".format(
                self.failed_job_id_list, err))
    # Summarize every job that failed over the whole soak run.
    if self.all_failed_jobs:
        failed_ids = " ,".join([str(j_id) for j_id in self.all_failed_jobs])
        cleanup_errors.append(
            "SOAK FAILED: The following jobs failed {} ".format(failed_ids))
    # Unmount any dfuse mount points the test left behind; a cleanup
    # failure here is logged but does not fail tear down.
    if self.dfuse:
        try:
            cleanup_dfuse(self)
        except SoakTestError as err:
            self.log.info("Dfuse cleanup failed with %s", err)
    # daos_agent is always started on this node when start agent is false
    if not self.setup_start_agents:
        self.hostlist_clients = [socket.gethostname().split('.', 1)[0]]
    return cleanup_errors
def run_soak(self, test_param):
    """Run the soak test specified by the test params.

    Sets up the reserved pool/container with seed data, then loops
    running job batches (optionally with harassers) until the
    configured timeout expires, and finally verifies the reserved
    container data survived the run.

    Args:
        test_param (str): test_params from yaml file

    Raises:
        SoakTestError: if the soak log directories cannot be removed
            from the client nodes or the local test node
    """
    # Reset all per-run bookkeeping state.
    self.soak_results = {}
    self.pool = []
    self.container = []
    self.harasser_results = {}
    self.harasser_args = {}
    run_harasser = False
    self.all_failed_jobs = []
    self.all_failed_harassers = []
    self.soak_errors = []
    self.check_errors = []
    self.used = []
    # Pull this test's configuration from the yaml params.
    test_to = self.params.get("test_timeout", test_param + "*")
    self.test_name = self.params.get("name", test_param + "*")
    single_test_pool = self.params.get(
        "single_test_pool", test_param + "*", True)
    harassers = self.params.get("harasserlist", test_param + "*")
    job_list = self.params.get("joblist", test_param + "*")
    if harassers:
        run_harasser = True
        self.log.info("<< Initial harasser list = %s>>", harassers)
        # Working copy; refilled from `harassers` when exhausted below.
        harasserlist = harassers[:]
    # Create the reserved pool with data
    # self.pool is a list of all the pools used in soak
    # self.pool[0] will always be the reserved pool
    add_pools(self, ["pool_reserved"])
    # Create the reserved container
    resv_cont = self.get_container(
        self.pool[0], "/run/container_reserved/*", True)
    # populate reserved container with a 500MB file
    initial_resv_file = os.path.join(
        os.environ["DAOS_TEST_LOG_DIR"], "initial", "resv_file")
    try:
        reserved_file_copy(self, initial_resv_file, self.pool[0], resv_cont,
                           num_bytes=500000000, cmd="write")
    except CommandFailure as error:
        self.fail(error)
    # Create pool for jobs; with single_test_pool the same job pool
    # (self.pool[1]) is reused for every loop iteration.
    if single_test_pool:
        add_pools(self, ["pool_jobs"])
        self.log.info(
            "Current pools: %s",
            " ".join([pool.uuid for pool in self.pool]))
    # cleanup soak log directories before test on all nodes
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients), "rm -rf {}".format(
            self.log_dir), self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Soak directories not removed"
            "from clients>>: {}".format(self.hostlist_clients))
    # cleanup test_node
    for log_dir in [self.log_dir, self.sharedlog_dir]:
        cmd = "rm -rf {}".format(log_dir)
        try:
            result = run_command(cmd, timeout=30)
        except DaosTestError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory {} was not removed>>".format(
                    log_dir)) from error
    # Baseline metrics data
    run_metrics_check(self, prefix="initial")
    # Initialize time; test_to is in hours (converted to seconds here).
    start_time = time.time()
    self.test_timeout = int(3600 * test_to)
    self.end_time = start_time + self.test_timeout
    self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
    while time.time() < self.end_time:
        # Start new pass
        start_loop_time = time.time()
        self.log.info(
            "<<SOAK LOOP %s: time until done %s>>", self.loop,
            DDHHMMSS_format(self.end_time - time.time()))
        if not single_test_pool:
            # Create a fresh pool for this loop's jobs
            add_pools(self, ["pool_jobs"])
            self.log.info(
                "Current pools: %s",
                " ".join([pool.uuid for pool in self.pool]))
        # Initialize harassers: take the next harasser from the working
        # list, cycling back to the full list when it runs out.
        if run_harasser:
            if not harasserlist:
                harasserlist = harassers[:]
            harasser = harasserlist.pop(0)
            self.harasser_args = {}
            self.harasser_results = {}
            self.harassers, self.offline_harassers = get_harassers(harasser)
        try:
            self.execute_jobs(job_list, self.pool[1])
        except SoakTestError as error:
            self.fail(error)
        # Check space after jobs done
        for pool in self.pool:
            self.dmg_command.pool_query(pool.uuid)
        # Cleanup any dfuse mounts before destroying containers
        cleanup_dfuse(self)
        self.soak_errors.extend(self.destroy_containers(self.container))
        self.container = []
        # Remove the test pools from self.pool; preserving reserved pool
        if not single_test_pool:
            self.soak_errors.extend(self.destroy_pools(self.pool[1]))
            self.pool = [self.pool[0]]
            self.log.info(
                "Current pools: %s",
                " ".join([pool.uuid for pool in self.pool]))
        # Fail if the pool/containers did not clean up correctly
        self.assertEqual(
            len(self.soak_errors), 0, "\n".join(self.soak_errors))
        # Break out of loop if smoke
        if "smoke" in self.test_name:
            break
        loop_time = time.time() - start_loop_time
        self.log.info(
            "<<LOOP %s completed in %s at %s>>", self.loop, DDHHMMSS_format(
                loop_time), time.ctime())
        # Initialize harasser loop time from first pass loop time
        if self.loop == 1 and run_harasser:
            self.harasser_loop_time = loop_time
        self.loop += 1
    # verify reserved container data by copying it back out and
    # comparing against the initial file.
    final_resv_file = os.path.join(
        os.environ["DAOS_TEST_LOG_DIR"], "final", "resv_file")
    try:
        # NOTE(review): no cmd= argument here, unlike the "write" call
        # above — presumably the helper's default is a read; confirm.
        reserved_file_copy(self, final_resv_file, self.pool[0], resv_cont)
    except CommandFailure as error:
        self.soak_errors.append(
            "<<FAILED: Soak reserved container read failed>>")
    # `cmp` is assumed to be a file comparison helper (e.g.
    # filecmp.cmp) imported elsewhere in this module — verify.
    if not cmp(initial_resv_file, final_resv_file):
        self.soak_errors.append("Data verification error on reserved pool"
                                " after SOAK completed")
    for file in [initial_resv_file, final_resv_file]:
        if os.path.isfile(file):
            # Parent directory name ("initial"/"final") distinguishes
            # the two archived copies.
            file_name = os.path.split(os.path.dirname(file))[-1]
            # save a copy of the POSIX file in self.outputsoakdir
            copy_cmd = "cp -p {} {}/{}_resv_file".format(
                file, self.outputsoakdir, file_name)
            try:
                run_command(copy_cmd, timeout=30)
            except DaosTestError as error:
                self.soak_errors.append(
                    "Reserved data file {} failed to archive".format(file))
            os.remove(file)
    # Keep the reserved container so tearDown can destroy it.
    self.container.append(resv_cont)
    # Gather the daos logs from the client nodes
    self.log.info(
        "<<<<SOAK TOTAL TEST TIME = %s>>>>", DDHHMMSS_format(
            time.time() - start_time))