def get_ior_job_manager_command(self, custom_ior_cmd=None):
    """Get the MPI job manager command for IOR.

    Args:
        custom_ior_cmd (IorCommand): Custom IorCommand instance to create
            the job_manager with.

    Returns:
        Mpirun: the job manager command object for running IOR

    """
    # Initialize MpioUtils if IOR is running in MPIIO, POSIX, DFS, or
    # HDF5 mode
    if self.ior_cmd.api.value in ["MPIIO", "POSIX", "DFS", "HDF5"]:
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")
    else:
        self.fail("Unsupported IOR API")

    if custom_ior_cmd:
        self.job_manager = Mpirun(custom_ior_cmd, self.subprocess, "mpich")
    else:
        self.job_manager = Mpirun(self.ior_cmd, self.subprocess, "mpich")

    return self.job_manager
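# A minimal usage sketch, assuming an IorTestBase subclass in which
# setUp() has populated self.ior_cmd and self.processes; the test name
# and the "DFS" override are illustrative, not part of the suite.
def test_custom_ior_run(self):
    """Run IOR through the job manager with a customized command."""
    custom_cmd = IorCommand()
    custom_cmd.get_params(self)
    custom_cmd.api.update("DFS")
    manager = self.get_ior_job_manager_command(custom_ior_cmd=custom_cmd)
    self.run_ior(manager, self.processes)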
def setUp(self):
    """Set up each test case."""
    super(MacsioTestBase, self).setUp()
    # Support using different job managers to launch the daos agent/servers
    mpi_type = self.params.get("mpi_type", default="mpich")
    self.manager = Mpirun(None, subprocess=False, mpitype=mpi_type)
    self.macsio = self.get_macsio_command()
def ior_thread(self, pool, oclass, api, test, flags, results):
    """Run an IOR command in a thread and report the result via a queue.

    Args:
        pool (object): pool handle
        oclass (str): IOR object class
        api (str): IOR api
        test (list): IOR test sequence
        flags (str): IOR flags
        results (queue): queue for returning thread results

    Returns:
        None

    """
    processes = self.params.get("slots", "/run/ior/clientslots/*")
    container_info = {}
    mpio_util = MpioUtils()
    if mpio_util.mpich_installed(self.hostlist_clients) is False:
        self.fail(
            "Exiting Test: Mpich not installed on: {}".format(
                self.hostlist_clients))
    self.pool = pool

    # Define the arguments for the ior_runner_thread method
    ior_cmd = IorCommand()
    ior_cmd.get_params(self)
    ior_cmd.set_daos_params(self.server_group, self.pool)
    ior_cmd.daos_oclass.update(oclass)
    ior_cmd.api.update(api)
    ior_cmd.transfer_size.update(test[2])
    ior_cmd.block_size.update(test[3])
    ior_cmd.flags.update(flags)

    container_info["{}{}{}".format(oclass, api, test[2])] = str(uuid.uuid4())

    # Define the job manager for the IOR command
    manager = Mpirun(ior_cmd, mpitype="mpich")
    manager.job.daos_cont.update(
        container_info["{}{}{}".format(oclass, api, test[2])])
    env = ior_cmd.get_default_env(str(manager))
    manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    manager.assign_processes(processes)
    manager.assign_environment(env, True)

    # run IOR Command
    try:
        manager.run()
    except CommandFailure as _error:
        results.put("FAIL")
def run(self, tmp, processes):  # pylint: disable=arguments-differ
    """Run the dcp command.

    Args:
        tmp (str): path for hostfiles
        processes (int): number of processes for the dcp command

    Returns:
        CmdResult: Object that contains exit status, stdout, and other
            information.

    Raises:
        CommandFailure: In case dcp run command fails

    """
    self.log.info('Starting dcp')

    # Handle compatibility
    if not self.has_src_pool:
        src_pool = self.daos_src_pool.value
        src_cont = self.daos_src_cont.value
        src_path = self.src_path.value
        dst_pool = self.daos_dst_pool.value
        dst_cont = self.daos_dst_cont.value
        dst_path = self.dst_path.value
        if src_pool or src_cont:
            self.log.info(
                "Converting --daos-src-pool to daos://pool/cont/path")
            src_path = "daos://{}/{}/{}".format(src_pool, src_cont, src_path)
            self.src_path.update(src_path)
            self.daos_src_pool.update(None)
            self.daos_src_cont.update(None)
        if dst_pool or dst_cont:
            self.log.info(
                "Converting --daos-dst-pool to daos://pool/cont/path")
            dst_path = "daos://{}/{}/{}".format(dst_pool, dst_cont, dst_path)
            self.dst_path.update(dst_path)
            self.daos_dst_pool.update(None)
            self.daos_dst_cont.update(None)
    if self.has_bufsize:
        blocksize = self.blocksize.value
        if blocksize:
            self.log.info("Converting --blocksize to --bufsize")
            self.blocksize.update(None)
            self.bufsize.update(blocksize)

    # Get job manager cmd
    mpirun = Mpirun(self, mpitype="mpich")
    mpirun.assign_hosts(self.hosts, tmp)
    mpirun.assign_processes(processes)
    mpirun.exit_status_exception = self.exit_status_exception

    # run dcp
    out = mpirun.run()

    return out
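# The daos:// conversion above can be exercised on its own. A
# self-contained sketch (the helper name is illustrative, not part of
# dcp) that mirrors the format string used in run(), where the path
# component may be empty or may carry its own leading slash:
def to_daos_url(pool, cont, path=""):
    """Build the daos://pool/cont/path form that dcp accepts."""
    return "daos://{}/{}/{}".format(pool, cont, path)

# e.g. to_daos_url("<pool_uuid>", "<cont_uuid>", "testfile")
#      -> "daos://<pool_uuid>/<cont_uuid>/testfile"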
def run(self, processes, job_manager):  # pylint: disable=arguments-differ
    """Run the MpiFileUtils command.

    Args:
        processes (int): number of processes for the command.
        job_manager: job manager variable; it is re-assigned to the
            Mpirun instance created here.

    Returns:
        CmdResult: Object that contains exit status, stdout, and other
            information.

    Raises:
        CommandFailure: In case run command fails.

    """
    self.log.info('Starting %s', str(self.command).lower())

    # Get job manager cmd
    job_manager = Mpirun(self, mpi_type="mpich")
    job_manager.assign_hosts(self.hosts, self.tmp)
    job_manager.assign_processes(processes)
    job_manager.exit_status_exception = self.exit_status_exception

    # Run the command
    out = job_manager.run()

    return out
def run(self, tmp, processes):  # pylint: disable=arguments-differ
    """Run the dsync command.

    Args:
        tmp (str): path for hostfiles
        processes (int): number of processes for the dsync command

    Returns:
        CmdResult: Object that contains exit status, stdout, and other
            information.

    Raises:
        CommandFailure: In case dsync run command fails

    """
    self.log.info('Starting dsync')

    # Get job manager cmd
    mpirun = Mpirun(self, mpitype="mpich")
    mpirun.assign_hosts(self.hosts, tmp)
    mpirun.assign_processes(processes)
    mpirun.exit_status_exception = self.exit_status_exception

    # run dsync
    out = mpirun.run()

    return out
def ior_thread(self, pool, oclass, api, test, flags, results):
    """Invoke IOR through the MPI job manager from within a thread.

    Args:
        pool (object): pool handle
        oclass (str): IOR object class
        api (str): IOR api
        test (list): IOR test sequence
        flags (str): IOR flags
        results (queue): queue for returning thread results
    """
    processes = self.params.get("slots", "/run/ior/clientslots/*")
    mpio_util = MpioUtils()
    if mpio_util.mpich_installed(self.hostlist_clients) is False:
        self.fail("Exiting Test: Mpich not installed")
    self.pool = pool

    # Define the arguments for the ior_runner_thread method
    ior_cmd = IorCommand()
    ior_cmd.get_params(self)
    ior_cmd.set_daos_params(self.server_group, self.pool)
    ior_cmd.dfs_oclass.update(oclass)
    ior_cmd.api.update(api)
    ior_cmd.transfer_size.update(test[0])
    ior_cmd.block_size.update(test[1])
    ior_cmd.flags.update(flags)
    if "-w" in flags:
        self.container_info["{}{}{}".format(
            oclass, api, test[0])] = str(uuid.uuid4())

    # Define the job manager for the IOR command
    manager = Mpirun(ior_cmd, mpitype="mpich")
    key = "".join([oclass, api, str(test[0])])
    manager.job.dfs_cont.update(self.container_info[key])
    env = ior_cmd.get_default_env(str(manager))
    manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    manager.assign_processes(processes)
    manager.assign_environment(env, True)

    # run IOR Command
    try:
        manager.run()
    except CommandFailure as _error:
        results.put("FAIL")
def run_test(self):
    """Run the HDF5 VOL testsuites.

    Raises:
        VolFailed: for an invalid test name or test execution failure

    """
    # initialize test specific variables
    mpi_type = self.params.get("mpi_type", default="mpich")
    test_repo = self.params.get("daos_vol_repo")
    plugin_path = self.params.get("plugin_path")
    # test_list = self.params.get("daos_vol_tests", default=[])
    testname = self.params.get("testname")
    client_processes = self.params.get("client_processes")

    # create pool, container and dfuse mount
    self.add_pool(connect=False)
    self.add_container(self.pool)

    # VOL needs to run from a file system that supports xattr. Currently
    # nfs does not have this attribute, so it was recommended to create a
    # dfuse dir and run the vol tests from there.
    # create dfuse container
    self.start_dfuse(self.hostlist_clients, self.pool, self.container)

    # for test_param in test_list:
    #     testname = test_param[0][1]
    #     client_processes = test_param[1][1]
    exe = os.path.join(test_repo, testname)
    if mpi_type == "openmpi":
        manager = Orterun(exe, subprocess=False)
    else:
        manager = Mpirun(exe, subprocess=False, mpitype="mpich")

    env = EnvironmentVariables()
    env["DAOS_POOL"] = "{}".format(self.pool.uuid)
    env["DAOS_SVCL"] = "{}".format(self.pool.svc_ranks[0])
    env["DAOS_CONT"] = "{}".format(self.container.uuid)
    env["HDF5_VOL_CONNECTOR"] = "daos"
    env["HDF5_PLUGIN_PATH"] = "{}".format(plugin_path)
    manager.assign_hosts(self.hostlist_clients)
    manager.assign_processes(client_processes)
    manager.assign_environment(env, True)
    manager.working_dir.value = self.dfuse.mount_dir.value

    # run VOL Command
    try:
        manager.run()
    except CommandFailure as _error:
        self.fail("{} failed.\nException occurred: {}".format(
            exe, str(_error)))
def get_mdtest_job_manager_command(self, manager):
    """Get the MPI job manager command for Mdtest.

    Args:
        manager (str): the job manager to use, e.g. "MPICH"; any other
            value selects Orterun.

    Returns:
        JobManager: the object for the mpi job manager command

    """
    # Initialize MpioUtils if mdtest needs to be run using mpich
    if manager == "MPICH":
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")
        return Mpirun(self.mdtest_cmd, mpitype="mpich")

    return Orterun(self.mdtest_cmd)
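# A hedged usage sketch: picking the job manager from a yaml parameter
# before launching mdtest. The "/run/mdtest/*" namespace, defaults, and
# the assign_* calls mirror patterns used elsewhere in this suite, but
# this exact method is illustrative, not part of the source.
def run_mdtest_with_manager(self):
    """Select a job manager from the yaml file and run mdtest."""
    manager = self.get_mdtest_job_manager_command(
        self.params.get("manager", "/run/mdtest/*", "MPICH"))
    manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    manager.assign_processes(self.params.get("np", "/run/mdtest/*", 1))
    manager.run()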
def test_load_mpi(self):
    """Simple test of apricot test code to load the openmpi and mpich modules.

    :avocado: tags=all
    :avocado: tags=harness,harness_basic_test,test_load_mpi
    :avocado: tags=load_mpi
    """
    try:
        Orterun(None)
    except CommandFailure as error:
        self.fail("Orterun initialization failed: {}".format(error))

    try:
        Mpirun(None, mpi_type="mpich")
    except CommandFailure as error:
        self.fail("Mpirun initialization failed: {}".format(error))
def run(self, tmp, processes):  # pylint: disable=arguments-differ
    """Run the datamover command.

    Args:
        tmp (str): path for hostfiles
        processes (int): number of processes for the datamover command

    Returns:
        CmdResult: Object that contains exit status, stdout, and other
            information.

    Raises:
        CommandFailure: In case datamover run command fails

    """
    self.log.info('Starting datamover')

    # Get job manager cmd
    mpirun = Mpirun(self, mpitype="mpich")
    mpirun.assign_hosts(self.hosts, tmp)
    mpirun.assign_processes(processes)
    mpirun.exit_status_exception = self.exit_status_exception

    # run datamover
    out = mpirun.run()

    return out
def run(self, processes=1):  # pylint: disable=arguments-differ
    """Run the dbench command.

    Args:
        processes (int): number of mpi processes. Defaults to 1.

    Returns:
        CmdResult: Object that contains exit status, stdout, and other
            information.

    Raises:
        CommandFailure: In case dbench run command fails

    """
    self.log.info('Starting dbench')

    # Get job manager cmd
    mpirun = Mpirun(self, mpitype="mpich")
    mpirun.assign_hosts(self.hosts, self.tmp)
    mpirun.assign_processes(processes)
    mpirun.exit_status_exception = True

    # run dbench
    out = mpirun.run()

    return out
class NvmeFragmentation(TestWithServers):
    # pylint: disable=too-many-ancestors
    # pylint: disable=too-many-instance-attributes
    """NVMe drive fragmentation test cases.

    Test class Description:
        Verify that drive fragmentation frees the space and does not lead
        to ENOSPC.

    :avocado: recursive
    """

    def setUp(self):
        """Set up for test case."""
        super().setUp()
        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_transfer_size = self.params.get(
            "transfer_block_size", '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get(
            "obj_class", '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(
            self.hostlist_clients, self.workdir, None)
        self.pool = None
        self.out_queue = queue.Queue()

    def ior_runner_thread(self, results):
        """Run IOR in sequence for each parameter combination.

        Destroy the containers at the end of this thread run.

        Args:
            results (queue): queue for returning thread results

        Returns:
            None

        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        cmd = DaosCommand(os.path.join(self.prefix, "bin"))
        cmd.set_sub_command("container")
        cmd.sub_command_class.set_sub_command("destroy")
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")

        # Iterate through the different IOR values and run in sequence
        for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                self.ior_apis,
                                                self.ior_transfer_size,
                                                self.ior_flags):
            # Define the arguments for the ior_runner_thread method
            ior_cmd = IorCommand()
            ior_cmd.get_params(self)
            ior_cmd.set_daos_params(self.server_group, self.pool)
            ior_cmd.dfs_oclass.update(oclass)
            ior_cmd.api.update(api)
            ior_cmd.transfer_size.update(test[0])
            ior_cmd.block_size.update(test[1])
            ior_cmd.flags.update(flags)

            # Define the job manager for the IOR command
            self.job_manager = Mpirun(ior_cmd, mpitype="mpich")
            cont_uuid = str(uuid.uuid4())
            self.job_manager.job.dfs_cont.update(cont_uuid)
            env = ior_cmd.get_default_env(str(self.job_manager))
            self.job_manager.assign_hosts(
                self.hostlist_clients, self.workdir, None)
            self.job_manager.assign_processes(processes)
            self.job_manager.assign_environment(env, True)

            # run IOR Command
            try:
                self.job_manager.run()
                container_info["{}{}{}".format(
                    oclass, api, test[0])] = cont_uuid
            except CommandFailure as _error:
                results.put("FAIL")

        # Destroy the containers created by this thread
        for key in container_info:
            cmd.sub_command_class.sub_command_class.pool.value = \
                self.pool.uuid
            # cmd.sub_command_class.sub_command_class.svc.value = \
            #     self.pool.svc_ranks
            cmd.sub_command_class.sub_command_class.cont.value = \
                container_info[key]
            try:
                # pylint: disable=protected-access
                cmd._get_result()
            except CommandFailure as _error:
                results.put("FAIL")

    def test_nvme_fragmentation(self):
        """Jira ID: DAOS-2332.

        Test Description:
            Verify there is no fragmentation after doing IO write/delete
            operations for about an hour.

        Use case:
            Create objects with different transfer sizes in parallel
            (10 IOR threads).
            Delete the containers created by IOR, which deallocates the
            NVMe blocks.
            Run the above steps in a loop for some time (~1 hour) and
            expect it not to fail with ENOSPC.

        :avocado: tags=all,full_regression
        :avocado: tags=hw,medium
        :avocado: tags=nvme,ib2,nvme_fragmentation
        """
        no_of_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        # Create a pool
        self.add_pool(connect=False)
        self.pool.display_pool_daos_space("Pool space at the Beginning")

        # Repeat the test 30 times, which will take ~1 hour
        for test_loop in range(30):
            self.log.info("--Test Repeat for loop %s---", test_loop)
            # Create the IOR threads
            threads = []
            for thrd in range(no_of_jobs):
                # Add a thread for these IOR arguments
                threads.append(
                    threading.Thread(target=self.ior_runner_thread,
                                     kwargs={"results": self.out_queue}))

            # Launch the IOR threads
            for thrd in threads:
                thrd.start()
                time.sleep(5)

            # Wait to finish the threads
            for thrd in threads:
                thrd.join()

            # Verify the queue and make sure no IOR run reported FAIL
            while not self.out_queue.empty():
                if self.out_queue.get() == "FAIL":
                    self.fail("FAIL")

        self.pool.display_pool_daos_space("Pool space at the End")
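# The thread fan-out/queue pattern used above, reduced to a
# self-contained sketch (function and variable names are illustrative):
# workers push "FAIL" into a shared queue and the launcher counts them.
import queue
import threading

def run_workers(worker, count):
    """Run `count` worker threads and return how many reported FAIL."""
    results = queue.Queue()
    threads = [
        threading.Thread(target=worker, kwargs={"results": results})
        for _ in range(count)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    failures = 0
    while not results.empty():
        if results.get() == "FAIL":
            failures += 1
    return failures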
class IorTestBase(TestWithServers):
    """Base IOR test class.

    :avocado: recursive
    """

    IOR_WRITE_PATTERN = "Commencing write performance test"
    IOR_READ_PATTERN = "Commencing read performance test"

    def __init__(self, *args, **kwargs):
        """Initialize an IorTestBase object."""
        super(IorTestBase, self).__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.dfuse = None
        self.container = None
        self.lock = None
        self.mpirun = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(IorTestBase, self).setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')
        self.subprocess = self.params.get("subprocess", '/run/ior/*', False)

        # lock is needed for the run_multiple_ior method.
        self.lock = threading.Lock()

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(IorTestBase, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(
            self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def create_cont(self):
        """Create a TestContainer object to be used to create container."""
        # Get container params
        self.container = TestContainer(
            self.pool, daos_command=DaosCommand(self.bin))
        self.container.get_params(self)

        # create container
        self.container.create()

    def _start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error(
                "Dfuse command %s failed on hosts %s", str(self.dfuse),
                str(NodeSet.fromlist(self.dfuse.hosts)), exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def run_ior_with_pool(self, intercept=None, test_file_suffix="",
                          test_file="daos:testFile", create_pool=True,
                          create_cont=True, stop_dfuse=True):
        """Execute ior with optional overrides for ior flags and object_class.

        If specified, the ior flags and ior daos object class parameters
        will override the values read from the yaml file.

        Args:
            intercept (str, optional): path to the interception library.
                Shall be used only for POSIX through DFUSE. Defaults to
                None.
            test_file_suffix (str, optional): suffix to add to the end of
                the test file name. Defaults to "".
            test_file (str, optional): ior test file name. Defaults to
                "daos:testFile". Is ignored when using POSIX through DFUSE.
            create_pool (bool, optional): If true, create the pool and
                container; otherwise just run ior. Defaults to True.
            create_cont (bool, optional): Create a new container. Defaults
                to True.
            stop_dfuse (bool, optional): Stop dfuse after the ior command
                is finished. Defaults to True.

        Returns:
            CmdResult: result of the ior command execution

        """
        if create_pool:
            self.update_ior_cmd_with_pool(create_cont)

        # start dfuse if api is POSIX
        if self.ior_cmd.api.value == "POSIX":
            # Connect to the pool, create container and then start dfuse
            if not self.dfuse:
                self._start_dfuse()
            test_file = os.path.join(self.dfuse.mount_dir.value, "testfile")
        elif self.ior_cmd.api.value == "DFS":
            test_file = os.path.join("/", "testfile")

        self.ior_cmd.test_file.update("".join([test_file, test_file_suffix]))
        out = self.run_ior(self.get_ior_job_manager_command(),
                           self.processes, intercept)

        if stop_dfuse and self.dfuse:
            self.dfuse.stop()
            self.dfuse = None

        return out

    def update_ior_cmd_with_pool(self, create_cont=True):
        """Update ior_cmd with pool."""
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Create a container, if needed.
        # Don't pass uuid and pool handle to IOR.
        # It will not enable the checksum feature.
        if create_cont:
            self.pool.connect()
            self.create_cont()
        # Update IOR params with the pool and container params
        self.ior_cmd.set_daos_params(self.server_group, self.pool,
                                     self.container.uuid)

    def get_ior_job_manager_command(self):
        """Get the MPI job manager command for IOR.

        Returns:
            Mpirun: the mpi job manager command object

        """
        # Initialize MpioUtils if IOR is running in MPIIO, POSIX, or DFS
        # mode
        if self.ior_cmd.api.value in ["MPIIO", "POSIX", "DFS"]:
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        if self.subprocess:
            self.mpirun = Mpirun(self.ior_cmd, True, mpitype="mpich")
        else:
            self.mpirun = Mpirun(self.ior_cmd, mpitype="mpich")

        return self.mpirun

    def check_subprocess_status(self, operation="write"):
        """Check the status of the IOR subprocess.

        Args:
            operation (str, optional): "write" or "read". Defaults to
                "write".
        """
        if operation == "write":
            self.ior_cmd.pattern = self.IOR_WRITE_PATTERN
        elif operation == "read":
            self.ior_cmd.pattern = self.IOR_READ_PATTERN
        else:
            self.fail(
                "Exiting Test: Inappropriate operation type for subprocess "
                "status check")

        if not self.ior_cmd.check_ior_subprocess_status(
                self.mpirun.process, self.ior_cmd):
            self.fail("Exiting Test: Subprocess not running")

    def run_ior(self, manager, processes, intercept=None, display_space=True):
        """Run the IOR command.

        Args:
            manager (str): mpi job manager command
            processes (int): number of host processes
            intercept (str, optional): path to interception library.
                Defaults to None.
            display_space (bool, optional): display pool space before and
                after the run. Defaults to True.

        Returns:
            CmdResult: result of the ior command execution

        """
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.assign_hosts(
            self.hostlist_clients, self.workdir,
            self.hostfile_clients_slots)
        manager.assign_processes(processes)
        manager.assign_environment(env)

        try:
            if display_space:
                self.pool.display_pool_daos_space()
            out = manager.run()

            if not self.subprocess:
                for line in out.stdout.splitlines():
                    if 'WARNING' in line:
                        self.fail("IOR command issued warnings.\n")
            return out
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            if not self.subprocess and display_space:
                self.pool.display_pool_daos_space()

    def stop_ior(self):
        """Stop the in-progress IOR subprocess."""
        self.log.info(
            "<IOR> Stopping in-progress IOR command: %s", str(self.mpirun))
        try:
            out = self.mpirun.stop()
            return out
        except CommandFailure as error:
            self.log.error("IOR stop Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()

    def run_multiple_ior_with_pool(self, results, intercept=None):
        """Run two IOR jobs in parallel against the same pool.

        Args:
            results (dict): dictionary used to store the ior metrics per
                job
            intercept (str, optional): path to the interception library.
                Shall be used only for POSIX through DFUSE. Defaults to
                None.
        """
        self.update_ior_cmd_with_pool()

        # start dfuse for the POSIX api. This is specific to interception
        # library test requirements.
        self._start_dfuse()

        # Create two jobs and run them in parallel.
        # Job1 will have 3 clients set up to use dfuse + interception
        # library.
        # Job2 will have 1 client set up to use only dfuse.
        job1 = self.get_new_job(self.hostlist_clients[:-1], 1,
                                results, intercept)
        job2 = self.get_new_job([self.hostlist_clients[-1]], 2,
                                results, None)

        job1.start()
        # Since the same ior_cmd is used to trigger the MPIRUN
        # with different parameters, pause for 2 seconds to
        # avoid data collisions.
        time.sleep(2)
        job2.start()
        job1.join()
        job2.join()
        self.dfuse.stop()
        self.dfuse = None

    def get_new_job(self, clients, job_num, results, intercept=None):
        """Create a new thread for an ior run.

        Args:
            clients (list): hosts on which to run ior
            job_num (int): assigned job number
            results (dict): dictionary used to store the ior metrics
            intercept (path, optional): path to the interception library.
                Defaults to None.

        Returns:
            Thread: thread object that runs run_multiple_ior

        """
        job = threading.Thread(
            target=self.run_multiple_ior,
            args=[clients, results, job_num, intercept])
        return job

    def run_multiple_ior(self, clients, results, job_num, intercept=None):
        """Run the IOR command.

        Args:
            clients (list): hosts on which to run ior
            results (dict): dictionary used to store the ior metrics
            job_num (int): assigned job number
            intercept (str, optional): path to interception library.
                Defaults to None.
        """
        self.lock.acquire(True)
        tsize = self.ior_cmd.transfer_size.value
        testfile = os.path.join(
            self.dfuse.mount_dir.value, "testfile{}{}".format(tsize, job_num))
        if intercept:
            testfile += "intercept"
        self.ior_cmd.test_file.update(testfile)
        manager = self.get_ior_job_manager_command()
        procs = (self.processes // len(self.hostlist_clients)) * len(clients)
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.assign_hosts(
            clients, self.workdir, self.hostfile_clients_slots)
        manager.assign_processes(procs)
        manager.assign_environment(env)
        self.lock.release()
        try:
            self.pool.display_pool_daos_space()
            out = manager.run()
            self.lock.acquire(True)
            results[job_num] = IorCommand.get_ior_metrics(out)
            self.lock.release()
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        Args:
            original_pool_info (PoolInfo): pool info prior to IOR
            processes (int): number of processes
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If the transfer size is >= 4K, the pool size is verified against
        # NVMe; otherwise it is checked against SCM
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Size is >= 4K; size verification will be done with NVMe "
                "size")
            storage_index = 1
        else:
            self.log.info(
                "Size is < 4K; size verification will be done with SCM size")
            storage_index = 0

        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool Free Size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))

    def execute_cmd(self, cmd, fail_on_err=True, display_output=True):
        """Execute cmd using general_utils.pcmd.

        Args:
            cmd (str): string command to be executed
            fail_on_err (bool): whether to fail the test if command
                execution returns a non-zero return code
            display_output (bool): whether to display output

        Returns:
            dict: a dictionary of return code keys and accompanying
                NodeSet values indicating which hosts yielded each return
                code

        """
        try:
            # execute bash cmds
            ret = pcmd(
                self.hostlist_clients, cmd, verbose=display_output,
                timeout=300)
            if 0 not in ret:
                error_hosts = NodeSet(
                    ",".join(
                        [str(node_set) for code, node_set in ret.items()
                         if code != 0]))
                if fail_on_err:
                    raise CommandFailure(
                        "Error running '{}' on the following "
                        "hosts: {}".format(cmd, error_hosts))
        # report an error if any command fails
        except CommandFailure as error:
            self.log.error("Command Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        return ret
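# A worked example of the free-space check in verify_pool_size above,
# with illustrative numbers (the helper name is hypothetical).
# get_aggregate_total() derives the expected bytes from the IOR
# parameters -- roughly block size times process count, scaled by
# segment count and object class replication -- so a 1 MiB block across
# 16 processes should consume at least 16 MiB of pool space:
def check_pool_shrank(free_before, free_after, expected_bytes):
    """Return True if pool free space shrank by at least expected_bytes."""
    return (free_before - free_after) >= expected_bytes

# e.g. check_pool_shrank(100 * 1024**2, 82 * 1024**2, 16 * 1024**2) -> True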
class MacsioTestBase(TestWithServers):
    """Base MACSio test class.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a MacsioTestBase object."""
        super(MacsioTestBase, self).__init__(*args, **kwargs)
        self.manager = None
        self.macsio = None

    def setUp(self):
        """Set up each test case."""
        super(MacsioTestBase, self).setUp()
        self.manager = Mpirun(None, subprocess=False, mpitype="mpich")
        self.macsio = self.get_macsio_command()

    def get_macsio_command(self):
        """Get the MacsioCommand object.

        Returns:
            MacsioCommand: object defining the macsio command

        """
        # Create the macsio command
        test_repo = self.params.get("macsio", "/run/test_repo/*", "")
        macsio = MacsioCommand(test_repo)
        macsio.get_params(self)

        # Create all the macsio output files in the same directory as the
        # other test log files
        macsio.set_output_file_path()

        return macsio

    def run_macsio(self, pool_uuid, pool_svcl, cont_uuid=None):
        """Run macsio.

        Parameters for the macsio command are obtained from the test yaml
        file, including the path to the macsio executable. By default
        mpirun will be used to run macsio. This can be overridden by
        redefining the self.manager attribute prior to calling this
        method.

        Args:
            pool_uuid (str): pool uuid
            pool_svcl (str): pool service replica
            cont_uuid (str, optional): container uuid. Defaults to None.

        Returns:
            CmdResult: Object that contains exit status, stdout, and other
                information.

        """
        # Setup the job manager (mpirun) to run the macsio command
        self.macsio.daos_pool = pool_uuid
        self.macsio.daos_svcl = pool_svcl
        self.macsio.daos_cont = cont_uuid
        self.manager.job = self.macsio
        self.manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        self.manager.assign_processes(len(self.hostlist_clients))
        self.manager.assign_environment(
            self.macsio.get_environment(
                self.server_managers[0], self.client_log))

        try:
            return self.manager.run()
        except CommandFailure as error:
            self.log.error("MACSio Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
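# A hedged usage sketch: a hypothetical MacsioTestBase test method that
# drives run_macsio with a freshly created pool. add_pool() and
# svc_ranks follow the patterns used elsewhere in this suite; the test
# name and the exit-status assertion are illustrative.
def test_macsio_basic(self):
    """Run MACSio against a newly created pool."""
    self.add_pool(connect=False)
    result = self.run_macsio(self.pool.uuid, self.pool.svc_ranks[0])
    self.assertEqual(result.exit_status, 0, "MACSio did not exit cleanly")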
class NvmePoolCapacity(TestWithServers):
    # pylint: disable=too-many-ancestors
    """Test class Description: Verify that the NOSPC condition is
    reported when accessing data beyond the pool size.

    :avocado: recursive
    """

    def setUp(self):
        """Set up for test case."""
        super(NvmePoolCapacity, self).setUp()
        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get(
            "ior_test_sequence", '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get(
            "obj_class", '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(
            self.hostlist_clients, self.workdir, None)
        self.pool = None
        self.out_queue = queue.Queue()

    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Run an IOR command in a thread and report the result via a queue.

        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR api
            test (list): IOR test sequence
            flags (str): IOR flags
            results (queue): queue for returning thread results

        Returns:
            None

        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")
        self.pool = pool

        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        container_info["{}{}{}".format(
            oclass, api, test[2])] = str(uuid.uuid4())

        # Define the job manager for the IOR command
        self.job_manager = Mpirun(ior_cmd, mpitype="mpich")
        key = "{}{}{}".format(oclass, api, test[2])
        self.job_manager.job.dfs_cont.update(container_info[key])
        env = ior_cmd.get_default_env(str(self.job_manager))
        self.job_manager.assign_hosts(
            self.hostlist_clients, self.workdir, None)
        self.job_manager.assign_processes(processes)
        self.job_manager.assign_environment(env, True)

        # run IOR Command
        try:
            self.job_manager.run()
        except CommandFailure as _error:
            results.put("FAIL")

    def test_create_delete(self, num_pool=2, num_cont=5, total_count=100,
                           scm_size=100000000000, nvme_size=300000000000):
        """Create and delete pools in a loop for a long run.

        This method verifies the NVMe free space during the create/delete
        process.

        Args:
            num_pool (int): total pools for running the test
            num_cont (int): total containers created on each pool
            total_count (int): total times the test is run in a loop
            scm_size (int): SCM size used in the testing
            nvme_size (int): NVMe size used in the testing

        Returns:
            None

        """
        pool = {}
        cont = {}

        for loop_count in range(0, total_count):
            self.log.info("Running test %s", loop_count)
            for val in range(0, num_pool):
                pool[val] = TestPool(self.context, self.get_dmg_command())
                pool[val].get_params(self)
                # Split the total SCM and NVMe size for creating multiple
                # pools (integer division keeps the size values integral).
                temp = int(scm_size) // num_pool
                pool[val].scm_size.update(str(temp))
                temp = int(nvme_size) // num_pool
                pool[val].nvme_size.update(str(temp))
                pool[val].create()
                self.pool = pool[val]
                display_string = "pool{} space at the Beginning".format(val)
                self.pool.display_pool_daos_space(display_string)
                nvme_size_begin = self.pool.get_pool_free_space("NVME")
                for cont_val in range(0, num_cont):
                    cont[cont_val] = TestContainer(pool[val])

            m_leak = 0
            for val in range(0, num_pool):
                display_string = "Pool{} space at the End".format(val)
                self.pool = pool[val]
                self.pool.display_pool_daos_space(display_string)
                nvme_size_end = self.pool.get_pool_free_space("NVME")
                pool[val].destroy()
                if (nvme_size_begin != nvme_size_end) and (m_leak == 0):
                    m_leak = val + 1

            # After destroying the pools, check for a memory leak in each
            # test loop.
            if m_leak != 0:
                self.fail("Memory leak : iteration {0} \n".format(m_leak))

    def test_run(self, num_pool=1):
        """Run IOR threads against one or more pools.

        This method is called with different test cases.

        Args:
            num_pool (int): total pools for running the test

        Returns:
            None

        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        # Create a pool
        pool = {}

        # Iterate through the different IOR test sequences
        for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                self.ior_apis,
                                                self.ior_test_sequence,
                                                self.ior_flags):
            # Create the IOR threads
            threads = []
            for val in range(0, num_pool):
                pool[val] = TestPool(self.context, self.get_dmg_command())
                pool[val].get_params(self)
                # Split the total SCM and NVMe size for creating multiple
                # pools.
                pool[val].scm_size.value = int(test[0]) // num_pool
                pool[val].nvme_size.value = int(test[1]) // num_pool
                pool[val].create()
                display_string = "pool{} space at the Beginning".format(val)
                self.pool = pool[val]
                self.pool.display_pool_daos_space(display_string)

                for thrd in range(0, num_jobs):
                    # Add a thread for these IOR arguments
                    threads.append(
                        threading.Thread(target=self.ior_thread,
                                         kwargs={
                                             "pool": pool[val],
                                             "oclass": oclass,
                                             "api": api,
                                             "test": test,
                                             "flags": flags,
                                             "results": self.out_queue}))

            # Launch the IOR threads
            for thrd in threads:
                self.log.info("Thread : %s", thrd)
                thrd.start()
                time.sleep(5)

            # Wait to finish the threads
            for thrd in threads:
                thrd.join()

            # Verify the queue: fail if an IOR run reported FAIL when the
            # test sequence expected PASS, or vice versa (the test is
            # expected to fail with ENOSPC). Each queue entry is read only
            # once.
            while not self.out_queue.empty():
                result = self.out_queue.get()
                if (result == "FAIL" and test[4] == "PASS") \
                        or (result != "FAIL" and test[4] == "FAIL"):
                    self.fail("FAIL")

            for val in range(0, num_pool):
                display_string = "Pool{} space at the End".format(val)
                self.pool = pool[val]
                self.pool.display_pool_daos_space(display_string)
                self.pool.destroy()

    def test_nvme_pool_capacity(self):
        """Jira ID: DAOS-2085.

        Test Description:
            Verify that the DAOS stack reports NOSPC when accessing data
            beyond the pool size.

        Use Cases:
            Test Case 1 or 2:
                1. Perform IO less than the entire SSD disk space.
                2. Perform IO beyond the entire SSD disk space.
            Test Case 3:
                3. Create pools/containers and destroy them several times.

        :avocado: tags=all,hw,medium,ib2,nvme,full_regression
        :avocado: tags=nvme_pool_capacity
        """
        # Run the test with one pool.
        self.log.info("Running Test Case 1 with one Pool")
        self.test_run(1)
        time.sleep(5)
        # Run the test with two pools.
        self.log.info("Running Test Case 1 with two Pools")
        self.test_run(2)
        time.sleep(5)
        # Run create/delete pool/container
        self.log.info("Running Test Case 3: Pool/Cont Create/Destroy")
        self.test_create_delete(10, 50, 100)
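# Splitting total capacity evenly across pools, as done in
# test_create_delete and test_run above, as a self-contained sketch
# (the helper name is illustrative). Integer division keeps the
# per-pool size an integer under Python 3, where "/" would yield a
# float:
def split_size(total_bytes, num_pool):
    """Return the per-pool size when dividing capacity evenly."""
    return int(total_bytes) // num_pool

# e.g. split_size(300000000000, 2) -> 150000000000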
def test_rebuild_container_create(self):
    """Jira ID: DAOS-1168.

    Test Description:
        Configure 4 servers and 1 client with 1 or 2 pools and a pool
        service leader quantity of 2. Add 1 container to the first pool
        configured with 3 replicas. Populate the container with 1GB of
        objects. Exclude a server that has shards of this object and
        verify that rebuild is initiated. While rebuild is active, create
        1000 additional containers in the same pool or the second pool
        (when available). Finally verify that rebuild completes and the
        pool info indicates the correct number of rebuilt objects and
        records. Also confirm that all 1000 additional containers created
        during rebuild are accessible.

    Use Cases:
        Basic rebuild of container objects of array values with
        sufficient numbers of rebuild targets and no available rebuild
        targets.

    :avocado: tags=all,medium,full_regression,rebuild,rebuildcontcreate
    """
    # Get test params
    targets = self.params.get("targets", "/run/server_config/*")
    pool_qty = self.params.get("pools", "/run/test/*")
    loop_qty = self.params.get("loops", "/run/test/*")
    cont_qty = self.params.get("containers", "/run/test/*")
    cont_obj_cls = self.params.get("container_obj_class", "/run/test/*")
    rank = self.params.get("rank", "/run/test/*")
    use_ior = self.params.get("use_ior", "/run/test/*", False)
    node_qty = len(self.hostlist_servers)

    # Get pool params
    self.pool = []
    for index in range(pool_qty):
        self.pool.append(
            TestPool(self.context, dmg_command=self.get_dmg_command()))
        self.pool[-1].get_params(self)

    if use_ior:
        # Get ior params
        mpirun = Mpirun(IorCommand())
        mpirun.job.get_params(self)
        mpirun.assign_hosts(
            self.hostlist_clients, self.workdir,
            self.hostfile_clients_slots)
        mpirun.assign_processes(len(self.hostlist_clients))
        mpirun.assign_environment(mpirun.job.get_default_env("mpirun"))

    # Cancel any tests with tickets already assigned
    if rank in (1, 2):
        self.cancelForTicket("DAOS-2434")

    errors = [0 for _ in range(loop_qty)]
    for loop in range(loop_qty):
        # Log the start of the loop
        loop_id = "LOOP {}/{}".format(loop + 1, loop_qty)
        self.log.info("%s", "-" * 80)
        self.log.info("%s: Starting loop", loop_id)

        # Start this loop with a fresh list of containers
        self.container = []

        # Create the requested number of pools
        info_checks = []
        rebuild_checks = []
        for pool in self.pool:
            pool.create()
            info_checks.append(
                {
                    "pi_uuid": pool.uuid,
                    "pi_ntargets": node_qty * targets,
                    "pi_nnodes": node_qty,
                    "pi_ndisabled": 0,
                }
            )
            rebuild_checks.append(
                {
                    "rs_errno": 0,
                    "rs_done": 1,
                    "rs_obj_nr": 0,
                    "rs_rec_nr": 0,
                }
            )

        # Check the pool info
        status = True
        for index, pool in enumerate(self.pool):
            status &= pool.check_pool_info(**info_checks[index])
            status &= pool.check_rebuild_status(**rebuild_checks[index])
            pool.display_pool_daos_space("after creation")
        self.assertTrue(
            status,
            "Error verifying pool info prior to excluding rank {}".format(
                rank))

        # Create a container with 1GB of data in the first pool
        if use_ior:
            mpirun.job.flags.update("-v -w -W -G 1 -k", "ior.flags")
            mpirun.job.dfs_destroy.update(False, "ior.dfs_destroy")
            mpirun.job.set_daos_params(self.server_group, self.pool[0])
            self.log.info(
                "%s: Running IOR on pool %s to fill container %s with data",
                loop_id, self.pool[0].uuid, mpirun.job.dfs_cont.value)
            self.run_ior(loop_id, mpirun)
        else:
            self.container.append(TestContainer(self.pool[0]))
            self.container[-1].get_params(self)
            self.container[-1].create()
            self.log.info(
                "%s: Writing to pool %s to fill container %s with data",
                loop_id, self.pool[0].uuid, self.container[-1].uuid)
            self.container[-1].object_qty.value = 8
            self.container[-1].record_qty.value = 64
            self.container[-1].data_size.value = 1024 * 1024
            self.container[-1].write_objects(rank, cont_obj_cls)
            rank_list = self.container[-1].get_target_rank_lists(
                " after writing data")
            self.container[-1].get_target_rank_count(rank, rank_list)

        # Display the updated pool space usage
        for pool in self.pool:
            pool.display_pool_daos_space("after container creation")

        # Exclude the first rank from the first pool to initiate rebuild
        self.pool[0].start_rebuild([rank], self.d_log)

        # Wait for rebuild to start
        self.pool[0].wait_for_rebuild(True, 1)

        # Create additional containers in the last pool
        start_index = len(self.container)
        self.add_containers_during_rebuild(
            loop_id, cont_qty, self.pool[0], self.pool[-1])

        # Confirm rebuild completes
        self.pool[0].wait_for_rebuild(False, 1)

        # Check the pool info
        info_checks[0]["pi_ndisabled"] += targets
        rebuild_checks[0]["rs_done"] = 1
        rebuild_checks[0]["rs_obj_nr"] = ">=0"
        rebuild_checks[0]["rs_rec_nr"] = ">=0"
        for index, pool in enumerate(self.pool):
            status &= pool.check_pool_info(**info_checks[index])
            status &= pool.check_rebuild_status(**rebuild_checks[index])
        self.assertTrue(status, "Error verifying pool info after rebuild")

        # Verify that each of the created containers exists by opening it
        for index in range(start_index, len(self.container)):
            count = "{}/{}".format(
                index - start_index + 1,
                len(self.container) - start_index)
            if not self.access_container(loop_id, index, count):
                errors[loop] += 1

        # Destroy the containers created during rebuild
        for index in range(start_index, len(self.container)):
            self.container[index].destroy()

        # Read the data from the container created before rebuild
        if use_ior:
            self.log.info(
                "%s: Running IOR on pool %s to verify container %s",
                loop_id, self.pool[0].uuid, mpirun.job.dfs_cont.value)
            mpirun.job.flags.update("-v -r -R -G 1 -E", "ior.flags")
            mpirun.job.dfs_destroy.update(True, "ior.dfs_destroy")
            self.run_ior(loop_id, mpirun)
        else:
            self.log.info(
                "%s: Reading pool %s to verify container %s",
                loop_id, self.pool[0].uuid, self.container[0].uuid)
            self.assertTrue(
                self.container[0].read_objects(),
                "Error verifying data written before rebuild")
            self.container[0].destroy()

        # Destroy the pools
        for pool in self.pool:
            pool.destroy(1)

        self.log.info(
            "%s: Loop %s", loop_id,
            "passed" if errors[loop] == 0 else "failed")

    self.log.info("Test %s", "passed" if sum(errors) == 0 else "failed")
class NvmeEnospace(ServerFillUp):
    # pylint: disable=too-many-ancestors
    """Test Class Description: To validate DER_NOSPACE for SCM and NVMe.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a NvmeEnospace object."""
        super(NvmeEnospace, self).__init__(*args, **kwargs)
        self.daos_cmd = None

    def setUp(self):
        """Set up each test case."""
        super(NvmeEnospace, self).setUp()

        # initialize daos command
        self.daos_cmd = DaosCommand(self.bin)
        self.create_pool_max_size()
        self.der_nospace_count = 0
        self.other_errors_count = 0

    def verify_enospace_log(self, der_nospace_err_count):
        """Verify there is no error other than DER_NOSPACE in the client
        log and that the DER_NOSPACE count is higher than expected.

        Args:
            der_nospace_err_count (int): expected DER_NOSPACE count from
                the client log.
        """
        # Get the DER_NOSPACE and other error counts from the log
        self.der_nospace_count, self.other_errors_count = error_count(
            "-1007", self.hostlist_clients, self.client_log)

        # Check that there are no other errors in the log file
        if self.other_errors_count > 0:
            self.fail('Found other errors, count {} in client log {}'
                      .format(self.other_errors_count, self.client_log))
        # Check that the DER_NOSPACE error count is higher; if not, FAIL
        if self.der_nospace_count < der_nospace_err_count:
            self.fail('Expected DER_NOSPACE count > {} but found {}'
                      .format(der_nospace_err_count, self.der_nospace_count))

    def delete_all_containers(self):
        """Delete all the containers."""
        # List all the containers
        kwargs = {"pool": self.pool.uuid}
        data = self.daos_cmd.pool_list_cont(**kwargs)
        containers = data["uuids"]

        # Destroy all the containers
        for _cont in containers:
            kwargs["cont"] = _cont
            self.daos_cmd.container_destroy(**kwargs)

    def ior_bg_thread(self, results):
        """Start an IOR background thread.

        This will write a small data set and keep reading it in a loop
        until it fails or the main program exits.

        Args:
            results (queue): queue for returning thread results
        """
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")

        # Define the IOR command and use the parameters from the yaml file.
        ior_bg_cmd = IorCommand()
        ior_bg_cmd.get_params(self)
        ior_bg_cmd.set_daos_params(self.server_group, self.pool)
        ior_bg_cmd.dfs_oclass.update(self.ior_cmd.dfs_oclass.value)
        ior_bg_cmd.api.update(self.ior_cmd.api.value)
        ior_bg_cmd.transfer_size.update(self.ior_scm_xfersize)
        ior_bg_cmd.block_size.update(self.ior_cmd.block_size.value)
        ior_bg_cmd.flags.update(self.ior_cmd.flags.value)
        ior_bg_cmd.test_file.update('/testfile_background')

        # Define the job manager for the IOR command
        self.job_manager = Mpirun(ior_bg_cmd, mpitype="mpich")
        self.create_cont()
        self.job_manager.job.dfs_cont.update(self.container.uuid)
        env = ior_bg_cmd.get_default_env(str(self.job_manager))
        self.job_manager.assign_hosts(
            self.hostlist_clients, self.workdir, None)
        self.job_manager.assign_processes(1)
        self.job_manager.assign_environment(env, True)
        self.log.info('----Run IOR in Background-------')

        # run IOR Write Command
        try:
            self.job_manager.run()
        except (CommandFailure, TestFail) as _error:
            results.put("FAIL")
            return

        # run IOR Read Command in loop
        ior_bg_cmd.flags.update(self.ior_read_flags)
        while True:
            try:
                self.job_manager.run()
            except (CommandFailure, TestFail) as _error:
                results.put("FAIL")
                break

    def run_enospace_foreground(self):
        """Run IOR to fill the pool and validate DER_NOSPACE and the
        expected storage sizes.
        """
        # Fill 75% more of the SCM pool; aggregation is enabled, so NVMe
        # space will start filling
        self.log.info('Starting main IOR load')
        self.start_ior_load(storage='SCM', percent=75)
        self.log.info(self.pool.pool_percentage_used())

        # Fill 50% more of the SCM pool; aggregation is enabled, so NVMe
        # space will be filled
        self.start_ior_load(storage='SCM', percent=50)
        self.log.info(self.pool.pool_percentage_used())

        # Fill 60% more of the SCM pool. Now NVMe is full, so data will
        # not be moved to NVMe and SCM will start filling up. SCM will
        # become full, and this command is expected to fail with
        # DER_NOSPACE.
        try:
            self.start_ior_load(storage='SCM', percent=60)
            self.fail('This test is supposed to FAIL because of '
                      'DER_NOSPACE but it passed')
        except TestFail as _error:
            self.log.info('Test is expected to fail because of DER_NOSPACE')

        # Display the pool usage %
        self.log.info(self.pool.pool_percentage_used())

        # Verify that the DER_NOSPACE error count is as expected and that
        # there is no other error in the client log
        self.verify_enospace_log(self.der_nospace_count)

        # Check that both NVMe and SCM are full.
        pool_usage = self.pool.pool_percentage_used()
        # NVMe should be almost full; if not, the test will fail.
        if pool_usage['nvme'] > 8:
            self.fail('Pool NVMe used percentage should be < 8%, instead {}'
                      .format(pool_usage['nvme']))
        # For SCM, some % of the space is used by the system, so it won't
        # be 100% full.
        if pool_usage['scm'] > 50:
            self.fail('Pool SCM used percentage should be < 50%, instead {}'
                      .format(pool_usage['scm']))

    def run_enospace_with_bg_job(self):
        """Run the test and validate DER_NOSPACE and the expected storage
        size. A single IOR job runs in the background while the space is
        filling.
        """
        # Get the initial DER_NOSPACE count
        self.der_nospace_count, self.other_errors_count = error_count(
            "-1007", self.hostlist_clients, self.client_log)

        # Start the IOR background thread, which will write a small data
        # set and read it in a loop until the storage space is full.
        out_queue = queue.Queue()
        job = threading.Thread(
            target=self.ior_bg_thread, kwargs={"results": out_queue})
        job.daemon = True
        job.start()

        # Run IOR in the foreground
        self.run_enospace_foreground()

        # Verify the background job queue and make sure no IOR run failed
        while not out_queue.empty():
            if out_queue.get() == "FAIL":
                self.fail("One of the background IOR jobs failed")

    def test_enospace_lazy_with_bg(self):
        """Jira ID: DAOS-4756.

        Test Description:
            IO gets DER_NOSPACE when SCM and NVMe are full with the
            default (lazy) aggregation mode.

        Use Case:
            This test will create the pool and fill 75% of the SCM size,
            which will trigger aggregation because of space pressure,
            then fill 75% more, which should fill NVMe. Try to fill 60%
            more; now the SCM size will be full too. Verify that the last
            IO fails with DER_NOSPACE and the SCM/NVMe pool capacity is
            full. One background IO job will be running continuously.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_lazy,enospc_lazy_bg
        """
        self.log.info(self.pool.pool_percentage_used())

        # Run IOR to fill the pool.
        self.run_enospace_with_bg_job()

    def test_enospace_lazy_with_fg(self):
        """Jira ID: DAOS-4756.

        Test Description:
            Fill up the system (default aggregation mode) and delete all
            containers in a loop, which should release the space.

        Use Case:
            This test will create the pool and fill 75% of the SCM size,
            which will trigger aggregation because of space pressure,
            then fill 75% more, which should fill NVMe. Try to fill 60%
            more; now the SCM size will be full too. Verify that the last
            IO fails with DER_NOSPACE and the SCM/NVMe pool capacity is
            full. Delete all the containers. Do this in a loop 10 times
            and verify that the space is released.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_lazy,enospc_lazy_fg
        """
        self.log.info(self.pool.pool_percentage_used())

        # Repeat the test in a loop.
        for _loop in range(10):
            self.log.info("-------enospc_lazy_fg Loop--------- %s", _loop)
            # Run IOR to fill the pool.
            self.run_enospace_foreground()
            # Delete all the containers
            self.delete_all_containers()
            # Deleting containers takes some time to release the space
            time.sleep(60)

        # Run last IO
        self.start_ior_load(storage='SCM', percent=1)

    def test_enospace_time_with_bg(self):
        """Jira ID: DAOS-4756.

        Test Description:
            IO gets DER_NOSPACE when SCM is full, and the space is
            released on container destroy with aggregation set to time
            mode.

        Use Case:
            This test will create the pool and set the aggregation mode
            to time. Start filling 75% of the SCM size; aggregation will
            be triggered from time to time. Fill 75% more, which will
            fill up NVMe. Try to fill 60% more; now the SCM size will be
            full too. Verify that the last IO fails with DER_NOSPACE and
            the SCM/NVMe pool capacity is full. One background IO job
            will be running continuously.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_time,enospc_time_bg
        """
        self.log.info(self.pool.pool_percentage_used())

        # Enable time mode for aggregation.
        self.pool.set_property("reclaim", "time")

        # Run IOR to fill the pool.
        self.run_enospace_with_bg_job()

    def test_enospace_time_with_fg(self):
        """Jira ID: DAOS-4756.

        Test Description:
            Fill up the system (time aggregation mode) and delete all
            containers in a loop, which should release the space.

        Use Case:
            This test will create the pool and set the aggregation mode
            to time. Start filling 75% of the SCM size; aggregation will
            be triggered from time to time. Fill 75% more, which will
            fill up NVMe. Try to fill 60% more; now the SCM size will be
            full too. Verify that the last IO fails with DER_NOSPACE and
            the SCM/NVMe pool capacity is full. Delete all the
            containers. Do this in a loop 10 times and verify that the
            space is released.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_time,enospc_time_fg
        """
        self.log.info(self.pool.pool_percentage_used())

        # Enable time mode for aggregation.
        self.pool.set_property("reclaim", "time")

        # Repeat the test in a loop.
        for _loop in range(10):
            self.log.info("-------enospc_time_fg Loop--------- %s", _loop)
            # Run IOR to fill the pool.
            self.run_enospace_with_bg_job()
            # Delete all the containers
            self.delete_all_containers()
            # Deleting containers takes some time to release the space
            time.sleep(60)

        # Run last IO
        self.start_ior_load(storage='SCM', percent=1)

    @skipForTicket("DAOS-5403")
    def test_performance_storage_full(self):
        """Jira ID: DAOS-4756.

        Test Description:
            Verify IO read performance when the pool size is full.

        Use Case:
            This test will create the pool and run a small set of IOR as
            a baseline. Start IOR with < 4K, which will start filling SCM
            and trigger aggregation and start filling up NVMe. Check the
            IOR baseline read number and make sure it's within +- 5% of
            the number from before the system storage was full.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_performance
        """
        # Write the IOR baseline and get the read BW for later comparison.
        self.log.info(self.pool.pool_percentage_used())
        # Write first
        self.start_ior_load(storage='SCM', percent=1)
        # Read the baseline data set
        self.start_ior_load(storage='SCM', operation='Read', percent=1)
        max_mib_baseline = float(self.ior_matrix[0][int(IorMetrics.Max_MiB)])
        baseline_cont_uuid = self.ior_cmd.dfs_cont.value
        self.log.info("IOR Baseline Read MiB %s", max_mib_baseline)

        # Run IOR to fill the pool.
        self.run_enospace_with_bg_job()

        # Read the same container that was written at the beginning.
        self.container.uuid = baseline_cont_uuid
        self.start_ior_load(storage='SCM', operation='Read', percent=1)
        max_mib_latest = float(self.ior_matrix[0][int(IorMetrics.Max_MiB)])
        self.log.info("IOR Latest Read MiB %s", max_mib_latest)

        # Check that the latest IOR read performance is within a 5%
        # tolerance when the storage space is full.
        if abs(max_mib_baseline - max_mib_latest) > \
                (max_mib_baseline / 100 * 5):
            self.fail('Latest IOR read performance is not within the 5% '
                      'tolerance. Baseline Read MiB = {} and latest IOR '
                      'Read MiB = {}'.format(
                          max_mib_baseline, max_mib_latest))

    def test_enospace_no_aggregation(self):
        """Jira ID: DAOS-4756.

        Test Description:
            IO gets DER_NOSPACE when SCM is full, and the space is
            released on container destroy with aggregation disabled.

        Use Case:
            This test will create the pool and disable aggregation. Fill
            most of the SCM size, which should work, then try to fill
            more, which should fail with DER_NOSPACE. Destroy the
            container and validate that the pool SCM free size is close
            to full (> 95%). Do this in a loop ~10 times and verify the
            DER_NOSPACE count and the SCM free size after container
            destroy.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_no_aggregation
        """
        # pylint: disable=attribute-defined-outside-init
        # pylint: disable=too-many-branches
        self.log.info(self.pool.pool_percentage_used())

        # Disable aggregation
        self.pool.set_property("reclaim", "disabled")

        # Get the DER_NOSPACE and other error counts from the log
        self.der_nospace_count, self.other_errors_count = error_count(
            "-1007", self.hostlist_clients, self.client_log)

        # Repeat the test in a loop.
        for _loop in range(10):
            self.log.info(
                "-------enospc_no_aggregation Loop--------- %s", _loop)
            # Fill 40% of the SCM pool
            self.start_ior_load(storage='SCM', percent=40)

            self.log.info(self.pool.pool_percentage_used())

            try:
                # Fill 40% more of SCM, which should fail because there is
                # no SCM space left
                self.start_ior_load(storage='SCM', percent=40)
                self.fail('This test is supposed to fail because of '
                          'DER_NOSPACE but it passed')
            except TestFail as _error:
                self.log.info('Expected to fail because of DER_NOSPACE')

            # Verify that the DER_NOSPACE error count is as expected and
            # there is no other error in the client log.
            self.verify_enospace_log(self.der_nospace_count)

            # Delete all the containers
            self.delete_all_containers()

            # Get the pool usage
            pool_usage = self.pool.pool_percentage_used()
            # Delay to release the SCM size.
            time.sleep(60)
            self.log.info(pool_usage)
            # The SCM pool space should be released (some is still used by
            # the system), so the SCM used percentage should not exceed
            # 55%.
            if pool_usage['scm'] > 55:
                self.fail('SCM pool used percentage should be < 55, '
                          'instead {}'.format(pool_usage['scm']))

        # Run last IO
        self.start_ior_load(storage='SCM', percent=1)
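# The +/- 5% tolerance check from test_performance_storage_full, as a
# self-contained helper (the function name is illustrative):
def within_tolerance(baseline, latest, percent=5):
    """Return True if latest is within +/- percent of baseline."""
    return abs(baseline - latest) <= baseline / 100.0 * percent

# e.g. within_tolerance(1000.0, 960.0) -> True
#      within_tolerance(1000.0, 940.0) -> False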
class RbldContainerCreate(TestWithServers): """Rebuild with container creation test cases. Test Class Description: These rebuild tests verify the ability to create additional containers while rebuild is ongoing. :avocado: recursive """ def add_containers_during_rebuild(self, loop_id, qty, pool1, pool2): """Add containers to a pool while rebuild is still in progress. Args: loop_id (str): loop identification string qty (int): the number of containers to create pool1 (TestPool): pool used to determine if rebuild is complete pool2 (TestPool): pool used to add containers """ count = 0 while not pool1.rebuild_complete() and count < qty: # Create a new container count += 1 self.log.info( "%s: Creating container %s/%s in pool %s during rebuild", loop_id, count, qty, pool2.uuid) self.container.append(TestContainer(pool2)) self.container[-1].get_params(self) self.container[-1].create() self.container[-1].write_objects() if count < qty: self.fail("{}: Rebuild completed with only {}/{} containers " "created".format(loop_id, count, qty)) def run_ior(self, loop_id, mpirun): """Run the ior command defined by the specified ior command object. Args: loop_id (str): loop identification string mpirun (Mpirun): mpirun command object to run ior """ total_bytes = mpirun.job.get_aggregate_total(mpirun.processes.value) try: mpirun.run() except CommandFailure as error: self.fail( "{}: Error populating the container with {} bytes of data " "prior to target exclusion: {}".format(loop_id, total_bytes, error)) self.log.info("%s: %s %s bytes to the container", loop_id, "Wrote" if "-w" in mpirun.job.flags.value else "Read", total_bytes) def access_container(self, loop_id, index, message): """Open and close the specified container. Args: loop_id (str): loop identification string index (int): index of the daos container object to open/close message (str): additional text describing the container Returns: bool: was the opening and closing of the container successful """ status = True self.log.info("%s: Verifying the container %s created during rebuild", loop_id, message) try: self.container[index].read_objects() self.container[index].close() except TestFail as error: self.log.error("%s: - Container read failed:", loop_id, exc_info=error) status = False return status def test_rebuild_container_create(self): """Jira ID: DAOS-1168. Test Description: Configure 4 servers and 1 client with 1 or 2 pools and a pool service leader quantity of 2. Add 1 container to the first pool configured with 3 replicas. Populate the container with 1GB of objects. Exclude a server that has shards of this object and verify that rebuild is initiated. While rebuild is active, create 1000 additional containers in the same pool or the second pool (when available). Finally verify that rebuild completes and the pool info indicates the correct number of rebuilt objects and records. Also confirm that all 1000 additional containers created during rebuild are accessible. Use Cases: Basic rebuild of container objects of array values with sufficient numbers of rebuild targets and no available rebuild targets. 
:avocado: tags=all,full_regression :avocado: tags=medium :avocado: tags=rebuild,rebuild_cont_create """ # Get test params targets = self.params.get("targets", "/run/server_config/*") pool_qty = self.params.get("pools", "/run/test/*") loop_qty = self.params.get("loops", "/run/test/*") cont_qty = self.params.get("containers", "/run/test/*") cont_obj_cls = self.params.get("container_obj_class", "/run/test/*") rank = self.params.get("rank", "/run/test/*") use_ior = self.params.get("use_ior", "/run/test/*", False) node_qty = len(self.hostlist_servers) # Get pool params self.pool = [] for index in range(pool_qty): self.pool.append(self.get_pool(create=False)) if use_ior: # Get ior params self.job_manager = Mpirun(IorCommand()) self.job_manager.job.get_params(self) self.job_manager.assign_hosts(self.hostlist_clients, self.workdir, self.hostfile_clients_slots) self.job_manager.assign_processes(len(self.hostlist_clients)) self.job_manager.assign_environment( self.job_manager.job.get_default_env("mpirun")) errors = [0 for _ in range(loop_qty)] for loop in range(loop_qty): # Log the start of the loop loop_id = "LOOP {}/{}".format(loop + 1, loop_qty) self.log.info("%s", "-" * 80) self.log.info("%s: Starting loop", loop_id) # Start this loop with a fresh list of containers self.container = [] # Create the requested number of pools info_checks = [] rebuild_checks = [] for pool in self.pool: pool.create() info_checks.append({ "pi_uuid": pool.uuid, "pi_ntargets": node_qty * targets, "pi_nnodes": node_qty, "pi_ndisabled": 0, }) rebuild_checks.append({ "rs_errno": 0, "rs_done": 1, "rs_obj_nr": 0, "rs_rec_nr": 0, }) # Check the pool info status = True for index, pool in enumerate(self.pool): status &= pool.check_pool_info(**info_checks[index]) status &= pool.check_rebuild_status(**rebuild_checks[index]) pool.display_pool_daos_space("after creation") self.assertTrue( status, "Error verifying pool info prior to excluding rank {}".format( rank)) # Create a container with 1GB of data in the first pool if use_ior: self.job_manager.job.flags.update("-v -w -W -G 1 -k", "ior.flags") self.job_manager.job.dfs_destroy.update( False, "ior.dfs_destroy") self.job_manager.job.set_daos_params(self.server_group, self.pool[0]) self.log.info( "%s: Running IOR on pool %s to fill container %s with data", loop_id, self.pool[0].uuid, self.job_manager.job.dfs_cont.value) self.run_ior(loop_id, self.job_manager) else: self.container.append(TestContainer(self.pool[0])) self.container[-1].get_params(self) self.container[-1].create() self.log.info( "%s: Writing to pool %s to fill container %s with data", loop_id, self.pool[0].uuid, self.container[-1].uuid) self.container[-1].object_qty.value = 8 self.container[-1].record_qty.value = 64 self.container[-1].data_size.value = 1024 * 1024 self.container[-1].write_objects(rank, cont_obj_cls) rank_list = self.container[-1].get_target_rank_lists( " after writing data") self.container[-1].get_target_rank_count(rank, rank_list) # Display the updated pool space usage for pool in self.pool: pool.display_pool_daos_space("after container creation") # Exclude the first rank from the first pool to initiate rebuild self.server_managers[0].stop_ranks([rank], self.d_log) # Wait for rebuild to start self.pool[0].wait_for_rebuild(True, 1) # Create additional containers in the last pool start_index = len(self.container) self.add_containers_during_rebuild(loop_id, cont_qty, self.pool[0], self.pool[-1]) # Confirm rebuild completes self.pool[0].wait_for_rebuild(False, 1) # Check the pool info 
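            # After stopping a rank, all of its targets are reported as
            # disabled, so pi_ndisabled grows by the per-rank target count.
            # The exact number of rebuilt objects and records depends on data
            # placement, so the rebuild checks below only require
            # non-negative values (">=0").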
info_checks[0]["pi_ndisabled"] += targets rebuild_checks[0]["rs_done"] = 1 rebuild_checks[0]["rs_obj_nr"] = ">=0" rebuild_checks[0]["rs_rec_nr"] = ">=0" for index, pool in enumerate(self.pool): status &= pool.check_pool_info(**info_checks[index]) status &= pool.check_rebuild_status(**rebuild_checks[index]) self.assertTrue(status, "Error verifying pool info after rebuild") # Verify that each of created containers exist by opening them for index in range(start_index, len(self.container)): count = "{}/{}".format(index - start_index + 1, len(self.container) - start_index) if not self.access_container(loop_id, index, count): errors[loop] += 1 # Destroy the containers created during rebuild for index in range(start_index, len(self.container)): self.container[index].destroy() # Read the data from the container created before rebuild if use_ior: self.log.info( "%s: Running IOR on pool %s to verify container %s", loop_id, self.pool[0].uuid, self.job_manager.job.dfs_cont.value) self.job_manager.job.flags.update("-v -r -R -G 1 -E", "ior.flags") self.job_manager.job.dfs_destroy.update( True, "ior.dfs_destroy") self.run_ior(loop_id, self.job_manager) else: self.log.info("%s: Reading pool %s to verify container %s", loop_id, self.pool[0].uuid, self.container[0].uuid) self.assertTrue(self.container[0].read_objects(), "Error verifying data written before rebuild") self.container[0].destroy() # Destroy the pools for pool in self.pool: pool.destroy(1) self.log.info("%s: Loop %s", loop_id, "passed" if errors[loop] == 0 else "failed") self.log.info("Test %s", "passed" if sum(errors) == 0 else "failed")
class OSAOnlineDrain(TestWithServers):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: This test runs the daos_server online drain
    test cases.

    :avocado: recursive
    """

    def setUp(self):
        """Set up for test case."""
        super(OSAOnlineDrain, self).setUp()
        self.dmg_command = self.get_dmg_command()
        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get("ior_test_sequence",
                                                 '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get("obj_class",
                                              '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(self.hostlist_clients,
                                                self.workdir, None)
        self.pool = None
        self.out_queue = queue.Queue()

    @fail_on(CommandFailure)
    def get_pool_leader(self):
        """Get the pool leader.

        Returns:
            int: pool leader value
        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["leader"])

    @fail_on(CommandFailure)
    def get_pool_version(self):
        """Get the pool version.

        Returns:
            int: pool version value
        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["version"])

    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Run an IOR command; used as a thread target.

        Any command failure is reported through the results queue.

        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR API
            test (list): IOR test sequence
            flags (str): IOR flags
            results (queue): queue for returning thread results
        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)
        container_info["{}{}{}".format(oclass,
                                       api,
                                       test[2])] = str(uuid.uuid4())
        # Define the job manager for the IOR command
        self.job_manager = Mpirun(ior_cmd, mpitype="mpich")
        key = "".join([oclass, api, str(test[2])])
        self.job_manager.job.dfs_cont.update(container_info[key])
        env = ior_cmd.get_default_env(str(self.job_manager))
        self.job_manager.assign_hosts(self.hostlist_clients, self.workdir,
                                      None)
        self.job_manager.assign_processes(processes)
        self.job_manager.assign_environment(env, True)
        # Run the IOR command
        try:
            self.job_manager.run()
        except CommandFailure as _error:
            results.put("FAIL")

    def run_online_drain_test(self, num_pool):
        """Run the online drain test while IOR jobs are in progress.

        Args:
            num_pool (int): total number of pools to create for testing
        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        # Create a pool
        pool = {}
        pool_uuid = []
        target_list = []
        drain_servers = len(self.hostlist_servers) - 1

        # Drain two adjacent targets picked at random (target idx: 0-7)
        n = random.randint(0, 6)
        target_list.append(n)
        target_list.append(n + 1)
        t_string = "{},{}".format(target_list[0], target_list[1])

        # Drain a random rank (excluding rank 0)
        rank = random.randint(1, drain_servers)

        for val in range(0, num_pool):
            pool[val] = TestPool(self.context, self.get_dmg_command())
            pool[val].get_params(self)
            # Split the total SCM and NVMe size for creating multiple pools.
            pool[val].scm_size.value = int(pool[val].scm_size.value /
                                           num_pool)
            pool[val].nvme_size.value = int(pool[val].nvme_size.value /
                                            num_pool)
            pool[val].create()
            pool_uuid.append(pool[val].uuid)

        # Drain the selected rank and targets from each pool while IOR jobs
        # are running against it
        for val in range(0, num_pool):
            for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                    self.ior_apis,
                                                    self.ior_test_sequence,
                                                    self.ior_flags):
                threads = []
                for thrd in range(0, num_jobs):
                    # Add a thread for these IOR arguments
                    threads.append(
                        threading.Thread(target=self.ior_thread,
                                         kwargs={
                                             "pool": pool[val],
                                             "oclass": oclass,
                                             "api": api,
                                             "test": test,
                                             "flags": flags,
                                             "results": self.out_queue
                                         }))
                # Launch the IOR threads
                for thrd in threads:
                    self.log.info("Thread : %s", thrd)
                    thrd.start()
                    time.sleep(5)
                self.pool = pool[val]
                self.pool.display_pool_daos_space("Pool space: Beginning")
                pver_begin = self.get_pool_version()
                self.log.info("Pool Version at the beginning %s", pver_begin)
                output = self.dmg_command.pool_drain(self.pool.uuid,
                                                     rank, t_string)
                self.log.info(output)
                # Poll the pool version until it increases or the retry
                # limit is reached
                fail_count = 0
                while fail_count <= 20:
                    pver_drain = self.get_pool_version()
                    time.sleep(10)
                    fail_count += 1
                    if pver_drain > pver_begin + 1:
                        break

                self.log.info("Pool Version after drain %s", pver_drain)
                # Check that the pool version incremented after the drain
                self.assertTrue(pver_drain > pver_begin,
                                "Pool Version Error: After drain")

                # Wait for the threads to finish
                for thrd in threads:
                    thrd.join()

        for val in range(0, num_pool):
            display_string = "Pool{} space at the End".format(val)
            self.pool = pool[val]
            self.pool.display_pool_daos_space(display_string)
            pool[val].destroy()

    @skipForTicket("DAOS-6061")
    def test_osa_online_drain(self):
        """Test ID: DAOS-4750.

        Test Description: Validate online drain.

        :avocado: tags=all,pr,hw,large,osa,osa_drain,online_drain,DAOS_5610
        """
        # Perform drain testing with 1 to 2 pools
        for pool_num in range(1, 3):
            self.run_online_drain_test(pool_num)
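# A minimal, reusable sketch of the polling pattern used above (hypothetical
# helper, not part of this class): poll a version callback until it exceeds
# a baseline or the retries are exhausted.
#
#     def wait_for_pool_version(get_version, baseline, retries=20, delay=10):
#         """Return the first version observed above the baseline."""
#         version = get_version()
#         for _ in range(retries):
#             if version > baseline:
#                 break
#             time.sleep(delay)
#             version = get_version()
#         return version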
class IorTestBase(DfuseTestBase):
    # pylint: disable=too-many-ancestors
    """Base IOR test class.

    :avocado: recursive
    """

    IOR_WRITE_PATTERN = "Commencing write performance test"
    IOR_READ_PATTERN = "Commencing read performance test"

    def __init__(self, *args, **kwargs):
        """Initialize an IorTestBase object."""
        super().__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.container = None
        self.ior_timeout = None
        self.ppn = None

    def setUp(self):
        """Set up each test case."""
        # Obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super().setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')
        self.ppn = self.params.get("ppn", '/run/ior/client_processes/*')
        self.subprocess = self.params.get("subprocess", '/run/ior/*', False)
        self.ior_timeout = self.params.get("ior_timeout", '/run/ior/*', None)

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params and create a pool
        self.add_pool(connect=False)

    def create_cont(self):
        """Create a TestContainer object for the test."""
        # Get container params
        self.container = TestContainer(
            self.pool, daos_command=DaosCommand(self.bin))
        self.container.get_params(self)

        # Update the container oclass
        if self.ior_cmd.dfs_oclass:
            self.container.oclass.update(self.ior_cmd.dfs_oclass.value)

        # Create the container
        self.container.create()

    def display_pool_space(self, pool=None):
        """Display the current pool space.

        If the TestPool object has a DmgCommand object assigned, also
        display the free pool space per target.

        Args:
            pool (TestPool, optional): the pool for which to display space.
                Defaults to self.pool.
        """
        if not pool:
            pool = self.pool
        pool.display_pool_daos_space()
        if pool.dmg:
            pool.set_query_data()

    def run_ior_with_pool(self, intercept=None, test_file_suffix="",
                          test_file="daos:/testFile", create_pool=True,
                          create_cont=True, stop_dfuse=True, plugin_path=None,
                          timeout=None, fail_on_warning=False,
                          mount_dir=None, out_queue=None, env=None):
        # pylint: disable=too-many-arguments
        """Execute ior with optional overrides for ior flags and object
        class.

        If specified, the ior flags and ior daos object class parameters
        will override the values read from the yaml file.

        Args:
            intercept (str, optional): path to the interception library.
                Shall be used only for POSIX through DFUSE. Defaults to None.
            test_file_suffix (str, optional): suffix to add to the end of the
                test file name. Defaults to "".
            test_file (str, optional): ior test file name. Defaults to
                "daos:/testFile". Is ignored when using POSIX through DFUSE.
            create_pool (bool, optional): if True, create the pool and
                container; otherwise just run ior. Defaults to True.
            create_cont (bool, optional): create a new container.
                Defaults to True.
            stop_dfuse (bool, optional): stop dfuse after the ior command is
                finished. Defaults to True.
            plugin_path (str, optional): HDF5 vol connector library path.
                This will enable the dfuse (xattr) working directory which is
                needed to run the vol connector for DAOS. Defaults to None.
            timeout (int, optional): command timeout. Defaults to None.
            fail_on_warning (bool, optional): controls whether the test
                should fail if a 'WARNING' is found. Defaults to False.
            mount_dir (str, optional): create a specific mount point.
                Defaults to None.
            out_queue (queue, optional): queue to which any exception is
                passed. Defaults to None.
            env (EnvironmentVariables, optional): environment to be used
                when calling run_ior.
                Defaults to None.

        Returns:
            CmdResult: result of the ior command execution

        """
        if create_pool:
            self.update_ior_cmd_with_pool(create_cont)

        # Start dfuse if the api is POSIX or HDF5 with the vol connector
        if self.ior_cmd.api.value == "POSIX" or plugin_path:
            # Add a substring in case of HDF5-VOL
            if plugin_path:
                sub_dir = get_random_string(5)
                mount_dir = os.path.join(mount_dir, sub_dir)
            # Connect to the pool, create a container and then start dfuse
            if not self.dfuse:
                self.start_dfuse(
                    self.hostlist_clients, self.pool, self.container,
                    mount_dir)

        # Set up the test file for POSIX or HDF5 with the vol connector
        if self.ior_cmd.api.value == "POSIX" or plugin_path:
            test_file = os.path.join(self.dfuse.mount_dir.value, "testfile")
        elif self.ior_cmd.api.value == "DFS":
            test_file = os.path.join("/", "testfile")
        self.ior_cmd.test_file.update("".join([test_file, test_file_suffix]))
        job_manager = self.get_ior_job_manager_command()
        job_manager.timeout = timeout
        try:
            out = self.run_ior(job_manager, self.processes, intercept,
                               plugin_path=plugin_path,
                               fail_on_warning=fail_on_warning,
                               out_queue=out_queue, env=env)
        finally:
            if stop_dfuse:
                self.stop_dfuse()

        return out

    def update_ior_cmd_with_pool(self, create_cont=True):
        """Update ior_cmd with the pool.

        Args:
            create_cont (bool, optional): create a container.
                Defaults to True.
        """
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Create a container, if needed.
        # Don't pass the uuid and pool handle to IOR;
        # it will not enable the checksum feature otherwise.
        if create_cont:
            self.pool.connect()
            self.create_cont()
        # Update the IOR params with the pool and container params
        self.ior_cmd.set_daos_params(self.server_group, self.pool,
                                     self.container.uuid)

    def get_ior_job_manager_command(self, custom_ior_cmd=None):
        """Get the MPI job manager command for IOR.

        Args:
            custom_ior_cmd (IorCommand): custom IorCommand instance with
                which to create the job manager.

        Returns:
            Mpirun: the job manager object for the IOR command

        """
        # Initialize MpioUtils if IOR is running in MPIIO, POSIX, DFS, or
        # HDF5 mode
        if self.ior_cmd.api.value in ["MPIIO", "POSIX", "DFS", "HDF5"]:
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        if custom_ior_cmd:
            self.job_manager = Mpirun(
                custom_ior_cmd, self.subprocess, "mpich")
        else:
            self.job_manager = Mpirun(self.ior_cmd, self.subprocess, "mpich")

        return self.job_manager

    def check_subprocess_status(self, operation="write"):
        """Check the status of the IOR subprocess.

        Args:
            operation (str, optional): subprocess operation to check, either
                "write" or "read". Defaults to "write".
        """
        if operation == "write":
            self.ior_cmd.pattern = self.IOR_WRITE_PATTERN
        elif operation == "read":
            self.ior_cmd.pattern = self.IOR_READ_PATTERN
        else:
            self.fail(
                "Exiting Test: Inappropriate operation type for subprocess "
                "status check")

        if not self.ior_cmd.check_ior_subprocess_status(
                self.job_manager.process, self.ior_cmd):
            self.fail("Exiting Test: Subprocess not running")

    def run_ior(self, manager, processes, intercept=None, display_space=True,
                plugin_path=None, fail_on_warning=False, pool=None,
                out_queue=None, env=None):
        """Run the IOR command.

        Args:
            manager (str): mpi job manager command
            processes (int): number of host processes
            intercept (str, optional): path to the interception library.
                Defaults to None.
            display_space (bool, optional): whether to display the pool
                space. Defaults to True.
            plugin_path (str, optional): HDF5 vol connector library path.
                This will enable the dfuse (xattr) working directory which is
                needed to run the vol connector for DAOS. Defaults to None.
            fail_on_warning (bool, optional): controls whether the test
                should fail if a 'WARNING' is found. Defaults to False.
            pool (TestPool, optional): the pool for which to display space.
                Defaults to self.pool.
            out_queue (queue, optional): queue to which any exception is
                passed. Defaults to None.
            env (EnvironmentVariables, optional): environment to be used
                when running ior. Defaults to None.
        """
        if not env:
            env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env['LD_PRELOAD'] = intercept
            env['D_LOG_MASK'] = 'INFO'
            if env.get('D_IL_REPORT', None) is None:
                env['D_IL_REPORT'] = '1'
            # Additional debug masks, kept for reference:
            # env['D_LOG_MASK'] = 'INFO,IL=DEBUG'
            # env['DD_MASK'] = 'all'
            # env['DD_SUBSYS'] = 'all'
        if plugin_path:
            env["HDF5_VOL_CONNECTOR"] = "daos"
            env["HDF5_PLUGIN_PATH"] = str(plugin_path)
            manager.working_dir.value = self.dfuse.mount_dir.value
        manager.assign_hosts(
            self.hostlist_clients, self.workdir, self.hostfile_clients_slots)
        if self.ppn is None:
            manager.assign_processes(processes)
        else:
            manager.ppn.update(self.ppn, 'mpirun.ppn')
            manager.processes.update(None, 'mpirun.np')

        manager.assign_environment(env)

        if not pool:
            pool = self.pool

        try:
            if display_space:
                self.display_pool_space(pool)
            out = manager.run()

            if self.subprocess:
                return out

            if fail_on_warning:
                report_warning = self.fail
            else:
                report_warning = self.log.warning

            for line in out.stdout_text.splitlines():
                if 'WARNING' in line:
                    report_warning("IOR command issued warnings.")
            return out
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            # The queue is used when a thread calls this method
            # (e.g., thread1 --> thread2 --> ior)
            if out_queue is not None:
                out_queue.put("IOR Failed")
            self.fail("Test was expected to pass but it failed.")
        finally:
            if not self.subprocess and display_space:
                self.display_pool_space(pool)

    def stop_ior(self):
        """Stop the in-progress IOR subprocess.

        Returns:
            CmdResult: result of stopping the job manager

        """
        self.log.info(
            "<IOR> Stopping in-progress IOR command: %s",
            str(self.job_manager))

        try:
            return self.job_manager.stop()
        except CommandFailure as error:
            self.log.error("IOR stop Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.")
        finally:
            self.display_pool_space()

    def run_ior_threads_il(self, results, intercept, with_clients,
                           without_clients):
        """Execute two IOR threads in parallel.

        One thread is run with the interception library (IL) and one
        without.

        Args:
            results (dict): dictionary to store the IOR results that get
                printed in the IOR output.
            intercept (str): path to the interception library. Shall be used
                only for POSIX through DFUSE.
            with_clients (list): list of clients that use IL.
            without_clients (list): list of clients that don't use IL.
        """
        # We can't use the shared self.ior_cmd, so we need to create an
        # IorCommand object for each thread.
        ior_cmd1 = IorCommand()
        ior_cmd1.get_params(self)
        # Update the IOR params with the pool and container params
        ior_cmd1.set_daos_params(
            self.server_group, self.pool, self.container.uuid)
        ior_cmd2 = IorCommand()
        ior_cmd2.get_params(self)
        ior_cmd2.set_daos_params(
            self.server_group, self.pool, self.container.uuid)

        # Start dfuse for the POSIX api. This is specific to the
        # interception library test requirements.
        self.start_dfuse(self.hostlist_clients, self.pool, self.container)

        # Create two threads and run them in parallel.
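        # Each thread stores results[job_num] = [status, metrics...]; the
        # verification below assumes exactly two metric entries remain after
        # the leading status flag is popped (presumably the write and read
        # summaries returned by IorCommand.get_ior_metrics()).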
        thread1 = self.create_ior_thread(
            ior_cmd1, with_clients, 1, results, intercept)
        thread2 = self.create_ior_thread(
            ior_cmd2, without_clients, 2, results, None)

        thread1.start()
        thread2.start()
        thread1.join()
        thread2.join()

        self.stop_dfuse()

        # Basic verification of the thread results
        status = True
        for key in sorted(results):
            if not results[key].pop(0):
                self.log.error("IOR Thread %d: %s", key, results[key][0])
                status = False
            if len(results[key]) != 2:
                self.log.error(
                    "IOR Thread %d: expecting 2 results; %d found: %s",
                    key, len(results[key]), results[key])
                status = False
        if not status:
            self.fail("At least one IOR thread failed!")

    def create_ior_thread(self, ior_command, clients, job_num, results,
                          intercept=None):
        """Create a new thread for an ior run.

        Args:
            ior_command (IorCommand): IOR command instance.
            clients (list): hosts on which to run ior.
            job_num (int): assigned job number.
            results (dict): dictionary in which to store the ior metrics.
            intercept (path, optional): path to the interception library.
                Defaults to None.

        Returns:
            Thread: a thread object that runs run_custom_ior_cmd()

        """
        job = threading.Thread(
            target=self.run_custom_ior_cmd,
            args=[ior_command, clients, results, job_num, intercept])
        return job

    def run_custom_ior_cmd(self, ior_command, clients, results, job_num,
                           intercept=None):
        """Run a customized IOR command, not self.ior_cmd.

        Expected to be used with threaded code where multiple IOR commands
        are executed in parallel.

        Displays the pool space before running for reference.

        Args:
            ior_command (IorCommand): custom IOR command instance.
            clients (list): hosts on which to run ior.
            results (dict): dictionary in which to store the ior metrics.
            job_num (int): assigned job number.
            intercept (str, optional): path to the interception library.
                Defaults to None.
        """
        self.log.info("--- IOR Thread %d: Start ---", job_num)
        tsize = ior_command.transfer_size.value
        testfile = os.path.join(
            self.dfuse.mount_dir.value,
            "testfile{}{}".format(tsize, job_num))
        if intercept:
            testfile += "intercept"
        ior_command.test_file.update(testfile)

        # Get the custom job manager that's associated with this thread.
        manager = self.get_ior_job_manager_command(custom_ior_cmd=ior_command)

        procs = (self.processes // len(self.hostlist_clients)) * len(clients)
        env = ior_command.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.assign_hosts(
            clients, self.workdir, self.hostfile_clients_slots)
        manager.assign_processes(procs)
        manager.assign_environment(env)

        self.log.info("--- IOR Thread %d: Starting IOR ---", job_num)
        self.display_pool_space()
        try:
            ior_output = manager.run()
            results[job_num] = [True]
            results[job_num].extend(IorCommand.get_ior_metrics(ior_output))
        except CommandFailure as error:
            results[job_num] = [False, "IOR failed: {}".format(error)]
        finally:
            self.display_pool_space()

        self.log.info("--- IOR Thread %d: End ---", job_num)

    def run_ior_multiple_variants(self, obj_class, apis, transfer_block_size,
                                  flags, mount_dir):
        """Run multiple ior commands with various combinations of params.

        Args:
            obj_class (list): list of object classes
            apis (list): list of apis
            transfer_block_size (list): list of transfer size and block size
                pairs, e.g. [1M, 32M], where 1M is the transfer size and 32M
                is the block size.
            flags (list): list of ior flags
            mount_dir (str): dfuse mount directory

        Returns:
            list: a list of ["PASS"/"FAIL", ior command string] entries

        """
        results = []

        for oclass in obj_class:
            self.ior_cmd.dfs_oclass.update(oclass)
            for api in apis:
                if api == "HDF5-VOL":
                    self.ior_cmd.api.update("HDF5")
                    hdf5_plugin_path = self.params.get(
                        "plugin_path", '/run/hdf5_vol/*')
                    flags_w_k = " ".join([flags[0]] + ["-k"])
                    self.ior_cmd.flags.update(flags_w_k, "ior.flags")
                else:
                    # Run tests for the other variants
                    self.ior_cmd.flags.update(flags[0], "ior.flags")
                    hdf5_plugin_path = None
                    self.ior_cmd.api.update(api)
                for test in transfer_block_size:
                    # Update the transfer and block sizes
                    self.ior_cmd.transfer_size.update(test[0])
                    self.ior_cmd.block_size.update(test[1])
                    # Run ior
                    try:
                        self.run_ior_with_pool(
                            plugin_path=hdf5_plugin_path,
                            timeout=self.ior_timeout, mount_dir=mount_dir)
                        results.append(["PASS", str(self.ior_cmd)])
                    except CommandFailure:
                        results.append(["FAIL", str(self.ior_cmd)])
        return results

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size used by IOR.

        Args:
            original_pool_info (PoolInfo): pool info prior to IOR
            processes (int): number of processes
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If the transfer size is >= 4K, the pool size is verified against
        # NVMe; otherwise it is checked against SCM
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Transfer size is >= 4K; size verification will be done "
                "with the NVMe size")
            storage_index = 1
        else:
            self.log.info(
                "Transfer size is < 4K; size verification will be done "
                "with the SCM size")
            storage_index = 0

        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool Free Size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))

    def execute_cmd(self, command, fail_on_err=True, display_output=True):
        """Execute cmd using general_utils.pcmd.

        Args:
            command (str): the command to execute on the client hosts
            fail_on_err (bool, optional): whether or not to fail the test if
                the command returns a non zero return code. Defaults to True.
            display_output (bool, optional): whether or not to display
                output. Defaults to True.

        Returns:
            dict: a dictionary of return code keys and accompanying NodeSet
                values indicating which hosts yielded each return code.

        """
        try:
            # Execute the bash command on each client host
            result = self._execute_command(
                command, fail_on_err, display_output)
        except CommandFailure as error:
            # Report an error if any command fails
            self.log.error("Command execution failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.")

        return result

    def _execute_command(self, command, fail_on_err=True,
                         display_output=True, hosts=None):
        """Execute the command on all client hosts.

        Optionally verify if the command returns a non zero return code.

        Args:
            command (str): the command to execute on the client hosts
            fail_on_err (bool, optional): whether or not to fail the test if
                the command returns a non zero return code. Defaults to True.
            display_output (bool, optional): whether or not to display
                output. Defaults to True.
            hosts (list, optional): hosts on which to run the command.
                Defaults to self.hostlist_clients.

        Raises:
            CommandFailure: if 'fail_on_err' is set and the command fails on
                at least one of the client hosts

        Returns:
            dict: a dictionary of return code keys and accompanying NodeSet
                values indicating which hosts yielded each return code.
""" if hosts is None: hosts = self.hostlist_clients result = pcmd(hosts, command, verbose=display_output, timeout=300) if 0 not in result and fail_on_err: hosts = [ str(nodes) for code, nodes in list(result.items()) if code != 0 ] raise CommandFailure( "Error running '{}' on the following hosts: {}".format( command, NodeSet(",".join(hosts)))) return result
class OSAUtils(IorTestBase):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: Utility methods shared by the daos_server OSA
    (online/offline server addition and removal) test cases.

    :avocado: recursive
    """

    def setUp(self):
        """Set up for test case."""
        super(OSAUtils, self).setUp()
        self.container = None
        self.obj = None
        self.ioreq = None
        self.dmg_command = self.get_dmg_command()
        self.no_of_dkeys = self.params.get("no_of_dkeys", '/run/dkeys/*',
                                           default=[0])[0]
        self.no_of_akeys = self.params.get("no_of_akeys", '/run/akeys/*',
                                           default=[0])[0]
        self.record_length = self.params.get("length", '/run/record/*',
                                             default=[0])[0]

    @fail_on(CommandFailure)
    def get_pool_leader(self):
        """Get the pool leader.

        Returns:
            int: pool leader value
        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["leader"])

    @fail_on(CommandFailure)
    def get_rebuild_status(self):
        """Get the rebuild status.

        Returns:
            str: rebuild status
        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return data["rebuild"]["status"]

    @fail_on(CommandFailure)
    def is_rebuild_done(self, time_interval):
        """Check whether rebuild has completed.

        Polls the rebuild status up to 20 times, waiting time_interval
        seconds between checks.

        Args:
            time_interval (int): wait interval, in seconds, between checks

        Returns:
            bool: True if the rebuild status is "done" or "completed";
                False otherwise.
        """
        status = False
        fail_count = 0
        completion_flag = ["done", "completed"]
        while fail_count <= 20:
            rebuild_status = self.get_rebuild_status()
            time.sleep(time_interval)
            fail_count += 1
            if rebuild_status in completion_flag:
                status = True
                break
        return status

    @fail_on(CommandFailure)
    def assert_on_rebuild_failure(self):
        """Fail the test if the rebuild status indicates a failure."""
        rebuild_status = self.get_rebuild_status()
        self.log.info("Rebuild Status: %s", rebuild_status)
        rebuild_failed_string = ["failed", "scanning", "aborted", "busy"]
        self.assertTrue(rebuild_status not in rebuild_failed_string,
                        "Rebuild failed")

    @fail_on(CommandFailure)
    def get_pool_version(self):
        """Get the pool version.
        Returns:
            int: pool version value
        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["version"])

    @fail_on(DaosApiError)
    def write_single_object(self):
        """Write some data to the existing pool."""
        self.pool.connect(2)
        csum = self.params.get("enable_checksum", '/run/container/*')
        self.container = DaosContainer(self.context)
        input_param = self.container.cont_input_values
        input_param.enable_chksum = csum
        self.container.create(poh=self.pool.pool.handle,
                              con_prop=input_param)
        self.container.open()
        self.obj = DaosObj(self.context, self.container)
        self.obj.create(objcls=1)
        self.obj.open()
        self.ioreq = IORequest(self.context, self.container, self.obj,
                               objtype=4)
        self.log.info("Writing the Single Dataset")
        for dkey in range(self.no_of_dkeys):
            for akey in range(self.no_of_akeys):
                indata = ("{0}".format(str(akey)[0]) * self.record_length)
                d_key_value = "dkey {0}".format(dkey)
                c_dkey = ctypes.create_string_buffer(d_key_value)
                a_key_value = "akey {0}".format(akey)
                c_akey = ctypes.create_string_buffer(a_key_value)
                c_value = ctypes.create_string_buffer(indata)
                c_size = ctypes.c_size_t(ctypes.sizeof(c_value))
                self.ioreq.single_insert(c_dkey, c_akey, c_value, c_size)
        self.obj.close()
        self.container.close()

    @fail_on(DaosApiError)
    def verify_single_object(self):
        """Verify the container data on the existing pool."""
        self.pool.connect(2)
        self.container.open()
        self.obj.open()
        self.log.info("Single Dataset Verification -- Started")
        for dkey in range(self.no_of_dkeys):
            for akey in range(self.no_of_akeys):
                indata = ("{0}".format(str(akey)[0]) * self.record_length)
                c_dkey = ctypes.create_string_buffer("dkey {0}".format(dkey))
                c_akey = ctypes.create_string_buffer("akey {0}".format(akey))
                val = self.ioreq.single_fetch(c_dkey, c_akey,
                                              len(indata) + 1)
                if indata != (repr(val.value)[1:-1]):
                    self.d_log.error(
                        "ERROR: Data mismatch for dkey = {0}, "
                        "akey = {1}".format("dkey {0}".format(dkey),
                                            "akey {0}".format(akey)))
                    self.fail(
                        "ERROR: Data mismatch for dkey = {0}, "
                        "akey = {1}".format("dkey {0}".format(dkey),
                                            "akey {0}".format(akey)))
        self.obj.close()
        self.container.close()

    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Run an IOR command; used as a thread target.

        Any command failure is reported through the results queue.

        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR API
            test (list): IOR test sequence
            flags (str): IOR flags
            results (queue): queue for returning thread results
        """
        container_info = {}
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed on: "
                      "{}".format(self.hostlist_clients))
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)
        container_info["{}{}{}".format(oclass,
                                       api,
                                       test[2])] = str(uuid.uuid4())
        # Define the job manager for the IOR command
        self.job_manager = Mpirun(ior_cmd, mpitype="mpich")
        key = "".join([oclass, api, str(test[2])])
        self.job_manager.job.dfs_cont.update(container_info[key])
        env = ior_cmd.get_default_env(str(self.job_manager))
        self.job_manager.assign_hosts(self.hostlist_clients, self.workdir,
                                      None)
        self.job_manager.assign_processes(self.processes)
        self.job_manager.assign_environment(env, True)
        # Run the IOR command
        try:
            self.job_manager.run()
        except CommandFailure as _error:
            results.put("FAIL")
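    # Example (illustrative): how ior_thread is typically launched in
    # parallel and its queue checked afterwards. Names here are hypothetical.
    #
    #     out_queue = queue.Queue()
    #     kwargs = {"pool": self.pool, "oclass": "SX", "api": "DFS",
    #               "test": test, "flags": flags, "results": out_queue}
    #     thread = threading.Thread(target=self.ior_thread, kwargs=kwargs)
    #     thread.start()
    #     thread.join()
    #     if not out_queue.empty():
    #         self.fail("An IOR thread reported a failure")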
    def ior_runner_thread(self, results):
        """Run IOR in sequence for each parameter combination.

        Destroy the containers at the end of this thread's run.

        Args:
            results (queue): queue for returning thread results
        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        cmd = DaosCommand(os.path.join(self.prefix, "bin"))
        cmd.set_sub_command("container")
        cmd.sub_command_class.set_sub_command("destroy")
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")

        # Iterate through the different IOR parameter combinations and run
        # each in sequence
        for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                self.ior_apis,
                                                self.ior_transfer_size,
                                                self.ior_flags):
            # Define the arguments for the ior_runner_thread method
            ior_cmd = IorCommand()
            ior_cmd.get_params(self)
            ior_cmd.set_daos_params(self.server_group, self.pool)
            ior_cmd.dfs_oclass.update(oclass)
            ior_cmd.api.update(api)
            ior_cmd.transfer_size.update(test[0])
            ior_cmd.block_size.update(test[1])
            ior_cmd.flags.update(flags)

            container_info["{}{}{}".format(oclass,
                                           api,
                                           test[0])] = str(uuid.uuid4())

            # Define the job manager for the IOR command
            manager = Mpirun(ior_cmd, mpitype="mpich")
            manager.job.dfs_cont.update(
                container_info["{}{}{}".format(oclass, api, test[0])])
            env = ior_cmd.get_default_env(str(manager))
            manager.assign_hosts(self.hostlist_clients, self.workdir, None)
            manager.assign_processes(processes)
            manager.assign_environment(env, True)

            # Run the IOR command
            try:
                manager.run()
            except CommandFailure as _error:
                results.put("FAIL")

        # Destroy the containers created by this thread
        for key in container_info:
            cmd.sub_command_class.sub_command_class.pool.value = \
                self.pool.uuid
            cmd.sub_command_class.sub_command_class.svc.value = \
                self.pool.svc_ranks
            cmd.sub_command_class.sub_command_class.cont.value = \
                container_info[key]
            try:
                # pylint: disable=protected-access
                cmd._get_result()
            except CommandFailure as _error:
                results.put("FAIL")
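    # Example (illustrative): driving ior_runner_thread from a test and
    # checking the queue for failures afterwards. Names are hypothetical.
    #
    #     results = queue.Queue()
    #     thread = threading.Thread(target=self.ior_runner_thread,
    #                               kwargs={"results": results})
    #     thread.start()
    #     thread.join()
    #     if not results.empty():
    #         self.fail("One or more IOR runs failed")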