def run_ior_threads_il(self, results, intercept, with_clients,
                       without_clients):
    """Execute 2 IOR threads in parallel.

    One thread is run with the interception library (IL) and one without.

    Args:
        results (dict): Dictionary to store the IOR results that get
            printed in the IOR output.
        intercept (str): Path to the interception library. Shall be used
            only for POSIX through DFUSE.
        with_clients (list): List of clients that use IL.
        without_clients (list): List of clients that don't use IL.
    """
    # We can't use the shared self.ior_cmd, so we need to create an
    # IorCommand object for each thread.
    ior_cmd1 = IorCommand()
    ior_cmd1.get_params(self)
    # Update IOR params with the pool and container params
    ior_cmd1.set_daos_params(
        self.server_group, self.pool, self.container.uuid)
    ior_cmd2 = IorCommand()
    ior_cmd2.get_params(self)
    ior_cmd2.set_daos_params(
        self.server_group, self.pool, self.container.uuid)

    # Start dfuse for the POSIX api. This is specific to interception
    # library test requirements.
    self.start_dfuse(self.hostlist_clients, self.pool, self.container)

    # Create two threads and run them in parallel.
    thread1 = self.create_ior_thread(
        ior_cmd1, with_clients, 1, results, intercept)
    thread2 = self.create_ior_thread(
        ior_cmd2, without_clients, 2, results, None)

    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()

    self.stop_dfuse()

    # Basic verification of the thread results
    status = True
    for key in sorted(results):
        if not results[key].pop(0):
            self.log.error("IOR Thread %d: %s", key, results[key][0])
            status = False
        if len(results[key]) != 2:
            self.log.error(
                "IOR Thread %d: expecting 2 results; %d found: %s",
                key, len(results[key]), results[key])
            status = False
    if not status:
        self.fail("At least one IOR thread failed!")
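
# Usage note (a minimal sketch, not from the source): a test case built on
# the class that defines run_ior_threads_il might split its client hosts
# evenly between the IL and non-IL threads. The test name and the
# interception library path below are hypothetical example values.
def test_ior_intercept_sketch(self):
    """Hypothetical example of driving run_ior_threads_il."""
    results = {}
    half = len(self.hostlist_clients) // 2
    self.run_ior_threads_il(
        results=results,
        intercept="/usr/lib64/libioil.so",  # hypothetical IL path
        with_clients=self.hostlist_clients[:half],
        without_clients=self.hostlist_clients[half:])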
def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
    """Create IOR cmdlines to run in slurm batch.

    Args:
        job_spec (str): ior job in yaml to run
        pool (obj): TestPool obj
        ppn (int): number of tasks to run on each node
        nodesperjob (int): number of nodes per job

    Returns:
        list: a list of [cmdline string, log name] pairs

    """
    commands = []
    iteration = self.test_iteration
    ior_params = "/run/" + job_spec + "/*"

    # IOR job specs with a list of parameters; update each value
    api_list = self.params.get("api", ior_params + "*")
    tsize_list = self.params.get("transfer_size", ior_params + "*")
    bsize_list = self.params.get("block_size", ior_params + "*")
    oclass_list = self.params.get("daos_oclass", ior_params + "*")

    # Check if capable of doing rebuild; if yes then daos_oclass = RP_*GX
    if self.is_harasser("rebuild"):
        oclass_list = self.params.get("daos_oclass", "/run/rebuild/*")

    # Update the IOR cmdline for each additional IOR obj
    for api in api_list:
        for b_size in bsize_list:
            for t_size in tsize_list:
                for o_type in oclass_list:
                    ior_cmd = IorCommand()
                    ior_cmd.namespace = ior_params
                    ior_cmd.get_params(self)
                    if iteration is not None and iteration < 0:
                        ior_cmd.repetitions.update(1000000)
                    if self.job_timeout is not None:
                        ior_cmd.max_duration.update(self.job_timeout)
                    else:
                        ior_cmd.max_duration.update(10)
                    ior_cmd.api.update(api)
                    ior_cmd.block_size.update(b_size)
                    ior_cmd.transfer_size.update(t_size)
                    ior_cmd.daos_oclass.update(o_type)
                    ior_cmd.set_daos_params(self.server_group, pool)
                    # srun cmdline
                    nprocs = nodesperjob * ppn
                    env = ior_cmd.get_default_env("srun")
                    if ior_cmd.api.value == "MPIIO":
                        env["DAOS_CONT"] = ior_cmd.daos_cont.value
                    cmd = Srun(ior_cmd)
                    cmd.assign_processes(nprocs)
                    cmd.assign_environment(env, True)
                    cmd.ntasks_per_node.update(ppn)
                    log_name = "{}_{}_{}_{}".format(
                        api, b_size, t_size, o_type)
                    commands.append([str(cmd), log_name])
                    self.log.info("<<IOR cmdline>>: %s \n", commands[-1])
    return commands
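
# Usage note (a minimal sketch under stated assumptions): the
# [cmdline, log_name] pairs returned by create_ior_cmdline are intended
# for slurm batch, so a caller could wrap each one in its own sbatch
# script. The write_sbatch helper is hypothetical, not framework code.
def write_sbatch(script_dir, cmdline, log_name):
    """Write a one-command sbatch script and return its path."""
    script = os.path.join(script_dir, "{}.sbatch".format(log_name))
    with open(script, "w") as script_file:
        script_file.write("#!/bin/bash\n")
        script_file.write("#SBATCH --output={}.out\n".format(log_name))
        script_file.write("{}\n".format(cmdline))
    return script

# Example (hypothetical values):
#     for cmdline, log_name in self.create_ior_cmdline(
#             "ior_smoke", pool, 16, 2):
#         write_sbatch(self.test_dir, cmdline, log_name)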
def run_ior_collect_error(self, results, job_num, file_name, clients):
    """Run IOR command and store error in results.

    Args:
        results (dict): A dictionary object to store the ior metrics.
        job_num (int): Assigned job number.
        file_name (str): File name used for self.ior_cmd.test_file.
        clients (list): Client hostnames to run IOR from.
    """
    ior_cmd = IorCommand()
    ior_cmd.get_params(self)
    ior_cmd.set_daos_params(
        group=self.server_group, pool=self.pool,
        cont_uuid=self.container.uuid)
    testfile = os.path.join("/", file_name)
    ior_cmd.test_file.update(testfile)

    manager = get_job_manager(
        test=self, class_name="Mpirun", job=ior_cmd,
        subprocess=self.subprocess, mpi_type="mpich")
    manager.assign_hosts(
        clients, self.workdir, self.hostfile_clients_slots)
    ppn = self.params.get("ppn", '/run/ior/client_processes/*')
    manager.ppn.update(ppn, 'mpirun.ppn')
    manager.processes.update(None, 'mpirun.np')

    try:
        ior_output = manager.run()
        results[job_num] = [True]
        # For debugging.
        results[job_num].extend(IorCommand.get_ior_metrics(ior_output))
        # We'll verify the error message.
        results[job_num].append(ior_output.stderr_text)
    except CommandFailure as error:
        results[job_num] = [False, "IOR failed: {}".format(error)]
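
# Usage note (a minimal sketch, not from the source): run_ior_collect_error
# can be driven from several threads against disjoint client sets, with the
# shared results dict inspected afterwards. The method name and file names
# below are hypothetical; threading is assumed to be imported.
def run_ior_jobs_sketch(self):
    """Hypothetical example: two concurrent IOR jobs, one per client."""
    results = {}
    threads = []
    for job_num, client in enumerate(self.hostlist_clients[:2]):
        threads.append(threading.Thread(
            target=self.run_ior_collect_error,
            args=(results, job_num, "testfile_{}".format(job_num),
                  [client])))
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    # results[job_num][0] is the pass/fail flag set by
    # run_ior_collect_error; on failure the message is at index 1.
    for job_num, result in results.items():
        if not result[0]:
            self.fail("Job {} failed: {}".format(job_num, result[1]))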
def create_ior_cmdline(self, job_params, job_spec, pool):
    """Create IOR cmdlines to run in slurm batch.

    Args:
        job_params (str): job params from yaml file
        job_spec (str): specific ior job to run
        pool (obj): TestPool obj

    Returns:
        list: a list of IOR cmdline strings

    """
    command = []
    iteration = self.test_iteration
    ior_params = "/run/" + job_spec + "/"

    ior_cmd = IorCommand()
    ior_cmd.namespace = ior_params
    ior_cmd.get_params(self)
    if iteration is not None and iteration < 0:
        ior_cmd.repetitions.update(1000000)
    ior_cmd.max_duration.update(self.params.get("time", job_params + '*'))

    # IOR job specs with a list of parameters; update each value:
    #   transfer_size
    #   block_size
    #   daos object class
    tsize_list = ior_cmd.transfer_size.value
    bsize_list = ior_cmd.block_size.value
    oclass_list = ior_cmd.daos_oclass.value
    for b_size in bsize_list:
        ior_cmd.block_size.update(b_size)
        for o_type in oclass_list:
            ior_cmd.daos_oclass.update(o_type)
            for t_size in tsize_list:
                ior_cmd.transfer_size.update(t_size)
                ior_cmd.set_daos_params(self.server_group, pool)
                # Export the user environment to the test node
                exports = ["ALL"]
                if ior_cmd.api.value == "MPIIO":
                    env = {
                        "CRT_ATTACH_INFO_PATH": os.path.join(
                            self.basepath, "install/tmp"),
                        "DAOS_POOL": str(ior_cmd.daos_pool.value),
                        "MPI_LIB": "\"\"",
                        "DAOS_SVCL": str(ior_cmd.daos_svcl.value),
                        "DAOS_SINGLETON_CLI": 1,
                        "FI_PSM2_DISCONNECT": 1
                    }
                    exports.extend(
                        ["{}={}".format(key, val)
                         for key, val in env.items()])
                cmd = "srun -l --mpi=pmi2 --export={} {}".format(
                    ",".join(exports), ior_cmd)
                command.append(cmd)
                self.log.debug("<<IOR cmdline>>: %s \n", cmd)
    return command
def run_ior_threads_il(self, results, intercept, with_clients,
                       without_clients):
    """Execute 2 IOR threads in parallel.

    One thread runs with the interception library (IL) and one without.

    Args:
        results (dict): Dictionary to store the IOR results that get
            printed in the IOR output.
        intercept (str): Path to the interception library. Shall be used
            only for POSIX through DFUSE.
        with_clients (list): List of clients that use IL.
        without_clients (list): List of clients that don't use IL.
    """
    # We can't use the shared self.ior_cmd, so we need to create an
    # IorCommand object for each thread.
    ior_cmd1 = IorCommand()
    ior_cmd1.get_params(self)
    # Update IOR params with the pool and container params
    ior_cmd1.set_daos_params(
        self.server_group, self.pool, self.container.uuid)
    ior_cmd2 = IorCommand()
    ior_cmd2.get_params(self)
    ior_cmd2.set_daos_params(
        self.server_group, self.pool, self.container.uuid)

    # Start dfuse for the POSIX api. This is specific to interception
    # library test requirements.
    self.start_dfuse(self.hostlist_clients, self.pool, self.container)

    # Create two threads and run them in parallel.
    thread1 = self.create_ior_thread(
        ior_cmd1, with_clients, 1, results, intercept)
    thread2 = self.create_ior_thread(
        ior_cmd2, without_clients, 2, results, None)

    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()

    self.stop_dfuse()
def setUp(self):
    """Set up each test case."""
    # obtain separate logs
    self.update_log_file_names()
    # Start the servers and agents
    super(IorTestBase, self).setUp()

    # Get the parameters for IOR
    self.ior_cmd = IorCommand()
    self.ior_cmd.get_params(self)
    self.processes = self.params.get("np", '/run/ior/client_processes/*')
def ior_thread(self, pool, oclass, api, test, flags, results):
    """Start threads and wait until all threads are finished.

    Args:
        pool (object): pool handle
        oclass (str): IOR object class
        api (str): IOR api
        test (list): IOR test sequence
        flags (str): IOR flags
        results (queue): queue for returning thread results

    Returns:
        None

    """
    processes = self.params.get("slots", "/run/ior/clientslots/*")
    container_info = {}
    mpio_util = MpioUtils()
    if mpio_util.mpich_installed(self.hostlist_clients) is False:
        self.fail(
            "Exiting Test: Mpich not installed on: {}".format(
                self.hostfile_clients[0]))
    self.pool = pool

    # Define the arguments for the ior_runner_thread method
    ior_cmd = IorCommand()
    ior_cmd.get_params(self)
    ior_cmd.set_daos_params(self.server_group, self.pool)
    ior_cmd.daos_oclass.update(oclass)
    ior_cmd.api.update(api)
    ior_cmd.transfer_size.update(test[2])
    ior_cmd.block_size.update(test[3])
    ior_cmd.flags.update(flags)
    key = "{}{}{}".format(oclass, api, test[2])
    container_info[key] = str(uuid.uuid4())

    # Define the job manager for the IOR command
    manager = Mpirun(ior_cmd, mpitype="mpich")
    manager.job.daos_cont.update(container_info[key])
    env = ior_cmd.get_default_env(str(manager))
    manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    manager.assign_processes(processes)
    manager.assign_environment(env, True)

    # Run the IOR command
    try:
        manager.run()
    except CommandFailure as _error:
        results.put("FAIL")
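
# Usage note (a minimal sketch, not from the source): ior_thread follows
# the threading/queue pattern used elsewhere in these tests, so a caller
# could launch it and check the queue afterwards. The object class, api,
# flags, and sizes below are hypothetical values; Python 3's queue and
# threading modules are assumed to be imported. Indices 2 and 3 of the
# test sequence are the transfer and block sizes, as ior_thread reads
# them above.
def run_ior_thread_sketch(self, pool):
    """Hypothetical example: one ior_thread run with result checking."""
    results = queue.Queue()
    thread = threading.Thread(
        target=self.ior_thread,
        kwargs={
            "pool": pool,
            "oclass": "SX",                # hypothetical object class
            "api": "DFS",                  # hypothetical API
            "test": [0, 0, "1M", "32M"],   # indices 2 and 3 are used
            "flags": "-v -w -W -G 1 -k",   # hypothetical flags
            "results": results})
    thread.start()
    thread.join()
    # ior_thread only enqueues on failure, so an empty queue means pass.
    if not results.empty() and results.get() == "FAIL":
        self.fail("ior_thread reported a failure")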
def setUp(self):
    """Set up each test case."""
    # obtain separate logs
    self.update_log_file_names()
    # Start the servers and agents
    super().setUp()

    # Get the parameters for IOR
    self.ior_cmd = IorCommand()
    self.ior_cmd.get_params(self)
    self.processes = self.params.get("np", '/run/ior/client_processes/*')
    self.ppn = self.params.get("ppn", '/run/ior/client_processes/*')
    self.subprocess = self.params.get("subprocess", '/run/ior/*', False)
    self.ior_timeout = self.params.get("ior_timeout", '/run/ior/*', None)
def setUp(self):
    """Set up each test case."""
    # obtain separate logs
    self.update_log_file_names()
    # Start the servers and agents
    super(IorTestBase, self).setUp()

    # Get the parameters for IOR
    self.ior_cmd = IorCommand()
    self.ior_cmd.get_params(self)
    self.processes = self.params.get("np", '/run/ior/client_processes/*')
    self.subprocess = self.params.get("subprocess", '/run/ior/*', False)

    # lock is needed for the run_multiple_ior method.
    self.lock = threading.Lock()
def run_ior_report_error(self, results, job_num, file_name, pool,
                         container, namespace):
    """Run IOR command and store the results to the results dictionary.

    Create a new IorCommand object instead of using the one in IorTestBase
    because we'll run a test that runs multiple IOR processes at the same
    time.

    Args:
        results (dict): A dictionary object to store the ior metrics.
        job_num (int): Assigned job number.
        file_name (str): File name used for self.ior_cmd.test_file.
        pool (TestPool): Pool to run IOR.
        container (TestContainer): Container to run IOR.
        namespace (str): Namespace for the IOR command parameters; used to
            select values such as the object class for the test case.
    """
    # Update the object class depending on the test case.
    ior_cmd = IorCommand(namespace=namespace)
    ior_cmd.get_params(self)

    # Standard IOR prep sequence.
    ior_cmd.set_daos_params(self.server_group, pool, container.uuid)
    testfile = os.path.join("/", file_name)
    ior_cmd.test_file.update(testfile)

    manager = get_job_manager(
        test=self, class_name="Mpirun", job=ior_cmd,
        subprocess=self.subprocess, mpi_type="mpich")
    manager.assign_hosts(
        self.hostlist_clients, self.workdir, self.hostfile_clients_slots)
    ppn = self.params.get("ppn", '/run/ior/client_processes/*')
    manager.ppn.update(ppn, 'mpirun.ppn')
    manager.processes.update(None, 'mpirun.np')

    # Run the command.
    try:
        self.log.info("--- IOR command %d start ---", job_num)
        ior_output = manager.run()
        results[job_num] = [True]
        # For debugging.
        results[job_num].extend(IorCommand.get_ior_metrics(ior_output))
        # Command worked, but append the error message if any.
        results[job_num].append(ior_output.stderr_text)
        self.log.info("--- IOR command %d end ---", job_num)
    except CommandFailure as error:
        self.log.info("--- IOR command %d failed ---", job_num)
        results[job_num] = [False, "IOR failed: {}".format(error)]
def ior_bg_thread(self, results):
    """Start a background IOR thread.

    This will write a small data set and keep reading it in a loop until
    it fails or the main program exits.

    Args:
        results (queue): queue for returning thread results
    """
    mpio_util = MpioUtils()
    if mpio_util.mpich_installed(self.hostlist_clients) is False:
        self.fail("Exiting Test: Mpich not installed")

    # Define the IOR command and use the parameters from the yaml file.
    ior_bg_cmd = IorCommand()
    ior_bg_cmd.get_params(self)
    ior_bg_cmd.set_daos_params(self.server_group, self.pool)
    ior_bg_cmd.dfs_oclass.update(self.ior_cmd.dfs_oclass.value)
    ior_bg_cmd.api.update(self.ior_cmd.api.value)
    ior_bg_cmd.transfer_size.update(self.ior_scm_xfersize)
    ior_bg_cmd.block_size.update(self.ior_cmd.block_size.value)
    ior_bg_cmd.flags.update(self.ior_cmd.flags.value)
    ior_bg_cmd.test_file.update('/testfile_background')

    # Define the job manager for the IOR command
    self.job_manager = Mpirun(ior_bg_cmd, mpitype="mpich")
    self.create_cont()
    self.job_manager.job.dfs_cont.update(self.container.uuid)
    env = ior_bg_cmd.get_default_env(str(self.job_manager))
    self.job_manager.assign_hosts(
        self.hostlist_clients, self.workdir, None)
    self.job_manager.assign_processes(1)
    self.job_manager.assign_environment(env, True)
    print('----Run IOR in Background-------')

    # Run the IOR write command
    try:
        self.job_manager.run()
    except (CommandFailure, TestFail) as _error:
        results.put("FAIL")
        return

    # Run the IOR read command in a loop
    ior_bg_cmd.flags.update(self.ior_read_flags)
    while True:
        try:
            self.job_manager.run()
        except (CommandFailure, TestFail) as _error:
            results.put("FAIL")
            break
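
# Usage note (a minimal sketch, not from the source): since the read loop
# in ior_bg_thread never exits on success, a caller could run it as a
# daemon thread so it keeps exercising the pool while the main test body
# does other work, then check the queue for a reported failure. The method
# name and foreground step are hypothetical; Python 3's queue and
# threading modules are assumed to be imported.
def run_bg_ior_sketch(self):
    """Hypothetical example: background IOR around a foreground step."""
    results = queue.Queue()
    bg_thread = threading.Thread(
        target=self.ior_bg_thread, args=(results,), daemon=True)
    bg_thread.start()
    # ... foreground test steps (e.g. a server restart) would go here ...
    if not results.empty() and results.get() == "FAIL":
        self.fail("Background IOR reported a failure")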
def ior_thread(self, pool, oclass, api, test, flags, results):
    """Start threads and wait until all threads are finished.

    Args:
        pool (TestPool): Pool to run IOR command on.
        oclass (str): IOR object class
        api (str): IOR api
        test (list): IOR test sequence
        flags (str): IOR flags
        results (queue): queue for returning thread results

    Returns:
        None

    """
    processes = self.params.get("slots", "/run/ior/clientslots/*")
    container_info = {}

    # Define the arguments for the ior_runner_thread method
    ior_cmd = IorCommand()
    ior_cmd.get_params(self)
    ior_cmd.set_daos_params(self.server_group, pool)
    ior_cmd.dfs_oclass.update(oclass)
    ior_cmd.api.update(api)
    ior_cmd.transfer_size.update(test[2])
    ior_cmd.block_size.update(test[3])
    ior_cmd.flags.update(flags)
    key = "{}{}{}".format(oclass, api, test[2])
    container_info[key] = str(uuid.uuid4())

    # Define the job manager for the IOR command
    job_manager = get_job_manager(self, "Mpirun", ior_cmd, mpi_type="mpich")
    job_manager.job.dfs_cont.update(container_info[key])
    env = ior_cmd.get_default_env(str(job_manager))
    job_manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    job_manager.assign_processes(processes)
    job_manager.assign_environment(env, True)

    # Run the IOR command
    try:
        job_manager.run()
    except CommandFailure as _error:
        results.put("FAIL")
def setUp(self):
    """Set up each test case."""
    # Start the servers and agents
    super(IorTestBase, self).setUp()

    # Get the parameters for IOR
    self.ior_cmd = IorCommand()
    self.ior_cmd.get_params(self)
    self.processes = self.params.get("np", '/run/ior/client_processes/*')
    self.mpiio_oclass = self.params.get("mpiio_oclass", '/run/ior/*')

    # Get the test params
    self.pool = TestPool(self.context, self.log)
    self.pool.get_params(self)

    # Create a pool
    self.pool.create()
def ior_bg_thread(self):
    """Start a background IOR thread.

    This will write a small data set and keep reading it in a loop until
    it fails or the main program exits.
    """
    # Define the IOR command and use the parameters from the yaml file.
    ior_bg_cmd = IorCommand()
    ior_bg_cmd.get_params(self)
    ior_bg_cmd.set_daos_params(self.server_group, self.pool)
    ior_bg_cmd.dfs_oclass.update(self.ior_cmd.dfs_oclass.value)
    ior_bg_cmd.api.update(self.ior_cmd.api.value)
    ior_bg_cmd.transfer_size.update(self.ior_scm_xfersize)
    ior_bg_cmd.block_size.update(self.ior_cmd.block_size.value)
    ior_bg_cmd.flags.update(self.ior_cmd.flags.value)
    ior_bg_cmd.test_file.update('/testfile_background')

    # Define the job manager for the IOR command
    job_manager = get_job_manager(
        self, "Mpirun", ior_bg_cmd, mpi_type="mpich")

    # Create a container
    container = self.get_container(self.pool)

    job_manager.job.dfs_cont.update(container.uuid)
    env = ior_bg_cmd.get_default_env(str(job_manager))
    job_manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    job_manager.assign_processes(1)
    job_manager.assign_environment(env, True)
    print('----Run IOR in Background-------')

    # Run the IOR write command
    try:
        job_manager.run()
    except (CommandFailure, TestFail) as _error:
        self.test_result.append("FAIL")
        return

    # Run the IOR read command in a loop
    ior_bg_cmd.flags.update(self.ior_read_flags)
    while True:
        try:
            job_manager.run()
        except (CommandFailure, TestFail) as _error:
            break
def ior_thread(self, pool, oclass, api, test, flags, results):
    """Call the job manager for IOR command invocation.

    Args:
        pool (object): pool handle
        oclass (str): IOR object class
        api (str): IOR api
        test (list): IOR test sequence
        flags (str): IOR flags
        results (queue): queue for returning thread results
    """
    processes = self.params.get("slots", "/run/ior/clientslots/*")
    mpio_util = MpioUtils()
    if mpio_util.mpich_installed(self.hostlist_clients) is False:
        self.fail("Exiting Test: Mpich not installed")
    self.pool = pool

    # Define the arguments for the ior_runner_thread method
    ior_cmd = IorCommand()
    ior_cmd.get_params(self)
    ior_cmd.set_daos_params(self.server_group, self.pool)
    ior_cmd.dfs_oclass.update(oclass)
    ior_cmd.api.update(api)
    ior_cmd.transfer_size.update(test[0])
    ior_cmd.block_size.update(test[1])
    ior_cmd.flags.update(flags)
    key = "{}{}{}".format(oclass, api, test[0])
    if "-w" in flags:
        self.container_info[key] = str(uuid.uuid4())

    # Define the job manager for the IOR command
    manager = Mpirun(ior_cmd, mpitype="mpich")
    manager.job.dfs_cont.update(self.container_info[key])
    env = ior_cmd.get_default_env(str(manager))
    manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    manager.assign_processes(processes)
    manager.assign_environment(env, True)

    # Run the IOR command
    try:
        manager.run()
    except CommandFailure as _error:
        results.put("FAIL")
def setUp(self):
    """Set up each test case."""
    # obtain separate logs
    self.update_log_file_names()
    # Start the servers and agents
    super(IorTestBase, self).setUp()

    # Get the parameters for IOR
    self.ior_cmd = IorCommand()
    self.ior_cmd.get_params(self)
    self.processes = self.params.get("np", '/run/ior/client_processes/*')

    # Until DAOS-3320 is resolved, run IOR for POSIX
    # with a single client node
    if self.ior_cmd.api.value == "POSIX":
        self.hostlist_clients = [self.hostlist_clients[0]]
        self.hostfile_clients = write_host_file.write_host_file(
            self.hostlist_clients, self.workdir,
            self.hostfile_clients_slots)
def setUp(self):
    """Set up each test case."""
    # obtain separate logs
    self.update_log_file_names()
    # Start the servers and agents
    super().setUp()

    self.hostfile_clients = None
    self.ior_local_cmd = IorCommand()
    self.ior_local_cmd.get_params(self)
    self.ior_default_flags = self.ior_local_cmd.flags.value
    self.ior_scm_xfersize = self.params.get(
        "transfer_size", '/run/ior/transfersize_blocksize/*', '2048')
    self.ior_read_flags = self.params.get(
        "read_flags", '/run/ior/iorflags/*', '-r -R -k -G 1')
    self.ior_nvme_xfersize = self.params.get(
        "nvme_transfer_size", '/run/ior/transfersize_blocksize/*',
        '16777216')

    # Get the number of daos_engine
    self.engines = self.server_managers[0].manager.job.yaml.engine_params
    self.dmg_command = self.get_dmg_command()
def setUp(self):
    """Set up each test case."""
    # obtain separate logs
    self.update_log_file_names()
    # Start the servers and agents
    super(IorTestBase, self).setUp()

    # Get the parameters for IOR
    self.ior_cmd = IorCommand()
    self.ior_cmd.get_params(self)
    self.processes = self.params.get("np", '/run/ior/client_processes/*')
    self.co_prop = self.params.get(
        "container_properties", "/run/container/*")

    # Until DAOS-3320 is resolved, run IOR for POSIX
    # with a single client node
    if self.ior_cmd.api.value == "POSIX":
        self.hostlist_clients = [self.hostlist_clients[0]]
        self.hostfile_clients = write_host_file.write_host_file(
            self.hostlist_clients, self.workdir,
            self.hostfile_clients_slots)

    # lock is needed for the run_multiple_ior method.
    self.lock = threading.Lock()
def test_rebuild_container_create(self):
    """Jira ID: DAOS-1168.

    Test Description:
        Configure 4 servers and 1 client with 1 or 2 pools and a pool
        service leader quantity of 2. Add 1 container to the first pool
        configured with 3 replicas. Populate the container with 1GB of
        objects. Exclude a server that has shards of this object and
        verify that rebuild is initiated. While rebuild is active, create
        1000 additional containers in the same pool or the second pool
        (when available). Finally verify that rebuild completes and the
        pool info indicates the correct number of rebuilt objects and
        records. Also confirm that all 1000 additional containers created
        during rebuild are accessible.

    Use Cases:
        Basic rebuild of container objects of array values with sufficient
        numbers of rebuild targets and no available rebuild targets.

    :avocado: tags=all,medium,full_regression,rebuild,rebuildcontcreate
    """
    # Get test params
    targets = self.params.get("targets", "/run/server_config/*")
    pool_qty = self.params.get("pools", "/run/test/*")
    loop_qty = self.params.get("loops", "/run/test/*")
    cont_qty = self.params.get("containers", "/run/test/*")
    cont_obj_cls = self.params.get("container_obj_class", "/run/test/*")
    rank = self.params.get("rank", "/run/test/*")
    use_ior = self.params.get("use_ior", "/run/test/*", False)
    node_qty = len(self.hostlist_servers)

    # Get pool params
    self.pool = []
    for index in range(pool_qty):
        self.pool.append(TestPool(self.context, self.log))
        self.pool[-1].get_params(self)

    if use_ior:
        # Get ior params
        mpirun_path = os.path.join(self.ompi_prefix, "bin")
        mpirun = Mpirun(IorCommand(), mpirun_path)
        mpirun.job.get_params(self)
        mpirun.setup_command(
            mpirun.job.get_default_env("mpirun", self.tmp),
            self.hostfile_clients, len(self.hostlist_clients))

    # Cancel any tests with tickets already assigned
    if rank in (1, 2):
        self.cancelForTicket("DAOS-2434")

    errors = [0 for _ in range(loop_qty)]
    for loop in range(loop_qty):
        # Log the start of the loop
        loop_id = "LOOP {}/{}".format(loop + 1, loop_qty)
        self.log.info("%s", "-" * 80)
        self.log.info("%s: Starting loop", loop_id)

        # Start this loop with a fresh list of containers
        self.container = []

        # Create the requested number of pools
        info_checks = []
        rebuild_checks = []
        for pool in self.pool:
            pool.create()
            info_checks.append(
                {
                    "pi_uuid": pool.uuid,
                    "pi_ntargets": node_qty * targets,
                    "pi_nnodes": node_qty,
                    "pi_ndisabled": 0,
                }
            )
            rebuild_checks.append(
                {
                    "rs_errno": 0,
                    "rs_done": 1,
                    "rs_obj_nr": 0,
                    "rs_rec_nr": 0,
                }
            )

        # Check the pool info
        status = True
        for index, pool in enumerate(self.pool):
            status &= pool.check_pool_info(**info_checks[index])
            status &= pool.check_rebuild_status(**rebuild_checks[index])
            pool.display_pool_daos_space("after creation")
        self.assertTrue(
            status,
            "Error verifying pool info prior to excluding rank "
            "{}".format(rank))

        # Create a container with 1GB of data in the first pool
        if use_ior:
            mpirun.job.flags.update("-v -w -W -G 1 -k", "ior.flags")
            mpirun.job.daos_destroy.update(False, "ior.daos_destroy")
            mpirun.job.set_daos_params(self.server_group, self.pool[0])
            self.log.info(
                "%s: Running IOR on pool %s to fill container %s with "
                "data", loop_id, self.pool[0].uuid,
                mpirun.job.daos_cont.value)
            self.run_ior(loop_id, mpirun)
        else:
            self.container.append(TestContainer(self.pool[0]))
            self.container[-1].get_params(self)
            self.container[-1].create()
            self.log.info(
                "%s: Writing to pool %s to fill container %s with data",
                loop_id, self.pool[0].uuid, self.container[-1].uuid)
            self.container[-1].object_qty.value = 8
            self.container[-1].record_qty.value = 64
            self.container[-1].data_size.value = 1024 * 1024
            self.container[-1].write_objects(rank, cont_obj_cls, False)
            rank_list = self.container[-1].get_target_rank_lists(
                " after writing data")
            self.container[-1].get_target_rank_count(rank, rank_list)

        # Display the updated pool space usage
        for pool in self.pool:
            pool.display_pool_daos_space("after container creation")

        # Exclude the first rank from the first pool to initiate rebuild
        self.pool[0].start_rebuild([rank], self.d_log)

        # Wait for rebuild to start
        self.pool[0].wait_for_rebuild(True, 1)

        # Create additional containers in the last pool
        start_index = len(self.container)
        self.add_containers_during_rebuild(
            loop_id, cont_qty, self.pool[0], self.pool[-1])

        # Confirm rebuild completes
        self.pool[0].wait_for_rebuild(False, 1)

        # Check the pool info
        info_checks[0]["pi_ndisabled"] += targets
        rebuild_checks[0]["rs_done"] = 1
        rebuild_checks[0]["rs_obj_nr"] = ">=0"
        rebuild_checks[0]["rs_rec_nr"] = ">=0"
        for index, pool in enumerate(self.pool):
            status &= pool.check_pool_info(**info_checks[index])
            status &= pool.check_rebuild_status(**rebuild_checks[index])
        self.assertTrue(status, "Error verifying pool info after rebuild")

        # Verify that each of the created containers exists by opening it
        for index in range(start_index, len(self.container)):
            count = "{}/{}".format(
                index - start_index + 1,
                len(self.container) - start_index)
            if not self.access_container(loop_id, index, count):
                errors[loop] += 1

        # Destroy the containers created during rebuild
        for index in range(start_index, len(self.container)):
            self.container[index].destroy()

        # Read the data from the container created before rebuild
        if use_ior:
            self.log.info(
                "%s: Running IOR on pool %s to verify container %s",
                loop_id, self.pool[0].uuid, mpirun.job.daos_cont.value)
            mpirun.job.flags.update("-v -r -R -G 1 -E", "ior.flags")
            mpirun.job.daos_destroy.update(True, "ior.daos_destroy")
            self.run_ior(loop_id, mpirun)
        else:
            self.log.info(
                "%s: Reading pool %s to verify container %s",
                loop_id, self.pool[0].uuid, self.container[0].uuid)
            self.assertTrue(
                self.container[0].read_objects(),
                "Error verifying data written before rebuild")
            self.container[0].destroy()

        # Destroy the pools
        for pool in self.pool:
            pool.destroy(1)

        self.log.info(
            "%s: Loop %s", loop_id,
            "passed" if errors[loop] == 0 else "failed")

    self.log.info("Test %s", "passed" if sum(errors) == 0 else "failed")
def test_metadata_server_restart(self):
    """JIRA ID: DAOS-1512.

    Test Description:
        This test will verify 2000 small IOR containers after a server
        restart. The test writes IOR in 5 different threads for faster
        execution time; each thread creates 400 containers (8-byte files)
        in the same pool. Restart the servers, read the IOR container
        files written previously, and validate data integrity by using
        the IOR options "-R -G 1".

    Use Cases:
        ?

    :avocado: tags=all,full_regression
    :avocado: tags=hw,large
    :avocado: tags=server,metadata,metadata_ior,nvme
    """
    self.create_pool()
    files_per_thread = 400
    total_ior_threads = 5

    processes = self.params.get("slots", "/run/ior/clientslots/*")

    list_of_uuid_lists = [
        [str(uuid.uuid4()) for _ in range(files_per_thread)]
        for _ in range(total_ior_threads)]

    # Setup the thread manager
    thread_manager = ThreadManager(run_ior_loop, self.timeout - 30)

    # Launch threads to run IOR to write data, restart the agents and
    # servers, and then run IOR to read the data
    for operation in ("write", "read"):
        # Create the IOR threads
        for index in range(total_ior_threads):
            # Define the arguments for the run_ior_loop method
            ior_cmd = IorCommand()
            ior_cmd.get_params(self)
            ior_cmd.set_daos_params(self.server_group, self.pool)
            ior_cmd.flags.value = self.params.get(
                "F", "/run/ior/ior{}flags/".format(operation))

            # Define the job manager for the IOR command
            self.ior_managers.append(Orterun(ior_cmd))
            env = ior_cmd.get_default_env(str(self.ior_managers[-1]))
            self.ior_managers[-1].assign_hosts(
                self.hostlist_clients, self.workdir, None)
            self.ior_managers[-1].assign_processes(processes)
            self.ior_managers[-1].assign_environment(env)
            self.ior_managers[-1].verbose = False

            # Add a thread for these IOR arguments
            thread_manager.add(
                manager=self.ior_managers[-1],
                uuids=list_of_uuid_lists[index],
                tmpdir_base=self.test_dir)
            self.log.info(
                "Created %s thread %s with container uuids %s",
                operation, index, list_of_uuid_lists[index])

        # Launch the IOR threads
        self.log.info(
            "Launching %d IOR %s threads", thread_manager.qty, operation)
        failed_thread_count = thread_manager.check_run()
        if failed_thread_count > 0:
            msg = "{} FAILED IOR {} Thread(s)".format(
                failed_thread_count, operation)
            self.d_log.error(msg)
            self.fail(msg)

        # Restart the agents and servers after the write / before the read
        if operation == "write":
            # Stop the agents
            errors = self.stop_agents()
            self.assertEqual(
                len(errors), 0,
                "Error stopping agents:\n  {}".format(
                    "\n  ".join(errors)))

            # Restart the servers w/o formatting the storage
            errors = self.restart_servers()
            self.assertEqual(
                len(errors), 0,
                "Error restarting servers:\n  {}".format(
                    "\n  ".join(errors)))

            # Start the agents
            self.start_agent_managers()

    self.log.info("Test passed")
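
# A sketch of the run_ior_loop target that ThreadManager invokes above,
# reconstructed from the call site (manager, uuids, tmpdir_base); this is
# an illustration, not the framework's actual implementation. It runs one
# IOR job per container uuid; tmpdir_base is accepted but unused here,
# though it could give each run a private scratch directory.
def run_ior_loop(manager, uuids, tmpdir_base):
    """Run one IOR job per container uuid (hypothetical sketch)."""
    errors = []
    for cont_uuid in uuids:
        # Point this run at its own container.
        manager.job.daos_cont.update(cont_uuid, "ior.daos_cont")
        try:
            manager.run()
        except CommandFailure as error:
            errors.append(str(error))
    if errors:
        raise CommandFailure(
            "IOR failed for {}/{} containers: {}".format(
                len(errors), len(uuids), "; ".join(errors)))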
def ior_runner_thread(self, results):
    """Start threads and wait until all threads are finished.

    Destroy the containers at the end of this thread run.

    Args:
        results (queue): queue for returning thread results

    Returns:
        None

    """
    processes = self.params.get("slots", "/run/ior/clientslots/*")
    container_info = {}
    cmd = DaosCommand(os.path.join(self.prefix, "bin"))
    cmd.set_sub_command("container")
    cmd.sub_command_class.set_sub_command("destroy")
    mpio_util = MpioUtils()
    if mpio_util.mpich_installed(self.hostlist_clients) is False:
        self.fail("Exiting Test: Mpich not installed")

    # Iterate through the different IOR values and run them in sequence
    for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                            self.ior_apis,
                                            self.ior_transfer_size,
                                            self.ior_flags):
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[0])
        ior_cmd.block_size.update(test[1])
        ior_cmd.flags.update(flags)
        key = "{}{}{}".format(oclass, api, test[0])
        container_info[key] = str(uuid.uuid4())

        # Define the job manager for the IOR command
        manager = Mpirun(ior_cmd, mpitype="mpich")
        manager.job.dfs_cont.update(container_info[key])
        env = ior_cmd.get_default_env(str(manager))
        manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        manager.assign_processes(processes)
        manager.assign_environment(env, True)

        # Run the IOR command
        try:
            manager.run()
        except CommandFailure as _error:
            results.put("FAIL")

    # Destroy the containers created by this thread
    for key in container_info:
        cmd.sub_command_class.sub_command_class.pool.value = \
            self.pool.uuid
        cmd.sub_command_class.sub_command_class.svc.value = \
            self.pool.svc_ranks
        cmd.sub_command_class.sub_command_class.cont.value = \
            container_info[key]
        try:
            cmd._get_result()
        except CommandFailure as _error:
            results.put("FAIL")
def test_metadata_server_restart(self):
    """JIRA ID: DAOS-1512.

    Test Description:
        This test will verify 2000 small IOR containers after a server
        restart. The test writes IOR in 5 different threads for faster
        execution time; each thread creates 400 containers (8-byte files)
        in the same pool. Restart the servers, read the IOR container
        files written previously, and validate data integrity by using
        the IOR options "-R -G 1".

    Use Cases:
        ?

    :avocado: tags=metadata,metadata_ior,nvme,small
    """
    files_per_thread = 400
    total_ior_threads = 5
    self.out_queue = Queue.Queue()

    processes = self.params.get("slots", "/run/ior/clientslots/*")

    list_of_uuid_lists = [
        [str(uuid.uuid4()) for _ in range(files_per_thread)]
        for _ in range(total_ior_threads)]

    # Launch threads to run IOR to write data, restart the agents and
    # servers, and then run IOR to read the data
    for operation in ("write", "read"):
        # Create the IOR threads
        threads = []
        for index in range(total_ior_threads):
            # Define the arguments for the ior_runner_thread method
            ior_cmd = IorCommand()
            ior_cmd.get_params(self)
            ior_cmd.set_daos_params(self.server_group, self.pool)
            ior_cmd.flags.value = self.params.get(
                "F", "/run/ior/ior{}flags/".format(operation))

            # Add a thread for these IOR arguments
            threads.append(
                threading.Thread(
                    target=ior_runner_thread,
                    kwargs={
                        "ior_cmd": ior_cmd,
                        "uuids": list_of_uuid_lists[index],
                        "mgr": self.orterun,
                        "attach": self.tmp,
                        "hostfile": self.hostfile_clients,
                        "procs": processes,
                        "results": self.out_queue}))
            self.log.info(
                "Created %s thread %s with container uuids %s",
                operation, index, list_of_uuid_lists[index])

        # Launch the IOR threads
        if self.thread_control(threads, operation) == "FAIL":
            self.d_log.error("IOR {} Thread FAIL".format(operation))
            self.fail("IOR {} Thread FAIL".format(operation))

        # Restart the agents and servers after the write / before the read
        if operation == "write":
            # Stop the agents and servers
            if self.agent_sessions:
                stop_agent(self.agent_sessions, self.hostlist_clients)
            stop_server(hosts=self.hostlist_servers)

            # Start the agents
            self.agent_sessions = run_agent(
                self.basepath, self.hostlist_clients,
                self.hostlist_servers)

            # Start the servers
            run_server(
                self.hostfile_servers, self.server_group, self.basepath,
                clean=False)
def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
    """Create IOR cmdlines to run in slurm batch.

    Args:
        self (obj): soak obj
        job_spec (str): ior job in yaml to run
        pool (obj): TestPool obj
        ppn (int): number of tasks to run on each node
        nodesperjob (int): number of nodes per job

    Returns:
        list: a list of [sbatch cmdline list, log name] pairs

    """
    commands = []
    ior_params = os.path.join(os.sep, "run", job_spec, "*")
    ior_timeout = self.params.get("job_timeout", ior_params, 10)
    mpi_module = self.params.get(
        "mpi_module", "/run/*", default="mpi/mpich-x86_64")

    # IOR job specs with a list of parameters; update each value
    api_list = self.params.get("api", ior_params)
    tsize_list = self.params.get("transfer_size", ior_params)
    bsize_list = self.params.get("block_size", ior_params)
    oclass_list = self.params.get("dfs_oclass", ior_params)
    plugin_path = self.params.get("plugin_path", "/run/hdf5_vol/")

    # Update the IOR cmdline for each additional IOR obj
    for api in api_list:
        for b_size in bsize_list:
            for t_size in tsize_list:
                for o_type in oclass_list:
                    # Cancel for ticket DAOS-6095
                    if (api in ["HDF5-VOL", "HDF5", "POSIX"]
                            and t_size == "4k"
                            and o_type in ["RP_2G1", "RP_2GX"]):
                        self.add_cancel_ticket(
                            "DAOS-6095",
                            "IOR -a {} with -t {} and -o {}".format(
                                api, t_size, o_type))
                        continue
                    # Cancel for ticket DAOS-6308
                    if api == "MPIIO" and o_type == "RP_2GX":
                        self.add_cancel_ticket(
                            "DAOS-6308",
                            "IOR -a {} with -o {}".format(api, o_type))
                        continue
                    if api in ["HDF5-VOL", "HDF5", "POSIX"] and ppn > 16:
                        continue
                    ior_cmd = IorCommand()
                    ior_cmd.namespace = ior_params
                    ior_cmd.get_params(self)
                    ior_cmd.max_duration.update(ior_timeout)
                    if api == "HDF5-VOL":
                        ior_cmd.api.update("HDF5")
                    else:
                        ior_cmd.api.update(api)
                    ior_cmd.block_size.update(b_size)
                    ior_cmd.transfer_size.update(t_size)
                    if api in ["HDF5-VOL", "POSIX"]:
                        ior_cmd.dfs_oclass.update(None)
                        ior_cmd.dfs_dir_oclass.update(None)
                    else:
                        ior_cmd.dfs_oclass.update(o_type)
                        ior_cmd.dfs_dir_oclass.update(o_type)
                    if ior_cmd.api.value == "DFS":
                        ior_cmd.test_file.update(
                            os.path.join("/", "testfile"))
                    add_containers(self, pool, o_type)
                    ior_cmd.set_daos_params(
                        self.server_group, pool, self.container[-1].uuid)
                    env = ior_cmd.get_default_env("srun")
                    sbatch_cmds = [
                        "module load -q {}".format(mpi_module)]
                    # Include the dfuse cmdlines
                    log_name = "{}_{}_{}_{}_{}_{}_{}_{}".format(
                        job_spec, api, b_size, t_size, o_type,
                        nodesperjob * ppn, nodesperjob, ppn)
                    if api in ["HDF5-VOL", "POSIX"]:
                        dfuse, dfuse_start_cmdlist = start_dfuse(
                            self, pool, self.container[-1], nodesperjob,
                            "SLURM", name=log_name, job_spec=job_spec)
                        sbatch_cmds.extend(dfuse_start_cmdlist)
                        ior_cmd.test_file.update(
                            os.path.join(
                                dfuse.mount_dir.value, "testfile"))
                    # Add the envs if the api is HDF5-VOL
                    if api == "HDF5-VOL":
                        env["HDF5_VOL_CONNECTOR"] = "daos"
                        env["HDF5_PLUGIN_PATH"] = "{}".format(plugin_path)
                        # env["H5_DAOS_BYPASS_DUNS"] = 1
                    srun_cmd = Srun(ior_cmd)
                    srun_cmd.assign_processes(nodesperjob * ppn)
                    srun_cmd.assign_environment(env, True)
                    srun_cmd.ntasks_per_node.update(ppn)
                    srun_cmd.nodes.update(nodesperjob)
                    sbatch_cmds.append(str(srun_cmd))
                    sbatch_cmds.append("status=$?")
                    if api in ["HDF5-VOL", "POSIX"]:
                        sbatch_cmds.extend(
                            stop_dfuse(dfuse, nodesperjob, "SLURM"))
                    commands.append([sbatch_cmds, log_name])
                    self.log.info("<<IOR %s cmdlines>>:", api)
                    for cmd in sbatch_cmds:
                        self.log.info("%s", cmd)
    return commands
def test_metadata_server_restart(self):
    """JIRA ID: DAOS-1512.

    Test Description:
        This test will verify 2000 small IOR containers after a server
        restart. The test writes IOR in 5 different threads for faster
        execution time; each thread creates 400 containers (8-byte files)
        in the same pool. Restart the servers, read the IOR container
        files written previously, and validate data integrity by using
        the IOR options "-R -G 1".

    Use Cases:
        ?

    :avocado: tags=metadata,metadata_ior,nvme,large
    """
    files_per_thread = 400
    total_ior_threads = 5
    self.out_queue = queue.Queue()

    processes = self.params.get("slots", "/run/ior/clientslots/*")

    list_of_uuid_lists = [
        [str(uuid.uuid4()) for _ in range(files_per_thread)]
        for _ in range(total_ior_threads)]

    # Launch threads to run IOR to write data, restart the agents and
    # servers, and then run IOR to read the data
    for operation in ("write", "read"):
        # Create the IOR threads
        threads = []
        for index in range(total_ior_threads):
            # Define the arguments for the ior_runner_thread method
            ior_cmd = IorCommand()
            ior_cmd.get_params(self)
            ior_cmd.set_daos_params(self.server_group, self.pool)
            ior_cmd.flags.value = self.params.get(
                "F", "/run/ior/ior{}flags/".format(operation))

            # Define the job manager for the IOR command
            manager = Orterun(ior_cmd)
            env = ior_cmd.get_default_env(str(manager))
            manager.assign_hosts(
                self.hostlist_clients, self.workdir, None)
            manager.assign_processes(processes)
            manager.assign_environment(env)

            # Add a thread for these IOR arguments
            threads.append(
                threading.Thread(
                    target=ior_runner_thread,
                    kwargs={
                        "manager": manager,
                        "uuids": list_of_uuid_lists[index],
                        "results": self.out_queue}))
            self.log.info(
                "Created %s thread %s with container uuids %s",
                operation, index, list_of_uuid_lists[index])

        # Launch the IOR threads
        if self.thread_control(threads, operation) == "FAIL":
            self.d_log.error("IOR {} Thread FAIL".format(operation))
            self.fail("IOR {} Thread FAIL".format(operation))

        # Restart the agents and servers after the write / before the read
        if operation == "write":
            # Stop the agents
            errors = self.stop_agents()
            self.assertEqual(
                len(errors), 0,
                "Error stopping agents:\n  {}".format(
                    "\n  ".join(errors)))

            # Stop the servers
            errors = self.stop_servers()
            self.assertEqual(
                len(errors), 0,
                "Error stopping servers:\n  {}".format(
                    "\n  ".join(errors)))

            # Start the agents
            self.start_agent_managers()

            # Start the servers
            self.start_server_managers()
def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
    """Create IOR cmdlines to run in slurm batch.

    Args:
        job_spec (str): ior job in yaml to run
        pool (obj): TestPool obj
        ppn (int): number of tasks to run on each node
        nodesperjob (int): number of nodes per job

    Returns:
        list: a list of [sbatch cmdline list, log name] pairs

    """
    commands = []
    iteration = self.test_iteration
    ior_params = "/run/" + job_spec + "/*"
    mpi_module = self.params.get(
        "mpi_module", "/run/", default="mpi/mpich-x86_64")

    # IOR job specs with a list of parameters; update each value
    api_list = self.params.get("api", ior_params + "*")
    tsize_list = self.params.get("transfer_size", ior_params + "*")
    bsize_list = self.params.get("block_size", ior_params + "*")
    oclass_list = self.params.get("dfs_oclass", ior_params + "*")
    plugin_path = self.params.get("plugin_path", "/run/hdf5_vol/")

    # Check if capable of doing rebuild; if yes then dfs_oclass = RP_*GX
    if is_harasser(self, "rebuild"):
        oclass_list = self.params.get("dfs_oclass", "/run/rebuild/*")

    # Update the IOR cmdline for each additional IOR obj
    for api in api_list:
        for b_size in bsize_list:
            for t_size in tsize_list:
                for o_type in oclass_list:
                    ior_cmd = IorCommand()
                    ior_cmd.namespace = ior_params
                    ior_cmd.get_params(self)
                    if iteration is not None and iteration < 0:
                        ior_cmd.repetitions.update(1000000)
                    if self.job_timeout is not None:
                        ior_cmd.max_duration.update(self.job_timeout)
                    else:
                        ior_cmd.max_duration.update(10)
                    if api == "HDF5-VOL":
                        ior_cmd.api.update("HDF5")
                    else:
                        ior_cmd.api.update(api)
                    ior_cmd.block_size.update(b_size)
                    ior_cmd.transfer_size.update(t_size)
                    ior_cmd.dfs_oclass.update(o_type)
                    if ior_cmd.api.value == "DFS":
                        ior_cmd.test_file.update(
                            os.path.join("/", "testfile"))
                    ior_cmd.set_daos_params(self.server_group, pool)
                    env = ior_cmd.get_default_env("srun")
                    sbatch_cmds = [
                        "module load -q {}".format(mpi_module)]
                    # Include the dfuse cmdlines
                    if api in ["HDF5-VOL", "POSIX"]:
                        dfuse, dfuse_start_cmdlist = start_dfuse(
                            self, pool, nodesperjob, "SLURM")
                        sbatch_cmds.extend(dfuse_start_cmdlist)
                        ior_cmd.test_file.update(
                            os.path.join(
                                dfuse.mount_dir.value, "testfile"))
                    # Add the envs if the api is HDF5-VOL
                    if api == "HDF5-VOL":
                        env["HDF5_VOL_CONNECTOR"] = "daos"
                        env["HDF5_PLUGIN_PATH"] = "{}".format(plugin_path)
                        # env["H5_DAOS_BYPASS_DUNS"] = 1
                    srun_cmd = Srun(ior_cmd)
                    srun_cmd.assign_processes(nodesperjob * ppn)
                    srun_cmd.assign_environment(env, True)
                    srun_cmd.ntasks_per_node.update(ppn)
                    srun_cmd.nodes.update(nodesperjob)
                    sbatch_cmds.append(str(srun_cmd))
                    sbatch_cmds.append("status=$?")
                    if api in ["HDF5-VOL", "POSIX"]:
                        sbatch_cmds.extend(
                            stop_dfuse(dfuse, nodesperjob, "SLURM"))
                    sbatch_cmds.append("exit $status")
                    log_name = "{}_{}_{}_{}".format(
                        api, b_size, t_size, o_type)
                    commands.append([sbatch_cmds, log_name])
                    self.log.info("<<IOR %s cmdlines>>:", api)
                    for cmd in sbatch_cmds:
                        self.log.info("%s", cmd)
    return commands