def test_ior_intercept_verify_data(self):
    """Jira ID: DAOS-3502.

    Test Description:
        Run ior through dfuse with the interception library on 5 clients
        and without the interception library on 1 client for at least
        30 minutes, and verify the data integrity using ior's Read Verify
        and Write Verify options.

    Use case:
        Run ior with read, write, fpp, read verify write verify for
        30 minutes
        Run ior with read, write, read verify write verify for 30 minutes

    :avocado: tags=all,full_regression,hw,large
    :avocado: tags=daosio,iorinterceptverifydata
    """
    il_results = {}
    il_library = os.path.join(self.prefix, 'lib64', 'libioil.so')
    self.run_multiple_ior_with_pool(il_results, il_library)

    # The IOR stdout is not used directly; the metrics stored per job
    # number are logged with a label describing the job instead.
    labelled_jobs = (
        ("5 clients - with interception library", 1),
        ("1 client - without interception library", 2),
    )
    for label, job_num in labelled_jobs:
        IorCommand.log_metrics(self.log, label, il_results[job_num])
def test_ior_intercept_multi_client(self):
    """Jira ID: DAOS-3499.

    Test Description:
        Run ior through dfuse on multiple clients for 5 minutes and
        capture the metrics. Rerun the same ior with the interception
        library (LD_PRELOAD exported to the libioil.so path), capture
        the metrics again, and check that the interception library
        makes a significant performance improvement.

    Use case:
        Run ior with read, write for 5 minutes
        Run ior with read, write for 5 minutes with interception library
        Compare the results and check whether using interception library
        provides better performance.

    :avocado: tags=all,full_regression,hw,large
    :avocado: tags=daosio,iorinterceptmulticlient
    """
    # Baseline run through dfuse without the interception library
    suffix = self.ior_cmd.transfer_size.value
    out = self.run_ior_with_pool(test_file_suffix=suffix)
    without_intercept = IorCommand.get_ior_metrics(out)

    # Same run with the interception library, using a distinct test file
    intercept = os.path.join(self.prefix, 'lib64', 'libioil.so')
    suffix = suffix + "intercept"
    out = self.run_ior_with_pool(intercept, test_file_suffix=suffix)
    with_intercept = IorCommand.get_ior_metrics(out)

    metrics = (
        ("Max", int(IorMetrics.Max_MiB)),
        ("Min", int(IorMetrics.Min_MiB)),
        ("Mean", int(IorMetrics.Mean_MiB)),
    )
    write_x = self.params.get("write_x", "/run/ior/iorflags/ssf/*", 1)

    # The read performance is almost the same with or without the
    # interception library, but it can arbitrarily be a bit lower with
    # the library. Verify it is not drastically lower by checking that it
    # stays above 60% of the value measured without the library.
    read_x = 0.6

    # Row 0 of the metrics holds the write results and row 1 the read
    # results. assertGreater reports both operands when a check fails.
    checks = (("write", 0, write_x), ("read", 1, read_x))
    for operation, row, factor in checks:
        for stat, column in metrics:
            self.assertGreater(
                float(with_intercept[row][column]),
                factor * float(without_intercept[row][column]),
                "{} {} MiB with the interception library is not above "
                "{} x the value without it".format(stat, operation, factor))
def test_ior_intercept(self):
    """Jira ID: DAOS-3498.

    Test Description:
        Run ior using dfuse for 5 minutes and capture the metrics. Rerun
        the same ior with the interception library (LD_PRELOAD exported
        to the libioil.so path), capture the metrics again, and check
        that the interception library makes a significant performance
        improvement.

    Use case:
        Run ior with read, write, CheckWrite, CheckRead for 5 minutes
        Run ior with read, write, CheckWrite, CheckRead for 5 minutes
        with interception library
        Compare the results and check whether using interception library
        provides better performance.

    :avocado: tags=all,full_regression,hw,small,daosio,iorinterceptbasic
    """
    apis = self.params.get("ior_api", '/run/ior/iorflags/ssf/*')
    for api in apis:
        self.ior_cmd.api.update(api)
        out = self.run_ior_with_pool(fail_on_warning=False)
        without_intercept = IorCommand.get_ior_metrics(out)

        # The interception library comparison is only done for POSIX
        if api != "POSIX":
            continue

        intercept = os.path.join(self.prefix, 'lib64', 'libioil.so')
        out = self.run_ior_with_pool(intercept, fail_on_warning=False)
        with_intercept = IorCommand.get_ior_metrics(out)

        metrics = (
            ("Max", int(IorMetrics.Max_MiB)),
            ("Min", int(IorMetrics.Min_MiB)),
            ("Mean", int(IorMetrics.Mean_MiB)),
        )
        write_x = self.params.get("write_x", "/run/ior/iorflags/ssf/*", 1)
        read_x = self.params.get("read_x", "/run/ior/iorflags/ssf/*", 1)

        # Row 0 of the metrics holds the write results and row 1 the read
        # results. assertGreater reports both operands when a check
        # fails, unlike assertTrue on a pre-evaluated comparison.
        checks = (("write", 0, write_x), ("read", 1, read_x))
        for operation, row, factor in checks:
            for stat, column in metrics:
                self.assertGreater(
                    float(with_intercept[row][column]),
                    factor * float(without_intercept[row][column]),
                    "{} {} MiB with the interception library is not "
                    "above {} x the value without it".format(
                        stat, operation, factor))
def setUp(self):
    """Prepare the logs, the parent test setup and the IOR command."""
    # Each test case gets its own log files
    self.update_log_file_names()

    # The parent setUp starts the servers and agents
    super(IorTestBase, self).setUp()

    # Build the IOR command from the test parameters
    self.ior_cmd = ior_cmd = IorCommand()
    ior_cmd.get_params(self)

    client_processes = '/run/ior/client_processes/*'
    self.processes = self.params.get("np", client_processes)
def run_il_perf_check(self):
    """Check that IOR over DFUSE + IL performs similarly to IOR over DFS.

    Steps:
        Run IOR with DFS.
        Run IOR with DFUSE + IL.
        Verify performance with DFUSE + IL is similar to DFS.

    """
    # Thresholds for the write and read performance differences
    namespace = self.ior_cmd.namespace
    write_x = self.params.get("write_x", namespace, None)
    read_x = self.params.get("read_x", namespace, None)
    if not (write_x is not None and read_x is not None):
        self.fail("Failed to get write_x and read_x from config")

    # First run: IOR with DFS
    self.ior_cmd.api.update("DFS")
    dfs_out = self.run_ior_with_pool(fail_on_warning=self.log.info)
    dfs_perf = IorCommand.get_ior_metrics(dfs_out)

    # Drop the pool and container so the next run uses new ones
    self.container.destroy()
    self.container = None
    self.pool.destroy()
    self.pool = None

    # Second run: IOR with dfuse + IL
    self.ior_cmd.api.update("POSIX")
    il_library = os.path.join(self.prefix, 'lib64', 'libioil.so')
    dfuse_out = self.run_ior_with_pool(
        intercept=il_library, fail_on_warning=self.log.info)
    dfuse_perf = IorCommand.get_ior_metrics(dfuse_out)

    def _verify_max(row, threshold, dfs_fmt, dfuse_fmt, fail_msg):
        """Log and verify the Max MiB difference for one metrics row."""
        dfs_max = float(dfs_perf[row][IorMetrics.Max_MiB])
        dfuse_max = float(dfuse_perf[row][IorMetrics.Max_MiB])
        actual_x = percent_change(dfs_max, dfuse_max)
        self.log.info(dfs_fmt, dfs_max)
        self.log.info(dfuse_fmt, dfuse_max)
        self.log.info("Percent Diff: %.2f%%", actual_x * 100)
        self.assertLessEqual(abs(actual_x), threshold, fail_msg)

    # Verify write and read performance are within the thresholds.
    # Since Min can have a lot of variance, don't check Min or Mean.
    # Ideally, we might want to look at the Std Dev to ensure the results
    # are admissible.
    _verify_max(
        0, write_x, "DFS Max Write:      %.2f",
        "DFUSE IL Max Write: %.2f", "Max Write Diff too large")
    _verify_max(
        1, read_x, "DFS Max Read:      %.2f",
        "DFUSE IL Max Read: %.2f", "Max Read Diff too large")
def create_ior_cmdline(self, job_spec, pool, ppn, nodesperjob):
    """Create the IOR command lines to run in a slurm batch.

    One srun command line is generated for every combination of the api,
    block size, transfer size and object class values read from the test
    yaml for the job.

    Args:
        job_spec (str): ior job in yaml to run
        pool (obj): TestPool obj
        ppn (int): number of tasks to run on each node
        nodesperjob (int): number of nodes per job

    Returns:
        list: a [cmdline string, log name] pair for each combination

    """
    commands = []
    iteration = self.test_iteration
    ior_params = "/run/" + job_spec + "/*"

    # IOR job specs with a list of parameters; update each value
    api_list = self.params.get("api", ior_params + "*")
    tsize_list = self.params.get("transfer_size", ior_params + "*")
    bsize_list = self.params.get("block_size", ior_params + "*")
    oclass_list = self.params.get("daos_oclass", ior_params + "*")

    # check if capable of doing rebuild; if yes then daos_oclass = RP_*GX
    if self.is_harasser("rebuild"):
        oclass_list = self.params.get("daos_oclass", "/run/rebuild/*")

    # update IOR cmdline for each additional IOR obj
    for api in api_list:
        for b_size in bsize_list:
            for t_size in tsize_list:
                for o_type in oclass_list:
                    ior_cmd = IorCommand()
                    ior_cmd.namespace = ior_params
                    ior_cmd.get_params(self)
                    if iteration is not None and iteration < 0:
                        ior_cmd.repetitions.update(1000000)
                    if self.job_timeout is not None:
                        ior_cmd.max_duration.update(self.job_timeout)
                    else:
                        ior_cmd.max_duration.update(10)
                    ior_cmd.api.update(api)
                    ior_cmd.block_size.update(b_size)
                    ior_cmd.transfer_size.update(t_size)
                    ior_cmd.daos_oclass.update(o_type)
                    ior_cmd.set_daos_params(self.server_group, pool)

                    # srun cmdline
                    nprocs = nodesperjob * ppn
                    env = ior_cmd.get_default_env("srun")
                    if ior_cmd.api.value == "MPIIO":
                        env["DAOS_CONT"] = ior_cmd.daos_cont.value
                    cmd = Srun(ior_cmd)
                    cmd.assign_processes(nprocs)
                    cmd.assign_environment(env, True)
                    cmd.ntasks_per_node.update(ppn)
                    log_name = "{}_{}_{}_{}".format(
                        api, b_size, t_size, o_type)
                    # str() rather than calling the dunder directly
                    commands.append([str(cmd), log_name])
                    # %s formatting stringifies the pair lazily
                    self.log.info(
                        "<<IOR cmdline>>: %s \n", commands[-1])
    return commands
def setUp(self):
    """Prepare the logs, the parent test setup and the IOR parameters."""
    # Each test case gets its own log files
    self.update_log_file_names()

    # The parent setUp starts the servers and agents
    super().setUp()

    # Build the IOR command from the test parameters
    self.ior_cmd = ior_cmd = IorCommand()
    ior_cmd.get_params(self)

    get_param = self.params.get
    client_processes = '/run/ior/client_processes/*'
    ior_namespace = '/run/ior/*'
    self.processes = get_param("np", client_processes)
    self.ppn = get_param("ppn", client_processes)
    self.subprocess = get_param("subprocess", ior_namespace, False)
    self.ior_timeout = get_param("ior_timeout", ior_namespace, None)
def setUp(self):
    """Prepare the logs, the parent test setup and the IOR parameters."""
    # Each test case gets its own log files
    self.update_log_file_names()

    # The parent setUp starts the servers and agents
    super(IorTestBase, self).setUp()

    # Build the IOR command from the test parameters
    self.ior_cmd = ior_cmd = IorCommand()
    ior_cmd.get_params(self)

    get_param = self.params.get
    self.processes = get_param("np", '/run/ior/client_processes/*')
    self.subprocess = get_param("subprocess", '/run/ior/*', False)

    # The run_multiple_ior method needs this lock
    self.lock = threading.Lock()
def start_ior_thread(self, results, create_cont, operation):
    """Run IOR for the requested operation and queue a PASS/FAIL result.

    Args:
        results (queue): queue for returning thread results
        create_cont (Bool): To create the new container or not.
        operation (str):
            Write/WriteRead: It will Write or Write/Read based on the IOR
                             parameter in the yaml file.
            Auto_Write/Auto_Read: It will calculate the IOR block size
                                  based on the requested storage % to be
                                  filled.
    """
    # Start from the default flags (Write only or Write/Read per the yaml)
    self.ior_cmd.flags.value = self.ior_default_flags

    # 'Auto' operations size the block from the server % to fill up
    if 'Auto' in operation:
        self.ior_cmd.block_size.update(
            '{}'.format(self.calculate_ior_block_size()))

    # Read operations never create a container and use the read flags
    # from the yaml file
    is_read = 'Auto_Read' in operation or operation == "Read"
    if is_read:
        create_cont = False
        self.ior_cmd.flags.value = self.ior_read_flags

    # Run the IOR command and report the outcome through the queue
    try:
        ior_out = self.run_ior_with_pool(
            create_cont=create_cont,
            fail_on_warning=self.fail_on_warning)
        self.ior_matrix = IorCommand.get_ior_metrics(ior_out)
    except (CommandFailure, TestFail):
        results.put("FAIL")
    else:
        results.put("PASS")
def start_ior_thread(self, results, create_cont, operation='WriteRead'):
    """Run IOR for the requested operation and queue a PASS/FAIL result.

    Args:
        results (queue): queue for returning thread results
        create_cont (Bool): whether to create a new container; forced to
            False for the 'Read' operation.
        operation (str): IOR operation for read/write.
                         By default it does whatever is set in the
                         ior_flags.
    """
    self.ior_cmd.flags.value = self.ior_default_flags

    if operation == 'Read':
        # Read only: do not create a container and use the read flags
        create_cont = False
        self.ior_cmd.flags.value = self.ior_read_flags
    elif operation == 'Write':
        # Write: size the block from the server % to fill up
        self.ior_cmd.block_size.update(
            '{}'.format(self.calculate_ior_block_size()))

    # Run the IOR command and report the outcome through the queue
    try:
        ior_out = self.run_ior_with_pool(
            create_cont=create_cont,
            fail_on_warning=self.fail_on_warning)
        self.ior_matrix = IorCommand.get_ior_metrics(ior_out)
    except (CommandFailure, TestFail):
        results.put("FAIL")
    else:
        results.put("PASS")
def setUp(self):
    """Prepare the parent test setup, the IOR command and a pool."""
    # The parent setUp starts the servers and agents
    super(IorTestBase, self).setUp()

    # Build the IOR command from the test parameters
    self.ior_cmd = ior_cmd = IorCommand()
    ior_cmd.get_params(self)

    get_param = self.params.get
    self.processes = get_param("np", '/run/ior/client_processes/*')
    self.mpiio_oclass = get_param("mpiio_oclass", '/run/ior/*')

    # Configure the pool from the test parameters, then create it
    self.pool = test_pool = TestPool(self.context, self.log)
    test_pool.get_params(self)
    test_pool.create()
def run_ior_collect_error(self, results, job_num, file_name, clients):
    """Run an IOR command and record its outcome in results.

    On success results[job_num] holds True, the ior metrics and the
    command's stderr text; on failure it holds False and an error message.

    Args:
        results (dict): A dictionary object to store the ior metrics.
        job_num (int): Assigned job number.
        file_name (str): File name used for the IOR test file.
        clients (list): Client hostnames to run IOR from.
    """
    command = IorCommand()
    command.get_params(self)
    command.set_daos_params(
        group=self.server_group, pool=self.pool,
        cont_uuid=self.container.uuid)
    command.test_file.update(os.path.join("/", file_name))

    job_manager = get_job_manager(
        test=self, class_name="Mpirun", job=command,
        subprocess=self.subprocess, mpi_type="mpich")
    job_manager.assign_hosts(
        clients, self.workdir, self.hostfile_clients_slots)
    tasks_per_node = self.params.get("ppn", '/run/ior/client_processes/*')
    job_manager.ppn.update(tasks_per_node, 'mpirun.ppn')
    job_manager.processes.update(None, 'mpirun.np')

    try:
        ior_output = job_manager.run()
        entry = results[job_num] = [True]
        # The metrics are kept for debugging.
        entry.extend(IorCommand.get_ior_metrics(ior_output))
        # The error message is verified by the caller.
        entry.append(ior_output.stderr_text)
    except CommandFailure as error:
        results[job_num] = [False, "IOR failed: {}".format(error)]
def setUp(self):
    """Prepare the logs, the parent test setup and the local IOR values."""
    # Each test case gets its own log files
    self.update_log_file_names()

    # The parent setUp starts the servers and agents
    super().setUp()

    self.hostfile_clients = None
    self.ior_local_cmd = local_cmd = IorCommand()
    local_cmd.get_params(self)
    self.ior_default_flags = local_cmd.flags.value

    get_param = self.params.get
    size_namespace = '/run/ior/transfersize_blocksize/*'
    self.ior_scm_xfersize = get_param(
        "transfer_size", size_namespace, '2048')
    self.ior_read_flags = get_param(
        "read_flags", '/run/ior/iorflags/*', '-r -R -k -G 1')
    self.ior_nvme_xfersize = get_param(
        "nvme_transfer_size", size_namespace, '16777216')

    # Get the number of daos_engine
    server_job = self.server_managers[0].manager.job
    self.engines = server_job.yaml.engine_params
    self.dmg_command = self.get_dmg_command()
def setUp(self):
    """Prepare the logs, the parent test setup and the IOR command."""
    # Each test case gets its own log files
    self.update_log_file_names()

    # The parent setUp starts the servers and agents
    super(IorTestBase, self).setUp()

    # Build the IOR command from the test parameters
    self.ior_cmd = ior_cmd = IorCommand()
    ior_cmd.get_params(self)
    self.processes = self.params.get("np", '/run/ior/client_processes/*')

    # Until DAOS-3320 is resolved run IOR for POSIX
    # with single client node
    if ior_cmd.api.value == "POSIX":
        first_client = self.hostlist_clients[0]
        self.hostlist_clients = [first_client]
        self.hostfile_clients = write_host_file.write_host_file(
            self.hostlist_clients, self.workdir,
            self.hostfile_clients_slots)
def run_custom_ior_cmd(self, ior_command, clients, results, job_num,
                       intercept=None):
    """Run a customized IOR command instead of self.ior_cmd.

    Expected to be used with threaded code where multiple IOR commands
    are executed in parallel.

    The pool space is displayed before and after the run for reference.

    Args:
        ior_command (IorCommand): Custom IOR command instance.
        clients (list): hosts on which to run ior
        results (dict): A dictionary object to store the ior metrics
        job_num (int): Assigned job number
        intercept (str, optional): path to interception library. Defaults
            to None.
    """
    self.log.info("--- IOR Thread %d: Start ---", job_num)

    # Each job gets its own test file under the dfuse mount point
    file_name = "testfile{}{}".format(
        ior_command.transfer_size.value, job_num)
    if intercept:
        file_name += "intercept"
    ior_command.test_file.update(
        os.path.join(self.dfuse.mount_dir.value, file_name))

    # Get the custom job manager that's associated with this thread.
    job_manager = get_job_manager(
        self, "Mpirun", ior_command, self.subprocess, "mpich")

    procs_per_client = self.processes // len(self.hostlist_clients)
    procs = procs_per_client * len(clients)

    env = ior_command.get_default_env(str(job_manager), self.client_log)
    if intercept:
        env["LD_PRELOAD"] = intercept
    job_manager.assign_hosts(
        clients, self.workdir, self.hostfile_clients_slots)
    job_manager.assign_processes(procs)
    job_manager.assign_environment(env)

    self.log.info("--- IOR Thread %d: Starting IOR ---", job_num)
    self.display_pool_space()
    try:
        ior_output = job_manager.run()
        entry = results[job_num] = [True]
        entry.extend(IorCommand.get_ior_metrics(ior_output))
    except CommandFailure as error:
        results[job_num] = [False, "IOR failed: {}".format(error)]
    finally:
        self.display_pool_space()

    self.log.info("--- IOR Thread %d: End ---", job_num)
def create_ior_cmdline(self, job_params, job_spec, pool):
    """Create the IOR command lines to run in a slurm batch.

    One srun command line is produced for each combination of block
    size, object class and transfer size configured for the job.

    Args:
        job_params (str): job params from yaml file
        job_spec (str): specific ior job to run
        pool (obj): TestPool obj

    Returns:
        list: the srun cmdline strings

    """
    cmdlines = []
    ior_params = "/run/" + job_spec + "/"

    ior_cmd = IorCommand()
    ior_cmd.namespace = ior_params
    ior_cmd.get_params(self)

    iteration = self.test_iteration
    if iteration is not None and iteration < 0:
        ior_cmd.repetitions.update(1000000)
    ior_cmd.max_duration.update(self.params.get("time", job_params + '*'))

    # The yaml gives a list of values for each of these parameters; the
    # command is updated with one value of each per generated cmdline.
    transfer_sizes = ior_cmd.transfer_size.value
    block_sizes = ior_cmd.block_size.value
    object_classes = ior_cmd.daos_oclass.value

    for block_size in block_sizes:
        ior_cmd.block_size.update(block_size)
        for object_class in object_classes:
            ior_cmd.daos_oclass.update(object_class)
            for transfer_size in transfer_sizes:
                ior_cmd.transfer_size.update(transfer_size)
                ior_cmd.set_daos_params(self.server_group, pool)

                # export the user environment to test node
                exports = ["ALL"]
                if ior_cmd.api.value == "MPIIO":
                    mpiio_env = {
                        "CRT_ATTACH_INFO_PATH": os.path.join(
                            self.basepath, "install/tmp"),
                        "DAOS_POOL": str(ior_cmd.daos_pool.value),
                        "MPI_LIB": "\"\"",
                        "DAOS_SVCL": str(ior_cmd.daos_svcl.value),
                        "DAOS_SINGLETON_CLI": 1,
                        "FI_PSM2_DISCONNECT": 1
                    }
                    for name, value in mpiio_env.items():
                        exports.append("{}={}".format(name, value))

                export_arg = ",".join(exports)
                cmdline = "srun -l --mpi=pmi2 --export={} {}".format(
                    export_arg, ior_cmd)
                cmdlines.append(cmdline)
                self.log.debug("<<IOR cmdline >>: %s \n", cmdline)
    return cmdlines
def test_ior_intercept_verify_data(self):
    """Jira ID: DAOS-3502.

    Test Description:
        Run ior through dfuse with the interception library on 5 clients
        and without the interception library on 1 client for at least
        30 minutes, and verify the data integrity using ior's Read Verify
        and Write Verify options.

    Use case:
        Run ior with read, write, fpp, read verify write verify for
        30 minutes
        Run ior with read, write, read verify write verify for 30 minutes

    :avocado: tags=all,full_regression
    :avocado: tags=hw,large
    :avocado: tags=daosio,ior_intercept_verify_data
    """
    self.add_pool()
    self.add_container(self.pool)

    il_library = os.path.join(self.prefix, 'lib64', 'libioil.so')
    results = {}

    # All clients but the last run with the interception library; the
    # last client runs without it.
    last = len(self.hostlist_clients) - 1
    clients_with_il = self.hostlist_clients[0:last]
    clients_without_il = [self.hostlist_clients[-1]]

    self.run_ior_threads_il(
        results=results, intercept=il_library,
        with_clients=clients_with_il,
        without_clients=clients_without_il)

    labelled_jobs = (
        ("5 clients - with interception library", 1),
        ("1 client - without interception library", 2),
    )
    for label, job_num in labelled_jobs:
        IorCommand.log_metrics(self.log, label, results[job_num])
def setUp(self):
    """Prepare the logs, the parent test setup and the IOR parameters."""
    # Each test case gets its own log files
    self.update_log_file_names()

    # The parent setUp starts the servers and agents
    super(IorTestBase, self).setUp()

    # Build the IOR command from the test parameters
    self.ior_cmd = ior_cmd = IorCommand()
    ior_cmd.get_params(self)

    get_param = self.params.get
    self.processes = get_param("np", '/run/ior/client_processes/*')
    self.co_prop = get_param("container_properties", "/run/container/*")

    # Until DAOS-3320 is resolved run IOR for POSIX
    # with single client node
    if ior_cmd.api.value == "POSIX":
        first_client = self.hostlist_clients[0]
        self.hostlist_clients = [first_client]
        self.hostfile_clients = write_host_file.write_host_file(
            self.hostlist_clients, self.workdir,
            self.hostfile_clients_slots)

    # The run_multiple_ior method needs this lock
    self.lock = threading.Lock()
def ior_thread(self, pool, oclass, api, test, flags, results):
    """Run one IOR command with mpirun and queue "FAIL" if it fails.

    Args:
        pool (object): pool handle
        oclass (str): IOR object class
        api (str): IOR api
        test (list): IOR test sequence
        flags (str): IOR flags
        results (queue): queue for returning thread results

    Returns:
        None

    """
    processes = self.params.get("slots", "/run/ior/clientslots/*")
    container_info = {}

    if MpioUtils().mpich_installed(self.hostlist_clients) is False:
        self.fail(
            "Exiting Test : Mpich not installed on : {}".format(
                self.hostfile_clients[0]))
    self.pool = pool

    # Configure the IOR command for this object class, api and sizes
    command = IorCommand()
    command.get_params(self)
    command.set_daos_params(self.server_group, self.pool)
    command.daos_oclass.update(oclass)
    command.api.update(api)
    command.transfer_size.update(test[2])
    command.block_size.update(test[3])
    command.flags.update(flags)

    # A new container uuid is recorded for this combination
    key = "{}{}{}".format(oclass, api, test[2])
    container_info[key] = str(uuid.uuid4())

    # Define the job manager for the IOR command
    job_manager = Mpirun(command, mpitype="mpich")
    job_manager.job.daos_cont.update(container_info[key])
    env = command.get_default_env(str(job_manager))
    job_manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    job_manager.assign_processes(processes)
    job_manager.assign_environment(env, True)

    # Run the IOR command
    try:
        job_manager.run()
    except CommandFailure:
        results.put("FAIL")
def start_ior_thread(self, create_cont, operation):
    """Run the local IOR command and record failures in self.result.

    Args:
        create_cont (Bool): To create the new container or not.
        operation (str):
            Write/WriteRead: It will Write or Write/Read based on the IOR
                             parameter in the yaml file.
            Auto_Write/Auto_Read: It will calculate the IOR block size
                                  based on the requested storage % to be
                                  filled.
    """
    # Start from the default flags (Write/Read per the test yaml)
    self.ior_local_cmd.flags.value = self.ior_default_flags

    # 'Auto' operations size the block from the server % to fill up
    if 'Auto' in operation:
        self.ior_local_cmd.block_size.update(
            '{}'.format(self.calculate_ior_block_size()))

    # Read operations never create a container and use the read only
    # flags from the yaml file
    is_read = 'Auto_Read' in operation or operation == "Read"
    if is_read:
        create_cont = False
        self.ior_local_cmd.flags.value = self.ior_read_flags

    self.ior_local_cmd.set_daos_params(self.server_group, self.pool)
    self.ior_local_cmd.test_file.update('/testfile')

    # Create a new container, or use the existing container for reading
    if create_cont:
        self.create_container()
    self.ior_local_cmd.dfs_cont.update(self.nvme_local_cont.uuid)

    # Define the job manager for the IOR command
    manager = get_job_manager(
        self, "Mpirun", self.ior_local_cmd, mpi_type="mpich")
    env = self.ior_local_cmd.get_default_env(str(manager))
    manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    manager.assign_environment(env, True)
    manager.assign_processes(
        self.params.get("np", '/run/ior/client_processes/*'))

    # Run the IOR command; one failure entry is recorded per warning line
    try:
        output = manager.run()
        self.ior_matrix = IorCommand.get_ior_metrics(output)
        warning_lines = (
            line for line in output.stdout_text.splitlines()
            if 'WARNING' in line)
        for _ in warning_lines:
            if self.fail_on_warning:
                self.result.append("FAIL-IOR command issued warnings.")
    except (CommandFailure, TestFail) as error:
        self.result.append("FAIL - {}".format(error))
def log_metrics(self, without_intercept, with_intercept):
    """Log the ior metrics of both runs with a label for each job.

    The stdout from ior can be mixed because of multithreading, so the
    stored metrics are logged instead.

    Args:
        without_intercept (dict): IOR Metrics without using
            interception library.
        with_intercept (dict): IOR Metrics using interception library.
    """
    labelled_metrics = (
        ("3 clients - without interception library", without_intercept, 1),
        ("3 clients - with interception library", with_intercept, 1),
        ("1 client - without interception library", without_intercept, 2),
        ("1 clients - without interception library", with_intercept, 2),
    )
    for label, metrics, job_num in labelled_metrics:
        IorCommand.log_metrics(self.log, label, metrics[job_num])
def run_ior_report_error(self, results, job_num, file_name, pool,
                         container, namespace):
    """Run an IOR command and store its outcome in the results dictionary.

    A new IorCommand object is created instead of using the one in
    IorTestBase because the test runs multiple IOR processes at the same
    time.

    On success results[job_num] holds True, the ior metrics and the
    command's stderr text; on failure it holds False and an error message.

    Args:
        results (dict): A dictionary object to store the ior metrics
        job_num (int): Assigned job number
        file_name (str): File name used for the IOR test file.
        pool (TestPool): Pool to run IOR.
        container (TestContainer): Container to run IOR.
        namespace (str): namespace given to the IorCommand for reading
            its parameters.
    """
    # The namespace selects the parameters for the test case.
    command = IorCommand(namespace=namespace)
    command.get_params(self)

    # Standard IOR prep sequence.
    command.set_daos_params(self.server_group, pool, container.uuid)
    command.test_file.update(os.path.join("/", file_name))

    job_manager = get_job_manager(
        test=self, class_name="Mpirun", job=command,
        subprocess=self.subprocess, mpi_type="mpich")
    job_manager.assign_hosts(
        self.hostlist_clients, self.workdir, self.hostfile_clients_slots)
    tasks_per_node = self.params.get("ppn", '/run/ior/client_processes/*')
    job_manager.ppn.update(tasks_per_node, 'mpirun.ppn')
    job_manager.processes.update(None, 'mpirun.np')

    # Run the command.
    try:
        self.log.info("--- IOR command %d start ---", job_num)
        ior_output = job_manager.run()
        entry = results[job_num] = [True]
        # The metrics are kept for debugging.
        entry.extend(IorCommand.get_ior_metrics(ior_output))
        # Command worked, but append the error message if any.
        entry.append(ior_output.stderr_text)
        self.log.info("--- IOR command %d end ---", job_num)
    except CommandFailure as error:
        self.log.info("--- IOR command %d failed ---", job_num)
        results[job_num] = [False, "IOR failed: {}".format(error)]
def ior_bg_thread(self, results):
    """Run IOR in the background.

    A small data set is written once and then read back in a loop until
    a read fails or the main program exits.

    Args:
        results (queue): queue for returning thread results
    """
    if MpioUtils().mpich_installed(self.hostlist_clients) is False:
        self.fail("Exiting Test: Mpich not installed")

    # Define the IOR Command and use the parameter from yaml file.
    bg_cmd = IorCommand()
    bg_cmd.get_params(self)
    bg_cmd.set_daos_params(self.server_group, self.pool)
    bg_cmd.dfs_oclass.update(self.ior_cmd.dfs_oclass.value)
    bg_cmd.api.update(self.ior_cmd.api.value)
    bg_cmd.transfer_size.update(self.ior_scm_xfersize)
    bg_cmd.block_size.update(self.ior_cmd.block_size.value)
    bg_cmd.flags.update(self.ior_cmd.flags.value)
    bg_cmd.test_file.update('/testfile_background')

    # Define the job manager for the IOR command
    self.job_manager = Mpirun(bg_cmd, mpitype="mpich")
    self.create_cont()
    self.job_manager.job.dfs_cont.update(self.container.uuid)
    env = bg_cmd.get_default_env(str(self.job_manager))
    self.job_manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    self.job_manager.assign_processes(1)
    self.job_manager.assign_environment(env, True)
    print('----Run IOR in Background-------')

    # Write the data set once
    try:
        self.job_manager.run()
    except (CommandFailure, TestFail):
        results.put("FAIL")
        return

    # Read the data set repeatedly until a read fails
    bg_cmd.flags.update(self.ior_read_flags)
    reading = True
    while reading:
        try:
            self.job_manager.run()
        except (CommandFailure, TestFail):
            results.put("FAIL")
            reading = False
def ior_thread(self, pool, oclass, api, test, flags, results):
    """Run one IOR command with mpirun and queue "FAIL" if it fails.

    Args:
        pool (TestPool): Pool to run IOR command on.
        oclass (str): IOR object class
        api (str): IOR API
        test (list): IOR test sequence
        flags (str): IOR flags
        results (queue): queue for returning thread results

    Returns:
        None

    """
    processes = self.params.get("slots", "/run/ior/clientslots/*")
    container_info = {}

    # Configure the IOR command for this object class, api and sizes
    command = IorCommand()
    command.get_params(self)
    command.set_daos_params(self.server_group, pool)
    command.dfs_oclass.update(oclass)
    command.api.update(api)
    command.transfer_size.update(test[2])
    command.block_size.update(test[3])
    command.flags.update(flags)

    # A new container uuid is recorded for this combination
    key = "{}{}{}".format(oclass, api, test[2])
    container_info[key] = str(uuid.uuid4())

    # Define the job manager for the IOR command
    manager = get_job_manager(self, "Mpirun", command, mpi_type="mpich")
    manager.job.dfs_cont.update(container_info[key])
    env = command.get_default_env(str(manager))
    manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    manager.assign_processes(processes)
    manager.assign_environment(env, True)

    # Run the IOR command
    try:
        manager.run()
    except CommandFailure:
        results.put("FAIL")
def ior_bg_thread(self):
    """Run IOR in the background.

    A small data set is written once and then read back in a loop until
    a read fails or the main program exits. A failed write is recorded
    in self.test_result.
    """
    # Define the IOR Command and use the parameter from yaml file.
    bg_cmd = IorCommand()
    bg_cmd.get_params(self)
    bg_cmd.set_daos_params(self.server_group, self.pool)
    bg_cmd.dfs_oclass.update(self.ior_cmd.dfs_oclass.value)
    bg_cmd.api.update(self.ior_cmd.api.value)
    bg_cmd.transfer_size.update(self.ior_scm_xfersize)
    bg_cmd.block_size.update(self.ior_cmd.block_size.value)
    bg_cmd.flags.update(self.ior_cmd.flags.value)
    bg_cmd.test_file.update('/testfile_background')

    # Define the job manager for the IOR command
    manager = get_job_manager(self, "Mpirun", bg_cmd, mpi_type="mpich")

    # The background IOR gets its own container
    bg_container = self.get_container(self.pool)
    manager.job.dfs_cont.update(bg_container.uuid)
    env = bg_cmd.get_default_env(str(manager))
    manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    manager.assign_processes(1)
    manager.assign_environment(env, True)
    print('----Run IOR in Background-------')

    # Write the data set once
    try:
        manager.run()
    except (CommandFailure, TestFail):
        self.test_result.append("FAIL")
        return

    # Read the data set repeatedly until a read fails
    bg_cmd.flags.update(self.ior_read_flags)
    reading = True
    while reading:
        try:
            manager.run()
        except (CommandFailure, TestFail):
            reading = False
def ior_thread(self, pool, oclass, api, test, flags, results):
    """Run one IOR command through the mpirun job manager.

    When the flags include "-w" a new container uuid is recorded in
    self.container_info; the uuid recorded for the object class, api and
    transfer size is then used for the run.

    Args:
        pool (object): pool handle
        oclass (str): IOR object class
        api (str): IOR API
        test (list): IOR test sequence
        flags (str): IOR flags
        results (queue): queue for returning thread results
    """
    processes = self.params.get("slots", "/run/ior/clientslots/*")

    if MpioUtils().mpich_installed(self.hostlist_clients) is False:
        self.fail("Exiting Test: Mpich not installed")
    self.pool = pool

    # Configure the IOR command for this object class, api and sizes
    command = IorCommand()
    command.get_params(self)
    command.set_daos_params(self.server_group, self.pool)
    command.dfs_oclass.update(oclass)
    command.api.update(api)
    command.transfer_size.update(test[0])
    command.block_size.update(test[1])
    command.flags.update(flags)

    if "-w" in flags:
        write_key = "{}{}{}".format(oclass, api, test[0])
        self.container_info[write_key] = str(uuid.uuid4())

    # Define the job manager for the IOR command
    job_manager = Mpirun(command, mpitype="mpich")
    lookup_key = "".join([oclass, api, str(test[0])])
    job_manager.job.dfs_cont.update(self.container_info[lookup_key])
    env = command.get_default_env(str(job_manager))
    job_manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    job_manager.assign_processes(processes)
    job_manager.assign_environment(env, True)

    # Run the IOR command
    try:
        job_manager.run()
    except CommandFailure:
        results.put("FAIL")
def run_multiple_ior(self, hostfile, num_clients, results, job_num,
                     intercept=None):
    # pylint: disable=too-many-arguments
    """Run one IOR job and store its metrics under its job number.

    The shared self.ior_cmd and the job manager are configured while
    holding self.lock; the lock is taken again only to store the metrics.
    Both critical sections use the lock as a context manager so that an
    exception raised inside them cannot leave the lock held and block the
    other IOR threads.

    Args:
        hostfile: hostfile handed to the job manager's setup_command()
        num_clients (int): number of clients; multiplies the per-host
            process count (self.processes // len(self.hostlist_clients))
        results: container indexed by job_num in which the value returned
            by IorCommand.get_ior_metrics() is stored
        job_num: job identifier; used as the results key and appended to
            the test file name
        intercept (str, optional): path to interception library.  When
            set it is exported as LD_PRELOAD and "intercept" is appended
            to the test file name. Defaults to None.
    """
    with self.lock:
        tsize = self.ior_cmd.transfer_size.value
        testfile = os.path.join(self.dfuse.mount_dir.value,
                                "testfile{}{}".format(tsize, job_num))
        if intercept:
            testfile += "intercept"
        self.ior_cmd.test_file.update(testfile)
        manager = self.get_ior_job_manager_command()
        procs = (self.processes // len(self.hostlist_clients)) * num_clients
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.setup_command(env, hostfile, procs)

    try:
        self.pool.display_pool_daos_space()
        out = manager.run()
        with self.lock:
            results[job_num] = IorCommand.get_ior_metrics(out)
    except CommandFailure as error:
        self.log.error("IOR Failed: %s", str(error))
        self.fail("Test was expected to pass but it failed.\n")
    finally:
        # Always report the pool space, whether the run passed or failed
        self.pool.display_pool_daos_space()
def run_multiple_ior(self, clients, results, job_num, intercept=None):
    """Run the IOR command on the given clients and record its metrics.

    The shared self.ior_cmd and the job manager are configured while
    holding self.lock; the lock is taken again only to store the metrics.
    Both critical sections use the lock as a context manager so that an
    exception raised inside them cannot leave the lock held and block the
    other IOR threads.

    Args:
        clients (list): hosts on which to run ior
        results (dict): A dictionary object to store the ior metrics
        job_num (int): Assigned job number
        intercept (str, optional): path to interception library.  When
            set it is exported as LD_PRELOAD and "intercept" is appended
            to the test file name. Defaults to None.
    """
    with self.lock:
        tsize = self.ior_cmd.transfer_size.value
        testfile = os.path.join(self.dfuse.mount_dir.value,
                                "testfile{}{}".format(tsize, job_num))
        if intercept:
            testfile += "intercept"
        self.ior_cmd.test_file.update(testfile)
        manager = self.get_ior_job_manager_command()
        # Scale the per-host process count by the number of hosts used
        procs = (self.processes // len(self.hostlist_clients)) * len(clients)
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.assign_hosts(
            clients, self.workdir, self.hostfile_clients_slots)
        manager.assign_processes(procs)
        manager.assign_environment(env)

    try:
        self.pool.display_pool_daos_space()
        out = manager.run()
        with self.lock:
            results[job_num] = IorCommand.get_ior_metrics(out)
    except CommandFailure as error:
        self.log.error("IOR Failed: %s", str(error))
        self.fail("Test was expected to pass but it failed.\n")
    finally:
        # Always report the pool space, whether the run passed or failed
        self.pool.display_pool_daos_space()
def test_metadata_server_restart(self):
    """JIRA ID: DAOS-1512.

    Test Description:
        Check that IOR data written to many containers survives a restart
        of the agents and servers.  Five IOR threads are started, each
        given 400 container uuids (2000 in total) in the same pool.  After
        the write pass the agents and servers are stopped and started
        again, and a second pass of five threads reads the same
        containers back using the flags from the yaml
        "/run/ior/iorreadflags/" entry (the original test description
        cites IOR option "-R -G 1" for data validation).

    Use Cases:
        ?

    :avocado: tags=metadata,metadata_ior,nvme,large
    """
    files_per_thread = 400
    total_ior_threads = 5
    self.out_queue = queue.Queue()

    processes = self.params.get("slots", "/run/ior/clientslots/*")

    # One list of container uuids per IOR thread; the same lists are used
    # for the write pass and for the read pass.
    list_of_uuid_lists = []
    for _ in range(total_ior_threads):
        list_of_uuid_lists.append(
            [str(uuid.uuid4()) for _ in range(files_per_thread)])

    # Write the data, restart the agents and servers, then read it back
    for operation in ("write", "read"):
        flags_path = "/run/ior/ior{}flags/".format(operation)

        # Build one IOR thread per uuid list
        threads = []
        for index, thread_uuids in enumerate(list_of_uuid_lists):
            # IOR command for this thread
            ior_cmd = IorCommand()
            ior_cmd.get_params(self)
            ior_cmd.set_daos_params(self.server_group, self.pool)
            ior_cmd.flags.value = self.params.get("F", flags_path)

            # Job manager that will launch the IOR command
            manager = Orterun(ior_cmd)
            env = ior_cmd.get_default_env(str(manager))
            manager.assign_hosts(self.hostlist_clients, self.workdir, None)
            manager.assign_processes(processes)
            manager.assign_environment(env)

            thread_kwargs = {
                "manager": manager,
                "uuids": thread_uuids,
                "results": self.out_queue,
            }
            threads.append(
                threading.Thread(
                    target=ior_runner_thread, kwargs=thread_kwargs))

            self.log.info(
                "Creatied %s thread %s with container uuids %s",
                operation, index, thread_uuids)

        # Run the threads; any thread failure fails the test
        if self.thread_control(threads, operation) == "FAIL":
            failure = "IOR {} Thread FAIL".format(operation)
            self.d_log.error(failure)
            self.fail(failure)

        # Only the write pass is followed by a restart
        if operation != "write":
            continue

        # Stop the agents
        errors = self.stop_agents()
        self.assertEqual(
            len(errors), 0,
            "Error stopping agents:\n {}".format("\n ".join(errors)))

        # Stop the servers
        errors = self.stop_servers()
        self.assertEqual(
            len(errors), 0,
            "Error stopping servers:\n {}".format("\n ".join(errors)))

        # Bring the agents back up, then the servers
        self.start_agent_managers()
        self.start_server_managers()
def test_metadata_server_restart(self):
    """JIRA ID: DAOS-1512.

    Test Description:
        Check that IOR data written to many containers survives a restart
        of the agents and servers.  Five IOR threads are started, each
        given 400 container uuids (2000 in total) in the same pool.  After
        the write pass the agents and servers are stopped and started
        again, and a second pass of five threads reads the same
        containers back using the flags from the yaml
        "/run/ior/iorreadflags/" entry (the original test description
        cites IOR option "-R -G 1" for data validation).

    Use Cases:
        ?

    :avocado: tags=metadata,metadata_ior,nvme,small
    """
    files_per_thread = 400
    total_ior_threads = 5
    self.out_queue = Queue.Queue()

    processes = self.params.get("slots", "/run/ior/clientslots/*")

    # One list of container uuids per IOR thread; the same lists are used
    # for the write pass and for the read pass.
    list_of_uuid_lists = []
    for _ in range(total_ior_threads):
        list_of_uuid_lists.append(
            [str(uuid.uuid4()) for _ in range(files_per_thread)])

    # Write the data, restart the agents and servers, then read it back
    for operation in ("write", "read"):
        flags_path = "/run/ior/ior{}flags/".format(operation)

        # Build one IOR thread per uuid list
        threads = []
        for index, thread_uuids in enumerate(list_of_uuid_lists):
            # IOR command for this thread
            ior_cmd = IorCommand()
            ior_cmd.get_params(self)
            ior_cmd.set_daos_params(self.server_group, self.pool)
            ior_cmd.flags.value = self.params.get("F", flags_path)

            # Everything the runner needs to launch the command itself
            thread_kwargs = {
                "ior_cmd": ior_cmd,
                "uuids": thread_uuids,
                "mgr": self.orterun,
                "attach": self.tmp,
                "hostfile": self.hostfile_clients,
                "procs": processes,
                "results": self.out_queue,
            }
            threads.append(
                threading.Thread(
                    target=ior_runner_thread, kwargs=thread_kwargs))

            self.log.info(
                "Creatied %s thread %s with container uuids %s",
                operation, index, thread_uuids)

        # Run the threads; any thread failure fails the test
        if self.thread_control(threads, operation) == "FAIL":
            failure = "IOR {} Thread FAIL".format(operation)
            self.d_log.error(failure)
            self.fail(failure)

        # Only the write pass is followed by a restart
        if operation != "write":
            continue

        # Stop the agents (when any are running) and the servers
        if self.agent_sessions:
            stop_agent(self.agent_sessions, self.hostlist_clients)
        stop_server(hosts=self.hostlist_servers)

        # Start the agents
        self.agent_sessions = run_agent(
            self.basepath, self.hostlist_clients, self.hostlist_servers)

        # Start the servers
        run_server(
            self.hostfile_servers, self.server_group, self.basepath,
            clean=False)