def test_daos_vol_mpich(self):
    """Jira ID: DAOS-3656.

    Test Description:
        Run HDF5 testphdf5 and t_shapesame provided in HDF5 package with
        daos vol connector. Testing various I/O functions provided in HDF5
        test suite such as:
            h5_test_testhdf5
            h5vl_test
            h5_partest_t_bigio
            h5_partest_testphdf5
            h5vl_test_parallel
            h5_partest_t_shapesame
            h5daos_test_map
            h5daos_test_map_parallel
            h5daos_test_oclass
            h5daos_test_metadata_parallel

    :avocado: tags=all,pr,daily_regression
    :avocado: tags=hw,small
    :avocado: tags=hdf5,vol,volunit,volmpich
    :avocado: tags=DAOS_5610
    """
    manager = get_job_manager(self, mpi_type="mpich")
    self.run_test(manager, "/usr/lib64/mpich/lib", "/usr/lib64/hdf5_vol_daos/mpich/tests")

def run_ior_collect_error(self, results, job_num, file_name, clients):
    """Run IOR command and store error in results.

    Args:
        results (dict): A dictionary object to store the ior metrics.
        job_num (int): Assigned job number.
        file_name (str): File name used for self.ior_cmd.test_file.
        clients (list): Client hostnames to run IOR from.
    """
    ior_cmd = IorCommand()
    ior_cmd.get_params(self)
    ior_cmd.set_daos_params(
        group=self.server_group, pool=self.pool, cont_uuid=self.container.uuid)
    testfile = os.path.join("/", file_name)
    ior_cmd.test_file.update(testfile)

    manager = get_job_manager(
        test=self, class_name="Mpirun", job=ior_cmd, subprocess=self.subprocess,
        mpi_type="mpich")
    manager.assign_hosts(clients, self.workdir, self.hostfile_clients_slots)
    ppn = self.params.get("ppn", '/run/ior/client_processes/*')
    manager.ppn.update(ppn, 'mpirun.ppn')
    manager.processes.update(None, 'mpirun.np')

    try:
        ior_output = manager.run()
        results[job_num] = [True]
        # For debugging.
        results[job_num].extend(IorCommand.get_ior_metrics(ior_output))
        # We'll verify the error message.
        results[job_num].append(ior_output.stderr_text)
    except CommandFailure as error:
        results[job_num] = [False, "IOR failed: {}".format(error)]

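# Illustrative sketch (not part of the original test): one way
# run_ior_collect_error() could be driven from multiple threads so several IOR
# jobs run in parallel and report into a shared results dict. The job count,
# file names, and helper name below are hypothetical, and "import threading" is
# assumed at the top of the module.
def _example_run_parallel_ior_jobs(self, clients):
    """Sketch: run a few run_ior_collect_error() jobs concurrently."""
    results = {}
    threads = []
    for job_num in range(3):
        threads.append(
            threading.Thread(
                target=self.run_ior_collect_error,
                args=(results, job_num, "example_file_{}".format(job_num), clients)))
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    # Each results[job_num] starts with True on success or False on failure.
    return results
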
def test_self_test(self):
    """Run a few CaRT self-test scenarios.

    :avocado: tags=all,pr,daily_regression,smoke,unittest,tiny,cartselftest
    """
    # Setup the orterun command
    orterun = get_job_manager(self, "Orterun", self.SelfTest(self.bin), mpi_type="openmpi")
    orterun.map_by.update(None, "orterun/map_by")
    orterun.enable_recovery.update(False, "orterun/enable_recovery")

    # Get the self_test command line parameters
    orterun.job.get_params(self)
    orterun.job.group_name.update(self.server_group, "group_name")

    # Setup the environment variables for the self_test orterun command
    orterun.assign_environment(self.cart_env)

    # Run the test
    try:
        orterun.run()
    except CommandFailure as error:
        self.test_log.info("CaRT self_test returned non-zero: %s", str(error))
        self.fail("CaRT self_test returned non-zero")

def get_ior_job_manager_command(self):
    """Get the MPI job manager command for IOR.

    Returns:
        JobManager: the mpi job manager object

    """
    return get_job_manager(self, "Mpirun", self.ior_cmd, self.subprocess, "mpich")

def get_mdtest_job_manager_command(self, mpi_type):
    """Get the MPI job manager command for Mdtest.

    Args:
        mpi_type (str): MPI implementation to use, e.g. "MPICH".

    Returns:
        JobManager: the object for the mpi job manager command

    """
    # pylint: disable=redefined-variable-type
    # Initialize MpioUtils if mdtest needs to be run using mpich
    if mpi_type == "MPICH":
        manager = get_job_manager(
            self, "Mpirun", self.mdtest_cmd, self.subprocess, mpi_type="mpich")
    else:
        manager = get_job_manager(self, "Orterun", self.mdtest_cmd, self.subprocess)
    return manager

def run_custom_ior_cmd(self, ior_command, clients, results, job_num, intercept=None):
    """Run a customized IOR command, not self.ior_cmd.

    Expected to be used with threaded code where multiple IOR commands are
    executed in parallel.

    Display pool space before running it for a reference.

    Args:
        ior_command (IorCommand): Custom IOR command instance.
        clients (list): hosts on which to run ior.
        results (dict): A dictionary object to store the ior metrics.
        job_num (int): Assigned job number.
        intercept (str, optional): path to interception library. Defaults to None.
    """
    self.log.info("--- IOR Thread %d: Start ---", job_num)
    tsize = ior_command.transfer_size.value
    testfile = os.path.join(
        self.dfuse.mount_dir.value, "testfile{}{}".format(tsize, job_num))
    if intercept:
        testfile += "intercept"
    ior_command.test_file.update(testfile)

    # Get the custom job manager that's associated with this thread.
    manager = get_job_manager(self, "Mpirun", ior_command, self.subprocess, "mpich")

    procs = (self.processes // len(self.hostlist_clients)) * len(clients)
    env = ior_command.get_default_env(str(manager), self.client_log)
    if intercept:
        env["LD_PRELOAD"] = intercept
    manager.assign_hosts(clients, self.workdir, self.hostfile_clients_slots)
    manager.assign_processes(procs)
    manager.assign_environment(env)

    self.log.info("--- IOR Thread %d: Starting IOR ---", job_num)
    self.display_pool_space()
    try:
        ior_output = manager.run()
        results[job_num] = [True]
        results[job_num].extend(IorCommand.get_ior_metrics(ior_output))
    except CommandFailure as error:
        results[job_num] = [False, "IOR failed: {}".format(error)]
    finally:
        self.display_pool_space()
    self.log.info("--- IOR Thread %d: End ---", job_num)

def start_ior_thread(self, create_cont, operation):
    """Start IOR write/read threads and wait until all threads are finished.

    Args:
        create_cont (bool): Whether to create a new container.
        operation (str): Write/WriteRead: Write or Write/Read based on the IOR
            parameters in the yaml file.
            Auto_Write/Auto_Read: Calculate the IOR block size based on the
            requested percentage of storage to fill.
    """
    # The IOR flags can Write/Read based on the test yaml
    self.ior_local_cmd.flags.value = self.ior_default_flags

    # Calculate the block size based on the percentage of server storage to fill.
    if 'Auto' in operation:
        block_size = self.calculate_ior_block_size()
        self.ior_local_cmd.block_size.update('{}'.format(block_size))

    # For the IOR read operation, update the read-only flags from the yaml file.
    if 'Auto_Read' in operation or operation == "Read":
        create_cont = False
        self.ior_local_cmd.flags.value = self.ior_read_flags

    self.ior_local_cmd.set_daos_params(self.server_group, self.pool)
    self.ior_local_cmd.test_file.update('/testfile')

    # Create a new container or use the existing container for reading
    if create_cont:
        self.create_container()
    self.ior_local_cmd.dfs_cont.update(self.nvme_local_cont.uuid)

    # Define the job manager for the IOR command
    job_manager_main = get_job_manager(self, "Mpirun", self.ior_local_cmd, mpi_type="mpich")
    env = self.ior_local_cmd.get_default_env(str(job_manager_main))
    job_manager_main.assign_hosts(self.hostlist_clients, self.workdir, None)
    job_manager_main.assign_environment(env, True)
    job_manager_main.assign_processes(self.params.get("np", '/run/ior/client_processes/*'))

    # Run the IOR command
    try:
        output = job_manager_main.run()
        self.ior_matrix = IorCommand.get_ior_metrics(output)

        for line in output.stdout_text.splitlines():
            if 'WARNING' in line and self.fail_on_warning:
                self.result.append("FAIL-IOR command issued warnings.")
    except (CommandFailure, TestFail) as error:
        self.result.append("FAIL - {}".format(error))

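# Illustrative sketch (not part of the original test): the write-then-read call
# pattern start_ior_thread() supports, using the "Auto_Write"/"Auto_Read"
# operations described in its docstring. The wrapper method name is hypothetical.
def _example_write_then_read(self):
    """Sketch: fill storage with Auto_Write, then read it back with Auto_Read."""
    # Write: create a new container and size the IOR block from the requested
    # storage fill percentage.
    self.start_ior_thread(create_cont=True, operation='Auto_Write')
    # Read: reuse the container written above (start_ior_thread forces
    # create_cont to False for read operations).
    self.start_ior_thread(create_cont=False, operation='Auto_Read')
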
def run_ior_report_error(self, results, job_num, file_name, pool, container, namespace):
    """Run IOR command and store the results in the results dictionary.

    Create a new IorCommand object instead of using the one in IorTestBase
    because this test runs multiple IOR processes at the same time.

    Args:
        results (dict): A dictionary object to store the ior metrics.
        job_num (int): Assigned job number.
        file_name (str): File name used for self.ior_cmd.test_file.
        pool (TestPool): Pool to run IOR.
        container (TestContainer): Container to run IOR.
        namespace (str): Namespace used to obtain the IOR parameters from the
            test yaml for this test case.
    """
    # Update the object class depending on the test case.
    ior_cmd = IorCommand(namespace=namespace)
    ior_cmd.get_params(self)

    # Standard IOR prep sequence.
    ior_cmd.set_daos_params(self.server_group, pool, container.uuid)
    testfile = os.path.join("/", file_name)
    ior_cmd.test_file.update(testfile)

    manager = get_job_manager(
        test=self, class_name="Mpirun", job=ior_cmd, subprocess=self.subprocess,
        mpi_type="mpich")
    manager.assign_hosts(self.hostlist_clients, self.workdir, self.hostfile_clients_slots)
    ppn = self.params.get("ppn", '/run/ior/client_processes/*')
    manager.ppn.update(ppn, 'mpirun.ppn')
    manager.processes.update(None, 'mpirun.np')

    # Run the command.
    try:
        self.log.info("--- IOR command %d start ---", job_num)
        ior_output = manager.run()
        results[job_num] = [True]
        # For debugging.
        results[job_num].extend(IorCommand.get_ior_metrics(ior_output))
        # The command worked, but append the error message if any.
        results[job_num].append(ior_output.stderr_text)
        self.log.info("--- IOR command %d end ---", job_num)
    except CommandFailure as error:
        self.log.info("--- IOR command %d failed ---", job_num)
        results[job_num] = [False, "IOR failed: {}".format(error)]

def ior_thread(self, pool, oclass, api, test, flags, results):
    """Start threads and wait until all threads are finished.

    Args:
        pool (TestPool): Pool to run IOR command on.
        oclass (str): IOR object class.
        api (str): IOR API.
        test (list): IOR test sequence.
        flags (str): IOR flags.
        results (queue): queue for returning thread results.

    Returns:
        None

    """
    processes = self.params.get("slots", "/run/ior/clientslots/*")
    container_info = {}

    # Define the arguments for the ior_runner_thread method
    ior_cmd = IorCommand()
    ior_cmd.get_params(self)
    ior_cmd.set_daos_params(self.server_group, pool)
    ior_cmd.dfs_oclass.update(oclass)
    ior_cmd.api.update(api)
    ior_cmd.transfer_size.update(test[2])
    ior_cmd.block_size.update(test[3])
    ior_cmd.flags.update(flags)
    container_info["{}{}{}".format(oclass, api, test[2])] = str(uuid.uuid4())

    # Define the job manager for the IOR command
    job_manager = get_job_manager(self, "Mpirun", ior_cmd, mpi_type="mpich")
    key = "{}{}{}".format(oclass, api, test[2])
    job_manager.job.dfs_cont.update(container_info[key])
    env = ior_cmd.get_default_env(str(job_manager))
    job_manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    job_manager.assign_processes(processes)
    job_manager.assign_environment(env, True)

    # Run the IOR command
    try:
        job_manager.run()
    except CommandFailure as _error:
        results.put("FAIL")

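# Illustrative sketch (not part of the original test): one way ior_thread()
# could be launched in parallel and its result queue checked afterwards. The
# object classes, API, and transfer/block sizes below are hypothetical, and
# "import threading" and "import queue" are assumed at the top of the module.
def _example_launch_ior_threads(self, pool):
    """Sketch: run two ior_thread() calls in parallel and check the queue."""
    results = queue.Queue()
    threads = [
        threading.Thread(
            target=self.ior_thread,
            kwargs={"pool": pool, "oclass": oclass, "api": "DFS",
                    "test": [None, None, "1M", "32M"], "flags": "-w -W",
                    "results": results})
        for oclass in ("SX", "RP_2GX")]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    # ior_thread() puts "FAIL" on the queue when an IOR command fails.
    if not results.empty():
        self.fail("One or more IOR threads reported FAIL")
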
def ior_bg_thread(self):
    """Start an IOR background thread.

    This will write a small data set and keep reading it in a loop until it
    fails or the main program exits.
    """
    # Define the IOR command and use the parameters from the yaml file.
    ior_bg_cmd = IorCommand()
    ior_bg_cmd.get_params(self)
    ior_bg_cmd.set_daos_params(self.server_group, self.pool)
    ior_bg_cmd.dfs_oclass.update(self.ior_cmd.dfs_oclass.value)
    ior_bg_cmd.api.update(self.ior_cmd.api.value)
    ior_bg_cmd.transfer_size.update(self.ior_scm_xfersize)
    ior_bg_cmd.block_size.update(self.ior_cmd.block_size.value)
    ior_bg_cmd.flags.update(self.ior_cmd.flags.value)
    ior_bg_cmd.test_file.update('/testfile_background')

    # Define the job manager for the IOR command
    job_manager = get_job_manager(self, "Mpirun", ior_bg_cmd, mpi_type="mpich")

    # Create a container
    container = self.get_container(self.pool)
    job_manager.job.dfs_cont.update(container.uuid)

    env = ior_bg_cmd.get_default_env(str(job_manager))
    job_manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    job_manager.assign_processes(1)
    job_manager.assign_environment(env, True)
    print('----Run IOR in Background-------')

    # Run the IOR write command
    try:
        job_manager.run()
    except (CommandFailure, TestFail) as _error:
        self.test_result.append("FAIL")
        return

    # Run the IOR read command in a loop
    ior_bg_cmd.flags.update(self.ior_read_flags)
    while True:
        try:
            job_manager.run()
        except (CommandFailure, TestFail) as _error:
            break

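# Illustrative sketch (not part of the original test): because ior_bg_thread()
# keeps re-reading until a failure, it would typically be launched as a daemon
# thread so it does not block test teardown. The wrapper name is hypothetical
# and "import threading" is assumed at the top of the module.
def _example_start_background_ior(self):
    """Sketch: start the background IOR write/read loop as a daemon thread."""
    bg_thread = threading.Thread(target=self.ior_bg_thread)
    bg_thread.daemon = True
    bg_thread.start()
    return bg_thread
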
def test_daos_vol_bigio(self):
    """Jira ID: DAOS-3656.

    Test Description:
        Run HDF5 h5_partest_t_bigio provided in HDF5 package with daos vol
        connector and mpich. Testing various I/O functions provided in HDF5
        test suite such as:
            h5_partest_t_bigio

    :avocado: tags=all,full_regression
    :avocado: tags=hw,small
    :avocado: tags=hdf5,vol,volbigio
    :avocado: tags=DAOS_5610
    """
    manager = get_job_manager(self, mpi_type="mpich")
    self.run_test(manager, "/usr/lib64/mpich/lib", "/usr/lib64/hdf5_vol_daos/mpich/tests")

def test_metadata_server_restart(self):
    """JIRA ID: DAOS-1512.

    Test Description:
        This test will verify 2000 IOR small size containers after server
        restart. The test will write IOR in 5 different threads for faster
        execution time. Each thread will create 400 (8 byte) containers in
        the same pool. Restart the servers, read the IOR container files
        written previously, and validate data integrity by using the IOR
        option "-R -G 1".

    Use Cases:
        ?

    :avocado: tags=all,full_regression
    :avocado: tags=hw,large
    :avocado: tags=server,metadata,metadata_ior,nvme
    """
    self.create_pool()
    files_per_thread = 400
    total_ior_threads = 5
    processes = self.params.get("slots", "/run/ior/clientslots/*")

    list_of_uuid_lists = [
        [str(uuid.uuid4()) for _ in range(files_per_thread)]
        for _ in range(total_ior_threads)]

    # Setup the thread manager
    thread_manager = ThreadManager(run_ior_loop, self.timeout - 30)

    # Launch threads to run IOR to write data, restart the agents and
    # servers, and then run IOR to read the data
    for operation in ("write", "read"):
        # Create the IOR threads
        for index in range(total_ior_threads):
            # Define the arguments for the run_ior_loop method
            ior_cmd = IorCommand()
            ior_cmd.get_params(self)
            ior_cmd.set_daos_params(self.server_group, self.pool)
            ior_cmd.flags.value = self.params.get(
                "F", "/run/ior/ior{}flags/".format(operation))

            # Define the job manager for the IOR command
            self.ior_managers.append(
                get_job_manager(self, "Orterun", ior_cmd, mpi_type="openmpi"))
            env = ior_cmd.get_default_env(str(self.ior_managers[-1]))
            self.ior_managers[-1].assign_hosts(self.hostlist_clients, self.workdir, None)
            self.ior_managers[-1].assign_processes(processes)
            self.ior_managers[-1].assign_environment(env)
            self.ior_managers[-1].verbose = False

            # Add a thread for these IOR arguments
            thread_manager.add(
                manager=self.ior_managers[-1],
                uuids=list_of_uuid_lists[index],
                tmpdir_base=self.test_dir)
            self.log.info(
                "Created %s thread %s with container uuids %s",
                operation, index, list_of_uuid_lists[index])

        # Launch the IOR threads
        self.log.info("Launching %d IOR %s threads", thread_manager.qty, operation)
        failed_thread_count = thread_manager.check_run()
        if failed_thread_count > 0:
            msg = "{} FAILED IOR {} Thread(s)".format(failed_thread_count, operation)
            self.d_log.error(msg)
            self.fail(msg)

        # Restart the agents and servers after the write / before the read
        if operation == "write":
            # Stop the agents
            errors = self.stop_agents()
            self.assertEqual(
                len(errors), 0,
                "Error stopping agents:\n  {}".format("\n  ".join(errors)))

            # Restart the servers w/o formatting the storage
            errors = self.restart_servers()
            self.assertEqual(
                len(errors), 0,
                "Error restarting servers:\n  {}".format("\n  ".join(errors)))

            # Start the agents
            self.start_agent_managers()

    self.log.info("Test passed")

def ior_runner_thread(self, results):
    """Start threads and wait until all threads are finished.

    Destroy the container at the end of this thread run.

    Args:
        results (queue): queue for returning thread results

    Returns:
        None

    """
    processes = self.params.get("slots", "/run/ior/clientslots/*")
    container_info = {}
    cmd = DaosCommand(os.path.join(self.prefix, "bin"))
    cmd.set_sub_command("container")
    cmd.sub_command_class.set_sub_command("destroy")

    # Iterate through the different IOR values and run them in sequence
    for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                            self.ior_apis,
                                            self.ior_transfer_size,
                                            self.ior_flags):
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[0])
        ior_cmd.block_size.update(test[1])
        ior_cmd.flags.update(flags)

        # Define the job manager for the IOR command
        job_manager = get_job_manager(self, "Mpirun", ior_cmd, mpi_type="mpich")
        cont_uuid = str(uuid.uuid4())
        job_manager.job.dfs_cont.update(cont_uuid)
        env = ior_cmd.get_default_env(str(job_manager))
        job_manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        job_manager.assign_processes(processes)
        job_manager.assign_environment(env, True)

        # Run the IOR command
        try:
            job_manager.run()
            container_info["{}{}{}".format(oclass, api, test[0])] = cont_uuid
        except CommandFailure as _error:
            results.put("FAIL")

    # Destroy the containers created by this thread
    for key in container_info:
        cmd.sub_command_class.sub_command_class.pool.value = self.pool.uuid
        # cmd.sub_command_class.sub_command_class.svc.value = \
        #     self.pool.svc_ranks
        cmd.sub_command_class.sub_command_class.cont.value = container_info[key]
        try:
            # pylint: disable=protected-access
            cmd._get_result()
        except CommandFailure as _error:
            results.put("FAIL")

def run_test(self, test_repo, test_name):
    """Execute the test steps shared by the test functions below.

    Args:
        test_repo (str): absolute or relative location of the test repository.
        test_name (str): name of the test to be run.
    """
    # Select the commands to run
    if test_name not in self._test_name_class:
        self.fail("Unknown mpiio test name: {}".format(test_name))

    # Initialize test specific variables
    client_processes = self.params.get("np", '/run/client_processes/')

    # Create a pool
    self.add_pool(connect=False)

    # Create a container
    self.add_container(self.pool)

    # Pass pool and container information to the commands
    env = EnvironmentVariables()
    env["DAOS_UNS_PREFIX"] = "daos://{}/{}/".format(self.pool.uuid, self.container.uuid)
    if test_name == "llnl":
        env["MPIO_USER_PATH"] = "daos:/"

    # Create commands
    kwargs_list = [{"path": test_repo}]
    if test_name == "hdf5":
        kwargs_list[0]["command"] = "testphdf5"
        kwargs_list.append(kwargs_list[0].copy())
        kwargs_list[1]["command"] = "t_shapesame"
        env["HDF5_PARAPREFIX"] = "daos:"

    self.job_manager = []
    job_managers = []
    for kwargs in kwargs_list:
        manager = get_job_manager(self)

        # Fix up a relative test_repo specification
        if not kwargs["path"].startswith("/"):
            mpi_path = os.path.split(manager.command_path)[0]
            kwargs["path"] = os.path.join(mpi_path, kwargs["path"])
        if test_name == "romio":
            # Romio is not run via mpirun
            romio_job = self._test_name_class[test_name](**kwargs)
            romio_job.env = env
            job_managers.append(romio_job)
            self.job_manager.append(romio_job)
        else:
            # Finish the job manager setup
            job_managers.append(manager)
            job_managers[-1].job = self._test_name_class[test_name](**kwargs)
            job_managers[-1].assign_hosts(self.hostlist_clients)
            job_managers[-1].assign_processes(client_processes)
            job_managers[-1].assign_environment(env, True)

        # Add a list of bad words that, if found, should fail the command
        job_managers[-1].check_results_list = [
            "non-zero exit code", "MPI_Abort", "MPI_ABORT", "ERROR"]

    for job_manager in job_managers:
        try:
            job_manager.run()
        except CommandFailure as error:
            self.fail("<{0} Test Failed> \n{1}".format(test_name, error))

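# Illustrative sketch (not part of the original test class): how a test method
# would typically invoke run_test(). The repository path is a hypothetical
# placeholder; "romio" is one of the test names handled above.
def _example_test_romio(self):
    """Sketch: run the romio suite from a hypothetical repository path."""
    self.run_test("/path/to/romio/test/runtests", "romio")
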
def run_subtest(self):
    """Run daos_test with a subtest argument."""
    subtest = self.get_test_param("daos_test")
    num_clients = self.get_test_param("num_clients")
    if num_clients is None:
        num_clients = self.params.get("num_clients", '/run/daos_tests/*')

    scm_size = self.params.get("scm_size", '/run/pool/*')
    nvme_size = self.params.get("nvme_size", '/run/pool/*')
    args = self.get_test_param("args", "")
    stopped_ranks = self.get_test_param("stopped_ranks", [])
    pools_created = self.get_test_param("pools_created", 1)
    self.increment_timeout(POOL_TIMEOUT_INCREMENT * pools_created)
    dmg = self.get_dmg_command()
    dmg_config_file = dmg.yaml.filename

    if self.hostlist_clients:
        dmg.copy_certificates(get_log_file("daosCA/certs"), self.hostlist_clients)
        dmg.copy_configuration(self.hostlist_clients)

    cmd = " ".join(
        [
            "-x", "=".join(["D_LOG_FILE", get_log_file(self.client_log)]),
            "--map-by node", "-x", "D_LOG_MASK=DEBUG",
            "-x", "DD_MASK=mgmt,io,md,epc,rebuild",
            "-x", "COVFILE=/tmp/test.cov",
            self.daos_test,
            "-n", dmg_config_file,
            "".join(["-", subtest]),
            str(args)
        ]
    )

    job_cmd = ExecutableCommand(namespace=None, command=cmd)
    job = get_job_manager(self, "Orterun", job_cmd, mpi_type="openmpi")

    # Assign the test to run
    job.hostfile.update(self.hostfile_clients)
    job.processes.update(num_clients)
    job_str = str(job)

    env = {}
    env['CMOCKA_XML_FILE'] = os.path.join(self.outputdir, "%g_cmocka_results.xml")
    env['CMOCKA_MESSAGE_OUTPUT'] = "xml"
    env['POOL_SCM_SIZE'] = "{}".format(scm_size)
    if not nvme_size:
        nvme_size = 0
    env['POOL_NVME_SIZE'] = "{}".format(nvme_size)

    # Update the expected status for each rank that will be stopped by this
    # test to avoid a false failure during tearDown().
    if "random" in stopped_ranks:
        # Set each expected rank state to be either stopped or running
        for manager in self.server_managers:
            manager.update_expected_states(
                None, ["Joined", "Stopped", "Excluded"])
    else:
        # Set the specific expected rank state to stopped
        for rank in stopped_ranks:
            for manager in self.server_managers:
                manager.update_expected_states(
                    rank, ["Stopped", "Excluded"])

    try:
        process.run(job_str, env=env)
    except process.CmdError as result:
        if result.result.exit_status != 0:
            # Fake a JUnit failure output
            self.create_results_xml(
                self.subtest_name, result,
                "Failed to run {}.".format(self.daos_test))
            self.fail(
                "{0} failed with return code={1}.\n".format(
                    job_str, result.result.exit_status))

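# Illustrative note (not part of the original test): "cmd" above is a string of
# orterun arguments followed by the daos_test executable and its options, so the
# launched command looks roughly like the following (values hypothetical):
#   orterun ... -x D_LOG_FILE=<client log> --map-by node -x D_LOG_MASK=DEBUG \
#       -x DD_MASK=mgmt,io,md,epc,rebuild -x COVFILE=/tmp/test.cov \
#       <daos_test binary> -n <dmg config file> -<subtest> <args>
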
def test_ior_intercept_verify_data(self):
    """Jira ID: DAOS-3502.

    Test Description:
        Purpose of this test is to run ior through dfuse with the interception
        library on 5 clients and without the interception library on 1 client
        for at least 30 minutes and verify the data integrity using ior's
        Read Verify and Write Verify options.

    Use case:
        Run ior with read, write, fpp, read verify, write verify for 30 minutes.
        Run ior with read, write, read verify, write verify for 30 minutes.

    :avocado: tags=all,full_regression
    :avocado: tags=hw,large
    :avocado: tags=daosio,dfuse,il,ior_intercept
    :avocado: tags=ior_intercept_verify_data
    """
    self.add_pool()
    self.add_container(self.pool)

    # Start dfuse for the POSIX api. This is specific to the interception
    # library test requirements.
    self.start_dfuse(self.hostlist_clients, self.pool, self.container)

    # Setup the thread manager
    thread_manager = ThreadManager(run_ior, self.timeout - 30)

    index_clients_intercept_file = [
        (0, self.hostlist_clients[0:-1],
         os.path.join(self.prefix, 'lib64', 'libioil.so'),
         os.path.join(self.dfuse.mount_dir.value, "testfile_0_intercept")),
        (1, self.hostlist_clients[-1:], None,
         os.path.join(self.dfuse.mount_dir.value, "testfile_1")),
    ]
    self.job_manager = []
    for index, clients, intercept, test_file in index_clients_intercept_file:
        # Add a job manager for each ior command. Use a timeout for the ior
        # command that leaves enough time to report the summary of all the threads.
        job_manager = get_job_manager(
            self, "Mpirun", None, False, "mpich", self.get_remaining_time() - 30)

        # Define the parameters that will be used to run an ior command in this thread
        thread_manager.add(
            test=self,
            manager=job_manager,
            log=self.client_log,
            hosts=clients,
            path=self.workdir,
            slots=None,
            group=self.server_group,
            pool=self.pool,
            container=self.container,
            processes=(self.processes // len(self.hostlist_clients)) * len(clients),
            intercept=intercept,
            ior_params={"test_file": test_file})
        self.log.info(
            "Created thread %s for %s with intercept: %s", index, clients, str(intercept))

    # Launch the IOR threads
    self.log.info("Launching %d IOR threads", thread_manager.qty)
    results = thread_manager.run()

    # Stop dfuse
    self.stop_dfuse()

    # Check the ior thread results
    failed_thread_count = thread_manager.check(results)
    if failed_thread_count > 0:
        msg = "{} FAILED IOR Thread(s)".format(failed_thread_count)
        self.d_log.error(msg)
        self.fail(msg)

    for index, clients, intercept, _ in index_clients_intercept_file:
        with_intercept = "without" if intercept is None else "with"
        IorCommand.log_metrics(
            self.log,
            "{} clients {} interception library".format(len(clients), with_intercept),
            IorCommand.get_ior_metrics(results[index].result))