def daos_racer_thread(self):
    """Create, configure and run the daos_racer command.

    A DaosRacerCommand is built for the first client host, stored in
    self.daos_racer, loaded with the test parameters, given its
    environment and then run.
    """
    racer = DaosRacerCommand(
        self.bin, self.hostlist_clients[0], self.dmg_command)
    # Store the command on the test object before configuring it.
    self.daos_racer = racer
    racer.get_params(self)
    racer_env = racer.get_environment(self.server_managers[0])
    racer.set_environment(racer_env)
    racer.run()
def test_daos_racer(self):
    """JIRA-3855: daos_racer/consistency checker test.

    Test Description:
        The daos_racer test tool generates a bunch of simultaneous,
        conflicting I/O requests. After it is run it will verify that all
        the replicas of a given object are consistent.

        Run daos_racer for 5-10 minutes or so on 3-way replicated object
        data (at least 6 servers) and verify the object replicas.

    Use Cases:
        Running simultaneous, conflicting I/O requests.

    :avocado: tags=all,full_regression
    :avocado: tags=hw,large
    :avocado: tags=io,daosracer
    :avocado: tags=daosracer_multi
    """
    dmg = self.get_dmg_command()
    clients = self.hostlist_clients

    # daos_racer is launched from the first client, so one must exist.
    self.assertGreater(
        len(clients), 0,
        "This test requires one client: {}".format(clients))

    racer = DaosRacerCommand(self.bin, clients[0], dmg)
    racer.get_params(self)
    racer_env = racer.get_environment(self.server_managers[0])
    racer.set_environment(racer_env)
    racer.run()
def create_racer_cmdline(self, job_spec):
    """Create the srun cmdline to run daos_racer.

    Args:
        self (obj): soak obj
        job_spec (str): name of the daos_racer job; it selects the
            "/run/<job_spec>/*" parameter namespace and is also used as
            the log name

    Returns:
        list: a one-entry list containing [srun_cmds, log_name], where
            srun_cmds is the list of shell command strings to run

    """
    commands = []
    # The namespace is a "/"-separated parameter path, not a filesystem
    # path, so build it as a plain string instead of with os.path.join
    # (which would use the host OS separator).
    racer_namespace = "/run/{}/*".format(job_spec)
    daos_racer = DaosRacerCommand(
        self.bin, self.hostlist_clients[0], self.dmg_command)
    daos_racer.namespace = racer_namespace
    daos_racer.get_params(self)
    # The ${SLURM_*} placeholders are left literal here; they are only
    # meaningful to the shell that eventually runs the command.
    racer_log = os.path.join(
        self.test_log_dir,
        self.test_name + "_" + job_spec
        + "_${SLURM_JOB_NODELIST}_${SLURM_JOB_ID}_racer_log")
    env = daos_racer.get_environment(self.server_managers[0], racer_log)
    daos_racer.set_environment(env)
    log_name = job_spec
    srun_cmds = [
        # daos_racer command line
        str(daos_racer),
        # capture the exit code of the command above
        "status=$?",
    ]
    commands.append([srun_cmds, log_name])
    self.log.info("<<DAOS racer cmdlines>>:")
    for cmd in srun_cmds:
        self.log.info("%s", cmd)
    return commands
def test_parallel(self):
    """JIRA-8445: multi-client daos_racer/consistency checker test.

    Test Description:
        The daos_racer test tool generates a bunch of simultaneous,
        conflicting I/O requests. It will test both replicated objects and
        EC objects and verify the data consistency. The duration will
        depend on parameters in test yaml configuration file.

    Use Cases:
        Running simultaneous, conflicting I/O requests.

    :avocado: tags=all,full_regression
    :avocado: tags=hw,large
    :avocado: tags=io,daosracer,daos_racer_parallel
    """
    # Build the daos_racer command
    racer = DaosRacerCommand(
        self.bin, self.hostlist_clients[0], self.get_dmg_command())
    racer.get_params(self)

    # Configure the job manager: all client hosts, as many processes as
    # there are client hosts, and the daos_racer environment.
    manager = self.job_manager
    manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    manager.assign_processes(len(self.hostlist_clients))
    racer_env = racer.get_environment(self.server_managers[0])
    manager.assign_environment(racer_env)
    manager.job = racer
    manager.check_results_list = ["<stderr>"]
    manager.timeout = racer.clush_timeout.value
    self.log.info("Multi-process command: %s", str(manager))

    # Run daos_racer through the job manager and fail the test on error
    try:
        manager.run()
    except CommandFailure as error:
        self.log.error("DAOS Racer Failed: %s", str(error))
        self.fail("Test was expected to pass but it failed.\n")

    self.log.info("Test passed!")
def create_racer_cmdline(self, job_spec, pool):
    """Create the srun cmdline to run daos_racer.

    Args:
        self (obj): soak obj
        job_spec (str): name of the daos_racer job; it selects the
            "/run/<job_spec>/*" parameter namespace and is also used as
            the log name
        pool (obj): TestPool obj

    Returns:
        list: a one-entry list containing [srun_cmds, log_name], where
            srun_cmds is the list of shell command strings to run

    """
    commands = []
    racer_namespace = "/run/{}/*".format(job_spec)
    daos_racer = DaosRacerCommand(
        self.bin, self.hostlist_clients[0], self.dmg_command)
    daos_racer.namespace = racer_namespace
    daos_racer.get_params(self)
    # The ${SLURM_*} placeholders are left literal here; they are only
    # meaningful to the shell that eventually runs the command.
    racer_log = os.path.join(
        self.test_log_dir,
        self.test_name + "_" + job_spec
        + "_${SLURM_JOB_NODELIST}_${SLURM_JOB_ID}_racer_log")
    env = daos_racer.get_environment(self.server_managers[0], racer_log)
    daos_racer.set_environment(env)
    daos_racer.pool_uuid.update(pool.uuid)
    # add_containers is expected to leave the new container as the last
    # entry of self.container - TODO confirm against its implementation.
    add_containers(self, pool, path=racer_namespace)
    daos_racer.cont_uuid.update(self.container[-1].uuid)
    log_name = job_spec
    srun_cmds = [
        # daos_racer command line
        str(daos_racer),
        # capture the exit code of the command above
        "status=$?",
    ]
    commands.append([srun_cmds, log_name])
    self.log.info("<<DAOS racer cmdlines>>:")
    for cmd in srun_cmds:
        self.log.info("%s", cmd)
    return commands
class OSAOnlineReintegration(OSAUtils):
    # pylint: disable=too-many-ancestors
    """Online Server Addition online re-integration test class.

    Test Class Description:
        This test runs the daos_server Online reintegration test cases.

    :avocado: recursive
    """

    def setUp(self):
        """Set up for test case."""
        super().setUp()
        self.dmg_command = self.get_dmg_command()
        self.daos_command = DaosCommand(self.bin)
        self.ior_test_sequence = self.params.get(
            "ior_test_sequence", '/run/ior/iorflags/*')
        self.test_oclass = self.params.get("oclass", '/run/test_obj_class/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(
            self.hostlist_clients, self.workdir, None)
        self.pool = None
        self.ds_racer_queue = queue.Queue()
        self.daos_racer = None
        self.dmg_command.exit_status_exception = True

    def daos_racer_thread(self):
        """Start the daos_racer thread.

        Used as the target of the threading.Thread created in
        run_online_reintegration_test() when racer is True.
        """
        self.daos_racer = DaosRacerCommand(self.bin,
                                           self.hostlist_clients[0],
                                           self.dmg_command)
        self.daos_racer.get_params(self)
        self.daos_racer.set_environment(
            self.daos_racer.get_environment(self.server_managers[0]))
        self.daos_racer.run()

    def run_online_reintegration_test(self, num_pool, racer=False,
                                      server_boot=False,
                                      oclass=None):
        """Run the Online reintegration test.

        For each pool an IOR write thread is started, then a randomly
        chosen rank is excluded (or stopped and restarted) and
        reintegrated while the pool version is checked after each step.
        Finally IOR data is read back and the container is checked.

        Args:
            num_pool (int) : total pools to create for testing purposes.
            racer (bool) : run daos_racer in a background thread while
                           the test executes. Defaults to False.
            server_boot (bool) : Perform system stop/start on a rank.
                                 Defaults to False.
            oclass (str) : daos object class string (eg: "RP_2G8").
                           Defaults to None, in which case the object
                           class of self.ior_cmd is used.
        """
        if oclass is None:
            oclass = self.ior_cmd.dfs_oclass.value
        test_seq = self.ior_test_sequence[0]
        # Create a pool
        pool = {}
        # Highest rank number eligible for exclusion.
        # NOTE(review): assumes two ranks per server host - confirm.
        exclude_servers = (len(self.hostlist_servers) * 2) - 1

        # Exclude one rank : other than rank 0.
        rank = random.randint(1, exclude_servers)  #nosec

        # Start the daos_racer thread
        if racer is True:
            daos_racer_thread = threading.Thread(target=self.daos_racer_thread)
            daos_racer_thread.start()
            time.sleep(30)

        for val in range(0, num_pool):
            pool[val] = add_pool(self, connect=False)
            pool[val].set_property("reclaim", "disabled")

        # Exclude and reintegrate the pool_uuid, rank and targets
        for val in range(0, num_pool):
            threads = []
            self.pool = pool[val]
            # Instantiate aggregation
            if self.test_during_aggregation is True:
                for _ in range(0, 2):
                    self.run_ior_thread("Write", oclass, test_seq)
                self.delete_extra_container(self.pool)
            # The following thread runs while performing osa operations.
            threads.append(threading.Thread(target=self.run_ior_thread,
                                            kwargs={"action": "Write",
                                                    "oclass": oclass,
                                                    "test": test_seq}))

            # Launch the IOR threads
            for thrd in threads:
                self.log.info("Thread : %s", thrd)
                thrd.start()
                time.sleep(1)
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)
            if server_boot is False:
                output = self.dmg_command.pool_exclude(
                    self.pool.uuid, rank)
            else:
                # Take the rank out by stopping it, then bring it back.
                output = self.dmg_command.system_stop(ranks=rank, force=True)
                self.pool.wait_for_rebuild(False)
                self.log.info(output)
                output = self.dmg_command.system_start(ranks=rank)

            self.print_and_assert_on_rebuild_failure(output)
            pver_exclude = self.get_pool_version()

            self.log.info("Pool Version after exclude %s", pver_exclude)
            # Check pool version incremented after pool exclude
            # pver_exclude should be greater than
            # pver_begin + 8 targets.
            self.assertTrue(pver_exclude > (pver_begin + 8),
                            "Pool Version Error:  After exclude")
            output = self.dmg_command.pool_reintegrate(self.pool.uuid, rank)
            self.print_and_assert_on_rebuild_failure(output)

            pver_reint = self.get_pool_version()
            self.log.info("Pool Version after reintegrate %d", pver_reint)
            # Check pool version incremented after pool reintegrate
            self.assertTrue(pver_reint > (pver_exclude + 1),
                            "Pool Version Error:  After reintegrate")
            # Wait to finish the threads
            for thrd in threads:
                thrd.join()
                # A non-empty out_queue is treated as a thread failure.
                if not self.out_queue.empty():
                    self.assert_on_exception()

        # Check data consistency for IOR in future
        # Presently, we are running daos_racer in parallel
        # to IOR and checking the data consistency only
        # for the daos_racer objects after exclude
        # and reintegration.
        if racer is True:
            daos_racer_thread.join()

        for val in range(0, num_pool):
            display_string = "Pool{} space at the End".format(val)
            self.pool = pool[val]
            self.pool.display_pool_daos_space(display_string)
            self.run_ior_thread("Read", oclass, test_seq)
            self.container = self.pool_cont_dict[self.pool][0]
            kwargs = {"pool": self.pool.uuid,
                      "cont": self.container.uuid}
            output = self.daos_command.container_check(**kwargs)
            self.log.info(output)

    @skipForTicket("DAOS-7420")
    def test_osa_online_reintegration(self):
        """Test ID: DAOS-5075.

        Test Description: Validate Online Reintegration

        :avocado: tags=all,daily_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=online_reintegration,online_reintegration_basic
        """
        self.log.info("Online Reintegration : Basic test")
        self.run_online_reintegration_test(1)

    @skipForTicket("DAOS-7195")
    def test_osa_online_reintegration_server_stop(self):
        """Test ID: DAOS-5920.

        Test Description: Validate Online Reintegration with server stop

        :avocado: tags=all,daily_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=online_reintegration,online_reintegration_srv_stop
        """
        self.log.info("Online Reintegration : System stop/start")
        self.run_online_reintegration_test(1, server_boot=True)

    @skipForTicket("DAOS-7420")
    def test_osa_online_reintegration_without_csum(self):
        """Test ID: DAOS-5075.

        Test Description: Validate Online Reintegration
        without checksum

        :avocado: tags=all,daily_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=online_reintegration,online_reintegration_without_csum
        """
        self.log.info("Online Reintegration : No Checksum")
        self.test_with_checksum = self.params.get("test_with_checksum",
                                                  '/run/checksum/*')
        self.run_online_reintegration_test(1)

    @skipForTicket("DAOS-7996")
    def test_osa_online_reintegration_with_aggregation(self):
        """Test ID: DAOS-6715

        Test Description: Reintegrate rank while aggregation
        is happening in parallel

        :avocado: tags=all,full_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=online_reintegration,online_reintegration_aggregation
        """
        self.test_during_aggregation = self.params.get("test_with_aggregation",
                                                       '/run/aggregation/*')
        self.log.info("Online Reintegration : Aggregation")
        self.run_online_reintegration_test(1)

    @skipForTicket("DAOS-7996")
    def test_osa_online_reintegration_oclass(self):
        """Test ID: DAOS-6715

        Test Description: Reintegrate rank with different
        object class

        :avocado: tags=all,full_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=online_reintegration,online_reintegration_oclass
        """
        self.log.info("Online Reintegration : Object Class")
        # Run the full scenario once per configured object class.
        for oclass in self.test_oclass:
            self.run_online_reintegration_test(1, oclass=oclass)
class OSAOnlineReintegration(OSAUtils):
    # pylint: disable=too-many-ancestors
    """Online Server Addition online re-integration test class.

    Test Class Description:
        This test runs the daos_server Online reintegration test cases.

    :avocado: recursive
    """

    def setUp(self):
        """Set up for test case."""
        super().setUp()
        self.dmg_command = self.get_dmg_command()
        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get("ior_test_sequence",
                                                 '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get("obj_class",
                                              '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(self.hostlist_clients,
                                                self.workdir, None)
        self.pool = None
        # Queue passed to the IOR threads as their "results" argument
        self.out_queue = queue.Queue()
        self.ds_racer_queue = queue.Queue()
        self.daos_racer = None

    def daos_racer_thread(self):
        """Start the daos_racer thread.

        Used as the target of the threading.Thread created in
        run_online_reintegration_test() when racer is True.
        """
        self.daos_racer = DaosRacerCommand(self.bin,
                                           self.hostlist_clients[0],
                                           self.dmg_command)
        self.daos_racer.get_params(self)
        self.daos_racer.set_environment(
            self.daos_racer.get_environment(self.server_managers[0]))
        self.daos_racer.run()

    def run_online_reintegration_test(self, num_pool, racer=False,
                                      server_boot=False):
        """Run the Online reintegration test.

        For each pool, IOR threads are started, then a randomly chosen
        rank is excluded (or stopped and restarted) and reintegrated
        while the pool version is checked after each step.

        Args:
            num_pool (int) : total pools to create for testing purposes.
            racer (bool) : run daos_racer in a background thread while
                           the test executes. Defaults to False.
            server_boot (bool) : Perform system stop/start on a rank.
                                 Defaults to False.
        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        # Create a pool
        pool = {}
        pool_uuid = []
        # Highest rank number eligible for exclusion.
        # NOTE(review): assumes two ranks per server host - confirm.
        exclude_servers = (len(self.hostlist_servers) * 2) - 1

        # Exclude one rank : other than rank 0.
        rank = random.randint(1, exclude_servers)

        # Start the daos_racer thread
        if racer is True:
            daos_racer_thread = threading.Thread(target=self.daos_racer_thread)
            daos_racer_thread.start()
            time.sleep(30)

        for val in range(0, num_pool):
            pool[val] = TestPool(self.context, self.get_dmg_command())
            pool[val].get_params(self)
            # Split total SCM and NVME size for creating multiple pools.
            pool[val].scm_size.value = int(pool[val].scm_size.value /
                                           num_pool)
            pool[val].nvme_size.value = int(pool[val].nvme_size.value /
                                            num_pool)
            pool[val].create()
            pool_uuid.append(pool[val].uuid)

        # Exclude and reintegrate the pool_uuid, rank and targets
        for val in range(0, num_pool):
            threads = []
            # One batch of num_jobs IOR threads per parameter combination
            for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                    self.ior_apis,
                                                    self.ior_test_sequence,
                                                    self.ior_flags):
                for _ in range(0, num_jobs):
                    # Add a thread for these IOR arguments
                    threads.append(
                        threading.Thread(target=self.ior_thread,
                                         kwargs={
                                             "pool": pool[val],
                                             "oclass": oclass,
                                             "api": api,
                                             "test": test,
                                             "flags": flags,
                                             "results": self.out_queue
                                         }))
            # Launch the IOR threads
            for thrd in threads:
                self.log.info("Thread : %s", thrd)
                thrd.start()
                time.sleep(1)
            self.pool = pool[val]
            time.sleep(5)
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)
            if server_boot is False:
                output = self.dmg_command.pool_exclude(self.pool.uuid, rank)
            else:
                # Take the rank out by stopping it, then bring it back.
                output = self.dmg_command.system_stop(ranks=rank)
                self.pool.wait_for_rebuild(True)
                self.log.info(output)
                output = self.dmg_command.system_start(ranks=rank)

            self.log.info(output)
            self.is_rebuild_done(3)
            self.assert_on_rebuild_failure()

            pver_exclude = self.get_pool_version()
            time.sleep(5)

            self.log.info("Pool Version after exclude %s", pver_exclude)
            # Check pool version incremented after pool exclude
            # pver_exclude should be greater than
            # pver_begin + 8 targets.
            self.assertTrue(pver_exclude > (pver_begin + 8),
                            "Pool Version Error:  After exclude")
            output = self.dmg_command.pool_reintegrate(self.pool.uuid, rank)
            self.log.info(output)
            self.is_rebuild_done(3)
            self.assert_on_rebuild_failure()

            pver_reint = self.get_pool_version()
            self.log.info("Pool Version after reintegrate %d", pver_reint)
            # Check pool version incremented after pool reintegrate
            self.assertTrue(pver_reint > (pver_exclude + 1),
                            "Pool Version Error:  After reintegrate")
            # Wait to finish the threads
            # (join() with a timeout returns even if the thread is still
            # running, so a thread may outlive this loop.)
            for thrd in threads:
                thrd.join(timeout=20)

        # Check data consistency for IOR in future
        # Presently, we are running daos_racer in parallel
        # to IOR and checking the data consistency only
        # for the daos_racer objects after exclude
        # and reintegration.
        if racer is True:
            daos_racer_thread.join()

        for val in range(0, num_pool):
            display_string = "Pool{} space at the End".format(val)
            self.pool = pool[val]
            self.pool.display_pool_daos_space(display_string)

    @skipForTicket("DAOS-6573")
    def test_osa_online_reintegration(self):
        """Test ID: DAOS-5075.

        Test Description: Validate Online Reintegration

        :avocado: tags=all,pr,daily_regression,hw,medium,ib2,osa
        :avocado: tags=online_reintegration
        """
        # Perform reintegration testing with 1 pool.
        # (range(1, 2) yields only the value 1.)
        for pool_num in range(1, 2):
            self.run_online_reintegration_test(pool_num)

    @skipForTicket("DAOS-6766, DAOS-6783")
    def test_osa_online_reintegration_server_stop(self):
        """Test ID: DAOS-5920.

        Test Description: Validate Online Reintegration with server stop

        :avocado: tags=all,pr,daily_regression,hw,medium,ib2,osa
        :avocado: tags=online_reintegration_srv_stop
        """
        self.run_online_reintegration_test(1, server_boot=True)
def verify_client_run(self, exp_iface, env):
    """Run daos_racer as a libdaos client and check which interface it used.

    The "port_rcv_data" counters are sampled before and after the run and
    compared per interface and host; the client log is also checked via
    get_log_info().

    Args:
        exp_iface (str): expected interface to check.
        env (bool): add OFI_INTERFACE variable to exported variables of
            client command.

    Returns:
        bool: False only when the number of interfaces without traffic
            equals len(self.interfaces); True otherwise.

    """
    counter = "port_rcv_data"
    clients = self.agent_managers[0].hosts

    # Counter snapshot taken before any client traffic is generated
    port_info_before = self.get_port_cnt(clients, counter)

    # daos_racer needs the dmg command (config) to run
    dmg = self.get_dmg_command()

    # daos_racer acts as the client under test
    racer = DaosRacerCommand(self.bin, clients[0], dmg)
    racer.get_params(self)

    # Export OFI_INTERFACE with the client command only when requested
    if env:
        racer.update_env_names(["OFI_INTERFACE"])

    # Environment and log file; FI_LOG_LEVEL gives more info on device
    # issues
    log_file = "daos_racer_{}_{}.log".format(exp_iface, env)
    racer_env = racer.get_environment(self.server_managers[0], log_file)
    racer_env["FI_LOG_LEVEL"] = "info"
    racer_env["D_LOG_MASK"] = "INFO,object=ERR,placement=ERR"
    racer.set_environment(racer_env)

    # Run the client
    racer.run()

    # Counter snapshot after the run, reported as a table
    port_info_after = self.get_port_cnt(clients, counter)

    self.log.info("Client interface port_rcv_data counters")
    msg_format = "%16s  %9s  %9s  %9s  %s"
    self.log.info(
        msg_format, "Host(s)", "Interface", "Before", "After", "Difference")
    self.log.info(
        msg_format, "-" * 16, "-" * 9, "-" * 9, "-" * 9, "-" * 9)

    no_traffic = set()
    for interface in sorted(port_info_before):
        hosts_before = port_info_before[interface]
        for host in sorted(hosts_before):
            before = hosts_before[host][1]["port_rcv_data"]
            try:
                after = port_info_after[interface][host][1]["port_rcv_data"]
                diff = int(after) - int(before)
            except (KeyError, ValueError) as error:
                # A missing or non-numeric counter counts as no traffic
                after = "Error"
                diff = "Unknown - {}".format(error)
                no_traffic.add(interface)
            else:
                if diff <= 0:
                    no_traffic.add(interface)
            self.log.info(msg_format, host, interface, before, after, diff)

    # Read daos.log to verify device used and prevent false positives
    log_ok = self.get_log_info(
        clients, exp_iface, env, get_log_file(log_file))
    self.assertTrue(log_ok)

    # Report every device that showed no incoming data
    for interface in no_traffic:
        self.log.info("No client traffic seen through device: %s", interface)

    return len(no_traffic) != len(self.interfaces)
class OSAOnlineParallelTest(OSAUtils):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: This test runs
    daos_server online drain,reintegration,
    extend test cases in parallel.

    :avocado: recursive
    """

    def setUp(self):
        """Set up for test case."""
        super(OSAOnlineParallelTest, self).setUp()
        self.dmg_command = self.get_dmg_command()
        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get("ior_test_sequence",
                                                 '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get("obj_class",
                                              '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(self.hostlist_clients,
                                                self.workdir, None)
        self.pool = None
        # Queue shared by the IOR and dmg threads to report failures
        self.out_queue = queue.Queue()
        self.ds_racer_queue = queue.Queue()
        self.daos_racer = None

    def daos_racer_thread(self, results):
        """Start the daos_racer thread.

        Args:
            results (queue): queue that receives a message once
                daos_racer.run() has returned.
        """
        self.daos_racer = DaosRacerCommand(self.bin, self.hostlist_clients[0],
                                           self.dmg_command)
        self.daos_racer.get_params(self)
        self.daos_racer.set_environment(
            self.daos_racer.get_environment(self.server_managers[0]))
        self.daos_racer.run()
        results.put("Daos Racer Started")

    def dmg_thread(self, action, action_args, results):
        """Generate different dmg command related to OSA.

        Args:
            action (str): OSA action name; "pool_<action>" must be a
                method of the dmg command (e.g. "drain", "exclude",
                "reintegrate").
            action_args (dict): keyword arguments per action, e.g.
                {"exclude": {"pool": uuid, "rank": rank,
                             "tgt_idx": t_string}}
            results (queue): dmg command output queue; a message is put
                on it when the dmg command fails.
        """
        # Work on a copy so this thread does not share the command object
        dmg = copy.copy(self.dmg_command)
        try:
            if action == "reintegrate":
                # Give sometime for the other threads to start/finish
                # before reintegrating
                time.sleep(60)
            # For each action, read the values from the
            # dictionary.
            # example {"exclude" : {"pool": self.pool.uuid, "rank": rank
            #                       "tgt_idx": t_string}}
            # getattr is used to obtain the method in dmg object.
            # eg: dmg -> pool_exclude method, then pass arguments like
            # pool, rank, tgt_idx to the pool_exclude method.
            getattr(dmg, "pool_{}".format(action))(**action_args[action])
        except CommandFailure as _error:
            results.put("{} failed".format(action_args[action]))
        # Future enhancement for extend
        # elif action == "extend":
        #    dmg.pool_extend(puuid, (rank + 2))

    def run_online_parallel_test(self, num_pool, racer=False):
        """Run multiple OSA commands / IO in parallel.

        Args:
            num_pool (int) : total pools to create for testing purposes.
            racer (bool) : run daos_racer in a background thread while
                           the test executes. Defaults to False.
        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        # Create a pool
        pool = {}
        pool_uuid = []
        target_list = []

        # Exclude target : random two targets (target idx : 0-7)
        n = random.randint(0, 6)
        target_list.append(n)
        target_list.append(n + 1)
        t_string = "{},{}".format(target_list[0], target_list[1])

        # Exclude rank 2.
        rank = 2

        # Start the daos_racer thread
        if racer is True:
            kwargs = {"results": self.ds_racer_queue}
            daos_racer_thread = threading.Thread(target=self.daos_racer_thread,
                                                 kwargs=kwargs)
            daos_racer_thread.start()
            time.sleep(30)

        for val in range(0, num_pool):
            pool[val] = TestPool(self.context,
                                 dmg_command=self.get_dmg_command())
            pool[val].get_params(self)
            # Split total SCM and NVME size for creating multiple pools.
            pool[val].scm_size.value = int(pool[val].scm_size.value /
                                           num_pool)
            pool[val].nvme_size.value = int(pool[val].nvme_size.value /
                                            num_pool)
            pool[val].create()
            pool_uuid.append(pool[val].uuid)

        # Exclude and reintegrate the pool_uuid, rank and targets
        for val in range(0, num_pool):
            self.pool = pool[val]
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)
            threads = []
            for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                    self.ior_apis,
                                                    self.ior_test_sequence,
                                                    self.ior_flags):
                # Action dictionary with OSA dmg command parameters
                action_args = {
                    "drain": {
                        "pool": self.pool.uuid,
                        "rank": rank,
                        "tgt_idx": None
                    },
                    "exclude": {
                        "pool": self.pool.uuid,
                        "rank": (rank + 1),
                        "tgt_idx": t_string
                    },
                    "reintegrate": {
                        "pool": self.pool.uuid,
                        "rank": (rank + 1),
                        "tgt_idx": t_string
                    }
                }
                for _ in range(0, num_jobs):
                    # Add a thread for these IOR arguments
                    threads.append(
                        threading.Thread(target=self.ior_thread,
                                         kwargs={
                                             "pool": pool[val],
                                             "oclass": oclass,
                                             "api": api,
                                             "test": test,
                                             "flags": flags,
                                             "results": self.out_queue
                                         }))
                for action in sorted(action_args):
                    # Add dmg threads
                    threads.append(
                        threading.Thread(target=self.dmg_thread,
                                         kwargs={
                                             "action": action,
                                             "action_args": action_args,
                                             "results": self.out_queue
                                         }))

            # Launch the IOR threads
            for thrd in threads:
                self.log.info("Thread : %s", thrd)
                thrd.start()
                time.sleep(2)

            # Wait to finish the threads
            # (join() with a timeout returns even if the thread is still
            # running, so a thread may outlive this loop.)
            for thrd in threads:
                thrd.join(timeout=20)

        # Check data consistency for IOR in future
        # Presently, we are running daos_racer in parallel
        # to IOR and checking the data consistency only
        # for the daos_racer objects after exclude
        # and reintegration.
        if racer is True:
            daos_racer_thread.join()

        for val in range(0, num_pool):
            display_string = "Pool{} space at the End".format(val)
            # get_pool_version() takes no pool argument and presumably
            # queries self.pool, so select the pool being verified;
            # otherwise every iteration would check the last pool used
            # in the loop above.
            self.pool = pool[val]
            self.pool.display_pool_daos_space(display_string)
            self.is_rebuild_done(3)
            self.assert_on_rebuild_failure()
            pver_end = self.get_pool_version()
            self.log.info("Pool Version at the End %s", pver_end)
            self.assertTrue(pver_end == 25, "Pool Version Error:  at the end")
            pool[val].destroy()

    @skipForTicket("DAOS-6107")
    def test_osa_online_parallel_test(self):
        """
        JIRA ID: DAOS-4752

        Test Description: Runs multiple OSA commands/IO in parallel

        :avocado: tags=all,pr,daily_regression,hw,medium,ib2,osa
        :avocado: tags=osa_parallel,online_parallel
        """
        self.run_online_parallel_test(1)
class OSAOnlineParallelTest(TestWithServers):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: This test runs
    daos_server online drain,reintegration,
    extend test cases in parallel.

    :avocado: recursive
    """

    def setUp(self):
        """Set up for test case."""
        super(OSAOnlineParallelTest, self).setUp()
        self.dmg_command = self.get_dmg_command()
        self.no_of_dkeys = self.params.get("no_of_dkeys", '/run/dkeys/*')
        self.no_of_akeys = self.params.get("no_of_akeys", '/run/akeys/*')
        self.record_length = self.params.get("length", '/run/record/*')
        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get("ior_test_sequence",
                                                 '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get("obj_class",
                                              '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(
            self.hostlist_clients, self.workdir, None)
        self.pool = None
        # Queue shared by the IOR and dmg threads to report failures
        self.out_queue = queue.Queue()
        self.ds_racer_queue = queue.Queue()
        self.daos_racer = None

    @fail_on(CommandFailure)
    def get_pool_version(self):
        """Get the pool version of self.pool.

        Returns:
            int: pool_version_value

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["version"])

    def daos_racer_thread(self, results):
        """Start the daos_racer thread.

        Args:
            results (queue): queue that receives a message once
                daos_racer.run() has returned.
        """
        self.daos_racer = DaosRacerCommand(self.bin, self.hostlist_clients[0],
                                           self.dmg_command)
        self.daos_racer.get_params(self)
        self.daos_racer.set_environment(
            self.daos_racer.get_environment(self.server_managers[0]))
        self.daos_racer.run()
        results.put("Daos Racer Started")

    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Run a single IOR command through mpirun (mpich).

        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR api
            test (list): IOR test sequence; test[2] is used as the
                transfer size and test[3] as the block size
            flags (str): IOR flags
            results (queue): queue for returning thread results; "FAIL"
                is put on it when the IOR command fails

        Returns:
            None

        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            # Report the hosts that were actually checked; indexing the
            # hostfile (self.hostfile_clients[0]) does not name a host.
            self.fail("Exiting Test : Mpich not installed on :"
                      " {}".format(self.hostlist_clients))
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        # A fresh container uuid is generated for every IOR run
        container_info["{}{}{}"
                       .format(oclass,
                               api,
                               test[2])] = str(uuid.uuid4())

        # Define the job manager for the IOR command
        manager = Mpirun(ior_cmd, mpitype="mpich")
        key = "".join([oclass, api, str(test[2])])
        manager.job.dfs_cont.update(container_info[key])
        env = ior_cmd.get_default_env(str(manager))
        manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        manager.assign_processes(processes)
        manager.assign_environment(env, True)

        # run IOR Command
        try:
            manager.run()
        except CommandFailure as _error:
            results.put("FAIL")

    def dmg_thread(self, action, action_args, results):
        """Generate different dmg command related to OSA.

        Args:
            action (str): OSA action name; "pool_<action>" must be a
                method of the dmg command (e.g. "drain", "exclude",
                "reintegrate").
            action_args (dict): keyword arguments per action, e.g.
                {"exclude": {"pool": uuid, "rank": rank,
                             "tgt_idx": t_string}}
            results (queue): dmg command output queue; a message is put
                on it when the dmg command fails.
        """
        # Work on a copy so this thread does not share the command object
        dmg = copy.copy(self.dmg_command)
        try:
            if action == "reintegrate":
                # Give sometime for the other threads to start/finish
                # before reintegrating
                time.sleep(60)
            # For each action, read the values from the
            # dictionary.
            # example {"exclude" : {"pool": self.pool.uuid, "rank": rank
            #                       "tgt_idx": t_string}}
            # getattr is used to obtain the method in dmg object.
            # eg: dmg -> pool_exclude method, then pass arguments like
            # pool, rank, tgt_idx to the pool_exclude method.
            getattr(dmg, "pool_{}".format(action))(**action_args[action])
        except CommandFailure as _error:
            results.put("{} failed".format(action_args[action]))
        # Future enhancement for extend
        # elif action == "extend":
        #    dmg.pool_extend(puuid, (rank + 2))

    def run_online_parallel_test(self, num_pool):
        """Run multiple OSA commands / IO in parallel.

        daos_racer is always started in a background thread.

        Args:
            num_pool (int) : total pools to create for testing purposes.
        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        # Create a pool
        pool = {}
        pool_uuid = []
        target_list = []

        # Exclude target : random two targets (target idx : 0-7)
        n = random.randint(0, 6)
        target_list.append(n)
        target_list.append(n+1)
        t_string = "{},{}".format(target_list[0], target_list[1])

        # Exclude rank 2.
        rank = 2

        # Start the daos_racer thread
        kwargs = {"results": self.ds_racer_queue}
        daos_racer_thread = threading.Thread(target=self.daos_racer_thread,
                                             kwargs=kwargs)
        daos_racer_thread.start()
        time.sleep(30)

        for val in range(0, num_pool):
            pool[val] = TestPool(self.context,
                                 dmg_command=self.get_dmg_command())
            pool[val].get_params(self)
            # Split total SCM and NVME size for creating multiple pools.
            pool[val].scm_size.value = int(pool[val].scm_size.value /
                                           num_pool)
            pool[val].nvme_size.value = int(pool[val].nvme_size.value /
                                            num_pool)
            pool[val].create()
            pool_uuid.append(pool[val].uuid)

        # Exclude and reintegrate the pool_uuid, rank and targets
        for val in range(0, num_pool):
            self.pool = pool[val]
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)
            for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                    self.ior_apis,
                                                    self.ior_test_sequence,
                                                    self.ior_flags):
                # A fresh batch of threads per parameter combination
                threads = []
                # Action dictionary with OSA dmg command parameters
                action_args = {
                    "drain": {"pool": self.pool.uuid, "rank": rank,
                              "tgt_idx": None},
                    "exclude": {"pool": self.pool.uuid, "rank": (rank + 1),
                                "tgt_idx": t_string},
                    "reintegrate": {"pool": self.pool.uuid,
                                    "rank": (rank + 1),
                                    "tgt_idx": t_string}
                }
                for _ in range(0, num_jobs):
                    # Add a thread for these IOR arguments
                    threads.append(threading.Thread(target=self.ior_thread,
                                                    kwargs={"pool": pool[val],
                                                            "oclass": oclass,
                                                            "api": api,
                                                            "test": test,
                                                            "flags": flags,
                                                            "results":
                                                            self.out_queue}))
                for action in sorted(action_args):
                    # Add dmg threads
                    threads.append(threading.Thread(target=self.dmg_thread,
                                                    kwargs={"action": action,
                                                            "action_args":
                                                            action_args,
                                                            "results":
                                                            self.out_queue}))

                # Launch the IOR threads
                for thrd in threads:
                    self.log.info("Thread : %s", thrd)
                    thrd.start()
                    time.sleep(3)

                # Wait to finish the threads
                for thrd in threads:
                    thrd.join()

        # Check data consistency for IOR in future
        # Presently, we are running daos_racer in parallel
        # to IOR and checking the data consistency only
        # for the daos_racer objects after exclude
        # and reintegration.
        daos_racer_thread.join()

        for val in range(0, num_pool):
            display_string = "Pool{} space at the End".format(val)
            # get_pool_version() queries self.pool, so select the pool
            # being verified; otherwise every iteration would check the
            # pool last assigned to self.pool.
            self.pool = pool[val]
            self.pool.display_pool_daos_space(display_string)
            # Poll (at most 21 times, 10s apart) until the pool version
            # exceeds 23; stop immediately once it does.
            for _ in range(21):
                pver_end = self.get_pool_version()
                if pver_end > 23:
                    break
                time.sleep(10)
            self.log.info("Pool Version at the End %s", pver_end)
            self.assertTrue(pver_end == 25, "Pool Version Error:  at the end")
            pool[val].destroy()

    @skipForTicket("DAOS-5877")
    def test_osa_online_parallel_test(self):
        """
        JIRA ID: DAOS-4752

        Test Description: Runs multiple OSA commands/IO in parallel

        :avocado: tags=all,pr,hw,large,osa,osa_parallel,online_parallel
        """
        self.run_online_parallel_test(1)
def verify_client_run(self, exp_iface, env):
    """Run daos_racer as a client and check traffic on the expected device.

    The "port_rcv_data" counter of the hfi device mapped to ``exp_iface`` is
    sampled before and after a daos_racer run, and the client log is checked
    through ``get_log_info``.

    Args:
        exp_iface (str): expected interface to check; must be a key of the
            local interface-to-device map ("ib0" or "ib1").
        env (bool): add OFI_INTERFACE variable to exported variables of
            client command.

    Returns:
        bool: False when the last computed counter difference is not
            positive, True otherwise.

    """
    device = {"ib0": "hfi1_0", "ib1": "hfi1_1"}[exp_iface]
    counter = "port_rcv_data"

    # Sample the receive counter before generating any traffic.
    before = self.get_port_cnt(self.hostlist_clients, device, counter)

    # daos_racer acts as the libdaos client and needs the dmg command.
    racer = DaosRacerCommand(
        self.bin, self.hostlist_clients[0], self.get_dmg_command())
    racer.get_params(self)
    if env:
        # Export OFI_INTERFACE along with the default variables.
        racer.update_env_names(["OFI_INTERFACE"])

    # Dedicated log file; FI_LOG_LEVEL gives more info on device issues.
    log_name = "daos_racer_{}_{}.log".format(exp_iface, env)
    environment = racer.get_environment(self.server_managers[0], log_name)
    environment["FI_LOG_LEVEL"] = "info"
    racer.set_environment(environment)
    racer.run()

    # Sample the counter again and log the difference for each entry.
    after = self.get_port_cnt(self.hostlist_clients, device, counter)
    delta = 0
    for old, new in zip(before.values(), after.values()):
        # Only the difference of the last pair survives the loop.
        delta = int(new) - int(old)
        self.log.info("Port [%s] count difference: %s", exp_iface, delta)

    # Read the client log to verify the device used and prevent false
    # positives.
    self.assertTrue(
        self.get_log_info(
            self.hostlist_clients, exp_iface, env, get_log_file(log_name)))

    # No data through the device means the check failed.
    if delta <= 0:
        self.log.info("No traffic seen through device: %s", exp_iface)
        return False
    return True
class OSAOnlineExtend(TestWithServers):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: This test runs
    daos_server Online Extend test cases.

    :avocado: recursive
    """

    def setUp(self):
        """Set up for test case."""
        super(OSAOnlineExtend, self).setUp()
        self.dmg_command = self.get_dmg_command()
        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get(
            "ior_test_sequence", '/run/ior/iorflags/*')
        # Both attributes are read from the same "obj_class" yaml key.
        self.ior_daos_oclass = self.params.get(
            "obj_class", '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get(
            "obj_class", '/run/ior/iorflags/*')
        # Hosts handed to start_additional_servers() during the test.
        self.extra_servers = self.params.get(
            "test_servers", "/run/extra_servers/*")
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(
            self.hostlist_clients, self.workdir, None)
        self.pool = None
        self.out_queue = queue.Queue()
        self.ds_racer_queue = queue.Queue()
        self.daos_racer = None

    @fail_on(CommandFailure)
    def get_pool_version(self):
        """Get the pool version.

        Returns:
            int: pool_version_value

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["version"])

    def daos_racer_thread(self):
        """Start the daos_racer thread."""
        self.daos_racer = DaosRacerCommand(
            self.bin, self.hostlist_clients[0], self.dmg_command)
        self.daos_racer.get_params(self)
        self.daos_racer.set_environment(
            self.daos_racer.get_environment(self.server_managers[0]))
        self.daos_racer.run()

    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Run one IOR command under mpich and report a failure via a queue.

        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR API
            test (list): IOR test sequence; index 2 is used as the transfer
                size and index 3 as the block size
            flags (str): IOR flags
            results (queue): queue for returning thread results; "FAIL" is
                put on it when the IOR command raises CommandFailure

        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")
        self.pool = pool

        # Define the arguments for the IOR command
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        # Define the job manager for the IOR command; every run gets a
        # freshly generated container UUID.
        manager = Mpirun(ior_cmd, mpitype="mpich")
        manager.job.dfs_cont.update(str(uuid.uuid4()))
        env = ior_cmd.get_default_env(str(manager))
        manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        manager.assign_processes(processes)
        manager.assign_environment(env, True)

        # run IOR Command
        try:
            manager.run()
        except CommandFailure as error:
            # Keep the failure reason in the test log instead of dropping it.
            self.log.error("IOR command failed: %s", error)
            results.put("FAIL")

    def run_online_extend_test(self, num_pool):
        """Run the Online extend without data.

        Args:
            num_pool (int): total pools to create for testing purposes.

        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        pool = {}
        # Extend one of the ranks (or server); rank index starts from zero,
        # so the number of servers is the rank passed to pool_extend().
        rank = len(self.hostlist_servers)

        # Start the daos_racer thread
        daos_racer_thread = threading.Thread(target=self.daos_racer_thread)
        daos_racer_thread.start()
        time.sleep(30)

        for val in range(0, num_pool):
            pool[val] = TestPool(self.context, self.get_dmg_command())
            pool[val].get_params(self)
            # Split total SCM and NVME size for creating multiple pools.
            pool[val].scm_size.value = int(
                pool[val].scm_size.value / num_pool)
            pool[val].nvme_size.value = int(
                pool[val].nvme_size.value / num_pool)
            pool[val].create()

        # Extend the pool_uuid, rank and targets
        for val in range(0, num_pool):
            for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                    self.ior_apis,
                                                    self.ior_test_sequence,
                                                    self.ior_flags):
                threads = []
                for _ in range(0, num_jobs):
                    # Add a thread for these IOR arguments
                    threads.append(
                        threading.Thread(
                            target=self.ior_thread,
                            kwargs={"pool": pool[val],
                                    "oclass": oclass,
                                    "api": api,
                                    "test": test,
                                    "flags": flags,
                                    "results": self.out_queue}))
                # Launch the IOR threads
                for thrd in threads:
                    self.log.info("Thread : %s", thrd)
                    thrd.start()
                    time.sleep(5)

                self.pool = pool[val]
                scm_size = self.pool.scm_size
                nvme_size = self.pool.nvme_size
                self.pool.display_pool_daos_space("Pool space: Beginning")
                pver_begin = self.get_pool_version()

                # Start the additional servers and extend the pool
                self.log.info("Extra Servers = %s", self.extra_servers)
                self.start_additional_servers(self.extra_servers)
                # Give sometime for the additional server to come up.
                time.sleep(5)
                self.log.info("Pool Version at the beginning %s", pver_begin)
                output = self.dmg_command.pool_extend(
                    self.pool.uuid, rank, scm_size, nvme_size)
                self.log.info(output)

                # Poll the pool version (at most 21 queries, 15 seconds
                # apart) until it moves past the starting version.
                fail_count = 0
                while fail_count <= 20:
                    pver_extend = self.get_pool_version()
                    time.sleep(15)
                    fail_count += 1
                    if pver_extend > pver_begin:
                        break

                self.log.info("Pool Version after extend %s", pver_extend)
                # Check pool version incremented after pool extend
                self.assertTrue(pver_extend > pver_begin,
                                "Pool Version Error:  After extend")
                # Wait to finish the threads
                for thrd in threads:
                    thrd.join()

            # Check data consistency for IOR in future
            # Presently, we are running daos_racer in parallel
            # to IOR and checking the data consistency only
            # for the daos_racer objects after the extend.
            daos_racer_thread.join()

        for val in range(0, num_pool):
            display_string = "Pool{} space at the End".format(val)
            self.pool = pool[val]
            self.pool.display_pool_daos_space(display_string)
            pool[val].destroy()

    @skipForTicket("DAOS-5869")
    def test_osa_online_extend(self):
        """Test ID: DAOS-4751
        Test Description: Validate Online extend

        :avocado: tags=all,pr,hw,large,osa,osa_extend,online_extend
        """
        # Perform extend testing with 1 pool
        self.run_online_extend_test(1)
class OSAOnlineReintegration(TestWithServers):
    # pylint: disable=too-many-ancestors
    """Online Server Addition online re-integration test class.

    Test Class Description:
        This test runs the daos_server Online reintegration test cases.

    :avocado: recursive
    """

    def setUp(self):
        """Set up for test case."""
        super(OSAOnlineReintegration, self).setUp()
        self.dmg_command = self.get_dmg_command()
        self.no_of_dkeys = self.params.get("no_of_dkeys", '/run/dkeys/*')
        self.no_of_akeys = self.params.get("no_of_akeys", '/run/akeys/*')
        self.record_length = self.params.get("length", '/run/record/*')
        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get(
            "ior_test_sequence", '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get(
            "obj_class", '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(
            self.hostlist_clients, self.workdir, None)
        self.pool = None
        self.out_queue = queue.Queue()
        self.ds_racer_queue = queue.Queue()
        self.daos_racer = None

    @fail_on(CommandFailure)
    def get_pool_leader(self):
        """Get the pool leader.

        Returns:
            int: pool leader number

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["leader"])

    @fail_on(CommandFailure)
    def get_pool_version(self):
        """Get the pool version.

        Returns:
            int: pool version number

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["version"])

    def daos_racer_thread(self):
        """Start the daos_racer thread."""
        self.daos_racer = DaosRacerCommand(
            self.bin, self.hostlist_clients[0], self.dmg_command)
        self.daos_racer.get_params(self)
        self.daos_racer.set_environment(
            self.daos_racer.get_environment(self.server_managers[0]))
        self.daos_racer.run()

    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Run one IOR command under mpich and report a failure via a queue.

        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR api
            test (list): IOR test sequence; index 2 is used as the transfer
                size and index 3 as the block size
            flags (str): IOR flags
            results (queue): queue for returning thread results; "FAIL" is
                put on it when the IOR command raises CommandFailure

        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            # Report the client hosts; hostfile_clients is the value
            # returned by write_host_file(), so indexing it does not name
            # a host.
            self.fail("Exiting Test : Mpich not installed on :"
                      " {}".format(self.hostlist_clients))
        self.pool = pool

        # Define the arguments for the IOR command
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        # Define the job manager for the IOR command. A local reference is
        # used for the rest of this method because several IOR threads run
        # this method concurrently and each overwrites self.job_manager.
        manager = Mpirun(ior_cmd, mpitype="mpich")
        # NOTE(review): the attribute is still published as before;
        # presumably the base class reads it during teardown - confirm.
        self.job_manager = manager
        manager.job.dfs_cont.update(str(uuid.uuid4()))
        env = ior_cmd.get_default_env(str(manager))
        manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        manager.assign_processes(processes)
        manager.assign_environment(env, True)

        # run IOR Command
        try:
            manager.run()
        except CommandFailure as error:
            # Keep the failure reason in the test log instead of dropping it.
            self.log.error("IOR command failed: %s", error)
            results.put("FAIL")

    def run_online_reintegration_test(self, num_pool):
        """Run the Online reintegration without data.

        Args:
            num_pool (int): total pools to create for testing purposes.

        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        pool = {}
        target_list = []
        exclude_servers = len(self.hostlist_servers) - 1

        # Exclude target : random two targets (target idx : 0-7)
        n = random.randint(0, 6)
        target_list.append(n)
        target_list.append(n + 1)
        t_string = "{},{}".format(target_list[0], target_list[1])

        # Exclude one rank : other than rank 0.
        rank = random.randint(1, exclude_servers)

        # Start the daos_racer thread
        daos_racer_thread = threading.Thread(target=self.daos_racer_thread)
        daos_racer_thread.start()
        time.sleep(30)

        for val in range(0, num_pool):
            pool[val] = TestPool(self.context, self.get_dmg_command())
            pool[val].get_params(self)
            # Split total SCM and NVME size for creating multiple pools.
            pool[val].scm_size.value = int(
                pool[val].scm_size.value / num_pool)
            pool[val].nvme_size.value = int(
                pool[val].nvme_size.value / num_pool)
            pool[val].create()

        # Exclude and reintegrate the pool_uuid, rank and targets
        for val in range(0, num_pool):
            for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                    self.ior_apis,
                                                    self.ior_test_sequence,
                                                    self.ior_flags):
                threads = []
                for _ in range(0, num_jobs):
                    # Add a thread for these IOR arguments
                    threads.append(
                        threading.Thread(
                            target=self.ior_thread,
                            kwargs={"pool": pool[val],
                                    "oclass": oclass,
                                    "api": api,
                                    "test": test,
                                    "flags": flags,
                                    "results": self.out_queue}))
                # Launch the IOR threads
                for thrd in threads:
                    self.log.info("Thread : %s", thrd)
                    thrd.start()
                    time.sleep(5)

                self.pool = pool[val]
                self.pool.display_pool_daos_space("Pool space: Beginning")
                pver_begin = self.get_pool_version()
                self.log.info("Pool Version at the beginning %s", pver_begin)
                output = self.dmg_command.pool_exclude(
                    self.pool.uuid, rank, t_string)
                self.log.info(output)

                # Poll (at most 21 queries, 10 seconds apart) until the
                # version has advanced by more than the number of excluded
                # targets.
                fail_count = 0
                while fail_count <= 20:
                    pver_exclude = self.get_pool_version()
                    time.sleep(10)
                    fail_count += 1
                    if pver_exclude > (pver_begin + len(target_list)):
                        break

                self.log.info("Pool Version after exclude %s", pver_exclude)
                # Check pool version incremented after pool exclude
                self.assertTrue(
                    pver_exclude > (pver_begin + len(target_list)),
                    "Pool Version Error:  After exclude")
                output = self.dmg_command.pool_reintegrate(
                    self.pool.uuid, rank, t_string)
                self.log.info(output)

                # Same polling scheme for the reintegrate step.
                fail_count = 0
                while fail_count <= 20:
                    pver_reint = self.get_pool_version()
                    time.sleep(10)
                    fail_count += 1
                    if pver_reint > (pver_exclude + 1):
                        break

                self.log.info("Pool Version after reintegrate %d", pver_reint)
                # Check pool version incremented after pool reintegrate
                self.assertTrue(pver_reint > (pver_exclude + 1),
                                "Pool Version Error:  After reintegrate")
                # Wait to finish the threads
                for thrd in threads:
                    thrd.join()

            # Check data consistency for IOR in future
            # Presently, we are running daos_racer in parallel
            # to IOR and checking the data consistency only
            # for the daos_racer objects after exclude
            # and reintegration.
            daos_racer_thread.join()

        for val in range(0, num_pool):
            display_string = "Pool{} space at the End".format(val)
            self.pool = pool[val]
            self.pool.display_pool_daos_space(display_string)
            pool[val].destroy()

    def test_osa_online_reintegration(self):
        """Test ID: DAOS-5075.

        Test Description: Validate Online Reintegration

        :avocado: tags=all,pr,hw,large,osa,online_reintegration,DAOS_5610
        """
        # Perform reintegration testing with 1 pool.
        self.run_online_reintegration_test(1)
class OSAOnlineExtend(OSAUtils):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: This test runs
    daos_server Online Extend test cases.

    :avocado: recursive
    """

    def setUp(self):
        """Read the IOR and extra-server parameters used by the test."""
        super(OSAOnlineExtend, self).setUp()
        self.dmg_command = self.get_dmg_command()
        ior_namespace = '/run/ior/iorflags/*'
        self.ior_flags = self.params.get("ior_flags", ior_namespace)
        self.ior_apis = self.params.get("ior_api", ior_namespace)
        self.ior_test_sequence = self.params.get(
            "ior_test_sequence", ior_namespace)
        # Two attributes, both read from the "obj_class" key.
        self.ior_daos_oclass = self.params.get("obj_class", ior_namespace)
        self.ior_dfs_oclass = self.params.get("obj_class", ior_namespace)
        # Hosts passed to start_additional_servers() by the test.
        self.extra_servers = self.params.get(
            "test_servers", "/run/extra_servers/*")
        # Client hostfile is rewritten with no slots defined.
        self.hostfile_clients = write_host_file(
            self.hostlist_clients, self.workdir, None)
        self.pool = None
        self.out_queue = queue.Queue()
        self.ds_racer_queue = queue.Queue()
        self.daos_racer = None

    def daos_racer_thread(self):
        """Build, configure and run daos_racer on the first client."""
        racer = DaosRacerCommand(
            self.bin, self.hostlist_clients[0], self.dmg_command)
        self.daos_racer = racer
        racer.get_params(self)
        racer.set_environment(
            racer.get_environment(self.server_managers[0]))
        racer.run()

    def run_online_extend_test(self, num_pool, racer=False):
        """Extend pools while IOR threads are running.

        Args:
            num_pool (int): total pools to create for testing purposes.
            racer (bool): when exactly True, daos_racer runs in a background
                thread for the duration of the test. Defaults to False.

        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        pools = {}
        pool_uuid = []
        # Ranks handed to pool_extend().
        rank = [4, 5]

        racer_worker = None
        if racer is True:
            racer_worker = threading.Thread(target=self.daos_racer_thread)
            racer_worker.start()
            time.sleep(30)

        # Create the pools, splitting SCM and NVMe sizes evenly.
        for idx in range(num_pool):
            pools[idx] = TestPool(self.context, self.get_dmg_command())
            pools[idx].get_params(self)
            pools[idx].scm_size.value = int(
                pools[idx].scm_size.value / num_pool)
            pools[idx].nvme_size.value = int(
                pools[idx].nvme_size.value / num_pool)
            pools[idx].create()
            pool_uuid.append(pools[idx].uuid)

        # Extend each pool while its IOR threads are active.
        for idx in range(num_pool):
            workers = []
            combos = product(self.ior_dfs_oclass, self.ior_apis,
                             self.ior_test_sequence, self.ior_flags)
            for oclass, api, test, flags in combos:
                # One IOR thread per parallel job for this combination.
                workers.extend(
                    threading.Thread(
                        target=self.ior_thread,
                        kwargs={
                            "pool": pools[idx],
                            "oclass": oclass,
                            "api": api,
                            "test": test,
                            "flags": flags,
                            "results": self.out_queue,
                        })
                    for _ in range(num_jobs))

            # Launch the IOR threads one second apart.
            for worker in workers:
                self.log.info("Thread : %s", worker)
                worker.start()
                time.sleep(1)

            self.pool = pools[idx]
            scm_size = self.pool.scm_size
            nvme_size = self.pool.nvme_size
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()

            # Bring up the extra servers, then wait before extending.
            self.log.info("Extra Servers = %s", self.extra_servers)
            self.start_additional_servers(self.extra_servers)
            time.sleep(25)
            self.log.info("Pool Version at the beginning %s", pver_begin)
            output = self.dmg_command.pool_extend(
                self.pool.uuid, rank, scm_size, nvme_size)
            self.log.info(output)
            self.is_rebuild_done(3)
            self.assert_on_rebuild_failure()

            pver_extend = self.get_pool_version()
            self.log.info("Pool Version after extend %s", pver_extend)
            # The pool version must have moved forward after the extend.
            self.assertTrue(pver_extend > pver_begin,
                            "Pool Version Error:  After extend")

            # Give each IOR thread up to 20 seconds to finish.
            for worker in workers:
                worker.join(timeout=20)

        # IOR data consistency is not checked yet; only the daos_racer run,
        # when requested, is waited for here.
        if racer is True:
            racer_worker.join()

        for idx in range(num_pool):
            self.pool = pools[idx]
            self.pool.display_pool_daos_space(
                "Pool{} space at the End".format(idx))
            pools[idx].destroy()

    @skipForTicket("DAOS-5869")
    def test_osa_online_extend(self):
        """Test ID: DAOS-4751
        Test Description: Validate Online extend

        :avocado: tags=all,pr,daily_regression,hw,medium,ib2
        :avocado: tags=osa,osa_extend,online_extend
        """
        # A single pool is extended.
        self.run_online_extend_test(1)
class OSAOnlineExtend(OSAUtils):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: This test runs
    daos_server Online Extend test cases.

    :avocado: recursive
    """

    def setUp(self):
        """Read the test parameters and prepare the command objects."""
        super().setUp()
        self.dmg_command = self.get_dmg_command()
        self.daos_command = DaosCommand(self.bin)
        self.ior_test_sequence = self.params.get(
            "ior_test_sequence", '/run/ior/iorflags/*')
        self.test_oclass = self.params.get("oclass", '/run/test_obj_class/*')
        self.ranks = self.params.get("rank_list", '/run/test_ranks/*')
        # Hosts passed to start_additional_servers() by the test.
        self.extra_servers = self.params.get(
            "test_servers", "/run/extra_servers/*")
        # Client hostfile is rewritten with no slots defined.
        self.hostfile_clients = write_host_file(
            self.hostlist_clients, self.workdir, None)
        self.pool = None
        self.dmg_command.exit_status_exception = True
        self.daos_racer = None

    def daos_racer_thread(self):
        """Build, configure and run daos_racer on the first client."""
        racer = DaosRacerCommand(
            self.bin, self.hostlist_clients[0], self.dmg_command)
        self.daos_racer = racer
        racer.get_params(self)
        racer.set_environment(
            racer.get_environment(self.server_managers[0]))
        racer.run()

    def run_online_extend_test(self, num_pool, racer=False,
                               oclass=None, app_name="ior"):
        """Extend pools while an IOR or mdtest thread is running.

        Args:
            num_pool (int): total pools to create for testing purposes.
            racer (bool): run the testing along with daos_racer.
                Defaults to False.
            oclass (str): object class (eg: RP_2G1, etc); when None the
                value of self.ior_cmd.dfs_oclass is used. Defaults to None.
            app_name (str): "ior" runs IOR during the extend, any other
                value runs mdtest. Defaults to ior.

        """
        pools = {}
        if oclass is None:
            oclass = self.ior_cmd.dfs_oclass.value
        test_seq = self.ior_test_sequence[0]

        racer_worker = None
        if racer is True:
            racer_worker = threading.Thread(target=self.daos_racer_thread)
            racer_worker.start()
            time.sleep(30)

        # Create the pools with the "reclaim" property disabled.
        for idx in range(num_pool):
            pools[idx] = TestPool(
                context=self.context,
                dmg_command=self.get_dmg_command(),
                label_generator=self.label_generator)
            pools[idx].get_params(self)
            pools[idx].create()
            pools[idx].set_property("reclaim", "disabled")

        # Extend each pool while the application thread is active.
        for idx in range(num_pool):
            self.pool = pools[idx]

            # Bring up the extra servers that the pool is extended onto.
            self.log.info("Extra Servers = %s", self.extra_servers)
            self.start_additional_servers(self.extra_servers)

            if self.test_during_aggregation is True:
                # Two IOR writes, then remove the extra container.
                for _ in range(2):
                    self.run_ior_thread("Write", oclass, test_seq)
                self.delete_extra_container(self.pool)

            # The application that runs while the extend is performed.
            if app_name == "ior":
                app_worker = threading.Thread(
                    target=self.run_ior_thread,
                    kwargs={
                        "action": "Write",
                        "oclass": oclass,
                        "test": test_seq,
                    })
            else:
                app_worker = threading.Thread(target=self.run_mdtest_thread)
            workers = [app_worker]

            # Query the system up to 10 times; fail when the status check
            # never succeeds.
            for _ in range(10):
                scan_info = self.get_dmg_command().system_query()
                if check_system_query_status(scan_info):
                    break
            else:
                self.fail("One or more servers not in expected status")

            # Launch the IOR or mdtest thread.
            for worker in workers:
                self.log.info("Thread : %s", worker)
                worker.start()
                time.sleep(1)

            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)
            output = self.dmg_command.pool_extend(self.pool.uuid, self.ranks)
            self.print_and_assert_on_rebuild_failure(output)

            pver_extend = self.get_pool_version()
            self.log.info("Pool Version after extend %s", pver_extend)
            # The pool version must have moved forward after the extend.
            self.assertTrue(pver_extend > pver_begin,
                            "Pool Version Error:  After extend")

            # Wait for the application thread and surface queued results.
            for worker in workers:
                worker.join()
                if not self.out_queue.empty():
                    self.assert_on_exception()

        # Only the daos_racer run, when requested, is waited for here.
        if racer is True:
            racer_worker.join()

        # Read the data back and check the first container of each pool.
        for idx in range(num_pool):
            self.pool = pools[idx]
            self.pool.display_pool_daos_space(
                "Pool{} space at the End".format(idx))
            self.run_ior_thread("Read", oclass, test_seq)
            self.container = self.pool_cont_dict[self.pool][0]
            output = self.daos_command.container_check(
                pool=self.pool.uuid, cont=self.container.uuid)
            self.log.info(output)

    @skipForTicket("DAOS-7195,DAOS-7955")
    def test_osa_online_extend(self):
        """Test ID: DAOS-4751
        Test Description: Validate Online extend with checksum
        enabled.

        :avocado: tags=all,pr,daily_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=osa_extend,online_extend,online_extend_with_csum
        """
        self.log.info("Online Extend : With Checksum")
        self.run_online_extend_test(1)

    @skipForTicket("DAOS-7195,DAOS-7955")
    def test_osa_online_extend_without_checksum(self):
        """Test ID: DAOS-6645
        Test Description: Validate Online extend without checksum enabled.

        :avocado: tags=all,pr,daily_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=osa_extend,online_extend,online_extend_without_csum
        """
        self.log.info("Online Extend : Without Checksum")
        self.test_with_checksum = self.params.get(
            "test_with_checksum", '/run/checksum/*')
        self.run_online_extend_test(1)

    @skipForTicket("DAOS-7195,DAOS-7955")
    def test_osa_online_extend_oclass(self):
        """Test ID: DAOS-6645
        Test Description: Validate Online extend with different
        object class.

        :avocado: tags=all,pr,daily_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=osa_extend,online_extend,online_extend_oclass
        """
        self.log.info("Online Extend : Oclass")
        self.run_online_extend_test(1, oclass=self.test_oclass[0])

    @skipForTicket("DAOS-7195,DAOS-7955")
    def test_osa_online_extend_mdtest(self):
        """Test ID: DAOS-6645
        Test Description: Validate Online extend with mdtest application.

        :avocado: tags=all,pr,daily_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=osa_extend,online_extend,online_extend_mdtest
        """
        self.log.info("Online Extend : Mdtest")
        self.run_online_extend_test(1, app_name="mdtest")

    @skipForTicket("DAOS-7195,DAOS-7955")
    def test_osa_online_extend_with_aggregation(self):
        """Test ID: DAOS-6645
        Test Description: Validate Online extend with aggregation on.

        :avocado: tags=all,pr,daily_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=osa_extend,online_extend,online_extend_with_aggregation
        """
        self.log.info("Online Extend : Aggregation")
        self.test_during_aggregation = self.params.get(
            "test_with_aggregation", '/run/aggregation/*')
        self.run_online_extend_test(1)