class RbldReadArrayTest(RebuildTestBase):
    # pylint: disable=too-many-ancestors
    """Run rebuild tests with DAOS servers and clients.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a RbldReadArrayTest object."""
        super().__init__(*args, **kwargs)
        self.daos_cmd = None

    def execute_during_rebuild(self):
        """Read the objects during rebuild."""
        # Clear the container status so reads are allowed while rebuilding
        self.daos_cmd = DaosCommand(self.bin)
        self.daos_cmd.container_set_prop(
            pool=self.pool.uuid,
            cont=self.container.uuid,
            prop="status",
            value="healthy")

        msg = "Reading the array objects during rebuild"
        # Record the step in both the avocado log and the daos log
        for logger in (self.log, self.d_log):
            logger.info(msg)

        self.assertTrue(
            self.pool.read_data_during_rebuild(self.container),
            "Error reading data during rebuild")

    def test_read_array_during_rebuild(self):
        """Jira ID: DAOS-691.

        Test Description:
            Configure 5 targets with 1 pool with a service leader quantity
            of 2.  Add 1 container to the pool configured with 3 replicas.
            Add 10 objects of 10 records each populated with an array of 5
            values (currently a sufficient amount of data to be read fully
            before rebuild completes) to a specific rank.  Exclude this
            rank and verify that rebuild is initiated.  While rebuild is
            active, confirm that all the objects and records can be read.
            Finally verify that rebuild completes and the pool info
            indicates the correct number of rebuilt objects and records.

        Use Cases:
            Basic rebuild of container objects of array values with sufficient
            numbers of rebuild targets and no available rebuild targets.

        :avocado: tags=all,full_regression
        :avocado: tags=vm,large,rebuild,rebuildreadarray
        """
        self.execute_rebuild_test()
class ContSecurityTestBase(TestWithServers):
    """Container security test cases.

    Test Class Description:
        Test methods to verify the Container security with acl by
        using daos tool.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a ContSecurityTestBase object."""
        super().__init__(*args, **kwargs)
        self.dmg = None
        self.daos_tool = None
        self.user_uid = None
        self.user_gid = None
        self.current_user = None
        self.current_group = None
        self.pool_uuid = None
        self.container_uuid = None

    def setUp(self):
        """Set up each test case."""
        super().setUp()
        self.user_uid = os.geteuid()
        self.user_gid = os.getegid()
        self.current_user = pwd.getpwuid(self.user_uid)[0]
        # Fix: group lookup must use the effective GID, not the UID.
        # grp.getgrgid() takes a group id; passing user_uid only worked
        # when the uid and gid happened to be numerically equal.
        self.current_group = grp.getgrgid(self.user_gid)[0]
        self.co_prop = self.params.get("container_properties", "/run/container/*")
        self.dmg = self.get_dmg_command()
        self.daos_tool = DaosCommand(self.bin)

    @fail_on(CommandFailure)
    def create_pool_with_dmg(self):
        """Create a pool with the dmg tool.

        Obtains the pool uuid from the operation's result.

        Returns:
            pool_uuid (str): Pool UUID, randomly generated.
        """
        self.prepare_pool()
        pool_uuid = self.pool.pool.get_uuid_str()
        return pool_uuid

    def create_container_with_daos(self, pool, acl_type=None, acl_file=None):
        """Create a container with the daos tool.

        Also, obtains the container uuid from the operation's result.

        Args:
            pool (TestPool): Pool object.
            acl_type (str, optional): valid or invalid.
            acl_file (str, optional): path to an existing ACL file to use
                instead of the generated acl_<type>.txt file.

        Returns:
            container_uuid: Container UUID created, or None when an
                "invalid" acl_type was requested and creation failed
                as expected.
        """
        file_name = None
        get_acl_file = None
        expected_acl_types = [None, "valid", "invalid"]

        if acl_file is None:
            if acl_type not in expected_acl_types:
                self.fail(" Invalid '{}' acl type passed.".format(acl_type))
            if acl_type:
                # ACL files are generated into the test tmp directory
                get_acl_file = "acl_{}.txt".format(acl_type)
                file_name = os.path.join(self.tmp, get_acl_file)
            else:
                get_acl_file = ""
        else:
            file_name = acl_file

        try:
            self.container = TestContainer(pool=pool, daos_command=self.daos_tool)
            self.container.get_params(self)
            self.container.create(acl_file=file_name)
            container_uuid = self.container.uuid
        except TestFail as error:
            # Creation failure is only acceptable for an invalid ACL
            if acl_type != "invalid":
                raise DaosTestError(
                    "Could not create expected container ") from error
            container_uuid = None

        return container_uuid

    def get_container_acl_list(self, pool_uuid, container_uuid, verbose=False, outfile=None):
        """Get daos container acl list by daos container get-acl.

        Args:
            pool_uuid (str): Pool uuid.
            container_uuid (str): Container uuid.
            verbose (bool, optional): Verbose mode.
            outfile (str, optional): Write ACL to file

        Return:
            cont_permission_list: daos container acl list.
        """
        if not general_utils.check_uuid_format(pool_uuid):
            self.fail(" Invalid Pool UUID '{}' provided.".format(pool_uuid))

        if not general_utils.check_uuid_format(container_uuid):
            self.fail(" Invalid Container UUID '{}' provided.".format(
                container_uuid))

        result = self.daos_tool.container_get_acl(pool_uuid, container_uuid,
                                                  verbose, outfile)

        cont_permission_list = []
        for line in result.stdout_text.splitlines():
            if not line.startswith("A:"):
                continue
            if line.startswith("A::"):
                # User entry: A::<name>@:<perms>
                found_user = re.search(r"A::(.+)@:(.*)", line)
                if found_user:
                    cont_permission_list.append(line)
            elif line.startswith("A:G:"):
                # Group entry: A:G:<name>@:<perms>
                found_group = re.search(r"A:G:(.+)@:(.*)", line)
                if found_group:
                    cont_permission_list.append(line)
        return cont_permission_list

    def overwrite_container_acl(self, acl_file):
        """Overwrite existing container acl-entries with acl_file.

        Args:
            acl_file (str): acl filename.

        Return:
            result (str): daos_tool.container_overwrite_acl.
        """
        self.daos_tool.exit_status_exception = False
        result = self.daos_tool.container_overwrite_acl(
            self.pool_uuid, self.container_uuid, acl_file)
        return result

    def update_container_acl(self, entry):
        """Update container acl entry.

        Args:
            entry (str): acl entry to be updated.

        Return:
            result (str): daos_tool.container_update_acl.
        """
        self.daos_tool.exit_status_exception = False
        result = self.daos_tool.container_update_acl(self.pool_uuid,
                                                     self.container_uuid,
                                                     entry=entry)
        return result

    def test_container_destroy(self, pool_uuid, container_uuid):
        """Test container destroy/delete.

        Args:
            pool_uuid (str): pool uuid.
            container_uuid (str): container uuid.

        Return:
            result (str): daos_tool.container_destroy result.
        """
        self.daos_tool.exit_status_exception = False
        result = self.daos_tool.container_destroy(pool_uuid, container_uuid, True)
        return result

    def set_container_attribute(self, pool_uuid, container_uuid, attr, value):
        """Write/Set container attribute.

        Args:
            pool_uuid (str): pool uuid.
            container_uuid (str): container uuid.
            attr (str): container attribute.
            value (str): container attribute value to be set.

        Return:
            result (str): daos_tool.container_set_attr result.
        """
        self.daos_tool.exit_status_exception = False
        result = self.daos_tool.container_set_attr(pool_uuid, container_uuid,
                                                   attr, value)
        return result

    def get_container_attribute(self, pool_uuid, container_uuid, attr):
        """Get container attribute.

        Args:
            pool_uuid (str): pool uuid.
            container_uuid (str): container uuid.
            attr (str): container attribute.

        Return:
            CmdResult: Object that contains exit status, stdout, and other
                information.
        """
        self.daos_tool.exit_status_exception = False
        self.daos_tool.container_get_attr(pool_uuid, container_uuid, attr)
        return self.daos_tool.result

    def list_container_attribute(self, pool_uuid, container_uuid):
        """List container attribute.

        Args:
            pool_uuid (str): pool uuid.
            container_uuid (str): container uuid.

        Return:
            result (str): daos_tool.container_list_attrs result.
        """
        self.daos_tool.exit_status_exception = False
        result = self.daos_tool.container_list_attrs(pool_uuid, container_uuid)
        return result

    def set_container_property(self, pool_uuid, container_uuid, prop, value):
        """Write/Set container property.

        Args:
            pool_uuid (str): pool uuid.
            container_uuid (str): container uuid.
            prop (str): container property name.
            value (str): container property value to be set.

        Return:
            result (str): daos_tool.container_set_prop result.
        """
        self.daos_tool.exit_status_exception = False
        result = self.daos_tool.container_set_prop(pool_uuid, container_uuid,
                                                   prop, value)
        return result

    def get_container_property(self, pool_uuid, container_uuid):
        """Get container property.

        Args:
            pool_uuid (str): pool uuid.
            container_uuid (str): container uuid.

        Return:
            result (str): daos_tool.container_get_prop.
        """
        self.daos_tool.exit_status_exception = False
        result = self.daos_tool.container_get_prop(pool_uuid, container_uuid)
        return result

    def set_container_owner(self, pool_uuid, container_uuid, user, group):
        """Set container owner.

        Args:
            pool_uuid (str): pool uuid.
            container_uuid (str): container uuid.
            user (str): container user-name to be set owner to.
            group (str): container group-name to be set owner to.

        Return:
            result (str): daos_tool.container_set_owner.
        """
        self.daos_tool.exit_status_exception = False
        result = self.daos_tool.container_set_owner(pool_uuid, container_uuid,
                                                    user, group)
        return result

    def compare_acl_lists(self, get_acl_list, expected_list):
        """Compare two permission lists.

        Args:
            get_acl_list (str list): list of permissions obtained by get-acl
            expected_list (str list): list of expected permissions

        Returns:
            True or False if both permission lists are identical or not
        """
        self.log.info("   ===> get-acl ACL:  %s", get_acl_list)
        self.log.info("   ===> Expected ACL: %s", expected_list)

        # Compare as multisets: order does not matter, duplicates do
        exp_list = expected_list[:]
        if len(get_acl_list) != len(exp_list):
            return False
        for acl in get_acl_list:
            if acl in exp_list:
                exp_list.remove(acl)
            else:
                return False
        return True

    def get_base_acl_entries(self, test_user):
        """Get container acl entries per cont enforcement order for test_user.

        Args:
            test_user (str): test user.

        Returns (list str): List of base container acl entries for the
            test_user.
        """
        if test_user == "OWNER":
            base_acl_entries = [
                secTestBase.acl_entry("user", "OWNER", ""),
                secTestBase.acl_entry("user", self.current_user, ""),
                secTestBase.acl_entry("group", "GROUP", "rwcdtTaAo"),
                secTestBase.acl_entry("group", self.current_group, "rwcdtTaAo"),
                secTestBase.acl_entry("user", "EVERYONE", "rwcdtTaAo")
            ]
        elif test_user == "user":
            base_acl_entries = [
                "",
                secTestBase.acl_entry("user", self.current_user, ""),
                secTestBase.acl_entry("group", "GROUP", "rwcdtTaAo"),
                secTestBase.acl_entry("group", self.current_group, ""),
                secTestBase.acl_entry("user", "EVERYONE", "rwcdtTaAo")
            ]
        elif test_user == "group":
            base_acl_entries = [
                "",
                "",
                secTestBase.acl_entry("group", "GROUP", ""),
                secTestBase.acl_entry("group", self.current_group, ""),
                secTestBase.acl_entry("user", "EVERYONE", "rwcdtTaAo")
            ]
        elif test_user == "GROUP":
            base_acl_entries = [
                "",
                "",
                "",
                secTestBase.acl_entry("group", self.current_group, ""),
                secTestBase.acl_entry("user", "EVERYONE", "rwcdtTaAo")
            ]
        elif test_user == "EVERYONE":
            base_acl_entries = [
                "",
                "",
                "",
                "",
                secTestBase.acl_entry("user", "EVERYONE", "")
            ]
        else:
            base_acl_entries = ["", "", "", "", ""]
        return base_acl_entries

    def cleanup(self, types):
        """Remove all temporal acl files created during the test.

        Args:
            types (list): types of acl files [valid, invalid]
        """
        for typ in types:
            get_acl_file = "acl_{}.txt".format(typ)
            file_name = os.path.join(self.tmp, get_acl_file)
            cmd = "rm -r {}".format(file_name)
            general_utils.run_command(cmd)

    def error_handling(self, results, err_msg):
        """Handle errors when test fails and when command unexpectedly passes.

        Args:
            results (CmdResult): object containing stdout, stderr and
                exit status.
            err_msg (str): error message string to look for in stderr.

        Returns:
            list: list of test errors encountered.
        """
        test_errs = []
        if results.exit_status == 0:
            test_errs.append("{} passed unexpectedly: {}".format(
                results.command, results.stdout_text))
        elif results.exit_status == 1:
            # REMOVE BELOW IF Once DAOS-5635 is resolved
            if results.stdout_text and err_msg in results.stdout_text:
                self.log.info("Found expected error %s", results.stdout_text)
            # REMOVE ABOVE IF Once DAOS-5635 is resolved
            elif results.stderr_text and err_msg in results.stderr_text:
                self.log.info("Found expected error %s", results.stderr_text)
            else:
                # Fix: the original line-continuation inside the string
                # literal embedded a run of indentation spaces in the
                # failure message; use a single clean message instead.
                self.fail("{} seems to have failed with unexpected "
                          "error: {}".format(results.command, results))
        return test_errs

    def acl_file_diff(self, prev_acl, flag=True):
        """Compare current content of acl-file with helper function.

        If provided prev_acl file information is different from current
        acl file information test will fail if flag=True.  If flag=False,
        test will fail in the case that the acl contents are found to have
        no difference.

        Args:
            prev_acl (list): list of acl entries within acl-file.
                Defaults to True.
            flag (bool, optional): if True, test will fail when acl-file
                contents are different, else test will fail when acl-file
                contents are same. Defaults to True.
        """
        current_acl = self.get_container_acl_list(self.pool.uuid,
                                                  self.container.uuid)
        if self.compare_acl_lists(prev_acl, current_acl) != flag:
            self.fail("Previous ACL:\n{} \nPost command ACL:\n{}".format(
                prev_acl, current_acl))
class ErasureCodeSingle(TestWithServers):
    # pylint: disable=too-many-ancestors
    # pylint: disable=too-many-instance-attributes
    """Class to used for EC testing for single type data."""

    def __init__(self, *args, **kwargs):
        """Initialize a TestWithServers object."""
        super().__init__(*args, **kwargs)
        self.server_count = None
        self.set_online_rebuild = False
        self.rank_to_kill = None
        self.daos_cmd = None
        self.container = []

    def setUp(self):
        """Set up each test case."""
        # Start the servers and agents
        super().setUp()
        engine_count = self.server_managers[0].get_config_value(
            "engines_per_host")
        self.server_count = len(self.hostlist_servers) * engine_count
        self.obj_class = self.params.get("dfs_oclass_list",
                                         '/run/objectclass/*')
        self.singledata_set = self.params.get("single_data_set",
                                              '/run/container/*')
        self.add_pool()
        # Queue used to collect PASS/FAIL results from worker threads
        self.out_queue = queue.Queue()

    def ec_container_create(self, index, oclass):
        """Create the container for EC object.

        Args:
            index(int): container number
            oclass(str): object class for creating the container.
        """
        self.container.append(TestContainer(self.pool))
        # Get container parameters
        self.container[index].get_params(self)
        # update object class for container create, if supplied explicitly.
        self.container[index].oclass.update(oclass)
        # Get the Parity count for setting the container RF property.
        ec_object = get_data_parity_number(self.log, oclass)
        self.container[index].properties.update("rf:{}".format(
            ec_object['parity']))
        # create container
        self.container[index].create()

    def single_type_param_update(self, index, data):
        """Update the data set content provided from yaml file.

        Args:
            index(int): container number
            data(list): dataset content from test yaml file.
        """
        self.container[index].object_qty.update(data[0])
        self.container[index].record_qty.update(data[1])
        self.container[index].dkey_size.update(data[2])
        self.container[index].akey_size.update(data[3])
        self.container[index].data_size.update(data[4])

    def write_single_type_dataset(self, results=None):
        """Write single type data set with different EC object
           and different sizes.

        Args:
            results (queue): queue for returning thread results
        """
        cont_count = 0
        for oclass in self.obj_class:
            for sizes in self.singledata_set:
                # Skip the object type if server count does not meet the
                # minimum EC object server count
                if oclass[1] > self.server_count:
                    continue
                # Create the new container with correct redundancy factor
                # for EC object type
                try:
                    self.ec_container_create(cont_count, oclass[0])
                    self.single_type_param_update(cont_count, sizes)
                    # Write the data
                    self.container[cont_count].write_objects(
                        obj_class=oclass[0])
                    cont_count += 1
                    if results is not None:
                        results.put("PASS")
                except (CommandFailure, DaosApiError, DaosTestError):
                    if results is not None:
                        results.put("FAIL")
                    raise

    def read_single_type_dataset(self, results=None, parity=1):
        """Read single type data and verify for different EC object
           and different sizes.

        Args:
            results (queue): queue for returning thread results
            parity(int): object parity number for reading, default All.
        """
        cont_count = 0
        self.daos_cmd = DaosCommand(self.bin)
        for oclass in self.obj_class:
            for _sizes in self.singledata_set:
                # Skip the object type if server count does not meet the
                # minimum EC object server count
                if oclass[1] > self.server_count:
                    continue
                parity_set = "P{}".format(parity)
                # Read the requested data+parity data set only
                if parity != 1 and parity_set not in oclass[0]:
                    # Fix: use the framework logger instead of a bare
                    # print() so the message lands in the test log
                    self.log.info("Skipping Read as object type is %s",
                                  oclass[0])
                    cont_count += 1
                    continue
                # Clear the container status before reading
                self.daos_cmd.container_set_prop(
                    pool=self.pool.uuid,
                    cont=self.container[cont_count].uuid,
                    prop="status",
                    value="healthy")
                # Read data and verified the content
                try:
                    if not self.container[cont_count].read_objects():
                        if results is not None:
                            results.put("FAIL")
                        self.fail("Data verification Error")
                    cont_count += 1
                    if results is not None:
                        results.put("PASS")
                except (CommandFailure, DaosApiError, DaosTestError):
                    if results is not None:
                        results.put("FAIL")
                    raise

    def start_online_single_operation(self, operation, parity=1):
        """Do Write/Read operation with single data type.

        Args:
            operation (string): Write/Read operation
            parity (int, optional): object parity number passed to the
                read thread. Defaults to 1.
        """
        # Create the single data Write/Read threads
        if operation == 'WRITE':
            job = threading.Thread(target=self.write_single_type_dataset,
                                   kwargs={"results": self.out_queue})
        elif operation == 'READ':
            job = threading.Thread(target=self.read_single_type_dataset,
                                   kwargs={
                                       "results": self.out_queue,
                                       "parity": parity
                                   })
        else:
            # Fix: fail fast on an unknown operation instead of raising
            # UnboundLocalError at job.start() below
            self.fail("Unsupported operation: {}".format(operation))

        # Launch the single data write/read thread
        job.start()

        # Kill the server rank while IO operation in progress
        if self.set_online_rebuild:
            time.sleep(10)
            # Kill the server rank
            if self.rank_to_kill is not None:
                self.server_managers[0].stop_ranks([self.rank_to_kill],
                                                   self.d_log,
                                                   force=True)

        # Wait to finish the thread
        job.join()

        # Verify the queue and make sure no FAIL for any run
        while not self.out_queue.empty():
            if self.out_queue.get() == "FAIL":
                self.fail("FAIL")
class RbldCascadingFailures(RebuildTestBase):
    # pylint: disable=too-many-ancestors
    """Test cascading failures during rebuild.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a CascadingFailures object."""
        super().__init__(*args, **kwargs)
        self.mode = None
        self.daos_cmd = None

    def create_test_container(self):
        """Create a container and populate it with objects."""
        self.container.create()
        self.container.write_objects(
            self.inputs.rank.value[0], self.inputs.object_class.value)

    def verify_rank_has_objects(self):
        """Confirm the first rank to be excluded holds at least one object."""
        target_ranks = self.container.get_target_rank_lists(" before rebuild")
        counts = {}
        for rank in self.inputs.rank.value:
            counts[rank] = self.container.get_target_rank_count(
                rank, target_ranks)
        first = self.inputs.rank.value[0]
        self.assertGreater(
            counts[first], 0,
            "No objects written to rank {}".format(first))

    def verify_rank_has_no_objects(self):
        """Confirm every excluded rank ended up with zero objects."""
        target_ranks = self.container.get_target_rank_lists(" after rebuild")
        counts = {}
        for rank in self.inputs.rank.value:
            counts[rank] = self.container.get_target_rank_count(
                rank, target_ranks)
        for rank in self.inputs.rank.value:
            self.assertEqual(
                counts[rank], 0,
                "Excluded rank {} still has objects".format(rank))

    def start_rebuild(self):
        """Start the rebuild process."""
        ranks = self.inputs.rank.value
        if self.mode == "simultaneous":
            # Both ranks are excluded together to initiate rebuild
            self.server_managers[0].stop_ranks(ranks, self.d_log)
        else:
            # Only the first rank is excluded to initiate rebuild
            self.server_managers[0].stop_ranks([ranks[0]], self.d_log)
            if self.mode == "sequential":
                # The second rank is excluded before rebuild is observed
                self.server_managers[0].stop_ranks([ranks[1]], self.d_log)

        # Block until the pool reports that rebuild has started
        self.pool.wait_for_rebuild(True, 1)

    def execute_during_rebuild(self):
        """Execute test steps during rebuild."""
        self.daos_cmd = DaosCommand(self.bin)
        if self.mode == "cascading":
            # In cascading mode the second failure is injected while the
            # first rebuild is already in progress
            self.server_managers[0].stop_ranks(
                [self.inputs.rank.value[1]], self.d_log)

        # Clear the container status so writes are allowed during rebuild
        self.daos_cmd.container_set_prop(
            pool=self.pool.uuid,
            cont=self.container.uuid,
            prop="status",
            value="healthy")

        # Populate the container with additional data during rebuild
        self.container.write_objects(obj_class=self.inputs.object_class.value)

    def test_simultaneous_failures(self):
        """Jira ID: DAOS-842.

        Test Description:
            Configure a pool with sufficient redundancy to survive and
            rebuild from two target failures.  Trigger two target failures
            at the same time.  User application I/O should continue to
            succeed throughout the rebuild process and after.  Once the
            rebuild is complete the pool should reflect a normal status.

        Use Cases:
            Verify rebuild with multiple server failures.

        :avocado: tags=all,large,full_regression,rebuild
        :avocado: tags=multitarget,simultaneous
        """
        self.mode = "simultaneous"
        self.execute_rebuild_test()

    def test_sequential_failures(self):
        """Jira ID: DAOS-843.

        Test Description:
            Configure a pool with sufficient redundancy to survive and
            rebuild from two target failures.  Trigger a single target
            failure.  Before rebuilding from the first failure, activate a
            second failure.  User application I/O should continue to succeed
            throughout the rebuild process and after.  Once the rebuild is
            complete the pool should reflect a normal status.

        Use Cases:
            Verify rebuild with multiple server failures.

        :avocado: tags=all,large,full_regression,rebuild
        :avocado: tags=multitarget,sequential
        """
        self.mode = "sequential"
        self.execute_rebuild_test()

    def test_cascading_failures(self):
        """Jira ID: DAOS-844.

        Test Description:
            Configure a pool with sufficient redundancy to survive and
            rebuild from two target failures.  Trigger a single target
            failure.  While rebuilding from the first failure, activate a
            second failure.  User application I/O should continue to succeed
            throughout the rebuild process and after.  Once the rebuild is
            complete the pool should reflect a normal status.

        Use Cases:
            Verify rebuild with multiple server failures.

        :avocado: tags=all,large,full_regression,rebuild
        :avocado: tags=multitarget,cascading
        """
        self.mode = "cascading"
        self.execute_rebuild_test()
class RebuildTestBase(TestWithServers):
    """Base rebuild test class.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a RebuildTestBase object."""
        super().__init__(*args, **kwargs)
        self.inputs = RebuildTestParams()
        self.targets = None
        self.server_count = 0
        self.info_checks = None
        self.rebuild_checks = None
        self.daos_cmd = None

    def setUp(self):
        """Set up each test case."""
        # Start the servers and agents
        super().setUp()

        # Get the test parameters
        self.inputs.get_params(self)

        # Get the number of targets per engine for pool info calculations
        self.targets = self.params.get("targets", "/run/server_config/*")

        self.server_count = len(self.hostlist_servers)

    def setup_test_pool(self):
        """Define a TestPool object."""
        self.add_pool(create=False)

    def setup_test_container(self):
        """Define a TestContainer object."""
        self.add_container(self.pool, create=False)

    def setup_pool_verify(self):
        """Set up pool verification initial expected values."""
        self.info_checks = {
            "pi_uuid": self.pool.uuid,
            "pi_nnodes": self.server_count,
            "pi_ntargets": (self.server_count * self.targets),
            "pi_ndisabled": 0,
        }
        self.rebuild_checks = {
            "rs_done": 1,
            "rs_obj_nr": 0,
            "rs_rec_nr": 0,
            "rs_errno": 0,
        }

    def update_pool_verify(self):
        """Update the pool verification expected values."""
        self.info_checks["pi_ndisabled"] = ">0"
        self.rebuild_checks["rs_obj_nr"] = ">0"
        self.rebuild_checks["rs_rec_nr"] = ">0"

    def execute_pool_verify(self, msg=None):
        """Verify the pool info.

        Args:
            msg (str, optional): additional information to include in the
                error message. Defaults to None.
        """
        status = self.pool.check_pool_info(**self.info_checks)
        status &= self.pool.check_rebuild_status(**self.rebuild_checks)
        self.assertTrue(
            status,
            "Error confirming pool info{}".format("" if msg is None else msg))

    def create_test_pool(self):
        """Create the pool and verify its info."""
        # Create a pool
        self.pool.create()

        # Verify the pool information before rebuild
        self.setup_pool_verify()
        self.execute_pool_verify(" before rebuild")

    def create_test_container(self):
        """Create a container and write objects."""
        if self.container is not None:
            self.container.create()
            self.container.write_objects(
                self.inputs.rank.value, self.inputs.object_class.value)

    def verify_rank_has_objects(self):
        """Verify the rank to be excluded has at least one object."""
        if self.container is not None:
            rank = self.inputs.rank.value
            rank_list = self.container.get_target_rank_lists(" before rebuild")
            qty = self.container.get_target_rank_count(rank, rank_list)
            self.assertGreater(
                qty, 0, "No objects written to rank {}".format(rank))

    def verify_rank_has_no_objects(self):
        """Verify the excluded rank has zero objects."""
        if self.container is not None:
            rank = self.inputs.rank.value
            rank_list = self.container.get_target_rank_lists(" after rebuild")
            qty = self.container.get_target_rank_count(rank, rank_list)
            self.assertEqual(
                qty, 0, "Excluded rank {} still has objects".format(rank))

    def start_rebuild(self):
        """Start the rebuild process."""
        # Exclude the rank(s) from the pool to initiate rebuild
        if isinstance(self.inputs.rank.value, list):
            self.server_managers[0].stop_ranks(
                self.inputs.rank.value, self.d_log, force=True)
        else:
            self.server_managers[0].stop_ranks(
                [self.inputs.rank.value], self.d_log, force=True)

        # Wait for rebuild to start
        self.pool.wait_for_rebuild(True, 1)

    def execute_during_rebuild(self):
        """Execute test steps during rebuild."""

    def verify_container_data(self, txn=None):
        """Verify the container data.

        Args:
            txn (int, optional): transaction timestamp to read. Defaults to
                None which uses the last timestamp written.
        """
        if self.container is not None:
            self.assertTrue(
                self.container.read_objects(txn),
                "Error verifying container data")

    def execute_rebuild_test(self, create_container=True):
        """Execute the rebuild test steps.

        Args:
            create_container (bool, optional): should the test create a
                container. Defaults to True.
        """
        # Get the test params
        self.setup_test_pool()
        self.daos_cmd = DaosCommand(self.bin)
        if create_container:
            self.setup_test_container()

        # Create a pool and verify the pool information before rebuild
        self.create_test_pool()

        # Create a container and write objects
        self.create_test_container()

        # Verify the rank to be excluded has at least one object
        self.verify_rank_has_objects()

        # Start the rebuild process
        self.start_rebuild()

        # Execute the test steps during rebuild
        self.execute_during_rebuild()

        # Confirm rebuild completes
        self.pool.wait_for_rebuild(False, 1)

        # clear container status for the RF issue
        # Fix: guard the container accesses so this method does not raise
        # AttributeError when called with create_container=False (the other
        # container steps above already guard against self.container=None)
        if self.container is not None:
            self.daos_cmd.container_set_prop(
                pool=self.pool.uuid,
                cont=self.container.uuid,
                prop="status",
                value="healthy")

        # Refresh local pool and container
        self.pool.check_pool_info()
        if self.container is not None:
            self.container.check_container_info()

        # Verify the excluded rank is no longer used with the objects
        self.verify_rank_has_no_objects()

        # Verify the pool information after rebuild
        self.update_pool_verify()
        self.execute_pool_verify(" after rebuild")

        # Verify the container data can still be accessed
        self.verify_container_data()

        self.log.info("Test passed")
class RbldBasic(TestWithServers):
    """Test class for rebuild tests.

    Test Class Description:
        This class contains tests for pool rebuild.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a RbldBasic object."""
        super().__init__(*args, **kwargs)
        # DaosCommand wrapper; created lazily in run_rebuild_test()
        self.daos_cmd = None

    def run_rebuild_test(self, pool_quantity):
        """Run the rebuild test for the specified number of pools.

        Creates pool_quantity pools each with one container, writes data to
        a specific rank, excludes that rank, waits for rebuild to complete,
        and verifies the pool rebuild statistics and the container data.

        Args:
            pool_quantity (int): number of pools to test
        """
        # Get the test parameters
        self.pool = []
        self.container = []
        self.daos_cmd = DaosCommand(self.bin)
        for _ in range(pool_quantity):
            self.pool.append(self.get_pool(create=False))
            self.container.append(
                self.get_container(self.pool[-1], create=False))
        rank = self.params.get("rank", "/run/testparams/*")
        obj_class = self.params.get("object_class", "/run/testparams/*")

        # Collect server configuration information
        server_count = len(self.hostlist_servers)
        engine_count = self.server_managers[0].get_config_value(
            "engines_per_host")
        # engines_per_host may be absent from the config; default to 1
        engine_count = 1 if engine_count is None else int(engine_count)
        target_count = int(self.server_managers[0].get_config_value("targets"))
        self.log.info(
            "Running with %s servers, %s engines per server, and %s targets "
            "per engine", server_count, engine_count, target_count)

        # Create the pools and confirm their status
        status = True
        for index in range(pool_quantity):
            self.pool[index].create()
            status &= self.pool[index].check_pool_info(
                pi_nnodes=server_count * engine_count,
                pi_ntargets=server_count * engine_count * target_count,
                pi_ndisabled=0)
            status &= self.pool[index].check_rebuild_status(rs_done=1,
                                                            rs_obj_nr=0,
                                                            rs_rec_nr=0,
                                                            rs_errno=0)
        self.assertTrue(status, "Error confirming pool info before rebuild")

        # Create containers in each pool and fill them with data
        rs_obj_nr = []
        rs_rec_nr = []
        for index in range(pool_quantity):
            self.container[index].create()
            self.container[index].write_objects(rank, obj_class)

        # Determine how many objects will need to be rebuilt
        # (one entry per pool, indexed in step with self.pool/self.container)
        for index in range(pool_quantity):
            target_rank_lists = self.container[index].get_target_rank_lists(
                " prior to rebuild")
            rebuild_qty = self.container[index].get_target_rank_count(
                rank, target_rank_lists)
            rs_obj_nr.append(rebuild_qty)
            self.log.info(
                "Expecting %s/%s rebuilt objects in container %s after "
                "excluding rank %s", rs_obj_nr[-1], len(target_rank_lists),
                self.container[index], rank)
            rs_rec_nr.append(
                rs_obj_nr[-1] * self.container[index].record_qty.value)
            self.log.info(
                "Expecting %s/%s rebuilt records in container %s after "
                "excluding rank %s", rs_rec_nr[-1],
                self.container[index].object_qty.value *
                self.container[index].record_qty.value,
                self.container[index], rank)

        # Manually exclude the specified rank
        # The first pool stops the server rank; the other pools only exclude
        # the (now stopped) rank from their own pool membership
        for index in range(pool_quantity):
            if index == 0:
                self.server_managers[0].stop_ranks([rank], self.d_log, True)
            else:
                self.pool[index].exclude(ranks=[rank])

        # Wait for recovery to start for first pool.
        self.pool[0].wait_for_rebuild(True)

        # Wait for recovery to complete
        for index in range(pool_quantity):
            self.pool[index].wait_for_rebuild(False)

        # Check the pool information after the rebuild
        status = True
        for index in range(pool_quantity):
            status &= self.pool[index].check_pool_info(
                pi_nnodes=server_count * engine_count,
                pi_ntargets=server_count * engine_count * target_count,
                pi_ndisabled=target_count)
            status &= self.pool[index].check_rebuild_status(
                rs_done=1, rs_obj_nr=rs_obj_nr[index],
                rs_rec_nr=rs_rec_nr[index], rs_errno=0)
        self.assertTrue(status, "Error confirming pool info after rebuild")

        # Verify the data after rebuild
        for index in range(pool_quantity):
            # Clear the container status so reads are allowed post-rebuild
            self.daos_cmd.container_set_prop(pool=self.pool[index].uuid,
                                             cont=self.container[index].uuid,
                                             prop="status",
                                             value="healthy")
            if self.container[index].object_qty.value != 0:
                self.assertTrue(self.container[index].read_objects(),
                                "Data verification error after rebuild")
        self.log.info("Test Passed")

    def test_simple_rebuild(self):
        """JIRA ID: DAOS-XXXX Rebuild-001.

        Test Description:
            The most basic rebuild test.

        Use Cases:
            single pool rebuild, single client, various record/object counts

        :avocado: tags=all,daily_regression
        :avocado: tags=vm,large
        :avocado: tags=rebuild
        :avocado: tags=pool,rebuild_tests,test_simple_rebuild
        """
        self.run_rebuild_test(1)

    def test_multipool_rebuild(self):
        """JIRA ID: DAOS-XXXX (Rebuild-002).

        Test Description:
            Expand on the basic test by rebuilding 2 pools at once.

        Use Cases:
            multipool rebuild, single client, various object and record counts

        :avocado: tags=all,daily_regression
        :avocado: tags=vm,large
        :avocado: tags=rebuild
        :avocado: tags=pool,rebuild_tests,test_multipool_rebuild
        """
        self.run_rebuild_test(self.params.get("quantity", "/run/testparams/*"))
class RbldWithIO(TestWithServers):
    """Test class for pool rebuild during I/O.

    Test Class Description:
        This class contains tests for pool rebuild that feature I/O going on
        during the rebuild.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a RbldWithIO object."""
        super().__init__(*args, **kwargs)
        # daos command wrapper; created lazily once rebuild has started
        self.daos_cmd = None

    def test_rebuild_with_io(self):
        """JIRA ID: Rebuild-003.

        Test Description:
            Trigger a rebuild while I/O is ongoing.

        Use Cases:
            single pool, single client performing continuous read/write/verify
            sequence while failure/rebuild is triggered in another process

        :avocado: tags=all,pool,rebuild,daily_regression,medium,rebuildwithio
        """
        # Get the test params
        self.add_pool(create=False)
        self.add_container(self.pool, create=False)
        targets = self.params.get("targets", "/run/server_config/*")
        rank = self.params.get("rank", "/run/testparams/*")
        obj_class = self.params.get("object_class", "/run/testparams/*")
        server_count = len(self.hostlist_servers)

        # Create a pool and verify the pool info before rebuild (also connects)
        self.pool.create()
        checks = {
            "pi_nnodes": server_count,
            "pi_ntargets": server_count * targets,
            "pi_ndisabled": 0,
        }
        self.assertTrue(
            self.pool.check_pool_info(**checks),
            "Invalid pool information detected before rebuild")
        self.assertTrue(
            self.pool.check_rebuild_status(
                rs_errno=0, rs_done=1, rs_obj_nr=0, rs_rec_nr=0),
            "Invalid pool rebuild info detected before rebuild")

        # Create and open the container
        self.container.create()

        # Write data to the container for 30 seconds
        self.log.info(
            "Wrote %s bytes to container %s",
            self.container.execute_io(30, rank, obj_class),
            self.container.uuid)

        # Determine how many objects will need to be rebuilt
        self.container.get_target_rank_lists(" prior to rebuild")

        # Trigger rebuild by stopping the rank the data was written to
        self.server_managers[0].stop_ranks([rank], self.d_log)

        # Wait for recovery to start
        self.pool.wait_for_rebuild(True)

        # Set the container status property back to healthy so it remains
        # usable for I/O while the rebuild is in progress
        self.daos_cmd = DaosCommand(self.bin)
        self.daos_cmd.container_set_prop(
            pool=self.pool.uuid,
            cont=self.container.uuid,
            prop="status",
            value="healthy")

        # Write data to the container for another 30 seconds
        self.log.info(
            "Wrote an additional %s bytes to container %s",
            self.container.execute_io(30), self.container.uuid)

        # Wait for recovery to complete
        self.pool.wait_for_rebuild(False)

        # Check the pool information after the rebuild.  The original code
        # read "status = status = ..." (a duplicated assignment target);
        # a single assignment is intended and is equivalent at runtime.
        status = self.pool.check_pool_info(
            pi_nnodes=server_count,
            pi_ntargets=(server_count * targets),  # DAOS-2799
            pi_ndisabled=targets,                  # DAOS-2799
        )
        status &= self.pool.check_rebuild_status(
            rs_done=1, rs_obj_nr=">0", rs_rec_nr=">0", rs_errno=0)
        self.assertTrue(status, "Error confirming pool info after rebuild")

        # Verify the data after rebuild
        self.assertTrue(
            self.container.read_objects(),
            "Data verification error after rebuild")
        self.log.info("Test Passed")
class RbldDeleteObjects(RebuildTestBase):
    # pylint: disable=too-many-ancestors
    """Test class for deleting objects during pool rebuild.

    Test Class Description:
        This class contains tests for deleting objects from a container during
        rebuild.

    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a RebuildDeleteObjects object."""
        super().__init__(*args, **kwargs)
        # Indices of the objects/records punched during the rebuild
        self.punched_indices = None
        # Quantity of successful punches reported by the container
        self.punched_qty = 0
        # Either "object" or "record"; selected by each test method
        self.punch_type = None
        self.daos_cmd = None

    def execute_during_rebuild(self):
        """Delete half of the objects from the container during rebuild."""
        self.daos_cmd = DaosCommand(self.bin)
        self.daos_cmd.container_set_prop(
            pool=self.pool.uuid, cont=self.container.uuid, prop="status",
            value="healthy")

        if self.punch_type == "object":
            # Punch every odd-indexed object (half of the objects)
            object_total = self.container.object_qty.value
            self.punched_indices = list(range(1, object_total, 2))
            self.punched_qty = self.container.punch_objects(
                self.punched_indices)

        elif self.punch_type == "record":
            # Punch every odd-indexed record in each object
            record_total = self.container.record_qty.value
            self.punched_indices = list(range(1, record_total, 2))
            self.punched_qty = self.container.punch_records(
                self.punched_indices)

    def verify_container_data(self, txn=0):
        """Verify the container data.

        Args:
            txn (int, optional): transaction timestamp to read. Defaults to 0.
        """
        # Determine how many punches each punch type should have produced
        if self.punch_type == "object":
            expected_qty = len(self.punched_indices)
        elif self.punch_type == "record":
            # The same record indices were punched in every object
            expected_qty = (
                len(self.punched_indices) * self.container.object_qty.value)
        else:
            expected_qty = 0
        self.assertEqual(
            expected_qty, self.punched_qty,
            "Error punching {}s during rebuild: {}/{}".format(
                self.punch_type, self.punched_qty, expected_qty))

        # Read objects from the last transaction
        super().verify_container_data(txn)

    def test_rebuild_delete_objects(self):
        """JIRA ID: DAOS-2572.

        Test Description: Delete objects during rebuild. Rebuild should
        complete successfully and only the remaining data should be accessible
        and it should only exist on the rebuild target and non-excluded,
        original targets. The data in the deleted objects should not be
        accessible.

        Use Cases:
            foo

        :avocado: tags=all,full_regression
        :avocado: tags=large
        :avocado: tags=rebuild,delete_objects,rebuilddeleteobject
        """
        self.punch_type = "object"
        self.execute_rebuild_test()

    def test_rebuild_delete_records(self):
        """JIRA ID: DAOS-2574.

        Test Description: Delete records during rebuild. Rebuild should
        complete successfully and only the remaining data should be accessible
        and it should only exist on the rebuild target and non-excluded,
        original targets. The data in the deleted records should not be
        accessible.

        Use Cases:
            foo

        :avocado: tags=all,full_regression
        :avocado: tags=large
        :avocado: tags=rebuild,delete_objects,rebuilddeleterecord
        """
        self.punch_type = "record"
        self.execute_rebuild_test()