def tearDown(self): try: if self.agent_sessions: AgentUtils.stop_agent(self.hostlist, self.agent_sessions) server_utils.stop_server(hosts=self.hostlist) finally: # really make sure everything is gone check_for_pool.cleanup_pools(self.hostlist)
def tearDown(self): try: super(GlobalHandle, self).tearDown() finally: # really make sure everything is gone check_for_pool.cleanup_pools(self.hostlist_servers)
def test_rebuild_with_io(self): """ Test ID: Rebuild-003 Test Description: Trigger a rebuild while I/O is ongoing. Use Cases: -- single pool, single client performing continous read/write/verify sequence while failure/rebuild is triggered in another process :avocado: tags=pool,rebuild,rebuildwithio """ # the rebuild tests need to redo this stuff each time so not in setup # as it usually would be server_group = self.params.get("name", '/server_config/', 'daos_server') self.hostlist_servers = self.params.get("test_machines", '/run/hosts/') hostfile_servers = write_host_file.write_host_file( self.hostlist_servers, self.workdir) try: self.agent_sessions = agent_utils.run_agent(self.basepath, self.hostlist_servers) server_utils.run_server(hostfile_servers, server_group, self.basepath) # use the uid/gid of the user running the test, these should # be perfectly valid createuid = os.geteuid() creategid = os.getegid() # parameters used in pool create that are in yaml createmode = self.params.get("mode", '/run/testparams/createmode/') createsetid = self.params.get("setname", '/run/testparams/createset/') createsize = self.params.get("size", '/run/testparams/createsize/') # initialize a python pool object then create the underlying # daos storage pool = DaosPool(self.context) pool.create(createmode, createuid, creategid, createsize, createsetid, None) pool.connect(1 << 1) container = DaosContainer(self.context) container.create(pool.handle) container.open() # get pool status and make sure it all looks good before we start pool.pool_query() if pool.pool_info.pi_ndisabled != 0: self.fail("Number of disabled targets reporting incorrectly.\n") if pool.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("Rebuild error but rebuild hasn't run.\n") if pool.pool_info.pi_rebuild_st.rs_done != 1: self.fail("Rebuild is running but device hasn't failed yet.\n") if pool.pool_info.pi_rebuild_st.rs_obj_nr != 0: self.fail("Rebuilt objs not zero.\n") if pool.pool_info.pi_rebuild_st.rs_rec_nr != 0: self.fail("Rebuilt recs not zero.\n") dummy_pool_version = pool.pool_info.pi_rebuild_st.rs_version # do I/O for 30 seconds dummy_bw = io_utilities.continuous_io(container, 30) # trigger the rebuild rank = self.params.get("rank", '/run/testparams/ranks/*') server = DaosServer(self.context, server_group, rank) server.kill(1) pool.exclude([rank]) # do another 30 seconds of I/O, # waiting for some improvements in server bootstrap # at which point we can move the I/O to a separate client and # really pound it with I/O dummy_bw = io_utilities.continuous_io(container, 30) # wait for the rebuild to finish while True: pool.pool_query() if pool.pool_info.pi_rebuild_st.rs_done == 1: break else: time.sleep(2) # check rebuild statistics if pool.pool_info.pi_ndisabled != 1: self.fail("Number of disabled targets reporting incorrectly: {}" .format(pool.pool_info.pi_ndisabled)) if pool.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("Rebuild error reported: {}".format( pool.pool_info.pi_rebuild_st.rs_errno)) if pool.pool_info.pi_rebuild_st.rs_obj_nr <= 0: self.fail("No objects have been rebuilt.") if pool.pool_info.pi_rebuild_st.rs_rec_nr <= 0: self.fail("No records have been rebuilt.") except (ValueError, DaosApiError) as excep: print(excep) print(traceback.format_exc()) self.fail("Expecting to pass but test has failed.\n") finally: # wait for the I/O process to finish try: server_utils.stop_server(hosts=self.hostlist_servers) os.remove(hostfile_servers) # really make sure everything is gone check_for_pool.cleanup_pools(self.hostlist_servers) finally: if self.agent_sessions: agent_utils.stop_agent(self.agent_sessions) server_utils.kill_server(self.hostlist_servers)
def test_rebuild_with_io(self): """ Test ID: Rebuild-003 Test Description: Trigger a rebuild while I/O is ongoing. Use Cases: -- single pool, single client performing continous read/write/verify sequence while failure/rebuild is triggered in another process :avocado: tags=pool,rebuild,rebuildwithio """ # the rebuild tests need to redo this stuff each time so not in setup # as it usually would be server_group = self.params.get("name", '/server_config/', 'daos_server') basepath = os.path.normpath(self.build_paths['PREFIX'] + "/../") self.hostlist = self.params.get("test_machines", '/run/hosts/') hostfile = write_host_file.write_host_file(self.hostlist, self.workdir) try: self.agent_sessions = AgentUtils.run_agent(basepath, self.hostlist) server_utils.run_server(hostfile, server_group, basepath) # use the uid/gid of the user running the test, these should # be perfectly valid createuid = os.geteuid() creategid = os.getegid() # parameters used in pool create that are in yaml createmode = self.params.get("mode", '/run/testparams/createmode/') createsetid = self.params.get("setname", '/run/testparams/createset/') createsize = self.params.get("size", '/run/testparams/createsize/') # initialize a python pool object then create the underlying # daos storage pool = DaosPool(self.context) pool.create(createmode, createuid, creategid, createsize, createsetid, None) pool.connect(1 << 1) container = DaosContainer(self.context) container.create(pool.handle) container.open() # get pool status and make sure it all looks good before we start pool.pool_query() if pool.pool_info.pi_ndisabled != 0: self.fail("Number of disabled targets reporting incorrectly.\n") if pool.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("Rebuild error but rebuild hasn't run.\n") if pool.pool_info.pi_rebuild_st.rs_done != 1: self.fail("Rebuild is running but device hasn't failed yet.\n") if pool.pool_info.pi_rebuild_st.rs_obj_nr != 0: self.fail("Rebuilt objs not zero.\n") if pool.pool_info.pi_rebuild_st.rs_rec_nr != 0: self.fail("Rebuilt recs not zero.\n") dummy_pool_version = pool.pool_info.pi_rebuild_st.rs_version # do I/O for 30 seconds dummy_bw = io_utilities.continuous_io(container, 30) # trigger the rebuild rank = self.params.get("rank", '/run/testparams/ranks/*') server = DaosServer(self.context, server_group, rank) server.kill(1) pool.exclude([rank]) # do another 30 seconds of I/O, # waiting for some improvements in server bootstrap # at which point we can move the I/O to a separate client and # really pound it with I/O dummy_bw = io_utilities.continuous_io(container, 30) # wait for the rebuild to finish while True: pool.pool_query() if pool.pool_info.pi_rebuild_st.rs_done == 1: break else: time.sleep(2) # check rebuild statistics if pool.pool_info.pi_ndisabled != 1: self.fail("Number of disabled targets reporting incorrectly: {}" .format(pool.pool_info.pi_ndisabled)) if pool.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("Rebuild error reported: {}".format( pool.pool_info.pi_rebuild_st.rs_errno)) if pool.pool_info.pi_rebuild_st.rs_obj_nr <= 0: self.fail("No objects have been rebuilt.") if pool.pool_info.pi_rebuild_st.rs_rec_nr <= 0: self.fail("No records have been rebuilt.") except (ValueError, DaosApiError) as excep: print(excep) print(traceback.format_exc()) self.fail("Expecting to pass but test has failed.\n") finally: # wait for the I/O process to finish try: server_utils.stop_server(hosts=self.hostlist) os.remove(hostfile) # really make sure everything is gone check_for_pool.cleanup_pools(self.hostlist) finally: if self.agent_sessions: AgentUtils.stop_agent(self.hostlist, self.agent_sessions) server_utils.kill_server(self.hostlist)
def tearDown(self): try: # really make sure everything is gone check_for_pool.cleanup_pools(self.hostlist_servers) finally: super(RebuildTests, self).tearDown()
def test_multipool_rebuild(self): """ Test ID: Rebuild-002 Test Description: Expand on the basic test by rebuilding 2 pools at once. Use Cases: -- multipool rebuild, single client, various object and record counds :avocado: tags=pool,rebuild,rebuildmulti """ try: # initialize python pool object then create the underlying # daos storage, the way the code is now the pools should be # on the same storage and have the same service leader pool1 = DaosPool(self.context) pool2 = DaosPool(self.context) pool1.create(self.createmode, self.createuid, self.creategid, self.createsize, self.createsetid) pool2.create(self.createmode, self.createuid, self.creategid, self.createsize, self.createsetid) # want an open connection during rebuild pool1.connect(1 << 1) pool2.connect(1 << 1) # create containers container1 = DaosContainer(self.context) container1.create(pool1.handle) container2 = DaosContainer(self.context) container2.create(pool2.handle) # now open them container1.open() container2.open() # Putting the same data in both pools, at least for now to simplify # checking its correct saved_data = [] for _objc in range(self.objcount): obj = None for _recc in range(self.reccount): # make some stuff up and write dkey = (''.join( random.choice(string.ascii_uppercase + string.digits) for _ in range(5))) akey = (''.join( random.choice(string.ascii_uppercase + string.digits) for _ in range(5))) data = (''.join( random.choice(string.ascii_uppercase + string.digits) for _ in range(self.size))) # Used DAOS_OC_R1S_SPEC_RANK # 1 replica with specified rank obj, txn = container1.write_an_obj(data, len(data), dkey, akey, obj, self.rank, obj_cls=15) obj, txn = container2.write_an_obj(data, len(data), dkey, akey, obj, self.rank, obj_cls=15) saved_data.append((obj, dkey, akey, data, txn)) # read the data back and make sure its correct containers data2 = container1.read_an_obj(self.size, dkey, akey, obj, txn) if data != data2.value: self.fail( "Wrote data P1, read it back, didn't match\n") data2 = container2.read_an_obj(self.size, dkey, akey, obj, txn) if data != data2.value: self.fail( "Wrote data P2, read it back, didn't match\n") # kill a server server = DaosServer(self.context, self.server_group, self.rank) server.kill(1) # temporarily, the exclude of a failed target must be done # manually pool1.exclude([self.rank]) pool2.exclude([self.rank]) # check that rebuild finishes, no errors, progress data as # know it to be. Check pool 1 first then we'll check 2 below. while True: pool1.pool_query() if pool1.pool_info.pi_rebuild_st.rs_done == 1: break else: time.sleep(2) # check there are no errors and other data matches what we # apriori know to be true, if pool1.pool_info.pi_ndisabled != 1: self.fail( "P1 number disabled targets reporting incorrectly: {}". format(pool1.pool_info.pi_ndisabled)) if pool1.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("P1 rebuild error reported: {}".format( pool1.pool_info.pi_rebuild_st.rs_errno)) if pool1.pool_info.pi_rebuild_st.rs_obj_nr != self.objcount: self.fail("P1 rebuilt objs not as expected: {0} {1}".format( pool1.pool_info.pi_rebuild_st.rs_obj_nr, self.objcount)) if (pool1.pool_info.pi_rebuild_st.rs_rec_nr != (self.reccount * self.objcount)): self.fail("P1 rebuilt recs not as expected: {0} {1}".format( pool1.pool_info.pi_rebuild_st.rs_rec_nr, self.reccount * self.objcount)) # now that the rebuild finished verify the records are correct for tup in saved_data: data2 = container1.read_an_obj(len(tup[3]), tup[1], tup[2], tup[0], tup[4]) if tup[3] != data2.value: self.fail("after rebuild data didn't check out") # now check the other pool while True: pool2.pool_query() if pool2.pool_info.pi_rebuild_st.rs_done == 1: break else: time.sleep(2) # check there are no errors and other data matches what we # apriori know to be true if pool2.pool_info.pi_ndisabled != 1: self.fail( "Number disabled targets reporting incorrectly: {}".format( pool2.pool_info.pi_ndisabled)) if pool2.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("Rebuild error reported: {}".format( pool2.pool_info.pi_rebuild_st.rs_errno)) if pool2.pool_info.pi_rebuild_st.rs_obj_nr != self.objcount: self.fail("Rebuilt objs not as expected: {0} {1}".format( pool2.pool_info.pi_rebuild_st.rs_obj_nr, self.objcount)) if (pool2.pool_info.pi_rebuild_st.rs_rec_nr != (self.reccount * self.objcount)): self.fail("Rebuilt recs not as expected: {0} {1}".format( pool2.pool_info.pi_rebuild_st.rs_rec_nr, (self.reccount * self.objcount))) # now that the rebuild finished verify the records are correct for tup in saved_data: data2 = container2.read_an_obj(len(tup[3]), tup[1], tup[2], tup[0], tup[4]) if tup[3] != data2.value: self.fail("after rebuild data didn't check out") except DaosApiError as excp: print(excp) print(traceback.format_exc()) self.fail("Expecting to pass but test has failed.\n") finally: server_utils.stop_server(hosts=self.hostlist_servers) check_for_pool.cleanup_pools(self.hostlist_servers) server_utils.kill_server(self.hostlist_servers)