class PoolSvc(TestWithServers):
    """Tests svc argument while pool create.

    :avocado: recursive
    """
    def setUp(self):
        # Start agents and servers on the configured test machines before
        # each test; self.pool is created by the test itself, so start as
        # None to make tearDown safe if pool creation never happened.
        super(PoolSvc, self).setUp()
        self.pool = None
        self.hostfile_servers = None
        self.hostlist_servers = self.params.get("test_machines",
                                                '/run/hosts/*')
        self.hostfile_servers = write_host_file.write_host_file(
            self.hostlist_servers, self.workdir)
        print("Host file is: {}".format(self.hostfile_servers))
        self.agent_sessions = agent_utils.run_agent(self.basepath,
                                                    self.hostlist_servers)
        server_utils.run_server(self.hostfile_servers, self.server_group,
                                self.basepath)

    def tearDown(self):
        # Destroy the pool (if it was created and is still attached)
        # before the base class stops the agents/servers.
        try:
            if self.pool is not None and self.pool.attached:
                self.pool.destroy(1)
        finally:
            super(PoolSvc, self).tearDown()

    def test_poolsvc(self):
        """Test svc arg during pool create.

        :avocado: tags=pool,svc
        """
        # parameters used in pool create
        createmode = self.params.get("mode", '/run/createtests/createmode/*/')
        createuid = os.geteuid()
        creategid = os.getegid()
        createsetid = self.params.get("setname",
                                      '/run/createtests/createset/')
        createsize = self.params.get("size", '/run/createtests/createsize/')
        # createsvc comes from yaml as a pair:
        # [requested service replica count, expected result string]
        createsvc = self.params.get("svc", '/run/createtests/createsvc/*/')
        expected_result = createsvc[1]

        try:
            # initialize a python pool object then create the underlying
            # daos storage
            self.pool = DaosPool(self.context)
            self.pool.create(createmode, createuid, creategid,
                             createsize, createsetid, None, None,
                             createsvc[0])
            self.pool.connect(1 << 1)

            # checking returned rank list for server more than 1
            # (999999 appears to be an end-of-list sentinel in rl_ranks --
            # TODO confirm against the DaosPool API)
            iterator = 0
            while (int(self.pool.svc.rl_ranks[iterator]) > 0 and
                   int(self.pool.svc.rl_ranks[iterator]) <= createsvc[0] and
                   int(self.pool.svc.rl_ranks[iterator]) != 999999):
                iterator += 1
            if iterator != createsvc[0]:
                self.fail("Length of Returned Rank list is not equal to "
                          "the number of Pool Service members.\n")

            # the returned service replica ranks must all be distinct
            rank_list = []
            for iterator in range(createsvc[0]):
                rank_list.append(int(self.pool.svc.rl_ranks[iterator]))
            if len(rank_list) != len(set(rank_list)):
                self.fail("Duplicate values in returned rank list")

            if createsvc[0] == 3:
                # with 3 service replicas: kill the service leader via
                # daosctl, verify reconnect works, then hard-kill server
                # rank 2, exclude it, and verify reconnect again
                self.pool.disconnect()
                cmd = ('{0} kill-leader --uuid={1}'.format(
                    self.daosctl, self.pool.get_uuid_str()))
                process.system(cmd)
                self.pool.connect(1 << 1)
                self.pool.disconnect()
                server = DaosServer(self.context, self.server_group, 2)
                server.kill(1)
                self.pool.exclude([2])
                self.pool.connect(1 << 1)

            if expected_result in ['FAIL']:
                self.fail("Test was expected to fail but it passed.\n")

        except DaosApiError as excep:
            print(excep)
            print(traceback.format_exc())
            if expected_result == 'PASS':
                self.fail("Test was expected to pass but it failed.\n")
def test_rebuild_with_io(self):
    """
    Test ID: Rebuild-003
    Test Description: Trigger a rebuild while I/O is ongoing.

    Use Cases:
      -- single pool, single client performing continous read/write/verify
         sequence while failure/rebuild is triggered in another process

    :avocado: tags=pool,rebuild,rebuildwithio
    """
    # the rebuild tests need to redo this stuff each time so not in setup
    # as it usually would be
    server_group = self.params.get("name", '/server_config/',
                                   'daos_server')
    basepath = os.path.normpath(self.build_paths['PREFIX'] + "/../")
    self.hostlist = self.params.get("test_machines", '/run/hosts/')
    hostfile = write_host_file.write_host_file(self.hostlist, self.workdir)

    try:
        self.agent_sessions = AgentUtils.run_agent(basepath, self.hostlist)
        server_utils.run_server(hostfile, server_group, basepath)

        # use the uid/gid of the user running the test, these should
        # be perfectly valid
        createuid = os.geteuid()
        creategid = os.getegid()

        # parameters used in pool create that are in yaml
        createmode = self.params.get("mode", '/run/testparams/createmode/')
        createsetid = self.params.get("setname",
                                      '/run/testparams/createset/')
        createsize = self.params.get("size", '/run/testparams/createsize/')

        # initialize a python pool object then create the underlying
        # daos storage
        pool = DaosPool(self.context)
        pool.create(createmode, createuid, creategid, createsize,
                    createsetid, None)
        pool.connect(1 << 1)
        container = DaosContainer(self.context)
        container.create(pool.handle)
        container.open()

        # get pool status and make sure it all looks good before we start
        pool.pool_query()
        if pool.pool_info.pi_ndisabled != 0:
            self.fail("Number of disabled targets reporting incorrectly.\n")
        if pool.pool_info.pi_rebuild_st.rs_errno != 0:
            self.fail("Rebuild error but rebuild hasn't run.\n")
        if pool.pool_info.pi_rebuild_st.rs_done != 1:
            self.fail("Rebuild is running but device hasn't failed yet.\n")
        if pool.pool_info.pi_rebuild_st.rs_obj_nr != 0:
            self.fail("Rebuilt objs not zero.\n")
        if pool.pool_info.pi_rebuild_st.rs_rec_nr != 0:
            self.fail("Rebuilt recs not zero.\n")
        # dummy_ prefix: captured but intentionally unused
        dummy_pool_version = pool.pool_info.pi_rebuild_st.rs_version

        # do I/O for 30 seconds
        dummy_bw = io_utilities.continuous_io(container, 30)

        # trigger the rebuild by killing one server rank and excluding it
        rank = self.params.get("rank", '/run/testparams/ranks/*')
        server = DaosServer(self.context, server_group, rank)
        server.kill(1)
        pool.exclude([rank])

        # do another 30 seconds of I/O,
        # waiting for some improvements in server bootstrap
        # at which point we can move the I/O to a separate client and
        # really pound it with I/O
        dummy_bw = io_utilities.continuous_io(container, 30)

        # wait for the rebuild to finish, polling every 2 seconds
        while True:
            pool.pool_query()
            if pool.pool_info.pi_rebuild_st.rs_done == 1:
                break
            else:
                time.sleep(2)

        # check rebuild statistics
        if pool.pool_info.pi_ndisabled != 1:
            self.fail("Number of disabled targets reporting incorrectly: {}"
                      .format(pool.pool_info.pi_ndisabled))
        if pool.pool_info.pi_rebuild_st.rs_errno != 0:
            self.fail("Rebuild error reported: {}".format(
                pool.pool_info.pi_rebuild_st.rs_errno))
        if pool.pool_info.pi_rebuild_st.rs_obj_nr <= 0:
            self.fail("No objects have been rebuilt.")
        if pool.pool_info.pi_rebuild_st.rs_rec_nr <= 0:
            self.fail("No records have been rebuilt.")

    except (ValueError, DaosApiError) as excep:
        print(excep)
        print(traceback.format_exc())
        self.fail("Expecting to pass but test has failed.\n")

    finally:
        # wait for the I/O process to finish
        try:
            server_utils.stop_server(hosts=self.hostlist)
            os.remove(hostfile)
            # really make sure everything is gone
            check_for_pool.cleanup_pools(self.hostlist)
        finally:
            if self.agent_sessions:
                AgentUtils.stop_agent(self.hostlist, self.agent_sessions)
            server_utils.kill_server(self.hostlist)
class DestroyRebuild(Test):
    """
    Test Class Description:
    This test verifies destruction of a pool that is rebuilding.

    :avocado: tags=pool,pooldestroy,rebuild,desreb
    """

    # class-level defaults; real values are assigned in setUp()
    build_paths = []
    server_group = ""
    context = None
    pool = None
    hostfile = ""

    def setUp(self):
        """ setup for the test """
        self.agent_sessions = None
        # get paths from the build_vars generated by build
        with open('../../../.build_vars.json') as build_file:
            build_paths = json.load(build_file)
        self.context = DaosContext(build_paths['PREFIX'] + '/lib/')
        self.basepath = os.path.normpath(build_paths['PREFIX'] + "/../")

        # generate a hostfile
        self.hostlist = self.params.get("test_machines", '/run/hosts/')
        self.hostfile = write_host_file.write_host_file(self.hostlist,
                                                        self.workdir)

        # fire up the DAOS servers
        self.server_group = self.params.get("name", '/run/server_config/',
                                            'daos_server')
        self.agent_sessions = AgentUtils.run_agent(self.basepath,
                                                   self.hostlist)
        server_utils.run_server(self.hostfile, self.server_group,
                                build_paths['PREFIX'] + '/../')

        # create a pool to test with
        createmode = self.params.get("mode", '/run/pool/createmode/')
        createuid = self.params.get("uid", '/run/pool/createuid/')
        creategid = self.params.get("gid", '/run/pool/creategid/')
        createsetid = self.params.get("setname", '/run/pool/createset/')
        createsize = self.params.get("size", '/run/pool/createsize/')
        self.pool = DaosPool(self.context)
        self.pool.create(createmode, createuid, creategid, createsize,
                         createsetid)
        self.pool.get_uuid_str()

        # give the servers a moment to settle before the test starts
        time.sleep(2)

    def tearDown(self):
        """ cleanup after the test """
        try:
            os.remove(self.hostfile)
            if self.pool:
                self.pool.destroy(1)
        finally:
            if self.agent_sessions:
                AgentUtils.stop_agent(self.hostlist, self.agent_sessions)
            server_utils.stop_server(hosts=self.hostlist)

    def test_destroy_while_rebuilding(self):
        """
        :avocado: tags=pool,pooldestroy,rebuild,desreb
        """
        try:
            print("\nsetup complete, starting test\n")

            # create a server object that references on of our pool target
            # hosts and then kill it
            svr_to_kill = int(self.params.get("rank_to_kill",
                                              '/run/testparams/ranks/'))
            server = DaosServer(self.context, bytes(self.server_group),
                                svr_to_kill)
            print("created server ")

            # BUG if you don't connect the rebuild doesn't start correctly
            self.pool.connect(1 << 1)
            status = self.pool.pool_query()
            if not status.pi_ntargets == len(self.hostlist):
                self.fail("target count wrong.\n")
            if not status.pi_ndisabled == 0:
                self.fail("disabled target count wrong.\n")
            print("connect ")

            time.sleep(1)
            server.kill(1)
            print("killed server ")

            # exclude the target from the dead server
            self.pool.exclude([svr_to_kill])
            print("exclude target ")

            #self.pool.disconnect()
            #print "disconnect "

            # the rebuild won't take long since there is no data so do
            # the destroy quickly
            self.pool.destroy(1)
            print("destroy ")

        except DaosApiError as excep:
            print(excep)
            print(traceback.format_exc())
            self.fail("Expecting to pass but test has failed.\n")
class DestroyRebuild(Test):
    """
    Test Class Description:
    This test verifies destruction of a pool that is rebuilding.

    :avocado: recursive
    """

    # class-level defaults; real values are assigned in setUp()
    build_paths = []
    server_group = ""
    context = None
    pool = None
    hostfile_servers = ""

    def setUp(self):
        """ setup for the test """
        self.agent_sessions = None
        # get paths from the build_vars generated by build
        with open('../../../.build_vars.json') as build_file:
            build_paths = json.load(build_file)
        self.context = DaosContext(build_paths['PREFIX'] + '/lib/')
        self.basepath = os.path.normpath(build_paths['PREFIX'] + "/../")

        # generate a hostfile
        self.hostlist_servers = self.params.get("test_machines",
                                                '/run/hosts/')
        self.hostfile_servers = write_host_file.write_host_file(
            self.hostlist_servers, self.workdir)

        # fire up the DAOS servers
        self.server_group = self.params.get("name", '/run/server_config/',
                                            'daos_server')
        self.agent_sessions = agent_utils.run_agent(self.basepath,
                                                    self.hostlist_servers)
        server_utils.run_server(self.hostfile_servers, self.server_group,
                                build_paths['PREFIX'] + '/../')

        # create a pool to test with
        createmode = self.params.get("mode", '/run/pool/createmode/')
        createuid = self.params.get("uid", '/run/pool/createuid/')
        creategid = self.params.get("gid", '/run/pool/creategid/')
        createsetid = self.params.get("setname", '/run/pool/createset/')
        createsize = self.params.get("size", '/run/pool/createsize/')
        self.pool = DaosPool(self.context)
        self.pool.create(createmode, createuid, creategid, createsize,
                         createsetid)
        self.pool.get_uuid_str()

        # give the servers a moment to settle before the test starts
        time.sleep(2)

    def tearDown(self):
        """ cleanup after the test """
        try:
            os.remove(self.hostfile_servers)
            if self.pool:
                self.pool.destroy(1)
        finally:
            if self.agent_sessions:
                agent_utils.stop_agent(self.agent_sessions)
            server_utils.stop_server(hosts=self.hostlist_servers)

    def test_destroy_while_rebuilding(self):
        """
        :avocado: tags=pool,pooldestroy,rebuild,desreb
        """
        try:
            print("\nsetup complete, starting test\n")

            # create a server object that references on of our pool target
            # hosts and then kill it
            svr_to_kill = int(self.params.get("rank_to_kill",
                                              '/run/testparams/ranks/'))
            server = DaosServer(self.context, bytes(self.server_group),
                                svr_to_kill)
            print("created server ")

            # BUG if you don't connect the rebuild doesn't start correctly
            self.pool.connect(1 << 1)
            status = self.pool.pool_query()
            if not status.pi_ntargets == len(self.hostlist_servers):
                self.fail("target count wrong.\n")
            if not status.pi_ndisabled == 0:
                self.fail("disabled target count wrong.\n")
            print("connect ")

            time.sleep(1)
            server.kill(1)
            print("killed server ")

            # exclude the target from the dead server
            self.pool.exclude([svr_to_kill])
            print("exclude target ")

            #self.pool.disconnect()
            #print "disconnect "

            # the rebuild won't take long since there is no data so do
            # the destroy quickly
            self.pool.destroy(1)
            print("destroy ")

        except DaosApiError as excep:
            print(excep)
            print(traceback.format_exc())
            self.fail("Expecting to pass but test has failed.\n")
class RebuildNoCap(Test):
    """
    Test Class Description:
    This class contains tests for pool rebuild.

    :avocado: tags=pool,rebuild,nocap
    """

    # class-level defaults; real values are assigned in setUp()
    build_paths = []
    server_group = ""
    CONTEXT = None
    POOL = None
    hostfile = ""

    def setUp(self):
        """ setup for the test """
        # get paths from the build_vars generated by build
        with open('../../../.build_vars.json') as f:
            build_paths = json.load(f)
        self.CONTEXT = DaosContext(build_paths['PREFIX'] + '/lib/')

        # generate a hostfile
        self.host_list = self.params.get("test_machines", '/run/hosts/')
        tmp = build_paths['PREFIX'] + '/tmp'
        self.hostfile = WriteHostFile.WriteHostFile(self.host_list, tmp)

        # fire up the DAOS servers
        self.server_group = self.params.get("server_group", '/run/server/',
                                            'daos_server')
        ServerUtils.runServer(self.hostfile, self.server_group,
                              build_paths['PREFIX'] + '/../')
        time.sleep(3)

        # create a pool to test with
        createmode = self.params.get("mode", '/run/pool/createmode/')
        createuid = self.params.get("uid", '/run/pool/createuid/')
        creategid = self.params.get("gid", '/run/pool/creategid/')
        createsetid = self.params.get("setname", '/run/pool/createset/')
        createsize = self.params.get("size", '/run/pool/createsize/')
        self.POOL = DaosPool(self.CONTEXT)
        self.POOL.create(createmode, createuid, creategid, createsize,
                         createsetid)
        uuid = self.POOL.get_uuid_str()
        time.sleep(2)

        # stuff some bogus data into the pool
        # (int() replaces the Python-2-only long(); Python ints have
        # arbitrary precision so the behavior is unchanged)
        how_many_bytes = int(
            self.params.get("datasize", '/run/testparams/datatowrite/'))
        exepath = build_paths['PREFIX'] +\
            "/../src/tests/ftest/util/WriteSomeData.py"
        # NOTE(review): shell=True with interpolated values -- the inputs
        # come from the test yaml, not untrusted users, so left as-is
        cmd = "export DAOS_POOL={0}; export DAOS_SVCL=1; mpirun"\
              " --np 1 --host {1} {2} {3} testfile".format(
                  uuid, self.host_list[0], exepath, how_many_bytes)
        subprocess.call(cmd, shell=True)

    def tearDown(self):
        """ cleanup after the test """
        os.remove(self.hostfile)
        self.POOL.destroy(1)
        ServerUtils.stopServer()

    def test_rebuild_no_capacity(self):
        """
        :avocado: tags=pool,rebuild,nocap
        """
        try:
            # print() built-in instead of the Python-2 print statement,
            # consistent with the rest of the file (a print statement is a
            # SyntaxError under Python 3)
            print("\nsetup complete, starting test\n")

            # create a server object that references on of our pool target
            # hosts and then kill it
            svr_to_kill = int(
                self.params.get("rank_to_kill", '/run/testparams/ranks/'))
            sh = DaosServer(self.CONTEXT, bytes(self.server_group),
                            svr_to_kill)
            time.sleep(1)
            sh.kill(1)

            # exclude the target from the dead server
            self.POOL.exclude([svr_to_kill])

            # exclude should trigger rebuild, check
            self.POOL.connect(1 << 1)
            status = self.POOL.pool_query()
            if not status.pi_ntargets == len(self.host_list):
                self.fail("target count wrong.\n")
            if not status.pi_ndisabled == 1:
                self.fail("disabled target count wrong.\n")

            # the pool should be too full to start a rebuild so
            # expecting an error
            # not sure yet specifically what error
            # NOTE(review): indexes the rebuild status rather than using
            # the .rs_errno attribute as sibling tests do -- presumably [2]
            # is the errno field; confirm against the DaosPool API
            if status.pi_rebuild_st[2] == 0:
                self.fail("expecting rebuild to fail but it didn't.\n")

        except ValueError as e:
            # NOTE(review): only ValueError is caught here; sibling tests
            # also catch DaosApiError -- confirm whether that was intended
            print(e)
            print(traceback.format_exc())
            self.fail("Expecting to pass but test has failed.\n")
def test_simple_rebuild(self):
    """
    Test ID: Rebuild-001
    Test Description: The most basic rebuild test.

    Use Cases:
      -- single pool rebuild, single client, various reord/object counts

    :avocado: tags=pool,rebuild,rebuildsimple
    """
    # the rebuild tests need to redo this stuff each time so not in setup
    # as it usually would be
    setid = self.params.get("setname", '/run/testparams/setnames/')
    server_group = self.params.get("server_group", '/server/',
                                   'daos_server')
    basepath = os.path.normpath(self.build_paths['PREFIX'] + "/../")
    tmp = self.build_paths['PREFIX'] + '/tmp'
    self.hostlist = self.params.get("test_machines", '/run/hosts/')
    hostfile = WriteHostFile.WriteHostFile(self.hostlist, tmp)

    try:
        ServerUtils.runServer(hostfile, server_group, basepath)

        # use the uid/gid of the user running the test, these should
        # be perfectly valid
        createuid = os.geteuid()
        creategid = os.getegid()

        # parameters used in pool create that are in yaml
        createmode = self.params.get("mode", '/run/testparams/createmode/')
        createsetid = self.params.get("setname",
                                      '/run/testparams/createset/')
        createsize = self.params.get("size", '/run/testparams/createsize/')

        # initialize a python pool object then create the underlying
        # daos storage
        pool = DaosPool(self.Context)
        pool.create(createmode, createuid, creategid, createsize,
                    createsetid, None)

        # want an open connection during rebuild
        pool.connect(1 << 1)

        # get pool status we want to test later
        pool.pool_query()
        if pool.pool_info.pi_ndisabled != 0:
            self.fail(
                "Number of disabled targets reporting incorrectly.\n")
        if pool.pool_info.pi_rebuild_st.rs_errno != 0:
            self.fail("Rebuild error but rebuild hasn't run.\n")
        if pool.pool_info.pi_rebuild_st.rs_done != 1:
            self.fail("Rebuild is running but device hasn't failed yet.\n")
        if pool.pool_info.pi_rebuild_st.rs_obj_nr != 0:
            self.fail("Rebuilt objs not zero.\n")
        if pool.pool_info.pi_rebuild_st.rs_rec_nr != 0:
            self.fail("Rebuilt recs not zero.\n")
        # captured for reference; not asserted on below
        pool_version = pool.pool_info.pi_rebuild_st.rs_version

        # create a container
        container = DaosContainer(self.Context)
        container.create(pool.handle)

        # now open it
        container.open()

        # how many objects and records are we creating
        objcount = self.params.get("objcount",
                                   '/run/testparams/numobjects/*')
        reccount = self.params.get("reccount",
                                   '/run/testparams/numrecords/*')
        if objcount == 0:
            reccount = 0

        # which rank to write to and kill
        rank = self.params.get("rank", '/run/testparams/ranks/*')

        # how much data to write with each key
        size = self.params.get("size", '/run/testparams/datasize/')

        saved_data = []
        for i in range(0, objcount):
            obj = None
            for j in range(0, reccount):
                # make some stuff up and write
                dkey = ''.join(
                    random.choice(string.ascii_uppercase + string.digits)
                    for _ in range(5))
                akey = ''.join(
                    random.choice(string.ascii_uppercase + string.digits)
                    for _ in range(5))
                data = ''.join(
                    random.choice(string.ascii_uppercase + string.digits)
                    for _ in range(size))

                obj, tx = container.write_an_obj(data, len(data), dkey,
                                                 akey, obj, rank)
                saved_data.append((obj, dkey, akey, data, tx))

                # read the data back and make sure its correct
                data2 = container.read_an_obj(size, dkey, akey, obj, tx)
                if data != data2.value:
                    self.fail("Write data 1, read it back, didn't match\n")

        # kill a server that has
        server = DaosServer(self.Context, server_group, rank)
        server.kill(1)

        # temporarily, the exclude of a failed target must be done
        # manually
        pool.exclude([rank])

        while True:
            # get the pool/rebuild status again
            pool.pool_query()
            if pool.pool_info.pi_rebuild_st.rs_done == 1:
                break
            else:
                time.sleep(2)

        if pool.pool_info.pi_ndisabled != 1:
            self.fail(
                "Number of disabled targets reporting incorrectly: {}".
                format(pool.pool_info.pi_ndisabled))
        if pool.pool_info.pi_rebuild_st.rs_errno != 0:
            self.fail("Rebuild error reported: {}".format(
                pool.pool_info.pi_rebuild_st.rs_errno))
        if pool.pool_info.pi_rebuild_st.rs_obj_nr != objcount:
            self.fail("Rebuilt objs not as expected: {0} {1}".format(
                pool.pool_info.pi_rebuild_st.rs_obj_nr, objcount))
        if pool.pool_info.pi_rebuild_st.rs_rec_nr != (reccount * objcount):
            self.fail("Rebuilt recs not as expected: {0} {1}".format(
                pool.pool_info.pi_rebuild_st.rs_rec_nr,
                reccount * objcount))

        # now that the rebuild finished verify the records are correct
        for tup in saved_data:
            data2 = container.read_an_obj(len(tup[3]), tup[1], tup[2],
                                          tup[0], tup[4])
            if tup[3] != data2.value:
                self.fail("after rebuild data didn't check out")

    except DaosApiError as e:
        print(e)
        print(traceback.format_exc())
        self.fail("Expecting to pass but test has failed.\n")

    finally:
        try:
            ServerUtils.stopServer(hosts=self.hostlist)
            os.remove(hostfile)
            # really make sure everything is gone
            CheckForPool.CleanupPools(self.hostlist)
        finally:
            ServerUtils.killServer(self.hostlist)
class RebuildNoCap(TestWithServers):
    """
    Test Class Description:
    This class contains tests for pool rebuild.

    :avocado: recursive
    """

    def setUp(self):
        # Base class starts agents/servers; then create and pre-fill a
        # pool so that rebuild has no spare capacity to work with.
        super(RebuildNoCap, self).setUp()

        # create a pool to test with
        createmode = self.params.get("mode", '/run/pool/createmode/')
        createuid = self.params.get("uid", '/run/pool/createuid/')
        creategid = self.params.get("gid", '/run/pool/creategid/')
        createsetid = self.params.get("setname", '/run/pool/createset/')
        createsize = self.params.get("size", '/run/pool/createsize/')
        self.pool = DaosPool(self.context)
        self.pool.create(createmode, createuid, creategid, createsize,
                         createsetid)
        uuid = self.pool.get_uuid_str()
        time.sleep(2)

        # stuff some bogus data into the pool
        # (int() replaces the Python-2-only long(), which is a NameError
        # under Python 3; Python ints have arbitrary precision so the
        # behavior is unchanged)
        how_many_bytes = int(self.params.get("datasize",
                                             '/run/testparams/datatowrite/'))
        exepath = self.prefix +\
            "/../src/tests/ftest/util/write_some_data.py"
        # NOTE(review): shell=True with interpolated values -- the inputs
        # come from the test yaml, not untrusted users, so left as-is
        cmd = "export DAOS_POOL={0}; export DAOS_SVCL=1; mpirun"\
              " --np 1 --host {1} {2} {3} testfile".format(
                  uuid, self.hostlist_servers[0], exepath, how_many_bytes)
        subprocess.call(cmd, shell=True)

    def tearDown(self):
        """ cleanup after the test """
        # Destroy the pool (if created) before the base class stops the
        # agents/servers.
        try:
            if self.pool:
                self.pool.destroy(1)
        finally:
            super(RebuildNoCap, self).tearDown()

    def test_rebuild_no_capacity(self):
        """
        :avocado: tags=pool,rebuild,nocap
        """
        try:
            print("\nsetup complete, starting test\n")

            # create a server object that references on of our pool target
            # hosts and then kill it
            svr_to_kill = int(self.params.get("rank_to_kill",
                                              '/run/testparams/ranks/'))
            d_server = DaosServer(self.context, bytes(self.server_group),
                                  svr_to_kill)
            time.sleep(1)
            d_server.kill(1)

            # exclude the target from the dead server
            self.pool.exclude([svr_to_kill])

            # exclude should trigger rebuild, check
            self.pool.connect(1 << 1)
            status = self.pool.pool_query()
            if not status.pi_ntargets == len(self.hostlist_servers):
                self.fail("target count wrong.\n")
            if not status.pi_ndisabled == 1:
                self.fail("disabled target count wrong.\n")

            # the pool should be too full to start a rebuild so
            # expecting an error
            # not sure yet specifically what error
            if status.pi_rebuild_st.rs_errno == 0:
                self.fail("expecting rebuild to fail but it didn't.\n")

        except DaosApiError as excep:
            print(excep)
            print(traceback.format_exc())
            self.fail("Expecting to pass but test has failed.\n")
class PoolSvc(TestWithServers):
    """
    Tests svc argument while pool create.

    :avocado: recursive
    """
    def tearDown(self):
        # Destroy the pool (if it was created and is still attached)
        # before the base class stops the agents/servers.
        try:
            if self.pool is not None and self.pool.attached:
                self.pool.destroy(1)
        finally:
            super(PoolSvc, self).tearDown()

    def test_poolsvc(self):
        """
        Test svc arg during pool create.

        :avocado: tags=pool,svc
        """
        # parameters used in pool create
        createmode = self.params.get("mode", '/run/createtests/createmode/*/')
        createuid = os.geteuid()
        creategid = os.getegid()
        createsetid = self.params.get("setname",
                                      '/run/createtests/createset/')
        createsize = self.params.get("size", '/run/createtests/createsize/')
        # createsvc comes from yaml as a pair:
        # [requested service replica count, expected result string]
        createsvc = self.params.get("svc", '/run/createtests/createsvc/*/')
        expected_result = createsvc[1]

        try:
            # initialize a python pool object then create the underlying
            # daos storage
            self.pool = DaosPool(self.context)
            self.pool.create(createmode, createuid, creategid,
                             createsize, createsetid, None, None,
                             createsvc[0])
            self.pool.connect(1 << 1)

            # checking returned rank list for server more than 1
            # (999999 appears to be an end-of-list sentinel in rl_ranks --
            # TODO confirm against the DaosPool API)
            iterator = 0
            while (int(self.pool.svc.rl_ranks[iterator]) > 0 and
                   int(self.pool.svc.rl_ranks[iterator]) <= createsvc[0] and
                   int(self.pool.svc.rl_ranks[iterator]) != 999999):
                iterator += 1
            if iterator != createsvc[0]:
                self.fail("Length of Returned Rank list is not equal to "
                          "the number of Pool Service members.\n")

            # the returned service replica ranks must all be distinct
            rank_list = []
            for iterator in range(createsvc[0]):
                rank_list.append(int(self.pool.svc.rl_ranks[iterator]))
            if len(rank_list) != len(set(rank_list)):
                self.fail("Duplicate values in returned rank list")

            self.pool.pool_query()
            leader = self.pool.pool_info.pi_leader
            if createsvc[0] == 3:
                # kill pool leader and exclude it
                self.pool.pool_svc_stop()
                self.pool.exclude([leader])
                # perform pool disconnect, try connect again and disconnect
                self.pool.disconnect()
                self.pool.connect(1 << 1)
                self.pool.disconnect()
                # kill another server which is not a leader and exclude it
                server = DaosServer(self.context, self.server_group,
                                    leader - 1)
                server.kill(1)
                self.pool.exclude([leader - 1])
                # perform pool connect
                self.pool.connect(1 << 1)

            if expected_result in ['FAIL']:
                self.fail("Test was expected to fail but it passed.\n")

        except DaosApiError as excep:
            print(excep)
            print(traceback.format_exc())
            if expected_result == 'PASS':
                self.fail("Test was expected to pass but it failed.\n")
def test_multipool_rebuild(self):
    """
    Test ID: Rebuild-002
    Test Description: Expand on the basic test by rebuilding 2
    pools at once.

    Use Cases:
      -- multipool rebuild, single client, various object and record counds

    :avocado: tags=pool,rebuild,rebuildmulti
    """
    try:
        # initialize python pool object then create the underlying
        # daos storage, the way the code is now the pools should be
        # on the same storage and have the same service leader
        pool1 = DaosPool(self.context)
        pool2 = DaosPool(self.context)
        pool1.create(self.createmode, self.createuid, self.creategid,
                     self.createsize, self.createsetid)
        pool2.create(self.createmode, self.createuid, self.creategid,
                     self.createsize, self.createsetid)

        # want an open connection during rebuild
        pool1.connect(1 << 1)
        pool2.connect(1 << 1)

        # create containers
        container1 = DaosContainer(self.context)
        container1.create(pool1.handle)
        container2 = DaosContainer(self.context)
        container2.create(pool2.handle)

        # now open them
        container1.open()
        container2.open()

        # Putting the same data in both pools, at least for now to simplify
        # checking its correct
        saved_data = []
        for _objc in range(self.objcount):
            obj = None
            for _recc in range(self.reccount):
                # make some stuff up and write
                dkey = (
                    ''.join(random.choice(string.ascii_uppercase +
                                          string.digits)
                            for _ in range(5)))
                akey = (
                    ''.join(random.choice(string.ascii_uppercase +
                                          string.digits)
                            for _ in range(5)))
                data = (
                    ''.join(random.choice(string.ascii_uppercase +
                                          string.digits)
                            for _ in range(self.size)))

                # Used DAOS_OC_R1S_SPEC_RANK
                # 1 replica with specified rank
                obj, txn = container1.write_an_obj(data, len(data), dkey,
                                                   akey, obj, self.rank,
                                                   obj_cls=15)
                obj, txn = container2.write_an_obj(data, len(data), dkey,
                                                   akey, obj, self.rank,
                                                   obj_cls=15)
                saved_data.append((obj, dkey, akey, data, txn))

                # read the data back and make sure its correct containers
                data2 = container1.read_an_obj(self.size, dkey, akey, obj,
                                               txn)
                if data != data2.value:
                    self.fail("Wrote data P1, read it back, didn't match\n")
                data2 = container2.read_an_obj(self.size, dkey, akey, obj,
                                               txn)
                if data != data2.value:
                    self.fail("Wrote data P2, read it back, didn't match\n")

        # kill a server
        server = DaosServer(self.context, self.server_group, self.rank)
        server.kill(1)

        # temporarily, the exclude of a failed target must be done
        # manually
        pool1.exclude([self.rank])
        pool2.exclude([self.rank])

        # check that rebuild finishes, no errors, progress data as
        # know it to be. Check pool 1 first then we'll check 2 below.
        while True:
            pool1.pool_query()
            if pool1.pool_info.pi_rebuild_st.rs_done == 1:
                break
            else:
                time.sleep(2)

        # check there are no errors and other data matches what we
        # apriori know to be true,
        if pool1.pool_info.pi_ndisabled != 1:
            self.fail("P1 number disabled targets reporting incorrectly: {}"
                      .format(pool1.pool_info.pi_ndisabled))
        if pool1.pool_info.pi_rebuild_st.rs_errno != 0:
            self.fail("P1 rebuild error reported: {}"
                      .format(pool1.pool_info.pi_rebuild_st.rs_errno))
        if pool1.pool_info.pi_rebuild_st.rs_obj_nr != self.objcount:
            self.fail("P1 rebuilt objs not as expected: {0} {1}"
                      .format(pool1.pool_info.pi_rebuild_st.rs_obj_nr,
                              self.objcount))
        if (pool1.pool_info.pi_rebuild_st.rs_rec_nr !=
                (self.reccount*self.objcount)):
            self.fail("P1 rebuilt recs not as expected: {0} {1}"
                      .format(pool1.pool_info.pi_rebuild_st.rs_rec_nr,
                              self.reccount*self.objcount))

        # now that the rebuild finished verify the records are correct
        for tup in saved_data:
            data2 = container1.read_an_obj(len(tup[3]), tup[1], tup[2],
                                           tup[0], tup[4])
            if tup[3] != data2.value:
                self.fail("after rebuild data didn't check out")

        # now check the other pool
        while True:
            pool2.pool_query()
            if pool2.pool_info.pi_rebuild_st.rs_done == 1:
                break
            else:
                time.sleep(2)

        # check there are no errors and other data matches what we
        # apriori know to be true
        if pool2.pool_info.pi_ndisabled != 1:
            self.fail("Number disabled targets reporting incorrectly: {}"
                      .format(pool2.pool_info.pi_ndisabled))
        if pool2.pool_info.pi_rebuild_st.rs_errno != 0:
            self.fail("Rebuild error reported: {}"
                      .format(pool2.pool_info.pi_rebuild_st.rs_errno))
        if pool2.pool_info.pi_rebuild_st.rs_obj_nr != self.objcount:
            self.fail("Rebuilt objs not as expected: {0} {1}"
                      .format(pool2.pool_info.pi_rebuild_st.rs_obj_nr,
                              self.objcount))
        if (pool2.pool_info.pi_rebuild_st.rs_rec_nr !=
                (self.reccount*self.objcount)):
            self.fail("Rebuilt recs not as expected: {0} {1}".
                      format(pool2.pool_info.pi_rebuild_st.rs_rec_nr,
                             (self.reccount*self.objcount)))

        # now that the rebuild finished verify the records are correct
        for tup in saved_data:
            data2 = container2.read_an_obj(len(tup[3]), tup[1], tup[2],
                                           tup[0], tup[4])
            if tup[3] != data2.value:
                self.fail("after rebuild data didn't check out")

    except DaosApiError as excp:
        print (excp)
        print (traceback.format_exc())
        self.fail("Expecting to pass but test has failed.\n")
def test_simple_rebuild(self):
    """
    Test ID: Rebuild-001
    Test Description: The most basic rebuild test.

    Use Cases:
      -- single pool rebuild, single client, various reord/object counts

    :avocado: tags=pool,rebuild,rebuildsimple
    """
    try:
        # initialize a python pool object then create the underlying
        # daos storage
        pool = DaosPool(self.context)
        pool.create(self.createmode, self.createuid, self.creategid,
                    self.createsize, self.createsetid)

        # want an open connection during rebuild
        pool.connect(1 << 1)

        # get pool status we want to test later
        pool.pool_query()
        if pool.pool_info.pi_ndisabled != 0:
            self.fail("Number of disabled targets reporting incorrectly.\n")
        if pool.pool_info.pi_rebuild_st.rs_errno != 0:
            self.fail("Rebuild error but rebuild hasn't run.\n")
        if pool.pool_info.pi_rebuild_st.rs_done != 1:
            self.fail("Rebuild is running but device hasn't failed yet.\n")
        if pool.pool_info.pi_rebuild_st.rs_obj_nr != 0:
            self.fail("Rebuilt objs not zero.\n")
        if pool.pool_info.pi_rebuild_st.rs_rec_nr != 0:
            self.fail("Rebuilt recs not zero.\n")

        # create a container
        container = DaosContainer(self.context)
        container.create(pool.handle)

        # now open it
        container.open()

        saved_data = []
        for _objc in range(self.objcount):
            obj = None
            for _recc in range(self.reccount):
                # make some stuff up and write
                dkey = (
                    ''.join(random.choice(string.ascii_uppercase +
                                          string.digits)
                            for _ in range(5)))
                akey = (
                    ''.join(random.choice(string.ascii_uppercase +
                                          string.digits)
                            for _ in range(5)))
                data = (''.join(random.choice(string.ascii_uppercase +
                                              string.digits)
                                for _ in range(self.size)))

                obj, txn = container.write_an_obj(data, len(data), dkey,
                                                  akey, obj, self.rank,
                                                  obj_cls=16)
                saved_data.append((obj, dkey, akey, data, txn))

                # read the data back and make sure its correct
                data2 = container.read_an_obj(self.size, dkey, akey, obj,
                                              txn)
                if data != data2.value:
                    self.fail("Write data 1, read it back, didn't match\n")

        # kill a server that has
        server = DaosServer(self.context, self.server_group, self.rank)
        server.kill(1)

        # temporarily, the exclude of a failed target must be done manually
        pool.exclude([self.rank])

        while True:
            # get the pool/rebuild status again
            pool.pool_query()
            if pool.pool_info.pi_rebuild_st.rs_done == 1:
                break
            else:
                time.sleep(2)

        if pool.pool_info.pi_ndisabled != 1:
            self.fail("Number of disabled targets reporting incorrectly: {}"
                      .format(pool.pool_info.pi_ndisabled))
        if pool.pool_info.pi_rebuild_st.rs_errno != 0:
            self.fail("Rebuild error reported: {}"
                      .format(pool.pool_info.pi_rebuild_st.rs_errno))
        if pool.pool_info.pi_rebuild_st.rs_obj_nr != self.objcount:
            self.fail("Rebuilt objs not as expected: {0} {1}"
                      .format(pool.pool_info.pi_rebuild_st.rs_obj_nr,
                              self.objcount))
        if (pool.pool_info.pi_rebuild_st.rs_rec_nr !=
                (self.reccount*self.objcount)):
            self.fail("Rebuilt recs not as expected: {0} {1}"
                      .format(pool.pool_info.pi_rebuild_st.rs_rec_nr,
                              self.reccount*self.objcount))

        # now that the rebuild finished verify the records are correct
        for tup in saved_data:
            data2 = container.read_an_obj(len(tup[3]), tup[1], tup[2],
                                          tup[0], tup[4])
            if tup[3] != data2.value:
                self.fail("after rebuild data didn't check out")

    except DaosApiError as excp:
        print (excp)
        print (traceback.format_exc())
        self.fail("Expecting to pass but test has failed.\n")
    def test_simple_rebuild(self):
        """
        Test ID: Rebuild-001
        Test Description: The most basic rebuild test.
        Use Cases:
          -- single pool rebuild, single client, various reord/object counts

        :avocado: tags=pool,rebuild,rebuildsimple
        """
        # NOTE(review): this is a near-duplicate of another
        # test_simple_rebuild definition in this file (reformatted copy);
        # consider consolidating -- confirm which one is current.
        try:
            # initialize a python pool object then create the underlying
            # daos storage (create parameters come from the test yaml)
            pool = DaosPool(self.context)
            pool.create(self.createmode, self.createuid, self.creategid,
                        self.createsize, self.createsetid)

            # want an open connection during rebuild
            pool.connect(1 << 1)

            # Get pool status; verify the pool is healthy before the
            # failure is injected.
            pool.pool_query()
            if pool.pool_info.pi_ndisabled != 0:
                self.fail(
                    "Number of disabled targets reporting incorrectly.\n")
            if pool.pool_info.pi_rebuild_st.rs_errno != 0:
                self.fail("Rebuild error but rebuild hasn't run.\n")
            if pool.pool_info.pi_rebuild_st.rs_done != 1:
                self.fail("Rebuild is running but device hasn't failed yet.\n")
            if pool.pool_info.pi_rebuild_st.rs_obj_nr != 0:
                self.fail("Rebuilt objs not zero.\n")
            if pool.pool_info.pi_rebuild_st.rs_rec_nr != 0:
                self.fail("Rebuilt recs not zero.\n")

            # create a container
            container = DaosContainer(self.context)
            container.create(pool.handle)

            # now open it
            container.open()

            # Write objcount objects x reccount records of random data,
            # saving everything written for post-rebuild verification.
            saved_data = []
            for _objc in range(self.objcount):
                obj = None
                for _recc in range(self.reccount):
                    # make some stuff up and write
                    dkey = (''.join(
                        random.choice(string.ascii_uppercase + string.digits)
                        for _ in range(5)))
                    akey = (''.join(
                        random.choice(string.ascii_uppercase + string.digits)
                        for _ in range(5)))
                    data = (''.join(
                        random.choice(string.ascii_uppercase + string.digits)
                        for _ in range(self.size)))

                    # obj_cls=16: object class id; object pinned to self.rank
                    obj, txn = container.write_an_obj(data, len(data), dkey,
                                                      akey, obj, self.rank,
                                                      obj_cls=16)

                    saved_data.append((obj, dkey, akey, data, txn))

                    # read the data back and make sure its correct
                    data2 = container.read_an_obj(self.size, dkey, akey, obj,
                                                  txn)
                    if data != data2.value:
                        self.fail("Write data 1, read it back, didn't match\n")

            # kill the server (rank) holding the written objects
            server = DaosServer(self.context, self.server_group, self.rank)
            server.kill(1)

            # temporarily, the exclude of a failed target must be done manually
            pool.exclude([self.rank])

            # poll every 2s until the server reports rebuild done
            while True:
                # get the pool/rebuild status again
                pool.pool_query()
                if pool.pool_info.pi_rebuild_st.rs_done == 1:
                    break
                else:
                    time.sleep(2)

            # Verify rebuild statistics match what was written.
            if pool.pool_info.pi_ndisabled != 1:
                self.fail(
                    "Number of disabled targets reporting incorrectly: {}".
                    format(pool.pool_info.pi_ndisabled))
            if pool.pool_info.pi_rebuild_st.rs_errno != 0:
                self.fail("Rebuild error reported: {}".format(
                    pool.pool_info.pi_rebuild_st.rs_errno))
            if pool.pool_info.pi_rebuild_st.rs_obj_nr != self.objcount:
                self.fail("Rebuilt objs not as expected: {0} {1}".format(
                    pool.pool_info.pi_rebuild_st.rs_obj_nr, self.objcount))
            if (pool.pool_info.pi_rebuild_st.rs_rec_nr !=
                    (self.reccount * self.objcount)):
                self.fail("Rebuilt recs not as expected: {0} {1}".format(
                    pool.pool_info.pi_rebuild_st.rs_rec_nr,
                    self.reccount * self.objcount))

            # now that the rebuild finished verify the records are correct
            for tup in saved_data:
                data2 = container.read_an_obj(len(tup[3]), tup[1], tup[2],
                                              tup[0], tup[4])
                if tup[3] != data2.value:
                    self.fail("after rebuild data didn't check out")

        except DaosApiError as excp:
            # Any DAOS API failure is a test failure; dump the traceback.
            print(excp)
            print(traceback.format_exc())
            self.fail("Expecting to pass but test has failed.\n")
    def test_multipool_rebuild(self):
        """
        Test ID: Rebuild-002
        Test Description: Expand on the basic test by rebuilding 2
        pools at once.
        Use Cases:
          -- multipool rebuild, single client, various object and record counds

        :avocado: tags=pool,rebuild,rebuildmulti
        """
        try:
            # initialize python pool objects then create the underlying
            # daos storage, the way the code is now the pools should be
            # on the same storage and have the same service leader
            pool1 = DaosPool(self.context)
            pool2 = DaosPool(self.context)
            pool1.create(self.createmode, self.createuid, self.creategid,
                         self.createsize, self.createsetid)
            pool2.create(self.createmode, self.createuid, self.creategid,
                         self.createsize, self.createsetid)

            # want an open connection during rebuild
            pool1.connect(1 << 1)
            pool2.connect(1 << 1)

            # create containers
            container1 = DaosContainer(self.context)
            container1.create(pool1.handle)
            container2 = DaosContainer(self.context)
            container2.create(pool2.handle)

            # now open them
            container1.open()
            container2.open()

            # Putting the same data in both pools, at least for now to simplify
            # checking its correct
            saved_data = []
            for _objc in range(self.objcount):
                obj = None
                for _recc in range(self.reccount):
                    # make some stuff up and write (random 5-char keys,
                    # self.size bytes of random payload)
                    dkey = (''.join(
                        random.choice(string.ascii_uppercase + string.digits)
                        for _ in range(5)))
                    akey = (''.join(
                        random.choice(string.ascii_uppercase + string.digits)
                        for _ in range(5)))
                    data = (''.join(
                        random.choice(string.ascii_uppercase + string.digits)
                        for _ in range(self.size)))

                    # Used DAOS_OC_R1S_SPEC_RANK
                    # 1 replica with specified rank
                    obj, txn = container1.write_an_obj(data, len(data), dkey,
                                                       akey, obj, self.rank,
                                                       obj_cls=15)
                    obj, txn = container2.write_an_obj(data, len(data), dkey,
                                                       akey, obj, self.rank,
                                                       obj_cls=15)
                    saved_data.append((obj, dkey, akey, data, txn))

                    # read the data back and make sure its correct in both
                    # containers
                    data2 = container1.read_an_obj(self.size, dkey, akey,
                                                   obj, txn)
                    if data != data2.value:
                        self.fail(
                            "Wrote data P1, read it back, didn't match\n")
                    data2 = container2.read_an_obj(self.size, dkey, akey,
                                                   obj, txn)
                    if data != data2.value:
                        self.fail(
                            "Wrote data P2, read it back, didn't match\n")

            # kill the server (rank) holding the replicas; this triggers
            # rebuild in both pools at once
            server = DaosServer(self.context, self.server_group, self.rank)
            server.kill(1)

            # temporarily, the exclude of a failed target must be done
            # manually
            pool1.exclude([self.rank])
            pool2.exclude([self.rank])

            # check that rebuild finishes, no errors, progress data as
            # know it to be.  Check pool 1 first then we'll check 2 below.
            while True:
                pool1.pool_query()
                if pool1.pool_info.pi_rebuild_st.rs_done == 1:
                    break
                else:
                    time.sleep(2)

            # check there are no errors and other data matches what we
            # apriori know to be true,
            if pool1.pool_info.pi_ndisabled != 1:
                self.fail(
                    "P1 number disabled targets reporting incorrectly: {}".
                    format(pool1.pool_info.pi_ndisabled))
            if pool1.pool_info.pi_rebuild_st.rs_errno != 0:
                self.fail("P1 rebuild error reported: {}".format(
                    pool1.pool_info.pi_rebuild_st.rs_errno))
            if pool1.pool_info.pi_rebuild_st.rs_obj_nr != self.objcount:
                self.fail("P1 rebuilt objs not as expected: {0} {1}".format(
                    pool1.pool_info.pi_rebuild_st.rs_obj_nr, self.objcount))
            if (pool1.pool_info.pi_rebuild_st.rs_rec_nr !=
                    (self.reccount * self.objcount)):
                self.fail("P1 rebuilt recs not as expected: {0} {1}".format(
                    pool1.pool_info.pi_rebuild_st.rs_rec_nr,
                    self.reccount * self.objcount))

            # now that the rebuild finished verify the records are correct
            for tup in saved_data:
                data2 = container1.read_an_obj(len(tup[3]), tup[1], tup[2],
                                               tup[0], tup[4])
                if tup[3] != data2.value:
                    self.fail("after rebuild data didn't check out")

            # now check the other pool (same polling and checks as pool 1)
            while True:
                pool2.pool_query()
                if pool2.pool_info.pi_rebuild_st.rs_done == 1:
                    break
                else:
                    time.sleep(2)

            # check there are no errors and other data matches what we
            # apriori know to be true
            if pool2.pool_info.pi_ndisabled != 1:
                self.fail(
                    "Number disabled targets reporting incorrectly: {}".format(
                        pool2.pool_info.pi_ndisabled))
            if pool2.pool_info.pi_rebuild_st.rs_errno != 0:
                self.fail("Rebuild error reported: {}".format(
                    pool2.pool_info.pi_rebuild_st.rs_errno))
            if pool2.pool_info.pi_rebuild_st.rs_obj_nr != self.objcount:
                self.fail("Rebuilt objs not as expected: {0} {1}".format(
                    pool2.pool_info.pi_rebuild_st.rs_obj_nr, self.objcount))
            if (pool2.pool_info.pi_rebuild_st.rs_rec_nr !=
                    (self.reccount * self.objcount)):
                self.fail("Rebuilt recs not as expected: {0} {1}".format(
                    pool2.pool_info.pi_rebuild_st.rs_rec_nr,
                    (self.reccount * self.objcount)))

            # now that the rebuild finished verify the records are correct
            for tup in saved_data:
                data2 = container2.read_an_obj(len(tup[3]), tup[1], tup[2],
                                               tup[0], tup[4])
                if tup[3] != data2.value:
                    self.fail("after rebuild data didn't check out")

        except DaosApiError as excp:
            # Any DAOS API failure is a test failure; dump the traceback.
            print(excp)
            print(traceback.format_exc())
            self.fail("Expecting to pass but test has failed.\n")

        finally:
            # Always tear down the servers and clean up leftover pool
            # files so later tests start from a clean slate.
            server_utils.stop_server(hosts=self.hostlist_servers)
            check_for_pool.cleanup_pools(self.hostlist_servers)
            server_utils.kill_server(self.hostlist_servers)
class TestPool(TestDaosApiBase):
    """A class for functional testing of DaosPools objects."""

    def __init__(self, context, log, cb_handler=None):
        """Initialize a TestPool object.

        Args:
            context (DaosContext): context in which to create the pool
            log (logging): logging object used to report the pool status
            cb_handler (CallbackHandler, optional): callback object to use
                with the API methods. Defaults to None.
        """
        super(TestPool, self).__init__(cb_handler)
        self.context = context
        self.log = log
        # Pool ownership defaults to the user running the test
        self.uid = os.geteuid()
        self.gid = os.getegid()
        # Pool create parameters, populated from the yaml by get_params()
        self.mode = TestParameter(None)
        self.name = TestParameter(None)
        self.group = TestParameter(None)
        self.svcn = TestParameter(None)
        self.target_list = TestParameter(None)
        self.scm_size = TestParameter(None)
        self.nvme_size = TestParameter(None)
        # Runtime state managed by create()/connect()/destroy()
        self.pool = None
        self.uuid = None
        self.info = None
        self.connected = False

    def get_params(self, test, path="/run/pool/*"):
        """Get the pool parameters from the yaml file.

        Args:
            test (Test): avocado Test object
            path (str, optional): yaml namespace. Defaults to "/run/pool/*".
        """
        super(TestPool, self).get_params(test, path)

    @fail_on(DaosApiError)
    def create(self):
        """Create a pool.

        Destroys an existing pool if defined and assigns self.pool and
        self.uuid.
        """
        self.destroy()
        self.log.info("Creating a pool")
        self.pool = DaosPool(self.context)
        kwargs = {
            "mode": self.mode.value, "uid": self.uid, "gid": self.gid,
            "scm_size": self.scm_size.value, "group": self.name.value}
        # Only pass the optional arguments that have been set in the yaml
        for key in ("target_list", "svcn", "nvme_size"):
            value = getattr(self, key).value
            if value:
                kwargs[key] = value
        self._call_method(self.pool.create, kwargs)
        self.uuid = self.pool.get_uuid_str()

    @fail_on(DaosApiError)
    def connect(self, permission=1):
        """Connect to the pool.

        Args:
            permission (int, optional): connect permission. Defaults to 1.

        Returns:
            bool: True if the pool has been connected; False if the pool was
                already connected or the pool is not defined.
        """
        if self.pool and not self.connected:
            # The DAOS API takes the permission as a bit flag
            kwargs = {"flags": 1 << permission}
            self.log.info(
                "Connecting to pool %s with permission %s (flag: %s)",
                self.uuid, permission, kwargs["flags"])
            self._call_method(self.pool.connect, kwargs)
            self.connected = True
            return True
        return False

    @fail_on(DaosApiError)
    def disconnect(self):
        """Disconnect from connected pool.

        Returns:
            bool: True if the pool has been disconnected; False if the pool
                was already disconnected or the pool is not defined.
        """
        if self.pool and self.connected:
            # Fixed log message typo ("Disonnecting")
            self.log.info("Disconnecting from pool %s", self.uuid)
            self._call_method(self.pool.disconnect, {})
            self.connected = False
            return True
        return False

    @fail_on(DaosApiError)
    def destroy(self, force=1):
        """Destroy the pool.

        Args:
            force (int, optional): force flag. Defaults to 1.

        Returns:
            bool: True if the pool has been destroyed; False if the pool is
                not defined.
        """
        if self.pool:
            self.disconnect()
            self.log.info("Destroying pool %s", self.uuid)
            self._call_method(self.pool.destroy, {"force": force})
            self.pool = None
            self.uuid = None
            self.info = None
            return True
        return False

    @fail_on(DaosApiError)
    def get_info(self):
        """Query the pool for information.

        Sets the self.info attribute.
        """
        if self.pool:
            self.connect()
            self._call_method(self.pool.pool_query, {})
            self.info = self.pool.pool_info

    def check_pool_info(self, pi_uuid=None, pi_ntargets=None, pi_nnodes=None,
                        pi_ndisabled=None, pi_map_ver=None, pi_leader=None,
                        pi_bits=None):
        # pylint: disable=unused-argument
        """Check the pool info attributes.

        Args:
            pi_uuid (str, optional): pool uuid. Defaults to None.
            pi_ntargets (int, optional): number of targets. Defaults to None.
            pi_nnodes (int, optional): number of nodes. Defaults to None.
            pi_ndisabled (int, optional): number of disabled. Defaults to None.
            pi_map_ver (int, optional): pool map version. Defaults to None.
            pi_leader (int, optional): pool leader. Defaults to None.
            pi_bits (int, optional): pool bits. Defaults to None.

        Returns:
            bool: True if at least one expected value is specified and all the
                specified values match; False otherwise
        """
        self.get_info()
        # The keyword arguments are read back through locals() so that each
        # named attribute can be compared against self.info generically.
        checks = [
            (key,
             c_uuid_to_str(getattr(self.info, key))
             if key == "pi_uuid" else getattr(self.info, key),
             val)
            for key, val in locals().items()
            if key != "self" and val is not None]
        return self._check_info(checks)

    def check_pool_space(self, ps_free_min=None, ps_free_max=None,
                         ps_free_mean=None, ps_ntargets=None, ps_padding=None):
        # pylint: disable=unused-argument
        """Check the pool info space attributes.

        Args:
            ps_free_min (list, optional): minimum free space per device.
                Defaults to None.
            ps_free_max (list, optional): maximum free space per device.
                Defaults to None.
            ps_free_mean (list, optional): mean free space per device.
                Defaults to None.
            ps_ntargets (int, optional): number of targets. Defaults to None.
            ps_padding (int, optional): space padding. Defaults to None.

        Returns:
            bool: True if at least one expected value is specified and all the
                specified values match; False otherwise
        """
        self.get_info()
        checks = []
        for key in ("ps_free_min", "ps_free_max", "ps_free_mean"):
            val = locals()[key]
            if isinstance(val, list):
                # Fix: the expected values are a plain list, so pair each
                # one with its index via enumerate() (the original unpacked
                # the list items directly, raising TypeError).
                for index, item in enumerate(val):
                    checks.append((
                        "{}[{}]".format(key, index),
                        getattr(self.info.pi_space, key)[index],
                        item))
        for key in ("ps_ntargets", "ps_padding"):
            val = locals()[key]
            if val is not None:
                # Fix: append a single tuple (the original passed three
                # positional arguments to list.append, raising TypeError).
                checks.append((key, getattr(self.info.pi_space, key), val))
        return self._check_info(checks)

    def check_pool_daos_space(self, s_total=None, s_free=None):
        # pylint: disable=unused-argument
        """Check the pool info daos space attributes.

        Args:
            s_total (list, optional): total space per device. Defaults to
                None.
            s_free (list, optional): free space per device. Defaults to None.

        Returns:
            bool: True if at least one expected value is specified and all the
                specified values match; False otherwise
        """
        self.get_info()
        checks = [
            ("{}_{}".format(key, index),
             getattr(self.info.pi_space.ps_space, key)[index],
             item)
            for key, val in locals().items()
            if key != "self" and val is not None
            for index, item in enumerate(val)]
        return self._check_info(checks)

    def check_rebuild_status(self, rs_version=None, rs_pad_32=None,
                             rs_errno=None, rs_done=None,
                             rs_toberb_obj_nr=None, rs_obj_nr=None,
                             rs_rec_nr=None):
        # pylint: disable=unused-argument
        """Check the pool info rebuild attributes.

        Args:
            rs_version (int, optional): rebuild version. Defaults to None.
            rs_pad_32 (int, optional): rebuild pad. Defaults to None.
            rs_errno (int, optional): rebuild error number. Defaults to None.
            rs_done (int, optional): rebuild done flag. Defaults to None.
            rs_toberb_obj_nr (int, optional): number of objects to be rebuilt.
                Defaults to None.
            rs_obj_nr (int, optional): number of rebuilt objects.
                Defaults to None.
            rs_rec_nr (int, optional): number of rebuilt records.
                Defaults to None.

        Returns:
            bool: True if at least one expected value is specified and all the
                specified values match; False otherwise
        """
        self.get_info()
        checks = [
            (key, getattr(self.info.pi_rebuild_st, key), val)
            for key, val in locals().items()
            if key != "self" and val is not None]
        return self._check_info(checks)

    def _check_info(self, check_list):
        """Verify each pool info attribute value matches an expected value.

        Args:
            check_list (list): a list of tuples containing the name of the
                pool information attribute to check, the current value of the
                attribute, and the expected value of the attribute.

        Returns:
            bool: True if at least one check has been specified and all the
                actual and expected values match; False otherwise.
        """
        check_status = len(check_list) > 0
        for check, actual, expect in check_list:
            self.log.info(
                "Verifying the pool %s: %s ?= %s", check, actual, expect)
            if actual != expect:
                msg = "The {} does not match: actual: {}, expected: {}".format(
                    check, actual, expect)
                self.log.error(msg)
                check_status = False
        return check_status

    def rebuild_complete(self):
        """Determine if the pool rebuild is complete.

        Returns:
            bool: True if pool rebuild is complete; False otherwise
        """
        self.get_info()
        return self.info.pi_rebuild_st.rs_done == 1

    def wait_for_rebuild(self, to_start, interval=1):
        """Wait for the rebuild to start or end.

        Args:
            to_start (bool): whether to wait for rebuild to start or end
            interval (int): number of seconds to wait in between rebuild
                completion checks
        """
        self.log.info(
            "Waiting for rebuild to %s ...",
            "start" if to_start else "complete")
        # rebuild_complete() == to_start means we are still waiting:
        # waiting for a start while rebuild is done, or waiting for
        # completion while rebuild is running.
        while self.rebuild_complete() == to_start:
            self.log.info(
                "  Rebuild %s ...",
                "has not yet started" if to_start else "in progress")
            sleep(interval)
        self.log.info(
            "Rebuild %s detected", "start" if to_start else "completion")

    @fail_on(DaosApiError)
    def start_rebuild(self, server_group, rank, daos_log):
        """Kill a specific server rank using this pool.

        Args:
            server_group (str): daos server group name
            rank (int): daos server rank to kill
            daos_log (DaosLog): object for logging messages
        """
        msg = "Killing DAOS server {} (rank {})".format(server_group, rank)
        self.log.info(msg)
        daos_log.info(msg)
        server = DaosServer(self.context, server_group, rank)
        server.kill(1)
        # The exclude of a failed target must currently be done manually
        msg = "Excluding server rank {} from pool {}".format(rank, self.uuid)
        self.log.info(msg)
        daos_log.info(msg)
        self.pool.exclude([rank])

    def check_files(self, hosts):
        """Check if pool files exist on the specified list of hosts.

        Args:
            hosts (list): list of hosts

        Returns:
            bool: True if the files for this pool exist on each host; False
                otherwise
        """
        return check_pool_files(self.log, hosts, self.uuid.lower())

    def write_file(self, orterun, processes, hostfile, size, timeout=60):
        """Write a file to the pool.

        Args:
            orterun (str): full path to the orterun command
            processes (int): number of processes to launch
            hostfile (str): hostfile listing the clients from which to write
                the file
            size (int): size of the file to create in bytes
            timeout (int, optional): number of seconds before timing out the
                command. Defaults to 60 seconds.

        Returns:
            process.CmdResult: command execution result
        """
        # Lazy %-style args for consistency with the other log calls
        self.log.info("Writing %s bytes to pool %s", size, self.uuid)
        env = {
            "DAOS_POOL": self.uuid,
            "DAOS_SVCL": "1",
            "DAOS_SINGLETON_CLI": "1",
        }
        # The helper script lives next to this module
        current_path = os.path.dirname(os.path.abspath(__file__))
        command = "{} --np {} --hostfile {} {} {} testfile".format(
            orterun, processes, hostfile,
            os.path.join(current_path, "write_some_data.py"), size)
        return process.run(command, timeout, True, False, "both", True, env)
    def test_exclude(self):
        """
        Pass bad parameters to pool connect

        :avocado: tags=pool,poolexclude,badparam,badexclude
        """
        # parameters used in pool create
        createmode = self.params.get("mode", '/run/pool/createmode/')
        createsetid = self.params.get("setname", '/run/pool/createset/')
        createsize = self.params.get("size", '/run/pool/createsize/')
        createuid = os.geteuid()
        creategid = os.getegid()

        # Accumulate a list of pass/fail indicators representing what is
        # expected for each parameter then "and" them to determine the
        # expected result of the test
        expected_for_param = []

        # Each yaml entry is a (value, expected-result) pair
        tgtlist = self.params.get("ranklist", '/run/testparams/tgtlist/*/')
        targets = []

        if tgtlist[0] == "NULLPTR":
            targets = None
            self.cancel("skipping null pointer test until DAOS-1929 is fixed")
        else:
            targets.append(tgtlist[0])
        expected_for_param.append(tgtlist[1])

        svclist = self.params.get("ranklist", '/run/testparams/svrlist/*/')
        svc = svclist[0]
        expected_for_param.append(svclist[1])

        setlist = self.params.get("setname",
                                  '/run/testparams/connectsetnames/*/')
        connectset = setlist[0]
        expected_for_param.append(setlist[1])

        uuidlist = self.params.get("uuid", '/run/testparams/UUID/*/')
        excludeuuid = uuidlist[0]
        expected_for_param.append(uuidlist[1])

        # if any parameter is FAIL then the test should FAIL, in this test
        # virtually everyone should FAIL since we are testing bad parameters
        expected_result = 'PASS'
        for result in expected_for_param:
            if result == 'FAIL':
                expected_result = 'FAIL'
                break

        # Saved copies of the pool fields trashed below, restored in the
        # finally block so the pool can be destroyed cleanly
        saved_svc = None
        saved_grp = None
        saved_uuid = None
        pool = None
        try:
            # setup the DAOS python API from the build metadata
            # NOTE(review): relative path assumes the test is run from its
            # own directory -- confirm with the test launcher
            with open('../../../.build_vars.json') as build_file:
                data = json.load(build_file)
            context = DaosContext(data['PREFIX'] + '/lib/')

            # initialize a python pool object then create the underlying
            # daos storage
            pool = DaosPool(context)
            pool.create(createmode, createuid, creategid,
                        createsize, createsetid, None)

            # trash the the pool service rank list
            if not svc == 'VALID':
                # NOTE(review): self.cancel() aborts the test, so the
                # statements below it are currently unreachable until the
                # referenced ticket is fixed.
                self.cancel("skipping this test until DAOS-1931 is fixed")
                saved_svc = RankList(pool.svc.rl_ranks, pool.svc.rl_nr)
                pool.svc = None

            # trash the pool group value
            if connectset == 'NULLPTR':
                saved_grp = pool.group
                pool.group = None

            # trash the UUID value in various ways
            if excludeuuid == 'NULLPTR':
                # NOTE(review): unreachable after cancel; also memmove into
                # saved_uuid (None) would fail -- revisit when re-enabled
                self.cancel("skipping this test until DAOS-1932 is fixed")
                ctypes.memmove(saved_uuid, pool.uuid, 16)
                pool.uuid = 0
            if excludeuuid == 'CRAP':
                # NOTE(review): unreachable after cancel; same memmove
                # concern as above
                self.cancel("skipping this test until DAOS-1932 is fixed")
                ctypes.memmove(saved_uuid, pool.uuid, 16)
                pool.uuid[4] = 244

            # the actual call under test
            pool.exclude(targets)

            if expected_result in ['FAIL']:
                self.fail("Test was expected to fail but it passed.\n")

        except DaosApiError as excep:
            print(excep)
            print(traceback.format_exc())
            if expected_result in ['PASS']:
                self.fail("Test was expected to pass but it failed.\n")
        finally:
            # restore any trashed fields so the pool can be destroyed
            if pool is not None:
                if saved_svc is not None:
                    pool.svc = saved_svc
                if saved_grp is not None:
                    pool.group = saved_grp
                if saved_uuid is not None:
                    ctypes.memmove(pool.uuid, saved_uuid, 16)
                pool.destroy(1)
class TestPool(TestDaosApiBase):
    """A class for functional testing of DaosPools objects."""

    def __init__(self, context, log, cb_handler=None):
        """Initialize a TestPool object.

        Args:
            context (DaosContext): context in which to create the pool
            log (logging): logging object used to report the pool status
            cb_handler (CallbackHandler, optional): callback object to use
                with the API methods. Defaults to None.
        """
        super(TestPool, self).__init__("/run/pool/*", cb_handler)
        self.context = context
        self.log = log
        # Pool ownership defaults to the user running the test
        self.uid = os.geteuid()
        self.gid = os.getegid()
        # Pool create parameters, populated from the test yaml
        self.mode = BasicParameter(None)
        self.name = BasicParameter(None)
        self.group = BasicParameter(None)
        self.svcn = BasicParameter(None)
        self.target_list = BasicParameter(None)
        self.scm_size = BasicParameter(None)
        self.nvme_size = BasicParameter(None)
        # Runtime state managed by create()/connect()/destroy()
        self.pool = None
        self.uuid = None
        self.info = None
        self.svc_ranks = None
        self.connected = False

    @fail_on(DaosApiError)
    def create(self):
        """Create a pool.

        Destroys an existing pool if defined and assigns self.pool,
        self.uuid, and self.svc_ranks.
        """
        self.destroy()
        self.log.info(
            "Creating a pool{}".format(
                " on targets {}".format(self.target_list.value)
                if self.target_list.value else ""))
        self.pool = DaosPool(self.context)
        kwargs = {
            "mode": self.mode.value, "uid": self.uid, "gid": self.gid,
            "scm_size": self.scm_size.value, "group": self.name.value}
        # Only pass the optional arguments that have been set in the yaml
        for key in ("target_list", "svcn", "nvme_size"):
            value = getattr(self, key).value
            if value:
                kwargs[key] = value
        self._call_method(self.pool.create, kwargs)
        self.uuid = self.pool.get_uuid_str()
        # Record the pool service replica ranks reported by the create
        self.svc_ranks = [
            int(self.pool.svc.rl_ranks[index])
            for index in range(self.pool.svc.rl_nr)]
        # Lazy %-style args for consistency with the other log calls
        self.log.info(
            " Pool created with uuid %s and svc ranks %s",
            self.uuid, self.svc_ranks)

    @fail_on(DaosApiError)
    def connect(self, permission=1):
        """Connect to the pool.

        Args:
            permission (int, optional): connect permission. Defaults to 1.

        Returns:
            bool: True if the pool has been connected; False if the pool was
                already connected or the pool is not defined.
        """
        if self.pool and not self.connected:
            # The DAOS API takes the permission as a bit flag
            kwargs = {"flags": 1 << permission}
            self.log.info(
                "Connecting to pool %s with permission %s (flag: %s)",
                self.uuid, permission, kwargs["flags"])
            self._call_method(self.pool.connect, kwargs)
            self.connected = True
            return True
        return False

    @fail_on(DaosApiError)
    def disconnect(self):
        """Disconnect from connected pool.

        Returns:
            bool: True if the pool has been disconnected; False if the pool
                was already disconnected or the pool is not defined.
        """
        if self.pool and self.connected:
            # Fixed log message typo ("Disonnecting")
            self.log.info("Disconnecting from pool %s", self.uuid)
            self._call_method(self.pool.disconnect, {})
            self.connected = False
            return True
        return False

    @fail_on(DaosApiError)
    def destroy(self, force=1):
        """Destroy the pool.

        Args:
            force (int, optional): force flag. Defaults to 1.

        Returns:
            bool: True if the pool has been destroyed; False if the pool is
                not defined.
        """
        if self.pool:
            self.disconnect()
            self.log.info("Destroying pool %s", self.uuid)
            # Only issue the API destroy if the handle is still attached
            if self.pool.attached:
                self._call_method(self.pool.destroy, {"force": force})
            self.pool = None
            self.uuid = None
            self.info = None
            self.svc_ranks = None
            return True
        return False

    @fail_on(DaosApiError)
    def get_info(self):
        """Query the pool for information.

        Sets the self.info attribute.
        """
        if self.pool:
            self.connect()
            self._call_method(self.pool.pool_query, {})
            self.info = self.pool.pool_info

    def check_pool_info(self, pi_uuid=None, pi_ntargets=None, pi_nnodes=None,
                        pi_ndisabled=None, pi_map_ver=None, pi_leader=None,
                        pi_bits=None):
        # pylint: disable=unused-argument
        """Check the pool info attributes.

        Args:
            pi_uuid (str, optional): pool uuid. Defaults to None.
            pi_ntargets (int, optional): number of targets. Defaults to None.
            pi_nnodes (int, optional): number of nodes. Defaults to None.
            pi_ndisabled (int, optional): number of disabled. Defaults to None.
            pi_map_ver (int, optional): pool map version. Defaults to None.
            pi_leader (int, optional): pool leader. Defaults to None.
            pi_bits (int, optional): pool bits. Defaults to None.

        Note:
            Arguments may also be provided as a string with a number preceeded
            by '<', '<=', '>', or '>=' for other comparisions besides the
            default '=='.

        Returns:
            bool: True if at least one expected value is specified and all the
                specified values match; False otherwise
        """
        self.get_info()
        # The keyword arguments are read back through locals() so that each
        # named attribute can be compared against self.info generically.
        checks = [
            (key,
             c_uuid_to_str(getattr(self.info, key))
             if key == "pi_uuid" else getattr(self.info, key),
             val)
            for key, val in locals().items()
            if key != "self" and val is not None]
        return self._check_info(checks)

    def check_pool_space(self, ps_free_min=None, ps_free_max=None,
                         ps_free_mean=None, ps_ntargets=None, ps_padding=None):
        # pylint: disable=unused-argument
        """Check the pool info space attributes.

        Args:
            ps_free_min (list, optional): minimum free space per device.
                Defaults to None.
            ps_free_max (list, optional): maximum free space per device.
                Defaults to None.
            ps_free_mean (list, optional): mean free space per device.
                Defaults to None.
            ps_ntargets (int, optional): number of targets. Defaults to None.
            ps_padding (int, optional): space padding. Defaults to None.

        Note:
            Arguments may also be provided as a string with a number preceeded
            by '<', '<=', '>', or '>=' for other comparisions besides the
            default '=='.

        Returns:
            bool: True if at least one expected value is specified and all the
                specified values match; False otherwise
        """
        self.get_info()
        checks = []
        for key in ("ps_free_min", "ps_free_max", "ps_free_mean"):
            val = locals()[key]
            if isinstance(val, list):
                # Fix: the expected values are a plain list, so pair each
                # one with its index via enumerate() (the original unpacked
                # the list items directly, raising TypeError).
                for index, item in enumerate(val):
                    checks.append((
                        "{}[{}]".format(key, index),
                        getattr(self.info.pi_space, key)[index],
                        item))
        for key in ("ps_ntargets", "ps_padding"):
            val = locals()[key]
            if val is not None:
                # Fix: append a single tuple (the original passed three
                # positional arguments to list.append, raising TypeError).
                checks.append((key, getattr(self.info.pi_space, key), val))
        return self._check_info(checks)

    def check_pool_daos_space(self, s_total=None, s_free=None):
        # pylint: disable=unused-argument
        """Check the pool info daos space attributes.

        Args:
            s_total (list, optional): total space per device. Defaults to
                None.
            s_free (list, optional): free space per device. Defaults to None.

        Note:
            Arguments may also be provided as a string with a number preceeded
            by '<', '<=', '>', or '>=' for other comparisions besides the
            default '=='.

        Returns:
            bool: True if at least one expected value is specified and all the
                specified values match; False otherwise
        """
        self.get_info()
        checks = [
            ("{}_{}".format(key, index),
             getattr(self.info.pi_space.ps_space, key)[index],
             item)
            for key, val in locals().items()
            if key != "self" and val is not None
            for index, item in enumerate(val)]
        return self._check_info(checks)

    def check_rebuild_status(self, rs_version=None, rs_pad_32=None,
                             rs_errno=None, rs_done=None,
                             rs_toberb_obj_nr=None, rs_obj_nr=None,
                             rs_rec_nr=None):
        # pylint: disable=unused-argument
        """Check the pool info rebuild attributes.

        Args:
            rs_version (int, optional): rebuild version. Defaults to None.
            rs_pad_32 (int, optional): rebuild pad. Defaults to None.
            rs_errno (int, optional): rebuild error number. Defaults to None.
            rs_done (int, optional): rebuild done flag. Defaults to None.
            rs_toberb_obj_nr (int, optional): number of objects to be rebuilt.
                Defaults to None.
            rs_obj_nr (int, optional): number of rebuilt objects.
                Defaults to None.
            rs_rec_nr (int, optional): number of rebuilt records.
                Defaults to None.

        Note:
            Arguments may also be provided as a string with a number preceeded
            by '<', '<=', '>', or '>=' for other comparisions besides the
            default '=='.

        Returns:
            bool: True if at least one expected value is specified and all the
                specified values match; False otherwise
        """
        self.get_info()
        checks = [
            (key, getattr(self.info.pi_rebuild_st, key), val)
            for key, val in locals().items()
            if key != "self" and val is not None]
        return self._check_info(checks)

    def _check_info(self, check_list):
        """Verify each pool info attribute value matches an expected value.

        Args:
            check_list (list): a list of tuples containing the name of the
                pool information attribute to check, the current value of the
                attribute, and the expected value of the attribute. If the
                expected value is specified as a string with a number
                preceeded by '<', '<=', '>', or '>=' then this comparision
                will be used instead of the defult '=='.

        Returns:
            bool: True if at least one check has been specified and all the
                actual and expected values match; False otherwise.
        """
        check_status = len(check_list) > 0
        for check, actual, expect in check_list:
            # Default comparison: equality
            compare = ("==", lambda x, y: x == y, "does not match")
            if isinstance(expect, str):
                comparisions = {
                    "<": (lambda x, y: x < y, "is too large"),
                    ">": (lambda x, y: x > y, "is too small"),
                    "<=": (
                        lambda x, y: x <= y, "is too large or does not match"),
                    ">=": (
                        lambda x, y: x >= y, "is too small or does not match"),
                }
                # Fix: test the longest prefixes first so "<=" and ">=" are
                # not mistaken for "<" and ">" (the original dict iteration
                # order matched "<" before "<=", misparsing the expected
                # value).
                for key in sorted(comparisions, key=len, reverse=True):
                    # If the expected value is preceeded by one of the known
                    # comparision keys, use the comparision and remove the
                    # key from the expected value
                    if expect.startswith(key):
                        val = comparisions[key]
                        compare = (key, val[0], val[1])
                        expect = expect[len(key):]
                        try:
                            expect = int(expect)
                        except ValueError:
                            # Allow strings to be strings
                            pass
                        break
            self.log.info(
                "Verifying the pool %s: %s %s %s",
                check, actual, compare[0], expect)
            if not compare[1](actual, expect):
                msg = " The {} {}: actual={}, expected={}".format(
                    check, compare[2], actual, expect)
                self.log.error(msg)
                check_status = False
        return check_status

    def rebuild_complete(self):
        """Determine if the pool rebuild is complete.

        Returns:
            bool: True if pool rebuild is complete; False otherwise
        """
        self.get_info()
        return self.info.pi_rebuild_st.rs_done == 1

    def wait_for_rebuild(self, to_start, interval=1):
        """Wait for the rebuild to start or end.

        Args:
            to_start (bool): whether to wait for rebuild to start or end
            interval (int): number of seconds to wait in between rebuild
                completion checks
        """
        self.log.info(
            "Waiting for rebuild to %s ...",
            "start" if to_start else "complete")
        # rebuild_complete() == to_start means we are still waiting:
        # waiting for a start while rebuild is done, or waiting for
        # completion while rebuild is running.
        while self.rebuild_complete() == to_start:
            self.log.info(
                "  Rebuild %s ...",
                "has not yet started" if to_start else "in progress")
            sleep(interval)
        self.log.info(
            "Rebuild %s detected", "start" if to_start else "completion")

    @fail_on(DaosApiError)
    def start_rebuild(self, server_group, rank, daos_log):
        """Kill a specific server rank using this pool.

        Args:
            server_group (str): daos server group name
            rank (int): daos server rank to kill
            daos_log (DaosLog): object for logging messages

        Returns:
            bool: True if the server has been killed and the rank has been
                excluded from the pool; False if the pool is undefined

        """
        msg = "Killing DAOS server {} (rank {})".format(server_group, rank)
        self.log.info(msg)
        daos_log.info(msg)
        server = DaosServer(self.context, server_group, rank)
        server.kill(1)
        return self.exclude(rank, daos_log)

    @fail_on(DaosApiError)
    def exclude(self, rank, daos_log):
        """Manually exclude a rank from this pool.

        Args:
            rank (int): daos server rank to kill
            daos_log (DaosLog): object for logging messages

        Returns:
            bool: True if rank has been excluded from the pool; False if the
                pool is undefined

        """
        if self.pool:
            msg = "Excluding server rank {} from pool {}".format(
                rank, self.uuid)
            self.log.info(msg)
            daos_log.info(msg)
            self.pool.exclude([rank])
            return True
        return False

    def check_files(self, hosts):
        """Check if pool files exist on the specified list of hosts.

        Args:
            hosts (list): list of hosts

        Returns:
            bool: True if the files for this pool exist on each host; False
                otherwise
        """
        return check_pool_files(self.log, hosts, self.uuid.lower())

    def write_file(self, orterun, processes, hostfile, size, timeout=60):
        """Write a file to the pool.

        Args:
            orterun (str): full path to the orterun command
            processes (int): number of processes to launch
            hostfile (str): hostfile listing the clients from which to write
                the file
            size (int): size of the file to create in bytes
            timeout (int, optional): number of seconds before timing out the
                command. Defaults to 60 seconds.

        Returns:
            process.CmdResult: command execution result
        """
        # Lazy %-style args for consistency with the other log calls
        self.log.info("Writing %s bytes to pool %s", size, self.uuid)
        env = {
            "DAOS_POOL": self.uuid,
            "DAOS_SVCL": "1",
            "DAOS_SINGLETON_CLI": "1",
            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
        }
        # The helper script lives next to this module
        current_path = os.path.dirname(os.path.abspath(__file__))
        command = "{} --np {} --hostfile {} {} {} testfile".format(
            orterun, processes, hostfile,
            os.path.join(current_path, "write_some_data.py"), size)
        return process.run(command, timeout, True, False, "both", True, env)

    def get_pool_daos_space(self):
        """Get the pool info daos space attributes as a dictionary.

        Returns:
            dict: a dictionary of lists of the daos space attributes
        """
        self.get_info()
        keys = ("s_total", "s_free")
        return {key: getattr(self.info.pi_space.ps_space, key) for key in keys}

    def display_pool_daos_space(self, msg=None):
        """Display the pool info daos space attributes.

        Args:
            msg (str, optional): optional text to include in the output.
                Defaults to None.
        """
        daos_space = self.get_pool_daos_space()
        sizes = [
            "{}[{}]={}".format(key, index, item)
            for key in sorted(daos_space.keys())
            for index, item in enumerate(daos_space[key])]
        self.log.info(
            "Pool %s space%s:\n  %s", self.uuid,
            " " + msg if isinstance(msg, str) else "", "\n  ".join(sizes))
    def test_rebuild_with_io(self):
        """
        Test ID: Rebuild-003
        Test Description: Trigger a rebuild while I/O is ongoing.
        Use Cases:
          -- single pool, single client performing continous read/write/verify
             sequence while failure/rebuild is triggered in another process

        :avocado: tags=pool,rebuild,rebuildwithio
        """
        # the rebuild tests need to redo this stuff each time so not in setup
        # as it usually would be
        server_group = self.params.get("name", '/server_config/',
                                       'daos_server')
        self.hostlist_servers = self.params.get("test_machines",
                                                '/run/hosts/')
        hostfile_servers = write_host_file.write_host_file(
            self.hostlist_servers, self.workdir)

        try:
            self.agent_sessions = agent_utils.run_agent(
                self.basepath, self.hostlist_servers)
            server_utils.run_server(hostfile_servers, server_group,
                                    self.basepath)

            # use the uid/gid of the user running the test, these should
            # be perfectly valid
            createuid = os.geteuid()
            creategid = os.getegid()

            # parameters used in pool create that are in yaml
            createmode = self.params.get("mode",
                                         '/run/testparams/createmode/')
            createsetid = self.params.get("setname",
                                          '/run/testparams/createset/')
            createsize = self.params.get("size",
                                         '/run/testparams/createsize/')

            # initialize a python pool object then create the underlying
            # daos storage
            pool = DaosPool(self.context)
            pool.create(createmode, createuid, creategid, createsize,
                        createsetid, None)
            pool.connect(1 << 1)
            container = DaosContainer(self.context)
            container.create(pool.handle)
            container.open()

            # get pool status and make sure it all looks good before we start
            pool.pool_query()
            if pool.pool_info.pi_ndisabled != 0:
                self.fail(
                    "Number of disabled targets reporting incorrectly.\n")
            if pool.pool_info.pi_rebuild_st.rs_errno != 0:
                self.fail("Rebuild error but rebuild hasn't run.\n")
            if pool.pool_info.pi_rebuild_st.rs_done != 1:
                self.fail("Rebuild is running but device hasn't failed yet.\n")
            if pool.pool_info.pi_rebuild_st.rs_obj_nr != 0:
                self.fail("Rebuilt objs not zero.\n")
            if pool.pool_info.pi_rebuild_st.rs_rec_nr != 0:
                self.fail("Rebuilt recs not zero.\n")
            # NOTE(review): the pool version and bandwidth results are
            # captured but deliberately unused ("dummy_" prefix)
            dummy_pool_version = pool.pool_info.pi_rebuild_st.rs_version

            # do I/O for 30 seconds
            dummy_bw = io_utilities.continuous_io(container, 30)

            # trigger the rebuild
            rank = self.params.get("rank", '/run/testparams/ranks/*')
            server = DaosServer(self.context, server_group, rank)
            server.kill(1)
            pool.exclude([rank])

            # do another 30 seconds of I/O,
            # waiting for some improvements in server bootstrap
            # at which point we can move the I/O to a separate client and
            # really pound it with I/O
            dummy_bw = io_utilities.continuous_io(container, 30)

            # wait for the rebuild to finish
            while True:
                pool.pool_query()
                if pool.pool_info.pi_rebuild_st.rs_done == 1:
                    break
                else:
                    time.sleep(2)

            # check rebuild statistics
            if pool.pool_info.pi_ndisabled != 1:
                self.fail(
                    "Number of disabled targets reporting incorrectly: {}"
                    .format(pool.pool_info.pi_ndisabled))
            if pool.pool_info.pi_rebuild_st.rs_errno != 0:
                self.fail("Rebuild error reported: {}".format(
                    pool.pool_info.pi_rebuild_st.rs_errno))
            if pool.pool_info.pi_rebuild_st.rs_obj_nr <= 0:
                self.fail("No objects have been rebuilt.")
            if pool.pool_info.pi_rebuild_st.rs_rec_nr <= 0:
                self.fail("No records have been rebuilt.")

        except (ValueError, DaosApiError) as excep:
            print(excep)
            print(traceback.format_exc())
            self.fail("Expecting to pass but test has failed.\n")

        finally:
            # wait for the I/O process to finish
            try:
                server_utils.stop_server(hosts=self.hostlist_servers)
                os.remove(hostfile_servers)
                # really make sure everything is gone
                check_for_pool.cleanup_pools(self.hostlist_servers)
            finally:
                if self.agent_sessions:
                    agent_utils.stop_agent(self.agent_sessions)
                server_utils.kill_server(self.hostlist_servers)
class RebuildNoCap(Test):
    """
    Test Class Description:
    This class contains tests for pool rebuild.

    :avocado: tags=pool,rebuild,nocap
    """

    def setUp(self):
        """Start agents/servers, create a pool and fill it with data."""
        self.agent_sessions = None

        # get paths from the build_vars generated by build
        with open('../../../.build_vars.json') as build_file:
            build_paths = json.load(build_file)
        self.context = DaosContext(build_paths['PREFIX'] + '/lib/')
        self.basepath = os.path.normpath(build_paths['PREFIX'] + "/../")

        # generate a hostfile
        self.hostlist = self.params.get("test_machines", '/run/hosts/')
        self.hostfile = write_host_file.write_host_file(self.hostlist,
                                                        self.workdir)

        # fire up the DAOS servers
        self.server_group = self.params.get("name", '/run/server_config/',
                                            'daos_server')
        self.agent_sessions = AgentUtils.run_agent(self.basepath,
                                                   self.hostlist)
        server_utils.run_server(self.hostfile, self.server_group,
                                build_paths['PREFIX'] + '/../')

        # create a pool to test with
        createmode = self.params.get("mode", '/run/pool/createmode/')
        createuid = self.params.get("uid", '/run/pool/createuid/')
        creategid = self.params.get("gid", '/run/pool/creategid/')
        createsetid = self.params.get("setname", '/run/pool/createset/')
        createsize = self.params.get("size", '/run/pool/createsize/')
        self.pool = DaosPool(self.context)
        self.pool.create(createmode, createuid, creategid, createsize,
                         createsetid)
        pool_uuid = self.pool.get_uuid_str()
        time.sleep(2)

        # stuff some bogus data into the pool
        # BUGFIX: was long(...), a Python-2-only builtin that raises
        # NameError under Python 3; int() handles arbitrary precision
        how_many_bytes = int(self.params.get(
            "datasize", '/run/testparams/datatowrite/'))
        # BUGFIX: os.path.join discards every preceding component when a
        # later component is absolute, so the original "/../src/..." argument
        # silently threw away the PREFIX; use a relative component instead
        exepath = os.path.join(build_paths['PREFIX'],
                               "../src/tests/ftest/util/write_some_data.py")
        cmd = "export DAOS_POOL={0}; export DAOS_SVCL=1; mpirun"\
              " --np 1 --host {1} {2} {3} testfile".format(
                  pool_uuid, self.hostlist[0], exepath, how_many_bytes)
        # shell=True is needed for the exports; all arguments come from the
        # test yaml and the build tree, not untrusted input
        subprocess.call(cmd, shell=True)

    def tearDown(self):
        """Destroy the pool and stop the agents and servers."""
        try:
            os.remove(self.hostfile)
            if self.pool:
                self.pool.destroy(1)
        finally:
            if self.agent_sessions:
                AgentUtils.stop_agent(self.hostlist, self.agent_sessions)
            server_utils.stop_server(hosts=self.hostlist)

    def test_rebuild_no_capacity(self):
        """
        :avocado: tags=pool,rebuild,nocap
        """
        try:
            print("\nsetup complete, starting test\n")

            # create a server object that references one of our pool target
            # hosts and then kill it
            svr_to_kill = int(self.params.get("rank_to_kill",
                                              '/run/testparams/ranks/'))
            d_server = DaosServer(self.context, bytes(self.server_group),
                                  svr_to_kill)
            time.sleep(1)
            d_server.kill(1)

            # exclude the target from the dead server
            self.pool.exclude([svr_to_kill])

            # exclude should trigger rebuild, check
            self.pool.connect(1 << 1)
            status = self.pool.pool_query()
            if not status.pi_ntargets == len(self.hostlist):
                self.fail("target count wrong.\n")
            if not status.pi_ndisabled == 1:
                self.fail("disabled target count wrong.\n")

            # the pool should be too full to start a rebuild so
            # expecting an error
            # not sure yet specifically what error
            if status.pi_rebuild_st.rs_errno == 0:
                self.fail("expecting rebuild to fail but it didn't.\n")

        except DaosApiError as excep:
            print(excep)
            print(traceback.format_exc())
            self.fail("Expecting to pass but test has failed.\n")
class PoolSvc(Test):
    """
    Tests svc argument while pool create.
    """

    def setUp(self):
        """Read build paths, write the hostfile and start the servers."""
        # get paths from the build_vars generated by build
        with open('../../../.build_vars.json') as f:
            build_paths = json.load(f)
        self.basepath = os.path.normpath(build_paths['PREFIX'] + "/../")
        self.tmp = build_paths['PREFIX'] + '/tmp'
        self.server_group = self.params.get("server_group", '/server/',
                                            'daos_server')
        self.daosctl = self.basepath + '/install/bin/daosctl'

        # setup the DAOS python API
        self.Context = DaosContext(build_paths['PREFIX'] + '/lib/')
        self.POOL = None

        self.hostfile = None
        self.hostlist = self.params.get("test_machines", '/run/hosts/*')
        self.hostfile = WriteHostFile.WriteHostFile(self.hostlist, self.tmp)
        print("Host file is: {}".format(self.hostfile))

        ServerUtils.runServer(self.hostfile, self.server_group, self.basepath)
        # give the servers time to come up before pool create
        time.sleep(5)

    def tearDown(self):
        """Remove the hostfile, destroy the pool and stop the servers."""
        try:
            if self.hostfile is not None:
                os.remove(self.hostfile)
            if self.POOL is not None and self.POOL.attached:
                self.POOL.destroy(1)
        finally:
            ServerUtils.stopServer(hosts=self.hostlist)

    def test_poolsvc(self):
        """
        Test svc arg during pool create.

        :avocado: tags=pool,svc
        """
        # parameters used in pool create
        createmode = self.params.get("mode",
                                     '/run/createtests/createmode/*/')
        createuid = os.geteuid()
        creategid = os.getegid()
        createsetid = self.params.get("setname",
                                      '/run/createtests/createset/')
        createsize = self.params.get("size", '/run/createtests/createsize/')
        # createsvc yaml entries are (requested replica count, PASS/FAIL)
        createsvc = self.params.get("svc", '/run/createtests/createsvc/*/')
        expected_result = createsvc[1]

        try:
            # initialize a python pool object then create the underlying
            # daos storage
            self.POOL = DaosPool(self.Context)
            self.POOL.create(createmode, createuid, creategid, createsize,
                             createsetid, None, None, createsvc[0])
            self.POOL.connect(1 << 1)

            # checking returned rank list value for single server
            # BUGFIX: the original indexed rl_ranks with "i" before it was
            # assigned (NameError) and applied int() to the comparison rather
            # than the rank value; a single-server pool must report rank 0
            # as its only service replica
            if (len(self.hostlist) == 1 and
                    int(self.POOL.svc.rl_ranks[0]) != 0):
                self.fail("Incorrect returned rank list value for single"
                          " server")

            # checking returned rank list for server more than 1
            i = 0
            while (int(self.POOL.svc.rl_ranks[i]) > 0 and
                   int(self.POOL.svc.rl_ranks[i]) <= createsvc[0] and
                   int(self.POOL.svc.rl_ranks[i]) != 999999):
                i += 1
            if i != createsvc[0]:
                self.fail("Length of Returned Rank list is not equal to"
                          " the number of Pool Service members.\n")

            # the service replica ranks must all be distinct
            # (renamed from "list", which shadowed the builtin)
            rank_list = []
            for j in range(createsvc[0]):
                rank_list.append(int(self.POOL.svc.rl_ranks[j]))
            if len(rank_list) != len(set(rank_list)):
                self.fail("Duplicate values in returned rank list")

            if createsvc[0] == 3:
                # exercise leader kill / rank exclusion with 3 replicas
                self.POOL.disconnect()
                cmd = ('{0} kill-leader --uuid={1}'
                       .format(self.daosctl, self.POOL.get_uuid_str()))
                process.system(cmd)
                self.POOL.connect(1 << 1)
                self.POOL.disconnect()
                server = DaosServer(self.Context, self.server_group, 2)
                server.kill(1)
                self.POOL.exclude([2])
                self.POOL.connect(1 << 1)

            if expected_result in ['FAIL']:
                self.fail("Test was expected to fail but it passed.\n")

        except DaosApiError as e:
            print(e)
            print(traceback.format_exc())
            if expected_result == 'PASS':
                self.fail("Test was expected to pass but it failed.\n")
def test_exclude(self): """ Pass bad parameters to pool connect :avocado: tags=pool,poolexclude,badparam,badexclude """ global basepath # parameters used in pool create createmode = self.params.get("mode", '/run/excludetests/createmode/') createuid = self.params.get("uid", '/run/excludetests/createuid/') creategid = self.params.get("gid", '/run/excludetests/creategid/') createsetid = self.params.get("setname", '/run/excludetests/createset/') createsize = self.params.get("size", '/run/excludetests/createsize/') # Accumulate a list of pass/fail indicators representing what is # expected for each parameter then "and" them to determine the # expected result of the test expected_for_param = [] tgtlist = self.params.get("ranklist", '/run/excludetests/tgtlist/*/') targets = [] targets.append(tgtlist[0]) expected_for_param.append(tgtlist[1]) svclist = self.params.get("ranklist", '/run/excludetests/svrlist/*/') svc = svclist[0] expected_for_param.append(svclist[1]) setlist = self.params.get("setname", '/run/excludetests/connectsetnames/*/') connectset = setlist[0] expected_for_param.append(setlist[1]) uuidlist = self.params.get("uuid", '/run/excludetests/UUID/*/') excludeuuid = uuidlist[0] expected_for_param.append(uuidlist[1]) # if any parameter is FAIL then the test should FAIL, in this test # virtually everyone should FAIL since we are testing bad parameters expected_result = 'PASS' for result in expected_for_param: if result == 'FAIL': expected_result = 'FAIL' break try: # setup the DAOS python API with open('../../../.build_vars.json') as f: data = json.load(f) CONTEXT = DaosContext(data['PREFIX'] + '/lib/') # initialize a python pool object then create the underlying # daos storage POOL = DaosPool(CONTEXT) POOL.create(createmode, createuid, creategid, createsize, createsetid, None) # trash the the pool service rank list #if not svc == 'VALID': # rl_ranks = ctypes.POINTER(ctypes.c_uint)() # POOL.svc = RankList(rl_ranks, 1); # trash the pool group value #if connectset 
== None: # POOL.group = None # trash the UUID value in various ways #if excludeuuid == None: # POOL.uuid = None #if excludeuuid == 'CRAP': # POOL.uuid[4] = 244 POOL.exclude(targets) if expected_result in ['FAIL']: self.fail("Test was expected to fail but it passed.\n") except ValueError as e: print e print traceback.format_exc() if expected_result in ['PASS']: self.fail("Test was expected to pass but it failed.\n")
    def test_multipool_rebuild(self):
        """
        Test ID: Rebuild-002
        Test Description: Expand on the basic test by rebuilding 2
        pools at once.

        Use Cases:
          -- multipool rebuild, single client, various object and record
             counds

        :avocado: tags=pool,rebuild,rebuildmulti
        """
        # the rebuild tests need to redo this stuff each time so not in setup
        # as it usually would be
        # NOTE(review): "setid" is read but never used below -- candidate
        # for removal
        setid = self.params.get("setname", '/run/testparams/setnames/')
        server_group = self.params.get("server_group", '/server/',
                                       'daos_server')

        basepath = os.path.normpath(self.build_paths['PREFIX'] + "/../")
        tmp = self.build_paths['PREFIX'] + '/tmp'

        self.hostlist = self.params.get("test_machines", '/run/hosts/')
        hostfile = WriteHostFile.WriteHostFile(self.hostlist, tmp)

        try:
            ServerUtils.runServer(hostfile, server_group, basepath)

            # use the uid/gid of the user running the test, these should
            # be perfectly valid
            createuid = os.geteuid()
            creategid = os.getegid()

            # parameters used in pool create that are in yaml
            createmode = self.params.get("mode",
                                         '/run/testparams/createmode/')
            createsetid = self.params.get("setname",
                                          '/run/testparams/createset/')
            createsize = self.params.get("size",
                                         '/run/testparams/createsize/')

            # initialize python pool object then create the underlying
            # daos storage, the way the code is now the pools should be
            # on the same storage and have the same service leader
            pool1 = DaosPool(self.Context)
            pool2 = DaosPool(self.Context)
            pool1.create(createmode, createuid, creategid, createsize,
                         createsetid, None)
            pool2.create(createmode, createuid, creategid, createsize,
                         createsetid, None)

            # want an open connection during rebuild
            pool1.connect(1 << 1)
            pool2.connect(1 << 1)

            # create containers
            container1 = DaosContainer(self.Context)
            container1.create(pool1.handle)
            container2 = DaosContainer(self.Context)
            container2.create(pool2.handle)

            # now open them
            container1.open()
            container2.open()

            # how many objects and records are we creating
            objcount = self.params.get("objcount",
                                       '/run/testparams/numobjects/*')
            reccount = self.params.get("reccount",
                                       '/run/testparams/numrecords/*')
            if objcount == 0:
                reccount = 0

            # which rank to write to and kill
            rank = self.params.get("rank", '/run/testparams/ranks/*')

            # how much data to write with each key
            size = self.params.get("size", '/run/testparams/datasize/')

            # Putting the same data in both pools, at least for now to
            # simplify checking its correct
            # NOTE(review): if objcount is 0 nothing is written and the
            # rs_obj_nr/rs_rec_nr checks below compare against 0 -- but the
            # loop variables (dkey/akey/obj/tx) would be unbound if the
            # post-loop reads were ever hoisted out; verify the yaml never
            # drives objcount to 0 without reccount handling
            saved_data = []
            for i in range(0, objcount):
                obj = None
                for j in range(0, reccount):
                    # make some stuff up and write
                    dkey = ''.join(
                        random.choice(string.ascii_uppercase + string.digits)
                        for _ in range(5))
                    akey = ''.join(
                        random.choice(string.ascii_uppercase + string.digits)
                        for _ in range(5))
                    data = ''.join(
                        random.choice(string.ascii_uppercase + string.digits)
                        for _ in range(size))

                    obj, tx = container1.write_an_obj(data, len(data), dkey,
                                                      akey, obj, rank)
                    obj, tx = container2.write_an_obj(data, len(data), dkey,
                                                      akey, obj, rank)
                    saved_data.append((obj, dkey, akey, data, tx))

                    # read the data back and make sure its correct
                    # containers
                    data2 = container1.read_an_obj(size, dkey, akey, obj, tx)
                    if data != data2.value:
                        self.fail(
                            "Wrote data P1, read it back, didn't match\n")
                    # containers
                    data2 = container2.read_an_obj(size, dkey, akey, obj, tx)
                    if data != data2.value:
                        self.fail(
                            "Wrote data P2, read it back, didn't match\n")

            # kill a server
            server = DaosServer(self.Context, server_group, rank)
            server.kill(1)

            # temporarily, the exclude of a failed target must be done
            # manually
            pool1.exclude([rank])
            pool2.exclude([rank])

            # check that rebuild finishes, no errors, progress data as
            # know it to be.  Check pool 1 first then we'll check 2 below.
            while True:
                pool1.pool_query()
                if pool1.pool_info.pi_rebuild_st.rs_done == 1:
                    break
                else:
                    time.sleep(2)

            # check there are no errors and other data matches what we
            # apriori know to be true,
            if pool1.pool_info.pi_ndisabled != 1:
                self.fail(
                    "P1 number disabled targets reporting incorrectly: {}".
                    format(pool1.pool_info.pi_ndisabled))
            if pool1.pool_info.pi_rebuild_st.rs_errno != 0:
                self.fail("P1 rebuild error reported: {}".format(
                    pool1.pool_info.pi_rebuild_st.rs_errno))
            if pool1.pool_info.pi_rebuild_st.rs_obj_nr != objcount:
                self.fail("P1 rebuilt objs not as expected: {0} {1}".format(
                    pool1.pool_info.pi_rebuild_st.rs_obj_nr, objcount))
            if (pool1.pool_info.pi_rebuild_st.rs_rec_nr !=
                    (reccount * objcount)):
                self.fail("P1 rebuilt recs not as expected: {0} {1}".format(
                    pool1.pool_info.pi_rebuild_st.rs_rec_nr,
                    reccount * objcount))

            # now that the rebuild finished verify the records are correct
            for tup in saved_data:
                data2 = container1.read_an_obj(len(tup[3]), tup[1], tup[2],
                                               tup[0], tup[4])
                if tup[3] != data2.value:
                    self.fail("after rebuild data didn't check out")

            # now check the other pool
            while True:
                pool2.pool_query()
                if pool2.pool_info.pi_rebuild_st.rs_done == 1:
                    break
                else:
                    time.sleep(2)

            # check there are no errors and other data matches what we
            # apriori know to be true
            if pool2.pool_info.pi_ndisabled != 1:
                self.fail(
                    "Number disabled targets reporting incorrectly: {}".format(
                        pool2.pool_info.pi_ndisabled))
            if pool2.pool_info.pi_rebuild_st.rs_errno != 0:
                self.fail("Rebuild error reported: {}".format(
                    pool2.pool_info.pi_rebuild_st.rs_errno))
            if pool2.pool_info.pi_rebuild_st.rs_obj_nr != objcount:
                self.fail("Rebuilt objs not as expected: {0} {1}".format(
                    pool2.pool_info.pi_rebuild_st.rs_obj_nr, objcount))
            if (pool2.pool_info.pi_rebuild_st.rs_rec_nr !=
                    (reccount * objcount)):
                self.fail("Rebuilt recs not as expected: {0} {1}".format(
                    pool2.pool_info.pi_rebuild_st.rs_rec_nr,
                    (reccount * objcount)))

            # now that the rebuild finished verify the records are correct
            for tup in saved_data:
                data2 = container2.read_an_obj(len(tup[3]), tup[1], tup[2],
                                               tup[0], tup[4])
                if tup[3] != data2.value:
                    self.fail("after rebuild data didn't check out")

        except DaosApiError as e:
            print(e)
            print(traceback.format_exc())
            self.fail("Expecting to pass but test has failed.\n")

        finally:
            ServerUtils.stopServer(hosts=self.hostlist)
            os.remove(hostfile)
            CheckForPool.CleanupPools(self.hostlist)
            ServerUtils.killServer(self.hostlist)
    def test_exclude(self):
        """
        Pass bad parameters to pool connect

        :avocado: tags=pool,poolexclude,badparam,badexclude
        """
        # parameters used in pool create
        createmode = self.params.get("mode", '/run/pool/createmode/')
        createsetid = self.params.get("setname", '/run/pool/createset/')
        createsize = self.params.get("size", '/run/pool/createsize/')
        createuid = os.geteuid()
        creategid = os.getegid()

        # Accumulate a list of pass/fail indicators representing what is
        # expected for each parameter then "and" them to determine the
        # expected result of the test
        expected_for_param = []

        tgtlist = self.params.get("ranklist", '/run/testparams/tgtlist/*/')
        targets = []

        if tgtlist[0] == "NULLPTR":
            targets = None
            self.cancel("skipping null pointer test until DAOS-1929 is fixed")
        else:
            targets.append(tgtlist[0])
        expected_for_param.append(tgtlist[1])

        svclist = self.params.get("ranklist", '/run/testparams/svrlist/*/')
        svc = svclist[0]
        expected_for_param.append(svclist[1])

        setlist = self.params.get("setname",
                                  '/run/testparams/connectsetnames/*/')
        connectset = setlist[0]
        expected_for_param.append(setlist[1])

        uuidlist = self.params.get("uuid", '/run/testparams/UUID/*/')
        excludeuuid = uuidlist[0]
        expected_for_param.append(uuidlist[1])

        # if any parameter is FAIL then the test should FAIL, in this test
        # virtually everyone should FAIL since we are testing bad parameters
        expected_result = 'PASS'
        for result in expected_for_param:
            if result == 'FAIL':
                expected_result = 'FAIL'
                break

        # saved_* hold the original pool attributes so the finally block can
        # restore them before destroying the pool
        saved_svc = None
        saved_grp = None
        saved_uuid = None
        pool = None

        try:
            # setup the DAOS python API
            with open('../../../.build_vars.json') as f:
                data = json.load(f)
            context = DaosContext(data['PREFIX'] + '/lib/')

            # initialize a python pool object then create the underlying
            # daos storage
            pool = DaosPool(context)
            pool.create(createmode, createuid, creategid,
                        createsize, createsetid, None)

            # trash the the pool service rank list
            if not svc == 'VALID':
                self.cancel("skipping this test until DAOS-1931 is fixed")
                # unreachable while the cancel above is in place
                saved_svc = RankList(pool.svc.rl_ranks, pool.svc.rl_nr)
                pool.svc = None

            # trash the pool group value
            if connectset == 'NULLPTR':
                saved_grp = pool.group
                pool.group = None

            # trash the UUID value in various ways
            if excludeuuid == 'NULLPTR':
                self.cancel("skipping this test until DAOS-1932 is fixed")
                # NOTE(review): memmove with saved_uuid still None would
                # crash, but self.cancel() above raises first
                ctypes.memmove(saved_uuid, pool.uuid, 16)
                pool.uuid = 0
            if excludeuuid == 'CRAP':
                self.cancel("skipping this test until DAOS-1932 is fixed")
                ctypes.memmove(saved_uuid, pool.uuid, 16)
                pool.uuid[4] = 244

            pool.exclude(targets)

            if expected_result in ['FAIL']:
                self.fail("Test was expected to fail but it passed.\n")

        except DaosApiError as e:
            print(e)
            print(traceback.format_exc())
            if expected_result in ['PASS']:
                self.fail("Test was expected to pass but it failed.\n")

        finally:
            # restore any trashed attributes so destroy works correctly
            if pool is not None:
                if saved_svc is not None:
                    pool.svc = saved_svc
                if saved_grp is not None:
                    pool.group = saved_grp
                if saved_uuid is not None:
                    ctypes.memmove(pool.uuid, saved_uuid, 16)
                pool.destroy(1)
class DestroyRebuild(Test):
    """
    Test Class Description:
    This test verifies destruction of a pool that is rebuilding.

    :avocado: tags=pool,pooldestroy,rebuild,desreb
    """

    # class-level defaults, overwritten in setUp
    build_paths = []
    server_group = ""
    CONTEXT = None
    POOL = None
    hostfile = ""

    def setUp(self):
        """Start the servers and create the pool under test."""
        # get paths from the build_vars generated by build
        with open('../../../.build_vars.json') as f:
            build_paths = json.load(f)
        self.CONTEXT = DaosContext(build_paths['PREFIX'] + '/lib/')

        # generate a hostfile
        self.hostlist = self.params.get("test_machines", '/run/hosts/')
        tmp = build_paths['PREFIX'] + '/tmp'
        self.hostfile = WriteHostFile.WriteHostFile(self.hostlist, tmp)

        # fire up the DAOS servers
        self.server_group = self.params.get("server_group", '/run/server/',
                                            'daos_server')
        ServerUtils.runServer(self.hostfile, self.server_group,
                              build_paths['PREFIX'] + '/../')
        time.sleep(3)

        # create a pool to test with
        createmode = self.params.get("mode", '/run/pool/createmode/')
        createuid = self.params.get("uid", '/run/pool/createuid/')
        creategid = self.params.get("gid", '/run/pool/creategid/')
        createsetid = self.params.get("setname", '/run/pool/createset/')
        createsize = self.params.get("size", '/run/pool/createsize/')
        self.POOL = DaosPool(self.CONTEXT)
        self.POOL.create(createmode, createuid, creategid, createsize,
                         createsetid)
        # the returned uuid string was previously bound to an unused local
        self.POOL.get_uuid_str()
        time.sleep(2)

    def tearDown(self):
        """Remove the hostfile, destroy the pool and stop the servers."""
        try:
            os.remove(self.hostfile)
            if self.POOL:
                self.POOL.destroy(1)
        finally:
            ServerUtils.stopServer(hosts=self.hostlist)

    def test_destroy_while_rebuilding(self):
        """
        :avocado: tags=pool,pooldestroy,rebuild,desreb
        """
        try:
            # BUGFIX: converted Python 2 print statements (a syntax error
            # under Python 3) to print() calls throughout this method
            print("\nsetup complete, starting test\n")

            # create a server object that references one of our pool target
            # hosts and then kill it
            svr_to_kill = int(self.params.get("rank_to_kill",
                                              '/run/testparams/ranks/'))
            sh = DaosServer(self.CONTEXT, bytes(self.server_group),
                            svr_to_kill)
            print("created server ")

            # BUG if you don't connect the rebuild doesn't start correctly
            self.POOL.connect(1 << 1)
            status = self.POOL.pool_query()
            if not status.pi_ntargets == len(self.hostlist):
                self.fail("target count wrong.\n")
            if not status.pi_ndisabled == 0:
                self.fail("disabled target count wrong.\n")
            print("connect ")

            time.sleep(1)
            sh.kill(1)
            print("killed server ")

            # exclude the target from the dead server
            self.POOL.exclude([svr_to_kill])
            print("exclude target ")

            #self.POOL.disconnect()
            #print "disconnect "

            # the rebuild won't take long since there is no data so do
            # the destroy quickly
            self.POOL.destroy(1)
            print("destroy ")

        except DaosApiError as e:
            print(e)
            print(traceback.format_exc())
            self.fail("Expecting to pass but test has failed.\n")
class PoolSvc(Test):
    """
    Tests svc argument while pool create.
    """

    def setUp(self):
        """Load build paths, write the hostfile, launch agents and servers."""
        # paths come from the build_vars file the build generates
        with open('../../../.build_vars.json') as build_file:
            paths = json.load(build_file)
        self.basepath = os.path.normpath(paths['PREFIX'] + "/../")

        self.server_group = self.params.get("name", '/server_config/',
                                            'daos_server')
        self.daosctl = self.basepath + '/install/bin/daosctl'

        # set up the DAOS python API context
        self.context = DaosContext(paths['PREFIX'] + '/lib/')
        self.pool = None

        self.hostfile = None
        self.hostlist = self.params.get("test_machines", '/run/hosts/*')
        self.hostfile = write_host_file.write_host_file(self.hostlist,
                                                        self.workdir)
        print("Host file is: {}".format(self.hostfile))

        self.agent_sessions = AgentUtils.run_agent(self.basepath,
                                                   self.hostlist)
        server_utils.run_server(self.hostfile, self.server_group,
                                self.basepath)

    def tearDown(self):
        """Destroy the pool, then stop the agents and servers."""
        try:
            if self.pool is not None and self.pool.attached:
                self.pool.destroy(1)
        finally:
            if self.agent_sessions:
                AgentUtils.stop_agent(self.hostlist, self.agent_sessions)
            server_utils.stop_server(hosts=self.hostlist)

    def test_poolsvc(self):
        """
        Test svc arg during pool create.

        :avocado: tags=pool,svc
        """
        # gather the pool create parameters from the yaml / environment
        createmode = self.params.get("mode",
                                     '/run/createtests/createmode/*/')
        createuid = os.geteuid()
        creategid = os.getegid()
        createsetid = self.params.get("setname",
                                      '/run/createtests/createset/')
        createsize = self.params.get("size",
                                     '/run/createtests/createsize/')
        # yaml supplies (requested replica count, expected PASS/FAIL)
        createsvc = self.params.get("svc", '/run/createtests/createsvc/*/')
        expected_result = createsvc[1]
        num_replicas = createsvc[0]

        try:
            # build the pool object and create the daos storage behind it
            self.pool = DaosPool(self.context)
            self.pool.create(createmode, createuid, creategid, createsize,
                             createsetid, None, None, num_replicas)
            self.pool.connect(1 << 1)

            # count how many valid service ranks came back (multi-server)
            count = 0
            while True:
                rank = int(self.pool.svc.rl_ranks[count])
                if not (0 < rank <= num_replicas and rank != 999999):
                    break
                count += 1
            if count != num_replicas:
                self.fail("Length of Returned Rank list is not equal to "
                          "the number of Pool Service members.\n")

            # every returned service rank must be unique
            rank_list = [int(self.pool.svc.rl_ranks[idx])
                         for idx in range(num_replicas)]
            if len(set(rank_list)) != len(rank_list):
                self.fail("Duplicate values in returned rank list")

            if num_replicas == 3:
                # with three replicas, exercise leader kill plus rank
                # exclusion and make sure the pool stays connectable
                self.pool.disconnect()
                cmd = ('{0} kill-leader --uuid={1}'
                       .format(self.daosctl, self.pool.get_uuid_str()))
                process.system(cmd)
                self.pool.connect(1 << 1)
                self.pool.disconnect()
                server = DaosServer(self.context, self.server_group, 2)
                server.kill(1)
                self.pool.exclude([2])
                self.pool.connect(1 << 1)

            if expected_result in ['FAIL']:
                self.fail("Test was expected to fail but it passed.\n")

        except DaosApiError as excep:
            print(excep)
            print(traceback.format_exc())
            if expected_result == 'PASS':
                self.fail("Test was expected to pass but it failed.\n")