Exemplo n.º 1
0
 def tearDown(self):
     try:
         if self.agent_sessions:
             AgentUtils.stop_agent(self.hostlist, self.agent_sessions)
         server_utils.stop_server(hosts=self.hostlist)
     finally:
         # really make sure everything is gone
         check_for_pool.cleanup_pools(self.hostlist)
Exemplo n.º 2
0
 def tearDown(self):
     try:
         super(GlobalHandle, self).tearDown()
     finally:
         # really make sure everything is gone
         check_for_pool.cleanup_pools(self.hostlist_servers)
Exemplo n.º 3
0
    def test_rebuild_with_io(self):
        """
        Test ID: Rebuild-003

        Test Description: Trigger a rebuild while I/O is ongoing.

        Use Cases:
          -- single pool, single client performing continous read/write/verify
             sequence while failure/rebuild is triggered in another process

        :avocado: tags=pool,rebuild,rebuildwithio
        """

        # the rebuild tests need to redo this stuff each time so not in setup
        # as it usually would be
        server_group = self.params.get("name", '/server_config/',
                                       'daos_server')

        self.hostlist_servers = self.params.get("test_machines", '/run/hosts/')
        hostfile_servers = write_host_file.write_host_file(
            self.hostlist_servers, self.workdir)

        try:
            self.agent_sessions = agent_utils.run_agent(self.basepath,
                                                        self.hostlist_servers)
            server_utils.run_server(hostfile_servers, server_group,
                                    self.basepath)

            # use the uid/gid of the user running the test, these should
            # be perfectly valid
            createuid = os.geteuid()
            creategid = os.getegid()

            # parameters used in pool create that are in yaml
            createmode = self.params.get("mode", '/run/testparams/createmode/')
            createsetid = self.params.get("setname",
                                          '/run/testparams/createset/')
            createsize = self.params.get("size", '/run/testparams/createsize/')

            # initialize a python pool object then create the underlying
            # daos storage
            pool = DaosPool(self.context)
            pool.create(createmode, createuid, creategid,
                        createsize, createsetid, None)
            pool.connect(1 << 1)
            container = DaosContainer(self.context)
            container.create(pool.handle)
            container.open()

            # get pool status and make sure it all looks good before we start
            pool.pool_query()
            if pool.pool_info.pi_ndisabled != 0:
                self.fail("Number of disabled targets reporting incorrectly.\n")
            if pool.pool_info.pi_rebuild_st.rs_errno != 0:
                self.fail("Rebuild error but rebuild hasn't run.\n")
            if pool.pool_info.pi_rebuild_st.rs_done != 1:
                self.fail("Rebuild is running but device hasn't failed yet.\n")
            if pool.pool_info.pi_rebuild_st.rs_obj_nr != 0:
                self.fail("Rebuilt objs not zero.\n")
            if pool.pool_info.pi_rebuild_st.rs_rec_nr != 0:
                self.fail("Rebuilt recs not zero.\n")
            dummy_pool_version = pool.pool_info.pi_rebuild_st.rs_version

            # do I/O for 30 seconds
            dummy_bw = io_utilities.continuous_io(container, 30)

            # trigger the rebuild
            rank = self.params.get("rank", '/run/testparams/ranks/*')
            server = DaosServer(self.context, server_group, rank)
            server.kill(1)
            pool.exclude([rank])

            # do another 30 seconds of I/O,
            # waiting for some improvements in server bootstrap
            # at which point we can move the I/O to a separate client and
            # really pound it with I/O
            dummy_bw = io_utilities.continuous_io(container, 30)

            # wait for the rebuild to finish
            while True:
                pool.pool_query()
                if pool.pool_info.pi_rebuild_st.rs_done == 1:
                    break
                else:
                    time.sleep(2)

            # check rebuild statistics
            if pool.pool_info.pi_ndisabled != 1:
                self.fail("Number of disabled targets reporting incorrectly: {}"
                          .format(pool.pool_info.pi_ndisabled))
            if pool.pool_info.pi_rebuild_st.rs_errno != 0:
                self.fail("Rebuild error reported: {}".format(
                    pool.pool_info.pi_rebuild_st.rs_errno))
            if pool.pool_info.pi_rebuild_st.rs_obj_nr <= 0:
                self.fail("No objects have been rebuilt.")
            if pool.pool_info.pi_rebuild_st.rs_rec_nr <= 0:
                self.fail("No records have been rebuilt.")

        except (ValueError, DaosApiError) as excep:
            print(excep)
            print(traceback.format_exc())
            self.fail("Expecting to pass but test has failed.\n")

        finally:
            # wait for the I/O process to finish
            try:
                server_utils.stop_server(hosts=self.hostlist_servers)
                os.remove(hostfile_servers)
                # really make sure everything is gone
                check_for_pool.cleanup_pools(self.hostlist_servers)
            finally:
                if self.agent_sessions:
                    agent_utils.stop_agent(self.agent_sessions)
                server_utils.kill_server(self.hostlist_servers)
Exemplo n.º 4
0
    def test_rebuild_with_io(self):
        """
        Test ID: Rebuild-003

        Test Description: Trigger a rebuild while I/O is ongoing.

        Use Cases:
          -- single pool, single client performing continous read/write/verify
             sequence while failure/rebuild is triggered in another process

        :avocado: tags=pool,rebuild,rebuildwithio
        """

        # the rebuild tests need to redo this stuff each time so not in setup
        # as it usually would be
        server_group = self.params.get("name", '/server_config/',
                                       'daos_server')

        basepath = os.path.normpath(self.build_paths['PREFIX'] + "/../")

        self.hostlist = self.params.get("test_machines", '/run/hosts/')
        hostfile = write_host_file.write_host_file(self.hostlist, self.workdir)

        try:
            self.agent_sessions = AgentUtils.run_agent(basepath, self.hostlist)
            server_utils.run_server(hostfile, server_group, basepath)

            # use the uid/gid of the user running the test, these should
            # be perfectly valid
            createuid = os.geteuid()
            creategid = os.getegid()

            # parameters used in pool create that are in yaml
            createmode = self.params.get("mode", '/run/testparams/createmode/')
            createsetid = self.params.get("setname",
                                          '/run/testparams/createset/')
            createsize = self.params.get("size", '/run/testparams/createsize/')

            # initialize a python pool object then create the underlying
            # daos storage
            pool = DaosPool(self.context)
            pool.create(createmode, createuid, creategid,
                        createsize, createsetid, None)
            pool.connect(1 << 1)
            container = DaosContainer(self.context)
            container.create(pool.handle)
            container.open()

            # get pool status and make sure it all looks good before we start
            pool.pool_query()
            if pool.pool_info.pi_ndisabled != 0:
                self.fail("Number of disabled targets reporting incorrectly.\n")
            if pool.pool_info.pi_rebuild_st.rs_errno != 0:
                self.fail("Rebuild error but rebuild hasn't run.\n")
            if pool.pool_info.pi_rebuild_st.rs_done != 1:
                self.fail("Rebuild is running but device hasn't failed yet.\n")
            if pool.pool_info.pi_rebuild_st.rs_obj_nr != 0:
                self.fail("Rebuilt objs not zero.\n")
            if pool.pool_info.pi_rebuild_st.rs_rec_nr != 0:
                self.fail("Rebuilt recs not zero.\n")
            dummy_pool_version = pool.pool_info.pi_rebuild_st.rs_version

            # do I/O for 30 seconds
            dummy_bw = io_utilities.continuous_io(container, 30)

            # trigger the rebuild
            rank = self.params.get("rank", '/run/testparams/ranks/*')
            server = DaosServer(self.context, server_group, rank)
            server.kill(1)
            pool.exclude([rank])

            # do another 30 seconds of I/O,
            # waiting for some improvements in server bootstrap
            # at which point we can move the I/O to a separate client and
            # really pound it with I/O
            dummy_bw = io_utilities.continuous_io(container, 30)

            # wait for the rebuild to finish
            while True:
                pool.pool_query()
                if pool.pool_info.pi_rebuild_st.rs_done == 1:
                    break
                else:
                    time.sleep(2)

            # check rebuild statistics
            if pool.pool_info.pi_ndisabled != 1:
                self.fail("Number of disabled targets reporting incorrectly: {}"
                          .format(pool.pool_info.pi_ndisabled))
            if pool.pool_info.pi_rebuild_st.rs_errno != 0:
                self.fail("Rebuild error reported: {}".format(
                    pool.pool_info.pi_rebuild_st.rs_errno))
            if pool.pool_info.pi_rebuild_st.rs_obj_nr <= 0:
                self.fail("No objects have been rebuilt.")
            if pool.pool_info.pi_rebuild_st.rs_rec_nr <= 0:
                self.fail("No records have been rebuilt.")

        except (ValueError, DaosApiError) as excep:
            print(excep)
            print(traceback.format_exc())
            self.fail("Expecting to pass but test has failed.\n")

        finally:
            # wait for the I/O process to finish
            try:
                server_utils.stop_server(hosts=self.hostlist)
                os.remove(hostfile)
                # really make sure everything is gone
                check_for_pool.cleanup_pools(self.hostlist)
            finally:
                if self.agent_sessions:
                    AgentUtils.stop_agent(self.hostlist, self.agent_sessions)
                server_utils.kill_server(self.hostlist)
Exemplo n.º 5
0
 def tearDown(self):
     try:
         # really make sure everything is gone
         check_for_pool.cleanup_pools(self.hostlist_servers)
     finally:
         super(RebuildTests, self).tearDown()
Exemplo n.º 6
0
    def test_multipool_rebuild(self):
        """
        Test ID: Rebuild-002
        Test Description: Expand on the basic test by rebuilding 2
        pools at once.

        Use Cases:
          -- multipool rebuild, single client, various object and record counds

        :avocado: tags=pool,rebuild,rebuildmulti
        """
        try:
            # initialize python pool object then create the underlying
            # daos storage, the way the code is now the pools should be
            # on the same storage and have the same service leader
            pool1 = DaosPool(self.context)
            pool2 = DaosPool(self.context)
            pool1.create(self.createmode, self.createuid, self.creategid,
                         self.createsize, self.createsetid)
            pool2.create(self.createmode, self.createuid, self.creategid,
                         self.createsize, self.createsetid)

            # want an open connection during rebuild
            pool1.connect(1 << 1)
            pool2.connect(1 << 1)

            # create containers
            container1 = DaosContainer(self.context)
            container1.create(pool1.handle)
            container2 = DaosContainer(self.context)
            container2.create(pool2.handle)

            # now open them
            container1.open()
            container2.open()

            # Putting the same data in both pools, at least for now to simplify
            # checking its correct
            saved_data = []
            for _objc in range(self.objcount):
                obj = None
                for _recc in range(self.reccount):

                    # make some stuff up and write
                    dkey = (''.join(
                        random.choice(string.ascii_uppercase + string.digits)
                        for _ in range(5)))
                    akey = (''.join(
                        random.choice(string.ascii_uppercase + string.digits)
                        for _ in range(5)))
                    data = (''.join(
                        random.choice(string.ascii_uppercase + string.digits)
                        for _ in range(self.size)))

                    # Used DAOS_OC_R1S_SPEC_RANK
                    # 1 replica with specified rank
                    obj, txn = container1.write_an_obj(data,
                                                       len(data),
                                                       dkey,
                                                       akey,
                                                       obj,
                                                       self.rank,
                                                       obj_cls=15)
                    obj, txn = container2.write_an_obj(data,
                                                       len(data),
                                                       dkey,
                                                       akey,
                                                       obj,
                                                       self.rank,
                                                       obj_cls=15)
                    saved_data.append((obj, dkey, akey, data, txn))

                    # read the data back and make sure its correct containers
                    data2 = container1.read_an_obj(self.size, dkey, akey, obj,
                                                   txn)
                    if data != data2.value:
                        self.fail(
                            "Wrote data P1, read it back, didn't match\n")
                    data2 = container2.read_an_obj(self.size, dkey, akey, obj,
                                                   txn)
                    if data != data2.value:
                        self.fail(
                            "Wrote data P2, read it back, didn't match\n")

            # kill a server
            server = DaosServer(self.context, self.server_group, self.rank)
            server.kill(1)

            # temporarily, the exclude of a failed target must be done
            # manually
            pool1.exclude([self.rank])
            pool2.exclude([self.rank])

            # check that rebuild finishes, no errors, progress data as
            # know it to be.  Check pool 1 first then we'll check 2 below.
            while True:
                pool1.pool_query()
                if pool1.pool_info.pi_rebuild_st.rs_done == 1:
                    break
                else:
                    time.sleep(2)

            # check there are no errors and other data matches what we
            # apriori know to be true,
            if pool1.pool_info.pi_ndisabled != 1:
                self.fail(
                    "P1 number disabled targets reporting incorrectly: {}".
                    format(pool1.pool_info.pi_ndisabled))
            if pool1.pool_info.pi_rebuild_st.rs_errno != 0:
                self.fail("P1 rebuild error reported: {}".format(
                    pool1.pool_info.pi_rebuild_st.rs_errno))
            if pool1.pool_info.pi_rebuild_st.rs_obj_nr != self.objcount:
                self.fail("P1 rebuilt objs not as expected: {0} {1}".format(
                    pool1.pool_info.pi_rebuild_st.rs_obj_nr, self.objcount))
            if (pool1.pool_info.pi_rebuild_st.rs_rec_nr !=
                (self.reccount * self.objcount)):
                self.fail("P1 rebuilt recs not as expected: {0} {1}".format(
                    pool1.pool_info.pi_rebuild_st.rs_rec_nr,
                    self.reccount * self.objcount))

            # now that the rebuild finished verify the records are correct
            for tup in saved_data:
                data2 = container1.read_an_obj(len(tup[3]), tup[1], tup[2],
                                               tup[0], tup[4])
                if tup[3] != data2.value:
                    self.fail("after rebuild data didn't check out")

            # now check the other pool
            while True:
                pool2.pool_query()
                if pool2.pool_info.pi_rebuild_st.rs_done == 1:
                    break
                else:
                    time.sleep(2)

            # check there are no errors and other data matches what we
            # apriori know to be true
            if pool2.pool_info.pi_ndisabled != 1:
                self.fail(
                    "Number disabled targets reporting incorrectly: {}".format(
                        pool2.pool_info.pi_ndisabled))
            if pool2.pool_info.pi_rebuild_st.rs_errno != 0:
                self.fail("Rebuild error reported: {}".format(
                    pool2.pool_info.pi_rebuild_st.rs_errno))
            if pool2.pool_info.pi_rebuild_st.rs_obj_nr != self.objcount:
                self.fail("Rebuilt objs not as expected: {0} {1}".format(
                    pool2.pool_info.pi_rebuild_st.rs_obj_nr, self.objcount))
            if (pool2.pool_info.pi_rebuild_st.rs_rec_nr !=
                (self.reccount * self.objcount)):
                self.fail("Rebuilt recs not as expected: {0} {1}".format(
                    pool2.pool_info.pi_rebuild_st.rs_rec_nr,
                    (self.reccount * self.objcount)))

            # now that the rebuild finished verify the records are correct
            for tup in saved_data:
                data2 = container2.read_an_obj(len(tup[3]), tup[1], tup[2],
                                               tup[0], tup[4])
                if tup[3] != data2.value:
                    self.fail("after rebuild data didn't check out")

        except DaosApiError as excp:
            print(excp)
            print(traceback.format_exc())
            self.fail("Expecting to pass but test has failed.\n")

        finally:
            server_utils.stop_server(hosts=self.hostlist_servers)
            check_for_pool.cleanup_pools(self.hostlist_servers)
            server_utils.kill_server(self.hostlist_servers)