def ior_thread(self, pool, oclass, api, test, flags, results):
        """Start threads and wait until all threads are finished.
        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR api
            test (list): IOR test sequence
            flags (str): IOR flags
            results (queue): queue for returning thread results

        Returns:
            None
        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed on: "
                      "{}".format(self.hostlist_clients))
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.daos_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        container_info["{}{}{}"
                       .format(oclass,
                               api,
                               test[2])] = str(uuid.uuid4())

        # Define the job manager for the IOR command
        manager = Mpirun(ior_cmd, mpitype="mpich")
        manager.job.daos_cont.update(container_info
                                     ["{}{}{}".format(oclass,
                                                      api,
                                                      test[2])])
        env = ior_cmd.get_default_env(str(manager))
        manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        manager.assign_processes(processes)
        manager.assign_environment(env, True)

        # run IOR Command
        try:
            manager.run()
        except CommandFailure as _error:
            results.put("FAIL")
Example #2 (file: enospace.py, project: wli5/daos)
    def ior_bg_thread(self, results):
        """Start IOR Background thread, This will write small data set and
        keep reading it in loop until it fails or main program exit.

        Args:
            results (queue): queue for returning thread results
        """
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")

        # Define the IOR Command and use the parameter from yaml file.
        ior_bg_cmd = IorCommand()
        ior_bg_cmd.get_params(self)
        ior_bg_cmd.set_daos_params(self.server_group, self.pool)
        ior_bg_cmd.dfs_oclass.update(self.ior_cmd.dfs_oclass.value)
        ior_bg_cmd.api.update(self.ior_cmd.api.value)
        ior_bg_cmd.transfer_size.update(self.ior_scm_xfersize)
        ior_bg_cmd.block_size.update(self.ior_cmd.block_size.value)
        ior_bg_cmd.flags.update(self.ior_cmd.flags.value)
        ior_bg_cmd.test_file.update('/testfile_background')

        # Define the job manager for the IOR command
        manager = Mpirun(ior_bg_cmd, mpitype="mpich")
        self.create_cont()
        manager.job.dfs_cont.update(self.container.uuid)
        env = ior_bg_cmd.get_default_env(str(manager))
        manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        manager.assign_processes(1)
        manager.assign_environment(env, True)
        print('----Run IOR in Background-------')
        # run IOR Write Command
        try:
            manager.run()
        except (CommandFailure, TestFail) as _error:
            results.put("FAIL")
            return

        # run IOR Read Command in loop
        ior_bg_cmd.flags.update(self.ior_read_flags)
        while True:
            try:
                manager.run()
            except (CommandFailure, TestFail) as _error:
                results.put("FAIL")
                break
Example #3
    def ior_thread(self, pool, oclass, api, test, flags, results):
        """This method calls job manager for IOR command
        invocation.
        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR API
            test (list): IOR test sequence
            flags (str): IOR flags
            results (queue): queue for returning thread results
        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[0])
        ior_cmd.block_size.update(test[1])
        ior_cmd.flags.update(flags)
        if "-w" in flags:
            self.container_info["{}{}{}"
                                .format(oclass,
                                        api,
                                        test[0])] = str(uuid.uuid4())

        # Define the job manager for the IOR command
        manager = Mpirun(ior_cmd, mpitype="mpich")
        key = "".join([oclass, api, str(test[0])])
        manager.job.dfs_cont.update(self.container_info[key])
        env = ior_cmd.get_default_env(str(manager))
        manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        manager.assign_processes(processes)
        manager.assign_environment(env, True)

        # run IOR Command
        try:
            manager.run()
        except CommandFailure as _error:
            results.put("FAIL")
Example #4
class OSAUtils(IorTestBase):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: This test runs
    daos_server offline drain test cases.

    :avocado: recursive
    """
    def setUp(self):
        """Set up for test case."""
        super(OSAUtils, self).setUp()
        self.container = None
        self.obj = None
        self.ioreq = None
        self.dmg_command = self.get_dmg_command()
        self.no_of_dkeys = self.params.get("no_of_dkeys",
                                           '/run/dkeys/*',
                                           default=[0])[0]
        self.no_of_akeys = self.params.get("no_of_akeys",
                                           '/run/akeys/*',
                                           default=[0])[0]
        self.record_length = self.params.get("length",
                                             '/run/record/*',
                                             default=[0])[0]

    @fail_on(CommandFailure)
    def get_pool_leader(self):
        """Get the pool leader.

        Returns:
            int: pool leader value

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["leader"])

    @fail_on(CommandFailure)
    def get_rebuild_status(self):
        """Get the rebuild status.

        Returns:
            str: rebuild status

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return data["rebuild"]["status"]

    @fail_on(CommandFailure)
    def is_rebuild_done(self, time_interval):
        """Rebuild is completed/done.
        Args:
            time_interval: Wait interval between checks
        Returns:
            False: If rebuild_status not "done" or "completed".
            True: If rebuild status is "done" or "completed".
        """
        status = False
        fail_count = 0
        completion_flag = ["done", "completed"]
        while fail_count <= 20:
            rebuild_status = self.get_rebuild_status()
            if rebuild_status in completion_flag:
                status = True
                break
            time.sleep(time_interval)
            fail_count += 1
        return status
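    # Usage sketch (an assumption, not from the original source): callers
    # would typically poll after triggering a drain/exclude, e.g.:
    #     self.assertTrue(self.is_rebuild_done(3),
    #                     "Rebuild did not complete in time")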

    @fail_on(CommandFailure)
    def assert_on_rebuild_failure(self):
        """If the rebuild is not successful,
        raise assert.
        """
        rebuild_status = self.get_rebuild_status()
        self.log.info("Rebuild Status: %s", rebuild_status)
        rebuild_failed_string = ["failed", "scanning", "aborted", "busy"]
        self.assertTrue(rebuild_status not in rebuild_failed_string,
                        "Rebuild failed")

    @fail_on(CommandFailure)
    def get_pool_version(self):
        """Get the pool version.

        Returns:
            int: pool_version_value

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["version"])

    @fail_on(DaosApiError)
    def write_single_object(self):
        """Write some data to the existing pool."""
        self.pool.connect(2)
        csum = self.params.get("enable_checksum", '/run/container/*')
        self.container = DaosContainer(self.context)
        input_param = self.container.cont_input_values
        input_param.enable_chksum = csum
        self.container.create(poh=self.pool.pool.handle, con_prop=input_param)
        self.container.open()
        self.obj = DaosObj(self.context, self.container)
        self.obj.create(objcls=1)
        self.obj.open()
        self.ioreq = IORequest(self.context,
                               self.container,
                               self.obj,
                               objtype=4)
        self.log.info("Writing the Single Dataset")
        for dkey in range(self.no_of_dkeys):
            for akey in range(self.no_of_akeys):
                indata = ("{0}".format(str(akey)[0]) * self.record_length)
                d_key_value = "dkey {0}".format(dkey)
                c_dkey = ctypes.create_string_buffer(d_key_value)
                a_key_value = "akey {0}".format(akey)
                c_akey = ctypes.create_string_buffer(a_key_value)
                c_value = ctypes.create_string_buffer(indata)
                c_size = ctypes.c_size_t(ctypes.sizeof(c_value))
                self.ioreq.single_insert(c_dkey, c_akey, c_value, c_size)
        self.obj.close()
        self.container.close()

    @fail_on(DaosApiError)
    def verify_single_object(self):
        """Verify the container data on the existing pool."""
        self.pool.connect(2)
        self.container.open()
        self.obj.open()
        self.log.info("Single Dataset Verification -- Started")
        for dkey in range(self.no_of_dkeys):
            for akey in range(self.no_of_akeys):
                indata = ("{0}".format(str(akey)[0]) * self.record_length)
                c_dkey = ctypes.create_string_buffer("dkey {0}".format(dkey))
                c_akey = ctypes.create_string_buffer("akey {0}".format(akey))
                val = self.ioreq.single_fetch(c_dkey, c_akey, len(indata) + 1)
                if indata != (repr(val.value)[1:-1]):
                    self.d_log.error("ERROR:Data mismatch for "
                                     "dkey = {0}, "
                                     "akey = {1}".format(
                                         "dkey {0}".format(dkey),
                                         "akey {0}".format(akey)))
                    self.fail(
                        "ERROR: Data mismatch for dkey = {0}, akey={1}".format(
                            "dkey {0}".format(dkey), "akey {0}".format(akey)))
        self.obj.close()
        self.container.close()
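    # Usage sketch (an assumption, not from the original source):
    # write_single_object and verify_single_object are meant to bracket a
    # fault-injection step, e.g.:
    #     self.write_single_object()
    #     # ... exclude or drain a rank and wait for rebuild ...
    #     self.verify_single_object()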

    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Start threads and wait until all threads are finished.

        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR api
            test (list): IOR test sequence
            flags (str): IOR flags
            results (queue): queue for returning thread results

        """
        container_info = {}
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed on: "
                      "{}".format(self.hostlist_clients))
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        container_info["{}{}{}".format(oclass, api,
                                       test[2])] = str(uuid.uuid4())

        # Define the job manager for the IOR command
        self.job_manager = Mpirun(ior_cmd, mpitype="mpich")
        key = "".join([oclass, api, str(test[2])])
        self.job_manager.job.dfs_cont.update(container_info[key])
        env = ior_cmd.get_default_env(str(self.job_manager))
        self.job_manager.assign_hosts(self.hostlist_clients, self.workdir,
                                      None)
        self.job_manager.assign_processes(self.processes)
        self.job_manager.assign_environment(env, True)

        # run IOR Command
        try:
            self.job_manager.run()
        except CommandFailure as _error:
            results.put("FAIL")
Example #5
class NvmePoolCapacity(TestWithServers):
    # pylint: disable=too-many-ancestors
    """Test class Description: Verify NOSPC
    condition is reported when accessing data beyond
    pool size.

    :avocado: recursive
    """
    def setUp(self):
        """Set up for test case."""
        super(NvmePoolCapacity, self).setUp()

        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get("ior_test_sequence",
                                                 '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get("obj_class",
                                              '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(self.hostlist_clients,
                                                self.workdir, None)
        self.pool = None
        self.out_queue = queue.Queue()

    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Start threads and wait until all threads are finished.

        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR API
            test (list): IOR test sequence
            flags (str): IOR flags
            results (queue): queue for returning thread results

        Returns:
            None

        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        container_info["{}{}{}".format(oclass, api,
                                       test[2])] = str(uuid.uuid4())

        # Define the job manager for the IOR command
        self.job_manager = Mpirun(ior_cmd, mpitype="mpich")
        key = "{}{}{}".format(oclass, api, test[2])
        self.job_manager.job.dfs_cont.update(container_info[key])
        env = ior_cmd.get_default_env(str(self.job_manager))
        self.job_manager.assign_hosts(self.hostlist_clients, self.workdir,
                                      None)
        self.job_manager.assign_processes(processes)
        self.job_manager.assign_environment(env, True)

        # run IOR Command
        try:
            self.job_manager.run()
        except CommandFailure as _error:
            results.put("FAIL")

    def test_create_delete(self,
                           num_pool=2,
                           num_cont=5,
                           total_count=100,
                           scm_size=100000000000,
                           nvme_size=300000000000):
        """
        Test Description:
            This method is used to create/delete pools
            for a long run. It verifies the NVME free space
            during this process.
            Args:
                num_pool (int): Total pools for running test
                num_cont (int): Total containers created on each pool
                total_count (int): Total times the test is run in a loop
                scm_size (int): SCM size used in the testing
                nvme_size (int): NVME size used in the testing
            Returns:
                None
        """
        pool = {}
        cont = {}
        nvme_size_begin = {}

        for loop_count in range(0, total_count):
            self.log.info("Running test %s", loop_count)
            for val in range(0, num_pool):
                pool[val] = TestPool(self.context, self.get_dmg_command())
                pool[val].get_params(self)
                # Split total SCM and NVME size for creating multiple pools.
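                # e.g. with the default scm_size=100000000000 (100 GB) and
                # num_pool=2, each pool gets 50 GB of SCM and 150 GB of NVMe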
                temp = int(scm_size) // num_pool
                pool[val].scm_size.update(str(temp))
                temp = int(nvme_size) // num_pool
                pool[val].nvme_size.update(str(temp))
                pool[val].create()
                self.pool = pool[val]
                display_string = "pool{} space at the Beginning".format(val)
                self.pool.display_pool_daos_space(display_string)
                nvme_size_begin = self.pool.get_pool_free_space("NVME")
                for cont_val in range(0, num_cont):
                    cont[cont_val] = TestContainer(pool[val])
            m_leak = 0
            for val in range(0, num_pool):
                display_string = "Pool{} space at the End".format(val)
                self.pool = pool[val]
                self.pool.display_pool_daos_space(display_string)
                nvme_size_end = self.pool.get_pool_free_space("NVME")
                pool[val].destroy()
                if (nvme_size_begin[val] != nvme_size_end) and (m_leak == 0):
                    m_leak = val + 1
            # After destroying pools, check for an NVMe space leak in each
            # test loop.
            if m_leak != 0:
                self.fail("NVMe space leak: pool {0}\n".format(m_leak))

    def test_run(self, num_pool=1):
        """
        Method Description:
            This method is called with different test_cases.
            Args:
               num_pool (int): Total pools for running a test.
            Returns:
               None
        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        # Create a pool
        pool = {}

        # Iterate through the different IOR test sequences
        for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                self.ior_apis,
                                                self.ior_test_sequence,
                                                self.ior_flags):
            # Create the IOR threads
            threads = []
            for val in range(0, num_pool):
                pool[val] = TestPool(self.context, self.get_dmg_command())
                pool[val].get_params(self)
                # Split total SCM and NVME size for creating multiple pools.
                pool[val].scm_size.value = int(test[0]) // num_pool
                pool[val].nvme_size.value = int(test[1]) // num_pool
                pool[val].create()
                display_string = "pool{} space at the Beginning".format(val)
                self.pool = pool[val]
                self.pool.display_pool_daos_space(display_string)

                for thrd in range(0, num_jobs):
                    # Add a thread for these IOR arguments
                    threads.append(
                        threading.Thread(target=self.ior_thread,
                                         kwargs={
                                             "pool": pool[val],
                                             "oclass": oclass,
                                             "api": api,
                                             "test": test,
                                             "flags": flags,
                                             "results": self.out_queue
                                         }))
            # Launch the IOR threads
            for thrd in threads:
                self.log.info("Thread : %s", thrd)
                thrd.start()
                time.sleep(5)
            # Wait to finish the threads
            for thrd in threads:
                thrd.join()

            # Verify the queue and make sure no FAIL for any IOR run
            # Test should fail with ENOSPC.
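            # test[4] is assumed to hold the expected IOR outcome ("PASS" or
            # "FAIL") from the yaml test sequence; an unexpected result in
            # either direction fails the test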
            while not self.out_queue.empty():
                result = self.out_queue.get()
                if (result == "FAIL" and test[4] == "PASS") or \
                        (result != "FAIL" and test[4] == "FAIL"):
                    self.fail("FAIL")

            for val in range(0, num_pool):
                display_string = "Pool{} space at the End".format(val)
                self.pool = pool[val]
                self.pool.display_pool_daos_space(display_string)
                self.pool.destroy()

    def test_nvme_pool_capacity(self):
        """Jira ID: DAOS-2085.

        Test Description:
            The purpose of this test is to verify that the DAOS stack
            reports NOSPC when accessing data beyond the pool size.
            Use Cases
            Test Case 1 or 2:
             1. Perform IO less than entire SSD disk space.
             2. Perform IO beyond entire SSD disk space.
            Test Case 3:
             3. Create Pool/Container and destroy them several times.

        :avocado: tags=all,hw,medium,ib2,nvme,full_regression
        :avocado: tags=nvme_pool_capacity
        """
        # Run test with one pool.
        self.log.info("Running Test Case 1 with one Pool")
        self.test_run(1)
        time.sleep(5)
        # Run test with two pools.
        self.log.info("Running Test Case 1 with two Pools")
        self.test_run(2)
        time.sleep(5)
        # Run Create/delete pool/container
        self.log.info("Running Test Case 3: Pool/Cont Create/Destroy")
        self.test_create_delete(10, 50, 100)
Example #6
    def test_rebuild_container_create(self):
        """Jira ID: DAOS-1168.

        Test Description:
            Configure 4 servers and 1 client with 1 or 2 pools and a pool
            service leader quantity of 2.  Add 1 container to the first pool
            configured with 3 replicas.  Populate the container with 1GB of
            objects.  Exclude a server that has shards of this object and
            verify that rebuild is initiated.  While rebuild is active, create
            1000 additional containers in the same pool or the second pool
            (when available).  Finally verify that rebuild completes and the
            pool info indicates the correct number of rebuilt objects and
            records.  Also confirm that all 1000 additional containers created
            during rebuild are accessible.

        Use Cases:
            Basic rebuild of container objects of array values with sufficient
            numbers of rebuild targets and no available rebuild targets.

        :avocado: tags=all,medium,full_regression,rebuild,rebuildcontcreate
        """
        # Get test params
        targets = self.params.get("targets", "/run/server_config/*")
        pool_qty = self.params.get("pools", "/run/test/*")
        loop_qty = self.params.get("loops", "/run/test/*")
        cont_qty = self.params.get("containers", "/run/test/*")
        cont_obj_cls = self.params.get("container_obj_class", "/run/test/*")
        rank = self.params.get("rank", "/run/test/*")
        use_ior = self.params.get("use_ior", "/run/test/*", False)
        node_qty = len(self.hostlist_servers)

        # Get pool params
        self.pool = []
        for index in range(pool_qty):
            self.pool.append(
                TestPool(self.context, dmg_command=self.get_dmg_command()))
            self.pool[-1].get_params(self)

        if use_ior:
            # Get ior params
            mpirun = Mpirun(IorCommand())
            mpirun.job.get_params(self)
            mpirun.assign_hosts(
                self.hostlist_clients, self.workdir,
                self.hostfile_clients_slots)
            mpirun.assign_processes(len(self.hostlist_clients))
            mpirun.assign_environment(mpirun.job.get_default_env("mpirun"))

        # Cancel any tests with tickets already assigned
        if rank in (1, 2):
            self.cancelForTicket("DAOS-2434")

        errors = [0 for _ in range(loop_qty)]
        for loop in range(loop_qty):
            # Log the start of the loop
            loop_id = "LOOP {}/{}".format(loop + 1, loop_qty)
            self.log.info("%s", "-" * 80)
            self.log.info("%s: Starting loop", loop_id)

            # Start this loop with a fresh list of containers
            self.container = []

            # Create the requested number of pools
            info_checks = []
            rebuild_checks = []
            for pool in self.pool:
                pool.create()
                info_checks.append(
                    {
                        "pi_uuid": pool.uuid,
                        "pi_ntargets": node_qty * targets,
                        "pi_nnodes": node_qty,
                        "pi_ndisabled": 0,
                    }
                )
                rebuild_checks.append(
                    {
                        "rs_errno": 0,
                        "rs_done": 1,
                        "rs_obj_nr": 0,
                        "rs_rec_nr": 0,
                    }
                )

            # Check the pool info
            status = True
            for index, pool in enumerate(self.pool):
                status &= pool.check_pool_info(**info_checks[index])
                status &= pool.check_rebuild_status(**rebuild_checks[index])
                pool.display_pool_daos_space("after creation")
            self.assertTrue(
                status,
                "Error verifying pool info prior to excluding rank {}".format(
                    rank))

            # Create a container with 1GB of data in the first pool
            if use_ior:
                mpirun.job.flags.update("-v -w -W -G 1 -k", "ior.flags")
                mpirun.job.dfs_destroy.update(False, "ior.dfs_destroy")
                mpirun.job.set_daos_params(self.server_group, self.pool[0])
                self.log.info(
                    "%s: Running IOR on pool %s to fill container %s with data",
                    loop_id, self.pool[0].uuid, mpirun.job.dfs_cont.value)
                self.run_ior(loop_id, mpirun)
            else:
                self.container.append(TestContainer(self.pool[0]))
                self.container[-1].get_params(self)
                self.container[-1].create()
                self.log.info(
                    "%s: Writing to pool %s to fill container %s with data",
                    loop_id, self.pool[0].uuid, self.container[-1].uuid)
                self.container[-1].object_qty.value = 8
                self.container[-1].record_qty.value = 64
                self.container[-1].data_size.value = 1024 * 1024
                self.container[-1].write_objects(rank, cont_obj_cls)
                rank_list = self.container[-1].get_target_rank_lists(
                    " after writing data")
                self.container[-1].get_target_rank_count(rank, rank_list)

            # Display the updated pool space usage
            for pool in self.pool:
                pool.display_pool_daos_space("after container creation")

            # Exclude the first rank from the first pool to initiate rebuild
            self.pool[0].start_rebuild([rank], self.d_log)

            # Wait for rebuild to start
            self.pool[0].wait_for_rebuild(True, 1)

            # Create additional containers in the last pool
            start_index = len(self.container)
            self.add_containers_during_rebuild(
                loop_id, cont_qty, self.pool[0], self.pool[-1])

            # Confirm rebuild completes
            self.pool[0].wait_for_rebuild(False, 1)

            # Check the pool info
            info_checks[0]["pi_ndisabled"] += targets
            rebuild_checks[0]["rs_done"] = 1
            rebuild_checks[0]["rs_obj_nr"] = ">=0"
            rebuild_checks[0]["rs_rec_nr"] = ">=0"
            for index, pool in enumerate(self.pool):
                status &= pool.check_pool_info(**info_checks[index])
                status &= pool.check_rebuild_status(**rebuild_checks[index])
            self.assertTrue(status, "Error verifying pool info after rebuild")

            # Verify that each of created containers exist by opening them
            for index in range(start_index, len(self.container)):
                count = "{}/{}".format(
                    index - start_index + 1, len(self.container) - start_index)
                if not self.access_container(loop_id, index, count):
                    errors[loop] += 1

            # Destroy the containers created during rebuild
            for index in range(start_index, len(self.container)):
                self.container[index].destroy()

            # Read the data from the container created before rebuild
            if use_ior:
                self.log.info(
                    "%s: Running IOR on pool %s to verify container %s",
                    loop_id, self.pool[0].uuid, mpirun.job.dfs_cont.value)
                mpirun.job.flags.update("-v -r -R -G 1 -E", "ior.flags")
                mpirun.job.dfs_destroy.update(True, "ior.dfs_destroy")
                self.run_ior(loop_id, mpirun)
            else:
                self.log.info(
                    "%s: Reading pool %s to verify container %s",
                    loop_id, self.pool[0].uuid, self.container[0].uuid)
                self.assertTrue(
                    self.container[0].read_objects(),
                    "Error verifying data written before rebuild")
                self.container[0].destroy()

            # Destroy the pools
            for pool in self.pool:
                pool.destroy(1)

            self.log.info(
                "%s: Loop %s", loop_id,
                "passed" if errors[loop] == 0 else "failed")

        self.log.info("Test %s", "passed" if sum(errors) == 0 else "failed")
Example #7
class NvmeEnospace(ServerFillUp):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: To validate DER_NOSPACE for SCM and NVMe
    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a NvmeEnospace object."""
        super(NvmeEnospace, self).__init__(*args, **kwargs)
        self.daos_cmd = None

    def setUp(self):
        super(NvmeEnospace, self).setUp()

        # initialize daos command
        self.daos_cmd = DaosCommand(self.bin)
        self.create_pool_max_size()
        self.der_nospace_count = 0
        self.other_errors_count = 0

    def verify_enspace_log(self, der_nospace_err_count):
        """
        Function to verify there are no other error except DER_NOSPACE
        in client log and also DER_NOSPACE count is higher.

        args:
            expected_err_count(int): Expected DER_NOSPACE count from client log.
        """
        #Get the DER_NOSPACE and other error count from log
        self.der_nospace_count, self.other_errors_count = error_count(
            "-1007", self.hostlist_clients, self.client_log)

        #Check there are no other errors in log file
        if self.other_errors_count > 0:
            self.fail('Found other errors, count {} in client log {}'
                      .format(self.other_errors_count, self.client_log))
        #Check the DER_NOSPACE error count is high enough; if not, fail
        if self.der_nospace_count < der_nospace_err_count:
            self.fail('Expected DER_NOSPACE count >= {}, found {}'
                      .format(der_nospace_err_count, self.der_nospace_count))

    def delete_all_containers(self):
        """
        Delete all the containers.
        """
        #List all the containers
        kwargs = {"pool": self.pool.uuid}
        data = self.daos_cmd.pool_list_cont(**kwargs)
        containers = data["uuids"]

        #Destroy all the containers
        for _cont in containers:
            kwargs["cont"] = _cont
            self.daos_cmd.container_destroy(**kwargs)

    def ior_bg_thread(self, results):
        """Start IOR Background thread, This will write small data set and
        keep reading it in loop until it fails or main program exit.

        Args:
            results (queue): queue for returning thread results
        """
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")

        # Define the IOR Command and use the parameter from yaml file.
        ior_bg_cmd = IorCommand()
        ior_bg_cmd.get_params(self)
        ior_bg_cmd.set_daos_params(self.server_group, self.pool)
        ior_bg_cmd.dfs_oclass.update(self.ior_cmd.dfs_oclass.value)
        ior_bg_cmd.api.update(self.ior_cmd.api.value)
        ior_bg_cmd.transfer_size.update(self.ior_scm_xfersize)
        ior_bg_cmd.block_size.update(self.ior_cmd.block_size.value)
        ior_bg_cmd.flags.update(self.ior_cmd.flags.value)
        ior_bg_cmd.test_file.update('/testfile_background')

        # Define the job manager for the IOR command
        self.job_manager = Mpirun(ior_bg_cmd, mpitype="mpich")
        self.create_cont()
        self.job_manager.job.dfs_cont.update(self.container.uuid)
        env = ior_bg_cmd.get_default_env(str(self.job_manager))
        self.job_manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        self.job_manager.assign_processes(1)
        self.job_manager.assign_environment(env, True)
        print('----Run IOR in Background-------')
        # run IOR Write Command
        try:
            self.job_manager.run()
        except (CommandFailure, TestFail) as _error:
            results.put("FAIL")
            return

        # run IOR Read Command in loop
        ior_bg_cmd.flags.update(self.ior_read_flags)
        while True:
            try:
                self.job_manager.run()
            except (CommandFailure, TestFail) as _error:
                results.put("FAIL")
                break

    def run_enospace_foreground(self):
        """
        Function to run test and validate DER_ENOSPACE and expected storage size
        """
        #Fill 75% more of SCM pool,Aggregation is Enabled so NVMe space will be
        #start filling
        print('Starting main IOR load')
        self.start_ior_load(storage='SCM', percent=75)
        print(self.pool.pool_percentage_used())

        #Fill 50% more of the SCM pool; aggregation is enabled, so NVMe
        #space will be filled
        self.start_ior_load(storage='SCM', percent=50)
        print(self.pool.pool_percentage_used())

        #Fill 60% more of the SCM pool. NVMe is now full, so data will not
        #be moved to NVMe and SCM will keep filling until it is full; this
        #command is expected to fail with DER_NOSPACE
        try:
            self.start_ior_load(storage='SCM', percent=60)
            self.fail('This test is supposed to fail with DER_NOSPACE '
                      'but it passed')
        except TestFail as _error:
            self.log.info('Test expected to fail because of DER_NOSPACE')

        #Display the pool usage %
        print(self.pool.pool_percentage_used())

        #Verify the DER_NOSPACE error count is as expected and there is no
        #other error in the client log
        self.verify_enspace_log(self.der_nospace_count)

        #Check both NVMe and SCM are full.
        pool_usage = self.pool.pool_percentage_used()
        #NVMe should be almost full; if not, the test will fail.
        if pool_usage['nvme'] > 8:
            self.fail('Pool NVMe used percentage should be < 8%, instead {}'.
                      format(pool_usage['nvme']))
        #SCM keeps some space for system use, so it won't be 100% full.
        if pool_usage['scm'] > 50:
            self.fail('Pool SCM used percentage should be < 50%, instead {}'.
                      format(pool_usage['scm']))

    def run_enospace_with_bg_job(self):
        """
        Function to run test and validate DER_ENOSPACE and expected storage
        size. Single IOR job will run in background while space is filling.
        """
        #Get the initial DER_NOSPACE count
        self.der_nospace_count, self.other_errors_count = error_count(
            "-1007", self.hostlist_clients, self.client_log)

        # Start the IOR Background thread which will write small data set and
        # read in loop, until storage space is full.
        out_queue = queue.Queue()
        job = threading.Thread(target=self.ior_bg_thread,
                               kwargs={"results": out_queue})
        job.daemon = True
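        # Daemonizing the thread lets the test exit even if the background
        # IOR read loop is still running when the foreground checks finish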
        job.start()

        #Run IOR in Foreground
        self.run_enospace_foreground()
        # Verify the background job queue and make sure no IOR run FAILed
        while not out_queue.empty():
            if out_queue.get() == "FAIL":
                self.fail("One of the background IOR jobs failed")

    def test_enospace_lazy_with_bg(self):
        """Jira ID: DAOS-4756.

        Test Description: IO gets DER_NOSPACE when SCM and NVMe are full
                          with the default (lazy) aggregation mode.

        Use Case: This test creates the pool and fills 75% of the SCM size,
                  which triggers aggregation because of space pressure, then
                  fills 75% more, which should fill NVMe. It tries to fill
                  60% more, at which point SCM is full too. Verify that the
                  last IO fails with DER_NOSPACE and the SCM/NVMe pool
                  capacity is full. One background IO job runs continuously.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_lazy,enospc_lazy_bg
        """
        print(self.pool.pool_percentage_used())

        #Run IOR to fill the pool.
        self.run_enospace_with_bg_job()

    def test_enospace_lazy_with_fg(self):
        """Jira ID: DAOS-4756.

        Test Description: Fill up the system (default aggregation mode) and
                          delete all containers in a loop, which should
                          release the space.

        Use Case: This test creates the pool and fills 75% of the SCM size,
                  which triggers aggregation because of space pressure, then
                  fills 75% more, which should fill NVMe. It tries to fill
                  60% more, at which point SCM is full too. Verify that the
                  last IO fails with DER_NOSPACE and the SCM/NVMe pool
                  capacity is full. Delete all the containers. Do this in a
                  loop 10 times and verify space is released.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_lazy,enospc_lazy_fg
        """
        print(self.pool.pool_percentage_used())

        #Repeat the test in a loop.
        for _loop in range(10):
            print("-------enospc_lazy_fg Loop--------- {}".format(_loop))
            #Run IOR to fill the pool.
            self.run_enospace_foreground()
            #Delete all the containers
            self.delete_all_containers()
            #Container deletion takes some time to release the space
            time.sleep(60)

        #Run last IO
        self.start_ior_load(storage='SCM', percent=1)

    def test_enospace_time_with_bg(self):
        """Jira ID: DAOS-4756.

        Test Description: IO gets DER_NOSPACE when SCM is full, and the
                          space is released on container destroy with
                          aggregation set to time mode.

        Use Case: This test creates the pool and sets the aggregation mode
                  to time. It fills 75% of the SCM size; aggregation is
                  triggered periodically. It then fills 75% more, which
                  fills up NVMe, and tries to fill 60% more, at which point
                  SCM is full too. Verify that the last IO fails with
                  DER_NOSPACE and the SCM/NVMe pool capacity is full. One
                  background IO job runs continuously.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_time,enospc_time_bg
        """
        print(self.pool.pool_percentage_used())

        # Enable time mode for aggregation.
        self.pool.set_property("reclaim", "time")

        #Run IOR to fill the pool.
        self.run_enospace_with_bg_job()

    def test_enospace_time_with_fg(self):
        """Jira ID: DAOS-4756.

        Test Description: Fill up the system (time aggregation mode) and
                          delete all containers in a loop, which should
                          release the space.

        Use Case: This test creates the pool and sets the aggregation mode
                  to time. It fills 75% of the SCM size; aggregation is
                  triggered periodically. It then fills 75% more, which
                  fills up NVMe, and tries to fill 60% more, at which point
                  SCM is full too. Verify that the last IO fails with
                  DER_NOSPACE and the SCM/NVMe pool capacity is full. Delete
                  all the containers. Do this in a loop 10 times and verify
                  space is released.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_time,enospc_time_fg
        """
        print(self.pool.pool_percentage_used())

        # Enable time mode for aggregation.
        self.pool.set_property("reclaim", "time")

        #Repeat the test in a loop.
        for _loop in range(10):
            print("-------enospc_time_fg Loop--------- {}".format(_loop))
            #Run IOR to fill the pool.
            self.run_enospace_with_bg_job()
            #Delete all the containers
            self.delete_all_containers()
            #Container deletion takes some time to release the space
            time.sleep(60)

        #Run last IO
        self.start_ior_load(storage='SCM', percent=1)

    @skipForTicket("DAOS-5403")
    def test_performance_storage_full(self):
        """Jira ID: DAOS-4756.

        Test Description: Verify IO Read performance when pool size is full.

        Use Case: This test creates the pool and runs a small set of IOR as
                  a baseline. It then starts IOR with a transfer size < 4K,
                  which starts filling SCM, triggers aggregation, and starts
                  filling up NVMe. Check the IOR baseline read number and
                  make sure it is within +-5% of the number measured before
                  the system storage was full.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_performance
        """
        #Write the IOR Baseline and get the Read BW for later comparison.
        print(self.pool.pool_percentage_used())
        #Write First
        self.start_ior_load(storage='SCM', percent=1)
        #Read the baseline data set
        self.start_ior_load(storage='SCM', operation='Read', percent=1)
        max_mib_baseline = float(self.ior_matrix[0][int(IorMetrics.Max_MiB)])
        baseline_cont_uuid = self.ior_cmd.dfs_cont.value
        print("IOR Baseline Read MiB {}".format(max_mib_baseline))

        #Run IOR to fill the pool.
        self.run_enospace_with_bg_job()

        #Read the same container which was written at the beginning.
        self.container.uuid = baseline_cont_uuid
        self.start_ior_load(storage='SCM', operation='Read', percent=1)
        max_mib_latest = float(self.ior_matrix[0][int(IorMetrics.Max_MiB)])
        print("IOR Latest Read MiB {}".format(max_mib_latest))

        #Check that the latest IOR read performance is within a 5% tolerance
        #of the baseline now that the storage space is full.
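        # e.g. with a 1000 MiB/s baseline the latest read must fall within
        # [950, 1050] MiB/s; a latest value of 940 gives abs(1000 - 940) =
        # 60 > 50 and fails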
        if abs(max_mib_baseline-max_mib_latest) > (max_mib_baseline/100 * 5):
            self.fail('Latest IOR read performance is not within the 5% '
                      'tolerance: baseline read MiB = {}, latest read '
                      'MiB = {}'.format(max_mib_baseline, max_mib_latest))

    def test_enospace_no_aggregation(self):
        """Jira ID: DAOS-4756.

        Test Description: IO gets DER_NOSPACE when SCM is full, and the
                          space is released on container destroy with
                          aggregation disabled.

        Use Case: This test creates the pool and disables aggregation. It
                  fills 75% of the SCM size, which should work, then tries
                  to fill 10% more, which should fail with DER_NOSPACE.
                  Destroy the container and validate that the pool SCM free
                  size is close to full (> 95%). Do this in a loop ~10 times
                  and verify the DER_NOSPACE count and the SCM free size
                  after container destroy.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_no_aggregation
        """
        # pylint: disable=attribute-defined-outside-init
        # pylint: disable=too-many-branches
        print(self.pool.pool_percentage_used())

        # Disable the aggregation
        self.pool.set_property("reclaim", "disabled")

        #Get the DER_NOSPACE and other error count from log
        self.der_nospace_count, self.other_errors_count = error_count(
            "-1007", self.hostlist_clients, self.client_log)

        #Repeat the test in a loop.
        for _loop in range(10):
            print("-------enospc_no_aggregation Loop--------- {}".format(_loop))
            #Fill 40% of the SCM pool
            self.start_ior_load(storage='SCM', percent=40)

            print(self.pool.pool_percentage_used())

            try:
                #Fill 40% more of SCM, which should fail with no SCM space
                self.start_ior_load(storage='SCM', percent=40)
                self.fail('This test is supposed to fail with DER_NOSPACE '
                          'but it passed')
            except TestFail as _error:
                self.log.info('Expected to fail because of DER_NOSPACE')

            #Verify the DER_NOSPACE error count is as expected and there is
            #no other error in the client log.
            self.verify_enspace_log(self.der_nospace_count)

            #Delete all the containers
            self.delete_all_containers()

            #Delay to let the SCM space be released.
            time.sleep(60)
            #Get the pool usage after the delay
            pool_usage = self.pool.pool_percentage_used()
            print(pool_usage)
            #The SCM pool space should be released (some is still used by
            #the system), so SCM used should not exceed 55%
            if pool_usage['scm'] > 55:
                self.fail('SCM pool used percentage should be < 55, instead '
                          '{}'.format(pool_usage['scm']))

        #Run last IO
        self.start_ior_load(storage='SCM', percent=1)
Example #8
    def ior_runner_thread(self, results):
        """Start threads and wait until all threads are finished.

        Destroy the container at the end of this thread run.

        Args:
            results (queue): queue for returning thread results

        Returns:
            None

        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        cmd = DaosCommand(os.path.join(self.prefix, "bin"))
        cmd.set_sub_command("container")
        cmd.sub_command_class.set_sub_command("destroy")
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")

        # Iterate through the different IOR values and run in sequence
        for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                self.ior_apis,
                                                self.ior_transfer_size,
                                                self.ior_flags):
            # Define the arguments for the ior_runner_thread method
            ior_cmd = IorCommand()
            ior_cmd.get_params(self)
            ior_cmd.set_daos_params(self.server_group, self.pool)
            ior_cmd.dfs_oclass.update(oclass)
            ior_cmd.api.update(api)
            ior_cmd.transfer_size.update(test[0])
            ior_cmd.block_size.update(test[1])
            ior_cmd.flags.update(flags)

            container_info["{}{}{}"
                           .format(oclass,
                                   api,
                                   test[0])] = str(uuid.uuid4())

            # Define the job manager for the IOR command
            manager = Mpirun(ior_cmd, mpitype="mpich")
            manager.job.dfs_cont.update(container_info
                                         ["{}{}{}".format(oclass,
                                                          api,
                                                          test[0])])
            env = ior_cmd.get_default_env(str(manager))
            manager.assign_hosts(self.hostlist_clients, self.workdir, None)
            manager.assign_processes(processes)
            manager.assign_environment(env, True)

            # run IOR Command
            try:
                manager.run()
            except CommandFailure as _error:
                results.put("FAIL")

        # Destroy the container created by thread
        for key in container_info:
            cmd.sub_command_class.sub_command_class.pool.value = self.pool.uuid
            cmd.sub_command_class.sub_command_class.svc.value = \
                self.pool.svc_ranks
            cmd.sub_command_class.sub_command_class.cont.value = \
                container_info[key]

            try:
                # pylint: disable=protected-access
                cmd._get_result()
            except CommandFailure as _error:
                results.put("FAIL")
Example #9
class NvmeFragmentation(TestWithServers):
    # pylint: disable=too-many-ancestors
    # pylint: disable=too-many-instance-attributes
    """NVMe drive fragmentation test cases.

    Test class Description:
        Verify that drive fragmentation frees the space and does not lead
        to ENOSPC.

    :avocado: recursive
    """
    def setUp(self):
        """Set up for test case."""
        super().setUp()

        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_transfer_size = self.params.get("transfer_block_size",
                                                 '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get("obj_class",
                                              '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(self.hostlist_clients,
                                                self.workdir, None)
        self.pool = None
        self.out_queue = queue.Queue()

    def ior_runner_thread(self, results):
        """Start threads and wait until all threads are finished.

        Destroy the container at the end of this thread run.

        Args:
            results (queue): queue for returning thread results

        Returns:
            None

        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        cmd = DaosCommand(os.path.join(self.prefix, "bin"))
        cmd.set_sub_command("container")
        cmd.sub_command_class.set_sub_command("destroy")
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")

        # Iterate through the different IOR values and run in sequence
        for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                self.ior_apis,
                                                self.ior_transfer_size,
                                                self.ior_flags):
            # Define the arguments for the ior_runner_thread method
            ior_cmd = IorCommand()
            ior_cmd.get_params(self)
            ior_cmd.set_daos_params(self.server_group, self.pool)
            ior_cmd.dfs_oclass.update(oclass)
            ior_cmd.api.update(api)
            ior_cmd.transfer_size.update(test[0])
            ior_cmd.block_size.update(test[1])
            ior_cmd.flags.update(flags)

            # Define the job manager for the IOR command
            self.job_manager = Mpirun(ior_cmd, mpitype="mpich")
            cont_uuid = str(uuid.uuid4())
            self.job_manager.job.dfs_cont.update(cont_uuid)
            env = ior_cmd.get_default_env(str(self.job_manager))
            self.job_manager.assign_hosts(self.hostlist_clients, self.workdir,
                                          None)
            self.job_manager.assign_processes(processes)
            self.job_manager.assign_environment(env, True)

            # run IOR Command
            try:
                self.job_manager.run()
                container_info["{}{}{}".format(oclass, api,
                                               test[0])] = cont_uuid
            except CommandFailure as _error:
                results.put("FAIL")

        # Destroy the container created by thread
        for key in container_info:
            cmd.sub_command_class.sub_command_class.pool.value = self.pool.uuid
            #cmd.sub_command_class.sub_command_class.svc.value = \
            #    self.pool.svc_ranks
            cmd.sub_command_class.sub_command_class.cont.value = \
                container_info[key]

            try:
                # pylint: disable=protected-access
                cmd._get_result()
            except CommandFailure as _error:
                results.put("FAIL")

    def test_nvme_fragmentation(self):
        """Jira ID: DAOS-2332.

        Test Description:
            The purpose of this test is to verify there is no fragmentation
            after doing IO write/delete operations for ~1 hour.

        Use case:
        Create objects with different transfer sizes in parallel (10 IOR
        threads). Delete the container created by IOR, which deallocates the
        NVMe blocks. Run the above steps in a loop for some time (~1 hour)
        and expect no ENOSPC failures.

        :avocado: tags=all,full_regression
        :avocado: tags=hw,medium
        :avocado: tags=nvme,ib2,nvme_fragmentation
        """
        no_of_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        # Create a pool
        self.add_pool(connect=False)
        self.pool.display_pool_daos_space("Pool space at the Beginning")

        # Repeat the test for 30 times which will take ~1 hour
        for test_loop in range(30):
            self.log.info("--Test Repeat for loop %s---", test_loop)
            # Create the IOR threads
            threads = []
            for _ in range(no_of_jobs):
                # Add a thread for these IOR arguments
                threads.append(
                    threading.Thread(target=self.ior_runner_thread,
                                     kwargs={"results": self.out_queue}))
            # Launch the IOR threads
            for thrd in threads:
                thrd.start()
                time.sleep(5)
            # Wait to finish the threads
            for thrd in threads:
                thrd.join()

            # Verify the queue and make sure no FAIL for any IOR run
            while not self.out_queue.empty():
                if self.out_queue.get() == "FAIL":
                    self.fail("FAIL")

        self.pool.display_pool_daos_space("Pool space at the End")
Example #10
class OSAOnlineDrain(TestWithServers):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: This test runs
    daos_server Online Drain test cases.

    :avocado: recursive
    """
    def setUp(self):
        """Set up for test case."""
        super(OSAOnlineDrain, self).setUp()
        self.dmg_command = self.get_dmg_command()
        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get("ior_test_sequence",
                                                 '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get("obj_class",
                                              '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(self.hostlist_clients,
                                                self.workdir, None)
        self.pool = None
        self.out_queue = queue.Queue()

    @fail_on(CommandFailure)
    def get_pool_leader(self):
        """Get the pool leader.

        Returns:
            int: pool leader value

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["leader"])

    @fail_on(CommandFailure)
    def get_pool_version(self):
        """Get the pool version.

        Returns:
            int: pool_version_value

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["version"])

    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Start threads and wait until all threads are finished.
        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            API (str): IOR API
            test (list): IOR test sequence
            flags (str): IOR flags
            results (queue): queue for returning thread results
        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        # Record the new container UUID under a key unique to this IOR run
        key = "{}{}{}".format(oclass, api, test[2])
        container_info[key] = str(uuid.uuid4())

        # Define the job manager for the IOR command
        self.job_manager = Mpirun(ior_cmd, mpitype="mpich")
        self.job_manager.job.dfs_cont.update(container_info[key])
        env = ior_cmd.get_default_env(str(self.job_manager))
        self.job_manager.assign_hosts(self.hostlist_clients, self.workdir,
                                      None)
        self.job_manager.assign_processes(processes)
        self.job_manager.assign_environment(env, True)

        # run IOR Command
        try:
            self.job_manager.run()
        except CommandFailure as _error:
            results.put("FAIL")

    def run_online_drain_test(self, num_pool):
        """Run the Online drain without data.
            Args:
             int : total pools to create for testing purposes.
        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        # Create a pool
        pool = {}
        pool_uuid = []
        target_list = []
        drain_servers = len(self.hostlist_servers) - 1

        # Drain two consecutive targets chosen at random (target idx: 0-7)
        n = random.randint(0, 6)
        target_list.append(n)
        target_list.append(n + 1)
        t_string = "{},{}".format(target_list[0], target_list[1])

        # Pick a random rank (server) to drain
        rank = random.randint(1, drain_servers)

        for val in range(0, num_pool):
            pool[val] = TestPool(self.context, self.get_dmg_command())
            pool[val].get_params(self)
            # Split total SCM and NVME size for creating multiple pools.
            pool[val].scm_size.value = int(pool[val].scm_size.value / num_pool)
            pool[val].nvme_size.value = int(pool[val].nvme_size.value /
                                            num_pool)
            pool[val].create()
            pool_uuid.append(pool[val].uuid)

        # Drain the rank and targets from each pool while IOR runs in threads
        for val in range(0, num_pool):
            # Track every thread launched for this pool so that all of them
            # can be joined after the drain completes
            threads = []
            for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                    self.ior_apis,
                                                    self.ior_test_sequence,
                                                    self.ior_flags):
                for thrd in range(0, num_jobs):
                    # Add a thread for these IOR arguments
                    threads.append(
                        threading.Thread(target=self.ior_thread,
                                         kwargs={
                                             "pool": pool[val],
                                             "oclass": oclass,
                                             "api": api,
                                             "test": test,
                                             "flags": flags,
                                             "results": self.out_queue
                                         }))
                # Launch the IOR threads
                for thrd in threads:
                    self.log.info("Thread : %s", thrd)
                    thrd.start()
                    time.sleep(5)
            self.pool = pool[val]
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)
            output = self.dmg_command.pool_drain(self.pool.uuid, rank,
                                                 t_string)
            self.log.info(output)

            # Poll the pool version (up to ~200 seconds) until it reflects
            # the drain operation
            fail_count = 0
            while fail_count <= 20:
                pver_drain = self.get_pool_version()
                if pver_drain > pver_begin + 1:
                    break
                time.sleep(10)
                fail_count += 1

            self.log.info("Pool Version after drain %s", pver_drain)
            # Check that the pool version incremented after the drain
            self.assertTrue(pver_drain > pver_begin,
                            "Pool Version Error: After drain")
            # Wait for all the IOR threads to finish
            for thrd in threads:
                thrd.join()

        for val in range(0, num_pool):
            display_string = "Pool{} space at the End".format(val)
            self.pool = pool[val]
            self.pool.display_pool_daos_space(display_string)
            pool[val].destroy()

    @skipForTicket("DAOS-6061")
    def test_osa_online_drain(self):
        """Test ID: DAOS-4750
        Test Description: Validate Online drain

        :avocado: tags=all,pr,hw,large,osa,osa_drain,online_drain,DAOS_5610
        """
        # Perform drain testing with 1 to 2 pools
        for pool_num in range(1, 3):
            self.run_online_drain_test(pool_num)
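
The drain test's completion check boils down to polling the pool map version until it advances past its pre-drain value. A minimal sketch of that polling idiom, with a hypothetical query callable and timeout values (not the DAOS API itself):

import time

def wait_for_version_bump(get_version, baseline, retries=20, delay=10):
    """Poll get_version() until it exceeds baseline or retries run out.

    get_version is a hypothetical zero-argument callable returning an
    int (for example a wrapper around dmg pool query); baseline is the
    pool version captured before the drain was issued.
    """
    for _ in range(retries):
        current = get_version()
        if current > baseline:
            return current
        time.sleep(delay)
    raise TimeoutError("pool version never advanced past %d" % baseline)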
Example #11
class RbldContainerCreate(TestWithServers):
    """Rebuild with container creation test cases.

    Test Class Description:
        These rebuild tests verify the ability to create additional containers
        while rebuild is ongoing.

    :avocado: recursive
    """
    def add_containers_during_rebuild(self, loop_id, qty, pool1, pool2):
        """Add containers to a pool while rebuild is still in progress.

        Args:
            loop_id (str): loop identification string
            qty (int): the number of containers to create
            pool1 (TestPool): pool used to determine if rebuild is complete
            pool2 (TestPool): pool used to add containers

        """
        count = 0
        while not pool1.rebuild_complete() and count < qty:
            # Create a new container
            count += 1
            self.log.info(
                "%s: Creating container %s/%s in pool %s during rebuild",
                loop_id, count, qty, pool2.uuid)
            self.container.append(TestContainer(pool2))
            self.container[-1].get_params(self)
            self.container[-1].create()
            self.container[-1].write_objects()

        if count < qty:
            self.fail("{}: Rebuild completed with only {}/{} containers "
                      "created".format(loop_id, count, qty))

    def run_ior(self, loop_id, mpirun):
        """Run the ior command defined by the specified ior command object.

        Args:
            loop_id (str): loop identification string
            mpirun (Mpirun): mpirun command object to run ior
        """
        total_bytes = mpirun.job.get_aggregate_total(mpirun.processes.value)
        try:
            mpirun.run()
        except CommandFailure as error:
            self.fail(
                "{}: Error populating the container with {} bytes of data "
                "prior to target exclusion: {}".format(loop_id, total_bytes,
                                                       error))
        self.log.info("%s: %s %s bytes to the container", loop_id,
                      "Wrote" if "-w" in mpirun.job.flags.value else "Read",
                      total_bytes)

    def access_container(self, loop_id, index, message):
        """Open and close the specified container.

        Args:
            loop_id (str): loop identification string
            index (int): index of the daos container object to open/close
            message (str): additional text describing the container

        Returns:
            bool: was the opening and closing of the container successful

        """
        status = True
        self.log.info("%s: Verifying the container %s created during rebuild",
                      loop_id, message)
        try:
            self.container[index].read_objects()
            self.container[index].close()

        except TestFail as error:
            self.log.error("%s: Container read failed:",
                           loop_id,
                           exc_info=error)
            status = False

        return status

    def test_rebuild_container_create(self):
        """Jira ID: DAOS-1168.

        Test Description:
            Configure 4 servers and 1 client with 1 or 2 pools and a pool
            service leader quantity of 2.  Add 1 container to the first pool
            configured with 3 replicas.  Populate the container with 1GB of
            objects.  Exclude a server that has shards of this object and
            verify that rebuild is initiated.  While rebuild is active, create
            1000 additional containers in the same pool or the second pool
            (when available).  Finally verify that rebuild completes and the
            pool info indicates the correct number of rebuilt objects and
            records.  Also confirm that all 1000 additional containers created
            during rebuild are accessible.

        Use Cases:
            Basic rebuild of container objects of array values with sufficient
            numbers of rebuild targets and no available rebuild targets.

        :avocado: tags=all,full_regression
        :avocado: tags=medium
        :avocado: tags=rebuild,rebuild_cont_create
        """
        # Get test params
        targets = self.params.get("targets", "/run/server_config/*")
        pool_qty = self.params.get("pools", "/run/test/*")
        loop_qty = self.params.get("loops", "/run/test/*")
        cont_qty = self.params.get("containers", "/run/test/*")
        cont_obj_cls = self.params.get("container_obj_class", "/run/test/*")
        rank = self.params.get("rank", "/run/test/*")
        use_ior = self.params.get("use_ior", "/run/test/*", False)
        node_qty = len(self.hostlist_servers)

        # Get pool params
        self.pool = []
        for index in range(pool_qty):
            self.pool.append(self.get_pool(create=False))

        if use_ior:
            # Get ior params
            self.job_manager = Mpirun(IorCommand())
            self.job_manager.job.get_params(self)
            self.job_manager.assign_hosts(self.hostlist_clients, self.workdir,
                                          self.hostfile_clients_slots)
            self.job_manager.assign_processes(len(self.hostlist_clients))
            self.job_manager.assign_environment(
                self.job_manager.job.get_default_env("mpirun"))

        errors = [0 for _ in range(loop_qty)]
        for loop in range(loop_qty):
            # Log the start of the loop
            loop_id = "LOOP {}/{}".format(loop + 1, loop_qty)
            self.log.info("%s", "-" * 80)
            self.log.info("%s: Starting loop", loop_id)

            # Start this loop with a fresh list of containers
            self.container = []

            # Create the requested number of pools
            info_checks = []
            rebuild_checks = []
            for pool in self.pool:
                pool.create()
                info_checks.append({
                    "pi_uuid": pool.uuid,
                    "pi_ntargets": node_qty * targets,
                    "pi_nnodes": node_qty,
                    "pi_ndisabled": 0,
                })
                rebuild_checks.append({
                    "rs_errno": 0,
                    "rs_done": 1,
                    "rs_obj_nr": 0,
                    "rs_rec_nr": 0,
                })

            # Check the pool info
            status = True
            for index, pool in enumerate(self.pool):
                status &= pool.check_pool_info(**info_checks[index])
                status &= pool.check_rebuild_status(**rebuild_checks[index])
                pool.display_pool_daos_space("after creation")
            self.assertTrue(
                status,
                "Error verifying pool info prior to excluding rank {}".format(
                    rank))

            # Create a container with 1GB of data in the first pool
            if use_ior:
                self.job_manager.job.flags.update("-v -w -W -G 1 -k",
                                                  "ior.flags")
                self.job_manager.job.dfs_destroy.update(
                    False, "ior.dfs_destroy")
                self.job_manager.job.set_daos_params(self.server_group,
                                                     self.pool[0])
                self.log.info(
                    "%s: Running IOR on pool %s to fill container %s with data",
                    loop_id, self.pool[0].uuid,
                    self.job_manager.job.dfs_cont.value)
                self.run_ior(loop_id, self.job_manager)
            else:
                self.container.append(TestContainer(self.pool[0]))
                self.container[-1].get_params(self)
                self.container[-1].create()
                self.log.info(
                    "%s: Writing to pool %s to fill container %s with data",
                    loop_id, self.pool[0].uuid, self.container[-1].uuid)
                self.container[-1].object_qty.value = 8
                self.container[-1].record_qty.value = 64
                self.container[-1].data_size.value = 1024 * 1024
                self.container[-1].write_objects(rank, cont_obj_cls)
                rank_list = self.container[-1].get_target_rank_lists(
                    " after writing data")
                self.container[-1].get_target_rank_count(rank, rank_list)

            # Display the updated pool space usage
            for pool in self.pool:
                pool.display_pool_daos_space("after container creation")

            # Stop the selected rank to initiate rebuild in the first pool
            self.server_managers[0].stop_ranks([rank], self.d_log)

            # Wait for rebuild to start
            self.pool[0].wait_for_rebuild(True, 1)

            # Create additional containers in the last pool
            start_index = len(self.container)
            self.add_containers_during_rebuild(loop_id, cont_qty, self.pool[0],
                                               self.pool[-1])

            # Confirm rebuild completes
            self.pool[0].wait_for_rebuild(False, 1)

            # Check the pool info
            info_checks[0]["pi_ndisabled"] += targets
            rebuild_checks[0]["rs_done"] = 1
            rebuild_checks[0]["rs_obj_nr"] = ">=0"
            rebuild_checks[0]["rs_rec_nr"] = ">=0"
            for index, pool in enumerate(self.pool):
                status &= pool.check_pool_info(**info_checks[index])
                status &= pool.check_rebuild_status(**rebuild_checks[index])
            self.assertTrue(status, "Error verifying pool info after rebuild")

            # Verify that each of the created containers exists by opening it
            for index in range(start_index, len(self.container)):
                count = "{}/{}".format(index - start_index + 1,
                                       len(self.container) - start_index)
                if not self.access_container(loop_id, index, count):
                    errors[loop] += 1

            # Destroy the containers created during rebuild
            for index in range(start_index, len(self.container)):
                self.container[index].destroy()

            # Read the data from the container created before rebuild
            if use_ior:
                self.log.info(
                    "%s: Running IOR on pool %s to verify container %s",
                    loop_id, self.pool[0].uuid,
                    self.job_manager.job.dfs_cont.value)
                self.job_manager.job.flags.update("-v -r -R -G 1 -E",
                                                  "ior.flags")
                self.job_manager.job.dfs_destroy.update(
                    True, "ior.dfs_destroy")
                self.run_ior(loop_id, self.job_manager)
            else:
                self.log.info("%s: Reading pool %s to verify container %s",
                              loop_id, self.pool[0].uuid,
                              self.container[0].uuid)
                self.assertTrue(self.container[0].read_objects(),
                                "Error verifying data written before rebuild")
                self.container[0].destroy()

            # Destroy the pools
            for pool in self.pool:
                pool.destroy(1)

            self.log.info("%s: Loop %s", loop_id,
                          "passed" if errors[loop] == 0 else "failed")

        self.log.info("Test %s", "passed" if sum(errors) == 0 else "failed")
Example #12
class MacsioTestBase(TestWithServers):
    """Base MACSio test class.

    :avocado: recursive
    """
    def __init__(self, *args, **kwargs):
        """Initialize a MacsioTestBase object."""
        super(MacsioTestBase, self).__init__(*args, **kwargs)
        self.manager = None
        self.macsio = None

    def setUp(self):
        """Set up each test case."""
        super(MacsioTestBase, self).setUp()
        self.manager = Mpirun(None, subprocess=False, mpitype="mpich")
        self.macsio = self.get_macsio_command()

    def get_macsio_command(self):
        """Get the MacsioCommand object.

        Returns:
            MacsioCommand: object defining the macsio command

        """
        # Create the macsio command
        test_repo = self.params.get("macsio", "/run/test_repo/*", "")
        macsio = MacsioCommand(test_repo)
        macsio.get_params(self)

        # Create all the macsio output files in the same directory as the other
        # test log files
        macsio.set_output_file_path()

        return macsio

    def run_macsio(self, pool_uuid, pool_svcl, cont_uuid=None):
        """Run the macsio.

        Parameters for the macsio command are obtained from the test yaml file,
        including the path to the macsio executable.

        By default mpirun will be used to run macsio.  This can be overridden
        by redefining the self.manager attribute prior to calling this method.

        Args:
            pool_uuid (str): pool uuid
            pool_svcl (str): pool service replica
            cont_uuid (str, optional): container uuid. Defaults to None.

        Returns:
            CmdResult: Object that contains exit status, stdout, and other
                information.

        """
        # Setup the job manager (mpirun) to run the macsio command
        self.macsio.daos_pool = pool_uuid
        self.macsio.daos_svcl = pool_svcl
        self.macsio.daos_cont = cont_uuid
        self.manager.job = self.macsio
        self.manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        self.manager.assign_processes(len(self.hostlist_clients))
        self.manager.assign_environment(
            self.macsio.get_environment(self.server_managers[0],
                                        self.client_log))
        try:
            return self.manager.run()
        except CommandFailure as error:
            self.log.error("MACSio failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.")
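
A subclass would typically create a pool and container in its test method and hand their identifiers to run_macsio. A hypothetical minimal test built on this base class (add_pool/add_container are assumed framework helpers, not verbatim from the repo):

class MacsioSmokeTest(MacsioTestBase):
    """Hypothetical smoke test built on MacsioTestBase."""

    def test_macsio_smoke(self):
        """Run macsio once against a fresh pool and container.

        :avocado: tags=all,macsio
        """
        # add_pool()/add_container() are assumed helpers that create and
        # register a TestPool/TestContainer on self
        self.add_pool(connect=False)
        self.add_container(self.pool)
        result = self.run_macsio(
            self.pool.uuid,
            ",".join(str(rank) for rank in self.pool.svc_ranks),
            self.container.uuid)
        self.assertEqual(
            result.exit_status, 0, "macsio returned a non-zero exit status")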