Example #1
    def get_ior_job_manager_command(self, custom_ior_cmd=None):
        """Get the MPI job manager command for IOR.

        Args:
            custom_ior_cmd (IorCommand): custom IorCommand instance with
                which to create the job manager.

        Returns:
            Mpirun: the MPI job manager object for the IOR command

        """
        # Initialize MpioUtils if IOR is running in MPIIO, POSIX, DFS, or
        # HDF5 mode
        if self.ior_cmd.api.value in ["MPIIO", "POSIX", "DFS", "HDF5"]:
            mpio_util = MpioUtils()
            if not mpio_util.mpich_installed(self.hostlist_clients):
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        if custom_ior_cmd:
            self.job_manager = Mpirun(custom_ior_cmd, self.subprocess, "mpich")
        else:
            self.job_manager = Mpirun(self.ior_cmd, self.subprocess, "mpich")

        return self.job_manager
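
A minimal usage sketch for the returned job manager (a hypothetical call site; it assumes self.processes and self.hostlist_clients are defined by the surrounding test class, as in the IorTestBase example later in this listing):

    manager = self.get_ior_job_manager_command()
    env = self.ior_cmd.get_default_env(str(manager))
    manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    manager.assign_processes(self.processes)
    manager.assign_environment(env, True)
    manager.run()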
Example #2
    def setUp(self):
        """Set up each test case."""
        super(MacsioTestBase, self).setUp()

        # Support using different MPI job managers to launch macsio
        mpi_type = self.params.get("mpi_type", default="mpich")
        self.manager = Mpirun(None, subprocess=False, mpitype=mpi_type)
        self.macsio = self.get_macsio_command()
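
The manager here is created without a job; the MACSio command is attached later, as Example #17's run_macsio shows:

    self.manager.job = self.macsio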
Example #3
    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Start threads and wait until all threads are finished.
        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR api
            test (list): IOR test sequence
            flags (str): IOR flags
            results (queue): queue for returning thread results

        Returns:
            None
        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        mpio_util = MpioUtils()
        if not mpio_util.mpich_installed(self.hostlist_clients):
            self.fail("Exiting Test: Mpich not installed on: {}".format(
                self.hostlist_clients))
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.daos_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        container_info["{}{}{}"
                       .format(oclass,
                               api,
                               test[2])] = str(uuid.uuid4())

        # Define the job manager for the IOR command
        manager = Mpirun(ior_cmd, mpitype="mpich")
        manager.job.daos_cont.update(container_info[key])
        env = ior_cmd.get_default_env(str(manager))
        manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        manager.assign_processes(processes)
        manager.assign_environment(env, True)

        # run IOR Command
        try:
            manager.run()
        except CommandFailure as _error:
            results.put("FAIL")
Example #4
    def run(self, tmp, processes):
        # pylint: disable=arguments-differ
        """Run the dcp command.

        Args:
            tmp (str): path for hostfiles
            processes (int): number of processes for the dcp command

        Returns:
            CmdResult: Object that contains exit status, stdout, and other
                information.

        Raises:
            CommandFailure: In case dcp run command fails

        """
        self.log.info('Starting dcp')

        # Handle compatibility
        if not self.has_src_pool:
            src_pool = self.daos_src_pool.value
            src_cont = self.daos_src_cont.value
            src_path = self.src_path.value
            dst_pool = self.daos_dst_pool.value
            dst_cont = self.daos_dst_cont.value
            dst_path = self.dst_path.value
            if src_pool or src_cont:
                self.log.info(
                    "Converting --daos-src-pool to daos://pool/cont/path")
                src_path = "daos://{}/{}/{}".format(
                    src_pool, src_cont, src_path)
                self.src_path.update(src_path)
                self.daos_src_pool.update(None)
                self.daos_src_cont.update(None)
            if dst_pool or dst_cont:
                self.log.info(
                    "Converting --daos-dst-pool to daos://pool/cont/path")
                dst_path = "daos://{}/{}/{}".format(
                    dst_pool, dst_cont, dst_path)
                self.dst_path.update(dst_path)
                self.daos_dst_pool.update(None)
                self.daos_dst_cont.update(None)
        if self.has_bufsize:
            blocksize = self.blocksize.value
            if blocksize:
                self.log.info(
                    "Converting --blocksize to --bufsize")
                self.blocksize.update(None)
                self.bufsize.update(blocksize)

        # Get job manager cmd
        mpirun = Mpirun(self, mpitype="mpich")
        mpirun.assign_hosts(self.hosts, tmp)
        mpirun.assign_processes(processes)
        mpirun.exit_status_exception = self.exit_status_exception

        # run dcp
        out = mpirun.run()

        return out
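
The compatibility shim above folds the legacy --daos-src-pool/--daos-dst-pool flags into single daos:// URLs. A standalone sketch of the same conversion (the UUID values are placeholders):

    def to_daos_url(pool, cont, path):
        """Combine pool, container, and path into a daos:// URL."""
        return "daos://{}/{}/{}".format(pool, cont, path)

    # to_daos_url("<pool_uuid>", "<cont_uuid>", "/subdir")
    # -> "daos://<pool_uuid>/<cont_uuid>//subdir"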
Example #5
    def run(self, processes, job_manager):
        # pylint: disable=arguments-differ
        """Run the MpiFileUtils command.

        Args:
            processes (int): number of processes for the command.
            job_manager: Job manager variable to set/assign

        Returns:
            CmdResult: Object that contains exit status, stdout, and other
                information.

        Raises:
            CommandFailure: In case run command fails.

        """
        self.log.info('Starting %s', str(self.command).lower())

        # Get job manager cmd
        job_manager = Mpirun(self, mpi_type="mpich")
        job_manager.assign_hosts(self.hosts, self.tmp)
        job_manager.assign_processes(processes)
        job_manager.exit_status_exception = self.exit_status_exception

        # Run the command
        out = job_manager.run()

        return out
Example #6
    def run(self, tmp, processes):
        # pylint: disable=arguments-differ
        """Run the dsync command.

        Args:
            tmp (str): path for hostfiles
            processes (int): number of processes for the dsync command

        Returns:
            CmdResult: Object that contains exit status, stdout, and other
                information.

        Raises:
            CommandFailure: In case dsync run command fails

        """
        self.log.info('Starting dsync')

        # Get job manager cmd
        mpirun = Mpirun(self, mpitype="mpich")
        mpirun.assign_hosts(self.hosts, tmp)
        mpirun.assign_processes(processes)
        mpirun.exit_status_exception = self.exit_status_exception

        # run dsync
        out = mpirun.run()

        return out
Example #7
    def ior_bg_thread(self, results):
        """Start IOR Background thread, This will write small data set and
        keep reading it in loop until it fails or main program exit.

        Args:
            results (queue): queue for returning thread results
        """
        mpio_util = MpioUtils()
        if not mpio_util.mpich_installed(self.hostlist_clients):
            self.fail("Exiting Test: Mpich not installed")

        # Define the IOR Command and use the parameter from yaml file.
        ior_bg_cmd = IorCommand()
        ior_bg_cmd.get_params(self)
        ior_bg_cmd.set_daos_params(self.server_group, self.pool)
        ior_bg_cmd.dfs_oclass.update(self.ior_cmd.dfs_oclass.value)
        ior_bg_cmd.api.update(self.ior_cmd.api.value)
        ior_bg_cmd.transfer_size.update(self.ior_scm_xfersize)
        ior_bg_cmd.block_size.update(self.ior_cmd.block_size.value)
        ior_bg_cmd.flags.update(self.ior_cmd.flags.value)
        ior_bg_cmd.test_file.update('/testfile_background')

        # Define the job manager for the IOR command
        self.job_manager = Mpirun(ior_bg_cmd, mpitype="mpich")
        self.create_cont()
        self.job_manager.job.dfs_cont.update(self.container.uuid)
        env = ior_bg_cmd.get_default_env(str(self.job_manager))
        self.job_manager.assign_hosts(self.hostlist_clients, self.workdir,
                                      None)
        self.job_manager.assign_processes(1)
        self.job_manager.assign_environment(env, True)
        self.log.info('----Run IOR in Background-------')
        # run IOR Write Command
        try:
            self.job_manager.run()
        except (CommandFailure, TestFail) as _error:
            results.put("FAIL")
            return

        # run IOR Read Command in loop
        ior_bg_cmd.flags.update(self.ior_read_flags)
        while True:
            try:
                self.job_manager.run()
            except (CommandFailure, TestFail) as _error:
                results.put("FAIL")
                break
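
Since this loop runs until a failure or program exit, callers would start it as a daemon thread so it does not block test teardown (a sketch using the same queue pattern as the rest of this listing):

    import queue
    import threading

    results = queue.Queue()
    bg_thread = threading.Thread(target=self.ior_bg_thread,
                                 kwargs={"results": results})
    bg_thread.daemon = True
    bg_thread.start()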
Example #8
    def ior_thread(self, pool, oclass, api, test, flags, results):
        """This method calls job manager for IOR command
        invocation.
        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR API
            test (list): IOR test sequence
            flags (str): IOR flags
            results (queue): queue for returning thread results
        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        mpio_util = MpioUtils()
        if not mpio_util.mpich_installed(self.hostlist_clients):
            self.fail("Exiting Test: Mpich not installed")
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[0])
        ior_cmd.block_size.update(test[1])
        ior_cmd.flags.update(flags)
        if "-w" in flags:
            self.container_info["{}{}{}"
                                .format(oclass,
                                        api,
                                        test[0])] = str(uuid.uuid4())

        # Define the job manager for the IOR command
        manager = Mpirun(ior_cmd, mpitype="mpich")
        key = "".join([oclass, api, str(test[0])])
        manager.job.dfs_cont.update(self.container_info[key])
        env = ior_cmd.get_default_env(str(manager))
        manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        manager.assign_processes(processes)
        manager.assign_environment(env, True)

        # run IOR Command
        try:
            manager.run()
        except CommandFailure as _error:
            results.put("FAIL")
Example #9
    def get_ior_job_manager_command(self):
        """Get the MPI job manager command for IOR.

        Returns:
            Mpirun: the MPI job manager object for the IOR command

        """
        # Initialize MpioUtils if IOR is running in MPIIO, POSIX, or DFS mode
        if self.ior_cmd.api.value in ["MPIIO", "POSIX", "DFS"]:
            mpio_util = MpioUtils()
            if not mpio_util.mpich_installed(self.hostlist_clients):
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        if self.subprocess:
            self.mpirun = Mpirun(self.ior_cmd, True, mpitype="mpich")
        else:
            self.mpirun = Mpirun(self.ior_cmd, mpitype="mpich")

        return self.mpirun
Example #10
    def run_test(self):
        """Run the HDF5 VOL testsuites.

        Raises:
            VolFailed: for an invalid test name or test execution failure

        """
        # initialize test specific variables
        mpi_type = self.params.get("mpi_type", default="mpich")
        test_repo = self.params.get("daos_vol_repo")
        plugin_path = self.params.get("plugin_path")
        # test_list = self.params.get("daos_vol_tests", default=[])
        testname = self.params.get("testname")
        client_processes = self.params.get("client_processes")

        # create pool, container and dfuse mount
        self.add_pool(connect=False)
        self.add_container(self.pool)

        # VOL needs to run from a file system that supports xattr.
        #  Currently NFS does not support xattr, so it is recommended to
        #  create a dfuse dir and run the VOL tests from there.
        # create dfuse container
        self.start_dfuse(self.hostlist_clients, self.pool, self.container)

        # for test_param in test_list:
        # testname = test_param[0][1]
        # client_processes = test_param[1][1]
        exe = os.path.join(test_repo, testname)
        if mpi_type == "openmpi":
            manager = Orterun(exe, subprocess=False)
        else:
            manager = Mpirun(exe, subprocess=False, mpitype="mpich")

        env = EnvironmentVariables()
        env["DAOS_POOL"] = "{}".format(self.pool.uuid)
        env["DAOS_SVCL"] = "{}".format(self.pool.svc_ranks[0])
        env["DAOS_CONT"] = "{}".format(self.container.uuid)
        env["HDF5_VOL_CONNECTOR"] = "daos"
        env["HDF5_PLUGIN_PATH"] = "{}".format(plugin_path)
        manager.assign_hosts(self.hostlist_clients)
        manager.assign_processes(client_processes)
        manager.assign_environment(env, True)
        manager.working_dir.value = self.dfuse.mount_dir.value

        # run VOL Command
        try:
            manager.run()
        except CommandFailure as _error:
            self.fail("{} FAILED> \nException occurred: {}".format(
                exe, str(_error)))
Example #11
    def get_mdtest_job_manager_command(self, manager):
        """Get the MPI job manager command for Mdtest.

        Returns:
            JobManager: the object for the mpi job manager command

        """
        # Initialize MpioUtils if mdtest needs to be run using mpich
        if manager == "MPICH":
            mpio_util = MpioUtils()
            if not mpio_util.mpich_installed(self.hostlist_clients):
                self.fail("Exiting Test: Mpich not installed")
            return Mpirun(self.mdtest_cmd, mpitype="mpich")

        return Orterun(self.mdtest_cmd)
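
A hedged usage sketch (assumes self.hostlist_clients, self.workdir, and a process count obtained from the test yaml):

    manager = self.get_mdtest_job_manager_command("MPICH")
    manager.assign_hosts(self.hostlist_clients, self.workdir, None)
    manager.assign_processes(processes)
    manager.run()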
Example #12
    def test_load_mpi(self):
        """Simple test of apricot test code to load the openmpi module.

        :avocado: tags=all
        :avocado: tags=harness,harness_basic_test,test_load_mpi
        :avocado: tags=load_mpi
        """
        try:
            Orterun(None)
        except CommandFailure as error:
            self.fail("Orterun initialization failed: {}".format(error))

        try:
            Mpirun(None, mpi_type="mpich")
        except CommandFailure as error:
            self.fail("Mpirun initialization failed: {}".format(error))
Example #13
    def run(self, tmp, processes):
        # pylint: disable=arguments-differ
        """Run the datamover command.

        Args:
            tmp (str): path for hostfiles
            processes (int): number of processes for the datamover command

        Returns:
            CmdResult: Object that contains exit status, stdout, and other
                information.

        Raises:
            CommandFailure: In case datamover run command fails

        """
        self.log.info('Starting datamover')

        # Get job manager cmd
        mpirun = Mpirun(self, mpitype="mpich")
        mpirun.assign_hosts(self.hosts, tmp)
        mpirun.assign_processes(processes)
        mpirun.exit_status_exception = self.exit_status_exception

        # run datamover
        out = mpirun.run()

        return out
Example #14
    def run(self, processes=1):
        # pylint: disable=arguments-differ
        """Run the dbench command.

        Args:
            processes (int): number of MPI processes

        Returns:
            CmdResult: Object that contains exit status, stdout, and other
                information.

        Raises:
            CommandFailure: In case dbench run command fails

        """
        self.log.info('Starting dbench')

        # Get job manager cmd
        mpirun = Mpirun(self, mpitype="mpich")
        mpirun.assign_hosts(self.hosts, self.tmp)
        mpirun.assign_processes(processes)
        mpirun.exit_status_exception = True

        # run dbench
        out = mpirun.run()

        return out
Example #15
class NvmeFragmentation(TestWithServers):
    # pylint: disable=too-many-ancestors
    # pylint: disable=too-many-instance-attributes
    """NVMe drive fragmentation test cases.

    Test class Description:
        Verify that drive fragmentation frees the space and does not lead
        to ENOSPC.

    :avocado: recursive
    """
    def setUp(self):
        """Set up for test case."""
        super().setUp()

        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_transfer_size = self.params.get("transfer_block_size",
                                                 '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get("obj_class",
                                              '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(self.hostlist_clients,
                                                self.workdir, None)
        self.pool = None
        self.out_queue = queue.Queue()

    def ior_runner_thread(self, results):
        """Start threads and wait until all threads are finished.

        Destroy the container at the end of this thread run.

        Args:
            results (queue): queue for returning thread results

        Returns:
            None

        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        cmd = DaosCommand(os.path.join(self.prefix, "bin"))
        cmd.set_sub_command("container")
        cmd.sub_command_class.set_sub_command("destroy")
        mpio_util = MpioUtils()
        if not mpio_util.mpich_installed(self.hostlist_clients):
            self.fail("Exiting Test: Mpich not installed")

        # Iterate through IOR different value and run in sequence
        for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                self.ior_apis,
                                                self.ior_transfer_size,
                                                self.ior_flags):
            # Define the arguments for the ior_runner_thread method
            ior_cmd = IorCommand()
            ior_cmd.get_params(self)
            ior_cmd.set_daos_params(self.server_group, self.pool)
            ior_cmd.dfs_oclass.update(oclass)
            ior_cmd.api.update(api)
            ior_cmd.transfer_size.update(test[0])
            ior_cmd.block_size.update(test[1])
            ior_cmd.flags.update(flags)

            # Define the job manager for the IOR command
            self.job_manager = Mpirun(ior_cmd, mpitype="mpich")
            cont_uuid = str(uuid.uuid4())
            self.job_manager.job.dfs_cont.update(cont_uuid)
            env = ior_cmd.get_default_env(str(self.job_manager))
            self.job_manager.assign_hosts(self.hostlist_clients, self.workdir,
                                          None)
            self.job_manager.assign_processes(processes)
            self.job_manager.assign_environment(env, True)

            # run IOR Command
            try:
                self.job_manager.run()
                container_info["{}{}{}".format(oclass, api,
                                               test[0])] = cont_uuid
            except CommandFailure as _error:
                results.put("FAIL")

        # Destroy the container created by thread
        for key in container_info:
            cmd.sub_command_class.sub_command_class.pool.value = self.pool.uuid
            #cmd.sub_command_class.sub_command_class.svc.value = \
            #    self.pool.svc_ranks
            cmd.sub_command_class.sub_command_class.cont.value = \
                container_info[key]

            try:
                # pylint: disable=protected-access
                cmd._get_result()
            except CommandFailure as _error:
                results.put("FAIL")

    def test_nvme_fragmentation(self):
        """Jira ID: DAOS-2332.

        Test Description:
            Verify that there is no fragmentation after performing IO
            write/delete operations for about an hour.

        Use case:
        Create objects with different transfer sizes in parallel
        (10 IOR threads).
        Delete the containers created by IOR, which deallocates the NVMe
        blocks.
        Run the above steps in a loop for some time (~1 hour) and expect
        no ENOSPC failures.

        :avocado: tags=all,full_regression
        :avocado: tags=hw,medium
        :avocado: tags=nvme,ib2,nvme_fragmentation
        """
        no_of_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        # Create a pool
        self.add_pool(connect=False)
        self.pool.display_pool_daos_space("Pool space at the Beginning")

        # Repeat the test for 30 times which will take ~1 hour
        for test_loop in range(30):
            self.log.info("--Test Repeat for loop %s---", test_loop)
            # Create the IOR threads
            threads = []
            for thrd in range(no_of_jobs):
                # Add a thread for these IOR arguments
                threads.append(
                    threading.Thread(target=self.ior_runner_thread,
                                     kwargs={"results": self.out_queue}))
            # Launch the IOR threads
            for thrd in threads:
                thrd.start()
                time.sleep(5)
            # Wait to finish the threads
            for thrd in threads:
                thrd.join()

            # Verify the queue and make sure no FAIL for any IOR run
            while not self.out_queue.empty():
                if self.out_queue.get() == "FAIL":
                    self.fail("FAIL")

        self.pool.display_pool_daos_space("Pool space at the End")
Example #16
class IorTestBase(TestWithServers):
    """Base IOR test class.

    :avocado: recursive
    """

    IOR_WRITE_PATTERN = "Commencing write performance test"
    IOR_READ_PATTERN = "Commencing read performance test"

    def __init__(self, *args, **kwargs):
        """Initialize a IorTestBase object."""
        super(IorTestBase, self).__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.dfuse = None
        self.container = None
        self.lock = None
        self.mpirun = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super(IorTestBase, self).setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')
        self.subprocess = self.params.get("subprocess", '/run/ior/*', False)

        # lock is needed for run_multiple_ior method.
        self.lock = threading.Lock()

    def tearDown(self):
        """Tear down each test case."""
        try:
            if self.dfuse:
                self.dfuse.stop()
        finally:
            # Stop the servers and agents
            super(IorTestBase, self).tearDown()

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params
        self.pool = TestPool(
            self.context, dmg_command=self.get_dmg_command())
        self.pool.get_params(self)

        # Create a pool
        self.pool.create()

    def create_cont(self):
        """Create a TestContainer object to be used to create container."""
        # Get container params
        self.container = TestContainer(
            self.pool, daos_command=DaosCommand(self.bin))
        self.container.get_params(self)

        # create container
        self.container.create()

    def _start_dfuse(self):
        """Create a DfuseCommand object to start dfuse."""
        # Get Dfuse params
        self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
        self.dfuse.get_params(self)

        # update dfuse params
        self.dfuse.set_dfuse_params(self.pool)
        self.dfuse.set_dfuse_cont_param(self.container)
        self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)

        try:
            # start dfuse
            self.dfuse.run()
        except CommandFailure as error:
            self.log.error("Dfuse command %s failed on hosts %s",
                           str(self.dfuse),
                           str(NodeSet.fromlist(self.dfuse.hosts)),
                           exc_info=error)
            self.fail("Test was expected to pass but it failed.\n")

    def run_ior_with_pool(self, intercept=None, test_file_suffix="",
                          test_file="daos:testFile", create_pool=True,
                          create_cont=True, stop_dfuse=True):
        """Execute ior with optional overrides for ior flags and object_class.

        If specified, the ior flags and ior daos object class parameters
        will override the values read from the yaml file.

        Args:
            intercept (str, optional): path to the interception library. Shall
                    be used only for POSIX through DFUSE. Defaults to None.
            test_file_suffix (str, optional): suffix to add to the end of the
                test file name. Defaults to "".
            test_file (str, optional): ior test file name. Defaults to
                "daos:testFile". Is ignored when using POSIX through DFUSE.
            create_pool (bool, optional): If it is true, create pool and
                container else just run the ior. Defaults to True.
            create_cont (bool, optional): Create new container. Default is True
            stop_dfuse (bool, optional): Stop dfuse after ior command is
                finished. Default is True.

        Returns:
            CmdResult: result of the ior command execution

        """
        if create_pool:
            self.update_ior_cmd_with_pool(create_cont)

        # start dfuse if api is POSIX
        if self.ior_cmd.api.value == "POSIX":
            # Connect to the pool, create container and then start dfuse
            if not self.dfuse:
                self._start_dfuse()
            test_file = os.path.join(self.dfuse.mount_dir.value, "testfile")
        elif self.ior_cmd.api.value == "DFS":
            test_file = os.path.join("/", "testfile")

        self.ior_cmd.test_file.update("".join([test_file, test_file_suffix]))

        out = self.run_ior(self.get_ior_job_manager_command(), self.processes,
                           intercept)

        if stop_dfuse and self.dfuse:
            self.dfuse.stop()
            self.dfuse = None
        return out

    def update_ior_cmd_with_pool(self, create_cont=True):
        """Update ior_cmd with pool."""
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Create a container, if needed.
        # Don't pass uuid and pool handle to IOR.
        # It will not enable checksum feature
        if create_cont:
            self.pool.connect()
            self.create_cont()
        # Update IOR params with the pool and container params
        self.ior_cmd.set_daos_params(self.server_group, self.pool,
                                     self.container.uuid)

    def get_ior_job_manager_command(self):
        """Get the MPI job manager command for IOR.

        Returns:
            Mpirun: the MPI job manager object for the IOR command

        """
        # Initialize MpioUtils if IOR is running in MPIIO, POSIX, or DFS mode
        if self.ior_cmd.api.value in ["MPIIO", "POSIX", "DFS"]:
            mpio_util = MpioUtils()
            if not mpio_util.mpich_installed(self.hostlist_clients):
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        if self.subprocess:
            self.mpirun = Mpirun(self.ior_cmd, True, mpitype="mpich")
        else:
            self.mpirun = Mpirun(self.ior_cmd, mpitype="mpich")

        return self.mpirun

    def check_subprocess_status(self, operation="write"):
        """Check subprocess status """
        if operation == "write":
            self.ior_cmd.pattern = self.IOR_WRITE_PATTERN
        elif operation == "read":
            self.ior_cmd.pattern = self.IOR_READ_PATTERN
        else:
            self.fail("Exiting Test: Inappropriate operation type \
                      for subprocess status check")

        if not self.ior_cmd.check_ior_subprocess_status(
                self.mpirun.process, self.ior_cmd):
            self.fail("Exiting Test: Subprocess not running")

    def run_ior(self, manager, processes, intercept=None, display_space=True):
        """Run the IOR command.

        Args:
            manager (Mpirun): MPI job manager for the IOR command
            processes (int): number of host processes
            intercept (str, optional): path to interception library.
                Defaults to None.
            display_space (bool, optional): whether to display the pool
                space. Defaults to True.

        Returns:
            CmdResult: result of the ior command execution
        """
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.assign_hosts(
            self.hostlist_clients, self.workdir, self.hostfile_clients_slots)
        manager.assign_processes(processes)
        manager.assign_environment(env)

        try:
            if display_space:
                self.pool.display_pool_daos_space()
            out = manager.run()

            if not self.subprocess:
                for line in out.stdout.splitlines():
                    if 'WARNING' in line:
                        self.fail("IOR command issued warnings.\n")
            return out
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            if not self.subprocess and display_space:
                self.pool.display_pool_daos_space()

    def stop_ior(self):
        """Stop IOR process.
        Args:
            manager (str): mpi job manager command
        """
        self.log.info(
            "<IOR> Stopping in-progress IOR command: %s", self.mpirun.__str__())

        try:
            out = self.mpirun.stop()
            return out
        except CommandFailure as error:
            self.log.error("IOR stop Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()


    def run_multiple_ior_with_pool(self, results, intercept=None):
        """Execute ior with optional overrides for ior flags and object_class.

        If specified, the ior flags and ior daos object class parameters
        will override the values read from the yaml file.

        Args:
            results (dict): dictionary to store the ior metrics per job
            intercept (str, optional): path to the interception library.
                Shall be used only for POSIX through DFUSE. Defaults to None.
        """
        self.update_ior_cmd_with_pool()

        # start dfuse for POSIX api. This is specific to interception
        # library test requirements.
        self._start_dfuse()

        # Create two jobs and run in parallel.
        # Job1 will have 3 client set up to use dfuse + interception
        # library
        # Job2 will have 1 client set up to use only dfuse.
        job1 = self.get_new_job(self.hostlist_clients[:-1], 1,
                                results, intercept)
        job2 = self.get_new_job([self.hostlist_clients[-1]], 2,
                                results, None)

        job1.start()
        # Since same ior_cmd is used to trigger the MPIRUN
        # with different parameters, pausing for 2 seconds to
        # avoid data collisions.
        time.sleep(2)
        job2.start()
        job1.join()
        job2.join()
        self.dfuse.stop()
        self.dfuse = None

    def get_new_job(self, clients, job_num, results, intercept=None):
        """Create a new thread for ior run.

        Args:
            clients (list): hosts on which to run ior
            job_num (int): Assigned job number
            results (dict): A dictionary object to store the ior metrics
            intercept (str, optional): path to interception library.
                Defaults to None.

        Returns:
            Thread: a thread object that will run ior when started
        """
        job = threading.Thread(target=self.run_multiple_ior, args=[
            clients, results, job_num, intercept])
        return job

    def run_multiple_ior(self, clients, results, job_num, intercept=None):
        """Run the IOR command.

        Args:
            clients (list): hosts on which to run ior
            results (dict): A dictionary object to store the ior metrics
            job_num (int): Assigned job number
            intercept (str, optional): path to interception library. Defaults to
                None.
        """
        self.lock.acquire(True)
        tsize = self.ior_cmd.transfer_size.value
        testfile = os.path.join(self.dfuse.mount_dir.value,
                                "testfile{}{}".format(tsize, job_num))
        if intercept:
            testfile += "intercept"
        self.ior_cmd.test_file.update(testfile)
        manager = self.get_ior_job_manager_command()
        procs = (self.processes // len(self.hostlist_clients)) * len(clients)
        env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.assign_hosts(clients, self.workdir, self.hostfile_clients_slots)
        manager.assign_processes(procs)
        manager.assign_environment(env)
        self.lock.release()
        try:
            self.pool.display_pool_daos_space()
            out = manager.run()
            self.lock.acquire(True)
            results[job_num] = IorCommand.get_ior_metrics(out)
            self.lock.release()
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.pool.display_pool_daos_space()

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        Args:
            original_pool_info (PoolInfo): Pool info prior to IOR
            processes (int): number of processes
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If transfer size is >= 4K, pool size will be verified against
        # NVMe; otherwise it will be checked against SCM.
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Size is >= 4K; size verification will be done with NVMe size")
            storage_index = 1
        else:
            self.log.info(
                "Size is < 4K; size verification will be done with SCM size")
            storage_index = 0
        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool Free Size did not match: actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))

    def execute_cmd(self, cmd, fail_on_err=True, display_output=True):
        """Execute cmd using general_utils.pcmd

          Args:
            cmd (str): String command to be executed
            fail_on_err (bool): Boolean for whether to fail the test if command
                                execution returns non zero return code.
            display_output (bool): Boolean for whether to display output.

          Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                  values indicating which hosts yielded the return code.
        """
        try:
            # execute bash cmds
            ret = pcmd(
                self.hostlist_clients, cmd, verbose=display_output, timeout=300)
            if 0 not in ret:
                error_hosts = NodeSet(
                    ",".join(
                        [str(node_set) for code, node_set in
                         ret.items() if code != 0]))
                if fail_on_err:
                    raise CommandFailure(
                        "Error running '{}' on the following "
                        "hosts: {}".format(cmd, error_hosts))

        # report error if any command fails
        except CommandFailure as error:
            self.log.error("execute_cmd failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        return ret
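
A minimal test built on this base class might look like the following (a sketch; the class name, tags, and yaml parameters are assumptions):

    class IorSmoke(IorTestBase):
        """Run a basic IOR pass through the base class helpers."""

        def test_ior_smoke(self):
            """Create a pool and container, then run IOR against them.

            :avocado: tags=all
            """
            self.run_ior_with_pool()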
Example #17
class MacsioTestBase(TestWithServers):
    """Base MACSio test class.

    :avocado: recursive
    """
    def __init__(self, *args, **kwargs):
        """Initialize a MacsioTestBase object."""
        super(MacsioTestBase, self).__init__(*args, **kwargs)
        self.manager = None
        self.macsio = None

    def setUp(self):
        """Set up each test case."""
        super(MacsioTestBase, self).setUp()
        self.manager = Mpirun(None, subprocess=False, mpitype="mpich")
        self.macsio = self.get_macsio_command()

    def get_macsio_command(self):
        """Get the MacsioCommand object.

        Returns:
            MacsioCommand: object defining the macsio command

        """
        # Create the macsio command
        test_repo = self.params.get("macsio", "/run/test_repo/*", "")
        macsio = MacsioCommand(test_repo)
        macsio.get_params(self)

        # Create all the macsio output files in the same directory as the other
        # test log files
        macsio.set_output_file_path()

        return macsio

    def run_macsio(self, pool_uuid, pool_svcl, cont_uuid=None):
        """Run the macsio.

        Parameters for the macsio command are obtained from the test yaml file,
        including the path to the macsio executable.

        By default mpirun will be used to run macsio.  This can be overridden
        by redefining the self.manager attribute prior to calling this method.

        Args:
            pool_uuid (str): pool uuid
            pool_svcl (str): pool service replica
            cont_uuid (str, optional): container uuid. Defaults to None.

        Returns:
            CmdResult: Object that contains exit status, stdout, and other
                information.

        """
        # Setup the job manager (mpirun) to run the macsio command
        self.macsio.daos_pool = pool_uuid
        self.macsio.daos_svcl = pool_svcl
        self.macsio.daos_cont = cont_uuid
        self.manager.job = self.macsio
        self.manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        self.manager.assign_processes(len(self.hostlist_clients))
        self.manager.assign_environment(
            self.macsio.get_environment(self.server_managers[0],
                                        self.client_log))
        try:
            return self.manager.run()

        except CommandFailure as error:
            self.log.error("MACSio Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
Example #18
class NvmePoolCapacity(TestWithServers):
    # pylint: disable=too-many-ancestors
    """Test class Description: Verify NOSPC
    condition is reported when accessing data beyond
    pool size.

    :avocado: recursive
    """
    def setUp(self):
        """Set up for test case."""
        super(NvmePoolCapacity, self).setUp()

        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get("ior_test_sequence",
                                                 '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get("obj_class",
                                              '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(self.hostlist_clients,
                                                self.workdir, None)
        self.pool = None
        self.out_queue = queue.Queue()

    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Start threads and wait until all threads are finished.

        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR API
            test (list): IOR test sequence
            flags (str): IOR flags
            results (queue): queue for returning thread results

        Returns:
            None

        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        mpio_util = MpioUtils()
        if not mpio_util.mpich_installed(self.hostlist_clients):
            self.fail("Exiting Test: Mpich not installed")
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        container_info["{}{}{}".format(oclass, api,
                                       test[2])] = str(uuid.uuid4())

        # Define the job manager for the IOR command
        self.job_manager = Mpirun(ior_cmd, mpitype="mpich")
        key = "{}{}{}".format(oclass, api, test[2])
        self.job_manager.job.dfs_cont.update(container_info[key])
        env = ior_cmd.get_default_env(str(self.job_manager))
        self.job_manager.assign_hosts(self.hostlist_clients, self.workdir,
                                      None)
        self.job_manager.assign_processes(processes)
        self.job_manager.assign_environment(env, True)

        # run IOR Command
        try:
            self.job_manager.run()
        except CommandFailure as _error:
            results.put("FAIL")

    def test_create_delete(self,
                           num_pool=2,
                           num_cont=5,
                           total_count=100,
                           scm_size=100000000000,
                           nvme_size=300000000000):
        """
        Test Description:
            This method is used to create/delete pools
            for a long run. It verifies the NVME free space
            during this process.
            Args:
                num_pool (int): Total pools for running test
                num_cont (int): Total containers created on each pool
                total_count (int): Total times the test is run in a loop
                scm_size (int): SCM size used in the testing
                nvme_size (int): NVME size used in the testing
            Returns:
                None
        """
        pool = {}
        cont = {}
        nvme_size_begin = {}

        for loop_count in range(0, total_count):
            self.log.info("Running test %s", loop_count)
            for val in range(0, num_pool):
                pool[val] = TestPool(self.context, self.get_dmg_command())
                pool[val].get_params(self)
                # Split total SCM and NVME size for creating multiple pools.
                temp = int(scm_size) // num_pool
                pool[val].scm_size.update(str(temp))
                temp = int(nvme_size) // num_pool
                pool[val].nvme_size.update(str(temp))
                pool[val].create()
                self.pool = pool[val]
                display_string = "pool{} space at the Beginning".format(val)
                self.pool.display_pool_daos_space(display_string)
                nvme_size_begin = self.pool.get_pool_free_space("NVME")
                for cont_val in range(0, num_cont):
                    cont[cont_val] = TestContainer(pool[val])
            m_leak = 0
            for val in range(0, num_pool):
                display_string = "Pool{} space at the End".format(val)
                self.pool = pool[val]
                self.pool.display_pool_daos_space(display_string)
                nvme_size_end = self.pool.get_pool_free_space("NVME")
                pool[val].destroy()
                if (nvme_size_begin[val] != nvme_size_end) and (m_leak == 0):
                    m_leak = val + 1
            # After destroying pools, check memory leak for each test loop.
            if m_leak != 0:
                self.fail("Memory leak : iteration {0} \n".format(m_leak))

    def test_run(self, num_pool=1):
        """
        Method Description:
            This method is called with different test_cases.
            Args:
               num_pool (int): Total pools for running a test.
            Returns:
               None
        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        # Create a pool
        pool = {}

        # Iterate through IOR different ior test sequence
        for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                self.ior_apis,
                                                self.ior_test_sequence,
                                                self.ior_flags):
            # Create the IOR threads
            threads = []
            for val in range(0, num_pool):
                pool[val] = TestPool(self.context, self.get_dmg_command())
                pool[val].get_params(self)
                # Split total SCM and NVME size for creating multiple pools.
                pool[val].scm_size.value = int(test[0]) // num_pool
                pool[val].nvme_size.value = int(test[1]) // num_pool
                pool[val].create()
                display_string = "pool{} space at the Beginning".format(val)
                self.pool = pool[val]
                self.pool.display_pool_daos_space(display_string)

                for thrd in range(0, num_jobs):
                    # Add a thread for these IOR arguments
                    threads.append(
                        threading.Thread(target=self.ior_thread,
                                         kwargs={
                                             "pool": pool[val],
                                             "oclass": oclass,
                                             "api": api,
                                             "test": test,
                                             "flags": flags,
                                             "results": self.out_queue
                                         }))
            # Launch the IOR threads
            for thrd in threads:
                self.log.info("Thread : %s", thrd)
                thrd.start()
                time.sleep(5)
            # Wait to finish the threads
            for thrd in threads:
                thrd.join()

            # Verify the queue and make sure no FAIL for any IOR run
            # Test should fail with ENOSPC.
            while not self.out_queue.empty():
                result = self.out_queue.get()
                if (result == "FAIL" and test[4] == "PASS") \
                        or (result != "FAIL" and test[4] == "FAIL"):
                    self.fail("FAIL")

            for val in range(0, num_pool):
                display_string = "Pool{} space at the End".format(val)
                self.pool = pool[val]
                self.pool.display_pool_daos_space(display_string)
                self.pool.destroy()

    def test_nvme_pool_capacity(self):
        """Jira ID: DAOS-2085.

        Test Description:
            Purpose of this test is to verify that the DAOS stack
            reports NOSPC when accessing data beyond the pool size.

        Use Cases:
            Test Case 1 or 2:
             1. Perform IO less than entire SSD disk space.
             2. Perform IO beyond entire SSD disk space.
            Test Case 3:
             3. Create Pool/Container and destroy them several times.

        :avocado: tags=all,hw,medium,ib2,nvme,full_regression
        :avocado: tags=nvme_pool_capacity
        """
        # Run test with one pool.
        self.log.info("Running Test Case 1 with one Pool")
        self.test_run(1)
        time.sleep(5)
        # Run test with two pools.
        self.log.info("Running Test Case 1 with two Pools")
        self.test_run(2)
        time.sleep(5)
        # Run Create/delete pool/container
        self.log.info("Running Test Case 3: Pool/Cont Create/Destroy")
        self.test_create_delete(10, 50, 100)
Example #19
    def setUp(self):
        """Set up each test case."""
        super(MacsioTestBase, self).setUp()
        self.manager = Mpirun(None, subprocess=False, mpitype="mpich")
        self.macsio = self.get_macsio_command()
Example #20
    def test_rebuild_container_create(self):
        """Jira ID: DAOS-1168.

        Test Description:
            Configure 4 servers and 1 client with 1 or 2 pools and a pool
            service leader quantity of 2.  Add 1 container to the first pool
            configured with 3 replicas.  Populate the container with 1GB of
            objects.  Exclude a server that has shards of this object and
            verify that rebuild is initiated.  While rebuild is active, create
            1000 additional containers in the same pool or the second pool
            (when available).  Finally verify that rebuild completes and the
            pool info indicates the correct number of rebuilt objects and
            records.  Also confirm that all 1000 additional containers created
            during rebuild are accessible.

        Use Cases:
            Basic rebuild of container objects of array values with sufficient
            numbers of rebuild targets and no available rebuild targets.

        :avocado: tags=all,medium,full_regression,rebuild,rebuildcontcreate
        """
        # Get test params
        targets = self.params.get("targets", "/run/server_config/*")
        pool_qty = self.params.get("pools", "/run/test/*")
        loop_qty = self.params.get("loops", "/run/test/*")
        cont_qty = self.params.get("containers", "/run/test/*")
        cont_obj_cls = self.params.get("container_obj_class", "/run/test/*")
        rank = self.params.get("rank", "/run/test/*")
        use_ior = self.params.get("use_ior", "/run/test/*", False)
        node_qty = len(self.hostlist_servers)

        # Get pool params
        self.pool = []
        for index in range(pool_qty):
            self.pool.append(
                TestPool(self.context, dmg_command=self.get_dmg_command()))
            self.pool[-1].get_params(self)

        if use_ior:
            # Get ior params
            mpirun = Mpirun(IorCommand())
            mpirun.job.get_params(self)
            mpirun.assign_hosts(
                self.hostlist_clients, self.workdir,
                self.hostfile_clients_slots)
            mpirun.assign_processes(len(self.hostlist_clients))
            mpirun.assign_environment(mpirun.job.get_default_env("mpirun"))

        # Cancel any tests with tickets already assigned
        if rank in (1, 2):
            self.cancelForTicket("DAOS-2434")

        errors = [0 for _ in range(loop_qty)]
        for loop in range(loop_qty):
            # Log the start of the loop
            loop_id = "LOOP {}/{}".format(loop + 1, loop_qty)
            self.log.info("%s", "-" * 80)
            self.log.info("%s: Starting loop", loop_id)

            # Start this loop with a fresh list of containers
            self.container = []

            # Create the requested number of pools
            info_checks = []
            rebuild_checks = []
            for pool in self.pool:
                pool.create()
                info_checks.append(
                    {
                        "pi_uuid": pool.uuid,
                        "pi_ntargets": node_qty * targets,
                        "pi_nnodes": node_qty,
                        "pi_ndisabled": 0,
                    }
                )
                rebuild_checks.append(
                    {
                        "rs_errno": 0,
                        "rs_done": 1,
                        "rs_obj_nr": 0,
                        "rs_rec_nr": 0,
                    }
                )

            # Check the pool info
            status = True
            for index, pool in enumerate(self.pool):
                status &= pool.check_pool_info(**info_checks[index])
                status &= pool.check_rebuild_status(**rebuild_checks[index])
                pool.display_pool_daos_space("after creation")
            self.assertTrue(
                status,
                "Error verifying pool info prior to excluding rank {}".format(
                    rank))

            # Create a container with 1GB of data in the first pool
            if use_ior:
                mpirun.job.flags.update("-v -w -W -G 1 -k", "ior.flags")
                mpirun.job.dfs_destroy.update(False, "ior.dfs_destroy")
                mpirun.job.set_daos_params(self.server_group, self.pool[0])
                self.log.info(
                    "%s: Running IOR on pool %s to fill container %s with data",
                    loop_id, self.pool[0].uuid, mpirun.job.dfs_cont.value)
                self.run_ior(loop_id, mpirun)
            else:
                self.container.append(TestContainer(self.pool[0]))
                self.container[-1].get_params(self)
                self.container[-1].create()
                self.log.info(
                    "%s: Writing to pool %s to fill container %s with data",
                    loop_id, self.pool[0].uuid, self.container[-1].uuid)
                self.container[-1].object_qty.value = 8
                self.container[-1].record_qty.value = 64
                self.container[-1].data_size.value = 1024 * 1024
                self.container[-1].write_objects(rank, cont_obj_cls)
                rank_list = self.container[-1].get_target_rank_lists(
                    " after writing data")
                self.container[-1].get_target_rank_count(rank, rank_list)

            # Display the updated pool space usage
            for pool in self.pool:
                pool.display_pool_daos_space("after container creation")

            # Exclude the first rank from the first pool to initiate rebuild
            self.pool[0].start_rebuild([rank], self.d_log)

            # Wait for rebuild to start
            self.pool[0].wait_for_rebuild(True, 1)

            # Create additional containers in the last pool
            start_index = len(self.container)
            self.add_containers_during_rebuild(
                loop_id, cont_qty, self.pool[0], self.pool[-1])

            # Confirm rebuild completes
            self.pool[0].wait_for_rebuild(False, 1)

            # Check the pool info
            info_checks[0]["pi_ndisabled"] += targets
            rebuild_checks[0]["rs_done"] = 1
            rebuild_checks[0]["rs_obj_nr"] = ">=0"
            rebuild_checks[0]["rs_rec_nr"] = ">=0"
            for index, pool in enumerate(self.pool):
                status &= pool.check_pool_info(**info_checks[index])
                status &= pool.check_rebuild_status(**rebuild_checks[index])
            self.assertTrue(status, "Error verifying pool info after rebuild")

            # Verify that each of created containers exist by opening them
            for index in range(start_index, len(self.container)):
                count = "{}/{}".format(
                    index - start_index + 1, len(self.container) - start_index)
                if not self.access_container(loop_id, index, count):
                    errors[loop] += 1

            # Destroy the containers created during rebuild
            for index in range(start_index, len(self.container)):
                self.container[index].destroy()

            # Read the data from the container created before rebuild
            if use_ior:
                self.log.info(
                    "%s: Running IOR on pool %s to verify container %s",
                    loop_id, self.pool[0].uuid, mpirun.job.dfs_cont.value)
                mpirun.job.flags.update("-v -r -R -G 1 -E", "ior.flags")
                mpirun.job.dfs_destroy.update(True, "ior.dfs_destroy")
                self.run_ior(loop_id, mpirun)
            else:
                self.log.info(
                    "%s: Reading pool %s to verify container %s",
                    loop_id, self.pool[0].uuid, self.container[0].uuid)
                self.assertTrue(
                    self.container[0].read_objects(),
                    "Error verifying data written before rebuild")
                self.container[0].destroy()

            # Destroy the pools
            for pool in self.pool:
                pool.destroy(1)

            self.log.info(
                "%s: Loop %s", loop_id,
                "passed" if errors[loop] == 0 else "failed")

        self.log.info("Test %s", "passed" if sum(errors) == 0 else "failed")
Example No. 21
class NvmeEnospace(ServerFillUp):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: To validate DER_NOSPACE for SCM and NVMe
    :avocado: recursive
    """

    def __init__(self, *args, **kwargs):
        """Initialize a NvmeEnospace object."""
        super(NvmeEnospace, self).__init__(*args, **kwargs)
        self.daos_cmd = None

    def setUp(self):
        """Set up for test case."""
        super(NvmeEnospace, self).setUp()

        # initialize daos command
        self.daos_cmd = DaosCommand(self.bin)
        self.create_pool_max_size()
        self.der_nospace_count = 0
        self.other_errors_count = 0

    def verify_enspace_log(self, der_nospace_err_count):
        """Verify the DER_NOSPACE count in the client log.

        Check that there are no errors other than DER_NOSPACE in the client
        log and that the DER_NOSPACE count has increased as expected.

        Args:
            der_nospace_err_count (int): minimum expected DER_NOSPACE count
                from the client log.
        """
        # Get the DER_NOSPACE and other error counts from the log
        self.der_nospace_count, self.other_errors_count = error_count(
            "-1007", self.hostlist_clients, self.client_log)

        # Fail if there are any other errors in the log file
        if self.other_errors_count > 0:
            self.fail('Found other errors, count {} in client log {}'
                      .format(self.other_errors_count, self.client_log))
        # Fail if the DER_NOSPACE error count did not increase as expected
        if self.der_nospace_count < der_nospace_err_count:
            self.fail('Expected DER_NOSPACE count > {}, found {}'
                      .format(der_nospace_err_count, self.der_nospace_count))
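        # Usage sketch (hypothetical counts): if error_count() previously
        # reported 5 DER_NOSPACE hits, verify_enspace_log(5) passes only when
        # the new count exceeds 5 and no other errors were logged.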

    def delete_all_containers(self):
        """
        Delete all the containers.
        """
        # List all the containers
        kwargs = {"pool": self.pool.uuid}
        data = self.daos_cmd.pool_list_cont(**kwargs)
        containers = data["uuids"]

        # Destroy all the containers
        for _cont in containers:
            kwargs["cont"] = _cont
            self.daos_cmd.container_destroy(**kwargs)
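        # Note: pool_list_cont() returns a dict whose "uuids" entry lists
        # every container UUID in the pool; each one is destroyed above.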

    def ior_bg_thread(self, results):
        """Start IOR Background thread, This will write small data set and
        keep reading it in loop until it fails or main program exit.

        Args:
            results (queue): queue for returning thread results
        """
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")

        # Define the IOR Command and use the parameter from yaml file.
        ior_bg_cmd = IorCommand()
        ior_bg_cmd.get_params(self)
        ior_bg_cmd.set_daos_params(self.server_group, self.pool)
        ior_bg_cmd.dfs_oclass.update(self.ior_cmd.dfs_oclass.value)
        ior_bg_cmd.api.update(self.ior_cmd.api.value)
        ior_bg_cmd.transfer_size.update(self.ior_scm_xfersize)
        ior_bg_cmd.block_size.update(self.ior_cmd.block_size.value)
        ior_bg_cmd.flags.update(self.ior_cmd.flags.value)
        ior_bg_cmd.test_file.update('/testfile_background')

        # Define the job manager for the IOR command
        self.job_manager = Mpirun(ior_bg_cmd, mpitype="mpich")
        self.create_cont()
        self.job_manager.job.dfs_cont.update(self.container.uuid)
        env = ior_bg_cmd.get_default_env(str(self.job_manager))
        self.job_manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        self.job_manager.assign_processes(1)
        self.job_manager.assign_environment(env, True)
        print('----Run IOR in Background-------')
        # run IOR Write Command
        try:
            self.job_manager.run()
        except (CommandFailure, TestFail) as _error:
            results.put("FAIL")
            return

        # run IOR Read Command in loop
        ior_bg_cmd.flags.update(self.ior_read_flags)
        while True:
            try:
                self.job_manager.run()
            except (CommandFailure, TestFail) as _error:
                results.put("FAIL")
                break
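        # The read loop above repeats until an IOR run fails (queuing "FAIL");
        # the caller starts this function in a daemon thread, so it is
        # abandoned when the main program exits.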

    def run_enospace_foreground(self):
        """
        Function to run test and validate DER_ENOSPACE and expected storage size
        """
        #Fill 75% more of SCM pool,Aggregation is Enabled so NVMe space will be
        #start filling
        print('Starting main IOR load')
        self.start_ior_load(storage='SCM', percent=75)
        print(self.pool.pool_percentage_used())

        # Fill 50% more of the SCM pool. Aggregation is enabled, so NVMe
        # space will be filled.
        self.start_ior_load(storage='SCM', percent=50)
        print(self.pool.pool_percentage_used())

        # Fill 60% more of the SCM pool. NVMe is now full, so data will not
        # be moved to NVMe and SCM will keep filling until it is full; this
        # command is expected to fail with DER_NOSPACE.
        try:
            self.start_ior_load(storage='SCM', percent=60)
            self.fail('This test is supposed to FAIL because of DER_NOSPACE '
                      'but it passed')
        except TestFail as _error:
            self.log.info('Test expected to fail because of DER_NOSPACE')
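
        # Net effect: three SCM-relative fills (75% + 50% + 60%) overcommit
        # the pool, so aggregation first spills data to NVMe and the final
        # write hits DER_NOSPACE.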

        # Display the pool usage percentage
        print(self.pool.pool_percentage_used())

        # Verify the DER_NOSPACE error count is as expected and there are no
        # other errors in the client log
        self.verify_enspace_log(self.der_nospace_count)

        # Check that both NVMe and SCM are full.
        pool_usage = self.pool.pool_percentage_used()
        # NVMe should be almost full; fail the test if it is not.
        if pool_usage['nvme'] > 8:
            self.fail('Pool NVMe used percentage should be < 8%, instead {}'.
                      format(pool_usage['nvme']))
        # Some SCM space is used by the system, so SCM will not be 100% full.
        if pool_usage['scm'] > 50:
            self.fail('Pool SCM used percentage should be < 50%, instead {}'.
                      format(pool_usage['scm']))

    def run_enospace_with_bg_job(self):
        """
        Function to run test and validate DER_ENOSPACE and expected storage
        size. Single IOR job will run in background while space is filling.
        """
        #Get the initial DER_ENOSPACE count
        self.der_nospace_count, self.other_errors_count = error_count(
            "-1007", self.hostlist_clients, self.client_log)

        # Start the IOR background thread, which writes a small data set and
        # reads it in a loop until the storage space is full.
        self.out_queue = queue.Queue()
        job = threading.Thread(target=self.ior_bg_thread,
                               kwargs={"results": self.out_queue})
        job.daemon = True
        job.start()

        # Run IOR in the foreground
        self.run_enospace_foreground()
        # Verify the background job queue and make sure no IOR run failed
        while not self.out_queue.empty():
            if self.out_queue.get() == "FAIL":
                self.fail("One of the background IOR jobs failed")

    def test_enospace_lazy_with_bg(self):
        """Jira ID: DAOS-4756.

        Test Description: IO gets DER_NOSPACE when SCM and NVMe are full with
                          the default (lazy) aggregation mode.

        Use Case: This test creates the pool and fills 75% of the SCM size,
                  which triggers aggregation because of space pressure, then
                  fills 50% more, which should fill NVMe. Trying to fill 60%
                  more then fills up SCM as well. Verify that the last IO
                  fails with DER_NOSPACE and the SCM/NVMe pool capacity is
                  full. One background IO job runs continuously.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_lazy,enospc_lazy_bg
        """
        print(self.pool.pool_percentage_used())

        # Run IOR to fill the pool.
        self.run_enospace_with_bg_job()

    def test_enospace_lazy_with_fg(self):
        """Jira ID: DAOS-4756.

        Test Description: Fill up the system (default aggregation mode) and
                          delete all containers in a loop, which should
                          release the space.

        Use Case: This test creates the pool and fills 75% of the SCM size,
                  which triggers aggregation because of space pressure, then
                  fills 50% more, which should fill NVMe. Trying to fill 60%
                  more then fills up SCM as well. Verify that the last IO
                  fails with DER_NOSPACE and the SCM/NVMe pool capacity is
                  full. Delete all the containers. Do this in a loop 10 times
                  and verify the space is released.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_lazy,enospc_lazy_fg
        """
        print(self.pool.pool_percentage_used())

        # Repeat the test in a loop.
        for _loop in range(10):
            print("-------enospc_lazy_fg Loop--------- {}".format(_loop))
            # Run IOR to fill the pool.
            self.run_enospace_foreground()
            # Delete all the containers
            self.delete_all_containers()
            # Deleting containers takes some time to release the space
            time.sleep(60)

        # Run the last IO
        self.start_ior_load(storage='SCM', percent=1)

    def test_enospace_time_with_bg(self):
        """Jira ID: DAOS-4756.

        Test Description: IO gets DER_NOSPACE when SCM is full, and the space
                          is released when the container is destroyed, with
                          aggregation set to time mode.

        Use Case: This test creates the pool and sets the aggregation mode to
                  Time. Start filling 75% of the SCM size; aggregation will
                  be triggered from time to time. Next fill 50% more, which
                  will fill up NVMe. Trying to fill 60% more then fills up
                  SCM as well. Verify the last IO fails with DER_NOSPACE and
                  the SCM/NVMe pool capacity is full. One background IO job
                  runs continuously.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_time,enospc_time_bg
        """
        print(self.pool.pool_percentage_used())

        # Enable Time mode for aggregation.
        self.pool.set_property("reclaim", "time")

        # Run IOR to fill the pool.
        self.run_enospace_with_bg_job()

    def test_enospace_time_with_fg(self):
        """Jira ID: DAOS-4756.

        Test Description: Fill up the system (time aggregation mode) and
                          delete all containers in a loop, which should
                          release the space.

        Use Case: This test creates the pool and sets the aggregation mode to
                  Time. Start filling 75% of the SCM size; aggregation will
                  be triggered from time to time. Next fill 50% more, which
                  will fill up NVMe. Trying to fill 60% more then fills up
                  SCM as well. Verify the last IO fails with DER_NOSPACE and
                  the SCM/NVMe pool capacity is full. Delete all the
                  containers. Do this in a loop 10 times and verify the space
                  is released.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_time,enospc_time_fg
        """
        print(self.pool.pool_percentage_used())

        # Enable Time mode for aggregation.
        self.pool.set_property("reclaim", "time")

        # Repeat the test in a loop.
        for _loop in range(10):
            print("-------enospc_time_fg Loop--------- {}".format(_loop))
            # Run IOR to fill the pool.
            self.run_enospace_with_bg_job()
            # Delete all the containers
            self.delete_all_containers()
            # Deleting containers takes some time to release the space
            time.sleep(60)

        # Run the last IO
        self.start_ior_load(storage='SCM', percent=1)

    @skipForTicket("DAOS-5403")
    def test_performance_storage_full(self):
        """Jira ID: DAOS-4756.

        Test Description: Verify IO read performance when the pool is full.

        Use Case: This test creates the pool and runs a small set of IOR as a
                  baseline. Start IOR with transfers < 4K, which will start
                  filling SCM, trigger aggregation, and start filling up
                  NVMe. Check the IOR baseline read number and make sure the
                  number obtained after the storage is full is within ±5%
                  of it.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_performance
        """
        # Write the IOR baseline and get the read BW for later comparison.
        print(self.pool.pool_percentage_used())
        # Write first
        self.start_ior_load(storage='SCM', percent=1)
        # Read the baseline data set
        self.start_ior_load(storage='SCM', operation='Read', percent=1)
        max_mib_baseline = float(self.ior_matrix[0][int(IorMetrics.Max_MiB)])
        baseline_cont_uuid = self.ior_cmd.dfs_cont.value
        print("IOR Baseline Read MiB {}".format(max_mib_baseline))

        # Run IOR to fill the pool.
        self.run_enospace_with_bg_job()

        # Read the same container that was written at the beginning.
        self.container.uuid = baseline_cont_uuid
        self.start_ior_load(storage='SCM', operation='Read', percent=1)
        max_mib_latest = float(self.ior_matrix[0][int(IorMetrics.Max_MiB)])
        print("IOR Latest Read MiB {}".format(max_mib_latest))

        # Check that the latest IOR read performance is within a 5% tolerance
        # now that the storage space is full.
        if abs(max_mib_baseline - max_mib_latest) > (max_mib_baseline / 100 * 5):
            self.fail('Latest IOR read performance is not within the 5% '
                      'tolerance. Baseline Read MiB = {} and latest IOR Read '
                      'MiB = {}'.format(max_mib_baseline, max_mib_latest))
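        # Worked example (hypothetical numbers): with a baseline of 1000 MiB,
        # the 5% tolerance is 50 MiB, so any latest read result between 950
        # and 1050 MiB passes the check above.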

    def test_enospace_no_aggregation(self):
        """Jira ID: DAOS-4756.

        Test Description: IO gets DER_NOSPACE when SCM is full, and the space
                          is released when the containers are destroyed, with
                          aggregation disabled.

        Use Case: This test creates the pool and disables aggregation. Fill
                  40% of the SCM size, which should work, then try to fill
                  40% more, which should fail with DER_NOSPACE. Destroy the
                  containers and validate that the pool SCM free size is
                  close to full (> 95%). Do this in a loop ~10 times and
                  verify the DER_NOSPACE count and the SCM free size after
                  the containers are destroyed.

        :avocado: tags=all,hw,medium,nvme,ib2,full_regression
        :avocado: tags=der_enospace,enospc_no_aggregation
        """
        # pylint: disable=attribute-defined-outside-init
        # pylint: disable=too-many-branches
        print(self.pool.pool_percentage_used())

        # Disable the aggregation
        self.pool.set_property("reclaim", "disabled")

        # Get the initial DER_NOSPACE and other error counts from the log
        self.der_nospace_count, self.other_errors_count = error_count(
            "-1007", self.hostlist_clients, self.client_log)

        # Repeat the test in a loop.
        for _loop in range(10):
            print("-------enospc_no_aggregation Loop--------- {}".format(_loop))
            # Fill 40% of the SCM pool
            self.start_ior_load(storage='SCM', percent=40)

            print(self.pool.pool_percentage_used())

            try:
                # Fill 40% more of SCM, which should fail with no SCM space
                self.start_ior_load(storage='SCM', percent=40)
                self.fail('This test is supposed to fail because of '
                          'DER_NOSPACE but it passed')
            except TestFail as _error:
                self.log.info('Expected to fail because of DER_NOSPACE')

            # Verify the DER_NOSPACE error count is as expected and there are
            # no other errors in the client log.
            self.verify_enspace_log(self.der_nospace_count)

            # Delete all the containers
            self.delete_all_containers()

            # Deleting containers takes some time to release the space
            time.sleep(60)
            # Get the pool usage
            pool_usage = self.pool.pool_percentage_used()
            print(pool_usage)
            # The SCM pool space should be released (some is still used by
            # the system), so the used percentage should not exceed 55%
            if pool_usage['scm'] > 55:
                self.fail('SCM pool used percentage should be < 55, instead {}'.
                          format(pool_usage['scm']))

        # Run the last IO
        self.start_ior_load(storage='SCM', percent=1)
Example No. 22
class RbldContainerCreate(TestWithServers):
    """Rebuild with container creation test cases.

    Test Class Description:
        These rebuild tests verify the ability to create additional containers
        while rebuild is ongoing.

    :avocado: recursive
    """
    def add_containers_during_rebuild(self, loop_id, qty, pool1, pool2):
        """Add containers to a pool while rebuild is still in progress.

        Args:
            loop_id (str): loop identification string
            qty (int): the number of containers to create
            pool1 (TestPool): pool used to determine if rebuild is complete
            pool2 (TestPool): pool used to add containers

        """
        count = 0
        while not pool1.rebuild_complete() and count < qty:
            # Create a new container
            count += 1
            self.log.info(
                "%s: Creating container %s/%s in pool %s during rebuild",
                loop_id, count, qty, pool2.uuid)
            self.container.append(TestContainer(pool2))
            self.container[-1].get_params(self)
            self.container[-1].create()
            self.container[-1].write_objects()

        if count < qty:
            self.fail("{}: Rebuild completed with only {}/{} containers "
                      "created".format(loop_id, count, qty))

    def run_ior(self, loop_id, mpirun):
        """Run the ior command defined by the specified ior command object.

        Args:
            loop_id (str): loop identification string
            mpirun (Mpirun): mpirun command object to run ior
        """
        total_bytes = mpirun.job.get_aggregate_total(mpirun.processes.value)
        try:
            mpirun.run()
        except CommandFailure as error:
            self.fail(
                "{}: Error populating the container with {} bytes of data "
                "prior to target exclusion: {}".format(loop_id, total_bytes,
                                                       error))
        self.log.info("%s: %s %s bytes to the container", loop_id,
                      "Wrote" if "-w" in mpirun.job.flags.value else "Read",
                      total_bytes)

    def access_container(self, loop_id, index, message):
        """Open and close the specified container.

        Args:
            loop_id (str): loop identification string
            index (int): index of the daos container object to open/close
            message (str): additional text describing the container

        Returns:
            bool: was the opening and closing of the container successful

        """
        status = True
        self.log.info("%s: Verifying the container %s created during rebuild",
                      loop_id, message)
        try:
            self.container[index].read_objects()
            self.container[index].close()

        except TestFail as error:
            self.log.error("%s:  - Container read failed:",
                           loop_id,
                           exc_info=error)
            status = False

        return status

    def test_rebuild_container_create(self):
        """Jira ID: DAOS-1168.

        Test Description:
            Configure 4 servers and 1 client with 1 or 2 pools and a pool
            service leader quantity of 2.  Add 1 container to the first pool
            configured with 3 replicas.  Populate the container with 1GB of
            objects.  Exclude a server that has shards of this object and
            verify that rebuild is initiated.  While rebuild is active, create
            1000 additional containers in the same pool or the second pool
            (when available).  Finally verify that rebuild completes and the
            pool info indicates the correct number of rebuilt objects and
            records.  Also confirm that all 1000 additional containers created
            during rebuild are accessible.

        Use Cases:
            Basic rebuild of container objects of array values with sufficient
            numbers of rebuild targets and no available rebuild targets.

        :avocado: tags=all,full_regression
        :avocado: tags=medium
        :avocado: tags=rebuild,rebuild_cont_create
        """
        # Get test params
        targets = self.params.get("targets", "/run/server_config/*")
        pool_qty = self.params.get("pools", "/run/test/*")
        loop_qty = self.params.get("loops", "/run/test/*")
        cont_qty = self.params.get("containers", "/run/test/*")
        cont_obj_cls = self.params.get("container_obj_class", "/run/test/*")
        rank = self.params.get("rank", "/run/test/*")
        use_ior = self.params.get("use_ior", "/run/test/*", False)
        node_qty = len(self.hostlist_servers)

        # Get pool params
        self.pool = []
        for index in range(pool_qty):
            self.pool.append(self.get_pool(create=False))

        if use_ior:
            # Get ior params
            self.job_manager = Mpirun(IorCommand())
            self.job_manager.job.get_params(self)
            self.job_manager.assign_hosts(self.hostlist_clients, self.workdir,
                                          self.hostfile_clients_slots)
            self.job_manager.assign_processes(len(self.hostlist_clients))
            self.job_manager.assign_environment(
                self.job_manager.job.get_default_env("mpirun"))

        errors = [0 for _ in range(loop_qty)]
        for loop in range(loop_qty):
            # Log the start of the loop
            loop_id = "LOOP {}/{}".format(loop + 1, loop_qty)
            self.log.info("%s", "-" * 80)
            self.log.info("%s: Starting loop", loop_id)

            # Start this loop with a fresh list of containers
            self.container = []

            # Create the requested number of pools
            info_checks = []
            rebuild_checks = []
            for pool in self.pool:
                pool.create()
                info_checks.append({
                    "pi_uuid": pool.uuid,
                    "pi_ntargets": node_qty * targets,
                    "pi_nnodes": node_qty,
                    "pi_ndisabled": 0,
                })
                rebuild_checks.append({
                    "rs_errno": 0,
                    "rs_done": 1,
                    "rs_obj_nr": 0,
                    "rs_rec_nr": 0,
                })

            # Check the pool info
            status = True
            for index, pool in enumerate(self.pool):
                status &= pool.check_pool_info(**info_checks[index])
                status &= pool.check_rebuild_status(**rebuild_checks[index])
                pool.display_pool_daos_space("after creation")
            self.assertTrue(
                status,
                "Error verifying pool info prior to excluding rank {}".format(
                    rank))

            # Create a container with 1GB of data in the first pool
            if use_ior:
                self.job_manager.job.flags.update("-v -w -W -G 1 -k",
                                                  "ior.flags")
                self.job_manager.job.dfs_destroy.update(
                    False, "ior.dfs_destroy")
                self.job_manager.job.set_daos_params(self.server_group,
                                                     self.pool[0])
                self.log.info(
                    "%s: Running IOR on pool %s to fill container %s with data",
                    loop_id, self.pool[0].uuid,
                    self.job_manager.job.dfs_cont.value)
                self.run_ior(loop_id, self.job_manager)
            else:
                self.container.append(TestContainer(self.pool[0]))
                self.container[-1].get_params(self)
                self.container[-1].create()
                self.log.info(
                    "%s: Writing to pool %s to fill container %s with data",
                    loop_id, self.pool[0].uuid, self.container[-1].uuid)
                self.container[-1].object_qty.value = 8
                self.container[-1].record_qty.value = 64
                self.container[-1].data_size.value = 1024 * 1024
                self.container[-1].write_objects(rank, cont_obj_cls)
                rank_list = self.container[-1].get_target_rank_lists(
                    " after writing data")
                self.container[-1].get_target_rank_count(rank, rank_list)

            # Display the updated pool space usage
            for pool in self.pool:
                pool.display_pool_daos_space("after container creation")

            # Stop the rank to initiate rebuild
            self.server_managers[0].stop_ranks([rank], self.d_log)

            # Wait for rebuild to start
            self.pool[0].wait_for_rebuild(True, 1)

            # Create additional containers in the last pool
            start_index = len(self.container)
            self.add_containers_during_rebuild(loop_id, cont_qty, self.pool[0],
                                               self.pool[-1])

            # Confirm rebuild completes
            self.pool[0].wait_for_rebuild(False, 1)

            # Check the pool info
            info_checks[0]["pi_ndisabled"] += targets
            rebuild_checks[0]["rs_done"] = 1
            rebuild_checks[0]["rs_obj_nr"] = ">=0"
            rebuild_checks[0]["rs_rec_nr"] = ">=0"
            for index, pool in enumerate(self.pool):
                status &= pool.check_pool_info(**info_checks[index])
                status &= pool.check_rebuild_status(**rebuild_checks[index])
            self.assertTrue(status, "Error verifying pool info after rebuild")

            # Verify that each of the created containers exists by opening it
            for index in range(start_index, len(self.container)):
                count = "{}/{}".format(index - start_index + 1,
                                       len(self.container) - start_index)
                if not self.access_container(loop_id, index, count):
                    errors[loop] += 1

            # Destroy the containers created during rebuild
            for index in range(start_index, len(self.container)):
                self.container[index].destroy()

            # Read the data from the container created before rebuild
            if use_ior:
                self.log.info(
                    "%s: Running IOR on pool %s to verify container %s",
                    loop_id, self.pool[0].uuid,
                    self.job_manager.job.dfs_cont.value)
                self.job_manager.job.flags.update("-v -r -R -G 1 -E",
                                                  "ior.flags")
                self.job_manager.job.dfs_destroy.update(
                    True, "ior.dfs_destroy")
                self.run_ior(loop_id, self.job_manager)
            else:
                self.log.info("%s: Reading pool %s to verify container %s",
                              loop_id, self.pool[0].uuid,
                              self.container[0].uuid)
                self.assertTrue(self.container[0].read_objects(),
                                "Error verifying data written before rebuild")
                self.container[0].destroy()

            # Destroy the pools
            for pool in self.pool:
                pool.destroy(1)

            self.log.info("%s: Loop %s", loop_id,
                          "passed" if errors[loop] == 0 else "failed")

        self.log.info("Test %s", "passed" if sum(errors) == 0 else "failed")
Example No. 23
class OSAOnlineDrain(TestWithServers):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: This test runs
    daos_server Online Drain test cases.

    :avocado: recursive
    """
    def setUp(self):
        """Set up for test case."""
        super(OSAOnlineDrain, self).setUp()
        self.dmg_command = self.get_dmg_command()
        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get("ior_test_sequence",
                                                 '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get("obj_class",
                                              '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(self.hostlist_clients,
                                                self.workdir, None)
        self.pool = None
        self.out_queue = queue.Queue()

    @fail_on(CommandFailure)
    def get_pool_leader(self):
        """Get the pool leader.

        Returns:
            int: pool leader value

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["leader"])

    @fail_on(CommandFailure)
    def get_pool_version(self):
        """Get the pool version.

        Returns:
            int: pool_version_value

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["version"])

    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Start threads and wait until all threads are finished.
        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            API (str): IOR API
            test (list): IOR test sequence
            flags (str): IOR flags
            results (queue): queue for returning thread results
        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        container_info["{}{}{}".format(oclass, api,
                                       test[2])] = str(uuid.uuid4())

        # Define the job manager for the IOR command
        self.job_manager = Mpirun(ior_cmd, mpitype="mpich")
        key = "".join([oclass, api, str(test[2])])
        self.job_manager.job.dfs_cont.update(container_info[key])
        env = ior_cmd.get_default_env(str(self.job_manager))
        self.job_manager.assign_hosts(self.hostlist_clients, self.workdir,
                                      None)
        self.job_manager.assign_processes(processes)
        self.job_manager.assign_environment(env, True)

        # run IOR Command
        try:
            self.job_manager.run()
        except CommandFailure as _error:
            results.put("FAIL")

    def run_online_drain_test(self, num_pool):
        """Run the Online drain without data.
            Args:
             int : total pools to create for testing purposes.
        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        # Create a pool
        pool = {}
        pool_uuid = []
        target_list = []
        drain_servers = len(self.hostlist_servers) - 1

        # Drain two consecutive targets chosen at random (target idx: 0-7)
        n = random.randint(0, 6)
        target_list.append(n)
        target_list.append(n + 1)
        t_string = "{},{}".format(target_list[0], target_list[1])

        # Drain one of the ranks (i.e., one of the servers)
        rank = random.randint(1, drain_servers)

        for val in range(0, num_pool):
            pool[val] = TestPool(self.context, self.get_dmg_command())
            pool[val].get_params(self)
            # Split the total SCM and NVMe size for creating multiple pools.
            pool[val].scm_size.value = int(pool[val].scm_size.value / num_pool)
            pool[val].nvme_size.value = int(pool[val].nvme_size.value /
                                            num_pool)
            pool[val].create()
            pool_uuid.append(pool[val].uuid)

        # Drain the pool_uuid, rank and targets
        for val in range(0, num_pool):
            for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                    self.ior_apis,
                                                    self.ior_test_sequence,
                                                    self.ior_flags):
                threads = []
                for thrd in range(0, num_jobs):
                    # Add a thread for these IOR arguments
                    threads.append(
                        threading.Thread(target=self.ior_thread,
                                         kwargs={
                                             "pool": pool[val],
                                             "oclass": oclass,
                                             "api": api,
                                             "test": test,
                                             "flags": flags,
                                             "results": self.out_queue
                                         }))
                # Launch the IOR threads
                for thrd in threads:
                    self.log.info("Thread : %s", thrd)
                    thrd.start()
                    time.sleep(5)
            self.pool = pool[val]
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)
            output = self.dmg_command.pool_drain(self.pool.uuid, rank,
                                                 t_string)
            self.log.info(output)

            # Poll for up to ~200 s (20 iterations x 10 s) for the pool map
            # version to advance beyond the pre-drain version
            fail_count = 0
            while fail_count <= 20:
                pver_drain = self.get_pool_version()
                time.sleep(10)
                fail_count += 1
                if pver_drain > pver_begin + 1:
                    break

            self.log.info("Pool Version after drain %s", pver_drain)
            # Check that the pool version incremented after the pool drain
            self.assertTrue(pver_drain > pver_begin,
                            "Pool Version Error: After drain")
            # Wait to finish the threads
            for thrd in threads:
                thrd.join()

        for val in range(0, num_pool):
            display_string = "Pool{} space at the End".format(val)
            self.pool = pool[val]
            self.pool.display_pool_daos_space(display_string)
            pool[val].destroy()

    @skipForTicket("DAOS-6061")
    def test_osa_online_drain(self):
        """Test ID: DAOS-4750
        Test Description: Validate Online drain

        :avocado: tags=all,pr,hw,large,osa,osa_drain,online_drain,DAOS_5610
        """
        # Perform drain testing with 1 to 2 pools
        for pool_num in range(1, 3):
            self.run_online_drain_test(pool_num)
Example No. 24
class IorTestBase(DfuseTestBase):
    # pylint: disable=too-many-ancestors
    """Base IOR test class.

    :avocado: recursive
    """

    IOR_WRITE_PATTERN = "Commencing write performance test"
    IOR_READ_PATTERN = "Commencing read performance test"

    def __init__(self, *args, **kwargs):
        """Initialize a IorTestBase object."""
        super().__init__(*args, **kwargs)
        self.ior_cmd = None
        self.processes = None
        self.hostfile_clients_slots = None
        self.container = None
        self.ior_timeout = None
        self.ppn = None

    def setUp(self):
        """Set up each test case."""
        # obtain separate logs
        self.update_log_file_names()
        # Start the servers and agents
        super().setUp()

        # Get the parameters for IOR
        self.ior_cmd = IorCommand()
        self.ior_cmd.get_params(self)
        self.processes = self.params.get("np", '/run/ior/client_processes/*')
        self.ppn = self.params.get("ppn", '/run/ior/client_processes/*')
        self.subprocess = self.params.get("subprocess", '/run/ior/*', False)
        self.ior_timeout = self.params.get("ior_timeout", '/run/ior/*', None)

    def create_pool(self):
        """Create a TestPool object to use with ior."""
        # Get the pool params and create a pool
        self.add_pool(connect=False)

    def create_cont(self):
        """Create a TestContainer object to be used to create container.

        """
        # Get container params
        self.container = TestContainer(self.pool,
                                       daos_command=DaosCommand(self.bin))
        self.container.get_params(self)

        # update container oclass
        if self.ior_cmd.dfs_oclass:
            self.container.oclass.update(self.ior_cmd.dfs_oclass.value)

        # create container
        self.container.create()

    def display_pool_space(self, pool=None):
        """Display the current pool space.

        If the TestPool object has a DmgCommand object assigned, also display
        the free pool space per target.

        Args:
            pool (TestPool, optional): The pool for which to display space.
                    Default is self.pool.
        """
        if not pool:
            pool = self.pool

        pool.display_pool_daos_space()
        if pool.dmg:
            pool.set_query_data()

    def run_ior_with_pool(self,
                          intercept=None,
                          test_file_suffix="",
                          test_file="daos:/testFile",
                          create_pool=True,
                          create_cont=True,
                          stop_dfuse=True,
                          plugin_path=None,
                          timeout=None,
                          fail_on_warning=False,
                          mount_dir=None,
                          out_queue=None,
                          env=None):
        # pylint: disable=too-many-arguments
        """Execute ior with optional overrides for ior flags and object_class.

        If specified the ior flags and ior daos object class parameters will
        override the values read from the yaml file.

        Args:
            intercept (str, optional): path to the interception library. Shall
                    be used only for POSIX through DFUSE. Defaults to None.
            test_file_suffix (str, optional): suffix to add to the end of the
                test file name. Defaults to "".
            test_file (str, optional): ior test file name. Defaults to
                "daos:/testFile". Is ignored when using POSIX through DFUSE.
            create_pool (bool, optional): If it is true, create pool and
                container else just run the ior. Defaults to True.
            create_cont (bool, optional): Create new container. Default is True
            stop_dfuse (bool, optional): Stop dfuse after ior command is
                finished. Default is True.
            plugin_path (str, optional): HDF5 vol connector library path.
                This will enable dfuse (xattr) working directory which is
                needed to run vol connector for DAOS. Default is None.
            timeout (int, optional): command timeout. Defaults to None.
            fail_on_warning (bool, optional): Controls whether the test
                should fail if a 'WARNING' is found. Default is False.
            mount_dir (str, optional): Create specific mount point
            out_queue (queue, optional): Pass the exception to the queue.
                Defaults to None
            env (EnvironmentVariables, optional): Pass the environment to be
                used when calling run_ior. Defaults to None

        Returns:
            CmdResult: result of the ior command execution

        """
        if create_pool:
            self.update_ior_cmd_with_pool(create_cont)

        # start dfuse if api is POSIX or HDF5 with vol connector
        if self.ior_cmd.api.value == "POSIX" or plugin_path:
            # add a substring in case of HDF5-VOL
            if plugin_path:
                sub_dir = get_random_string(5)
                mount_dir = os.path.join(mount_dir, sub_dir)
            # Connect to the pool, create container and then start dfuse
            if not self.dfuse:
                self.start_dfuse(self.hostlist_clients, self.pool,
                                 self.container, mount_dir)

        # setup test file for POSIX or HDF5 with vol connector
        if self.ior_cmd.api.value == "POSIX" or plugin_path:
            test_file = os.path.join(self.dfuse.mount_dir.value, "testfile")
        elif self.ior_cmd.api.value == "DFS":
            test_file = os.path.join("/", "testfile")

        self.ior_cmd.test_file.update("".join([test_file, test_file_suffix]))
        job_manager = self.get_ior_job_manager_command()
        job_manager.timeout = timeout
        try:
            out = self.run_ior(job_manager,
                               self.processes,
                               intercept,
                               plugin_path=plugin_path,
                               fail_on_warning=fail_on_warning,
                               out_queue=out_queue,
                               env=env)
        finally:
            if stop_dfuse:
                self.stop_dfuse()

        return out

    def update_ior_cmd_with_pool(self, create_cont=True):
        """Update ior_cmd with pool.

        Args:
          create_cont (bool, optional): create a container. Defaults to True.
        """
        # Create a pool if one does not already exist
        if self.pool is None:
            self.create_pool()
        # Create a container, if needed.
        # Don't pass uuid and pool handle to IOR.
        # It will not enable checksum feature
        if create_cont:
            self.pool.connect()
            self.create_cont()
        # Update IOR params with the pool and container params
        self.ior_cmd.set_daos_params(self.server_group, self.pool,
                                     self.container.uuid)

    def get_ior_job_manager_command(self, custom_ior_cmd=None):
        """Get the MPI job manager command for IOR.

        Args:
            custom_ior_cmd (IorCommand): Custom IorCommand instance to create
            job_manager with.

        Returns:
            Mpirun: the job manager command object for running IOR

        """
        # Initialize MpioUtils if IOR is running in MPIIO, POSIX, DFS, or
        # HDF5 mode
        if self.ior_cmd.api.value in ["MPIIO", "POSIX", "DFS", "HDF5"]:
            mpio_util = MpioUtils()
            if mpio_util.mpich_installed(self.hostlist_clients) is False:
                self.fail("Exiting Test: Mpich not installed")
        else:
            self.fail("Unsupported IOR API")

        if custom_ior_cmd:
            self.job_manager = Mpirun(custom_ior_cmd, self.subprocess, "mpich")
        else:
            self.job_manager = Mpirun(self.ior_cmd, self.subprocess, "mpich")

        return self.job_manager

    def check_subprocess_status(self, operation="write"):
        """Check the status of the IOR subprocess.

        Args:
            operation (str, optional): "write" or "read". Defaults to "write".
        """
        if operation == "write":
            self.ior_cmd.pattern = self.IOR_WRITE_PATTERN
        elif operation == "read":
            self.ior_cmd.pattern = self.IOR_READ_PATTERN
        else:
            self.fail("Exiting Test: Inappropriate operation type "
                      "for subprocess status check")

        if not self.ior_cmd.check_ior_subprocess_status(
                self.job_manager.process, self.ior_cmd):
            self.fail("Exiting Test: Subprocess not running")

    def run_ior(self,
                manager,
                processes,
                intercept=None,
                display_space=True,
                plugin_path=None,
                fail_on_warning=False,
                pool=None,
                out_queue=None,
                env=None):
        """Run the IOR command.

        Args:
            manager (Mpirun): mpi job manager command object
            processes (int): number of host processes
            intercept (str, optional): path to interception library.
            display_space (bool, optional): Whether to display the pool
                space. Defaults to True.
            plugin_path (str, optional): HDF5 vol connector library path.
                This will enable dfuse (xattr) working directory which is
                needed to run vol connector for DAOS. Default is None.
            fail_on_warning (bool, optional): Controls whether the test
                should fail if a 'WARNING' is found. Default is False.
            pool (TestPool, optional): The pool for which to display space.
                Default is self.pool.
            out_queue (queue, optional): Pass the exception to the queue.
                Defaults to None.
            env (EnvironmentVariables, optional): Environment to be used
             when running ior. Defaults to None
        """
        if not env:
            env = self.ior_cmd.get_default_env(str(manager), self.client_log)
        if intercept:
            env['LD_PRELOAD'] = intercept
            env['D_LOG_MASK'] = 'INFO'
            if env.get('D_IL_REPORT', None) is None:
                env['D_IL_REPORT'] = '1'

            #env['D_LOG_MASK'] = 'INFO,IL=DEBUG'
            #env['DD_MASK'] = 'all'
            #env['DD_SUBSYS'] = 'all'
        if plugin_path:
            env["HDF5_VOL_CONNECTOR"] = "daos"
            env["HDF5_PLUGIN_PATH"] = str(plugin_path)
            manager.working_dir.value = self.dfuse.mount_dir.value
        manager.assign_hosts(self.hostlist_clients, self.workdir,
                             self.hostfile_clients_slots)
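        # Prefer processes-per-node (ppn) when it is set; otherwise use the
        # explicit total process count (np)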
        if self.ppn is None:
            manager.assign_processes(processes)
        else:
            manager.ppn.update(self.ppn, 'mpirun.ppn')
            manager.processes.update(None, 'mpirun.np')

        manager.assign_environment(env)

        if not pool:
            pool = self.pool

        try:
            if display_space:
                self.display_pool_space(pool)
            out = manager.run()

            if self.subprocess:
                return out

            if fail_on_warning:
                report_warning = self.fail
            else:
                report_warning = self.log.warning

            for line in out.stdout_text.splitlines():
                if 'WARNING' in line:
                    report_warning("IOR command issued warnings.")
            return out
        except CommandFailure as error:
            self.log.error("IOR Failed: %s", str(error))
            # The queue is used when a thread calls another thread that runs
            # ior (e.g., thread1 --> thread2 --> ior)
            if out_queue is not None:
                out_queue.put("IOR Failed")
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            if not self.subprocess and display_space:
                self.display_pool_space(pool)

    def stop_ior(self):
        """Stop IOR process.

        Args:
            manager (str): mpi job manager command
        """
        self.log.info("<IOR> Stopping in-progress IOR command: %s",
                      str(self.job_manager))

        try:
            out = self.job_manager.stop()
            return out
        except CommandFailure as error:
            self.log.error("IOR stop Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")
        finally:
            self.display_pool_space()

    def run_ior_threads_il(self, results, intercept, with_clients,
                           without_clients):
        """Execute 2 IOR threads in parallel.

        One thread is run with the interception library (IL) and one without.

        Args:
            results (dict): Dictionary to store the IOR results that gets
                printed in the IOR output.
            intercept (str): Path to the interception library. Shall be used
                only for POSIX through DFUSE.
            with_clients (list): List of clients that use IL.
            without_clients (list): List of clients that don't use IL.
        """
        # We can't use the shared self.ior_cmd, so we need to create the
        # IorCommand object for each thread.
        ior_cmd1 = IorCommand()
        ior_cmd1.get_params(self)
        # Update IOR params with the pool and container params
        ior_cmd1.set_daos_params(self.server_group, self.pool,
                                 self.container.uuid)

        ior_cmd2 = IorCommand()
        ior_cmd2.get_params(self)
        ior_cmd2.set_daos_params(self.server_group, self.pool,
                                 self.container.uuid)

        # start dfuse for POSIX api. This is specific to interception library
        # test requirements.
        self.start_dfuse(self.hostlist_clients, self.pool, self.container)

        # Create two threads and run in parallel.
        thread1 = self.create_ior_thread(ior_cmd1, with_clients, 1, results,
                                         intercept)
        thread2 = self.create_ior_thread(ior_cmd2, without_clients, 2, results,
                                         None)

        thread1.start()
        thread2.start()
        thread1.join()
        thread2.join()

        self.stop_dfuse()

        # Basic verification of the thread results
        status = True
        for key in sorted(results):
            if not results[key].pop(0):
                self.log.error("IOR Thread %d: %s", key, results[key][0])
                status = False
            if len(results[key]) != 2:
                self.log.error(
                    "IOR Thread %d: expecting 2 results; %d found: %s", key,
                    len(results[key]), results[key])
                status = False
        if not status:
            self.fail("At least one IOR thread failed!")

    def create_ior_thread(self,
                          ior_command,
                          clients,
                          job_num,
                          results,
                          intercept=None):
        """Create a new thread for ior run.

        Args:
            ior_command (IorCommand): IOR command instance.
            clients (list): hosts on which to run ior
            job_num (int): Assigned job number
            results (dict): A dictionary object to store the ior metrics
            intercept (path): Path to interception library
        """
        job = threading.Thread(
            target=self.run_custom_ior_cmd,
            args=[ior_command, clients, results, job_num, intercept])
        return job

    def run_custom_ior_cmd(self,
                           ior_command,
                           clients,
                           results,
                           job_num,
                           intercept=None):
        """Run customized IOR command, not self.ior_cmd.

        Expected to be used in threaded code where multiple IOR commands are
        executed in parallel.

        Display pool space before running it for a reference.

        Args:
            ior_command (IorCommand): Custom IOR command instance.
            clients (list): hosts on which to run ior
            results (dict): A dictionary object to store the ior metrics
            job_num (int): Assigned job number
            intercept (str, optional): path to interception library. Defaults to
                None.
        """
        self.log.info("--- IOR Thread %d: Start ---", job_num)
        tsize = ior_command.transfer_size.value
        testfile = os.path.join(self.dfuse.mount_dir.value,
                                "testfile{}{}".format(tsize, job_num))
        if intercept:
            testfile += "intercept"
        ior_command.test_file.update(testfile)

        # Get the custom job manager that's associated with this thread.
        manager = self.get_ior_job_manager_command(custom_ior_cmd=ior_command)

        procs = (self.processes // len(self.hostlist_clients)) * len(clients)
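        # The line above scales the per-host process count to this thread's
        # client subset (e.g., 16 processes across 4 hosts is 4 per host, so
        # 2 clients run 8 processes)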
        env = ior_command.get_default_env(str(manager), self.client_log)
        if intercept:
            env["LD_PRELOAD"] = intercept
        manager.assign_hosts(clients, self.workdir,
                             self.hostfile_clients_slots)
        manager.assign_processes(procs)
        manager.assign_environment(env)

        self.log.info("--- IOR Thread %d: Starting IOR ---", job_num)
        self.display_pool_space()
        try:
            ior_output = manager.run()
            results[job_num] = [True]
            results[job_num].extend(IorCommand.get_ior_metrics(ior_output))
        except CommandFailure as error:
            results[job_num] = [False, "IOR failed: {}".format(error)]
        finally:
            self.display_pool_space()

        self.log.info("--- IOR Thread %d: End ---", job_num)

    def run_ior_multiple_variants(self, obj_class, apis, transfer_block_size,
                                  flags, mount_dir):
        """Run multiple ior commands with various different combination
           of ior input params.

        Args:
            obj_class(list): List of different object classes
            apis(list): list of different apis
            transfer_block_size(list): list of different transfer sizes
                                       and block sizes. eg: [1M, 32M]
                                       1M is transfer size and 32M is
                                       block size in the above example.
            flags(list): list of ior flags
            mount_dir(str): dfuse mount directory
        """
        results = []

        for oclass in obj_class:
            self.ior_cmd.dfs_oclass.update(oclass)
            for api in apis:
                if api == "HDF5-VOL":
                    self.ior_cmd.api.update("HDF5")
                    hdf5_plugin_path = self.params.get("plugin_path",
                                                       '/run/hdf5_vol/*')
                    flags_w_k = " ".join([flags[0]] + ["-k"])
                    self.ior_cmd.flags.update(flags_w_k, "ior.flags")
                else:
                    # run tests for different variants
                    self.ior_cmd.flags.update(flags[0], "ior.flags")
                    hdf5_plugin_path = None
                    self.ior_cmd.api.update(api)
                for test in transfer_block_size:
                    # update transfer and block size
                    self.ior_cmd.transfer_size.update(test[0])
                    self.ior_cmd.block_size.update(test[1])
                    # run ior
                    try:
                        self.run_ior_with_pool(plugin_path=hdf5_plugin_path,
                                               timeout=self.ior_timeout,
                                               mount_dir=mount_dir)
                        results.append(["PASS", str(self.ior_cmd)])
                    except CommandFailure:
                        results.append(["FAIL", str(self.ior_cmd)])
        return results
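
A hedged usage sketch for this method; the parameter values below are illustrative only, not taken from any particular test yaml:

    # Illustrative values only; real tests read these from the test yaml.
    results = self.run_ior_multiple_variants(
        obj_class=["SX", "RP_2GX"],
        apis=["DFS", "POSIX", "HDF5-VOL"],
        transfer_block_size=[["1M", "32M"], ["4K", "16M"]],
        flags=["-w -W -r -R"],
        mount_dir="/tmp/daos_dfuse")
    failed = [cmd for status, cmd in results if status == "FAIL"]
    if failed:
        self.fail("IOR variants failed:\n{}".format("\n".join(failed)))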

    def verify_pool_size(self, original_pool_info, processes):
        """Validate the pool size.

        Args:
            original_pool_info (PoolInfo): Pool info prior to IOR
            processes (int): number of processes
        """
        # Get the current pool size for comparison
        current_pool_info = self.pool.pool.pool_query()

        # If the transfer size is >= 4K, the pool size is verified against
        # the NVMe free space; otherwise it is verified against the SCM
        # free space
        if self.ior_cmd.transfer_size.value >= 4096:
            self.log.info(
                "Transfer size is >= 4K; verifying against NVMe free space")
            storage_index = 1
        else:
            self.log.info(
                "Transfer size is < 4K; verifying against SCM free space")
            storage_index = 0
        actual_pool_size = \
            original_pool_info.pi_space.ps_space.s_free[storage_index] - \
            current_pool_info.pi_space.ps_space.s_free[storage_index]
        expected_pool_size = self.ior_cmd.get_aggregate_total(processes)

        if actual_pool_size < expected_pool_size:
            self.fail(
                "Pool free space did not decrease by the expected amount: "
                "actual={}, expected={}".format(
                    actual_pool_size, expected_pool_size))
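
A sketch of the intended call sequence around this check. The yaml path used to look up the process count is an assumption for illustration:

    # Capture pool info before IOR so the free-space delta can be checked.
    original_pool_info = self.pool.pool.pool_query()
    processes = self.params.get("np", "/run/ior/client_processes/*")  # assumed path
    self.run_ior_with_pool()
    self.verify_pool_size(original_pool_info, processes)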

    def execute_cmd(self, command, fail_on_err=True, display_output=True):
        """Execute cmd using general_utils.pcmd.

        Args:
            command (str): the command to execute on the client hosts
            fail_on_err (bool, optional): whether or not to fail the test if
                command returns a non zero return code. Defaults to True.
            display_output (bool, optional): whether or not to display output.
                Defaults to True.

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        try:
            # Execute the bash command on each client host
            result = self._execute_command(command, fail_on_err,
                                           display_output)

        except CommandFailure as error:
            # Report an error if any command fails
            self.log.error("DfuseSparseFile Test Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")

        return result

    def _execute_command(self,
                         command,
                         fail_on_err=True,
                         display_output=True,
                         hosts=None):
        """Execute the command on all client hosts.

        Optionally verify if the command returns a non zero return code.

        Args:
            command (str): the command to execute on the client hosts
            fail_on_err (bool, optional): whether or not to fail the test if
                command returns a non zero return code. Defaults to True.
            display_output (bool, optional): whether or not to display output.
                Defaults to True.
            hosts (list, optional): hosts on which to run the command.
                Defaults to None, which uses self.hostlist_clients.

        Raises:
            CommandFailure: if 'fail_on_err' is set and the command fails on at
                least one of the client hosts

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        if hosts is None:
            hosts = self.hostlist_clients
        result = pcmd(hosts, command, verbose=display_output, timeout=300)
        if 0 not in result and fail_on_err:
            hosts = [
                str(nodes) for code, nodes in list(result.items()) if code != 0
            ]
            raise CommandFailure(
                "Error running '{}' on the following hosts: {}".format(
                    command, NodeSet(",".join(hosts))))
        return result
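
A short usage sketch of the per-host result handling; the hostnames are illustrative:

    # Run on an explicit host subset without failing the test on errors.
    result = self._execute_command(
        "dd if=/dev/zero of=/tmp/testfile bs=1M count=1 seek=1024",
        fail_on_err=False,
        hosts=["client-1", "client-2"])  # illustrative hostnames
    for return_code, nodes in result.items():
        self.log.info("Return code %d on hosts %s", return_code, nodes)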
Example No. 25
class OSAUtils(IorTestBase):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: This test runs
    daos_server offline drain test cases.

    :avocado: recursive
    """
    def setUp(self):
        """Set up for test case."""
        super(OSAUtils, self).setUp()
        self.container = None
        self.obj = None
        self.ioreq = None
        self.dmg_command = self.get_dmg_command()
        self.no_of_dkeys = self.params.get("no_of_dkeys",
                                           '/run/dkeys/*',
                                           default=[0])[0]
        self.no_of_akeys = self.params.get("no_of_akeys",
                                           '/run/akeys/*',
                                           default=[0])[0]
        self.record_length = self.params.get("length",
                                             '/run/record/*',
                                             default=[0])[0]

    @fail_on(CommandFailure)
    def get_pool_leader(self):
        """Get the pool leader.

        Returns:
            int: pool leader value

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["leader"])

    @fail_on(CommandFailure)
    def get_rebuild_status(self):
        """Get the rebuild status.

        Returns:
            str: rebuild status

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return data["rebuild"]["status"]

    @fail_on(CommandFailure)
    def is_rebuild_done(self, time_interval):
        """Rebuild is completed/done.
        Args:
            time_interval: Wait interval between checks
        Returns:
            False: If rebuild_status not "done" or "completed".
            True: If rebuild status is "done" or "completed".
        """
        status = False
        fail_count = 0
        completion_flag = ["done", "completed"]
        while fail_count <= 20:
            rebuild_status = self.get_rebuild_status()
            time.sleep(time_interval)
            fail_count += 1
            if rebuild_status in completion_flag:
                status = True
                break
        return status
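
A minimal usage sketch:

    # Poll every 5 seconds; fail if rebuild never reaches done/completed.
    if not self.is_rebuild_done(time_interval=5):
        self.fail("Rebuild did not complete within the retry limit")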

    @fail_on(CommandFailure)
    def assert_on_rebuild_failure(self):
        """If the rebuild is not successful,
        raise assert.
        """
        rebuild_status = self.get_rebuild_status()
        self.log.info("Rebuild Status: %s", rebuild_status)
        rebuild_failed_string = ["failed", "scanning", "aborted", "busy"]
        self.assertTrue(rebuild_status not in rebuild_failed_string,
                        "Rebuild failed")

    @fail_on(CommandFailure)
    def get_pool_version(self):
        """Get the pool version.

        Returns:
            int: pool_version_value

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["version"])

    @fail_on(DaosApiError)
    def write_single_object(self):
        """Write some data to the existing pool."""
        self.pool.connect(2)
        csum = self.params.get("enable_checksum", '/run/container/*')
        self.container = DaosContainer(self.context)
        input_param = self.container.cont_input_values
        input_param.enable_chksum = csum
        self.container.create(poh=self.pool.pool.handle, con_prop=input_param)
        self.container.open()
        self.obj = DaosObj(self.context, self.container)
        self.obj.create(objcls=1)
        self.obj.open()
        self.ioreq = IORequest(self.context,
                               self.container,
                               self.obj,
                               objtype=4)
        self.log.info("Writing the Single Dataset")
        for dkey in range(self.no_of_dkeys):
            for akey in range(self.no_of_akeys):
                indata = ("{0}".format(str(akey)[0]) * self.record_length)
                d_key_value = "dkey {0}".format(dkey)
                c_dkey = ctypes.create_string_buffer(d_key_value)
                a_key_value = "akey {0}".format(akey)
                c_akey = ctypes.create_string_buffer(a_key_value)
                c_value = ctypes.create_string_buffer(indata)
                c_size = ctypes.c_size_t(ctypes.sizeof(c_value))
                self.ioreq.single_insert(c_dkey, c_akey, c_value, c_size)
        self.obj.close()
        self.container.close()

    @fail_on(DaosApiError)
    def verify_single_object(self):
        """Verify the container data on the existing pool."""
        self.pool.connect(2)
        self.container.open()
        self.obj.open()
        self.log.info("Single Dataset Verification -- Started")
        for dkey in range(self.no_of_dkeys):
            for akey in range(self.no_of_akeys):
                indata = ("{0}".format(str(akey)[0]) * self.record_length)
                c_dkey = ctypes.create_string_buffer("dkey {0}".format(dkey))
                c_akey = ctypes.create_string_buffer("akey {0}".format(akey))
                val = self.ioreq.single_fetch(c_dkey, c_akey, len(indata) + 1)
                if indata != (repr(val.value)[1:-1]):
                    self.d_log.error(
                        "ERROR: Data mismatch for dkey = {0}, "
                        "akey = {1}".format(dkey, akey))
                    self.fail(
                        "ERROR: Data mismatch for dkey = {0}, "
                        "akey = {1}".format(dkey, akey))
        self.obj.close()
        self.container.close()
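
A sketch of the intended write/verify pairing around an OSA operation. The dmg pool_drain helper and its arguments are an assumption for illustration; the rest uses methods defined in this class:

    # Write data, trigger an OSA operation, wait for rebuild, then verify.
    self.write_single_object()
    self.dmg_command.pool_drain(self.pool.uuid, rank=1)  # assumed dmg helper
    self.assert_on_rebuild_failure()
    if not self.is_rebuild_done(time_interval=5):
        self.fail("Rebuild did not complete")
    self.verify_single_object()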

    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Start threads and wait until all threads are finished.

        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR api
            test (list): IOR test sequence
            flags (str): IOR flags
            results (queue): queue for returning thread results

        """
        container_info = {}
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test : Mpich not installed on :"
                      " {}".format(self.hostfile_clients[0]))
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        container_info["{}{}{}".format(oclass, api,
                                       test[2])] = str(uuid.uuid4())

        # Define the job manager for the IOR command
        self.job_manager = Mpirun(ior_cmd, mpitype="mpich")
        key = "".join([oclass, api, str(test[2])])
        self.job_manager.job.dfs_cont.update(container_info[key])
        env = ior_cmd.get_default_env(str(self.job_manager))
        self.job_manager.assign_hosts(self.hostlist_clients, self.workdir,
                                      None)
        self.job_manager.assign_processes(self.processes)
        self.job_manager.assign_environment(env, True)

        # run IOR Command
        try:
            self.job_manager.run()
        except CommandFailure as _error:
            results.put("FAIL")
Example No. 26
    def ior_runner_thread(self, results):
        """Start threads and wait until all threads are finished.

        Destroy the container at the end of this thread run.

        Args:
            results (queue): queue for returning thread results

        Returns:
            None

        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        container_info = {}
        cmd = DaosCommand(os.path.join(self.prefix, "bin"))
        cmd.set_sub_command("container")
        cmd.sub_command_class.set_sub_command("destroy")
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            self.fail("Exiting Test: Mpich not installed")

        # Iterate through IOR different value and run in sequence
        for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                self.ior_apis,
                                                self.ior_transfer_size,
                                                self.ior_flags):
            # Define the arguments for the ior_runner_thread method
            ior_cmd = IorCommand()
            ior_cmd.get_params(self)
            ior_cmd.set_daos_params(self.server_group, self.pool)
            ior_cmd.dfs_oclass.update(oclass)
            ior_cmd.api.update(api)
            ior_cmd.transfer_size.update(test[0])
            ior_cmd.block_size.update(test[1])
            ior_cmd.flags.update(flags)

            container_info["{}{}{}"
                           .format(oclass,
                                   api,
                                   test[0])] = str(uuid.uuid4())

            # Define the job manager for the IOR command
            manager = Mpirun(ior_cmd, mpitype="mpich")
            manager.job.dfs_cont.update(container_info
                                         ["{}{}{}".format(oclass,
                                                          api,
                                                          test[0])])
            env = ior_cmd.get_default_env(str(manager))
            manager.assign_hosts(self.hostlist_clients, self.workdir, None)
            manager.assign_processes(processes)
            manager.assign_environment(env, True)

            # run IOR Command
            try:
                manager.run()
            except CommandFailure as _error:
                results.put("FAIL")

        # Destroy the container created by thread
        for key in container_info:
            cmd.sub_command_class.sub_command_class.pool.value = self.pool.uuid
            cmd.sub_command_class.sub_command_class.svc.value = \
                self.pool.svc_ranks
            cmd.sub_command_class.sub_command_class.cont.value = \
                container_info[key]

            try:
                # Run the container destroy command; _get_result() is the
                # framework's protected execution helper
                # pylint: disable=protected-access
                cmd._get_result()
            except CommandFailure as _error:
                results.put("FAIL")