示例#1
0
文件: macsio_util.py 项目: wli5/daos
 def set_output_file_path(self):
     """Update the macsio output file parameters.

     The current value of the log file and timings file parameters are each
     passed to get_log_file() and the returned path is stored back into the
     same parameter.
     """
     log_param = self.log_file_name
     log_param.update(
         get_log_file(log_param.value), "macsio.log_file_name")

     timings_param = self.timings_file_name
     timings_param.update(
         get_log_file(timings_param.value), "macsio.timings_file_name")
示例#2
0
    def prepare(self, storage=True):
        """Get the hosts ready for starting daos_server.

        Creates the server yaml file, copies the certificates, configures dmg,
        kills any running servers, cleans up files and optionally prepares the
        storage.

        Args:
            storage (bool, optional): whether or not to prepare dcpm/nvme
                storage. Defaults to True.
        """
        self.log.info(
            "<SERVER> Preparing to start daos_server on %s with %s",
            self._hosts, self.manager.command)
        server_job = self.manager.job

        # Generate the daos_server yaml file
        server_job.create_yaml_file()

        # Copy the certificates for the servers and for dmg on this host
        server_job.copy_certificates(get_log_file("daosCA/certs"), self._hosts)
        short_hostname = socket.gethostname().split('.', 1)[0]
        self.dmg.copy_certificates(
            get_log_file("daosCA/certs"), short_hostname.split())

        # dmg runs the storage format against all of the server hosts
        self.dmg.hostlist = self._hosts
        if not self.dmg.yaml:
            # With a dmg config file the transport security has already been
            # configured, so only set it here when there is no such file.
            self.dmg.insecure.update(
                self.get_config_value("allow_insecure"), "dmg.insecure")

        # Stop any daos servers still running on the hosts
        self.kill()

        # Remove any files left behind on the hosts
        self.clean_files()

        # The log files need to exist before their ownership is changed
        if server_job.using_nvme:
            touch_commands = []
            for params in server_job.yaml.server_params:
                log_path = params.log_file.value
                if log_path is None:
                    continue
                self.log.info("Creating log file: %s", log_path)
                touch_commands.append("touch {}".format(log_path))
            if touch_commands:
                pcmd(self._hosts, "; ".join(touch_commands), False)

        # Nothing else to do unless storage needs to be prepared
        if not storage:
            return
        if not (server_job.using_nvme or server_job.using_dcpm):
            return

        self.log.info("Preparing storage in <format> mode")
        self.prepare_storage("root")
        if hasattr(self.manager, "mca"):
            self.manager.mca.update(
                {"plm_rsh_args": "-l root"}, "orterun.mca", True)
示例#3
0
    def setUp(self):
        """Set up the soak test before each test is run.

        Raises:
            SoakTestError: if no client partition is defined

        """
        self.log.info("<<setUp Started>> at %s", time.ctime())
        super(SoakTestBase, self).setUp()
        self.username = getuser()
        # Every test starts with its first loop
        self.loop = 1
        self.exclude_slurm_nodes = []
        # Directories for the soak log files;
        # self.outputdir is an avocado directory .../data/
        self.log_dir = get_log_file("soak")
        self.outputsoakdir = self.outputdir + "/soak"
        # The slurm partition daos_client must be defined
        if not self.client_partition:
            raise SoakTestError(
                "<<FAILED: Partition is not correctly setup for daos "
                "slurm partition>>")
        # When only one partition is specified the server nodes also appear
        # in the client list; move them to the excluded nodes instead
        for server in self.hostlist_servers:
            if server not in self.hostlist_clients:
                continue
            self.hostlist_clients.remove(server)
            self.exclude_slurm_nodes.append(server)

        # The test node is included for log cleanup but is not a client
        local_hosts = include_local_host(None)
        self.exclude_slurm_nodes.extend(local_hosts)
        test_node = local_hosts[0]
        if test_node in self.hostlist_clients:
            self.hostlist_clients.remove(test_node)
        self.log.info(
            "<<Updated hostlist_clients %s >>", self.hostlist_clients)
        if not self.hostlist_clients:
            self.fail("There are no nodes that are client only;"
                      "check if the partition also contains server nodes")
示例#4
0
    def __init__(self, path, host, dmg=None):
        """Create a daos_racer command object.

        Args:
            path (str): path of the daos_racer command
            host (str): host on which to run the daos_racer command
            dmg (DmgCommand, optional): a DmgCommand object used to obtain the
                configuration file and certificate. Defaults to None.
        """
        super(DaosRacerCommand, self).__init__(
            "/run/daos_racer/*", "daos_racer", path)
        self.host = host

        # Number of seconds to run
        self.runtime = FormattedParameter("-t {}", 60)

        # The dmg_config parameter is only defined when a dmg object is
        # provided.  In that case copy_certificates() is also called for the
        # host that will run the daos_racer command.
        if dmg:
            self.dmg_config = FormattedParameter("-n {}", dmg.yaml.filename)
            dmg.copy_certificates(get_log_file("daosCA/certs"), [self.host])

        # Optional timeout for the clush command running the daos_racer command.
        # This should be set greater than the 'runtime' value but less than the
        # avocado test timeout value to allow for proper cleanup.  Using a value
        # of None will result in no timeout being used.
        self.clush_timeout = BasicParameter(None)

        # Environment variable names required to be set when running the
        # daos_racer command.  The values for these names are populated by the
        # get_environment() method and added to command line by the
        # set_environment() method.
        self._env_names = ["D_LOG_FILE"]
示例#5
0
    def get_environment(self, manager, log_file=None):
        """Build the environment variables to export for the command.

        A value is collected for each name listed in self._env_names.  The
        D_LOG_FILE value comes from get_log_file(); every other value is
        requested from the manager.

        Args:
            manager (DaosServerManager): the job manager used to start
                daos_server from which the server config values can be obtained
                to set the required environment variables.
            log_file (str, optional): when specified overrides the default
                D_LOG_FILE value. Defaults to None.

        Returns:
            EnvironmentVariables: a dictionary of environment variable names and
                values to export.

        """
        env = EnvironmentVariables()
        for name in self._env_names:
            if name != "D_LOG_FILE":
                env[name] = manager.get_environment_value(name)
                continue

            # Fall back to a log file name based upon the command name
            if not log_file:
                log_file = "{}_daos.log".format(self.command)
            env[name] = get_log_file(log_file)

        return env
示例#6
0
    def setUp(self):
        """Configure and start the server and dump the attach info file."""
        super(CartSelfTest, self).setUp()

        # Add and configure a manager for the daos server
        self.add_server_manager(
            self.get_config_file(self.server_group, "server"))
        self.configure_manager(
            "server", self.server_managers[-1], self.hostlist_servers,
            self.hostfile_servers_slots, self.hostlist_servers)

        # Additional environment variables for the server orterun command
        share_addr = self.params.get(
            "val", "/run/muxtestparams/share_addr/*")[0]
        self.cart_env["CRT_CTX_SHARE_ADDR"] = str(share_addr)
        self.cart_env["CRT_CTX_NUM"] = "8"
        first_server = self.server_managers[0]
        self.cart_env["CRT_PHY_ADDR_STR"] = first_server.get_config_value(
            "provider")
        self.cart_env["OFI_INTERFACE"] = first_server.get_config_value(
            "fabric_iface")
        first_server.manager.assign_environment(self.cart_env, True)

        # Start the daos server
        self.start_server_managers()

        # Use daos_agent dump-attachinfo to generate the uri file
        self.uri_file = get_log_file(
            "{}.attach_info_tmp".format(self.server_group))
        self.agent_managers[0].manager.job.dump_attachinfo(self.uri_file)
示例#7
0
 def start(self):
     """Copy the certificates and start the agent via the job manager."""
     self.log.info(
         "<AGENT> Starting daos_agent on %s with %s",
         self._hosts, self.manager.command)

     # Copy the certificates to the agent hosts
     agent_job = self.manager.job
     agent_job.copy_certificates(get_log_file("daosCA/certs"), self._hosts)

     super(DaosAgentManager, self).start()
示例#8
0
 def setUp(self):
     """Set up the soak test before each test is run.

     Raises:
         SoakTestError: if no client partition is defined

     """
     self.log.info("<<setUp Started>> at %s", time.ctime())
     super().setUp()
     # Report the version of the rpms used by this test in the log
     try:
         run_command("sudo dnf list daos-client", timeout=30)
     except DaosTestError as error:
         self.log.info("No daos rpm package info available %s", error)
     self.username = getuser()
     # Every test starts with its first loop
     self.loop = 1
     self.exclude_slurm_nodes = []
     # Directories for the soak log files;
     # self.outputdir is an avocado directory .../data/
     self.log_dir = get_log_file("soak")
     self.outputsoakdir = self.outputdir + "/soak"
     # Per-pass log directories (remote, local and shared)
     pass_suffix = "/pass" + str(self.loop)
     self.test_log_dir = self.log_dir + pass_suffix
     self.local_pass_dir = self.outputsoakdir + pass_suffix
     self.sharedlog_dir = self.tmp + "/soak"
     self.sharedsoakdir = self.sharedlog_dir + pass_suffix
     # Initialize dmg cmd
     self.dmg_command = self.get_dmg_command()
     # A slurm partition must be defined.
     # NOTE: The slurm reservation and partition exist before soak runs.
     # CI uses partition=daos_client and no reservation.
     # A21 uses partition=normal/default and reservation=daos-test.
     # The partition and reservation names come from the yaml file.
     # Without a reservation (CI only) all of the nodes in the partition
     # are assumed to be used for soak.
     if not self.client_partition:
         raise SoakTestError(
             "<<FAILED: Partition is not correctly setup for daos "
             "slurm partition>>")
     self.srun_params = {"partition": self.client_partition}
     if self.client_reservation:
         self.srun_params["reservation"] = self.client_reservation
     # When only one partition is specified the server nodes also appear
     # in the client list; move them to the excluded nodes instead
     for server in self.hostlist_servers:
         if server not in self.hostlist_clients:
             continue
         self.hostlist_clients.remove(server)
         self.exclude_slurm_nodes.append(server)
     # The test node is included for log cleanup but is not a client
     local_hosts = include_local_host(None)
     self.exclude_slurm_nodes.extend(local_hosts)
     test_node = local_hosts[0]
     if test_node in self.hostlist_clients:
         self.hostlist_clients.remove(test_node)
     if not self.hostlist_clients:
         self.fail(
             "There are no valid nodes in this partition to run "
             "soak. Check partition {} for valid nodes".format(
                 self.client_partition))
示例#9
0
文件: agent_utils.py 项目: liw/daos
    def start(self):
        """Copy the certificates and start the agent via the job manager."""
        self.log.info(
            "<AGENT> Starting daos_agent on %s with %s",
            self._hosts, self.manager.command)

        # Copy the certificates to the agent hosts
        agent_job = self.manager.job
        agent_job.copy_certificates(get_log_file("daosCA/certs"), self._hosts)

        # With a non-systemctl manager the socket directory needs to exist
        current_user = getuser()
        self.verify_socket_directory(current_user)

        super().start()
示例#10
0
    def run_subtest(self):
        """Run daos_test with a subtest argument.

        The orterun command line is assembled from the test parameters and
        run with the cmocka and pool size environment variables.  A non-zero
        exit status produces a results xml file and fails the test.
        """
        subtest = self.params.get("daos_test", self.TEST_PATH)
        client_count = self.params.get(
            "num_clients", '/run/daos_tests/num_clients/*')
        replica_count = self.params.get(
            "num_replicas", '/run/daos_tests/num_replicas/*')
        scm_size = self.params.get("scm_size", '/run/pool/*')
        nvme_size = self.params.get("nvme_size", '/run/pool/*')
        extra_args = self.params.get("args", self.TEST_PATH, "")
        dmg_yaml = self.get_dmg_command().yaml.filename

        # Assemble the command one group of arguments at a time
        words = [self.orterun, self.client_mca]
        words.extend(("-n", str(client_count)))
        words.extend(
            ("-x", "=".join(("D_LOG_FILE", get_log_file(self.client_log)))))
        words.extend(("-x", "D_LOG_MASK=DEBUG"))
        words.extend(("-x", "DD_MASK=mgmt,io,md,epc,rebuild"))
        words.append(self.daos_test)
        words.extend(("-s", str(replica_count)))
        words.extend(("-n", dmg_yaml))
        words.append("".join(("-", subtest)))
        words.append(str(extra_args))
        cmd = " ".join(words)

        # An undefined nvme size is passed on as 0
        env = {
            'CMOCKA_XML_FILE': os.path.join(self.outputdir, "%g_results.xml"),
            'CMOCKA_MESSAGE_OUTPUT': "xml",
            'POOL_SCM_SIZE': "{}".format(scm_size),
            'POOL_NVME_SIZE': "{}".format(nvme_size if nvme_size else 0),
        }

        load_mpi("openmpi")
        try:
            process.run(cmd, env=env)
        except process.CmdError as error:
            exit_status = error.result.exit_status
            if exit_status != 0:
                # fake a JUnit failure output
                self.create_results_xml(self.subtest_name, error)
                self.fail(
                    "{0} failed with return code={1}.\n".format(
                        cmd, exit_status))
示例#11
0
    def test_nvme_io_stats(self):
        """Jira ID: DAOS-4722.

        Test Description:
            Purpose of this test is to run IO test and check when NVME_IO_STATS
            enabled in config, it generates the different statistics.

        Use case:
            Run ior and it will print the NVMe IO stats to control plane log
            file.

        :avocado: tags=all,hw,medium,nvme,ib2,nvme_io_stats,full_regression
        """
        # run ior
        self.run_ior_with_pool()

        # Get the NVMe IO statistics from server control_log file.
        cmd = 'cat {}'.format(get_log_file(self.control_log))
        results = run_pcmd(self.hostlist_servers, cmd)
        for result in results:
            # Any exit status other than 0 means the log was not read
            if result["exit_status"] != 0:
                self.fail("Failed to run cmd {} on {}".format(
                    cmd, result["hosts"]))

            # Collect the log lines reported for each of the 8 targets.  Fail
            # with a clear message instead of an IndexError further down when
            # a target has no statistics in the log.
            target_stats = []
            for _tgt in range(8):
                tag = "tgt[{}]".format(_tgt)
                tgt_lines = [
                    line for line in result["stdout"] if tag in line]
                if not tgt_lines:
                    self.fail(
                        "No NVMe IO stats found for target {} on {}".format(
                            _tgt, result["hosts"]))
                target_stats.append(tgt_lines)

            # Verify statistics are increasing for IO by comparing the first
            # number of the matching field in the first and last log lines
            for stats in NVME_STATS:
                for _tgt, tgt_lines in enumerate(target_stats):
                    first_stats = re.findall(r'\d+', [
                        x for x in tgt_lines[0].split()
                        if re.search(stats, x)
                    ][0])[0]
                    last_stats = re.findall(r'\d+', [
                        x for x in tgt_lines[-1].split()
                        if re.search(stats, x)
                    ][0])[0]
                    # Last statistic should be higher than initial statistics
                    if int(first_stats) >= int(last_stats):
                        self.fail(
                            "Failed: Stats {} for target {} did not increase "
                            "First_stat={} >= Last_stat={}".format(
                                stats, _tgt, first_stats, last_stats))
示例#12
0
    def test_nvme_io_stats(self):
        """Jira ID: DAOS-4722.

        Test Description:
            Purpose of this test is to run IO test and check when NVME_IO_STATS
            enabled in config, it generates the different statistics.

        Use case:
            Run ior and it will print the NVMe IO stats to control plane log
            file.

        :avocado: tags=all,hw,medium,nvme,ib2,nvme_io_stats,full_regression
        """
        # run ior
        self.run_ior_with_pool()

        # Get the NVMe IO statistics from server control_log file.
        cmd = 'cat {}'.format(get_log_file(self.control_log))
        task = run_task(self.hostlist_servers, cmd)
        for _rc_code, _node in task.iter_retcodes():
            # Any return code other than 0 means the log was not read
            if _rc_code != 0:
                self.fail("Failed to run cmd {} on {}".format(cmd, _node))

        # Verify the statistics in every output buffer returned by the task,
        # not only in the last one iterated.
        output_checked = False
        for buf, _nodes in task.iter_buffers():
            output_checked = True
            output_list = str(buf).split('\n')

            # Collect the log lines reported for each of the 8 targets.  Fail
            # with a clear message instead of an IndexError further down when
            # a target has no statistics in the log.
            target_stats = []
            for _tgt in range(8):
                tag = "tgt[{}]".format(_tgt)
                tgt_lines = [s for s in output_list if tag in s]
                if not tgt_lines:
                    self.fail(
                        "No NVMe IO stats found for target {} on {}".format(
                            _tgt, _nodes))
                target_stats.append(tgt_lines)

            # Verify statistics are increasing for IO by comparing the first
            # number of the matching field in the first and last log lines
            for stats in NVME_STATS:
                for _tgt, tgt_lines in enumerate(target_stats):
                    first_stats = re.findall(r'\d+', [
                        x for x in tgt_lines[0].split()
                        if re.search(stats, x)
                    ][0])[0]
                    last_stats = re.findall(r'\d+', [
                        x for x in tgt_lines[-1].split()
                        if re.search(stats, x)
                    ][0])[0]
                    # Last statistic should be higher than initial statistics
                    if int(first_stats) >= int(last_stats):
                        self.fail(
                            'Failed: Stats {} for target {} did not increase'
                            ' First_stat={} >= Last_stat={}'.format(
                                stats, _tgt, first_stats, last_stats))

        # Without any output there is nothing that could have been verified
        if not output_checked:
            self.fail("No output returned by cmd {} on {}".format(
                cmd, self.hostlist_servers))
示例#13
0
    def run_subtest(self):
        """Run daos_test with a subtest argument."""
        subtest = self.params.get("daos_test", '/run/daos_tests/Tests/*')
        num_clients = self.params.get("num_clients",
                                      '/run/daos_tests/num_clients/*')
        num_replicas = self.params.get("num_replicas",
                                       '/run/daos_tests/num_replicas/*')
        scm_size = self.params.get("scm_size", '/run/pool/*')
        args = self.params.get("args", '/run/daos_tests/Tests/*', "")

        cmd = "{} {} -n {} -x D_LOG_FILE={} \
            -x D_LOG_MASK=DEBUG -x DD_MASK=mgmt,io,md,epc,rebuild \
            {} -s {} -n {} {}".format(self.orterun,
                                      self.client_mca, num_clients,
                                      get_log_file(self.client_log),
                                      self.daos_test, num_replicas, subtest,
                                      args)

        env = {}
        env['CMOCKA_XML_FILE'] = "%g_results.xml"
        env['CMOCKA_MESSAGE_OUTPUT'] = "xml"
        env['POOL_SCM_SIZE'] = "{}".format(scm_size)

        load_mpi("openmpi")
        try:
            process.run(cmd, env=env)
        except process.CmdError as result:
            if result.result.exit_status is not 0:
                # fake a JUnit failure output
                with open(self.subtest_name + "_results.xml",
                          "w") as results_xml:
                    results_xml.write('''<?xml version="1.0" encoding="UTF-8"?>
<testsuite name="{0}" errors="1" failures="0" skipped="0" tests="1" time="0.0">
  <testcase name="ALL" time="0.0" >
    <error message="Test failed to start up"/>
    <system-out>
<![CDATA[{1}]]>
    </system-out>
    <system-err>
<![CDATA[{2}]]>
    </system-err>
  </testcase>
</testsuite>'''.format(self.subtest_name, result.result.stdout,
                       result.result.stderr))
                self.fail("{0} failed with return code={1}.\n".format(
                    cmd, result.result.exit_status))
示例#14
0
    def prepare(self, storage=True):
        """Get the hosts ready for starting daos_server.

        Creates the server yaml file, copies the certificates, configures dmg,
        kills any running servers, cleans up files, optionally prepares the
        storage and verifies the socket directory.

        Args:
            storage (bool, optional): whether or not to prepare dcpm/nvme
                storage. Defaults to True.
        """
        self.log.info(
            "<SERVER> Preparing to start daos_server on %s with %s",
            self._hosts, self.manager.command)
        server_job = self.manager.job

        # Generate the daos_server yaml file
        server_job.temporary_file_hosts = self._hosts
        server_job.create_yaml_file()

        # Copy the certificates for the servers and for dmg
        server_job.copy_certificates(get_log_file("daosCA/certs"), self._hosts)
        self._prepare_dmg_certificates()

        # dmg runs the storage format against all of the server hosts
        self._prepare_dmg_hostlist(self._hosts)
        if not self.dmg.yaml:
            # With a dmg config file the transport security has already been
            # configured, so only set it here when there is no such file.
            self.dmg.insecure.update(
                self.get_config_value("allow_insecure"), "dmg.insecure")

        # Stop any daos servers still running on the hosts
        self.kill()

        # Remove any files left behind on the hosts
        self.clean_files()

        # Prepare the server storage when requested and in use
        if storage and (server_job.using_nvme or server_job.using_dcpm):
            self.log.info("Preparing storage in <format> mode")
            self.prepare_storage("root")
            if hasattr(self.manager, "mca"):
                self.manager.mca.update(
                    {"plm_rsh_args": "-l root"}, "orterun.mca", True)

        # With a non-systemctl manager the socket directory needs to exist
        current_user = getuser()
        self.verify_socket_directory(current_user)
示例#15
0
 def test_csum_error_logging(self):
     """
     Test ID: DAOS-3927
     Test Description: Read the checksum error value of the NVMe device
                       before and after running the daos_test subtest and
                       verify that the value has increased.
     :avocado: tags=all,daily_regression,hw,medium,ib2,csum_error_log,faults
     """
     device_id = self.get_nvme_device_id()
     self.log.info("%s", device_id)

     # Checksum error value before the subtest is run
     errors_before = self.get_checksum_error_value(device_id)
     self.dmg.copy_certificates(
         get_log_file("daosCA/certs"), self.hostlist_clients)
     self.log.info("Checksum Errors : %d", errors_before)

     DaosCoreBase.run_subtest(self)

     # Checksum error value after the subtest has run
     errors_after = self.get_checksum_error_value(device_id)
     self.log.info("Checksum Errors : %d", errors_after)
     self.assertTrue(
         errors_after > errors_before, "Checksum Error Log not incremented")
     self.log.info("Checksum Error Logging Test Passed")
示例#16
0
    def verify_client_run(self, exp_iface, env):
        """Verify the interface assigned by running a libdaos client.

        The port counters of the device mapped to the expected interface are
        read before and after running daos_racer and compared.

        Args:
            exp_iface (str): expected interface to check.
            env (bool): add OFI_INTERFACE variable to exported variables of
                client command.

        Returns:
            bool: True if the port count increased; False otherwise

        """
        hfi_device = {"ib0": "hfi1_0", "ib1": "hfi1_1"}[exp_iface]

        # Counter values of the hfi device before running the client
        counts_before = self.get_port_cnt(
            self.hostlist_clients, hfi_device, "port_rcv_data")

        # daos_racer needs the dmg config file
        dmg_cmd = self.get_dmg_command()

        # daos_racer is the client that gets run
        racer = DaosRacerCommand(self.bin, self.hostlist_clients[0], dmg_cmd)
        racer.get_params(self)

        # OFI_INTERFACE is only added to the env names when requested
        if env:
            racer.update_env_names(["OFI_INTERFACE"])

        # Name of the log file used for this run
        log_name = "daos_racer_{}_{}.log".format(exp_iface, env)

        # FI_LOG_LEVEL is added to get more info on device issues
        racer_env = racer.get_environment(self.server_managers[0], log_name)
        racer_env["FI_LOG_LEVEL"] = "info"
        racer.set_environment(racer_env)

        # Run the client
        racer.run()

        # Counter values of the hfi device after running the client
        counts_after = self.get_port_cnt(
            self.hostlist_clients, hfi_device, "port_rcv_data")

        # NOTE(review): only the difference of the last pair of counters is
        # kept for the check below - confirm this is intended.
        diff = 0
        pairs = zip(counts_before.values(), counts_after.values())
        for before, after in pairs:
            diff = int(after) - int(before)
            self.log.info("Port [%s] count difference: %s", exp_iface, diff)

        # Check the log for the device used to prevent false positives
        self.assertTrue(
            self.get_log_info(
                self.hostlist_clients, exp_iface, env,
                get_log_file(log_name)))

        # Data must have been seen going through the device
        if diff > 0:
            return True
        self.log.info("No traffic seen through device: %s", exp_iface)
        return False
示例#17
0
    def run_subtest(self):
        """Run daos_test with a subtest argument.

        The orterun command line is assembled from the test parameters and
        run with the cmocka and pool size environment variables.  A non-zero
        exit status produces a results xml file and fails the test.
        """
        subtest = self.get_test_param("daos_test")
        client_count = self.get_test_param("num_clients")
        if client_count is None:
            client_count = self.params.get("num_clients", '/run/daos_tests/*')
        scm_size = self.params.get("scm_size", '/run/pool/*')
        nvme_size = self.params.get("nvme_size", '/run/pool/*')
        extra_args = self.get_test_param("args", "")
        ranks_to_stop = self.get_test_param("stopped_ranks", [])
        dmg_cmd = self.get_dmg_command()
        dmg_yaml = dmg_cmd.yaml.filename
        if self.hostlist_clients:
            dmg_cmd.copy_certificates(
                get_log_file("daosCA/certs"), self.hostlist_clients)
            dmg_cmd.copy_configuration(self.hostlist_clients)
        self.client_mca += " --mca btl_tcp_if_include eth0"

        # Assemble the command one group of arguments at a time
        words = [self.orterun, self.client_mca]
        words.extend(("-n", str(client_count)))
        words.extend(("--hostfile", self.hostfile_clients))
        words.extend(
            ("-x", "=".join(("D_LOG_FILE", get_log_file(self.client_log)))))
        words.append("--map-by node")
        words.extend(("-x", "D_LOG_MASK=DEBUG"))
        words.extend(("-x", "DD_MASK=mgmt,io,md,epc,rebuild"))
        words.append(self.daos_test)
        words.extend(("-n", dmg_yaml))
        words.append("".join(("-", subtest)))
        words.append(str(extra_args))
        cmd = " ".join(words)

        # An undefined nvme size is passed on as 0
        env = {
            'CMOCKA_XML_FILE': os.path.join(
                self.outputdir, "%g_cmocka_results.xml"),
            'CMOCKA_MESSAGE_OUTPUT': "xml",
            'POOL_SCM_SIZE': "{}".format(scm_size),
            'POOL_NVME_SIZE': "{}".format(nvme_size if nvme_size else 0),
        }

        if not load_mpi("openmpi"):
            self.fail("Failed to load openmpi")

        # The expected state of each rank stopped by this test is updated to
        # avoid a false failure during tearDown().
        if "random" in ranks_to_stop:
            # Any rank may end up either stopped or running
            for server in self.server_managers:
                server.update_expected_states(
                    None, ["Joined", "Stopped", "Evicted"])
        else:
            # Only the listed ranks are expected to be stopped
            for rank in ranks_to_stop:
                for server in self.server_managers:
                    server.update_expected_states(
                        rank, ["Stopped", "Evicted"])

        try:
            process.run(cmd, env=env)
        except process.CmdError as error:
            exit_status = error.result.exit_status
            if exit_status != 0:
                # fake a JUnit failure output
                self.create_results_xml(self.subtest_name, error)
                self.fail("{0} failed with return code={1}.\n".format(
                    cmd, exit_status))
示例#18
0
    def run_subtest(self):
        """Run daos_test with a subtest argument.

        The daos_test command is wrapped in an Orterun job manager and run
        with the cmocka and pool size environment variables.  A non-zero exit
        status produces a results xml file and fails the test.
        """
        subtest = self.get_test_param("daos_test")
        client_count = self.get_test_param("num_clients")
        if client_count is None:
            client_count = self.params.get("num_clients", '/run/daos_tests/*')

        scm_size = self.params.get("scm_size", '/run/pool/*')
        nvme_size = self.params.get("nvme_size", '/run/pool/*')
        extra_args = self.get_test_param("args", "")
        ranks_to_stop = self.get_test_param("stopped_ranks", [])
        pool_count = self.get_test_param("pools_created", 1)
        self.increment_timeout(POOL_TIMEOUT_INCREMENT * pool_count)
        dmg_cmd = self.get_dmg_command()
        dmg_yaml = dmg_cmd.yaml.filename
        if self.hostlist_clients:
            dmg_cmd.copy_certificates(
                get_log_file("daosCA/certs"), self.hostlist_clients)
            dmg_cmd.copy_configuration(self.hostlist_clients)

        # Assemble the command one group of arguments at a time
        words = [
            "-x", "=".join(("D_LOG_FILE", get_log_file(self.client_log)))]
        words.append("--map-by node")
        words.extend(("-x", "D_LOG_MASK=DEBUG"))
        words.extend(("-x", "DD_MASK=mgmt,io,md,epc,rebuild"))
        words.extend(("-x", "COVFILE=/tmp/test.cov"))
        words.append(self.daos_test)
        words.extend(("-n", dmg_yaml))
        words.append("".join(("-", subtest)))
        words.append(str(extra_args))
        cmd = " ".join(words)

        # Wrap the command in a job manager and assign the test to run
        job = get_job_manager(
            self, "Orterun", ExecutableCommand(namespace=None, command=cmd),
            mpi_type="openmpi")
        job.hostfile.update(self.hostfile_clients)
        job.processes.update(client_count)
        job_str = str(job)

        # An undefined nvme size is passed on as 0
        env = {
            'CMOCKA_XML_FILE': os.path.join(
                self.outputdir, "%g_cmocka_results.xml"),
            'CMOCKA_MESSAGE_OUTPUT': "xml",
            'POOL_SCM_SIZE': "{}".format(scm_size),
            'POOL_NVME_SIZE': "{}".format(nvme_size if nvme_size else 0),
        }

        # The expected state of each rank stopped by this test is updated to
        # avoid a false failure during tearDown().
        if "random" in ranks_to_stop:
            # Any rank may end up either stopped or running
            for server in self.server_managers:
                server.update_expected_states(
                    None, ["Joined", "Stopped", "Excluded"])
        else:
            # Only the listed ranks are expected to be stopped
            for rank in ranks_to_stop:
                for server in self.server_managers:
                    server.update_expected_states(
                        rank, ["Stopped", "Excluded"])

        try:
            process.run(job_str, env=env)
        except process.CmdError as error:
            exit_status = error.result.exit_status
            if exit_status != 0:
                # fake a JUnit failure output
                message = "Failed to run {}.".format(self.daos_test)
                self.create_results_xml(self.subtest_name, error, message)
                self.fail(
                    "{0} failed with return code={1}.\n".format(
                        job_str, exit_status))
示例#19
0
    def run_soak(self, test_param):
        """Run the soak test specified by the test params.

        A reserved pool and container are created and populated first, the
        soak log directories are removed, and then the job list is executed in
        passes until the test timeout expires (or after a single pass when the
        test name contains "smoke").  The reserved container is read back at
        the end; a failed read is recorded in self.soak_errors.

        The test is failed (via self.fail / self.assertEqual) when
        execute_jobs raises SoakTestError or when destroying containers/pools
        reports errors.

        Args:
            test_param (str): test_params from yaml file

        Raises:
            SoakTestError: if the soak log directories could not be removed
                from the client nodes or from the test node

        """
        # Reset the per-run bookkeeping
        self.soak_results = {}
        self.pool = []
        self.container = []
        self.harasser_results = {}
        self.harasser_args = {}
        run_harasser = False
        self.all_failed_jobs = []
        self.all_failed_harassers = []
        self.soak_errors = []
        self.check_errors = []

        # Read the test parameters from the yaml namespace of this test
        test_to = self.params.get("test_timeout", test_param + "*")
        self.job_timeout = self.params.get("job_timeout", test_param + "*")
        self.test_name = self.params.get("name", test_param + "*")
        self.nodesperjob = self.params.get("nodesperjob", test_param + "*")
        self.taskspernode = self.params.get("taskspernode", test_param + "*")
        single_test_pool = self.params.get("single_test_pool",
                                           test_param + "*", True)
        self.dmg_command.copy_certificates(get_log_file("daosCA/certs"),
                                           self.hostlist_clients)
        self.dmg_command.copy_configuration(self.hostlist_clients)
        harassers = self.params.get("harasserlist", test_param + "*")
        job_list = self.params.get("joblist", test_param + "*")
        rank = self.params.get("rank", "/run/container_reserved/*")
        obj_class = self.params.get("oclass", "/run/container_reserved/*")
        if harassers:
            # Keep the full list so it can be reloaded once it is exhausted
            harasserlist = get_harassers(harassers)
            self.harassers = harasserlist[:]
            run_harasser = True
            self.log.info("<< Initial harasser list = %s>>",
                          " ".join(self.harassers))
        # Create the reserved pool with data
        # self.pool is a list of all the pools used in soak
        # self.pool[0] will always be the reserved pool
        add_pools(self, ["pool_reserved"])
        self.pool[0].connect()

        # Create the container and populate with a known data
        # TO-DO: use IOR to write and later read verify the data
        resv_cont = self.get_container(self.pool[0],
                                       "/run/container_reserved/*", True)
        resv_cont.write_objects(rank, obj_class)

        # Create pool for jobs
        if single_test_pool:
            add_pools(self, ["pool_jobs"])
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))

        # cleanup soak log directories before test on all nodes
        result = slurm_utils.srun(NodeSet.fromlist(self.hostlist_clients),
                                  "rm -rf {}".format(self.log_dir),
                                  self.srun_params)
        if result.exit_status > 0:
            # The adjacent literals are joined before format() is applied, so
            # the first one must end with a space to keep the words apart.
            raise SoakTestError(
                "<<FAILED: Soak directories not removed "
                "from clients>>: {}".format(self.hostlist_clients))
        # cleanup test_node
        for log_dir in [self.log_dir, self.sharedlog_dir]:
            cmd = "rm -rf {}".format(log_dir)
            try:
                run_command(cmd, timeout=30)
            except DaosTestError as error:
                raise SoakTestError(
                    "<<FAILED: Soak directory {} was not removed {}>>".format(
                        log_dir, error))

        # Initialize time; test_to is scaled by 3600 to get seconds
        start_time = time.time()
        self.test_timeout = int(3600 * test_to)
        self.end_time = start_time + self.test_timeout
        self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
        while time.time() < self.end_time:
            # Start new pass
            start_loop_time = time.time()
            self.log.info("<<SOAK LOOP %s: time until done %s>>", self.loop,
                          DDHHMMSS_format(self.end_time - time.time()))
            if not single_test_pool:
                # Create pool for jobs
                add_pools(self, ["pool_jobs"])
                self.log.info("Current pools: %s",
                              " ".join([pool.uuid for pool in self.pool]))
            # Reload the harassers once the current list has been emptied
            if run_harasser and not self.harassers:
                self.harasser_results = {}
                self.harasser_args = {}
                self.harassers = harasserlist[:]
            try:
                # self.pool[1] is the job pool; self.pool[0] is reserved
                self.execute_jobs(job_list, self.pool[1])
            except SoakTestError as error:
                self.fail(error)
            # Check space after jobs done
            for pool in self.pool:
                self.dmg_command.pool_query(pool.uuid)
            self.soak_errors.extend(self.destroy_containers(self.container))
            self.container = []
            # remove the test pools from self.pool; preserving reserved pool
            if not single_test_pool:
                self.soak_errors.extend(self.destroy_pools(self.pool[1]))
                self.pool = [self.pool[0]]
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
            # fail if the pool/containers did not clean up correctly
            self.assertEqual(len(self.soak_errors), 0,
                             "\n".join(self.soak_errors))
            # Break out of loop if smoke
            if "smoke" in self.test_name:
                break
            loop_time = time.time() - start_loop_time
            self.log.info("<<LOOP %s completed in %s at %s>>", self.loop,
                          DDHHMMSS_format(loop_time), time.ctime())
            # Initialize harasser loop time from first pass loop time
            if self.loop == 1 and run_harasser:
                self.harasser_loop_time = loop_time
            self.loop += 1
        # Verify the reserved container data is still readable
        # TO-DO: use IOR
        if not resv_cont.read_objects():
            self.soak_errors.append("Data verification error on reserved pool"
                                    " after SOAK completed")
        # Track the reserved container together with the other containers
        self.container.append(resv_cont)
        # Report the total elapsed time of the soak run
        self.log.info("<<<<SOAK TOTAL TEST TIME = %s>>>>",
                      DDHHMMSS_format(time.time() - start_time))
示例#20
0
 def setUp(self):
     """Set up the soak test: log directories, client hosts and srun params.

     Raises:
         SoakTestError: if no client partition is defined, or if the
             requested slurm reservation has no reserved nodes in the
             client partition

     """
     self.log.info("<<setUp Started>> at %s", time.ctime())
     super(SoakTestBase, self).setUp()
     self.username = getuser()
     # Loop counter used by all tests; starts at the first pass
     self.loop = 1
     self.exclude_slurm_nodes = []

     # Soak log file locations.  self.outputdir is an avocado directory
     # (.../data/).  Each location gets a sub-directory for the current pass.
     pass_suffix = "/pass" + str(self.loop)
     self.log_dir = get_log_file("soak")
     self.outputsoakdir = self.outputdir + "/soak"
     self.test_log_dir = self.log_dir + pass_suffix
     self.local_pass_dir = self.outputsoakdir + pass_suffix
     self.sharedlog_dir = self.tmp + "/soak"
     self.sharedsoakdir = self.sharedlog_dir + pass_suffix

     # A slurm partition is mandatory
     if not self.client_partition:
         raise SoakTestError(
             "<<FAILED: Partition is not correctly setup for daos slurm "
             "partition>>")

     # Server nodes end up in the client list when only one partition is
     # specified; move them to the slurm exclusion list
     for server in self.hostlist_servers:
         if server not in self.hostlist_clients:
             continue
         self.hostlist_clients.remove(server)
         self.exclude_slurm_nodes.append(server)

     # The test node is included for log cleanup but must not be a client
     local_hosts = include_local_host(None)
     self.exclude_slurm_nodes.extend(local_hosts)
     test_node = local_hosts[0]
     if test_node in self.hostlist_clients:
         self.hostlist_clients.remove(test_node)

     # NOTE: Slurm reservation and partition are created before soak runs.
     # CI uses partition=daos_client and no reservation.
     # A21 uses partition=normal/default and reservation=daos-test.
     # Partition and reservation names are updated in the yaml file.
     # Without a reservation (CI only) all the nodes in the partition are
     # assumed to be used for soak.
     self.srun_params = {"partition": self.client_partition}
     reservation = self.params.get("reservation", "/run/srun_params/*")
     if reservation is not None:
         # The reservation is only valid if it reserves at least one node
         if not slurm_utils.get_reserved_nodes(
                 reservation, self.client_partition):
             # client nodes are invalid for requested reservation
             self.hostlist_clients = []
             message = "<<FAILED: Reservation {} is invalid in partition {}>>"
             raise SoakTestError(
                 message.format(reservation, self.client_partition))
         self.srun_params["reservation"] = reservation

     self.log.info(
         "<<Updated hostlist_clients %s >>", self.hostlist_clients)
     if not self.hostlist_clients:
         no_nodes = (
             "There are no valid nodes in this partition to run soak. "
             "Check partition {} for valid nodes")
         self.fail(no_nodes.format(self.client_partition))
示例#21
0
    def verify_client_run(self, exp_iface, env):
        """Run daos_racer as a client and check which interface carried data.

        The "port_rcv_data" counters are sampled on the client hosts before
        and after the run, and every interface whose counter did not increase
        (or could not be read) is reported.

        Args:
            exp_iface (str): expected interface to check.
            env (bool): add OFI_INTERFACE variable to exported variables of
                client command.

        Returns:
            bool: True if the number of interfaces without client traffic
                differs from the number of entries in self.interfaces;
                False otherwise.

        """
        counter = "port_rcv_data"
        clients = self.agent_managers[0].hosts

        # Sample the device counters before the client runs
        port_info_before = self.get_port_cnt(clients, counter)

        # daos_racer needs the dmg config file
        dmg = self.get_dmg_command()

        # daos_racer acts as the client
        daos_racer = DaosRacerCommand(self.bin, clients[0], dmg)
        daos_racer.get_params(self)

        # Export OFI_INTERFACE with the client command only when requested
        if env:
            daos_racer.update_env_names(["OFI_INTERFACE"])

        # The log file name encodes the expected interface and the env flag
        log_file = "daos_racer_{}_{}.log".format(exp_iface, env)

        # Raise FI_LOG_LEVEL to get more info on device issues
        racer_env = daos_racer.get_environment(self.server_managers[0], log_file)
        racer_env["FI_LOG_LEVEL"] = "info"
        racer_env["D_LOG_MASK"] = "INFO,object=ERR,placement=ERR"
        daos_racer.set_environment(racer_env)

        # Run the client
        daos_racer.run()

        # Sample the counters again to see what iface CaRT initialized with
        port_info_after = self.get_port_cnt(clients, counter)

        self.log.info("Client interface port_rcv_data counters")
        row = "%16s  %9s  %9s  %9s  %s"
        self.log.info(row, "Host(s)", "Interface", "Before", "After", "Difference")
        self.log.info(row, "-" * 16, "-" * 9, "-" * 9, "-" * 9, "-" * 9)
        no_traffic = set()
        for interface in sorted(port_info_before):
            hosts_before = port_info_before[interface]
            for host in sorted(hosts_before):
                before = hosts_before[host][1][counter]
                try:
                    after = port_info_after[interface][host][1][counter]
                    diff = int(after) - int(before)
                except (KeyError, ValueError) as error:
                    # Missing or non-numeric counter: treat as no traffic
                    after = "Error"
                    diff = "Unknown - {}".format(error)
                    no_traffic.add(interface)
                else:
                    if diff <= 0:
                        no_traffic.add(interface)
                self.log.info(row, host, interface, before, after, diff)

        # Read daos.log to verify device used and prevent false positives
        client_log = get_log_file(log_file)
        device_verified = self.get_log_info(clients, exp_iface, env, client_log)
        self.assertTrue(device_verified)

        # Report every device that showed no data going through it
        for interface in no_traffic:
            self.log.info("No client traffic seen through device: %s", interface)
        return len(no_traffic) != len(self.interfaces)
示例#22
0
文件: server_utils.py 项目: liw/daos
 def _prepare_dmg_certificates(self):
     """Copy the dmg certificates to the local host."""
     # Only the short host name (text before the first '.') is used
     short_name = socket.gethostname().split('.', 1)[0]
     cert_dir = get_log_file("daosCA/certs")
     # split() turns the single host name into the list passed as hosts
     self.dmg.copy_certificates(cert_dir, short_name.split())
示例#23
0
    def run_soak(self, test_param):
        """Run the soak test specified by the test params.

        A reserved pool and container are created and populated with a file,
        the soak log directories are removed, and then the job list is
        executed in passes until the test timeout expires (or after a single
        pass when the test name contains "smoke").  The reserved file is read
        back at the end and compared with the file originally written; a
        mismatch is recorded in self.soak_errors.

        The test is failed (via self.fail / self.assertEqual) when
        execute_jobs raises SoakTestError or when destroying containers/pools
        reports errors.

        Args:
            test_param (str): test_params from yaml file

        Raises:
            SoakTestError: if writing or reading the reserved container file
                fails, or if the soak log directories could not be removed
                from the client nodes or from the test node

        """
        # Reset the per-run bookkeeping
        self.soak_results = {}
        self.pool = []
        self.container = []
        self.harasser_results = {}
        self.harasser_args = {}
        run_harasser = False
        self.all_failed_jobs = []
        self.all_failed_harassers = []
        self.soak_errors = []
        self.check_errors = []
        self.used = []

        # Read the test parameters from the yaml namespace of this test
        test_to = self.params.get("test_timeout", test_param + "*")
        self.test_name = self.params.get("name", test_param + "*")
        single_test_pool = self.params.get("single_test_pool",
                                           test_param + "*", True)
        self.dmg_command.copy_certificates(get_log_file("daosCA/certs"),
                                           self.hostlist_clients)
        self.dmg_command.copy_configuration(self.hostlist_clients)
        harassers = self.params.get("harasserlist", test_param + "*")
        job_list = self.params.get("joblist", test_param + "*")
        if harassers:
            run_harasser = True
            self.log.info("<< Initial harasser list = %s>>", harassers)
            # Work on a copy so the configured list can be reloaded later
            harasserlist = harassers[:]
        # Create the reserved pool with data
        # self.pool is a list of all the pools used in soak
        # self.pool[0] will always be the reserved pool
        add_pools(self, ["pool_reserved"])
        # Create the reserved container
        resv_cont = self.get_container(self.pool[0],
                                       "/run/container_reserved/*", True)
        # populate reserved container with a 500MB file
        initial_resv_file = os.path.join(os.environ["DAOS_TEST_LOG_DIR"],
                                         "initial", "resv_file")
        try:
            reserved_file_copy(self,
                               initial_resv_file,
                               self.pool[0],
                               resv_cont,
                               num_bytes=500000000,
                               cmd="write")
        except CommandFailure as error:
            raise SoakTestError(
                "<<FAILED: Soak reserved container write failed>>") from error

        # Create pool for jobs
        if single_test_pool:
            add_pools(self, ["pool_jobs"])
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))

        # cleanup soak log directories before test on all nodes
        result = slurm_utils.srun(NodeSet.fromlist(self.hostlist_clients),
                                  "rm -rf {}".format(self.log_dir),
                                  self.srun_params)
        if result.exit_status > 0:
            # The adjacent literals are joined before format() is applied, so
            # the first one must end with a space to keep the words apart.
            raise SoakTestError(
                "<<FAILED: Soak directories not removed "
                "from clients>>: {}".format(self.hostlist_clients))
        # cleanup test_node
        for log_dir in [self.log_dir, self.sharedlog_dir]:
            cmd = "rm -rf {}".format(log_dir)
            try:
                run_command(cmd, timeout=30)
            except DaosTestError as error:
                raise SoakTestError(
                    "<<FAILED: Soak directory {} was not removed>>".format(
                        log_dir)) from error

        # Initialize time; test_to is scaled by 3600 to get seconds
        start_time = time.time()
        self.test_timeout = int(3600 * test_to)
        self.end_time = start_time + self.test_timeout
        self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
        while time.time() < self.end_time:
            # Start new pass
            start_loop_time = time.time()
            self.log.info("<<SOAK LOOP %s: time until done %s>>", self.loop,
                          DDHHMMSS_format(self.end_time - time.time()))
            if not single_test_pool:
                # Create pool for jobs
                add_pools(self, ["pool_jobs"])
                self.log.info("Current pools: %s",
                              " ".join([pool.uuid for pool in self.pool]))
            # Initialize harassers: one entry of the list is used per pass
            if run_harasser:
                if not harasserlist:
                    # Every entry has been used; start over with a new copy
                    harasserlist = harassers[:]
                harasser = harasserlist.pop(0)
                self.harasser_args = {}
                self.harasser_results = {}
                self.harassers, self.offline_harassers = get_harassers(
                    harasser)
            try:
                # self.pool[1] is the job pool; self.pool[0] is reserved
                self.execute_jobs(job_list, self.pool[1])
            except SoakTestError as error:
                self.fail(error)
            # Check space after jobs done
            for pool in self.pool:
                self.dmg_command.pool_query(pool.uuid)
            self.soak_errors.extend(self.destroy_containers(self.container))
            self.container = []
            # Remove the test pools from self.pool; preserving reserved pool
            if not single_test_pool:
                self.soak_errors.extend(self.destroy_pools(self.pool[1]))
                self.pool = [self.pool[0]]
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
            # Fail if the pool/containers did not clean up correctly
            self.assertEqual(len(self.soak_errors), 0,
                             "\n".join(self.soak_errors))
            # Break out of loop if smoke
            if "smoke" in self.test_name:
                break
            loop_time = time.time() - start_loop_time
            self.log.info("<<LOOP %s completed in %s at %s>>", self.loop,
                          DDHHMMSS_format(loop_time), time.ctime())
            # Initialize harasser loop time from first pass loop time
            if self.loop == 1 and run_harasser:
                self.harasser_loop_time = loop_time
            self.loop += 1
        # verify reserved container data
        final_resv_file = os.path.join(os.environ["DAOS_TEST_LOG_DIR"],
                                       "final", "resv_file")
        try:
            reserved_file_copy(self, final_resv_file, self.pool[0], resv_cont)
        except CommandFailure as error:
            raise SoakTestError(
                "<<FAILED: Soak reserved container read failed>>") from error

        if not cmp(initial_resv_file, final_resv_file):
            self.soak_errors.append("Data verification error on reserved pool"
                                    " after SOAK completed")
        for resv_file in [initial_resv_file, final_resv_file]:
            if os.path.isfile(resv_file):
                # Parent directory name ("initial" or "final") labels the copy
                file_name = os.path.split(os.path.dirname(resv_file))[-1]
                # save a copy of the POSIX file in self.outputsoakdir
                copy_cmd = "cp -p {} {}/{}_resv_file".format(
                    resv_file, self.outputsoakdir, file_name)
                try:
                    run_command(copy_cmd, timeout=30)
                except DaosTestError:
                    # Archiving is best effort; record the failure and go on
                    self.soak_errors.append(
                        "Reserved data file {} failed to archive".format(
                            resv_file))
                os.remove(resv_file)
        # Track the reserved container together with the other containers
        self.container.append(resv_cont)
        # Report the total elapsed time of the soak run
        self.log.info("<<<<SOAK TOTAL TEST TIME = %s>>>>",
                      DDHHMMSS_format(time.time() - start_time))