def set_output_file_path(self):
    """Set the path for the files generated by the macsio command."""
    self.log_file_name.update(
        get_log_file(self.log_file_name.value), "macsio.log_file_name")
    self.timings_file_name.update(
        get_log_file(self.timings_file_name.value), "macsio.timings_file_name")
def prepare(self, storage=True):
    """Prepare to start daos_server.

    Args:
        storage (bool, optional): whether or not to prepare dcpm/nvme
            storage. Defaults to True.
    """
    self.log.info(
        "<SERVER> Preparing to start daos_server on %s with %s",
        self._hosts, self.manager.command)

    # Create the daos_server yaml file
    self.manager.job.create_yaml_file()

    # Copy certificates
    self.manager.job.copy_certificates(
        get_log_file("daosCA/certs"), self._hosts)
    local_host = socket.gethostname().split('.', 1)[0]
    self.dmg.copy_certificates(
        get_log_file("daosCA/certs"), local_host.split())

    # Prepare dmg for running storage format on all server hosts
    self.dmg.hostlist = self._hosts
    if not self.dmg.yaml:
        # If using a dmg config file, transport security was
        # already configured.
        self.dmg.insecure.update(
            self.get_config_value("allow_insecure"), "dmg.insecure")

    # Kill any daos servers running on the hosts
    self.kill()

    # Clean up any files that exist on the hosts
    self.clean_files()

    # Make sure log file has been created for ownership change
    if self.manager.job.using_nvme:
        cmd_list = []
        for server_params in self.manager.job.yaml.server_params:
            log_file = server_params.log_file.value
            if log_file is not None:
                self.log.info("Creating log file: %s", log_file)
                cmd_list.append("touch {}".format(log_file))
        if cmd_list:
            pcmd(self._hosts, "; ".join(cmd_list), False)

    if storage:
        # Prepare server storage
        if self.manager.job.using_nvme or self.manager.job.using_dcpm:
            self.log.info("Preparing storage in <format> mode")
            self.prepare_storage("root")
            if hasattr(self.manager, "mca"):
                self.manager.mca.update(
                    {"plm_rsh_args": "-l root"}, "orterun.mca", True)
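# --- Illustration (not part of the test harness) ---------------------------
# prepare() above joins one "touch" command per configured engine log file so
# a single pcmd() call can create them all on the remote hosts. A minimal
# standalone sketch of that string-building step; the log paths are made up
# for illustration only:
def build_touch_command(log_files):
    """Join 'touch' commands for every configured log file, skipping unset entries."""
    return "; ".join(
        "touch {}".format(path) for path in log_files if path is not None)

print(build_touch_command(["/tmp/daos_server_0.log", None, "/tmp/daos_server_1.log"]))
# touch /tmp/daos_server_0.log; touch /tmp/daos_server_1.log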
def setUp(self):
    """Define test setup to be done."""
    self.log.info("<<setUp Started>> at %s", time.ctime())
    super(SoakTestBase, self).setUp()
    self.username = getuser()
    # Initialize loop param for all tests
    self.loop = 1
    self.exclude_slurm_nodes = []
    # Setup logging directories for soak logfiles
    # self.output dir is an avocado directory .../data/
    self.log_dir = get_log_file("soak")
    self.outputsoakdir = self.outputdir + "/soak"
    # Fail if slurm partition daos_client is not defined
    if not self.client_partition:
        raise SoakTestError(
            "<<FAILED: Partition is not correctly setup for daos "
            "slurm partition>>")
    # Check if the server nodes are in the client list;
    # this will happen when only one partition is specified
    for host_server in self.hostlist_servers:
        if host_server in self.hostlist_clients:
            self.hostlist_clients.remove(host_server)
            self.exclude_slurm_nodes.append(host_server)
    # Include test node for log cleanup; remove from client list
    local_host_list = include_local_host(None)
    self.exclude_slurm_nodes.extend(local_host_list)
    if local_host_list[0] in self.hostlist_clients:
        self.hostlist_clients.remove(local_host_list[0])
    self.log.info(
        "<<Updated hostlist_clients %s >>", self.hostlist_clients)
    if not self.hostlist_clients:
        self.fail(
            "There are no nodes that are client only; "
            "check if the partition also contains server nodes")
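# --- Illustration (not part of the test harness) ---------------------------
# setUp() above prunes the client host list: any server host found in the
# client list is moved to the Slurm exclude list, and the local test node is
# always excluded. A small standalone sketch of that pruning with made-up
# host names:
def prune_client_hosts(clients, servers, local_hosts):
    """Return (clients, exclude) after removing server and local hosts from clients."""
    clients = list(clients)
    exclude = []
    for host in servers:
        if host in clients:
            clients.remove(host)
            exclude.append(host)
    for host in local_hosts:
        exclude.append(host)
        if host in clients:
            clients.remove(host)
    return clients, exclude

print(prune_client_hosts(["node1", "node2", "node3"], ["node3"], ["node1"]))
# (['node2'], ['node3', 'node1'])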
def __init__(self, path, host, dmg=None):
    """Create a daos_racer command object.

    Args:
        path (str): path of the daos_racer command
        host (str): host on which to run the daos_racer command
        dmg (DmgCommand): a DmgCommand object used to obtain the
            configuration file and certificate
    """
    super(DaosRacerCommand, self).__init__(
        "/run/daos_racer/*", "daos_racer", path)
    self.host = host

    # Number of seconds to run
    self.runtime = FormattedParameter("-t {}", 60)

    if dmg:
        self.dmg_config = FormattedParameter("-n {}", dmg.yaml.filename)
        dmg.copy_certificates(get_log_file("daosCA/certs"), [self.host])

    # Optional timeout for the clush command running the daos_racer command.
    # This should be set greater than the 'runtime' value but less than the
    # avocado test timeout value to allow for proper cleanup. Using a value
    # of None will result in no timeout being used.
    self.clush_timeout = BasicParameter(None)

    # Environment variable names required to be set when running the
    # daos_racer command. The values for these names are populated by the
    # get_environment() method and added to command line by the
    # set_environment() method.
    self._env_names = ["D_LOG_FILE"]
def get_environment(self, manager, log_file=None):
    """Get the environment variables to export for the command.

    Args:
        manager (DaosServerManager): the job manager used to start
            daos_server from which the server config values can be obtained
            to set the required environment variables.
        log_file (str, optional): when specified overrides the default
            D_LOG_FILE value. Defaults to None.

    Returns:
        EnvironmentVariables: a dictionary of environment variable names and
            values to export.

    """
    env = EnvironmentVariables()
    for name in self._env_names:
        if name == "D_LOG_FILE":
            if not log_file:
                log_file = "{}_daos.log".format(self.command)
            value = get_log_file(log_file)
        else:
            value = manager.get_environment_value(name)
        env[name] = value
    return env
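# --- Illustration (not part of the test harness) ---------------------------
# get_environment() above defaults D_LOG_FILE to "<command>_daos.log" and then
# routes it through get_log_file() so the log lands in the shared test log
# directory. A runnable sketch of that fallback, with a plain dict standing in
# for EnvironmentVariables and a hard-coded directory standing in for
# get_log_file():
import os

LOG_DIR = "/tmp/daos_testing"  # assumed log directory, for illustration only

def build_env(command, env_names, log_file=None, lookup=None):
    """Build an environment dict, deriving D_LOG_FILE from the command name if unset."""
    env = {}
    for name in env_names:
        if name == "D_LOG_FILE":
            value = os.path.join(LOG_DIR, log_file or "{}_daos.log".format(command))
        else:
            value = (lookup or {}).get(name)
        env[name] = value
    return env

print(build_env("daos_racer", ["D_LOG_FILE"]))
# {'D_LOG_FILE': '/tmp/daos_testing/daos_racer_daos.log'}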
def setUp(self):
    """Set up each test case."""
    super(CartSelfTest, self).setUp()

    # Configure the daos server
    config_file = self.get_config_file(self.server_group, "server")
    self.add_server_manager(config_file)
    self.configure_manager(
        "server",
        self.server_managers[-1],
        self.hostlist_servers,
        self.hostfile_servers_slots,
        self.hostlist_servers)

    # Setup additional environment variables for the server orterun command
    share_addr = self.params.get("val", "/run/muxtestparams/share_addr/*")[0]
    self.cart_env["CRT_CTX_SHARE_ADDR"] = str(share_addr)
    self.cart_env["CRT_CTX_NUM"] = "8"
    self.cart_env["CRT_PHY_ADDR_STR"] = \
        self.server_managers[0].get_config_value("provider")
    self.cart_env["OFI_INTERFACE"] = \
        self.server_managers[0].get_config_value("fabric_iface")
    self.server_managers[0].manager.assign_environment(self.cart_env, True)

    # Start the daos server
    self.start_server_managers()

    # Generate a uri file using daos_agent dump-attachinfo
    attachinfo_file = "{}.attach_info_tmp".format(self.server_group)
    self.uri_file = get_log_file(attachinfo_file)
    agent_cmd = self.agent_managers[0].manager.job
    agent_cmd.dump_attachinfo(self.uri_file)
def start(self):
    """Start the agent through the job manager."""
    self.log.info(
        "<AGENT> Starting daos_agent on %s with %s",
        self._hosts, self.manager.command)

    # Copy certificates
    self.manager.job.copy_certificates(
        get_log_file("daosCA/certs"), self._hosts)

    super(DaosAgentManager, self).start()
def setUp(self):
    """Define test setup to be done."""
    self.log.info("<<setUp Started>> at %s", time.ctime())
    super().setUp()
    # Log the version of rpms being used for this test
    cmd = "sudo dnf list daos-client"
    try:
        _ = run_command(cmd, timeout=30)
    except DaosTestError as error:
        self.log.info("No daos rpm package info available %s", error)
    self.username = getuser()
    # Initialize loop param for all tests
    self.loop = 1
    self.exclude_slurm_nodes = []
    # Setup logging directories for soak logfiles
    # self.output dir is an avocado directory .../data/
    self.log_dir = get_log_file("soak")
    self.outputsoakdir = self.outputdir + "/soak"
    # Create the remote log directories on all client nodes
    self.test_log_dir = self.log_dir + "/pass" + str(self.loop)
    self.local_pass_dir = self.outputsoakdir + "/pass" + str(self.loop)
    self.sharedlog_dir = self.tmp + "/soak"
    self.sharedsoakdir = self.sharedlog_dir + "/pass" + str(self.loop)
    # Initialize dmg cmd
    self.dmg_command = self.get_dmg_command()
    # Fail if slurm partition is not defined
    # NOTE: Slurm reservation and partition are created before soak runs.
    # CI uses partition=daos_client and no reservation.
    # A21 uses partition=normal/default and reservation=daos-test.
    # Partition and reservation names are updated in the yaml file.
    # It is assumed that if there is no reservation (CI only), then all
    # the nodes in the partition will be used for soak.
    if not self.client_partition:
        raise SoakTestError(
            "<<FAILED: Partition is not correctly setup for daos "
            "slurm partition>>")
    self.srun_params = {"partition": self.client_partition}
    if self.client_reservation:
        self.srun_params["reservation"] = self.client_reservation
    # Check if the server nodes are in the client list;
    # this will happen when only one partition is specified
    for host_server in self.hostlist_servers:
        if host_server in self.hostlist_clients:
            self.hostlist_clients.remove(host_server)
            self.exclude_slurm_nodes.append(host_server)
    # Include test node for log cleanup; remove from client list
    local_host_list = include_local_host(None)
    self.exclude_slurm_nodes.extend(local_host_list)
    if local_host_list[0] in self.hostlist_clients:
        self.hostlist_clients.remove(local_host_list[0])
    if not self.hostlist_clients:
        self.fail(
            "There are no valid nodes in this partition to run "
            "soak. Check partition {} for valid nodes".format(
                self.client_partition))
def start(self):
    """Start the agent through the job manager."""
    self.log.info(
        "<AGENT> Starting daos_agent on %s with %s",
        self._hosts, self.manager.command)

    # Copy certificates
    self.manager.job.copy_certificates(
        get_log_file("daosCA/certs"), self._hosts)

    # Verify the socket directory exists when using a non-systemctl manager
    self.verify_socket_directory(getuser())

    super().start()
def run_subtest(self):
    """Run daos_test with a subtest argument."""
    subtest = self.params.get("daos_test", self.TEST_PATH)
    num_clients = self.params.get(
        "num_clients", '/run/daos_tests/num_clients/*')
    num_replicas = self.params.get(
        "num_replicas", '/run/daos_tests/num_replicas/*')
    scm_size = self.params.get("scm_size", '/run/pool/*')
    nvme_size = self.params.get("nvme_size", '/run/pool/*')
    args = self.params.get("args", self.TEST_PATH, "")

    dmg = self.get_dmg_command()
    dmg_config_file = dmg.yaml.filename

    cmd = " ".join(
        [
            self.orterun,
            self.client_mca,
            "-n", str(num_clients),
            "-x", "=".join(["D_LOG_FILE", get_log_file(self.client_log)]),
            "-x", "D_LOG_MASK=DEBUG",
            "-x", "DD_MASK=mgmt,io,md,epc,rebuild",
            self.daos_test,
            "-s", str(num_replicas),
            "-n", dmg_config_file,
            "".join(["-", subtest]),
            str(args)
        ]
    )

    env = {}
    env['CMOCKA_XML_FILE'] = os.path.join(self.outputdir, "%g_results.xml")
    env['CMOCKA_MESSAGE_OUTPUT'] = "xml"
    env['POOL_SCM_SIZE'] = "{}".format(scm_size)
    if not nvme_size:
        nvme_size = 0
    env['POOL_NVME_SIZE'] = "{}".format(nvme_size)

    load_mpi("openmpi")
    try:
        process.run(cmd, env=env)
    except process.CmdError as result:
        if result.result.exit_status != 0:
            # fake a JUnit failure output
            self.create_results_xml(self.subtest_name, result)
            self.fail(
                "{0} failed with return code={1}.\n".format(
                    cmd, result.result.exit_status))
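# --- Illustration (not part of the test harness) ---------------------------
# run_subtest() above assembles the orterun command line by interleaving "-x"
# flags with NAME=VALUE pairs (D_LOG_FILE, D_LOG_MASK, DD_MASK). A standalone
# sketch of that argument-building pattern; the log path below is an example
# value only:
def build_env_export_args(env):
    """Render environment variables as orterun-style '-x NAME=VALUE' arguments."""
    args = []
    for name, value in env.items():
        args.extend(["-x", "=".join([name, str(value)])])
    return args

exports = build_env_export_args({
    "D_LOG_FILE": "/tmp/daos_testing/client_daos.log",
    "D_LOG_MASK": "DEBUG",
    "DD_MASK": "mgmt,io,md,epc,rebuild",
})
print(" ".join(exports))
# -x D_LOG_FILE=/tmp/daos_testing/client_daos.log -x D_LOG_MASK=DEBUG -x DD_MASK=mgmt,io,md,epc,rebuild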
def test_nvme_io_stats(self):
    """Jira ID: DAOS-4722.

    Test Description:
        Run an IO test and verify that, when NVME_IO_STATS is enabled in
        the config, the per-target NVMe statistics are generated.

    Use case:
        Run ior; the NVMe IO stats are printed to the control plane log
        file.

    :avocado: tags=all,hw,medium,nvme,ib2,nvme_io_stats,full_regression
    """
    # run ior
    self.run_ior_with_pool()

    # Get the NVMe IO statistics from the server control_log file.
    cmd = 'cat {}'.format(get_log_file(self.control_log))
    results = run_pcmd(self.hostlist_servers, cmd)
    for result in results:
        if result["exit_status"] == 1:
            self.fail("Failed to run cmd {} on {}".format(
                cmd, result["hosts"]))

        # Verify statistics are increasing for IO
        target_stats = []
        for _tmp in range(8):
            target_stats.append([
                line for line in result["stdout"]
                if "tgt[{}]".format(_tmp) in line
            ])
        for stats in NVME_STATS:
            for _tgt in range(len(target_stats)):
                first_stats = re.findall(r'\d+', [
                    x for x in target_stats[_tgt][0].split()
                    if re.search(stats, x)
                ][0])[0]
                last_stats = re.findall(r'\d+', [
                    x for x in target_stats[_tgt][-1].split()
                    if re.search(stats, x)
                ][0])[0]
                # The last statistic should be higher than the initial one
                if int(first_stats) >= int(last_stats):
                    self.fail(
                        "Failed: Stats {} for target {} did not increase: "
                        "First_stat={} >= Last_stat={}".format(
                            stats, _tgt, first_stats, last_stats))
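# --- Illustration (not part of the test harness) ---------------------------
# The statistics check above pulls the token matching a counter name from the
# first and last matching log line of each target and compares the embedded
# integers. A standalone sketch of that comparison; the sample line format is
# an assumption, the real control-plane output may differ:
import re

def stat_increased(first_line, last_line, stat):
    """Return True if the named counter grew between the first and last log lines."""
    def extract(line):
        token = next(tok for tok in line.split() if re.search(stat, tok))
        return int(re.findall(r'\d+', token)[0])
    return extract(last_line) > extract(first_line)

print(stat_increased("tgt[0] read_bytes:1024", "tgt[0] read_bytes:8192", "read_bytes"))
# True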
def test_nvme_io_stats(self):
    """Jira ID: DAOS-4722.

    Test Description:
        Run an IO test and verify that, when NVME_IO_STATS is enabled in
        the config, the per-target NVMe statistics are generated.

    Use case:
        Run ior; the NVMe IO stats are printed to the control plane log
        file.

    :avocado: tags=all,hw,medium,nvme,ib2,nvme_io_stats,full_regression
    """
    # run ior
    self.run_ior_with_pool()

    # Get the NVMe IO statistics from the server control_log file.
    cmd = 'cat {}'.format(get_log_file(self.control_log))
    task = run_task(self.hostlist_servers, cmd)
    for _rc_code, _node in task.iter_retcodes():
        if _rc_code == 1:
            self.fail("Failed to run cmd {} on {}".format(cmd, _node))
    for buf, _nodes in task.iter_buffers():
        output_list = str(buf).split('\n')
        # Verify statistics are increasing for IO
        target_stats = []
        for _tmp in range(8):
            target_stats.append(
                [s for s in output_list if "tgt[{}]".format(_tmp) in s])
        for stats in NVME_STATS:
            for _tgt in range(len(target_stats)):
                first_stats = re.findall(r'\d+', [
                    x for x in target_stats[_tgt][0].split()
                    if re.search(stats, x)
                ][0])[0]
                last_stats = re.findall(r'\d+', [
                    x for x in target_stats[_tgt][-1].split()
                    if re.search(stats, x)
                ][0])[0]
                # The last statistic should be higher than the initial one
                if int(first_stats) >= int(last_stats):
                    self.fail(
                        'Failed: Stats {} for target {} did not increase: '
                        'First_stat={} >= Last_stat={}'.format(
                            stats, _tgt, first_stats, last_stats))
def run_subtest(self):
    """Run daos_test with a subtest argument."""
    subtest = self.params.get("daos_test", '/run/daos_tests/Tests/*')
    num_clients = self.params.get(
        "num_clients", '/run/daos_tests/num_clients/*')
    num_replicas = self.params.get(
        "num_replicas", '/run/daos_tests/num_replicas/*')
    scm_size = self.params.get("scm_size", '/run/pool/*')
    args = self.params.get("args", '/run/daos_tests/Tests/*', "")

    cmd = "{} {} -n {} -x D_LOG_FILE={} \
        -x D_LOG_MASK=DEBUG -x DD_MASK=mgmt,io,md,epc,rebuild \
        {} -s {} -n {} {}".format(
            self.orterun, self.client_mca, num_clients,
            get_log_file(self.client_log), self.daos_test, num_replicas,
            subtest, args)

    env = {}
    env['CMOCKA_XML_FILE'] = "%g_results.xml"
    env['CMOCKA_MESSAGE_OUTPUT'] = "xml"
    env['POOL_SCM_SIZE'] = "{}".format(scm_size)

    load_mpi("openmpi")
    try:
        process.run(cmd, env=env)
    except process.CmdError as result:
        if result.result.exit_status != 0:
            # fake a JUnit failure output
            with open(self.subtest_name + "_results.xml", "w") as results_xml:
                results_xml.write('''<?xml version="1.0" encoding="UTF-8"?>
<testsuite name="{0}" errors="1" failures="0" skipped="0" tests="1" time="0.0">
  <testcase name="ALL" time="0.0" >
    <error message="Test failed to start up"/>
    <system-out>
<![CDATA[{1}]]>
    </system-out>
    <system-err>
<![CDATA[{2}]]>
    </system-err>
  </testcase>
</testsuite>'''.format(self.subtest_name,
                       result.result.stdout,
                       result.result.stderr))
            self.fail("{0} failed with return code={1}.\n".format(
                cmd, result.result.exit_status))
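# --- Illustration (not part of the test harness) ---------------------------
# The except-branch above fakes a JUnit result so a failed launch still shows
# up in the CI test report. The same XML can be produced by a small helper;
# the create_results_xml() used by the newer run_subtest() variants likely
# does something similar, but this standalone version is only a sketch:
JUNIT_ERROR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
<testsuite name="{name}" errors="1" failures="0" skipped="0" tests="1" time="0.0">
  <testcase name="ALL" time="0.0">
    <error message="Test failed to start up"/>
    <system-out><![CDATA[{out}]]></system-out>
    <system-err><![CDATA[{err}]]></system-err>
  </testcase>
</testsuite>
'''

def write_junit_error(path, name, stdout="", stderr=""):
    """Write a single-testcase JUnit file marking the named suite as errored."""
    with open(path, "w") as xml_file:
        xml_file.write(JUNIT_ERROR_TEMPLATE.format(name=name, out=stdout, err=stderr))

write_junit_error("/tmp/example_results.xml", "daos_test_example")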
def prepare(self, storage=True):
    """Prepare to start daos_server.

    Args:
        storage (bool, optional): whether or not to prepare dcpm/nvme
            storage. Defaults to True.
    """
    self.log.info(
        "<SERVER> Preparing to start daos_server on %s with %s",
        self._hosts, self.manager.command)

    # Create the daos_server yaml file
    self.manager.job.temporary_file_hosts = self._hosts
    self.manager.job.create_yaml_file()

    # Copy certificates
    self.manager.job.copy_certificates(
        get_log_file("daosCA/certs"), self._hosts)
    self._prepare_dmg_certificates()

    # Prepare dmg for running storage format on all server hosts
    self._prepare_dmg_hostlist(self._hosts)
    if not self.dmg.yaml:
        # If using a dmg config file, transport security was
        # already configured.
        self.dmg.insecure.update(
            self.get_config_value("allow_insecure"), "dmg.insecure")

    # Kill any daos servers running on the hosts
    self.kill()

    # Clean up any files that exist on the hosts
    self.clean_files()

    if storage:
        # Prepare server storage
        if self.manager.job.using_nvme or self.manager.job.using_dcpm:
            self.log.info("Preparing storage in <format> mode")
            self.prepare_storage("root")
            if hasattr(self.manager, "mca"):
                self.manager.mca.update(
                    {"plm_rsh_args": "-l root"}, "orterun.mca", True)

    # Verify the socket directory exists when using a non-systemctl manager
    self.verify_socket_directory(getuser())
def test_csum_error_logging(self):
    """Test ID: DAOS-3927.

    Test Description:
        Write Avocado Test to verify single data after
        pool/container disconnect/reconnect.

    :avocado: tags=all,daily_regression,hw,medium,ib2,csum_error_log,faults
    """
    dev_id = self.get_nvme_device_id()
    self.log.info("%s", dev_id)
    csum = self.get_checksum_error_value(dev_id)
    self.dmg.copy_certificates(
        get_log_file("daosCA/certs"), self.hostlist_clients)
    self.log.info("Checksum Errors : %d", csum)
    DaosCoreBase.run_subtest(self)
    csum_latest = self.get_checksum_error_value(dev_id)
    self.log.info("Checksum Errors : %d", csum_latest)
    self.assertTrue(csum_latest > csum,
                    "Checksum Error Log not incremented")
    self.log.info("Checksum Error Logging Test Passed")
def verify_client_run(self, exp_iface, env):
    """Verify the interface assigned by running a libdaos client.

    Args:
        exp_iface (str): expected interface to check.
        env (bool): add OFI_INTERFACE variable to exported variables of
            client command.

    Returns:
        bool: returns status

    """
    hfi_map = {"ib0": "hfi1_0", "ib1": "hfi1_1"}

    # Get counter values for hfi devices before and after
    cnt_before = self.get_port_cnt(
        self.hostlist_clients, hfi_map[exp_iface], "port_rcv_data")

    # get the dmg config file for daos_racer
    dmg = self.get_dmg_command()

    # Let's run daos_racer as a client
    daos_racer = DaosRacerCommand(self.bin, self.hostlist_clients[0], dmg)
    daos_racer.get_params(self)

    # Update env_name list to add OFI_INTERFACE if needed.
    if env:
        daos_racer.update_env_names(["OFI_INTERFACE"])

    # Setup the environment and logfile
    logf = "daos_racer_{}_{}.log".format(exp_iface, env)

    # Add FI_LOG_LEVEL to get more info on device issues
    racer_env = daos_racer.get_environment(self.server_managers[0], logf)
    racer_env["FI_LOG_LEVEL"] = "info"
    daos_racer.set_environment(racer_env)

    # Run client
    daos_racer.run()

    # Verify the port count to check which interface CaRT initialized with.
    cnt_after = self.get_port_cnt(
        self.hostlist_clients, hfi_map[exp_iface], "port_rcv_data")

    diff = 0
    for cnt_b, cnt_a in zip(cnt_before.values(), cnt_after.values()):
        diff = int(cnt_a) - int(cnt_b)
        self.log.info("Port [%s] count difference: %s", exp_iface, diff)

    # Read daos.log to verify device used and prevent false positives
    self.assertTrue(
        self.get_log_info(
            self.hostlist_clients, exp_iface, env, get_log_file(logf)))

    # If we don't see data going through the device, fail
    status = True
    if diff <= 0:
        self.log.info("No traffic seen through device: %s", exp_iface)
        status = False
    return status
def run_subtest(self):
    """Run daos_test with a subtest argument."""
    subtest = self.get_test_param("daos_test")
    num_clients = self.get_test_param("num_clients")
    if num_clients is None:
        num_clients = self.params.get("num_clients", '/run/daos_tests/*')

    scm_size = self.params.get("scm_size", '/run/pool/*')
    nvme_size = self.params.get("nvme_size", '/run/pool/*')
    args = self.get_test_param("args", "")
    stopped_ranks = self.get_test_param("stopped_ranks", [])
    dmg = self.get_dmg_command()
    dmg_config_file = dmg.yaml.filename
    if self.hostlist_clients:
        dmg.copy_certificates(
            get_log_file("daosCA/certs"), self.hostlist_clients)
        dmg.copy_configuration(self.hostlist_clients)
        self.client_mca += " --mca btl_tcp_if_include eth0"

    cmd = " ".join([
        self.orterun,
        self.client_mca,
        "-n", str(num_clients),
        "--hostfile", self.hostfile_clients,
        "-x", "=".join(["D_LOG_FILE", get_log_file(self.client_log)]),
        "--map-by node",
        "-x", "D_LOG_MASK=DEBUG",
        "-x", "DD_MASK=mgmt,io,md,epc,rebuild",
        self.daos_test,
        "-n", dmg_config_file,
        "".join(["-", subtest]),
        str(args)
    ])

    env = {}
    env['CMOCKA_XML_FILE'] = os.path.join(
        self.outputdir, "%g_cmocka_results.xml")
    env['CMOCKA_MESSAGE_OUTPUT'] = "xml"
    env['POOL_SCM_SIZE'] = "{}".format(scm_size)
    if not nvme_size:
        nvme_size = 0
    env['POOL_NVME_SIZE'] = "{}".format(nvme_size)

    if not load_mpi("openmpi"):
        self.fail("Failed to load openmpi")

    # Update the expected status for each rank that will be stopped by this
    # test to avoid a false failure during tearDown().
    if "random" in stopped_ranks:
        # Set each expected rank state to be either stopped or running
        for manager in self.server_managers:
            manager.update_expected_states(
                None, ["Joined", "Stopped", "Evicted"])
    else:
        # Set the specific expected rank state to stopped
        for rank in stopped_ranks:
            for manager in self.server_managers:
                manager.update_expected_states(rank, ["Stopped", "Evicted"])

    try:
        process.run(cmd, env=env)
    except process.CmdError as result:
        if result.result.exit_status != 0:
            # fake a JUnit failure output
            self.create_results_xml(self.subtest_name, result)
            self.fail("{0} failed with return code={1}.\n".format(
                cmd, result.result.exit_status))
def run_subtest(self):
    """Run daos_test with a subtest argument."""
    subtest = self.get_test_param("daos_test")
    num_clients = self.get_test_param("num_clients")
    if num_clients is None:
        num_clients = self.params.get("num_clients", '/run/daos_tests/*')

    scm_size = self.params.get("scm_size", '/run/pool/*')
    nvme_size = self.params.get("nvme_size", '/run/pool/*')
    args = self.get_test_param("args", "")
    stopped_ranks = self.get_test_param("stopped_ranks", [])
    pools_created = self.get_test_param("pools_created", 1)
    self.increment_timeout(POOL_TIMEOUT_INCREMENT * pools_created)
    dmg = self.get_dmg_command()
    dmg_config_file = dmg.yaml.filename
    if self.hostlist_clients:
        dmg.copy_certificates(
            get_log_file("daosCA/certs"), self.hostlist_clients)
        dmg.copy_configuration(self.hostlist_clients)

    cmd = " ".join(
        [
            "-x", "=".join(["D_LOG_FILE", get_log_file(self.client_log)]),
            "--map-by node",
            "-x", "D_LOG_MASK=DEBUG",
            "-x", "DD_MASK=mgmt,io,md,epc,rebuild",
            "-x", "COVFILE=/tmp/test.cov",
            self.daos_test,
            "-n", dmg_config_file,
            "".join(["-", subtest]),
            str(args)
        ]
    )

    job_cmd = ExecutableCommand(namespace=None, command=cmd)
    job = get_job_manager(self, "Orterun", job_cmd, mpi_type="openmpi")

    # Assign the test to run
    job.hostfile.update(self.hostfile_clients)
    job.processes.update(num_clients)
    job_str = str(job)

    env = {}
    env['CMOCKA_XML_FILE'] = os.path.join(
        self.outputdir, "%g_cmocka_results.xml")
    env['CMOCKA_MESSAGE_OUTPUT'] = "xml"
    env['POOL_SCM_SIZE'] = "{}".format(scm_size)
    if not nvme_size:
        nvme_size = 0
    env['POOL_NVME_SIZE'] = "{}".format(nvme_size)

    # Update the expected status for each rank that will be stopped by this
    # test to avoid a false failure during tearDown().
    if "random" in stopped_ranks:
        # Set each expected rank state to be either stopped or running
        for manager in self.server_managers:
            manager.update_expected_states(
                None, ["Joined", "Stopped", "Excluded"])
    else:
        # Set the specific expected rank state to stopped
        for rank in stopped_ranks:
            for manager in self.server_managers:
                manager.update_expected_states(
                    rank, ["Stopped", "Excluded"])

    try:
        process.run(job_str, env=env)
    except process.CmdError as result:
        if result.result.exit_status != 0:
            # fake a JUnit failure output
            self.create_results_xml(
                self.subtest_name, result,
                "Failed to run {}.".format(self.daos_test))
            self.fail(
                "{0} failed with return code={1}.\n".format(
                    job_str, result.result.exit_status))
def run_soak(self, test_param):
    """Run the soak test specified by the test params.

    Args:
        test_param (str): test_params from yaml file
    """
    self.soak_results = {}
    self.pool = []
    self.container = []
    self.harasser_results = {}
    self.harasser_args = {}
    run_harasser = False
    self.all_failed_jobs = []
    self.all_failed_harassers = []
    self.soak_errors = []
    self.check_errors = []
    test_to = self.params.get("test_timeout", test_param + "*")
    self.job_timeout = self.params.get("job_timeout", test_param + "*")
    self.test_name = self.params.get("name", test_param + "*")
    self.nodesperjob = self.params.get("nodesperjob", test_param + "*")
    self.taskspernode = self.params.get("taskspernode", test_param + "*")
    single_test_pool = self.params.get(
        "single_test_pool", test_param + "*", True)
    self.dmg_command.copy_certificates(
        get_log_file("daosCA/certs"), self.hostlist_clients)
    self.dmg_command.copy_configuration(self.hostlist_clients)
    harassers = self.params.get("harasserlist", test_param + "*")
    job_list = self.params.get("joblist", test_param + "*")
    rank = self.params.get("rank", "/run/container_reserved/*")
    obj_class = self.params.get("oclass", "/run/container_reserved/*")
    if harassers:
        harasserlist = get_harassers(harassers)
        self.harassers = harasserlist[:]
        run_harasser = True
        self.log.info(
            "<< Initial harasser list = %s>>",
            " ".join([harasser for harasser in self.harassers]))

    # Create the reserved pool with data
    # self.pool is a list of all the pools used in soak
    # self.pool[0] will always be the reserved pool
    add_pools(self, ["pool_reserved"])
    self.pool[0].connect()

    # Create the container and populate with a known data
    # TO-DO: use IOR to write and later read verify the data
    resv_cont = self.get_container(
        self.pool[0], "/run/container_reserved/*", True)
    resv_cont.write_objects(rank, obj_class)

    # Create pool for jobs
    if single_test_pool:
        add_pools(self, ["pool_jobs"])
        self.log.info(
            "Current pools: %s",
            " ".join([pool.uuid for pool in self.pool]))

    # cleanup soak log directories before test on all nodes
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "rm -rf {}".format(self.log_dir), self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Soak directories not removed "
            "from clients>>: {}".format(self.hostlist_clients))
    # cleanup test_node
    for log_dir in [self.log_dir, self.sharedlog_dir]:
        cmd = "rm -rf {}".format(log_dir)
        try:
            result = run_command(cmd, timeout=30)
        except DaosTestError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory {} was not removed {}>>".format(
                    log_dir, error))

    # Initialize time
    start_time = time.time()
    self.test_timeout = int(3600 * test_to)
    self.end_time = start_time + self.test_timeout
    self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
    while time.time() < self.end_time:
        # Start new pass
        start_loop_time = time.time()
        self.log.info(
            "<<SOAK LOOP %s: time until done %s>>", self.loop,
            DDHHMMSS_format(self.end_time - time.time()))
        if not single_test_pool:
            # Create pool for jobs
            add_pools(self, ["pool_jobs"])
            self.log.info(
                "Current pools: %s",
                " ".join([pool.uuid for pool in self.pool]))
        # Initialize harassers if the list has been exhausted
        if run_harasser and not self.harassers:
            self.harasser_results = {}
            self.harasser_args = {}
            self.harassers = harasserlist[:]
        try:
            self.execute_jobs(job_list, self.pool[1])
        except SoakTestError as error:
            self.fail(error)
        # Check space after jobs done
        for pool in self.pool:
            self.dmg_command.pool_query(pool.uuid)
        self.soak_errors.extend(self.destroy_containers(self.container))
        self.container = []
        # Remove the test pools from self.pool; preserving reserved pool
        if not single_test_pool:
            self.soak_errors.extend(self.destroy_pools(self.pool[1]))
            self.pool = [self.pool[0]]
        self.log.info(
            "Current pools: %s",
            " ".join([pool.uuid for pool in self.pool]))
        # Fail if the pool/containers did not clean up correctly
        self.assertEqual(
            len(self.soak_errors), 0, "\n".join(self.soak_errors))
        # Break out of loop if smoke
        if "smoke" in self.test_name:
            break
        loop_time = time.time() - start_loop_time
        self.log.info(
            "<<LOOP %s completed in %s at %s>>", self.loop,
            DDHHMMSS_format(loop_time), time.ctime())
        # Initialize harasser loop time from first pass loop time
        if self.loop == 1 and run_harasser:
            self.harasser_loop_time = loop_time
        self.loop += 1
    # TO-DO: use IOR
    if not resv_cont.read_objects():
        self.soak_errors.append(
            "Data verification error on reserved pool"
            " after SOAK completed")
    self.container.append(resv_cont)
    # gather the daos logs from the client nodes
    self.log.info(
        "<<<<SOAK TOTAL TEST TIME = %s>>>>",
        DDHHMMSS_format(time.time() - start_time))
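# --- Illustration (not part of the test harness) ---------------------------
# The soak loop above converts the test timeout (hours) to an absolute end
# time and keeps starting passes until the clock runs out, logging the
# remaining time as days/hours/minutes/seconds. A runnable sketch of that loop
# control; ddhhmmss() is a stand-in for the DDHHMMSS_format() helper:
import time

def ddhhmmss(seconds):
    """Render a number of seconds as DD:HH:MM:SS."""
    seconds = int(seconds)
    days, rem = divmod(seconds, 86400)
    hours, rem = divmod(rem, 3600)
    minutes, secs = divmod(rem, 60)
    return "{:02d}:{:02d}:{:02d}:{:02d}".format(days, hours, minutes, secs)

test_timeout = int(3600 * 0.001)  # a few seconds here; soak normally runs for hours
end_time = time.time() + test_timeout
loop = 1
while time.time() < end_time:
    print("loop {}: time until done {}".format(loop, ddhhmmss(end_time - time.time())))
    time.sleep(1)  # stands in for one soak pass
    loop += 1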
def setUp(self):
    """Define test setup to be done."""
    self.log.info("<<setUp Started>> at %s", time.ctime())
    super(SoakTestBase, self).setUp()
    self.username = getuser()
    # Initialize loop param for all tests
    self.loop = 1
    self.exclude_slurm_nodes = []
    # Setup logging directories for soak logfiles
    # self.output dir is an avocado directory .../data/
    self.log_dir = get_log_file("soak")
    self.outputsoakdir = self.outputdir + "/soak"
    # Create the remote log directories on all client nodes
    self.test_log_dir = self.log_dir + "/pass" + str(self.loop)
    self.local_pass_dir = self.outputsoakdir + "/pass" + str(self.loop)
    self.sharedlog_dir = self.tmp + "/soak"
    self.sharedsoakdir = self.sharedlog_dir + "/pass" + str(self.loop)
    # Fail if slurm partition is not defined
    if not self.client_partition:
        raise SoakTestError(
            "<<FAILED: Partition is not correctly setup for daos "
            "slurm partition>>")
    # Check if the server nodes are in the client list;
    # this will happen when only one partition is specified
    for host_server in self.hostlist_servers:
        if host_server in self.hostlist_clients:
            self.hostlist_clients.remove(host_server)
            self.exclude_slurm_nodes.append(host_server)
    # Include test node for log cleanup; remove from client list
    local_host_list = include_local_host(None)
    self.exclude_slurm_nodes.extend(local_host_list)
    if local_host_list[0] in self.hostlist_clients:
        self.hostlist_clients.remove(local_host_list[0])
    # Check if requested reservation is allowed in partition
    # NOTE: Slurm reservation and partition are created before soak runs.
    # CI uses partition=daos_client and no reservation.
    # A21 uses partition=normal/default and reservation=daos-test.
    # Partition and reservation names are updated in the yaml file.
    # It is assumed that if there is no reservation (CI only), then all
    # the nodes in the partition will be used for soak.
    self.srun_params = {"partition": self.client_partition}
    slurm_reservation = self.params.get(
        "reservation", "/run/srun_params/*")
    if slurm_reservation is not None:
        # verify that the reservation is valid
        reserved_nodes = slurm_utils.get_reserved_nodes(
            slurm_reservation, self.client_partition)
        if not reserved_nodes:
            # client nodes are invalid for requested reservation
            self.hostlist_clients = []
            raise SoakTestError(
                "<<FAILED: Reservation {} is invalid "
                "in partition {}>>".format(
                    slurm_reservation, self.client_partition))
        # update srun params
        self.srun_params["reservation"] = slurm_reservation
    self.log.info(
        "<<Updated hostlist_clients %s >>", self.hostlist_clients)
    if not self.hostlist_clients:
        self.fail(
            "There are no valid nodes in this partition to run "
            "soak. Check partition {} for valid nodes".format(
                self.client_partition))
def verify_client_run(self, exp_iface, env):
    """Verify the interface assigned by running a libdaos client.

    Args:
        exp_iface (str): expected interface to check.
        env (bool): add OFI_INTERFACE variable to exported variables of
            client command.

    Returns:
        bool: returns status

    """
    clients = self.agent_managers[0].hosts

    # Get counter values for hfi devices before and after
    port_info_before = self.get_port_cnt(clients, "port_rcv_data")

    # get the dmg config file for daos_racer
    dmg = self.get_dmg_command()

    # Let's run daos_racer as a client
    daos_racer = DaosRacerCommand(self.bin, clients[0], dmg)
    daos_racer.get_params(self)

    # Update env_name list to add OFI_INTERFACE if needed.
    if env:
        daos_racer.update_env_names(["OFI_INTERFACE"])

    # Setup the environment and logfile
    log_file = "daos_racer_{}_{}.log".format(exp_iface, env)

    # Add FI_LOG_LEVEL to get more info on device issues
    racer_env = daos_racer.get_environment(self.server_managers[0], log_file)
    racer_env["FI_LOG_LEVEL"] = "info"
    racer_env["D_LOG_MASK"] = "INFO,object=ERR,placement=ERR"
    daos_racer.set_environment(racer_env)

    # Run client
    daos_racer.run()

    # Verify the port count to check which interface CaRT initialized with.
    port_info_after = self.get_port_cnt(clients, "port_rcv_data")

    self.log.info("Client interface port_rcv_data counters")
    msg_format = "%16s %9s %9s %9s %s"
    self.log.info(
        msg_format, "Host(s)", "Interface", "Before", "After", "Difference")
    self.log.info(
        msg_format, "-" * 16, "-" * 9, "-" * 9, "-" * 9, "-" * 9)
    no_traffic = set()
    for interface in sorted(port_info_before):
        for host in sorted(port_info_before[interface]):
            before = port_info_before[interface][host][1]["port_rcv_data"]
            try:
                after = port_info_after[interface][host][1]["port_rcv_data"]
                diff = int(after) - int(before)
                if diff <= 0:
                    no_traffic.add(interface)
            except (KeyError, ValueError) as error:
                after = "Error"
                diff = "Unknown - {}".format(error)
                no_traffic.add(interface)
            self.log.info(msg_format, host, interface, before, after, diff)

    # Read daos.log to verify device used and prevent false positives
    self.assertTrue(
        self.get_log_info(clients, exp_iface, env, get_log_file(log_file)))

    # If we don't see data going through the device, fail
    for interface in no_traffic:
        self.log.info(
            "No client traffic seen through device: %s", interface)
    return len(no_traffic) != len(self.interfaces)
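# --- Illustration (not part of the test harness) ---------------------------
# The counter comparison above flags an interface when its port_rcv_data value
# did not grow on some host between the two samples. A standalone sketch using
# a simplified {interface: {host: count}} layout instead of the nested
# structure returned by get_port_cnt(); host and counter values are made up:
def idle_interfaces(before, after):
    """Return the interfaces whose receive counter failed to increase on some host."""
    no_traffic = set()
    for interface, hosts in before.items():
        for host, count_before in hosts.items():
            count_after = after.get(interface, {}).get(host, count_before)
            if int(count_after) - int(count_before) <= 0:
                no_traffic.add(interface)
    return no_traffic

before = {"ib0": {"node-1": 1000}, "ib1": {"node-1": 500}}
after = {"ib0": {"node-1": 52000}, "ib1": {"node-1": 500}}
print(idle_interfaces(before, after))
# {'ib1'}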
def _prepare_dmg_certificates(self):
    """Set up dmg certificates."""
    local_host = socket.gethostname().split('.', 1)[0]
    self.dmg.copy_certificates(
        get_log_file("daosCA/certs"), local_host.split())
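# --- Illustration (not part of the test harness) ---------------------------
# copy_certificates() expects a list of hosts, so the helper above trims the
# FQDN down to its short name and then wraps it in a one-element list via
# str.split(). A quick demonstration of both steps; the printed hostname
# depends on the machine running it:
import socket

fqdn = socket.gethostname()          # fully qualified name, e.g. "host.example.com"
short_name = fqdn.split('.', 1)[0]   # short name, e.g. "host"
host_list = short_name.split()       # one-element host list, e.g. ["host"]
print(short_name, host_list)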
def run_soak(self, test_param):
    """Run the soak test specified by the test params.

    Args:
        test_param (str): test_params from yaml file
    """
    self.soak_results = {}
    self.pool = []
    self.container = []
    self.harasser_results = {}
    self.harasser_args = {}
    run_harasser = False
    self.all_failed_jobs = []
    self.all_failed_harassers = []
    self.soak_errors = []
    self.check_errors = []
    self.used = []
    test_to = self.params.get("test_timeout", test_param + "*")
    self.test_name = self.params.get("name", test_param + "*")
    single_test_pool = self.params.get(
        "single_test_pool", test_param + "*", True)
    self.dmg_command.copy_certificates(
        get_log_file("daosCA/certs"), self.hostlist_clients)
    self.dmg_command.copy_configuration(self.hostlist_clients)
    harassers = self.params.get("harasserlist", test_param + "*")
    job_list = self.params.get("joblist", test_param + "*")
    if harassers:
        run_harasser = True
        self.log.info("<< Initial harasser list = %s>>", harassers)
        harasserlist = harassers[:]

    # Create the reserved pool with data
    # self.pool is a list of all the pools used in soak
    # self.pool[0] will always be the reserved pool
    add_pools(self, ["pool_reserved"])

    # Create the reserved container
    resv_cont = self.get_container(
        self.pool[0], "/run/container_reserved/*", True)

    # populate reserved container with a 500MB file
    initial_resv_file = os.path.join(
        os.environ["DAOS_TEST_LOG_DIR"], "initial", "resv_file")
    try:
        reserved_file_copy(
            self, initial_resv_file, self.pool[0], resv_cont,
            num_bytes=500000000, cmd="write")
    except CommandFailure as error:
        raise SoakTestError(
            "<<FAILED: Soak reserved container write failed>>") from error

    # Create pool for jobs
    if single_test_pool:
        add_pools(self, ["pool_jobs"])
        self.log.info(
            "Current pools: %s",
            " ".join([pool.uuid for pool in self.pool]))

    # cleanup soak log directories before test on all nodes
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "rm -rf {}".format(self.log_dir), self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Soak directories not removed "
            "from clients>>: {}".format(self.hostlist_clients))
    # cleanup test_node
    for log_dir in [self.log_dir, self.sharedlog_dir]:
        cmd = "rm -rf {}".format(log_dir)
        try:
            result = run_command(cmd, timeout=30)
        except DaosTestError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory {} was not removed>>".format(
                    log_dir)) from error

    # Initialize time
    start_time = time.time()
    self.test_timeout = int(3600 * test_to)
    self.end_time = start_time + self.test_timeout
    self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
    while time.time() < self.end_time:
        # Start new pass
        start_loop_time = time.time()
        self.log.info(
            "<<SOAK LOOP %s: time until done %s>>", self.loop,
            DDHHMMSS_format(self.end_time - time.time()))
        if not single_test_pool:
            # Create pool for jobs
            add_pools(self, ["pool_jobs"])
            self.log.info(
                "Current pools: %s",
                " ".join([pool.uuid for pool in self.pool]))
        # Initialize harassers
        if run_harasser:
            if not harasserlist:
                harasserlist = harassers[:]
            harasser = harasserlist.pop(0)
            self.harasser_args = {}
            self.harasser_results = {}
            self.harassers, self.offline_harassers = get_harassers(harasser)
        try:
            self.execute_jobs(job_list, self.pool[1])
        except SoakTestError as error:
            self.fail(error)
        # Check space after jobs done
        for pool in self.pool:
            self.dmg_command.pool_query(pool.uuid)
        self.soak_errors.extend(self.destroy_containers(self.container))
        self.container = []
        # Remove the test pools from self.pool; preserving reserved pool
        if not single_test_pool:
            self.soak_errors.extend(self.destroy_pools(self.pool[1]))
            self.pool = [self.pool[0]]
        self.log.info(
            "Current pools: %s",
            " ".join([pool.uuid for pool in self.pool]))
        # Fail if the pool/containers did not clean up correctly
        self.assertEqual(
            len(self.soak_errors), 0, "\n".join(self.soak_errors))
        # Break out of loop if smoke
        if "smoke" in self.test_name:
            break
        loop_time = time.time() - start_loop_time
        self.log.info(
            "<<LOOP %s completed in %s at %s>>", self.loop,
            DDHHMMSS_format(loop_time), time.ctime())
        # Initialize harasser loop time from first pass loop time
        if self.loop == 1 and run_harasser:
            self.harasser_loop_time = loop_time
        self.loop += 1

    # verify reserved container data
    final_resv_file = os.path.join(
        os.environ["DAOS_TEST_LOG_DIR"], "final", "resv_file")
    try:
        reserved_file_copy(self, final_resv_file, self.pool[0], resv_cont)
    except CommandFailure as error:
        raise SoakTestError(
            "<<FAILED: Soak reserved container read failed>>") from error

    if not cmp(initial_resv_file, final_resv_file):
        self.soak_errors.append(
            "Data verification error on reserved pool"
            " after SOAK completed")
    for file in [initial_resv_file, final_resv_file]:
        if os.path.isfile(file):
            file_name = os.path.split(os.path.dirname(file))[-1]
            # save a copy of the POSIX file in self.outputsoakdir
            copy_cmd = "cp -p {} {}/{}_resv_file".format(
                file, self.outputsoakdir, file_name)
            try:
                run_command(copy_cmd, timeout=30)
            except DaosTestError as error:
                self.soak_errors.append(
                    "Reserved data file {} failed to archive".format(file))
            os.remove(file)
    self.container.append(resv_cont)

    # Gather the daos logs from the client nodes
    self.log.info(
        "<<<<SOAK TOTAL TEST TIME = %s>>>>",
        DDHHMMSS_format(time.time() - start_time))
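# --- Illustration (not part of the test harness) ---------------------------
# run_soak() above verifies the reserved container by comparing the file
# written before the loop with the copy read back afterwards; the cmp() call
# is assumed to be filecmp.cmp. A self-contained round-trip sketch with
# temporary files standing in for the reserved-container copies:
import filecmp
import os
import tempfile

with tempfile.TemporaryDirectory() as tmp_dir:
    initial = os.path.join(tmp_dir, "initial_resv_file")
    final = os.path.join(tmp_dir, "final_resv_file")
    for path in (initial, final):
        with open(path, "wb") as handle:
            handle.write(b"\x5a" * 4096)  # identical payloads stand in for the 500MB file
    # shallow=False forces a byte-by-byte comparison rather than a stat() check
    print(filecmp.cmp(initial, final, shallow=False))
    # True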