예제 #1
0
    def pre_tear_down(self):
        """Tear down any test-specific steps prior to running tearDown().

        Returns:
            list: a list of error strings to report after all tear down
            steps have been attempted

        """
        errors = []
        # clear out any jobs in squeue;
        if self.failed_job_id_list:
            job_id = " ".join([str(job) for job in self.failed_job_id_list])
            self.log.info("<<Cancel jobs in queue with ids %s >>", job_id)
            try:
                run_command("scancel --partition {} -u {} {}".format(
                    self.client_partition, self.username, job_id))
            except DaosTestError as error:
                # Exception was raised due to a non-zero exit status
                errors.append("Failed to cancel jobs {}: {}".format(
                    self.failed_job_id_list, error))
        if self.all_failed_jobs:
            errors.append("SOAK FAILED: The following jobs failed {} ".format(
                " ,".join(str(j_id) for j_id in self.all_failed_jobs)))
        # Check if any dfuse mount points need to be cleaned
        if self.dfuse:
            try:
                cleanup_dfuse(self)
            except SoakTestError as error:
                self.log.info("Dfuse cleanup failed with %s", error)

        # daos_agent is always started on this node when start agent is false
        if not self.setup_start_agents:
            self.hostlist_clients = [socket.gethostname().split('.', 1)[0]]
        return errors
예제 #2
0
    def run_soak(self, test_param):
        """Run the soak test specified by the test params.

        Args:
            test_param (str): test_params from yaml file

        """
        self.soak_results = {}
        self.pool = []
        self.container = []
        self.harasser_results = {}
        self.harasser_args = {}
        run_harasser = False
        self.all_failed_jobs = []
        self.all_failed_harassers = []
        self.soak_errors = []
        self.check_errors = []
        self.used = []
        test_to = self.params.get("test_timeout", test_param + "*")
        self.test_name = self.params.get("name", test_param + "*")
        single_test_pool = self.params.get(
            "single_test_pool", test_param + "*", True)
        harassers = self.params.get("harasserlist", test_param + "*")
        job_list = self.params.get("joblist", test_param + "*")
        if harassers:
            run_harasser = True
            self.log.info("<< Initial harasser list = %s>>", harassers)
            harasserlist = harassers[:]
        # Create the reserved pool with data
        # self.pool is a list of all the pools used in soak
        # self.pool[0] will always be the reserved pool
        add_pools(self, ["pool_reserved"])
        # Create the reserved container
        resv_cont = self.get_container(
            self.pool[0], "/run/container_reserved/*", True)
        # populate reserved container with a 500MB file
        initial_resv_file = os.path.join(
            os.environ["DAOS_TEST_LOG_DIR"], "initial", "resv_file")
        try:
            reserved_file_copy(self, initial_resv_file, self.pool[0], resv_cont,
                               num_bytes=500000000, cmd="write")
        except CommandFailure as error:
            self.fail(error)

        # Create pool for jobs
        if single_test_pool:
            add_pools(self, ["pool_jobs"])
            self.log.info(
                "Current pools: %s",
                " ".join([pool.uuid for pool in self.pool]))

        # cleanup soak log directories before test on all nodes
        result = slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients), "rm -rf {}".format(
                self.log_dir), self.srun_params)
        if result.exit_status > 0:
            raise SoakTestError(
                "<<FAILED: Soak directories not removed"
                "from clients>>: {}".format(self.hostlist_clients))
        # cleanup test_node
        for log_dir in [self.log_dir, self.sharedlog_dir]:
            cmd = "rm -rf {}".format(log_dir)
            try:
                result = run_command(cmd, timeout=30)
            except DaosTestError as error:
                raise SoakTestError(
                    "<<FAILED: Soak directory {} was not removed>>".format(
                        log_dir)) from error
        # Baseline metrics data
        run_metrics_check(self, prefix="initial")
        # Initialize time
        start_time = time.time()
        self.test_timeout = int(3600 * test_to)
        self.end_time = start_time + self.test_timeout
        self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
        while time.time() < self.end_time:
            # Start new pass
            start_loop_time = time.time()
            self.log.info(
                "<<SOAK LOOP %s: time until done %s>>", self.loop,
                DDHHMMSS_format(self.end_time - time.time()))
            if not single_test_pool:
                # Create pool for jobs
                add_pools(self, ["pool_jobs"])
                self.log.info(
                    "Current pools: %s",
                    " ".join([pool.uuid for pool in self.pool]))
            # Initialize harassers
            if run_harasser:
                if not harasserlist:
                    harasserlist = harassers[:]
                harasser = harasserlist.pop(0)
                self.harasser_args = {}
                self.harasser_results = {}
                self.harassers, self.offline_harassers = get_harassers(harasser)
            try:
                self.execute_jobs(job_list, self.pool[1])
            except SoakTestError as error:
                self.fail(error)
            # Check space after jobs done
            for pool in self.pool:
                self.dmg_command.pool_query(pool.uuid)
            # Cleanup any dfuse mounts before destroying containers
            cleanup_dfuse(self)
            self.soak_errors.extend(self.destroy_containers(self.container))
            self.container = []
            # Remove the test pools from self.pool; preserving reserved pool
            if not single_test_pool:
                self.soak_errors.extend(self.destroy_pools(self.pool[1]))
                self.pool = [self.pool[0]]
            self.log.info(
                "Current pools: %s",
                " ".join([pool.uuid for pool in self.pool]))
            # Fail if the pool/containers did not clean up correctly
            self.assertEqual(
                len(self.soak_errors), 0, "\n".join(self.soak_errors))
            # Break out of loop if smoke
            if "smoke" in self.test_name:
                break
            loop_time = time.time() - start_loop_time
            self.log.info(
                "<<LOOP %s completed in %s at %s>>", self.loop, DDHHMMSS_format(
                    loop_time), time.ctime())
            # Initialize harasser loop time from first pass loop time
            if self.loop == 1 and run_harasser:
                self.harasser_loop_time = loop_time
            self.loop += 1
        # verify reserved container data
        final_resv_file = os.path.join(
            os.environ["DAOS_TEST_LOG_DIR"], "final", "resv_file")
        try:
            reserved_file_copy(self, final_resv_file, self.pool[0], resv_cont)
        except CommandFailure as error:
            self.soak_errors.append(
                "<<FAILED: Soak reserved container read failed>>")

        if not cmp(initial_resv_file, final_resv_file):
            self.soak_errors.append("Data verification error on reserved pool"
                                    " after SOAK completed")
        for file in [initial_resv_file, final_resv_file]:
            if os.path.isfile(file):
                file_name = os.path.split(os.path.dirname(file))[-1]
                # save a copy of the POSIX file in self.outputsoakdir
                copy_cmd = "cp -p {} {}/{}_resv_file".format(
                    file, self.outputsoakdir, file_name)
                try:
                    run_command(copy_cmd, timeout=30)
                except DaosTestError as error:
                    self.soak_errors.append(
                        "Reserved data file {} failed to archive".format(file))
                os.remove(file)
        self.container.append(resv_cont)
        # Gather the daos logs from the client nodes
        self.log.info(
            "<<<<SOAK TOTAL TEST TIME = %s>>>>", DDHHMMSS_format(
                time.time() - start_time))