def start_dfuse(self, pool):
    """Create a DfuseCommand object to start dfuse.

    Args:
        pool (obj): TestPool obj
    """
    # Get dfuse params
    self.dfuse = Dfuse(self.hostlist_clients, self.tmp)
    self.dfuse.get_params(self)
    # update dfuse params
    self.dfuse.set_dfuse_params(pool)
    self.dfuse.set_dfuse_cont_param(self.create_dfuse_cont(pool))
    self.dfuse.set_dfuse_exports(self.server_managers[0], self.client_log)
    # create dfuse mount point
    cmd = "mkdir -p {}".format(self.dfuse.mount_dir.value)
    params = self.srun_params
    params["export"] = "all"
    params["ntasks-per-node"] = 1
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients), cmd, params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Dfuse mountpoint {} not created>>".format(
                self.dfuse.mount_dir.value))
    cmd = str(self.dfuse)
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients), cmd, params)
    if result.exit_status > 0:
        raise SoakTestError("<<FAILED: Dfuse failed to start>>")
def get_remote_logs(self):
    """Copy files from remote dir to local dir.

    Raises:
        SoakTestError: if there is an error with the remote copy
    """
    # copy the files from the remote
    # TO-DO: change scp
    this_host = socket.gethostname()
    command = "/usr/bin/rsync -avtr --min-size=1B {0} {1}:{0}/..".format(
        self.test_log_dir, this_host)
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients), command, self.srun_params)
    if result.exit_status == 0:
        command = "/usr/bin/cp -R -p {0}/ \'{1}\'".format(
            self.test_log_dir, self.outputsoakdir)
        try:
            run_command(command, timeout=30)
        except DaosTestError as error:
            raise SoakTestError(
                "<<FAILED: Soak remote logfiles not copied to avocado data "
                "dir {} - check /tmp/soak on nodes {}>>".format(
                    error, self.hostlist_clients))
        command = "/usr/bin/rm -rf {0}/*".format(self.test_log_dir)
        slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients), command,
            self.srun_params)
        run_command(command)
    else:
        raise SoakTestError(
            "<<FAILED: Soak remote logfiles not copied from clients>>: "
            "{}".format(self.hostlist_clients))
def cleanup_dfuse(self):
    """Cleanup and remove any dfuse mount points.

    Args:
        self (obj): soak obj
    """
    # kill any dfuse processes still running on the clients
    cmd = [
        "/usr/bin/bash -c 'for pid in $(pgrep dfuse)",
        "do sudo kill $pid",
        "done'"
    ]
    # unmount and remove any dfuse mount points left behind
    cmd2 = [
        "/usr/bin/bash -c 'for dir in $(find /tmp/daos_dfuse/)",
        "do fusermount3 -uz $dir",
        "rm -rf $dir",
        "done'"
    ]
    try:
        slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients),
            ";".join(cmd), self.srun_params, timeout=180)
    except slurm_utils.SlurmFailed as error:
        self.log.info("Dfuse processes not stopped: %s", error)
    try:
        slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients),
            ";".join(cmd2), self.srun_params, timeout=180)
    except slurm_utils.SlurmFailed as error:
        self.log.info("Dfuse mountpoints not deleted: %s", error)
def cleanup_dfuse(self):
    """Cleanup and remove any dfuse mount points."""
    cmd = [
        "/usr/bin/bash -c 'pkill dfuse",
        "for dir in /tmp/daos_dfuse*",
        "do fusermount3 -uz $dir",
        "rm -rf $dir",
        "done'"
    ]
    try:
        slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients),
            ";".join(cmd), self.srun_params)
    except slurm_utils.SlurmFailed as error:
        self.log.info(
            "<<FAILED: Dfuse directories not deleted %s >>", error)
def get_remote_logs(self):
    """Copy files from remote dir to local dir.

    Raises:
        SoakTestError: if there is an error with the remote copy
    """
    # copy the files from the remote
    # TO-DO: change scp
    this_host = socket.gethostname()
    rsync_str = "rsync -avtr --min-size=1B"
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "bash -c \"{0} {1} {2}:{1}/.. && rm -rf {1}/*\"".format(
            rsync_str, self.test_log_dir, this_host),
        self.srun_params)
    if result.exit_status == 0:
        cmd = "cp -R -p {0}/ \'{1}\'; rm -rf {0}/*".format(
            self.test_log_dir, self.outputsoakdir)
        try:
            result = process.run(cmd, shell=True, timeout=30)
        except process.CmdError as error:
            raise SoakTestError(
                "<<FAILED: Soak remote logfiles not copied "
                "to avocado data dir {} - check /tmp/soak "
                "on nodes {}>>".format(error, self.hostlist_clients))
    else:
        raise SoakTestError(
            "<<FAILED: Soak remote logfiles not copied "
            "from clients>>: {}".format(self.hostlist_clients))
def execute_jobs(self, jobs, pools):
    """Execute the overall soak test.

    Args:
        jobs (list): list of jobs to run
        pools (list): list of TestPool obj - self.pool[1:]

    Raises:
        SoakTestError
    """
    cmdlist = []
    # unique numbers per pass
    self.used = []
    # Update the remote log directories from new loop/pass
    self.sharedsoakdir = self.sharedlog_dir + "/pass" + str(self.loop)
    self.test_log_dir = self.log_dir + "/pass" + str(self.loop)
    local_pass_dir = self.outputsoakdir + "/pass" + str(self.loop)
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "mkdir -p {}".format(self.test_log_dir), self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: logfile directory not "
            "created on clients>>: {}".format(self.hostlist_clients))
    # Create local log directory
    os.makedirs(local_pass_dir)
    os.makedirs(self.sharedsoakdir)
    # Setup cmdlines for job with specified pool
    if len(pools) < len(jobs):
        raise SoakTestError(
            "<<FAILED: There are not enough pools to run this test>>")
    for index, job in enumerate(jobs):
        cmdlist.extend(self.job_setup(job, pools[index]))
    # Gather the job_ids
    job_id_list = self.job_startup(cmdlist)
    # Initialize the failed_job_list to job_list so that any
    # unexpected failures will clear the squeue in tearDown
    self.failed_job_id_list = job_id_list
    # launch harassers if defined and enabled
    if self.h_list and self.loop > 1:
        self.log.info("<<Harassers are enabled>>")
        self.launch_harassers(self.h_list, pools)
        if not self.harasser_completion(self.harasser_timeout):
            raise SoakTestError("<<FAILED: Harassers failed>>")
        # rebuild can only run once for now
        if self.is_harasser("rebuild"):
            self.h_list.remove("rebuild")
    # Wait for jobs to finish and cancel/kill jobs if necessary
    self.failed_job_id_list = self.job_completion(job_id_list)
    # Log the failing job ID
    if self.failed_job_id_list:
        self.log.info(
            "<<FAILED: The following jobs failed %s >>",
            ", ".join(str(j_id) for j_id in self.failed_job_id_list))
        # accumulate failing job IDs
        self.all_failed_jobs.extend(self.failed_job_id_list)
        # clear out the failed jobs for this pass
        self.failed_job_id_list = []
def get_remote_logs(self):
    """Copy files from remote dir to local dir.

    Args:
        self (obj): soak obj

    Raises:
        SoakTestError: if there is an error with the remote copy
    """
    # copy the files from the client nodes to a shared directory
    command = "/usr/bin/rsync -avtr --min-size=1B {0} {1}/..".format(
        self.test_log_dir, self.sharedsoakdir)
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients), command, self.srun_params)
    if result.exit_status == 0:
        # copy the local logs and the logs in the shared dir to avocado dir
        for directory in [self.test_log_dir, self.sharedsoakdir]:
            command = "/usr/bin/cp -R -p {0}/ \'{1}\'".format(
                directory, self.outputsoakdir)
            try:
                run_command(command, timeout=30)
            except DaosTestError as error:
                raise SoakTestError(
                    "<<FAILED: job logs failed to copy {}>>".format(error))
        # remove the remote soak logs for this pass
        command = "/usr/bin/rm -rf {0}".format(self.test_log_dir)
        slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients), command,
            self.srun_params)
        # remove the local log for this pass
        for directory in [self.test_log_dir, self.sharedsoakdir]:
            command = "/usr/bin/rm -rf {0}".format(directory)
            try:
                run_command(command)
            except DaosTestError as error:
                raise SoakTestError(
                    "<<FAILED: job logs failed to delete {}>>".format(error))
    else:
        raise SoakTestError(
            "<<FAILED: Soak remote logfiles not copied "
            "from clients>>: {}".format(self.hostlist_clients))
def execute_jobs(self, jobs, pools):
    """Execute the overall soak test.

    Args:
        jobs (list): list of jobs to run
        pools (list): list of TestPool obj - self.pool[1:]

    Raises:
        SoakTestError
    """
    # Update the remote log directories from new loop/pass
    self.sharedsoaktest_dir = self.sharedsoak_dir + "/pass" + str(self.loop)
    self.soaktest_dir = self.soak_dir + "/pass" + str(self.loop)
    outputsoaktest_dir = self.outputsoak_dir + "/pass" + str(self.loop)
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "mkdir -p {}".format(self.soaktest_dir), self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: logfile directory not "
            "created on clients>>: {}".format(self.hostlist_clients))
    # Create local avocado log directory for this pass
    os.makedirs(outputsoaktest_dir)
    # Create shared log directory for this pass
    os.makedirs(self.sharedsoaktest_dir)
    # Create local test log directory for this pass
    os.makedirs(self.soaktest_dir)
    # create the batch scripts
    job_script_list = self.job_setup(jobs, pools)
    # randomize job list
    random.seed(4)
    random.shuffle(job_script_list)
    # Gather the job_ids
    job_id_list = self.job_startup(job_script_list)
    # Initialize the failed_job_list to job_list so that any
    # unexpected failures will clear the squeue in tearDown
    self.failed_job_id_list = job_id_list
    # Wait for jobs to finish and cancel/kill jobs if necessary
    self.failed_job_id_list = self.job_completion(job_id_list)
    # Log the failing job ID
    if self.failed_job_id_list:
        self.log.info(
            "<<FAILED: The following jobs failed %s >>",
            ", ".join(str(j_id) for j_id in self.failed_job_id_list))
        # accumulate failing job IDs
        self.all_failed_jobs.extend(self.failed_job_id_list)
        # clear out the failed jobs for this pass
        self.failed_job_id_list = []
def run_soak(self, test_param):
    """Run the soak test specified by the test params.

    Args:
        test_param (str): test_params from yaml file
    """
    self.soak_results = {}
    self.pool = []
    self.container = []
    self.harasser_results = {}
    self.harasser_args = {}
    run_harasser = False
    self.all_failed_jobs = []
    self.all_failed_harassers = []
    self.soak_errors = []
    self.check_errors = []
    self.used = []
    test_to = self.params.get("test_timeout", test_param + "*")
    self.test_name = self.params.get("name", test_param + "*")
    single_test_pool = self.params.get(
        "single_test_pool", test_param + "*", True)
    self.dmg_command.copy_certificates(
        get_log_file("daosCA/certs"), self.hostlist_clients)
    self.dmg_command.copy_configuration(self.hostlist_clients)
    harassers = self.params.get("harasserlist", test_param + "*")
    job_list = self.params.get("joblist", test_param + "*")
    if harassers:
        run_harasser = True
        self.log.info("<< Initial harasser list = %s>>", harassers)
        harasserlist = harassers[:]
    # Create the reserved pool with data
    # self.pool is a list of all the pools used in soak
    # self.pool[0] will always be the reserved pool
    add_pools(self, ["pool_reserved"])
    # Create the reserved container
    resv_cont = self.get_container(
        self.pool[0], "/run/container_reserved/*", True)
    # populate reserved container with a 500MB file
    initial_resv_file = os.path.join(
        os.environ["DAOS_TEST_LOG_DIR"], "initial", "resv_file")
    try:
        reserved_file_copy(self, initial_resv_file, self.pool[0], resv_cont,
                           num_bytes=500000000, cmd="write")
    except CommandFailure as error:
        raise SoakTestError(
            "<<FAILED: Soak reserved container write failed>>") from error
    # Create pool for jobs
    if single_test_pool:
        add_pools(self, ["pool_jobs"])
        self.log.info("Current pools: %s",
                      " ".join([pool.uuid for pool in self.pool]))
    # cleanup soak log directories before test on all nodes
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "rm -rf {}".format(self.log_dir), self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Soak directories not removed "
            "from clients>>: {}".format(self.hostlist_clients))
    # cleanup test_node
    for log_dir in [self.log_dir, self.sharedlog_dir]:
        cmd = "rm -rf {}".format(log_dir)
        try:
            result = run_command(cmd, timeout=30)
        except DaosTestError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory {} was not removed>>".format(
                    log_dir)) from error
    # Initialize time
    start_time = time.time()
    self.test_timeout = int(3600 * test_to)
    self.end_time = start_time + self.test_timeout
    self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
    while time.time() < self.end_time:
        # Start new pass
        start_loop_time = time.time()
        self.log.info(
            "<<SOAK LOOP %s: time until done %s>>", self.loop,
            DDHHMMSS_format(self.end_time - time.time()))
        if not single_test_pool:
            # Create pool for jobs
            add_pools(self, ["pool_jobs"])
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
        # Initialize harassers
        if run_harasser:
            if not harasserlist:
                harasserlist = harassers[:]
            harasser = harasserlist.pop(0)
            self.harasser_args = {}
            self.harasser_results = {}
            self.harassers, self.offline_harassers = get_harassers(harasser)
        try:
            self.execute_jobs(job_list, self.pool[1])
        except SoakTestError as error:
            self.fail(error)
        # Check space after jobs done
        for pool in self.pool:
            self.dmg_command.pool_query(pool.uuid)
        self.soak_errors.extend(self.destroy_containers(self.container))
        self.container = []
        # Remove the test pools from self.pool; preserving reserved pool
        if not single_test_pool:
            self.soak_errors.extend(self.destroy_pools(self.pool[1]))
            self.pool = [self.pool[0]]
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
        # Fail if the pool/containers did not clean up correctly
        self.assertEqual(
            len(self.soak_errors), 0, "\n".join(self.soak_errors))
        # Break out of loop if smoke
        if "smoke" in self.test_name:
            break
        loop_time = time.time() - start_loop_time
        self.log.info(
            "<<LOOP %s completed in %s at %s>>", self.loop,
            DDHHMMSS_format(loop_time), time.ctime())
        # Initialize harasser loop time from first pass loop time
        if self.loop == 1 and run_harasser:
            self.harasser_loop_time = loop_time
        self.loop += 1
    # verify reserved container data
    final_resv_file = os.path.join(
        os.environ["DAOS_TEST_LOG_DIR"], "final", "resv_file")
    try:
        reserved_file_copy(self, final_resv_file, self.pool[0], resv_cont)
    except CommandFailure as error:
        raise SoakTestError(
            "<<FAILED: Soak reserved container read failed>>") from error
    if not cmp(initial_resv_file, final_resv_file):
        self.soak_errors.append(
            "Data verification error on reserved pool after SOAK completed")
    for file in [initial_resv_file, final_resv_file]:
        if os.path.isfile(file):
            file_name = os.path.split(os.path.dirname(file))[-1]
            # save a copy of the POSIX file in self.outputsoakdir
            copy_cmd = "cp -p {} {}/{}_resv_file".format(
                file, self.outputsoakdir, file_name)
            try:
                run_command(copy_cmd, timeout=30)
            except DaosTestError as error:
                self.soak_errors.append(
                    "Reserved data file {} failed to archive".format(file))
            os.remove(file)
    self.container.append(resv_cont)
    # Gather the daos logs from the client nodes
    self.log.info(
        "<<<<SOAK TOTAL TEST TIME = %s>>>>",
        DDHHMMSS_format(time.time() - start_time))
def run_soak(self, test_param):
    """Run the soak test specified by the test params.

    Args:
        test_param (str): test_params from yaml file
    """
    self.soak_results = {}
    self.pool = []
    self.container = []
    self.harasser_results = {}
    self.harasser_args = {}
    run_harasser = False
    self.all_failed_jobs = []
    self.all_failed_harassers = []
    self.soak_errors = []
    test_to = self.params.get("test_timeout", test_param + "*")
    self.job_timeout = self.params.get("job_timeout", test_param + "*")
    self.test_name = self.params.get("name", test_param + "*")
    self.nodesperjob = self.params.get("nodesperjob", test_param + "*")
    self.taskspernode = self.params.get("taskspernode", test_param + "*")
    harassers = self.params.get("harasserlist", test_param + "*")
    job_list = self.params.get("joblist", test_param + "*")
    rank = self.params.get("rank", "/run/container_reserved/*")
    obj_class = self.params.get("oclass", "/run/container_reserved/*")
    if harassers:
        harasserlist = get_harassers(harassers)
        self.harassers = harasserlist[:]
        run_harasser = True
        self.log.info("<< Initial harasser list = %s>>",
                      " ".join([harasser for harasser in self.harassers]))
    # Create the reserved pool with data
    # self.pool is a list of all the pools used in soak
    # self.pool[0] will always be the reserved pool
    add_pools(self, ["pool_reserved"])
    self.pool[0].connect()
    # Create the container and populate with a known data
    # TO-DO: use IOR to write and later read verify the data
    resv_cont = self.get_container(
        self.pool[0], "/run/container_reserved/*", True)
    resv_cont.write_objects(rank, obj_class)
    # cleanup soak log directories before test on all nodes
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "rm -rf {}".format(self.log_dir), self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Soak directories not removed "
            "from clients>>: {}".format(self.hostlist_clients))
    # cleanup test_node
    for log_dir in [self.log_dir, self.sharedlog_dir]:
        cmd = "rm -rf {}".format(log_dir)
        try:
            result = run_command(cmd, timeout=30)
        except DaosTestError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory {} was not removed {}>>".format(
                    log_dir, error))
    # Initialize time
    start_time = time.time()
    self.test_timeout = int(3600 * test_to)
    self.end_time = start_time + self.test_timeout
    self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
    while time.time() < self.end_time:
        # Start new pass
        start_loop_time = time.time()
        self.log.info(
            "<<SOAK LOOP %s: time until done %s>>", self.loop,
            DDHHMMSS_format(self.end_time - time.time()))
        # Create pool for jobs
        add_pools(self, ["pool_jobs"])
        self.log.info("Current pools: %s",
                      " ".join([pool.uuid for pool in self.pool]))
        # Initialize harassers
        if run_harasser and not self.harassers:
            self.harasser_results = {}
            self.harasser_args = {}
            self.harassers = harasserlist[:]
        try:
            self.execute_jobs(job_list, self.pool[1])
        except SoakTestError as error:
            self.fail(error)
        # Check space after jobs done
        for pool in self.pool:
            self.dmg_command.pool_query(pool.uuid)
        self.soak_errors.extend(self.destroy_containers(self.container))
        self.soak_errors.extend(self.destroy_pools(self.pool[1]))
        # remove the test pools from self.pool; preserving reserved pool
        self.container = []
        self.pool = [self.pool[0]]
        self.log.info("Current pools: %s",
                      " ".join([pool.uuid for pool in self.pool]))
        # fail if the pool/containers did not clean up correctly
        self.assertEqual(
            len(self.soak_errors), 0, "\n".join(self.soak_errors))
        # Break out of loop if smoke
        if "smoke" in self.test_name:
            break
        loop_time = time.time() - start_loop_time
        self.log.info(
            "<<LOOP %s completed in %s at %s>>", self.loop,
            DDHHMMSS_format(loop_time), time.ctime())
        # Initialize harasser loop time from first pass loop time
        if self.loop == 1 and self.harassers:
            self.harasser_loop_time = loop_time
        self.loop += 1
    # TO-DO: use IOR
    if not resv_cont.read_objects():
        self.soak_errors.append(
            "Data verification error on reserved pool after SOAK completed")
    self.container.append(resv_cont)
    # gather the daos logs from the client nodes
    self.log.info(
        "<<<<SOAK TOTAL TEST TIME = %s>>>>",
        DDHHMMSS_format(time.time() - start_time))
def run_soak(self, test_param):
    """Run the soak test specified by the test params.

    Args:
        test_param (str): test_params from yaml file
    """
    self.soak_results = {}
    self.pool = []
    self.harasser_joblist = []
    self.harasser_results = {}
    test_to = self.params.get("test_timeout", test_param)
    self.job_timeout = self.params.get("job_timeout", test_param)
    self.harasser_timeout = self.params.get("harasser_timeout", test_param)
    self.test_name = self.params.get("name", test_param)
    self.nodesperjob = self.params.get("nodesperjob", test_param)
    self.test_iteration = self.params.get("iteration", test_param)
    self.task_list = self.params.get("taskspernode", test_param + "*")
    self.h_list = self.params.get("harasserlist", test_param + "*")
    job_list = self.params.get("joblist", test_param + "*")
    pool_list = self.params.get("poollist", test_param + "*")
    rank = self.params.get("rank", "/run/container_reserved/*")
    if self.is_harasser("rebuild"):
        obj_class = "_".join(["OC", str(
            self.params.get("daos_oclass", "/run/rebuild/*")[0])])
    else:
        obj_class = self.params.get(
            "object_class", "/run/container_reserved/*")
    # Create the reserved pool with data
    # self.pool is a list of all the pools used in soak
    # self.pool[0] will always be the reserved pool
    self.add_pools(["pool_reserved"])
    self.pool[0].connect()
    # Create the container and populate with a known data
    # TO-DO: use IOR to write and later read verify the data
    self.container = TestContainer(self.pool[0])
    self.container.namespace = "/run/container_reserved/*"
    self.container.get_params(self)
    self.container.create()
    self.container.write_objects(rank, obj_class)
    self.all_failed_jobs = []
    # cleanup soak log directories before test on all nodes
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "rm -rf {}".format(self.log_dir), self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Soak directories not removed "
            "from clients>>: {}".format(self.hostlist_clients))
    # cleanup test_node
    for log_dir in [self.log_dir, self.sharedlog_dir]:
        cmd = "rm -rf {}".format(log_dir)
        try:
            result = run_command(cmd, timeout=30)
        except DaosTestError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory {} was not removed {}>>".format(
                    log_dir, error))
    # Initialize time
    start_time = time.time()
    self.test_timeout = int(3600 * test_to)
    self.end_time = start_time + self.test_timeout
    self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
    while time.time() < self.end_time:
        # Start new pass
        start_loop_time = time.time()
        self.log.info(
            "<<Soak1 PASS %s: time until done %s>>", self.loop,
            DDHHMMSS_format(self.end_time - time.time()))
        # Create all specified pools
        self.add_pools(pool_list)
        self.log.info(
            "Current pools: %s",
            " ".join([pool.uuid for pool in self.pool]))
        try:
            self.execute_jobs(job_list, self.pool[1:])
        except SoakTestError as error:
            self.fail(error)
        errors = self.destroy_pools(self.pool[1:])
        # remove the test pools from self.pool; preserving reserved pool
        self.pool = [self.pool[0]]
        self.log.info(
            "Current pools: %s",
            " ".join([pool.uuid for pool in self.pool]))
        self.assertEqual(len(errors), 0, "\n".join(errors))
        # Break out of loop if smoke
        if "smoke" in self.test_name:
            break
        loop_time = time.time() - start_loop_time
        self.log.info(
            "<<PASS %s completed in %s >>", self.loop,
            DDHHMMSS_format(loop_time))
        self.loop += 1
    # TO-DO: use IOR
    self.assertTrue(
        self.container.read_objects(),
        "Data verification error on reserved pool after SOAK completed")
    # gather the daos logs from the client nodes
    self.log.info(
        "<<<<SOAK TOTAL TEST TIME = %s>>>>",
        DDHHMMSS_format(time.time() - start_time))
def run_soak(self, test_param):
    """Run the soak test specified by the test params.

    Args:
        test_param (str): test_params from yaml file
    """
    self.soak_results = {}
    self.pool = []
    self.container = []
    self.harasser_results = {}
    self.harasser_args = {}
    run_harasser = False
    self.all_failed_jobs = []
    self.all_failed_harassers = []
    self.soak_errors = []
    self.check_errors = []
    self.used = []
    self.mpi_module = self.params.get(
        "mpi_module", "/run/*", default="mpi/mpich-x86_64")
    enable_sudo = self.params.get("enable_sudo", "/run/*", default=True)
    test_to = self.params.get("test_timeout", test_param + "*")
    self.test_name = self.params.get("name", test_param + "*")
    single_test_pool = self.params.get(
        "single_test_pool", test_param + "*", True)
    harassers = self.params.get("harasserlist", test_param + "*")
    job_list = self.params.get("joblist", test_param + "*")
    resv_bytes = self.params.get("resv_bytes", test_param + "*", 500000000)
    ignore_soak_errors = self.params.get(
        "ignore_soak_errors", test_param + "*", False)
    self.sudo_cmd = "sudo" if enable_sudo else ""
    if harassers:
        run_harasser = True
        self.log.info("<< Initial harasser list = %s>>", harassers)
        harasserlist = harassers[:]
    # Create the reserved pool with data
    # self.pool is a list of all the pools used in soak
    # self.pool[0] will always be the reserved pool
    add_pools(self, ["pool_reserved"])
    # Create the reserved container
    self.resv_cont = self.get_container(
        self.pool[0], "/run/container_reserved/*", True)
    # populate reserved container with a 500MB file unless test is smoke
    self.initial_resv_file = os.path.join(
        self.test_dir, "initial", "resv_file")
    try:
        reserved_file_copy(self, self.initial_resv_file, self.pool[0],
                           self.resv_cont, num_bytes=resv_bytes, cmd="write")
    except CommandFailure as error:
        self.fail(error)
    # Create pool for jobs
    if single_test_pool:
        add_pools(self, ["pool_jobs"])
        self.log.info("Current pools: %s",
                      " ".join([pool.uuid for pool in self.pool]))
    # cleanup soak log directories before test on all nodes
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "rm -rf {}".format(self.soak_dir), self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Soak directories not removed "
            "from clients>>: {}".format(self.hostlist_clients))
    # cleanup test_node
    for log_dir in [self.soak_dir, self.sharedsoak_dir]:
        cmd = "rm -rf {}".format(log_dir)
        try:
            result = run_command(cmd, timeout=30)
        except DaosTestError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory {} was not removed>>".format(
                    log_dir)) from error
    # Baseline metrics data
    run_metrics_check(self, prefix="initial")
    # Initialize time
    self.start_time = time.time()
    self.test_timeout = int(3600 * test_to)
    self.end_time = self.start_time + self.test_timeout
    self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
    while time.time() < self.end_time:
        # Start new pass
        start_loop_time = time.time()
        self.log.info(
            "<<SOAK LOOP %s: time until done %s>>", self.loop,
            DDHHMMSS_format(self.end_time - time.time()))
        if not single_test_pool:
            # Create pool for jobs
            add_pools(self, ["pool_jobs"])
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
        # Initialize harassers
        if run_harasser:
            if not harasserlist:
                harasserlist = harassers[:]
            harasser = harasserlist.pop(0)
            self.harasser_args = {}
            self.harasser_results = {}
            self.harassers, self.offline_harassers = get_harassers(harasser)
        try:
            self.execute_jobs(job_list, self.pool[1])
        except SoakTestError as error:
            self.fail(error)
        # Check space after jobs done
        for pool in self.pool:
            self.dmg_command.pool_query(pool.uuid)
        # Cleanup any dfuse mounts before destroying containers
        cleanup_dfuse(self)
        self.soak_errors.extend(self.destroy_containers(self.container))
        self.container = []
        # Remove the test pools from self.pool; preserving reserved pool
        if not single_test_pool:
            self.soak_errors.extend(self.destroy_pools(self.pool[1:]))
            self.pool = [self.pool[0]]
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
        # Gather metrics data after jobs complete
        run_metrics_check(self)
        # Fail if the pool/containers did not clean up correctly
        if not ignore_soak_errors:
            self.assertEqual(
                len(self.soak_errors), 0, "\n".join(self.soak_errors))
        # Break out of loop if smoke
        if "smoke" in self.test_name:
            break
        loop_time = time.time() - start_loop_time
        self.log.info(
            "<<LOOP %s completed in %s at %s>>", self.loop,
            DDHHMMSS_format(loop_time), time.ctime())
        # Initialize harasser loop time from first pass loop time
        if self.loop == 1 and run_harasser:
            self.harasser_loop_time = loop_time
        self.loop += 1
    self.log.info(
        "<<<<SOAK TOTAL TEST TIME = %s>>>>",
        DDHHMMSS_format(time.time() - self.start_time))