예제 #1
0
    def prepare(self, storage=True):
        """Get the server hosts ready for a daos_server start.

        Creates the daos_server yaml file, points dmg at the server hosts,
        kills any running servers, removes leftover files and, when requested,
        prepares the storage.

        Args:
            storage (bool, optional): whether or not to prepare dcpm/nvme
                storage. Defaults to True.
        """
        self.log.info("<SERVER> Preparing to start daos_server on %s with %s",
                      self._hosts, self.manager.command)

        # Write out the daos_server yaml file
        self.manager.job.create_yaml_file()

        # dmg runs the storage format against every server host
        self.dmg.hostlist = self._hosts
        if not self.dmg.yaml:
            # With a dmg config file the transport security has already been
            # configured, so only set it here when no such file is in use.
            allow_insecure = self.get_config_value("allow_insecure")
            self.dmg.insecure.update(allow_insecure, "dmg.insecure")

        # Start from a clean slate: no running servers, no leftover files
        self.kill()
        self.clean_files()

        job = self.manager.job

        # The log files must exist so their ownership can be changed later
        if job.using_nvme:
            touch_cmds = []
            for params in job.yaml.server_params:
                log_file = params.log_file.value
                if log_file is None:
                    continue
                self.log.info("Creating log file: %s", log_file)
                touch_cmds.append("touch {}".format(log_file))
            if touch_cmds:
                pcmd(self._hosts, "; ".join(touch_cmds), False)

        if not storage:
            return
        if not (job.using_nvme or job.using_dcpm):
            return

        # Prepare the server storage as root
        self.log.info("Preparing storage in <format> mode")
        self.prepare_storage("root")
        if hasattr(self.manager, "mca"):
            self.manager.mca.update({"plm_rsh_args": "-l root"},
                                    "orterun.mca", True)
예제 #2
0
 def _run_cmd(self, cmd):
     """Run a command on all client hosts and fail if any host fails.

     Args:
         cmd (str): command to run on each host in self.hostlist_clients

     Raises:
         CommandFailure: if the command did not return 0 on every host

     """
     ret_code = general_utils.pcmd(self.hostlist_clients, cmd, timeout=180)
     # The result maps each return code to the hosts that produced it, so
     # any non-zero key means at least one host failed - even when other
     # hosts succeeded and a 0 key is also present.
     failed = [str(hosts) for code, hosts in ret_code.items() if code != 0]
     if failed or 0 not in ret_code:
         error_hosts = NodeSet(",".join(failed))
         raise CommandFailure(
             "Error running '{}' on the following hosts: {}".format(
                 cmd, error_hosts))
예제 #3
0
    def check_mount_state(self, nodes=None):
        """Sort the hosts by the state of their dfuse mount point.

        Args:
            nodes (NodeSet, optional): hosts on which to check if dfuse is
                mounted. Defaults to None, which will use all of the hosts.

        Returns:
            dict: a dictionary of NodeSets of hosts keyed by "mounted",
                "unmounted" or "nodirectory" (no mount point directory)

        """
        if not nodes:
            nodes = NodeSet.fromlist(self.hosts)

        mounted = NodeSet()
        unmounted = NodeSet()
        no_directory = NodeSet()
        have_directory = NodeSet()

        # First pass: which hosts have a real (non-symlink) mount directory
        dir_check = "test -d {0} -a ! -L {0}".format(self.mount_dir.value)
        for retcode, hosts in pcmd(nodes, dir_check, expect_rc=None).items():
            target = have_directory if retcode == 0 else no_directory
            for host in hosts:
                target.add(host)

        if have_directory:
            # Second pass: grep -v exits with 1 only when the file system type
            # is fuseblk, i.e. when the directory is mounted by dfuse
            fs_check = "stat -c %T -f {0} | grep -v fuseblk".format(
                self.mount_dir.value)
            fs_codes = pcmd(have_directory, fs_check, expect_rc=None)
            for retcode, hosts in fs_codes.items():
                target = mounted if retcode == 1 else unmounted
                for host in hosts:
                    target.add(host)

        return {
            "mounted": mounted,
            "unmounted": unmounted,
            "nodirectory": no_directory
        }
예제 #4
0
파일: dfuse_utils.py 프로젝트: wli5/daos
    def stop(self):
        """Stop dfuse.

        Try to stop dfuse.  Try once nicely by using fusermount, then if that
        fails try to pkill it to see if that works.  Abort based on the result
        of the fusermount, as if pkill is necessary then dfuse itself has
        not worked correctly.

        Finally, try and remove the mount point, and that itself should work.

        Raises:
            CommandFailure: In case dfuse stop fails

        """
        self.log.info('Stopping dfuse at %s on %s', self.mount_dir.value,
                      self.running_hosts)

        # Nothing to stop without a mount point or any running hosts
        if self.mount_dir.value is None:
            return

        if not len(self.running_hosts):
            return

        self.check_running()

        # Build the unmount command once so the same string is used for both
        # the first attempt and the retry after pkill.  The command
        # substitution must be in double quotes: inside single quotes the
        # shell tests the literal text and never picks the fusermount branch.
        umount_cmd = "; ".join([
            'if [ -x "$(command -v fusermount)" ]',
            "then fusermount -u {0}".format(self.mount_dir.value),
            "else fusermount3 -u {0}".format(self.mount_dir.value),
            "fi",
        ])
        ret_code = pcmd(self.running_hosts, umount_cmd, timeout=30)

        # Hosts that unmounted cleanly are no longer running dfuse
        if 0 in ret_code:
            self.running_hosts.remove(ret_code[0])
            del ret_code[0]

        if len(self.running_hosts):
            # The clean unmount failed somewhere: kill dfuse, retry the
            # unmount and clean up as best as possible before reporting it
            pcmd(self.running_hosts, "pkill dfuse --signal KILL", timeout=30)
            pcmd(self.running_hosts, umount_cmd, timeout=30)
            self.remove_mount_point(fail=False)
            raise CommandFailure(
                "Error stopping dfuse on the following hosts: {}".format(
                    self.running_hosts))
        time.sleep(2)
        self.remove_mount_point()
예제 #5
0
    def clean_files(self, verbose=True):
        """Remove the daos server files left on the hosts.

        Builds one clean-up command line covering every engine defined in the
        yaml file and runs it on all of the server hosts.

        Args:
            verbose (bool, optional): display clean commands. Defaults to True.
        """
        clean_commands = []

        def add_once(command):
            """Queue the command unless an identical one is already queued."""
            if command not in clean_commands:
                clean_commands.append(command)

        engines = self.manager.job.yaml.engine_params
        for index, engine_params in enumerate(engines):
            scm_mount = engine_params.get_value("scm_mount")
            self.log.info("Cleaning up the %s directory.", str(scm_mount))

            # Remove the superblocks
            add_once("sudo rm -fr {}/*".format(scm_mount))

            # Remove the shared memory segment associated with this engine;
            # the key is offset by the engine index
            clean_commands.append(
                "sudo ipcrm -M {}".format(self.D_TM_SHARED_MEMORY_KEY + index))

            # Keep unmounting the scm mount point until umount fails
            add_once("while sudo umount {}; do continue; done".format(scm_mount))

            if not self.manager.job.using_dcpm:
                continue
            scm_list = engine_params.get_value("scm_list")
            if not isinstance(scm_list, list):
                continue

            self.log.info("Cleaning up the following device(s): %s.",
                          ", ".join(scm_list))
            # Unmount and wipefs each dcpm device
            wipe_steps = (
                "for dev in {}".format(" ".join(scm_list)),
                "do mount=$(lsblk $dev -n -o MOUNTPOINT)",
                "if [ ! -z $mount ]",
                "then while sudo umount $mount",
                "do continue",
                "done",
                "fi",
                "sudo wipefs -a $dev",
                "done",
            )
            add_once("; ".join(wipe_steps))

        pcmd(self._hosts, "; ".join(clean_commands), verbose)
예제 #6
0
파일: dfuse_utils.py 프로젝트: liw/daos
    def run(self, check=True, bind_cores=None):
        # pylint: disable=arguments-differ
        """Start dfuse on the hosts.

        Args:
            check (bool): Check if dfuse mounted properly after
                mount is executed.
            bind_cores (str): List of CPU cores to pass to taskset
        Raises:
            CommandFailure: In case dfuse run command fails

        """
        self.log.info('Starting dfuse at %s', self.mount_dir.value)

        # Without a log file the dfuse logs would not be captured
        if "D_LOG_FILE" not in self.env:
            raise CommandFailure(
                "Dfuse missing environment variables for D_LOG_FILE")

        if 'D_LOG_MASK' not in self.env:
            self.env['D_LOG_MASK'] = 'INFO'

        # The mount point must exist before dfuse can use it
        self.create_mount_point()

        # Assemble: environment exports, optional taskset prefix, dfuse command
        pieces = [self.env.get_export_str()]
        if bind_cores:
            pieces.append('taskset -c {} '.format(bind_cores))
        pieces.append(str(self))
        cmd = "".join(pieces)
        self.log.info("Command is '%s'", cmd)
        ret_code = pcmd(self.hosts, cmd, timeout=30)

        # Hosts returning 0 are now running dfuse; whatever is left failed
        if 0 in ret_code:
            self.running_hosts.add(ret_code[0])
            del ret_code[0]

        if ret_code:
            error_hosts = NodeSet(
                ",".join(str(node_set) for node_set in ret_code.values()))
            raise CommandFailure(
                "Error starting dfuse on the following hosts: {}".format(
                    error_hosts))

        if not check:
            return

        # Dfuse will block in the command for the mount to complete, even
        # if run in background mode so it should be possible to start using
        # it immediately after the command returns.
        if self.check_running(fail_on_error=False):
            return
        self.log.info('Waiting two seconds for dfuse to start')
        time.sleep(2)
        if self.check_running(fail_on_error=False):
            return
        self.log.info('Waiting five seconds for dfuse to start')
        time.sleep(5)
        # Final attempt: this one raises if dfuse is still not running
        self.check_running()
예제 #7
0
def add_del_user(hosts, ba_cmd, user):
    """Add or delete the daos user or group on the hosts with sudo.

    Args:
        hosts: list of hosts.
        ba_cmd: linux command (looked up in /usr/sbin) used to create or
            remove the user or group.
        user: user or group name to be created or cleaned.

    Returns:
        None

    """
    # Only the user commands other than usermod are given the "-r" option
    is_user_cmd = "user" in ba_cmd and "usermod" not in ba_cmd
    option = "-r" if is_user_cmd else ""
    executable = os.path.join("/usr/sbin", ba_cmd)
    cmd = " ".join(("sudo", executable, option, user))
    print("     =Clients/hosts {0}, exec cmd: {1}".format(hosts, cmd))
    pcmd(hosts, cmd, False)
예제 #8
0
 def clean_files(self):
     """Clean the tmpfs on the servers.

     Builds a single clean-up command line and runs it on all of the
     server hosts: the entries directly under /mnt/daos are always removed,
     the scm mount is removed and unmounted when nvme is in use, and each
     device in the scm list is wiped when scm is in use.  Only the last
     entry of the yaml server_params is used.
     """
     scm_mount = self.runner.job.yaml_params.server_params[-1].scm_mount
     scm_list = self.runner.job.yaml_params.server_params[-1].scm_list.value
     # Always remove everything directly under /mnt/daos
     clean_cmds = [
         "find /mnt/daos -mindepth 1 -maxdepth 1 -print0 | xargs -0r rm -rf"
     ]
     # NOTE: the backslash continuations below are inside the string
     # literals, so the leading whitespace of the continued line is part of
     # the command text that is run.
     if self.runner.job.yaml_params.is_nvme():
         clean_cmds.append("sudo rm -rf {0};  \
                            sudo umount {0}".format(scm_mount))
     # scm_mount can be /mnt/daos0 or /mnt/daos1 for two daos_server
     # instances, which is not presently supported in DAOS. The for loop
     # needs to be updated in the future to handle it. A single instance
     # pmem device should work now.
     if self.runner.job.yaml_params.is_scm():
         for value in scm_list:
             clean_cmds.append("sudo umount {}; \
                                sudo wipefs -a {}".format(scm_mount, value))
     self.log.info("Cleanup of %s directory.", str(scm_mount))
     pcmd(self._hosts, "; ".join(clean_cmds), False)
예제 #9
0
def stop_agent(sessions, client_list=None):
    """Kill ssh and the agent.

    This is temporary; presuming the agent will daemonize at some point and
    can be started and killed more appropriately.

    Args:
        sessions (dict): set of subprocess sessions returned by run_agent()
        client_list (list, optional): lists of hosts running the daos agent.
            Defaults to None, which uses the local host.

    Raises:
        AgentFailed: if the daos agents failed to stop

    """
    # Without a client list the local host is effectively the client
    if client_list is None:
        client_list = [socket.gethostname().split('.', 1)[0]]

    # Kill the agent processes on the clients
    pcmd(client_list, "pkill daos_agent", False)

    # Kill whatever is still running in the sessions and reap each one
    for session in sessions.values():
        if session.poll() is None:
            session.kill()
        session.wait()

    # Verify that no daos agents are left.  pgrep exit status:
    #   0 - One or more processes matched the criteria.
    #   1 - No processes matched.
    #   2 - Syntax error in the command line.
    #   3 - Fatal error: out of memory etc.
    time.sleep(5)
    result = pcmd(client_list, "pgrep 'daos_agent'", False, expect_rc=1)
    all_stopped = len(result) == 1 and 1 in result
    if not all_stopped:
        remaining = [str(result[key]) for key in result if key != 1]
        raise AgentFailed(
            "DAOS agent processes detected after attempted stop on {}".format(
                ", ".join(remaining)))
예제 #10
0
def storage_reset(hosts):
    """Reset the NVMe storage on the servers with daos_server.

    Args:
        hosts (str): a string of comma-separated host names

    Raises:
        ServerFailed: if server failed to reset storage

    """
    server_bin = get_file_path("bin/daos_server")[0]
    reset_cmd = "sudo {} storage prepare -n --reset -f".format(server_bin)
    result = pcmd(hosts, reset_cmd)
    # Success means every host reported the one and only return code: 0
    succeeded = len(result) == 1 and 0 in result
    if not succeeded:
        raise ServerFailed("Error resetting NVMe storage")
예제 #11
0
def storage_prepare(hosts, user):
    """Prepare the NVMe storage on the servers with daos_server.

    Args:
        hosts (str): a string of comma-separated host names
        user (str): user name passed to the prepare command's -u option

    Raises:
        ServerFailed: if server failed to prepare storage

    """
    server_bin = get_file_path("bin/daos_server")[0]
    template = 'sudo {} storage prepare -n -u "{}" --hugepages=4096 -f'
    result = pcmd(hosts, template.format(server_bin, user), timeout=120)
    # Success means every host reported the one and only return code: 0
    succeeded = len(result) == 1 and 0 in result
    if not succeeded:
        raise ServerFailed("Error preparing NVMe storage")
예제 #12
0
    def set_scm_mount_ownership(self, user=None, verbose=False):
        """Give the specified user ownership of every scm mount.

        Args:
            user (str, optional): user name. Defaults to None - current user.
            verbose (bool, optional): display commands. Defaults to False.

        """
        owner = user if user is not None else getpass.getuser()

        # A set keeps identical chown commands from being run more than once
        chown_cmds = set()
        for params in self.manager.job.yaml.server_params:
            mounts = params.scm_mount.value

            # Support single or multiple scm_mount points
            mounts = mounts if isinstance(mounts, list) else [mounts]

            self.log.info("Changing ownership to %s for: %s", owner, mounts)
            chown_cmds.add(
                "sudo chown -R {0}:{0} {1}".format(owner, " ".join(mounts)))

        if not chown_cmds:
            return
        pcmd(self._hosts, "; ".join(chown_cmds), verbose)
예제 #13
0
    def run(self):
        """Run the daos_racer command on the remote host.

        Raises:
            CommandFailure: if there is an error running the command

        """
        command = str(self)
        timeout = self.clush_timeout.value
        timeout_text = "no" if timeout is None else "a {}s".format(timeout)

        # Run daos_racer on the specified host
        self.log.info(
            "Running %s on %s with %s timeout", command, self.host,
            timeout_text)
        return_codes = pcmd([self.host], command, True, timeout)

        if len(return_codes) == 1 and 0 in return_codes:
            self.log.info("Test passed!")
            return

        # A 255 return code is treated as a timed out remote command, which
        # leaves the daos_racer process running and needing to be killed
        if 255 in return_codes:
            self.log.info("Stopping timed out daos_racer process on %s",
                          self.host)
            pcmd([self.host], "pkill daos_racer", True)

        raise CommandFailure("Error running '{}'".format(self._command))
예제 #14
0
    def storage_reset(self):
        """Reset the NVMe storage of the servers.

        NOTE: Don't enhance this method to reset SCM. SCM will not be in a
        useful state for running next tests.

        Raises:
            ServerFailed: if server failed to reset storage

        """
        server_bin = os.path.join(self.daosbinpath, "daos_server")
        reset_cmd = "{} storage prepare -n --reset -f".format(server_bin)
        result = pcmd(self._hosts, reset_cmd)
        # Success means every host reported the one and only return code: 0
        succeeded = len(result) == 1 and 0 in result
        if not succeeded:
            raise ServerFailed("Error resetting NVMe storage")
예제 #15
0
파일: soak.py 프로젝트: cdurf1/daos
    def setUp(self):
        """Define test setup to be done.

        Trims the client host list, sets up the soak log directory names and
        removes any old soak log directory from the nodes.

        Raises:
            SoakTestError: if no client partition is defined or the soak log
                directory could not be removed from every node

        """
        print("<<setUp Started>> at {}".format(time.ctime()))
        super(Soak, self).setUp()
        # Initialize loop param for all tests
        self.loop = 1

        self.failed_job_id_list = []
        # Fail if slurm partition daos_client is not defined
        if not self.partition_clients:
            raise SoakTestError(
                "<<FAILED: Partition is not correctly setup for daos "
                "slurm partition>>")
        # Check if the server nodes are in the client list;
        # this will happen when only one partition is specified
        for host_server in self.hostlist_servers:
            if host_server in self.hostlist_clients:
                self.hostlist_clients.remove(host_server)
        self.log.info("<<Updated hostlist_clients %s >>",
                      self.hostlist_clients)
        # include test node for log cleanup; remove from client list
        self.test_node = [socket.gethostname().split('.', 1)[0]]
        if self.test_node[0] in self.hostlist_clients:
            self.hostlist_clients.remove(self.test_node[0])
            self.log.info("<<Updated hostlist_clients %s >>",
                          self.hostlist_clients)
        self.node_list = self.hostlist_clients + self.test_node

        # Setup logging directories for soak logfiles
        # self.outputdir is an avocado directory .../data/
        self.log_dir = "/tmp/soak"
        self.outputsoakdir = self.outputdir + "/soak"

        # Names of the per-pass log directories (remote and local)
        self.rem_pass_dir = self.log_dir + "/pass" + str(self.loop)
        self.local_pass_dir = self.outputsoakdir + "/pass" + str(self.loop)

        # cleanup soak log directories before test on all nodes
        result = pcmd(NodeSet.fromlist(self.node_list),
                      "rm -rf {}".format(self.log_dir),
                      verbose=False)
        if len(result) > 1 or 0 not in result:
            failed_nodes = [str(result[key]) for key in result if key != 0]
            # The two literals are concatenated, so the first one needs the
            # trailing space to keep "removed" and "from" apart
            raise SoakTestError(
                "<<FAILED: Soak directories not removed "
                "from clients>>: {}".format(", ".join(failed_nodes)))
예제 #16
0
    def prepare_storage(self, user, using_dcpm=None, using_nvme=None):
        """Run "daos_server storage prepare" on the server hosts.

        Args:
            user (str): username
            using_dcpm (bool, optional): override option to prepare scm storage.
                Defaults to None, which uses the configuration file to determine
                if scm storage should be formatted.
            using_nvme (bool, optional): override option to prepare nvme
                storage. Defaults to None, which uses the configuration file to
                determine if nvme storage should be formatted.

        Raises:
            ServerFailed: if there was an error preparing the storage

        """
        cmd = DaosServerCommand(self.manager.job.command_path)
        cmd.sudo = False
        cmd.debug.value = False
        cmd.set_sub_command("storage")
        cmd.sub_command_class.set_sub_command("prepare")

        # Shorthand for the "storage prepare" sub-command options
        prepare = cmd.sub_command_class.sub_command_class
        prepare.target_user.value = user
        prepare.force.value = True

        # Fall back to the configuration file settings without an override
        if using_dcpm is None:
            using_dcpm = self.manager.job.using_dcpm
        if using_nvme is None:
            using_nvme = self.manager.job.using_nvme

        # Restrict the prepare to one device type when only one is in use
        if using_dcpm and not using_nvme:
            prepare.scm_only.value = True
        elif not using_dcpm and using_nvme:
            prepare.nvme_only.value = True

        if using_nvme:
            prepare.hugepages.value = self.get_config_value("nr_hugepages")

        self.log.info("Preparing DAOS server storage: %s", str(cmd))
        result = pcmd(self._hosts, str(cmd), timeout=40)
        if len(result) == 1 and 0 in result:
            return

        # Name the device type(s) involved in the failure
        if using_dcpm and using_nvme:
            dev_type = "dcpm & nvme"
        elif using_dcpm:
            dev_type = "dcpm"
        else:
            dev_type = "nvme"
        raise ServerFailed("Error preparing {} storage".format(dev_type))
예제 #17
0
파일: dfuse_utils.py 프로젝트: wli5/daos
    def run(self, check=True):
        """Start dfuse on the hosts.

        Args:
            check (bool): Check if dfuse mounted properly after
                mount is executed.
        Raises:
            CommandFailure: In case dfuse run command fails

        """
        self.log.info('Starting dfuse at %s', self.mount_dir.value)

        # Without a log file the dfuse logs would not be captured
        if "D_LOG_FILE" not in self.env:
            raise CommandFailure(
                "Dfuse missing environment variables for D_LOG_FILE")

        # The mount point must exist before dfuse can use it
        self.create_mount_point()

        # The dfuse command is prefixed with the environment exports
        export_str = self.env.get_export_str()
        cmd = "".join([export_str, str(self)])
        ret_code = pcmd(self.hosts, cmd, timeout=30)

        # Hosts returning 0 are now running dfuse; whatever is left failed
        if 0 in ret_code:
            self.running_hosts.add(ret_code[0])
            del ret_code[0]

        if ret_code:
            failed = [str(node_set) for node_set in ret_code.values()]
            raise CommandFailure(
                "Error starting dfuse on the following hosts: {}".format(
                    NodeSet(",".join(failed))))

        if not check:
            return

        # Dfuse will block in the command for the mount to complete, even
        # if run in background mode so it should be possible to start using
        # it immediately after the command returns.
        if self.check_running(fail_on_error=False):
            return
        self.log.info('Waiting two seconds for dfuse to start')
        time.sleep(2)
        if self.check_running(fail_on_error=False):
            return
        self.log.info('Waiting five seconds for dfuse to start')
        time.sleep(5)
        # Final attempt: this one raises if dfuse is still not running
        self.check_running()
예제 #18
0
    def start(self, yamlfile):
        """Start the server through the runner.

        Applies the yaml configuration, cleans the servers, prepares the nvme
        storage when it is in use, runs the server and, for nvme or scm
        configurations, formats the storage once the server is running.

        Args:
            yamlfile: yaml configuration passed to the job's set_config()

        Returns:
            bool: True; failures to run the server are raised instead

        Raises:
            ServerFailed: if running the server raised a CommandFailure

        """
        self.runner.job.set_config(yamlfile)
        self.server_clean()

        # Prepare nvme storage in servers
        if self.runner.job.yaml_params.is_nvme():
            self.log.info("Performing nvme storage prepare in <format> mode")
            storage_prepare(self._hosts, "root")
            # Have the runner log in to the remote hosts as root
            self.runner.mca.value = {"plm_rsh_args": "-l root"}

            # Make sure log file has been created for ownership change
            # (only the last server_params entry is considered)
            lfile = self.runner.job.yaml_params.server_params[
                -1].log_file.value
            if lfile is not None:
                self.log.info("Creating log file")
                cmd_touch_log = "touch {}".format(lfile)
                pcmd(self._hosts, cmd_touch_log, False)

            # Open up the permissions of the attach info directory
            chmod_attach = "chmod 777 -R {}".format(self.attach.value)
            pcmd(self._hosts, chmod_attach, False)

        try:
            self.run()
        except CommandFailure as details:
            self.log.info("<SERVER> Exception occurred: %s", str(details))
            # Kill the subprocess, anything that might have started
            self.kill()
            raise ServerFailed("Failed to start server in {} mode.".format(
                self.runner.job.mode))

        if self.runner.job.yaml_params.is_nvme() or \
           self.runner.job.yaml_params.is_scm():
            # Setup the hostlist to pass to dmg command
            servers_with_ports = [
                "{}:{}".format(host, self.runner.job.yaml_params.port)
                for host in self._hosts
            ]

            # Format storage and wait for server to change ownership
            self.log.info("Formatting hosts: <%s>", self._hosts)
            storage_format(self.daosbinpath, ",".join(servers_with_ports))
            self.runner.job.mode = "normal"
            try:
                self.runner.job.check_subprocess_status(self.runner.process)
            except CommandFailure as error:
                # A failure here is only logged; start() still returns True
                self.log.info("Failed to start after format: %s", str(error))

            # Open up the permissions of the shared attach info file
            chmod_cmds = "sudo chmod 777 {}/daos_server.attach_info_tmp".format(
                self.attach.value)
            pcmd(self._hosts, chmod_cmds, False)

        return True
예제 #19
0
def storage_reset(hosts):
    """Reset the NVMe storage on the servers with daos_server.

    NOTE: Don't enhance this method to reset SCM. SCM will not be in a useful
    state for running next tests.

    Args:
        hosts (str): a string of comma-separated host names

    Raises:
        ServerFailed: if server failed to reset storage

    """
    server_bin = get_file_path("bin/daos_server")[0]
    result = pcmd(
        hosts, "sudo {} storage prepare -n --reset -f".format(server_bin))
    # Anything other than a lone return code of 0 is a failure
    if not (len(result) == 1 and 0 in result):
        raise ServerFailed("Error resetting NVMe storage")
예제 #20
0
파일: dfuse_utils.py 프로젝트: wli5/daos
    def check_running(self, fail_on_error=True):
        """Check dfuse is running.

        Run a command to verify dfuse is running on hosts where it is supposed
        to be.  Use grep -v and rc=1 here so that if it isn't, then we can
        see what is being used instead.

        Args:
            fail_on_error (bool, optional): raise an exception instead of
                returning False when dfuse is not running. Defaults to True.

        Returns:
            bool: True if every host returned 1; False otherwise (only when
                fail_on_error is False)

        Raises:
            CommandFailure: if dfuse is not running and fail_on_error is True

        """
        command = "stat -c %T -f {0} | grep -v fuseblk".format(
            self.mount_dir.value)
        retcodes = pcmd(self.running_hosts, command, expect_rc=1)

        # A return code of 1 is the expected result; discard it so that only
        # the unexpected return codes are left
        retcodes.pop(1, None)
        if not retcodes:
            return True

        self.log.error('Errors checking running: %s', retcodes)
        if fail_on_error:
            raise CommandFailure('dfuse not running')
        return False
예제 #21
0
    def stop(self):
        """Stop dfuse by unmounting its mount point on the hosts.

        The mount point is removed whether or not the unmount succeeded.

        Raises:
            CommandFailure: In case dfuse stop fails on any host

        """
        # The command substitution must be in double quotes: inside single
        # quotes the shell tests the literal text and the fusermount branch
        # is never taken.
        cmd = (
            'if [ -x "$(command -v fusermount)" ]; '
            "then fusermount -u {0}; else fusermount3 -u {0}; fi"
        ).format(self.mount_dir.value)
        ret_code = general_utils.pcmd(self.hosts, cmd, timeout=30)
        self.remove_mount_point()

        # The result maps each return code to the hosts that produced it, so
        # any non-zero key is a failure even when other hosts returned 0
        failed = [
            str(node_set) for code, node_set in ret_code.items() if code != 0
        ]
        if failed or 0 not in ret_code:
            error_hosts = NodeSet(",".join(failed))
            raise CommandFailure(
                "Error stopping dfuse on the following hosts: {}".format(
                    error_hosts))
예제 #22
0
    def reset_storage(self):
        """Run "daos_server storage prepare" in reset mode on the servers.

        Raises:
            ServerFailed: if there was an error resetting the storage

        """
        cmd = DaosServerCommand(self.manager.job.command_path)
        cmd.sudo = False
        cmd.debug.value = False
        cmd.set_sub_command("storage")
        cmd.sub_command_class.set_sub_command("prepare")

        # Shorthand for the "storage prepare" sub-command options
        prepare = cmd.sub_command_class.sub_command_class
        prepare.nvme_only.value = True
        prepare.reset.value = True
        prepare.force.value = True

        self.log.info("Resetting DAOS server storage: %s", str(cmd))
        result = pcmd(self._hosts, str(cmd), timeout=120)
        # Anything other than a lone return code of 0 is a failure
        if not (len(result) == 1 and 0 in result):
            raise ServerFailed("Error resetting NVMe storage")
예제 #23
0
    def run(self, host):
        """Execute fio from the host passed on to the method.

        Args:
            host: hostname from where to launch fio

        Raises:
            CommandFailure: if fio did not return 0 on every host

        """
        # Blank out any "fio" text and drop the " POSIX" marker from the stored
        # command, then put the fio executable name in front of what is left
        self.run_cmd = self.run_cmd.replace("fio", ' ').replace(" POSIX", '')
        command = 'fio' + self.run_cmd
        print("Running: {}".format(command))
        ret_code = general_utils.pcmd(host, command)

        # The result maps each return code to the hosts that produced it, so
        # any non-zero key is a failure even when other hosts returned 0
        failed = [
            str(node_set) for code, node_set in ret_code.items() if code != 0
        ]
        if failed or 0 not in ret_code:
            error_hosts = NodeSet(",".join(failed))
            raise CommandFailure(
                "Error starting fio on the following hosts: {}".format(
                    error_hosts))
예제 #24
0
    def run(self):
        """Run the dfuse command.

        Starts dfuse on the hosts and then checks up to three times, with
        increasing waits in between, that it is running.

        Raises:
            CommandFailure: In case dfuse run command fails

        """
        self.log.info('Starting dfuse at %s', self.mount_dir.value)

        # A log file must be defined to ensure logs are captured
        if "D_LOG_FILE" not in self.env:
            raise CommandFailure(
                "Dfuse missing environment variables for D_LOG_FILE")

        # create dfuse dir if does not exist
        self.create_mount_point()

        # run dfuse command, prefixed with the environment exports
        cmd = "".join([self.env.get_export_str(), str(self)])
        ret_code = pcmd(self.hosts, cmd, timeout=30)

        # Hosts returning 0 are now running dfuse; whatever is left failed
        if 0 in ret_code:
            self.running_hosts.add(ret_code[0])
            del ret_code[0]

        if ret_code:
            error_hosts = NodeSet(
                ",".join(str(node_set) for node_set in ret_code.values()))
            raise CommandFailure(
                "Error starting dfuse on the following hosts: {}".format(
                    error_hosts))

        if self.check_running(fail_on_error=False):
            return
        self.log.info('Waiting five seconds for dfuse to start')
        time.sleep(5)
        if self.check_running(fail_on_error=False):
            return
        self.log.info('Waiting twenty five seconds for dfuse to start')
        time.sleep(25)
        # Final attempt: this one raises if dfuse is still not running
        self.check_running()
예제 #25
0
def execute_cluster_cmds(nodes, cmdlist, sudo=False):
    """Run each command in cmdlist, in order, on the given nodes.

    Execution stops at the first command that does not succeed on every
    node; the remaining commands are not run.

    Args:
        nodes (list): nodes on which to run the commands
        cmdlist (list): command lines to execute
        sudo (bool, optional): whether to prefix each command with "sudo".
            Defaults to False.

    Returns:
        int: 0 if every command returned 0 on all nodes (or cmdlist is
            empty); 1 as soon as a command fails on at least one node.

    """
    for command in cmdlist:
        full_command = "sudo {}".format(command) if sudo else command
        outcome = pcmd(nodes, full_command, True, None, 0)
        # The outcome is keyed by return code: a fully successful command
        # yields no more than one entry and that entry is for code 0.
        succeeded_everywhere = len(outcome) <= 1 and 0 in outcome
        if not succeeded_everywhere:
            return 1
    return 0
예제 #26
0
    def tearDown(self):
        """Tear down after each test case.

        After the regular tear down, the test status is forced to 'PASS'
        and then downgraded to 'FAIL' if the /tmp/daos_dump*.txt files are
        not listed successfully on every server host.
        """
        super().tearDown()

        # Force the test status by writing directly to the private
        # `__status` attribute of the `Test` base class through its
        # name-mangled form, see
        # https://stackoverflow.com/questions/3385317/private-variables-and-methods-in-python
        self._Test__status = 'PASS'

        # DAOS-1452 may need to check for one file per engine...
        dump_check = pcmd(self.hostlist_servers, r"ls /tmp/daos_dump*.txt")

        # Nothing to report when the only return code seen is 0
        if len(dump_check) <= 1 and 0 in dump_check:
            return

        # Report every group of hosts that returned a non-zero code
        missing = []
        for rc, hosts in dump_check.items():
            if rc != 0:
                missing.append("{}: rc={}".format(hosts, rc))
        print("no ULT stacks dump found on following hosts: {}".format(
            ", ".join(missing)))
        self._Test__status = 'FAIL'
예제 #27
0
    def test_super_block_version_basic(self):
        """JIRA ID: DAOS-3648.

        Test Description:
            Basic test to verify that superblock file is versioned.

        :avocado: tags=all,tiny,pr,ds_versioning,basic
        """
        servers = self.hostlist_servers

        # The superblock file is expected directly under the scm_mount dir
        superblock = os.path.join(
            self.server_managers[0].get_config_value("scm_mount"),
            "superblock")

        # Fail early if the file is missing
        exists = check_file_exists(servers, superblock)
        if not exists[0]:
            self.fail("{}: {} not found".format(exists[1], superblock))

        # Search the file for the literal text 'version' on the servers
        grep_cmd = 'cat {} | grep -F "version"'.format(superblock)
        grep_result = pcmd(servers, grep_cmd, timeout=20)

        # Anything other than a lone return code of 0 means the text was
        # not found on at least one of the hosts
        found_everywhere = len(grep_result) <= 1 and 0 in grep_result
        if not found_everywhere:
            self.fail(
                "Was not able to find version in {} file".format(superblock))
예제 #28
0
파일: soak.py 프로젝트: cdurf1/daos
    def remote_copy(self, hostlist, remote_dir, local_dir):
        """Copy files from remote dir to local dir.

        On each remote node, if the remote directory is not empty, its
        contents are copied with scp into the local directory on this host
        and, when the copy succeeds, removed from the remote directory.

        Args:
            hostlist (list): list of remote nodes
            remote_dir (str): remote directory of files
            local_dir (str): local directory

        Raises:
            SoakTestError: if there is an error with the remote copy

        """
        this_host = socket.gethostname()
        # The command substitution must be inside double quotes: with single
        # quotes the shell keeps "$(ls -A ...)" as literal text, which is
        # never empty, so the "directory is not empty" test always passed.
        command = (
            "if [ ! -z \"$(ls -A {0})\" ]; then "
            "scp -p -r {0}/ \"{1}:'{2}/'\" && rm -rf {0}/*; fi".format(
                remote_dir, this_host, local_dir))
        result = pcmd(NodeSet.fromlist(hostlist), command, verbose=False)

        # The result is keyed by return code; anything other than a single
        # entry for 0 means the copy failed on at least one node.
        if len(result) > 1 or 0 not in result:
            raise SoakTestError("Error executing remote copy: {}".format(
                ", ".join([str(result[key]) for key in result if key != 0])))
예제 #29
0
    def run(self):
        """Run the dfuse command.

        Creates the mount point if needed and then runs the dfuse command,
        prefixed with the default environment export string, on all hosts.

        Raises:
            CommandFailure: In case dfuse run command fails on any host

        """
        # create dfuse dir if does not exist
        self.create_mount_point()
        # obtain env export string
        env = self.get_default_env()
        # run dfuse command
        ret_code = general_utils.pcmd(self.hosts, env + self.__str__(),
                                      timeout=30)
        # check for any failures: the result is keyed by return code, so a
        # second key next to 0 means some hosts failed while others
        # succeeded.  Checking only for a missing 0 would let such partial
        # failures go unnoticed.
        if len(ret_code) > 1 or 0 not in ret_code:
            error_hosts = NodeSet(
                ",".join(
                    [str(node_set) for code, node_set in ret_code.items()
                     if code != 0]))
            raise CommandFailure(
                "Error starting dfuse on the following hosts: {}".format(
                    error_hosts))
예제 #30
0
    def get_subprocess_state(self, message=None):
        """Display the state of the subprocess.

        In addition to the state of the local job manager process, the
        hosts are searched with pgrep for processes matching the job
        command.

        Args:
            message (str, optional): additional text to include in output.
                Defaults to None.

        Returns:
            list: a list of states for the process found. If the local job
                manager command is running its state will be the first in the
                list. Additional states in the list can typically indicate that
                remote processes were also found to be active.  Active remote
                processes will be indicated by a 'R' state at the end of the
                list.

        """
        # Start with the state of the local job manager process
        state = super(JobManager, self).get_subprocess_state(message)

        # Without a local process or any hosts there is nothing more to check
        if self._process is None or not self._hosts:
            return state

        # Look for the remote job processes on each host
        command = "/usr/bin/pgrep -a {}".format(self.job.command_regex)
        suffix = " {}".format(message) if message else ""
        self.log.debug(
            "%s processes still running remotely%s:", self.command, suffix)
        self.log.debug("Running (on %s): %s", self._hosts, command)
        results = pcmd(self._hosts, command, True, 10, None)

        # When no remote process is running on any of the hosts the pcmd
        # method returns a dictionary with a single key, e.g.
        # {1: <NodeSet>}.  Any other result is treated as processes still
        # running remotely, which is indicated by adding the "R" state to
        # the list of process states.
        nothing_running = 1 in results and len(results) <= 1
        if not nothing_running:
            if not state:
                state = ["?"]
            state.append("R")
        return state