Example #1
def display(tree, disp, gather, trace_mode, enable_nodeset_key):
    """nicely display MsgTree instance `tree' content according to
    `disp' Display object and `gather' boolean flag"""
    out = sys_stdout()
    try:
        if trace_mode:
            display_tree(tree, disp, out)
        else:
            if gather:
                if enable_nodeset_key:
                    # lambda to create a NodeSet from keys returned by walk()
                    ns_getter = lambda x: NodeSet.fromlist(x[1])
                    for nodeset in sorted((ns_getter(item) for item in tree.walk()),
                                          key=nodeset_cmpkey):
                        disp.print_gather(nodeset, tree[nodeset[0]])
                else:
                    for msg, key in tree.walk():
                        disp.print_gather_keys(key, msg)
            else:
                if enable_nodeset_key:
                    # nodes are automagically sorted by NodeSet
                    for node in NodeSet.fromlist(tree.keys()).nsiter():
                        disp.print_gather(node, tree[str(node)])
                else:
                    for key in tree.keys():
                        disp.print_gather_keys([ key ], tree[key])
    finally:
        out.flush()
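
All of the snippets on this page hinge on the same call: NodeSet.fromlist() folds an iterable of node names into a compact ranged set. A minimal standalone sketch of that behavior (not part of the example above):

from ClusterShell.NodeSet import NodeSet

# fromlist() accepts any iterable of node names and folds ranges
nodes = NodeSet.fromlist(["node1", "node2", "node3", "node10"])
print(nodes)       # -> node[1-3,10]
print(len(nodes))  # -> 4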
Example #2
    def ev_close(self, worker):
        """
        Check process termination status and generate appropriate events.
        """
        Action.ev_close(self, worker)

        # Action timed out
        if worker.did_timeout():
            nodes = NodeSet.fromlist(worker.iter_keys_timeout())
            self.fs._handle_shine_proxy_error(nodes, "Nodes timed out")
            self.set_status(ACT_ERROR)

        # Action succeeded
        elif max(rc for rc, _ in worker.iter_retcodes()) == 0:
            self.set_status(ACT_OK)

        # Action failed
        else:
            for rc, nodes in worker.iter_retcodes():
                if rc == 0:
                    continue

                # Avoid warnings, flag this component in error state
                for comp in self._comps or []:
                    comp.sanitize_state(nodes=worker.nodes)

                for output, nodes in worker.iter_buffers(match_keys=nodes):
                    nodes = NodeSet.fromlist(nodes)
                    msg = "Copy failed: %s" % output
                    self.fs._handle_shine_proxy_error(nodes, msg)
            self.set_status(ACT_ERROR)
Example #3
    def __gen_action_output(self, iterbuf, iterrc, timeouts, error_only):
        '''Display command result from output and retcodes.'''

        # Build the list of non-zero rc nodes
        retcodes = list(iterrc)
        ok_nodes = NodeSet.fromlist((nds for rc, nds in retcodes if rc == 0))

        output = []
        for out, nodes in iterbuf:
            if error_only:
                nodes = NodeSet(nodes) - ok_nodes
            if nodes and out:
                for lbuf in out.splitlines():
                    output.append(' > %s: %s' %
                                  (self.string_color(nodes, 'CYAN'), lbuf))

        for retcode, nodes in retcodes:
            if retcode == 0 and not error_only:
                output.append(' > %s exited with %s' %
                              (self.string_color(nodes, 'CYAN'),
                               self.string_color(retcode, 'GREEN')))
            elif retcode != 0:
                output.append(' > %s exited with %s' %
                              (self.string_color(nodes, 'CYAN'),
                               self.string_color(retcode, 'RED')))
        if len(timeouts):
            output.append(' > %s has %s' %
                          (self.string_color(timeouts, 'CYAN'),
                           self.string_color('timeout', 'RED')))
        return output
Example #4
 def torque_job_nodelist(self,nodelist):
     nodelist = self._exechostpat.sub('',nodelist)
     nodelist = nodelist.split('+')
     nbprocs = len(nodelist)
     nodelist = NodeSet.fromlist(nodelist)
     nbnodes = len(nodelist)
     nodelist = str(nodelist)
     return nbprocs, nbnodes, nodelist
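
A hedged walk-through of the parser above with a made-up Torque exec_host string, assuming self._exechostpat is a compiled regex that strips the per-core "/<n>" suffixes:

import re
from ClusterShell.NodeSet import NodeSet

exechostpat = re.compile(r'/\d+')            # assumed suffix pattern
raw = "node1/0+node1/1+node2/0"              # hypothetical exec_host value
parts = exechostpat.sub('', raw).split('+')  # ['node1', 'node1', 'node2']
print(len(parts))                            # nbprocs -> 3
print(NodeSet.fromlist(parts))               # duplicates fold -> node[1-2]
print(len(NodeSet.fromlist(parts)))          # nbnodes -> 2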
Example #5
    def parse(self, filename=None):
        """
        Function called to parse the content of the tuning configuration file
        and store the configuration in the object.
        """
        # Build the patterns to retrieve alias and parameter declaration
        alias_re = re.compile(r"alias\s+(\S+)\s*=\s*(\S+)$")
        parameter_re = re.compile(r'("[^"]+"|\S+)\s+(\S+)\s+(\S+)$')
        supported = NodeSet.fromlist(list(NODE_TYPES) + list(TYPE_ALIASES))

        # Open the file to read each line
        try:
            tuning_file = open(filename or self.filename)

            for line in tuning_file.readlines():

                # Skip comments and blanks
                line = line.split('#', 1)[0].strip()
                if not line:
                    continue

                m_alias = alias_re.match(line)
                m_param = parameter_re.match(line)

                if m_alias:
                    # This line is an alias creation
                    self.create_parameter_alias(m_alias.group(1),
                                                m_alias.group(2))

                elif m_param:
                    # This line is a parameter instantiation
                    nodes = NodeSet.fromlist(
                                           m_param.group(3).lower().split(';'))
                    self.create_parameter(m_param.group(2), m_param.group(1),
                                          nodes & supported, nodes - supported)

                else:
                    # This line is not recognized
                    raise TuningError("Wrong tuning syntax '%s'" % line)

            tuning_file.close()

        except IOError, error:
            msg = "Error while reading tuning configuration file: %s" % error
            raise TuningError(msg)
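
For reference, a comment-only sketch of the two line formats the regexes above accept; the parameter name and node types shown are made up for illustration:

# Matched by alias_re (alias declaration):
#     alias panic_on_lbug = lnet.panic_on_lbug
#
# Matched by parameter_re (quoted or bare value, parameter name,
# ';'-separated node types checked against `supported`):
#     "30" lnet.debug mds;oss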
Example #6
 def iter_retcodes(self, match_keys=None):
     """
     Returns an iterator over return codes and associated NodeSet.
     If the optional parameter match_keys is defined, only keys
     found in match_keys are returned.
     """
     self._task_bound_check()
     for rc, keys in self.task._rc_iter_by_worker(self, match_keys):
         yield rc, NodeSet.fromlist(keys)
Example #7
    def __init__(self, name, value, node_types=None, node_list=None):
        self.name = name
        self.value = value
        self._node_types = set()

        self.node_types = node_types or set()
        self.node_list = NodeSet()
        if node_list is not None:
            self.node_list = NodeSet.fromlist(node_list)
Example #8
 def nodes_timeout(self):
     """Get nodeset of timeout nodes for this action."""
     if self.worker:
         if isinstance(self.worker, WorkerPopen):
             if self.worker.did_timeout():
                 return NodeSet("localhost")
         else:
             return NodeSet.fromlist(list(self.worker.iter_keys_timeout()))
     return NodeSet()
Example #9
 def checkNodes(self):
     try:
        # print command info
        print '\n== Checking active nodes =='
        # run a simple command on the specified nodes
        task_self().run('echo OK', nodes=self.ns)
        # retrieve and check return code
        for retcode, nodes in task_self().iter_retcodes():
            if retcode in (0, 1, 2):
                # add nodes to OK set
                self.ns_ok |= NodeSet.fromlist(nodes)
                print '%s : OK' % nodes
            else:
                # add nodes to KO set
                self.ns_ko |= NodeSet.fromlist(nodes)
                print '%s : KO' % nodes
     # syntax error
     except NodeSetException:
         print >> sys.stderr, '(!) Error : the submitted nodeset [%s] is not valid' % self.ns
Example #10
 def connected(self, src_ns):
     """find out and return the aggregation of directly connected children
     from src_ns.
     Argument src_ns is expected to be a NodeSet instance. Result is returned
     as a NodeSet instance
     """
     next_hop = NodeSet.fromlist([dst for dst in [route.dest(src_ns) for route in self._routes] if dst is not None])
     if len(next_hop) == 0:
         return None
     return next_hop
Example #11
 def iter_errors(self, match_keys=None):
     """
     Returns an iterator over available error buffers and associated
     NodeSet. If the optional parameter match_keys is defined, only
     keys found in match_keys are returned.
     """
     self._task_bound_check()
     for msg, keys in self.task._call_tree_matcher(
             self.task._msgtree(self.SNAME_STDERR).walk, match_keys, self):
         yield msg, NodeSet.fromlist(keys)
Example #12
 def _live_line(self, worker):
     # if all nodes have replied, display gathered line
     while self._mtreeq and len(self._mtreeq[0]) == len(self._nodes):
         mtree = self._mtreeq.pop(0)
         self._offload += 1
         self._runtimer_clean()
         nodesetify = lambda v: (v[0], NodeSet.fromlist(v[1]))
         for buf, nodeset in sorted(map(nodesetify, mtree.walk()),
                                    key=bufnodeset_cmpkey):
             self._display.print_gather(nodeset, buf)
         self._runtimer_set_dirty()
Example #13
def nodeset_cmp(ns1, ns2):
    """Compare 2 nodesets by their length (we want larger nodeset
    first) and then by first node."""
    len_cmp = cmp(len(ns2), len(ns1))
    if not len_cmp:
        smaller = NodeSet.fromlist([ns1[0], ns2[0]])[0]
        if smaller == ns1[0]:
            return -1
        else:
            return 1
    return len_cmp
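
nodeset_cmp() depends on the cmp() builtin, which only exists in Python 2. A hedged Python 3 rendition as a sort key function (it approximates the first-node tiebreak with a plain string comparison, which can order slightly differently than the fold-and-pick trick above):

def nodeset_cmpkey(ns):
    # larger nodesets first, then tie-break on the first node's name
    return (-len(ns), str(ns[0]))

# usage sketch: sorted(list_of_nodesets, key=nodeset_cmpkey)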
Example #14
def makedepgraph(config, rules, components_lists, options):
    """
    Return the dependency graph for the given 'rules' and
    'components_lists'.
    """
    ruleset = RuleSet(rules.values())
    all_set = get_component_set_from(config, components_lists)
    force_opt = options.force
    force_rule = force_opt.split(',') if force_opt is not None else []
    docache = getattr(options, 'docache', 'yes') == 'yes'
    _LOGGER.debug("Caching filter results (docache) is: %s", docache)
    depgraph = ruleset.get_depgraph(all_set, force_rule, docache)
    if _LOGGER.isEnabledFor(logging.DEBUG):
        _LOGGER.debug("Components set: %s",
                      NodeSet.fromlist([to_str_from_unicode(x.id) for x in all_set]))
        _LOGGER.debug("Remaining: %s",
                      NodeSet.fromlist([str(x) for x in depgraph.remaining_components]))
        _LOGGER.debug("List: %s",
                      NodeSet.fromlist([str(x) for x in depgraph.components_map]))

    return depgraph
Example #15
    def ev_close(self, worker, timedout):
        # Worker is closing -- it's time to gather results...
        self._runtimer_finalize(worker)

        for mtree in self._mtreeq:
            nodesetify = lambda v: (v[0], NodeSet.fromlist(v[1]))
            for buf, nodeset in sorted(map(nodesetify, mtree.walk()),
                                       key=bufnodeset_cmpkey):
                self._display.print_gather(nodeset, buf)

        self._close_common(worker)

        # Notify main thread to update its prompt
        self.update_prompt(worker)
Example #16
    def print_running_tasks(self):
        '''Rewrite the current line and print the current running tasks'''
        rtasks = [t.parent.name for t in action_manager_self().running_tasks]
        if rtasks and self._show_running:
            tasks_disp = '[%s]' % NodeSet.fromlist(rtasks)
            width = min(self._pl_width, self._term_width)

            # truncate display to avoid a buggy display when the length of
            # the displayed tasks is bigger than the screen width
            if len(tasks_disp) >= self._term_width:
                tasks_disp = "%s...]" % tasks_disp[:self._term_width - 4]

            eol = ' ' * (width - len(tasks_disp))
            if not self.cleanup:
                eol = ''

            sys.stderr.write('%s%s\r' % (tasks_disp, eol))
            sys.stderr.flush()
            self._pl_width = len(tasks_disp)
Example #17
def display_tree(tree, disp, out):
    """display sub-routine for clubak -T (msgtree trace mode)"""
    togh = True
    offset = 2
    reldepth = -offset
    reldepths = {}
    line_mode = disp.line_mode
    for msgline, keys, depth, nchildren in tree.walk_trace():
        if togh:
            if depth in reldepths:
                reldepth = reldepths[depth]
            else:
                reldepth = reldepths[depth] = reldepth + offset
            nodeset = NodeSet.fromlist(keys)
            if line_mode:
                out.write(str(nodeset).encode() + b':\n')
            else:
                out.write(disp.format_header(nodeset, reldepth))
        out.write(b' ' * reldepth + msgline + b'\n')
        togh = nchildren != 1
Example #18
    def _distant_action_by_server(self, action_class, servers, **kwargs):

        # filter local server
        distant_servers = Server.distant_servers(servers)

        # perform action on distant servers
        if len(distant_servers) > 0:
            action = action_class(nodes=distant_servers, fs=self, **kwargs)
            action.launch()
            self._run_actions()

            if action.status() == ACT_ERROR:
                err_code = None
                if task_self().num_timeout():
                    err_code = -1
                elif task_self().max_retcode():
                    err_code = task_self().max_retcode()

                # FSRemoteError is limited and cannot handle more than 1 error
                msg, nodes = list(self.proxy_errors.walk())[0]
                nodes = NodeSet.fromlist(nodes)
                msg = str(msg).replace('THIS_SHINE_HOST', str(nodes))
                raise FSRemoteError(nodes, err_code, msg)
Example #19
def check_file_exists(hosts, filename, user=None):
    """Check if a specified file exist on each specified hosts.

    If specified, verify that the file exists and is owned by the user.

    Args:
        hosts (list): list of hosts
        filename (str): file to check for the existence of on each host
        user (str, optional): owner of the file. Defaults to None.

    Returns:
        (bool, NodeSet): A tuple of:
            - True if the file exists on each of the hosts; False otherwise
            - A NodeSet of hosts on which the file does not exist

    """
    missing_file = NodeSet()
    command = "test {} '{}'".format("-e" if user is None else "-O", filename)
    task = run_task(hosts, command)
    for ret_code, node_list in task.iter_retcodes():
        if ret_code != 0:
            missing_file.add(NodeSet.fromlist(node_list))

    return len(missing_file) == 0, missing_file
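
A hypothetical call site for the helper above (host names and path are invented; run_task() is assumed to come from the surrounding test framework):

all_exist, missing = check_file_exists(["node1", "node2"], "/etc/hosts")
if not all_exist:
    print("file missing on {}".format(missing))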
Example #20
    def print_action_results(self, action, error_only=False):
        '''Remove the current line and write grouped results of an action'''
        line = ['%s %s ran in %.2f s' % \
            (self.string_color(action.name, 'MAGENTA'),
             action.parent.fullname(),
             action.duration)]
        buffers = []
        retcodes = []
        timeout = NodeSet()
        # Local action
        if action.worker.current_node is None:
            buffers = [(action.worker.read(), 'localhost')]
            if action.worker.did_timeout():
                timeout.add('localhost')
            if action.worker.retcode() is not None:
                retcodes.append((action.worker.retcode(), 'localhost'))
        # Remote action
        else:
            buffers = action.worker.iter_buffers()
            retcodes = action.worker.iter_retcodes()
            timeout = NodeSet.fromlist(action.worker.iter_keys_timeout())

        line += self.__gen_action_output(buffers, retcodes, timeout, error_only)
        self.output("\n".join(line))
Example #21
    def remote_copy(self, hostlist, remote_dir, local_dir):
        """Copy files from remote dir to local dir.

        Args:
            hostlist (list): list of remote nodes
            remote_dir (str): remote directory of files
            local_dir (str): local directory

        Raises:
            SoakTestError: if there is an error with the remote copy

        """
        this_host = socket.gethostname()
        result = pcmd(
            NodeSet.fromlist(hostlist),
            "if [ ! -z '$(ls -A {0})' ]; then "
            "scp -p -r {0}/ \"{1}:'{2}/'\" && rm -rf {0}/*; fi".format(
                remote_dir, this_host, local_dir),
            verbose=False)
        if len(result) > 1 or 0 not in result:
            raise SoakTestError(
                "Error executing remote copy: {}".format(
                    ", ".join(
                        [str(result[key]) for key in result if key != 0])))
Example #22
def get_active_network_interfaces(hosts, verbose=True):
    """Get all active network interfaces on the hosts.

    Args:
        hosts (NodeSet): hosts on which to find active interfaces
        verbose (bool, optional): display command details. Defaults to True.

    Returns:
        dict: a dictionary of interface keys and NodeSet values on which they were found

    """
    net_path = os.path.join(os.path.sep, "sys", "class", "net")
    operstate = os.path.join(net_path, "*", "operstate")
    command = " | ".join([
        f"grep -l 'up' {operstate}", "grep -Ev '/(lo|bonding_masters)/'",
        "sort"
    ])
    task = run_task(hosts, command, verbose=verbose)
    if verbose:
        display_task(task)

    # Populate a dictionary of active interfaces with a NodeSet of the hosts
    # on which each interface was found
    active_interfaces = {}
    for output, nodelist in task.iter_buffers():
        output_lines = [line.decode("utf-8") for line in output]
        nodeset = NodeSet.fromlist(nodelist)
        for line in output_lines:
            try:
                interface = line.split("/")[-2]
                if interface not in active_interfaces:
                    active_interfaces[interface] = NodeSet()
                active_interfaces[interface].update(nodeset)
            except IndexError:
                pass

    return active_interfaces
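
A hypothetical usage of the function above, assuming a NodeSet of reachable test hosts:

interfaces = get_active_network_interfaces(NodeSet("node[1-4]"), verbose=False)
for name, on_hosts in sorted(interfaces.items()):
    print(f"{name} is up on {on_hosts}")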
Example #23
    def _parse_token(self, token):
        """Concrete implementation of parent abstract method.

        :Parameters:
            according to parent :py:meth:`cumin.backends.BaseQueryAggregator._parse_token`.
        """
        # This should never happen
        if not isinstance(token, pp.ParseResults):  # pragma: no cover
            raise InvalidQueryError(
                'Expecting ParseResults object, got {type}: {token}'.format(
                    type=type(token), token=token))

        token_dict = token.asDict()
        self.logger.trace('Token is: %s | %s', token_dict, token)

        if 'hosts' in token_dict:
            element = self._get_stack_element()
            element['hosts'] = NodeSet.fromlist(token_dict['hosts'],
                                                resolver=self.resolver)
            if 'bool' in token_dict:
                element['bool'] = token_dict['bool']
            self.stack_pointer['children'].append(element)
        elif 'open_subgroup' in token_dict and 'close_subgroup' in token_dict:
            self._open_subgroup()
            if 'bool' in token_dict:
                self.stack_pointer['bool'] = token_dict['bool']
            for subtoken in token:
                # Skip grammar literals, boolean operators and parentheses
                if isinstance(subtoken, str):
                    continue
                self._parse_token(subtoken)
            self._close_subgroup()
        else:  # pragma: no cover - this should never happen
            raise InvalidQueryError(
                'Got unexpected token: {token}'.format(token=token))
Example #24
    def run_soak(self, test_param):
        """Run the soak test specified by the test params.

        Args:
            test_param (str): test_params from yaml file

        """
        self.soak_results = {}
        self.pool = []
        self.container = []
        self.harasser_results = {}
        self.harasser_args = {}
        run_harasser = False
        self.all_failed_jobs = []
        self.all_failed_harassers = []
        self.soak_errors = []
        self.check_errors = []
        self.used = []
        test_to = self.params.get("test_timeout", test_param + "*")
        self.test_name = self.params.get("name", test_param + "*")
        single_test_pool = self.params.get("single_test_pool",
                                           test_param + "*", True)
        self.dmg_command.copy_certificates(get_log_file("daosCA/certs"),
                                           self.hostlist_clients)
        self.dmg_command.copy_configuration(self.hostlist_clients)
        harassers = self.params.get("harasserlist", test_param + "*")
        job_list = self.params.get("joblist", test_param + "*")
        if harassers:
            run_harasser = True
            self.log.info("<< Initial harasser list = %s>>", harassers)
            harasserlist = harassers[:]
        # Create the reserved pool with data
        # self.pool is a list of all the pools used in soak
        # self.pool[0] will always be the reserved pool
        add_pools(self, ["pool_reserved"])
        # Create the reserved container
        resv_cont = self.get_container(self.pool[0],
                                       "/run/container_reserved/*", True)
        # populate reserved container with a 500MB file
        initial_resv_file = os.path.join(os.environ["DAOS_TEST_LOG_DIR"],
                                         "initial", "resv_file")
        try:
            reserved_file_copy(self,
                               initial_resv_file,
                               self.pool[0],
                               resv_cont,
                               num_bytes=500000000,
                               cmd="write")
        except CommandFailure as error:
            raise SoakTestError(
                "<<FAILED: Soak reserved container write failed>>") from error

        # Create pool for jobs
        if single_test_pool:
            add_pools(self, ["pool_jobs"])
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))

        # cleanup soak log directories before test on all nodes
        result = slurm_utils.srun(NodeSet.fromlist(self.hostlist_clients),
                                  "rm -rf {}".format(self.log_dir),
                                  self.srun_params)
        if result.exit_status > 0:
            raise SoakTestError("<<FAILED: Soak directories not removed"
                                " from clients>>: {}".format(
                                    self.hostlist_clients))
        # cleanup test_node
        for log_dir in [self.log_dir, self.sharedlog_dir]:
            cmd = "rm -rf {}".format(log_dir)
            try:
                result = run_command(cmd, timeout=30)
            except DaosTestError as error:
                raise SoakTestError(
                    "<<FAILED: Soak directory {} was not removed>>".format(
                        log_dir)) from error

        # Initialize time
        start_time = time.time()
        self.test_timeout = int(3600 * test_to)
        self.end_time = start_time + self.test_timeout
        self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
        while time.time() < self.end_time:
            # Start new pass
            start_loop_time = time.time()
            self.log.info("<<SOAK LOOP %s: time until done %s>>", self.loop,
                          DDHHMMSS_format(self.end_time - time.time()))
            if not single_test_pool:
                # Create pool for jobs
                add_pools(self, ["pool_jobs"])
                self.log.info("Current pools: %s",
                              " ".join([pool.uuid for pool in self.pool]))
            # Initialize harassers
            if run_harasser:
                if not harasserlist:
                    harasserlist = harassers[:]
                harasser = harasserlist.pop(0)
                self.harasser_args = {}
                self.harasser_results = {}
                self.harassers, self.offline_harassers = get_harassers(
                    harasser)
            try:
                self.execute_jobs(job_list, self.pool[1])
            except SoakTestError as error:
                self.fail(error)
            # Check space after jobs done
            for pool in self.pool:
                self.dmg_command.pool_query(pool.uuid)
            self.soak_errors.extend(self.destroy_containers(self.container))
            self.container = []
            # Remove the test pools from self.pool; preserving reserved pool
            if not single_test_pool:
                self.soak_errors.extend(self.destroy_pools(self.pool[1]))
                self.pool = [self.pool[0]]
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
            # Fail if the pool/containers did not clean up correctly
            self.assertEqual(len(self.soak_errors), 0,
                             "\n".join(self.soak_errors))
            # Break out of loop if smoke
            if "smoke" in self.test_name:
                break
            loop_time = time.time() - start_loop_time
            self.log.info("<<LOOP %s completed in %s at %s>>", self.loop,
                          DDHHMMSS_format(loop_time), time.ctime())
            # Initialize harasser loop time from first pass loop time
            if self.loop == 1 and run_harasser:
                self.harasser_loop_time = loop_time
            self.loop += 1
        # verify reserved container data
        final_resv_file = os.path.join(os.environ["DAOS_TEST_LOG_DIR"],
                                       "final", "resv_file")
        try:
            reserved_file_copy(self, final_resv_file, self.pool[0], resv_cont)
        except CommandFailure as error:
            raise SoakTestError(
                "<<FAILED: Soak reserved container read failed>>") from error

        if not cmp(initial_resv_file, final_resv_file):
            self.soak_errors.append("Data verification error on reserved pool"
                                    " after SOAK completed")
        for file in [initial_resv_file, final_resv_file]:
            if os.path.isfile(file):
                file_name = os.path.split(os.path.dirname(file))[-1]
                # save a copy of the POSIX file in self.outputsoakdir
                copy_cmd = "cp -p {} {}/{}_resv_file".format(
                    file, self.outputsoakdir, file_name)
                try:
                    run_command(copy_cmd, timeout=30)
                except DaosTestError as error:
                    self.soak_errors.append(
                        "Reserved data file {} failed to archive".format(file))
                os.remove(file)
        self.container.append(resv_cont)
        # Gather the daos logs from the client nodes
        self.log.info("<<<<SOAK TOTAL TEST TIME = %s>>>>",
                      DDHHMMSS_format(time.time() - start_time))
Example #25
    def verify_expected_states(self, set_expected=False):
        """Verify that the expected job process states match the current states.

        Args:
            set_expected (bool, optional): option to update the expected job
                process states to the current states prior to verification.
                Defaults to False.

        Returns:
            dict: a dictionary of whether or not any of the job process states
                were not 'expected' (which should warrant an error) and whether
                or not the job process require a 'restart' (either due to any
                unexpected states or because at least one job process was no
                longer found to be running)

        """
        status = {"expected": True, "restart": False}
        show_log_hosts = []

        # Get the current state of each job process
        current_states = self.get_current_state()
        if set_expected:
            # Assign the expected states to the current job process states
            self.log.info("<%s> Assigning expected %s states.",
                          self._id.upper(), self._id)
            self._expected_states = current_states.copy()

        # Verify the expected states match the current states
        self.log.info("<%s> Verifying %s states: group=%s, hosts=%s",
                      self._id.upper(), self._id,
                      self.get_config_value("name"),
                      NodeSet.fromlist(self._hosts))
        if current_states:
            log_format = "  %-4s  %-15s  %-36s  %-22s  %-14s  %s"
            self.log.info(log_format, "Rank", "Host", "UUID", "Expected State",
                          "Current State", "Result")
            self.log.info(log_format, "-" * 4, "-" * 15, "-" * 36, "-" * 22,
                          "-" * 14, "-" * 6)

            # Verify that each expected rank appears in the current states
            for rank in sorted(self._expected_states):
                current_host = self._expected_states[rank]["host"]
                expected = self._expected_states[rank]["state"]
                if isinstance(expected, (list, tuple)):
                    expected = [item.lower() for item in expected]
                else:
                    expected = [expected.lower()]
                try:
                    current_rank = current_states.pop(rank)
                    current = current_rank["state"].lower()
                except KeyError:
                    current = "not detected"

                # Check if the job's expected state matches the current state
                result = "PASS" if current in expected else "RESTART"
                status["expected"] &= current in expected

                # Restart all job processes if the expected rank is not running
                if current not in self._states["running"]:
                    status["restart"] = True
                    result = "RESTART"

                # Keep track of any server in the errored state or in an
                # unexpected state in order to display its log
                if (current in self._states["errored"]
                        or current not in expected):
                    if current_host not in show_log_hosts:
                        show_log_hosts.append(current_host)

                self.log.info(log_format, rank, current_host,
                              self._expected_states[rank]["uuid"],
                              "|".join(expected), current, result)

        elif not self._expected_states:
            # Expected states are populated as part of start() procedure,
            # so if it is empty there was an error starting the job processes.
            self.log.info(
                "  Unable to obtain current %s state.  Undefined expected %s "
                "states due to a failure starting the %s.",
                self._id,
                self._id,
                self._id,
            )
            status["restart"] = True

        else:
            # Any failure to obtain the current rank information is an error
            self.log.info(
                "  Unable to obtain current %s state.  If the %ss are "
                "not running this is expected.", self._id, self._id)

            # Do not report an error if all servers are expected to be stopped
            all_stopped = bool(self._expected_states)
            for rank in sorted(self._expected_states):
                states = self._expected_states[rank]["state"]
                if not isinstance(states, (list, tuple)):
                    states = [states]
                if "stopped" not in [item.lower() for item in states]:
                    all_stopped = False
                    break
            if all_stopped:
                self.log.info("  All %s are expected to be stopped.", self._id)
                status["restart"] = True
            else:
                status["expected"] = False

        # Any unexpected state detected warrants a restart of all job processes
        if not status["expected"]:
            status["restart"] = True

        # Set the verified timestamp
        if set_expected and hasattr(self.manager, "timestamps"):
            self.manager.timestamps["verified"] = datetime.now().strftime(
                "%Y-%m-%d %H:%M:%S")

        # Dump the server logs for any identified server
        if show_log_hosts:
            self.log.info(
                "<SERVER> logs for ranks in the errored state since start "
                "detection or detected in an unexpected state")
            if hasattr(self.manager, "dump_logs"):
                self.manager.dump_logs(show_log_hosts)

        return status
Example #26
    def _get_deps(self, component, rule):
        """
        Find dependencies of a given component. This implies calling
        the rule.depsfinder script. Substitution of variables is done.
        Returns None if the given rule has already been applied on
        the given component.
        """
        result = dict()
        depsfinder = rule.depsfinder
        if rule.dependson is None or len(rule.dependson) == 0 or \
                depsfinder is None or len(depsfinder) == 0:
            _LOGGER.debug("No 'DepsFinder' or 'DependsOn' specified" + \
                              " in rule %s for component %s. Skipping.",
                          rule, component)
            return result
        var_map = _get_var_map(component.id,
                               component.name,
                               component.type,
                               component.category,
                               self.ruleset.name,
                               rule.name,
                               rule.help)
        cmd = substitute(var_map, depsfinder)
        _LOGGER.debug("Calling depsfinder for component %s: %s", component, cmd)
        popen_args = shlex.split(to_str_from_unicode(cmd, should_be_uni=True))
        try:
            popen = subprocess.Popen(popen_args,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE,
                                     bufsize=-1) # Use system default
        except OSError as ose:
            _LOGGER.error("Can't call depsfinder '%s': %s", cmd, ose)
            return result

        (msg_std, msg_err) = popen.communicate()
        msg_std = msg_std.strip()
        msg_err = msg_err.strip()
        if len(msg_err) != 0:
            _LOGGER.warning("Depsfinder error when "
                            "applying rule %s to component %s: %s",
                            rule, component, msg_err)
        deps = set()
        with StringIO(to_unicode(msg_std)) as reader:
            for dep in reader:
                dep_id = dep.strip()
                if len(dep_id) == 0:
                    continue
                dependency = self.components_map.get(dep_id)
                if dependency is None:
                    _LOGGER.debug("Creating dep for component %s with id: %r",
                                  component, dep_id)
                    dependency = Component(dep_id)
                    self.components_map[dep_id] = dependency

                deps.add(dependency)
                _update_graph_with_node(self.dag, dep_id)

        if _LOGGER.isEnabledFor(INFO):
            _LOGGER.info("%s.depsfinder(%s): %s",
                         rule.name, component.id,
                         NodeSet.fromlist([str(x.id) for x in deps]))
        # Find match only on rule.dependson
        return _find_match([self.ruleset.rules_for[x] for x in rule.dependson],
                           deps)
Example #27
    def run_soak(self, test_param):
        """Run the soak test specified by the test params.

        Args:
            test_param (str): test_params from yaml file

        """
        self.soak_results = {}
        self.pool = []
        self.container = []
        self.harasser_results = {}
        self.harasser_args = {}
        run_harasser = False
        self.all_failed_jobs = []
        self.all_failed_harassers = []
        self.soak_errors = []
        test_to = self.params.get("test_timeout", test_param + "*")
        self.job_timeout = self.params.get("job_timeout", test_param + "*")
        self.test_name = self.params.get("name", test_param + "*")
        self.nodesperjob = self.params.get("nodesperjob", test_param + "*")
        self.taskspernode = self.params.get("taskspernode", test_param + "*")
        harassers = self.params.get("harasserlist", test_param + "*")
        job_list = self.params.get("joblist", test_param + "*")
        rank = self.params.get("rank", "/run/container_reserved/*")
        obj_class = self.params.get("oclass", "/run/container_reserved/*")
        if harassers:
            harasserlist = get_harassers(harassers)
            self.harassers = harasserlist[:]
            run_harasser = True
            self.log.info("<< Initial harasser list = %s>>",
                          " ".join(self.harassers))
        # Create the reserved pool with data
        # self.pool is a list of all the pools used in soak
        # self.pool[0] will always be the reserved pool
        add_pools(self, ["pool_reserved"])
        self.pool[0].connect()

        # Create the container and populate with a known data
        # TO-DO: use IOR to write and later read verify the data
        resv_cont = self.get_container(self.pool[0],
                                       "/run/container_reserved/*", True)
        resv_cont.write_objects(rank, obj_class)

        # cleanup soak log directories before test on all nodes
        result = slurm_utils.srun(NodeSet.fromlist(self.hostlist_clients),
                                  "rm -rf {}".format(self.log_dir),
                                  self.srun_params)
        if result.exit_status > 0:
            raise SoakTestError("<<FAILED: Soak directories not removed"
                                " from clients>>: {}".format(
                                    self.hostlist_clients))
        # cleanup test_node
        for log_dir in [self.log_dir, self.sharedlog_dir]:
            cmd = "rm -rf {}".format(log_dir)
            try:
                result = run_command(cmd, timeout=30)
            except DaosTestError as error:
                raise SoakTestError(
                    "<<FAILED: Soak directory {} was not removed {}>>".format(
                        log_dir, error))

        # Initialize time
        start_time = time.time()
        self.test_timeout = int(3600 * test_to)
        self.end_time = start_time + self.test_timeout
        self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
        while time.time() < self.end_time:
            # Start new pass
            start_loop_time = time.time()
            self.log.info("<<SOAK LOOP %s: time until done %s>>", self.loop,
                          DDHHMMSS_format(self.end_time - time.time()))
            # Create pool for jobs
            add_pools(self, ["pool_jobs"])
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
            # Initialize if harassers
            if run_harasser and not self.harassers:
                self.harasser_results = {}
                self.harasser_args = {}
                self.harassers = harasserlist[:]
            try:
                self.execute_jobs(job_list, self.pool[1])
            except SoakTestError as error:
                self.fail(error)
            # Check space after jobs done
            for pool in self.pool:
                self.dmg_command.pool_query(pool.uuid)
            self.soak_errors.extend(self.destroy_containers(self.container))
            self.soak_errors.extend(self.destroy_pools(self.pool[1]))
            # remove the test pools from self.pool; preserving reserved pool
            self.container = []
            self.pool = [self.pool[0]]
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
            # fail if the pool/containers did not clean up correctly
            self.assertEqual(len(self.soak_errors), 0,
                             "\n".join(self.soak_errors))
            # Break out of loop if smoke
            if "smoke" in self.test_name:
                break
            loop_time = time.time() - start_loop_time
            self.log.info("<<LOOP %s completed in %s at %s>>", self.loop,
                          DDHHMMSS_format(loop_time), time.ctime())
            # Initialize harasser loop time from first pass loop time
            if self.loop == 1 and self.harassers:
                self.harasser_loop_time = loop_time
            self.loop += 1
        # TO-DO: use IOR
        if not resv_cont.read_objects():
            self.soak_errors.append("Data verification error on reserved pool"
                                    " after SOAK completed")
        self.container.append(resv_cont)
        # gather the daos logs from the client nodes
        self.log.info("<<<<SOAK TOTAL TEST TIME = %s>>>>",
                      DDHHMMSS_format(time.time() - start_time))
Example #28
def ttyloop(task, nodeset, timeout, display, remote):
    """Manage the interactive prompt to run command"""
    readline_avail = False
    interactive = task.default("USER_interactive")
    if interactive:
        try:
            import readline
            readline_setup()
            readline_avail = True
        except ImportError:
            pass
        display.vprint(VERB_STD, \
            "Enter 'quit' to leave this interactive mode")

    rc = 0
    ns = NodeSet(nodeset)
    ns_info = True
    cmd = ""
    while task.default("USER_running") or \
            (interactive and cmd.lower() != 'quit'):
        try:
            # Set SIGUSR1 handler if needed
            if task.default("USER_handle_SIGUSR1"):
                signal.signal(signal.SIGUSR1, signal_handler)

            if task.default("USER_interactive") and \
                    not task.default("USER_running"):
                if ns_info:
                    display.vprint(VERB_QUIET, \
                                   "Working with nodes: %s" % ns)
                    ns_info = False
                prompt = "clush> "
            else:
                prompt = ""
            try:
                cmd = raw_input(prompt)
                assert cmd is not None, "Result of raw_input() is None!"
            finally:
                signal.signal(signal.SIGUSR1, signal.SIG_IGN)
        except EOFError:
            print()
            return
        except UpdatePromptException:
            if task.default("USER_interactive"):
                continue
            return
        except KeyboardInterrupt as kbe:
            # Caught SIGINT here (main thread) but the signal will also reach
            # subprocesses (that will most likely kill them)
            if display.gather:
                # Suspend task, so we can safely access its data from here
                task.suspend()

                # If USER_running is not set, the task had time to finish,
                # that could mean all subprocesses have been killed and all
                # handlers have been processed.
                if not task.default("USER_running"):
                    # let's clush_excepthook handle the rest
                    raise kbe

                # If USER_running is set, the task didn't have time to finish
                # its work, so we must print something for the user...
                print_warn = False

                # Display command output, but cannot order buffers by rc
                nodesetify = lambda v: (v[0], NodeSet._fromlist1(v[1]))
                for buf, nodeset in sorted(map(nodesetify, task.iter_buffers()),
                                           key=bufnodeset_cmpkey):
                    if not print_warn:
                        print_warn = True
                        display.vprint_err(VERB_STD, \
                            "Warning: Caught keyboard interrupt!")
                    display.print_gather(nodeset, buf)

                # Return code handling
                verbexit = VERB_QUIET
                if display.maxrc:
                    verbexit = VERB_STD
                ns_ok = NodeSet()
                for rc, nodelist in task.iter_retcodes():
                    ns_ok.add(NodeSet._fromlist1(nodelist))
                    if rc != 0:
                        # Display return code if not ok ( != 0)
                        nsdisp = ns = NodeSet._fromlist1(nodelist)
                        if display.verbosity >= VERB_QUIET and len(ns) > 1:
                            nsdisp = "%s (%d)" % (ns, len(ns))
                        msgrc = "clush: %s: exited with exit code %d" % (nsdisp,
                                                                         rc)
                        display.vprint_err(verbexit, msgrc)

                # Add uncompleted nodeset to exception object
                kbe.uncompleted_nodes = ns - ns_ok

                # Display nodes that didn't answer within command timeout delay
                if task.num_timeout() > 0:
                    display.vprint_err(verbexit, \
                        "clush: %s: command timeout" % \
                            NodeSet._fromlist1(task.iter_keys_timeout()))
            raise kbe

        if task.default("USER_running"):
            ns_reg, ns_unreg = NodeSet(), NodeSet()
            for client in task._engine.clients():
                if client.registered:
                    ns_reg.add(client.key)
                else:
                    ns_unreg.add(client.key)
            if ns_unreg:
                pending = "\nclush: pending(%d): %s" % (len(ns_unreg), ns_unreg)
            else:
                pending = ""
            display.vprint_err(VERB_QUIET,
                               "clush: interrupt (^C to abort task)")
            gws = list(task.gateways)
            if not gws:
                display.vprint_err(VERB_QUIET,
                                   "clush: in progress(%d): %s%s"
                                   % (len(ns_reg), ns_reg, pending))
            else:
                display.vprint_err(VERB_QUIET,
                                   "clush: in progress(%d): %s%s\n"
                                   "clush: [tree] open gateways(%d): %s"
                                   % (len(ns_reg), ns_reg, pending,
                                      len(gws), NodeSet._fromlist1(gws)))
            for gw, (chan, metaworkers) in task.gateways.items():
                act_targets = NodeSet.fromlist(mw.gwtargets[gw]
                                               for mw in metaworkers)
                if act_targets:
                    display.vprint_err(VERB_QUIET,
                                       "clush: [tree] in progress(%d) on %s: %s"
                                       % (len(act_targets), gw, act_targets))
        else:
            cmdl = cmd.lower()
            try:
                ns_info = True
                if cmdl.startswith('+'):
                    ns.update(cmdl[1:])
                elif cmdl.startswith('-'):
                    ns.difference_update(cmdl[1:])
                elif cmdl.startswith('@'):
                    ns = NodeSet(cmdl[1:])
                elif cmdl == '=':
                    display.gather = not display.gather
                    if display.gather:
                        display.vprint(VERB_STD, \
                            "Switching to gathered output format")
                    else:
                        display.vprint(VERB_STD, \
                            "Switching to standard output format")
                    task.set_default("stdout_msgtree", \
                                     display.gather or display.line_mode)
                    ns_info = False
                    continue
                elif not cmdl.startswith('?'): # if ?, just print ns_info
                    ns_info = False
            except NodeSetParseError:
                display.vprint_err(VERB_QUIET, \
                    "clush: nodeset parse error (ignoring)")

            if ns_info:
                continue

            if cmdl.startswith('!') and len(cmd.strip()) > 0:
                run_command(task, cmd[1:], None, timeout, display, remote)
            elif cmdl != "quit":
                if not cmd:
                    continue
                if readline_avail:
                    readline.write_history_file(get_history_file())
                run_command(task, cmd, ns, timeout, display, remote)
    return rc
Example #29
def ttyloop(task, nodeset, timeout, display, remote):
    """Manage the interactive prompt to run command"""
    readline_avail = False
    interactive = task.default("USER_interactive")
    if interactive:
        try:
            import readline
            readline_setup()
            readline_avail = True
        except ImportError:
            pass
        display.vprint(VERB_STD, \
            "Enter 'quit' to leave this interactive mode")

    rc = 0
    ns = NodeSet(nodeset)
    ns_info = True
    cmd = ""
    while task.default("USER_running") or \
            (interactive and cmd.lower() != 'quit'):
        try:
            # Set SIGUSR1 handler if needed
            if task.default("USER_handle_SIGUSR1"):
                signal.signal(signal.SIGUSR1, signal_handler)

            if task.default("USER_interactive") and \
                    not task.default("USER_running"):
                if ns_info:
                    display.vprint(VERB_QUIET, \
                                   "Working with nodes: %s" % ns)
                    ns_info = False
                prompt = "clush> "
            else:
                prompt = ""
            try:
                cmd = raw_input(prompt)
                assert cmd is not None, "Result of raw_input() is None!"
            finally:
                signal.signal(signal.SIGUSR1, signal.SIG_IGN)
        except EOFError:
            print()
            return
        except UpdatePromptException:
            if task.default("USER_interactive"):
                continue
            return
        except KeyboardInterrupt as kbe:
            # Caught SIGINT here (main thread) but the signal will also reach
            # subprocesses (that will most likely kill them)
            if display.gather:
                # Suspend task, so we can safely access its data from here
                task.suspend()

                # If USER_running is not set, the task had time to finish,
                # that could mean all subprocesses have been killed and all
                # handlers have been processed.
                if not task.default("USER_running"):
                    # let's clush_excepthook handle the rest
                    raise kbe

                # If USER_running is set, the task didn't have time to finish
                # its work, so we must print something for the user...
                print_warn = False

                # Display command output, but cannot order buffers by rc
                nodesetify = lambda v: (v[0], NodeSet._fromlist1(v[1]))
                for buf, nodeset in sorted(map(nodesetify,
                                               task.iter_buffers()),
                                           key=bufnodeset_cmpkey):
                    if not print_warn:
                        print_warn = True
                        display.vprint_err(VERB_STD, \
                            "Warning: Caught keyboard interrupt!")
                    display.print_gather(nodeset, buf)

                # Return code handling
                verbexit = VERB_QUIET
                if display.maxrc:
                    verbexit = VERB_STD
                ns_ok = NodeSet()
                for rc, nodelist in task.iter_retcodes():
                    ns_ok.add(NodeSet._fromlist1(nodelist))
                    if rc != 0:
                        # Display return code if not ok ( != 0)
                        nsdisp = ns = NodeSet._fromlist1(nodelist)
                        if display.verbosity >= VERB_QUIET and len(ns) > 1:
                            nsdisp = "%s (%d)" % (ns, len(ns))
                        msgrc = "clush: %s: exited with exit code %d" % (
                            nsdisp, rc)
                        display.vprint_err(verbexit, msgrc)

                # Add uncompleted nodeset to exception object
                kbe.uncompleted_nodes = ns - ns_ok

                # Display nodes that didn't answer within command timeout delay
                if task.num_timeout() > 0:
                    display.vprint_err(verbexit, \
                        "clush: %s: command timeout" % \
                            NodeSet._fromlist1(task.iter_keys_timeout()))
            raise kbe

        if task.default("USER_running"):
            ns_reg, ns_unreg = NodeSet(), NodeSet()
            for client in task._engine.clients():
                if client.registered:
                    ns_reg.add(client.key)
                else:
                    ns_unreg.add(client.key)
            if ns_unreg:
                pending = "\nclush: pending(%d): %s" % (len(ns_unreg),
                                                        ns_unreg)
            else:
                pending = ""
            display.vprint_err(VERB_QUIET,
                               "clush: interrupt (^C to abort task)")
            gws = list(task.gateways)
            if not gws:
                display.vprint_err(
                    VERB_QUIET, "clush: in progress(%d): %s%s" %
                    (len(ns_reg), ns_reg, pending))
            else:
                display.vprint_err(
                    VERB_QUIET, "clush: in progress(%d): %s%s\n"
                    "clush: [tree] open gateways(%d): %s" %
                    (len(ns_reg), ns_reg, pending, len(gws),
                     NodeSet._fromlist1(gws)))
            for gw, (chan, metaworkers) in task.gateways.items():
                act_targets = NodeSet.fromlist(mw.gwtargets[gw]
                                               for mw in metaworkers)
                if act_targets:
                    display.vprint_err(
                        VERB_QUIET, "clush: [tree] in progress(%d) on %s: %s" %
                        (len(act_targets), gw, act_targets))
        else:
            cmdl = cmd.lower()
            try:
                ns_info = True
                if cmdl.startswith('+'):
                    ns.update(cmdl[1:])
                elif cmdl.startswith('-'):
                    ns.difference_update(cmdl[1:])
                elif cmdl.startswith('@'):
                    ns = NodeSet(cmdl[1:])
                elif cmdl == '=':
                    display.gather = not display.gather
                    if display.gather:
                        display.vprint(VERB_STD, \
                            "Switching to gathered output format")
                    else:
                        display.vprint(VERB_STD, \
                            "Switching to standard output format")
                    task.set_default("stdout_msgtree", \
                                     display.gather or display.line_mode)
                    ns_info = False
                    continue
                elif not cmdl.startswith('?'):  # if ?, just print ns_info
                    ns_info = False
            except NodeSetParseError:
                display.vprint_err(VERB_QUIET, \
                    "clush: nodeset parse error (ignoring)")

            if ns_info:
                continue

            if cmdl.startswith('!') and len(cmd.strip()) > 0:
                run_command(task, cmd[1:], None, timeout, display, remote)
            elif cmdl != "quit":
                if not cmd:
                    continue
                if readline_avail:
                    readline.write_history_file(get_history_file())
                run_command(task, cmd, ns, timeout, display, remote)
    return rc
Example #30
 def ev_timeout(self, worker):
     """Timeout occurred on some nodes"""
     self.result.nodes_ko.add(NodeSet.fromlist(worker.iter_keys_timeout()))
Example #31
 def add(self, node):
     self.ns |= NodeSet.fromlist(node)
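
One caveat worth flagging for the one-liner above: NodeSet.fromlist() iterates its argument, so it only does what is intended here if `node` is a list (or similar iterable) of node names; a bare string would be split into one-character "nodes":

from ClusterShell.NodeSet import NodeSet

print(NodeSet.fromlist(["n1"]))  # -> n1
print(NodeSet.fromlist("n1"))    # -> 1,n  (each character becomes a node)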
Example #32
 def __init__(self, nodes):
     self.nodes = NodeSet.fromlist(nodes)
Example #33
    def stop(self):
        """Stop dfuse.

        Try to stop dfuse.  First try nicely using fusermount; if that fails,
        try to pkill it to see if that works.  Abort based on the result of
        the fusermount, since if pkill is necessary then dfuse itself has not
        worked correctly.

        Finally, try to remove the mount point, which itself should succeed.

        Raises:
            CommandFailure: In case dfuse stop fails

        """
        # Include all hosts when stopping to ensure all mount points in any
        # state are properly removed
        self.running_hosts.add(NodeSet.fromlist(self.hosts))

        self.log.info("Stopping dfuse at %s on %s", self.mount_dir.value,
                      self.running_hosts)

        if self.mount_dir.value and self.running_hosts:
            error_list = []

            # Loop until all fuseblk mounted devices are unmounted
            counter = 0
            while self.running_hosts and counter < 3:
                # Attempt to kill dfuse after the first unmount attempt fails
                if self.running_hosts and counter > 1:
                    kill_command = "pkill dfuse --signal KILL"
                    pcmd(self.running_hosts, kill_command, timeout=30)

                # Attempt to unmount any fuseblk mounted devices after detection
                if self.running_hosts and counter > 0:
                    pcmd(self.running_hosts,
                         self.get_umount_command(counter > 1),
                         expect_rc=None)
                    time.sleep(2)

                # Detect which hosts have fuseblk mounted devices and remove any
                # hosts which no longer have the dfuse mount point mounted
                state = self.check_mount_state(self.running_hosts)
                for host in state["unmounted"].union(state["nodirectory"]):
                    self.running_hosts.remove(host)

                # Increment the loop counter
                counter += 1

            if self.running_hosts:
                error_list.append("Error stopping dfuse on {}".format(
                    self.running_hosts))

            # Remove mount points
            try:
                self.remove_mount_point()
            except CommandFailure as error:
                error_list.append(error)

            # Report any errors
            if error_list:
                raise CommandFailure("\n".join(error_list))

        elif self.mount_dir.value is None:
            self.log.info("No dfuse mount directory defined - nothing to stop")

        else:
            self.log.info("No hosts running dfuse - nothing to stop")
Example #34
    def build_job_script(self, nodesperjob, job, pool):
        """Create a slurm batch script that will execute a list of jobs.

        Args:
            nodesperjob (int): number of nodes executing each job
            job (str): the job to define in the slurm script, read from
                /run/<job>/ in the yaml, where it is currently defined as:
            Example job:
            job1:
                name: job1    - unique name
                time: 10      - cmdline time in seconds; used in IOR -T param
                tasks: 1      - number of processes per node --ntaskspernode
                jobspec:
                    - ior_daos
                    - ior_mpiio
            pool (obj):   TestPool obj

        Returns:
            script_list: list of slurm batch scripts

        """
        self.log.info("<<Build Script for job %s >> at %s", job, time.ctime())

        script_list = []
        # create one batch script per cmdline
        # get job params
        job_params = "/run/" + job
        job_name = self.params.get("name", "/".join([job_params, "*"]))
        job_specs = self.params.get("jobspec", "/".join([job_params, "*"]))
        task_list = self.params.get("tasks", "/".join([job_params, "*"]))
        job_time = self.params.get("time", "/".join([job_params, "*"]))

        # job_time in minutes:seconds format
        job_time = str(job_time) + ":00"
        for job_spec in job_specs:
            if "ior" in job_spec:
                # Create IOR cmdline
                cmd_list = self.create_ior_cmdline(job_params, job_spec, pool)
            elif "dmg" in job_spec:
                # create dmg cmdline
                cmd_list = self.create_dmg_cmdline(job_params, job_spec, pool)
            else:
                raise SoakTestError(
                    "<<FAILED: Soak job: {} Job spec {} is invalid>>".format(
                        job, job_spec))

            # a single cmdline per batch job, so that a failure is per cmdline;
            # may change to multiple cmdlines per batch job later.
            for cmd in cmd_list:
                # additional sbatch params
                for tasks in task_list:
                    output = os.path.join(
                        self.rem_pass_dir,
                        "%N_" + self.test_name + "_" + job_name + "_" +
                        job_spec + "_results.out_%j_%t_" + str(tasks) + "_")
                    num_tasks = nodesperjob * tasks
                    sbatch = {
                        "ntasks-per-node": tasks,
                        "ntasks": num_tasks,
                        "time": job_time,
                        "partition": self.partition_clients,
                        "exclude": NodeSet.fromlist(self.hostlist_servers)
                    }
                    script = slurm_utils.write_slurm_script(
                        self.rem_pass_dir, job_name, output, nodesperjob,
                        [cmd], sbatch)
                    script_list.append(script)
        return script_list
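The sbatch dict above is handed to slurm_utils.write_slurm_script, whose real behavior is project-specific. As an illustration only, a hypothetical helper that renders such a dict into #SBATCH directives might look like:

def render_sbatch(params):
    # e.g. {"ntasks": 4, "time": "10:00"} -> "#SBATCH --ntasks=4\n#SBATCH --time=10:00"
    return "\n".join("#SBATCH --%s=%s" % (key, val) for key, val in params.items())

print(render_sbatch({"ntasks-per-node": 1, "ntasks": 4, "time": "10:00"}))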
Example #35
    def select_nodes(self, profil, name, nb_nodes, host):
        '''Select nodes to spawn'''
        # 1: recover available nodelist
        # 2: select nb_nodes among availables nodes
        # 3: return the list of nodes
        err = ""
        nodes = []
        if host is None:
            err = "Error: No host available\n"
            _LOGGER.error(err)
            self.rep_sock.send(msgpack.packb(('', [err])))
            return nodes
        if not vc.VirtualCluster.valid_clustername(name):
            err = "Error: clustername '{}' is not a valid name\n".format(name)
            _LOGGER.error(err)
            self.rep_sock.send(msgpack.packb(('', [err])))
            return nodes
        if profil not in self.profiles:
            err = "Error: Profil '{}' not found in configuration file\n".format(
                profil)
            _LOGGER.error(err)
            self.rep_sock.send(msgpack.packb(('', [err])))
            return nodes

        nodelist = self.list_nodes(byhost=False)
        nodeset = NodeSet.fromlist([node.name for node in nodelist])
        idx_min = 0
        idx_max = nb_nodes - 1
        base_range = RangeSet("%d-%d" % (idx_min, idx_max))
        base_nodeset = NodeSetBase(name + '%s', base_range)
        ndset_inter = nodeset.intersection(base_nodeset)
        while len(ndset_inter) != 0:
            indexes = [
                clustdock.VirtualNode.split_name(node)[1]
                for node in ndset_inter
            ]
            for idx in indexes:
                _LOGGER.debug("Removing %d from rangeset %s", idx, base_range)
                base_range.remove(idx)
            base_nodeset.difference_update(ndset_inter)
            _LOGGER.debug("Nodeset becomes '%s' after removing", base_nodeset)
            idx_min = max(indexes + list(base_range)) + 1
            idx_max = idx_min + max([len(indexes), nb_nodes - len(base_range)])
            base_range.add_range(idx_min, idx_max)
            _LOGGER.debug("New rangeset: %s", base_range)
            base_nodeset.update(
                NodeSetBase(name + '%s',
                            RangeSet.fromlist([range(idx_min, idx_max)])))
            _LOGGER.debug("New nodeset: %s", base_nodeset)
            ndset_inter = nodeset.intersection(base_nodeset)

        final_range = base_range
        _LOGGER.debug("final rangeset/nodeset: %s / %s", base_range,
                      base_nodeset)

        cluster = vc.VirtualCluster(name, profil, self.profiles[profil])
        nodes = []
        for idx in final_range:
            node = cluster.add_node(idx, host)
            nodes.append(node)
        return nodes
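select_nodes leans on RangeSet index arithmetic; note that add_range(start, stop), like Python's built-in range(), excludes stop. A small offline sketch (index values are made up):

from ClusterShell.RangeSet import RangeSet

rng = RangeSet("0-3")    # indexes 0,1,2,3
rng.remove(2)            # -> 0-1,3
rng.add_range(4, 6)      # adds 4 and 5 (stop excluded) -> 0-1,3-5
print(rng)               # 0-1,3-5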
Example #36
    def delete(self, node):
        self.ns -= NodeSet.fromlist(node)
Example #37
    def ev_close(self, worker, timedout):
        """Worker has finished (command done on all nodes)"""
        if timedout:
            nodeset = NodeSet.fromlist(worker.iter_keys_timeout())
            self.result.nodes_ko.add(nodeset)
        self.result.show()
Example #38
    def display_proxy_errors(cls, fs):
        """Display proxy error messages for the specified filesystem."""
        for msg, nodes in fs.proxy_errors.walk():
            nodes = str(NodeSet.fromlist(nodes))
            msg = str(msg).replace('THIS_SHINE_HOST', nodes)
            print("%s: %s" % (nodes, msg), file=sys.stderr)
Example #40
    def get_log_data(self, hosts, since, until=None, timeout=60):
        """Gather log output for the command running on each host.

        Note (from journalctl man page):
            Date specifications should be of the format "2012-10-30 18:17:16".
            If the time part is omitted, "00:00:00" is assumed. If only the
            seconds component is omitted, ":00" is assumed. If the date
            component is omitted, the current day is assumed. Alternatively the
            strings "yesterday", "today", "tomorrow" are understood, which refer
            to 00:00:00 of the day before the current day, the current day, or
            the day after the current day, respectively.  "now" refers to the
            current time. Finally, relative times may be specified, prefixed
            with "-" or "+", referring to times before or after the current
            time, respectively.

        Args:
            hosts (list): list of hosts from which to gather log data.
            since (str): show log entries from this date.
            until (str, optional): show log entries up to this date. Defaults
                to None, in which case it is not utilized.
            timeout (int, optional): timeout for issuing the command. Defaults
                to 60 seconds.

        Returns:
            dict: log output per host

        """
        # Setup the journalctl command to capture all unit activity from the
        # specified start date to now or a specified end date
        #   --output=json?
        command = [
            "sudo",
            "journalctl",
            "--unit={}".format(self._systemctl.service.value),
            "--since=\"{}\"".format(since),
        ]
        if until:
            command.append("--until=\"{}\"".format(until))
        self.log.info("Gathering log data on %s: %s", str(hosts),
                      " ".join(command))

        # Gather the log information per host
        task = run_task(hosts, " ".join(command), timeout)

        # Create a dictionary of hosts for each unique return code
        results = {code: hosts for code, hosts in task.iter_retcodes()}

        # Determine if the command completed successfully across all the hosts
        status = len(results) == 1 and 0 in results

        # Determine if any commands timed out
        timed_out = [str(hosts) for hosts in task.iter_keys_timeout()]
        if timed_out:
            status = False
        if not status:
            self.log.info("  Errors detected running \"%s\":", command)

        # List any hosts that timed out
        if timed_out:
            self.log.info("    %s: timeout detected after %s seconds",
                          str(NodeSet.fromlist(timed_out)), timeout)

        # Display/return the command output
        log_data = {}
        for code in sorted(results):
            # Get the command output from the hosts with this return code
            output_data = list(task.iter_buffers(results[code]))
            if not output_data:
                output_data = [["<NONE>", results[code]]]

            for output_buffer, output_hosts in output_data:
                node_set = NodeSet.fromlist(output_hosts)
                lines = str(output_buffer).splitlines()

                if status:
                    # Add the successful output from each node to the dictionary
                    log_data[node_set] = lines
                else:
                    # Display all of the results in the case of an error
                    if len(lines) > 1:
                        self.log.info("    %s: rc=%s, output:", node_set, code)
                        for line in lines:
                            self.log.info("      %s", line)
                    else:
                        self.log.info("    %s: rc=%s, output: %s", node_set,
                                      code, output_buffer)

        # Report any errors through an exception
        if not status:
            raise CommandFailure(
                "Error(s) detected gathering {} log data on {}".format(
                    self._systemctl.service.value, NodeSet.fromlist(hosts)))

        # Return the successful command output per set of hosts
        return log_data
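get_log_data builds on the generic run_task / iter_retcodes / iter_buffers reporting pattern. Stripped of the class wiring, a minimal sketch of that pattern looks like this (host names are placeholders and passwordless ssh to them is assumed):

from ClusterShell.Task import task_self
from ClusterShell.NodeSet import NodeSet

task = task_self()
task.run("uname -r", nodes="node[1-4]", timeout=30)

for rc, keys in task.iter_retcodes():
    print("rc=%d on %s" % (rc, NodeSet.fromlist(keys)))
for buf, keys in task.iter_buffers():
    print("%s: %s" % (NodeSet.fromlist(keys), str(buf)))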
Example #41
def get_ucx_info(hosts, supported=None, verbose=True):
    """Get the UCX provider information from the specified hosts.

    Args:
        hosts (NodeSet): hosts from which to gather the information
        supported (list, optional): list of supported providers; when provided,
            only those providers are included. Defaults to None.
        verbose (bool, optional): display command details. Defaults to True.

    Returns:
        dict: a dictionary keyed by interface, whose values map a
            comma-separated provider string to the NodeSet of hosts on which
            those providers were detected.

    """
    task = run_task(hosts, "ucx_info -d", verbose=verbose)
    if verbose:
        display_task(task)

    # Populate a dictionary of interfaces mapping provider sets to the NodeSet
    # of hosts on which the providers were detected.
    providers = {}
    results = dict(task.iter_retcodes())
    if 0 in results:
        for output, nodelist in task.iter_buffers(results[0]):
            output_lines = [
                line.decode("utf-8").rstrip(os.linesep) for line in output
            ]
            nodeset = NodeSet.fromlist(nodelist)

            # Find all the transport, device, and type pairings. The ucx_info output reports these
            # on separate lines so when processing the re matches ensure each device is preceded by
            # a provider.
            interface_providers = {}
            data = re.findall(r"(Transport|Device):\s+([A-Za-z0-9;_+]+)",
                              "\n".join(output_lines))
            while data:
                transport = list(data.pop(0))
                if transport[0] == "Transport" and data and data[0][0] == "Device":
                    transport.pop(0)
                    device = list(data.pop(0))
                    device.pop(0)

                    # A transport and device must be specified
                    if not transport or not device:
                        continue

                    # Add 'ucx+' to the provider and replace 'mlx[0-9]' with 'x'
                    transport = [
                        "+".join(["ucx",
                                  re.sub(r"mlx[0-9]+", "x", item)])
                        for item in transport
                    ]

                    # Only include supported providers if a supported list is provided
                    if supported and transport[0] not in supported:
                        continue

                    if device[0] not in interface_providers:
                        interface_providers[device[0]] = set()
                    interface_providers[device[0]].update(transport)

            for interface, provider_set in interface_providers.items():
                if interface not in providers:
                    providers[interface] = {}
                provider_key = ",".join(list(provider_set))
                if provider_key not in providers[interface]:
                    providers[interface][provider_key] = NodeSet()
                providers[interface][provider_key].update(nodeset)

    return providers
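The Transport/Device pairing above relies on ucx_info printing those fields on consecutive lines. A tiny offline check of the regex and the pairing loop against canned output (the sample text is made up):

import re

sample = "# Transport: tcp\n# Device: eth0\n# Transport: rc_verbs\n# Device: mlx5_0\n"
data = re.findall(r"(Transport|Device):\s+([A-Za-z0-9;_+]+)", sample)
while data:
    transport = data.pop(0)
    if transport[0] == "Transport" and data and data[0][0] == "Device":
        device = data.pop(0)
        print(device[1], "->", transport[1])  # eth0 -> tcp, mlx5_0 -> rc_verbs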
Example #42
def run_auto():
    """ Run auto mode """
    global EV

    nstates = pbs.node_states()
    #print json.dumps(nstates, sort_keys=True, indent=4, separators=(',', ': '))

    #make a list of nodes that don't have jobs
    jobless = []
    for node, nodest in list(nstates.items()):
        if not ('resources_assigned' in nodest \
            and 'ncpus' in nodest['resources_assigned'] \
            and nodest['resources_assigned']['ncpus'] > 0
            ):
            jobless.append(node)

    for node, nodest in list(nstates.items()):
        states = nodest['state'].split(',')
        known_bad = node in STATE['nodes']

        vlog(
            5, 'eval node={} state={} jobs={} bad={}'.format(
                node, nodest['state'], node in jobless, known_bad))

        #find known bad nodes that are not offline
        if known_bad and not 'offline' in states:
            vlog(2, 'bad node %s was not offline in pbs' % (node))
            scheduler_close_nodes([node], 'known bad node')

        #look for known bad states
        if pbs.is_pbs_down(states) and not known_bad:
            #node is in bad state but not known to be bad
            vlog(2,
                 'detected node in bad state %s: %s' % (node, nodest['state']))
            add_nodes([node], 'PBS state = {}'.format(nodest['state']))

        #find nodes in pending states that no longer have jobs
        if node in jobless and 'offline' in states and known_bad:
            has_sibling_job = False
            for sib, sibst in list(STATE['nodes'].items()):
                if node in sibst['siblings'] and not sib in jobless:
                    has_sibling_job = True
                    vlog(5, 'node %s has sibling %s with job' % (node, sib))

            vlog(
                5, 'eval pending node={} job_sibling={} state={}'.format(
                    node, has_sibling_job, STATE['nodes'][node]['state']))
            if not has_sibling_job:
                release_pending_node(node)
            else:
                vlog(4, 'bad node %s skipped due to sibling jobs' % (node))

    #find nodes that are powered off and not already bad nodes
    check_nodes = []

    #create list of nodes that are not bad and update scheduler comments
    for node, nodest in list(nstates.items()):
        if not node in STATE['nodes'] or is_pending_node(node):
            check_nodes.append(node)
        if node in STATE['nodes']:
            scmt = None
            if 'comment' in nodest:
                scmt = '%s: %s' % (nodest['state'], nodest['comment'])
            else:
                scmt = nodest['state']

            if not 'scheduler_comment' in STATE['nodes'][
                    node] or STATE['nodes'][node]['scheduler_comment'] != scmt:
                for ev_id in STATE['nodes'][node]['extraview']:
                    EV.add_resolver_comment(ev_id,
                                            'PBS State Change:\n%s' % scmt)
                    vlog(
                        3, '%s EV#%s comment pbs state change: %s' %
                        (node, ev_id, scmt))
                STATE['nodes'][node]['scheduler_comment'] = scmt

    vlog(4, 'checking ipmi power status of %s nodes' % (len(check_nodes)))
    power_status = ipmi.command(NodeSet.fromlist(check_nodes), 'power status')
    if not power_status:
        vlog(
            2, 'unable to call ipmi power status for %s nodes' %
            (len(check_nodes)))
    else:
        for node in check_nodes:
            why = False  #has value if node is down

            if node in power_status:
                if not 'Chassis Power is on' in power_status[node]:
                    why = 'invalid power status: %s' % (power_status[node])
            else:
                why = 'unable to query power status'

            #release pending nodes if power is off since pbs will never notice on its own
            if node in STATE['nodes']:
                if is_pending_node(node) and why:
                    comment_nodes([node], why)
                    release_pending_node(node)
            else:  #not a bad node yet
                if why:
                    add_nodes([node], why)
                    release_pending_node(node)

    save_state()
Example #43
def main():
    """clush script entry point"""
    sys.excepthook = clush_excepthook

    #
    # Argument management
    #
    usage = "%prog [options] command"

    parser = OptionParser(usage)

    parser.add_option("-n", "--nostdin", action="store_true", dest="nostdin",
                      help="don't watch for possible input from stdin")

    parser.install_groupsconf_option()
    parser.install_clush_config_options()
    parser.install_nodes_options()
    parser.install_display_options(verbose_options=True)
    parser.install_filecopy_options()
    parser.install_connector_options()

    (options, args) = parser.parse_args()

    set_std_group_resolver_config(options.groupsconf)

    #
    # Load config file and apply overrides
    #
    config = ClushConfig(options, options.conf)

    # Initialize logging
    if config.verbosity >= VERB_DEBUG:
        logging.basicConfig(level=logging.DEBUG)
        logging.debug("clush: STARTING DEBUG")
    else:
        logging.basicConfig(level=logging.CRITICAL)

    # Should we use ANSI colors for nodes?
    if config.color == "auto":
        color = sys.stdout.isatty() and (options.gatherall or \
                                         sys.stderr.isatty())
    else:
        color = config.color == "always"

    try:
        # Create and configure display object.
        display = Display(options, config, color)
    except ValueError as exc:
        parser.error("option mismatch (%s)" % exc)

    if options.groupsource:
        # Be sure -a/g -s source work as expected.
        std_group_resolver().default_source_name = options.groupsource

    # Compute the nodeset and warn for possible use of shell pathname
    # expansion (#225)
    wnodelist = []
    xnodelist = []
    if options.nodes:
        wnodelist = [NodeSet(nodes) for nodes in options.nodes]

    if options.exclude:
        xnodelist = [NodeSet(nodes) for nodes in options.exclude]

    for (opt, nodelist) in (('w', wnodelist), ('x', xnodelist)):
        for nodes in nodelist:
            if len(nodes) == 1 and exists(str(nodes)):
                display.vprint_err(VERB_STD, "Warning: using '-%s %s' and "
                                   "local path '%s' exists, was it expanded "
                                   "by the shell?" % (opt, nodes, nodes))

    # --hostfile support (#235)
    for opt_hostfile in options.hostfile:
        try:
            fnodeset = NodeSet()
            with open(opt_hostfile) as hostfile:
                for line in hostfile.read().splitlines():
                    fnodeset.updaten(nodes for nodes in line.split())
            display.vprint_err(VERB_DEBUG,
                               "Using nodeset %s from hostfile %s"
                               % (fnodeset, opt_hostfile))
            wnodelist.append(fnodeset)
        except IOError as exc:
            # re-raise as OSError to be properly handled
            errno, strerror = exc.args
            raise OSError(errno, strerror, exc.filename)

    # Instantiate target nodeset from command line and hostfile
    nodeset_base = NodeSet.fromlist(wnodelist)
    # Instantiate filter nodeset (command line only)
    nodeset_exclude = NodeSet.fromlist(xnodelist)

    # Specified engine prevails over default engine
    DEFAULTS.engine = options.engine

    # Do we have nodes group?
    task = task_self()
    task.set_info("debug", config.verbosity >= VERB_DEBUG)
    if config.verbosity == VERB_DEBUG:
        std_group_resolver().set_verbosity(1)
    if options.nodes_all:
        all_nodeset = NodeSet.fromall()
        display.vprint(VERB_DEBUG, "Adding nodes from option -a: %s" % \
                                   all_nodeset)
        nodeset_base.add(all_nodeset)

    if options.group:
        grp_nodeset = NodeSet.fromlist(options.group,
                                       resolver=RESOLVER_NOGROUP)
        for grp in grp_nodeset:
            addingrp = NodeSet("@" + grp)
            display.vprint(VERB_DEBUG, \
                "Adding nodes from option -g %s: %s" % (grp, addingrp))
            nodeset_base.update(addingrp)

    if options.exgroup:
        grp_nodeset = NodeSet.fromlist(options.exgroup,
                                       resolver=RESOLVER_NOGROUP)
        for grp in grp_nodeset:
            removingrp = NodeSet("@" + grp)
            display.vprint(VERB_DEBUG, \
                "Excluding nodes from option -X %s: %s" % (grp, removingrp))
            nodeset_exclude.update(removingrp)

    # Do we have an exclude list? (-x ...)
    nodeset_base.difference_update(nodeset_exclude)
    if len(nodeset_base) < 1:
        parser.error('No node to run on.')

    if options.pick and options.pick < len(nodeset_base):
        # convert to string for sample as nsiter() is slower for big
        # nodesets; and we assume options.pick will remain small-ish
        keep = random.sample(list(nodeset_base), options.pick)
        nodeset_base.intersection_update(','.join(keep))
        if config.verbosity >= VERB_VERB:
            msg = "Picked random nodes: %s" % nodeset_base
            print(Display.COLOR_RESULT_FMT % msg)

    # Set open files limit.
    set_fdlimit(config.fd_max, display)

    #
    # Task management
    #
    # check for clush interactive mode
    interactive = not len(args) and \
                  not (options.copy or options.rcopy)
    # check for foreground ttys presence (input)
    stdin_isafgtty = sys.stdin.isatty() and \
        os.tcgetpgrp(sys.stdin.fileno()) == os.getpgrp()
    # check for special condition (empty command and stdin not a tty)
    if interactive and not stdin_isafgtty:
        # looks like interactive but stdin is not a tty:
        # switch to non-interactive + disable ssh pseudo-tty
        interactive = False
        # SSH: disable pseudo-tty allocation (-T)
        ssh_options = config.ssh_options or ''
        ssh_options += ' -T'
        config._set_main("ssh_options", ssh_options)
    if options.nostdin and interactive:
        parser.error("illegal option `--nostdin' in that case")

    # Force user_interaction if Clush._f_user_interaction for test purposes
    user_interaction = hasattr(sys.modules[__name__], '_f_user_interaction')
    if not options.nostdin:
        # Try user interaction: check for foreground ttys presence (output)
        stdout_isafgtty = sys.stdout.isatty() and \
            os.tcgetpgrp(sys.stdout.fileno()) == os.getpgrp()
        user_interaction |= stdin_isafgtty and stdout_isafgtty
    display.vprint(VERB_DEBUG, "User interaction: %s" % user_interaction)
    if user_interaction:
        # Standard input is a terminal and we want to perform some user
        # interactions in the main thread (using blocking calls), so
        # we run cluster commands in a new ClusterShell Task (a new
        # thread is created).
        task = Task()
    # else: perform everything in the main thread

    # Handle special signal only when user_interaction is set
    task.set_default("USER_handle_SIGUSR1", user_interaction)

    task.excepthook = sys.excepthook
    task.set_default("USER_stdin_worker", not (sys.stdin.isatty() or \
                                               options.nostdin or \
                                               user_interaction))
    display.vprint(VERB_DEBUG, "Create STDIN worker: %s" % \
                               task.default("USER_stdin_worker"))

    task.set_info("debug", config.verbosity >= VERB_DEBUG)
    task.set_info("fanout", config.fanout)

    if options.worker:
        try:
            if options.remote == 'no':
                task.set_default('local_worker',
                                 _load_workerclass(options.worker))
            else:
                task.set_default('distant_worker',
                                 _load_workerclass(options.worker))
        except (ImportError, AttributeError):
            msg = "ERROR: Could not load worker '%s'" % options.worker
            display.vprint_err(VERB_QUIET, msg)
            clush_exit(1, task)

    if options.topofile or task._default_tree_is_enabled():
        if options.topofile:
            task.load_topology(options.topofile)
        if config.verbosity >= VERB_VERB:
            roots = len(task.topology.root.nodeset)
            gws = task.topology.inner_node_count() - roots
            msg = "enabling tree topology (%d gateways)" % gws
            print("clush: %s" % msg, file=sys.stderr)

    if options.grooming_delay:
        if config.verbosity >= VERB_VERB:
            msg = Display.COLOR_RESULT_FMT % ("Grooming delay: %f" %
                                              options.grooming_delay)
            print(msg, file=sys.stderr)
        task.set_info("grooming_delay", options.grooming_delay)
    elif options.rcopy:
        # By default, --rcopy should inhibit grooming
        task.set_info("grooming_delay", 0)

    if config.ssh_user:
        task.set_info("ssh_user", config.ssh_user)
    if config.ssh_path:
        task.set_info("ssh_path", config.ssh_path)
    if config.ssh_options:
        task.set_info("ssh_options", config.ssh_options)
    if config.scp_path:
        task.set_info("scp_path", config.scp_path)
    if config.scp_options:
        task.set_info("scp_options", config.scp_options)
    if config.rsh_path:
        task.set_info("rsh_path", config.rsh_path)
    if config.rcp_path:
        task.set_info("rcp_path", config.rcp_path)
    if config.rsh_options:
        task.set_info("rsh_options", config.rsh_options)

    # Set detailed timeout values
    task.set_info("connect_timeout", config.connect_timeout)
    task.set_info("command_timeout", config.command_timeout)

    # Enable stdout/stderr separation
    task.set_default("stderr", not options.gatherall)

    # Prevent reading from stdin?
    task.set_default("stdin", not options.nostdin)

    # Disable MsgTree buffering if not gathering outputs
    task.set_default("stdout_msgtree", display.gather or display.line_mode)

    # Always disable stderr MsgTree buffering
    task.set_default("stderr_msgtree", False)

    # Set timeout at worker level when command_timeout is defined.
    if config.command_timeout > 0:
        timeout = config.command_timeout
    else:
        timeout = -1

    # Configure task custom status
    task.set_default("USER_interactive", interactive)
    task.set_default("USER_running", False)

    if (options.copy or options.rcopy) and not args:
        parser.error("--[r]copy option requires at least one argument")
    if options.copy:
        if not options.dest_path:
            # append '/' to clearly indicate a directory for tree mode
            options.dest_path = join(dirname(abspath(args[0])), '')
        op = "copy sources=%s dest=%s" % (args, options.dest_path)
    elif options.rcopy:
        if not options.dest_path:
            options.dest_path = dirname(abspath(args[0]))
        op = "rcopy sources=%s dest=%s" % (args, options.dest_path)
    else:
        op = "command=\"%s\"" % ' '.join(args)

    # print debug values (fanout value is taken from the config object
    # and not the task itself, as set_info() is an asynchronous call)
    display.vprint(VERB_DEBUG, "clush: nodeset=%s fanout=%d [timeout " \
                   "conn=%.1f cmd=%.1f] %s" %  (nodeset_base, config.fanout,
                                                config.connect_timeout,
                                                config.command_timeout,
                                                op))
    if not task.default("USER_interactive"):
        if display.verbosity >= VERB_DEBUG and task.topology:
            print(Display.COLOR_RESULT_FMT % '-' * 15)
            print(Display.COLOR_RESULT_FMT % task.topology, end='')
            print(Display.COLOR_RESULT_FMT % '-' * 15)
        if options.copy:
            run_copy(task, args, options.dest_path, nodeset_base, timeout,
                     options.preserve_flag, display)
        elif options.rcopy:
            run_rcopy(task, args, options.dest_path, nodeset_base, timeout,
                      options.preserve_flag, display)
        else:
            run_command(task, ' '.join(args), nodeset_base, timeout, display,
                        options.remote != 'no')

    if user_interaction:
        ttyloop(task, nodeset_base, timeout, display, options.remote != 'no')
    elif task.default("USER_interactive"):
        display.vprint_err(VERB_QUIET, \
            "ERROR: interactive mode requires a tty")
        clush_exit(1, task)

    rc = 0
    if options.maxrc:
        # Instead of clush return code, return commands retcode
        rc = task.max_retcode()
        if task.num_timeout() > 0:
            rc = 255
    clush_exit(rc, task)
Example #44
    def __str__(self):
        return '\n'.join([
            "{0}: {1}".format(NodeSet.fromlist(map(lambda x: "vm" + x, keys)),
                              m) for m, keys in self._res_tree.walk()
        ])
Example #46
    def run_soak(self, test_param):
        """Run the soak test specified by the test params.

        Args:
            test_param (str): test_params from yaml file

        """
        self.soak_results = {}
        self.pool = []
        self.container = []
        self.harasser_results = {}
        self.harasser_args = {}
        run_harasser = False
        self.all_failed_jobs = []
        self.all_failed_harassers = []
        self.soak_errors = []
        self.check_errors = []
        self.used = []
        self.mpi_module = self.params.get("mpi_module",
                                          "/run/*",
                                          default="mpi/mpich-x86_64")
        enable_sudo = self.params.get("enable_sudo", "/run/*", default=True)
        test_to = self.params.get("test_timeout", test_param + "*")
        self.test_name = self.params.get("name", test_param + "*")
        single_test_pool = self.params.get("single_test_pool",
                                           test_param + "*", True)
        harassers = self.params.get("harasserlist", test_param + "*")
        job_list = self.params.get("joblist", test_param + "*")
        resv_bytes = self.params.get("resv_bytes", test_param + "*", 500000000)
        ignore_soak_errors = self.params.get("ignore_soak_errors",
                                             test_param + "*", False)
        self.sudo_cmd = "sudo" if enable_sudo else ""
        if harassers:
            run_harasser = True
            self.log.info("<< Initial harasser list = %s>>", harassers)
            harasserlist = harassers[:]
        # Create the reserved pool with data
        # self.pool is a list of all the pools used in soak
        # self.pool[0] will always be the reserved pool
        add_pools(self, ["pool_reserved"])
        # Create the reserved container
        self.resv_cont = self.get_container(self.pool[0],
                                            "/run/container_reserved/*", True)
        # populate reserved container with a 500MB file unless test is smoke
        self.initial_resv_file = os.path.join(self.test_dir, "initial",
                                              "resv_file")
        try:
            reserved_file_copy(self,
                               self.initial_resv_file,
                               self.pool[0],
                               self.resv_cont,
                               num_bytes=resv_bytes,
                               cmd="write")
        except CommandFailure as error:
            self.fail(error)

        # Create pool for jobs
        if single_test_pool:
            add_pools(self, ["pool_jobs"])
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))

        # cleanup soak log directories before test on all nodes
        result = slurm_utils.srun(NodeSet.fromlist(self.hostlist_clients),
                                  "rm -rf {}".format(self.soak_dir),
                                  self.srun_params)
        if result.exit_status > 0:
            raise SoakTestError("<<FAILED: Soak directories not removed"
                                "from clients>>: {}".format(
                                    self.hostlist_clients))
        # cleanup test_node
        for log_dir in [self.soak_dir, self.sharedsoak_dir]:
            cmd = "rm -rf {}".format(log_dir)
            try:
                result = run_command(cmd, timeout=30)
            except DaosTestError as error:
                raise SoakTestError(
                    "<<FAILED: Soak directory {} was not removed>>".format(
                        log_dir)) from error
        # Baseline metrics data
        run_metrics_check(self, prefix="initial")
        # Initialize time
        self.start_time = time.time()
        self.test_timeout = int(3600 * test_to)
        self.end_time = self.start_time + self.test_timeout
        self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
        while time.time() < self.end_time:
            # Start new pass
            start_loop_time = time.time()
            self.log.info("<<SOAK LOOP %s: time until done %s>>", self.loop,
                          DDHHMMSS_format(self.end_time - time.time()))
            if not single_test_pool:
                # Create pool for jobs
                add_pools(self, ["pool_jobs"])
                self.log.info("Current pools: %s",
                              " ".join([pool.uuid for pool in self.pool]))
            # Initialize harassers
            if run_harasser:
                if not harasserlist:
                    harasserlist = harassers[:]
                harasser = harasserlist.pop(0)
                self.harasser_args = {}
                self.harasser_results = {}
                self.harassers, self.offline_harassers = get_harassers(
                    harasser)
            try:
                self.execute_jobs(job_list, self.pool[1])
            except SoakTestError as error:
                self.fail(error)
            # Check space after jobs done
            for pool in self.pool:
                self.dmg_command.pool_query(pool.uuid)
            # Cleanup any dfuse mounts before destroying containers
            cleanup_dfuse(self)
            self.soak_errors.extend(self.destroy_containers(self.container))
            self.container = []
            # Remove the test pools from self.pool; preserving reserved pool
            if not single_test_pool:
                self.soak_errors.extend(self.destroy_pools(self.pool[1:]))
                self.pool = [self.pool[0]]
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
            # Gather metrics data after jobs complete
            run_metrics_check(self)
            # Fail if the pool/containers did not clean up correctly
            if not ignore_soak_errors:
                self.assertEqual(len(self.soak_errors), 0,
                                 "\n".join(self.soak_errors))
            # Break out of loop if smoke
            if "smoke" in self.test_name:
                break
            loop_time = time.time() - start_loop_time
            self.log.info("<<LOOP %s completed in %s at %s>>", self.loop,
                          DDHHMMSS_format(loop_time), time.ctime())
            # Initialize harasser loop time from first pass loop time
            if self.loop == 1 and run_harasser:
                self.harasser_loop_time = loop_time
            self.loop += 1
        self.log.info("<<<<SOAK TOTAL TEST TIME = %s>>>>",
                      DDHHMMSS_format(time.time() - self.start_time))
Example #47
    def run_soak(self, test_param):
        """Run the soak test specified by the test params.

        Args:
            test_param (str): test_params from yaml file

        """
        self.soak_results = {}
        self.pool = []
        self.harasser_joblist = []
        self.harasser_results = {}
        test_to = self.params.get("test_timeout", test_param)
        self.job_timeout = self.params.get("job_timeout", test_param)
        self.harasser_timeout = self.params.get("harasser_timeout", test_param)
        self.test_name = self.params.get("name", test_param)
        self.nodesperjob = self.params.get("nodesperjob", test_param)
        self.test_iteration = self.params.get("iteration", test_param)
        self.task_list = self.params.get("taskspernode", test_param + "*")
        self.h_list = self.params.get("harasserlist", test_param + "*")
        job_list = self.params.get("joblist", test_param + "*")
        pool_list = self.params.get("poollist", test_param + "*")
        rank = self.params.get("rank", "/run/container_reserved/*")
        if self.is_harasser("rebuild"):
            obj_class = "_".join(["OC", str(
                self.params.get("daos_oclass", "/run/rebuild/*")[0])])
        else:
            obj_class = self.params.get(
                "object_class", "/run/container_reserved/*")
        # Create the reserved pool with data
        # self.pool is a list of all the pools used in soak
        # self.pool[0] will always be the reserved pool
        self.add_pools(["pool_reserved"])
        self.pool[0].connect()
        # Create the container and populate with a known data
        # TO-DO: use IOR to write and later read verify the data
        self.container = TestContainer(self.pool[0])
        self.container.namespace = "/run/container_reserved/*"
        self.container.get_params(self)
        self.container.create()
        self.container.write_objects(rank, obj_class)
        self.all_failed_jobs = []
        # cleanup soak log directories before test on all nodes
        result = slurm_utils.srun(
            NodeSet.fromlist(self.hostlist_clients), "rm -rf {}".format(
                self.log_dir), self.srun_params)
        if result.exit_status > 0:
            raise SoakTestError(
                "<<FAILED: Soak directories not removed"
                "from clients>>: {}".format(self.hostlist_clients))
        # cleanup test_node
        for log_dir in [self.log_dir, self.sharedlog_dir]:
            cmd = "rm -rf {}".format(log_dir)
            try:
                result = run_command(cmd, timeout=30)
            except DaosTestError as error:
                raise SoakTestError(
                    "<<FAILED: Soak directory {} was not removed {}>>".format(
                        log_dir, error))
        # Initialize time
        start_time = time.time()
        self.test_timeout = int(3600 * test_to)
        self.end_time = start_time + self.test_timeout
        self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
        while time.time() < self.end_time:
            # Start new pass
            start_loop_time = time.time()
            self.log.info(
                "<<Soak1 PASS %s: time until done %s>>", self.loop,
                DDHHMMSS_format(self.end_time - time.time()))
            # Create all specified pools
            self.add_pools(pool_list)
            self.log.info(
                "Current pools: %s",
                " ".join([pool.uuid for pool in self.pool]))
            try:
                self.execute_jobs(job_list, self.pool[1:])
            except SoakTestError as error:
                self.fail(error)
            errors = self.destroy_pools(self.pool[1:])
            # remove the test pools from self.pool; preserving reserved pool
            self.pool = [self.pool[0]]
            self.log.info(
                "Current pools: %s",
                " ".join([pool.uuid for pool in self.pool]))
            self.assertEqual(len(errors), 0, "\n".join(errors))
            # Break out of loop if smoke
            if "smoke" in self.test_name:
                break
            loop_time = time.time() - start_loop_time
            self.log.info(
                "<<PASS %s completed in %s >>", self.loop, DDHHMMSS_format(
                    loop_time))
            self.loop += 1
        # TO-DO: use IOR
        self.assertTrue(
            self.container.read_objects(),
            "Data verification error on reserved pool"
            "after SOAK completed")
        # gather the daos logs from the client nodes
        self.log.info(
            "<<<<SOAK TOTAL TEST TIME = %s>>>", DDHHMMSS_format(
                time.time() - start_time))
try:
    # ClusterShell (pip install clustershell)
    from ClusterShell.NodeSet import NodeSet
    from ClusterShell.Task import task_self
except ImportError:
    raise ImportError('Please install clustershell >= 1.2.0')
 
# RING0
RING0DEV = ["10.10.1.39", "10.10.1.40", "10.10.1.43", "10.10.1.50"]
RING0LIVE = []

# RING1
RING1DEV = ["10.10.1.39", "10.10.1.40", "10.10.1.43", "10.10.1.50"]
RING1LIVE = []

# initialise rings from config file
RING_1_dev__allnodes = NodeSet.fromlist(RING1DEV)
RING_1_dev__bootstrapnode = NodeSet.fromlist(RING1DEV)


# CASSANDRA
CASSANDRA_VERSION = "0.8.5"
CASSANDRA_CORE = "/opt/cassandra-dev"
CASSANDRA_HOME = CASSANDRA_CORE + "/apache-cassandra-" + CASSANDRA_VERSION
CASSANDRA_BIN = CASSANDRA_HOME + "/bin/cassandra"
CASSANDRA_CONF = CASSANDRA_CORE + "/cluster_config"
CASSANDRA_INCLUDE = CASSANDRA_CONF + "/cassandra.in.sh"
# JMX port
PORT = 7198
CASSANDRA_STRESS_TEST = CASSANDRA_HOME + "/SOFTWARE/apache-cassandra-0.8.6-src/tools/stress/bin/stress"

# CASSANDRA.YAML
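
# NOTE: the original snippet is truncated at this point; task_self is
# imported above but never used in what remains. A minimal, hypothetical
# sketch (not part of the source) of how the ring NodeSet defined above
# could drive a parallel command, assuming the nodes are reachable over ssh:
task = task_self()
task.run("uname -r", nodes=str(RING_1_dev__allnodes), timeout=30)
for output, nodes in task.iter_buffers():
    print("%s: %s" % (NodeSet.fromlist(nodes), output))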
Example #49
def pcmd(hosts, command, verbose=True, timeout=None, expect_rc=0):
    """Run a command on each host in parallel and get the return codes.

    Args:
        hosts (list): list of hosts
        command (str): the command to run in parallel
        verbose (bool, optional): display command output. Defaults to True.
        timeout (int, optional): command timeout in seconds. Defaults to None.
        expect_rc (int, optional): expected return code. Defaults to 0.

    Returns:
        dict: a dictionary of return code keys and accompanying NodeSet
            values indicating which hosts yielded the return code.

    """
    # Run the command on each host in parallel
    task = run_task(hosts, command, timeout)

    # Report any errors / display output if requested
    retcode_dict = {}
    for retcode, rc_nodes in task.iter_retcodes():
        # Create a NodeSet for this list of nodes
        nodeset = NodeSet.fromlist(rc_nodes)

        # Include this NodeSet for this return code
        if retcode not in retcode_dict:
            retcode_dict[retcode] = NodeSet()
        retcode_dict[retcode].add(nodeset)

        # Display any errors or requested output
        if retcode != expect_rc or verbose:
            msg = "failure running" if retcode != expect_rc else "output from"
            if len(list(task.iter_buffers(rc_nodes))) == 0:
                print(
                    "{}: {} '{}': rc={}".format(
                        nodeset, msg, command, retcode))
            else:
                for output, nodes in task.iter_buffers(rc_nodes):
                    nodeset = NodeSet.fromlist(nodes)
                    lines = str(output).splitlines()
                    output = "rc={}{}".format(
                        retcode,
                        ", {}".format(output) if len(lines) < 2 else
                        "\n  {}".format("\n  ".join(lines)))
                    print(
                        "{}: {} '{}': {}".format(
                            NodeSet.fromlist(nodes), msg, command, output))

    # Report any timeouts
    if timeout and task.num_timeout() > 0:
        # iter_keys_timeout() is a generator, so fold it into a NodeSet once
        # instead of consuming it twice
        nodes = NodeSet.fromlist(task.iter_keys_timeout())
        print(
            "{}: timeout detected running '{}' on {}/{} hosts".format(
                nodes, command, task.num_timeout(), len(hosts)))
        retcode = 255
        if retcode not in retcode_dict:
            retcode_dict[retcode] = NodeSet()
        retcode_dict[retcode].add(nodes)

    return retcode_dict
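
# Hypothetical usage sketch (not part of the original example); assumes
# run_task is importable in the caller's module and that hosts "node1" and
# "node2" are reachable:
retcodes = pcmd(["node1", "node2"], "uname -r", verbose=False, timeout=10)
failed = {rc: nodes for rc, nodes in retcodes.items() if rc != 0}
if failed:
    print("non-zero return codes: {}".format(failed))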
Example #50
 def labels(self):
     """Return a NodeSet containing all component label."""
     return NodeSet.fromlist((comp.label for comp in self))
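
# Returning the labels as a NodeSet rather than a plain list lets numbered
# component names fold into compact ranges. A standalone illustration with
# made-up Lustre-style labels (not part of the original example):
from ClusterShell.NodeSet import NodeSet
labels = NodeSet.fromlist(["fs-OST0000", "fs-OST0001", "fs-OST0003"])
print(labels)  # prints: fs-OST[0000-0001,0003]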
Example #51
def get_ofi_info(hosts, supported=None, verbose=True):
    """Get the OFI provider information from the specified hosts.

    Args:
        hosts (NodeSet): hosts from which to gather the information
        supported (list, optional): list of supported providers which, if
            provided, limits the inclusion to only those providers specified.
            Defaults to None.
        verbose (bool, optional): display command details. Defaults to True.

    Returns:
        dict: a dictionary keyed by interface, where each value is a dictionary
            mapping a comma-separated string of providers to the NodeSet of
            hosts on which those providers were detected.

    """
    task = run_task(hosts, "fi_info", verbose=verbose)
    if verbose:
        display_task(task)

    # Populate a dictionary of interfaces with the detected provider sets and
    # the NodeSet of hosts on which the providers were detected.
    providers = {}
    results = dict(task.iter_retcodes())
    if 0 in results:
        for output, nodelist in task.iter_buffers(results[0]):
            output_lines = [
                line.decode("utf-8").rstrip(os.linesep) for line in output
            ]
            nodeset = NodeSet.fromlist(nodelist)

            # Find all the provider and domain pairings. The fi_info output
            # reports these on separate lines; when processing the re matches,
            # ensure each domain is preceded by a provider.
            interface_providers = {}
            data = re.findall(r"(provider|domain):\s+([A-Za-z0-9;_+]+)",
                              "\n".join(output_lines))
            while data:
                provider = list(data.pop(0))
                if provider[0] == "provider" and data and data[0][0] == "domain":
                    provider.pop(0)
                    domain = list(data.pop(0))
                    domain.pop(0)

                    # A provider and domain must be specified
                    if not provider or not domain:
                        continue

                    # Add 'ofi+' to the provider
                    provider = ["+".join(["ofi", item]) for item in provider]

                    # Only include supported providers if a supported list is provided
                    if supported and provider[0] not in supported:
                        continue

                    if domain[0] not in interface_providers:
                        interface_providers[domain[0]] = set()
                    interface_providers[domain[0]].update(provider)

            for interface, provider_set in interface_providers.items():
                if interface not in providers:
                    providers[interface] = {}
                provider_key = ",".join(list(provider_set))
                if provider_key not in providers[interface]:
                    providers[interface][provider_key] = NodeSet()
                providers[interface][provider_key].update(nodeset)

    return providers
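
# The provider/domain pairing loop above is the subtle part. A standalone
# sketch (not part of the original example) of the same logic, run against a
# fabricated two-entry fi_info excerpt:
import re

sample = "provider: tcp;ofi_rxm\ndomain: eth0\nprovider: verbs\ndomain: mlx5_0"
data = re.findall(r"(provider|domain):\s+([A-Za-z0-9;_+]+)", sample)
pairs = {}
while data:
    kind, value = data.pop(0)
    # only accept a domain that immediately follows its provider
    if kind == "provider" and data and data[0][0] == "domain":
        _, domain = data.pop(0)
        pairs.setdefault(domain, set()).add("ofi+" + value)
print(pairs)  # {'eth0': {'ofi+tcp;ofi_rxm'}, 'mlx5_0': {'ofi+verbs'}}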
Example #52
 def servers(self):
     """Return a NodeSet containing all component servers."""
     return NodeSet.fromlist((comp.server.hostname for comp in self))
Example #53
    def ev_close(self, worker):
        """End of proxy command."""
        Action.ev_close(self, worker)

        # First of all, we must check whether the shine command ran without
        # bugs, node crashes, etc.
        # So we need to verify all node retcodes and change the component
        # state on the bad nodes.

        # Action timed out
        if worker.did_timeout():
            self.set_status(ACT_ERROR)
            return

        status = ACT_OK

        # Remove the 'proxy' running action for each component.
        if self._comps:
            for comp in self._comps:
                # XXX: This should be changed using a real event for proxy.
                comp._del_action('proxy')

                if comp.state is None:
                    comp.state = RUNTIME_ERROR

                # At this step, there should be no more INPROGRESS components.
                # If there are, this is a bug; change state to RUNTIME_ERROR.
                # INPROGRESS management could be changed using the running
                # action list.
                # Starting with v1.3, there is no more code setting INPROGRESS.
                # This is for compatibility with older clients.
                elif comp.state == INPROGRESS:
                    actions = ""
                    if len(comp._list_action()):
                        actions = "actions: " + ", ".join(comp._list_action())
                    print >> sys.stderr, "ERROR: bad state for %s: %d %s" % \
                                    (comp.label, comp.state, actions)
                    comp.state = RUNTIME_ERROR

        # Gather nodes by return code
        for rc, nodes in worker.iter_retcodes():
            # Remote command returns only RUNTIME_ERROR (See RemoteCommand)
            # some common remote errors:
            # rc 127 = command not found
            # rc 126 = found but not executable
            # rc 1 = python failure...
            if rc != 0:

                # If there is at least one error, the action is on error.
                status = ACT_ERROR

                # Gather these nodes by buffer
                key = nodes.__contains__
                for buffers, nodes in self._outputs.walk(match=key):
                    # Handle proxy command error
                    nodes = NodeSet.fromlist(nodes)
                    msg = "Remote action %s failed: %s\n" % \
                                                        (self.action, buffers)
                    self.fs._handle_shine_proxy_error(nodes, msg)

        # Raise errors for each unpickling error,
        # which could happen mostly when Shine exits with 0.
        for buffers, nodes in self._errpickle.walk():
            nodes = NodeSet.fromlist(nodes)
            self.fs._handle_shine_proxy_error(nodes, str(buffers))

        # Raise an error for nodes without output
        if len(self._silentnodes) > 0:
            msg = "Remote action %s failed: No response" % self.action
            self.fs._handle_shine_proxy_error(self._silentnodes, msg)

        self.set_status(status)
Example #54
def get_host_data(hosts, command, text, error, timeout=None):
    """Get the data requested for each host using the specified command.

    Args:
        hosts (list): list of hosts
        command (str): command used to obtain the data on each server
        text (str): data identification string
        error (str): data error string
        timeout (int, optional): command timeout in seconds. Defaults to None.

    Returns:
        dict: a dictionary of data values for each NodeSet key

    """
    # Find the data for each of the specified servers
    print("  Obtaining {} data on {}".format(text, hosts))
    task = run_task(hosts, command, timeout)
    host_data = {}
    DATA_ERROR = "[ERROR]"

    # Create a list of NodeSets with the same return code
    data = {code: hosts for code, hosts in task.iter_retcodes()}

    # Multiple return codes or a single non-zero return code
    # indicate at least one error obtaining the data
    if len(data) > 1 or 0 not in data:
        # Report the errors
        messages = []
        for code, code_hosts in data.items():
            if code != 0:
                output_data = list(task.iter_buffers(code_hosts))
                if len(output_data) == 0:
                    messages.append("{}: rc={}, command=\"{}\"".format(
                        NodeSet.fromlist(code_hosts), code, command))
                else:
                    for output, o_hosts in output_data:
                        lines = str(output).splitlines()
                        info = "rc={}{}".format(
                            code, ", {}".format(output) if len(lines) < 2 else
                            "\n  {}".format("\n  ".join(lines)))
                        messages.append("{}: {}".format(
                            NodeSet.fromlist(o_hosts), info))
        print("    {} on the following hosts:\n      {}".format(
            error, "\n      ".join(messages)))

        # Return an error data set for all of the hosts
        host_data = {NodeSet.fromlist(hosts): DATA_ERROR}

    else:
        # The command completed successfully on all servers.
        for output, bf_hosts in task.iter_buffers(data[0]):
            # Find the maximum size of all the devices reported by this
            # group of hosts, as only one needs to meet the minimum
            nodes = NodeSet.fromlist(bf_hosts)
            try:
                # The assumption here is that each line of command output
                # will begin with a number and that for the purposes of
                # checking this requirement the maximum of these numbers is
                # needed
                int_host_values = [
                    int(line.split()[0]) for line in str(output).splitlines()
                ]
                host_data[nodes] = max(int_host_values)

            except (IndexError, ValueError):
                # Log the error
                print("    {}: Unable to obtain the maximum {} size due to "
                      "unexpected output:\n      {}".format(
                          nodes, text,
                          "\n      ".join(str(output).splitlines())))

                # Return an error data set for all of the hosts
                host_data = {NodeSet.fromlist(hosts): DATA_ERROR}
                break

    return host_data
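
# Hypothetical call (not part of the original example); assumes run_task is
# importable and hosts "node1"/"node2" exist. lsblk's -b/-d/-n/-o flags make
# each output line start with a device size in bytes, which is the output
# shape this function expects:
sizes = get_host_data(
    ["node1", "node2"], "lsblk -b -d -n -o SIZE", "block device size",
    "Error obtaining device sizes", timeout=60)
for nodeset, value in sizes.items():
    print("{}: {}".format(nodeset, value))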
Example #55
 def nodeset(self):
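     """Return the folded NodeSet string of all node names."""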
     return str(NodeSet.fromlist(self.nodes.keys()))
Example #56
 def display_proxy_errors(cls, fs):
     """Display proxy error messages for the specified filesystem."""
     for msg, nodes in fs.proxy_errors.walk():
         nodes = str(NodeSet.fromlist(nodes))
         msg = str(msg).replace('THIS_SHINE_HOST', nodes)
         print >> sys.stderr, "%s: %s" % (nodes, msg)
Example #58
    def get_log_data(self, hosts, since, until=None, timeout=60):
        """Gather log output for the command running on each host.

        Note (from journalctl man page):
            Date specifications should be of the format "2012-10-30 18:17:16".
            If the time part is omitted, "00:00:00" is assumed. If only the
            seconds component is omitted, ":00" is assumed. If the date
            component is omitted, the current day is assumed. Alternatively the
            strings "yesterday", "today", "tomorrow" are understood, which refer
            to 00:00:00 of the day before the current day, the current day, or
            the day after the current day, respectively.  "now" refers to the
            current time. Finally, relative times may be specified, prefixed
            with "-" or "+", referring to times before or after the current
            time, respectively.

        Args:
            hosts (list): list of hosts from which to gather log data.
            since (str): show log entries from this date.
            until (str, optional): show log entries up to this date. Defaults
                to None, in which case it is not utilized.
            timeout (int, optional): timeout for issuing the command. Defaults
                to 60 seconds.

        Returns:
            list: a list of dictionaries including:
                "hosts": <NodeSet() of hosts with this data>
                "data": <journalctl output>

        """
        # Setup the journalctl command to capture all unit activity from the
        # specified start date to now or a specified end date
        #   --output=json?
        command = self.get_journalctl_command(since, until)
        self.log.info("Gathering log data on %s: %s", str(hosts), command)

        # Gather the log information per host
        results = run_pcmd(hosts, command, False, timeout, None)

        # Determine if the command completed successfully without a timeout
        status = True
        for result in results:
            if result["interrupted"]:
                self.log.info("  Errors detected running \"%s\":", command)
                self.log.info("    %s: timeout detected after %s seconds",
                              str(result["hosts"]), timeout)
                status = False
            elif result["exit_status"] != 0:
                self.log.info("  Errors detected running \"%s\":", command)
                status = False
            if not status:
                break

        # Display/return the command output
        log_data = []
        for result in results:
            if result["exit_status"] == 0 and not result["interrupted"]:
                # Add the successful output from each node to the dictionary
                log_data.append({
                    "hosts": result["hosts"],
                    "data": result["stdout"]
                })
            else:
                # Display all of the results in the case of an error
                if len(result["stdout"]) > 1:
                    self.log.info("    %s: rc=%s, output:",
                                  str(result["hosts"]), result["exit_status"])
                    for line in result["stdout"]:
                        self.log.info("      %s", line)
                else:
                    self.log.info("    %s: rc=%s, output: %s",
                                  str(result["hosts"]), result["exit_status"],
                                  result["stdout"][0] if result["stdout"] else "")

        # Report any errors through an exception
        if not status:
            raise CommandFailure(
                "Error(s) detected gathering {} log data on {}".format(
                    self._systemctl.service.value, NodeSet.fromlist(hosts)))

        # Return the successful command output per set of hosts
        return log_data
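
# Hypothetical usage sketch (not part of the original example); assumes
# `mgr` is an instance of the class this method belongs to and the hosts
# below exist:
log_data = mgr.get_log_data(["node1", "node2"], since="yesterday", until="now")
for entry in log_data:
    print("{}: {} line(s) of journal output".format(
        entry["hosts"], len(entry["data"])))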
Example #59
def pcmd(hosts, command, verbose=True, timeout=None, expect_rc=0):
    """Run a command on each host in parallel and get the return codes.

    Args:
        hosts (list): list of hosts
        command (str): the command to run in parallel
        verbose (bool, optional): display command output. Defaults to True.
        timeout (int, optional): command timeout in seconds. Defaults to None.
        expect_rc (int, optional): expected return code. Defaults to 0.

    Returns:
        dict: a dictionary of return code keys and accompanying NodeSet
            values indicating which hosts yielded the return code.

    """
    # Run the command on each host in parallel
    task = run_task(hosts, command, timeout)

    # Report any errors
    retcode_dict = {}
    errors = False
    for retcode, rc_nodes in task.iter_retcodes():
        # Create a NodeSet for this list of nodes
        nodeset = NodeSet.fromlist(rc_nodes)

        # Include this NodeSet for this return code
        if retcode not in retcode_dict:
            retcode_dict[retcode] = NodeSet()
        retcode_dict[retcode].add(nodeset)

        # Keep track of any errors
        if expect_rc is not None and expect_rc != retcode:
            errors = True

    # Report command output if requested or errors are detected
    if verbose or errors:
        print("Command:\n  {}".format(command))
        print("Command return codes:")
        for retcode in sorted(retcode_dict):
            print("  {}: rc={}".format(retcode_dict[retcode], retcode))

        print("Command output:")
        for output, bf_nodes in task.iter_buffers():
            # Create a NodeSet for this list of nodes
            nodeset = NodeSet.fromlist(bf_nodes)

            # Display the output per node set
            print("  {}:\n    {}".format(
                nodeset, "\n    ".join(str(output).splitlines())))

    # Report any timeouts
    if timeout and task.num_timeout() > 0:
        # iter_keys_timeout() is a generator, so fold it into a NodeSet once
        # instead of consuming it twice
        nodes = NodeSet.fromlist(task.iter_keys_timeout())
        print("{}: timeout detected running '{}' on {}/{} hosts after {}s".
              format(nodes, command, task.num_timeout(), len(hosts), timeout))
        retcode = 255
        if retcode not in retcode_dict:
            retcode_dict[retcode] = NodeSet()
        retcode_dict[retcode].add(nodes)

    return retcode_dict