def display(tree, disp, gather, trace_mode, enable_nodeset_key):
    """nicely display MsgTree instance `tree' content according to `disp'
    Display object and `gather' boolean flag"""
    out = sys_stdout()
    try:
        if trace_mode:
            display_tree(tree, disp, out)
        else:
            if gather:
                if enable_nodeset_key:
                    # lambda to create a NodeSet from keys returned by walk()
                    ns_getter = lambda x: NodeSet.fromlist(x[1])
                    for nodeset in sorted((ns_getter(item)
                                           for item in tree.walk()),
                                          key=nodeset_cmpkey):
                        disp.print_gather(nodeset, tree[nodeset[0]])
                else:
                    for msg, key in tree.walk():
                        disp.print_gather_keys(key, msg)
            else:
                if enable_nodeset_key:
                    # nodes are automagically sorted by NodeSet
                    for node in NodeSet.fromlist(tree.keys()).nsiter():
                        disp.print_gather(node, tree[str(node)])
                else:
                    for key in tree.keys():
                        disp.print_gather_keys([key], tree[key])
    finally:
        out.flush()
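# A minimal sketch of the MsgTree gathering that display() performs above,
# assuming ClusterShell is installed; node names and messages are made up.
from ClusterShell.MsgTree import MsgTree
from ClusterShell.NodeSet import NodeSet

tree = MsgTree()
tree.add("node1", b"ok")
tree.add("node2", b"ok")
tree.add("node3", b"fail")
for msg, keys in tree.walk():
    # keys holding identical messages are folded into a single NodeSet
    print("%s: %s" % (NodeSet.fromlist(keys), msg.message().decode()))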
def ev_close(self, worker):
    """
    Check process termination status and generate appropriate events.
    """
    Action.ev_close(self, worker)

    # Action timed out
    if worker.did_timeout():
        nodes = NodeSet.fromlist(worker.iter_keys_timeout())
        self.fs._handle_shine_proxy_error(nodes, "Nodes timed out")
        self.set_status(ACT_ERROR)

    # Action succeeded
    elif max(rc for rc, _ in worker.iter_retcodes()) == 0:
        self.set_status(ACT_OK)

    # Action failed
    else:
        for rc, nodes in worker.iter_retcodes():
            if rc == 0:
                continue

            # Avoid warnings, flag this component in error state
            for comp in self._comps or []:
                comp.sanitize_state(nodes=worker.nodes)

            for output, nodes in worker.iter_buffers(match_keys=nodes):
                nodes = NodeSet.fromlist(nodes)
                msg = "Copy failed: %s" % output
                self.fs._handle_shine_proxy_error(nodes, msg)
        self.set_status(ACT_ERROR)
def __gen_action_output(self, iterbuf, iterrc, timeouts, error_only):
    '''Display command result from output and retcodes.'''

    # Build the list of non-zero rc nodes
    retcodes = list(iterrc)
    ok_nodes = NodeSet.fromlist(nds for rc, nds in retcodes if rc == 0)

    output = []
    for out, nodes in iterbuf:
        if error_only:
            nodes = NodeSet(nodes) - ok_nodes
        if nodes and out:
            for lbuf in out.splitlines():
                output.append(' > %s: %s' %
                              (self.string_color(nodes, 'CYAN'), lbuf))

    for retcode, nodes in retcodes:
        if retcode == 0 and not error_only:
            output.append(' > %s exited with %s' %
                          (self.string_color(nodes, 'CYAN'),
                           self.string_color(retcode, 'GREEN')))
        elif retcode != 0:
            output.append(' > %s exited with %s' %
                          (self.string_color(nodes, 'CYAN'),
                           self.string_color(retcode, 'RED')))

    if len(timeouts):
        output.append(' > %s has %s' %
                      (self.string_color(timeouts, 'CYAN'),
                       self.string_color('timeout', 'RED')))
    return output
def torque_job_nodelist(self, nodelist):
    nodelist = self._exechostpat.sub('', nodelist)
    nodelist = nodelist.split('+')
    nbprocs = len(nodelist)
    nodelist = NodeSet.fromlist(nodelist)
    nbnodes = len(nodelist)
    nodelist = str(nodelist)
    return nbprocs, nbnodes, nodelist
def parse(self, filename=None):
    """
    Parse the content of the tuning configuration file and store the
    configuration in the object.
    """
    # Build the patterns to retrieve alias and parameter declarations
    alias_re = re.compile(r"alias\s+(\S+)\s*=\s*(\S+)$")
    parameter_re = re.compile(r'("[^"]+"|\S+)\s+(\S+)\s+(\S+)$')

    supported = NodeSet.fromlist(list(NODE_TYPES) + list(TYPE_ALIASES.keys()))

    # Open the file and read each line
    try:
        tuning_file = open(filename or self.filename)
        for line in tuning_file.readlines():
            # Skip comments and blanks
            line = line.split('#', 1)[0].strip()
            if not line:
                continue

            m_alias = alias_re.match(line)
            m_param = parameter_re.match(line)

            if m_alias:
                # This line is an alias creation
                self.create_parameter_alias(m_alias.group(1),
                                            m_alias.group(2))
            elif m_param:
                # This line is a parameter instantiation
                nodes = NodeSet.fromlist(m_param.group(3).lower().split(';'))
                self.create_parameter(m_param.group(2), m_param.group(1),
                                      nodes & supported, nodes - supported)
            else:
                # This line is not recognized
                raise TuningError("Wrong tuning syntax '%s'" % line)
        tuning_file.close()
    except IOError, error:
        msg = "Error while reading tuning configuration file: %s" % error
        raise TuningError(msg)
def iter_retcodes(self, match_keys=None):
    """
    Returns an iterator over return codes and associated NodeSet. If
    the optional parameter match_keys is defined, only keys found in
    match_keys are returned.
    """
    self._task_bound_check()
    for rc, keys in self.task._rc_iter_by_worker(self, match_keys):
        yield rc, NodeSet.fromlist(keys)
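# A hedged usage sketch for the iter_retcodes() pattern above, assuming
# ClusterShell is installed; 'node[1-4]' and the command are hypothetical.
from ClusterShell.Task import task_self
from ClusterShell.NodeSet import NodeSet

task = task_self()
task.run("uname -r", nodes="node[1-4]", timeout=10)
for rc, keys in task.iter_retcodes():
    # keys is a plain list of node names; fold it back into a NodeSet
    print("%s exited with %d" % (NodeSet.fromlist(keys), rc))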
def __init__(self, name, value, node_types=None, node_list=None):
    self.name = name
    self.value = value
    self._node_types = set()
    self.node_types = node_types or set()
    self.node_list = NodeSet()
    if node_list is not None:
        self.node_list = NodeSet.fromlist(node_list)
def nodes_timeout(self):
    """Get nodeset of timeout nodes for this action."""
    if self.worker:
        if isinstance(self.worker, WorkerPopen):
            if self.worker.did_timeout():
                return NodeSet("localhost")
        else:
            return NodeSet.fromlist(list(self.worker.iter_keys_timeout()))
    return NodeSet()
def checkNodes(self):
    try:
        # print command info
        print '\n== Checking active nodes =='
        # run a simple command on the specified nodes
        task_self().run('echo OK', nodes=self.ns)
        # retrieve and check return code
        for retcode, nodes in task_self().iter_retcodes():
            if retcode in (0, 1, 2):
                # add nodes to OK set
                self.ns_ok |= NodeSet.fromlist(nodes)
                print '%s : OK' % nodes
            else:
                # add nodes to KO set
                self.ns_ko |= NodeSet.fromlist(nodes)
                print '%s : KO' % nodes
    # syntax error
    except NodeSetException:
        print >> sys.stderr, \
            '(!) Error : the submitted nodeset [%s] is not valid' % self.ns
def connected(self, src_ns):
    """find out and return the aggregation of directly connected
    children from src_ns.
    Argument src_ns is expected to be a NodeSet instance. Result is
    returned as a NodeSet instance
    """
    next_hop = NodeSet.fromlist(dst for dst in
                                [route.dest(src_ns)
                                 for route in self._routes]
                                if dst is not None)
    if len(next_hop) == 0:
        return None
    return next_hop
def iter_errors(self, match_keys=None):
    """
    Returns an iterator over available error buffers and associated
    NodeSet. If the optional parameter match_keys is defined, only
    keys found in match_keys are returned.
    """
    self._task_bound_check()
    for msg, keys in self.task._call_tree_matcher(
            self.task._msgtree(self.SNAME_STDERR).walk, match_keys, self):
        yield msg, NodeSet.fromlist(keys)
def _live_line(self, worker):
    # if all nodes have replied, display gathered line
    while self._mtreeq and len(self._mtreeq[0]) == len(self._nodes):
        mtree = self._mtreeq.pop(0)
        self._offload += 1
        self._runtimer_clean()
        nodesetify = lambda v: (v[0], NodeSet.fromlist(v[1]))
        for buf, nodeset in sorted(map(nodesetify, mtree.walk()),
                                   key=bufnodeset_cmpkey):
            self._display.print_gather(nodeset, buf)
        self._runtimer_set_dirty()
def nodeset_cmp(ns1, ns2):
    """Compare 2 nodesets by their length (we want larger nodeset
    first) and then by first node."""
    len_cmp = cmp(len(ns2), len(ns1))
    if not len_cmp:
        smaller = NodeSet.fromlist([ns1[0], ns2[0]])[0]
        if smaller == ns1[0]:
            return -1
        else:
            return 1
    return len_cmp
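# nodeset_cmp() depends on Python 2's cmp(); a sketch of an equivalent
# Python 3 sort key (larger nodesets first, then by first node). This is
# an assumption-based stand-in, not the library's own nodeset_cmpkey.
def nodeset_cmpkey_py3(ns):
    return (-len(ns), str(ns[0]))

# usage: sorted(nodesets, key=nodeset_cmpkey_py3)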
def makedepgraph(config, rules, components_lists, options):
    """
    Return the dependency graph for the given pair
    ('req_ruleset', 'components_lists').
    """
    ruleset = RuleSet(rules.values())
    all_set = get_component_set_from(config, components_lists)
    force_opt = options.force
    force_rule = force_opt.split(',') if force_opt is not None else []
    docache = getattr(options, 'docache', 'yes') == 'yes'
    _LOGGER.debug("Caching filter results (docache) is: %s", docache)
    depgraph = ruleset.get_depgraph(all_set, force_rule, docache)
    if _LOGGER.isEnabledFor(logging.DEBUG):
        _LOGGER.debug("Components set: %s",
                      NodeSet.fromlist([to_str_from_unicode(x.id)
                                        for x in all_set]))
        _LOGGER.debug("Remaining: %s",
                      NodeSet.fromlist(
                          [str(x) for x in depgraph.remaining_components]))
        _LOGGER.debug("List: %s",
                      NodeSet.fromlist(
                          [str(x) for x in depgraph.components_map]))
    return depgraph
def ev_close(self, worker, timedout):
    # Worker is closing -- it's time to gather results...
    self._runtimer_finalize(worker)
    for mtree in self._mtreeq:
        nodesetify = lambda v: (v[0], NodeSet.fromlist(v[1]))
        for buf, nodeset in sorted(map(nodesetify, mtree.walk()),
                                   key=bufnodeset_cmpkey):
            self._display.print_gather(nodeset, buf)
    self._close_common(worker)

    # Notify main thread to update its prompt
    self.update_prompt(worker)
def print_running_tasks(self):
    '''Rewrite the current line and print the current running tasks'''
    rtasks = [t.parent.name for t in action_manager_self().running_tasks]
    if rtasks and self._show_running:
        tasks_disp = '[%s]' % NodeSet.fromlist(rtasks)
        width = min(self._pl_width, self._term_width)
        # truncate display to avoid buggy output when the length of the
        # displayed tasks is bigger than the screen width
        if len(tasks_disp) >= self._term_width:
            tasks_disp = "%s...]" % tasks_disp[:self._term_width - 4]
        eol = ' ' * (width - len(tasks_disp))
        if not self.cleanup:
            eol = ''
        sys.stderr.write('%s%s\r' % (tasks_disp, eol))
        sys.stderr.flush()
        self._pl_width = len(tasks_disp)
def display_tree(tree, disp, out):
    """display sub-routine for clubak -T (msgtree trace mode)"""
    togh = True
    offset = 2
    reldepth = -offset
    reldepths = {}
    line_mode = disp.line_mode
    for msgline, keys, depth, nchildren in tree.walk_trace():
        if togh:
            if depth in reldepths:
                reldepth = reldepths[depth]
            else:
                reldepth = reldepths[depth] = reldepth + offset
            nodeset = NodeSet.fromlist(keys)
            if line_mode:
                out.write(str(nodeset).encode() + b':\n')
            else:
                out.write(disp.format_header(nodeset, reldepth))
        out.write(b' ' * reldepth + msgline + b'\n')
        togh = nchildren != 1
def _distant_action_by_server(self, action_class, servers, **kwargs):
    # filter local server
    distant_servers = Server.distant_servers(servers)

    # perform action on distant servers
    if len(distant_servers) > 0:
        action = action_class(nodes=distant_servers, fs=self, **kwargs)
        action.launch()
        self._run_actions()

        if action.status() == ACT_ERROR:
            err_code = None
            if task_self().num_timeout():
                err_code = -1
            elif task_self().max_retcode():
                err_code = task_self().max_retcode()

            # FSRemoteError is limited and cannot handle more than 1 error
            msg, nodes = list(self.proxy_errors.walk())[0]
            nodes = NodeSet.fromlist(nodes)
            msg = str(msg).replace('THIS_SHINE_HOST', str(nodes))
            raise FSRemoteError(nodes, err_code, msg)
def check_file_exists(hosts, filename, user=None):
    """Check if a specified file exists on each specified host.

    If specified, verify that the file exists and is owned by the user.

    Args:
        hosts (list): list of hosts
        filename (str): file to check for the existence of on each host
        user (str, optional): owner of the file. Defaults to None.

    Returns:
        (bool, NodeSet): A tuple of:
            - True if the file exists on each of the hosts; False otherwise
            - A NodeSet of hosts on which the file does not exist

    """
    missing_file = NodeSet()
    command = "test {} '{}'".format("-e" if user is None else "-O", filename)
    task = run_task(hosts, command)
    for ret_code, node_list in task.iter_retcodes():
        if ret_code != 0:
            missing_file.add(NodeSet.fromlist(node_list))

    return len(missing_file) == 0, missing_file
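# Hypothetical call sketch for check_file_exists() above; the host names
# are made up and run_task() comes from the surrounding test framework.
all_exist, missing = check_file_exists(["node1", "node2"], "/etc/hosts")
if not all_exist:
    print("file missing on: %s" % missing)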
def print_action_results(self, action, error_only=False):
    '''Remove the current line and write grouped results of an action'''
    line = ['%s %s ran in %.2f s' %
            (self.string_color(action.name, 'MAGENTA'),
             action.parent.fullname(), action.duration)]
    buffers = []
    retcodes = []
    timeout = NodeSet()
    # Local action
    if action.worker.current_node is None:
        buffers = [(action.worker.read(), 'localhost')]
        if action.worker.did_timeout():
            timeout.add('localhost')
        if action.worker.retcode() is not None:
            retcodes.append((action.worker.retcode(), 'localhost'))
    # Remote action
    else:
        buffers = action.worker.iter_buffers()
        retcodes = action.worker.iter_retcodes()
        timeout = NodeSet.fromlist(action.worker.iter_keys_timeout())
    line += self.__gen_action_output(buffers, retcodes, timeout, error_only)
    self.output("\n".join(line))
def remote_copy(self, hostlist, remote_dir, local_dir):
    """Copy files from remote dir to local dir.

    Args:
        hostlist (list): list of remote nodes
        remote_dir (str): remote directory of files
        local_dir (str): local directory

    Raises:
        SoakTestError: if there is an error with the remote copy

    """
    this_host = socket.gethostname()
    result = pcmd(
        NodeSet.fromlist(hostlist),
        "if [ ! -z '$(ls -A {0})' ]; then "
        "scp -p -r {0}/ \"{1}:'{2}/'\" && rm -rf {0}/*; fi".format(
            remote_dir, this_host, local_dir),
        verbose=False)
    if len(result) > 1 or 0 not in result:
        raise SoakTestError(
            "Error executing remote copy: {}".format(
                ", ".join(
                    [str(result[key]) for key in result if key != 0])))
def get_active_network_interfaces(hosts, verbose=True):
    """Get all active network interfaces on the hosts.

    Args:
        hosts (NodeSet): hosts on which to find active interfaces
        verbose (bool, optional): display command details. Defaults to True.

    Returns:
        dict: a dictionary of interface keys and NodeSet values on which
            they were found

    """
    net_path = os.path.join(os.path.sep, "sys", "class", "net")
    operstate = os.path.join(net_path, "*", "operstate")
    command = " | ".join([
        f"grep -l 'up' {operstate}",
        "grep -Ev '/(lo|bonding_masters)/'",
        "sort"
    ])
    task = run_task(hosts, command, verbose=verbose)
    if verbose:
        display_task(task)

    # Populate a dictionary of active interfaces with a NodeSet of hosts
    # on which each interface was found
    active_interfaces = {}
    for output, nodelist in task.iter_buffers():
        output_lines = [line.decode("utf-8") for line in output]
        nodeset = NodeSet.fromlist(nodelist)
        for line in output_lines:
            try:
                interface = line.split("/")[-2]
                if interface not in active_interfaces:
                    active_interfaces[interface] = NodeSet()
                active_interfaces[interface].update(nodeset)
            except IndexError:
                pass
    return active_interfaces
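# Hypothetical usage sketch for get_active_network_interfaces() above;
# assumes the surrounding framework provides run_task()/display_task().
from ClusterShell.NodeSet import NodeSet

for iface, nodes in get_active_network_interfaces(NodeSet("node[1-4]"),
                                                  verbose=False).items():
    print("interface %s is up on %s" % (iface, nodes))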
def _parse_token(self, token):
    """Concrete implementation of parent abstract method.

    :Parameters:
        according to parent
        :py:meth:`cumin.backends.BaseQueryAggregator._parse_token`.
    """
    if not isinstance(token, pp.ParseResults):  # pragma: no cover - this should never happen
        raise InvalidQueryError(
            'Expecting ParseResults object, got {type}: {token}'.format(
                type=type(token), token=token))

    token_dict = token.asDict()
    self.logger.trace('Token is: %s | %s', token_dict, token)

    if 'hosts' in token_dict:
        element = self._get_stack_element()
        element['hosts'] = NodeSet.fromlist(token_dict['hosts'],
                                            resolver=self.resolver)
        if 'bool' in token_dict:
            element['bool'] = token_dict['bool']
        self.stack_pointer['children'].append(element)
    elif 'open_subgroup' in token_dict and 'close_subgroup' in token_dict:
        self._open_subgroup()
        if 'bool' in token_dict:
            self.stack_pointer['bool'] = token_dict['bool']
        for subtoken in token:
            if isinstance(subtoken, str):
                # Grammar literals, boolean operators and parentheses
                continue
            self._parse_token(subtoken)
        self._close_subgroup()
    else:  # pragma: no cover - this should never happen
        raise InvalidQueryError(
            'Got unexpected token: {token}'.format(token=token))
def run_soak(self, test_param):
    """Run the soak test specified by the test params.

    Args:
        test_param (str): test_params from yaml file
    """
    self.soak_results = {}
    self.pool = []
    self.container = []
    self.harasser_results = {}
    self.harasser_args = {}
    run_harasser = False
    self.all_failed_jobs = []
    self.all_failed_harassers = []
    self.soak_errors = []
    self.check_errors = []
    self.used = []
    test_to = self.params.get("test_timeout", test_param + "*")
    self.test_name = self.params.get("name", test_param + "*")
    single_test_pool = self.params.get(
        "single_test_pool", test_param + "*", True)
    self.dmg_command.copy_certificates(
        get_log_file("daosCA/certs"), self.hostlist_clients)
    self.dmg_command.copy_configuration(self.hostlist_clients)
    harassers = self.params.get("harasserlist", test_param + "*")
    job_list = self.params.get("joblist", test_param + "*")
    if harassers:
        run_harasser = True
        self.log.info("<< Initial harasser list = %s>>", harassers)
        harasserlist = harassers[:]
    # Create the reserved pool with data
    # self.pool is a list of all the pools used in soak
    # self.pool[0] will always be the reserved pool
    add_pools(self, ["pool_reserved"])
    # Create the reserved container
    resv_cont = self.get_container(
        self.pool[0], "/run/container_reserved/*", True)
    # populate reserved container with a 500MB file
    initial_resv_file = os.path.join(
        os.environ["DAOS_TEST_LOG_DIR"], "initial", "resv_file")
    try:
        reserved_file_copy(self, initial_resv_file, self.pool[0], resv_cont,
                           num_bytes=500000000, cmd="write")
    except CommandFailure as error:
        raise SoakTestError(
            "<<FAILED: Soak reserved container write failed>>") from error
    # Create pool for jobs
    if single_test_pool:
        add_pools(self, ["pool_jobs"])
        self.log.info("Current pools: %s",
                      " ".join([pool.uuid for pool in self.pool]))
    # cleanup soak log directories before test on all nodes
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "rm -rf {}".format(self.log_dir), self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Soak directories not removed from clients>>: "
            "{}".format(self.hostlist_clients))
    # cleanup test_node
    for log_dir in [self.log_dir, self.sharedlog_dir]:
        cmd = "rm -rf {}".format(log_dir)
        try:
            result = run_command(cmd, timeout=30)
        except DaosTestError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory {} was not removed>>".format(
                    log_dir)) from error
    # Initialize time
    start_time = time.time()
    self.test_timeout = int(3600 * test_to)
    self.end_time = start_time + self.test_timeout
    self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
    while time.time() < self.end_time:
        # Start new pass
        start_loop_time = time.time()
        self.log.info(
            "<<SOAK LOOP %s: time until done %s>>", self.loop,
            DDHHMMSS_format(self.end_time - time.time()))
        if not single_test_pool:
            # Create pool for jobs
            add_pools(self, ["pool_jobs"])
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
        # Initialize harassers
        if run_harasser:
            if not harasserlist:
                harasserlist = harassers[:]
            harasser = harasserlist.pop(0)
            self.harasser_args = {}
            self.harasser_results = {}
            self.harassers, self.offline_harassers = get_harassers(harasser)
        try:
            self.execute_jobs(job_list, self.pool[1])
        except SoakTestError as error:
            self.fail(error)
        # Check space after jobs done
        for pool in self.pool:
            self.dmg_command.pool_query(pool.uuid)
        self.soak_errors.extend(self.destroy_containers(self.container))
        self.container = []
        # Remove the test pools from self.pool; preserving reserved pool
        if not single_test_pool:
            self.soak_errors.extend(self.destroy_pools(self.pool[1]))
        self.pool = [self.pool[0]]
        self.log.info("Current pools: %s",
                      " ".join([pool.uuid for pool in self.pool]))
        # Fail if the pool/containers did not clean up correctly
        self.assertEqual(len(self.soak_errors), 0,
                         "\n".join(self.soak_errors))
        # Break out of loop if smoke
        if "smoke" in self.test_name:
            break
        loop_time = time.time() - start_loop_time
        self.log.info("<<LOOP %s completed in %s at %s>>", self.loop,
                      DDHHMMSS_format(loop_time), time.ctime())
        # Initialize harasser loop time from first pass loop time
        if self.loop == 1 and run_harasser:
            self.harasser_loop_time = loop_time
        self.loop += 1
    # verify reserved container data
    final_resv_file = os.path.join(
        os.environ["DAOS_TEST_LOG_DIR"], "final", "resv_file")
    try:
        reserved_file_copy(self, final_resv_file, self.pool[0], resv_cont)
    except CommandFailure as error:
        raise SoakTestError(
            "<<FAILED: Soak reserved container read failed>>") from error
    if not cmp(initial_resv_file, final_resv_file):
        self.soak_errors.append(
            "Data verification error on reserved pool after SOAK completed")
    for file in [initial_resv_file, final_resv_file]:
        if os.path.isfile(file):
            file_name = os.path.split(os.path.dirname(file))[-1]
            # save a copy of the POSIX file in self.outputsoakdir
            copy_cmd = "cp -p {} {}/{}_resv_file".format(
                file, self.outputsoakdir, file_name)
            try:
                run_command(copy_cmd, timeout=30)
            except DaosTestError as error:
                self.soak_errors.append(
                    "Reserved data file {} failed to archive".format(file))
            os.remove(file)
    self.container.append(resv_cont)
    # Gather the daos logs from the client nodes
    self.log.info(
        "<<<<SOAK TOTAL TEST TIME = %s>>>>",
        DDHHMMSS_format(time.time() - start_time))
def verify_expected_states(self, set_expected=False):
    """Verify that the expected job process states match the current states.

    Args:
        set_expected (bool, optional): option to update the expected job
            process states to the current states prior to verification.
            Defaults to False.

    Returns:
        dict: a dictionary of whether or not any of the job process states
            were not 'expected' (which should warrant an error) and whether
            or not the job processes require a 'restart' (either due to any
            unexpected states or because at least one job process was no
            longer found to be running)

    """
    status = {"expected": True, "restart": False}
    show_log_hosts = []

    # Get the current state of each job process
    current_states = self.get_current_state()
    if set_expected:
        # Assign the expected states to the current job process states
        self.log.info(
            "<%s> Assigning expected %s states.",
            self._id.upper(), self._id)
        self._expected_states = current_states.copy()

    # Verify the expected states match the current states
    self.log.info(
        "<%s> Verifying %s states: group=%s, hosts=%s",
        self._id.upper(), self._id, self.get_config_value("name"),
        NodeSet.fromlist(self._hosts))
    if current_states:
        log_format = " %-4s %-15s %-36s %-22s %-14s %s"
        self.log.info(
            log_format,
            "Rank", "Host", "UUID", "Expected State", "Current State",
            "Result")
        self.log.info(
            log_format,
            "-" * 4, "-" * 15, "-" * 36, "-" * 22, "-" * 14, "-" * 6)

        # Verify that each expected rank appears in the current states
        for rank in sorted(self._expected_states):
            current_host = self._expected_states[rank]["host"]
            expected = self._expected_states[rank]["state"]
            if isinstance(expected, (list, tuple)):
                expected = [item.lower() for item in expected]
            else:
                expected = [expected.lower()]
            try:
                current_rank = current_states.pop(rank)
                current = current_rank["state"].lower()
            except KeyError:
                current = "not detected"

            # Check if the job's expected state matches the current state
            result = "PASS" if current in expected else "RESTART"
            status["expected"] &= current in expected

            # Restart all job processes if the expected rank is not running
            if current not in self._states["running"]:
                status["restart"] = True
                result = "RESTART"

            # Keep track of any server in the errored state or in an
            # unexpected state in order to display its log
            if (current in self._states["errored"]
                    or current not in expected):
                if current_host not in show_log_hosts:
                    show_log_hosts.append(current_host)

            self.log.info(
                log_format, rank, current_host,
                self._expected_states[rank]["uuid"], "|".join(expected),
                current, result)

    elif not self._expected_states:
        # Expected states are populated as part of start() procedure,
        # so if it is empty there was an error starting the job processes.
        self.log.info(
            " Unable to obtain current %s state. Undefined expected %s "
            "states due to a failure starting the %s.",
            self._id, self._id, self._id,
        )
        status["restart"] = True

    else:
        # Any failure to obtain the current rank information is an error
        self.log.info(
            " Unable to obtain current %s state. If the %ss are "
            "not running this is expected.", self._id, self._id)

        # Do not report an error if all servers are expected to be stopped
        all_stopped = bool(self._expected_states)
        for rank in sorted(self._expected_states):
            states = self._expected_states[rank]["state"]
            if not isinstance(states, (list, tuple)):
                states = [states]
            if "stopped" not in [item.lower() for item in states]:
                all_stopped = False
                break
        if all_stopped:
            self.log.info(" All %s are expected to be stopped.", self._id)
            status["restart"] = True
        else:
            status["expected"] = False

    # Any unexpected state detected warrants a restart of all job processes
    if not status["expected"]:
        status["restart"] = True

    # Set the verified timestamp
    if set_expected and hasattr(self.manager, "timestamps"):
        self.manager.timestamps["verified"] = datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")

    # Dump the server logs for any identified server
    if show_log_hosts:
        self.log.info(
            "<SERVER> logs for ranks in the errored state since start "
            "detection or detected in an unexpected state")
        if hasattr(self.manager, "dump_logs"):
            self.manager.dump_logs(show_log_hosts)

    return status
def _get_deps(self, component, rule):
    """
    Find dependencies of a given component. This implies calling the
    rule.depsfinder script. Substitution of variables is done.
    Returns None if the given rule has already been applied on the
    given component.
    """
    result = dict()
    depsfinder = rule.depsfinder
    if rule.dependson is None or len(rule.dependson) == 0 or \
            depsfinder is None or len(depsfinder) == 0:
        _LOGGER.debug("No 'DepsFinder' or 'DependsOn' specified"
                      " in rule %s for component %s. Skipping.",
                      rule, component)
        return result
    var_map = _get_var_map(component.id, component.name, component.type,
                           component.category, self.ruleset.name,
                           rule.name, rule.help)
    cmd = substitute(var_map, depsfinder)
    _LOGGER.debug("Calling depsfinder for component %s: %s", component, cmd)
    popen_args = shlex.split(to_str_from_unicode(cmd, should_be_uni=True))
    try:
        popen = subprocess.Popen(popen_args,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 bufsize=-1)  # Use system default
    except OSError as ose:
        _LOGGER.error("Can't call depsfinder '%s': %s", cmd, ose)
        return result

    (msg_std, msg_err) = popen.communicate()
    msg_std = msg_std.strip()
    msg_err = msg_err.strip()
    if len(msg_err) != 0:
        _LOGGER.warning("Depsfinder error when applying rule %s"
                        " to component %s: %s", rule, component, msg_err)
    deps = set()
    with StringIO(to_unicode(msg_std)) as reader:
        for dep in reader:
            dep_id = dep.strip()
            if len(dep_id) == 0:
                continue
            dependency = self.components_map.get(dep_id)
            if dependency is None:
                _LOGGER.debug("Creating dep for component %s with id: %r",
                              component, dep_id)
                dependency = Component(dep_id)
                self.components_map[dep_id] = dependency
            deps.add(dependency)
            _update_graph_with_node(self.dag, dep_id)

    if _LOGGER.isEnabledFor(INFO):
        _LOGGER.info("%s.depsfinder(%s): %s",
                     rule.name, component.id,
                     NodeSet.fromlist([str(x.id) for x in deps]))
    # Find match only on rule.dependson
    return _find_match([self.ruleset.rules_for[x]
                        for x in rule.dependson], deps)
def run_soak(self, test_param):
    """Run the soak test specified by the test params.

    Args:
        test_param (str): test_params from yaml file
    """
    self.soak_results = {}
    self.pool = []
    self.container = []
    self.harasser_results = {}
    self.harasser_args = {}
    run_harasser = False
    self.all_failed_jobs = []
    self.all_failed_harassers = []
    self.soak_errors = []
    test_to = self.params.get("test_timeout", test_param + "*")
    self.job_timeout = self.params.get("job_timeout", test_param + "*")
    self.test_name = self.params.get("name", test_param + "*")
    self.nodesperjob = self.params.get("nodesperjob", test_param + "*")
    self.taskspernode = self.params.get("taskspernode", test_param + "*")
    harassers = self.params.get("harasserlist", test_param + "*")
    job_list = self.params.get("joblist", test_param + "*")
    rank = self.params.get("rank", "/run/container_reserved/*")
    obj_class = self.params.get("oclass", "/run/container_reserved/*")
    if harassers:
        harasserlist = get_harassers(harassers)
        self.harassers = harasserlist[:]
        run_harasser = True
        self.log.info("<< Initial harasser list = %s>>",
                      " ".join([harasser for harasser in self.harassers]))
    # Create the reserved pool with data
    # self.pool is a list of all the pools used in soak
    # self.pool[0] will always be the reserved pool
    add_pools(self, ["pool_reserved"])
    self.pool[0].connect()
    # Create the container and populate with a known data
    # TO-DO: use IOR to write and later read verify the data
    resv_cont = self.get_container(
        self.pool[0], "/run/container_reserved/*", True)
    resv_cont.write_objects(rank, obj_class)
    # cleanup soak log directories before test on all nodes
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "rm -rf {}".format(self.log_dir), self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Soak directories not removed from clients>>: "
            "{}".format(self.hostlist_clients))
    # cleanup test_node
    for log_dir in [self.log_dir, self.sharedlog_dir]:
        cmd = "rm -rf {}".format(log_dir)
        try:
            result = run_command(cmd, timeout=30)
        except DaosTestError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory {} was not removed {}>>".format(
                    log_dir, error))
    # Initialize time
    start_time = time.time()
    self.test_timeout = int(3600 * test_to)
    self.end_time = start_time + self.test_timeout
    self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
    while time.time() < self.end_time:
        # Start new pass
        start_loop_time = time.time()
        self.log.info(
            "<<SOAK LOOP %s: time until done %s>>", self.loop,
            DDHHMMSS_format(self.end_time - time.time()))
        # Create pool for jobs
        add_pools(self, ["pool_jobs"])
        self.log.info("Current pools: %s",
                      " ".join([pool.uuid for pool in self.pool]))
        # Initialize harassers
        if run_harasser and not self.harassers:
            self.harasser_results = {}
            self.harasser_args = {}
            self.harassers = harasserlist[:]
        try:
            self.execute_jobs(job_list, self.pool[1])
        except SoakTestError as error:
            self.fail(error)
        # Check space after jobs done
        for pool in self.pool:
            self.dmg_command.pool_query(pool.uuid)
        self.soak_errors.extend(self.destroy_containers(self.container))
        self.soak_errors.extend(self.destroy_pools(self.pool[1]))
        # remove the test pools from self.pool; preserving reserved pool
        self.container = []
        self.pool = [self.pool[0]]
        self.log.info("Current pools: %s",
                      " ".join([pool.uuid for pool in self.pool]))
        # fail if the pool/containers did not clean up correctly
        self.assertEqual(len(self.soak_errors), 0,
                         "\n".join(self.soak_errors))
        # Break out of loop if smoke
        if "smoke" in self.test_name:
            break
        loop_time = time.time() - start_loop_time
        self.log.info("<<LOOP %s completed in %s at %s>>", self.loop,
                      DDHHMMSS_format(loop_time), time.ctime())
        # Initialize harasser loop time from first pass loop time
        if self.loop == 1 and self.harassers:
            self.harasser_loop_time = loop_time
        self.loop += 1
    # TO-DO: use IOR
    if not resv_cont.read_objects():
        self.soak_errors.append(
            "Data verification error on reserved pool after SOAK completed")
    self.container.append(resv_cont)
    # gather the daos logs from the client nodes
    self.log.info(
        "<<<<SOAK TOTAL TEST TIME = %s>>>>",
        DDHHMMSS_format(time.time() - start_time))
def ttyloop(task, nodeset, timeout, display, remote):
    """Manage the interactive prompt to run command"""
    readline_avail = False
    interactive = task.default("USER_interactive")
    if interactive:
        try:
            import readline
            readline_setup()
            readline_avail = True
        except ImportError:
            pass
        display.vprint(VERB_STD,
                       "Enter 'quit' to leave this interactive mode")

    rc = 0
    ns = NodeSet(nodeset)
    ns_info = True
    cmd = ""
    while task.default("USER_running") or \
            (interactive and cmd.lower() != 'quit'):
        try:
            # Set SIGUSR1 handler if needed
            if task.default("USER_handle_SIGUSR1"):
                signal.signal(signal.SIGUSR1, signal_handler)

            if task.default("USER_interactive") and \
                    not task.default("USER_running"):
                if ns_info:
                    display.vprint(VERB_QUIET,
                                   "Working with nodes: %s" % ns)
                    ns_info = False
                prompt = "clush> "
            else:
                prompt = ""
            try:
                cmd = raw_input(prompt)
                assert cmd is not None, "Result of raw_input() is None!"
            finally:
                signal.signal(signal.SIGUSR1, signal.SIG_IGN)
        except EOFError:
            print()
            return
        except UpdatePromptException:
            if task.default("USER_interactive"):
                continue
            return
        except KeyboardInterrupt as kbe:
            # Caught SIGINT here (main thread) but the signal will also
            # reach subprocesses (that will most likely kill them)
            if display.gather:
                # Suspend task, so we can safely access its data from here
                task.suspend()

                # If USER_running is not set, the task had time to finish,
                # that could mean all subprocesses have been killed and all
                # handlers have been processed.
                if not task.default("USER_running"):
                    # let's clush_excepthook handle the rest
                    raise kbe

                # If USER_running is set, the task didn't have time to
                # finish its work, so we must print something for the
                # user...
                print_warn = False

                # Display command output, but cannot order buffers by rc
                nodesetify = lambda v: (v[0], NodeSet._fromlist1(v[1]))
                for buf, nodeset in sorted(map(nodesetify,
                                               task.iter_buffers()),
                                           key=bufnodeset_cmpkey):
                    if not print_warn:
                        print_warn = True
                        display.vprint_err(
                            VERB_STD, "Warning: Caught keyboard interrupt!")
                    display.print_gather(nodeset, buf)

                # Return code handling
                verbexit = VERB_QUIET
                if display.maxrc:
                    verbexit = VERB_STD
                ns_ok = NodeSet()
                for rc, nodelist in task.iter_retcodes():
                    ns_ok.add(NodeSet._fromlist1(nodelist))
                    if rc != 0:
                        # Display return code if not ok ( != 0)
                        nsdisp = ns = NodeSet._fromlist1(nodelist)
                        if display.verbosity >= VERB_QUIET and len(ns) > 1:
                            nsdisp = "%s (%d)" % (ns, len(ns))
                        msgrc = "clush: %s: exited with exit code %d" % \
                                (nsdisp, rc)
                        display.vprint_err(verbexit, msgrc)

                # Add uncompleted nodeset to exception object
                kbe.uncompleted_nodes = ns - ns_ok

                # Display nodes that didn't answer within command timeout
                # delay
                if task.num_timeout() > 0:
                    display.vprint_err(
                        verbexit,
                        "clush: %s: command timeout" %
                        NodeSet._fromlist1(task.iter_keys_timeout()))
            raise kbe

        if task.default("USER_running"):
            ns_reg, ns_unreg = NodeSet(), NodeSet()
            for client in task._engine.clients():
                if client.registered:
                    ns_reg.add(client.key)
                else:
                    ns_unreg.add(client.key)
            if ns_unreg:
                pending = "\nclush: pending(%d): %s" % (len(ns_unreg),
                                                        ns_unreg)
            else:
                pending = ""
            display.vprint_err(VERB_QUIET,
                               "clush: interrupt (^C to abort task)")
            gws = list(task.gateways)
            if not gws:
                display.vprint_err(VERB_QUIET,
                                   "clush: in progress(%d): %s%s"
                                   % (len(ns_reg), ns_reg, pending))
            else:
                display.vprint_err(VERB_QUIET,
                                   "clush: in progress(%d): %s%s\n"
                                   "clush: [tree] open gateways(%d): %s"
                                   % (len(ns_reg), ns_reg, pending,
                                      len(gws), NodeSet._fromlist1(gws)))
            for gw, (chan, metaworkers) in task.gateways.items():
                act_targets = NodeSet.fromlist(mw.gwtargets[gw]
                                               for mw in metaworkers)
                if act_targets:
                    display.vprint_err(
                        VERB_QUIET,
                        "clush: [tree] in progress(%d) on %s: %s"
                        % (len(act_targets), gw, act_targets))
        else:
            cmdl = cmd.lower()
            try:
                ns_info = True
                if cmdl.startswith('+'):
                    ns.update(cmdl[1:])
                elif cmdl.startswith('-'):
                    ns.difference_update(cmdl[1:])
                elif cmdl.startswith('@'):
                    ns = NodeSet(cmdl[1:])
                elif cmdl == '=':
                    display.gather = not display.gather
                    if display.gather:
                        display.vprint(
                            VERB_STD, "Switching to gathered output format")
                    else:
                        display.vprint(
                            VERB_STD, "Switching to standard output format")
                    task.set_default("stdout_msgtree",
                                     display.gather or display.line_mode)
                    ns_info = False
                    continue
                elif not cmdl.startswith('?'):  # if ?, just print ns_info
                    ns_info = False
            except NodeSetParseError:
                display.vprint_err(VERB_QUIET,
                                   "clush: nodeset parse error (ignoring)")

            if ns_info:
                continue

            if cmdl.startswith('!') and len(cmd.strip()) > 0:
                run_command(task, cmd[1:], None, timeout, display, remote)
            elif cmdl != "quit":
                if not cmd:
                    continue
                if readline_avail:
                    readline.write_history_file(get_history_file())
                run_command(task, cmd, ns, timeout, display, remote)
    return rc
def ev_timeout(self, worker):
    """Timeout occurred on some nodes"""
    self.result.nodes_ko.add(NodeSet.fromlist(worker.iter_keys_timeout()))
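# A minimal sketch showing how an ev_timeout handler like the one above is
# wired into a task run, assuming ClusterShell; 'node[1-2]' is hypothetical.
from ClusterShell.Event import EventHandler
from ClusterShell.NodeSet import NodeSet
from ClusterShell.Task import task_self

class TimeoutTracker(EventHandler):
    def __init__(self):
        self.nodes_ko = NodeSet()

    def ev_timeout(self, worker):
        # collect the keys (node names) that hit the timeout
        self.nodes_ko.add(NodeSet.fromlist(worker.iter_keys_timeout()))

tracker = TimeoutTracker()
task_self().run("true", nodes="node[1-2]", handler=tracker, timeout=5)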
def add(self, node):
    self.ns |= NodeSet.fromlist(node)
def __init__(self, nodes):
    self.nodes = NodeSet.fromlist(nodes)
def stop(self):
    """Stop dfuse.

    Try to stop dfuse gracefully with fusermount first; if that fails,
    fall back to pkill. Abort based on the result of the fusermount,
    since needing pkill means dfuse itself has not worked correctly.
    Finally, remove the mount point, which should always succeed.

    Raises:
        CommandFailure: In case dfuse stop fails

    """
    # Include all hosts when stopping to ensure all mount points in any
    # state are properly removed
    self.running_hosts.add(NodeSet.fromlist(self.hosts))

    self.log.info("Stopping dfuse at %s on %s",
                  self.mount_dir.value, self.running_hosts)

    if self.mount_dir.value and self.running_hosts:
        error_list = []

        # Loop until all fuseblk mounted devices are unmounted
        counter = 0
        while self.running_hosts and counter < 3:
            # Attempt to kill dfuse after the first unmount fails
            if self.running_hosts and counter > 1:
                kill_command = "pkill dfuse --signal KILL"
                pcmd(self.running_hosts, kill_command, timeout=30)

            # Attempt to unmount any fuseblk mounted devices after detection
            if self.running_hosts and counter > 0:
                pcmd(self.running_hosts,
                     self.get_umount_command(counter > 1), expect_rc=None)
                time.sleep(2)

            # Detect which hosts have fuseblk mounted devices and remove
            # any hosts which no longer have the dfuse mount point mounted
            state = self.check_mount_state(self.running_hosts)
            for host in state["unmounted"].union(state["nodirectory"]):
                self.running_hosts.remove(host)

            # Increment the loop counter
            counter += 1

        if self.running_hosts:
            error_list.append(
                "Error stopping dfuse on {}".format(self.running_hosts))

        # Remove mount points
        try:
            self.remove_mount_point()
        except CommandFailure as error:
            error_list.append(error)

        # Report any errors
        if error_list:
            raise CommandFailure("\n".join(error_list))

    elif self.mount_dir.value is None:
        self.log.info("No dfuse mount directory defined - nothing to stop")

    else:
        self.log.info("No hosts running dfuse - nothing to stop")
def build_job_script(self, nodesperjob, job, pool):
    """Create a slurm batch script that will execute a list of jobs.

    Args:
        nodesperjob (int): number of nodes executing each job
        job (str): the job that will be defined in the slurm script with
            /run/"job"/. It is currently defined in the yaml as:
            Example job:
                job1:
                    name: job1    - unique name
                    time: 10      - cmdline time in seconds; used in IOR -T param
                    tasks: 1      - number of processes per node --ntaskspernode
                    jobspec:
                        - ior_daos
                        - ior_mpiio
        pool (obj): TestPool obj

    Returns:
        script_list: list of slurm batch scripts

    """
    self.log.info("<<Build Script for job %s >> at %s", job, time.ctime())
    script_list = []

    # create one batch script per cmdline
    # get job params
    job_params = "/run/" + job
    job_name = self.params.get("name", "/".join([job_params, "*"]))
    job_specs = self.params.get("jobspec", "/".join([job_params, "*"]))
    task_list = self.params.get("tasks", "/".join([job_params, "*"]))
    job_time = self.params.get("time", "/".join([job_params, "*"]))

    # job_time in minutes:seconds format
    job_time = str(job_time) + ":00"
    for job_spec in job_specs:
        if "ior" in job_spec:
            # Create IOR cmdline
            cmd_list = self.create_ior_cmdline(job_params, job_spec, pool)
        elif "dmg" in job_spec:
            # create dmg cmdline
            cmd_list = self.create_dmg_cmdline(job_params, job_spec, pool)
        else:
            raise SoakTestError(
                "<<FAILED: Soak job: {} Job spec {} is invalid>>".format(
                    job, job_spec))

        # a single cmdline per batch job; so that a failure is per cmdline
        # change to multiple cmdlines per batch job later.
        for cmd in cmd_list:
            # additional sbatch params
            for tasks in task_list:
                output = os.path.join(
                    self.rem_pass_dir, "%N_" + self.test_name + "_" +
                    job_name + "_" + job_spec + "_results.out_%j_%t_" +
                    str(tasks) + "_")
                num_tasks = nodesperjob * tasks
                sbatch = {
                    "ntasks-per-node": tasks,
                    "ntasks": num_tasks,
                    "time": job_time,
                    "partition": self.partition_clients,
                    "exclude": NodeSet.fromlist(self.hostlist_servers)
                }
                script = slurm_utils.write_slurm_script(
                    self.rem_pass_dir, job_name, output, nodesperjob,
                    [cmd], sbatch)
                script_list.append(script)
    return script_list
def select_nodes(self, profil, name, nb_nodes, host):
    '''Select nodes to spawn'''
    # 1: retrieve the available node list
    # 2: select nb_nodes among the available nodes
    # 3: return the list of nodes
    err = ""
    nodes = []
    if host is None:
        err = "Error: No host available\n"
        _LOGGER.error(err)
        self.rep_sock.send(msgpack.packb(('', [err])))
        return nodes
    if not vc.VirtualCluster.valid_clustername(name):
        err = "Error: clustername '{}' is not a valid name\n".format(name)
        _LOGGER.error(err)
        self.rep_sock.send(msgpack.packb(('', [err])))
        return nodes
    if profil not in self.profiles:
        err = "Error: Profile '{}' not found in configuration file\n".format(
            profil)
        _LOGGER.error(err)
        self.rep_sock.send(msgpack.packb(('', [err])))
        return nodes
    nodelist = self.list_nodes(byhost=False)
    nodeset = NodeSet.fromlist([node.name for node in nodelist])
    idx_min = 0
    idx_max = nb_nodes - 1
    base_range = RangeSet("%d-%d" % (idx_min, idx_max))
    base_nodeset = NodeSetBase(name + '%s', base_range)
    ndset_inter = nodeset.intersection(base_nodeset)
    while len(ndset_inter) != 0:
        indexes = [clustdock.VirtualNode.split_name(node)[1]
                   for node in ndset_inter]
        for idx in indexes:
            _LOGGER.debug("Removing %d from rangeset %s", idx, base_range)
            base_range.remove(idx)
        base_nodeset.difference_update(ndset_inter)
        _LOGGER.debug("Nodeset becomes '%s' after removing", base_nodeset)
        idx_min = max(indexes + list(base_range)) + 1
        idx_max = idx_min + max([len(indexes), nb_nodes - len(base_range)])
        base_range.add_range(idx_min, idx_max)
        _LOGGER.debug("New rangeset: %s", base_range)
        base_nodeset.update(
            NodeSetBase(name + '%s',
                        RangeSet.fromlist([range(idx_min, idx_max)])))
        _LOGGER.debug("New nodeset: %s", base_nodeset)
        ndset_inter = nodeset.intersection(base_nodeset)
    final_range = base_range
    _LOGGER.debug("final rangeset/nodeset: %s / %s",
                  base_range, base_nodeset)
    cluster = vc.VirtualCluster(name, profil, self.profiles[profil])
    nodes = []
    for idx in final_range:
        node = cluster.add_node(idx, host)
        nodes.append(node)
    return nodes
def delete(self, node):
    self.ns -= NodeSet.fromlist(node)
def ev_close(self, worker, timedout):
    """Worker has finished (command done on all nodes)"""
    if timedout:
        nodeset = NodeSet.fromlist(worker.iter_keys_timeout())
        self.result.nodes_ko.add(nodeset)
    self.result.show()
def display_proxy_errors(cls, fs):
    """Display proxy error messages for the specified filesystem."""
    for msg, nodes in fs.proxy_errors.walk():
        nodes = str(NodeSet.fromlist(nodes))
        msg = str(msg).replace('THIS_SHINE_HOST', nodes)
        print("%s: %s" % (nodes, msg), file=sys.stderr)
def get_log_data(self, hosts, since, until=None, timeout=60):
    """Gather log output for the command running on each host.

    Note (from journalctl man page):
        Date specifications should be of the format "2012-10-30 18:17:16".
        If the time part is omitted, "00:00:00" is assumed. If only the
        seconds component is omitted, ":00" is assumed. If the date
        component is omitted, the current day is assumed. Alternatively
        the strings "yesterday", "today", "tomorrow" are understood,
        which refer to 00:00:00 of the day before the current day, the
        current day, or the day after the current day, respectively.
        "now" refers to the current time. Finally, relative times may be
        specified, prefixed with "-" or "+", referring to times before or
        after the current time, respectively.

    Args:
        hosts (list): list of hosts from which to gather log data.
        since (str): show log entries from this date.
        until (str, optional): show log entries up to this date. Defaults
            to None, in which case it is not utilized.
        timeout (int, optional): timeout for issuing the command. Defaults
            to 60 seconds.

    Returns:
        dict: log output per host

    """
    # Setup the journalctl command to capture all unit activity from the
    # specified start date to now or a specified end date
    #   --output=json?
    command = [
        "sudo",
        "journalctl",
        "--unit={}".format(self._systemctl.service.value),
        "--since=\"{}\"".format(since),
    ]
    if until:
        command.append("--until=\"{}\"".format(until))
    self.log.info(
        "Gathering log data on %s: %s", str(hosts), " ".join(command))

    # Gather the log information per host
    task = run_task(hosts, " ".join(command), timeout)

    # Create a dictionary of hosts for each unique return code
    results = {code: hosts for code, hosts in task.iter_retcodes()}

    # Determine if the command completed successfully across all the hosts
    status = len(results) == 1 and 0 in results

    # Determine if any commands timed out
    timed_out = [str(hosts) for hosts in task.iter_keys_timeout()]
    if timed_out:
        status = False
    if not status:
        self.log.info(" Errors detected running \"%s\":", command)

    # List any hosts that timed out
    if timed_out:
        self.log.info(
            " %s: timeout detected after %s seconds",
            str(NodeSet.fromlist(timed_out)), timeout)

    # Display/return the command output
    log_data = {}
    for code in sorted(results):
        # Get the command output from the hosts with this return code
        output_data = list(task.iter_buffers(results[code]))
        if not output_data:
            output_data = [["<NONE>", results[code]]]
        for output_buffer, output_hosts in output_data:
            node_set = NodeSet.fromlist(output_hosts)
            lines = str(output_buffer).splitlines()

            if status:
                # Add the successful output from each node to the dictionary
                log_data[node_set] = lines
            else:
                # Display all of the results in the case of an error
                if len(lines) > 1:
                    self.log.info(" %s: rc=%s, output:", node_set, code)
                    for line in lines:
                        self.log.info(" %s", line)
                else:
                    self.log.info(
                        " %s: rc=%s, output: %s",
                        node_set, code, output_buffer)

    # Report any errors through an exception
    if not status:
        raise CommandFailure(
            "Error(s) detected gathering {} log data on {}".format(
                self._systemctl.service.value, NodeSet.fromlist(hosts)))

    # Return the successful command output per set of hosts
    return log_data
def get_ucx_info(hosts, supported=None, verbose=True):
    """Get the UCX provider information from the specified hosts.

    Args:
        hosts (NodeSet): hosts from which to gather the information
        supported (list, optional): list of supported providers which, if
            provided, will limit the inclusion to only those providers
            specified. Defaults to None.
        verbose (bool, optional): display command details. Defaults to True.

    Returns:
        dict: a dictionary of interface keys with a dictionary value of a
            comma-separated string of providers key with a NodeSet value
            where the providers were detected.

    """
    task = run_task(hosts, "ucx_info -d", verbose=verbose)
    if verbose:
        display_task(task)

    # Populate a dictionary of interfaces with a list of provider lists
    # and a NodeSet of hosts on which the providers were detected.
    providers = {}
    results = dict(task.iter_retcodes())
    if 0 in results:
        for output, nodelist in task.iter_buffers(results[0]):
            output_lines = [
                line.decode("utf-8").rstrip(os.linesep) for line in output]
            nodeset = NodeSet.fromlist(nodelist)

            # Find all the transport, device, and type pairings. The
            # ucx_info output reports these on separate lines so when
            # processing the re matches ensure each device is preceded by
            # a provider.
            interface_providers = {}
            data = re.findall(
                r"(Transport|Device):\s+([A-Za-z0-9;_+]+)",
                "\n".join(output_lines))
            while data:
                transport = list(data.pop(0))
                if transport[0] == "Transport" and data[0][0] == "Device":
                    transport.pop(0)
                    device = list(data.pop(0))
                    device.pop(0)

                    # A transport and device must be specified
                    if not transport or not device:
                        continue

                    # Add 'ucx+' to the provider and replace 'mlx[0-9]'
                    # with 'x'
                    transport = [
                        "+".join(["ucx", re.sub(r"mlx[0-9]+", "x", item)])
                        for item in transport]

                    # Only include supported providers if a supported
                    # list is provided
                    if supported and transport[0] not in supported:
                        continue

                    if device[0] not in interface_providers:
                        interface_providers[device[0]] = set()
                    interface_providers[device[0]].update(transport)

            for interface, provider_set in interface_providers.items():
                if interface not in providers:
                    providers[interface] = {}
                provider_key = ",".join(list(provider_set))
                if provider_key not in providers[interface]:
                    providers[interface][provider_key] = NodeSet()
                providers[interface][provider_key].update(nodeset)

    return providers
def run_auto():
    """ Run auto mode """
    global EV

    nstates = pbs.node_states()
    #print json.dumps(nstates, sort_keys=True, indent=4, separators=(',', ': '))

    # make list of nodes that don't have jobs
    jobless = []
    for node, nodest in list(nstates.items()):
        if not ('resources_assigned' in nodest
                and 'ncpus' in nodest['resources_assigned']
                and nodest['resources_assigned']['ncpus'] > 0):
            jobless.append(node)

    for node, nodest in list(nstates.items()):
        states = nodest['state'].split(',')
        known_bad = node in STATE['nodes']

        vlog(5, 'eval node={} state={} jobs={} bad={}'.format(
            node, nodest['state'], node in jobless, known_bad))

        # find known bad nodes that are not offline
        if known_bad and not 'offline' in states:
            vlog(2, 'bad node %s was not offline in pbs' % (node))
            scheduler_close_nodes([node], 'known bad node')

        # look for known bad states
        if pbs.is_pbs_down(states) and not known_bad:
            # node is in bad state but not known to be bad
            vlog(2, 'detected node in bad state %s: %s'
                 % (node, nodest['state']))
            add_nodes([node], 'PBS state = {}'.format(nodest['state']))

        # find nodes in pending states that no longer have jobs
        if node in jobless and 'offline' in states and known_bad:
            has_sibling_job = False
            for sib, sibst in list(STATE['nodes'].items()):
                if node in sibst['siblings'] and not sib in jobless:
                    has_sibling_job = True
                    vlog(5, 'node %s has sibling %s with job' % (node, sib))

            vlog(5, 'eval pending node={} job_sibling={} state={}'.format(
                node, has_sibling_job, STATE['nodes'][node]['state']))

            if not has_sibling_job:
                release_pending_node(node)
            else:
                vlog(4, 'bad node %s skipped due to sibling jobs' % (node))

    # find nodes that are powered off and not already bad nodes
    check_nodes = []

    # create list of nodes that are not bad and update scheduler comments
    for node, nodest in list(nstates.items()):
        if not node in STATE['nodes'] or is_pending_node(node):
            check_nodes.append(node)

        if node in STATE['nodes']:
            scmt = None
            if 'comment' in nodest:
                scmt = '%s: %s' % (nodest['state'], nodest['comment'])
            else:
                scmt = nodest['state']

            if not 'scheduler_comment' in STATE['nodes'][node] or \
                    STATE['nodes'][node]['scheduler_comment'] != scmt:
                for ev_id in STATE['nodes'][node]['extraview']:
                    EV.add_resolver_comment(
                        ev_id, 'PBS State Change:\n%s' % scmt)
                    vlog(3, '%s EV#%s comment pbs state change: %s'
                         % (node, ev_id, scmt))
                STATE['nodes'][node]['scheduler_comment'] = scmt

    vlog(4, 'checking ipmi power status of %s nodes' % (len(check_nodes)))
    power_status = ipmi.command(NodeSet.fromlist(check_nodes),
                                'power status')
    if not power_status:
        vlog(2, 'unable to call ipmi power status for %s nodes'
             % (len(check_nodes)))
    else:
        for node in check_nodes:
            why = False  # has value if node is down

            if node in power_status:
                if not 'Chassis Power is on' in power_status[node]:
                    why = 'invalid power status: %s' % (power_status[node])
            else:
                why = 'unable to query power status'

            # release pending nodes if power is off since pbs won't
            # notice for forever
            if node in STATE['nodes']:
                if is_pending_node(node) and why:
                    comment_nodes([node], why)
                    release_pending_node(node)
            else:  # not a bad node yet
                if why:
                    add_nodes([node], why)
                    release_pending_node(node)

    save_state()
def main():
    """clush script entry point"""
    sys.excepthook = clush_excepthook

    #
    # Argument management
    #
    usage = "%prog [options] command"

    parser = OptionParser(usage)

    parser.add_option("-n", "--nostdin", action="store_true", dest="nostdin",
                      help="don't watch for possible input from stdin")

    parser.install_groupsconf_option()
    parser.install_clush_config_options()
    parser.install_nodes_options()
    parser.install_display_options(verbose_options=True)
    parser.install_filecopy_options()
    parser.install_connector_options()

    (options, args) = parser.parse_args()

    set_std_group_resolver_config(options.groupsconf)

    #
    # Load config file and apply overrides
    #
    config = ClushConfig(options, options.conf)

    # Initialize logging
    if config.verbosity >= VERB_DEBUG:
        logging.basicConfig(level=logging.DEBUG)
        logging.debug("clush: STARTING DEBUG")
    else:
        logging.basicConfig(level=logging.CRITICAL)

    # Should we use ANSI colors for nodes?
    if config.color == "auto":
        color = sys.stdout.isatty() and (options.gatherall or
                                         sys.stderr.isatty())
    else:
        color = config.color == "always"

    try:
        # Create and configure display object.
        display = Display(options, config, color)
    except ValueError as exc:
        parser.error("option mismatch (%s)" % exc)

    if options.groupsource:
        # Be sure -a/g -s source work as expected.
        std_group_resolver().default_source_name = options.groupsource

    # Compute the nodeset and warn for possible use of shell pathname
    # expansion (#225)
    wnodelist = []
    xnodelist = []
    if options.nodes:
        wnodelist = [NodeSet(nodes) for nodes in options.nodes]

    if options.exclude:
        xnodelist = [NodeSet(nodes) for nodes in options.exclude]

    for (opt, nodelist) in (('w', wnodelist), ('x', xnodelist)):
        for nodes in nodelist:
            if len(nodes) == 1 and exists(str(nodes)):
                display.vprint_err(VERB_STD, "Warning: using '-%s %s' and "
                                   "local path '%s' exists, was it expanded "
                                   "by the shell?" % (opt, nodes, nodes))

    # --hostfile support (#235)
    for opt_hostfile in options.hostfile:
        try:
            fnodeset = NodeSet()
            with open(opt_hostfile) as hostfile:
                for line in hostfile.read().splitlines():
                    fnodeset.updaten(nodes for nodes in line.split())
            display.vprint_err(VERB_DEBUG,
                               "Using nodeset %s from hostfile %s"
                               % (fnodeset, opt_hostfile))
            wnodelist.append(fnodeset)
        except IOError as exc:
            # re-raise as OSError to be properly handled
            errno, strerror = exc.args
            raise OSError(errno, strerror, exc.filename)

    # Instantiate target nodeset from command line and hostfile
    nodeset_base = NodeSet.fromlist(wnodelist)
    # Instantiate filter nodeset (command line only)
    nodeset_exclude = NodeSet.fromlist(xnodelist)

    # Specified engine prevails over default engine
    DEFAULTS.engine = options.engine

    # Do we have nodes group?
    task = task_self()
    task.set_info("debug", config.verbosity >= VERB_DEBUG)
    if config.verbosity == VERB_DEBUG:
        std_group_resolver().set_verbosity(1)
    if options.nodes_all:
        all_nodeset = NodeSet.fromall()
        display.vprint(VERB_DEBUG, "Adding nodes from option -a: %s"
                       % all_nodeset)
        nodeset_base.add(all_nodeset)

    if options.group:
        grp_nodeset = NodeSet.fromlist(options.group,
                                       resolver=RESOLVER_NOGROUP)
        for grp in grp_nodeset:
            addingrp = NodeSet("@" + grp)
            display.vprint(VERB_DEBUG, "Adding nodes from option -g %s: %s"
                           % (grp, addingrp))
            nodeset_base.update(addingrp)

    if options.exgroup:
        grp_nodeset = NodeSet.fromlist(options.exgroup,
                                       resolver=RESOLVER_NOGROUP)
        for grp in grp_nodeset:
            removingrp = NodeSet("@" + grp)
            display.vprint(VERB_DEBUG, "Excluding nodes from option -X %s: %s"
                           % (grp, removingrp))
            nodeset_exclude.update(removingrp)

    # Do we have an exclude list? (-x ...)
    nodeset_base.difference_update(nodeset_exclude)
    if len(nodeset_base) < 1:
        parser.error('No node to run on.')

    if options.pick and options.pick < len(nodeset_base):
        # convert to string for sample as nsiter() is slower for big
        # nodesets; and we assume options.pick will remain small-ish
        keep = random.sample(list(nodeset_base), options.pick)
        nodeset_base.intersection_update(','.join(keep))
        if config.verbosity >= VERB_VERB:
            msg = "Picked random nodes: %s" % nodeset_base
            print(Display.COLOR_RESULT_FMT % msg)

    # Set open files limit.
    set_fdlimit(config.fd_max, display)

    #
    # Task management
    #
    # check for clush interactive mode
    interactive = not len(args) and \
                  not (options.copy or options.rcopy)
    # check for foreground ttys presence (input)
    stdin_isafgtty = sys.stdin.isatty() and \
                     os.tcgetpgrp(sys.stdin.fileno()) == os.getpgrp()
    # check for special condition (empty command and stdin not a tty)
    if interactive and not stdin_isafgtty:
        # looks like interactive but stdin is not a tty:
        # switch to non-interactive + disable ssh pseudo-tty
        interactive = False
        # SSH: disable pseudo-tty allocation (-T)
        ssh_options = config.ssh_options or ''
        ssh_options += ' -T'
        config._set_main("ssh_options", ssh_options)
    if options.nostdin and interactive:
        parser.error("illegal option `--nostdin' in that case")

    # Force user_interaction if Clush._f_user_interaction for test purposes
    user_interaction = hasattr(sys.modules[__name__], '_f_user_interaction')
    if not options.nostdin:
        # Try user interaction: check for foreground ttys presence (output)
        stdout_isafgtty = sys.stdout.isatty() and \
                          os.tcgetpgrp(sys.stdout.fileno()) == os.getpgrp()
        user_interaction |= stdin_isafgtty and stdout_isafgtty
    display.vprint(VERB_DEBUG, "User interaction: %s" % user_interaction)
    if user_interaction:
        # Standard input is a terminal and we want to perform some user
        # interactions in the main thread (using blocking calls), so
        # we run cluster commands in a new ClusterShell Task (a new
        # thread is created).
        task = Task()
    # else: perform everything in the main thread

    # Handle special signal only when user_interaction is set
    task.set_default("USER_handle_SIGUSR1", user_interaction)

    task.excepthook = sys.excepthook
    task.set_default("USER_stdin_worker", not (sys.stdin.isatty() or
                                               options.nostdin or
                                               user_interaction))
    display.vprint(VERB_DEBUG, "Create STDIN worker: %s"
                   % task.default("USER_stdin_worker"))

    task.set_info("debug", config.verbosity >= VERB_DEBUG)
    task.set_info("fanout", config.fanout)

    if options.worker:
        try:
            if options.remote == 'no':
                task.set_default('local_worker',
                                 _load_workerclass(options.worker))
            else:
                task.set_default('distant_worker',
                                 _load_workerclass(options.worker))
        except (ImportError, AttributeError):
            msg = "ERROR: Could not load worker '%s'" % options.worker
            display.vprint_err(VERB_QUIET, msg)
            clush_exit(1, task)

    if options.topofile or task._default_tree_is_enabled():
        if options.topofile:
            task.load_topology(options.topofile)
        if config.verbosity >= VERB_VERB:
            roots = len(task.topology.root.nodeset)
            gws = task.topology.inner_node_count() - roots
            msg = "enabling tree topology (%d gateways)" % gws
            print("clush: %s" % msg, file=sys.stderr)

    if options.grooming_delay:
        if config.verbosity >= VERB_VERB:
            msg = Display.COLOR_RESULT_FMT % ("Grooming delay: %f"
                                              % options.grooming_delay)
            print(msg, file=sys.stderr)
        task.set_info("grooming_delay", options.grooming_delay)
    elif options.rcopy:
        # By default, --rcopy should inhibit grooming
        task.set_info("grooming_delay", 0)

    if config.ssh_user:
        task.set_info("ssh_user", config.ssh_user)
    if config.ssh_path:
        task.set_info("ssh_path", config.ssh_path)
    if config.ssh_options:
        task.set_info("ssh_options", config.ssh_options)
    if config.scp_path:
        task.set_info("scp_path", config.scp_path)
    if config.scp_options:
        task.set_info("scp_options", config.scp_options)
    if config.rsh_path:
        task.set_info("rsh_path", config.rsh_path)
    if config.rcp_path:
        task.set_info("rcp_path", config.rcp_path)
    if config.rsh_options:
        task.set_info("rsh_options", config.rsh_options)

    # Set detailed timeout values
    task.set_info("connect_timeout", config.connect_timeout)
    task.set_info("command_timeout", config.command_timeout)

    # Enable stdout/stderr separation
    task.set_default("stderr", not options.gatherall)

    # Prevent reading from stdin?
    task.set_default("stdin", not options.nostdin)

    # Disable MsgTree buffering if not gathering outputs
    task.set_default("stdout_msgtree", display.gather or display.line_mode)

    # Always disable stderr MsgTree buffering
    task.set_default("stderr_msgtree", False)

    # Set timeout at worker level when command_timeout is defined.
    if config.command_timeout > 0:
        timeout = config.command_timeout
    else:
        timeout = -1

    # Configure task custom status
    task.set_default("USER_interactive", interactive)
    task.set_default("USER_running", False)

    if (options.copy or options.rcopy) and not args:
        parser.error("--[r]copy option requires at least one argument")
    if options.copy:
        if not options.dest_path:
            # append '/' to clearly indicate a directory for tree mode
            options.dest_path = join(dirname(abspath(args[0])), '')
        op = "copy sources=%s dest=%s" % (args, options.dest_path)
    elif options.rcopy:
        if not options.dest_path:
            options.dest_path = dirname(abspath(args[0]))
        op = "rcopy sources=%s dest=%s" % (args, options.dest_path)
    else:
        op = "command=\"%s\"" % ' '.join(args)

    # print debug values (the fanout value is taken from the config object
    # and not the task itself, as set_info() is an asynchronous call)
    display.vprint(VERB_DEBUG, "clush: nodeset=%s fanout=%d [timeout "
                   "conn=%.1f cmd=%.1f] %s" % (nodeset_base, config.fanout,
                                               config.connect_timeout,
                                               config.command_timeout, op))
    if not task.default("USER_interactive"):
        if display.verbosity >= VERB_DEBUG and task.topology:
            print(Display.COLOR_RESULT_FMT % ('-' * 15))
            print(Display.COLOR_RESULT_FMT % task.topology, end='')
            print(Display.COLOR_RESULT_FMT % ('-' * 15))
        if options.copy:
            run_copy(task, args, options.dest_path, nodeset_base, timeout,
                     options.preserve_flag, display)
        elif options.rcopy:
            run_rcopy(task, args, options.dest_path, nodeset_base, timeout,
                      options.preserve_flag, display)
        else:
            run_command(task, ' '.join(args), nodeset_base, timeout, display,
                        options.remote != 'no')

    if user_interaction:
        ttyloop(task, nodeset_base, timeout, display, options.remote != 'no')
    elif task.default("USER_interactive"):
        display.vprint_err(VERB_QUIET,
                           "ERROR: interactive mode requires a tty")
        clush_exit(1, task)

    rc = 0
    if options.maxrc:
        # Instead of clush return code, return commands retcode
        rc = task.max_retcode()
        if task.num_timeout() > 0:
            rc = 255
    clush_exit(rc, task)
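# --- Hedged sketch of the --pick logic in main() above: sample a few
# node names from a NodeSet, then intersect the sample back so the
# result stays a folded NodeSet. Node names are invented.
import random
from ClusterShell.NodeSet import NodeSet

nodeset_base = NodeSet("node[1-100]")
keep = random.sample(list(nodeset_base), 3)
nodeset_base.intersection_update(','.join(keep))
print(nodeset_base)  # e.g. node[7,42,99]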
def __str__(self):
    return '\n'.join("{0}: {1}".format(
        NodeSet.fromlist("vm" + x for x in keys), m)
        for m, keys in self._res_tree.walk())
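# --- Minimal sketch of the MsgTree walk pattern used by __str__()
# above: identical messages are grouped and their keys folded into a
# NodeSet. The keys and messages are invented.
from ClusterShell.MsgTree import MsgTree
from ClusterShell.NodeSet import NodeSet

tree = MsgTree()
for key in ("1", "2", "3"):
    tree.add(key, b"PASS")
tree.add("4", b"FAIL")
for msg, keys in tree.walk():
    print("%s: %s" % (NodeSet.fromlist("vm" + k for k in keys),
                      msg.message().decode()))
# vm[1-3]: PASS
# vm4: FAIL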
def run_soak(self, test_param):
    """Run the soak test specified by the test params.

    Args:
        test_param (str): test_params from yaml file

    """
    self.soak_results = {}
    self.pool = []
    self.container = []
    self.harasser_results = {}
    self.harasser_args = {}
    run_harasser = False
    self.all_failed_jobs = []
    self.all_failed_harassers = []
    self.soak_errors = []
    self.check_errors = []
    self.used = []
    self.mpi_module = self.params.get("mpi_module", "/run/*",
                                      default="mpi/mpich-x86_64")
    enable_sudo = self.params.get("enable_sudo", "/run/*", default=True)
    test_to = self.params.get("test_timeout", test_param + "*")
    self.test_name = self.params.get("name", test_param + "*")
    single_test_pool = self.params.get("single_test_pool",
                                       test_param + "*", True)
    harassers = self.params.get("harasserlist", test_param + "*")
    job_list = self.params.get("joblist", test_param + "*")
    resv_bytes = self.params.get("resv_bytes", test_param + "*", 500000000)
    ignore_soak_errors = self.params.get("ignore_soak_errors",
                                         test_param + "*", False)
    self.sudo_cmd = "sudo" if enable_sudo else ""
    if harassers:
        run_harasser = True
        self.log.info("<< Initial harasser list = %s>>", harassers)
        harasserlist = harassers[:]
    # Create the reserved pool with data
    # self.pool is a list of all the pools used in soak
    # self.pool[0] will always be the reserved pool
    add_pools(self, ["pool_reserved"])
    # Create the reserved container
    self.resv_cont = self.get_container(self.pool[0],
                                        "/run/container_reserved/*", True)
    # populate reserved container with a 500MB file unless test is smoke
    self.initial_resv_file = os.path.join(self.test_dir, "initial",
                                          "resv_file")
    try:
        reserved_file_copy(self, self.initial_resv_file, self.pool[0],
                           self.resv_cont, num_bytes=resv_bytes, cmd="write")
    except CommandFailure as error:
        self.fail(error)
    # Create pool for jobs
    if single_test_pool:
        add_pools(self, ["pool_jobs"])
        self.log.info("Current pools: %s",
                      " ".join([pool.uuid for pool in self.pool]))
    # cleanup soak log directories before test on all nodes
    result = slurm_utils.srun(NodeSet.fromlist(self.hostlist_clients),
                              "rm -rf {}".format(self.soak_dir),
                              self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError("<<FAILED: Soak directories not removed "
                            "from clients>>: {}".format(
                                self.hostlist_clients))
    # cleanup test_node
    for log_dir in [self.soak_dir, self.sharedsoak_dir]:
        cmd = "rm -rf {}".format(log_dir)
        try:
            result = run_command(cmd, timeout=30)
        except DaosTestError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory {} was not removed>>".format(
                    log_dir)) from error
    # Baseline metrics data
    run_metrics_check(self, prefix="initial")
    # Initialize time
    self.start_time = time.time()
    self.test_timeout = int(3600 * test_to)
    self.end_time = self.start_time + self.test_timeout
    self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
    while time.time() < self.end_time:
        # Start new pass
        start_loop_time = time.time()
        self.log.info("<<SOAK LOOP %s: time until done %s>>", self.loop,
                      DDHHMMSS_format(self.end_time - time.time()))
        if not single_test_pool:
            # Create pool for jobs
            add_pools(self, ["pool_jobs"])
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
        # Initialize harassers
        if run_harasser:
            if not harasserlist:
                harasserlist = harassers[:]
            harasser = harasserlist.pop(0)
            self.harasser_args = {}
            self.harasser_results = {}
            self.harassers, self.offline_harassers = get_harassers(harasser)
        try:
            self.execute_jobs(job_list, self.pool[1])
        except SoakTestError as error:
            self.fail(error)
        # Check space after jobs done
        for pool in self.pool:
            self.dmg_command.pool_query(pool.uuid)
        # Cleanup any dfuse mounts before destroying containers
        cleanup_dfuse(self)
        self.soak_errors.extend(self.destroy_containers(self.container))
        self.container = []
        # Remove the test pools from self.pool; preserving reserved pool
        if not single_test_pool:
            self.soak_errors.extend(self.destroy_pools(self.pool[1:]))
            self.pool = [self.pool[0]]
            self.log.info("Current pools: %s",
                          " ".join([pool.uuid for pool in self.pool]))
        # Gather metrics data after jobs complete
        run_metrics_check(self)
        # Fail if the pool/containers did not clean up correctly
        if not ignore_soak_errors:
            self.assertEqual(len(self.soak_errors), 0,
                             "\n".join(self.soak_errors))
        # Break out of loop if smoke
        if "smoke" in self.test_name:
            break
        loop_time = time.time() - start_loop_time
        self.log.info("<<LOOP %s completed in %s at %s>>", self.loop,
                      DDHHMMSS_format(loop_time), time.ctime())
        # Initialize harasser loop time from first pass loop time
        if self.loop == 1 and run_harasser:
            self.harasser_loop_time = loop_time
        self.loop += 1
    self.log.info("<<<<SOAK TOTAL TEST TIME = %s>>>>",
                  DDHHMMSS_format(time.time() - self.start_time))
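# --- DDHHMMSS_format() is a DAOS soak helper, not shown here. A
# hedged stand-in with the same intent (seconds -> days/HH:MM:SS)
# might look like:
import time

def dd_hhmmss(seconds):
    """Format a duration in seconds as 'D days, HH:MM:SS'."""
    days, rem = divmod(int(seconds), 86400)
    return "%d days, %s" % (days, time.strftime("%H:%M:%S", time.gmtime(rem)))

print(dd_hhmmss(90061))  # 1 days, 01:01:01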
def run_soak(self, test_param):
    """Run the soak test specified by the test params.

    Args:
        test_param (str): test_params from yaml file

    """
    self.soak_results = {}
    self.pool = []
    self.harasser_joblist = []
    self.harasser_results = {}
    test_to = self.params.get("test_timeout", test_param)
    self.job_timeout = self.params.get("job_timeout", test_param)
    self.harasser_timeout = self.params.get("harasser_timeout", test_param)
    self.test_name = self.params.get("name", test_param)
    self.nodesperjob = self.params.get("nodesperjob", test_param)
    self.test_iteration = self.params.get("iteration", test_param)
    self.task_list = self.params.get("taskspernode", test_param + "*")
    self.h_list = self.params.get("harasserlist", test_param + "*")
    job_list = self.params.get("joblist", test_param + "*")
    pool_list = self.params.get("poollist", test_param + "*")
    rank = self.params.get("rank", "/run/container_reserved/*")
    if self.is_harasser("rebuild"):
        obj_class = "_".join(["OC", str(
            self.params.get("daos_oclass", "/run/rebuild/*")[0])])
    else:
        obj_class = self.params.get(
            "object_class", "/run/container_reserved/*")
    # Create the reserved pool with data
    # self.pool is a list of all the pools used in soak
    # self.pool[0] will always be the reserved pool
    self.add_pools(["pool_reserved"])
    self.pool[0].connect()
    # Create the container and populate with a known data
    # TO-DO: use IOR to write and later read verify the data
    self.container = TestContainer(self.pool[0])
    self.container.namespace = "/run/container_reserved/*"
    self.container.get_params(self)
    self.container.create()
    self.container.write_objects(rank, obj_class)
    self.all_failed_jobs = []
    # cleanup soak log directories before test on all nodes
    result = slurm_utils.srun(
        NodeSet.fromlist(self.hostlist_clients),
        "rm -rf {}".format(self.log_dir), self.srun_params)
    if result.exit_status > 0:
        raise SoakTestError(
            "<<FAILED: Soak directories not removed "
            "from clients>>: {}".format(self.hostlist_clients))
    # cleanup test_node
    for log_dir in [self.log_dir, self.sharedlog_dir]:
        cmd = "rm -rf {}".format(log_dir)
        try:
            result = run_command(cmd, timeout=30)
        except DaosTestError as error:
            raise SoakTestError(
                "<<FAILED: Soak directory {} was not removed {}>>".format(
                    log_dir, error))
    # Initialize time
    start_time = time.time()
    self.test_timeout = int(3600 * test_to)
    self.end_time = start_time + self.test_timeout
    self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
    while time.time() < self.end_time:
        # Start new pass
        start_loop_time = time.time()
        self.log.info(
            "<<Soak1 PASS %s: time until done %s>>", self.loop,
            DDHHMMSS_format(self.end_time - time.time()))
        # Create all specified pools
        self.add_pools(pool_list)
        self.log.info(
            "Current pools: %s",
            " ".join([pool.uuid for pool in self.pool]))
        try:
            self.execute_jobs(job_list, self.pool[1:])
        except SoakTestError as error:
            self.fail(error)
        errors = self.destroy_pools(self.pool[1:])
        # remove the test pools from self.pool; preserving reserved pool
        self.pool = [self.pool[0]]
        self.log.info(
            "Current pools: %s",
            " ".join([pool.uuid for pool in self.pool]))
        self.assertEqual(len(errors), 0, "\n".join(errors))
        # Break out of loop if smoke
        if "smoke" in self.test_name:
            break
        loop_time = time.time() - start_loop_time
        self.log.info(
            "<<PASS %s completed in %s >>", self.loop,
            DDHHMMSS_format(loop_time))
        self.loop += 1
    # TO-DO: use IOR
    self.assertTrue(
        self.container.read_objects(),
        "Data verification error on reserved pool "
        "after SOAK completed")
    # gather the daos logs from the client nodes
    self.log.info(
        "<<<<SOAK TOTAL TEST TIME = %s>>>>",
        DDHHMMSS_format(time.time() - start_time))
try:
    # ClusterShell ( pip install clustershell )
    from ClusterShell.NodeSet import NodeSet
    from ClusterShell.Task import task_self
except ImportError:
    raise ImportError('Please install clustershell >= 1.2.0')

# RING0
RING0DEV = ["10.10.1.39", "10.10.1.40", "10.10.1.43", "10.10.1.50"]
RING0LIVE = []
# RING1
RING1DEV = ["10.10.1.39", "10.10.1.40", "10.10.1.43", "10.10.1.50"]
RING1LIVE = []

# initialise rings from config file
RING_1_dev__allnodes = NodeSet.fromlist(RING1DEV)
RING_1_dev__bootstrapnode = NodeSet.fromlist(RING1DEV)

# CASSANDRA
CASSANDRA_VERSION = "0.8.5"
CASSANDRA_CORE = "/opt/cassandra-dev"
CASSANDRA_HOME = CASSANDRA_CORE + "/apache-cassandra-" + CASSANDRA_VERSION
CASSANDRA_BIN = CASSANDRA_HOME + "/bin/cassandra"
CASSANDRA_CONF = CASSANDRA_CORE + "/cluster_config"
CASSANDRA_INCLUDE = CASSANDRA_CONF + "/cassandra.in.sh"
# JMX port
PORT = 7198
CASSANDRA_STRESS_TEST = (CASSANDRA_HOME +
                         "/SOFTWARE/apache-cassandra-0.8.6-src"
                         "/tools/stress/bin/stress")

# CASSANDRA.YAML
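# --- Hedged sketch (not part of the original config): once the ring
# is a NodeSet, a command can be fanned out to every member with the
# task_self() quick API imported above. The command is only an example.
task = task_self()
task.run("uname -r", nodes=str(RING_1_dev__allnodes))
for buf, nodes in task.iter_buffers():
    print("%s: %s" % (NodeSet.fromlist(nodes), buf.message().decode()))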
def pcmd(hosts, command, verbose=True, timeout=None, expect_rc=0):
    """Run a command on each host in parallel and get the return codes.

    Args:
        hosts (list): list of hosts
        command (str): the command to run in parallel
        verbose (bool, optional): display command output. Defaults to True.
        timeout (int, optional): command timeout in seconds. Defaults to None.
        expect_rc (int, optional): expected return code. Defaults to 0.

    Returns:
        dict: a dictionary of return code keys and accompanying NodeSet
            values indicating which hosts yielded the return code.

    """
    # Run the command on each host in parallel
    task = run_task(hosts, command, timeout)

    # Report any errors / display output if requested
    retcode_dict = {}
    for retcode, rc_nodes in task.iter_retcodes():
        # Create a NodeSet for this list of nodes
        nodeset = NodeSet.fromlist(rc_nodes)

        # Include this NodeSet for this return code
        if retcode not in retcode_dict:
            retcode_dict[retcode] = NodeSet()
        retcode_dict[retcode].add(nodeset)

        # Display any errors or requested output
        if retcode != expect_rc or verbose:
            msg = "failure running" if retcode != expect_rc else "output from"
            if len(list(task.iter_buffers(rc_nodes))) == 0:
                print("{}: {} '{}': rc={}".format(nodeset, msg, command,
                                                  retcode))
            else:
                for output, nodes in task.iter_buffers(rc_nodes):
                    nodeset = NodeSet.fromlist(nodes)
                    lines = str(output).splitlines()
                    output = "rc={}{}".format(
                        retcode,
                        ", {}".format(output) if len(lines) < 2
                        else "\n  {}".format("\n  ".join(lines)))
                    print("{}: {} '{}': {}".format(nodeset, msg, command,
                                                   output))

    # Report any timeouts
    if timeout and task.num_timeout() > 0:
        # iter_keys_timeout() returns a generator: fold it into a NodeSet
        # once so it can be both displayed and stored below
        nodes = NodeSet.fromlist(task.iter_keys_timeout())
        print("{}: timeout detected running '{}' on {}/{} hosts".format(
            nodes, command, task.num_timeout(), len(hosts)))
        retcode = 255
        if retcode not in retcode_dict:
            retcode_dict[retcode] = NodeSet()
        retcode_dict[retcode].add(nodes)

    return retcode_dict
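# --- Hypothetical call of pcmd() above; hosts and command are invented.
# A non-expected return code or (with verbose=True) any output is
# printed, and the folded NodeSet per return code comes back in the dict.
rc_map = pcmd(["node1", "node2", "node3"], "uptime", verbose=False,
              timeout=30)
for rc in sorted(rc_map):
    print("rc=%d on %s" % (rc, rc_map[rc]))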
def labels(self):
    """Return a NodeSet containing all component labels."""
    return NodeSet.fromlist(comp.label for comp in self)
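# --- Hedged sketch of what labels() above produces: component labels
# that share a numeric suffix fold like hostnames, zero-padding
# included. The labels are invented.
from ClusterShell.NodeSet import NodeSet

labels = ["demo-OST0000", "demo-OST0001", "demo-OST0002"]
print(NodeSet.fromlist(labels))  # demo-OST[0000-0002]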
def get_ofi_info(hosts, supported=None, verbose=True):
    """Get the OFI provider information from the specified hosts.

    Args:
        hosts (NodeSet): hosts from which to gather the information
        supported (list, optional): list of supported providers which, if
            provided, will limit the inclusion to only those providers
            specified. Defaults to None.
        verbose (bool, optional): display command details. Defaults to True.

    Returns:
        dict: a dictionary of interface keys with a dictionary value of a
            comma-separated string of providers key with a NodeSet value
            where the providers were detected.

    """
    task = run_task(hosts, "fi_info", verbose=verbose)
    if verbose:
        display_task(task)

    # Populate a dictionary of interfaces with a list of provider lists and
    # NodeSet of hosts on which the providers were detected.
    providers = {}
    results = dict(task.iter_retcodes())
    if 0 in results:
        for output, nodelist in task.iter_buffers(results[0]):
            output_lines = [line.decode("utf-8").rstrip(os.linesep)
                            for line in output]
            nodeset = NodeSet.fromlist(nodelist)

            # Find all the provider and domain pairings. The fi_info output
            # reports these on separate lines; when processing the re matches
            # ensure each domain is preceded by a provider.
            interface_providers = {}
            data = re.findall(r"(provider|domain):\s+([A-Za-z0-9;_+]+)",
                              "\n".join(output_lines))
            while data:
                provider = list(data.pop(0))
                if provider[0] == "provider" and data \
                        and data[0][0] == "domain":
                    provider.pop(0)
                    domain = list(data.pop(0))
                    domain.pop(0)
                else:
                    # Skip unpaired entries
                    continue

                # A provider and domain must be specified
                if not provider or not domain:
                    continue

                # Add 'ofi+' to the provider
                provider = ["+".join(["ofi", item]) for item in provider]

                # Only include supported providers if a supported list is
                # provided
                if supported and provider[0] not in supported:
                    continue

                if domain[0] not in interface_providers:
                    interface_providers[domain[0]] = set()
                interface_providers[domain[0]].update(provider)

            for interface, provider_set in interface_providers.items():
                if interface not in providers:
                    providers[interface] = {}
                provider_key = ",".join(list(provider_set))
                if provider_key not in providers[interface]:
                    providers[interface][provider_key] = NodeSet()
                providers[interface][provider_key].update(nodeset)

    return providers
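# --- Standalone demo of the provider/domain pairing regex used in
# get_ofi_info() above; the sample fi_info output is invented.
import re

sample = "provider: verbs;ofi_rxm\ndomain: mlx5_0\nprovider: tcp\ndomain: eth0"
pairs = re.findall(r"(provider|domain):\s+([A-Za-z0-9;_+]+)", sample)
print(pairs)
# [('provider', 'verbs;ofi_rxm'), ('domain', 'mlx5_0'),
#  ('provider', 'tcp'), ('domain', 'eth0')]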
def servers(self):
    """Return a NodeSet containing all component servers."""
    return NodeSet.fromlist(comp.server.hostname for comp in self)
def ev_close(self, worker):
    """End of proxy command."""
    Action.ev_close(self, worker)

    # First of all, check that the shine command ran without bugs, node
    # crashes, etc. To do so, verify all node retcodes and change the
    # component state on the bad nodes.

    # Action timed out
    if worker.did_timeout():
        self.set_status(ACT_ERROR)
        return

    status = ACT_OK

    # Remove the 'proxy' running action for each component.
    if self._comps:
        for comp in self._comps:
            # XXX: This should be changed using a real event for proxy.
            comp._del_action('proxy')
            if comp.state is None:
                comp.state = RUNTIME_ERROR
            # At this step, there should be no more INPROGRESS component.
            # If yes, this is a bug, change state to RUNTIME_ERROR.
            # INPROGRESS management could be changed using the running
            # action list.
            # Starting with v1.3, there is no more code setting INPROGRESS.
            # This is for compatibility with older clients.
            elif comp.state == INPROGRESS:
                actions = ""
                if len(comp._list_action()):
                    actions = "actions: " + ", ".join(comp._list_action())
                print >> sys.stderr, "ERROR: bad state for %s: %d %s" % \
                    (comp.label, comp.state, actions)
                comp.state = RUNTIME_ERROR

    # Gather nodes by return code
    for rc, nodes in worker.iter_retcodes():
        # Remote command returns only RUNTIME_ERROR (See RemoteCommand)
        # some common remote errors:
        #  rc 127 = command not found
        #  rc 126 = found but not executable
        #  rc 1 = python failure...
        if rc != 0:
            # If there is at least one error, the action is on error.
            status = ACT_ERROR

            # Gather these nodes by buffer
            key = nodes.__contains__
            for buffers, nodes in self._outputs.walk(match=key):
                # Handle proxy command error
                nodes = NodeSet.fromlist(nodes)
                msg = "Remote action %s failed: %s\n" % (self.action, buffers)
                self.fs._handle_shine_proxy_error(nodes, msg)

    # Raise errors for each unpickling error, which could happen mostly
    # when Shine exits with 0.
    for buffers, nodes in self._errpickle.walk():
        nodes = NodeSet.fromlist(nodes)
        self.fs._handle_shine_proxy_error(nodes, str(buffers))

    # Raise an error for nodes without output
    if len(self._silentnodes) > 0:
        msg = "Remote action %s failed: No response" % self.action
        self.fs._handle_shine_proxy_error(self._silentnodes, msg)

    self.set_status(status)
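# --- Minimal sketch of the MsgTree.walk(match=...) pattern used in
# ev_close() above to pick out the buffers of failing nodes; node
# names and messages are invented.
from ClusterShell.MsgTree import MsgTree
from ClusterShell.NodeSet import NodeSet

outputs = MsgTree()
outputs.add("node1", b"sh: foo: command not found")
outputs.add("node2", b"sh: foo: command not found")
outputs.add("node3", b"ok")

failed = NodeSet("node[1-2]")
for buffers, nodes in outputs.walk(match=failed.__contains__):
    print("%s: %s" % (NodeSet.fromlist(nodes), buffers.message().decode()))
# node[1-2]: sh: foo: command not found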
def get_host_data(hosts, command, text, error, timeout=None):
    """Get the data requested for each host using the specified command.

    Args:
        hosts (list): list of hosts
        command (str): command used to obtain the data on each server
        text (str): data identification string
        error (str): data error string
        timeout (int, optional): command timeout in seconds. Defaults to None.

    Returns:
        dict: a dictionary of data values for each NodeSet key

    """
    # Find the data for each specified server
    print("  Obtaining {} data on {}".format(text, hosts))
    task = run_task(hosts, command, timeout)
    host_data = {}
    DATA_ERROR = "[ERROR]"

    # Create a list of NodeSets with the same return code
    data = {code: code_hosts for code, code_hosts in task.iter_retcodes()}

    # Multiple return codes or a single non-zero return code
    # indicate at least one error obtaining the data
    if len(data) > 1 or 0 not in data:
        # Report the errors
        messages = []
        for code, code_hosts in data.items():
            if code != 0:
                output_data = list(task.iter_buffers(code_hosts))
                if len(output_data) == 0:
                    messages.append("{}: rc={}, command=\"{}\"".format(
                        NodeSet.fromlist(code_hosts), code, command))
                else:
                    for output, o_hosts in output_data:
                        lines = str(output).splitlines()
                        info = "rc={}{}".format(
                            code,
                            ", {}".format(output) if len(lines) < 2
                            else "\n    {}".format("\n    ".join(lines)))
                        messages.append("{}: {}".format(
                            NodeSet.fromlist(o_hosts), info))
        print("    {} on the following hosts:\n      {}".format(
            error, "\n      ".join(messages)))

        # Return an error data set for all of the hosts
        host_data = {NodeSet.fromlist(hosts): DATA_ERROR}

    else:
        # The command completed successfully on all servers.
        for output, host_list in task.iter_buffers(data[0]):
            # Find the maximum size of all the devices reported by this
            # group of hosts as only one needs to meet the minimum
            nodes = NodeSet.fromlist(host_list)
            try:
                # The assumption here is that each line of command output
                # will begin with a number and that for the purposes of
                # checking this requirement the maximum of these numbers
                # is needed
                int_host_values = [int(line.split()[0])
                                   for line in str(output).splitlines()]
                host_data[nodes] = max(int_host_values)
            except (IndexError, ValueError):
                # Log the error
                print("    {}: Unable to obtain the maximum {} size due to "
                      "unexpected output:\n      {}".format(
                          nodes, text,
                          "\n      ".join(str(output).splitlines())))
                # Return an error data set for all of the hosts
                host_data = {NodeSet.fromlist(hosts): DATA_ERROR}
                break

    return host_data
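# --- The size-parsing assumption in get_host_data() above: every
# output line starts with a number and only the per-host maximum is
# kept. Sample output is invented.
output = "536870912 /dev/nvme0n1\n268435456 /dev/nvme1n1"
print(max(int(line.split()[0]) for line in output.splitlines()))
# 536870912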
def nodeset(self): return str(NodeSet.fromlist(self.nodes.keys()))
def display_proxy_errors(cls, fs):
    """Display proxy error messages for the specified filesystem."""
    for msg, nodes in fs.proxy_errors.walk():
        nodes = str(NodeSet.fromlist(nodes))
        msg = str(msg).replace('THIS_SHINE_HOST', nodes)
        print >> sys.stderr, "%s: %s" % (nodes, msg)
def get_log_data(self, hosts, since, until=None, timeout=60):
    """Gather log output for the command running on each host.

    Note (from journalctl man page):
        Date specifications should be of the format "2012-10-30 18:17:16".
        If the time part is omitted, "00:00:00" is assumed. If only the
        seconds component is omitted, ":00" is assumed. If the date
        component is omitted, the current day is assumed. Alternatively
        the strings "yesterday", "today", "tomorrow" are understood, which
        refer to 00:00:00 of the day before the current day, the current
        day, or the day after the current day, respectively. "now" refers
        to the current time. Finally, relative times may be specified,
        prefixed with "-" or "+", referring to times before or after the
        current time, respectively.

    Args:
        hosts (list): list of hosts from which to gather log data.
        since (str): show log entries from this date.
        until (str, optional): show log entries up to this date. Defaults
            to None, in which case it is not utilized.
        timeout (int, optional): timeout for issuing the command. Defaults
            to 60 seconds.

    Returns:
        list: a list of dictionaries including:
            "hosts": <NodeSet() of hosts with this data>
            "data": <journalctl output>

    """
    # Set up the journalctl command to capture all unit activity from the
    # specified start date to now or a specified end date
    # --output=json?
    command = self.get_journalctl_command(since, until)
    self.log.info("Gathering log data on %s: %s", str(hosts), command)

    # Gather the log information per host
    results = run_pcmd(hosts, command, False, timeout, None)

    # Determine if the command completed successfully without a timeout
    status = True
    for result in results:
        if result["interrupted"]:
            self.log.info("  Errors detected running \"%s\":", command)
            self.log.info("    %s: timeout detected after %s seconds",
                          str(result["hosts"]), timeout)
            status = False
        elif result["exit_status"] != 0:
            self.log.info("  Errors detected running \"%s\":", command)
            status = False
        if not status:
            break

    # Display/return the command output
    log_data = []
    for result in results:
        if result["exit_status"] == 0 and not result["interrupted"]:
            # Add the successful output from each node to the dictionary
            log_data.append(
                {"hosts": result["hosts"], "data": result["stdout"]})
        else:
            # Display all of the results in the case of an error
            if len(result["stdout"]) > 1:
                self.log.info("    %s: rc=%s, output:",
                              str(result["hosts"]), result["exit_status"])
                for line in result["stdout"]:
                    self.log.info("      %s", line)
            else:
                self.log.info("    %s: rc=%s, output: %s",
                              str(result["hosts"]), result["exit_status"],
                              result["stdout"][0])

    # Report any errors through an exception
    if not status:
        raise CommandFailure(
            "Error(s) detected gathering {} log data on {}".format(
                self._systemctl.service.value, NodeSet.fromlist(hosts)))

    # Return the successful command output per set of hosts
    return log_data
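# --- get_journalctl_command() is project-specific; the docstring above
# only pins down the journalctl date forms it must accept. A hedged
# stand-in (unit name invented) could be:
def journalctl_command(since, until=None, unit="daos_server.service"):
    """Build a journalctl command string for the given time window."""
    command = ["journalctl", "--unit={}".format(unit),
               "--since='{}'".format(since)]
    if until:
        command.append("--until='{}'".format(until))
    return " ".join(command)

print(journalctl_command("2012-10-30 18:17:16", "now"))
# journalctl --unit=daos_server.service --since='2012-10-30 18:17:16' --until='now'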
def pcmd(hosts, command, verbose=True, timeout=None, expect_rc=0):
    """Run a command on each host in parallel and get the return codes.

    Args:
        hosts (list): list of hosts
        command (str): the command to run in parallel
        verbose (bool, optional): display command output. Defaults to True.
        timeout (int, optional): command timeout in seconds. Defaults to None.
        expect_rc (int, optional): expected return code. Defaults to 0.

    Returns:
        dict: a dictionary of return code keys and accompanying NodeSet
            values indicating which hosts yielded the return code.

    """
    # Run the command on each host in parallel
    task = run_task(hosts, command, timeout)

    # Report any errors
    retcode_dict = {}
    errors = False
    for retcode, rc_nodes in task.iter_retcodes():
        # Create a NodeSet for this list of nodes
        nodeset = NodeSet.fromlist(rc_nodes)

        # Include this NodeSet for this return code
        if retcode not in retcode_dict:
            retcode_dict[retcode] = NodeSet()
        retcode_dict[retcode].add(nodeset)

        # Keep track of any errors
        if expect_rc is not None and expect_rc != retcode:
            errors = True

    # Report command output if requested or errors are detected
    if verbose or errors:
        print("Command:\n  {}".format(command))
        print("Command return codes:")
        for retcode in sorted(retcode_dict):
            print("  {}: rc={}".format(retcode_dict[retcode], retcode))

        print("Command output:")
        for output, bf_nodes in task.iter_buffers():
            # Create a NodeSet for this list of nodes
            nodeset = NodeSet.fromlist(bf_nodes)

            # Display the output per node set
            print("  {}:\n    {}".format(
                nodeset, "\n    ".join(str(output).splitlines())))

    # Report any timeouts
    if timeout and task.num_timeout() > 0:
        # iter_keys_timeout() returns a generator: fold it into a NodeSet
        # once so it can be both displayed and stored below
        nodes = NodeSet.fromlist(task.iter_keys_timeout())
        print("{}: timeout detected running '{}' on {}/{} hosts after {}s"
              .format(nodes, command, task.num_timeout(), len(hosts),
                      timeout))
        retcode = 255
        if retcode not in retcode_dict:
            retcode_dict[retcode] = NodeSet()
        retcode_dict[retcode].add(nodes)

    return retcode_dict
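# --- Hypothetical call of this pcmd() variant; hosts and command are
# invented. Passing expect_rc=None disables error tracking, so output
# is only printed when verbose=True.
rc_map = pcmd(["node1", "node2"], "uname -r", verbose=True, expect_rc=None)
print(sorted(rc_map))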