def cc_list(args):
    """
    Return a list of values representing job copies given by --cc/--bcc
    """
    cclist = [""]
    if args.cc and args.bcc:
        raise ValueError("specify only one of --cc or --bcc")
    if args.cc:
        cclist = IDset(args.cc)
    elif args.bcc:
        cclist = IDset(args.bcc)
    return cclist
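# A minimal usage sketch for cc_list(), assuming an argparse-style `args`
# object; the Namespace construction below is a hypothetical stand-in for
# the real option parser. Iterating the returned IDset yields each job
# copy id as an integer.
def _cc_list_demo():
    from argparse import Namespace

    copies = cc_list(Namespace(cc="0-2,4", bcc=None))
    return list(copies)  # [0, 1, 2, 4]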
def drain_list():
    headings = {
        "timestamp": "TIMESTAMP",
        "ranks": "RANK",
        "reason": "REASON",
        "nodelist": "NODELIST",
    }
    resp = RPC(flux.Flux(), "resource.status").get()
    rset = ResourceSet(resp["R"])
    nodelist = rset.nodelist
    lines = []
    for ranks, entry in resp["drain"].items():
        ranks = IDset(ranks)
        line = StatusLine(
            "drain",
            ranks,
            Hostlist([nodelist[i] for i in ranks]),
            entry["reason"],
            entry["timestamp"],
        )
        lines.append(line)
    fmt = "{timestamp:<20} {ranks:<8} {reason:<30} {nodelist}"
    formatter = flux.util.OutputFormat(headings, fmt, prepend="0.")
    print(formatter.header())
    for line in lines:
        print(formatter.format(line))
def fetch_job_ranks(handle, jobid):
    """Fetch job ranks from KVS for jobid"""
    try:
        return IDset(ResourceSet(flux.kvs.get(handle, f"{jobid.kvs}.R")).ranks)
    except FileNotFoundError:
        LOGGER.error("R not found in kvs for job %s, unable to continue", jobid)
        return None
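# A hedged usage sketch for fetch_job_ranks(), assuming flux.job.JobID
# (whose .kvs property supplies the job's KVS path, as used above) and a
# job whose R has already been written to the KVS.
def _fetch_job_ranks_demo(jobid_str):
    handle = flux.Flux()
    ranks = fetch_job_ranks(handle, flux.job.JobID(jobid_str))
    return None if ranks is None else ranks.count()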
def _encode(self):
    hList = Hostlist(self._rv1NoSched["execution"]["nodelist"])
    vtx = FluxionResourcePoolV1(
        self._uniqId,
        "cluster",
        "cluster",
        "cluster0",
        0,
        self._uniqId,
        -1,
        True,
        "",
        1,
        "/cluster0",
    )
    self._add_and_tick_uniq_id(vtx, None)
    i = 0
    rdict = {}
    for entry in self._rv1NoSched["execution"]["R_lite"]:
        for rank in list(IDset(entry["rank"])):
            if rank in rdict:
                raise Exception(f"R_lite: rank={rank} found again!")
            rdict[rank] = i
            i += 1
    for entry in self._rv1NoSched["execution"]["R_lite"]:
        self._encode_rlite(vtx.get_id(), entry, hList, rdict)
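# For reference, a hand-written sketch (not taken from this code) of the
# rv1 "execution" section that _encode() walks, per RFC 20:
#
#   {
#       "nodelist": ["node[0-1]"],
#       "R_lite": [{"rank": "0-1", "children": {"core": "0-3"}}],
#   }
#
# Each rank appearing in R_lite is assigned a host index in rdict, so a
# rank listed twice is an error.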
@classmethod
def from_status_response(cls, resp, fmt, allocated=None):
    # Return empty ResourceStatus object if resp not set:
    # (mainly used for testing)
    if not resp:
        return cls()
    if allocated is None:
        allocated = IDset()
    if isinstance(resp, str):
        resp = json.loads(resp)
    rstat = cls(resp["R"])

    # Append a line for listing all ranks/hosts
    rstat.append("all", rstat.all)

    # "online", "offline", "exclude" keys contain idsets
    # specifying the set of ranks in that state:
    #
    for state in ["online", "offline", "exclude"]:
        rstat.append(state, IDset(resp[state]))

    # "drain" key contains a dict of idsets with timestamp,reason
    #
    drained = 0
    for drain_ranks, entry in resp["drain"].items():
        for ranks, state in split_draining(IDset(drain_ranks), allocated):
            # Only include reason if it will be displayed in format
            reason = ""
            if ranks and "reason" in fmt:
                reason = entry["reason"]
            rstat.append(state, IDset(ranks), reason)
            drained = drained + 1

    # If no drained nodes, append an empty StatusLine
    if drained == 0:
        for state in ["drain", "draining", "drained"]:
            rstat.append(state)

    # "avail" is computed from above
    rstat.append("avail", rstat.avail)

    return rstat
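# A hedged sketch of the split_draining() contract as used above: given a
# drain idset and the currently allocated ranks, it yields (ranks, state)
# pairs, with allocated drain ranks reported as "draining" and the rest as
# "drained" (drain_list() below also filters out a combined "drain" pair,
# so that state may be yielded as well). The exact yield order and types
# are assumptions here.
def _split_draining_demo():
    for ranks, state in split_draining(IDset("0-3"), IDset("1")):
        print(state, ranks)  # e.g. "draining 1", "drained 0,2-3"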
def status(args):
    valid_states = [
        "all",
        "online",
        "avail",
        "offline",
        "exclude",
        "drain",
        "draining",
        "drained",
    ]
    default_states = "avail,offline,exclude,draining,drained"
    headings = {
        "state": "STATUS",
        "nnodes": "NNODES",
        "ranks": "RANKS",
        "nodelist": "NODELIST",
        "reason": "REASON",
    }

    # Emit list of valid states or formats if requested
    if "help" in [args.states, args.format]:
        status_help(args, valid_states, headings)

    # Get state list from args or defaults:
    states = status_get_state_list(args, valid_states, default_states)

    # Include reason field only with -vv
    if args.verbose >= 2:
        fmt = "{state:>10} {nnodes:>6} {reason:<25} {nodelist}"
    else:
        fmt = "{state:>10} {nnodes:>6} {nodelist}"
    if args.format:
        fmt = args.format

    # Get payload from stdin or from resource.status RPC:
    if args.from_stdin:
        resp = sys.stdin.read()
        allocated = IDset()
    else:
        rpc = ListStatusRPC(flux.Flux())
        resp = rpc.get_status()
        allocated = rpc.get_allocated_ranks()

    rstat = ResourceStatus.from_status_response(resp, fmt, allocated)

    formatter = flux.util.OutputFormat(headings, fmt, prepend="0.")
    if not args.no_header:
        print(formatter.header())
    for line in sorted(rstat, key=lambda x: valid_states.index(x.state)):
        if line.state not in states:
            continue
        # Skip empty lines unless --verbose or --states
        if line.nnodes == 0 and args.states is None and not args.verbose:
            continue
        print(formatter.format(line))
def waitfor_continuation(rpc, count):
    """
    Stop the reactor once the group has the right number of members.
    """
    resp = rpc.get()
    ids = IDset(resp["members"])
    if ids.count() == count:
        rpc.flux_handle.reactor_stop()
    else:
        rpc.reset()
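# A hedged sketch of how waitfor_continuation() is registered, mirroring
# the streaming "groups.get" wiring in barrier() below; the group name
# passed in is illustrative only.
def _waitfor_demo(name, count):
    h = flux.Flux()
    rpc = h.rpc(
        "groups.get",
        {"name": name},
        nodeid=0,
        flags=flux.constants.FLUX_RPC_STREAMING,
    )
    rpc.then(waitfor_continuation, count)
    h.reactor_run()  # returns once the continuation stops the reactor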
def barrier_continuation(rpc, fullset):
    """
    Stop the reactor once the group matches fullset.
    """
    resp = rpc.get()
    ids = IDset(resp["members"])
    if ids.equal(fullset):
        rpc.flux_handle.reactor_stop()
    else:
        rpc.reset()
def remove_ranks(self, ranks):
    """
    Remove the rank or ranks specified from the ResourceSet

    :param ranks: A flux.idset.IDset object, or number or string which
        can be converted into an IDset, containing the ranks to remove
    """
    if not isinstance(ranks, IDset):
        ranks = IDset(str(ranks))
    self.impl.remove_ranks(ranks)
    return self
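# A small sketch of the argument forms remove_ranks() accepts, assuming
# `rset` is an existing ResourceSet (building a valid RFC 20 R object
# inline is out of scope here). Strings and ints are coerced through
# IDset(str(ranks)).
def _remove_ranks_demo(rset):
    rset.remove_ranks("2-3")  # idset string
    rset.remove_ranks(4)  # single integer rank
    return rset.remove_ranks(IDset("0"))  # returns self, so calls chain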
def run_per_rank(name, jobid, args):
    """Run args.exec_per_rank on every rank of jobid

    If command fails on any rank then drain that rank
    """
    returncode = 0

    if args.exec_per_rank is None:
        return 0

    per_rank_cmd = args.exec_per_rank.split(",")

    processes = {}
    fail_ids = IDset()

    handle = flux.Flux()
    hostlist = flux.hostlist.Hostlist(handle.attr_get("hostlist"))

    ranks = fetch_job_ranks(handle, jobid)
    if ranks is None:
        return 1

    if args.verbose:
        LOGGER.info(
            "%s: %s: executing %s on ranks %s", jobid, name, per_rank_cmd, ranks
        )

    for rank in ranks:
        cmd = ["flux", "exec", "-qn", f"-r{rank}"] + per_rank_cmd
        processes[rank] = process_create(cmd, stderr=subprocess.PIPE)

    for rank in ranks:
        rc = processes[rank].wait()
        for line in processes[rank].stderr:
            errline = line.decode("utf-8").rstrip()
            LOGGER.error("%s (rank %d): %s", hostlist[rank], rank, errline)
        if rc != 0:
            fail_ids.set(rank)
            if rc > returncode:
                returncode = rc

    if len(fail_ids) > 0:
        LOGGER.error("%s: rank %s failed %s, draining", jobid, fail_ids, name)
        drain(handle, fail_ids, f"{name} failed for jobid {jobid}")

    if args.verbose:
        ranks.subtract(fail_ids)
        if len(ranks) > 0:
            LOGGER.info("%s: %s: completed successfully on %s", jobid, name, ranks)

    return returncode
def get_allocated_ranks(self):
    if not self.allocated_ranks:
        #
        # If the scheduler is not loaded, do not propagate an error,
        # just return an empty idset for allocated ranks.
        #
        try:
            self.get()
            self.rlist = self.children[1].get()
            self.allocated_ranks = self.rlist.allocated.ranks
        except EnvironmentError:
            self.allocated_ranks = IDset()
    return self.allocated_ranks
def _encode_rank(self, ppid, rank, children, hList, hIndex):
    hPath = f"/cluster0/{hList[hIndex]}"
    iden = self._extract_id_from_hn(hList[hIndex])
    vtx = FluxionResourcePoolV1(
        self._uniqId,
        "node",
        "node",
        hList[hIndex],
        iden,
        self._uniqId,
        rank,
        True,
        "",
        1,
        hPath,
    )
    edg = FluxionResourceRelationshipV1(ppid, vtx.get_id())
    self._add_and_tick_uniq_id(vtx, edg)
    for key, val in children.items():
        for i in IDset(val):
            self._encode_child(vtx.get_id(), hPath, rank, str(key), i)
def barrier(args):
    """
    This is functionally a barrier if run with flux exec on all broker ranks.
    If --leave is specified, leave explicitly, otherwise just disconnect.
    """
    h = flux.Flux()
    size = int(h.attr_get("size"))
    fullset = IDset("0-" + str(size - 1))
    entry = h.rpc(
        "groups.get",
        {"name": args.name},
        nodeid=0,
        flags=flux.constants.FLUX_RPC_STREAMING,
    )
    entry.then(barrier_continuation, fullset)
    h.rpc("groups.join", {"name": args.name})
    h.reactor_run()  # run until idset is full
    if args.leave:
        h.rpc("groups.leave", {"name": args.name}).get()
def _encode_rank(self, ppid, rank, children, hList, rdict):
    if rdict[rank] >= len(hList):
        raise Exception(f"nodelist doesn't include node for rank={rank}")
    hPath = f"/cluster0/{hList[rdict[rank]]}"
    iden = self._extract_id_from_hn(hList[rdict[rank]])
    vtx = FluxionResourcePoolV1(
        self._uniqId,
        "node",
        "node",
        hList[rdict[rank]],
        iden,
        self._uniqId,
        rank,
        True,
        "",
        1,
        hPath,
    )
    edg = FluxionResourceRelationshipV1(ppid, vtx.get_id())
    self._add_and_tick_uniq_id(vtx, edg)
    for key, val in children.items():
        for i in IDset(val):
            self._encode_child(vtx.get_id(), hPath, rank, str(key), i)
def drain_list():
    headings = {
        "timestamp": "TIMESTAMP",
        "ranks": "RANK",
        "reason": "REASON",
        "nodelist": "NODELIST",
        "state": "STATE",
    }
    result = ListStatusRPC(flux.Flux())
    resp = result.get_status()
    allocated = result.get_allocated_ranks()
    rset = ResourceSet(resp["R"])
    nodelist = rset.nodelist
    lines = []
    for drain_ranks, entry in resp["drain"].items():
        for ranks, state in split_draining(IDset(drain_ranks), allocated):
            # Do not report empty or "drain" rank sets.
            # Only draining & drained are reported in this view.
            if not ranks or state == "drain":
                continue
            line = StatusLine(
                state,
                ranks,
                Hostlist([nodelist[i] for i in ranks]),
                entry["reason"],
                entry["timestamp"],
            )
            lines.append(line)
    fmt = "{timestamp:<20} {state:<8.8} {ranks:<8.8} {reason:<30} {nodelist}"
    formatter = flux.util.OutputFormat(headings, fmt, prepend="0.")
    print(formatter.header())
    for line in lines:
        print(formatter.format(line))
def remove_ranks(self, ranks):
    """Remove the given rank or ranks (IDset, int, or idset string) from the set"""
    if not isinstance(ranks, IDset):
        ranks = IDset(str(ranks))
    self.pimpl.remove_ranks(ranks)
    return self
def _encode_rlite(self, ppid, entry, hList, rdict):
    for rank in list(IDset(entry["rank"])):
        self._encode_rank(ppid, rank, entry["children"], hList, rdict)
def _encode_rlite(self, ppid, entry, hList, hIndex):
    # Variant of _encode_rlite() that advances and returns a running host
    # index rather than consulting a rank-to-index map.
    for rank in list(IDset(entry["rank"])):
        hIndex += 1
        self._encode_rank(ppid, rank, entry["children"], hList, hIndex)
    return hIndex
def _idset_update(self, state, idset):
    if state not in self.idsets:
        self.idsets[state] = IDset()
    self.idsets[state].add(idset)
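# A minimal standalone sketch of the accumulation pattern above:
# IDset.add() unions another idset into the set in place.
def _idset_union_demo():
    ids = IDset()
    ids.add(IDset("0-1"))
    ids.add(IDset("3"))
    return str(ids)  # "0-1,3"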
def ranks(self, hosts=None):
    if hosts is None:
        return IDset(handle=self.pimpl.ranks())
    return IDset(handle=self.pimpl.hosts_to_ranks(hosts))
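# A hedged sketch of ranks(): with no argument it returns the rank IDset of
# the whole set; with a hosts argument it maps hosts back to ranks. Passing
# a hostlist string here is an assumption, and the host names are
# illustrative.
def _ranks_demo(rset):
    everything = rset.ranks()  # e.g. IDset("0-3")
    subset = rset.ranks("node[1-2]")  # ranks corresponding to those hosts
    return everything, subset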