Пример #1
0
 def cc_list(args):
     """
     Build the job-copy values requested through --cc or --bcc.

     Returns an IDset built from whichever option was supplied, or a list
     containing a single empty string when neither option is set.

     Raises ValueError when both options are given at once.
     """
     if args.cc and args.bcc:
         raise ValueError("specify only one of --cc or --bcc")

     #  At most one of the two options is set at this point
     spec = args.cc or args.bcc
     if spec:
         return IDset(spec)
     return [""]
Пример #2
0
def drain_list():
    """
    Print a table built from the "drain" section of a resource.status reply.

    One row is emitted per entry under the response's "drain" key, giving
    the timestamp, ranks, reason and the hostnames looked up for those ranks.
    """
    response = RPC(flux.Flux(), "resource.status").get()
    hostnames = ResourceSet(response["R"]).nodelist

    #  Build every row before printing anything
    rows = []
    for idset_str, info in response["drain"].items():
        drained = IDset(idset_str)
        hosts = Hostlist([hostnames[rank] for rank in drained])
        rows.append(
            StatusLine("drain", drained, hosts, info["reason"], info["timestamp"])
        )

    column_titles = {
        "timestamp": "TIMESTAMP",
        "ranks": "RANK",
        "reason": "REASON",
        "nodelist": "NODELIST",
    }
    formatter = flux.util.OutputFormat(
        column_titles,
        "{timestamp:<20} {ranks:<8} {reason:<30} {nodelist}",
        prepend="0.",
    )
    print(formatter.header())
    for row in rows:
        print(formatter.format(row))
Пример #3
0
def fetch_job_ranks(handle, jobid):
    """Look up R for jobid in the KVS and return its ranks as an IDset.

    Returns None, after logging an error, when the lookup raises
    FileNotFoundError.
    """
    try:
        resources = flux.kvs.get(handle, f"{jobid.kvs}.R")
        return IDset(ResourceSet(resources).ranks)
    except FileNotFoundError:
        LOGGER.error("R not found in kvs for job %s, unable to continue", jobid)
    return None
Пример #4
0
 def _encode(self):
     """
     Encode the cluster vertex and every R_lite entry of the RV1 object.

     A "cluster0" vertex is registered first.  Each rank found in R_lite is
     then numbered in order of first appearance, and every R_lite entry is
     handed to _encode_rlite together with that numbering.

     Raises Exception when a rank appears more than once in R_lite.
     """
     hosts = Hostlist(self._rv1NoSched["execution"]["nodelist"])
     cluster = FluxionResourcePoolV1(
         self._uniqId,
         "cluster",
         "cluster",
         "cluster0",
         0,
         self._uniqId,
         -1,
         True,
         "",
         1,
         "/cluster0",
     )
     self._add_and_tick_uniq_id(cluster, None)

     r_lite = self._rv1NoSched["execution"]["R_lite"]

     #  Map each rank to its 0-based position; the dict size is the next index
     rank_index = {}
     for entry in r_lite:
         for rank in IDset(entry["rank"]):
             if rank in rank_index:
                 raise Exception(f"R_lite: rank={rank} found again!")
             rank_index[rank] = len(rank_index)

     for entry in r_lite:
         self._encode_rlite(cluster.get_id(), entry, hosts, rank_index)
Пример #5
0
    def from_status_response(cls, resp, fmt, allocated=None):
        """
        Construct an instance of cls from a resource.status response.

        :param resp: response payload as a dict or a JSON string; a falsy
                     value yields an empty cls() (mainly used for testing)
        :param fmt: output format string; drain reasons are only recorded
                    when it mentions "reason"
        :param allocated: IDset of allocated ranks passed to
                          split_draining(); defaults to an empty IDset
        :return: the populated instance
        """
        if not resp:
            return cls()

        if allocated is None:
            allocated = IDset()
        if isinstance(resp, str):
            resp = json.loads(resp)

        result = cls(resp["R"])

        #  One line listing all ranks/hosts
        result.append("all", result.all)

        #  These keys hold idsets of the ranks in the named state
        for state in ("online", "offline", "exclude"):
            result.append(state, IDset(resp[state]))

        #  "drain" maps idset strings to entries carrying timestamp/reason
        saw_drain_line = False
        for drain_ranks, entry in resp["drain"].items():
            for ranks, state in split_draining(IDset(drain_ranks), allocated):
                #  Only look up the reason if the format will display it
                reason = entry["reason"] if ranks and "reason" in fmt else ""
                result.append(state, IDset(ranks), reason)
                saw_drain_line = True

        #  Nothing drained: add an empty line for each drain state
        if not saw_drain_line:
            for state in ("drain", "draining", "drained"):
                result.append(state)

        #  "avail" is derived from the lines appended above
        result.append("avail", result.avail)
        return result
Пример #6
0
def status(args):
    """
    Print a per-state summary of resource status.

    The payload is read from stdin when args.from_stdin is set, otherwise
    it is fetched through ListStatusRPC.  Lines are printed in the order of
    the valid state list, restricted to the requested states.
    """
    valid_states = [
        "all",
        "online",
        "avail",
        "offline",
        "exclude",
        "drain",
        "draining",
        "drained",
    ]
    default_states = "avail,offline,exclude,draining,drained"
    headings = {
        "state": "STATUS",
        "nnodes": "NNODES",
        "ranks": "RANKS",
        "nodelist": "NODELIST",
        "reason": "REASON",
    }

    #  Emit list of valid states or formats if requested
    if "help" in (args.states, args.format):
        status_help(args, valid_states, headings)

    #  State list comes from args, falling back to the defaults
    states = status_get_state_list(args, valid_states, default_states)

    #  The reason column is part of the default format only with -vv;
    #  an explicit --format always wins
    default_fmt = (
        "{state:>10} {nnodes:>6} {reason:<25} {nodelist}"
        if args.verbose >= 2
        else "{state:>10} {nnodes:>6} {nodelist}"
    )
    fmt = args.format if args.format else default_fmt

    #  Payload comes from stdin or from the status RPC
    if args.from_stdin:
        resp = sys.stdin.read()
        allocated = IDset()
    else:
        rpc = ListStatusRPC(flux.Flux())
        resp = rpc.get_status()
        allocated = rpc.get_allocated_ranks()

    rstat = ResourceStatus.from_status_response(resp, fmt, allocated)

    formatter = flux.util.OutputFormat(headings, fmt, prepend="0.")
    if not args.no_header:
        print(formatter.header())

    def display_order(entry):
        return valid_states.index(entry.state)

    for entry in sorted(rstat, key=display_order):
        if entry.state in states:
            #  Empty lines are hidden unless --verbose or --states was given
            hidden = entry.nnodes == 0 and args.states is None and not args.verbose
            if not hidden:
                print(formatter.format(entry))
0
def waitfor_continuation(rpc, count):
    """
    Continuation that stops the reactor when the group holds count members.

    Until then the RPC is reset so that the next response can be handled.
    """
    members = IDset(rpc.get()["members"])
    if members.count() != count:
        rpc.reset()
        return
    rpc.flux_handle.reactor_stop()
Пример #8
0
def barrier_continuation(rpc, fullset):
    """
    Continuation that stops the reactor when the group equals fullset.

    Until then the RPC is reset so that the next response can be handled.
    """
    members = IDset(rpc.get()["members"])
    if not members.equal(fullset):
        rpc.reset()
        return
    rpc.flux_handle.reactor_stop()
Пример #9
0
    def remove_ranks(self, ranks):
        """
        Drop the given rank or ranks from this ResourceSet.

        :param ranks: a flux.idset.IDset, or any number or string that can
                      be converted to one, naming the ranks to remove
        :return: self, so that calls can be chained
        """
        idset = ranks if isinstance(ranks, IDset) else IDset(str(ranks))
        self.impl.remove_ranks(idset)
        return self
Пример #10
0
def run_per_rank(name, jobid, args):
    """Run args.exec_per_rank on every rank of jobid.

    One ``flux exec`` process is started per rank; all are started before
    any is waited on.  Each line a process writes to stderr is logged as an
    error, and every rank whose command exits non-zero is drained.

    :param name: label used in log messages and in the drain reason
    :param jobid: job whose ranks are looked up with fetch_job_ranks()
    :param args: parsed options; reads ``exec_per_rank`` (comma-separated
                 command, or None) and ``verbose``
    :return: 0 when exec_per_rank is None or every rank succeeded, 1 when
             the job ranks could not be fetched, otherwise the largest
             exit code seen
    """
    if args.exec_per_rank is None:
        return 0

    per_rank_cmd = args.exec_per_rank.split(",")

    handle = flux.Flux()
    hostlist = flux.hostlist.Hostlist(handle.attr_get("hostlist"))

    ranks = fetch_job_ranks(handle, jobid)
    if ranks is None:
        return 1

    if args.verbose:
        LOGGER.info(
            "%s: %s: executing %s on ranks %s", jobid, name, per_rank_cmd, ranks
        )

    processes = {}
    for rank in ranks:
        cmd = ["flux", "exec", "-qn", f"-r{rank}"] + per_rank_cmd
        processes[rank] = process_create(cmd, stderr=subprocess.PIPE)

    returncode = 0
    fail_ids = IDset()
    for rank in ranks:
        proc = processes[rank]
        #  Drain stderr while waiting.  A bare wait() with stderr=PIPE can
        #  deadlock once the child fills the pipe buffer.
        #  NOTE(review): assumes process_create() returns a subprocess.Popen
        #  (the previous code already used its .wait() and .stderr) - confirm.
        _, err_output = proc.communicate()
        for line in err_output.splitlines():
            errline = line.decode("utf-8").rstrip()
            LOGGER.error("%s (rank %d): %s", hostlist[rank], rank, errline)
        rc = proc.returncode
        if rc != 0:
            fail_ids.set(rank)
            returncode = max(returncode, rc)

    if len(fail_ids) > 0:
        LOGGER.error("%s: rank %s failed %s, draining", jobid, fail_ids, name)
        drain(handle, fail_ids, f"{name} failed for jobid {jobid}")

    if args.verbose:
        #  Report only the ranks that did not fail
        ranks.subtract(fail_ids)
        if len(ranks) > 0:
            LOGGER.info("%s: %s: completed successfully on %s", jobid, name, ranks)

    return returncode
Пример #11
0
 def get_allocated_ranks(self):
     """
     Return the allocated ranks, fetching them on first use.

     A truthy cached value is returned as is.  Otherwise the value is read
     from the second child's response; an EnvironmentError during that
     fetch yields an empty IDset instead of propagating.
     """
     if self.allocated_ranks:
         return self.allocated_ranks

     #  If the scheduler is not loaded, do not propagate an error,
     #   just fall back to an empty idset for allocated ranks.
     try:
         self.get()
         self.rlist = self.children[1].get()
         self.allocated_ranks = self.rlist.allocated.ranks
     except EnvironmentError:
         self.allocated_ranks = IDset()
     return self.allocated_ranks
Пример #12
0
 def _encode_rank(self, ppid, rank, children, hList, hIndex):
     """
     Encode the node vertex for one rank together with its children.

     The hostname is taken from hList[hIndex].  A "node" vertex is
     registered with an edge from ppid, then _encode_child is called for
     every id in each child idset.
     """
     hostname = hList[hIndex]
     hPath = f"/cluster0/{hostname}"
     iden = self._extract_id_from_hn(hostname)
     node = FluxionResourcePoolV1(
         self._uniqId,
         "node",
         "node",
         hostname,
         iden,
         self._uniqId,
         rank,
         True,
         "",
         1,
         hPath,
     )
     self._add_and_tick_uniq_id(
         node, FluxionResourceRelationshipV1(ppid, node.get_id())
     )
     for key, val in children.items():
         for child_id in IDset(val):
             self._encode_child(node.get_id(), hPath, rank, str(key), child_id)
Пример #13
0
def barrier(args):
    """
    Join the group args.name and block until it contains every broker rank.

    This is functionally a barrier if run with flux exec on all broker
    ranks.  With --leave the group is left explicitly afterwards; otherwise
    the handle is simply left to disconnect.
    """
    handle = flux.Flux()
    nranks = int(handle.attr_get("size"))
    everyone = IDset(f"0-{nranks - 1}")

    #  Stream membership updates from rank 0 into the continuation
    watcher = handle.rpc(
        "groups.get",
        {"name": args.name},
        nodeid=0,
        flags=flux.constants.FLUX_RPC_STREAMING,
    )
    watcher.then(barrier_continuation, everyone)

    handle.rpc("groups.join", {"name": args.name})
    handle.reactor_run()  # returns once the continuation stops the reactor

    if args.leave:
        handle.rpc("groups.leave", {"name": args.name}).get()
Пример #14
0
 def _encode_rank(self, ppid, rank, children, hList, rdict):
     if rdict[rank] >= len(hList):
         raise Exception(f"nodelist doesn't include node for rank={rank}")
     hPath = f"/cluster0/{hList[rdict[rank]]}"
     iden = self._extract_id_from_hn(hList[rdict[rank]])
     vtx = FluxionResourcePoolV1(
         self._uniqId,
         "node",
         "node",
         hList[rdict[rank]],
         iden,
         self._uniqId,
         rank,
         True,
         "",
         1,
         hPath,
     )
     edg = FluxionResourceRelationshipV1(ppid, vtx.get_id())
     self._add_and_tick_uniq_id(vtx, edg)
     for key, val in children.items():
         for i in IDset(val):
             self._encode_child(vtx.get_id(), hPath, rank, str(key), i)
Пример #15
0
def drain_list():
    """
    Print a table of draining and drained ranks.

    Each entry under the status response's "drain" key is split with
    split_draining() against the allocated ranks.  Empty rank sets and
    sets in the "drain" state are not reported.
    """
    rpc = ListStatusRPC(flux.Flux())
    response = rpc.get_status()
    allocated = rpc.get_allocated_ranks()

    hostnames = ResourceSet(response["R"]).nodelist

    #  Build every row before printing anything
    rows = []
    for drain_ranks, info in response["drain"].items():
        for ranks, state in split_draining(IDset(drain_ranks), allocated):
            #  Only non-empty draining & drained sets appear in this view
            if ranks and state != "drain":
                hosts = Hostlist([hostnames[rank] for rank in ranks])
                rows.append(
                    StatusLine(
                        state, ranks, hosts, info["reason"], info["timestamp"]
                    )
                )

    column_titles = {
        "timestamp": "TIMESTAMP",
        "ranks": "RANK",
        "reason": "REASON",
        "nodelist": "NODELIST",
        "state": "STATE",
    }
    formatter = flux.util.OutputFormat(
        column_titles,
        "{timestamp:<20} {state:<8.8} {ranks:<8.8} {reason:<30} {nodelist}",
        prepend="0.",
    )
    print(formatter.header())
    for row in rows:
        print(formatter.format(row))
Пример #16
0
 def remove_ranks(self, ranks):
     """
     Remove the given rank or ranks and return self.

     Anything that is not already an IDset is converted through its
     string form before being passed to the implementation object.
     """
     idset = ranks if isinstance(ranks, IDset) else IDset(str(ranks))
     self.pimpl.remove_ranks(idset)
     return self
Пример #17
0
 def _encode_rlite(self, ppid, entry, hList, rdict):
     """Encode every rank in the entry's "rank" idset via _encode_rank."""
     kids = entry["children"]
     for rank in IDset(entry["rank"]):
         self._encode_rank(ppid, rank, kids, hList, rdict)
Пример #18
0
 def _encode_rlite(self, ppid, entry, hList, hIndex):
     """
     Encode every rank in the entry's "rank" idset via _encode_rank.

     The host index is advanced by one before each rank is encoded; the
     final index is returned.
     """
     position = hIndex
     for rank in IDset(entry["rank"]):
         position += 1
         self._encode_rank(ppid, rank, entry["children"], hList, position)
     return position
Пример #19
0
 def _idset_update(self, state, idset):
     if state not in self.idsets:
         self.idsets[state] = IDset()
     self.idsets[state].add(idset)
Пример #20
0
 def ranks(self, hosts=None):
     """
     Return an IDset wrapping the ranks reported by the implementation.

     With no hosts the implementation's ranks() result is wrapped;
     otherwise hosts is passed to hosts_to_ranks() and that result is
     wrapped.
     """
     if hosts is not None:
         raw = self.pimpl.hosts_to_ranks(hosts)
     else:
         raw = self.pimpl.ranks()
     return IDset(handle=raw)