Example #1
def drain(args):
    """
    Send a drain request to the resource module for args.targets. If
    args.targets is not specified, list currently drained targets.
    """
    if args.targets is None:
        drain_list()
        return
    payload = {
        "targets": args.targets,
    }
    if args.update and args.force:
        LOGGER.error("Only one of --force and --update may be specified")
        sys.exit(1)
    if args.update:
        payload["mode"] = "update"
    elif args.force:
        payload["mode"] = "overwrite"
    if args.reason:
        payload["reason"] = " ".join(args.reason)
    RPC(
        flux.Flux(),
        "resource.drain",
        payload,
        nodeid=0,
    ).get()
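The RPC helper used above is interchangeable with the handle's own rpc() method seen in later examples. A minimal sketch of the same blocking drain request, with the function name and payload fields treated as illustrative:

import flux

def drain_ranks(targets, reason=None):
    # Build the same payload the drain command sends and block on the reply.
    payload = {"targets": targets}
    if reason:
        payload["reason"] = reason
    flux.Flux().rpc("resource.drain", payload, nodeid=0).get()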
Example #2
File: flux_direct.py Project: LLNL/ATS
 def __init__(self, name, npMaxH):
     self.submitted = dict()
     self.fh = flux.Flux()
     jsc.notify_status(self.fh, update_test_status, self)
     # self.broker_thread = thread.start_new_thread(run_broker, (self.fh,))
     self.cores = 0
     max_cores = 0
     self.numNodes = 0
     self.numberCoresInUse = 0
     with kvs.get_dir(self.fh, 'resource.hwloc.by_rank') as d:
         for name, rankdir in d.items():
             max_cores = max(max_cores, rankdir['Core'])
             self.cores += rankdir['Core']
             self.numNodes += 1
     self.npMax = max_cores
     # initialize the parent class with the real core count
     super(FluxDirect, self).__init__(name, self.cores)
     # self.numberTestsRunningMax = 1 # TODO: REMOVE THIS DEBUG VALUE self.cores * 2 # for flux, this is number in the scheduling queue
     self.numberTestsRunningMax = 1000  # for flux, this is number in the scheduling queue
     self.scheduler = FluxScheduler()
     self.timer = self.fh.timer_watcher_create(
         after=self.naptime,
         repeat=self.naptime,
         callback=lambda fh, y, z, w: fh.reactor_stop(fh.get_reactor()))
     self.timer.start()
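The timer-watcher idiom at the end of this constructor can be exercised on its own. A minimal sketch, assuming a one-second interval and reusing the watcher and reactor calls shown in these examples:

import flux

def run_until_timer(naptime=1.0):
    # Arm a repeating timer whose callback stops the reactor on the first tick.
    fh = flux.Flux()
    timer = fh.timer_watcher_create(
        after=naptime,
        repeat=naptime,
        callback=lambda handle, watcher, revents, arg: handle.reactor_stop(
            handle.get_reactor()
        ),
    )
    timer.start()
    fh.reactor_run(fh.get_reactor(), 0)
    timer.destroy()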
Example #3
def jobtap_load(args):
    """Load a jobtap plugin into the job manager"""
    if args.plugin == "none" or args.plugin.startswith("builtin."):
        path = args.plugin
    else:
        path = os.path.abspath(args.plugin)

    try:
        resp = (flux.Flux().rpc("job-manager.jobtap", {
            "load": path,
            "conf": args.conf
        }).get())
    except FileNotFoundError:
        LOGGER.error(
            "%s not found",
            args.plugin,
        )
        sys.exit(1)
    if not args.quiet:
        print("Loaded:")
        for name in resp["plugins"]:
            print(name)
        print("Previously loaded:")
        for name in resp["previous"]:
            print(name)
Example #4
 def test_no_topic_invalid(self):
     """flux_request_encode returns EINVAL with no topic string"""
     f = flux.Flux("loop://")
     with self.assertRaises(EnvironmentError) as err:
         f.request_encode(None, json_str)
     err = err.exception
     self.assertEqual(err.errno, errno.EINVAL)
Example #5
def undrain(args):
    """
    Send an "undrain" request to resource module for args.targets
    """
    RPC(flux.Flux(), "resource.undrain", {
        "targets": args.targets
    }, nodeid=0).get()
Example #6
def submit_bundles(f, N):
    f = flux.Flux()
    for i in range(0, N):
        print(flux.job.submit(f, compute_jobreq))
        print(flux.job.submit(f, io_jobreq))

    print("bookkeeper: all jobs submitted")
Example #7
def main():
    nthreads = 2
    threads = []
    queue = Queue()
    for i in range(0, nthreads):
        thread = threading.Thread(
            target=get_events,
            args=(
                i,
                queue,
            ),
        )
        thread.start()
        threads.append(thread)

    print(f"starting {nthreads} threads", file=sys.stderr)

    # Ensure threads have subscribed to 'test-event'
    for thread in threads:
        queue.get()
        print(f"got response from {thread}", file=sys.stderr)

    print(f"{nthreads} threads started", file=sys.stderr)

    flux.Flux().event_send("test-event", "hello")

    print(f"published test-event", file=sys.stderr)

    for thread in threads:
        thread.join()

    print("Done", file=sys.stderr)
Example #8
def main():

    args = parse_args()

    time0 = time.time()

    jobspec = create_test_jobspec(args)

    bulk = BulkRun(flux.Flux(), args.njobs, jobspec).run(args)

    jobs = bulk.jobs

    #  Get the job with the earliest 'submit' event:
    first = jobs[min(jobs.keys(), key=lambda x: jobs[x]["submit"].timestamp)]

    #  Get the job with the latest 'clean' event:
    last = jobs[max(jobs.keys(), key=lambda x: jobs[x]["clean"].timestamp)]

    #  Get the job with the latest 't_submit' time:
    lastsubmit = jobs[max(jobs.keys(), key=lambda x: jobs[x]["t_submit"])]
    submit_time = lastsubmit["t_submit"] - time0
    sjps = args.njobs / submit_time

    script_runtime = time.time() - time0
    job_runtime = last["clean"].timestamp - first["submit"].timestamp
    jps = args.njobs / job_runtime
    jpsb = args.njobs / script_runtime

    print(f"number of jobs: {args.njobs}")
    print(f"submit time:    {submit_time:<6.3f}s ({sjps:5.1f} job/s)")
    print(f"script runtime: {script_runtime:<6.3f}s")
    print(f"job runtime:    {job_runtime:<6.3f}s")
    print(f"throughput:     {jps:<.1f} job/s (script: {jpsb:5.1f} job/s)")
Example #9
def main():
    implementation = "bulksubmit"
    start_time = time.perf_counter()
    args = setup_parser().parse_args()
    # open connection to broker
    h = flux.Flux()
    # create jobspec for sleep command
    compute_jobspec = job.JobspecV1.from_command(command=["true"],
                                                 num_tasks=1,
                                                 num_nodes=1,
                                                 cores_per_task=1)
    compute_jobspec.cwd = os.getcwd()
    done = 0
    for _ in range(args.jobcount):
        job.submit_async(h, compute_jobspec, waitable=True).then(submit_cb)
    if h.reactor_run(h.get_reactor(), 0) < 0:
        h.fatal_error("reactor start failed")
    while done < args.jobcount:
        jobid, success, errstr = job.wait(h)
        if not success:
            print("wait: {} Error: {}".format(jobid, errstr))
        done += 1
    total_time = time.perf_counter() - start_time
    print("Total seconds: {}".format(total_time))
    utils.save_timing_data(args.jobcount, total_time, implementation)
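The waitable-submission pattern above also works without driving the reactor. A minimal synchronous sketch using job.submit()/job.wait() with the same jobspec construction; the job count and command are illustrative:

import os
import flux
from flux import job

def submit_and_wait(njobs=10):
    h = flux.Flux()
    spec = job.JobspecV1.from_command(
        command=["true"], num_tasks=1, num_nodes=1, cores_per_task=1
    )
    spec.cwd = os.getcwd()
    # Submit njobs waitable jobs, then reap them one by one.
    for _ in range(njobs):
        job.submit(h, spec, waitable=True)
    for _ in range(njobs):
        jobid, success, errstr = job.wait(h)
        if not success:
            print("wait: {} Error: {}".format(jobid, errstr))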
Example #10
def list_handler(args):
    valid_states = ["up", "down", "allocated", "free", "all"]
    headings = {
        "state": "STATE",
        "nnodes": "NNODES",
        "ncores": "NCORES",
        "ngpus": "NGPUS",
        "ranks": "RANKS",
        "rlist": "LIST",
    }

    states = args.states.split(",")
    for state in states:
        if state not in valid_states:
            LOGGER.error("Invalid resource state %s specified", state)
            sys.exit(1)

    fmt = "{state:>10} {nnodes:>6} {ncores:>8} {ngpus:>8}"
    if args.verbose:
        fmt += " {rlist}"
    if args.format:
        fmt = args.format

    formatter = flux.util.OutputFormat(headings, fmt, prepend="0.")

    if args.from_stdin:
        resp = json.load(sys.stdin)
    else:
        resp = RPC(flux.Flux(), "sched.resource-status").get()
    resources = SchedResourceList(resp)

    if not args.no_header:
        print(formatter.header())
    for state in states:
        print(formatter.format(resources[state]))
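The OutputFormat helper used here only needs an object whose attributes match the fields named in the format string. A minimal sketch with a hypothetical namedtuple record, assuming the constructor signature used in these examples (headings dict, format string, prepend="0."):

import collections
import flux.util

# Hypothetical record carrying the fields referenced by the format string.
Record = collections.namedtuple("Record", ["state", "nnodes", "ncores", "ngpus"])

headings = {"state": "STATE", "nnodes": "NNODES", "ncores": "NCORES", "ngpus": "NGPUS"}
fmt = "{state:>10} {nnodes:>6} {ncores:>8} {ngpus:>8}"
formatter = flux.util.OutputFormat(headings, fmt, prepend="0.")

print(formatter.header())
print(formatter.format(Record(state="free", nnodes=4, ncores=128, ngpus=8)))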
Example #11
def drain_list():
    headings = {
        "timestamp": "TIMESTAMP",
        "ranks": "RANK",
        "reason": "REASON",
        "nodelist": "NODELIST",
    }
    resp = RPC(flux.Flux(), "resource.status").get()
    rset = ResourceSet(resp["R"])
    nodelist = rset.nodelist

    lines = []
    for ranks, entry in resp["drain"].items():
        ranks = IDset(ranks)
        line = StatusLine(
            "drain",
            ranks,
            Hostlist([nodelist[i] for i in ranks]),
            entry["reason"],
            entry["timestamp"],
        )
        lines.append(line)

    fmt = "{timestamp:<20} {ranks:<8} {reason:<30} {nodelist}"
    formatter = flux.util.OutputFormat(headings, fmt, prepend="0.")
    print(formatter.header())
    for line in lines:
        print(formatter.format(line))
Example #12
def jobtap_remove(args):
    """Remove jobtap plugin matching name"""
    try:
        flux.Flux().rpc("job-manager.jobtap", {"remove": args.plugin}).get()
    except FileNotFoundError:
        LOGGER.error("%s not found", args.plugin)
        sys.exit(1)
Example #13
def get_root_jobinfo():
    """Fetch a mock JobInfo object for the current enclosing instance"""

    handle = flux.Flux()
    size = handle.attr_get("size")

    try:
        #  If the enclosing instance has a jobid and a parent-uri, then
        #   fill in data from job-list in the parent:
        #
        jobid = JobID(handle.attr_get("jobid"))
        parent = flux.Flux(handle.attr_get("parent-uri"))
        info = JobList(parent, ids=[jobid]).fetch_jobs().get_jobs()[0]
    except OSError:
        #  Make a best-effort attempt to create a mock job info dictionary
        uri = handle.attr_get("local-uri")
        nodelist = handle.attr_get("hostlist")
        userid = handle.attr_get("security.owner")
        info = dict(
            id=0,
            userid=int(userid),
            state=flux.constants.FLUX_JOB_STATE_RUN,
            name=".",
            ntasks=int(size),
            nnodes=int(size),
            nodelist=nodelist,
            annotations={"user": {"uri": uri}},
        )
        try:
            info["t_run"] = float(handle.attr_get("broker.starttime"))
        except OSError:
            pass

    #  If 'ranks' idset came from parent, it could be confusing,
    #   rewrite ranks to be relative to current instance, i.e.
    #   0-(size-1)
    #
    info["ranks"] = "0-{}".format(int(size) - 1)

    #  Fetch instance-specific information for the current instance:
    job = JobInfo(info).get_instance_info()

    #  If no jobid was discovered for the root instance, use RootJobID()
    if job.id == 0:
        job.id = RootJobID()

    return job
Example #14
File: groups.py Project: tpatki/flux-core
def get(args):
    """
    Get current value of group.
    This only works on rank 0, but for testing that case we have --rank.
    """
    h = flux.Flux()
    resp = h.rpc("groups.get", {"name": args.name}, nodeid=args.rank).get()
    print(resp["members"])
Example #15
    def __init__(self):
        self.event_router = aurcore.event.EventRouter(name="roombot")
        self.flux = flux.Flux("roombot", admin_id=TOKENS.ADMIN_ID, parent_router=self.event_router)
        print("init!")

        @self.flux.router.endpoint(":ready")
        def rdy(event: aurcore.event.Event):
            asyncio.get_running_loop().create_task(self.clock())
Example #16
def get_events(i, queue):
    f = flux.Flux()
    f.event_subscribe("test-event")
    queue.put(True)
    w = f.msg_watcher_create(cb, topic_glob="test-event", args=i)
    w.start()
    f.reactor_run()
    w.destroy()
Example #17
 def setUpClass(self):
     self.f = flux.Flux()
     self.job_spec = json.dumps({
         "nnodes": 1,
         "ntasks": 1,
         "cmdline": ["sleep", "0"],
         "walltime": 15
     })
Example #18
        def add_service_and_disconnect():
            import sys

            h = flux.Flux()
            try:
                h.service_register("baz").get()
            except Exception:
                sys.exit(-1)
            sys.exit(0)
Example #19
def status(args):
    valid_states = [
        "all",
        "online",
        "avail",
        "offline",
        "exclude",
        "drain",
        "draining",
        "drained",
    ]
    default_states = "avail,offline,exclude,draining,drained"
    headings = {
        "state": "STATUS",
        "nnodes": "NNODES",
        "ranks": "RANKS",
        "nodelist": "NODELIST",
        "reason": "REASON",
    }

    #  Emit list of valid states or formats if requested
    if "help" in [args.states, args.format]:
        status_help(args, valid_states, headings)

    #  Get state list from args or defaults:
    states = status_get_state_list(args, valid_states, default_states)

    #  Include reason field only with -vv
    if args.verbose >= 2:
        fmt = "{state:>10} {nnodes:>6} {reason:<25} {nodelist}"
    else:
        fmt = "{state:>10} {nnodes:>6} {nodelist}"
    if args.format:
        fmt = args.format

    #  Get payload from stdin or from resource.status RPC:
    if args.from_stdin:
        resp = sys.stdin.read()
        allocated = IDset()
    else:
        rpc = ListStatusRPC(flux.Flux())
        resp = rpc.get_status()
        allocated = rpc.get_allocated_ranks()

    rstat = ResourceStatus.from_status_response(resp, fmt, allocated)

    formatter = flux.util.OutputFormat(headings, fmt, prepend="0.")
    if not args.no_header:
        print(formatter.header())
    for line in sorted(rstat, key=lambda x: valid_states.index(x.state)):
        if line.state not in states:
            continue
        #  Skip empty lines unless --verbose or --states
        if line.nnodes == 0 and args.states is None and not args.verbose:
            continue
        print(formatter.format(line))
Example #20
def job_exec_start(args):
    """Start testexec job under manual override"""
    try:
        flux.Flux().rpc("job-exec.override", {
            "event": "start",
            "jobid": args.jobid
        }).get()
    except OSError as exc:
        LOGGER.error("%s", exc.strerror)
        sys.exit(1)
Example #21
    def __init__(self):
        self.event_router = aurcore.event.EventRouter(name="roombot")
        self.flux = flux.Flux("pinbot",
                              admin_id=TOKENS.ADMIN_ID,
                              parent_router=self.event_router)
        print("init!")

        @self.flux.router.endpoint(":ready")
        def rdy(event: aurcore.event.Event):
            print("Ready!")
Example #22
def kill(args):
    h = flux.Flux(os.environ.get("FLUX_START_URI"))
    try:
        h.rpc("start.kill", {
            "rank": int(args.rank),
            "signum": int(args.signum)
        }).get()
    except ProcessLookupError:
        LOGGER.error("rank %s broker process not found", args.rank)
        sys.exit(1)
Example #23
 def test_null_handle_exception(self):
     f = flux.Flux()
     payload = {"seq": 1, "pad": "stuff"}
     future = f.rpc("cmb.ping", payload)
     resp = future.get()
     future.pimpl.handle = None
     with six.assertRaisesRegex(
         self, ValueError, r"Attempting to call a cached, bound method.*NULL handle"
     ):
         resp = future.get()
Example #24
def reload(args):
    """
    Send a "reload" request to resource module
    """
    RPC(
        flux.Flux(),
        "resource.reload",
        {"path": os.path.realpath(args.path), "xml": args.xml, "force": args.force},
        nodeid=0,
    ).get()
Example #25
 def __getattr__(self, attr):
     if attr == "flux":
         #  Allow one flux handle per thread, created on demand:
         try:
             return self.tls.flux
         except AttributeError:
             self.tls.flux = flux.Flux()
             return self.tls.flux
     else:
         #  Return components of the validate request as attrs
         return self.jobinfo[attr]
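The one-handle-per-thread idea in __getattr__ can also be expressed as a small module-level helper. A minimal sketch using threading.local; the helper name is illustrative:

import threading
import flux

_local = threading.local()

def thread_flux():
    """Return this thread's flux.Flux() handle, creating it on first use."""
    try:
        return _local.flux
    except AttributeError:
        _local.flux = flux.Flux()
        return _local.flux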
Example #26
File: t0010-job.py Project: trws/flux-core
    def setUpClass(self):
        self.fh = flux.Flux()

        self.jobspec_dir = os.path.abspath(
            os.path.join(os.environ["FLUX_SOURCE_DIR"], "t", "jobspec"))

        # get a valid jobspec
        basic_jobspec_fname = os.path.join(self.jobspec_dir, "valid",
                                           "basic_v1.yaml")
        with open(basic_jobspec_fname, "rb") as infile:
            basic_yaml = infile.read()
        self.basic_jobspec = yaml_to_json(basic_yaml)
Example #27
def drain(args):
    """
    Send a drain request to resource module for args.idset
    """
    RPC(
        flux.Flux(),
        "resource.drain",
        {
            "idset": args.idset,
            "reason": " ".join(args.reason)
        },
    ).get()
Example #28
def main():
    h = flux.Flux()

    alloc = h.rpc("sched.alloc", json.dumps({"id": 0}))
    free = h.rpc("sched.free", json.dumps({"id": 0}))
    print("Sent alloc and free requests")

    h.rpc("cmb.rmmod", json.dumps({"name": args.sched_module})).get()
    print("Removed {}".format(args.sched_module))

    expect_enosys(alloc)
    expect_enosys(free)
Example #29
def run_per_rank(name, jobid, args):
    """Run args.exec_per_rank on every rank of jobid

    If command fails on any rank then drain that rank
    """

    returncode = 0

    if args.exec_per_rank is None:
        return 0

    per_rank_cmd = args.exec_per_rank.split(",")

    processes = {}
    fail_ids = IDset()

    handle = flux.Flux()
    hostlist = flux.hostlist.Hostlist(handle.attr_get("hostlist"))

    ranks = fetch_job_ranks(handle, jobid)
    if ranks is None:
        return 1

    if args.verbose:
        LOGGER.info(
            "%s: %s: executing %s on ranks %s", jobid, name, per_rank_cmd, ranks
        )

    for rank in ranks:
        cmd = ["flux", "exec", "-qn", f"-r{rank}"] + per_rank_cmd
        processes[rank] = process_create(cmd, stderr=subprocess.PIPE)

    for rank in ranks:
        rc = processes[rank].wait()
        for line in processes[rank].stderr:
            errline = line.decode("utf-8").rstrip()
            LOGGER.error("%s (rank %d): %s", hostlist[rank], rank, errline)
        if rc != 0:
            fail_ids.set(rank)
            if rc > returncode:
                returncode = rc

    if len(fail_ids) > 0:
        LOGGER.error("%s: rank %s failed %s, draining", jobid, fail_ids, name)
        drain(handle, fail_ids, f"{name} failed for jobid {jobid}")

    if args.verbose:
        ranks.subtract(fail_ids)
        if len(ranks) > 0:
            LOGGER.info("%s: %s: completed successfully on %s", jobid, name, ranks)

    return returncode
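The rank-to-hostname mapping used above can be factored into a small helper. A minimal sketch, assuming IDset and Hostlist are importable from flux.idset and flux.hostlist as in these examples; the default idset is illustrative:

import flux
from flux.hostlist import Hostlist
from flux.idset import IDset

def ranks_to_hosts(ranks="0-3"):
    # Resolve broker ranks to hostnames via the instance 'hostlist' attribute.
    handle = flux.Flux()
    hosts = Hostlist(handle.attr_get("hostlist"))
    return [hosts[rank] for rank in IDset(ranks)]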
Example #30
File: groups.py Project: tpatki/flux-core
def waitfor(args):
    """
    Wait for group to have zero (or --count) members.
    """
    h = flux.Flux()
    rpc = h.rpc(
        "groups.get",
        {"name": args.name},
        nodeid=0,
        flags=flux.constants.FLUX_RPC_STREAMING,
    )
    rpc.then(waitfor_continuation, args.count)
    h.reactor_run()
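The continuation passed to rpc.then() is not shown in this example. A minimal sketch of a streaming continuation that prints the first response and stops the reactor; the group name and function names are illustrative:

import flux
import flux.constants

def print_first_update(name="test"):
    h = flux.Flux()
    rpc = h.rpc(
        "groups.get",
        {"name": name},
        nodeid=0,
        flags=flux.constants.FLUX_RPC_STREAMING,
    )

    def continuation(future):
        # Print the first membership update, then stop the reactor.
        print(future.get()["members"])
        h.reactor_stop(h.get_reactor())

    rpc.then(continuation)
    h.reactor_run()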