Exemplo n.º 1
0
def create_test_jobspec(args):
    """Build a JobspecV1 for a test job from parsed command-line arguments.

    Args:
        args: namespace providing ``command``, ``setopt``, ``setattr``,
            ``exec`` and ``runtime``.  An empty ``command`` is replaced
            in place with ``["true"]``.

    Returns:
        The jobspec with every requested shell option and attribute applied.

    Raises:
        ValueError: if a ``--setattr`` entry has no ``=value`` part.
    """
    if not args.command:
        args.command = ["true"]
    jobspec = JobspecV1.from_command(args.command)

    #  Shell options: a bare "KEY" means KEY=1.  A given value is decoded
    #  as JSON when it parses, and kept as the raw string otherwise.
    shell_options = [] if args.setopt is None else args.setopt
    for item in shell_options:
        key, sep, raw = item.partition("=")
        if sep:
            try:
                val = json.loads(raw)
            except (json.JSONDecodeError, TypeError):
                val = raw
        else:
            val = 1
        jobspec.setattr_shell_option(key, val)

    #  Jobspec attributes: a value is mandatory here, decoded the same way
    attributes = [] if args.setattr is None else args.setattr
    for item in attributes:
        key, sep, raw = item.partition("=")
        if not sep:
            raise ValueError("--setattr: Missing value for attr " + item)
        try:
            val = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            val = raw
        jobspec.setattr(key, val)

    #  Unless args.exec is set, record the requested runtime as the
    #  system.exec.test.run_duration attribute
    if not args.exec:
        jobspec.setattr("system.exec.test.run_duration", args.runtime)

    return jobspec
Exemplo n.º 2
0
 def submitJob(self):
     """Submit a waitable ``sleep 0`` job on this object's Flux handle.

     The jobspec asks for 2 tasks on 1 node with 1 core per task, and
     carries the current working directory and a copy of ``os.environ``.
     """
     spec = JobspecV1.from_command(
         command=["sleep", "0"],
         num_tasks=2,
         num_nodes=1,
         cores_per_task=1,
     )
     spec.cwd = os.getcwd()
     spec.environment = dict(os.environ)
     flux.job.submit(self.fh, spec, waitable=True)
Exemplo n.º 3
0
 def test_as_completed(self):
     """Three ``true`` jobs each report result 0 and no exception."""
     with FluxExecutor() as executor:
         spec = JobspecV1.from_command(["true"])
         submitted = []
         for _ in range(3):
             submitted.append(executor.submit(spec))
         # as_completed only yields finished futures, so a zero timeout
         # on result() is enough
         for finished in cf.as_completed(submitted):
             self.assertEqual(finished.result(timeout=0), 0)
             self.assertIsNone(finished.exception())
Exemplo n.º 4
0
 def test_exception_event(self):
     """A job with a bogus command yields a JobException and an "exception" event."""
     with FluxExecutor() as executor:
         seen = threading.Event()

         def on_exception(fut, event):
             # only record that the callback ran
             seen.set()

         bad_spec = JobspecV1.from_command(["/not/a/real/app"])
         future = executor.submit(bad_spec)
         future.add_event_callback("exception", on_exception)
         self.assertIsInstance(future.exception(), JobException)
         self.assertTrue(seen.is_set())
Exemplo n.º 5
0
    def init_jobspec(self, args):
        """Create the jobspec for a batch script from parsed arguments.

        Args:
            args: namespace providing ``SCRIPT``, ``nslots``,
                ``cores_per_slot``, ``gpus_per_slot``, ``nodes``,
                ``broker_opts`` and ``output``.

        Returns:
            The jobspec, with the script contents and broker options stored
            under ``system.batch`` and, when ``args.output`` is unset,
            stdout directed to the file ``flux-{{id}}.out``.

        Raises:
            ValueError: if ``args.nslots`` is missing or zero.
        """
        # If no script (reading from stdin), then use "flux" as arg[0]
        command = args.SCRIPT if args.SCRIPT else ["flux"]

        if not args.nslots:
            raise ValueError("Number of slots to allocate must be specified")

        resources = {
            "num_tasks": args.nslots,
            "cores_per_task": args.cores_per_slot,
            "gpus_per_task": args.gpus_per_slot,
            "num_nodes": args.nodes,
        }
        jobspec = JobspecV1.from_command(command=command, **resources)

        #  Start one flux-broker per node:
        jobspec.setattr_shell_option("per-resource.type", "node")

        #  Copy script contents into jobspec:
        script = self.read_script(args)
        jobspec.setattr("system.batch.script", script)
        broker_opts = list_split(args.broker_opts)
        jobspec.setattr("system.batch.broker-opts", broker_opts)

        # Any --output value (e.g. none or kvs) suppresses the default
        # file output configured below
        if args.output:
            return jobspec
        default_output = (
            ("output.stdout.type", "file"),
            ("output.stdout.path", "flux-{{id}}.out"),
        )
        for option, value in default_output:
            jobspec.setattr_shell_option(option, value)
        return jobspec
Exemplo n.º 6
0
 def test_15_job_cancel(self):
     """A cancelled ``sleep 1000`` job is reported by wait as unsuccessful."""
     self.sleep_jobspec = JobspecV1.from_command(["sleep", "1000"])
     jobid = job.submit(self.fh, self.sleep_jobspec, waitable=True)
     job.cancel(self.fh, jobid)

     wait_future = job.wait_async(self.fh, jobid=jobid)
     status = wait_future.wait_for(5.0).get_status()
     # status unpacks to (jobid, success flag, error message)
     return_id, success, _errmsg = status
     self.assertEqual(return_id, jobid)
     self.assertFalse(success)
Exemplo n.º 7
0
 def test_executor_event_callbacks(self):
     """Callbacks for a ``false`` job see every expected event name."""
     remaining = {"start", "finish", "depend", "priority", "free"}

     def mark_seen(fut, event):
         # tick off the event; names outside the set are ignored
         remaining.discard(event.name)

     with FluxExecutor() as executor:
         future = executor.submit(JobspecV1.from_command(["false"]))
         for event_name in executor.EVENTS:
             future.add_event_callback(event_name, mark_seen)
     self.assertFalse(remaining)  # no more expected events
Exemplo n.º 8
0
 def test_20_003_job_event_watch_sync(self):
     """The first event read from a watch future is "submit"."""
     spec = JobspecV1.from_command(["sleep", "0"])
     jobid = job.submit(self.fh, spec)
     self.assertTrue(jobid > 0)

     watcher = job.event_watch_async(self.fh, jobid)
     self.assertIsInstance(watcher, job.JobEventWatchFuture)

     first = watcher.get_event()
     self.assertIsInstance(first, job.EventLogEvent)
     self.assertEqual(first.name, "submit")
     # only the first event is needed; cancel the watch
     watcher.cancel()
Exemplo n.º 9
0
def main():
    """Submit N job bundles through a FluxExecutor and wait for them all.

    The first ``N // 2`` submissions use the compute jobspec and the
    remaining ones the io-forwarding jobspec.  ``event_callback`` is
    registered for every executor event on every future.
    """
    # set up command-line parser
    description = (
        "submit and wait for the completion of "
        "N bundles, each consisting of compute "
        "and io-forwarding jobs"
    )
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        "njobs",
        metavar="N",
        type=int,
        help="the number of bundles to submit and wait",
    )
    args = parser.parse_args()
    total = args.njobs
    half = total // 2

    # set up jobspecs; both run from the current directory with a copy
    # of this process's environment
    compute_jobreq = JobspecV1.from_command(
        command=["./compute.py", "10"],
        num_tasks=6,
        num_nodes=3,
        cores_per_task=2,
    )
    compute_jobreq.cwd = os.getcwd()
    compute_jobreq.environment = dict(os.environ)

    io_jobreq = JobspecV1.from_command(
        command=["./io-forwarding.py", "10"],
        num_tasks=3,
        num_nodes=3,
        cores_per_task=1,
    )
    io_jobreq.cwd = os.getcwd()
    io_jobreq.environment = dict(os.environ)

    # submit jobs and register event callbacks for all events
    with FluxExecutor() as executor:
        futures = []
        for _ in range(half):
            futures.append(executor.submit(compute_jobreq))
        for _ in range(half, total):
            futures.append(executor.submit(io_jobreq))
        print("bookkeeper: all jobs submitted")
        # each event can have a different callback
        for fut in futures:
            for event in executor.EVENTS:
                fut.add_event_callback(event, event_callback)
        print("bookkeeper: waiting until all jobs complete")
    # exiting the context manager waits for the executor to complete all futures
    print("bookkeeper: all jobs completed")
Exemplo n.º 10
0
 def test_wait(self):
     """Exercise ``cf.wait`` with each ``return_when`` mode on three ``false`` jobs."""
     with FluxExecutor(threads=3) as executor:
         spec = JobspecV1.from_command(["false"])
         futures = [executor.submit(spec) for _ in range(3)]
         # the early-return modes only need their done sets checked
         for mode in (cf.FIRST_COMPLETED, cf.FIRST_EXCEPTION):
             finished, _ = cf.wait(futures, return_when=mode)
             self._check_done(finished)
         # the default mode returns once every future is done
         finished, pending = cf.wait(futures)
         self._check_done(finished)
         self.assertEqual(len(pending), 0)
Exemplo n.º 11
0
    def init_jobspec(self, args):
        """Create a jobspec for ``args.command`` with the requested resources.

        Args:
            args: namespace providing ``command``, ``ntasks``,
                ``cores_per_task``, ``gpus_per_task`` and ``nodes``.

        Returns:
            The result of ``JobspecV1.from_command`` for those values.

        Raises:
            ValueError: if ``args.command`` is empty.
        """
        command = args.command
        if not command:
            raise ValueError("job command and arguments are missing")

        resources = {
            "num_tasks": args.ntasks,
            "cores_per_task": args.cores_per_task,
            "gpus_per_task": args.gpus_per_task,
            "num_nodes": args.nodes,
        }
        return JobspecV1.from_command(command, **resources)
Exemplo n.º 12
0
 def test_failed_submit(self):
     """A ``false`` job gets a jobid, fires the jobid callback and returns 1."""
     with FluxExecutor(thread_name_prefix="foobar") as executor:
         jobspec = JobspecV1.from_command(["false"])
         # The Event must exist before the callback is registered.  The
         # callback closes over ``event`` and may be invoked as soon as the
         # jobid is available (possibly from an executor thread).  If it
         # ran before the assignment it would raise NameError.
         event = threading.Event()
         future = executor.submit(jobspec).add_jobid_callback(
             lambda future: event.set()
         )
         jobid = future.jobid()
         self.assertGreater(jobid, 0)
         self.assertTrue(event.is_set())
         self.assertEqual(future.result(), 1)
         self.assertIsNone(future.exception())
Exemplo n.º 13
0
 def test_submit_after_shutdown(self):
     """submit() and attach() raise RuntimeError once the executor is shut down."""
     executor = FluxExecutor()
     executor.shutdown(wait=True)

     # rejected whether or not the argument is a real jobspec
     with self.assertRaises(RuntimeError):
         executor.submit(JobspecV1.from_command(["true"]))
     with self.assertRaises(RuntimeError):
         executor.submit(None)

     for jobid in (5, None):
         with self.assertRaises(RuntimeError):
             executor.attach(jobid)

     # the rejected calls must leave the executor's _broken_event unset
     self.assertFalse(executor._broken_event.is_set())
Exemplo n.º 14
0
 def test_20_004_job_event_watch(self):
     """``job.event_watch`` yields 10 well-formed events for a ``sleep 0`` job."""
     spec = JobspecV1.from_command(["sleep", "0"])
     jobid = job.submit(self.fh, spec)
     self.assertTrue(jobid > 0)

     # attribute name -> exact type each event must carry
     expected_types = (("timestamp", float), ("name", str), ("context", dict))
     names = []
     for event in job.event_watch(self.fh, jobid):
         self.assertIsInstance(event, job.EventLogEvent)
         for attr, _ in expected_types:
             self.assertTrue(hasattr(event, attr))
         for attr, exact_type in expected_types:
             self.assertIs(type(getattr(event, attr)), exact_type)
         names.append(event.name)
     self.assertEqual(len(names), 10)
Exemplo n.º 15
0
    def test_16_job_kill(self):
        """A ``sleep 1000`` job killed with SIGKILL is reported as unsuccessful."""
        self.sleep_jobspec = JobspecV1.from_command(["sleep", "1000"])
        jobid = job.submit(self.fh, self.sleep_jobspec, waitable=True)

        #  Wait for shell to fully start to avoid delay in signal
        job.event_wait(self.fh, jobid, name="start")
        job.event_wait(
            self.fh,
            jobid,
            name="shell.start",
            eventlog="guest.exec.eventlog",
        )

        job.kill(self.fh, jobid, signum=signal.SIGKILL)
        wait_future = job.wait_async(self.fh, jobid=jobid)
        status = wait_future.wait_for(5.0).get_status()
        # status unpacks to (jobid, success flag, error message)
        return_id, success, _errmsg = status
        self.assertEqual(return_id, jobid)
        self.assertFalse(success)
Exemplo n.º 16
0
 def test_bad_submit_arguments(self):
     """Send bad arguments to ``flux.job.submit``.

     Five submissions carrying an unexpected ``not_an_arg`` keyword are
     queued, and the executor thread's ``run()`` is called directly in
     this thread.  Each future must end up with a TypeError, the queue
     must be empty, and no flux futures may remain outstanding.
     """
     jobspec = JobspecV1.from_command(["false"])
     deq = collections.deque()
     event = threading.Event()
     thread = _FluxExecutorThread(event, deq, 0.01, (), {})
     futures = [FluxExecutorFuture(threading.get_ident()) for _ in range(5)]
     # each queue entry is (positional args, keyword args, future)
     for fut in futures:
         deq.append(((jobspec,), {"not_an_arg": 42}, fut))
     event.set()
     thread.run()
     self.assertFalse(deq)
     self.assertEqual(0, thread._FluxExecutorThread__remaining_flux_futures)
     for fut in futures:
         self.assertIsInstance(fut.exception(), TypeError)
Exemplo n.º 17
0
 def test_as_completed(self):
     """Completed ``true`` jobs can be re-attached by jobid with the same result."""
     with FluxExecutor() as executor:
         spec = JobspecV1.from_command(["true"])
         submitted = [executor.submit(spec) for _ in range(3)]
         reattached = []
         for finished in cf.as_completed(submitted):
             self.assertEqual(finished.result(timeout=0), 0)
             self.assertIsNone(finished.exception())
             # attach a second future to the same job via its jobid
             jobid = finished.jobid()
             attached = executor.attach(jobid)
             self.assertEqual(jobid, attached.jobid())
             reattached.append(attached)
         for attached in cf.as_completed(reattached):
             self.assertEqual(attached.result(timeout=0), 0)
             self.assertIsNone(attached.exception())
     self.assertFalse(executor._broken_event.is_set())
Exemplo n.º 18
0
 def test_cancel(self):
     """Cancel freshly submitted ``false`` jobs and check either outcome."""
     with FluxExecutor() as executor:
         spec = JobspecV1.from_command(["false"])
         for _ in range(3):
             future = executor.submit(spec)
             if not future.cancel():
                 # cancel() was refused: the job's result must be 1
                 self.assertEqual(future.result(), 1)
                 self.assertIsNone(future.exception())
                 continue
             # cancelled: neither a jobid nor an exception is retrievable
             self.assertFalse(future.running())
             self.assertTrue(future.cancelled())
             with self.assertRaises(cf.CancelledError):
                 future.jobid()
             with self.assertRaises(cf.CancelledError):
                 future.exception()
Exemplo n.º 19
0
 def test_20_006_job_event_wait(self):
     """``job.event_wait`` returns the named event and raises OSError for "foo"."""

     def expect_event(event, name):
         # every returned event must be an EventLogEvent with the given name
         self.assertIsInstance(event, job.EventLogEvent)
         self.assertEqual(event.name, name)

     spec = JobspecV1.from_command(["sleep", "0"])
     jobid = job.submit(self.fh, spec)
     self.assertTrue(jobid > 0)

     expect_event(job.event_wait(self.fh, jobid, "start"), "start")
     # "shell.init" is looked up in the guest.exec.eventlog eventlog
     shell_init = job.event_wait(
         self.fh,
         jobid,
         "shell.init",
         eventlog="guest.exec.eventlog",
     )
     expect_event(shell_init, "shell.init")
     expect_event(job.event_wait(self.fh, jobid, "clean"), "clean")

     with self.assertRaises(OSError):
         job.event_wait(self.fh, jobid, "foo")
Exemplo n.º 20
0
 def test_cancel(self):
     """Futures cancelled before the executor thread runs raise CancelledError."""
     jobspec = JobspecV1.from_command(["false"])
     deq = collections.deque()
     event = threading.Event()
     thread = _FluxExecutorThread(event, deq, 0.01, (), {})

     futures = []
     for _ in range(5):
         fut = FluxExecutorFuture(threading.get_ident())
         # queue the submission, then cancel it before the thread runs
         deq.append(((jobspec,), {}, fut))
         fut.cancel()
         futures.append(fut)

     event.set()
     # run() is called directly, so the thread body executes in this thread
     thread.run()

     for fut in futures:
         with self.assertRaises(cf.CancelledError):
             fut.result()
         with self.assertRaises(cf.CancelledError):
             fut.jobid()
Exemplo n.º 21
0
 def test_cancel_attach(self):
     """Cancel futures attached to a ``true`` job and check either outcome."""
     with FluxExecutor() as executor:
         spec = JobspecV1.from_command(["true"])
         jobid = executor.submit(spec).jobid()
         for _ in range(3):
             attached = executor.attach(jobid)
             if not attached.cancel():
                 # cancel() was refused: the job's result must be 0
                 self.assertEqual(attached.result(), 0)
                 self.assertIsNone(attached.exception())
                 continue
             # cancelled: the jobid stays readable, the exception does not
             self.assertFalse(attached.running())
             self.assertTrue(attached.cancelled())
             self.assertEqual(attached.jobid(), jobid)
             with self.assertRaises(cf.CancelledError):
                 attached.exception()
     self.assertFalse(executor._broken_event.is_set())
Exemplo n.º 22
0
    def init_jobspec(self, args):
        """Create the jobspec that runs ``flux broker`` for an allocation.

        Args:
            args: namespace providing ``nslots``, ``broker_opts``,
                ``COMMAND``, ``cores_per_slot``, ``gpus_per_slot`` and
                ``nodes``.

        Returns:
            The jobspec, with the ``per-resource.type`` shell option set to
            ``node`` and ``pty`` enabled when stdin is a terminal.

        Raises:
            ValueError: if ``args.nslots`` is missing or zero.
        """
        if not args.nslots:
            raise ValueError("Number of slots to allocate must be specified")

        # command line: flux broker [broker options...] [user command...]
        command = ["flux", "broker"]
        command.extend(list_split(args.broker_opts))
        command.extend(args.COMMAND)

        resources = {
            "num_tasks": args.nslots,
            "cores_per_task": args.cores_per_slot,
            "gpus_per_task": args.gpus_per_slot,
            "num_nodes": args.nodes,
        }
        jobspec = JobspecV1.from_command(command=command, **resources)
        jobspec.setattr_shell_option("per-resource.type", "node")

        interactive = sys.stdin.isatty()
        if interactive:
            jobspec.setattr_shell_option("pty", True)
        return jobspec
Exemplo n.º 23
0
 def test_20_007_job_event_wait_exception(self):
     """Waiting for "start" on a 128-task job never returns an event.

     With the default arguments an "alloc" JobException of severity 0 is
     checked for; with ``raiseJobException=False`` an OSError carrying
     ENODATA is checked for.
     """
     spec = JobspecV1.from_command(["sleep", "0"], num_tasks=128)
     jobid = job.submit(self.fh, spec)
     self.assertTrue(jobid > 0)

     event = None
     try:
         event = job.event_wait(self.fh, jobid, "start")
     except job.JobException as job_err:
         self.assertEqual(job_err.severity, 0)
         self.assertEqual(job_err.type, "alloc")
         self.assertGreater(job_err.timestamp, 0.0)
     # no event may have been returned by the call above
     self.assertIs(event, None)

     try:
         event = job.event_wait(
             self.fh,
             jobid,
             "start",
             raiseJobException=False,
         )
     except OSError as os_err:
         self.assertEqual(os_err.errno, errno.ENODATA)
     self.assertIs(event, None)
Exemplo n.º 24
0
 def test_20_005_job_event_watch_with_cancel(self):
     """Cancelling a watch at "start" ends the event stream early."""
     spec = JobspecV1.from_command(["sleep", "3"])
     jobid = job.submit(self.fh, spec, waitable=True)
     self.assertTrue(jobid > 0)

     names = []
     future = job.event_watch_async(self.fh, jobid)
     # get_event() returns None once the stream has ended
     event = future.get_event()
     while event is not None:
         if event.name == "start":
             future.cancel()
         names.append(event.name)
         event = future.get_event()
     self.assertEqual(event, None)
     # Should have less than the expected number of events due to cancel
     self.assertLess(len(names), 8)

     # the job itself is still running: cancel it and reap it
     job.cancel(self.fh, jobid)
     job.wait(self.fh, jobid)
Exemplo n.º 25
0
 def test_exception_event(self):
     """A bogus command raises JobException on both submit and attach futures."""

     def assert_exception_fires(future):
         # the "exception" callback must have run by the time
         # future.exception() has returned the JobException
         seen = threading.Event()
         future.add_event_callback("exception", lambda fut, event: seen.set())
         self.assertIsInstance(future.exception(), JobException)
         self.assertTrue(seen.is_set())

     with FluxExecutor() as executor:
         bad_spec = JobspecV1.from_command(["/not/a/real/app"])
         submitted = executor.submit(bad_spec)
         assert_exception_fires(submitted)
         # repeat the test, attaching to the same job
         jobid = submitted.jobid()
         attached = executor.attach(jobid)
         self.assertEqual(jobid, attached.jobid())
         assert_exception_fires(attached)
     self.assertFalse(executor._broken_event.is_set())
Exemplo n.º 26
0
def main():
    """Submit one command ``--njobs`` times via FluxExecutor and report rates.

    The command defaults to ``true`` when none is given.  Timing starts
    before the executor is created and is reported after all jobids are
    known, after the first completion, and after all jobs have finished.
    """
    parser = argparse.ArgumentParser(
        description="Submit a command repeatedly using FluxExecutor"
    )
    parser.add_argument(
        "-n",
        "--njobs",
        type=int,
        metavar="N",
        default=100,
        help="Set the total number of jobs to run",
    )
    parser.add_argument("command", nargs=argparse.REMAINDER)
    args = parser.parse_args()
    command = args.command if args.command else ["true"]
    njobs = args.njobs

    t0 = time.perf_counter()
    label = "bulksubmit_executor"
    with FluxExecutor() as executor:
        compute_jobspec = JobspecV1.from_command(command)
        futures = []
        for _ in range(njobs):
            futures.append(executor.submit(compute_jobspec))
        # wait for the jobid for each job, as a proxy for the job being submitted
        for fut in futures:
            fut.jobid()
        # all jobs submitted - print timings
        dt = time.perf_counter() - t0
        jps = njobs / dt
        log(label, f"submitted {njobs} jobs in {dt:.2f}s. {jps:.2f}job/s")
        # wait for jobs to complete
        for completed, _ in enumerate(cf.as_completed(futures), start=1):
            if completed == 1:
                elapsed = time.perf_counter() - t0
                log(label, f"First job finished in about {elapsed:.3f}s")
            jps = completed / (time.perf_counter() - t0)
            progress(completed / njobs, length=58, suffix=f"({jps:.1f} job/s)")
    # print time summary
    dt = time.perf_counter() - t0
    log(label, f"Ran {njobs} jobs in {dt:.1f}s. {njobs / dt:.1f} job/s")
Exemplo n.º 27
0
    def test_20_005_1_job_event_watch_with_cancel_stop_true(self):
        """With ``cancel(stop=True)`` at "start", "start" is the last event seen."""
        spec = JobspecV1.from_command(["sleep", "3"])
        jobid = job.submit(self.fh, spec, waitable=True)
        self.assertTrue(jobid > 0)

        seen = []
        watcher = job.event_watch_async(self.fh, jobid)

        def on_event(future, names):
            # record each event name; cancel the watch once "start" arrives
            event = future.get_event()
            if event.name == "start":
                future.cancel(stop=True)
            names.append(event.name)

        watcher.then(on_event, seen)
        # NOTE(review): this relies on cancel(stop=True) ending the watch
        # so that reactor_run() returns - confirm against the flux API
        self.fh.reactor_run()

        # Last event should be "start"
        self.assertEqual(seen[-1], "start")
        # the job itself is still running: cancel it and reap it
        job.cancel(self.fh, jobid)
        job.wait(self.fh, jobid)
Exemplo n.º 28
0
def main():
    """Run ``njobs`` compute jobs with at most ``window_size`` in flight.

    Jobs are submitted while the window has room.  Otherwise the loop
    waits for at least one outstanding job to finish and prints its
    outcome.  Exits through ``parser.error`` when jobs were requested
    with a window smaller than 1.
    """
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("njobs", nargs="?", type=int, default=10)
    parser.add_argument("window_size", nargs="?", type=int, default=2)
    args = parser.parse_args()
    # A window below 1 never allows a submission, and cf.wait() returns
    # immediately for an empty list, so the loop below would spin forever.
    if args.njobs > 0 and args.window_size < 1:
        parser.error("window_size must be at least 1")
    print(args)
    # create jobspec for compute.py
    compute_jobspec = JobspecV1.from_command(
        command=["./compute.py", "5"], num_tasks=4, num_nodes=2, cores_per_task=2
    )
    compute_jobspec.cwd = os.getcwd()
    compute_jobspec.environment = dict(os.environ)
    # create a queue of the jobspecs to submit
    jobspec_queue = collections.deque(compute_jobspec for _ in range(args.njobs))
    futures = []  # holds incomplete futures
    with FluxExecutor() as executor:
        while jobspec_queue or futures:
            if len(futures) < args.window_size and jobspec_queue:
                # room in the window: submit the next queued jobspec
                fut = executor.submit(jobspec_queue.popleft())
                print(f"submit: {id(fut)}")
                futures.append(fut)
                continue
            # window full (or queue drained): block until a job finishes
            done, not_done = cf.wait(futures, return_when=cf.FIRST_COMPLETED)
            futures = list(not_done)
            for fut in done:
                error = fut.exception()
                if error is not None:
                    print(f"wait: {id(fut)} Error: job raised error {error}")
                    continue
                exit_code = fut.result()
                if exit_code == 0:
                    print(f"wait: {id(fut)} Success")
                else:
                    print(
                        f"wait: {id(fut)} Error: job returned "
                        f"exit code {exit_code}"
                    )
Exemplo n.º 29
0
    def test_20_001_job_event_watch_async(self):
        """An async watch delivers 10 events, from "submit" to "clean"."""
        callback_arg = {"a": 1, "b": 2}
        names = []

        def on_event(future, arg):
            # the extra argument given to then() must arrive unchanged
            self.assertEqual(arg, callback_arg)
            event = future.get_event()
            if event is not None:
                self.assertIsInstance(event, job.EventLogEvent)
                names.append(event.name)
                return
            # a None event ends the stream: stop the reactor
            future.get_flux().reactor_stop()

        spec = JobspecV1.from_command(["sleep", "0"])
        jobid = job.submit(self.fh, spec)
        self.assertTrue(jobid > 0)
        watcher = job.event_watch_async(self.fh, jobid)
        self.assertIsInstance(watcher, job.JobEventWatchFuture)
        watcher.then(on_event, callback_arg)

        rc = self.fh.reactor_run()
        self.assertGreaterEqual(rc, 0)
        self.assertEqual(len(names), 10)
        self.assertEqual(names[0], "submit")
        self.assertEqual(names[-1], "clean")
Exemplo n.º 30
0
    def test_20_002_job_event_watch_no_autoreset(self):
        """``get_event(autoreset=False)`` repeats an event until ``reset()``."""
        spec = JobspecV1.from_command(["sleep", "0"])
        jobid = job.submit(self.fh, spec)
        self.assertTrue(jobid > 0)
        watcher = job.event_watch_async(self.fh, jobid)
        self.assertIsInstance(watcher, job.JobEventWatchFuture)

        def expect_current(name):
            # read the current event without advancing the stream
            event = watcher.get_event(autoreset=False)
            self.assertIsInstance(event, job.EventLogEvent)
            self.assertEqual(event.name, name)

        # First event should be "submit"
        expect_current("submit")

        # get_event() again with no reset returns same event:
        expect_current("submit")

        # reset, then get_event() should get next event
        watcher.reset()
        expect_current("validate")

        watcher.cancel()