def main():
    """Submit a mix of compute and failing jobs, then report each outcome.

    Reads an optional positional ``njobs`` (default 10) from the command
    line.  The first ``njobs // 2`` submissions use the ``./compute.py``
    jobspec and the remainder use ``/bin/false``.  Each future is then
    inspected in submission order and one line is printed per job.
    """
    # parse command line
    cli = argparse.ArgumentParser()
    cli.add_argument("njobs", nargs="?", type=int, default=10)
    args = cli.parse_args()

    # create jobspec for compute.py
    compute_jobspec = JobspecV1.from_command(
        command=["./compute.py", "10"],
        num_tasks=4,
        num_nodes=2,
        cores_per_task=2,
    )
    compute_jobspec.cwd = os.getcwd()
    compute_jobspec.environment = dict(os.environ)
    bad_jobspec = JobspecV1.from_command(["/bin/false"])

    half = args.njobs // 2

    # create an executor to submit jobs
    with FluxExecutor() as executor:
        futures = []

        # submit half successful jobs and half failures
        for _ in range(half):
            futures.append(executor.submit(compute_jobspec))
            print(f"submit: {id(futures[-1])} compute_jobspec")
        for _ in range(half, args.njobs):
            futures.append(executor.submit(bad_jobspec))
            print(f"submit: {id(futures[-1])} bad_jobspec")

        # wait for each future in turn
        for fut in futures:
            if fut.exception() is not None:
                print(f"wait: {id(fut)} Error: job raised error {fut.exception()}")
                continue
            if fut.result() == 0:
                print(f"wait: {id(fut)} Success")
                continue
            print(f"wait: {id(fut)} Error: job returned exit code {fut.result()}")
def test_22_from_batch_command(self):
    """Test that `from_batch_command` produces a valid jobspec"""
    valid_spec = JobspecV1.from_batch_command("#!/bin/sh\nsleep 0", "nested sleep")
    jobid = job.submit(self.fh, valid_spec)
    self.assertGreater(jobid, 0)

    # test that a shebang is required
    with self.assertRaises(ValueError):
        job.submit(
            self.fh,
            JobspecV1.from_batch_command("sleep 0", "nested sleep with no shebang"),
        )
def submitJob(self):
    """Submit one waitable ``sleep 0`` job on ``self.fh``.

    The jobspec requests 2 tasks on 1 node with 1 core per task, and
    carries the current working directory and a copy of the environment.
    """
    request = JobspecV1.from_command(
        command=["sleep", "0"],
        num_tasks=2,
        num_nodes=1,
        cores_per_task=1,
    )
    request.cwd = os.getcwd()
    request.environment = dict(os.environ)
    flux.job.submit(self.fh, request, waitable=True)
def test_submit_after_shutdown(self):
    """Check that ``submit`` raises RuntimeError once the executor is shut down."""
    executor = FluxExecutor()
    executor.shutdown(wait=True)

    # a well-formed jobspec is rejected ...
    with self.assertRaises(RuntimeError):
        executor.submit(JobspecV1.from_command(["true"]))

    # ... and so is a bogus argument
    with self.assertRaises(RuntimeError):
        executor.submit(None)
def init_jobspec(self, args):
    """Build the jobspec for a batch submission from parsed arguments.

    Raises:
        ValueError: if ``args.nslots`` is falsy.

    Returns:
        The jobspec, with the script contents and broker options stored
        under ``system.batch`` and one task per node requested through the
        ``per-resource.type`` shell option.
    """
    # If no script (reading from stdin), then use "flux" as arg[0]
    command = args.SCRIPT or ["flux"]

    if not args.nslots:
        raise ValueError("Number of slots to allocate must be specified")

    jobspec = JobspecV1.from_command(
        command=command,
        num_tasks=args.nslots,
        cores_per_task=args.cores_per_slot,
        gpus_per_task=args.gpus_per_slot,
        num_nodes=args.nodes,
    )

    # Start one flux-broker per node:
    jobspec.setattr_shell_option("per-resource.type", "node")

    # Copy script contents into jobspec:
    script_text = self.read_script(args)
    jobspec.setattr("system.batch.script", script_text)
    jobspec.setattr("system.batch.broker-opts", list_split(args.broker_opts))

    # Default output path template is "flux-{{id}}.out"; it is only
    # applied when no --output value was given.
    if not args.output:
        jobspec.setattr_shell_option("output.stdout.type", "file")
        jobspec.setattr_shell_option("output.stdout.path", "flux-{{id}}.out")

    return jobspec
def test_as_completed(self):
    """Submit three ``true`` jobs and check each one as it completes."""
    with FluxExecutor() as executor:
        jobspec = JobspecV1.from_command(["true"])
        pending = []
        for _ in range(3):
            pending.append(executor.submit(jobspec))

        for finished in cf.as_completed(pending):
            # timeout=0: the future must already hold its result
            self.assertEqual(finished.result(timeout=0), 0)
            self.assertIsNone(finished.exception())
def test_exception_completion(self):
    """Check which ``exception`` events complete a future.

    A ``start`` event and a severity-1 exception event must leave the
    future pending; a severity-0 exception event must complete it with a
    ``JobException``.
    """

    def exception_watch(severity):
        # Sham watch future wrapping an "exception" event of the given severity.
        return ShamJobEventWatchFuture(
            EventLogEvent(
                {
                    "name": "exception",
                    "timestamp": 0,
                    "context": {"severity": severity, "type": "foobar"},
                }
            )
        )

    # Not referenced by the assertions below; retained from the original.
    jobspec = JobspecV1.from_command(["false"])
    thread = _FluxExecutorThread(
        threading.Event(), threading.Event(), collections.deque(), 0.01, (), {}
    )
    fut = FluxExecutorFuture(threading.get_ident())
    self.assertFalse(fut.done())

    fut._set_event(EventLogEvent({"name": "start", "timestamp": 0}))
    self.assertFalse(fut.done())

    # name-mangled private method of _FluxExecutorThread
    update = thread._FluxExecutorThread__event_update

    update(exception_watch(1), fut)
    self.assertFalse(fut.done())

    update(exception_watch(0), fut)
    self.assertTrue(fut.done())
    self.assertIsInstance(fut.exception(), JobException)
def test_exception_event(self):
    """Check that an ``exception`` event callback fires for a failing job.

    Submits ``/not/a/real/app`` and asserts that the future's exception is
    a ``JobException`` and that the callback has set its flag.
    """
    with FluxExecutor() as executor:
        flag = threading.Event()
        jobspec = JobspecV1.from_command(["/not/a/real/app"])
        future = executor.submit(jobspec)
        future.add_event_callback("exception", lambda fut, event: flag.set())

        self.assertIsInstance(future.exception(), JobException)
        self.assertTrue(flag.is_set())
def create_test_jobspec(args):
    """Create a test jobspec from parsed command-line arguments.

    ``args.command`` defaults to ``["true"]`` when empty (``args`` is
    updated in place).  Each ``KEY[=VAL]`` in ``args.setopt`` becomes a
    shell option and each ``KEY=VAL`` in ``args.setattr`` becomes a jobspec
    attribute.  Values are decoded as JSON when possible and otherwise used
    verbatim.  Unless ``args.exec`` is truthy,
    ``system.exec.test.run_duration`` is set to ``args.runtime``.

    Raises:
        ValueError: if a ``--setattr`` entry has no ``=VAL`` part.
    """
    if not args.command:
        args.command = ["true"]
    jobspec = JobspecV1.from_command(args.command)

    # Set any requested shell options
    if args.setopt is not None:
        for keyval in args.setopt:
            # Split into key, val with a default of 1 if no val given
            key, sep, val = keyval.partition("=")
            if not sep:
                val = 1
            try:
                val = json.loads(val)
            except (json.JSONDecodeError, TypeError):
                # not JSON (or the integer default): keep val unchanged
                pass
            jobspec.setattr_shell_option(key, val)

    # Set any requested Jobspec attributes
    if args.setattr is not None:
        for keyval in args.setattr:
            key, sep, raw = keyval.partition("=")
            if not sep:
                raise ValueError("--setattr: Missing value for attr " + keyval)
            try:
                val = json.loads(raw)
            except (json.JSONDecodeError, TypeError):
                val = raw
            jobspec.setattr(key, val)

    if not args.exec:
        jobspec.setattr("system.exec.test.run_duration", args.runtime)

    return jobspec
def test_broken_executor(self):
    """Check that ``submit`` and ``attach`` raise once the broken flag is set."""
    broken_msg = "Executor is broken.*"
    with FluxExecutor() as executor:
        # flag the executor as broken by hand
        executor._broken_event.set()

        with self.assertRaisesRegex(RuntimeError, broken_msg):
            executor.submit(JobspecV1.from_command(["/not/a/real/app"]))

        with self.assertRaisesRegex(RuntimeError, broken_msg):
            executor.attach(25979)
def test_15_job_cancel(self):
    """Cancel a waitable ``sleep 1000`` job and check that it is reported as failed."""
    self.sleep_jobspec = JobspecV1.from_command(["sleep", "1000"])
    jobid = job.submit(self.fh, self.sleep_jobspec, waitable=True)
    job.cancel(self.fh, jobid)

    # wait up to 5 seconds for the job's wait status
    wait_future = job.wait_async(self.fh, jobid=jobid)
    fut = wait_future.wait_for(5.0)
    return_id, success, _errmsg = fut.get_status()

    self.assertEqual(return_id, jobid)
    self.assertFalse(success)
def test_executor_event_callbacks(self):
    """Check that event callbacks observe every expected event name.

    A callback that discards the event's name from ``expected_events`` is
    registered for each entry of ``executor.EVENTS``.  The set must be
    empty at the end.
    """
    expected_events = {"start", "finish", "depend", "priority", "free"}
    with FluxExecutor() as executor:
        future = executor.submit(JobspecV1.from_command(["false"]))
        for event_name in executor.EVENTS:
            future.add_event_callback(
                event_name, lambda fut, event: expected_events.discard(event.name)
            )
    # Checked after leaving the context manager so that the executor has
    # finished processing the job before the set is inspected.
    self.assertFalse(expected_events)  # no more expected events
def test_20_003_job_event_watch_sync(self):
    """Check that the first event read from an event-watch future is ``submit``."""
    jobspec = JobspecV1.from_command(["sleep", "0"])
    jobid = job.submit(self.fh, jobspec)
    self.assertTrue(jobid > 0)

    future = job.event_watch_async(self.fh, jobid)
    self.assertIsInstance(future, job.JobEventWatchFuture)

    first_event = future.get_event()
    self.assertIsInstance(first_event, job.EventLogEvent)
    self.assertEqual(first_event.name, "submit")

    future.cancel()
def main():
    """Submit N bundles of compute and io-forwarding jobs and track their events.

    Reads the positional ``njobs`` argument.  The first ``njobs // 2``
    submissions use the compute jobspec and the rest use the io-forwarding
    jobspec.  ``event_callback`` is registered on every future for every
    event in ``executor.EVENTS``.
    """
    # set up command-line parser
    parser = argparse.ArgumentParser(
        description="submit and wait for the completion of "
        "N bundles, each consisting of compute "
        "and io-forwarding jobs"
    )
    parser.add_argument(
        "njobs",
        metavar="N",
        type=int,
        help="the number of bundles to submit and wait",
    )
    args = parser.parse_args()

    # set up jobspecs
    compute_jobreq = JobspecV1.from_command(
        command=["./compute.py", "10"],
        num_tasks=6,
        num_nodes=3,
        cores_per_task=2,
    )
    compute_jobreq.cwd = os.getcwd()
    compute_jobreq.environment = dict(os.environ)

    io_jobreq = JobspecV1.from_command(
        command=["./io-forwarding.py", "10"],
        num_tasks=3,
        num_nodes=3,
        cores_per_task=1,
    )
    io_jobreq.cwd = os.getcwd()
    io_jobreq.environment = dict(os.environ)

    split = args.njobs // 2

    # submit jobs and register event callbacks for all events
    with FluxExecutor() as executor:
        futures = [executor.submit(compute_jobreq) for _ in range(split)]
        futures += [executor.submit(io_jobreq) for _ in range(split, args.njobs)]
        print("bookkeeper: all jobs submitted")

        for fut in futures:
            # each event can have a different callback
            for event in executor.EVENTS:
                fut.add_event_callback(event, event_callback)
        print("bookkeeper: waiting until all jobs complete")

    # exiting the context manager waits for the executor to complete all futures
    print("bookkeeper: all jobs completed")
def test_wait(self):
    """Run ``concurrent.futures.wait`` with each ``return_when`` mode.

    Three ``false`` jobs are submitted to a three-thread executor.  The
    ``done`` set from every wait is passed to ``self._check_done``, and the
    final default-mode wait must leave nothing pending.
    """
    with FluxExecutor(threads=3) as executor:
        jobspec = JobspecV1.from_command(["false"])
        futures = [executor.submit(jobspec) for _ in range(3)]

        for condition in (cf.FIRST_COMPLETED, cf.FIRST_EXCEPTION):
            done, _pending = cf.wait(futures, return_when=condition)
            self._check_done(done)

        # default return_when
        done, not_done = cf.wait(futures)
        self._check_done(done)
        self.assertEqual(len(not_done), 0)
def init_jobspec(self, args):
    """Build a jobspec for ``args.command`` with the requested resources.

    Raises:
        ValueError: if ``args.command`` is empty.
    """
    if not args.command:
        raise ValueError("job command and arguments are missing")

    jobspec = JobspecV1.from_command(
        args.command,
        num_tasks=args.ntasks,
        cores_per_task=args.cores_per_task,
        gpus_per_task=args.gpus_per_task,
        num_nodes=args.nodes,
    )
    return jobspec
def test_submit_after_shutdown(self):
    """Check that ``submit`` and ``attach`` raise RuntimeError after shutdown.

    Also asserts that the rejected calls did not set the executor's broken
    flag.
    """
    executor = FluxExecutor()
    executor.shutdown(wait=True)

    # submit: valid jobspec, then bogus argument
    with self.assertRaises(RuntimeError):
        executor.submit(JobspecV1.from_command(["true"]))
    with self.assertRaises(RuntimeError):
        executor.submit(None)

    # attach: integer job id, then bogus argument
    for target in (5, None):
        with self.assertRaises(RuntimeError):
            executor.attach(target)

    self.assertFalse(executor._broken_event.is_set())
def test_failed_submit(self):
    """Submit a ``false`` job and check its jobid callback and result.

    Asserts that the job id is positive, that the jobid callback has set
    its flag by the time ``jobid()`` returns, that the job's result is 1,
    and that the future holds no exception.
    """
    with FluxExecutor(thread_name_prefix="foobar") as executor:
        jobspec = JobspecV1.from_command(["false"])
        # Create the Event before registering the callback: the lambda
        # closes over ``event``, so if the callback were invoked before
        # the name was bound it would raise NameError instead of setting
        # the flag.
        event = threading.Event()
        future = executor.submit(jobspec).add_jobid_callback(
            lambda future: event.set()
        )
        jobid = future.jobid()
        self.assertGreater(jobid, 0)
        self.assertTrue(event.is_set())
        self.assertEqual(future.result(), 1)
        self.assertIsNone(future.exception())
def test_bad_submit_arguments(self):
    """send bad arguments to ``flux.job.submit``"""
    queue = collections.deque()
    stop_event = threading.Event()
    thread = _FluxExecutorThread(stop_event, queue, 0.01, (), {})
    futures = [FluxExecutorFuture(threading.get_ident()) for _ in range(5)]
    jobspec = JobspecV1.from_command(["false"])

    # queue one (args, kwargs, future) entry per future, each carrying an
    # unexpected keyword argument
    for fut in futures:
        queue.append(((jobspec,), {"not_an_arg": 42}, fut))

    stop_event.set()
    thread.run()

    self.assertFalse(queue)
    self.assertEqual(0, thread._FluxExecutorThread__remaining_flux_futures)
    for fut in futures:
        self.assertIsInstance(fut.exception(), TypeError)
def test_16_job_kill(self):
    """Kill a running ``sleep 1000`` job with SIGKILL and check that it is reported as failed."""
    self.sleep_jobspec = JobspecV1.from_command(["sleep", "1000"])
    jobid = job.submit(self.fh, self.sleep_jobspec, waitable=True)

    # Wait for shell to fully start to avoid delay in signal
    job.event_wait(self.fh, jobid, name="start")
    job.event_wait(
        self.fh,
        jobid,
        name="shell.start",
        eventlog="guest.exec.eventlog",
    )

    job.kill(self.fh, jobid, signum=signal.SIGKILL)

    # wait up to 5 seconds for the job's wait status
    wait_future = job.wait_async(self.fh, jobid=jobid)
    fut = wait_future.wait_for(5.0)
    return_id, success, _errmsg = fut.get_status()

    self.assertEqual(return_id, jobid)
    self.assertFalse(success)
def test_20_004_job_event_watch(self):
    """Iterate a job's events and check each event's shape and the total count.

    Every event must be an ``EventLogEvent`` with a float ``timestamp``, a
    str ``name`` and a dict ``context``; exactly 10 events are expected.
    """
    jobid = job.submit(self.fh, JobspecV1.from_command(["sleep", "0"]))
    self.assertTrue(jobid > 0)

    names = []
    for event in job.event_watch(self.fh, jobid):
        self.assertIsInstance(event, job.EventLogEvent)

        # attributes must exist ...
        for attr in ("timestamp", "name", "context"):
            self.assertTrue(hasattr(event, attr))

        # ... and have exactly these types
        self.assertIs(type(event.timestamp), float)
        self.assertIs(type(event.name), str)
        self.assertIs(type(event.context), dict)

        names.append(event.name)

    self.assertEqual(len(names), 10)
def test_as_completed(self):
    """Check ``as_completed`` on submitted futures and on futures attached to the same jobs.

    Each completed submit future must report result 0 and no exception.
    A future attached to the same job id must report the same job id and,
    once complete, the same result.  The executor must not be flagged as
    broken.
    """
    with FluxExecutor() as executor:
        jobspec = JobspecV1.from_command(["true"])
        futures = [executor.submit(jobspec) for _ in range(3)]

        attach_futures = []
        for fut in cf.as_completed(futures):
            self.assertEqual(fut.result(timeout=0), 0)
            self.assertIsNone(fut.exception())

            attached = executor.attach(fut.jobid())
            self.assertEqual(fut.jobid(), attached.jobid())
            attach_futures.append(attached)

        for attached in cf.as_completed(attach_futures):
            self.assertEqual(attached.result(timeout=0), 0)
            self.assertIsNone(attached.exception())

        self.assertFalse(executor._broken_event.is_set())
def test_cancel(self):
    """Try to cancel submitted futures and check both possible outcomes.

    If ``cancel()`` succeeds, the future must be cancelled and not running,
    and ``jobid()`` and ``exception()`` must raise ``CancelledError``.
    Otherwise the ``false`` job must finish with result 1 and no exception.
    """
    with FluxExecutor() as executor:
        jobspec = JobspecV1.from_command(["false"])
        for _ in range(3):
            future = executor.submit(jobspec)

            if not future.cancel():
                # too late to cancel: the job ran to completion
                self.assertEqual(future.result(), 1)
                self.assertIsNone(future.exception())
                continue

            self.assertFalse(future.running())
            self.assertTrue(future.cancelled())
            with self.assertRaises(cf.CancelledError):
                future.jobid()
            with self.assertRaises(cf.CancelledError):
                future.exception()
def init_jobspec(self, args):
    """Build a jobspec via ``JobspecV1.from_nest_command`` from parsed arguments.

    The ``pty`` shell option is enabled when stdin is a terminal.

    Raises:
        ValueError: if ``args.nslots`` is falsy.
    """
    if not args.nslots:
        raise ValueError("Number of slots to allocate must be specified")

    jobspec = JobspecV1.from_nest_command(
        command=args.COMMAND,
        num_slots=args.nslots,
        cores_per_slot=args.cores_per_slot,
        gpus_per_slot=args.gpus_per_slot,
        num_nodes=args.nodes,
        broker_opts=list_split(args.broker_opts),
    )

    interactive = sys.stdin.isatty()
    if interactive:
        jobspec.setattr_shell_option("pty", True)

    return jobspec
def test_cancel(self):
    """Check that futures cancelled before the thread runs stay cancelled.

    Five futures are queued and cancelled before ``thread.run()`` is
    called.  Afterwards ``result()`` and ``jobid()`` must both raise
    ``CancelledError`` on every one.
    """
    queue = collections.deque()
    stop_event = threading.Event()
    jobspec = JobspecV1.from_command(["false"])
    thread = _FluxExecutorThread(stop_event, queue, 0.01, (), {})
    futures = [FluxExecutorFuture(threading.get_ident()) for _ in range(5)]

    for fut in futures:
        queue.append(((jobspec,), {}, fut))
        fut.cancel()

    stop_event.set()
    thread.run()

    for fut in futures:
        for accessor in (fut.result, fut.jobid):
            with self.assertRaises(cf.CancelledError):
                accessor()
def test_cancel_attach(self):
    """Try to cancel attached futures and check both possible outcomes.

    If ``cancel()`` succeeds, the future must be cancelled and not running,
    must still report the job id, and ``exception()`` must raise
    ``CancelledError``.  Otherwise the ``true`` job must finish with result
    0 and no exception.  The executor must not be flagged as broken.
    """
    with FluxExecutor() as executor:
        jobspec = JobspecV1.from_command(["true"])
        jobid = executor.submit(jobspec).jobid()

        for _ in range(3):
            future = executor.attach(jobid)

            if not future.cancel():
                # too late to cancel: the job ran to completion
                self.assertEqual(future.result(), 0)
                self.assertIsNone(future.exception())
                continue

            self.assertFalse(future.running())
            self.assertTrue(future.cancelled())
            self.assertEqual(future.jobid(), jobid)
            with self.assertRaises(cf.CancelledError):
                future.exception()

        self.assertFalse(executor._broken_event.is_set())
def test_20_006_job_event_wait(self):
    """Check ``job.event_wait`` for several named events.

    Waits in turn for ``start``, ``shell.init`` (from the
    ``guest.exec.eventlog`` eventlog) and ``clean``, checking the returned
    event each time.  Waiting for the name ``foo`` must raise ``OSError``.
    """
    jobid = job.submit(self.fh, JobspecV1.from_command(["sleep", "0"]))
    self.assertTrue(jobid > 0)

    # (event name, extra keyword arguments for event_wait), in order
    waits = (
        ("start", {}),
        ("shell.init", {"eventlog": "guest.exec.eventlog"}),
        ("clean", {}),
    )
    for name, extra in waits:
        event = job.event_wait(self.fh, jobid, name, **extra)
        self.assertIsInstance(event, job.EventLogEvent)
        self.assertEqual(event.name, name)

    with self.assertRaises(OSError):
        job.event_wait(self.fh, jobid, "foo")
def init_jobspec(self, args):
    """Build a jobspec that runs ``flux broker`` with the user's command.

    The command line is ``flux broker <broker opts> <COMMAND...>``, with
    one task per node requested through the ``per-resource.type`` shell
    option.  The ``pty`` shell option is enabled when stdin is a terminal.

    Raises:
        ValueError: if ``args.nslots`` is falsy.
    """
    if not args.nslots:
        raise ValueError("Number of slots to allocate must be specified")

    broker_command = ["flux", "broker"]
    broker_command.extend(list_split(args.broker_opts))
    broker_command.extend(args.COMMAND)

    jobspec = JobspecV1.from_command(
        command=broker_command,
        num_tasks=args.nslots,
        cores_per_task=args.cores_per_slot,
        gpus_per_task=args.gpus_per_slot,
        num_nodes=args.nodes,
    )
    jobspec.setattr_shell_option("per-resource.type", "node")

    interactive = sys.stdin.isatty()
    if interactive:
        jobspec.setattr_shell_option("pty", True)

    return jobspec
def test_20_007_job_event_wait_exception(self):
    """Check how ``job.event_wait`` reports a job exception.

    A 128-task ``sleep 0`` job is submitted.  If waiting for ``start``
    raises ``JobException``, it must be a severity-0 ``alloc`` exception
    with a positive timestamp.  With ``raiseJobException=False``, any
    ``OSError`` raised must carry ``ENODATA``.  In both cases no event may
    have been returned.
    """
    event = None
    jobspec = JobspecV1.from_command(["sleep", "0"], num_tasks=128)
    jobid = job.submit(self.fh, jobspec)
    self.assertTrue(jobid > 0)

    # default behavior: the job exception surfaces as JobException
    try:
        event = job.event_wait(self.fh, jobid, "start")
    except job.JobException as err:
        self.assertEqual(err.severity, 0)
        self.assertEqual(err.type, "alloc")
        self.assertGreater(err.timestamp, 0.0)
    self.assertIs(event, None)

    # with raiseJobException=False an OSError is handled instead
    try:
        event = job.event_wait(self.fh, jobid, "start", raiseJobException=False)
    except OSError as err:
        self.assertEqual(err.errno, errno.ENODATA)
    self.assertIs(event, None)
def test_20_005_job_event_watch_with_cancel(self):
    """Cancel an event watch at the ``start`` event and check that it ends early.

    Events are read until ``get_event()`` returns ``None``, and the watch
    is cancelled once ``start`` is seen.  Fewer than 8 events must have
    been collected.  The job itself is then cancelled and waited on.
    """
    jobspec = JobspecV1.from_command(["sleep", "3"])
    jobid = job.submit(self.fh, jobspec, waitable=True)
    self.assertTrue(jobid > 0)

    seen = []
    future = job.event_watch_async(self.fh, jobid)
    event = future.get_event()
    while event is not None:
        if event.name == "start":
            future.cancel()
        seen.append(event.name)
        event = future.get_event()

    self.assertEqual(event, None)
    # Should have less than the expected number of events due to cancel
    self.assertLess(len(seen), 8)

    job.cancel(self.fh, jobid)
    job.wait(self.fh, jobid)