def exec_watch_cb(self, future, args, jobid, label=""):
    """Process events from the guest.exec.eventlog for a running job."""
    event = future.get_event()
    if event is None or event.name != "shell.init":
        return
    # The shell.init event indicates the shell has initialized, so it
    # is now safe to begin watching the job's output eventlog:
    output_future = job.event_watch_async(
        self.flux_handle, jobid, eventlog="guest.output"
    )
    output_future.then(self.output_watch_cb, args, jobid, label)
    # Nothing further is needed from the exec eventlog:
    future.cancel()
def submit_cb(self, args, label=""):
    """Handle fulfillment of an asynchronous job submission future.

    On success, optionally print the new jobid and, if the user asked
    to wait for or watch jobs, begin watching the main job eventlog.
    On failure, report the error and record a nonzero exit code.
    """
    try:
        jobid = JobID(future.get_id())
        if not args.quiet:
            print(jobid)
    except OSError as exc:
        # Submission failed: report and mark overall failure
        print(f"{label}{exc}", file=sys.stderr)
        self.exitcode = 1
        self.progress_update(submit_failed=True)
        return
    if args.wait or args.watch:
        # Waiting for or watching jobs requires following the main
        # eventlog. Carry a bit of per-job state along so exceptions
        # raised before the job runs can be handled properly.
        jobinfo = {"id": jobid, "state": "submit"}
        watch_future = job.event_watch_async(self.flux_handle, jobid)
        watch_future.then(self.event_watch_cb, args, jobinfo, label)
        self.progress_update(jobinfo, submit=True)
    elif self.progress:
        # Only submission progress needs updating
        self.progress.update(jps=self.jobs_per_sec())
def event_watch_cb(self, future, args, jobinfo, label=""):
    """Handle events in the main job eventlog.

    Dispatches on the event name: records job exceptions (updating the
    global exit code), starts watching the exec eventlog once the job
    starts when --watch was given, and collects exit status on finish.
    A None event marks the end of the eventlog.
    """
    jobid = jobinfo["id"]
    event = future.get_event()
    self.progress_update(jobinfo, event=event)
    if event is None:
        # End of eventlog: nothing more to do for this job
        return
    if event.name == "exception":
        #
        # Handle an exception: update global exitcode and print
        # an error:
        if jobinfo["state"] == "submit":
            #
            # If job was still pending then this job failed
            # to execute. Treat it as failure with exitcode = 1
            #
            jobinfo["state"] = "failed"
            if self.exitcode == 0:
                self.exitcode = 1
        # Print a human readable error:
        exception_type = event.context["type"]
        note = event.context["note"]
        print(
            f"{jobid}: exception: type={exception_type} note={note}",
            file=sys.stderr,
        )
    elif event.name == "start" and args.watch:
        #
        # Watch the exec eventlog if the --watch option was provided:
        #
        jobinfo["state"] = "running"
        job.event_watch_async(
            self.flux_handle, jobid, eventlog="guest.exec.eventlog"
        ).then(self.exec_watch_cb, args, jobid, label)
    elif event.name == "finish":
        #
        # Collect exit status and adjust self.exitcode if necessary:
        #
        jobinfo["state"] = "done"
        status = self.status_to_exitcode(event.context["status"])
        if args.verbose:
            print(f"{jobid}: complete: status={status}", file=sys.stderr)
        # Keep the worst (highest) status seen across all jobs
        if status > self.exitcode:
            self.exitcode = status
def test_20_003_job_event_watch_sync(self):
    """Verify that get_event() can be used synchronously on a watch future."""
    jobspec = JobspecV1.from_command(["sleep", "0"])
    jobid = job.submit(self.fh, jobspec)
    self.assertTrue(jobid > 0)

    future = job.event_watch_async(self.fh, jobid)
    self.assertIsInstance(future, job.JobEventWatchFuture)

    # Calling get_event() directly blocks until the first event arrives;
    # a job's first eventlog entry is always "submit":
    event = future.get_event()
    self.assertIsInstance(event, job.EventLogEvent)
    self.assertEqual(event.name, "submit")
    future.cancel()
def test_20_005_job_event_watch_with_cancel(self):
    """Verify that cancel() terminates an event watch early."""
    jobid = job.submit(
        self.fh, JobspecV1.from_command(["sleep", "3"]), waitable=True
    )
    self.assertTrue(jobid > 0)

    seen = []
    future = job.event_watch_async(self.fh, jobid)
    # Consume events synchronously until the watch ends (None):
    event = future.get_event()
    while event is not None:
        if event.name == "start":
            future.cancel()
        seen.append(event.name)
        event = future.get_event()
    self.assertEqual(event, None)
    # Should have less than the expected number of events due to cancel
    self.assertLess(len(seen), 8)
    job.cancel(self.fh, jobid)
    job.wait(self.fh, jobid)
def test_20_005_1_job_event_watch_with_cancel_stop_true(self):
    """Verify cancel(stop=True) ends the watch and stops the reactor.

    With stop=True, the watch terminates without an ENODATA error, so
    the reactor exits once no other watchers remain.
    """
    jobid = job.submit(
        self.fh, JobspecV1.from_command(["sleep", "3"]), waitable=True
    )
    self.assertTrue(jobid > 0)
    events = []
    future = job.event_watch_async(self.fh, jobid)

    def cb(future, events):
        event = future.get_event()
        # Guard against end-of-eventlog: with stop=True the reactor is
        # expected to exit before a None event is delivered, but the
        # callback must not crash if one arrives
        if event is None:
            return
        if event.name == "start":
            future.cancel(stop=True)
        events.append(event.name)

    future.then(cb, events)
    rc = self.fh.reactor_run()
    # Check the reactor's return code, consistent with
    # test_20_001_job_event_watch_async (previously rc was unused)
    self.assertGreaterEqual(rc, 0)
    # Last event should be "start"
    self.assertEqual(events[-1], "start")
    job.cancel(self.fh, jobid)
    job.wait(self.fh, jobid)
def test_20_002_job_event_watch_no_autoreset(self):
    """Verify get_event(autoreset=False) repeats the current event."""
    jobid = job.submit(self.fh, JobspecV1.from_command(["sleep", "0"]))
    self.assertTrue(jobid > 0)
    future = job.event_watch_async(self.fh, jobid)
    self.assertIsInstance(future, job.JobEventWatchFuture)

    # Without autoreset, repeated get_event() calls return the same
    # first event ("submit") until the future is explicitly reset:
    for _ in range(2):
        event = future.get_event(autoreset=False)
        self.assertIsInstance(event, job.EventLogEvent)
        self.assertEqual(event.name, "submit")

    # After an explicit reset(), the next event is returned:
    future.reset()
    event = future.get_event(autoreset=False)
    self.assertIsInstance(event, job.EventLogEvent)
    self.assertEqual(event.name, "validate")
    future.cancel()
def test_20_001_job_event_watch_async(self):
    """Verify asynchronous event watching via then() callbacks."""
    myarg = dict(a=1, b=2)
    events = []

    def cb(future, arg):
        # The extra argument passed to then() arrives unchanged:
        self.assertEqual(arg, myarg)
        event = future.get_event()
        if event is None:
            # End of eventlog: stop the reactor so reactor_run returns
            future.get_flux().reactor_stop()
        else:
            self.assertIsInstance(event, job.EventLogEvent)
            events.append(event.name)

    jobid = job.submit(self.fh, JobspecV1.from_command(["sleep", "0"]))
    self.assertTrue(jobid > 0)
    future = job.event_watch_async(self.fh, jobid)
    self.assertIsInstance(future, job.JobEventWatchFuture)
    future.then(cb, myarg)

    rc = self.fh.reactor_run()
    self.assertGreaterEqual(rc, 0)
    # A short successful job posts 10 events, "submit" through "clean":
    self.assertEqual(len(events), 10)
    self.assertEqual(events[0], "submit")
    self.assertEqual(events[-1], "clean")
def test_32_job_result(self):
    """Verify job.result_async()/job.result() across job outcomes.

    Submits five jobs covering: successful completion, nonzero exit,
    exec failure, cancel while running, and cancel before RUN state,
    then checks the JobInfo returned for each.
    """
    result = {}
    ids = []

    def cb(future, jobid):
        # Stash each fulfilled result future keyed by its jobid
        result[jobid] = future

    ids.append(job.submit(self.fh, JobspecV1.from_command(["true"])))
    ids.append(job.submit(self.fh, JobspecV1.from_command(["false"])))
    ids.append(job.submit(self.fh, JobspecV1.from_command(["nosuchprog"])))
    ids.append(
        job.submit(self.fh, JobspecV1.from_command(["sleep", "120"])))
    # Submit held job so we can cancel before RUN state
    ids.append(
        job.submit(self.fh, JobspecV1.from_command(["true"]), urgency=0))
    job.cancel(self.fh, ids[4])
    for jobid in ids:
        flux.job.result_async(self.fh, jobid).then(cb, jobid)

    def cancel_on_start(future, jobid):
        # Cancel the long-running sleep job once its job shell has
        # started, so it is canceled while in RUN state
        event = future.get_event()
        if event is None:
            return
        if event.name == "shell.start":
            job.cancel(self.fh, jobid)
            future.cancel()

    job.event_watch_async(
        self.fh, ids[3], eventlog="guest.exec.eventlog").then(
        cancel_on_start, ids[3])
    self.fh.reactor_run()
    # Every submitted job must have produced a result
    self.assertEqual(len(result.keys()), len(ids))
    self.addTypeEqualityFunc(JobInfo, self.assertJobInfoEqual)
    # Successful run: COMPLETED with zero wait status
    # NOTE(review): this entry uses "t_start" where the other entries
    # use "t_submit" -- confirm against assertJobInfoEqual's handling
    self.assertEqual(
        result[ids[0]].get_info(),
        JobInfo({
            "id": ids[0],
            "result": flux.constants.FLUX_JOB_RESULT_COMPLETED,
            "t_start": 1.0,
            "t_run": 2.0,
            "t_cleanup": 3.0,
            "waitstatus": 0,
            "exception_occurred": False,
        }),
    )
    # Nonzero exit: FAILED, waitstatus 256 == exit code 1 << 8
    self.assertEqual(
        result[ids[1]].get_info(),
        JobInfo({
            "id": ids[1],
            "result": flux.constants.FLUX_JOB_RESULT_FAILED,
            "t_submit": 1.0,
            "t_run": 2.0,
            "t_cleanup": 3.0,
            "waitstatus": 256,
            "exception_occurred": False,
        }),
    )
    # Exec failure: FAILED with an "exec" exception from the shell
    self.assertEqual(
        result[ids[2]].get_info(),
        JobInfo({
            "id": ids[2],
            "result": flux.constants.FLUX_JOB_RESULT_FAILED,
            "t_submit": 1.0,
            "t_run": 2.0,
            "t_cleanup": 3.0,
            "waitstatus": 32512,
            "exception_occurred": True,
            "exception_type": "exec",
            "exception_note": "task 0.*: start failed: nosuchprog: "
            "No such file or directory",
            "exception_severity": 0,
        }),
    )
    # Canceled while running: CANCELED with a "cancel" exception
    self.assertEqual(
        result[ids[3]].get_info(),
        JobInfo({
            "id": ids[3],
            "result": flux.constants.FLUX_JOB_RESULT_CANCELED,
            "t_submit": 1.0,
            "t_run": 2.0,
            "t_cleanup": 3.0,
            "waitstatus": 36608,  # 143<<8
            "exception_occurred": True,
            "exception_type": "cancel",
            "exception_note": "",
            "exception_severity": 0,
        }),
    )
    # Canceled before RUN state: CANCELED with no run/cleanup times
    self.assertEqual(
        result[ids[4]].get_info(),
        JobInfo({
            "id": ids[4],
            "result": flux.constants.FLUX_JOB_RESULT_CANCELED,
            "t_submit": 0.0,
            "exception_occurred": True,
            "exception_type": "cancel",
            "exception_note": "",
            "exception_severity": 0,
        }),
    )
    # synchronous job.result() test
    self.assertEqual(job.result(self.fh, ids[3]), result[ids[3]].get_info())
def handle_submit(self, args, jobid):
    """Record a newly submitted job and begin watching its eventlog."""
    # Track the submit timestamp for this job
    self.jobs[jobid] = {"t_submit": time.time()}
    watcher = job.event_watch_async(self.handle, jobid)
    watcher.then(self.event_cb, args, jobid)