def _log_error_tail(self, pk, retcode):
    """Log the tail of a job's output file after a nonzero exit; return the tail.

    If the output file is missing on disk, an empty tail is logged and returned.
    """
    fname = self.outfiles[pk].name
    tail = get_tail(fname) if os.path.exists(fname) else ''
    logger.error(self.log_prefix(pk) + f'nonzero return {retcode}:\n {tail}')
    return tail
def preprocess(job):
    """Run the job's preprocessor executable (if any) in its working directory.

    Output is captured to preprocess.log. Advances job.state to
    'PREPROCESSED' on success or when no preprocessor is configured.
    Raises BalsamTransitionError if the preprocessor is missing, fails to
    launch/complete, or exits nonzero.
    """
    logger.debug(f'{job.cute_id} in preprocess')

    # Get preprocessor exe; jobs without one skip straight to PREPROCESSED
    preproc_app = job.preprocess
    if not preproc_app:
        job.state = 'PREPROCESSED'
        return
    if not os.path.exists(preproc_app.split()[0]):
        # TODO: look for preproc in the EXE directories
        message = f"Preprocessor {preproc_app} does not exist on filesystem"
        raise BalsamTransitionError(message)

    # Create preprocess-specific environment
    envs = job.get_envs()

    # Run preprocessor with special environment in job working directory
    out = os.path.join(job.working_directory, "preprocess.log")
    with open(out, 'w') as fp:
        # BUGFIX: header previously lacked a trailing newline, so subprocess
        # output ran onto the same line (postprocess writes its header + '\n').
        fp.write(f"# Balsam Preprocessor: {preproc_app}\n")
        fp.flush()
        proc = None  # BUGFIX: avoid NameError in cleanup if Popen itself raises
        try:
            args = preproc_app.split()
            logger.info(f"{job.cute_id} preprocess Popen {args}")
            proc = subprocess.Popen(
                args,
                stdout=fp,
                stderr=subprocess.STDOUT,
                env=envs,
                cwd=job.working_directory,
            )
            retcode = proc.wait(timeout=PREPROCESS_TIMEOUT_SECONDS)
            proc.communicate()
        except Exception as e:
            message = f"Preprocess failed: {e}"
            if proc is not None:
                try:
                    proc.kill()
                except Exception:
                    # Best-effort cleanup; the process may already be gone
                    pass
            raise BalsamTransitionError(message) from e

    if retcode != 0:
        tail = get_tail(out)
        message = f"{job.cute_id} preprocess returned {retcode}:\n{tail}"
        raise BalsamTransitionError(message)
    job.state = 'PREPROCESSED'
    logger.debug(f"{job.cute_id} preprocess done")
def check_state(self, run):
    """Poll an MPIRun's process and update run.current_state; return the state.

    None poll result -> RUNNING; exit 0 -> RUN_DONE (workers freed);
    nonzero exit -> RUN_ERROR with the log tail stored in run.err_msg.
    """
    retcode = run.process.poll()

    if retcode is None:
        # Process still executing
        run.current_state = 'RUNNING'
        return run.current_state

    if retcode == 0:
        logger.info(f"MPIRun {run.job.cute_id} done")
        run.current_state = 'RUN_DONE'
        run.outfile.close()
        run.free_workers()
        return run.current_state

    # Nonzero exit: drain the process, close the log, capture its tail
    run.process.communicate()
    run.outfile.close()
    tail = get_tail(run.outfile.name)
    run.current_state = 'RUN_ERROR'
    run.err_msg = tail
    logger.info(f"MPIRun {run.job.cute_id} error code {retcode}:\n{tail}")
    run.free_workers()
    return run.current_state
def postprocess(job, *, error_handling=False, timeout_handling=False):
    """Run the job's postprocessor executable in its working directory.

    error_handling / timeout_handling (mutually exclusive) invoke the
    postprocessor to recover from RUN_ERROR / RUN_TIMEOUT; in that case the
    postprocessor itself is expected to change job.state. Output is captured
    to postprocess.log. Raises BalsamTransitionError if the postprocessor is
    missing (in error mode), fails to run, exits nonzero, or fails to change
    the job state while handling an error/timeout.
    """
    logger.debug(f'{job.cute_id} in postprocess')
    if error_handling and timeout_handling:
        raise ValueError("Both error-handling and timeout-handling is invalid")
    if error_handling:
        logger.info(f'{job.cute_id} handling RUN_ERROR')
    if timeout_handling:
        logger.info(f'{job.cute_id} handling RUN_TIMEOUT')

    # Get postprocessor exe
    postproc_app = job.postprocess

    # If no postprocessor: move on (unless in error_handling mode)
    if not postproc_app:
        if error_handling:
            message = f"{job.cute_id} handle error: no postprocessor found!"
            raise BalsamTransitionError(message)
        elif timeout_handling:
            job.state = 'RESTART_READY'
            logger.warning(
                f'{job.cute_id} unhandled job timeout: marked RESTART_READY')
            return
        else:
            # BUGFIX: the original read `job.state = 'POSTPROCESSED',` — the
            # stray trailing comma assigned the tuple ('POSTPROCESSED',)
            # instead of the state string, breaking the state machine.
            job.state = 'POSTPROCESSED'
            logger.debug(f'{job.cute_id} no postprocess: skipped')
            return

    if not os.path.exists(postproc_app.split()[0]):
        # TODO: look for postproc in the EXE directories
        message = f"Postprocessor {postproc_app} does not exist on filesystem"
        raise BalsamTransitionError(message)

    # Create postprocess-specific environment
    envs = job.get_envs(timeout=timeout_handling, error=error_handling)

    # Run postprocessor with special environment in job working directory
    out = os.path.join(job.working_directory, "postprocess.log")
    with open(out, 'w') as fp:
        fp.write(f"# Balsam Postprocessor: {postproc_app}\n")
        if timeout_handling:
            fp.write("# Invoked to handle RUN_TIMEOUT\n")
        if error_handling:
            fp.write("# Invoked to handle RUN_ERROR\n")
        fp.flush()
        proc = None  # BUGFIX: avoid NameError in cleanup if Popen itself raises
        try:
            args = postproc_app.split()
            logger.info(f"{job.cute_id} postprocess Popen {args}")
            proc = subprocess.Popen(
                args,
                stdout=fp,
                stderr=subprocess.STDOUT,
                env=envs,
                cwd=job.working_directory,
            )
            retcode = proc.wait(timeout=POSTPROCESS_TIMEOUT_SECONDS)
            proc.communicate()
        except Exception as e:
            message = f"Postprocess failed: {e}"
            if proc is not None:
                try:
                    proc.kill()
                except Exception:
                    # Best-effort cleanup; the process may already be gone
                    pass
            raise BalsamTransitionError(message) from e

    if retcode != 0:
        tail = get_tail(out, nlines=30)
        message = f"{job.cute_id} postprocess returned {retcode}:\n{tail}"
        raise BalsamTransitionError(message)

    job.refresh_from_db()
    # If postprocessor handled error or timeout, it should have changed job's
    # state. If it failed to do this, mark FAILED. Otherwise, POSTPROCESSED.
    if error_handling and job.state == 'RUN_ERROR':
        message = f"{job.cute_id} Error handling didn't fix job state: marking FAILED"
        raise BalsamTransitionError(message)
    if timeout_handling and job.state == 'RUN_TIMEOUT':
        message = f"{job.cute_id} Timeout handling didn't change job state: marking FAILED"
        raise BalsamTransitionError(message)
    # Only move the state along to POSTPROCESSED if the job is still in RUN_DONE
    # and the post.py returned normally. Otherwise, post.py might mark a job
    # FAILED, and you override it with POSTPROCESSED, breaking the workflow.
    if job.state == 'RUN_DONE':
        job.state = 'POSTPROCESSED'
    logger.debug(f"{job.cute_id} postprocess done")