def run(self, *user_flags):
    """Assemble and execute the MPI run command for all models.

    Builds the ``mpirun`` command line from the experiment config and
    per-model settings, launches the job, archives logs, and terminates
    payu (via ``sys.exit``) if the model exits with a non-zero code.
    """
    self.load_modules()

    f_out = open(self.stdout_fname, 'w')
    f_err = open(self.stderr_fname, 'w')

    # Set MPI environment variables
    env = self.config.get('env')

    # Explicitly check for `None`, in case of an empty `env:` entry
    if env is None:
        env = {}

    for var in env:
        if env[var] is None:
            env_value = ''
        else:
            env_value = str(env[var])
        os.environ[var] = env_value

    mpi_config = self.config.get('mpi', {})
    mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

    if self.config.get('scalasca', False):
        mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

    # MPI runtime flags
    mpi_flags = mpi_config.get('flags', [])
    if not mpi_flags:
        mpi_flags = self.config.get('mpirun', [])
        # TODO: Legacy config removal warning

    if not isinstance(mpi_flags, list):
        mpi_flags = [mpi_flags]

    # TODO: More uniform support needed here
    if self.config.get('scalasca', False):
        mpi_flags = ['\"{}\"'.format(f) for f in mpi_flags]

    # XXX: I think this may be broken
    if user_flags:
        mpi_flags.extend(list(user_flags))

    if self.debug:
        mpi_flags.append('--debug')

    mpi_progs = []
    for model in self.models:

        # Skip models without executables (e.g. couplers)
        if not model.exec_path:
            continue

        mpi_config = self.config.get('mpi', {})
        mpi_module = mpi_config.get('module', None)

        # Update MPI library module (if not explicitly set)
        # TODO: Check for MPI library mismatch across multiple binaries
        if mpi_module is None:
            mpi_module = envmod.lib_update(model.exec_path, 'libmpi.so')

        model_prog = []

        # Our MPICH wrapper does not support a working directory flag
        if not mpi_module.startswith('mvapich'):
            model_prog.append('-wdir {}'.format(model.work_path))

        # Append any model-specific MPI flags
        model_flags = model.config.get('mpiflags', [])
        if not isinstance(model_flags, list):
            model_prog.append(model_flags)
        else:
            model_prog.extend(model_flags)

        model_ncpus = model.config.get('ncpus')
        if model_ncpus:
            model_prog.append('-np {}'.format(model_ncpus))

        model_npernode = model.config.get('npernode')
        # TODO: New Open MPI format?
        if model_npernode:
            if model_npernode % 2 == 0:
                # Integer division: `/` would render a float
                # ("ppr:2.0:socket") under Python 3 and break mpirun.
                npernode_flag = ('-map-by ppr:{}:socket'
                                 ''.format(model_npernode // 2))
            else:
                npernode_flag = ('-map-by ppr:{}:node'
                                 ''.format(model_npernode))

            if self.config.get('scalasca', False):
                npernode_flag = '\"{}\"'.format(npernode_flag)
            model_prog.append(npernode_flag)

        if self.config.get('hpctoolkit', False):
            os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
            model_prog.append('hpcrun')

        for prof in self.profilers:
            if prof.runscript:
                # BUG FIX: `model_prog = model_prog.append(...)` rebound
                # the list to None (list.append returns None), breaking
                # every subsequent append below.
                model_prog.append(prof.runscript)

        model_prog.append(model.exec_prefix)
        model_prog.append(model.exec_path)

        mpi_progs.append(' '.join(model_prog))

    cmd = '{} {} {}'.format(mpi_runcmd,
                            ' '.join(mpi_flags),
                            ' : '.join(mpi_progs))

    for prof in self.profilers:
        cmd = prof.wrapper(cmd)

    # Expand shell variables inside flags
    if self.expand_shell_vars:
        cmd = os.path.expandvars(cmd)

    print(cmd)

    # Our MVAPICH wrapper does not support working directories
    # NOTE(review): `mpi_module` is only bound inside the model loop;
    # if no model has an executable this raises NameError — confirm
    # whether an empty model list is possible here.
    if mpi_module.startswith('mvapich'):
        curdir = os.getcwd()
        os.chdir(self.work_path)
    else:
        curdir = None

    # NOTE: This may not be necessary, since env seems to be getting
    # correctly updated.  Need to look into this.
    if env:
        # TODO: Replace with mpirun -x flag inputs
        proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                        env=os.environ.copy())
        proc.wait()
        rc = proc.returncode
    else:
        rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

    # Return to control directory
    if curdir:
        os.chdir(curdir)

    if self.runlog:
        self.runlog.commit()

    f_out.close()
    f_err.close()

    # Remove any empty output files (e.g. logs)
    for fname in os.listdir(self.work_path):
        fpath = os.path.join(self.work_path, fname)
        if os.path.getsize(fpath) == 0:
            os.remove(fpath)

    # Clean up any profiling output
    # TODO: Move after `rc` code check?
    for prof in self.profilers:
        prof.postprocess()

    # TODO: Need a model-specific cleanup method call here
    # NOTE: This does not appear to catch hanging jobs killed by PBS
    if rc != 0:
        # Backup logs for failed runs
        error_log_dir = os.path.join(self.archive_path, 'error_logs')
        mkdir_p(error_log_dir)

        # NOTE: This is PBS-specific
        job_id = os.environ.get('PBS_JOBID', '')

        for fname in (self.stdout_fname, self.stderr_fname):
            src = os.path.join(self.control_path, fname)
            # NOTE: This assumes standard .out/.err extensions
            dest = os.path.join(error_log_dir,
                                fname[:-4] + '.' + job_id + fname[-4:])
            print(src, dest)
            shutil.copyfile(src, dest)

        # Create the symlink to the logs if it does not exist
        make_symlink(self.archive_path, self.archive_sym_path)

        # Terminate payu
        sys.exit('payu: Model exited with error code {}; aborting.'
                 ''.format(rc))

    # Decrement run counter on successful run
    stop_file_path = os.path.join(self.control_path, 'stop_run')
    if os.path.isfile(stop_file_path):
        assert os.stat(stop_file_path).st_size == 0
        os.remove(stop_file_path)
        print('payu: Stop file detected; terminating resubmission.')
        self.n_runs = 0
    else:
        self.n_runs -= 1

    # Move logs to archive (or delete if empty)
    for f in (self.stdout_fname, self.stderr_fname):
        f_path = os.path.join(self.control_path, f)
        if os.path.getsize(f_path) == 0:
            os.remove(f_path)
        else:
            shutil.move(f_path, self.work_path)

    run_script = self.userscripts.get('run')
    if run_script:
        self.run_userscript(run_script)
def collate(self):
    """Collate FMS tiled netCDF output into single files.

    Runs ``mppnccombine`` (or ``mppnccombine-fast`` when ``collate.mpi``
    is set) over each tileset in the output directory, using a
    multiprocessing pool, and exits payu if any collation thread fails.
    """
    # Set the stacksize to be unlimited
    res.setrlimit(res.RLIMIT_STACK,
                  (res.RLIM_INFINITY, res.RLIM_INFINITY))

    collate_config = self.expt.config.get('collate', {})

    # The mpi flag implies using mppnccombine-fast
    mpi = collate_config.get('mpi', False)

    if mpi:
        default_exe = 'mppnccombine-fast'
    else:
        default_exe = 'mppnccombine'

    # Locate the FMS collation tool
    # Check config for collate executable
    mppnc_path = collate_config.get('exe')
    if mppnc_path is None:
        for f in os.listdir(self.expt.lab.bin_path):
            if f == default_exe:
                mppnc_path = os.path.join(self.expt.lab.bin_path, f)
                break
    else:
        if not os.path.isabs(mppnc_path):
            mppnc_path = os.path.join(self.expt.lab.bin_path, mppnc_path)

    assert mppnc_path, 'No mppnccombine program found'

    # Check config for collate command line options
    collate_flags = collate_config.get('flags')
    if collate_flags is None:
        if mpi:
            collate_flags = '-r'
        else:
            collate_flags = '-n4 -z -m -r'

    if mpi:
        # The output file is the first argument after the flags
        # and mppnccombine-fast uses an explicit -o flag to specify
        # the output
        collate_flags = " ".join([collate_flags, '-o'])
        mpi_module = envmod.lib_update(mppnc_path, 'libmpi.so')

    # Import list of collated files to ignore
    collate_ignore = collate_config.get('ignore')
    if collate_ignore is None:
        collate_ignore = []
    elif not isinstance(collate_ignore, list):
        collate_ignore = [collate_ignore]

    # Generate collated file list and identify the first tile
    tile_fnames = [f for f in os.listdir(self.output_path)
                   if f[-4:].isdigit() and f[-8:-4] == '.nc.']
    tile_fnames.sort()

    mnc_tiles = defaultdict(list)
    for t_fname in tile_fnames:
        t_base, t_ext = os.path.splitext(t_fname)
        t_ext = t_ext.lstrip('.')

        # Skip any files listed in the ignore list
        if t_base in collate_ignore:
            continue

        mnc_tiles[t_base].append(t_fname)

    cpucount = int(
        collate_config.get('ncpus', multiprocessing.cpu_count()))

    if mpi:
        # Default to one for mpi
        nprocesses = int(collate_config.get('threads', 1))
    else:
        nprocesses = int(collate_config.get('threads', cpucount))

    ncpusperprocess = int(cpucount / nprocesses)

    if ncpusperprocess == 1 and mpi:
        print("Warning: running collate with mpirun on a single processor")

    pool = multiprocessing.Pool(processes=nprocesses)

    # Collate each tileset into a single file
    results = []
    codes = []
    outputs = []
    for nc_fname in mnc_tiles:
        nc_path = os.path.join(self.output_path, nc_fname)

        # Remove the collated file if it already exists, since it is
        # probably from a failed collation attempt
        # TODO: Validate this somehow
        if os.path.isfile(nc_path):
            os.remove(nc_path)

        cmd = ' '.join([mppnc_path, collate_flags, nc_fname,
                        ' '.join(mnc_tiles[nc_fname])])
        if mpi:
            cmd = "mpirun -n {n} {cmd}".format(
                n=ncpusperprocess,
                cmd=cmd
            )
        print(cmd)
        results.append(
            pool.apply_async(cmdthread, args=(cmd, self.output_path)))

    pool.close()
    pool.join()

    for result in results:
        rc, op = result.get()
        codes.append(rc)
        outputs.append(op)

    # TODO: Categorise the return codes
    if any(rc is not None for rc in codes):
        for p, rc, op in zip(count(), codes, outputs):
            if rc is not None:
                # BUG FIX: `.format(p, rc)` passed positional args to
                # named fields (KeyError at runtime), and `msg` was an
                # undefined name — report the captured output `op`.
                print('payu: error: Thread {p} crashed with error code '
                      '{rc}.'.format(p=p, rc=rc), file=sys.stderr)
                print(' Error message:', file=sys.stderr)
                # NOTE(review): `op` may be bytes from the subprocess;
                # a later revision decodes it — confirm cmdthread's
                # return type.
                print(op, file=sys.stderr)
        sys.exit(-1)
def run(self, *user_flags):
    """Assemble and execute the MPI run command for all models.

    Builds the ``mpirun`` command line, launches the job, records job
    metadata (environment dump, manifest, job info YAML), archives the
    logs, and terminates payu if the model exits non-zero.
    """
    # XXX: This was previously done in reversion
    envmod.setup()

    self.load_modules()

    f_out = open(self.stdout_fname, 'w')
    f_err = open(self.stderr_fname, 'w')

    # Set MPI environment variables
    env = self.config.get('env')

    # Explicitly check for `None`, in case of an empty `env:` entry
    if env is None:
        env = {}

    for var in env:
        if env[var] is None:
            env_value = ''
        else:
            env_value = str(env[var])
        os.environ[var] = env_value

    mpi_config = self.config.get('mpi', {})
    mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

    if self.config.get('scalasca', False):
        mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

    # MPI runtime flags
    mpi_flags = mpi_config.get('flags', [])
    if not mpi_flags:
        mpi_flags = self.config.get('mpirun', [])
        # TODO: Legacy config removal warning

    if not isinstance(mpi_flags, list):
        mpi_flags = [mpi_flags]

    # TODO: More uniform support needed here
    if self.config.get('scalasca', False):
        mpi_flags = ['\"{0}\"'.format(f) for f in mpi_flags]

    # XXX: I think this may be broken
    if user_flags:
        mpi_flags.extend(list(user_flags))

    if self.debug:
        mpi_flags.append('--debug')

    mpi_progs = []
    for model in self.models:

        # Skip models without executables (e.g. couplers)
        if not model.exec_path_local:
            continue

        mpi_config = self.config.get('mpi', {})
        mpi_module = mpi_config.get('module', None)

        # Update MPI library module (if not explicitly set)
        # TODO: Check for MPI library mismatch across multiple binaries
        if mpi_module is None:
            mpi_module = envmod.lib_update(
                model.exec_path_local,
                'libmpi.so'
            )

        model_prog = []

        # Our MPICH wrapper does not support a working directory flag
        if not mpi_module.startswith('mvapich'):
            model_prog.append('-wdir {0}'.format(model.work_path))

        # Append any model-specific MPI flags
        model_flags = model.config.get('mpiflags', [])
        if not isinstance(model_flags, list):
            model_prog.append(model_flags)
        else:
            model_prog.extend(model_flags)

        model_ncpus = model.config.get('ncpus')
        if model_ncpus:
            model_prog.append('-np {0}'.format(model_ncpus))

        model_npernode = model.config.get('npernode')
        # TODO: New Open MPI format?
        if model_npernode:
            if model_npernode % 2 == 0:
                # Integer division: `/` would render a float
                # ("ppr:2.0:socket") under Python 3 and break mpirun.
                npernode_flag = ('-map-by ppr:{0}:socket'
                                 ''.format(model_npernode // 2))
            else:
                npernode_flag = ('-map-by ppr:{0}:node'
                                 ''.format(model_npernode))

            if self.config.get('scalasca', False):
                npernode_flag = '\"{0}\"'.format(npernode_flag)
            model_prog.append(npernode_flag)

        if self.config.get('hpctoolkit', False):
            os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
            model_prog.append('hpcrun')

        for prof in self.profilers:
            if prof.runscript:
                # BUG FIX: `model_prog = model_prog.append(...)` rebound
                # the list to None (list.append returns None), breaking
                # every subsequent append below.
                model_prog.append(prof.runscript)

        model_prog.append(model.exec_prefix)

        # Use the full path to symlinked exec_name in work as some
        # older MPI libraries complained executable was not in PATH
        model_prog.append(os.path.join(model.work_path, model.exec_name))

        mpi_progs.append(' '.join(model_prog))

    cmd = '{runcmd} {flags} {exes}'.format(
        runcmd=mpi_runcmd,
        flags=' '.join(mpi_flags),
        exes=' : '.join(mpi_progs)
    )

    for prof in self.profilers:
        cmd = prof.wrapper(cmd)

    # Expand shell variables inside flags
    if self.expand_shell_vars:
        cmd = os.path.expandvars(cmd)

    # TODO: Consider making this default
    if self.config.get('coredump', False):
        enable_core_dump()

    # Our MVAPICH wrapper does not support working directories
    if mpi_module.startswith('mvapich'):
        curdir = os.getcwd()
        os.chdir(self.work_path)
    else:
        curdir = None

    # Dump out environment
    with open(self.env_fname, 'w') as file:
        file.write(yaml.dump(dict(os.environ),
                             default_flow_style=False))

    self.runlog.create_manifest()
    if self.runlog.enabled:
        self.runlog.commit()

    # NOTE: This may not be necessary, since env seems to be getting
    # correctly updated.  Need to look into this.
    print(cmd)
    if env:
        # TODO: Replace with mpirun -x flag inputs
        proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                        env=os.environ.copy())
        proc.wait()
        rc = proc.returncode
    else:
        rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

    # Return to control directory
    if curdir:
        os.chdir(curdir)

    f_out.close()
    f_err.close()

    self.finish_time = datetime.datetime.now()

    info = get_job_info()

    if info is None:
        # Not being run under PBS, reverse engineer environment
        info = {
            'PAYU_PATH': os.path.dirname(self.payu_path)
        }

    # Add extra information to save to jobinfo
    info.update(
        {
            'PAYU_CONTROL_DIR': self.control_path,
            'PAYU_RUN_ID': self.run_id,
            'PAYU_CURRENT_RUN': self.counter,
            'PAYU_N_RUNS': self.n_runs,
            'PAYU_JOB_STATUS': rc,
            'PAYU_START_TIME': self.start_time.isoformat(),
            'PAYU_FINISH_TIME': self.finish_time.isoformat(),
            'PAYU_WALLTIME': "{0} s".format(
                (self.finish_time - self.start_time).total_seconds()
            ),
        }
    )

    # Dump job info
    with open(self.job_fname, 'w') as file:
        file.write(yaml.dump(info, default_flow_style=False))

    # Remove any empty output files (e.g. logs)
    for fname in os.listdir(self.work_path):
        fpath = os.path.join(self.work_path, fname)
        if os.path.getsize(fpath) == 0:
            os.remove(fpath)

    # Clean up any profiling output
    # TODO: Move after `rc` code check?
    for prof in self.profilers:
        prof.postprocess()

    # TODO: Need a model-specific cleanup method call here
    # NOTE: This does not appear to catch hanging jobs killed by PBS
    if rc != 0:
        # Backup logs for failed runs
        error_log_dir = os.path.join(self.archive_path, 'error_logs')
        mkdir_p(error_log_dir)

        # NOTE: This is PBS-specific
        job_id = get_job_id(short=False)

        if job_id == '':
            job_id = str(self.run_id)[:6]

        for fname in self.output_fnames:

            src = os.path.join(self.control_path, fname)

            stem, suffix = os.path.splitext(fname)
            dest = os.path.join(error_log_dir,
                                ".".join((stem, job_id)) + suffix)

            print(src, dest)

            shutil.copyfile(src, dest)

        # Create the symlink to the logs if it does not exist
        make_symlink(self.archive_path, self.archive_sym_path)

        error_script = self.userscripts.get('error')
        if error_script:
            self.run_userscript(error_script)

        # Terminate payu
        sys.exit('payu: Model exited with error code {0}; aborting.'
                 ''.format(rc))

    # Decrement run counter on successful run
    stop_file_path = os.path.join(self.control_path, 'stop_run')
    if os.path.isfile(stop_file_path):
        assert os.stat(stop_file_path).st_size == 0
        os.remove(stop_file_path)
        print('payu: Stop file detected; terminating resubmission.')
        self.n_runs = 0
    else:
        self.n_runs -= 1

    # Move logs to archive (or delete if empty)
    for f in self.output_fnames:
        f_path = os.path.join(self.control_path, f)
        if os.path.getsize(f_path) == 0:
            os.remove(f_path)
        else:
            shutil.move(f_path, self.work_path)

    run_script = self.userscripts.get('run')
    if run_script:
        self.run_userscript(run_script)
def run(self, *user_flags):
    """Assemble and execute the MPI run command for all models.

    Builds the ``mpirun`` command line from the experiment config and
    per-model settings, launches the job, archives logs, and terminates
    payu (via ``sys.exit``) if the model exits with a non-zero code.
    """
    # XXX: This was previously done in reversion
    envmod.setup()

    self.load_modules()

    f_out = open(self.stdout_fname, 'w')
    f_err = open(self.stderr_fname, 'w')

    # Set MPI environment variables
    env = self.config.get('env')

    # Explicitly check for `None`, in case of an empty `env:` entry
    if env is None:
        env = {}

    for var in env:
        if env[var] is None:
            env_value = ''
        else:
            env_value = str(env[var])
        os.environ[var] = env_value

    mpi_config = self.config.get('mpi', {})
    mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

    if self.config.get('scalasca', False):
        mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

    # MPI runtime flags
    mpi_flags = mpi_config.get('flags', [])
    if not mpi_flags:
        mpi_flags = self.config.get('mpirun', [])
        # TODO: Legacy config removal warning

    if not isinstance(mpi_flags, list):
        mpi_flags = [mpi_flags]

    # TODO: More uniform support needed here
    if self.config.get('scalasca', False):
        mpi_flags = ['\"{0}\"'.format(f) for f in mpi_flags]

    # XXX: I think this may be broken
    if user_flags:
        mpi_flags.extend(list(user_flags))

    if self.debug:
        mpi_flags.append('--debug')

    mpi_progs = []
    for model in self.models:

        # Skip models without executables (e.g. couplers)
        if not model.exec_path:
            continue

        mpi_config = self.config.get('mpi', {})
        mpi_module = mpi_config.get('module', None)

        # Update MPI library module (if not explicitly set)
        # TODO: Check for MPI library mismatch across multiple binaries
        if mpi_module is None:
            mpi_module = envmod.lib_update(model.exec_path, 'libmpi.so')

        model_prog = []

        # Our MPICH wrapper does not support a working directory flag
        if not mpi_module.startswith('mvapich'):
            model_prog.append('-wdir {0}'.format(model.work_path))

        # Append any model-specific MPI flags
        model_flags = model.config.get('mpiflags', [])
        if not isinstance(model_flags, list):
            model_prog.append(model_flags)
        else:
            model_prog.extend(model_flags)

        model_ncpus = model.config.get('ncpus')
        if model_ncpus:
            model_prog.append('-np {0}'.format(model_ncpus))

        model_npernode = model.config.get('npernode')
        # TODO: New Open MPI format?
        if model_npernode:
            if model_npernode % 2 == 0:
                # Integer division: `/` would render a float
                # ("ppr:2.0:socket") under Python 3 and break mpirun.
                npernode_flag = ('-map-by ppr:{0}:socket'
                                 ''.format(model_npernode // 2))
            else:
                npernode_flag = ('-map-by ppr:{0}:node'
                                 ''.format(model_npernode))

            if self.config.get('scalasca', False):
                npernode_flag = '\"{0}\"'.format(npernode_flag)
            model_prog.append(npernode_flag)

        if self.config.get('hpctoolkit', False):
            os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
            model_prog.append('hpcrun')

        for prof in self.profilers:
            if prof.runscript:
                # BUG FIX: `model_prog = model_prog.append(...)` rebound
                # the list to None (list.append returns None), breaking
                # every subsequent append below.
                model_prog.append(prof.runscript)

        model_prog.append(model.exec_prefix)
        model_prog.append(model.exec_path)

        mpi_progs.append(' '.join(model_prog))

    cmd = '{runcmd} {flags} {exes}'.format(
        runcmd=mpi_runcmd,
        flags=' '.join(mpi_flags),
        exes=' : '.join(mpi_progs)
    )

    for prof in self.profilers:
        cmd = prof.wrapper(cmd)

    # Expand shell variables inside flags
    if self.expand_shell_vars:
        cmd = os.path.expandvars(cmd)

    print(cmd)

    # Our MVAPICH wrapper does not support working directories
    # NOTE(review): `mpi_module` is only bound inside the model loop;
    # if no model has an executable this raises NameError — confirm
    # whether an empty model list is possible here.
    if mpi_module.startswith('mvapich'):
        curdir = os.getcwd()
        os.chdir(self.work_path)
    else:
        curdir = None

    # NOTE: This may not be necessary, since env seems to be getting
    # correctly updated.  Need to look into this.
    if env:
        # TODO: Replace with mpirun -x flag inputs
        proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                        env=os.environ.copy())
        proc.wait()
        rc = proc.returncode
    else:
        rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

    # Return to control directory
    if curdir:
        os.chdir(curdir)

    if self.runlog:
        self.runlog.commit()

    f_out.close()
    f_err.close()

    # Remove any empty output files (e.g. logs)
    for fname in os.listdir(self.work_path):
        fpath = os.path.join(self.work_path, fname)
        if os.path.getsize(fpath) == 0:
            os.remove(fpath)

    # Clean up any profiling output
    # TODO: Move after `rc` code check?
    for prof in self.profilers:
        prof.postprocess()

    # TODO: Need a model-specific cleanup method call here
    # NOTE: This does not appear to catch hanging jobs killed by PBS
    if rc != 0:
        # Backup logs for failed runs
        error_log_dir = os.path.join(self.archive_path, 'error_logs')
        mkdir_p(error_log_dir)

        # NOTE: This is PBS-specific
        job_id = os.environ.get('PBS_JOBID', '')

        for fname in (self.stdout_fname, self.stderr_fname):
            src = os.path.join(self.control_path, fname)
            # NOTE: This assumes standard .out/.err extensions
            dest = os.path.join(error_log_dir,
                                fname[:-4] + '.' + job_id + fname[-4:])
            print(src, dest)
            shutil.copyfile(src, dest)

        # Create the symlink to the logs if it does not exist
        make_symlink(self.archive_path, self.archive_sym_path)

        # Terminate payu
        sys.exit('payu: Model exited with error code {0}; aborting.'
                 ''.format(rc))

    # Decrement run counter on successful run
    stop_file_path = os.path.join(self.control_path, 'stop_run')
    if os.path.isfile(stop_file_path):
        assert os.stat(stop_file_path).st_size == 0
        os.remove(stop_file_path)
        print('payu: Stop file detected; terminating resubmission.')
        self.n_runs = 0
    else:
        self.n_runs -= 1

    # Move logs to archive (or delete if empty)
    for f in (self.stdout_fname, self.stderr_fname):
        f_path = os.path.join(self.control_path, f)
        if os.path.getsize(f_path) == 0:
            os.remove(f_path)
        else:
            shutil.move(f_path, self.work_path)

    run_script = self.userscripts.get('run')
    if run_script:
        self.run_userscript(run_script)
def collate(self):
    """Collate FMS tiled netCDF output (and optionally restarts).

    Tile files are grouped per directory and per basename, then each
    tileset is combined with ``mppnccombine`` (or ``mppnccombine-fast``
    under MPI) via a multiprocessing pool.  Exits payu if any collation
    thread fails.
    """
    # Set the stacksize to be unlimited
    res.setrlimit(res.RLIMIT_STACK,
                  (res.RLIM_INFINITY, res.RLIM_INFINITY))

    collate_config = self.expt.config.get('collate', {})

    # The mpi flag implies using mppnccombine-fast
    mpi = collate_config.get('mpi', False)

    if mpi:
        # Must use envmod to be able to load mpi modules for collation
        envmod.setup()
        self.expt.load_modules()
        default_exe = 'mppnccombine-fast'
    else:
        default_exe = 'mppnccombine'

    # Locate the FMS collation tool
    # Check config for collate executable
    mppnc_path = collate_config.get('exe')
    if mppnc_path is None:
        for f in os.listdir(self.expt.lab.bin_path):
            if f == default_exe:
                mppnc_path = os.path.join(self.expt.lab.bin_path, f)
                break
    else:
        if not os.path.isabs(mppnc_path):
            mppnc_path = os.path.join(self.expt.lab.bin_path, mppnc_path)

    assert mppnc_path, 'No mppnccombine program found'

    # Check config for collate command line options
    collate_flags = collate_config.get('flags')
    if collate_flags is None:
        if mpi:
            collate_flags = '-r'
        else:
            collate_flags = '-n4 -z -m -r'

    if mpi:
        # The output file is the first argument after the flags
        # and mppnccombine-fast uses an explicit -o flag to specify
        # the output
        collate_flags = " ".join([collate_flags, '-o'])
        envmod.lib_update(mppnc_path, 'libmpi.so')

    # Import list of collated files to ignore
    collate_ignore = collate_config.get('ignore')
    if collate_ignore is None:
        collate_ignore = []
    elif not isinstance(collate_ignore, list):
        collate_ignore = [collate_ignore]

    # Generate collated file list and identify the first tile
    tile_fnames = {}
    fnames = Fms.get_uncollated_files(self.output_path)
    tile_fnames[self.output_path] = fnames
    print(tile_fnames)

    if (collate_config.get('restart', False) and
            self.prior_restart_path is not None):
        # Add uncollated restart files
        fnames = Fms.get_uncollated_files(self.prior_restart_path)
        tile_fnames[self.prior_restart_path] = fnames

    # Group tiles per directory, then per output basename
    mnc_tiles = defaultdict(defaultdict(list).copy)
    for t_dir in tile_fnames:
        for t_fname in tile_fnames[t_dir]:
            t_base, t_ext = os.path.splitext(t_fname)
            t_ext = t_ext.lstrip('.')

            # Skip any files listed in the ignore list
            if t_base in collate_ignore:
                continue

            mnc_tiles[t_dir][t_base].append(t_fname)

    if mpi and collate_config.get('glob', True):
        # BUG FIX: `mnc_tiles` is now keyed by directory with nested
        # per-basename dicts, but this loop still iterated the top
        # level as if keys were basenames — the globstr was built from
        # a directory path and the comparison (dict vs. list) could
        # never succeed.  Iterate both levels and scan each tile
        # directory, not just `self.output_path`.
        for t_dir in mnc_tiles:
            for t_base in mnc_tiles[t_dir]:
                globstr = "{}.*".format(t_base)
                # Try an equivalent glob and check the same files are
                # returned
                mnc_glob = fnmatch.filter(os.listdir(t_dir), globstr)
                if mnc_tiles[t_dir][t_base] == sorted(mnc_glob):
                    mnc_tiles[t_dir][t_base] = [globstr, ]
                    print("Note: using globstr ({}) for collating {}"
                          .format(globstr, t_base))
                else:
                    print("Warning: cannot use globstr {} to collate {}"
                          .format(globstr, t_base))
                    if (len(mnc_tiles[t_dir][t_base]) >
                            MPI_FORK_MAX_FILE_LIMIT):
                        print("Warning: large number of tiles: {} "
                              .format(len(mnc_tiles[t_dir][t_base])))
                        print("Warning: collation will be slow and may fail")

    cpucount = int(
        collate_config.get('ncpus', multiprocessing.cpu_count()))

    if mpi:
        # Default to one for mpi
        nprocesses = int(collate_config.get('threads', 1))
    else:
        nprocesses = int(collate_config.get('threads', cpucount))

    ncpusperprocess = int(cpucount / nprocesses)

    if ncpusperprocess == 1 and mpi:
        print("Warning: running collate with mpirun on a single processor")

    pool = multiprocessing.Pool(processes=nprocesses)

    # Collate each tileset into a single file
    results = []
    codes = []
    outputs = []
    for output_path in mnc_tiles:
        for nc_fname in mnc_tiles[output_path]:
            nc_path = os.path.join(output_path, nc_fname)

            # Remove the collated file if it already exists, since it is
            # probably from a failed collation attempt
            # TODO: Validate this somehow
            if os.path.isfile(nc_path):
                os.remove(nc_path)

            cmd = ' '.join([mppnc_path, collate_flags, nc_fname,
                            ' '.join(mnc_tiles[output_path][nc_fname])])
            if mpi:
                cmd = "mpirun -n {} {}".format(ncpusperprocess, cmd)
            print(cmd)
            results.append(
                pool.apply_async(cmdthread, args=(cmd, output_path)))

    pool.close()
    pool.join()

    for result in results:
        rc, op = result.get()
        codes.append(rc)
        outputs.append(op)

    # TODO: Categorise the return codes
    if any(rc is not None for rc in codes):
        for p, rc, op in zip(count(), codes, outputs):
            if rc is not None:
                print('payu: error: Thread {p} crashed with error code '
                      '{rc}.'.format(p=p, rc=rc), file=sys.stderr)
                print(' Error message:', file=sys.stderr)
                print(op.decode(), file=sys.stderr)
        sys.exit(-1)
def run(self, *user_flags):
    """Assemble and execute the MPI run command for all models.

    Builds the ``mpirun`` command line, launches the job, records job
    metadata (environment dump, manifest, job info YAML), archives the
    logs, and terminates payu if the model exits non-zero.
    """
    # XXX: This was previously done in reversion
    envmod.setup()

    self.load_modules()

    f_out = open(self.stdout_fname, 'w')
    f_err = open(self.stderr_fname, 'w')

    # Set MPI environment variables
    env = self.config.get('env')

    # Explicitly check for `None`, in case of an empty `env:` entry
    if env is None:
        env = {}

    for var in env:
        if env[var] is None:
            env_value = ''
        else:
            env_value = str(env[var])
        os.environ[var] = env_value

    mpi_config = self.config.get('mpi', {})
    mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

    if self.config.get('scalasca', False):
        mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

    # MPI runtime flags
    mpi_flags = mpi_config.get('flags', [])
    if not mpi_flags:
        mpi_flags = self.config.get('mpirun', [])
        # TODO: Legacy config removal warning

    if not isinstance(mpi_flags, list):
        mpi_flags = [mpi_flags]

    # TODO: More uniform support needed here
    if self.config.get('scalasca', False):
        mpi_flags = ['\"{0}\"'.format(f) for f in mpi_flags]

    # XXX: I think this may be broken
    if user_flags:
        mpi_flags.extend(list(user_flags))

    if self.debug:
        mpi_flags.append('--debug')

    mpi_progs = []
    for model in self.models:

        # Skip models without executables (e.g. couplers)
        if not model.exec_path_local:
            continue

        mpi_config = self.config.get('mpi', {})
        mpi_module = mpi_config.get('module', None)

        # Update MPI library module (if not explicitly set)
        # TODO: Check for MPI library mismatch across multiple binaries
        if mpi_module is None:
            mpi_module = envmod.lib_update(
                model.exec_path_local,
                'libmpi.so'
            )

        model_prog = []

        # Our MPICH wrapper does not support a working directory flag
        if not mpi_module.startswith('mvapich'):
            model_prog.append('-wdir {0}'.format(model.work_path))

        # Append any model-specific MPI flags
        model_flags = model.config.get('mpiflags', [])
        if not isinstance(model_flags, list):
            model_prog.append(model_flags)
        else:
            model_prog.extend(model_flags)

        model_ncpus = model.config.get('ncpus')
        if model_ncpus:
            model_prog.append('-np {0}'.format(model_ncpus))

        model_npernode = model.config.get('npernode')
        # TODO: New Open MPI format?
        if model_npernode:
            if model_npernode % 2 == 0:
                # Integer division: `/` would render a float
                # ("ppr:2.0:socket") under Python 3 and break mpirun.
                npernode_flag = ('-map-by ppr:{0}:socket'
                                 ''.format(model_npernode // 2))
            else:
                npernode_flag = ('-map-by ppr:{0}:node'
                                 ''.format(model_npernode))

            if self.config.get('scalasca', False):
                npernode_flag = '\"{0}\"'.format(npernode_flag)
            model_prog.append(npernode_flag)

        if self.config.get('hpctoolkit', False):
            os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
            model_prog.append('hpcrun')

        for prof in self.profilers:
            if prof.runscript:
                # BUG FIX: `model_prog = model_prog.append(...)` rebound
                # the list to None (list.append returns None), breaking
                # every subsequent append below.
                model_prog.append(prof.runscript)

        model_prog.append(model.exec_prefix)

        # Use the full path to symlinked exec_name in work as some
        # older MPI libraries complained executable was not in PATH
        model_prog.append(os.path.join(model.work_path, model.exec_name))

        mpi_progs.append(' '.join(model_prog))

    cmd = '{runcmd} {flags} {exes}'.format(
        runcmd=mpi_runcmd,
        flags=' '.join(mpi_flags),
        exes=' : '.join(mpi_progs)
    )

    for prof in self.profilers:
        cmd = prof.wrapper(cmd)

    # Expand shell variables inside flags
    if self.expand_shell_vars:
        cmd = os.path.expandvars(cmd)

    # TODO: Consider making this default
    if self.config.get('coredump', False):
        enable_core_dump()

    # Our MVAPICH wrapper does not support working directories
    if mpi_module.startswith('mvapich'):
        curdir = os.getcwd()
        os.chdir(self.work_path)
    else:
        curdir = None

    # Dump out environment
    with open(self.env_fname, 'w') as file:
        file.write(yaml.dump(dict(os.environ),
                             default_flow_style=False))

    self.runlog.create_manifest()
    if self.runlog.enabled:
        self.runlog.commit()

    # NOTE: This may not be necessary, since env seems to be getting
    # correctly updated.  Need to look into this.
    print(cmd)
    if env:
        # TODO: Replace with mpirun -x flag inputs
        proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                        env=os.environ.copy())
        proc.wait()
        rc = proc.returncode
    else:
        rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

    # Return to control directory
    if curdir:
        os.chdir(curdir)

    f_out.close()
    f_err.close()

    self.finish_time = datetime.datetime.now()

    info = get_job_info()

    if info is None:
        # Not being run under PBS, reverse engineer environment
        info = {
            'PAYU_PATH': os.path.dirname(self.payu_path)
        }

    # Add extra information to save to jobinfo
    info.update(
        {
            'PAYU_CONTROL_DIR': self.control_path,
            'PAYU_RUN_ID': self.run_id,
            'PAYU_CURRENT_RUN': self.counter,
            'PAYU_N_RUNS': self.n_runs,
            'PAYU_JOB_STATUS': rc,
            'PAYU_START_TIME': self.start_time.isoformat(),
            'PAYU_FINISH_TIME': self.finish_time.isoformat(),
            'PAYU_WALLTIME': "{0} s".format(
                (self.finish_time - self.start_time).total_seconds()
            ),
        }
    )

    # Dump job info
    with open(self.job_fname, 'w') as file:
        file.write(yaml.dump(info, default_flow_style=False))

    # Remove any empty output files (e.g. logs)
    for fname in os.listdir(self.work_path):
        fpath = os.path.join(self.work_path, fname)
        if os.path.getsize(fpath) == 0:
            os.remove(fpath)

    # Clean up any profiling output
    # TODO: Move after `rc` code check?
    for prof in self.profilers:
        prof.postprocess()

    # TODO: Need a model-specific cleanup method call here
    # NOTE: This does not appear to catch hanging jobs killed by PBS
    if rc != 0:
        # Backup logs for failed runs
        error_log_dir = os.path.join(self.archive_path, 'error_logs')
        mkdir_p(error_log_dir)

        # NOTE: This is PBS-specific
        job_id = get_job_id(short=False)

        if job_id == '':
            # Coerce to str for safety, matching the sibling revision
            job_id = str(self.run_id)[:6]

        for fname in self.output_fnames:

            src = os.path.join(self.control_path, fname)

            stem, suffix = os.path.splitext(fname)
            dest = os.path.join(error_log_dir,
                                ".".join((stem, job_id)) + suffix)

            print(src, dest)

            shutil.copyfile(src, dest)

        # Create the symlink to the logs if it does not exist
        make_symlink(self.archive_path, self.archive_sym_path)

        # Terminate payu
        sys.exit('payu: Model exited with error code {0}; aborting.'
                 ''.format(rc))

    # Decrement run counter on successful run
    stop_file_path = os.path.join(self.control_path, 'stop_run')
    if os.path.isfile(stop_file_path):
        assert os.stat(stop_file_path).st_size == 0
        os.remove(stop_file_path)
        print('payu: Stop file detected; terminating resubmission.')
        self.n_runs = 0
    else:
        self.n_runs -= 1

    # Move logs to archive (or delete if empty)
    for f in self.output_fnames:
        f_path = os.path.join(self.control_path, f)
        if os.path.getsize(f_path) == 0:
            os.remove(f_path)
        else:
            shutil.move(f_path, self.work_path)

    run_script = self.userscripts.get('run')
    if run_script:
        self.run_userscript(run_script)
def collate(self):
    """Collate distributed FMS tile files into single netCDF outputs.

    Locates the `mppnccombine` (or `mppnccombine-fast` when `collate.mpi`
    is set) executable, builds one collation command per tileset found in
    the output (and optionally prior restart) directory, and runs them in
    parallel via a multiprocessing pool.  Exits the process with -1 if any
    collation command fails.
    """
    # Set the stacksize to be unlimited
    res.setrlimit(res.RLIMIT_STACK, (res.RLIM_INFINITY, res.RLIM_INFINITY))

    collate_config = self.expt.config.get('collate', {})

    # The mpi flag implies using mppnccombine-fast
    mpi = collate_config.get('mpi', False)

    if mpi:
        # Must use envmod to be able to load mpi modules for collation
        envmod.setup()
        self.expt.load_modules()
        default_exe = 'mppnccombine-fast'
    else:
        default_exe = 'mppnccombine'

    # Locate the FMS collation tool
    # Check config for collate executable; fall back to scanning the lab
    # bin directory for the default executable name
    mppnc_path = collate_config.get('exe')
    if mppnc_path is None:
        for f in os.listdir(self.expt.lab.bin_path):
            if f == default_exe:
                mppnc_path = os.path.join(self.expt.lab.bin_path, f)
                break
    else:
        # Relative paths are resolved against the lab bin directory
        if not os.path.isabs(mppnc_path):
            mppnc_path = os.path.join(self.expt.lab.bin_path, mppnc_path)

    assert mppnc_path, 'No mppnccombine program found'

    # Check config for collate command line options
    collate_flags = collate_config.get('flags')
    if collate_flags is None:
        if mpi:
            collate_flags = '-r'
        else:
            collate_flags = '-n4 -z -m -r'

    if mpi:
        # The output file is the first argument after the flags
        # and mppnccombine-fast uses an explicit -o flag to specify
        # the output
        collate_flags = " ".join([collate_flags, '-o'])
        envmod.lib_update(mppnc_path, 'libmpi.so')

    # Import list of collated files to ignore
    collate_ignore = collate_config.get('ignore')
    if collate_ignore is None:
        collate_ignore = []
    elif type(collate_ignore) != list:
        collate_ignore = [collate_ignore]

    # Generate collated file list and identify the first tile
    tile_fnames = {}
    fnames = Fms.get_uncollated_files(self.output_path)
    tile_fnames[self.output_path] = fnames
    print(tile_fnames)

    if (collate_config.get('restart', False)
            and self.prior_restart_path is not None):
        # Add uncollated restart files
        fnames = Fms.get_uncollated_files(self.prior_restart_path)
        tile_fnames[self.prior_restart_path] = fnames

    # Two-level mapping: directory -> tile basename -> list of tile files
    # mnc_tiles = defaultdict(list)
    mnc_tiles = defaultdict(defaultdict(list).copy)
    for t_dir in tile_fnames:
        for t_fname in tile_fnames[t_dir]:
            t_base, t_ext = os.path.splitext(t_fname)
            t_ext = t_ext.lstrip('.')

            # Skip any files listed in the ignore list
            if t_base in collate_ignore:
                continue

            mnc_tiles[t_dir][t_base].append(t_fname)

    # print(mnc_tiles)

    # NOTE(review): this loop iterates `mnc_tiles`, whose keys are
    # directories, but names the key `t_base` and treats the value as a
    # flat file list.  With the nested dict built above, `mnc_tiles[t_base]`
    # is a per-directory dict, so the comparison against `sorted(mnc_glob)`
    # and the `[globstr, ]` replacement look inconsistent — verify against
    # the single-level layout this code appears to have been written for.
    if mpi and collate_config.get('glob', True):
        for t_base in mnc_tiles:
            globstr = "{}.*".format(t_base)
            # Try an equivalent glob and check the same files are returned
            mnc_glob = fnmatch.filter(os.listdir(self.output_path), globstr)
            if mnc_tiles[t_base] == sorted(mnc_glob):
                mnc_tiles[t_base] = [globstr, ]
                print("Note: using globstr ({}) for collating {}"
                      .format(globstr, t_base))
            else:
                print("Warning: cannot use globstr {} to collate {}"
                      .format(globstr, t_base))
                if len(mnc_tiles[t_base]) > MPI_FORK_MAX_FILE_LIMIT:
                    print("Warning: large number of tiles: {} "
                          .format(len(mnc_tiles[t_base])))
                    print("Warning: collation will be slow and may fail")

    cpucount = int(collate_config.get('ncpus',
                                      multiprocessing.cpu_count()))

    if mpi:
        # Default to one for mpi
        nprocesses = int(collate_config.get('threads', 1))
    else:
        nprocesses = int(collate_config.get('threads', cpucount))

    ncpusperprocess = int(cpucount/nprocesses)

    if ncpusperprocess == 1 and mpi:
        print("Warning: running collate with mpirun on a single processor")

    pool = multiprocessing.Pool(processes=nprocesses)

    # Collate each tileset into a single file
    results = []
    codes = []
    outputs = []
    for output_path in mnc_tiles:
        for nc_fname in mnc_tiles[output_path]:
            nc_path = os.path.join(output_path, nc_fname)

            # Remove the collated file if it already exists, since it is
            # probably from a failed collation attempt
            # TODO: Validate this somehow
            if os.path.isfile(nc_path):
                os.remove(nc_path)

            cmd = ' '.join([mppnc_path, collate_flags, nc_fname,
                            ' '.join(mnc_tiles[output_path][nc_fname])])
            if mpi:
                cmd = "mpirun -n {} {}".format(ncpusperprocess, cmd)

            print(cmd)

            # Each command runs asynchronously; results collected below
            results.append(
                pool.apply_async(cmdthread, args=(cmd, output_path)))

    pool.close()
    pool.join()

    for result in results:
        rc, op = result.get()
        codes.append(rc)
        outputs.append(op)

    # TODO: Categorise the return codes
    # A non-None code indicates a crashed collation thread
    if any(rc is not None for rc in codes):
        for p, rc, op in zip(count(), codes, outputs):
            if rc is not None:
                print('payu: error: Thread {p} crashed with error code '
                      '{rc}.'.format(p=p, rc=rc), file=sys.stderr)
                print(' Error message:', file=sys.stderr)
                print(op.decode(), file=sys.stderr)
        sys.exit(-1)
def run(self, *user_flags):
    """Build and execute the MPI command for all model executables.

    Assembles an ``mpirun`` (or configured ``mpi.runcmd``) command line
    from the experiment config and each model's settings, optionally
    wraps it with profiling tools (Scalasca, HPCToolkit, OpenSpeedShop),
    runs it with stdout/stderr redirected to the experiment log files,
    then archives or removes the logs and decrements the run counter.

    Extra ``user_flags`` are appended to the global MPI flags.
    Terminates the process via ``sys.exit`` if the model fails.
    """
    self.load_modules()

    f_out = open(self.stdout_fname, 'w')
    f_err = open(self.stderr_fname, 'w')

    # Set MPI environment variables
    env = self.config.get('env')

    # Explicitly check for `None`, in case of an empty `env:` entry
    if env is None:
        env = {}

    for var in env:
        if env[var] is None:
            env_value = ''
        else:
            env_value = str(env[var])
        os.environ[var] = env_value

    mpi_config = self.config.get('mpi', {})
    mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

    if self.config.get('scalasca', False):
        mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

    # MPI runtime flags
    mpi_flags = self.config.get('mpirun')

    # Correct an empty mpirun entry
    if mpi_flags is None:
        mpi_flags = []
    if not isinstance(mpi_flags, list):
        mpi_flags = [mpi_flags]

    # TODO: More uniform support needed here
    if self.config.get('scalasca', False):
        mpi_flags = ['\"{}\"'.format(f) for f in mpi_flags]

    # XXX: I think this may be broken
    if user_flags:
        mpi_flags.extend(list(user_flags))

    if self.debug:
        mpi_flags.append('--debug')

    # Initialise before the loop so the post-run mvapich check cannot
    # raise NameError when no model provides an executable
    mpi_module = mpi_config.get('module', None)

    mpi_progs = []
    for model in self.models:
        # Skip models without executables (e.g. couplers)
        if not model.exec_path:
            continue

        mpi_module = mpi_config.get('module', None)

        # Update MPI library module (if not explicitly set)
        # TODO: Check for MPI library mismatch across multiple binaries
        if mpi_module is None:
            mpi_module = envmod.lib_update(model.exec_path, 'libmpi.so')

        model_prog = []

        # Our MPICH wrapper does not support a working directory flag
        if not mpi_module.startswith('mvapich'):
            model_prog.append('-wdir {}'.format(model.work_path))

        # Append any model-specific MPI flags
        model_flags = model.config.get('mpiflags', [])
        if not isinstance(model_flags, list):
            model_prog.append(model_flags)
        else:
            model_prog.extend(model_flags)

        model_ncpus = model.config.get('ncpus')
        if model_ncpus:
            model_prog.append('-np {}'.format(model_ncpus))

        model_npernode = model.config.get('npernode')
        # TODO: New Open MPI format?
        if model_npernode:
            if model_npernode % 2 == 0:
                # Floor division keeps the flag an integer under Python 3;
                # true division would emit e.g. `-npersocket 2.0`
                npernode_flag = '-npersocket {}'.format(model_npernode // 2)
            else:
                npernode_flag = '-npernode {}'.format(model_npernode)

            if self.config.get('scalasca', False):
                npernode_flag = '\"{}\"'.format(npernode_flag)
            model_prog.append(npernode_flag)

        if self.config.get('hpctoolkit', False):
            os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
            model_prog.append('hpcrun')

        for prof in self.profilers:
            if prof.wrapper:
                model_prog.append(prof.wrapper)

        model_prog.append(model.exec_prefix)
        model_prog.append(model.exec_path)

        mpi_progs.append(' '.join(model_prog))

    cmd = '{} {} {}'.format(mpi_runcmd, ' '.join(mpi_flags),
                            ' : '.join(mpi_progs))

    # Optional OpenSpeedShop wrapper around the whole MPI command
    oss = self.config.get('openspeedshop')
    if oss:
        oss_runcmd = oss.get('runcmd')
        if not oss_runcmd:
            print('payu: error: OpenSpeedShop requires an executable.')
            sys.exit(1)

        oss_hwc = oss.get('hwc')
        if oss_runcmd.startswith('osshwc') and not oss_hwc:
            print('payu: error: This OSS command requires hardware '
                  'counters.')
            sys.exit(1)

        # Only append the hardware counter spec when one was provided;
        # formatting a missing value would inject the literal "None"
        if oss_hwc:
            cmd = '{} "{}" {}'.format(oss_runcmd, cmd, oss_hwc)
        else:
            cmd = '{} "{}"'.format(oss_runcmd, cmd)

    print(cmd)

    # Our MVAPICH wrapper does not support working directories
    if mpi_module and mpi_module.startswith('mvapich'):
        curdir = os.getcwd()
        os.chdir(self.work_path)
    else:
        curdir = None

    if env:
        # TODO: Replace with mpirun -x flag inputs
        proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                        env=os.environ.copy())
        proc.wait()
        rc = proc.returncode
    else:
        rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

    # Return to control directory
    if curdir:
        os.chdir(curdir)

    if self.runlog:
        self.runlog.commit()

    f_out.close()
    f_err.close()

    # Remove any empty output files (e.g. logs)
    for fname in os.listdir(self.work_path):
        fpath = os.path.join(self.work_path, fname)
        if os.path.getsize(fpath) == 0:
            os.remove(fpath)

    # Clean up any profiling output
    # TODO: Move after `rc` code check?
    for prof in self.profilers:
        prof.postprocess()

    # TODO: Need a model-specific cleanup method call here
    if rc != 0:
        sys.exit('payu: Model exited with error code {}; aborting.'
                 ''.format(rc))

    # Decrement run counter on successful run
    stop_file_path = os.path.join(self.control_path, 'stop_run')
    if os.path.isfile(stop_file_path):
        assert os.stat(stop_file_path).st_size == 0
        os.remove(stop_file_path)
        print('payu: Stop file detected; terminating resubmission.')
        self.n_runs = 0
    else:
        self.n_runs -= 1

    # Move logs to archive (or delete if empty)
    for f in (self.stdout_fname, self.stderr_fname):
        f_path = os.path.join(self.control_path, f)
        if os.path.getsize(f_path) == 0:
            os.remove(f_path)
        else:
            shutil.move(f_path, self.work_path)

    run_script = self.userscripts.get('run')
    if run_script:
        self.run_userscript(run_script)
def run(self, *user_flags):
    """Build and execute the MPI command for all model executables.

    Assembles an ``mpirun`` (or configured ``mpi.runcmd``) command line
    from the experiment config and each model's settings, runs it with
    stdout/stderr redirected to the experiment log files, then archives
    or removes the logs and decrements the run counter.

    Extra ``user_flags`` are appended to the global MPI flags.
    Terminates the process via ``sys.exit`` if the model fails.
    """
    self.load_modules()

    f_out = open(self.stdout_fname, 'w')
    f_err = open(self.stderr_fname, 'w')

    # Set MPI environment variables
    env = self.config.get('env')

    # Explicitly check for `None`, in case of an empty `env:` entry
    if env is None:
        env = {}

    for var in env:
        if env[var] is None:
            env_value = ''
        else:
            env_value = str(env[var])
        os.environ[var] = env_value

    mpi_config = self.config.get('mpi', {})
    mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

    if self.config.get('scalasca', False):
        mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

    # MPI runtime flags
    mpi_flags = self.config.get('mpirun')

    # Correct an empty mpirun entry
    if mpi_flags is None:
        mpi_flags = []
    if not isinstance(mpi_flags, list):
        mpi_flags = [mpi_flags]

    # TODO: More uniform support needed here
    if self.config.get('scalasca', False):
        mpi_flags = ['\"{}\"'.format(f) for f in mpi_flags]

    # XXX: I think this may be broken
    if user_flags:
        mpi_flags.extend(list(user_flags))

    if self.debug:
        mpi_flags.append('--debug')

    # Initialise before the loop so the post-run mvapich check cannot
    # raise NameError when no model provides an executable
    mpi_module = mpi_config.get('module', None)

    mpi_progs = []
    for model in self.models:
        # Skip models without executables (e.g. couplers)
        if not model.exec_path:
            continue

        mpi_module = mpi_config.get('module', None)

        # Update MPI library module (if not explicitly set)
        # TODO: Check for MPI library mismatch across multiple binaries
        if mpi_module is None:
            mpi_module = envmod.lib_update(model.exec_path, 'libmpi.so')

        model_prog = []

        # Our MPICH wrapper does not support a working directory flag
        if not mpi_module.startswith('mvapich'):
            model_prog.append('-wdir {}'.format(model.work_path))

        # Append any model-specific MPI flags
        model_flags = model.config.get('mpiflags', [])
        if not isinstance(model_flags, list):
            model_prog.append(model_flags)
        else:
            model_prog.extend(model_flags)

        model_ncpus = model.config.get('ncpus')
        if model_ncpus:
            model_prog.append('-np {}'.format(model_ncpus))

        model_npernode = model.config.get('npernode')
        # TODO: New Open MPI format?
        if model_npernode:
            if model_npernode % 2 == 0:
                # Floor division keeps the flag an integer under Python 3;
                # true division would emit e.g. `-npersocket 2.0`
                npernode_flag = '-npersocket {}'.format(model_npernode // 2)
            else:
                npernode_flag = '-npernode {}'.format(model_npernode)

            if self.config.get('scalasca', False):
                npernode_flag = '\"{}\"'.format(npernode_flag)
            model_prog.append(npernode_flag)

        if self.config.get('hpctoolkit', False):
            os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
            model_prog.append('hpcrun')

        # Skip profilers without a wrapper command; appending a missing
        # wrapper would crash the join (or corrupt the command line)
        for prof in self.profilers:
            if prof.wrapper:
                model_prog.append(prof.wrapper)

        model_prog.append(model.exec_prefix)
        model_prog.append(model.exec_path)

        mpi_progs.append(' '.join(model_prog))

    cmd = '{} {} {}'.format(mpi_runcmd, ' '.join(mpi_flags),
                            ' : '.join(mpi_progs))
    print(cmd)

    # Our MVAPICH wrapper does not support working directories
    if mpi_module and mpi_module.startswith('mvapich'):
        curdir = os.getcwd()
        os.chdir(self.work_path)
    else:
        curdir = None

    if env:
        # TODO: Replace with mpirun -x flag inputs
        proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                        env=os.environ.copy())
        proc.wait()
        rc = proc.returncode
    else:
        rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

    # Return to control directory
    if curdir:
        os.chdir(curdir)

    if self.runlog:
        self.runlog.commit()

    f_out.close()
    f_err.close()

    # Remove any empty output files (e.g. logs)
    for fname in os.listdir(self.work_path):
        fpath = os.path.join(self.work_path, fname)
        if os.path.getsize(fpath) == 0:
            os.remove(fpath)

    # Clean up any profiling output
    # TODO: Move after `rc` code check?
    for prof in self.profilers:
        prof.postprocess()

    # TODO: Need a model-specific cleanup method call here
    if rc != 0:
        sys.exit('payu: Model exited with error code {}; aborting.'
                 ''.format(rc))

    # Decrement run counter on successful run
    stop_file_path = os.path.join(self.control_path, 'stop_run')
    if os.path.isfile(stop_file_path):
        assert os.stat(stop_file_path).st_size == 0
        os.remove(stop_file_path)
        print('payu: Stop file detected; terminating resubmission.')
        self.n_runs = 0
    else:
        self.n_runs -= 1

    # Move logs to archive (or delete if empty)
    for f in (self.stdout_fname, self.stderr_fname):
        f_path = os.path.join(self.control_path, f)
        if os.path.getsize(f_path) == 0:
            os.remove(f_path)
        else:
            shutil.move(f_path, self.work_path)

    run_script = self.userscripts.get('run')
    if run_script:
        self.run_userscript(run_script)