def archive(self):
    mkdir_p(self.restart_path)

    # Need to figure out the end date of the model.
    nml_path = os.path.join(self.work_path, 'namelists')
    nml = f90nml.read(nml_path)

    resubmit_inc = nml['NLSTCALL']['RUN_RESUBMIT_INC']
    runtime = um_time_to_time(resubmit_inc)
    runtime = datetime.timedelta(seconds=runtime)

    basis_time = nml['NLSTCALL']['MODEL_BASIS_TIME']
    init_date = um_date_to_date(basis_time)

    end_date = date_to_um_dump_date(init_date + runtime)

    restart_dump = os.path.join(self.work_path,
                                'aiihca.da{0}'.format(end_date))
    f_dst = os.path.join(self.restart_path, self.restart)
    if os.path.exists(restart_dump):
        shutil.copy(restart_dump, f_dst)
    else:
        print('payu: error: Model has not produced a restart dump file:\n'
              '{} does not exist.\n'
              'Check DUMPFREQim in namelists'.format(restart_dump))
def setup(self):
    super(Oasis, self).setup()

    # Copy OASIS data to the other submodels
    # TODO: Parse namcouple to determine filelist
    # TODO: Let users map files to models
    input_files = [
        f for f in os.listdir(self.work_path)
        if f not in self.config_files
    ]

    for model in self.expt.models:

        # Skip the oasis self-reference
        if model == self:
            continue

        # Skip models without a work_path (like access)
        if not hasattr(model, 'work_path'):
            continue

        mkdir_p(model.work_path)
        for f_name in (self.config_files + input_files):
            f_path = os.path.join(self.work_path, f_name)
            f_sympath = os.path.join(model.work_path, f_name)
            make_symlink(f_path, f_sympath)

    if self.expt.runtime:
        # TODO: Implement runtime patch to namcouple
        pass
def setup(self):
    # FMS initialisation
    super(Mom, self).setup()

    if not self.top_level_model:
        # Make log dir
        mkdir_p(os.path.join(self.work_path, 'log'))

    input_nml_path = os.path.join(self.work_path, 'input.nml')
    input_nml = f90nml.read(input_nml_path)

    use_core2iaf = self.config.get('core2iaf')
    if use_core2iaf:
        self.core2iaf_setup()

    # Set the runtime
    if self.expt.runtime:
        ocean_solo_nml = input_nml['ocean_solo_nml']

        ocean_solo_nml['years'] = self.expt.runtime['years']
        ocean_solo_nml['months'] = self.expt.runtime['months']
        ocean_solo_nml['days'] = self.expt.runtime['days']
        ocean_solo_nml['seconds'] = self.expt.runtime.get('seconds', 0)

        input_nml.write(input_nml_path, force=True)

    # Construct the land CPU mask
    if self.expt.config.get('mask_table', False):
        self.create_mask_table(input_nml)
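# A minimal standalone sketch (not payu source) of the runtime patch that
# setup() above applies, assuming a hypothetical input.nml in the current
# directory with an `ocean_solo_nml` group. f90nml is the same library the
# method uses; the runtime values are assumed placeholders.
import f90nml

runtime = {'years': 0, 'months': 0, 'days': 5}  # assumed experiment runtime

nml = f90nml.read('input.nml')
nml['ocean_solo_nml']['years'] = runtime['years']
nml['ocean_solo_nml']['months'] = runtime['months']
nml['ocean_solo_nml']['days'] = runtime['days']
nml['ocean_solo_nml']['seconds'] = runtime.get('seconds', 0)
nml.write('input.nml', force=True)  # overwrite the namelist in place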
def setup(self, force_archive=False):
    # Confirm that no output path already exists
    if os.path.exists(self.output_path):
        sys.exit('payu: error: Output path already exists.')

    mkdir_p(self.work_path)

    if force_archive:
        mkdir_p(self.archive_path)
        make_symlink(self.archive_path, self.archive_sym_path)

    # Archive the payu config
    # TODO: This just copies the existing config.yaml file, but we should
    #       reconstruct a new file including default values
    config_src = os.path.join(self.control_path, 'config.yaml')
    config_dst = os.path.join(self.work_path)
    shutil.copy(config_src, config_dst)

    # Stripe directory in Lustre
    # TODO: Make this more configurable
    do_stripe = self.config.get('stripedio', False)
    if do_stripe:
        cmd = 'lfs setstripe -c 8 -s 8m {0}'.format(self.work_path)
        sp.check_call(shlex.split(cmd))

    make_symlink(self.work_path, self.work_sym_path)

    # Set up all file manifests
    self.manifest.setup()

    for model in self.models:
        model.setup()

    # Call the macro-model setup
    if len(self.models) > 1:
        self.model.setup()

    # Use manifest to populate work directory
    self.manifest.make_links()

    # Copy manifests to work directory so they are archived on completion
    self.manifest.copy_manifests(os.path.join(self.work_path, 'manifests'))

    setup_script = self.userscripts.get('setup')
    if setup_script:
        self.run_userscript(setup_script)

    # Profiler setup
    expt_profs = self.config.get('profilers', [])
    if not isinstance(expt_profs, list):
        expt_profs = [expt_profs]

    for prof_name in expt_profs:
        ProfType = payu.profilers.index[prof_name]
        prof = ProfType(self)
        self.profilers.append(prof)

        # Testing
        prof.setup()
def archive(self):
    mkdir_p(self.archive_path)
    make_symlink(self.archive_path, self.archive_sym_path)

    # Remove work symlink
    if os.path.islink(self.work_sym_path):
        os.remove(self.work_sym_path)

    mkdir_p(self.restart_path)

    for model in self.models:
        model.archive()

    # Postprocess the model suite
    if len(self.models) > 1:
        self.model.archive()

    # Double-check that the run path does not exist
    if os.path.exists(self.output_path):
        sys.exit('payu: error: Output path already exists.')

    cmd = 'mv {} {}'.format(self.work_path, self.output_path)
    sp.check_call(shlex.split(cmd))

    # Remove old restart files
    # TODO: Move to subroutine
    restart_freq = self.config.get('restart_freq', default_restart_freq)
    restart_history = self.config.get('restart_history',
                                      default_restart_history)

    # Remove any outdated restart files
    prior_restart_dirs = [
        d for d in os.listdir(self.archive_path)
        if d.startswith('restart')
    ]

    for res_dir in prior_restart_dirs:
        res_idx = int(res_dir.lstrip('restart'))
        if (self.repeat_run or
                (not res_idx % restart_freq == 0 and
                 res_idx <= (self.counter - restart_history))):
            res_path = os.path.join(self.archive_path, res_dir)
            # Only delete real directories; ignore symbolic restart links
            if os.path.isdir(res_path):
                shutil.rmtree(res_path)

    if self.config.get('collate', True):
        cmd = 'payu collate -i {}'.format(self.counter)
        sp.check_call(shlex.split(cmd))

    if self.config.get('hpctoolkit', False):
        cmd = 'payu profile -i {}'.format(self.counter)
        sp.check_call(shlex.split(cmd))

    archive_script = self.userscripts.get('archive')
    if archive_script:
        self.run_userscript(archive_script)
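# Worked example (illustrative numbers only) of the pruning rule above,
# ignoring the repeat_run case: with restart_freq=5, restart_history=5 and
# counter=12, a restart index is removed iff it is not a multiple of
# restart_freq AND is no newer than counter - restart_history.
restart_freq, restart_history, counter = 5, 5, 12
kept = [idx for idx in range(counter + 1)
        if idx % restart_freq == 0 or idx > counter - restart_history]
print(kept)  # [0, 5, 8, 9, 10, 11, 12]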
def sweep(self, hard_sweep=False):
    # TODO: Fix the IO race conditions!

    if hard_sweep:
        if os.path.isdir(self.archive_path):
            print('Removing archive path {0}'.format(self.archive_path))
            cmd = 'rm -rf {0}'.format(self.archive_path)
            cmd = shlex.split(cmd)
            rc = sp.call(cmd)
            assert rc == 0

        if os.path.islink(self.archive_sym_path):
            print('Removing symlink {0}'.format(self.archive_sym_path))
            os.remove(self.archive_sym_path)

    if os.path.isdir(self.work_path):
        print('Removing work path {0}'.format(self.work_path))
        cmd = 'rm -rf {0}'.format(self.work_path)
        cmd = shlex.split(cmd)
        rc = sp.call(cmd)
        assert rc == 0

    if os.path.islink(self.work_sym_path):
        print('Removing symlink {0}'.format(self.work_sym_path))
        os.remove(self.work_sym_path)

    # TODO: model outstreams and pbs logs need to be handled separately
    default_job_name = os.path.basename(os.getcwd())
    short_job_name = str(self.config.get('jobname', default_job_name))[:15]

    logs = [
        f for f in os.listdir(os.curdir) if os.path.isfile(f) and (
            f.startswith(short_job_name + '.o') or
            f.startswith(short_job_name + '.e') or
            f.startswith(short_job_name[:13] + '_c.o') or
            f.startswith(short_job_name[:13] + '_c.e') or
            f.startswith(short_job_name[:13] + '_p.o') or
            f.startswith(short_job_name[:13] + '_p.e')
        )
    ]

    pbs_log_path = os.path.join(self.archive_path, 'pbs_logs')
    legacy_pbs_log_path = os.path.join(self.control_path, 'pbs_logs')

    if os.path.isdir(legacy_pbs_log_path):
        # TODO: New path may still exist!
        assert not os.path.isdir(pbs_log_path)
        print('payu: Moving pbs_logs to {0}'.format(pbs_log_path))
        shutil.move(legacy_pbs_log_path, pbs_log_path)
    else:
        mkdir_p(pbs_log_path)

    for f in logs:
        print('Moving log {0}'.format(f))
        shutil.move(f, os.path.join(pbs_log_path, f))

    # Remove stdout/err
    for f in (self.stdout_fname, self.stderr_fname):
        if os.path.isfile(f):
            os.remove(f)
def sweep(self, hard_sweep=False):
    # TODO: Fix the IO race conditions!

    # TODO: model outstreams and pbs logs need to be handled separately
    default_job_name = os.path.basename(os.getcwd())
    short_job_name = str(self.config.get('jobname', default_job_name))[:15]

    logs = [
        f for f in os.listdir(os.curdir) if os.path.isfile(f) and (
            f.startswith(short_job_name + '.o') or
            f.startswith(short_job_name + '.e') or
            f.startswith(short_job_name[:13] + '_c.o') or
            f.startswith(short_job_name[:13] + '_c.e') or
            f.startswith(short_job_name[:13] + '_p.o') or
            f.startswith(short_job_name[:13] + '_p.e')
        )
    ]

    pbs_log_path = os.path.join(self.archive_path, 'pbs_logs')
    legacy_pbs_log_path = os.path.join(self.control_path, 'pbs_logs')

    if os.path.isdir(legacy_pbs_log_path):
        # TODO: New path may still exist!
        assert not os.path.isdir(pbs_log_path)
        print('payu: Moving pbs_logs to {0}'.format(pbs_log_path))
        shutil.move(legacy_pbs_log_path, pbs_log_path)
    else:
        mkdir_p(pbs_log_path)

    for f in logs:
        print('Moving log {0}'.format(f))
        shutil.move(f, os.path.join(pbs_log_path, f))

    if hard_sweep:
        if os.path.isdir(self.archive_path):
            print('Removing archive path {0}'.format(self.archive_path))
            cmd = 'rm -rf {0}'.format(self.archive_path)
            cmd = shlex.split(cmd)
            rc = sp.call(cmd)
            assert rc == 0

        if os.path.islink(self.archive_sym_path):
            print('Removing symlink {0}'.format(self.archive_sym_path))
            os.remove(self.archive_sym_path)

    # Remove stdout/err and yaml dumps
    for f in self.output_fnames:
        if os.path.isfile(f):
            os.remove(f)

    if os.path.isdir(self.work_path):
        print('Removing work path {0}'.format(self.work_path))
        cmd = 'rm -rf {0}'.format(self.work_path)
        cmd = shlex.split(cmd)
        rc = sp.call(cmd)
        assert rc == 0

    if os.path.islink(self.work_sym_path):
        print('Removing symlink {0}'.format(self.work_sym_path))
        os.remove(self.work_sym_path)
def setup(self, force_archive=False):
    # Confirm that no output path already exists
    if os.path.exists(self.output_path):
        sys.exit('payu: error: Output path already exists.')

    mkdir_p(self.work_path)

    if force_archive:
        mkdir_p(self.archive_path)
        make_symlink(self.archive_path, self.archive_sym_path)

    # Archive the payu config
    # TODO: This just copies the existing config.yaml file, but we should
    #       reconstruct a new file including default values
    config_src = os.path.join(self.control_path, 'config.yaml')
    config_dst = os.path.join(self.work_path)
    shutil.copy(config_src, config_dst)

    # Stripe directory in Lustre
    # TODO: Make this more configurable
    do_stripe = self.config.get('stripedio', False)
    if do_stripe:
        cmd = 'lfs setstripe -c 8 -s 8m {0}'.format(self.work_path)
        sp.check_call(shlex.split(cmd))

    make_symlink(self.work_path, self.work_sym_path)

    # Set up all file manifests
    self.manifest.setup()

    for model in self.models:
        model.setup()

    # Call the macro-model setup
    if len(self.models) > 1:
        self.model.setup()

    self.manifest.check_manifests()

    # Copy manifests to work directory so they are archived on completion
    manifest_path = os.path.join(self.work_path, 'manifests')
    self.manifest.copy_manifests(manifest_path)

    setup_script = self.userscripts.get('setup')
    if setup_script:
        self.run_userscript(setup_script)

    # Profiler setup
    expt_profs = self.config.get('profilers', [])
    if not isinstance(expt_profs, list):
        expt_profs = [expt_profs]

    for prof_name in expt_profs:
        ProfType = payu.profilers.index[prof_name]
        prof = ProfType(self)
        self.profilers.append(prof)

        # Testing
        prof.setup()
def copy_manifests(self, path):
    mkdir_p(path)
    try:
        for mf in self.manifests:
            self.manifests[mf].copy(path)
    except IOError:
        pass
def archive(self):
    # Need to parse the data namelist file to access the endTime
    data_path = os.path.join(self.work_path, 'data')
    data_nml = self.read_namelist(data_path)

    # Save model time to restart next run
    with open(os.path.join(self.restart_path,
                           self.restart_calendar_file), 'w') as restart_file:
        restart = {'endtime': data_nml['parm03']['endTime']}
        restart_file.write(yaml.dump(restart, default_flow_style=False))

    # Remove symbolic links to input or pickup files:
    for f in os.listdir(self.work_path):
        f_path = os.path.join(self.work_path, f)
        if os.path.islink(f_path):
            os.remove(f_path)

    # Move files outside of mnc_* directories
    mnc_paths = [
        os.path.join(self.work_path, d)
        for d in os.listdir(self.work_path)
        if d.startswith('mnc_')
    ]

    for path in mnc_paths:
        for f in os.listdir(path):
            f_path = os.path.join(path, f)
            sh.move(f_path, self.work_path)
        os.rmdir(path)

    mkdir_p(self.restart_path)

    # Move pickups but don't include intermediate pickups ('ckpt's)
    restart_files = [
        f for f in os.listdir(self.work_path)
        if f.startswith('pickup') and
        not f.split('.')[1].startswith('ckpt')
    ]

    # Tar and compress the output files
    stdout_files = [
        f for f in os.listdir(self.work_path)
        if f.startswith('STDOUT.')
    ]

    if stdout_files:
        cmd = 'tar -C {0} -c -j -f {1}'.format(
            self.work_path,
            os.path.join(self.work_path, 'STDOUT.tar.bz2'))

        rc = sp.Popen(shlex.split(cmd) + stdout_files).wait()
        assert rc == 0

    for f in stdout_files:
        os.remove(os.path.join(self.work_path, f))

    for f in restart_files:
        f_src = os.path.join(self.work_path, f)
        sh.move(f_src, self.restart_path)
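# Hedged sketch (not payu source): the YAML calendar written in archive()
# above can be read back on the next run to recover the previous end time.
# The filename 'restart_calendar.yaml' is an assumed placeholder for
# self.restart_calendar_file.
import yaml

with open('restart_calendar.yaml') as calendar_file:
    prev = yaml.safe_load(calendar_file)

# Seconds, as stored from data_nml['parm03']['endTime']
endtime = prev['endtime']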
def archive(self):
    mkdir_p(self.archive_path)
    make_symlink(self.archive_path, self.archive_sym_path)

    # Remove work symlink
    if os.path.islink(self.work_sym_path):
        os.remove(self.work_sym_path)

    mkdir_p(self.restart_path)

    for model in self.models:
        model.archive()

    # Postprocess the model suite
    if len(self.models) > 1:
        self.model.archive()

    # Double-check that the run path does not exist
    if os.path.exists(self.output_path):
        sys.exit('payu: error: Output path already exists.')

    cmd = 'mv {} {}'.format(self.work_path, self.output_path)
    sp.check_call(shlex.split(cmd))

    # Remove old restart files
    # TODO: Move to subroutine
    restart_freq = self.config.get('restart_freq', default_restart_freq)
    restart_history = self.config.get('restart_history',
                                      default_restart_history)

    # Remove any outdated restart files
    prior_restart_dirs = [d for d in os.listdir(self.archive_path)
                          if d.startswith('restart')]

    for res_dir in prior_restart_dirs:
        res_idx = int(res_dir.lstrip('restart'))
        if (not res_idx % restart_freq == 0 and
                res_idx <= (self.counter - restart_history)):
            res_path = os.path.join(self.archive_path, res_dir)
            shutil.rmtree(res_path)

    if self.config.get('collate', True):
        cmd = 'payu collate -i {} -l {}'.format(self.counter,
                                                self.lab.basepath)
        sp.check_call(shlex.split(cmd))

    if self.config.get('hpctoolkit', False):
        cmd = 'payu profile -i {}'.format(self.counter)
        sp.check_call(shlex.split(cmd))

    archive_script = self.userscripts.get('archive')
    if archive_script:
        self.run_userscript(archive_script)
def build_model(self):

    if not self.repo_url:
        return

    # Check to see if executable already exists.
    if self.exec_path and os.path.isfile(self.exec_path):
        print('payu: warning: {0} will be overwritten.'
              ''.format(self.exec_path))

    # First step is always to go to the codebase.
    curdir = os.getcwd()

    # Do the build. First check whether there is a build command in the
    # config. If not check for the model default, otherwise just run make.
    try:
        build_path = self.config['build']['path_to_build_command']
    except KeyError:
        if self.build_path:
            build_path = self.build_path
        else:
            build_path = './'

    os.chdir(os.path.join(self.codebase_path, build_path))

    try:
        cmd = self.config['build']['command']
    except KeyError:
        if self.build_command:
            cmd = self.build_command
        else:
            cmd = 'make'

    print('Running command {0}'.format(cmd))
    sp.check_call(shlex.split(cmd))

    try:
        build_exec_path = os.path.join(self.codebase_path,
                                       self.config['build']['exec_path'])
    except KeyError:
        if self.build_exec_path:
            build_exec_path = self.build_exec_path
        else:
            build_exec_path = self.codebase_path

    # Copy new executable to bin dir
    if self.exec_path:
        # Create the bin path if it doesn't exist
        mkdir_p(self.expt.lab.bin_path)

        build_exec_path = os.path.join(build_exec_path, self.exec_name)
        shutil.copy(build_exec_path, self.exec_path)

    os.chdir(curdir)
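# Illustrative only: the `build` section of config.yaml that build_model()
# consumes, shown here as the parsed Python dict. All values are assumed
# placeholders, not payu defaults; each key falls back as described in the
# comments when absent.
config = {
    'build': {
        'path_to_build_command': 'src',  # cd into <codebase>/src (default './')
        'command': 'make -j4',           # build command (default 'make')
        'exec_path': 'bin',              # dir holding the built executable
    }
}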
def postprocess(self):
    gmon_dir = os.path.join(self.expt.work_path, 'gmon')
    mkdir_p(gmon_dir)

    gmon_fnames = [f for f in os.listdir(self.expt.work_path)
                   if f.startswith('gmon.out')]

    for gmon in gmon_fnames:
        f_src = os.path.join(self.expt.work_path, gmon)
        f_dst = os.path.join(gmon_dir, gmon)
        shutil.move(f_src, f_dst)
def setup(self):
    # FMS initialisation
    super(Mom, self).setup()

    if not self.top_level_model:
        # Make log dir
        mkdir_p(os.path.join(self.work_path, 'log'))

    input_nml_path = os.path.join(self.work_path, 'input.nml')
    input_nml = f90nml.read(input_nml_path)

    # Set the runtime
    if self.expt.runtime:
        ocean_solo_nml = input_nml['ocean_solo_nml']

        ocean_solo_nml['years'] = self.expt.runtime['years']
        ocean_solo_nml['months'] = self.expt.runtime['months']
        ocean_solo_nml['days'] = self.expt.runtime['days']
        ocean_solo_nml['seconds'] = self.expt.runtime.get('seconds', 0)

        input_nml.write(input_nml_path, force=True)

    # Construct the land CPU mask
    if self.expt.config.get('mask_table', False):
        # NOTE: This function actually creates a mask table using the
        #       `check_mask` command line tool.  But it is not very usable
        #       since you need to know the number of masked CPUs to submit
        #       the job.  It needs a rethink of the submission process.
        self.create_mask_table(input_nml)

    # NOTE: Don't expect this to be here forever...
    # Attempt to set a mask table from the input
    if self.config.get('mask', False):
        mask_path = os.path.join(self.work_input_path, 'ocean_mask_table')

        # Remove any existing mask
        # (If no reference mask is available, then we will not use one)
        if os.path.isfile(mask_path):
            os.remove(mask_path)

        # Reference mask table
        assert 'layout' in input_nml['ocean_model_nml']
        nx, ny = input_nml['ocean_model_nml'].get('layout')
        n_masked_cpus = nx * ny - self.config.get('ncpus')

        mask_table_fname = 'mask_table.{nmask}.{nx}x{ny}'.format(
            nmask=n_masked_cpus, nx=nx, ny=ny)

        ref_mask_path = os.path.join(self.work_input_path,
                                     mask_table_fname)

        # Set (or replace) mask table if reference is available
        if os.path.isfile(ref_mask_path):
            make_symlink(ref_mask_path, mask_path)
def archive(self):
    # TODO: Determine the exchange files
    restart_files = ['a2i.nc', 'i2a.nc', 'i2o.nc', 'o2i.nc']

    mkdir_p(self.restart_path)
    for f in restart_files:
        f_src = os.path.join(self.work_path, f)
        f_dst = os.path.join(self.restart_path, f)

        if os.path.exists(f_src):
            shutil.copy2(f_src, f_dst)
def archive(self):
    # TODO: Determine the exchange files
    restart_files = ['a2i.nc', 'i2a.nc', 'i2o.nc', 'o2i.nc']

    mkdir_p(self.restart_path)
    for f in restart_files:
        f_src = os.path.join(self.work_path, f)
        f_dst = os.path.join(self.restart_path, f)

        if os.path.exists(f_src):
            shutil.move(f_src, f_dst)
def sweep(self, hard_sweep=False):
    # TODO: Fix the IO race conditions!

    if hard_sweep:
        if os.path.isdir(self.archive_path):
            print('Removing archive path {}'.format(self.archive_path))
            cmd = 'rm -rf {}'.format(self.archive_path)
            cmd = shlex.split(cmd)
            rc = sp.call(cmd)
            assert rc == 0

        if os.path.islink(self.archive_sym_path):
            print('Removing symlink {}'.format(self.archive_sym_path))
            os.remove(self.archive_sym_path)

    if os.path.isdir(self.work_path):
        print('Removing work path {}'.format(self.work_path))
        cmd = 'rm -rf {}'.format(self.work_path)
        cmd = shlex.split(cmd)
        rc = sp.call(cmd)
        assert rc == 0

    if os.path.islink(self.work_sym_path):
        print('Removing symlink {}'.format(self.work_sym_path))
        os.remove(self.work_sym_path)

    # TODO: model outstreams and pbs logs need to be handled separately
    default_job_name = os.path.basename(os.getcwd())
    short_job_name = self.config.get('jobname', default_job_name)[:15]

    logs = [
        f for f in os.listdir(os.curdir) if os.path.isfile(f) and (
            f == self.stdout_fname or
            f == self.stderr_fname or
            f.startswith(short_job_name + '.o') or
            f.startswith(short_job_name + '.e') or
            f.startswith(short_job_name[:13] + '_c.o') or
            f.startswith(short_job_name[:13] + '_c.e') or
            f.startswith(short_job_name[:13] + '_p.o') or
            f.startswith(short_job_name[:13] + '_p.e')
        )
    ]

    pbs_log_path = os.path.join(os.curdir, 'pbs_logs')
    mkdir_p(pbs_log_path)

    for f in logs:
        print('Moving log {}'.format(f))
        os.rename(f, os.path.join(pbs_log_path, f))
def initialize(self):
    """Create the laboratory directories."""
    # Create laboratory directories
    mkdir_p(self.archive_path)
    mkdir_p(self.bin_path)
    mkdir_p(self.codebase_path)
    mkdir_p(self.input_basepath)
def sweep(self, hard_sweep=False):
    # TODO: Fix the IO race conditions!

    if hard_sweep:
        if os.path.isdir(self.archive_path):
            print('Removing archive path {}'.format(self.archive_path))
            cmd = 'rm -rf {}'.format(self.archive_path)
            cmd = shlex.split(cmd)
            rc = sp.call(cmd)
            assert rc == 0

        if os.path.islink(self.archive_sym_path):
            print('Removing symlink {}'.format(self.archive_sym_path))
            os.remove(self.archive_sym_path)

    if os.path.isdir(self.work_path):
        print('Removing work path {}'.format(self.work_path))
        cmd = 'rm -rf {}'.format(self.work_path)
        cmd = shlex.split(cmd)
        rc = sp.call(cmd)
        assert rc == 0

    if os.path.islink(self.work_sym_path):
        print('Removing symlink {}'.format(self.work_sym_path))
        os.remove(self.work_sym_path)

    # TODO: model outstreams and pbs logs need to be handled separately
    default_job_name = os.path.basename(os.getcwd())
    short_job_name = self.config.get('jobname', default_job_name)[:15]

    logs = [
        f for f in os.listdir(os.curdir)
        if os.path.isfile(f) and (f == self.stdout_fname or
                                  f == self.stderr_fname or
                                  f.startswith('slurm-')
                                  # f.startswith(short_job_name + '.o') or
                                  # f.startswith(short_job_name + '.e') or
                                  # f.startswith(short_job_name[:13] + '_c.o') or
                                  # f.startswith(short_job_name[:13] + '_c.e')
                                  )
    ]

    pbs_log_path = os.path.join(os.curdir, 'slurm_logs')
    mkdir_p(pbs_log_path)

    for f in logs:
        print('Moving log {}'.format(f))
        os.rename(f, os.path.join(pbs_log_path, f))
def archive(self):
    super(UnifiedModel, self).archive()

    # Delete all the stdout log files except the root PE
    # Sorts to ensure root PE is first entry
    files = sorted(glob.glob(
        os.path.join(self.work_path, 'atm.fort6.pe*')),
        key=lambda name: int(name.rpartition('.')[-1][2:]))
    if len(files) > 1:
        for f_path in files[1:]:
            os.remove(f_path)

    mkdir_p(self.restart_path)

    # Need to figure out the end date of the model.
    nml_path = os.path.join(self.work_path, 'namelists')
    nml = f90nml.read(nml_path)

    resubmit_inc = nml['NLSTCALL']['RUN_RESUBMIT_INC']
    runtime = um_time_to_time(resubmit_inc)
    # runtime = datetime.timedelta(seconds=runtime)

    basis_time = nml['NLSTCALL']['MODEL_BASIS_TIME']
    init_date = um_date_to_date(basis_time)

    end_date = cal.date_plus_seconds(init_date, runtime, cal.GREGORIAN)

    # Save model time to restart next run
    with open(os.path.join(self.restart_path,
                           self.restart_calendar_file), 'w') as restart_file:
        restart_file.write(yaml.dump({'end_date': end_date},
                                     default_flow_style=False))

    end_date = date_to_um_dump_date(end_date)

    restart_dump = os.path.join(self.work_path,
                                'aiihca.da{0}'.format(end_date))
    f_dst = os.path.join(self.restart_path, self.restart)
    if os.path.exists(restart_dump):
        shutil.copy(restart_dump, f_dst)
    else:
        print('payu: error: Model has not produced a restart dump file:\n'
              '{} does not exist.\n'
              'Check DUMPFREQim in namelists'.format(restart_dump))
def archive(self, **kwargs):

    # Remove symbolic links
    for f in os.listdir(self.work_input_path):
        f_path = os.path.join(self.work_input_path, f)
        if os.path.islink(f_path):
            os.remove(f_path)

    # Archive the restart files
    mkdir_p(self.restart_path)

    restart_files = [f for f in os.listdir(self.work_path)
                     if f.endswith('lastday.nc')]

    for f in restart_files:
        f_src = os.path.join(self.work_path, f)
        shutil.move(f_src, self.restart_path)
def archive(self):

    # Remove symbolic links to input or pickup files:
    for f in os.listdir(self.work_path):
        f_path = os.path.join(self.work_path, f)
        if os.path.islink(f_path):
            os.remove(f_path)

    # Move files outside of mnc_* directories
    mnc_paths = [
        os.path.join(self.work_path, d)
        for d in os.listdir(self.work_path)
        if d.startswith('mnc_')
    ]

    for path in mnc_paths:
        for f in os.listdir(path):
            f_path = os.path.join(path, f)
            sh.move(f_path, self.work_path)
        os.rmdir(path)

    mkdir_p(self.restart_path)

    # Move pickups but don't include intermediate pickups ('ckpt's)
    restart_files = [
        f for f in os.listdir(self.work_path)
        if f.startswith('pickup') and
        not f.split('.')[1].startswith('ckpt')
    ]

    # Tar and compress the output files
    stdout_files = [
        f for f in os.listdir(self.work_path)
        if f.startswith('STDOUT.')
    ]

    if stdout_files:
        cmd = 'tar -C {0} -c -j -f {1}'.format(
            self.work_path,
            os.path.join(self.work_path, 'STDOUT.tar.bz2'))

        rc = sp.Popen(shlex.split(cmd) + stdout_files).wait()
        assert rc == 0

    for f in stdout_files:
        os.remove(os.path.join(self.work_path, f))

    for f in restart_files:
        f_src = os.path.join(self.work_path, f)
        sh.move(f_src, self.restart_path)
def archive(self):
    mkdir_p(self.restart_path)

    # Need to figure out the end date of the model.
    nml_path = os.path.join(self.work_path, "namelists")
    nml = f90nml.read(nml_path)

    resubmit_inc = nml["NLSTCALL"]["RUN_RESUBMIT_INC"]
    runtime = um_time_to_time(resubmit_inc)
    runtime = datetime.timedelta(seconds=runtime)

    basis_time = nml["NLSTCALL"]["MODEL_BASIS_TIME"]
    init_date = um_date_to_date(basis_time)

    end_date = date_to_um_dump_date(init_date + runtime)

    restart_dump = os.path.join(self.work_path,
                                "aiihca.da{}".format(end_date))
    f_dst = os.path.join(self.restart_path, self.restart)
    shutil.copy(restart_dump, f_dst)
def __init__(self, config, reproduce):

    # Manifest control configuration
    self.manifest_config = config

    # Not currently supporting specifying hash functions
    self.fast_hashes = self.manifest_config.get('fasthash', fast_hashes)
    self.full_hashes = self.manifest_config.get('fullhash', full_hashes)

    if type(self.fast_hashes) is str:
        self.fast_hashes = [self.fast_hashes, ]
    if type(self.full_hashes) is str:
        self.full_hashes = [self.full_hashes, ]

    self.ignore = self.manifest_config.get('ignore', ['.*'])
    if isinstance(self.ignore, str):
        self.ignore = [self.ignore]

    # Initialise manifests and reproduce flags
    self.manifests = {}
    self.have_manifest = {}
    reproduce_config = self.manifest_config.get('reproduce', {})
    self.reproduce = {}
    for mf in ['input', 'restart', 'exe']:
        self.init_mf(mf)
        self.reproduce[mf] = reproduce_config.get(mf, reproduce)

    # Make sure the manifests directory exists
    mkdir_p(os.path.dirname(self.manifests['exe'].path))

    # Set flag to auto-scan input directories
    self.scaninputs = self.manifest_config.get('scaninputs', True)

    if self.reproduce['input'] and self.scaninputs:
        print("scaninputs set to False when reproduce input is True")
        self.scaninputs = False
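# Illustrative only: the `manifest` section of config.yaml that the
# constructor above consumes, shown as the parsed Python dict. All values
# are assumed placeholders; the real hash defaults come from the
# fast_hashes/full_hashes module-level lists.
manifest_config = {
    'ignore': ['.*', '*.log'],      # patterns to skip when scanning
    'scaninputs': True,             # auto-scan input dirs (default True)
    'reproduce': {
        'input': False,             # per-manifest reproduce overrides
        'restart': False,
        'exe': True,
    },
}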
def __init__(self, expt, reproduce):

    # Inherit experiment configuration
    self.expt = expt

    # Manifest control configuration
    self.manifest_config = self.expt.config.get('manifest', {})

    # Not currently supporting specifying hash functions
    # self.hash_functions = manifest_config.get(
    #     'hashfns',
    #     ['nchash', 'binhash', 'md5']
    # )

    self.ignore = self.manifest_config.get('ignore', ['.*'])
    if isinstance(self.ignore, str):
        self.ignore = [self.ignore]

    # Initialise manifests
    self.manifests = {}
    for mf in ['input', 'restart', 'exe']:
        self.manifests[mf] = PayuManifest(
            os.path.join('manifests', '{}.yaml'.format(mf)),
            ignore=self.ignore)

    self.have_manifest = {}
    for mf in self.manifests:
        self.have_manifest[mf] = False

    # Set reproduce flags
    self.reproduce_config = self.manifest_config.get('reproduce', {})
    self.reproduce = {}
    for mf in self.manifests.keys():
        self.reproduce[mf] = self.reproduce_config.get(mf, reproduce)

    # Make sure the manifests directory exists
    mkdir_p(os.path.dirname(self.manifests['exe'].path))

    self.scaninputs = self.manifest_config.get('scaninputs', True)
def setup(self):

    # Create experiment directory structure
    mkdir_p(self.work_input_path)
    mkdir_p(self.work_restart_path)
    mkdir_p(self.work_output_path)

    # Copy configuration files from control path
    for f_name in self.config_files:
        f_path = os.path.join(self.control_path, f_name)
        shutil.copy(f_path, self.work_path)

    for f_name in self.optional_config_files:
        f_path = os.path.join(self.control_path, f_name)
        try:
            shutil.copy(f_path, self.work_path)
        except IOError as exc:
            if exc.errno == errno.ENOENT:
                pass
            else:
                raise

    # Add restart files from prior run to restart manifest
    if (not self.expt.manifest.have_manifest['restart'] and
            self.prior_restart_path):
        restart_files = self.get_prior_restart_files()
        for f_name in restart_files:
            f_orig = os.path.join(self.prior_restart_path, f_name)
            f_link = os.path.join(self.work_init_path_local, f_name)
            self.expt.manifest.add_filepath('restart', f_link, f_orig,
                                            self.copy_inputs)

    # Add input files to manifest if we don't already have a populated
    # input manifest, or we specify scan_inputs is True (default)
    if (not self.expt.manifest.have_manifest['input'] or
            self.expt.manifest.scaninputs):
        # Add files to manifest
        for input_path in self.input_paths:
            input_files = os.listdir(input_path)
            for f_name in input_files:
                f_orig = os.path.join(input_path, f_name)
                f_link = os.path.join(self.work_input_path_local, f_name)
                # Do not use input file if it is in RESTART
                if not os.path.exists(f_link):
                    self.expt.manifest.add_filepath('input', f_link, f_orig,
                                                    self.copy_inputs)

    # Make symlink to executable in work directory
    if self.exec_path:
        # Add to exe manifest (this is always done so any change in exe
        # path will be picked up)
        self.expt.manifest.add_filepath('exe', self.exec_path_local,
                                        self.exec_path)

    timestep = self.config.get('timestep')
    if timestep:
        self.set_timestep(timestep)
def setup(self):

    # Create experiment directory structure
    mkdir_p(self.work_input_path)
    mkdir_p(self.work_restart_path)
    mkdir_p(self.work_output_path)

    # Copy configuration files from control path
    for f_name in self.config_files:
        f_path = os.path.join(self.control_path, f_name)
        shutil.copy(f_path, self.work_path)

    for f_name in self.optional_config_files:
        f_path = os.path.join(self.control_path, f_name)
        try:
            shutil.copy(f_path, self.work_path)
        except IOError as exc:
            if exc.errno == errno.ENOENT:
                pass
            else:
                raise

    # Link restart files from prior run
    if self.prior_restart_path and not self.expt.repeat_run:
        restart_files = self.get_prior_restart_files()
        for f_name in restart_files:
            f_restart = os.path.join(self.prior_restart_path, f_name)
            f_input = os.path.join(self.work_init_path, f_name)
            if self.copy_restarts:
                shutil.copy(f_restart, f_input)
            else:
                make_symlink(f_restart, f_input)

    # Link input data
    for input_path in self.input_paths:
        input_files = os.listdir(input_path)
        for f_name in input_files:
            f_input = os.path.join(input_path, f_name)
            f_work_input = os.path.join(self.work_input_path, f_name)
            # Do not use input file if it is in RESTART
            if not os.path.exists(f_work_input):
                if self.copy_inputs:
                    shutil.copy(f_input, f_work_input)
                else:
                    make_symlink(f_input, f_work_input)
                # Some models overwrite their own input for restarts
                # (e.g. OASIS)
                if self.make_inputs_writeable:
                    os.chmod(f_work_input,
                             stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP)

    t_step = self.config.get('timestep')
    if t_step:
        self.set_timestep(t_step)
def setup(self):

    # Create experiment directory structure
    mkdir_p(self.work_input_path)
    mkdir_p(self.work_restart_path)
    mkdir_p(self.work_output_path)

    # Copy configuration files from control path
    for f_name in self.config_files:
        f_path = os.path.join(self.control_path, f_name)
        shutil.copy(f_path, self.work_path)

    for f_name in self.optional_config_files:
        f_path = os.path.join(self.control_path, f_name)
        try:
            shutil.copy(f_path, self.work_path)
        except IOError as exc:
            if exc.errno == errno.ENOENT:
                pass
            else:
                raise

    # Link restart files from prior run
    if self.prior_restart_path and not self.expt.repeat_run:
        restart_files = self.get_prior_restart_files()
        for f_name in restart_files:
            f_restart = os.path.join(self.prior_restart_path, f_name)
            f_input = os.path.join(self.work_init_path, f_name)
            if self.copy_restarts:
                shutil.copy(f_restart, f_input)
            else:
                make_symlink(f_restart, f_input)

    # Link input data
    for input_path in self.input_paths:
        input_files = os.listdir(input_path)
        for f_name in input_files:
            f_input = os.path.join(input_path, f_name)
            f_work_input = os.path.join(self.work_input_path, f_name)
            # Do not use input file if it is in RESTART
            if not os.path.exists(f_work_input):
                if self.copy_inputs:
                    shutil.copy(f_input, f_work_input)
                else:
                    make_symlink(f_input, f_work_input)

    timestep = self.config.get('timestep')
    if timestep:
        self.set_timestep(timestep)
def archive(self):
    # Create an empty restart directory
    mkdir_p(self.restart_path)

    shutil.rmtree(self.work_input_path)
def setup(self):
    print("Setting up {model}".format(model=self.name))

    # Create experiment directory structure
    mkdir_p(self.work_init_path)
    mkdir_p(self.work_input_path)
    mkdir_p(self.work_restart_path)
    mkdir_p(self.work_output_path)

    # Copy configuration files from control path
    for f_name in self.config_files:
        f_path = os.path.join(self.control_path, f_name)
        shutil.copy(f_path, self.work_path)

    for f_name in self.optional_config_files:
        f_path = os.path.join(self.control_path, f_name)
        try:
            shutil.copy(f_path, self.work_path)
        except IOError as exc:
            if exc.errno == errno.ENOENT:
                pass
            else:
                raise

    # Add restart files from prior run to restart manifest
    if (not self.expt.manifest.have_manifest['restart'] and
            self.prior_restart_path):
        restart_files = self.get_prior_restart_files()
        for f_name in restart_files:
            f_orig = os.path.join(self.prior_restart_path, f_name)
            f_link = os.path.join(self.work_init_path_local, f_name)
            self.expt.manifest.add_filepath('restart', f_link, f_orig,
                                            self.copy_restarts)

    # Add input files to manifest if we don't already have a populated
    # input manifest, or we specify scaninputs is True (default)
    if (not self.expt.manifest.have_manifest['input'] or
            self.expt.manifest.scaninputs):
        for input_path in self.input_paths:
            if os.path.isfile(input_path):
                # Build a mock walk iterator for a single file
                fwalk = iter([(os.path.dirname(input_path),
                               [],
                               [os.path.basename(input_path)])])
                # Overwrite the input_path as a directory
                input_path = os.path.dirname(input_path)
            else:
                fwalk = os.walk(input_path)

            for path, dirs, files in fwalk:
                workrelpath = os.path.relpath(path, input_path)
                subdir = os.path.normpath(
                    os.path.join(self.work_input_path_local, workrelpath))
                if not os.path.exists(subdir):
                    os.mkdir(subdir)

                for f_name in files:
                    f_orig = os.path.join(path, f_name)
                    f_link = os.path.join(self.work_input_path_local,
                                          workrelpath, f_name)
                    # Do not use input file if it is in RESTART
                    if not os.path.exists(f_link):
                        self.expt.manifest.add_filepath(
                            'input', f_link, f_orig, self.copy_inputs)

    # Make symlink to executable in work directory
    if self.exec_path:
        # Add to exe manifest (this is always done so any change in exe
        # path will be picked up)
        self.expt.manifest.add_filepath('exe', self.exec_path_local,
                                        self.exec_path)

    timestep = self.config.get('timestep')
    if timestep:
        self.set_timestep(timestep)
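# Minimal standalone demo of the single-file "mock walk" trick used above:
# a one-tuple iterator has the same (path, dirs, files) shape that os.walk
# yields, so the file and directory cases can share one loop. The path is an
# assumed placeholder; no filesystem access happens here.
import os

single = '/data/forcing/salt.nc'
fwalk = iter([(os.path.dirname(single), [], [os.path.basename(single)])])
for path, dirs, files in fwalk:
    print(path, dirs, files)  # /data/forcing [] ['salt.nc']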
def setup(self):
    super(Yatm, self).setup()

    # Make log dir
    mkdir_p(os.path.join(self.work_path, 'log'))
def archive(self):
    # Create an empty restart directory
    mkdir_p(self.restart_path)
def run(self, *user_flags):

    # XXX: This was previously done in reversion
    envmod.setup()

    self.load_modules()

    f_out = open(self.stdout_fname, 'w')
    f_err = open(self.stderr_fname, 'w')

    # Set MPI environment variables
    env = self.config.get('env')

    # Explicitly check for `None`, in case of an empty `env:` entry
    if env is None:
        env = {}

    for var in env:
        if env[var] is None:
            env_value = ''
        else:
            env_value = str(env[var])
        os.environ[var] = env_value

    mpi_config = self.config.get('mpi', {})
    mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

    if self.config.get('scalasca', False):
        mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

    # MPI runtime flags
    mpi_flags = mpi_config.get('flags', [])
    if not mpi_flags:
        mpi_flags = self.config.get('mpirun', [])
        # TODO: Legacy config removal warning

    if type(mpi_flags) != list:
        mpi_flags = [mpi_flags]

    # TODO: More uniform support needed here
    if self.config.get('scalasca', False):
        mpi_flags = ['\"{0}\"'.format(f) for f in mpi_flags]

    # XXX: I think this may be broken
    if user_flags:
        mpi_flags.extend(list(user_flags))

    if self.debug:
        mpi_flags.append('--debug')

    mpi_progs = []
    for model in self.models:

        # Skip models without executables (e.g. couplers)
        if not model.exec_path_local:
            continue

        mpi_config = self.config.get('mpi', {})
        mpi_module = mpi_config.get('module', None)

        # Update MPI library module (if not explicitly set)
        # TODO: Check for MPI library mismatch across multiple binaries
        if mpi_module is None:
            mpi_module = envmod.lib_update(
                model.exec_path_local,
                'libmpi.so'
            )

        model_prog = []

        # Our MPICH wrapper does not support a working directory flag
        if not mpi_module.startswith('mvapich'):
            model_prog.append('-wdir {0}'.format(model.work_path))

        # Append any model-specific MPI flags
        model_flags = model.config.get('mpiflags', [])
        if not isinstance(model_flags, list):
            model_prog.append(model_flags)
        else:
            model_prog.extend(model_flags)

        model_ncpus = model.config.get('ncpus')
        if model_ncpus:
            model_prog.append('-np {0}'.format(model_ncpus))

        model_npernode = model.config.get('npernode')
        # TODO: New Open MPI format?
        if model_npernode:
            # NOTE: Integer division here; `/` would produce a float
            #       (e.g. '4.0') inside the flag under Python 3
            if model_npernode % 2 == 0:
                npernode_flag = ('-map-by ppr:{0}:socket'
                                 ''.format(model_npernode // 2))
            else:
                npernode_flag = ('-map-by ppr:{0}:node'
                                 ''.format(model_npernode))

            if self.config.get('scalasca', False):
                npernode_flag = '\"{0}\"'.format(npernode_flag)
            model_prog.append(npernode_flag)

        if self.config.get('hpctoolkit', False):
            os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
            model_prog.append('hpcrun')

        for prof in self.profilers:
            if prof.runscript:
                # NOTE: list.append returns None, so the result must not be
                #       assigned back to model_prog
                model_prog.append(prof.runscript)

        model_prog.append(model.exec_prefix)

        # Use the full path to symlinked exec_name in work as some
        # older MPI libraries complained executable was not in PATH
        model_prog.append(os.path.join(model.work_path, model.exec_name))

        mpi_progs.append(' '.join(model_prog))

    cmd = '{runcmd} {flags} {exes}'.format(
        runcmd=mpi_runcmd,
        flags=' '.join(mpi_flags),
        exes=' : '.join(mpi_progs)
    )

    for prof in self.profilers:
        cmd = prof.wrapper(cmd)

    # Expand shell variables inside flags
    if self.expand_shell_vars:
        cmd = os.path.expandvars(cmd)

    # TODO: Consider making this default
    if self.config.get('coredump', False):
        enable_core_dump()

    # Our MVAPICH wrapper does not support working directories
    if mpi_module.startswith('mvapich'):
        curdir = os.getcwd()
        os.chdir(self.work_path)
    else:
        curdir = None

    # Dump out environment
    with open(self.env_fname, 'w') as file:
        file.write(yaml.dump(dict(os.environ),
                             default_flow_style=False))

    self.runlog.create_manifest()
    if self.runlog.enabled:
        self.runlog.commit()

    # NOTE: This may not be necessary, since env seems to be getting
    #       correctly updated.  Need to look into this.
    print(cmd)
    if env:
        # TODO: Replace with mpirun -x flag inputs
        proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                        env=os.environ.copy())
        proc.wait()
        rc = proc.returncode
    else:
        rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

    # Return to control directory
    if curdir:
        os.chdir(curdir)

    f_out.close()
    f_err.close()

    self.finish_time = datetime.datetime.now()

    info = get_job_info()

    if info is None:
        # Not being run under PBS, reverse engineer environment
        info = {
            'PAYU_PATH': os.path.dirname(self.payu_path)
        }

    # Add extra information to save to jobinfo
    info.update(
        {
            'PAYU_CONTROL_DIR': self.control_path,
            'PAYU_RUN_ID': self.run_id,
            'PAYU_CURRENT_RUN': self.counter,
            'PAYU_N_RUNS': self.n_runs,
            'PAYU_JOB_STATUS': rc,
            'PAYU_START_TIME': self.start_time.isoformat(),
            'PAYU_FINISH_TIME': self.finish_time.isoformat(),
            'PAYU_WALLTIME': "{0} s".format(
                (self.finish_time - self.start_time).total_seconds()
            ),
        }
    )

    # Dump job info
    with open(self.job_fname, 'w') as file:
        file.write(yaml.dump(info, default_flow_style=False))

    # Remove any empty output files (e.g. logs)
    for fname in os.listdir(self.work_path):
        fpath = os.path.join(self.work_path, fname)
        if os.path.getsize(fpath) == 0:
            os.remove(fpath)

    # Clean up any profiling output
    # TODO: Move after `rc` code check?
    for prof in self.profilers:
        prof.postprocess()

    # TODO: Need a model-specific cleanup method call here
    # NOTE: This does not appear to catch hanging jobs killed by PBS
    if rc != 0:
        # Backup logs for failed runs
        error_log_dir = os.path.join(self.archive_path, 'error_logs')
        mkdir_p(error_log_dir)

        # NOTE: This is PBS-specific
        job_id = get_job_id(short=False)

        if job_id == '':
            job_id = self.run_id[:6]

        for fname in self.output_fnames:

            src = os.path.join(self.control_path, fname)

            stem, suffix = os.path.splitext(fname)
            dest = os.path.join(error_log_dir,
                                ".".join((stem, job_id)) + suffix)

            print(src, dest)

            shutil.copyfile(src, dest)

        # Create the symlink to the logs if it does not exist
        make_symlink(self.archive_path, self.archive_sym_path)

        # Terminate payu
        sys.exit('payu: Model exited with error code {0}; aborting.'
                 ''.format(rc))

    # Decrement run counter on successful run
    stop_file_path = os.path.join(self.control_path, 'stop_run')
    if os.path.isfile(stop_file_path):
        assert os.stat(stop_file_path).st_size == 0
        os.remove(stop_file_path)
        print('payu: Stop file detected; terminating resubmission.')
        self.n_runs = 0
    else:
        self.n_runs -= 1

    # Move logs to archive (or delete if empty)
    for f in self.output_fnames:
        f_path = os.path.join(self.control_path, f)
        if os.path.getsize(f_path) == 0:
            os.remove(f_path)
        else:
            shutil.move(f_path, self.work_path)

    run_script = self.userscripts.get('run')
    if run_script:
        self.run_userscript(run_script)
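# Illustrative only: the config.yaml entries that run() reads when building
# the launch command, shown as the parsed Python dict. All values here are
# assumed placeholders, not payu defaults.
config = {
    'env': {'OMP_NUM_THREADS': 1},   # exported to os.environ before launch
    'mpi': {
        'runcmd': 'mpirun',          # launcher; 'mpirun' when unset
        'flags': ['-report-bindings'],  # global MPI runtime flags
        'module': None,              # None: autodetect from the binary
    },
    'scalasca': False,               # wrap the launcher with scalasca
    'hpctoolkit': False,             # prefix each binary with hpcrun
    'coredump': False,               # call enable_core_dump() when True
}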
def setup(self):
    # FMS initialisation
    super(Mom, self).setup()

    if not self.top_level_model:
        # Make log dir
        mkdir_p(os.path.join(self.work_path, 'log'))

    input_nml_path = os.path.join(self.work_path, 'input.nml')
    input_nml = f90nml.read(input_nml_path)

    use_core2iaf = self.config.get('core2iaf')
    if use_core2iaf:
        self.core2iaf_setup()

    # Set the runtime
    if self.expt.runtime:
        ocean_solo_nml = input_nml['ocean_solo_nml']

        ocean_solo_nml['years'] = self.expt.runtime['years']
        ocean_solo_nml['months'] = self.expt.runtime['months']
        ocean_solo_nml['days'] = self.expt.runtime['days']
        ocean_solo_nml['seconds'] = self.expt.runtime.get('seconds', 0)

        input_nml.write(input_nml_path, force=True)

    # Construct the land CPU mask
    if self.expt.config.get('mask_table', False):
        # NOTE: This function actually creates a mask table using the
        #       `check_mask` command line tool.  But it is not very usable
        #       since you need to know the number of masked CPUs to submit
        #       the job.  It needs a rethink of the submission process.
        self.create_mask_table(input_nml)

    # NOTE: Don't expect this to be here forever...
    # Attempt to set a mask table from the input
    if self.config.get('mask', False):
        mask_path = os.path.join(self.work_input_path, 'ocean_mask_table')

        # Remove any existing mask
        # (If no reference mask is available, then we will not use one)
        if os.path.isfile(mask_path):
            os.remove(mask_path)

        # Reference mask table
        assert 'layout' in input_nml['ocean_model_nml']
        nx, ny = input_nml['ocean_model_nml'].get('layout')
        n_masked_cpus = nx * ny - self.config.get('ncpus')

        mask_table_fname = 'mask_table.{nmask}.{nx}x{ny}'.format(
            nmask=n_masked_cpus, nx=nx, ny=ny
        )
        ref_mask_path = os.path.join(self.work_input_path,
                                     mask_table_fname)

        # Set (or replace) mask table if reference is available
        if os.path.isfile(ref_mask_path):
            make_symlink(ref_mask_path, mask_path)
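# Worked example (illustrative numbers only) of the reference mask table
# name constructed above: a 16x15 processor layout with ncpus=216 leaves
# 24 masked land-only ranks.
nx, ny, ncpus = 16, 15, 216
n_masked_cpus = nx * ny - ncpus
mask_table_fname = 'mask_table.{nmask}.{nx}x{ny}'.format(
    nmask=n_masked_cpus, nx=nx, ny=ny)
print(mask_table_fname)  # mask_table.24.16x15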
def run(self, *user_flags):
    self.load_modules()

    f_out = open(self.stdout_fname, 'w')
    f_err = open(self.stderr_fname, 'w')

    # Set MPI environment variables
    env = self.config.get('env')

    # Explicitly check for `None`, in case of an empty `env:` entry
    if env is None:
        env = {}

    for var in env:
        if env[var] is None:
            env_value = ''
        else:
            env_value = str(env[var])
        os.environ[var] = env_value

    mpi_config = self.config.get('mpi', {})
    mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

    if self.config.get('scalasca', False):
        mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

    # MPI runtime flags
    mpi_flags = mpi_config.get('flags', [])
    if not mpi_flags:
        mpi_flags = self.config.get('mpirun', [])
        # TODO: Legacy config removal warning

    if type(mpi_flags) != list:
        mpi_flags = [mpi_flags]

    # TODO: More uniform support needed here
    if self.config.get('scalasca', False):
        mpi_flags = ['\"{}\"'.format(f) for f in mpi_flags]

    # XXX: I think this may be broken
    if user_flags:
        mpi_flags.extend(list(user_flags))

    if self.debug:
        mpi_flags.append('--debug')

    mpi_progs = []
    for model in self.models:

        # Skip models without executables (e.g. couplers)
        if not model.exec_path:
            continue

        mpi_config = self.config.get('mpi', {})
        mpi_module = mpi_config.get('module', None)

        # Update MPI library module (if not explicitly set)
        # TODO: Check for MPI library mismatch across multiple binaries
        if mpi_module is None:
            mpi_module = envmod.lib_update(model.exec_path, 'libmpi.so')

        model_prog = []

        # Our MPICH wrapper does not support a working directory flag
        if not mpi_module.startswith('mvapich'):
            model_prog.append('-wdir {}'.format(model.work_path))

        # Append any model-specific MPI flags
        model_flags = model.config.get('mpiflags', [])
        if not isinstance(model_flags, list):
            model_prog.append(model_flags)
        else:
            model_prog.extend(model_flags)

        model_ncpus = model.config.get('ncpus')
        if model_ncpus:
            model_prog.append('-np {}'.format(model_ncpus))

        model_npernode = model.config.get('npernode')
        # TODO: New Open MPI format?
        if model_npernode:
            # NOTE: Integer division here; `/` would produce a float
            #       (e.g. '4.0') inside the flag under Python 3
            if model_npernode % 2 == 0:
                npernode_flag = ('-map-by ppr:{}:socket'
                                 ''.format(model_npernode // 2))
            else:
                npernode_flag = ('-map-by ppr:{}:node'
                                 ''.format(model_npernode))

            if self.config.get('scalasca', False):
                npernode_flag = '\"{}\"'.format(npernode_flag)
            model_prog.append(npernode_flag)

        if self.config.get('hpctoolkit', False):
            os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
            model_prog.append('hpcrun')

        for prof in self.profilers:
            if prof.runscript:
                # NOTE: list.append returns None, so the result must not be
                #       assigned back to model_prog
                model_prog.append(prof.runscript)

        model_prog.append(model.exec_prefix)
        model_prog.append(model.exec_path)

        mpi_progs.append(' '.join(model_prog))

    cmd = '{} {} {}'.format(mpi_runcmd,
                            ' '.join(mpi_flags),
                            ' : '.join(mpi_progs))

    for prof in self.profilers:
        cmd = prof.wrapper(cmd)

    # Expand shell variables inside flags
    if self.expand_shell_vars:
        cmd = os.path.expandvars(cmd)

    print(cmd)

    # Our MVAPICH wrapper does not support working directories
    if mpi_module.startswith('mvapich'):
        curdir = os.getcwd()
        os.chdir(self.work_path)
    else:
        curdir = None

    # NOTE: This may not be necessary, since env seems to be getting
    #       correctly updated.  Need to look into this.
    if env:
        # TODO: Replace with mpirun -x flag inputs
        proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                        env=os.environ.copy())
        proc.wait()
        rc = proc.returncode
    else:
        rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

    # Return to control directory
    if curdir:
        os.chdir(curdir)

    if self.runlog:
        self.runlog.commit()

    f_out.close()
    f_err.close()

    # Remove any empty output files (e.g. logs)
    for fname in os.listdir(self.work_path):
        fpath = os.path.join(self.work_path, fname)
        if os.path.getsize(fpath) == 0:
            os.remove(fpath)

    # Clean up any profiling output
    # TODO: Move after `rc` code check?
    for prof in self.profilers:
        prof.postprocess()

    # TODO: Need a model-specific cleanup method call here
    # NOTE: This does not appear to catch hanging jobs killed by PBS
    if rc != 0:
        # Backup logs for failed runs
        error_log_dir = os.path.join(self.archive_path, 'error_logs')
        mkdir_p(error_log_dir)

        # NOTE: This is PBS-specific
        job_id = os.environ.get('PBS_JOBID', '')

        for fname in (self.stdout_fname, self.stderr_fname):
            src = os.path.join(self.control_path, fname)
            # NOTE: This assumes standard .out/.err extensions
            dest = os.path.join(error_log_dir,
                                fname[:-4] + '.' + job_id + fname[-4:])
            print(src, dest)
            shutil.copyfile(src, dest)

        # Create the symlink to the logs if it does not exist
        make_symlink(self.archive_path, self.archive_sym_path)

        # Terminate payu
        sys.exit('payu: Model exited with error code {}; aborting.'
                 ''.format(rc))

    # Decrement run counter on successful run
    stop_file_path = os.path.join(self.control_path, 'stop_run')
    if os.path.isfile(stop_file_path):
        assert os.stat(stop_file_path).st_size == 0
        os.remove(stop_file_path)
        print('payu: Stop file detected; terminating resubmission.')
        self.n_runs = 0
    else:
        self.n_runs -= 1

    # Move logs to archive (or delete if empty)
    for f in (self.stdout_fname, self.stderr_fname):
        f_path = os.path.join(self.control_path, f)
        if os.path.getsize(f_path) == 0:
            os.remove(f_path)
        else:
            shutil.move(f_path, self.work_path)

    run_script = self.userscripts.get('run')
    if run_script:
        self.run_userscript(run_script)
def archive(self):
    if not self.config.get('archive', True):
        print('payu: not archiving due to config.yaml setting.')
        return

    # Check there is a work directory, otherwise bail
    if not os.path.exists(self.work_sym_path):
        sys.exit('payu: error: No work directory to archive.')

    mkdir_p(self.archive_path)
    make_symlink(self.archive_path, self.archive_sym_path)

    # Remove work symlink
    if os.path.islink(self.work_sym_path):
        os.remove(self.work_sym_path)

    mkdir_p(self.restart_path)

    for model in self.models:
        model.archive()

    # Postprocess the model suite
    if len(self.models) > 1:
        self.model.archive()

    # Double-check that the run path does not exist
    if os.path.exists(self.output_path):
        sys.exit('payu: error: Output path already exists.')

    cmd = 'mv {work} {output}'.format(work=self.work_path,
                                      output=self.output_path)
    sp.check_call(shlex.split(cmd))

    # Remove old restart files
    # TODO: Move to subroutine
    restart_freq = self.config.get('restart_freq', default_restart_freq)
    restart_history = self.config.get('restart_history',
                                      default_restart_history)

    # Remove any outdated restart files
    prior_restart_dirs = [d for d in os.listdir(self.archive_path)
                          if d.startswith('restart')]

    for res_dir in prior_restart_dirs:
        res_idx = int(res_dir.lstrip('restart'))
        if (self.repeat_run or
                (res_idx % restart_freq != 0 and
                 res_idx <= (self.counter - restart_history))):
            res_path = os.path.join(self.archive_path, res_dir)
            # Only delete real directories; ignore symbolic restart links
            # (`isdir` alone follows symlinks, so check `islink` as well)
            if os.path.isdir(res_path) and not os.path.islink(res_path):
                shutil.rmtree(res_path)

    collate_config = self.config.get('collate', {})
    if collate_config.get('enable', True):
        cmd = '{python} {payu} collate -i {expt}'.format(
            python=sys.executable,
            payu=self.payu_path,
            expt=self.counter
        )
        sp.check_call(shlex.split(cmd))

    if self.config.get('hpctoolkit', False):
        cmd = '{python} {payu} profile -i {expt}'.format(
            python=sys.executable,
            payu=self.payu_path,
            expt=self.counter
        )
        sp.check_call(shlex.split(cmd))

    archive_script = self.userscripts.get('archive')
    if archive_script:
        self.run_userscript(archive_script)
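# A standalone sketch of the restart-pruning rule used in `archive` above:
# a `restartNNN` directory survives if its index falls on the `restart_freq`
# interval or lies within the most recent `restart_history` runs.  The
# helper name and the sample values are illustrative only.
def restart_is_pruned(res_idx, counter, restart_freq, restart_history,
                      repeat_run=False):
    return (repeat_run or
            (res_idx % restart_freq != 0 and
             res_idx <= (counter - restart_history)))

# With restart_freq=5, restart_history=5 and counter=12, restart007 is
# pruned (7 % 5 != 0 and 7 <= 7), while restart010 (on the interval) and
# restart011 (too recent) survive.
assert restart_is_pruned(7, 12, 5, 5)
assert not restart_is_pruned(10, 12, 5, 5)
assert not restart_is_pruned(11, 12, 5, 5)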
def github_setup(self):
    """Set up authentication keys and API tokens."""
    github_auth = self.authenticate()
    github_username = github_auth[0]

    expt_name = self.config.get('name', self.expt.name)
    expt_description = self.expt.config.get('description')
    if not expt_description:
        expt_description = input('Briefly describe the experiment: ')
        assert isinstance(expt_description, str)
    expt_private = self.config.get('private', False)

    # 1. Create the organisation if needed
    github_api_url = 'https://api.github.com'
    org_name = self.config.get('organization')
    if org_name:
        repo_target = org_name

        # Check if org exists
        org_query_url = os.path.join(github_api_url, 'orgs', org_name)
        org_req = requests.get(org_query_url)

        if org_req.status_code == 404:
            # NOTE: Orgs cannot be created via the API
            print('payu: github organization {org} does not exist.'
                  ''.format(org=org_name))
            print('      You must first create this on the website.')
        elif org_req.status_code == 200:
            # TODO: Confirm that the user can interact with the repo
            pass
        else:
            # TODO: Exit with grace
            print('payu: abort!')
            sys.exit(-1)

        repo_query_url = os.path.join(github_api_url, 'orgs', org_name,
                                      'repos')
        repo_api_url = os.path.join(github_api_url, 'repos', org_name,
                                    expt_name)
    else:
        repo_target = github_username

        # Create repo in user account
        repo_query_url = os.path.join(github_api_url, 'user', 'repos')
        repo_api_url = os.path.join(github_api_url, 'repos',
                                    github_username, expt_name)

    # 2. Create the remote repository
    user_repos = []
    page = 1
    while True:
        repo_params = {'page': page, 'per_page': 100}
        repo_query = requests.get(repo_query_url, auth=github_auth,
                                  params=repo_params)
        assert repo_query.status_code == 200

        if repo_query.json():
            user_repos.extend(list(r['name'] for r in repo_query.json()))
            page += 1
        else:
            break

    if expt_name not in user_repos:
        repo_config = {
            'name': expt_name,
            'description': expt_description,
            'private': expt_private,
            'has_issues': True,
            'has_downloads': True,
            'has_wiki': False
        }
        repo_gen = requests.post(repo_query_url, json.dumps(repo_config),
                                 auth=github_auth)
        assert repo_gen.status_code == 201

    # 3. Check if remote is set
    git_remote_out = sp.check_output(shlex.split('git remote -v'),
                                     cwd=self.expt.control_path)
    # `check_output` returns bytes; decode before splitting into lines
    git_remote_out = git_remote_out.decode()

    git_remotes = dict([(r.split()[0], r.split()[1])
                        for r in git_remote_out.split('\n') if r])

    remote_name = self.config.get('remote', 'payu')
    remote_url = os.path.join('ssh://[email protected]', repo_target,
                              self.expt.name + '.git')

    if remote_name not in git_remotes:
        cmd = ('git remote add {name} {url}'
               ''.format(name=remote_name, url=remote_url))
        sp.check_call(shlex.split(cmd), cwd=self.expt.control_path)
    elif git_remotes[remote_name] != remote_url:
        print('payu: error: Existing remote URL does not match '
              'the proposed URL.')
        print('payu: error: To delete the old remote, type '
              '`git remote rm {name}`.'.format(name=remote_name))
        sys.exit(-1)

    # 4. Generate a payu-specific SSH key
    default_ssh_key = 'id_rsa_payu_' + expt_name
    ssh_key = self.config.get('sshid', default_ssh_key)
    ssh_dir = os.path.join(os.path.expanduser('~'), '.ssh', 'payu')
    mkdir_p(ssh_dir)

    ssh_keypath = os.path.join(ssh_dir, ssh_key)
    if not os.path.isfile(ssh_keypath):
        cmd = 'ssh-keygen -t rsa -f {key} -q -P ""'.format(key=ssh_key)
        sp.check_call(shlex.split(cmd), cwd=ssh_dir)

    # 5. Deploy key to repo
    with open(ssh_keypath + '.pub') as keyfile:
        pubkey = ' '.join(keyfile.read().split()[:-1])

    # TODO: Get this from github?
    repo_keys_url = os.path.join(repo_api_url, 'keys')
    keys_req = requests.get(repo_keys_url, auth=github_auth)
    assert keys_req.status_code == 200

    if not any(k['key'] == pubkey for k in keys_req.json()):
        add_key_param = {'title': 'payu', 'key': pubkey}
        add_key_req = requests.post(repo_keys_url, auth=github_auth,
                                    json=add_key_param)
        assert add_key_req.status_code == 201
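# The repository listing in `github_setup` pages through the GitHub API 100
# entries at a time.  A hypothetical helper extracting just that pattern
# (the function name is an illustration, not part of payu):
import requests

def list_repo_names(repo_query_url, github_auth):
    names = []
    page = 1
    while True:
        query = requests.get(repo_query_url, auth=github_auth,
                             params={'page': page, 'per_page': 100})
        assert query.status_code == 200
        batch = query.json()
        if not batch:
            # An empty page marks the end of the listing
            break
        names.extend(r['name'] for r in batch)
        page += 1
    return names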
def run(self, *user_flags):
    # XXX: This was previously done in reversion
    envmod.setup()

    self.load_modules()

    f_out = open(self.stdout_fname, 'w')
    f_err = open(self.stderr_fname, 'w')

    # Set MPI environment variables
    env = self.config.get('env')

    # Explicitly check for `None`, in case of an empty `env:` entry
    if env is None:
        env = {}

    for var in env:
        if env[var] is None:
            env_value = ''
        else:
            env_value = str(env[var])
        os.environ[var] = env_value

    mpi_config = self.config.get('mpi', {})
    mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

    if self.config.get('scalasca', False):
        mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

    # MPI runtime flags
    mpi_flags = mpi_config.get('flags', [])
    if not mpi_flags:
        # TODO: Legacy config removal warning
        mpi_flags = self.config.get('mpirun', [])

    if not isinstance(mpi_flags, list):
        mpi_flags = [mpi_flags]

    # TODO: More uniform support needed here
    if self.config.get('scalasca', False):
        mpi_flags = ['"{0}"'.format(f) for f in mpi_flags]

    # XXX: I think this may be broken
    if user_flags:
        mpi_flags.extend(list(user_flags))

    if self.debug:
        mpi_flags.append('--debug')

    mpi_progs = []
    for model in self.models:

        # Skip models without executables (e.g. couplers)
        if not model.exec_path:
            continue

        mpi_config = self.config.get('mpi', {})
        mpi_module = mpi_config.get('module', None)

        # Update MPI library module (if not explicitly set)
        # TODO: Check for MPI library mismatch across multiple binaries
        if mpi_module is None:
            mpi_module = envmod.lib_update(model.exec_path, 'libmpi.so')

        model_prog = []

        # Our MPICH wrapper does not support a working directory flag
        if not mpi_module.startswith('mvapich'):
            model_prog.append('-wdir {0}'.format(model.work_path))

        # Append any model-specific MPI flags
        model_flags = model.config.get('mpiflags', [])
        if not isinstance(model_flags, list):
            model_prog.append(model_flags)
        else:
            model_prog.extend(model_flags)

        model_ncpus = model.config.get('ncpus')
        if model_ncpus:
            model_prog.append('-np {0}'.format(model_ncpus))

        model_npernode = model.config.get('npernode')
        # TODO: New Open MPI format?
        if model_npernode:
            if model_npernode % 2 == 0:
                # Integer division: `ppr` expects a whole process count
                npernode_flag = ('-map-by ppr:{0}:socket'
                                 ''.format(model_npernode // 2))
            else:
                npernode_flag = ('-map-by ppr:{0}:node'
                                 ''.format(model_npernode))

            if self.config.get('scalasca', False):
                npernode_flag = '"{0}"'.format(npernode_flag)
            model_prog.append(npernode_flag)

        if self.config.get('hpctoolkit', False):
            os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
            model_prog.append('hpcrun')

        for prof in self.profilers:
            if prof.runscript:
                # NOTE: `list.append` returns None, so do not reassign
                model_prog.append(prof.runscript)

        model_prog.append(model.exec_prefix)
        model_prog.append(model.exec_path)

        mpi_progs.append(' '.join(model_prog))

    cmd = '{runcmd} {flags} {exes}'.format(
        runcmd=mpi_runcmd,
        flags=' '.join(mpi_flags),
        exes=' : '.join(mpi_progs)
    )

    for prof in self.profilers:
        cmd = prof.wrapper(cmd)

    # Expand shell variables inside flags
    if self.expand_shell_vars:
        cmd = os.path.expandvars(cmd)

    print(cmd)

    # Our MVAPICH wrapper does not support working directories
    if mpi_module.startswith('mvapich'):
        curdir = os.getcwd()
        os.chdir(self.work_path)
    else:
        curdir = None

    # NOTE: This may not be necessary, since env seems to be getting
    # correctly updated.  Need to look into this.
    if env:
        # TODO: Replace with mpirun -x flag inputs
        proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                        env=os.environ.copy())
        proc.wait()
        rc = proc.returncode
    else:
        rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

    # Return to control directory
    if curdir:
        os.chdir(curdir)

    if self.runlog:
        self.runlog.commit()

    f_out.close()
    f_err.close()

    # Remove any empty output files (e.g. logs)
    for fname in os.listdir(self.work_path):
        fpath = os.path.join(self.work_path, fname)
        if os.path.getsize(fpath) == 0:
            os.remove(fpath)

    # Clean up any profiling output
    # TODO: Move after `rc` code check?
    for prof in self.profilers:
        prof.postprocess()

    # TODO: Need a model-specific cleanup method call here
    # NOTE: This does not appear to catch hanging jobs killed by PBS
    if rc != 0:
        # Backup logs for failed runs
        error_log_dir = os.path.join(self.archive_path, 'error_logs')
        mkdir_p(error_log_dir)

        # NOTE: This is PBS-specific
        job_id = os.environ.get('PBS_JOBID', '')

        for fname in (self.stdout_fname, self.stderr_fname):
            src = os.path.join(self.control_path, fname)
            # NOTE: This assumes standard .out/.err extensions
            dest = os.path.join(error_log_dir,
                                fname[:-4] + '.' + job_id + fname[-4:])
            print(src, dest)
            shutil.copyfile(src, dest)

        # Create the symlink to the logs if it does not exist
        make_symlink(self.archive_path, self.archive_sym_path)

        # Terminate payu
        sys.exit('payu: Model exited with error code {0}; aborting.'
                 ''.format(rc))

    # Decrement run counter on successful run
    stop_file_path = os.path.join(self.control_path, 'stop_run')
    if os.path.isfile(stop_file_path):
        assert os.stat(stop_file_path).st_size == 0
        os.remove(stop_file_path)
        print('payu: Stop file detected; terminating resubmission.')
        self.n_runs = 0
    else:
        self.n_runs -= 1

    # Move logs to archive (or delete if empty)
    for f in (self.stdout_fname, self.stderr_fname):
        f_path = os.path.join(self.control_path, f)
        if os.path.getsize(f_path) == 0:
            os.remove(f_path)
        else:
            shutil.move(f_path, self.work_path)

    run_script = self.userscripts.get('run')
    if run_script:
        self.run_userscript(run_script)
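# For reference, the configuration keys consumed by `run`, shown as the
# parsed dict that `self.config` would hold.  This is a sketch assuming a
# plausible config.yaml, not a complete schema; the values are invented.
example_run_config = {
    'env': {'OMPI_MCA_hwloc_base_mem_alloc_policy': 'local_only'},
    'mpi': {
        'runcmd': 'mpirun',      # defaults to 'mpirun'
        'flags': ['-mca coll ^fca'],
        'module': None,          # None -> inferred via envmod.lib_update
    },
    'mpirun': [],                # legacy fallback for mpi['flags']
    'scalasca': False,           # wrap runcmd with 'scalasca -analyze'
    'hpctoolkit': False,         # prepend 'hpcrun' to each model program
}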
def archive(self):
    if not self.config.get('archive', True):
        print('payu: not archiving due to config.yaml setting.')
        return

    # Check there is a work directory, otherwise bail
    if not os.path.exists(self.work_sym_path):
        sys.exit('payu: error: No work directory to archive.')

    mkdir_p(self.archive_path)
    make_symlink(self.archive_path, self.archive_sym_path)

    # Remove work symlink
    if os.path.islink(self.work_sym_path):
        os.remove(self.work_sym_path)

    mkdir_p(self.restart_path)

    for model in self.models:
        model.archive()

    # Postprocess the model suite
    if len(self.models) > 1:
        self.model.archive()

    # Double-check that the run path does not exist
    if os.path.exists(self.output_path):
        sys.exit('payu: error: Output path already exists.')

    movetree(self.work_path, self.output_path)

    # Remove old restart files
    # TODO: Move to subroutine
    restart_freq = self.config.get('restart_freq', default_restart_freq)
    restart_history = self.config.get('restart_history',
                                      default_restart_history)

    # Remove any outdated restart files
    prior_restart_dirs = [d for d in os.listdir(self.archive_path)
                          if d.startswith('restart')]

    for res_dir in prior_restart_dirs:
        res_idx = int(res_dir.lstrip('restart'))
        if (self.repeat_run or
                (res_idx % restart_freq != 0 and
                 res_idx <= (self.counter - restart_history))):
            res_path = os.path.join(self.archive_path, res_dir)

            # Only delete real directories; ignore symbolic restart links
            if os.path.isdir(res_path) and not os.path.islink(res_path):
                shutil.rmtree(res_path)

    # Ensure dynamic library support for subsequent python calls
    # Use `get` to avoid a KeyError when LD_LIBRARY_PATH is unset
    ld_libpaths = os.environ.get('LD_LIBRARY_PATH', '')
    py_libpath = sysconfig.get_config_var('LIBDIR')
    if py_libpath not in ld_libpaths.split(':'):
        os.environ['LD_LIBRARY_PATH'] = ':'.join([py_libpath, ld_libpaths])

    collate_config = self.config.get('collate', {})
    if collate_config.get('enable', True):
        cmd = '{python} {payu} collate -i {expt}'.format(
            python=sys.executable,
            payu=self.payu_path,
            expt=self.counter
        )
        sp.check_call(shlex.split(cmd))

    if self.config.get('hpctoolkit', False):
        cmd = '{python} {payu} profile -i {expt}'.format(
            python=sys.executable,
            payu=self.payu_path,
            expt=self.counter
        )
        sp.check_call(shlex.split(cmd))

    archive_script = self.userscripts.get('archive')
    if archive_script:
        self.run_userscript(archive_script)
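# `movetree` replaces the earlier shell `mv` but is not defined in this
# excerpt.  A minimal sketch of what such a helper could look like,
# assuming its job is a copy-then-delete that also works across
# filesystems (unlike a bare os.rename); the real implementation may
# differ:
import shutil

def movetree_sketch(src, dst):
    # Copy the tree first (preserving symlinks), then remove the source
    shutil.copytree(src, dst, symlinks=True)
    shutil.rmtree(src)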
def run(self, *user_flags):
    # XXX: This was previously done in reversion
    envmod.setup()

    self.load_modules()

    f_out = open(self.stdout_fname, 'w')
    f_err = open(self.stderr_fname, 'w')

    # Set MPI environment variables
    env = self.config.get('env')

    # Explicitly check for `None`, in case of an empty `env:` entry
    if env is None:
        env = {}

    for var in env:
        if env[var] is None:
            env_value = ''
        else:
            env_value = str(env[var])
        os.environ[var] = env_value

    mpi_config = self.config.get('mpi', {})
    mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

    if self.config.get('scalasca', False):
        mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

    # MPI runtime flags
    mpi_flags = mpi_config.get('flags', [])
    if not mpi_flags:
        # TODO: Legacy config removal warning
        mpi_flags = self.config.get('mpirun', [])

    if not isinstance(mpi_flags, list):
        mpi_flags = [mpi_flags]

    # TODO: More uniform support needed here
    if self.config.get('scalasca', False):
        mpi_flags = ['"{0}"'.format(f) for f in mpi_flags]

    # XXX: I think this may be broken
    if user_flags:
        mpi_flags.extend(list(user_flags))

    if self.debug:
        mpi_flags.append('--debug')

    mpi_progs = []
    for model in self.models:

        # Skip models without executables (e.g. couplers)
        if not model.exec_path_local:
            continue

        mpi_config = self.config.get('mpi', {})
        mpi_module = mpi_config.get('module', None)

        # Update MPI library module (if not explicitly set)
        # TODO: Check for MPI library mismatch across multiple binaries
        if mpi_module is None:
            mpi_module = envmod.lib_update(
                model.exec_path_local,
                'libmpi.so'
            )

        model_prog = []

        # Our MPICH wrapper does not support a working directory flag
        if not mpi_module.startswith('mvapich'):
            model_prog.append('-wdir {0}'.format(model.work_path))

        # Append any model-specific MPI flags
        model_flags = model.config.get('mpiflags', [])
        if not isinstance(model_flags, list):
            model_prog.append(model_flags)
        else:
            model_prog.extend(model_flags)

        model_ncpus = model.config.get('ncpus')
        if model_ncpus:
            model_prog.append('-np {0}'.format(model_ncpus))

        model_npernode = model.config.get('npernode')
        # TODO: New Open MPI format?
        if model_npernode:
            if model_npernode % 2 == 0:
                # Integer division: `ppr` expects a whole process count
                npernode_flag = ('-map-by ppr:{0}:socket'
                                 ''.format(model_npernode // 2))
            else:
                npernode_flag = ('-map-by ppr:{0}:node'
                                 ''.format(model_npernode))

            if self.config.get('scalasca', False):
                npernode_flag = '"{0}"'.format(npernode_flag)
            model_prog.append(npernode_flag)

        if self.config.get('hpctoolkit', False):
            os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
            model_prog.append('hpcrun')

        for prof in self.profilers:
            if prof.runscript:
                # NOTE: `list.append` returns None, so do not reassign
                model_prog.append(prof.runscript)

        model_prog.append(model.exec_prefix)

        # Use the full path to symlinked exec_name in work as some
        # older MPI libraries complained executable was not in PATH
        model_prog.append(os.path.join(model.work_path, model.exec_name))

        mpi_progs.append(' '.join(model_prog))

    cmd = '{runcmd} {flags} {exes}'.format(
        runcmd=mpi_runcmd,
        flags=' '.join(mpi_flags),
        exes=' : '.join(mpi_progs)
    )

    for prof in self.profilers:
        cmd = prof.wrapper(cmd)

    # Expand shell variables inside flags
    if self.expand_shell_vars:
        cmd = os.path.expandvars(cmd)

    # TODO: Consider making this default
    if self.config.get('coredump', False):
        enable_core_dump()

    # Our MVAPICH wrapper does not support working directories
    if mpi_module.startswith('mvapich'):
        curdir = os.getcwd()
        os.chdir(self.work_path)
    else:
        curdir = None

    # Dump out environment
    with open(self.env_fname, 'w') as file:
        file.write(yaml.dump(dict(os.environ), default_flow_style=False))

    self.runlog.create_manifest()
    if self.runlog.enabled:
        self.runlog.commit()

    # NOTE: This may not be necessary, since env seems to be getting
    # correctly updated.  Need to look into this.
    print(cmd)
    if env:
        # TODO: Replace with mpirun -x flag inputs
        proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                        env=os.environ.copy())
        proc.wait()
        rc = proc.returncode
    else:
        rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

    # Return to control directory
    if curdir:
        os.chdir(curdir)

    f_out.close()
    f_err.close()

    self.finish_time = datetime.datetime.now()

    info = get_job_info()

    if info is None:
        # Not being run under PBS, reverse engineer environment
        info = {
            'PAYU_PATH': os.path.dirname(self.payu_path)
        }

    # Add extra information to save to jobinfo
    info.update(
        {
            'PAYU_CONTROL_DIR': self.control_path,
            'PAYU_RUN_ID': self.run_id,
            'PAYU_CURRENT_RUN': self.counter,
            'PAYU_N_RUNS': self.n_runs,
            'PAYU_JOB_STATUS': rc,
            'PAYU_START_TIME': self.start_time.isoformat(),
            'PAYU_FINISH_TIME': self.finish_time.isoformat(),
            'PAYU_WALLTIME': "{0} s".format(
                (self.finish_time - self.start_time).total_seconds()
            ),
        }
    )

    # Dump job info
    with open(self.job_fname, 'w') as file:
        file.write(yaml.dump(info, default_flow_style=False))

    # Remove any empty output files (e.g. logs)
    for fname in os.listdir(self.work_path):
        fpath = os.path.join(self.work_path, fname)
        if os.path.getsize(fpath) == 0:
            os.remove(fpath)

    # Clean up any profiling output
    # TODO: Move after `rc` code check?
    for prof in self.profilers:
        prof.postprocess()

    # TODO: Need a model-specific cleanup method call here
    # NOTE: This does not appear to catch hanging jobs killed by PBS
    if rc != 0:
        # Backup logs for failed runs
        error_log_dir = os.path.join(self.archive_path, 'error_logs')
        mkdir_p(error_log_dir)

        # NOTE: This is PBS-specific
        job_id = get_job_id(short=False)

        if job_id == '':
            job_id = str(self.run_id)[:6]

        for fname in self.output_fnames:
            src = os.path.join(self.control_path, fname)

            stem, suffix = os.path.splitext(fname)
            dest = os.path.join(error_log_dir,
                                ".".join((stem, job_id)) + suffix)

            print(src, dest)
            shutil.copyfile(src, dest)

        # Create the symlink to the logs if it does not exist
        make_symlink(self.archive_path, self.archive_sym_path)

        error_script = self.userscripts.get('error')
        if error_script:
            self.run_userscript(error_script)

        # Terminate payu
        sys.exit('payu: Model exited with error code {0}; aborting.'
                 ''.format(rc))

    # Decrement run counter on successful run
    stop_file_path = os.path.join(self.control_path, 'stop_run')
    if os.path.isfile(stop_file_path):
        assert os.stat(stop_file_path).st_size == 0
        os.remove(stop_file_path)
        print('payu: Stop file detected; terminating resubmission.')
        self.n_runs = 0
    else:
        self.n_runs -= 1

    # Move logs to archive (or delete if empty)
    for f in self.output_fnames:
        f_path = os.path.join(self.control_path, f)
        if os.path.getsize(f_path) == 0:
            os.remove(f_path)
        else:
            shutil.move(f_path, self.work_path)

    run_script = self.userscripts.get('run')
    if run_script:
        self.run_userscript(run_script)
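# `enable_core_dump`, `get_job_id` and `get_job_info` are imported helpers
# not shown in this excerpt.  Plausible sketches, assuming the standard
# resource-limit call and the PBS environment variables; the real
# implementations may differ:
import os
import resource

def enable_core_dump_sketch():
    # Raise the core file size soft limit to its hard limit so a crashed
    # model can leave a core dump behind
    soft, hard = resource.getrlimit(resource.RLIMIT_CORE)
    resource.setrlimit(resource.RLIMIT_CORE, (hard, hard))

def get_job_id_sketch(short=True):
    # PBS exposes the job id as e.g. '12345678.hostname-pbs'
    job_id = os.environ.get('PBS_JOBID', '')
    if short:
        job_id = job_id.split('.')[0]
    return job_id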
def setup(self):
    super(Cice5, self).setup()

    # Make log dir
    mkdir_p(os.path.join(self.work_path, 'log'))