Пример #1
0
    def run(self, *user_flags):
        """Build the MPI command line for all models, run it, and clean up.

        Values from the ``env`` configuration entry are exported through
        ``os.environ``, one program section is assembled for each model that
        has an executable, and the resulting command is executed with its
        output redirected to ``self.stdout_fname`` and ``self.stderr_fname``.
        Afterwards empty files in the work directory are removed, profilers
        are post-processed, the run counter is updated, the logs are moved
        to the work directory and the optional ``run`` user script is
        executed.

        Args:
            *user_flags: Additional flags appended to the global MPI flags.

        Raises:
            SystemExit: If the command exits with a non-zero return code.
                The log files are copied to ``error_logs`` in the archive
                directory before exiting.
        """

        self.load_modules()

        # The logs are opened before anything can change the working
        # directory; the context manager closes them even if the launch
        # fails
        with open(self.stdout_fname, 'w') as f_out, \
                open(self.stderr_fname, 'w') as f_err:

            # Set MPI environment variables
            env = self.config.get('env')

            # Explicitly check for `None`, in case of an empty `env:` entry
            if env is None:
                env = {}

            for var in env:
                value = env[var]
                os.environ[var] = '' if value is None else str(value)

            use_scalasca = self.config.get('scalasca', False)

            mpi_config = self.config.get('mpi', {})
            mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

            if use_scalasca:
                mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

            # MPI runtime flags
            mpi_flags = mpi_config.get('flags', [])
            if not mpi_flags:
                mpi_flags = self.config.get('mpirun', [])
                # TODO: Legacy config removal warning

            # Always work on a new list, so that the flags appended below
            # are not written back into the configuration
            if isinstance(mpi_flags, list):
                mpi_flags = list(mpi_flags)
            else:
                mpi_flags = [mpi_flags]

            # TODO: More uniform support needed here
            if use_scalasca:
                mpi_flags = ['"{}"'.format(f) for f in mpi_flags]

            # XXX: I think this may be broken
            if user_flags:
                mpi_flags.extend(user_flags)

            if self.debug:
                mpi_flags.append('--debug')

            # An explicitly configured MPI module applies to every model
            config_mpi_module = mpi_config.get('module', None)

            # Remains `None` if no model provides an executable
            mpi_module = None

            mpi_progs = []
            for model in self.models:

                # Skip models without executables (e.g. couplers)
                if not model.exec_path:
                    continue

                # Update MPI library module (if not explicitly set)
                # TODO: Check for MPI library mismatch across multiple
                # binaries
                mpi_module = config_mpi_module
                if mpi_module is None:
                    mpi_module = envmod.lib_update(model.exec_path,
                                                   'libmpi.so')

                model_prog = []

                # Our MPICH wrapper does not support a working directory flag
                if not mpi_module.startswith('mvapich'):
                    model_prog.append('-wdir {}'.format(model.work_path))

                # Append any model-specific MPI flags
                model_flags = model.config.get('mpiflags', [])
                if isinstance(model_flags, list):
                    model_prog.extend(model_flags)
                else:
                    model_prog.append(model_flags)

                model_ncpus = model.config.get('ncpus')
                if model_ncpus:
                    model_prog.append('-np {}'.format(model_ncpus))

                model_npernode = model.config.get('npernode')
                # TODO: New Open MPI format?
                if model_npernode:
                    if model_npernode % 2 == 0:
                        # Floor division keeps the count an integer; true
                        # division would render e.g. `ppr:2.0:socket`
                        npernode_flag = ('-map-by ppr:{}:socket'
                                         ''.format(model_npernode // 2))
                    else:
                        npernode_flag = ('-map-by ppr:{}:node'
                                         ''.format(model_npernode))

                    if use_scalasca:
                        npernode_flag = '"{}"'.format(npernode_flag)
                    model_prog.append(npernode_flag)

                if self.config.get('hpctoolkit', False):
                    os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
                    model_prog.append('hpcrun')

                for prof in self.profilers:
                    if prof.runscript:
                        # `list.append` returns None, so its result must
                        # not be assigned back to `model_prog`
                        model_prog.append(prof.runscript)

                model_prog.append(model.exec_prefix)
                model_prog.append(model.exec_path)

                mpi_progs.append(' '.join(model_prog))

            cmd = '{} {} {}'.format(mpi_runcmd, ' '.join(mpi_flags),
                                    ' : '.join(mpi_progs))

            for prof in self.profilers:
                cmd = prof.wrapper(cmd)

            # Expand shell variables inside flags
            if self.expand_shell_vars:
                cmd = os.path.expandvars(cmd)

            print(cmd)

            # Our MVAPICH wrapper does not support working directories
            curdir = None
            if mpi_module is not None and mpi_module.startswith('mvapich'):
                curdir = os.getcwd()
                os.chdir(self.work_path)

            try:
                # NOTE: This may not be necessary, since env seems to be
                # getting correctly updated.  Need to look into this.
                if env:
                    # TODO: Replace with mpirun -x flag inputs
                    proc = sp.Popen(shlex.split(cmd),
                                    stdout=f_out,
                                    stderr=f_err,
                                    env=os.environ.copy())
                    proc.wait()
                    rc = proc.returncode
                else:
                    rc = sp.call(shlex.split(cmd),
                                 stdout=f_out, stderr=f_err)
            finally:
                # Return to control directory
                if curdir:
                    os.chdir(curdir)

        if self.runlog:
            self.runlog.commit()

        # Remove any empty output files (e.g. logs)
        for fname in os.listdir(self.work_path):
            fpath = os.path.join(self.work_path, fname)
            if os.path.getsize(fpath) == 0:
                os.remove(fpath)

        # Clean up any profiling output
        # TODO: Move after `rc` code check?
        for prof in self.profilers:
            prof.postprocess()

        # TODO: Need a model-specific cleanup method call here
        # NOTE: This does not appear to catch hanging jobs killed by PBS
        if rc != 0:
            # Backup logs for failed runs
            error_log_dir = os.path.join(self.archive_path, 'error_logs')
            mkdir_p(error_log_dir)

            # NOTE: This is PBS-specific
            job_id = os.environ.get('PBS_JOBID', '')

            for fname in (self.stdout_fname, self.stderr_fname):
                src = os.path.join(self.control_path, fname)

                # Insert the job ID between the file stem and its extension
                stem, suffix = os.path.splitext(fname)
                dest = os.path.join(error_log_dir,
                                    stem + '.' + job_id + suffix)
                print(src, dest)

                shutil.copyfile(src, dest)

            # Create the symlink to the logs if it does not exist
            make_symlink(self.archive_path, self.archive_sym_path)

            # Terminate payu
            sys.exit('payu: Model exited with error code {}; aborting.'
                     ''.format(rc))

        # Decrement run counter on successful run
        stop_file_path = os.path.join(self.control_path, 'stop_run')
        if os.path.isfile(stop_file_path):
            assert os.stat(stop_file_path).st_size == 0
            os.remove(stop_file_path)
            print('payu: Stop file detected; terminating resubmission.')
            self.n_runs = 0
        else:
            self.n_runs -= 1

        # Move logs to archive (or delete if empty)
        for f in (self.stdout_fname, self.stderr_fname):
            f_path = os.path.join(self.control_path, f)
            if os.path.getsize(f_path) == 0:
                os.remove(f_path)
            else:
                shutil.move(f_path, self.work_path)

        run_script = self.userscripts.get('run')
        if run_script:
            self.run_userscript(run_script)
Пример #2
0
    def collate(self):
        """Combine tiled netCDF output files into single files.

        Tiles are the files in ``self.output_path`` whose names end in
        ``.nc.NNNN``.  They are grouped by their base name and one collation
        command per group is dispatched to a ``multiprocessing`` pool, each
        command being executed by ``cmdthread`` in the output directory.
        The behaviour is controlled by the ``collate`` section of the
        experiment configuration (``mpi``, ``exe``, ``flags``, ``ignore``,
        ``ncpus`` and ``threads``).

        Raises:
            AssertionError: If no collation executable could be located.
            SystemExit: If any collation command reports a return code.
        """

        # Set the stacksize to be unlimited
        res.setrlimit(res.RLIMIT_STACK, (res.RLIM_INFINITY, res.RLIM_INFINITY))

        collate_config = self.expt.config.get('collate', {})

        # The mpi flag implies using mppnccombine-fast
        mpi = collate_config.get('mpi', False)
        default_exe = 'mppnccombine-fast' if mpi else 'mppnccombine'

        # Locate the FMS collation tool
        # Check config for collate executable
        mppnc_path = collate_config.get('exe')
        if mppnc_path is None:
            if default_exe in os.listdir(self.expt.lab.bin_path):
                mppnc_path = os.path.join(self.expt.lab.bin_path,
                                          default_exe)
        elif not os.path.isabs(mppnc_path):
            mppnc_path = os.path.join(self.expt.lab.bin_path, mppnc_path)

        assert mppnc_path, 'No mppnccombine program found'

        # Check config for collate command line options
        collate_flags = collate_config.get('flags')
        if collate_flags is None:
            collate_flags = '-r' if mpi else '-n4 -z -m -r'

        if mpi:
            # The output file is the first argument after the flags
            # and mppnccombine-fast uses an explicit -o flag to specify
            # the output
            collate_flags = " ".join([collate_flags, '-o'])
            envmod.lib_update(mppnc_path, 'libmpi.so')

        # Import list of collated files to ignore
        collate_ignore = collate_config.get('ignore')
        if collate_ignore is None:
            collate_ignore = []
        elif not isinstance(collate_ignore, list):
            collate_ignore = [collate_ignore]

        # Generate collated file list and identify the first tile
        tile_fnames = sorted(
            f for f in os.listdir(self.output_path)
            if f[-4:].isdigit() and f[-8:-4] == '.nc.'
        )

        # Group the tiles by the name of the collated file they belong to
        mnc_tiles = defaultdict(list)
        for t_fname in tile_fnames:
            t_base = os.path.splitext(t_fname)[0]

            # Skip any files listed in the ignore list
            if t_base in collate_ignore:
                continue

            mnc_tiles[t_base].append(t_fname)

        cpucount = int(collate_config.get('ncpus',
                       multiprocessing.cpu_count()))

        if mpi:
            # Default to one for mpi
            nprocesses = int(collate_config.get('threads', 1))
        else:
            nprocesses = int(collate_config.get('threads', cpucount))

        ncpusperprocess = cpucount // nprocesses

        if ncpusperprocess == 1 and mpi:
            print("Warning: running collate with mpirun on a single processor")

        pool = multiprocessing.Pool(processes=nprocesses)

        # Collate each tileset into a single file
        results = []
        for nc_fname in mnc_tiles:
            nc_path = os.path.join(self.output_path, nc_fname)

            # Remove the collated file if it already exists, since it is
            # probably from a failed collation attempt
            # TODO: Validate this somehow
            if os.path.isfile(nc_path):
                os.remove(nc_path)

            cmd = ' '.join([mppnc_path, collate_flags, nc_fname,
                            ' '.join(mnc_tiles[nc_fname])])
            if mpi:
                cmd = "mpirun -n {n} {cmd}".format(
                    n=ncpusperprocess,
                    cmd=cmd
                )

            print(cmd)
            results.append(
                pool.apply_async(cmdthread, args=(cmd, self.output_path)))

        pool.close()
        pool.join()

        # Each outcome is a (return code, output) pair
        outcomes = [result.get() for result in results]

        # TODO: Categorise the return codes
        if any(rc is not None for rc, _ in outcomes):
            for p, (rc, op) in enumerate(outcomes):
                if rc is not None:
                    # The named fields need keyword arguments
                    print('payu: error: Thread {p} crashed with error code '
                          '{rc}.'.format(p=p, rc=rc), file=sys.stderr)
                    print(' Error message:', file=sys.stderr)
                    # Report the captured output of the failed command
                    if isinstance(op, bytes):
                        op = op.decode()
                    print(op, file=sys.stderr)
            sys.exit(-1)
Пример #3
0
    def run(self, *user_flags):
        """Build the MPI command line for all models, run it, and clean up.

        Values from the ``env`` configuration entry are exported through
        ``os.environ``, one program section is assembled for each model that
        has a local executable, and the resulting command is executed with
        its output redirected to ``self.stdout_fname`` and
        ``self.stderr_fname``.  The environment is dumped to
        ``self.env_fname`` before the launch and the job information to
        ``self.job_fname`` afterwards.  Finally empty files in the work
        directory are removed, profilers are post-processed, the run counter
        is updated, the logs are moved to the work directory and the
        optional ``run`` user script is executed.

        Args:
            *user_flags: Additional flags appended to the global MPI flags.

        Raises:
            SystemExit: If the command exits with a non-zero return code.
                The output files are copied to ``error_logs`` in the
                archive directory and the optional ``error`` user script is
                run before exiting.
        """

        # XXX: This was previously done in reversion
        envmod.setup()

        self.load_modules()

        # The logs are opened before anything can change the working
        # directory; the context manager closes them even if the launch
        # fails
        with open(self.stdout_fname, 'w') as f_out, \
                open(self.stderr_fname, 'w') as f_err:

            # Set MPI environment variables
            env = self.config.get('env')

            # Explicitly check for `None`, in case of an empty `env:` entry
            if env is None:
                env = {}

            for var in env:
                value = env[var]
                os.environ[var] = '' if value is None else str(value)

            use_scalasca = self.config.get('scalasca', False)

            mpi_config = self.config.get('mpi', {})
            mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

            if use_scalasca:
                mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

            # MPI runtime flags
            mpi_flags = mpi_config.get('flags', [])
            if not mpi_flags:
                mpi_flags = self.config.get('mpirun', [])
                # TODO: Legacy config removal warning

            # Always work on a new list, so that the flags appended below
            # are not written back into the configuration
            if isinstance(mpi_flags, list):
                mpi_flags = list(mpi_flags)
            else:
                mpi_flags = [mpi_flags]

            # TODO: More uniform support needed here
            if use_scalasca:
                mpi_flags = ['"{0}"'.format(f) for f in mpi_flags]

            # XXX: I think this may be broken
            if user_flags:
                mpi_flags.extend(user_flags)

            if self.debug:
                mpi_flags.append('--debug')

            # An explicitly configured MPI module applies to every model
            config_mpi_module = mpi_config.get('module', None)

            # Remains `None` if no model provides an executable
            mpi_module = None

            mpi_progs = []
            for model in self.models:

                # Skip models without executables (e.g. couplers)
                if not model.exec_path_local:
                    continue

                # Update MPI library module (if not explicitly set)
                # TODO: Check for MPI library mismatch across multiple
                # binaries
                mpi_module = config_mpi_module
                if mpi_module is None:
                    mpi_module = envmod.lib_update(
                        model.exec_path_local,
                        'libmpi.so'
                    )

                model_prog = []

                # Our MPICH wrapper does not support a working directory flag
                if not mpi_module.startswith('mvapich'):
                    model_prog.append('-wdir {0}'.format(model.work_path))

                # Append any model-specific MPI flags
                model_flags = model.config.get('mpiflags', [])
                if isinstance(model_flags, list):
                    model_prog.extend(model_flags)
                else:
                    model_prog.append(model_flags)

                model_ncpus = model.config.get('ncpus')
                if model_ncpus:
                    model_prog.append('-np {0}'.format(model_ncpus))

                model_npernode = model.config.get('npernode')
                # TODO: New Open MPI format?
                if model_npernode:
                    if model_npernode % 2 == 0:
                        # Floor division keeps the count an integer; true
                        # division would render e.g. `ppr:2.0:socket`
                        npernode_flag = ('-map-by ppr:{0}:socket'
                                         ''.format(model_npernode // 2))
                    else:
                        npernode_flag = ('-map-by ppr:{0}:node'
                                         ''.format(model_npernode))

                    if use_scalasca:
                        npernode_flag = '"{0}"'.format(npernode_flag)
                    model_prog.append(npernode_flag)

                if self.config.get('hpctoolkit', False):
                    os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
                    model_prog.append('hpcrun')

                for prof in self.profilers:
                    if prof.runscript:
                        # `list.append` returns None, so its result must
                        # not be assigned back to `model_prog`
                        model_prog.append(prof.runscript)

                model_prog.append(model.exec_prefix)

                # Use the full path to symlinked exec_name in work as some
                # older MPI libraries complained executable was not in PATH
                model_prog.append(
                    os.path.join(model.work_path, model.exec_name)
                )

                mpi_progs.append(' '.join(model_prog))

            cmd = '{runcmd} {flags} {exes}'.format(
                runcmd=mpi_runcmd,
                flags=' '.join(mpi_flags),
                exes=' : '.join(mpi_progs)
            )

            for prof in self.profilers:
                cmd = prof.wrapper(cmd)

            # Expand shell variables inside flags
            if self.expand_shell_vars:
                cmd = os.path.expandvars(cmd)

            # TODO: Consider making this default
            if self.config.get('coredump', False):
                enable_core_dump()

            # Our MVAPICH wrapper does not support working directories
            curdir = None
            if mpi_module is not None and mpi_module.startswith('mvapich'):
                curdir = os.getcwd()
                os.chdir(self.work_path)

            try:
                # Dump out environment
                with open(self.env_fname, 'w') as env_file:
                    env_file.write(
                        yaml.dump(dict(os.environ), default_flow_style=False)
                    )

                self.runlog.create_manifest()
                if self.runlog.enabled:
                    self.runlog.commit()

                # NOTE: This may not be necessary, since env seems to be
                # getting correctly updated.  Need to look into this.
                print(cmd)
                if env:
                    # TODO: Replace with mpirun -x flag inputs
                    proc = sp.Popen(shlex.split(cmd),
                                    stdout=f_out, stderr=f_err,
                                    env=os.environ.copy())
                    proc.wait()
                    rc = proc.returncode
                else:
                    rc = sp.call(shlex.split(cmd),
                                 stdout=f_out, stderr=f_err)
            finally:
                # Return to control directory
                if curdir:
                    os.chdir(curdir)

        self.finish_time = datetime.datetime.now()

        info = get_job_info()

        if info is None:
            # Not being run under PBS, reverse engineer environment
            info = {
                'PAYU_PATH': os.path.dirname(self.payu_path)
            }

        # Add extra information to save to jobinfo
        walltime = (self.finish_time - self.start_time).total_seconds()
        info.update(
            {
                'PAYU_CONTROL_DIR': self.control_path,
                'PAYU_RUN_ID': self.run_id,
                'PAYU_CURRENT_RUN': self.counter,
                'PAYU_N_RUNS': self.n_runs,
                'PAYU_JOB_STATUS': rc,
                'PAYU_START_TIME': self.start_time.isoformat(),
                'PAYU_FINISH_TIME': self.finish_time.isoformat(),
                'PAYU_WALLTIME': "{0} s".format(walltime),
            }
        )

        # Dump job info
        with open(self.job_fname, 'w') as job_file:
            job_file.write(yaml.dump(info, default_flow_style=False))

        # Remove any empty output files (e.g. logs)
        for fname in os.listdir(self.work_path):
            fpath = os.path.join(self.work_path, fname)
            if os.path.getsize(fpath) == 0:
                os.remove(fpath)

        # Clean up any profiling output
        # TODO: Move after `rc` code check?
        for prof in self.profilers:
            prof.postprocess()

        # TODO: Need a model-specific cleanup method call here
        # NOTE: This does not appear to catch hanging jobs killed by PBS
        if rc != 0:
            # Backup logs for failed runs
            error_log_dir = os.path.join(self.archive_path, 'error_logs')
            mkdir_p(error_log_dir)

            # NOTE: This is PBS-specific
            job_id = get_job_id(short=False)

            # Fall back to a prefix of the run ID when no job ID is known
            if job_id == '':
                job_id = str(self.run_id)[:6]

            for fname in self.output_fnames:

                src = os.path.join(self.control_path, fname)

                # Insert the job ID between the file stem and its extension
                stem, suffix = os.path.splitext(fname)
                dest = os.path.join(error_log_dir,
                                    ".".join((stem, job_id)) + suffix)

                print(src, dest)

                shutil.copyfile(src, dest)

            # Create the symlink to the logs if it does not exist
            make_symlink(self.archive_path, self.archive_sym_path)

            error_script = self.userscripts.get('error')
            if error_script:
                self.run_userscript(error_script)

            # Terminate payu
            sys.exit('payu: Model exited with error code {0}; aborting.'
                     ''.format(rc))

        # Decrement run counter on successful run
        stop_file_path = os.path.join(self.control_path, 'stop_run')
        if os.path.isfile(stop_file_path):
            assert os.stat(stop_file_path).st_size == 0
            os.remove(stop_file_path)
            print('payu: Stop file detected; terminating resubmission.')
            self.n_runs = 0
        else:
            self.n_runs -= 1

        # Move logs to archive (or delete if empty)
        for f in self.output_fnames:
            f_path = os.path.join(self.control_path, f)
            if os.path.getsize(f_path) == 0:
                os.remove(f_path)
            else:
                shutil.move(f_path, self.work_path)

        run_script = self.userscripts.get('run')
        if run_script:
            self.run_userscript(run_script)
Пример #4
0
    def run(self, *user_flags):
        """Build the MPI command line for all models, run it, and clean up.

        Values from the ``env`` configuration entry are exported through
        ``os.environ``, one program section is assembled for each model that
        has an executable, and the resulting command is executed with its
        output redirected to ``self.stdout_fname`` and ``self.stderr_fname``.
        Afterwards empty files in the work directory are removed, profilers
        are post-processed, the run counter is updated, the logs are moved
        to the work directory and the optional ``run`` user script is
        executed.

        Args:
            *user_flags: Additional flags appended to the global MPI flags.

        Raises:
            SystemExit: If the command exits with a non-zero return code.
                The log files are copied to ``error_logs`` in the archive
                directory before exiting.
        """

        # XXX: This was previously done in reversion
        envmod.setup()

        self.load_modules()

        # The logs are opened before anything can change the working
        # directory; the context manager closes them even if the launch
        # fails
        with open(self.stdout_fname, 'w') as f_out, \
                open(self.stderr_fname, 'w') as f_err:

            # Set MPI environment variables
            env = self.config.get('env')

            # Explicitly check for `None`, in case of an empty `env:` entry
            if env is None:
                env = {}

            for var in env:
                value = env[var]
                os.environ[var] = '' if value is None else str(value)

            use_scalasca = self.config.get('scalasca', False)

            mpi_config = self.config.get('mpi', {})
            mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

            if use_scalasca:
                mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

            # MPI runtime flags
            mpi_flags = mpi_config.get('flags', [])
            if not mpi_flags:
                mpi_flags = self.config.get('mpirun', [])
                # TODO: Legacy config removal warning

            # Always work on a new list, so that the flags appended below
            # are not written back into the configuration
            if isinstance(mpi_flags, list):
                mpi_flags = list(mpi_flags)
            else:
                mpi_flags = [mpi_flags]

            # TODO: More uniform support needed here
            if use_scalasca:
                mpi_flags = ['"{0}"'.format(f) for f in mpi_flags]

            # XXX: I think this may be broken
            if user_flags:
                mpi_flags.extend(user_flags)

            if self.debug:
                mpi_flags.append('--debug')

            # An explicitly configured MPI module applies to every model
            config_mpi_module = mpi_config.get('module', None)

            # Remains `None` if no model provides an executable
            mpi_module = None

            mpi_progs = []
            for model in self.models:

                # Skip models without executables (e.g. couplers)
                if not model.exec_path:
                    continue

                # Update MPI library module (if not explicitly set)
                # TODO: Check for MPI library mismatch across multiple
                # binaries
                mpi_module = config_mpi_module
                if mpi_module is None:
                    mpi_module = envmod.lib_update(model.exec_path,
                                                   'libmpi.so')

                model_prog = []

                # Our MPICH wrapper does not support a working directory flag
                if not mpi_module.startswith('mvapich'):
                    model_prog.append('-wdir {0}'.format(model.work_path))

                # Append any model-specific MPI flags
                model_flags = model.config.get('mpiflags', [])
                if isinstance(model_flags, list):
                    model_prog.extend(model_flags)
                else:
                    model_prog.append(model_flags)

                model_ncpus = model.config.get('ncpus')
                if model_ncpus:
                    model_prog.append('-np {0}'.format(model_ncpus))

                model_npernode = model.config.get('npernode')
                # TODO: New Open MPI format?
                if model_npernode:
                    if model_npernode % 2 == 0:
                        # Floor division keeps the count an integer; true
                        # division would render e.g. `ppr:2.0:socket`
                        npernode_flag = ('-map-by ppr:{0}:socket'
                                         ''.format(model_npernode // 2))
                    else:
                        npernode_flag = ('-map-by ppr:{0}:node'
                                         ''.format(model_npernode))

                    if use_scalasca:
                        npernode_flag = '"{0}"'.format(npernode_flag)
                    model_prog.append(npernode_flag)

                if self.config.get('hpctoolkit', False):
                    os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
                    model_prog.append('hpcrun')

                for prof in self.profilers:
                    if prof.runscript:
                        # `list.append` returns None, so its result must
                        # not be assigned back to `model_prog`
                        model_prog.append(prof.runscript)

                model_prog.append(model.exec_prefix)
                model_prog.append(model.exec_path)

                mpi_progs.append(' '.join(model_prog))

            cmd = '{runcmd} {flags} {exes}'.format(
                runcmd=mpi_runcmd,
                flags=' '.join(mpi_flags),
                exes=' : '.join(mpi_progs)
            )

            for prof in self.profilers:
                cmd = prof.wrapper(cmd)

            # Expand shell variables inside flags
            if self.expand_shell_vars:
                cmd = os.path.expandvars(cmd)

            print(cmd)

            # Our MVAPICH wrapper does not support working directories
            curdir = None
            if mpi_module is not None and mpi_module.startswith('mvapich'):
                curdir = os.getcwd()
                os.chdir(self.work_path)

            try:
                # NOTE: This may not be necessary, since env seems to be
                # getting correctly updated.  Need to look into this.
                if env:
                    # TODO: Replace with mpirun -x flag inputs
                    proc = sp.Popen(shlex.split(cmd),
                                    stdout=f_out, stderr=f_err,
                                    env=os.environ.copy())
                    proc.wait()
                    rc = proc.returncode
                else:
                    rc = sp.call(shlex.split(cmd),
                                 stdout=f_out, stderr=f_err)
            finally:
                # Return to control directory
                if curdir:
                    os.chdir(curdir)

        if self.runlog:
            self.runlog.commit()

        # Remove any empty output files (e.g. logs)
        for fname in os.listdir(self.work_path):
            fpath = os.path.join(self.work_path, fname)
            if os.path.getsize(fpath) == 0:
                os.remove(fpath)

        # Clean up any profiling output
        # TODO: Move after `rc` code check?
        for prof in self.profilers:
            prof.postprocess()

        # TODO: Need a model-specific cleanup method call here
        # NOTE: This does not appear to catch hanging jobs killed by PBS
        if rc != 0:
            # Backup logs for failed runs
            error_log_dir = os.path.join(self.archive_path, 'error_logs')
            mkdir_p(error_log_dir)

            # NOTE: This is PBS-specific
            job_id = os.environ.get('PBS_JOBID', '')

            for fname in (self.stdout_fname, self.stderr_fname):
                src = os.path.join(self.control_path, fname)

                # Insert the job ID between the file stem and its extension
                stem, suffix = os.path.splitext(fname)
                dest = os.path.join(error_log_dir,
                                    stem + '.' + job_id + suffix)
                print(src, dest)

                shutil.copyfile(src, dest)

            # Create the symlink to the logs if it does not exist
            make_symlink(self.archive_path, self.archive_sym_path)

            # Terminate payu
            sys.exit('payu: Model exited with error code {0}; aborting.'
                     ''.format(rc))

        # Decrement run counter on successful run
        stop_file_path = os.path.join(self.control_path, 'stop_run')
        if os.path.isfile(stop_file_path):
            assert os.stat(stop_file_path).st_size == 0
            os.remove(stop_file_path)
            print('payu: Stop file detected; terminating resubmission.')
            self.n_runs = 0
        else:
            self.n_runs -= 1

        # Move logs to archive (or delete if empty)
        for f in (self.stdout_fname, self.stderr_fname):
            f_path = os.path.join(self.control_path, f)
            if os.path.getsize(f_path) == 0:
                os.remove(f_path)
            else:
                shutil.move(f_path, self.work_path)

        run_script = self.userscripts.get('run')
        if run_script:
            self.run_userscript(run_script)
Пример #5
0
    def collate(self):
        """Combine tiled netCDF files into single files.

        The uncollated files reported by ``Fms.get_uncollated_files`` for
        ``self.output_path`` (and for ``self.prior_restart_path`` when
        restart collation is enabled) are grouped per directory and per base
        name.  One collation command per group is dispatched to a
        ``multiprocessing`` pool, each command being executed by
        ``cmdthread`` in the directory holding the tiles.  The behaviour is
        controlled by the ``collate`` section of the experiment
        configuration (``mpi``, ``exe``, ``flags``, ``ignore``, ``restart``,
        ``glob``, ``ncpus`` and ``threads``).  Commands that report a return
        code are listed on standard error.

        Raises:
            AssertionError: If no collation executable could be located.
        """

        # Set the stacksize to be unlimited
        res.setrlimit(res.RLIMIT_STACK, (res.RLIM_INFINITY, res.RLIM_INFINITY))

        collate_config = self.expt.config.get('collate', {})

        # The mpi flag implies using mppnccombine-fast
        mpi = collate_config.get('mpi', False)

        if mpi:
            # Must use envmod to be able to load mpi modules for collation
            envmod.setup()
            self.expt.load_modules()
            default_exe = 'mppnccombine-fast'
        else:
            default_exe = 'mppnccombine'

        # Locate the FMS collation tool
        # Check config for collate executable
        mppnc_path = collate_config.get('exe')
        if mppnc_path is None:
            if default_exe in os.listdir(self.expt.lab.bin_path):
                mppnc_path = os.path.join(self.expt.lab.bin_path,
                                          default_exe)
        elif not os.path.isabs(mppnc_path):
            mppnc_path = os.path.join(self.expt.lab.bin_path, mppnc_path)

        assert mppnc_path, 'No mppnccombine program found'

        # Check config for collate command line options
        collate_flags = collate_config.get('flags')
        if collate_flags is None:
            collate_flags = '-r' if mpi else '-n4 -z -m -r'

        if mpi:
            # The output file is the first argument after the flags
            # and mppnccombine-fast uses an explicit -o flag to specify
            # the output
            collate_flags = " ".join([collate_flags, '-o'])
            envmod.lib_update(mppnc_path, 'libmpi.so')

        # Import list of collated files to ignore
        collate_ignore = collate_config.get('ignore')
        if collate_ignore is None:
            collate_ignore = []
        elif not isinstance(collate_ignore, list):
            collate_ignore = [collate_ignore]

        # Generate collated file list and identify the first tile
        tile_fnames = {}
        fnames = Fms.get_uncollated_files(self.output_path)
        tile_fnames[self.output_path] = fnames

        print(tile_fnames)

        if (collate_config.get('restart', False) and
                self.prior_restart_path is not None):
            # Add uncollated restart files
            fnames = Fms.get_uncollated_files(self.prior_restart_path)
            tile_fnames[self.prior_restart_path] = fnames

        # Two-level mapping: directory -> collated file name -> tiles
        mnc_tiles = defaultdict(lambda: defaultdict(list))
        for t_dir in tile_fnames:
            for t_fname in tile_fnames[t_dir]:
                t_base = os.path.splitext(t_fname)[0]

                # Skip any files listed in the ignore list
                if t_base in collate_ignore:
                    continue

                mnc_tiles[t_dir][t_base].append(t_fname)

        if mpi and collate_config.get('glob', True):
            # The tiles are keyed by directory first, so both levels have
            # to be walked and each glob checked against its own directory
            for t_dir in mnc_tiles:
                dir_fnames = os.listdir(t_dir)
                for t_base in mnc_tiles[t_dir]:
                    tiles = mnc_tiles[t_dir][t_base]
                    globstr = "{}.*".format(t_base)
                    # Try an equivalent glob and check the same files are
                    # returned
                    mnc_glob = fnmatch.filter(dir_fnames, globstr)
                    if sorted(tiles) == sorted(mnc_glob):
                        mnc_tiles[t_dir][t_base] = [globstr, ]
                        print("Note: using globstr ({}) for collating {}"
                              .format(globstr, t_base))
                    else:
                        print("Warning: cannot use globstr {} to collate {}"
                              .format(globstr, t_base))
                        if len(tiles) > MPI_FORK_MAX_FILE_LIMIT:
                            print("Warning: large number of tiles: {} "
                                  .format(len(tiles)))
                            print("Warning: collation will be slow and may "
                                  "fail")

        cpucount = int(collate_config.get('ncpus',
                       multiprocessing.cpu_count()))

        if mpi:
            # Default to one for mpi
            nprocesses = int(collate_config.get('threads', 1))
        else:
            nprocesses = int(collate_config.get('threads', cpucount))

        ncpusperprocess = cpucount // nprocesses

        if ncpusperprocess == 1 and mpi:
            print("Warning: running collate with mpirun on a single processor")

        pool = multiprocessing.Pool(processes=nprocesses)

        # Collate each tileset into a single file
        results = []
        codes = []
        outputs = []
        for output_path in mnc_tiles:
            for nc_fname in mnc_tiles[output_path]:
                nc_path = os.path.join(output_path, nc_fname)

                # Remove the collated file if it already exists, since it is
                # probably from a failed collation attempt
                # TODO: Validate this somehow
                if os.path.isfile(nc_path):
                    os.remove(nc_path)

                cmd = ' '.join([mppnc_path, collate_flags, nc_fname,
                                ' '.join(mnc_tiles[output_path][nc_fname])])
                if mpi:
                    cmd = "mpirun -n {} {}".format(ncpusperprocess, cmd)

                print(cmd)
                results.append(
                    pool.apply_async(cmdthread, args=(cmd, output_path)))

        pool.close()
        pool.join()

        # Each result is a (return code, output) pair
        for result in results:
            rc, op = result.get()
            codes.append(rc)
            outputs.append(op)

        # TODO: Categorise the return codes
        if any(rc is not None for rc in codes):
            for p, (rc, op) in enumerate(zip(codes, outputs)):
                if rc is not None:
                    print('payu: error: Thread {p} crashed with error code '
                          '{rc}.'.format(p=p, rc=rc), file=sys.stderr)
                    print(' Error message:', file=sys.stderr)
                    print(op.decode(), file=sys.stderr)
            sys.exit(-1)
Пример #6
0
    def run(self, *user_flags):
        """Launch the model executables under MPI and finalise the run.

        Exports the configured ``env`` variables, assembles one launcher
        command covering every model that has an executable, dumps the
        environment to ``self.env_fname``, runs the command with
        stdout/stderr redirected to the experiment log files, writes the job
        information to ``self.job_fname``, removes empty files from the work
        directory and updates the remaining-run counter.

        Args:
            *user_flags: Extra flags appended verbatim to the MPI launcher
                flags.

        Raises:
            SystemExit: If the launched command exits with a non-zero
                status; the log files are first copied to ``error_logs``
                under the archive path.
        """

        # XXX: This was previously done in reversion
        envmod.setup()

        self.load_modules()

        f_out = open(self.stdout_fname, 'w')
        f_err = open(self.stderr_fname, 'w')

        # Set MPI environment variables
        env = self.config.get('env')

        # Explicitly check for `None`, in case of an empty `env:` entry
        if env is None:
            env = {}

        for var in env:

            if env[var] is None:
                env_value = ''
            else:
                env_value = str(env[var])

            os.environ[var] = env_value

        # Tolerate an empty `mpi:` entry in the same way as an empty `env:`
        mpi_config = self.config.get('mpi') or {}
        mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

        use_scalasca = self.config.get('scalasca', False)
        if use_scalasca:
            mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

        # MPI runtime flags
        mpi_flags = mpi_config.get('flags', [])
        if not mpi_flags:
            mpi_flags = self.config.get('mpirun', [])
            # TODO: Legacy config removal warning

        if mpi_flags is None:
            # Correct an empty `mpirun:` entry
            mpi_flags = []
        elif isinstance(mpi_flags, list):
            # Work on a copy so the flags appended below do not leak back
            # into the configuration object
            mpi_flags = list(mpi_flags)
        else:
            mpi_flags = [mpi_flags]

        # TODO: More uniform support needed here
        if use_scalasca:
            mpi_flags = ['\"{0}\"'.format(f) for f in mpi_flags]

        # XXX: I think this may be broken
        if user_flags:
            mpi_flags.extend(list(user_flags))

        if self.debug:
            mpi_flags.append('--debug')

        # Stays `None` if no model provides an executable, so the MVAPICH
        # check after the loop cannot hit an unbound name
        mpi_module = None

        mpi_progs = []
        for model in self.models:

            # Skip models without executables (e.g. couplers)
            if not model.exec_path_local:
                continue

            mpi_module = mpi_config.get('module', None)

            # Update MPI library module (if not explicitly set)
            # TODO: Check for MPI library mismatch across multiple binaries
            if mpi_module is None:
                mpi_module = envmod.lib_update(
                    model.exec_path_local,
                    'libmpi.so'
                )

            model_prog = []

            # Our MPICH wrapper does not support a working directory flag
            if not mpi_module.startswith('mvapich'):
                model_prog.append('-wdir {0}'.format(model.work_path))

            # Append any model-specific MPI flags
            model_flags = model.config.get('mpiflags', [])
            if not isinstance(model_flags, list):
                model_prog.append(model_flags)
            else:
                model_prog.extend(model_flags)

            model_ncpus = model.config.get('ncpus')
            if model_ncpus:
                model_prog.append('-np {0}'.format(model_ncpus))

            model_npernode = model.config.get('npernode')
            # TODO: New Open MPI format?
            if model_npernode:
                if model_npernode % 2 == 0:
                    # Floor division keeps the count an integer; true
                    # division would render e.g. `ppr:8.0:socket`
                    npernode_flag = ('-map-by ppr:{0}:socket'
                                     ''.format(model_npernode // 2))
                else:
                    npernode_flag = ('-map-by ppr:{0}:node'
                                     ''.format(model_npernode))

                if use_scalasca:
                    npernode_flag = '\"{0}\"'.format(npernode_flag)
                model_prog.append(npernode_flag)

            if self.config.get('hpctoolkit', False):
                os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
                model_prog.append('hpcrun')

            for prof in self.profilers:
                if prof.runscript:
                    # `list.append` returns None, so its result must not be
                    # assigned back to `model_prog`
                    model_prog.append(prof.runscript)

            model_prog.append(model.exec_prefix)

            # Use the full path to symlinked exec_name in work as some
            # older MPI libraries complained executable was not in PATH
            model_prog.append(os.path.join(model.work_path, model.exec_name))

            mpi_progs.append(' '.join(model_prog))

        cmd = '{runcmd} {flags} {exes}'.format(
            runcmd=mpi_runcmd,
            flags=' '.join(mpi_flags),
            exes=' : '.join(mpi_progs)
        )

        for prof in self.profilers:
            cmd = prof.wrapper(cmd)

        # Expand shell variables inside flags
        if self.expand_shell_vars:
            cmd = os.path.expandvars(cmd)

        # TODO: Consider making this default
        if self.config.get('coredump', False):
            enable_core_dump()

        # Our MVAPICH wrapper does not support working directories
        if mpi_module is not None and mpi_module.startswith('mvapich'):
            curdir = os.getcwd()
            os.chdir(self.work_path)
        else:
            curdir = None

        # Dump out environment
        with open(self.env_fname, 'w') as file:
            file.write(yaml.dump(dict(os.environ), default_flow_style=False))

        self.runlog.create_manifest()
        if self.runlog.enabled:
            self.runlog.commit()

        # NOTE: This may not be necessary, since env seems to be getting
        # correctly updated.  Need to look into this.
        print(cmd)
        if env:
            # TODO: Replace with mpirun -x flag inputs
            proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                            env=os.environ.copy())
            proc.wait()
            rc = proc.returncode
        else:
            rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

        # Return to control directory
        if curdir:
            os.chdir(curdir)

        f_out.close()
        f_err.close()

        self.finish_time = datetime.datetime.now()

        info = get_job_info()

        if info is None:
            # Not being run under PBS, reverse engineer environment
            info = {
                'PAYU_PATH': os.path.dirname(self.payu_path)
            }

        # Add extra information to save to jobinfo
        info.update(
            {
                'PAYU_CONTROL_DIR': self.control_path,
                'PAYU_RUN_ID': self.run_id,
                'PAYU_CURRENT_RUN': self.counter,
                'PAYU_N_RUNS':  self.n_runs,
                'PAYU_JOB_STATUS': rc,
                'PAYU_START_TIME': self.start_time.isoformat(),
                'PAYU_FINISH_TIME': self.finish_time.isoformat(),
                'PAYU_WALLTIME': "{0} s".format(
                    (self.finish_time - self.start_time).total_seconds()
                ),
            }
        )

        # Dump job info
        with open(self.job_fname, 'w') as file:
            file.write(yaml.dump(info, default_flow_style=False))

        # Remove any empty output files (e.g. logs)
        for fname in os.listdir(self.work_path):
            fpath = os.path.join(self.work_path, fname)
            if os.path.getsize(fpath) == 0:
                os.remove(fpath)

        # Clean up any profiling output
        # TODO: Move after `rc` code check?
        for prof in self.profilers:
            prof.postprocess()

        # TODO: Need a model-specific cleanup method call here
        # NOTE: This does not appear to catch hanging jobs killed by PBS
        if rc != 0:
            # Backup logs for failed runs
            error_log_dir = os.path.join(self.archive_path, 'error_logs')
            mkdir_p(error_log_dir)

            # NOTE: This is PBS-specific
            job_id = get_job_id(short=False)

            if job_id == '':
                job_id = self.run_id[:6]

            for fname in self.output_fnames:

                src = os.path.join(self.control_path, fname)

                # Insert the job id between the file stem and its extension
                stem, suffix = os.path.splitext(fname)
                dest = os.path.join(error_log_dir,
                                    ".".join((stem, job_id)) + suffix)

                print(src, dest)

                shutil.copyfile(src, dest)

            # Create the symlink to the logs if it does not exist
            make_symlink(self.archive_path, self.archive_sym_path)

            # Terminate payu
            sys.exit('payu: Model exited with error code {0}; aborting.'
                     ''.format(rc))

        # Decrement run counter on successful run
        stop_file_path = os.path.join(self.control_path, 'stop_run')
        if os.path.isfile(stop_file_path):
            assert os.stat(stop_file_path).st_size == 0
            os.remove(stop_file_path)
            print('payu: Stop file detected; terminating resubmission.')
            self.n_runs = 0
        else:
            self.n_runs -= 1

        # Move logs to archive (or delete if empty)
        for f in self.output_fnames:
            f_path = os.path.join(self.control_path, f)
            if os.path.getsize(f_path) == 0:
                os.remove(f_path)
            else:
                shutil.move(f_path, self.work_path)

        run_script = self.userscripts.get('run')
        if run_script:
            self.run_userscript(run_script)
Пример #7
0
    def collate(self):
        """Combine the per-tile FMS output files into single files.

        Uncollated files in ``self.output_path`` (and, when the ``restart``
        collate option is set, in ``self.prior_restart_path``) are grouped by
        directory and base name.  One collation command is built per group
        and submitted to a ``multiprocessing`` pool through ``cmdthread``.
        With the ``mpi`` collate option the default executable becomes
        ``mppnccombine-fast`` and each command is launched through
        ``mpirun``.

        Raises:
            AssertionError: If no collation executable could be located.
            SystemExit: With status -1 if any worker returns a return code
                other than ``None``.
        """

        # Set the stacksize to be unlimited
        res.setrlimit(res.RLIMIT_STACK, (res.RLIM_INFINITY, res.RLIM_INFINITY))

        collate_config = self.expt.config.get('collate', {})

        # The mpi flag implies using mppnccombine-fast
        mpi = collate_config.get('mpi', False)

        if mpi:
            # Must use envmod to be able to load mpi modules for collation
            envmod.setup()
            self.expt.load_modules()
            default_exe = 'mppnccombine-fast'
        else:
            default_exe = 'mppnccombine'

        # Locate the FMS collation tool
        # Check config for collate executable
        mppnc_path = collate_config.get('exe')
        if mppnc_path is None:
            for f in os.listdir(self.expt.lab.bin_path):
                if f == default_exe:
                    mppnc_path = os.path.join(self.expt.lab.bin_path, f)
                    break
        else:
            # Relative executable names are resolved against the lab bin path
            if not os.path.isabs(mppnc_path):
                mppnc_path = os.path.join(self.expt.lab.bin_path, mppnc_path)

        assert mppnc_path, 'No mppnccombine program found'

        # Check config for collate command line options
        collate_flags = collate_config.get('flags')
        if collate_flags is None:
            if mpi:
                collate_flags = '-r'
            else:
                collate_flags = '-n4 -z -m -r'

        if mpi:
            # The output file is the first argument after the flags
            # and mppnccombine-fast uses an explicit -o flag to specify
            # the output
            collate_flags = " ".join([collate_flags, '-o'])
            envmod.lib_update(mppnc_path, 'libmpi.so')

        # Import list of collated files to ignore
        collate_ignore = collate_config.get('ignore')
        if collate_ignore is None:
            collate_ignore = []
        elif not isinstance(collate_ignore, list):
            collate_ignore = [collate_ignore]

        # Generate collated file list and identify the first tile
        tile_fnames = {}
        fnames = Fms.get_uncollated_files(self.output_path)
        tile_fnames[self.output_path] = fnames

        print(tile_fnames)

        if (collate_config.get('restart', False) and
                self.prior_restart_path is not None):
            # Add uncollated restart files
            fnames = Fms.get_uncollated_files(self.prior_restart_path)
            tile_fnames[self.prior_restart_path] = fnames

        # Nested mapping: directory -> collated base name -> tile file names
        mnc_tiles = defaultdict(defaultdict(list).copy)
        for t_dir in tile_fnames:
            for t_fname in tile_fnames[t_dir]:
                # The base name (tile suffix stripped) is the collated file
                t_base = os.path.splitext(t_fname)[0]

                # Skip any files listed in the ignore list
                if t_base in collate_ignore:
                    continue

                mnc_tiles[t_dir][t_base].append(t_fname)

        if mpi and collate_config.get('glob', True):
            # `mnc_tiles` is keyed by directory first, so the glob check has
            # to walk both levels and list the directory the tiles live in
            for t_dir, dir_tiles in mnc_tiles.items():
                dir_listing = os.listdir(t_dir)
                for t_base in dir_tiles:
                    globstr = "{}.*".format(t_base)
                    # Try an equivalent glob and check the same files are
                    # returned
                    mnc_glob = fnmatch.filter(dir_listing, globstr)
                    if sorted(dir_tiles[t_base]) == sorted(mnc_glob):
                        # NOTE(review): the pattern is passed on unexpanded;
                        # presumably the collation tool expands it - confirm
                        dir_tiles[t_base] = [globstr, ]
                        print("Note: using globstr ({}) for collating {}"
                              .format(globstr, t_base))
                    else:
                        print("Warning: cannot use globstr {} to collate {}"
                              .format(globstr, t_base))
                        n_tiles = len(dir_tiles[t_base])
                        if n_tiles > MPI_FORK_MAX_FILE_LIMIT:
                            print("Warning: large number of tiles: {} "
                                  .format(n_tiles))
                            print("Warning: collation will be slow and may "
                                  "fail")

        cpucount = int(collate_config.get('ncpus',
                       multiprocessing.cpu_count()))

        if mpi:
            # Default to one for mpi
            nprocesses = int(collate_config.get('threads', 1))
        else:
            nprocesses = int(collate_config.get('threads', cpucount))

        ncpusperprocess = cpucount // nprocesses

        if ncpusperprocess == 1 and mpi:
            print("Warning: running collate with mpirun on a single processor")

        pool = multiprocessing.Pool(processes=nprocesses)

        # Collate each tileset into a single file
        results = []
        codes = []
        outputs = []
        for output_path in mnc_tiles:
            for nc_fname in mnc_tiles[output_path]:
                nc_path = os.path.join(output_path, nc_fname)

                # Remove the collated file if it already exists, since it is
                # probably from a failed collation attempt
                # TODO: Validate this somehow
                if os.path.isfile(nc_path):
                    os.remove(nc_path)

                cmd = ' '.join([mppnc_path, collate_flags, nc_fname,
                                ' '.join(mnc_tiles[output_path][nc_fname])])
                if mpi:
                    cmd = "mpirun -n {} {}".format(ncpusperprocess, cmd)

                print(cmd)
                results.append(
                    pool.apply_async(cmdthread, args=(cmd, output_path)))

        pool.close()
        pool.join()

        for result in results:
            rc, op = result.get()
            codes.append(rc)
            outputs.append(op)

        # TODO: Categorise the return codes
        if any(rc is not None for rc in codes):
            for p, (rc, op) in enumerate(zip(codes, outputs)):
                if rc is not None:
                    print('payu: error: Thread {p} crashed with error code '
                          '{rc}.'.format(p=p, rc=rc), file=sys.stderr)
                    print(' Error message:', file=sys.stderr)
                    print(op.decode(), file=sys.stderr)
            sys.exit(-1)
Пример #8
0
    def run(self, *user_flags):
        """Launch the model executables under MPI and finalise the run.

        Exports the configured ``env`` variables, assembles one launcher
        command covering every model that has an executable (optionally
        wrapped in an OpenSpeedShop command), runs it with stdout/stderr
        redirected to the experiment log files, removes empty files from the
        work directory and updates the remaining-run counter.

        Args:
            *user_flags: Extra flags appended verbatim to the MPI launcher
                flags.

        Raises:
            SystemExit: If the OpenSpeedShop configuration is incomplete or
                the launched command exits with a non-zero status.
        """

        self.load_modules()

        f_out = open(self.stdout_fname, 'w')
        f_err = open(self.stderr_fname, 'w')

        # Set MPI environment variables
        env = self.config.get('env')

        # Explicitly check for `None`, in case of an empty `env:` entry
        if env is None:
            env = {}

        for var in env:

            if env[var] is None:
                env_value = ''
            else:
                env_value = str(env[var])

            os.environ[var] = env_value

        # Tolerate an empty `mpi:` entry in the same way as an empty `env:`
        mpi_config = self.config.get('mpi') or {}
        mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

        use_scalasca = self.config.get('scalasca', False)
        if use_scalasca:
            mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

        mpi_flags = self.config.get('mpirun')
        if mpi_flags is None:
            # Correct an empty mpirun entry
            mpi_flags = []
        elif isinstance(mpi_flags, list):
            # Work on a copy so the flags appended below do not leak back
            # into the configuration object
            mpi_flags = list(mpi_flags)
        else:
            mpi_flags = [mpi_flags]

        # TODO: More uniform support needed here
        if use_scalasca:
            mpi_flags = ['\"{}\"'.format(f) for f in mpi_flags]

        # XXX: I think this may be broken
        if user_flags:
            mpi_flags.extend(list(user_flags))

        if self.debug:
            mpi_flags.append('--debug')

        # Stays `None` if no model provides an executable, so the MVAPICH
        # check after the loop cannot hit an unbound name
        mpi_module = None

        mpi_progs = []
        for model in self.models:

            # Skip models without executables (e.g. couplers)
            if not model.exec_path:
                continue

            mpi_module = mpi_config.get('module', None)

            # Update MPI library module (if not explicitly set)
            # TODO: Check for MPI library mismatch across multiple binaries
            if mpi_module is None:
                mpi_module = envmod.lib_update(model.exec_path, 'libmpi.so')

            model_prog = []

            # Our MPICH wrapper does not support a working directory flag
            if not mpi_module.startswith('mvapich'):
                model_prog.append('-wdir {}'.format(model.work_path))

            # Append any model-specific MPI flags
            model_flags = model.config.get('mpiflags', [])
            if not isinstance(model_flags, list):
                model_prog.append(model_flags)
            else:
                model_prog.extend(model_flags)

            model_ncpus = model.config.get('ncpus')
            if model_ncpus:
                model_prog.append('-np {}'.format(model_ncpus))

            model_npernode = model.config.get('npernode')
            # TODO: New Open MPI format?
            if model_npernode:
                if model_npernode % 2 == 0:
                    # Floor division keeps the count an integer; true
                    # division would render e.g. `-npersocket 8.0`
                    npernode_flag = '-npersocket {}'.format(
                        model_npernode // 2)
                else:
                    npernode_flag = '-npernode {}'.format(model_npernode)

                if use_scalasca:
                    npernode_flag = '\"{}\"'.format(npernode_flag)
                model_prog.append(npernode_flag)

            if self.config.get('hpctoolkit', False):
                os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
                model_prog.append('hpcrun')

            for prof in self.profilers:
                if prof.wrapper:
                    model_prog.append(prof.wrapper)

            model_prog.append(model.exec_prefix)
            model_prog.append(model.exec_path)

            mpi_progs.append(' '.join(model_prog))

        cmd = '{} {} {}'.format(mpi_runcmd, ' '.join(mpi_flags),
                                ' : '.join(mpi_progs))

        oss = self.config.get('openspeedshop')
        if oss:
            oss_runcmd = oss.get('runcmd')
            if not oss_runcmd:
                print('payu: error: OpenSpeedShop requires an executable.')
                sys.exit(1)

            oss_hwc = oss.get('hwc')
            if oss_runcmd.startswith('osshwc') and not oss_hwc:
                print('payu: error: This OSS command requires hardware '
                      'counters.')
                sys.exit(1)

            if oss_hwc:
                cmd = '{} "{}" {}'.format(oss_runcmd, cmd, oss_hwc)
            else:
                # Without counters, avoid passing a literal "None" argument
                cmd = '{} "{}"'.format(oss_runcmd, cmd)

        print(cmd)

        # Our MVAPICH wrapper does not support working directories
        if mpi_module is not None and mpi_module.startswith('mvapich'):
            curdir = os.getcwd()
            os.chdir(self.work_path)
        else:
            curdir = None

        if env:
            # TODO: Replace with mpirun -x flag inputs
            proc = sp.Popen(shlex.split(cmd),
                            stdout=f_out,
                            stderr=f_err,
                            env=os.environ.copy())
            proc.wait()
            rc = proc.returncode
        else:
            rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

        # Return to control directory
        if curdir:
            os.chdir(curdir)

        if self.runlog:
            self.runlog.commit()

        f_out.close()
        f_err.close()

        # Remove any empty output files (e.g. logs)
        for fname in os.listdir(self.work_path):
            fpath = os.path.join(self.work_path, fname)
            if os.path.getsize(fpath) == 0:
                os.remove(fpath)

        # Clean up any profiling output
        # TODO: Move after `rc` code check?
        for prof in self.profilers:
            prof.postprocess()

        # TODO: Need a model-specific cleanup method call here
        if rc != 0:
            sys.exit('payu: Model exited with error code {}; aborting.'
                     ''.format(rc))

        # Decrement run counter on successful run
        stop_file_path = os.path.join(self.control_path, 'stop_run')
        if os.path.isfile(stop_file_path):
            assert os.stat(stop_file_path).st_size == 0
            os.remove(stop_file_path)
            print('payu: Stop file detected; terminating resubmission.')
            self.n_runs = 0
        else:
            self.n_runs -= 1

        # Move logs to archive (or delete if empty)
        for f in (self.stdout_fname, self.stderr_fname):
            f_path = os.path.join(self.control_path, f)
            if os.path.getsize(f_path) == 0:
                os.remove(f_path)
            else:
                shutil.move(f_path, self.work_path)

        run_script = self.userscripts.get('run')
        if run_script:
            self.run_userscript(run_script)
Пример #9
0
    def run(self, *user_flags):
        """Launch the model executables under MPI and finalise the run.

        Exports the configured ``env`` variables, assembles one launcher
        command covering every model that has an executable, runs it with
        stdout/stderr redirected to the experiment log files, removes empty
        files from the work directory and updates the remaining-run counter.

        Args:
            *user_flags: Extra flags appended verbatim to the MPI launcher
                flags.

        Raises:
            SystemExit: If the launched command exits with a non-zero
                status.
        """

        self.load_modules()

        f_out = open(self.stdout_fname, 'w')
        f_err = open(self.stderr_fname, 'w')

        # Set MPI environment variables
        env = self.config.get('env')

        # Explicitly check for `None`, in case of an empty `env:` entry
        if env is None:
            env = {}

        for var in env:

            if env[var] is None:
                env_value = ''
            else:
                env_value = str(env[var])

            os.environ[var] = env_value

        # Tolerate an empty `mpi:` entry in the same way as an empty `env:`
        mpi_config = self.config.get('mpi') or {}
        mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

        use_scalasca = self.config.get('scalasca', False)
        if use_scalasca:
            mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

        mpi_flags = self.config.get('mpirun')
        if mpi_flags is None:
            # Correct an empty mpirun entry
            mpi_flags = []
        elif isinstance(mpi_flags, list):
            # Work on a copy so the flags appended below do not leak back
            # into the configuration object
            mpi_flags = list(mpi_flags)
        else:
            mpi_flags = [mpi_flags]

        # TODO: More uniform support needed here
        if use_scalasca:
            mpi_flags = ['\"{}\"'.format(f) for f in mpi_flags]

        # XXX: I think this may be broken
        if user_flags:
            mpi_flags.extend(list(user_flags))

        if self.debug:
            mpi_flags.append('--debug')

        # Stays `None` if no model provides an executable, so the MVAPICH
        # check after the loop cannot hit an unbound name
        mpi_module = None

        mpi_progs = []
        for model in self.models:

            # Skip models without executables (e.g. couplers)
            if not model.exec_path:
                continue

            mpi_module = mpi_config.get('module', None)

            # Update MPI library module (if not explicitly set)
            # TODO: Check for MPI library mismatch across multiple binaries
            if mpi_module is None:
                mpi_module = envmod.lib_update(model.exec_path, 'libmpi.so')

            model_prog = []

            # Our MPICH wrapper does not support a working directory flag
            if not mpi_module.startswith('mvapich'):
                model_prog.append('-wdir {}'.format(model.work_path))

            # Append any model-specific MPI flags
            model_flags = model.config.get('mpiflags', [])
            if not isinstance(model_flags, list):
                model_prog.append(model_flags)
            else:
                model_prog.extend(model_flags)

            model_ncpus = model.config.get('ncpus')
            if model_ncpus:
                model_prog.append('-np {}'.format(model_ncpus))

            model_npernode = model.config.get('npernode')
            # TODO: New Open MPI format?
            if model_npernode:
                if model_npernode % 2 == 0:
                    # Floor division keeps the count an integer; true
                    # division would render e.g. `-npersocket 8.0`
                    npernode_flag = '-npersocket {}'.format(
                        model_npernode // 2)
                else:
                    npernode_flag = '-npernode {}'.format(model_npernode)

                if use_scalasca:
                    npernode_flag = '\"{}\"'.format(npernode_flag)
                model_prog.append(npernode_flag)

            if self.config.get('hpctoolkit', False):
                os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
                model_prog.append('hpcrun')

            for prof in self.profilers:
                # A profiler without a wrapper contributes nothing; appending
                # `None` would make the join below raise TypeError
                if prof.wrapper:
                    model_prog.append(prof.wrapper)

            model_prog.append(model.exec_prefix)
            model_prog.append(model.exec_path)

            mpi_progs.append(' '.join(model_prog))

        cmd = '{} {} {}'.format(mpi_runcmd,
                                ' '.join(mpi_flags),
                                ' : '.join(mpi_progs))
        print(cmd)

        # Our MVAPICH wrapper does not support working directories
        if mpi_module is not None and mpi_module.startswith('mvapich'):
            curdir = os.getcwd()
            os.chdir(self.work_path)
        else:
            curdir = None

        if env:
            # TODO: Replace with mpirun -x flag inputs
            proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                            env=os.environ.copy())
            proc.wait()
            rc = proc.returncode
        else:
            rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

        # Return to control directory
        if curdir:
            os.chdir(curdir)

        if self.runlog:
            self.runlog.commit()

        f_out.close()
        f_err.close()

        # Remove any empty output files (e.g. logs)
        for fname in os.listdir(self.work_path):
            fpath = os.path.join(self.work_path, fname)
            if os.path.getsize(fpath) == 0:
                os.remove(fpath)

        # Clean up any profiling output
        # TODO: Move after `rc` code check?
        for prof in self.profilers:
            prof.postprocess()

        # TODO: Need a model-specific cleanup method call here
        if rc != 0:
            sys.exit('payu: Model exited with error code {}; aborting.'
                     ''.format(rc))

        # Decrement run counter on successful run
        stop_file_path = os.path.join(self.control_path, 'stop_run')
        if os.path.isfile(stop_file_path):
            assert os.stat(stop_file_path).st_size == 0
            os.remove(stop_file_path)
            print('payu: Stop file detected; terminating resubmission.')
            self.n_runs = 0
        else:
            self.n_runs -= 1

        # Move logs to archive (or delete if empty)
        for f in (self.stdout_fname, self.stderr_fname):
            f_path = os.path.join(self.control_path, f)
            if os.path.getsize(f_path) == 0:
                os.remove(f_path)
            else:
                shutil.move(f_path, self.work_path)

        run_script = self.userscripts.get('run')
        if run_script:
            self.run_userscript(run_script)