示例#1
0
 def clear_rundir(self):
     """Remove the run directory and recreate it empty.

     If `rm -r` exits with an error the failure is logged as a warning
     (the message assumes the directory was missing) and the directory
     is created anyway.
     """
     rundir = self.rundir
     try:
         sh.rm(['-r', rundir])
     except sh.ErrorReturnCode:
         message = 'Tried to remove run directory but it doesnt exist'
         self.log.warning(message)
     mkdir(rundir)
     self.log.info('Emptied run directory %r' % rundir)
示例#2
0
    def run(self,
            i,
            restart_file=None,
            use_restart=True,
            multi_node=False,
            num_cores=8,
            overwrite_data=False,
            save_run=False,
            run_idb=False,
            nice_score=0):
        """Run iteration `i` of the model and archive its output.

        The run directory is emptied, populated with the namelist, field
        table, diag table, input files and (optionally) an extracted restart
        archive, then a `run.sh` script is rendered from the template and
        executed with bash.  After a successful run the output is copied to
        the data directory, a restart archive is written for run `i`, and
        the run directory is emptied again.

        Args:
            i: Index of this run.  Used with `self.runfmt` to name the output
                directory and with `self.get_restart_file` to locate restart
                archives.
            restart_file (optional): A path to a valid restart archive.  If
                None and `use_restart=True`, restart file (i-1) will be used.
            use_restart: If False the model is run without a restart file.
            multi_node: If True, ' -bootstrap pbsdsh -f $PBS_NODEFILE' is
                added to the mpirun options given to the run script template.
            num_cores: Number of mpi cores to distribute over.  When greater
                than 1 the per-core output and restart fragments are merged
                with the `mppnccombine_run.sh` script.
            overwrite_data: If True, existing output for run `i` is deleted
                first; if False and output exists, nothing is run.
            save_run: If True, copy the entire working directory over to the
                output directory so that the run can rerun without the python
                script.  (This uses a lot of data storage!)
            run_idb: Passed through to the `run.sh` template.
            nice_score: Passed through to the `run.sh` template.

        Returns:
            True when the run completed and its data were archived; False
            when output for run `i` already exists and `overwrite_data` is
            False.

        Raises:
            IOError: `use_restart` is True and the restart file is missing.
            FailedRunError: the run script exited with an error.
            KeyboardInterrupt: re-raised after the model process has been
                terminated.
        """

        self.clear_rundir()

        indir = P(self.rundir, 'INPUT')
        outdir = P(self.datadir, self.runfmt % i)
        resdir = P(self.rundir, 'RESTART')

        if os.path.isdir(outdir):
            if overwrite_data:
                self.log.warning(
                    'Data for run %d already exists and overwrite_data is True. Overwriting.'
                    % i)
                sh.rm('-r', outdir)
            else:
                self.log.warning(
                    'Data for run %d already exists but overwrite_data is False. Stopping.'
                    % i)
                return False

        # make the output run folder and copy over the input files
        mkdir([indir, resdir, self.restartdir])

        self.codebase.write_source_control_status(
            P(self.rundir, 'git_hash_used.txt'))
        self.write_namelist(self.rundir)
        self.write_field_table(self.rundir)
        self.write_diag_table(self.rundir)

        for filename in self.inputfiles:
            sh.cp([filename, P(indir, os.path.split(filename)[1])])

        mpirun_opts = ''

        if multi_node:
            mpirun_opts += ' -bootstrap pbsdsh -f $PBS_NODEFILE'

        if use_restart:
            if not restart_file:
                # get the restart from previous iteration
                restart_file = self.get_restart_file(i - 1)
            if not os.path.isfile(restart_file):
                self.log.error('Restart file not found, expecting file %r' %
                               restart_file)
                raise IOError('Restart file not found, expecting file %r' %
                              restart_file)
            else:
                self.log.info('Using restart file %r' % restart_file)

            self.extract_restart_archive(restart_file, indir)
        else:
            self.log.info('Running without restart file')

        template_vars = {
            'rundir': self.rundir,
            'execdir': self.codebase.builddir,
            'executable': self.codebase.executable_name,
            'env_source': self.env_source,
            'mpirun_opts': mpirun_opts,
            'num_cores': num_cores,
            'run_idb': run_idb,
            'nice_score': nice_score
        }

        runscript = self.templates.get_template('run.sh')

        # employ the template to create a runscript
        runscript.stream(**template_vars).dump(P(self.rundir, 'run.sh'))

        def _outhandler(line):
            handled = self.emit('run:output', self, line)
            if not handled:  # only log the output when no event handler is used
                self.log_output(line)

        self.emit('run:ready', self, i)
        self.log.info("Beginning run %d" % i)
        # Bound before the try so the interrupt handler can tell whether the
        # process was actually started.
        proc = None
        try:
            proc = sh.bash(P(self.rundir, 'run.sh'),
                           _bg=True,
                           _out=_outhandler,
                           _err_to_out=True)
            self.log.info('process running as {}'.format(proc.process.pid))
            proc.wait()
        except KeyboardInterrupt as e:
            self.log.error("Manual interrupt, killing process.")
            if proc is not None:
                proc.process.terminate()
                try:
                    proc.wait()
                except sh.ErrorReturnCode as err:
                    # Waiting on the process we just terminated may itself
                    # report a failure; the interrupt is what must propagate.
                    self.log.debug('Terminated process exited with %r' % err)
            raise e
        except sh.ErrorReturnCode as e:
            self.log.error("Run %d failed. See log for details." % i)
            self.log.error("Error: %r" % e)
            self.emit('run:failed', self)
            raise FailedRunError()

        self.emit('run:completed', self, i)
        self.log.info('Run %d complete' % i)
        mkdir(outdir)

        if num_cores > 1:
            # use postprocessing tool to combine the output from several cores
            codebase_combine_script = P(self.codebase.builddir,
                                        'mppnccombine_run.sh')
            if not os.path.exists(codebase_combine_script):
                self.log.warning(
                    'combine script does not exist in the commit you are running Isca from.  Falling back to using $GFDL_BASE mppnccombine_run.sh script'
                )
                sh.ln('-s',
                      P(GFDL_BASE, 'postprocessing', 'mppnccombine_run.sh'),
                      codebase_combine_script)
            combinetool = sh.Command(codebase_combine_script)
            for diag_file in self.diag_table.files:
                netcdf_file = '%s.nc' % diag_file
                filebase = P(self.rundir, netcdf_file)
                combinetool(self.codebase.builddir, filebase)
                # copy the combined netcdf file into the data archive directory
                sh.cp(filebase, P(outdir, netcdf_file))
                # remove all netcdf fragments from the run directory
                sh.rm(glob.glob(filebase + '*'))
                self.log.debug('%s combined and copied to data directory' %
                               netcdf_file)

            # each restart file is written as numbered fragments; the
            # '.0000' fragment identifies the set to be combined
            for restart in glob.glob(P(resdir, '*.res.nc.0000')):
                restartfile = restart.replace('.0000', '')
                combinetool(self.codebase.builddir, restartfile)
                sh.rm(glob.glob(restartfile + '.????'))
                self.log.debug("Restart file %s combined" % restartfile)

            self.emit('run:combined', self)

        # make the restart archive and delete the restart files
        self.make_restart_archive(self.get_restart_file(i), resdir)
        sh.rm('-r', resdir)

        if save_run:
            # copy the complete run directory to GFDL_DATA so that the run can
            # be recreated without the python script if required
            mkdir(resdir)
            sh.cp(['-a', self.rundir, outdir])
        else:
            # just save some useful diagnostic information
            self.write_namelist(outdir)
            self.write_field_table(outdir)
            self.write_diag_table(outdir)
            self.codebase.write_source_control_status(
                P(outdir, 'git_hash_used.txt'))

        self.clear_rundir()

        return True
示例#3
0
 def clear_workdir(self):
     """Remove the working directory, then recreate it empty."""
     self.rm_workdir()
     workdir = self.workdir
     mkdir(workdir)
     self.log.info('Emptied working directory %r' % workdir)