def __init__(self):
    """Probe the Tmod installation and select its Python bindings.

    Runs ``modulecmd -V`` and looks for ``VERSION=`` and ``TCL_VERSION=``
    lines in its standard output; the value of the former is stored in
    ``self._version``.  Then sets ``self._command`` to
    ``'modulecmd python'`` and runs it once to check the bindings.

    Raises:
        ReframeError: if either command cannot be run (``OSError``), if
            the version lines are not both found, or if the standard
            error of ``modulecmd python`` contains ``Unknown shell type``.
    """
    # Try to figure out if we are indeed using the TCL version
    try:
        completed = os_ext.run_command('modulecmd -V')
    except OSError as e:
        raise ReframeError(
            'could not find a sane Tmod installation: %s' % e)

    version_match = re.search(r'^VERSION=(\S+)', completed.stdout,
                              re.MULTILINE)
    tcl_version_match = re.search(r'^TCL_VERSION=(\S+)', completed.stdout,
                                  re.MULTILINE)

    # Both lines must be present for the installation to be accepted
    if version_match is None or tcl_version_match is None:
        raise ReframeError('could not find a sane Tmod installation')

    self._version = version_match.group(1)
    self._command = 'modulecmd python'
    try:
        # Try the Python bindings now
        completed = os_ext.run_command(self._command)
    except OSError as e:
        # The format string needs a placeholder; without it the `%`
        # operator raises TypeError and masks the real error
        raise ReframeError(
            'could not get the Python bindings for Tmod: %s' % e)

    if re.search(r'Unknown shell type', completed.stderr):
        raise ReframeError(
            'Python is not supported by this Tmod installation')
def cancel(self):
    """Cancel the job through ``scancel`` and mark it as being cancelled.

    A debug message with the job id is logged first.  The ``scancel``
    command is run with ``check=True`` and the job submission timeout
    taken from ``settings``.

    Raises:
        ReframeError: if no job id has been recorded yet.
    """
    job_id = self._jobid
    getlogger().debug('cancelling job (id=%s)' % job_id)
    if job_id is None:
        raise ReframeError('no job is spawned yet')

    scancel_cmd = 'scancel %s' % job_id
    os_ext.run_command(scancel_cmd,
                       check=True,
                       timeout=settings.job_submit_timeout)
    self._is_cancelling = True
def _update_state(self):
    """Refresh ``self._state`` and ``self._exitcode`` by querying ``squeue``.

    Sleeps first for whatever is left of ``self.squeue_delay`` seconds
    since ``self.submit_time``.  If ``squeue`` prints nothing, the job is
    assumed to have finished; otherwise the state, exit code and reason
    are parsed from the output.
    """
    elapsed = datetime.now() - self.submit_time
    remaining = self.squeue_delay - elapsed.total_seconds()
    if remaining > 0:
        time.sleep(remaining)

    # check=True is deliberately not passed: if the job has finished
    # already, squeue might return an error about an invalid job id.
    squeue_cmd = 'squeue -h -j %s -O state,exit_code,reason' % self._jobid
    output = os_ext.run_command(squeue_cmd).stdout.strip()

    if output:
        # There is no reliable way to get the exit code, so it is always
        # captured here, in case the actual value is reported while the
        # finished job still shows up in the queue (e.g., when it is
        # 'COMPLETING')
        state, exitcode, reason = output.split(maxsplit=2)
        self._state = SlurmJobState(state)
        self._exitcode = int(exitcode)
        if not self._is_cancelling and self._state in self._pending_states:
            self._check_and_cancel(reason)

        return

    # No output: assume that the job has finished
    if self._cancelled:
        self._state = SLURM_JOB_CANCELLED
    else:
        self._state = SLURM_JOB_COMPLETED

    # Set the exit code manually, unless a previous poll already set it
    if self._exitcode is None:
        self._exitcode = 0
def _cancel_if_blocked(self):
    """Pass the job's ``squeue`` reason to ``self._check_and_cancel``.

    Does nothing if the job is already being cancelled, if its state is
    not one of ``self._pending_states``, or if ``squeue`` prints nothing.
    """
    if not self._is_cancelling and self._state in self._pending_states:
        reason = os_ext.run_command(
            'squeue -h -j %s -o %%r' % self._jobid, check=True).stdout

        # An empty output means the job's state can't be retrieved.
        # Perhaps it has finished already and does not show up in the
        # output of squeue
        if reason:
            self._check_and_cancel(reason)
def submit(self):
    """Submit ``self.script_filename`` with ``sbatch`` and record the job id.

    The command is run with ``check=True`` and the job submission timeout
    taken from ``settings``.  The job id is parsed from the
    ``Submitted batch job <id>`` message in the command's standard output
    and stored as an integer in ``self._jobid``.

    Raises:
        JobSubmissionError: if the job id cannot be found in the output.
    """
    cmd = 'sbatch %s' % self.script_filename
    completed = os_ext.run_command(cmd, check=True,
                                   timeout=settings.job_submit_timeout)

    # Raw string: `\d` in a plain literal is an invalid escape sequence
    jobid_match = re.search(r'Submitted batch job (?P<jobid>\d+)',
                            completed.stdout)
    if not jobid_match:
        raise JobSubmissionError(command=cmd,
                                 stdout=completed.stdout,
                                 stderr=completed.stderr,
                                 exitcode=completed.returncode)

    self._jobid = int(jobid_match.group('jobid'))
def _compile_file(self, source_file, executable, lang, options):
    """Compile a single source file by running the compiler command.

    Args:
        source_file: path of the file to compile.
        executable: path of the output file; if falsy, ``source_file``
            without its last extension is used.
        lang: one of ``'C'``, ``'C++'``, ``'Fortran'`` or ``'CUDA'``; if
            falsy, it is obtained from ``self.guess_language()``.
        options: extra text appended at the end of the command line.

    Returns:
        Whatever ``os_ext.run_command()`` returns for the compile command.

    Raises:
        ReframeError: if ``lang`` is none of the known languages.
        CompilationError: if the compile command raises ``CommandError``.
    """
    if not executable:
        # Default executable: same as source_file without the extension.
        # splitext() strips only the last extension, keeps the directory
        # part intact and copes with files that have no extension at all.
        executable = os.path.splitext(source_file)[0]

    if not lang:
        lang = self.guess_language(source_file)

    # Replace None's with empty strings
    cppflags = self.cppflags or ''
    cflags = self.cflags or ''
    cxxflags = self.cxxflags or ''
    fflags = self.fflags or ''
    ldflags = self.ldflags or ''

    # Preprocessor flags always come first, then the language's own flags
    flags = [cppflags]
    if lang == 'C':
        compiler = self.cc
        flags.append(cflags)
    elif lang == 'C++':
        compiler = self.cxx
        flags.append(cxxflags)
    elif lang == 'Fortran':
        compiler = self.ftn
        flags.append(fflags)
    elif lang == 'CUDA':
        # CUDA sources are compiled with nvcc using the C++ flags
        compiler = 'nvcc'
        flags.append(cxxflags)
    else:
        raise ReframeError('Unknown language')

    # Append include search path
    flags += ['-I' + d for d in self.include_search_path]
    cmd = ('%s %s %s -o %s %s %s' % (compiler, ' '.join(flags),
                                     source_file, executable,
                                     ldflags, options))
    try:
        return os_ext.run_command(cmd, check=True)
    except CommandError as e:
        raise CompilationError(command=e.command,
                               stdout=e.stdout,
                               stderr=e.stderr,
                               exitcode=e.exitcode,
                               environ=self)
def _update_state(self):
    """Refresh the job's state (and exit code, once completed) via ``sacct``.

    If the ``sacct`` output does not contain a matching record, the output
    is logged at debug level and nothing is updated.
    """
    today = datetime.now().strftime('%F')
    sacct_cmd = ('sacct -S %s -P -j %s -o jobid,state,exitcode' %
                 (today, self._jobid))
    completed = os_ext.run_command(sacct_cmd, check=True)

    # One record per line: <jobid>|<state>[extra text]|<exitcode>:<signal>
    record_patt = (r'^(?P<jobid>\d+)\|(?P<state>\S+)([^\|]*)\|'
                   r'(?P<exitcode>\d+)\:(?P<signal>\d+)')
    record = re.search(record_patt, completed.stdout, re.MULTILINE)
    if record is not None:
        self._state = SlurmJobState(record.group('state'))
        self._cancel_if_blocked()

        # The exit code is recorded only for jobs in a completion state
        if self._state in self._completion_states:
            self._exitcode = int(record.group('exitcode'))
    else:
        getlogger().debug('job state not matched (stdout follows)\n%s' %
                          completed.stdout)
def autodetect_system(site_config):
    """Return the configured system whose hostname patterns match this host.

    The host name is read from ``/etc/xthostname`` (Cray specific); if
    that command raises ``ReframeError``, ``socket.gethostname()`` is
    used instead.

    Args:
        site_config: object whose ``systems`` mapping holds the supported
            systems, each providing a ``hostnames`` list of regex patterns.

    Returns:
        The first system with a pattern matching the host name, or
        ``None`` if there is no such system.
    """
    import re
    import socket

    try:
        # Try to detect directly the cluster name (Cray specific)
        hostname = os_ext.run_command('cat /etc/xthostname',
                                      check=True).stdout
    except ReframeError:
        # Fall back to the standard method
        hostname = socket.gethostname()

    # Go through the supported systems and try to match the hostname
    for candidate in site_config.systems.values():
        if any(re.match(patt, hostname) for patt in candidate.hostnames):
            return candidate

    return None
def _compile_dir(self, source_dir, makefile, options):
    """Build a source directory by running ``make -C`` on it.

    Args:
        source_dir: directory passed to ``make -C``.
        makefile: if truthy, passed to ``make`` with ``-f``.
        options: extra text placed on the ``make`` command line.

    Returns:
        Whatever ``os_ext.run_command()`` returns for the ``make`` command.

    Raises:
        CompilationError: if the ``make`` command raises ``CommandError``.
    """
    make_args = ['make -C %s' % source_dir]
    if makefile:
        make_args.append('-f %s' % makefile)

    make_args.append('%s ' % options)
    cmd = ' '.join(make_args)

    # Pass a set of predefined options to the Makefile
    if self.propagate:
        assignments = ["CC='%s'" % self.cc,
                       "CXX='%s'" % self.cxx,
                       "FC='%s'" % self.ftn]

        # Explicitly check against None here; the user may explicitly
        # want to clear the flags
        for var, attr in (('CPPFLAGS', 'cppflags'),
                          ('CFLAGS', 'cflags'),
                          ('CXXFLAGS', 'cxxflags'),
                          ('FFLAGS', 'fflags'),
                          ('LDFLAGS', 'ldflags')):
            value = getattr(self, attr)
            if value is not None:
                assignments.append(var + "='%s'" % value)

        cmd += ' '.join(assignments)

    try:
        return os_ext.run_command(cmd, check=True)
    except CommandError as err:
        raise CompilationError(command=err.command,
                               stdout=err.stdout,
                               stderr=err.stderr,
                               exitcode=err.exitcode,
                               environ=self)
def test_command_timeout(self):
    """A command outliving its timeout raises CommandError carrying it."""
    try:
        os_ext.run_command('sleep 3', timeout=2)
    except CommandError as err:
        self.assertEqual(err.timeout, 2)
    else:
        # The command was expected to be interrupted after two seconds
        self.fail('Expected timeout')
def test_command_success(self):
    """A successful command reports a zero return code and its output."""
    result = os_ext.run_command('echo foobar')
    self.assertEqual(result.returncode, 0)
    self.assertEqual(result.stdout, 'foobar\n')
def _run_module_command(self, *args):
    """Run ``self._command`` followed by ``args`` and return the result.

    The command and the arguments are joined with single spaces into one
    command line, which is handed to ``os_ext.run_command()``.
    """
    cmdline = ' '.join((self._command,) + args)
    return os_ext.run_command(cmdline)