def __init__(self):
    """Detect a working Lmod installation and its Python bindings.

    Raises ConfigError if LMOD_CMD is unset, the Lmod version cannot
    be retrieved, or the installation does not support Python.
    """
    # Try to figure out if we are indeed using LMOD
    lmod_cmd = os.getenv('LMOD_CMD')
    if lmod_cmd is None:
        raise ConfigError('could not find a sane Lmod installation: '
                          'environment variable LMOD_CMD is not defined')

    try:
        completed = os_ext.run_command('%s --version' % lmod_cmd)
    except OSError as e:
        # Chain the original error for easier debugging
        raise ConfigError(
            'could not find a sane Lmod installation: %s' % e) from e

    version_match = re.search(r'.*Version\s*(\S+)', completed.stderr,
                              re.MULTILINE)
    if version_match is None:
        raise ConfigError('could not retrieve Lmod version')

    self._version = version_match.group(1)
    self._command = '%s python ' % lmod_cmd
    try:
        # Try the Python bindings now
        completed = os_ext.run_command(self._command)
    except OSError as e:
        # BUG FIX: the original format string was missing the '%s'
        # placeholder, so the '%' operation raised TypeError instead of
        # the intended ConfigError
        raise ConfigError(
            'could not get the Python bindings for Lmod: %s' % e) from e

    if re.search(r'Unknown shell type', completed.stderr):
        raise ConfigError('Python is not supported by '
                          'this Lmod installation')
def __init__(self):
    """Detect a working Tmod (TCL modules) installation.

    Raises ConfigError if modulecmd is missing, its version cannot be
    determined, or the installation does not support Python.
    """
    # Try to figure out if we are indeed using the TCL version
    try:
        completed = os_ext.run_command('modulecmd -V')
    except OSError as e:
        raise ConfigError('could not find a sane Tmod installation: %s' %
                          e) from e

    version_match = re.search(r'^VERSION=(\S+)', completed.stdout,
                              re.MULTILINE)
    tcl_version_match = re.search(r'^TCL_VERSION=(\S+)', completed.stdout,
                                  re.MULTILINE)
    if version_match is None or tcl_version_match is None:
        raise ConfigError('could not find a sane Tmod installation')

    self._version = version_match.group(1)
    self._command = 'modulecmd python'
    try:
        # Try the Python bindings now
        completed = os_ext.run_command(self._command)
    except OSError as e:
        # BUG FIX: the original format string was missing the '%s'
        # placeholder, so the '%' operation raised TypeError instead of
        # the intended ConfigError
        raise ConfigError('could not get the Python bindings for Tmod: '
                          '%s' % e) from e

    if re.search(r'Unknown shell type', completed.stderr):
        raise ConfigError(
            'Python is not supported by this Tmod installation')
def test_command_timeout(self):
    """Verify that a timed-out command raises SpawnedProcessTimeout."""
    # Idiom: pytest.raises replaces the try/except/else + pytest.fail
    # pattern and fails automatically if no exception is raised
    with pytest.raises(SpawnedProcessTimeout) as exc_info:
        os_ext.run_command('sleep 3', timeout=2)

    assert exc_info.value.timeout == 2
    # Try to get the string repr. of the exception: see bug #658
    s = str(exc_info.value)
def test_command_timeout(self):
    """Verify that a timed-out command raises SpawnedProcessTimeout."""
    expected_msg = r"command 'sleep 3' timed out after 2s"
    with pytest.raises(SpawnedProcessTimeout, match=expected_msg) as err:
        os_ext.run_command('sleep 3', timeout=2)

    timeout_exc = err.value
    assert timeout_exc.timeout == 2
    # Try to get the string repr. of the exception: see bug #658
    s = str(timeout_exc)
def test_trap_error(script_file):
    """A trapped error must be reported and abort the script."""
    with shell.generate_script(script_file, trap_errors=True) as gen:
        gen.write('false')
        gen.write('echo hello')

    with pytest.raises(SpawnedProcessError) as err_info:
        os_ext.run_command(str(script_file), check=True)

    error = err_info.value
    assert error.exitcode == 1
    # The command after the failing one must not have run
    assert 'hello' not in error.stdout
    assert "-reframe: command `false' failed (exit code: 1)" in error.stdout
def test_trap_error(self):
    """A trapped error must be reported and abort the script."""
    with shell.generate_script(self.script_file.name,
                               trap_errors=True) as gen:
        gen.write('false')
        gen.write('echo hello')

    with self.assertRaises(SpawnedProcessError) as ctx:
        os_ext.run_command(self.script_file.name, check=True)

    error = ctx.exception
    self.assertEqual(1, error.exitcode)
    # The command after the failing one must not have run
    self.assertNotIn('hello', error.stdout)
    self.assertIn("-reframe: command `false' failed (exit code: 1)",
                  error.stdout)
def _get_reservation_nodes(self, reservation):
    """Return the set of SlurmNodes belonging to ``reservation``.

    Raises JobError if the node names cannot be extracted from the
    scontrol output.
    """
    completed = os_ext.run_command('scontrol -a show res %s' % reservation,
                                   check=True)
    node_match = re.search(r'(Nodes=\S+)', completed.stdout)
    if node_match:
        reservation_nodes = node_match[1]
    else:
        # BUG FIX: the original referenced the undefined name
        # 'valid_reservation' here, raising NameError instead of JobError
        raise JobError("could not extract the nodes names for "
                       "reservation '%s'" % reservation)

    completed = os_ext.run_command(
        'scontrol -a show -o %s' % reservation_nodes, check=True)
    node_descriptions = completed.stdout.splitlines()
    return {SlurmNode(descr) for descr in node_descriptions}
def _update_state(self, job):
    """Poll squeue and update ``job``'s state, nodelist and exit code."""
    # Enforce a minimum delay between submission and the first poll so
    # that squeue has a chance to see the job
    time_from_submit = datetime.now() - self._submit_time
    rem_wait = self._squeue_delay - time_from_submit.total_seconds()
    if rem_wait > 0:
        time.sleep(rem_wait)

    # We don't run the command with check=True, because if the job has
    # finished already, squeue might return an error about an invalid
    # job id.
    completed = os_ext.run_command('squeue -h -j %s -o "%%T|%%N|%%r"' %
                                   job.jobid)
    state_match = list(re.finditer(r'^(?P<state>\S+)\|(?P<nodespec>\S*)\|'
                                   r'(?P<reason>.+)', completed.stdout))
    if not state_match:
        # Assume that job has finished
        job.state = 'CANCELLED' if self._cancelled else 'COMPLETED'

        # Set exit code manually, if not set already by the polling
        if job.exitcode is None:
            job.exitcode = 0

        return

    # Join the states with ',' in case of job arrays
    job.state = ','.join(s.group('state') for s in state_match)

    # Use ',' to join nodes to be consistent with Slurm syntax
    self._set_nodelist(
        job, ','.join(s.group('nodespec') for s in state_match)
    )
    # For non-pending jobs there is nothing to cancel; each match's
    # reason is inspected for conditions that warrant auto-cancellation
    if not self._is_cancelling and not slurm_state_pending(job.state):
        for s in state_match:
            self._check_and_cancel(job, s.group('reason'))
def completion_time(self, job):
    # Return the cached completion time, or query sacct for it; only
    # jobs in a completed Slurm state have a meaningful end time.
    if (self._completion_time or
        not slurm_state_completed(job.state)):
        return self._completion_time

    # SLURM_TIME_FORMAT='%s' makes sacct print end times as Unix
    # timestamps, which can be parsed with float() below
    with rt.temp_environment(variables={'SLURM_TIME_FORMAT': '%s'}):
        completed = os_ext.run_command(
            'sacct -S %s -P -j %s -o jobid,end' %
            (self._submit_time.strftime('%F'), job.jobid),
            log=False)

    state_match = list(
        re.finditer(r'^(?P<jobid>%s)\|(?P<end>\S+)' % self._state_patt,
                    completed.stdout, re.MULTILINE))
    if not state_match:
        return None

    # Job arrays report one end time per task; keep the latest one.
    # Non-numeric 'end' fields (e.g. 'Unknown') are silently skipped.
    completion_times = []
    for s in state_match:
        with suppress(ValueError):
            completion_times.append(float(s.group('end')))

    if completion_times:
        self._completion_time = max(completion_times)

    return self._completion_time
def _update_state(self):
    """Poll squeue and update this job's state, nodelist and exit code."""
    # Enforce a minimum delay between submission and polling so that
    # squeue has a chance to see the job
    time_from_submit = datetime.now() - self.submit_time
    rem_wait = self.squeue_delay - time_from_submit.total_seconds()
    if rem_wait > 0:
        time.sleep(rem_wait)

    # We don't run the command with check=True, because if the job has
    # finished already, squeue might return an error about an invalid
    # job id.
    completed = os_ext.run_command('squeue -h -j %s -o "%%T|%%N|%%r"' %
                                   self._jobid)
    state_match = re.search(
        r'^(?P<state>\S+)\|(?P<nodespec>\S*)\|'
        r'(?P<reason>.+)', completed.stdout)
    if state_match is None:
        # Assume that job has finished
        self._state = (SLURM_JOB_CANCELLED if self._cancelled
                       else SLURM_JOB_COMPLETED)

        # Set exit code manually, if not set already by the polling
        if self._exitcode is None:
            self._exitcode = 0

        return

    self._state = SlurmJobState(state_match.group('state'))
    self._set_nodelist(state_match.group('nodespec'))
    # A pending job may be blocked for a reason that warrants
    # auto-cancellation; check it here
    if not self._is_cancelling and self._state in self._pending_states:
        self._check_and_cancel(state_match.group('reason'))
def __init__(self):
    """Probe for a usable TMod4 installation via its Python bindings."""
    self._command = 'modulecmd python'
    try:
        completed = os_ext.run_command(self._command + ' -V', check=True)
    except OSError as err:
        raise ConfigError(
            'could not find a sane TMod4 installation') from err
    except SpawnedProcessError as err:
        raise ConfigError(
            'could not get the Python bindings for TMod4') from err

    # The version banner is printed on stderr
    match = re.match(r'^Modules Release (\S+)\s+', completed.stderr)
    if not match:
        raise ConfigError('could not retrieve the TMod4 version')

    version = match.group(1)
    try:
        major, minor, *_ = [int(part) for part in version.split('.')]
    except ValueError:
        raise ConfigError(
            'could not parse TMod4 version string: ' + version) from None

    if (major, minor) < self.MIN_VERSION:
        raise ConfigError(
            'unsupported TMod4 version: %s (required >= %s)' %
            (version, self.MIN_VERSION))

    self._version = version
def _cray_cle_version():
    """Return the Cray CLE release version, or None if undetermined."""
    release_info = os_ext.run_command('cat /etc/opt/cray/release/cle-release')
    match = re.match(r'^RELEASE=(\S+)', release_info.stdout)
    return match.group(1) if match else None
def test_trap_exit(script_file):
    """The exit trap must report the script's exit code."""
    with shell.generate_script(script_file, trap_exit=True) as gen:
        gen.write('echo hello')

    result = os_ext.run_command(str(script_file), check=True)
    assert result.returncode == 0
    assert 'hello' in result.stdout
    assert '-reframe: script exiting with exit code: 0' in result.stdout
def _get_all_nodes(self):
    """Return all Slurm nodes as a set of SlurmNode objects."""
    try:
        completed = os_ext.run_command('scontrol -a show -o nodes',
                                       check=True)
    except SpawnedProcessError as err:
        raise JobError('could not retrieve node information') from err

    # scontrol prints one node description per line
    return {SlurmNode(line) for line in completed.stdout.splitlines()}
def test_trap_exit(self):
    """The exit trap must report the script's exit code."""
    with shell.generate_script(self.script_file.name,
                               trap_exit=True) as gen:
        gen.write('echo hello')

    result = os_ext.run_command(self.script_file.name, check=True)
    self.assertEqual(0, result.returncode)
    self.assertIn('hello', result.stdout)
    self.assertIn('-reframe: script exiting with exit code: 0',
                  result.stdout)
def _get_nodes_by_name(self, nodespec):
    """Return the SlurmNodes described by ``nodespec``."""
    command = 'scontrol -a show -o node %s' % nodespec
    try:
        completed = os_ext.run_command(command, check=True)
    except SpawnedProcessError as err:
        raise JobError('could not retrieve the node description '
                       'of nodes: %s' % nodespec) from err

    # scontrol prints one node description per line
    return {SlurmNode(line) for line in completed.stdout.splitlines()}
def __init__(self):
    """Detect a working TMod 3.1 installation.

    Raises ConfigError if MODULESHOME is unset, modulecmd.tcl cannot be
    run, the version is too old, or Python is not supported.
    """
    # Try to figure out if we are indeed using the TCL version
    modules_home = os.getenv('MODULESHOME')
    if modules_home is None:
        # BUG FIX: os.path.join(None, ...) raises TypeError, which the
        # OSError handler below would not catch
        raise ConfigError('could not find a sane TMod31 installation: '
                          'environment variable MODULESHOME is not defined')

    modulecmd = os.path.join(modules_home, 'modulecmd.tcl')
    try:
        completed = os_ext.run_command(modulecmd)
    except OSError as e:
        raise ConfigError(
            'could not find a sane TMod31 installation: %s' % e) from e

    version_match = re.search(r'Release Tcl (\S+)', completed.stderr,
                              re.MULTILINE)
    if version_match is None:
        # NOTE: the original also checked a 'tcl_version_match' that was
        # simply an alias of version_match; the redundant check is dropped
        raise ConfigError('could not find a sane TMod31 installation')

    version = version_match.group(1)
    try:
        ver_major, ver_minor, *_ = [int(v) for v in version.split('.')]
    except ValueError:
        raise ConfigError(
            'could not parse TMod31 version string: ' + version) from None

    if (ver_major, ver_minor) < self.MIN_VERSION:
        raise ConfigError(
            'unsupported TMod version: %s (required >= %s)' %
            (version, self.MIN_VERSION))

    self._version = version
    self._command = '%s python' % modulecmd
    try:
        # Try the Python bindings now
        completed = os_ext.run_command(self._command)
    except OSError as e:
        # BUG FIX: the original format string was missing the '%s'
        # placeholder, raising TypeError instead of ConfigError
        raise ConfigError(
            'could not get the Python bindings for TMod31: %s' % e) from e

    if re.search(r'Unknown shell type', completed.stderr):
        raise ConfigError(
            'Python is not supported by this TMod installation')
def _get_nodes_by_name(self, nodespec):
    """Return the subset of ``nodespec`` nodes with a parsable description."""
    completed = os_ext.run_command('scontrol -a show -o node %s' % nodespec)
    nodes_avail = set()
    for line in completed.stdout.splitlines():
        # Skip any entry that cannot be parsed into a SlurmNode
        try:
            nodes_avail.add(SlurmNode(line))
        except JobError:
            pass

    return nodes_avail
def _run_module_command(self, *args, msg=None):
    """Run a modules-system command, raising EnvironError on failure."""
    command = ' '.join([self._command, *args])
    try:
        completed = os_ext.run_command(command, check=True)
    except SpawnedProcessError as err:
        raise EnvironError(msg) from err

    if self._module_command_failed(completed):
        # The command exited cleanly but the modules system reported an
        # error; surface it as if the process itself had failed
        cause = SpawnedProcessError(command,
                                    completed.stdout,
                                    completed.stderr,
                                    completed.returncode)
        raise EnvironError(msg) from cause

    return completed
def __init__(self):
    """Detect a working TMod (TCL modules) installation.

    Raises ConfigError if modulecmd is missing, its version is too old
    or unparsable, or the installation does not support Python.
    """
    # Try to figure out if we are indeed using the TCL version
    try:
        completed = os_ext.run_command('modulecmd -V')
    except OSError as e:
        raise ConfigError('could not find a sane TMod installation') from e

    version_match = re.search(r'^VERSION=(\S+)', completed.stdout,
                              re.MULTILINE)
    tcl_version_match = re.search(r'^TCL_VERSION=(\S+)', completed.stdout,
                                  re.MULTILINE)
    if version_match is None or tcl_version_match is None:
        raise ConfigError('could not find a sane TMod installation')

    version = version_match.group(1)
    try:
        ver_major, ver_minor = [int(v) for v in version.split('.')[:2]]
    except ValueError:
        raise ConfigError('could not parse TMod version string: ' +
                          version) from None

    if (ver_major, ver_minor) < self.MIN_VERSION:
        raise ConfigError('unsupported TMod version: %s (required >= %s)' %
                          (version, self.MIN_VERSION))

    self._version = version
    self._command = 'modulecmd python'
    try:
        # Try the Python bindings now
        completed = os_ext.run_command(self._command)
    except OSError as e:
        # BUG FIX: the original format string was missing the '%s'
        # placeholder, so the '%' operation raised TypeError instead of
        # the intended ConfigError
        raise ConfigError('could not get the Python bindings for TMod: '
                          '%s' % e) from e

    if re.search(r'Unknown shell type', completed.stderr):
        raise ConfigError(
            'Python is not supported by this TMod installation')
def _get_excluded_node_names(self):
    """Return the names of nodes excluded via ``sched_exclude_nodelist``."""
    if not self.sched_exclude_nodelist:
        return set()

    command = 'scontrol show -o node %s' % self.sched_exclude_nodelist
    try:
        completed = os_ext.run_command(command, check=True)
    except SpawnedProcessError as err:
        raise JobError('could not retrieve the node description '
                       'of nodes: %s' % self.sched_exclude_nodelist) from err

    # One node description per output line; keep only the node names
    return {SlurmNode(line).name
            for line in completed.stdout.splitlines()}
def _exec_module_command(self, *args, msg=None):
    """Run a TMod4 bindings command and execute the Python it emits."""
    command = ' '.join([self._command, *args])
    completed = os_ext.run_command(command, check=True)
    namespace = {}
    exec(completed.stdout, {}, namespace)

    # _mlstatus is set by the TMod4 Python bindings
    if not namespace['_mlstatus']:
        if msg is None:
            # Build a default message from the command that was run
            msg = 'modules system command failed: '
            if isinstance(completed.args, str):
                msg += completed.args
            else:
                msg += ' '.join(completed.args)

        raise EnvironError(msg)
def poll(self, *jobs):
    """Poll the states of ``jobs`` with a single squeue invocation."""
    if not jobs:
        return

    # Enforce a minimum delay between the last submission and polling
    # so that squeue has a chance to see the jobs
    m = max(job.submit_time for job in jobs)
    time_from_last_submit = time.time() - m
    rem_wait = self.SQUEUE_DELAY - time_from_last_submit
    if rem_wait > 0:
        time.sleep(rem_wait)

    # We don't run the command with check=True, because if the job has
    # finished already, squeue might return an error about an invalid
    # job id.
    #
    # BUG FIX: '%%' inside an f-string is NOT collapsed to '%' (that
    # only happens with %-formatting), so the original passed the
    # literal format '%%i|%%T|%%N|%%r' to squeue
    completed = os_ext.run_command(
        f'squeue -h -j {",".join(job.jobid for job in jobs)} '
        f'-o "%i|%T|%N|%r"'
    )

    # We need the match objects, so we have to use finditer()
    state_match = list(re.finditer(
        fr'^(?P<jobid>{self._state_patt})\|(?P<state>\S+)\|'
        fr'(?P<nodespec>\S*)\|(?P<reason>.+)', completed.stdout,
        re.MULTILINE)
    )

    # Group the matches by base job id; job-array tasks are reported
    # as '<jobid>_<index>'
    jobinfo = {}
    for s in state_match:
        jobid = s.group('jobid').split('_')[0]
        jobinfo.setdefault(jobid, []).append(s)

    for job in jobs:
        if job is None:
            continue

        try:
            job_match = jobinfo[job.jobid]
        except KeyError:
            # Job is no longer known to squeue; assume it has finished
            job._state = 'CANCELLED' if job.is_cancelling else 'COMPLETED'
            if job.exitcode is None:
                job._exitcode = 0

            continue

        # Join the states with ',' in case of job arrays
        job._state = ','.join(s.group('state') for s in job_match)

        # BUG FIX: the cancellation reasons must come from this job's
        # own matches (job_match), not from all polled jobs (state_match)
        self._cancel_if_blocked(
            job, [s.group('reason') for s in job_match]
        )
        self._cancel_if_pending_too_long(job)
def _compile_file(self, source_file, executable, lang, options):
    """Compile ``source_file`` into ``executable``.

    If ``executable`` is falsy, it defaults to the source file without
    its extension; if ``lang`` is falsy, it is guessed from the source
    file. Raises EnvironError for unknown languages and
    CompilationError if the compiler invocation fails.
    """
    if not executable:
        # Default executable: same as source_file without the extension.
        # BUG FIX: the original used
        #   os.path.join(os.path.dirname(source_file),
        #                source_file.rsplit('.')[:-1][0])
        # which duplicates the directory component (e.g. 'src/foo.c' ->
        # 'src/src/foo') and truncates names with multiple dots;
        # os.path.splitext handles both correctly.
        executable = os.path.splitext(source_file)[0]

    if not lang:
        lang = self.guess_language(source_file)

    # Replace None's with empty strings
    cppflags = self.cppflags or ''
    cflags = self.cflags or ''
    cxxflags = self.cxxflags or ''
    fflags = self.fflags or ''
    ldflags = self.ldflags or ''

    flags = [cppflags]
    if lang == 'C':
        compiler = self.cc
        flags.append(cflags)
    elif lang == 'C++':
        compiler = self.cxx
        flags.append(cxxflags)
    elif lang == 'Fortran':
        compiler = self.ftn
        flags.append(fflags)
    elif lang == 'CUDA':
        compiler = 'nvcc'
        flags.append(cxxflags)
    else:
        raise EnvironError('Unknown language: %s' % lang)

    # Append include search path
    flags += ['-I' + d for d in self.include_search_path]
    cmd = ('%s %s %s -o %s %s %s' % (compiler, ' '.join(flags),
                                     source_file, executable,
                                     ldflags, options))
    try:
        return os_ext.run_command(cmd, check=True)
    except SpawnedProcessError as e:
        # Re-raise as compilation error
        raise CompilationError(command=e.command,
                               stdout=e.stdout,
                               stderr=e.stderr,
                               exitcode=e.exitcode) from None
def _update_state(self, job):
    '''Check the status of the job.'''
    completed = os_ext.run_command('qstat -f %s' % job.jobid)

    # Depending on the configuration, completed jobs will remain on the
    # job list for a limited time, or be removed upon completion.
    # If qstat cannot find the jobid, it returns code 153.
    if completed.returncode == 153:
        getlogger().debug(
            'jobid not known by scheduler, assuming job completed')
        job.state = 'COMPLETED'
        return

    if completed.returncode != 0:
        raise JobError('qstat failed: %s' % completed.stderr, job.jobid)

    # Update the nodelist from the 'exec_host' field, stripping the
    # newlines/tabs that qstat inserts when wrapping long values
    nodelist_match = re.search(r'exec_host = (?P<nodespec>[\S\t\n]+)',
                               completed.stdout, re.MULTILINE)
    if nodelist_match:
        nodespec = nodelist_match.group('nodespec')
        nodespec = re.sub(r'[\n\t]*', '', nodespec)
        self._set_nodelist(job, nodespec)

    state_match = re.search(r'^\s*job_state = (?P<state>[A-Z])',
                            completed.stdout, re.MULTILINE)
    if not state_match:
        getlogger().debug('job state not found (stdout follows)\n%s' %
                          completed.stdout)
        return

    # Map the single-letter PBS state to the scheduler-neutral state
    state = state_match.group('state')
    job.state = JOB_STATES[state]
    if job.state == 'COMPLETED':
        code_match = re.search(
            r'^\s*exit_status = (?P<code>\d+)',
            completed.stdout,
            re.MULTILINE,
        )
        if not code_match:
            # exit_status may not be reported yet; leave exitcode unset
            return

        job.exitcode = int(code_match.group('code'))
def _autodetect_system(self):
    """Auto-detect system."""
    # Cray systems store the cluster name in /etc/xthostname; fall back
    # to the standard hostname lookup if that file is not readable
    try:
        completed = os_ext.run_command('cat /etc/xthostname', check=True)
        hostname = completed.stdout
    except SpawnedProcessError:
        hostname = socket.gethostname()

    # Return the first configured system whose hostname pattern matches
    for system in self._site_config.systems.values():
        for hostname_patt in system.hostnames:
            if re.match(hostname_patt, hostname):
                return system

    raise SystemAutodetectionError
def autodetect_system(site_config):
    """Auto-detect system"""
    import re
    import socket

    # Cray systems store the cluster name in /etc/xthostname; fall back
    # to the standard hostname lookup if that file is not readable
    try:
        completed = os_ext.run_command('cat /etc/xthostname', check=True)
        hostname = completed.stdout
    except ReframeError:
        hostname = socket.gethostname()

    # Return the first configured system whose hostname pattern matches
    for system in site_config.systems.values():
        for patt in system.hostnames:
            if re.match(patt, hostname):
                return system

    return None
def _compile_dir(self, source_dir, makefile, options):
    """Build ``source_dir`` with make, optionally using ``makefile``."""
    if makefile:
        cmd = 'make -C %s -f %s %s ' % (source_dir, makefile, options)
    else:
        cmd = 'make -C %s %s ' % (source_dir, options)

    # Pass a set of predefined options to the Makefile
    if self.propagate:
        flags = ["CC='%s'" % self.cc,
                 "CXX='%s'" % self.cxx,
                 "FC='%s'" % self.ftn]

        # Explicitly check against None here; the user may explicitly
        # want to clear the flags
        optional_flags = (('CPPFLAGS', self.cppflags),
                          ('CFLAGS', self.cflags),
                          ('CXXFLAGS', self.cxxflags),
                          ('FFLAGS', self.fflags),
                          ('LDFLAGS', self.ldflags))
        for name, value in optional_flags:
            if value is not None:
                flags.append("%s='%s'" % (name, value))

        cmd += ' '.join(flags)

    try:
        return os_ext.run_command(cmd, check=True)
    except SpawnedProcessError as e:
        # Re-raise as compilation error
        raise CompilationError(command=e.command,
                               stdout=e.stdout,
                               stderr=e.stderr,
                               exitcode=e.exitcode) from None
def completion_time(self, job):
    # Return the cached completion time, or query sacct for it; only
    # jobs in a completed Slurm state have a meaningful end time.
    if (self._completion_time or
        not slurm_state_completed(job.state)):
        return self._completion_time

    # SLURM_TIME_FORMAT='standard' makes sacct print end times in the
    # ISO-like '%Y-%m-%dT%H:%M:%S' format parsed with strptime below
    with env.temp_environment(variables={'SLURM_TIME_FORMAT': 'standard'}):
        completed = os_ext.run_command(
            'sacct -S %s -P -j %s -o jobid,end' %
            (datetime.now().strftime('%F'), job.jobid),
            log=False)

    state_match = list(
        re.finditer(r'^(?P<jobid>%s)\|(?P<end>\S+)' % self._state_patt,
                    completed.stdout, re.MULTILINE))
    if not state_match:
        return None

    # Job arrays report one end time per task; keep the latest one
    self._completion_time = max(
        datetime.strptime(s.group('end'), '%Y-%m-%dT%H:%M:%S')
        for s in state_match)
    return self._completion_time
def _run_command(self, cmd, timeout=None):
    """Run command cmd and re-raise any exception as a JobError."""
    try:
        return os_ext.run_command(cmd, check=True, timeout=timeout)
    except SpawnedProcessError as err:
        # Attach the job id and keep the original failure as the cause
        raise JobError(jobid=self._jobid) from err