def __init__(self):
    """Detect a sane Lmod installation and its Python bindings.

    Raises:
        ConfigError: If ``LMOD_CMD`` is undefined, the Lmod version cannot
            be retrieved, or Python is not supported by the installation.
    """
    # Try to figure out if we are indeed using LMOD
    self._lmod_cmd = os.getenv('LMOD_CMD')
    if self._lmod_cmd is None:
        raise ConfigError('could not find a sane Lmod installation: '
                          'environment variable LMOD_CMD is not defined')

    try:
        completed = osext.run_command(f'{self._lmod_cmd} --version')
    except OSError as e:
        raise ConfigError(
            'could not find a sane Lmod installation: %s' % e) from e

    # Lmod prints its version information on stderr
    version_match = re.search(r'.*Version\s*(\S+)', completed.stderr,
                              re.MULTILINE)
    if version_match is None:
        raise ConfigError('could not retrieve Lmod version')

    self._version = version_match.group(1)
    try:
        # Try the Python bindings now
        completed = osext.run_command(self.modulecmd())
    except OSError as e:
        # BUG FIX: the original message applied '%' with no conversion
        # specifier, which raised a TypeError instead of the ConfigError
        raise ConfigError(
            'could not get the Python bindings for Lmod: %s' % e) from e

    if re.search(r'Unknown shell type', completed.stderr):
        raise ConfigError('Python is not supported by '
                          'this Lmod installation')

    self._extra_module_paths = []
def git_only():
    """Skip the current test unless git is installed and we are in a repo."""
    preconditions = (
        ('git --version', 'no git installation found on system'),
        ('git status', 'not inside a git repository'),
    )
    for command, skip_message in preconditions:
        try:
            osext.run_command(command, check=True, log=False)
        except (SpawnedProcessError, FileNotFoundError):
            pytest.skip(skip_message)
def test_command_timeout():
    """The spawned command must be killed after ``timeout`` seconds."""
    expected = r"command 'sleep 3' timed out after 2s"
    with pytest.raises(SpawnedProcessTimeout, match=expected) as exc_info:
        osext.run_command('sleep 3', timeout=2)

    err = exc_info.value
    assert err.timeout == 2

    # Try to get the string repr. of the exception: see bug #658
    str(err)
def test_trap_error(script_file):
    """A failing command in a ``trap_errors`` script must abort execution."""
    with shell.generate_script(script_file, trap_errors=True) as gen:
        gen.write('false')
        gen.write('echo hello')

    with pytest.raises(SpawnedProcessError) as cm:
        osext.run_command(str(script_file), check=True)

    failure = cm.value

    # `echo hello` must never have run, since `false` failed first
    assert 'hello' not in failure.stdout
    assert failure.exitcode == 1
    assert "-reframe: command `false' failed (exit code: 1)" in failure.stdout
def __init__(self):
    """Probe the TMod4 installation and validate its version."""
    try:
        proc = osext.run_command(self.modulecmd('-V'), check=True)
    except OSError as err:
        raise ConfigError(
            'could not find a sane TMod4 installation') from err
    except SpawnedProcessError as err:
        raise ConfigError(
            'could not get the Python bindings for TMod4') from err

    # TMod4 reports its version on stderr
    match = re.match(r'^Modules Release (\S+)\s+', proc.stderr)
    if not match:
        raise ConfigError('could not retrieve the TMod4 version')

    version = match.group(1)
    try:
        major, minor = [int(part) for part in version.split('.')[:2]]
    except ValueError:
        raise ConfigError(
            'could not parse TMod4 version string: ' + version) from None

    if (major, minor) < self.MIN_VERSION:
        raise ConfigError(
            'unsupported TMod4 version: %s (required >= %s)' %
            (version, self.MIN_VERSION))

    self._version = version
    self._extra_module_paths = []
def _cray_cle_version():
    """Return the Cray CLE release string, or ``None`` if not detectable."""
    release = osext.run_command('cat /etc/opt/cray/release/cle-release')
    found = re.match(r'^RELEASE=(\S+)', release.stdout)
    return found.group(1) if found else None
def test_command_stdin(tmp_path):
    """``run_command`` must read its standard input from the given file."""
    input_file = tmp_path / 'in.txt'
    input_file.write_text('hello')
    with open(input_file) as stdin_fp:
        completed = osext.run_command('cat', stdin=stdin_fp)

    assert completed.stdout == 'hello'
def test_trap_exit(script_file):
    """A ``trap_exit`` script must report its exit code in the output."""
    with shell.generate_script(script_file, trap_exit=True) as gen:
        gen.write('echo hello')

    completed = osext.run_command(str(script_file), check=True)
    assert completed.returncode == 0
    assert 'hello' in completed.stdout
    assert '-reframe: script exiting with exit code: 0' in completed.stdout
def _execute(self, cmd, *args):
    """Run a module command, apply its emitted code and return its stderr."""
    command = self.modulecmd(cmd, *args)
    proc = osext.run_command(command)
    if re.search(r'\bERROR\b', proc.stderr) is not None:
        raise SpawnedProcessError(command, proc.stdout,
                                  proc.stderr, proc.returncode)

    # The module command prints Python code that mutates the environment
    exec(self.process(proc.stdout))
    return proc.stderr
def __init__(self):
    """Detect the Spack installation and record its version string."""
    try:
        proc = osext.run_command('spack -V')
    except OSError as err:
        raise ConfigError(
            'could not find a sane Spack installation') from err

    self._version = proc.stdout.strip()
    self._name_format = '{name}/{version}-{hash}'
def __init__(self):
    """Detect a TMod 3.1 installation through its TCL module command.

    Raises:
        ConfigError: If no sane installation is found, its version cannot
            be determined, or Python is not supported by it.
    """
    # Try to figure out if we are indeed using the TCL version
    modules_home = os.getenv('MODULESHOME')
    if modules_home is None:
        # BUG FIX: os.path.join(None, ...) would raise an uncaught
        # TypeError; fail with a proper ConfigError instead
        raise ConfigError('could not find a sane TMod31 installation: '
                          'environment variable MODULESHOME is not defined')

    modulecmd = os.path.join(modules_home, 'modulecmd.tcl')
    try:
        completed = osext.run_command(modulecmd)
    except OSError as e:
        raise ConfigError(
            'could not find a sane TMod31 installation: %s' % e) from e

    # The version is reported on stderr
    version_match = re.search(r'Release Tcl (\S+)', completed.stderr,
                              re.MULTILINE)
    if version_match is None:
        raise ConfigError('could not find a sane TMod31 installation')

    version = version_match.group(1)
    try:
        ver_major, ver_minor = [int(v) for v in version.split('.')[:2]]
    except ValueError:
        raise ConfigError(
            'could not parse TMod31 version string: ' + version) from None

    if (ver_major, ver_minor) < self.MIN_VERSION:
        raise ConfigError(
            'unsupported TMod version: %s (required >= %s)' %
            (version, self.MIN_VERSION))

    self._version = version
    self._command = '%s python' % modulecmd
    try:
        # Try the Python bindings now
        completed = osext.run_command(self._command)
    except OSError as e:
        # BUG FIX: the original message applied '%' with no conversion
        # specifier, which raised a TypeError instead of the ConfigError
        raise ConfigError(
            'could not get the Python bindings for TMod31: %s' % e) from e

    if re.search(r'Unknown shell type', completed.stderr):
        raise ConfigError(
            'Python is not supported by this TMod installation')
def __init__(self):
    """Detect a TCL TMod installation and validate version and bindings.

    Raises:
        ConfigError: If no sane installation is found, the version is
            unsupported, or Python is not supported by it.
    """
    # Try to figure out if we are indeed using the TCL version
    try:
        completed = osext.run_command('modulecmd -V')
    except OSError as e:
        raise ConfigError(
            'could not find a sane TMod installation') from e

    version_match = re.search(r'^VERSION=(\S+)', completed.stdout,
                              re.MULTILINE)
    tcl_version_match = re.search(r'^TCL_VERSION=(\S+)', completed.stdout,
                                  re.MULTILINE)
    if version_match is None or tcl_version_match is None:
        raise ConfigError('could not find a sane TMod installation')

    version = version_match.group(1)
    try:
        ver_major, ver_minor = [int(v) for v in version.split('.')[:2]]
    except ValueError:
        raise ConfigError(
            'could not parse TMod version string: ' + version) from None

    if (ver_major, ver_minor) < self.MIN_VERSION:
        raise ConfigError(
            'unsupported TMod version: %s (required >= %s)' %
            (version, self.MIN_VERSION))

    self._version = version
    try:
        # Try the Python bindings now
        completed = osext.run_command(self.modulecmd())
    except OSError as e:
        # BUG FIX: the original message applied '%' with no conversion
        # specifier, which raised a TypeError instead of the ConfigError
        raise ConfigError(
            'could not get the Python bindings for TMod: %s' % e) from e

    if re.search(r'Unknown shell type', completed.stderr):
        raise ConfigError(
            'Python is not supported by this TMod installation')
def _execute(self, cmd, *args):
    """Run a module command, apply its output and return its stderr."""
    command = self.modulecmd(cmd, *args)
    proc = osext.run_command(command, check=False)

    # Run the emitted Python code inside a scratch namespace, so that we
    # can inspect the variables it sets afterwards
    scope = {}
    exec(self.process(proc.stdout), {}, scope)

    # _mlstatus is set by the TMod4 only if the command was unsuccessful,
    # but Lmod sets it always
    if not scope.get('_mlstatus', True):
        raise SpawnedProcessError(command, proc.stdout,
                                  proc.stderr, proc.returncode)

    return proc.stderr
def poll(self, *jobs):
    """Update the state of *jobs* by querying ``squeue``.

    Polling is throttled so that ``squeue`` is not asked about jobs
    submitted less than ``SQUEUE_DELAY`` seconds ago.
    """
    if jobs:
        # Filter out non-jobs
        jobs = [job for job in jobs if job is not None]

    if not jobs:
        return

    m = max(job.submit_time for job in jobs)
    time_from_last_submit = time.time() - m
    rem_wait = self.SQUEUE_DELAY - time_from_last_submit
    if rem_wait > 0:
        time.sleep(rem_wait)

    # We don't run the command with check=True, because if the job has
    # finished already, squeue might return an error about an invalid
    # job id.
    completed = osext.run_command(
        f'squeue -h -j {",".join(job.jobid for job in jobs)} '
        f'-o "%%i|%%T|%%N|%%r"')

    # We need the match objects, so we have to use finditer()
    state_match = list(
        re.finditer(
            fr'^(?P<jobid>{self._jobid_patt})\|(?P<state>\S+)\|'
            fr'(?P<nodespec>\S*)\|(?P<reason>.+)',
            completed.stdout, re.MULTILINE))

    # Group the matches by base job id (job arrays share one id prefix)
    jobinfo = {}
    for s in state_match:
        jobid = s.group('jobid').split('_')[0]
        jobinfo.setdefault(jobid, []).append(s)

    for job in jobs:
        if job is None:
            continue

        try:
            job_match = jobinfo[job.jobid]
        except KeyError:
            job._state = 'CANCELLED' if job.is_cancelling else 'COMPLETED'
            continue

        # Join the states with ',' in case of job arrays
        job._state = ','.join(s.group('state') for s in job_match)

        # BUG FIX: pass only this job's own pending reasons (job_match);
        # the original passed the reasons of every polled job
        # (state_match), so a job could be cancelled because *another*
        # job was blocked
        self._cancel_if_blocked(
            job, [s.group('reason') for s in job_match])
        self._cancel_if_pending_too_long(job)
def _execute(self, cmd, *args):
    """Run a TMod31 command and apply the environment changes it emits."""
    command = self.modulecmd(cmd, *args)
    proc = osext.run_command(command)
    if re.search(r'\bERROR\b', proc.stderr) is not None:
        raise SpawnedProcessError(command, proc.stdout,
                                  proc.stderr, proc.returncode)

    # TMod31 writes the actual Python code into a file and only emits an
    # `exec '<path>'` line pointing to it on stdout
    exec_match = re.search(r"^exec\s'(\S+)'", proc.stdout, re.MULTILINE)
    if exec_match is None:
        raise ConfigError('could not use the python bindings')

    with open(exec_match.group(1), 'r') as script:
        payload = script.read()

    exec(self.process(payload))
    return proc.stderr
def test_command_success():
    """A plain echo must succeed and its output must be captured."""
    result = osext.run_command('echo foobar')
    assert result.returncode == 0
    assert result.stdout == 'foobar\n'
def _sysctl_topo():
    """Build a CPU-topology dictionary from ``sysctl`` output (macOS/BSD).

    Returns an empty dict if ``sysctl`` is unavailable or fails.

    NOTE(review): if any of the ``hw.*`` keys is missing from the output,
    the corresponding local (e.g. ``num_cpus``, ``cachesize``) stays
    unbound and the code below raises ``NameError`` — confirm this cannot
    happen on the supported platforms.
    """
    try:
        exec_output = osext.run_command('sysctl hw machdep.cpu', check=True)
    except (FileNotFoundError, SpawnedProcessError):
        return {}

    cpuinfo = {'topology': {}}

    # Extract the basic counts from the sysctl output
    match = re.search(r'hw\.ncpu: (?P<num_cpus>\d+)', exec_output.stdout)
    if match:
        num_cpus = int(match.group('num_cpus'))

    match = re.search(r'hw\.physicalcpu: (?P<num_cores>\d+)',
                      exec_output.stdout)
    if match:
        num_cores = int(match.group('num_cores'))

    match = re.search(r'hw\.packages: (?P<num_sockets>\d+)',
                      exec_output.stdout)
    if match:
        num_sockets = int(match.group('num_sockets'))
        cpuinfo['num_sockets'] = num_sockets

    # Cache geometry: per-level CPU sharing, sizes and line size
    match = re.search(r'hw\.cacheconfig:(?P<cacheconfig>(\s\d+)*)',
                      exec_output.stdout)
    if match:
        cacheconfig = list(map(int, match.group('cacheconfig').split()))

    match = re.search(r'hw\.cachesize:(?P<cachesize>(\s\d+)*)',
                      exec_output.stdout)
    if match:
        cachesize = list(map(int, match.group('cachesize').split()))

    match = re.search(r'hw\.cachelinesize: (?P<linesize>\d+)',
                      exec_output.stdout)
    if match:
        linesize = int(match.group('linesize'))

    # index 0 is referring to memory
    cache_associativity = [0]
    for i in range(1, len(cachesize)):
        # A zero size marks the first non-existent cache level
        if cachesize[i] == 0:
            break

        match = re.search(
            rf'machdep\.cpu\.cache\.L{i}_associativity: '
            rf'(?P<associativity>\d+)', exec_output.stdout)
        assoc = int(match.group('associativity')) if match else 0
        cache_associativity.append(assoc)

    num_cpus_per_socket = num_cpus // num_sockets
    num_cpus_per_core = num_cpus // num_cores

    # Fill in the cpuinfo
    cpuinfo['num_cpus'] = num_cpus
    cpuinfo['num_cpus_per_socket'] = num_cpus_per_socket
    cpuinfo['num_cpus_per_core'] = num_cpus_per_core

    # A single NUMA node covering all CPUs is assumed here
    cpuinfo['topology']['numa_nodes'] = [_str_from_bits(range(num_cpus))]
    cpuinfo['topology']['sockets'] = [
        _str_from_bits(range(start, start + num_cpus_per_socket))
        for start in range(0, num_cpus, num_cpus_per_socket)
    ]
    cpuinfo['topology']['cores'] = [
        _str_from_bits(range(start, start + num_cpus_per_core))
        for start in range(0, num_cpus, num_cpus_per_core)
    ]
    cpuinfo['topology']['caches'] = []
    for i in range(1, len(cache_associativity)):
        # One entry per cache level, with the cpusets sharing each cache
        t = {
            'type': f'L{i}',
            'size': cachesize[i],
            'linesize': linesize,
            'associativity': cache_associativity[i],
            'num_cpus': cacheconfig[i],
            'cpusets': [
                _str_from_bits(range(start, start + cacheconfig[i]))
                for start in range(0, num_cpus, cacheconfig[i])
            ]
        }
        cpuinfo['topology']['caches'].append(t)

    return cpuinfo
def _execute(self, cmd, *args):
    """Run a Spack command strictly and return its standard output."""
    spack_cmd = self.modulecmd(cmd, *args)
    return osext.run_command(spack_cmd, check=True).stdout
def poll(self, *jobs):
    """Update the state of *jobs* by parsing ``qstat -xml`` output (SGE).

    Jobs not reported by the scheduler are marked ``COMPLETED``.

    Raises:
        JobSchedulerError: If ``qstat`` fails or the queue information
            cannot be retrieved.
    """
    if jobs:
        # Filter out non-jobs
        jobs = [job for job in jobs if job is not None]

    if not jobs:
        return

    user = osext.osuser()
    completed = osext.run_command(f'qstat -xml -u {user}')
    if completed.returncode != 0:
        raise JobSchedulerError(
            f'qstat failed with exit code {completed.returncode} '
            f'(standard error follows):\n{completed.stderr}')

    # Index the jobs to poll on their jobid
    jobs_to_poll = {job.jobid: job for job in jobs}

    # Parse the XML
    root = ET.fromstring(completed.stdout)

    # We are iterating over the returned XML and update the status of the
    # jobs relevant to ReFrame; the naming convention of variables matches
    # that of SGE's XML output
    known_jobs = set()   # jobs known to the SGE scheduler
    for queue_info in root:
        # Reads the XML and prints jobs with status belonging to user.
        if queue_info is None:
            raise JobSchedulerError('could not retrieve queue information')

        for job_list in queue_info:
            if job_list.find("JB_owner").text != user:
                # Not a job of this user.
                continue

            jobid = job_list.find("JB_job_number").text
            if jobid not in jobs_to_poll:
                # Not a reframe job
                continue

            state = job_list.find("state").text
            job = jobs_to_poll[jobid]
            known_jobs.add(job)

            # For the list of known statuses see `man 5 sge_status`
            # (https://arc.liv.ac.uk/SGE/htmlman/htmlman5/sge_status.html)
            if state in ['r', 'hr', 't', 'Rr', 'Rt']:
                job._state = 'RUNNING'
            elif state in ['qw', 'Rq', 'hqw', 'hRwq']:
                job._state = 'PENDING'
            elif state in ['s', 'ts', 'S', 'tS', 'T', 'tT',
                           'Rs', 'Rts', 'RS', 'RtS', 'RT', 'RtT']:
                job._state = 'SUSPENDED'
            elif state in ['Eqw', 'Ehqw', 'EhRqw']:
                job._state = 'ERROR'
            elif state in ['dr', 'dt', 'dRr', 'dRt', 'ds',
                           'dS', 'dT', 'dRs', 'dRS', 'dRT']:
                job._state = 'DELETING'
            elif state == 'z':
                job._state = 'COMPLETED'

    # Mark any "unknown" job as completed
    unknown_jobs = set(jobs) - known_jobs
    for job in unknown_jobs:
        self.log(f'Job {job.jobid} not known to scheduler, '
                 f'assuming job completed')
        job._state = 'COMPLETED'
def _get_nodes_by_name(self, nodespec):
    """Query ``scontrol`` for *nodespec* and build the node descriptors."""
    scontrol_out = osext.run_command(
        'scontrol -a show -o node %s' % nodespec)
    return _create_nodes(scontrol_out.stdout.splitlines())
def test_command_success_cmd_seq():
    """``run_command`` must also accept the command as an argument list."""
    result = osext.run_command(['echo', 'foobar'])
    assert result.returncode == 0
    assert result.stdout == 'foobar\n'
def poll(self, *jobs):
    """Update the state of *jobs* by parsing ``qstat -f`` output (PBS).

    Raises:
        JobSchedulerError: If ``qstat`` fails with an unexpected exit code.
    """
    def output_ready(job):
        # We report a job as finished only when its stdout/stderr are
        # written back to the working directory
        stdout = os.path.join(job.workdir, job.stdout)
        stderr = os.path.join(job.workdir, job.stderr)
        return os.path.exists(stdout) and os.path.exists(stderr)

    if jobs:
        # Filter out non-jobs
        jobs = [job for job in jobs if job is not None]

    if not jobs:
        return

    completed = osext.run_command(
        f'qstat -f {" ".join(job.jobid for job in jobs)}')

    # Depending on the configuration, completed jobs will remain on the job
    # list for a limited time, or be removed upon completion.
    # If qstat cannot find any of the job IDs, it will return 153.
    # Otherwise, it will return with return code 0 and print information
    # only for the jobs it could find.
    if completed.returncode in (153, 35):
        self.log(f'Return code is {completed.returncode}')
        for job in jobs:
            job._state = 'COMPLETED'
            if job.cancelled or output_ready(job):
                self.log(f'Assuming job {job.jobid} completed')
                job._completed = True

        return

    if completed.returncode != 0:
        raise JobSchedulerError(
            f'qstat failed with exit code {completed.returncode} '
            f'(standard error follows):\n{completed.stderr}')

    # Store information for each job separately
    jobinfo = {}
    for job_raw_info in completed.stdout.split('\n\n'):
        jobid_match = re.search(r'^Job Id:\s*(?P<jobid>\S+)',
                                job_raw_info, re.MULTILINE)
        if jobid_match:
            jobid = jobid_match.group('jobid')
            jobinfo[jobid] = job_raw_info

    for job in jobs:
        if job.jobid not in jobinfo:
            # Scheduler no longer knows this job; treat it as completed
            self.log(f'Job {job.jobid} not known to scheduler')
            job._state = 'COMPLETED'
            if job.cancelled or output_ready(job):
                self.log(f'Assuming job {job.jobid} completed')
                job._completed = True

            continue

        info = jobinfo[job.jobid]
        state_match = re.search(r'^\s*job_state = (?P<state>[A-Z])',
                                info, re.MULTILINE)
        if not state_match:
            self.log(f'Job state not found (job info follows):\n{info}')
            continue

        state = state_match.group('state')
        job._state = JOB_STATES[state]
        nodelist_match = re.search(r'exec_host = (?P<nodespec>[\S\t\n]+)',
                                   info, re.MULTILINE)
        if nodelist_match:
            nodespec = nodelist_match.group('nodespec')
            # The nodespec may continue on the next lines; strip the
            # wrapping whitespace before parsing it
            nodespec = re.sub(r'[\n\t]*', '', nodespec)
            self._update_nodelist(job, nodespec)

        if job.state == 'COMPLETED':
            exitcode_match = re.search(
                r'^\s*exit_status = (?P<code>\d+)',
                info, re.MULTILINE,
            )
            if exitcode_match:
                job._exitcode = int(exitcode_match.group('code'))

            # We report a job as finished only when its stdout/stderr are
            # written back to the working directory
            done = job.cancelled or output_ready(job)
            if done:
                job._completed = True
        elif (job.state in ['QUEUED', 'HELD', 'WAITING'] and
              job.max_pending_time):
            if (time.time() - job.submit_time >= job.max_pending_time):
                self.cancel(job)
                job._exception = JobError('maximum pending time exceeded',
                                          job.jobid)
def test_command_error_cmd_seq():
    """A failing list-form command with ``check=True`` must raise."""
    expected = r"command 'false' failed with exit code 1"
    with pytest.raises(SpawnedProcessError, match=expected):
        osext.run_command(['false'], check=True)