def get_system_info(self, hostname=None):
    """
    Collect memory and disk figures for a host and store them as
    attributes on this object.

    On success the following attributes are set:

    - ``system_total_ram``: ``MemTotal`` value divided by ``2**20``
    - ``system_ram``: ``MemAvailable`` value divided by ``2**20`` or,
      when that line is absent, ``MemFree + Buffers + Cached``
    - ``system_disk``: 4th field of the ``df -k $PBS_HOME`` data row
      divided by ``2**20``
    - ``system_disk_used_percent``: 5th field of that row without the
      trailing ``%``

    Any figure that cannot be read is logged as an error and the
    corresponding attribute is left unset.

    :param hostname: host to query, passed through to DshUtils
    :type hostname: str or None
    """
    du = DshUtils()
    divisor = 2 ** 20
    # str() below: hostname defaults to None and must still be loggable
    host_label = str(hostname)

    # getting RAM size in gb
    mem_info = du.cat(hostname, "/proc/meminfo")
    if mem_info['rc'] != 0:
        _msg = 'failed to get content of /proc/meminfo of host: '
        self.logger.error(_msg + host_label)
    else:
        mem_available = mem_free = buffers = cached = None
        for line in mem_info['out']:
            if "MemTotal" in line:
                self.system_total_ram = float(line.split()[1]) / divisor
            elif "MemAvailable" in line:
                mem_available = float(line.split()[1]) / divisor
                # nothing after this line is needed once it is found
                break
            elif "MemFree" in line:
                mem_free = float(line.split()[1]) / divisor
            elif "Buffers" in line:
                buffers = float(line.split()[1]) / divisor
            elif line.startswith("Cached"):
                cached = float(line.split()[1]) / divisor
        if mem_available is not None:
            self.system_ram = mem_available
        elif None not in (mem_free, buffers, cached):
            self.system_ram = mem_free + buffers + cached
        else:
            # neither MemAvailable nor the complete fallback triple exists
            _msg = 'failed to parse /proc/meminfo of host: '
            self.logger.error(_msg + host_label)

    # getting disk size in gb
    pbs_conf = du.parse_pbs_config(hostname)
    pbs_home_info = du.run_cmd(hostname,
                               cmd=['df', '-k', pbs_conf['PBS_HOME']])
    if pbs_home_info['rc'] != 0:
        _msg = 'failed to get output of df -k command of host: '
        self.logger.error(_msg + host_label)
    else:
        # Join everything after the header line so that a data row which
        # df wrapped onto a second line still yields the same field order.
        fields = " ".join(pbs_home_info['out'][1:]).split()
        try:
            disk_free = float(fields[3]) / divisor
            disk_used_percent = float(fields[4].rstrip('%'))
        except (IndexError, ValueError):
            _msg = 'failed to parse output of df -k command of host: '
            self.logger.error(_msg + host_label)
        else:
            self.system_disk = disk_free
            self.system_disk_used_percent = disk_used_percent
def get_system_info(self, hostname=None):
    """
    Collect available memory and disk space for a host and store them
    as attributes on this object.

    On success the following attributes are set:

    - ``system_ram``: ``MemAvailable`` value from ``/proc/meminfo``
      divided by ``2**20``
    - ``system_disk``: 4th field of the ``df -k $PBS_HOME`` data row
      divided by ``2**20``

    A figure that cannot be read is logged as an error and the
    corresponding attribute is left unset.

    :param hostname: host to query, passed through to DshUtils
    :type hostname: str or None
    """
    du = DshUtils()
    divisor = 2 ** 20
    # str() below: hostname defaults to None and must still be loggable
    host_label = str(hostname)

    # getting RAM size in gb
    mem_info = du.cat(hostname, "/proc/meminfo")
    if mem_info['rc'] != 0:
        _msg = 'failed to get content of /proc/meminfo of host: '
        self.logger.error(_msg + host_label)
    else:
        for line in mem_info['out']:
            if "MemAvailable" in line:
                self.system_ram = float(line.split()[1]) / divisor
                break

    # getting disk size in gb
    pbs_conf = du.parse_pbs_config(hostname)
    pbs_home_info = du.run_cmd(hostname,
                               cmd=['df', '-k', pbs_conf['PBS_HOME']])
    if pbs_home_info['rc'] != 0:
        _msg = 'failed to get output of df -k command of host: '
        self.logger.error(_msg + host_label)
    else:
        # Join everything after the header line so that a data row which
        # df wrapped onto a second line still yields the same field order.
        fields = " ".join(pbs_home_info['out'][1:]).split()
        try:
            self.system_disk = float(fields[3]) / divisor
        except (IndexError, ValueError):
            _msg = 'failed to parse output of df -k command of host: '
            self.logger.error(_msg + host_label)
def check_hardware_status_and_core_files(self, test):
    """
    function checks hardware status and core files every 5 minutes

    The hosts listed under ``servers``, ``moms`` and ``comms`` in
    ``self.param_dict`` are inspected, except those whose machine object
    on the test reports a platform other than linux, shasta or cray.
    The method re-arms itself through a 300 second ``Timer`` stored in
    ``self.hardware_report_timer``.

    For every host a warning is logged when disk usage is between 70%
    and 95%, and when files matching ``core*`` are found in the
    ``mom_priv``, ``server_priv`` or ``sched_priv`` directory of
    PBS_HOME or in the home directory of any user in PBS_ALL_USERS.

    :param test: object exposing the running test as ``test.test`` or
                 ``test.context``; when it has neither attribute the
                 method returns without doing anything
    :raises SkipTest: when disk information of a host is unavailable or
                      its disk usage is 95% or more (the timer is
                      cancelled first)
    """
    du = DshUtils()
    systems = list(self.param_dict['servers'])
    systems.extend(self.param_dict['moms'])
    systems.extend(self.param_dict['comms'])
    systems = list(set(systems))

    if hasattr(test, 'test'):
        _test = test.test
    elif hasattr(test, 'context'):
        _test = test.context
    else:
        return None

    # drop hosts running on platforms this check does not support
    for name in ['servers', 'moms', 'comms', 'clients']:
        mlist = None
        if (hasattr(_test, name) and
                (getattr(_test, name, None) is not None)):
            mlist = getattr(_test, name).values()
        if mlist:
            for mc in mlist:
                platform = mc.platform
                if ((platform not in ['linux', 'shasta', 'cray']) and
                        (mc.hostname in systems)):
                    systems.remove(mc.hostname)

    # schedule the next run before doing the (possibly slow) checks
    self.hardware_report_timer = Timer(
        300, self.check_hardware_status_and_core_files, args=(test, ))
    self.hardware_report_timer.start()

    for hostname in systems:
        hr = SystemInfo()
        hr.get_system_info(hostname)
        # monitors disk
        used_disk_percent = getattr(hr, 'system_disk_used_percent', None)
        if used_disk_percent is None:
            _msg = hostname
            _msg += ": unable to get disk info"
            self.hardware_report_timer.cancel()
            raise SkipTest(_msg)
        elif 70 <= used_disk_percent < 95:
            _msg = hostname + ": disk usage is at "
            _msg += str(used_disk_percent) + "%"
            _msg += ", disk cleanup is recommended."
            self.logger.warning(_msg)
        elif used_disk_percent >= 95:
            _msg = hostname + ":disk usage > 95%, skipping the test(s)"
            self.hardware_report_timer.cancel()
            raise SkipTest(_msg)

        # checks for core files
        pbs_conf = du.parse_pbs_config(hostname)
        for priv_dir in ("mom_priv", "server_priv", "sched_priv"):
            priv_path = os.path.join(pbs_conf["PBS_HOME"], priv_dir)
            if not du.isdir(hostname=hostname, path=priv_path):
                continue
            priv_files = du.listdir(hostname=hostname, path=priv_path,
                                    sudo=True, fullpath=False)
            # same falsy guard as the user home check below: a failed
            # listing must not be handed to fnmatch.filter
            if priv_files and fnmatch.filter(priv_files, "core*"):
                _msg = hostname + ": core files found in "
                _msg += priv_path
                self.logger.warning(_msg)

        for u in PBS_ALL_USERS:
            user_home_files = du.listdir(hostname=hostname, path=u.home,
                                         sudo=True, fullpath=False,
                                         runas=u.name)
            if user_home_files and fnmatch.filter(user_home_files,
                                                  "core*"):
                _msg = hostname + ": user-" + str(u)
                _msg += ": core files found in "
                self.logger.warning(_msg + u.home)
class Job(ResourceResv):

    """
    PBS Job. Attributes and Resources

    :param username: Job username
    :type username: str or None
    :param attrs: Job attributes
    :type attrs: Dictionary
    :param jobname: Name of the PBS job
    :type jobname: str or None
    """

    dflt_attributes = {
        ATTR_N: 'STDIN',
        ATTR_j: 'n',
        ATTR_m: 'a',
        ATTR_p: '0',
        ATTR_r: 'y',
        ATTR_k: 'oe',
    }
    runtime = 100
    du = DshUtils()

    def __init__(self, username=TEST_USER, attrs=None, jobname=None):
        # the class-level DshUtils is only used to look up the platform
        self.platform = self.du.get_platform()
        self.server = {}
        self.script = None
        self.script_body = None
        if username is not None:
            self.username = str(username)
        else:
            self.username = None
        # shadows the class attribute; create_script and
        # create_eatcpu_job build a fresh DshUtils when they need one
        self.du = None
        self.interactive_handle = None
        if attrs is None:
            attrs = {}
        if self.platform == 'cray' or self.platform == 'craysim':
            # work on a copy so the caller's attributes are not modified
            attrs = OrderedDict(attrs)
            if 'Resource_List.select' in attrs:
                select = attrs['Resource_List.select']
                attrs['Resource_List.select'] = self.add_cray_vntype(select)
            elif 'Resource_List.vntype' not in attrs:
                attrs['Resource_List.vntype'] = 'cray_compute'

        PBSObject.__init__(self, None, attrs, self.dflt_attributes)

        if jobname is not None:
            self.custom_attrs[ATTR_N] = jobname
            self.attributes[ATTR_N] = jobname
        self.set_variable_list(self.username)
        self.set_sleep_time(100)

    def __del__(self):
        del self.__dict__

    def add_cray_vntype(self, select=None):
        """
        Cray specific function to add vntype as ``cray_compute`` to each
        select chunk

        A chunk is left untouched when it already names a ``vntype``,
        ``host`` or ``vnode`` resource.

        :param select: PBS select statement
        :type select: str or None
        :returns: select statement with the vntype added where needed
        """
        chunks = []
        for chunk in select.split('+'):
            resources = PbsTypeSelect(chunk).resources
            novntype = 'vntype' not in resources
            nohost = 'host' not in resources
            novnode = 'vnode' not in resources
            if novntype and nohost and novnode:
                chunk = chunk + ":vntype=cray_compute"
            chunks.append(chunk)
        return "+".join(chunks)

    def set_attributes(self, a=None):
        """
        set attributes and custom attributes on this job.
        custom attributes are used when converting attributes to CLI.
        In case of Cray platform if 'Resource_List.vntype' is set
        already then remove it and add vntype value to each chunk of a
        select statement.

        :param a: Attribute dictionary (a list of key/value pairs is
                  accepted as well)
        :type a: Dictionary
        """
        # always work on a private ordered copy: the Cray branch below
        # rewrites the select statement and must not touch the caller's
        # object
        a = OrderedDict() if a is None else OrderedDict(a)

        self.attributes = OrderedDict(list(self.dflt_attributes.items()) +
                                      list(self.attributes.items()) +
                                      list(a.items()))

        if self.platform == 'cray' or self.platform == 'craysim':
            s = 'Resource_List.select' in a
            v = 'Resource_List.vntype' in self.custom_attrs
            if s and v:
                del self.custom_attrs['Resource_List.vntype']
                select = a['Resource_List.select']
                a['Resource_List.select'] = self.add_cray_vntype(select)

        self.custom_attrs = OrderedDict(list(self.custom_attrs.items()) +
                                        list(a.items()))

    def set_variable_list(self, user=None, workdir=None):
        """
        Customize the ``Variable_List`` job attribute to ``<user>``

        :param user: user the variables are built for; the user owning
                     the current process is used when None
        :type user: str or None
        :param workdir: value for ``PBS_O_WORKDIR``; the current working
                        directory is used when None
        :type workdir: str or None
        """
        if user is None:
            userinfo = pwd.getpwuid(os.getuid())
            user = userinfo[0]
            homedir = userinfo[5]
        else:
            try:
                homedir = pwd.getpwnam(user)[5]
            except Exception:
                # user is not known on this host: leave the home empty
                homedir = ""

        self.username = user

        s = ['PBS_O_HOME=' + homedir]
        s += ['PBS_O_LANG=en_US.UTF-8']
        s += ['PBS_O_LOGNAME=' + user]
        s += ['PBS_O_PATH=/usr/bin:/bin:/usr/bin:/usr/local/bin']
        s += ['PBS_O_MAIL=/var/spool/mail/' + user]
        s += ['PBS_O_SHELL=/bin/bash']
        s += ['PBS_O_SYSTEM=Linux']
        if workdir is not None:
            wd = workdir
        else:
            wd = os.getcwd()
        s += ['PBS_O_WORKDIR=' + str(wd)]

        self.attributes[ATTR_v] = ",".join(s)
        self.set_attributes()

    def set_sleep_time(self, duration):
        """
        Set the sleep duration for this job.

        :param duration: The duration, in seconds, to sleep
        :type duration: int
        """
        self.set_execargs('/bin/sleep', duration)

    def set_execargs(self, executable, arguments=None):
        """
        Set the executable and arguments to use for this job

        :param executable: path to an executable. No checks are made.
        :type executable: str
        :param arguments: arguments to executable.
        :type arguments: str or list or int
        """
        msg = ['job: executable set to ' + str(executable)]
        if arguments is not None:
            msg += [' with arguments: ' + str(arguments)]

        self.logger.info("".join(msg))
        self.attributes[ATTR_executable] = executable
        if arguments is not None:
            # arguments of any other type result in an empty argument list
            args = ''
            xml_beginargs = '<jsdl-hpcpa:Argument>'
            xml_endargs = '</jsdl-hpcpa:Argument>'
            if isinstance(arguments, list):
                for a in arguments:
                    args += xml_beginargs + str(a) + xml_endargs
            elif isinstance(arguments, str):
                args = xml_beginargs + arguments + xml_endargs
            elif isinstance(arguments, int):
                args = xml_beginargs + str(arguments) + xml_endargs
            self.attributes[ATTR_Arglist] = args
        else:
            self.unset_attributes([ATTR_Arglist])
        self.set_attributes()

    def create_script(self, body=None, asuser=None, hostname=None):
        """
        Create a job script from a given body of text into a
        temporary location

        :param body: the body of the script
        :type body: str or list or None
        :param asuser: Optionally the user to own this script,
                      defaults to current user
        :type asuser: str or None
        :param hostname: The host on which the job script is to
                         be created
        :type hostname: str or None
        :returns: path of the created script, or None when no body is
                  given
        """

        if body is None:
            return None

        if isinstance(body, list):
            body = '\n'.join(body)

        if self.platform == 'cray' or self.platform == 'craysim':
            # add the cray vntype to every select found in a #PBS
            # directive of the script
            body = body.split("\n")
            for i, line in enumerate(body):
                if line.startswith("#PBS") and "select=" in line:
                    if 'Resource_List.vntype' in self.attributes:
                        self.unset_attributes(['Resource_List.vntype'])
                    line_arr = line.split(" ")
                    for j, element in enumerate(line_arr):
                        select = element.startswith("select=")
                        lselect = element.startswith("-lselect=")
                        if select or lselect:
                            # strip the "-lselect=" / "select=" prefix
                            if lselect:
                                sel_str = element[9:]
                            else:
                                sel_str = element[7:]
                            sel_str = self.add_cray_vntype(select=sel_str)
                            if lselect:
                                line_arr[j] = "-lselect=" + sel_str
                            else:
                                line_arr[j] = "select=" + sel_str
                    body[i] = " ".join(line_arr)
            body = '\n'.join(body)

        # If the user has a userhost, the job will run from there
        # so the script should be made there
        if self.username:
            user = PbsUser.get_user(self.username)
            if user.host:
                hostname = user.host
                asuser = user.name

        self.script_body = body
        if self.du is None:
            self.du = DshUtils()
        # First create the temporary file as current user and only change
        # its mode once the current user has written to it
        fn = self.du.create_temp_file(hostname, prefix='PtlPbsJobScript',
                                      asuser=asuser, body=body)
        self.du.chmod(hostname, fn, mode=0o755)
        self.script = fn
        return fn

    def create_subjob_id(self, job_array_id, subjob_index):
        """
        insert subjob index into the square brackets of job array id

        :param job_array_id: PBS parent array job id
        :type job_array_id: str
        :param subjob_index: index of subjob
        :type subjob_index: int
        :returns: subjob id string
        """
        idx = job_array_id.find('[]')
        return job_array_id[:idx + 1] + str(subjob_index) + \
            job_array_id[idx + 1:]

    def create_eatcpu_job(self, duration=None, hostname=None):
        """
        Create a job that eats cpu indefinitely or for the given
        duration of time

        :param duration: The duration, in seconds, to sleep
        :type duration: int
        :param hostname: hostname on which to execute the job
        :type hostname: str or None
        :returns: path of the generated script
        :raises AssertionError: when the script permissions cannot be set
        """
        if self.du is None:
            self.du = DshUtils()
        shebang_line = '#!' + self.du.which(hostname, exe='python3')
        body = """
import signal
import sys

x = 0


def receive_alarm(signum, stack):
    sys.exit()

signal.signal(signal.SIGALRM, receive_alarm)

if (len(sys.argv) > 1):
    input_time = sys.argv[1]
    print('Terminating after %s seconds' % input_time)
    signal.alarm(int(input_time))
else:
    print('Running indefinitely')

while True:
    x += 1
"""
        script_body = shebang_line + body
        script_path = self.du.create_temp_file(hostname=hostname,
                                               body=script_body,
                                               suffix='.py')
        pbs_conf = self.du.parse_pbs_config(hostname)
        shell_path = os.path.join(pbs_conf['PBS_EXEC'],
                                  'bin', 'pbs_python')
        a = {ATTR_S: shell_path}
        self.set_attributes(a)
        mode = 0o755
        if not self.du.chmod(hostname=hostname, path=script_path, mode=mode,
                             sudo=True):
            raise AssertionError("Failed to set permissions for file %s"
                                 " to %s" % (script_path, oct(mode)))
        self.set_execargs(script_path, duration)
        return script_path
def check_hardware_status_and_core_files(self):
    """
    function checks hardware status and core files every 5 minutes

    The hosts listed under ``servers``, ``moms`` and ``comms`` in
    ``self.param_dict`` are inspected. The method re-arms itself
    through a 300 second ``Timer`` stored in
    ``self.hardware_report_timer``.

    For every host a warning is logged when disk usage is between 70%
    and 95%, and when files matching ``core*`` are found in the
    ``mom_priv``, ``server_priv`` or ``sched_priv`` directory of
    PBS_HOME or in the home directory of any user in PBS_ALL_USERS.

    :raises SkipTest: when disk information of a host is unavailable or
                      its disk usage is 95% or more (the timer is
                      cancelled first)
    """
    du = DshUtils()
    # schedule the next run before doing the (possibly slow) checks
    self.hardware_report_timer = Timer(
        300, self.check_hardware_status_and_core_files)
    self.hardware_report_timer.start()

    systems = list(self.param_dict['servers'])
    systems.extend(self.param_dict['moms'])
    systems.extend(self.param_dict['comms'])
    systems = list(set(systems))

    for hostname in systems:
        hr = SystemInfo()
        hr.get_system_info(hostname)
        # monitors disk
        used_disk_percent = getattr(hr, 'system_disk_used_percent', None)
        if used_disk_percent is None:
            _msg = hostname
            _msg += ": unable to get disk info"
            self.hardware_report_timer.cancel()
            raise SkipTest(_msg)
        elif 70 <= used_disk_percent < 95:
            _msg = hostname + ": disk usage is at "
            _msg += str(used_disk_percent) + "%"
            _msg += ", disk cleanup is recommended."
            self.logger.warning(_msg)
        elif used_disk_percent >= 95:
            _msg = hostname + ":disk usage > 95%, skipping the test(s)"
            self.hardware_report_timer.cancel()
            raise SkipTest(_msg)

        # checks for core files
        pbs_conf = du.parse_pbs_config(hostname)
        for priv_dir in ("mom_priv", "server_priv", "sched_priv"):
            priv_path = os.path.join(pbs_conf["PBS_HOME"], priv_dir)
            if not du.isdir(hostname=hostname, path=priv_path):
                continue
            priv_files = du.listdir(hostname=hostname, path=priv_path,
                                    sudo=True, fullpath=False)
            # same falsy guard as the user home check below: a failed
            # listing must not be handed to fnmatch.filter
            if priv_files and fnmatch.filter(priv_files, "core*"):
                _msg = hostname + ": core files found in "
                _msg += priv_path
                self.logger.warning(_msg)

        for u in PBS_ALL_USERS:
            user_home_files = du.listdir(hostname=hostname, path=u.home,
                                         sudo=True, fullpath=False,
                                         runas=u.name)
            if user_home_files and fnmatch.filter(user_home_files,
                                                  "core*"):
                _msg = hostname + ": user-" + str(u)
                _msg += ": core files found in "
                self.logger.warning(_msg + u.home)