def priv_ports_info(hostname=None):
    """
    Return a list of privileged ports in use on a given host

    :param hostname: The host on which to query privilege ports usage.
                     Defaults to the local host
    :type hostname: str or None
    :returns: list of report lines, or False if the netstat command failed
    """
    from ptl.utils.pbs_dshutils import DshUtils
    # Raw strings: the original non-raw pattern relied on invalid escape
    # sequences (\s, \d, \w), which are a DeprecationWarning and slated to
    # become errors in newer Python versions.
    netstat_tag = re.compile(r"tcp[\s]+[\d]+[\s]+[\d]+[\s]+"
                             r"(?P<srchost>[\w\*\.]+):(?P<srcport>[\d]+)"
                             r"[\s]+(?P<desthost>[\.\w\*:]+):"
                             r"(?P<destport>[\d]+)"
                             r"[\s]+(?P<state>[\w]+).*")
    du = DshUtils()
    ret = du.run_cmd(hostname, ['netstat', '-at', '--numeric-ports'])
    if ret['rc'] != 0:
        return False
    msg = []
    lines = ret['out']
    # host -> list of distinct privileged ports seen for that host
    resv_ports = {}
    # hosts that appeared as the *source* side of a privileged connection
    source_hosts = []
    for line in lines:
        m = netstat_tag.match(line)
        if m:
            srcport = int(m.group('srcport'))
            srchost = m.group('srchost')
            destport = int(m.group('destport'))
            desthost = m.group('desthost')
            if srcport < 1024:
                if srchost not in source_hosts:
                    source_hosts.append(srchost)
                msg.append(line)
                if srchost not in resv_ports:
                    resv_ports[srchost] = [srcport]
                elif srcport not in resv_ports[srchost]:
                    resv_ports[srchost].append(srcport)
            if destport < 1024:
                msg.append(line)
                if desthost not in resv_ports:
                    resv_ports[desthost] = [destport]
                elif destport not in resv_ports[desthost]:
                    resv_ports[desthost].append(destport)
    if len(resv_ports) > 0:
        msg.append('\nPrivilege ports in use: ')
        for k, v in resv_ports.items():
            # map(str, v): the ports are ints; the original wrapped this in
            # a redundant lambda plus an outer str() around the joined str
            msg.append('\t' + k + ': ' + ",".join(map(str, v)))
        for sh in source_hosts:
            msg.append('\nTotal on ' + sh + ': ' + str(len(resv_ports[sh])))
    else:
        msg.append('No privileged ports currently allocated')
    return msg
class SyncData(threading.Thread):
    """
    Sync thread: drains (host, datadir, build-info, suite, hostname,
    test-name, log-prefix) tuples off a queue and copies the collected
    post-analysis data for that host into the shared path.
    """

    def __init__(self, sharedpath, queue):
        """
        :param sharedpath: destination root directory shared across hosts
        :param queue: queue of (host, datadir, bi, sn, hostname, tn, lp)
                      tuples produced by the data-saving plugin
        """
        threading.Thread.__init__(self)
        self.sharedpath = sharedpath
        self.queue = queue
        # cleared by stop() to make run() fall out of its loop
        self._go = True
        self.du = DshUtils()

    def run(self):
        while self._go:
            try:
                (host, datadir, bi, sn,
                 hostname, tn, lp) = self.queue.get(False, 1.0)
            except Queue.Empty:
                # nothing queued yet; poll again until stop() is called
                continue
            destdatadir = os.path.join(self.sharedpath, bi, sn, hostname,
                                       tn, lp)
            homedir = os.path.join(datadir, 'PBS_' + host)
            # Build a one-shot shell script that mirrors the per-test data
            # directory into the shared destination.
            _s = ['#!/bin/bash']
            _s += ['mkdir -p %s' % (destdatadir)]
            _s += ['chmod -R 0755 %s' % (destdatadir)]
            _s += ['cp -rp %s %s' % (homedir, destdatadir)]
            _s += ['cp %s/qstat_tf %s' % (datadir, destdatadir)]
            _s += ['cp %s/pbsnodes %s' % (datadir, destdatadir)]
            _s += ['cp %s/print_server %s' % (datadir, destdatadir)]
            _s += ['cp %s/logfile_* %s' % (datadir, destdatadir)]
            _s += ['cat %s/uptime >> %s/uptime' % (datadir, destdatadir)]
            _s += ['cat %s/vmstat >> %s/vmstat' % (datadir, destdatadir)]
            _s += ['cat %s/netstat >> %s/netstat' % (datadir, destdatadir)]
            _s += ['cat %s/ps >> %s/ps' % (datadir, destdatadir)]
            _s += ['cat %s/df >> %s/df' % (datadir, destdatadir)]
            # 0o755 instead of the Python-2-only octal literal 0755: valid
            # on Python 2.6+ and required on Python 3, same value.
            fd, fn = self.du.mkstemp(host, mode=0o755, body='\n'.join(_s))
            os.close(fd)
            self.du.run_cmd(host, cmd=fn, sudo=True)

    def stop(self):
        """Ask the run() loop to exit after its current iteration."""
        self._go = False
def get_system_info(self, hostname=None):
    """
    Collect RAM and disk statistics of the given host and store them on
    self: system_total_ram, system_ram, system_disk (all in GB) and
    system_disk_used_percent.

    :param hostname: host to query; defaults to the local host
    :type hostname: str or None
    """
    du = DshUtils()
    # getting RAM size in gb
    mem_info = du.cat(hostname, "/proc/meminfo")
    if mem_info['rc'] != 0:
        _msg = 'failed to get content of /proc/meminfo of host: '
        # str(): hostname defaults to None, and concatenating None to a
        # str raises TypeError, masking the real failure
        self.logger.error(_msg + str(hostname))
    else:
        got_mem_available = False
        # Default the fallback components to 0 so the sum below cannot
        # raise NameError when /proc/meminfo lacks one of the fields.
        mem_free = buffers = cached = 0.0
        for i in mem_info['out']:
            if "MemTotal" in i:
                # values in /proc/meminfo are kB; 2**20 kB == 1 GB
                self.system_total_ram = float(i.split()[1]) / (2**20)
            elif "MemAvailable" in i:
                mem_available = float(i.split()[1]) / (2**20)
                got_mem_available = True
                break
            elif "MemFree" in i:
                mem_free = float(i.split()[1]) / (2**20)
            elif "Buffers" in i:
                buffers = float(i.split()[1]) / (2**20)
            elif i.startswith("Cached"):
                # startswith() so "SwapCached" does not match
                cached = float(i.split()[1]) / (2**20)
        if got_mem_available:
            self.system_ram = mem_available
        else:
            # kernel has no MemAvailable line; approximate it
            self.system_ram = mem_free + buffers + cached
    # getting disk size in gb
    pbs_conf = du.parse_pbs_config(hostname)
    pbs_home_info = du.run_cmd(hostname, cmd=['df', '-k',
                                              pbs_conf['PBS_HOME']])
    if pbs_home_info['rc'] != 0:
        _msg = 'failed to get output of df -k command of host: '
        self.logger.error(_msg + str(hostname))
    else:
        disk_info = pbs_home_info['out']
        # second output line holds the data for PBS_HOME's filesystem
        disk_size = disk_info[1].split()
        # field 3 = available kB, field 4 = 'NN%' used
        self.system_disk = float(disk_size[3]) / (2**20)
        self.system_disk_used_percent = float(disk_size[4].rstrip('%'))
def get_system_info(self, hostname=None): du = DshUtils() # getting RAM size in gb mem_info = du.cat(hostname, "/proc/meminfo") if mem_info['rc'] != 0: _msg = 'failed to get content of /proc/meminfo of host: ' self.logger.error(_msg + hostname) else: for i in mem_info['out']: if "MemAvailable" in i: self.system_ram = float(i.split()[1]) / (2**20) break # getting disk size in gb pbs_conf = du.parse_pbs_config(hostname) pbs_home_info = du.run_cmd(hostname, cmd=['df', '-k', pbs_conf['PBS_HOME']]) if pbs_home_info['rc'] != 0: _msg = 'failed to get output of df -k command of host: ' self.logger.error(_msg + hostname) else: disk_info = pbs_home_info['out'] disk_size = disk_info[1].split() self.system_disk = float(disk_size[3]) / (2**20)
class CrayUtils(object):
    """
    Cray specific utility class
    """
    node_status = []
    node_summary = {}
    cmd_output = []
    du = None

    def __init__(self):
        self.du = DshUtils()
        (self.node_status, self.node_summary) = self.parse_apstat_rn()

    def call_apstat(self, options):
        """
        Build the apstat command and run it. Return the output of the
        command.

        :param options: options to pass to apstat command
        :type options: str
        :returns: the command output, or None on a non-Cray platform
        """
        hostname = socket.gethostname()
        platform = self.du.get_platform(hostname)
        # copy() so the simulator-only variables set below are passed to
        # the subprocess without mutating this process's own environment
        # (the original aliased os.environ directly)
        apstat_env = os.environ.copy()
        apstat_cmd = "apstat"
        if 'cray' not in platform:
            return None
        if 'craysim' in platform:
            lib_path = '$LD_LIBRARY_PATH:/opt/alps/tester/usr/lib/'
            apstat_env['LD_LIBRARY_PATH'] = lib_path
            apstat_env['ALPS_CONFIG_FILE'] = '/opt/alps/tester/alps.conf'
            apstat_env['apsched_sharedDir'] = '/opt/alps/tester/'
            apstat_cmd = "/opt/alps/tester/usr/bin/apstat -d ."
        cmd_run = self.du.run_cmd(hostname, [apstat_cmd, options],
                                  as_script=True, wait_on_script=True,
                                  env=apstat_env)
        return cmd_run

    def parse_apstat_rn(self):
        """
        Parse the apstat command output for node status and summary

        :returns: tuple of (node status, node summary)
        """
        status = []
        summary = {}
        count = 0
        options = '-rn'
        cmd_run = self.call_apstat(options)
        if cmd_run is None:
            return (status, summary)
        cmd_result = cmd_run['out']
        keys = cmd_result[0].split()
        # Add a key 'Mode' because 'State' is composed of two list items,
        # e.g: State = 'UP B', where Mode = 'B'
        k2 = ['Mode']
        keys = keys[0:3] + k2 + keys[3:]
        cmd_iter = iter(cmd_result)
        for line in cmd_iter:
            if count == 0:
                # skip the header line that supplied the keys above
                count = 1
                continue
            if "Compute node summary" in line:
                # the two lines after the banner are the summary header
                # and its data row
                summary_line = next(cmd_iter)
                summary_keys = summary_line.split()
                summary_data = next(cmd_iter).split()
                sum_index = 0
                for a in summary_keys:
                    summary[a] = summary_data[sum_index]
                    sum_index += 1
                break
            obj = {}
            line = line.split()
            for i, value in enumerate(line):
                obj[keys[i]] = value
                if keys[i] == 'State':
                    # State spans two tokens ('UP B'); the extra 'Mode'
                    # key inserted above absorbs the second token
                    obj[keys[i]] = value + " " + line[i + 1]
            # If there is no Apids in the apstat then use 'None' as the
            # value
            if "Apids" in obj:
                pass
            else:
                obj["Apids"] = None
            status.append(obj)
        return (status, summary)

    def count_node_summ(self, cnsumm='up'):
        """
        Return the value of any one of the following parameters as shown
        in the 'Compute Node Summary' section of 'apstat -rn' output:
        arch, config, up, resv, use, avail, down

        :param cnsumm: parameter which is being queried, defaults to 'up'
        :type cnsumm: str
        :returns: value of parameter being queried
        """
        return int(self.node_summary[cnsumm])

    def count_node_state(self, state='UP B'):
        """
        Return how many nodes have a certain 'State' value.

        :param state: parameter which is being queried, defaults to 'UP B'
        :type state: str
        :returns: count of how many nodes have the state
        """
        count = 0
        status = self.node_status
        for stat in status:
            if stat['State'] == state:
                count += 1
        return count

    def get_numthreads(self, nid):
        """
        Returns the number of hyperthread for the given node
        """
        options = '-N %d -n -f "nid,c/cu"' % int(nid)
        cmd_run = self.call_apstat(options)
        if cmd_run is None:
            return None
        cmd_result = cmd_run['out']
        cmd_iter = iter(cmd_result)
        numthreads = 0
        for line in cmd_iter:
            if "Compute node summary" in line:
                break
            elif "NID" in line:
                # header row
                continue
            else:
                key = line.split()
                numthreads = int(key[1])
        return numthreads

    def num_compute_vnodes(self, server):
        """
        Count the Cray compute nodes and return the value.
        """
        vnl = server.filter(MGR_OBJ_NODE,
                            {'resources_available.vntype': 'cray_compute'})
        return len(vnl["resources_available.vntype=cray_compute"])
class CrayUtils(object):
    """
    Cray specific utility class
    """
    # class-level defaults; __init__ rebinds the first two per instance
    node_status = []
    node_summary = {}
    cmd_output = []
    du = None

    def __init__(self):
        self.du = DshUtils()
        (self.node_status, self.node_summary) = self.parse_apstat_rn()

    def call_apstat(self, options):
        """
        Build the apstat command and run it. Return the output of the
        command.

        :param options: options to pass to apstat command
        :type options: str
        :returns: the command output, or None on a non-Cray platform
        """
        hostname = socket.gethostname()
        platform = self.du.get_platform(hostname)
        # NOTE(review): this aliases os.environ, so the craysim settings
        # below also mutate this process's environment -- confirm intended
        apstat_env = os.environ
        apstat_cmd = "apstat"
        if 'cray' not in platform:
            return None
        if 'craysim' in platform:
            # simulator needs its own ALPS libraries and config
            lib_path = '$LD_LIBRARY_PATH:/opt/alps/tester/usr/lib/'
            apstat_env['LD_LIBRARY_PATH'] = lib_path
            apstat_env['ALPS_CONFIG_FILE'] = '/opt/alps/tester/alps.conf'
            apstat_env['apsched_sharedDir'] = '/opt/alps/tester/'
            apstat_cmd = "/opt/alps/tester/usr/bin/apstat -d ."
        cmd_run = self.du.run_cmd(hostname, [apstat_cmd, options],
                                  as_script=True, wait_on_script=True,
                                  env=apstat_env)
        return cmd_run

    def parse_apstat_rn(self):
        """
        Parse the apstat command output for node status and summary

        :returns: tuple of (node status, node summary)
        """
        status = []
        summary = {}
        count = 0
        options = '-rn'
        cmd_run = self.call_apstat(options)
        if cmd_run is None:
            return (status, summary)
        cmd_result = cmd_run['out']
        keys = cmd_result[0].split()
        # Add a key 'Mode' because 'State' is composed of two list items,
        # e.g: State = 'UP B', where Mode = 'B'
        k2 = ['Mode']
        keys = keys[0:3] + k2 + keys[3:]
        cmd_iter = iter(cmd_result)
        for line in cmd_iter:
            if count == 0:
                # skip the header line that supplied the keys above
                count = 1
                continue
            if "Compute node summary" in line:
                # next two lines are the summary header and its data row
                summary_line = next(cmd_iter)
                summary_keys = summary_line.split()
                summary_data = next(cmd_iter).split()
                sum_index = 0
                for a in summary_keys:
                    summary[a] = summary_data[sum_index]
                    sum_index += 1
                break
            obj = {}
            line = line.split()
            for i, value in enumerate(line):
                obj[keys[i]] = value
                if keys[i] == 'State':
                    # State spans two tokens ('UP B'); the inserted
                    # 'Mode' key absorbs the second token
                    obj[keys[i]] = value + " " + line[i + 1]
            # If there is no Apids in the apstat then use 'None' as the
            # value
            if "Apids" in obj:
                pass
            else:
                obj["Apids"] = None
            status.append(obj)
        return (status, summary)

    def count_node_summ(self, cnsumm='up'):
        """
        Return the value of any one of the following parameters as shown
        in the 'Compute Node Summary' section of 'apstat -rn' output:
        arch, config, up, resv, use, avail, down

        :param cnsumm: parameter which is being queried, defaults to 'up'
        :type cnsumm: str
        :returns: value of parameter being queried
        """
        return int(self.node_summary[cnsumm])

    def count_node_state(self, state='UP B'):
        """
        Return how many nodes have a certain 'State' value.

        :param state: parameter which is being queried, defaults to 'UP B'
        :type state: str
        :returns: count of how many nodes have the state
        """
        count = 0
        status = self.node_status
        for stat in status:
            if stat['State'] == state:
                count += 1
        return count

    def get_numthreads(self, nid):
        """
        Returns the number of hyperthread for the given node
        """
        options = '-N %d -n -f "nid,c/cu"' % int(nid)
        cmd_run = self.call_apstat(options)
        if cmd_run is None:
            return None
        cmd_result = cmd_run['out']
        cmd_iter = iter(cmd_result)
        numthreads = 0
        for line in cmd_iter:
            if "Compute node summary" in line:
                break
            elif "NID" in line:
                # header row
                continue
            else:
                key = line.split()
                numthreads = int(key[1])
        return numthreads
class PTLTestData(Plugin):
    """
    Save post analysis data on test cases failure or error
    """
    name = 'PTLTestData'
    score = sys.maxsize - 6
    logger = logging.getLogger(__name__)

    def __init__(self):
        Plugin.__init__(self)
        self.post_data_dir = None
        self.max_postdata_threshold = None
        self.__save_data_count = 0
        self.__priv_sn = ''
        self.du = DshUtils()

    def options(self, parser, env):
        """
        Register command line options
        """
        pass

    def set_data(self, post_data_dir, max_postdata_threshold):
        """Record the destination directory and the per-suite save cap."""
        self.post_data_dir = post_data_dir
        self.max_postdata_threshold = max_postdata_threshold

    def configure(self, options, config):
        """
        Configure the plugin and system, based on selected options
        """
        self.config = config
        if self.post_data_dir is not None:
            self.enabled = True
        else:
            self.enabled = False

    def __save_home(self, test, status, err=None):
        """
        Capture a pbs_snapshot of the test's server(s) into the post
        analysis data directory for a failed/errored test case.
        """
        if hasattr(test, 'test'):
            _test = test.test
            sn = _test.__class__.__name__
        elif hasattr(test, 'context'):
            _test = test.context
            sn = _test.__name__
        else:
            # test does not have any PBS Objects, so just return
            return
        if self.__priv_sn != sn:
            # new test suite: reset the per-suite save counter
            self.__save_data_count = 0
            self.__priv_sn = sn
        # Saving home might take time so disable timeout
        # handler set by runner
        tn = getattr(_test, '_testMethodName', 'unknown')
        testlogs = getattr(test, 'captured_logs', '')
        datadir = os.path.join(self.post_data_dir, sn, tn)
        if os.path.exists(datadir):
            _msg = 'Old post analysis data exists at %s' % datadir
            _msg += ', skipping saving data for this test case'
            self.logger.warn(_msg)
            _msg = 'Please remove old directory or'
            _msg += ' provide different directory'
            self.logger.warn(_msg)
            return
        if getattr(test, 'old_sigalrm_handler', None) is not None:
            # restore the runner's SIGALRM handler and cancel the alarm
            _h = getattr(test, 'old_sigalrm_handler')
            signal.signal(signal.SIGALRM, _h)
            signal.alarm(0)
        self.logger.log(logging.DEBUG2, 'Saving post analysis data...')
        current_host = socket.gethostname().split('.')[0]
        self.du.mkdir(current_host, path=datadir, mode=0o755,
                      parents=True, logerr=False, level=logging.DEBUG2)
        if err is not None:
            if isclass(err[0]) and issubclass(err[0], SkipTest):
                status = 'SKIP'
                status_data = 'Reason = %s' % (err[1])
            else:
                if isclass(err[0]) and issubclass(err[0], TimeOut):
                    status = 'TIMEDOUT'
                status_data = getattr(test, 'err_in_string', '')
        else:
            status_data = ''
        logfile = os.path.join(datadir, 'logfile_' + status)
        f = open(logfile, 'w+')
        f.write(testlogs + '\n')
        f.write(status_data + '\n')
        f.write('test duration: %s\n' %
                str(getattr(test, 'duration', '0')))
        if status in ('PASS', 'SKIP'):
            # Test case passed or skipped, no need to save post analysis
            # data
            f.close()
            return
        if ((self.max_postdata_threshold != 0) and
                (self.__save_data_count >= self.max_postdata_threshold)):
            _msg = 'Total number of saved post analysis data for this'
            _msg += ' testsuite is exceeded max postdata threshold'
            _msg += ' (%d)' % self.max_postdata_threshold
            f.write(_msg + '\n')
            self.logger.error(_msg)
            f.close()
            return
        servers = getattr(_test, 'servers', None)
        if servers is not None:
            # dict views are not subscriptable on Python 3; materialize
            # once and reuse (the original indexed servers.values()
            # directly, a TypeError under Python 3)
            server_list = list(servers.values())
            server_host = server_list[0].shortname
        else:
            _msg = 'Could not find Server Object in given test object'
            _msg += ', skipping saving post analysis data'
            f.write(_msg + '\n')
            self.logger.warning(_msg)
            f.close()
            return
        moms = getattr(_test, 'moms', None)
        comms = getattr(_test, 'comms', None)
        client = getattr(server_list[0], 'client', None)
        server = server_list[0]
        add_hosts = []
        if len(servers) > 1:
            for param in server_list[1:]:
                add_hosts.append(param.shortname)
        if moms is not None:
            for param in moms.values():
                add_hosts.append(param.shortname)
        if comms is not None:
            for param in comms.values():
                add_hosts.append(param.shortname)
        if client is not None:
            add_hosts.append(client.split('.')[0])
        # de-duplicate and drop the primary server host itself
        add_hosts = list(set(add_hosts) - set([server_host]))
        pbs_snapshot_path = os.path.join(server.pbs_conf["PBS_EXEC"],
                                         "sbin", "pbs_snapshot")
        cur_user = self.du.get_current_user()
        cur_user_dir = pwd.getpwnam(cur_user).pw_dir
        cmd = [
            pbs_snapshot_path,
            '-H', server_host,
            '--daemon-logs', '2',
            '--accounting-logs', '2',
            '--with-sudo'
        ]
        if len(add_hosts) > 0:
            cmd += ['--additional-hosts=' + ','.join(add_hosts)]
        cmd += ['-o', cur_user_dir]
        ret = self.du.run_cmd(current_host, cmd, level=logging.DEBUG2,
                              logerr=False)
        if ret['rc'] != 0:
            _msg = 'Failed to get analysis information '
            _msg += 'on %s:' % server_host
            _msg += '\n\n' + '\n'.join(ret['err']) + '\n\n'
            f.write(_msg + '\n')
            self.logger.error(_msg)
            f.close()
            return
        else:
            if len(ret['out']) == 0:
                self.logger.error('Snapshot command failed')
                f.close()
                return
            # first output line is '<label>: <path to snapshot tarball>'
            snap_out = ret['out'][0]
            snap_out_dest = (snap_out.split(":")[1]).strip()
            dest = os.path.join(datadir,
                                'PBS_' + server_host + '.tar.gz')
            ret = self.du.run_copy(current_host, snap_out_dest, dest,
                                   sudo=True, level=logging.DEBUG2)
            self.du.rm(current_host, path=snap_out_dest, recursive=True,
                       force=True, level=logging.DEBUG2)
        f.close()
        self.__save_data_count += 1
        _msg = 'Saved post analysis data'
        self.logger.info(_msg)

    def addError(self, test, err):
        self.__save_home(test, 'ERROR', err)

    def addFailure(self, test, err):
        self.__save_home(test, 'FAIL', err)

    def addSuccess(self, test):
        self.__save_home(test, 'PASS')
class PTLTestData(Plugin):
    """
    Save post analysis data on test cases failure or error
    """
    name = 'PTLTestData'
    # NOTE(review): sys.maxint exists only on Python 2
    score = sys.maxint - 3
    logger = logging.getLogger(__name__)

    def __init__(self):
        # NOTE(review): unlike the other PTLTestData variants, this does
        # not call Plugin.__init__(self) -- confirm intended
        self.sharedpath = None
        self.du = DshUtils()
        self.__syncth = None
        self.__queue = Queue.Queue()

    def options(self, parser, env):
        """
        Register command line options
        """
        pass

    def set_data(self, sharedpath):
        """Record the shared destination path used by the sync thread."""
        self.sharedpath = sharedpath

    def configure(self, options, config):
        """
        Configure the plugin and system, based on selected options
        """
        self.config = config
        self.enabled = True

    def __get_sntnbi_name(self, test):
        """
        Return (suite name, test name, build info) for the given test,
        with 'unknown' placeholders where unavailable.
        """
        if hasattr(test, 'test'):
            _test = test.test
            sn = _test.__class__.__name__
        elif hasattr(test, 'context'):
            _test = test.context
            sn = _test.__name__
        else:
            return ('unknown', 'unknown', 'unknown')
        tn = getattr(_test, '_testMethodName', 'unknown')
        if (hasattr(_test, 'server') and
                (getattr(_test, 'server', None) is not None)):
            # build info is taken from the server's reported version
            bi = _test.server.attributes['pbs_version']
        else:
            bi = 'unknown'
        return (sn, tn, bi)

    def __save_home(self, test, status):
        """
        Collect per-host diagnostic data (qstat, pbsnodes, PBS_HOME copy,
        system stats, logs) for every host involved in the test, and
        queue it for the sync thread if a shared path is configured.
        """
        if hasattr(test, 'test'):
            _test = test.test
        elif hasattr(test, 'context'):
            _test = test.context
        else:
            # test does not have any PBS Objects, so just return
            return
        if not hasattr(_test, 'server'):
            # test does not have any PBS Objects, so just return
            return
        st = getattr(test, 'start_time', None)
        if st is not None:
            st = time.mktime(st.timetuple())
        else:
            st = time.time()
        st -= 180  # starttime - 3 min
        et = getattr(test, 'end_time', None)
        if et is not None:
            et = time.mktime(et.timetuple())
        else:
            et = time.time()
        hostname = socket.gethostname().split('.')[0]
        # log-prefix: the job id when running under PBS, else a timestamp
        lp = os.environ.get('PBS_JOBID', time.strftime("%Y%b%d_%H_%m_%S",
                                                       time.localtime()))
        sn, tn, bi = self.__get_sntnbi_name(test)
        if getattr(_test, 'servers', None) is not None:
            shosts = map(lambda x: x.split('.')[0],
                         _test.servers.host_keys())
        else:
            shosts = []
        if getattr(_test, 'schedulers', None) is not None:
            schosts = map(lambda x: x.split('.')[0],
                          _test.schedulers.host_keys())
        else:
            schosts = []
        if getattr(_test, 'moms', None) is not None:
            mhosts = map(lambda x: x.split('.')[0],
                         _test.moms.host_keys())
        else:
            mhosts = []
        # NOTE(review): map() returns a lazy iterator on Python 3; the
        # extend() calls below rely on Python 2 list semantics
        hosts = []
        hosts.extend(shosts)
        hosts.extend(schosts)
        hosts.extend(mhosts)
        hosts.append(hostname)
        hosts = sorted(set(hosts))
        for host in hosts:
            confpath = self.du.get_pbs_conf_file(host)
            tmpdir = self.du.get_tempdir(host)
            datadir = os.path.join(tmpdir, bi, sn, hostname, tn, lp)
            # Build a shell script that gathers everything for this host.
            _s = ['#!/bin/bash']
            _s += ['. %s' % (confpath)]
            _s += ['mkdir -p %s' % (datadir)]
            _s += ['chmod -R 0755 %s' % (datadir)]
            if host == _test.server.shortname:
                # server-wide state only needs capturing once, on the
                # primary server host (backgrounded; reaped by 'wait')
                _l = '${PBS_EXEC}/bin/qstat -tf > %s/qstat_tf &' % (datadir)
                _s += [_l]
                _l = '${PBS_EXEC}/bin/pbsnodes -av > %s/pbsnodes &' % (
                    datadir)
                _s += [_l]
                _l = '${PBS_EXEC}/bin/qmgr -c "p s"'
                _l += ' > %s/print_server &' % (datadir)
                _s += [_l]
            _s += ['echo "%s" >> %s/uptime' % ('*' * 80, datadir)]
            _s += ['echo "On host : %s" >> %s/uptime' % (host, datadir)]
            _s += ['uptime >> %s/uptime' % (datadir)]
            _s += ['echo "" >> %s/uptime' % (datadir)]
            _s += ['echo "%s" >> %s/netstat' % ('*' * 80, datadir)]
            _s += ['echo "On host : %s" >> %s/netstat' % (host, datadir)]
            # fall back to 'ss' when netstat is not installed on the host
            _cmd = self.du.which(host, 'netstat')
            if _cmd == 'netstat':
                _cmd = 'ss'
            if sys.platform.startswith('linux'):
                _cmd += ' -ap'
            else:
                _cmd += ' -an'
            _s += ['%s >> %s/netstat' % (_cmd, datadir)]
            _s += ['echo "" >> %s/netstat' % (datadir)]
            _s += ['echo "%s" >> %s/ps' % ('*' * 80, datadir)]
            _s += ['echo "On host : %s" >> %s/ps' % (host, datadir)]
            _s += ['ps -ef | grep pbs_ >> %s/ps' % (datadir)]
            _s += ['echo "" >> %s/ps' % (datadir)]
            _s += ['echo "%s" >> %s/df' % ('*' * 80, datadir)]
            _s += ['echo "On host : %s" >> %s/df' % (host, datadir)]
            _s += ['df -h >> %s/df' % (datadir)]
            _s += ['echo "" >> %s/df' % (datadir)]
            _s += ['echo "%s" >> %s/vmstat' % ('*' * 80, datadir)]
            _s += ['echo "On host : %s" >> %s/vmstat' % (host, datadir)]
            _s += ['vmstat >> %s/vmstat' % (datadir)]
            _s += ['echo "" >> %s/vmstat' % (datadir)]
            _dst = os.path.join(datadir, 'PBS_' + host)
            # copy PBS_HOME, then strip the bulky/replaceable parts
            _s += ['cp -rp ${PBS_HOME} %s' % (_dst)]
            _s += ['tar -cf %s/datastore.tar %s/datastore' % (_dst, _dst)]
            _s += ['gzip -rf %s/datastore.tar' % (_dst)]
            _s += ['rm -rf %s/datastore' % (_dst)]
            _s += ['rm -rf %s/*_logs' % (_dst)]
            _s += ['rm -rf %s/server_priv/accounting' % (_dst)]
            _s += ['cp %s %s/pbs.conf.%s' % (confpath, _dst, host)]
            if host == hostname:
                # record the captured failure text on the local host
                _s += ['cat > %s/logfile_%s <<EOF' % (datadir, status)]
                _s += ['%s' % (getattr(test, 'err_in_string', ''))]
                _s += ['']
                _s += ['EOF']
            _s += ['wait']
            # NOTE(review): 0755 is a Python-2-only octal literal
            fd, fn = self.du.mkstemp(hostname, mode=0755,
                                     body='\n'.join(_s))
            os.close(fd)
            self.du.run_cmd(hostname, cmd=fn, sudo=True, logerr=False)
            self.du.rm(hostname, fn, force=True, sudo=True)
            svr = _test.servers[host]
            if svr is not None:
                self.__save_logs(svr, _dst, 'server_logs', st, et)
                _adst = os.path.join(_dst, 'server_priv')
                self.__save_logs(svr, _adst, 'accounting', st, et)
            if getattr(_test, 'moms', None) is not None:
                self.__save_logs(_test.moms[host], _dst, 'mom_logs',
                                 st, et)
            if getattr(_test, 'schedulers', None) is not None:
                self.__save_logs(_test.schedulers[host], _dst,
                                 'sched_logs', st, et)
            if ((self.sharedpath is not None) and
                    (self.__syncth is not None)):
                self.__queue.put((host, datadir, bi, sn, hostname, tn, lp))

    def __save_logs(self, obj, dst, name, st, et, jid=None):
        """
        Write the daemon/accounting/tracejob logs of the given PBS
        service object to the file <dst>/<name>.

        :param obj: PBS service object providing log_lines()
        :param dst: destination directory
        :param name: 'accounting', 'tracejob' or a daemon log tag
        :param st: start time of the window to capture
        :param et: end time of the window to capture
        :param jid: job id, required only for 'tracejob'
        """
        if name == 'accounting':
            logs = obj.log_lines('accounting', n='ALL', starttime=st,
                                 endtime=et)
            # accounting lines come without trailing newlines
            logs = map(lambda x: x + '\n', logs)
        elif name == 'tracejob':
            logs = obj.log_lines('tracejob', id=jid, n='ALL')
            name += '_' + jid
        else:
            # NOTE(review): the first argument here is the object itself,
            # unlike the string tags above -- confirm log_lines() accepts
            # a service object as its log type
            logs = obj.log_lines(obj, n='ALL', starttime=st, endtime=et)
        f = open(os.path.join(dst, name), 'w+')
        f.writelines(logs)
        f.close()

    def begin(self):
        # start the background sync thread only when a shared path is set
        if self.sharedpath is not None:
            self.__syncth = SyncData(self.sharedpath, self.__queue)
            self.__syncth.daemon = True
            self.__syncth.start()

    def addError(self, test, err):
        self.__save_home(test, 'ERROR')

    def addFailure(self, test, err):
        self.__save_home(test, 'FAIL')

    def finalize(self, result):
        if ((self.sharedpath is not None) and
                (self.__syncth is not None)):
            # NOTE(review): busy-wait burns CPU until the queue drains;
            # a short sleep inside the loop would be kinder
            while not self.__queue.empty():
                pass
            self.__syncth.stop()
            self.__syncth.join()
class CrayUtils(object):
    """
    Cray specific utility class
    """
    # class-level defaults; __init__ rebinds them per instance
    node_status = []
    node_summary = {}
    du = None

    def __init__(self):
        self.du = DshUtils()
        (self.node_status, self.node_summary) = self.parse_apstat_rn()

    def parse_apstat_rn(self):
        """
        Run apstat command on cray/craysim and parse its output

        :returns: tuple of (node status, node summary); both empty on a
                  non-Cray platform
        """
        status = []
        summary = {}
        count = 0
        options = '-rn'
        hostname = socket.gethostname()
        platform = self.du.get_platform(hostname)
        # NOTE(review): this aliases os.environ, so the craysim settings
        # below also mutate this process's environment -- confirm intended
        apstat_env = os.environ
        apstat_cmd = "apstat"
        if 'cray' not in platform:
            return (status, summary)
        if 'craysim' in platform:
            # simulator needs its own ALPS libraries and config
            lib_path = '$LD_LIBRARY_PATH:/opt/alps/tester/usr/lib/'
            apstat_env['LD_LIBRARY_PATH'] = lib_path
            apstat_env['ALPS_CONFIG_FILE'] = '/opt/alps/tester/alps.conf'
            apstat_env['apsched_sharedDir'] = '/opt/alps/tester/'
            apstat_cmd = "/opt/alps/tester/usr/bin/apstat -d ."
        cmd_run = self.du.run_cmd(hostname, [apstat_cmd, options],
                                  as_script=True, wait_on_script=True,
                                  env=apstat_env)
        cmd_result = cmd_run['out']
        keys = cmd_result[0].split()
        # Add a key 'Mode' because 'State' is composed of two list items,
        # e.g: State = 'UP B', where Mode = 'B'
        k2 = ['Mode']
        keys = keys[0:3] + k2 + keys[3:]
        cmd_iter = iter(cmd_result)
        for line in cmd_iter:
            if count == 0:
                # skip the header line that supplied the keys above
                count = 1
                continue
            if "Compute node summary" in line:
                # next two lines are the summary header and its data row
                summary_line = next(cmd_iter)
                summary_keys = summary_line.split()
                summary_data = next(cmd_iter).split()
                sum_index = 0
                for a in summary_keys:
                    summary[a] = summary_data[sum_index]
                    sum_index += 1
                break
            obj = {}
            line = line.split()
            for i, value in enumerate(line):
                obj[keys[i]] = value
                if keys[i] == 'State':
                    # State spans two tokens ('UP B'); the inserted
                    # 'Mode' key absorbs the second token
                    obj[keys[i]] = value + " " + line[i + 1]
            # If there is no Apids in the apstat then use 'None' as the
            # value
            if "Apids" in obj:
                pass
            else:
                obj["Apids"] = None
            status.append(obj)
        return (status, summary)

    def count_node_summ(self, cnsumm='up'):
        """
        Return the value of any one of the following parameters as shown
        in the 'Compute Node Summary' section of 'apstat -rn' output:
        arch, config, up, resv, use, avail, down

        :param cnsumm: parameter which is being queried, defaults to 'up'
        :type cnsumm: str
        :returns: value of parameter being queried
        """
        return int(self.node_summary[cnsumm])

    def count_node_state(self, state='UP B'):
        """
        Return how many nodes have a certain 'State' value.

        :param state: parameter which is being queried, defaults to 'UP B'
        :type state: str
        :returns: count of how many nodes have the state
        """
        count = 0
        status = self.node_status
        for stat in status:
            if stat['State'] == state:
                count += 1
        return count
class PTLTestData(Plugin):
    """
    Save post analysis data on test cases failure or error
    """
    name = 'PTLTestData'
    # NOTE(review): sys.maxint exists only on Python 2
    score = sys.maxint - 6
    logger = logging.getLogger(__name__)

    def __init__(self):
        Plugin.__init__(self)
        self.post_data_dir = None
        self.max_postdata_threshold = None
        self.__save_data_count = 0
        self.__priv_sn = ''
        self.du = DshUtils()

    def options(self, parser, env):
        """
        Register command line options
        """
        pass

    def set_data(self, post_data_dir, max_postdata_threshold):
        """Record the destination directory and the per-suite save cap."""
        self.post_data_dir = post_data_dir
        self.max_postdata_threshold = max_postdata_threshold

    def configure(self, options, config):
        """
        Configure the plugin and system, based on selected options
        """
        self.config = config
        if self.post_data_dir is not None:
            self.enabled = True
        else:
            self.enabled = False

    def __save_home(self, test, status, err=None):
        """
        Run pbs_diag on the test's server host and archive its output
        (plus any core-file analysis) under the post analysis data dir.
        """
        if hasattr(test, 'test'):
            _test = test.test
            sn = _test.__class__.__name__
        elif hasattr(test, 'context'):
            _test = test.context
            sn = _test.__name__
        else:
            # test does not have any PBS Objects, so just return
            return
        if self.__priv_sn != sn:
            # new test suite: reset the per-suite save counter
            self.__save_data_count = 0
            self.__priv_sn = sn
        # Saving home might take time so disable timeout
        # handler set by runner
        tn = getattr(_test, '_testMethodName', 'unknown')
        testlogs = getattr(test, 'captured_logs', '')
        datadir = os.path.join(self.post_data_dir, sn, tn)
        if os.path.exists(datadir):
            _msg = 'Old post analysis data exists at %s' % datadir
            _msg += ', skipping saving data for this test case'
            self.logger.warn(_msg)
            _msg = 'Please remove old directory or'
            _msg += ' provide different directory'
            self.logger.warn(_msg)
            return
        if getattr(test, 'old_sigalrm_handler', None) is not None:
            # restore the runner's SIGALRM handler and cancel the alarm
            _h = getattr(test, 'old_sigalrm_handler')
            signal.signal(signal.SIGALRM, _h)
            signal.alarm(0)
        self.logger.log(logging.DEBUG2, 'Saving post analysis data...')
        current_host = socket.gethostname().split('.')[0]
        # NOTE(review): 0755 is a Python-2-only octal literal
        self.du.mkdir(current_host, path=datadir, mode=0755,
                      parents=True, logerr=False, level=logging.DEBUG2)
        if err is not None:
            if isclass(err[0]) and issubclass(err[0], SkipTest):
                status = 'SKIP'
                status_data = 'Reason = %s' % (err[1])
            else:
                if isclass(err[0]) and issubclass(err[0], TimeOut):
                    status = 'TIMEDOUT'
                status_data = getattr(test, 'err_in_string', '')
        else:
            status_data = ''
        logfile = os.path.join(datadir, 'logfile_' + status)
        f = open(logfile, 'w+')
        f.write(testlogs + '\n')
        f.write(status_data + '\n')
        f.write('test duration: %s\n' %
                str(getattr(test, 'duration', '0')))
        if status in ('PASS', 'SKIP'):
            # Test case passed or skipped, no need to save post analysis
            # data
            f.close()
            return
        if ((self.max_postdata_threshold != 0) and
                (self.__save_data_count >= self.max_postdata_threshold)):
            _msg = 'Total number of saved post analysis data for this'
            _msg += ' testsuite is exceeded max postdata threshold'
            _msg += ' (%d)' % self.max_postdata_threshold
            f.write(_msg + '\n')
            self.logger.error(_msg)
            f.close()
            return
        svr = getattr(_test, 'server', None)
        if svr is not None:
            svr_host = svr.hostname
        else:
            _msg = 'Could not find Server Object in given test object'
            _msg += ', skipping saving post analysis data'
            f.write(_msg + '\n')
            self.logger.warning(_msg)
            f.close()
            return
        pbs_diag = os.path.join(svr.pbs_conf['PBS_EXEC'],
                                'unsupported', 'pbs_diag')
        cur_user = self.du.get_current_user()
        cmd = [pbs_diag, '-f', '-d', '2']
        cmd += ['-u', cur_user]
        cmd += ['-o', pwd.getpwnam(cur_user).pw_dir]
        if len(svr.jobs) > 0:
            cmd += ['-j', ','.join(svr.jobs.keys())]
        ret = self.du.run_cmd(svr_host, cmd, sudo=True,
                              level=logging.DEBUG2)
        if ret['rc'] != 0:
            _msg = 'Failed to get diag information for '
            _msg += 'on %s:' % svr_host
            _msg += '\n\n' + '\n'.join(ret['err']) + '\n\n'
            f.write(_msg + '\n')
            self.logger.error(_msg)
            f.close()
            return
        else:
            # pbs_diag prints the path of the tarball it generated;
            # extract it from the output
            diag_re = r"(?P<path>\/.*\/pbs_diag_[\d]+_[\d]+\.tar\.gz).*"
            m = re.search(diag_re, '\n'.join(ret['out']))
            if m is not None:
                diag_out = m.group('path')
            else:
                _msg = 'Failed to find generated diag path in below'
                _msg += ' output:'
                _msg += '\n\n' + '-' * 80 + '\n'
                _msg += '\n'.join(ret['out']) + '\n'
                _msg += '-' * 80 + '\n\n'
                f.write(_msg)
                self.logger.error(_msg)
                f.close()
                return
        diag_out_dest = os.path.join(datadir, os.path.basename(diag_out))
        if not self.du.is_localhost(svr_host):
            # remote source needs the host: prefix for the copy
            diag_out_r = svr_host + ':' + diag_out
        else:
            diag_out_r = diag_out
        ret = self.du.run_copy(current_host, diag_out_r, diag_out_dest,
                               sudo=True, level=logging.DEBUG2)
        if ret['rc'] != 0:
            _msg = 'Failed to copy generated diag from'
            _msg += ' %s to %s' % (diag_out_r, diag_out_dest)
            f.write(_msg + '\n')
            self.logger.error(_msg)
            f.close()
            return
        else:
            # copied successfully; remove the original on the server host
            self.du.rm(svr_host, path=diag_out, sudo=True, force=True,
                       level=logging.DEBUG2)
        # look for core files left behind by the daemons
        cores = []
        dir_list = ['server_priv', 'sched_priv', 'mom_priv']
        for d in dir_list:
            path = os.path.join(svr.pbs_conf['PBS_HOME'], d)
            files = self.du.listdir(hostname=svr_host, path=path,
                                    sudo=True, level=logging.DEBUG2)
            for _f in files:
                if os.path.basename(_f).startswith('core'):
                    cores.append(_f)
        cores = list(set(cores))
        if len(cores) > 0:
            # unpack the diag so per-core analysis can be added to it
            cmd = ['gunzip', diag_out_dest]
            ret = self.du.run_cmd(current_host, cmd, sudo=True,
                                  level=logging.DEBUG2)
            if ret['rc'] != 0:
                _msg = 'Failed unzip generated diag at %s:' % diag_out_dest
                _msg += '\n\n' + '\n'.join(ret['err']) + '\n\n'
                f.write(_msg + '\n')
                self.logger.error(_msg)
                f.close()
                return
            # NOTE(review): rstrip() strips a *character set*, not a
            # suffix -- this only removes exactly '.gz' here because the
            # preceding character ('r' of '.tar') is not in the set.
            # str-slicing (or removesuffix on py3.9+) would be safer.
            diag_out_dest = diag_out_dest.rstrip('.gz')
            cmd = ['tar', '-xf', diag_out_dest, '-C', datadir]
            ret = self.du.run_cmd(current_host, cmd, sudo=True,
                                  level=logging.DEBUG2)
            if ret['rc'] != 0:
                _msg = 'Failed extract generated diag %s' % diag_out_dest
                _msg += ' to %s:' % datadir
                _msg += '\n\n' + '\n'.join(ret['err']) + '\n\n'
                f.write(_msg + '\n')
                self.logger.error(_msg)
                f.close()
                return
            self.du.rm(hostname=current_host, path=diag_out_dest,
                       force=True, sudo=True, level=logging.DEBUG2)
            # NOTE(review): same char-set caveat as above for '.tar'
            diag_out_dest = diag_out_dest.rstrip('.tar')
            for c in cores:
                # pbs_diag -g analyses a single core file
                cmd = [pbs_diag, '-g', c]
                ret = self.du.run_cmd(svr_host, cmd, sudo=True,
                                      level=logging.DEBUG2)
                if ret['rc'] != 0:
                    _msg = 'Failed to get core file information for '
                    _msg += '%s on %s:' % (c, svr_host)
                    _msg += '\n\n' + '\n'.join(ret['err']) + '\n\n'
                    f.write(_msg + '\n')
                    self.logger.error(_msg)
                else:
                    of = os.path.join(diag_out_dest,
                                      os.path.basename(c) + '.out')
                    _f = open(of, 'w+')
                    _f.write('\n'.join(ret['out']) + '\n')
                    _f.close()
                    self.du.rm(hostname=svr_host, path=c, force=True,
                               sudo=True, level=logging.DEBUG2)
            # re-pack the diag directory with the core analysis included
            cmd = ['tar', '-cf', diag_out_dest + '.tar']
            cmd += [os.path.basename(diag_out_dest)]
            ret = self.du.run_cmd(current_host, cmd, sudo=True,
                                  cwd=datadir, level=logging.DEBUG2)
            if ret['rc'] != 0:
                _msg = 'Failed generate tarball of diag directory'
                _msg += ' %s' % diag_out_dest
                _msg += ' after adding core(s) information in it:'
                _msg += '\n\n' + '\n'.join(ret['err']) + '\n\n'
                f.write(_msg + '\n')
                self.logger.error(_msg)
                f.close()
                return
            cmd = ['gzip', diag_out_dest + '.tar']
            ret = self.du.run_cmd(current_host, cmd, sudo=True,
                                  level=logging.DEBUG2)
            if ret['rc'] != 0:
                _msg = 'Failed compress tarball of diag %s' % diag_out_dest
                _msg += '.tar after adding core(s) information in it:'
                _msg += '\n\n' + '\n'.join(ret['err']) + '\n\n'
                f.write(_msg + '\n')
                self.logger.error(_msg)
                f.close()
                return
            self.du.rm(current_host, diag_out_dest, sudo=True,
                       recursive=True, force=True, level=logging.DEBUG2)
        else:
            # NOTE(review): same char-set caveat for '.tar.gz'
            diag_out_dest = diag_out_dest.rstrip('.tar.gz')
        dest = os.path.join(datadir,
                            'PBS_' + current_host.split('.')[0] +
                            '.tar.gz')
        ret = self.du.run_copy(current_host, diag_out_dest + '.tar.gz',
                               dest, sudo=True, level=logging.DEBUG2)
        if ret['rc'] != 0:
            _msg = 'Failed rename tarball of diag from %s' % diag_out_dest
            _msg += '.tar.gz to %s:' % dest
            _msg += '\n\n' + '\n'.join(ret['err']) + '\n\n'
            f.write(_msg + '\n')
            self.logger.error(_msg)
            f.close()
            return
        self.du.rm(current_host, path=diag_out_dest + '.tar.gz',
                   force=True, sudo=True, level=logging.DEBUG2)
        f.close()
        self.__save_data_count += 1
        _msg = 'Successfully saved post analysis data'
        self.logger.log(logging.DEBUG2, _msg)

    def addError(self, test, err):
        self.__save_home(test, 'ERROR', err)

    def addFailure(self, test, err):
        self.__save_home(test, 'FAIL', err)

    def addSuccess(self, test):
        self.__save_home(test, 'PASS')
class PTLTestData(Plugin):

    """
    Save post analysis data on test cases failure or error.

    On a FAIL/ERROR outcome this plugin runs ``pbs_diag`` on the server
    host, pulls the generated tarball back to ``post_data_dir``, and, if
    core files are found under the PBS private directories, re-packs the
    tarball with per-core ``pbs_diag -g`` reports added.
    """
    name = 'PTLTestData'
    # Run late relative to other plugins.  NOTE: was ``sys.maxint - 6``;
    # sys.maxint no longer exists in Python 3, sys.maxsize is the
    # portable equivalent (and what the companion plugin class uses).
    score = sys.maxsize - 6
    logger = logging.getLogger(__name__)

    def __init__(self):
        Plugin.__init__(self)
        self.post_data_dir = None
        self.max_postdata_threshold = None
        self.__save_data_count = 0
        self.__priv_sn = ''
        self.du = DshUtils()

    def options(self, parser, env):
        """
        Register command line options
        """
        pass

    def set_data(self, post_data_dir, max_postdata_threshold):
        """
        Set the destination directory for saved data and the maximum
        number of data sets to save per test suite (0 means unlimited).
        """
        self.post_data_dir = post_data_dir
        self.max_postdata_threshold = max_postdata_threshold

    def configure(self, options, config):
        """
        Configure the plugin and system, based on selected options
        """
        self.config = config
        # Enabled only when a destination directory was supplied
        if self.post_data_dir is not None:
            self.enabled = True
        else:
            self.enabled = False

    def __save_home(self, test, status, err=None):
        """
        Save post analysis data for one finished test.

        :param test: the nose test (or context) object
        :param status: outcome label ('PASS', 'FAIL', 'ERROR', ...)
        :param err: optional (exc_type, exc_value, tb) tuple from nose
        """
        if hasattr(test, 'test'):
            _test = test.test
            sn = _test.__class__.__name__
        elif hasattr(test, 'context'):
            _test = test.context
            sn = _test.__name__
        else:
            # test does not have any PBS Objects, so just return
            return
        # Reset the per-suite counter when a new test suite begins
        if self.__priv_sn != sn:
            self.__save_data_count = 0
            self.__priv_sn = sn
        # Saving home might take time so disable timeout
        # handler set by runner
        tn = getattr(_test, '_testMethodName', 'unknown')
        testlogs = getattr(test, 'captured_logs', '')
        datadir = os.path.join(self.post_data_dir, sn, tn)
        if os.path.exists(datadir):
            _msg = 'Old post analysis data exists at %s' % datadir
            _msg += ', skipping saving data for this test case'
            self.logger.warn(_msg)
            _msg = 'Please remove old directory or'
            _msg += ' provide different directory'
            self.logger.warn(_msg)
            return
        if getattr(test, 'old_sigalrm_handler', None) is not None:
            _h = getattr(test, 'old_sigalrm_handler')
            signal.signal(signal.SIGALRM, _h)
            signal.alarm(0)
        self.logger.log(logging.DEBUG2, 'Saving post analysis data...')
        current_host = socket.gethostname().split('.')[0]
        # NOTE: was mode=0755 (Python-2-only octal literal); 0o755 is the
        # equivalent accepted by both Python 2.6+ and Python 3
        self.du.mkdir(current_host, path=datadir, mode=0o755,
                      parents=True, logerr=False, level=logging.DEBUG2)
        if err is not None:
            if isclass(err[0]) and issubclass(err[0], SkipTest):
                status = 'SKIP'
                status_data = 'Reason = %s' % (err[1])
            else:
                if isclass(err[0]) and issubclass(err[0], TimeOut):
                    status = 'TIMEDOUT'
                status_data = getattr(test, 'err_in_string', '')
        else:
            status_data = ''
        logfile = os.path.join(datadir, 'logfile_' + status)
        f = open(logfile, 'w+')
        f.write(testlogs + '\n')
        f.write(status_data + '\n')
        f.write('test duration: %s\n' %
                str(getattr(test, 'duration', '0')))
        if status in ('PASS', 'SKIP'):
            # Test case passed or skipped, no need to save post
            # analysis data
            f.close()
            return
        if ((self.max_postdata_threshold != 0) and
                (self.__save_data_count >= self.max_postdata_threshold)):
            _msg = 'Total number of saved post analysis data for this'
            _msg += ' testsuite is exceeded max postdata threshold'
            _msg += ' (%d)' % self.max_postdata_threshold
            f.write(_msg + '\n')
            self.logger.error(_msg)
            f.close()
            return
        svr = getattr(_test, 'server', None)
        if svr is not None:
            svr_host = svr.hostname
        else:
            _msg = 'Could not find Server Object in given test object'
            _msg += ', skipping saving post analysis data'
            f.write(_msg + '\n')
            self.logger.warning(_msg)
            f.close()
            return
        pbs_diag = os.path.join(svr.pbs_conf['PBS_EXEC'],
                                'unsupported', 'pbs_diag')
        cmd = [pbs_diag, '-f', '-d', '2']
        cmd += ['-u', self.du.get_current_user()]
        if len(svr.jobs) > 0:
            cmd += ['-j', ','.join(svr.jobs.keys())]
        ret = self.du.run_cmd(svr_host, cmd, sudo=True,
                              level=logging.DEBUG2)
        if ret['rc'] != 0:
            # was: 'Failed to get diag information for ' + 'on %s'
            _msg = 'Failed to get diag information'
            _msg += ' on %s:' % svr_host
            _msg += '\n\n' + '\n'.join(ret['err']) + '\n\n'
            f.write(_msg + '\n')
            self.logger.error(_msg)
            f.close()
            return
        else:
            # pbs_diag prints the generated tarball path in its output
            diag_re = r"(?P<path>\/.*\/pbs_diag_[\d]+_[\d]+\.tar\.gz).*"
            m = re.search(diag_re, '\n'.join(ret['out']))
            if m is not None:
                diag_out = m.group('path')
            else:
                _msg = 'Failed to find generated diag path in below output:'
                _msg += '\n\n' + '-' * 80 + '\n'
                _msg += '\n'.join(ret['out']) + '\n'
                _msg += '-' * 80 + '\n\n'
                f.write(_msg)
                self.logger.error(_msg)
                f.close()
                return
        diag_out_dest = os.path.join(datadir, os.path.basename(diag_out))
        if not self.du.is_localhost(svr_host):
            diag_out_r = svr_host + ':' + diag_out
        else:
            diag_out_r = diag_out
        ret = self.du.run_copy(current_host, diag_out_r, diag_out_dest,
                               sudo=True, level=logging.DEBUG2)
        if ret['rc'] != 0:
            _msg = 'Failed to copy generated diag from'
            _msg += ' %s to %s' % (diag_out_r, diag_out_dest)
            f.write(_msg + '\n')
            self.logger.error(_msg)
            f.close()
            return
        else:
            self.du.rm(svr_host, path=diag_out, sudo=True, force=True,
                       level=logging.DEBUG2)
        # Collect any core files left under the PBS private directories
        cores = []
        dir_list = ['server_priv', 'sched_priv', 'mom_priv']
        for d in dir_list:
            path = os.path.join(svr.pbs_conf['PBS_HOME'], d)
            files = self.du.listdir(hostname=svr_host, path=path,
                                    sudo=True, level=logging.DEBUG2)
            for _f in files:
                if os.path.basename(_f).startswith('core'):
                    cores.append(_f)
        cores = list(set(cores))
        if len(cores) > 0:
            # Unpack the tarball, add a pbs_diag -g report per core,
            # then re-pack it
            cmd = ['gunzip', diag_out_dest]
            ret = self.du.run_cmd(current_host, cmd, sudo=True,
                                  level=logging.DEBUG2)
            if ret['rc'] != 0:
                _msg = 'Failed unzip generated diag at %s:' % diag_out_dest
                _msg += '\n\n' + '\n'.join(ret['err']) + '\n\n'
                f.write(_msg + '\n')
                self.logger.error(_msg)
                f.close()
                return
            # Drop the '.gz' suffix.  NOTE: was rstrip('.gz'), which
            # strips *characters* from the set {., g, z}, not the suffix,
            # and can eat trailing characters of the base name
            diag_out_dest = diag_out_dest[:-len('.gz')]
            cmd = ['tar', '-xf', diag_out_dest, '-C', datadir]
            ret = self.du.run_cmd(current_host, cmd, sudo=True,
                                  level=logging.DEBUG2)
            if ret['rc'] != 0:
                _msg = 'Failed extract generated diag %s' % diag_out_dest
                _msg += ' to %s:' % datadir
                _msg += '\n\n' + '\n'.join(ret['err']) + '\n\n'
                f.write(_msg + '\n')
                self.logger.error(_msg)
                f.close()
                return
            self.du.rm(hostname=current_host, path=diag_out_dest,
                       force=True, sudo=True, level=logging.DEBUG2)
            # Drop the '.tar' suffix (was rstrip('.tar'), same char-set
            # pitfall as above)
            diag_out_dest = diag_out_dest[:-len('.tar')]
            for c in cores:
                cmd = [pbs_diag, '-g', c]
                ret = self.du.run_cmd(svr_host, cmd, sudo=True,
                                      level=logging.DEBUG2)
                if ret['rc'] != 0:
                    _msg = 'Failed to get core file information for '
                    _msg += '%s on %s:' % (c, svr_host)
                    _msg += '\n\n' + '\n'.join(ret['err']) + '\n\n'
                    f.write(_msg + '\n')
                    self.logger.error(_msg)
                else:
                    of = os.path.join(diag_out_dest,
                                      os.path.basename(c) + '.out')
                    _f = open(of, 'w+')
                    _f.write('\n'.join(ret['out']) + '\n')
                    _f.close()
                self.du.rm(hostname=svr_host, path=c, force=True,
                           sudo=True, level=logging.DEBUG2)
            cmd = ['tar', '-cf', diag_out_dest + '.tar']
            cmd += [os.path.basename(diag_out_dest)]
            ret = self.du.run_cmd(current_host, cmd, sudo=True,
                                  cwd=datadir, level=logging.DEBUG2)
            if ret['rc'] != 0:
                _msg = 'Failed generate tarball of diag directory'
                _msg += ' %s' % diag_out_dest
                _msg += ' after adding core(s) information in it:'
                _msg += '\n\n' + '\n'.join(ret['err']) + '\n\n'
                f.write(_msg + '\n')
                self.logger.error(_msg)
                f.close()
                return
            cmd = ['gzip', diag_out_dest + '.tar']
            ret = self.du.run_cmd(current_host, cmd, sudo=True,
                                  level=logging.DEBUG2)
            if ret['rc'] != 0:
                _msg = 'Failed compress tarball of diag %s' % diag_out_dest
                _msg += '.tar after adding core(s) information in it:'
                _msg += '\n\n' + '\n'.join(ret['err']) + '\n\n'
                f.write(_msg + '\n')
                self.logger.error(_msg)
                f.close()
                return
            self.du.rm(current_host, diag_out_dest, sudo=True,
                       recursive=True, force=True, level=logging.DEBUG2)
        else:
            # No cores: just drop the '.tar.gz' suffix (was
            # rstrip('.tar.gz'), same char-set pitfall as above)
            diag_out_dest = diag_out_dest[:-len('.tar.gz')]
        # Rename the tarball to PBS_<host>.tar.gz inside datadir.
        # NOTE(review): the destination name uses current_host, although
        # the diag was taken on svr_host — confirm this is intended.
        dest = os.path.join(datadir,
                            'PBS_' + current_host.split('.')[0] + '.tar.gz')
        ret = self.du.run_copy(current_host, diag_out_dest + '.tar.gz',
                               dest, sudo=True, level=logging.DEBUG2)
        if ret['rc'] != 0:
            _msg = 'Failed rename tarball of diag from %s' % diag_out_dest
            _msg += '.tar.gz to %s:' % dest
            _msg += '\n\n' + '\n'.join(ret['err']) + '\n\n'
            f.write(_msg + '\n')
            self.logger.error(_msg)
            f.close()
            return
        self.du.rm(current_host, path=diag_out_dest + '.tar.gz',
                   force=True, sudo=True, level=logging.DEBUG2)
        f.close()
        self.__save_data_count += 1
        _msg = 'Successfully saved post analysis data'
        self.logger.log(logging.DEBUG2, _msg)

    def addError(self, test, err):
        self.__save_home(test, 'ERROR', err)

    def addFailure(self, test, err):
        self.__save_home(test, 'FAIL', err)

    def addSuccess(self, test):
        self.__save_home(test, 'PASS')
class PTLTestData(Plugin):

    """
    Save post analysis data on test cases failure or error.

    On a FAIL/ERROR outcome this plugin runs ``pbs_snapshot`` against the
    test's server host (plus any additional moms/comms/client hosts) and
    copies the resulting tarball into ``post_data_dir``.
    """
    name = 'PTLTestData'
    # Run late relative to other plugins
    score = sys.maxsize - 6
    logger = logging.getLogger(__name__)

    def __init__(self):
        Plugin.__init__(self)
        self.post_data_dir = None
        self.max_postdata_threshold = None
        self.__save_data_count = 0
        self.__priv_sn = ''
        self.du = DshUtils()

    def options(self, parser, env):
        """
        Register command line options
        """
        pass

    def set_data(self, post_data_dir, max_postdata_threshold):
        """
        Set the destination directory for saved data and the maximum
        number of data sets to save per test suite (0 means unlimited).
        """
        self.post_data_dir = post_data_dir
        self.max_postdata_threshold = max_postdata_threshold

    def configure(self, options, config):
        """
        Configure the plugin and system, based on selected options
        """
        self.config = config
        # Enabled only when a destination directory was supplied
        if self.post_data_dir is not None:
            self.enabled = True
        else:
            self.enabled = False

    def __save_home(self, test, status, err=None):
        """
        Save post analysis data for one finished test.

        :param test: the nose test (or context) object
        :param status: outcome label ('PASS', 'FAIL', 'ERROR', ...)
        :param err: optional (exc_type, exc_value, tb) tuple from nose
        """
        if hasattr(test, 'test'):
            _test = test.test
            sn = _test.__class__.__name__
        elif hasattr(test, 'context'):
            _test = test.context
            sn = _test.__name__
        else:
            # test does not have any PBS Objects, so just return
            return
        # Reset the per-suite counter when a new test suite begins
        if self.__priv_sn != sn:
            self.__save_data_count = 0
            self.__priv_sn = sn
        # Saving home might take time so disable timeout
        # handler set by runner
        tn = getattr(_test, '_testMethodName', 'unknown')
        testlogs = getattr(test, 'captured_logs', '')
        datadir = os.path.join(self.post_data_dir, sn, tn)
        if os.path.exists(datadir):
            _msg = 'Old post analysis data exists at %s' % datadir
            _msg += ', skipping saving data for this test case'
            self.logger.warn(_msg)
            _msg = 'Please remove old directory or'
            _msg += ' provide different directory'
            self.logger.warn(_msg)
            return
        if getattr(test, 'old_sigalrm_handler', None) is not None:
            _h = getattr(test, 'old_sigalrm_handler')
            signal.signal(signal.SIGALRM, _h)
            signal.alarm(0)
        self.logger.log(logging.DEBUG2, 'Saving post analysis data...')
        current_host = socket.gethostname().split('.')[0]
        self.du.mkdir(current_host, path=datadir, mode=0o755,
                      parents=True, logerr=False, level=logging.DEBUG2)
        if err is not None:
            if isclass(err[0]) and issubclass(err[0], SkipTest):
                status = 'SKIP'
                status_data = 'Reason = %s' % (err[1])
            else:
                if isclass(err[0]) and issubclass(err[0], TimeOut):
                    status = 'TIMEDOUT'
                status_data = getattr(test, 'err_in_string', '')
        else:
            status_data = ''
        logfile = os.path.join(datadir, 'logfile_' + status)
        f = open(logfile, 'w+')
        f.write(testlogs + '\n')
        f.write(status_data + '\n')
        f.write('test duration: %s\n' %
                str(getattr(test, 'duration', '0')))
        if status in ('PASS', 'SKIP'):
            # Test case passed or skipped, no need to save post
            # analysis data
            f.close()
            return
        if ((self.max_postdata_threshold != 0) and
                (self.__save_data_count >= self.max_postdata_threshold)):
            _msg = 'Total number of saved post analysis data for this'
            _msg += ' testsuite is exceeded max postdata threshold'
            _msg += ' (%d)' % self.max_postdata_threshold
            f.write(_msg + '\n')
            self.logger.error(_msg)
            f.close()
            return
        servers = getattr(_test, 'servers', None)
        if servers is not None:
            # NOTE: dict views are not subscriptable on Python 3;
            # list(...) works for both a plain dict and a list-returning
            # container (was servers.values()[0])
            server_host = list(servers.values())[0].shortname
        else:
            _msg = 'Could not find Server Object in given test object'
            _msg += ', skipping saving post analysis data'
            f.write(_msg + '\n')
            self.logger.warning(_msg)
            f.close()
            return
        moms = getattr(_test, 'moms', None)
        comms = getattr(_test, 'comms', None)
        client = getattr(list(_test.servers.values())[0], 'client', None)
        server = list(servers.values())[0]
        # Gather every host other than the server host so pbs_snapshot
        # can collect data from them too
        add_hosts = []
        if len(servers) > 1:
            for param in list(servers.values())[1:]:
                add_hosts.append(param.shortname)
        if moms is not None:
            for param in moms.values():
                add_hosts.append(param.shortname)
        if comms is not None:
            for param in comms.values():
                add_hosts.append(param.shortname)
        if client is not None:
            add_hosts.append(client.split('.')[0])
        add_hosts = list(set(add_hosts) - set([server_host]))
        pbs_snapshot_path = os.path.join(
            server.pbs_conf["PBS_EXEC"], "sbin", "pbs_snapshot")
        cur_user = self.du.get_current_user()
        cur_user_dir = pwd.getpwnam(cur_user).pw_dir
        cmd = [
            pbs_snapshot_path,
            '-H',
            server_host,
            '--daemon-logs',
            '2',
            '--accounting-logs',
            '2',
            '--with-sudo'
        ]
        if len(add_hosts) > 0:
            cmd += ['--additional-hosts=' + ','.join(add_hosts)]
        cmd += ['-o', cur_user_dir]
        ret = self.du.run_cmd(current_host, cmd, level=logging.DEBUG2,
                              logerr=False)
        if ret['rc'] != 0:
            # was: 'Failed to get analysis information for ' +
            # 'on %s' % servers_host -- 'servers_host' was an undefined
            # name (NameError); the variable is server_host
            _msg = 'Failed to get analysis information'
            _msg += ' on %s:' % server_host
            _msg += '\n\n' + '\n'.join(ret['err']) + '\n\n'
            f.write(_msg + '\n')
            self.logger.error(_msg)
            f.close()
            return
        else:
            if len(ret['out']) == 0:
                self.logger.error('Snapshot command failed')
                f.close()
                return
            # pbs_snapshot prints '<label>: <path>' on its first line
            snap_out = ret['out'][0]
            snap_out_dest = (snap_out.split(":")[1]).strip()
            dest = os.path.join(datadir,
                                'PBS_' + server_host + '.tar.gz')
            # NOTE(review): copy result is not checked here — confirm
            # a failed copy should still count as saved data
            ret = self.du.run_copy(current_host, snap_out_dest, dest,
                                   sudo=True, level=logging.DEBUG2)
            self.du.rm(current_host, path=snap_out_dest,
                       recursive=True, force=True, level=logging.DEBUG2)
        f.close()
        self.__save_data_count += 1
        _msg = 'Saved post analysis data'
        self.logger.info(_msg)

    def addError(self, test, err):
        self.__save_home(test, 'ERROR', err)

    def addFailure(self, test, err):
        self.__save_home(test, 'FAIL', err)

    def addSuccess(self, test):
        self.__save_home(test, 'PASS')
def _cleanup(self):
    """
    Clean up temporary files left behind by a test run: PTL scratch
    files in each PBS user's home directory on every server/mom host,
    well-known temp directories on the local host, files tracked in
    DshUtils' tmpfilelist, and any leftover dejagnu temp directory.
    """
    self.logger.info('Cleaning up temporary files')
    du = DshUtils()
    hosts = set(self.param_dict['moms']).union(
        set(self.param_dict['servers']))
    for user in PBS_USERS:
        self.logger.debug('Cleaning %s\'s home directory' % (str(user)))
        runas = PbsUser.get_user(user)
        for host in hosts:
            # Resolve the user's home dir on that host ($HOME expanded
            # remotely, hence as_script=True)
            ret = du.run_cmd(host, cmd=['echo', '$HOME'], sudo=True,
                             runas=runas, logerr=False, as_script=True,
                             level=logging.DEBUG)
            if ret['rc'] == 0:
                path = ret['out'][0].strip()
            else:
                # NOTE(review): this aborts the entire cleanup on the
                # first failed $HOME lookup; 'continue' may be the
                # intent -- confirm before changing
                return None
            ftd = []
            files = du.listdir(host, path=path, runas=user,
                               level=logging.DEBUG)
            bn = os.path.basename
            ftd.extend([f for f in files if bn(f).startswith('PtlPbs')])
            ftd.extend([f for f in files if bn(f).startswith('STDIN')])
            # Remove in batches of at most 1000 paths per command to
            # keep the argument list bounded.  BUG FIX: this loop was
            # guarded by 'if len(ftd) > 1000:', so 1-1000 leftover
            # files were never removed at all; range(0, n, 1000)
            # already handles every size (including 0)
            for i in range(0, len(ftd), 1000):
                j = i + 1000
                du.rm(host, path=ftd[i:j], runas=user,
                      force=True, level=logging.DEBUG)
    root_dir = os.sep
    dirlist = set([os.path.join(root_dir, 'tmp'),
                   os.path.join(root_dir, 'var', 'tmp')])
    # get tmp dir from the environment
    for envname in 'TMPDIR', 'TEMP', 'TMP':
        dirname = os.getenv(envname)
        if dirname:
            dirlist.add(dirname)
    p = re.compile(r'^pbs\.\d+')
    for tmpdir in dirlist:
        # list the contents of each tmp dir and
        # get the file list to be deleted
        self.logger.info('Cleaning up ' + tmpdir + ' dir')
        ftd = []
        files = du.listdir(path=tmpdir)
        bn = os.path.basename
        ftd.extend([f for f in files if bn(f).startswith('PtlPbs')])
        ftd.extend([f for f in files if bn(f).startswith('STDIN')])
        ftd.extend([f for f in files if bn(f).startswith('pbsscrpt')])
        ftd.extend([f for f in files if bn(f).startswith('pbs.conf.')])
        ftd.extend([f for f in files if p.match(bn(f))])
        for f in ftd:
            du.rm(path=f, sudo=True, recursive=True, force=True,
                  level=logging.DEBUG)
    # Remove files DshUtils itself created, then forget them
    for f in du.tmpfilelist:
        du.rm(path=f, sudo=True, force=True, level=logging.DEBUG)
    del du.tmpfilelist[:]
    tmpdir = tempfile.gettempdir()
    os.chdir(tmpdir)
    tmppath = os.path.join(tmpdir, 'dejagnutemp%s' % os.getpid())
    if du.isdir(path=tmppath):
        du.rm(path=tmppath, recursive=True, sudo=True, force=True,
              level=logging.DEBUG)