def queryNodeAvg(self, node, minutes=5):
    """Return the summed per-user average CPU usage of ``node`` over a recent window.

    The window ends at the node's cached ``last_ts`` and spans ``minutes``
    minutes.  For every user that has samples on the node, the mean of the
    CPU field over the samples inside the window is computed; the result is
    the sum of those per-user means.

    Args:
        node: Key into ``self.nodes`` and ``self.node_user``.
        minutes: Length of the averaging window, in minutes.

    Returns:
        The summed per-user mean CPU usage, or 0 when the node is not in the
        cache or no user has samples inside the window.
    """
    mem_rlt = 0  # memory average is not computed; only reported in the debug line

    # Validate the node BEFORE reading self.nodes[node]; reading 'last_ts'
    # first raised KeyError for unknown (or empty) nodes.
    if node not in self.nodes or not self.nodes[node]:
        logger.warning("queryNodeAvg: Node {} is not in cache".format(node))
        return 0

    node_info = self.nodes[node]
    window = minutes * 60
    start_ts = node_info['last_ts'] - window

    # The requested period is not fully cached when it starts before the
    # first cached sample; 20% relax on period inclusion.
    if start_ts < node_info['first_ts'] - window * 0.2:
        logger.warning(
            "queryNodeAvg: Node {} requested period {}- is not completely in cache ({}-{})"
            .format(node, start_ts, node_info['first_ts'],
                    node_info['last_ts']))
        # still return some value

    cpu_rlt = 0
    for uid, user_usage in self.node_user[node].items():
        # user_usage is searched as a sequence of (ts, usage) tuples sorted by ts.
        idx = bisect_left(user_usage, (start_ts, ))
        seq = user_usage[idx:]
        if not seq:
            # No sample inside the window: nothing to average for this user.
            continue
        try:
            # usage is evenly distributed, thus just mean.
            # TODO: have problem when node is down and not sending data
            cpu_rlt += mean(
                [usage[InMemCache.CPU_IDX] for (_, usage) in seq])
        except Exception as e:  # best effort: skip this user, keep the others
            logger.error("ERROR {} uid={} usage={} start={} idx={} ".format(
                e, uid, user_usage, start_ts, idx))

    logger.debug("\tnode={}:cpu_rlt={}, mem_rlt={}".format(
        node_info, cpu_rlt, mem_rlt))
    return cpu_rlt
def getJobByName_cluster(job_name, cluster, fields):
    """Return the job-table records of ``cluster`` whose name equals ``job_name``.

    Args:
        job_name: Exact value to match against the ``job_name`` column.
        cluster: Cluster identifier passed to ``SlurmDBQuery.readJobTable``.
        fields: Field list passed to ``readJobTable`` as ``fld_lst``.  The
            resulting table must provide ``job_name``, ``state``, ``id_user``,
            ``time_start`` and ``time_end``, which are read below.

    Returns:
        A list with one dict per matching job (``DataFrame.to_dict`` with
        ``orient='records'``), extended with ``user`` and ``duration`` and
        with missing values replaced by ``'Not Defined'``.
    """
    _, _, df = SlurmDBQuery.readJobTable(cluster, fld_lst=fields)

    # The boolean filter yields a slice of the table; take an explicit copy so
    # the column assignments below act on an independent frame instead of
    # triggering pandas' chained-assignment (SettingWithCopy) behaviour.
    df = df[df['job_name'] == job_name].copy()

    # Translate state codes; codes without a mapping are kept unchanged.
    df['state'] = df['state'].map(lambda x: SLURM_STATE_DICT.get(x, x))
    df['user'] = df['id_user'].map(MyTool.getUser)

    # Non-positive (and NaN) differences are reported as 0.
    elapsed = df['time_end'] - df['time_start']
    df['duration'] = elapsed.map(lambda x: x if x > 0 else 0)

    df = df.fillna('Not Defined')
    return df.to_dict(orient='records')
def gendata_all(fs, start='', stop='', topN=5):
    """Build usage time series of the top users of file system ``fs``.

    The top users are chosen from the most recent snapshot returned by
    ``gendata_fs_history``: once ranked by ``fc`` (ties broken by ``bc``) and
    once ranked by ``bc`` (ties broken by ``fc``).

    Args:
        fs: Key into ``FileSystems``.
        start: Start of the period, forwarded to ``gendata_fs_history``.
        stop: End of the period, forwarded to ``gendata_fs_history``.
        topN: Number of top users to include in each ranking.

    Returns:
        A pair ``(fc_series, bc_series)``.  Each is a list of
        ``{'name': <user name>, 'data': [[ts, value], ...]}`` dicts, one per
        top user.  Both lists are empty when ``fs`` is unknown or there is no
        history.
    """
    if fs not in FileSystems:
        logger.warning(
            "WARNING gendata_all: Unknown file system: {}".format(fs))
        return [], []

    dDict = gendata_fs_history(fs, start, stop)
    if not dDict:
        return [], []

    # get last ts, decide top users based on latest usage
    latest = dDict[max(dDict)]
    # Honour topN (the count used to be hard-coded to 5).
    top_uid1 = latest.nlargest(topN, ["fc", "bc"], keep='first')['uid'].tolist()
    top_uid2 = latest.nlargest(topN, ["bc", "fc"], keep='first')['uid'].tolist()

    # Stack all snapshots; the dict key becomes the 'ts' index level.
    dfg = pandas.concat(dDict, names=['ts', 'idx']).groupby('uid')

    def _series(uids, column):
        # One {'name', 'data'} entry per uid with [ts, column] pairs.
        result = []
        for uid in uids:
            uidDf = dfg.get_group(uid).reset_index()
            result.append({
                'name': MyTool.getUser(uid, True),
                'data': uidDf.loc[:, ['ts', column]].values.tolist()
            })
        return result

    return _series(top_uid1, 'fc'), _series(top_uid2, 'bc')
def queryNode(self, node, start_ts=None, end_ts=None):
    """Return the cached per-user CPU, memory and IO series of ``node``.

    Args:
        node: Key into ``self.nodes`` and ``self.node_user``.
        start_ts: Optional start of the period; samples with an earlier
            timestamp are dropped.  A start more than 300 seconds before the
            first cached timestamp is treated as "not in cache".
        end_ts: Optional end of the period; samples with a later timestamp
            are dropped.

    Returns:
        ``(node_info, cpu, mem, io)`` where ``node_info`` is
        ``self.nodes[node]`` and each of the others is a list of
        ``{'name': user, 'data': [[ts * 1000, value], ...]}`` dicts, one per
        user.  ``(None, [], [], [])`` when the node or the requested start is
        not covered by the cache.
    """
    if node not in self.nodes:
        logger.info("queryNode: Node {} is not in cache".format(node))
        return None, [], [], []

    node_info = self.nodes[node]
    if start_ts and start_ts < node_info['first_ts'] - 300:  # five minutes gap is allowed
        logger.info(
            "queryNode: Node {} period {}-{} is not completely in cache ({}-{})"
            .format(node, start_ts, end_ts, node_info['first_ts'],
                    node_info['last_ts']))
        return None, [], [], []

    cpu_rlt, mem_rlt, io_rlt = [], [], []
    for uid, user_usage in self.node_user[node].items():
        user = MyTool.getUser(uid)

        # user_usage is searched as a sequence of (ts, usage) tuples sorted by ts.
        seq = user_usage
        if start_ts:
            seq = seq[bisect_left(seq, (start_ts, )):]
        if end_ts:
            # Trim the ALREADY start-trimmed window.  The previous code sliced
            # the full user_usage with an index computed on the trimmed list,
            # which discarded the start trimming and misplaced the end cut.
            seq = seq[:bisect_right([ts for (ts, _) in seq], end_ts)]

        cpu_rlt.append({
            'name': user,
            'data': [[ts * 1000, usage[InMemCache.CPU_IDX]]
                     for (ts, usage) in seq]
        })
        mem_rlt.append({
            'name': user,
            'data': [[ts * 1000, usage[InMemCache.RSS_IDX]]
                     for (ts, usage) in seq]
        })
        # NOTE(review): index 11 has no named constant here; presumably the
        # IO rate field of the usage record -- confirm against the producer.
        io_rlt.append({
            'name': user,
            'data': [[ts * 1000, usage[11]] for (ts, usage) in seq]
        })

    logger.debug("\tnode={}:cpu_rlt={}".format(node_info, cpu_rlt))
    return node_info, cpu_rlt, mem_rlt, io_rlt
def sacct_getReport(
        criteria,
        days=3,
        output='JobID,JobName,AllocCPUS,State,ExitCode,User,NodeList,Start,End',
        skipJobStep=True):
    """Run ``sacct`` through ``SlurmCmdQuery.sacctCmd`` and return one dict per job.

    Args:
        criteria: List of extra ``sacct`` command-line arguments.  It is not
            modified; a new list is built when ``days`` is set.
        days: When truthy, ``-S <date>`` is prepended so that only jobs since
            ``days`` days ago are reported.
        output: Comma-separated list of ``sacct`` fields to request.
        skipJobStep: When true, records whose JobID contains ``'.'`` (job
            steps such as ``508550_0.extern`` or ``511269+0.0``) are skipped.

    Returns:
        A list of dicts keyed by the field names reported by ``sacctCmd``.
        A State of the form ``CANCELLED by <uid>`` gets the user name
        appended in parentheses.  When ``output`` contains ``AllocTRES`` each
        dict also gets an ``AllocGPUS`` entry.
    """
    if days:
        start_day = date.today() - timedelta(days=days)
        criteria = ['-S', start_day.isoformat()] + criteria  # Constraints has problem

    field_str, sacct_rlt = SlurmCmdQuery.sacctCmd(criteria, output)
    keys = field_str.split(sep='|')
    jid_idx = keys.index('JobID')
    # Locate State by name: its position depends on the requested `output`,
    # so a fixed column index is only right for the default field list.
    state_idx = keys.index('State') if 'State' in keys else None

    jobs = []
    for line in sacct_rlt:
        if not line.strip():
            continue  # ignore blank output lines
        ff = line.split(sep='|')
        if skipJobStep and '.' in ff[jid_idx]:
            continue  # indicates a job step
        if state_idx is not None and ff[state_idx].startswith('CANCELLED by '):
            uid = ff[state_idx].rsplit(' ', 1)[1]
            ff[state_idx] = '%s (%s)' % (ff[state_idx], MyTool.getUser(uid))
        jobs.append(dict(zip(keys, ff)))

    if 'AllocTRES' in output:
        for job in jobs:
            job['AllocGPUS'] = MyTool.getTresGPUCount(job['AllocTRES'])
    return jobs
def getNodeRunJobs(self, node, start, stop):
    """Return jobs from the cluster job table, optionally restricted to ``node``.

    Args:
        node: Node name; when truthy only jobs whose expanded ``nodelist``
            contains it are kept.
        start: Start of the period, forwarded to ``MyTool.getDFBetween``.
        stop: End of the period, forwarded to ``MyTool.getDFBetween``.

    Returns:
        A DataFrame with the columns ``id_job``, ``user``, ``time_start``,
        ``time_end`` and ``time_suspended``.
    """
    df = pandas.read_csv(CSV_DIR + "slurm_cluster_job_table.csv",
                         usecols=[
                             'id_job', 'id_user', 'nodelist', 'nodes_alloc',
                             'state', 'time_start', 'time_end',
                             'time_suspended'
                         ])
    # presumably keeps the rows whose time_start lies between start and stop
    start, stop, df = MyTool.getDFBetween(df, 'time_start', start, stop)
    df = df[df['nodes_alloc'] > 0]

    # jobs running on node
    if node:
        on_node = df['nodelist'].map(lambda x: node in MyTool.nl2flat(x))
        df = df[on_node]

    # assign() returns a new frame, so the column is not written into a
    # filtered slice of the table (avoids pandas' chained-assignment warning).
    df = df.assign(user=df['id_user'].map(MyTool.getUser))
    return df[['id_job', 'user', 'time_start', 'time_end', 'time_suspended']]
def display_job_GPU(jid):
    """Print a GPU utilisation report for the currently running job ``jid``.

    Prints a one-line notice and returns when the job is not found or has no
    GPU allocated.  Otherwise prints a summary line followed by one row per
    allocated GPU holding its average over the whole job and over the last
    5, 10 and 30 minutes (values are multiplied by 100 for display).
    """
    now = int(time.time())

    job = PyslurmQuery.getCurrJob(jid)
    if not job:
        notice = "{} Job {} does not exist or already stops running."
        print(notice.format(MyTool.getTsString(now), jid))
        return

    # Iterated below as: node name -> list of GPU ids allocated on that node.
    alloc = PyslurmQuery.getJobAllocGPU(job)
    if not alloc:
        notice = "{} Job {} does not allocate any GPU."
        print(notice.format(MyTool.getTsString(now), jid))
        return

    started = job['start_time']
    stamp = MyTool.getTsString(now)
    owner = MyTool.getUser(job['user_id'])
    runtime = datetime.timedelta(seconds=now - started)
    gpu_total = sum(len(gids) for gids in alloc.values())
    gpu_nodes = len([gids for gids in alloc.values() if gids])
    print("{} Job {} of {} run for {},\talloc {} GPUs on {} GPU nodes.".format(
        stamp, jid, owner, runtime, gpu_total, gpu_nodes))

    # Every GPU id used on any node of the job.
    all_gids = set()
    for gids in alloc.values():
        all_gids = all_gids.union(set(gids))

    series = BrightRestClient().getGPU(list(alloc.keys()),
                                       started,
                                       list(all_gids),
                                       msec=False)

    header = "\t{:12}{:>6}{:>20}{:>25}"
    print(header.format("Node", "GPU", "Job avg util", "Avg util (5,10,30min)"))

    row = "\t{:12}{:6}{:>20.2f}{:>10.2f},{:>6.2f},{:>6.2f}"
    for node_name in alloc:
        for gid in alloc[node_name]:
            samples = series['{}.gpu{}'.format(node_name, gid)]
            whole_job = MyTool.getTimeSeqAvg(samples, started, now)
            recent = [
                MyTool.getTimeSeqAvg(samples, now - mins * 60, now)
                for mins in (5, 10, 30)
            ]
            print(row.format(node_name, gid, whole_job * 100,
                             recent[0] * 100, recent[1] * 100,
                             recent[2] * 100))
def gendata_fs(yyyymmdd, fs, ansible_users=None, anon=False):
    """Build per-user usage points of file system ``fs`` for one snapshot.

    The snapshot whose file name contains ``yyyymmdd`` is compared with the
    snapshot preceding it; when no file matches, the most recent snapshot is
    used and the returned date is taken from its file name.

    Args:
        yyyymmdd: Date string looked for in the snapshot file names.
        fs: Key into ``FileSystems``.
        ansible_users: Optional mapping uid -> user name, consulted when
            ``MyTool.getUser`` does not return a name.
        anon: When true, names are passed through ``anonimize``.

    Returns:
        ``[label, points, yyyymmdd]`` where ``points`` is a list of dicts with
        the keys ``x``, ``y``, ``z``, ``dfb``, ``dfc``, ``id``, ``name`` and
        optionally ``marker``.  An error string is returned instead when
        ``fs`` is unknown or has no snapshot file.
    """
    if fs not in FileSystems:
        return 'Unknown file system: {}'.format(fs)
    if ansible_users is None:  # avoid a shared mutable default argument
        ansible_users = {}

    label, dataDir, suffix, uidx, fcx, bcx, rge = FileSystems[fs]
    ff = sorted(glob.glob(dataDir + '/2*' + suffix))
    if not ff:
        # Without this guard ff[-1] below raised IndexError.
        logger.warning(
            'gendata_fs: No data file found for file system: {}'.format(fs))
        return 'No data file found for file system: {}'.format(fs)

    idx = next(
        (x for x, f in enumerate(ff)
         if yyyymmdd in os.path.basename(f)),  # filename without dir
        None)
    if idx is None:
        idx = len(ff) - 1
        logger.info('Date {}:{} not found. Use most recent {} instead.'.format(
            fs, yyyymmdd, ff[-1]))
        yyyymmdd = getDateFromFileName(rge, os.path.basename(ff[-1]))

    # calculate delta and cut_off
    # For the oldest snapshot there is no predecessor: compare it with itself
    # (all deltas 0) rather than letting idx - 1 == -1 wrap to the newest file.
    pre = read_file(ff[max(idx - 1, 0)], uidx, [fcx, bcx])
    curr = read_file(ff[idx], uidx, [fcx, bcx])
    # uid: [delta_fc, delta_bc, curr_fc, curr_bc]
    delta = {
        k: minus_list(curr.get(k, None), pre.get(k, None)) +
        curr.get(k, [0, 0])
        for k in (set(pre) | set(curr))
    }
    t_dfc = sum(v[0] for v in delta.values())

    # find N50 wrt file count.
    # NOTE(review): behaviour is kept as it was -- `cutoff` is overwritten on
    # every iteration past the half-way point and the final check resets it
    # to 2 whenever there are at least two users, so effectively the three
    # users with the largest file-count delta are highlighted.  Confirm
    # whether a true N50 cutoff was intended.
    ranked = sorted(((v[0], uid) for uid, v in delta.items()), reverse=True)
    s = 0
    uid2x = {}  # uid -> rank by file-count delta (0 = largest)
    cutoff = None
    for x, (dfc, uid) in enumerate(ranked):
        s += dfc
        if 2 * s > t_dfc:
            cutoff = x
        uid2x[uid] = x
    if ranked and 4 * (len(ranked) - 1) > len(delta):
        cutoff = 2

    r = []
    for uid, v in delta.items():
        if uid < 1000:  # skip
            continue
        if 0 == v[2] or 0 == v[3]:  # curr_fc==0 or curr_bc==0, skip
            continue
        d = {
            'x': v[3],
            'y': v[2],
            'z': log(max(2**20, v[1]), 2) - 19,
            'dfb': v[1],
            'dfc': v[0],
            'id': uid
        }
        uname = MyTool.getUser(uid, fakeName=False)  # slurm user name
        known = bool(uname)
        if not known:
            # fall back to the ansible user name, then to a synthetic name
            uname = ansible_users.get(uid) or "User_{}".format(uid)
        d['name'] = anonimize(uname) if anon else uname
        if not known:
            d['marker'] = {'fillColor': 'rgba(255,225,0,0.5)'}
        elif cutoff and uid2x[uid] <= cutoff:
            d['marker'] = {'fillColor': 'rgba(236,124,181,0.9)'}
        r.append(d)
    return [label, r, yyyymmdd]
def getProcsByUser(self, hostname, msg_ts, msg_procs, pre_ts, pre_procs,
                   uid2cpuCnt):
    """Group the processes of one message by uid and summarise each group.

    Args:
        hostname: Not used by this method.
        msg_ts: Timestamp of the current message.
        msg_procs: Mapping pid -> process record of the current message.
        pre_ts: Timestamp of the previous message.
        pre_procs: Mapping pid -> process record of the previous message.
        uid2cpuCnt: Mapping uid -> allocated core count (0 when absent).

    Returns:
        One entry per uid:
        ``[user, uid, alloc_cores, proc_cnt, totCPURate, totRSS, totVMS,
        procs, totIOBps, totCPUTime]`` where ``procs`` is the list of
        per-process records
        ``[pid, CPURate, create_time, user_time, system_time, rss, vms,
        cmdline, IOBps, jid, num_fds, read_bytes, write_bytes, uid]``.
    """
    grouped = DDict(list)  # uid -> list of per-process records

    for pid, proc in msg_procs.items():
        if pid not in pre_procs:
            # new proc: measure from its creation, starting at zero
            cpu_base = 0.0
            io_base = 0
            elapsed = msg_ts - proc['create_time']
        else:
            # continue proc: measure against the previous message
            earlier = pre_procs[pid]
            cpu_base = (earlier['cpu']['user_time'] +
                        earlier['cpu']['system_time'])
            io_base = (earlier['io']['read_bytes'] +
                       earlier['io']['write_bytes'])
            elapsed = msg_ts - pre_ts

        # TODO: Replace cheap trick to avoid div0.
        if elapsed < 0.1:
            logger.warning(
                "The time period betweeen {} and {} is too small, use 0.1 to calculate the CPU rate"
                .format(msg_ts, pre_ts))
            elapsed = 0.1
        cpu_now = proc['cpu']['user_time'] + proc['cpu']['system_time']
        cpu_rate = (cpu_now - cpu_base) / elapsed

        if elapsed < 1:
            logger.warning(
                "The time period betweeen {} and {} is too small, use 1 to calculate IOBps"
                .format(msg_ts, pre_ts))
            elapsed = 1
        io_now = proc['io']['read_bytes'] + proc['io']['write_bytes']
        io_bps = int((io_now - io_base) / elapsed)

        # add jid 12/09/2019, add io_read, write 12/13/2019
        record = [
            pid,
            cpu_rate,
            proc['create_time'],
            proc['cpu']['user_time'],
            proc['cpu']['system_time'],
            proc['mem']['rss'],
            proc['mem']['vms'],
            proc['cmdline'],
            io_bps,
            proc['jid'],
            proc['num_fds'],
            proc['io']['read_bytes'],
            proc['io']['write_bytes'],
            proc['uid'],
        ]
        grouped[proc['uid']].append(record)

    # get summary over processes of uid
    summary = []
    for uid in grouped:
        records = grouped[uid]
        cpu_time_sum = sum(rec[3] + rec[4] for rec in records)
        cpu_rate_sum = sum(rec[1] for rec in records)
        rss_sum = sum(rec[5] for rec in records)
        vms_sum = sum(rec[6] for rec in records)
        io_bps_sum = sum(rec[8] for rec in records)
        summary.append([
            MyTool.getUser(uid),
            uid,
            uid2cpuCnt.get(uid, 0),
            len(records),
            cpu_rate_sum,
            rss_sum,
            vms_sum,
            records,
            io_bps_sum,
            cpu_time_sum,
        ])
    return summary