Example #1
def display_node_GPU(node_name):
    ts = int(time.time())
    node = PyslurmQuery.getNode(node_name)
    if not node:
        print("{} Node {} does not exist.".format(MyTool.getTsString(ts),
                                                  node_name))
        return
    if 'gpu' not in node['features']:
        print("{} Node {} does not have GPUs.".format(MyTool.getTsString(ts),
                                                      node_name))
        return

    jobs = PyslurmQuery.getNodeAllocJobs(node_name, node)
    gpu_total, gpu_used = MyTool.getGPUCount(node['gres'], node['gres_used'])
    print("{}: Node {} up {},\t{} GPUs ({} used), {} allocated jobs.".format(
        MyTool.getTsString(ts), node_name,
        datetime.timedelta(seconds=ts - node['boot_time']), gpu_total,
        gpu_used,
        len(jobs) if jobs else 0))

    jid2gpu = {job['job_id']: PyslurmQuery.getJobAllocGPUonNode(job, node)
               for job in jobs}
    if jid2gpu:
        job_gpu = reduce(operator.add, jid2gpu.values())
        start_ts = min(
            [job['start_time'] for job in jobs if job['gres_detail']])
        gpu_data = BrightRestClient().getNodeGPU(node_name,
                                                 start_ts,
                                                 job_gpu,
                                                 msec=False)
    else:
        gpu_data = {}

    jid2job = {job['job_id']: job for job in jobs}
    gid2jid = defaultdict(list)
    for jid, gpu_list in jid2gpu.items():
        for gid in gpu_list:
            gid2jid[gid].append(jid)
    print("\t{:6}{:10}{:>20}{:>20}{:>25}".format("GPU", "Jid", "Job run time",
                                                 "Job avg util",
                                                 "Avg util (5,10,30min)"))
    for gid in range(0, gpu_total):
        jid_list = gid2jid[gid]
        if jid_list:
            start_ts = min([jid2job[jid]['start_time'] for jid in jid_list])
            g_data = gpu_data['{}.gpu{}'.format(node_name, gid)]
            g_avg = MyTool.getTimeSeqAvg(g_data, start_ts, ts)
            g_avg1 = MyTool.getTimeSeqAvg(g_data, ts - 5 * 60, ts)
            g_avg2 = MyTool.getTimeSeqAvg(g_data, ts - 10 * 60, ts)
            g_avg3 = MyTool.getTimeSeqAvg(g_data, ts - 30 * 60, ts)
            print("\t{:<6}{:10}{:>20}{:>20.2f}{:>10.2f},{:>6.2f},{:>6.2f}".
                  format(gid, str(jid_list),
                         str(datetime.timedelta(seconds=ts - start_ts)),
                         g_avg * 100, g_avg1 * 100, g_avg2 * 100,
                         g_avg3 * 100))
        else:
            print("\t{:<6}{:10}".format(gid, "IDLE"))
Example #2
def display_job_GPU(jid):
    ts = int(time.time())
    job = PyslurmQuery.getCurrJob(jid)
    if not job:
        print("{} Job {} does not exist or already stops running.".format(
            MyTool.getTsString(ts), jid))
        return
    j_gpu = PyslurmQuery.getJobAllocGPU(job)
    #print(j_gpu)
    if not j_gpu:
        print("{} Job {} does not allocate any GPU.".format(
            MyTool.getTsString(ts), jid))
        return

    print("{} Job {} of {} run for {},\talloc {} GPUs on {} GPU nodes.".format(
        MyTool.getTsString(ts), jid, MyTool.getUser(job['user_id']),
        datetime.timedelta(seconds=ts - job['start_time']),
        sum([len(g_lst) for g_lst in j_gpu.values()]),
        sum([1 for g_lst in j_gpu.values() if g_lst])))
    gpu_union = reduce(lambda rlt, curr: rlt.union(set(curr)), j_gpu.values(),
                       set())
    #print(gpu_union)
    gpu_data = BrightRestClient().getGPU(list(j_gpu.keys()),
                                         job['start_time'],
                                         list(gpu_union),
                                         msec=False)
    #print(gpu_data.keys())
    print("\t{:12}{:>6}{:>20}{:>25}".format("Node", "GPU", "Job avg util",
                                            "Avg util (5,10,30min)"))
    for node_name, gpu_list in j_gpu.items():
        for gid in gpu_list:
            g_data = gpu_data['{}.gpu{}'.format(node_name, gid)]
            g_avg = MyTool.getTimeSeqAvg(g_data, job['start_time'], ts)
            g_avg1 = MyTool.getTimeSeqAvg(g_data, ts - 5 * 60, ts)
            g_avg2 = MyTool.getTimeSeqAvg(g_data, ts - 10 * 60, ts)
            g_avg3 = MyTool.getTimeSeqAvg(g_data, ts - 30 * 60, ts)
            print("\t{:12}{:6}{:>20.2f}{:>10.2f},{:>6.2f},{:>6.2f}".format(
                node_name, gid, g_avg * 100, g_avg1 * 100, g_avg2 * 100,
                g_avg3 * 100))
    return
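The reduce over j_gpu.values() builds the set of GPU indices the job uses across all of its nodes. Assuming j_gpu maps node names to lists of GPU indices, as the loop over j_gpu.items() implies, the same union can be written without functools.reduce; the node names and indices below are made up for illustration.

from functools import reduce

# Hypothetical allocation: GPUs 0,1 on one node and 1,2 on another.
j_gpu = {"workergpu001": [0, 1], "workergpu002": [1, 2]}

gpu_union = reduce(lambda rlt, curr: rlt.union(set(curr)), j_gpu.values(), set())
gpu_union_alt = set().union(*j_gpu.values())   # equivalent and arguably clearer
assert gpu_union == gpu_union_alt == {0, 1, 2}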
Example #3
    def sendUpdate(self, ts, slurmJobs, hn2data, slurmNodes):
        for url in self.urls:
            zps = zlib.compress(
                cPickle.dumps((ts, slurmJobs, hn2data, slurmNodes), -1))
            try:
                logger.debug("sendUpdate to {}".format(url))
                if not self.test_mode:
                    resp = urllib2.urlopen(
                        urllib2.Request(
                            url, zps,
                            {'Content-Type': 'application/octet-stream'}))
                else:
                    resp = 0
                logger.debug(
                    "{}:{}: sendUpdate to {} with return code {}".format(
                        threading.currentThread().ident,
                        MyTool.getTsString(ts), url, resp))
            except Exception as e:
                logger.error('Failed to update slurm data {}: {}'.format(
                    url, e))
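sendUpdate pickles and zlib-compresses the tuple once per destination URL even though the payload never changes, and it uses the Python 2 cPickle/urllib2 modules. A rough Python 3 sketch of the same POST, with the compression hoisted out of the loop, might look like the following; the receiving endpoint is an assumption, not part of the excerpt.

import pickle
import zlib
import urllib.request

def send_update(urls, ts, slurm_jobs, hn2data, slurm_nodes):
    # The payload is identical for every destination, so serialize and compress once.
    payload = zlib.compress(
        pickle.dumps((ts, slurm_jobs, hn2data, slurm_nodes), protocol=-1))
    for url in urls:
        req = urllib.request.Request(
            url, data=payload,
            headers={'Content-Type': 'application/octet-stream'})
        with urllib.request.urlopen(req) as resp:   # assumes the peer accepts raw octet-stream POSTs
            print(url, resp.status)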
Example #4
    def dealData(self, ts, slurmJobs, slurmNodes, msgs):
        # convenience data structure: node -> uid -> allocated CPU count
        node2uid2cpuCnt = self.getUserAllocCPUOnNode(slurmJobs)

        nodeUserProcs = {}  #node:state, delta, ts, [user, procs]
        #update the information using msg
        # Generate a record for every host to reflect the current SLURM status,
        # even if we don't have a msg for it.
        for hostname, slurmNode in slurmNodes.items():
            pre_ts, pre_procs, pre_nodeUserProcs = self.savNode2TsProcs[
                hostname]
            if (hostname not in msgs) or (pre_ts >=
                                          msgs[hostname][-1]['hdr']['msg_ts']):
                logger.debug(
                    "No new data of {}. Use previous data at {}".format(
                        hostname, MyTool.getTsString(pre_ts)))
                if hostname in msgs:
                    old_msgs = msgs.pop(hostname)
                    logger.debug(
                        "\tIgnore the incoming older data at {}.".format(
                            MyTool.getTsString(old_msgs[-1]['hdr']['msg_ts'])))
                if pre_ts != -1:
                    nodeUserProcs[hostname] = pre_nodeUserProcs
                else:
                    nodeUserProcs[hostname] = [
                        slurmNode.get('state', '?STATE?'), 0.0, ts
                    ]
            else:  #hostname in msgs:
                host_msgs = msgs.pop(hostname)
                msg = host_msgs[-1]  # get the latest message
                msg_ts = msg['hdr']['msg_ts']
                msg_procs = dict([(proc['pid'], proc)
                                  for proc in msg['processes']])
                if len(host_msgs) > 1:
                    pre_msg = host_msgs[-2]
                    if (pre_ts < pre_msg['hdr']['msg_ts']) and (
                            pre_msg['hdr']['msg_ts'] <
                            msg_ts):  #saved value is older
                        pre_ts = pre_msg['hdr']['msg_ts']
                        pre_procs = dict([(proc['pid'], proc)
                                          for proc in pre_msg['processes']])

                delta = 0.0 if -1.0 == pre_ts else msg_ts - pre_ts
                procsByUser = self.getProcsByUser(
                    hostname, msg_ts, msg_procs, pre_ts, pre_procs,
                    node2uid2cpuCnt.get(hostname, {}))
                nodeUserProcs[hostname] = [
                    slurmNode.get('state', '?STATE?'), delta, msg_ts
                ] + procsByUser
                # update savNode2TsProcs
                self.savNode2TsProcs[hostname] = (msg_ts, msg_procs,
                                                  nodeUserProcs[hostname])

                #save information to files
                #logger.debug("writeData {}:{}".format(ts, hostname))
                if self.write_file_flag:
                    self.hostData_dir.writeData(hostname, ts,
                                                nodeUserProcs[hostname])
                else:
                    logger.debug("simulate write to file")

        self.discardMessage(msgs)
        self.sendUpdate(ts, slurmJobs, nodeUserProcs, slurmNodes)
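dealData reads self.savNode2TsProcs[hostname] for every host before it has necessarily seen a message from it, and later tests pre_ts against -1. That implies the cache is initialized with a sentinel entry per host; one way to get that behavior is a defaultdict, sketched below as an assumption since the initialization is not part of this excerpt.

from collections import defaultdict

class NodeDataHandler:                 # hypothetical owner of dealData
    def __init__(self):
        # Hosts we have not heard from yet get a sentinel previous timestamp of -1,
        # an empty previous process map, and no previous per-user record.
        self.savNode2TsProcs = defaultdict(lambda: (-1, {}, None))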
Example #5
def display_user_GPU(user_name):
    ts = int(time.time())
    uid = MyTool.getUid(user_name)
    if not uid:
        print("{} User {} does not exist.".format(MyTool.getTsString(ts),
                                                  user_name))
        return
    user_job_lst = PyslurmQuery.getUserCurrJobs(uid)
    if not user_job_lst:
        print("{} User {} does not have running jobs.".format(
            MyTool.getTsString(ts), user_name))
        return
    node_dict = PyslurmQuery.getAllNodes()
    job_gpu_d = {job['job_id']: PyslurmQuery.getJobAllocGPU(job, node_dict)
                 for job in user_job_lst}

    u_node = [
        node_name for g_alloc_d in job_gpu_d.values()
        for node_name in g_alloc_d
    ]
    u_gpu_cnt = sum([
        len(g_lst) for g_alloc_d in job_gpu_d.values()
        for g_lst in g_alloc_d.values()
    ])
    g_union = reduce(lambda rlt, curr: rlt.union(set(curr)), [
        g_lst for g_alloc_d in job_gpu_d.values()
        for g_lst in g_alloc_d.values()
    ], set())
    print("{} User {} has {} running jobs,\talloc {} GPUs on {} GPU nodes.".
          format(MyTool.getTsString(ts), user_name, len(user_job_lst),
                 u_gpu_cnt, len(u_node)))
    #get gpu data
    if u_node:  #GPU nodes allocated
        gpu_data = BrightRestClient().getGPU(u_node,
                                             min([
                                                 job['start_time']
                                                 for job in user_job_lst
                                                 if job_gpu_d[job['job_id']]
                                             ]),
                                             list(g_union),
                                             msec=False)
    else:
        gpu_data = {}
    print("\t{:10}{:20}{:>16}{:>20}{:>25}".format("Jid", "Job run time",
                                                  "Node.GPU", "Job avg util",
                                                  "Avg util (5,10,30min)"))
    for job in user_job_lst:
        jid = job['job_id']
        j_run_time = str(datetime.timedelta(seconds=ts - job['start_time']))
        j_first_ln = True
        if not job_gpu_d[jid]:  # job not using GPU
            print("\t{:<10}{:20}{:>16}".format(job['job_id'], j_run_time,
                                               'No GPU'))
            continue
        for node, g_lst in job_gpu_d[jid].items():
            for g in g_lst:
                g_name = '{}.gpu{}'.format(node, g)
                g_data = gpu_data[g_name]
                g_avg = MyTool.getTimeSeqAvg(g_data, job['start_time'], ts)
                g_avg1 = MyTool.getTimeSeqAvg(g_data, ts - 5 * 60, ts)
                g_avg2 = MyTool.getTimeSeqAvg(g_data, ts - 10 * 60, ts)
                g_avg3 = MyTool.getTimeSeqAvg(g_data, ts - 30 * 60, ts)
                if j_first_ln:
                    print(
                        "\t{:<10}{:20}{:>16}{:>20.2f}{:>10.2f},{:>6.2f},{:>6.2f}"
                        .format(jid, j_run_time, g_name, g_avg * 100,
                                g_avg1 * 100, g_avg2 * 100, g_avg3 * 100))
                    j_first_ln = False
                else:
                    print(
                        "\t{:<10}{:20}{:>16}{:>20.2f}{:>10.2f},{:>6.2f},{:>6.2f}"
                        .format('', '', g_name, g_avg * 100, g_avg1 * 100,
                                g_avg2 * 100, g_avg3 * 100))
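u_node keeps one entry per (job, node) pair, so a node shared by two of the user's jobs is counted twice in the "GPU nodes" total, and nodes whose GPU list is empty are counted as well. If distinct GPU nodes are what the summary line intends, a set comprehension gives that directly; the job ids and node names below are made up to illustrate the difference.

# Hypothetical allocations for two jobs of one user that share a node.
job_gpu_d = {101: {"workergpu001": [0, 1]},
             102: {"workergpu001": [2], "workergpu002": [0]}}

u_node = [n for g_alloc_d in job_gpu_d.values() for n in g_alloc_d]
distinct = {n for g_alloc_d in job_gpu_d.values() for n in g_alloc_d if g_alloc_d[n]}
print(len(u_node), len(distinct))   # 3 2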