-
Notifications
You must be signed in to change notification settings - Fork 0
/
monitor.py
130 lines (104 loc) · 3.9 KB
/
monitor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import porthole
import subprocess
from time import sleep
import sys
# Mission control client of this instance. Its name decides the role of the
# process: the instance named "server" acts as master, any other name is
# treated as a slave (see the dispatch at the bottom of the file).
mcc = getMissionControlClient()
hostname = mcc.getName()
# Configuration
# Host name handed to the slaves in their --mc option so they can reach the
# master's mission control server.
serverName = "gozer.at.northwestern.edu"
# Cluster nodes: one monitor slave is launched on each of them over ssh.
nodeNames = [
'ray', 'peter', 'egon', 'winston', 'slimer', 'zuul', 'staypuft'
]
# update settings
# Minimum gap between two updates, compared against the `time` argument of the
# update callbacks; also used as the sleep() duration (seconds) between the two
# CPU counter samples taken by poll_cpus().
updateInterval = 1
# `time` value at which the last update was performed.
lastUpdateTime = 0
# This var stores data from the nodes. Slaves fill it remotely by posting
# `data["<hostname>"] = [cpuUsage, gpuUsage]` commands to the server.
data = {}
# this will store the porthole service on the master node (set by
# setup_master(), stays None on slaves)
ps = None
#-------------------------------------------------------------------------------
# master functions
# kill all monitor slaves on the cluster
def kill_slaves(nodeNames):
    """Kill leftover monitor processes on every node in ``nodeNames``.

    For each host an ssh command is launched through ``olaunch`` that kills
    the pids reported by ``pgrep -f monitor``; a 0.1 second pause follows
    every launch.
    """
    from time import sleep

    kill_template = "ssh -n {0} kill $(pgrep -f monitor)"
    for host in nodeNames:
        # pgrep -f matches the pattern against the full command line.
        olaunch(kill_template.format(host))
        sleep(0.1)
def setup_master():
    """Bring up the master side of the monitor.

    Reads the mission control port from ``sys.argv[1]`` and the web server
    port from ``sys.argv[2]``, cleans up stale slaves, launches one monitor
    slave per entry of ``nodeNames`` over ssh and finally starts the porthole
    web server, storing its service object in the global ``ps``.

    Raises:
        IndexError: if fewer than two command line arguments were given.
        ValueError: if a port argument is not an integer.
    """
    import atexit
    # os.getcwd() is needed below, but the file never imports os at module
    # level; import it here so the function does not depend on the host
    # environment injecting it.
    import os

    global ps

    missionControlPort = int(sys.argv[1])
    webServerPort = int(sys.argv[2])

    # do a cleanup, and also register kill_slaves to be called on exit.
    kill_slaves(nodeNames)
    atexit.register(kill_slaves, nodeNames)

    # Launch the monitor slaves: each one starts in the master's working
    # directory and connects back to serverName:missionControlPort under its
    # own node name.
    workingDir = os.getcwd()
    for node in nodeNames:
        cmd = "ssh -n {0} cd {1}; {2} -c system/headless.cfg monitor.py --mc @{3}:{4} -N {5} -L off --interactive-off".format(
            node, workingDir, ogetexecpath(), serverName, missionControlPort, node)
        olaunch(cmd)
        sleep(0.1)

    # Launch the web server
    porthole.initialize(webServerPort, './index.html')
    ps = porthole.getService()
#-------------------------------------------------------------------------------
# slave functions
def getcputime(stat_path='/proc/stat'):
    """Read the cumulative CPU time counters from a /proc/stat style file.

    Args:
        stat_path: file to parse; defaults to the kernel's ``/proc/stat``.

    Returns:
        A dict mapping each cpu label found in the file (``'cpu'``,
        ``'cpu0'``, ...) to ``{'total': <float>, 'idle': <float>}``, where
        ``idle`` is idle + iowait and ``total`` additionally includes user,
        nice, system, irq, softirq and steal time.
    """
    cpu_infos = {}
    with open(stat_path) as stat_file:
        for line in stat_file:
            if not line.startswith('cpu'):
                continue
            fields = line.split()
            cpu_id = fields[0]
            # Only the first eight counters are used. The number of columns
            # depends on the kernel version (newer kernels append guest and
            # guest_nice, older ones lack steal), so slice by position and
            # pad missing counters with zero instead of unpacking a fixed
            # number of fields.
            ticks = [float(value) for value in fields[1:9]]
            ticks += [0.0] * (8 - len(ticks))
            user, nice, system, idle, iowait, irq, softirq, steal = ticks
            idle_time = idle + iowait
            busy_time = user + nice + system + irq + softirq + steal
            cpu_infos[cpu_id] = {'total': idle_time + busy_time, 'idle': idle_time}
    return cpu_infos
def poll_cpus():
    """Measure the CPU load over one ``updateInterval``.

    Takes two snapshots with ``getcputime()`` separated by
    ``sleep(updateInterval)`` and derives the busy percentage of every cpu
    entry from the counter differences.

    Returns:
        A list of ``(cpu_label, percentage)`` tuples, in the iteration order
        of the first snapshot.
    """
    start = getcputime()
    sleep(updateInterval)
    stop = getcputime()
    cpu_load = []
    for cpu in start:
        if cpu not in stop:
            # The entry is missing from the second snapshot, so there is
            # nothing to compare against; skip it rather than fail.
            continue
        total_delta = stop[cpu]['total'] - start[cpu]['total']
        idle_delta = stop[cpu]['idle'] - start[cpu]['idle']
        if total_delta > 0:
            percentage = (total_delta - idle_delta) / total_delta * 100
        else:
            # Counters did not advance between the samples: report no load
            # instead of dividing by zero.
            percentage = 0.0
        cpu_load.append((cpu, percentage))
    return cpu_load
def poll_gpus(command=None):
    """Query the utilization of every GPU through ``nvidia-smi``.

    Args:
        command: argument list of the process to run. Defaults to
            ``nvidia-smi --query-gpu=utilization.gpu --format=csv``; a
            replacement must print the same format (an optional header line
            starting with ``utilization`` followed by one value per line).

    Returns:
        A list with one integer per output line, taken from the first
        whitespace-separated token of that line.

    Raises:
        subprocess.CalledProcessError: if the command exits with an error.
        ValueError: if a value line does not start with an integer.
    """
    if command is None:
        command = ['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv']
    # universal_newlines=True yields text on both Python 2 and Python 3, so
    # the str comparisons below never see a bytes object.
    out = subprocess.check_output(command, universal_newlines=True)
    res = []
    for line in out.splitlines():
        vals = line.split()
        # Skip blank lines (no first token to parse) and the csv header.
        if not vals or line.startswith('utilization'):
            continue
        res.append(int(vals[0]))
    return res
#-------------------------------------------------------------------------------
# update functions
# master update function: send data to webpages
def update_master(frame, time, dt):
    """Master update callback: push the collected node data to the web pages.

    At most once per ``updateInterval`` the current content of ``data`` is
    formatted into a javascript snippet and broadcast through the porthole
    service ``ps``. ``frame`` and ``dt`` are not used.
    """
    global lastUpdateTime
    elapsed = time - lastUpdateTime
    if not (elapsed > updateInterval):
        return
    lastUpdateTime = time
    script = "data = {0}; update()".format(data)
    ps.broadcastjs(script, '')
# slave update function: poll usage data and send it back to master
def update_slave(frame, time, dt):
    """Slave update callback: sample local usage and report it to the server.

    At most once per ``updateInterval`` the CPU and GPU usage are polled and
    posted through the mission control client as a command that stores them
    in the server's ``data`` dict under this node's ``hostname``. ``frame``
    and ``dt`` are not used.
    """
    global lastUpdateTime
    if not (time - lastUpdateTime > updateInterval):
        return
    lastUpdateTime = time
    cpuUsage = poll_cpus()
    gpuUsage = poll_gpus()
    report = '@server: data["{0}"] = [{1}, {2}]'.format(hostname, cpuUsage, gpuUsage)
    mcc.postCommand(report)
# Pick the role of this instance from its mission control name: every
# instance not named "server" only reports its own usage, while the server
# sets up the cluster and forwards the collected data to the web pages.
if hostname != "server":
    setUpdateFunction(update_slave)
else:
    setup_master()
    setUpdateFunction(update_master)