#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
ClusterManager manages a set of remote workers and distributes a list of jobs using a greedy policy: jobs are
assigned, in order, to the first free worker. Transfers and communications are done over SSH.
The manager creates a temporary environment for each job, and can copy files to and from that environment (via
relative paths) or anywhere else (via absolute paths).
Extreme care is recommended with both the commands and the file paths passed in: this script performs no checks
whatsoever - it's on you!
"""
__author__ = "Sebastian Sardina, Marco Tamassia, and Nir Lipovetzky"
__copyright__ = "Copyright 2017-2018"
__license__ = "GPLv3"
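# Requires Python 3. Third-party dependencies (assumed to be installed, e.g., via pip): paramiko for SSH/SFTP
# and joblib for thread-based parallelism.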
from collections import namedtuple
from queue import Queue
import random
from time import sleep
import os
import datetime
from joblib import Parallel, delayed
from getpass import getpass, getuser
# doc for paramiko: http://docs.paramiko.org/en/2.4/api/client.html
from paramiko.config import SSHConfig
from paramiko.client import SSHClient
from paramiko.rsakey import RSAKey
from paramiko.proxy import ProxyCommand
from paramiko import AutoAddPolicy
import logging
import traceback
logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO,
datefmt='%a, %d %b %Y %H:%M:%S')
logging.getLogger("paramiko").setLevel(logging.WARNING)
# ----------------------------------------------------------------------------------------------------------------------
# Data structures describing hosts, jobs, and file transfers
Host = namedtuple('Host', ['no_cpu', 'hostname', 'username', 'password', 'key_filename', 'key_password'])
Job = namedtuple('Job', ['command', 'required_files', 'return_files', 'id', 'data'])
TransferableFile = namedtuple('TransferableFile', ['local_path', 'remote_path'])
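# A minimal sketch of how these records are meant to be filled in (all values below are hypothetical):
#
#   host = Host(no_cpu=4, hostname='worker1.example.com', username='pacman', password=None,
#               key_filename='/home/me/.ssh/id_rsa', key_password=None)
#   job = Job(command='sh run.sh > out.log', required_files=[],
#             return_files=[TransferableFile(local_path='out.log', remote_path='out.log')],
#             id='match-1', data=None)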
# Keep track of the number of total jobs to run and number of jobs completed (for reporting)
no_total_jobs = 0
no_successful_jobs = 0
no_failed_jobs = 0
time_games = [] # list of seconds, one per game finished
time_start = datetime.datetime.now()
CORE_PACKAGE_DIR = '/tmp/pacman_files'
NO_LOCAL_RETRIES = 1  # number of times a job is retried on its worker when a remote command fails (e.g., connection lost)
NO_GLOBAL_TRIES = 2   # maximum number of full passes over the job list; failed jobs are re-queued between passes
class ErrorInGame(Exception):
    '''Raised when a game (job) fails to run correctly on a remote worker.'''
class ClusterManager:
    def __init__(self, hosts, jobs, core_req_file=None):
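        """Authenticate against every host and build one SSH worker per CPU, optionally copy the core package
        files to each host, and finally place all workers in the shared pool.

        :param hosts: list of Host tuples describing the machines to use
        :param jobs: list of Job tuples to distribute across the workers
        :param core_req_file: optional list of TransferableFile tuples, copied once per host before any job runs
        """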
self.hosts = hosts # type: 'List[Host]'
self.jobs = jobs # type: 'List[Job]'
self.workers = [] # type: 'List[SSHClient]'
self.pool = Queue() # type: 'Queue[SSHClient]'
self.no_tries = NO_LOCAL_RETRIES
total_no_workers = sum(host.no_cpu for host in hosts)
        # https://pythonhosted.org/joblib/generated/joblib.Parallel.html
global no_total_jobs
no_total_jobs = len(self.jobs)
logging.info('ABOUT TO RUN %d jobs in %d hosts (%d CPUs) #####################' % \
(no_total_jobs, len(hosts), total_no_workers))
        # First, authenticate and build all workers (each hostname + core gives one worker)
self.workers = Parallel(total_no_workers, backend='threading')(delayed(create_worker)(host)
for host in self.hosts
for _ in range(host.no_cpu))
        # Second, transfer the required core files to each hostname, if any
        # (this is worthwhile because there are far fewer hosts than workers: just one transfer per IP)
logging.info('FIRST COPYING REQUIRED FILES TO HOSTS....')
        if core_req_file is not None:
Parallel(len(self.hosts), backend='threading')(
delayed(transfer_core_package)(host.hostname, self.workers, core_req_file)
for host in self.hosts)
# Put all workers in pool
for worker in self.workers:
self.pool.put(worker)
def start(self):
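        """Run all jobs over the worker pool, re-queueing failed jobs on later passes (up to NO_GLOBAL_TRIES).

        Returns a list of tuples (job.data, exit_code, stdout, stderr, seconds_taken), one per job; jobs that
        still fail on the last pass are included with exit code -1.
        """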
global time_start
time_start = datetime.datetime.now()
jobs_list = self.jobs
results = [] # list of results: job.data, exit_code, result_out, result_err, job_secs_taken
try_no = 0
while jobs_list:
            try_no += 1
results_run = Parallel(self.pool.qsize(), backend='threading')(delayed(run_job)(self.pool, job)
for job in jobs_list)
if try_no < NO_GLOBAL_TRIES:
games_failed = [job_data for job_data, exit_code, _, _, _ in results_run if exit_code == -1]
jobs_list = [j for j in jobs_list if j.data in games_failed] # extract failed jobs (to retry)
                good_results = [tuple(result) for result in results_run if result[1] != -1]
results = results + good_results # keep non-error results only (rest will be re-tried)
                print('============================ ONE FULL PASS ON JOBS COMPLETED ============================')
else:
                # tough luck: include failed jobs in the results as they came back, with exit code -1 (failed)...
results = results + results_run
break
if len(time_games) > 0:
avg_secs_game = round(sum(time_games) / len(time_games), 0)
max_secs_game = round(max(time_games), 0)
else:
avg_secs_game = 0
max_secs_game = 0
logging.info("STATISTICS: {} games played / {} per game / {} the longest game"
.format(no_successful_jobs,
str(datetime.timedelta(seconds=avg_secs_game)),
str(datetime.timedelta(seconds=max_secs_game))))
return results
def create_worker(host):
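    """Build and connect an SSHClient for the given Host, honouring any ProxyCommand found in ~/.ssh/config.

    The connection details (hostname, username, password, pkey, proxy) are stashed on the client object so
    that run_job() can reconnect later if the connection drops.
    """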
config = SSHConfig()
proxy = None
    if os.path.exists(os.path.expanduser('~/.ssh/config')):
        with open(os.path.expanduser('~/.ssh/config')) as ssh_config_file:
            config.parse(ssh_config_file)
if host.hostname is not None and \
'proxycommand' in config.lookup(host.hostname):
proxy = ProxyCommand(config.lookup(host.hostname)['proxycommand'])
# proxy = paramiko.ProxyCommand("ssh -o StrictHostKeyChecking=no e62439@131.170.5.132 nc 118.138.239.241 22")
worker = SSHClient()
worker.load_system_host_keys()
worker.set_missing_host_key_policy(AutoAddPolicy())
worker.hostname = host.hostname # store all this for later reference (e.g., logging, reconnection)
worker.username = host.username
worker.password = host.password
worker.proxy = proxy
    if host.key_filename is not None:
worker.pkey = RSAKey.from_private_key_file(host.key_filename, host.key_password)
else:
worker.pkey = None
# time.sleep(4)
# worker.connect(hostname=host.hostname, username=host.username, password=host.password, key_filename=host.key_filename, sock=proxy, timeout=3600)
worker.connect(hostname=host.hostname, username=host.username, password=host.password, pkey=worker.pkey, sock=proxy)
return worker
# Transfer the core package and leave it in /tmp/pacman_files
def transfer_core_package(hostname, workers, required_files):
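    """Copy the core package files to a single worker per hostname (all workers on a host share the
    filesystem), after first wiping any /tmp/cluster_instance* directories left over from previous runs."""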
    # Find one worker on this hostname and transfer the required files to their remote paths
for worker in workers:
if worker.hostname == hostname:
# clean temporary directory of worker
worker.exec_command('rm -rf /tmp/cluster_instance*')
logging.info("[START] CORE PACKAGE TRANSFERED TO HOST %s\n" % hostname)
sftp = worker.open_sftp()
for tf in required_files:
sftp.put(localpath=tf.local_path, remotepath=tf.remote_path)
sftp.close()
logging.info("[END] CORE PACKAGE TRANSFERED TO HOST %s\n" % hostname)
break
return
def run_job(pool, job):
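    """Take a worker from the pool, run the job on it (retrying locally up to NO_LOCAL_RETRIES times), and
    put the worker back. Returns a tuple (job.data, exit_code, stdout, stderr, seconds_taken), with exit
    code -1 if the job ultimately failed."""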
global no_successful_jobs
global no_failed_jobs
global no_total_jobs
# worker is a SSHClient
worker = pool.get()
    # We try NO_LOCAL_RETRIES times - and then give up....
for i in range(NO_LOCAL_RETRIES):
try:
# time.sleep(randint(1, 10))
# TODO: does not work when filename has a ' like Sebcant'code
result_job_on_worker = run_job_on_worker(worker, job)
no_successful_jobs += 1
#TODO: this captures any error that may happen when doing the job in the worker. Is it enough?
except ErrorInGame as e:
            # Somehow, for some games, the zip does not uncompress well.....
logging.error("Job with ID {} has FAILED (will retry) with exception: {}".format(job.id, str(e)))
if i < NO_LOCAL_RETRIES - 1: # i is zero indexed
# sleep(4)
continue
else:
no_failed_jobs += 1
                logging.error(
                    "Giving up on locally retrying job {} on worker {}: too many local failures...".format(
                        job.id, worker.hostname))
result_job_on_worker = job.data, -1, '', 'Match did not work: {}'.format(str(e)), 1
except Exception as e:
logging.error("Somehow the following job FAILED to execute (will reconnect & retry): {} with exception: {}".format(str(job.id), str(job)))
traceback.print_exc()
worker.close()
worker.connect(hostname=worker.hostname, username=worker.username, password=worker.password,
pkey=worker.pkey, sock=worker.proxy)
if i < NO_LOCAL_RETRIES - 1: # i is zero indexed
continue
else:
no_failed_jobs += 1
logging.error("I am giving up on job %s" % str(job.id))
result_job_on_worker = job.data, -1, '', 'Match did not work', 1
break
games_played = no_successful_jobs + no_failed_jobs
games_left = no_total_jobs - no_successful_jobs
secs_so_far = (datetime.datetime.now() - time_start).total_seconds()
est_time_left = round((games_left * secs_so_far) / games_played, 0)
logging.info(
"Jobs COMPLETED: (%d successful, %d failed) of %d total games (%d games left; estimated time left: %s)"
% (no_successful_jobs, no_failed_jobs, no_total_jobs, games_left, str(datetime.timedelta(seconds=est_time_left))))
pool.put(worker)
return result_job_on_worker
def report_progress_bytes_transfered(xfer, to_be_xfer, job_id):
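    """SFTP progress callback: log how many of to_be_xfer bytes have been transferred so far for job_id."""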
    percent_complete = (xfer / to_be_xfer) * 100
    logging.debug(
        'Complete percent for job %s: %.2f%% - (%d bytes transferred out of %d)' % (job_id, percent_complete, xfer, to_be_xfer))
def report_match(job):
    # job.data is game-specific (for pacman: ((team1, ...), (team2, ...), layout)); fall back to the job id otherwise
    if job.data is None:
        return job.id
    return job.data[0][0] + " vs " + job.data[1][0] + " in map " + job.data[2]
def _rmdir(sftp, path):
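    """Recursively delete the remote directory at path (paramiko's SFTP client has no recursive rmdir)."""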
files = sftp.listdir(path)
for f in files:
filepath = os.path.join(path, f)
try:
sftp.remove(filepath)
except IOError:
_rmdir(sftp, filepath)
sftp.rmdir(path)
def run_job_on_worker(worker, job):
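    """Run one job on the given worker: create a fresh /tmp/cluster_instance_* directory, upload the job's
    required files, execute its command there, download the return files, and clean up. Raises ErrorInGame
    if the command exits with a non-zero code."""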
# worker is an SSHClient
# create remote env
    instance_id = '{}-{}'.format(job.id.replace(' ', '_'), datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
dest_dir = '/tmp/cluster_instance_{}'.format(instance_id)
logging.info('ABOUT TO RUN JOB in host %s (%s): %s' % (worker.hostname, dest_dir, report_match(job)))
sftp = worker.open_sftp()
try:
sftp.mkdir(dest_dir)
except IOError: # dir already exists!
logging.debug('Directory {} seems to exist in {}. Deleting it... '.format(dest_dir, worker.hostname))
_rmdir(sftp, dest_dir)
# worker.exec_command('rm -rf %s' % dest_dir)
sftp.mkdir(dest_dir)
    except Exception:  # any other error while creating the remote directory
logging.debug('Error creating directory {} in host {}.'.format(dest_dir, worker.hostname))
_rmdir(sftp, dest_dir)
sftp.mkdir(dest_dir)
sftp.chdir(dest_dir)
# copy core package into the temporary dir for this particular job
# worker.exec_command('cp -a %s/* %s' % (CORE_PACKAGE_DIR, dest_dir))
# logging.debug('GAME PREPARED AND COPIED in host %s (%s): %s' % (worker.hostname, dest_dir, report_match(job)))
# If the job requires files transfer them to the remote path
# (for pacman now, required files is empty, as we transfer the core package once at the start and then copy it)
for tf in job.required_files:
# sftp.put(localpath=tf.local_path, remotepath=tf.remote_path,
# callback=lambda x, y: report_progress_bytes_transfered(x, y, str(job.id)))
sftp.put(localpath=tf.local_path, remotepath=tf.remote_path)
logging.debug('ABOUT TO EXECUTE command in host %s dir %s: %s' % (worker.hostname, dest_dir, job.command))
# run job
startTime = datetime.datetime.now()
actual_command = """cd %s ; sh -c '%s'""" % (dest_dir, job.command)
try:
# TODO: do we want to put a timeout here in case the call does not return? some pacman games take 3 min eh
# _, ssh_stdout, ssh_stderr = worker.exec_command(actual_command, timeout=60, get_pty=True) # Non-blocking call
_, ssh_stdout, ssh_stderr = worker.exec_command(actual_command, get_pty=True) # Non-blocking call
        result_out = ssh_stdout.read()   # read() blocks until the remote command has finished
        result_err = ssh_stderr.read()
        exit_code = ssh_stdout.channel.recv_exit_status()  # blocking, but safe: both streams have already been drained
# if random.randint(0, 10) > 5: # to force failure!
# exit_code = -1
        if exit_code != 0:
raise ErrorInGame('Error in running game - cmd: {}'.format(actual_command))
except ErrorInGame:
raise
except Exception as e:
        job_secs_taken = (datetime.datetime.now() - startTime).total_seconds()
        logging.warning('ERROR (possibly a timeout) in host %s (%s secs. taken; %s): %s' % (
            worker.hostname, job_secs_taken, dest_dir, report_match(job)))
raise
job_secs_taken = (datetime.datetime.now().replace(microsecond=0) - startTime.replace(microsecond=0)).total_seconds()
# if job_secs_taken < 3:
# print('Strange, game too short, something bad happened, failing it....')
# raise ErrorInGame('Error in running game - cmd: {}'.format(actual_command))
time_games.append(job_secs_taken)
logging.debug(
'END OF GAME in host %s (%s) - START COPYING BACK RESULT: %s' % (worker.hostname, dest_dir, report_match(job)))
    # Retrieve the return files (e.g., the replay file)
for tf in job.return_files:
#print(tf)
sftp.get(localpath=tf.local_path, remotepath=tf.remote_path)
sftp.close()
# clean temporary directory for game
worker.exec_command('rm -rf %s' % dest_dir)
    logging.info('FINISHED GAME in host %s (%s secs. taken; %s): %s' % (
        worker.hostname, job_secs_taken, dest_dir, report_match(job)))
logging.debug(
'FINISHED SUCCESSFULLY EXECUTING command in host %s dir %s: %s' % (worker.hostname, dest_dir, job.command))
return job.data, exit_code, result_out, result_err, job_secs_taken
if __name__ == '__main__':
"""
Little demo:
- connects to localhost
    - executes the following 10 times, using 2 workers in parallel:
- copy the source of this script to the worker
- sleep 1 second
- trim the copied file keeping only the first line
- add some stuff to the file
- copy the file back to the directory of this script
"""
hosts = [
# prompt for password (for password authentication or if private key is password protected)
        Host(no_cpu=2, hostname='localhost', username=getuser(), password=getpass(), key_filename=None, key_password=None)
# use this if no pass is necessary (for private key authentication)
        # Host(no_cpu=2, hostname='localhost', username=getuser(), password=None, key_filename=None, key_password=None)
]
jobs = []
for i in range(10):
instance_id = ''.join(random.choice('0123456789abcdef') for _ in range(30))
test_file = "%s.txt" % instance_id
command = "sleep 1; cat %s | head -1 > a.txt ; cat a.txt > %s ; ls -l >> %s ; echo ciao >> %s" % (
test_file, test_file, test_file, test_file)
req_file = TransferableFile(local_path='cluster_manager.py', remote_path=test_file)
ret_file = TransferableFile(local_path=test_file, remote_path=test_file)
jobs.append(Job(command=command, required_files=[req_file], return_files=[ret_file], data=None, id='test'))
cm = ClusterManager(hosts=hosts, jobs=jobs)
os.chdir(os.path.dirname(os.path.abspath(__file__)))
cm.start()