def workflow(self, comb): """ Compute one application launch using a given parameter group """ comb_ok = False try: # Generate configuration file needed by MPI processes logger.info("Generating assembly file...") py = comb['cores'] / comb['px'] prepare = Process('cd %s && python %s %d %d %d %d %d %s app.lad' % (self.workingPath, self.genLadScript, comb['datasize'], comb['datasize'], comb['datasize'], comb['px'], py, comb['transposition'])) prepare.shell = True prepare.run() # Generate the MPI host file mfile = self.generate_machine_file() # Start L2C lad = "./app.lad" logger.info("Computing...") res = Process("export OAR_JOB_KEY_FILE=~/.oar_key ; cd %s && l2c_loader -M,-machinefile,%s --mpi -c %d %s" % (self.workingPath, mfile, comb['cores'], lad)) res.shell = True res.stdout_handlers.append(os.path.join(self.result_dir, slugify(comb) + '.out')) res.stdout_handlers.append(sys.stdout) res.stderr_handlers.append(os.path.join(self.result_dir, slugify(comb) + '.err')) res.stderr_handlers.append(sys.stderr) res.run() if not res.ok: logger.error('Bad L2C termination') raise Exception('Bad L2C termination') if len(res.stderr) > 0: # WARNING: when L2C cannot find the LAD file or something strange like this logger.warning('Not empty error output') # Clean configuration files logger.info("Removing assembly files...") res = Process('cd %s && rm -f app.lad*' % self.workingPath) res.shell = True res.run() comb_ok = True except Exception: pass finally: if comb_ok: self.sweeper.done(comb) logger.info(style.host(slugify(comb)) + ' has been done') else: self.sweeper.cancel(comb) logger.warning(style.host(slugify(comb)) + ' has been canceled') logger.info(style.step('%s Remaining'), len(self.sweeper.get_remaining()))
def run_xp(self): """Iterate over the parameters and execute the bench""" while len(self.sweeper.get_remaining()) > 0: comb = self.sweeper.get_next() if comb['n_core'] > get_host_attributes(comb['cluster']+'-1')['architecture']['smt_size'] * self.n_nodes: self.sweeper.skip(comb) continue logger.info('Processing new combination %s' % (comb,)) site = get_cluster_site(comb['cluster']) jobs = oarsub([(OarSubmission(resources = "{cluster='" + comb['cluster']+"'}/nodes=" + str(self.n_nodes), job_type = 'allow_classic_ssh', walltime ='0:10:00'), site)]) if jobs[0][0]: try: wait_oar_job_start(*jobs[0]) nodes = get_oar_job_nodes(*jobs[0]) bench_cmd = 'mpirun -H %s -n %i %s ~/NPB3.3-MPI/bin/lu.%s.%i' % ( ",".join([node.address for node in nodes]), comb['n_core'], get_mpi_opts(comb['cluster']), comb['size'], comb['n_core']) lu_bench = SshProcess(bench_cmd, nodes[0]) lu_bench.stdout_handlers.append(self.result_dir + '/' + slugify(comb) + '.out') lu_bench.run() if lu_bench.ok: logger.info("comb ok: %s" % (comb,)) self.sweeper.done(comb) continue finally: oardel(jobs) logger.info("comb NOT ok: %s" % (comb,)) self.sweeper.cancel(comb)
def submit_job(self, comb): """Use the batch script on psmn""" logger.info('Submit job on '+ jobserver) comb_dir = parent_dir + slugify(comb) + '/' job_sub = SshProcess('cd ' + comb_dir + ' ; /usr/local/bin/qsub /home/stephane/ExamplePBS/batch_single', jobserver).run() return job_sub.stdout.splitlines()[-1].split('.')[0]
def submit_job(self, comb): """Use the batch script""" logger.info('Submiting job on '+ jobserver) comb_dir = parent_dir + slugify(comb) + '/' job_sub = sp.Popen('cd ' + comb_dir + ' ; /usr/local/bin/qsub /home/stephane/ExamplePBS/batch_single', shell=True, stdout=sp.PIPE, stderr=sp.STDOUT) return job_sub.stdout.readlines()[-1].split('.')[0]
def workflow(self, comb): self.create_par_file(comb) job_id = self.submit_job(comb) logger.info('Combination %s will be treated by job %s', slugify(comb), str(job_id)) while self.is_job_running(job_id): sleep(10) self.sweeper.done(comb)
def run(self): """ Main engine method to perform the experiment """ self.define_parameters() while len(self.sweeper.get_remaining()) > 0: # Getting the next combination comb = self.sweeper.get_next() logger.info(style.host(slugify(comb)) + ' has been started') self.get_nodes(comb) # If the job is broken, the program is stopped if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': break try: self.workflow(comb) # Process all combinations that can use the same submission while True: # Find the next combination combinations that can use the same submission subcomb = self.sweeper.get_next(lambda r: filter(lambda x: x['cores'] == comb['cores'] and x['cluster'] == comb['cluster'], r)) if not subcomb: logger.info('No more combination for cluster=%s and cores=%s', comb['cluster'], comb['cores']) break else: logger.info(style.host(slugify(subcomb)) + ' has been started') if get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error': self.workflow(subcomb) else: break # Whatever happens (errors, end of loop), the job is deleted finally: logger.info('Deleting job...') oardel([(self.oar_job_id, self.frontend)])
def create_par_file(self, comb): """Create Run directory on remote server and upload par file""" logger.info('Creating par file') comb_dir = parent_dir + slugify(comb) + '/' logger.info('comb_dir = ' + comb_dir) # Create remote directories mdir = sp.call('mkdir -p ' + comb_dir + 'Img ; mkdir -p ' + comb_dir + 'Op ; ', shell=True) # Generate par file par_file = 'par_' + slugify(comb) nml = f90nml.read('template.nml') nml['refstate']['ra0'] = float(comb['RA']) nml['tracersin']['K_Fe'] = comb['KFe'] nml['geometry']['r_cmb'] = comb['RCMB'] nztot = min(int(2**(math.log10(float(comb['RA']))+1)), 128) nml['geometry']['nztot'] = nztot nml['geometry']['nytot'] = int(math.pi*(comb['RCMB']+0.5)*nztot) nml.write(par_file, force=True) logger.info('Created par file ' + par_file) # Upload par file to remote directory cpar = sp.call('cp ' + par_file + ' ' + comb_dir, shell=True) mpar = sp.call('cd ' + comb_dir + ' ; mv ' + par_file+ ' par', shell=True) logger.info('Done')
def create_par_file(self, comb): """Create Run directory on remote server and upload par file""" logger.info('Creating and uploading par file') comb_dir = parent_dir + slugify(comb) + '/' logger.info('comb_dir = ' + comb_dir) # Create remote directories make_dirs = SshProcess('mkdir -p ' + comb_dir + 'Img ; mkdir -p ' + comb_dir + 'Op ; ', jobserver).run() # Generate par file par_file = 'par_' + slugify(comb) logger.info('par_file = %s', style.emph(par_file)) nml = f90nml.read('template.nml') nml['refstate']['ra0'] = float(comb['RA']) nml['tracersin']['K_Fe'] = comb['KFe'] nml['geometry']['r_cmb'] = comb['RCMB'] nztot = min(int(2**(math.log10(float(comb['RA']))+1)), 128) nml['geometry']['nztot'] = nztot nml['geometry']['nytot'] = int(math.pi*(comb['RCMB']+0.5)*nztot) nml.write(par_file, force=True) logger.info('Created par file ' + par_file) # Upload par file to remote directory Put([jobserver], [par_file], remote_location=comb_dir).run() SshProcess('cd ' + comb_dir + ' ; mv ' + par_file+ ' par', jobserver).run() logger.info('Done')
def run(self): """ """ token = 'bRIJb9jp5igAAAAAAAAACc5QzQ619Vp0pYa2PdIrt0q2y0qFyJgwrKvtzuTp3Sz_' client = dropbox.client.DropboxClient(token) parameters = {'size': igeom(128, 2048, 5), 'db_if': ['rest', 'sdk']} combs = sweep(parameters) sweeper = ParamSweeper(self.result_dir + "/sweeps", combs) f = open(self.result_dir + '/results.txt', 'w') while len(sweeper.get_remaining()) > 0: comb = sweeper.get_next() logger.info('Treating combination %s', pformat(comb)) comb_dir = self.result_dir + '/' + slugify(comb) try: os.mkdir(comb_dir) except: pass fname = self.create_file(comb['size']) timer = Timer() if comb['db_if'] == 'sdk': self.upload_file_sdk(client, fname, fname.split('/')[-1]) up_time = timer.elapsed() self.download_file_sdk(client, fname.split('/')[-1], comb_dir + fname.split('/')[-1]) dl_time = timer.elapsed() - up_time sweeper.done(comb) elif comb['db_if'] == 'rest': logger.warning('REST interface not implemented') sweeper.skip(comb) continue os.remove(fname) f.write("%f %i %f %f \n" % (timer.start_date(), comb['size'], up_time, dl_time)) f.close()
def run(self): sweeper = self.create_paramsweeper() while True: comb = sweeper.get_next() if not comb: break comb_dir = self.result_dir + '/' + slugify(comb) if not os.path.isdir(comb_dir): os.mkdir(comb_dir) comb_file = comb_dir + '/trace' g5k_configuration['kadeploy3'] = comb['version'] logger.info('Treating combination %s', pformat(comb)) get_version = SshProcess( comb['version'] + ' -v', comb['site'], connection_params=default_frontend_connection_params).run() logger.info(get_version.stdout) resources = "" if comb['kavlan']: resources += "{type='kavlan'}/vlan=1+" resources += "nodes=" + str(comb['n_nodes']) sub = OarSubmission(resources=resources, job_type='deploy', walltime="0:30:00", name='Kadeploy_Tests') logger.info('Performing submission of %s on site %s', resources, comb['site']) jobs = oarsub([(sub, comb['site'])]) if jobs[0][0]: try: logger.info('Waiting for job to start') wait_oar_job_start(jobs[0][0], jobs[0][1]) hosts = get_oar_job_nodes(jobs[0][0], jobs[0][1]) logger.info('Deployment of %s', ' '.join([host.address for host in hosts])) kavlan = get_oar_job_kavlan(jobs[0][0], jobs[0][1]) if kavlan: logger.info('In kavlan %s', kavlan) deployment = Deployment(hosts, env_name=comb['env'], vlan=kavlan) deployed, undeployed = deploy(deployment, stdout_handlers=[comb_file], stderr_handlers=[comb_file]) finally: logger.info('Destroying job %s on %s', str(jobs[0][0]), jobs[0][1]) oardel([(jobs[0][0], jobs[0][1])]) else: deployed = [] if len(undeployed) == 0: logger.info('%s is OK', slugify(comb)) elif len(deployed) == 0: logger.error('%s is KO', slugify(comb)) else: logger.warning('%s encountered problems with some hosts', slugify(comb)) sweeper.done(comb)
def run(self): """ run method from engine in order to do our workflow """ mongo = ClientMongo() size = dict if not self.options.file: if not self.options.only: size = { 1, long(self.options.size * 0.25), long(self.options.size * 0.5), long(self.options.size * 0.75), long(self.options.size) } else: size = {long(self.options.size)} else: if self.OnlyDownload: size = getFilSize(self.options.file) else: size = {0} drive = None if self.options.drive: drive = self.options.drive else: drive = self.drive interface = ['rest', 'sdk'] parameters = { 'size': size, 'if': interface, 'drive': drive, 'transfert': self.transfert } p = None for n in range(0, int(self.options.ntest), 1): logger.info('---------------------') logger.info('Round %i', n + 1) combs = sweep(parameters) date = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") pathResults = os.getcwd() + '/Results/Bench' + date sweeper = ParamSweeper(pathResults + "/sweeps", combs) f = open(pathResults + '/results.txt', 'w') while len(sweeper.get_remaining()) > 0: # sort the parameters for i in interface: for dr in drive: for s in size: comb = sweeper.get_next(filtr=lambda r: filter( lambda x: x['drive'] == dr and x['size'] == s and x['if'] == i, r)) if not comb: continue # start of the workflow if comb['drive'] == 'amazon': p = providerS3.ProviderS3() elif comb['drive'] == 'dropbox': p = providerDB.ProviderDB() else: p = providerGD.ProviderGD() logger.info('Treating combination %s', pformat(comb)) comb_dir = pathResults + '/' + slugify(comb) if not os.path.isdir(comb_dir): os.mkdir(comb_dir) if not self.options.file: fname = self.create_file(comb['size']) else: fname = self.options.file timer = Timer() up_time = 0 dl_time = 0 start_date = datetime.datetime.now() if comb['if'] == 'sdk': if p.provider_name == "amazon": # AMAZON clientAmz = p.getConnexion() if self.OnlyDownload: p.bucketKey += fname else: p.bucketKey += '/' + fname if comb['transfert'] == "upload" or comb[ 'transfert'] == 'upDown': p.upload_file_sdk( clientAmz.get_bucket(p.bucketName), p.bucketKey, fname) up_time = timer.elapsed() if comb['transfert'] == "download" or comb[ 'transfert'] == 'upDown': p.download_file_sdk( clientAmz.get_bucket(p.bucketName), p.bucketKey, comb_dir + '/' + fname.split('/')[-1]) dl_time = timer.elapsed() - up_time if not self.OnlyDownload: p.delete_file_sdk( clientAmz.get_bucket(p.bucketName), p.bucketKey) elif p.provider_name == "dropbox": # DROPBOX client = p.getToken() if comb['transfert'] == "upload" or comb[ 'transfert'] == 'upDown': p.upload_file_sdk( client, fname, fname.split('/')[-1]) up_time = timer.elapsed() if comb['transfert'] == "download" or comb[ 'transfert'] == 'upDown': p.download_file_sdk( client, fname.split('/')[-1], comb_dir + '/' + fname.split('/')[-1]) dl_time = timer.elapsed() - up_time if not self.OnlyDownload: p.delete_file(client, fname.split('/')[-1]) elif p.provider_name == "googledrive": # GOOGLEDRIVE drive_service = p.getConnexion() new_file = None if comb['transfert'] == 'upload' or comb[ 'transfert'] == 'upDown': new_file = p.upload_file_sdk( drive_service, fname, fname.split('/')[-1], 'text/plain') up_time = timer.elapsed() if comb['transfert'] == 'download' or comb[ 'transfert'] == 'upDown': p.download_file_sdk( drive_service, new_file, comb_dir + '/' + fname.split('/')[-1]) dl_time = timer.elapsed() - up_time if not self.OnlyDownload: p.delete_file_sdk( drive_service, new_file['id']) sweeper.done(comb) elif comb['if'] == 'rest': logger.warning( 'REST interface not implemented') sweeper.skip(comb) if not 
self.OnlyDownload: # logger.info('delete de '+fname) if os.path.isfile(fname): os.remove(fname) # delete only if rest is implmented # os.remove(comb_dir + '/' + fname.split('/')[-1]) continue if comb['transfert'] == "upload" or comb[ 'transfert'] == "upDown": f.write("%s %s %s %s %s %s %s %f %i %s %f\n" % (self.localisation['ip'], self.localisation['lat'], self.localisation['lon'], self.localisation['city'], self.localisation['country'], comb['drive'], comb['if'], timer.start_date(), comb['size'], "upload", up_time)) mongo.collection.insert({ 'ip': self.localisation['ip'], 'latitude': self.localisation['lat'], 'longitude': self.localisation['lon'], 'city': self.localisation['city'], 'country': self.localisation['country'], 'drive': comb['drive'], 'interface': comb['if'], 'start_date': start_date, 'size': comb['size'], 'transfert': 'upload', 'time': up_time }) if comb['transfert'] == "download" or comb[ 'transfert'] == "upDown": f.write("%s %s %s %s %s %s %s %f %i %s %f\n" % (self.localisation['ip'], self.localisation['lat'], self.localisation['lon'], self.localisation['city'], self.localisation['country'], comb['drive'], comb['if'], timer.start_date(), comb['size'], "download", dl_time)) mongo.collection.insert({ 'ip': self.localisation['ip'], 'latitude': self.localisation['lat'], 'longitude': self.localisation['lon'], 'city': self.localisation['city'], 'country': self.localisation['country'], 'drive': comb['drive'], 'interface': comb['if'], 'start_date': start_date, 'size': comb['size'], 'transfert': 'download', 'time': dl_time }) if not self.OnlyDownload: # logger.info('delete de '+fname) if os.path.isfile(fname): os.remove(fname) if os.path.isfile(comb_dir + '/' + fname): os.remove(comb_dir + '/' + fname.split('/')[-1]) f.close() # delete the Bench Folder os.rmdir(self.result_dir) logger.info("---------------------------------------") for t in check_Exp_database(self.options, self.localisation)['result']: logger.info(t)
def workflow(self, comb): """ Compute one application launch using a given parameter group """ comb_ok = False try: # Generate configuration file needed by MPI processes logger.info("Generating assembly file...") py = comb['cores'] / comb['px'] prepare = Process( 'cd %s && python %s %d %d %d %d %d %s app.lad' % (self.workingPath, self.genLadScript, comb['datasize'], comb['datasize'], comb['datasize'], comb['px'], py, comb['transposition'])) prepare.shell = True prepare.run() # Generate the MPI host file mfile = self.generate_machine_file() # Start L2C lad = "./app.lad" logger.info("Computing...") res = Process( "export OAR_JOB_KEY_FILE=~/.oar_key ; cd %s && l2c_loader -M,-machinefile,%s --mpi -c %d %s" % (self.workingPath, mfile, comb['cores'], lad)) res.shell = True res.stdout_handlers.append( os.path.join(self.result_dir, slugify(comb) + '.out')) res.stdout_handlers.append(sys.stdout) res.stderr_handlers.append( os.path.join(self.result_dir, slugify(comb) + '.err')) res.stderr_handlers.append(sys.stderr) res.run() if not res.ok: logger.error('Bad L2C termination') raise Exception('Bad L2C termination') if len( res.stderr ) > 0: # WARNING: when L2C cannot find the LAD file or something strange like this logger.warning('Not empty error output') # Clean configuration files logger.info("Removing assembly files...") res = Process('cd %s && rm -f app.lad*' % self.workingPath) res.shell = True res.run() comb_ok = True except Exception: pass finally: if comb_ok: self.sweeper.done(comb) logger.info(style.host(slugify(comb)) + ' has been done') else: self.sweeper.cancel(comb) logger.warning( style.host(slugify(comb)) + ' has been canceled') logger.info(style.step('%s Remaining'), len(self.sweeper.get_remaining()))
def workflow(self, comb, host, comb_dir): """ """ comb_ok = False thread_name = style.Thread(str(host).split('.')[0]) + ': ' logger.info(thread_name + 'Starting combination ' + slugify(comb)) try: self.export = "source ~/aevol_binary/intel/linux/mkl/bin/mklvars.sh intel64; " bucketname = self.working_dir + '/raevol_5_mut_lat/' + slugify( comb) + '/' if os.path.isdir(bucketname) and os.path.exists(bucketname + '/last_gener.txt'): logger.info(thread_name + "Resuming AEVOL from NFS backup") gen_file = open(bucketname + '/last_gener.txt', 'r') last_gen = gen_file.read() if int(last_gen) < 500000: logger.info(thread_name + "Resuming AEVOL Run from " + str(int(last_gen))) rem = Remote( self.export + 'cd ' + bucketname + '; /home/jorouzaudcornabas/aevol_binary/aevol/src/aevol_run -p 16' + ' -e 300000 -r ' + last_gen + ' >> aevol_run.log', [host]).run() if rem.ok: comb_ok = True else: comb_ok = True else: Remote('mkdir -p ' + bucketname, [host]).run() param_file = '/home/jorouzaudcornabas/aevol_binary/execo/mut_lat/param_tmpl.in' logger.info(thread_name + 'Generate config file ' + param_file) f_template = open(param_file) fd, outfile = mkstemp(dir='/tmp/', prefix=slugify(comb) + '_param') f = os.fdopen(fd, 'w') for line in f_template: line = line.replace('SEED_NUMBER', str(comb['seed'])) line = line.replace('FUZZY_VERSION', str(comb['fuzzy'])) if comb['move']: line = line.replace('FIRST_GAUSSIAN_MEDIAN', '0.25') line = line.replace('THIRD_GAUSSIAN_MEDIAN', '0.65') else: line = line.replace('FIRST_GAUSSIAN_MEDIAN', '0.2') line = line.replace('THIRD_GAUSSIAN_MEDIAN', '0.6') line = line.replace('GAUSSIAN_HEIGHT', str(comb['height'])) f.write(line) f_template.close() f.close() put_file = Put([host], [outfile], remote_location=bucketname).run() if not put_file.ok: exit() os.remove(outfile) Remote( 'cd ' + bucketname + '; cp ' + outfile.split('/')[-1] + ' param.in', [host]).run() logger.info(thread_name + "Launching AEVOL Create") Remote( self.export + 'cd ' + bucketname + '; /home/jorouzaudcornabas/aevol_diff_area/aevol/src/aevol_create > aevol_create.log', [host]).run() logger.info(thread_name + "Launching AEVOL Run") rem = Remote( self.export + 'cd ' + bucketname + '; /home/jorouzaudcornabas/aevol_diff_area/aevol/src/aevol_run -p 16 -n 500000 > aevol_run.log', [host]).run() if rem.ok: comb_ok = True logger.info(thread_name + 'Get results ' + comb_dir + "/" + slugify(comb)) #try: #os.mkdir(comb_dir + "/" + slugify(comb)) #except: #logger.warning(thread_name + #'%s already exists, removing existing files', comb_dir + "/" + slugify(comb)) #shutil.rmtree(comb_dir+ "/" + slugify(comb)) #try: #os.mkdir(comb_dir + "/" + slugify(comb)) #except: #logger.warning(thread_name + #'%s already exists, recreating directory', comb_dir + "/" + slugify(comb)) #get_results = Get([host], [bucketname+ "/aevol_create.log", bucketname+ "/aevol_run.log", bucketname+'/stats/'], #local_location=comb_dir + "/" + slugify(comb)).run() #for p in get_results.processes: #if not p.ok: #logger.error(thread_name + #': Unable to retrieve the files for combination %s', #slugify(comb)) #exit() finally: if comb_ok: self.sweeper.done(comb) # shutil.rmtree(bucketname) logger.info(thread_name + ': ' + slugify(comb) + \ ' has been done') else: self.sweeper.cancel(comb) logger.warning(thread_name + ': ' + slugify(comb) + \ ' has been canceled') logger.info(style.step('%s Remaining'), len(self.sweeper.get_remaining()))
def run(self): sweeper = self.create_paramsweeper() while True: comb = sweeper.get_next() if not comb: break comb_dir = self.result_dir + '/' + slugify(comb) if not os.path.isdir(comb_dir): os.mkdir(comb_dir) comb_file = comb_dir + '/trace' g5k_configuration['kadeploy3'] = comb['version'] logger.info('Treating combination %s', pformat(comb)) get_version = SshProcess(comb['version'] + ' -v', comb['site'], connection_params=default_frontend_connection_params).run() logger.info(get_version.stdout) resources = "" if comb['kavlan']: resources += "{type='kavlan'}/vlan=1+" resources += "nodes=" + str(comb['n_nodes']) sub = OarSubmission(resources=resources, job_type='deploy', walltime="0:30:00", name='Kadeploy_Tests') logger.info('Performing submission of %s on site %s', resources, comb['site']) jobs = oarsub([(sub, comb['site'])]) if jobs[0][0]: try: logger.info('Waiting for job to start') wait_oar_job_start(jobs[0][0], jobs[0][1]) hosts = get_oar_job_nodes(jobs[0][0], jobs[0][1]) logger.info('Deployment of %s', ' '.join([host.address for host in hosts])) kavlan = get_oar_job_kavlan(jobs[0][0], jobs[0][1]) if kavlan: logger.info('In kavlan %s', kavlan) deployment = Deployment(hosts, env_name=comb['env'], vlan=kavlan) deployed, undeployed = deploy(deployment, stdout_handlers=[comb_file], stderr_handlers=[comb_file]) finally: logger.info('Destroying job %s on %s', str(jobs[0][0]), jobs[0][1]) oardel([(jobs[0][0], jobs[0][1])]) else: deployed = [] if len(undeployed) == 0: logger.info('%s is OK', slugify(comb)) elif len(deployed) == 0: logger.error('%s is KO', slugify(comb)) else: logger.warning('%s encountered problems with some hosts', slugify(comb)) sweeper.done(comb)
def run_xp(self): master = self.cluster[0] opt = '' """Iterate over the parameters and execute the bench""" while len(self.sweeper.get_remaining()) > 0: # Take sweeper comb = self.sweeper.get_next() logger.info('Processing new combination %s' % (comb, )) try: # metric from linux sar tools, works with clock def takeMetric( path, startTime, endTime, metric=['cpu', 'mem', 'disk', 'swap', 'network']): opt = '' cmd_template_sar = ( "sar -f /var/log/sysstat/sa* -{opt} -s {startTime} -e {endTime}" ) for met in metric: if met == 'cpu': opt = 'u' elif met == 'mem': opt = 'r' elif met == 'disk': opt = 'dp' elif met == 'swap': opt = 'S' elif met == 'network': opt = 'n DEV' cmd = cmd_template_sar.format(opt=opt, startTime=startTime, endTime=endTime) for host in self.cluster: hE = SshProcess(cmd, host, connection_params={'user': '******'}) hE.run() stdMetric = host + '-' + met + '.txt' with open(os.path.join(path, stdMetric), "w") as sout: sout.write(hE.stdout) #Set CPU Freq and Policy according current combination cmd_template_Freq_Policy = ("cpufreq-set -r -g {policy}") cmd_template_Freq = ("cpufreq-set -r -f {freq}") if comb['Freq'] == 'OnDemand': cmd_freq_policy = cmd_template_Freq_Policy.format( policy='ondemand') Remote(cmd_freq_policy, master, connection_params={ 'user': '******' }).run() elif comb['Freq'] == 'conservative': cmd_freq_policy = cmd_template_Freq_Policy.format( policy='conservative') Remote(cmd_freq_policy, master, connection_params={ 'user': '******' }).run() else: cmd_freq_policy = cmd_template_Freq_Policy.format( policy='userspace') Remote(cmd_freq_policy, master, connection_params={ 'user': '******' }).run() cmd_freq = cmd_template_Freq.format(freq=comb['Freq']) Remote(cmd_freq, master, connection_params={ 'user': '******' }).run() # build command src = 'source /opt/intel-performance-snapshoot/apsvars.sh' cmd_mpirun_template = ( "mpirun {opt} -f /root/cluster.txt -np {pr1} aps -r '/tmp/log/' /tmp/NPB/npb-mpi/bin/{typeNPB}.{NPBclass}.{pr2}" ) cmd_mpirun = cmd_mpirun_template.format( opt='', pr1=comb['n_core'], typeNPB=comb['Benchmark'], NPBclass=comb['NPBclass'], pr2=comb['n_core']) cmd = "{}; /tmp/NPB/bin/runMPI.sh '{}' '{}'".format( src, cmd_mpirun, slugify(comb)) curPath = self.result_dir + slugify(comb) # run Mpi through execo remote SshProcess def runMpi(cmd): act = SshProcess(cmd, master, connection_params={'user': '******'}, shell=True) act.run() if not os.path.exists(curPath): os.makedirs(curPath) with open(os.path.join(curPath, "stdout.txt"), "a+") as sout, open( os.path.join(curPath, "stderr.txt"), "w") as serr: sout.write(act.stdout) serr.write(act.stderr) return act.ok # start clock and exec command in the master node time.sleep(5) startUnix = int(time.time()) start24Hour = datetime.datetime.fromtimestamp( startUnix).strftime('%H:%M:%S') task1 = runMpi(cmd) endUnix = int(time.time()) end24Hour = datetime.datetime.fromtimestamp(endUnix).strftime( '%H:%M:%S') time.sleep(5) with open(os.path.join(curPath, "executionTime.txt"), "w") as sout: sout.write( 'ExecTime:{}\nStartDate:{}\nEndDate:{}\n'.format( str(endUnix - startUnix), start24Hour, end24Hour)) takeMetric(curPath, start24Hour, end24Hour, ['cpu', 'mem', 'disk', 'swap', 'network']) # collect power from kWAPI: grid5000 infrastructure made tool for hostname in self.cluster: powerOut = '{}_power'.format(hostname) collect_metric(startUnix, endUnix, 'power', curPath, self.site, powerOut, hostname) st = '/tmp/out/' + slugify(comb) intelAppPerf = str(st + '.html') # get the data from ['Application Performance Snapshot', 
'Storage Performance Snapshot'] # https://software.intel.com/en-us/performance-snapshot Get(master, [intelAppPerf], curPath, connection_params={ 'user': '******' }).run() if task1: logger.info("comb ok: %s" % (comb, )) self.sweeper.done(comb) continue except OSError as err: print("OS error: {0}".format(err)) except ValueError: print("Could not convert data to an integer.") except: print("Unexpected error:", sys.exc_info()[0]) raise logger.info("comb NOT ok: %s" % (comb, )) self.sweeper.cancel(comb)
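# run_xp() reads system metrics back from sar's binary activity files for the
# exact benchmark window. A minimal sketch of that idea for the CPU metric,
# assuming sysstat data under /var/log/sysstat/ and HH:MM:SS time strings; the
# host is a placeholder.
from execo import SshProcess

def sar_cpu(host, start, end):
    cmd = "sar -f /var/log/sysstat/sa* -u -s %s -e %s" % (start, end)
    proc = SshProcess(cmd, host)
    proc.run()
    return proc.stdout  # plain-text CPU utilisation between start and end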
def workflow(self, comb, host, comb_dir): """ """ comb_ok = False thread_name = style.Thread(str(host).split('.')[0]) + ': ' logger.info(thread_name + 'Starting combination ' + slugify(comb)) if 'parapluie' in str(host): nb_proc = 24 elif 'paranoia' in str(host): nb_proc = 20 elif 'parapide' in str(host): nb_proc = 8 else: nb_proc = 16 try: self.export = "source ~/aevol_binary/intel/linux/mkl/bin/mklvars.sh intel64; " bucketname = self.working_dir + '/raevol_5_mut_lat/' + slugify( comb) + '/' logger.info(thread_name + "Killing other RAevol") killa = Remote("killall -9 aevol_run", [host]) for killp in killa.processes: killp.ignore_error = True killa.run() if os.path.isdir(bucketname) and os.path.exists(bucketname + '/last_gener.txt'): logger.info(thread_name + "Resuming AEVOL from NFS backup") gen_file = open(bucketname + '/last_gener.txt', 'r') last_gen = gen_file.read() if int(last_gen) < 300000: logger.info(thread_name + "Resuming AEVOL Run from " + str(int(last_gen))) rem = Remote( self.export + 'cd ' + bucketname + '; /home/jorouzaudcornabas/aevol_binary/aevol/src/aevol_run -p ' + str(nb_proc) + ' -e 300000 -r ' + last_gen + ' >> aevol_run.log', [host], process_args={ 'default_stdout_handler': False, 'default_stderr_handler': False }).run() if rem.ok: comb_ok = True else: comb_ok = True else: Remote('mkdir -p ' + bucketname, [host]).run() param_file = '/home/jorouzaudcornabas/aevol_binary/aevol/execo/mut_lat/param_tmpl.in' logger.info(thread_name + 'Generate config file ' + param_file) f_template = open(param_file) fd, outfile = mkstemp(dir='/tmp/', prefix=slugify(comb) + '_param') f = os.fdopen(fd, 'w') for line in f_template: if 'CONFIGURE_ENVIRONMENT_VALUES' in line: if comb['env'] == 'const': line = line.replace('CONFIGURE_ENVIRONMENT_VALUES', 'NB_ENVIRONMENTS 1') f.write(line) f.write('ENV_ADD_GAUSSIAN 1 0.5 0.2 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 1 0.5 0.4 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 1 0.5 0.6 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 1 0.5 0.8 0.05' + os.linesep) elif comb['env'] == 'lat_3': line = line.replace('CONFIGURE_ENVIRONMENT_VALUES', 'NB_ENVIRONMENTS 2') f.write(line) f.write('ENV_ADD_GAUSSIAN 1 0.5 0.2 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 1 0.5 0.4 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 1 0.5 0.6 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 1 0.5 0.8 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 2 0.5 0.2 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 2 0.5 0.4 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 2 0.5 0.65 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 2 0.5 0.8 0.05' + os.linesep) elif comb['env'] == 'lat_all': line = line.replace('CONFIGURE_ENVIRONMENT_VALUES', 'NB_ENVIRONMENTS 16') f.write(line) #const f.write('ENV_ADD_GAUSSIAN 1 0.5 0.2 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 1 0.5 0.4 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 1 0.5 0.6 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 1 0.5 0.8 0.05' + os.linesep) # 1 f.write('ENV_ADD_GAUSSIAN 2 0.5 0.25 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 2 0.5 0.4 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 2 0.5 0.6 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 2 0.5 0.8 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 3 0.5 0.2 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 3 0.5 0.45 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 3 0.5 0.6 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 3 0.5 0.8 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 4 0.5 0.2 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 4 0.5 0.4 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 4 0.5 
0.65 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 4 0.5 0.8 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 5 0.5 0.2 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 5 0.5 0.4 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 5 0.5 0.6 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 5 0.5 0.85 0.05' + os.linesep) # 2 f.write('ENV_ADD_GAUSSIAN 6 0.5 0.25 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 6 0.5 0.45 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 6 0.5 0.6 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 6 0.5 0.8 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 7 0.5 0.25 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 7 0.5 0.4 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 7 0.5 0.65 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 7 0.5 0.8 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 8 0.5 0.25 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 8 0.5 0.4 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 8 0.5 0.6 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 8 0.5 0.85 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 9 0.5 0.2 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 9 0.5 0.45 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 9 0.5 0.65 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 9 0.5 0.8 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 10 0.5 0.2 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 10 0.5 0.45 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 10 0.5 0.6 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 10 0.5 0.85 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 11 0.5 0.2 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 11 0.5 0.4 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 11 0.5 0.65 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 11 0.5 0.85 0.05' + os.linesep) # 3 f.write('ENV_ADD_GAUSSIAN 12 0.5 0.25 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 12 0.5 0.45 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 12 0.5 0.65 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 12 0.5 0.8 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 13 0.5 0.25 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 13 0.5 0.45 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 13 0.5 0.6 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 13 0.5 0.85 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 14 0.5 0.25 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 14 0.5 0.4 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 14 0.5 0.65 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 14 0.5 0.85 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 15 0.5 0.2 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 15 0.5 0.45 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 15 0.5 0.65 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 15 0.5 0.85 0.05' + os.linesep) # 4 f.write('ENV_ADD_GAUSSIAN 16 0.5 0.25 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 16 0.5 0.45 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 16 0.5 0.65 0.05' + os.linesep) f.write('ENV_ADD_GAUSSIAN 16 0.5 0.85 0.05' + os.linesep) elif 'CONFIGURE_SIGNAL_VALUES' in line: if comb['env'] == 'const': line = line.replace('CONFIGURE_SIGNAL_VALUES', '') f.write(line) elif comb['env'] == 'lat_3': line = line.replace( 'CONFIGURE_SIGNAL_VALUES', 'CREATE_SIGNAL h0 h0 h0 w0 m0 m1 m0 h1 h0 m0 h0 m1 h1 w0 h1 h0 m1 h1 m0 w0 w0 m0 w0 h0 h1 m1 w0 m0 m1 m0 w0 h1 h0 m0 h0 m1 h1 w0 h0 w0 m0 m1 m0 w0 h1 h0 w0 w0 h1' ) f.write(line) f.write('ENV_ADD_SIGNAL 2 1' + os.linesep) elif comb['env'] == 'lat_all': line = line.replace( 'CONFIGURE_SIGNAL_VALUES', 'CREATE_SIGNAL h0 w0 h1 m1 w0 h1 m0 h0 h1 w0 h0 m1 h1 h1 m1 m0 h0 w0 h1 m1 w0 h1 m0 h0 h1 w0 h0 m1 h1 h1 m1 m0 h1 m0 m1' ) f.write(line) f.write( 'CREATE_SIGNAL m0 h0 m1 h1 m1 w0 m0 m1 m0 h0 m1 h1 w0 h0 h0 
h1 m1 m0 h1 w0 h1 h0 m1 h1 m0 w0 w0 m0 m1 w0 w0 h1 h0 w0 h1 h0 h0 m0 h0 w0 h0 m1 m0 w0 h1 w0 w0 h1 m0' + os.linesep) f.write( 'CREATE_SIGNAL h0 h0 h0 w0 m0 m1 m0 h1 h0 m0 h0 m1 h1 w0 h1 h0 m1 h1 m0 w0 w0 m0 w0 h0 h1 m1 w0 m0 m1 m0 w0 h1 h0 m0 h0 m1 h1 w0 h0 w0 m0 m1 m0 w0 h1 h0 w0 w0 h1' + os.linesep) f.write( 'CREATE_SIGNAL h1 h1 m0 w0 w0 h1 m1 h1 h1 m1 m0 w0 m1 m0 m0 w0 m0 h0 m0 h0 w0 h0 m0 h0 h1 m1 h0 h1 w0 h0 h1 m1 h1 m1 m0' + os.linesep) f.write('ENV_ADD_SIGNAL 2 1' + os.linesep) f.write('ENV_ADD_SIGNAL 3 2' + os.linesep) f.write('ENV_ADD_SIGNAL 4 3' + os.linesep) f.write('ENV_ADD_SIGNAL 5 4' + os.linesep) f.write('ENV_ADD_SIGNAL 6 1' + os.linesep) f.write('ENV_ADD_SIGNAL 6 2' + os.linesep) f.write('ENV_ADD_SIGNAL 7 1' + os.linesep) f.write('ENV_ADD_SIGNAL 7 3' + os.linesep) f.write('ENV_ADD_SIGNAL 8 1' + os.linesep) f.write('ENV_ADD_SIGNAL 8 4' + os.linesep) f.write('ENV_ADD_SIGNAL 9 2' + os.linesep) f.write('ENV_ADD_SIGNAL 9 3' + os.linesep) f.write('ENV_ADD_SIGNAL 10 2' + os.linesep) f.write('ENV_ADD_SIGNAL 10 4' + os.linesep) f.write('ENV_ADD_SIGNAL 11 3' + os.linesep) f.write('ENV_ADD_SIGNAL 11 4' + os.linesep) f.write('ENV_ADD_SIGNAL 12 1' + os.linesep) f.write('ENV_ADD_SIGNAL 12 2' + os.linesep) f.write('ENV_ADD_SIGNAL 12 3' + os.linesep) f.write('ENV_ADD_SIGNAL 13 1' + os.linesep) f.write('ENV_ADD_SIGNAL 13 2' + os.linesep) f.write('ENV_ADD_SIGNAL 13 4' + os.linesep) f.write('ENV_ADD_SIGNAL 14 1' + os.linesep) f.write('ENV_ADD_SIGNAL 14 3' + os.linesep) f.write('ENV_ADD_SIGNAL 14 4' + os.linesep) f.write('ENV_ADD_SIGNAL 15 2' + os.linesep) f.write('ENV_ADD_SIGNAL 15 3' + os.linesep) f.write('ENV_ADD_SIGNAL 15 4' + os.linesep) f.write('ENV_ADD_SIGNAL 16 1' + os.linesep) f.write('ENV_ADD_SIGNAL 16 2' + os.linesep) f.write('ENV_ADD_SIGNAL 16 3' + os.linesep) f.write('ENV_ADD_SIGNAL 16 4' + os.linesep) else: line = line.replace('SEED_NUMBER', str(comb['seed'])) line = line.replace('MUTATION_RATE_VALUE', comb['mutation']) line = line.replace('SELECTION_PRESSURE', str(comb['selection'])) f.write(line) f_template.close() f.close() put_file = Put([host], [outfile], remote_location=bucketname).run() if not put_file.ok: exit() os.remove(outfile) Remote( 'cd ' + bucketname + '; cp ' + outfile.split('/')[-1] + ' param.in; cp /home/jorouzaudcornabas/aevol_binary/aevol/execo/mut_lat/binding_matrix.rae .', [host]).run() logger.info(thread_name + "Launching AEVOL Create") Remote( self.export + 'cd ' + bucketname + '; /home/jorouzaudcornabas/aevol_binary/aevol/src/aevol_create > aevol_create.log', [host], process_args={ 'default_stdout_handler': False, 'default_stderr_handler': False }).run() logger.info(thread_name + "Launching AEVOL Run") rem = Remote( self.export + 'cd ' + bucketname + '; /home/jorouzaudcornabas/aevol_binary/aevol/src/aevol_run -p ' + str(nb_proc) + ' -n 300000 > aevol_run.log', [host], process_args={ 'default_stdout_handler': False, 'default_stderr_handler': False }).run() if rem.ok: comb_ok = True logger.info(thread_name + 'Get results ' + comb_dir + "/" + slugify(comb)) #try: #os.mkdir(comb_dir + "/" + slugify(comb)) #except: #logger.warning(thread_name + #'%s already exists, removing existing files', comb_dir + "/" + slugify(comb)) #shutil.rmtree(comb_dir+ "/" + slugify(comb)) #try: #os.mkdir(comb_dir + "/" + slugify(comb)) #except: #logger.warning(thread_name + #'%s already exists, recreating directory', comb_dir + "/" + slugify(comb)) #get_results = Get([host], [bucketname+ "/aevol_create.log", bucketname+ "/aevol_run.log", bucketname+'/stats/'], 
#local_location=comb_dir + "/" + slugify(comb)).run() #for p in get_results.processes: #if not p.ok: #logger.error(thread_name + #': Unable to retrieve the files for combination %s', #slugify(comb)) #exit() finally: if comb_ok: self.sweeper.done(comb) # shutil.rmtree(bucketname) logger.info(thread_name + ': ' + slugify(comb) + \ ' has been done') else: self.sweeper.cancel(comb) logger.warning(thread_name + ': ' + slugify(comb) + \ ' has been canceled') logger.info(style.step('%s Remaining'), len(self.sweeper.get_remaining()))