def create_paramsweeper(parameters, result_dir):
    """Generate an iterator over the parameter combinations.

    This function initializes a `ParamSweeper` as an iterator over the
    possible parameter space (the dictionary describing that space is
    built by the `define_parameters` function). Detailed information
    about `ParamSweeper` can be found here:
    http://execo.gforge.inria.fr/doc/latest-stable/execo_engine.html#paramsweeper

    Parameters
    ----------
    parameters: dict
        the parameter space;
        key: str, the name of an experiment parameter
        value: list, the possible values of that parameter
    result_dir: str
        path to the result directory on disk where `ParamSweeper`
        persists the state of the combinations

    Returns
    -------
    ParamSweeper
        an instance of the `ParamSweeper` object.
    """
    logger.debug('Parameters:\n%s' % parameters)
    sweeps = sweep(parameters)
    sweeper = ParamSweeper(os.path.join(result_dir, "sweeps"), sweeps)
    logger.info('-----> TOTAL COMBINATIONS: %s', len(sweeps))
    if len(sweeper.get_remaining()) < len(sweeps):
        logger.info('%s combinations remaining\n'
                    % len(sweeper.get_remaining()))
    return sweeper
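# The sweeper returned by create_paramsweeper() is typically consumed in a loop
# that marks each combination as done, skipped or cancelled. A minimal sketch of
# that loop, assuming the create_paramsweeper() defined above; run_one and the
# parameter values are placeholders, not taken from any of the snippets here.
from execo_engine import logger

def run_all(result_dir, run_one):
    sweeper = create_paramsweeper({'size': [100, 200], 'repeat': [1, 2]},
                                  result_dir)
    while len(sweeper.get_remaining()) > 0:
        comb = sweeper.get_next()
        if not comb:
            break
        try:
            ok = run_one(comb)       # the actual experiment for one combination
        except Exception:
            sweeper.cancel(comb)     # put the combination back for a later retry
            continue
        if ok:
            sweeper.done(comb)       # persisted: will not be replayed on restart
        else:
            sweeper.skip(comb)       # recorded as skipped, not retried
    logger.info('%s combinations remaining', len(sweeper.get_remaining()))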
def define_parameters(self): """ """ parameters = self.get_parameters("conf.xml") sweeps = sweep(parameters) self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps) logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining()))
def define_parameters(self):
    nbNodes = len(self.cluster)
    # build parameters and make nbCore list per benchmark
    freqList = [2534000, 2000000, 1200000]
    n_nodes = float(len(self.cluster))
    max_core = SshProcess('cat /proc/cpuinfo | grep -i processor |wc -l',
                          self.cluster[0],
                          connection_params={'user': '******'}).run().stdout
    max_core = n_nodes * float(max_core)
    even = filter(
        lambda i: i > n_nodes,
        list(takewhile(lambda i: i < max_core,
                       (2**i for i in count(0, 1)))))
    powerTwo = filter(
        lambda i: i > n_nodes,
        list(takewhile(lambda i: i < max_core,
                       (i**2 for i in count(0, 1)))))

    # Define parameters
    self.parameters = {
        'Repeat': [1],
        "Freq": [2534000],
        "NPBclass": ['C'],
        "Benchmark": {
            # 'ft': {'n_core': even},
            # 'ep': {'n_core': even},
            # 'lu': {'n_core': even},
            # 'is': {'n_core': even},
            # 'sg': {'n_core': even},
            # 'bt': {'n_core': powerTwo},
            'sp': {'n_core': powerTwo}
        }
    }
    logger.info(self.parameters)

    # make all possible parameters object
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(self.parameters))
    logger.info('Number of parameters combinations %s',
                len(self.sweeper.get_remaining()))
def define_parameters(self): """ """ parameters = { 'blas' : ['none','mkl','atlas','openblas'], 'experiment' : ['aevol','raevol'], 'compilator' : ['gcc','intel'], 'parallel' : ['openmp','tbb'] } sweeps = sweep(parameters) self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps) logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining()))
def define_parameters(self): """Create the iterator that contains the parameters to be explored """ self.parameters = { 'sizes': [100], 'zipf': [1], 'pop_keys': [100], 'min_size': [500, 1000], 'int_phases': [1, 2, 3, 4, 5, 10], 'iosf': [100] } logger.info(self.parameters) self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweep(self.parameters)) logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining()))
def define_parameters(self): """ """ parameters = { 'seed': [ 51456165, 33263658, 7158785, 456847894, 1223144, 878944, 121145, 3587842 ], 'mutation': ['5e-4', '1e-4', '5e-5', '5e-6'], 'env': ['const', 'lat_3', 'lat_all'], 'selection': [750, 2000, 4000] } sweeps = sweep(parameters) self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps) logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining()))
def create_paramsweeper(self): """Generate an iterator over combination parameters""" if self.parameters is None: parameters = self.define_parameters() logger.detail(pformat(parameters)) sweeps = sweep(parameters) logger.info('% s combinations', len(sweeps)) self.sweeper = ParamSweeper(path.join(self.result_dir, "sweeps"), sweeps)
def create_sweeper(self):
    """Define the parameter space and create the sweeper."""
    parameters = {
        'RA': ['1.e5', '1.e6', '1.e7'],
        'RCMB': [2.],
        'KFe': [0.85, 0.9, 0.95, 0.99]
    }
    sweeps = sweep(parameters)
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweeps)
def create_paramsweeper(self): """Test all the sites, with or without a KaVLAN and for several env.""" params = { "version": ['kadeploy3-dev', 'kadeploy3'], "kavlan": [True, False], "site": get_g5k_sites(), "n_nodes": [1, 4, 10], "env": ['wheezy-x64-base', 'wheezy-x64-prod', 'wheezy-x64-xen'] } logger.info('Defining parameters: %s', pformat(params)) combs = sweep(params) return ParamSweeper(self.result_dir + "/sweeper", combs)
def define_parameters(self): """ Define the parametters used by the L2C application """ parameters = { 'cluster': [ cluster for site in ['grenoble', 'nancy'] for cluster in get_site_clusters(site) if cluster != 'graphite' ], 'cores': {i: { 'px': expRange(1, i) } for i in expRange(4, 64)}, 'datasize': expRange(256, 256), 'transposition': ['XYZ', 'XZY', 'YXZ', 'YZX', 'ZXY', 'ZYX'] } logger.info(pformat(parameters)) sweeps = sweep(parameters) self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps) logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining()))
def define_parameters(self): """Create the iterator on the parameters combinations to be explored""" # fixed number of nodes self.n_nodes = 4 # choose a list of clusters clusters = ['graphene', 'petitprince', 'edel', 'paradent', 'stremi'] #clusters = ['petitprince', 'paradent'] # compute the maximum number of cores among all clusters max_core = self.n_nodes * max([ get_host_attributes(cluster + '-1')['architecture']['smt_size'] for cluster in clusters]) # define the parameters self.parameters = { 'cluster' : clusters, 'n_core': filter(lambda i: i >= self.n_nodes, list(takewhile(lambda i: i<max_core, (2**i for i in count(0, 1))))), 'size' : ['A', 'B', 'C'] } logger.info(self.parameters) # define the iterator over the parameters combinations self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweep(self.parameters)) logger.info('Number of parameters combinations %s' % len(self.sweeper.get_remaining()))
def define_parameters(self): """ Define the parametters used by the L2C application """ parameters = { 'cluster': [cluster for site in ['grenoble', 'nancy'] for cluster in get_site_clusters(site) if cluster != 'graphite'], 'cores': {i: {'px': expRange(1, i)} for i in expRange(4, 64)}, 'datasize': expRange(256, 256), 'transposition': ['XYZ', 'XZY', 'YXZ', 'YZX', 'ZXY', 'ZYX']} logger.info(pformat(parameters)) sweeps = sweep(parameters) self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps) logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining()))
def run(self): """ """ token = 'bRIJb9jp5igAAAAAAAAACc5QzQ619Vp0pYa2PdIrt0q2y0qFyJgwrKvtzuTp3Sz_' client = dropbox.client.DropboxClient(token) parameters = {'size': igeom(128, 2048, 5), 'db_if': ['rest', 'sdk']} combs = sweep(parameters) sweeper = ParamSweeper(self.result_dir + "/sweeps", combs) f = open(self.result_dir + '/results.txt', 'w') while len(sweeper.get_remaining()) > 0: comb = sweeper.get_next() logger.info('Treating combination %s', pformat(comb)) comb_dir = self.result_dir + '/' + slugify(comb) try: os.mkdir(comb_dir) except: pass fname = self.create_file(comb['size']) timer = Timer() if comb['db_if'] == 'sdk': self.upload_file_sdk(client, fname, fname.split('/')[-1]) up_time = timer.elapsed() self.download_file_sdk(client, fname.split('/')[-1], comb_dir + fname.split('/')[-1]) dl_time = timer.elapsed() - up_time sweeper.done(comb) elif comb['db_if'] == 'rest': logger.warning('REST interface not implemented') sweeper.skip(comb) continue os.remove(fname) f.write("%f %i %f %f \n" % (timer.start_date(), comb['size'], up_time, dl_time)) f.close()
def run(self): """ run method from engine in order to do our workflow """ mongo = ClientMongo() size = dict if not self.options.file: if not self.options.only: size = { 1, long(self.options.size * 0.25), long(self.options.size * 0.5), long(self.options.size * 0.75), long(self.options.size) } else: size = {long(self.options.size)} else: if self.OnlyDownload: size = getFilSize(self.options.file) else: size = {0} drive = None if self.options.drive: drive = self.options.drive else: drive = self.drive interface = ['rest', 'sdk'] parameters = { 'size': size, 'if': interface, 'drive': drive, 'transfert': self.transfert } p = None for n in range(0, int(self.options.ntest), 1): logger.info('---------------------') logger.info('Round %i', n + 1) combs = sweep(parameters) date = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") pathResults = os.getcwd() + '/Results/Bench' + date sweeper = ParamSweeper(pathResults + "/sweeps", combs) f = open(pathResults + '/results.txt', 'w') while len(sweeper.get_remaining()) > 0: # sort the parameters for i in interface: for dr in drive: for s in size: comb = sweeper.get_next(filtr=lambda r: filter( lambda x: x['drive'] == dr and x['size'] == s and x['if'] == i, r)) if not comb: continue # start of the workflow if comb['drive'] == 'amazon': p = providerS3.ProviderS3() elif comb['drive'] == 'dropbox': p = providerDB.ProviderDB() else: p = providerGD.ProviderGD() logger.info('Treating combination %s', pformat(comb)) comb_dir = pathResults + '/' + slugify(comb) if not os.path.isdir(comb_dir): os.mkdir(comb_dir) if not self.options.file: fname = self.create_file(comb['size']) else: fname = self.options.file timer = Timer() up_time = 0 dl_time = 0 start_date = datetime.datetime.now() if comb['if'] == 'sdk': if p.provider_name == "amazon": # AMAZON clientAmz = p.getConnexion() if self.OnlyDownload: p.bucketKey += fname else: p.bucketKey += '/' + fname if comb['transfert'] == "upload" or comb[ 'transfert'] == 'upDown': p.upload_file_sdk( clientAmz.get_bucket(p.bucketName), p.bucketKey, fname) up_time = timer.elapsed() if comb['transfert'] == "download" or comb[ 'transfert'] == 'upDown': p.download_file_sdk( clientAmz.get_bucket(p.bucketName), p.bucketKey, comb_dir + '/' + fname.split('/')[-1]) dl_time = timer.elapsed() - up_time if not self.OnlyDownload: p.delete_file_sdk( clientAmz.get_bucket(p.bucketName), p.bucketKey) elif p.provider_name == "dropbox": # DROPBOX client = p.getToken() if comb['transfert'] == "upload" or comb[ 'transfert'] == 'upDown': p.upload_file_sdk( client, fname, fname.split('/')[-1]) up_time = timer.elapsed() if comb['transfert'] == "download" or comb[ 'transfert'] == 'upDown': p.download_file_sdk( client, fname.split('/')[-1], comb_dir + '/' + fname.split('/')[-1]) dl_time = timer.elapsed() - up_time if not self.OnlyDownload: p.delete_file(client, fname.split('/')[-1]) elif p.provider_name == "googledrive": # GOOGLEDRIVE drive_service = p.getConnexion() new_file = None if comb['transfert'] == 'upload' or comb[ 'transfert'] == 'upDown': new_file = p.upload_file_sdk( drive_service, fname, fname.split('/')[-1], 'text/plain') up_time = timer.elapsed() if comb['transfert'] == 'download' or comb[ 'transfert'] == 'upDown': p.download_file_sdk( drive_service, new_file, comb_dir + '/' + fname.split('/')[-1]) dl_time = timer.elapsed() - up_time if not self.OnlyDownload: p.delete_file_sdk( drive_service, new_file['id']) sweeper.done(comb) elif comb['if'] == 'rest': logger.warning( 'REST interface not implemented') sweeper.skip(comb) if not 
self.OnlyDownload: # logger.info('delete de '+fname) if os.path.isfile(fname): os.remove(fname) # delete only if rest is implmented # os.remove(comb_dir + '/' + fname.split('/')[-1]) continue if comb['transfert'] == "upload" or comb[ 'transfert'] == "upDown": f.write("%s %s %s %s %s %s %s %f %i %s %f\n" % (self.localisation['ip'], self.localisation['lat'], self.localisation['lon'], self.localisation['city'], self.localisation['country'], comb['drive'], comb['if'], timer.start_date(), comb['size'], "upload", up_time)) mongo.collection.insert({ 'ip': self.localisation['ip'], 'latitude': self.localisation['lat'], 'longitude': self.localisation['lon'], 'city': self.localisation['city'], 'country': self.localisation['country'], 'drive': comb['drive'], 'interface': comb['if'], 'start_date': start_date, 'size': comb['size'], 'transfert': 'upload', 'time': up_time }) if comb['transfert'] == "download" or comb[ 'transfert'] == "upDown": f.write("%s %s %s %s %s %s %s %f %i %s %f\n" % (self.localisation['ip'], self.localisation['lat'], self.localisation['lon'], self.localisation['city'], self.localisation['country'], comb['drive'], comb['if'], timer.start_date(), comb['size'], "download", dl_time)) mongo.collection.insert({ 'ip': self.localisation['ip'], 'latitude': self.localisation['lat'], 'longitude': self.localisation['lon'], 'city': self.localisation['city'], 'country': self.localisation['country'], 'drive': comb['drive'], 'interface': comb['if'], 'start_date': start_date, 'size': comb['size'], 'transfert': 'download', 'time': dl_time }) if not self.OnlyDownload: # logger.info('delete de '+fname) if os.path.isfile(fname): os.remove(fname) if os.path.isfile(comb_dir + '/' + fname): os.remove(comb_dir + '/' + fname.split('/')[-1]) f.close() # delete the Bench Folder os.rmdir(self.result_dir) logger.info("---------------------------------------") for t in check_Exp_database(self.options, self.localisation)['result']: logger.info(t)
def run(self): # Defining experiment parameters self.parameters = { 'n_clients': [400, 450, 500, 550, 600], 'n_transitions': [10000] } cluster = 'griffon' sweeps = sweep(self.parameters) sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps) server_out_path = os.path.join(self.result_dir, "server.out") self._updateStat(sweeper.stats()) # Loop on the number of nodes while True: # Taking the next parameter combinations comb = sweeper.get_next() if not comb: break # Performing the submission on G5K site = get_cluster_site(cluster) self._log("Output will go to " + self.result_dir) n_nodes = int(math.ceil(float(comb['n_clients']) / EX5.get_host_attributes(cluster + '-1')['architecture']['smt_size'])) + 1 self._log("Reserving {0} nodes on {1}".format(n_nodes, site)) resources = "{cluster=\\'" + cluster + "\\'}/nodes=" + str(n_nodes) submission = EX5.OarSubmission(resources = resources, job_type = 'allow_classic_ssh', walltime ='00:10:00') job = EX5.oarsub([(submission, site)]) self.__class__._job = job # Sometimes oarsub fails silently if job[0][0] is None: print("\nError: no job was created") sys.exit(1) # Wait for the job to start self._log("Waiting for job {0} to start...\n".format(BOLD_MAGENTA + str(job[0][0]) + NORMAL)) EX5.wait_oar_job_start(job[0][0], job[0][1], prediction_callback = prediction) nodes = EX5.get_oar_job_nodes(job[0][0], job[0][1]) # Deploying nodes #deployment = EX5.Deployment(hosts = nodes, env_file='path_to_env_file') #run_deploy = EX5.deploy(deployment) #nodes_deployed = run_deploy.hosts[0] # Copying active_data program on all deployed hosts EX.Put([nodes[0]], '../dist/active-data-lib-0.1.2.jar', connexion_params = {'user': '******'}).run() EX.Put([nodes[0]], '../server.policy', connexion_params = {'user': '******'}).run() # Loop on the number of requests per client process while True: # Split the nodes clients = nodes[1:] server = nodes[0] self._log("Running experiment with {0} nodes and {1} transitions per client".format(len(clients), comb['n_transitions'])) # Launching Server on one node out_handler = FileOutputHandler(server_out_path) launch_server = EX.Remote('java -jar active-data-lib-0.1.2.jar', [server], stdout_handler = out_handler, stderr_handler = out_handler).start() self._log("Server started on " + server.address) time.sleep(2) # Launching clients rank=0 n_cores = EX5.get_host_attributes(clients[0])['architecture']['smt_size']; cores = nodes * n_cores cores = cores[0:comb['n_clients']] # Cut out the additional cores client_connection_params = { 'taktuk_gateway': 'lyon.grid5000.fr', 'host_rewrite_func': None } self._log("Launching {0} clients...".format(len(cores))) client_cmd = "/usr/bin/env java -cp /home/ansimonet/active-data-lib-0.1.2.jar org.inria.activedata.examples.perf.TransitionsPerSecond " + \ "{0} {1} {2} {3} {4}".format(server.address, 1200, "{{range(len(cores))}}", len(cores), comb['n_transitions']) client_out_handler = FileOutputHandler(os.path.join(self.result_dir, "clients.out")) client_request = EX.TaktukRemote(client_cmd, cores, connexion_params = client_connection_params, \ stdout_handler = client_out_handler, stderr_handler = client_out_handler) client_request.run() if not client_request.ok(): # Some client failed, please panic self._log("One or more client process failed. 
Enjoy reading their outputs.") self._log("OUTPUT STARTS -------------------------------------------------\n") for process in client_request.processes(): print("----- {0} returned {1}".format(process.host().address, process.exit_code())) if not process.stdout() == "": print(GREEN + process.stdout() + NORMAL) if not process.stderr() == "": print(RED + process.stderr() + NORMAL) print("") self._log("OUTPUT ENDS ---------------------------------------------------\n") sweeper.skip(comb) launch_server.kill() launch_server.wait() else: # Waiting for server to end launch_server.wait() # Getting log files distant_path = OUT_FILE_FORMAT.format(len(cores), comb['n_transitions']) local_path = distant_path EX.Get([server], distant_path).run() EX.Local('mv ' + distant_path + ' ' + os.path.join(self.result_dir, local_path)).run() EX.Get([server], 'client_*.out', local_location = self.result_dir) EX.Remote('rm -f client_*.out', [server]) self._log("Finishing experiment with {0} clients and {1} transitions per client".format(comb['n_clients'], comb['n_transitions'])) sweeper.done(comb) sub_comb = sweeper.get_next (filtr = lambda r: filter(lambda s: s["n_clients"] == comb['n_clients'], r)) self._updateStat(sweeper.stats()) if not sub_comb: # Killing job EX5.oar.oardel(job) self.__class__._job = None break else: comb = sub_comb print ""
class fp_hadoop(Engine): def __init__(self): """ Surchargement la methode init pour ajouter des options""" super(fp_hadoop, self).__init__() self.options_parser.set_usage("usage: %prog <cluster>") self.options_parser.add_argument( "cluster", "The cluster on which to run the experiment") self.options_parser.add_option("-k", dest="keep_alive", help="keep reservation alive ..", action="store_true") self.options_parser.add_option("-j", dest="oar_job_id", help="oar_job_id to relaunch an engine", type=int) self.options_parser.add_option("-o", dest="outofchart", help="Run the engine outside days", action="store_true") self.n_nodes = 10 self.options_parser.add_option("-w", dest="walltime", help="walltime for the reservation", type="string", default="3:00:00") def xp(self, comb): comb_ok = False try: """ tout ton xp """ comb_ok = True finally: if comb_ok: self.sweeper.done(comb) else: self.sweeper.cancel(comb) logger.info(style.step('%s Remaining'), len(self.sweeper.get_remaining())) def run(self): """Inherited method, put here the code for running the engine""" self.define_parameters() self.cluster = self.args[0] self.site = get_cluster_site(self.cluster) if self.options.oar_job_id: self.oar_job_id = self.options.oar_job_id else: self.oar_job_id = None try: # Creation of the main iterator which is used for the first control loop. # You need have a method called define_parameters, that returns a list of parameter dicts self.define_parameters() job_is_dead = False # While they are combinations to treat while len(self.sweeper.get_remaining()) > 0: # If no job, we make a reservation and prepare the hosts for the experiments if job_is_dead or self.oar_job_id is None: self.make_reservation() # Retrieving the hosts and subnets parameters self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend) # Hosts deployment deployed, undeployed = deploy( Deployment(self.hosts, env_file="/home/mliroz/deploys/hadoop6.env")) logger.info("%i deployed, %i undeployed" % (len(deployed), len(undeployed))) if len(deployed) == 0: break # Configuration du systeme => look at the execo_g5k.topology module attr = get_host_attributes(self.cluster + '-1') ## SETUP FINISHED # Getting the next combination comb = self.sweeper.get_next() self.prepare_dataset(comb) self.xp(comb) # subloop over the combinations that have the same sizes while True: newcomb = self.sweeper.get_next(lambda r: filter( lambda subcomb: subcomb['sizes'] == comb['sizes'], r)) if newcomb: try: self.xp(newcomb) except: break else: break if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': job_is_dead = True finally: if self.oar_job_id is not None: if not self.options.keep_alive: logger.info('Deleting job') oardel([(self.oar_job_id, self.frontend)]) else: logger.info('Keeping job alive for debugging') def define_parameters(self): """Create the iterator that contains the parameters to be explored """ self.parameters = { 'sizes': [100], 'zipf': [1], 'pop_keys': [100], 'min_size': [500, 1000], 'int_phases': [1, 2, 3, 4, 5, 10], 'iosf': [100] } logger.info(self.parameters) self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweep(self.parameters)) logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining())) def _get_nodes(self, starttime, endtime): """ """ planning = get_planning(elements=[self.cluster], starttime=starttime, endtime=endtime, out_of_chart=self.options.outofchart) slots = compute_slots(planning, self.options.walltime) startdate = slots[0][0] i_slot = 0 n_nodes = 
slots[i_slot][2][self.cluster] while n_nodes < self.n_nodes: logger.debug(slots[i_slot]) startdate = slots[i_slot][0] n_nodes = slots[i_slot][2][self.cluster] i_slot += 1 if i_slot == len(slots) - 1: return False, False return startdate, self.n_nodes def make_reservation(self): """Perform a reservation of the required number of nodes""" logger.info('Performing reservation') starttime = int(time.time() + timedelta_to_seconds(datetime.timedelta(minutes=1))) endtime = int( starttime + timedelta_to_seconds(datetime.timedelta(days=3, minutes=1))) startdate, n_nodes = self._get_nodes(starttime, endtime) while not n_nodes: logger.info('No enough nodes found between %s and %s, ' + \ 'increasing time window', format_date(starttime), format_date(endtime)) starttime = endtime endtime = int( starttime + timedelta_to_seconds(datetime.timedelta(days=3, minutes=1))) startdate, n_nodes = self._get_nodes(starttime, endtime) if starttime > int(time.time() + timedelta_to_seconds(datetime.timedelta( weeks=6))): logger.error('There are not enough nodes on %s for your ' + \ 'experiments, abort ...', self.cluster) exit() jobs_specs = get_jobs_specs({self.cluster: n_nodes}, name=self.__class__.__name__) sub = jobs_specs[0][0] sub.walltime = self.options.walltime sub.additional_options = '-t deploy' sub.reservation_date = startdate (self.oar_job_id, self.frontend) = oarsub(jobs_specs)[0] logger.info('Startdate: %s, n_nodes: %s', format_date(startdate), str(n_nodes))
class DVFS(Engine): def __init__(self, result_dir, cluster, site): Engine.__init__(self) self.result_dir = result_dir self.cluster = cluster self.site = site def run(self): """Inherited method, put here the code for running the engine""" self.define_parameters() self.run_xp() def define_parameters(self): nbNodes = len(self.cluster) # build parameters and make nbCore list per benchmark freqList = [2534000, 2000000, 1200000] n_nodes = float(len(self.cluster)) max_core = SshProcess('cat /proc/cpuinfo | grep -i processor |wc -l', self.cluster[0], connection_params={ 'user': '******' }).run().stdout max_core = n_nodes * float(max_core) even = filter( lambda i: i > n_nodes, list(takewhile(lambda i: i < max_core, (2**i for i in count(0, 1))))) powerTwo = filter( lambda i: i > n_nodes, list(takewhile(lambda i: i < max_core, (i**2 for i in count(0, 1))))) # Define parameters self.parameters = { 'Repeat': [1], "Freq": [2534000], "NPBclass": ['C'], "Benchmark": { # 'ft': { # 'n_core': even # }, # 'ep': { # 'n_core': even # }, # 'lu': { # 'n_core': even # }, # 'is': { # 'n_core': even # }, # 'sg': { # 'n_core': even # }, # 'bt': { # 'n_core': powerTwo # }, 'sp': { 'n_core': powerTwo } } } logger.info(self.parameters) # make all possible parameters object, self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweep(self.parameters)) logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining())) def run_xp(self): master = self.cluster[0] opt = '' """Iterate over the parameters and execute the bench""" while len(self.sweeper.get_remaining()) > 0: # Take sweeper comb = self.sweeper.get_next() logger.info('Processing new combination %s' % (comb, )) try: # metric from linux sar tools, works with clock def takeMetric( path, startTime, endTime, metric=['cpu', 'mem', 'disk', 'swap', 'network']): opt = '' cmd_template_sar = ( "sar -f /var/log/sysstat/sa* -{opt} -s {startTime} -e {endTime}" ) for met in metric: if met == 'cpu': opt = 'u' elif met == 'mem': opt = 'r' elif met == 'disk': opt = 'dp' elif met == 'swap': opt = 'S' elif met == 'network': opt = 'n DEV' cmd = cmd_template_sar.format(opt=opt, startTime=startTime, endTime=endTime) for host in self.cluster: hE = SshProcess(cmd, host, connection_params={'user': '******'}) hE.run() stdMetric = host + '-' + met + '.txt' with open(os.path.join(path, stdMetric), "w") as sout: sout.write(hE.stdout) #Set CPU Freq and Policy according current combination cmd_template_Freq_Policy = ("cpufreq-set -r -g {policy}") cmd_template_Freq = ("cpufreq-set -r -f {freq}") if comb['Freq'] == 'OnDemand': cmd_freq_policy = cmd_template_Freq_Policy.format( policy='ondemand') Remote(cmd_freq_policy, master, connection_params={ 'user': '******' }).run() elif comb['Freq'] == 'conservative': cmd_freq_policy = cmd_template_Freq_Policy.format( policy='conservative') Remote(cmd_freq_policy, master, connection_params={ 'user': '******' }).run() else: cmd_freq_policy = cmd_template_Freq_Policy.format( policy='userspace') Remote(cmd_freq_policy, master, connection_params={ 'user': '******' }).run() cmd_freq = cmd_template_Freq.format(freq=comb['Freq']) Remote(cmd_freq, master, connection_params={ 'user': '******' }).run() # build command src = 'source /opt/intel-performance-snapshoot/apsvars.sh' cmd_mpirun_template = ( "mpirun {opt} -f /root/cluster.txt -np {pr1} aps -r '/tmp/log/' /tmp/NPB/npb-mpi/bin/{typeNPB}.{NPBclass}.{pr2}" ) cmd_mpirun = cmd_mpirun_template.format( opt='', pr1=comb['n_core'], typeNPB=comb['Benchmark'], NPBclass=comb['NPBclass'], 
pr2=comb['n_core']) cmd = "{}; /tmp/NPB/bin/runMPI.sh '{}' '{}'".format( src, cmd_mpirun, slugify(comb)) curPath = self.result_dir + slugify(comb) # run Mpi through execo remote SshProcess def runMpi(cmd): act = SshProcess(cmd, master, connection_params={'user': '******'}, shell=True) act.run() if not os.path.exists(curPath): os.makedirs(curPath) with open(os.path.join(curPath, "stdout.txt"), "a+") as sout, open( os.path.join(curPath, "stderr.txt"), "w") as serr: sout.write(act.stdout) serr.write(act.stderr) return act.ok # start clock and exec command in the master node time.sleep(5) startUnix = int(time.time()) start24Hour = datetime.datetime.fromtimestamp( startUnix).strftime('%H:%M:%S') task1 = runMpi(cmd) endUnix = int(time.time()) end24Hour = datetime.datetime.fromtimestamp(endUnix).strftime( '%H:%M:%S') time.sleep(5) with open(os.path.join(curPath, "executionTime.txt"), "w") as sout: sout.write( 'ExecTime:{}\nStartDate:{}\nEndDate:{}\n'.format( str(endUnix - startUnix), start24Hour, end24Hour)) takeMetric(curPath, start24Hour, end24Hour, ['cpu', 'mem', 'disk', 'swap', 'network']) # collect power from kWAPI: grid5000 infrastructure made tool for hostname in self.cluster: powerOut = '{}_power'.format(hostname) collect_metric(startUnix, endUnix, 'power', curPath, self.site, powerOut, hostname) st = '/tmp/out/' + slugify(comb) intelAppPerf = str(st + '.html') # get the data from ['Application Performance Snapshot', 'Storage Performance Snapshot'] # https://software.intel.com/en-us/performance-snapshot Get(master, [intelAppPerf], curPath, connection_params={ 'user': '******' }).run() if task1: logger.info("comb ok: %s" % (comb, )) self.sweeper.done(comb) continue except OSError as err: print("OS error: {0}".format(err)) except ValueError: print("Could not convert data to an integer.") except: print("Unexpected error:", sys.exc_info()[0]) raise logger.info("comb NOT ok: %s" % (comb, )) self.sweeper.cancel(comb)
class l2c_fft(Engine):

    workingPath = '/home/jrichard/l2c-fft-new-distrib/bin'
    genLadScript = '/home/jrichard/l2c-fft-new-distrib/src/utils/gen-lad/genPencil.py'

    def run(self):
        """Main engine method to perform the experiment"""
        self.define_parameters()

        while len(self.sweeper.get_remaining()) > 0:
            # Getting the next combination
            comb = self.sweeper.get_next()
            logger.info(style.host(slugify(comb)) + ' has been started')
            self.get_nodes(comb)

            # If the job is broken, the program is stopped
            if get_oar_job_info(self.oar_job_id,
                                self.frontend)['state'] == 'Error':
                break

            try:
                self.workflow(comb)

                # Process all combinations that can use the same submission
                while True:
                    # Find the next combination that can use the same submission
                    subcomb = self.sweeper.get_next(
                        lambda r: filter(
                            lambda x: x['cores'] == comb['cores']
                            and x['cluster'] == comb['cluster'], r))

                    if not subcomb:
                        logger.info('No more combination for cluster=%s and cores=%s',
                                    comb['cluster'], comb['cores'])
                        break
                    else:
                        logger.info(style.host(slugify(subcomb)) + ' has been started')
                        if get_oar_job_info(self.oar_job_id,
                                            self.frontend)['state'] != 'Error':
                            self.workflow(subcomb)
                        else:
                            break

            # Whatever happens (errors, end of loop), the job is deleted
            finally:
                logger.info('Deleting job...')
                oardel([(self.oar_job_id, self.frontend)])

    def workflow(self, comb):
        """Compute one application launch using a given parameter group"""
        comb_ok = False
        try:
            # Generate configuration file needed by MPI processes
            logger.info("Generating assembly file...")
            py = comb['cores'] / comb['px']
            prepare = Process('cd %s && python %s %d %d %d %d %d %s app.lad' %
                              (self.workingPath, self.genLadScript,
                               comb['datasize'], comb['datasize'],
                               comb['datasize'], comb['px'], py,
                               comb['transposition']))
            prepare.shell = True
            prepare.run()

            # Generate the MPI host file
            mfile = self.generate_machine_file()

            # Start L2C
            lad = "./app.lad"
            logger.info("Computing...")
            res = Process("export OAR_JOB_KEY_FILE=~/.oar_key ; cd %s && "
                          "l2c_loader -M,-machinefile,%s --mpi -c %d %s" %
                          (self.workingPath, mfile, comb['cores'], lad))
            res.shell = True
            res.stdout_handlers.append(os.path.join(self.result_dir,
                                                    slugify(comb) + '.out'))
            res.stdout_handlers.append(sys.stdout)
            res.stderr_handlers.append(os.path.join(self.result_dir,
                                                    slugify(comb) + '.err'))
            res.stderr_handlers.append(sys.stderr)
            res.run()

            if not res.ok:
                logger.error('Bad L2C termination')
                raise Exception('Bad L2C termination')
            if len(res.stderr) > 0:
                # WARNING: happens when L2C cannot find the LAD file or similar issues
                logger.warning('Not empty error output')

            # Clean configuration files
            logger.info("Removing assembly files...")
            res = Process('cd %s && rm -f app.lad*' % self.workingPath)
            res.shell = True
            res.run()

            comb_ok = True
        except Exception:
            pass
        finally:
            if comb_ok:
                self.sweeper.done(comb)
                logger.info(style.host(slugify(comb)) + ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(style.host(slugify(comb)) + ' has been canceled')
            logger.info(style.step('%s Remaining'),
                        len(self.sweeper.get_remaining()))

    def define_parameters(self):
        """Define the parameters used by the L2C application"""
        parameters = {
            'cluster': [cluster
                        for site in ['grenoble', 'nancy']
                        for cluster in get_site_clusters(site)
                        if cluster != 'graphite'],
            'cores': {i: {'px': expRange(1, i)} for i in expRange(4, 64)},
            'datasize': expRange(256, 256),
            'transposition': ['XYZ', 'XZY', 'YXZ', 'YZX', 'ZXY', 'ZYX']
        }
        logger.info(pformat(parameters))
        sweeps = sweep(parameters)
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweeps)
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))

    def get_nodes(self, comb):
        """Perform a submission for a given comb and retrieve the submission node list"""
        logger.info('Performing submission')
        n_core = get_host_attributes(comb['cluster'] + '-1')['architecture']['smt_size']
        submission = OarSubmission(
            resources="nodes=%d" % (max(1, comb['cores'] / n_core), ),
            sql_properties="cluster='%s'" % comb['cluster'],
            job_type="besteffort",
            name="l2c_fft_eval")
        self.oar_job_id, self.frontend = oarsub([
            (submission, get_cluster_site(comb['cluster']))])[0]
        logger.info("Waiting for job start")
        wait_oar_job_start(self.oar_job_id, self.frontend)
        logger.info("Retrieving hosts list")
        nodes = get_oar_job_nodes(self.oar_job_id, self.frontend)
        self.hosts = [host for host in nodes for i in range(n_core)]

    def generate_machine_file(self):
        """Generate a machine file used by MPI to know which nodes to use during the computation"""
        fd, mfile = mkstemp(dir='/tmp/', prefix='mfile_')
        f = os.fdopen(fd, 'w')
        f.write('\n'.join([host.address for host in self.hosts]))
        f.close()
        return mfile
class paasage_simu(Engine): JVM = 'java' SGCBJAR = 'SGCB_nTier.jar' PJDUMP = 'pj_dump' RSCRIPT = 'Rscript' def __init__(self): """Overloading class initialization with parent and adding options""" super(paasage_simu, self).__init__() self.options_parser.set_usage("usage: %prog ") self.options_parser.set_description("Execo Engine that can be used to" + \ "perform automatic virtual machines experiments") self.options_parser.add_option("-n", dest="n_nodes", help="maximum number of nodes used", type="int", default=200) self.options_parser.add_option("-w", dest="walltime", help="walltime for the reservation", type="string", default="05:00:00") self.options_parser.add_option( "-j", dest="oargrid_job_id", help="oargrid_job_id to relaunch an engine", type=int) self.options_parser.add_option("-k", dest="keep_alive", help="keep reservation alive ..", action="store_true") def run(self): """ """ if self.options.oargrid_job_id: self.oargrid_job_id = self.options.oargrid_job_id else: self.oargrid_job_id = None try: # Creation of the main iterator which is used for the first control loop. self.define_parameters() job_is_dead = False # While there are combinations to treat while len(self.sweeper.get_remaining()) > 0: # If no job, we make a reservation and prepare the hosts for the experiments if self.oargrid_job_id is None: self.make_reservation() # Wait that the job starts logger.info('Waiting that the job start') wait_oargrid_job_start(self.oargrid_job_id) # Retrieving the hosts and subnets parameters self.hosts = get_oargrid_job_nodes(self.oargrid_job_id) # Hosts deployment and configuration default_connection_params['user'] = '******' logger.info("Start hosts configuration") ex_log.setLevel('INFO') deployment = Deployment( hosts=self.hosts, env_file='/home/sirimie/env/mywheezy-x64-base.env') self.hosts, _ = deploy(deployment) Remote("rm -f /home/Work/sgcbntier/paasage_demo/csv/REQTASK_*", self.hosts).run() Remote( "rm -f /home/Work/sgcbntier/paasage_demo/platform_aws.xml", self.hosts).run() Remote("rm -f /home/Work/sgcbntier/paasage_demo/cloud_ec2.xml", self.hosts).run() Put(self.hosts, [ "run_all_execo.py", "xml_gen_execo.py", "conf.xml", "platform_aws.xml", "cloud_ec2.xml" ], remote_location="/home/Work/sgcbntier/paasage_demo/").run( ) logger.info("Done") if len(self.hosts) == 0: break # Initializing the resources and threads available_hosts = [ host for host in self.hosts for i in range( get_host_attributes(host)['architecture']['smt_size']) ] threads = {} # Creating the unique folder for storing the results comb_dir = self.result_dir + '/csv_results' if not os.path.exists(comb_dir): os.mkdir(comb_dir) # Checking that the job is running and not in Error while self.is_job_alive() or len(threads.keys()) > 0: job_is_dead = False while self.options.n_nodes > len(available_hosts): tmp_threads = dict(threads) for t in tmp_threads: if not t.is_alive(): available_hosts.append(tmp_threads[t]['host']) del threads[t] sleep(5) if not self.is_job_alive(): job_is_dead = True break if job_is_dead: break # Getting the next combination comb = self.sweeper.get_next() if not comb: while len(threads.keys()) > 0: tmp_threads = dict(threads) for t in tmp_threads: if not t.is_alive(): del threads[t] logger.info('Waiting for threads to complete') sleep(20) break host = available_hosts[0] available_hosts = available_hosts[1:] t = Thread(target=self.workflow, args=(comb, host, comb_dir)) threads[t] = {'host': host} t.daemon = True t.start() if not self.is_job_alive(): job_is_dead = True if job_is_dead: self.oargrid_job_id = 
None finally: if self.oargrid_job_id is not None: if not self.options.keep_alive: logger.info('Deleting job') oargriddel([self.oargrid_job_id]) else: logger.info('Keeping job alive for debugging') def define_parameters(self): """ """ parameters = self.get_parameters("conf.xml") sweeps = sweep(parameters) self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps) logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining())) def make_reservation(self): """ """ logger.info('Performing reservation') starttime = int(time.time() + timedelta_to_seconds(datetime.timedelta(minutes=1))) planning = get_planning(elements=['grid5000'], starttime=starttime) slots = compute_slots(planning, self.options.walltime) wanted = {"grid5000": 0} start_date, end_date, resources = find_first_slot(slots, wanted) wanted['grid5000'] = min(resources['grid5000'], self.options.n_nodes) actual_resources = distribute_hosts(resources, wanted) job_specs = get_jobs_specs(actual_resources, name='Paasage_Simu') logger.info("try to reserve " + str(actual_resources)) self.oargrid_job_id, _ = oargridsub(job_specs, start_date, walltime=end_date - start_date, job_type="deploy") logger.info("Reservation done") def create_string(self, param): res_str = "" for key, value in param.iteritems(): res_str += key + "_" + str(value) + "_" return res_str def workflow(self, comb, host, comb_dir): """ """ comb_ok = False thread_name = style.Thread(host.split('.')[0]) + ': ' logger.info(thread_name + 'Starting combination ' + slugify(comb)) try: logger.info(thread_name + 'Generate conf file') param_str = self.create_string(comb) Remote( "python /home/Work/sgcbntier/paasage_demo/xml_gen_execo.py --cb " + param_str, [host]).run() logger.info(thread_name + 'Run code') Remote( "cd /home/Work/sgcbntier/paasage_demo/ ; python run_all_execo.py --cb %s" % param_str, [host]).run() logger.info(thread_name + 'Get results') traceFile = "ntier_" + param_str get_results = Get([host], [ "/home/Work/sgcbntier/paasage_demo/csv/REQTASK_" + traceFile + ".csv" ], local_location=comb_dir).run() for p in get_results.processes: if not p.ok: logger.error( host + ': Unable to retrieve the files for combination %s', slugify(comb)) exit() comb_ok = True finally: if comb_ok: self.sweeper.done(comb) logger.info(thread_name + ': ' + slugify(comb) + \ ' has been done') else: self.sweeper.cancel(comb) logger.warning(thread_name + ': ' + slugify(comb) + \ ' has been canceled') logger.info(style.step('%s Remaining'), len(self.sweeper.get_remaining())) def is_job_alive(self): rez = get_oargrid_job_info(self.oargrid_job_id) return (rez["start_date"] + rez["walltime"] > time.time()) def get_parameters(self, file_name): """Get the parameters to sweep, from the configuration file""" tree = ET.parse(file_name) rootSrc = tree.getroot() param = dict() for inst in rootSrc.iter("instance"): ty = inst.get("type") qt = inst.get("quantity") if (qt.isdigit()): param[ty] = qt else: ends = qt.split("-") param[ty] = range(int(ends[0]), int(ends[1]) + 1) return param
"nbr_clients"] * 1000 * params["pause"] #Function to pass in parameter to ParamSweeper.get_next() #Give the illusion that the Set of params is sorted by nbr_clients def sort_params_by_nbr_clients(set): return sorted((list(set)), key=lambda k: k['nbr_clients']) if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) sweeps = sweep(PARAMETERS) sweeper = ParamSweeper( # Maybe puts the sweeper under the experimentation directory # This should be current/sweeps persistence_dir=os.path.join("%s/sweeps" % TEST_DIR), sweeps=sweeps, save_sweeps=True, name="test_case_1") #Get the next parameter in the set of all remaining params #This set is temporary viewed as sorted List with this filter function. params = sweeper.get_next(sort_params_by_nbr_clients) while params: if not accept(params): # skipping element # Note that the semantic of sweeper.skip is different sweeper.done(params) params = sweeper.get_next(sort_params_by_nbr_clients) continue # cleaning old backup_dir
class mpi_bench(Engine):

    def run(self):
        """Inherited method, put here the code for running the engine"""
        self.define_parameters()
        if self.prepare_bench():
            logger.info('Bench prepared on all frontends')
            self.run_xp()

    def define_parameters(self):
        """Create the iterator on the parameters combinations to be explored"""
        # fixed number of nodes
        self.n_nodes = 4
        # choose a list of clusters
        clusters = ['graphene', 'petitprince', 'edel', 'paradent', 'stremi']
        #clusters = ['petitprince', 'paradent']
        # compute the maximum number of cores among all clusters
        max_core = self.n_nodes * max([
            get_host_attributes(cluster + '-1')['architecture']['smt_size']
            for cluster in clusters])
        # define the parameters
        self.parameters = {
            'cluster': clusters,
            'n_core': filter(lambda i: i >= self.n_nodes,
                             list(takewhile(lambda i: i < max_core,
                                            (2**i for i in count(0, 1))))),
            'size': ['A', 'B', 'C']
        }
        logger.info(self.parameters)
        # define the iterator over the parameters combinations
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))
        logger.info('Number of parameters combinations %s'
                    % len(self.sweeper.get_remaining()))

    def prepare_bench(self):
        """bench configuration and compilation, copy binaries to frontends

        return True if preparation is ok
        """
        logger.info("preparation: configure and compile benchmark")
        # the involved sites. We will do the compilation on the first of these.
        sites = list(set(map(get_cluster_site, self.parameters['cluster'])))
        # generate the bench compilation configuration
        bench_list = '\n'.join(['lu\t%s\t%s' % (size, n_core)
                                for n_core in self.parameters['n_core']
                                for size in self.parameters['size']])
        # Reserving a node because compiling on the frontend is forbidden
        # and because we need mpif77
        jobs = oarsub([(OarSubmission(resources="nodes=1",
                                      job_type='allow_classic_ssh',
                                      walltime='0:10:00'), sites[0])])
        if jobs[0][0]:
            try:
                logger.info("copying bench archive to %s" % (sites[0],))
                copy_bench = Put([sites[0]], ['NPB3.3-MPI.tar.bz2']).run()
                logger.info("extracting bench archive on %s" % (sites[0],))
                extract_bench = Remote('tar -xjf NPB3.3-MPI.tar.bz2',
                                       [sites[0]]).run()
                logger.info("waiting job start %s" % (jobs[0],))
                wait_oar_job_start(*jobs[0], prediction_callback=pred_cb)
                logger.info("getting nodes of %s" % (jobs[0],))
                nodes = get_oar_job_nodes(*jobs[0])
                logger.info("configure bench compilation")
                conf_bench = Remote('echo "%s" > ~/NPB3.3-MPI/config/suite.def'
                                    % bench_list, nodes).run()
                logger.info("compil bench")
                compilation = Remote('cd NPB3.3-MPI && make clean && make suite',
                                     nodes).run()
                logger.info("compil finished")
            except:
                logger.error("unable to compile bench")
                return False
            finally:
                oardel(jobs)
        # Copying binaries to all other frontends
        frontends = sites[1:]
        rsync = Remote('rsync -avuP ~/NPB3.3-MPI/ {{frontends}}:NPB3.3-MPI',
                       [get_host_site(nodes[0])] * len(frontends))
        rsync.run()
        return compilation.ok and rsync.ok

    def run_xp(self):
        """Iterate over the parameters and execute the bench"""
        while len(self.sweeper.get_remaining()) > 0:
            comb = self.sweeper.get_next()
            if comb['n_core'] > get_host_attributes(comb['cluster'] + '-1')['architecture']['smt_size'] * self.n_nodes:
                self.sweeper.skip(comb)
                continue
            logger.info('Processing new combination %s' % (comb,))
            site = get_cluster_site(comb['cluster'])
            jobs = oarsub([(OarSubmission(
                resources="{cluster='" + comb['cluster'] + "'}/nodes="
                          + str(self.n_nodes),
                job_type='allow_classic_ssh',
                walltime='0:10:00'), site)])
            if jobs[0][0]:
                try:
                    wait_oar_job_start(*jobs[0])
                    nodes = get_oar_job_nodes(*jobs[0])
                    bench_cmd = 'mpirun -H %s -n %i %s ~/NPB3.3-MPI/bin/lu.%s.%i' % (
                        ",".join([node.address for node in nodes]),
                        comb['n_core'],
                        get_mpi_opts(comb['cluster']),
                        comb['size'],
                        comb['n_core'])
                    lu_bench = SshProcess(bench_cmd, nodes[0])
                    lu_bench.stdout_handlers.append(self.result_dir + '/'
                                                    + slugify(comb) + '.out')
                    lu_bench.run()
                    if lu_bench.ok:
                        logger.info("comb ok: %s" % (comb,))
                        self.sweeper.done(comb)
                        continue
                finally:
                    oardel(jobs)
            logger.info("comb NOT ok: %s" % (comb,))
            self.sweeper.cancel(comb)
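# prepare_bench() passes a pred_cb callback to wait_oar_job_start(), but that
# callback is not shown in these snippets. A minimal plausible version, assuming
# the callback simply receives the predicted start timestamp to log:
from execo.time_utils import format_date
from execo_engine import logger

def pred_cb(ts):
    logger.info("job start prediction: %s", format_date(ts))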
class raevol_matrix(Engine): def __init__(self): """Overloading class initialization with parent and adding options""" super(raevol_matrix, self).__init__() self.options_parser.set_usage("usage: %prog ") self.options_parser.set_description("Execo Engine that can be used to" + \ "perform automatic virtual machines experiments") self.options_parser.add_option("-n", dest="n_nodes", help="maximum number of nodes used", type="int", default=200) self.options_parser.add_option("-w", dest="walltime", help="walltime for the reservation", type="string", default="02:00:00") self.options_parser.add_option( "-j", dest="oargrid_job_id", help="oargrid_job_id to relaunch an engine", type=int, default=None) self.options_parser.add_option("-k", dest="keep_alive", help="keep reservation alive ..", action="store_true") self.options_parser.add_option("-u", dest="selected_cluster", help="run on a specific cluster.", type="string", default="taurus") self.options_parser.add_option("-o", dest="outofchart", help="Run the engine outside days", action="store_true") self.options_parser.add_option( "-s", dest="storage5k_job_id", help="storage5k_job_id to store the data", type=int) def run(self): """ """ if self.options.oargrid_job_id is not None: self.oar_job_id = self.options.oargrid_job_id else: self.oar_job_id = None self.list_of_clusters = [ 'parasilo', 'paravance', 'parapluie', 'paranoia' ] try: # Creation of the main iterator which is used for the first control loop. self.define_parameters() self.working_dir = '/data/jorouzaudcornabas_' + str( self.options.storage5k_job_id) job_is_dead = False # While there are combinations to treat while len(self.sweeper.get_remaining()) > 0: # If no job, we make a reservation and prepare the hosts for the experiments if self.oar_job_id is None: self.submit_all_available_best_effort( self.list_of_clusters, self.options.walltime) # self.make_reservation_local() # Wait that the job starts logger.info('Waiting that the job start ' + str(self.oar_job_id)) wait_oar_job_start(self.oar_job_id) # Retrieving the hosts and subnets parameters self.hosts = get_oar_job_nodes(self.oar_job_id) # Hosts deployment and configuration default_connection_params['user'] = '******' logger.info("Start hosts configuration") ex_log.setLevel('INFO') #=============================================================== # deployment = Deployment(hosts = self.hosts, # env_file='/home/sirimie/env/mywheezy-x64-base.env') # self.hosts, _ = deploy(deployment) #=============================================================== if len(self.hosts) == 0: break # Initializing the resources and threads available_hosts = self.hosts threads = {} # Creating the unique folder for storing the results comb_dir = self.result_dir + '/logs' if not os.path.exists(comb_dir): os.mkdir(comb_dir) logger.info("Starting the thread " + str(self.is_job_alive()) + " " + str(len(threads.keys()))) # Checking that the job is running and not in Error while self.is_job_alive() or len(threads.keys()) > 0: job_is_dead = False while self.options.n_nodes > len(available_hosts): tmp_threads = dict(threads) for t in tmp_threads: if not t.is_alive(): available_hosts.append(tmp_threads[t]['host']) del threads[t] sleep(5) if not self.is_job_alive(): job_is_dead = True break if job_is_dead: break # Getting the next combination comb = self.sweeper.get_next() if not comb: while len(threads.keys()) > 0: tmp_threads = dict(threads) for t in tmp_threads: if not t.is_alive(): del threads[t] logger.info('Waiting for threads to complete') sleep(20) break host = 
available_hosts[0] available_hosts = available_hosts[1:] logger.info("Launching thread") t = Thread(target=self.workflow, args=(comb, host, comb_dir)) threads[t] = {'host': host} t.daemon = True t.start() if not self.is_job_alive(): job_is_dead = True if job_is_dead: self.oar_job_id = None finally: if self.oar_job_id is not None: if not self.options.keep_alive: logger.info('Deleting job') oardel([self.oar_job_id]) else: logger.info('Keeping job alive for debugging') def define_parameters(self): """ """ parameters = { 'seed': [ 51456165, 33263658, 7158785, 456847894, 1223144, 878944, 121145, 3587842 ], 'mutation': ['5e-4', '1e-4', '5e-5', '5e-6'], 'env': ['const', 'lat_3', 'lat_all'], 'selection': [750, 2000, 4000] } sweeps = sweep(parameters) self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps) logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining())) def make_reservation(self): """ """ logger.info('Performing reservation') starttime = int(time.time() + timedelta_to_seconds(datetime.timedelta(minutes=1))) planning = get_planning(elements=[self.options.selected_cluster], starttime=starttime) slots = compute_slots(planning, self.options.walltime) wanted = {self.options.selected_cluster: 0} start_date, end_date, resources = find_first_slot(slots, wanted) wanted[self.options.selected_cluster] = resources[ self.options.selected_cluster] actual_resources = distribute_hosts(resources, wanted) job_specs = get_jobs_specs(actual_resources, name='Aevol_diff_area') logger.info("try to reserve " + str(actual_resources)) self.oargrid_job_id, _ = oargridsub( job_specs, walltime=end_date - start_date, job_type=['besteffort"' 'allow_classic_ssh']) logger.info("Reservation done") def make_reservation_local(self): """Perform a reservation of the required number of nodes, with 4000 IP. 
""" logger.info('Performing reservation') starttime = int(time.time() + timedelta_to_seconds(datetime.timedelta(minutes=1))) endtime = int( starttime + timedelta_to_seconds(datetime.timedelta(days=3, minutes=1))) self.cluster = self.options.selected_cluster startdate, n_nodes = self._get_nodes(starttime, endtime) while not n_nodes: logger.info('No enough nodes found between %s and %s, ' + \ 'increasing time window', format_date(starttime), format_date(endtime)) starttime = endtime endtime = int( starttime + timedelta_to_seconds(datetime.timedelta(days=3, minutes=1))) startdate, n_nodes = self._get_nodes(starttime, endtime) if starttime > int(time.time() + timedelta_to_seconds(datetime.timedelta( weeks=6))): logger.error('There are not enough nodes on %s for your ' + \ 'experiments, abort ...', self.cluster) exit() startdate = [] jobs_specs = get_jobs_specs({self.cluster: n_nodes}, name=self.__class__.__name__) sub = jobs_specs[0][0] tmp = str(sub.resources).replace('\\', '') sub.resources = tmp.replace('"', '') sub.walltime = self.options.walltime sub.additional_options = '-t allow_classic_ssh -t besteffort' (self.oar_job_id, self.frontend) = oarsub(jobs_specs)[0] logger.info('Startdate: besteffort, n_nodes: %s', str(n_nodes)) def _get_nodes(self, starttime, endtime): """ """ planning = get_planning(elements=[self.cluster], starttime=starttime, endtime=endtime, out_of_chart=self.options.outofchart) slots = compute_slots(planning, self.options.walltime) startdate = slots[0][0] i_slot = 0 n_nodes = slots[i_slot][2][self.cluster] logger.info("nodes %s in %s at %s", str(n_nodes), str(self.cluster), format_date(startdate)) while n_nodes < self.options.n_nodes: logger.debug(slots[i_slot]) startdate = slots[i_slot][0] n_nodes = slots[i_slot][2][self.cluster] i_slot += 1 if i_slot == len(slots) - 1: return False, False return startdate, n_nodes def workflow(self, comb, host, comb_dir): """ """ comb_ok = False thread_name = style.Thread(str(host).split('.')[0]) + ': ' logger.info(thread_name + 'Starting combination ' + slugify(comb)) try: self.export = "source ~/aevol_binary/intel/linux/mkl/bin/mklvars.sh intel64; " bucketname = self.working_dir + '/raevol_5_mut_lat/' + slugify( comb) + '/' if os.path.isdir(bucketname) and os.path.exists(bucketname + '/last_gener.txt'): logger.info(thread_name + "Resuming AEVOL from NFS backup") gen_file = open(bucketname + '/last_gener.txt', 'r') last_gen = gen_file.read() if int(last_gen) < 500000: logger.info(thread_name + "Resuming AEVOL Run from " + str(int(last_gen))) rem = Remote( self.export + 'cd ' + bucketname + '; /home/jorouzaudcornabas/aevol_binary/aevol/src/aevol_run -p 16' + ' -e 300000 -r ' + last_gen + ' >> aevol_run.log', [host]).run() if rem.ok: comb_ok = True else: comb_ok = True else: Remote('mkdir -p ' + bucketname, [host]).run() param_file = '/home/jorouzaudcornabas/aevol_binary/execo/mut_lat/param_tmpl.in' logger.info(thread_name + 'Generate config file ' + param_file) f_template = open(param_file) fd, outfile = mkstemp(dir='/tmp/', prefix=slugify(comb) + '_param') f = os.fdopen(fd, 'w') for line in f_template: line = line.replace('SEED_NUMBER', str(comb['seed'])) line = line.replace('FUZZY_VERSION', str(comb['fuzzy'])) if comb['move']: line = line.replace('FIRST_GAUSSIAN_MEDIAN', '0.25') line = line.replace('THIRD_GAUSSIAN_MEDIAN', '0.65') else: line = line.replace('FIRST_GAUSSIAN_MEDIAN', '0.2') line = line.replace('THIRD_GAUSSIAN_MEDIAN', '0.6') line = line.replace('GAUSSIAN_HEIGHT', str(comb['height'])) f.write(line) f_template.close() 
f.close() put_file = Put([host], [outfile], remote_location=bucketname).run() if not put_file.ok: exit() os.remove(outfile) Remote( 'cd ' + bucketname + '; cp ' + outfile.split('/')[-1] + ' param.in', [host]).run() logger.info(thread_name + "Launching AEVOL Create") Remote( self.export + 'cd ' + bucketname + '; /home/jorouzaudcornabas/aevol_diff_area/aevol/src/aevol_create > aevol_create.log', [host]).run() logger.info(thread_name + "Launching AEVOL Run") rem = Remote( self.export + 'cd ' + bucketname + '; /home/jorouzaudcornabas/aevol_diff_area/aevol/src/aevol_run -p 16 -n 500000 > aevol_run.log', [host]).run() if rem.ok: comb_ok = True logger.info(thread_name + 'Get results ' + comb_dir + "/" + slugify(comb)) #try: #os.mkdir(comb_dir + "/" + slugify(comb)) #except: #logger.warning(thread_name + #'%s already exists, removing existing files', comb_dir + "/" + slugify(comb)) #shutil.rmtree(comb_dir+ "/" + slugify(comb)) #try: #os.mkdir(comb_dir + "/" + slugify(comb)) #except: #logger.warning(thread_name + #'%s already exists, recreating directory', comb_dir + "/" + slugify(comb)) #get_results = Get([host], [bucketname+ "/aevol_create.log", bucketname+ "/aevol_run.log", bucketname+'/stats/'], #local_location=comb_dir + "/" + slugify(comb)).run() #for p in get_results.processes: #if not p.ok: #logger.error(thread_name + #': Unable to retrieve the files for combination %s', #slugify(comb)) #exit() finally: if comb_ok: self.sweeper.done(comb) # shutil.rmtree(bucketname) logger.info(thread_name + ': ' + slugify(comb) + \ ' has been done') else: self.sweeper.cancel(comb) logger.warning(thread_name + ': ' + slugify(comb) + \ ' has been canceled') logger.info(style.step('%s Remaining'), len(self.sweeper.get_remaining())) def is_job_alive(self): rez = get_oar_job_info(self.oar_job_id) if rez['state'] == 'Error': return False if (rez["start_date"] + rez["walltime"] > time.time()): return True else: return False def get_immediately_available_nodes(self, list_of_clusters, walltime, ignore_besteffort=False): planning = get_planning(list_of_clusters, ignore_besteffort=False) slots = compute_slots(planning, walltime) wanted = {cluster: 0 for cluster in list_of_clusters} start_date, end_date, resources = find_first_slot(slots, wanted) actual_resources = { resource: n_nodes for resource, n_nodes in resources.iteritems() if resource in list_of_clusters and n_nodes > 0 } return start_date, end_date, actual_resources def submit_all_available_best_effort(self, list_of_clusters, walltime): start_date, end_date, resources = self.get_immediately_available_nodes( list_of_clusters, walltime, ignore_besteffort=False) job_specs = get_jobs_specs(resources) for j, f in job_specs: j.job_type = "besteffort" j.additional_options = "-t allow_classic_ssh" #jobs = oarsub(job_specs) (self.oar_job_id, self.frontend) = oarsub(job_specs)[0] print job_specs return self.oar_job_id
def run(self): """Run the experiment""" already_configured = self.options.already_configured reservation_job_id = int(self.options.reservation_id) \ if self.options.reservation_id is not None else None is_a_test = self.options.is_a_test if is_a_test: logger.warn('THIS IS A TEST! This run will use only a few ' 'resources') # make the result folder writable for all os.chmod(self.result_dir, 0o777) # Import configuration with open(self.args[0]) as config_file: config = json.load(config_file) # backup configuration copy(self.args[0], self.result_dir) site = config["grid5000_site"] resources = config["resources"] nb_experiment_nodes = config["nb_experiment_nodes"] walltime = str(config["walltime"]) env_name = config["kadeploy_env_name"] workloads = config["workloads"] # check if workloads exists (Suppose that the same NFS mount point # is present on the remote and the local environment for workload_file in workloads: with open(workload_file): pass # copy the workloads files to the results dir copy(workload_file, self.result_dir) # define the workloads parameters self.parameters = {'workload_filename': workloads} logger.info('Workloads: {}'.format(workloads)) # define the iterator over the parameters combinations self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweep(self.parameters)) # Due to previous (using -c result_dir) run skip some combination logger.info('Skipped parameters:' + '{}'.format(str(self.sweeper.get_skipped()))) logger.info('Number of parameters combinations {}'.format( str(len(self.sweeper.get_remaining())))) logger.info('combinations {}'.format(str( self.sweeper.get_remaining()))) if reservation_job_id is not None: jobs = [(reservation_job_id, site)] else: jobs = oarsub([(OarSubmission(resources=resources, job_type='deploy', walltime=walltime), site)]) job_id, site = jobs[0] if job_id: try: logger.info("waiting job start %s on %s" % (job_id, site)) wait_oar_job_start(job_id, site, prediction_callback=prediction_callback) logger.info("getting nodes of %s on %s" % (job_id, site)) nodes = get_oar_job_nodes(job_id, site) # sort the nodes nodes = sorted(nodes, key=lambda node: node.address) # get only the necessary nodes under the switch if nb_experiment_nodes > len(nodes): raise RuntimeError('The number of given node in the ' 'reservation ({}) do not match the ' 'requested resources ' '({})'.format(len(nodes), nb_experiment_nodes)) nodes = nodes[:nb_experiment_nodes] logger.info("deploying nodes: {}".format(str(nodes))) deployed, undeployed = deploy( Deployment(nodes, env_name=env_name), check_deployed_command=already_configured) if undeployed: logger.warn("NOT deployed nodes: {}".format( str(undeployed))) raise RuntimeError('Deployement failed') if not already_configured: # install OAR install_cmd = "apt-get update; apt-get install -y " node_packages = "oar-node" logger.info("installing OAR nodes: {}".format( str(nodes[1:]))) install_oar_nodes = Remote( install_cmd + node_packages, nodes[1:], connection_params={'user': '******'}) install_oar_nodes.start() server_packages = ( "oar-server oar-server-pgsql oar-user " "oar-user-pgsql postgresql python3-pip " "libjson-perl postgresql-server-dev-all") install_oar_sched_cmd = """ mkdir -p /opt/oar_sched; \ cd /opt/oar_sched; \ git clone https://github.com/oar-team/oar3.git; \ cd oar3; \ git checkout dce942bebc2; \ pip3 install -e .; \ cd /usr/lib/oar/schedulers; \ ln -s /usr/local/bin/kamelot; \ pip3 install psycopg2 """ logger.info("installing OAR server node: {}".format( str(nodes[0]))) install_master = 
SshProcess( install_cmd + server_packages + ";" + install_oar_sched_cmd, nodes[0], connection_params={'user': '******'}) install_master.run() install_oar_nodes.wait() if not install_master.ok: Report(install_master) configure_oar_cmd = """ sed -i \ -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \ -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \ -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \ -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \ -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \ -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \ -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \ -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \ -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \ -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \ -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \ -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \ -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \ -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \ /etc/oar/oar.conf """.format(result_dir=self.result_dir) configure_oar = Remote(configure_oar_cmd, nodes, connection_params={'user': '******'}) configure_oar.run() logger.info("OAR is configured on all nodes") # Configure server create_db = "oar-database --create --db-is-local" config_oar_sched = ( "oarnotify --remove-queue default;" "oarnotify --add-queue default,1,kamelot") start_oar = "systemctl start oar-server.service" logger.info("configuring OAR database: {}".format( str(nodes[0]))) config_master = SshProcess( create_db + ";" + config_oar_sched + ";" + start_oar, nodes[0], connection_params={'user': '******'}) config_master.run() # propagate SSH keys logger.info("configuring OAR SSH") oar_key = "/tmp/.ssh" Process('rm -rf ' + oar_key).run() Process( 'scp -o BatchMode=yes -o PasswordAuthentication=no ' '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ' '-o ConnectTimeout=20 -rp -o User=root ' + nodes[0].address + ":/var/lib/oar/.ssh" ' ' + oar_key).run() # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run() Put(nodes[1:], [oar_key], "/var/lib/oar/", connection_params={ 'user': '******' }).run() add_resources_cmd = """ oarproperty -a cpu || true; \ oarproperty -a core || true; \ oarproperty -c -a host || true; \ oarproperty -a mem || true; \ """ for node in nodes[1:]: add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format( node=node.address) add_resources = SshProcess( add_resources_cmd, nodes[0], connection_params={'user': '******'}) add_resources.run() if add_resources.ok: logger.info("oar is now configured!") else: raise RuntimeError( "error in the OAR configuration: Abort!") # TODO backup de la config de OAR # Do the replay logger.info('begining the replay') while len(self.sweeper.get_remaining()) > 0: combi = self.sweeper.get_next() workload_file = os.path.basename( combi['workload_filename']) oar_replay = SshProcess( script_path + "/oar_replay.py " + combi['workload_filename'] + " " + self.result_dir + " oar_gant_" + workload_file, nodes[0]) oar_replay.stdout_handlers.append(self.result_dir + '/' + workload_file + '.out') logger.info("replaying workload: {}".format(combi)) oar_replay.run() if oar_replay.ok: logger.info("Replay workload OK: {}".format(combi)) self.sweeper.done(combi) else: logger.info("Replay workload NOT OK: {}".format(combi)) self.sweeper.cancel(combi) raise RuntimeError("error in the OAR replay: Abort!") except: traceback.print_exc() ipdb.set_trace() finally: if is_a_test: ipdb.set_trace() if reservation_job_id is None: logger.info("delete job: 
{}".format(jobs)) oardel(jobs)
class l2c_fft(Engine): workingPath = '/home/jrichard/l2c-fft-new-distrib/bin' genLadScript = '/home/jrichard/l2c-fft-new-distrib/src/utils/gen-lad/genPencil.py' def run(self): """ Main engine method to perform the experiment """ self.define_parameters() while len(self.sweeper.get_remaining()) > 0: # Getting the next combination comb = self.sweeper.get_next() logger.info(style.host(slugify(comb)) + ' has been started') self.get_nodes(comb) # If the job is broken, the program is stopped if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': break try: self.workflow(comb) # Process all combinations that can use the same submission while True: # Find the next combination combinations that can use the same submission subcomb = self.sweeper.get_next(lambda r: filter(lambda x: x['cores'] == comb['cores'] and x['cluster'] == comb['cluster'], r)) if not subcomb: logger.info('No more combination for cluster=%s and cores=%s', comb['cluster'], comb['cores']) break else: logger.info(style.host(slugify(subcomb)) + ' has been started') if get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error': self.workflow(subcomb) else: break # Whatever happens (errors, end of loop), the job is deleted finally: logger.info('Deleting job...') oardel([(self.oar_job_id, self.frontend)]) def workflow(self, comb): """ Compute one application launch using a given parameter group """ comb_ok = False try: # Generate configuration file needed by MPI processes logger.info("Generating assembly file...") py = comb['cores'] / comb['px'] prepare = Process('cd %s && python %s %d %d %d %d %d %s app.lad' % (self.workingPath, self.genLadScript, comb['datasize'], comb['datasize'], comb['datasize'], comb['px'], py, comb['transposition'])) prepare.shell = True prepare.run() # Generate the MPI host file mfile = self.generate_machine_file() # Start L2C lad = "./app.lad" logger.info("Computing...") res = Process("export OAR_JOB_KEY_FILE=~/.oar_key ; cd %s && l2c_loader -M,-machinefile,%s --mpi -c %d %s" % (self.workingPath, mfile, comb['cores'], lad)) res.shell = True res.stdout_handlers.append(os.path.join(self.result_dir, slugify(comb) + '.out')) res.stdout_handlers.append(sys.stdout) res.stderr_handlers.append(os.path.join(self.result_dir, slugify(comb) + '.err')) res.stderr_handlers.append(sys.stderr) res.run() if not res.ok: logger.error('Bad L2C termination') raise Exception('Bad L2C termination') if len(res.stderr) > 0: # WARNING: when L2C cannot find the LAD file or something strange like this logger.warning('Not empty error output') # Clean configuration files logger.info("Removing assembly files...") res = Process('cd %s && rm -f app.lad*' % self.workingPath) res.shell = True res.run() comb_ok = True except Exception: pass finally: if comb_ok: self.sweeper.done(comb) logger.info(style.host(slugify(comb)) + ' has been done') else: self.sweeper.cancel(comb) logger.warning(style.host(slugify(comb)) + ' has been canceled') logger.info(style.step('%s Remaining'), len(self.sweeper.get_remaining())) def define_parameters(self): """ Define the parametters used by the L2C application """ parameters = { 'cluster': [cluster for site in ['grenoble', 'nancy'] for cluster in get_site_clusters(site) if cluster != 'graphite'], 'cores': {i: {'px': expRange(1, i)} for i in expRange(4, 64)}, 'datasize': expRange(256, 256), 'transposition': ['XYZ', 'XZY', 'YXZ', 'YZX', 'ZXY', 'ZYX']} logger.info(pformat(parameters)) sweeps = sweep(parameters) self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps) 
logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining())) def get_nodes(self, comb): """ Perform a submission for a given comb and retrieve the submission node list """ logger.info('Performing submission') n_core = get_host_attributes(comb['cluster'] + '-1')['architecture']['smt_size'] submission = OarSubmission(resources="nodes=%d" % (max(1, comb['cores']/n_core), ), sql_properties="cluster='%s'"%comb['cluster'], job_type="besteffort", name="l2c_fft_eval") self.oar_job_id, self.frontend = oarsub([(submission, get_cluster_site(comb['cluster']))])[0] logger.info("Waiting for job start") wait_oar_job_start(self.oar_job_id, self.frontend) logger.info("Retrieving hosts list") nodes = get_oar_job_nodes(self.oar_job_id, self.frontend) self.hosts = [host for host in nodes for i in range(n_core)] def generate_machine_file(self): """ Generate a machine file used by MPI to know which nodes use during the computation """ fd, mfile = mkstemp(dir='/tmp/', prefix='mfile_') f = os.fdopen(fd, 'w') f.write('\n'.join([host.address for host in self.hosts])) f.close() return mfile
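expRange is used by define_parameters above but is not defined in this excerpt. Judging from its usage (expRange(4, 64), expRange(256, 256), expRange(1, i)), it plausibly returns a geometric progression of powers of two between the two bounds; a sketch under that assumption:

def expRange(start, end, base=2):
    """Hypothetical helper: [start, start*base, ...] up to end (inclusive)."""
    values = []
    v = start
    while v <= end:
        values.append(v)
        v *= base
    return values

# expRange(4, 64)    -> [4, 8, 16, 32, 64]
# expRange(256, 256) -> [256]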
def run(self): # Defining experiment parameters self.parameters = { 'n_clients': [400, 450, 500, 550, 600], 'n_transitions': [10000] } cluster = 'griffon' sweeps = sweep(self.parameters) sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps) server_out_path = os.path.join(self.result_dir, "server.out") self._updateStat(sweeper.stats()) # Loop on the number of nodes while True: # Taking the next parameter combinations comb = sweeper.get_next() if not comb: break # Performing the submission on G5K site = get_cluster_site(cluster) self._log("Output will go to " + self.result_dir) n_nodes = int( math.ceil( float(comb['n_clients']) / EX5.get_host_attributes( cluster + '-1')['architecture']['smt_size'])) + 1 self._log("Reserving {0} nodes on {1}".format(n_nodes, site)) resources = "{cluster=\\'" + cluster + "\\'}/nodes=" + str(n_nodes) submission = EX5.OarSubmission(resources=resources, job_type='allow_classic_ssh', walltime='00:10:00') job = EX5.oarsub([(submission, site)]) self.__class__._job = job # Sometimes oarsub fails silently if job[0][0] is None: print("\nError: no job was created") sys.exit(1) # Wait for the job to start self._log( "Waiting for job {0} to start...\n".format(BOLD_MAGENTA + str(job[0][0]) + NORMAL)) EX5.wait_oar_job_start(job[0][0], job[0][1], prediction_callback=prediction) nodes = EX5.get_oar_job_nodes(job[0][0], job[0][1]) # Deploying nodes #deployment = EX5.Deployment(hosts = nodes, env_file='path_to_env_file') #run_deploy = EX5.deploy(deployment) #nodes_deployed = run_deploy.hosts[0] # Copying active_data program on all deployed hosts EX.Put([nodes[0]], '../dist/active-data-lib-0.1.2.jar', connexion_params={ 'user': '******' }).run() EX.Put([nodes[0]], '../server.policy', connexion_params={ 'user': '******' }).run() # Loop on the number of requests per client process while True: # Split the nodes clients = nodes[1:] server = nodes[0] self._log( "Running experiment with {0} nodes and {1} transitions per client" .format(len(clients), comb['n_transitions'])) # Launching Server on one node out_handler = FileOutputHandler(server_out_path) launch_server = EX.Remote( 'java -jar active-data-lib-0.1.2.jar', [server], stdout_handler=out_handler, stderr_handler=out_handler).start() self._log("Server started on " + server.address) time.sleep(2) # Launching clients rank = 0 n_cores = EX5.get_host_attributes( clients[0])['architecture']['smt_size'] cores = nodes * n_cores cores = cores[ 0:comb['n_clients']] # Cut out the additional cores client_connection_params = { 'taktuk_gateway': 'lyon.grid5000.fr', 'host_rewrite_func': None } self._log("Launching {0} clients...".format(len(cores))) client_cmd = "/usr/bin/env java -cp /home/ansimonet/active-data-lib-0.1.2.jar org.inria.activedata.examples.perf.TransitionsPerSecond " + \ "{0} {1} {2} {3} {4}".format(server.address, 1200, "{{range(len(cores))}}", len(cores), comb['n_transitions']) client_out_handler = FileOutputHandler( os.path.join(self.result_dir, "clients.out")) client_request = EX.TaktukRemote(client_cmd, cores, connexion_params = client_connection_params, \ stdout_handler = client_out_handler, stderr_handler = client_out_handler) client_request.run() if not client_request.ok(): # Some client failed, please panic self._log( "One or more client process failed. Enjoy reading their outputs." 
) self._log( "OUTPUT STARTS -------------------------------------------------\n" ) for process in client_request.processes(): print("----- {0} returned {1}".format( process.host().address, process.exit_code())) if not process.stdout() == "": print(GREEN + process.stdout() + NORMAL) if not process.stderr() == "": print(RED + process.stderr() + NORMAL) print("") self._log( "OUTPUT ENDS ---------------------------------------------------\n" ) sweeper.skip(comb) launch_server.kill() launch_server.wait() else: # Waiting for server to end launch_server.wait() # Getting log files distant_path = OUT_FILE_FORMAT.format( len(cores), comb['n_transitions']) local_path = distant_path EX.Get([server], distant_path).run() EX.Local('mv ' + distant_path + ' ' + os.path.join(self.result_dir, local_path)).run() EX.Get([server], 'client_*.out', local_location=self.result_dir) EX.Remote('rm -f client_*.out', [server]) self._log( "Finishing experiment with {0} clients and {1} transitions per client" .format(comb['n_clients'], comb['n_transitions'])) sweeper.done(comb) sub_comb = sweeper.get_next(filtr=lambda r: filter( lambda s: s["n_clients"] == comb['n_clients'], r)) self._updateStat(sweeper.stats()) if not sub_comb: # Killing job EX5.oar.oardel(job) self.__class__._job = None break else: comb = sub_comb print ""
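This example relies on a few module-level names that are not shown here (BOLD_MAGENTA, GREEN, RED, NORMAL, OUT_FILE_FORMAT, prediction). Plausible placeholder definitions, purely illustrative and not taken from the original source:

# ANSI colour codes and output-file pattern assumed by the example above.
BOLD_MAGENTA = '\033[1;35m'
GREEN = '\033[0;32m'
RED = '\033[0;31m'
NORMAL = '\033[0m'
OUT_FILE_FORMAT = 'transitions_{0}_cores_{1}_transitions.out'

def prediction(ts):
    # Callback passed to EX5.wait_oar_job_start: log the predicted start date.
    print("Job should start around {}".format(time.ctime(ts)))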
class overturn(Engine): def create_sweeper(self): """Define the parameter space and return a sweeper.""" parameters = { 'RA': ['1.e5', '1.e6', '1.e7'], 'RCMB' : [2.], 'KFe' : [0.85, 0.9, 0.95, 0.99] } sweeps = sweep(parameters) self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps) def create_par_file(self, comb): """Create Run directory on remote server and upload par file""" logger.info('Creating and uploading par file') comb_dir = parent_dir + slugify(comb) + '/' logger.info('comb_dir = ' + comb_dir) # Create remote directories make_dirs = SshProcess('mkdir -p ' + comb_dir + 'Img ; mkdir -p ' + comb_dir + 'Op ; ', jobserver).run() # Generate par file par_file = 'par_' + slugify(comb) logger.info('par_file = %s', style.emph(par_file)) nml = f90nml.read('template.nml') nml['refstate']['ra0'] = float(comb['RA']) nml['tracersin']['K_Fe'] = comb['KFe'] nml['geometry']['r_cmb'] = comb['RCMB'] nztot = min(int(2**(math.log10(float(comb['RA']))+1)), 128) nml['geometry']['nztot'] = nztot nml['geometry']['nytot'] = int(math.pi*(comb['RCMB']+0.5)*nztot) nml.write(par_file, force=True) logger.info('Created par file ' + par_file) # Upload par file to remote directory Put([jobserver], [par_file], remote_location=comb_dir).run() SshProcess('cd ' + comb_dir + ' ; mv ' + par_file+ ' par', jobserver).run() logger.info('Done') def submit_job(self, comb): """Use the batch script on psmn""" logger.info('Submit job on '+ jobserver) comb_dir = parent_dir + slugify(comb) + '/' job_sub = SshProcess('cd ' + comb_dir + ' ; /usr/local/bin/qsub /home/stephane/ExamplePBS/batch_single', jobserver).run() return job_sub.stdout.splitlines()[-1].split('.')[0] def is_job_running(self, job_id=None): """ """ get_state = SshProcess('qstat -f ' + str(job_id), jobserver) get_state.ignore_exit_code = True get_state.run() return get_state.ok def retrieve(self): """ """ SshProcess('') def workflow(self, comb): self.create_par_file(comb) job_id = self.submit_job(comb) logger.info('Combination %s will be treated by job %s', slugify(comb), str(job_id)) while self.is_job_running(job_id): sleep(10) self.sweeper.done(comb) def run(self): self.create_sweeper() logger.info('%s parameters combinations to be treated', len(self.sweeper.get_sweeps())) threads = [] while len(self.sweeper.get_remaining()) > 0: comb = self.sweeper.get_next() logger.info('comb = %s', comb) t = Thread(target=self.workflow, args=(comb,)) t.daemon = True threads.append(t) t.start() for t in threads: t.join()
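The overturn engine reads two module-level names, jobserver and parent_dir, that are not defined in this excerpt; placeholder definitions of the kind it seems to expect (values are purely illustrative):

# Module-level settings assumed by the overturn engine above
# (hypothetical values, not taken from the original source).
jobserver = 'batch.example.org'            # SSH entry point of the PBS/qsub server
parent_dir = '/home/stephane/overturn/'    # base directory, one run directory per combination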
@enostask()
def backup(env=None):
    LOG.info(f"Running backup on {env['roles']}")


@enostask()
def destroy(env=None):
    LOG.info(f"Running destroy on {env['roles']}")


# Iterate over a set of parameters
parameters = {"param1": [1, 4], "param2": ["a", "b"]}
sweeps = sweep(parameters)
sweeper = ParamSweeper(
    persistence_dir=str(Path("sweeps")), sweeps=sweeps, save_sweeps=True
)

parameter = sweeper.get_next()
while parameter:
    try:
        deploy()
        bench(parameter)
        backup()
        sweeper.done(parameter)
    except Exception as e:
        traceback.print_exc()
        sweeper.skip(parameter)
    finally:
        destroy()
    parameter = sweeper.get_next()
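Run on its own, the snippet above also needs its imports and the deploy/bench tasks it calls. A sketch of those missing pieces, assuming the @enostask decorator comes from EnOSlib's enoslib.task module and that deploy and bench are tasks of the same shape as backup and destroy (all of this is an assumption, not part of the original snippet):

import logging
import traceback
from pathlib import Path

from enoslib.task import enostask
from execo_engine import ParamSweeper, sweep

LOG = logging.getLogger(__name__)


@enostask()
def deploy(env=None):
    # Hypothetical task: provision the testbed described in env.
    LOG.info(f"Running deploy on {env['roles']}")


@enostask()
def bench(parameter, env=None):
    # Hypothetical task: run one benchmark for the given parameter combination.
    LOG.info(f"Running bench {parameter} on {env['roles']}")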
class compil_aevol(Engine): def __init__(self): """Overloading class initialization with parent and adding options""" super(compil_aevol, self).__init__() def run(self): """ """ try: # Creation of the main iterator which is used for the first control loop. self.define_parameters() # While there are combinations to treat while len(self.sweeper.get_remaining()) > 0: comb = self.sweeper.get_next() if comb: self.workflow(comb) finally: logger.info("Compilation DONE") def define_parameters(self): """ """ parameters = { 'blas' : ['none','mkl','atlas','openblas'], 'experiment' : ['aevol','raevol'], 'compilator' : ['gcc','intel'], 'parallel' : ['openmp','tbb'] } sweeps = sweep(parameters) self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps) logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining())) def workflow(self, comb): """ """ comb_ok = False logger.info(slugify(comb) + \ ' starts to compile') try: export = "source /opt/intel/bin/compilervars.sh intel64; " src_directory = "/home/arrouan/workspace/aevol/git/world/aevol/" bin_directory = "/home/arrouan/workspace/aevol/compiled_binary/" configure_option = "--with-tracing --without-x" if comb['parallel'] == 'tbb': configure_option += " --with-tbb" if comb['blas'] == 'openblas': configure_option += " --with-blas" elif comb['blas'] == 'mkl': configure_option += " --with-mkl" elif comb['blas'] == 'atlas': configure_option += " --with-atlas" if comb['experiment'] == 'raevol': configure_option += " --with-raevol" if comb['compilator'] == 'intel': configure_option += " CXX=icc" full_bin_directory = bin_directory + comb['experiment']+'_'+comb['compilator']+'_'+comb['parallel']+'_'+comb['blas'] try: os.mkdir(full_bin_directory) except: for f in os.listdir(full_bin_directory): os.remove(full_bin_directory + "/" + f) p = Process(export+'cd '+src_directory+'; autoreconf; ./configure '+configure_option+'; make clean; make; cp src/aevol_run '+full_bin_directory+'/; cp src/aevol_create '+full_bin_directory+'/') p.shell = True # p.run() print p.stdout comb_ok = True finally: if comb_ok: self.sweeper.done(comb) logger.info(slugify(comb) + \ ' has been done') else: self.sweeper.cancel(comb) logger.warning(slugify(comb) + \ ' has been canceled') logger.info(style.step('%s Remaining'), len(self.sweeper.get_remaining())) def is_job_alive(self): rez=get_oar_job_info(self.oar_job_id) if (rez["start_date"]+rez["walltime"] > time.time()): return True else: return False
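Like the other Engine subclasses in this collection, compil_aevol is meant to be launched through execo_engine's Engine.start(), which parses the command line, creates the result directory and then calls run(). A typical entry point would look like this (a sketch, not part of the original snippet):

if __name__ == "__main__":
    engine = compil_aevol()
    engine.start()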
def campaign(broker, provider, conf, test, env): def generate_id(params): def clean(s): return str(s).replace("/", "_sl_") \ .replace(":", "_sc_") return "-".join([ "%s__%s" % (clean(k), clean(v)) for k, v in sorted(params.items()) ]) def accept(params): call_ratio_max = 3 cast_ratio_max = 3 call_type = params["call_type"] if params["nbr_servers"] > params["nbr_clients"]: return False if call_type == "rpc-call": if not params["pause"]: # maximum rate return call_ratio_max * params["nbr_servers"] >= params[ "nbr_clients"] else: # we can afford more clients # based on our estimation a client sends 200msgs at full rate return call_ratio_max * params["nbr_servers"] >= params[ "nbr_clients"] * 200 * params["pause"] else: if not params["pause"]: # maximum rate return cast_ratio_max * params["nbr_servers"] >= params[ "nbr_clients"] else: # we can afford more clients # based on our estimation a client sends 200msgs at full rate return cast_ratio_max * params["nbr_servers"] >= params[ "nbr_clients"] * 1000 * params["pause"] # Function to pass in parameter to ParamSweeper.get_next() # Give the illusion that the Set of params is sorted by nbr_clients def sort_params_by_nbr_clients(set): return sorted((list(set)), key=lambda k: k['nbr_clients']) # Dump each params in the backup dir def dump_param(params): if not os.path.exists("%s/params.json" % test): with open("%s/params.json" % test, 'w') as outfile: json.dump([], outfile) #Add the current params to the json with open("%s/params.json" % test, 'r') as outfile: all_params = json.load(outfile) all_params.append(params) with open("%s/params.json" % test, 'w') as outfile: json.dump(all_params, outfile) # Loading the conf config = {} with open(conf) as f: config = yaml.load(f) parameters = config["campaign"][test] sweeps = sweep(parameters) filtered_sweeps = [param for param in sweeps if accept(param)] sweeper = ParamSweeper( # Maybe puts the sweeper under the experimentation directory # This should be current/sweeps persistence_dir=os.path.join("%s/sweeps" % test), sweeps=filtered_sweeps, save_sweeps=True, name=test) params = sweeper.get_next(sort_params_by_nbr_clients) PROVIDERS[provider](broker=broker, config=config, env=test) t.inventory() while params: params.pop("backup_dir", None) params.update({"backup_dir": generate_id(params)}) t.prepare(broker=broker) t.test_case_1(**params) sweeper.done(params) dump_param(params) params = sweeper.get_next(sort_params_by_nbr_clients) t.destroy()
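campaign() also refers to several names that are defined elsewhere in the original project; the comments below list plausible stand-ins so the control flow above can be read in isolation (all names are hypothetical):

# Names assumed by campaign() but not defined in this snippet:
#
#   import os, json, yaml, traceback
#   from execo_engine import ParamSweeper, sweep
#   import tasks as t                 # provides t.inventory, t.prepare, t.test_case_1, t.destroy
#   PROVIDERS = {"g5k": deploy_g5k}   # maps a provider name to a deployment function
#
# Note: with PyYAML >= 5.1, yaml.load(f) needs an explicit Loader,
# e.g. yaml.safe_load(f).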
class overturn(Engine): def create_sweeper(self): """Define the parameter space and return a sweeper.""" parameters = { 'RA': ['1.e5', '1.e6'], 'RCMB' : [1.19, 3.29], 'KFe' : [0.85, 0.9] } sweeps = sweep(parameters) self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps) def create_par_file(self, comb): """Create Run directory on remote server and upload par file""" logger.info('Creating par file') comb_dir = parent_dir + slugify(comb) + '/' logger.info('comb_dir = ' + comb_dir) # Create remote directories mdir = sp.call('mkdir -p ' + comb_dir + 'Img ; mkdir -p ' + comb_dir + 'Op ; ', shell=True) # Generate par file par_file = 'par_' + slugify(comb) nml = f90nml.read('template.nml') nml['refstate']['ra0'] = float(comb['RA']) nml['tracersin']['K_Fe'] = comb['KFe'] nml['geometry']['r_cmb'] = comb['RCMB'] nztot = min(int(2**(math.log10(float(comb['RA']))+1)), 128) nml['geometry']['nztot'] = nztot nml['geometry']['nytot'] = int(math.pi*(comb['RCMB']+0.5)*nztot) nml.write(par_file, force=True) logger.info('Created par file ' + par_file) # Upload par file to remote directory cpar = sp.call('cp ' + par_file + ' ' + comb_dir, shell=True) mpar = sp.call('cd ' + comb_dir + ' ; mv ' + par_file+ ' par', shell=True) logger.info('Done') def submit_job(self, comb): """Use the batch script""" logger.info('Submiting job on '+ jobserver) comb_dir = parent_dir + slugify(comb) + '/' job_sub = sp.Popen('cd ' + comb_dir + ' ; /usr/local/bin/qsub /home/stephane/ExamplePBS/batch_single', shell=True, stdout=sp.PIPE, stderr=sp.STDOUT) return job_sub.stdout.readlines()[-1].split('.')[0] def workflow(self, comb): self.create_par_file(comb) job_id = self.submit_job(comb) logger.info('Combination %s will be treated by job %s', slugify(comb), str(job_id)) self.sweeper.done(comb) def run(self): self.create_sweeper() logger.info('%s parameters combinations to be treated', len(self.sweeper.get_sweeps())) threads = [] while len(self.sweeper.get_remaining()) > 0: comb = self.sweeper.get_next() t = Thread(target=self.workflow, args=(comb,)) t.daemon = True threads.append(t) t.start() for t in threads: t.join()
class fp_hadoop(Engine): def __init__(self): """ Surchargement la methode init pour ajouter des options""" super(fp_hadoop, self).__init__() self.options_parser.set_usage("usage: %prog <cluster>") self.options_parser.add_argument("cluster", "The cluster on which to run the experiment") self.options_parser.add_option("-k", dest="keep_alive", help="keep reservation alive ..", action="store_true") self.options_parser.add_option("-j", dest="oar_job_id", help="oar_job_id to relaunch an engine", type=int) self.options_parser.add_option("-o", dest="outofchart", help="Run the engine outside days", action="store_true") self.n_nodes = 10 self.options_parser.add_option("-w", dest="walltime", help="walltime for the reservation", type="string", default="3:00:00") def xp(self, comb): comb_ok = False try: """ tout ton xp """ comb_ok = True finally: if comb_ok: self.sweeper.done(comb) else: self.sweeper.cancel(comb) logger.info(style.step('%s Remaining'), len(self.sweeper.get_remaining())) def run(self): """Inherited method, put here the code for running the engine""" self.define_parameters() self.cluster = self.args[0] self.site = get_cluster_site(self.cluster) if self.options.oar_job_id: self.oar_job_id = self.options.oar_job_id else: self.oar_job_id = None try: # Creation of the main iterator which is used for the first control loop. # You need have a method called define_parameters, that returns a list of parameter dicts self.define_parameters() job_is_dead = False # While they are combinations to treat while len(self.sweeper.get_remaining()) > 0: # If no job, we make a reservation and prepare the hosts for the experiments if job_is_dead or self.oar_job_id is None: self.make_reservation() # Retrieving the hosts and subnets parameters self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend) # Hosts deployment deployed, undeployed = deploy(Deployment(self.hosts, env_file="/home/mliroz/deploys/hadoop6.env")) logger.info("%i deployed, %i undeployed" % (len(deployed), len(undeployed))) if len(deployed) == 0: break # Configuration du systeme => look at the execo_g5k.topology module attr = get_host_attributes(self.cluster + '-1') ## SETUP FINISHED # Getting the next combination comb = self.sweeper.get_next() self.prepare_dataset(comb) self.xp(comb) # subloop over the combinations that have the same sizes while True: newcomb = self.sweeper.get_next(lambda r: filter(lambda subcomb: subcomb['sizes'] == comb['sizes'], r)) if newcomb: try: self.xp(newcomb) except: break else: break if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': job_is_dead = True finally: if self.oar_job_id is not None: if not self.options.keep_alive: logger.info('Deleting job') oardel([(self.oar_job_id, self.frontend)]) else: logger.info('Keeping job alive for debugging') def define_parameters(self): """Create the iterator that contains the parameters to be explored """ self.parameters = { 'sizes': [100], 'zipf': [1], 'pop_keys': [100], 'min_size': [500, 1000], 'int_phases': [1, 2, 3, 4, 5, 10], 'iosf': [100] } logger.info(self.parameters) self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweep(self.parameters)) logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining())) def _get_nodes(self, starttime, endtime): """ """ planning = get_planning(elements=[self.cluster], starttime=starttime, endtime=endtime, out_of_chart=self.options.outofchart) slots = compute_slots(planning, self.options.walltime) startdate = slots[0][0] i_slot = 0 n_nodes = 
slots[i_slot][2][self.cluster]
        while n_nodes < self.n_nodes:
            logger.debug(slots[i_slot])
            startdate = slots[i_slot][0]
            n_nodes = slots[i_slot][2][self.cluster]
            i_slot += 1
            if i_slot == len(slots) - 1:
                return False, False
        return startdate, self.n_nodes

    def make_reservation(self):
        """Perform a reservation of the required number of nodes"""
        logger.info('Performing reservation')
        starttime = int(time.time() +
                        timedelta_to_seconds(datetime.timedelta(minutes=1)))
        endtime = int(starttime +
                      timedelta_to_seconds(datetime.timedelta(days=3, minutes=1)))
        startdate, n_nodes = self._get_nodes(starttime, endtime)
        while not n_nodes:
            logger.info('Not enough nodes found between %s and %s, '
                        'increasing time window',
                        format_date(starttime), format_date(endtime))
            starttime = endtime
            endtime = int(starttime +
                          timedelta_to_seconds(datetime.timedelta(days=3, minutes=1)))
            startdate, n_nodes = self._get_nodes(starttime, endtime)
            if starttime > int(time.time() +
                               timedelta_to_seconds(datetime.timedelta(weeks=6))):
                logger.error('There are not enough nodes on %s for your '
                             'experiments, abort ...', self.cluster)
                exit()
        jobs_specs = get_jobs_specs({self.cluster: n_nodes},
                                    name=self.__class__.__name__)
        sub = jobs_specs[0][0]
        sub.walltime = self.options.walltime
        sub.additional_options = '-t deploy'
        sub.reservation_date = startdate
        (self.oar_job_id, self.frontend) = oarsub(jobs_specs)[0]
        logger.info('Startdate: %s, n_nodes: %s',
                    format_date(startdate), str(n_nodes))