Example #1
def create_paramsweeper(parameters, result_dir):
    """Generate an iterator over combination parameters

    This function initializes a `ParamSweeper` as an iterator over the possible
    parameters space (The dictionary of parameters space is created from the
    `define_parameters` function.). The detail information about the `ParamSweeper`
    can be found here: http://execo.gforge.inria.fr/doc/latest-stable/execo_engine.html#paramsweeper

    Parameters
    ----------
    parameters: dict
        a dictionary describing the parameter space
        key: str, the name of an experiment parameter
        value: list, the possible values for that parameter

    result_dir: str
        the path to the result directory on disk where `ParamSweeper` persists
        the state of the combinations

    Returns
    -------
    ParamSweeper
        an instance of the `ParamSweeper` object.
    """

    logger.debug('Parameters:\n%s' % parameters)
    sweeps = sweep(parameters)
    sweeper = ParamSweeper(os.path.join(result_dir, "sweeps"), sweeps)
    logger.info('-----> TOTAL COMBINATIONS: %s', len(sweeps))
    if len(sweeper.get_remaining()) < len(sweeps):
        logger.info('%s combinations remaining\n' % len(sweeper.get_remaining()))
    return sweeper
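
A minimal consumption sketch for the sweeper returned above (not from the original project): `run_one` is a hypothetical placeholder for a single experiment run, and the done/cancel calls mirror the pattern used in the later examples.

def run_one(comb):
    # hypothetical placeholder: perform one experiment for this combination
    pass

sweeper = create_paramsweeper(parameters, result_dir)
while len(sweeper.get_remaining()) > 0:
    comb = sweeper.get_next()
    if not comb:
        break
    try:
        run_one(comb)
        sweeper.done(comb)    # mark the combination as successfully processed
    except Exception:
        sweeper.cancel(comb)  # put it back into the pool for a later retry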
Example #2
 def define_parameters(self):
     """ """
     parameters = self.get_parameters("conf.xml")
     sweeps = sweep(parameters)
     self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                 sweeps)
     logger.info('Number of parameters combinations %s',
                 len(self.sweeper.get_remaining()))
Example #3
    def define_parameters(self):
        nbNodes = len(self.cluster)
        # build parameters and make nbCore list per benchmark
        freqList = [2534000, 2000000, 1200000]
        n_nodes = float(len(self.cluster))
        max_core = SshProcess('cat /proc/cpuinfo | grep -i processor |wc -l',
                              self.cluster[0],
                              connection_params={
                                  'user': '******'
                              }).run().stdout
        max_core = n_nodes * float(max_core)
        even = filter(
            lambda i: i > n_nodes,
            list(takewhile(lambda i: i < max_core,
                           (2**i for i in count(0, 1)))))
        powerTwo = filter(
            lambda i: i > n_nodes,
            list(takewhile(lambda i: i < max_core,
                           (i**2 for i in count(0, 1)))))

        # Define parameters
        self.parameters = {
            'Repeat': [1],
            "Freq": [2534000],
            "NPBclass": ['C'],
            "Benchmark": {
                # 'ft': {
                #     'n_core': even
                #     },
                # 'ep': {
                #     'n_core': even
                #     },
                # 'lu': {
                #     'n_core': even
                #     },
                # 'is': {
                #     'n_core': even
                #     },
                # 'sg': {
                #     'n_core': even
                #     },
                # 'bt': {
                #     'n_core': powerTwo
                #     },
                'sp': {
                    'n_core': powerTwo
                }
            }
        }

        logger.info(self.parameters)
        # build the iterator over all possible parameter combinations
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))
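
A note on the nested "Benchmark" entry above: `sweep` treats a dict value as a sub-parameter space, so each generated combination carries the benchmark name together with its sub-keys flattened in (which is why run_xp in Example #19 reads comb['Benchmark'] and comb['n_core'] separately). A small sketch of this assumed behaviour:

from execo_engine import sweep

combs = sweep({'Freq': [2534000],
               'Benchmark': {'sp': {'n_core': [4, 16]}}})
# combs should contain combinations such as:
#   {'Freq': 2534000, 'Benchmark': 'sp', 'n_core': 4}
#   {'Freq': 2534000, 'Benchmark': 'sp', 'n_core': 16}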
Example #4
    def define_parameters(self):
        """ """
        parameters = {
            'blas': ['none', 'mkl', 'atlas', 'openblas'],
            'experiment': ['aevol', 'raevol'],
            'compilator': ['gcc', 'intel'],
            'parallel': ['openmp', 'tbb']
        }
        sweeps = sweep(parameters)
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps)
        logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining()))
Example #5
 def define_parameters(self):
     """Create the iterator that contains the parameters to be explored """
     self.parameters = {
         'sizes': [100],
         'zipf': [1],
         'pop_keys': [100],
         'min_size': [500, 1000],
         'int_phases': [1, 2, 3, 4, 5, 10],
         'iosf': [100]
     }
     logger.info(self.parameters)
     self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                 sweep(self.parameters))
     logger.info('Number of parameters combinations %s',
                 len(self.sweeper.get_remaining()))
Example #6
 def define_parameters(self):
     """ """
     parameters = {
         'seed': [
             51456165, 33263658, 7158785, 456847894, 1223144, 878944,
             121145, 3587842
         ],
         'mutation': ['5e-4', '1e-4', '5e-5', '5e-6'],
         'env': ['const', 'lat_3', 'lat_all'],
         'selection': [750, 2000, 4000]
     }
     sweeps = sweep(parameters)
     self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                 sweeps)
     logger.info('Number of parameters combinations %s',
                 len(self.sweeper.get_remaining()))
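
For reference, `sweep` builds the Cartesian product of the value lists, so the number of combinations is the product of the list lengths (8 seeds x 4 mutation rates x 3 environments x 3 selection strengths = 288 in the example above). A self-contained sketch, assuming only execo_engine is available:

from execo_engine import sweep, slugify

parameters = {'mutation': ['5e-4', '1e-4'],
              'selection': [750, 2000, 4000]}
combs = sweep(parameters)
print(len(combs))         # 2 * 3 = 6 combinations
print(slugify(combs[0]))  # slugify turns a combination into a filesystem-friendly name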
Example #7
 def create_paramsweeper(self):
     """Generate an iterator over combination parameters"""
     if self.parameters is None:
         parameters = self.define_parameters()
     logger.detail(pformat(parameters))
     sweeps = sweep(parameters)
     logger.info('% s combinations', len(sweeps))
     self.sweeper = ParamSweeper(path.join(self.result_dir, "sweeps"),
                                 sweeps)
Example #8
 def create_sweeper(self):
     """Define the parameter space and return a sweeper."""
     parameters = {
         'RA': ['1.e5', '1.e6', '1.e7'],
         'RCMB': [2.],
         'KFe': [0.85, 0.9, 0.95, 0.99]
     }
     sweeps = sweep(parameters)
     self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                 sweeps)
Example #9
    def create_paramsweeper(self):
        """Test all the sites, with or without a KaVLAN and for several env."""
        params = {
            "version": ['kadeploy3-dev', 'kadeploy3'],
            "kavlan": [True, False],
            "site": get_g5k_sites(),
            "n_nodes": [1, 4, 10],
            "env": ['wheezy-x64-base', 'wheezy-x64-prod', 'wheezy-x64-xen']
        }
        logger.info('Defining parameters: %s', pformat(params))

        combs = sweep(params)
        return ParamSweeper(self.result_dir + "/sweeper", combs)
Example #10
 def define_parameters(self):
     """Create the iterator that contains the parameters to be explored """
     self.parameters = {
         'sizes': [100],
         'zipf': [1],
         'pop_keys': [100],
         'min_size': [500, 1000],
         'int_phases': [1, 2, 3, 4, 5, 10],
         'iosf': [100]
     }
     logger.info(self.parameters)
     self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), 
                                 sweep(self.parameters))
     logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining()))
Example #11
    def define_parameters(self):
        """
            Define the parametters used by the L2C application
        """
        parameters = {
            'cluster': [
                cluster for site in ['grenoble', 'nancy']
                for cluster in get_site_clusters(site) if cluster != 'graphite'
            ],
            'cores': {i: {'px': expRange(1, i)}
                      for i in expRange(4, 64)},
            'datasize': expRange(256, 256),
            'transposition': ['XYZ', 'XZY', 'YXZ', 'YZX', 'ZXY', 'ZYX']
        }

        logger.info(pformat(parameters))
        sweeps = sweep(parameters)
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweeps)
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))
Example #12
 def define_parameters(self):
     """Create the iterator on the parameters combinations to be explored"""
     # fixed number of nodes
     self.n_nodes = 4
     # choose a list of clusters
     clusters = ['graphene', 'petitprince', 'edel', 'paradent', 'stremi']
     #clusters = ['petitprince', 'paradent']
     # compute the maximum number of cores among all clusters
     max_core = self.n_nodes * max([
             get_host_attributes(cluster + '-1')['architecture']['smt_size']
             for cluster in clusters])
     # define the parameters
     self.parameters = {
         'cluster' : clusters,
         'n_core': filter(lambda i: i >= self.n_nodes,
                          list(takewhile(lambda i: i<max_core,
                                         (2**i for i in count(0, 1))))),
         'size' : ['A', 'B', 'C']
         }
     logger.info(self.parameters)
     # define the iterator over the parameters combinations
     self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                 sweep(self.parameters))
     logger.info('Number of parameters combinations %s' % len(self.sweeper.get_remaining()))
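
To make the 'n_core' expression above concrete: with n_nodes = 4 and nodes exposing 16 hardware threads (an assumed smt_size), max_core is 64, the generator of powers of two is cut at that bound, and the result is filtered to values of at least n_nodes. The same computation as a standalone sketch:

from itertools import count, takewhile

n_nodes = 4
max_core = n_nodes * 16  # assumed smt_size of 16 per node
n_core = [i for i in takewhile(lambda i: i < max_core, (2**k for k in count(0)))
          if i >= n_nodes]
print(n_core)  # [4, 8, 16, 32]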
Example #13
 def define_parameters(self):
     """
         Define the parametters used by the L2C application
     """
     parameters = {
         'cluster': [cluster for site in ['grenoble', 'nancy'] 
                     for cluster in get_site_clusters(site) if cluster != 'graphite'],
         'cores': {i: {'px': expRange(1, i)} 
                   for i in expRange(4, 64)},
         'datasize': expRange(256, 256),
         'transposition': ['XYZ', 'XZY', 'YXZ', 'YZX', 'ZXY', 'ZYX']}
     
     logger.info(pformat(parameters))
     sweeps = sweep(parameters)
     self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps)        
     logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining()))
Example #14
    def run(self):
        """ """
        token = 'bRIJb9jp5igAAAAAAAAACc5QzQ619Vp0pYa2PdIrt0q2y0qFyJgwrKvtzuTp3Sz_'
        client = dropbox.client.DropboxClient(token)
        parameters = {'size': igeom(128, 2048, 5), 'db_if': ['rest', 'sdk']}
        combs = sweep(parameters)
        sweeper = ParamSweeper(self.result_dir + "/sweeps", combs)

        f = open(self.result_dir + '/results.txt', 'w')
        while len(sweeper.get_remaining()) > 0:
            comb = sweeper.get_next()

            logger.info('Treating combination %s', pformat(comb))
            comb_dir = self.result_dir + '/' + slugify(comb)
            try:
                os.mkdir(comb_dir)
            except:
                pass

            fname = self.create_file(comb['size'])

            timer = Timer()

            if comb['db_if'] == 'sdk':
                self.upload_file_sdk(client, fname, fname.split('/')[-1])
                up_time = timer.elapsed()
                self.download_file_sdk(client,
                                       fname.split('/')[-1],
                                       comb_dir + fname.split('/')[-1])
                dl_time = timer.elapsed() - up_time
                sweeper.done(comb)
            elif comb['db_if'] == 'rest':
                logger.warning('REST interface not implemented')
                sweeper.skip(comb)
                continue
            os.remove(fname)
            f.write("%f %i %f %f \n" %
                    (timer.start_date(), comb['size'], up_time, dl_time))
        f.close()
Example #15
    def run(self):
        """ """
        token = 'bRIJb9jp5igAAAAAAAAACc5QzQ619Vp0pYa2PdIrt0q2y0qFyJgwrKvtzuTp3Sz_'
        client = dropbox.client.DropboxClient(token)
        parameters = {'size': igeom(128, 2048, 5),
                      'db_if': ['rest', 'sdk']}
        combs = sweep(parameters)
        sweeper = ParamSweeper(self.result_dir + "/sweeps", combs)

        f = open(self.result_dir + '/results.txt', 'w')
        while len(sweeper.get_remaining()) > 0:
            comb = sweeper.get_next()

            logger.info('Treating combination %s', pformat(comb))
            comb_dir = self.result_dir + '/' + slugify(comb)
            try:
                os.mkdir(comb_dir)
            except:
                pass

            fname = self.create_file(comb['size'])

            timer = Timer()

            if comb['db_if'] == 'sdk':
                self.upload_file_sdk(client, fname, fname.split('/')[-1])
                up_time = timer.elapsed()
                self.download_file_sdk(client, fname.split('/')[-1],
                                       comb_dir + fname.split('/')[-1])
                dl_time = timer.elapsed() - up_time
                sweeper.done(comb)
            elif comb['db_if'] == 'rest':
                logger.warning('REST interface not implemented')
                sweeper.skip(comb)
                continue
            os.remove(fname)
            f.write("%f %i %f %f \n" % (timer.start_date(), comb['size'],
                                        up_time, dl_time))
        f.close()
Example #16
File: bench.py Project: hyptos/kyd
    def run(self):
        """ run method from engine in order to do our workflow """

        mongo = ClientMongo()

        size = dict
        if not self.options.file:
            if not self.options.only:
                size = {
                    1,
                    long(self.options.size * 0.25),
                    long(self.options.size * 0.5),
                    long(self.options.size * 0.75),
                    long(self.options.size)
                }
            else:
                size = {long(self.options.size)}
        else:
            if self.OnlyDownload:
                size = getFilSize(self.options.file)
            else:
                size = {0}

        drive = None
        if self.options.drive:
            drive = self.options.drive
        else:
            drive = self.drive

        interface = ['rest', 'sdk']
        parameters = {
            'size': size,
            'if': interface,
            'drive': drive,
            'transfert': self.transfert
        }

        p = None

        for n in range(0, int(self.options.ntest), 1):
            logger.info('---------------------')
            logger.info('Round %i', n + 1)
            combs = sweep(parameters)
            date = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
            pathResults = os.getcwd() + '/Results/Bench' + date
            sweeper = ParamSweeper(pathResults + "/sweeps", combs)
            f = open(pathResults + '/results.txt', 'w')

            while len(sweeper.get_remaining()) > 0:
                # iterate over the parameters in a fixed order
                for i in interface:
                    for dr in drive:
                        for s in size:
                            comb = sweeper.get_next(filtr=lambda r: filter(
                                lambda x: x['drive'] == dr and x['size'] == s
                                and x['if'] == i, r))
                            if not comb: continue

                            # start of the workflow
                            if comb['drive'] == 'amazon':
                                p = providerS3.ProviderS3()
                            elif comb['drive'] == 'dropbox':
                                p = providerDB.ProviderDB()
                            else:
                                p = providerGD.ProviderGD()

                            logger.info('Treating combination %s',
                                        pformat(comb))
                            comb_dir = pathResults + '/' + slugify(comb)

                            if not os.path.isdir(comb_dir):
                                os.mkdir(comb_dir)

                            if not self.options.file:
                                fname = self.create_file(comb['size'])
                            else:
                                fname = self.options.file

                            timer = Timer()
                            up_time = 0
                            dl_time = 0
                            start_date = datetime.datetime.now()
                            if comb['if'] == 'sdk':
                                if p.provider_name == "amazon":
                                    # AMAZON
                                    clientAmz = p.getConnexion()
                                    if self.OnlyDownload:
                                        p.bucketKey += fname
                                    else:
                                        p.bucketKey += '/' + fname
                                    if comb['transfert'] == "upload" or comb[
                                            'transfert'] == 'upDown':
                                        p.upload_file_sdk(
                                            clientAmz.get_bucket(p.bucketName),
                                            p.bucketKey, fname)
                                    up_time = timer.elapsed()

                                    if comb['transfert'] == "download" or comb[
                                            'transfert'] == 'upDown':
                                        p.download_file_sdk(
                                            clientAmz.get_bucket(p.bucketName),
                                            p.bucketKey, comb_dir + '/' +
                                            fname.split('/')[-1])
                                    dl_time = timer.elapsed() - up_time

                                    if not self.OnlyDownload:
                                        p.delete_file_sdk(
                                            clientAmz.get_bucket(p.bucketName),
                                            p.bucketKey)

                                elif p.provider_name == "dropbox":
                                    # DROPBOX
                                    client = p.getToken()
                                    if comb['transfert'] == "upload" or comb[
                                            'transfert'] == 'upDown':
                                        p.upload_file_sdk(
                                            client, fname,
                                            fname.split('/')[-1])
                                    up_time = timer.elapsed()
                                    if comb['transfert'] == "download" or comb[
                                            'transfert'] == 'upDown':
                                        p.download_file_sdk(
                                            client,
                                            fname.split('/')[-1], comb_dir +
                                            '/' + fname.split('/')[-1])
                                    dl_time = timer.elapsed() - up_time

                                    if not self.OnlyDownload:
                                        p.delete_file(client,
                                                      fname.split('/')[-1])

                                elif p.provider_name == "googledrive":
                                    # GOOGLEDRIVE
                                    drive_service = p.getConnexion()
                                    new_file = None
                                    if comb['transfert'] == 'upload' or comb[
                                            'transfert'] == 'upDown':
                                        new_file = p.upload_file_sdk(
                                            drive_service, fname,
                                            fname.split('/')[-1], 'text/plain')
                                        up_time = timer.elapsed()
                                    if comb['transfert'] == 'download' or comb[
                                            'transfert'] == 'upDown':
                                        p.download_file_sdk(
                                            drive_service, new_file, comb_dir +
                                            '/' + fname.split('/')[-1])
                                        dl_time = timer.elapsed() - up_time

                                    if not self.OnlyDownload:
                                        p.delete_file_sdk(
                                            drive_service, new_file['id'])

                                sweeper.done(comb)
                            elif comb['if'] == 'rest':
                                logger.warning(
                                    'REST interface not implemented')
                                sweeper.skip(comb)
                                if not self.OnlyDownload:
                                    # logger.info('delete de '+fname)
                                    if os.path.isfile(fname):
                                        os.remove(fname)
                                        # delete only if rest is implemented
                                        # os.remove(comb_dir + '/' + fname.split('/')[-1])
                                continue
                            if comb['transfert'] == "upload" or comb[
                                    'transfert'] == "upDown":
                                f.write("%s %s %s %s %s %s %s %f %i %s %f\n" %
                                        (self.localisation['ip'],
                                         self.localisation['lat'],
                                         self.localisation['lon'],
                                         self.localisation['city'],
                                         self.localisation['country'],
                                         comb['drive'], comb['if'],
                                         timer.start_date(), comb['size'],
                                         "upload", up_time))
                                mongo.collection.insert({
                                    'ip':
                                    self.localisation['ip'],
                                    'latitude':
                                    self.localisation['lat'],
                                    'longitude':
                                    self.localisation['lon'],
                                    'city':
                                    self.localisation['city'],
                                    'country':
                                    self.localisation['country'],
                                    'drive':
                                    comb['drive'],
                                    'interface':
                                    comb['if'],
                                    'start_date':
                                    start_date,
                                    'size':
                                    comb['size'],
                                    'transfert':
                                    'upload',
                                    'time':
                                    up_time
                                })

                            if comb['transfert'] == "download" or comb[
                                    'transfert'] == "upDown":
                                f.write("%s %s %s %s %s %s %s %f %i %s %f\n" %
                                        (self.localisation['ip'],
                                         self.localisation['lat'],
                                         self.localisation['lon'],
                                         self.localisation['city'],
                                         self.localisation['country'],
                                         comb['drive'], comb['if'],
                                         timer.start_date(), comb['size'],
                                         "download", dl_time))
                                mongo.collection.insert({
                                    'ip':
                                    self.localisation['ip'],
                                    'latitude':
                                    self.localisation['lat'],
                                    'longitude':
                                    self.localisation['lon'],
                                    'city':
                                    self.localisation['city'],
                                    'country':
                                    self.localisation['country'],
                                    'drive':
                                    comb['drive'],
                                    'interface':
                                    comb['if'],
                                    'start_date':
                                    start_date,
                                    'size':
                                    comb['size'],
                                    'transfert':
                                    'download',
                                    'time':
                                    dl_time
                                })

                            if not self.OnlyDownload:
                                # logger.info('delete de '+fname)
                                if os.path.isfile(fname):
                                    os.remove(fname)
                                if os.path.isfile(comb_dir + '/' + fname):
                                    os.remove(comb_dir + '/' +
                                              fname.split('/')[-1])
            f.close()
        # delete the Bench Folder
        os.rmdir(self.result_dir)
        logger.info("---------------------------------------")
        for t in check_Exp_database(self.options, self.localisation)['result']:
            logger.info(t)
Example #17
	def run(self):
		# Defining experiment parameters
		self.parameters = {
			'n_clients': [400, 450, 500, 550, 600],
			'n_transitions': [10000]
		}
		cluster = 'griffon'
		sweeps = sweep(self.parameters)
		sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps)
		server_out_path = os.path.join(self.result_dir, "server.out")
		
		self._updateStat(sweeper.stats())
		
		# Loop on the number of nodes
		while True:
			# Taking the next parameter combinations
			comb = sweeper.get_next()
			if not comb: break

			# Performing the submission on G5K
			site = get_cluster_site(cluster)
			self._log("Output will go to " + self.result_dir)
			
			n_nodes = int(math.ceil(float(comb['n_clients']) / EX5.get_host_attributes(cluster + '-1')['architecture']['smt_size'])) + 1
			self._log("Reserving {0} nodes on {1}".format(n_nodes, site))
			
			resources = "{cluster=\\'" + cluster + "\\'}/nodes=" + str(n_nodes)
			submission = EX5.OarSubmission(resources = resources, job_type = 'allow_classic_ssh', walltime ='00:10:00')
			
			job = EX5.oarsub([(submission, site)])
			self.__class__._job = job
			
			# Sometimes oarsub fails silently
			if job[0][0] is None:
				print("\nError: no job was created")
				sys.exit(1)
				
			# Wait for the job to start
			self._log("Waiting for job {0} to start...\n".format(BOLD_MAGENTA + str(job[0][0]) + NORMAL))
			EX5.wait_oar_job_start(job[0][0], job[0][1], prediction_callback = prediction)
			nodes = EX5.get_oar_job_nodes(job[0][0], job[0][1])
			
			# Deploying nodes
			#deployment = EX5.Deployment(hosts = nodes, env_file='path_to_env_file')
			#run_deploy = EX5.deploy(deployment)
			#nodes_deployed = run_deploy.hosts[0]
			
			# Copying active_data program on all deployed hosts
			EX.Put([nodes[0]], '../dist/active-data-lib-0.1.2.jar', connexion_params = {'user': '******'}).run()
			EX.Put([nodes[0]], '../server.policy', connexion_params = {'user': '******'}).run()
			
			# Loop on the number of requests per client process
			while True:
				# Split the nodes
				clients = nodes[1:]
				server = nodes[0] 
				
				self._log("Running experiment with {0} nodes and {1} transitions per client".format(len(clients), comb['n_transitions']))
				
				# Launching Server on one node
				out_handler = FileOutputHandler(server_out_path)
				launch_server = EX.Remote('java -jar active-data-lib-0.1.2.jar', [server], stdout_handler = out_handler, stderr_handler = out_handler).start()
				self._log("Server started on " + server.address)
				time.sleep(2)
				
				# Launching clients
				rank=0
				n_cores = EX5.get_host_attributes(clients[0])['architecture']['smt_size'];
				cores = nodes * n_cores
				cores = cores[0:comb['n_clients']] # Cut out the additional cores
				
				client_connection_params = {
						'taktuk_gateway': 'lyon.grid5000.fr',
						'host_rewrite_func': None
				}
				
				self._log("Launching {0} clients...".format(len(cores)))
				
				client_cmd = "/usr/bin/env java -cp /home/ansimonet/active-data-lib-0.1.2.jar org.inria.activedata.examples.perf.TransitionsPerSecond " + \
								"{0} {1} {2} {3} {4}".format(server.address, 1200, "{{range(len(cores))}}", len(cores), comb['n_transitions'])
				client_out_handler = FileOutputHandler(os.path.join(self.result_dir, "clients.out"))
				client_request = EX.TaktukRemote(client_cmd, cores, connexion_params = client_connection_params, \
									stdout_handler = client_out_handler, stderr_handler = client_out_handler)
				
				client_request.run()
				
				if not client_request.ok():
					# Some client failed, please panic
					self._log("One or more client process failed. Enjoy reading their outputs.")
					self._log("OUTPUT STARTS -------------------------------------------------\n")
					for process in client_request.processes():
						print("----- {0} returned {1}".format(process.host().address, process.exit_code()))
						if not process.stdout() == "": print(GREEN + process.stdout() + NORMAL)
						if not process.stderr() == "": print(RED + process.stderr() + NORMAL)
						print("")
					self._log("OUTPUT ENDS ---------------------------------------------------\n")
					sweeper.skip(comb)
					launch_server.kill()
					launch_server.wait()
				else:
					# Waiting for server to end
					launch_server.wait()
				
					# Getting log files
					distant_path = OUT_FILE_FORMAT.format(len(cores), comb['n_transitions'])
					local_path = distant_path
					
					EX.Get([server], distant_path).run()
					
					EX.Local('mv ' + distant_path + ' ' + os.path.join(self.result_dir, local_path)).run()
					
					EX.Get([server], 'client_*.out', local_location = self.result_dir)
					EX.Remote('rm -f client_*.out', [server])
					
					self._log("Finishing experiment with {0} clients and {1} transitions per client".format(comb['n_clients'], comb['n_transitions']))
					
					sweeper.done(comb)
					
				sub_comb = sweeper.get_next (filtr = lambda r: filter(lambda s: s["n_clients"] == comb['n_clients'], r))
				self._updateStat(sweeper.stats())
				
				if not sub_comb: 
					# Killing job
					EX5.oar.oardel(job)
					self.__class__._job = None
					break
				else: 
					comb = sub_comb
		
		print ""
Example #18
class fp_hadoop(Engine):
    def __init__(self):
        """ Surchargement la methode init pour ajouter des options"""
        super(fp_hadoop, self).__init__()
        self.options_parser.set_usage("usage: %prog <cluster>")
        self.options_parser.add_argument(
            "cluster", "The cluster on which to run the experiment")
        self.options_parser.add_option("-k",
                                       dest="keep_alive",
                                       help="keep reservation alive ..",
                                       action="store_true")
        self.options_parser.add_option("-j",
                                       dest="oar_job_id",
                                       help="oar_job_id to relaunch an engine",
                                       type=int)
        self.options_parser.add_option("-o",
                                       dest="outofchart",
                                       help="Run the engine outside days",
                                       action="store_true")
        self.n_nodes = 10
        self.options_parser.add_option("-w",
                                       dest="walltime",
                                       help="walltime for the reservation",
                                       type="string",
                                       default="3:00:00")

    def xp(self, comb):
        comb_ok = False
        try:
            """ tout ton xp """

            comb_ok = True
        finally:
            if comb_ok:
                self.sweeper.done(comb)
            else:
                self.sweeper.cancel(comb)
            logger.info(style.step('%s Remaining'),
                        len(self.sweeper.get_remaining()))

    def run(self):
        """Inherited method, put here the code for running the engine"""
        self.define_parameters()
        self.cluster = self.args[0]
        self.site = get_cluster_site(self.cluster)
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            # You need to have a method called define_parameters that returns a list of parameter dicts
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if job_is_dead or self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend)
                # Hosts deployment
                deployed, undeployed = deploy(
                    Deployment(self.hosts,
                               env_file="/home/mliroz/deploys/hadoop6.env"))
                logger.info("%i deployed, %i undeployed" %
                            (len(deployed), len(undeployed)))
                if len(deployed) == 0:
                    break
                # System configuration => look at the execo_g5k.topology module
                attr = get_host_attributes(self.cluster + '-1')

                ## SETUP FINISHED

                # Getting the next combination
                comb = self.sweeper.get_next()
                self.prepare_dataset(comb)
                self.xp(comb)
                # subloop over the combinations that have the same sizes
                while True:
                    newcomb = self.sweeper.get_next(lambda r: filter(
                        lambda subcomb: subcomb['sizes'] == comb['sizes'], r))
                    if newcomb:
                        try:
                            self.xp(newcomb)
                        except:
                            break
                    else:
                        break

                if get_oar_job_info(self.oar_job_id,
                                    self.frontend)['state'] == 'Error':
                    job_is_dead = True

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')

    def define_parameters(self):
        """Create the iterator that contains the parameters to be explored """
        self.parameters = {
            'sizes': [100],
            'zipf': [1],
            'pop_keys': [100],
            'min_size': [500, 1000],
            'int_phases': [1, 2, 3, 4, 5, 10],
            'iosf': [100]
        }
        logger.info(self.parameters)
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))

    def _get_nodes(self, starttime, endtime):
        """ """
        planning = get_planning(elements=[self.cluster],
                                starttime=starttime,
                                endtime=endtime,
                                out_of_chart=self.options.outofchart)
        slots = compute_slots(planning, self.options.walltime)
        startdate = slots[0][0]
        i_slot = 0
        n_nodes = slots[i_slot][2][self.cluster]
        while n_nodes < self.n_nodes:
            logger.debug(slots[i_slot])
            startdate = slots[i_slot][0]
            n_nodes = slots[i_slot][2][self.cluster]
            i_slot += 1
            if i_slot == len(slots) - 1:
                return False, False
        return startdate, self.n_nodes

    def make_reservation(self):
        """Perform a reservation of the required number of nodes"""
        logger.info('Performing reservation')
        starttime = int(time.time() +
                        timedelta_to_seconds(datetime.timedelta(minutes=1)))
        endtime = int(
            starttime +
            timedelta_to_seconds(datetime.timedelta(days=3, minutes=1)))
        startdate, n_nodes = self._get_nodes(starttime, endtime)
        while not n_nodes:
            logger.info('Not enough nodes found between %s and %s, ' + \
                        'increasing time window',
                        format_date(starttime), format_date(endtime))
            starttime = endtime
            endtime = int(
                starttime +
                timedelta_to_seconds(datetime.timedelta(days=3, minutes=1)))
            startdate, n_nodes = self._get_nodes(starttime, endtime)
            if starttime > int(time.time() +
                               timedelta_to_seconds(datetime.timedelta(
                                   weeks=6))):
                logger.error('There are not enough nodes on %s for your ' + \
                             'experiments, abort ...', self.cluster)
                exit()
        jobs_specs = get_jobs_specs({self.cluster: n_nodes},
                                    name=self.__class__.__name__)
        sub = jobs_specs[0][0]
        sub.walltime = self.options.walltime
        sub.additional_options = '-t deploy'
        sub.reservation_date = startdate
        (self.oar_job_id, self.frontend) = oarsub(jobs_specs)[0]
        logger.info('Startdate: %s, n_nodes: %s', format_date(startdate),
                    str(n_nodes))
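
The sub-loop in run() above relies on ParamSweeper.get_next(filtr=...): the callback receives the remaining combinations and returns the subset to draw from, which lets one reservation serve every combination sharing the same 'sizes' value. A condensed sketch of that pattern (Python 2 style, as in the examples):

comb = self.sweeper.get_next()
self.xp(comb)
while True:
    # keep only the remaining combinations with the same 'sizes' as the current one
    newcomb = self.sweeper.get_next(
        lambda r: filter(lambda sub: sub['sizes'] == comb['sizes'], r))
    if not newcomb:
        break
    self.xp(newcomb)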
Example #19
class DVFS(Engine):
    def __init__(self, result_dir, cluster, site):
        Engine.__init__(self)
        self.result_dir = result_dir
        self.cluster = cluster
        self.site = site

    def run(self):
        """Inherited method, put here the code for running the engine"""
        self.define_parameters()
        self.run_xp()

    def define_parameters(self):
        nbNodes = len(self.cluster)
        # build parameters and make nbCore list per benchmark
        freqList = [2534000, 2000000, 1200000]
        n_nodes = float(len(self.cluster))
        max_core = SshProcess('cat /proc/cpuinfo | grep -i processor |wc -l',
                              self.cluster[0],
                              connection_params={
                                  'user': '******'
                              }).run().stdout
        max_core = n_nodes * float(max_core)
        even = filter(
            lambda i: i > n_nodes,
            list(takewhile(lambda i: i < max_core,
                           (2**i for i in count(0, 1)))))
        powerTwo = filter(
            lambda i: i > n_nodes,
            list(takewhile(lambda i: i < max_core,
                           (i**2 for i in count(0, 1)))))

        # Define parameters
        self.parameters = {
            'Repeat': [1],
            "Freq": [2534000],
            "NPBclass": ['C'],
            "Benchmark": {
                # 'ft': {
                #     'n_core': even
                #     },
                # 'ep': {
                #     'n_core': even
                #     },
                # 'lu': {
                #     'n_core': even
                #     },
                # 'is': {
                #     'n_core': even
                #     },
                # 'sg': {
                #     'n_core': even
                #     },
                # 'bt': {
                #     'n_core': powerTwo
                #     },
                'sp': {
                    'n_core': powerTwo
                }
            }
        }

        logger.info(self.parameters)
        # build the iterator over all possible parameter combinations
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))

    def run_xp(self):

        master = self.cluster[0]
        opt = ''
        """Iterate over the parameters and execute the bench"""
        while len(self.sweeper.get_remaining()) > 0:
            # Take the next combination from the sweeper
            comb = self.sweeper.get_next()

            logger.info('Processing new combination %s' % (comb, ))

            try:
                # collect metrics from the Linux sar tool between two clock times
                def takeMetric(
                        path,
                        startTime,
                        endTime,
                        metric=['cpu', 'mem', 'disk', 'swap', 'network']):
                    opt = ''
                    cmd_template_sar = (
                        "sar -f /var/log/sysstat/sa* -{opt} -s {startTime} -e {endTime}"
                    )
                    for met in metric:
                        if met == 'cpu':
                            opt = 'u'
                        elif met == 'mem':
                            opt = 'r'
                        elif met == 'disk':
                            opt = 'dp'
                        elif met == 'swap':
                            opt = 'S'
                        elif met == 'network':
                            opt = 'n DEV'

                        cmd = cmd_template_sar.format(opt=opt,
                                                      startTime=startTime,
                                                      endTime=endTime)
                        for host in self.cluster:
                            hE = SshProcess(cmd,
                                            host,
                                            connection_params={'user': '******'})
                            hE.run()
                            stdMetric = host + '-' + met + '.txt'
                            with open(os.path.join(path, stdMetric),
                                      "w") as sout:
                                sout.write(hE.stdout)

                # Set CPU frequency and governor policy according to the current combination
                cmd_template_Freq_Policy = ("cpufreq-set -r  -g {policy}")
                cmd_template_Freq = ("cpufreq-set -r -f {freq}")

                if comb['Freq'] == 'OnDemand':
                    cmd_freq_policy = cmd_template_Freq_Policy.format(
                        policy='ondemand')
                    Remote(cmd_freq_policy,
                           master,
                           connection_params={
                               'user': '******'
                           }).run()
                elif comb['Freq'] == 'conservative':
                    cmd_freq_policy = cmd_template_Freq_Policy.format(
                        policy='conservative')
                    Remote(cmd_freq_policy,
                           master,
                           connection_params={
                               'user': '******'
                           }).run()
                else:
                    cmd_freq_policy = cmd_template_Freq_Policy.format(
                        policy='userspace')
                    Remote(cmd_freq_policy,
                           master,
                           connection_params={
                               'user': '******'
                           }).run()
                    cmd_freq = cmd_template_Freq.format(freq=comb['Freq'])
                    Remote(cmd_freq,
                           master,
                           connection_params={
                               'user': '******'
                           }).run()

                # build command
                src = 'source /opt/intel-performance-snapshoot/apsvars.sh'
                cmd_mpirun_template = (
                    "mpirun {opt} -f /root/cluster.txt -np {pr1} aps -r '/tmp/log/' /tmp/NPB/npb-mpi/bin/{typeNPB}.{NPBclass}.{pr2}"
                )
                cmd_mpirun = cmd_mpirun_template.format(
                    opt='',
                    pr1=comb['n_core'],
                    typeNPB=comb['Benchmark'],
                    NPBclass=comb['NPBclass'],
                    pr2=comb['n_core'])
                cmd = "{}; /tmp/NPB/bin/runMPI.sh '{}' '{}'".format(
                    src, cmd_mpirun, slugify(comb))

                curPath = self.result_dir + slugify(comb)

                # run Mpi through execo remote SshProcess
                def runMpi(cmd):
                    act = SshProcess(cmd,
                                     master,
                                     connection_params={'user': '******'},
                                     shell=True)
                    act.run()

                    if not os.path.exists(curPath):
                        os.makedirs(curPath)

                    with open(os.path.join(curPath, "stdout.txt"),
                              "a+") as sout, open(
                                  os.path.join(curPath, "stderr.txt"),
                                  "w") as serr:
                        sout.write(act.stdout)
                        serr.write(act.stderr)
                    return act.ok

                # start the clock and execute the command on the master node
                time.sleep(5)
                startUnix = int(time.time())
                start24Hour = datetime.datetime.fromtimestamp(
                    startUnix).strftime('%H:%M:%S')

                task1 = runMpi(cmd)

                endUnix = int(time.time())
                end24Hour = datetime.datetime.fromtimestamp(endUnix).strftime(
                    '%H:%M:%S')
                time.sleep(5)

                with open(os.path.join(curPath, "executionTime.txt"),
                          "w") as sout:
                    sout.write(
                        'ExecTime:{}\nStartDate:{}\nEndDate:{}\n'.format(
                            str(endUnix - startUnix), start24Hour, end24Hour))

                takeMetric(curPath, start24Hour, end24Hour,
                           ['cpu', 'mem', 'disk', 'swap', 'network'])

                # collect power from kwapi, the Grid'5000 infrastructure monitoring tool
                for hostname in self.cluster:
                    powerOut = '{}_power'.format(hostname)
                    collect_metric(startUnix, endUnix, 'power', curPath,
                                   self.site, powerOut, hostname)

                st = '/tmp/out/' + slugify(comb)
                intelAppPerf = str(st + '.html')

                # get the data from ['Application Performance Snapshot', 'Storage Performance Snapshot']
                # https://software.intel.com/en-us/performance-snapshot
                Get(master, [intelAppPerf],
                    curPath,
                    connection_params={
                        'user': '******'
                    }).run()
                if task1:
                    logger.info("comb ok: %s" % (comb, ))
                    self.sweeper.done(comb)
                    continue

            except OSError as err:
                print("OS error: {0}".format(err))
            except ValueError:
                print("Could not convert data to an integer.")
            except:
                print("Unexpected error:", sys.exc_info()[0])
                raise

            logger.info("comb NOT ok: %s" % (comb, ))
            self.sweeper.cancel(comb)
Example #20
class l2c_fft(Engine):
    workingPath = '/home/jrichard/l2c-fft-new-distrib/bin'
    genLadScript = '/home/jrichard/l2c-fft-new-distrib/src/utils/gen-lad/genPencil.py'

    def run(self):
        """
            Main engine method to perform the experiment
        """
        self.define_parameters()

        while len(self.sweeper.get_remaining()) > 0:
            # Getting the next combination
            comb = self.sweeper.get_next()
            logger.info(style.host(slugify(comb)) + ' has been started')
            self.get_nodes(comb)

            # If the job is broken, the program is stopped
            if get_oar_job_info(self.oar_job_id,
                                self.frontend)['state'] == 'Error':
                break

            try:
                self.workflow(comb)

                # Process all combinations that can use the same submission
                while True:
                    # Find the next combination that can use the same submission
                    subcomb = self.sweeper.get_next(lambda r: filter(
                        lambda x: x['cores'] == comb['cores'] and x['cluster']
                        == comb['cluster'], r))

                    if not subcomb:
                        logger.info(
                            'No more combination for cluster=%s and cores=%s',
                            comb['cluster'], comb['cores'])
                        break
                    else:
                        logger.info(
                            style.host(slugify(subcomb)) + ' has been started')

                        if get_oar_job_info(self.oar_job_id,
                                            self.frontend)['state'] != 'Error':
                            self.workflow(subcomb)
                        else:
                            break

            # Whatever happens (errors, end of loop), the job is deleted
            finally:
                logger.info('Deleting job...')
                oardel([(self.oar_job_id, self.frontend)])

    def workflow(self, comb):
        """
            Compute one application launch 
            using a given parameter group
        """
        comb_ok = False
        try:
            # Generate configuration file needed by MPI processes
            logger.info("Generating assembly file...")
            py = comb['cores'] / comb['px']
            prepare = Process(
                'cd %s && python %s %d %d %d %d %d %s app.lad' %
                (self.workingPath, self.genLadScript, comb['datasize'],
                 comb['datasize'], comb['datasize'], comb['px'], py,
                 comb['transposition']))
            prepare.shell = True
            prepare.run()

            # Generate the MPI host file
            mfile = self.generate_machine_file()

            # Start L2C
            lad = "./app.lad"
            logger.info("Computing...")
            res = Process(
                "export OAR_JOB_KEY_FILE=~/.oar_key ; cd %s && l2c_loader -M,-machinefile,%s --mpi -c %d %s"
                % (self.workingPath, mfile, comb['cores'], lad))
            res.shell = True
            res.stdout_handlers.append(
                os.path.join(self.result_dir,
                             slugify(comb) + '.out'))
            res.stdout_handlers.append(sys.stdout)
            res.stderr_handlers.append(
                os.path.join(self.result_dir,
                             slugify(comb) + '.err'))
            res.stderr_handlers.append(sys.stderr)
            res.run()
            if not res.ok:
                logger.error('Bad L2C termination')
                raise Exception('Bad L2C termination')
            if len(
                    res.stderr
            ) > 0:  # WARNING: when L2C cannot find the LAD file or something strange like this
                logger.warning('Not empty error output')

            # Clean configuration files
            logger.info("Removing assembly files...")
            res = Process('cd %s && rm -f app.lad*' % self.workingPath)
            res.shell = True
            res.run()

            comb_ok = True
        except Exception:
            pass
        finally:
            if comb_ok:
                self.sweeper.done(comb)
                logger.info(style.host(slugify(comb)) + ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(
                    style.host(slugify(comb)) + ' has been canceled')

            logger.info(style.step('%s Remaining'),
                        len(self.sweeper.get_remaining()))

    def define_parameters(self):
        """
            Define the parametters used by the L2C application
        """
        parameters = {
            'cluster': [
                cluster for site in ['grenoble', 'nancy']
                for cluster in get_site_clusters(site) if cluster != 'graphite'
            ],
            'cores': {i: {
                'px': expRange(1, i)
            }
                      for i in expRange(4, 64)},
            'datasize':
            expRange(256, 256),
            'transposition': ['XYZ', 'XZY', 'YXZ', 'YZX', 'ZXY', 'ZYX']
        }

        logger.info(pformat(parameters))
        sweeps = sweep(parameters)
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweeps)
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))

    def get_nodes(self, comb):
        """
            Perform a submission for a given comb and 
            retrieve the submission node list
        """
        logger.info('Performing submission')
        n_core = get_host_attributes(comb['cluster'] +
                                     '-1')['architecture']['smt_size']
        submission = OarSubmission(
            resources="nodes=%d" % (max(1, comb['cores'] / n_core), ),
            sql_properties="cluster='%s'" % comb['cluster'],
            job_type="besteffort",
            name="l2c_fft_eval")
        self.oar_job_id, self.frontend = oarsub([
            (submission, get_cluster_site(comb['cluster']))
        ])[0]
        logger.info("Waiting for job start")
        wait_oar_job_start(self.oar_job_id, self.frontend)
        logger.info("Retrieving hosts list")
        nodes = get_oar_job_nodes(self.oar_job_id, self.frontend)
        self.hosts = [host for host in nodes for i in range(n_core)]

    def generate_machine_file(self):
        """
            Generate a machine file used by MPI 
            to know which nodes use during the computation
        """
        fd, mfile = mkstemp(dir='/tmp/', prefix='mfile_')
        f = os.fdopen(fd, 'w')
        f.write('\n'.join([host.address for host in self.hosts]))
        f.close()
        return mfile
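
Because the sweeper state is persisted under result_dir/"sweeps" (see Example #1), re-creating the ParamSweeper with the same directory resumes where a previous run stopped. A minimal sketch under that assumption, with a hypothetical result_dir:

import os
from execo_engine import ParamSweeper, sweep

result_dir = "/tmp/sweep-demo"  # hypothetical results directory
sweeps = sweep({'datasize': [256], 'transposition': ['XYZ', 'XZY']})

# First run: process one combination and mark it done.
sweeper = ParamSweeper(os.path.join(result_dir, "sweeps"), sweeps)
sweeper.done(sweeper.get_next())

# Later run (e.g. after a crash): the done combination is not returned again.
sweeper = ParamSweeper(os.path.join(result_dir, "sweeps"), sweeps)
print(len(sweeper.get_remaining()))  # 1 remaining out of 2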
Example #21
class paasage_simu(Engine):

    JVM = 'java'
    SGCBJAR = 'SGCB_nTier.jar'
    PJDUMP = 'pj_dump'
    RSCRIPT = 'Rscript'

    def __init__(self):
        """Overloading class initialization with parent and adding options"""
        super(paasage_simu, self).__init__()
        self.options_parser.set_usage("usage: %prog ")
        self.options_parser.set_description("Execo Engine that can be used to " + \
                "perform automatic virtual machine experiments")
        self.options_parser.add_option("-n",
                                       dest="n_nodes",
                                       help="maximum number of nodes used",
                                       type="int",
                                       default=200)
        self.options_parser.add_option("-w",
                                       dest="walltime",
                                       help="walltime for the reservation",
                                       type="string",
                                       default="05:00:00")
        self.options_parser.add_option(
            "-j",
            dest="oargrid_job_id",
            help="oargrid_job_id to relaunch an engine",
            type=int)
        self.options_parser.add_option("-k",
                                       dest="keep_alive",
                                       help="keep reservation alive ..",
                                       action="store_true")

    def run(self):
        """ """
        if self.options.oargrid_job_id:
            self.oargrid_job_id = self.options.oargrid_job_id
        else:
            self.oargrid_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oargrid_job_id is None:
                    self.make_reservation()
                # Wait for the job to start
                logger.info('Waiting for the job to start')
                wait_oargrid_job_start(self.oargrid_job_id)
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oargrid_job_nodes(self.oargrid_job_id)
                # Hosts deployment and configuration

                default_connection_params['user'] = '******'

                logger.info("Start hosts configuration")
                ex_log.setLevel('INFO')
                deployment = Deployment(
                    hosts=self.hosts,
                    env_file='/home/sirimie/env/mywheezy-x64-base.env')
                self.hosts, _ = deploy(deployment)

                Remote("rm -f /home/Work/sgcbntier/paasage_demo/csv/REQTASK_*",
                       self.hosts).run()
                Remote(
                    "rm -f /home/Work/sgcbntier/paasage_demo/platform_aws.xml",
                    self.hosts).run()
                Remote("rm -f /home/Work/sgcbntier/paasage_demo/cloud_ec2.xml",
                       self.hosts).run()

                Put(self.hosts, [
                    "run_all_execo.py", "xml_gen_execo.py", "conf.xml",
                    "platform_aws.xml", "cloud_ec2.xml"
                ],
                    remote_location="/home/Work/sgcbntier/paasage_demo/").run(
                    )
                logger.info("Done")

                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = [
                    host for host in self.hosts for i in range(
                        get_host_attributes(host)['architecture']['smt_size'])
                ]

                threads = {}

                # Creating the unique folder for storing the results
                comb_dir = self.result_dir + '/csv_results'
                if not os.path.exists(comb_dir):
                    os.mkdir(comb_dir)

                # Checking that the job is running and not in Error
                while self.is_job_alive() or len(threads.keys()) > 0:
                    job_is_dead = False
                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.append(tmp_threads[t]['host'])
                                del threads[t]
                        sleep(5)
                        if not self.is_job_alive():
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    host = available_hosts[0]
                    available_hosts = available_hosts[1:]

                    t = Thread(target=self.workflow,
                               args=(comb, host, comb_dir))
                    threads[t] = {'host': host}
                    t.daemon = True
                    t.start()

                if not self.is_job_alive():
                    job_is_dead = True

                if job_is_dead:
                    self.oargrid_job_id = None

        finally:
            if self.oargrid_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oargriddel([self.oargrid_job_id])
                else:
                    logger.info('Keeping job alive for debugging')

    def define_parameters(self):
        """ """
        parameters = self.get_parameters("conf.xml")
        sweeps = sweep(parameters)
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweeps)
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))

    def make_reservation(self):
        """ """
        logger.info('Performing reservation')
        starttime = int(time.time() +
                        timedelta_to_seconds(datetime.timedelta(minutes=1)))
        planning = get_planning(elements=['grid5000'], starttime=starttime)
        slots = compute_slots(planning, self.options.walltime)
        wanted = {"grid5000": 0}
        start_date, end_date, resources = find_first_slot(slots, wanted)
        wanted['grid5000'] = min(resources['grid5000'], self.options.n_nodes)
        actual_resources = distribute_hosts(resources, wanted)

        job_specs = get_jobs_specs(actual_resources, name='Paasage_Simu')
        logger.info("try to reserve " + str(actual_resources))
        self.oargrid_job_id, _ = oargridsub(job_specs,
                                            start_date,
                                            walltime=end_date - start_date,
                                            job_type="deploy")
        logger.info("Reservation done")

    def create_string(self, param):
        res_str = ""
        for key, value in param.iteritems():
            res_str += key + "_" + str(value) + "_"
        return res_str

    def workflow(self, comb, host, comb_dir):
        """ """
        comb_ok = False
        thread_name = style.Thread(host.split('.')[0]) + ': '
        logger.info(thread_name + 'Starting combination ' + slugify(comb))

        try:
            logger.info(thread_name + 'Generate conf file')
            param_str = self.create_string(comb)

            Remote(
                "python /home/Work/sgcbntier/paasage_demo/xml_gen_execo.py --cb "
                + param_str, [host]).run()

            logger.info(thread_name + 'Run code')
            Remote(
                "cd /home/Work/sgcbntier/paasage_demo/ ; python run_all_execo.py --cb %s"
                % param_str, [host]).run()

            logger.info(thread_name + 'Get results')

            traceFile = "ntier_" + param_str
            get_results = Get([host], [
                "/home/Work/sgcbntier/paasage_demo/csv/REQTASK_" + traceFile +
                ".csv"
            ],
                              local_location=comb_dir).run()

            for p in get_results.processes:
                if not p.ok:
                    logger.error(
                        host +
                        ': Unable to retrieve the files for combination %s',
                        slugify(comb))
                    exit()

            comb_ok = True
        finally:
            if comb_ok:
                self.sweeper.done(comb)
                logger.info(thread_name + ': ' + slugify(comb) + \
                             ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(thread_name + ': ' + slugify(comb) + \
                            ' has been canceled')
        logger.info(style.step('%s Remaining'),
                    len(self.sweeper.get_remaining()))

    def is_job_alive(self):
        rez = get_oargrid_job_info(self.oargrid_job_id)
        return (rez["start_date"] + rez["walltime"] > time.time())

    def get_parameters(self, file_name):
        """Get the parameters to sweep, from the configuration file"""
        tree = ET.parse(file_name)
        rootSrc = tree.getroot()
        param = dict()

        for inst in rootSrc.iter("instance"):
            ty = inst.get("type")
            qt = inst.get("quantity")
            if (qt.isdigit()):
                param[ty] = qt
            else:
                ends = qt.split("-")
                param[ty] = range(int(ends[0]), int(ends[1]) + 1)

        return param
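The get_parameters() method above builds the sweep space from conf.xml: it expects <instance> elements carrying a type attribute and a quantity attribute that is either a single integer or a "min-max" range. A minimal sketch of that format follows; the sample content is an assumption, not the project's actual conf.xml.

# Minimal sketch of the conf.xml structure expected by get_parameters() above;
# the sample content is hypothetical.
import xml.etree.ElementTree as ET

sample = """
<configuration>
    <instance type="db_server" quantity="2"/>
    <instance type="web_server" quantity="1-4"/>
</configuration>
"""

param = dict()
for inst in ET.fromstring(sample).iter("instance"):
    ty = inst.get("type")
    qt = inst.get("quantity")
    if qt.isdigit():
        param[ty] = qt
    else:
        ends = qt.split("-")
        param[ty] = range(int(ends[0]), int(ends[1]) + 1)
# param == {'db_server': '2', 'web_server': [1, 2, 3, 4]}  (a range object under Python 3)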
Example #22
                "nbr_clients"] * 1000 * params["pause"]


# Function to pass as a parameter to ParamSweeper.get_next()
# Gives the illusion that the set of remaining params is sorted by nbr_clients
def sort_params_by_nbr_clients(set):
    return sorted((list(set)), key=lambda k: k['nbr_clients'])


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    sweeps = sweep(PARAMETERS)
    sweeper = ParamSweeper(
        # Maybe put the sweeper under the experiment directory
        # This should be current/sweeps
        persistence_dir=os.path.join("%s/sweeps" % TEST_DIR),
        sweeps=sweeps,
        save_sweeps=True,
        name="test_case_1")

    # Get the next combination from the set of all remaining params
    # This set is temporarily viewed as a sorted list thanks to the filter function.
    params = sweeper.get_next(sort_params_by_nbr_clients)
    while params:
        if not accept(params):
            # Skipping this element by marking it done so it is not retried.
            # Note that the semantics of sweeper.skip() is different.
            sweeper.done(params)
            params = sweeper.get_next(sort_params_by_nbr_clients)
            continue
        # cleaning old backup_dir
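The snippet above is truncated, but it shows the get_next() hook: the callable receives the remaining combinations and returns them in the desired order. Here is a self-contained sketch of the same pattern; the PARAMETERS space and the accept() predicate are purely hypothetical.

# Self-contained sketch of the ordered get_next() pattern shown above;
# PARAMETERS and accept() are hypothetical.
import os
import tempfile
from execo_engine import ParamSweeper, sweep

PARAMETERS = {'nbr_clients': [1, 5, 10], 'pause': [0.1, 0.5]}

def accept(params):
    # Hypothetical feasibility check on a combination.
    return params['nbr_clients'] * params['pause'] <= 2

def sort_params_by_nbr_clients(remaining):
    return sorted(list(remaining), key=lambda k: k['nbr_clients'])

sweeper = ParamSweeper(persistence_dir=os.path.join(tempfile.mkdtemp(), "sweeps"),
                       sweeps=sweep(PARAMETERS),
                       save_sweeps=True)

params = sweeper.get_next(sort_params_by_nbr_clients)
while params:
    if accept(params):
        # ... run the experiment for this combination here ...
        pass
    # Mark the combination done (even when rejected) so it is not proposed again.
    sweeper.done(params)
    params = sweeper.get_next(sort_params_by_nbr_clients)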
Example #23
class mpi_bench(Engine):

    def run(self):
        """Inherited method, put here the code for running the engine"""
        self.define_parameters()
        if self.prepare_bench():
            logger.info('Bench prepared on all frontends')
            self.run_xp()

    def define_parameters(self):
        """Create the iterator on the parameters combinations to be explored"""
        # fixed number of nodes
        self.n_nodes = 4
        # choose a list of clusters
        clusters = ['graphene', 'petitprince', 'edel', 'paradent', 'stremi']
        #clusters = ['petitprince', 'paradent']
        # compute the maximum number of cores among all clusters
        max_core = self.n_nodes * max([
                get_host_attributes(cluster + '-1')['architecture']['smt_size']
                for cluster in clusters])
        # define the parameters
        self.parameters = {
            'cluster' : clusters,
            'n_core': filter(lambda i: i >= self.n_nodes,
                             list(takewhile(lambda i: i<max_core,
                                            (2**i for i in count(0, 1))))),
            'size' : ['A', 'B', 'C']
            }
        logger.info(self.parameters)
        # define the iterator over the parameters combinations
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))
        logger.info('Number of parameters combinations %s' % len(self.sweeper.get_remaining()))

    def prepare_bench(self):
        """bench configuration and compilation, copy binaries to frontends
        
        return True if preparation is ok
        """
        logger.info("preparation: configure and compile benchmark")
        # the involved sites. We will do the compilation on the first of these.
        sites = list(set(map(get_cluster_site, self.parameters['cluster'])))
        # generate the bench compilation configuration
        bench_list = '\n'.join([ 'lu\t%s\t%s' % (size, n_core)
                                 for n_core in self.parameters['n_core']
                                 for size in self.parameters['size'] ])
        # Reserving a node because compiling on the frontend is forbidden
        # and because we need mpif77
        jobs = oarsub([(OarSubmission(resources = "nodes=1",
                                      job_type = 'allow_classic_ssh',
                                      walltime ='0:10:00'), sites[0])])
        if jobs[0][0]:
            try:
                logger.info("copying bench archive to %s" % (sites[0],))
                copy_bench = Put([sites[0]], ['NPB3.3-MPI.tar.bz2']).run()
                logger.info("extracting bench archive on %s" % (sites[0],))
                extract_bench = Remote('tar -xjf NPB3.3-MPI.tar.bz2', [sites[0]]).run()
                logger.info("waiting job start %s" % (jobs[0],))
                wait_oar_job_start(*jobs[0], prediction_callback = pred_cb)
                logger.info("getting nodes of %s" % (jobs[0],))
                nodes = get_oar_job_nodes(*jobs[0])
                logger.info("configure bench compilation")
                conf_bench = Remote('echo "%s" > ~/NPB3.3-MPI/config/suite.def' % bench_list, nodes).run()
                logger.info("compil bench")
                compilation = Remote('cd NPB3.3-MPI && make clean && make suite', nodes).run()
                logger.info("compil finished")
            except:
                logger.error("unable to compile bench")
                return False
            finally:
                oardel(jobs)
        # Copying binaries to all other frontends
        frontends = sites[1:]
        rsync = Remote('rsync -avuP ~/NPB3.3-MPI/ {{frontends}}:NPB3.3-MPI', 
                       [get_host_site(nodes[0])] * len(frontends)) 
        rsync.run()
        return compilation.ok and rsync.ok

    def run_xp(self):
        """Iterate over the parameters and execute the bench"""
        while len(self.sweeper.get_remaining()) > 0:
            comb = self.sweeper.get_next()
            if comb['n_core'] > get_host_attributes(comb['cluster']+'-1')['architecture']['smt_size'] * self.n_nodes: 
                self.sweeper.skip(comb)
                continue
            logger.info('Processing new combination %s' % (comb,))
            site = get_cluster_site(comb['cluster'])
            jobs = oarsub([(OarSubmission(resources = "{cluster='" + comb['cluster']+"'}/nodes=" + str(self.n_nodes),
                                          job_type = 'allow_classic_ssh', 
                                          walltime ='0:10:00'), 
                            site)])
            if jobs[0][0]:
                try:
                    wait_oar_job_start(*jobs[0])
                    nodes = get_oar_job_nodes(*jobs[0])
                    bench_cmd = 'mpirun -H %s -n %i %s ~/NPB3.3-MPI/bin/lu.%s.%i' % (
                        ",".join([node.address for node in nodes]),
                        comb['n_core'],
                        get_mpi_opts(comb['cluster']),
                        comb['size'],
                        comb['n_core'])
                    lu_bench = SshProcess(bench_cmd, nodes[0])
                    lu_bench.stdout_handlers.append(self.result_dir + '/' + slugify(comb) + '.out')
                    lu_bench.run()
                    if lu_bench.ok:
                        logger.info("comb ok: %s" % (comb,))
                        self.sweeper.done(comb)
                        continue
                finally:
                    oardel(jobs)
            logger.info("comb NOT ok: %s" % (comb,))
            self.sweeper.cancel(comb)
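The run_xp() loop above relies on the three terminal calls of a ParamSweeper: done() when the bench succeeded, skip() when a combination is not feasible on the chosen cluster, and cancel() when a run failed and should be retried later. A minimal in-memory sketch (toy parameter space, no Grid5000 involved) of how each call affects the sweeper:

# Minimal sketch of the done / skip / cancel semantics used in run_xp() above;
# the parameter space is a toy example.
import os
import tempfile
from execo_engine import ParamSweeper, sweep

sweeper = ParamSweeper(os.path.join(tempfile.mkdtemp(), "sweeps"),
                       sweep({'cluster': ['graphene', 'edel'], 'size': ['A', 'B']}))

comb = sweeper.get_next()
sweeper.skip(comb)      # infeasible: never proposed again by get_next()
comb = sweeper.get_next()
sweeper.done(comb)      # success: removed from the remaining set
comb = sweeper.get_next()
sweeper.cancel(comb)    # failure: returned to the remaining set for a later retry
print(sweeper.stats())  # counters of remaining, skipped, done, in-progress combinations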
class raevol_matrix(Engine):
    def __init__(self):
        """Overloading class initialization with parent and adding options"""
        super(raevol_matrix, self).__init__()
        self.options_parser.set_usage("usage: %prog ")
        self.options_parser.set_description("Execo Engine that can be used to " + \
                "perform automatic virtual machine experiments")
        self.options_parser.add_option("-n",
                                       dest="n_nodes",
                                       help="maximum number of nodes used",
                                       type="int",
                                       default=200)
        self.options_parser.add_option("-w",
                                       dest="walltime",
                                       help="walltime for the reservation",
                                       type="string",
                                       default="02:00:00")
        self.options_parser.add_option(
            "-j",
            dest="oargrid_job_id",
            help="oargrid_job_id to relaunch an engine",
            type=int,
            default=None)
        self.options_parser.add_option("-k",
                                       dest="keep_alive",
                                       help="keep reservation alive ..",
                                       action="store_true")
        self.options_parser.add_option("-u",
                                       dest="selected_cluster",
                                       help="run on a specific cluster.",
                                       type="string",
                                       default="taurus")
        self.options_parser.add_option("-o",
                                       dest="outofchart",
                                       help="Run the engine outside days",
                                       action="store_true")
        self.options_parser.add_option(
            "-s",
            dest="storage5k_job_id",
            help="storage5k_job_id to store the data",
            type=int)

    def run(self):
        """ """
        if self.options.oargrid_job_id is not None:
            self.oar_job_id = self.options.oargrid_job_id
        else:
            self.oar_job_id = None

        self.list_of_clusters = [
            'parasilo', 'paravance', 'parapluie', 'paranoia'
        ]

        try:
            # Creation of the main iterator which is used for the first control loop.
            self.define_parameters()
            self.working_dir = '/data/jorouzaudcornabas_' + str(
                self.options.storage5k_job_id)

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if self.oar_job_id is None:
                    self.submit_all_available_best_effort(
                        self.list_of_clusters, self.options.walltime)
                    # self.make_reservation_local()
                # Wait for the job to start
                logger.info('Waiting for the job ' +
                            str(self.oar_job_id) + ' to start')
                wait_oar_job_start(self.oar_job_id)
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oar_job_nodes(self.oar_job_id)
                # Hosts deployment and configuration
                default_connection_params['user'] = '******'

                logger.info("Start hosts configuration")
                ex_log.setLevel('INFO')
                #===============================================================
                # deployment = Deployment(hosts = self.hosts,
                #             env_file='/home/sirimie/env/mywheezy-x64-base.env')
                # self.hosts, _ = deploy(deployment)
                #===============================================================
                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = self.hosts

                threads = {}

                # Creating the unique folder for storing the results
                comb_dir = self.result_dir + '/logs'
                if not os.path.exists(comb_dir):
                    os.mkdir(comb_dir)

                logger.info("Starting the thread " + str(self.is_job_alive()) +
                            " " + str(len(threads.keys())))
                # Checking that the job is running and not in Error
                while self.is_job_alive() or len(threads.keys()) > 0:
                    job_is_dead = False

                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.append(tmp_threads[t]['host'])
                                del threads[t]
                        sleep(5)
                        if not self.is_job_alive():
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    host = available_hosts[0]
                    available_hosts = available_hosts[1:]
                    logger.info("Launching thread")
                    t = Thread(target=self.workflow,
                               args=(comb, host, comb_dir))
                    threads[t] = {'host': host}
                    t.daemon = True
                    t.start()

                if not self.is_job_alive():
                    job_is_dead = True

                if job_is_dead:
                    self.oar_job_id = None

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([self.oar_job_id])
                else:
                    logger.info('Keeping job alive for debugging')

    def define_parameters(self):
        """ """
        parameters = {
            'seed': [
                51456165, 33263658, 7158785, 456847894, 1223144, 878944,
                121145, 3587842
            ],
            'mutation': ['5e-4', '1e-4', '5e-5', '5e-6'],
            'env': ['const', 'lat_3', 'lat_all'],
            'selection': [750, 2000, 4000]
        }
        sweeps = sweep(parameters)
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweeps)
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))

    def make_reservation(self):
        """ """
        logger.info('Performing reservation')
        starttime = int(time.time() +
                        timedelta_to_seconds(datetime.timedelta(minutes=1)))
        planning = get_planning(elements=[self.options.selected_cluster],
                                starttime=starttime)
        slots = compute_slots(planning, self.options.walltime)
        wanted = {self.options.selected_cluster: 0}
        start_date, end_date, resources = find_first_slot(slots, wanted)
        wanted[self.options.selected_cluster] = resources[
            self.options.selected_cluster]
        actual_resources = distribute_hosts(resources, wanted)

        job_specs = get_jobs_specs(actual_resources, name='Aevol_diff_area')
        logger.info("try to reserve " + str(actual_resources))
        self.oargrid_job_id, _ = oargridsub(
            job_specs,
            walltime=end_date - start_date,
            job_type=['besteffort',
                      'allow_classic_ssh'])
        logger.info("Reservation done")

    def make_reservation_local(self):
        """Perform a reservation of the required number of nodes, with 4000 IP.
        """
        logger.info('Performing reservation')
        starttime = int(time.time() +
                        timedelta_to_seconds(datetime.timedelta(minutes=1)))
        endtime = int(
            starttime +
            timedelta_to_seconds(datetime.timedelta(days=3, minutes=1)))
        self.cluster = self.options.selected_cluster
        startdate, n_nodes = self._get_nodes(starttime, endtime)

        while not n_nodes:
            logger.info('Not enough nodes found between %s and %s, ' + \
                        'increasing time window',
                        format_date(starttime), format_date(endtime))
            starttime = endtime
            endtime = int(
                starttime +
                timedelta_to_seconds(datetime.timedelta(days=3, minutes=1)))
            startdate, n_nodes = self._get_nodes(starttime, endtime)
            if starttime > int(time.time() +
                               timedelta_to_seconds(datetime.timedelta(
                                   weeks=6))):
                logger.error('There are not enough nodes on %s for your ' + \
                             'experiments, abort ...', self.cluster)
                exit()
        startdate = []
        jobs_specs = get_jobs_specs({self.cluster: n_nodes},
                                    name=self.__class__.__name__)
        sub = jobs_specs[0][0]
        tmp = str(sub.resources).replace('\\', '')
        sub.resources = tmp.replace('"', '')
        sub.walltime = self.options.walltime
        sub.additional_options = '-t allow_classic_ssh -t besteffort'
        (self.oar_job_id, self.frontend) = oarsub(jobs_specs)[0]
        logger.info('Startdate: besteffort, n_nodes: %s', str(n_nodes))

    def _get_nodes(self, starttime, endtime):
        """ """
        planning = get_planning(elements=[self.cluster],
                                starttime=starttime,
                                endtime=endtime,
                                out_of_chart=self.options.outofchart)
        slots = compute_slots(planning, self.options.walltime)
        startdate = slots[0][0]
        i_slot = 0
        n_nodes = slots[i_slot][2][self.cluster]
        logger.info("nodes %s in %s at %s", str(n_nodes), str(self.cluster),
                    format_date(startdate))
        while n_nodes < self.options.n_nodes:
            logger.debug(slots[i_slot])
            startdate = slots[i_slot][0]
            n_nodes = slots[i_slot][2][self.cluster]
            i_slot += 1
            if i_slot == len(slots) - 1:
                return False, False
        return startdate, n_nodes

    def workflow(self, comb, host, comb_dir):
        """ """
        comb_ok = False
        thread_name = style.Thread(str(host).split('.')[0]) + ': '
        logger.info(thread_name + 'Starting combination ' + slugify(comb))

        try:
            self.export = "source ~/aevol_binary/intel/linux/mkl/bin/mklvars.sh intel64; "

            bucketname = self.working_dir + '/raevol_5_mut_lat/' + slugify(
                comb) + '/'

            if os.path.isdir(bucketname) and os.path.exists(bucketname +
                                                            '/last_gener.txt'):
                logger.info(thread_name + "Resuming AEVOL from NFS backup")

                gen_file = open(bucketname + '/last_gener.txt', 'r')

                last_gen = gen_file.read()

                if int(last_gen) < 500000:
                    logger.info(thread_name + "Resuming AEVOL Run from " +
                                str(int(last_gen)))
                    rem = Remote(
                        self.export + 'cd ' + bucketname +
                        '; /home/jorouzaudcornabas/aevol_binary/aevol/src/aevol_run -p 16'
                        + ' -e 300000 -r ' + last_gen + ' >> aevol_run.log',
                        [host]).run()
                    if rem.ok:
                        comb_ok = True
                else:
                    comb_ok = True
            else:
                Remote('mkdir -p ' + bucketname, [host]).run()

                param_file = '/home/jorouzaudcornabas/aevol_binary/execo/mut_lat/param_tmpl.in'

                logger.info(thread_name + 'Generate config file ' + param_file)

                f_template = open(param_file)
                fd, outfile = mkstemp(dir='/tmp/',
                                      prefix=slugify(comb) + '_param')
                f = os.fdopen(fd, 'w')

                for line in f_template:
                    line = line.replace('SEED_NUMBER', str(comb['seed']))
                    line = line.replace('FUZZY_VERSION', str(comb['fuzzy']))
                    if comb['move']:
                        line = line.replace('FIRST_GAUSSIAN_MEDIAN', '0.25')
                        line = line.replace('THIRD_GAUSSIAN_MEDIAN', '0.65')
                    else:
                        line = line.replace('FIRST_GAUSSIAN_MEDIAN', '0.2')
                        line = line.replace('THIRD_GAUSSIAN_MEDIAN', '0.6')
                    line = line.replace('GAUSSIAN_HEIGHT', str(comb['height']))
                    f.write(line)

                f_template.close()
                f.close()

                put_file = Put([host], [outfile],
                               remote_location=bucketname).run()
                if not put_file.ok:
                    exit()

                os.remove(outfile)

                Remote(
                    'cd ' + bucketname + '; cp ' + outfile.split('/')[-1] +
                    ' param.in', [host]).run()

                logger.info(thread_name + "Launching AEVOL Create")
                Remote(
                    self.export + 'cd ' + bucketname +
                    '; /home/jorouzaudcornabas/aevol_diff_area/aevol/src/aevol_create > aevol_create.log',
                    [host]).run()

                logger.info(thread_name + "Launching AEVOL Run")
                rem = Remote(
                    self.export + 'cd ' + bucketname +
                    '; /home/jorouzaudcornabas/aevol_diff_area/aevol/src/aevol_run -p 16 -n 500000 > aevol_run.log',
                    [host]).run()
                if rem.ok:
                    comb_ok = True

            logger.info(thread_name + 'Get results ' + comb_dir + "/" +
                        slugify(comb))

            # try:
            #     os.mkdir(comb_dir + "/" + slugify(comb))
            # except:
            #     logger.warning(thread_name +
            #                    '%s already exists, removing existing files',
            #                    comb_dir + "/" + slugify(comb))
            #     shutil.rmtree(comb_dir + "/" + slugify(comb))
            # try:
            #     os.mkdir(comb_dir + "/" + slugify(comb))
            # except:
            #     logger.warning(thread_name +
            #                    '%s already exists, recreating directory',
            #                    comb_dir + "/" + slugify(comb))
            #
            # get_results = Get([host],
            #                   [bucketname + "/aevol_create.log",
            #                    bucketname + "/aevol_run.log",
            #                    bucketname + '/stats/'],
            #                   local_location=comb_dir + "/" + slugify(comb)).run()
            #
            # for p in get_results.processes:
            #     if not p.ok:
            #         logger.error(thread_name +
            #                      ': Unable to retrieve the files for combination %s',
            #                      slugify(comb))
            #         exit()

        finally:
            if comb_ok:
                self.sweeper.done(comb)
                # shutil.rmtree(bucketname)
                logger.info(thread_name + ': ' + slugify(comb) + \
                             ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(thread_name + ': ' + slugify(comb) + \
                            ' has been canceled')
        logger.info(style.step('%s Remaining'),
                    len(self.sweeper.get_remaining()))

    def is_job_alive(self):
        rez = get_oar_job_info(self.oar_job_id)
        if rez['state'] == 'Error':
            return False

        if (rez["start_date"] + rez["walltime"] > time.time()):
            return True
        else:
            return False

    def get_immediately_available_nodes(self,
                                        list_of_clusters,
                                        walltime,
                                        ignore_besteffort=False):
        planning = get_planning(list_of_clusters, ignore_besteffort=False)
        slots = compute_slots(planning, walltime)
        wanted = {cluster: 0 for cluster in list_of_clusters}
        start_date, end_date, resources = find_first_slot(slots, wanted)
        actual_resources = {
            resource: n_nodes
            for resource, n_nodes in resources.iteritems()
            if resource in list_of_clusters and n_nodes > 0
        }
        return start_date, end_date, actual_resources

    def submit_all_available_best_effort(self, list_of_clusters, walltime):
        start_date, end_date, resources = self.get_immediately_available_nodes(
            list_of_clusters, walltime, ignore_besteffort=False)
        job_specs = get_jobs_specs(resources)
        for j, f in job_specs:
            j.job_type = "besteffort"
            j.additional_options = "-t allow_classic_ssh"
        #jobs = oarsub(job_specs)
        (self.oar_job_id, self.frontend) = oarsub(job_specs)[0]
        print job_specs
        return self.oar_job_id
    def run(self):
        """Run the experiment"""
        already_configured = self.options.already_configured
        reservation_job_id = int(self.options.reservation_id) \
            if self.options.reservation_id is not None else None
        is_a_test = self.options.is_a_test

        if is_a_test:
            logger.warn('THIS IS A TEST! This run will use only a few '
                        'resources')

        # make the result folder writable for all
        os.chmod(self.result_dir, 0o777)
        # Import configuration
        with open(self.args[0]) as config_file:
            config = json.load(config_file)
        # backup configuration
        copy(self.args[0], self.result_dir)

        site = config["grid5000_site"]
        resources = config["resources"]
        nb_experiment_nodes = config["nb_experiment_nodes"]
        walltime = str(config["walltime"])
        env_name = config["kadeploy_env_name"]
        workloads = config["workloads"]
        # check that the workloads exist (assumes that the same NFS mount point
        # is present on both the remote and the local environments)
        for workload_file in workloads:
            with open(workload_file):
                pass
            # copy the workloads files to the results dir
            copy(workload_file, self.result_dir)

        # define the workloads parameters
        self.parameters = {'workload_filename': workloads}
        logger.info('Workloads: {}'.format(workloads))

        # define the iterator over the parameters combinations
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))

        # A previous run (restarted with -c result_dir) may have skipped some combinations
        logger.info('Skipped parameters:' +
                    '{}'.format(str(self.sweeper.get_skipped())))

        logger.info('Number of parameters combinations {}'.format(
            str(len(self.sweeper.get_remaining()))))
        logger.info('combinations {}'.format(str(
            self.sweeper.get_remaining())))

        if reservation_job_id is not None:
            jobs = [(reservation_job_id, site)]
        else:
            jobs = oarsub([(OarSubmission(resources=resources,
                                          job_type='deploy',
                                          walltime=walltime), site)])
        job_id, site = jobs[0]
        if job_id:
            try:
                logger.info("waiting job start %s on %s" % (job_id, site))
                wait_oar_job_start(job_id,
                                   site,
                                   prediction_callback=prediction_callback)
                logger.info("getting nodes of %s on %s" % (job_id, site))
                nodes = get_oar_job_nodes(job_id, site)
                # sort the nodes
                nodes = sorted(nodes, key=lambda node: node.address)
                # get only the necessary nodes under the switch
                if nb_experiment_nodes > len(nodes):
                    raise RuntimeError('The number of nodes in the '
                                       'reservation ({}) does not match the '
                                       'requested resources '
                                       '({})'.format(len(nodes),
                                                     nb_experiment_nodes))
                nodes = nodes[:nb_experiment_nodes]
                logger.info("deploying nodes: {}".format(str(nodes)))
                deployed, undeployed = deploy(
                    Deployment(nodes, env_name=env_name),
                    check_deployed_command=already_configured)
                if undeployed:
                    logger.warn("NOT deployed nodes: {}".format(
                        str(undeployed)))
                    raise RuntimeError('Deployment failed')

                if not already_configured:

                    # install OAR
                    install_cmd = "apt-get update; apt-get install -y "
                    node_packages = "oar-node"
                    logger.info("installing OAR nodes: {}".format(
                        str(nodes[1:])))
                    install_oar_nodes = Remote(
                        install_cmd + node_packages,
                        nodes[1:],
                        connection_params={'user': '******'})
                    install_oar_nodes.start()

                    server_packages = (
                        "oar-server oar-server-pgsql oar-user "
                        "oar-user-pgsql postgresql python3-pip "
                        "libjson-perl postgresql-server-dev-all")
                    install_oar_sched_cmd = """
                    mkdir -p /opt/oar_sched; \
                    cd /opt/oar_sched; \
                    git clone https://github.com/oar-team/oar3.git; \
                    cd oar3; \
                    git checkout dce942bebc2; \
                    pip3 install -e .; \
                    cd /usr/lib/oar/schedulers; \
                    ln -s /usr/local/bin/kamelot; \
                    pip3 install psycopg2
                    """
                    logger.info("installing OAR server node: {}".format(
                        str(nodes[0])))
                    install_master = SshProcess(
                        install_cmd + server_packages + ";" +
                        install_oar_sched_cmd,
                        nodes[0],
                        connection_params={'user': '******'})
                    install_master.run()
                    install_oar_nodes.wait()

                    if not install_master.ok:
                        Report(install_master)

                    configure_oar_cmd = """
                    sed -i \
                        -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                        -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                        -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                        -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                        -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                        -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                        -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                        -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                        /etc/oar/oar.conf
                    """.format(result_dir=self.result_dir)
                    configure_oar = Remote(configure_oar_cmd,
                                           nodes,
                                           connection_params={'user': '******'})
                    configure_oar.run()
                    logger.info("OAR is configured on all nodes")

                    # Configure server
                    create_db = "oar-database --create --db-is-local"
                    config_oar_sched = (
                        "oarnotify --remove-queue default;"
                        "oarnotify --add-queue default,1,kamelot")
                    start_oar = "systemctl start oar-server.service"
                    logger.info("configuring OAR database: {}".format(
                        str(nodes[0])))
                    config_master = SshProcess(
                        create_db + ";" + config_oar_sched + ";" + start_oar,
                        nodes[0],
                        connection_params={'user': '******'})
                    config_master.run()

                    # propagate SSH keys
                    logger.info("configuring OAR SSH")
                    oar_key = "/tmp/.ssh"
                    Process('rm -rf ' + oar_key).run()
                    Process(
                        'scp -o BatchMode=yes -o PasswordAuthentication=no '
                        '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                        '-o ConnectTimeout=20 -rp -o User=root ' +
                        nodes[0].address + ":/var/lib/oar/.ssh"
                        ' ' + oar_key).run()
                    # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run()
                    Put(nodes[1:], [oar_key],
                        "/var/lib/oar/",
                        connection_params={
                            'user': '******'
                        }).run()
                    add_resources_cmd = """
                    oarproperty -a cpu || true; \
                    oarproperty -a core || true; \
                    oarproperty -c -a host || true; \
                    oarproperty -a mem || true; \
                    """
                    for node in nodes[1:]:
                        add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(
                            node=node.address)

                    add_resources = SshProcess(
                        add_resources_cmd,
                        nodes[0],
                        connection_params={'user': '******'})
                    add_resources.run()

                    if add_resources.ok:
                        logger.info("oar is now configured!")
                    else:
                        raise RuntimeError(
                            "error in the OAR configuration: Abort!")

                # TODO backup de la config de OAR

                # Do the replay
                logger.info('beginning the replay')
                while len(self.sweeper.get_remaining()) > 0:
                    combi = self.sweeper.get_next()
                    workload_file = os.path.basename(
                        combi['workload_filename'])
                    oar_replay = SshProcess(
                        script_path + "/oar_replay.py " +
                        combi['workload_filename'] + " " + self.result_dir +
                        "  oar_gant_" + workload_file, nodes[0])
                    oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                      workload_file + '.out')
                    logger.info("replaying workload: {}".format(combi))
                    oar_replay.run()
                    if oar_replay.ok:
                        logger.info("Replay workload OK: {}".format(combi))
                        self.sweeper.done(combi)
                    else:
                        logger.info("Replay workload NOT OK: {}".format(combi))
                        self.sweeper.cancel(combi)
                        raise RuntimeError("error in the OAR replay: Abort!")

            except:
                traceback.print_exc()
                ipdb.set_trace()

            finally:
                if is_a_test:
                    ipdb.set_trace()
                if reservation_job_id is None:
                    logger.info("delete job: {}".format(jobs))
                    oardel(jobs)
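wait_oar_job_start() is called above with a prediction_callback: execo invokes that callable with the predicted start date of the job whenever the prediction changes. The actual prediction_callback used in this example is defined outside this excerpt; a minimal sketch of what such a callback might look like:

# Minimal sketch of a prediction callback for wait_oar_job_start();
# the real prediction_callback used above is defined outside this excerpt.
from execo import logger
from execo.time_utils import format_date

def prediction_callback(ts):
    # ts is the predicted job start date (a unix timestamp)
    logger.info("job start prediction: %s" % (format_date(ts),))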
Example #26
class l2c_fft(Engine):
    workingPath = '/home/jrichard/l2c-fft-new-distrib/bin'
    genLadScript = '/home/jrichard/l2c-fft-new-distrib/src/utils/gen-lad/genPencil.py'

    def run(self):
        """
            Main engine method to perform the experiment
        """
        self.define_parameters()
        
        while len(self.sweeper.get_remaining()) > 0:
            # Getting the next combination
            comb = self.sweeper.get_next()
            logger.info(style.host(slugify(comb)) + ' has been started')
            self.get_nodes(comb)

            # If the job is broken, the program is stopped
            if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': 
                break

            try:
                self.workflow(comb)

                # Process all combinations that can use the same submission
                while True:
                    # Find the next combination that can use the same submission
                    subcomb = self.sweeper.get_next(lambda r: 
                        filter(lambda x: x['cores'] == comb['cores']
                                        and x['cluster'] == comb['cluster'], r))

                    if not subcomb: 
                        logger.info('No more combination for cluster=%s and cores=%s',
                            comb['cluster'], comb['cores'])
                        break
                    else:
                        logger.info(style.host(slugify(subcomb)) + ' has been started')

                        if get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error':
                            self.workflow(subcomb)
                        else:
                            break
            
            # Whatever happens (errors, end of loop), the job is deleted
            finally:
                logger.info('Deleting job...')
                oardel([(self.oar_job_id, self.frontend)])

    def workflow(self, comb):
        """
            Compute one application launch 
            using a given parameter group
        """
        comb_ok = False
        try:
            # Generate configuration file needed by MPI processes
            logger.info("Generating assembly file...")
            py = comb['cores'] / comb['px']
            prepare = Process('cd %s && python %s %d %d %d %d %d %s app.lad' % 
                (self.workingPath, self.genLadScript, comb['datasize'], comb['datasize'], 
                    comb['datasize'], comb['px'], py, comb['transposition']))
            prepare.shell = True
            prepare.run()

            # Generate the MPI host file
            mfile = self.generate_machine_file()

            # Start L2C
            lad = "./app.lad"
            logger.info("Computing...")
            res = Process("export OAR_JOB_KEY_FILE=~/.oar_key ; cd %s && l2c_loader -M,-machinefile,%s --mpi -c %d %s" % (self.workingPath, mfile, comb['cores'], lad))
            res.shell = True
            res.stdout_handlers.append(os.path.join(self.result_dir, slugify(comb) + '.out'))
            res.stdout_handlers.append(sys.stdout)
            res.stderr_handlers.append(os.path.join(self.result_dir, slugify(comb) + '.err'))
            res.stderr_handlers.append(sys.stderr)
            res.run()
            if not res.ok:
                logger.error('Bad L2C termination')
                raise Exception('Bad L2C termination')
            if len(res.stderr) > 0: # WARNING: happens e.g. when L2C cannot find the LAD file or on similar oddities
                logger.warning('Not empty error output')

            # Clean configuration files
            logger.info("Removing assembly files...")
            res = Process('cd %s && rm -f app.lad*' % self.workingPath)
            res.shell = True
            res.run()
                
            comb_ok = True
        except Exception:
            pass
        finally:
            if comb_ok:
                self.sweeper.done(comb)
                logger.info(style.host(slugify(comb)) + ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(style.host(slugify(comb)) + ' has been canceled')
        
            logger.info(style.step('%s Remaining'),
                        len(self.sweeper.get_remaining()))

    def define_parameters(self):
        """
            Define the parameters used by the L2C application
        """
        parameters = {
            'cluster': [cluster for site in ['grenoble', 'nancy'] 
                        for cluster in get_site_clusters(site) if cluster != 'graphite'],
            'cores': {i: {'px': expRange(1, i)} 
                      for i in expRange(4, 64)},
            'datasize': expRange(256, 256),
            'transposition': ['XYZ', 'XZY', 'YXZ', 'YZX', 'ZXY', 'ZYX']}
        
        logger.info(pformat(parameters))
        sweeps = sweep(parameters)
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps)        
        logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining()))

    def get_nodes(self, comb):
        """
            Perform a submission for a given comb and 
            retrieve the submission node list
        """
        logger.info('Performing submission')
        n_core = get_host_attributes(comb['cluster'] + '-1')['architecture']['smt_size']
        submission = OarSubmission(resources="nodes=%d" % (max(1, comb['cores']/n_core), ), 
                   sql_properties="cluster='%s'"%comb['cluster'],
                   job_type="besteffort", 
                   name="l2c_fft_eval")
        self.oar_job_id, self.frontend = oarsub([(submission, get_cluster_site(comb['cluster']))])[0]
        logger.info("Waiting for job start")
        wait_oar_job_start(self.oar_job_id, self.frontend)
        logger.info("Retrieving hosts list")
        nodes = get_oar_job_nodes(self.oar_job_id, self.frontend)
        self.hosts = [host for host in nodes for i in range(n_core)]

    def generate_machine_file(self):
        """
            Generate a machine file used by MPI
            to know which nodes to use during the computation
        """
        fd, mfile = mkstemp(dir='/tmp/', prefix='mfile_')
        f = os.fdopen(fd, 'w')
        f.write('\n'.join([host.address for host in self.hosts]))
        f.close()
        return mfile
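define_parameters() above relies on an expRange() helper that is not part of this excerpt. Judging from its uses (expRange(4, 64) for the core counts, expRange(1, i) for px, expRange(256, 256) for the data size), it presumably returns the powers of two between its two bounds, inclusive; a hypothetical sketch:

# Hypothetical sketch of the expRange() helper used by define_parameters() above;
# it is not part of this excerpt and is assumed to return the powers of two
# between its bounds, inclusive (e.g. expRange(4, 64) -> [4, 8, 16, 32, 64]).
def expRange(low, high):
    values = []
    v = low
    while v <= high:
        values.append(v)
        v *= 2
    return values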
Example #27
    def run(self):
        # Defining experiment parameters
        self.parameters = {
            'n_clients': [400, 450, 500, 550, 600],
            'n_transitions': [10000]
        }
        cluster = 'griffon'
        sweeps = sweep(self.parameters)
        sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps)
        server_out_path = os.path.join(self.result_dir, "server.out")

        self._updateStat(sweeper.stats())

        # Loop on the number of nodes
        while True:
            # Taking the next parameter combinations
            comb = sweeper.get_next()
            if not comb: break

            # Performing the submission on G5K
            site = get_cluster_site(cluster)
            self._log("Output will go to " + self.result_dir)

            n_nodes = int(
                math.ceil(
                    float(comb['n_clients']) / EX5.get_host_attributes(
                        cluster + '-1')['architecture']['smt_size'])) + 1
            self._log("Reserving {0} nodes on {1}".format(n_nodes, site))

            resources = "{cluster=\\'" + cluster + "\\'}/nodes=" + str(n_nodes)
            submission = EX5.OarSubmission(resources=resources,
                                           job_type='allow_classic_ssh',
                                           walltime='00:10:00')

            job = EX5.oarsub([(submission, site)])
            self.__class__._job = job

            # Sometimes oarsub fails silently
            if job[0][0] is None:
                print("\nError: no job was created")
                sys.exit(1)

            # Wait for the job to start
            self._log(
                "Waiting for job {0} to start...\n".format(BOLD_MAGENTA +
                                                           str(job[0][0]) +
                                                           NORMAL))
            EX5.wait_oar_job_start(job[0][0],
                                   job[0][1],
                                   prediction_callback=prediction)
            nodes = EX5.get_oar_job_nodes(job[0][0], job[0][1])

            # Deploying nodes
            #deployment = EX5.Deployment(hosts = nodes, env_file='path_to_env_file')
            #run_deploy = EX5.deploy(deployment)
            #nodes_deployed = run_deploy.hosts[0]

            # Copying active_data program on all deployed hosts
            EX.Put([nodes[0]],
                   '../dist/active-data-lib-0.1.2.jar',
                   connexion_params={
                       'user': '******'
                   }).run()
            EX.Put([nodes[0]],
                   '../server.policy',
                   connexion_params={
                       'user': '******'
                   }).run()

            # Loop on the number of requests per client process
            while True:
                # Split the nodes
                clients = nodes[1:]
                server = nodes[0]

                self._log(
                    "Running experiment with {0} nodes and {1} transitions per client"
                    .format(len(clients), comb['n_transitions']))

                # Launching Server on one node
                out_handler = FileOutputHandler(server_out_path)
                launch_server = EX.Remote(
                    'java -jar active-data-lib-0.1.2.jar', [server],
                    stdout_handler=out_handler,
                    stderr_handler=out_handler).start()
                self._log("Server started on " + server.address)
                time.sleep(2)

                # Launching clients
                rank = 0
                n_cores = EX5.get_host_attributes(
                    clients[0])['architecture']['smt_size']
                cores = nodes * n_cores
                cores = cores[
                    0:comb['n_clients']]  # Cut out the additional cores

                client_connection_params = {
                    'taktuk_gateway': 'lyon.grid5000.fr',
                    'host_rewrite_func': None
                }

                self._log("Launching {0} clients...".format(len(cores)))

                client_cmd = "/usr/bin/env java -cp /home/ansimonet/active-data-lib-0.1.2.jar org.inria.activedata.examples.perf.TransitionsPerSecond " + \
                    "{0} {1} {2} {3} {4}".format(server.address, 1200, "{{range(len(cores))}}", len(cores), comb['n_transitions'])
                client_out_handler = FileOutputHandler(
                    os.path.join(self.result_dir, "clients.out"))
                client_request = EX.TaktukRemote(client_cmd, cores, connexion_params = client_connection_params, \
                     stdout_handler = client_out_handler, stderr_handler = client_out_handler)

                client_request.run()

                if not client_request.ok():
                    # Some client failed, please panic
                    self._log(
                        "One or more client process failed. Enjoy reading their outputs."
                    )
                    self._log(
                        "OUTPUT STARTS -------------------------------------------------\n"
                    )
                    for process in client_request.processes():
                        print("----- {0} returned {1}".format(
                            process.host().address, process.exit_code()))
                        if not process.stdout() == "":
                            print(GREEN + process.stdout() + NORMAL)
                        if not process.stderr() == "":
                            print(RED + process.stderr() + NORMAL)
                        print("")
                    self._log(
                        "OUTPUT ENDS ---------------------------------------------------\n"
                    )
                    sweeper.skip(comb)
                    launch_server.kill()
                    launch_server.wait()
                else:
                    # Waiting for server to end
                    launch_server.wait()

                    # Getting log files
                    distant_path = OUT_FILE_FORMAT.format(
                        len(cores), comb['n_transitions'])
                    local_path = distant_path

                    EX.Get([server], distant_path).run()

                    EX.Local('mv ' + distant_path + ' ' +
                             os.path.join(self.result_dir, local_path)).run()

                    EX.Get([server],
                           'client_*.out',
                           local_location=self.result_dir).run()
                    EX.Remote('rm -f client_*.out', [server]).run()

                    self._log(
                        "Finishing experiment with {0} clients and {1} transitions per client"
                        .format(comb['n_clients'], comb['n_transitions']))

                    sweeper.done(comb)

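                # Pick a next combination with the same number of clients so the
                # current reservation can be reused; when none is left, the job is released.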
                sub_comb = sweeper.get_next(filtr=lambda r: filter(
                    lambda s: s["n_clients"] == comb['n_clients'], r))
                self._updateStat(sweeper.stats())

                if not sub_comb:
                    # Killing job
                    EX5.oar.oardel(job)
                    self.__class__._job = None
                    break
                else:
                    comb = sub_comb

        print ""
示例#28
0
class overturn(Engine):

    def create_sweeper(self):
        """Define the parameter space and return a sweeper."""
        parameters = {
            'RA': ['1.e5', '1.e6', '1.e7'],
            'RCMB' : [2.],
            'KFe' : [0.85, 0.9, 0.95, 0.99]
            }
        sweeps = sweep(parameters)
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweeps)

    def create_par_file(self, comb):
        """Create Run directory on remote server and upload par file"""
        logger.info('Creating and uploading par file')
        comb_dir = parent_dir + slugify(comb) + '/'
        logger.info('comb_dir = ' + comb_dir)
        # Create remote directories
        make_dirs = SshProcess('mkdir -p ' + comb_dir + 'Img ; mkdir -p ' +
                               comb_dir + 'Op ; ', jobserver).run()
        # Generate par file
        par_file = 'par_' + slugify(comb)
        logger.info('par_file = %s', style.emph(par_file))
        nml = f90nml.read('template.nml')
        nml['refstate']['ra0'] = float(comb['RA'])
        nml['tracersin']['K_Fe'] = comb['KFe']
        nml['geometry']['r_cmb'] = comb['RCMB']
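        # Vertical resolution nztot grows with log10(Ra), capped at 128;
        # the lateral resolution nytot is scaled to the shell geometry.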
        nztot = min(int(2**(math.log10(float(comb['RA']))+1)), 128)
        nml['geometry']['nztot'] = nztot
        nml['geometry']['nytot'] = int(math.pi*(comb['RCMB']+0.5)*nztot)
        nml.write(par_file, force=True)
        logger.info('Created par file ' + par_file)
        # Upload par file to remote directory
        Put([jobserver], [par_file], remote_location=comb_dir).run()
        SshProcess('cd ' + comb_dir + ' ; mv ' + par_file+ ' par', jobserver).run()
        logger.info('Done')

    def submit_job(self, comb):
        """Use the batch script on psmn"""
        logger.info('Submit job on '+ jobserver)
        comb_dir = parent_dir + slugify(comb) + '/'
        job_sub = SshProcess('cd ' + comb_dir +
                             ' ; /usr/local/bin/qsub /home/stephane/ExamplePBS/batch_single',
                             jobserver).run()

        return job_sub.stdout.splitlines()[-1].split('.')[0]

    def is_job_running(self, job_id=None):
        """ """
        get_state = SshProcess('qstat -f ' + str(job_id), jobserver)
        get_state.ignore_exit_code = True
        get_state.run()
        return get_state.ok

    def retrieve(self):
        """ """
        SshProcess('')

    def workflow(self, comb):
        self.create_par_file(comb)
        job_id = self.submit_job(comb)
        logger.info('Combination %s will be treated by job %s',
                    slugify(comb), str(job_id))

        while self.is_job_running(job_id):
            sleep(10)

        self.sweeper.done(comb)


    def run(self):

        self.create_sweeper()
        logger.info('%s parameters combinations to be treated', len(self.sweeper.get_sweeps()))
        threads = []
        while len(self.sweeper.get_remaining()) > 0:
            comb = self.sweeper.get_next()
            logger.info('comb = %s', comb)
            t = Thread(target=self.workflow, args=(comb,))
            t.daemon = True
            threads.append(t)
            t.start()
        
        for t in threads:
            t.join()
示例#29
0
@enostask()
def backup(env=None):
    LOG.info(f"Running backup on {env['roles']}")


@enostask()
def destroy(env=None):
    LOG.info(f"Running destroy on {env['roles']}")


# Iterate over a set of parameters
parameters = {"param1": [1, 4], "param2": ["a", "b"]}
sweeps = sweep(parameters)
sweeper = ParamSweeper(
    persistence_dir=str(Path("sweeps")), sweeps=sweeps, save_sweeps=True
)
parameter = sweeper.get_next()
while parameter:
    try:
        deploy()
        bench(parameter)
        backup()
        sweeper.done(parameter)
    except Exception as e:
        traceback.print_exc()
        sweeper.skip(parameter)
    finally:
        destroy()
        parameter = sweeper.get_next()
示例#30
0
class compil_aevol(Engine):

    def __init__(self):
        """Overloading class initialization with parent and adding options"""
        super(compil_aevol, self).__init__()

    def run(self):
        """ """
        try:
            # Creation of the main iterator which is used for the first control loop.
            self.define_parameters()

            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                comb = self.sweeper.get_next()
                if comb:
                    self.workflow(comb)
        finally:
            logger.info("Compilation DONE")

    def define_parameters(self):
        """ """
        parameters = {
            'blas': ['none', 'mkl', 'atlas', 'openblas'],
            'experiment': ['aevol', 'raevol'],
            'compilator': ['gcc', 'intel'],
            'parallel': ['openmp', 'tbb']
        }
        sweeps = sweep(parameters)
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps)
        logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining()))

    def workflow(self, comb):
        """ """
        comb_ok = False
        logger.info(slugify(comb) + \
                             ' starts to compile')
        try:
	    export = "source /opt/intel/bin/compilervars.sh intel64; "
	    
	    src_directory = "/home/arrouan/workspace/aevol/git/world/aevol/"
	    
	    bin_directory = "/home/arrouan/workspace/aevol/compiled_binary/"
    
	    configure_option = "--with-tracing --without-x"
	    
	    if comb['parallel'] == 'tbb':
	      configure_option += " --with-tbb"
	      
	    if comb['blas'] == 'openblas':
	      configure_option += " --with-blas"
	    elif comb['blas'] == 'mkl':
	      configure_option += " --with-mkl"
	    elif comb['blas'] == 'atlas':
	      configure_option += " --with-atlas"
	        
	    if comb['experiment'] == 'raevol':
	      configure_option += " --with-raevol"
	      
	    if comb['compilator'] == 'intel':
	      configure_option += " CXX=icc"
	      
	    full_bin_directory = bin_directory + comb['experiment']+'_'+comb['compilator']+'_'+comb['parallel']+'_'+comb['blas']
	    
	    try:
	      os.mkdir(full_bin_directory)
            except:
	      for f in os.listdir(full_bin_directory):
		os.remove(full_bin_directory + "/" + f)
		
	    p = Process(export+'cd '+src_directory+'; autoreconf; ./configure '+configure_option+'; make clean; make; cp src/aevol_run '+full_bin_directory+'/; cp src/aevol_create '+full_bin_directory+'/')
	    p.shell = True
	    #
	    p.run()
	    
	    print p.stdout
	    
            comb_ok = True
        finally:
            if comb_ok:
                self.sweeper.done(comb)
	        logger.info(slugify(comb) + \
                             ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(slugify(comb) + \
                            ' has been canceled')
        logger.info(style.step('%s Remaining'),
                        len(self.sweeper.get_remaining()))

    def is_job_alive(self):
        rez=get_oar_job_info(self.oar_job_id)
        if (rez["start_date"]+rez["walltime"] > time.time()):
            return True
        else:
            return False
示例#31
0
def campaign(broker, provider, conf, test, env):
    def generate_id(params):
        def clean(s):
            return str(s).replace("/", "_sl_") \
                .replace(":", "_sc_")

        return "-".join([
            "%s__%s" % (clean(k), clean(v)) for k, v in sorted(params.items())
        ])

    def accept(params):
        call_ratio_max = 3
        cast_ratio_max = 3
        call_type = params["call_type"]
        if params["nbr_servers"] > params["nbr_clients"]:
            return False
        if call_type == "rpc-call":
            if not params["pause"]:
                # maximum rate
                return call_ratio_max * params["nbr_servers"] >= params[
                    "nbr_clients"]
            else:
                # we can afford more clients
                # based on our estimation a client sends 200msgs at full rate
                return call_ratio_max * params["nbr_servers"] >= params[
                    "nbr_clients"] * 200 * params["pause"]
        else:
            if not params["pause"]:
                # maximum rate
                return cast_ratio_max * params["nbr_servers"] >= params[
                    "nbr_clients"]
            else:
                # we can afford more clients
                # based on our estimation a client sends 200msgs at full rate
                return cast_ratio_max * params["nbr_servers"] >= params[
                    "nbr_clients"] * 1000 * params["pause"]

    # Function to pass in parameter to ParamSweeper.get_next()
    # Give the illusion that the Set of params is sorted by nbr_clients
    def sort_params_by_nbr_clients(set):
        return sorted((list(set)), key=lambda k: k['nbr_clients'])

    # Dump each params in the backup dir
    def dump_param(params):
        if not os.path.exists("%s/params.json" % test):
            with open("%s/params.json" % test, 'w') as outfile:
                json.dump([], outfile)
        #Add the current params to the json
        with open("%s/params.json" % test, 'r') as outfile:
            all_params = json.load(outfile)
        all_params.append(params)
        with open("%s/params.json" % test, 'w') as outfile:
            json.dump(all_params, outfile)

    # Loading the conf
    config = {}
    with open(conf) as f:
        config = yaml.safe_load(f)
    parameters = config["campaign"][test]

    sweeps = sweep(parameters)
    filtered_sweeps = [param for param in sweeps if accept(param)]
    sweeper = ParamSweeper(
        # Maybe puts the sweeper under the experimentation directory
        # This should be current/sweeps
        persistence_dir=os.path.join("%s/sweeps" % test),
        sweeps=filtered_sweeps,
        save_sweeps=True,
        name=test)
    params = sweeper.get_next(sort_params_by_nbr_clients)
    PROVIDERS[provider](broker=broker, config=config, env=test)
    t.inventory()

    while params:
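        # Tag the combination with a unique backup directory derived from its parameters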
        params.pop("backup_dir", None)
        params.update({"backup_dir": generate_id(params)})
        t.prepare(broker=broker)
        t.test_case_1(**params)
        sweeper.done(params)
        dump_param(params)
        params = sweeper.get_next(sort_params_by_nbr_clients)
        t.destroy()
示例#32
0
class overturn(Engine):

    def create_sweeper(self):
        """Define the parameter space and return a sweeper."""
        parameters = {
            'RA': ['1.e5', '1.e6'],
            'RCMB' : [1.19, 3.29],
            'KFe' : [0.85, 0.9]
            }
        sweeps = sweep(parameters)
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweeps)

    def create_par_file(self, comb):
        """Create Run directory on remote server and upload par file"""
        logger.info('Creating par file')
        comb_dir = parent_dir + slugify(comb) + '/'
        logger.info('comb_dir = ' + comb_dir)
        # Create remote directories
        
        mdir = sp.call('mkdir -p ' + comb_dir + 'Img ; mkdir -p ' + comb_dir + 'Op ; ', shell=True)
        # Generate par file
        par_file = 'par_' + slugify(comb)
        nml = f90nml.read('template.nml')
        nml['refstate']['ra0'] = float(comb['RA'])
        nml['tracersin']['K_Fe'] = comb['KFe']
        nml['geometry']['r_cmb'] = comb['RCMB']
        nztot = min(int(2**(math.log10(float(comb['RA']))+1)), 128)
        nml['geometry']['nztot'] = nztot
        nml['geometry']['nytot'] = int(math.pi*(comb['RCMB']+0.5)*nztot)
        nml.write(par_file, force=True)
        logger.info('Created par file ' + par_file)
        # Upload par file to remote directory
        cpar = sp.call('cp ' + par_file + ' ' + comb_dir, shell=True)
        mpar = sp.call('cd ' + comb_dir + ' ; mv ' + par_file+ ' par', shell=True)
        logger.info('Done')

    def submit_job(self, comb):
        """Use the batch script"""
        logger.info('Submitting job on ' + jobserver)
        comb_dir = parent_dir + slugify(comb) + '/'
        job_sub = sp.Popen('cd ' + comb_dir +
                             ' ; /usr/local/bin/qsub /home/stephane/ExamplePBS/batch_single',
                             shell=True,
                             stdout=sp.PIPE, stderr=sp.STDOUT)
        return job_sub.stdout.readlines()[-1].split('.')[0]

    def workflow(self, comb):
        self.create_par_file(comb)
        job_id = self.submit_job(comb)
        logger.info('Combination %s will be treated by job %s',
                    slugify(comb), str(job_id))
        self.sweeper.done(comb)

    def run(self):
        self.create_sweeper()
        logger.info('%s parameters combinations to be treated', len(self.sweeper.get_sweeps()))
        threads = []
        while len(self.sweeper.get_remaining()) > 0:
            comb = self.sweeper.get_next()
            t = Thread(target=self.workflow, args=(comb,))
            t.daemon = True
            threads.append(t)
            t.start()
        for t in threads:
            t.join()
示例#33
0
    def run(self):
        """Run the experiment"""
        already_configured = self.options.already_configured
        reservation_job_id = int(self.options.reservation_id) \
            if self.options.reservation_id is not None else None
        is_a_test = self.options.is_a_test

        if is_a_test:
            logger.warn('THIS IS A TEST! This run will use only a few '
                        'resources')

        # make the result folder writable for all
        os.chmod(self.result_dir, 0o777)
        # Import configuration
        with open(self.args[0]) as config_file:
            config = json.load(config_file)
        # backup configuration
        copy(self.args[0], self.result_dir)

        site = config["grid5000_site"]
        resources = config["resources"]
        nb_experiment_nodes = config["nb_experiment_nodes"]
        walltime = str(config["walltime"])
        env_name = config["kadeploy_env_name"]
        workloads = config["workloads"]
        # check that the workloads exist (assumes that the same NFS mount point
        # is present on both the remote and the local environment)
        for workload_file in workloads:
            with open(workload_file):
                pass
            # copy the workloads files to the results dir
            copy(workload_file, self.result_dir)

        # define the workloads parameters
        self.parameters = {
            'workload_filename': workloads
        }
        logger.info('Workloads: {}'.format(workloads))

        # define the iterator over the parameters combinations
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))

        # A previous run (resumed with -c result_dir) may have skipped some combinations
        logger.info('Skipped parameters:' +
                    '{}'.format(str(self.sweeper.get_skipped())))

        logger.info('Number of parameters combinations {}'.format(
            str(len(self.sweeper.get_remaining()))))
        logger.info('combinations {}'.format(
            str(self.sweeper.get_remaining())))

        if reservation_job_id is not None:
            jobs = [(reservation_job_id, site)]
        else:
            jobs = oarsub([(OarSubmission(resources=resources,
                                          job_type='deploy',
                                          walltime=walltime), site)])
        job_id, site = jobs[0]
        if job_id:
            try:
                logger.info("waiting job start %s on %s" % (job_id, site))
                wait_oar_job_start(
                    job_id, site, prediction_callback=prediction_callback)
                logger.info("getting nodes of %s on %s" % (job_id, site))
                nodes = get_oar_job_nodes(job_id, site)
                # sort the nodes
                nodes = sorted(nodes, key=lambda node: node.address)
                # get only the necessary nodes under the switch
                if nb_experiment_nodes > len(nodes):
                    raise RuntimeError('The number of nodes given in the '
                                       'reservation ({}) does not match the '
                                       'requested resources '
                                       '({})'.format(len(nodes),
                                                     nb_experiment_nodes))
                nodes = nodes[:nb_experiment_nodes]
                logger.info("deploying nodes: {}".format(str(nodes)))
                deployed, undeployed = deploy(
                    Deployment(nodes, env_name=env_name),
                    check_deployed_command=already_configured)
                if undeployed:
                    logger.warn(
                        "NOT deployed nodes: {}".format(str(undeployed)))
                    raise RuntimeError('Deployment failed')

                if not already_configured:

                    # install OAR
                    install_cmd = "apt-get update; apt-get install -y "
                    node_packages = "oar-node"
                    logger.info(
                        "installing OAR nodes: {}".format(str(nodes[1:])))
                    install_oar_nodes = Remote(
                        install_cmd + node_packages,
                        nodes[1:],
                        connection_params={'user': '******'})
                    install_oar_nodes.start()

                    server_packages = ("oar-server oar-server-pgsql oar-user "
                                       "oar-user-pgsql postgresql python3-pip "
                                       "libjson-perl postgresql-server-dev-all")
                    install_oar_sched_cmd = """
                    mkdir -p /opt/oar_sched; \
                    cd /opt/oar_sched; \
                    git clone https://github.com/oar-team/oar3.git; \
                    cd oar3; \
                    git checkout dce942bebc2; \
                    pip3 install -e .; \
                    cd /usr/lib/oar/schedulers; \
                    ln -s /usr/local/bin/kamelot; \
                    pip3 install psycopg2
                    """
                    logger.info("installing OAR server node: {}".format(str(nodes[0])))
                    install_master = SshProcess(install_cmd + server_packages +
                                                ";" + install_oar_sched_cmd, nodes[0],
                                                connection_params={'user': '******'})
                    install_master.run()
                    install_oar_nodes.wait()

                    if not install_master.ok:
                        Report(install_master)

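                    # Point OAR at the local Postgres database, raise the log level,
                    # redirect the log file to the result directory and adjust the
                    # cpuset/finaud settings.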
                    configure_oar_cmd = """
                    sed -i \
                        -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                        -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                        -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                        -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                        -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                        -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                        -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                        -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                        /etc/oar/oar.conf
                    """.format(result_dir=self.result_dir)
                    configure_oar = Remote(configure_oar_cmd, nodes,
                                           connection_params={'user': '******'})
                    configure_oar.run()
                    logger.info("OAR is configured on all nodes")

                    # Configure server
                    create_db = "oar-database --create --db-is-local"
                    config_oar_sched = ("oarnotify --remove-queue default;"
                                        "oarnotify --add-queue default,1,kamelot")
                    start_oar = "systemctl start oar-server.service"
                    logger.info(
                        "configuring OAR database: {}".format(str(nodes[0])))
                    config_master = SshProcess(create_db + ";" + config_oar_sched + ";" + start_oar,
                                               nodes[0],
                                               connection_params={'user': '******'})
                    config_master.run()

                    # propagate SSH keys
                    logger.info("configuring OAR SSH")
                    oar_key = "/tmp/.ssh"
                    Process('rm -rf ' + oar_key).run()
                    Process('scp -o BatchMode=yes -o PasswordAuthentication=no '
                            '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                            '-o ConnectTimeout=20 -rp -o User=root ' +
                            nodes[0].address + ":/var/lib/oar/.ssh"
                            ' ' + oar_key).run()
                    # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run()
                    Put(nodes[1:], [oar_key], "/var/lib/oar/", connection_params={'user': '******'}).run()
                    add_resources_cmd = """
                    oarproperty -a cpu || true; \
                    oarproperty -a core || true; \
                    oarproperty -c -a host || true; \
                    oarproperty -a mem || true; \
                    """
                    for node in nodes[1:]:
                        add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(node=node.address)

                    add_resources = SshProcess(add_resources_cmd, nodes[0],
                                               connection_params={'user': '******'})
                    add_resources.run()

                    if add_resources.ok:
                        logger.info("oar is now configured!")
                    else:
                        raise RuntimeError("error in the OAR configuration: Abort!")

                # TODO: back up the OAR configuration

                # Do the replay
                logger.info('beginning the replay')
                while len(self.sweeper.get_remaining()) > 0:
                    combi = self.sweeper.get_next()
                    workload_file = os.path.basename(combi['workload_filename'])
                    oar_replay = SshProcess(script_path + "/oar_replay.py " +
                                            combi['workload_filename'] + " " +
                                            self.result_dir + "  oar_gant_" +
                                            workload_file,
                                            nodes[0])
                    oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                      workload_file + '.out')
                    logger.info("replaying workload: {}".format(combi))
                    oar_replay.run()
                    if oar_replay.ok:
                        logger.info("Replay workload OK: {}".format(combi))
                        self.sweeper.done(combi)
                    else:
                        logger.info("Replay workload NOT OK: {}".format(combi))
                        self.sweeper.cancel(combi)
                        raise RuntimeError("error in the OAR replay: Abort!")

            except:
                traceback.print_exc()
                ipdb.set_trace()

            finally:
                if is_a_test:
                    ipdb.set_trace()
                if reservation_job_id is None:
                    logger.info("delete job: {}".format(jobs))
                    oardel(jobs)
示例#34
0
class fp_hadoop(Engine):

    def __init__(self):
        """ Surchargement la methode init pour ajouter des options"""
        super(fp_hadoop, self).__init__()
        self.options_parser.set_usage("usage: %prog <cluster>")
        self.options_parser.add_argument("cluster",
                    "The cluster on which to run the experiment")
        self.options_parser.add_option("-k", dest="keep_alive",
                    help="keep reservation alive ..",
                    action="store_true")
        self.options_parser.add_option("-j", dest="oar_job_id",
                    help="oar_job_id to relaunch an engine",
                    type=int)
        self.options_parser.add_option("-o", dest="outofchart",
                    help="Run the engine outside days",
                    action="store_true")
        self.n_nodes = 10
        self.options_parser.add_option("-w", dest="walltime",
                    help="walltime for the reservation",
                    type="string",
                    default="3:00:00")

    def xp(self, comb):
        comb_ok = False
        try:
            """ tout ton xp """

            comb_ok = True
        finally:
            if comb_ok:
                self.sweeper.done(comb)
            else:
                self.sweeper.cancel(comb)
            logger.info(style.step('%s Remaining'),
                        len(self.sweeper.get_remaining()))

    def run(self):
        """Inherited method, put here the code for running the engine"""
        self.define_parameters()
        self.cluster = self.args[0]
        self.site = get_cluster_site(self.cluster)
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        try:
            # Creation of the main iterator which is used for the first control loop.
            # You need have a method called define_parameters, that returns a list of parameter dicts
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts for the experiments
                if job_is_dead or self.oar_job_id is None:
                    self.make_reservation()
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend)
                # Hosts deployment
                deployed, undeployed = deploy(Deployment(self.hosts, 
                    env_file="/home/mliroz/deploys/hadoop6.env"))
                logger.info("%i deployed, %i undeployed" % (len(deployed), 
                                                            len(undeployed)))
                if len(deployed) == 0:
                    break
                # System configuration => look at the execo_g5k.topology module
                attr = get_host_attributes(self.cluster + '-1')
                
                ## SETUP FINISHED
                
                # Getting the next combination
                comb = self.sweeper.get_next()
                self.prepare_dataset(comb)
                self.xp(comb)
                # subloop over the combinations that have the same sizes
                while True:
                    newcomb = self.sweeper.get_next(lambda r:
                            filter(lambda subcomb: subcomb['sizes'] == comb['sizes'], r))
                    if newcomb:
                        try:
                            self.xp(newcomb)
                        except:
                            break
                    else:
                        break

                if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error':
                    job_is_dead = True

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging') 

    def define_parameters(self):
        """Create the iterator that contains the parameters to be explored """
        self.parameters = {
            'sizes': [100],
            'zipf': [1],
            'pop_keys': [100],
            'min_size': [500, 1000],
            'int_phases': [1, 2, 3, 4, 5, 10],
            'iosf': [100]
                      }
        logger.info(self.parameters)
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), 
                                    sweep(self.parameters))
        logger.info('Number of parameters combinations %s', len(self.sweeper.get_remaining()))

    def _get_nodes(self, starttime, endtime):
        """ """
        planning = get_planning(elements=[self.cluster],
                                starttime=starttime,
                                endtime=endtime,
                                out_of_chart=self.options.outofchart)
        slots = compute_slots(planning, self.options.walltime)
        startdate = slots[0][0]
        i_slot = 0
        n_nodes = slots[i_slot][2][self.cluster]
        while n_nodes < self.n_nodes:
            logger.debug(slots[i_slot])
            startdate = slots[i_slot][0]
            n_nodes = slots[i_slot][2][self.cluster]
            i_slot += 1
            if i_slot == len(slots) - 1:
                return False, False
        return startdate, self.n_nodes

    def make_reservation(self):
        """Perform a reservation of the required number of nodes"""
        logger.info('Performing reservation')
        starttime = int(time.time() + timedelta_to_seconds(datetime.timedelta(minutes=1)))
        endtime = int(starttime + timedelta_to_seconds(datetime.timedelta(days=3,
                                                                 minutes=1)))
        startdate, n_nodes = self._get_nodes(starttime, endtime)
        while not n_nodes:
            logger.info('Not enough nodes found between %s and %s, ' + \
                        'increasing time window',
                        format_date(starttime), format_date(endtime))
            starttime = endtime
            endtime = int(starttime + timedelta_to_seconds(datetime.timedelta(days=3,
                                                                minutes=1)))
            startdate, n_nodes = self._get_nodes(starttime, endtime)
            if starttime > int(time.time() + timedelta_to_seconds(
                                            datetime.timedelta(weeks=6))):
                logger.error('There are not enough nodes on %s for your ' + \
                             'experiments, abort ...', self.cluster)
                exit()
        jobs_specs = get_jobs_specs({self.cluster: n_nodes},
                                    name=self.__class__.__name__)
        sub = jobs_specs[0][0]
        sub.walltime = self.options.walltime
        sub.additional_options = '-t deploy'
        sub.reservation_date = startdate
        (self.oar_job_id, self.frontend) = oarsub(jobs_specs)[0]
        logger.info('Startdate: %s, n_nodes: %s', format_date(startdate),
                    str(n_nodes))