def get_oar_job_vm5k_resources(jobs):
    """Retrieve the hosts list and (ip, mac) list from a list of oar_job and
    return the resources dict needed by vm5k_deployment"""
    resources = {}
    for oar_job_id, site in jobs:
        logger.detail('Retrieving resources from %s:%s',
                      style.emph(site), oar_job_id)
        oar_job_id = int(oar_job_id)
        wait_oar_job_start(oar_job_id, site)
        logger.debug('Retrieving hosts')
        hosts = [host.address for host in get_oar_job_nodes(oar_job_id, site)]
        logger.debug('Retrieving subnet')
        ip_mac, _ = get_oar_job_subnets(oar_job_id, site)
        kavlan = None
        if len(ip_mac) == 0:
            logger.debug('Retrieving kavlan')
            kavlan = get_oar_job_kavlan(oar_job_id, site)
            if kavlan:
                assert len(kavlan) == 1
                kavlan = kavlan[0]
                ip_mac = get_kavlan_ip_mac(kavlan, site)
        resources[site] = {'hosts': hosts,
                           'ip_mac': ip_mac[300:],
                           'kavlan': kavlan}
    return resources
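
# --- Added sketch (not part of any example above) ---
# All of these examples share the same reservation lifecycle; a minimal,
# hedged sketch of it, with the site, resources and walltime values being
# illustrative assumptions:
from execo_g5k import (OarSubmission, oarsub, oardel,
                       wait_oar_job_start, get_oar_job_nodes)

site = 'rennes'  # assumption: any Grid'5000 site would do
((job_id, frontend),) = oarsub([(OarSubmission(resources='nodes=1',
                                               walltime='0:30:00'), site)])
if job_id:
    try:
        wait_oar_job_start(job_id, frontend)         # block until OAR starts the job
        nodes = get_oar_job_nodes(job_id, frontend)  # list of execo Host objects
    finally:
        oardel([(job_id, frontend)])                 # always release the reservation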
def worker_start(self, cluster, site, oarsubmission, data, worker_index):
    th = current_thread()
    th.cluster = cluster
    th.site = site
    th.worker_index = worker_index
    th.jobid = None
    try:
        with th.oarsublock:
            if th.willterminate:
                return
            worker_log.detail("submit oar job")
            ((th.jobid, _),) = oarsub([(oarsubmission, site)])
        if not th.jobid:
            worker_log.detail("job submission failed")
            self.worker(cluster, site, data, None, worker_index,
                        oarsubmission, None)
        worker_log.detail("job submitted - wait job start")
        wait_oar_job_start(
            th.jobid, site,
            prediction_callback=lambda ts: worker_log.detail(
                "job start prediction: %s" % (format_date(ts),)))
        th.waiting = False
        worker_log.detail("job started - get job nodes")
        nodes = get_oar_job_nodes(th.jobid, site)
        worker_log.detail("got %i nodes" % (len(nodes),))
        self.worker(cluster, site, data, nodes, worker_index,
                    oarsubmission, th.jobid)
    finally:
        with th.oarsublock:
            if th.jobid:
                worker_log.detail("delete oar job")
                oardel([(th.jobid, site)])
                th.jobid = None
        worker_log.detail("exit")
def run_xp(self):
    """Iterate over the parameters and execute the bench"""
    while len(self.sweeper.get_remaining()) > 0:
        comb = self.sweeper.get_next()
        if comb['n_core'] > get_host_attributes(comb['cluster'] + '-1')['architecture']['smt_size'] * self.n_nodes:
            self.sweeper.skip(comb)
            continue
        logger.info('Processing new combination %s' % (comb,))
        site = get_cluster_site(comb['cluster'])
        jobs = oarsub([(OarSubmission(resources="{cluster='" + comb['cluster'] + "'}/nodes=" + str(self.n_nodes),
                                      job_type='allow_classic_ssh',
                                      walltime='0:10:00'), site)])
        if jobs[0][0]:
            try:
                wait_oar_job_start(*jobs[0])
                nodes = get_oar_job_nodes(*jobs[0])
                bench_cmd = 'mpirun -H %s -n %i %s ~/NPB3.3-MPI/bin/lu.%s.%i' % (
                    ",".join([node.address for node in nodes]),
                    comb['n_core'],
                    get_mpi_opts(comb['cluster']),
                    comb['size'],
                    comb['n_core'])
                lu_bench = SshProcess(bench_cmd, nodes[0])
                lu_bench.stdout_handlers.append(self.result_dir + '/' + slugify(comb) + '.out')
                lu_bench.run()
                if lu_bench.ok:
                    logger.info("comb ok: %s" % (comb,))
                    self.sweeper.done(comb)
                    continue
            finally:
                oardel(jobs)
        logger.info("comb NOT ok: %s" % (comb,))
        self.sweeper.cancel(comb)
def get_host(self):
    """Returns the hosts from an existing reservation (if any), or from
    a new reservation"""
    # Look if there is a running job
    self.site = get_cluster_site(self.config['cluster'])
    jobs = EX5.get_current_oar_jobs([self.site])
    self.job_id = None
    for t in jobs:
        if EX5.get_oar_job_info(t[0], self.site)['name'] == self.options.job_name:
            self.job_id = t[0]
            break
    if self.job_id:
        logger.info('Using job %s' % style.emph(self.job_id))
    else:
        logger.info('Making a new reservation')
        self._make_reservation(self.site)
    if not self.job_id:
        logger.error("Could not get a reservation for the job")
        exit(6)
    EX5.wait_oar_job_start(self.job_id, self.site)
    pp(EX5.get_oar_job_nodes(self.job_id, self.site))
    return EX5.get_oar_job_nodes(self.job_id, self.site)[0]
def get_or_create_job(resources, job_name, walltime, reservation_date,
                      queue, reservation_type):
    gridjob, _ = ex5.planning.get_job_by_name(job_name)
    if gridjob is None:
        gridjob = make_reservation(resources, job_name, walltime,
                                   reservation_date, queue, reservation_type)
    if reservation_type == "oar":
        logger.info("Waiting for oarjob %s to start" % gridjob)
        ex5.wait_oar_job_start(gridjob)
    else:
        logger.info("Waiting for oargridjob %s to start" % gridjob)
        ex5.wait_oargrid_job_start(gridjob)
    return gridjob
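
# A hedged usage sketch for get_or_create_job; all argument values are
# assumptions for illustration, not taken from the original code:
job = get_or_create_job(resources='nodes=2',
                        job_name='my_experiment',   # hypothetical job name
                        walltime='1:00:00',
                        reservation_date=None,
                        queue='default',
                        reservation_type='oar')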
def get_resources(self):
    """Retrieve the hosts address list and (ip, mac) list from a list of
    oar_result and return the resources, a dict needed by g5k_provisioner"""
    logger.info("Getting resources specs")
    self.resources = dict()
    self.hosts = list()
    for oar_job_id, site in self.oar_result:
        logger.info('Waiting for the reserved nodes on %s to be up' % site)
        if not wait_oar_job_start(oar_job_id, site):
            logger.error('The reserved resources cannot be used.\n'
                         'The program is terminated.')
            exit()
    for oar_job_id, site in self.oar_result:
        logger.info('Retrieving resource information on %s' % site)
        logger.debug('Retrieving hosts')
        hosts = [host.address for host in get_oar_job_nodes(oar_job_id, site)]
        # if len(hosts) != self.clusters[site]:
        logger.debug('Retrieving subnet')
        ip_mac, _ = get_oar_job_subnets(oar_job_id, site)
        kavlan = None
        if len(ip_mac) == 0:
            logger.debug('Retrieving kavlan')
            kavlan = get_oar_job_kavlan(oar_job_id, site)
            if kavlan:
                ip_mac = self.get_kavlan_ip_mac(kavlan, site)
        self.resources[site] = {'hosts': hosts,
                                'ip_mac': ip_mac,
                                'kavlan': kavlan}
    for site, resource in self.resources.items():
        self.hosts += resource['hosts']
def get_cpu_topology(cluster, xpdir=None):
    """Determine the CPU topology of a cluster node, from a cached XML file
    in xpdir if available, otherwise from virsh capabilities on a reserved
    node."""
    logger.info('Determining the architecture of cluster ' +
                style.emph(cluster))
    root = None
    # Trying to read the topology from a cache directory
    if xpdir:
        fname = xpdir + '/topo_' + cluster + '.xml'
        try:
            tree = parse(fname)
            root = tree.getroot()
        except:
            logger.info('No cache file found, will reserve a node and '
                        'determine topology from virsh capabilities')
    if root is None:
        frontend = get_cluster_site(cluster)
        submission = OarSubmission(resources="{cluster='" + cluster + "'}/nodes=1",
                                   walltime="0:02:00",
                                   job_type="allow_classic_ssh")
        ((job_id, _),) = oarsub([(submission, frontend)])
        wait_oar_job_start(job_id, frontend)
        host = get_oar_job_nodes(job_id, frontend)[0]
        capa = SshProcess('virsh capabilities', host,
                          connection_params={'user': default_frontend_connection_params['user']}).run()
        oardel([(job_id, frontend)])
        root = fromstring(capa.stdout)
        if xpdir is not None:
            tree = ElementTree(root)
            tree.write(fname)
    cpu_topology = []
    i_cell = 0
    for cell in root.findall('.//cell'):
        cpu_topology.append([])
        for cpu in cell.findall('.//cpu'):
            cpu_topology[i_cell].append(int(cpu.attrib['id']))
        i_cell += 1
    logger.info(pformat(cpu_topology))
    return cpu_topology
def get_nodes(self, comb):
    """Perform a submission for a given comb and retrieve the submission
    node list"""
    logger.info('Performing submission')
    n_core = get_host_attributes(comb['cluster'] + '-1')['architecture']['smt_size']
    submission = OarSubmission(resources="nodes=%d" % (max(1, comb['cores'] / n_core),),
                               sql_properties="cluster='%s'" % comb['cluster'],
                               job_type="besteffort",
                               name="l2c_fft_eval")
    self.oar_job_id, self.frontend = oarsub([(submission,
                                              get_cluster_site(comb['cluster']))])[0]
    logger.info("Waiting for job start")
    wait_oar_job_start(self.oar_job_id, self.frontend)
    logger.info("Retrieving hosts list")
    nodes = get_oar_job_nodes(self.oar_job_id, self.frontend)
    self.hosts = [host for host in nodes for i in range(n_core)]
def prepare_bench(self):
    """bench configuration and compilation, copy binaries to frontends

    return True if preparation is ok
    """
    logger.info("preparation: configure and compile benchmark")
    # the involved sites. We will do the compilation on the first of these.
    sites = list(set(map(get_cluster_site, self.parameters['cluster'])))
    # generate the bench compilation configuration
    bench_list = '\n'.join(['lu\t%s\t%s' % (size, n_core)
                            for n_core in self.parameters['n_core']
                            for size in self.parameters['size']])
    # Reserving a node because compiling on the frontend is forbidden
    # and because we need mpif77
    jobs = oarsub([(OarSubmission(resources="nodes=1",
                                  job_type='allow_classic_ssh',
                                  walltime='0:10:00'), sites[0])])
    if jobs[0][0]:
        try:
            logger.info("copying bench archive to %s" % (sites[0],))
            copy_bench = Put([sites[0]], ['NPB3.3-MPI.tar.bz2']).run()
            logger.info("extracting bench archive on %s" % (sites[0],))
            extract_bench = Remote('tar -xjf NPB3.3-MPI.tar.bz2', [sites[0]]).run()
            logger.info("waiting job start %s" % (jobs[0],))
            wait_oar_job_start(*jobs[0], prediction_callback=pred_cb)
            logger.info("getting nodes of %s" % (jobs[0],))
            nodes = get_oar_job_nodes(*jobs[0])
            logger.info("configure bench compilation")
            conf_bench = Remote('echo "%s" > ~/NPB3.3-MPI/config/suite.def' % bench_list,
                                nodes).run()
            logger.info("compiling bench")
            compilation = Remote('cd NPB3.3-MPI && make clean && make suite', nodes).run()
            logger.info("compilation finished")
        except:
            logger.error("unable to compile bench")
            return False
        finally:
            oardel(jobs)
    # Copying binaries to all other frontends
    frontends = sites[1:]
    rsync = Remote('rsync -avuP ~/NPB3.3-MPI/ {{frontends}}:NPB3.3-MPI',
                   [get_host_site(nodes[0])] * len(frontends))
    rsync.run()
    return compilation.ok and rsync.ok
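
# prepare_bench references pred_cb without defining it; a minimal stand-in
# (an assumption, not the original callback) that only logs the predicted
# start date passed by wait_oar_job_start:
from execo.time_utils import format_date

def pred_cb(ts):
    logger.info("job start prediction: %s", format_date(ts))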
def launch_bench(oarsubmission, site, folder):
    """Copy required files on frontend(s) and compile bench suite."""
    logger.info("Reserving a node.")
    jobs = oarsub([(oarsubmission, site)])
    (job_id, site) = jobs[0]
    logger.info(jobs)
    if job_id:
        try:
            logger.info("Node reserved.")
            wait_oar_job_start(job_id, site)
            logger.info("Deploying environment.")
            node = deploy_node(job_id, site, oarsubmission)
            logger.info("Compiling Bots.")
            setup_node(node)
            logger.info("Starting benchs.")
            run_bench(folder, node)
        except:
            logger.error("Unable to deploy & compile Bench.")
            oardel(jobs)
            return False
        logger.info("Benchs completed. Deleting jobs.")
        oardel(jobs)
        return True
def run(self):
    # Defining experiment parameters
    self.parameters = {
        'n_clients': [400, 450, 500, 550, 600],
        'n_transitions': [10000]
    }
    cluster = 'griffon'
    sweeps = sweep(self.parameters)
    sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"), sweeps)
    server_out_path = os.path.join(self.result_dir, "server.out")

    self._updateStat(sweeper.stats())

    # Loop on the number of nodes
    while True:
        # Taking the next parameter combination
        comb = sweeper.get_next()
        if not comb:
            break

        # Performing the submission on G5K
        site = get_cluster_site(cluster)
        self._log("Output will go to " + self.result_dir)

        n_nodes = int(math.ceil(
            float(comb['n_clients']) /
            EX5.get_host_attributes(cluster + '-1')['architecture']['smt_size'])) + 1
        self._log("Reserving {0} nodes on {1}".format(n_nodes, site))

        resources = "{cluster=\\'" + cluster + "\\'}/nodes=" + str(n_nodes)
        submission = EX5.OarSubmission(resources=resources,
                                       job_type='allow_classic_ssh',
                                       walltime='00:10:00')

        job = EX5.oarsub([(submission, site)])
        self.__class__._job = job

        # Sometimes oarsub fails silently
        if job[0][0] is None:
            print("\nError: no job was created")
            sys.exit(1)

        # Wait for the job to start
        self._log("Waiting for job {0} to start...\n".format(
            BOLD_MAGENTA + str(job[0][0]) + NORMAL))
        EX5.wait_oar_job_start(job[0][0], job[0][1],
                               prediction_callback=prediction)
        nodes = EX5.get_oar_job_nodes(job[0][0], job[0][1])

        # Deploying nodes
        # deployment = EX5.Deployment(hosts=nodes, env_file='path_to_env_file')
        # run_deploy = EX5.deploy(deployment)
        # nodes_deployed = run_deploy.hosts[0]

        # Copying the active_data program on all deployed hosts
        EX.Put([nodes[0]], '../dist/active-data-lib-0.1.2.jar',
               connexion_params={'user': '******'}).run()
        EX.Put([nodes[0]], '../server.policy',
               connexion_params={'user': '******'}).run()

        # Loop on the number of requests per client process
        while True:
            # Split the nodes
            clients = nodes[1:]
            server = nodes[0]

            self._log("Running experiment with {0} nodes and {1} transitions "
                      "per client".format(len(clients), comb['n_transitions']))

            # Launching the server on one node
            out_handler = FileOutputHandler(server_out_path)
            launch_server = EX.Remote('java -jar active-data-lib-0.1.2.jar',
                                      [server],
                                      stdout_handler=out_handler,
                                      stderr_handler=out_handler).start()
            self._log("Server started on " + server.address)
            time.sleep(2)

            # Launching clients
            rank = 0
            n_cores = EX5.get_host_attributes(clients[0])['architecture']['smt_size']
            cores = nodes * n_cores
            cores = cores[0:comb['n_clients']]  # Cut out the additional cores

            client_connection_params = {
                'taktuk_gateway': 'lyon.grid5000.fr',
                'host_rewrite_func': None
            }

            self._log("Launching {0} clients...".format(len(cores)))

            client_cmd = "/usr/bin/env java -cp /home/ansimonet/active-data-lib-0.1.2.jar " \
                         "org.inria.activedata.examples.perf.TransitionsPerSecond " + \
                         "{0} {1} {2} {3} {4}".format(server.address, 1200,
                                                      "{{range(len(cores))}}",
                                                      len(cores),
                                                      comb['n_transitions'])
            client_out_handler = FileOutputHandler(os.path.join(self.result_dir,
                                                                "clients.out"))
            client_request = EX.TaktukRemote(client_cmd, cores,
                                             connexion_params=client_connection_params,
                                             stdout_handler=client_out_handler,
                                             stderr_handler=client_out_handler)
            client_request.run()

            if not client_request.ok():
                # Some client failed, please panic
                self._log("One or more client processes failed. "
                          "Enjoy reading their outputs.")
                self._log("OUTPUT STARTS -------------------------------------------------\n")
                for process in client_request.processes():
                    print("----- {0} returned {1}".format(process.host().address,
                                                          process.exit_code()))
                    if not process.stdout() == "":
                        print(GREEN + process.stdout() + NORMAL)
                    if not process.stderr() == "":
                        print(RED + process.stderr() + NORMAL)
                    print("")
                self._log("OUTPUT ENDS ---------------------------------------------------\n")
                sweeper.skip(comb)
                launch_server.kill()
                launch_server.wait()
            else:
                # Waiting for the server to end
                launch_server.wait()

                # Getting log files
                distant_path = OUT_FILE_FORMAT.format(len(cores), comb['n_transitions'])
                local_path = distant_path
                EX.Get([server], distant_path).run()
                EX.Local('mv ' + distant_path + ' ' +
                         os.path.join(self.result_dir, local_path)).run()
                EX.Get([server], 'client_*.out', local_location=self.result_dir).run()
                EX.Remote('rm -f client_*.out', [server]).run()

                self._log("Finishing experiment with {0} clients and {1} "
                          "transitions per client".format(comb['n_clients'],
                                                          comb['n_transitions']))
                sweeper.done(comb)

            sub_comb = sweeper.get_next(filtr=lambda r: filter(
                lambda s: s["n_clients"] == comb['n_clients'], r))
            self._updateStat(sweeper.stats())

            if not sub_comb:
                # Killing the job
                EX5.oar.oardel(job)
                self.__class__._job = None
                break
            else:
                comb = sub_comb

    print("")
import cmd

import execo
import execo_g5k

# assumption: in the original script, default() is a method of a cmd.Cmd
# subclass named App (only this method was shown in the excerpt)
class App(cmd.Cmd):
    def default(self, line):
        global interrupted, workers, cores
        interrupted = False
        print 'interrupting previous command'
        workers.kill()
        execo.sleep(1)
        print 'sending command: ' + line
        workers = execo.Remote(line, cores).start()

app = App()

if jobid:
    try:
        print 'Waiting for job to start'
        execo_g5k.wait_oar_job_start(jobid, site)
        print 'Retrieving nodes'
        nodes = execo_g5k.get_oar_job_nodes(jobid, site)
        # Setup nodes
        print 'Preparing workers with cmd: ' + setup_cmd
        workers = execo.Remote(setup_cmd, nodes).start()
        workers.expect('Worker Setup Completed')
        workers.kill()
        # Possibly open more than one connection per machine
        cores = nodes * args.nb_cores
        print cores
        print 'Example cmd: %s' % (workers_cmd)
        app.prompt = '%s (%d node(s), %d core(s)/node)> ' % (
            site, args.volunteers, args.nb_cores)
        app.cmdloop()
        # execo.sleep(600)
    finally:
        # assumed continuation: the original excerpt is truncated here, but
        # the reservation should be released on exit
        execo_g5k.oardel([(jobid, site)])
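
# The snippet above relies on globals defined earlier in the original
# script; hypothetical stand-ins (all values are assumptions) so the flow
# is readable:
jobid, site = 1234567, 'lyon'       # assumed pre-existing OAR job
setup_cmd = './setup_worker.sh'     # hypothetical worker setup command
workers_cmd = './run_worker.sh'     # hypothetical worker command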
def run(self):
    """Run the experiment"""
    already_configured = self.options.already_configured
    reservation_job_id = int(self.options.reservation_id) \
        if self.options.reservation_id is not None else None
    is_a_test = self.options.is_a_test

    if is_a_test:
        logger.warn('THIS IS A TEST! This run will use only a few resources')

    # make the result folder writable for all
    os.chmod(self.result_dir, 0o777)

    # Import configuration
    with open(self.args[0]) as config_file:
        config = json.load(config_file)
    # backup configuration
    copy(self.args[0], self.result_dir)

    site = config["grid5000_site"]
    resources = config["resources"]
    nb_experiment_nodes = config["nb_experiment_nodes"]
    walltime = str(config["walltime"])
    env_name = config["kadeploy_env_name"]
    workloads = config["workloads"]

    # check that the workloads exist (suppose that the same NFS mount point
    # is present on the remote and the local environment)
    for workload_file in workloads:
        with open(workload_file):
            pass
        # copy the workload files to the results dir
        copy(workload_file, self.result_dir)

    # define the workloads parameters
    self.parameters = {'workload_filename': workloads}
    logger.info('Workloads: {}'.format(workloads))

    # define the iterator over the parameters combinations
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(self.parameters))

    # Due to a previous run (using -c result_dir), skip some combinations
    logger.info('Skipped parameters: {}'.format(str(self.sweeper.get_skipped())))
    logger.info('Number of parameters combinations {}'.format(
        str(len(self.sweeper.get_remaining()))))
    logger.info('combinations {}'.format(str(self.sweeper.get_remaining())))

    if reservation_job_id is not None:
        jobs = [(reservation_job_id, site)]
    else:
        jobs = oarsub([(OarSubmission(resources=resources,
                                      job_type='deploy',
                                      walltime=walltime), site)])
    job_id, site = jobs[0]
    if job_id:
        try:
            logger.info("waiting job start %s on %s" % (job_id, site))
            wait_oar_job_start(job_id, site,
                               prediction_callback=prediction_callback)
            logger.info("getting nodes of %s on %s" % (job_id, site))
            nodes = get_oar_job_nodes(job_id, site)
            # sort the nodes
            nodes = sorted(nodes, key=lambda node: node.address)
            # get only the necessary nodes under the switch
            if nb_experiment_nodes > len(nodes):
                raise RuntimeError('The number of nodes in the reservation '
                                   '({}) does not match the requested '
                                   'resources ({})'.format(len(nodes),
                                                           nb_experiment_nodes))
            nodes = nodes[:nb_experiment_nodes]
            logger.info("deploying nodes: {}".format(str(nodes)))
            deployed, undeployed = deploy(
                Deployment(nodes, env_name=env_name),
                check_deployed_command=already_configured)
            if undeployed:
                logger.warn("NOT deployed nodes: {}".format(str(undeployed)))
                raise RuntimeError('Deployment failed')

            if not already_configured:
                # install OAR
                install_cmd = "apt-get update; apt-get install -y "
                node_packages = "oar-node"
                logger.info("installing OAR nodes: {}".format(str(nodes[1:])))
                install_oar_nodes = Remote(install_cmd + node_packages,
                                           nodes[1:],
                                           connection_params={'user': '******'})
                install_oar_nodes.start()

                server_packages = ("oar-server oar-server-pgsql oar-user "
                                   "oar-user-pgsql postgresql python3-pip "
                                   "libjson-perl postgresql-server-dev-all")
                install_oar_sched_cmd = """
                mkdir -p /opt/oar_sched; \
                cd /opt/oar_sched; \
                git clone https://github.com/oar-team/oar3.git; \
                cd oar3; \
                git checkout dce942bebc2; \
                pip3 install -e .; \
                cd /usr/lib/oar/schedulers; \
                ln -s /usr/local/bin/kamelot; \
                pip3 install psycopg2
                """
                logger.info("installing OAR server node: {}".format(str(nodes[0])))
                install_master = SshProcess(
                    install_cmd + server_packages + ";" + install_oar_sched_cmd,
                    nodes[0],
                    connection_params={'user': '******'})
                install_master.run()
                install_oar_nodes.wait()

                if not install_master.ok:
                    Report(install_master)

                configure_oar_cmd = """
                sed -i \
                    -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                    -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                    -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                    -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                    -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                    -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                    -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                    -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                    -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                    -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                    -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                    -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                    -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                    -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                    /etc/oar/oar.conf
                """.format(result_dir=self.result_dir)
                configure_oar = Remote(configure_oar_cmd, nodes,
                                       connection_params={'user': '******'})
                configure_oar.run()
                logger.info("OAR is configured on all nodes")

                # Configure the server
                create_db = "oar-database --create --db-is-local"
                config_oar_sched = ("oarnotify --remove-queue default;"
                                    "oarnotify --add-queue default,1,kamelot")
                start_oar = "systemctl start oar-server.service"
                logger.info("configuring OAR database: {}".format(str(nodes[0])))
                config_master = SshProcess(
                    create_db + ";" + config_oar_sched + ";" + start_oar,
                    nodes[0],
                    connection_params={'user': '******'})
                config_master.run()

                # propagate SSH keys
                logger.info("configuring OAR SSH")
                oar_key = "/tmp/.ssh"
                Process('rm -rf ' + oar_key).run()
                Process('scp -o BatchMode=yes -o PasswordAuthentication=no '
                        '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                        '-o ConnectTimeout=20 -rp -o User=root ' +
                        nodes[0].address + ":/var/lib/oar/.ssh " + oar_key).run()
                # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run()
                Put(nodes[1:], [oar_key], "/var/lib/oar/",
                    connection_params={'user': '******'}).run()

                add_resources_cmd = """
                oarproperty -a cpu || true; \
                oarproperty -a core || true; \
                oarproperty -c -a host || true; \
                oarproperty -a mem || true; \
                """
                for node in nodes[1:]:
                    add_resources_cmd = add_resources_cmd + \
                        "oarnodesetting -a -h {node} -p host={node} -p cpu=1 " \
                        "-p core=4 -p cpuset=0 -p mem=16; \\\n".format(node=node.address)

                add_resources = SshProcess(add_resources_cmd, nodes[0],
                                           connection_params={'user': '******'})
                add_resources.run()

                if add_resources.ok:
                    logger.info("oar is now configured!")
                else:
                    raise RuntimeError("error in the OAR configuration: Abort!")

            # TODO: backup the OAR configuration

            # Do the replay
            logger.info('beginning the replay')
            while len(self.sweeper.get_remaining()) > 0:
                combi = self.sweeper.get_next()
                workload_file = os.path.basename(combi['workload_filename'])
                oar_replay = SshProcess(script_path + "/oar_replay.py " +
                                        combi['workload_filename'] + " " +
                                        self.result_dir + " oar_gant_" +
                                        workload_file,
                                        nodes[0])
                oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                  workload_file + '.out')
                logger.info("replaying workload: {}".format(combi))
                oar_replay.run()
                if oar_replay.ok:
                    logger.info("Replay workload OK: {}".format(combi))
                    self.sweeper.done(combi)
                else:
                    logger.info("Replay workload NOT OK: {}".format(combi))
                    self.sweeper.cancel(combi)
                    raise RuntimeError("error in the OAR replay: Abort!")
        except:
            traceback.print_exc()
            ipdb.set_trace()
        finally:
            if is_a_test:
                ipdb.set_trace()
            if reservation_job_id is None:
                logger.info("delete job: {}".format(jobs))
                oardel(jobs)
def run(self):
    sweeper = self.create_paramsweeper()
    while True:
        comb = sweeper.get_next()
        if not comb:
            break
        comb_dir = self.result_dir + '/' + slugify(comb)
        if not os.path.isdir(comb_dir):
            os.mkdir(comb_dir)
        comb_file = comb_dir + '/trace'
        g5k_configuration['kadeploy3'] = comb['version']
        logger.info('Treating combination %s', pformat(comb))
        get_version = SshProcess(
            comb['version'] + ' -v', comb['site'],
            connection_params=default_frontend_connection_params).run()
        logger.info(get_version.stdout)
        resources = ""
        if comb['kavlan']:
            resources += "{type='kavlan'}/vlan=1+"
        resources += "nodes=" + str(comb['n_nodes'])
        sub = OarSubmission(resources=resources,
                            job_type='deploy',
                            walltime="0:30:00",
                            name='Kadeploy_Tests')
        logger.info('Performing submission of %s on site %s',
                    resources, comb['site'])
        jobs = oarsub([(sub, comb['site'])])
        if jobs[0][0]:
            try:
                logger.info('Waiting for job to start')
                wait_oar_job_start(jobs[0][0], jobs[0][1])
                hosts = get_oar_job_nodes(jobs[0][0], jobs[0][1])
                logger.info('Deployment of %s',
                            ' '.join([host.address for host in hosts]))
                kavlan = get_oar_job_kavlan(jobs[0][0], jobs[0][1])
                if kavlan:
                    logger.info('In kavlan %s', kavlan)
                deployment = Deployment(hosts, env_name=comb['env'],
                                        vlan=kavlan)
                deployed, undeployed = deploy(deployment,
                                              stdout_handlers=[comb_file],
                                              stderr_handlers=[comb_file])
            finally:
                logger.info('Destroying job %s on %s',
                            str(jobs[0][0]), jobs[0][1])
                oardel([(jobs[0][0], jobs[0][1])])
        else:
            # submission failed: nothing was deployed
            deployed, undeployed = [], []
        if len(undeployed) == 0:
            logger.info('%s is OK', slugify(comb))
        elif len(deployed) == 0:
            logger.error('%s is KO', slugify(comb))
        else:
            logger.warning('%s encountered problems with some hosts',
                           slugify(comb))
        sweeper.done(comb)
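
# create_paramsweeper is defined elsewhere in the original class; a plausible
# stand-in (all parameter values are assumptions) returning the persistent
# sweeper that run() iterates over:
from execo_engine import ParamSweeper, sweep

def create_paramsweeper(self):
    parameters = {'version': ['kadeploy3'],     # hypothetical values
                  'site': ['rennes'],
                  'n_nodes': [4],
                  'kavlan': [True, False],
                  'env': ['jessie-x64-min']}
    return ParamSweeper(self.result_dir + '/sweeps', sweep(parameters))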
def run(self):
    # Go to the result folder before everything
    os.chdir(self.result_dir)

    # OARSUB
    jobs = oarsub([(OarSubmission(resources='nodes=' + str(_nbrNodes),
                                  job_type='deploy',
                                  walltime=_walltime,
                                  sql_properties=_properties), _site)])
    job_id, site = jobs[0]
    try:
        # KADEPLOY
        logger.info("Waiting job start %s on %s" % (job_id, site))
        wait_oar_job_start(job_id, site,
                           prediction_callback=prediction_callback)
        logger.info("getting nodes of %s on %s" % (job_id, site))
        nodes = get_oar_job_nodes(job_id, site)
        deployed, undeployed = deploy(Deployment(nodes, env_name=env_name),
                                      check_deployed_command=already_configured)
        if undeployed:
            logger.warn("NOT deployed nodes : {}".format(str(undeployed)))
            raise RuntimeError('Deployment failed')

        # STARPU INSTALLATION
        spack_spec = 'chameleon@trunk+starpu+fxt ^starpu@svn-trunk+fxt'
        spack_command = 'spack install -v ' + spack_spec
        logger.info("Starting StarPU installation...")
        spack_process = Process(spack_command).start()
        spack_process.wait()
        logger.info("StarPU installation DONE...")
        self.checkProcess(spack_process)
        spack_process.kill()

        # STARPU DIRECTORY
        logger.info("Searching and going to StarPU installation directory...")
        # 'spack location -i' prints the installation prefix of an installed spec
        starpu_location_process = Process('spack location -i ' + spack_spec).start()
        starpu_location_process.wait()
        self.checkProcess(starpu_location_process)
        starpu_cd_process = Process('cd ' + starpu_location_process.stdout +
                                    '/lib/chameleon').start()
        starpu_cd_process.wait()
        self.checkProcess(starpu_cd_process)
        starpu_location_process.kill()
        starpu_cd_process.kill()

        # RUNNING EXPERIMENT
        logger.info("Starting StarPU experiment...")
        starpu_experiment_process = Process("""
        export STARPU_WORKER_STATS=1
        export STARPU_CALIBRATE=2
        ./timing/time_spotrf_tile --warmup --gpus=3 --threads=9 --nb=960 --ib=96 --n_range=48000:48000:9600
        """)
        # create the output file for StarPU
        starpu_experiment_process.stdout_handlers.append(self.result_dir + '/' + 'StarPU.out')
        starpu_experiment_process.start()
        starpu_experiment_process.wait()
        logger.info("StarPU experiment DONE...")
        self.checkProcess(starpu_experiment_process)
        starpu_experiment_process.kill()
    finally:
        logger.info("Delete job : {}".format(jobs))
        oardel(jobs)
import string
import re

import execo as EX
import execo_g5k as EX5


def host_rewrite_func(host):
    return re.sub("\.grid5000\.fr$", ".g5k", host)

# sites = EX5.get_g5k_sites()
# sites.remove('bordeaux')

EX.logger.setLevel('INFO')

jobs = EX5.get_current_oar_jobs(['reims'])
if len(jobs) == 0:
    jobs = EX5.oarsub([(EX5.OarSubmission(resources="{type=\\'kavlan\\'}/vlan=1+/nodes=2",
                                          walltime="3:00:00",
                                          job_type='deploy'), "reims")])
    EX5.wait_oar_job_start(oar_job_id=jobs[0][0], frontend=jobs[0][1])
print jobs

hosts = EX5.get_oar_job_nodes(jobs[0][0], jobs[0][1])
print hosts
kavlan_id = EX5.get_oar_job_kavlan(jobs[0][0], jobs[0][1])
print kavlan_id

deployment = EX5.Deployment(hosts=hosts,
                            env_file="ubuntu-x64-1204",
                            vlan=kavlan_id)
deployed_hosts, undeployed_hosts = EX5.deploy(deployment)
# deployed_hosts, undeployed_hosts = EX5.deploy(deployment, num_tries=0, check_deployed_command=True)

if kavlan_id is not None:
    hosts = [EX5.get_kavlan_host_name(host, kavlan_id)
             for host in deployed_hosts]
print hosts[0]
def run(self):
    """ """
    if self.options.oargrid_job_id is not None:
        self.oar_job_id = self.options.oargrid_job_id
    else:
        self.oar_job_id = None

    self.list_of_clusters = ['parasilo', 'paravance', 'parapluie', 'paranoia']

    try:
        # Creation of the main iterator used for the first control loop.
        self.define_parameters()
        self.working_dir = '/data/jorouzaudcornabas_' + \
            str(self.options.storage5k_job_id)

        job_is_dead = False
        # While there are combinations to treat
        while len(self.sweeper.get_remaining()) > 0:
            # If no job, we make a reservation and prepare the hosts for
            # the experiments
            if self.oar_job_id is None:
                self.submit_all_available_best_effort(self.list_of_clusters,
                                                      self.options.walltime)
                # self.make_reservation_local()
            # Wait for the job to start
            logger.info('Waiting for job ' + str(self.oar_job_id) +
                        ' to start')
            wait_oar_job_start(self.oar_job_id)
            # Retrieving the hosts and subnets parameters
            self.hosts = get_oar_job_nodes(self.oar_job_id)

            # Hosts deployment and configuration
            default_connection_params['user'] = '******'
            logger.info("Start hosts configuration")
            ex_log.setLevel('INFO')
            # ===============================================================
            # deployment = Deployment(hosts=self.hosts,
            #                         env_file='/home/sirimie/env/mywheezy-x64-base.env')
            # self.hosts, _ = deploy(deployment)
            # ===============================================================
            if len(self.hosts) == 0:
                break

            # Initializing the resources and threads
            available_hosts = self.hosts
            threads = {}

            # Creating the unique folder for storing the results
            comb_dir = self.result_dir + '/logs'
            if not os.path.exists(comb_dir):
                os.mkdir(comb_dir)

            logger.info("Starting the thread " + str(self.is_job_alive()) +
                        " " + str(len(threads.keys())))
            # Checking that the job is running and not in Error
            while self.is_job_alive() or len(threads.keys()) > 0:
                job_is_dead = False
                while self.options.n_nodes > len(available_hosts):
                    tmp_threads = dict(threads)
                    for t in tmp_threads:
                        if not t.is_alive():
                            available_hosts.append(tmp_threads[t]['host'])
                            del threads[t]
                    sleep(5)
                    if not self.is_job_alive():
                        job_is_dead = True
                        break
                if job_is_dead:
                    break

                # Getting the next combination
                comb = self.sweeper.get_next()
                if not comb:
                    while len(threads.keys()) > 0:
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                del threads[t]
                        logger.info('Waiting for threads to complete')
                        sleep(20)
                    break

                host = available_hosts[0]
                available_hosts = available_hosts[1:]

                logger.info("Launching thread")
                t = Thread(target=self.workflow,
                           args=(comb, host, comb_dir))
                threads[t] = {'host': host}
                t.daemon = True
                t.start()

            if not self.is_job_alive():
                job_is_dead = True

            if job_is_dead:
                self.oar_job_id = None
    finally:
        if self.oar_job_id is not None:
            if not self.options.keep_alive:
                logger.info('Deleting job')
                oardel([self.oar_job_id])
            else:
                logger.info('Keeping job alive for debugging')
def run(self):
    rtt_file = self.result_dir + "/rtt.csv"
    resolver = None
    client = 'tcpclient' if self.args.mode == 'tcp' else 'udpclient'
    try:
        logger.debug("Experiment ID: {}".format(self.exp_id))
        if self.multi_site():
            logger.info("Running in multi-site mode")
        if not self.multi_site():
            self.reserve_resources_singlejob()
            logger.debug("Waiting for OAR job to start...")
            g5k.wait_oar_job_start(*self.vmhosts_job)
            self.prepare_subnet()
            logger.debug("Prepared subnet")

        # Dependencies (besides the obvious ones):
        # - deploy_server depends on prepare_global_vlan
        # - prepare_server depends on deploy_server
        # - prepare_server depends on prepare_subnet
        # - prepare_vm depends on deploy_server
        if self.multi_site():
            self.reserve_global_vlan()
            g5k.wait_oar_job_start(*self.globalvlan_job)
            logger.debug("Waiting for global VLAN job to start...")
            self.prepare_global_vlan()

        self.log_experimental_conditions()

        logger.debug("Deploying VM hosts...")
        machines_deploy_process = self.start_deploy_vmhosts()
        logger.debug("Deploying server image...")
        server_deploy_process = self.start_deploy_server()
        machines_deploy_process.wait()
        logger.debug("Finishing deploying VM hosts...")
        self.finish_deploy_vmhosts(machines_deploy_process)
        logger.debug("Setting up VM hosts...")
        machines_setup_process = self.prepare_vmhosts()
        machines_setup_process.wait()
        logger.debug("VM hosts are setup.")
        server_deploy_process.wait()
        logger.debug("Finishing deploying server...")
        self.finish_deploy_server(server_deploy_process)
        logger.debug("Server is deployed.")
        self.vm_process = self.start_all_vm()
        # Ensure VMs are killed when we exit
        with self.vm_process:
            server_setup_process = self.prepare_server()
            self.wait_until_vm_ready()
            vm_setup_process = self.prepare_vm()
            server_setup_process.wait()
            self.log_output(server_setup_process, "server_setup_process")
            if not server_setup_process.ok:
                logger.error("Error while preparing server, please check "
                             "logs for 'server_setup_process'")
                raise Exception
            logger.debug("Prepared server: {}".format(self.server.address))
            vm_setup_process.wait()
            self.log_output(vm_setup_process, "vm_setup_process")
            if not vm_setup_process.ok:
                logger.error("Error while preparing VMs, please check logs "
                             "for 'vm_setup_process'")
                raise Exception
            logger.debug("Prepared VM")
            logger.info("Started {} VMs.".format(len(self.vm)))
            cpunetlog_vms = self.start_cpunetlog(self.vm)
            cpunetlog_server = self.start_cpunetlog([self.server],
                                                    self.server_conn_params)
            resolver = self.start_dns_server()
            logger.info("Started resolver ({}) on {}.".format(
                self.resolver_name, self.server.address))

            # Leave time for the resolver to start
            if self.args.resolver_slots_per_thread < 1000000:
                execo.sleep(15)
            else:
                execo.sleep(60)
            logger.info("Starting {} on all VMs...".format(client))
            clients = self.start_client_vm()
            clients.wait()
            logger.info("{} finished!".format(client))
            logger.info("Writing cpunetlog output to disk.")
            cpunetlog_server.kill().wait()
            cpunetlog_vms.kill().wait()
            self.log_output(cpunetlog_server, "cpunetlog_server")
            self.log_output(cpunetlog_vms, "cpunetlog_vms")

            logger.info("writing {} results to disk.".format(client))
            self.log_output(clients, "clients", log_stdout=False)
            with open(rtt_file, 'w') as rtt_output:
                need_header = True
                rtt = csv.writer(rtt_output)
                for client_id, client in enumerate(clients.processes):
                    first_line = True
                    for line in client.stdout.splitlines():
                        # Skip anything that does not look like CSV
                        if ',' not in line:
                            continue
                        if need_header:
                            # Take the CSV header from the first client and
                            # add a column
                            data = line.split(",")
                            data.insert(0, "vm_id")
                            rtt.writerow(data)
                            need_header = False
                            first_line = False
                        elif first_line:
                            # Skip the first line of subsequent clients
                            first_line = False
                        else:
                            # Add a column with the VM ID
                            data = line.split(",")
                            data.insert(0, client_id)
                            rtt.writerow(data)
    except Exception as e:
        logger.error("Exception raised: {}\n{}".format(e, format_exc()))
    finally:
        # self.kill_all_vm()
        if self.vm_process:
            self.vm_process.kill()
        if resolver:
            resolver.kill()
            logger.debug("Waiting for resolver to exit")
            resolver.wait()
            self.log_output(resolver, "resolver")
        if self.vm_process:
            logger.debug("Waiting for VM to exit")
            self.vm_process.wait()
            logger.info("Resolver and all VMs are shut down")
            self.log_output(self.vm_process, "vm_process")
            print(execo.Report([self.vm_process]).to_string())
            # for s in self.vm_process.processes:
            #     print("\n%s\nstdout:\n%s\nstderr:\n%s\n" % (s, s.stdout, s.stderr))
        g5k.oardel([self.vmhosts_job])
nodes = []
if oargrid_job_id < 0:
    job = oarsub(subs)
    oargrid_job_id = job[0][0]
    ssh_key = job[1]
    if oargrid_job_id < 0:
        print oargrid_job_id
        logger.info("No resources available")
        logger.info("End of program")
        sys.exit(0)

logger.info("Wait for job to start...")
print oargrid_job_id
wait_oar_job_start(oar_job_id=oargrid_job_id)
# wait_oargrid_job_start(oargrid_job_id)
print ssh_key
nodes = get_oar_job_nodes(oargrid_job_id)
# nodes = get_oargrid_job_nodes(oargrid_job_id)
logger.info("Job has started")
print nodes

logger.info("Deployment started")
# logger.setLevel(1)
nodes = deploy(Deployment(hosts=nodes,
                          env_name="wheezy-x64-diet",
                          user="******",
                          other_options='-d -V4'),
               out=True,
               check_deployed_command=True)
deploy_nodes = nodes[0]