def _enable_bridge(self, name='br0'):
    """We need a bridge to have automatic DHCP configuration for the VM."""
    logger.detail('Configuring the bridge')
    hosts_br = self._get_bridge(self.hosts)
    nobr_hosts = []
    for host, br in hosts_br.iteritems():
        if br is None:
            logger.debug('No bridge on host %s', style.host(host))
            nobr_hosts.append(host)
        elif br != name:
            logger.debug('Wrong bridge on host %s, destroying it',
                         style.host(host))
            SshProcess('ip link set ' + br + ' down ; brctl delbr ' + br,
                       host).run()
            nobr_hosts.append(host)
        else:
            logger.debug('Bridge %s is present on host %s',
                         style.emph(name), style.host(host))

    nobr_hosts = map(lambda x: x.address if isinstance(x, Host) else x,
                     nobr_hosts)
    if len(nobr_hosts) > 0:
        logger.debug('Creating bridge on %s', hosts_list(nobr_hosts))
        # Move the DHCP configuration from the default interface to the bridge
        # in /etc/network/interfaces, then bring the bridge up.
        script = 'export br_if=`ip route |grep default |cut -f 5 -d " "`; \n' + \
            'ifdown $br_if ; \n' + \
            'sed -i "s/$br_if inet dhcp/$br_if inet manual/g" /etc/network/interfaces ; \n' + \
            'sed -i "s/auto $br_if//g" /etc/network/interfaces ; \n' + \
            'echo " " >> /etc/network/interfaces ; \n' + \
            'echo "auto ' + name + '" >> /etc/network/interfaces ; \n' + \
            'echo "iface ' + name + ' inet dhcp" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_ports $br_if" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_stp off" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_maxwait 0" >> /etc/network/interfaces ; \n' + \
            'echo " bridge_fd 0" >> /etc/network/interfaces ; \n' + \
            'ifup ' + name
        fd, br_script = mkstemp(dir='/tmp/', prefix='create_br_')
        f = fdopen(fd, 'w')
        f.write(script)
        f.close()

        self.fact.get_fileput(nobr_hosts, [br_script]).run()
        self.fact.get_remote('nohup sh ' + br_script.split('/')[-1],
                             nobr_hosts).run()

        logger.debug('Waiting for network restart')
        if_up = False
        nmap_tries = 0
        while (not if_up) and nmap_tries < 20:
            sleep(20)
            nmap_tries += 1
            nmap = Process('nmap ' +
                           ' '.join([host for host in nobr_hosts]) +
                           ' -p 22').run()
            for line in nmap.stdout.split('\n'):
                if 'Nmap done' in line:
                    if_up = line.split()[2] == line.split()[5].replace('(', '')
        logger.debug('Network has been restarted')
    logger.detail('All hosts have the bridge %s', style.emph(name))
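# For reference, this is roughly what the generated script appends to
# /etc/network/interfaces on each host, assuming the default route goes through
# eth0 and the default bridge name 'br0' (both values are illustrative; the
# real interface name is detected at run time from `ip route`):
#
#   auto br0
#   iface br0 inet dhcp
#    bridge_ports eth0
#    bridge_stp off
#    bridge_maxwait 0
#    bridge_fd 0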
def default(self, line):
    global interrupted, workers, cores
    interrupted = False
    print 'interrupting previous command'
    workers.kill()
    execo.sleep(1)
    print 'sending command: ' + line
    workers = execo.Remote(line, cores).start()
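# The signature of default(self, line) matches cmd.Cmd.default(), the fallback
# called for any typed line that is not a recognised do_* command. A minimal
# sketch of the surrounding interactive console (class name, prompt and do_quit
# are assumptions, not taken from the original code); the handler above would
# be the `default` method of such a class:
import cmd

class WorkerShell(cmd.Cmd):          # hypothetical class name
    prompt = 'pando> '               # hypothetical prompt

    def do_quit(self, line):
        """Leave the console without touching the running workers."""
        return True

# WorkerShell().cmdloop()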
def workflow(self, comb):
    self.create_par_file(comb)
    job_id = self.submit_job(comb)
    logger.info('Combination %s will be treated by job %s',
                slugify(comb), str(job_id))
    while self.is_job_running(job_id):
        sleep(10)
    self.sweeper.done(comb)
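# The helper is_job_running() used above is not shown in this snippet. One
# possible shape for it, based on execo_g5k.get_oar_job_info (a sketch under
# that assumption, not the original implementation); in practice it would be a
# method of the same engine class:
from execo_g5k import get_oar_job_info

def is_job_running(self, job_id):
    # OAR reports states such as 'Waiting', 'Launching', 'Running',
    # 'Terminated' or 'Error'; treat anything not finished as still running.
    state = get_oar_job_info(job_id).get('state')
    return state not in ('Terminated', 'Error')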
def wait_hosts_up(hosts, timeout=300):
    """Wait until all hosts answer on port 22 (probed with nmap) or the
    timeout expires. Return True if every host came up within the timeout."""
    down_hosts = map(lambda x: x.address if isinstance(x, Host) else x,
                     hosts)
    fd, hosts_file = mkstemp(dir='/tmp/', prefix='hosts_')
    f = fdopen(fd, 'w')
    f.write('\n' + '\n'.join(down_hosts))
    f.close()
    timer = Timer()
    while len(down_hosts) > 0 and timer.elapsed() < timeout:
        nmap = Process("nmap -v -oG - -i %s -p 22 |grep Host|grep Status"
                       % (hosts_file, ),
                       shell=True).run()
        logger.debug('timer: %s \nnmap output: \n%s',
                     timer.elapsed(), nmap.stdout.strip())
        for line in nmap.stdout.strip().split('\n'):
            s = line.split()[2]
            host = s[s.find("(") + 1:s.find(")")]
            if host in down_hosts:
                logger.detail('%s is up', host)
                down_hosts.remove(host)
    Process('rm ' + hosts_file).run()
    sleep(3)
    return len(down_hosts) == 0
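# Hypothetical usage of wait_hosts_up (host names are placeholders): block
# until the rebooted nodes answer on port 22, or give up after 10 minutes.
from execo import Host

nodes = [Host('node-1.example.org'), Host('node-2.example.org')]
if not wait_hosts_up(nodes, timeout=600):
    logger.error('some hosts did not come back up in time')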
def run(self):
    """ """
    if self.options.oargrid_job_id is not None:
        self.oar_job_id = self.options.oargrid_job_id
    else:
        self.oar_job_id = None

    self.list_of_clusters = ['parasilo', 'paravance', 'parapluie',
                             'paranoia']

    try:
        # Creation of the main iterator which is used for the first control loop.
        self.define_parameters()
        self.working_dir = '/data/jorouzaudcornabas_' + \
            str(self.options.storage5k_job_id)

        job_is_dead = False
        # While there are combinations to treat
        while len(self.sweeper.get_remaining()) > 0:
            # If no job, we make a reservation and prepare the hosts for the experiments
            if self.oar_job_id is None:
                self.submit_all_available_best_effort(
                    self.list_of_clusters, self.options.walltime)
                # self.make_reservation_local()
            # Wait for the job to start
            logger.info('Waiting for job ' + str(self.oar_job_id) + ' to start')
            wait_oar_job_start(self.oar_job_id)
            # Retrieving the hosts and subnets parameters
            self.hosts = get_oar_job_nodes(self.oar_job_id)
            # Hosts deployment and configuration
            default_connection_params['user'] = '******'

            logger.info("Start hosts configuration")
            ex_log.setLevel('INFO')
            #===============================================================
            # deployment = Deployment(hosts=self.hosts,
            #     env_file='/home/sirimie/env/mywheezy-x64-base.env')
            # self.hosts, _ = deploy(deployment)
            #===============================================================
            if len(self.hosts) == 0:
                break

            # Initializing the resources and threads
            available_hosts = self.hosts

            threads = {}

            # Creating the unique folder for storing the results
            comb_dir = self.result_dir + '/logs'
            if not os.path.exists(comb_dir):
                os.mkdir(comb_dir)

            logger.info("Starting the thread " + str(self.is_job_alive()) +
                        " " + str(len(threads.keys())))
            # Checking that the job is running and not in Error
            while self.is_job_alive() or len(threads.keys()) > 0:
                job_is_dead = False

                while self.options.n_nodes > len(available_hosts):
                    tmp_threads = dict(threads)
                    for t in tmp_threads:
                        if not t.is_alive():
                            available_hosts.append(tmp_threads[t]['host'])
                            del threads[t]
                    sleep(5)
                    if not self.is_job_alive():
                        job_is_dead = True
                        break
                if job_is_dead:
                    break

                # Getting the next combination
                comb = self.sweeper.get_next()
                if not comb:
                    while len(threads.keys()) > 0:
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                del threads[t]
                        logger.info('Waiting for threads to complete')
                        sleep(20)
                    break

                host = available_hosts[0]
                available_hosts = available_hosts[1:]

                logger.info("Launching thread")
                t = Thread(target=self.workflow,
                           args=(comb, host, comb_dir))
                threads[t] = {'host': host}
                t.daemon = True
                t.start()

            if not self.is_job_alive():
                job_is_dead = True

            if job_is_dead:
                self.oar_job_id = None

    finally:
        if self.oar_job_id is not None:
            if not self.options.keep_alive:
                logger.info('Deleting job')
                oardel([self.oar_job_id])
            else:
                logger.info('Keeping job alive for debugging')
def run(self):
    """ """
    if self.options.oargrid_job_id:
        self.oargrid_job_id = self.options.oargrid_job_id
    else:
        self.oargrid_job_id = None

    try:
        # Creation of the main iterator which is used for the first control loop.
        self.define_parameters()

        job_is_dead = False
        # While there are combinations to treat
        while len(self.sweeper.get_remaining()) > 0:
            # If no job, we make a reservation and prepare the hosts for the experiments
            if self.oargrid_job_id is None:
                self.make_reservation()
            # Wait for the job to start
            logger.info('Waiting for the job to start')
            wait_oargrid_job_start(self.oargrid_job_id)
            # Retrieving the hosts and subnets parameters
            self.hosts = get_oargrid_job_nodes(self.oargrid_job_id)
            # Hosts deployment and configuration
            default_connection_params['user'] = '******'

            logger.info("Start hosts configuration")
            ex_log.setLevel('INFO')
            deployment = Deployment(
                hosts=self.hosts,
                env_file='/home/sirimie/env/mywheezy-x64-base.env')
            self.hosts, _ = deploy(deployment)

            Remote("rm -f /home/Work/sgcbntier/paasage_demo/csv/REQTASK_*",
                   self.hosts).run()
            Remote("rm -f /home/Work/sgcbntier/paasage_demo/platform_aws.xml",
                   self.hosts).run()
            Remote("rm -f /home/Work/sgcbntier/paasage_demo/cloud_ec2.xml",
                   self.hosts).run()

            Put(self.hosts,
                ["run_all_execo.py", "xml_gen_execo.py", "conf.xml",
                 "platform_aws.xml", "cloud_ec2.xml"],
                remote_location="/home/Work/sgcbntier/paasage_demo/").run()
            logger.info("Done")

            if len(self.hosts) == 0:
                break

            # Initializing the resources and threads (one slot per hardware thread)
            available_hosts = [
                host for host in self.hosts
                for i in range(
                    get_host_attributes(host)['architecture']['smt_size'])]

            threads = {}

            # Creating the unique folder for storing the results
            comb_dir = self.result_dir + '/csv_results'
            if not os.path.exists(comb_dir):
                os.mkdir(comb_dir)

            # Checking that the job is running and not in Error
            while self.is_job_alive() or len(threads.keys()) > 0:
                job_is_dead = False

                while self.options.n_nodes > len(available_hosts):
                    tmp_threads = dict(threads)
                    for t in tmp_threads:
                        if not t.is_alive():
                            available_hosts.append(tmp_threads[t]['host'])
                            del threads[t]
                    sleep(5)
                    if not self.is_job_alive():
                        job_is_dead = True
                        break
                if job_is_dead:
                    break

                # Getting the next combination
                comb = self.sweeper.get_next()
                if not comb:
                    while len(threads.keys()) > 0:
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                del threads[t]
                        logger.info('Waiting for threads to complete')
                        sleep(20)
                    break

                host = available_hosts[0]
                available_hosts = available_hosts[1:]

                t = Thread(target=self.workflow,
                           args=(comb, host, comb_dir))
                threads[t] = {'host': host}
                t.daemon = True
                t.start()

            if not self.is_job_alive():
                job_is_dead = True

            if job_is_dead:
                self.oargrid_job_id = None

    finally:
        if self.oargrid_job_id is not None:
            if not self.options.keep_alive:
                logger.info('Deleting job')
                oargriddel([self.oargrid_job_id])
            else:
                logger.info('Keeping job alive for debugging')
def run(self):
    num_total_workers = 0
    # dict: keys = sites, values =
    #   dict: keys = clusters, values =
    #     list: threads
    sites_clusters_threads = {}
    try:
        while True:
            t = Timer()
            clusters_to_submit = set()
            for clusterspec in self.get_clusters():
                cluster, _, site = clusterspec.partition(".")
                if site == "":
                    site = get_cluster_site(cluster)
                clusters_to_submit.add((cluster, site))
            # Drop finished worker threads and empty clusters/sites
            for site in sites_clusters_threads.keys():
                for cluster in sites_clusters_threads[site].keys():
                    sites_clusters_threads[site][cluster] = [
                        th
                        for th in sites_clusters_threads[site][cluster]
                        if th.is_alive()]
                    if len(sites_clusters_threads[site][cluster]) == 0:
                        del sites_clusters_threads[site][cluster]
                if len(sites_clusters_threads[site]) == 0:
                    del sites_clusters_threads[site]
            all_involved_sites = set(sites_clusters_threads.keys())
            all_involved_sites.update([s for (c, s) in clusters_to_submit])
            no_submissions = True
            for site in all_involved_sites:
                all_involved_clusters = set()
                if sites_clusters_threads.has_key(site):
                    all_involved_clusters.update(
                        sites_clusters_threads[site].keys())
                all_involved_clusters.update(
                    [c for (c, s) in clusters_to_submit if s == site])
                for cluster in all_involved_clusters:
                    num_workers = 0
                    num_waiting = 0
                    if sites_clusters_threads.has_key(site) and \
                            sites_clusters_threads[site].has_key(cluster):
                        num_workers = len(sites_clusters_threads[site][cluster])
                        num_waiting = len([
                            th
                            for th in sites_clusters_threads[site][cluster]
                            if th.waiting])
                    num_max_new_workers = min(
                        self.options.max_workers - num_workers,
                        self.options.max_waiting - num_waiting)
                    logger.trace(
                        "rescheduling on cluster %s@%s: num_workers = %s / "
                        "num_waiting = %s / num_max_new_workers = %s"
                        % (cluster, site, num_workers, num_waiting,
                           num_max_new_workers))
                    if num_max_new_workers > 0:
                        for worker_index in range(0, num_max_new_workers):
                            jobdata = self.get_job(cluster)
                            if not jobdata:
                                break
                            no_submissions = False
                            logger.detail(
                                "spawning worker %i on %s@%s"
                                % (num_total_workers, cluster, site))
                            (oarsubmission, data) = jobdata
                            th = Thread(target=self.worker_start,
                                        args=(cluster, site, oarsubmission,
                                              data, num_total_workers,))
                            th.waiting = True
                            th.daemon = True
                            th.oarsublock = Lock()
                            th.willterminate = False
                            th.start()
                            num_total_workers += 1
                            if not sites_clusters_threads.has_key(site):
                                sites_clusters_threads[site] = {}
                            if not sites_clusters_threads[site].has_key(cluster):
                                sites_clusters_threads[site][cluster] = []
                            sites_clusters_threads[site][cluster].append(th)
            if no_submissions and len(sites_clusters_threads) == 0:
                break
            sleep(self.options.schedule_delay)
        logger.detail("no more combinations to explore. exit schedule loop")
    finally:
        for site in sites_clusters_threads.keys():
            for cluster in sites_clusters_threads[site].keys():
                for th in sites_clusters_threads[site][cluster]:
                    with th.oarsublock:
                        th.willterminate = True
                        if th.jobid:
                            logger.detail(
                                "cleaning: delete job %i of worker #%i on %s"
                                % (th.jobid, th.worker_index, site))
                            oardel([(th.jobid, site)])
                            th.jobid = None
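# Shape of the bookkeeping structure maintained by the scheduling loop above
# (site and cluster names are illustrative only):
#
#   sites_clusters_threads = {
#       'rennes': {'paravance': [<worker Thread 0>, <worker Thread 3>]},
#       'nancy':  {'graphene':  [<worker Thread 1>]},
#   }
#
# Each worker thread carries extra attributes set before start(): waiting,
# oarsublock (a Lock serializing job submission/deletion), willterminate, and
# later jobid / worker_index, which the finally block uses to oardel any
# remaining OAR jobs on exit.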
def run(self):
    """The main experimental workflow, as described in
    ``Using the Execo toolkit to perform ...``
    """
    self.force_options()

    # The argument is a cluster
    self.cluster = self.args[0]
    self.frontend = get_cluster_site(self.cluster)
    # Analyzing options
    if self.options.oar_job_id:
        self.oar_job_id = self.options.oar_job_id
    else:
        self.oar_job_id = None

    try:
        # Creation of the main iterator which is used for the first control loop.
        # You need to have a method called define_parameters that returns a list
        # of parameter dicts
        self.create_paramsweeper()

        job_is_dead = False
        # While there are combinations to treat
        while len(self.sweeper.get_remaining()) > 0:
            # If no job, we make a reservation and prepare the hosts for the experiments
            if self.oar_job_id is None:
                self.make_reservation()
            # Retrieving the hosts and subnets parameters
            self.get_resources()
            # Hosts deployment and configuration
            if not self.options.no_hosts_setup:
                self.setup_hosts()
            if len(self.hosts) == 0:
                break

            # Initializing the resources and threads
            available_hosts = list(self.hosts)
            available_ip_mac = list(self.ip_mac)
            threads = {}

            # Checking that the job is running and not in Error
            while self.is_job_alive()['state'] != 'Error' \
                    or len(threads.keys()) > 0:
                # while get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error' \
                #         or len(threads.keys()) > 0:
                job_is_dead = False

                while self.options.n_nodes > len(available_hosts):
                    tmp_threads = dict(threads)
                    for t in tmp_threads:
                        if not t.is_alive():
                            available_hosts.extend(tmp_threads[t]['hosts'])
                            available_ip_mac.extend(tmp_threads[t]['ip_mac'])
                            del threads[t]
                    sleep(5)
                    if self.is_job_alive()['state'] == 'Error':
                        job_is_dead = True
                        break
                if job_is_dead:
                    break

                # Getting the next combination
                comb = self.sweeper.get_next()
                if not comb:
                    while len(threads.keys()) > 0:
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                del threads[t]
                        logger.info('Waiting for threads to complete')
                        sleep(20)
                    break

                used_hosts = available_hosts[0:self.options.n_nodes]
                available_hosts = available_hosts[self.options.n_nodes:]

                n_vm = self.comb_nvm(comb)
                used_ip_mac = available_ip_mac[0:n_vm]
                available_ip_mac = available_ip_mac[n_vm:]

                t = Thread(target=self.workflow,
                           args=(comb, used_hosts, used_ip_mac))
                threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac}
                logger.debug('Threads: %s', len(threads))
                t.daemon = True
                t.start()

            # if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error':
            if self.is_job_alive()['state'] == 'Error':
                job_is_dead = True

            if job_is_dead:
                self.oar_job_id = None

    finally:
        if self.oar_job_id is not None:
            if not self.options.keep_alive:
                logger.info('Deleting job')
                oardel([(self.oar_job_id, self.frontend)])
            else:
                logger.info('Keeping job alive for debugging')
def check_hosts_up(hosts, timeout=None, connection_params=None,
                   polling_interval=5):
    """Check that a list of hosts are joinable with ssh.

    Checks that all hosts of the list are joinable with ssh. Retries
    continuously to connect to them every <polling_interval> seconds,
    until either all are reachable or the timeout is reached. Returns
    the list of hosts which are joinable.

    :param hosts: list of hosts

    :param timeout: timeout of the checks. No timeout if None.

    :param connection_params: to connect to the hosts. Note that the
      ssh_options entry of the connection_params is overwritten by this
      function.

    :param polling_interval: tries to connect each <polling_interval>
      seconds.

    :returns: list of joinable hosts
    """
    start_ts = time.time()
    if timeout is not None:
        completion_ts = start_ts + timeout
    remaining_hosts = set(hosts)
    if connection_params is not None:
        real_connection_params = connection_params
    else:
        real_connection_params = {}
    while len(remaining_hosts) > 0 and (timeout is None
                                        or time.time() <= completion_ts):
        #print('remaining_hosts=%s' % (remaining_hosts,))
        if timeout is not None:
            next_poll_ts = min(time.time() + polling_interval, completion_ts)
        else:
            next_poll_ts = time.time() + polling_interval
        poll_timeout = max(0, next_poll_ts - time.time())
        real_connection_params.update({
            'ssh_options': ('-tt',
                            '-o', 'BatchMode=yes',
                            '-o', 'PasswordAuthentication=no',
                            '-o', 'StrictHostKeyChecking=no',
                            '-o', 'UserKnownHostsFile=/dev/null',
                            '-o', 'ConnectTimeout=%s' % (int(poll_timeout),))})
        check = execo.Remote('true',
                             remaining_hosts,
                             connection_params=real_connection_params,
                             process_args={'timeout': poll_timeout,
                                           'nolog_exit_code': True,
                                           'nolog_timeout': True}).run()
        hosts_up = [p.host for p in check.processes if p.finished_ok]
        #print('hosts_up=%s' % (hosts_up,))
        remaining_hosts = remaining_hosts.difference(hosts_up)
        if len(remaining_hosts) > 0:
            execo.sleep(max(0, next_poll_ts - time.time()))
    return list(set(hosts).difference(remaining_hosts))
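# Hypothetical usage of check_hosts_up (host names are placeholders): wait up
# to 5 minutes for the nodes to accept ssh connections, polling every 10 s.
reachable = check_hosts_up(['node-1.example.org', 'node-2.example.org'],
                           timeout=300, polling_interval=10)
if len(reachable) == 0:
    print('no host reachable yet')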
def run(self):
    rtt_file = self.result_dir + "/rtt.csv"
    resolver = None
    client = 'tcpclient' if self.args.mode == 'tcp' else 'udpclient'
    try:
        logger.debug("Experiment ID: {}".format(self.exp_id))
        if self.multi_site():
            logger.info("Running in multi-site mode")
        if not self.multi_site():
            self.reserve_resources_singlejob()
            logger.debug("Waiting for OAR job to start...")
            g5k.wait_oar_job_start(*self.vmhosts_job)
        self.prepare_subnet()
        logger.debug("Prepared subnet")

        # Dependencies (besides the obvious ones):
        # - deploy_server depends on prepare_global_vlan
        # - prepare_server depends on deploy_server
        # - prepare_server depends on prepare_subnet
        # - prepare_vm depends on deploy_server
        if self.multi_site():
            self.reserve_global_vlan()
            g5k.wait_oar_job_start(*self.globalvlan_job)
            logger.debug("Waiting for global VLAN job to start...")
            self.prepare_global_vlan()
        self.log_experimental_conditions()

        logger.debug("Deploying VM hosts...")
        machines_deploy_process = self.start_deploy_vmhosts()
        logger.debug("Deploying server image...")
        server_deploy_process = self.start_deploy_server()
        machines_deploy_process.wait()
        logger.debug("Finishing deploying VM hosts...")
        self.finish_deploy_vmhosts(machines_deploy_process)
        logger.debug("Setting up VM hosts...")
        machines_setup_process = self.prepare_vmhosts()
        machines_setup_process.wait()
        logger.debug("VM hosts are setup.")
        server_deploy_process.wait()
        logger.debug("Finishing deploying server...")
        self.finish_deploy_server(server_deploy_process)
        logger.debug("Server is deployed.")
        self.vm_process = self.start_all_vm()
        # Ensure VM are killed when we exit
        with self.vm_process:
            server_setup_process = self.prepare_server()
            self.wait_until_vm_ready()
            vm_setup_process = self.prepare_vm()
            server_setup_process.wait()
            self.log_output(server_setup_process, "server_setup_process")
            if not server_setup_process.ok:
                logger.error(
                    "Error while preparing server, please check logs for "
                    "'server_setup_process'")
                raise Exception
            logger.debug("Prepared server: {}".format(self.server.address))
            vm_setup_process.wait()
            self.log_output(vm_setup_process, "vm_setup_process")
            if not vm_setup_process.ok:
                logger.error(
                    "Error while preparing VMs, please check logs for "
                    "'vm_setup_process'")
                raise Exception
            logger.debug("Prepared VM")
            logger.info("Started {} VMs.".format(len(self.vm)))
            cpunetlog_vms = self.start_cpunetlog(self.vm)
            cpunetlog_server = self.start_cpunetlog([self.server],
                                                    self.server_conn_params)
            resolver = self.start_dns_server()
            logger.info("Started resolver ({}) on {}.".format(
                self.resolver_name, self.server.address))
            # Leave time for resolver to start
            if self.args.resolver_slots_per_thread < 1000000:
                execo.sleep(15)
            else:
                execo.sleep(60)
            logger.info("Starting {} on all VMs...".format(client))
            clients = self.start_client_vm()
            clients.wait()
            logger.info("{} finished!".format(client))
            logger.info("Writing cpunetlog output to disk.")
            cpunetlog_server.kill().wait()
            cpunetlog_vms.kill().wait()
            self.log_output(cpunetlog_server, "cpunetlog_server")
            self.log_output(cpunetlog_vms, "cpunetlog_vms")

            logger.info("writing {} results to disk.".format(client))
            self.log_output(clients, "clients", log_stdout=False)
            with open(rtt_file, 'w') as rtt_output:
                need_header = True
                rtt = csv.writer(rtt_output)
                for client_id, client in enumerate(clients.processes):
                    first_line = True
                    for line in iter(client.stdout.splitlines()):
                        # Skip anything that does not look like CSV
                        if ',' not in line:
                            continue
                        if need_header:
                            # Take CSV header from first client and add a column
                            data = line.split(",")
                            data.insert(0, "vm_id")
                            rtt.writerow(data)
                            need_header = False
                            first_line = False
                        elif first_line:
                            # Skip first line of subsequent clients
                            first_line = False
                        else:
                            # Add column with VM ID
                            data = line.split(",")
                            data.insert(0, client_id)
                            rtt.writerow(data)

    except Exception as e:
        logger.error("Exception raised: {}\n{}".format(e, format_exc()))
    finally:
        #self.kill_all_vm()
        if self.vm_process:
            self.vm_process.kill()
        if resolver:
            resolver.kill()
            logger.debug("Waiting for resolver to exit")
            resolver.wait()
            self.log_output(resolver, "resolver")
        if self.vm_process:
            logger.debug("Waiting for VM to exit")
            self.vm_process.wait()
            logger.info("Resolver and all VMs are shut down")
            self.log_output(self.vm_process, "vm_process")
            print(execo.Report([self.vm_process]).to_string())
            #for s in self.vm_process.processes:
            #    print("\n%s\nstdout:\n%s\nstderr:\n%s\n" % (s, s.stdout, s.stderr))
        g5k.oardel([self.vmhosts_job])
def run(self):
    """The main experimental workflow, as described in
    ``Using the Execo toolkit to perform ...``
    """
    self.force_options()

    # The argument is a cluster
    self.cluster = self.args[0]
    self.frontend = get_cluster_site(self.cluster)
    # Analyzing options
    if self.options.oar_job_id:
        self.oar_job_id = self.options.oar_job_id
    else:
        self.oar_job_id = None

    try:
        # Creation of the main iterator which is used for the first control loop.
        # You need to have a method called define_parameters that returns a list
        # of parameter dicts
        self.create_paramsweeper()

        job_is_dead = False
        # While there are combinations to treat
        while len(self.sweeper.get_remaining()) > 0:
            # If no job, we make a reservation and prepare the hosts for the experiments
            if self.oar_job_id is None:
                self.make_reservation()
            # Retrieving the hosts and subnets parameters
            self.get_resources()
            # Hosts deployment and configuration
            if not self.options.no_hosts_setup:
                self.setup_hosts()
            if len(self.hosts) == 0:
                break

            # Initializing the resources and threads
            available_hosts = list(self.hosts)
            available_ip_mac = list(self.ip_mac)
            threads = {}

            # Checking that the job is running and not in Error
            while get_oar_job_info(self.oar_job_id,
                                   self.frontend)['state'] != 'Error' \
                    or len(threads.keys()) > 0:
                job_is_dead = False

                while self.options.n_nodes > len(available_hosts):
                    tmp_threads = dict(threads)
                    for t in tmp_threads:
                        if not t.is_alive():
                            available_hosts.extend(tmp_threads[t]['hosts'])
                            available_ip_mac.extend(tmp_threads[t]['ip_mac'])
                            del threads[t]
                    sleep(5)
                    if get_oar_job_info(self.oar_job_id,
                                        self.frontend)['state'] == 'Error':
                        job_is_dead = True
                        break
                if job_is_dead:
                    break

                # Getting the next combination
                comb = self.sweeper.get_next()
                if not comb:
                    while len(threads.keys()) > 0:
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                del threads[t]
                        logger.info('Waiting for threads to complete')
                        sleep(20)
                    break

                used_hosts = available_hosts[0:self.options.n_nodes]
                available_hosts = available_hosts[self.options.n_nodes:]

                n_vm = self.comb_nvm(comb)
                used_ip_mac = available_ip_mac[0:n_vm]
                available_ip_mac = available_ip_mac[n_vm:]

                t = Thread(target=self.workflow,
                           args=(comb, used_hosts, used_ip_mac))
                threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac}
                logger.debug('Threads: %s', len(threads))
                t.daemon = True
                t.start()

            if get_oar_job_info(self.oar_job_id,
                                self.frontend)['state'] == 'Error':
                job_is_dead = True

            if job_is_dead:
                self.oar_job_id = None

    finally:
        if self.oar_job_id is not None:
            if not self.options.keep_alive:
                logger.info('Deleting job')
                oardel([(self.oar_job_id, self.frontend)])
            else:
                logger.info('Keeping job alive for debugging')
worker_cmd = 'node pando.js/test/volunteer.js %s'
params = execo_g5k.default_oarsh_oarcp_params

if jobid:
    try:
        print 'Waiting for job to start'
        execo_g5k.wait_oar_job_start(jobid, site)
        print 'Retrieving nodes'
        nodes = execo_g5k.get_oar_job_nodes(jobid, site)
        # Open one connection per core (there are 8 cores per node in grenoble)
        cores = nodes * 8
        if (len(cores) >= 2):
            print 'Starting server'
            server = execo.TaktukRemote(server_cmd, cores[0])
            with server.start():
                execo.sleep(0.5)
                (h, i, m) = server.expect(r'^(\/ip4\/172.*)')[0]
                multiaddr = m.group()
                print 'Starting workers with cmd: ' + worker_cmd % (multiaddr)
                workers = execo.TaktukRemote(worker_cmd % (multiaddr),
                                             cores[1:]).start()
                workers.expect('Node ready')
                print 'Workers ready'
                start_time = time.time()
                print 'Started processing'
                server.expect('done')
                stop_time = time.time()
                print 'Processing done in %fs' % (stop_time - start_time)
                print execo.Report([server, workers]).to_string()
                for index, p in enumerate(server.processes):
                    with open('server-out.log', 'w') as f: