from time import sleep

from execo_g5k import get_oar_job_info


def is_job_alive(oar_job_ids):
    """Check whether the given OAR jobs are still alive on the Grid'5000 system.

    Parameters
    ----------
    oar_job_ids: dict
        a dictionary that describes the reservations
        key: str, the name of a site on the Grid'5000 system
        value: int, the number of the reservation on that site

    Returns
    -------
    bool
        True if all the given OAR jobs are still alive,
        False if at least one of them is dead
    """
    for site, oar_job_id in oar_job_ids.items():
        job_info = get_oar_job_info(oar_job_id, site)
        # Retry until the job information contains a state
        while 'state' not in job_info:
            sleep(5)
            job_info = get_oar_job_info(oar_job_id, site)
        if job_info['state'] == 'Error':
            return False
    return True
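A minimal usage sketch for the helper above; the site names and job numbers are hypothetical placeholders:

# Hypothetical reservations: one OAR job number per Grid'5000 site
reservations = {'rennes': 1234567, 'nancy': 7654321}

if not is_job_alive(reservations):
    print('At least one reservation is dead or in Error state')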
def is_job_alive(self):
    rez = get_oar_job_info(self.oar_job_id)
    # Retry until both the start date and the state are available
    # (the original condition used 'and', which could exit the loop
    # with one of the two keys still missing)
    while 'start_date' not in rez or 'state' not in rez:
        rez = get_oar_job_info(self.oar_job_id)
    if rez['state'] == 'Error':
        return False
    # The job is alive as long as its walltime has not elapsed
    return rez['start_date'] + rez['walltime'] > time.time()
def get_host(self):
    """Return the first host from an existing reservation (if any),
    or from a new reservation."""
    # Look for a running job with the expected name
    self.site = get_cluster_site(self.config['cluster'])
    jobs = EX5.get_current_oar_jobs([self.site])
    self.job_id = None
    for t in jobs:
        if EX5.get_oar_job_info(t[0],
                                self.site)['name'] == self.options.job_name:
            self.job_id = t[0]
            break
    if self.job_id:
        logger.info('Using job %s' % style.emph(self.job_id))
    else:
        logger.info('Making a new reservation')
        self._make_reservation(self.site)
    if not self.job_id:
        logger.error("Could not get a reservation for the job")
        exit(6)
    EX5.wait_oar_job_start(self.job_id, self.site)
    # Fetch the node list once instead of querying OAR twice
    nodes = EX5.get_oar_job_nodes(self.job_id, self.site)
    pp(nodes)
    return nodes[0]
def run(self): """ Main engine method to perform the experiment """ self.define_parameters() while len(self.sweeper.get_remaining()) > 0: # Getting the next combination comb = self.sweeper.get_next() logger.info(style.host(slugify(comb)) + ' has been started') self.get_nodes(comb) # If the job is broken, the program is stopped if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': break try: self.workflow(comb) # Process all combinations that can use the same submission while True: # Find the next combination combinations that can use the same submission subcomb = self.sweeper.get_next(lambda r: filter( lambda x: x['cores'] == comb['cores'] and x['cluster'] == comb['cluster'], r)) if not subcomb: logger.info( 'No more combination for cluster=%s and cores=%s', comb['cluster'], comb['cores']) break else: logger.info( style.host(slugify(subcomb)) + ' has been started') if get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error': self.workflow(subcomb) else: break # Whatever happens (errors, end of loop), the job is deleted finally: logger.info('Deleting job...') oardel([(self.oar_job_id, self.frontend)])
def run(self): """ Main engine method to perform the experiment """ self.define_parameters() while len(self.sweeper.get_remaining()) > 0: # Getting the next combination comb = self.sweeper.get_next() logger.info(style.host(slugify(comb)) + ' has been started') self.get_nodes(comb) # If the job is broken, the program is stopped if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': break try: self.workflow(comb) # Process all combinations that can use the same submission while True: # Find the next combination combinations that can use the same submission subcomb = self.sweeper.get_next(lambda r: filter(lambda x: x['cores'] == comb['cores'] and x['cluster'] == comb['cluster'], r)) if not subcomb: logger.info('No more combination for cluster=%s and cores=%s', comb['cluster'], comb['cores']) break else: logger.info(style.host(slugify(subcomb)) + ' has been started') if get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error': self.workflow(subcomb) else: break # Whatever happens (errors, end of loop), the job is deleted finally: logger.info('Deleting job...') oardel([(self.oar_job_id, self.frontend)])
from execo.log import style, logger
from execo_g5k import (get_g5k_sites, get_current_oargrid_jobs,
                       get_oargrid_job_oar_jobs, get_oar_job_info,
                       get_current_oar_jobs)


def get_job_by_name(job_name, sites=None):
    """Return the id and site of the running job whose name matches job_name,
    looking first at OARGrid jobs and then at OAR jobs on the given sites."""
    logger.detail('Looking for a job named %s', style.emph(job_name))
    if not sites:
        sites = get_g5k_sites()
    oargrid_jobs = get_current_oargrid_jobs()
    if len(oargrid_jobs) > 0:
        for g_job in oargrid_jobs:
            for job in get_oargrid_job_oar_jobs(g_job):
                info = get_oar_job_info(job[0], job[1])
                if info['name'] == job_name:
                    logger.info('Oargrid job %s found!', style.emph(g_job))
                    return g_job, None
    running_jobs = get_current_oar_jobs(sites)
    for job in running_jobs:
        info = get_oar_job_info(job[0], job[1])
        if info['name'] == job_name:
            logger.info('Job %s found on site %s!',
                        style.emph(job[0]), style.host(job[1]))
            return job
    return None, None
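A sketch of how this lookup might be called; the job name is a hypothetical placeholder. The three return shapes are (oargrid_job_id, None), (oar_job_id, site) and (None, None):

# 'my_experiment' is a hypothetical job name
job_id, site = get_job_by_name('my_experiment')
if job_id is None:
    print('No running job named my_experiment was found')
elif site is None:
    print('Found oargrid job %s' % job_id)
else:
    print('Found job %s on site %s' % (job_id, site))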
def run(self): """Inherited method, put here the code for running the engine""" self.define_parameters() self.cluster = self.args[0] self.site = get_cluster_site(self.cluster) if self.options.oar_job_id: self.oar_job_id = self.options.oar_job_id else: self.oar_job_id = None try: # Creation of the main iterator which is used for the first control loop. # You need have a method called define_parameters, that returns a list of parameter dicts self.define_parameters() job_is_dead = False # While they are combinations to treat while len(self.sweeper.get_remaining()) > 0: # If no job, we make a reservation and prepare the hosts for the experiments if job_is_dead or self.oar_job_id is None: self.make_reservation() # Retrieving the hosts and subnets parameters self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend) # Hosts deployment deployed, undeployed = deploy( Deployment(self.hosts, env_file="/home/mliroz/deploys/hadoop6.env")) logger.info("%i deployed, %i undeployed" % (len(deployed), len(undeployed))) if len(deployed) == 0: break # Configuration du systeme => look at the execo_g5k.topology module attr = get_host_attributes(self.cluster + '-1') ## SETUP FINISHED # Getting the next combination comb = self.sweeper.get_next() self.prepare_dataset(comb) self.xp(comb) # subloop over the combinations that have the same sizes while True: newcomb = self.sweeper.get_next(lambda r: filter( lambda subcomb: subcomb['sizes'] == comb['sizes'], r)) if newcomb: try: self.xp(newcomb) except: break else: break if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': job_is_dead = True finally: if self.oar_job_id is not None: if not self.options.keep_alive: logger.info('Deleting job') oardel([(self.oar_job_id, self.frontend)]) else: logger.info('Keeping job alive for debugging')
import execo_g5k as EX5


def get_job_info(site, job_id):
    """Return the first node of the given OAR job on the given site."""
    return EX5.get_oar_job_nodes(job_id, site)[0]
def run(self): """Inherited method, put here the code for running the engine""" self.define_parameters() self.cluster = self.args[0] self.site = get_cluster_site(self.cluster) if self.options.oar_job_id: self.oar_job_id = self.options.oar_job_id else: self.oar_job_id = None try: # Creation of the main iterator which is used for the first control loop. # You need have a method called define_parameters, that returns a list of parameter dicts self.define_parameters() job_is_dead = False # While they are combinations to treat while len(self.sweeper.get_remaining()) > 0: # If no job, we make a reservation and prepare the hosts for the experiments if job_is_dead or self.oar_job_id is None: self.make_reservation() # Retrieving the hosts and subnets parameters self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend) # Hosts deployment deployed, undeployed = deploy(Deployment(self.hosts, env_file="/home/mliroz/deploys/hadoop6.env")) logger.info("%i deployed, %i undeployed" % (len(deployed), len(undeployed))) if len(deployed) == 0: break # Configuration du systeme => look at the execo_g5k.topology module attr = get_host_attributes(self.cluster + '-1') ## SETUP FINISHED # Getting the next combination comb = self.sweeper.get_next() self.prepare_dataset(comb) self.xp(comb) # subloop over the combinations that have the same sizes while True: newcomb = self.sweeper.get_next(lambda r: filter(lambda subcomb: subcomb['sizes'] == comb['sizes'], r)) if newcomb: try: self.xp(newcomb) except: break else: break if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': job_is_dead = True finally: if self.oar_job_id is not None: if not self.options.keep_alive: logger.info('Deleting job') oardel([(self.oar_job_id, self.frontend)]) else: logger.info('Keeping job alive for debugging')
def is_job_alive(self):
    rez = get_oar_job_info(self.oar_job_id)
    # The job is alive as long as its walltime has not elapsed
    return rez["start_date"] + rez["walltime"] > time.time()
def is_job_alive(self):
    rez = get_oar_job_info(self.oar_job_id, self.frontend)
    # Retry until the job information contains a state
    while 'state' not in rez:
        logger.info('Retrying getting oar_job_info')
        rez = get_oar_job_info(self.oar_job_id, self.frontend)
    # Return a boolean, as the function name promises, instead of the raw dict
    return rez['state'] != 'Error'
def run(self): """The main experimental workflow, as described in ``Using the Execo toolkit to perform ...`` """ self.force_options() # The argument is a cluster self.cluster = self.args[0] self.frontend = get_cluster_site(self.cluster) # Analyzing options if self.options.oar_job_id: self.oar_job_id = self.options.oar_job_id else: self.oar_job_id = None try: # Creation of the main iterator which is used for the first control loop. # You need have a method called define_parameters, that returns a list of parameter dicts self.create_paramsweeper() job_is_dead = False # While they are combinations to treat while len(self.sweeper.get_remaining()) > 0: # If no job, we make a reservation and prepare the hosts for the experiments if self.oar_job_id is None: self.make_reservation() # Retrieving the hosts and subnets parameters self.get_resources() # Hosts deployment and configuration if not self.options.no_hosts_setup: self.setup_hosts() if len(self.hosts) == 0: break # Initializing the resources and threads available_hosts = list(self.hosts) available_ip_mac = list(self.ip_mac) threads = {} # Checking that the job is running and not in Error while get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error' \ or len(threads.keys()) > 0: job_is_dead = False while self.options.n_nodes > len(available_hosts): tmp_threads = dict(threads) for t in tmp_threads: if not t.is_alive(): available_hosts.extend(tmp_threads[t]['hosts']) available_ip_mac.extend(tmp_threads[t]['ip_mac']) del threads[t] sleep(5) if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': job_is_dead = True break if job_is_dead: break # Getting the next combination comb = self.sweeper.get_next() if not comb: while len(threads.keys()) > 0: tmp_threads = dict(threads) for t in tmp_threads: if not t.is_alive(): del threads[t] logger.info('Waiting for threads to complete') sleep(20) break used_hosts = available_hosts[0:self.options.n_nodes] available_hosts = available_hosts[self.options.n_nodes:] n_vm = self.comb_nvm(comb) used_ip_mac = available_ip_mac[0:n_vm] available_ip_mac = available_ip_mac[n_vm:] t = Thread(target=self.workflow, args=(comb, used_hosts, used_ip_mac)) threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac} logger.debug('Threads: %s', len(threads)) t.daemon = True t.start() if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': job_is_dead = True if job_is_dead: self.oar_job_id = None finally: if self.oar_job_id is not None: if not self.options.keep_alive: logger.info('Deleting job') oardel([(self.oar_job_id, self.frontend)]) else: logger.info('Keeping job alive for debugging')
def run(self): """The main experimental workflow, as described in ``Using the Execo toolkit to perform ...`` """ self.force_options() # The argument is a cluster self.cluster = self.args[0] self.frontend = get_cluster_site(self.cluster) # Analyzing options if self.options.oar_job_id: self.oar_job_id = self.options.oar_job_id else: self.oar_job_id = None try: # Creation of the main iterator which is used for the first control loop. # You need have a method called define_parameters, that returns a list of parameter dicts self.create_paramsweeper() job_is_dead = False # While they are combinations to treat while len(self.sweeper.get_remaining()) > 0: # If no job, we make a reservation and prepare the hosts for the experiments if self.oar_job_id is None: self.make_reservation() # Retrieving the hosts and subnets parameters self.get_resources() # Hosts deployment and configuration if not self.options.no_hosts_setup: self.setup_hosts() if len(self.hosts) == 0: break # Initializing the resources and threads available_hosts = list(self.hosts) available_ip_mac = list(self.ip_mac) threads = {} # Checking that the job is running and not in Error while get_oar_job_info(self.oar_job_id, self.frontend)['state'] != 'Error' \ or len(threads.keys()) > 0: job_is_dead = False while self.options.n_nodes > len(available_hosts): tmp_threads = dict(threads) for t in tmp_threads: if not t.is_alive(): available_hosts.extend(tmp_threads[t]['hosts']) available_ip_mac.extend( tmp_threads[t]['ip_mac']) del threads[t] sleep(5) if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': job_is_dead = True break if job_is_dead: break # Getting the next combination comb = self.sweeper.get_next() if not comb: while len(threads.keys()) > 0: tmp_threads = dict(threads) for t in tmp_threads: if not t.is_alive(): del threads[t] logger.info('Waiting for threads to complete') sleep(20) break used_hosts = available_hosts[0:self.options.n_nodes] available_hosts = available_hosts[self.options.n_nodes:] n_vm = self.comb_nvm(comb) used_ip_mac = available_ip_mac[0:n_vm] available_ip_mac = available_ip_mac[n_vm:] t = Thread(target=self.workflow, args=(comb, used_hosts, used_ip_mac)) threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac} logger.debug('Threads: %s', len(threads)) t.daemon = True t.start() if get_oar_job_info(self.oar_job_id, self.frontend)['state'] == 'Error': job_is_dead = True if job_is_dead: self.oar_job_id = None finally: if self.oar_job_id is not None: if not self.options.keep_alive: logger.info('Deleting job') oardel([(self.oar_job_id, self.frontend)]) else: logger.info('Keeping job alive for debugging')