def setup(self):
    """Prepare the cluster of hosts for the experiments.

    When kadeploy is enabled, the reserved nodes are deployed with the
    requested environment; otherwise Hadoop is bootstrapped directly on
    the job's nodes.

    Returns:
        bool: True when the hosts are ready to use, False when the
        deployment left no working node.
    """
    self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend)

    if self.use_kadeploy:
        # Success is defined as "at least one node was deployed".
        deployed, _undeployed = self.deploy_nodes()
        return bool(deployed)

    # Direct-bootstrap path: reuse the cluster object when one exists.
    if not self.hc:
        self.hc = HadoopCluster(self.hosts)
    self.hc.bootstrap(self.hadoop_tar_file)
    return True
def setup(self):
    """Set up the cluster of hosts.

    If kadeploy is used, deploy the environment on the reserved nodes
    first; in every successful path, copy the executable jar to all the
    nodes afterwards.

    Returns:
        bool: True when the hosts are ready (jar copied), False when the
        deployment left no usable node.
    """
    self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend)
    if self.use_kadeploy:
        (deployed, undeployed) = self.deploy_nodes()
        if len(deployed) == 0:
            # No node survived the deployment: nothing to copy to.
            return False
    # BUGFIX: the method used to return right after a successful
    # deployment, skipping the jar copy even though the docstring promises
    # "deploy env and then copy the executable jar". The jar must reach
    # the nodes in both the kadeploy and the non-kadeploy paths.
    copy_code = TaktukPut(self.hosts, [self.jar_file], self.remote_dir)
    copy_code.run()
    return True
def run(self):
    """Execute a test suite.

    The execution workflow is as follows:

    1. Parse command-line arguments.
    2. Define the parameters of the tests from the specified
       configuration file. Generate all the combinations to test from
       the given parameters.
    3. Consume the combinations.
       3.1. Set up the cluster if it has not been done (first time or
            after a reservation ends).
       3.2. Load the dataset into the Hadoop cluster.
       3.3. Perform the experiments corresponding to the combinations
            linked to the loaded dataset.
    4. Clean all resources.
    """
    # Get parameters: positional engine args are
    # <cluster> <number of nodes> <configuration file>.
    self.cluster = self.args[0]
    self.n_nodes = int(self.args[1])
    self.config_file = self.args[2]
    self.site = get_cluster_site(self.cluster)
    # Fail fast if the parameters file is missing.
    if not os.path.exists(self.config_file):
        logger.error("Params file " + self.config_file + " does not exist")
        sys.exit(1)
    # Set oar job id: reuse an existing OAR job when one was given on the
    # command line, otherwise a reservation will be made in the loop below.
    if self.options.oar_job_id:
        self.oar_job_id = self.options.oar_job_id
    else:
        self.oar_job_id = None
    # Main
    try:
        # Creation of the main iterator used for the first control loop.
        self.define_parameters()
        job_is_dead = False
        # While there are combinations left to treat.
        while len(self.sweeper.get_remaining()) > 0:
            # SETUP
            # If no live job, make a reservation and prepare the hosts
            # for the experiments; otherwise reuse the existing job's
            # nodes and cluster object.
            if job_is_dead or self.oar_job_id is None:
                self.make_reservation()
                success = self.setup()
                if not success:
                    break
            else:
                self.hosts = get_oar_job_nodes(self.oar_job_id,
                                               self.frontend)
                if not self.hc:
                    self.hc = HadoopCluster(self.hosts)
            # SETUP FINISHED

            # Getting the next combination (which requires a ds
            # deployment); keep the raw copy so _uses_same_ds can compare
            # against it.
            comb = self.sweeper.get_next()
            self.raw_comb = comb.copy()
            self.comb = comb
            self.prepare_dataset(comb)
            self.xp_wrapper(comb)

            # Subloop over the combinations that use the same dataset, so
            # the dataset is loaded only once per group.
            while True:
                # NOTE(review): under Python 3 `filter` returns a lazy
                # iterator, not a list — confirm the sweeper accepts that
                # (this code looks Python 2 era).
                newcomb = self.sweeper.get_next(
                    lambda r: filter(self._uses_same_ds, r))
                if newcomb:
                    self.raw_comb = newcomb.copy()
                    try:
                        self.xp_wrapper(newcomb)
                    # NOTE(review): bare `except:` swallows everything,
                    # including KeyboardInterrupt/SystemExit — consider
                    # narrowing to `except Exception`.
                    except:
                        break
                else:
                    break

            # A job in 'Error' state forces a new reservation on the next
            # iteration of the outer loop.
            if get_oar_job_info(self.oar_job_id,
                                self.frontend)['state'] == 'Error':
                job_is_dead = True
    finally:
        # Always release resources, even when the loop broke or raised.
        if self.oar_job_id is not None:
            if not self.options.keep_alive:
                # NOTE(review): this `pass` is dead code (probably a
                # leftover toggle for skipping the deletion) — safe to
                # remove.
                pass
                logger.info('Deleting job')
                oardel([(self.oar_job_id, self.frontend)])
            else:
                logger.info('Keeping job alive for debugging')

        # Clean cluster (only if it was actually initialized).
        if self.hc:
            if self.hc.initialized:
                self.hc.clean()

        # Close summary files
        if self.summary_file:
            self.summary_file.close()
        if self.ds_summary_file:
            self.ds_summary_file.close()
from execo.log import logger, style from execo_g5k.oar import get_oar_job_nodes from execo_g5k.utils import hosts_list from networkx.algorithms.shortest_paths.generic import shortest_path from execo_g5k.api_utils import get_host_shortname from random import uniform jobs = [(1696863, 'grenoble'), (1502558, 'lille'), (74715, 'luxembourg')] logger.info( 'Retrieving hosts used for jobs %s', ', '.join([ style.host(site) + ':' + style.emph(job_id) for job_id, site in jobs ])) hosts = [ get_host_shortname(h) for job_id, site in jobs for h in get_oar_job_nodes(job_id, site) ] logger.info(hosts_list(hosts)) logger.info('Creating topological graph') g = g5k_graph(elements=hosts) i, j = int(uniform(1, len(hosts))), int(uniform(1, len(hosts))) path = shortest_path(g, hosts[i], hosts[j]) logger.info( 'Communication between %s and %s go through ' 'the following links: \n%s', style.host(hosts[i]), style.host(hosts[j]), ' -> '.join(path)) logger.info('Active links between nodes %s and %s are: \n%s', style.host(path[0]), style.host(path[1]),
def run(self):
    """Inherited method; entry point of the engine.

    Workflow:
      1. Read the engine arguments: <cluster> <number of nodes>
         <configuration file>.
      2. Loop while the sweeper still has combinations: make a
         reservation / set up the hosts when there is no live job, then
         launch one TestThread per host and wait for all of them.
      3. Always release the OAR job (unless --keep-alive was given) and
         close the statistics manager.
    """
    # Get parameters
    self.cluster = self.args[0]
    self.n_nodes = int(self.args[1])
    self.config_file = self.args[2]
    self.site = get_cluster_site(self.cluster)

    # Fail fast if the parameters file is missing.
    if not os.path.exists(self.config_file):
        # Lazy %-style args: the message is built only if emitted.
        logger.error("Params file %s does not exist", self.config_file)
        sys.exit(1)

    # Set oar job id: reuse an existing OAR job when one was given on the
    # command line, otherwise a reservation is made in the loop below.
    if self.options.oar_job_id:
        self.oar_job_id = self.options.oar_job_id
    else:
        self.oar_job_id = None

    # Main
    try:
        # Creation of the main iterator used for the first control loop.
        self.define_parameters()
        job_is_dead = False
        # While there are combinations left to treat.
        while len(self.sweeper.get_remaining()) > 0:
            ## SETUP
            # If no live job, make a reservation and prepare the hosts
            # for the experiments; otherwise reuse the job's nodes.
            if job_is_dead or self.oar_job_id is None:
                self.make_reservation()
                success = self.setup()
                if not success:
                    break
            else:
                self.hosts = get_oar_job_nodes(self.oar_job_id,
                                               self.frontend)
            ## SETUP FINISHED

            logger.info("Setup finished in hosts %s", self.hosts)

            # One worker thread per host; the threads consume
            # combinations from the shared comb_manager and report to the
            # shared stats_manager.
            test_threads = []
            for h in self.hosts:
                t = TestThread(h, self.comb_manager, self.stats_manager)
                test_threads.append(t)
                t.name = "th_" + str(h.address).split(".")[0]
                t.start()

            # Wait for every worker before checking the job state again.
            for t in test_threads:
                t.join()

            # A job in 'Error' state forces a new reservation on the next
            # iteration.
            if get_oar_job_info(self.oar_job_id,
                                self.frontend)['state'] == 'Error':
                job_is_dead = True
    finally:
        # Always release resources, even when the loop broke or raised.
        if self.oar_job_id is not None:
            if not self.options.keep_alive:
                # (Removed a stray dead `pass` statement that preceded
                # the deletion.)
                logger.info('Deleting job')
                oardel([(self.oar_job_id, self.frontend)])
            else:
                logger.info('Keeping job alive for debugging')

        # Close stats
        self.stats_manager.close()
blacklisted = ['talc', 'mbi'] slots = compute_slots(planning, walltime, excluded_elements=blacklisted) wanted = {'grid5000': 1} start_date, end_date, resources = find_first_slot(slots, wanted) for c in filter(lambda x: x in get_g5k_clusters(), resources.keys()): if resources[c] > 1: wanted = {c: 1} break jobs_specs = get_jobs_specs(wanted, name=job_name) for sub, frontend in jobs_specs: sub.walltime = walltime sub.job_type = "deploy" job = oarsub(jobs_specs)[0] nodes = get_oar_job_nodes(job[0], job[1]) logger.info('Deploying host %s', nodes[0].address) deployed, undeployed = deploy(Deployment(nodes, env_name="jessie-x64-base")) execware_host = list(deployed)[0] logger.info('Installing required packages %s', style.emph(packages)) install_packages = SshProcess( 'apt-get update && apt-get install -y ' + packages, execware_host).run() logger.info('Copying files to host') put_files = Put(execware_host, [source_code], remote_location="/tmp").run() xml_file = """ <settings> <proxies> <proxy> <id>g5k-proxy</id>
blacklisted = ['talc', 'mbi'] slots = compute_slots(planning, walltime, excluded_elements=blacklisted) wanted = {'grid5000': 1} start_date, end_date, resources = find_first_slot(slots, wanted) for c in filter(lambda x: x in get_g5k_clusters(), resources.keys()): if resources[c] > 1: wanted = {c: 1} break jobs_specs = get_jobs_specs(wanted, name=job_name) for sub, frontend in jobs_specs: sub.walltime = walltime sub.job_type = "deploy" job = oarsub(jobs_specs)[0] nodes = get_oar_job_nodes(job[0], job[1]) logger.info('Deploying host %s', nodes[0].address) deployed, undeployed = deploy(Deployment(nodes, env_name="jessie-x64-base")) execware_host = list(deployed)[0] logger.info('Installing required packages %s', style.emph(packages)) install_packages = SshProcess('apt-get update && apt-get install -y ' + packages, execware_host).run() logger.info('Copying files to host') put_files = Put(execware_host, [source_code], remote_location="/tmp").run() xml_file = """ <settings> <proxies> <proxy>
#!/usr/bin/env python from execo_g5k.topology import g5k_graph, treemap from execo.log import logger, style from execo_g5k.oar import get_oar_job_nodes from execo_g5k.utils import hosts_list from networkx.algorithms.shortest_paths.generic import shortest_path from execo_g5k.api_utils import get_host_shortname from random import uniform jobs = [(1696863, 'grenoble'), (1502558, 'lille'), (74715, 'luxembourg')] logger.info('Retrieving hosts used for jobs %s', ', '.join([style.host(site) + ':' + style.emph(job_id) for job_id, site in jobs])) hosts = [get_host_shortname(h) for job_id, site in jobs for h in get_oar_job_nodes(job_id, site)] logger.info(hosts_list(hosts)) logger.info('Creating topological graph') g = g5k_graph(elements=hosts) i, j = int(uniform(1, len(hosts))), int(uniform(1, len(hosts))) path = shortest_path(g, hosts[i], hosts[j]) logger.info('Communication between %s and %s go through ' 'the following links: \n%s', style.host(hosts[i]), style.host(hosts[j]), ' -> '.join(path)) logger.info('Active links between nodes %s and %s are: \n%s', style.host(path[0]),