def _exec_on_node(self, command, machine, log):
    """Run ``command`` on ``machine`` and log whether it succeeded.

    Args:
        command: Command handed to ``ex.action.Remote``.
        machine: Target host(s) handed to ``ex.action.Remote``.
        log: Message logged at info level before the command is launched.
    """
    logger.info(log)

    connection = {'user': '******'}
    remote = ex.action.Remote(command, machine, connection_params=connection)
    outcome = remote.run()

    # Report the failure first so the success path is the fall-through.
    if not outcome.ok:
        logger.error("Failure")
        return
    logger.info("Success")
def _initialize_conf(self):
    """Merge locally-specified configuration files with default files from
    the distribution.

    Every file found in ``self.local_base_conf_dir`` is copied into
    ``self.init_conf_dir``. The files listed in ``self.conf_mandatory_files``
    that were not provided locally are then fetched from ``self.conf_dir``
    on ``self.master`` into the same directory.
    """

    if os.path.exists(self.local_base_conf_dir):
        base_conf_files = [os.path.join(self.local_base_conf_dir, f)
                           for f in os.listdir(self.local_base_conf_dir)]
        for f in base_conf_files:
            shutil.copy(f, self.init_conf_dir)
    else:
        logger.warn(
            "Local conf dir does not exist. Using default configuration")
        base_conf_files = []

    # Build a new list instead of removing entries from
    # self.conf_mandatory_files: the attribute must keep listing every
    # mandatory file after this method has run.
    local_names = set(os.path.basename(f) for f in base_conf_files)
    missing_conf_files = [f for f in self.conf_mandatory_files
                          if f not in local_names]

    logger.info("Copying missing conf files from master: " + str(
        missing_conf_files))

    remote_missing_files = [os.path.join(self.conf_dir, f)
                            for f in missing_conf_files]

    action = Get([self.master], remote_missing_files, self.init_conf_dir)
    action.run()
def serialize_cluster(cluster_type, cid, cluster_object):
    """Serialize the cluster object. Replace also the linked Hadoop cluster if
    it exists.

    Args:
      cluster_type (str):
        The type of cluster to serialize.
      cid (int):
        The id of the cluster.
      cluster_object:
        The cluster to serialize. When a link file exists for it, its ``hc``
        attribute is serialized as the linked Hadoop cluster.
    """

    fname = __get_cluster_file(cluster_type, cid)

    logger.info("Serialize cluster (" + cluster_type + ") in " + fname)

    # The context manager guarantees the pickle is flushed and the handle
    # released, even if dumping fails.
    with open(fname, 'wb') as c_file:
        pickle.dump(cluster_object, c_file)

    if cluster_type == HadoopCluster.get_cluster_type():
        # A Hadoop cluster has no linked Hadoop cluster to refresh.
        return

    hc_link_fname = __get_hc_link_file(cluster_type, cid)
    if os.path.exists(hc_link_fname):
        # The first line of the link file holds the id of the linked cluster.
        with open(hc_link_fname) as link_file:
            hc_id = int(link_file.readline())
        serialize_cluster(HadoopCluster.get_cluster_type(), hc_id,
                          cluster_object.hc)
def change_conf(self, params, conf_file=None, default_file=MR_CONF_FILE):
    """Modify Hadoop configuration.

    For each g5k cluster, the ``*.xml`` files of ``self.conf_dir`` are
    fetched from the first host into a local temporary dir, modified in
    place and handed to ``self._copy_conf`` together with the hosts of that
    cluster.

    Args:
      params (dict of str:str):
        The parameters to be changed in the form key:value.
      conf_file (str, optional):
        The file where parameters should be set. If not specified, all
        files are checked for the parameter name and the parameter is set
        in the file where the property is found. If not found, the
        parameter is set in the default file.
      default_file (str, optional): The default conf file where to set the
        parameter if not found. Only applies when conf_file is not set.
    """

    # The same local staging directory is reused for every cluster.
    tmp_dir = "/tmp/mliroz_temp_hadoop/"

    for cluster in self.hw.get_clusters():
        hosts = cluster.get_hosts()
        first_host = hosts[0]

        # Copy conf files from first host in the cluster
        action = Remote("ls " + self.conf_dir + "/*.xml", [first_host])
        action.run()
        output = action.processes[0].stdout

        remote_conf_files = [os.path.join(self.conf_dir, f)
                             for f in output.split()]

        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir)

        action = Get([first_host], remote_conf_files, tmp_dir)
        action.run()

        # Do replacements in temp file.
        # dict.items() is used (rather than the Python-2-only iteritems())
        # so the method runs unchanged on Python 2 and 3.
        if conf_file:
            f = os.path.join(tmp_dir, conf_file)
            for name, value in params.items():
                replace_in_xml_file(f, name, value, True)
        else:
            temp_conf_files = [os.path.join(tmp_dir, f)
                               for f in os.listdir(tmp_dir)]

            for name, value in params.items():
                for f in temp_conf_files:
                    if replace_in_xml_file(f, name, value):
                        break
                else:
                    # Property not found in any file - add it in default_file
                    logger.info("Parameter with name " + name + " has not "
                                "been found in any conf file. Setting it "
                                "in " + default_file)
                    f = os.path.join(tmp_dir, default_file)
                    replace_in_xml_file(f, name, value, True)

        # Copy back the files to all hosts
        self._copy_conf(tmp_dir, hosts)
def initial_state(self, outdir=None):
    """Log, plot and store the initial state of the boxes.

    The 'Delta' and 'Mass' values of ``self.Boxes`` are logged as a table,
    the initial state is plotted through ``self.plot_state``, and
    ``self._Mass``, ``self._Flux`` and ``self._Partcoeff`` are set to arrays
    built from ``self.Boxes``, ``self.Flux`` and ``self.Partcoeff``. The
    initial deltas are also written to ``<outdir>/Delta.initial``, one
    ``<box> <delta>`` pair per line.

    Args:
        outdir (str, optional): Output directory. Defaults to
            ``self.result_dir + '/'``.

    Returns:
        list: The 'Delta' value of every box, in ``self.Boxes`` iteration
        order.
    """
    boxes = self.Boxes
    # values()/items() instead of the Python-2-only iter* variants, and
    # explicit list() around dict views, keep this working on Python 2 and 3.
    deltas = [box['Delta'] for box in boxes.values()]
    masses = [box['Mass'] for box in boxes.values()]

    logger.info(style.log_header('Initial boxes configuration\n')
                + ''.ljust(8)
                + ''.join([style.emph(name.rjust(10)) for name in boxes])
                + style.object_repr('\n' + 'Delta'.ljust(8))
                + ''.join([str(delta).rjust(10) for delta in deltas])
                + style.object_repr('\n' + 'Mass'.ljust(8))
                + ''.join([str(mass).rjust(10) for mass in masses]))

    if outdir is None:
        outdir = self.result_dir + '/'

    self.plot_state(list(boxes.keys()), array(deltas),
                    name='_initial', outdir=outdir)

    self._Mass = array(masses)
    self._Flux = array([list(box.values())
                        for box in self.Flux.values()])
    self._Partcoeff = array([list(box.values())
                             for box in self.Partcoeff.values()])

    # The context manager closes the file even if a write fails.
    with open(outdir + '/Delta.initial', 'w') as f:
        for name, value in boxes.items():
            f.write(name + ' ' + str(value['Delta']) + '\n')

    return deltas
def install_os(reconfigure, tags=None):
    """Fetch a fresh copy of Kolla, patch it and run ``kolla-ansible``.

    Args:
        reconfigure: When truthy, ``kolla-ansible reconfigure`` is run;
            otherwise ``kolla-ansible deploy``.
        tags: Optional value passed to kolla-ansible after ``--tags``.
    """
    update_config_state()

    # Clone or pull Kolla
    # NOTE(review): the existence check is relative to the current working
    # directory while the removal targets SCRIPT_PATH/kolla - confirm both
    # are meant to be the same directory.
    if os.path.isdir('kolla'):
        logger.info("Remove previous Kolla installation")
        kolla_path = os.path.join(SCRIPT_PATH, "kolla")
        call("rm -rf %s" % kolla_path, shell=True)

    logger.info("Cloning Kolla")
    call("cd %s ; git clone %s -b %s > /dev/null"
         % (SCRIPT_PATH, KOLLA_REPO, KOLLA_BRANCH), shell=True)

    # Adjacent literals instead of a backslash continuation inside the
    # string, which leaked the continuation/indentation into the message.
    logger.warning("Patching kolla, this should be "
                   "deprecated with the new version of Kolla")

    playbook = os.path.join(SCRIPT_PATH, "ansible/patches.yml")
    inventory_path = os.path.join(SYMLINK_NAME, 'multinode')
    run_ansible([playbook], inventory_path, STATE['config'])

    kolla_cmd = [os.path.join(SCRIPT_PATH, "kolla", "tools", "kolla-ansible")]
    kolla_cmd.append('reconfigure' if reconfigure else 'deploy')

    kolla_cmd.extend(["-i", "%s/multinode" % SYMLINK_NAME,
                      "--passwords", "%s/passwords.yml" % SYMLINK_NAME,
                      "--configdir", "%s" % SYMLINK_NAME])

    if tags is not None:
        # Forward the caller's tags (the previous code referenced an
        # undefined name here and raised NameError).
        kolla_cmd.extend(["--tags", tags])

    call(kolla_cmd)
def __init__(self):
    """Initialize the engine, prepare the plots and greet the user.

    Runs the parent initializer, calls ``self.init_plots()``, sets the
    logger level to INFO and logs the welcome banner.
    """
    super(BoxModel, self).__init__()
    self.init_plots()

    logger.setLevel('INFO')
    banner = set_style('\n\n Welcome to the human isotopic Box Model\n',
                       'log_header')
    logger.info(banner)
def _run_or_abort(self, cmd, host, error_message, tear_down=True,
                  conn_params=None):
    """Run ``cmd`` on ``host`` over SSH and abort the program on failure.

    On a non-zero exit code, ``error_message``, the process stderr (when
    not None) and the command are logged; ``self.tear_down()`` is called if
    ``tear_down`` is true; then the interpreter exits with status 1.

    Args:
        cmd: Command to run.
        host: Host the command is run on.
        error_message: Message logged when the command fails.
        tear_down (bool, optional): Call ``self.tear_down()`` before exiting.
        conn_params (optional): Connection parameters; only forwarded to
            ``EX.SshProcess`` when truthy.
    """
    ssh_args = (cmd, host, conn_params) if conn_params else (cmd, host)
    p = EX.SshProcess(*ssh_args)
    p.run()

    if p.exit_code == 0:
        return

    logger.warn(error_message)
    if p.stderr is not None:
        logger.warn(p.stderr)
    logger.info(' '.join(p.cmd))

    if tear_down:
        self.tear_down()
    exit(1)
def __force_clean(self):
    """Stop previous Hive processes (if any) and remove all remote files
    created by it.

    ``jps`` is run on every host of ``self.hosts``; processes whose name is
    in ``hive_processes`` are killed with ``kill -9`` on that same host.
    ``self.clean_logs()`` is always called at the end.
    """

    # No process name is registered for Hive here, so as written nothing
    # is ever selected for killing.
    hive_processes = []

    force_kill = False
    for h in self.hosts:
        # Inspect the host being cleaned: the pids collected here are
        # killed on `h`, so they must come from `h` as well.
        proc = SshProcess("jps", h)
        proc.run()

        ids_to_kill = []
        for line in proc.stdout.splitlines():
            field = line.split()
            # jps may print a pid with no class name; skip such lines
            # instead of failing on field[1].
            if len(field) > 1 and field[1] in hive_processes:
                ids_to_kill.append(field[0])

        if ids_to_kill:
            force_kill = True
            proc = SshProcess("kill -9 " + " ".join(ids_to_kill), h)
            proc.run()

    if force_kill:
        logger.info(
            "Processes from previous hadoop deployments had to be killed")

    self.clean_logs()
def _copy_base_conf(self):
    """Copy base configuration files to tmp dir.

    A fresh ``spark-*`` directory is created under ``/tmp`` and stored in
    ``self.temp_conf_dir``. Local base configuration files (if the local
    dir exists) are copied into it, then the mandatory files still missing
    are fetched from ``self.conf_dir`` on the master.
    """
    self.temp_conf_dir = tempfile.mkdtemp("", "spark-", "/tmp")

    base_conf_files = []
    if not os.path.exists(self.local_base_conf_dir):
        logger.warn(
            "Local conf dir does not exist. Using default configuration")
    else:
        for entry in os.listdir(self.local_base_conf_dir):
            base_conf_files.append(
                os.path.join(self.local_base_conf_dir, entry))
        for local_file in base_conf_files:
            shutil.copy(local_file, self.temp_conf_dir)

    # No file is mandatory here, so the list of missing files starts empty.
    missing_conf_files = []
    for local_file in base_conf_files:
        try:
            missing_conf_files.remove(os.path.basename(local_file))
        except ValueError:
            # Not a mandatory file: nothing to strike off.
            pass

    logger.info("Copying missing conf files from master: " +
                str(missing_conf_files))

    remote_missing_files = []
    for conf_name in missing_conf_files:
        remote_missing_files.append(os.path.join(self.conf_dir, conf_name))

    fetch = Get([self.master], remote_missing_files, self.temp_conf_dir)
    fetch.run()
def __force_clean(self):
    """Stop previous Spark processes (if any) and remove all remote files
    created by it.

    ``jps`` is run on every host of ``self.hosts``; processes named
    ``Master`` or ``Worker`` are killed with ``kill -9`` on that host.
    ``self.clean_logs()`` is always called at the end.
    """

    spark_processes = ["Master", "Worker"]

    force_kill = False
    for h in self.hosts:
        proc = SshProcess("jps", h)
        proc.run()

        ids_to_kill = []
        for line in proc.stdout.splitlines():
            field = line.split()
            # jps may print a pid with no class name; skip such lines
            # instead of failing on field[1].
            if len(field) > 1 and field[1] in spark_processes:
                ids_to_kill.append(field[0])

        if ids_to_kill:
            force_kill = True

            logger.warn("Killing running Spark processes in host %s" %
                        style.host(h.address.split('.')[0]))

            proc = SshProcess("kill -9 " + " ".join(ids_to_kill), h)
            proc.run()

    if force_kill:
        logger.info(
            "Processes from previous hadoop deployments had to be killed")

    self.clean_logs()
def start_spark(self):
    """Start spark processes.

    In STANDALONE mode the master and slaves start scripts are run on the
    master. In YARN mode Hadoop is started (and waited for) if it is not
    running yet. ``self.running`` is set to True unless the cluster was
    already running or the standalone start scripts failed.
    """
    logger.info("Starting Spark")

    if self.running:
        logger.warn("Spark was already started")
        return

    if self.mode == STANDALONE_MODE:
        start_master = self.sbin_dir + "/start-master.sh;"
        start_slaves = self.sbin_dir + "/start-slaves.sh;"
        proc = SshProcess(start_master + start_slaves, self.master)
        proc.run()
        if not proc.finished_ok:
            logger.warn("Error while starting Spark")
            return
    elif self.mode == YARN_MODE:
        hadoop_running = self.hc.running
        if not hadoop_running:
            logger.warn("YARN services must be started first")
            self.hc.start_and_wait()

    self.running = True
def start_spark(self):
    """Start spark processes.

    STANDALONE mode: run ``start-master.sh`` then ``start-slaves.sh`` from
    ``self.sbin_dir`` on the master. YARN mode: make sure Hadoop is
    running, starting it when needed. On success ``self.running`` becomes
    True; nothing is done if Spark was already started.
    """
    logger.info("Starting Spark")

    already_started = self.running
    if already_started:
        logger.warn("Spark was already started")
        return

    if self.mode == STANDALONE_MODE:
        launch_cmd = (self.sbin_dir + "/start-master.sh;" +
                      self.sbin_dir + "/start-slaves.sh;")
        launcher = SshProcess(launch_cmd, self.master)
        launcher.run()
        if not launcher.finished_ok:
            # Leave self.running untouched when the scripts failed.
            logger.warn("Error while starting Spark")
            return
    elif self.mode == YARN_MODE:
        if not self.hc.running:
            logger.warn("YARN services must be started first")
            self.hc.start_and_wait()

    self.running = True
def __init__(self):
    """Initialize the execo engine.

    Runs the parent initializer, prepares the plots, logs the welcome
    banner and dumps the instance attributes at debug level.
    """
    super(IsotopicBoxModel, self).__init__()
    self.init_plots()

    welcome = '\n\n Welcome to the ' + 'human isotopic Box Model\n'
    logger.info(style.log_header(welcome))

    attributes_dump = pformat(self.__dict__)
    logger.debug(attributes_dump)
def get_host(self):
    """Return the first node of an existing reservation (if any), or of a
    new reservation.

    A current OAR job on the cluster's site whose name equals
    ``self.options.job_name`` is reused; otherwise
    ``self._make_reservation`` is called. The program exits with status 6
    when no job id is available afterwards.

    Returns:
        The first element of the job's node list.
    """
    # Look if there is a running job
    self.site = get_cluster_site(self.config['cluster'])
    jobs = EX5.get_current_oar_jobs([self.site])

    self.job_id = None
    for t in jobs:
        job_name = EX5.get_oar_job_info(t[0], self.site)['name']
        if job_name == self.options.job_name:
            self.job_id = t[0]
            break

    if self.job_id:
        logger.info('Using job %s' % style.emph(self.job_id))
    else:
        logger.info('Making a new reservation')
        # presumably sets self.job_id on success - verify against
        # _make_reservation
        self._make_reservation(self.site)

    if not self.job_id:
        logger.error("Could not get a reservation for the job")
        exit(6)

    EX5.wait_oar_job_start(self.job_id, self.site)

    # Query the node list once and reuse it for both display and return.
    nodes = EX5.get_oar_job_nodes(self.job_id, self.site)
    pp(nodes)
    return nodes[0]
def run(self):
    """Run the spack installation of Chameleon/StarPU and log its outcome.

    The working directory is changed to ``self.result_dir`` first. Errors
    raised while running the installation are printed as a traceback and
    not propagated; "Fin..." is always logged at the end.
    """
    # Go to the result folder before everything
    os.chdir(self.result_dir)

    try:
        spack_command = 'spack install -v chameleon@trunk+starpu+fxt ^starpu@svn-trunk+fxt'

        logger.info("Starting StarPU installation...")
        spack_process = Process(spack_command).start()
        spack_process.wait()
        logger.info("StarPU installation DONE...")

        if not spack_process.ok:
            # str() keeps the report working even if no reason was recorded.
            logger.info("Error : " + str(spack_process.error_reason))
        else:
            logger.info("spac stdout: {}".format(spack_process.stdout))
        spack_process.kill()
    except Exception:
        # Only ordinary errors are reported here; KeyboardInterrupt and
        # SystemExit are left to propagate so the run can be interrupted.
        traceback.print_exc()
    finally:
        logger.info("Fin...")
def _copy_xp_output(self):
    """Copy experiment's output.

    Does nothing unless ``self.output_path`` is set. The ``xp.output`` path
    is first fetched from HDFS into ``/tmp`` on the Hadoop master, then
    copied from the master into ``<output_path>/<comb_id>``.
    """
    if not self.output_path:
        return

    remote_path = self.macro_manager.test_macros["xp.output"]  # TODO: what happens if not specified?
    local_path = os.path.join(self.output_path, str(self.comb_id))
    logger.info("Copying output to " + local_path)

    tmp_dir = "/tmp"
    staged_path = os.path.join(tmp_dir, os.path.basename(remote_path))

    # Remove file in tmp dir if exists
    cleanup = SshProcess("rm -rf " + staged_path, self.hc.master)
    cleanup.run()

    # Get files in master
    self.hc.execute("fs -get " + remote_path + " " + tmp_dir,
                    verbose=False)

    # Copy files from master
    fetch = Get([self.hc.master], [staged_path], local_path)
    fetch.run()
def _initialize_conf(self):
    """Merge locally-specified configuration files with default files from
    the distribution.

    Files of ``self.local_base_conf_dir`` (when it exists) are copied into
    ``self.init_conf_dir``; mandatory files that were not found locally are
    then retrieved from ``self.conf_dir`` on ``self.master``.
    """

    if os.path.exists(self.local_base_conf_dir):
        base_conf_files = [
            os.path.join(self.local_base_conf_dir, f)
            for f in os.listdir(self.local_base_conf_dir)
        ]
        for f in base_conf_files:
            shutil.copy(f, self.init_conf_dir)
    else:
        logger.warn(
            "Local conf dir does not exist. Using default configuration")
        base_conf_files = []

    # Compute the missing files into a new list: removing entries from
    # self.conf_mandatory_files in place would shrink the instance's list
    # of mandatory files for every later use.
    provided = set(os.path.basename(f) for f in base_conf_files)
    missing_conf_files = [
        f for f in self.conf_mandatory_files if f not in provided
    ]

    logger.info("Copying missing conf files from master: " +
                str(missing_conf_files))

    remote_missing_files = [
        os.path.join(self.conf_dir, f) for f in missing_conf_files
    ]

    action = Get([self.master], remote_missing_files, self.init_conf_dir)
    action.run()
def __init__(self, hosts, topo_list=None):
    """Create a Hadoop topology object assigning each host to the
    corresponding rack.

    ``self.topology`` always ends up as a mapping from host to rack, whether
    the racks are given by the caller or discovered from the network
    adapters of each host.

    Args:
      hosts (list of Host):
        The hosts to be assigned a topology.
      topo_list (list of str, optional):
        The racks to be assigned to each host. len(hosts) should be equal to
        len(topo_list); otherwise it is ignored and the topology is
        discovered automatically. A ready-made host-to-rack mapping is
        also accepted and used as is.
    """

    if topo_list:
        if len(hosts) == len(topo_list):
            if isinstance(topo_list, dict):
                self.topology = topo_list
            else:
                # Pair every host with its rack so the attribute has the
                # same host -> rack shape as the discovered topology.
                self.topology = dict(zip(hosts, topo_list))
            return
        else:
            logger.warn("hosts and topology have not the same length.")

    logger.info("Discovering topology automatically")
    self.topology = {}
    for h in hosts:
        nw_adapters = get_host_attributes(h)[u'network_adapters']
        for nwa in nw_adapters:
            # The adapter carrying the host's own address tells which
            # switch the host is plugged into. Hosts without such an
            # adapter get no entry.
            if (u'network_address' in nwa and
                    nwa[u'network_address'] == h.address):
                self.topology[h] = "/" + nwa[u'switch']
                break
def _copy_base_conf(self):
    """Copy base configuration files to tmp dir.

    A fresh ``hadoop-*`` directory is created under ``/tmp`` and stored in
    ``self.temp_conf_dir``. Local base configuration files (if the local
    dir exists) are copied into it; the core, HDFS and MapReduce
    configuration files not provided locally are fetched from
    ``self.conf_dir`` on the master.
    """
    self.temp_conf_dir = tempfile.mkdtemp("", "hadoop-", "/tmp")

    base_conf_files = []
    if not os.path.exists(self.local_base_conf_dir):
        logger.warn(
            "Local conf dir does not exist. Using default configuration")
    else:
        for entry in os.listdir(self.local_base_conf_dir):
            base_conf_files.append(
                os.path.join(self.local_base_conf_dir, entry))
        for local_file in base_conf_files:
            shutil.copy(local_file, self.temp_conf_dir)

    # Start from all mandatory files and strike off those found locally.
    missing_conf_files = [CORE_CONF_FILE, HDFS_CONF_FILE, MR_CONF_FILE]
    for local_file in base_conf_files:
        try:
            missing_conf_files.remove(os.path.basename(local_file))
        except ValueError:
            # Not a mandatory file (or already struck off).
            pass

    logger.info("Copying missing conf files from master: " + str(
        missing_conf_files))

    remote_missing_files = []
    for conf_name in missing_conf_files:
        remote_missing_files.append(os.path.join(self.conf_dir, conf_name))

    fetch = Get([self.master], remote_missing_files, self.temp_conf_dir)
    fetch.run()
def load(self):
    """Load the configuration file.

    The YAML file at ``self.config_path` `` is read and merged over
    ``DEFAULT_CONFIG`` into ``self.config``; the content of
    ``NETWORK_FILE`` is stored in ``self.networks``. If the configuration
    file cannot be read or parsed, the error is reported and the program
    exits with status 23.

    Returns:
        dict: The merged configuration (also stored in ``self.config``).
    """

    # Load the configuration file
    try:
        with open(self.config_path) as config_file:
            # NOTE(review): yaml.load without an explicit Loader can
            # construct arbitrary Python objects and is rejected by recent
            # PyYAML releases - consider yaml.safe_load if the file only
            # holds plain data.
            config = yaml.load(config_file)
    except Exception:
        logger.error("Error reading configuration file %s" %
                     self.config_path)
        t, value = sys.exc_info()[:2]
        print("%s %s" % (str(t), str(value)))
        sys.exit(23)

    # Load g5k networks
    with open(NETWORK_FILE) as network_file:
        self.networks = yaml.load(network_file)

    self.config = {}
    self.config.update(DEFAULT_CONFIG)
    # An empty YAML document loads as None: fall back to the defaults only.
    self.config.update(config or {})

    logger.info("Configuration file loaded : %s" % self.config_path)
    logger.info(pf(self.config))

    return self.config
def _copy_xp_output(self):
    """Copy experiment's output.

    When ``self.output_path`` is set, the ``xp.output`` path is pulled out
    of HDFS into ``/tmp`` on the Hadoop master and then retrieved from the
    master into ``<output_path>/<comb_id>``.
    """
    if not self.output_path:
        return

    remote_path = self.macro_manager.test_macros[
        "xp.output"]  # TODO: what happens if not specified?
    local_path = os.path.join(self.output_path, str(self.comb_id))
    logger.info("Copying output to " + local_path)

    tmp_dir = "/tmp"
    path_on_master = os.path.join(tmp_dir, os.path.basename(remote_path))

    # Remove file in tmp dir if exists
    remover = SshProcess("rm -rf " + path_on_master, self.hc.master)
    remover.run()

    # Get files in master
    hdfs_get = "fs -get " + remote_path + " " + tmp_dir
    self.hc.execute(hdfs_get, verbose=False)

    # Copy files from master
    download = Get([self.hc.master], [path_on_master], local_path)
    download.run()
def boot_vms_by_core(vms):
    """Boot the given VMs in successive waves, one VM per cpuset per wave.

    The VMs are grouped by their ``'cpuset'`` value. Each wave starts the
    next pending VM of every cpuset and waits for them to have started
    before launching the following wave.

    Args:
        vms (list of dict): VM descriptions; the keys ``'host'``,
            ``'cpuset'`` and ``'id'`` are read here.

    Returns:
        bool: True when every VM has started (or ``vms`` is empty), False
        as soon as a wave fails to start.
    """
    n_vm = len(vms)
    if n_vm == 0:
        return True

    # Short host name (first label of the address), used as log prefix.
    first_host = vms[0]['host']
    if isinstance(first_host, Host):
        host = first_host.address.split('.')[0]
    else:
        host = first_host.split('.')[0]

    # Group the VMs by cpuset in a single pass.
    sub_vms = {}
    for vm in vms:
        sub_vms.setdefault(vm['cpuset'], []).append(vm)

    booted_vms = 0
    while sub_vms:
        vms_to_boot = []
        # Iterate over a snapshot of the keys: exhausted cpusets are
        # removed from the dict inside the loop.
        for i_core in list(sub_vms):
            pending = sub_vms[i_core]
            vms_to_boot.append(pending.pop(0))
            if not pending:
                del sub_vms[i_core]

        # Sort the ids themselves: dicts have no ordering on Python 3.
        logger.info(style.Thread(host) + ': Starting VMS ' + ', '.join(
            sorted(vm['id'] for vm in vms_to_boot)))
        start_vms(vms_to_boot).run()
        booted = wait_vms_have_started(vms_to_boot)
        if not booted:
            return False
        booted_vms += len(vms_to_boot)
        logger.info(style.Thread(host) + ': ' +
                    style.emph(str(booted_vms) + '/' + str(n_vm)))
    return True
def prepare_dataset(self, comb):
    """Prepare the dataset to be used in the next set of experiments.

    The dataset's local file is pushed to ``self.div_p2p.host`` under
    ``self.div_p2p.remote_dir`` and the stats manager is notified.

    Args:
      comb (dict): The combination containing the dataset's parameters.

    Returns:
      dict: The dataset parameters (``ds.class.path`` and ``ds.class``).
    """
    ds_info = self.comb_manager.get_ds_class_params(comb)
    ds_class_name, ds_params = ds_info

    local_path = ds_params["local_path"]
    dataset_name = os.path.basename(local_path)
    remote_path = os.path.join(self.div_p2p.remote_dir, dataset_name)

    # Copy dataset to host
    logger.info(self._th_prefix() + "Prepare dataset with combination " +
                str(self.comb_manager.get_ds_parameters(comb)))
    upload = TaktukPut([self.div_p2p.host], [local_path], remote_path)
    upload.run()

    # Notify stats manager
    self.stats_manager.add_ds(self.ds_id, comb)

    return {
        "ds.class.path": remote_path,
        "ds.class": ds_class_name,
    }
def plot_state(self, boxes, deltas, name='', outdir=None):
    """Make a graph of a given state and save it as a PNG file.

    One node is drawn per box, labelled with the box name and its delta
    (7 decimals). One edge is drawn per non-zero flux of ``self.Flux``,
    oriented from source to destination for a positive flux and reversed
    for a negative one.

    Args:
        boxes: Names of the boxes to draw; each must be a key of
            ``self.plots_conf``.
        deltas: Delta values, indexed like ``boxes``.
        name (str, optional): Suffix inserted in the output file name.
        outdir (str, optional): Output directory. Defaults to
            ``self.result_dir``.
    """
    graph = Dot(graph_type='digraph', fontname="Verdana", size="10, 5",
                fixedsize=True)

    for i_box, box in enumerate(boxes):
        box_conf = self.plots_conf[box]
        # Sum of the positions, in self.color_chars, of the characters
        # following '#' in the color; low sums get a white label.
        color_weight = sum([self.color_chars.index(col)
                            for col in box_conf['color'].split('#')[1]])
        textcolor = 'white' if color_weight < 35 else 'black'
        label = ('<<font POINT-SIZE="10" color="' + textcolor + '">' + box
                 + '<br/> ' + ("%.7f" % round(deltas[i_box], 7))
                 + '</font>>')
        graph.add_node(Node(box, style="filled", label=label,
                            fillcolor=box_conf['color'],
                            shape=box_conf['shape']))

    # items() instead of the Python-2-only iteritems().
    for box_from, boxes_to in self.Flux.items():
        for box_to, flux in boxes_to.items():
            flux_label = '<<font POINT-SIZE="10">' + str(flux) + '</font>>'
            # Each branch adds its own edge, so a value that is neither
            # positive nor negative (0, NaN) never re-adds a previous edge.
            if flux > 0:
                graph.add_edge(Edge(box_from, box_to, label=flux_label))
            elif flux < 0:
                graph.add_edge(Edge(box_to, box_from, label=flux_label))

    if outdir is None:
        outdir = self.result_dir
    outfile = outdir + '/state' + name + '.png'
    graph.write_png(outfile)
    logger.info('State has been saved to ' + set_style(outfile, 'emph'))
def __init__(self, hosts, topo_list=None, config_file=None):
    """Create a new Hadoop cluster with the given hosts and topology.

    Args:
      hosts (list of Host):
        The hosts to be assigned a topology. The first one is the master.
      topo_list (list of str, optional):
        The racks to be assigned to each host. len(hosts) should be equal to
        len(topo_list).
      config_file (str, optional):
        The path of the config file to be used. Values it does not define
        come from ``self.defaults``.
    """

    # Load cluster properties
    config = ConfigParser(self.defaults)
    config.add_section("cluster")
    config.add_section("local")

    if config_file:
        # Close the file as soon as it has been parsed.
        with open(config_file) as config_fp:
            config.readfp(config_fp)

    self.base_dir = config.get("cluster", "hadoop_base_dir")
    self.conf_dir = config.get("cluster", "hadoop_conf_dir")
    self.logs_dir = config.get("cluster", "hadoop_logs_dir")
    self.hadoop_temp_dir = config.get("cluster", "hadoop_temp_dir")
    self.hdfs_port = config.getint("cluster", "hdfs_port")
    self.mapred_port = config.getint("cluster", "mapred_port")
    self.local_base_conf_dir = config.get("local", "local_base_conf_dir")

    self.bin_dir = self.base_dir + "/bin"
    # NOTE(review): sbin_dir points to "/bin" like bin_dir - confirm this
    # is intended for the targeted Hadoop layout.
    self.sbin_dir = self.base_dir + "/bin"

    # Configure master and slaves
    self.hosts = hosts
    self.master = hosts[0]

    # Create topology
    self.topology = HadoopTopology(hosts, topo_list)

    # Store cluster information
    self.host_clusters = {}
    for h in self.hosts:
        g5k_cluster = get_host_cluster(h)
        self.host_clusters.setdefault(g5k_cluster, []).append(h)

    # Create a string to display the topology: rack -> host addresses.
    # items() instead of the Python-2-only iteritems().
    t = {}
    for key, value in self.topology.topology.items():
        t.setdefault(value, []).append(key.address)
    log_topo = ', '.join([style.user2(k) + ': ' +
                          ' '.join(map(lambda x: style.host(x.split('.')[0]),
                                       v))
                          for k, v in t.items()])

    logger.info("Hadoop cluster created with master %s, hosts %s and "
                "topology %s",
                style.host(self.master.address),
                ' '.join([style.host(h.address.split('.')[0])
                          for h in self.hosts]),
                log_topo)
def define_parameters(self):
    """Create ``self.sweeper`` from the parameters obtained for "conf.xml".

    The combinations are persisted under ``<result_dir>/sweeps`` and the
    number of remaining combinations is logged.
    """
    combinations = sweep(self.get_parameters("conf.xml"))
    sweeps_dir = os.path.join(self.result_dir, "sweeps")
    self.sweeper = ParamSweeper(sweeps_dir, combinations)

    remaining = len(self.sweeper.get_remaining())
    logger.info('Number of parameters combinations %s', remaining)
def update_config_state():
    """
    Update STATE['config'] with the config file options

    The file named by STATE['config_file'] is parsed as YAML and merged
    into STATE['config'], which is then logged.
    """
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary Python objects - confirm the config file is trusted.
    with open(STATE['config_file'], 'r') as stream:
        file_options = yaml.load(stream)
        STATE['config'].update(file_options)

    logger.info("Reloaded config %s", STATE['config'])
def update_config_state():
    """
    Update STATE['config'] with the config file options

    Reads the YAML file whose path is STATE['config_file'], merges its
    content into STATE['config'] and logs the resulting configuration.
    """
    path_to_config = STATE['config_file']

    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary Python objects - confirm the config file is trusted.
    with open(path_to_config, 'r') as handle:
        overrides = yaml.load(handle)
        STATE['config'].update(overrides)

    logger.info("Reloaded config %s", STATE['config'])
def _remove_xp_output(self):
    """Remove experiment's output.

    Only acts when ``self.remove_output`` is set: the ``xp.output`` path is
    then deleted from HDFS with ``fs -rmr``.
    """
    if not self.remove_output:
        return

    logger.info("Remove output")
    output_path = self.macro_manager.test_macros["xp.output"]
    # TODO: what happens if not specified?
    self.hc.execute("fs -rmr " + output_path, verbose=False)
def _remove_xp_output(self):
    """Remove experiment's output.

    When ``self.remove_output`` is true, runs ``fs -rmr`` on the
    ``xp.output`` macro path through the Hadoop cluster.
    """
    if not self.remove_output:
        return

    logger.info("Remove output")
    # TODO: what happens if not specified?
    rm_command = "fs -rmr " + self.macro_manager.test_macros["xp.output"]
    self.hc.execute(rm_command, verbose=False)
def _copy_xp_stats(self):
    """Copy job stats and clean them in the cluster.

    Only acts when ``self.stats_path`` is set: the Hadoop cluster is
    stopped, its history is copied into ``<stats_path>/<comb_id>`` and the
    history is then cleaned.
    """
    if not self.stats_path:
        return

    destination = os.path.join(self.stats_path, str(self.comb_id))
    logger.info("Copying stats to " + destination)

    hadoop = self.hc
    hadoop.stop()
    hadoop.copy_history(destination)
    hadoop.clean_history()
def submit_job(self, comb):
    """Submit the batch script for ``comb`` with qsub.

    qsub is run from the combination's directory
    (``parent_dir + slugify(comb) + '/'``).

    Args:
        comb: The parameter combination to submit.

    Returns:
        str: The part of the last output line that precedes the first '.'
        (presumably the job id - verify against the qsub output format).

    Raises:
        IndexError: If the command produced no output at all.
    """
    logger.info('Submiting job on ' + jobserver)
    comb_dir = parent_dir + slugify(comb) + '/'
    job_sub = sp.Popen('cd ' + comb_dir +
                       ' ; /usr/local/bin/qsub /home/stephane/ExamplePBS/batch_single',
                       shell=True, stdout=sp.PIPE, stderr=sp.STDOUT,
                       universal_newlines=True)
    # communicate() reads all the output AND waits for the child, so it is
    # reaped instead of lingering; universal_newlines yields text on both
    # Python 2 and 3.
    output = job_sub.communicate()[0]
    return output.splitlines()[-1].split('.')[0]
def setup_result_dir(self):
    """Compute the timestamped result directory name.

    ``self.result_dir`` is set to ``<script_path>/[test_]results_<date>``,
    the ``test_`` prefix being added when ``self.options.is_a_test`` is
    true, and the resulting path is logged.
    """
    prefix = "test_" if self.options.is_a_test else ""
    timestamp = time.strftime("%Y-%m-%d--%H-%M-%S")

    self.result_dir = script_path + '/' + prefix + 'results_' + timestamp
    logger.info('resutlt directory: {}'.format(self.result_dir))
def create_paramsweeper(self):
    """Generate an iterator over combination parameters.

    The parameters come from ``self.parameters`` when it is set and from
    ``self.define_parameters()`` otherwise. The resulting sweeper is stored
    in ``self.sweeper`` and persisted under ``<result_dir>/sweeps``.
    """
    # Always bind `parameters`: previously it was only assigned when
    # self.parameters was None, so preset parameters raised a NameError.
    if self.parameters is None:
        parameters = self.define_parameters()
    else:
        parameters = self.parameters

    logger.detail(pformat(parameters))
    sweeps = sweep(parameters)
    logger.info('% s combinations', len(sweeps))
    self.sweeper = ParamSweeper(path.join(self.result_dir, "sweeps"),
                                sweeps)
def submit_job(self, comb):
    """Submit the batch script for ``comb`` through SSH on the job server.

    Returns:
        The part of the last stdout line that precedes the first '.'.
    """
    logger.info('Submit job on '+ jobserver)

    comb_dir = parent_dir + slugify(comb) + '/'
    qsub_cmd = ('cd ' + comb_dir +
                ' ; /usr/local/bin/qsub /home/stephane/ExamplePBS/batch_single')
    job_sub = SshProcess(qsub_cmd, jobserver).run()

    last_line = job_sub.stdout.splitlines()[-1]
    return last_line.split('.')[0]
def stop(self):
    """Mark Cassandra, and the cluster itself, as no longer running.

    Initialization is checked first through ``self._check_initialization``.
    """
    self._check_initialization()

    logger.info("Stopping Cassandra")

    # Chained assignment binds targets left to right: running_cassandra
    # first, then running.
    self.running_cassandra = self.running = False
def execute_job(self, job, node=None, verbose=True):
    """Execute the given Spark job in the specified node.

    The cluster is started first if it is not running. The job's files are
    copied to ``/tmp`` on the node, the job is submitted with
    ``spark-submit`` and its outputs and success flag are stored in
    ``job``.

    Args:
      job (SparkJob):
        The job object.
      node (Host, optional):
        The host were the command should be executed. If not provided,
        self.master is chosen.
      verbose (bool, optional):
        If True stdout and stderr of remote process is displayed.

    Returns (tuple of str):
      A tuple with the standard and error outputs of the process executing
      the job.
    """
    if not self.running:
        logger.warn("The cluster was stopped. Starting it automatically")
        self.start()

    target = self.master if node is None else node
    exec_dir = "/tmp"

    # Copy necessary files to cluster
    upload = Put([target], job.get_files_to_copy(), exec_dir)
    upload.run()

    # Get command
    submit_cmd = self.bin_dir + "/spark-submit " + job.get_command(exec_dir)

    # Execute
    logger.info("Executing spark job. Command = {" + submit_cmd +
                "} in " + str(target))

    proc = SshProcess(submit_cmd, target)

    if verbose:
        red_color = '\033[01;31m'
        proc.stdout_handlers.append(sys.stdout)
        proc.stderr_handlers.append(
            ColorDecorator(sys.stderr, red_color))

    proc.start()
    proc.wait()

    # Get job info
    job.stdout, job.stderr = proc.stdout, proc.stderr
    job.success = (proc.exit_code == 0)

    return proc.stdout, proc.stderr
def define_parameters(self):
    """Define the parameters of the benchmark sweep and create the sweeper.

    The number of cores per node is read from ``/proc/cpuinfo`` on the
    first node of ``self.cluster``. ``self.parameters`` and
    ``self.sweeper`` are set, and the number of remaining combinations is
    logged.
    """
    n_nodes = float(len(self.cluster))
    cores_per_node = SshProcess('cat /proc/cpuinfo | grep -i processor |wc -l',
                                self.cluster[0],
                                connection_params={
                                    'user': '******'
                                }).run().stdout
    max_core = n_nodes * float(cores_per_node)

    # Core counts that are perfect squares (0, 1, 4, 9, ...), strictly
    # between the number of nodes and the total number of cores. Built as
    # a real list so the parameters can be logged and swept on Python 3
    # too, where filter() is lazy.
    square_cores = [
        n for n in takewhile(lambda n: n < max_core,
                             (i ** 2 for i in count(0, 1)))
        if n > n_nodes
    ]

    # Define parameters.
    # Only the 'sp' benchmark is enabled. The disabled ones were: ft, ep,
    # lu, is and sg over power-of-two core counts (2**i within the same
    # bounds), and bt over the same square core counts as sp.
    self.parameters = {
        'Repeat': [1],
        "Freq": [2534000],
        "NPBclass": ['C'],
        "Benchmark": {
            'sp': {
                'n_core': square_cores
            }
        }
    }

    logger.info(self.parameters)

    # make all possible parameters object,
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(self.parameters))
    logger.info('Number of parameters combinations %s',
                len(self.sweeper.get_remaining()))
def workflow(self, comb):
    """Process one combination from parameter file to completion.

    Creates the parameter file, submits the job, polls every 10 seconds
    until the job is no longer running, then marks the combination as done
    in the sweeper.
    """
    self.create_par_file(comb)
    job_id = self.submit_job(comb)
    logger.info('Combination %s will be treated by job %s',
                slugify(comb), str(job_id))

    while True:
        if not self.is_job_running(job_id):
            break
        sleep(10)

    self.sweeper.done(comb)
def _get_primary_vlan(self):
    """
    Returns the primary vlan
    It's the vlan where node are put in when deploying

    The primary vlan is the first element of ``self.vlans``; None is
    returned when there is no vlan.
    """
    if len(self.vlans) == 0:
        return None

    primary = self.vlans[0]
    logger.info("Using vlan %s" % str(primary))
    return primary
def execute_job(self, job, node=None, verbose=True):
    """Execute the given Spark job in the specified node.

    Starts the cluster when it is stopped, uploads the job's files to
    ``/tmp`` on the node, runs ``spark-submit`` there and records the
    process outputs and success flag in ``job``.

    Args:
      job (SparkJob):
        The job object.
      node (Host, optional):
        The host were the command should be executed. If not provided,
        self.master is chosen.
      verbose (bool, optional):
        If True stdout and stderr of remote process is displayed.

    Returns (tuple of str):
      A tuple with the standard and error outputs of the process executing
      the job.
    """
    if not self.running:
        logger.warn("The cluster was stopped. Starting it automatically")
        self.start()

    if node is None:
        node = self.master

    exec_dir = "/tmp"

    # Copy necessary files to cluster
    Put([node], job.get_files_to_copy(), exec_dir).run()

    # Get command
    job_args = job.get_command(exec_dir)
    full_command = self.bin_dir + "/spark-submit " + job_args

    # Execute
    logger.info("Executing spark job. Command = {" + full_command +
                "} in " + str(node))

    runner = SshProcess(full_command, node)

    if verbose:
        red_color = '\033[01;31m'
        runner.stdout_handlers.append(sys.stdout)
        runner.stderr_handlers.append(ColorDecorator(sys.stderr, red_color))

    runner.start()
    runner.wait()

    # Get job info
    job.stdout = runner.stdout
    job.stderr = runner.stderr
    job.success = (runner.exit_code == 0)

    return runner.stdout, runner.stderr
def clean_data(self):
    """Remove all data used by Hive.

    The warehouse directory is removed from HDFS and the directory
    ``self.metastore_dir`` is deleted.
    """
    logger.info("Cleaning data")

    # Warehouse
    warehouse_rm = "fs -rm -r /user/hive/warehouse"
    self.hc.execute(warehouse_rm, verbose=False)

    # Metastore
    # TODO
    metastore = self.metastore_dir
    shutil.rmtree(metastore)
def define_parameters(self):
    """Create ``self.sweeper`` over the build/experiment parameter space.

    The swept parameters are the BLAS library, the experiment, the
    compiler and the parallel runtime. The combinations are persisted under
    ``<result_dir>/sweeps`` and the number of remaining ones is logged.
    """
    parameters = {}
    parameters['blas'] = ['none', 'mkl', 'atlas', 'openblas']
    parameters['experiment'] = ['aevol', 'raevol']
    parameters['compilator'] = ['gcc', 'intel']
    parameters['parallel'] = ['openmp', 'tbb']

    combinations = sweep(parameters)
    sweeps_dir = os.path.join(self.result_dir, "sweeps")
    self.sweeper = ParamSweeper(sweeps_dir, combinations)

    remaining = len(self.sweeper.get_remaining())
    logger.info('Number of parameters combinations %s', remaining)
def _create_warehouse(self):
    """Create the directories used by the warehouse in HDFS.

    Hadoop is started (and waited for) first if it is not running. ``/tmp``
    and ``/user/hive/warehouse`` are then created and made group-writable.
    """
    if not self.hc.running:
        logger.warn("Hadoop must be started first")
        self.hc.start_and_wait()

    logger.info("Creating warehouse dirs in HDFS")

    # Directories are created first, then their permissions are opened.
    hdfs_commands = (
        "fs -mkdir -p /tmp",
        "fs -mkdir -p /user/hive/warehouse",
        "fs -chmod g+w /tmp",
        "fs -chmod g+w /user/hive/warehouse",
    )
    for hdfs_command in hdfs_commands:
        self.hc.execute(hdfs_command, verbose=False)
def run_bench(output_folder, node):
    """Run the omp-tasks benchmarks on ``node``, debian first then hermitux.

    For each flavor, a sub-directory named after it is created in
    ``output_folder`` (through ``create_subdir``) and passed to the
    ``benchs.sh`` tool, which is run remotely from
    ``./unikernel-tools/benchs/bots``.

    Args:
        output_folder: Folder in which the per-flavor sub-directories are
            created.
        node: Host(s) the benchmark commands are run on.
    """
    for flavor in ("debian", "hermitux"):
        logger.info("Starting {} benchs...".format(flavor))
        result_folder = create_subdir(output_folder, flavor)
        bench_command = "../../tools/benchs.sh \"commands/omp-tasks/{}-omp-tasks\" {} 10 \"1 2 4 8 16\"".format(
            flavor, result_folder)
        Remote('cd ./unikernel-tools/benchs/bots && {}'.format(bench_command),
               node).run()
def build_roles(self):
    """
    Returns a dict that maps each role to a list of G5K nodes::

      { 'controller': [paravance-1, paravance-5], 'compute': [econome-1] }

    ``self.config['resources']`` maps each cluster to a ``{role: count}``
    dict. The deployed nodes of a cluster are handed out one at a time,
    cycling over that cluster's roles, until the cluster has no node left
    or every one of its roles has reached its aggregated count.

    Raises:
        Exception: If some role ends up with no node at all.
    """

    def mk_pools():
        "Indexes each node by its cluster to construct pools of nodes."
        pools = {}
        for cluster, nodes in groupby(
                self.deployed_nodes,
                lambda node: node.address.split('-')[0]):
            # setdefault/extend merges the groups of a same cluster even
            # when the nodes are not sorted by cluster.
            pools.setdefault(cluster, []).extend(list(nodes))
        return pools

    def pick_nodes(pool, n):
        "Picks n node in a pool of nodes."
        nodes = pool[:n]
        del pool[:n]
        return nodes

    resources = self.config['resources']

    # Maps a role (eg, controller) with a list of G5K node
    roles_set = set()
    for cluster_roles in resources.values():
        roles_set.update(cluster_roles.keys())
    roles = {k: [] for k in roles_set}
    roles_goal = {k: 0 for k in roles_set}

    # compute the aggregated number of nodes per roles
    for cluster_roles in resources.values():
        for k, v in cluster_roles.items():
            roles_goal[k] = roles_goal[k] + v

    pools = mk_pools()
    for cluster, rs in resources.items():
        current = pick_nodes(pools[cluster], 1)
        # distribute node into roles: cycle over the cluster's roles at
        # most once per deployed node. list(rs) (rather than rs.keys())
        # keeps the repetition working on Python 3 dict views.
        for r in list(rs) * len(self.deployed_nodes):
            if not current:
                # The cluster's pool is exhausted.
                break
            if len(roles[r]) < roles_goal[r]:
                roles[r].extend(current)
                current = pick_nodes(pools[cluster], 1)

    logger.info("Roles: %s" % pf(roles))
    at_least_one = all(len(n) >= 1 for n in roles.values())
    if not at_least_one:
        # Even if we aren't in strict mode we guarantee that
        # there will be at least one node per role
        raise Exception("Role doesn't have at least one node each")

    return roles
def run(self):
    """Process every remaining parameter combination.

    The parameters are defined first, then ``self.workflow`` is called on
    each combination handed out by the sweeper until none remains.
    "Compilation DONE" is logged in all cases, even on error.
    """
    try:
        # Creation of the main iterator which is used for the first control loop.
        self.define_parameters()

        # While there are combinations to treat
        while True:
            if len(self.sweeper.get_remaining()) == 0:
                break
            comb = self.sweeper.get_next()
            if not comb:
                # Nothing was handed out this time: check again.
                continue
            self.workflow(comb)
    finally:
        logger.info("Compilation DONE")
def xp(self, comb):
    """Run the experiment for ``comb`` and report its outcome to the sweeper.

    The combination is marked as done when the experiment body completes,
    and cancelled otherwise. The number of remaining combinations is logged
    in both cases.
    """
    comb_ok = False
    try:
        # Placeholder: the whole experiment goes here.
        comb_ok = True
    finally:
        finish = self.sweeper.done if comb_ok else self.sweeper.cancel
        finish(comb)
        logger.info(style.step('%s Remaining'),
                    len(self.sweeper.get_remaining()))
def create_paramsweeper(self):
    """Test all the sites, with or without a KaVLAN and for several env.

    Returns:
        A ParamSweeper over every combination of kadeploy version, kavlan
        usage, site, number of nodes and environment, persisted under
        ``<result_dir>/sweeper``.
    """
    params = {}
    params["version"] = ['kadeploy3-dev', 'kadeploy3']
    params["kavlan"] = [True, False]
    params["site"] = get_g5k_sites()
    params["n_nodes"] = [1, 4, 10]
    params["env"] = ['wheezy-x64-base', 'wheezy-x64-prod', 'wheezy-x64-xen']

    logger.info('Defining parameters: %s', pformat(params))

    sweeper_dir = self.result_dir + "/sweeper"
    return ParamSweeper(sweeper_dir, sweep(params))