class mpi_bench(Engine):
    """Execo engine that runs the NPB LU MPI benchmark on several
    Grid'5000 clusters, sweeping over cluster, core count and problem size.

    Relies on execo / execo_g5k names imported at file level (oarsub,
    Remote, Put, ParamSweeper, ...).
    """

    def run(self):
        """Inherited method, put here the code for running the engine."""
        self.define_parameters()
        if self.prepare_bench():
            logger.info('Bench prepared on all frontends')
            self.run_xp()

    def define_parameters(self):
        """Create the iterator on the parameter combinations to be explored."""
        # fixed number of nodes per job
        self.n_nodes = 4
        # choose a list of clusters
        clusters = ['graphene', 'petitprince', 'edel', 'paradent', 'stremi']
        # compute the maximum number of cores among all clusters
        max_core = self.n_nodes * max([
            get_host_attributes(cluster + '-1')['architecture']['smt_size']
            for cluster in clusters])
        # n_core sweeps the powers of two in [n_nodes, max_core)
        # (list comprehension instead of Py2-only filter() so the value is a
        # list on both Python 2 and 3)
        self.parameters = {
            'cluster': clusters,
            'n_core': [c for c in takewhile(lambda c: c < max_core,
                                            (2 ** p for p in count(0, 1)))
                       if c >= self.n_nodes],
            'size': ['A', 'B', 'C']
        }
        logger.info(self.parameters)
        # define the iterator over the parameters combinations; the sweep
        # state is persisted in result_dir so a run can be resumed
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))

    def prepare_bench(self):
        """Bench configuration and compilation, copy binaries to frontends.

        Returns True if the preparation is ok, False otherwise.
        """
        logger.info("preparation: configure and compile benchmark")
        # the involved sites; the compilation is done on the first of these
        sites = list(set(map(get_cluster_site, self.parameters['cluster'])))
        # generate the bench compilation configuration: one 'lu' line per
        # (size, n_core) combination
        bench_list = '\n'.join(['lu\t%s\t%s' % (size, n_core)
                                for n_core in self.parameters['n_core']
                                for size in self.parameters['size']])
        # Reserving a node because compiling on the frontend is forbidden
        # and because we need mpif77
        jobs = oarsub([(OarSubmission(resources="nodes=1",
                                      job_type='allow_classic_ssh',
                                      walltime='0:10:00'), sites[0])])
        if not jobs[0][0]:
            # early return: the original fell through to the rsync below and
            # crashed with a NameError on 'nodes' when submission failed
            logger.error("unable to submit the compilation job")
            return False
        try:
            logger.info("copying bench archive to %s", sites[0])
            copy_bench = Put([sites[0]], ['NPB3.3-MPI.tar.bz2']).run()
            logger.info("extracting bench archive on %s", sites[0])
            extract_bench = Remote('tar -xjf NPB3.3-MPI.tar.bz2',
                                   [sites[0]]).run()
            logger.info("waiting job start %s", jobs[0])
            wait_oar_job_start(*jobs[0], prediction_callback=pred_cb)
            logger.info("getting nodes of %s", jobs[0])
            nodes = get_oar_job_nodes(*jobs[0])
            logger.info("configure bench compilation")
            conf_bench = Remote('echo "%s" > ~/NPB3.3-MPI/config/suite.def'
                                % bench_list, nodes).run()
            logger.info("compil bench")
            compilation = Remote('cd NPB3.3-MPI && make clean && make suite',
                                 nodes).run()
            logger.info("compil finished")
        except Exception:
            # narrowed from a bare except:; keep the traceback in the logs
            logger.exception("unable to compile bench")
            return False
        finally:
            oardel(jobs)
        # Copying binaries to all other frontends
        frontends = sites[1:]
        rsync = Remote('rsync -avuP ~/NPB3.3-MPI/ {{frontends}}:NPB3.3-MPI',
                       [get_host_site(nodes[0])] * len(frontends))
        rsync.run()
        return compilation.ok and rsync.ok

    def run_xp(self):
        """Iterate over the parameters and execute the bench."""
        while len(self.sweeper.get_remaining()) > 0:
            comb = self.sweeper.get_next()
            # skip combinations that need more cores than the cluster offers
            if comb['n_core'] > get_host_attributes(
                    comb['cluster'] + '-1')['architecture']['smt_size'] \
                    * self.n_nodes:
                self.sweeper.skip(comb)
                continue
            logger.info('Processing new combination %s', comb)
            site = get_cluster_site(comb['cluster'])
            jobs = oarsub([(OarSubmission(
                resources="{cluster='" + comb['cluster'] + "'}/nodes="
                          + str(self.n_nodes),
                job_type='allow_classic_ssh',
                walltime='0:10:00'), site)])
            if jobs[0][0]:
                try:
                    wait_oar_job_start(*jobs[0])
                    nodes = get_oar_job_nodes(*jobs[0])
                    bench_cmd = 'mpirun -H %s -n %i %s ~/NPB3.3-MPI/bin/lu.%s.%i' % (
                        ",".join([node.address for node in nodes]),
                        comb['n_core'],
                        get_mpi_opts(comb['cluster']),
                        comb['size'],
                        comb['n_core'])
                    lu_bench = SshProcess(bench_cmd, nodes[0])
                    lu_bench.stdout_handlers.append(self.result_dir + '/'
                                                    + slugify(comb) + '.out')
                    lu_bench.run()
                    if lu_bench.ok:
                        logger.info("comb ok: %s", comb)
                        self.sweeper.done(comb)
                        continue
                finally:
                    oardel(jobs)
            # reached on submission failure, bench failure or exception
            logger.info("comb NOT ok: %s", comb)
            self.sweeper.cancel(comb)
class oar_replay_workload(Engine):
    """Execo engine that deploys an OAR cluster on Grid'5000 nodes and
    replays scheduler workload traces on it.

    Relies on module-level names: script_path, prediction_callback, copy
    (shutil), and the execo / execo_g5k API.
    """

    def init(self):
        """Declare the command-line options and positional arguments."""
        parser = self.options_parser
        parser.add_option('--is_a_test',
                          dest='is_a_test',
                          action='store_true',
                          default=False,
                          help='prefix the result folder with "test", enter '
                               'a debug mode if fails and remove the job '
                               'afterward, unless it is a reservation')
        parser.add_option('--already_configured',
                          dest='already_configured',
                          action='store_true',
                          default=False,
                          help='if set, the OAR cluster is not re-configured')
        parser.add_option('--reservation_id',
                          help="Grid'5000 reservation job ID")
        parser.add_argument('experiment_config',
                            'The config JSON experiment description file')

    def setup_result_dir(self):
        """Build the timestamped result directory path (prefixed with
        'test_' for test runs)."""
        run_type = "test_" if self.options.is_a_test else ""
        self.result_dir = script_path + '/' + run_type + 'results_' + \
            time.strftime("%Y-%m-%d--%H-%M-%S")
        # typo fix: was 'resutlt directory'
        logger.info('result directory: {}'.format(self.result_dir))

    def _configure_oar(self, nodes):
        """Install and configure OAR: server on nodes[0], oar-node on the
        rest, then declare the resources. Raises RuntimeError on failure."""
        # install OAR; node install runs in the background while the server
        # (longer) install runs in the foreground
        install_cmd = "apt-get update; apt-get install -y "
        node_packages = "oar-node"
        logger.info("installing OAR nodes: {}".format(str(nodes[1:])))
        install_oar_nodes = Remote(install_cmd + node_packages, nodes[1:],
                                   connection_params={'user': '******'})
        install_oar_nodes.start()

        server_packages = ("oar-server oar-server-pgsql oar-user "
                           "oar-user-pgsql postgresql python3-pip "
                           "libjson-perl postgresql-server-dev-all")
        # pin a known-good oar3 revision and expose the kamelot scheduler
        install_oar_sched_cmd = """
        mkdir -p /opt/oar_sched; \
        cd /opt/oar_sched; \
        git clone https://github.com/oar-team/oar3.git; \
        cd oar3; \
        git checkout dce942bebc2; \
        pip3 install -e .; \
        cd /usr/lib/oar/schedulers; \
        ln -s /usr/local/bin/kamelot; \
        pip3 install psycopg2
        """
        logger.info("installing OAR server node: {}".format(str(nodes[0])))
        install_master = SshProcess(install_cmd + server_packages + ";" +
                                    install_oar_sched_cmd,
                                    nodes[0],
                                    connection_params={'user': '******'})
        install_master.run()
        install_oar_nodes.wait()
        if not install_master.ok:
            Report(install_master)

        # rewrite /etc/oar/oar.conf in place on every node
        configure_oar_cmd = """
        sed -i \
            -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
            -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
            -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
            -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
            -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
            -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
            -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
            -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
            -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
            -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
            -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
            -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
            -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
            -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
            /etc/oar/oar.conf
        """.format(result_dir=self.result_dir)
        configure_oar = Remote(configure_oar_cmd, nodes,
                               connection_params={'user': '******'})
        configure_oar.run()
        logger.info("OAR is configured on all nodes")

        # Configure server: create the DB, switch the default queue to the
        # kamelot scheduler and start the service
        create_db = "oar-database --create --db-is-local"
        config_oar_sched = ("oarnotify --remove-queue default;"
                            "oarnotify --add-queue default,1,kamelot")
        start_oar = "systemctl start oar-server.service"
        logger.info("configuring OAR database: {}".format(str(nodes[0])))
        config_master = SshProcess(create_db + ";" + config_oar_sched + ";" +
                                   start_oar,
                                   nodes[0],
                                   connection_params={'user': '******'})
        config_master.run()

        # propagate the oar user's SSH keys from the server to the nodes
        logger.info("configuring OAR SSH")
        oar_key = "/tmp/.ssh"
        Process('rm -rf ' + oar_key).run()
        Process('scp -o BatchMode=yes -o PasswordAuthentication=no '
                '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                '-o ConnectTimeout=20 -rp -o User=root ' +
                nodes[0].address + ":/var/lib/oar/.ssh"
                ' ' + oar_key).run()
        Put(nodes[1:], [oar_key], "/var/lib/oar/",
            connection_params={'user': '******'}).run()

        # declare the resources of every compute node on the server
        add_resources_cmd = """
        oarproperty -a cpu || true; \
        oarproperty -a core || true; \
        oarproperty -c -a host || true; \
        oarproperty -a mem || true; \
        """
        for node in nodes[1:]:
            add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(node=node.address)
        add_resources = SshProcess(add_resources_cmd, nodes[0],
                                   connection_params={'user': '******'})
        add_resources.run()

        if add_resources.ok:
            logger.info("oar is now configured!")
        else:
            raise RuntimeError("error in the OAR configuration: Abort!")

    def run(self):
        """Run the experiment: reserve/deploy nodes, configure OAR if
        needed, then replay every workload of the sweep."""
        already_configured = self.options.already_configured
        reservation_job_id = int(self.options.reservation_id) \
            if self.options.reservation_id is not None else None
        is_a_test = self.options.is_a_test

        if is_a_test:
            logger.warn('THIS IS A TEST! This run will use only a few '
                        'resources')

        # make the result folder writable for all
        os.chmod(self.result_dir, 0o777)
        # Import configuration
        with open(self.args[0]) as config_file:
            config = json.load(config_file)
        # backup configuration
        copy(self.args[0], self.result_dir)

        site = config["grid5000_site"]
        resources = config["resources"]
        nb_experiment_nodes = config["nb_experiment_nodes"]
        walltime = str(config["walltime"])
        env_name = config["kadeploy_env_name"]
        workloads = config["workloads"]
        # check if workloads exists (Suppose that the same NFS mount point
        # is present on the remote and the local environment)
        for workload_file in workloads:
            with open(workload_file):
                pass
            # copy the workloads files to the results dir
            copy(workload_file, self.result_dir)

        # define the workloads parameters
        self.parameters = {'workload_filename': workloads}
        logger.info('Workloads: {}'.format(workloads))

        # define the iterator over the parameters combinations
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))

        # Due to previous (using -c result_dir) run skip some combination
        logger.info('Skipped parameters:' +
                    '{}'.format(str(self.sweeper.get_skipped())))
        logger.info('Number of parameters combinations {}'.format(
            str(len(self.sweeper.get_remaining()))))
        logger.info('combinations {}'.format(
            str(self.sweeper.get_remaining())))

        if reservation_job_id is not None:
            # reuse an existing reservation instead of submitting a new job
            jobs = [(reservation_job_id, site)]
        else:
            jobs = oarsub([(OarSubmission(resources=resources,
                                          job_type='deploy',
                                          walltime=walltime), site)])
        job_id, site = jobs[0]
        if job_id:
            try:
                logger.info("waiting job start %s on %s" % (job_id, site))
                wait_oar_job_start(job_id, site,
                                   prediction_callback=prediction_callback)
                logger.info("getting nodes of %s on %s" % (job_id, site))
                nodes = get_oar_job_nodes(job_id, site)
                # sort the nodes for a deterministic server/compute split
                nodes = sorted(nodes, key=lambda node: node.address)
                # get only the necessary nodes under the switch
                if nb_experiment_nodes > len(nodes):
                    raise RuntimeError('The number of given node in the '
                                       'reservation ({}) do not match the '
                                       'requested resources '
                                       '({})'.format(len(nodes),
                                                     nb_experiment_nodes))
                nodes = nodes[:nb_experiment_nodes]
                logger.info("deploying nodes: {}".format(str(nodes)))
                deployed, undeployed = deploy(
                    Deployment(nodes, env_name=env_name),
                    check_deployed_command=already_configured)
                if undeployed:
                    logger.warn(
                        "NOT deployed nodes: {}".format(str(undeployed)))
                    raise RuntimeError('Deployement failed')

                if not already_configured:
                    self._configure_oar(nodes)

                # Do the replay
                logger.info('begining the replay')
                while len(self.sweeper.get_remaining()) > 0:
                    combi = self.sweeper.get_next()
                    workload_file = os.path.basename(
                        combi['workload_filename'])
                    oar_replay = SshProcess(script_path + "/oar_replay.py " +
                                            combi['workload_filename'] +
                                            " " + self.result_dir +
                                            " oar_gant_" + workload_file,
                                            nodes[0])
                    oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                      workload_file + '.out')
                    logger.info("replaying workload: {}".format(combi))
                    oar_replay.run()
                    if oar_replay.ok:
                        logger.info("Replay workload OK: {}".format(combi))
                        self.sweeper.done(combi)
                    else:
                        logger.info("Replay workload NOT OK: {}".format(combi))
                        self.sweeper.cancel(combi)
                        raise RuntimeError("error in the OAR replay: Abort!")
            except:
                # deliberate debug hook: drop into ipdb on any failure
                traceback.print_exc()
                ipdb.set_trace()
            finally:
                if is_a_test:
                    ipdb.set_trace()
                if reservation_job_id is None:
                    # only delete jobs we submitted ourselves
                    logger.info("delete job: {}".format(jobs))
                    oardel(jobs)
class raevol_matrix(Engine):
    """Execo engine running Aevol / R-Aevol evolution experiments on
    Grid'5000, dispatching one parameter combination per host via worker
    threads, with resume-from-NFS-backup support."""

    def __init__(self):
        """Overloading class initialization with parent and adding options"""
        super(raevol_matrix, self).__init__()
        self.options_parser.set_usage("usage: %prog ")
        self.options_parser.set_description("Execo Engine that can be used to" + \
            "perform automatic virtual machines experiments")
        self.options_parser.add_option("-n",
                                       dest="n_nodes",
                                       help="maximum number of nodes used",
                                       type="int",
                                       default=200)
        self.options_parser.add_option("-w",
                                       dest="walltime",
                                       help="walltime for the reservation",
                                       type="string",
                                       default="02:00:00")
        self.options_parser.add_option("-j",
                                       dest="oargrid_job_id",
                                       help="oargrid_job_id to relaunch an engine",
                                       type=int,
                                       default=None)
        self.options_parser.add_option("-k",
                                       dest="keep_alive",
                                       help="keep reservation alive ..",
                                       action="store_true")
        self.options_parser.add_option("-u",
                                       dest="selected_cluster",
                                       help="run on a specific cluster.",
                                       type="string",
                                       default="taurus")
        self.options_parser.add_option("-o",
                                       dest="outofchart",
                                       help="Run the engine outside days",
                                       action="store_true")
        self.options_parser.add_option("-s",
                                       dest="storage5k_job_id",
                                       help="storage5k_job_id to store the data",
                                       type=int)

    def run(self):
        """Main control loop: reserve hosts, then launch one workflow
        thread per free host until the sweep is exhausted."""
        if self.options.oargrid_job_id is not None:
            self.oar_job_id = self.options.oargrid_job_id
        else:
            self.oar_job_id = None

        self.list_of_clusters = ['parasilo', 'paravance', 'parapluie',
                                 'paranoia']
        try:
            # Creation of the main iterator which is used for the first
            # control loop.
            self.define_parameters()
            self.working_dir = '/data/jorouzaudcornabas_' + str(
                self.options.storage5k_job_id)

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts
                # for the experiments
                if self.oar_job_id is None:
                    self.submit_all_available_best_effort(
                        self.list_of_clusters, self.options.walltime)
                # Wait that the job starts
                logger.info('Waiting that the job start ' +
                            str(self.oar_job_id))
                wait_oar_job_start(self.oar_job_id)
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oar_job_nodes(self.oar_job_id)
                # Hosts deployment and configuration
                default_connection_params['user'] = '******'
                logger.info("Start hosts configuration")
                ex_log.setLevel('INFO')

                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads
                available_hosts = self.hosts
                threads = {}

                # Creating the unique folder for storing the results
                comb_dir = self.result_dir + '/logs'
                if not os.path.exists(comb_dir):
                    os.mkdir(comb_dir)

                logger.info("Starting the thread " +
                            str(self.is_job_alive()) + " " +
                            str(len(threads.keys())))
                # Checking that the job is running and not in Error
                while self.is_job_alive() or len(threads.keys()) > 0:
                    job_is_dead = False
                    # wait until a host is freed by a finished thread
                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.append(
                                    tmp_threads[t]['host'])
                                del threads[t]
                        sleep(5)
                        if not self.is_job_alive():
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        # sweep exhausted: drain the running threads
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    host = available_hosts[0]
                    available_hosts = available_hosts[1:]
                    logger.info("Launching thread")
                    t = Thread(target=self.workflow,
                               args=(comb, host, comb_dir))
                    threads[t] = {'host': host}
                    t.daemon = True
                    t.start()

                if not self.is_job_alive():
                    job_is_dead = True

                if job_is_dead:
                    # force a new reservation on the next outer iteration
                    self.oar_job_id = None
        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([self.oar_job_id])
                else:
                    logger.info('Keeping job alive for debugging')

    def define_parameters(self):
        """Build the persistent sweeper over the experiment parameters."""
        parameters = {
            'seed': [51456165, 33263658, 7158785, 456847894, 1223144,
                     878944, 121145, 3587842],
            'mutation': ['5e-4', '1e-4', '5e-5', '5e-6'],
            'env': ['const', 'lat_3', 'lat_all'],
            'selection': [750, 2000, 4000]
        }
        sweeps = sweep(parameters)
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweeps)
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))

    def make_reservation(self):
        """Reserve all free nodes of the selected cluster via oargridsub."""
        logger.info('Performing reservation')
        starttime = int(time.time() +
                        timedelta_to_seconds(datetime.timedelta(minutes=1)))
        planning = get_planning(elements=[self.options.selected_cluster],
                                starttime=starttime)
        slots = compute_slots(planning, self.options.walltime)
        wanted = {self.options.selected_cluster: 0}
        start_date, end_date, resources = find_first_slot(slots, wanted)
        wanted[self.options.selected_cluster] = resources[
            self.options.selected_cluster]
        actual_resources = distribute_hosts(resources, wanted)
        job_specs = get_jobs_specs(actual_resources, name='Aevol_diff_area')
        logger.info("try to reserve " + str(actual_resources))
        self.oargrid_job_id, _ = oargridsub(
            job_specs,
            walltime=end_date - start_date,
            # bug fix: was ['besteffort"' 'allow_classic_ssh'], which
            # concatenates into the single malformed type
            # 'besteffort"allow_classic_ssh'
            job_type=['besteffort', 'allow_classic_ssh'])
        logger.info("Reservation done")

    def make_reservation_local(self):
        """Perform a reservation of the required number of nodes, widening
        the time window by 3 days until enough nodes are found."""
        logger.info('Performing reservation')
        starttime = int(time.time() +
                        timedelta_to_seconds(datetime.timedelta(minutes=1)))
        endtime = int(starttime +
                      timedelta_to_seconds(datetime.timedelta(days=3,
                                                              minutes=1)))
        self.cluster = self.options.selected_cluster
        startdate, n_nodes = self._get_nodes(starttime, endtime)
        while not n_nodes:
            logger.info('No enough nodes found between %s and %s, ' +
                        'increasing time window',
                        format_date(starttime), format_date(endtime))
            starttime = endtime
            endtime = int(starttime +
                          timedelta_to_seconds(datetime.timedelta(days=3,
                                                                  minutes=1)))
            startdate, n_nodes = self._get_nodes(starttime, endtime)
            # give up after looking 6 weeks ahead
            if starttime > int(time.time() + timedelta_to_seconds(
                    datetime.timedelta(weeks=6))):
                logger.error('There are not enough nodes on %s for your ' +
                             'experiments, abort ...', self.cluster)
                exit()
        startdate = []
        jobs_specs = get_jobs_specs({self.cluster: n_nodes},
                                    name=self.__class__.__name__)
        sub = jobs_specs[0][0]
        # strip escaping added by get_jobs_specs so oarsub accepts it
        tmp = str(sub.resources).replace('\\', '')
        sub.resources = tmp.replace('"', '')
        sub.walltime = self.options.walltime
        sub.additional_options = '-t allow_classic_ssh -t besteffort'
        (self.oar_job_id, self.frontend) = oarsub(jobs_specs)[0]
        logger.info('Startdate: besteffort, n_nodes: %s', str(n_nodes))

    def _get_nodes(self, starttime, endtime):
        """Return (startdate, n_nodes) of the first slot offering enough
        nodes on self.cluster, or (False, False) when none does."""
        planning = get_planning(elements=[self.cluster],
                                starttime=starttime,
                                endtime=endtime,
                                out_of_chart=self.options.outofchart)
        slots = compute_slots(planning, self.options.walltime)
        startdate = slots[0][0]
        i_slot = 0
        n_nodes = slots[i_slot][2][self.cluster]
        logger.info("nodes %s in %s at %s", str(n_nodes),
                    str(self.cluster), format_date(startdate))
        while n_nodes < self.options.n_nodes:
            logger.debug(slots[i_slot])
            startdate = slots[i_slot][0]
            n_nodes = slots[i_slot][2][self.cluster]
            i_slot += 1
            if i_slot == len(slots) - 1:
                return False, False
        return startdate, n_nodes

    def workflow(self, comb, host, comb_dir):
        """Run one parameter combination on *host*, resuming from the NFS
        backup when one exists; mark the combination done/canceled."""
        comb_ok = False
        thread_name = style.Thread(str(host).split('.')[0]) + ': '
        logger.info(thread_name + 'Starting combination ' + slugify(comb))
        try:
            self.export = "source ~/aevol_binary/intel/linux/mkl/bin/mklvars.sh intel64; "
            bucketname = self.working_dir + '/raevol_5_mut_lat/' + \
                slugify(comb) + '/'
            if os.path.isdir(bucketname) and \
                    os.path.exists(bucketname + '/last_gener.txt'):
                logger.info(thread_name + "Resuming AEVOL from NFS backup")
                # with-block fixes a file handle leak (was never closed)
                with open(bucketname + '/last_gener.txt', 'r') as gen_file:
                    last_gen = gen_file.read()
                if int(last_gen) < 500000:
                    logger.info(thread_name + "Resuming AEVOL Run from " +
                                str(int(last_gen)))
                    rem = Remote(
                        self.export + 'cd ' + bucketname +
                        '; /home/jorouzaudcornabas/aevol_binary/aevol/src/aevol_run -p 16' +
                        ' -e 300000 -r ' + last_gen + ' >> aevol_run.log',
                        [host]).run()
                    if rem.ok:
                        comb_ok = True
                else:
                    # already past the target generation: nothing to do
                    comb_ok = True
            else:
                Remote('mkdir -p ' + bucketname, [host]).run()
                param_file = '/home/jorouzaudcornabas/aevol_binary/execo/mut_lat/param_tmpl.in'
                logger.info(thread_name + 'Generate config file ' +
                            param_file)
                # instantiate the parameter template into a temp file
                f_template = open(param_file)
                fd, outfile = mkstemp(dir='/tmp/',
                                      prefix=slugify(comb) + '_param')
                f = os.fdopen(fd, 'w')
                for line in f_template:
                    # NOTE(review): 'fuzzy', 'move' and 'height' are not
                    # produced by define_parameters() (seed/mutation/env/
                    # selection) — confirm the intended sweep; this branch
                    # would raise KeyError as-is.
                    line = line.replace('SEED_NUMBER', str(comb['seed']))
                    line = line.replace('FUZZY_VERSION', str(comb['fuzzy']))
                    if comb['move']:
                        line = line.replace('FIRST_GAUSSIAN_MEDIAN', '0.25')
                        line = line.replace('THIRD_GAUSSIAN_MEDIAN', '0.65')
                    else:
                        line = line.replace('FIRST_GAUSSIAN_MEDIAN', '0.2')
                        line = line.replace('THIRD_GAUSSIAN_MEDIAN', '0.6')
                    line = line.replace('GAUSSIAN_HEIGHT',
                                        str(comb['height']))
                    f.write(line)
                f_template.close()
                f.close()
                put_file = Put([host], [outfile],
                               remote_location=bucketname).run()
                if not put_file.ok:
                    exit()
                os.remove(outfile)
                Remote('cd ' + bucketname + '; cp ' +
                       outfile.split('/')[-1] + ' param.in', [host]).run()
                logger.info(thread_name + "Launching AEVOL Create")
                Remote(
                    self.export + 'cd ' + bucketname +
                    '; /home/jorouzaudcornabas/aevol_diff_area/aevol/src/aevol_create > aevol_create.log',
                    [host]).run()
                logger.info(thread_name + "Launching AEVOL Run")
                rem = Remote(
                    self.export + 'cd ' + bucketname +
                    '; /home/jorouzaudcornabas/aevol_diff_area/aevol/src/aevol_run -p 16 -n 500000 > aevol_run.log',
                    [host]).run()
                if rem.ok:
                    comb_ok = True
                # TODO(review): result retrieval to comb_dir was commented
                # out in the original; results stay in bucketname on NFS.
                logger.info(thread_name + 'Get results ' + comb_dir + "/" +
                            slugify(comb))
        finally:
            if comb_ok:
                self.sweeper.done(comb)
                logger.info(thread_name + ': ' + slugify(comb) +
                            ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(thread_name + ': ' + slugify(comb) +
                               ' has been canceled')
            logger.info(style.step('%s Remaining'),
                        len(self.sweeper.get_remaining()))

    def is_job_alive(self):
        """True while the OAR job is not in Error and its walltime has not
        elapsed."""
        rez = get_oar_job_info(self.oar_job_id)
        if rez['state'] == 'Error':
            return False
        return (rez["start_date"] + rez["walltime"]) > time.time()

    def get_immediately_available_nodes(self, list_of_clusters, walltime,
                                        ignore_besteffort=False):
        """Return (start_date, end_date, {cluster: n_nodes}) for the nodes
        free right now on the given clusters."""
        # bug fix: the ignore_besteffort parameter was previously ignored
        # (hard-coded False in the get_planning call)
        planning = get_planning(list_of_clusters,
                                ignore_besteffort=ignore_besteffort)
        slots = compute_slots(planning, walltime)
        wanted = {cluster: 0 for cluster in list_of_clusters}
        start_date, end_date, resources = find_first_slot(slots, wanted)
        actual_resources = {
            resource: n_nodes
            for resource, n_nodes in resources.items()
            if resource in list_of_clusters and n_nodes > 0
        }
        return start_date, end_date, actual_resources

    def submit_all_available_best_effort(self, list_of_clusters, walltime):
        """Submit a besteffort job over every immediately available node
        and record (self.oar_job_id, self.frontend)."""
        start_date, end_date, resources = \
            self.get_immediately_available_nodes(list_of_clusters, walltime,
                                                 ignore_besteffort=False)
        job_specs = get_jobs_specs(resources)
        for j, f in job_specs:
            j.job_type = "besteffort"
            j.additional_options = "-t allow_classic_ssh"
        (self.oar_job_id, self.frontend) = oarsub(job_specs)[0]
        print(job_specs)
        return self.oar_job_id
class paasage_simu(Engine):
    """Execo engine running the PaaSage SGCB n-tier simulation over a
    grid-wide deploy reservation, one combination per core."""

    # external tool names/paths used by the simulation environment
    JVM = 'java'
    SGCBJAR = 'SGCB_nTier.jar'
    PJDUMP = 'pj_dump'
    RSCRIPT = 'Rscript'

    def __init__(self):
        """Overloading class initialization with parent and adding options"""
        super(paasage_simu, self).__init__()
        self.options_parser.set_usage("usage: %prog ")
        self.options_parser.set_description("Execo Engine that can be used to" + \
            "perform automatic virtual machines experiments")
        self.options_parser.add_option("-n",
                                       dest="n_nodes",
                                       help="maximum number of nodes used",
                                       type="int",
                                       default=200)
        self.options_parser.add_option("-w",
                                       dest="walltime",
                                       help="walltime for the reservation",
                                       type="string",
                                       default="05:00:00")
        self.options_parser.add_option("-j",
                                       dest="oargrid_job_id",
                                       help="oargrid_job_id to relaunch an engine",
                                       type=int)
        self.options_parser.add_option("-k",
                                       dest="keep_alive",
                                       help="keep reservation alive ..",
                                       action="store_true")

    def run(self):
        """Main control loop: reserve and deploy hosts, then dispatch one
        workflow thread per available core until the sweep is exhausted."""
        if self.options.oargrid_job_id:
            self.oargrid_job_id = self.options.oargrid_job_id
        else:
            self.oargrid_job_id = None

        try:
            # Creation of the main iterator which is used for the first
            # control loop.
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                # If no job, we make a reservation and prepare the hosts
                # for the experiments
                if self.oargrid_job_id is None:
                    self.make_reservation()
                # Wait that the job starts
                logger.info('Waiting that the job start')
                wait_oargrid_job_start(self.oargrid_job_id)
                # Retrieving the hosts and subnets parameters
                self.hosts = get_oargrid_job_nodes(self.oargrid_job_id)
                # Hosts deployment and configuration
                default_connection_params['user'] = '******'
                logger.info("Start hosts configuration")
                ex_log.setLevel('INFO')
                deployment = Deployment(
                    hosts=self.hosts,
                    env_file='/home/sirimie/env/mywheezy-x64-base.env')
                self.hosts, _ = deploy(deployment)

                # clean any leftovers from a previous run and push the
                # simulation scripts/configs to every host
                Remote("rm -f /home/Work/sgcbntier/paasage_demo/csv/REQTASK_*",
                       self.hosts).run()
                Remote(
                    "rm -f /home/Work/sgcbntier/paasage_demo/platform_aws.xml",
                    self.hosts).run()
                Remote("rm -f /home/Work/sgcbntier/paasage_demo/cloud_ec2.xml",
                       self.hosts).run()
                Put(self.hosts,
                    ["run_all_execo.py", "xml_gen_execo.py", "conf.xml",
                     "platform_aws.xml", "cloud_ec2.xml"],
                    remote_location="/home/Work/sgcbntier/paasage_demo/").run()
                logger.info("Done")

                if len(self.hosts) == 0:
                    break

                # Initializing the resources and threads: one slot per core
                # of each deployed host
                available_hosts = [
                    host for host in self.hosts
                    for i in range(get_host_attributes(host)
                                   ['architecture']['smt_size'])
                ]
                threads = {}

                # Creating the unique folder for storing the results
                comb_dir = self.result_dir + '/csv_results'
                if not os.path.exists(comb_dir):
                    os.mkdir(comb_dir)

                # Checking that the job is running and not in Error
                while self.is_job_alive() or len(threads.keys()) > 0:
                    job_is_dead = False
                    # wait until a core is freed by a finished thread
                    while self.options.n_nodes > len(available_hosts):
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                available_hosts.append(
                                    tmp_threads[t]['host'])
                                del threads[t]
                        sleep(5)
                        if not self.is_job_alive():
                            job_is_dead = True
                            break
                    if job_is_dead:
                        break

                    # Getting the next combination
                    comb = self.sweeper.get_next()
                    if not comb:
                        # sweep exhausted: drain the running threads
                        while len(threads.keys()) > 0:
                            tmp_threads = dict(threads)
                            for t in tmp_threads:
                                if not t.is_alive():
                                    del threads[t]
                            logger.info('Waiting for threads to complete')
                            sleep(20)
                        break

                    host = available_hosts[0]
                    available_hosts = available_hosts[1:]

                    t = Thread(target=self.workflow,
                               args=(comb, host, comb_dir))
                    threads[t] = {'host': host}
                    t.daemon = True
                    t.start()

                if not self.is_job_alive():
                    job_is_dead = True

                if job_is_dead:
                    # force a new reservation on the next outer iteration
                    self.oargrid_job_id = None
        finally:
            if self.oargrid_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oargriddel([self.oargrid_job_id])
                else:
                    logger.info('Keeping job alive for debugging')

    def define_parameters(self):
        """Build the persistent sweeper from the conf.xml description."""
        parameters = self.get_parameters("conf.xml")
        sweeps = sweep(parameters)
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweeps)
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))

    def make_reservation(self):
        """Reserve up to n_nodes grid5000 nodes in deploy mode."""
        logger.info('Performing reservation')
        starttime = int(time.time() +
                        timedelta_to_seconds(datetime.timedelta(minutes=1)))
        planning = get_planning(elements=['grid5000'], starttime=starttime)
        slots = compute_slots(planning, self.options.walltime)
        wanted = {"grid5000": 0}
        start_date, end_date, resources = find_first_slot(slots, wanted)
        wanted['grid5000'] = min(resources['grid5000'],
                                 self.options.n_nodes)
        actual_resources = distribute_hosts(resources, wanted)
        job_specs = get_jobs_specs(actual_resources, name='Paasage_Simu')
        logger.info("try to reserve " + str(actual_resources))
        self.oargrid_job_id, _ = oargridsub(job_specs,
                                            start_date,
                                            walltime=end_date - start_date,
                                            job_type="deploy")
        logger.info("Reservation done")

    def create_string(self, param):
        """Serialize a combination dict as 'key_value_' pairs for the
        simulation command line and trace file names."""
        res_str = ""
        # items() instead of Py2-only iteritems()
        for key, value in param.items():
            res_str += key + "_" + str(value) + "_"
        return res_str

    def workflow(self, comb, host, comb_dir):
        """Run one combination on *host* and fetch its CSV trace back into
        *comb_dir*; mark the combination done/canceled."""
        comb_ok = False
        thread_name = style.Thread(host.split('.')[0]) + ': '
        logger.info(thread_name + 'Starting combination ' + slugify(comb))
        try:
            logger.info(thread_name + 'Generate conf file')
            param_str = self.create_string(comb)
            Remote(
                "python /home/Work/sgcbntier/paasage_demo/xml_gen_execo.py --cb "
                + param_str, [host]).run()

            logger.info(thread_name + 'Run code')
            Remote(
                "cd /home/Work/sgcbntier/paasage_demo/ ; python run_all_execo.py --cb %s"
                % param_str, [host]).run()

            logger.info(thread_name + 'Get results')
            traceFile = "ntier_" + param_str
            get_results = Get(
                [host],
                ["/home/Work/sgcbntier/paasage_demo/csv/REQTASK_" +
                 traceFile + ".csv"],
                local_location=comb_dir).run()
            for p in get_results.processes:
                if not p.ok:
                    logger.error(
                        host +
                        ': Unable to retrieve the files for combination %s',
                        slugify(comb))
                    exit()
            comb_ok = True
        finally:
            if comb_ok:
                self.sweeper.done(comb)
                logger.info(thread_name + ': ' + slugify(comb) +
                            ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(thread_name + ': ' + slugify(comb) +
                               ' has been canceled')
            logger.info(style.step('%s Remaining'),
                        len(self.sweeper.get_remaining()))

    def is_job_alive(self):
        """True while the oargrid job's walltime has not elapsed."""
        rez = get_oargrid_job_info(self.oargrid_job_id)
        return (rez["start_date"] + rez["walltime"]) > time.time()

    def get_parameters(self, file_name):
        """Get the parameters to sweep, from the configuration file.

        Each <instance type=... quantity=...> yields a list of quantities:
        either the single given number, or the inclusive range 'a-b'.
        """
        tree = ET.parse(file_name)
        rootSrc = tree.getroot()
        param = dict()
        for inst in rootSrc.iter("instance"):
            ty = inst.get("type")
            qt = inst.get("quantity")
            if qt.isdigit():
                # bug fix: was param[ty] = qt (a raw string), which made
                # sweep() iterate over its characters — wrong for any
                # multi-digit quantity, and inconsistent with the int list
                # produced by the range branch below
                param[ty] = [int(qt)]
            else:
                ends = qt.split("-")
                param[ty] = range(int(ends[0]), int(ends[1]) + 1)
        return param
class compil_aevol(Engine):
    """Engine compiling the aevol / raevol simulator for every combination
    of BLAS backend, experiment flavour, compiler and parallel runtime.

    Builds run locally (execo Process), one combination at a time; binaries
    are copied into one directory per combination.
    """

    def __init__(self):
        """Overloading class initialization with parent and adding options"""
        super(compil_aevol, self).__init__()

    def run(self):
        """Main loop: compile every remaining parameter combination."""
        try:
            # Creation of the main iterator which is used for the first
            # control loop.
            self.define_parameters()

            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:
                comb = self.sweeper.get_next()
                if comb:
                    self.workflow(comb)
        finally:
            logger.info("Compilation DONE")

    def define_parameters(self):
        """Build the sweeper over the four compilation axes."""
        parameters = {
            'blas': ['none', 'mkl', 'atlas', 'openblas'],
            'experiment': ['aevol', 'raevol'],
            'compilator': ['gcc', 'intel'],
            'parallel': ['openmp', 'tbb']
        }
        sweeps = sweep(parameters)
        # Persist sweeper state so an interrupted run resumes where it stopped.
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweeps)
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))

    def workflow(self, comb):
        """Configure and build one combination, then mark it done/canceled."""
        comb_ok = False
        logger.info(slugify(comb) + \
                    ' starts to compile')
        try:
            # Intel compiler environment; prepended even for gcc builds.
            export = "source /opt/intel/bin/compilervars.sh intel64; "

            src_directory = "/home/arrouan/workspace/aevol/git/world/aevol/"
            bin_directory = "/home/arrouan/workspace/aevol/compiled_binary/"

            # Translate the combination into ./configure flags.
            configure_option = "--with-tracing --without-x"

            if comb['parallel'] == 'tbb':
                configure_option += " --with-tbb"

            if comb['blas'] == 'openblas':
                configure_option += " --with-blas"
            elif comb['blas'] == 'mkl':
                configure_option += " --with-mkl"
            elif comb['blas'] == 'atlas':
                configure_option += " --with-atlas"

            if comb['experiment'] == 'raevol':
                configure_option += " --with-raevol"

            if comb['compilator'] == 'intel':
                configure_option += " CXX=icc"

            # One output directory per combination,
            # e.g. .../aevol_gcc_openmp_mkl
            full_bin_directory = bin_directory + comb['experiment'] + '_' + \
                comb['compilator'] + '_' + comb['parallel'] + '_' + comb['blas']

            try:
                os.mkdir(full_bin_directory)
            except:
                # NOTE(review): bare except assumes "directory already
                # exists" and empties it — any other OSError (permissions,
                # bad path) is silently treated the same way.
                for f in os.listdir(full_bin_directory):
                    os.remove(full_bin_directory + "/" + f)

            p = Process(export + 'cd ' + src_directory +
                        '; autoreconf; ./configure ' + configure_option +
                        '; make clean; make; cp src/aevol_run ' +
                        full_bin_directory + '/; cp src/aevol_create ' +
                        full_bin_directory + '/')
            p.shell = True
            # p.run()
            # NOTE(review): p.run() is commented out, so the build command is
            # never executed and p.stdout below is empty — presumably disabled
            # for a dry run; confirm before re-enabling.
            print p.stdout
            comb_ok = True
        finally:
            # Record the outcome in the sweeper (done, or re-queued for retry).
            if comb_ok:
                self.sweeper.done(comb)
                logger.info(slugify(comb) + \
                            ' has been done')
            else:
                self.sweeper.cancel(comb)
                logger.warning(slugify(comb) + \
                               ' has been canceled')
            logger.info(style.step('%s Remaining'),
                        len(self.sweeper.get_remaining()))

    def is_job_alive(self):
        """Return True while the OAR job's walltime has not elapsed."""
        rez = get_oar_job_info(self.oar_job_id)
        if (rez["start_date"] + rez["walltime"] > time.time()):
            return True
        else:
            return False
class oar_replay_workload(Engine):
    """Engine that deploys a full OAR cluster (server + nodes) on a
    Grid'5000 deploy job and replays recorded workloads against it.

    Results (logs, gantt outputs, copies of the workloads and of the JSON
    config) are collected under self.result_dir.
    """

    def init(self):
        # Register command-line options on the parser provided by Engine.
        parser = self.options_parser
        parser.add_option('--is_a_test',
                          dest='is_a_test',
                          action='store_true',
                          default=False,
                          help='prefix the result folder with "test", enter '
                          'a debug mode if fails and remove the job '
                          'afterward, unless it is a reservation')
        parser.add_option('--already_configured',
                          dest='already_configured',
                          action='store_true',
                          default=False,
                          help='if set, the OAR cluster is not re-configured')
        parser.add_option('--reservation_id',
                          help="Grid'5000 reservation job ID")
        # NOTE(review): add_option and add_argument are mixed on the same
        # parser — relies on execo's options parser supporting both; confirm.
        parser.add_argument('experiment_config',
                            'The config JSON experiment description file')

    def setup_result_dir(self):
        """Create a timestamped result directory ('test_' prefix in test mode)."""
        is_a_test = self.options.is_a_test
        run_type = ""
        if is_a_test:
            run_type = "test_"
        self.result_dir = script_path + '/' + run_type + 'results_' + \
            time.strftime("%Y-%m-%d--%H-%M-%S")
        # NOTE(review): 'resutlt' is a typo, but it is a runtime log string —
        # left untouched in this documentation-only pass.
        logger.info('resutlt directory: {}'.format(self.result_dir))

    def run(self):
        """Run the experiment"""
        already_configured = self.options.already_configured
        reservation_job_id = int(self.options.reservation_id) \
            if self.options.reservation_id is not None else None
        is_a_test = self.options.is_a_test

        if is_a_test:
            logger.warn('THIS IS A TEST! This run will use only a few '
                        'resources')

        # make the result folder writable for all
        os.chmod(self.result_dir, 0o777)
        # Import configuration
        with open(self.args[0]) as config_file:
            config = json.load(config_file)
        # backup configuration
        copy(self.args[0], self.result_dir)

        site = config["grid5000_site"]
        resources = config["resources"]
        nb_experiment_nodes = config["nb_experiment_nodes"]
        walltime = str(config["walltime"])
        env_name = config["kadeploy_env_name"]
        workloads = config["workloads"]
        # check if workloads exists (Suppose that the same NFS mount point
        # is present on the remote and the local environment
        for workload_file in workloads:
            with open(workload_file):
                pass
            # copy the workloads files to the results dir
            copy(workload_file, self.result_dir)

        # define the workloads parameters
        self.parameters = {'workload_filename': workloads}
        logger.info('Workloads: {}'.format(workloads))

        # define the iterator over the parameters combinations
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))

        # Due to previous (using -c result_dir) run skip some combination
        logger.info('Skipped parameters:' +
                    '{}'.format(str(self.sweeper.get_skipped())))
        logger.info('Number of parameters combinations {}'.format(
            str(len(self.sweeper.get_remaining()))))
        logger.info('combinations {}'.format(str(
            self.sweeper.get_remaining())))

        # Reuse an existing reservation when one was given on the CLI,
        # otherwise submit a new deploy job.
        if reservation_job_id is not None:
            jobs = [(reservation_job_id, site)]
        else:
            jobs = oarsub([(OarSubmission(resources=resources,
                                          job_type='deploy',
                                          walltime=walltime), site)])
        job_id, site = jobs[0]
        if job_id:
            try:
                logger.info("waiting job start %s on %s" % (job_id, site))
                wait_oar_job_start(job_id, site,
                                   prediction_callback=prediction_callback)
                logger.info("getting nodes of %s on %s" % (job_id, site))
                nodes = get_oar_job_nodes(job_id, site)
                # sort the nodes
                nodes = sorted(nodes, key=lambda node: node.address)
                # get only the necessary nodes under the switch
                if nb_experiment_nodes > len(nodes):
                    raise RuntimeError('The number of given node in the '
                                       'reservation ({}) do not match the '
                                       'requested resources '
                                       '({})'.format(len(nodes),
                                                     nb_experiment_nodes))
                nodes = nodes[:nb_experiment_nodes]
                logger.info("deploying nodes: {}".format(str(nodes)))
                # When already_configured, the deployed-check command skips
                # re-deployment of nodes that already run the environment.
                deployed, undeployed = deploy(
                    Deployment(nodes, env_name=env_name),
                    check_deployed_command=already_configured)
                if undeployed:
                    logger.warn("NOT deployed nodes: {}".format(
                        str(undeployed)))
                    raise RuntimeError('Deployement failed')

                if not already_configured:

                    # install OAR.  nodes[0] is the OAR server, the rest are
                    # compute nodes; node installs run concurrently with the
                    # server install (start() now, wait() later).
                    install_cmd = "apt-get update; apt-get install -y "
                    node_packages = "oar-node"
                    logger.info("installing OAR nodes: {}".format(
                        str(nodes[1:])))
                    install_oar_nodes = Remote(
                        install_cmd + node_packages,
                        nodes[1:],
                        connection_params={'user': '******'})
                    install_oar_nodes.start()

                    server_packages = (
                        "oar-server oar-server-pgsql oar-user "
                        "oar-user-pgsql postgresql python3-pip "
                        "libjson-perl postgresql-server-dev-all")
                    # Install the oar3 kamelot scheduler from a pinned commit.
                    install_oar_sched_cmd = """
                    mkdir -p /opt/oar_sched; \
                    cd /opt/oar_sched; \
                    git clone https://github.com/oar-team/oar3.git; \
                    cd oar3; \
                    git checkout dce942bebc2; \
                    pip3 install -e .; \
                    cd /usr/lib/oar/schedulers; \
                    ln -s /usr/local/bin/kamelot; \
                    pip3 install psycopg2
                    """
                    logger.info("installing OAR server node: {}".format(
                        str(nodes[0])))
                    install_master = SshProcess(
                        install_cmd + server_packages + ";" +
                        install_oar_sched_cmd,
                        nodes[0],
                        connection_params={'user': '******'})
                    install_master.run()
                    install_oar_nodes.wait()

                    if not install_master.ok:
                        Report(install_master)

                    # Point OAR at the local Pg database, raise the log level
                    # and disable cpuset/finaud management via in-place sed.
                    configure_oar_cmd = """
                    sed -i \
                        -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                        -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                        -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                        -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                        -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                        -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                        -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                        -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                        -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                        -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                        -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                        /etc/oar/oar.conf
                    """.format(result_dir=self.result_dir)
                    configure_oar = Remote(configure_oar_cmd,
                                           nodes,
                                           connection_params={'user': '******'})
                    configure_oar.run()
                    logger.info("OAR is configured on all nodes")

                    # Configure server
                    create_db = "oar-database --create --db-is-local"
                    config_oar_sched = (
                        "oarnotify --remove-queue default;"
                        "oarnotify --add-queue default,1,kamelot")
                    start_oar = "systemctl start oar-server.service"
                    logger.info("configuring OAR database: {}".format(
                        str(nodes[0])))
                    config_master = SshProcess(
                        create_db + ";" + config_oar_sched + ";" + start_oar,
                        nodes[0],
                        connection_params={'user': '******'})
                    config_master.run()

                    # propagate SSH keys: fetch the server's oar key pair to
                    # /tmp locally, then push it to every compute node.
                    logger.info("configuring OAR SSH")
                    oar_key = "/tmp/.ssh"
                    Process('rm -rf ' + oar_key).run()
                    Process(
                        'scp -o BatchMode=yes -o PasswordAuthentication=no '
                        '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                        '-o ConnectTimeout=20 -rp -o User=root ' +
                        nodes[0].address + ":/var/lib/oar/.ssh"
                        ' ' + oar_key).run()
                    # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run()
                    Put(nodes[1:], [oar_key],
                        "/var/lib/oar/",
                        connection_params={'user': '******'}).run()

                    # Declare the OAR resources: one entry per compute node,
                    # hard-coded at 1 cpu / 4 cores / 16 units of memory.
                    add_resources_cmd = """
                    oarproperty -a cpu || true; \
                    oarproperty -a core || true; \
                    oarproperty -c -a host || true; \
                    oarproperty -a mem || true; \
                    """
                    for node in nodes[1:]:
                        add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(
                            node=node.address)

                    add_resources = SshProcess(
                        add_resources_cmd,
                        nodes[0],
                        connection_params={'user': '******'})
                    add_resources.run()

                    if add_resources.ok:
                        logger.info("oar is now configured!")
                    else:
                        raise RuntimeError(
                            "error in the OAR configuration: Abort!")

                # TODO: back up the OAR configuration

                # Do the replay
                logger.info('begining the replay')
                while len(self.sweeper.get_remaining()) > 0:
                    combi = self.sweeper.get_next()
                    workload_file = os.path.basename(
                        combi['workload_filename'])
                    oar_replay = SshProcess(
                        script_path + "/oar_replay.py " +
                        combi['workload_filename'] + " " + self.result_dir +
                        " oar_gant_" + workload_file,
                        nodes[0])
                    # Tee the replay's stdout into a per-workload .out file.
                    oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                      workload_file + '.out')
                    logger.info("replaying workload: {}".format(combi))
                    oar_replay.run()
                    if oar_replay.ok:
                        logger.info("Replay workload OK: {}".format(combi))
                        self.sweeper.done(combi)
                    else:
                        logger.info("Replay workload NOT OK: {}".format(combi))
                        self.sweeper.cancel(combi)
                        raise RuntimeError("error in the OAR replay: Abort!")

            except:
                # Interactive post-mortem on any failure (ipdb debugger).
                traceback.print_exc()
                ipdb.set_trace()

            finally:
                if is_a_test:
                    ipdb.set_trace()
                # Only delete jobs we submitted ourselves; a user-supplied
                # reservation is left alone.
                if reservation_job_id is None:
                    logger.info("delete job: {}".format(jobs))
                    oardel(jobs)
class DVFS(Engine):
    """Engine running NPB MPI benchmarks under CPU-frequency (DVFS)
    settings and collecting sar system metrics, kwapi power data and Intel
    APS performance snapshots for each combination."""

    def __init__(self, result_dir, cluster, site):
        Engine.__init__(self)
        self.result_dir = result_dir  # where sweeps and per-comb outputs go
        self.cluster = cluster        # host names; cluster[0] acts as master
        self.site = site              # Grid'5000 site, used for power metrics

    def run(self):
        """Inherited method, put here the code for running the engine"""
        self.define_parameters()
        self.run_xp()

    def define_parameters(self):
        """Build the sweeper: frequency x NPB class x benchmark x core count."""
        nbNodes = len(self.cluster)
        # build parameters and make nbCore list per benchmark
        freqList = [2534000, 2000000, 1200000]
        n_nodes = float(len(self.cluster))
        # Total core count = nodes * cores-per-node (counted on first host).
        max_core = SshProcess('cat /proc/cpuinfo | grep -i processor |wc -l',
                              self.cluster[0],
                              connection_params={'user': '******'
                                                 }).run().stdout
        max_core = n_nodes * float(max_core)
        # Powers of two strictly between n_nodes and max_core.
        even = filter(
            lambda i: i > n_nodes,
            list(takewhile(lambda i: i < max_core,
                           (2**i for i in count(0, 1)))))
        # Perfect squares in the same range (SP/BT want square process counts).
        powerTwo = filter(
            lambda i: i > n_nodes,
            list(takewhile(lambda i: i < max_core,
                           (i**2 for i in count(0, 1)))))

        # Define parameters.  Most benchmarks are disabled; only 'sp' is
        # swept at the moment, at a single fixed frequency.
        self.parameters = {
            'Repeat': [1],
            "Freq": [2534000],
            "NPBclass": ['C'],
            "Benchmark": {
                # 'ft': {
                #     'n_core': even
                # },
                # 'ep': {
                #     'n_core': even
                # },
                # 'lu': {
                #     'n_core': even
                # },
                # 'is': {
                #     'n_core': even
                # },
                # 'sg': {
                #     'n_core': even
                # },
                # 'bt': {
                #     'n_core': powerTwo
                # },
                'sp': {
                    'n_core': powerTwo
                }
            }
        }
        logger.info(self.parameters)

        # make all possible parameters object,
        self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                    sweep(self.parameters))
        logger.info('Number of parameters combinations %s',
                    len(self.sweeper.get_remaining()))

    def run_xp(self):
        """Iterate over the parameters and execute the bench"""
        master = self.cluster[0]
        opt = ''
        while len(self.sweeper.get_remaining()) > 0:
            # Take sweeper
            comb = self.sweeper.get_next()
            logger.info('Processing new combination %s' % (comb, ))

            try:
                # metric from linux sar tools, works with clock
                def takeMetric(path,
                               startTime,
                               endTime,
                               metric=['cpu', 'mem', 'disk', 'swap',
                                       'network']):
                    # Dump each requested sar metric, per host, into
                    # <path>/<host>-<metric>.txt for the given time window.
                    # NOTE(review): mutable default argument — shared across
                    # calls; harmless here because it is never mutated.
                    opt = ''
                    cmd_template_sar = (
                        "sar -f /var/log/sysstat/sa* -{opt} -s {startTime} -e {endTime}"
                    )
                    for met in metric:
                        # Map the metric name to the matching sar flag.
                        if met == 'cpu':
                            opt = 'u'
                        elif met == 'mem':
                            opt = 'r'
                        elif met == 'disk':
                            opt = 'dp'
                        elif met == 'swap':
                            opt = 'S'
                        elif met == 'network':
                            opt = 'n DEV'

                        cmd = cmd_template_sar.format(opt=opt,
                                                      startTime=startTime,
                                                      endTime=endTime)
                        for host in self.cluster:
                            hE = SshProcess(
                                cmd,
                                host,
                                connection_params={'user': '******'})
                            hE.run()
                            stdMetric = host + '-' + met + '.txt'
                            with open(os.path.join(path, stdMetric),
                                      "w") as sout:
                                sout.write(hE.stdout)

                # Set CPU Freq and Policy according current combination
                cmd_template_Freq_Policy = ("cpufreq-set -r -g {policy}")
                cmd_template_Freq = ("cpufreq-set -r -f {freq}")

                if comb['Freq'] == 'OnDemand':
                    cmd_freq_policy = cmd_template_Freq_Policy.format(
                        policy='ondemand')
                    Remote(cmd_freq_policy,
                           master,
                           connection_params={'user': '******'}).run()
                elif comb['Freq'] == 'conservative':
                    cmd_freq_policy = cmd_template_Freq_Policy.format(
                        policy='conservative')
                    Remote(cmd_freq_policy,
                           master,
                           connection_params={'user': '******'}).run()
                else:
                    # Numeric frequency: pin the userspace governor, then set
                    # the exact frequency.
                    # NOTE(review): nesting reconstructed from a mangled
                    # paste; with the current Freq list ([2534000]) this
                    # branch is always taken, so behavior is unchanged either
                    # way — confirm against the original layout.
                    cmd_freq_policy = cmd_template_Freq_Policy.format(
                        policy='userspace')
                    Remote(cmd_freq_policy,
                           master,
                           connection_params={'user': '******'}).run()
                    cmd_freq = cmd_template_Freq.format(freq=comb['Freq'])
                    Remote(cmd_freq,
                           master,
                           connection_params={'user': '******'}).run()

                # build command
                src = 'source /opt/intel-performance-snapshoot/apsvars.sh'
                cmd_mpirun_template = (
                    "mpirun {opt} -f /root/cluster.txt -np {pr1} aps -r '/tmp/log/' /tmp/NPB/npb-mpi/bin/{typeNPB}.{NPBclass}.{pr2}"
                )
                cmd_mpirun = cmd_mpirun_template.format(
                    opt='',
                    pr1=comb['n_core'],
                    typeNPB=comb['Benchmark'],
                    NPBclass=comb['NPBclass'],
                    pr2=comb['n_core'])
                cmd = "{}; /tmp/NPB/bin/runMPI.sh '{}' '{}'".format(
                    src, cmd_mpirun, slugify(comb))

                # NOTE(review): plain concatenation — result_dir presumably
                # ends with a separator; otherwise paths fuse. Confirm.
                curPath = self.result_dir + slugify(comb)

                # run Mpi through execo remote SshProcess
                def runMpi(cmd):
                    # Run the benchmark on the master, persist stdout/stderr
                    # under curPath, return the process success flag.
                    act = SshProcess(cmd,
                                     master,
                                     connection_params={'user': '******'},
                                     shell=True)
                    act.run()

                    if not os.path.exists(curPath):
                        os.makedirs(curPath)
                    with open(os.path.join(curPath, "stdout.txt"),
                              "a+") as sout, open(
                                  os.path.join(curPath, "stderr.txt"),
                                  "w") as serr:
                        sout.write(act.stdout)
                        serr.write(act.stderr)
                    return act.ok

                # start clock and exec command in the master node
                time.sleep(5)
                startUnix = int(time.time())
                start24Hour = datetime.datetime.fromtimestamp(
                    startUnix).strftime('%H:%M:%S')
                task1 = runMpi(cmd)
                endUnix = int(time.time())
                end24Hour = datetime.datetime.fromtimestamp(endUnix).strftime(
                    '%H:%M:%S')
                time.sleep(5)

                # Record wall-clock timing for this run.
                with open(os.path.join(curPath, "executionTime.txt"),
                          "w") as sout:
                    sout.write(
                        'ExecTime:{}\nStartDate:{}\nEndDate:{}\n'.format(
                            str(endUnix - startUnix), start24Hour, end24Hour))
                takeMetric(curPath, start24Hour, end24Hour,
                           ['cpu', 'mem', 'disk', 'swap', 'network'])

                # collect power from kWAPI: grid5000 infrastructure made tool
                for hostname in self.cluster:
                    powerOut = '{}_power'.format(hostname)
                    collect_metric(startUnix, endUnix, 'power', curPath,
                                   self.site, powerOut, hostname)

                st = '/tmp/out/' + slugify(comb)
                intelAppPerf = str(st + '.html')
                # get the data from ['Application Performance Snapshot',
                # 'Storage Performance Snapshot']
                # https://software.intel.com/en-us/performance-snapshot
                Get(master, [intelAppPerf],
                    curPath,
                    connection_params={'user': '******'}).run()

                if task1:
                    logger.info("comb ok: %s" % (comb, ))
                    self.sweeper.done(comb)
                    continue

            except OSError as err:
                print("OS error: {0}".format(err))
            except ValueError:
                print("Could not convert data to an integer.")
            except:
                # Unknown failure: report and re-raise (aborts the engine).
                print("Unexpected error:", sys.exc_info()[0])
                raise

            # Reached on any handled failure or when the run was not ok.
            logger.info("comb NOT ok: %s" % (comb, ))
            self.sweeper.cancel(comb)