def prepare_engine(submit_type='local', duration_job_min=60*4):
    """Create and return a computation engine for job submission.

    Parameters
    ----------
    submit_type : str
        'slurm' for a Slurm batch engine, 'local' for an in-process
        serial engine.
    duration_job_min : int
        Maximum walltime per job in minutes.

    Returns
    -------
    SlurmComputationEngine or SerialComputationEngine

    Raises
    ------
    ValueError
        If submit_type is neither 'slurm' nor 'local'.
    """
    # ---------------------
    Log.set_loglevel(20)  # 20 == logging.INFO
    logger.info("Start")

    foldername = expanduser("~") + '/slurm_jobs'
    if not os.path.exists(foldername):
        os.makedirs(foldername)
    logger.info("Setting engine folder to %s" % foldername)

    logger.info("Creating batch parameter instance")
    # Extra #SBATCH line injected verbatim into generated batch scripts.
    johns_slurm_hack = "#SBATCH --partition=intel-ivy,wrkstn,compute"
    timestr = time.strftime("%Y%m%d-%H%M%S")
    batch_parameters = BatchClusterParameters(
        max_walltime=duration_job_min,
        foldername=foldername,
        job_name_base="sim_" + timestr + "_",
        parameter_prefix=johns_slurm_hack)

    if submit_type == 'slurm':
        logger.info("Creating slurm engine instance")
        engine = SlurmComputationEngine(batch_parameters)
    elif submit_type == "local":
        logger.info("Creating serial engine instance")
        engine = SerialComputationEngine()
    else:
        # BUG FIX: previously fell through with `engine` unbound, raising
        # UnboundLocalError at the return below for any other submit_type.
        raise ValueError("Unknown submit_type: %s" % submit_type)
    # ---------------------
    return engine
def compute(self):
    """Run one (method, repetition, sample-size) trial: sample data,
    split train/test, call the job function, submit and save the result.

    Reads self.p, self.data_source, self.rep, self.n, self.job_func,
    self.prob_label. Uses module-level tr_proportion, alpha, ex, glo.
    """
    p = self.p
    data_source = self.data_source
    r = self.rep
    n = self.n
    job_func = self.job_func
    # seed with the repetition index so each rep draws different data
    data = data_source.sample(n, seed=r)
    with util.ContextTimer() as t:
        tr, te = data.split_tr_te(tr_proportion=tr_proportion, seed=r + 21)
        prob_label = self.prob_label
        logger.info("computing. %s. prob=%s, r=%d,\
            n=%d" % (job_func.__name__, prob_label, r, n))
        job_result = job_func(p, data_source, tr, te, r)

        # create ScalarResult instance
        result = SingleResult(job_result)
        # submit the result to my own aggregator
        self.aggregator.submit_result(result)
        func_name = job_func.__name__
    logger.info("done. ex2: %s, prob=%s, r=%d, n=%d. Took: %.3g s " %
                (func_name, prob_label, r, n, t.secs))

    # save result
    fname = "%s-%s-n%d_r%d_a%.3f_trp%.2f.p" % (
        prob_label,
        func_name,
        n,
        r,
        alpha,
        tr_proportion,
    )
    glo.ex_save_result(ex, job_result, prob_label, fname)
def compute(self):
    """Run one (method, repetition) trial and record its test result."""
    func = self.job_func
    rep = self.rep
    size = self.n
    logger.info("computing. %s. r=%d, n=%d"%(func.__name__, rep, size))

    # Draw this repetition's sample and split it into train/test parts.
    drawn = self.sample_source.sample(size, seed=rep)
    tr, te = drawn.split_tr_te(tr_proportion=tr_proportion, seed=rep + 20)

    label = self.prob_label
    test_result = func(label, tr, te, rep, self.ni, size)

    # Wrap the outcome and hand it to this job's own aggregator.
    self.aggregator.submit_result(SingleResult(test_result))
    logger.info("done. ex1: %s, r=%d, n=%d, "%(func.__name__, rep, size))

    # Persist the raw result to disk as well.
    fname = '%s-%s-J%d_r%d_n%d_a%.3f_trp%.2f.p' \
        %(label, func.__name__, J, rep, size, alpha, tr_proportion)
    glo.ex_save_result(ex, test_result, label, fname)
def compute(self):
    """Run one (method, repetition) trial; d is the data dimension."""
    src = self.sample_source
    rep = self.rep
    dim = src.dim()
    func = self.job_func
    logger.info("computing. %s. r=%d, d=%d"%(func.__name__, rep, dim))

    # sample_size is a global variable
    drawn = src.sample(sample_size, seed=rep)
    tr, te = drawn.split_tr_te(tr_proportion=tr_proportion, seed=rep + 20)

    label = self.prob_label
    test_result = func(src, tr, te, rep, self.n_locs)

    # Submit a SingleResult to my own aggregator.
    self.aggregator.submit_result(SingleResult(test_result))
    logger.info("done. ex2: %s, r=%d, d=%d, "%(func.__name__, rep, dim))

    # Save the result; self.n_locs plays the role of J in the filename.
    fname = '%s-%s-J%d_n%d_r%d_a%.3f_trp%.2f.p' \
        %(label, func.__name__, self.n_locs, sample_size, rep, alpha,
          tr_proportion)
    glo.ex_save_result(ex, test_result, label, fname)
def compute(self):
    """Run one two-sample-test trial on data from a PairedSource.

    Samples paired data, splits train/test, runs the job function under a
    timer, then submits and saves the result.
    """
    # randomly wait a few seconds so that multiple processes accessing the same
    # Theano function do not cause a lock problem. I do not know why.
    # I do not know if this does anything useful.
    # Sleep in seconds.
    # BUG FIX: np.random.rand(1) returns a 1-element ndarray; time.sleep
    # expects a plain number (passing an array is deprecated / a TypeError
    # in newer NumPy). Use the scalar form.
    time.sleep(np.random.rand() * 3)

    paired_source = self.paired_source
    r = self.rep
    n = self.n
    job_func = self.job_func
    pdata = paired_source.sample(n, seed=r)
    with util.ContextTimer() as t:
        logger.info("computing. %s. prob=%s, r=%d, n=%d" %
                    (job_func.__name__, pdata.label, r, n))
        tr, te = pdata.split_tr_te(tr_proportion=tr_proportion, seed=r + 21)
        prob_label = self.prob_label
        job_result = job_func(paired_source, tr, te, r)

        # create ScalarResult instance
        result = SingleResult(job_result)
        # submit the result to my own aggregator
        self.aggregator.submit_result(result)
        func_name = job_func.__name__
    logger.info("done. ex1: %s, prob=%s, r=%d, n=%d. Took: %.3g s " %
                (func_name, pdata.label, r, n, t.secs))

    # save result
    fname = '%s-%s-r%d_n%d_a%.3f_trp%.2f.p' \
        %(prob_label, func_name, r, n, alpha, tr_proportion)
    glo.ex_save_result(ex, job_result, prob_label, fname)
def compute(self):
    """Run one (method, repetition, problem-parameter) trial.

    Reads self.p, self.data_source, self.rep, self.prob_param,
    self.job_func, self.prob_label. Uses module-level sample_size,
    tr_proportion, alpha, ex, glo.
    """
    p = self.p
    data_source = self.data_source
    r = self.rep
    prob_param = self.prob_param
    job_func = self.job_func
    # sample_size is a global variable
    data = data_source.sample(sample_size, seed=r)
    with util.ContextTimer() as t:
        tr, te = data.split_tr_te(tr_proportion=tr_proportion, seed=r + 21)
        prob_label = self.prob_label
        logger.info("computing. %s. prob=%s, r=%d,\
            param=%.3g" % (job_func.__name__, prob_label, r, prob_param))
        job_result = job_func(p, data_source, tr, te, r)

        # create ScalarResult instance
        result = SingleResult(job_result)
        # submit the result to my own aggregator
        self.aggregator.submit_result(result)
        func_name = job_func.__name__
    logger.info("done. ex2: %s, prob=%s, r=%d, param=%.3g. Took: %.3g s " %
                (func_name, prob_label, r, prob_param, t.secs))

    # save result; p%g encodes the problem parameter in the filename
    fname = '%s-%s-n%d_r%d_p%g_a%.3f_trp%.2f.p' \
        %(prob_label, func_name, sample_size, r, prob_param, alpha,
          tr_proportion)
    glo.ex_save_result(ex, job_result, prob_label, fname)
def compute(self):
    """Run one trial of a method comparing distributions P and Q.

    Unlike the sibling compute methods, the method function receives the
    data source itself (no pre-sampled split here).
    """
    P = self.P
    Q = self.Q
    data_source = self.data_source
    r = self.rep
    n = self.n
    met_func = self.met_func
    prob_label = self.prob_label
    logger.info("computing. %s. prob=%s, r=%d,\
        n=%d" % (met_func.__name__, prob_label, r, n))
    with util.ContextTimer() as t:
        job_result = met_func(P, Q, data_source, n, r)

        # create ScalarResult instance
        result = SingleResult(job_result)
        # submit the result to my own aggregator
        self.aggregator.submit_result(result)
        func_name = met_func.__name__
    logger.info("done. ex2: %s, prob=%s, r=%d, n=%d. Took: %.3g s " %
                (func_name, prob_label, r, n, t.secs))

    # save result (no tr_proportion in this filename: no split was made)
    fname = '%s-%s-n%d_r%d_a%.3f.p' \
        %(prob_label, func_name, n, r, alpha )
    glo.ex_save_result(ex, job_result, prob_label, fname)
def submit_job(self, job):
    """Wrap a job's aggregator for filesystem-based communication and
    submit it. Blocks first when the internal (not cluster) queue of
    unfinished jobs is full; fire-and-forget jobs never block.
    Returns the wrapped aggregator.
    """
    if self.max_jobs_in_queue > 0:
        queue_full = \
            self._get_num_unfinished_jobs() >= self.max_jobs_in_queue
        # never block for fire and forget jobs
        if queue_full and not isinstance(job, FireAndForgetJob):
            logger.info(
                "Reached maximum number of %d unfinished jobs in queue." %
                self.max_jobs_in_queue)
            self._wait_until_n_unfinished(self.max_jobs_in_queue)

    # save myself every few submissions (also done one wait_for_all is called)
    if len(self.all_jobs) % 100 == 0:
        self.save_all_job_list()

    # Swap in a filesystem-backed aggregator wrapper; the unique job name
    # doubles as the communication folder name.
    name = self.create_job_name()
    agg_fname = self.get_aggregator_filename(name)
    job.aggregator = ResultAggregatorWrapper(job.aggregator, agg_fname,
                                             name, self.do_clean_up,
                                             self.store_fire_and_forget)
    self.submit_wrapped_pbs_job(job, name)
    return job.aggregator
def submit_job(self, job):
    """Wrap a job's aggregator in a PBS filesystem wrapper and submit it.

    Blocks first if the internal (not cluster) queue of unfinished jobs
    has reached its configured maximum. Returns the wrapped aggregator.
    """
    cap = self.max_jobs_in_queue
    if cap > 0 and self._get_num_unfinished_jobs() >= cap:
        logger.info(
            "Reached maximum number of %d unfinished jobs in queue." % cap)
        self._wait_until_n_unfinished(cap)

    # save myself every few submissions (also done one wait_for_all is called)
    if len(self.all_jobs) % 100 == 0:
        self.save_all_job_list()

    # Replace the job's aggregator with a PBS wrapper so results travel
    # via the filesystem; the unique job name doubles as the folder name.
    name = self.create_job_name()
    job.aggregator = PBSResultAggregatorWrapper(
        job.aggregator,
        self.get_aggregator_filename(name),
        name,
        self.do_clean_up,
        self.store_fire_and_forget)
    self.submit_wrapped_pbs_job(job, name)
    return job.aggregator
def compute(self):
    """Sample, split, run the job function, then submit and save the result."""
    method = self.job_func
    repetition = self.rep
    size = self.n
    logger.info("computing. %s. r=%d, n=%d" %
                (method.__name__, repetition, size))

    data = self.sample_source.sample(size, seed=repetition)
    train, test = data.split_tr_te(tr_proportion=tr_proportion,
                                   seed=repetition + 20)

    label = self.prob_label
    outcome = method(label, train, test, repetition, self.ni, size)

    # Hand the wrapped result to this job's own aggregator.
    self.aggregator.submit_result(SingleResult(outcome))
    logger.info("done. ex1: %s, r=%d, n=%d, " %
                (method.__name__, repetition, size))

    # Persist to disk; J, alpha, tr_proportion are module-level settings.
    fname = '%s-%s-J%d_r%d_n%d_a%.3f_trp%.2f.p' \
        %(label, method.__name__, J, repetition, size, alpha, tr_proportion)
    glo.ex_save_result(ex, outcome, label, fname)
def compute(self):
    """One repetition: sample sample_size points, run job_func, save result."""
    src = self.sample_source
    rep = self.rep
    dim = src.dim()
    func = self.job_func
    logger.info("computing. %s. r=%d, d=%d"%(func.__name__, rep, dim))

    # sample_size is a global variable
    data = src.sample(sample_size, seed=rep)
    tr, te = data.split_tr_te(tr_proportion=tr_proportion, seed=rep + 20)

    label = self.prob_label
    outcome = func(src, tr, te, rep)

    # Submit to my own aggregator.
    self.aggregator.submit_result(SingleResult(outcome))
    logger.info("done. ex2: %s, r=%d, d=%d, "%(func.__name__, rep, dim))

    # Save the result to disk.
    fname = '%s-%s-J%d_r%d_d%d_a%.3f_trp%.2f.p' \
        %(label, func.__name__, J, rep, dim, alpha, tr_proportion)
    glo.ex_save_result(ex, outcome, label, fname)
def store_results(self, result, runtime):
    """Persist this job's result plus its sweep parameters to the DB file."""
    logger.info("Storing results in %s" % self.db_fname)
    # Start from a copy of the sweep parameters, then attach the result
    # and run metadata.
    submit_dict = dict(self.param_dict)
    submit_dict[self.result_name] = result
    submit_dict["_runtime"] = runtime
    submit_dict["_seed"] = self.seed
    # Calls the module-level store_results helper (shadowed by this method).
    store_results(self.db_fname, **submit_dict)
def store_results(self, result, runtime):
    """Persist this job's result plus its sweep parameters to the DB file.

    Parameters
    ----------
    result : value stored under self.result_name
    runtime : float
        Wall-clock seconds the computation took.
    """
    logger.info("Storing results in %s" % self.db_fname)
    submit_dict = {}
    # Iterate the items view directly; wrapping it in list() was a needless
    # copy (2to3 migration artifact) — cf. the sibling store_results above.
    for k, v in self.param_dict.items():
        submit_dict[k] = v
    submit_dict[self.result_name] = result
    submit_dict["_runtime"] = runtime
    submit_dict["_seed"] = self.seed
    # Calls the module-level store_results helper (shadowed by this method).
    store_results(self.db_fname, **submit_dict)
def compute(self):
    """Sleep a random number of seconds and report it as the result."""
    logger.info("computing")
    nap = np.random.randint(10)
    logger.info("sleeping for %d seconds" % nap)
    sleep(nap)
    # the compute method submits a result object to the aggregator
    self.aggregator.submit_result(ScalarResult(nap))
def compute_result(self):
    """
    Note that this method directly computes and returns the result itself.
    There is no aggregators and no result instances being passed around
    at this point.
    """
    nap = np.random.randint(3)
    logger.info("sleeping for %d seconds" % nap)
    sleep(nap)
    # Noisy paraboloid of the job's (x, y) parameters.
    return self.x ** 2 + self.y ** 2 + 0.1 * np.random.randn()
def compute(self):
    """Sleep for the configured time (random if negative) and submit it.

    The submitted result always records the *configured* sleep_time,
    even when the actual nap duration was drawn at random.
    """
    result = ScalarResult(self.sleep_time)
    nap = self.sleep_time if self.sleep_time >= 0 else np.random.randint(10)
    logger.info("Sleeping for %d" % nap)
    sleep(nap)
    self.aggregator.submit_result(result)
def submit_wrapped_pbs_job(self, wrapped_job, job_name):
    """Serialize a wrapped job, generate its batch script, and submit it.

    Writes the pickled job and the generated batch script into the job's
    folder, submits to the batch system, records the returned job id, and
    (for non fire-and-forget jobs) tracks the job for later waiting.

    Raises RuntimeError if the batch system returns an empty job id.
    """
    job_folder = self.get_job_foldername(job_name)

    # try to create folder if not yet exists
    job_filename = self.get_job_filename(job_name)
    logger.info("Creating job with file %s" % job_filename)
    try:
        makedirs(job_folder)
    except OSError:
        # folder already exists
        pass

    Serialization.serialize_object(wrapped_job, job_filename)

    # allow the queue to process things
    time.sleep(self.submission_delay)

    dispatcher_string = self._get_dispatcher_string(job_filename)

    # get computing ressource constraints from job
    walltime, memory, nodes = wrapped_job.get_walltime_mem_nodes()
    job_string = self.create_batch_script(job_name, dispatcher_string,
                                          walltime, memory, nodes)

    # put the custom parameter string in front if existing
    # but not as first line to avoid problems with #/bin/bash things
    if self.batch_parameters.parameter_prefix != "":
        lines = job_string.split(os.linesep)
        job_string = os.linesep.join(
            [lines[0], self.batch_parameters.parameter_prefix] + lines[1:])

    # FIX: use context managers so the files are closed even on error
    # (previously raw open/write/close pairs could leak the handle).
    with open(
            job_folder + os.sep +
            BatchClusterComputationEngine.batch_script_filename, "w") as f:
        f.write(job_string)

    job_id = self.submit_to_batch_system(job_string)

    if job_id == "":
        raise RuntimeError(
            "Could not parse job_id. Something went wrong with the job submission"
        )

    with open(
            job_folder + os.sep +
            BatchClusterComputationEngine.job_id_filename, 'w') as f:
        f.write(job_id + os.linesep)

    if not isinstance(wrapped_job, FireAndForgetJob):
        # track submitted (and unfinished) jobs and their start time
        self._insert_job_time_sorted(job_name, job_id)
def run_dataset(prob_label):
    """Run the experiment"""
    # NOTE(review): this function appears truncated — it builds the batch
    # parameters but never creates an engine or submits jobs (compare the
    # full run_dataset later in this file). Confirm against the original.
    sample_source, n = get_sample_source(prob_label)

    # /////// submit jobs //////////
    # create folder name string
    home = os.path.expanduser("~")
    foldername = os.path.join(home, "freqopttest_slurm", 'e%d'%ex)
    logger.info("Setting engine folder to %s" % foldername)

    # create parameter instance that is needed for any batch computation engine
    logger.info("Creating batch parameter instance")
    batch_parameters = BatchClusterParameters(
        foldername=foldername, job_name_base="e%d_"%ex, parameter_prefix="")
def compute(self):
    """Seed numpy, run compute_result, store and submit the outcome."""
    pairs = ["%s=%s" % (str(k), str(v))
             for k, v in self.param_dict.items()]
    param_string = ", ".join(pairs)

    logger.info("Setting numpy random seed to %d" % self.seed)
    np.random.seed(self.seed)

    logger.info("Computing result for %s" % param_string)
    start = time.time()
    result = self.compute_result()
    runtime = time.time() - start

    self.store_results(result, runtime)
    self.aggregator.submit_result(result)
    # the engine will not call this, as it "forgets"
    self.aggregator.clean_up()
def submit_wrapped_pbs_job(self, wrapped_job, job_name):
    """Serialize a wrapped job, generate its batch script, and submit it.

    Writes the pickled job and the generated batch script into the job's
    folder, submits to the batch system, records the returned job id, and
    (for non fire-and-forget jobs) tracks the job for later waiting.

    Raises RuntimeError if the batch system returns an empty job id.
    """
    job_folder = self.get_job_foldername(job_name)

    # try to create folder if not yet exists
    job_filename = self.get_job_filename(job_name)
    logger.info("Creating job with file %s" % job_filename)
    try:
        makedirs(job_folder)
    except OSError:
        # folder already exists
        pass

    Serialization.serialize_object(wrapped_job, job_filename)

    # allow the queue to process things
    time.sleep(self.submission_delay)

    dispatcher_string = self._get_dispatcher_string(job_filename)

    # get computing ressource constraints from job
    walltime, memory, nodes = wrapped_job.get_walltime_mem_nodes()
    job_string = self.create_batch_script(job_name, dispatcher_string,
                                          walltime, memory, nodes)

    # put the custom parameter string in front if existing
    # but not as first line to avoid problems with #/bin/bash things
    if self.batch_parameters.parameter_prefix != "":
        lines = job_string.split(os.linesep)
        job_string = os.linesep.join(
            [lines[0], self.batch_parameters.parameter_prefix] + lines[1:])

    # FIX: use context managers so the files are closed even on error
    # (previously raw open/write/close pairs could leak the handle).
    with open(job_folder + os.sep +
              BatchClusterComputationEngine.batch_script_filename, "w") as f:
        f.write(job_string)

    job_id = self.submit_to_batch_system(job_string)

    if job_id == "":
        raise RuntimeError(
            "Could not parse job_id. Something went wrong with the job submission")

    with open(job_folder + os.sep +
              BatchClusterComputationEngine.job_id_filename, 'w') as f:
        f.write(job_id + os.linesep)

    if not isinstance(wrapped_job, FireAndForgetJob):
        # track submitted (and unfinished) jobs and their start time
        self._insert_job_time_sorted(job_name, job_id)
def compute(self):
    """Seed numpy, run compute_result, then store and submit the outcome."""
    formatted = ("%s=%s" % (str(key), str(val))
                 for key, val in self.param_dict.items())
    param_string = ", ".join(formatted)

    logger.info("Setting numpy random seed to %d" % self.seed)
    np.random.seed(self.seed)
    logger.info("Computing result for %s" % param_string)

    start_time = time.time()
    result = self.compute_result()
    end_time = time.time()

    self.store_results(result, end_time - start_time)
    self.aggregator.submit_result(result)
    # the engine will not call this, as it "forgets"
    self.aggregator.clean_up()
def compute(self):
    """Load the problem's PairedSource from disk and run one test trial.

    The data can be big, so each computing node loads it by itself
    (no data passing between nodes).
    """
    # randomly wait a few seconds so that multiple processes accessing the same
    # Theano function do not cause a lock problem. I do not know why.
    # I do not know if this does anything useful.
    # Sleep in seconds.
    # BUG FIX: np.random.rand(1) returns a 1-element ndarray; time.sleep
    # expects a plain number (deprecated / TypeError in newer NumPy).
    time.sleep(np.random.rand() * 2)

    # load the data and construct a PairedSource here
    # The data can be big. We have to load it in this job function i.e.,
    # each computing node loads by itself (no data passing).
    folder_path = self.folder_path
    prob_label = self.prob_label
    paired_source, _, is_h0 = exglo.get_problem_pickle(
        folder_path, prob_label + '.n0')

    n = self.n
    r = self.rep
    job_func = self.job_func
    pdata = paired_source.sample(n, seed=r)
    with util.ContextTimer() as t:
        logger.info("computing. %s. prob=%s, r=%d, n=%d" %
                    (job_func.__name__, pdata.label, r, n))
        tr, te = pdata.split_tr_te(tr_proportion=tr_proportion, seed=r + 21)
        prob_label = self.prob_label
        job_result = job_func(paired_source, tr, te, r)

        # create ScalarResult instance
        result = SingleResult(job_result)
        # submit the result to my own aggregator
        self.aggregator.submit_result(result)
        func_name = job_func.__name__
    logger.info("done. ex1: %s, prob=%s, r=%d, n=%d. Took: %.3g s " %
                (func_name, pdata.label, r, n, t.secs))

    # save result
    fname = '%s-%s-r%d_n%d_a%.3f_trp%.2f.p' \
        %(prob_label, func_name, r, n, alpha, tr_proportion)
    glo.ex_save_result(ex, job_result, prob_label, fname)
def _resubmit(self, job_name):
    """Reload a saved job from disk and submit it under a fresh name."""
    new_job_name = self.create_job_name()
    logger.info("Re-submitting under name %s" % new_job_name)

    # Drop the old entry from the unfinished-jobs list
    # (entries are tuples whose first element is the job name).
    for idx, entry in enumerate(self.submitted_jobs):
        if entry[0] == job_name:
            del self.submitted_jobs[idx]
            break

    # Drop the old name from the complete job list.
    for idx, name in enumerate(self.all_jobs):
        if name == job_name:
            del self.all_jobs[idx]
            break

    # Deserialize the pickled job and re-submit it under the new name.
    wrapped_job = Serialization.deserialize_object(
        self.get_job_filename(job_name))
    self.submit_wrapped_pbs_job(wrapped_job, new_job_name)
def _resubmit(self, job_name):
    """Reload a saved job from disk and submit it under a fresh name."""
    new_job_name = self.create_job_name()
    logger.info("Re-submitting under name %s" % new_job_name)

    # remove from unfinished jobs list
    for i in range(len(self.submitted_jobs)):
        if self.submitted_jobs[i][0] == job_name:
            del self.submitted_jobs[i]
            break

    # remove from all jobs list
    # BUG FIX: iterated over len(self.all), a nonexistent attribute
    # (AttributeError at runtime); the list is self.all_jobs, as in the
    # sibling _resubmit implementation in this file.
    for i in range(len(self.all_jobs)):
        if self.all_jobs[i] == job_name:
            del self.all_jobs[i]
            break

    # load job from disc and re-submit under new name
    job_filename = self.get_job_filename(job_name)
    wrapped_job = Serialization.deserialize_object(job_filename)
    self.submit_wrapped_pbs_job(wrapped_job, new_job_name)
def compute(self, data, mod_prm):
    """Evaluate the model log-likelihood for one (data, parameters) pair,
    submit it as a SingleResult, and pickle everything to save_dir.

    NOTE(review): the likelihood is computed from the *arguments* data /
    mod_prm, but the pickled record stores self.data / self.mod_prm —
    confirm these are intended to be the same objects.
    """
    logger.info("computing")
    #
    #------------------------------
    mu_g = mod_prm['mu_g']
    s_g = mod_prm['s_g']
    h = mod_prm['h']
    s_s = mod_prm['s_s']
    model = Model(mu_g, s_g, h, s_s)
    #------------------------------
    F1 = data['F1']
    F2 = data['F2']
    Y = data['Y']
    #------------------------------
    # n_samp is a module-level setting
    x = model.llh(F1, F2, Y, n_samp=n_samp)
    #------------------------------
    #
    result = SingleResult([x])
    self.aggregator.submit_result(result)
    logger.info("done computing")

    mypath = self.save_dir + '/' + self.name + ".p"
    logger.info("saving:" + mypath)
    d = {'data': self.data, 'mod_prm': self.mod_prm, 'llh': x}
    # FIX: close the pickle file deterministically (previously
    # pickle.dump(d, open(...)) leaked the handle).
    with open(mypath, "wb") as f:
        pickle.dump(d, f)
def store_results(self, result, runtime):
    """Convert a result dict to a pandas DataFrame and append it to the DB.

    If result contains the '_array' marker key, each value is treated as a
    column of samples; otherwise the result is stored as a single row
    indexed by the current UTC time. NOTE: mutates `result` in place
    (keys are deleted / metadata added).
    """
    logger.info("Storing results in %s" % self.db_fname)
    if '_array' in result:
        # NOTE(review): N_samples is read but never used afterwards;
        # presumably only the del below matters — confirm.
        N_samples = result['N_samples']
        submit_dict = {}
        # strip the marker keys before building the frame
        del result['N_samples']
        del result['_array']
        submit_dict = result
        for k, v in self.param_dict.items():
            # stringify params so they broadcast against array-valued columns
            submit_dict[k] = str(v)
        #submit_dict[self.result_name] = result
        submit_dict["_runtime"] = runtime
        submit_dict["_seed"] = self.seed
        submit_dict["_job_ID"] = self.job_ID
        current_time = time.strftime("%Y-%m-%d_%H:%M:%S", time.gmtime())
        submit_dict["current_time"] = current_time
        df = pd.DataFrame(submit_dict)
    else:
        submit_dict = result
        for k, v in self.param_dict.items():
            submit_dict[k] = v
        #submit_dict[self.result_name] = result
        submit_dict["_runtime"] = runtime
        submit_dict["_seed"] = self.seed
        submit_dict["_job_ID"] = self.job_ID
        current_time = time.strftime("%Y-%m-%d_%H:%M:%S", time.gmtime())
        # single-row frame indexed by the timestamp
        columns = list(submit_dict.keys())
        df = pd.DataFrame([[submit_dict[k] for k in columns]],
                          index=[current_time], columns=columns)
    # module-level store_results helper (shadowed by this method's name)
    store_results(self.db_fname, df)
def compute(self):
    """One repetition: rebuild the sample source by label, run job_func, save."""
    rep = self.rep
    source, nmax = get_sample_source(self.prob_label)
    dim = source.dim()
    func = self.job_func
    logger.info("computing. %s. r=%d "%(func.__name__, rep))

    data = source.sample(self.n, seed=rep)
    tr, te = data.split_tr_te(tr_proportion=tr_proportion, seed=rep + 20)

    label = self.prob_label
    outcome = func(source, tr, te, rep)

    # Submit the wrapped result to my own aggregator.
    self.aggregator.submit_result(SingleResult(outcome))
    logger.info("done. ex2: %s, r=%d "%(func.__name__, rep))

    # Save the result to disk.
    fname = '%s-%s-J%d_r%d_d%d_a%.3f_trp%.2f.p' \
        %(label, func.__name__, J, rep, dim, alpha, tr_proportion)
    glo.ex_save_result(ex, outcome, label, fname)
def compute(self):
    """Toy job: sleep a random number of seconds and submit it as the result."""
    logger.info("computing")

    # job is to sleep for some time and return this time as an instance
    # of ScalarResult, which is a provided sub-class of JobResult
    duration = randint(10)
    logger.info("sleeping for %d seconds" % duration)
    sleep(duration)

    # submit the wrapped result to my own aggregator
    self.aggregator.submit_result(ScalarResult(duration))
    logger.info("done computing")
def _wait_until_n_unfinished(self, desired_num_unfinished): """ Iteratively checks all non-finished jobs and updates whether they are finished. Blocks until there are less or exactly desired_num_unfinished unfinished jobs in the queue. Messages a "waiting for" info message for the oldest job in the queue. """ # save all job list to file for reconstructing results later self.save_all_job_list() last_printed = self._get_oldest_job_in_queue() logger.info("Waiting for %s and %d other jobs" % (last_printed, self._get_num_unfinished_jobs() - 1)) while self._get_num_unfinished_jobs() > desired_num_unfinished: oldest = self._get_oldest_job_in_queue() if oldest != last_printed: last_printed = oldest logger.info( "Waiting for %s and %d other jobs" % (last_printed, self._get_num_unfinished_jobs() - 1)) # delete all finished jobs from internal list i = 0 while i < len(self.submitted_jobs): job_name = self.submitted_jobs[i][0] if self._check_job_done(job_name): del self.submitted_jobs[i] # dont change i as it is now the index of the next element else: i += 1 # check for re-submissions if self.batch_parameters.resubmit_on_timeout: for job_name in self._get_max_wait_time_exceed_jobs(): # load job ressources job_filename = self.get_job_filename(job_name) job = Serialization.deserialize_object(job_filename) logger.info("%s exceeded maximum waiting time of %dh" % (job_name, job.walltime)) self._resubmit(job_name) time.sleep(self.check_interval)
def _wait_until_n_unfinished(self, desired_num_unfinished): """ Iteratively checks all non-finished jobs and updates whether they are finished. Blocks until there are less or exactly desired_num_unfinished unfinished jobs in the queue. Messages a "waiting for" info message for the oldest job in the queue. """ # save all job list to file for reconstructing results later self.save_all_job_list() last_printed = self._get_oldest_job_in_queue() logger.info("Waiting for %s and %d other jobs" % (last_printed, self._get_num_unfinished_jobs() - 1)) while self._get_num_unfinished_jobs() > desired_num_unfinished: oldest = self._get_oldest_job_in_queue() if oldest != last_printed: last_printed = oldest logger.info("Waiting for %s and %d other jobs" % (last_printed, self._get_num_unfinished_jobs() - 1)) # delete all finished jobs from internal list i = 0 while i < len(self.submitted_jobs): job_name = self.submitted_jobs[i][0] if self._check_job_done(job_name): del self.submitted_jobs[i] # dont change i as it is now the index of the next element else: i += 1 # check for re-submissions if self.batch_parameters.resubmit_on_timeout: for job_name in self._get_max_wait_time_exceed_jobs(): # load job ressources job_filename = self.get_job_filename(job_name) job = Serialization.deserialize_object(job_filename) logger.info("%s exceeded maximum waiting time of %dh" % (job_name, job.walltime)) self._resubmit(job_name) time.sleep(self.check_interval)
# the serial one runs everything locally
engine = SerialComputationEngine()
# engine = SGEComputationEngine(batch_parameters)
# engine = SlurmComputationEngine(batch_parameters)

# On submission, the engine returns aggregators that can be
# used to retreive results after potentially doing postprocessing
returned_aggregators = []
for i in range(3):
    job = MyJob(ScalarResultAggregator())
    agg = engine.submit_job(job)
    returned_aggregators.append(agg)

# This call blocks until all jobs are finished (magic happens here)
logger.info("Waiting for all jobs to be completed.")
engine.wait_for_all()

# now that everything is done, we can collect the results
# and or do postprocessing
logger.info("Collecting results")
results = np.zeros(len(returned_aggregators))
for i, agg in enumerate(returned_aggregators):
    # the aggregator might implement postprocessing
    agg.finalize()

    # aggregators[i].get_final_result() here returns a ScalarResult instance,
    # which we need to extract the number from
    results[i] = agg.get_final_result().result

# BUG FIX: was a Python 2 print statement (`print "Results", results`),
# a SyntaxError under Python 3; the rest of this code is Python 3 style.
print("Results", results)
def wait_for_all(self):
    """Block until every submitted job has finished (zero unfinished)."""
    self._wait_until_n_unfinished(0)
    logger.info("All jobs finished.")
def run_dataset(prob_label):
    """Run the experiment: submit one job per (repetition, method),
    wait for completion, collect and save the aggregated results."""
    sample_source, n = get_sample_source(prob_label)

    # /////// submit jobs //////////
    # create folder name string
    home = os.path.expanduser("~")
    foldername = os.path.join(home, "freqopttest_slurm", 'e%d'%ex)
    logger.info("Setting engine folder to %s" % foldername)

    # create parameter instance that is needed for any batch computation engine
    logger.info("Creating batch parameter instance")
    batch_parameters = BatchClusterParameters(
        foldername=foldername, job_name_base="e%d_"%ex, parameter_prefix="")

    # Use the following line if Slurm queue is not used.
    #engine = SerialComputationEngine()
    engine = SlurmComputationEngine(batch_parameters, do_clean_up=True)
    n_methods = len(method_job_funcs)
    # repetitions x #methods
    aggregators = np.empty((reps, n_methods ), dtype=object)
    d = sample_source.dim()
    for r in range(reps):
        for mi, f in enumerate(method_job_funcs):
            # name used to save the result
            func_name = f.__name__
            fname = '%s-%s-J%d_r%d_d%d_a%.3f_trp%.2f.p' \
                %(prob_label, func_name, J, r, d, alpha, tr_proportion)
            if not is_rerun and glo.ex_file_exists(ex, prob_label, fname):
                # cached result exists: wrap it in an aggregator instead of
                # submitting a new job
                logger.info('%s exists. Load and return.'%fname)
                test_result = glo.ex_load_result(ex, prob_label, fname)
                sra = SingleResultAggregator()
                sra.submit_result(SingleResult(test_result))
                aggregators[r, mi] = sra
            else:
                # result not exists or rerun
                job = Ex5Job(SingleResultAggregator(), prob_label, r, n, f)
                agg = engine.submit_job(job)
                aggregators[r, mi] = agg

    # let the engine finish its business
    logger.info("Wait for all call in engine")
    engine.wait_for_all()

    # ////// collect the results ///////////
    logger.info("Collecting results")
    test_results = np.empty((reps, n_methods), dtype=object)
    for r in range(reps):
        for mi, f in enumerate(method_job_funcs):
            logger.info("Collecting result (%s, r=%d)" % (f.__name__, r ))
            # let the aggregator finalize things
            aggregators[r, mi].finalize()

            # aggregators[i].get_final_result() returns a SingleResult instance,
            # which we need to extract the actual result
            test_result = aggregators[r, mi].get_final_result().result
            test_results[r, mi] = test_result

            func_name = f.__name__
            fname = '%s-%s-J%d_r%d_d%d_a%.3f_trp%.2f.p' \
                %(prob_label, func_name, J, r, d, alpha, tr_proportion)
            glo.ex_save_result(ex, test_result, prob_label, fname)

    func_names = [f.__name__ for f in method_job_funcs]
    func2labels = exglobal.get_func2label_map()
    method_labels = [func2labels[f] for f in func_names if f in func2labels]

    # save results
    # NOTE(review): 'tr_proportion' is hard-coded to 0.5 here while the
    # splits above use the module-level tr_proportion — confirm intended.
    results = {'results': test_results, 'n': n,
               'data_fname': label2fname[prob_label], 'alpha': alpha, 'J': J,
               'sample_source': sample_source, 'tr_proportion': 0.5,
               'method_job_funcs': method_job_funcs,
               'prob_label': prob_label, 'method_labels': method_labels}

    # class name
    fname = 'ex%d-%s-me%d_J%d_rs%d_nma%d_d%d_a%.3f_trp%.2f.p' \
        %(ex, prob_label, n_methods, J, reps, n, d, alpha, tr_proportion)
    glo.ex_save_result(ex, results, fname)
    logger.info('Saved aggregated results to %s'%fname)
import os
from os.path import expanduser

from independent_jobs.aggregators.ScalarResultAggregator import ScalarResultAggregator
from independent_jobs.engines.BatchClusterParameters import BatchClusterParameters
from independent_jobs.engines.SGEComputationEngine import SGEComputationEngine
from independent_jobs.engines.SerialComputationEngine import SerialComputationEngine
from independent_jobs.examples.MyJob import MyJob
from independent_jobs.tools.Log import Log
from independent_jobs.tools.Log import logger
import numpy as np

# See other file for implementation of MyJob
# Since we are using ScalarResult, we can use the already implemented aggregator
# ScalarResultAggregator
if __name__ == '__main__':
    # BUG FIX: set_loglevel was given the bound method `logger.info`
    # instead of an int logging level; 20 == logging.INFO (cf.
    # prepare_engine elsewhere in this codebase).
    Log.set_loglevel(20)
    logger.info("Start")

    # create an instance of the SGE engine, with certain parameters

    # create folder name string
    # FIX: os and expanduser were used below but never imported.
    home = expanduser("~")
    foldername = os.sep.join([home, "minimal_example"])
    logger.info("Setting engine folder to %s" % foldername)

    # create parameter instance that is needed for any batch computation engine
    logger.info("Creating batch parameter instance")
    batch_parameters = BatchClusterParameters(foldername=foldername)

    # possibly create SGE engine instance, which can be used to submit jobs to
    # there are more engines available.
    # logger.info("creating SGE engine instance")
    # engine = SGEComputationEngine(batch_parameters, check_interval=1)
# Use the following line if Slurm queue is not used.
#engine = SerialComputationEngine()
engine = SlurmComputationEngine(batch_parameters)
n_methods = len(method_job_funcs)
# repetitions x #methods
aggregators = np.empty((reps, n_methods ), dtype=object)
d = sample_source.dim()
for r in range(reps):
    for mi, f in enumerate(method_job_funcs):
        # name used to save the result
        func_name = f.__name__
        fname = '%s-%s-J%d_r%d_d%d_a%.3f_trp%.2f.p' \
            %(prob_label, func_name, J, r, d, alpha, tr_proportion)
        if not is_rerun and glo.ex_file_exists(ex, prob_label, fname):
            logger.info('%s exists. Load and return.'%fname)
            test_result = glo.ex_load_result(ex, prob_label, fname)
            sra = SingleResultAggregator()
            # BUG FIX: `test_result is SingleResult` compared object
            # identity against the class itself (always False, so the
            # else branch always ran); isinstance is the correct check.
            if isinstance(test_result, SingleResult):
                sra.submit_result(test_result)
            else:
                sra.submit_result(SingleResult(test_result))
            aggregators[r, mi] = sra
        else:
            # result not exists or rerun
            job = Ex4Job(SingleResultAggregator(), prob_label, r, n, f)
            agg = engine.submit_job(job)
            aggregators[r, mi] = agg
def run_problem(prob_label):
    """Run the experiment identified by `prob_label`.

    Submits one job per (repetition, problem parameter, method) to a Slurm
    engine, waits for completion, collects all results, and pickles the
    aggregated result dictionary.

    NOTE(review): relies on module-level globals presumably defined elsewhere
    in the file (`ex`, `sample_size`, `reps`, `alpha`, `tr_proportion`,
    `is_rerun`, `method_job_funcs`, `glo`, `Ex2Job`, ...) — verify.
    """
    L = get_pqsource_list(prob_label)
    prob_params, ps, data_sources = zip(*L)
    # make them lists
    prob_params = list(prob_params)
    ps = list(ps)
    data_sources = list(data_sources)

    # /////// submit jobs //////////
    # create folder name string
    #result_folder = glo.result_folder()
    from kgof.config import expr_configs
    tmp_dir = expr_configs['scratch_path']
    foldername = os.path.join(tmp_dir, 'kgof_slurm', 'e%d' % ex)
    logger.info("Setting engine folder to %s" % foldername)

    # create parameter instance that is needed for any batch computation engine
    logger.info("Creating batch parameter instance")
    batch_parameters = BatchClusterParameters(foldername=foldername,
            job_name_base="e%d_" % ex, parameter_prefix="")

    # Use the following line if Slurm queue is not used.
    #engine = SerialComputationEngine()
    engine = SlurmComputationEngine(batch_parameters)
    #engine = SlurmComputationEngine(batch_parameters, partition='wrkstn,compute')
    n_methods = len(method_job_funcs)
    # repetitions x len(prob_params) x #methods
    aggregators = np.empty((reps, len(prob_params), n_methods), dtype=object)
    for r in range(reps):
        for pi, param in enumerate(prob_params):
            for mi, f in enumerate(method_job_funcs):
                # name used to save the result; this string is the on-disk
                # cache key, so its format must stay stable across runs
                func_name = f.__name__
                fname = '%s-%s-n%d_r%d_p%g_a%.3f_trp%.2f.p' \
                    %(prob_label, func_name, sample_size, r, param, alpha, tr_proportion)
                if not is_rerun and glo.ex_file_exists(ex, prob_label, fname):
                    # cached result: load and wrap in an aggregator so the
                    # collection phase treats it like a submitted job
                    logger.info('%s exists. Load and return.' % fname)
                    job_result = glo.ex_load_result(ex, prob_label, fname)
                    sra = SingleResultAggregator()
                    sra.submit_result(SingleResult(job_result))
                    aggregators[r, pi, mi] = sra
                else:
                    # result not exists or rerun
                    # p: an UnnormalizedDensity object
                    p = ps[pi]
                    job = Ex2Job(SingleResultAggregator(), p, data_sources[pi],
                            prob_label, r, f, param)
                    agg = engine.submit_job(job)
                    aggregators[r, pi, mi] = agg

    # let the engine finish its business
    logger.info("Wait for all call in engine")
    engine.wait_for_all()

    # ////// collect the results ///////////
    logger.info("Collecting results")
    job_results = np.empty((reps, len(prob_params), n_methods), dtype=object)
    for r in range(reps):
        for pi, param in enumerate(prob_params):
            for mi, f in enumerate(method_job_funcs):
                logger.info("Collecting result (%s, r=%d, param=%.3g)" %
                        (f.__name__, r, param))
                # let the aggregator finalize things
                aggregators[r, pi, mi].finalize()

                # aggregators[i].get_final_result() returns a SingleResult instance,
                # which we need to extract the actual result
                job_result = aggregators[r, pi, mi].get_final_result().result
                job_results[r, pi, mi] = job_result

    #func_names = [f.__name__ for f in method_job_funcs]
    #func2labels = exglobal.get_func2label_map()
    #method_labels = [func2labels[f] for f in func_names if f in func2labels]

    # save results
    results = {'job_results': job_results,
            'prob_params': prob_params,
            'alpha': alpha,
            'repeats': reps,
            'ps': ps,
            'list_data_source': data_sources,
            'tr_proportion': tr_proportion,
            'method_job_funcs': method_job_funcs,
            'prob_label': prob_label,
            'sample_size': sample_size,
            }

    # class name
    fname = 'ex%d-%s-me%d_n%d_rs%d_pmi%g_pma%g_a%.3f_trp%.2f.p' \
        %(ex, prob_label, n_methods, sample_size, reps, min(prob_params),
                max(prob_params), alpha, tr_proportion)
    glo.ex_save_result(ex, results, fname)
    logger.info('Saved aggregated results to %s' % fname)
def run_problem(prob_label):
    """Run the experiment identified by `prob_label`.

    Submits one job per (repetition, sample size, method) to a Slurm engine
    (optionally restricted to the partitions configured in
    `expr_configs['slurm_partitions']`), waits, collects the results, and
    pickles the aggregated result dictionary.

    NOTE(review): relies on module-level globals presumably defined elsewhere
    in the file (`ex`, `reps`, `alpha`, `is_rerun`, `method_funcs`, `glo`,
    `Ex1Job`, ...) — verify.
    """
    # /////// submit jobs //////////
    # create folder name string
    #result_folder = glo.result_folder()
    from kmod.config import expr_configs
    tmp_dir = expr_configs['scratch_path']
    foldername = os.path.join(tmp_dir, 'kmod_slurm', 'e%d' % ex)
    logger.info("Setting engine folder to %s" % foldername)

    # create parameter instance that is needed for any batch computation engine
    logger.info("Creating batch parameter instance")
    batch_parameters = BatchClusterParameters(foldername=foldername,
            job_name_base="e%d_" % ex, parameter_prefix="")

    # Use the following line if Slurm queue is not used.
    #engine = SerialComputationEngine()
    # restrict to configured partitions when they are specified
    partitions = expr_configs['slurm_partitions']
    if partitions is None:
        engine = SlurmComputationEngine(batch_parameters)
    else:
        engine = SlurmComputationEngine(batch_parameters, partition=partitions)
    n_methods = len(method_funcs)

    # problem setting
    ns, P, Q, ds, = get_ns_pqrsource(prob_label)

    # repetitions x len(ns) x #methods
    aggregators = np.empty((reps, len(ns), n_methods), dtype=object)

    for r in range(reps):
        for ni, n in enumerate(ns):
            for mi, f in enumerate(method_funcs):
                # name used to save the result; this string is the on-disk
                # cache key, so its format must stay stable across runs
                func_name = f.__name__
                fname = '%s-%s-n%d_r%d_a%.3f.p' \
                    %(prob_label, func_name, n, r, alpha,)
                if not is_rerun and glo.ex_file_exists(ex, prob_label, fname):
                    # cached result: load and wrap in an aggregator so the
                    # collection phase treats it like a submitted job
                    logger.info('%s exists. Load and return.' % fname)
                    job_result = glo.ex_load_result(ex, prob_label, fname)
                    sra = SingleResultAggregator()
                    sra.submit_result(SingleResult(job_result))
                    aggregators[r, ni, mi] = sra
                else:
                    # result not exists or rerun
                    job = Ex1Job(SingleResultAggregator(), P, Q, ds, prob_label,
                            r, f, n)
                    agg = engine.submit_job(job)
                    aggregators[r, ni, mi] = agg

    # let the engine finish its business
    logger.info("Wait for all call in engine")
    engine.wait_for_all()

    # ////// collect the results ///////////
    logger.info("Collecting results")
    job_results = np.empty((reps, len(ns), n_methods), dtype=object)
    for r in range(reps):
        for ni, n in enumerate(ns):
            for mi, f in enumerate(method_funcs):
                logger.info("Collecting result (%s, r=%d, n=%d)" %
                        (f.__name__, r, n))
                # let the aggregator finalize things
                aggregators[r, ni, mi].finalize()

                # aggregators[i].get_final_result() returns a SingleResult instance,
                # which we need to extract the actual result
                job_result = aggregators[r, ni, mi].get_final_result().result
                job_results[r, ni, mi] = job_result

    #func_names = [f.__name__ for f in method_funcs]
    #func2labels = exglobal.get_func2label_map()
    #method_labels = [func2labels[f] for f in func_names if f in func2labels]

    # save results
    results = {'job_results': job_results,
            'P': P,
            'Q': Q,
            'data_source': ds,
            'alpha': alpha,
            'repeats': reps,
            'ns': ns,
            'method_funcs': method_funcs,
            'prob_label': prob_label,
            }

    # class name
    fname = 'ex%d-%s-me%d_rs%d_nmi%d_nma%d_a%.3f.p' \
        %(ex, prob_label, n_methods, reps, min(ns), max(ns), alpha,)
    glo.ex_save_result(ex, results, fname)
    logger.info('Saved aggregated results to %s' % fname)
def run_problem(folder_path, prob_label):
    """Run the experiment identified by `prob_label` on data under `folder_path`.

    Parses `is_h0` and the sample size `n` out of the problem label, submits
    one job per (repetition, method) to a Slurm engine, waits, collects the
    results, and pickles the aggregated result dictionary.

    NOTE(review): relies on module-level globals presumably defined elsewhere
    in the file (`ex`, `reps`, `alpha`, `tr_proportion`, `is_rerun`,
    `method_job_funcs`, `glo`, `exglo`, `Ex4Job`, ...) — verify.
    """
    pl = exglo.parse_prob_label(prob_label)
    is_h0 = pl['is_h0']
    n = pl['n']
    # /////// submit jobs //////////
    # create folder name string
    #result_folder = glo.result_folder()
    #tmp_dir = tempfile.gettempdir()
    from fsic.config import expr_configs
    tmp_dir = expr_configs['scratch_dir']
    foldername = os.path.join(tmp_dir, 'wj_slurm', 'e%d' % ex)
    logger.info("Setting engine folder to %s" % foldername)

    # create parameter instance that is needed for any batch computation engine
    logger.info("Creating batch parameter instance")
    batch_parameters = BatchClusterParameters(foldername=foldername,
            job_name_base="e%d_" % ex, parameter_prefix="")

    # Use the following line if Slurm queue is not used.
    #engine = SerialComputationEngine()
    engine = SlurmComputationEngine(batch_parameters)
    n_methods = len(method_job_funcs)
    # repetitions x sample_sizes x #methods
    aggregators = np.empty((reps, n_methods), dtype=object)
    for r in range(reps):
        for mi, f in enumerate(method_job_funcs):
            # name used to save the result; this string is the on-disk cache
            # key, so its format must stay stable across runs
            func_name = f.__name__
            fname = '%s-%s-r%d_a%.3f_trp%.2f.p' \
                %(prob_label, func_name, r, alpha, tr_proportion)
            if not is_rerun and glo.ex_file_exists(ex, prob_label, fname):
                # cached result: load and wrap in an aggregator so the
                # collection phase treats it like a submitted job
                logger.info('%s exists. Load and return.' % fname)
                job_result = glo.ex_load_result(ex, prob_label, fname)
                sra = SingleResultAggregator()
                sra.submit_result(SingleResult(job_result))
                aggregators[r, mi] = sra
            else:
                # result not exists or rerun
                job = Ex4Job(SingleResultAggregator(), folder_path, prob_label,
                        r, f)
                agg = engine.submit_job(job)
                aggregators[r, mi] = agg

    # let the engine finish its business
    logger.info("Wait for all call in engine")
    engine.wait_for_all()

    # ////// collect the results ///////////
    logger.info("Collecting results")
    job_results = np.empty((reps, n_methods), dtype=object)
    for r in range(reps):
        for mi, f in enumerate(method_job_funcs):
            logger.info("Collecting result (%s, r=%d, n=%d)" %
                    (f.__name__, r, n))
            # let the aggregator finalize things
            aggregators[r, mi].finalize()

            # aggregators[i].get_final_result() returns a SingleResult instance,
            # which we need to extract the actual result
            job_result = aggregators[r, mi].get_final_result().result
            job_results[r, mi] = job_result

    #func_names = [f.__name__ for f in method_job_funcs]
    #func2labels = exglobal.get_func2label_map()
    #method_labels = [func2labels[f] for f in func_names if f in func2labels]

    # save results
    # - Do not store PairedSource because it can be very big.
    results = {'job_results': job_results,
            'n': n,
            'is_h0': is_h0,
            'alpha': alpha,
            'repeats': reps,
            'tr_proportion': tr_proportion,
            'method_job_funcs': method_job_funcs,
            'prob_label': prob_label,
            }

    # class name
    fname = 'ex%d-%s-me%d_rs%d_a%.3f_trp%.2f.p' \
        %(ex, prob_label, n_methods, reps, alpha, tr_proportion)
    glo.ex_save_result(ex, results, fname)
    logger.info('Saved aggregated results to %s' % fname)
def run_dataset(prob_label):
    """Run the experiment on the dataset identified by `prob_label`.

    Submits one job per (repetition, method) to a Slurm engine, waits,
    collects and individually re-saves each per-job result, then pickles the
    aggregated result dictionary.

    NOTE(review): relies on module-level globals presumably defined elsewhere
    in the file (`ex`, `reps`, `J`, `alpha`, `tr_proportion`, `is_rerun`,
    `method_job_funcs`, `glo`, `exglobal`, `label2fname`, `Ex5Job`, ...)
    — verify.
    """
    sample_source, n = get_sample_source(prob_label)

    # /////// submit jobs //////////
    # create folder name string
    home = os.path.expanduser("~")
    foldername = os.path.join(home, "freqopttest_slurm", 'e%d' % ex)
    logger.info("Setting engine folder to %s" % foldername)

    # create parameter instance that is needed for any batch computation engine
    logger.info("Creating batch parameter instance")
    batch_parameters = BatchClusterParameters(foldername=foldername,
            job_name_base="e%d_" % ex, parameter_prefix="")

    # Use the following line if Slurm queue is not used.
    #engine = SerialComputationEngine()
    engine = SlurmComputationEngine(batch_parameters, do_clean_up=True)
    n_methods = len(method_job_funcs)
    # repetitions x #methods
    aggregators = np.empty((reps, n_methods), dtype=object)
    d = sample_source.dim()
    for r in range(reps):
        for mi, f in enumerate(method_job_funcs):
            # name used to save the result; this string is the on-disk cache
            # key, so its format must stay stable across runs
            func_name = f.__name__
            fname = '%s-%s-J%d_r%d_d%d_a%.3f_trp%.2f.p' \
                %(prob_label, func_name, J, r, d, alpha, tr_proportion)
            if not is_rerun and glo.ex_file_exists(ex, prob_label, fname):
                # cached result: load and wrap in an aggregator so the
                # collection phase treats it like a submitted job
                logger.info('%s exists. Load and return.' % fname)
                test_result = glo.ex_load_result(ex, prob_label, fname)
                sra = SingleResultAggregator()
                sra.submit_result(SingleResult(test_result))
                aggregators[r, mi] = sra
            else:
                # result not exists or rerun
                job = Ex5Job(SingleResultAggregator(), prob_label, r, n, f)
                agg = engine.submit_job(job)
                aggregators[r, mi] = agg

    # let the engine finish its business
    logger.info("Wait for all call in engine")
    engine.wait_for_all()

    # ////// collect the results ///////////
    logger.info("Collecting results")
    test_results = np.empty((reps, n_methods), dtype=object)
    for r in range(reps):
        for mi, f in enumerate(method_job_funcs):
            logger.info("Collecting result (%s, r=%d)" % (f.__name__, r))
            # let the aggregator finalize things
            aggregators[r, mi].finalize()

            # aggregators[i].get_final_result() returns a SingleResult instance,
            # which we need to extract the actual result
            test_result = aggregators[r, mi].get_final_result().result
            test_results[r, mi] = test_result

            # re-save each individual result so later runs hit the cache above
            func_name = f.__name__
            fname = '%s-%s-J%d_r%d_d%d_a%.3f_trp%.2f.p' \
                %(prob_label, func_name, J, r, d, alpha, tr_proportion)
            glo.ex_save_result(ex, test_result, prob_label, fname)

    func_names = [f.__name__ for f in method_job_funcs]
    func2labels = exglobal.get_func2label_map()
    method_labels = [func2labels[f] for f in func_names if f in func2labels]

    # save results
    results = {'results': test_results,
            'n': n,
            'data_fname': label2fname[prob_label],
            'alpha': alpha,
            'J': J,
            'sample_source': sample_source,
            'tr_proportion': tr_proportion,
            'method_job_funcs': method_job_funcs,
            'prob_label': prob_label,
            'method_labels': method_labels}

    # class name
    fname = 'ex%d-%s-me%d_J%d_rs%d_nma%d_d%d_a%.3f_trp%.2f.p' \
        %(ex, prob_label, n_methods, J, reps, n, d, alpha, tr_proportion)
    glo.ex_save_result(ex, results, fname)
    logger.info('Saved aggregated results to %s' % fname)
def run_problem(prob_label):
    """Run the experiment identified by `prob_label`.

    Submits one job per (repetition, sample size, method) to a Slurm engine,
    waits for completion, collects all results, and pickles the aggregated
    result dictionary.

    NOTE(review): relies on module-level globals presumably defined elsewhere
    in the file (`ex`, `reps`, `alpha`, `tr_proportion`, `is_rerun`,
    `method_job_funcs`, `glo`, `Ex1Job`, ...) — verify.
    """
    ns, p, ds = get_ns_pqsource(prob_label)
    # /////// submit jobs //////////
    # create folder name string
    # result_folder = glo.result_folder()
    from sbibm.third_party.kgof.config import expr_configs

    tmp_dir = expr_configs["scratch_path"]
    foldername = os.path.join(tmp_dir, "kgof_slurm", "e%d" % ex)
    logger.info("Setting engine folder to %s" % foldername)

    # create parameter instance that is needed for any batch computation engine
    logger.info("Creating batch parameter instance")
    batch_parameters = BatchClusterParameters(
        foldername=foldername, job_name_base="e%d_" % ex, parameter_prefix=""
    )

    # Use the following line if Slurm queue is not used.
    # engine = SerialComputationEngine()
    # engine = SlurmComputationEngine(batch_parameters, partition='wrkstn,compute')
    engine = SlurmComputationEngine(batch_parameters)
    n_methods = len(method_job_funcs)
    # repetitions x len(ns) x #methods
    aggregators = np.empty((reps, len(ns), n_methods), dtype=object)
    for r in range(reps):
        for ni, n in enumerate(ns):
            for mi, f in enumerate(method_job_funcs):
                # name used to save the result; this string is the on-disk
                # cache key, so its format must stay stable across runs
                func_name = f.__name__
                fname = "%s-%s-n%d_r%d_a%.3f_trp%.2f.p" % (
                    prob_label,
                    func_name,
                    n,
                    r,
                    alpha,
                    tr_proportion,
                )
                if not is_rerun and glo.ex_file_exists(ex, prob_label, fname):
                    # cached result: load and wrap in an aggregator so the
                    # collection phase treats it like a submitted job
                    logger.info("%s exists. Load and return." % fname)
                    job_result = glo.ex_load_result(ex, prob_label, fname)

                    sra = SingleResultAggregator()
                    sra.submit_result(SingleResult(job_result))
                    aggregators[r, ni, mi] = sra
                else:
                    # result not exists or rerun

                    # p: an UnnormalizedDensity object
                    job = Ex1Job(SingleResultAggregator(), p, ds, prob_label, r, f, n)
                    agg = engine.submit_job(job)
                    aggregators[r, ni, mi] = agg

    # let the engine finish its business
    logger.info("Wait for all call in engine")
    engine.wait_for_all()

    # ////// collect the results ///////////
    logger.info("Collecting results")
    job_results = np.empty((reps, len(ns), n_methods), dtype=object)
    for r in range(reps):
        for ni, n in enumerate(ns):
            for mi, f in enumerate(method_job_funcs):
                # BUG FIX: the original format string used "n=%rd" — %r
                # consumed n as repr() and the trailing 'd' was a stray
                # literal, logging e.g. "n=400d". Use %d.
                logger.info("Collecting result (%s, r=%d, n=%d)" % (f.__name__, r, n))
                # let the aggregator finalize things
                aggregators[r, ni, mi].finalize()

                # aggregators[i].get_final_result() returns a SingleResult instance,
                # which we need to extract the actual result
                job_result = aggregators[r, ni, mi].get_final_result().result
                job_results[r, ni, mi] = job_result

    # func_names = [f.__name__ for f in method_job_funcs]
    # func2labels = exglobal.get_func2label_map()
    # method_labels = [func2labels[f] for f in func_names if f in func2labels]

    # save results
    results = {
        "job_results": job_results,
        "data_source": ds,
        "alpha": alpha,
        "repeats": reps,
        "ns": ns,
        "p": p,
        "tr_proportion": tr_proportion,
        "method_job_funcs": method_job_funcs,
        "prob_label": prob_label,
    }

    # class name
    fname = "ex%d-%s-me%d_rs%d_nmi%d_nma%d_a%.3f_trp%.2f.p" % (
        ex,
        prob_label,
        n_methods,
        reps,
        min(ns),
        max(ns),
        alpha,
        tr_proportion,
    )
    glo.ex_save_result(ex, results, fname)
    logger.info("Saved aggregated results to %s" % fname)