def main():
    """Entry point: parse CLI args, load the chooser and driver modules,
    then dispatch experiment jobs until none remain.

    NOTE(review): a second ``def main(...)`` appears later in this file and
    shadows this definition — confirm which one is intended to be live.
    """
    (options, args) = parse_args()

    # Single-job mode: run the one job described by --job and stop.
    if options.job:
        job_runner(load_job(options.job))
        # Was exit(0): the bare `exit` builtin is injected by the `site`
        # module and is not guaranteed to exist (e.g. under `python -S`);
        # sys.exit is the reliable form and matches sys.exit(-1) below.
        sys.exit(0)

    experiment_config = args[0]
    expt_dir = os.path.dirname(os.path.realpath(experiment_config))
    log("Using experiment configuration: " + experiment_config)
    log("experiment dir: " + expt_dir)

    if not os.path.exists(expt_dir):
        log("Cannot find experiment directory '%s'. "
            "Aborting." % (expt_dir))
        sys.exit(-1)

    check_experiment_dirs(expt_dir)

    # Load up the chooser module.
    module = importlib.import_module('spearmint.chooser.' + options.chooser_module)
    chooser = module.init(expt_dir, options.chooser_args)

    if options.web_status:
        # Keep a handle on the web-view process (side effect of starting it).
        web_proc = start_web_view(options, experiment_config, chooser)

    # Load up the job execution driver.
    module = importlib.import_module('spearmint.driver.' + options.driver)
    driver = module.init()

    # Loop until we run out of jobs.
    while attempt_dispatch(experiment_config, expt_dir, chooser, driver, options):
        # This is polling frequency. A higher frequency means that the algorithm
        # picks up results more quickly after they finish, but also significantly
        # increases overhead.
        time.sleep(options.polling_time)
def main(options=None, experiment_config=None, expt_dir=None):
    """Entry point for experiment dispatch.

    If ``options`` is None, arguments are read from ``sys.argv`` via
    ``parse_args()``; otherwise ``options``/``experiment_config``/``expt_dir``
    are assumed to be supplied by an external caller.

    Two dispatch modes, selected by ``options.jobs_per_node``:
      * != -1 (hybrid mode): batches of local jobs are forked and waited on,
        and work is also pushed to distant nodes via a second driver.
      * == -1: a simple local polling loop until ``attempt_dispatch`` reports
        no more jobs.
    """
    # If nothing given, get arguments from sys.argv. Otherwise they are
    # provided by an external caller.
    if options is None:  # was `== None`; identity test is the correct idiom
        (options, args) = parse_args()

        # Single-job mode: run the one job described by --job and stop.
        if options.job:
            job_runner(load_job(options.job))
            return 0

        experiment_config = args[0]
        expt_dir = os.path.dirname(os.path.realpath(experiment_config))

    log("Using experiment configuration: " + str(experiment_config))
    log("experiment dir: " + expt_dir)

    if not os.path.exists(expt_dir):
        log("Cannot find experiment directory '%s'. "
            "Aborting." % (expt_dir))
        sys.exit(-1)

    check_experiment_dirs(expt_dir)

    # Load up the chooser module.
    module = load_module('chooser', options.chooser_module)
    chooser = module.init(expt_dir, options.chooser_args)

    if options.web_status:
        web_proc = start_web_view(options, experiment_config, chooser)

    # Load up the job execution driver.
    module = load_module('driver', options.driver)
    driver = module.init(run_func=options.run_func)

    # A second, "distant" driver is only needed in hybrid mode.
    if options.jobs_per_node != -1:
        module = load_module('driver', options.distant_driver)
        distant_driver = module.init(**options.distant_driver_params)
    else:
        distant_driver = None

    # jobs_per_node is used for hybrid jobs.
    if options.jobs_per_node != -1:
        start_time = time.time()
        total_time = 0
        last_exp_time = 0
        loops = 0
        while True:
            # Hand off to distant nodes either when more than one distant
            # node is requested, or when the projected time for another
            # local batch would exceed the 20-hour budget — TODO confirm
            # that 20*60*60 is a wall-clock limit of the scheduler.
            if options.nb_dist_nodes != 1 or (total_time + 1.5 * last_exp_time > 20 * 60 * 60):
                # Launch new distant job without selecting any experiment;
                # they will be selected on the distant node.
                log("Launching on new distant nodes.")
                for i in range(options.nb_dist_nodes):
                    # Only the first execution should launch more than one
                    # distributed job (we return right after this loop).
                    out = dispatch_empty_job(expt_dir, distant_driver, options)
                    if out == 0:
                        raise Exception("Error trying to dispatch empty job with distant driver.")
                return
            else:
                pids = []
                for i in range(options.jobs_per_node):
                    out, pid = attempt_dispatch(experiment_config, expt_dir,
                                                chooser, driver, options)
                    if out == 0:
                        break  # stop the local dispatch loop.
                    pids.append(pid)
                if not pids:
                    # We are done, no more processes launched.
                    break
                # Wait for all local jobs.
                log("Waiting for local processes.")
                for pid in pids:
                    try:
                        os.waitpid(pid, 0)
                    except OSError:
                        # Was a bare `except:`; narrow to OSError (covers
                        # ECHILD when the child was already reaped) so that
                        # KeyboardInterrupt etc. still propagate.
                        pass
                loops += 1
                last_exp_time = time.time() - total_time - start_time
                total_time = time.time() - start_time
                # Fixed an accidental backslash line-continuation inside the
                # format string that injected a run of literal spaces.
                log("All processes done executing %i times (this batch took "
                    "%f mins, total time: %f mins)."
                    % (loops, last_exp_time / 60, total_time / 60))
    else:
        # This process won't end until we run out of jobs or time.
        while True:
            out, _ = attempt_dispatch(experiment_config, expt_dir,
                                      chooser, driver, options)
            if out == 0:
                break
            # This is polling frequency. A higher frequency means that the
            # algorithm picks up results more quickly after they finish, but
            # also significantly increases overhead.
            time.sleep(options.polling_time)