def libE_local(sim_specs, gen_specs, exit_criteria,
               persis_info, alloc_specs, libE_specs, H0):
    """Run libE with a locally launched thread/process worker team.

    Validates the inputs, passes the local hostname to the executor (when
    ``Executor.executor`` is set), builds the history, starts
    ``libE_specs['nworkers']`` workers and hands over to the generic manager.

    Returns whatever ``libE_manager`` returns.
    """
    worker_count = libE_specs['nworkers']
    check_inputs(libE_specs, alloc_specs, sim_specs, gen_specs,
                 exit_criteria, H0)

    executor = Executor.executor
    if executor is not None:
        hostnames = [socket.gethostname()]
        executor.add_comm_info(libE_nodes=hostnames, serial_setup=True)

    hist = History(alloc_specs, sim_specs, gen_specs, exit_criteria, H0)

    # Start the workers first, then configure manager-side logging unless
    # log files were explicitly disabled.
    wcomms = start_proc_team(worker_count, sim_specs, gen_specs, libE_specs)
    log_files_disabled = libE_specs.get('disable_log_files', False)
    if not log_files_disabled:
        manager_logging_config()

    def cleanup():
        """Shut down the worker team (timeout defaults to 1)."""
        kill_proc_team(wcomms, timeout=libE_specs.get('worker_timeout', 1))

    # The generic manager drives the run and calls cleanup when done.
    return libE_manager(wcomms, sim_specs, gen_specs, exit_criteria,
                        persis_info, alloc_specs, libE_specs, hist,
                        on_cleanup=cleanup)
def libE_mpi_manager(mpi_comm, sim_specs, gen_specs, exit_criteria,
                     persis_info, alloc_specs, libE_specs, H0):
    """Manager routine run at rank 0 of an MPI launch.

    Builds the history, creates one ``MainMPIComm`` per rank from 1 up to
    ``mpi_comm.Get_size() - 1``, optionally configures manager logging and
    runs the generic manager with an abort handler that calls
    ``comms_abort`` on ``mpi_comm``.

    Returns whatever ``libE_manager`` returns.
    """
    from libensemble.comms.mpi import MainMPIComm

    hist = History(alloc_specs, sim_specs, gen_specs, exit_criteria, H0)

    # Launch worker team: one comm object for every non-zero rank
    wcomms = []
    for rank in range(1, mpi_comm.Get_size()):
        wcomms.append(MainMPIComm(mpi_comm, rank))

    log_files_disabled = libE_specs.get('disable_log_files', False)
    if not log_files_disabled:
        manager_logging_config()

    def on_abort():
        """Shut down MPI on error."""
        comms_abort(mpi_comm)

    # Run generic manager
    return libE_manager(wcomms, sim_specs, gen_specs, exit_criteria,
                        persis_info, alloc_specs, libE_specs, hist,
                        on_abort=on_abort)
def libE_local(sim_specs, gen_specs, exit_criteria,
               persis_info, alloc_specs, libE_specs, H0):
    """Run libE with a locally launched thread/process worker team.

    Validates the inputs, passes the local hostname to the job controller
    (when ``JobController.controller`` is set), builds the history, starts
    ``libE_specs['nworkers']`` workers, configures manager logging and hands
    over to the generic manager.

    Returns whatever ``libE_manager`` returns.
    """
    worker_count = libE_specs['nworkers']
    check_inputs(libE_specs, alloc_specs, sim_specs, gen_specs,
                 exit_criteria, H0)

    controller = JobController.controller
    if controller is not None:
        # The hostname is passed as a plain string here.
        hostname = socket.gethostname()
        controller.add_comm_info(libE_nodes=hostname, serial_setup=True)

    hist = History(alloc_specs, sim_specs, gen_specs, exit_criteria, H0)

    # Start the workers, then set up the manager-side logger.
    wcomms = start_proc_team(worker_count, sim_specs, gen_specs, libE_specs)
    manager_logging_config()

    def cleanup():
        """Shut down the worker team.

        The timeout is ``libE_specs['worker_timeout']`` or ``None`` when the
        key is absent.
        """
        kill_proc_team(wcomms, timeout=libE_specs.get('worker_timeout'))

    # The generic manager drives the run and calls cleanup when done.
    return libE_manager(wcomms, sim_specs, gen_specs, exit_criteria,
                        persis_info, alloc_specs, libE_specs, hist,
                        on_cleanup=cleanup)
def hist_setup2(sim_max=10, H0_in=None):
    """Build a ``History`` plus the specs used to create it.

    Parameters
    ----------
    sim_max:
        Forwarded to ``make_criteria_and_specs_1`` as ``simx``.
    H0_in:
        Initial history handed to ``History``. When omitted, a fresh empty
        list is used on every call (no mutable default shared across calls).

    Returns
    -------
    tuple
        ``(hist, sim_specs, gen_specs, exit_criteria, alloc_specs)``.
    """
    if H0_in is None:
        H0_in = []

    sim_specs, gen_specs, exit_criteria = make_criteria_and_specs_1(
        simx=sim_max)

    # Default allocation specs for libE
    alloc_specs = {
        'alloc_f': give_sim_work_first,
        'out': [('allocated', bool)],
    }

    hist = History(alloc_specs, sim_specs, gen_specs, exit_criteria, H0_in)
    return hist, sim_specs, gen_specs, exit_criteria, alloc_specs
def test_decide_work_and_resources():
    """The allocation function must hand out no work when all workers are active."""
    sim_specs, gen_specs, exit_criteria = setup.make_criteria_and_specs_1()
    hist = History(al, sim_specs, gen_specs, exit_criteria, H0)
    manager = man.Manager(hist, libE_specs, al, sim_specs, gen_specs,
                          exit_criteria)

    # Mark every worker as active before asking for work.
    worker_table = manager.W
    worker_table['active'] = 1

    alloc_f = al['alloc_f']
    work, _ = alloc_f(worker_table, hist.H, sim_specs, gen_specs, al, {})
    assert len(work) == 0
def libE_tcp_mgr(sim_specs, gen_specs, exit_criteria,
                 persis_info, alloc_specs, libE_specs, H0):
    """Manager-side main routine for a TCP multiprocessing launch of libE.

    Builds the history and a worker launcher, reads the connection settings
    from ``libE_specs`` (``ip``, ``port``, ``authkey``), opens a
    ``ServerQCommManager``, launches the worker team and runs the generic
    manager inside that server context.

    Worker selection: ``libE_specs['nworkers']`` is used when present;
    otherwise ``libE_specs['workers']`` is used and its length gives the
    worker count.
    NOTE(review): when neither key is present the two locals are never
    assigned, so launching the team fails with ``UnboundLocalError``.

    Returns whatever ``libE_manager`` returns.
    """
    hist = History(alloc_specs, sim_specs, gen_specs, exit_criteria, H0)

    # Set up a worker launcher
    launchf = libE_tcp_worker_launcher(libE_specs)

    # Worker launch parameters
    if 'nworkers' in libE_specs:
        worker_list = None
        worker_count = libE_specs['nworkers']
    elif 'workers' in libE_specs:
        worker_list = libE_specs['workers']
        worker_count = len(worker_list)

    # Fill in defaults for the TCP/IP connection. A port of 0 means the
    # actual port is read back from the server once it is up.
    ip = libE_specs.get('ip') or get_ip()
    port = libE_specs.get('port', 0)
    fallback_key = libE_tcp_authkey()
    authkey = libE_specs.get('authkey', fallback_key)

    with ServerQCommManager(port, authkey.encode('utf-8')) as manager:
        if port == 0:
            port = manager.address[1]

        log_files_disabled = libE_specs.get('disable_log_files', False)
        if not log_files_disabled:
            manager_logging_config()

        logger.info("Launched server at ({}, {})".format(ip, port))

        # Launch worker team
        worker_procs, wcomms = libE_tcp_start_team(
            manager, worker_count, worker_list, ip, port, authkey, launchf)

        def cleanup():
            """Cancel every launched worker process."""
            for proc in worker_procs:
                launcher.cancel(proc,
                                timeout=libE_specs.get('worker_timeout'))

        # Run generic manager
        return libE_manager(wcomms, sim_specs, gen_specs, exit_criteria,
                            persis_info, alloc_specs, libE_specs, hist,
                            on_cleanup=cleanup)
def libE_local(sim_specs, gen_specs, exit_criteria,
               persis_info, alloc_specs, libE_specs, H0):
    """Main routine for thread/process launch of libE.

    Validates the inputs, passes the local hostname to the executor (when
    ``Executor.executor`` is set), builds the history, forces the ``fork``
    start method on macOS, starts ``libE_specs['nworkers']`` workers and
    hands over to the generic manager.

    Returns whatever ``libE_manager`` returns.
    """
    # Local import: only needed for the operating-system check below.
    import platform

    nworkers = libE_specs['nworkers']
    check_inputs(libE_specs, alloc_specs, sim_specs, gen_specs,
                 exit_criteria, H0)

    exctr = Executor.executor
    if exctr is not None:
        local_host = [socket.gethostname()]
        exctr.add_comm_info(libE_nodes=local_host, serial_setup=True)

    hist = History(alloc_specs, sim_specs, gen_specs, exit_criteria, H0)

    # On Python 3.8 on macOS, the default start method for new processes was
    # switched to 'spawn' by default due to 'fork' potentially causing crashes.
    # These crashes haven't yet been observed with libE, but with 'spawn' runs,
    # warnings about leaked semaphore objects are displayed instead.
    # The next several statements enforce 'fork' on macOS (Python 3.8).
    # platform.system() is used instead of os.uname() because os.uname() is
    # only available on Unix and would raise AttributeError elsewhere.
    if platform.system() == 'Darwin':
        from multiprocessing import set_start_method
        set_start_method('fork', force=True)

    # Launch worker team and set up logger
    wcomms = start_proc_team(nworkers, sim_specs, gen_specs, libE_specs)

    if not libE_specs.get('disable_log_files', False):
        manager_logging_config()

    # Set up cleanup routine to shut down worker team
    def cleanup():
        """Handler to clean up comms team (timeout defaults to 1)."""
        kill_proc_team(wcomms, timeout=libE_specs.get('worker_timeout', 1))

    # Run generic manager
    return libE_manager(wcomms, sim_specs, gen_specs, exit_criteria,
                        persis_info, alloc_specs, libE_specs, hist,
                        on_cleanup=cleanup)
def libE(sim_specs, gen_specs, exit_criteria, persis_info=None,
         alloc_specs=None, libE_specs=None, H0=None):
    """
    libE(sim_specs, gen_specs, exit_criteria, persis_info={}, alloc_specs={'alloc_f': give_sim_work_first, 'out':[('allocated',bool)]}, libE_specs={'comm': MPI.COMM_WORLD, 'color': 0}, H0 =[])

    This is the outer libEnsemble routine. If the rank in libE_specs['comm'] is
    0, manager_main is run. Otherwise, worker_main is run.

    If an exception is encountered by the manager or workers, the history array
    is dumped to file and MPI abort is called.

    Optional arguments that are omitted (or passed as ``None``) are replaced
    by the defaults shown in the signature above; the default objects are
    created afresh on every call.

    Parameters
    ----------

    sim_specs: :obj:`dict`

        Specifications for the simulation function
        :doc:`(example)<data_structures/sim_specs>`

    gen_specs: :obj:`dict`

        Specifications for the generator function
        :doc:`(example)<data_structures/gen_specs>`

    exit_criteria: :obj:`dict`

        Tell libEnsemble when to stop a run
        :doc:`(example)<data_structures/exit_criteria>`

    persis_info: :obj:`dict`, optional

        Persistent information to be passed between user functions
        :doc:`(example)<data_structures/persis_info>`

    alloc_specs: :obj:`dict`, optional

        Specifications for the allocation function
        :doc:`(example)<data_structures/alloc_specs>`

    libE_specs: :obj:`dict`, optional

        Specifications for libEnsemble
        :doc:`(example)<data_structures/libE_specs>`

    H0: :obj:`dict`, optional

        A previous libEnsemble history to be prepended to the history in the
        current libEnsemble run
        :doc:`(example)<data_structures/history_array>`

    Returns
    -------

    H: :obj:`dict`

        History array storing rows for each point. Only set on rank 0;
        other ranks return an empty list.
        :doc:`(example)<data_structures/history_array>`

    persis_info: :obj:`dict`

        Final state of persistent information
        :doc:`(example)<data_structures/persis_info>`

    exit_flag: :obj:`int`

        Flag containing job status: 0 = No errors,
        1 = Exception occurred and MPI aborted,
        2 = Manager timed out and ended simulation.
        Stays an empty list unless it is set by the manager on rank 0.

    """
    # Create the defaults per call so that no mutable default object is
    # shared (and possibly mutated) across invocations.
    if persis_info is None:
        persis_info = {}
    if alloc_specs is None:
        alloc_specs = {'alloc_f': give_sim_work_first,
                       'out': [('allocated', bool)]}
    if libE_specs is None:
        libE_specs = {'comm': MPI.COMM_WORLD, 'color': 0}
    if H0 is None:
        H0 = []

    H = []
    exit_flag = []
    libE_specs = check_inputs(libE_specs, alloc_specs, sim_specs, gen_specs,
                              exit_criteria, H0)

    comm = libE_specs['comm']
    is_manager = comm.Get_rank() == 0

    # Rank 0 creates the statistics directory before any rank proceeds.
    if is_manager:
        CalcInfo.make_statdir()
    comm.Barrier()

    if is_manager:
        hist = History(alloc_specs, sim_specs, gen_specs, exit_criteria, H0)
        try:
            persis_info, exit_flag = manager_main(
                hist, libE_specs, alloc_specs, sim_specs, gen_specs,
                exit_criteria, persis_info)
        except Exception:
            # Manager exceptions are fatal: report, dump the history gathered
            # so far to a .npy file, then abort the communicator.
            eprint(traceback.format_exc())
            eprint("\nManager exception raised .. aborting ensemble:\n")
            eprint(
                "\nDumping ensemble history with {} sims evaluated:\n".format(
                    hist.sim_count))
            filename = 'libE_history_at_abort_' + str(hist.sim_count) + '.npy'
            np.save(filename, hist.trim_H())
            sys.stdout.flush()
            sys.stderr.flush()
            comms_abort(comm)
        else:
            logger.debug("Manager exiting")
            print(comm.Get_size(), exit_criteria)
            sys.stdout.flush()
    else:
        try:
            worker_main(libE_specs, sim_specs, gen_specs)
        except Exception:
            eprint(
                "\nWorker exception raised on rank {} .. aborting ensemble:\n".
                format(comm.Get_rank()))
            eprint(traceback.format_exc())
            sys.stdout.flush()
            sys.stderr.flush()

            # Signal the manager first so that it can dump the history,
            # rather than aborting directly from the worker.
            comms_signal_abort_to_man(comm)
        else:
            logger.debug("Worker {} exiting".format(comm.Get_rank()))

    # Create calc summary file
    comm.Barrier()
    if is_manager:
        CalcInfo.merge_statfiles()
        H = hist.trim_H()

    return H, persis_info, exit_flag