Exemplo n.º 1
0
def libE_local(sim_specs, gen_specs, exit_criteria,
               persis_info, alloc_specs, libE_specs, H0):
    """Main routine for thread/process launch of libE.

    Reads ``libE_specs['nworkers']``, validates the inputs with
    ``check_inputs``, registers the local hostname with the executor when one
    is set, builds the history, starts the worker team and then runs the
    generic manager.

    Returns whatever ``libE_manager`` returns.
    """
    nworkers = libE_specs['nworkers']
    check_inputs(libE_specs, alloc_specs, sim_specs, gen_specs, exit_criteria, H0)

    # Register this host with the executor, if an executor has been set
    executor = Executor.executor
    if executor is not None:
        executor.add_comm_info(libE_nodes=[socket.gethostname()],
                               serial_setup=True)

    history = History(alloc_specs, sim_specs, gen_specs, exit_criteria, H0)

    # Workers are started before the manager's logging is configured
    worker_comms = start_proc_team(nworkers, sim_specs, gen_specs, libE_specs)

    log_files_disabled = libE_specs.get('disable_log_files', False)
    if not log_files_disabled:
        manager_logging_config()

    def cleanup():
        "Handler to clean up comms team."
        # 'worker_timeout' falls back to 1 when the user did not set it
        grace = libE_specs.get('worker_timeout', 1)
        kill_proc_team(worker_comms, timeout=grace)

    # Hand everything over to the generic manager
    result = libE_manager(worker_comms, sim_specs, gen_specs, exit_criteria,
                          persis_info, alloc_specs, libE_specs, history,
                          on_cleanup=cleanup)
    return result
Exemplo n.º 2
0
def libE_mpi_manager(mpi_comm, sim_specs, gen_specs, exit_criteria,
                     persis_info, alloc_specs, libE_specs, H0):
    """Manager routine run at rank 0.

    Builds the history, creates one ``MainMPIComm`` per rank from 1 up to
    ``mpi_comm.Get_size() - 1``, configures manager logging unless
    ``libE_specs['disable_log_files']`` is truthy, and runs the generic
    manager with an abort handler that calls ``comms_abort`` on ``mpi_comm``.

    Returns whatever ``libE_manager`` returns.
    """
    from libensemble.comms.mpi import MainMPIComm

    history = History(alloc_specs, sim_specs, gen_specs, exit_criteria, H0)

    # Launch worker team: one communicator per non-zero rank
    n_ranks = mpi_comm.Get_size()
    worker_comms = []
    for rank in range(1, n_ranks):
        worker_comms.append(MainMPIComm(mpi_comm, rank))

    log_files_disabled = libE_specs.get('disable_log_files', False)
    if not log_files_disabled:
        manager_logging_config()

    def on_abort():
        "Shut down MPI on error."
        comms_abort(mpi_comm)

    # Hand everything over to the generic manager
    result = libE_manager(worker_comms, sim_specs, gen_specs, exit_criteria,
                          persis_info, alloc_specs, libE_specs, history,
                          on_abort=on_abort)
    return result
Exemplo n.º 3
0
def libE_local(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs,
               libE_specs, H0):
    """Main routine for thread/process launch of libE.

    Reads ``libE_specs['nworkers']``, validates the inputs with
    ``check_inputs``, registers the local hostname with the job controller
    when one is set, builds the history, starts the worker team, configures
    manager logging and then runs the generic manager.

    Returns whatever ``libE_manager`` returns.
    """
    nworkers = libE_specs['nworkers']
    check_inputs(libE_specs, alloc_specs, sim_specs, gen_specs, exit_criteria,
                 H0)

    # Register this host with the job controller, if one has been set.
    # NOTE(review): the hostname is passed as a bare string, not a list --
    # confirm this matches what add_comm_info expects for libE_nodes.
    controller = JobController.controller
    if controller is not None:
        controller.add_comm_info(libE_nodes=socket.gethostname(),
                                 serial_setup=True)

    history = History(alloc_specs, sim_specs, gen_specs, exit_criteria, H0)

    # Workers are started first; manager logging is always configured here
    worker_comms = start_proc_team(nworkers, sim_specs, gen_specs, libE_specs)
    manager_logging_config()

    def cleanup():
        "Handler to clean up comms team."
        # NOTE(review): no fallback is given, so the timeout is None when
        # 'worker_timeout' is unset -- confirm kill_proc_team handles that.
        grace = libE_specs.get('worker_timeout')
        kill_proc_team(worker_comms, timeout=grace)

    # Hand everything over to the generic manager
    result = libE_manager(worker_comms, sim_specs, gen_specs, exit_criteria,
                          persis_info, alloc_specs, libE_specs, history,
                          on_cleanup=cleanup)
    return result
Exemplo n.º 4
0
def hist_setup2(sim_max=10, H0_in=None):
    """Build a ``History`` together with the specs used to create it.

    Parameters
    ----------
    sim_max:
        Forwarded to ``make_criteria_and_specs_1`` as ``simx``.
    H0_in: optional
        Prior history handed to ``History`` as ``H0``.  When omitted, a fresh
        empty list is used for each call (a literal ``[]`` default would be a
        single list object shared by every call).

    Returns
    -------
    tuple
        ``(hist, sim_specs, gen_specs, exit_criteria, alloc_specs)``.
    """
    sim_specs, gen_specs, exit_criteria = make_criteria_and_specs_1(
        simx=sim_max)
    alloc_specs = {
        'alloc_f': give_sim_work_first,
        'out': [('allocated', bool)]
    }  # default for libE
    H0 = [] if H0_in is None else H0_in
    hist = History(alloc_specs, sim_specs, gen_specs, exit_criteria, H0)
    return hist, sim_specs, gen_specs, exit_criteria, alloc_specs
def test_decide_work_and_resources():
    """The allocation function must return no work when all workers are active."""
    sim_specs, gen_specs, exit_criteria = setup.make_criteria_and_specs_1()
    history = History(al, sim_specs, gen_specs, exit_criteria, H0)

    manager = man.Manager(history, libE_specs, al, sim_specs, gen_specs,
                          exit_criteria)
    worker_table = manager.W

    # Flag the workers as active so that none is available for new work
    worker_table['active'] = 1
    allocate = al['alloc_f']
    work, _ = allocate(worker_table, history.H, sim_specs, gen_specs, al, {})
    assert len(work) == 0
Exemplo n.º 6
0
def libE_tcp_mgr(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs,
                 libE_specs, H0):
    """Main routine for TCP multiprocessing launch of libE at manager.

    Builds the history, opens a ``ServerQCommManager`` on the configured
    port, starts the worker team through ``libE_tcp_start_team`` and runs the
    generic manager inside the server's context.

    The following ``libE_specs`` keys are read here:

    ``'nworkers'`` or ``'workers'``
        One of the two is required.  ``'nworkers'`` takes precedence; with
        ``'workers'`` the worker count is ``len(libE_specs['workers'])``.
    ``'ip'``
        Falls back to ``get_ip()`` when missing or falsy.
    ``'port'``
        Defaults to 0, in which case the port is read back from the server's
        address once it is running.
    ``'authkey'``
        Defaults to the value returned by ``libE_tcp_authkey()``.
    ``'disable_log_files'``
        When truthy, ``manager_logging_config`` is not called.
    ``'worker_timeout'``
        Passed as ``timeout`` to ``launcher.cancel`` during cleanup.

    Returns
    -------
    Whatever ``libE_manager`` returns.

    Raises
    ------
    ValueError
        If ``libE_specs`` contains neither ``'nworkers'`` nor ``'workers'``.
    """

    hist = History(alloc_specs, sim_specs, gen_specs, exit_criteria, H0)

    # Set up a worker launcher
    launchf = libE_tcp_worker_launcher(libE_specs)

    # Get worker launch parameters and fill in defaults for TCP/IP conn
    if 'nworkers' in libE_specs:
        workers = None
        nworkers = libE_specs['nworkers']
    elif 'workers' in libE_specs:
        workers = libE_specs['workers']
        nworkers = len(workers)
    else:
        # Fail before the server is started, with an actionable message,
        # instead of hitting an unbound local name further down.
        raise ValueError(
            "libE_specs must contain either 'nworkers' or 'workers' "
            "for TCP comms")
    ip = libE_specs.get('ip', None) or get_ip()
    port = libE_specs.get('port', 0)
    authkey = libE_specs.get('authkey', libE_tcp_authkey())

    with ServerQCommManager(port, authkey.encode('utf-8')) as manager:

        # Get port if needed because of auto-assignment
        if port == 0:
            _, port = manager.address

        if not libE_specs.get('disable_log_files', False):
            manager_logging_config()

        logger.info("Launched server at ({}, {})".format(ip, port))

        # Launch worker team
        worker_procs, wcomms =\
            libE_tcp_start_team(manager, nworkers, workers,
                                ip, port, authkey, launchf)

        def cleanup():
            "Handler to clean up launched team."
            for wp in worker_procs:
                launcher.cancel(wp, timeout=libE_specs.get('worker_timeout'))

        # Run generic manager
        return libE_manager(wcomms,
                            sim_specs,
                            gen_specs,
                            exit_criteria,
                            persis_info,
                            alloc_specs,
                            libE_specs,
                            hist,
                            on_cleanup=cleanup)
Exemplo n.º 7
0
def libE_local(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs,
               libE_specs, H0):
    """Main routine for thread/process launch of libE.

    Reads ``libE_specs['nworkers']``, validates the inputs with
    ``check_inputs``, registers the local hostname with the executor when one
    is set, builds the history, forces the ``'fork'`` multiprocessing start
    method on macOS, starts the worker team and then runs the generic manager.

    The following ``libE_specs`` keys are read here:

    ``'nworkers'``
        Required; number of workers to start.
    ``'disable_log_files'``
        When truthy, ``manager_logging_config`` is not called.
    ``'worker_timeout'``
        Timeout passed to ``kill_proc_team`` during cleanup (default 1).

    Returns
    -------
    Whatever ``libE_manager`` returns.
    """
    # Imported locally so this routine does not depend on a module-level import
    import platform

    nworkers = libE_specs['nworkers']
    check_inputs(libE_specs, alloc_specs, sim_specs, gen_specs, exit_criteria,
                 H0)

    exctr = Executor.executor
    if exctr is not None:
        local_host = [socket.gethostname()]
        exctr.add_comm_info(libE_nodes=local_host, serial_setup=True)

    hist = History(alloc_specs, sim_specs, gen_specs, exit_criteria, H0)

    # On Python 3.8 on macOS, the default start method for new processes was
    #  switched to 'spawn' by default due to 'fork' potentially causing crashes.
    # These crashes haven't yet been observed with libE, but with 'spawn' runs,
    #  warnings about leaked semaphore objects are displayed instead.
    # The next several statements enforce 'fork' on macOS (Python 3.8).
    # platform.system() is used rather than os.uname(), because os.uname()
    #  is only available on Unix and would raise AttributeError elsewhere.
    if platform.system() == 'Darwin':
        from multiprocessing import set_start_method
        set_start_method('fork', force=True)

    # Launch worker team and set up logger
    wcomms = start_proc_team(nworkers, sim_specs, gen_specs, libE_specs)

    if not libE_specs.get('disable_log_files', False):
        manager_logging_config()

    # Set up cleanup routine to shut down worker team
    def cleanup():
        "Handler to clean up comms team."
        kill_proc_team(wcomms, timeout=libE_specs.get('worker_timeout', 1))

    # Run generic manager
    return libE_manager(wcomms,
                        sim_specs,
                        gen_specs,
                        exit_criteria,
                        persis_info,
                        alloc_specs,
                        libE_specs,
                        hist,
                        on_cleanup=cleanup)
Exemplo n.º 8
0
def libE(sim_specs,
         gen_specs,
         exit_criteria,
         persis_info=None,
         alloc_specs=None,
         libE_specs=None,
         H0=None):
    """
    libE(sim_specs, gen_specs, exit_criteria, persis_info=None, alloc_specs=None, libE_specs=None, H0=None)

    This is the outer libEnsemble routine. If the rank in libE_specs['comm'] is
    0, manager_main is run. Otherwise, worker_main is run.

    If an exception is encountered by the manager or workers, the history array
    is dumped to file and MPI abort is called.

    Parameters
    ----------

    sim_specs: :obj:`dict`

        Specifications for the simulation function
        :doc:`(example)<data_structures/sim_specs>`

    gen_specs: :obj:`dict`

        Specifications for the generator function
        :doc:`(example)<data_structures/gen_specs>`

    exit_criteria: :obj:`dict`

        Tell libEnsemble when to stop a run
        :doc:`(example)<data_structures/exit_criteria>`

    persis_info: :obj:`dict`, optional

        Persistent information to be passed between user functions
        :doc:`(example)<data_structures/persis_info>`.
        Defaults to a new empty dict for each call.

    alloc_specs: :obj:`dict`, optional

        Specifications for the allocation function
        :doc:`(example)<data_structures/alloc_specs>`.
        Defaults to
        ``{'alloc_f': give_sim_work_first, 'out': [('allocated', bool)]}``.

    libE_specs: :obj:`dict`, optional

        Specifications for libEnsemble
        :doc:`(example)<data_structures/libE_specs>`.
        Defaults to ``{'comm': MPI.COMM_WORLD, 'color': 0}``.

    H0: optional

        A previous libEnsemble history to be prepended to the history in the
        current libEnsemble run
        :doc:`(example)<data_structures/history_array>`.
        Defaults to a new empty list for each call.

    Returns
    -------

    H:

        History array storing rows for each point
        :doc:`(example)<data_structures/history_array>`.
        On ranks other than 0 this is an empty list.

    persis_info: :obj:`dict`

        Final state of persistent information
        :doc:`(example)<data_structures/persis_info>`.
        On ranks other than 0 this is the value that was passed in.

    exit_flag: :obj:`int`

        Flag containing job status: 0 = No errors,
        1 = Exception occurred and MPI aborted,
        2 = Manager timed out and ended simulation.
        On ranks other than 0 this is an empty list.

    """
    # Defaults are built per call: mutable default arguments are created once
    # at definition time and would be shared (and possibly mutated) across
    # calls.
    if persis_info is None:
        persis_info = {}
    if alloc_specs is None:
        alloc_specs = {
            'alloc_f': give_sim_work_first,
            'out': [('allocated', bool)]
        }
    if libE_specs is None:
        libE_specs = {'comm': MPI.COMM_WORLD, 'color': 0}
    if H0 is None:
        H0 = []

    # Placeholder return values for non-manager ranks; two distinct lists so
    # the returned objects are not aliases of each other.
    H, exit_flag = [], []
    libE_specs = check_inputs(libE_specs, alloc_specs, sim_specs, gen_specs,
                              exit_criteria, H0)

    comm = libE_specs['comm']
    is_manager = comm.Get_rank() == 0

    # Only the manager creates the stat directory; everyone waits for it
    if is_manager:
        CalcInfo.make_statdir()
    comm.Barrier()

    if is_manager:
        hist = History(alloc_specs, sim_specs, gen_specs, exit_criteria, H0)
        try:
            persis_info, exit_flag = manager_main(hist, libE_specs,
                                                  alloc_specs, sim_specs,
                                                  gen_specs, exit_criteria,
                                                  persis_info)
        except Exception:

            # Manager exceptions are fatal
            eprint(traceback.format_exc())
            eprint("\nManager exception raised .. aborting ensemble:\n")

            # Save what has been evaluated so far before aborting
            eprint(
                "\nDumping ensemble history with {} sims evaluated:\n".format(
                    hist.sim_count))
            filename = 'libE_history_at_abort_' + str(hist.sim_count) + '.npy'
            np.save(filename, hist.trim_H())
            sys.stdout.flush()
            sys.stderr.flush()
            comms_abort(comm)

        else:
            logger.debug("Manager exiting")
            print(comm.Get_size(), exit_criteria)
            sys.stdout.flush()

    else:
        try:
            worker_main(libE_specs, sim_specs, gen_specs)
        except Exception:
            eprint(
                "\nWorker exception raised on rank {} .. aborting ensemble:\n".
                format(comm.Get_rank()))
            eprint(traceback.format_exc())
            sys.stdout.flush()
            sys.stderr.flush()

            # First try to signal manager to dump history
            comms_signal_abort_to_man(comm)
        else:
            logger.debug("Worker {} exiting".format(comm.Get_rank()))

    # Create calc summary file
    comm.Barrier()
    if is_manager:
        CalcInfo.merge_statfiles()
        H = hist.trim_H()

    return H, persis_info, exit_flag