Python EmceeOptimizer 예제들, espei.optimizers.opt_mcmc.EmceeOptimizer Python 예제들

예제 #1

0

파일 보기

def test_lnprob_calculates_multi_phase_probability_for_success(datasets_db):
    """predict() reproduces the reference log-probability for Cu-Mg ZPF data.

    With ``VV0001`` set to 10 the result must be real and close to the stored
    reference value; with ``VV0001`` set to 1e7 it must differ from it.
    """
    reference_lnprob = -31.309645520830344
    symbol = 'VV0001'
    components = ['CU', 'MG', 'VA']
    phase_names = ['LIQUID', 'FCC_A1', 'HCP_A3', 'LAVES_C15', 'CUMG2']

    dbf = Database.from_string(CU_MG_TDB, fmt='tdb')
    datasets_db.insert(CU_MG_DATASET_ZPF_WORKING)
    # The ZPF data is built with the value read from the database symbol.
    starting_value = dbf.symbols[symbol].args[0].expr
    zpf_data = get_zpf_data(dbf, components, phase_names, datasets_db,
                            {symbol: starting_value})
    zpf_kwargs = {'zpf_data': zpf_data, 'data_weight': 1.0}
    optimizer = EmceeOptimizer(dbf)

    def lnprob_at(value):
        # A new prior object is created for every evaluation.
        return optimizer.predict([value],
                                 prior_rvs=[rv_zero()],
                                 symbols_to_fit=[symbol],
                                 zpf_kwargs=zpf_kwargs)

    nominal = lnprob_at(10)
    assert np.isreal(nominal)
    assert np.isclose(nominal, reference_lnprob, rtol=1e-4)

    perturbed = lnprob_at(10000000)
    assert not np.isclose(perturbed, reference_lnprob, rtol=1e-6)

예제 #2

0

파일 보기

파일: test_mcmc.py 프로젝트: npaulson/ESPEI

def test_lnprob_calculates_multi_phase_probability_for_success(datasets_db):
    """predict() reproduces the reference log-probability for Cu-Mg ZPF data.

    This variant builds the phase models and callables explicitly and hands
    them to predict() through ``zpf_kwargs``.
    """
    reference_lnprob = -31.309645520830344
    symbol = 'VV0001'
    components = ['CU', 'MG', 'VA']
    phase_names = ['LIQUID', 'FCC_A1', 'HCP_A3', 'LAVES_C15', 'CUMG2']

    dbf = Database.from_string(CU_MG_TDB, fmt='tdb')
    datasets_db.insert(CU_MG_DATASET_ZPF_WORKING)
    starting_value = dbf.symbols[symbol].args[0].expr
    phase_models = instantiate_models(dbf, components, phase_names,
                                      parameters={symbol: starting_value})
    callables = build_callables(dbf, components, phase_names, phase_models,
                                parameter_symbols=[symbol],
                                output='GM',
                                build_gradients=True,
                                build_hessians=False,
                                additional_statevars={v.N, v.P, v.T})

    zpf_kwargs = {
        'dbf': dbf,
        'phases': phase_names,
        'zpf_data': get_zpf_data(components, phase_names, datasets_db),
        'phase_models': phase_models,
        'callables': callables,
        'data_weight': 1.0,
    }
    optimizer = EmceeOptimizer(dbf)

    def lnprob_at(value):
        # A new prior object is created for every evaluation.
        return optimizer.predict([value],
                                 prior_rvs=[rv_zero()],
                                 symbols_to_fit=[symbol],
                                 zpf_kwargs=zpf_kwargs)

    nominal = lnprob_at(10)
    assert np.isreal(nominal)
    assert np.isclose(nominal, reference_lnprob, rtol=1e-6)

    perturbed = lnprob_at(10000000)
    assert not np.isclose(perturbed, reference_lnprob, rtol=1e-6)

예제 #3

0

파일 보기

파일: test_mcmc.py 프로젝트: npaulson/ESPEI

def test_emcee_opitmizer_can_restart(datasets_db):
    """fit() accepts a restart trace and produces a (2, 1, 1) sampler chain."""
    dbf = Database.from_string(CU_MG_TDB, fmt='tdb')
    datasets_db.insert(CU_MG_DATASET_ZPF_WORKING)
    optimizer = EmceeOptimizer(dbf)
    # Trace laid out as (chains, iterations, parameters) = (2, 10, 1), all -4.
    previous_trace = np.full((2, 10, 1), -4.0)
    optimizer.fit(['VV0001'],
                  datasets_db,
                  iterations=1,
                  chains_per_parameter=2,
                  restart_trace=previous_trace)
    chain_shape = optimizer.sampler.chain.shape
    assert chain_shape == (2, 1, 1)

예제 #4

0

파일 보기

파일: test_mcmc.py 프로젝트: npaulson/ESPEI

def test_lnprob_does_not_raise_on_ValueError(datasets_db):
    """predict() must return -np.inf here instead of raising.

    Per the test's name, the failure being absorbed is a ValueError coming
    from the equilibrium calculation.
    """
    dbf = Database.from_string(CU_MG_TDB, fmt='tdb')
    optimizer = EmceeOptimizer(dbf)
    components = ['CU', 'MG', 'VA']
    phase_names = ['LIQUID', 'FCC_A1', 'HCP_A3', 'LAVES_C15', 'CUMG2']
    datasets_db.insert(CU_MG_DATASET_ZPF_WORKING)
    zpf_kwargs = {
        'dbf': dbf,
        'phases': phase_names,
        'zpf_data': get_zpf_data(components, phase_names, datasets_db),
        'data_weight': 1.0,
    }
    lnprob = optimizer.predict([10],
                               prior_rvs=[rv_zero()],
                               symbols_to_fit=['VV0001'],
                               zpf_kwargs=zpf_kwargs)
    assert np.isneginf(lnprob)

예제 #5

0

파일 보기

파일: test_mcmc.py 프로젝트: npaulson/ESPEI

def test_parameter_initialization():
    """Deterministically initialized chains must equal the stored reference."""
    reference_chains = np.array([
        [9.81708401e-01, 9.39027722e+00, 1.08016748e+02, 9.13512881e+02],
        [1.03116874, 9.01412995, 112.79594345, 916.44725799],
        [1.00664662e+00, 1.07178898e+01, 9.63696718e+01, 1.36872292e+03],
        [1.07642366e+00, 1.16413520e+01, 8.71742457e+01, 9.61836382e+02]])
    start_values = np.array([1, 10, 100, 1000])
    optimizer = EmceeOptimizer(Database())
    # Positional 1 and 0.10 are presumably chains-per-parameter and the chain
    # standard deviation -- confirm against initialize_new_chains.
    generated = optimizer.initialize_new_chains(start_values, 1, 0.10, deterministic=True)
    matches = np.isclose(generated, reference_chains)
    assert np.all(matches)

예제 #6

0

파일 보기

파일: test_mcmc.py 프로젝트: bocklund/ESPEI

def test_equilibrium_thermochemical_correct_probability(datasets_db):
    """Integration test for equilibrium thermochemical error.

    Evaluates ``predict`` for two values of ``VV0017`` and compares each
    result with the log-pdf of a zero-centred normal distribution
    (scale 500) evaluated at ``value * 0.5 * 0.5``.
    """
    dbf = Database(CU_MG_TDB)
    opt = EmceeOptimizer(dbf)
    datasets_db.insert(CU_MG_EQ_HMR_LIQUID)
    ctx = setup_context(dbf, datasets_db, ['VV0017'])
    ctx.update(opt.get_priors(None, ['VV0017'], [0]))

    # NOTE(review): the 0.5 * 0.5 factor presumably comes from the composition
    # in the dataset -- confirm against CU_MG_EQ_HMR_LIQUID.
    prob = opt.predict(np.array([-31626.6]), **ctx)
    expected_prob = norm(loc=0, scale=500).logpdf([-31626.6 * 0.5 * 0.5]).sum()
    assert np.isclose(prob, expected_prob)

    # change to -40000
    # np.float64 replaces the np.float_ alias, which NumPy 2.0 removed.
    prob = opt.predict(np.array([-40000], dtype=np.float64), **ctx)
    expected_prob = norm(loc=0, scale=500).logpdf([-40000 * 0.5 * 0.5]).sum()
    assert np.isclose(prob, expected_prob)

예제 #7

0

파일 보기

def test_emcee_optimizer_can_restart(datasets_db):
    """fit() accepts a restart trace and produces a (2, 1, 1) sampler chain."""
    dbf = Database.from_string(CU_MG_TDB, fmt='tdb')
    datasets_db.insert(CU_MG_DATASET_ZPF_WORKING)
    optimizer = EmceeOptimizer(dbf)
    # 2 chains, 10 iterations, 1 parameter:
    # chain 0 runs -4..5 in steps of 1, chain 1 runs -6..12 in steps of 2.
    first_chain = [[step] for step in range(-4, 6)]
    second_chain = [[step] for step in range(-6, 14, 2)]
    previous_trace = np.array([first_chain, second_chain])
    optimizer.fit(['VV0001'],
                  datasets_db,
                  iterations=1,
                  chains_per_parameter=2,
                  restart_trace=previous_trace)
    chain_shape = optimizer.sampler.chain.shape
    assert chain_shape == (2, 1, 1)

예제 #8

0

파일 보기

def test_equilibrium_thermochemical_context_is_pickleable(datasets_db):
    """The equilibrium thermochemical context survives a pickle round trip.

    predict() must give (numerically close) results for the original context
    and for the copy obtained via ``pickle.dumps``/``pickle.loads``.
    """
    datasets_db.insert(CU_MG_EQ_HMR_LIQUID)
    dbf = Database(CU_MG_TDB)

    fit_symbols = database_symbols_to_fit(dbf)
    start_values = np.array([unpack_piecewise(dbf.symbols[name]) for name in fit_symbols])
    priors = EmceeOptimizer.get_priors(None, fit_symbols, start_values)
    context = setup_context(dbf, datasets_db)
    context.update(priors)

    # Serialize and immediately restore the whole context.
    restored_context = pickle.loads(pickle.dumps(context))

    lnprob_original = EmceeOptimizer.predict(start_values, **context)
    lnprob_restored = EmceeOptimizer.predict(start_values, **restored_context)
    assert np.isclose(lnprob_original, lnprob_restored)

예제 #9

0

파일 보기

def test_zpf_context_is_pickleable(datasets_db):
    """The ZPF context survives a pickle round trip.

    predict() must give (numerically close) results for the original context
    and for the copy obtained via ``pickle.dumps``/``pickle.loads``.
    """
    datasets_db.insert(CU_MG_DATASET_ZPF_ZERO_ERROR)
    dbf = Database(CU_MG_TDB)

    fit_symbols = database_symbols_to_fit(dbf)
    start_values = np.array([unpack_piecewise(dbf.symbols[name]) for name in fit_symbols])
    priors = EmceeOptimizer.get_priors(None, fit_symbols, start_values)
    context = setup_context(dbf, datasets_db)
    context.update(priors)

    # Serialize and immediately restore the whole context.
    restored_context = pickle.loads(pickle.dumps(context))

    lnprob_original = EmceeOptimizer.predict(start_values, **context)
    lnprob_restored = EmceeOptimizer.predict(start_values, **restored_context)
    assert np.isclose(lnprob_original, lnprob_restored)

예제 #10

0

파일 보기

def test_lnprob_calculates_single_phase_probability_for_success(datasets_db):
    """predict() reproduces reference log-probabilities for single-phase data.

    Three values of ``VV0003`` are evaluated against FCC_A1 mixing-enthalpy
    data; each result must be real and match its stored reference value.
    """
    dbf = Database.from_string(CU_MG_TDB_FCC_ONLY, fmt='tdb')
    datasets_db.insert(CU_MG_HM_MIX_SINGLE_FCC_A1)
    components = ['CU', 'MG', 'VA']
    phase_names = ['FCC_A1']
    symbol = 'VV0003'
    optimizer = EmceeOptimizer(dbf)

    thermochemical_kwargs = {
        'dbf': dbf,
        'comps': components,
        'thermochemical_data': get_thermochemical_data(
            dbf, components, phase_names, datasets_db, symbols_to_fit=[symbol]),
    }

    # (parameter value, expected log-probability), checked in this order.
    reference_cases = (
        (-14.0865, -9.119484935312146),
        (10, -9.143559131626864),
        (1e5, -1359.1335466316268),
    )
    for value, expected_lnprob in reference_cases:
        lnprob = optimizer.predict([value],
                                   prior_rvs=[rv_zero()],
                                   symbols_to_fit=[symbol],
                                   thermochemical_kwargs=thermochemical_kwargs)
        assert np.isreal(lnprob)
        assert np.isclose(lnprob, expected_lnprob, rtol=1e-6)

예제 #11

0

파일 보기

파일: test_mcmc.py 프로젝트: bocklund/ESPEI

def test_lnprob_calculates_associate_tdb(datasets_db):
    """predict() reproduces the reference log-probability for the associate TDB.

    With ``VV0001`` set to 10 the result must be real, finite and close to
    the stored reference; with ``VV0001`` set to -1e7 it must be real, finite
    and clearly different from the reference.
    """
    reference_lnprob = -31.309645520830344
    symbol = 'VV0001'
    components = ['CU', 'MG', 'VA']
    phase_names = ['LIQUID', 'FCC_A1', 'HCP_A3', 'LAVES_C15', 'CUMG2']

    dbf = Database.from_string(CU_MG_TDB_ASSOC, fmt='tdb')
    datasets_db.insert(CU_MG_DATASET_ZPF_WORKING)
    starting_value = dbf.symbols[symbol].args[0]
    zpf_data = get_zpf_data(dbf, components, phase_names, datasets_db,
                            {symbol: starting_value})
    zpf_kwargs = {'zpf_data': zpf_data, 'data_weight': 1.0}
    optimizer = EmceeOptimizer(dbf)

    def lnprob_at(value):
        # A new prior object is created for every evaluation.
        return optimizer.predict([value],
                                 prior_rvs=[rv_zero()],
                                 symbols_to_fit=[symbol],
                                 zpf_kwargs=zpf_kwargs)

    nominal = lnprob_at(10)
    assert np.isreal(nominal)
    assert not np.isinf(nominal)
    assert np.isclose(nominal, reference_lnprob, rtol=1e-6)

    # The second evaluation checks that the driving forces (and hence the
    # probability) differ from the VV0001 = 10 case.
    perturbed = lnprob_at(-10000000)
    assert np.isreal(perturbed)
    assert not np.isinf(perturbed)
    # A large rtol is acceptable because the results should be _very_ different.
    assert not np.isclose(perturbed, reference_lnprob, rtol=1e-2)

예제 #12

0

파일 보기

파일: espei_script.py 프로젝트: npaulson/ESPEI

def run_espei(run_settings):
    """Wrapper around the ESPEI fitting procedure, taking only a settings dictionary.

    The settings are normalized with ``get_run_settings``; logging is
    configured from the ``output`` section; datasets are loaded from the
    ``system.datasets`` path. Parameter generation runs when a
    ``generate_parameters`` section is present, and MCMC runs when an ``mcmc``
    section is present.

    Parameters
    ----------
    run_settings : dict
        Dictionary of input settings

    Returns
    -------
    Either a Database (for generate parameters only) or a tuple of (Database, sampler)

    Raises
    ------
    OSError
        If MCMC is requested and the configured tracefile or probfile
        already exists.
    """
    run_settings = get_run_settings(run_settings)
    system_settings = run_settings['system']
    output_settings = run_settings['output']
    generate_parameters_settings = run_settings.get('generate_parameters')
    mcmc_settings = run_settings.get('mcmc')

    # handle verbosity
    verbosity = {
        0: logging.WARNING,
        1: logging.INFO,
        2: TRACE,
        3: logging.DEBUG
    }
    logging.basicConfig(level=verbosity[output_settings['verbosity']],
                        filename=output_settings['logfile'])

    log_version_info()

    # load datasets and handle i/o
    logging.log(TRACE, 'Loading and checking datasets.')
    dataset_path = system_settings['datasets']
    datasets = load_datasets(sorted(recursive_glob(dataset_path, '*.json')))
    if len(datasets.all()) == 0:
        logging.warning(
            'No datasets were found in the path {}. This should be a directory containing dataset files ending in `.json`.'
            .format(dataset_path))
    apply_tags(datasets, system_settings.get('tags', dict()))
    add_ideal_exclusions(datasets)
    logging.log(TRACE, 'Finished checking datasets')

    with open(system_settings['phase_models']) as fp:
        phase_models = json.load(fp)

    if generate_parameters_settings is not None:
        refdata = generate_parameters_settings['ref_state']
        excess_model = generate_parameters_settings['excess_model']
        ridge_alpha = generate_parameters_settings['ridge_alpha']
        aicc_penalty = generate_parameters_settings['aicc_penalty_factor']
        input_dbf = generate_parameters_settings.get('input_db', None)
        if input_dbf is not None:
            input_dbf = Database(input_dbf)
        dbf = generate_parameters(
            phase_models,
            datasets,
            refdata,
            excess_model,
            ridge_alpha=ridge_alpha,
            dbf=input_dbf,
            aicc_penalty_factor=aicc_penalty,
        )
        dbf.to_file(output_settings['output_db'], if_exists='overwrite')

    if mcmc_settings is not None:
        tracefile = output_settings['tracefile']
        probfile = output_settings['probfile']
        # check that the MCMC output files do not already exist
        # only matters if we are actually running MCMC
        if os.path.exists(tracefile):
            raise OSError(
                'Tracefile "{}" exists and would be overwritten by a new run. Use the ``output.tracefile`` setting to set a different name.'
                .format(tracefile))
        if os.path.exists(probfile):
            raise OSError(
                'Probfile "{}" exists and would be overwritten by a new run. Use the ``output.probfile`` setting to set a different name.'
                .format(probfile))

        # scheduler setup
        if mcmc_settings['scheduler'] == 'dask':
            _raise_dask_work_stealing()  # check for work-stealing
            from distributed import LocalCluster
            cores = mcmc_settings.get('cores', multiprocessing.cpu_count())
            if (cores > multiprocessing.cpu_count()):
                cores = multiprocessing.cpu_count()
                logging.warning(
                    "The number of cores chosen is larger than available. "
                    "Defaulting to run on the {} available cores.".format(
                        cores))
            # TODO: make dask-scheduler-verbosity a YAML input so that users can debug. Should have the same log levels as verbosity
            scheduler = LocalCluster(n_workers=cores,
                                     threads_per_worker=1,
                                     processes=True,
                                     memory_limit=0)
            client = ImmediateClient(scheduler)
            # configure logging on the workers the same way as locally
            client.run(logging.basicConfig,
                       level=verbosity[output_settings['verbosity']],
                       filename=output_settings['logfile'])
            logging.info("Running with dask scheduler: %s [%s cores]" %
                         (scheduler, sum(client.ncores().values())))
            try:
                bokeh_server_info = client.scheduler_info(
                )['services']['bokeh']
                logging.info(
                    "bokeh server for dask scheduler at localhost:{}".format(
                        bokeh_server_info))
            except KeyError:
                logging.info("Install bokeh to use the dask bokeh server.")
        elif mcmc_settings['scheduler'] == 'None':
            # the setting is the literal string 'None', not the None object
            client = None
            logging.info(
                "Not using a parallel scheduler. ESPEI is running MCMC on a single core."
            )
        else:  # we were passed a scheduler file name
            _raise_dask_work_stealing()  # check for work-stealing
            client = ImmediateClient(scheduler_file=mcmc_settings['scheduler'])
            client.run(logging.basicConfig,
                       level=verbosity[output_settings['verbosity']],
                       filename=output_settings['logfile'])
            logging.info("Running with dask scheduler: %s [%s cores]" %
                         (client.scheduler, sum(client.ncores().values())))

        # From here on a client may be open; the finally clause guarantees it
        # is closed even when loading inputs, fitting or writing output fails.
        try:
            # get a Database
            if mcmc_settings.get('input_db'):
                dbf = Database(mcmc_settings.get('input_db'))

            # load the restart trace if needed
            if mcmc_settings.get('restart_trace'):
                restart_trace = np.load(mcmc_settings.get('restart_trace'))
            else:
                restart_trace = None

            # load the remaining mcmc fitting parameters
            iterations = mcmc_settings.get('iterations')
            save_interval = mcmc_settings.get('save_interval')
            chains_per_parameter = mcmc_settings.get('chains_per_parameter')
            chain_std_deviation = mcmc_settings.get('chain_std_deviation')
            deterministic = mcmc_settings.get('deterministic')
            prior = mcmc_settings.get('prior')
            data_weights = mcmc_settings.get('data_weights')
            syms = mcmc_settings.get('symbols')

            # set up and run the EmceeOptimizer
            optimizer = EmceeOptimizer(dbf, scheduler=client)
            optimizer.save_interval = save_interval
            all_symbols = syms if syms is not None else database_symbols_to_fit(
                dbf)
            optimizer.fit(all_symbols,
                          datasets,
                          prior=prior,
                          iterations=iterations,
                          chains_per_parameter=chains_per_parameter,
                          chain_std_deviation=chain_std_deviation,
                          deterministic=deterministic,
                          restart_trace=restart_trace,
                          tracefile=tracefile,
                          probfile=probfile,
                          mcmc_data_weights=data_weights)
            optimizer.commit()

            optimizer.dbf.to_file(output_settings['output_db'],
                                  if_exists='overwrite')
            return optimizer.dbf, optimizer.sampler
        finally:
            # close the scheduler, if possible (client is None without a scheduler)
            if hasattr(client, 'close'):
                client.close()
    return dbf

예제 #13

0

파일 보기

파일: mcmc.py 프로젝트: npaulson/ESPEI

def mcmc_fit(dbf,
             datasets,
             iterations=1000,
             save_interval=1,
             chains_per_parameter=2,
             chain_std_deviation=0.1,
             scheduler=None,
             tracefile=None,
             probfile=None,
             restart_trace=None,
             deterministic=True,
             prior=None,
             mcmc_data_weights=None):
    """
    Convenience wrapper that runs MCMC through an EmceeOptimizer.

    Emits a warning announcing the removal of this function, fits every
    symbol returned by ``database_symbols_to_fit(dbf)``, commits the
    optimizer and returns its database and sampler.

    Parameters
    ----------
    dbf : Database
        A pycalphad Database to fit; symbols to fit are prefixed with `VV`
        followed by a number, e.g. `VV0001`
    datasets : PickleableTinyDB
        A database of single- and multi-phase data to fit
    iterations : int
        Number of trace iterations to calculate in MCMC. Default is 1000.
    save_interval : int
        Interval of iterations at which the tracefile and probfile are saved
    chains_per_parameter : int
        Number of chains for each parameter. Must be an even integer greater
        than or equal to 2. Defaults to 2.
    chain_std_deviation : float
        Standard deviation of the normal used for parameter initialization,
        as a fraction of each parameter. Must be greater than 0. Default is
        0.1, i.e. 10%.
    scheduler : callable
        Scheduler to use with emcee. Must implement a map method.
    tracefile : str
        Filename to store the trace with NumPy.save. Array has shape
        (chains, iterations, parameters)
    probfile : str
        Filename to store the log probability with NumPy.save. Has shape
        (chains, iterations)
    restart_trace : np.ndarray
        ndarray of the previous trace. Should have shape
        (chains, iterations, parameters)
    deterministic : bool
        If True, the emcee sampler is seeded to give deterministic sampling
        draws, so runs with the exact same database, chains_per_parameter,
        and chain_std_deviation (or restart_trace) produce exactly the same
        results.
    prior : str
        Prior to use to generate priors: 'normal', 'uniform', 'triangular',
        or 'zero'. The default of None is forwarded as-is; the original
        documentation describes 'zero' as the effective default.
    mcmc_data_weights : dict
        Dictionary of weights for each data type, e.g. {'ZPF': 20, 'HM': 2}

    Returns
    -------
    tuple
        ``(optimizer.dbf, optimizer.sampler)`` of the optimizer after the fit
        has been committed.

    """
    warnings.warn("The mcmc convenience function will be removed in ESPEI 0.8")
    symbols = database_symbols_to_fit(dbf)

    # Collect the keyword arguments forwarded unchanged to EmceeOptimizer.fit.
    fit_options = {
        'prior': prior,
        'iterations': iterations,
        'chains_per_parameter': chains_per_parameter,
        'chain_std_deviation': chain_std_deviation,
        'deterministic': deterministic,
        'restart_trace': restart_trace,
        'tracefile': tracefile,
        'probfile': probfile,
        'mcmc_data_weights': mcmc_data_weights,
    }

    emcee_opt = EmceeOptimizer(dbf, scheduler=scheduler)
    emcee_opt.save_interval = save_interval
    emcee_opt.fit(symbols, datasets, **fit_options)
    emcee_opt.commit()
    return emcee_opt.dbf, emcee_opt.sampler