def main(): """ Handle starting ESPEI from the command line. Parse command line arguments and input file. """ args = parser.parse_args(sys.argv[1:]) # if desired, check datasets and return if args.check_datasets: dataset_filenames = sorted( recursive_glob(args.check_datasets, '*.json')) errors = [] for dataset in dataset_filenames: try: load_datasets([dataset]) except (ValueError, DatasetError) as e: errors.append(e) if len(errors) > 0: print(*errors, sep='\n') return 1 else: return 0 # if we aren't checking datasets, then we will check that the input file exists input_file = args.input if input_file is None: raise ValueError( 'To run ESPEI, provide an input file with the `--input` option.') # continue with setup # load the settings ext = os.path.splitext(input_file)[-1] if ext == '.yml' or ext == '.yaml': with open(input_file) as f: input_settings = yaml.safe_load(f) elif ext == '.json': with open(input_file) as f: input_settings = json.load(f) else: raise ValueError( 'Unknown file type {} for input file {}. YAML and JSON are supported' .format(ext, input_file)) run_espei(input_settings)
def run(database, tracefile, probfile, datasets, t_min, t_max, t_step, plot):
    """Load the database, optional trace and lnprob arrays, and datasets from disk, then call main() to generate the requested plots."""
    dbf = Database(database)
    trace = np.load(tracefile) if tracefile else None
    lnprob = np.load(probfile) if probfile else None
    ds = load_datasets(recursive_glob(datasets, "*.json")) if datasets else None
    phase_diagram_options = dict()
    phase_diagram_options["temperatures"] = (t_min, t_max, t_step)
    plots = plot.lower()
    main(dbf, trace, lnprob, ds, plots, phase_diagram_options)
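
# Example invocation (a sketch; all file and directory names, and the
# 'all' plot selection, are hypothetical):
#
#   run('CuMg.tdb', 'trace.npy', 'lnprob.npy', 'my-datasets',
#       t_min=300, t_max=1500, t_step=10, plot='all')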
def __init__(self, dbf):
    self.orig_dbf = copy.deepcopy(dbf)
    self.dbf = copy.deepcopy(dbf)
    parameters = {sym: unpack_piecewise(dbf.symbols[sym]) for sym in database_symbols_to_fit(dbf)}
    ds = load_datasets([])  # empty TinyDB
    root = OptNode(parameters, ds)
    self.current_node = root
    self.staged_nodes = []
    self.graph = OptGraph(root)
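
# Usage sketch (grounded in how run_espei drives the EmceeOptimizer subclass
# further below; the TDB file name is hypothetical):
#
#   dbf = Database('my-database.tdb')
#   optimizer = EmceeOptimizer(dbf)  # root OptNode built from dbf's symbols
#   optimizer.fit(database_symbols_to_fit(dbf), datasets)  # stages a node
#   optimizer.commit()  # commits the staged node to the OptGraph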
def run_espei(run_settings):
    """Wrapper around the ESPEI fitting procedure, taking only a settings dictionary.

    Parameters
    ----------
    run_settings : dict
        Dictionary of input settings

    Returns
    -------
    Either a Database (for generate parameters only) or a tuple of (Database, sampler)
    """
    run_settings = get_run_settings(run_settings)
    system_settings = run_settings['system']
    output_settings = run_settings['output']
    generate_parameters_settings = run_settings.get('generate_parameters')
    mcmc_settings = run_settings.get('mcmc')

    # handle verbosity
    verbosity = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG}
    logging.basicConfig(level=verbosity[output_settings['verbosity']])

    # load datasets and handle i/o
    logging.debug('Loading and checking datasets.')
    dataset_path = system_settings['datasets']
    datasets = load_datasets(sorted(recursive_glob(dataset_path, '*.json')))
    if len(datasets.all()) == 0:
        logging.warning('No datasets were found in the path {}. This should be a directory containing dataset files ending in `.json`.'.format(dataset_path))
    logging.debug('Finished checking datasets')

    with open(system_settings['phase_models']) as fp:
        phase_models = json.load(fp)

    if generate_parameters_settings is not None:
        refdata = generate_parameters_settings['ref_state']
        excess_model = generate_parameters_settings['excess_model']
        dbf = generate_parameters(phase_models, datasets, refdata, excess_model)
        dbf.to_file(output_settings['output_db'], if_exists='overwrite')

    if mcmc_settings is not None:
        tracefile = output_settings['tracefile']
        probfile = output_settings['probfile']
        # check that the MCMC output files do not already exist
        # only matters if we are actually running MCMC
        if os.path.exists(tracefile):
            raise OSError('Tracefile "{}" exists and would be overwritten by a new run. Use the ``output.tracefile`` setting to set a different name.'.format(tracefile))
        if os.path.exists(probfile):
            raise OSError('Probfile "{}" exists and would be overwritten by a new run. Use the ``output.probfile`` setting to set a different name.'.format(probfile))

        # scheduler setup
        if mcmc_settings['scheduler'] == 'MPIPool':
            # check that cores is not an input setting
            if mcmc_settings.get('cores') is not None:
                logging.warning("MPI does not take the cores input setting.")
            from emcee.utils import MPIPool
            # code recommended by emcee: if not master, wait for instructions then exit
            client = MPIPool()
            if not client.is_master():
                logging.debug('MPIPool is not master. Waiting for instructions...')
                client.wait()
                sys.exit(0)
            logging.info("Using MPIPool on {} MPI ranks".format(client.size))
        elif mcmc_settings['scheduler'] == 'dask':
            from distributed import LocalCluster
            cores = mcmc_settings.get('cores', multiprocessing.cpu_count())
            if cores > multiprocessing.cpu_count():
                cores = multiprocessing.cpu_count()
                logging.warning("The number of cores chosen is larger than available. "
                                "Defaulting to run on the {} available cores.".format(cores))
            scheduler = LocalCluster(n_workers=cores, threads_per_worker=1, processes=True)
            client = ImmediateClient(scheduler)
            client.run(logging.basicConfig, level=verbosity[output_settings['verbosity']])
            logging.info("Running with dask scheduler: %s [%s cores]" % (scheduler, sum(client.ncores().values())))
            try:
                logging.info("bokeh server for dask scheduler at localhost:{}".format(client.scheduler_info()['services']['bokeh']))
            except KeyError:
                logging.info("Install bokeh to use the dask bokeh server.")
        elif mcmc_settings['scheduler'] == 'emcee':
            from emcee.interruptible_pool import InterruptiblePool
            cores = mcmc_settings.get('cores', multiprocessing.cpu_count())
            if cores > multiprocessing.cpu_count():
                cores = multiprocessing.cpu_count()
                logging.warning("The number of cores chosen is larger than available. "
                                "Defaulting to run on the {} available cores.".format(cores))
            client = InterruptiblePool(processes=cores)
            logging.info("Using multiprocessing on {} cores".format(cores))
        elif mcmc_settings['scheduler'] == 'None':
            client = None
            logging.info("Not using a parallel scheduler. ESPEI is running MCMC on a single core.")

        # get a Database
        if mcmc_settings.get('input_db'):
            dbf = Database(mcmc_settings.get('input_db'))

        # load the restart chain if needed
        if mcmc_settings.get('restart_chain'):
            restart_chain = np.load(mcmc_settings.get('restart_chain'))
        else:
            restart_chain = None

        # load the remaining mcmc fitting parameters
        mcmc_steps = mcmc_settings.get('mcmc_steps')
        save_interval = mcmc_settings.get('mcmc_save_interval')
        chains_per_parameter = mcmc_settings.get('chains_per_parameter')
        chain_std_deviation = mcmc_settings.get('chain_std_deviation')
        deterministic = mcmc_settings.get('deterministic')

        dbf, sampler = mcmc_fit(
            dbf, datasets,
            scheduler=client,
            mcmc_steps=mcmc_steps,
            chains_per_parameter=chains_per_parameter,
            chain_std_deviation=chain_std_deviation,
            save_interval=save_interval,
            tracefile=tracefile,
            probfile=probfile,
            restart_chain=restart_chain,
            deterministic=deterministic,
        )

        dbf.to_file(output_settings['output_db'], if_exists='overwrite')
        # close the scheduler, if possible
        if hasattr(client, 'close'):
            client.close()
        return dbf, sampler
    return dbf
""" import argparse from espei.datasets import recursive_glob, load_datasets, DatasetError parser = argparse.ArgumentParser(description='Check datasets at the chosen paths.') parser.add_argument('paths', metavar='PATH', type=str, nargs='*', default=['.'], help='Path(s) to search') args = parser.parse_args() checked_files = 0 errors = [] for path in args.paths: dataset_filenames = sorted(recursive_glob(path, '*.json')) for dataset in dataset_filenames: try: load_datasets([dataset]) except KeyError as e: # this is likely from an input.json pass except (ValueError, DatasetError) as e: errors.append(e) finally: checked_files += 1 if len(errors) > 0: print(*errors, sep='\n') exit(1) else: print('Successfully checked {} files'.format(checked_files)) exit(0)
def run_espei(run_settings):
    """Wrapper around the ESPEI fitting procedure, taking only a settings dictionary.

    Parameters
    ----------
    run_settings : dict
        Dictionary of input settings

    Returns
    -------
    Either a Database (for generate parameters only) or a tuple of (Database, sampler)
    """
    run_settings = get_run_settings(run_settings)
    system_settings = run_settings['system']
    output_settings = run_settings['output']
    generate_parameters_settings = run_settings.get('generate_parameters')
    mcmc_settings = run_settings.get('mcmc')

    # handle verbosity
    verbosity = {0: logging.WARNING, 1: logging.INFO, 2: TRACE, 3: logging.DEBUG}
    logging.basicConfig(level=verbosity[output_settings['verbosity']], filename=output_settings['logfile'])
    log_version_info()

    # load datasets and handle i/o
    logging.log(TRACE, 'Loading and checking datasets.')
    dataset_path = system_settings['datasets']
    datasets = load_datasets(sorted(recursive_glob(dataset_path, '*.json')))
    if len(datasets.all()) == 0:
        logging.warning('No datasets were found in the path {}. This should be a directory containing dataset files ending in `.json`.'.format(dataset_path))
    apply_tags(datasets, system_settings.get('tags', dict()))
    add_ideal_exclusions(datasets)
    logging.log(TRACE, 'Finished checking datasets')

    with open(system_settings['phase_models']) as fp:
        phase_models = json.load(fp)

    if generate_parameters_settings is not None:
        refdata = generate_parameters_settings['ref_state']
        excess_model = generate_parameters_settings['excess_model']
        ridge_alpha = generate_parameters_settings['ridge_alpha']
        aicc_penalty = generate_parameters_settings['aicc_penalty_factor']
        input_dbf = generate_parameters_settings.get('input_db', None)
        if input_dbf is not None:
            input_dbf = Database(input_dbf)
        dbf = generate_parameters(
            phase_models, datasets, refdata, excess_model,
            ridge_alpha=ridge_alpha, dbf=input_dbf,
            aicc_penalty_factor=aicc_penalty,
        )
        dbf.to_file(output_settings['output_db'], if_exists='overwrite')

    if mcmc_settings is not None:
        tracefile = output_settings['tracefile']
        probfile = output_settings['probfile']
        # check that the MCMC output files do not already exist
        # only matters if we are actually running MCMC
        if os.path.exists(tracefile):
            raise OSError('Tracefile "{}" exists and would be overwritten by a new run. Use the ``output.tracefile`` setting to set a different name.'.format(tracefile))
        if os.path.exists(probfile):
            raise OSError('Probfile "{}" exists and would be overwritten by a new run. Use the ``output.probfile`` setting to set a different name.'.format(probfile))

        # scheduler setup
        if mcmc_settings['scheduler'] == 'dask':
            _raise_dask_work_stealing()  # check for work-stealing
            from distributed import LocalCluster
            cores = mcmc_settings.get('cores', multiprocessing.cpu_count())
            if cores > multiprocessing.cpu_count():
                cores = multiprocessing.cpu_count()
                logging.warning("The number of cores chosen is larger than available. "
                                "Defaulting to run on the {} available cores.".format(cores))
            # TODO: make dask-scheduler-verbosity a YAML input so that users
            # can debug. Should have the same log levels as verbosity.
            scheduler = LocalCluster(n_workers=cores, threads_per_worker=1, processes=True, memory_limit=0)
            client = ImmediateClient(scheduler)
            client.run(logging.basicConfig, level=verbosity[output_settings['verbosity']], filename=output_settings['logfile'])
            logging.info("Running with dask scheduler: %s [%s cores]" % (scheduler, sum(client.ncores().values())))
            try:
                bokeh_server_info = client.scheduler_info()['services']['bokeh']
                logging.info("bokeh server for dask scheduler at localhost:{}".format(bokeh_server_info))
            except KeyError:
                logging.info("Install bokeh to use the dask bokeh server.")
        elif mcmc_settings['scheduler'] == 'None':
            client = None
            logging.info("Not using a parallel scheduler. ESPEI is running MCMC on a single core.")
        else:  # we were passed a scheduler file name
            _raise_dask_work_stealing()  # check for work-stealing
            client = ImmediateClient(scheduler_file=mcmc_settings['scheduler'])
            client.run(logging.basicConfig, level=verbosity[output_settings['verbosity']], filename=output_settings['logfile'])
            logging.info("Running with dask scheduler: %s [%s cores]" % (client.scheduler, sum(client.ncores().values())))

        # get a Database
        if mcmc_settings.get('input_db'):
            dbf = Database(mcmc_settings.get('input_db'))

        # load the restart trace if needed
        if mcmc_settings.get('restart_trace'):
            restart_trace = np.load(mcmc_settings.get('restart_trace'))
        else:
            restart_trace = None

        # load the remaining mcmc fitting parameters
        iterations = mcmc_settings.get('iterations')
        save_interval = mcmc_settings.get('save_interval')
        chains_per_parameter = mcmc_settings.get('chains_per_parameter')
        chain_std_deviation = mcmc_settings.get('chain_std_deviation')
        deterministic = mcmc_settings.get('deterministic')
        prior = mcmc_settings.get('prior')
        data_weights = mcmc_settings.get('data_weights')
        syms = mcmc_settings.get('symbols')

        # set up and run the EmceeOptimizer
        optimizer = EmceeOptimizer(dbf, scheduler=client)
        optimizer.save_interval = save_interval
        all_symbols = syms if syms is not None else database_symbols_to_fit(dbf)
        optimizer.fit(all_symbols, datasets, prior=prior, iterations=iterations,
                      chains_per_parameter=chains_per_parameter,
                      chain_std_deviation=chain_std_deviation,
                      deterministic=deterministic, restart_trace=restart_trace,
                      tracefile=tracefile, probfile=probfile,
                      mcmc_data_weights=data_weights)
        optimizer.commit()

        optimizer.dbf.to_file(output_settings['output_db'], if_exists='overwrite')
        # close the scheduler, if possible
        if hasattr(client, 'close'):
            client.close()
        return optimizer.dbf, optimizer.sampler
    return dbf
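
# Example usage with MCMC (a minimal sketch; the file names are illustrative
# and remaining settings are assumed to come from get_run_settings defaults):
#
#   settings = {
#       'system': {'phase_models': 'my-phases.json',
#                  'datasets': 'my-input-datasets'},
#       'mcmc': {'iterations': 1000,
#                'input_db': 'my-database.tdb',
#                'scheduler': 'dask'},
#   }
#   dbf, sampler = run_espei(settings)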
OUTPUT_EXP_FILENAME = 'phase_diagram.exp'

############################
############# RUN ##########
############################

import tinydb
from pycalphad import Database
from espei.datasets import load_datasets, recursive_glob
from espei.core_utils import ravel_zpf_values
from espei.utils import bib_marker_map

# load the experimental and DFT datasets
datasets = load_datasets(recursive_glob(DATASETS_DIR, '*.json'))

# phases = ['LIQUID', 'BCC_A2', 'FCC_A1']
desired_data = datasets.search(
    (tinydb.where('output') == 'ZPF') &
    (tinydb.where('components').test(lambda x: set(x).issubset(comps + ['VA'])))
    # & (tinydb.where('phases').test(lambda x: len(set(phases).intersection(x)) > 0))
)
raveled_dict = ravel_zpf_values(desired_data, [independent_component])
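
# Possible next step (a sketch; the dataset 'reference' field and the exact
# structure returned by ravel_zpf_values are assumptions here): assign each
# bibliographic reference a consistent marker style before plotting, e.g.
#
#   bib_keys = sorted({d.get('reference', '') for d in desired_data})
#   marker_map = bib_marker_map(bib_keys)  # one marker style per reference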