Exemplo n.º 1
0
def test__write_simulation_results__no_filename():
    """Write one result record without passing a filename, then read it back.

    The header section is written first with an explicit filename; the
    following write_simulation_results() call deliberately omits the
    filename argument.  The test passes when the output file exists and
    can be read by a fresh PyposmatDataFile instance.
    """
    cleanup_test()

    # three parameters, five qois and five errors, numbered from 1
    parameter_names = ['param{}'.format(n) for n in range(1, 4)]
    qoi_names = ['qoi{}'.format(n) for n in range(1, 6)]
    error_names = ['err{}'.format(n) for n in range(1, 6)]

    writer = PyposmatDataFile()
    writer.write_header_section(filename=datafile_out_fn,
                                parameter_names=parameter_names,
                                qoi_names=qoi_names,
                                error_names=error_names)

    # every parameter is 1.0, every qoi is 2.0, every error is 3.0
    results = OrderedDict([
        ('parameters', OrderedDict.fromkeys(parameter_names, 1.)),
        ('qois', OrderedDict.fromkeys(qoi_names, 2.)),
        ('errors', OrderedDict.fromkeys(error_names, 3.0)),
    ])

    writer.write_simulation_results("test_id", results)

    assert os.path.isfile(datafile_out_fn)

    reader = PyposmatDataFile()
    reader.read(filename=datafile_out_fn)
Exemplo n.º 2
0
class PyposmatMonteCarloSampler(PyposmatEngine):
    def __init__(self,
                 filename_in='pyposmat.config.in',
                 filename_out='pyposmat.results.out',
                 o_log=None,
                 mpi_rank=None,
                 mpi_size=None,
                 base_directory=None):
        """Initialize the sampler.

        Additional attributes are set by the base class :obj:PyposmatEngine

        Args:
            filename_in (str): path of the configuration file
            filename_out (str): path of the output file
            o_log (str, PyposmatLogFile, None): a string is treated as the
                path of a log file to create; a PyposmatLogFile is stored by
                reference; None disables file logging.
            mpi_rank (int, optional): rank of this process.  Defaults to 0.
            mpi_size (int, optional): number of processes.  Defaults to 1.
            base_directory (str, optional): either the relative or full path
                which provides a unique drive addressing space for
                simultaneously running simulations.

        Attributes:
            mpi_rank (int): the rank passed in, or 0 if None was passed
            mpi_size (int): the size passed in, or 1 if None was passed
            pyposmat_data_in_filename (str): the path of the datafile to
                read in
            pyposmat_data_out_filename (str): the path of the datafile to
                write simulation results to
            pyposmat_badparameters_filename (str): the path of the file
                which records parameter sets that failed

        Raises:
            TypeError: if o_log is not a str, PyposmatLogFile, or None
        """
        assert isinstance(filename_in, str)
        assert isinstance(filename_out, str)
        assert type(base_directory) in [str, type(None)]

        PyposmatEngine.__init__(self,
                                filename_in=filename_in,
                                filename_out=filename_out,
                                base_directory=base_directory,
                                fullauto=False)

        # Fall back to a serial layout when no MPI information is given.
        # These defaults must not be overwritten afterwards: the rest of the
        # class compares mpi_rank to 0 and computes "index % mpi_size".
        self.mpi_rank = 0 if mpi_rank is None else mpi_rank
        self.mpi_size = 1 if mpi_size is None else mpi_size

        assert self.mpi_rank < self.mpi_size

        self.pyposmat_data_in_filename = None
        self.pyposmat_data_out_filename = filename_out
        self.pyposmat_badparameters_filename = 'pyposmat.badparameters.out'

        try:
            self.configure_logger(o_log)
        except TypeError:
            m = "Unable to to configure obj_log based on attribute log:{}".format(
                str(o_log))
            raise TypeError(m)

    def configure_logger(self, o_log=None):
        """Set ``self.obj_log`` according to the type of ``o_log``.

        A PyposmatLogFile is stored by reference.  A string is used as the
        filename of a newly created PyposmatLogFile.  None (the default)
        stores None, in which case log() only writes to standard out through
        print().

        Args:
            o_log (str,PyposmatLogFile,None): default: None

        Raises:
            TypeError: if o_log has any other type
        """
        kind = type(o_log)

        if kind is PyposmatLogFile:
            target = o_log
        elif kind is str:
            target = PyposmatLogFile(filename=o_log)
        elif o_log is None:
            target = None
        else:
            raise TypeError("log object must be str, PyposmatLogFile, or None")

        self.obj_log = target

    def log(self, str_msg):
        """Write a message to the log file (if configured) and to standard out.

        Args:
            str_msg (str, list): the message.  A list or tuple is joined with
                newlines, one entry per line.  Any other object is converted
                with str() instead of failing with an unbound local variable.
        """
        if isinstance(str_msg, (list, tuple)):
            m = "\n".join(str(line) for line in str_msg)
        else:
            m = str(str_msg)

        if type(self.obj_log) is PyposmatLogFile:
            self.obj_log.write(m)
        print(m)

    def configure_pyposmat_datafile_in(self, filename):
        """Store the input datafile path and create its PyposmatDataFile object.

        Args:
            filename: path handed to the PyposmatDataFile constructor
        """
        self.pyposmat_data_in_filename = filename
        datafile_in = PyposmatDataFile(filename)
        self.pyposmat_datafile_in = datafile_in

    def configure_pyposmat_datafile_out(self, filename=None):
        """Create the output PyposmatDataFile object.

        Args:
            filename (str, optional): path of the output datafile.  When
                given it replaces ``self.pyposmat_data_out_filename``; when
                omitted the stored filename is used, in the same way
                configure_pyposmat_badparameters_file() uses its stored
                filename.
        """
        if filename is not None:
            assert type(filename) is str
            self.pyposmat_data_out_filename = filename

        # Build the object from the stored path so that it is not created
        # with filename=None when the caller relies on the default.
        self.pyposmat_datafile_out = PyposmatDataFile(
            self.pyposmat_data_out_filename)

    def configure_pyposmat_badparameters_file(self, filename=None):
        """Create the PyposmatBadParametersFile object.

        Args:
            filename (str, optional): path of the bad-parameters file.  When
                given it replaces ``self.pyposmat_badparameters_filename``;
                otherwise the stored filename is used.
        """
        if filename is not None:
            assert type(filename) is str
            self.pyposmat_badparameters_filename = filename

        badparameters_fn = self.pyposmat_badparameters_filename
        self.pyposmat_badparameters = PyposmatBadParametersFile(
            filename=badparameters_fn, o_config=self.configuration)

    def read_configuration_file(self, filename=None):
        """Read the configuration and cache the sampling-related pieces.

        After the base class has read the file, this sets n_iterations,
        parameter_names, qoi_names, error_names,
        parameter_distribution_definition, free_parameter_names,
        parameter_constraints and constrained_parameter_names.

        Args:
            filename (str, optional): passed through to
                PyposmatEngine.read_configuration_file
        """
        PyposmatEngine.read_configuration_file(self, filename=filename)
        # self.structure_directory = self.configuration.structures['structure_directory']
        self.n_iterations = self.configuration.sampling_type['n_iterations']
        self.parameter_names = list(self.configuration.sampling_distribution)
        self.qoi_names = list(self.configuration.qois)
        self.error_names = ['{}.err'.format(name) for name in self.qoi_names]
        self.parameter_distribution_definition = \
            self.configuration.sampling_distribution

        # a parameter is free unless its definition starts with 'equals'
        try:
            free_names = []
            for name, definition in \
                    self.parameter_distribution_definition.items():
                if definition[0] != 'equals':
                    free_names.append(name)
            self.free_parameter_names = free_names
        except KeyError:
            print(self.parameter_distribution_definition.items())
            raise

        # work on a private copy of the constraints, or an empty mapping
        if self.configuration.sampling_constraints is None:
            self.parameter_constraints = OrderedDict()
        else:
            self.parameter_constraints = copy.deepcopy(
                self.configuration.sampling_constraints)

        # everything which is not free, in parameter_names order
        self.constrained_parameter_names = [
            name for name in self.parameter_names
            if name not in self.free_parameter_names
        ]

    def run_simulations(self, i_iteration, n_samples=None, filename=None):
        """Run one sampling iteration using the configured sampling type.

        Args:
            i_iteration(int): the iteration cycle we are on.
            n_samples(int,optional): the number of parameters to evaluate.
                When given it overrides the 'n_samples' value configured for
                this iteration.
            filename(str,optional): the datafile to sample from; required for
                the 'kde' and 'from_file' sampling types.

        Raises:
            ValueError: if the sampling type is unknown, or if 'kde' or
                'from_file' sampling is requested without a filename
        """

        assert type(i_iteration) is int
        assert type(n_samples) in [type(None), int]
        assert type(filename) in [type(None), str]

        _sampling = self.configuration.sampling_type[i_iteration]
        _sampling_type = _sampling['type']
        _n_samples = _sampling['n_samples']

        # Resolve the override before logging so the message reports the
        # number of samples which will actually be attempted.
        if n_samples is not None:
            _n_samples = n_samples

        if self.mpi_rank == 0:
            m = [
                "R{}: Starting iteration N={}".format(self.mpi_rank,
                                                      i_iteration)
            ]
            # compare by value; identity comparison against a string literal
            # is implementation dependent
            if _sampling_type == "from_file":
                m += [
                    "R{}: Sampling parameters from {}".format(
                        self.mpi_rank, filename)
                ]
            else:
                m += [
                    "R{}: Attemping n_samples={} with sampling_type={}".format(
                        self.mpi_rank, _n_samples, _sampling_type)
                ]
            if filename is not None:
                m += ["R{}: Using file:{}".format(self.mpi_rank, filename)]
            self.log(m)

        if _sampling_type == 'parametric':
            self.run_parameteric_sampling(n_samples=_n_samples)
        elif _sampling_type == 'kde':
            if filename is None:
                raise ValueError('cannot do kde sampling with out filename')
            self.run_kde_sampling(n_samples=_n_samples, filename_in=filename)
        elif _sampling_type == 'from_file':
            if filename is None:
                raise ValueError('cannot do filesampling without file')
            self.run_file_sampling(filename)
        else:
            raise ValueError('unknown sampling type:{}'.format(_sampling_type))

    def write_badparameters_header(self):
        """Write the header section of the bad-parameters file."""
        write_header = self.pyposmat_badparameters.write_header_section
        write_header(filename=self.pyposmat_badparameters_filename)

    def write_data_out_header(self):
        """Write the header section of the output datafile.

        The header is built from the parameter, qoi and error names cached on
        this object.
        """
        write_header = self.pyposmat_datafile_out.write_header_section
        write_header(filename=self.pyposmat_data_out_filename,
                     parameter_names=self.parameter_names,
                     qoi_names=self.qoi_names,
                     error_names=self.error_names)

    def get_sim_id(self, i, s=None):
        """Return the simulation id for a sample.

        Args:
            i: the sample index, used when ``s`` is None
            s: an explicit simulation id; returned unchanged when not None

        Returns:
            ``s`` if it is not None, otherwise ``str(i)``

        Raises:
            TypeError: if ``s`` is None and ``i`` is not an int
        """
        if s is not None:
            return s

        if not isinstance(i, int):
            m = 'cannot determine sim_id from i:{} and s:{}'.format(i, s)
            raise TypeError(m)

        return str(i)

    def run_parameteric_sampling(self, n_samples):
        """Sample parameters from the configured distributions and evaluate them.

        Free parameters are drawn from scipy.stats uniform or normal
        distributions built from ``self.parameter_distribution_definition``.
        Parameters defined with 'equals' are then filled in, the inequality
        constraints in ``self.parameter_constraints`` are checked, and the
        parameter set is passed to evaluate_parameter_set().  Successful
        results are written to the output datafile; parameter sets which
        raise one of the handled errors are written to the bad-parameters
        file.  A progress message is logged every 10 samples.

        Args:
            n_samples (int): the number of parameter sets to draw

        Raises:
            ValueError: for an unknown distribution type, or for a
                list-valued 'equals' definition which is not an
                'equilibrium_density' definition
        """
        definitions = self.parameter_distribution_definition

        # create one frozen random variable per free parameter
        _rv_generators = OrderedDict()
        for p in self.free_parameter_names:
            distribution_type = definitions[p][0]
            if distribution_type == 'uniform':
                _a = definitions[p][1]['a']
                _b = definitions[p][1]['b']
                _rv_generators[p] = scipy.stats.uniform(loc=_a,
                                                        scale=_b - _a)
            elif distribution_type == 'normal':
                _mu = definitions[p][1]['mu']
                _sigma = definitions[p][1]['sigma']
                _rv_generators[p] = scipy.stats.norm(loc=_mu, scale=_sigma)
            else:
                raise ValueError(
                    'unknown distribution type: {}'.format(distribution_type))

        self.write_data_out_header()
        self.write_badparameters_header()

        time_start_iteration = time.time()
        _n_errors = 0

        for i_sample in range(n_samples):
            # determine sim_id
            sim_id = self.get_sim_id(i=i_sample)

            # new OrderedDict to hold in parameter values
            _parameters = OrderedDict([(p, None)
                                       for p in self.parameter_names])

            # generate free parameters for ordered dictionary
            for p in self.free_parameter_names:
                _parameters[p] = _rv_generators[p].rvs(size=1)[0]

            # first pass over the equality constraints: literal values and
            # expressions of the free parameters
            for p in self.constrained_parameter_names:
                if definitions[p][0] != 'equals':
                    continue
                _v = definitions[p][1]

                # this condition is for fitting EoS for EAM function which
                # requires a reference ground state crystal structure
                if p.endswith('latticetype'):
                    _parameters[p] = _v

                # process evaluation strings
                elif type(_v) is not list:
                    _str_eval = str(_v)

                    # replace parameter names with numerical values
                    for fp in self.free_parameter_names:
                        if fp in _str_eval:
                            _str_eval = _str_eval.replace(
                                fp, str(_parameters[fp]))

                    # NOTE(review): eval() executes text taken from the
                    # configuration file; only use trusted configurations.
                    _parameters[p] = eval(_str_eval)

                # List-valued definitions are resolved in the second pass,
                # after every other parameter has a value.  Raising here, as
                # was done before, made that pass unreachable.

            # second pass: list-valued definitions
            for p in self.constrained_parameter_names:
                if definitions[p][0] != 'equals':
                    continue
                _v = definitions[p][1]
                if type(_v) is not list:
                    continue

                # required for EAM potentials to calculate dens_max for
                # embedding function
                if _v[0] == 'equilibrium_density':
                    a0 = _v[1]
                    latt = _v[2]
                    _parameters[p] = self.calculate_equilibrium_density(
                        a0, latt, _parameters)
                elif not p.endswith('latticetype'):
                    raise ValueError(
                        "unsupported 'equals' definition for parameter "
                        "{}: {}".format(p, _v))

            try:
                # check constraints
                for k, v in self.parameter_constraints.items():
                    _eval_str = v
                    for pn, pv in _parameters.items():
                        _eval_str = _eval_str.replace(pn, str(pv))

                    if eval(_eval_str) is False:
                        m = "failed parameter constraint, {}".format(k)
                        raise PyposmatBadParameterError(m,
                                                        parameters=_parameters)

                _results = self.evaluate_parameter_set(parameters=_parameters)
            except (PyposmatBadParameterError, LammpsSimulationError,
                    PypospackTaskManagerError, PypospackBadEamEosError) as e:
                # a failed parameter set is recorded and sampling continues
                self.pyposmat_badparameters.write_simulation_exception(
                    sim_id=sim_id, exception=e)
                _n_errors += 1
            else:
                self.pyposmat_datafile_out.write_simulation_results(
                    filename=self.pyposmat_data_out_filename,
                    sim_id=sim_id,
                    results=_results)
            finally:
                # print out summaries every 10 solutions
                if (i_sample + 1) % 10 == 0:
                    n_samples_completed = i_sample + 1
                    time_end = time.time()
                    time_total = time_end - time_start_iteration
                    avg_time = time_total / n_samples_completed
                    _str_msg = 'R{}:{} samples completed in {:.4f}s. Avg_time = {:.4f}. n_errors = {}'.format(
                        self.mpi_rank, n_samples_completed, time_total,
                        avg_time, _n_errors)
                    self.log(_str_msg)

    def get_options_kde_bandwidth(self):
        """Describe the KDE bandwidth estimators known to this class.

        Returns:
            OrderedDict: keyed by bandwidth type name; each value is an
                OrderedDict of bibliographic information for that method.
        """

        kde_options = OrderedDict()
        kde_options['chiu1999'] = OrderedDict()
        kde_options['chiu1999'][
            'reference'] = 'Chiu, S.T. Ann. Stat. 1991, Vol. 19, No 4. 1883-1905'
        kde_options['chiu1999']['doi'] = '10.1214/aos/1176348376'
        kde_options['chiu1999']['description'] = ""
        kde_options['silverman1984'] = OrderedDict()
        kde_options['silverman1984'][
            'reference'] = 'Silverman, B.W. (1986). Density Estimation for Statistics and Data Analysis. London: Chapman & Hall/CRC. p. 48'
        kde_options['silverman1984']['isbn'] = '0-412-24620-1'

        # the mapping was previously built and then discarded
        return kde_options

    def determine_kde_bandwidth(self, X, kde_bw_type):
        """ determine kde bandwidth

        The result is stored in ``self.kde_bw`` and the method name in
        ``self.kde_bw_type``.

        Args:
            X(np.ndarray): array of data to determine the KDE bandwidth
            kde_bw_type(str): the method of estimating the optimal bandwidth

        Returns:
            the bandwidth, as stored in ``self.kde_bw``

        Raises:
            PypospackBadKdeBandwidthType: if kde_bw_type is not implemented
            ValueError: re-raised from Chiu1999_h after printing X
        """

        if self.mpi_rank == 0:
            self.log('determine kde bandwidth...')

        if kde_bw_type == 'chiu1999':
            try:
                h = Chiu1999_h(X)
            except ValueError:
                # show the offending data before propagating the error
                print(X)
                raise

        elif kde_bw_type == 'silverman1985':
            # NOTE(review): this binds the name Silverman1986 itself rather
            # than a value computed from X, and the key differs from the
            # 'silverman1984' entry in get_options_kde_bandwidth() --
            # confirm the intended behavior.
            h = Silverman1986
        else:
            # the placeholder was previously left unformatted
            m = 'kde_bw_type, {}, is not an implemented bandwidth type'.format(
                kde_bw_type)
            raise PypospackBadKdeBandwidthType(m)

        if self.mpi_rank == 0:
            self.log('{}:{}'.format(kde_bw_type, h))
        self.kde_bw_type = kde_bw_type
        self.kde_bw = h

        return self.kde_bw

    def run_kde_sampling(self,
                         n_samples,
                         filename_in,
                         cluster_id=None,
                         kde_bw_type='chiu1999'):
        """ sample from a KDE distribution

        A Gaussian KDE is fitted to the free parameters read from
        ``filename_in``.  Each drawn parameter set is completed with the
        'equals' definitions, checked against the inequality constraints in
        ``self.parameter_constraints`` and evaluated.  Successful results are
        written to the output datafile; parameter sets which raise one of the
        handled errors are written to the bad-parameters file.  A progress
        message is logged every 10 samples.

        Args:
            n_samples(int): the number of samples to draw from the KDE distribution
            filename_in(str): the path to the datafile from which the parameters will be drawn from
            cluster_id(int): if we need to use a specific cluster_id, we specify it here.
                otherwise, it will be drawn from all parameters contained within the set.
            kde_bw_type(str): the method of estimating the optimal bandwidth

        Raises:
            ValueError: for a list-valued 'equals' definition which is not an
                'equilibrium_density' definition
        """
        definitions = self.parameter_distribution_definition

        _datafile_in = PyposmatDataFile()
        _datafile_in.read(filename_in)

        if cluster_id is not None:
            # subselect the dataframe by the cluster_id of interest; one
            # filter is enough, the remaining rows all match
            _datafile_in.df = _datafile_in.df.loc[_datafile_in.df['cluster_id']
                                                  == cluster_id]

        _free_parameter_names = [str(v) for v in self.free_parameter_names]
        _X = _datafile_in.df[_free_parameter_names].values.T

        kde_bw = self.determine_kde_bandwidth(X=_X, kde_bw_type=kde_bw_type)

        _rv_generator = scipy.stats.gaussian_kde(_X, kde_bw)

        self.write_data_out_header()
        self.write_badparameters_header()

        time_start_iteration = time.time()
        _n_errors = 0

        for i_sample in range(n_samples):
            # determine sim_id
            sim_id = self.get_sim_id(i=i_sample)

            # new OrderedDict to hold in parameter values
            _parameters = OrderedDict([(p, None)
                                       for p in self.parameter_names])

            # generate free parameters for ordered dictionary
            _free_parameters = _rv_generator.resample(1)
            for i, v in enumerate(self.free_parameter_names):
                _parameters[v] = float(_free_parameters[i, 0])

            # first pass over the equality constraints: literal values and
            # expressions of the free parameters
            for p in self.constrained_parameter_names:
                if definitions[p][0] != 'equals':
                    continue
                _v = definitions[p][1]

                # this condition is for fitting EoS for EAM function which
                # requires a reference ground state crystal structure
                if p.endswith('latticetype'):
                    _parameters[p] = _v

                # process evaluation strings
                elif type(_v) is not list:
                    _str_eval = str(_v)

                    # replace parameter names with numerical values
                    for fp in self.free_parameter_names:
                        if fp in _str_eval:
                            _str_eval = _str_eval.replace(
                                fp, str(_parameters[fp]))

                    # NOTE(review): eval() executes text taken from the
                    # configuration file; only use trusted configurations.
                    _parameters[p] = eval(_str_eval)

                # List-valued definitions are resolved in the second pass,
                # after every other parameter has a value.  Raising here, as
                # was done before, made that pass unreachable.

            # second pass: some EAM potentials have a normalizing
            # equilibrium density which has to be determined from the
            # parameterization of the electron density function
            for p in self.constrained_parameter_names:
                if definitions[p][0] != 'equals':
                    continue
                _v = definitions[p][1]
                if type(_v) is not list:
                    continue

                if _v[0] == 'equilibrium_density':
                    a0 = _v[1]
                    latt = _v[2]
                    _parameters[p] = self.calculate_equilibrium_density(
                        a0, latt, _parameters)
                elif not p.endswith('latticetype'):
                    raise ValueError(
                        "unsupported 'equals' definition for parameter "
                        "{}: {}".format(p, _v))

            try:
                # now we check parameter inequality constraints
                for k, v in self.parameter_constraints.items():
                    _eval_str = v
                    for pn, pv in _parameters.items():
                        _eval_str = _eval_str.replace(pn, str(pv))

                    if eval(_eval_str) is False:
                        s = 'parameter constraint failed, {}'.format(k)
                        raise PyposmatBadParameterError(s,
                                                        parameters=_parameters)
                _results = self.evaluate_parameter_set(parameters=_parameters)
            except (PyposmatBadParameterError, LammpsSimulationError,
                    PypospackTaskManagerError, PypospackBadEamEosError) as e:
                # a failed parameter set is recorded and sampling continues
                self.pyposmat_badparameters.write_simulation_exception(
                    sim_id=sim_id, exception=e)
                _n_errors += 1
            else:
                self.pyposmat_datafile_out.write_simulation_results(
                    filename=self.pyposmat_data_out_filename,
                    sim_id=i_sample,
                    cluster_id=cluster_id,
                    results=_results)
            finally:
                # print out summaries every 10 solutions
                if (i_sample + 1) % 10 == 0:
                    n_samples_completed = i_sample + 1
                    time_end = time.time()
                    time_total = time_end - time_start_iteration
                    avg_time = time_total / n_samples_completed
                    _str_msg = 'R{}:{} samples completed in {:.4f}s. Avg_time = {:.4f}. n_errors = {}'.format(
                        self.mpi_rank, n_samples_completed, time_total,
                        avg_time, _n_errors)
                    self.log(_str_msg)

    def run_file_sampling(self, filename_in):
        """Evaluate the parameter sets stored in a datafile.

        Rows are distributed round-robin over the MPI ranks: this process
        evaluates row ``n`` only when ``n % self.mpi_size == self.mpi_rank``.
        Successful results are written to the output datafile; parameter sets
        which fail a constraint or raise one of the handled simulation errors
        are written to the bad-parameters file.  A progress message is
        printed every 10 rows evaluated by this rank.

        Args:
            filename_in (str): path of the datafile to read parameters from
        """

        _datafile_in = PyposmatDataFile(filename=filename_in)
        _datafile_in.read()

        self.write_data_out_header()
        self.write_badparameters_header()

        time_start_iteration = time.time()

        _n_errors = 0
        n_samples_completed = 0  # rows evaluated by this rank
        for i_row, (_, row) in enumerate(_datafile_in.df.iterrows()):
            # The row position alone decides ownership.  The previous shared
            # counter was incremented twice per evaluated row, which broke
            # the round-robin split and doubled the reported sample count.
            if self.mpi_rank != i_row % self.mpi_size:
                continue

            _parameters = OrderedDict([(p, row[p])
                                       for p in self.parameter_names])
            _sim_id = row['sim_id']
            # isinstance also matches numpy float64, which subclasses float
            if isinstance(_sim_id, float):
                _sim_id = int(_sim_id)

            # parameters which are derived rather than read from the file
            for p in self.constrained_parameter_names:
                if self.parameter_distribution_definition[p][0] != 'equals':
                    continue
                _v = self.parameter_distribution_definition[p][1]
                if type(_v) is list and _v[0] == 'equilibrium_density':
                    a0 = _v[1]
                    latt = _v[2]
                    _parameters[p] = self.calculate_equilibrium_density(
                        a0, latt, _parameters)

            try:
                # check constraints
                for k, v in self.parameter_constraints.items():
                    _eval_str = v
                    for pn, pv in _parameters.items():
                        _eval_str = _eval_str.replace(pn, str(pv))
                    # NOTE(review): eval() executes text taken from the
                    # configuration file; only use trusted configurations.
                    if eval(_eval_str) is False:
                        m = "failed parameter constraint, {}".format(k)
                        raise PyposmatBadParameterError(m,
                                                        parameters=_parameters)

                _results = self.evaluate_parameter_set(parameters=_parameters)
            except (PyposmatBadParameterError, LammpsSimulationError,
                    PypospackTaskManagerError, PypospackBadEamEosError) as e:
                # the handlers previously referenced an undefined "sim_id"
                self.pyposmat_badparameters.write_simulation_exception(
                    sim_id=_sim_id, exception=e)
                _n_errors += 1
            else:
                self.pyposmat_datafile_out.write_simulation_results(
                    filename=self.pyposmat_data_out_filename,
                    sim_id=_sim_id,
                    results=_results)
            finally:
                # print out summaries every 10 solutions
                n_samples_completed += 1
                if n_samples_completed % 10 == 0:
                    time_end = time.time()
                    time_total = time_end - time_start_iteration
                    avg_time = time_total / n_samples_completed
                    _str_msg = '{} samples completed in {:.4f}s. Avg_time = {:.4f}. n_errors = {}'.format(
                        n_samples_completed, time_total, avg_time, _n_errors)
                    print('rank{}:'.format(self.mpi_rank) + _str_msg)

    def calculate_equilibrium_density(self, a0, latt, parameters):
        """Sum the electron density contributed by the neighbor shells.

        The density function named by
        ``self.configuration.potential['density_type']`` is evaluated on a
        radial grid, and the contributions of all neighbor shells inside the
        cutoff (halfway between the 2NN and 3NN shells) are added up.

        Args:
            a0: lattice parameter of the reference structure
            latt (str): lattice type; only 'fcc' is implemented
            parameters (OrderedDict): parameter name -> value.  Entries whose
                name starts with 'd_' are passed to the density function
                with that prefix removed.

        Returns:
            the summed density for 'fcc'; None for any other lattice type

        Raises:
            ValueError: if ``parameters`` is empty for an 'fcc' lattice
        """
        # density parameters, with the 'd_' prefix stripped
        _parameters = OrderedDict(
            (k[2:], v) for k, v in parameters.items() if k.startswith('d_'))

        _potential_type = self.configuration.potential['density_type']
        _symbols = self.configuration.potential['symbols']
        _module_name, _class_name = PotentialObjectMap(
            potential_type=_potential_type)
        _module = importlib.import_module(_module_name)
        _class = getattr(_module, _class_name)
        _dens_potential = _class(symbols=_symbols)

        if latt != 'fcc':
            # NOTE(review): other lattice types are not implemented and
            # yield None, as before.
            return None

        if not parameters:
            raise ValueError(
                'cannot determine the density symbol from empty parameters')
        # NOTE(review): the symbol is taken from the last parameter name
        # (text after the two-character prefix, up to the next '_'), as in
        # the previous implementation -- confirm for multi-element systems.
        s = list(parameters)[-1][2:].split('_')[0]

        # fcc shell distances in units of a0: a0/sqrt(2), a0, sqrt(1.5)*a0.
        # The first shell was previously 2/sqrt(2)*a0, which lies beyond the
        # cutoff and silently dropped the 12 nearest neighbors.
        d = OrderedDict([('1NN', a0 / (2**0.5)), ('2NN', 1.000 * a0),
                         ('3NN', 1.225 * a0)])
        Z = OrderedDict([('1NN', 12), ('2NN', 6), ('3NN', 24)])
        rcut = (d['2NN'] + d['3NN']) / 2.

        rmax = 10.
        r = np.linspace(1, 10, 5000) * rmax / 10
        rho = _dens_potential.evaluate(r, _parameters, rcut)

        rho_e = 0
        for m in Z:
            if d[m] < rcut:
                rho_e += Z[m] * np.interp(d[m], r, rho[s])

        return rho_e

    def print_structure_database(self):
        """Log a table of the structure names and their filenames."""
        banner = 80 * '-'
        column_rule = 20 * '-'

        m = [
            banner,
            '{:^80}'.format('STRUCTURE DATABASE'),
            banner,
            'structure_directory:{}'.format(self.structure_directory),
            '',
            '{:^20} {:^20}'.format('name', 'filename'),
            '{} {}'.format(column_rule, column_rule),
        ]
        # one row per entry of self.structures['structures']
        for name, structure_fn in self.structures['structures'].items():
            m.append('{:20} {:20}'.format(name, structure_fn))

        self.log(m)

    def print_sampling_configuration(self):
        """Print one row per iteration: index, sample count, sampling type.

        For the 'kde_w_clusters' sampling type the count is read from
        'n_samples_per_cluster'; for every other type from 'n_samples'.
        """
        banner = 80 * '-'
        row_fmt = '{:^10} {:^10} {:^20}'

        print(banner)
        print('{:^80}'.format('SAMPLING CONFIGURATION'))
        print(banner)

        print(row_fmt.format('iteration', 'n_samples', 'sampling_type'))
        print('{} {} {}'.format(10 * '-', 10 * '-', 20 * '-'))

        for i in range(self.n_iterations):
            sampling = self.configuration.sampling_type[i]
            _sample_type = sampling['type']
            if _sample_type == 'kde_w_clusters':
                count_key = 'n_samples_per_cluster'
            else:
                count_key = 'n_samples'
            print(row_fmt.format(i, sampling[count_key], _sample_type))

    def print_initial_parameter_distribution(self):
        """Print the initial distribution of every parameter.

        Free parameters are printed with their distribution type and its two
        defining values ('a' and 'b' for uniform, 'mu' and 'sigma' for
        normal).  All other parameters are printed as 'not_free'.

        Raises:
            ValueError: if a free parameter uses a distribution type other
                than 'uniform' or 'normal'
        """
        print(80 * '-')
        print('{:80}'.format('INITIAL PARAMETER DISTRIBUTION'))
        print(80 * '-')

        definitions = self.parameter_distribution_definition
        row_fmt = '{:^20} {:^10} {:^10} {:^10} {:^10}'

        for p in definitions:
            if p not in self.free_parameter_names:
                print('{:^20} {:^10}'.format(p, 'not_free'))
                continue

            _distribution_type = definitions[p][0]
            if _distribution_type == 'uniform':
                print(row_fmt.format(p, 'free', _distribution_type,
                                     definitions[p][1]['a'],
                                     definitions[p][1]['b']))
            elif _distribution_type == 'normal':
                print(row_fmt.format(p, 'free', _distribution_type,
                                     definitions[p][1]['mu'],
                                     definitions[p][1]['sigma']))
            else:
                # This branch previously read a misspelled attribute and
                # failed with AttributeError before the message was built.
                s = "incorrection parameter distribution for parameter {}.  probability distribution function, {}, is not supported"
                s = s.format(p, _distribution_type)
                raise ValueError(s)
Exemplo n.º 3
0
class PyposmatFileSampler(PyposmatEngine):
    """Samples parameter sets from a datafile.

    Args:
        config_fn(str): path of the configuration file
        data_in_fn(str): path of the datafile to sample from
        data_out_fn(str): path of the datafile to write results to
        mpi_rank(int): the MPI rank of this process
        mpi_size(int): the MPI size of this process
        o_log(str,PyposmatLogFile,None): log object instance, or the path of
            a log file
        log_to_stdout(bool): if True, logged messages are also printed
        base_directory(str): passed through to PyposmatEngine
        fullauto(bool): if True, the configuration file and the input
            datafile are read, and the output datafile is configured, during
            initialization

    Attributes:
        configuration(PyposmatConfigurationFile): configuration file
        datafile_in(PyposmatDataFile): object instance of the datafile being read in
        datafile_out(PyposmatDataFile): object instance of the datafile being written to

    """

    def __init__(self,
            config_fn='pyposmat.config.in',
            data_in_fn='pyposmat.results.in',
            data_out_fn='pyposmat.results.out',
            mpi_rank = None,
            mpi_size=None,
            o_log = None,
            log_to_stdout=True,
            base_directory = None,
            fullauto=True):
        """Initialize the sampler and, with fullauto, load its input files.

        Args:
            config_fn(str): path of the configuration file
            data_in_fn(str): path of the datafile to sample from
            data_out_fn(str): path of the datafile to write results to
            mpi_rank(int,optional): MPI rank of this process
            mpi_size(int,optional): MPI size of this process
            o_log(str,PyposmatLogFile,None): see configure_logger()
            log_to_stdout(bool): see configure_logger()
            base_directory(str,optional): passed to PyposmatEngine.__init__()
            fullauto(bool): if True, read the configuration file and the
                input datafile and configure the output datafile

        Raises:
            TypeError: propagated from initialize_mpi_information() and
                configure_logger() for invalid arguments.
        """

        self.DEBUG = False
        self.configuration = None
        self.datafile_in = None
        self.datafile_out = None

        self.mpi_rank = None
        self.mpi_size = None

        self.qoi_manager = None
        self.task_manager = None

        PyposmatEngine.__init__(self,
                filename_in=config_fn,
                filename_out=data_out_fn,
                base_directory=base_directory,
                fullauto=fullauto)

        self.configuration_fn = config_fn
        self.datafile_in_fn = data_in_fn
        self.datafile_out_fn = data_out_fn
        self.pyposmat_badparameters_filename = 'pyposmat.badparameters.out'

        # reset after the base class initializer; the configuration is read
        # again below when fullauto is True
        self.configuration = None
        self.datafile = None
        self.subselect_df = None
        self.reference_potentials = None

        self.qoi_validation_names = None
        self.error_validation_names = None
        self.normed_error_validation_names = None

        # same attribute name that read_configuration_file() assigns
        self.qoi_validation_targets = None
        self.obj_log = None
        self.log_to_stdout = None

        self.initialize_mpi_information(mpi_rank=mpi_rank,
                                        mpi_size=mpi_size)
        self.configure_logger(o_log=o_log,
                              log_to_stdout=log_to_stdout)
        if fullauto is True:
            self.read_configuration_file(filename=self.configuration_fn)
            self.read_datafile_in(filename=data_in_fn)
            self.configure_datafile_out(filename=data_out_fn)

    @property
    def n_iterations(self):
        """configuration.sampling_type['n_iterations'], or None if configuration is not a PyposmatConfigurationFile."""
        config = self.configuration
        if not isinstance(config, PyposmatConfigurationFile):
            return None
        return config.sampling_type['n_iterations']

    @property
    def parameter_names(self):
        """configuration.parameter_names, or None if configuration is not a PyposmatConfigurationFile."""
        config = self.configuration
        if not isinstance(config, PyposmatConfigurationFile):
            return None
        return config.parameter_names

    @property
    def qoi_names(self):
        """configuration.qoi_names, or None if configuration is not a PyposmatConfigurationFile."""
        config = self.configuration
        if not isinstance(config, PyposmatConfigurationFile):
            return None
        return config.qoi_names

    @property
    def qoi_targets(self):
        """configuration.qoi_targets, or None if configuration is not a PyposmatConfigurationFile."""
        # `isinstance` was misspelled, so every access raised NameError
        if isinstance(self.configuration,PyposmatConfigurationFile):
            return self.configuration.qoi_targets
        else:
            return None

    @property
    def error_names(self):
        """configuration.error_names, or None if configuration is not a PyposmatConfigurationFile."""
        config = self.configuration
        if not isinstance(config, PyposmatConfigurationFile):
            return None
        return config.error_names

    @property
    def normed_error_names(self):
        """configuration.normed_error_names, or None if configuration is not a PyposmatConfigurationFile."""
        config = self.configuration
        if not isinstance(config, PyposmatConfigurationFile):
            return None
        return config.normed_error_names
    
    @property
    def parameter_distribution_definition(self):
        """configuration.sampling_distribution, or None if configuration is not a PyposmatConfigurationFile."""
        config = self.configuration
        if not isinstance(config, PyposmatConfigurationFile):
            return None
        return config.sampling_distribution

    @property
    def free_parameter_names(self):
        """configuration.free_parameter_names, or None if configuration is not a PyposmatConfigurationFile."""
        config = self.configuration
        if not isinstance(config, PyposmatConfigurationFile):
            return None
        return config.free_parameter_names

    @property
    def parameter_constraints(self):
        """configuration.sampling_constraints, or None if configuration is not a PyposmatConfigurationFile."""
        config = self.configuration
        if not isinstance(config, PyposmatConfigurationFile):
            return None
        return config.sampling_constraints

    def initialize_mpi_information(self,mpi_rank=None,mpi_size=None):
        """Store the MPI rank and size of this process.

        Args:
            mpi_rank(int,optional): rank of this process
            mpi_size(int,optional): number of processes

        When both arguments are None, rank 0 and size 1 are used.

        Raises:
            TypeError: if the arguments are neither both int nor both None.
        """
        both_given = isinstance(mpi_rank, int) and isinstance(mpi_size, int)
        both_missing = mpi_rank is None and mpi_size is None

        if not both_given and not both_missing:
            error_message = "mpi_rank:{}\n".format(mpi_rank)
            error_message += "mpi_size:{}".format(mpi_size)
            raise TypeError(error_message)

        if both_given:
            self.mpi_rank, self.mpi_size = mpi_rank, mpi_size
        else:
            self.mpi_rank, self.mpi_size = 0, 1


    def log(self,str_msg):
        """Write a message to the log object and, optionally, to standard out.

        Args:
            str_msg(str,list): This is the message to log.  A string is
                logged as is.  The items of a list (or tuple) are joined with
                newlines into a single message.  Any other object is
                converted with str().

        """
        if isinstance(str_msg, str):
            m = str_msg
        elif isinstance(str_msg, (list, tuple)):
            m = "\n".join(str(line) for line in str_msg)
        else:
            # without this branch `m` would be unbound for other types
            m = str(str_msg)

        obj_log = self.obj_log
        if obj_log is not None and isinstance(obj_log, PyposmatLogFile):
            obj_log.write(m)

        if self.log_to_stdout:
            print(m)
    
    def configure_logger(self,o_log=None,log_to_stdout=True):
        """Configure where log() sends its messages.

        The behavior depends on the type of o_log.  A PyposmatLogFile is
        stored by reference.  A string is treated as a filename from which a
        new PyposmatLogFile is created.  With None, the default, no log
        object is used and messages only reach standard out (when
        log_to_stdout is True).

        Args:
            o_log (str,PyposmatLogFile,None): default: None
            log_to_stdout (bool): whether log() also prints the message

        Raises:
            TypeError: if o_log or log_to_stdout has an unsupported type.
        """

        # isinstance() is used so that subclasses are accepted as well
        if o_log is None:
            self.obj_log = None
        elif isinstance(o_log, str):
            self.obj_log = PyposmatLogFile(filename=o_log)
        elif isinstance(o_log, PyposmatLogFile):
            self.obj_log = o_log
        else:
            m = "log object must be str, PyposmatLogFile, or None"
            raise TypeError(m)

        if isinstance(log_to_stdout,bool):
            self.log_to_stdout = log_to_stdout
        else:
            m = "log_to_stdout must be boolean"
            raise TypeError(m)
    
    def configure_qoi_manager(self,qois=None,use_fitting_qois=True,use_testing_qois=False):
        """Select the qois to calculate and pass them to the base class.

        Args:
            qois(dict,optional): if given, a deep copy of it is used and the
                two flags below are ignored
            use_fitting_qois(bool): include configuration.qois
            use_testing_qois(bool): include configuration.qois_validation
        """
        if qois is None:
            selected = OrderedDict()
            if use_fitting_qois:
                selected.update(self.configuration.qois.items())
            if use_testing_qois:
                selected.update(self.configuration.qois_validation.items())
        else:
            selected = copy.deepcopy(qois)

        PyposmatEngine.configure_qoi_manager(self, selected)

    def configure_task_manager(self):
        """Configure the task manager by delegating to PyposmatEngine.configure_task_manager()."""
        PyposmatEngine.configure_task_manager(self)

    def configure_pyposmat_badparameters_file(self,filename=None):
        """Create the PyposmatBadParametersFile object for this sampler.

        Args:
            filename(str,optional): path of the bad parameters file.  When
                omitted, the current pyposmat_badparameters_filename is used.

        Raises:
            TypeError: if filename is given but is not a string.
        """
        if filename is not None:
            # an explicit check, since assert statements are removed with -O
            if not isinstance(filename, str):
                raise TypeError(
                    "filename must be a str, not {}".format(type(filename).__name__))
            self.pyposmat_badparameters_filename = filename

        self.pyposmat_badparameters = PyposmatBadParametersFile(
                filename=self.pyposmat_badparameters_filename,
                o_config=self.configuration)

    def read_configuration_file(self,filename=None):
        """Read the configuration file and cache validation related settings.

        The file is read by PyposmatEngine.read_configuration_file().  The
        validation qoi names, validation targets, and reference potentials
        are then copied from the configuration onto this object.

        Args:
            filename(str,optional): path of the configuration file

        Raises:
            PyposmatEngineError: in DEBUG mode, if structure_directory is not
                an existing directory.
        """
        PyposmatEngine.read_configuration_file(self,filename=filename)

        # does this have to be removed?
        # self.structure_directory = self.configuration.structures['structure_directory']
        if self.DEBUG:
            if os.path.isdir(self.structure_directory):
                msg = "[OK] structure_directory:{}".format(self.structure_directory)
                # self.__log would be name-mangled to a method which does not
                # exist; the public log() method is the intended target
                self.log(msg)
            else:
                msg = "[FAIL] structure_directory:{}".format(self.structure_directory)
                raise PyposmatEngineError(msg)

        # set name arrays for validation qois
        self.qoi_validation_names = self.configuration.qoi_validation_names
        self.error_validation_names = self.configuration.error_validation_names
        self.normed_error_validation_names = self.configuration.normed_error_validation_names

        # set dictionaries for qoi targets
        self.qoi_validation_targets = self.configuration.qoi_validation_targets

        # set dictionary for reference potentials
        self.reference_potentials = self.configuration.reference_potentials

    def read_datafile_in(self,filename=None):
        """Replace datafile_in with a new PyposmatDataFile read from filename."""
        reader = PyposmatDataFile()
        self.datafile_in = reader
        reader.read(filename=filename)

    def configure_datafile_out(self,filename=None):
        """Create the output PyposmatDataFile.

        Args:
            filename(str,optional): if given, replaces datafile_out_fn before
                the output datafile object is created
        """
        target_fn = self.datafile_out_fn if filename is None else filename
        self.datafile_out_fn = target_fn
        self.datafile_out = PyposmatDataFile(target_fn)

    def subselect_by_dmetric(self,nsmallest=50):
        """Select the rows of datafile_in.df with the smallest d_metric.

        For every qoi, a '<qoi>.nerr' column is added holding the '<qoi>.err'
        column divided by the qoi target.  The 'd_metric' column is the
        square root of the row-wise sum of the squared normed_error_names
        columns.

        Args:
            nsmallest(int): number of rows to keep

        Returns:
            the selected rows, which are also stored in subselect_df
        """
        df = self.datafile_in.df

        # normalized errors for the qois
        for qn in self.qoi_names:
            target = self.qoi_targets[qn]
            df["{}.nerr".format(qn)] = df["{}.err".format(qn)] / target

        squared = np.square(df[self.normed_error_names])
        df['d_metric'] = np.sqrt(squared.sum(axis=1))

        self.subselect_df = df.nsmallest(nsmallest, 'd_metric')
        return self.subselect_df
    
    def run_simulations(self,i_iteration,n_samples=None,filename=None):
        """ run simulations

        Reads the datafile to sample from, writes the header of the output
        datafile, evaluates the reference potentials (if any), and then
        evaluates the rows of subselect_df, or of the whole input datafile
        when subselect_df is None.

        Args:
            i_iteration(int): the iteration cycle we are on
            n_samples(int,optional): does not do anything
            filename(str,optional): the filename we are sampling from.  When
                it is not a string, the 'file' entry of
                configuration.sampling_type[i_iteration] is used.

        """
        if isinstance(filename,str):
            self.read_datafile_in(filename=filename)
        else:
            # NOTE(review): this used to branch on os.path.isabs() but read
            # the file identically in both branches; relative paths are
            # passed through unchanged -- confirm that is intended
            _filename = self.configuration.sampling_type[i_iteration]['file']
            self.read_datafile_in(filename=_filename)

        _header = OrderedDict()
        _header['filename'] = self.datafile_out_fn
        _header['parameter_names'] = self.parameter_names
        _header['qoi_names'] = self.qoi_names
        _header['error_names'] = self.error_names
        if self.qoi_validation_names is not None:
            # validation columns are only written when they are configured
            _header['qoi_v_names'] = self.qoi_validation_names
            _header['error_v_names'] = self.error_validation_names
        self.datafile_out.write_header_section(**_header)

        if self.reference_potentials is not None:
            self._sample_from_reference_potentials()

        if self.subselect_df is not None:
            _df = self.subselect_df
        else:
            _df = self.datafile_in.df
        self._sample_from_subselect_df(subselect_df=_df)

    def _sample_from_reference_potentials(self,reference_potentials=None):
        """

        This method assumes that the reference potentials have the same functional form as 
        the potentials being tested.

        """
        if reference_potentials is None:
            _rpotentials = self.reference_potentials

        for potential_name,potential in _rpotentials.items():
            
            try:
                _sim_id = int(float(potential_name))
            except ValueError as e:
                _sim_id = potential_name

            parameters = potential['parameters']
            
            evals = self.evaluate_parameter_set(parameters=parameters)

            _results = OrderedDict()
            _results['parameters'] = parameters
            
            _results['qois'] = OrderedDict()
            for v in self.qoi_names:
                try:
                    _results['qois'][v] = potential['qoi'][v]
                except KeyError as e:
                    if v in evals['qois']:
                        _results['qois'][v] = evals['qois'][v]
                    else:
                        _results['qois'][qn] = np.NaN
            _results['errors'] = OrderedDict()
            for v in self.error_names:
                try:
                    qn = ".".join([s for s in v.split(".") if s != 'err'])
                    qhat = potential['qoi'][qn]
                    q = self.configuration.qois[qn]['target']
                    _results['errors'][v] = qhat - q
                except KeyError as e:
                    if v in evals['errors']:
                        _results['errors'][v] = evals['errors'][v]
                    else:
                        _results['errors'][qn] = np.NaN

            _results['qois_v'] = OrderedDict()
            for v in self.qoi_validation_names:
                if v in evals['qois']:
                    _results['qois_v'][v] = evals['qois'][v]
                else:
                    _results['qois_v'][qn] = np.NaN
                    
            _results['errors_v'] = OrderedDict()
            for v in self.error_validation_names:
                if v in evals['errors']:
                    _results['errors_v'][v] = evals['errors'][v]
                else:
                    _results['errors_v'][qn] = np.NaN
          
            self.datafile_out.write_simulation_results(
                    sim_id = _sim_id,
                    results = _results)


    def _sample_from_subselect_df(self,subselect_df=None):
        
        if subselect_df is None:
            _subselect_df = self.subselect_df
        else:
            _subselect_df = subselect_df

        time_start_iteration = time.time()
        n_errors = 0

        for i_sample,row in _subselect_df.iterrows():
            if i_sample%self.mpi_size != self.mpi_rank:
                pass
            else:

                # populate local parameter dictionary
                parameters = OrderedDict([(pn,row[pn]) for pn in self.parameter_names])
               
                try:
                    sim_id = int(float(row['sim_id']))
                except ValueError as e:
                    sim_id = row['sim_id']

                try:
                    evals = self.evaluate_parameter_set(parameters=parameters)
                except PyposmatBadParameterError as e:
                    n_errors += 1
                except LammpsSimulationError as e:
                    n_errors += 1
                except PypospackTaskManagerError as e:
                    n_errors += 1
                else:
                    results = OrderedDict()
                    results['parameters'] = parameters
                    results['qois'] = OrderedDict([(v,evals['qois'][v]) for v in self.qoi_names])
                    results['errors'] = OrderedDict([(v,evals['errors'][v]) for v in self.error_names])
                    self.datafile_out.write_simulation_results(sim_id = sim_id,results = results)
                finally:
                    if (i_sample+1)%10 == 0:
                        n_samples_completed = i_sample+1
                        time_end = time.time()
                        time_total = time_end-time_start_iteration
                        avg_time = time_total/n_samples_completed
                        s = 'R{}:{} samples completed in {:4f}s, Avg_time={:4f}. n_errors={}'
                        s = s.format(self.mpi_rank,n_samples_completed,time_total,avg_time,n_errors)
                        self.log(s)
    
    def print_structure_database(self):
        """Log a table of the structure names and their filenames.

        The table is passed to log() as a single list of lines.
        """
        rule = 80*'-'
        column_rule = 20*'-'

        lines = [rule, '{:^80}'.format('STRUCTURE DATABASE'), rule]
        lines.append('structure_directory:{}'.format(self.structure_directory))
        lines.append('')
        lines.append('{:^20} {:^20}'.format('name','filename'))
        lines.append('{} {}'.format(column_rule, column_rule))
        lines.extend(
            '{:20} {:20}'.format(name, structure_fn)
            for name, structure_fn in self.structures['structures'].items())

        self.log(lines)

    def print_sampling_configuration(self):
        """Print a table with the sample count and sampling type of each iteration.

        For the 'kde_w_clusters' sampling type the 'n_samples_per_cluster'
        entry is printed; for every other type the 'n_samples' entry.
        """
        rule = 80*'-'
        row_format = '{:^10} {:^10} {:^20}'

        print(rule)
        print('{:^80}'.format('SAMPLING CONFIGURATION'))
        print(rule)

        print(row_format.format('iteration', 'n_samples', 'sampling_type'))
        print('{} {} {}'.format(10*'-',10*'-',20*'-'))

        for i in range(self.n_iterations):
            _sample_type = self.configuration.sampling_type[i]['type']
            if _sample_type == 'kde_w_clusters':
                count_key = 'n_samples_per_cluster'
            else:
                count_key = 'n_samples'
            _n_samples = self.configuration.sampling_type[i][count_key]
            print(row_format.format(i, _n_samples, _sample_type))
    
    def print_initial_parameter_distribution(self):
        """Log a table of the initial sampling distribution of each parameter.

        Free parameters are logged with their distribution type and the two
        values that define it ('a' and 'b' for 'uniform', 'mu' and 'sigma'
        for 'normal').  Every other parameter is logged as 'not_free'.  All
        lines go through log(), so the header and the rows reach the same
        destinations.

        Raises:
            ValueError: if a free parameter uses a distribution type other
                than 'uniform' or 'normal'.
        """
        self.log(80*'-')
        self.log('{:80}'.format('INITIAL PARAMETER DISTRIBUTION'))
        self.log(80*'-')
        for p in self.parameter_distribution_definition:
            if p not in self.free_parameter_names:
                self.log('{:^20} {:^10}'.format(p,'not_free'))
                continue

            _distribution_type = self.parameter_distribution_definition[p][0]
            if _distribution_type == 'uniform':
                _keys = ('a', 'b')
            elif _distribution_type == 'normal':
                _keys = ('mu', 'sigma')
            else:
                # the attribute name must be spelled correctly here, otherwise
                # an AttributeError hides the ValueError that is intended
                s = "incorrection parameter distribution for parameter {}.  probability distribution function, {}, is not supported"
                s = s.format(p,_distribution_type)
                raise ValueError(s)

            _args = self.parameter_distribution_definition[p][1]
            self.log('{:^20} {:^10} {:^10} {:^10} {:^10}'.format(
                p,
                'free',
                _distribution_type,
                _args[_keys[0]],
                _args[_keys[1]]))
Exemplo n.º 4
0
                    results['qois'][q] = press_data.df.iloc[
                        j, press_data.df.columns.get_loc(q)]

                results['errors'] = OrderedDict()
                for e in a0_config.error_names:
                    results['errors'][e] = a0_data.df.iloc[
                        i, a0_data.df.columns.get_loc(e)]
                for e in press_config.error_names:
                    results['errors'][e] = press_data.df.iloc[
                        j, press_data.df.columns.get_loc(e)]

                print(results)

                # write the data results
                new_data.write_simulation_results(sim_id=i,
                                                  results=results,
                                                  filename=new_data_fn)
                break
            else:
                pass

        #print(a0_data.df.iloc[
        #    i,
        #    [a0_data.df.columns.get_loc(c) for c in parameter_names]
        #])
        #print(press_data.df.iloc[
        #     i,
        #     [press_data.df.columns.get_loc(c) for c in parameter_names]
        #])

    #a0_col_names
    data_incremental.read(v)
    for row in data_incremental.df.iterrows():
        if pyposmat_data_out is None:
            pyposmat_data_out = PyposmatDataFile()
            pyposmat_data_out.write_header_section(
                parameter_names=data_incremental.parameter_names,
                qoi_names=data_incremental.qoi_names,
                error_names=data_incremental.error_names,
                filename=pyposmat_data_out_fn)
        sim_id = sim_id_fmt.format(i_iteration, i_sim_id)

        results = OrderedDict()
        results['parameters'] = OrderedDict()
        for p in pyposmat_data_out.parameter_names:
            results['parameters'][p] = row[1][p]

        results['qois'] = OrderedDict()
        for q in pyposmat_data_out.qoi_names:
            results['qois'][q] = row[1][q]

        results['errors'] = OrderedDict()
        for e in pyposmat_data_out.error_names:
            results['errors'][e] = row[1][e]

        pyposmat_data_out.write_simulation_results(
            sim_id=sim_id,
            results=results,
            cluster_id=None,
            filename=pyposmat_data_out_fn)
        i_sim_id += 1