示例#1
0
class OTF(object):
    """Molecular dynamics loop driven by a GP, with DFT called on demand.

    On every step the selected prediction function fills in forces and
    uncertainties on ``self.structure``. If the module-level
    ``is_std_in_bound`` check reports the uncertainty as out of bounds,
    DFT is run on the current structure, the returned atoms are added to
    the GP training set, and the hyperparameters are re-optimised as long
    as no more than ``freeze_hyps`` DFT calls have been made.
    """

    def __init__(self,
                 dft_input: str,
                 dt: float,
                 number_of_steps: int,
                 gp: gp.GaussianProcess,
                 dft_loc: str,
                 std_tolerance_factor: float = 1,
                 prev_pos_init: np.ndarray = None,
                 par: bool = False,
                 skip: int = 0,
                 init_atoms: List[int] = None,
                 calculate_energy=False,
                 output_name='otf_run',
                 max_atoms_added=1,
                 freeze_hyps=10,
                 rescale_steps=None,
                 rescale_temps=None,
                 dft_softwarename="qe",
                 no_cpus=1,
                 npool=None,
                 mpi="srun"):
        """
        Parse the DFT input file and set up the state of the run.

        :param dft_input: DFT input file; parsed for the initial structure
            and passed to every DFT call
        :param dt: Time step handed to the ``md`` helpers
        :param number_of_steps: Number of MD steps performed by ``run``
        :param gp: Gaussian process used to predict forces
        :param dft_loc: Forwarded to ``run_dft_par`` of the DFT module
        :param std_tolerance_factor: Forwarded to ``is_std_in_bound``; a
            value of 0 also skips the initial DFT call
        :param prev_pos_init: Passed to the structure as ``prev_positions``
        :param par: Use the parallel prediction functions
        :param skip: GP-only steps are recorded once the step counter
            reaches this value
        :param init_atoms: Atoms added to the GP after the first DFT call;
            defaults to every atom
        :param calculate_energy: Use the prediction functions that also
            compute local energies
        :param output_name: Name handed to ``Output``
        :param max_atoms_added: Forwarded to ``is_std_in_bound``
        :param freeze_hyps: Hyperparameters are trained only while the
            number of DFT calls does not exceed this value
        :param rescale_steps: Steps at which velocities are rescaled;
            ``None`` means no rescaling
        :param rescale_temps: Target temperatures, index-aligned with
            ``rescale_steps``
        :param dft_softwarename: Key into the ``dft_software`` mapping
        :param no_cpus: Forwarded to the prediction function and to DFT
        :param npool: Forwarded to ``run_dft_par``
        :param mpi: Forwarded to ``run_dft_par``
        """

        self.dft_input = dft_input
        self.dt = dt
        self.number_of_steps = number_of_steps
        self.gp = gp
        self.dft_loc = dft_loc
        self.std_tolerance = std_tolerance_factor
        self.skip = skip
        self.dft_step = True
        self.freeze_hyps = freeze_hyps
        self.dft_module = dft_software[dft_softwarename]

        # parse input file
        positions, species, cell, masses = \
            self.dft_module.parse_dft_input(self.dft_input)

        _, coded_species = struc.get_unique_species(species)

        self.structure = struc.Structure(cell=cell,
                                         species=coded_species,
                                         positions=positions,
                                         mass_dict=masses,
                                         prev_positions=prev_pos_init,
                                         species_labels=species)

        self.noa = self.structure.positions.shape[0]
        self.atom_list = list(range(self.noa))
        self.curr_step = 0

        self.max_atoms_added = max_atoms_added

        # initialize local energies
        if calculate_energy:
            self.local_energies = np.zeros(self.noa)
        else:
            self.local_energies = None

        # set atom list for initial dft run
        if init_atoms is None:
            self.init_atoms = [int(n) for n in range(self.noa)]
        else:
            self.init_atoms = init_atoms

        self.dft_count = 0

        # set pred function
        if par:
            if calculate_energy:
                self.pred_func = predict.predict_on_structure_par_en
            else:
                self.pred_func = predict.predict_on_structure_par
        else:
            if calculate_energy:
                self.pred_func = predict.predict_on_structure_en
            else:
                self.pred_func = predict.predict_on_structure
        self.par = par

        # set rescale attributes; a fresh list per instance avoids sharing
        # one default list object between every OTF that omits them
        self.rescale_steps = [] if rescale_steps is None else rescale_steps
        self.rescale_temps = [] if rescale_temps is None else rescale_temps

        self.output = Output(output_name, always_flush=True)

        # set number of cpus and npool for qe runs
        self.no_cpus = no_cpus
        self.npool = npool
        self.mpi = mpi

    def run(self):
        """
        Perform ``number_of_steps`` MD steps, calling DFT and updating the
        GP whenever the predicted uncertainty is out of bounds.
        """
        self.output.write_header(self.gp.cutoffs, self.gp.kernel_name,
                                 self.gp.hyps, self.gp.algo, self.dt,
                                 self.number_of_steps, self.structure,
                                 self.std_tolerance)
        counter = 0
        self.start_time = time.time()

        while self.curr_step < self.number_of_steps:
            print('curr_step:', self.curr_step)
            # run DFT and train initial model if first step and DFT is on
            if self.curr_step == 0 and self.std_tolerance != 0:
                # call dft and update positions
                self.run_dft()
                dft_frcs = copy.deepcopy(self.structure.forces)
                new_pos = md.update_positions(self.dt, self.noa,
                                              self.structure)
                self.update_temperature(new_pos)
                self.record_state()

                # make initial gp model and predict forces
                self.update_gp(self.init_atoms, dft_frcs)
                if (self.dft_count - 1) < self.freeze_hyps:
                    self.train_gp()

            # after step 1, try predicting with GP model
            else:
                self.gp.check_L_alpha()
                self.pred_func(self.structure, self.gp, self.no_cpus)
                self.dft_step = False
                new_pos = md.update_positions(self.dt, self.noa,
                                              self.structure)

                # get max uncertainty atoms
                std_in_bound, target_atoms = is_std_in_bound(
                    self.std_tolerance, self.gp.hyps[-1], self.structure,
                    self.max_atoms_added)

                if not std_in_bound:
                    # record GP forces
                    self.update_temperature(new_pos)
                    self.record_state()
                    gp_frcs = copy.deepcopy(self.structure.forces)

                    # run DFT and record forces
                    self.dft_step = True
                    self.run_dft()
                    dft_frcs = copy.deepcopy(self.structure.forces)
                    new_pos = md.update_positions(self.dt, self.noa,
                                                  self.structure)
                    self.update_temperature(new_pos)
                    self.record_state()

                    # compute mae and write to output
                    mae = np.mean(np.abs(gp_frcs - dft_frcs))
                    mac = np.mean(np.abs(dft_frcs))

                    self.output.write_to_log('\nmean absolute error:'
                                             ' %.4f eV/A \n' % mae)
                    self.output.write_to_log('mean absolute dft component:'
                                             ' %.4f eV/A \n' % mac)

                    # add max uncertainty atoms to training set
                    self.update_gp(target_atoms, dft_frcs)
                    if (self.dft_count - 1) < self.freeze_hyps:
                        self.train_gp()

            # write gp forces (DFT steps were already recorded above)
            if counter >= self.skip and not self.dft_step:
                self.update_temperature(new_pos)
                self.record_state()
                counter = 0

            counter += 1
            self.update_positions(new_pos)
            self.curr_step += 1

        self.output.conclude_run()

    def run_dft(self):
        """
        Run DFT on the current structure, store the returned forces on it
        and log the call count and elapsed wall time.
        """
        self.output.write_to_log('\nCalling DFT...\n')

        # calculate DFT forces
        forces = self.dft_module.run_dft_par(self.dft_input,
                                             self.structure,
                                             self.dft_loc,
                                             no_cpus=self.no_cpus,
                                             npool=self.npool,
                                             mpi=self.mpi)
        self.structure.forces = forces

        # write wall time of DFT calculation
        self.dft_count += 1
        self.output.write_to_log('QE run complete.\n')
        time_curr = time.time() - self.start_time
        self.output.write_to_log('number of DFT calls: %i \n' % self.dft_count)
        self.output.write_to_log('wall time from start: %.2f s \n' % time_curr)

    def update_gp(self, train_atoms, dft_frcs):
        """
        Add the given atoms of the current structure to the GP training set.

        :param train_atoms: Indices of the atoms to add
        :param dft_frcs: Forces passed to ``gp.update_db`` as labels
        """
        self.output.write_to_log(
            '\nAdding atom {} to the training set.\n'.format(train_atoms))
        # only the uncertainty of the first listed atom is logged
        self.output.write_to_log('Uncertainty: {}.\n'.format(
            self.structure.stds[train_atoms[0]]))

        # update gp model
        self.gp.update_db(self.structure, dft_frcs, custom_range=train_atoms)

        self.gp.set_L_alpha()

    def train_gp(self):
        """
        Optimise the GP hyperparameters and write them to the output.
        """
        self.gp.train(self.output)
        self.output.write_hyps(self.gp.hyp_labels, self.gp.hyps,
                               self.start_time, self.gp.likelihood,
                               self.gp.likelihood_gradient)

    def update_positions(self, new_pos):
        """
        Advance the structure to ``new_pos``, rescaling velocities first if
        the current step is listed in ``rescale_steps``.

        :param new_pos: Positions for the next step
        """
        if self.curr_step in self.rescale_steps:
            # run() refreshes temperature and velocities only on recorded
            # or DFT steps, so recompute them here to avoid rescaling with
            # values left over from an earlier step.
            self.update_temperature(new_pos)
            rescale_ind = self.rescale_steps.index(self.curr_step)
            temp_fac = self.rescale_temps[rescale_ind] / self.temperature
            vel_fac = np.sqrt(temp_fac)
            self.structure.prev_positions = \
                new_pos - self.velocities * self.dt * vel_fac
        else:
            self.structure.prev_positions = self.structure.positions
        self.structure.positions = new_pos
        self.structure.wrap_positions()

    def update_temperature(self, new_pos):
        """
        Store the kinetic energy, temperature and velocities returned by
        ``md.calculate_temperature`` for a move to ``new_pos``.

        :param new_pos: Positions for the next step
        """
        KE, temperature, velocities = \
                md.calculate_temperature(new_pos, self.structure, self.dt,
                                         self.noa)
        self.KE = KE
        self.temperature = temperature
        self.velocities = velocities

    def record_state(self):
        """
        Write the current MD configuration to the log and the xyz output.
        """
        self.output.write_md_config(self.dt, self.curr_step, self.structure,
                                    self.temperature, self.KE,
                                    self.local_energies, self.start_time,
                                    self.dft_step, self.velocities)
        self.output.write_xyz_config(self.curr_step, self.structure,
                                     self.dft_step)
示例#2
0
class TrajectoryTrainer(object):
    """Train a GP from a list of frames and log GP-vs-reference errors."""

    def __init__(self, frames: List[Structure],
                 gp: GaussianProcess,
                 rel_std_tolerance: float = 1,
                 abs_std_tolerance: float = 1,
                 parallel: bool = False,
                 skip: int = 0,
                 calculate_energy: bool = False,
                 output_name: str = 'gp_from_aimd',
                 max_atoms_from_frame: int = np.inf, max_trains: int = np.inf,
                 min_atoms_added: int = 1,
                 n_cpus: int = 1, shuffle_frames: bool = False,
                 verbose: int = 0, model_write: str = '',
                 pre_train_on_skips: bool = False,
                 pre_train_seed_frames: List[Structure] = None,
                 pre_train_seed_envs: List[Tuple[AtomicEnvironment,
                                                 np.array]] = None,
                 pre_train_atoms_per_element: dict = None):
        """
        Class which trains a GP off of an AIMD trajectory, and generates
        error statistics between the DFT and GP calls.

        :param frames: List of structures to evaluate / train GP on. If
            shuffle_frames is set, this list is shuffled in place.
        :param gp: Gaussian Process object
        :param rel_std_tolerance: Train if uncertainty is above this *
            noise variance hyperparameter
        :param abs_std_tolerance: Train if uncertainty is above this
        :param parallel: Use parallel functions or not
        :param skip: Skip through frames (stored; not read by the methods
            of this class)
        :param calculate_energy: Use local energy kernel or not
        :param output_name: Write output of training to this file
        :param max_atoms_from_frame: Largest # of atoms added from one frame
        :param min_atoms_added: Only train when this many atoms have been
            added (stored; not read by the methods of this class)
        :param max_trains: Stop training GP after this many calls to train
        :param n_cpus: Number of CPUs to parallelize over (stored; not
            forwarded to the prediction function by this class)
        :param shuffle_frames: Randomize order of frames for better training
        :param verbose: 0: Silent, 1: Minimal, 2: Lots of information
        :param model_write: Write output model here
        :param pre_train_on_skips: Train model on every n frames before
            running (stored; not read by the methods of this class)
        :param pre_train_seed_frames: Frames to train on before running
        :param pre_train_seed_envs: Environments to train on before running
        :param pre_train_atoms_per_element: Max # of environments to add from
            each species in the seed pre-training steps
        """

        self.frames = frames
        if shuffle_frames:
            np.random.shuffle(frames)
        self.gp = gp
        self.rel_std_tolerance = rel_std_tolerance
        self.abs_std_tolerance = abs_std_tolerance
        self.skip = skip
        self.max_trains = max_trains
        self.curr_step = 0
        self.max_atoms_from_frame = max_atoms_from_frame
        self.min_atoms_added = min_atoms_added
        self.verbose = verbose
        self.train_count = 0

        self.parallel = parallel

        # set pred function
        if parallel:
            if calculate_energy:
                self.pred_func = predict_on_structure_par_en
            else:
                self.pred_func = predict_on_structure_par
        else:
            if calculate_energy:
                self.pred_func = predict_on_structure_en
            else:
                self.pred_func = predict_on_structure

        self.output = Output(output_name)

        # set number of cpus for parallelization
        self.n_cpus = n_cpus

        # To later be filled in using the time library
        self.start_time = None
        self.pickle_name = model_write

        self.pre_train_on_skips = pre_train_on_skips
        # Missing seeds are normalised to empty containers, so later code
        # must test them for emptiness rather than for None.
        self.seed_envs = [] if pre_train_seed_envs is None else \
            pre_train_seed_envs
        self.seed_frames = [] if pre_train_seed_frames is None \
            else pre_train_seed_frames
        self.pre_train_env_per_species = {} if pre_train_atoms_per_element \
            is None else pre_train_atoms_per_element

    def pre_run(self):
        """
        Various tasks to set up the AIMD training before commencing
        the run through the AIMD trajectory.
        1. Print the output header.
        2. Pre-train the GP with the seed frames and environments. If no
        seed frames were given and the GP has no training set, the first
        frame is used as the seed frame so that every species present in
        it is represented.
        """

        self.output.write_header(self.gp.cutoffs,
                                 self.gp.kernel_name,
                                 self.gp.hyps,
                                 self.gp.algo,
                                 dt=0,
                                 Nsteps=len(self.frames),
                                 structure=self.frames[0],
                                 std_tolerance=(self.rel_std_tolerance,
                                                self.abs_std_tolerance))

        self.start_time = time.time()

        # If seed environments were passed in, add them to the GP.
        for point in self.seed_envs:
            self.gp.add_one_env(point[0], point[1], train=False)

        # No training set ("blank slate" run) and no seed frames specified:
        # seed from the first frame so all atomic species in it are
        # represented in the first step. Otherwise use the seed frames
        # passed in by the user. seed_frames is never None here (see
        # __init__), hence the emptiness test.
        if len(self.gp.training_data) == 0 and not self.seed_frames:
            self.seed_frames = [self.frames[0]]

        for frame in self.seed_frames:
            train_atoms = []
            for species_i in set(frame.coded_species):
                # Get a randomized set of atoms of species i from the frame
                # So that it is not always the lowest-indexed atoms chosen
                atoms_of_specie = frame.indices_of_specie(species_i)
                np.random.shuffle(atoms_of_specie)
                n_at = len(atoms_of_specie)
                # Determine how many to add based on user defined cutoffs
                n_to_add = min(n_at, self.pre_train_env_per_species.get(
                    species_i, np.inf), self.max_atoms_from_frame)

                train_atoms.extend(atoms_of_specie[:n_to_add])

            self.update_gp_and_print(frame, train_atoms, train=False)

        # Train if either the GP was never trained or data was added to it
        # during the pre-run.
        if self.gp.l_mat is None or self.seed_frames or self.seed_envs:
            self.gp.train(output=self.output if self.verbose > 0 else None)

    def run(self):
        """
        Loop through frames and record the error between
        the GP predictions and the ground-truth forces. Train the GP and update
        the training set upon the triggering of the uncertainty threshold.
        :return:
        """

        self.pre_run()

        # Loop through trajectory
        for i, cur_frame in enumerate(self.frames):

            if self.verbose >= 2:
                print("=====NOW ON FRAME {}=====".format(i))
            # Keep the reference forces: the prediction call is expected
            # to overwrite cur_frame.forces with the GP values.
            dft_forces = deepcopy(cur_frame.forces)
            self.pred_func(cur_frame, self.gp)

            # Convert to meV/A
            mae = np.mean(np.abs(cur_frame.forces - dft_forces)) * 1000
            mac = np.mean(np.abs(dft_forces)) * 1000

            self.output.write_gp_dft_comparison(
                curr_step=i, frame=cur_frame,
                start_time=time.time(),
                dft_forces=dft_forces,
                mae=mae, mac=mac, local_energies=None)

            # get max uncertainty atoms
            std_in_bound, train_atoms = self.is_std_in_bound(cur_frame)
            if not std_in_bound:

                # add max uncertainty atoms to training set
                self.update_gp_and_print(cur_frame, train_atoms, train=False)

                if self.train_count < self.max_trains:
                    self.train_gp()

        self.output.conclude_run()

        if self.pickle_name:
            with open(self.pickle_name, 'wb') as f:
                pickle.dump(self.gp, f)

    def update_gp_and_print(self, frame: Structure, train_atoms: List[int],
                            train: bool = True):
        """
        Update the internal GP model training set with a list of training
        atoms indexing atoms within the frame. If train is True, re-train
        the GP by optimizing hyperparameters.
        :param frame: Structure to train on
        :param train_atoms: Index atoms to train on
        :param train: Train or not
        :return:
        """

        self.output.write_to_log('\nAdding atom(s) {} to the '
                                 'training set.\n'
                                 .format(train_atoms, ))
        self.output.write_to_log('Uncertainties: {}.\n'
                                 .format(frame.stds[train_atoms]))

        # update gp model
        self.gp.update_db(frame, frame.forces, custom_range=train_atoms)
        self.gp.set_L_alpha()

        if train:
            self.train_gp()

    def train_gp(self):
        """
        Train the Gaussian process and write the results to the output file.
        """
        self.gp.train(output=self.output if self.verbose >= 2 else None)

        self.output.write_hyps(self.gp.hyp_labels, self.gp.hyps,
                               self.start_time,
                               self.gp.like, self.gp.like_grad)
        self.train_count += 1

    def is_std_in_bound(self, frame: Structure) -> Tuple[bool, List[int]]:
        """
        Check the per-atom uncertainties of a frame against the threshold.

        :param frame: Structure whose ``stds`` are inspected
        :return: ``(True, [-1])`` if every atom is within the threshold
            (or both tolerances are 0); otherwise ``(False, atoms)`` where
            atoms holds at most max_atoms_from_frame indices of the most
            uncertain atoms, in order of increasing uncertainty
        """

        # This indicates test mode, as the GP is not being modified in any way
        if self.rel_std_tolerance == 0 and self.abs_std_tolerance == 0:
            return True, [-1]

        # set uncertainty threshold; a tolerance of 0 disables that criterion
        if self.rel_std_tolerance == 0:
            threshold = self.abs_std_tolerance
        elif self.abs_std_tolerance == 0:
            threshold = self.rel_std_tolerance * np.abs(self.gp.hyps[-1])
        else:
            threshold = min(self.rel_std_tolerance * np.abs(self.gp.hyps[-1]),
                            self.abs_std_tolerance)

        # largest uncertainty component of each atom, then atoms sorted by it
        nat = frame.nat
        max_stds = np.zeros(nat)
        for atom_idx, std in enumerate(frame.stds):
            max_stds[atom_idx] = np.max(std)
        stds_sorted = np.argsort(max_stds)

        # Take the n_take most uncertain atoms. Slicing from the front
        # index (rather than with [-n_take:]) keeps a limit of 0 from
        # selecting every atom, and also covers an unlimited (np.inf) or
        # larger-than-frame limit.
        n_take = min(self.max_atoms_from_frame, nat)
        target_atoms = list(stds_sorted[nat - n_take:])

        # if above threshold, return atoms
        if max_stds[stds_sorted[-1]] > threshold:
            return False, target_atoms
        else:
            return True, [-1]
示例#3
0
文件: otf.py 项目: nw13slx/flare-RC
class OTF(object):
    """Molecular dynamics loop driven by a GP, with Quantum Espresso
    called on demand.

    On every step the GP predicts forces and uncertainties for
    ``self.structure``. If ``is_std_in_bound`` reports the uncertainty as
    out of bounds, Quantum Espresso is run on the current structure, the
    most uncertain atoms are added to the GP training set, and the
    hyperparameters are re-optimised as long as no more than
    ``freeze_hyps`` DFT calls have been made.
    """

    def __init__(self,
                 qe_input: str,
                 dt: float,
                 number_of_steps: int,
                 gp: gp.GaussianProcess,
                 pw_loc: str,
                 std_tolerance_factor: float = 1,
                 prev_pos_init: np.ndarray = None,
                 par: bool = False,
                 skip: int = 0,
                 init_atoms: List[int] = None,
                 calculate_energy=False,
                 output_name='otf_run',
                 max_atoms_added=1,
                 freeze_hyps=10,
                 rescale_steps=None,
                 rescale_temps=None,
                 no_cpus=1):
        """
        Parse the Quantum Espresso input and set up the state of the run.

        :param qe_input: QE input file; parsed for the initial structure
            and passed to every DFT call
        :param dt: Time step handed to the ``md`` helpers
        :param number_of_steps: Number of MD steps performed by ``run``
        :param gp: Gaussian process used to predict forces
        :param pw_loc: Forwarded to ``qe_util.run_espresso_par``
        :param std_tolerance_factor: Uncertainty tolerance, see
            ``is_std_in_bound``; 0 also skips the initial DFT call
        :param prev_pos_init: Passed to the structure as ``prev_positions``
        :param par: Use the process-pool prediction methods
        :param skip: GP-only steps are recorded once the step counter
            reaches this value
        :param init_atoms: Atoms added to the GP after the first DFT call;
            defaults to every atom
        :param calculate_energy: Also predict local energies
        :param output_name: Name handed to ``Output``
        :param max_atoms_added: Largest number of atoms added to the
            training set per DFT call
        :param freeze_hyps: Hyperparameters are trained only while the
            number of DFT calls does not exceed this value
        :param rescale_steps: Steps at which velocities are rescaled;
            ``None`` means no rescaling
        :param rescale_temps: Target temperatures, index-aligned with
            ``rescale_steps``
        :param no_cpus: Forwarded to ``qe_util.run_espresso_par``
        """

        self.qe_input = qe_input
        self.dt = dt
        self.number_of_steps = number_of_steps
        self.gp = gp
        self.pw_loc = pw_loc
        self.std_tolerance = std_tolerance_factor
        self.skip = skip
        self.dft_step = True
        self.freeze_hyps = freeze_hyps

        # parse input file
        positions, species, cell, masses = \
            qe_util.parse_qe_input(self.qe_input)

        _, coded_species = struc.get_unique_species(species)

        self.structure = struc.Structure(cell=cell,
                                         species=coded_species,
                                         positions=positions,
                                         mass_dict=masses,
                                         prev_positions=prev_pos_init,
                                         species_labels=species)

        self.noa = self.structure.positions.shape[0]
        self.atom_list = list(range(self.noa))
        self.curr_step = 0

        self.max_atoms_added = max_atoms_added

        # initialize local energies
        if calculate_energy:
            self.local_energies = np.zeros(self.noa)
        else:
            self.local_energies = None

        # set atom list for initial dft run
        if init_atoms is None:
            self.init_atoms = [int(n) for n in range(self.noa)]
        else:
            self.init_atoms = init_atoms

        self.dft_count = 0

        # set pred function
        if par:
            if calculate_energy:
                self.pred_func = self.predict_on_structure_par_en
            else:
                self.pred_func = self.predict_on_structure_par
        else:
            if calculate_energy:
                self.pred_func = self.predict_on_structure_en
            else:
                self.pred_func = self.predict_on_structure
        self.par = par

        # set rescale attributes; a fresh list per instance avoids sharing
        # one default list object between every OTF that omits them
        self.rescale_steps = [] if rescale_steps is None else rescale_steps
        self.rescale_temps = [] if rescale_temps is None else rescale_temps

        self.output = Output(output_name)

        # set number of cpus for qe runs
        self.no_cpus = no_cpus

    def run(self):
        """
        Perform ``number_of_steps`` MD steps, calling DFT and updating the
        GP whenever the predicted uncertainty is out of bounds.
        """
        self.output.write_header(self.gp.cutoffs, self.gp.kernel_name,
                                 self.gp.hyps, self.gp.algo, self.dt,
                                 self.number_of_steps, self.structure,
                                 self.std_tolerance)
        counter = 0
        self.start_time = time.time()

        while self.curr_step < self.number_of_steps:
            print('curr_step:', self.curr_step)
            # run DFT and train initial model if first step and DFT is on
            if self.curr_step == 0 and self.std_tolerance != 0:
                # call dft and update positions
                self.run_dft()
                dft_frcs = copy.deepcopy(self.structure.forces)
                new_pos = md.update_positions(self.dt, self.noa,
                                              self.structure)
                self.update_temperature(new_pos)
                self.record_state()

                # make initial gp model and predict forces
                self.update_gp(self.init_atoms, dft_frcs)
                if (self.dft_count - 1) < self.freeze_hyps:
                    self.train_gp()

            # after step 1, try predicting with GP model
            else:
                self.pred_func()
                self.dft_step = False
                new_pos = md.update_positions(self.dt, self.noa,
                                              self.structure)

                # get max uncertainty atoms
                std_in_bound, target_atoms = self.is_std_in_bound()

                if not std_in_bound:
                    # record GP forces
                    self.update_temperature(new_pos)
                    self.record_state()
                    gp_frcs = copy.deepcopy(self.structure.forces)

                    # run DFT and record forces
                    self.dft_step = True
                    self.run_dft()
                    dft_frcs = copy.deepcopy(self.structure.forces)
                    new_pos = md.update_positions(self.dt, self.noa,
                                                  self.structure)
                    self.update_temperature(new_pos)
                    self.record_state()

                    # compute mae and write to output
                    mae = np.mean(np.abs(gp_frcs - dft_frcs))
                    mac = np.mean(np.abs(dft_frcs))

                    self.output.write_to_log('\nmean absolute error:'
                                             ' %.4f eV/A \n' % mae)
                    self.output.write_to_log('mean absolute dft component:'
                                             ' %.4f eV/A \n' % mac)

                    # add max uncertainty atoms to training set
                    self.update_gp(target_atoms, dft_frcs)
                    if (self.dft_count - 1) < self.freeze_hyps:
                        self.train_gp()

            # write gp forces (DFT steps were already recorded above)
            if counter >= self.skip and not self.dft_step:
                self.update_temperature(new_pos)
                self.record_state()
                counter = 0

            counter += 1
            self.update_positions(new_pos)
            self.curr_step += 1

        self.output.conclude_run()

    def _predict_components(self, chemenv):
        """Return the three GP force components and their stds for one
        atomic environment."""
        comps = []
        stds = []
        for i in range(3):
            force, var = self.gp.predict(chemenv, i + 1)
            comps.append(float(force))
            stds.append(np.sqrt(np.abs(var)))
        return comps, stds

    def predict_on_atom(self, atom):
        """
        Predict the force on one atom of the current structure.

        :param atom: Index of the atom
        :return: (force components, standard deviations), three of each
        """
        chemenv = env.AtomicEnvironment(self.structure, atom, self.gp.cutoffs)
        return self._predict_components(chemenv)

    def predict_on_atom_en(self, atom):
        """
        Predict the force and local energy of one atom.

        :param atom: Index of the atom
        :return: (force components, standard deviations, local energy)
        """
        chemenv = env.AtomicEnvironment(self.structure, atom, self.gp.cutoffs)
        comps, stds = self._predict_components(chemenv)

        # predict local energy
        local_energy = self.gp.predict_local_energy(chemenv)
        return comps, stds, local_energy

    def predict_on_structure_par(self):
        """
        Predict forces and uncertainties for every atom using a process
        pool, writing the results onto the structure.
        """
        with concurrent.futures.ProcessPoolExecutor() as executor:
            results = executor.map(self.predict_on_atom, self.atom_list)
            for n, res in enumerate(results):
                for i in range(3):
                    self.structure.forces[n][i] = res[0][i]
                    self.structure.stds[n][i] = res[1][i]

    def predict_on_structure_par_en(self):
        """
        Like ``predict_on_structure_par`` but also fills
        ``self.local_energies``.
        """
        with concurrent.futures.ProcessPoolExecutor() as executor:
            results = executor.map(self.predict_on_atom_en, self.atom_list)
            for n, res in enumerate(results):
                for i in range(3):
                    self.structure.forces[n][i] = res[0][i]
                    self.structure.stds[n][i] = res[1][i]
                self.local_energies[n] = res[2]

    def predict_on_structure(self):
        """
        Predict forces and uncertainties for every atom serially, writing
        the results onto the structure.
        """
        for n in range(self.structure.nat):
            chemenv = env.AtomicEnvironment(self.structure, n, self.gp.cutoffs)
            comps, stds = self._predict_components(chemenv)
            for i in range(3):
                self.structure.forces[n][i] = comps[i]
                self.structure.stds[n][i] = stds[i]

    def predict_on_structure_en(self):
        """
        Like ``predict_on_structure`` but also fills
        ``self.local_energies``.
        """
        for n in range(self.structure.nat):
            chemenv = env.AtomicEnvironment(self.structure, n, self.gp.cutoffs)
            comps, stds = self._predict_components(chemenv)
            for i in range(3):
                self.structure.forces[n][i] = comps[i]
                self.structure.stds[n][i] = stds[i]
            self.local_energies[n] = self.gp.predict_local_energy(chemenv)

    def run_dft(self):
        """
        Run Quantum Espresso on the current structure, store the returned
        forces on it and log the call count and elapsed wall time.
        """
        self.output.write_to_log('\nCalling Quantum Espresso...\n')

        # calculate DFT forces
        forces = qe_util.run_espresso_par(self.qe_input, self.structure,
                                          self.pw_loc, self.no_cpus)
        self.structure.forces = forces

        # write wall time of DFT calculation
        self.dft_count += 1
        self.output.write_to_log('QE run complete.\n')
        time_curr = time.time() - self.start_time
        self.output.write_to_log('number of DFT calls: %i \n' % self.dft_count)
        self.output.write_to_log('wall time from start: %.2f s \n' % time_curr)

    def update_gp(self, train_atoms, dft_frcs):
        """
        Add the given atoms of the current structure to the GP training set.

        :param train_atoms: Indices of the atoms to add
        :param dft_frcs: Forces passed to ``gp.update_db`` as labels
        """
        self.output.write_to_log(
            '\nAdding atom {} to the training set.\n'.format(train_atoms))
        # only the uncertainty of the first listed atom is logged
        self.output.write_to_log('Uncertainty: {}.\n'.format(
            self.structure.stds[train_atoms[0]]))

        # update gp model
        self.gp.update_db(self.structure, dft_frcs, custom_range=train_atoms)

        self.gp.set_L_alpha()

    def train_gp(self):
        """
        Optimise the GP hyperparameters and write them to the output.
        """
        self.gp.train(self.output)
        self.output.write_hyps(self.gp.hyp_labels, self.gp.hyps,
                               self.start_time, self.gp.like,
                               self.gp.like_grad)

    def is_std_in_bound(self):
        """
        Check the per-atom uncertainties of the structure against the
        tolerance.

        A positive ``std_tolerance`` is multiplied by ``|gp.hyps[-1]|``;
        a negative one is used as an absolute threshold; 0 disables the
        check.

        :return: ``(True, [-1])`` if every atom is within the threshold
            or the check is disabled; otherwise ``(False, atoms)`` where
            atoms holds at most ``max_atoms_added`` indices of the most
            uncertain atoms, in order of increasing uncertainty
        """
        # set uncertainty threshold
        if self.std_tolerance == 0:
            # same placeholder list as the in-bound branch below, so the
            # second element always has one type
            return True, [-1]
        elif self.std_tolerance > 0:
            threshold = self.std_tolerance * np.abs(self.gp.hyps[-1])
        else:
            threshold = np.abs(self.std_tolerance)

        # largest uncertainty component of each atom, then atoms sorted by it
        max_stds = np.zeros(self.noa)
        for atom, std in enumerate(self.structure.stds):
            max_stds[atom] = np.max(std)
        stds_sorted = np.argsort(max_stds)

        # Slice from the front index rather than with [-n_take:], which
        # would select every atom when the limit is 0.
        n_take = min(self.max_atoms_added, self.noa)
        target_atoms = list(stds_sorted[self.noa - n_take:])

        # if above threshold, return atoms
        if max_stds[stds_sorted[-1]] > threshold:
            return False, target_atoms
        else:
            return True, [-1]

    def update_positions(self, new_pos):
        """
        Advance the structure to ``new_pos``, rescaling velocities first if
        the current step is listed in ``rescale_steps``.

        :param new_pos: Positions for the next step
        """
        if self.curr_step in self.rescale_steps:
            # run() refreshes temperature and velocities only on recorded
            # or DFT steps, so recompute them here to avoid rescaling with
            # values left over from an earlier step.
            self.update_temperature(new_pos)
            rescale_ind = self.rescale_steps.index(self.curr_step)
            temp_fac = self.rescale_temps[rescale_ind] / self.temperature
            vel_fac = np.sqrt(temp_fac)
            self.structure.prev_positions = \
                new_pos - self.velocities * self.dt * vel_fac
        else:
            self.structure.prev_positions = self.structure.positions
        self.structure.positions = new_pos
        self.structure.wrap_positions()

    def update_temperature(self, new_pos):
        """
        Store the kinetic energy, temperature and velocities returned by
        ``md.calculate_temperature`` for a move to ``new_pos``.

        :param new_pos: Positions for the next step
        """
        KE, temperature, velocities = \
                md.calculate_temperature(new_pos, self.structure, self.dt,
                                         self.noa)
        self.KE = KE
        self.temperature = temperature
        self.velocities = velocities

    def record_state(self):
        """
        Write the current MD configuration to the log and the xyz output.
        """
        self.output.write_md_config(self.dt, self.curr_step, self.structure,
                                    self.temperature, self.KE,
                                    self.local_energies, self.start_time,
                                    self.dft_step, self.velocities)
        self.output.write_xyz_config(self.curr_step, self.structure,
                                     self.dft_step)
class TrajectoryTrainer:
    def __init__(
        self,
        frames: List[Structure] = None,
        gp: Union[GaussianProcess, MappedGaussianProcess] = None,
        rel_std_tolerance: float = 4,
        abs_std_tolerance: float = 1,
        abs_force_tolerance: float = 0,
        max_force_error: float = inf,
        parallel: bool = False,
        n_cpus: int = 1,
        skip: int = 1,
        validate_ratio: float = 0.0,
        calculate_energy: bool = False,
        include_energies: bool = False,
        output_name: str = "gp_from_aimd",
        print_as_xyz: bool = False,
        pre_train_max_iter: int = 50,
        max_atoms_from_frame: int = np.inf,
        max_trains: int = np.inf,
        min_atoms_per_train: int = 1,
        shuffle_frames: bool = False,
        verbose: str = "INFO",
        pre_train_on_skips: int = -1,
        pre_train_seed_frames: List[Structure] = None,
        pre_train_seed_envs: List[Tuple[AtomicEnvironment, "np.array"]] = None,
        pre_train_atoms_per_element: dict = None,
        train_atoms_per_element: dict = None,
        predict_atoms_per_element: dict = None,
        train_checkpoint_interval: int = 1,
        checkpoint_interval: int = 1,
        atom_checkpoint_interval: int = 100,
        print_training_plan: bool = True,
        model_format: str = "pickle",
    ):
        """
        Trainer which fits a GP to an AIMD trajectory and records error
        statistics between the DFT reference and the GP predictions.

        The many optional arguments give fine-grained control over the
        training process.

        :param frames: List of structures to evaluate / train the GP on.
            If ``shuffle_frames`` is set this list is shuffled in place.
        :param gp: Gaussian Process (or mapped Gaussian Process) object
        :param rel_std_tolerance: Train if uncertainty is above this *
            noise variance hyperparameter
        :param abs_std_tolerance: Train if uncertainty is above this
        :param abs_force_tolerance: Add atom if force error exceeds this
        :param max_force_error: Don't add atom if force error exceeds this
        :param parallel: Deprecated flag; only triggers a
            DeprecationWarning when it is exactly True
        :param n_cpus: Number of CPUs to parallelize over for
            parallelization over atoms
        :param skip: Skip through frames; must be an integer >= 1
        :param validate_ratio: Fraction of frames used for validation;
            must lie in [0, 1]
        :param calculate_energy: Use local energy kernel or not
        :param include_energies: Add energies associated with individual
            frames
        :param output_name: Write output of training to this file
        :param print_as_xyz: If True, print the configurations in xyz format
        :param pre_train_max_iter: Iteration cap stored for the pre-run
            training
        :param max_atoms_from_frame: Largest # of atoms added from one frame
        :param max_trains: Stop training GP after this many calls to train
        :param min_atoms_per_train: Only train when this many atoms have
            been added
        :param shuffle_frames: Randomize order of frames for better training
        :param verbose: same as logging level, "WARNING", "INFO", "DEBUG"
        :param pre_train_on_skips: Train model on every n frames before
            running
        :param pre_train_seed_frames: Frames to train on before running
        :param pre_train_seed_envs: Environments to train on before running
        :param pre_train_atoms_per_element: Max # of environments to add
            from each species in the seed pre-training steps. Entries keyed
            by ``element_to_Z(key)`` are added to this dictionary in place.
        :param train_atoms_per_element: Max # of environments to add from
            each species in the training steps
        :param predict_atoms_per_element: Choose a random subset of N random
            atoms from each specified element to predict on. For instance,
            {"H":5} will only predict the forces and uncertainties
            associated with 5 Hydrogen atoms per frame. Elements not
            specified will be predicted as normal. This is useful for
            systems where you are most interested in a subset of elements.
            This will result in a faster but less exhaustive learning
            process.
        :param train_checkpoint_interval: How often to write model after
            trainings
        :param checkpoint_interval: Will be deprecated. Used only when
            ``train_checkpoint_interval`` is falsy.
        :param atom_checkpoint_interval: How often to write model after
            atoms are added (since atoms may be added without training)
        :param print_training_plan: Write which atoms in which frames that
            triggered uncertainty or force conditions, so that training can
            be 'fast-forwarded' later. Also useful for gauging MGP results
            and then applying the atoms with high uncertainty and error to
            a GP.
        :param model_format: Format to write GP model to
        """

        # Frames to walk through. NOTE: the shuffle is done in place, so the
        # list object handed in by the caller is reordered as well.
        self.frames = frames
        if shuffle_frames:
            np.random.shuffle(frames)
            if print_training_plan:
                warnings.warn("Frames are shuffled so training plan will not"
                              " map onto the structures used; Try to "
                              "shuffle the frames outside of the GPFA module "
                              "for now.")

        # Model, plus a flag remembering whether it is a mapped GP
        self.gp = gp
        self.gp_is_mapped = isinstance(gp, MappedGaussianProcess)

        # Thresholds and limits steering atom selection and training
        self.rel_std_tolerance = rel_std_tolerance
        self.abs_std_tolerance = abs_std_tolerance
        self.abs_force_tolerance = abs_force_tolerance
        self.max_force_error = max_force_error
        self.max_trains = max_trains
        self.max_atoms_from_frame = max_atoms_from_frame
        self.min_atoms_per_train = min_atoms_per_train
        self.predict_atoms_per_element = predict_atoms_per_element
        self.calculate_energy = calculate_energy
        self.include_energies = include_energies
        self.n_cpus = n_cpus

        if parallel is True:
            warnings.warn(
                "Parallel flag will be deprecated;we will instead use n_cpu alone.",
                DeprecationWarning,
            )

        # Select the structure-level and environment-level predictors
        # depending on the model type and on whether energies are wanted.
        if not self.gp_is_mapped:
            self.pred_func = (predict_on_structure_par_en
                              if calculate_energy else
                              predict_on_structure_par)
            self.pred_func_env = self.gp.predict_force_xyz
        else:
            self.pred_func = predict_on_structure_mgp
            self.pred_func_env = self.gp.predict

        # How the trajectory is traversed
        self.skip = skip
        assert isinstance(skip, int) and skip >= 1, \
            "Skip needs to be a positive integer."
        self.validate_ratio = validate_ratio
        assert 0 <= validate_ratio <= 1, "validate_ratio needs to be [0,1]"

        # Pre-training configuration; missing containers become empty ones
        self.pre_train_max_iter = pre_train_max_iter
        self.pre_train_on_skips = pre_train_on_skips
        self.seed_envs = (pre_train_seed_envs
                          if pre_train_seed_envs is not None else [])
        self.seed_frames = (pre_train_seed_frames
                            if pre_train_seed_frames is not None else [])
        self.pre_train_env_per_species = (
            pre_train_atoms_per_element
            if pre_train_atoms_per_element is not None else {})
        self.train_env_per_species = (
            train_atoms_per_element
            if train_atoms_per_element is not None else {})

        # Add an entry keyed by element_to_Z(key) for every pre-training
        # limit. Entries are copied one at a time over a snapshot of the
        # keys, so the original keys are kept alongside the new ones.
        species_caps = self.pre_train_env_per_species
        if species_caps:
            for label in list(species_caps.keys()):
                species_caps[element_to_Z(label)] = species_caps[label]

        # Output handling and checkpoint cadence
        self.output = Output(output_name,
                             verbose,
                             print_as_xyz=print_as_xyz,
                             always_flush=True)
        self.logger_name = self.output.basename + "log"
        self.train_checkpoint_interval = (train_checkpoint_interval
                                          or checkpoint_interval)
        self.atom_checkpoint_interval = atom_checkpoint_interval
        self.model_format = model_format
        self.output_name = output_name
        self.print_training_plan = print_training_plan

        # Run-time bookkeeping
        self.curr_step = 0
        self.train_count = 0
        self.start_time = time.time()

    def run_passive_learning(
        self,
        frames: List[Structure] = (),
        environments: List[AtomicEnvironment] = (),
        max_atoms_per_frame: int = np.inf,
        post_training_iterations: int = 0,
        post_build_matrices: bool = False,
        max_elts_per_frame: Dict[str, int] = None,
        max_model_size: int = np.inf,
        max_model_elts: Dict[str, int] = None,
    ):
        """
        Load the GP with the given environments and with randomly chosen
        atoms from the given frames, without evaluating predictions first.

        If you want to skip frames, slice the input as frames[::skip_n].

        If you want to randomize the frame order, try the random module's
        shuffle function.

        ALL environments passed in are added. From each frame, a random
        sample of atoms of every species is added, subject to the limits
        below. Afterwards the GP is either trained
        (``post_training_iterations``) or its matrices are checked
        (``post_build_matrices``).

        :param frames: Structures to draw training atoms from
        :param environments: Environments added in full; each one's
            ``force`` attribute is used as its label
        :param max_atoms_per_frame: Cap on the atoms taken from one frame
        :param post_training_iterations: If non-zero, train the GP
            afterwards with this value as ``max_iter``
        :param post_build_matrices: If True (and no post training is
            requested), call ``check_L_alpha`` on the GP afterwards
        :param max_elts_per_frame: Per-element cap on the atoms taken from
            one frame. Keys may be element symbols or atomic numbers.
        :param max_model_size: Cap compared against the ``"N"`` entry of the
            GP's training statistics
        :param max_model_elts: Per-element cap compared against the
            ``"envs_by_species"`` entry of the GP's training statistics.
            Keys may be element symbols or atomic numbers.
        :raises NotImplementedError: If the model is a mapped GP
        """

        if self.gp_is_mapped:
            raise NotImplementedError(
                "Passive learning not yet configured for MGP")

        # Work on copies so that the symbol / atomic-number aliases added
        # below never leak into dictionaries owned by the caller.
        max_elts_per_frame = dict(max_elts_per_frame or {})
        max_model_elts = dict(max_model_elts or {})

        logger = logging.getLogger(self.logger_name)
        logger.debug("Beginning passive learning.")

        # If seed environments were passed in, add them to the GP.
        for env in environments:
            self.gp.add_one_env(env, env.force, train=False)

        # Ensure compatibility with number / symbol elemental notation:
        # every limit becomes reachable under both kinds of key.
        for cur_dict in (max_elts_per_frame, max_model_elts):
            for key in list(cur_dict.keys()):
                if isinstance(key, int):
                    cur_dict[Z_to_element(key)] = cur_dict[key]
                elif isinstance(key, str):
                    cur_dict[element_to_Z(key)] = cur_dict[key]

        # Main frame loop
        total_added = 0
        for frame in frames:
            current_stats = self.gp.training_statistics
            # Room left in the model. This is decremented as atoms are
            # chosen, so it must NOT additionally be reduced by
            # len(train_atoms) (that would count each chosen atom twice).
            available_to_add = max_model_size - current_stats["N"]

            train_atoms = []
            for species_i in set(frame.coded_species):
                elt = Z_to_element(species_i)
                atoms_of_specie = frame.indices_of_specie(species_i)

                # Determine how many to add based on user defined cutoffs
                n_add = min(
                    len(atoms_of_specie),
                    max_elts_per_frame.get(species_i, inf),
                    max_atoms_per_frame - len(train_atoms),
                    available_to_add,
                    max_model_elts.get(elt, np.inf) -
                    current_stats["envs_by_species"].get(elt, 0),
                )
                n_add = max(0, n_add)

                # Random sample so that it is not always the
                # lowest-indexed atoms of the species that are chosen
                train_atoms += sample(atoms_of_specie, n_add)
                available_to_add -= n_add
                total_added += n_add

            self.update_gp_and_print(
                frame=frame,
                train_atoms=train_atoms,
                uncertainties=[],
                train=False,
            )

        logger.info(f"Added {total_added} atoms to "
                    "GP.\n"
                    "Current GP Statistics: "
                    f"{json.dumps(self.gp.training_statistics)} ")

        if post_training_iterations:
            logger.debug("Now commencing pre-run training of GP (which has "
                         "non-empty training set)")
            time0 = time.time()
            self.train_gp(max_iter=post_training_iterations)
            logger.debug(f"Done train_gp {time.time() - time0}")
        elif post_build_matrices:
            logger.debug(
                "Now commencing pre-run set up of GP (which has non-empty training set)"
            )
            time0 = time.time()
            self.gp.check_L_alpha()
            logger.debug(f"Done check_L_alpha {time.time() - time0}")

    def run_active_learning(
        self,
        frames: Union[List[Structure], Trajectory] = (),
        rel_std_tolerance: float = 4,
        abs_std_tolerance: float = 0,
        abs_force_tolerance: float = 0.15,
        min_atoms_per_train: int = 200,
        max_force_error: float = inf,
        max_atoms_from_frame: int = inf,
        max_trains: int = inf,
        max_model_size: int = inf,
        max_elts_per_frame: Dict[str, int] = None,
        max_model_elts: Dict[str, int] = None,
        predict_atoms_per_elt: Dict[str, int] = None,
        write_model_train_interval: int = 1,
        write_model_atom_interval: int = 100,
        validate_ratio: float = 0,
        post_write: bool = True,
    ):
        """
        Step through ``frames``, compare the model's predictions with the
        forces stored on each frame, and add atoms to the training set /
        retrain whenever ``evaluate_training_atoms`` reports a frame as out
        of bounds.

        If the model is empty it is first seeded through
        ``run_passive_learning`` from the first frame, with at most one atom
        per element label of that frame.

        For a mapped GP the triggering atoms are only recorded in the
        training plan; nothing is added to the model.

        :param frames: Structures to run through. A list is deep-copied
            into a ``Trajectory`` before the main loop.
        :param rel_std_tolerance: Forwarded to ``evaluate_training_atoms``
        :param abs_std_tolerance: Forwarded to ``evaluate_training_atoms``
        :param abs_force_tolerance: Forwarded to
            ``evaluate_training_atoms``
        :param min_atoms_per_train: Retrain once this many atoms have been
            added since the last training
        :param max_force_error: Forwarded to ``evaluate_training_atoms``
        :param max_atoms_from_frame: Forwarded to
            ``evaluate_training_atoms``; set to 0 once the model is full
        :param max_trains: Maximum number of ``train_gp`` calls made by
            this loop
        :param max_model_size: Atoms of a frame are only added while
            ``len(gp) + len(train_atoms)`` does not exceed this
        :param max_elts_per_frame: Forwarded to ``evaluate_training_atoms``
        :param max_model_elts: Forwarded to ``evaluate_training_atoms``
        :param predict_atoms_per_elt: Forwarded to
            ``subset_of_frame_by_element`` to choose the atoms predicted on
        :param write_model_train_interval: Forwarded to
            ``write_model_decision``
        :param write_model_atom_interval: Forwarded to
            ``write_model_decision``
        :param validate_ratio: Trailing fraction of the frames which is
            predicted on but never used for training
        :param post_write: Write the final model at the end of the run
            (only if ``self.model_format`` is set and the GP is not mapped)
        """

        # Perform pre-run, in which seed frames are used.
        logger = logging.getLogger(self.logger_name)

        if len(self.gp) == 0:
            logger.warning(
                "You are attempting active learning with an empty model. "
                "One atom of each element will be added from the first frame, "
                "but be warned: Hyperparameter optimzation on a very small "
                "subset of data can lead to suboptimal training set "
                "choices, as the hyperparameters will take time to become "
                "representative of their converged state relative to your data of "
                "interest.")
            # Without any frame there is nothing to seed from; skipping
            # avoids an IndexError on frames[0].
            if len(frames) > 0:
                self.run_passive_learning(
                    frames[0:1],
                    max_model_elts={elt: 1
                                    for elt in frames[0].species_labels})

        if isinstance(frames, list):
            frames = Trajectory(deepcopy(frames))

        # Past this frame index, stop adding atoms to the training set
        train_frame = int(len(frames) * (1 - validate_ratio))

        # Loop through trajectory.
        train_model_atom_counter = 0  # Track atoms added for training
        write_model_atom_counter = 0  # Track atoms added for writing
        train_counter = 0  # Track # of times training done

        # Keep track of which atoms trigger force / uncertainty condition
        training_plan = {}

        # MAIN LOOP - Frames
        for i, cur_frame in enumerate(frames):
            frame_start_time = time.time()
            logger.info(f"=====NOW ON FRAME {i}=====")

            # If no predict_atoms_per_element was specified, predict_atoms
            # will be equal to every atom in the frame.
            predict_atoms = subset_of_frame_by_element(cur_frame,
                                                       predict_atoms_per_elt)

            # Atoms which are skipped will have NaN as their force / std values
            local_energies = None

            # Three different predictions: Either MGP, GP with energy,
            # or GP without
            if self.gp_is_mapped:
                pred_forces, pred_stds, local_energies = self.pred_func(
                    structure=cur_frame,
                    mgp=self.gp,
                    write_to_structure=False,
                    selective_atoms=predict_atoms,
                    skipped_atom_value=np.nan,
                    energy=True,
                )
            elif self.calculate_energy:
                pred_forces, pred_stds, local_energies = self.pred_func(
                    structure=cur_frame,
                    gp=self.gp,
                    n_cpus=self.n_cpus,
                    write_to_structure=False,
                    selective_atoms=predict_atoms,
                    skipped_atom_value=np.nan,
                )
            else:
                pred_forces, pred_stds = self.pred_func(
                    structure=cur_frame,
                    gp=self.gp,
                    n_cpus=self.n_cpus,
                    write_to_structure=False,
                    selective_atoms=predict_atoms,
                    skipped_atom_value=np.nan,
                )

            # Get Error
            dft_forces = cur_frame.forces
            dft_energy = cur_frame.energy
            force_error = np.abs(pred_forces - dft_forces)

            # Create dummy frame with the predicted forces written
            dummy_frame = deepcopy(cur_frame)
            dummy_frame.forces = pred_forces
            dummy_frame.stds = pred_stds
            cur_frame.stds = pred_stds

            self.output.write_gp_dft_comparison(
                curr_step=i,
                frame=dummy_frame,
                start_time=frame_start_time,
                dft_forces=dft_forces,
                dft_energy=dft_energy,
                error=force_error,
                local_energies=local_energies,
                KE=0,
                cell=cur_frame.cell,
            )

            logger.debug(
                f"Single frame calculation time {time.time()-frame_start_time}"
            )

            # Validation frames are only predicted on
            if i >= train_frame:
                continue

            # Noise hyperparameter & relative std tolerance is not for gp_is_mapped.
            if self.gp_is_mapped:
                noise = 0
            else:
                noise = Parameters.get_noise(self.gp.hyps_mask,
                                             self.gp.hyps,
                                             constraint=False)

            in_bound, train_atoms = evaluate_training_atoms(
                rel_std_tolerance=rel_std_tolerance,
                abs_std_tolerance=abs_std_tolerance,
                noise=noise,
                abs_force_tolerance=abs_force_tolerance,
                max_force_error=max_force_error,
                pred_forces=pred_forces,
                dft_forces=dft_forces,
                structure=dummy_frame,
                max_model_elts=max_model_elts,
                max_atoms_from_frame=max_atoms_from_frame,
                max_elts_per_frame=max_elts_per_frame,
                training_statistics=self.gp.training_statistics,
            )

            if in_bound:
                continue

            # Protocol for adding atoms to training set

            # Record frame and training atoms, uncertainty, error
            force_errors = list(force_error)
            uncertainties = list(dummy_frame.stds)
            training_plan[int(i)] = [(int(a), uncertainties[a],
                                      force_errors[a])
                                     for a in train_atoms]

            if self.gp_is_mapped:
                continue

            if len(self.gp) + len(train_atoms) <= max_model_size:
                self.update_gp_and_print(
                    cur_frame,
                    train_atoms=train_atoms,
                    uncertainties=pred_stds[train_atoms],
                    train=False,
                )
                # Only count atoms which actually entered the model
                train_model_atom_counter += len(train_atoms)
                write_model_atom_counter += len(train_atoms)
            else:
                logger.info(
                    f"GP is at maximum model size of {max_model_size}. "
                    f"No further atoms will be added for "
                    f"remainder of run, but predictions will still be "
                    f"made. Setting max_atoms_from_frame "
                    f"to 0.")
                max_atoms_from_frame = 0
                if self.model_format:
                    self.gp.write_model(
                        f"{self.output_name}_saturated",
                        self.model_format)

            # Re-train if number of sampled atoms is high enough or this is
            # the last training frame -- in both cases only while the
            # training budget has not been used up.
            retrain_due = (train_model_atom_counter >= min_atoms_per_train
                           or (i + 1) == train_frame)
            if retrain_due and train_counter < max_trains:
                self.train_gp()
                train_counter += 1
                train_model_atom_counter = 0
            else:
                self.gp.update_L_alpha()

            written = self.write_model_decision(
                write_model_train_interval,
                write_model_atom_counter,
                write_model_atom_interval,
                train_counter,
            )
            if written:
                write_model_atom_counter = 0

        # Print training statistics for GP model used
        conclusion_strings = [
            "Final GP statistics:" + json.dumps(self.gp.training_statistics)
        ]
        self.output.conclude_run(conclusion_strings)

        if self.print_training_plan:
            with open(f"{self.output_name}_training_plan.json", "w") as f:
                f.write(json.dumps(training_plan, cls=NumpyEncoder))

        if self.model_format and post_write and not self.gp_is_mapped:
            self.gp.write_model(f"{self.output_name}_model", self.model_format)

    def write_model_decision(
        self,
        write_model_train_interval: int,
        write_model_atom_counter: int,
        write_model_atom_interval: int,
        train_counter: int,
    ) -> bool:
        """
        Decide whether a checkpoint of the model is due and, if so and a
        model format is configured, write it to
        ``<output_name>_checkpt``.

        A checkpoint is due when either
        * the training interval is non-zero and ``train_counter`` is a
          multiple of it, or
        * the atom interval and the atom counter are both non-zero and the
          counter has reached the interval.

        :param write_model_train_interval: Training checkpoint interval
        :param write_model_atom_counter: Atoms counted since the last write
        :param write_model_atom_interval: Atom checkpoint interval
        :param train_counter: Number of trainings done so far
        :return: Whether a checkpoint was due. This is True even when no
            file is written because ``self.model_format`` is falsy.
        """
        # The leading truthiness checks guard the modulo against a zero
        # interval and skip the comparison when nothing has been counted.
        on_train_interval = bool(
            write_model_train_interval
            and train_counter % write_model_train_interval == 0)
        on_atom_interval = bool(
            write_model_atom_interval and write_model_atom_counter
            and write_model_atom_interval <= write_model_atom_counter)

        will_write = on_train_interval or on_atom_interval

        if self.model_format and will_write:
            self.gp.check_L_alpha()
            self.gp.write_model(f"{self.output_name}_checkpt",
                                self.model_format)
        return will_write

    def pre_run(self):
        """
        Write the output header, seed the GP and prepare it for the run.

        Steps, in order:

        1. Write the header through ``self.output``.
        2. Refuse to continue for a mapped GP.
        3. Add every seed environment to the GP.
        4. Choose seed frames: every ``pre_train_on_skips``-th frame if
           that value is positive (those frames are removed from
           ``self.frames``); otherwise, if the GP has no training data and
           no seed frames were given, the first frame.
        5. From each seed frame add a shuffled selection of atoms per
           species, limited by ``pre_train_env_per_species`` and
           ``max_atoms_from_frame``.
        6. Train the GP, or only check its matrices, and write the
           ``<output_name>_prerun`` model if a model format is set.

        :raises NotImplementedError: If the model is a mapped GP (raised
            after the header has been written)
        """

        gp_description = str(self.gp)
        n_frames = len(self.frames)
        tolerances = (self.rel_std_tolerance, self.abs_std_tolerance)
        header_extras = {
            "GP Statistics": json.dumps(self.gp.training_statistics),
            "GP Name": self.gp.name,
            "GP Write Name":
            self.output_name + "_model." + self.model_format,
        }
        self.output.write_header(
            gp_description,
            dt=0,
            Nsteps=n_frames,
            structure=None,
            std_tolerance=tolerances,
            optional=header_extras,
        )

        if self.gp_is_mapped:
            raise NotImplementedError(
                "Passive learning not yet configured for "
                "MGP")

        self.start_time = time.time()
        logger = logging.getLogger(self.logger_name)
        logger.debug("Now beginning pre-run activity.")

        # If seed environments were passed in, add them to the GP.
        for seed_env, seed_force in ((p[0], p[1]) for p in self.seed_envs):
            self.gp.add_one_env(seed_env, seed_force, train=False)

        stride = self.pre_train_on_skips
        if stride > 0:
            # Every stride-th frame becomes a seed frame and is removed
            # from the frames used in the later part of training.
            seeds, remaining = [], []
            for index, frame in enumerate(self.frames):
                (seeds if index % stride == 0 else remaining).append(frame)
            self.seed_frames = seeds
            self.frames = remaining
        elif len(self.gp.training_data) == 0 and len(self.seed_frames) == 0:
            # "Blank slate" run without seeds: use the first frame as seed
            # so all of its atomic species are represented from the start.
            self.seed_frames = [self.frames[0]]
            self.frames = self.frames[1:]

        atom_count = 0
        for frame in self.seed_frames:
            train_atoms = []
            for species_i in set(frame.coded_species):
                # Shuffle so that it is not always the lowest-indexed
                # atoms of the species that are chosen
                atoms_of_specie = frame.indices_of_specie(species_i)
                np.random.shuffle(atoms_of_specie)

                # Determine how many to add based on user defined cutoffs
                n_to_add = min(
                    len(atoms_of_specie),
                    self.pre_train_env_per_species.get(species_i, inf),
                    self.max_atoms_from_frame,
                )

                chosen = atoms_of_specie[:n_to_add]
                train_atoms.extend(chosen)
                atom_count += len(chosen)

            self.update_gp_and_print(
                frame=frame,
                train_atoms=train_atoms,
                uncertainties=[],
                train=False,
            )

        if atom_count > 0:
            logger.info(f"Added {atom_count} atoms to "
                        "pretrain.\n"
                        "Pre-run GP Statistics: "
                        f"{json.dumps(self.gp.training_statistics)} ")

        has_seed_data = self.seed_envs or atom_count or self.seed_frames
        if has_seed_data and (self.pre_train_max_iter or self.max_trains):
            logger.debug("Now commencing pre-run training of GP (which has "
                         "non-empty training set)")
            time0 = time.time()
            self.train_gp(max_iter=self.pre_train_max_iter)
            logger.debug(f"Done train_gp {time.time()-time0}")
        else:
            logger.debug(
                "Now commencing pre-run set up of GP (which has non-empty training set)"
            )
            time0 = time.time()
            self.gp.check_L_alpha()
            logger.debug(f"Done check_L_alpha {time.time()-time0}")

        if self.model_format and not self.gp_is_mapped:
            self.gp.write_model(f"{self.output_name}_prerun",
                                self.model_format)

    def run(self):
        """
        UPDATE: SOON TO BE DEPRECATED, CIRCA SEPTEMBER 2020

        Loop through frames and record the error between
        the GP predictions and the ground-truth forces. Train the GP and update
        the training set upon the triggering of the uncertainty or force error
        threshold.

        :return: None
        """

        # Perform pre-run, in which seed trames are used.
        logger = logging.getLogger(self.logger_name)
        logger.debug("Commencing run with pre-run...")
        if not self.gp_is_mapped:
            self.pre_run()

        # Past this frame, stop adding atoms to the training set
        #  (used for validation of model)
        train_frame = int(
            len(self.frames[::self.skip]) * (1 - self.validate_ratio))

        # Loop through trajectory.
        cur_atoms_added_train = 0  # Track atoms added for training
        cur_atoms_added_write = 0  # Track atoms added for writing
        cur_trains_done_write = 0  # Track training done for writing

        # Keep track of which atoms trigger force / uncertainty condition
        training_plan = {}

        for i, cur_frame in enumerate(self.frames[::self.skip]):

            frame_start_time = time.time()
            logger.info(f"=====NOW ON FRAME {i}=====")

            # If no predict_atoms_per_element was specified, predict_atoms
            # will be equal to every atom in the frame.
            predict_atoms = subset_of_frame_by_element(
                cur_frame, self.predict_atoms_per_element)

            # Atoms which are skipped will have NaN as their force / std values
            local_energies = None

            # Three different predictions: Either MGP, GP with energy,
            # or GP without
            if self.gp_is_mapped:
                pred_forces, pred_stds, local_energies = self.pred_func(
                    structure=cur_frame,
                    mgp=self.gp,
                    write_to_structure=False,
                    selective_atoms=predict_atoms,
                    skipped_atom_value=np.nan,
                    energy=True,
                )
            elif self.calculate_energy:
                pred_forces, pred_stds, local_energies = self.pred_func(
                    structure=cur_frame,
                    gp=self.gp,
                    n_cpus=self.n_cpus,
                    write_to_structure=False,
                    selective_atoms=predict_atoms,
                    skipped_atom_value=np.nan,
                )
            else:
                pred_forces, pred_stds = self.pred_func(
                    structure=cur_frame,
                    gp=self.gp,
                    n_cpus=self.n_cpus,
                    write_to_structure=False,
                    selective_atoms=predict_atoms,
                    skipped_atom_value=np.nan,
                )

            # Get Error
            dft_forces = cur_frame.forces
            dft_energy = cur_frame.energy
            error = np.abs(pred_forces - dft_forces)

            # Create dummy frame with the predicted forces written
            dummy_frame = deepcopy(cur_frame)
            dummy_frame.forces = pred_forces
            dummy_frame.stds = pred_stds
            cur_frame.stds = pred_stds

            self.output.write_gp_dft_comparison(
                curr_step=i,
                frame=dummy_frame,
                start_time=frame_start_time,
                dft_forces=dft_forces,
                dft_energy=dft_energy,
                error=error,
                local_energies=local_energies,
                KE=0,
                cell=cur_frame.cell,
            )

            logger.debug(
                f"Single frame calculation time {time.time()-frame_start_time}"
            )

            if i < train_frame:
                # Noise hyperparameter & relative std tolerance is not for gp_is_mapped.
                if self.gp_is_mapped:
                    noise = 0
                else:
                    noise = Parameters.get_noise(self.gp.hyps_mask,
                                                 self.gp.hyps,
                                                 constraint=False)
                std_in_bound, std_train_atoms = is_std_in_bound_per_species(
                    rel_std_tolerance=self.rel_std_tolerance,
                    abs_std_tolerance=self.abs_std_tolerance,
                    noise=noise,
                    structure=dummy_frame,
                    max_atoms_added=self.max_atoms_from_frame,
                    max_by_species=self.train_env_per_species,
                )

                # Get max force error atoms
                force_in_bound, force_train_atoms = is_force_in_bound_per_species(
                    abs_force_tolerance=self.abs_force_tolerance,
                    predicted_forces=pred_forces,
                    label_forces=dft_forces,
                    structure=dummy_frame,
                    max_atoms_added=self.max_atoms_from_frame,
                    max_by_species=self.train_env_per_species,
                    max_force_error=self.max_force_error,
                )

                if not std_in_bound or not force_in_bound:

                    # -1 is returned from the is_in_bound methods,
                    # so filter that out and the use sets to remove repeats
                    train_atoms = list(
                        set(std_train_atoms).union(force_train_atoms) - {-1})

                    # Record frame and training atoms, uncertainty, error
                    force_errors = list(np.abs(pred_forces - dft_forces))
                    uncertainties = list(dummy_frame.stds)
                    training_plan[int(i)] = [(int(a), uncertainties[a],
                                              force_errors[a])
                                             for a in train_atoms]

                    # Compute mae and write to output;
                    # Add max uncertainty atoms to training set
                    self.update_gp_and_print(
                        cur_frame,
                        train_atoms=train_atoms,
                        uncertainties=pred_stds[train_atoms],
                        train=False,
                    )
                    cur_atoms_added_train += len(train_atoms)
                    cur_atoms_added_write += len(train_atoms)
                    # Re-train if number of sampled atoms is high enough

                    if (cur_atoms_added_train >= self.min_atoms_per_train
                            or (i + 1) == train_frame):
                        if self.train_count < self.max_trains:
                            self.train_gp()
                            cur_trains_done_write += 1
                        else:
                            self.gp.update_L_alpha()
                        cur_atoms_added_train = 0
                    else:
                        self.gp.update_L_alpha()

                    # Loop to decide of a model should be written this
                    # iteration
                    will_write = False

                    if (self.train_checkpoint_interval
                            and cur_trains_done_write
                            and self.train_checkpoint_interval <=
                            cur_trains_done_write):
                        will_write = True
                        cur_trains_done_write = 0

                    if (self.atom_checkpoint_interval and cur_atoms_added_write
                            and self.atom_checkpoint_interval <=
                            cur_atoms_added_write):
                        will_write = True
                        cur_atoms_added_write = 0

                    if self.model_format and will_write:
                        self.gp.write_model(f"{self.output_name}_checkpt",
                                            self.model_format)

                if (i + 1) == train_frame and not self.gp_is_mapped:
                    self.gp.check_L_alpha()

        # Print training statistics for GP model used
        conclusion_strings = [
            "Final GP statistics:" + json.dumps(self.gp.training_statistics)
        ]
        self.output.conclude_run(conclusion_strings)

        if self.print_training_plan:
            with open(f"{self.output_name}_training_plan.json", "w") as f:
                f.write(json.dumps(training_plan, cls=NumpyEncoder))

        if self.model_format and not self.gp_is_mapped:
            self.gp.write_model(f"{self.output_name}_model", self.model_format)

    def update_gp_and_print(
        self,
        frame: Structure,
        train_atoms: List[int],
        uncertainties: List[int] = None,
        train: bool = True,
    ):
        """
        Log the atoms selected from a frame and add them to the GP training
        set; optionally re-train the GP afterwards.

        Nothing is logged or added when ``train_atoms`` is empty. For a mapped
        GP only a warning is logged, because no data is added in that case.

        :param frame: Structure the selected atoms belong to
        :param train_atoms: Indices (into the frame) of the atoms to add
        :param uncertainties: Uncertainties to log. ``None`` falls back to
            ``frame.stds[train_atoms]``; an empty sequence silences the line
        :param train: If True, call ``train_gp`` after the data is added
        :return: None
        """

        # Guard clause: an empty selection leaves model and log untouched.
        if not train_atoms:
            return

        # Bucket the atom indices by element symbol for a compact log line.
        element_per_atom = [
            Z_to_element(frame.coded_species[idx]) for idx in train_atoms
        ]
        atoms_by_element = {}
        for element in set(element_per_atom):
            atoms_by_element[element] = []
        for idx, element in zip(train_atoms, element_per_atom):
            atoms_by_element[element].append(idx)

        log = logging.getLogger(self.logger_name)
        log.info("Adding atom(s) "
                 f"{json.dumps(atoms_by_element,cls=NumpyEncoder)}"
                 " to the training set.")

        if uncertainties is None:
            uncertainties = frame.stds[train_atoms]

        show_uncertainties = (uncertainties is not None
                              and len(uncertainties) != 0)
        if show_uncertainties:
            log.info(f"Uncertainties: {uncertainties}.")

        # NOTE: the statistics are logged before the database update below.
        log.info(
            f"New GP Statistics: {json.dumps(self.gp.training_statistics)}\n")

        # A mapped GP cannot take new data here; warn and stop.
        if self.gp_is_mapped:
            log.warning(
                "Warning: Adding data to an MGP is not yet supported.")
            return

        energy_label = frame.energy if self.include_energies else None
        self.gp.update_db(frame,
                          frame.forces,
                          custom_range=train_atoms,
                          energy=energy_label)

        if train:
            self.train_gp()

    def train_gp(self, max_iter: int = None):
        """
        Train the Gaussian process and write the results to the output file.

        Nothing is done (apart from a debug message) when the GP is a mapped
        GP. Otherwise the hyperparameters are optimized, written through
        ``self.output.write_hyps`` and ``self.train_count`` is incremented.

        :param max_iter: Maximum iterations associated with this training run,
            overriding the Gaussian Process's internally set maxiter. ``0``
            skips the optimization and only calls ``gp.check_L_alpha()``;
            ``None`` keeps the GP's own maxiter.
        :type max_iter: int
        """
        logger = logging.getLogger(self.logger_name)

        if self.gp_is_mapped:
            logger.debug("Training skipped because of MGP")
            return

        logger.debug("Train GP")

        logger_train = self.output.basename + "hyps"

        # TODO: Improve flexibility in GP training to make this next step
        # unnecessary, so maxiter can be passed as an argument

        # Don't train if maxiter == 0
        if max_iter == 0:
            self.gp.check_L_alpha()
        elif max_iter is not None:
            temp_maxiter = self.gp.maxiter
            self.gp.maxiter = max_iter
            try:
                self.gp.train(logger_name=logger_train)
            finally:
                # Always undo the temporary override, even if training
                # raises, so the GP keeps its own iteration limit.
                self.gp.maxiter = temp_maxiter
        else:
            self.gp.train(logger_name=logger_train)

        hyps, labels = Parameters.get_hyps(self.gp.hyps_mask,
                                           self.gp.hyps,
                                           constraint=False,
                                           label=True)
        # Fall back to the GP's own labels when none are returned.
        if labels is None:
            labels = self.gp.hyp_labels
        self.output.write_hyps(
            labels,
            hyps,
            self.start_time,
            self.gp.likelihood,
            self.gp.likelihood_gradient,
            hyps_mask=self.gp.hyps_mask,
        )
        self.train_count += 1
示例#5
0
class TrajectoryTrainer:
    """
    Train a GP from the frames of an AIMD trajectory.

    The trainer walks through the frames, compares the GP predictions with
    the reference forces stored on each frame, and adds atoms to the GP
    training set whenever the uncertainty or force-error check reports a
    frame as out of bounds.
    """

    def __init__(self,
                 frames: List[Structure],
                 gp: Union[GaussianProcess, MappedGaussianProcess],
                 rel_std_tolerance: float = 4,
                 abs_std_tolerance: float = 1,
                 abs_force_tolerance: float = 0,
                 max_force_error: float = inf,
                 parallel: bool = False,
                 n_cpus: int = 1,
                 skip: int = 1,
                 validate_ratio: float = 0.0,
                 calculate_energy: bool = False,
                 output_name: str = 'gp_from_aimd',
                 pre_train_max_iter: int = 50,
                 max_atoms_from_frame: int = np.inf,
                 max_trains: int = np.inf,
                 min_atoms_per_train: int = 1,
                 shuffle_frames: bool = False,
                 verbose: int = 1,
                 pre_train_on_skips: int = -1,
                 pre_train_seed_frames: List[Structure] = None,
                 pre_train_seed_envs: List[Tuple[AtomicEnvironment,
                                                 'np.array']] = None,
                 pre_train_atoms_per_element: dict = None,
                 train_atoms_per_element: dict = None,
                 predict_atoms_per_element: dict = None,
                 train_checkpoint_interval: int = 1,
                 checkpoint_interval: int = 1,
                 atom_checkpoint_interval: int = 100,
                 model_format: str = 'json'):
        """
        Class which trains a GP off of an AIMD trajectory, and generates
        error statistics between the DFT and GP calls.

        There are a variety of options which can give you a finer control
        over the training process.

        :param frames: List of structures to evaluate / train GP on
        :param gp: Gaussian Process object
        :param rel_std_tolerance: Train if uncertainty is above this *
            noise variance hyperparameter
        :param abs_std_tolerance: Train if uncertainty is above this
        :param abs_force_tolerance: Add atom if force error exceeds this
        :param max_force_error: Don't add atom if force error exceeds this
        :param parallel: Use parallel functions or not (only triggers a
            deprecation warning; n_cpus is what is passed on)
        :param validate_ratio: Fraction of frames used for validation
        :param skip: Skip through frames
        :param calculate_energy: Use local energy kernel or not
        :param output_name: Write output of training to this file
        :param pre_train_max_iter: Maximum iterations used for the training
            call made at the end of the pre-run
        :param max_atoms_from_frame: Largest # of atoms added from one frame
        :param min_atoms_per_train: Only train when this many atoms have been
            added
        :param max_trains: Stop training GP after this many calls to train
        :param n_cpus: Number of CPUs to parallelize over for parallelization
                over atoms
        :param shuffle_frames: Randomize order of frames for better training
            (the passed-in list is shuffled in place)
        :param verbose: 0: Silent, NO output written or printed at all.
                        1: Minimal,
                        2: Lots of information
                        3 or more additionally prints progress messages
        :param pre_train_on_skips: Train model on every n frames before running
        :param pre_train_seed_frames: Frames to train on before running
        :param pre_train_seed_envs: Environments to train on before running
        :param pre_train_atoms_per_element: Max # of environments to add from
            each species in the seed pre-training steps
        :param train_atoms_per_element: Max # of environments to add from
            each species in the training steps
        :param predict_atoms_per_element: Choose a random subset of N random
            atoms from each specified element to predict on. For instance,
            {"H":5} will only predict the forces and uncertainties
            associated with 5 Hydrogen atoms per frame. Elements not
            specified will be predicted as normal. This is useful for
            systems where you are most interested in a subset of elements.
            This will result in a faster but less exhaustive learning process.
        :param checkpoint_interval: Will be deprecated. Same as
                            train_checkpoint_interval
        :param train_checkpoint_interval: How often to write model after
                        trainings
        :param atom_checkpoint_interval: How often to write model after atoms are
            added (since atoms may be added without training)
        :param model_format: Format to write GP model to
        """

        # Set up parameters
        self.frames = frames
        if shuffle_frames:
            # In-place shuffle: self.frames and the caller's list are the
            # same object, so both end up reordered.
            np.random.shuffle(frames)

        # GP Training and Execution parameters
        self.gp = gp
        # Check to see if GP is MGP for later flagging
        self.mgp = isinstance(gp, MappedGaussianProcess)
        self.rel_std_tolerance = rel_std_tolerance
        self.abs_std_tolerance = abs_std_tolerance
        self.abs_force_tolerance = abs_force_tolerance
        self.max_force_error = max_force_error
        self.max_trains = max_trains
        self.max_atoms_from_frame = max_atoms_from_frame
        self.min_atoms_per_train = min_atoms_per_train
        self.predict_atoms_per_element = predict_atoms_per_element
        self.calculate_energy = calculate_energy
        self.n_cpus = n_cpus

        if parallel is True:
            warnings.warn(
                "Parallel flag will be deprecated;"
                "we will instead use n_cpu alone.", DeprecationWarning)

        # Set prediction function based on if forces or energies are
        # desired, and on whether the model is a mapped GP
        if self.mgp:
            self.pred_func = predict_on_structure_mgp
        elif calculate_energy:
            self.pred_func = predict_on_structure_par_en
        else:
            self.pred_func = predict_on_structure_par

        # Parameters for negotiating with the training frames
        self.skip = skip
        assert (isinstance(skip, int) and skip >= 1), "Skip needs to be a " \
                                                      "positive integer."
        self.validate_ratio = validate_ratio
        assert (0 <= validate_ratio <= 1), \
            "validate_ratio needs to be [0,1]"

        # Set up for pretraining
        self.pre_train_max_iter = pre_train_max_iter
        self.pre_train_on_skips = pre_train_on_skips
        self.seed_envs = [] if pre_train_seed_envs is None else \
            pre_train_seed_envs
        self.seed_frames = [] if pre_train_seed_frames is None \
            else pre_train_seed_frames

        # Work on a copy: coded-species keys are added below and must not
        # leak into the dictionary owned by the caller.
        self.pre_train_env_per_species = \
            {} if pre_train_atoms_per_element is None \
            else dict(pre_train_atoms_per_element)
        self.train_env_per_species = {} if train_atoms_per_element \
            is None else train_atoms_per_element

        # Convert to Coded Species: every user-supplied key is additionally
        # registered under its element_to_Z() value.
        for key in list(self.pre_train_env_per_species.keys()):
            self.pre_train_env_per_species[element_to_Z(key)] = \
                self.pre_train_env_per_species[key]

        # Output parameters
        self.verbose = verbose
        if self.verbose:
            self.output = Output(output_name, always_flush=True)
        else:
            self.output = None
        self.train_checkpoint_interval = train_checkpoint_interval or \
            checkpoint_interval
        self.atom_checkpoint_interval = atom_checkpoint_interval

        self.model_format = model_format
        self.output_name = output_name

        # Defining variables to be used later
        self.curr_step = 0
        self.train_count = 0
        # To later be filled in using the time library
        self.start_time = None

    def pre_run(self):
        """
        Various tasks to set up the AIMD training before commencing
        the run through the AIMD trajectory.
        1. Print the output.
        2. Pre-train the GP with the seed frames and
        environments. If no seed frames or environments and the GP has no
        training set, then the first frame is used as the seed frame, and
        atoms of each species in it are added (subject to the per-species
        and per-frame limits).

        :raises NotImplementedError: if the model is a mapped GP
        """

        if self.mgp:
            raise NotImplementedError("Pre-running not"
                                      "yet configured for MGP")
        if self.verbose:
            self.output.write_header(
                self.gp.cutoffs,
                self.gp.kernel_name,
                self.gp.hyps,
                self.gp.opt_algorithm,
                dt=0,
                Nsteps=len(self.frames),
                structure=None,
                std_tolerance=(self.rel_std_tolerance, self.abs_std_tolerance),
                optional={
                    'GP Statistics':
                    json.dumps(self.gp.training_statistics),
                    'GP Name':
                    self.gp.name,
                    'GP Write Name':
                    self.output_name + "_model." + self.model_format
                })

        self.start_time = time.time()
        if self.verbose >= 3:
            print("Now beginning pre-run activity.")

        # If seed environments were passed in, add them to the GP.
        for point in self.seed_envs:
            self.gp.add_one_env(point[0], point[1], train=False)

        # Remove frames used as seed from later part of training:
        # every pre_train_on_skips-th frame becomes a seed frame.
        if self.pre_train_on_skips > 0:
            seeds = []
            remaining = []
            for index, frame in enumerate(self.frames):
                if (index % self.pre_train_on_skips) == 0:
                    seeds.append(frame)
                else:
                    remaining.append(frame)
            self.seed_frames = seeds
            self.frames = remaining

        # No training set ("blank slate" run) and no seed frames specified:
        # use the first frame as a seed frame.
        elif len(self.gp.training_data) == 0 and len(self.seed_frames) == 0:
            self.seed_frames = [self.frames[0]]
            self.frames = self.frames[1:]

        atom_count = 0
        for frame in self.seed_frames:
            train_atoms = []
            for species_i in set(frame.coded_species):
                # Get a randomized set of atoms of species i from the frame
                # So that it is not always the lowest-indexed atoms chosen
                atoms_of_specie = frame.indices_of_specie(species_i)
                np.random.shuffle(atoms_of_specie)
                n_at = len(atoms_of_specie)
                # Determine how many to add based on user defined cutoffs
                n_to_add = min(
                    n_at, self.pre_train_env_per_species.get(species_i, inf),
                    self.max_atoms_from_frame)

                for atom in atoms_of_specie[:n_to_add]:
                    train_atoms.append(atom)
                    atom_count += 1

            # uncertainties=[] silences the uncertainty log line.
            self.update_gp_and_print(frame=frame,
                                     train_atoms=train_atoms,
                                     uncertainties=[],
                                     train=False)

        if self.verbose and atom_count > 0:
            self.output.write_to_log(
                f"Added {atom_count} atoms to "
                f"pretrain.\n"
                f"Pre-run GP Statistics: "
                f"{json.dumps(self.gp.training_statistics)} \n",
                flush=True)

        if (self.seed_envs or atom_count or self.seed_frames) and \
                (self.pre_train_max_iter or self.max_trains):
            if self.verbose >= 3:
                print("Now commencing pre-run training of GP (which has "
                      "non-empty training set)")
            self.train_gp(max_iter=self.pre_train_max_iter)
        else:
            if self.verbose >= 3:
                print("Now commencing pre-run set up of GP (which has "
                      "non-empty training set)")
            self.gp.set_L_alpha()

        if self.model_format and not self.mgp:
            self.gp.write_model(f'{self.output_name}_prerun',
                                self.model_format)

    @staticmethod
    def _checkpoint_due(interval, pending) -> bool:
        """Return True once `pending` has reached a non-zero `interval`."""
        return bool(interval and pending and interval <= pending)

    def _predict_frame(self, cur_frame, predict_atoms):
        """Return (forces, stds, local_energies) predicted for one frame;
        local_energies is None unless energies are calculated."""
        if self.mgp:
            pred_forces, pred_stds = self.pred_func(
                structure=cur_frame,
                mgp=self.gp,
                write_to_structure=False,
                selective_atoms=predict_atoms,
                skipped_atom_value=np.nan)
            return pred_forces, pred_stds, None

        if self.calculate_energy:
            pred_forces, pred_stds, local_energies = self.pred_func(
                structure=cur_frame,
                gp=self.gp,
                n_cpus=self.n_cpus,
                write_to_structure=False,
                selective_atoms=predict_atoms,
                skipped_atom_value=np.nan)
            return pred_forces, pred_stds, local_energies

        pred_forces, pred_stds = self.pred_func(
            structure=cur_frame,
            gp=self.gp,
            n_cpus=self.n_cpus,
            write_to_structure=False,
            selective_atoms=predict_atoms,
            skipped_atom_value=np.nan)
        return pred_forces, pred_stds, None

    def run(self):
        """
        Loop through frames and record the error between
        the GP predictions and the ground-truth forces. Train the GP and update
        the training set upon the triggering of the uncertainty or force error
        threshold.

        :return: None
        """

        # Perform pre-run, in which seed frames are used.
        if self.verbose >= 3:
            print("Commencing run with pre-run...")
        if not self.mgp:
            self.pre_run()

        # Must be taken after pre_run(), which may remove seed frames.
        frames_to_visit = self.frames[::self.skip]

        # Past this frame, stop adding atoms to the training set
        #  (used for validation of model)
        train_frame = int(len(frames_to_visit) * (1 - self.validate_ratio))

        # Loop through trajectory.
        cur_atoms_added_train = 0  # Track atoms added for training
        cur_atoms_added_write = 0  # Track atoms added for writing
        cur_trains_done_write = 0  # Track training done for writing

        for i, cur_frame in enumerate(frames_to_visit):

            if self.verbose >= 2:
                print(f"=====NOW ON FRAME {i}=====")

            # If no predict_atoms_per_element was specified, predict_atoms
            # will be equal to every atom in the frame.
            predict_atoms = subset_of_frame_by_element(
                cur_frame, self.predict_atoms_per_element)

            # Atoms which are skipped will have NaN as their force / std values
            pred_forces, pred_stds, local_energies = self._predict_frame(
                cur_frame, predict_atoms)

            # Get Error
            dft_forces = cur_frame.forces
            error = np.abs(pred_forces - dft_forces)

            # Create dummy frame with the predicted forces written
            dummy_frame = deepcopy(cur_frame)
            dummy_frame.forces = pred_forces
            dummy_frame.stds = pred_stds

            if self.verbose:
                # NOTE(review): start_time is "now" rather than a stored
                # start timestamp -- confirm this is what the writer expects.
                self.output.write_gp_dft_comparison(
                    curr_step=i,
                    frame=dummy_frame,
                    start_time=time.time(),
                    dft_forces=dft_forces,
                    error=error,
                    local_energies=local_energies)

            # Validation frames are only predicted on, never trained on.
            if i >= train_frame:
                continue

            # Noise hyperparameter & relative std tolerance is not for mgp.
            if self.mgp:
                noise = 0
            else:
                noise = self.gp.hyps[-1]
            std_in_bound, std_train_atoms = is_std_in_bound_per_species(
                rel_std_tolerance=self.rel_std_tolerance,
                abs_std_tolerance=self.abs_std_tolerance,
                noise=noise,
                structure=dummy_frame,
                max_atoms_added=self.max_atoms_from_frame,
                max_by_species=self.train_env_per_species)

            # Get max force error atoms
            force_in_bound, force_train_atoms = \
                is_force_in_bound_per_species(
                    abs_force_tolerance=self.abs_force_tolerance,
                    predicted_forces=pred_forces,
                    label_forces=dft_forces,
                    structure=dummy_frame,
                    max_atoms_added=self.max_atoms_from_frame,
                    max_by_species=self.train_env_per_species,
                    max_force_error=self.max_force_error)

            if not std_in_bound or not force_in_bound:

                # -1 is returned from the is_in_bound methods,
                # so filter that out and the use sets to remove repeats
                train_atoms = list(
                    set(std_train_atoms).union(force_train_atoms) - {-1})

                # Add the selected atoms to the training set; the predicted
                # uncertainties of exactly those atoms are logged.
                self.update_gp_and_print(
                    cur_frame,
                    train_atoms=train_atoms,
                    uncertainties=pred_stds[train_atoms],
                    train=False)
                cur_atoms_added_train += len(train_atoms)
                cur_atoms_added_write += len(train_atoms)

                # Re-train if number of sampled atoms is high enough
                if cur_atoms_added_train >= self.min_atoms_per_train or (
                        i + 1) == train_frame:
                    if self.train_count < self.max_trains:
                        self.train_gp()
                        cur_trains_done_write += 1
                    else:
                        self.gp.update_L_alpha()
                    cur_atoms_added_train = 0
                else:
                    self.gp.update_L_alpha()

                # Decide if a model should be written this iteration: a
                # checkpoint is due once the number of trainings / added
                # atoms since the last write reaches its interval.
                will_write = False

                if self._checkpoint_due(self.train_checkpoint_interval,
                                        cur_trains_done_write):
                    will_write = True
                    cur_trains_done_write = 0

                if self._checkpoint_due(self.atom_checkpoint_interval,
                                        cur_atoms_added_write):
                    will_write = True
                    cur_atoms_added_write = 0

                if self.model_format and will_write:
                    self.gp.write_model(f'{self.output_name}_checkpt',
                                        self.model_format)

            if (i + 1) == train_frame and not self.mgp:
                self.gp.check_L_alpha()

        if self.verbose:
            self.output.conclude_run()

        if self.model_format and not self.mgp:
            self.gp.write_model(f'{self.output_name}_model', self.model_format)

    def update_gp_and_print(self,
                            frame: Structure,
                            train_atoms: List[int],
                            uncertainties: List[int] = None,
                            train: bool = True):
        """
        Update the internal GP model training set with a list of training
        atoms indexing atoms within the frame. If train is True, re-train
        the GP by optimizing hyperparameters.

        Nothing is done when train_atoms is empty.

        :param frame: Structure to train on
        :param train_atoms: Index atoms to train on
        :param uncertainties: Uncertainties to print, pass in [] to silence.
            None falls back to ``frame.stds[train_atoms]``
        :param train: Train or not
        :return: None
        :raises NotImplementedError: if the model is a mapped GP
        """

        # Nothing selected: leave the model and the log untouched.
        if len(train_atoms) == 0:
            return

        # Group added atoms by species for easier output
        added_species = [
            Z_to_element(frame.coded_species[at]) for at in train_atoms
        ]
        added_atoms = {spec: [] for spec in set(added_species)}

        for atom, spec in zip(train_atoms, added_species):
            added_atoms[spec].append(atom)

        if self.verbose:
            self.output.write_to_log(
                '\nAdding atom(s) '
                f'{json.dumps(added_atoms,cls=NumpyEncoder)}'
                ' to the training set.\n')

        # Only fall back to the frame's stored stds when the caller supplied
        # nothing; values that were passed in are reported as given.
        if uncertainties is None:
            uncertainties = frame.stds[train_atoms]

        if self.verbose and len(uncertainties) != 0:
            self.output.write_to_log(f'Uncertainties: '
                                     f'{uncertainties}.\n',
                                     flush=True)

        # update gp model; handling differently if it's an MGP
        if not self.mgp:
            self.gp.update_db(frame, frame.forces, custom_range=train_atoms)

            if train:
                self.train_gp()

        else:
            raise NotImplementedError

    def train_gp(self, max_iter: int = None):
        """
        Train the Gaussian process and write the results to the output file.

        :param max_iter: Maximum iterations associated with this training run,
            overriding the Gaussian Process's internally set maxiter. ``0``
            skips the optimization and only calls ``gp.check_L_alpha()``;
            ``None`` keeps the GP's own maxiter.
        :type max_iter: int
        """

        if self.verbose >= 1:
            self.output.write_to_log('Train GP\n')

        # The optimizer only gets the output object at high verbosity.
        train_output = self.output if self.verbose >= 2 else None

        # TODO: Improve flexibility in GP training to make this next step
        # unnecessary, so maxiter can be passed as an argument

        # Don't train if maxiter == 0
        if max_iter == 0:
            self.gp.check_L_alpha()
        elif max_iter is not None:
            temp_maxiter = self.gp.maxiter
            self.gp.maxiter = max_iter
            try:
                self.gp.train(output=train_output)
            finally:
                # Always undo the temporary override, even if training
                # raises, so the GP keeps its own iteration limit.
                self.gp.maxiter = temp_maxiter
        else:
            self.gp.train(output=train_output)

        if self.verbose:
            self.output.write_hyps(self.gp.hyp_labels,
                                   self.gp.hyps,
                                   self.start_time,
                                   self.gp.likelihood,
                                   self.gp.likelihood_gradient,
                                   hyps_mask=self.gp.hyps_mask)
        self.train_count += 1
示例#6
0
class OTF:
    """Trains a Gaussian process force field on the fly during
        molecular dynamics.

    Args:
        dt (float): MD timestep.
        number_of_steps (int): Number of timesteps in the training
            simulation.
        prev_pos_init ([type], optional): Previous positions. Defaults
            to None.
        rescale_steps (List[int], optional): List of frames for which the
            velocities of the atoms are rescaled. Defaults to [].
        rescale_temps (List[int], optional): List of rescaled temperatures.
            Defaults to [].

        gp (gp.GaussianProcess): Initial GP model.
        calculate_energy (bool, optional): If True, the energy of each
            frame is calculated with the GP. Defaults to False.
        calculate_efs (bool, optional): If True, the energy and stress of each
            frame is calculated with the GP. Defaults to False.
        write_model (int, optional): If 0, write never. If 1, write at
            end of run. If 2, write after each training and end of run.
            If 3, write after each time atoms are added and end of run.
        force_only (bool, optional): If True, only use forces for training.
            Default to False, use forces, energy and stress for training.

        std_tolerance_factor (float, optional): Threshold that determines
            when DFT is called. Specifies a multiple of the current noise
            hyperparameter. If the epistemic uncertainty on a force
            component exceeds this value, DFT is called. Defaults to 1.
        skip (int, optional): Number of frames that are skipped when
            dumping to the output file. Defaults to 0.
        init_atoms (List[int], optional): List of atoms from the input
            structure whose local environments and force components are
            used to train the initial GP model. If None is specified, all
            atoms are used to train the initial GP. Defaults to None.
        output_name (str, optional): Name of the output file. Defaults to
            'otf_run'.
        max_atoms_added (int, optional): Number of atoms added each time
            DFT is called. Defaults to 1.
        freeze_hyps (int, optional): Specifies the number of times the
            hyperparameters of the GP are optimized. After this many
            updates to the GP, the hyperparameters are frozen.
            Defaults to 10.
        min_steps_with_model (int, optional): Minimum number of steps the
            model takes in between calls to DFT. Defaults to 0.
        force_source (Union[str, object], optional): DFT code used to calculate
            ab initio forces during training. A custom module can be used here
            in place of the DFT modules available in the FLARE package. The
            module must contain two functions: parse_dft_input, which takes a
            file name (in string format) as input and returns the positions,
            species, cell, and masses of a structure of atoms; and run_dft_par,
            which takes a number of DFT related inputs and returns the forces
            on all atoms.  Defaults to "qe".
        npool (int, optional): Number of k-point pools for DFT
            calculations. Defaults to None.
        mpi (str, optional): Determines how mpi is called. Defaults to
            "srun".
        dft_loc (str): Location of DFT executable.
        dft_input (str): Input file.
        dft_output (str): Output file.
        dft_kwargs ([type], optional): Additional arguments which are
            passed when DFT is called; keyword arguments vary based on the
            program (e.g. ESPRESSO vs. VASP). Defaults to None.
        store_dft_output (Tuple[Union[str,List[str]],str], optional):
            After DFT calculations are called, copy the file or files
            specified in the first element of the tuple to a directory
            specified as the second element of the tuple.
            Useful when DFT calculations are expensive and want to be kept
            for later use. The first element of the tuple can either be a
            single file name, or a list of several. Copied files will be
            prepended with the date and time with the format
            'Year.Month.Day:Hour:Minute:Second:'.

        n_cpus (int, optional): Number of cpus used during training.
            Defaults to 1.
    """
    def __init__(
        self,
        # md args
        dt: float,
        number_of_steps: int,
        prev_pos_init: "ndarray" = None,
        rescale_steps: List[int] = [],
        rescale_temps: List[int] = [],
        # flare args
        gp: gp.GaussianProcess = None,
        calculate_energy: bool = False,
        calculate_efs: bool = False,
        write_model: int = 0,
        force_only: bool = False,
        # otf args
        std_tolerance_factor: float = 1,
        skip: int = 0,
        init_atoms: List[int] = None,
        output_name: str = "otf_run",
        max_atoms_added: int = 1,
        freeze_hyps: int = 10,
        min_steps_with_model: int = 0,
        update_style: str = "add_n",
        update_threshold: float = None,
        # dft args
        force_source: str = "qe",
        npool: int = None,
        mpi: str = "srun",
        dft_loc: str = None,
        dft_input: str = None,
        dft_output="dft.out",
        dft_kwargs=None,
        store_dft_output: Tuple[Union[str, List[str]], str] = None,
        # other args
        n_cpus: int = 1,
        **kwargs,
    ):
        """Store the run settings, parse the initial structure from the DFT
        input file, pick the GP prediction function and open the output.

        Extra keyword arguments are accepted and ignored. Attributes are
        assigned in a fixed order because ``as_dict`` serialises
        ``vars(self)``.
        """

        # --- DFT settings -------------------------------------------------
        self.dft_loc = dft_loc
        self.dft_input = dft_input
        self.dft_output = dft_output
        self.dft_step = True
        self.dft_count = 0
        # A string selects one of the registered interfaces; anything else is
        # used directly as the force-providing module.
        self.dft_module = (dft_software[force_source]
                           if isinstance(force_source, str) else force_source)

        # --- MD settings --------------------------------------------------
        self.dt = dt
        self.number_of_steps = number_of_steps
        # Needs self.dft_module and self.dft_input, both set above.
        self.get_structure_from_input(prev_pos_init)
        self.noa = self.structure.positions.shape[0]
        self.rescale_steps = rescale_steps
        self.rescale_temps = rescale_temps

        # --- GP settings --------------------------------------------------
        self.gp = gp
        # Local energies are only allocated when energy prediction is on.
        self.local_energies = np.zeros(self.noa) if calculate_energy else None
        self.force_only = force_only

        # --- OTF settings -------------------------------------------------
        self.std_tolerance = std_tolerance_factor
        self.skip = skip
        self.max_atoms_added = max_atoms_added
        self.freeze_hyps = freeze_hyps
        # Without an explicit selection every atom seeds the first model.
        self.init_atoms = (list(range(self.noa))
                           if init_atoms is None else init_atoms)
        self.update_style = update_style
        self.update_threshold = update_threshold

        # CPU / pool layout used for the DFT runs.
        self.n_cpus = n_cpus
        self.npool = npool
        self.mpi = mpi
        self.min_steps_with_model = min_steps_with_model

        self.dft_kwargs = dft_kwargs
        self.store_dft_output = store_dft_output

        # --- bookkeeping --------------------------------------------------
        self.atom_list = list(range(self.noa))
        self.curr_step = 0
        self.steps_since_dft = 0

        # --- prediction function ------------------------------------------
        # The per-atom parallel predictors are used only when more than one
        # CPU is requested and the GP itself is flagged for it.
        run_parallel = n_cpus > 1 and gp.per_atom_par and gp.parallel
        if calculate_efs:
            # Energy, force, and stress prediction.
            self.pred_func = (predict.predict_on_structure_efs_par
                              if run_parallel
                              else predict.predict_on_structure_efs)
        elif calculate_energy:
            # Energy and force prediction.
            self.pred_func = (predict.predict_on_structure_par_en
                              if run_parallel
                              else predict.predict_on_structure_en)
        else:
            # Force only prediction.
            self.pred_func = (predict.predict_on_structure_par
                              if run_parallel
                              else predict.predict_on_structure)

        # --- output -------------------------------------------------------
        self.output = Output(output_name, always_flush=True)
        self.output_name = output_name
        self.gp_name = self.output_name + "_gp.json"
        self.checkpt_name = self.output_name + "_checkpt.json"

        self.write_model = write_model

    def run(self):
        """
        Performs an on-the-fly training run.

        Each loop iteration either seeds the model with an initial DFT call
        (first step, DFT enabled, empty training set) or predicts with the
        GP and falls back to DFT when the uncertainty check fails and more
        than ``min_steps_with_model`` steps have passed since the last DFT
        call. One MD step is then taken and ``curr_step`` is advanced.

        If OTF has store_dft_output set, then the specified DFT files will
        be copied with the current date and time prepended in the format
        'Year.Month.Day:Hour:Minute:Second:'.
        """

        # "Restart" records the step this run starts from.
        optional_dict = {"Restart": self.curr_step}
        self.output.write_header(
            str(self.gp),
            self.dt,
            self.number_of_steps,
            self.structure,
            self.std_tolerance,
            optional_dict,
        )

        # Steps since the last GP frame was written; compared with self.skip.
        counter = 0
        self.start_time = time.time()

        while self.curr_step < self.number_of_steps:
            # run DFT and train initial model if first step and DFT is on
            if ((self.curr_step == 0) and (self.std_tolerance != 0)
                    and (len(self.gp.training_data) == 0)):

                # Are the recorded forces from the GP or DFT in ASE OTF?
                # When DFT is called, ASE energy, forces, and stresses should
                # get updated.
                self.initialize_train()

            # after step 1, try predicting with GP model
            else:
                # compute forces and stds with GP
                self.dft_step = False
                self.compute_properties()

                # get max uncertainty atoms
                std_in_bound, target_atoms = is_std_in_bound(
                    self.std_tolerance,
                    self.gp.force_noise,
                    self.structure,
                    max_atoms_added=self.max_atoms_added,
                    update_style=self.update_style,
                    update_threshold=self.update_threshold,
                )

                # DFT is only called when the uncertainty is out of bound AND
                # strictly more than min_steps_with_model steps have elapsed.
                if (not std_in_bound) and (self.steps_since_dft >
                                           self.min_steps_with_model):
                    # record GP forces
                    self.update_temperature()
                    self.record_state()
                    gp_frcs = deepcopy(self.structure.forces)

                    # run DFT and record forces
                    self.dft_step = True
                    self.steps_since_dft = 0
                    self.run_dft()
                    dft_frcs = deepcopy(self.structure.forces)
                    # NOTE(review): run_dft in this class only assigns
                    # structure.forces; stress and potential_energy are read
                    # here as whatever the structure currently holds.
                    dft_stress = deepcopy(self.structure.stress)
                    dft_energy = self.structure.potential_energy

                    # record the state again, now with the DFT forces (the MD
                    # step itself is taken at the bottom of the loop)
                    self.record_state()

                    # compute mae and write to output
                    self.compute_mae(gp_frcs, dft_frcs)

                    # add max uncertainty atoms to training set
                    self.update_gp(
                        target_atoms,
                        dft_frcs,
                        dft_stress=dft_stress,
                        dft_energy=dft_energy,
                    )

            # write gp forces
            # (skipped on DFT steps, whose state was already recorded above)
            if counter >= self.skip and not self.dft_step:
                self.update_temperature()
                self.record_state()
                counter = 0

            counter += 1
            # TODO: Reinstate velocity rescaling.
            self.md_step()  # update positions by Verlet
            self.steps_since_dft += 1
            self.rescale_temperature(self.structure.positions)

            self.curr_step += 1

            # write_model == 3 also checkpoints the OTF state after each step
            if self.write_model == 3:
                self.checkpoint()

        self.output.conclude_run()

        # any non-zero write_model writes the final GP and a checkpoint
        if self.write_model >= 1:
            self.write_gp()
            self.checkpoint()

    def get_structure_from_input(self, prev_pos_init):
        """Parse ``self.dft_input`` with the DFT module and store the result
        as ``self.structure``.

        Args:
            prev_pos_init: Passed through as the structure's previous
                positions.
        """
        parsed = self.dft_module.parse_dft_input(self.dft_input)
        positions, species, cell, masses = parsed

        # The parsed species serve both as species and as species labels.
        structure_kwargs = dict(
            cell=cell,
            species=species,
            positions=positions,
            mass_dict=masses,
            prev_positions=prev_pos_init,
            species_labels=species,
        )
        self.structure = struc.Structure(**structure_kwargs)

    def initialize_train(self):
        # call dft and update positions
        self.run_dft()
        dft_frcs = deepcopy(self.structure.forces)
        dft_stress = deepcopy(self.structure.stress)
        dft_energy = self.structure.potential_energy

        self.update_temperature()
        self.record_state()

        # make initial gp model and predict forces
        self.update_gp(self.init_atoms,
                       dft_frcs,
                       dft_stress=dft_stress,
                       dft_energy=dft_energy)

    def compute_properties(self):
        """
        In ASE-OTF, it will be replaced by subclass method
        """
        self.gp.check_L_alpha()
        self.pred_func(self.structure, self.gp, self.n_cpus)

    def md_step(self):
        """
        Advance the structure by one MD step via ``md.update_positions``,
        which updates the positions of the structure.
        """
        step_args = (self.dt, self.noa, self.structure)
        md.update_positions(*step_args)

    def write_gp(self):
        self.gp.write_model(self.gp_name)

    def run_dft(self):
        """Calculates DFT forces on atoms in the current structure.

        If OTF has store_dft_output set, then the specified DFT files will
        be copied with the current date and time prepended in the format
        'Year.Month.Day:Hour:Minute:Second:'.

        Calculates DFT forces on atoms in the current structure."""

        f = logging.getLogger(self.output.basename + "log")
        f.info("\nCalling DFT...\n")

        # calculate DFT forces
        # TODO: Return stress and energy
        forces = self.dft_module.run_dft_par(
            self.dft_input,
            self.structure,
            self.dft_loc,
            n_cpus=self.n_cpus,
            dft_out=self.dft_output,
            npool=self.npool,
            mpi=self.mpi,
            dft_kwargs=self.dft_kwargs,
        )

        self.structure.forces = forces

        # write wall time of DFT calculation
        self.dft_count += 1
        self.output.conclude_dft(self.dft_count, self.start_time)

        # Store DFT outputs in another folder if desired
        # specified in self.store_dft_output
        if self.store_dft_output is not None:
            dest = self.store_dft_output[1]
            target_files = self.store_dft_output[0]
            now = datetime.now()
            dt_string = now.strftime("%Y.%m.%d:%H:%M:%S:")
            if isinstance(target_files, str):
                to_copy = [target_files]
            else:
                to_copy = target_files
            for ofile in to_copy:
                copyfile(ofile, dest + "/" + dt_string + ofile)

    def update_gp(
        self,
        train_atoms: List[int],
        dft_frcs: "ndarray",
        dft_energy: float = None,
        dft_stress: "ndarray" = None,
    ):
        """
        Updates the current GP model.


        Args:
            train_atoms (List[int]): List of atoms whose local environments
                will be added to the training set.
            dft_frcs (np.ndarray): DFT forces on all atoms in the structure.
        """
        self.output.add_atom_info(train_atoms, self.structure.stds)

        if self.force_only:
            dft_energy = None
            dft_stress = None

        # update gp model
        self.gp.update_db(
            self.structure,
            dft_frcs,
            custom_range=train_atoms,
            energy=dft_energy,
            stress=dft_stress,
        )

        self.gp.set_L_alpha()

        # write model
        if (self.dft_count - 1) < self.freeze_hyps:
            self.train_gp()
            if self.write_model == 2:
                self.write_gp()
        if self.write_model == 3:
            self.write_gp()

    def train_gp(self):
        """Optimizes the hyperparameters of the current GP model."""

        self.gp.train(logger_name=self.output.basename + "hyps")

        hyps, labels = self.gp.hyps_and_labels
        if labels is None:
            labels = self.gp.hyp_labels

        self.output.write_hyps(
            labels,
            hyps,
            self.start_time,
            self.gp.likelihood,
            self.gp.likelihood_gradient,
            hyps_mask=self.gp.hyps_mask,
        )

    def compute_mae(self, gp_frcs, dft_frcs):
        mae = np.mean(np.abs(gp_frcs - dft_frcs))
        mac = np.mean(np.abs(dft_frcs))

        f = logging.getLogger(self.output.basename + "log")
        f.info(f"mean absolute error: {mae:.4f} eV/A")
        f.info(f"mean absolute dft component: {mac:.4f} eV/A")

    def rescale_temperature(self, new_pos: "ndarray"):
        """Change the previous positions to update the temperature

        Args:
            new_pos (np.ndarray): Positions of atoms in the next MD frame.
        """
        if self.curr_step in self.rescale_steps:
            rescale_ind = self.rescale_steps.index(self.curr_step)
            temp_fac = self.rescale_temps[rescale_ind] / self.temperature
            vel_fac = np.sqrt(temp_fac)
            self.structure.prev_positions = (
                new_pos - self.velocities * self.dt * vel_fac)

    def update_temperature(self):
        """Refresh ``self.KE``, ``self.temperature`` and ``self.velocities``
        from ``md.calculate_temperature`` for the current structure."""
        self.KE, self.temperature, self.velocities = \
            md.calculate_temperature(self.structure, self.dt, self.noa)

    def record_state(self):
        self.output.write_md_config(
            self.dt,
            self.curr_step,
            self.structure,
            self.temperature,
            self.KE,
            self.start_time,
            self.dft_step,
            self.velocities,
        )

    def as_dict(self):
        self.dft_module = self.dft_module.__name__
        out_dict = deepcopy(dict(vars(self)))
        self.dft_module = eval(self.dft_module)

        out_dict["gp"] = self.gp_name
        out_dict["structure"] = self.structure.as_dict()

        for key in ["output", "pred_func"]:
            out_dict.pop(key)

        return out_dict

    @staticmethod
    def from_dict(in_dict):
        """Rebuild an OTF object from a dictionary produced by ``as_dict``.

        The GP is loaded from the file named in ``in_dict["gp"]`` and the
        structure is rebuilt from ``in_dict["structure"]``. ``in_dict`` is
        modified in place: its ``gp``, ``structure`` and ``force_source``
        entries are replaced or added.

        Args:
            in_dict (dict): Dictionary of the saved OTF state.

        Returns:
            OTF: New instance with the saved structure, DFT count, current
            step and uncertainty tolerance restored.
        """
        if in_dict["write_model"] <= 1:  # TODO: detect GP version
            warnings.warn("The GP model might not be the latest")

        gp_model = gp.GaussianProcess.from_file(in_dict["gp"])
        in_dict["gp"] = gp_model
        in_dict["structure"] = struc.Structure.from_dict(in_dict["structure"])

        if "flare.dft_interface" in in_dict["dft_module"]:
            for dft_name in ["qe", "cp2k", "vasp"]:
                if dft_name in in_dict["dft_module"]:
                    in_dict["force_source"] = dft_name
                    break
        else:  # if force source is a module
            # NOTE(security): eval() executes whatever string the checkpoint
            # contains; only load checkpoints from trusted sources.
            in_dict["force_source"] = eval(in_dict["dft_module"])

        new_otf = OTF(**in_dict)
        new_otf.structure = in_dict["structure"]
        new_otf.dft_count = in_dict["dft_count"]
        new_otf.curr_step = in_dict["curr_step"]
        # The constructor argument is named std_tolerance_factor, so the
        # saved "std_tolerance" entry is swallowed by **kwargs and the new
        # object falls back to the default; restore it explicitly.
        if "std_tolerance" in in_dict:
            new_otf.std_tolerance = in_dict["std_tolerance"]
        return new_otf

    def checkpoint(self):
        """Dump ``self.as_dict()`` as JSON to the checkpoint file.

        ``.json`` is appended to ``self.checkpt_name`` when it does not
        already end with it.
        """
        target = self.checkpt_name
        if not target.endswith(".json"):
            target = target + ".json"
        with open(target, "w") as handle:
            json.dump(self.as_dict(), handle, cls=NumpyEncoder)

    @classmethod
    def from_checkpoint(cls, filename):
        with open(filename, "r") as f:
            otf_model = cls.from_dict(json.loads(f.readline()))

        return otf_model
示例#7
0
class TrajectoryTrainer:

    def __init__(self, frames: List[Structure],
                 gp: GaussianProcess,
                 rel_std_tolerance: float = 4,
                 abs_std_tolerance: float = 1,
                 abs_force_tolerance: float = 0,
                 max_force_error: float = inf,
                 parallel: bool = False,
                 n_cpus: int = None,
                 skip: int = 1,
                 validate_ratio: float = 0.1,
                 calculate_energy: bool = False,
                 output_name: str = 'gp_from_aimd',
                 pre_train_max_iter: int = 50,
                 max_atoms_from_frame: int = np.inf,
                 max_trains: int = np.inf,
                 min_atoms_per_train: int = 1,
                 shuffle_frames: bool = False,
                 verbose: int = 0,
                 pre_train_on_skips: int = -1,
                 pre_train_seed_frames: List[Structure] = None,
                 pre_train_seed_envs: List[Tuple[AtomicEnvironment,
                                                 'np.array']] = None,
                 pre_train_atoms_per_element: dict = None,
                 train_atoms_per_element: dict = None,
                 checkpoint_interval: int = None,
                 model_format: str = 'json'):
        """
        Class which trains a GP off of an AIMD trajectory, and generates
        error statistics between the DFT and GP calls.

        There are a variety of options which can give you a finer control
        over the training process.

        :param frames: List of structures to evaluate / train GP on. When
            shuffle_frames is set, this list is shuffled in place.
        :param gp: Gaussian Process object
        :param rel_std_tolerance: Train if uncertainty is above this *
            noise variance hyperparameter
        :param abs_std_tolerance: Train if uncertainty is above this
        :param abs_force_tolerance: Add atom force error exceeds this
        :param max_force_error: Don't add atom if force error exceeds this
        :param parallel: Use parallel functions or not
        :param n_cpus: Number of CPUs to parallelize over
        :param skip: Skip through frames; must be a positive integer
        :param validate_ratio: Fraction of frames used for validation; must
            lie in [0, 1]
        :param calculate_energy: Use local energy kernel or not
        :param output_name: Write output of training to this file
        :param pre_train_max_iter: Maximum iterations for the training done
            in the pre-run step
        :param max_atoms_from_frame: Largest # of atoms added from one frame
        :param max_trains: Stop training GP after this many calls to train
        :param min_atoms_per_train: Only train when this many atoms have been
            added
        :param shuffle_frames: Randomize order of frames for better training
        :param verbose: 0: Silent, 1: Minimal, 2: Lots of information
        :param pre_train_on_skips: Train model on every n frames before running
        :param pre_train_seed_frames: Frames to train on before running
        :param pre_train_seed_envs: Environments to train on before running
        :param pre_train_atoms_per_element: Max # of environments to add from
            each species in the seed pre-training steps
        :param train_atoms_per_element: Max # of environments to add from
            each species in the training steps
        :param checkpoint_interval: How often to write model after trainings
        :param model_format: Format to write GP model to
        """

        # Set up parameters
        self.frames = frames
        if shuffle_frames:
            # In place: self.frames and the caller's list are the same object.
            np.random.shuffle(frames)

        # GP Training and Execution parameters
        self.gp = gp
        self.rel_std_tolerance = rel_std_tolerance
        self.abs_std_tolerance = abs_std_tolerance
        self.abs_force_tolerance = abs_force_tolerance
        self.max_force_error = max_force_error
        self.max_trains = max_trains
        self.max_atoms_from_frame = max_atoms_from_frame
        self.min_atoms_per_train = min_atoms_per_train

        self.parallel = parallel
        self.n_cpus = n_cpus
        # Set prediction function based on if forces or energies are
        # desired, and parallelization accordingly
        if (parallel and gp.par and gp.per_atom_par):
            if calculate_energy:
                self.pred_func = predict_on_structure_par_en
            else:
                self.pred_func = predict_on_structure_par
        else:
            if calculate_energy:
                self.pred_func = predict_on_structure_en
            else:
                self.pred_func = predict_on_structure

        # Output parameters. The Output object is created exactly once; it
        # used to be constructed a second time further down with the same
        # arguments.
        self.output = Output(output_name, always_flush=True)
        self.verbose = verbose
        self.checkpoint_interval = checkpoint_interval
        self.model_format = model_format
        self.output_name = output_name

        # Parameters for negotiating with the training frames
        self.skip = skip
        assert (isinstance(skip, int) and skip >= 1), "Skip needs to be a " \
                                                     "positive integer."
        self.validate_ratio = validate_ratio
        assert (validate_ratio>=0 and validate_ratio<=1), \
                "validate_ratio needs to be [0,1]"

        # Set up for pretraining
        self.pre_train_max_iter = pre_train_max_iter
        self.pre_train_on_skips = pre_train_on_skips
        self.seed_envs = [] if pre_train_seed_envs is None else \
            pre_train_seed_envs
        self.seed_frames = [] if pre_train_seed_frames is None \
            else pre_train_seed_frames

        self.pre_train_env_per_species = {} if pre_train_atoms_per_element \
                                    is None else pre_train_atoms_per_element
        self.train_env_per_species = {} if train_atoms_per_element \
                                        is None else train_atoms_per_element

        # Convert to Coded Species: each limit is additionally stored under
        # element_to_Z(key); the original keys are kept. Iterating over a
        # copy of the keys keeps the loop safe while entries are added.
        if self.pre_train_env_per_species:
            pre_train_species = list(self.pre_train_env_per_species.keys())
            for key in pre_train_species:
                self.pre_train_env_per_species[element_to_Z(key)] = \
                    self.pre_train_env_per_species[key]

        # Defining variables to be used later
        self.curr_step = 0
        self.train_count = 0
        # To later be filled in using the time library
        self.start_time = None

    def pre_run(self):
        """
        Various tasks to set up the AIMD training before commencing
        the run through the AIMD trajectory.
        1. Print the output.
        2. Pre-train the GP with the seed frames and
        environments. If no seed frames or environments and the GP has no
        training set, then seed with at least one atom from each
        """

        self.output.write_header(self.gp.cutoffs,
                                 self.gp.kernel_name,
                                 self.gp.hyps,
                                 self.gp.algo,
                                 dt=0,
                                 Nsteps=len(self.frames),
                                 structure=self.frames[0],
                                 std_tolerance=(self.rel_std_tolerance,
                                                self.abs_std_tolerance))

        self.start_time = time.time()
        if self.verbose >= 3:
            print("Now beginning pre-run activity.")
        # If seed environments were passed in, add them to the GP.

        for point in self.seed_envs:
            self.gp.add_one_env(point[0], point[1], train=False)

        # No training set ("blank slate" run) and no seeds specified:
        # Take one of each atom species in the first frame
        # so all atomic species are represented in the first step.
        # Otherwise use the seed frames passed in by user.

        # Remove frames used as seed from later part of training
        if self.pre_train_on_skips > 0:
            self.seed_frames = []
            newframes = []
            for i in range(len(self.frames)):
                if (i % self.pre_train_on_skips) == 0:
                    self.seed_frames += [self.frames[i]]
                else:
                    newframes += [self.frames[i]]
            self.frames = newframes

        elif len(self.gp.training_data) == 0 and len(self.seed_frames) == 0:
            self.seed_frames = [self.frames[0]]
            self.frames = self.frames[1:]

        atom_count = 0
        for frame in self.seed_frames:
            train_atoms = []
            for species_i in set(frame.coded_species):
                # Get a randomized set of atoms of species i from the frame
                # So that it is not always the lowest-indexed atoms chosen
                atoms_of_specie = frame.indices_of_specie(species_i)
                np.random.shuffle(atoms_of_specie)
                n_at = len(atoms_of_specie)
                # Determine how many to add based on user defined cutoffs
                n_to_add = min(n_at, self.pre_train_env_per_species.get(
                    species_i, inf), self.max_atoms_from_frame)

                for atom in atoms_of_specie[:n_to_add]:
                    train_atoms.append(atom)
                    atom_count += 1

            self.update_gp_and_print(frame, train_atoms, train=False)

        if self.verbose >= 3 and atom_count > 0:
            print(f"Added {atom_count} atoms to pretrain")

        if (self.seed_envs or atom_count or self.seed_frames) and self.max_trains>0:
            if self.verbose >= 3:
                print("Now commencing pre-run training of GP (which has "
                      "non-empty training set)")
            self.train_gp(max_iter=self.pre_train_max_iter)
        else:
            if self.verbose >= 3:
                print("Now commencing pre-run set up of GP (which has "
                      "non-empty training set)")
            self.gp.set_L_alpha()

        if self.model_format:
            self.gp.write_model(f'{self.output_name}_prerun',
                    self.model_format)

    def run(self):
        """
        Loop through frames and record the error between
        the GP predictions and the ground-truth forces. Train the GP and update
        the training set upon the triggering of the uncertainty or force error
        threshold.

        :return: None
        """
        if self.verbose >= 3:
            print("Commencing run with pre-run...")
        self.pre_run()

        train_frame = int(len(self.frames) * (1 - self.validate_ratio))

        # Loop through trajectory
        nsample = 0
        for i, cur_frame in enumerate(self.frames[::self.skip]):

            if self.verbose >= 2:
                print("=====NOW ON FRAME {}=====".format(i))
            dft_forces = deepcopy(cur_frame.forces)

            self.pred_func(cur_frame, self.gp, self.n_cpus)

            # Convert to meV/A
            error = np.abs(cur_frame.forces - dft_forces)

            self.output.write_gp_dft_comparison(
                curr_step=i, frame=cur_frame,
                start_time=time.time(),
                dft_forces=dft_forces,
                error=error,
                local_energies=None)

            if i < train_frame:
                # Get max uncertainty atoms
                std_in_bound, std_train_atoms = is_std_in_bound_per_species(
                    rel_std_tolerance=self.rel_std_tolerance,
                    abs_std_tolerance=self.abs_std_tolerance,
                    noise=self.gp.hyps[-1], structure=cur_frame,
                    max_atoms_added=self.max_atoms_from_frame,
                    max_by_species=self.train_env_per_species)

                # Get max force error atoms
                force_in_bound, force_train_atoms = \
                    is_force_in_bound_per_species(
                        abs_force_tolerance=self.abs_force_tolerance,
                        predicted_forces=cur_frame.forces,
                        label_forces=dft_forces,
                        structure=cur_frame,
                        max_atoms_added=self.max_atoms_from_frame,
                        max_by_species=self.train_env_per_species,
                        max_force_error=self.max_force_error)

                if (not std_in_bound) or (not force_in_bound):

                    train_atoms = list(set(std_train_atoms).union(
                        force_train_atoms) - {-1})

                    # Compute mae and write to output;
                    # Add max uncertainty atoms to training set
                    self.update_gp_and_print(
                        cur_frame, train_atoms, train=False)
                    nsample += len(train_atoms)
                    # Re-train if number of sampled atoms is high enough
                    if nsample >= self.min_atoms_per_train or (
                            i + 1) == train_frame:
                        if self.train_count < self.max_trains:
                            self.train_gp()
                        else:
                            self.gp.update_L_alpha()
                        nsample = 0
                    else:
                        self.gp.update_L_alpha()

                    if self.checkpoint_interval \
                            and self.train_count % self.checkpoint_interval == 0 \
                            and self.model_format:
                        self.gp.write_model(f'{self.output_name}_ckpt',
                                self.model_format)

                if (i + 1) == train_frame:
                    self.gp.check_L_alpha()

        self.output.conclude_run()

        if self.model_format:
            self.gp.write_model(f'{self.output_name}_model',
                    self.model_format)

    def update_gp_and_print(self, frame: Structure, train_atoms: List[int],
                            train: bool = True):
        """
        Update the internal GP model training set with a list of training
        atoms indexing atoms within the frame. If train is True, re-train
        the GP by optimizing hyperparameters.
        :param frame: Structure to train on
        :param train_atoms: Index atoms to train on
        :param train: Train or not
        :return: None
        """

        self.output.write_to_log('\nAdding atom(s) {} to the '
                                 'training set.\n'
                                 .format(train_atoms, ))
        self.output.write_to_log('Uncertainties: {}.\n'
                                 .format(frame.stds[train_atoms]),
                                 flush=True)

        # update gp model
        self.gp.update_db(frame, frame.forces, custom_range=train_atoms)

        if train:
            self.train_gp()

    def train_gp(self, max_iter: int = None):
        """
        Train the Gaussian process and write the results to the output file.

        :param max_iter: Maximum iterations associated with this training run,
            overriding the Gaussian Process's internally set maxiter.
        :type max_iter: int
        """

        if self.verbose >= 1:
            self.output.write_to_log('Train GP\n')

        # TODO: Improve flexibility in GP training to make this next step
        # unnecessary, so maxiter can be passed as an argument

        # Don't train if maxiter == 0
        if max_iter == 0:
            self.gp.check_L_alpha()
        elif max_iter is not None:
            temp_maxiter = self.gp.maxiter
            self.gp.maxiter = max_iter
            self.gp.train(output=self.output if self.verbose >= 2 else None)
            self.gp.maxiter = temp_maxiter
        else:
            self.gp.train(output=self.output if self.verbose >= 2 else None)

        self.output.write_hyps(self.gp.hyp_labels, self.gp.hyps,
                               self.start_time,
                               self.gp.likelihood, self.gp.likelihood_gradient)
        self.train_count += 1
示例#8
0
文件: otf.py 项目: mfhossam1992/flare
class OTF:
    """Trains a Gaussian process force field on the fly during
        molecular dynamics.

    Args:
        dft_input (str): Input file.
        dt (float): MD timestep.
        number_of_steps (int): Number of timesteps in the training
            simulation.
        gp (gp.GaussianProcess): Initial GP model.
        dft_loc (str): Location of DFT executable.
        std_tolerance_factor (float, optional): Threshold that determines
            when DFT is called. Specifies a multiple of the current noise
            hyperparameter. If the epistemic uncertainty on a force
            component exceeds this value, DFT is called. Defaults to 1.
        prev_pos_init ([type], optional): Previous positions. Defaults
            to None.
        par (bool, optional): If True, force predictions are made in
            parallel. Defaults to False.
        skip (int, optional): Number of frames that are skipped when
            dumping to the output file. Defaults to 0.
        init_atoms (List[int], optional): List of atoms from the input
            structure whose local environments and force components are
            used to train the initial GP model. If None is specified, all
            atoms are used to train the initial GP. Defaults to None.
        calculate_energy (bool, optional): If True, the energy of each
            frame is calculated with the GP. Defaults to False.
        output_name (str, optional): Name of the output file. Defaults to
            'otf_run'.
        max_atoms_added (int, optional): Number of atoms added each time
            DFT is called. Defaults to 1.
        freeze_hyps (int, optional): Specifies the number of times the
            hyperparameters of the GP are optimized. After this many
            updates to the GP, the hyperparameters are frozen.
            Defaults to 10.
        rescale_steps (List[int], optional): List of frames for which the
            velocities of the atoms are rescaled. Defaults to [].
        rescale_temps (List[int], optional): List of rescaled temperatures.
            Defaults to [].
        dft_softwarename (str, optional): DFT code used to calculate
            ab initio forces during training. Defaults to "qe".
        no_cpus (int, optional): Number of cpus used during training.
            Defaults to 1.
        npool (int, optional): Number of k-point pools for DFT
            calculations. Defaults to None.
        mpi (str, optional): Determines how mpi is called. Defaults to
            "srun".
        dft_kwargs ([type], optional): Additional arguments which are
            passed when DFT is called; keyword arguments vary based on the
            program (e.g. ESPRESSO vs. VASP). Defaults to None.
        store_dft_output (Tuple[Union[str,List[str]],str], optional):
            After DFT calculations are called, copy the file or files
            specified in the first element of the tuple to a directory
            specified as the second element of the tuple.
            Useful when DFT calculations are expensive and want to be kept
            for later use. The first element of the tuple can either be a
            single file name, or a list of several. Copied files will be
            prepended with the date and time with the format
            'Year.Month.Day:Hour:Minute:Second:'.
    """
    def __init__(self,
                 dft_input: str,
                 dt: float,
                 number_of_steps: int,
                 gp: gp.GaussianProcess,
                 dft_loc: str,
                 std_tolerance_factor: float = 1,
                 prev_pos_init: 'ndarray' = None,
                 par: bool = False,
                 skip: int = 0,
                 init_atoms: List[int] = None,
                 calculate_energy: bool = False,
                 output_name: str = 'otf_run',
                 max_atoms_added: int = 1,
                 freeze_hyps: int = 10,
                 rescale_steps: List[int] = None,
                 rescale_temps: List[int] = None,
                 dft_softwarename: str = "qe",
                 no_cpus: int = 1,
                 npool: int = None,
                 mpi: str = "srun",
                 dft_kwargs=None,
                 store_dft_output: Tuple[Union[str, List[str]], str] = None):
        """Store the run settings and build the initial structure.

        The DFT input file is parsed with the selected DFT module to obtain
        positions, species, cell and masses. ``rescale_steps`` and
        ``rescale_temps`` default to ``None``, which is treated as an empty
        list; a fresh list is created per instance so that instances never
        share one mutable default object.
        """

        self.dft_input = dft_input
        self.dt = dt
        self.number_of_steps = number_of_steps
        self.gp = gp
        self.dft_loc = dft_loc
        self.std_tolerance = std_tolerance_factor
        self.skip = skip
        self.dft_step = True
        self.freeze_hyps = freeze_hyps
        self.dft_module = dft_software[dft_softwarename]

        # parse input file
        positions, species, cell, masses = \
            self.dft_module.parse_dft_input(self.dft_input)

        _, coded_species = struc.get_unique_species(species)

        self.structure = struc.Structure(cell=cell,
                                         species=coded_species,
                                         positions=positions,
                                         mass_dict=masses,
                                         prev_positions=prev_pos_init,
                                         species_labels=species)

        self.noa = self.structure.positions.shape[0]
        self.atom_list = list(range(self.noa))
        self.curr_step = 0

        self.max_atoms_added = max_atoms_added

        # initialize local energies
        if calculate_energy:
            self.local_energies = np.zeros(self.noa)
        else:
            self.local_energies = None

        # set atom list for initial dft run
        if init_atoms is None:
            self.init_atoms = [int(n) for n in range(self.noa)]
        else:
            self.init_atoms = init_atoms

        self.dft_count = 0

        # set pred function: serial/parallel, with or without energies
        if calculate_energy:
            self.pred_func = (predict.predict_on_structure_par_en if par
                              else predict.predict_on_structure_en)
        else:
            self.pred_func = (predict.predict_on_structure_par if par
                              else predict.predict_on_structure)
        self.par = par

        # set rescale attributes (None -> new empty list, never a shared one)
        self.rescale_steps = [] if rescale_steps is None else rescale_steps
        self.rescale_temps = [] if rescale_temps is None else rescale_temps

        self.output = Output(output_name, always_flush=True)

        # set number of cpus and npool for DFT runs
        self.no_cpus = no_cpus
        self.npool = npool
        self.mpi = mpi

        self.dft_kwargs = dft_kwargs
        self.store_dft_output = store_dft_output

    def run(self):
        """
        Performs an on-the-fly training run.

        Writes the output header, then loops until ``curr_step`` reaches
        ``number_of_steps``. On step 0 (when ``std_tolerance`` is non-zero)
        DFT is called and the GP is updated with ``init_atoms``. On every
        other step forces are predicted with the GP and checked with
        ``is_std_in_bound``; if the check fails, DFT is called, the GP/DFT
        force error is logged and the returned target atoms are added to the
        GP. Hyperparameters are only re-optimized while
        ``dft_count - 1 < freeze_hyps``. The run is concluded through
        ``self.output.conclude_run()``.
        """

        self.output.write_header(self.gp.cutoffs, self.gp.kernel_name,
                                 self.gp.hyps, self.gp.algo, self.dt,
                                 self.number_of_steps, self.structure,
                                 self.std_tolerance)
        # counter is compared against self.skip to decide when a GP-predicted
        # frame is recorded; it is reset each time such a frame is written.
        counter = 0
        self.start_time = time.time()

        while self.curr_step < self.number_of_steps:
            print('curr_step:', self.curr_step)
            # run DFT and train initial model if first step and DFT is on
            if self.curr_step == 0 and self.std_tolerance != 0:
                # call dft and update positions
                self.run_dft()
                # deep copy so later changes to structure.forces cannot alter
                # the forces handed to update_gp
                dft_frcs = copy.deepcopy(self.structure.forces)
                new_pos = md.update_positions(self.dt, self.noa,
                                              self.structure)
                self.update_temperature(new_pos)
                self.record_state()

                # make initial gp model and predict forces
                self.update_gp(self.init_atoms, dft_frcs)
                # run_dft has already incremented dft_count, hence the -1
                if (self.dft_count - 1) < self.freeze_hyps:
                    self.train_gp()

            # after step 1, try predicting with GP model
            else:
                self.gp.check_L_alpha()
                self.pred_func(self.structure, self.gp, self.no_cpus)
                # flag this frame as GP-predicted unless DFT is triggered below
                self.dft_step = False
                new_pos = md.update_positions(self.dt, self.noa,
                                              self.structure)

                # get max uncertainty atoms
                # NOTE(review): hyps[-1] is presumably the noise
                # hyperparameter referred to in the class docstring - confirm.
                std_in_bound, target_atoms = \
                    is_std_in_bound(self.std_tolerance,
                                    self.gp.hyps[-1], self.structure,
                                    self.max_atoms_added)

                if not std_in_bound:
                    # record GP forces
                    self.update_temperature(new_pos)
                    self.record_state()
                    gp_frcs = copy.deepcopy(self.structure.forces)

                    # run DFT and record forces
                    self.dft_step = True
                    self.run_dft()
                    dft_frcs = copy.deepcopy(self.structure.forces)
                    # recompute the next positions now that structure.forces
                    # holds the DFT forces
                    new_pos = md.update_positions(self.dt, self.noa,
                                                  self.structure)
                    self.update_temperature(new_pos)
                    self.record_state()

                    # compute mae and write to output
                    mae = np.mean(np.abs(gp_frcs - dft_frcs))
                    mac = np.mean(np.abs(dft_frcs))

                    self.output.write_to_log('\nmean absolute error:'
                                             ' %.4f eV/A \n' % mae)
                    self.output.write_to_log('mean absolute dft component:'
                                             ' %.4f eV/A \n' % mac)

                    # add max uncertainty atoms to training set
                    self.update_gp(target_atoms, dft_frcs)
                    # stop re-optimizing once dft_count exceeds freeze_hyps
                    if (self.dft_count - 1) < self.freeze_hyps:
                        self.train_gp()

            # write gp forces
            # (DFT frames were already recorded in the branches above)
            if counter >= self.skip and not self.dft_step:
                self.update_temperature(new_pos)
                self.record_state()
                counter = 0

            counter += 1
            self.update_positions(new_pos)
            self.curr_step += 1

        self.output.conclude_run()

    def run_dft(self):
        """Calculates DFT forces on atoms in the current structure.
        
        If OTF has store_dft_output set, then the specified DFT files will
        be copied with the current date and time prepended in the format
        'Year.Month.Day:Hour:Minute:Second:'.
        """

        self.output.write_to_log('\nCalling DFT...\n')

        # calculate DFT forces
        forces = self.dft_module.run_dft_par(self.dft_input,
                                             self.structure,
                                             self.dft_loc,
                                             ncpus=self.no_cpus,
                                             npool=self.npool,
                                             mpi=self.mpi,
                                             dft_kwargs=self.dft_kwargs)
        self.structure.forces = forces

        # write wall time of DFT calculation
        self.dft_count += 1
        self.output.write_to_log('DFT run complete.\n')
        time_curr = time.time() - self.start_time
        self.output.write_to_log('number of DFT calls: %i \n' % self.dft_count)
        self.output.write_to_log('wall time from start: %.2f s \n' % time_curr)

        # Store DFT outputs in another folder if desired
        # specified in self.store_dft_output
        if self.store_dft_output is not None:
            dest = self.store_dft_output[1]
            target_files = self.store_dft_output[0]
            now = datetime.now()
            dt_string = now.strftime("%Y.%m.%d:%H:%M:%S:")
            if isinstance(target_files, str):
                to_copy = [target_files]
            else:
                to_copy = target_files
            for file in to_copy:
                copyfile(file, dest + '/' + dt_string + file)

    def update_gp(self, train_atoms: List[int], dft_frcs: 'ndarray'):
        """Updates the current GP model.

        Args:
            train_atoms (List[int]): List of atoms whose local environments
                will be added to the training set.
            dft_frcs (np.ndarray): DFT forces on all atoms in the structure.
        """
        self.output.write_to_log(
            '\nAdding atom {} to the training set.\n'.format(train_atoms))
        self.output.write_to_log('Uncertainty: {}.\n'.format(
            self.structure.stds[train_atoms[0]]))

        # update gp model
        self.gp.update_db(self.structure, dft_frcs, custom_range=train_atoms)

        self.gp.set_L_alpha()

    def train_gp(self):
        """Optimizes the hyperparameters of the current GP model."""

        self.gp.train(self.output)
        self.output.write_hyps(self.gp.hyp_labels, self.gp.hyps,
                               self.start_time, self.gp.likelihood,
                               self.gp.likelihood_gradient)

    def update_positions(self, new_pos: 'ndarray'):
        """Performs a Verlet update of the atomic positions.

        Args:
            new_pos (np.ndarray): Positions of atoms in the next MD frame.
        """
        if self.curr_step in self.rescale_steps:
            rescale_ind = self.rescale_steps.index(self.curr_step)
            temp_fac = self.rescale_temps[rescale_ind] / self.temperature
            vel_fac = np.sqrt(temp_fac)
            self.structure.prev_positions = \
                new_pos - self.velocities * self.dt * vel_fac
        else:
            self.structure.prev_positions = self.structure.positions
        self.structure.positions = new_pos
        self.structure.wrap_positions()

    def update_temperature(self, new_pos: 'ndarray'):
        """Refreshes the stored kinetic energy, temperature and velocities.

        The three values are taken, in that order, from
        ``md.calculate_temperature``.

        Args:
            new_pos (np.ndarray): Positions of atoms in the next MD frame.
        """
        thermo = md.calculate_temperature(new_pos, self.structure, self.dt,
                                          self.noa)
        self.KE, self.temperature, self.velocities = thermo

    def record_state(self):
        self.output.write_md_config(self.dt, self.curr_step, self.structure,
                                    self.temperature, self.KE,
                                    self.local_energies, self.start_time,
                                    self.dft_step, self.velocities)
        self.output.write_xyz_config(self.curr_step, self.structure,
                                     self.dft_step)
示例#9
0
class LearningProtocol:
    def __init__(
        self,
        gp: Union[GaussianProcess, MappedGaussianProcess],
        active_frames: List[Structure] = None,
        passive_frames: List[Structure] = None,
        passive_envs: List[Tuple[AtomicEnvironment, "np.array"]] = None,
        active_rel_var_tol: float = 4,
        active_abs_var_tol: float = 1,
        active_abs_error_tol: float = 0,
        active_error_tol_cutoff: float = inf,
        active_max_trains: int = np.inf,
        active_max_element_from_frame: dict = None,
        checkpoint_interval_train: int = 1,
        checkpoint_interval_atom: int = 100,
        predict_atoms_per_element: dict = None,
        max_atoms_from_frame: int = np.inf,
        min_atoms_added_per_train: int = 1,
        max_model_size: int = np.inf,
        passive_on_active_skips: int = -1,
        passive_train_max_iter: int = 50,
        passive_atoms_per_element: dict = None,
        active_skip: int = 1,
        shuffle_active_frames: bool = False,
        n_cpus: int = 1,
        validate_ratio: float = 0.0,
        calculate_energy: bool = False,
        output_name: str = "gp_from_aimd",
        print_as_xyz: bool = False,
        verbose: str = "INFO",
        written_model_format: str = "json",
    ):
        """
        Class which trains a GP off of an AIMD trajectory, and generates
        error statistics between the DFT and GP calls.

        Arguments are split between 'passive' and 'active' learning. By
        default, when run is called, a 'passive' learning run is performed
        first, which either adds all 'seed' environments to the model or a
        randomized subset of atoms from the frames. If no arguments are
        specified, the very first frame of the active learning frames will
        be used.

        "Passive" learning adds data based on random selection of atoms
        from a given ab-initio frame.
        "Active" learning adds data to the dataset based on the performance
        of the GP itself: the force error and the GP's internal uncertainty
        estimate.

        There is a wide variety of options which give finer control over
        the training process.

        :param gp: Gaussian Process object
        :param active_frames: List of structures to evaluate / train GP on
        :param passive_frames: Frames to train on before running
        :param passive_envs: Environments to train on before running
        :param active_rel_var_tol: Train if uncertainty is above this *
            noise variance hyperparameter
        :param active_abs_var_tol: Train if uncertainty is above this
        :param active_abs_error_tol: Add atom if force error exceeds this
        :param active_error_tol_cutoff: Don't add atom if force error
            exceeds this
        :param active_max_trains: Stop training GP after this many calls to
            train
        :param active_max_element_from_frame: Max # of environments to add
            from each species in the training steps
        :param checkpoint_interval_train: How often to write model after
            trainings
        :param checkpoint_interval_atom: How often to write model after
            atoms are added (since atoms may be added without training)
        :param predict_atoms_per_element: Choose a random subset of N random
            atoms from each specified element to predict on. For instance,
            {"H":5} will only predict the forces and uncertainties
            associated with 5 Hydrogen atoms per frame. Elements not
            specified will be predicted as normal. This is useful for
            systems where you are most interested in a subset of elements.
            This will result in a faster but less exhaustive learning
            process.
        :param max_atoms_from_frame: Largest # of atoms added from one frame
        :param min_atoms_added_per_train: Only train when this many atoms
            have been added
        :param passive_on_active_skips: Train model on every n frames before
            running
        :param passive_train_max_iter: Stored as the max-iteration setting
            for pre-training
        :param passive_atoms_per_element: Max # of environments to add from
            each species in the seed pre-training steps
        :param active_skip: Skip through frames
        :param shuffle_active_frames: Randomize order of frames for better
            training (the passed list is shuffled in place)
        :param n_cpus: Number of CPUs to parallelize over for
            parallelization over atoms
        :param validate_ratio: Fraction of frames used for validation
        :param calculate_energy: Use local energy kernel or not
        :param output_name: Write output of training to this file
        :param print_as_xyz: If True, print the configurations in xyz format
        :param verbose: same as logging level, "WARNING", "INFO", "DEBUG"
        :param written_model_format: Format to write GP model to
        """

        # Model and whether it is a mapped GP (flagged once for later use)
        self.gp = gp
        self.mgp = isinstance(gp, MappedGaussianProcess)

        # Thresholds and limits steering the active learning loop
        self.rel_std_tolerance = active_rel_var_tol
        self.abs_std_tolerance = active_abs_var_tol
        self.abs_force_tolerance = active_abs_error_tol
        self.max_force_error = active_error_tol_cutoff
        self.max_trains = active_max_trains
        self.max_atoms_from_frame = max_atoms_from_frame
        self.min_atoms_per_train = min_atoms_added_per_train
        self.max_model_size = max_model_size

        # Prediction routine: mapped GP, GP with energies, or plain GP
        if self.mgp:
            self.pred_func = predict_on_structure_mgp
        elif calculate_energy:
            self.pred_func = predict_on_structure_par_en
        else:
            self.pred_func = predict_on_structure_par

        self.start_time = time.time()

        self.train_count = 0
        self.calculate_energy = calculate_energy
        self.n_cpus = n_cpus

        # Output and checkpointing
        self.output = Output(output_name,
                             verbose,
                             print_as_xyz=print_as_xyz,
                             always_flush=True)
        self.logger_name = self.output.basename + "log"
        self.train_checkpoint_interval = checkpoint_interval_train
        self.atom_checkpoint_interval = checkpoint_interval_atom

        self.model_format = written_model_format
        self.output_name = output_name

        # gpfa only function
        self.predict_atoms_per_element = predict_atoms_per_element

        # Frames for the active pass; shuffling reorders the caller's list
        self.frames = active_frames
        if shuffle_active_frames:
            np.random.shuffle(active_frames)

        # Frame stride and validation split
        self.skip = active_skip
        skip_is_valid = isinstance(active_skip, int) and active_skip >= 1
        assert skip_is_valid, "Skip needs to be a  positive integer."
        self.validate_ratio = validate_ratio
        assert 0 <= validate_ratio <= 1, "validate_ratio needs to be [0,1]"

        # Pre-training (passive) settings; None means "nothing supplied"
        self.pre_train_max_iter = passive_train_max_iter
        self.pre_train_on_skips = passive_on_active_skips
        self.seed_envs = passive_envs if passive_envs is not None else []
        self.seed_frames = (passive_frames
                            if passive_frames is not None else [])

        if passive_atoms_per_element is None:
            self.pre_train_env_per_species = {}
        else:
            self.pre_train_env_per_species = passive_atoms_per_element

        if active_max_element_from_frame is None:
            self.train_env_per_species = {}
        else:
            self.train_env_per_species = active_max_element_from_frame

        # Convert to Coded Species: every existing key gets a twin entry
        # under element_to_Z(key); the original keys are kept as well.
        per_species = self.pre_train_env_per_species
        for symbol in list(per_species):
            per_species[element_to_Z(symbol)] = per_species[symbol]

        # Defining variables to be used later
        self.curr_step = 0
        self.train_count = 0
        self.start_time = time.time()

    def get_next_env(self):
        self.curr_env_index += 1
        if self.curr_env_index < len(self.seed_envs):
            return self.seed_envs[self.curr_env_index]
        return None

    def get_next_passive_frame(self):
        self.curr_passive_frame_index += 1
        if self.curr_passive_frame_index < len(self.seed_frames):
            return self.seed_frames[self.curr_passive_frame_index]
        return None

    def preparation_for_passive_run(self):
        # Remove frames used as seed from later part of training
        if self.pre_train_on_skips > 0:
            self.seed_frames = []
            newframes = []
            for i in range(len(self.frames)):
                if (i % self.pre_train_on_skips) == 0:
                    self.seed_frames += [self.frames[i]]
                else:
                    newframes += [self.frames[i]]
            self.frames = newframes
        # If the GP is empty, use the first frame as a seed frame.
        elif len(self.gp.training_data) == 0 and len(self.seed_frames) == 0:
            self.seed_frames = [self.frames[0]]
            self.frames = self.frames[1:]

    def preparation_for_active_run(self):
        """Hook called by ``active_run`` before it starts fetching frames.

        Must be overridden by a child class; this base version always
        raises NotImplementedError.
        """
        raise NotImplementedError("need to be implemented in child class")

    def get_next_active_frame(self):
        """Hook that supplies frames to ``active_run``.

        ``active_run`` keeps looping while the returned value is not None.
        Must be overridden by a child class; this base version always
        raises NotImplementedError.
        """
        raise NotImplementedError("need to be implemented in child class")

    def decide_to_update_db(self):
        """Hook whose truth value gates the per-frame update checks.

        ``active_run`` only runs its uncertainty and force-error checks for
        a frame when this returns a truthy value. Must be overridden by a
        child class; this base version always raises NotImplementedError.
        """
        raise NotImplementedError("need to be implemented in child class")

    def decide_to_checkLalpha(self):
        """Hook to be overridden by a child class.

        NOTE(review): presumably decides whether ``gp.check_L_alpha()``
        should be run; the caller is not visible here - confirm against the
        child classes. This base version always raises NotImplementedError.
        """
        raise NotImplementedError("need to be implemented in child class")

    def passive_run(self):
        """
        Various tasks to set up the AIMD training before commencing
        the run through the AIMD trajectory.
        1. Print the output.
        2. Pre-train the GP with the seed frames and
        environments. If no seed frames or environments and the GP has no
        training set, then seed with at least one atom from each
        """

        if self.mgp:
            raise NotImplementedError("Pre-running notyet configured for MGP")
        self.output.write_header(
            str(self.gp),
            dt=0,
            Nsteps=len(self.frames),
            structure=None,
            std_tolerance=(self.rel_std_tolerance, self.abs_std_tolerance),
            optional={
                "GP Statistics": json.dumps(self.gp.training_statistics),
                "GP Name": self.gp.name,
                "GP Write Name":
                self.output_name + "_model." + self.model_format,
            },
        )

        self.start_time = time.time()
        logger = logging.getLogger(self.logger_name)
        logger.debug("Now beginning pre-run activity.")

        # If seed environments were passed in, add them to the GP.

        self.preparation_for_passive_run()

        self.curr_env_index = -1
        curr_env = self.get_next_env()
        while curr_env is not None:
            self.gp.add_one_env(curr_env[0], curr_env[1], train=False)
            curr_env = self.get_next_env()

        # No training set ("blank slate" run) and no seeds specified:
        # Take one of each atom species in the first frame
        # so all atomic species are represented in the first step.
        # Otherwise use the seed frames passed in by user.

        self.passive_atom_count = 0
        self.curr_passive_frame_index = -1
        frame = self.get_next_passive_frame()
        while frame is not None:

            train_atoms = []
            for species_i in set(frame.coded_species):
                # Get a randomized set of atoms of species i from the frame
                # So that it is not always the lowest-indexed atoms chosen
                atoms_of_specie = frame.indices_of_specie(species_i)
                np.random.shuffle(atoms_of_specie)
                n_at = len(atoms_of_specie)
                # Determine how many to add based on user defined cutoffs
                n_to_add = min(
                    n_at,
                    self.pre_train_env_per_species.get(species_i, inf),
                    self.max_atoms_from_frame,
                )

                for atom in atoms_of_specie[:n_to_add]:
                    train_atoms.append(atom)
                    self.passive_atom_count += 1

            self.update_gp_and_print(frame=frame,
                                     train_atoms=train_atoms,
                                     uncertainties=[],
                                     train=False)
            frame = self.get_next_passive_frame()

        logger = logging.getLogger(self.logger_name)
        if self.passive_atom_count > 0:
            logger.info(f"Added {self.passive_atom_count} atoms to "
                        "pretrain.\n"
                        "Pre-run GP Statistics: "
                        f"{json.dumps(self.gp.training_statistics)} ")

        if (self.seed_envs or self.passive_atom_count
                or self.seed_frames) and (self.pre_train_max_iter
                                          or self.max_trains):
            logger.debug("Now commencing pre-run training of GP (which has "
                         "non-empty training set)")
            time0 = time.time()
            self.train_gp(max_iter=self.pre_train_max_iter)
            logger.debug(f"Done train_gp {time.time()-time0}")
        else:
            logger.debug(
                "Now commencing pre-run set up of GP (which has non-empty training set)"
            )
            time0 = time.time()
            self.gp.check_L_alpha()
            logger.debug(f"Done check_L_alpha {time.time()-time0}")

        if self.model_format and not self.mgp:
            self.gp.write_model(f"{self.output_name}_prerun",
                                self.model_format)

    def active_run(self):
        """
        Loop through frames and record the error between
        the GP predictions and the ground-truth forces. Train the GP and update
        the training set upon the triggering of the uncertainty or force error
        threshold.
        :return: None
        """

        # Perform pre-run, in which seed trames are used.
        logger = logging.getLogger(self.logger_name)
        logger.debug("Commencing run with pre-run...")
        if not self.mgp:
            if len(self.gp) == 0:
                logger.warning("You are attempting to train a model with no "
                               "data in your Gausian Process; it is "
                               "recommended that you begin with "
                               "a passive training process.")

        self.preparation_for_active_run()

        # Loop through trajectory.
        self.cur_atoms_added_train = 0  # Track atoms added for training
        cur_atoms_added_write = 0  # Track atoms added for writing
        cur_trains_done_write = 0  # Track training done for writing

        self.curr_active_frame_index = -1
        cur_frame = self.get_next_active_frame()
        while cur_frame is not None:

            frame_start_time = time.time()
            logger.info(
                f"=====NOW ON FRAME {self.curr_active_frame_index}=====")

            # If no predict_atoms_per_element was specified, predict_atoms
            # will be equal to every atom in the frame.
            predict_atoms = subset_of_frame_by_element(
                cur_frame, self.predict_atoms_per_element)

            # Atoms which are skipped will have NaN as their force / std values
            local_energies = None

            # Three different predictions: Either MGP, GP with energy,
            # or GP without
            if self.mgp:
                pred_forces, pred_stds, local_energies = self.pred_func(
                    structure=cur_frame,
                    mgp=self.gp,
                    write_to_structure=False,
                    selective_atoms=predict_atoms,
                    skipped_atom_value=np.nan,
                    energy=True,
                )
            elif self.calculate_energy:
                pred_forces, pred_stds, local_energies = self.pred_func(
                    structure=cur_frame,
                    gp=self.gp,
                    n_cpus=self.n_cpus,
                    write_to_structure=False,
                    selective_atoms=predict_atoms,
                    skipped_atom_value=np.nan,
                )
            else:
                pred_forces, pred_stds = self.pred_func(
                    structure=cur_frame,
                    gp=self.gp,
                    n_cpus=self.n_cpus,
                    write_to_structure=False,
                    selective_atoms=predict_atoms,
                    skipped_atom_value=np.nan,
                )

            # Get Error
            dft_forces = cur_frame.forces
            dft_energy = cur_frame.energy
            error = np.abs(pred_forces - dft_forces)

            # Create dummy frame with the predicted forces written
            dummy_frame = deepcopy(cur_frame)
            dummy_frame.forces = pred_forces
            dummy_frame.stds = pred_stds

            self.output.write_gp_dft_comparison(
                curr_step=self.curr_active_frame_index,
                frame=dummy_frame,
                start_time=time.time(),
                dft_forces=dft_forces,
                dft_energy=dft_energy,
                error=error,
                local_energies=local_energies,
                KE=0,
            )

            logger.debug(
                f"Single frame calculation time {time.time()-frame_start_time}"
            )

            if self.decide_to_update_db():

                # Noise hyperparameter & relative std tolerance is not for mgp.
                if self.mgp:
                    noise = 0
                else:
                    noise = Parameters.get_noise(self.gp.hyps_mask,
                                                 self.gp.hyps,
                                                 constraint=False)

                std_in_bound, std_train_atoms = is_std_in_bound_per_species(
                    rel_std_tolerance=self.rel_std_tolerance,
                    abs_std_tolerance=self.abs_std_tolerance,
                    noise=noise,
                    structure=dummy_frame,
                    max_atoms_added=self.max_atoms_from_frame,
                    max_by_species=self.train_env_per_species,
                )

                # Get max force error atoms
                force_in_bound, force_train_atoms = is_force_in_bound_per_species(
                    abs_force_tolerance=self.abs_force_tolerance,
                    predicted_forces=pred_forces,
                    label_forces=dft_forces,
                    structure=dummy_frame,
                    max_atoms_added=self.max_atoms_from_frame,
                    max_by_species=self.train_env_per_species,
                    max_force_error=self.max_force_error,
                )

                if not std_in_bound or not force_in_bound:

                    # -1 is returned from the is_in_bound methods,
                    # so filter that out and the use sets to remove repeats
                    train_atoms = list(
                        set(std_train_atoms).union(force_train_atoms) - {-1})

                    # Compute mae and write to output;
                    # Add max uncertainty atoms to training set
                    self.update_gp_and_print(
                        cur_frame,
                        train_atoms=train_atoms,
                        uncertainties=pred_stds[train_atoms],
                        train=False,
                    )
                    self.cur_atoms_added_train += len(train_atoms)
                    cur_atoms_added_write += len(train_atoms)
                    # Re-train if number of sampled atoms is high enough

                    if self.decide_to_train():
                        self.train_gp()
                        cur_trains_done_write += 1
                        self.cur_atoms_added_train = 0
                    else:
                        self.gp.update_L_alpha()
                        # self.cur_atoms_added_train = 0

                    # Loop to decide of a model should be written this
                    # iteration
                    will_write = False

                    if (self.train_checkpoint_interval
                            and cur_trains_done_write
                            and self.train_checkpoint_interval <=
                            cur_trains_done_write):
                        will_write = True
                        cur_trains_done_write = 0

                    if (self.atom_checkpoint_interval and cur_atoms_added_write
                            and self.atom_checkpoint_interval <=
                            cur_atoms_added_write):
                        will_write = True
                        cur_atoms_added_write = 0

                    if self.model_format and will_write:
                        self.gp.write_model(f"{self.output_name}_checkpt",
                                            self.model_format)

                if self.decide_to_checkLalpha():
                    self.gp.check_L_alpha()

            cur_frame = self.get_next_active_frame()

        self.output.conclude_run()

        if self.model_format and not self.mgp:
            self.gp.write_model(f"{self.output_name}_model", self.model_format)

    def update_gp_and_print(
        self,
        frame: Structure,
        train_atoms: List[int],
        uncertainties: List[int] = None,
        train: bool = True,
    ):
        """
        Update the internal GP model training set with a list of training
        atoms indexing atoms within the frame, and log which atoms were
        added. If train is True, re-train the GP by optimizing
        hyperparameters.

        :param frame: Structure to train on; its forces are used as labels.
        :param train_atoms: Index atoms to train on
        :param uncertainties: Uncertainties to print for the added atoms.
            If None, they are read from frame.stds[train_atoms]. Values that
            are passed in are logged as given; pass in [] to silence.
        :param train: Train or not
        :raises NotImplementedError: if self.mgp is set, since updating an
            MGP is not supported here.
        :return: None
        """

        # Group added atoms by species for easier output. Species appear in
        # the order in which they are first encountered in train_atoms, so
        # the logged message is deterministic.
        added_atoms = {}
        for atom in train_atoms:
            spec = Z_to_element(frame.coded_species[atom])
            added_atoms.setdefault(spec, []).append(atom)

        logger = logging.getLogger(self.logger_name)
        logger.info("Adding atom(s) "
                    f"{json.dumps(added_atoms,cls=NumpyEncoder)}"
                    " to the training set.")

        # Only fall back to the stds stored on the frame when the caller did
        # not provide any; explicitly passed uncertainties must not be
        # overwritten (the frame may not carry the predicted stds).
        if uncertainties is None:
            uncertainties = frame.stds[train_atoms]

        if len(uncertainties) != 0:
            logger.info(f"Uncertainties: {uncertainties}.")

        # update gp model; handling differently if it's an MGP
        if self.mgp:
            raise NotImplementedError

        self.gp.update_db(frame, frame.forces, custom_range=train_atoms)

        if train:
            self.train_gp()

    def train_gp(self, max_iter: int = None):
        """
        Train the Gaussian process and write the results to the output file.

        If max_iter is 0 no optimization is run and only check_L_alpha is
        called on the GP; the hyperparameters are still written to the
        output and the training counter is still incremented.

        :param max_iter: Maximum iterations associated with this training run,
            overriding the Gaussian Process's internally set maxiter.
        :type max_iter: int
        """

        logger = logging.getLogger(self.logger_name)
        logger.debug("Train GP")

        logger_train = self.output.basename + "hyps"

        # TODO: Improve flexibility in GP training to make this next step
        # unnecessary, so maxiter can be passed as an argument

        # Don't train if maxiter == 0
        if max_iter == 0:
            self.gp.check_L_alpha()
        elif max_iter is not None:
            temp_maxiter = self.gp.maxiter
            self.gp.maxiter = max_iter
            try:
                self.gp.train(logger_name=logger_train)
            finally:
                # Always restore the GP's own maxiter, even if training
                # raises, so the temporary override cannot leak.
                self.gp.maxiter = temp_maxiter
        else:
            self.gp.train(logger_name=logger_train)

        hyps, labels = Parameters.get_hyps(self.gp.hyps_mask,
                                           self.gp.hyps,
                                           constraint=False,
                                           label=True)
        if labels is None:
            labels = self.gp.hyp_labels
        self.output.write_hyps(
            labels,
            hyps,
            self.start_time,
            self.gp.likelihood,
            self.gp.likelihood_gradient,
            hyps_mask=self.gp.hyps_mask,
        )
        self.train_count += 1
示例#10
0
class OTF:
    """Trains a Gaussian process force field on the fly during
        molecular dynamics.

    Args:
        dt (float): MD timestep.
        number_of_steps (int): Number of timesteps in the training
            simulation.
        prev_pos_init (ndarray, optional): Previous positions. Defaults
            to None.
        rescale_steps (List[int], optional): List of frames for which the
            velocities of the atoms are rescaled. Defaults to [].
        rescale_temps (List[int], optional): List of rescaled temperatures.
            Defaults to [].

        gp (gp.GaussianProcess): Initial GP model.
        calculate_energy (bool, optional): If True, the energy of each
            frame is calculated with the GP. Defaults to False.
        calculate_efs (bool, optional): If True, the energy and stress of each
            frame is calculated with the GP. Defaults to False.
        write_model (int, optional): If 0, write never. If 1, write at
            end of run. If 2, write after each training and end of run.
            If 3, write after each time atoms are added and end of run.

        std_tolerance_factor (float, optional): Threshold that determines
            when DFT is called. Specifies a multiple of the current noise
            hyperparameter. If the epistemic uncertainty on a force
            component exceeds this value, DFT is called. Defaults to 1.
        skip (int, optional): Number of frames that are skipped when
            dumping to the output file. Defaults to 0.
        init_atoms (List[int], optional): List of atoms from the input
            structure whose local environments and force components are
            used to train the initial GP model. If None is specified, all
            atoms are used to train the initial GP. Defaults to None.
        output_name (str, optional): Name of the output file. Defaults to
            'otf_run'.
        max_atoms_added (int, optional): Number of atoms added each time
            DFT is called. Defaults to 1.
        freeze_hyps (int, optional): Specifies the number of times the
            hyperparameters of the GP are optimized. After this many
            updates to the GP, the hyperparameters are frozen.
            Defaults to 10.

        force_source (Union[str, object], optional): DFT code used to calculate
            ab initio forces during training. A custom module can be used here
            in place of the DFT modules available in the FLARE package. The
            module must contain two functions: parse_dft_input, which takes a
            file name (in string format) as input and returns the positions,
            species, cell, and masses of a structure of atoms; and run_dft_par,
            which takes a number of DFT related inputs and returns the forces
            on all atoms.  Defaults to "qe".
        npool (int, optional): Number of k-point pools for DFT
            calculations. Defaults to None.
        mpi (str, optional): Determines how mpi is called. Defaults to
            "srun".
        dft_loc (str): Location of DFT executable.
        dft_input (str): Input file.
        dft_output (str): Output file.
        dft_kwargs ([type], optional): Additional arguments which are
            passed when DFT is called; keyword arguments vary based on the
            program (e.g. ESPRESSO vs. VASP). Defaults to None.
        store_dft_output (Tuple[Union[str,List[str]],str], optional):
            After DFT calculations are called, copy the file or files
            specified in the first element of the tuple to a directory
            specified as the second element of the tuple.
            Useful when DFT calculations are expensive and want to be kept
            for later use. The first element of the tuple can either be a
            single file name, or a list of several. Copied files will be
            prepended with the date and time with the format
            'Year.Month.Day:Hour:Minute:Second:'.

        n_cpus (int, optional): Number of cpus used during training.
            Defaults to 1.
    """

    def __init__(
        self,
        # md args
        dt: float,
        number_of_steps: int,
        prev_pos_init: 'ndarray' = None,
        rescale_steps: List[int] = None,
        rescale_temps: List[int] = None,
        # flare args
        gp: gp.GaussianProcess = None,
        calculate_energy: bool = False,
        calculate_efs: bool = False,
        write_model: int = 0,
        # otf args
        std_tolerance_factor: float = 1,
        skip: int = 0,
        init_atoms: List[int] = None,
        output_name: str = 'otf_run',
        max_atoms_added: int = 1,
        freeze_hyps: int = 10,
        # dft args
        force_source: str = "qe",
        npool: int = None,
        mpi: str = "srun",
        dft_loc: str = None,
        dft_input: str = None,
        dft_output='dft.out',
        dft_kwargs=None,
        store_dft_output: Tuple[Union[str, List[str]], str] = None,
        # par args
        n_cpus: int = 1,
    ):
        """Store the run settings, parse the input structure and choose the
        prediction function.

        The arguments are described in the class docstring. rescale_steps
        and rescale_temps default to None, which is treated as an empty
        list; a fresh list is created per instance so that instances never
        share a mutable default.
        """

        self.dft_input = dft_input
        self.dft_output = dft_output
        self.dt = dt
        self.number_of_steps = number_of_steps
        self.gp = gp
        self.dft_loc = dft_loc
        self.std_tolerance = std_tolerance_factor
        self.skip = skip
        self.dft_step = True
        self.freeze_hyps = freeze_hyps

        # force_source is either the name of a built-in DFT module or a
        # custom module object that is used directly.
        if isinstance(force_source, str):
            self.dft_module = dft_software[force_source]
        else:
            self.dft_module = force_source

        # parse input file (needs dft_module and dft_input to be set)
        self.get_structure_from_input(prev_pos_init)

        self.noa = self.structure.positions.shape[0]
        self.atom_list = list(range(self.noa))
        self.curr_step = 0

        self.max_atoms_added = max_atoms_added

        # initialize local energies
        self.local_energies = np.zeros(self.noa) if calculate_energy else None

        # set atom list for initial dft run
        if init_atoms is None:
            self.init_atoms = list(range(self.noa))
        else:
            self.init_atoms = init_atoms

        self.dft_count = 0

        # Set the prediction function based on user inputs. The per-atom
        # parallel variants are only used when more than one cpu is
        # requested and the GP itself is configured for it; a missing GP
        # falls back to the serial variants instead of raising here.
        use_parallel = (n_cpus > 1 and gp is not None
                        and gp.per_atom_par and gp.parallel)

        if calculate_efs:
            # Energy, force, and stress prediction.
            self.pred_func = (predict.predict_on_structure_efs_par
                              if use_parallel
                              else predict.predict_on_structure_efs)
        elif calculate_energy:
            # Energy and force prediction.
            self.pred_func = (predict.predict_on_structure_par_en
                              if use_parallel
                              else predict.predict_on_structure_en)
        else:
            # Force only prediction.
            self.pred_func = (predict.predict_on_structure_par
                              if use_parallel
                              else predict.predict_on_structure)

        # set rescale attributes
        self.rescale_steps = [] if rescale_steps is None else rescale_steps
        self.rescale_temps = [] if rescale_temps is None else rescale_temps

        # set logger
        self.output = Output(output_name, always_flush=True)
        self.output_name = output_name

        # set number of cpus and npool for DFT runs
        self.n_cpus = n_cpus
        self.npool = npool
        self.mpi = mpi

        self.dft_kwargs = dft_kwargs
        self.store_dft_output = store_dft_output
        self.write_model = write_model

    def run(self):
        """
        Drives the on-the-fly training simulation until number_of_steps MD
        steps have been taken.

        On the first step, if DFT is enabled (std_tolerance != 0) and the GP
        has no training data yet, DFT is called to seed the model. Otherwise
        the GP predicts on the current structure and DFT is only called when
        is_std_in_bound reports that the uncertainty is out of bounds, in
        which case the returned atoms are added to the training set.

        If OTF has store_dft_output set, then the specified DFT files will
        be copied with the current date and time prepended in the format
        'Year.Month.Day:Hour:Minute:Second:'.
        """

        self.output.write_header(
            str(self.gp), self.dt, self.number_of_steps, self.structure,
            self.std_tolerance)

        # Number of loop iterations since a GP frame was last written.
        frames_since_write = 0
        self.start_time = time.time()

        while self.curr_step < self.number_of_steps:
            needs_initial_dft = (
                (self.curr_step == 0)
                and (self.std_tolerance != 0)
                and (len(self.gp.training_data) == 0)
            )

            if needs_initial_dft:
                # Seed the GP from a DFT call on the input structure.
                # Are the recorded forces from the GP or DFT in ASE OTF?
                # When DFT is called, ASE energy, forces, and stresses should
                # get updated.
                self.initialize_train()
                self.update_temperature()
                self.record_state()
            else:
                # Predict forces and stds with the current GP model.
                self.dft_step = False
                self.compute_properties()

                # Find the atoms with the largest uncertainty.
                noise = Parameters.get_noise(
                    self.gp.hyps_mask, self.gp.hyps, constraint=False)
                within_tolerance, uncertain_atoms = is_std_in_bound(
                    self.std_tolerance, noise, self.structure,
                    self.max_atoms_added)

                if not within_tolerance:
                    # Record the GP prediction before it is replaced.
                    self.update_temperature()
                    self.record_state()
                    gp_forces = deepcopy(self.structure.forces)

                    # Replace the forces with DFT forces.
                    self.dft_step = True
                    self.run_dft()
                    dft_forces = deepcopy(self.structure.forces)

                    # Record the state again, now with DFT forces.
                    self.record_state()

                    # Log the GP/DFT discrepancy.
                    self.compute_mae(gp_forces, dft_forces)

                    # Add the flagged atoms to the training set.
                    self.update_gp(uncertain_atoms, dft_forces)

            # Write GP frames, honouring the skip interval.
            if frames_since_write >= self.skip and not self.dft_step:
                self.update_temperature()
                self.record_state()
                frames_since_write = 0

            frames_since_write += 1
            # TODO: Reinstate velocity rescaling.
            self.md_step()
            self.curr_step += 1

        self.output.conclude_run()

        if self.write_model >= 1:
            self.gp.write_model(self.output_name+"_model")

    def get_structure_from_input(self, prev_pos_init):
        """Builds self.structure from the DFT input file.

        The positions, species, cell and masses are read from self.dft_input
        with the DFT module's parse_dft_input.

        Args:
            prev_pos_init: Previous positions, forwarded to the Structure as
                prev_positions.
        """
        parsed_input = self.dft_module.parse_dft_input(self.dft_input)
        positions, species, cell, masses = parsed_input

        # The parsed species serve both as species and as species labels.
        self.structure = struc.Structure(
            cell=cell,
            species=species,
            positions=positions,
            mass_dict=masses,
            prev_positions=prev_pos_init,
            species_labels=species,
        )

    def initialize_train(self):
        # call dft and update positions
        self.run_dft()
        dft_frcs = deepcopy(self.structure.forces)

        # make initial gp model and predict forces
        self.update_gp(self.init_atoms, dft_frcs)

    def compute_properties(self):
        '''
        In ASE-OTF, it will be replaced by subclass method
        '''
        self.gp.check_L_alpha()
        self.pred_func(self.structure, self.gp, self.n_cpus)

    def md_step(self):
        """
        Take an MD step. This updates the positions of the structure.

        Delegates to md.update_positions, passing the timestep, the number
        of atoms and the current structure.
        """
        md.update_positions(self.dt, self.noa, self.structure)

    def run_dft(self):
        """Calculates DFT forces on atoms in the current structure.

        If OTF has store_dft_output set, then the specified DFT files will
        be copied with the current date and time prepended in the format
        'Year.Month.Day:Hour:Minute:Second:'.

        Calculates DFT forces on atoms in the current structure."""

        f = logging.getLogger(self.output.basename+'log')
        f.info('\nCalling DFT...\n')

        # calculate DFT forces
        # TODO: Return stress and energy
        forces = self.dft_module.run_dft_par(
            self.dft_input, self.structure, self.dft_loc, n_cpus=self.n_cpus,
            dft_out=self.dft_output, npool=self.npool, mpi=self.mpi,
            dft_kwargs=self.dft_kwargs)

        # Note: also need to update stresses when performing a simulation
        # in the NPT ensemble.
        self.structure.forces = forces

        # write wall time of DFT calculation
        self.dft_count += 1
        self.output.conclude_dft(self.dft_count, self.start_time)

        # Store DFT outputs in another folder if desired
        # specified in self.store_dft_output
        if self.store_dft_output is not None:
            dest = self.store_dft_output[1]
            target_files = self.store_dft_output[0]
            now = datetime.now()
            dt_string = now.strftime("%Y.%m.%d:%H:%M:%S:")
            if isinstance(target_files, str):
                to_copy = [target_files]
            else:
                to_copy = target_files
            for ofile in to_copy:
                copyfile(ofile, dest+'/'+dt_string+ofile)

    def update_gp(self, train_atoms: List[int], dft_frcs: 'ndarray'):
        """
        Updates the current GP model.


        Args:
            train_atoms (List[int]): List of atoms whose local environments
                will be added to the training set.
            dft_frcs (np.ndarray): DFT forces on all atoms in the structure.
        """
        self.output.add_atom_info(train_atoms, self.structure.stds)

        # update gp model
        self.gp.update_db(self.structure, dft_frcs,
                          custom_range=train_atoms)

        self.gp.set_L_alpha()

        # write model
        if (self.dft_count-1) < self.freeze_hyps:
            self.train_gp()
            if self.write_model == 2:
                self.gp.write_model(self.output_name+"_model")
        if self.write_model == 3:
            self.gp.write_model(self.output_name+'_model')

    def train_gp(self):
        """Optimizes the hyperparameters of the current GP model and writes
        the resulting hyperparameters and likelihood to the output."""

        hyps_logger_name = self.output.basename+'hyps'
        self.gp.train(logger_name=hyps_logger_name)

        hyps, labels = Parameters.get_hyps(
            self.gp.hyps_mask, self.gp.hyps, constraint=False, label=True)

        # Fall back to the GP's own labels when none are returned.
        labels = self.gp.hyp_labels if labels is None else labels

        self.output.write_hyps(
            labels, hyps, self.start_time,
            self.gp.likelihood, self.gp.likelihood_gradient,
            hyps_mask=self.gp.hyps_mask)

    def compute_mae(self, gp_frcs, dft_frcs):
        mae = np.mean(np.abs(gp_frcs - dft_frcs))
        mac = np.mean(np.abs(dft_frcs))

        f = logging.getLogger(self.output.basename+'log')
        f.info(f'mean absolute error: {mae:.4f} eV/A')
        f.info(f'mean absolute dft component: {mac:.4f} eV/A')

    def update_positions(self, new_pos: 'ndarray'):
        """Performs a Verlet update of the atomic positions.

        Args:
            new_pos (np.ndarray): Positions of atoms in the next MD frame.
        """
        if self.curr_step in self.rescale_steps:
            rescale_ind = self.rescale_steps.index(self.curr_step)
            temp_fac = self.rescale_temps[rescale_ind] / self.temperature
            vel_fac = np.sqrt(temp_fac)
            self.structure.prev_positions = \
                new_pos - self.velocities * self.dt * vel_fac
        else:
            self.structure.prev_positions = self.structure.positions
        self.structure.positions = new_pos
        self.structure.positions[:] = self.structure.wrap_positions()

    def update_temperature(self):
        """Updates the instantaneous temperature of the system.

        Stores the three values returned by md.calculate_temperature for
        the current structure on the instance as KE, temperature and
        velocities.
        """
        self.KE, self.temperature, self.velocities = \
            md.calculate_temperature(self.structure, self.dt, self.noa)

    def record_state(self):
        """Writes the current MD configuration to the output.

        Passes the timestep, current step index, structure, temperature,
        kinetic energy, run start time, DFT-step flag and velocities to
        self.output.write_md_config. Within the code visible here,
        temperature, KE and velocities are only set by update_temperature
        and start_time only by run, so those must have run beforehand.
        """
        self.output.write_md_config(
            self.dt, self.curr_step, self.structure, self.temperature,
            self.KE, self.start_time, self.dft_step, self.velocities)