def test_subset_of_frame_by_element():
    """Check ``subset_of_frame_by_element`` on a six-atom H/H/O/O/O/C structure."""
    spec_list = ["H", "H", "O", "O", "O", "C"]
    n_atoms = len(spec_list)
    test_struc_1 = Structure(
        cell=np.eye(3),
        species=spec_list,
        positions=np.zeros(shape=(n_atoms, 3)),
    )

    # An empty request, an exact request and an over-sized request are all
    # expected to hand back every index of the structure.
    for request in ({}, {"H": 2, "O": 3}, {"H": 2, "O": 15}):
        chosen = subset_of_frame_by_element(test_struc_1, request)
        assert np.array_equal(chosen, list(range(len(test_struc_1))))

    # A partial request must only ever produce valid atom indices.
    partial = subset_of_frame_by_element(test_struc_1, {"H": 1, "O": 1})
    assert set(partial).issubset(range(n_atoms))

    # One atom per element -> exactly three indices.
    one_each = subset_of_frame_by_element(test_struc_1, {"H": 1, "O": 1, "C": 1})
    assert len(one_each) == 3

    # Zero of everything -> nothing; zero of all but carbon -> the lone C atom.
    nothing = subset_of_frame_by_element(test_struc_1, {"H": 0, "O": 0, "C": 0})
    assert nothing == []
    carbon_only = subset_of_frame_by_element(test_struc_1, {"H": 0, "O": 0, "C": 1})
    assert carbon_only == [5]
def test_subset_of_frame_by_element():
    """Exercise ``subset_of_frame_by_element`` with several per-element requests."""
    spec_list = ["H", "H", "O", "O", "O", "C"]
    origin_positions = np.zeros(shape=(len(spec_list), 3))
    test_struc_1 = Structure(
        cell=np.eye(3), species=spec_list, positions=origin_positions
    )

    # No limits requested: every index of the structure comes back.
    picked = subset_of_frame_by_element(test_struc_1, {})
    assert np.array_equal(picked, list(range(len(test_struc_1))))

    # Limits equal to the number of H and O atoms present: still every index.
    picked = subset_of_frame_by_element(test_struc_1, {"H": 2, "O": 3})
    assert np.array_equal(picked, list(range(len(test_struc_1))))

    # A limit larger than the number of O atoms present: still every index.
    picked = subset_of_frame_by_element(test_struc_1, {"H": 2, "O": 15})
    assert np.array_equal(picked, list(range(len(test_struc_1))))

    # Tighter limits: whatever is picked must be a valid index.
    picked = subset_of_frame_by_element(test_struc_1, {"H": 1, "O": 1})
    assert set(picked).issubset(range(len(spec_list)))

    # One atom of each of the three elements.
    picked = subset_of_frame_by_element(test_struc_1, {"H": 1, "O": 1, "C": 1})
    assert len(picked) == 3

    # All limits zero gives an empty selection.
    picked = subset_of_frame_by_element(test_struc_1, {"H": 0, "O": 0, "C": 0})
    assert picked == []

    # Only carbon allowed: the single C atom sits at index 5.
    picked = subset_of_frame_by_element(test_struc_1, {"H": 0, "O": 0, "C": 1})
    assert picked == [5]
def run(self):
    """
    UPDATE: SOON TO BE DEPRECATED, CIRCA SEPTEMBER 2020

    Step through every ``self.skip``-th frame, predict forces with the
    current model and report the deviation from the forces stored on the
    frame. While still inside the training portion of the trajectory, atoms
    flagged by the uncertainty or force-error checks are added to the
    training set and the model is re-trained once enough atoms have
    accumulated.

    :return: None
    """
    logger = logging.getLogger(self.logger_name)
    logger.debug("Commencing run with pre-run...")
    # The pre-run (seed frames) only applies to an unmapped GP.
    if not self.gp_is_mapped:
        self.pre_run()

    sampled_frames = self.frames[::self.skip]

    # Frames from this index onward are prediction-only (validation of the
    # model); no atoms are added to the training set from them.
    train_frame = int(len(sampled_frames) * (1 - self.validate_ratio))

    atoms_since_train = 0  # atoms added since the last (re-)training
    atoms_since_write = 0  # atoms added since the last checkpoint
    trains_since_write = 0  # trainings done since the last checkpoint

    # frame index -> [(atom index, uncertainty, force error), ...]
    training_plan = {}

    for i, cur_frame in enumerate(sampled_frames):
        frame_start_time = time.time()
        logger.info(f"=====NOW ON FRAME {i}=====")

        # With no predict_atoms_per_element given, this is expected to be
        # every atom of the frame.
        predict_atoms = subset_of_frame_by_element(
            cur_frame, self.predict_atoms_per_element
        )

        # Skipped atoms receive NaN for their force / std values.
        local_energies = None
        shared_kwargs = dict(
            structure=cur_frame,
            write_to_structure=False,
            selective_atoms=predict_atoms,
            skipped_atom_value=np.nan,
        )

        # Three flavours of prediction: mapped GP, GP with energy, GP without.
        if self.gp_is_mapped:
            pred_forces, pred_stds, local_energies = self.pred_func(
                mgp=self.gp, energy=True, **shared_kwargs
            )
        elif self.calculate_energy:
            pred_forces, pred_stds, local_energies = self.pred_func(
                gp=self.gp, n_cpus=self.n_cpus, **shared_kwargs
            )
        else:
            pred_forces, pred_stds = self.pred_func(
                gp=self.gp, n_cpus=self.n_cpus, **shared_kwargs
            )

        # Compare against the reference values stored on the frame.
        dft_forces = cur_frame.forces
        dft_energy = cur_frame.energy
        error = np.abs(pred_forces - dft_forces)

        # Copy of the frame carrying the predicted forces and uncertainties.
        dummy_frame = deepcopy(cur_frame)
        dummy_frame.forces = pred_forces
        dummy_frame.stds = pred_stds
        cur_frame.stds = pred_stds

        self.output.write_gp_dft_comparison(
            curr_step=i,
            frame=dummy_frame,
            start_time=frame_start_time,
            dft_forces=dft_forces,
            dft_energy=dft_energy,
            error=error,
            local_energies=local_energies,
            KE=0,
            cell=cur_frame.cell,
        )

        logger.debug(
            f"Single frame calculation time {time.time()-frame_start_time}"
        )

        # Past the training portion: predictions only.
        if i >= train_frame:
            continue

        # The noise hyperparameter (and hence the relative std tolerance)
        # is not used with a mapped GP.
        noise = (
            0
            if self.gp_is_mapped
            else Parameters.get_noise(
                self.gp.hyps_mask, self.gp.hyps, constraint=False
            )
        )

        std_in_bound, std_train_atoms = is_std_in_bound_per_species(
            rel_std_tolerance=self.rel_std_tolerance,
            abs_std_tolerance=self.abs_std_tolerance,
            noise=noise,
            structure=dummy_frame,
            max_atoms_added=self.max_atoms_from_frame,
            max_by_species=self.train_env_per_species,
        )

        force_in_bound, force_train_atoms = is_force_in_bound_per_species(
            abs_force_tolerance=self.abs_force_tolerance,
            predicted_forces=pred_forces,
            label_forces=dft_forces,
            structure=dummy_frame,
            max_atoms_added=self.max_atoms_from_frame,
            max_by_species=self.train_env_per_species,
            max_force_error=self.max_force_error,
        )

        if not (std_in_bound and force_in_bound):
            # The bound checks use -1 as a placeholder; drop it, and rely on
            # the set to collapse atoms flagged by both checks.
            train_atoms = list(
                set(std_train_atoms).union(force_train_atoms) - {-1}
            )

            # Remember which atoms were picked, with their uncertainty/error.
            force_errors = list(np.abs(pred_forces - dft_forces))
            uncertainties = list(dummy_frame.stds)
            training_plan[int(i)] = [
                (int(a), uncertainties[a], force_errors[a]) for a in train_atoms
            ]

            # Add the flagged atoms to the training set (no training yet).
            self.update_gp_and_print(
                cur_frame,
                train_atoms=train_atoms,
                uncertainties=pred_stds[train_atoms],
                train=False,
            )
            atoms_since_train += len(train_atoms)
            atoms_since_write += len(train_atoms)

            # Re-train once enough atoms were sampled, or on the last
            # training frame, as long as the training budget is not spent.
            retrain_due = (
                atoms_since_train >= self.min_atoms_per_train
                or (i + 1) == train_frame
            )
            if retrain_due and self.train_count < self.max_trains:
                self.train_gp()
                trains_since_write += 1
            else:
                self.gp.update_L_alpha()
            if retrain_due:
                atoms_since_train = 0

            # Decide whether a checkpoint model is written this iteration.
            train_trigger = bool(
                self.train_checkpoint_interval
                and trains_since_write
                and self.train_checkpoint_interval <= trains_since_write
            )
            if train_trigger:
                trains_since_write = 0

            atom_trigger = bool(
                self.atom_checkpoint_interval
                and atoms_since_write
                and self.atom_checkpoint_interval <= atoms_since_write
            )
            if atom_trigger:
                atoms_since_write = 0

            if self.model_format and (train_trigger or atom_trigger):
                self.gp.write_model(
                    f"{self.output_name}_checkpt", self.model_format
                )

        if (i + 1) == train_frame and not self.gp_is_mapped:
            self.gp.check_L_alpha()

    # Report the training statistics of the final model.
    conclusion_strings = [
        "Final GP statistics:" + json.dumps(self.gp.training_statistics)
    ]
    self.output.conclude_run(conclusion_strings)

    if self.print_training_plan:
        with open(f"{self.output_name}_training_plan.json", "w") as f:
            f.write(json.dumps(training_plan, cls=NumpyEncoder))

    if self.model_format and not self.gp_is_mapped:
        self.gp.write_model(f"{self.output_name}_model", self.model_format)
def run_active_learning(
    self,
    frames: Union[List[Structure], Trajectory] = (),
    rel_std_tolerance: float = 4,
    abs_std_tolerance: float = 0,
    abs_force_tolerance: float = 0.15,
    min_atoms_per_train: int = 200,
    max_force_error: float = inf,
    max_atoms_from_frame: int = inf,
    max_trains: int = inf,
    max_model_size: int = inf,
    max_elts_per_frame: Dict[str, int] = None,
    max_model_elts: Dict[str, int] = None,
    predict_atoms_per_elt: Dict[str, int] = None,
    write_model_train_interval: int = 1,
    write_model_atom_interval: int = 100,
    validate_ratio: float = 0,
    post_write: bool = True,
):
    """
    Loop through ``frames``, compare model predictions against the forces
    stored on each frame, and add atoms to the training set whenever
    ``evaluate_training_atoms`` reports the frame as out of bounds.

    If the model is empty, one atom of each element of the first frame is
    added beforehand through ``run_passive_learning``.

    :param frames: Frames to process. A list is deep-copied and wrapped in
        a ``Trajectory``.
    :param rel_std_tolerance: Forwarded to ``evaluate_training_atoms``.
    :param abs_std_tolerance: Forwarded to ``evaluate_training_atoms``.
    :param abs_force_tolerance: Forwarded to ``evaluate_training_atoms``.
    :param min_atoms_per_train: Number of atoms accumulated since the last
        training that triggers a re-training.
    :param max_force_error: Forwarded to ``evaluate_training_atoms``.
    :param max_atoms_from_frame: Forwarded to ``evaluate_training_atoms``;
        set to 0 internally once the model is saturated.
    :param max_trains: Upper bound on the number of trainings performed by
        this call, regardless of which condition triggered the training.
    :param max_model_size: Atoms are only added while ``len(self.gp)`` plus
        the number of selected atoms does not exceed this value.
    :param max_elts_per_frame: Forwarded to ``evaluate_training_atoms``.
    :param max_model_elts: Forwarded to ``evaluate_training_atoms``.
    :param predict_atoms_per_elt: Forwarded to
        ``subset_of_frame_by_element`` to choose the atoms predicted on.
    :param write_model_train_interval: Forwarded to
        ``self.write_model_decision``.
    :param write_model_atom_interval: Forwarded to
        ``self.write_model_decision``.
    :param validate_ratio: Fraction of the trajectory, taken from its end,
        on which predictions are made but no atoms are added.
    :param post_write: Write the final model at the end of the run (only
        when a model format is set and the GP is not mapped).
    :return: None
    """
    logger = logging.getLogger(self.logger_name)

    # Seed an empty model from the first frame.
    if len(self.gp) == 0:
        logger.warning(
            "You are attempting active learning with an empty model. "
            "One atom of each element will be added from the first frame, "
            "but be warned: Hyperparameter optimzation on a very small "
            "subset of data can lead to suboptimal training set "
            "choices, as the hyperparameters will take time to become "
            "representative of their converged state relative to your data of "
            "interest.")
        self.run_passive_learning(
            frames[0:1],
            max_model_elts={elt: 1 for elt in frames[0].species_labels})

    if isinstance(frames, list):
        frames = Trajectory(deepcopy(frames))

    # Frames from this index onward are used for validation only.
    train_frame = int(len(frames) * (1 - validate_ratio))

    train_model_atom_counter = 0  # Track atoms added for training
    write_model_atom_counter = 0  # Track atoms added for writing
    train_counter = 0  # Track # of times training done

    # Keep track of which atoms trigger force / uncertainty condition
    training_plan = {}

    # MAIN LOOP - Frames
    for i, cur_frame in enumerate(frames):
        frame_start_time = time.time()
        logger.info(f"=====NOW ON FRAME {i}=====")

        # If no predict_atoms_per_elt was specified, predict_atoms
        # is expected to be every atom in the frame.
        predict_atoms = subset_of_frame_by_element(cur_frame,
                                                   predict_atoms_per_elt)

        # Atoms which are skipped will have NaN as their force / std values
        local_energies = None

        # Three different predictions: Either MGP, GP with energy,
        # or GP without
        if self.gp_is_mapped:
            pred_forces, pred_stds, local_energies = self.pred_func(
                structure=cur_frame,
                mgp=self.gp,
                write_to_structure=False,
                selective_atoms=predict_atoms,
                skipped_atom_value=np.nan,
                energy=True,
            )
        elif self.calculate_energy:
            pred_forces, pred_stds, local_energies = self.pred_func(
                structure=cur_frame,
                gp=self.gp,
                n_cpus=self.n_cpus,
                write_to_structure=False,
                selective_atoms=predict_atoms,
                skipped_atom_value=np.nan,
            )
        else:
            pred_forces, pred_stds = self.pred_func(
                structure=cur_frame,
                gp=self.gp,
                n_cpus=self.n_cpus,
                write_to_structure=False,
                selective_atoms=predict_atoms,
                skipped_atom_value=np.nan,
            )

        # Get Error
        dft_forces = cur_frame.forces
        dft_energy = cur_frame.energy
        force_error = np.abs(pred_forces - dft_forces)

        # Create dummy frame with the predicted forces written
        dummy_frame = deepcopy(cur_frame)
        dummy_frame.forces = pred_forces
        dummy_frame.stds = pred_stds
        cur_frame.stds = pred_stds

        self.output.write_gp_dft_comparison(
            curr_step=i,
            frame=dummy_frame,
            start_time=frame_start_time,
            dft_forces=dft_forces,
            dft_energy=dft_energy,
            error=force_error,
            local_energies=local_energies,
            KE=0,
            cell=cur_frame.cell,
        )

        logger.debug(
            f"Single frame calculation time {time.time()-frame_start_time}"
        )

        if i < train_frame:
            # Noise hyperparameter & relative std tolerance is not for
            # gp_is_mapped.
            if self.gp_is_mapped:
                noise = 0
            else:
                noise = Parameters.get_noise(self.gp.hyps_mask,
                                             self.gp.hyps,
                                             constraint=False)

            in_bound, train_atoms = evaluate_training_atoms(
                rel_std_tolerance=rel_std_tolerance,
                abs_std_tolerance=abs_std_tolerance,
                noise=noise,
                abs_force_tolerance=abs_force_tolerance,
                max_force_error=max_force_error,
                pred_forces=pred_forces,
                dft_forces=dft_forces,
                structure=dummy_frame,
                max_model_elts=max_model_elts,
                max_atoms_from_frame=max_atoms_from_frame,
                max_elts_per_frame=max_elts_per_frame,
                training_statistics=self.gp.training_statistics,
            )

            # Protocol for adding atoms to training set
            if not in_bound:
                # Record frame and training atoms, uncertainty, error
                force_errors = list(np.abs(pred_forces - dft_forces))
                uncertainties = list(dummy_frame.stds)
                training_plan[int(i)] = [(int(a), uncertainties[a],
                                          force_errors[a])
                                         for a in train_atoms]

                # A mapped GP is never updated; only the plan is recorded.
                if self.gp_is_mapped:
                    continue

                if len(self.gp) + len(train_atoms) <= max_model_size:
                    self.update_gp_and_print(
                        cur_frame,
                        train_atoms=train_atoms,
                        uncertainties=pred_stds[train_atoms],
                        train=False,
                    )
                else:
                    logger.info(
                        f"GP is at maximum model size of {max_model_size}. "
                        f"No further atoms will be added for "
                        f"remainder of run, but predictions will still be "
                        f"made. Setting max_atoms_from_frame "
                        f"to 0.")
                    max_atoms_from_frame = 0
                    if self.model_format:
                        self.gp.write_model(
                            f"{self.output_name}_saturated",
                            self.model_format)

                train_model_atom_counter += len(train_atoms)
                write_model_atom_counter += len(train_atoms)

                # Re-train if number of sampled atoms is high enough, or on
                # the last training frame. The max_trains budget applies to
                # BOTH triggers: `and` binds tighter than `or`, so the
                # triggers are grouped explicitly before the budget check,
                # and `<` (not `<=`) keeps the total at max_trains.
                retrain_due = (
                    train_model_atom_counter >= min_atoms_per_train
                    or (i + 1) == train_frame
                )
                if retrain_due and train_counter < max_trains:
                    self.train_gp()
                    train_counter += 1
                    train_model_atom_counter = 0
                else:
                    self.gp.update_L_alpha()

                written = self.write_model_decision(
                    write_model_train_interval,
                    write_model_atom_counter,
                    write_model_atom_interval,
                    train_counter,
                )
                if written:
                    write_model_atom_counter = 0

    # Print training statistics for GP model used
    conclusion_strings = [
        "Final GP statistics:" + json.dumps(self.gp.training_statistics)
    ]
    self.output.conclude_run(conclusion_strings)

    if self.print_training_plan:
        with open(f"{self.output_name}_training_plan.json", "w") as f:
            f.write(json.dumps(training_plan, cls=NumpyEncoder))

    if self.model_format and post_write and not self.gp_is_mapped:
        self.gp.write_model(f"{self.output_name}_model", self.model_format)
def active_run(self):
    """
    Loop through frames and record the error between
    the GP predictions and the ground-truth forces. Train the GP and update
    the training set upon the triggering of the uncertainty or force error
    threshold.

    Frames are pulled one at a time from ``self.get_next_active_frame()``
    until it returns ``None``. Whether a frame may contribute training
    atoms, whether to re-train, and whether to check L/alpha are delegated
    to ``self.decide_to_update_db()``, ``self.decide_to_train()`` and
    ``self.decide_to_checkLalpha()`` respectively.

    :return: None
    """
    logger = logging.getLogger(self.logger_name)
    logger.debug("Commencing run with pre-run...")
    if not self.mgp:
        if len(self.gp) == 0:
            logger.warning("You are attempting to train a model with no "
                           "data in your Gausian Process; it is "
                           "recommended that you begin with "
                           "a passive training process.")
        self.preparation_for_active_run()

    # Loop through trajectory.
    self.cur_atoms_added_train = 0  # Track atoms added for training
    cur_atoms_added_write = 0  # Track atoms added for writing
    cur_trains_done_write = 0  # Track training done for writing

    self.curr_active_frame_index = -1
    cur_frame = self.get_next_active_frame()
    while cur_frame is not None:
        frame_start_time = time.time()
        logger.info(
            f"=====NOW ON FRAME {self.curr_active_frame_index}=====")

        # If no predict_atoms_per_element was specified, predict_atoms
        # is expected to be every atom in the frame.
        predict_atoms = subset_of_frame_by_element(
            cur_frame, self.predict_atoms_per_element)

        # Atoms which are skipped will have NaN as their force / std values
        local_energies = None

        # Three different predictions: Either MGP, GP with energy,
        # or GP without
        if self.mgp:
            pred_forces, pred_stds, local_energies = self.pred_func(
                structure=cur_frame,
                mgp=self.gp,
                write_to_structure=False,
                selective_atoms=predict_atoms,
                skipped_atom_value=np.nan,
                energy=True,
            )
        elif self.calculate_energy:
            pred_forces, pred_stds, local_energies = self.pred_func(
                structure=cur_frame,
                gp=self.gp,
                n_cpus=self.n_cpus,
                write_to_structure=False,
                selective_atoms=predict_atoms,
                skipped_atom_value=np.nan,
            )
        else:
            pred_forces, pred_stds = self.pred_func(
                structure=cur_frame,
                gp=self.gp,
                n_cpus=self.n_cpus,
                write_to_structure=False,
                selective_atoms=predict_atoms,
                skipped_atom_value=np.nan,
            )

        # Get Error
        dft_forces = cur_frame.forces
        dft_energy = cur_frame.energy
        error = np.abs(pred_forces - dft_forces)

        # Create dummy frame with the predicted forces written
        dummy_frame = deepcopy(cur_frame)
        dummy_frame.forces = pred_forces
        dummy_frame.stds = pred_stds

        # Pass the time at which this frame's processing began, so the
        # writer receives the frame's start rather than the current instant.
        self.output.write_gp_dft_comparison(
            curr_step=self.curr_active_frame_index,
            frame=dummy_frame,
            start_time=frame_start_time,
            dft_forces=dft_forces,
            dft_energy=dft_energy,
            error=error,
            local_energies=local_energies,
            KE=0,
        )

        logger.debug(
            f"Single frame calculation time {time.time()-frame_start_time}"
        )

        if self.decide_to_update_db():
            # Noise hyperparameter & relative std tolerance is not for mgp.
            if self.mgp:
                noise = 0
            else:
                noise = Parameters.get_noise(self.gp.hyps_mask,
                                             self.gp.hyps,
                                             constraint=False)

            std_in_bound, std_train_atoms = is_std_in_bound_per_species(
                rel_std_tolerance=self.rel_std_tolerance,
                abs_std_tolerance=self.abs_std_tolerance,
                noise=noise,
                structure=dummy_frame,
                max_atoms_added=self.max_atoms_from_frame,
                max_by_species=self.train_env_per_species,
            )

            # Get max force error atoms
            force_in_bound, force_train_atoms = is_force_in_bound_per_species(
                abs_force_tolerance=self.abs_force_tolerance,
                predicted_forces=pred_forces,
                label_forces=dft_forces,
                structure=dummy_frame,
                max_atoms_added=self.max_atoms_from_frame,
                max_by_species=self.train_env_per_species,
                max_force_error=self.max_force_error,
            )

            if not std_in_bound or not force_in_bound:
                # -1 is returned from the is_in_bound methods,
                # so filter that out and then use sets to remove repeats
                train_atoms = list(
                    set(std_train_atoms).union(force_train_atoms) - {-1})

                # Compute mae and write to output;
                # Add max uncertainty atoms to training set
                self.update_gp_and_print(
                    cur_frame,
                    train_atoms=train_atoms,
                    uncertainties=pred_stds[train_atoms],
                    train=False,
                )
                self.cur_atoms_added_train += len(train_atoms)
                cur_atoms_added_write += len(train_atoms)

                # Re-train if the training policy says so; otherwise only
                # refresh L and alpha for the enlarged training set.
                if self.decide_to_train():
                    self.train_gp()
                    cur_trains_done_write += 1
                    self.cur_atoms_added_train = 0
                else:
                    self.gp.update_L_alpha()

                # Decide whether a model should be written this iteration
                will_write = False

                if (self.train_checkpoint_interval
                        and cur_trains_done_write
                        and self.train_checkpoint_interval
                        <= cur_trains_done_write):
                    will_write = True
                    cur_trains_done_write = 0

                if (self.atom_checkpoint_interval
                        and cur_atoms_added_write
                        and self.atom_checkpoint_interval
                        <= cur_atoms_added_write):
                    will_write = True
                    cur_atoms_added_write = 0

                if self.model_format and will_write:
                    self.gp.write_model(f"{self.output_name}_checkpt",
                                        self.model_format)

            if self.decide_to_checkLalpha():
                self.gp.check_L_alpha()

        cur_frame = self.get_next_active_frame()

    self.output.conclude_run()

    if self.model_format and not self.mgp:
        self.gp.write_model(f"{self.output_name}_model", self.model_format)