Example #1
0
def test_element_to_Z():
    for i in range(120):
        assert element_to_Z(i) == i

    assert element_to_Z("1") == 1
    assert element_to_Z(int(1.0)) == 1

    for pair in zip(["H", "C", "O", "Og"], [1, 6, 8, 118]):
        assert element_to_Z(pair[0]) == pair[1]
Example #2
0
def test_element_to_Z():
    for i in range(120):
        assert element_to_Z(i) == i

    assert element_to_Z('1') == 1
    assert element_to_Z(np.int64(1.0)) == 1  # np.int was removed in NumPy 1.24; use np.int64

    for pair in zip(['H', 'C', 'O', 'Og'], [1, 6, 8, 118]):
        assert element_to_Z(pair[0]) == pair[1]
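
The two tests above pin down the contract of element_to_Z: integers (including NumPy integer scalars) pass through unchanged, numeric strings are cast to their integer value, and chemical symbols map to atomic numbers. A minimal sketch of that behavior, with a deliberately truncated symbol table, is given below; this is an illustration of what the tests assume, not flare's actual implementation.

import numpy as np

# Illustrative only: a tiny symbol table standing in for the full periodic table.
_SYMBOL_TO_Z = {"H": 1, "C": 6, "O": 8, "Og": 118}

def element_to_Z_sketch(element):
    # Integers (including numpy integer types) pass through unchanged.
    if isinstance(element, (int, np.integer)):
        return int(element)
    # Numeric strings such as "1" are cast to their integer value.
    if isinstance(element, str) and element.isnumeric():
        return int(element)
    # Chemical symbols map to atomic numbers; flare additionally warns on
    # unrecognized strings like "Fe2" (see test_elt_warning below).
    return _SYMBOL_TO_Z[element]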
Example #3
0
    def __init__(
        self,
        grid_params: dict,
        unique_species: list = [],
        GP: GaussianProcess = None,
        var_map: str = None,
        container_only: bool = True,
        lmp_file_name: str = "lmp",
        n_cpus: int = None,
        n_sample: int = 10,
    ):

        # load all arguments as attributes
        self.var_map = var_map
        self.lmp_file_name = lmp_file_name
        self.n_cpus = n_cpus
        self.n_sample = n_sample
        self.grid_params = grid_params
        self.species_labels = []
        self.coded_species = []

        self.hyps_mask = None
        self.cutoffs = None
        self.training_statistics = None

        species_labels = []
        coded_species = []
        for ele in unique_species:
            if isinstance(ele, str):
                species_labels.append(ele)
                coded_species.append(element_to_Z(ele))
            elif isinstance(ele, int):
                coded_species.append(ele)
                species_labels.append(Z_to_element(ele))
            else:
                print("element type not accepted", ele, type(ele))
        sort_id = np.argsort(coded_species)
        for i in sort_id:
            self.coded_species.append(coded_species[i])
            self.species_labels.append(species_labels[i])

        self.load_grid = grid_params.get("load_grid", None)
        self.update = grid_params.get("update", False)
        self.lower_bound_relax = grid_params.get("lower_bound_relax", 0.1)

        self.maps = {}

        optional_xb_params = ["lower_bound", "upper_bound", "svd_rank"]
        for key in grid_params:
            if "body" in key:
                if "twobody" == key:
                    mapxbody = Map2body
                elif "threebody" == key:
                    mapxbody = Map3body
                else:
                    raise KeyError("Only 'twobody' & 'threebody' are allowed")

                xb_dict = grid_params[key]

                # set to 'auto' if the param is not given
                args = {}
                for oxp in optional_xb_params:
                    args[oxp] = xb_dict.get(oxp, "auto")
                args["grid_num"] = xb_dict.get("grid_num", None)

                for k in xb_dict:
                    args[k] = xb_dict[k]

                xb_maps = mapxbody(**args, **self.__dict__)
                self.maps[key] = xb_maps
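
For reference, here is a hypothetical grid_params dictionary consistent with the parsing above: top-level options read via grid_params.get(...), plus one sub-dictionary per mapped term whose key contains "body". The particular values are illustrative, not defaults from the source.

grid_params = {
    "load_grid": None,          # read via grid_params.get("load_grid", None)
    "update": False,
    "lower_bound_relax": 0.1,
    "twobody": {                # dispatched to Map2body
        "grid_num": [64],
        "lower_bound": "auto",  # optional_xb_params default to "auto" anyway
        "upper_bound": "auto",
        "svd_rank": "auto",
    },
    "threebody": {              # dispatched to Map3body
        "grid_num": [16, 16, 16],
    },
}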
Example #4
0
    def __init__(
        self,
        frames: List[Structure] = None,
        gp: Union[GaussianProcess, MappedGaussianProcess] = None,
        rel_std_tolerance: float = 4,
        abs_std_tolerance: float = 1,
        abs_force_tolerance: float = 0,
        max_force_error: float = inf,
        parallel: bool = False,
        n_cpus: int = 1,
        skip: int = 1,
        validate_ratio: float = 0.0,
        calculate_energy: bool = False,
        include_energies: bool = False,
        output_name: str = "gp_from_aimd",
        print_as_xyz: bool = False,
        pre_train_max_iter: int = 50,
        max_atoms_from_frame: int = np.inf,
        max_trains: int = np.inf,
        min_atoms_per_train: int = 1,
        shuffle_frames: bool = False,
        verbose: str = "INFO",
        pre_train_on_skips: int = -1,
        pre_train_seed_frames: List[Structure] = None,
        pre_train_seed_envs: List[Tuple[AtomicEnvironment, "np.array"]] = None,
        pre_train_atoms_per_element: dict = None,
        train_atoms_per_element: dict = None,
        predict_atoms_per_element: dict = None,
        train_checkpoint_interval: int = 1,
        checkpoint_interval: int = 1,
        atom_checkpoint_interval: int = 100,
        print_training_plan: bool = True,
        model_format: str = "pickle",
    ):
        """
        Class which trains a GP off of an AIMD trajectory, and generates
        error statistics between the DFT and GP calls.

        There are a variety of options which can give you a finer control
        over the training process.

        :param frames: List of structures to evaluate / train GP on
        :param gp: Gaussian Process object
        :param rel_std_tolerance: Train if uncertainty is above this *
            noise variance hyperparameter
        :param abs_std_tolerance: Train if uncertainty is above this
        :param abs_force_tolerance: Add atom if force error exceeds this
        :param max_force_error: Don't add atom if force error exceeds this
        :param parallel: Use parallel functions or not
        :param validate_ratio: Fraction of frames used for validation
        :param skip: Skip through frames
        :param calculate_energy: Use local energy kernel or not
        :param include_energies: Add energies associated with individual frames
        :param output_name: Write output of training to this file
        :param print_as_xyz: If True, print the configurations in xyz format
        :param max_atoms_from_frame: Largest # of atoms added from one frame
        :param min_atoms_per_train: Only train when this many atoms have been
            added
        :param max_trains: Stop training GP after this many calls to train
        :param n_cpus: Number of CPUs to use when parallelizing predictions
                over atoms
        :param shuffle_frames: Randomize order of frames for better training
        :param verbose: same as logging level, "WARNING", "INFO", "DEBUG"
        :param pre_train_on_skips: Train model on every n-th frame before running
        :param pre_train_seed_frames: Frames to train on before running
        :param pre_train_seed_envs: Environments to train on before running
        :param pre_train_atoms_per_element: Max # of environments to add from
            each species in the seed pre-training steps
        :param train_atoms_per_element: Max # of environments to add from
            each species in the training steps
        :param predict_atoms_per_element: Choose a random subset of N random
            atoms from each specified element to predict on. For instance,
            {"H":5} will only predict the forces and uncertainties
            associated with 5 Hydrogen atoms per frame. Elements not
            specified will be predicted as normal. This is useful for
            systems where you are most interested in a subset of elements.
            This will result in a faster but less exhaustive learning process.
        :param checkpoint_interval: Will be deprecated. Same as
                            train_checkpoint_interval
        :param train_checkpoint_interval: How often to write model after
                        trainings
        :param atom_checkpoint_interval: How often to write model after atoms are
            added (since atoms may be added without training)
        :param model_format: Format to write GP model to
        :param print_training_plan: Write which atoms in which frames that
            triggered uncertainty or force conditions, so that training can
            be 'fast-forwarded' later. Also useful for gauging MGP results and
            then applying the atoms with high uncertainty and error to a GP.
        """

        # Set up parameters
        self.frames = frames
        if shuffle_frames:
            np.random.shuffle(frames)
            if print_training_plan:
                warnings.warn("Frames are shuffled so training plan will not"
                              " map onto the structures used; Try to "
                              "shuffle the frames outside of the GPFA module "
                              "for now.")

        # GP Training and Execution parameters
        self.gp = gp
        # Check to see if GP is MGP for later flagging
        self.gp_is_mapped = isinstance(gp, MappedGaussianProcess)
        self.rel_std_tolerance = rel_std_tolerance
        self.abs_std_tolerance = abs_std_tolerance
        self.abs_force_tolerance = abs_force_tolerance
        self.max_force_error = max_force_error
        self.max_trains = max_trains
        self.max_atoms_from_frame = max_atoms_from_frame
        self.min_atoms_per_train = min_atoms_per_train
        self.predict_atoms_per_element = predict_atoms_per_element
        self.train_count = 0
        self.calculate_energy = calculate_energy
        self.n_cpus = n_cpus
        self.include_energies = include_energies

        if parallel is True:
            warnings.warn(
                "Parallel flag will be deprecated;we will instead use n_cpu alone.",
                DeprecationWarning,
            )

        # Set prediction function based on whether forces or energies are
        # desired, and parallelize accordingly
        if self.gp_is_mapped:
            self.pred_func = predict_on_structure_mgp
            self.pred_func_env = self.gp.predict
        else:
            if calculate_energy:
                self.pred_func = predict_on_structure_par_en
            else:
                self.pred_func = predict_on_structure_par
            self.pred_func_env = self.gp.predict_force_xyz

        # Parameters for negotiating with the training frames

        # To later be filled in using the time library
        self.start_time = None

        self.skip = skip
        assert (isinstance(skip, int)
                and skip >= 1), "Skip needs to be a positive integer."
        self.validate_ratio = validate_ratio
        assert 0 <= validate_ratio <= 1, "validate_ratio needs to be [0,1]"

        # Set up for pretraining
        self.pre_train_max_iter = pre_train_max_iter
        self.pre_train_on_skips = pre_train_on_skips
        self.seed_envs = [] if pre_train_seed_envs is None else pre_train_seed_envs
        self.seed_frames = ([] if pre_train_seed_frames is None else
                            pre_train_seed_frames)

        self.pre_train_env_per_species = ({} if
                                          pre_train_atoms_per_element is None
                                          else pre_train_atoms_per_element)
        self.train_env_per_species = ({} if train_atoms_per_element is None
                                      else train_atoms_per_element)

        # Convert to Coded Species
        if self.pre_train_env_per_species:
            pre_train_species = list(self.pre_train_env_per_species.keys())
            for key in pre_train_species:
                self.pre_train_env_per_species[element_to_Z(
                    key)] = self.pre_train_env_per_species[key]

        # Output parameters
        self.output = Output(output_name,
                             verbose,
                             print_as_xyz=print_as_xyz,
                             always_flush=True)
        self.logger_name = self.output.basename + "log"
        self.train_checkpoint_interval = (train_checkpoint_interval
                                          or checkpoint_interval)
        self.atom_checkpoint_interval = atom_checkpoint_interval

        self.model_format = model_format
        self.output_name = output_name
        self.print_training_plan = print_training_plan

        # Defining variables to be used later
        self.curr_step = 0
        self.train_count = 0
        self.start_time = time.time()
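
A hypothetical instantiation consistent with the signature above; the class name and the frames/gp objects are assumptions, since the source shows only __init__.

trainer = TrajectoryTrainer(          # class name assumed from context
    frames=frames,                    # List[Structure] from an AIMD run
    gp=gp,                            # trained or empty GaussianProcess
    rel_std_tolerance=4,              # train when std > 4 * noise hyperparameter
    abs_force_tolerance=0.05,         # only add atoms the GP gets wrong
    validate_ratio=0.1,               # hold out 10% of frames for validation
    predict_atoms_per_element={"H": 5},  # spot-check 5 H atoms per frame
    output_name="gp_from_aimd",
)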
Example #5
0
    def run_passive_learning(
        self,
        frames: List[Structure] = (),
        environments: List[AtomicEnvironment] = (),
        max_atoms_per_frame: int = np.inf,
        post_training_iterations: int = 0,
        post_build_matrices: bool = False,
        max_elts_per_frame: Dict[str, int] = None,
        max_model_size: int = np.inf,
        max_model_elts: Dict[str, int] = None,
    ):
        """
        Various tasks to set up the AIMD training before commencing
        the run through the AIMD trajectory.

        If you want to skip frames, slice the input as
        frames[::skip_n].

        If you want to randomize the frame order, try the random module's shuffle function.

        Loads the GP with the seed frames and
        environments. ALL environments passed in will be added. Randomly chosen
        atoms from each frame will be added. If no seed frames or environments
        are given and the GP has no training set, it is seeded with at least
        one atom from each element.
        """

        if self.gp_is_mapped:
            raise NotImplementedError(
                "Passive learning not yet configured for MGP")
        if max_elts_per_frame is None:
            max_elts_per_frame = dict()
        if max_model_elts is None:
            max_model_elts = dict()

        logger = logging.getLogger(self.logger_name)
        logger.debug("Beginning passive learning.")
        # If seed environments were passed in, add them to the GP.

        for env in environments:
            self.gp.add_one_env(env, env.force, train=False)

        # Ensure compatibility with number / symbol elemental notation
        for cur_dict in [max_elts_per_frame, max_model_elts]:
            for key in list(cur_dict.keys()):
                if isinstance(key, int):
                    cur_dict[Z_to_element(key)] = cur_dict[key]
                elif isinstance(key, str):
                    cur_dict[element_to_Z(key)] = cur_dict[key]

        # Main frame loop
        total_added = 0
        for frame in frames:
            current_stats = self.gp.training_statistics
            available_to_add = max_model_size - current_stats["N"]

            train_atoms = []
            for species_i in set(frame.coded_species):
                # Get a randomized set of atoms of species i from the frame
                # So that it is not always the lowest-indexed atoms chosen
                elt = Z_to_element(species_i)
                atoms_of_specie = frame.indices_of_specie(species_i)
                n_at = len(atoms_of_specie)
                # Determine how many to add based on user defined cutoffs
                n_add = min(
                    n_at,
                    max_elts_per_frame.get(species_i, np.inf),
                    max_atoms_per_frame - len(train_atoms),
                    available_to_add - len(train_atoms),
                    max_model_elts.get(elt, np.inf) -
                    current_stats["envs_by_species"].get(elt, 0),
                )
                n_add = int(max(0, n_add))  # min() may yield a float when np.inf bounds are involved

                train_atoms += sample(atoms_of_specie, n_add)
                available_to_add -= n_add
                total_added += n_add

            self.update_gp_and_print(
                frame=frame,
                train_atoms=train_atoms,
                uncertainties=[],
                train=False,
            )

        logger = logging.getLogger(self.logger_name)
        logger.info(f"Added {total_added} atoms to "
                    "GP.\n"
                    "Current GP Statistics: "
                    f"{json.dumps(self.gp.training_statistics)} ")

        if post_training_iterations:
            logger.debug("Now commencing pre-run training of GP (which has "
                         "non-empty training set)")
            time0 = time.time()
            self.train_gp(max_iter=post_training_iterations)
            logger.debug(f"Done train_gp {time.time() - time0}")
        elif post_build_matrices:
            logger.debug(
                "Now commencing pre-run set up of GP (which has non-empty training set)"
            )
            time0 = time.time()
            self.gp.check_L_alpha()
            logger.debug(f"Done check_L_alpha {time.time() - time0}")
Example #6
0
    def summarize_group(self, group_type):
        """Sort and combine all the previous definition to internal varialbes

        Args:
            group_type (str): species, twobody, threebody, cut3b, manybody
        """

        aeg = self.all_group_names[group_type]
        nspecie = self.n["specie"]

        # the specie group needs special sorting
        if group_type == "specie":

            self.nspecie = nspecie
            self.specie_mask = np.ones(118, dtype=int) * (nspecie - 1)  # np.int was removed in NumPy 1.24

            # mark the species_mask with atom type
            # default is nspecie-1
            for idt in range(self.nspecie):
                for ele in self.groups["specie"][idt]:
                    atom_n = element_to_Z(ele)
                    if atom_n >= len(self.specie_mask):
                        # grow the mask so index atom_n is valid
                        new_mask = np.ones(atom_n + 1, dtype=int) * (nspecie - 1)
                        new_mask[: len(self.specie_mask)] = self.specie_mask
                        self.specie_mask = new_mask
                    self.specie_mask[atom_n] = idt
                    self.logger.debug(
                        f"elemtn {ele} is defined as type {idt} with name {aeg[idt]}"
                    )
            self.logger.debug(f"All the remaining elements are left as type {idt}")

        elif group_type in self.all_group_types:

            if self.n[group_type] == 0:
                self.logger.debug(f"{group_type} is not defined. Skipped")
                return

            if (group_type not in self.kernels) and (
                group_type in ParameterHelper.all_kernel_types
            ):
                self.kernels.append(group_type)

            self.mask[group_type] = np.ones(
                nspecie ** ParameterHelper.ndim[group_type], dtype=int
            ) * (self.n[group_type] - 1)

            self.hyps_sig[group_type] = []
            self.hyps_ls[group_type] = []
            all_opt_sig = []
            all_opt_ls = []

            for idt in range(self.n[group_type]):
                name = aeg[idt]
                for ele_list in self.groups[group_type][idt]:
                    # generate all possible permutations
                    perms = list(permutations(ele_list))
                    for perm in perms:  # renamed to avoid shadowing ele_list
                        mask_id = 0
                        for ele in perm:
                            mask_id += ele
                            mask_id *= nspecie
                        mask_id = mask_id // nspecie
                        self.mask[group_type][mask_id] = idt
                    def_str = "-".join(map(str, ele_list))
                    self.logger.debug(
                        f"{group_type} {def_str} is defined as type {idt} "
                        f"with name {name}"
                    )

                if group_type not in self.cutoff_types:
                    sig = self.sigma.get(name, -1)
                    opt_sig = self.opt.get(name + "sig", True)
                    if sig == -1:
                        sig = self.sigma.get(group_type, -1)
                        opt_sig = self.opt.get(group_type + "sig", True)
                    if sig == -1:
                        sig = self.universal.get("sigma", -1)
                        opt_sig = self.opt.get("sigma", True)

                    ls = self.ls.get(name, -1)
                    opt_ls = self.opt.get(name + "ls", True)
                    if ls == -1:
                        ls = self.ls.get(group_type, -1)
                        opt_ls = self.opt.get(group_type + "ls", True)
                    if ls == -1:
                        ls = self.universal.get("lengthscale", -1)
                        opt_ls = self.opt.get("lengthscale", True)

                    if sig < 0 or ls < 0:
                        self.logger.error(
                            f"hyperparameters for group {name} are not defined"
                        )
                        raise RuntimeError
                    self.hyps_sig[group_type] += [sig]
                    self.hyps_ls[group_type] += [ls]
                    all_opt_sig += [opt_sig]
                    all_opt_ls += [opt_ls]
                    self.logger.debug(
                        f"   using hyper-parameters of {sig:6.2g} "
                        f"{ls:6.2g} {opt_sig} {opt_ls}"
                    )
            self.hyps_opt[group_type] = all_opt_sig + all_opt_ls
            self.logger.debug(f"All the remaining elements are left as type {idt}")

            # sort out the cutoffs
            if group_type in self.cutoff_types:
                universal_cutoff = self.universal.get(
                    "cutoff_" + self.cutoff_types[group_type], 0
                )
            else:
                universal_cutoff = self.universal.get("cutoff_" + group_type, 0)

            allcut = []
            alldefine = True
            for idt in range(self.n[group_type]):
                if aeg[idt] in self.all_cutoff:
                    allcut += [self.all_cutoff[aeg[idt]]]
                else:
                    alldefine = False
                    self.logger.info(
                        f"{aeg[idt]} cutoff is not defined. "
                        "It will fall back to the universal cutoff."
                    )
            if group_type not in self.cutoff_types_values:

                if len(allcut) > 0:
                    if universal_cutoff <= 0:
                        universal_cutoff = np.max(allcut)
                        self.logger.info(
                            f"universal cutoff for {group_type} was not defined;"
                            f" resetting it to {universal_cutoff}"
                        )

                    self.cutoff_list[group_type] = []
                    for idt in range(self.n[group_type]):
                        self.cutoff_list[group_type] += [
                            self.all_cutoff.get(aeg[idt], universal_cutoff)
                        ]
                    self.cutoff_list[group_type] = np.array(
                        self.cutoff_list[group_type], dtype=float
                    )

                    max_cutoff = np.max(self.cutoff_list[group_type])

                    # update the universal cutoff so it is no smaller than any group cutoff
                    if alldefine:
                        universal_cutoff = max_cutoff
                        self.logger.info(
                            f"universal cutoff is updated to {universal_cutoff}"
                        )
                    elif not np.any(self.cutoff_list[group_type] - max_cutoff):
                        # if not all the cutoffs are defined separately
                        # and they are all the same value
                        del self.cutoff_list[group_type]
                        universal_cutoff = max_cutoff
                        if group_type in self.cutoff_types:
                            self.n[group_type] = 0
                        self.logger.info(
                            f"universal cutoff is updated to {universal_cutoff}"
                        )

            else:
                if universal_cutoff <= 0 and len(allcut) > 0:
                    universal_cutoff = np.max(allcut)
                    self.logger.info(
                        "threebody universal cutoff is updated to "
                        f"{universal_cutoff}, but the separate definitions will "
                        "be ignored"
                    )

            if universal_cutoff > 0:
                if group_type in self.cutoff_types:
                    keyname = "cutoff_" + self.cutoff_types[group_type]
                else:
                    keyname = "cutoff_" + group_type
                self.universal[keyname] = universal_cutoff
            else:
                self.logger.error(f"cutoffs for {group_type} is undefined")
                raise RuntimeError

        else:
            pass
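
The permutation loop in summarize_group encodes each tuple of specie ids as a single flat index in base nspecie, which is how a 1-D mask of length nspecie ** ndim can cover every element combination. A worked sketch of that encoding:

def flat_mask_index(type_tuple, nspecie):
    # Horner-style base-`nspecie` encoding, equivalent to the
    # add-then-multiply-then-floor-divide loop in summarize_group.
    mask_id = 0
    for t in type_tuple:
        mask_id = mask_id * nspecie + t
    return mask_id

# For nspecie=3, the pair (1, 2) lands at index 1*3 + 2 == 5.
assert flat_mask_index((1, 2), nspecie=3) == 5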
Example #7
0
    def __init__(
        self,
        cell: "ndarray",
        species: Union[List[str], List[int]],
        positions: "ndarray",
        mass_dict: dict = None,
        prev_positions: "ndarray" = None,
        species_labels: List[str] = None,
        forces=None,
        stds=None,
        energy: float = None,
    ):

        # Define cell (each row is a Bravais lattice vector).
        self.cell = np.array(cell)

        # Compute the max cutoff compatible with a 3x3x3 supercell of the
        # structure.
        self.max_cutoff = get_max_cutoff(self.cell)

        # Set positions.
        self.positions = np.array(positions)

        # If species are strings, convert species to integers by atomic number
        if species_labels is None:
            self.species_labels = species
        else:
            self.species_labels = species_labels
        self.coded_species = np.array([element_to_Z(spec) for spec in species])
        self.nat = len(species)

        # Default: atoms have no velocity
        if prev_positions is None:
            self.prev_positions = np.copy(self.positions)
        else:
            assert len(positions) == len(
                prev_positions
            ), "Previous positions and positions are not the same length"
            self.prev_positions = prev_positions

        # Set forces, energies, and stresses and their uncertainties.
        if forces is not None:
            self.forces = np.array(forces)
        else:
            self.forces = np.zeros((len(positions), 3))

        if stds is not None:
            self.stds = np.array(stds)
        else:
            self.stds = np.zeros((len(positions), 3))

        self.energy = energy

        self.local_energies = None
        self.local_energy_stds = None
        self.partial_stresses = None
        self.partial_stress_stds = None
        self.stress = None
        self.stress_stds = None

        # Potential energy attribute needed to mirror ASE atoms object.
        self.potential_energy = None

        self.mass_dict = mass_dict

        # Convert from elements to atomic numbers in mass dict
        if mass_dict is not None:
            keys = list(mass_dict.keys())
            for elt in keys:
                if isinstance(elt, str):
                    mass_dict[element_to_Z(elt)] = mass_dict[elt]
                    if elt.isnumeric():
                        mass_dict[int(elt)] = mass_dict[elt]
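
A hypothetical Structure construction consistent with the signature above; species may be given as symbols or atomic numbers, and mass_dict keys are recoded to atomic numbers as shown in the constructor.

import numpy as np

cell = np.eye(3) * 4.0                       # rows are Bravais lattice vectors
structure = Structure(
    cell=cell,
    species=["H", "O"],                      # coded internally to [1, 8]
    positions=np.array([[0.0, 0.0, 0.0],
                        [1.0, 0.0, 0.0]]),
    mass_dict={"H": 1.008, "O": 15.999},     # keys recoded via element_to_Z
)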
Example #8
0
def test_elt_warning():
    with pytest.warns(Warning):
        element_to_Z("Fe2")
Example #9
0
    def __init__(
        self,
        gp: Union[GaussianProcess, MappedGaussianProcess],
        active_frames: List[Structure] = None,
        passive_frames: List[Structure] = None,
        passive_envs: List[Tuple[AtomicEnvironment, "np.array"]] = None,
        active_rel_var_tol: float = 4,
        active_abs_var_tol: float = 1,
        active_abs_error_tol: float = 0,
        active_error_tol_cutoff: float = inf,
        active_max_trains: int = np.inf,
        active_max_element_from_frame: dict = None,
        checkpoint_interval_train: int = 1,
        checkpoint_interval_atom: int = 100,
        predict_atoms_per_element: dict = None,
        max_atoms_from_frame: int = np.inf,
        min_atoms_added_per_train: int = 1,
        max_model_size: int = np.inf,
        passive_on_active_skips: int = -1,
        passive_train_max_iter: int = 50,
        passive_atoms_per_element: dict = None,
        active_skip: int = 1,
        shuffle_active_frames: bool = False,
        n_cpus: int = 1,
        validate_ratio: float = 0.0,
        calculate_energy: bool = False,
        output_name: str = "gp_from_aimd",
        print_as_xyz: bool = False,
        verbose: str = "INFO",
        written_model_format: str = "json",
    ):
        """
        Class which trains a GP off of an AIMD trajectory, and generates
        error statistics between the DFT and GP calls.
        All arguments are divided between 'passive' learning and 'active'
        learning. By default, when run is called, a 'passive' learning pass
        first adds either all 'seed' environments to the model or a
        randomized subset of atoms from the frames. If no arguments are
        specified, the very first frame of the active learning
        frames will be used.
        "Passive" learning will add data based on random selection of atoms
        from a given ab-initio frame.
        "Active" learning will add data to the dataset based on the
        performance of the GP itself: the force error and the GP's internal
        uncertainty estimate.
        There are a wide variety of options which can give you finer
        control over the training process.
        :param active_frames: List of structures to evaluate / train GP on
        :param gp: Gaussian Process object
        :param active_rel_var_tol: Train if uncertainty is above this *
            noise variance hyperparameter
        :param active_abs_var_tol: Train if uncertainty is above this
        :param active_abs_error_tol: Add atom if force error exceeds this
        :param active_error_tol_cutoff: Don't add atom if force error exceeds this
        :param validate_ratio: Fraction of frames used for validation
        :param active_skip: Skip through frames
        :param calculate_energy: Use local energy kernel or not
        :param output_name: Write output of training to this file
        :param print_as_xyz: If True, print the configurations in xyz format
        :param max_atoms_from_frame: Largest # of atoms added from one frame
        :param min_atoms_added_per_train: Only train when this many atoms have been
            added
        :param active_max_trains: Stop training GP after this many calls to train
        :param n_cpus: Number of CPUs to use when parallelizing predictions
                over atoms
        :param shuffle_active_frames: Randomize order of frames for better training
        :param verbose: same as logging level, "WARNING", "INFO", "DEBUG"
        :param passive_on_active_skips: Train model on every n-th frame before running
        :param passive_frames: Frames to train on before running
        :param passive_envs: Environments to train on before running
        :param passive_atoms_per_element: Max # of environments to add from
            each species in the seed pre-training steps
        :param active_max_element_from_frame: Max # of environments to add from
            each species in the training steps
        :param predict_atoms_per_element: Choose a random subset of N random
            atoms from each specified element to predict on. For instance,
            {"H":5} will only predict the forces and uncertainties
            associated with 5 Hydrogen atoms per frame. Elements not
            specified will be predicted as normal. This is useful for
            systems where you are most interested in a subset of elements.
            This will result in a faster but less exhaustive learning process.
        :param checkpoint_interval_train: How often to write model after
                        trainings
        :param checkpoint_interval_atom: How often to write model after atoms are
            added (since atoms may be added without training)
        :param written_model_format: Format to write GP model to
        """

        # GP Training and Execution parameters
        self.gp = gp
        # Check to see if GP is MGP for later flagging
        self.mgp = isinstance(gp, MappedGaussianProcess)

        self.rel_std_tolerance = active_rel_var_tol
        self.abs_std_tolerance = active_abs_var_tol
        self.abs_force_tolerance = active_abs_error_tol
        self.max_force_error = active_error_tol_cutoff
        self.max_trains = active_max_trains
        self.max_atoms_from_frame = max_atoms_from_frame
        self.min_atoms_per_train = min_atoms_added_per_train
        self.max_model_size = max_model_size

        # Set prediction function based on if forces or energies are
        # desired, and parallelization accordingly
        if self.mgp:
            self.pred_func = predict_on_structure_mgp
        elif calculate_energy:
            self.pred_func = predict_on_structure_par_en
        else:
            self.pred_func = predict_on_structure_par

        self.start_time = time.time()

        self.train_count = 0
        self.calculate_energy = calculate_energy
        self.n_cpus = n_cpus

        # Output parameters
        self.output = Output(output_name,
                             verbose,
                             print_as_xyz=print_as_xyz,
                             always_flush=True)
        self.logger_name = self.output.basename + "log"
        self.train_checkpoint_interval = checkpoint_interval_train
        self.atom_checkpoint_interval = checkpoint_interval_atom

        self.model_format = written_model_format
        self.output_name = output_name

        # GPFA-only parameter

        self.predict_atoms_per_element = predict_atoms_per_element

        # Set up parameters
        self.frames = active_frames
        if shuffle_active_frames:
            np.random.shuffle(active_frames)

        # Parameters for negotiating with the training active_frames
        self.skip = active_skip
        assert (isinstance(active_skip, int)
                and active_skip >= 1), "Skip needs to be a positive integer."
        self.validate_ratio = validate_ratio
        assert 0 <= validate_ratio <= 1, "validate_ratio needs to be [0,1]"

        # Set up for pretraining
        self.pre_train_max_iter = passive_train_max_iter
        self.pre_train_on_skips = passive_on_active_skips
        self.seed_envs = [] if passive_envs is None else passive_envs
        self.seed_frames = [] if passive_frames is None else passive_frames

        self.pre_train_env_per_species = ({}
                                          if passive_atoms_per_element is None
                                          else passive_atoms_per_element)
        self.train_env_per_species = ({}
                                      if active_max_element_from_frame is None
                                      else active_max_element_from_frame)

        # Convert to Coded Species
        if self.pre_train_env_per_species:
            pre_train_species = list(self.pre_train_env_per_species.keys())
            for key in pre_train_species:
                self.pre_train_env_per_species[element_to_Z(
                    key)] = self.pre_train_env_per_species[key]

        # Defining variables to be used later
        self.curr_step = 0
        self.train_count = 0
        self.start_time = time.time()
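
A hypothetical instantiation of this renamed interface; the body above maps the 'active_*'/'passive_*' arguments back onto the older attribute names (for example, active_rel_var_tol becomes self.rel_std_tolerance). The class name and the gp/active_frames objects are assumptions, since the source shows only __init__.

trainer = TrajectoryTrainer(              # class name assumed from context
    gp=gp,
    active_frames=active_frames,
    active_rel_var_tol=4,                 # -> self.rel_std_tolerance
    active_abs_error_tol=0.05,            # -> self.abs_force_tolerance
    passive_frames=active_frames[:1],     # seed the GP from the first frame
    passive_atoms_per_element={"H": 10},  # cap seed environments per element
    written_model_format="json",
)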