def train_argmax_policy(
    load_dir: pyrado.PathLike,
    env_sim: MetaDomainRandWrapper,
    subrtn: Algorithm,
    num_restarts: int,
    num_samples: int,
    policy_param_init: to.Tensor = None,
    valuefcn_param_init: to.Tensor = None,
    subrtn_snapshot_mode: str = "best",
) -> Policy:
    """
    Train a policy based on the maximizer of the posterior mean.

    :param load_dir: directory to load from
    :param env_sim: simulation environment
    :param subrtn: algorithm which performs the policy / value-function optimization
    :param num_restarts: number of restarts for the optimization of the acquisition function
    :param num_samples: number of samples for the optimization of the acquisition function
    :param policy_param_init: initial policy parameter values for the subroutine, set `None` to be random
    :param valuefcn_param_init: initial value function parameter values for the subroutine, set `None` to be random
    :param subrtn_snapshot_mode: snapshot mode for saving during training of the subroutine
    :return: the final BayRn policy
    """
    # Load the required data
    cands = pyrado.load("candidates.pt", load_dir)
    cands_values = pyrado.load("candidates_values.pt", load_dir).unsqueeze(1)
    ddp_space = pyrado.load("ddp_space.pkl", load_dir)

    if cands.shape[0] > cands_values.shape[0]:
        print_cbt(
            f"There are {cands.shape[0]} candidates but only {cands_values.shape[0]} evaluations. Ignoring the "
            f"candidates without evaluation for computing the argmax.",
            "y",
        )
        cands = cands[: cands_values.shape[0], :]

    # Find the maximizer of the posterior mean over the domain distribution parameter space
    argmax_cand = BayRn.argmax_posterior_mean(cands, cands_values, ddp_space, num_restarts, num_samples)

    # Set the domain randomizer
    env_sim.adapt_randomizer(argmax_cand.numpy())

    # Reset the subroutine algorithm which includes resetting the exploration
    subrtn.reset()

    # Do a warm start
    subrtn.init_modules(warmstart=True, policy_param_init=policy_param_init, valuefcn_param_init=valuefcn_param_init)

    subrtn.train(snapshot_mode=subrtn_snapshot_mode, meta_info=dict(suffix="argmax"))

    return subrtn.policy
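
# For reference, a minimal sketch of what the posterior-mean argmax step amounts to, assuming
# BoTorch's SingleTaskGP, PosteriorMean, and optimize_acqf. The helper below and its argument
# names are illustrative only and not part of pyrado's API; the real BayRn.argmax_posterior_mean
# additionally normalizes the candidates to the unit cube and standardizes the values.
def _argmax_posterior_mean_sketch(
    cands: to.Tensor, cands_values: to.Tensor, bounds: to.Tensor, num_restarts: int, num_samples: int
) -> to.Tensor:
    """Sketch: fit a GP to (candidate, return) pairs and maximize its posterior mean."""
    from botorch.acquisition import PosteriorMean
    from botorch.fit import fit_gpytorch_mll  # recent BoTorch versions; older ones use fit_gpytorch_model
    from botorch.models import SingleTaskGP
    from botorch.optim import optimize_acqf
    from gpytorch.mlls import ExactMarginalLogLikelihood

    # Fit a single-output GP; cands is n x d, cands_values is n x 1 (here assumed to be pre-normalized)
    gp = SingleTaskGP(cands, cands_values)
    fit_gpytorch_mll(ExactMarginalLogLikelihood(gp.likelihood, gp))

    # Maximize the posterior mean over the box given by bounds (a 2 x d tensor of lower / upper bounds)
    argmax_cand, _ = optimize_acqf(
        acq_function=PosteriorMean(gp),
        bounds=bounds,
        q=1,
        num_restarts=num_restarts,
        raw_samples=num_samples,
    )
    return argmax_cand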