Example #1
    def train_argmax_policy(
        load_dir: pyrado.PathLike,
        env_sim: MetaDomainRandWrapper,
        subrtn: Algorithm,
        num_restarts: int,
        num_samples: int,
        policy_param_init: to.Tensor = None,
        valuefcn_param_init: to.Tensor = None,
        subrtn_snapshot_mode: str = "best",
    ) -> Policy:
        """
        Train a policy based on the maximizer of the posterior mean.

        :param load_dir: directory to load from
        :param env_sim: simulation environment
        :param subrtn: algorithm which performs the policy / value-function optimization
        :param num_restarts: number of restarts for the optimization of the acquisition function
        :param num_samples: number of samples for the optimization of the acquisition function
        :param policy_param_init: initial policy parameter values for the subroutine, set `None` to be random
        :param valuefcn_param_init: initial value function parameter values for the subroutine, set `None` to be random
        :param subrtn_snapshot_mode: snapshot mode for saving during training of the subroutine
        :return: the final BayRn policy
        """
        # Load the required data
        cands = pyrado.load("candidates.pt", load_dir)
        cands_values = pyrado.load("candidates_values.pt", load_dir).unsqueeze(1)
        ddp_space = pyrado.load("ddp_space.pkl", load_dir)

        if cands.shape[0] > cands_values.shape[0]:
            print_cbt(
                f"There are {cands.shape[0]} candidates but only {cands_values.shape[0]} evaluations. Ignoring the"
                f"candidates without evaluation for computing the argmax.",
                "y",
            )
            cands = cands[:cands_values.shape[0], :]

        # Find the maximizer
        argmax_cand = BayRn.argmax_posterior_mean(cands, cands_values, ddp_space, num_restarts, num_samples)

        # Set the domain randomizer
        env_sim.adapt_randomizer(argmax_cand.numpy())

        # Reset the subroutine algorithm which includes resetting the exploration
        subrtn.reset()

        # Do a warm start
        subrtn.init_modules(
            warmstart=True,
            policy_param_init=policy_param_init,
            valuefcn_param_init=valuefcn_param_init,
        )

        subrtn.train(snapshot_mode=subrtn_snapshot_mode, meta_info=dict(suffix="argmax"))
        return subrtn.policy
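
A minimal usage sketch (not part of the source) for the variant above. It assumes `env_sim` (a MetaDomainRandWrapper) and `subrtn` (e.g. a PPO instance) were constructed the same way as in the BayRn run that produced the artifacts in `ex_dir`, that `train_argmax_policy` is exposed as a static method of `BayRn` (as the internal call to `BayRn.argmax_posterior_mean` suggests), and that the import path, directory, and hyper-parameter values are placeholders:

    import torch as to

    from pyrado.algorithms.meta.bayrn import BayRn  # assumed import path

    ex_dir = "experiments/bayrn/my_env"  # hypothetical directory of a finished BayRn run

    # env_sim and subrtn are assumed to exist, set up exactly as in the run that
    # wrote candidates.pt, candidates_values.pt and ddp_space.pkl to ex_dir
    policy = BayRn.train_argmax_policy(
        load_dir=ex_dir,
        env_sim=env_sim,
        subrtn=subrtn,
        num_restarts=500,  # placeholder settings for the acquisition-function optimization
        num_samples=1000,
    )

    # persist the re-trained policy with plain torch.save (policies are torch modules)
    to.save(policy.state_dict(), f"{ex_dir}/policy_argmax.pt")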
Example #2
    def train_argmax_policy(load_dir: str,
                            env_sim: MetaDomainRandWrapper,
                            subroutine: Algorithm,
                            num_restarts: int,
                            num_samples: int,
                            policy_param_init: to.Tensor = None,
                            valuefcn_param_init: to.Tensor = None) -> Policy:
        """
        Train a policy based on the maximizer of the posterior mean.

        :param load_dir: directory to load from
        :param env_sim: simulation environment
        :param subroutine: algorithm which performs the policy / value-function optimization
        :param num_restarts: number of restarts for the optimization of the acquisition function
        :param num_samples: number of samples for the optimization of the acquisition function
        :param policy_param_init: initial policy parameter values for the subroutine, set `None` to be random
        :param valuefcn_param_init: initial value function parameter values for the subroutine, set `None` to be random
        :return: the final BayRn policy
        """
        # Load the required data
        cands = to.load(osp.join(load_dir, 'candidates.pt'))
        cands_values = to.load(osp.join(load_dir,
                                        'candidates_values.pt')).unsqueeze(1)
        bounds = to.load(osp.join(load_dir, 'bounds.pt'))
        uc_normalizer = UnitCubeProjector(bounds[0, :], bounds[1, :])

        # Find the maximizer
        argmax_cand = BayRn.argmax_posterior_mean(cands, cands_values,
                                                  uc_normalizer, num_restarts,
                                                  num_samples)

        # Set the domain randomizer given the hyper-parameters
        env_sim.adapt_randomizer(argmax_cand.numpy())

        # Reset the subroutine's algorithm which includes resetting the exploration
        subroutine.reset()

        # Reset the subroutine's policy (and value function)
        subroutine.policy.init_param(policy_param_init)
        if isinstance(subroutine, ActorCritic):
            subroutine.critic.value_fcn.init_param(valuefcn_param_init)
        if policy_param_init is None:
            print_cbt('Learning the argmax solution from scratch', 'y')
        else:
            print_cbt('Learning the argmax solution given an initialization',
                      'y')

        subroutine.train(snapshot_mode='best')  # meta_info=dict(prefix='final')
        return subroutine.policy
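
The two variants differ mainly in how the saved artifacts and the warm start are handled: the newer one (Example #1) loads a pickled `ddp_space`, drops candidates that were never evaluated before computing the argmax, and delegates re-initialization to `subrtn.init_modules(warmstart=True, ...)`, while the older one (Example #2) rebuilds a `UnitCubeProjector` from `bounds.pt` and re-initializes the policy (and, for `ActorCritic` subroutines, the value function) directly via `init_param`.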