Example #1
    def gae(self, concat_ros: StepSequence, v_pred: to.Tensor = None, requires_grad: bool = False) -> to.Tensor:
        """
        Compute the generalized advantage estimation as described in [1].

        :param concat_ros: concatenated rollouts (sequence of steps from potentially different rollouts)
        :param v_pred: state-value predictions if already computed, else pass None to compute them here
        :param requires_grad: whether the returned advantages should carry gradient information
        :return: tensor of advantages
        """
        with ExitStack() as stack:
            if not requires_grad:
                stack.enter_context(to.no_grad())
            if v_pred is None:
                # Get the predictions from the value function
                v_pred = self.values(concat_ros)

            # Compute the advantages via the backward GAE recursion.
            # Note: the last step of every rollout must be marked as done, otherwise
            # v_pred[k + 1] and adv[k + 1] would be read out of bounds.
            adv = to.empty_like(v_pred)
            for k in reversed(range(concat_ros.length)):
                if concat_ros[k].done:
                    adv[k] = concat_ros[k].reward - v_pred[k]
                else:
                    adv[k] = concat_ros[k].reward + self.gamma*v_pred[k + 1] - v_pred[k] + \
                             self.gamma*self.lamda*adv[k + 1]

            if self.standardize_adv:
                if isinstance(self.standardizer, RunningStandardizer):
                    adv = self.standardizer(adv, axis=0)
                else:
                    adv = standardize(adv)

            return adv
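
The recursion above is the standard GAE formulation: delta_k = r_k + gamma*V(s_{k+1}) - V(s_k) and A_k = delta_k + gamma*lambda*A_{k+1}, with A_k = r_k - V(s_k) at terminal steps. Below is a minimal standalone sketch of the same recursion on plain tensors; the function gae_from_tensors and its argument names are illustrative and not part of the class above.

import torch as to

def gae_from_tensors(rewards: to.Tensor, values: to.Tensor, dones: to.Tensor,
                     gamma: float = 0.99, lamda: float = 0.95) -> to.Tensor:
    # Backward recursion over a sequence of steps; as in the method above, the last
    # step of every rollout must be flagged as done so values[k + 1] stays in range.
    adv = to.empty_like(values)
    for k in reversed(range(len(rewards))):
        if dones[k]:
            adv[k] = rewards[k] - values[k]
        else:
            adv[k] = rewards[k] + gamma*values[k + 1] - values[k] + gamma*lamda*adv[k + 1]
    return adv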
Example #2
    def update(self,
               param_results: ParameterSamplingResult,
               ret_avg_curr: float = None):
        """
        Update the policy parameters and the exploration strategy's standard deviation.

        :param param_results: results from rolling out the sampled policy parameter sets
        :param ret_avg_curr: average return of the current policy; not used in this update
        """
        # Average the return values over the rollouts
        rets_avg_ros = param_results.mean_returns

        # Get the perturbations (deltas from the current policy parameters),
        # and divide by the standard deviation to fully standardize them
        s = param_results.parameters - self._policy.param_values
        s /= self._expl_strat.std

        if self.transform_returns:
            # Sort the candidates by return, best first (argsort is ascending, hence the reversal)
            idcs_acs = np.argsort(rets_avg_ros)[::-1]
            s_asc = s[list(idcs_acs), :]

            # Update the mean (see [1, 2])
            delta_mean = self._expl_strat.std * (self.eta_mean_util @ s_asc)
            self._policy.param_values += self.lr_mean * delta_mean

            # Update the std (see [1, 2])
            grad_std = self.eta_std_util @ (s_asc**2 - 1.)
            new_std = self._expl_strat.std * to.exp(
                self.lr_std * grad_std / 2.)
            self._expl_strat.adapt(std=new_std)

        else:
            # Standardize averaged returns over all pop_size rollouts
            rets_stdized = standardize(rets_avg_ros)
            rets_stdized = to.from_numpy(rets_stdized).to(
                to.get_default_dtype())

            # delta_mean = 1./len(param_results) * (rets_stdized @ s)
            delta_mean = 1. / (self._expl_strat.std *
                               len(param_results)) * (rets_stdized @ s)
            self._policy.param_values += self.lr_mean * delta_mean

            # Update the std (monotonic exponential decay)
            new_std = self._expl_strat.std * 0.999**self._curr_iter
            self._expl_strat.adapt(std=new_std)

        self.logger.add_value('min expl strat std',
                              to.min(self._expl_strat.std))
        self.logger.add_value(
            'avg expl strat std',
            to.mean(self._expl_strat.std.data).detach().numpy())
        self.logger.add_value('max expl strat std',
                              to.max(self._expl_strat.std))
        self.logger.add_value('expl strat entropy',
                              self._expl_strat.get_entropy().item())
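
The utility weights eta_mean_util and eta_std_util used in the transform_returns branch are constructed elsewhere and not shown here. One common choice is the rank-based fitness shaping from the NES literature; the sketch below is only an assumption about how such weights could look (the name nes_utilities is illustrative), and it assigns the largest weight to the best-ranked candidate, matching the best-first sort above.

import numpy as np

def nes_utilities(pop_size: int) -> np.ndarray:
    # Rank-based utilities: rank 1 = best candidate, rank pop_size = worst
    ranks = np.arange(1, pop_size + 1)
    utils = np.maximum(0., np.log(pop_size/2. + 1.) - np.log(ranks))
    return utils/np.sum(utils) - 1./pop_size  # normalized, zero-mean weights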
Example #3
    @staticmethod
    def argmax_posterior_mean(cands: to.Tensor, cands_values: to.Tensor,
                              uc_normalizer: UnitCubeProjector,
                              num_restarts: int,
                              num_samples: int) -> to.Tensor:
        """
        Compute the GP input with the maximal posterior mean.

        :param cands: candidates a.k.a. x
        :param cands_values: observed values a.k.a. y
        :param uc_normalizer: unit cube normalizer used during the experiments (can be recovered from the bounds)
        :param num_restarts: number of restarts for the optimization of the acquisition function
        :param num_samples: number of samples for the optimization of the acquisition function
        :return: un-normalized candidate with the maximal posterior mean, a.k.a. x
        """
        # Normalize the input data and standardize the output data
        cands_norm = uc_normalizer.project_to(cands)
        cands_values_stdized = standardize(cands_values)

        # Create and fit the GP model
        gp = SingleTaskGP(cands_norm, cands_values_stdized)
        gp.likelihood.noise_covar.register_constraint('raw_noise',
                                                      GreaterThan(1e-5))
        mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
        fit_gpytorch_model(mll)

        # Find position with maximal posterior mean
        cand_norm, acq_value = optimize_acqf(
            acq_function=PosteriorMean(gp),
            bounds=to.stack([
                to.zeros_like(uc_normalizer.bound_lo),
                to.ones_like(uc_normalizer.bound_up)
            ]),
            q=1,
            num_restarts=num_restarts,
            raw_samples=num_samples)

        cand = uc_normalizer.project_back(cand_norm.detach())
        print_cbt(f'Converged to argmax of the posterior mean\n{cand.numpy()}',
                  'g',
                  bright=True)
        return cand
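
A hypothetical usage sketch for this helper. The bounds, tensor shapes, and the UnitCubeProjector(bound_lo, bound_up) constructor are assumptions for illustration, and UnitCubeProjector and BayRn are assumed to be importable as in the surrounding code; cands_values is passed as a column vector, matching the call in the next example.

bound_lo = to.tensor([0.5, 0.1])
bound_up = to.tensor([1.5, 0.9])
uc_normalizer = UnitCubeProjector(bound_lo, bound_up)

cands = bound_lo + (bound_up - bound_lo)*to.rand(20, 2)  # 20 previously evaluated candidates
cands_values = to.rand(20, 1)                            # their (noisy) returns

x_star = BayRn.argmax_posterior_mean(cands, cands_values, uc_normalizer,
                                     num_restarts=8, num_samples=64)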
Example #4
    def step(self, snapshot_mode: str, meta_info: dict = None):
        """ Fit a GP to the evaluated candidates, query the acquisition function for the next candidate, train and
        evaluate a policy for it on the target domain, and store the current argmax of the posterior mean. """
        if not self.initialized:
            # Start initialization phase
            self.train_init_policies()
            self.eval_init_policies()
            self.initialized = True

        # Normalize the input data and standardize the output data
        cands_norm = self.uc_normalizer.project_to(self.cands)
        cands_values_stdized = standardize(self.cands_values).unsqueeze(1)

        # Create and fit the GP model
        gp = SingleTaskGP(cands_norm, cands_values_stdized)
        gp.likelihood.noise_covar.register_constraint('raw_noise',
                                                      GreaterThan(1e-5))
        mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
        fit_gpytorch_model(mll)
        print_cbt('Fitted the GP.', 'g')

        # Acquisition functions
        if self.acq_fcn_type == 'UCB':
            acq_fcn = UpperConfidenceBound(gp,
                                           beta=self.acq_param.get(
                                               'beta', 0.1),
                                           maximize=True)
        elif self.acq_fcn_type == 'EI':
            acq_fcn = ExpectedImprovement(
                gp, best_f=cands_values_stdized.max().item(), maximize=True)
        elif self.acq_fcn_type == 'PI':
            acq_fcn = ProbabilityOfImprovement(
                gp, best_f=cands_values_stdized.max().item(), maximize=True)
        else:
            raise pyrado.ValueErr(given=self.acq_fcn_type,
                                  eq_constraint="'UCB', 'EI', 'PI'")

        # Optimize acquisition function and get new candidate point
        cand, acq_value = optimize_acqf(
            acq_function=acq_fcn,
            bounds=to.stack([to.zeros(self.cand_dim),
                             to.ones(self.cand_dim)]),
            q=1,
            num_restarts=self.acq_restarts,
            raw_samples=self.acq_samples)
        next_cand = self.uc_normalizer.project_back(cand)
        print_cbt(f'Found the next candidate: {next_cand.numpy()}', 'g')
        self.cands = to.cat([self.cands, next_cand], dim=0)
        to.save(self.cands, osp.join(self._save_dir, 'candidates.pt'))

        # Train and evaluate the new candidate (saves to iter_{self._curr_iter}_policy.pt)
        prefix = f'iter_{self._curr_iter}'
        wrapped_trn_fcn = until_thold_exceeded(
            self.thold_succ_subroutine.item(),
            max_iter=self.max_subroutine_rep)(self.train_policy_sim)
        wrapped_trn_fcn(next_cand, prefix)  # train on the un-normalized candidate, consistent with self.cands

        # Evaluate the current policy on the target domain
        policy = to.load(osp.join(self._save_dir, f'{prefix}_policy.pt'))
        self.curr_cand_value = self.eval_policy(self._save_dir, self._env_real,
                                                policy,
                                                self.montecarlo_estimator,
                                                prefix,
                                                self.num_eval_rollouts_real)

        self.cands_values = to.cat(
            [self.cands_values,
             self.curr_cand_value.view(1)], dim=0)
        to.save(self.cands_values,
                osp.join(self._save_dir, 'candidates_values.pt'))

        # Store the argmax after training and evaluating
        curr_argmax_cand = BayRn.argmax_posterior_mean(
            self.cands, self.cands_values.unsqueeze(1), self.uc_normalizer,
            self.acq_restarts, self.acq_samples)
        self.argmax_cand = to.cat([self.argmax_cand, curr_argmax_cand], dim=0)
        to.save(self.argmax_cand,
                osp.join(self._save_dir, 'candidates_argmax.pt'))

        self.make_snapshot(snapshot_mode, float(to.mean(self.cands_values)),
                           meta_info)
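
until_thold_exceeded is used above to retry the simulation training until the policy performs well enough. The sketch below is only a guess at its behavior, inferred from the call site; it assumes the wrapped train_policy_sim returns an average return, and the actual pyrado implementation may differ in details such as logging and argument handling.

def until_thold_exceeded(thold: float, max_iter: int = 3):
    # Retry decorator: call the wrapped training function until its return value
    # exceeds `thold` or `max_iter` attempts have been made.
    def decorator(trn_fcn):
        def wrapper(*args, **kwargs):
            ret = -float('inf')
            for _ in range(max_iter):
                ret = trn_fcn(*args, **kwargs)
                if ret > thold:
                    break
            return ret
        return wrapper
    return decorator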
Example #5
    def update(self):
        """ Update the policy's and Q-functions' parameters on transitions sampled from the replay memory. """
        # Containers for logging
        policy_losses = to.zeros(self.num_batch_updates)
        expl_strat_stds = to.zeros(self.num_batch_updates)
        q_fcn_1_losses = to.zeros(self.num_batch_updates)
        q_fcn_2_losses = to.zeros(self.num_batch_updates)
        policy_grad_norm = to.zeros(self.num_batch_updates)
        q_fcn_1_grad_norm = to.zeros(self.num_batch_updates)
        q_fcn_2_grad_norm = to.zeros(self.num_batch_updates)

        for b in tqdm(range(self.num_batch_updates),
                      total=self.num_batch_updates,
                      desc='Updating',
                      unit='batches',
                      file=sys.stdout,
                      leave=False):

            # Sample steps and the associated next step from the replay memory
            steps, next_steps = self._memory.sample(self.batch_size)
            steps.torch(data_type=to.get_default_dtype())
            next_steps.torch(data_type=to.get_default_dtype())

            # Standardize rewards
            if self.standardize_rew:
                rewards = standardize(steps.rewards).unsqueeze(1)
            else:
                rewards = steps.rewards.unsqueeze(1)
            rew_scale = 1.  # optional reward scaling; a factor of 1 leaves the rewards unchanged
            rewards *= rew_scale

            with to.no_grad():
                # Create masks for the non-final observations
                not_done = to.tensor(1. - steps.done,
                                     dtype=to.get_default_dtype()).unsqueeze(1)

                # Compute the (next)state-(next)action values Q(s',a') from the target networks
                if self.policy.is_recurrent:
                    next_act_expl, next_log_probs, _ = self._expl_strat(
                        next_steps.observations, next_steps.hidden_states)
                else:
                    next_act_expl, next_log_probs = self._expl_strat(
                        next_steps.observations)
                next_q_val_target_1 = self.q_targ_1(
                    to.cat([next_steps.observations, next_act_expl], dim=1))
                next_q_val_target_2 = self.q_targ_2(
                    to.cat([next_steps.observations, next_act_expl], dim=1))
                next_q_val_target_min = to.min(
                    next_q_val_target_1,
                    next_q_val_target_2) - self.alpha * next_log_probs
                next_q_val = rewards + not_done * self.gamma * next_q_val_target_min

            # Compute the two Q-function losses
            # E_{(s_t, a_t) ~ D} [1/2 * (Q_i(s_t, a_t) - r_t - gamma * E_{s_{t+1} ~ p} [V(s_{t+1})] )^2]
            q_val_1 = self.q_fcn_1(
                to.cat([steps.observations, steps.actions], dim=1))
            q_val_2 = self.q_fcn_2(
                to.cat([steps.observations, steps.actions], dim=1))
            q_1_loss = nn.functional.mse_loss(q_val_1, next_q_val)
            q_2_loss = nn.functional.mse_loss(q_val_2, next_q_val)
            q_fcn_1_losses[b] = q_1_loss.data
            q_fcn_2_losses[b] = q_2_loss.data

            # Compute the policy loss
            # E_{s_t ~ D, eps_t ~ N} [log( pi( f(eps_t; s_t) ) ) - Q(s_t, f(eps_t; s_t))]
            if self.policy.is_recurrent:
                act_expl, log_probs, _ = self._expl_strat(
                    steps.observations, steps.hidden_states)
            else:
                act_expl, log_probs = self._expl_strat(steps.observations)
            q1_pi = self.q_fcn_1(to.cat([steps.observations, act_expl], dim=1))
            q2_pi = self.q_fcn_2(to.cat([steps.observations, act_expl], dim=1))
            min_q_pi = to.min(q1_pi, q2_pi)
            policy_loss = to.mean(self.alpha * log_probs - min_q_pi)
            policy_losses[b] = policy_loss.data
            expl_strat_stds[b] = to.mean(self._expl_strat.std.data)

            # Do one optimization step for each optimizer, and clip the gradients if desired
            # Q-fcn 1
            self._optim_q_fcn_1.zero_grad()
            q_1_loss.backward()
            q_fcn_1_grad_norm[b] = self.clip_grad(self.q_fcn_1, None)
            self._optim_q_fcn_1.step()
            # Q-fcn 2
            self._optim_q_fcn_2.zero_grad()
            q_2_loss.backward()
            q_fcn_2_grad_norm[b] = self.clip_grad(self.q_fcn_2, None)
            self._optim_q_fcn_2.step()
            # Policy
            self._optim_policy.zero_grad()
            policy_loss.backward()
            policy_grad_norm[b] = self.clip_grad(self._expl_strat.policy,
                                                 self.max_grad_norm)
            self._optim_policy.step()

            if self.learn_alpha:
                # Compute entropy coefficient loss
                alpha_loss = -to.mean(
                    self._log_alpha *
                    (log_probs.detach() + self.target_entropy))
                # Do one optimizer step for the entropy coefficient optimizer
                self._alpha_optim.zero_grad()
                alpha_loss.backward()
                self._alpha_optim.step()

            # Soft-update the target networks
            if (self._curr_iter * self.num_batch_updates +
                    b) % self.target_update_intvl == 0:
                SAC.soft_update(self.q_targ_1, self.q_fcn_1, self.tau)
                SAC.soft_update(self.q_targ_2, self.q_fcn_2, self.tau)

        # Update the learning rate if the schedulers have been specified
        if self._lr_scheduler_policy is not None:
            self._lr_scheduler_policy.step()
            self._lr_scheduler_q_fcn_1.step()
            self._lr_scheduler_q_fcn_2.step()

        # Logging
        self.logger.add_value('Q1 loss', to.mean(q_fcn_1_losses).item())
        self.logger.add_value('Q2 loss', to.mean(q_fcn_2_losses).item())
        self.logger.add_value('policy loss', to.mean(policy_losses).item())
        self.logger.add_value('avg policy grad norm',
                              to.mean(policy_grad_norm).item())
        self.logger.add_value('avg expl strat std',
                              to.mean(expl_strat_stds).item())
        self.logger.add_value('alpha', self.alpha.item())
        if self._lr_scheduler_policy is not None:
            self.logger.add_value('learning rate',
                                  self._lr_scheduler_policy.get_lr())
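
SAC.soft_update, used above to track the target Q-networks, is not shown. Below is a minimal sketch of Polyak averaging under the assumption that tau weights the target parameters (tau close to 1 means the targets move slowly); the sign convention in the actual implementation may be reversed.

import torch as to
import torch.nn as nn

def soft_update_(target: nn.Module, source: nn.Module, tau: float):
    # In-place update: theta_target <- tau*theta_target + (1 - tau)*theta_source
    with to.no_grad():
        for p_targ, p_src in zip(target.parameters(), source.parameters()):
            p_targ.data.mul_(tau)
            p_targ.data.add_((1. - tau)*p_src.data)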