def gae(self, concat_ros: StepSequence, v_pred: to.Tensor = None, requires_grad: bool = False) -> to.Tensor:
    """
    Compute the generalized advantage estimation as described in [1].

    :param concat_ros: concatenated rollouts (sequence of steps from potentially different rollouts)
    :param v_pred: state-value predictions if already computed, else pass None
    :param requires_grad: is the gradient required
    :return adv: tensor of advantages
    """
    with ExitStack() as stack:
        if not requires_grad:
            stack.enter_context(to.no_grad())

        if v_pred is None:
            # Get the predictions from the value function
            v_pred = self.values(concat_ros)

        # Compute the advantages
        adv = to.empty_like(v_pred)
        for k in reversed(range(concat_ros.length)):
            if concat_ros[k].done:
                adv[k] = concat_ros[k].reward - v_pred[k]
            else:
                adv[k] = concat_ros[k].reward + self.gamma*v_pred[k + 1] - v_pred[k] + \
                         self.gamma*self.lamda*adv[k + 1]

        if self.standardize_adv:
            if isinstance(self.standardizer, RunningStandardizer):
                adv = self.standardizer(adv, axis=0)
            else:
                adv = standardize(adv)

        return adv
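# --- Hedged sketch (not part of the original source): the same backward GAE recursion on plain tensors. ---
# Assumes a single rollout whose last step is terminal, with 1-dim reward and value tensors of equal length;
# gae_sketch is a hypothetical helper, not a pyrado function.
import torch as to


def gae_sketch(rewards: to.Tensor, values: to.Tensor, gamma: float = 0.99, lamda: float = 0.95) -> to.Tensor:
    """Backward recursion adv_k = delta_k + gamma*lamda*adv_{k+1} with delta_k = r_k + gamma*V_{k+1} - V_k."""
    adv = to.zeros_like(rewards)
    adv[-1] = rewards[-1] - values[-1]  # terminal step: no bootstrapping
    for k in reversed(range(len(rewards) - 1)):
        delta = rewards[k] + gamma*values[k + 1] - values[k]
        adv[k] = delta + gamma*lamda*adv[k + 1]
    return adv


# Example: gae_sketch(to.ones(5), to.zeros(5)) yields gamma*lamda-discounted reward sums as advantages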
def update(self, param_results: ParameterSamplingResult, ret_avg_curr: float = None):
    # Average the return values over the rollouts
    rets_avg_ros = param_results.mean_returns

    # Get the perturbations (deltas from the current policy parameters)
    s = param_results.parameters - self._policy.param_values
    # Also divide by the standard deviation to fully standardize
    s /= self._expl_strat.std

    if self.transform_returns:
        # Sort in descending order of the return values (highest return first)
        idcs_acs = np.argsort(rets_avg_ros)[::-1]
        s_asc = s[list(idcs_acs), :]

        # Update the mean (see [1, 2])
        delta_mean = self._expl_strat.std*(self.eta_mean_util @ s_asc)
        self._policy.param_values += self.lr_mean*delta_mean

        # Update the std (see [1, 2])
        grad_std = self.eta_std_util @ (s_asc**2 - 1.)
        new_std = self._expl_strat.std*to.exp(self.lr_std*grad_std/2.)
        self._expl_strat.adapt(std=new_std)

    else:
        # Standardize averaged returns over all pop_size rollouts
        rets_stdized = standardize(rets_avg_ros)
        rets_stdized = to.from_numpy(rets_stdized).to(to.get_default_dtype())

        # delta_mean = 1./len(param_results) * (rets_stdized @ s)
        delta_mean = 1./(self._expl_strat.std*len(param_results))*(rets_stdized @ s)
        self._policy.param_values += self.lr_mean*delta_mean

        # Update the std (monotonous exponential decay)
        new_std = self._expl_strat.std*0.999**self._curr_iter
        self._expl_strat.adapt(std=new_std)

    self.logger.add_value('min expl strat std', to.min(self._expl_strat.std))
    self.logger.add_value('avg expl strat std', to.mean(self._expl_strat.std.data).detach().numpy())
    self.logger.add_value('max expl strat std', to.max(self._expl_strat.std))
    self.logger.add_value('expl strat entropy', self._expl_strat.get_entropy().item())
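# --- Hedged sketch (not part of the original source): the non-transformed mean update above on toy data. ---
# Assumes pop_size perturbed parameter sets, their average returns, and a scalar exploration std;
# standardized_return_mean_update is a hypothetical helper mirroring the else-branch of update().
import torch as to


def standardized_return_mean_update(params_pert: to.Tensor, rets_avg: to.Tensor, params_curr: to.Tensor,
                                    expl_std: float, lr_mean: float) -> to.Tensor:
    """Weight the standardized perturbations by the standardized returns and take a gradient-ascent step."""
    s = (params_pert - params_curr)/expl_std  # standardized perturbations, shape (pop_size, num_params)
    rets_stdized = (rets_avg - rets_avg.mean())/rets_avg.std()  # zero-mean, unit-variance returns
    delta_mean = (rets_stdized @ s)/(expl_std*len(rets_avg))
    return params_curr + lr_mean*delta_mean


# Example: standardized_return_mean_update(to.randn(4, 3), to.tensor([1., 2., .5, 3.]), to.zeros(3), 1., 1e-2)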
def argmax_posterior_mean(cands: to.Tensor, cands_values: to.Tensor, uc_normalizer: UnitCubeProjector,
                          num_restarts: int, num_samples: int) -> to.Tensor:
    """
    Compute the GP input with the maximal posterior mean.

    :param cands: candidates a.k.a. x
    :param cands_values: observed values a.k.a. y
    :param uc_normalizer: unit cube normalizer used during the experiments (can be recovered from the bounds)
    :param num_restarts: number of restarts for the optimization of the acquisition function
    :param num_samples: number of samples for the optimization of the acquisition function
    :return: un-normalized candidate with maximal posterior mean a.k.a. x
    """
    # Normalize the input data and standardize the output data
    cands_norm = uc_normalizer.project_to(cands)
    cands_values_stdized = standardize(cands_values)

    # Create and fit the GP model
    gp = SingleTaskGP(cands_norm, cands_values_stdized)
    gp.likelihood.noise_covar.register_constraint('raw_noise', GreaterThan(1e-5))
    mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
    fit_gpytorch_model(mll)

    # Find the input with the maximal posterior mean
    cand_norm, acq_value = optimize_acqf(
        acq_function=PosteriorMean(gp),
        bounds=to.stack([to.zeros_like(uc_normalizer.bound_lo), to.ones_like(uc_normalizer.bound_up)]),
        q=1,
        num_restarts=num_restarts,
        raw_samples=num_samples
    )

    cand = uc_normalizer.project_back(cand_norm.detach())
    print_cbt(f'Converged to argmax of the posterior mean\n{cand.numpy()}', 'g', bright=True)
    return cand
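# --- Hedged usage sketch (not part of the original source): the same GP / posterior-mean pattern on toy ---
# data, assuming the BoTorch API already used above (SingleTaskGP, fit_gpytorch_model, optimize_acqf).
import torch as to
from botorch.acquisition import PosteriorMean
from botorch.fit import fit_gpytorch_model
from botorch.models import SingleTaskGP
from botorch.optim import optimize_acqf
from gpytorch.mlls import ExactMarginalLogLikelihood

# Toy inputs already scaled to the unit cube; targets must have shape (n, 1) for SingleTaskGP
train_x = to.rand(10, 2)
train_y = -(train_x.sum(dim=1, keepdim=True) - 1.).pow(2)

gp = SingleTaskGP(train_x, (train_y - train_y.mean())/train_y.std())
mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
fit_gpytorch_model(mll)

# Maximize the posterior mean over the unit cube
best_x, _ = optimize_acqf(
    acq_function=PosteriorMean(gp),
    bounds=to.stack([to.zeros(2), to.ones(2)]),
    q=1,
    num_restarts=5,
    raw_samples=64
)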
def step(self, snapshot_mode: str, meta_info: dict = None):
    if not self.initialized:
        # Start initialization phase
        self.train_init_policies()
        self.eval_init_policies()
        self.initialized = True

    # Normalize the input data and standardize the output data
    cands_norm = self.uc_normalizer.project_to(self.cands)
    cands_values_stdized = standardize(self.cands_values).unsqueeze(1)

    # Create and fit the GP model
    gp = SingleTaskGP(cands_norm, cands_values_stdized)
    gp.likelihood.noise_covar.register_constraint('raw_noise', GreaterThan(1e-5))
    mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
    fit_gpytorch_model(mll)
    print_cbt('Fitted the GP.', 'g')

    # Acquisition functions
    if self.acq_fcn_type == 'UCB':
        acq_fcn = UpperConfidenceBound(gp, beta=self.acq_param.get('beta', 0.1), maximize=True)
    elif self.acq_fcn_type == 'EI':
        acq_fcn = ExpectedImprovement(gp, best_f=cands_values_stdized.max().item(), maximize=True)
    elif self.acq_fcn_type == 'PI':
        acq_fcn = ProbabilityOfImprovement(gp, best_f=cands_values_stdized.max().item(), maximize=True)
    else:
        raise pyrado.ValueErr(given=self.acq_fcn_type, eq_constraint="'UCB', 'EI', 'PI'")

    # Optimize the acquisition function and get the new candidate point
    cand, acq_value = optimize_acqf(
        acq_function=acq_fcn,
        bounds=to.stack([to.zeros(self.cand_dim), to.ones(self.cand_dim)]),
        q=1,
        num_restarts=self.acq_restarts,
        raw_samples=self.acq_samples
    )
    next_cand = self.uc_normalizer.project_back(cand)
    print_cbt(f'Found the next candidate: {next_cand.numpy()}', 'g')
    self.cands = to.cat([self.cands, next_cand], dim=0)
    to.save(self.cands, osp.join(self._save_dir, 'candidates.pt'))

    # Train and evaluate the new candidate (saves to iter_{self._curr_iter}_policy.pt)
    prefix = f'iter_{self._curr_iter}'
    wrapped_trn_fcn = until_thold_exceeded(
        self.thold_succ_subroutine.item(), max_iter=self.max_subroutine_rep
    )(self.train_policy_sim)
    wrapped_trn_fcn(cand, prefix)

    # Evaluate the current policy on the target domain
    policy = to.load(osp.join(self._save_dir, f'{prefix}_policy.pt'))
    self.curr_cand_value = self.eval_policy(
        self._save_dir, self._env_real, policy, self.montecarlo_estimator, prefix, self.num_eval_rollouts_real
    )
    self.cands_values = to.cat([self.cands_values, self.curr_cand_value.view(1)], dim=0)
    to.save(self.cands_values, osp.join(self._save_dir, 'candidates_values.pt'))

    # Store the argmax after training and evaluating
    curr_argmax_cand = BayRn.argmax_posterior_mean(
        self.cands, self.cands_values.unsqueeze(1), self.uc_normalizer, self.acq_restarts, self.acq_samples
    )
    self.argmax_cand = to.cat([self.argmax_cand, curr_argmax_cand], dim=0)
    to.save(self.argmax_cand, osp.join(self._save_dir, 'candidates_argmax.pt'))

    self.make_snapshot(snapshot_mode, float(to.mean(self.cands_values)), meta_info)
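# --- Hedged sketch (not part of the original source): the retry pattern behind until_thold_exceeded. ---
# Hypothetical stand-in that only illustrates the idea (repeat the training function until its returned
# score exceeds a threshold, at most max_iter times); it is not pyrado's implementation.
from typing import Callable


def retry_until_thold_exceeded(thold: float, max_iter: int):
    def decorator(trn_fcn: Callable[..., float]) -> Callable[..., float]:
        def wrapper(*args, **kwargs) -> float:
            score = trn_fcn(*args, **kwargs)
            for _ in range(max_iter - 1):
                if score > thold:
                    break
                score = trn_fcn(*args, **kwargs)
            return score
        return wrapper
    return decorator


# Usage mirrors the call above: retry_until_thold_exceeded(thold, max_iter=3)(train_policy_sim)(cand, prefix)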
def update(self):
    """ Update the policy's and Q-functions' parameters on transitions sampled from the replay memory. """
    # Containers for logging
    policy_losses = to.zeros(self.num_batch_updates)
    expl_strat_stds = to.zeros(self.num_batch_updates)
    q_fcn_1_losses = to.zeros(self.num_batch_updates)
    q_fcn_2_losses = to.zeros(self.num_batch_updates)
    policy_grad_norm = to.zeros(self.num_batch_updates)
    q_fcn_1_grad_norm = to.zeros(self.num_batch_updates)
    q_fcn_2_grad_norm = to.zeros(self.num_batch_updates)

    for b in tqdm(range(self.num_batch_updates), total=self.num_batch_updates,
                  desc='Updating', unit='batches', file=sys.stdout, leave=False):
        # Sample steps and the associated next step from the replay memory
        steps, next_steps = self._memory.sample(self.batch_size)
        steps.torch(data_type=to.get_default_dtype())
        next_steps.torch(data_type=to.get_default_dtype())

        # Standardize rewards
        if self.standardize_rew:
            rewards = standardize(steps.rewards).unsqueeze(1)
        else:
            rewards = steps.rewards.unsqueeze(1)
        rew_scale = 1.
        rewards *= rew_scale

        with to.no_grad():
            # Create masks for the non-final observations
            not_done = to.tensor(1. - steps.done, dtype=to.get_default_dtype()).unsqueeze(1)

            # Compute the (next)state-(next)action values Q(s',a') from the target networks
            if self.policy.is_recurrent:
                next_act_expl, next_log_probs, _ = self._expl_strat(next_steps.observations, next_steps.hidden_states)
            else:
                next_act_expl, next_log_probs = self._expl_strat(next_steps.observations)
            next_q_val_target_1 = self.q_targ_1(to.cat([next_steps.observations, next_act_expl], dim=1))
            next_q_val_target_2 = self.q_targ_2(to.cat([next_steps.observations, next_act_expl], dim=1))
            next_q_val_target_min = to.min(next_q_val_target_1, next_q_val_target_2) - self.alpha*next_log_probs
            next_q_val = rewards + not_done*self.gamma*next_q_val_target_min

        # Compute the two Q-function losses
        # E_{(s_t, a_t) ~ D} [1/2 * (Q_i(s_t, a_t) - r_t - gamma * E_{s_{t+1} ~ p} [V(s_{t+1})] )^2]
        q_val_1 = self.q_fcn_1(to.cat([steps.observations, steps.actions], dim=1))
        q_val_2 = self.q_fcn_2(to.cat([steps.observations, steps.actions], dim=1))
        q_1_loss = nn.functional.mse_loss(q_val_1, next_q_val)
        q_2_loss = nn.functional.mse_loss(q_val_2, next_q_val)
        q_fcn_1_losses[b] = q_1_loss.data
        q_fcn_2_losses[b] = q_2_loss.data

        # Compute the policy loss
        # E_{s_t ~ D, eps_t ~ N} [log( pi( f(eps_t; s_t) ) ) - Q(s_t, f(eps_t; s_t))]
        if self.policy.is_recurrent:
            act_expl, log_probs, _ = self._expl_strat(steps.observations, steps.hidden_states)
        else:
            act_expl, log_probs = self._expl_strat(steps.observations)
        q1_pi = self.q_fcn_1(to.cat([steps.observations, act_expl], dim=1))
        q2_pi = self.q_fcn_2(to.cat([steps.observations, act_expl], dim=1))
        min_q_pi = to.min(q1_pi, q2_pi)
        policy_loss = to.mean(self.alpha*log_probs - min_q_pi)
        policy_losses[b] = policy_loss.data
        expl_strat_stds[b] = to.mean(self._expl_strat.std.data)

        # Do one optimization step for each optimizer, and clip the gradients if desired
        # Q-fcn 1
        self._optim_q_fcn_1.zero_grad()
        q_1_loss.backward()
        q_fcn_1_grad_norm[b] = self.clip_grad(self.q_fcn_1, None)
        self._optim_q_fcn_1.step()
        # Q-fcn 2
        self._optim_q_fcn_2.zero_grad()
        q_2_loss.backward()
        q_fcn_2_grad_norm[b] = self.clip_grad(self.q_fcn_2, None)
        self._optim_q_fcn_2.step()
        # Policy
        self._optim_policy.zero_grad()
        policy_loss.backward()
        policy_grad_norm[b] = self.clip_grad(self._expl_strat.policy, self.max_grad_norm)
        self._optim_policy.step()

        if self.learn_alpha:
            # Compute the loss for the entropy coefficient alpha
            alpha_loss = -to.mean(self._log_alpha*(log_probs.detach() + self.target_entropy))
            # Do one optimizer step for the entropy coefficient optimizer
            self._alpha_optim.zero_grad()
            alpha_loss.backward()
            self._alpha_optim.step()

        # Soft-update the target networks
        if (self._curr_iter*self.num_batch_updates + b) % self.target_update_intvl == 0:
            SAC.soft_update(self.q_targ_1, self.q_fcn_1, self.tau)
            SAC.soft_update(self.q_targ_2, self.q_fcn_2, self.tau)

    # Update the learning rate if the schedulers have been specified
    if self._lr_scheduler_policy is not None:
        self._lr_scheduler_policy.step()
        self._lr_scheduler_q_fcn_1.step()
        self._lr_scheduler_q_fcn_2.step()

    # Logging
    self.logger.add_value('Q1 loss', to.mean(q_fcn_1_losses).item())
    self.logger.add_value('Q2 loss', to.mean(q_fcn_2_losses).item())
    self.logger.add_value('policy loss', to.mean(policy_losses).item())
    self.logger.add_value('avg policy grad norm', to.mean(policy_grad_norm).item())
    self.logger.add_value('avg expl strat std', to.mean(expl_strat_stds).item())
    self.logger.add_value('alpha', self.alpha.item())
    if self._lr_scheduler_policy is not None:
        self.logger.add_value('learning rate', self._lr_scheduler_policy.get_lr())
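# --- Hedged sketch (not part of the original source): a Polyak soft update as used for the target Q-networks. ---
# Convention assumed here: theta_targ <- tau*theta_targ + (1 - tau)*theta_src with tau close to 1; the exact
# argument order and convention of SAC.soft_update may differ.
import torch as to
from torch import nn


def soft_update_sketch(target: nn.Module, source: nn.Module, tau: float = 0.995):
    """Polyak-average the source parameters into the target parameters in place."""
    with to.no_grad():
        for p_targ, p_src in zip(target.parameters(), source.parameters()):
            p_targ.data.mul_(tau)
            p_targ.data.add_((1 - tau)*p_src.data)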