def __init__(self, mu_approximator, sigma_approximator, min_a, max_a):
    """
    Constructor.

    Args:
        mu_approximator (Regressor): a regressor computing the mean in a
            given state;
        sigma_approximator (Regressor): a regressor computing the variance
            in a given state;
        min_a (np.ndarray): a vector specifying the minimum action value
            for each component;
        max_a (np.ndarray): a vector specifying the maximum action value
            for each component.

    """
    self._mu_approximator = mu_approximator
    self._sigma_approximator = sigma_approximator

    # Half-range and midpoint of the action interval, used to rescale
    # actions into [min_a, max_a].
    self._delta_a = to_float_tensor(.5 * (max_a - min_a), self.use_cuda)
    self._central_a = to_float_tensor(.5 * (max_a + min_a), self.use_cuda)

    use_cuda = self._mu_approximator.model.use_cuda

    if use_cuda:
        self._delta_a = self._delta_a.cuda()
        self._central_a = self._central_a.cuda()
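# The stored `_central_a` and `_delta_a` suggest that unbounded samples are
# rescaled into [min_a, max_a]. The sketch below is only an illustrative
# assumption of such a tanh squashing; `squash_example` and `raw_action` are
# hypothetical names, not part of the class above.
import torch

def squash_example(raw_action, central_a, delta_a):
    # tanh maps to (-1, 1); scaling by delta_a and shifting by central_a
    # maps the result into (min_a, max_a).
    return central_a + delta_a * torch.tanh(raw_action)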
def fit(self, dataset):
    if not self._quiet:
        tqdm.write('Iteration ' + str(self._iter))

    x, u, r, xn, absorbing, last = parse_dataset(dataset)
    x = x.astype(np.float32)
    u = u.astype(np.float32)
    r = r.astype(np.float32)
    xn = xn.astype(np.float32)

    obs = to_float_tensor(x, self.policy.use_cuda)
    act = to_float_tensor(u, self.policy.use_cuda)

    # Generalized advantage estimation, followed by advantage normalization
    v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                   self.mdp_info.gamma, self._lambda)
    np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
    adv = to_float_tensor(np_adv, self.policy.use_cuda)

    # Log-probabilities of the actions under the current (old) policy
    old_pol_dist = self.policy.distribution_t(obs)
    old_log_p = old_pol_dist.log_prob(act)[:, None].detach()

    # Critic update
    self._V.fit(x, v_target, **self._critic_fit_params)

    # Actor update
    self._update_policy(obs, act, adv, old_log_p)

    # Print fit information
    self._print_fit_info(dataset, x, v_target, old_pol_dist)
    self._iter += 1
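# `compute_gae` above returns value targets and advantages. The routine below
# is a minimal, self-contained sketch of how generalized advantage estimation
# is typically computed from TD residuals; it takes precomputed value arrays
# instead of the value-function regressor, and `gae_sketch` is a hypothetical
# name, not the library's implementation.
import numpy as np

def gae_sketch(v, v_next, r, absorbing, last, gamma, lam):
    # v, v_next: value estimates for states and next states (1-D float arrays)
    # r, absorbing, last: rewards and episode flags aligned with them
    adv = np.zeros_like(r)
    gae = 0.
    for t in reversed(range(len(r))):
        # TD residual; the bootstrap term is dropped at absorbing states.
        delta = r[t] + gamma * v_next[t] * (1. - absorbing[t]) - v[t]
        # Reset the accumulator at episode boundaries.
        gae = delta if last[t] else delta + gamma * lam * gae
        adv[t] = gae
    return adv + v, adv  # value targets and advantages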
def _loss(self, state, action, adv):
    use_cuda = self.policy.use_cuda

    s = to_float_tensor(state, use_cuda)
    a = to_float_tensor(action, use_cuda)
    adv_t = to_float_tensor(adv, use_cuda)

    # Policy-gradient surrogate: maximize E[log pi(a|s) * advantage],
    # written as a loss to be minimized.
    gradient_loss = -torch.mean(self.policy.log_prob_t(s, a) * adv_t)
    # Entropy bonus encouraging exploration.
    entropy_loss = -self.policy.entropy_t(s)

    return gradient_loss + self._entropy_coeff * entropy_loss
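# A self-contained toy illustration of the same entropy-regularized objective,
# using a diagonal Gaussian from torch.distributions as a stand-in for the
# policy. All names and sizes here are illustrative assumptions, not the class
# or coefficients used above.
import torch

mu = torch.zeros(3, requires_grad=True)
log_sigma = torch.zeros(3, requires_grad=True)
dist = torch.distributions.Normal(mu, log_sigma.exp())

actions = torch.randn(8, 3)
advantages = torch.randn(8)

gradient_loss = -torch.mean(dist.log_prob(actions).sum(-1) * advantages)
entropy_loss = -dist.entropy().sum()
entropy_coeff = 1e-2
loss = gradient_loss + entropy_coeff * entropy_loss
loss.backward()  # gradients w.r.t. mu and log_sigma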
def fit(self, dataset):
    if not self._quiet:
        tqdm.write('Iteration ' + str(self._iter))

    state, action, reward, next_state, absorbing, last = parse_dataset(
        dataset)
    x = state.astype(np.float32)
    u = action.astype(np.float32)
    r = reward.astype(np.float32)
    xn = next_state.astype(np.float32)

    obs = to_float_tensor(x, self.policy.use_cuda)
    act = to_float_tensor(u, self.policy.use_cuda)

    v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                   self.mdp_info.gamma, self._lambda)
    np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
    adv = to_float_tensor(np_adv, self.policy.use_cuda)

    # Policy update
    self._old_policy = deepcopy(self.policy)
    old_pol_dist = self._old_policy.distribution_t(obs)
    old_log_prob = self._old_policy.log_prob_t(obs, act).detach()

    zero_grad(self.policy.parameters())
    loss = self._compute_loss(obs, act, adv, old_log_prob)

    prev_loss = loss.item()

    # Compute Gradient
    loss.backward()
    g = get_gradient(self.policy.parameters())

    # Compute direction through conjugate gradient
    stepdir = self._conjugate_gradient(g, obs, old_pol_dist)

    # Line search
    self._line_search(obs, act, adv, old_log_prob, old_pol_dist,
                      prev_loss, stepdir)

    # VF update
    self._V.fit(x, v_target, **self._critic_fit_params)

    # Print fit information
    self._print_fit_info(dataset, x, v_target, old_pol_dist)

    self._iter += 1
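# The `_conjugate_gradient` call above solves a linear system of the form
# F x = g, with F accessed only through Fisher-vector products. The routine
# below is a generic, self-contained conjugate-gradient sketch of that kind of
# solver; `conjugate_gradient_sketch` and `fvp` are hypothetical names, and
# this is a textbook routine, not the method of the class above.
import numpy as np

def conjugate_gradient_sketch(fvp, g, n_iters=10, tol=1e-10):
    # fvp: callable returning the matrix-vector product F v for a float vector v
    # g: 1-D float array (the gradient)
    x = np.zeros_like(g)
    r = g.copy()          # residual g - F x, with x = 0 initially
    p = g.copy()          # search direction
    r_dot = r @ r
    for _ in range(n_iters):
        fp = fvp(p)
        alpha = r_dot / (p @ fp)
        x += alpha * p
        r -= alpha * fp
        new_r_dot = r @ r
        if new_r_dot < tol:
            break
        p = r + (new_r_dot / r_dot) * p
        r_dot = new_r_dot
    return x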
def distribution(self, state): """ Compute the policy distribution in the given states. Args: state (np.ndarray): the set of states where the distribution is computed. Returns: The torch distribution for the provided states. """ s = to_float_tensor(state, self._use_cuda) return self.distribution_t(s)
def entropy(self, state=None): """ Compute the entropy of the policy. Args: state (np.ndarray, None): the set of states to consider. If the entropy of the policy can be computed in closed form, then ``state`` can be None. Returns: The value of the entropy of the policy. """ s = to_float_tensor(state, self._use_cuda) if state is not None else None return self.entropy_t(s).detach().cpu().numpy().item()
def entropy(self, state=None):
    s = to_float_tensor(state, self._use_cuda) if state is not None else None

    return self.entropy_t(s)
def distribution(self, state):
    s = to_float_tensor(state, self._use_cuda)

    return self.distribution_t(s)
def draw_action(self, state):
    with torch.no_grad():
        # Add a batch dimension, sample with the torch policy, then drop
        # the batch dimension and move the action back to numpy.
        s = to_float_tensor(np.atleast_2d(state), self._use_cuda)
        a = self.draw_action_t(s)

    return torch.squeeze(a, dim=0).detach().cpu().numpy()
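# A self-contained illustration of the batching convention used above: a single
# state is promoted to a batch of one with np.atleast_2d, and the batch
# dimension is removed again with torch.squeeze. The shapes and the stand-in
# tensor below are purely illustrative assumptions.
import numpy as np
import torch

state = np.array([0.1, -0.2, 0.3], dtype=np.float32)
batched = np.atleast_2d(state)             # shape (1, 3)
a = torch.zeros(1, 2)                      # stands in for a sampled action batch
single = torch.squeeze(a, dim=0).numpy()   # shape (2,)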
def __call__(self, state, action):
    s = to_float_tensor(state, self._use_cuda)
    a = to_float_tensor(action, self._use_cuda)

    # Probability (density) of the action in the given state.
    return np.exp(self.log_prob_t(s, a).item())