def fit(self, dataset):
    if not self._quiet:
        tqdm.write('Iteration ' + str(self._iter))

    x, u, r, xn, absorbing, last = parse_dataset(dataset)
    x = x.astype(np.float32)
    u = u.astype(np.float32)
    r = r.astype(np.float32)
    xn = xn.astype(np.float32)

    obs = to_float_tensor(x, self.policy.use_cuda)
    act = to_float_tensor(u, self.policy.use_cuda)
    v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                   self.mdp_info.gamma, self._lambda)
    np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
    adv = to_float_tensor(np_adv, self.policy.use_cuda)

    old_pol_dist = self.policy.distribution_t(obs)
    old_log_p = old_pol_dist.log_prob(act)[:, None].detach()

    self._V.fit(x, v_target, **self._critic_fit_params)

    self._update_policy(obs, act, adv, old_log_p)

    # Print fit information
    self._print_fit_info(dataset, x, v_target, old_pol_dist)
    self._iter += 1
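# compute_gae above encapsulates generalized advantage estimation (GAE). A
# minimal numpy sketch of the backward recursion, assuming flat per-step value
# predictions `v` and `vn` for the current and next states (an illustrative
# sketch, not the library implementation):
import numpy as np

def gae_sketch(v, vn, r, absorbing, last, gamma, lam):
    adv = np.zeros_like(r)
    gae = 0.
    for t in reversed(range(len(r))):
        delta = r[t] + gamma * vn[t] * (1. - absorbing[t]) - v[t]
        # Reset the accumulator at episode boundaries
        gae = delta if last[t] else delta + gamma * lam * gae
        adv[t] = gae
    return v + adv, adv  # value targets and advantages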
def __init__(self, mu_approximator, sigma_approximator, min_a, max_a):
    """
    Constructor.

    Args:
        mu_approximator (Regressor): a regressor computing the mean given a
            state;
        sigma_approximator (Regressor): a regressor computing the variance
            given a state;
        min_a (np.ndarray): a vector specifying the minimum action value
            for each component;
        max_a (np.ndarray): a vector specifying the maximum action value
            for each component.

    """
    self._mu_approximator = mu_approximator
    self._sigma_approximator = sigma_approximator

    self._delta_a = to_float_tensor(.5 * (max_a - min_a), self.use_cuda)
    self._central_a = to_float_tensor(.5 * (max_a + min_a), self.use_cuda)

    use_cuda = self._mu_approximator.model.use_cuda

    if use_cuda:
        self._delta_a = self._delta_a.cuda()
        self._central_a = self._central_a.cuda()
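# _delta_a and _central_a store the half-range and midpoint of the action box.
# A minimal sketch of how such quantities are typically used to rescale an
# action squashed into [-1, 1] (e.g. by tanh) into [min_a, max_a]; this is an
# illustrative assumption, not a copy of the library code:
import numpy as np

def rescale_action_sketch(raw_action, min_a, max_a):
    delta_a = .5 * (max_a - min_a)
    central_a = .5 * (max_a + min_a)
    squashed = np.tanh(raw_action)          # in [-1, 1]
    return central_a + delta_a * squashed   # in [min_a, max_a]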
def _loss(self, state, action, adv):
    use_cuda = self.policy.use_cuda

    s = to_float_tensor(state, use_cuda)
    a = to_float_tensor(action, use_cuda)
    adv_t = to_float_tensor(adv, use_cuda)

    # Score-function surrogate: maximize log-probability weighted by advantage
    gradient_loss = -torch.mean(self.policy.log_prob_t(s, a) * adv_t)
    # Entropy bonus encourages exploration
    entropy_loss = -self.policy.entropy_t(s)

    return gradient_loss + self._entropy_coeff() * entropy_loss
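# The loss above corresponds to -E[log pi(a|s) * A] - c * H(pi). A minimal
# torch sketch with a diagonal Gaussian policy, illustrative only and not the
# library implementation:
import torch

def pg_entropy_loss_sketch(mean, log_std, actions, adv, entropy_coeff=1e-2):
    dist = torch.distributions.Normal(mean, log_std.exp())
    log_prob = dist.log_prob(actions).sum(-1)
    gradient_loss = -torch.mean(log_prob * adv)
    entropy_loss = -dist.entropy().sum(-1).mean()
    return gradient_loss + entropy_coeff * entropy_loss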
def fit(self, dataset):
    if not self._quiet:
        tqdm.write('Iteration ' + str(self._iter))

    state, action, reward, next_state, absorbing, last = parse_dataset(
        dataset)
    x = state.astype(np.float32)
    u = action.astype(np.float32)
    r = reward.astype(np.float32)
    xn = next_state.astype(np.float32)

    obs = to_float_tensor(x, self.policy.use_cuda)
    act = to_float_tensor(u, self.policy.use_cuda)
    v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                   self.mdp_info.gamma, self._lambda)
    np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
    adv = to_float_tensor(np_adv, self.policy.use_cuda)

    # Policy update
    self._old_policy = deepcopy(self.policy)
    old_pol_dist = self._old_policy.distribution_t(obs)
    old_log_prob = self._old_policy.log_prob_t(obs, act).detach()

    zero_grad(self.policy.parameters())
    loss = self._compute_loss(obs, act, adv, old_log_prob)

    prev_loss = loss.item()

    # Compute gradient
    loss.backward()
    g = get_gradient(self.policy.parameters())

    # Compute direction through conjugate gradient
    stepdir = self._conjugate_gradient(g, obs, old_pol_dist)

    # Line search
    self._line_search(obs, act, adv, old_log_prob, old_pol_dist, prev_loss,
                      stepdir)

    # VF update
    self._V.fit(x, v_target, **self._critic_fit_params)

    # Print fit information
    self._print_fit_info(dataset, x, v_target, old_pol_dist)
    self._iter += 1
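# _conjugate_gradient above solves F x = g, where F is the Fisher information
# matrix of the old policy, using only matrix-vector products. A generic numpy
# sketch of conjugate gradient given a matrix-vector product callable `Avp`
# (illustrative sketch, not the library implementation):
import numpy as np

def conjugate_gradient_sketch(Avp, g, n_iters=10, tol=1e-10):
    x = np.zeros_like(g)
    r = g.copy()
    p = g.copy()
    r_dot = r.dot(r)
    for _ in range(n_iters):
        Ap = Avp(p)
        alpha = r_dot / (p.dot(Ap) + 1e-12)
        x += alpha * p
        r -= alpha * Ap
        new_r_dot = r.dot(r)
        if new_r_dot < tol:
            break
        p = r + (new_r_dot / r_dot) * p
        r_dot = new_r_dot
    return x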
def __init__(self, mu, scale, dim, use_cuda):
    """
    Constructor.

    Args:
        mu (np.ndarray): centers of the gaussian RBFs;
        scale (np.ndarray): scales for the RBFs;
        dim (np.ndarray): list of the dimensions to be considered for the
            computation of the features;
        use_cuda (bool): whether to use cuda for the computation or not.

    """
    self._mu = to_float_tensor(mu, use_cuda)
    self._scale = to_float_tensor(scale, use_cuda)

    if dim is not None:
        self._dim = to_int_tensor(dim, use_cuda)
    else:
        self._dim = None

    self._use_cuda = use_cuda
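# The tensors stored above parameterise Gaussian radial basis functions. A
# minimal numpy sketch of one common functional form, with one feature per
# center mu_i and per-dimension scales (an illustrative assumption, not the
# library implementation):
import numpy as np

def gaussian_rbf_sketch(x, mu, scale):
    # phi_i(x) = exp(-sum_j (x_j - mu_ij)^2 / scale_ij)
    return np.exp(-np.sum((x - mu) ** 2 / scale, axis=-1))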
def __init__(self, P, phi, nu, use_cuda):
    r"""
    Constructor.

    Args:
        P (np.ndarray): weights matrix, every weight should be drawn from a
            normal distribution;
        phi (np.ndarray): bias vector, every weight should be drawn from a
            uniform distribution in the interval [-\pi, \pi);
        nu (float): bandwidth parameter, it should be chosen approximately
            as the average pairwise distance between different observation
            vectors;
        use_cuda (bool): whether to use cuda for the computation or not.

    """
    self._P = to_float_tensor(P, use_cuda)
    self._phi = to_float_tensor(phi, use_cuda)
    self._nu = nu

    self._use_cuda = use_cuda
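# With a Gaussian projection matrix P, a uniform phase vector phi and a
# bandwidth nu, a random Fourier feature map is typically computed as
# sin((x @ P) / nu + phi). A minimal numpy sketch under that assumption
# (illustrative, not the library implementation):
import numpy as np

def random_fourier_sketch(x, P, phi, nu):
    return np.sin(x @ P / nu + phi)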
def __init__(self, mu_approximator, sigma_approximator, min_a, max_a,
             log_std_min, log_std_max):
    """
    Constructor.

    Args:
        mu_approximator (Regressor): a regressor computing the mean given a
            state;
        sigma_approximator (Regressor): a regressor computing the variance
            given a state;
        min_a (np.ndarray): a vector specifying the minimum action value
            for each component;
        max_a (np.ndarray): a vector specifying the maximum action value
            for each component;
        log_std_min ([float, Parameter]): min value for the policy log std;
        log_std_max ([float, Parameter]): max value for the policy log std.

    """
    self._mu_approximator = mu_approximator
    self._sigma_approximator = sigma_approximator

    self._delta_a = to_float_tensor(.5 * (max_a - min_a), self.use_cuda)
    self._central_a = to_float_tensor(.5 * (max_a + min_a), self.use_cuda)

    self._log_std_min = to_parameter(log_std_min)
    self._log_std_max = to_parameter(log_std_max)

    self._eps_log_prob = 1e-6

    use_cuda = self._mu_approximator.model.use_cuda

    if use_cuda:
        self._delta_a = self._delta_a.cuda()
        self._central_a = self._central_a.cuda()

    self._add_save_attr(
        _mu_approximator='mushroom',
        _sigma_approximator='mushroom',
        _delta_a='torch',
        _central_a='torch',
        _log_std_min='mushroom',
        _log_std_max='mushroom',
        _eps_log_prob='primitive')
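# _log_std_min and _log_std_max are typically used to clamp the predicted log
# standard deviation before exponentiating, keeping the policy variance in a
# numerically safe range. A minimal torch sketch of that use (an illustrative
# assumption, not the library code):
import torch

def clamped_std_sketch(log_sigma, log_std_min=-20., log_std_max=2.):
    log_sigma = torch.clamp(log_sigma, log_std_min, log_std_max)
    return log_sigma.exp()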
def __call__(self, *args):
    x = self._concatenate(args)

    x = to_float_tensor(np.atleast_2d(x))
    y_list = [self._phi[i].forward(x) for i in range(len(self._phi))]
    y = torch.cat(y_list, 1).squeeze()

    y = y.detach().numpy()

    if y.shape[0] == 1:
        return y[0]
    else:
        return y
def distribution(self, state):
    """
    Compute the policy distribution in the given states.

    Args:
        state (np.ndarray): the set of states where the distribution is
            computed.

    Returns:
        The torch distribution for the provided states.

    """
    s = to_float_tensor(state, self._use_cuda)

    return self.distribution_t(s)
def entropy(self, state=None):
    """
    Compute the entropy of the policy.

    Args:
        state (np.ndarray, None): the set of states to consider. If the
            entropy of the policy can be computed in closed form, then
            ``state`` can be None.

    Returns:
        The value of the entropy of the policy.

    """
    s = to_float_tensor(state, self._use_cuda) if state is not None else None

    return self.entropy_t(s).detach().cpu().numpy().item()
def draw_action(self, state):
    with torch.no_grad():
        s = to_float_tensor(np.atleast_2d(state), self._use_cuda)
        a = self.draw_action_t(s)

    return torch.squeeze(a, dim=0).detach().cpu().numpy()
def __call__(self, state, action):
    s = to_float_tensor(np.atleast_2d(state), self._use_cuda)
    a = to_float_tensor(np.atleast_2d(action), self._use_cuda)

    return np.exp(self.log_prob_t(s, a).item())