Exemplo n.º 1
0
    def __init__(self, mu_approximator, sigma_approximator, min_a, max_a):
        """
        Constructor.

        Args:
            mu_approximator (Regressor): a regressor computing mean in given a
                state;
            sigma_approximator (Regressor): a regressor computing the variance
                in given a state;
            min_a (np.ndarray): a vector specifying the minimum action value
                for each component;
            max_a (np.ndarray): a vector specifying the maximum action value
                for each component.

        """
        self._mu_approximator = mu_approximator
        self._sigma_approximator = sigma_approximator

        self._delta_a = to_float_tensor(.5 * (max_a - min_a), self.use_cuda)
        self._central_a = to_float_tensor(.5 * (max_a + min_a), self.use_cuda)

        use_cuda = self._mu_approximator.model.use_cuda

        if use_cuda:
            self._delta_a = self._delta_a.cuda()
            self._central_a = self._central_a.cuda()
Exemplo n.º 2
0
    def fit(self, dataset):
        if not self._quiet:
            tqdm.write('Iteration ' + str(self._iter))

        x, u, r, xn, absorbing, last = parse_dataset(dataset)
        x = x.astype(np.float32)
        u = u.astype(np.float32)
        r = r.astype(np.float32)
        xn = xn.astype(np.float32)

        obs = to_float_tensor(x, self.policy.use_cuda)
        act = to_float_tensor(u, self.policy.use_cuda)
        v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last, self.mdp_info.gamma, self._lambda)
        np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
        adv = to_float_tensor(np_adv, self.policy.use_cuda)

        old_pol_dist = self.policy.distribution_t(obs)
        old_log_p = old_pol_dist.log_prob(act)[:, None].detach()

        self._V.fit(x, v_target, **self._critic_fit_params)

        self._update_policy(obs, act, adv, old_log_p)

        # Print fit information
        self._print_fit_info(dataset, x, v_target, old_pol_dist)
        self._iter += 1
Exemplo n.º 3
0
    def _loss(self, state, action, adv):
        use_cuda = self.policy.use_cuda

        s = to_float_tensor(state, use_cuda)
        a = to_float_tensor(action, use_cuda)

        adv_t = to_float_tensor(adv, use_cuda)

        gradient_loss = -torch.mean(self.policy.log_prob_t(s, a) * adv_t)
        entropy_loss = -self.policy.entropy_t(s)

        return gradient_loss + self._entropy_coeff * entropy_loss
Exemplo n.º 4
0
    def fit(self, dataset):
        if not self._quiet:
            tqdm.write('Iteration ' + str(self._iter))

        state, action, reward, next_state, absorbing, last = parse_dataset(
            dataset)
        x = state.astype(np.float32)
        u = action.astype(np.float32)
        r = reward.astype(np.float32)
        xn = next_state.astype(np.float32)

        obs = to_float_tensor(x, self.policy.use_cuda)
        act = to_float_tensor(u, self.policy.use_cuda)
        v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                       self.mdp_info.gamma, self._lambda)
        np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
        adv = to_float_tensor(np_adv, self.policy.use_cuda)

        # Policy update
        self._old_policy = deepcopy(self.policy)
        old_pol_dist = self._old_policy.distribution_t(obs)
        old_log_prob = self._old_policy.log_prob_t(obs, act).detach()

        zero_grad(self.policy.parameters())
        loss = self._compute_loss(obs, act, adv, old_log_prob)

        prev_loss = loss.item()

        # Compute Gradient
        loss.backward()
        g = get_gradient(self.policy.parameters())

        # Compute direction through conjugate gradient
        stepdir = self._conjugate_gradient(g, obs, old_pol_dist)

        # Line search
        self._line_search(obs, act, adv, old_log_prob, old_pol_dist, prev_loss,
                          stepdir)

        # VF update
        self._V.fit(x, v_target, **self._critic_fit_params)

        # Print fit information
        self._print_fit_info(dataset, x, v_target, old_pol_dist)
        self._iter += 1
Exemplo n.º 5
0
    def distribution(self, state):
        """
        Compute the policy distribution in the given states.

        Args:
            state (np.ndarray): the set of states where the distribution is
                computed.

        Returns:
            The torch distribution for the provided states.

        """
        s = to_float_tensor(state, self._use_cuda)

        return self.distribution_t(s)
Exemplo n.º 6
0
    def entropy(self, state=None):
        """
        Compute the entropy of the policy.

        Args:
            state (np.ndarray, None): the set of states to consider. If the
                entropy of the policy can be computed in closed form, then
                ``state`` can be None.

        Returns:
            The value of the entropy of the policy.

        """
        s = to_float_tensor(state, self._use_cuda) if state is not None else None

        return self.entropy_t(s).detach().cpu().numpy().item()
Exemplo n.º 7
0
 def entropy(self, state=None):
     s = to_float_tensor(state,
                         self._use_cuda) if state is not None else None
     return self.entropy_t(s)
Exemplo n.º 8
0
 def distribution(self, state):
     s = to_float_tensor(state, self._use_cuda)
     return self.distribution_t(s)
Exemplo n.º 9
0
    def draw_action(self, state):
        with torch.no_grad():
            s = to_float_tensor(np.atleast_2d(state), self._use_cuda)
            a = self.draw_action_t(s)

        return torch.squeeze(a, dim=0).detach().cpu().numpy()
Exemplo n.º 10
0
    def __call__(self, state, action):
        s = to_float_tensor(state, self._use_cuda)
        a = to_float_tensor(action, self._use_cuda)

        return np.exp(self.log_prob_t(s, a).item())