Example #1
    def fit(self, dataset):
        if not self._quiet:
            tqdm.write('Iteration ' + str(self._iter))

        x, u, r, xn, absorbing, last = parse_dataset(dataset)
        x = x.astype(np.float32)
        u = u.astype(np.float32)
        r = r.astype(np.float32)
        xn = xn.astype(np.float32)

        obs = to_float_tensor(x, self.policy.use_cuda)
        act = to_float_tensor(u, self.policy.use_cuda)
        v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                       self.mdp_info.gamma, self._lambda)
        np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
        adv = to_float_tensor(np_adv, self.policy.use_cuda)

        old_pol_dist = self.policy.distribution_t(obs)
        old_log_p = old_pol_dist.log_prob(act)[:, None].detach()

        self._V.fit(x, v_target, **self._critic_fit_params)

        self._update_policy(obs, act, adv, old_log_p)

        # Print fit information
        self._print_fit_info(dataset, x, v_target, old_pol_dist)
        self._iter += 1
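All of the examples on this page convert NumPy arrays into torch tensors with to_float_tensor before feeding them to the policy or the critic. The helper itself is not shown here; the following is a minimal sketch of what such a conversion utility typically looks like, inferred from how it is called above (an illustration, not the library's exact implementation).

    import numpy as np
    import torch

    def to_float_tensor(x, use_cuda=False):
        # Convert a NumPy array (or anything array-like) into a float32 tensor,
        # optionally moving it to the GPU.
        t = torch.as_tensor(np.asarray(x), dtype=torch.float32)
        return t.cuda() if use_cuda else t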
Example #2
    def __init__(self, mu_approximator, sigma_approximator, min_a, max_a):
        """
        Constructor.

        Args:
            mu_approximator (Regressor): a regressor computing the mean in a
                given state;
            sigma_approximator (Regressor): a regressor computing the variance
                in a given state;
            min_a (np.ndarray): a vector specifying the minimum action value
                for each component;
            max_a (np.ndarray): a vector specifying the maximum action value
                for each component.

        """
        self._mu_approximator = mu_approximator
        self._sigma_approximator = sigma_approximator

        self._delta_a = to_float_tensor(.5 * (max_a - min_a), self.use_cuda)
        self._central_a = to_float_tensor(.5 * (max_a + min_a), self.use_cuda)

        use_cuda = self._mu_approximator.model.use_cuda

        if use_cuda:
            self._delta_a = self._delta_a.cuda()
            self._central_a = self._central_a.cuda()
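The _delta_a and _central_a tensors store the half-range and midpoint of the action bounds. A common way for such a policy to map an unbounded network output into [min_a, max_a] is a tanh squashing followed by rescaling; the sketch below only illustrates that idea (hypothetical helper, assuming a raw action tensor a_raw).

    import torch

    def squash_action(a_raw, central_a, delta_a):
        # tanh bounds the raw sample to (-1, 1); rescaling by the half-range and
        # shifting by the midpoint maps it into [min_a, max_a].
        return central_a + delta_a * torch.tanh(a_raw)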
Example #3
    def _loss(self, state, action, adv):
        use_cuda = self.policy.use_cuda

        s = to_float_tensor(state, use_cuda)
        a = to_float_tensor(action, use_cuda)

        adv_t = to_float_tensor(adv, use_cuda)

        # Policy-gradient term: advantage-weighted negative log-likelihood
        gradient_loss = -torch.mean(self.policy.log_prob_t(s, a) * adv_t)
        # Negated entropy, so minimizing the total loss encourages exploration
        entropy_loss = -self.policy.entropy_t(s)

        return gradient_loss + self._entropy_coeff() * entropy_loss
Example #4
    def fit(self, dataset):
        if not self._quiet:
            tqdm.write('Iteration ' + str(self._iter))

        state, action, reward, next_state, absorbing, last = parse_dataset(
            dataset)
        x = state.astype(np.float32)
        u = action.astype(np.float32)
        r = reward.astype(np.float32)
        xn = next_state.astype(np.float32)

        obs = to_float_tensor(x, self.policy.use_cuda)
        act = to_float_tensor(u, self.policy.use_cuda)
        v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                       self.mdp_info.gamma, self._lambda)
        np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
        adv = to_float_tensor(np_adv, self.policy.use_cuda)

        # Policy update
        self._old_policy = deepcopy(self.policy)
        old_pol_dist = self._old_policy.distribution_t(obs)
        old_log_prob = self._old_policy.log_prob_t(obs, act).detach()

        zero_grad(self.policy.parameters())
        loss = self._compute_loss(obs, act, adv, old_log_prob)

        prev_loss = loss.item()

        # Compute Gradient
        loss.backward()
        g = get_gradient(self.policy.parameters())

        # Compute direction through conjugate gradient
        stepdir = self._conjugate_gradient(g, obs, old_pol_dist)

        # Line search
        self._line_search(obs, act, adv, old_log_prob, old_pol_dist, prev_loss,
                          stepdir)

        # VF update
        self._V.fit(x, v_target, **self._critic_fit_params)

        # Print fit information
        self._print_fit_info(dataset, x, v_target, old_pol_dist)
        self._iter += 1
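In this fit, _conjugate_gradient approximately solves F x = g, where F is the Fisher information matrix of the old policy distribution and g is the policy gradient, without ever forming F explicitly. The sketch below shows a standard conjugate-gradient loop driven by a matrix-vector product callback; the callback name fvp (Fisher-vector product), the iteration count and the tolerance are assumptions, not the library's exact code.

    import numpy as np

    def conjugate_gradient(fvp, g, n_iters=10, tol=1e-10):
        # Iteratively solve F x = g using only Fisher-vector products fvp(v) = F v.
        x = np.zeros_like(g)
        r = g.copy()          # residual g - F x (x starts at zero)
        p = g.copy()          # search direction
        rs_old = r.dot(r)
        for _ in range(n_iters):
            Ap = fvp(p)
            alpha = rs_old / (p.dot(Ap) + 1e-12)
            x += alpha * p
            r -= alpha * Ap
            rs_new = r.dot(r)
            if rs_new < tol:
                break
            p = r + (rs_new / rs_old) * p
            rs_old = rs_new
        return x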
Example #5
    def __init__(self, mu, scale, dim, use_cuda):
        """
        Constructor.

        Args:
            mu (np.ndarray): centers of the Gaussian RBFs;
            scale (np.ndarray): scales for the RBFs;
            dim (np.ndarray): list of dimensions to be considered for the computation of the features;
            use_cuda (bool): whether to use cuda for the computation or not.

        """
        self._mu = to_float_tensor(mu, use_cuda)
        self._scale = to_float_tensor(scale, use_cuda)
        if dim is not None:
            self._dim = to_int_tensor(dim, use_cuda)
        else:
            self._dim = None

        self._use_cuda = use_cuda
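Given the stored centers _mu and widths _scale, the forward pass of a Gaussian RBF feature tensor typically evaluates one Gaussian bump per center. The following sketch only illustrates that computation under those assumptions; it is not the class's actual forward method.

    import torch

    def gaussian_rbf_forward(x, mu, scale, dim=None):
        # x: (batch, n_inputs), mu: (n_centers, n_inputs), scale: (n_centers, n_inputs)
        if dim is not None:
            x = x[:, dim]                       # keep only the selected input dimensions
        delta = x[:, None, :] - mu[None, :, :]  # (batch, n_centers, n_inputs)
        return torch.exp(-torch.sum(delta ** 2 / scale, dim=-1))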
Example #6
    def __init__(self, P, phi, nu, use_cuda):
        r"""
        Constructor.

        Args:
            P (np.ndarray): weights matrix, every weight should be drawn from a normal distribution;
            phi (np.ndarray): bias vector, every element should be drawn from a uniform distribution in the interval
                [-\pi, \pi);
            nu (float): bandwidth parameter, it should be chosen approximately as the average pairwise distance
                between different observation vectors;
            use_cuda (bool): whether to use cuda for the computation or not.

        """
        self._P = to_float_tensor(P, use_cuda)
        self._phi = to_float_tensor(phi, use_cuda)
        self._nu = nu

        self._use_cuda = use_cuda
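With the stored projection matrix _P, phase vector _phi and bandwidth _nu, random Fourier features are usually computed as the cosine of a random linear projection of the input (the Rahimi-Recht construction). The sketch below shows that mapping; the exact placement of the bandwidth scaling is assumed from the docstring.

    import torch

    def random_fourier_forward(x, P, phi, nu):
        # x: (batch, n_inputs), P: (n_inputs, n_features), phi: (n_features,)
        # Project the input, rescale by the bandwidth and apply the cosine non-linearity.
        return torch.cos(x @ P / nu + phi)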
Example #7
    def __init__(self, mu_approximator, sigma_approximator, min_a, max_a,
                 log_std_min, log_std_max):
        """
        Constructor.

        Args:
            mu_approximator (Regressor): a regressor computing the mean in a
                given state;
            sigma_approximator (Regressor): a regressor computing the variance
                in a given state;
            min_a (np.ndarray): a vector specifying the minimum action value
                for each component;
            max_a (np.ndarray): a vector specifying the maximum action value
                for each component;
            log_std_min ([float, Parameter]): min value for the policy log std;
            log_std_max ([float, Parameter]): max value for the policy log std.

        """
        self._mu_approximator = mu_approximator
        self._sigma_approximator = sigma_approximator

        self._delta_a = to_float_tensor(.5 * (max_a - min_a), self.use_cuda)
        self._central_a = to_float_tensor(.5 * (max_a + min_a), self.use_cuda)

        self._log_std_min = to_parameter(log_std_min)
        self._log_std_max = to_parameter(log_std_max)

        self._eps_log_prob = 1e-6

        use_cuda = self._mu_approximator.model.use_cuda

        if use_cuda:
            self._delta_a = self._delta_a.cuda()
            self._central_a = self._central_a.cuda()

        self._add_save_attr(_mu_approximator='mushroom',
                            _sigma_approximator='mushroom',
                            _delta_a='torch',
                            _central_a='torch',
                            _log_std_min='mushroom',
                            _log_std_max='mushroom',
                            _eps_log_prob='primitive')
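The log_std_min and log_std_max parameters bound the log standard deviation predicted by sigma_approximator, which keeps the policy's variance numerically stable during training. A minimal sketch of how such a bound is commonly applied (hypothetical helper; to_parameter wraps the bounds so they are called to obtain their current values):

    import torch

    def clamp_log_std(log_sigma, log_std_min, log_std_max):
        # Clip the predicted log standard deviation into [log_std_min, log_std_max]
        # before exponentiating it into a standard deviation.
        return torch.clamp(log_sigma, log_std_min, log_std_max).exp()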
Example #8
    def __call__(self, *args):
        x = self._concatenate(args)

        x = to_float_tensor(np.atleast_2d(x))

        y_list = [self._phi[i].forward(x) for i in range(len(self._phi))]
        y = torch.cat(y_list, 1).squeeze()

        y = y.detach().numpy()

        if y.shape[0] == 1:
            return y[0]
        else:
            return y
Example #9
    def distribution(self, state):
        """
        Compute the policy distribution in the given states.

        Args:
            state (np.ndarray): the set of states where the distribution is
                computed.

        Returns:
            The torch distribution for the provided states.

        """
        s = to_float_tensor(state, self._use_cuda)

        return self.distribution_t(s)
Example #10
    def entropy(self, state=None):
        """
        Compute the entropy of the policy.

        Args:
            state (np.ndarray, None): the set of states to consider. If the
                entropy of the policy can be computed in closed form, then
                ``state`` can be None.

        Returns:
            The value of the entropy of the policy.

        """
        s = to_float_tensor(state, self._use_cuda) if state is not None else None

        return self.entropy_t(s).detach().cpu().numpy().item()
Example #11
    def draw_action(self, state):
        with torch.no_grad():
            s = to_float_tensor(np.atleast_2d(state), self._use_cuda)
            a = self.draw_action_t(s)

        return torch.squeeze(a, dim=0).detach().cpu().numpy()
Example #12
    def __call__(self, state, action):
        s = to_float_tensor(np.atleast_2d(state), self._use_cuda)
        a = to_float_tensor(np.atleast_2d(action), self._use_cuda)

        return np.exp(self.log_prob_t(s, a).item())