Example #1
    def _enforce_act_expl_bounds(log_probs: to.Tensor,
                                 act_expl: to.Tensor,
                                 eps: float = 1e-6):
        r"""
        Transform the `log_probs` accounting for the squashed tanh exploration.

        .. seealso::
            Eq. (21) in [2]

        :param log_probs: $\log( \mu(u|s) )$
        :param act_expl: action values with explorative noise
        :param eps: additive term for numerical stability of the logarithm function
        :return: $\log( \pi(a|s) )$
        """
        # Batch dim along the first dim
        act_expl_ = atleast_2D(act_expl)
        log_probs_ = atleast_2D(log_probs)

        # Sum over action dimensions
        log_probs_ = to.sum(
            log_probs_ -
            to.log(to.ones_like(act_expl_) - to.pow(act_expl_, 2) + eps),
            1,
            keepdim=True)
        if act_expl_.shape[0] > 1:
            return log_probs_  # batched mode
        else:
            return log_probs_.squeeze(1)  # one sample at a time
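Below is a minimal, self-contained sketch of the same tanh-squashing correction from Eq. (21), assuming a standard Normal base distribution; the tensors here are made up and only illustrate the formula:

import torch as to

# log pi(a|s) = sum_i [ log mu(u_i|s) - log(1 - a_i^2 + eps) ]
dist = to.distributions.Normal(to.zeros(3), to.ones(3))
u = dist.sample((5,))  # unsquashed actions, batch of 5
a = to.tanh(u)  # squashed actions with explorative noise
eps = 1e-6
log_pi = to.sum(dist.log_prob(u) - to.log(1. - a.pow(2) + eps), dim=1, keepdim=True)
print(log_pi.shape)  # torch.Size([5, 1]), one log-probability per batch sample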
Example #2
def test_atleast_2D(x):
    x_al2d = atleast_2D(x)
    assert x_al2d.ndim >= 2

    # We want to mimic the numpy function
    x_np = np.atleast_2d(x.numpy())
    assert np.all(x_al2d.numpy() == x_np)
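For reference, a hypothetical re-implementation that would satisfy this test by mimicking np.atleast_2d; pyrado's actual atleast_2D may be written differently:

import torch as to

def atleast_2D_sketch(x: to.Tensor) -> to.Tensor:
    # 0-dim -> (1, 1), 1-dim (n,) -> (1, n), higher-dim tensors are returned unchanged
    if x.ndim == 0:
        return x.view(1, 1)
    if x.ndim == 1:
        return x.unsqueeze(0)
    return x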
Example #3
    def derivative(self, inp: to.Tensor) -> to.Tensor:
        """
        Compute the derivative of the features w.r.t. the inputs.

        .. note::
            Only processing of 1-dim input (e.g., no images)! The input can be batched along the first dimension.

        :param inp: input, i.e. the observations in the RL setting
        :return: values of all feature derivatives given the observations
        """

        if inp.ndimension() > 2:
            raise pyrado.ShapeErr(msg='RBF class can only handle 1-dim or 2-dim input!')
        inp = atleast_2D(inp)  # first dim is the batch size, the second dim is the actual input dimension
        inp = inp.reshape(inp.shape[0], 1, inp.shape[1]).repeat(1, self.centers.shape[0], 1)  # reshape explicitly

        exp_sq_dist = to.exp(-self.scale*to.pow(inp - self.centers, 2))
        exp_sq_dist_d = -2*self.scale * (inp - self.centers)

        feat_val = to.empty(inp.shape[0], self.num_feat)
        feat_val_dot = to.empty(inp.shape[0], self.num_feat)

        for i, (sample, sample_d) in enumerate(zip(exp_sq_dist, exp_sq_dist_d)):
            if self._state_wise_norm:
                # Normalize the features such that the activation for every state dimension sums up to one
                feat_val[i, :] = normalize(sample, axis=0, order=1).reshape(-1, )
            else:
                # Turn the features into a vector and normalize over all of them
                feat_val[i, :] = normalize(sample.t().reshape(-1, ), axis=-1, order=1)

            feat_val_dot[i, :] = sample_d.squeeze() * feat_val[i, :] - feat_val[i, :] * sum(sample_d.squeeze() * feat_val[i, :])

        return feat_val_dot
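As a standalone sanity check, the analytic derivative of a single unnormalized Gaussian RBF feature can be compared against autograd; this toy snippet only illustrates the chain rule behind exp_sq_dist_d and is not part of the class:

import torch as to

scale, c = 2.0, 0.5
x = to.tensor(0.3, requires_grad=True)
feat = to.exp(-scale*to.pow(x - c, 2))
feat.backward()
analytic = -2*scale*(x - c)*feat  # mirrors exp_sq_dist_d scaled by the feature value
print(to.allclose(x.grad, analytic))  # True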
Example #4
    def __init__(self, inp_dim: int, num_feat_per_dim: int,
                 bandwidth: Union[float, np.ndarray, to.Tensor]):
        r"""
        Gaussian kernel: $k(x,y) = \exp(-\sigma^2 / (2 d) \cdot ||x-y||^2)$.
        Sample from $\mathcal{N}(0,1)$ and scale the result by $\sigma / \sqrt{2 d}$.

        :param inp_dim: flat dimension of the inputs, i.e. the observations, called $d$ in [1]
        :param num_feat_per_dim: number of random Fourier features, called $D$ in [1]. In contrast to the `RBFFeat`
                                 class, the output dimensionality, and thus the number of associated policy parameters,
                                 is `num_feat_per_dim` and not `num_feat_per_dim * inp_dim`.
        :param bandwidth: scaling factor for the sampled frequencies.
                          Pass a constant, for example `env.obs_space.bound_up`.
                          According to [1] and the note above, we should use $d$ here.
                          Strictly speaking, it is not a bandwidth, since it is not a frequency.
        """
        self.num_feat_per_dim = num_feat_per_dim
        self.scale = to.sqrt(to.tensor(2. / num_feat_per_dim))
        # Sample omega from a standardized normal distribution
        self.freq = to.randn(num_feat_per_dim, inp_dim)
        # Scale the frequency matrix with the bandwidth factor
        if not isinstance(bandwidth, to.Tensor):
            bandwidth = to.from_numpy(np.asanyarray(bandwidth))
        self.freq *= to.sqrt(to.tensor(2.) / atleast_2D(bandwidth))
        # Sample b from a uniform distribution [0, 2pi]
        self.shift = 2 * np.pi * to.rand(num_feat_per_dim)
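A quick empirical check of this construction (a sketch with made-up inputs, not part of the class): with freq drawn from a standard normal and shift from [0, 2*pi], the inner product of two feature vectors approximates the Gaussian kernel for large D:

import numpy as np
import torch as to

d, D = 3, 10000
freq = to.randn(D, d)
shift = 2*np.pi*to.rand(D)
scale = to.sqrt(to.tensor(2./D))

x, y = to.randn(d), to.randn(d)
phi_x = scale*to.cos(freq@x + shift)
phi_y = scale*to.cos(freq@y + shift)
print(phi_x@phi_y, to.exp(-0.5*to.sum(to.pow(x - y, 2))))  # approximately equal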
Example #5
    def _build_q_table(self, obs: to.Tensor) -> (to.Tensor, to.Tensor, int):
        """
        Compute the state-action values for the given observations and all possible actions.
        Since we operate on a discrete action space, we can construct a table (here for 3 actions)
        o_1 a_1
        o_1 a_2
        o_1 a_3
          ...
        o_N a_1
        o_N a_2
        o_N a_3

        :param obs: current observations
        :return: Q-values for all state-action combinations (dimension batch_size x act_space_flat_dim),
                 the indices of the Q-value-maximizing actions, and the batch size
        """
        # Create batched state-action table
        obs = atleast_2D(obs)  # batch dim is along first axis
        columns_obs = obs.repeat_interleave(
            repeats=self.env_spec.act_space.flat_dim, dim=0)
        columns_act = to.tensor(self.env_spec.act_space.eles).repeat(
            obs.shape[0], 1)

        # Batch process via PyTorch Module class
        q_vals = self.net(to.cat([columns_obs, columns_act], dim=1))

        # Reshape (different actions are over columns)
        q_vals = q_vals.reshape(-1, self.env_spec.act_space.flat_dim)

        # Select the action that maximizes the Q-value
        argmax_act_idcs = to.argmax(q_vals, dim=1)

        return q_vals, argmax_act_idcs, obs.shape[0]
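A toy illustration of the batched state-action table built above; the numbers are made up and only show how repeat_interleave and repeat mesh:

import torch as to

obs = to.tensor([[1.], [2.]])  # N = 2 observations
acts = to.tensor([[-1.], [0.], [1.]])  # 3 discrete actions
columns_obs = obs.repeat_interleave(repeats=3, dim=0)  # o1 o1 o1 o2 o2 o2
columns_act = acts.repeat(2, 1)  # a1 a2 a3 a1 a2 a3
print(to.cat([columns_obs, columns_act], dim=1))  # 6 rows, every obs paired with every action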
Example #6
    def forward(self, obs: to.Tensor) -> to.Tensor:
        # Get the Q-values from the owned FNN
        obs = atleast_2D(obs)
        batch_dim = obs.shape[0]
        q_vals = self.q_values(obs)

        # Select the actions with the highest Q-value
        act_idcs = to.argmax(q_vals, dim=1)
        all_acts = to.tensor(self.env_spec.act_space.eles).view(1, -1)  # get all possible (discrete) actions
        acts = all_acts.repeat(batch_dim, 1)
        if batch_dim == 1:
            return acts.gather(dim=1, index=act_idcs.view(-1, 1)).squeeze(0)  # select column
        else:
            return acts.gather(dim=1, index=act_idcs.view(-1, 1))  # select columns
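To see how the gather call picks the greedy action per row, here is a small example with made-up Q-values:

import torch as to

q_vals = to.tensor([[0.1, 0.9, 0.3],
                    [0.7, 0.2, 0.5]])
all_acts = to.tensor([[-1., 0., 1.]]).repeat(2, 1)
act_idcs = to.argmax(q_vals, dim=1)  # tensor([1, 0])
print(all_acts.gather(dim=1, index=act_idcs.view(-1, 1)))  # tensor([[0.], [-1.]])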
Example #7
    def q_values_chosen(self, obs: to.Tensor) -> to.Tensor:
        """
        Compute the state-action values for the given observations and all possible actions.
        Since we operate on a discrete ation space, we can construct a table.

        :param obs: obersvations
        :return: Q-values for all state-action combinations, dimension equals flat action space dimension
        """

        # Get the Q-values from the owned FNN
        obs = atleast_2D(obs)
        q_vals = self.q_values(obs)

        # Select the Q-values of the actions that the policy would have selected
        act_idcs = to.argmax(q_vals, dim=1)
        return q_vals.gather(dim=1, index=act_idcs.view(-1, 1)).squeeze(1)  # select columns
Example #8
    def __call__(self, inp: to.Tensor) -> to.Tensor:
        """
        Evaluate the features, see [1].

        .. note::
            Only processing of 1-dim input (e.g., no images)! The input can be batched along the first dimension.

        :param inp: input, i.e. the observations in the RL setting
        :return: 1-dim vector of all feature values given the observations
        """
        if inp.ndimension() > 2:
            raise pyrado.ShapeErr(msg='RBF class can only handle 1-dim or 2-dim input!')
        inp = atleast_2D(inp)  # first dim is the batch size, the second dim is the actual input dimension

        # Resize if batched and return the feature value
        shift = self.shift.repeat(inp.shape[0], 1)
        return self.scale*to.cos(inp @ self.freq.t() + shift)
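A hypothetical shape check for this batched evaluation, with the instance attributes replaced by local stand-in tensors:

import numpy as np
import torch as to

N, d, D = 4, 3, 16
inp, freq = to.randn(N, d), to.randn(D, d)
shift = (2*np.pi*to.rand(D)).repeat(N, 1)  # resize as in the method above
feat = to.sqrt(to.tensor(2./D))*to.cos(inp @ freq.t() + shift)
print(feat.shape)  # torch.Size([4, 16]), one D-dim feature vector per sample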
Example #9
    def q_values(self, obs: to.Tensor) -> to.Tensor:
        """
        Compute the state-action values for the given observations and all possible actions.
        Since we operate on a discrete action space, we can construct a table
        o1 a1
        o1 a2
         ...
        oN a1
        oN a2

        :param obs: observations
        :return: Q-values for all state-action combinations, dim = batch_size x act_space_flat_dim
        """
        # Create batched state-action table
        obs = atleast_2D(obs)  # batch dim is along first axis
        columns_obs = obs.repeat_interleave(repeats=self.env_spec.act_space.flat_dim, dim=0)
        columns_act = to.tensor(self.env_spec.act_space.eles).repeat(obs.shape[0], 1)

        # Batch process via PyTorch Module class
        q_vals = self.net(to.cat([columns_obs, columns_act], dim=1))

        # Return reshaped tensor (different actions are over columns)
        return q_vals.view(-1, self.env_spec.act_space.flat_dim)
Example #10
    def __init__(self,
                 data: to.Tensor,
                 window_size: int,
                 ratio_train: float,
                 standardize_data: bool = False,
                 scale_min_max_data: bool = False,
                 name: str = 'Unnamed data set'):
        r"""
        Constructor

        :param data: complete raw data set, where the samples are along the first dimension
        :param window_size: length of the sequences fed to the policy for predicting the next value
        :param ratio_train: ratio of the training samples w.r.t. the total sample count
        :param standardize_data: if `True`, the data is standardized to be $\sim \mathcal{N}(0,1)$
        :param scale_min_max_data: if `True`, the data is scaled to be $\in [-1, 1]$
        :param name: descriptive name for the data set
        """
        if not isinstance(data, to.Tensor):
            raise pyrado.TypeErr(given=data, expected_type=to.Tensor)
        if not isinstance(window_size, int):
            raise pyrado.TypeErr(given=window_size, expected_type=int)
        if window_size < 1:
            raise pyrado.ValueErr(given=window_size, ge_constraint='1')
        if not isinstance(ratio_train, float):
            raise pyrado.TypeErr(given=ratio_train, expected_type=float)
        if not (0 < ratio_train < 1):
            raise pyrado.ValueErr(given=ratio_train, g_constraint='0', l_constraint='1')
        if standardize_data and scale_min_max_data:
            raise pyrado.ValueErr(msg='Scaling and normalizing the data at the same time is not supported!')

        self.data_all_raw = atleast_2D(data).T if data.ndimension() == 1 else data  # samples along rows
        self._ratio_train = ratio_train
        self._window_size = window_size
        self.name = name

        # Process the data
        self.is_standardized, self.is_scaled = False, False
        if standardize_data:
            self.data_all = standardize(self.data_all_raw)  # ~ N(0,1)
            self.is_standardized = True
        elif scale_min_max_data:
            self.data_all = scale_min_max(self.data_all_raw, -1, 1)  # in [-1, 1]
            self.is_scaled = True
        else:
            self.data_all = self.data_all_raw

        # Split the data into training and testing data
        self.data_trn = self.data_all[:self.num_samples_trn]
        self.data_tst = self.data_all[self.num_samples_trn:]

        # Targets are the next time steps
        self.data_all_inp = self.data_all[:-1, :]
        self.data_trn_inp = self.data_trn[:-1, :]
        self.data_tst_inp = self.data_tst[:-1, :]
        self.data_all_targ = self.data_all[1:, :]
        self.data_trn_targ = self.data_trn[1:, :]
        self.data_tst_targ = self.data_tst[1:, :]

        # Create sequences
        self.data_trn_ws = self.cut_to_window_size(self.data_trn, self._window_size)
        self.data_tst_ws = self.cut_to_window_size(self.data_tst, self._window_size)
        self.data_trn_seqs = create_sequences(self.data_trn_ws, len_seq=self._window_size + 1)
        self.data_tst_seqs = create_sequences(self.data_tst_ws, len_seq=self._window_size + 1)

        print_cbt(f'Created {str(self)}', 'w')
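Finally, a hypothetical sketch of the sliding-window sequencing used at the end; create_sequences is defined elsewhere in pyrado, so this only illustrates the assumed behavior:

import torch as to

def create_sequences_sketch(data: to.Tensor, len_seq: int) -> list:
    # Overlapping windows of length len_seq along the first (sample) dimension
    return [data[i:i + len_seq] for i in range(data.shape[0] - len_seq + 1)]

data = to.arange(6.).view(-1, 1)  # 6 samples with 1 feature each
seqs = create_sequences_sketch(data, len_seq=3)
print(len(seqs), seqs[0].squeeze())  # 4 tensor([0., 1., 2.])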