Example #1
    def _get_consecutive_observations(self, start_idx, num_steps):
        # A batch of indices with num_steps == 0: stack and add a time dimension.
        if num_steps == 0 and not isinstance(start_idx, (int, np.integer)):
            observation = stack_list_of_tuples(self.memory[start_idx])
            return Observation(*map(lambda x: x.unsqueeze(1), observation))
        num_steps = max(1, num_steps)
        if start_idx + num_steps <= self.max_len:
            obs_list = self.memory[start_idx:start_idx + num_steps]
        else:  # The trajectory is split by the circular buffer.
            delta_idx = start_idx + num_steps - self.max_len
            obs_list = np.concatenate(
                (self.memory[start_idx:self.max_len], self.memory[:delta_idx]))

        return stack_list_of_tuples(obs_list)
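
The wrap-around branch above joins the tail of the circular buffer with its head. A toy illustration of that indexing, independent of the class (the buffer contents, indices, and sizes below are made up for the example):

import numpy as np

memory = np.array([10, 11, 12, 13, 14])  # circular buffer with max_len = 5
start_idx, num_steps, max_len = 3, 4, 5

# 2 of the 4 requested steps wrap around to the front of the buffer.
delta_idx = start_idx + num_steps - max_len
window = np.concatenate((memory[start_idx:max_len], memory[:delta_idx]))
np.testing.assert_array_equal(window, np.array([13, 14, 10, 11]))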
Example #2
    def simulate(
        self, initial_state, policy, initial_action=None, logger=None, stack_obs=True
    ):
        """Simulate a set of particles starting from `state' and following `policy'."""
        if self.num_samples > 0:
            initial_state = repeat_along_dimension(
                initial_state, number=self.num_samples, dim=0
            )
            initial_state = initial_state.reshape(-1, *self.dynamical_model.dim_state)
            if initial_action is not None:
                initial_action = repeat_along_dimension(
                    initial_action, number=self.num_samples, dim=0
                )
                initial_action = initial_action.reshape(*initial_state.shape[:-1], -1)

        trajectory = rollout_model(
            dynamical_model=self.dynamical_model,
            reward_model=self.reward_model,
            policy=policy,
            initial_state=initial_state,
            initial_action=initial_action,
            max_steps=self.num_steps,
            termination_model=self.termination_model,
        )
        if not stack_obs:
            self._log_trajectory(trajectory)
            return trajectory
        else:
            observation = stack_list_of_tuples(trajectory, dim=initial_state.ndim - 1)
            self._log_observation(observation)
            return observation
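
The method above relies on `repeat_along_dimension` to turn a single initial state into `num_samples` particles before the rollout. Below is a minimal sketch of such a helper, assuming plain torch tensors; the name and call signature mirror the usage above, but the library's actual implementation may differ:

import torch


def repeat_along_dimension(tensor, number, dim=0):
    # Sketch: insert a new axis at `dim` and expand it `number` times.
    return tensor.unsqueeze(dim).expand(
        *tensor.shape[:dim], number, *tensor.shape[dim:]
    )


# Example: 4 initial states of dimension 3 become 2 particles per state.
initial_state = torch.randn(4, 3)
particles = repeat_along_dimension(initial_state, number=2, dim=0)
assert particles.shape == (2, 4, 3)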
Example #3
    def forward(self, observation):
        """Compute the losses.

        Given an Observation, it will compute the losses.
        Given a list of Trajectories, it tries to stack them to vectorize operations.
        If stacking fails, it iterates over the trajectories one by one.
        """
        if isinstance(observation, Observation):
            trajectories = [observation]
        elif len(observation) > 1:
            try:
                # When possible, stack to parallelize the trajectories.
                # This requires all trajectories to have equal length.
                trajectories = [stack_list_of_tuples(observation)]
            except RuntimeError:
                trajectories = observation
        else:
            trajectories = observation

        self.reset_info()

        loss = Loss()
        for trajectory in trajectories:
            loss += self.actor_loss(trajectory)
            loss += self.critic_loss(trajectory)
            loss += self.regularization_loss(trajectory, len(trajectories))

        return loss / len(trajectories)
Example #4
    def end_episode(self):
        """See `AbstractAgent.end_episode'.

        If the agent is training and the base model is a GP model, add the new
        transitions to the GP, then summarize and sparsify it.

        Finally, train the agent.
        """
        if self.training:
            if isinstance(self.dynamical_model.base_model, ExactGPModel):
                observation = stack_list_of_tuples(self.last_trajectory)
                for transform in self.dataset.transformations:
                    observation = transform(observation)
                print(colorize("Add data to GP Model", "yellow"))
                self.dynamical_model.base_model.add_data(
                    observation.state,
                    observation.action[
                        ..., :self.dynamical_model.base_model.dim_action[0]],
                    observation.next_state,
                )

                print(colorize("Summarize GP Model", "yellow"))
                self.dynamical_model.base_model.summarize_gp()

                for i, gp in enumerate(self.dynamical_model.base_model.gp):
                    self.logger.update(
                        **{f"gp{i} num inputs": len(gp.train_targets)})

                    if isinstance(gp, SparseGP):
                        self.logger.update(
                            **{f"gp{i} num inducing inputs": gp.xu.shape[0]})

            self.learn()
        super().end_episode()
Example #5
def test_stack_list_of_lists():
    trajectory = [[1, 2, 3, 4], [20, 30, 40, 50], [3, 4, 5, 6],
                  [40, 50, 60, 70]]
    stacked_trajectory = stack_list_of_tuples(trajectory)

    np.testing.assert_allclose(stacked_trajectory[0], np.array([1, 20, 3, 40]))
    np.testing.assert_allclose(stacked_trajectory[1], np.array([2, 30, 4, 50]))
    np.testing.assert_allclose(stacked_trajectory[2], np.array([3, 40, 5, 60]))
    np.testing.assert_allclose(stacked_trajectory[3], np.array([4, 50, 6, 70]))
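
The test above pins down the core contract of `stack_list_of_tuples`: it transposes a list of equally-structured tuples (or lists, or Observation namedtuples) into one tuple whose i-th field stacks the i-th element of every input. A minimal sketch of that behaviour for plain sequences, using numpy only; the real helper additionally handles torch tensors, namedtuples, and a `dim` argument:

import numpy as np


def stack_list_of_tuples_sketch(iterable):
    # Transpose the iterable, then stack each field along a new leading axis.
    return tuple(np.stack(field) for field in zip(*iterable))


trajectory = [[1, 2, 3, 4], [20, 30, 40, 50], [3, 4, 5, 6], [40, 50, 60, 70]]
stacked = stack_list_of_tuples_sketch(trajectory)
np.testing.assert_allclose(stacked[0], np.array([1, 20, 3, 40]))
np.testing.assert_allclose(stacked[3], np.array([4, 50, 6, 70]))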
Example #6
    def test_update(self, trajectory, preserve_origin):
        transformer = StateNormalizer(preserve_origin)

        trajectory = stack_list_of_tuples(trajectory)

        mean = torch.mean(trajectory.state, 0)
        var = torch.var(trajectory.state, 0)

        transformer.update(trajectory)
        torch.testing.assert_allclose(transformer._normalizer.mean, mean)
        torch.testing.assert_allclose(transformer._normalizer.variance, var)
Example #7
    def test_inverse(self, trajectory, preserve_origin):
        transformer = ActionNormalizer(preserve_origin)
        trajectory = stack_list_of_tuples(trajectory)
        transformer.update(trajectory)

        observation = get_observation()
        obs = observation.clone()
        inverse_observation = transformer.inverse(transformer(observation))
        for x, y in zip(obs, inverse_observation):
            if x.shape == y.shape:
                torch.testing.assert_allclose(x, y)
Example #8
def test_stack_list_of_observations():
    trajectory = get_trajectory()
    stacked_trajectory = stack_list_of_tuples(trajectory)
    stacked_trajectory = stacked_trajectory.to_torch()
    assert type(stacked_trajectory) is Observation
    assert stacked_trajectory.state.shape == (3, 4)
    assert stacked_trajectory.action.shape == (3, 2)
    assert stacked_trajectory.next_state.shape == (3, 4)
    assert stacked_trajectory.reward.shape == (3, )
    assert stacked_trajectory.done.shape == (3, )
    for val in stacked_trajectory:
        assert val.dtype is torch.get_default_dtype()
Example #9
    def _update_model_posterior(self, last_trajectory):
        """Update model posterior of GP-models with new data."""
        if isinstance(self.dynamical_model.base_model, ExactGPModel):
            observation = stack_list_of_tuples(last_trajectory)  # Parallelize.
            dim_action = self.dynamical_model.dim_action[0]
            if observation.action.shape[-1] > dim_action:
                observation.action = observation.action[..., :dim_action]
            for transform in self.train_set.transformations:
                observation = transform(observation)
            print(colorize("Add data to GP Model", "yellow"))
            self.dynamical_model.base_model.add_data(observation.state,
                                                     observation.action,
                                                     observation.next_state)

            print(colorize("Summarize GP Model", "yellow"))
            self.dynamical_model.base_model.summarize_gp()
Example #10
    def learn(self):
        """Train Policy Gradient Agent."""
        trajectories = [
            stack_list_of_tuples(t).clone() for t in self.trajectories
        ]

        def closure():
            """Gradient calculation."""
            self.optimizer.zero_grad()
            losses = self.algorithm(trajectories)
            losses.combined_loss.backward()

            torch.nn.utils.clip_grad_norm_(self.algorithm.parameters(),
                                           self.clip_gradient_val)

            return losses

        self._learn_steps(closure)
Example #11
    def simulate_model(self):
        """Simulate the model.

        The simulation is initialized by concatenating samples from:
            - The empirical initial state distribution.
            - A learned or fixed initial state distribution.
            - The empirical state distribution.
        """
        # Samples from empirical initial state distribution.
        initial_states = self.initial_states.sample_batch(
            self.sim_initial_states_num_trajectories)

        # Samples from initial distribution.
        if self.sim_initial_dist_num_trajectories > 0:
            initial_states_ = self.initial_distribution.sample(
                (self.sim_initial_dist_num_trajectories, ))
            initial_states = torch.cat((initial_states, initial_states_),
                                       dim=0)

        # Samples from experience replay empirical distribution.
        if self.sim_memory_num_trajectories > 0:
            obs, *_ = self.dataset.sample_batch(
                self.sim_memory_num_trajectories)
            for transform in self.dataset.transformations:
                obs = transform.inverse(obs)
            initial_states_ = obs.state[:, 0, :]  # obs is an n-step return.
            initial_states = torch.cat((initial_states, initial_states_),
                                       dim=0)

        initial_states = initial_states.unsqueeze(0)
        self.policy.reset()
        trajectory = rollout_model(
            dynamical_model=self.dynamical_model,
            reward_model=self.reward_model,
            policy=self.policy,
            initial_state=initial_states,
            max_steps=self.sim_num_steps,
            termination_model=self.termination_model,
        )

        self.sim_trajectory = stack_list_of_tuples(trajectory)
        states = self.sim_trajectory.state.reshape(
            -1, *self.dynamical_model.dim_state)
        # Sub-sample the simulated states before storing them in the dataset.
        self.sim_dataset.append(states[::self.sim_num_subsample])
Example #12
    def evaluate_action_sequence(self, action_sequence, state):
        """Evaluate action sequence by performing a rollout."""
        trajectory = stack_list_of_tuples(
            rollout_actions(
                self.dynamical_model,
                self.reward_model,
                self.action_scale * action_sequence,  # scale actions.
                state,
                self.termination_model,
            ),
            dim=-2,
        )

        returns = discount_sum(trajectory.reward, self.gamma)

        if self.terminal_reward:
            terminal_reward = self.terminal_reward(trajectory.next_state[..., -1, :])
            returns = returns + self.gamma ** self.horizon * terminal_reward
        return returns
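
The rollout above is reduced with `discount_sum`, i.e. a discounted sum of the rewards along the time axis. Below is a minimal sketch of that reduction, written as a hypothetical stand-in rather than the library's own implementation (the actual `discount_sum` may use a different axis convention and broadcasting rules):

import torch


def discount_sum_sketch(rewards, gamma, dim=-1):
    # sum_t gamma**t * r_t along `dim`.
    steps = rewards.shape[dim]
    discounts = gamma ** torch.arange(steps, dtype=rewards.dtype)
    shape = [1] * rewards.ndim
    shape[dim] = steps
    return (rewards * discounts.view(shape)).sum(dim)


rewards = torch.tensor([1.0, 0.5, 2.0])
# 1.0 + 0.9 * 0.5 + 0.81 * 2.0 = 3.07
assert torch.isclose(discount_sum_sketch(rewards, 0.9), torch.tensor(3.07))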
Example #13
    def test_call(self, trajectory, preserve_origin):
        transformer = StateNormalizer(preserve_origin)
        trajectory = stack_list_of_tuples(trajectory)

        transformer.update(trajectory)
        observation = get_observation()
        obs = observation.clone()
        transformed = transformer(observation)

        if preserve_origin:
            mean = 0
            scale = torch.sqrt(transformer._normalizer.variance +
                               transformer._normalizer.mean**2)
        else:
            mean = transformer._normalizer.mean
            scale = torch.sqrt(transformer._normalizer.variance)
        torch.testing.assert_allclose(transformed.state,
                                      (obs.state - mean) / scale)
        torch.testing.assert_allclose(transformed.next_state, obs.next_state)

        torch.testing.assert_allclose(transformed.action, obs.action)
        torch.testing.assert_allclose(transformed.reward, obs.reward)
        assert transformed.done == obs.done
Example #14
    def test_correctness(self, gamma, value_function, entropy_reg):
        trajectory = [
            Observation(0, 0, reward=1, done=False, entropy=0.2).to_torch(),
            Observation(0, 0, reward=0.5, done=False, entropy=0.3).to_torch(),
            Observation(0, 0, reward=2, done=False, entropy=0.5).to_torch(),
            Observation(0, 0, reward=-0.2, done=False,
                        entropy=-0.2).to_torch(),
        ]

        r0 = 1 + entropy_reg * 0.2
        r1 = 0.5 + entropy_reg * 0.3
        r2 = 2 + entropy_reg * 0.5
        r3 = -0.2 - entropy_reg * 0.2

        v = 0.01 if value_function is not None else 0

        reward = mc_return(
            stack_list_of_tuples(trajectory, -2),
            gamma,
            value_function=value_function,
            entropy_regularization=entropy_reg,
            reduction="min",
        )

        torch.testing.assert_allclose(
            reward,
            torch.tensor([
                r0 + r1 * gamma + r2 * gamma**2 + r3 * gamma**3 + v * gamma**4
            ]),
        )
        assert (mc_return(
            Observation(state=0, reward=0).to_torch(),
            gamma,
            value_function,
            entropy_reg,
        ) == 0)
Example #15
    def all_raw(self):
        """Get all the un-transformed data."""
        all_raw = stack_list_of_tuples(self.memory[self.valid_indexes])
        return all_raw
Example #16
def plot_pendulum_trajectories(agent, environment, episode: int):
    """Plot GP inputs and trajectory in a Pendulum environment."""
    model = agent.dynamical_model.base_model
    trajectory = stack_list_of_tuples(agent.last_trajectory)
    sim_obs = agent.sim_trajectory

    for transformation in agent.dataset.transformations:
        trajectory = transformation(trajectory)
        sim_obs = transformation(sim_obs)
    if isinstance(model, ExactGPModel):
        fig, axes = plt.subplots(
            1 + model.dim_state[0] // 2, 2, sharex="col", sharey="row"
        )
    else:
        fig, axes = plt.subplots(1, 2, sharex="col", sharey="row")
        axes = axes[np.newaxis]
    fig.set_size_inches(5.5, 2.0)
    # Plot real trajectory
    sin, cos = torch.sin(trajectory.state[:, 0]), torch.cos(trajectory.state[:, 0])
    axes[0, 0].scatter(
        torch.atan2(sin, cos) * 180 / np.pi,
        trajectory.state[:, 1],
        c=trajectory.action[:, 0],
        cmap="jet",
        vmin=-1,
        vmax=1,
    )
    axes[0, 0].set_title("Real Trajectory")

    # Plot sim trajectory
    sin = torch.sin(sim_obs.state[:, 0, 0, 0])
    cos = torch.cos(sim_obs.state[:, 0, 0, 0])
    axes[0, 1].scatter(
        torch.atan2(sin, cos) * 180 / np.pi,
        sim_obs.state[:, 0, 0, 1],
        c=sim_obs.action[:, 0, 0, 0],
        cmap="jet",
        vmin=-1,
        vmax=1,
    )
    axes[0, 1].set_title("Optimistic Trajectory")

    if isinstance(model, ExactGPModel):
        for i in range(model.dim_state[0]):
            inputs = model.gp[i].train_inputs[0]
            sin, cos = inputs[:, 1], inputs[:, 0]
            axes[1 + i // 2, i % 2].scatter(
                torch.atan2(sin, cos) * 180 / np.pi,
                inputs[:, 2],
                c=inputs[:, 3],
                cmap="jet",
                vmin=-1,
                vmax=1,
            )
            axes[1 + i // 2, i % 2].set_title(f"GP {i} data.")

            if hasattr(model.gp[i], "xu"):
                inducing_points = model.gp[i].xu
                sin, cos = inducing_points[:, 1], inducing_points[:, 0]
                axes[1 + i // 2, i % 2].scatter(
                    torch.atan2(sin, cos) * 180 / np.pi,
                    inducing_points[:, 2],
                    c=inducing_points[:, 3],
                    cmap="jet",
                    marker="*",
                    vmin=-1,
                    vmax=1,
                )

    for ax_row in axes:
        for ax in ax_row:
            ax.set_xlim([-180, 180])
            ax.set_ylim([-15, 15])

    for i in range(axes.shape[0]):
        axes[i, 0].set_ylabel("Angular Velocity [rad/s]")

    for j in range(axes.shape[1]):
        axes[-1, j].set_xlabel("Angle [degree]")

    # img_name = f"{agent.comment.title()}"
    if "optimistic" in agent.comment.lower():
        name = "H-UCRL"
    elif "expected" in agent.comment.lower():
        name = "Greedy"
    elif "thompson" in agent.comment.lower():
        name = "Thompson"
    else:
        raise NotImplementedError
    plt.suptitle(f"{name} Episode {episode + 1}", x=0.53, y=0.96)

    plt.tight_layout()
    plt.savefig(f"{agent.logger.log_dir}/{episode + 1}.pdf")

    if "DISPLAY" in os.environ:
        plt.show()
    plt.close(fig)
Example #17
    def _log_trajectory(self, trajectory):
        """Log the simulated trajectory."""
        observation = stack_list_of_tuples(trajectory, dim=trajectory[0].state.ndim - 1)
        self._log_observation(observation)
Example #18
def mb_return(
    state,
    dynamical_model,
    reward_model,
    policy,
    num_steps=1,
    gamma=1.0,
    value_function=None,
    num_samples=1,
    entropy_reg=0.0,
    reward_transformer=RewardTransformer(),
    termination_model=None,
    reduction="none",
):
    r"""Estimate the value of a state by propagating the state with a model for N-steps.

    Rolls out the model for `num_steps` steps and sums up the discounted rewards. After
    this, it bootstraps using the value function. With T = num_steps:

    .. math:: V(s_0) = \sum_{t=0}^T \gamma^t r(s_t, \pi(s_t)) + \gamma^{T+1} V(s_{T+1})

    Note that `num_steps=0` means that the model is still used to predict the next state.

    Parameters
    ----------
    state: torch.Tensor
        Initial state from which planning starts. It accepts a batch of initial states.
    dynamical_model: AbstractModel
        The model predicts a distribution over next states given states and actions.
    reward_model: AbstractReward
        The reward predicts a distribution over floats or ints given states and actions.
    policy: AbstractPolicy
        The policy predicts a distribution over actions given the state.
    num_steps: int, optional. (default=1).
        Number of steps predicted with the model before (optionally) bootstrapping.
    gamma: float, optional. (default=1.).
        Discount factor.
    value_function: AbstractValueFunction, optional. (default=None).
        The value function used for bootstrapping, takes states as input.
    num_samples: int, optional. (default=1).
        The states are repeated `num_samples` times in order to estimate the expected
        value by MC sampling of the policy, rewards and dynamics (jointly).
    entropy_reg: float, optional. (default=0).
        Entropy regularization parameter.
    termination_model: AbstractModel, optional. (default=None).
        Callable that returns True if the transition yields a terminal state.
    reward_transformer: RewardTransformer, optional.
        Transformation applied to the predicted rewards before they are summed.

    Returns
    -------
    mb_return: MBValueReturn
        The MC estimate of the value together with the stacked observation
        (sample trajectory) used to produce it.

    References
    ----------
    Lowrey, K., Rajeswaran, A., Kakade, S., Todorov, E., & Mordatch, I. (2018).
    Plan online, learn offline: Efficient learning and exploration via model-based
    control. ICLR.

    Sutton, R. S. (1991).
    Dyna, an integrated architecture for learning, planning, and reacting. ACM.

    Silver, D., Sutton, R. S., & Müller, M. (2008).
    Sample-based learning and search with permanent and transient memories. ICML.
    """
    # Repeat states to get a better estimate of the expected value
    state = repeat_along_dimension(state, number=num_samples, dim=0)
    trajectory = rollout_model(
        dynamical_model=dynamical_model,
        reward_model=reward_model,
        policy=policy,
        initial_state=state,
        max_steps=num_steps,
        termination_model=termination_model,
    )
    observation = stack_list_of_tuples(trajectory, dim=state.ndim - 1)
    value = mc_return(
        observation=observation,
        gamma=gamma,
        value_function=value_function,
        entropy_regularization=entropy_reg,
        reward_transformer=reward_transformer,
        reduction=reduction,
    )

    return MBValueReturn(value, observation)
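
For reference, the bootstrapped return in the docstring above can be checked by hand on a toy trajectory. The snippet below is only an illustrative computation of the formula with made-up rewards and a constant terminal value; it does not call any library code:

import torch

gamma = 0.9
rewards = torch.tensor([1.0, 0.5, 2.0])  # r_0, r_1, r_2 from a rollout with T = 2.
terminal_value = 0.3                     # V(s_{T+1}) used for bootstrapping.

discounts = gamma ** torch.arange(len(rewards)).float()
value = (discounts * rewards).sum() + gamma ** len(rewards) * terminal_value
# 1.0 + 0.45 + 1.62 + 0.729 * 0.3 = 3.2887
assert torch.isclose(value, torch.tensor(3.2887), atol=1e-4)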