Example #1
 def _statistics_from_paths(self, paths, stat_prefix):
     eval_replay_buffer = UpdatableSubtrajReplayBuffer(
         len(paths) * (self.max_path_length + 1),
         self.env,
         self.subtraj_length,
         self.memory_dim,
     )
     for path in paths:
         eval_replay_buffer.add_trajectory(path)
     raw_subtraj_batch = eval_replay_buffer.get_all_valid_subtrajectories()
     assert raw_subtraj_batch is not None
     subtraj_batch = create_torch_subtraj_batch(raw_subtraj_batch)
     if self.save_memory_gradients:
         subtraj_batch['memories'].requires_grad = True
     statistics = self._statistics_from_subtraj_batch(
         subtraj_batch, stat_prefix=stat_prefix
     )
     statistics.update(eval_util.get_generic_path_information(
         paths, stat_prefix="Test",
     ))
     env_actions = np.vstack([path["actions"][:self.action_dim] for path in
                              paths])
     writes = np.vstack([path["actions"][self.action_dim:] for path in
                         paths])
     statistics.update(create_stats_ordered_dict(
         'Env Actions', env_actions, stat_prefix=stat_prefix
     ))
     statistics.update(create_stats_ordered_dict(
         'Writes', writes, stat_prefix=stat_prefix
     ))
     return statistics
Example #2
 def test_size_add_none(self):
     env = StubMemoryEnv()
     buff = UpdatableSubtrajReplayBuffer(
         max_replay_buffer_size=100,
         env=env,
         subtraj_length=2,
         memory_dim=1,
     )
     self.assertEqual(buff.num_steps_can_sample(return_all=True), 0)
Example #3
 def test_size_add_one(self):
     env = StubMemoryEnv()
     buff = UpdatableSubtrajReplayBuffer(
         max_replay_buffer_size=100,
         env=env,
         subtraj_length=2,
         memory_dim=1,
     )
     observation = rand(), rand()
     action = rand(), rand()
     buff.add_sample(observation, action, 1, False)
     self.assertEqual(buff.num_steps_can_sample(return_all=True), 0)
Example #4
 def test_dloss_dwrites_are_zero_initially(self):
     env = StubMemoryEnv()
     buff = UpdatableSubtrajReplayBuffer(
         max_replay_buffer_size=100,
         env=env,
         subtraj_length=2,
         memory_dim=1,
     )
     last_write = rand()
     for _ in range(13):
         observation = rand(), last_write
         write = rand()
         action = rand(), write
         last_write = write
         buff.add_sample(observation, action, 1, False)
     subtrajs, _ = buff.random_subtrajectories(5)
     self.assertNpEqual(subtrajs['dloss_dwrites'], np.zeros((5, 2, 1)))
Example #5
 def test_next_memory_equals_write_after_overflow(self):
     env = StubMemoryEnv()
     buff = UpdatableSubtrajReplayBuffer(
         max_replay_buffer_size=10,
         env=env,
         subtraj_length=2,
         memory_dim=1,
     )
     last_write = rand()
     for _ in range(13):
         observation = rand(), last_write
         write = rand()
         action = rand(), write
         last_write = write
         buff.add_sample(observation, action, 1, False)
     subtrajs, _ = buff.random_subtrajectories(5)
     self.assertNpEqual(subtrajs['next_memories'], subtrajs['writes'])
Example #6
 def test_random_subtraj_shape(self):
     env = StubMemoryEnv()
     buff = UpdatableSubtrajReplayBuffer(
         max_replay_buffer_size=100,
         env=env,
         subtraj_length=2,
         memory_dim=1,
     )
     observation = rand(), rand()
     action = rand(), rand()
     for _ in range(10):
         buff.add_sample(observation, action, 1, False)
     subtrajs, _ = buff.random_subtrajectories(5)
     self.assertEqual(subtrajs['env_obs'].shape, (5, 2, 1))
     self.assertEqual(subtrajs['env_actions'].shape, (5, 2, 1))
     self.assertEqual(subtrajs['next_env_obs'].shape, (5, 2, 1))
     self.assertEqual(subtrajs['memories'].shape, (5, 2, 1))
     self.assertEqual(subtrajs['next_memories'].shape, (5, 2, 1))
     self.assertEqual(subtrajs['writes'].shape, (5, 2, 1))
     self.assertEqual(subtrajs['rewards'].shape, (5, 2))
     self.assertEqual(subtrajs['dloss_dwrites'].shape, (5, 2, 1))
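The layout asserted above lends itself to stepping through a subtrajectory batch one time index at a time. A minimal, purely illustrative sketch, reusing a buff filled as in these tests; the loop and variable names are assumptions, not part of the buffer API:

    subtrajs, _ = buff.random_subtrajectories(5)
    for t in range(2):  # subtraj_length used throughout these tests
        obs_t = subtrajs['env_obs'][:, t, :]      # (5, 1) env observation at step t
        act_t = subtrajs['env_actions'][:, t, :]  # (5, 1) env part of the action
        mem_t = subtrajs['memories'][:, t, :]     # (5, 1) memory read at step t
        rew_t = subtrajs['rewards'][:, t]         # (5,) scalar reward at step t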
Example #7
    def test_update_memories_does_not_update_other_memories(self):
        env = StubMemoryEnv()
        buff = UpdatableSubtrajReplayBuffer(
            max_replay_buffer_size=100,
            env=env,
            subtraj_length=2,
            memory_dim=1,
        )
        last_write = rand()
        buff.terminate_episode((rand(), rand()), True)
        for _ in range(5):
            observation = rand(), last_write
            write = rand()
            action = rand(), write
            last_write = write
            buff.add_sample(observation, action, 1, False)
        # White box testing...sue me.
        old_memories = buff._memories.copy()
        """
        For writes
        0 - same
        1 - different
        2 - different
        3 - same
        4 - same

        For memories
        0 - same
        1 - same
        2 - different
        3 - different
        4 - same
        """
        start_indices = [1]
        written_writes = np.random.rand(len(start_indices), 2, 1)
        buff.update_write_subtrajectories(written_writes, start_indices)
        new_memories = buff._memories

        expected_new_memories = old_memories.copy()
        expected_new_memories[2:4] = written_writes
        self.assertNpArraysNotEqual(old_memories, expected_new_memories)
        self.assertNpArraysNotEqual(old_memories, new_memories)
        self.assertNpEqual(new_memories, expected_new_memories)
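A compact sketch of the bookkeeping this test relies on, using hypothetical stand-alone arrays rather than the buffer's internals: write_t becomes memory_(t+1), so rewriting the writes of a subtrajectory that starts at index start touches exactly subtraj_length memories beginning at start + 1.

    import numpy as np

    subtraj_length = 2
    start = 1                                       # as in the test above
    memories = np.random.rand(6, 1)                 # stand-in for buff._memories
    new_writes = np.random.rand(subtraj_length, 1)
    # write_t produces memory_(t+1): only memories[2] and memories[3] change.
    memories[start + 1:start + 1 + subtraj_length] = new_writes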
Example #8
 def test_update_dloss_dmemories_works(self):
     env = StubMemoryEnv()
     buff = UpdatableSubtrajReplayBuffer(
         max_replay_buffer_size=100,
         env=env,
         subtraj_length=2,
         memory_dim=1,
     )
     last_write = rand()
     for _ in range(13):
         observation = rand(), last_write
         write = rand()
         action = rand(), write
         last_write = write
         buff.add_sample(observation, action, 1, False)
     """
     internal
     dL/dm idx   dL/dw idx   changed?
     0           n/a         different
     1           0           different
     2           1           same
     3           2           same
     4           3           different
     5           4           different
     6           5           same
     7           6           same
     8           7           different
     9           8           different
     10          9           same
     11          10          same
     12          11          same
     13          12          same
     """
     start_indices = [0, 4, 8]
     dloss_dmem = np.random.rand(len(start_indices), 2, 1)
     buff.update_dloss_dmemories_subtrajectories(dloss_dmem, start_indices)
     new_subtrajs, _ = buff.random_subtrajectories(
         len(start_indices),
         _fixed_start_indices=start_indices,
     )
     expected_dloss_dwrite = np.zeros_like(dloss_dmem)
     for i in range(len(start_indices)):
         expected_dloss_dwrite[i, 0, :] = dloss_dmem[i, 1, :]
     self.assertNpEqual(new_subtrajs['dloss_dwrites'],
                        expected_dloss_dwrite)
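The index table in the test's comment reduces to a one-step shift: dL/dmemory at step t+1 is dL/dwrite at step t. An illustrative numpy sketch of the expected mapping, not the buffer's implementation:

    import numpy as np

    dloss_dmem = np.random.rand(3, 2, 1)        # (num_subtrajs, subtraj_length, dim)
    dloss_dwrite = np.zeros_like(dloss_dmem)
    # Within a 2-step subtrajectory, only the second dL/dm entry maps back onto
    # that subtrajectory's first write; the second write's gradient would come
    # from the following subtrajectory's dL/dm, which was not updated here.
    dloss_dwrite[:, 0, :] = dloss_dmem[:, 1, :]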
Example #9
 def test_update_dloss_dmemories_works_overlap(self):
     env = StubMemoryEnv()
     buff = UpdatableSubtrajReplayBuffer(
         max_replay_buffer_size=100,
         env=env,
         subtraj_length=2,
         memory_dim=1,
     )
     last_write = rand()
     for _ in range(13):
         observation = rand(), last_write
         write = rand()
         action = rand(), write
         last_write = write
         buff.add_sample(observation, action, 1, False)
     """
     internal
     dL/dm idx   dL/dw idx   changed?
     0           n/a         different
     1           0           different
     2           1           same
     3           2           different
     4           3           different
     5           4           same
     """
     start_indices = [0, 3]
     dloss_dmem = np.random.rand(len(start_indices), 2, 1)
     buff.update_dloss_dmemories_subtrajectories(dloss_dmem, start_indices)
     new_subtrajs, _ = buff.random_subtrajectories(
         len(start_indices),
         _fixed_start_indices=[0, 1, 2, 3],
     )
     expected_dloss_dwrite = np.zeros((4, 2, 1))
     expected_dloss_dwrite[0, 0, :] = dloss_dmem[0, 1, :]
     expected_dloss_dwrite[1, 1, :] = dloss_dmem[1, 0, :]
     expected_dloss_dwrite[2, 0, :] = dloss_dmem[1, 0, :]
     expected_dloss_dwrite[2, 1, :] = dloss_dmem[1, 1, :]
     expected_dloss_dwrite[3, 0, :] = dloss_dmem[1, 1, :]
     self.assertNpEqual(new_subtrajs['dloss_dwrites'],
                        expected_dloss_dwrite)
Example #10
 def test__fixed_start_indices(self):
     env = StubMemoryEnv()
     buff = UpdatableSubtrajReplayBuffer(
         max_replay_buffer_size=100,
         env=env,
         subtraj_length=2,
         memory_dim=1,
     )
     for _ in range(25):
         observation = rand(), rand()
         action = rand(), rand()
         buff.add_sample(observation, action, 1, False)
     _, start_indices = buff.random_subtrajectories(15)
     _, new_start_indices = buff.random_subtrajectories(
         15,
         _fixed_start_indices=start_indices,
     )
     self.assertNpEqual(start_indices, new_start_indices)
Example #11
 def test_update_memories_updates_memories_2d(self):
     env = StubMemoryEnv(2)
     buff = UpdatableSubtrajReplayBuffer(
         max_replay_buffer_size=100,
         env=env,
         subtraj_length=2,
         memory_dim=2,
     )
     last_write = rand(2)
     for _ in range(13):
         observation = rand(), last_write
         write = rand(2)
         action = rand(), write
         last_write = write
         buff.add_sample(observation, action, 1, False)
     start_indices = [0, 4, 8]
     new_writes = np.random.rand(len(start_indices), 2, 2)
     buff.update_write_subtrajectories(new_writes, start_indices)
     new_subtrajs, _ = buff.random_subtrajectories(
         len(start_indices),
         _fixed_start_indices=start_indices,
     )
     self.assertNpEqual(new_subtrajs['writes'], new_writes)
Example #12
    def __init__(
            self,
            env,
            qf,
            policy,
            exploration_strategy,
            subtraj_length,
            tau=0.01,
            use_soft_update=True,
            target_hard_update_period=1000,
            use_action_policy_params_for_entire_policy=False,
            action_policy_optimize_bellman=True,
            write_policy_optimizes='both',
            action_policy_learning_rate=1e-3,
            write_policy_learning_rate=1e-5,
            qf_learning_rate=1e-3,
            bellman_error_loss_weight=10,
            refresh_entire_buffer_period=None,
            save_new_memories_back_to_replay_buffer=True,
            only_use_last_dqdm=False,
            action_policy_weight_decay=0,
            write_policy_weight_decay=0,
            do_not_load_initial_memories=False,
            save_memory_gradients=False,
            **kwargs
    ):
        """
        :param env: Environment; passed on to the superclass constructor
        :param qf: Q function to train
        :param policy: Policy trained to optimize the Q function
        :param subtraj_length: Length of the subtrajectories loaded
        :param tau: Soft target tau
        :param use_soft_update: If False, use hard target updates.
        :param target_hard_update_period: Number of environment steps between
        hard updates.
        :param use_action_policy_params_for_entire_policy: If True, train the
        entire policy together, rather than training the action and write parts
        separately.
        :param action_policy_optimize_bellman:
        :param write_policy_optimizes:
        :param action_policy_learning_rate:
        :param write_policy_learning_rate:
        :param qf_learning_rate:
        :param bellman_error_loss_weight:
        :param refresh_entire_buffer_period:
        :param save_new_memories_back_to_replay_buffer:
        :param only_use_last_dqdm: If True, cut the gradients for all dQ/dmemory
        other than the last time step.
        :param action_policy_weight_decay:
        :param write_policy_weight_decay:
        :param do_not_load_initial_memories: If True, always zero-out the
        loaded initial memory.
        :param save_memory_gradients: If True, save and load dL/dmemory.
        :param kwargs: kwargs to pass on to the superclass constructor
        """
        super().__init__(env, policy, exploration_strategy, **kwargs)
        assert write_policy_optimizes in ['qf', 'bellman', 'both']
        self.qf = qf
        self.policy = policy
        self.subtraj_length = subtraj_length
        self.tau = tau
        self.use_soft_update = use_soft_update
        self.target_hard_update_period = target_hard_update_period
        self.use_action_policy_params_for_entire_policy = (
            use_action_policy_params_for_entire_policy
        )
        self.action_policy_optimize_bellman = action_policy_optimize_bellman
        self.write_policy_optimizes = write_policy_optimizes
        self.action_policy_learning_rate = action_policy_learning_rate
        self.write_policy_learning_rate = write_policy_learning_rate
        self.qf_learning_rate = qf_learning_rate
        self.bellman_error_loss_weight = bellman_error_loss_weight
        self.should_refresh_buffer = ConditionTimer(
            refresh_entire_buffer_period
        )
        self.save_new_memories_back_to_replay_buffer = (
            save_new_memories_back_to_replay_buffer
        )
        self.only_use_last_dqdm = only_use_last_dqdm
        self.action_policy_weight_decay = action_policy_weight_decay
        self.write_policy_weight_decay = write_policy_weight_decay
        self.do_not_load_initial_memories = do_not_load_initial_memories
        self.save_memory_gradients = save_memory_gradients

        """
        Set some parameter-dependent values
        """
        self.num_subtrajs_per_batch = self.batch_size // self.subtraj_length
        self.train_validation_num_subtrajs_per_batch = (
            self.num_subtrajs_per_batch
        )
        self.action_dim = int(self.env.action_space.flat_dim)
        self.obs_dim = int(self.env.observation_space.flat_dim)
        self.memory_dim = self.env.memory_dim
        self.max_number_trajectories_loaded_at_once = (
            self.num_subtrajs_per_batch
        )

        if not self.save_new_memories_back_to_replay_buffer:
            assert self.should_refresh_buffer.always_false, (
                "If save_new_memories_back_to_replay_buffer is False, "
                "you cannot refresh the replay buffer."
            )

        """
        Create the necessary replay buffers, target networks, and optimizers.
        """
        self.replay_buffer = SplitReplayBuffer(
            UpdatableSubtrajReplayBuffer(
                self.replay_buffer_size,
                self.env,
                self.subtraj_length,
                self.memory_dim,
            ),
            UpdatableSubtrajReplayBuffer(
                self.replay_buffer_size,
                self.env,
                self.subtraj_length,
                self.memory_dim,
            ),
            fraction_paths_in_train=0.8,
        )
        self.target_qf = self.qf.copy()
        self.target_policy = self.policy.copy()

        self.qf_optimizer = optim.Adam(
            self.qf.parameters(), lr=self.qf_learning_rate
        )
        self.action_policy_optimizer = optim.Adam(
            self.policy.action_parameters(),
            lr=self.action_policy_learning_rate,
            weight_decay=self.action_policy_weight_decay,
        )
        self.write_policy_optimizer = optim.Adam(
            self.policy.write_parameters(),
            lr=self.write_policy_learning_rate,
            weight_decay=self.write_policy_weight_decay,
        )
        self.policy_optimizer = optim.Adam(
            self.policy.parameters(),
            lr=self.action_policy_learning_rate,
            weight_decay=self.action_policy_weight_decay,
        )

        if self.save_memory_gradients:
            self.saved_grads = {}
            self.save_hook = self.create_save_grad_hook('dl_dmemory')
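For reference, a minimal sketch of how tau, use_soft_update, and target_hard_update_period are conventionally applied to the target networks created in this constructor. The helper names below are illustrative assumptions, not methods of this class:

    import torch.nn as nn

    def soft_update(target: nn.Module, source: nn.Module, tau: float) -> None:
        # Polyak averaging: target <- tau * source + (1 - tau) * target
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)

    def hard_update(target: nn.Module, source: nn.Module) -> None:
        # Full copy, performed every target_hard_update_period environment
        # steps when use_soft_update is False.
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.copy_(s_param.data)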