def _statistics_from_paths(self, paths, stat_prefix):
    eval_replay_buffer = UpdatableSubtrajReplayBuffer(
        len(paths) * (self.max_path_length + 1),
        self.env,
        self.subtraj_length,
        self.memory_dim,
    )
    for path in paths:
        eval_replay_buffer.add_trajectory(path)
    raw_subtraj_batch = eval_replay_buffer.get_all_valid_subtrajectories()
    assert raw_subtraj_batch is not None
    subtraj_batch = create_torch_subtraj_batch(raw_subtraj_batch)
    if self.save_memory_gradients:
        subtraj_batch['memories'].requires_grad = True
    statistics = self._statistics_from_subtraj_batch(
        subtraj_batch, stat_prefix=stat_prefix
    )
    statistics.update(eval_util.get_generic_path_information(
        paths, stat_prefix="Test",
    ))
    # Each row of path["actions"] concatenates the env action and the memory
    # write, so split by columns, not rows.
    env_actions = np.vstack(
        [path["actions"][:, :self.action_dim] for path in paths]
    )
    writes = np.vstack(
        [path["actions"][:, self.action_dim:] for path in paths]
    )
    statistics.update(create_stats_ordered_dict(
        'Env Actions', env_actions, stat_prefix=stat_prefix
    ))
    statistics.update(create_stats_ordered_dict(
        'Writes', writes, stat_prefix=stat_prefix
    ))
    return statistics
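# A minimal sketch (hypothetical helper, not part of this module) of the
# action layout assumed by the column slicing above: each row of
# path["actions"] is the concatenation of the environment action and the
# memory write, so the two parts are recovered by slicing columns.
def _split_actions_sketch(actions, action_dim):
    """Split a (T, action_dim + memory_dim) array into env actions and writes."""
    env_actions = actions[:, :action_dim]
    writes = actions[:, action_dim:]
    return env_actions, writes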
def test_size_add_none(self):
    env = StubMemoryEnv()
    buff = UpdatableSubtrajReplayBuffer(
        max_replay_buffer_size=100,
        env=env,
        subtraj_length=2,
        memory_dim=1,
    )
    self.assertEqual(buff.num_steps_can_sample(return_all=True), 0)
def test_size_add_one(self):
    env = StubMemoryEnv()
    buff = UpdatableSubtrajReplayBuffer(
        max_replay_buffer_size=100,
        env=env,
        subtraj_length=2,
        memory_dim=1,
    )
    observation = rand(), rand()
    action = rand(), rand()
    buff.add_sample(observation, action, 1, False)
    self.assertEqual(buff.num_steps_can_sample(return_all=True), 0)
def test_dloss_dwrites_are_zero_initially(self):
    env = StubMemoryEnv()
    buff = UpdatableSubtrajReplayBuffer(
        max_replay_buffer_size=100,
        env=env,
        subtraj_length=2,
        memory_dim=1,
    )
    last_write = rand()
    for _ in range(13):
        observation = rand(), last_write
        write = rand()
        action = rand(), write
        last_write = write
        buff.add_sample(observation, action, 1, False)
    subtrajs, _ = buff.random_subtrajectories(5)
    self.assertNpEqual(subtrajs['dloss_dwrites'], np.zeros((5, 2, 1)))
def test_next_memory_equals_write_after_overflow(self):
    env = StubMemoryEnv()
    buff = UpdatableSubtrajReplayBuffer(
        max_replay_buffer_size=10,
        env=env,
        subtraj_length=2,
        memory_dim=1,
    )
    last_write = rand()
    # 13 samples overflow the size-10 buffer, wrapping around its storage.
    for _ in range(13):
        observation = rand(), last_write
        write = rand()
        action = rand(), write
        last_write = write
        buff.add_sample(observation, action, 1, False)
    subtrajs, _ = buff.random_subtrajectories(5)
    self.assertNpEqual(subtrajs['next_memories'], subtrajs['writes'])
def test_random_subtraj_shape(self):
    env = StubMemoryEnv()
    buff = UpdatableSubtrajReplayBuffer(
        max_replay_buffer_size=100,
        env=env,
        subtraj_length=2,
        memory_dim=1,
    )
    observation = rand(), rand()
    action = rand(), rand()
    for _ in range(10):
        buff.add_sample(observation, action, 1, False)
    subtrajs, _ = buff.random_subtrajectories(5)
    self.assertEqual(subtrajs['env_obs'].shape, (5, 2, 1))
    self.assertEqual(subtrajs['env_actions'].shape, (5, 2, 1))
    self.assertEqual(subtrajs['next_env_obs'].shape, (5, 2, 1))
    self.assertEqual(subtrajs['memories'].shape, (5, 2, 1))
    self.assertEqual(subtrajs['next_memories'].shape, (5, 2, 1))
    self.assertEqual(subtrajs['writes'].shape, (5, 2, 1))
    self.assertEqual(subtrajs['rewards'].shape, (5, 2))
    self.assertEqual(subtrajs['dloss_dwrites'].shape, (5, 2, 1))
def test_update_memories_does_not_update_other_memories(self):
    env = StubMemoryEnv()
    buff = UpdatableSubtrajReplayBuffer(
        max_replay_buffer_size=100,
        env=env,
        subtraj_length=2,
        memory_dim=1,
    )
    last_write = rand()
    buff.terminate_episode((rand(), rand()), True)
    for _ in range(5):
        observation = rand(), last_write
        write = rand()
        action = rand(), write
        last_write = write
        buff.add_sample(observation, action, 1, False)
    # White box testing...sue me.
    old_memories = buff._memories.copy()
    """
    For writes
    0 - same
    1 - different
    2 - different
    3 - same
    4 - same

    For memories
    0 - same
    1 - same
    2 - different
    3 - different
    4 - same
    """
    start_indices = [1]
    written_writes = np.random.rand(len(start_indices), 2, 1)
    buff.update_write_subtrajectories(written_writes, start_indices)
    new_memories = buff._memories
    expected_new_memories = old_memories.copy()
    expected_new_memories[2:4] = written_writes
    self.assertNpArraysNotEqual(old_memories, expected_new_memories)
    self.assertNpArraysNotEqual(old_memories, new_memories)
    self.assertNpEqual(new_memories, expected_new_memories)
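# Hypothetical sketch (not part of the buffer's API) of the offset the test
# above encodes: the write stored at internal index t is the memory observed
# at index t + 1, so a write subtrajectory starting at `start` overwrites
# memories [start + 1, start + 1 + subtraj_length). With start_indices=[1]
# and subtraj_length=2, that is exactly the slice [2:4] asserted above.
def _memory_slice_for_writes_sketch(start, subtraj_length):
    """Return the slice of internal memories overwritten by a write
    subtrajectory starting at `start` (write[t] -> memory[t + 1])."""
    return slice(start + 1, start + 1 + subtraj_length)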
def test_update_dloss_dmemories_works(self):
    env = StubMemoryEnv()
    buff = UpdatableSubtrajReplayBuffer(
        max_replay_buffer_size=100,
        env=env,
        subtraj_length=2,
        memory_dim=1,
    )
    last_write = rand()
    for _ in range(13):
        observation = rand(), last_write
        write = rand()
        action = rand(), write
        last_write = write
        buff.add_sample(observation, action, 1, False)
    """
    internal
    dL/dm idx    dL/dw idx    changed?
    0            n/a          different
    1            0            different
    2            1            same
    3            2            same
    4            3            different
    5            4            different
    6            5            same
    7            6            same
    8            7            different
    9            8            different
    10           9            same
    11           10           same
    12           11           same
    13           12           same
    """
    start_indices = [0, 4, 8]
    dloss_dmem = np.random.rand(len(start_indices), 2, 1)
    buff.update_dloss_dmemories_subtrajectories(dloss_dmem, start_indices)
    new_subtrajs, _ = buff.random_subtrajectories(
        len(start_indices),
        _fixed_start_indices=start_indices,
    )
    expected_dloss_dwrite = np.zeros_like(dloss_dmem)
    for i in range(len(start_indices)):
        expected_dloss_dwrite[i, 0, :] = dloss_dmem[i, 1, :]
    self.assertNpEqual(new_subtrajs['dloss_dwrites'], expected_dloss_dwrite)
def test_update_dloss_dmemories_works_overlap(self):
    env = StubMemoryEnv()
    buff = UpdatableSubtrajReplayBuffer(
        max_replay_buffer_size=100,
        env=env,
        subtraj_length=2,
        memory_dim=1,
    )
    last_write = rand()
    for _ in range(13):
        observation = rand(), last_write
        write = rand()
        action = rand(), write
        last_write = write
        buff.add_sample(observation, action, 1, False)
    """
    internal
    dL/dm idx    dL/dw idx    changed?
    0            n/a          different
    1            0            different
    2            1            same
    3            2            different
    4            3            different
    5            4            same
    """
    start_indices = [0, 3]
    dloss_dmem = np.random.rand(len(start_indices), 2, 1)
    buff.update_dloss_dmemories_subtrajectories(dloss_dmem, start_indices)
    new_subtrajs, _ = buff.random_subtrajectories(
        len(start_indices),
        _fixed_start_indices=[0, 1, 2, 3],
    )
    expected_dloss_dwrite = np.zeros((4, 2, 1))
    expected_dloss_dwrite[0, 0, :] = dloss_dmem[0, 1, :]
    expected_dloss_dwrite[1, 1, :] = dloss_dmem[1, 0, :]
    expected_dloss_dwrite[2, 0, :] = dloss_dmem[1, 0, :]
    expected_dloss_dwrite[2, 1, :] = dloss_dmem[1, 1, :]
    expected_dloss_dwrite[3, 0, :] = dloss_dmem[1, 1, :]
    self.assertNpEqual(new_subtrajs['dloss_dwrites'], expected_dloss_dwrite)
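# Hypothetical sketch of the index shift the two gradient tests above rely
# on: since write[t] produces memory[t + 1], a gradient dL/d(memory[t + 1])
# propagates back to write[t], and the last write in a window receives no
# gradient unless an overlapping subtrajectory supplies one. Assumes numpy
# arrays of shape (subtraj_length, memory_dim); not part of the buffer's API.
def _dloss_dwrites_from_dloss_dmem_sketch(dloss_dmem):
    """Shift per-step memory gradients back one step to get write gradients."""
    dloss_dwrite = np.zeros_like(dloss_dmem)
    dloss_dwrite[:-1] = dloss_dmem[1:]
    return dloss_dwrite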
def test__fixed_start_indices(self):
    env = StubMemoryEnv()
    buff = UpdatableSubtrajReplayBuffer(
        max_replay_buffer_size=100,
        env=env,
        subtraj_length=2,
        memory_dim=1,
    )
    for _ in range(25):
        observation = rand(), rand()
        action = rand(), rand()
        buff.add_sample(observation, action, 1, False)
    _, start_indices = buff.random_subtrajectories(15)
    _, new_start_indices = buff.random_subtrajectories(
        15,
        _fixed_start_indices=start_indices,
    )
    self.assertNpEqual(start_indices, new_start_indices)
def test_update_memories_updates_memories_2d(self):
    env = StubMemoryEnv(2)
    buff = UpdatableSubtrajReplayBuffer(
        max_replay_buffer_size=100,
        env=env,
        subtraj_length=2,
        memory_dim=2,
    )
    last_write = rand(2)
    for _ in range(13):
        observation = rand(), last_write
        write = rand(2)
        action = rand(), write
        last_write = write
        buff.add_sample(observation, action, 1, False)
    start_indices = [0, 4, 8]
    new_writes = np.random.rand(len(start_indices), 2, 2)
    buff.update_write_subtrajectories(new_writes, start_indices)
    new_subtrajs, _ = buff.random_subtrajectories(
        len(start_indices),
        _fixed_start_indices=start_indices,
    )
    self.assertNpEqual(new_subtrajs['writes'], new_writes)
def __init__(
        self,
        env,
        qf,
        policy,
        exploration_strategy,
        subtraj_length,
        tau=0.01,
        use_soft_update=True,
        target_hard_update_period=1000,
        use_action_policy_params_for_entire_policy=False,
        action_policy_optimize_bellman=True,
        write_policy_optimizes='both',
        action_policy_learning_rate=1e-3,
        write_policy_learning_rate=1e-5,
        qf_learning_rate=1e-3,
        bellman_error_loss_weight=10,
        refresh_entire_buffer_period=None,
        save_new_memories_back_to_replay_buffer=True,
        only_use_last_dqdm=False,
        action_policy_weight_decay=0,
        write_policy_weight_decay=0,
        do_not_load_initial_memories=False,
        save_memory_gradients=False,
        **kwargs
):
    """
    :param env: Environment to train on.
    :param qf: Q function to train.
    :param policy: Policy trained to optimize the Q function.
    :param exploration_strategy: Exploration strategy, passed on to the
    superclass constructor.
    :param subtraj_length: Length of the subtrajectories loaded.
    :param tau: Soft target update rate.
    :param use_soft_update: If False, use hard target updates.
    :param target_hard_update_period: Number of environment steps between
    hard updates.
    :param use_action_policy_params_for_entire_policy: If True, train the
    entire policy together, rather than training the action and write parts
    separately.
    :param action_policy_optimize_bellman: If True, the action policy also
    minimizes the Bellman error.
    :param write_policy_optimizes: One of 'qf', 'bellman', or 'both': which
    loss the write policy optimizes.
    :param action_policy_learning_rate: Learning rate for the action policy.
    :param write_policy_learning_rate: Learning rate for the write policy.
    :param qf_learning_rate: Learning rate for the Q function.
    :param bellman_error_loss_weight: Weight on the Bellman error loss term.
    :param refresh_entire_buffer_period: How often (if ever) to refresh the
    memories stored in the replay buffer.
    :param save_new_memories_back_to_replay_buffer: If True, write new
    memories produced during training back into the replay buffer.
    :param only_use_last_dqdm: If True, cut the gradients for all dQ/dmemory
    other than the last time step.
    :param action_policy_weight_decay: Weight decay for the action policy.
    :param write_policy_weight_decay: Weight decay for the write policy.
    :param do_not_load_initial_memories: If True, always zero-out the loaded
    initial memory.
    :param save_memory_gradients: If True, save and load dL/dmemory.
    :param kwargs: kwargs to pass onto the superclass constructor.
    """
    super().__init__(env, policy, exploration_strategy, **kwargs)
    assert write_policy_optimizes in ['qf', 'bellman', 'both']
    self.qf = qf
    self.policy = policy
    self.subtraj_length = subtraj_length
    self.tau = tau
    self.use_soft_update = use_soft_update
    self.target_hard_update_period = target_hard_update_period
    self.use_action_policy_params_for_entire_policy = (
        use_action_policy_params_for_entire_policy
    )
    self.action_policy_optimize_bellman = action_policy_optimize_bellman
    self.write_policy_optimizes = write_policy_optimizes
    self.action_policy_learning_rate = action_policy_learning_rate
    self.write_policy_learning_rate = write_policy_learning_rate
    self.qf_learning_rate = qf_learning_rate
    self.bellman_error_loss_weight = bellman_error_loss_weight
    self.should_refresh_buffer = ConditionTimer(
        refresh_entire_buffer_period
    )
    self.save_new_memories_back_to_replay_buffer = (
        save_new_memories_back_to_replay_buffer
    )
    self.only_use_last_dqdm = only_use_last_dqdm
    self.action_policy_weight_decay = action_policy_weight_decay
    self.write_policy_weight_decay = write_policy_weight_decay
    self.do_not_load_initial_memories = do_not_load_initial_memories
    self.save_memory_gradients = save_memory_gradients

    """
    Set some parameter-dependent values.
    """
    self.num_subtrajs_per_batch = self.batch_size // self.subtraj_length
    self.train_validation_num_subtrajs_per_batch = (
        self.num_subtrajs_per_batch
    )
    self.action_dim = int(self.env.action_space.flat_dim)
    self.obs_dim = int(self.env.observation_space.flat_dim)
    self.memory_dim = self.env.memory_dim
    self.max_number_trajectories_loaded_at_once = (
        self.num_subtrajs_per_batch
    )
    if not self.save_new_memories_back_to_replay_buffer:
        assert self.should_refresh_buffer.always_false, (
            "If save_new_memories_back_to_replay_buffer is False, "
            "you cannot refresh the replay buffer."
        )

    """
    Create the necessary networks, replay buffers, and optimizers.
    """
    self.replay_buffer = SplitReplayBuffer(
        UpdatableSubtrajReplayBuffer(
            self.replay_buffer_size,
            self.env,
            self.subtraj_length,
            self.memory_dim,
        ),
        UpdatableSubtrajReplayBuffer(
            self.replay_buffer_size,
            self.env,
            self.subtraj_length,
            self.memory_dim,
        ),
        fraction_paths_in_train=0.8,
    )
    self.target_qf = self.qf.copy()
    self.target_policy = self.policy.copy()
    self.qf_optimizer = optim.Adam(
        self.qf.parameters(), lr=self.qf_learning_rate
    )
    self.action_policy_optimizer = optim.Adam(
        self.policy.action_parameters(),
        lr=self.action_policy_learning_rate,
        weight_decay=self.action_policy_weight_decay,
    )
    self.write_policy_optimizer = optim.Adam(
        self.policy.write_parameters(),
        lr=self.write_policy_learning_rate,
        weight_decay=self.write_policy_weight_decay,
    )
    self.policy_optimizer = optim.Adam(
        self.policy.parameters(),
        lr=self.action_policy_learning_rate,
        weight_decay=self.action_policy_weight_decay,
    )
    if self.save_memory_gradients:
        self.saved_grads = {}
        self.save_hook = self.create_save_grad_hook('dl_dmemory')