def test_episode(self):
    rew_func = lambda s, a, g: -1 * (s != g)
    hrb = HindsightReplayBuffer(size=10,
                                env_dict={"obs": {}, "act": {}, "next_obs": {}},
                                max_episode_len=2,
                                strategy="episode",
                                reward_func=rew_func,
                                additional_goals=2,
                                prioritized=False)

    hrb.add(obs=0, act=0, next_obs=1)
    hrb.add(obs=1, act=0, next_obs=2)
    hrb.on_episode_end(3)
    self.assertEqual(hrb.get_stored_size(), 6)

    sample = hrb.get_all_transitions()
    self.assertIn("rew", sample)
    self.assertIn("goal", sample)
    self.assertEqual(sample["obs"].shape, (6, 1))
    np.testing.assert_allclose(sample["rew"],
                               rew_func(sample["next_obs"],
                                        sample["act"],
                                        sample["goal"]))
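# Note: with strategy="episode" (as in the HER paper), the additional goals
# for each transition are drawn from states achieved within the same episode,
# so every transition is stored (1 + additional_goals) times; here
# 2 transitions x (1 + 2) = 6 stored transitions.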
def test_PER(self):
    rew_func = lambda s, a, g: -1 * (s != g)
    batch_size = 4
    hrb = HindsightReplayBuffer(size=10,
                                env_dict={"obs": {}, "act": {}, "next_obs": {}},
                                max_episode_len=2,
                                strategy="future",
                                reward_func=rew_func,
                                additional_goals=2,
                                prioritized=True)

    hrb.add(obs=0, act=0, next_obs=1)
    hrb.add(obs=1, act=0, next_obs=2)
    hrb.on_episode_end(3)
    self.assertEqual(hrb.get_stored_size(), 6)

    sample = hrb.sample(batch_size)
    hrb.update_priorities(indexes=sample["indexes"],
                          priorities=np.zeros_like(sample["indexes"],
                                                   dtype=np.float64))
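# Note: with prioritized=True, sample() additionally returns "weights"
# (importance-sampling weights) and "indexes"; update_priorities() expects
# matching index/priority arrays. Setting a priority to zero makes that
# transition effectively never sampled again.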
def test_goal_final(self):
    rew_func = lambda s, a, g: -1 * (s[:, :3] != g).any(axis=1)
    goal_func = lambda s: s[:, :3]
    hrb = HindsightReplayBuffer(10,
                                {"obs": {"shape": 5},
                                 "act": {},
                                 "next_obs": {"shape": 5}},
                                max_episode_len=10,
                                reward_func=rew_func,
                                goal_func=goal_func,
                                goal_shape=3,
                                strategy="final")

    hrb.add(obs=(0, 0, 0, 0, 0), act=0, next_obs=(1, 1, 1, 1, 1))
    hrb.add(obs=(1, 1, 1, 1, 1), act=0, next_obs=(2, 2, 2, 2, 2))
    self.assertEqual(hrb.get_stored_size(), 0)

    hrb.on_episode_end((3, 3, 3))
    self.assertEqual(hrb.get_stored_size(), 4)

    sample = hrb.get_all_transitions()
    self.assertIn("goal", sample)
    self.assertEqual(sample["goal"].shape, (4, 3))
    np.testing.assert_allclose(sample["goal"],
                               [[3, 3, 3],
                                [3, 3, 3],
                                [2, 2, 2],
                                [2, 2, 2]])
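# Note: goal_func projects the 5-dim state into the 3-dim goal space, and
# strategy="final" adds one relabeled copy of each transition whose goal is
# goal_func() of the episode's final next_obs. Hence the 2 transitions yield
# 4 rows: two with the true goal (3, 3, 3) and two with the relabeled final
# goal (2, 2, 2).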
def test_stored_size(self):
    """
    Test get_stored_size() method
    """
    hrb = HindsightReplayBuffer(size=10,
                                env_dict={"obs": {}, "act": {}, "next_obs": {}},
                                max_episode_len=2,
                                reward_func=lambda s, a, g: -1 * (s != g),
                                additional_goals=1,
                                prioritized=False)

    # Buffer is initialized without data
    self.assertEqual(hrb.get_stored_size(), 0)
    self.assertEqual(hrb.additional_goals, 1)

    # During an episode, the stored size doesn't increase
    hrb.add(obs=0, act=0, next_obs=0)
    self.assertEqual(hrb.get_stored_size(), 0)

    # On episode end, the stored size increases by
    # `episode_len * (1 + additional_goals)`
    hrb.on_episode_end(1)
    self.assertEqual(hrb.get_stored_size(), 2)

    # If there are no transitions in the current episode, nothing happens
    hrb.on_episode_end(1)
    self.assertEqual(hrb.get_stored_size(), 2)

    # After calling clear(), the stored size becomes 0 again
    hrb.clear()
    self.assertEqual(hrb.get_stored_size(), 0)
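# Minimal end-to-end sketch of the HindsightReplayBuffer workflow, using only
# the API exercised by the tests above (and assuming the same
# `from cpprb import HindsightReplayBuffer` import): add() buffers transitions
# for the current episode, on_episode_end(goal) relabels and commits them, and
# sample() draws minibatches that carry the relabeled "goal" and "rew" keys.
def her_buffer_sketch():
    rew_func = lambda s, a, g: -1 * (s != g)
    hrb = HindsightReplayBuffer(size=100,
                                env_dict={"obs": {}, "act": {}, "next_obs": {}},
                                max_episode_len=4,
                                strategy="future",
                                reward_func=rew_func,
                                additional_goals=2,
                                prioritized=False)
    obs = 0
    for _ in range(4):
        hrb.add(obs=obs, act=0, next_obs=obs + 1)
        obs += 1
    hrb.on_episode_end(4)   # true goal of the finished episode
    # 4 transitions x (1 + 2) goals = 12 stored transitions
    return hrb.sample(8)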
        # (excerpt: the epsilon-greedy condition preceding this line is not shown)
        act = env.action_space.sample()
    else:
        # Greedy action from the goal-conditioned Q-network
        Q = tf.squeeze(model(sg(obs.reshape(1, -1), goal)))
        act = np.argmax(Q)

    # Environment reward is discarded; HER recomputes it via reward_func
    next_obs, _, done, info = env.step(act)
    ep += 1

    rb.add(obs=obs, act=act, next_obs=next_obs)

    if done or (ep >= max_episode_len):
        obs = env.reset()
        goal = env.goal.copy().reshape((1, -1))
        rb.on_episode_end(goal)
        n_episode += 1
        ep = 0
    else:
        obs = next_obs

    # Skip training until the buffer holds enough warm-up transitions
    if rb.get_stored_size() < nwarmup:
        continue

    if prioritized:
        sample = rb.sample(batch_size, beta)
        beta += beta_step
    else:
        sample = rb.sample(batch_size)

    weights = sample["weights"].ravel() if prioritized else tf.constant(1.0)
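# The loop above relies on a helper `sg` that is not defined in this excerpt.
# A minimal sketch under the assumption that it simply concatenates the
# batched observation and goal into a single Q-network input; the name and
# signature follow the call site above.
def sg(state, goal):
    # Assumes both arguments are 2-D arrays of shape (batch, dim)
    return np.concatenate((state, goal), axis=1)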