Example #1
File: HER.py Project: ymd-h/cpprb
    def test_episode(self):
        rew_func = lambda s, a, g: -1 * (s != g)
        hrb = HindsightReplayBuffer(size=10,
                                    env_dict={
                                        "obs": {},
                                        "act": {},
                                        "next_obs": {}
                                    },
                                    max_episode_len=2,
                                    strategy="episode",
                                    reward_func=rew_func,
                                    additional_goals=2,
                                    prioritized=False)
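        # Two-step episode: each transition is stored with its true goal plus
        # additional_goals=2 relabeled goals, so 2 * (1 + 2) = 6 transitions.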
        hrb.add(obs=0, act=0, next_obs=1)
        hrb.add(obs=1, act=0, next_obs=2)
        hrb.on_episode_end(3)
        self.assertEqual(hrb.get_stored_size(), 6)

        sample = hrb.get_all_transitions()
        self.assertIn("rew", sample)
        self.assertIn("goal", sample)
        self.assertEqual(sample["obs"].shape, (6, 1))
        np.testing.assert_allclose(
            sample["rew"],
            rew_func(sample["next_obs"], sample["act"], sample["goal"]))
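These test methods belong to a unittest.TestCase subclass; the module header is omitted from the excerpts. A minimal sketch of the imports the snippets appear to rely on (assumed, not shown in the source):

import unittest

import numpy as np

from cpprb import HindsightReplayBuffer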
Example #2
File: HER.py Project: ymd-h/cpprb
    def test_PER(self):
        rew_func = lambda s, a, g: -1 * (s != g)
        batch_size = 4

        hrb = HindsightReplayBuffer(size=10,
                                    env_dict={
                                        "obs": {},
                                        "act": {},
                                        "next_obs": {}
                                    },
                                    max_episode_len=2,
                                    strategy="future",
                                    reward_func=rew_func,
                                    additional_goals=2,
                                    prioritized=True)

        hrb.add(obs=0, act=0, next_obs=1)
        hrb.add(obs=1, act=0, next_obs=2)

        hrb.on_episode_end(3)
        self.assertEqual(hrb.get_stored_size(), 6)

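        # With prioritized=True, the sampled batch also contains "indexes",
        # which update_priorities() uses below to assign new priorities.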
        sample = hrb.sample(batch_size)
        hrb.update_priorities(indexes=sample["indexes"],
                              priorities=np.zeros_like(sample["indexes"],
                                                       dtype=np.float64))
Example #3
File: HER.py Project: ymd-h/cpprb
    def test_goal_final(self):
        rew_func = lambda s, a, g: -1 * (s[:, :3] != g).any(axis=1)
        goal_func = lambda s: s[:, :3]

        hrb = HindsightReplayBuffer(size=10,
                                    env_dict={
                                        "obs": {"shape": 5},
                                        "act": {},
                                        "next_obs": {"shape": 5}
                                    },
                                    max_episode_len=10,
                                    reward_func=rew_func,
                                    goal_func=goal_func,
                                    goal_shape=3,
                                    strategy="final")

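        # goal_func keeps the first three components of each 5-dim observation,
        # so relabeled goals match goal_shape=3.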
        hrb.add(obs=(0, 0, 0, 0, 0), act=0, next_obs=(1, 1, 1, 1, 1))
        hrb.add(obs=(1, 1, 1, 1, 1), act=0, next_obs=(2, 2, 2, 2, 2))
        self.assertEqual(hrb.get_stored_size(), 0)

        hrb.on_episode_end((3, 3, 3))
        self.assertEqual(hrb.get_stored_size(), 4)

        sample = hrb.get_all_transitions()
        self.assertIn("goal", sample)
        self.assertEqual(sample["goal"].shape, (4, 3))
        np.testing.assert_allclose(
            sample["goal"], [[3, 3, 3], [3, 3, 3], [2, 2, 2], [2, 2, 2]])
Example #4
File: HER.py Project: ymd-h/cpprb
    def test_stored_size(self):
        """
        Test get_stored_size() method
        """
        hrb = HindsightReplayBuffer(size=10,
                                    env_dict={
                                        "obs": {},
                                        "act": {},
                                        "next_obs": {}
                                    },
                                    max_episode_len=2,
                                    reward_func=lambda s, a, g: -1 * (s != g),
                                    additional_goals=1,
                                    prioritized=False)
        # Buffer is initialized without data
        self.assertEqual(hrb.get_stored_size(), 0)
        self.assertEqual(hrb.additional_goals, 1)

        # During an episode, the stored size doesn't increase
        hrb.add(obs=0, act=0, next_obs=0)
        self.assertEqual(hrb.get_stored_size(), 0)

        # On episode end, stored size increases by `episode_len * (1 + additional_goals)`
        hrb.on_episode_end(1)
        self.assertEqual(hrb.get_stored_size(), 2)

        # If no transitions were added in the current episode, nothing happens
        hrb.on_episode_end(1)
        self.assertEqual(hrb.get_stored_size(), 2)

        # By calling clear(), the stored size becomes 0 again.
        hrb.clear()
        self.assertEqual(hrb.get_stored_size(), 0)
Example #5
File: dqn-her.py Project: ymd-h/cpprb
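This excerpt begins inside the action-selection branch of the training loop: the exploration test that opens the if/else (presumably epsilon-greedy) sits just above it, and sg() builds the goal-conditioned input for the Q-network.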
        act = env.action_space.sample()
    else:
        Q = tf.squeeze(model(sg(obs.reshape(1, -1), goal)))
        act = np.argmax(Q)

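    # Gym (< 0.26) step API returns (obs, reward, done, info); the environment
    # reward is dropped here because the HER buffer recomputes rewards from
    # goals via reward_func.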
    next_obs, _, done, info = env.step(act)
    ep += 1

    rb.add(obs=obs,
           act=act,
           next_obs=next_obs)

    if done or (ep >= max_episode_len):
        # Store the finished episode with its own desired goal before the env
        # is reset and a new goal is sampled.
        rb.on_episode_end(goal)
        obs = env.reset()
        goal = env.goal.copy().reshape((1, -1))
        n_episode += 1
        ep = 0
    else:
        obs = next_obs

    if rb.get_stored_size() < nwarmup:
        continue

    if prioritized:
        sample = rb.sample(batch_size, beta)
        beta += beta_step
    else:
        sample = rb.sample(batch_size)

    weights = sample["weights"].ravel() if prioritized else tf.constant(1.0)
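The loop calls a helper sg() that is not shown in this excerpt; presumably it concatenates observation and goal into the network input. A minimal sketch under that assumption (name and signature taken from the call site, not from the source):

import numpy as np

def sg(state, goal):
    # Stack state and goal along the feature axis: with state of shape
    # (1, obs_dim) and goal of shape (1, goal_dim), the Q-network receives a
    # single (1, obs_dim + goal_dim) row.
    return np.concatenate((state, goal), axis=1)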