Example #1
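Both examples rely on NumPy, TensorFlow, and DeepChem's legacy TensorGraph layers and optimizers. A plausible preamble is sketched below; the exact module paths are assumptions and shifted between DeepChem releases.

import numpy as np
import tensorflow as tf
import deepchem as dc
# Assumed locations in the legacy TensorGraph API; later releases moved these.
from deepchem.models.tensorgraph.layers import Dense, Constant
from deepchem.models.tensorgraph.optimizers import Adam, PolynomialDecay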
    def test_hindsight(self):
        """Test Hindsight Experience Replay."""

        # The environment is a plane in which the agent moves by steps until it reaches a randomly
        # positioned goal.  No reward is given until it reaches the goal.  That makes it very hard
        # to learn by standard methods, since it may take a very long time to receive any feedback
        # at all.  Using hindsight makes it much easier.

        class TestEnvironment(dc.rl.Environment):
            def __init__(self):
                super(TestEnvironment, self).__init__((4, ), 4)
                self.moves = [(-1, 0), (1, 0), (0, -1), (0, 1)]

            def reset(self):
                self._state = np.concatenate([[0, 0],
                                              np.random.randint(-50, 50, 2)])
                self._terminated = False
                self.count = 0

            def step(self, action):
                new_state = self._state.copy()
                new_state[:2] += self.moves[action]
                self._state = new_state
                self.count += 1
                reward = 0
                if np.array_equal(new_state[:2], new_state[2:]):
                    self._terminated = True
                    reward = 1
                elif self.count == 1000:
                    self._terminated = True
                return reward

            def apply_hindsight(self, states, actions, goal):
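                # Relabel the trajectory as if `goal` had been the intended goal
                # all along: overwrite the goal half of every state and recompute
                # the rewards accordingly.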
                new_states = []
                rewards = []
                goal_pos = goal[:2]
                for state, action in zip(states, actions):
                    new_state = state.copy()
                    new_state[2:] = goal_pos
                    new_states.append(new_state)
                    pos_after_action = new_state[:2] + self.moves[action]
                    if np.array_equal(pos_after_action, goal_pos):
                        rewards.append(1)
                    else:
                        rewards.append(0)
                return new_states, rewards

        # A simple policy with two hidden layers.

        class TestPolicy(dc.rl.Policy):
            def create_layers(self, state, **kwargs):

                dense1 = Dense(6, activation_fn=tf.nn.relu, in_layers=state)
                dense2 = Dense(6, activation_fn=tf.nn.relu, in_layers=dense1)
                output = Dense(4,
                               activation_fn=tf.nn.softmax,
                               biases_initializer=None,
                               in_layers=dense2)
                value = Dense(1, in_layers=dense2)
                return {'action_prob': output, 'value': value}

        # Optimize it.

        env = TestEnvironment()
        learning_rate = PolynomialDecay(initial_rate=0.0001,
                                        final_rate=0.00005,
                                        decay_steps=1500000)
        ppo = dc.rl.PPO(env,
                        TestPolicy(),
                        use_hindsight=True,
                        optimization_epochs=8,
                        optimizer=Adam(learning_rate=learning_rate))
        ppo.fit(1500000)

        # Try running it a few times and see if it succeeds.

        pass_count = 0
        for i in range(5):
            env.reset()
            while not env.terminated:
                env.step(ppo.select_action(env.state))
            if np.array_equal(env.state[:2], env.state[2:]):
                pass_count += 1
        assert pass_count >= 3
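As a rough illustration of what use_hindsight buys, here is a minimal standalone sketch (assuming the TestEnvironment class above is available at module level): a random rollout essentially never earns a reward, but relabeling it with apply_hindsight against the position it actually reached turns the final transition into a rewarded one.

env = TestEnvironment()
env.reset()
states, actions = [], []
while not env.terminated:
    action = np.random.randint(4)   # random walk; it almost never hits the real goal
    states.append(env.state)
    actions.append(action)
    env.step(action)
# Relabel the trajectory against wherever it actually ended up.
new_states, rewards = env.apply_hindsight(states, actions, env.state)
assert rewards[-1] == 1             # the last move now reaches the substituted goal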
Example #2
    def test_continuous(self):
        """Test A3C on an environment with a continous action space."""

        # The state consists of two numbers: a current value and a target value.
        # The policy just needs to learn to output the target value (or at least
        # move toward it).

        class TestEnvironment(dc.rl.Environment):
            def __init__(self):
                super(TestEnvironment, self).__init__((2, ),
                                                      action_shape=(1, ))

            def reset(self):
                target = np.random.uniform(-50, 50)
                self._state = np.array([0, target])
                self._terminated = False
                self.count = 0

            def step(self, action):
                target = self._state[1]
                dist = np.abs(target - action[0])
                old_dist = np.abs(target - self._state[0])
                new_state = np.array([action[0], target])
                self._state = new_state
                self.count += 1
                reward = old_dist - dist
                self._terminated = (self.count == 10)
                return reward

        # A simple policy with no hidden layers.

        class TestPolicy(dc.rl.Policy):
            def create_layers(self, state, **kwargs):
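                # The continuous policy is parameterized as a Gaussian: a learned
                # mean, a fixed standard deviation of 10, and a separate value head.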
                action_mean = Dense(1,
                                    in_layers=state,
                                    weights_initializer=tf.zeros_initializer)
                action_std = Constant([10.0])
                value = Dense(1, in_layers=state)
                return {
                    'action_mean': action_mean,
                    'action_std': action_std,
                    'value': value
                }

        # Optimize it.

        env = TestEnvironment()
        learning_rate = PolynomialDecay(initial_rate=0.005,
                                        final_rate=0.0005,
                                        decay_steps=25000)
        a3c = dc.rl.A3C(env,
                        TestPolicy(),
                        discount_factor=0,
                        optimizer=Adam(learning_rate=learning_rate))
        a3c.fit(25000)

        # Try running it and see if it reaches the target.

        env.reset()
        while not env.terminated:
            env.step(a3c.select_action(env.state, deterministic=True))
        distance = np.abs(env.state[0] - env.state[1])
        tolerance = max(1.0, 0.1 * np.abs(env.state[1]))
        assert distance < tolerance
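A detail worth noting in Example #2: because each step's reward is old_dist - dist, the per-step rewards telescope, so the undiscounted return of an episode is simply the distance closed between the start and the final state. A quick standalone check with hypothetical numbers (reusing the numpy import above):

target = 30.0
positions = [0.0, 12.0, 25.0, 29.5]            # hypothetical values taken by state[0]
rewards = [abs(target - old) - abs(target - new)
           for old, new in zip(positions[:-1], positions[1:])]
# The shaped rewards sum to the total distance closed over the episode.
assert np.isclose(sum(rewards), abs(target - positions[0]) - abs(target - positions[-1]))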