Example No. 1
    def test_runUsesNewStateAfterIteration(self):
        self.env.TakeAction.side_effect = [
            q_base.Transition(
                s=numpy.array([[0]]),
                a=numpy.array([[0]]),
                r=1.0,
                sp=numpy.array([[1]]),
            ),
            q_base.Transition(
                s=numpy.array([[1]]),
                a=numpy.array([[0]]),
                r=1.0,
                sp=None,
            )
        ]

        self.runner.Run(
            env=self.env,
            qfunc=self.qfunc,
            policy=self.policy,
            num_of_episodes=1,
        )

        # Verifies that the second Decide call uses the new state 1.
        self.policy.Decide.assert_called_with(env=mock.ANY,
                                              qfunc=mock.ANY,
                                              state=numpy.array([[1]]),
                                              episode_idx=0,
                                              num_of_episodes=1)
Example No. 2
    def test_UpdateValues_singleTransition(self):
        self.qfunc._protected_SetValues(
            numpy.array([
                [1, 2, 3],
                [4, 5, 6],
                [2, 2, 2],
            ]), numpy.array([
                [0.5, 0.5],
                [0.3, 0.7],
                [0.8, 0.9],
            ]))

        self.qfunc.UpdateValues([
            q_base.Transition(
                s=numpy.array([[1, 2, 3]]),
                a=numpy.array([[0, 1]]),
                r=1.0,
                sp=numpy.array([[2, 2, 2]]),
            )
        ])

        # The new values for state (1,2,3) should be:
        # - action (1,0): 0.5, since it's not changed.
        # - action (0,1): max(0.8, 0.9) * 0.5 + 1.0 = 1.45
        self.assertArrayEq(numpy.array([[0.5, 1.45]]),
                           self.qfunc.GetValues(numpy.array([[1, 2, 3]])))
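
For reference, the expected 1.45 above is the standard Q-learning target, r + gamma * max_a' Q(sp, a'), applied with full replacement of the taken action's value; the discount factor of 0.5 is an assumption read off the expected number, not shown in this snippet. A minimal arithmetic sketch in plain numpy (an illustration, not the library's UpdateValues internals):

    import numpy

    gamma = 0.5                            # assumed discount factor implied by 1.45
    old_row = numpy.array([0.5, 0.5])      # stored values for state (1, 2, 3)
    next_row = numpy.array([0.8, 0.9])     # stored values for next state (2, 2, 2)
    action = numpy.array([0, 1])           # one-hot action that was taken
    target = 1.0 + gamma * next_row.max()  # r + gamma * max_a' Q(sp, a') = 1.45
    new_row = numpy.where(action == 1, target, old_row)
    assert numpy.allclose(new_row, [0.5, 1.45])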
Example No. 3
    def test_learningRate(self):
        # Disables learning from Q* to simplify testing.
        qfunc = qfunc_impl.MemoizationQFunction(
            action_space_size=2,
            discount_factor=0.0,
            learning_rate=0.9,
        )
        qfunc._protected_SetValues(numpy.array([
            [1, 2, 3],
            [4, 5, 6],
        ]), numpy.array([
            [0.5, 0.6],
            [0.3, 0.7],
        ]))
        qfunc.UpdateValues([
            q_base.Transition(
                s=numpy.array([[1, 2, 3]]),
                a=numpy.array([[0, 1]]),
                r=1.0,
                sp=numpy.array([[2, 2, 2]]),
            )
        ])

        # The new values for state (1,2,3) should be:
        # - action (1,0): 0.5, since it's not changed.
        # - action (0,1): (1-0.9) * 0.6 + 0.9 * 1.0 = 0.96.
        self.assertArrayEq(numpy.array([[0.5, 0.96]]),
                           qfunc.GetValues(numpy.array([[1, 2, 3]])))
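
The expected 0.96 above is the learning-rate-weighted blend of the old value and the target; with discount_factor=0.0 the target collapses to the reward alone. A minimal arithmetic sketch in plain Python (an illustration, not the library API):

    old_value = 0.6   # stored value for action (0, 1) in state (1, 2, 3)
    alpha = 0.9       # the learning_rate passed to the constructor
    target = 1.0      # reward only, because discount_factor=0.0
    new_value = (1 - alpha) * old_value + alpha * target
    assert abs(new_value - 0.96) < 1e-9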
Example No. 4
    def TakeAction(self, action: q_base.Action) -> q_base.Transition:
        current_state = self._current_state
        move = self.GetChoiceFromAction(action) - 1  # -1, 0, 1
        new_state = current_state + move

        # Reward: 0 for staying put, +1 for moving toward 0, -1 for moving away.
        if move == 0:
            r = 0
        elif move == 1 and current_state < 0:
            r = 1
        elif move == -1 and current_state > 0:
            r = 1
        else:
            r = -1

        # Wrap around when the new state passes either end of [-size, size].
        if new_state > self._size:
            new_state = -self._size
        elif new_state < -self._size:
            new_state = self._size

        s = numpy.array([[current_state]])
        a = action
        if self._num_actions_taken >= STEP_LIMIT:
            sp = None
        else:
            sp = numpy.array([[new_state]])

        self._current_state = new_state
        self._num_actions_taken += 1

        return q_base.Transition(s, a, r, sp)
Example No. 5
    def TakeAction(self, action: Action) -> Transition:
        if self._action_count >= self._step_limit:
            sp = None
        else:
            sp = self._state

        self._action_count += 1
        return q_base.Transition(s=self._state, a=action, r=0.0, sp=sp)
Example No. 6
    def test_updateValues_swapModels(self):
        q1 = self.qfunc._q1
        q2 = self.qfunc._q2
        self.qfunc.UpdateValues([
            q_base.Transition(s=numpy.array([[1, 2, 3]]),
                              a=numpy.array([[1, 0]]),
                              r=1.0,
                              sp=numpy.array([[4, 5, 6]]))
        ])

        self.assertEqual(q1, self.qfunc._q2)
        self.assertEqual(q2, self.qfunc._q1)
Example No. 7
    def test_memoryManagement(self):
        qfunc = qfunc_impl.RandomValueQFunction(action_space_size=2)
        runner = runner_impl.ExperienceReplayRunner(
            experience_capacity=1,
            experience_sample_batch_size=1,
            train_every_n_steps=1)

        tran1 = q_base.Transition(s=numpy.array([[1, 2]]),
                                  a=numpy.array([[1, 0]]),
                                  r=1,
                                  sp=numpy.array([[3, 4]]))

        tran2 = q_base.Transition(s=numpy.array([[3, 4]]),
                                  a=numpy.array([[0, 1]]),
                                  r=1,
                                  sp=numpy.array([[5, 6]]))

        runner._protected_ProcessTransition(qfunc, tran1, 0)
        runner._protected_ProcessTransition(qfunc, tran2, 1)

        hist = runner._experience._history
        self.assertEqual(1, len(hist))
        self.assertEqual(tran2, hist[0])
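
The eviction this test checks (tran1 dropped once tran2 arrives) is what a keep-latest buffer of capacity 1 does. A minimal sketch of that idea using a bounded deque (an illustration only; how ExperienceReplayRunner stores its history internally is not shown here):

    from collections import deque

    history = deque(maxlen=1)  # experience_capacity=1
    history.append("tran1")
    history.append("tran2")    # evicts tran1, keeping only the newest transition
    assert list(history) == ["tran2"]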
Example No. 8
    def test_convergence(self):
        trans = [
            q_base.Transition(
                s=numpy.array([[1, 2, 3]]),
                a=numpy.array([[1, 0]]),
                r=1.0,
                sp=None,
            )
        ]
        states, actions, target_action_values = None, None, None
        for _ in range(100):
            states, actions, target_action_values = self.qfunc.UpdateValues(
                trans)

        error1_1 = numpy.sum(
            numpy.abs(
                self.qfunc.GetActionValues(self.qfunc.GetValues(states),
                                           actions) - target_action_values))
        states, actions, target_action_values = self.qfunc.UpdateValues(trans)
        error1_2 = numpy.sum(
            numpy.abs(
                self.qfunc.GetActionValues(self.qfunc.GetValues(states),
                                           actions) - target_action_values))
        # One more update is needed to swap q1 and q2 back.
        states, actions, target_action_values = self.qfunc.UpdateValues(trans)

        # Since an even number of iterations was used in the first loop, an even
        # number must be used here as well to make sure it's the same model that's
        # being compared.
        for _ in range(100):
            states, actions, target_action_values = self.qfunc.UpdateValues(
                trans)

        error2_1 = numpy.sum(
            numpy.abs(
                self.qfunc.GetActionValues(self.qfunc.GetValues(states),
                                           actions) - target_action_values))
        states, actions, target_action_values = self.qfunc.UpdateValues(trans)
        error2_2 = numpy.sum(
            numpy.abs(
                self.qfunc.GetActionValues(self.qfunc.GetValues(states),
                                           actions) - target_action_values))

        # Only compare errors from the same model.
        self.assertLessEqual(error2_1, error1_1)
        self.assertLessEqual(error2_2, error1_2)
Example No. 9
    def TakeAction(self, action: Action) -> Transition:

        if self._render_frames:
            self._gym_env.render()
            self._fps_controller.WaitUntilNextAllowedTime()

        if self._recorder and self._recorder.enabled:
            self._recorder.capture_frame()

        observation, reward, done, info = self._gym_env.step(
            self.GetChoiceFromAction(action))

        if done:
            sp = None
        else:
            sp = self._ConvertState(observation)

        transition = q_base.Transition(s=self._current_state,
                                       a=action,
                                       r=reward,
                                       sp=sp)
        self._current_state = sp
        return transition
Example No. 10
    def test_UpdateValues_environmentDone(self):
        self.qfunc._protected_SetValues(
            numpy.array([
                [1, 2, 3],
                [4, 5, 6],
            ]), numpy.array([
                [0.5, 0.5],
                [0.3, 0.7],
            ]))

        self.qfunc.UpdateValues([
            q_base.Transition(
                s=numpy.array([[1, 2, 3]]),
                a=numpy.array([[0, 1]]),
                r=1.0,
                sp=None,
            )
        ])

        # The new values for state (1,2,3) should be:
        # - action (1,0): 0.5, since it's not changed.
        # - action (0,1): 1.0; since the environment is done, only the reward is used.
        self.assertArrayEq(numpy.array([[0.5, 1.0]]),
                           self.qfunc.GetValues(numpy.array([[1, 2, 3]])))
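
The terminal case above is the degenerate form of the same update: with sp=None there is no next state to bootstrap from, so the target is the reward alone and only the taken action's value changes. A minimal arithmetic sketch in plain numpy (an illustration, not the library's UpdateValues internals):

    import numpy

    old_row = numpy.array([0.5, 0.5])  # stored values for state (1, 2, 3)
    action = numpy.array([0, 1])       # one-hot action that was taken
    target = 1.0                       # reward only; sp is None, so no bootstrap term
    new_row = numpy.where(action == 1, target, old_row)
    assert numpy.allclose(new_row, [0.5, 1.0])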