Example #1
 def testCombineTransitions(self):
   states, actions, rewards, new_states, reward_mask = (
     base.Brain.CombineTransitions([
       base.Transition(
         s=numpy.array([[1, 2, 3]]),
         a=numpy.array([[0, 1, 0]]),
         r=1.0,
         sp=numpy.array([[4, 5, 6]]),
       ),
       base.Transition(
         s=numpy.array([[4, 5, 6]]),
         a=numpy.array([[0, 0, 1]]),
         r=-1.0,
         sp=None,
       ),
     ]))
   numpy_util.TestUtil.AssertArrayEqual(
     numpy.array([[1, 2, 3], [4, 5, 6]]), states)
   numpy_util.TestUtil.AssertArrayEqual(
     numpy.array([[0, 1, 0], [0, 0, 1]]), actions)
   numpy_util.TestUtil.AssertArrayEqual(
     numpy.array([1.0, -1.0]), rewards)
   numpy_util.TestUtil.AssertArrayEqual(
     numpy.array([[4, 5, 6], [4, 5, 6]]), new_states)
   numpy_util.TestUtil.AssertArrayEqual(
     numpy.array([1, 0]), reward_mask)
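A note on the expected reward_mask: it appears to be 1 for transitions whose sp is present and 0 for terminal ones (sp=None), which gives [1, 0] for the two transitions above. A minimal illustrative sketch of that derivation (an assumption, not necessarily how CombineTransitions computes it):

    # transitions: the list passed to CombineTransitions above.
    reward_mask = numpy.array([0 if t.sp is None else 1 for t in transitions])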
Example #2
    def test_convergence(self):
        a3c = a3c_impl.A3C(
            model=a3c_impl.CreateModel(
                state_shape=(3, ),
                action_space_size=2,
                hidden_layer_sizes=(3, ),
            ),
            # optimizer=a3c_impl.CreateDefaultOptimizer(learning_rate=0.05),
        )
        s = numpy.array([[1, 2, 3]])
        a1 = numpy.array([[1, 0]])
        a2 = numpy.array([[0, 1]])

        for _ in range(10):
            # Needs to train on both actions as one step, otherwise it shows a
            # "staggering" effect.
            a3c.UpdateFromTransitions([
                base.Transition(s=s, a=a1, r=1.0, sp=None),
            ])
            a3c.UpdateFromTransitions([
                base.Transition(s=s, a=a2, r=-1.0, sp=s),
            ])
            logging.printf('%s', a3c.GetValues(s))
        old_value_a1 = a3c.GetActionValues(a3c.GetValues(s), a1)
        # Trains for one step, for both actions.
        a3c.UpdateFromTransitions([
            base.Transition(s=s, a=a1, r=1.0, sp=None),
        ])
        a3c.UpdateFromTransitions([
            base.Transition(s=s, a=a2, r=-1.0, sp=s),
        ])
        self.assertGreaterEqual(a3c.GetActionValues(a3c.GetValues(s), a1),
                                old_value_a1)
Example #3
  def test_runUsesNewStateAfterIteration(self):
    self.env.TakeAction.side_effect = [
      base.Transition(
        s=numpy.array([[0]]),
        a=numpy.array([[0]]),
        r=1.0,
        sp=numpy.array([[1]]),
      ),
      base.Transition(
        s=numpy.array([[1]]),
        a=numpy.array([[0]]),
        r=1.0,
        sp=None,
      )
    ]

    self.runner.Run(
      env=self.env,
      brain=self.qfunc,
      policy=self.policy,
      num_of_episodes=1,
    )

    # Tests that the second call uses the new state 1.
    self.policy.Decide.assert_called_with(
      env=mock.ANY, brain=mock.ANY, state=numpy.array([[1]]),
      episode_idx=0, num_of_episodes=1)
Example #4
  def test_UpdateValues_singleTransition(self):
    self.qfunc._protected_SetValues(
      numpy.array([
        [1, 2, 3],
        [4, 5, 6],
        [2, 2, 2],
      ]),
      numpy.array([
        [0.5, 0.5],
        [0.3, 0.7],
        [0.8, 0.9],
      ]))

    self.qfunc.UpdateFromTransitions([base.Transition(
      s=numpy.array([[1, 2, 3]]),
      a=numpy.array([[0, 1]]),
      r=1.0,
      sp=numpy.array([[2, 2, 2]]),
    )])

    # The new values for state (1,2,3) should be:
    # - action (1,0): 0.5, since it's not changed.
    # - action (0,1): max(0.8, 0.9) * 0.5 + 1.0 = 1.45
    numpy_util.TestUtil.AssertArrayEqual(
      numpy.array([[0.5, 1.45]]),
      self.qfunc.GetValues(numpy.array([[1, 2, 3]])))
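A quick arithmetic check of the expected 1.45 above, assuming the qfunc under test uses discount_factor=0.5 and learning_rate=1.0 (inferred from the expected value, not shown in this snippet):

    # r + gamma * max_a' Q(sp, a') = 1.0 + 0.5 * max(0.8, 0.9)
    target = 1.0 + 0.5 * max(0.8, 0.9)  # == 1.45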
Example #5
  def test_learningRate(self):
    # Disables learning from Q* to simplify testing.
    qfunc = qfunc_impl.MemoizationQFunction(
      action_space_size=2,
      discount_factor=0.0,
      learning_rate=0.9,
    )
    qfunc._protected_SetValues(
      numpy.array([
        [1, 2, 3],
        [4, 5, 6],
      ]),
      numpy.array([
        [0.5, 0.6],
        [0.3, 0.7],
      ]))
    qfunc.UpdateFromTransitions([base.Transition(
      s=numpy.array([[1, 2, 3]]),
      a=numpy.array([[0, 1]]),
      r=1.0,
      sp=numpy.array([[2, 2, 2]]),
    )])

    # The new values for state (1,2,3) should be:
    # - action (1,0): 0.5, since it's not changed.
    # - action (0,1): (1-0.9) * 0.6 + 0.9 * 1.0 = 0.96.
    numpy_util.TestUtil.AssertArrayEqual(
      numpy.array([[0.5, 0.96]]),
      qfunc.GetValues(numpy.array([[1, 2, 3]])))
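A quick arithmetic check of the expected 0.96 above; with discount_factor=0.0 the Q* bootstrap term vanishes and only the learning-rate blend remains:

    # (1 - learning_rate) * old_value + learning_rate * r
    new_value = (1 - 0.9) * 0.6 + 0.9 * 1.0  # == 0.96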
Example #6
    def TakeAction(self, action: base.Action) -> base.Transition:
        current_state = self._current_state
        move = self.GetChoiceFromAction(action) - 1  # -1, 0, 1
        new_state = current_state + move

        r = None
        if move == 0:
            r = 0
        else:
            if move == 1 and current_state < 0:
                r = 1
            elif move == -1 and current_state > 0:
                r = 1
            else:
                r = -1

        if new_state > self._size:
            new_state = -self._size
        elif new_state < -self._size:
            new_state = self._size

        s = numpy.array([[current_state]])
        a = action
        if self._num_actions_taken >= STEP_LIMIT:
            sp = None
        else:
            sp = numpy.array([[new_state]])

        self._current_state = new_state
        self._num_actions_taken += 1

        return base.Transition(s, a, r, sp)
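The reward logic above pays +1 for moving toward zero, 0 for staying put, and -1 for moving away. A condensed, standalone restatement (the function name is illustrative, not part of the environment):

    def _reward_for_move(move: int, current_state: int) -> int:
        # Staying put earns 0; moving toward zero earns +1; moving away earns -1.
        if move == 0:
            return 0
        if (move == 1 and current_state < 0) or (move == -1 and current_state > 0):
            return 1
        return -1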
Example #7
    def test_saveLoad(self):
        a3c = a3c_impl.A3C(model=a3c_impl.CreateModel(
            state_shape=(3, ),
            action_space_size=2,
            hidden_layer_sizes=(3, ),
        ), )
        tmp_file = tempfile.NamedTemporaryFile().name
        s = numpy.array([[1, 2, 3]])
        for _ in range(10):
            a3c.UpdateFromTransitions([
                base.Transition(s=s,
                                a=numpy.array([[1, 0]]),
                                r=1.0,
                                sp=numpy.array([[4, 5, 6]])),
            ])
        a3c.Save(tmp_file)
        saved_values = a3c.GetValues(s)

        a3c = a3c_impl.A3C(model=a3c_impl.CreateModel(
            state_shape=(3, ),
            action_space_size=2,
            hidden_layer_sizes=(3, ),
        ), )
        a3c.Load(tmp_file)

        numpy_util.TestUtil.AssertArrayEqual(saved_values, a3c.GetValues(s))
Example #8
 def train_push(self, s, a, r, s_):
     return self._brain.UpdateFromTransitions([
         base.Transition(s=numpy.array([s]),
                         a=numpy.array([a]),
                         r=r,
                         sp=numpy.array([s_]))
     ])
Example #9
 def setUp(self) -> None:
     self.brain = mock.MagicMock()
     self.runner = runner_impl.NStepExperienceRunner(
         discount_factor=0.5,
         n_step_return=5,
     )
     self.tran = base.Transition(
         s=numpy.array([[0]]),
         a=numpy.array([[1]]),
         r=1.0,
         sp=numpy.array([[0]]),
     )
Example #10
    def test_memoryManagement(self):
        qfunc = qfunc_impl.RandomQFunction(action_space_size=2)
        runner = runner_impl.ExperienceReplayRunner(
            experience_capacity=1,
            experience_sample_batch_size=1,
            train_every_n_steps=1)

        tran1 = base.Transition(s=numpy.array([[1, 2]]),
                                a=numpy.array([[1, 0]]),
                                r=1,
                                sp=numpy.array([[3, 4]]))

        tran2 = base.Transition(s=numpy.array([[3, 4]]),
                                a=numpy.array([[0, 1]]),
                                r=1,
                                sp=numpy.array([[5, 6]]))

        runner._protected_ProcessTransition(qfunc, tran1, 0)
        runner._protected_ProcessTransition(qfunc, tran2, 1)

        hist = runner._experience._history
        self.assertEqual(1, len(hist))
        self.assertEqual(tran2, hist[0])
Example #11
    def testCalculateNStepReward_whenDone(self):
        for _ in range(4):
            self.runner._protected_ProcessTransition(self.brain, self.tran, 0)
        self.assertFalse(self.brain.UpdateFromTransitions.called)
        tran = base.Transition(
            s=numpy.array([[0]]),
            a=numpy.array([[1]]),
            r=1.0,
            sp=None,
        )
        self.runner._protected_ProcessTransition(self.brain, tran, 0)

        rewards = []
        for tran in self.brain.UpdateFromTransitions.call_args[0][0]:
            rewards.append(tran.r)
        self.assertCountEqual([
            1.0, 1.0 + 0.5, 1.0 + 0.5 + 0.5**2, 1.0 + 0.5 + 0.5**2 + 0.5**3,
            1.0 + 0.5 + 0.5**2 + 0.5**3 + 0.5**4
        ], rewards)
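The expected rewards in the assertion above are partial geometric sums with discount_factor=0.5; a quick standalone check (not library code):

    expected = [sum(0.5**k for k in range(n)) for n in range(1, 6)]
    # -> [1.0, 1.5, 1.75, 1.875, 1.9375]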
Example #12
    def _GetNStepTransition(self) -> base.Transition:
        # This implementation takes 3.542e-06 sec per call.
        R = 0.0
        next_discount_factor = 1.0
        for tran in self._memory:
            R += tran.r * next_discount_factor
            next_discount_factor *= self._gamma

        # The commented-out implementation below takes 7.322e-06 sec per call.
        # rewards = numpy.zeros(self._n_step_return)
        # for idx, tran in enumerate(self._memory):
        #   rewards[idx] = tran.r
        # R = numpy.sum(self._gamma_powers * rewards)

        return base.Transition(
            s=self._memory[0].s,
            a=self._memory[0].a,
            r=R,
            sp=self._memory[-1].sp,
        )
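A quick standalone equivalence check (gamma and the reward values here are illustrative assumptions) showing that the loop above computes the same discounted sum as the commented-out vectorized version:

    import numpy
    gamma = 0.5
    rewards = [1.0, 2.0, 3.0]
    loop_r = sum(r * gamma**i for i, r in enumerate(rewards))
    vec_r = float(numpy.sum(gamma**numpy.arange(len(rewards)) * numpy.array(rewards)))
    assert abs(loop_r - vec_r) < 1e-9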
Example #13
  def test_UpdateValues_environmentDone(self):
    self.qfunc._protected_SetValues(
      numpy.array([
        [1, 2, 3],
        [4, 5, 6],
      ]),
      numpy.array([
        [0.5, 0.5],
        [0.3, 0.7],
      ]))

    self.qfunc.UpdateFromTransitions([base.Transition(
      s=numpy.array([[1, 2, 3]]),
      a=numpy.array([[0, 1]]),
      r=1.0,
      sp=None,
    )])

    # The new values for state (1,2,3) should be:
    # - action (1,0): 0.5, since it's not changed.
    # - action (0,1): 1.0, since environment is done, only reward is used.
    numpy_util.TestUtil.AssertArrayEqual(
      numpy.array([[0.5, 1.0]]),
      self.qfunc.GetValues(numpy.array([[1, 2, 3]])))
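For comparison with Example #4: when sp is None the bootstrap term is dropped entirely, so the target reduces to the reward alone (a restatement of the comment above, not library code):

    target = 1.0  # r only; no gamma * max_a' Q(sp, a') term because sp is None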