Example #1

import unittest

import numpy as np
import tensorflow as tf

# TicTacToeEnv, TicTacToeInitializer, Network, ReplayBuffer and
# MuZeroCollectionPolicy are assumed to be importable from the project's own
# modules; their import lines are not part of the original snippet.

class TicTacToeEnvTest(unittest.TestCase):
    def setUp(self):
        self.env = TicTacToeEnv()

    def test_check(self):
        self.assertEqual((False, 0.), self.env.check([0] * 8 + [1]))
        self.assertEqual((True, -1.),
                         self.env.check([4, 4, 4, 0, 1, 1, 0, 0, 0]))
        self.assertEqual((True, 1.),
                         self.env.check([4, 4, 0, 1, 1, 1, 0, 0, 0]))
        self.assertEqual((False, 0.),
                         self.env.check([4, 4, 1, 1, 4, 1, 1, 0, 0]))
        self.assertEqual((True, 0.),
                         self.env.check([4, 1, 4, 1, 4, 1, 1, 4, 1]))

    def test_legal_actions(self):
        states = [0] * 9
        states[3] = 1
        states[7] = 4
        states[8] = 1
        self.assertEqual([0, 1, 2, 4, 5, 6], self.env.legal_actions(states))

    def test_opponent_play(self):
        # Chooses the first available space.
        self.assertEqual(0, self.env.opponent_play([0] * 8 + [1]))
        self.assertEqual(8, self.env.opponent_play([1] * 8 + [0]))

    def test_opponent_play_random(self):
        self.env = TicTacToeEnv(r_seed=0, use_random=True)
        s = set()
        for _ in range(100):
            s.add(self.env.opponent_play([0, 1, 4, 0, 0, 0, 0, 0, 0]))
        # Sort for a deterministic comparison; set iteration order is not
        # guaranteed.
        self.assertEqual([0] + list(range(3, 9)), sorted(s))

    def test_step(self):
        self.env.set_states([4, 4, 0, 0, 1, 1, 0, 0, 0])
        states, is_final, reward = self.env.step(3)
        self.assertEqual([4, 4, 0, 1, 1, 1, 0, 0, 0], states)
        self.assertTrue(is_final)
        self.assertEqual(1., reward)
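
The tests above pin down the board conventions: a board is a flat list of nine
cells where 0 is empty, 1 is the agent's mark and 4 is the opponent's mark, and
check() reports (is_final, reward). The sketch below is a minimal illustration
of those conventions, not the project's actual TicTacToeEnv code; the *_sketch
names are hypothetical.

_LINES = [(0, 1, 2), (3, 4, 5), (6, 7, 8),   # rows
          (0, 3, 6), (1, 4, 7), (2, 5, 8),   # columns
          (0, 4, 8), (2, 4, 6)]              # diagonals


def check_sketch(states):
    """Hypothetical check(): (is_final, reward) for a board, as tested above."""
    for a, b, c in _LINES:
        if states[a] == states[b] == states[c] == 1:
            return True, 1.    # agent completed a line
        if states[a] == states[b] == states[c] == 4:
            return True, -1.   # opponent completed a line
    if 0 not in states:
        return True, 0.        # full board with no winner: draw
    return False, 0.           # game still in progress


def legal_actions_sketch(states):
    """Hypothetical legal_actions(): indices of the empty cells."""
    return [i for i, cell in enumerate(states) if cell == 0]


def opponent_play_sketch(states, use_random=False, rng=None):
    """Hypothetical opponent_play(): first empty cell, or a random empty cell."""
    empties = legal_actions_sketch(states)
    if use_random:
        return rng.choice(empties)
    return empties[0]
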
class MuZeroCollectionPolicyTicTacToeTest(unittest.TestCase):
    def setUp(self):
        # Make tests reproducible.
        np.random.seed(0)
        tf.random.set_seed(0)

        self.initialize(False, 0)

    def initialize(self, use_random, r_seed):
        self.env = TicTacToeEnv(use_random=use_random, r_seed=r_seed)
        self.network_initializer = TicTacToeInitializer()
        self.network = Network(self.network_initializer)
        self.replay_buffer = ReplayBuffer()
        self.rng = np.random.RandomState(0)
        self.policy = MuZeroCollectionPolicy(self.env,
                                             self.network,
                                             self.replay_buffer,
                                             num_simulations=100,
                                             discount=1.,
                                             rng=self.rng)

    def test_action_start(self):
        action = self.policy.action()
        # All corners are optimal first actions.
        # TODO: the policy should prefer a corner opening; re-enable this
        # assertion once it does.
        # self.assertIn(action, [0, 2, 6, 8])
        self.assertEqual(1, action)

    def test_action_win(self):
        self.env.set_states([1, 0, 1, 1, 0, 4, 4, 4, 0])
        action = self.policy.action()
        # TODO: fix this to be 1.
        # self.assertEqual(1, action)
        # self.assertEqual(5, action)

    def test_action_win_2(self):
        self.env.set_states([1, 1, 4, 0, 0, 4, 1, 4, 0])
        action = self.policy.action()
        # TODO: fix this to be 3.
        # self.assertEqual(3, action)
        # self.assertEqual(4, action)

    def test_policy_logits(self):
        # TODO: fix this to provide correct logits.
        logits = self.policy.get_policy_logits()
        # tf.assert_equal(tf.constant([0.14, 0.09, 0.13, 0.09, 0.13, 0.11, 0.09, 0.11, 0.11],
        #                             dtype=tf.float64), logits)

    def test_choose_action(self):
        self.assertEqual(
            1,
            self.policy.choose_action(
                tf.constant([
                    0.11, 0.116, 0.11, 0.11, 0.11, 0.111, 0.111, 0.111, 0.111
                ])))

    def test_game_deterministic(self):
        while True:
            action = self.policy.action()
            states_isfinal_reward = self.env.step(action)
            states, is_final, reward = states_isfinal_reward
            if is_final:
                break
        # TODO: fix this to win.
        self.assertEqual(-1.0, reward)

    def test_run_self_play(self):
        self.policy.run_self_play()
        self.assertEqual(1, len(self.replay_buffer.buffer))
        traj = self.replay_buffer.buffer[0]
        self.assertEqual([1, 0], traj.action_history)
        self.assertEqual([0., -1.], traj.rewards)

    def play_game_once(self, r_seed):
        self.initialize(True, r_seed)
        while True:
            action = self.policy.action()
            states_isfinal_reward = self.env.step(action)
            states, is_final, reward = states_isfinal_reward
            if is_final:
                return states, is_final, reward
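
test_run_self_play above only relies on the replay buffer exposing a buffer
list of per-game trajectories, each carrying an action_history and a rewards
list. A minimal sketch of that shape (assumed, not the project's actual
ReplayBuffer/Trajectory classes; the save() name is hypothetical):

import dataclasses


@dataclasses.dataclass
class TrajectorySketch:
    action_history: list   # actions the agent took, in order
    rewards: list          # reward observed after each action


class ReplayBufferSketch:
    def __init__(self):
        self.buffer = []    # one entry per completed self-play game

    def save(self, trajectory):
        self.buffer.append(trajectory)
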
Example #3

import collections
import unittest

import tensorflow as tf

# TicTacToeEnv, BasicMctsModel and MctsPolicy are assumed to be importable
# from the project's own modules; their import lines are not part of the
# original snippet.

class MctsPolicyTicTacToeTest(unittest.TestCase):
    def setUp(self):
        self.env = TicTacToeEnv()
        self.model = BasicMctsModel(self.env)
        self.policy = MctsPolicy(self.env, self.model, num_simulations=100)

    def test_action_start(self):
        action = self.policy.action()
        states_isfinal_reward = self.env.step(action)
        self.assertEqual(0, action)
        self.assertEqual(([1, 4, 0, 0, 0, 0, 0, 0, 0], False, 0.0),
                         states_isfinal_reward)

    def test_action_win(self):
        self.env.set_states([1, 0, 1, 1, 0, 4, 4, 4, 0])
        action = self.policy.action()
        states_isfinal_reward = self.env.step(action)
        self.assertEqual(1, action)
        self.assertEqual(([1, 1, 1, 1, 0, 4, 4, 4, 0], True, 1.0),
                         states_isfinal_reward)

    def test_action_win_2(self):
        self.env.set_states([1, 1, 4, 0, 0, 4, 1, 4, 0])
        action = self.policy.action()
        states_isfinal_reward = self.env.step(action)
        self.assertEqual(3, action)
        self.assertEqual(([1, 1, 4, 1, 0, 4, 1, 4, 0], True, 1.0),
                         states_isfinal_reward)

    def test_policy_logits(self):
        logits = self.policy.get_policy_logits()
        tf.assert_equal(
            tf.constant([0.14, 0.09, 0.13, 0.09, 0.13, 0.11, 0.09, 0.11, 0.11],
                        dtype=tf.float64), logits)

    def test_choose_action(self):
        self.assertEqual(
            1,
            self.policy.choose_action(
                tf.constant([
                    0.11, 0.116, 0.11, 0.11, 0.11, 0.111, 0.111, 0.111, 0.111
                ])))

    def test_game_deterministic(self):
        while True:
            action = self.policy.action()
            states_isfinal_reward = self.env.step(action)
            states, is_final, reward = states_isfinal_reward
            if is_final:
                break
        self.assertEqual(1.0, reward)

    def play_game_once(self, r_seed):
        self.env = TicTacToeEnv(use_random=True, r_seed=r_seed)
        self.model = BasicMctsModel(self.env, r_seed=r_seed)
        self.policy = MctsPolicy(self.env,
                                 self.model,
                                 num_simulations=100,
                                 r_seed=r_seed)
        while True:
            action = self.policy.action()
            states_isfinal_reward = self.env.step(action)
            states, is_final, reward = states_isfinal_reward
            if is_final:
                return states, is_final, reward

    def test_game_random(self):
        reward_dict = collections.defaultdict(int)
        for r_seed in range(100):
            _, _, reward = self.play_game_once(r_seed)
            reward_dict[reward] += 1
        print('reward distribution: ', reward_dict)
        # 96% winning ratio.
        self.assertEqual({1.0: 96, 0.0: 1, -1.0: 3}, reward_dict)
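
In test_policy_logits the nine values sum to 1.0 and num_simulations is 100, so
the "logits" behave like root visit counts normalized into probabilities, and
test_choose_action simply expects the index of the largest value. A hedged
sketch of that relationship (assumed behavior, not the actual MctsPolicy
implementation):

import numpy as np
import tensorflow as tf


def policy_logits_from_visits_sketch(visit_counts):
    """Turn root visit counts (e.g. from 100 simulations) into probabilities."""
    visits = np.asarray(visit_counts, dtype=np.float64)
    return tf.constant(visits / visits.sum())


def choose_action_sketch(logits):
    """Greedy selection: the action with the highest probability."""
    return int(tf.argmax(logits).numpy())


# With 100 simulations, visit counts [14, 9, 13, 9, 13, 11, 9, 11, 11] yield
# exactly the probabilities compared in test_policy_logits.
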
Example #4

# Note: this snippet picks up inside an outer training loop over `train_iter`.
# col_policy, eval_policy, PLAY_ITERATIONS, NUM_TRAIN_STEPS and
# NUM_UNROLL_STEPS are defined earlier in the full script, which also imports
# `time` and `tensorflow as tf`.
    tf.summary.experimental.set_step(train_iter)
    for play_iter in range(PLAY_ITERATIONS):
        print('STARTING PLAY ITERATION #{}'.format(play_iter))
        start_time = time.time()
        col_policy.run_self_play()
        end_time = time.time()
        print('Self Play Iteration Runtime: {}'.format(end_time - start_time))
    eval_policy.train(NUM_TRAIN_STEPS, NUM_UNROLL_STEPS)

# TODO: pacman, save weights, tensorboard

idx = 0
total_reward = 0
# Reset the env for a game.
env = TicTacToeEnv()
env.render()
while True:
    start_time = time.time()
    print('Starting action calculation')
    action = eval_policy.action()
    states, is_final, reward = env.step(action)
    total_reward += reward
    end_time = time.time()
    print('Action at iter %s: %s\nReward: %s\n'
          'TotalReward: %s\nCalc time: %s\n\n' %
          (idx, action, reward, total_reward, end_time - start_time))
    env.render()
    if is_final:
        print("Hit is_final!")
        break
    idx += 1
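
The TODO above mentions saving weights and TensorBoard logging. A minimal
sketch of how that could be wired up after the evaluation game, assuming the
TF2 summary API already used above and a Keras-style save_weights() on the
underlying network (hypothetical; adapt to the project's actual Network API):

# Log the evaluation result to TensorBoard (reuses the outer-loop step counter).
writer = tf.summary.create_file_writer('logs/muzero_tictactoe')
with writer.as_default():
    tf.summary.scalar('eval/total_reward', total_reward, step=train_iter)
    tf.summary.scalar('eval/game_length', idx + 1, step=train_iter)

# Hypothetical checkpointing, assuming the network exposes save_weights():
# network.save_weights('checkpoints/network_{:04d}.ckpt'.format(train_iter))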