def run_and_get_new_weights(init_weights, win0, win1):
    state = GameState(size=19)
    policy = CNNPolicy.load_model(os.path.join('test_data', 'minimodel.json'))
    policy.model.set_weights(init_weights)
    optimizer = BatchedReinforcementLearningSGD(lr=0.01, ng=2)
    policy.model.compile(loss=log_loss, optimizer=optimizer)

    # Make moves on the state and get trainable (state, action) pairs from them.
    moves = [(2, 2), (16, 16), (3, 17), (16, 2), (4, 10), (10, 3)]
    state_tensors = []
    action_tensors = []
    for m in moves:
        (st_tensor, mv_tensor) = _make_training_pair(state, m, policy.preprocessor)
        state_tensors.append(st_tensor)
        action_tensors.append(mv_tensor)
        state.do_move(m)

    for i, (s, a) in enumerate(zip(state_tensors, action_tensors)):
        # Put even state/action pairs in game 0, odd ones in game 1.
        game_idx = i % 2
        optimizer.set_current_game(game_idx)
        is_last_move = i + 2 >= len(moves)
        if is_last_move:
            if game_idx == 0:
                optimizer.set_result(game_idx, win0)
            else:
                optimizer.set_result(game_idx, win1)
        # train_on_batch accumulates gradients, and should only cause a change to
        # parameters on the first call after the final set_result() call.
        policy.model.train_on_batch(s, a)
    return policy.model.get_weights()
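# The helper _make_training_pair() is used throughout these tests but is not defined
# in this section. Below is a minimal sketch of one plausible implementation, assuming
# the preprocessor exposes state_to_tensor() (as used elsewhere in these tests) and
# that actions are one-hot vectors over the flattened board; the exact helper in the
# repository may differ.
def _make_training_pair(st, move, preprocessor):
    """Convert a (state, move) pair into (input tensor, one-hot action tensor)."""
    st_tensor = preprocessor.state_to_tensor(st)
    # One-hot encode the move over the flattened board, matching the (1, size * size)
    # output shape of the policy network.
    mv_tensor = np.zeros((1, st.size * st.size))
    mv_tensor[0, move[0] * st.size + move[1]] = 1
    return (st_tensor, mv_tensor)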
def test_positional_superko(self):
    move_list = [(0, 3), (0, 4), (1, 3), (1, 4), (2, 3), (2, 4), (2, 2), (3, 4),
                 (2, 1), (3, 3), (3, 1), (3, 2), (3, 0), (4, 2), (1, 1), (4, 1),
                 (8, 0), (4, 0), (8, 1), (0, 2), (8, 2), (0, 1), (8, 3), (1, 0),
                 (8, 4), (2, 0), (0, 0)]
    # Position just before the final black move at (0, 0):
    #   0 1 2 3 4 5 6 7 8
    # 0 . W W B W . . . .
    # 1 W B . B W . . . .
    # 2 W B B B W . . . .
    # 3 B B W W W . . . .
    # 4 W W W . . . . . .
    # 5 . . . . . . . . .
    # 6 . . . . . . . . .
    # 7 . . . . . . . . .
    # 8 B B B B B . . . .
    gs = GameState(size=9)
    for move in move_list:
        gs.do_move(move)
    self.assertTrue(gs.is_legal((1, 0)))

    gs = GameState(size=9, enforce_superko=True)
    for move in move_list:
        gs.do_move(move)
    self.assertFalse(gs.is_legal((1, 0)))
def test_output_size(self):
    policy19 = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"], board=19)
    output = policy19.forward(policy19.preprocessor.state_to_tensor(GameState(19)))
    self.assertEqual(output.shape, (1, 19 * 19))

    policy13 = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"], board=13)
    output = policy13.forward(policy13.preprocessor.state_to_tensor(GameState(13)))
    self.assertEqual(output.shape, (1, 13 * 13))
def test_probabilistic_player(self):
    gs = GameState()
    policy = CNNPolicy(["board", "ones", "turns_since"])
    player = ProbabilisticPolicyPlayer(policy)
    for i in range(20):
        move = player.get_move(gs)
        self.assertIsNotNone(move)
        gs.do_move(move)
def testApplyAndResetOnGamesFinished(self):
    policy = CNNPolicy.load_model(os.path.join('test_data', 'minimodel.json'))
    state = GameState(size=19)
    optimizer = BatchedReinforcementLearningSGD(lr=0.01, ng=2)
    policy.model.compile(loss=log_loss, optimizer=optimizer)

    # Helper to check initial conditions of the optimizer.
    def assertOptimizerInitialConditions():
        for v in optimizer.gradient_sign:
            self.assertEqual(K.eval(v), 0)
        self.assertEqual(K.eval(optimizer.running_games), 2)

    initial_parameters = policy.model.get_weights()

    def assertModelEffect(changed):
        any_change = False
        for cur, init in zip(policy.model.get_weights(), initial_parameters):
            if not np.allclose(init, cur):
                any_change = True
                break
        self.assertEqual(any_change, changed)

    assertOptimizerInitialConditions()

    # Make moves on the state and get trainable (state, action) pairs from them.
    state_tensors = []
    action_tensors = []
    moves = [(2, 2), (16, 16), (3, 17), (16, 2), (4, 10), (10, 3)]
    for m in moves:
        (st_tensor, mv_tensor) = _make_training_pair(state, m, policy.preprocessor)
        state_tensors.append(st_tensor)
        action_tensors.append(mv_tensor)
        state.do_move(m)

    for i, (s, a) in enumerate(zip(state_tensors, action_tensors)):
        # Even moves go in game 0, odd moves in game 1.
        game_idx = i % 2
        optimizer.set_current_game(game_idx)
        is_last_move = i + 2 >= len(moves)
        if is_last_move:
            # Mark game 0 as a win and game 1 as a loss.
            optimizer.set_result(game_idx, game_idx == 0)
        else:
            # Games not finished yet; assert no change to optimizer state.
            assertOptimizerInitialConditions()
        # train_on_batch accumulates gradients, and should only cause a change to
        # parameters on the first call after the final set_result() call.
        policy.model.train_on_batch(s, a)
        if i + 1 < len(moves):
            assertModelEffect(changed=False)
        else:
            assertModelEffect(changed=True)
    # Once both games have finished, the last call to train_on_batch() should have
    # reset the optimizer's accumulators back to their initial conditions.
    assertOptimizerInitialConditions()
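# A hedged sketch (not from the repository) of how run_and_get_new_weights(), defined
# above, might be used to check that the parameter update flips direction with the game
# result. The test name and the exact assertion are illustrative: with plain SGD and a
# per-game gradient sign, winning both games and losing both games should produce
# updates of equal magnitude and opposite direction.
def testGradientDirectionChangesWithWinner(self):
    policy = CNNPolicy.load_model(os.path.join('test_data', 'minimodel.json'))
    initial = policy.model.get_weights()
    # Train once with both games won, once with both games lost.
    weights_win = run_and_get_new_weights(initial, True, True)
    weights_lose = run_and_get_new_weights(initial, False, False)
    # The two updates should point in opposite directions relative to the initial weights.
    for init, w, l in zip(initial, weights_win, weights_lose):
        self.assertTrue(np.allclose(w - init, -(l - init)))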
def test_eye_recursion(self):
    # A checkerboard pattern of black is 'technically' all true eyes,
    # mutually supporting each other.
    gs = GameState(7)
    for x in range(gs.size):
        for y in range(gs.size):
            if (x + y) % 2 == 1:
                gs.do_move((x, y), go.BLACK)
    self.assertTrue(gs.is_eye((0, 0), go.BLACK))
def test_snapback_is_not_ko(self):
    gs = GameState(size=5)
    # B X W B .
    # W W B . .
    # . . . . .
    # . . . . .
    # . . . . .
    # Imagine black plays at 'X', capturing the white stone at (2, 0).
    # White may play again at (2, 0) to capture the black stones at
    # (0, 0) and (1, 0). This is a 'snapback', not 'ko', since it doesn't
    # return the game to a previous position.
    B = [(0, 0), (2, 1), (3, 0)]
    W = [(0, 1), (1, 1), (2, 0)]
    for (b, w) in zip(B, W):
        gs.do_move(b)
        gs.do_move(w)
    # Do the capture of the single white stone.
    gs.do_move((1, 0))
    # There should be no ko.
    self.assertIsNone(gs.ko)
    self.assertTrue(gs.is_legal((2, 0)))
    # Now play the snapback.
    gs.do_move((2, 0))
    # Check that the prisoner counts worked out.
    self.assertEqual(gs.num_black_prisoners, 2)
    self.assertEqual(gs.num_white_prisoners, 1)
def test_sensible_greedy(self):
    gs = GameState()
    policy = CNNPolicy(["board", "ones", "turns_since"])
    player = GreedyPolicyPlayer(policy)
    # Fill the board with black stones, leaving a single empty point at (10, 10).
    empty = (10, 10)
    for x in range(19):
        for y in range(19):
            if (x, y) != empty:
                gs.do_move((x, y), go.BLACK)
    gs.current_player = go.BLACK
    # The only open point is black's own eye, so no sensible move remains and
    # get_move should return None.
    self.assertIsNone(player.get_move(gs))
def test_copy_maintains_shared_sets(self):
    gs = GameState(7)
    gs.do_move((4, 4), go.BLACK)
    gs.do_move((4, 5), go.BLACK)
    # Assert that gs has *the same object* referenced by group/liberty sets.
    self.assertTrue(gs.group_sets[4][5] is gs.group_sets[4][4])
    self.assertTrue(gs.liberty_sets[4][5] is gs.liberty_sets[4][4])
    gs_copy = gs.copy()
    self.assertTrue(gs_copy.group_sets[4][5] is gs_copy.group_sets[4][4])
    self.assertTrue(gs_copy.liberty_sets[4][5] is gs_copy.liberty_sets[4][4])
def parse(boardstr):
    '''Parse a board string into a GameState, returning the locations of any moves
    marked with anything other than 'B', 'X', '#', 'W', 'O', or '.'.

    Rows are separated by '|'; spaces are ignored.
    '''
    boardstr = boardstr.replace(' ', '')
    board_size = max(boardstr.index('|'), boardstr.count('|'))
    st = GameState(size=board_size)
    moves = {}
    for row, rowstr in enumerate(boardstr.split('|')):
        for col, c in enumerate(rowstr):
            if c == '.':
                continue  # ignore empty spaces
            elif c in 'BX#':
                st.do_move((row, col), color=BLACK)
            elif c in 'WO':
                st.do_move((row, col), color=WHITE)
            else:
                # Any other character marks a move of interest.
                assert c not in moves, "{} already used as a move marker".format(c)
                moves[c] = (row, col)
    return st, moves
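# A hedged usage sketch of parse(): the 5x5 board string and the marker 'a' below are
# illustrative, not taken from the repository's tests. Rows are separated by '|' and
# spaces are ignored, so marked points come back as (row, col) tuples while 'B'/'W'
# characters are placed as stones.
def test_parse_example(self):
    st, moves = parse(". B . . ."
                      "|. a . . ."
                      "|. W . . ."
                      "|. . . . ."
                      "|. . . . .")
    # 'a' is returned as a move location rather than being placed on the board.
    self.assertEqual(moves['a'], (1, 1))
    self.assertEqual(st.board[0][1], BLACK)
    self.assertEqual(st.board[2][1], WHITE)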
def test_liberties_after_capture(self):
    # Create a 3x3 black group in the middle that is then entirely captured,
    # then assert that the resulting liberties after the capture are the same
    # as if the group had never been there.
    gs_capture = GameState(7)
    gs_reference = GameState(7)
    # Add in the 3x3 black stones.
    for x in range(2, 5):
        for y in range(2, 5):
            gs_capture.do_move((x, y), go.BLACK)
    # Surround the black group with white stones,
    # and set the same white stones in gs_reference.
    for x in range(2, 5):
        gs_capture.do_move((x, 1), go.WHITE)
        gs_capture.do_move((x, 5), go.WHITE)
        gs_reference.do_move((x, 1), go.WHITE)
        gs_reference.do_move((x, 5), go.WHITE)
    gs_capture.do_move((1, 1), go.WHITE)
    gs_reference.do_move((1, 1), go.WHITE)
    for y in range(2, 5):
        gs_capture.do_move((1, y), go.WHITE)
        gs_capture.do_move((5, y), go.WHITE)
        gs_reference.do_move((1, y), go.WHITE)
        gs_reference.do_move((5, y), go.WHITE)
    # Board configuration and liberties of gs_capture and gs_reference should
    # now be identical.
    self.assertTrue(np.all(gs_reference.board == gs_capture.board))
    self.assertTrue(np.all(gs_reference.liberty_counts == gs_capture.liberty_counts))
def test_true_eye(self):
    gs = GameState(size=7)
    gs.do_move((1, 0), go.BLACK)
    gs.do_move((0, 1), go.BLACK)
    # (0, 0) is surrounded by black, but the diagonal at (1, 1) is still open,
    # so it is only a false eye at this point.
    self.assertTrue(gs.is_eyeish((0, 0), go.BLACK))
    self.assertFalse(gs.is_eye((0, 0), go.BLACK))
    # Make it a true eye by turning the corner at (1, 1) into an eye itself.
    gs.do_move((1, 2), go.BLACK)
    gs.do_move((2, 1), go.BLACK)
    gs.do_move((2, 2), go.BLACK)
    gs.do_move((0, 2), go.BLACK)
    self.assertTrue(gs.is_eyeish((0, 0), go.BLACK))
    self.assertTrue(gs.is_eye((0, 0), go.BLACK))
    self.assertTrue(gs.is_eye((1, 1), go.BLACK))
def test_standard_ko(self):
    # Position before white plays at 'X' = (1, 1), capturing the black stone at (2, 1):
    # . B . .
    # B X B .
    # W B W .
    # . W . .
    gs = GameState(size=9)
    gs.do_move((1, 0))  # B
    gs.do_move((2, 0))  # W
    gs.do_move((2, 1))  # B
    gs.do_move((3, 1))  # W
    gs.do_move((1, 2))  # B
    gs.do_move((2, 2))  # W
    gs.do_move((0, 1))  # B
    gs.do_move((1, 1))  # W: triggers the capture and the ko
    self.assertEqual(gs.num_black_prisoners, 1)
    self.assertEqual(gs.num_white_prisoners, 0)
    # Retaking the ko immediately is illegal.
    self.assertFalse(gs.is_legal((2, 1)))
    # After an exchange elsewhere, the ko may be retaken.
    gs.do_move((5, 5))
    gs.do_move((5, 6))
    self.assertTrue(gs.is_legal((2, 1)))
def test_batch_eval_state(self):
    policy = ResnetPolicy(["board", "liberties", "sensibleness", "capture_size"])
    results = policy.batch_eval_state([GameState(), GameState()])
    self.assertEqual(len(results), 2)       # one result per GameState
    self.assertEqual(len(results[0]), 361)  # each one has 361 (move, prob) pairs
def test_default_policy(self):
    policy = ResnetPolicy(["board", "liberties", "sensibleness", "capture_size"])
    policy.eval_state(GameState())
def setUp(self):
    self.gs = GameState()
    self.node = TreeNode(None, 1.0)
class TestMCTS(unittest.TestCase):

    def setUp(self):
        self.gs = GameState()
        self.mcts = MCTS(dummy_value, dummy_policy, dummy_rollout, n_playout=2)

    def _count_expansions(self):
        """Helper function to count the number of expansions past the root using
        the dummy policy.
        """
        node = self.mcts._root
        expansions = 0
        # Loop over actions in decreasing probability.
        for action, _ in sorted(dummy_policy(self.gs), key=lambda (a, p): p, reverse=True):
            if action in node._children:
                expansions += 1
                node = node._children[action]
            else:
                break
        return expansions

    def test_playout(self):
        self.mcts._playout(self.gs.copy(), 8)
        # Assert that the most likely child was visited (according to the dummy
        # policy below).
        self.assertEqual(1, self.mcts._root._children[(18, 18)]._n_visits)
        # Assert that the search depth expanded nodes 8 times.
        self.assertEqual(8, self._count_expansions())

    def test_playout_with_pass(self):
        # Test that playout handles the end of the game (i.e. passing/no moves).
        # Mock this by creating a policy that returns nothing after 4 moves.
        def stop_early_policy(state):
            if len(state.history) <= 4:
                return dummy_policy(state)
            else:
                return []
        self.mcts = MCTS(dummy_value, stop_early_policy, stop_early_policy, n_playout=2)
        self.mcts._playout(self.gs.copy(), 8)
        # Assert that (18, 18) is still only visited once.
        self.assertEqual(1, self.mcts._root._children[(18, 18)]._n_visits)
        # Assert that no expansions happened after reaching the "end" in 4 moves.
        self.assertEqual(5, self._count_expansions())

    def test_get_move(self):
        move = self.mcts.get_move(self.gs)
        self.mcts.update_with_move(move)
        # Success if no errors were raised.

    def test_update_with_move(self):
        move = self.mcts.get_move(self.gs)
        self.gs.do_move(move)
        self.mcts.update_with_move(move)
        # Assert that the new root still has children.
        self.assertTrue(len(self.mcts._root._children) > 0)
        # Assert that the new root has no parent (the rest of the tree will be
        # garbage collected).
        self.assertIsNone(self.mcts._root._parent)
        # Assert that the next best move according to the root is (18, 17),
        # according to the dummy policy below.
        self.assertEqual((18, 17), self.mcts._root.select()[0])
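# The fixtures dummy_value, dummy_policy, and dummy_rollout referenced by TestMCTS are
# not shown in this section. A plausible sketch is given below: the policy assigns a
# fixed, decreasing probability over legal moves so that (18, 18) is always the most
# likely move and (18, 17) the second most likely, consistent with the assertions
# above. The exact fixtures in the repository may differ.
def dummy_policy(state):
    # Probability increases toward the (18, 18) corner of the board.
    moves = state.get_legal_moves()
    weights = [m[0] * 19 + m[1] + 1 for m in moves]
    total = float(sum(weights))
    return zip(moves, [w / total for w in weights])

# Rollouts reuse the same fixed distribution, and the value function is a constant.
dummy_rollout = dummy_policy

def dummy_value(state):
    return 0.0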
class TestLiberties(unittest.TestCase):

    def setUp(self):
        #   0 1 2 3 4 5 6 7 8 9 A B
        # 0 . . . . . . . . . . . .
        # 1 . . . . . . . . . . . .
        # 2 . . . . . . . . . . . .
        # 3 . . . . . . . . . . . .
        # 4 . . . . . B B . . . . .
        # 5 . . . . . W B . . . . .
        # 6 . . . . . . B . . . . .
        # 7 . . . . . . . . . . . .
        # 8 . . . . . . . . . . . .
        # 9 . . . . . . . . . . W .
        # A . . . . . . . . . . W W
        # B . . . . . . . . . . . .
        self.s = GameState()
        self.s.do_move((4, 5))
        self.s.do_move((5, 5))
        self.s.do_move((5, 6))
        self.s.do_move((10, 10))
        self.s.do_move((4, 6))
        self.s.do_move((10, 11))
        self.s.do_move((6, 6))
        self.s.do_move((9, 10))

    def test_curr_liberties(self):
        self.assertEqual(self.s.liberty_counts[5][5], 2)
        self.assertEqual(self.s.liberty_counts[4][5], 8)
        self.assertEqual(self.s.liberty_counts[5][6], 8)

    def test_neighbors_edge_cases(self):
        st = GameState()
        st.do_move((0, 0))  # B B . . . . .
        st.do_move((5, 5))  # B W . . . . .
        st.do_move((0, 1))  # . . . . . . .
        st.do_move((6, 6))  # . . . . . . .
        st.do_move((1, 0))  # . . . . . W .
        st.do_move((1, 1))  # . . . . . . W

        # get_group in the corner.
        self.assertEqual(len(st.get_group((0, 0))), 3, "group size in corner")
        # get_group of an empty space.
        self.assertEqual(len(st.get_group((4, 4))), 0, "group size of empty space")
        # get_group of a single piece.
        self.assertEqual(len(st.get_group((5, 5))), 1, "group size of single piece")
def play_batch(player_RL, player_SL, batch_size, features):
    """Play a batch of games in parallel and return one training pair from each game."""

    def do_move(states, moves):
        for st, mv in zip(states, moves):
            if not st.is_end_of_game:
                # Only play more moves if the game is not already over.
                st.do_move(mv)
        return states

    def do_rand_move(states, player, player_RL):
        """Do a uniform-random move over legal moves and record info for training.
        Only gets called once per game.
        """
        colors = [st.current_player for st in states]  # Record player color
        legal_moves = [st.get_legal_moves() for st in states]
        rand_moves = [lm[np.random.choice(len(lm))] for lm in legal_moves]
        states = do_move(states, rand_moves)
        player = player_RL
        X_list = [st.copy() for st in states]  # For later 1-hot preprocessing
        return X_list, colors, states, player

    def convert(X_list, preprocessor):
        """Convert states to 1-hot tensors and concatenate. X's are GameState objects."""
        states = np.concatenate(
            [preprocessor.state_to_tensor(X) for X in X_list], axis=0)
        return states

    # Lists of game training pairs (1-hot)
    preprocessor = Preprocess(features)
    player = player_SL
    states = [GameState() for i in xrange(batch_size)]
    # Randomly choose the turn on which to play a uniform-random move. Moves before it
    # come from the SL policy; moves after it come from the RL policy.
    i_rand_move = np.random.choice(range(450))
    X_list = None
    winners = None
    turn = 0
    while True:
        # Do moves (black)
        if turn == i_rand_move:
            # Make a random move, then switch from the SL to the RL policy.
            X_list, colors, states, player = do_rand_move(states, player, player_RL)
        else:
            # Get moves (batch)
            moves_black = player.get_moves(states)
            # Do moves (black)
            states = do_move(states, moves_black)
        turn += 1
        # Do moves (white)
        if turn == i_rand_move:
            # Make a random move, then switch from the SL to the RL policy.
            X_list, colors, states, player = do_rand_move(states, player, player_RL)
        else:
            moves_white = player.get_moves(states)
            states = do_move(states, moves_white)
        turn += 1
        # If all games have ended (or run past 500 turns), we're done.
        done = [st.is_end_of_game or st.turns_played > 500 for st in states]
        print turn
        if all(done):
            break
    # Concatenate training examples and collect winners.
    X = None
    if X_list is not None:
        X = convert(X_list, preprocessor)
    winners = np.array([st.get_winner() for st in states]).reshape(batch_size, 1)
    return X, winners
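# A hedged usage sketch of play_batch(). The player construction and feature list below
# are illustrative; in the full pipeline the SL and RL policies would be separately
# trained models rather than two fresh networks.
features = ["board", "ones", "turns_since"]
player_SL = ProbabilisticPolicyPlayer(CNNPolicy(features))
player_RL = ProbabilisticPolicyPlayer(CNNPolicy(features))
X, winners = play_batch(player_RL, player_SL, batch_size=8, features=features)
if X is not None:
    # X holds one preprocessed position per game; winners holds each game's final
    # result, which can serve as the training target for a value network.
    print X.shape, winners.shape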
def test_simple_eye(self):
    # Create a black eye in the top left at (1, 1) and a white eye in the
    # bottom right at (5, 5).
    gs = GameState(size=7)
    gs.do_move((1, 0))  # B
    gs.do_move((5, 4))  # W
    gs.do_move((2, 1))  # B
    gs.do_move((6, 5))  # W
    gs.do_move((1, 2))  # B
    gs.do_move((5, 6))  # W
    gs.do_move((0, 1))  # B
    gs.do_move((4, 5))  # W

    # Test the black eye in the top left.
    self.assertTrue(gs.is_eyeish((1, 1), go.BLACK))
    self.assertFalse(gs.is_eyeish((1, 1), go.WHITE))
    # Test the white eye in the bottom right.
    self.assertTrue(gs.is_eyeish((5, 5), go.WHITE))
    self.assertFalse(gs.is_eyeish((5, 5), go.BLACK))
    # Test that there is no eye at other positions.
    self.assertFalse(gs.is_eyeish((1, 0), go.BLACK))
    self.assertFalse(gs.is_eyeish((1, 0), go.WHITE))
    self.assertFalse(gs.is_eyeish((2, 2), go.BLACK))
    self.assertFalse(gs.is_eyeish((2, 2), go.WHITE))
def run_n_games(optimizer, learner, opponent, num_games):
    '''Run num_games games to completion, calling train_on_batch() on each position
    the learner sees.

    (Note: the optimizer only accumulates gradients in its update function until all
    games have finished.)
    '''
    board_size = learner.policy.model.input_shape[-1]
    states = [GameState(size=board_size) for _ in range(num_games)]
    learner_net = learner.policy.model

    # Start all odd games with moves by 'opponent'. Even games will have 'learner' as black.
    learner_color = [go.BLACK if i % 2 == 0 else go.WHITE for i in range(num_games)]
    odd_states = states[1::2]
    moves = opponent.get_moves(odd_states)
    for st, mv in zip(odd_states, moves):
        st.do_move(mv)

    current = learner
    other = opponent
    # Keep track of the indices of unfinished states so that we can tell the optimizer
    # which game is being updated.
    idxs_to_unfinished_states = {i: states[i] for i in range(num_games)}
    while len(idxs_to_unfinished_states) > 0:
        # Get the next moves by the current player for all unfinished states.
        moves = current.get_moves(idxs_to_unfinished_states.values())
        just_finished = []
        # Apply each move to its state, in order.
        for (idx, state), mv in zip(idxs_to_unfinished_states.iteritems(), moves):
            # Order is important here. We must first get the training pair from the
            # unmodified state. Next, the state is updated and checked to see if the
            # game is over. If it is over, the optimizer is notified via set_result().
            # Finally, train_on_batch() is called, which triggers an update of all
            # parameters only if set_result() has already been called for every game
            # (so set_result must come before train_on_batch).
            is_learnable = current is learner and mv is not go.PASS_MOVE
            if is_learnable:
                (X, y) = _make_training_pair(state, mv, learner.policy.preprocessor)
            state.do_move(mv)
            if state.is_end_of_game:
                learner_is_winner = state.get_winner() == learner_color[idx]
                optimizer.set_result(idx, learner_is_winner)
                just_finished.append(idx)
            if is_learnable:
                optimizer.set_current_game(idx)
                learner_net.train_on_batch(X, y)
        # Remove games that have finished from the dict.
        for idx in just_finished:
            del idxs_to_unfinished_states[idx]
        # Swap 'current' and 'other' for the next turn.
        current, other = other, current
    # Return the win ratio.
    wins = sum(state.get_winner() == pc for (state, pc) in zip(states, learner_color))
    return float(wins) / num_games
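# A hedged sketch of how run_n_games() might be driven from a training loop. The
# test-data model, player classes, learning rate, and game counts are illustrative;
# the actual training script in the repository wires these up differently (e.g. it
# periodically refreshes the opponent from past learner checkpoints).
policy = CNNPolicy.load_model(os.path.join('test_data', 'minimodel.json'))
opponent_policy = CNNPolicy.load_model(os.path.join('test_data', 'minimodel.json'))
learner = ProbabilisticPolicyPlayer(policy)
opponent = ProbabilisticPolicyPlayer(opponent_policy)

games_per_batch = 2
optimizer = BatchedReinforcementLearningSGD(lr=0.01, ng=games_per_batch)
policy.model.compile(loss=log_loss, optimizer=optimizer)

for iteration in range(10):
    # Each call plays a batch of games and applies one accumulated policy-gradient update.
    win_ratio = run_n_games(optimizer, learner, opponent, games_per_batch)
    print win_ratio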