def self_play(self, first_color, libtorch, show):
    """Play one self-play game and return its labelled training samples.

    Two MCTS searchers are built from ``libtorch`` and take turns on a fresh
    ``Gomoku`` board, the first searcher moving first. Before each move the
    searcher's action probabilities are recorded (together with every
    symmetry returned by ``self.get_symmetries``) as a training sample.
    Dirichlet noise is then mixed into the probabilities and the move is
    sampled from the result. The loop stops as soon as
    ``gomoku.get_game_status()`` reports the game as ended.

    Args:
        first_color: forwarded to ``Gomoku`` as its third constructor argument.
        libtorch: forwarded as the first argument of both ``MCTS`` searchers.
        show: when truthy, the GUI is reset at the start and each move is
            mirrored to ``self.gomoku_gui``.

    Returns:
        A list of tuples ``(board, last_action, cur_player, prob, value)``,
        where ``value`` is ``cur_player * winner`` for the finished game.
    """
    samples = []

    # Both searchers are configured identically.
    mcts_args = (libtorch, self.num_mcts_threads, self.c_puct,
                 self.num_mcts_sims, self.c_virtual_loss, self.action_size)
    player1 = MCTS(*mcts_args)
    player2 = MCTS(*mcts_args)
    searcher_for = {1: player1, -1: player2}
    side = 1  # +1 -> player1 to move, -1 -> player2 to move

    gomoku = Gomoku(self.n, self.n_in_row, first_color)
    if show:
        self.gomoku_gui.reset_status()

    move_count = 0
    while True:
        move_count += 1
        searcher = searcher_for[side]

        # The first ``num_explore`` moves use ``self.temp``; later moves use 0.
        temperature = self.temp if move_count <= self.num_explore else 0
        prob = np.array(list(searcher.get_action_probs(gomoku, temperature)))

        # Record the noise-free policy for every symmetry of the position.
        board = tuple_2d_to_numpy_2d(gomoku.get_board())
        last_action = gomoku.get_last_move()
        cur_player = gomoku.get_current_color()
        samples.extend(
            [sym_board, last_action, cur_player, sym_prob]
            for sym_board, sym_prob in self.get_symmetries(board, prob)
        )

        # Mix 20% Dirichlet noise into the legal entries, then renormalise.
        legal_moves = list(gomoku.get_legal_moves())
        legal_count = np.count_nonzero(legal_moves)
        noise = 0.2 * np.random.dirichlet(
            self.dirichlet_alpha * np.ones(legal_count))
        prob = 0.8 * prob
        legal_indices = [i for i in range(len(prob)) if legal_moves[i] == 1]
        for slot, idx in enumerate(legal_indices):
            prob[idx] += noise[slot]
        prob /= np.sum(prob)

        # Sample the move and apply it to the board and to both search trees.
        action = np.random.choice(len(prob), p=prob)
        if show:
            self.gomoku_gui.execute_move(cur_player, action)
        gomoku.execute_move(action)
        player1.update_with_move(action)
        player2.update_with_move(action)

        # Hand the turn to the other searcher.
        side = -side

        ended, winner = gomoku.get_game_status()
        if ended == 1:
            # Label each sample with the outcome from the mover's side:
            # (board, last_action, cur_player, prob, value)
            return [
                (state, prev_move, color, policy, color * winner)
                for state, prev_move, color, policy in samples
            ]
# Replay a fixed sequence of moves to bring the board to a known position.
for _move in (3, 4, 6, 23, 8, 9, 78, 0, 17, 7, 19, 67, 60, 14, 11, 2, 99,
              10, 1, 5, 18, 12, 15, 24, 16):
    gomoku.execute_move(_move)

# Single-element batch: (board array, last move, color to move).
feature_batch = [
    (
        tuple_2d_to_numpy_2d(gomoku.get_board()),
        gomoku.get_last_move(),
        gomoku.get_current_color(),
    )
]
print('feature', feature_batch)
print(nn.infer(feature_batch))