def play_minimax_games(net, game_count, mcts_sim_count, network_color):
    """
    plays a number of games against an optimal minimax player,
    the network and the mcts are used to predict the moves of the agent
    :param net: the network
    :param game_count: the number of games to play
    :param mcts_sim_count: the number of monte carlo simulations
    :param network_color: the color of the network
    :return: the score of the network vs the minimax player
    """
    mcts_list = [mcts.MCTS(tic_tac_toe.TicTacToeBoard()) for _ in range(game_count)]

    player = CONST.WHITE
    all_terminated = False
    while not all_terminated:
        # make a move with the az agent
        if player == network_color:
            # run all mcts simulations
            mcts.run_simulations(mcts_list, mcts_sim_count, net, 0)

            # play the best move suggested by the mcts policy
            for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
                # skip terminated games
                if mcts_ctx.board.is_terminal():
                    continue

                policy = mcts_list[i_mcts_ctx].policy_from_state(mcts_ctx.board.state_id(), 0)
                move = np.where(policy == 1)[0][0]
                mcts_ctx.board.execute_action(move)

        # make an optimal minimax move
        else:
            for mcts_ctx in mcts_list:
                # skip terminated games
                if mcts_ctx.board.is_terminal():
                    continue

                move = mcts_ctx.board.minimax_move()
                mcts_ctx.board.execute_action(move)

        # swap the player
        player = CONST.WHITE if player == CONST.BLACK else CONST.BLACK

        # check if all games are terminated
        all_terminated = True
        for mcts_ctx in mcts_list:
            if not mcts_ctx.board.is_terminal():
                all_terminated = False
                break

    # extract the score from all boards
    tot_score = 0
    for mcts_ctx in mcts_list:
        score = mcts_ctx.board.white_score() if network_color == CONST.WHITE else mcts_ctx.board.black_score()
        tot_score += score

    tot_score /= game_count
    return tot_score
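
# usage sketch (the hyperparameter values below are hypothetical placeholders):
# evaluate the network against the optimal minimax player with both colors and
# average the two scores to remove the first-move advantage
def evaluate_vs_minimax(net, game_count=100, mcts_sim_count=25):
    score_white = play_minimax_games(net, game_count, mcts_sim_count, CONST.WHITE)
    score_black = play_minimax_games(net, game_count, mcts_sim_count, CONST.BLACK)
    return (score_white + score_black) / 2
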
def mcts_prediction_error(net, test_set, mcts_sim_count, alpha_dirich, temp):
    """
    returns the error percentage of the optimal move prediction by the network,
    the network and the mcts are used to predict the move to play
    :param net: the network
    :param test_set: the test set
    :param mcts_sim_count: the number of monte carlo simulations
    :param alpha_dirich: dirichlet noise parameter
    :param temp: the temperature
    :return: error percentage
    """
    tot_positions = test_set.shape[0]   # the total number of positions in the test set
    correct_predictions = 0             # the correctly predicted positions in the test set
    tot_predictions = 0

    batch_size = 512
    idx_list = []
    mcts_list = []
    for j in range(tot_positions):
        # ignore losing positions
        if test_set["weak_score"][j] < 0:
            continue

        # load the board
        board = connect4.Connect4Board()
        board.from_position(test_set["position"][j], test_set["disk_mask"][j])
        mcts_list.append(MCTS(board))

        # save the index
        idx_list.append(j)

        if len(mcts_list) == batch_size or j == tot_positions - 1:
            # =========================================== execute the mcts simulations for all boards
            mcts.run_simulations(mcts_list, mcts_sim_count, net, alpha_dirich)

            # =========================================== get the policy from the mcts
            for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
                policy = mcts_list[i_mcts_ctx].policy_from_state(mcts_ctx.board.state_id(), temp)
                move = np.argmax(policy)

                # check if the move is part of the optimal moves
                idx = idx_list[i_mcts_ctx]
                if str(move) in str(test_set["weak_moves"][idx]):
                    correct_predictions += 1

                tot_predictions += 1

            idx_list = []
            mcts_list = []

    # calculate the prediction error
    error = (tot_predictions - correct_predictions) / tot_predictions * 100
    return error
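
# sketch of how the test set is expected to look: the column names "position",
# "disk_mask", "weak_score" and "weak_moves" are taken from the indexing in
# mcts_prediction_error above; a pandas DataFrame and the file path are assumptions
import pandas as pd

def example_prediction_error(net):
    test_set = pd.read_csv("test_set/connect4_positions.csv")   # hypothetical path
    error = mcts_prediction_error(net, test_set, mcts_sim_count=200, alpha_dirich=0, temp=0)
    print("optimal move prediction error: {:.2f}%".format(error))
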
def __self_play_worker__(network_path, game_count):
    """
    plays a number of self-play games
    :param network_path: path of the network
    :param game_count: the number of self-play games to play
    :return: a list of dictionaries with all training examples
    """
    # load the network
    net = data_storage.load_net(network_path, Config.evaluation_device)

    training_expl_list = []

    # initialize the mcts object for all games
    mcts_list = [MCTS() for _ in range(game_count)]

    # initialize the lists that keep track of the games
    player_list = [[] for _ in range(game_count)]
    state_list = [[] for _ in range(game_count)]
    state_id_list = [[] for _ in range(game_count)]
    policy_list = [[] for _ in range(game_count)]

    move_count = 0
    all_terminated = False
    while not all_terminated:
        # =========================================== append the correct values to the lists for the training data
        for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
            # skip terminated games
            if mcts_ctx.board.terminal:
                continue

            # add the regular board
            state, player = mcts_ctx.board.white_perspective()
            state_id = mcts_ctx.board.state_id()
            state_list[i_mcts_ctx].append(state)
            state_id_list[i_mcts_ctx].append(state_id)
            player_list[i_mcts_ctx].append(player)

            # add the mirrored board
            board_mirrored = mcts_ctx.board.mirror()
            state_m, player_m = board_mirrored.white_perspective()
            state_id_m = board_mirrored.state_id()
            state_list[i_mcts_ctx].append(state_m)
            state_id_list[i_mcts_ctx].append(state_id_m)
            player_list[i_mcts_ctx].append(player_m)

        # =========================================== execute the mcts simulations for all boards
        mcts.run_simulations(mcts_list, Config.mcts_sim_count, net, Config.alpha_dirich)

        # =========================================== get the policy from the mcts
        temp = 0 if move_count >= Config.temp_threshold else Config.temp
        for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
            # skip terminated games
            if mcts_ctx.board.terminal:
                continue

            policy = mcts_list[i_mcts_ctx].policy_from_state(mcts_ctx.board.state_id(), temp)
            policy_list[i_mcts_ctx].append(policy)

            # add the mirrored policy as well
            policy_m = np.flip(policy)
            policy_list[i_mcts_ctx].append(policy_m)

            # sample from the policy to determine the move to play
            move = np.random.choice(len(policy), p=policy)
            mcts_ctx.board.play_move(move)

        move_count += 1

        # =========================================== check if there are still boards with running games
        all_terminated = True
        for mcts_ctx in mcts_list:
            if not mcts_ctx.board.terminal:
                all_terminated = False
                break

    # =========================================== add the training examples
    for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
        reward = mcts_ctx.board.training_reward()
        for i_player, player in enumerate(player_list[i_mcts_ctx]):
            value = reward if player == CONST.WHITE else -reward

            # save the training example
            training_expl_list.append({
                "state": state_list[i_mcts_ctx][i_player],
                "state_id": state_id_list[i_mcts_ctx][i_player],
                "player": player,
                "policy": policy_list[i_mcts_ctx][i_player],
                "value": value,
            })

    # free up some resources
    del net
    del mcts_list
    torch.cuda.empty_cache()

    return training_expl_list
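
# minimal sketch (tensor shapes depend on the game implementation) of how the
# training examples returned above could be collated into batches for a
# training step; the dictionary keys match the ones written by the worker
def examples_to_batch(training_expl_list):
    states = torch.Tensor(np.array([expl["state"] for expl in training_expl_list]))
    policies = torch.Tensor(np.array([expl["policy"] for expl in training_expl_list]))
    values = torch.Tensor([expl["value"] for expl in training_expl_list]).unsqueeze(1)
    return states, policies, values
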
def net_vs_net_mcts(net1, net2, mcts_sim_count, temp, game_count, game_class):
    """
    plays the two passed networks against each other, in half of the games
    net1 is white and in the other half net2 is white. the mcts policy is
    used to choose the moves.
    :param net1: network 1
    :param net2: network 2
    :param mcts_sim_count: the number of monte carlo simulations
    :param temp: the temperature used to sample from the mcts policy
    :param game_count: the number of games to play in total, half of the games
                       are played as white and the other half as black
    :param game_class: the class of the game
    :return: score of net1, the score is in the range of 0-1 where:
             0: loss
             0.5: draw
             1: win
    """
    half_count = game_count // 2

    mcts_ctx_wrapper_list = []
    for i in range(2 * half_count):
        if i < half_count:
            mcts_ctx_wrapper_list.append(MCTSContextWrapper(game_class(), 1))   # net1 is white
        else:
            mcts_ctx_wrapper_list.append(MCTSContextWrapper(game_class(), 2))   # net2 is white

    all_terminated = False
    while not all_terminated:
        # prepare the mcts context lists
        mcts_list1 = []     # mcts list where net1 needs to be used
        mcts_list2 = []     # mcts list where net2 needs to be used
        for mcts_ctx_wrapper in mcts_ctx_wrapper_list:
            # skip finished games
            if mcts_ctx_wrapper.board.is_terminal():
                continue

            mcts_ctx, net_number = mcts_ctx_wrapper.mcts_info()
            if net_number == 1:
                mcts_list1.append(mcts_ctx)
            else:
                mcts_list2.append(mcts_ctx)

        # run the mcts simulations
        mcts.run_simulations(mcts_list1, mcts_sim_count, net1, 0)
        mcts.run_simulations(mcts_list2, mcts_sim_count, net2, 0)

        # execute the move of the tree search
        for mcts_ctx in mcts_list1 + mcts_list2:
            # skip terminated games
            if mcts_ctx.board.is_terminal():
                continue

            # choose the action according to the probability distribution
            policy = mcts_ctx.policy_from_state(mcts_ctx.board.state_id(), temp)
            action = np.random.choice(len(policy), p=policy)

            # execute the action on the board
            mcts_ctx.board.execute_action(action)

        # check if all boards are terminated
        all_terminated = True
        for mcts_ctx_wrapper in mcts_ctx_wrapper_list:
            if not mcts_ctx_wrapper.board.is_terminal():
                all_terminated = False
                break

    # calculate the score of network 1
    score = 0
    for mcts_ctx_wrapper in mcts_ctx_wrapper_list:
        reward = (mcts_ctx_wrapper.board.reward() + 1) / 2
        if mcts_ctx_wrapper.player1_color == 1:
            score += reward             # net1 is white
        else:
            score += (1 - reward)       # net2 is white, net1 gets the inverted reward

    score = score / (2 * half_count)
    return score
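
# minimal sketch of the MCTSContextWrapper interface assumed by net_vs_net_mcts
# above; the real class lives elsewhere in the repo, and current_player() is an
# assumed accessor on the board
class MCTSContextWrapperSketch:
    def __init__(self, board, player1_color):
        self.player1_color = player1_color      # 1 if net1 plays white, 2 if net2 plays white
        self.board = board
        self.mcts_ctx = MCTS(board)

    def mcts_info(self):
        # the network that needs to act depends on the color to move
        white_to_move = self.board.current_player() == CONST.WHITE
        if self.player1_color == 1:
            net_number = 1 if white_to_move else 2
        else:
            net_number = 2 if white_to_move else 1
        return self.mcts_ctx, net_number
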
def __self_play_worker__(game_class, network_path, game_count):
    """
    plays a number of self-play games
    :param game_class: the class of the implemented game
    :param network_path: path of the network
    :param game_count: the number of self-play games to play
    :return: a list of dictionaries with all training examples
    """
    # load the network
    net = data_storage.load_net(network_path, config.evaluation_device)

    training_expl_list = []

    # initialize the mcts object for all games
    mcts_list = [MCTS(game_class()) for _ in range(game_count)]

    # initialize the lists that keep track of the games
    player_list = [[] for _ in range(game_count)]
    state_list = [[] for _ in range(game_count)]
    state_id_list = [[] for _ in range(game_count)]
    policy_list = [[] for _ in range(game_count)]

    move_count = 0
    all_terminated = False
    while not all_terminated:
        # =========================================== execute the mcts simulations for all boards
        mcts.run_simulations(mcts_list, config.mcts_sim_count, net, config.alpha_dirich)

        # =========================================== get the policy from the mcts
        temp = 0 if move_count >= config.temp_threshold else config.temp
        for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
            # skip terminated games
            if mcts_ctx.board.is_terminal():
                continue

            policy = mcts_list[i_mcts_ctx].policy_from_state(mcts_ctx.board.state_id(), temp)

            # add the regular board
            state, player = mcts_ctx.board.white_perspective()
            state_id = mcts_ctx.board.state_id()
            state_list[i_mcts_ctx].append(state)
            state_id_list[i_mcts_ctx].append(state_id)
            player_list[i_mcts_ctx].append(player)
            policy_list[i_mcts_ctx].append(policy)

            # add the symmetric boards
            board_symmetries, policy_symmetries = mcts_ctx.board.symmetries(policy)
            if board_symmetries is not None:
                for board_sym, policy_sym in zip(board_symmetries, policy_symmetries):
                    state_s, player_s = board_sym.white_perspective()
                    state_id_s = board_sym.state_id()
                    state_list[i_mcts_ctx].append(state_s)
                    state_id_list[i_mcts_ctx].append(state_id_s)
                    player_list[i_mcts_ctx].append(player_s)
                    policy_list[i_mcts_ctx].append(policy_sym)

            # sample from the policy to determine the move to play
            action = np.random.choice(len(policy), p=policy)
            mcts_ctx.board.execute_action(action)

        move_count += 1

        # =========================================== check if there are still boards with running games
        all_terminated = True
        for mcts_ctx in mcts_list:
            if not mcts_ctx.board.is_terminal():
                all_terminated = False
                break

    # =========================================== add the training examples
    for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
        reward = mcts_ctx.board.training_reward()
        for i_player, player in enumerate(player_list[i_mcts_ctx]):
            value = reward if player == CONST.WHITE else -reward

            # save the training example
            training_expl_list.append({
                "state": state_list[i_mcts_ctx][i_player],
                "state_id": state_id_list[i_mcts_ctx][i_player],
                "player": player,
                "policy": policy_list[i_mcts_ctx][i_player],
                "value": value,
            })

    # free up some resources
    del net
    del mcts_list
    torch.cuda.empty_cache()

    return training_expl_list
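
# sketch (the worker count and pool setup are assumptions) of how the self-play
# workers could be dispatched in parallel; the actual training pipeline may differ
import multiprocessing as mp

def generate_self_play_data(game_class, network_path, total_games, worker_count=4):
    games_per_worker = total_games // worker_count
    with mp.Pool(worker_count) as pool:
        results = pool.starmap(__self_play_worker__,
                               [(game_class, network_path, games_per_worker)] * worker_count)

    # flatten the per-worker example lists into one training set
    return [expl for worker_expls in results for expl in worker_expls]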