print("pytorch version: ", torch.__version__) # load the network path_list = os.listdir(network_dir) path_list.sort(key=utils.natural_keys) # load the test set with the solved positions test_set = pd.read_csv(test_set_path, sep=",") # test the best network to quickly get a result net_path = network_dir + path_list[-1] net = data_storage.load_net(net_path, evaluation.torch_device) policy_error, value_error = evaluation.net_prediction_error(net, test_set) logger.debug("prediction-error: {}, value-error: {}, network: {}".format(policy_error, value_error, net_path)) # calculate the prediction error of the networks generation = [] net_prediciton_error = [] net_value_error = [] mcts_prediciton_error = [] path_list = os.listdir(network_dir) path_list.sort(key=utils.natural_keys) # empty board test board = connect4.Connect4Board()
def __self_play_worker__(network_path, game_count):
    """
    plays a number of self play games
    :param network_path:    path of the network
    :param game_count:      the number of self-play games to play
    :return:                a list of dictionaries with all training examples
    """
    # load the network
    net = data_storage.load_net(network_path, Config.evaluation_device)

    training_expl_list = []

    # initialize the mcts object for all games
    mcts_list = [MCTS() for _ in range(game_count)]

    # initialize the lists that keep track of the games
    player_list = [[] for _ in range(game_count)]
    state_list = [[] for _ in range(game_count)]
    state_id_list = [[] for _ in range(game_count)]
    policy_list = [[] for _ in range(game_count)]

    move_count = 0
    all_terminated = False
    while not all_terminated:
        # =========================================== append the correct values to the lists for the training data
        for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
            # skip terminated games
            if mcts_ctx.board.terminal:
                continue

            # add regular board
            state, player = mcts_ctx.board.white_perspective()
            state_id = mcts_ctx.board.state_id()
            state_list[i_mcts_ctx].append(state)
            state_id_list[i_mcts_ctx].append(state_id)
            player_list[i_mcts_ctx].append(player)

            # add mirrored board
            board_mirrored = mcts_ctx.board.mirror()
            state_m, player_m = board_mirrored.white_perspective()
            state_id_m = board_mirrored.state_id()
            state_list[i_mcts_ctx].append(state_m)
            state_id_list[i_mcts_ctx].append(state_id_m)
            player_list[i_mcts_ctx].append(player_m)

        # =========================================== execute the mcts simulations for all boards
        mcts.run_simulations(mcts_list, Config.mcts_sim_count, net, Config.alpha_dirich)

        # =========================================== get the policy from the mcts
        temp = 0 if move_count >= Config.temp_threshold else Config.temp
        for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
            # skip terminated games
            if mcts_ctx.board.terminal:
                continue

            policy = mcts_list[i_mcts_ctx].policy_from_state(mcts_ctx.board.state_id(), temp)
            policy_list[i_mcts_ctx].append(policy)

            # add the mirrored policy as well
            policy_m = np.flip(policy)
            policy_list[i_mcts_ctx].append(policy_m)

            # sample from the policy to determine the move to play
            move = np.random.choice(len(policy), p=policy)
            mcts_ctx.board.play_move(move)

        move_count += 1

        # =========================================== check if there are still boards with running games
        all_terminated = True
        for mcts_ctx in mcts_list:
            if not mcts_ctx.board.terminal:
                all_terminated = False
                break

    # =========================================== add the training examples
    for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
        reward = mcts_ctx.board.training_reward()
        for i_player, player in enumerate(player_list[i_mcts_ctx]):
            value = reward if player == CONST.WHITE else -reward

            # save the training example
            training_expl_list.append({
                "state": state_list[i_mcts_ctx][i_player],
                "state_id": state_id_list[i_mcts_ctx][i_player],
                "player": player,
                "policy": policy_list[i_mcts_ctx][i_player],
                "value": value,
            })

    # free up some resources
    del net
    del mcts_list
    torch.cuda.empty_cache()

    return training_expl_list
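# Hedged usage sketch, not taken from the original code: since __self_play_worker__ only
# needs a network path and a game count, the self-play games can be spread over several
# worker processes. The pool size, the helper name and the "spawn" start method (chosen
# because the worker touches CUDA through torch) are assumptions for illustration.
import torch.multiprocessing as mp

def generate_self_play_data(network_path, total_games, n_workers=4):
    games_per_worker = total_games // n_workers     # any remainder is ignored in this sketch
    ctx = mp.get_context("spawn")
    with ctx.Pool(processes=n_workers) as pool:
        results = pool.starmap(__self_play_worker__,
                               [(network_path, games_per_worker) for _ in range(n_workers)])

    # flatten the per-worker example lists into a single training list
    return [example for worker_list in results for example in worker_list]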
def main_evaluation(game_class, result_folder):
    # configuration values
    game_count = 200        # the number of test games to play
    mcts_sim_count = 200    # the number of mcts simulations to perform
    temp = 0.3              # the temperature used to get the policy for the move selection, gives some randomness

    # the logger
    utils.init_logger(logging.DEBUG, file_name="log/app.log")
    logger = logging.getLogger('evaluation')

    # set the random seed
    random.seed(a=None, version=2)
    np.random.seed(seed=None)

    # load the networks
    network_dir = config.save_dir + "/networks/"
    path_list = os.listdir(network_dir)
    path_list.sort(key=utils.natural_keys)

    # let all networks play against the last generation without any mcts
    best_net_path = network_dir + path_list[-1]
    best_net = data_storage.load_net(best_net_path, torch_device)

    generation = []
    prediction_score = []
    for i in range(len(path_list)):
        generation.append(i)
        net_path = network_dir + path_list[i]
        net = data_storage.load_net(net_path, torch_device)

        score = net_vs_net_prediction(net, best_net, game_count, game_class)
        prediction_score.append(score)
        logger.debug("prediction score: {}, network: {}".format(score, net_path))

    # let all networks play against the last generation with mcts
    mcts_score = []
    path_list = []      # [path_list[0], path_list[-2]]
    for i in range(len(path_list)):
        net_path = network_dir + path_list[i]
        net = data_storage.load_net(net_path, torch_device)

        score = net_vs_net_mcts(net, best_net, mcts_sim_count, temp, game_count, game_class)
        mcts_score.append(score)
        logger.debug("mcts score: {}, network: {}".format(score, net_path))

    # save the results
    np.save(result_folder + "/net_vs_net_pred.npy", np.array(prediction_score))
    np.save(result_folder + "/net_vs_net_mcts.npy", np.array(mcts_score))
    np.save(result_folder + "/net_vs_net_gen.npy", np.array(generation))

    # set the style of the plot
    plt.style.use('seaborn-dark-palette')

    # plot the prediction score
    fig1 = plt.figure(1)
    plt.plot(generation, prediction_score)
    axes = plt.gca()
    axes.set_ylim([0, 0.55])
    axes.grid(True, color=(0.9, 0.9, 0.9))
    plt.title("Prediction Score vs Best Network")
    plt.xlabel("Generation")
    plt.ylabel("Prediction Score")
    fig1.show()

    # # plot the mcts score
    # fig2 = plt.figure(2)
    # plt.plot(generation, mcts_score)
    # axes = plt.gca()
    # axes.set_ylim([0, 0.55])
    # axes.grid(True, color=(0.9, 0.9, 0.9))
    # plt.title("MCTS Prediction Score vs Best Network")
    # plt.xlabel("Generation")
    # plt.ylabel("MCTS Score")
    # fig2.show()

    plt.show()
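# Hypothetical driver for main_evaluation (the import path and the result folder are
# assumptions, not part of the original module): the function only needs the game class
# and a folder for the .npy result files.
if __name__ == "__main__":
    import connect4                                     # assumed module exposing Connect4Board
    main_evaluation(connect4.Connect4Board, "results/connect4")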
# create the minimax state dict
minimax.create_state_dict()

# load the network
path_list = os.listdir(network_dir)
path_list.sort(key=utils.natural_keys)

# define the parameters for the evaluation
torch_device = torch.device('cpu')  # torch device that is used for evaluation
game_count = 300        # the number of games to play
mcts_sim_count = 20     # the number of mcts simulations

# test the best network to quickly get a result
net_path = network_dir + path_list[-1]
net = data_storage.load_net(net_path, torch_device)
white_score = minimax.play_minimax_games(net, game_count, mcts_sim_count, CONST.WHITE)
black_score = minimax.play_minimax_games(net, game_count, mcts_sim_count, CONST.BLACK)
logger.debug("white score: {}, black: {}, network: {}".format(white_score, black_score, net_path))

# let the different networks play against a minimax player
generation = []
white_scores = []
black_scores = []
path_list = os.listdir(network_dir)
path_list.sort(key=utils.natural_keys)

# get the prediction error of all networks
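# Sketch of the per-generation minimax evaluation (an assumption, not original code):
# every stored network plays game_count games as white and as black against the minimax
# player, mirroring the quick test of the best network above.
for i, path in enumerate(path_list):
    net_path = network_dir + path
    net = data_storage.load_net(net_path, torch_device)

    generation.append(i)
    white_scores.append(minimax.play_minimax_games(net, game_count, mcts_sim_count, CONST.WHITE))
    black_scores.append(minimax.play_minimax_games(net, game_count, mcts_sim_count, CONST.BLACK))
    logger.debug("generation {}: white score: {}, black score: {}".format(i, white_scores[-1], black_scores[-1]))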
def __self_play_worker__(game_class, network_path, game_count):
    """
    plays a number of self play games
    :param game_class:      the class of the implemented games
    :param network_path:    path of the network
    :param game_count:      the number of self-play games to play
    :return:                a list of dictionaries with all training examples
    """
    # load the network
    net = data_storage.load_net(network_path, config.evaluation_device)

    training_expl_list = []

    # initialize the mcts object for all games
    mcts_list = [MCTS(game_class()) for _ in range(game_count)]

    # initialize the lists that keep track of the games
    player_list = [[] for _ in range(game_count)]
    state_list = [[] for _ in range(game_count)]
    state_id_list = [[] for _ in range(game_count)]
    policy_list = [[] for _ in range(game_count)]

    move_count = 0
    all_terminated = False
    while not all_terminated:
        # =========================================== execute the mcts simulations for all boards
        mcts.run_simulations(mcts_list, config.mcts_sim_count, net, config.alpha_dirich)

        # =========================================== get the policy from the mcts
        temp = 0 if move_count >= config.temp_threshold else config.temp
        for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
            # skip terminated games
            if mcts_ctx.board.is_terminal():
                continue

            policy = mcts_list[i_mcts_ctx].policy_from_state(mcts_ctx.board.state_id(), temp)

            # add regular board
            state, player = mcts_ctx.board.white_perspective()
            state_id = mcts_ctx.board.state_id()
            state_list[i_mcts_ctx].append(state)
            state_id_list[i_mcts_ctx].append(state_id)
            player_list[i_mcts_ctx].append(player)
            policy_list[i_mcts_ctx].append(policy)

            # add symmetric boards
            board_symmetries, policy_symmetries = mcts_ctx.board.symmetries(policy)
            if board_symmetries is not None:
                for board_sym, policy_sym in zip(board_symmetries, policy_symmetries):
                    state_s, player_s = board_sym.white_perspective()
                    state_id_s = board_sym.state_id()
                    state_list[i_mcts_ctx].append(state_s)
                    state_id_list[i_mcts_ctx].append(state_id_s)
                    player_list[i_mcts_ctx].append(player_s)
                    policy_list[i_mcts_ctx].append(policy_sym)

            # sample from the policy to determine the move to play
            action = np.random.choice(len(policy), p=policy)
            mcts_ctx.board.execute_action(action)

        move_count += 1

        # =========================================== check if there are still boards with running games
        all_terminated = True
        for mcts_ctx in mcts_list:
            if not mcts_ctx.board.is_terminal():
                all_terminated = False
                break

    # =========================================== add the training examples
    for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
        reward = mcts_ctx.board.training_reward()
        for i_player, player in enumerate(player_list[i_mcts_ctx]):
            value = reward if player == CONST.WHITE else -reward

            # save the training example
            training_expl_list.append({
                "state": state_list[i_mcts_ctx][i_player],
                "state_id": state_id_list[i_mcts_ctx][i_player],
                "player": player,
                "policy": policy_list[i_mcts_ctx][i_player],
                "value": value,
            })

    # free up some resources
    del net
    del mcts_list
    torch.cuda.empty_cache()

    return training_expl_list
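# Hedged sketch (assumption, not original code): the generic worker above delegates data
# augmentation to board.symmetries(policy). For Connect4 this hook is expected to return
# the horizontally mirrored position together with the flipped column policy, which is
# what the Connect4-specific worker did explicitly; games without useful symmetries can
# simply return (None, None).
def connect4_symmetries(board, policy):
    board_mirrored = board.mirror()         # mirror() as used in the Connect4-specific worker
    policy_mirrored = np.flip(policy)       # the column probabilities reverse with the board
    return [board_mirrored], [policy_mirrored]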