Example No. 1
def play_minimax_games(net, game_count, mcts_sim_count, network_color):
    """
    plays games of the network (guided by the mcts) against an optimal minimax player
    and returns the average score achieved by the network
    :param net:                 the network
    :param game_count:          the number of games to play
    :param mcts_sim_count:      the number of monte carlo simulations
    :param network_color:       the color of the network
    :return:                    the score of the network vs the minimax player
    """
    mcts_list = [mcts.MCTS(tic_tac_toe.TicTacToeBoard()) for _ in range(game_count)]
    player = CONST.WHITE

    all_terminated = False
    while not all_terminated:
        # make a move with the az agent
        if player == network_color:
            # run all mcts simulations
            mcts.run_simulations(mcts_list, mcts_sim_count, net, 0)

            # play the best move suggested by the mcts policy
            for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
                # skip terminated games
                if mcts_ctx.board.is_terminal():
                    continue

                policy = mcts_list[i_mcts_ctx].policy_from_state(mcts_ctx.board.state_id(), 0)
                move = np.where(policy == 1)[0][0]
                mcts_ctx.board.execute_action(move)

        # make an optimal minimax move
        else:
            for mcts_ctx in mcts_list:
                # skip terminated games
                if mcts_ctx.board.is_terminal():
                    continue

                move = mcts_ctx.board.minimax_move()
                mcts_ctx.board.execute_action(move)

        # swap the player
        player = CONST.WHITE if player == CONST.BLACK else CONST.BLACK

        # check if all games are terminated
        all_terminated = True
        for mcts_ctx in mcts_list:
            if not mcts_ctx.board.is_terminal():
                all_terminated = False
                break


    # extract the score from all boards
    tot_score = 0
    for mcts_ctx in mcts_list:
        score = mcts_ctx.board.white_score() if network_color == CONST.WHITE else mcts_ctx.board.black_score()
        tot_score += score

    tot_score /= game_count
    return tot_score
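
A minimal usage sketch (an assumption, not part of the original example): `net` is a trained network object and CONST is the constants module used above.

# hypothetical usage of play_minimax_games; net is assumed to be a trained network
score_white = play_minimax_games(net, game_count=100, mcts_sim_count=100, network_color=CONST.WHITE)
score_black = play_minimax_games(net, game_count=100, mcts_sim_count=100, network_color=CONST.BLACK)
print("average score vs. minimax: white {:.3f}, black {:.3f}".format(score_white, score_black))
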
Example No. 2
def mcts_prediction_error(net, test_set, mcts_sim_count, alpha_dirich, temp):
    """
    returns the error percentage of the optimal move prediction by the network
    the network and the mcts are used to predict the move to play
    :param net:             the network
    :param test_set:        the test set
    :param mcts_sim_count:  the number of monte carlo simulations
    :param alpha_dirich:    dirichlet noise parameter
    :param temp:            the temperature
    :return:                error percentage
    """
    tot_positions = test_set.shape[0]        # the total number of positions in the test set
    correct_predictions = 0  # the correctly predicted positions in the test set
    tot_predictions = 0

    batch_size = 512
    idx_list = []
    mcts_list = []

    for j in range(tot_positions):
        # ignore losing positions
        if test_set["weak_score"][j] < 0:
            continue

        # load the board
        board = connect4.Connect4Board()
        board.from_position(test_set["position"][j], test_set["disk_mask"][j])
        mcts_list.append(MCTS(board))

        # save the index
        idx_list.append(j)

        if len(mcts_list) == batch_size or j == tot_positions - 1:
            # =========================================== execute the mcts simulations for all boards
            mcts.run_simulations(mcts_list, mcts_sim_count, net, alpha_dirich)

            # ===========================================  get the policy from the mcts
            for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
                policy = mcts_list[i_mcts_ctx].policy_from_state(
                    mcts_ctx.board.state_id(), temp)
                move = np.argmax(policy)

                # check if the move is part of the optimal moves
                idx = idx_list[i_mcts_ctx]
                if str(move) in str(test_set["weak_moves"][idx]):
                    correct_predictions += 1

                tot_predictions += 1

            idx_list = []
            mcts_list = []

    # calculate the prediction error
    error = (tot_predictions - correct_predictions) / tot_predictions * 100
    return error
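
A minimal usage sketch (assumptions, not part of the original example): test_set is a pandas DataFrame providing the "position", "disk_mask", "weak_score" and "weak_moves" columns, and net is a trained network.

# hypothetical usage of mcts_prediction_error
error = mcts_prediction_error(net, test_set, mcts_sim_count=200, alpha_dirich=0, temp=0)
print("mcts move prediction error: {:.2f}%".format(error))
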
Example No. 3
def __self_play_worker__(network_path, game_count):
    """
    plays a number of self-play games
    :param network_path:        path of the network
    :param game_count:          the number of self-play games to play
    :return:                    a list of dictionaries with all training examples
    """
    # load the network
    net = data_storage.load_net(network_path, Config.evaluation_device)

    training_expl_list = []

    # initialize the mcts object for all games
    mcts_list = [MCTS() for _ in range(game_count)]

    # initialize the lists that keep track of the game
    player_list = [[] for _ in range(game_count)]
    state_list = [[] for _ in range(game_count)]
    state_id_list = [[] for _ in range(game_count)]
    policy_list = [[] for _ in range(game_count)]

    move_count = 0
    all_terminated = False
    while not all_terminated:
        # ===========================================  append the correct values to the lists for the training data
        for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
            # skip terminated games
            if mcts_ctx.board.terminal:
                continue

            # add regular board
            state, player = mcts_ctx.board.white_perspective()
            state_id = mcts_ctx.board.state_id()
            state_list[i_mcts_ctx].append(state)
            state_id_list[i_mcts_ctx].append(state_id)
            player_list[i_mcts_ctx].append(player)

            # add mirrored board
            board_mirrored = mcts_ctx.board.mirror()
            state_m, player_m = board_mirrored.white_perspective()
            state_id_m = board_mirrored.state_id()
            state_list[i_mcts_ctx].append(state_m)
            state_id_list[i_mcts_ctx].append(state_id_m)
            player_list[i_mcts_ctx].append(player_m)

        # =========================================== execute the mcts simulations for all boards
        mcts.run_simulations(mcts_list, Config.mcts_sim_count, net,
                             Config.alpha_dirich)

        # ===========================================  get the policy from the mcts
        temp = 0 if move_count >= Config.temp_threshold else Config.temp

        for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
            # skip terminated games
            if mcts_ctx.board.terminal:
                continue

            policy = mcts_list[i_mcts_ctx].policy_from_state(
                mcts_ctx.board.state_id(), temp)
            policy_list[i_mcts_ctx].append(policy)

            # add the mirrored policy as well
            policy_m = np.flip(policy)
            policy_list[i_mcts_ctx].append(policy_m)

            # sample from the policy to determine the move to play
            move = np.random.choice(len(policy), p=policy)
            mcts_ctx.board.play_move(move)

        move_count += 1

        # ===========================================  check if there are still boards with running games
        all_terminated = True
        for mcts_ctx in mcts_list:
            if not mcts_ctx.board.terminal:
                all_terminated = False
                break

    # =========================================== add the training example
    for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
        reward = mcts_ctx.board.training_reward()
        for i_player, player in enumerate(player_list[i_mcts_ctx]):
            value = reward if player == CONST.WHITE else -reward

            # save the training example
            training_expl_list.append({
                "state": state_list[i_mcts_ctx][i_player],
                "state_id": state_id_list[i_mcts_ctx][i_player],
                "player": player,
                "policy": policy_list[i_mcts_ctx][i_player],
                "value": value,
            })

    # free up some resources
    del net
    del mcts_list
    torch.cuda.empty_cache()

    return training_expl_list
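
A minimal usage sketch (the path and game count are hypothetical): the worker loads the network itself, so it only needs the file path produced by data_storage.

# hypothetical usage of the self-play worker
examples = __self_play_worker__("networks/current_net.pt", game_count=64)
print("collected {} training examples".format(len(examples)))
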
Example No. 4
def net_vs_net_mcts(net1, net2, mcts_sim_count, temp, game_count, game_class):
    """
    plays the two passed networks against each other; in half of the games net1 is white and in the
    other half net2 is white. the mcts is used to get the policy.
    :param net1:            network 1
    :param net2:            network 2
    :param mcts_sim_count:  the number of monte carlo simulations per move
    :param temp:            the mcts temperature
    :param game_count:      the number of games to play in total, half of the games are played as white and the other half as black
    :param game_class:      the class of the game
    :return:                score of net1, the score is in the range of 0-1 where:
                            0:   loss
                            0.5: draw
                            1:   win
    """

    half_count = game_count // 2

    mcts_ctx_wrapper_list = []
    for i in range(2*half_count):
        if i < half_count:
            mcts_ctx_wrapper_list.append(MCTSContextWrapper(game_class(), 1))    # net1 is white
        else:
            mcts_ctx_wrapper_list.append(MCTSContextWrapper(game_class(), 2))    # net2 is white


    all_terminated = False
    while not all_terminated:
        # prepare the mcts context lists
        mcts_list1 = []         # mcts list where net1 needs to be used
        mcts_list2 = []         # mcts list where net2 needs to be used

        for idx, mcts_ctx_wrapper in enumerate(mcts_ctx_wrapper_list):
            # skip finished games
            if mcts_ctx_wrapper.board.is_terminal():
                continue

            mcts_ctx, net_number = mcts_ctx_wrapper.mcts_info()
            if net_number == 1:
                mcts_list1.append(mcts_ctx)
            else:
                mcts_list2.append(mcts_ctx)

        # run the mcts simulations
        mcts.run_simulations(mcts_list1, mcts_sim_count, net1, 0)
        mcts.run_simulations(mcts_list2, mcts_sim_count, net2, 0)


        # execute the move of the tree search
        for mcts_ctx in mcts_list1:
            # skip terminated games
            if mcts_ctx.board.is_terminal():
                continue

            # choose the action according to the probability distribution
            policy = mcts_ctx.policy_from_state(mcts_ctx.board.state_id(), temp)
            action = np.random.choice(len(policy), p=policy)

            # execute the action on the board
            mcts_ctx.board.execute_action(action)


        for mcts_ctx in mcts_list2:
            # skip terminated games
            if mcts_ctx.board.is_terminal():
                continue

            # choose the action according to the probability distribution
            policy = mcts_ctx.policy_from_state(mcts_ctx.board.state_id(), temp)
            action = np.random.choice(len(policy), p=policy)

            # execute the action on the board
            mcts_ctx.board.execute_action(action)


        # check if all boards are terminated
        all_terminated = True
        for mcts_ctx_wrapper in mcts_ctx_wrapper_list:
            if not mcts_ctx_wrapper.board.is_terminal():
                all_terminated = False
                break


    # calculate the score of network 1
    score = 0
    for mcts_ctx_wrapper in mcts_ctx_wrapper_list:
        reward = (mcts_ctx_wrapper.board.reward() + 1) / 2
        if mcts_ctx_wrapper.player1_color == 1:
            score += reward                 # net1 is white
        else:
            score += (1-reward)             # net1 is black


    score = score / (2*half_count)
    return score
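
A minimal usage sketch (assumptions: net1 and net2 are trained networks, and connect4.Connect4Board is used as the game class as in example No. 2):

# hypothetical usage of net_vs_net_mcts
score_net1 = net_vs_net_mcts(net1, net2, mcts_sim_count=200, temp=0.3, game_count=100, game_class=connect4.Connect4Board)
print("score of net1 against net2: {:.3f}".format(score_net1))
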
Example No. 5
def __self_play_worker__(game_class, network_path, game_count):
    """
    plays a number of self-play games
    :param game_class:          the class of the implemented games
    :param network_path:        path of the network
    :param game_count:          the number of self-play games to play
    :return:                    a list of dictionaries with all training examples
    """
    # load the network
    net = data_storage.load_net(network_path, config.evaluation_device)

    training_expl_list = []

    # initialize the mcts object for all games
    mcts_list = [MCTS(game_class()) for _ in range(game_count)]

    # initialize the lists that keep track of the games
    player_list = [[] for _ in range(game_count)]
    state_list = [[] for _ in range(game_count)]
    state_id_list = [[] for _ in range(game_count)]
    policy_list = [[] for _ in range(game_count)]

    move_count = 0
    all_terminated = False
    while not all_terminated:
        # =========================================== execute the mcts simulations for all boards
        mcts.run_simulations(mcts_list, config.mcts_sim_count, net,
                             config.alpha_dirich)

        # ===========================================  get the policy from the mcts
        temp = 0 if move_count >= config.temp_threshold else config.temp

        for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
            # skip terminated games
            if mcts_ctx.board.is_terminal():
                continue

            policy = mcts_list[i_mcts_ctx].policy_from_state(
                mcts_ctx.board.state_id(), temp)

            # add regular board
            state, player = mcts_ctx.board.white_perspective()
            state_id = mcts_ctx.board.state_id()
            state_list[i_mcts_ctx].append(state)
            state_id_list[i_mcts_ctx].append(state_id)
            player_list[i_mcts_ctx].append(player)
            policy_list[i_mcts_ctx].append(policy)

            # add symmetric boards
            board_symmetries, policy_symmetries = mcts_ctx.board.symmetries(
                policy)
            if board_symmetries is not None:
                for board_sym, policy_sym in zip(board_symmetries,
                                                 policy_symmetries):
                    state_s, player_s = board_sym.white_perspective()
                    state_id_s = board_sym.state_id()
                    state_list[i_mcts_ctx].append(state_s)
                    state_id_list[i_mcts_ctx].append(state_id_s)
                    player_list[i_mcts_ctx].append(player_s)

                    policy_list[i_mcts_ctx].append(policy_sym)

            # sample from the policy to determine the move to play
            action = np.random.choice(len(policy), p=policy)
            mcts_ctx.board.execute_action(action)

        move_count += 1

        # ===========================================  check if there are still boards with running games
        all_terminated = True
        for mcts_ctx in mcts_list:
            if not mcts_ctx.board.is_terminal():
                all_terminated = False
                break

    # =========================================== add the training example
    for i_mcts_ctx, mcts_ctx in enumerate(mcts_list):
        reward = mcts_ctx.board.training_reward()
        for i_player, player in enumerate(player_list[i_mcts_ctx]):
            value = reward if player == CONST.WHITE else -reward

            # save the training example
            training_expl_list.append({
                "state": state_list[i_mcts_ctx][i_player],
                "state_id": state_id_list[i_mcts_ctx][i_player],
                "player": player,
                "policy": policy_list[i_mcts_ctx][i_player],
                "value": value,
            })

    # free up some resources
    del net
    del mcts_list
    torch.cuda.empty_cache()

    return training_expl_list
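
A minimal usage sketch (the game class, path and game count are assumptions): this generic worker receives the game class explicitly instead of hard-coding it.

# hypothetical usage of the generic self-play worker
examples = __self_play_worker__(connect4.Connect4Board, "networks/current_net.pt", game_count=64)
print("collected {} training examples".format(len(examples)))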