Example #1
    #     param.grad.data.clamp_(-1, 1)
    rl_optimizer.step()  # apply gradients to the RL (Q) network
    sl_optimizer.step()  # apply gradients to the SL (average-policy) network


steps_done = 0
iteration = 0
for i_episode in range(2000000):
    betting_state = np.zeros((num_player, 4, 3, 15))
    # observe states
    (player_states, (community_infos, community_cards)) = env.reset()
    (player_infos, player_hands) = zip(*player_states)
    for i in range(num_player):
        players[i].setInitState(
            betting_state,
            toCardState(community_cards, player_hands[i], card_dictionary))
    current_round = 0
    terminal = False
    while not terminal:
        # env.render()
        current_player = community_infos[-1]
        current_round = community_infos[5]
        current_pot = community_infos[3]
        card_embed = toCardState(community_cards, player_hands[current_player],
                                 card_dictionary)
        # setstate
        players[current_player].setState(players[current_player].toStateRepre(
            betting_state, card_embed))
        # set epsilon
        players[current_player].setEpsilon(i_episode)
        # take action
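
The excerpt ends before the action is taken. setEpsilon(i_episode) suggests an annealed exploration rate for the epsilon-greedy RL policy; a minimal sketch of a typical schedule (the constants and function name below are assumptions, not taken from the repo):

import math

# Hypothetical schedule; EPS_START, EPS_END and EPS_DECAY are illustrative
# values, not confirmed by mlp-holdem.
EPS_START, EPS_END, EPS_DECAY = 0.9, 0.05, 100000

def epsilon_for_episode(i_episode):
    # exponential decay from EPS_START toward EPS_END
    return EPS_END + (EPS_START - EPS_END) * math.exp(-i_episode / EPS_DECAY)
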
Example #2
    checkpoint = torch.load(sl_dir[i], map_location='cpu')
    sl_net[i].load_state_dict(checkpoint['model'])
    sl_net[i].eval()

nfsp_players = NFSPLimit(card_dict=card_dictionary,
                         device=device,
                         anticipatoryFactor=1)  # eta=1: presumably always play the RL best response

for i in range(100000):
    deck = Deck()
    community = [-1, -1, -1, -1, -1]  # no cards dealt yet
    # community = deck.draw(3) # 3 cards
    hands = deck.draw(2)
    betting_state = np.zeros((num_player, 4, 5, 3))  # no history
    card_embed = toCardState(community, hands, card_dictionary)
    betting_state_flatten = betting_state.flatten()
    state = np.concatenate((betting_state_flatten, card_embed))
    sl = sl_net[0](torch.tensor(state, dtype=torch.float)).detach().numpy()
    q = policy_net[0](torch.tensor(state, dtype=torch.float)).detach().numpy()
    print(hand_to_str(hands))

    # if sl[1] > 0 or q[1] > 0:
    # if q[1] > 0.01:
    #     print(sl)
    #     print(q)
    #     print(hand_to_str(community))
    #     print(hand_to_str(hands))
    print(q)
    input()
# for i_episode in range(25000):
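
The loop above prints the raw SL and Q outputs for random two-card hands. To turn those vectors into a discrete action, one common scheme is greedy selection over the Q-values and sampling from the (softmaxed) average policy; a sketch under that assumption, which mlp-holdem may handle differently:

import numpy as np

def pick_actions(q, sl):
    # greedy action from the RL (best-response) network's Q-values
    rl_action = int(np.argmax(q))
    # sample from the SL (average-policy) output; softmax in case the
    # network emits logits rather than probabilities
    probs = np.exp(sl - np.max(sl))
    probs /= probs.sum()
    sl_action = int(np.random.choice(len(sl), p=probs))
    return rl_action, sl_action
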
Example #3
File: selfplay.py Project: yw10/mlp-holdem
        game_board = {}
        sum_board = {}

        nfsp_players = {}
        for i in range(num_NFSP):
            nfsp_players[i] = NFSPLimit(card_dict=card_dictionary, device=device)
            game_board[i] = 20000
            sum_board[i] = 0

        for i_episode in range(25000):
            betting_state = np.zeros((num_player, 4, 5, 3))
            # print('-------------Playing Game:{}------------'.format(i_episode))
            (player_states, (community_infos, community_cards)) = env.reset()
            (player_infos, player_hands) = zip(*player_states)
            for i in range(num_NFSP):
                nfsp_players[i].setInitState(
                    betting_state,
                    toCardState(community_cards, player_hands[i],
                                card_dictionary))

            current_round = 0
            terminal = False
            if i_episode % 1000 == 0:
                print(i_episode)
                env.render()
            while not terminal:
                current_player = community_infos[-1]
                current_round = community_infos[5]
                current_raises = community_infos[6]
                current_pot = community_infos[3]
                card_embed = toCardState(community_cards,
                                         player_hands[current_player],
                                         card_dictionary)
                nfsp_players[current_player].setState(
                    nfsp_players[current_player].toStateRepre(
                        betting_state, card_embed))
                action, follow_rl = nfsp_players[current_player].act(
                    sl_net=sl_net[current_player],
                    policy_net=policy_net[current_player])
                action_c = correctLimitAction(action.item(), community_infos,
                                              player_infos, num_player)
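
act() returns both the action and a follow_rl flag, which matches NFSP's anticipatory mixing: with probability eta play the best-response (RL) network, otherwise the average-policy (SL) network. A minimal sketch of that dispatch (assumed behavior; the real NFSPLimit.act likely adds epsilon-greedy exploration and replay-buffer bookkeeping):

import random

def nfsp_act(state, sl_net, policy_net, eta=0.1):
    # with probability eta follow the best-response (RL) network,
    # otherwise the supervised average-policy network
    follow_rl = random.random() < eta
    net = policy_net if follow_rl else sl_net
    action = net(state).argmax()  # state: torch tensor; nets: torch modules
    return action, follow_rl
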
Example #4
def evaluate(policy_net, rl_optimizer, sl_net, sl_optimizer, steps_done,
             iteration, type_of_eval):
    env = LimitTexasHoldemEnv(num_player, max_limit=1e9,
                              debug=False)  # initialize 4-player game
    env.add_player(0, stack=20000)  # add a player to seat 0 with 20000 chips
    env.add_player(1, stack=20000)  # add a player to seat 1 with 20000 chips
    env.add_player(2, stack=20000)  # add a player to seat 2 with 20000 chips
    env.add_player(3, stack=20000)  # add a player to seat 3 with 20000 chips

    results = []
    for experiment in range(1):

        game_board = {}
        sum_board = {}

        nfsp_players = {}
        for i in range(num_NFSP):
            nfsp_players[i] = NFSPLimit(card_dict=card_dictionary,
                                        device=device)
            game_board[i] = 20000
            sum_board[i] = 0
        random_players = {}
        for i in range(num_player - num_NFSP):
            random_players[i + num_NFSP] = simpleAgent()
            game_board[i + num_NFSP] = 20000
            sum_board[i + num_NFSP] = 0

        for i_episode in range(25000):
            betting_state = np.zeros((num_player, 4, 5, 3))
            # print('-------------Playing Game:{}------------'.format(i_episode))
            (player_states, (community_infos, community_cards)) = env.reset()
            (player_infos, player_hands) = zip(*player_states)
            for i in range(num_NFSP):
                nfsp_players[i].setInitState(
                    betting_state,
                    toCardState(community_cards, player_hands[i],
                                card_dictionary))

            current_round = 0
            terminal = False
            # if i_episode % 1000 == 0:
            #     print(i_episode)
            #     env.render()
            while not terminal:
                current_player = community_infos[-1]
                current_round = community_infos[5]
                current_raises = community_infos[6]
                current_pot = community_infos[3]
                if current_player in nfsp_players.keys():
                    card_embed = toCardState(community_cards,
                                             player_hands[current_player],
                                             card_dictionary)
                    nfsp_players[current_player].setState(
                        nfsp_players[current_player].toStateRepre(
                            betting_state, card_embed))
                    action, f_rl = nfsp_players[current_player].act(
                        sl_net=sl_net, policy_net=policy_net)
                    action_c = correctLimitAction(action.item(),
                                                  community_infos,
                                                  player_infos, num_player)
                    actions = toLimitContinuesAction(action_c, community_infos,
                                                     player_infos, num_player)
                else:
                    if type_of_eval == 'call':
                        actions = safe_actions(community_infos,
                                               player_infos,
                                               n_seats=num_player)
                    else:
                        actions = simple_heuristic(
                            community_infos,
                            player_infos,
                            n_seats=num_player,
                            community_cards=community_cards,
                            player_hands=player_hands)
                    action_c = toLimitDiscreteAction(current_player,
                                                     current_pot, player_infos,
                                                     actions)

                action = torch.tensor([[action_c]],
                                      dtype=torch.long).to(device)
                # take actions
                (player_states,
                 (community_infos,
                  community_cards)), rews, terminal, info = env.step(actions)
                (player_infos, player_hands) = zip(*player_states)
                # if i_episode % 1000 == 0:
                # env.render()
                if terminal:
                    # set None state
                    for i in range(num_NFSP):
                        nfsp_players[i].reset()
                    for i in range(num_player - num_NFSP):
                        random_players[i + num_NFSP].reset()
                else:
                    # not terminal
                    if current_player in nfsp_players.keys():
                        betting_state = toLimitBettingState(
                            betting_state, current_round, current_raises,
                            current_player, action)
                        nfsp_players[current_player].setState(
                            nfsp_players[current_player].toStateRepre(
                                betting_state, card_embed))
                    else:
                        betting_state = toLimitBettingState(
                            betting_state, current_round, current_raises,
                            current_player, action)

                    if current_round != community_infos[5]:
                        for i in range(num_NFSP):
                            nfsp_players[i].reset()
                        for i in range(num_player - num_NFSP):
                            random_players[i + num_NFSP].reset()

            # record
            for player_id in range(num_player):
                sum_board[player_id] += player_infos[player_id][
                    2] - game_board[player_id]
                game_board[player_id] = player_infos[player_id][2]

            # reset all stacks to 20000 if any player's stack falls to 100 or below
            lost_players = [
                p for p in env._seats if not p.emptyplayer and p.stack <= 100
            ]
            if lost_players:
                for p in range(num_player):
                    env.remove_player(p)
                    env.add_player(p)
                    game_board[p] = 20000

            if (i_episode + 1) % 1000 == 0:
                with open(
                        'log_' + str(num_player) + 'players_' + str(num_hid) +
                        'hid_' + str(num_layer) + 'layer_' + str(use_res_net) +
                        'res' + str(iteration) + 'nfsp' + str(type_of_eval) +
                        str(experiment) + '.txt', 'a+') as f:
                    line = [
                        str(sum_board[p] / (i_episode + 1))
                        for p in range(num_player)
                    ]
                    line = ','.join([str(i_episode + 1)] + line)
                    f.write(line + '\n')
        results.append(sum_board[0] / (i_episode + 1))
    with open(
            str(num_player) + 'players_' + str(num_hid) + 'hid_' +
            str(num_layer) + 'layer_' + str(use_res_net) + 'res' + 'nfsp' +
            str(type_of_eval) + 'results.txt', 'a+') as f:
        f.write(','.join(
            [str(iteration),
             str(np.mean(results)),
             str(np.std(results))]) + '\n')
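
A hypothetical call site for evaluate, reusing the names from its signature (argument values are illustrative only):

# 'call' pits the NFSP agents against call-only opponents (safe_actions);
# any other value falls through to simple_heuristic
evaluate(policy_net, rl_optimizer, sl_net, sl_optimizer,
         steps_done, iteration, type_of_eval='call')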