from agent.simpleAgent import simpleAgent
from agent.NFSPLimit import NFSPLimit
import gym
import holdem
import torch
from holdem.env import LimitTexasHoldemEnv
from holdem.utils import (toLimitDiscreteAction, toLimitContinuesAction,
                          correctLimitAction, get_card_dict, toCardState,
                          toLimitBettingState, random_actions,
                          uniform_random_actions, fold_actions, safe_actions)
import numpy as np

num_player = 4
num_NFSP = 2
big = False
hid = 64
num_versions = 91
checkpoint_dir = '4players_64unit/4players_64hid_checkpoints'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# initialize a 4-player limit hold'em game
env = LimitTexasHoldemEnv(num_player, max_limit=1e9, debug=False)
env.add_player(0, stack=20000)  # add a player to seat 0 with 20000 chips
env.add_player(1, stack=20000)  # add a player to seat 1 with 20000 chips
env.add_player(2, stack=20000)  # add a player to seat 2 with 20000 chips
env.add_player(3, stack=20000)  # add a player to seat 3 with 20000 chips
card_dictionary = get_card_dict()

# evaluate every saved checkpoint version (one checkpoint per 1000 training iterations)
for epoch in range(num_versions):
    rl_dir = checkpoint_dir + '/rl_checkpoints/checkpoint_' + str(epoch + 1) + '000.pt'
    sl_dir = checkpoint_dir + '/sl_checkpoints/checkpoint_' + str(epoch + 1) + '000.pt'
    # policy_net = DQN(num_player=num_player, big=False).to(device)
    # DQN_limit is assumed to be defined/imported elsewhere in the project; the
    # original constructor call is truncated here, so the trailing hidden-size
    # argument is an assumption.
    policy_net = DQN_limit(num_player=num_player, big=big, num_action=3,
                           num_hid=hid).to(device)
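    # The remainder of this driver loop is not shown in the source. A minimal sketch
    # of how it presumably continues, based on the evaluate() helper defined below:
    # the average-policy (SL) network mirrors policy_net, both checkpoints are loaded,
    # and one evaluation run is launched per version. The SL-network construction,
    # the choice of Adam optimizers, and the checkpoint format (a plain state_dict)
    # are assumptions.
    sl_net = DQN_limit(num_player=num_player, big=big, num_action=3,
                       num_hid=hid).to(device)
    policy_net.load_state_dict(torch.load(rl_dir, map_location=device))
    sl_net.load_state_dict(torch.load(sl_dir, map_location=device))
    rl_optimizer = torch.optim.Adam(policy_net.parameters())
    sl_optimizer = torch.optim.Adam(sl_net.parameters())
    evaluate(policy_net, rl_optimizer, sl_net, sl_optimizer,
             steps_done=0, iteration=epoch + 1, type_of_eval='call')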
def evaluate(policy_net, rl_optimizer, sl_net, sl_optimizer, steps_done,
             iteration, type_of_eval):
    # fresh 4-player limit hold'em game for evaluation
    env = LimitTexasHoldemEnv(num_player, max_limit=1e9, debug=False)
    env.add_player(0, stack=20000)  # add a player to seat 0 with 20000 chips
    env.add_player(1, stack=20000)  # add a player to seat 1 with 20000 chips
    env.add_player(2, stack=20000)  # add a player to seat 2 with 20000 chips
    env.add_player(3, stack=20000)  # add a player to seat 3 with 20000 chips

    results = []
    for experiment in range(1):
        game_board = {}
        sum_board = {}

        # NFSP players occupy the first num_NFSP seats
        nfsp_players = {}
        for i in range(num_NFSP):
            nfsp_players[i] = NFSPLimit(card_dict=card_dictionary, device=device)
            game_board[i] = 20000
            sum_board[i] = 0

        # the remaining seats are filled with baseline opponents
        random_players = {}
        for i in range(num_player - num_NFSP):
            random_players[i + num_NFSP] = simpleAgent()
            game_board[i + num_NFSP] = 20000
            sum_board[i + num_NFSP] = 0

        for i_episode in range(25000):
            betting_state = np.zeros((num_player, 4, 5, 3))
            # print('-------------Playing Game:{}------------'.format(i_episode))
            (player_states, (community_infos, community_cards)) = env.reset()
            (player_infos, player_hands) = zip(*player_states)
            for i in range(num_NFSP):
                nfsp_players[i].setInitState(
                    betting_state,
                    toCardState(community_cards, player_hands[i], card_dictionary))
            current_round = 0
            terminal = False
            # if i_episode % 1000 == 0:
            #     print(i_episode)
            #     env.render()
            while not terminal:
                current_player = community_infos[-1]   # seat to act
                current_round = community_infos[5]     # current betting round
                current_raises = community_infos[6]    # raises so far this round
                current_pot = community_infos[3]       # current pot size

                if current_player in nfsp_players.keys():
                    card_embed = toCardState(community_cards,
                                             player_hands[current_player],
                                             card_dictionary)
                    nfsp_players[current_player].setState(
                        nfsp_players[current_player].toStateRepre(
                            betting_state, card_embed))
                    action, f_rl = nfsp_players[current_player].act(
                        sl_net=sl_net, policy_net=policy_net)
                    action_c = correctLimitAction(action.item(), community_infos,
                                                  player_infos, num_player)
                    actions = toLimitContinuesAction(action_c, community_infos,
                                                     player_infos, num_player)
                else:
                    if type_of_eval == 'call':
                        actions = safe_actions(community_infos, player_infos,
                                               n_seats=num_player)
                    else:
                        # simple_heuristic is an opponent policy defined elsewhere
                        # in this project
                        actions = simple_heuristic(community_infos, player_infos,
                                                   n_seats=num_player,
                                                   community_cards=community_cards,
                                                   player_hands=player_hands)
                    action_c = toLimitDiscreteAction(current_player, current_pot,
                                                     player_infos, actions)
                    action = torch.tensor([[action_c]],
                                          dtype=torch.long).to(device)

                # take actions
                ((player_states, (community_infos, community_cards)),
                 rews, terminal, info) = env.step(actions)
                (player_infos, player_hands) = zip(*player_states)
                # if i_episode % 1000 == 0:
                #     env.render()

                if terminal:
                    # set None state
                    for i in range(num_NFSP):
                        nfsp_players[i].reset()
                    for i in range(num_player - num_NFSP):
                        random_players[i + num_NFSP].reset()
                else:
                    # not terminal
                    if current_player in nfsp_players.keys():
                        betting_state = toLimitBettingState(
                            betting_state, current_round, current_raises,
                            current_player, action)
                        nfsp_players[current_player].setState(
                            nfsp_players[current_player].toStateRepre(
                                betting_state, card_embed))
                    else:
                        betting_state = toLimitBettingState(
                            betting_state, current_round, current_raises,
                            current_player, action)
                    if current_round != community_infos[5]:
                        # the betting round advanced
                        for i in range(num_NFSP):
                            nfsp_players[i].reset()
                        for i in range(num_player - num_NFSP):
                            random_players[i + num_NFSP].reset()

            # record each player's winnings for this hand
            for player_id in range(num_player):
                sum_board[player_id] += (player_infos[player_id][2]
                                         - game_board[player_id])
                game_board[player_id] = player_infos[player_id][2]

            # reset players to 20000 if anyone's stack is down to 100
            lost_players = [
                p for p in env._seats if not p.emptyplayer and p.stack <= 100
            ]
            if lost_players:
                for p in range(num_player):
                    env.remove_player(p)
                    env.add_player(p)
                    game_board[p] = 20000

            if (i_episode + 1) % 1000 == 0:
                with open(
                        'log_' + str(num_player) + 'players_' + str(num_hid) +
                        'hid_' + str(num_layer) + 'layer_' + str(use_res_net) +
                        'res' + str(iteration) + 'nfsp' + str(type_of_eval) +
                        str(experiment) + '.txt', 'a+') as f:
                    line = [
                        str(sum_board[p] / (i_episode + 1))
                        for p in range(num_player)
                    ]
                    line = ','.join([str(i_episode + 1)] + line)
                    f.write(line + '\n')

        results.append(sum_board[0] / (i_episode + 1))

    with open(
            str(num_player) + 'players_' + str(num_hid) + 'hid_' +
            str(num_layer) + 'layer_' + str(use_res_net) + 'res' + 'nfsp' +
            str(type_of_eval) + 'results.txt', 'a+') as f:
        f.write(','.join(
            [str(iteration),
             str(np.mean(results)),
             str(np.std(results))]) + '\n')
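# Usage: type_of_eval selects how the non-NFSP seats play — 'call' uses
# safe_actions (check/call), any other value uses the simple_heuristic policy.
# A sketch of a single-version run (assuming the networks and optimizers have
# already been constructed and loaded as in the driver loop above; the value
# 'heuristic' is only an illustrative placeholder, since any non-'call' string
# triggers the heuristic branch):
#
#   evaluate(policy_net, rl_optimizer, sl_net, sl_optimizer,
#            steps_done=0, iteration=1, type_of_eval='call')       # vs. check/call opponents
#   evaluate(policy_net, rl_optimizer, sl_net, sl_optimizer,
#            steps_done=0, iteration=1, type_of_eval='heuristic')  # vs. simple_heuristic opponents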
from agent.NFSPAgent import NFSPAgent
from agent.simpleAgent import simpleAgent
from agent.NFSPLimit import NFSPLimit
import gym
import holdem
import torch
from holdem.env import LimitTexasHoldemEnv
from holdem.utils import (hand_to_str, toLimitDiscreteAction,
                          toLimitContinuesAction, correctLimitAction,
                          get_card_dict, toCardState, toLimitBettingState,
                          random_actions, uniform_random_actions, fold_actions,
                          safe_actions)
import numpy as np
from treys import Card, Deck, Evaluator

num_player = 4
num_NFSP = 4
hid = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# initialize a 4-player limit hold'em game
env = LimitTexasHoldemEnv(num_player, max_limit=1e9, debug=False)
env.add_player(0, stack=20000)  # add a player to seat 0 with 20000 chips
env.add_player(1, stack=20000)  # add a player to seat 1 with 20000 chips
env.add_player(2, stack=20000)  # add a player to seat 2 with 20000 chips
env.add_player(3, stack=20000)  # add a player to seat 3 with 20000 chips
card_dictionary = get_card_dict()

checkpoint_dir = '4players_64unit/4players_64hid_checkpoints'
version = 73

# each NFSP seat gets its own networks, loaded from consecutive checkpoint versions
policy_net = {}
sl_net = {}
rl_dir = {}
sl_dir = {}
for i in range(num_NFSP):
    rl_dir[i] = checkpoint_dir + '/rl_checkpoints/checkpoint_' + str(
        version + 1 + i) + '000.pt'
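    # The loop body is truncated in the source. A sketch of the presumed
    # remainder, mirroring the checkpoint layout and network construction used
    # in the single-version evaluation script; the DQN_limit hidden-size
    # argument, the big=False flag, and the plain state_dict checkpoint format
    # are assumptions.
    sl_dir[i] = checkpoint_dir + '/sl_checkpoints/checkpoint_' + str(
        version + 1 + i) + '000.pt'
    policy_net[i] = DQN_limit(num_player=num_player, big=False, num_action=3,
                              num_hid=hid).to(device)
    sl_net[i] = DQN_limit(num_player=num_player, big=False, num_action=3,
                          num_hid=hid).to(device)
    policy_net[i].load_state_dict(torch.load(rl_dir[i], map_location=device))
    sl_net[i].load_state_dict(torch.load(sl_dir[i], map_location=device))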
GAMMA = 1
POLICY_UPDATE = 128
TARGET_UPDATE = 128 * 300
SAVE_INTERVAL = 5000
big = False
use_res_net = False
num_hid = 64
num_layer = 1
num_NFSP = 1

# init objects
save_dir = str(num_player) + 'players_' + str(num_hid) + 'hid_' + str(
    num_layer) + 'layer_' + str(use_res_net) + 'res' + '_checkpoints'
log_dir = 'log'

# environment
env = LimitTexasHoldemEnv(num_player, max_limit=1e5, debug=False)

# game board: game_board holds each seat's stack after the previous hand,
# sum_board the cumulative winnings
game_board = {}
sum_board = {}
for p in range(num_player):
    env.add_player(p, stack=20000)
    game_board[p] = 20000
    sum_board[p] = 0

# cards to index
card_dictionary = get_card_dict()

# players
players = {}
for i in range(num_player):
    players[i] = NFSPLimit(card_dict=card_dictionary, device=device)
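# A minimal sketch of how these constants presumably drive the NFSP training
# loop. The actual driver is not part of this fragment, so the schedule below
# is an assumption rather than this project's code:
#
#   steps_done = 0
#   for i_episode in range(num_episodes):
#       # ... play one hand with `players`, storing RL transitions and SL (state, action) pairs ...
#       # steps_done advances once per agent decision
#       if steps_done % POLICY_UPDATE == 0:
#           pass  # one gradient step on the Q-network (RL) and average-policy network (SL)
#       if steps_done % TARGET_UPDATE == 0:
#           pass  # copy Q-network weights into the target network (every 300 policy updates)
#       if (i_episode + 1) % SAVE_INTERVAL == 0:
#           pass  # write rl/sl checkpoints under save_dir
#
# GAMMA = 1 means returns within a hand are undiscounted.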