# param.grad.data.clamp_(-1, 1) rl_optimizer.step() sl_optimizer.step() steps_done = 0 iteration = 0 for i_episode in range(2000000): betting_state = np.zeros((num_player, 4, 3, 15)) # observe states (player_states, (community_infos, community_cards)) = env.reset() (player_infos, player_hands) = zip(*player_states) for i in range(num_player): players[i].setInitState( betting_state, toCardState(community_cards, player_hands[i], card_dictionary)) current_round = 0 terminal = False while not terminal: # env.render() current_player = community_infos[-1] current_round = community_infos[5] current_pot = community_infos[3] card_embed = toCardState(community_cards, player_hands[current_player], card_dictionary) # setstate players[current_player].setState(players[current_player].toStateRepre( betting_state, card_embed)) # set epsilon players[current_player].setEpsilon(i_episode) # take action
# Debug / inspection snippet: load pre-trained supervised (SL) checkpoints,
# then interactively print the SL policy and Q-network outputs for randomly
# drawn pre-flop hands (no community cards, no betting history).
# NOTE(review): starts mid-loop — `i` and `sl_dir` are defined above this chunk.
checkpoint = torch.load(sl_dir[i], map_location='cpu')
sl_net[i].load_state_dict(checkpoint['model'])
sl_net[i].eval()  # inference mode
# anticipatoryFactor=1 means the agent would always follow the SL policy.
nfsp_players = NFSPLimit(card_dict=card_dictionary, device=device,
                         anticipatoryFactor=1)
for i in range(100000):
    deck = Deck()
    community = [-1, -1, -1, -1, -1]  # no community cards dealt (-1 = empty slot)
    # community = deck.draw(3) # 3 cards
    hands = deck.draw(2)  # two private hole cards
    betting_state = np.zeros((num_player, 4, 5, 3))  # no history
    card_embed = toCardState(community, hands, card_dictionary)
    # Network input = flattened betting history concatenated with the card
    # embedding (same layout the training code feeds the networks).
    betting_state_flatten = betting_state.flatten()
    state = np.concatenate((betting_state_flatten, card_embed))
    sl = sl_net[0](torch.tensor(state, dtype=torch.float)).detach().numpy()
    q = policy_net[0](torch.tensor(state, dtype=torch.float)).detach().numpy()
    print(hand_to_str(hands))
    # if sl[1] > 0 or q[1] > 0:
    # if q[1] > 0.01:
    #     print(sl)
    #     print(q)
    #     print(hand_to_str(community))
    #     print(hand_to_str(hands))
    print(q)
    input()  # pause so each hand's output can be inspected before continuing
# for i_episode in range(25000):
# NFSP self-play fragment: every NFSP seat plays 25000 hands against the
# others, each seat using its OWN sl_net / policy_net pair.
# NOTE(review): this chunk is truncated — the while-loop body is cut off right
# after the action is corrected; env.step and the bookkeeping are not visible.
# Relies on an `env` constructed earlier in the file.
game_board = {}  # per-seat stack at the start of the current hand
sum_board = {}   # per-seat cumulative winnings
nfsp_players = {}
for i in range(num_NFSP):
    nfsp_players[i] = NFSPLimit(card_dict=card_dictionary, device=device)
    game_board[i] = 20000
    sum_board[i] = 0
for i_episode in range(25000):
    betting_state = np.zeros((num_player, 4, 5, 3))  # empty betting history
    # print('-------------Playing Game:{}------------'.format(i_episode))
    (player_states, (community_infos, community_cards)) = env.reset()
    (player_infos, player_hands) = zip(*player_states)
    for i in range(num_NFSP):
        nfsp_players[i].setInitState(
            betting_state,
            toCardState(community_cards, player_hands[i], card_dictionary))
    current_round = 0
    terminal = False
    if i_episode % 1000 == 0:
        # periodic progress print plus a board render
        print(i_episode)
        env.render()
    while not terminal:
        current_player = community_infos[-1]  # index of the seat to act
        current_round = community_infos[5]    # betting round indicator
        current_raises = community_infos[6]   # raises so far (per naming)
        current_pot = community_infos[3]      # pot size (per naming)
        card_embed = toCardState(community_cards,
                                 player_hands[current_player], card_dictionary)
        nfsp_players[current_player].setState(
            nfsp_players[current_player].toStateRepre(betting_state, card_embed))
        # Each seat acts with its own network pair (unlike evaluate(), which
        # shares a single sl_net/policy_net across all NFSP seats).
        action, follow_rl = nfsp_players[current_player].act(
            sl_net=sl_net[current_player],
            policy_net=policy_net[current_player])
        # Presumably maps the raw discrete choice to a legal limit action —
        # TODO confirm against correctLimitAction's definition.
        action_c = correctLimitAction(action.item(), community_infos,
                                      player_infos, num_player)
def evaluate(policy_net, rl_optimizer, sl_net, sl_optimizer, steps_done,
             iteration, type_of_eval):
    """Evaluate trained NFSP agents against baseline opponents.

    Plays 25000 hands of limit Texas Hold'em in which the first ``num_NFSP``
    seats are NFSP agents (all sharing the given ``sl_net``/``policy_net``)
    and the remaining seats are scripted baselines.  Appends running average
    winnings per seat to a log file every 1000 hands, and a mean/std summary
    line to a results file at the end.

    Args:
        policy_net: Q-network used by the NFSP agents' best-response policy.
        rl_optimizer: unused in this function; kept for signature
            compatibility with the training code.
        sl_net: supervised (average-policy) network used by the NFSP agents.
        sl_optimizer: unused in this function (see rl_optimizer).
        steps_done: unused in this function (see rl_optimizer).
        iteration: training-iteration tag embedded in the output filenames.
        type_of_eval: 'call' pits the agents against always-call opponents
            (safe_actions); any other value uses simple_heuristic opponents.

    NOTE(review): relies on module-level globals (num_player, num_NFSP,
    card_dictionary, device, num_hid, num_layer, use_res_net) and on helper
    functions defined elsewhere in this file.
    """
    # Fresh table; 4 seats are added below, each starting with 20000 chips.
    # (The original comments said "3-player game" / "2000 chips" — both wrong.)
    env = LimitTexasHoldemEnv(num_player, max_limit=1e9, debug=False)
    env.add_player(0, stack=20000)  # add a player to seat 0 with 20000 chips
    env.add_player(1, stack=20000)  # add a player to seat 1 with 20000 chips
    env.add_player(2, stack=20000)  # add a player to seat 2 with 20000 chips
    env.add_player(3, stack=20000)  # add a player to seat 3 with 20000 chips
    results = []
    for expriment in range(1):  # [sic] single experiment; loop kept for repeats
        # Per-seat bookkeeping: game_board = stack at the start of the current
        # hand; sum_board = cumulative winnings across all hands so far.
        game_board = {}
        sum_board = {}
        nfsp_players = {}
        for i in range(num_NFSP):
            nfsp_players[i] = NFSPLimit(card_dict=card_dictionary, device=device)
            game_board[i] = 20000
            sum_board[i] = 0
        # Remaining seats get the baseline opponent.
        random_players = {}
        for i in range(num_player - num_NFSP):
            random_players[i + num_NFSP] = simpleAgent()
            game_board[i + num_NFSP] = 20000
            sum_board[i + num_NFSP] = 0
        for i_episode in range(25000):
            # Empty betting-history tensor for the new hand; presumably
            # (player, round, raise-slot, action) — TODO confirm dims (4, 5, 3).
            betting_state = np.zeros((num_player, 4, 5, 3))
            # print('-------------Playing Game:{}------------'.format(i_episode))
            (player_states, (community_infos, community_cards)) = env.reset()
            (player_infos, player_hands) = zip(*player_states)
            for i in range(num_NFSP):
                nfsp_players[i].setInitState(
                    betting_state,
                    toCardState(community_cards, player_hands[i],
                                card_dictionary))
            current_round = 0
            terminal = False
            # if i_episode % 1000 == 0:
            #     print(i_episode)
            #     env.render()
            while not terminal:
                current_player = community_infos[-1]  # seat whose turn it is
                current_round = community_infos[5]    # betting round indicator
                current_raises = community_infos[6]   # raises so far (per naming)
                current_pot = community_infos[3]      # pot size (per naming)
                if current_player in nfsp_players.keys():
                    # NFSP seat: build the state, let the agent pick a discrete
                    # action, then convert it into the env's action format.
                    card_embed = toCardState(community_cards,
                                             player_hands[current_player],
                                             card_dictionary)
                    nfsp_players[current_player].setState(
                        nfsp_players[current_player].toStateRepre(
                            betting_state, card_embed))
                    action, f_rl = nfsp_players[current_player].act(
                        sl_net=sl_net, policy_net=policy_net)
                    action_c = correctLimitAction(action.item(),
                                                  community_infos,
                                                  player_infos, num_player)
                    actions = toLimitContinuesAction(action_c, community_infos,
                                                     player_infos, num_player)
                else:
                    # Baseline seat: always-call or simple heuristic opponent.
                    if type_of_eval == 'call':
                        actions = safe_actions(community_infos, player_infos,
                                               n_seats=num_player)
                    else:
                        actions = simple_heuristic(
                            community_infos, player_infos, n_seats=num_player,
                            community_cards=community_cards,
                            player_hands=player_hands)
                    # Map the env-format action back to the discrete code so
                    # the betting history can be updated uniformly below.
                    action_c = toLimitDiscreteAction(current_player,
                                                     current_pot, player_infos,
                                                     actions)
                    action = torch.tensor([[action_c]],
                                          dtype=torch.long).to(device)
                # take actions
                (player_states, (community_infos, community_cards)), rews, terminal, info = env.step(actions)
                (player_infos, player_hands) = zip(*player_states)
                # if i_episode % 1000 == 0:
                #     env.render()
                if terminal:
                    # set None state: clear every agent's per-hand state
                    for i in range(num_NFSP):
                        nfsp_players[i].reset()
                    for i in range(num_player - num_NFSP):
                        random_players[i + num_NFSP].reset()
                else:
                    # not terminal: fold the just-taken action into the
                    # shared betting history.
                    if current_player in nfsp_players.keys():
                        betting_state = toLimitBettingState(
                            betting_state, current_round, current_raises,
                            current_player, action)
                        # card_embed here is the pre-step embedding computed
                        # above in this iteration.
                        nfsp_players[current_player].setState(
                            nfsp_players[current_player].toStateRepre(
                                betting_state, card_embed))
                    else:
                        betting_state = toLimitBettingState(
                            betting_state, current_round, current_raises,
                            current_player, action)
                    # Betting round advanced: reset per-round agent state.
                    if current_round != community_infos[5]:
                        for i in range(num_NFSP):
                            nfsp_players[i].reset()
                        for i in range(num_player - num_NFSP):
                            random_players[i + num_NFSP].reset()
            # record: winnings this hand = stack delta since the hand started
            # (player_infos[p][2] is read as the seat's stack — TODO confirm)
            for player_id in range(num_player):
                sum_board[player_id] += player_infos[player_id][
                    2] - game_board[player_id]
                game_board[player_id] = player_infos[player_id][2]
            # reset players to 20000 if anyone's stack is down to 100
            lost_players = [
                p for p in env._seats if not p.emptyplayer and p.stack <= 100
            ]
            if lost_players:
                for p in range(num_player):
                    env.remove_player(p)
                    env.add_player(p)
                    game_board[p] = 20000
            # Every 1000 hands, append the running per-seat average winnings.
            if (i_episode + 1) % 1000 == 0:
                with open(
                        'log_' + str(num_player) + 'players_' + str(num_hid) +
                        'hid_' + str(num_layer) + 'layer_' +
                        str(use_res_net) + 'res' + str(iteration) + 'nfsp' +
                        str(type_of_eval) + str(expriment) + '.txt',
                        'a+') as f:
                    line = [
                        str(sum_board[p] / (i_episode + 1))
                        for p in range(num_player)
                    ]
                    line = ','.join([str(i_episode + 1)] + line)
                    f.write(line + '\n')
        # Seat 0's average winnings per hand for this experiment.
        results.append(sum_board[0] / (i_episode + 1))
    # Summary across experiments: mean and std of seat-0 average winnings.
    with open(
            str(num_player) + 'players_' + str(num_hid) + 'hid_' +
            str(num_layer) + 'layer_' + str(use_res_net) + 'res' + 'nfsp' +
            str(type_of_eval) + 'results.txt', 'a+') as f:
        f.write(','.join(
            [str(iteration), str(np.mean(results)), str(np.std(results))]) + '\n')