def test_best_response_cfr_one_card_poker():
    """Check that a best response can be computed after a short CFR run.

    Runs 10 iterations of CFR (without chance sampling) on a 4-card One Card
    Poker game and asserts that the resulting strategy has strictly positive
    exploitability.
    """
    poker = OneCardPoker.create_game(n_cards=4)

    # cfr returns three values; only the final strategy is needed here.
    final_strategy, _exploitabilities, _strategies = cfr(
        poker, num_iters=10, use_chance_sampling=False)

    measured = compute_exploitability(poker, final_strategy)
    print("Exploitability: {}".format(measured))
    assert measured > 0.0
def external_sampling_cfr(game: extensive_game.ExtensiveGame, num_iters: int = 1000):
    """Drive external_sampling_cfr_recursive for a fixed number of iterations.

    Every iteration traverses from the game root once per player, snapshots
    the updated strategy, and folds that snapshot into a running average.
    Every 200 iterations the exploitability of the average strategy is
    computed, recorded and printed.

    Args:
        game: ExtensiveGame.
        num_iters: int. The number of iterations of CFR to perform.

    Returns:
        average_strategy: the strategy computed from the running average.
        exploitabilities: list of (t, exploitability) pairs, one per
            evaluation step.
        strategies: list holding the strategy snapshot taken at the end of
            each iteration.
    """
    # A single regret table is shared by both players: keys are information
    # sets (which already encode the player) and values map each available
    # action to its counterfactual regret.
    regrets = dict()

    # `current_strategy` is the strategy at time t, `next_strategy` the one
    # being built for time t + 1.
    current_strategy = extensive_game.Strategy.initialise()
    next_strategy = extensive_game.Strategy.initialise()

    traversal_state = cfr_util.CFRState()
    running_average = cfr_util.AverageStrategy(game)

    strategies = []
    exploitabilities = []

    for t in range(num_iters):
        for player in (1, 2):
            external_sampling_cfr_recursive(
                game, game.root, player, regrets,
                current_strategy, next_strategy, traversal_state)

        # Promote the freshly built strategy and record it.
        current_strategy = next_strategy.copy()
        strategies.append(current_strategy)
        cfr_util.update_average_strategy(game, running_average, current_strategy)

        # Evaluation only happens on every 200th iteration.
        if t % 200 != 0:
            continue

        exploitability = best_response.compute_exploitability(
            game, running_average.compute_strategy())
        exploitabilities.append((t, exploitability))

        report = "t: {}, nodes touched: {}, exploitability: {:.3f} mbb/h".format(
            t, traversal_state.nodes_touched, exploitability * 1000)
        print(report)

    return running_average.compute_strategy(), exploitabilities, strategies
def cfr(game, num_iters=10000, use_chance_sampling=True):
    """Run CFR on `game` and track the exploitability of the average strategy.

    Args:
        game: the game to solve. Must expose `root` and
            `complete_strategy_uniformly`.
        num_iters: int. The number of CFR iterations to perform.
        use_chance_sampling: bool. Forwarded unchanged to cfr_recursive.

    Returns:
        average_strategy: the result of compute_average_strategy after the
            last iteration, or None if num_iters is 0.
        exploitabilities: list of (t, exploitability) pairs, recorded every
            1000 iterations.
    """
    # regrets is a dictionary where the keys are the information sets and
    # values are dictionaries from actions available in that information set
    # to the counterfactual regret for not playing that action in that
    # information set. Since information sets encode the player, we only
    # require one dictionary.
    regrets = dict()

    # Similarly, action_counts is a dictionary with keys the information sets
    # and values dictionaries from actions to action counts.
    action_counts = dict()

    # strategy_t holds the strategy at time t; similarly strategy_t_1 holds
    # the strategy at time t + 1.
    strategy_t = dict()
    strategy_t_1 = dict()

    average_strategy = None
    exploitabilities = []

    for t in range(num_iters):
        for i in [1, 2]:
            cfr_recursive(game, game.root, i, t, 1.0, 1.0, 1.0, regrets,
                          action_counts, strategy_t, strategy_t_1,
                          use_chance_sampling=use_chance_sampling)

        average_strategy = compute_average_strategy(action_counts)

        # strategy_t_1 is updated inside cfr_recursive. Take a copy so that
        # strategy_t stays fixed while strategy_t_1 keeps being modified on
        # the next iteration.
        strategy_t = strategy_t_1.copy()

        # Compute the exploitability of the strategy.
        if t % 1000 == 0:
            # The average strategy is completed uniformly before it is
            # handed to the best-response computation.
            completed_strategy = game.complete_strategy_uniformly(
                average_strategy)
            exploitability = best_response.compute_exploitability(
                game, completed_strategy)
            exploitabilities.append((t, exploitability))
            print("t: {}, exploitability: {}".format(t, exploitability))

    return average_strategy, exploitabilities
def test_best_response_cfr():
    """Check that a best response can be computed after 10 CFR iterations on Leduc.

    Builds a deck of 3 values x 2 suits, runs CFR without chance sampling and
    asserts that the resulting strategy has strictly positive exploitability.
    """
    deck = []
    for value in range(3):
        for suit in range(2):
            deck.append(Card(value, suit))
    leduc_game = Leduc(deck)

    # cfr returns three values; only the final strategy is needed here.
    final_strategy, _exploitabilities, _strategies = cfr(
        leduc_game, num_iters=10, use_chance_sampling=False)

    measured = compute_exploitability(leduc_game, final_strategy)
    print("Exploitability: {}".format(measured))
    assert measured > 0.0
def compute_agent_exploitability(agent: Agent, sess: tf.Session, game: NFSPGame):
    """Return the exploitability of the strategy the agent currently plays.

    The agent is queried for its strategy over the game's state vectors, and
    that strategy is then evaluated against the wrapped underlying game.

    Args:
        agent: Agent.
        sess: tensorflow session.
        game: NFSPGame.

    Returns:
        float. Exploitability of the agent's strategy.
    """
    # Query the agent first, then evaluate on the game held by the NFSP wrapper.
    agent_strategy = agent.get_strategy(sess, game._state_vectors)
    return compute_exploitability(game._game, agent_strategy)
default=3, type=int, help='In OneCardPoker or Leduc, pass the number of cards to use.') parser.add_argument('--num_suits', default=2, type=int, help='In Leduc, pass the number of suits to use.') args = parser.parse_args() if args.game == 'Leduc': print("Solving Leduc Hold'em") cards = card.get_deck(num_values=args.num_values, num_suits=args.num_suits) n_game = leduc.create_neural_leduc(cards) elif args.game == 'RockPaperScissors': print("Solving rock paper scissors") n_game = rock_paper_scissors.create_neural_rock_paper_scissors() strategy, exploitabilities = deep_cfr.deep_cfr( n_game, num_iters=args.num_iters, num_traversals=args.num_traversals, advantage_maxlen=args.advantage_maxlen, strategy_maxlen=args.strategy_maxlen, batch_size=args.batch_size, num_sgd_updates=args.num_sgd_updates) exploitability = compute_exploitability(n_game.extensive_game, strategy) print("Exploitability of strategy: {}".format(exploitability))
def deep_cfr(n_game: neural_game.NeuralGame,
             num_iters: int=100,
             num_traversals: int=10000,
             advantage_maxlen: int=1000000,
             strategy_maxlen: int=1000000,
             batch_size: int=1024,
             num_sgd_updates: int=100):
    """Run Deep CFR on a neural game, logging exploitability every iteration.

    For each iteration and each player, the game tree is traversed
    `num_traversals` times to fill the memories, then that player's network
    is trained on its advantage memory. After both players have been
    processed, the mean strategy is computed from the strategy memory and its
    exploitability is evaluated and written as a summary.

    Args:
        n_game: NeuralGame. Unpacked into
            (game, action_indexer, info_set_vectoriser).
        num_iters: int. The number of iterations to run deep CFR for. Must be
            at least 1; otherwise the returned values are never assigned.
        num_traversals: int. The number of traversals per CFR iteration.
        advantage_maxlen: int. The maximum length of the advantage memories.
        strategy_maxlen: int. The maximum length of the strategy memory.
        batch_size: int. The batch size to use in training.
        num_sgd_updates: int. The number of sgd updates per training.

    Returns:
        strategy, exploitability: the mean strategy and its exploitability as
        computed on the final iteration. The returned strategy is the raw
        mean strategy, not the uniformly completed one.
    """
    game, action_indexer, info_set_vectoriser = n_game

    advantage_memory1 = buffer.Reservoir(maxlen=advantage_maxlen)
    advantage_memory2 = buffer.Reservoir(maxlen=advantage_maxlen)
    strategy_memory = buffer.Reservoir(maxlen=strategy_maxlen)

    # Create summary tensors
    valid_summariser = util.TBSummariser(['exploitability'])

    # Each run gets its own timestamped (UTC) directory under experiments/.
    time_str = time.strftime("%Y-%m-%d-%H:%M:%S", time.gmtime())
    save_path = os.path.join('experiments', time_str)

    if not os.path.exists(save_path):
        print("Path doesn't exist, so creating: {}".format(save_path))
        os.makedirs(save_path)

    log_file = os.path.join(save_path, 'nfsp.log')
    print("Log file {}".format(log_file))
    print("To run tensorboard: tensorboard --logdir {}".format(
        os.path.join(os.getcwd(), save_path)))

    with tf.Session() as sess:
        network1 = DeepRegretNetwork(info_set_vectoriser.state_shape, action_indexer, 1)
        network1.set_sess(sess)
        network2 = DeepRegretNetwork(info_set_vectoriser.state_shape, action_indexer, 2)
        network2.set_sess(sess)

        network1.initialise()
        network2.initialise()

        tf_train_writer = tf.summary.FileWriter(
            os.path.join(save_path, 'train'), graph=sess.graph)

        # Close the writer on every exit path so that the validation summary
        # added after the last explicit flush() is still written out.
        try:
            # Iterate over players and do cfr traversals.
            for t in range(1, num_iters + 1):
                print("Iteration t = {}".format(t))
                for player in [1, 2]:
                    print("Player: {}".format(player))
                    print("Traversing")
                    for _ in tqdm(range(num_traversals)):
                        cfr_traverse(game, action_indexer, info_set_vectoriser,
                                     game.root, player,
                                     network1, network2,
                                     advantage_memory1, advantage_memory2,
                                     strategy_memory, t)

                    # Train the traversing player's network on the cfr
                    # traversals.
                    network = network1 if player == 1 else network2
                    # NOTE(review): initialise() is called again before each
                    # training run - presumably to retrain from scratch;
                    # confirm against DeepRegretNetwork.
                    network.initialise()
                    advantage_memory = advantage_memory1 if player == 1 else advantage_memory2
                    mean_loss = train_network(
                        network, advantage_memory, action_indexer,
                        info_set_vectoriser, t, tf_train_writer,
                        batch_size, num_sgd_updates)

                    print("Mean loss: {}".format(mean_loss))
                    tf_train_writer.flush()

                print("Advantage memory 1 length: {}".format(len(advantage_memory1)))
                print("Advantage memory 2 length: {}".format(len(advantage_memory2)))
                print("Strategy memory length: {}".format(len(strategy_memory)))

                mean_strategy = compute_mean_strategy(strategy_memory)
                if game.is_strategy_complete(mean_strategy):
                    exploitability = best_response.compute_exploitability(
                        game, mean_strategy)
                else:
                    print("Strategy not complete, filling uniformly.")
                    # Actually fill the missing information sets before
                    # evaluating; previously the incomplete strategy was
                    # passed through unchanged despite the message above.
                    exploitability = best_response.compute_exploitability(
                        game,
                        game.complete_strategy_uniformly(mean_strategy),
                    )
                print("Exploitability: {} mbb/h".format(exploitability * 1000))

                valid_summary = valid_summariser.summarise(
                    sess, {'exploitability': exploitability})
                tf_train_writer.add_summary(valid_summary, global_step=t)
        finally:
            tf_train_writer.close()

    # TODO(chrisn). Train the network on the strategy memory.
    return mean_strategy, exploitability
help='The dropout rate to use.') args = parser.parse_args() dropout_rate = None if args.dropout_rate: dropout_rate = float(args.dropout_rate) cards = get_deck(num_values=args.num_values, num_suits=args.num_suits) game = Leduc(cards) strategy, exploitabilities, strategies = cfr( game, num_iters=args.cfr_iters, use_chance_sampling=args.use_chance_sampling) exploitability = compute_exploitability(game, strategy) print("Exploitability of final strategy: {}".format(exploitability)) leduc_nfsp = LeducNFSP(cards) state_vectors = leduc_nfsp._state_vectors state_dim = leduc_nfsp.state_dim action_dim = leduc_nfsp.action_dim # Now build a network. layer_dims = [64, 64, 64] network = build_network(state_dim, action_dim, layer_dims, dropout_rate=dropout_rate) states = list(strategy.keys())
cards = get_deck(num_values=args.num_values, num_suits=args.num_suits) game = Leduc(cards) elif args.game == 'OneCardPoker': print("Solving One Card Poker") game = OneCardPoker.create_game(args.num_values) strategy, exploitabilities = cfr( game, num_iters=args.num_iters, use_chance_sampling=args.use_chance_sampling) # Save the strategy and plot the performance. strategy_name = '{}_cfr.strategy'.format(args.game) print("Saving strategy at {}".format(strategy_name)) save_strategy(strategy, strategy_name) exploitability = compute_exploitability(game, strategy) print("Exploitability of saved strategy: {}".format(exploitability)) # plot_name = '{}.html'.format(args.game) # plt.output_file(plot_name) # p = plt.figure(title='Exploitability for CFR trained on {}'.format( # args.game), x_axis_label='t', y_axis_label='Exploitability') # times = [pair[0] for pair in exploitabilities] # exploits = [pair[1] for pair in exploitabilities] # p.line(times, exploits) # # print("Saved plot of exploitability at: {}".format(plot_name))
def cfr(game, num_iters=10000, use_chance_sampling=True, linear_weight=False):
    """Run CFR on `game`, tracking strategies, exploitability and regret.

    Two average strategies are maintained side by side: one derived from
    `action_counts` via compute_average_strategy (this is the one returned),
    and one accumulated in a cfr_util.AverageStrategy, which is only used for
    a comparison printout.

    Args:
        game: the game to solve. Must expose `root`.
        num_iters: int. The number of CFR iterations to perform.
        use_chance_sampling: bool. Forwarded unchanged to cfr_recursive.
        linear_weight: bool. If True, the k-th iteration (counting from 1)
            is given weight k; otherwise every iteration has weight 1.0.

    Returns:
        average_strategy: result of compute_average_strategy after the last
            iteration, or None if num_iters is 0.
        exploitabilities: list of (t, exploitability) pairs, recorded every
            10 iterations.
        strategies: list with a copy of the strategy after each iteration.
    """
    # regrets is a dictionary where the keys are the information sets and
    # values are dictionaries from actions available in that information set
    # to the counterfactual regret for not playing that action in that
    # information set. Since information sets encode the player, we only
    # require one dictionary.
    regrets = dict()

    # Similarly, action_counts is a dictionary with keys the information sets
    # and values dictionaries from actions to action counts.
    action_counts = dict()

    cfr_state = cfr_util.CFRState()

    # strategy_t holds the strategy at time t; similarly strategy_t_1 holds
    # the strategy at time t + 1.
    strategy_t = Strategy.initialise()
    strategy_t_1 = Strategy.initialise()

    average_strategy = None
    exploitabilities = []
    strategies = []

    average_strategy2 = cfr_util.AverageStrategy(game)

    start_time = time.time()
    for t in range(num_iters):
        # t is 0-based, so use t + 1 for linear weighting; using t directly
        # gave the first iteration a weight of zero.
        weight = t + 1 if linear_weight else 1.0
        for i in [1, 2]:
            cfr_recursive(game, game.root, i, t, 1.0, 1.0, 1.0, regrets,
                          action_counts, strategy_t, strategy_t_1,
                          cfr_state,
                          use_chance_sampling=use_chance_sampling,
                          weight=weight)

        average_strategy = compute_average_strategy(action_counts)
        cfr_util.update_average_strategy(game, average_strategy2, strategy_t,
                                         weight=weight)

        # strategy_t_1 is updated inside cfr_recursive. Take a copy so that
        # strategy_t stays fixed while strategy_t_1 keeps being modified on
        # the next iteration.
        strategy_t = strategy_t_1.copy()
        strategies.append(strategy_t.copy())

        # Compute the exploitability of the strategy.
        if t % 10 == 0:
            print("t: {}. Time since last evaluation: {:.4f} s".format(
                t, time.time() - start_time))
            start_time = time.time()

            exploitability = best_response.compute_exploitability(
                game, average_strategy)
            exploitabilities.append((t, exploitability))
            print("t: {}, nodes touched: {}, exploitability: {} mbb/h".format(
                t, cfr_state.nodes_touched, exploitability * 1000))

            # Second averaging method, printed for comparison only; it is
            # not appended to `exploitabilities`.
            exploitability = best_response.compute_exploitability(
                game, average_strategy2.compute_strategy())
            print("Exploitability (av strategy method 2): {} mbb/h".format(
                exploitability * 1000))

            immediate_regret, _, _ = cfr_metrics.compute_immediate_regret(
                game, strategies)
            print("Immediate regret: {}".format(immediate_regret))

    return average_strategy, exploitabilities, strategies