Example #1
def test_best_response_cfr_one_card_poker():
    game = OneCardPoker.create_game(n_cards=4)

    strategy, exploitabilities, strategies = cfr(game, num_iters=10, use_chance_sampling=False)

    exploitability = compute_exploitability(game, strategy)

    print("Exploitability: {}".format(exploitability))
    assert exploitability > 0.0
Example #2
def external_sampling_cfr(game: extensive_game.ExtensiveGame,
                          num_iters: int = 1000):
    """

    Args:
        game: ExtensiveGame.
        num_iters: int. The number of iterations of CFR to perform.

    Returns:
        average_strategy: the average strategy over all iterations.
        exploitabilities: a list of (iteration, exploitability) pairs.
        strategies: a list of the strategy at each iteration.
    """
    # regrets is a dictionary where the keys are the information sets and values
    # are dictionaries from actions available in that information set to the
    # counterfactual regret for not playing that action in that information set.
    # Since information sets encode the player, we only require one dictionary.
    regrets = dict()

    # Strategy_t holds the strategy at time t; similarly strategy_t_1 holds the
    # strategy at time t + 1.
    strategy_t = extensive_game.Strategy.initialise()
    strategy_t_1 = extensive_game.Strategy.initialise()
    cfr_state = cfr_util.CFRState()

    average_strategy = cfr_util.AverageStrategy(game)

    strategies = []
    exploitabilities = []

    for t in range(num_iters):
        for player in [1, 2]:
            external_sampling_cfr_recursive(game, game.root, player, regrets,
                                            strategy_t, strategy_t_1,
                                            cfr_state)

        # Update the strategies
        strategy_t = strategy_t_1.copy()
        strategies.append(strategy_t)

        # Update average strategy
        cfr_util.update_average_strategy(game, average_strategy, strategy_t)

        # Periodically evaluate the average strategy by computing its exploitability.
        if t % 200 == 0:
            exploitability = best_response.compute_exploitability(
                game, average_strategy.compute_strategy())
            exploitabilities.append((t, exploitability))

            print("t: {}, nodes touched: {}, exploitability: {:.3f} mbb/h".
                  format(t, cfr_state.nodes_touched, exploitability * 1000))

    return average_strategy.compute_strategy(), exploitabilities, strategies
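
The strategy for the next iteration is derived from the regret tables by regret matching. A minimal, self-contained sketch of that step (a hypothetical helper; rlpoker performs this inside external_sampling_cfr_recursive, and its implementation may differ): each action is played in proportion to its positive cumulative regret, falling back to uniform when no action has positive regret.

def regret_matching(action_regrets):
    # Clip regrets at zero: only actions we regret not playing get probability.
    positive = {a: max(r, 0.0) for a, r in action_regrets.items()}
    total = sum(positive.values())
    if total > 0.0:
        return {a: r / total for a, r in positive.items()}
    # No positive regret anywhere: play uniformly at random.
    return {a: 1.0 / len(action_regrets) for a in action_regrets}

# Example: regrets {'bet': 2.0, 'fold': -1.0} yield {'bet': 1.0, 'fold': 0.0}.
print(regret_matching({'bet': 2.0, 'fold': -1.0}))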
Example #3
File: cfr.py Project: downseq/rlpoker
def cfr(game, num_iters=10000, use_chance_sampling=True):
    # regrets is a dictionary where the keys are the information sets and values
    # are dictionaries from actions available in that information set to the
    # counterfactual regret for not playing that action in that information set.
    # Since information sets encode the player, we only require one dictionary.
    regrets = dict()

    # Similarly, action_counts is a dictionary with keys the information sets
    # and values dictionaries from actions to action counts.
    action_counts = dict()

    # Strategy_t holds the strategy at time t; similarly strategy_t_1 holds the
    # strategy at time t + 1.
    strategy_t = dict()
    strategy_t_1 = dict()

    average_strategy = None
    exploitabilities = []

    for t in range(num_iters):
        for i in [1, 2]:
            cfr_recursive(game,
                          game.root,
                          i,
                          t,
                          1.0,
                          1.0,
                          1.0,
                          regrets,
                          action_counts,
                          strategy_t,
                          strategy_t_1,
                          use_chance_sampling=use_chance_sampling)

        average_strategy = compute_average_strategy(action_counts)

        # Update strategy_t to equal strategy_t_1. We update strategy_t_1 inside
        # cfr_recursive.  We take a copy because we update it inside
        # cfr_recursive, and want to hold on to strategy_t_1 separately to
        # compare.
        strategy_t = strategy_t_1.copy()

        # Compute the exploitability of the strategy.
        if t % 1000 == 0:
            completed_strategy = game.complete_strategy_uniformly(
                average_strategy)
            exploitability = best_response.compute_exploitability(
                game, completed_strategy)
            exploitabilities.append((t, exploitability))

            print("t: {}, exploitability: {}".format(t, exploitability))

    return average_strategy, exploitabilities
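
Here the average strategy comes from normalising the accumulated action counts. A plausible sketch of compute_average_strategy (an assumption for illustration; the real rlpoker function may differ in details): each information set's counts are turned into a probability distribution over its actions.

def compute_average_strategy_sketch(action_counts):
    average_strategy = {}
    for info_set, counts in action_counts.items():
        total = sum(counts.values())
        if total > 0:
            # Normalise counts into probabilities for this information set.
            average_strategy[info_set] = {a: c / total for a, c in counts.items()}
    return average_strategy

# Example: counts of {'bet': 30, 'fold': 10} average to {'bet': 0.75, 'fold': 0.25}.
print(compute_average_strategy_sketch({('call',): {'bet': 30, 'fold': 10}}))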
Example #4
def test_best_response_cfr():
    """Test we can run 10 iterations of CFR on Leduc and then compute a best
    response.
    """
    cards = [Card(value, suit) for value in range(3) for suit in range(2)]
    game = Leduc(cards)

    strategy, exploitabilities, strategies = cfr(game, num_iters=10, use_chance_sampling=False)

    exploitability = compute_exploitability(game, strategy)

    print("Exploitability: {}".format(exploitability))
    assert exploitability > 0.0
Example #5
File: nfsp.py Project: Michael-Z/rlpoker
def compute_agent_exploitability(agent: Agent, sess: tf.Session, game: NFSPGame):
    """Computes the exploitability of the agent's current strategy.

    Args:
        agent: Agent.
        sess: tensorflow session.
        game: NFSPGame.

    Returns:
        float. Exploitability of the agent's strategy.
    """
    states = game._state_vectors
    strategy = agent.get_strategy(sess, states)

    return compute_exploitability(game._game, strategy)
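
Exploitability measures how much a best-responding opponent can win against a fixed strategy; it is zero exactly at a Nash equilibrium. A self-contained toy illustration on rock-paper-scissors (not rlpoker's tree-based compute_exploitability, which traverses the extensive game):

# PAYOFF[a][b] is the payoff to a player choosing a against an opponent choosing b.
PAYOFF = {
    'R': {'R': 0, 'P': -1, 'S': 1},
    'P': {'R': 1, 'P': 0, 'S': -1},
    'S': {'R': -1, 'P': 1, 'S': 0},
}

def best_response_value(strategy):
    # The best payoff an opponent can achieve against the given mixed strategy.
    return max(
        sum(prob * PAYOFF[br][action] for action, prob in strategy.items())
        for br in PAYOFF
    )

print(best_response_value({'R': 1 / 3, 'P': 1 / 3, 'S': 1 / 3}))  # 0.0: unexploitable
print(best_response_value({'R': 0.5, 'P': 0.25, 'S': 0.25}))      # 0.25: exploitable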
Example #6
        default=3,
        type=int,
        help='In OneCardPoker or Leduc, pass the number of cards to use.')
    parser.add_argument('--num_suits',
                        default=2,
                        type=int,
                        help='In Leduc, pass the number of suits to use.')

    args = parser.parse_args()

    if args.game == 'Leduc':
        print("Solving Leduc Hold'em")
        cards = card.get_deck(num_values=args.num_values,
                              num_suits=args.num_suits)
        n_game = leduc.create_neural_leduc(cards)
    elif args.game == 'RockPaperScissors':
        print("Solving rock paper scissors")
        n_game = rock_paper_scissors.create_neural_rock_paper_scissors()
    else:
        # Guard against an unrecognised game, which would leave n_game undefined.
        raise ValueError("Unknown game: {}".format(args.game))

    strategy, exploitabilities = deep_cfr.deep_cfr(
        n_game,
        num_iters=args.num_iters,
        num_traversals=args.num_traversals,
        advantage_maxlen=args.advantage_maxlen,
        strategy_maxlen=args.strategy_maxlen,
        batch_size=args.batch_size,
        num_sgd_updates=args.num_sgd_updates)

    exploitability = compute_exploitability(n_game.extensive_game, strategy)
    print("Exploitability of strategy: {}".format(exploitability))
Example #7
def deep_cfr(n_game: neural_game.NeuralGame,
             num_iters: int = 100, num_traversals: int = 10000,
             advantage_maxlen: int = 1000000, strategy_maxlen: int = 1000000,
             batch_size: int = 1024, num_sgd_updates: int = 100):
    """
    Args:
        n_game: NeuralGame.
        num_iters: int. The number of iterations to run deep CFR for.
        num_traversals: int. The number of traversals per CFR iteration.
        advantage_maxlen: int. The maximum length of the advantage memories.
        strategy_maxlen: int. The maximum length of the strategy memory.
        batch_size: int. The batch size to use in training.
        num_sgd_updates: int. The number of sgd updates per training.

    Returns:
        strategy, exploitability.
    """
    game, action_indexer, info_set_vectoriser = n_game

    advantage_memory1 = buffer.Reservoir(maxlen=advantage_maxlen)
    advantage_memory2 = buffer.Reservoir(maxlen=advantage_maxlen)
    strategy_memory = buffer.Reservoir(maxlen=strategy_maxlen)

    # Create summary tensors
    valid_summariser = util.TBSummariser(['exploitability'])

    time_str = time.strftime("%Y-%m-%d-%H:%M:%S", time.gmtime())
    save_path = os.path.join('experiments', time_str)

    if not os.path.exists(save_path):
        print("Path doesn't exist, so creating: {}".format(save_path))
        os.makedirs(save_path)

    log_file = os.path.join(save_path, 'nfsp.log')
    print("Log file {}".format(log_file))

    print("To run tensorboard: tensorboard --logdir {}".format(os.path.join(os.getcwd(), save_path)))

    with tf.Session() as sess:
        network1 = DeepRegretNetwork(info_set_vectoriser.state_shape, action_indexer, 1)
        network1.set_sess(sess)
        network2 = DeepRegretNetwork(info_set_vectoriser.state_shape, action_indexer, 2)
        network2.set_sess(sess)

        network1.initialise()
        network2.initialise()

        tf_train_writer = tf.summary.FileWriter(os.path.join(save_path, 'train'), graph=sess.graph)

        # Iterate over players and do cfr traversals.
        for t in range(1, num_iters + 1):
            print("Iteration t = {}".format(t))
            for player in [1, 2]:
                print("Player: {}".format(player))
                print("Traversing")
                for i in tqdm(range(num_traversals)):
                    cfr_traverse(game, action_indexer, info_set_vectoriser,
                                 game.root, player, network1, network2,
                                 advantage_memory1, advantage_memory2,
                                 strategy_memory, t)

                # Train the traversing player's network on the cfr traversals.
                network = network1 if player == 1 else network2
                network.initialise()
                advantage_memory = advantage_memory1 if player == 1 else advantage_memory2
                mean_loss = train_network(
                    network, advantage_memory, action_indexer, info_set_vectoriser, t,
                    tf_train_writer, batch_size, num_sgd_updates)

                print("Mean loss: {}".format(mean_loss))
                tf_train_writer.flush()

            # print("################")
            #
            # print("----------------")
            # print("Advantage memory 1:")
            # print(advantage_memory1.buffer)
            # print("----------------")
            # print("Advantage memory 2:")
            # print(advantage_memory2.buffer)
            # print("----------------")
            #
            # print("################")
            #

            # print("----------------")
            # print("Predicted advantages:")
            # for info_set_id in set(game.info_set_ids.values()):
            #     print("{}: {}".format(
            #         info_set_id,
            #         network.predict_advantages(info_set_vectoriser.get_vector(info_set_id), action_indexer))
            #     )
            # print("----------------")
            #

            print("Advantage memory 1 length: {}".format(len(advantage_memory1)))
            print("Advantage memory 2 length: {}".format(len(advantage_memory2)))
            print("Strategy memory length: {}".format(len(strategy_memory)))

            mean_strategy = compute_mean_strategy(strategy_memory)
            # print("Strategy summary")
            # print(mean_strategy)
            if game.is_strategy_complete(mean_strategy):
                exploitability = best_response.compute_exploitability(game, mean_strategy)
            else:
                print("Strategy not complete, filling uniformly.")
                exploitability = best_response.compute_exploitability(
                    game,
                    mean_strategy,
                )
            print("Exploitability: {} mbb/h".format(exploitability * 1000))

            valid_summary = valid_summariser.summarise(sess, {'exploitability': exploitability})
            tf_train_writer.add_summary(valid_summary, global_step=t)

    # TODO(chrisn). Train the network on the strategy memory.
    return mean_strategy, exploitability
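
Deep CFR keeps its advantage and strategy samples in reservoir buffers so the memories remain approximately uniform samples over all traversals, no matter how many are appended. A minimal sketch of such a buffer, assuming an interface like buffer.Reservoir above (this is standard reservoir sampling, not necessarily rlpoker's actual implementation):

import random

class ReservoirSketch:
    def __init__(self, maxlen):
        self.maxlen = maxlen
        self.buffer = []
        self.n_seen = 0

    def append(self, item):
        self.n_seen += 1
        if len(self.buffer) < self.maxlen:
            self.buffer.append(item)
        else:
            # Keep the new item with probability maxlen / n_seen, evicting a
            # uniformly random slot; the buffer stays a uniform sample of
            # everything seen so far.
            j = random.randrange(self.n_seen)
            if j < self.maxlen:
                self.buffer[j] = item

    def __len__(self):
        return len(self.buffer)

# Example: append 10,000 items into a buffer of 100; it stays a uniform sample.
res = ReservoirSketch(maxlen=100)
for x in range(10000):
    res.append(x)
print(len(res))  # 100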
Example #8
                        help='The dropout rate to use.')
    args = parser.parse_args()

    dropout_rate = None
    if args.dropout_rate:
        dropout_rate = float(args.dropout_rate)

    cards = get_deck(num_values=args.num_values, num_suits=args.num_suits)
    game = Leduc(cards)

    strategy, exploitabilities, strategies = cfr(
        game,
        num_iters=args.cfr_iters,
        use_chance_sampling=args.use_chance_sampling)

    exploitability = compute_exploitability(game, strategy)
    print("Exploitability of final strategy: {}".format(exploitability))

    leduc_nfsp = LeducNFSP(cards)
    state_vectors = leduc_nfsp._state_vectors
    state_dim = leduc_nfsp.state_dim
    action_dim = leduc_nfsp.action_dim

    # Now build a network.
    layer_dims = [64, 64, 64]
    network = build_network(state_dim,
                            action_dim,
                            layer_dims,
                            dropout_rate=dropout_rate)

    states = list(strategy.keys())
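
The snippet goes on to fit a network to imitate the CFR strategy. A hypothetical sketch of what build_network might construct (an assumption for illustration, not rlpoker's code): a fully connected net from state vectors to a softmax over actions, with optional dropout after each hidden layer.

import tensorflow as tf

def build_network_sketch(state_dim, action_dim, layer_dims, dropout_rate=None):
    model = tf.keras.Sequential()
    for i, dim in enumerate(layer_dims):
        # Only the first layer needs the input shape.
        kwargs = {'input_shape': (state_dim,)} if i == 0 else {}
        model.add(tf.keras.layers.Dense(dim, activation='relu', **kwargs))
        if dropout_rate is not None:
            model.add(tf.keras.layers.Dropout(dropout_rate))
    model.add(tf.keras.layers.Dense(action_dim, activation='softmax'))
    return model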
Example #9
        cards = get_deck(num_values=args.num_values, num_suits=args.num_suits)
        game = Leduc(cards)

    elif args.game == 'OneCardPoker':
        print("Solving One Card Poker")
        game = OneCardPoker.create_game(args.num_values)

    strategy, exploitabilities = cfr(
        game,
        num_iters=args.num_iters,
        use_chance_sampling=args.use_chance_sampling)

    # Save the strategy and plot the performance.

    strategy_name = '{}_cfr.strategy'.format(args.game)
    print("Saving strategy at {}".format(strategy_name))
    save_strategy(strategy, strategy_name)

    exploitability = compute_exploitability(game, strategy)
    print("Exploitability of saved strategy: {}".format(exploitability))

Example #10
def cfr(game, num_iters=10000, use_chance_sampling=True, linear_weight=False):
    """

    Args:
        game:
        num_iters:
        use_chance_sampling:

    Returns:
        average_strategy, exploitabilities
    """
    # regrets is a dictionary where the keys are the information sets and values
    # are dictionaries from actions available in that information set to the
    # counterfactual regret for not playing that action in that information set.
    # Since information sets encode the player, we only require one dictionary.
    regrets = dict()

    # Similarly, action_counts is a dictionary with keys the information sets
    # and values dictionaries from actions to action counts.
    action_counts = dict()

    cfr_state = cfr_util.CFRState()

    # Strategy_t holds the strategy at time t; similarly strategy_t_1 holds the
    # strategy at time t + 1.
    strategy_t = Strategy.initialise()
    strategy_t_1 = Strategy.initialise()

    average_strategy = None
    exploitabilities = []
    strategies = []

    average_strategy2 = cfr_util.AverageStrategy(game)

    start_time = time.time()
    for t in range(num_iters):
        # Linear CFR weights iteration t by t + 1: a zero weight on the first
        # iteration would drop it from the average entirely and can leave the
        # action counts empty when evaluating at t == 0.
        weight = (t + 1) if linear_weight else 1.0
        for i in [1, 2]:
            cfr_recursive(game, game.root, i, t, 1.0, 1.0, 1.0, regrets,
                          action_counts, strategy_t, strategy_t_1,
                          cfr_state,
                          use_chance_sampling=use_chance_sampling,
                          weight=weight)

        average_strategy = compute_average_strategy(action_counts)
        cfr_util.update_average_strategy(game, average_strategy2, strategy_t, weight=weight)

        # Update strategy_t to equal strategy_t_1. We update strategy_t_1 inside
        # cfr_recursive.  We take a copy because we update it inside
        # cfr_recursive, and want to hold on to strategy_t_1 separately to
        # compare.
        strategy_t = strategy_t_1.copy()
        strategies.append(strategy_t.copy())

        # Compute the exploitability of the strategy.
        if t % 10 == 0:
            print("t: {}. Time since last evaluation: {:.4f} s".format(t, time.time() - start_time))
            start_time = time.time()
            exploitability = best_response.compute_exploitability(
                game, average_strategy)
            exploitabilities.append((t, exploitability))

            print("t: {}, nodes touched: {}, exploitability: {} mbb/h".format(t, cfr_state.nodes_touched,
                                                                              exploitability * 1000))

            exploitability = best_response.compute_exploitability(game, average_strategy2.compute_strategy())
            print("Exploitability (av strategy method 2): {} mbb/h".format(exploitability * 1000))

            immediate_regret, _, _ = cfr_metrics.compute_immediate_regret(game, strategies)
            print("Immediate regret: {}".format(immediate_regret))

    return average_strategy, exploitabilities, strategies
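
With linear_weight enabled, both average-strategy computations are tilted toward later (typically better) iterations. A minimal sketch of weighted strategy averaging (illustration only; rlpoker's cfr_util.update_average_strategy may additionally weight by the player's reach probability, which this sketch omits):

def weighted_average_strategy(strategies_over_time, linear_weight=True):
    totals = {}
    for t, strategy in enumerate(strategies_over_time):
        weight = (t + 1) if linear_weight else 1.0
        for info_set, probs in strategy.items():
            acc = totals.setdefault(info_set, {})
            for action, prob in probs.items():
                acc[action] = acc.get(action, 0.0) + weight * prob
    # Normalise each information set's accumulated weight into probabilities.
    return {
        info_set: {a: w / sum(acc.values()) for a, w in acc.items()}
        for info_set, acc in totals.items()
    }

# Example: with weights 1 and 2, an early 50/50 strategy is pulled toward the
# later 90/10 one: bet probability (1 * 0.5 + 2 * 0.9) / 3 = 0.7667.
history = [{'I': {'bet': 0.5, 'fold': 0.5}}, {'I': {'bet': 0.9, 'fold': 0.1}}]
print(weighted_average_strategy(history))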