def test_policy_zero_is_uniform(self, linear_averaging, regret_matching_plus,
                                alternating_updates):
  # We use Leduc and not Kuhn, because Leduc has illegal actions and Kuhn
  # does not.
  game = pyspiel.load_game("leduc_poker")
  cfr_solver = cfr._CFRSolver(
      game,
      regret_matching_plus=regret_matching_plus,
      linear_averaging=linear_averaging,
      alternating_updates=alternating_updates)

  np.testing.assert_array_equal(
      _LEDUC_UNIFORM_POLICY.action_probability_array,
      cfr_solver.current_policy().action_probability_array)
  np.testing.assert_array_equal(
      _LEDUC_UNIFORM_POLICY.action_probability_array,
      cfr_solver.average_policy().action_probability_array)
def test_cfr_kuhn_poker_runs_with_multiple_players(self, linear_averaging,
                                                   regret_matching_plus,
                                                   alternating_updates):
  num_players = 3
  game = pyspiel.load_game("kuhn_poker", {"players": num_players})
  cfr_solver = cfr._CFRSolver(
      game,
      regret_matching_plus=regret_matching_plus,
      linear_averaging=linear_averaging,
      alternating_updates=alternating_updates)
  for _ in range(10):
    cfr_solver.evaluate_and_update_policy()
  average_policy = cfr_solver.average_policy()
  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [average_policy] * num_players)
  # The values themselves are not checked; this test only verifies that
  # running CFR and evaluating the average policy does not raise for a
  # 3-player game.
  del average_policy_values
def test_simultaneous_two_step_avg_1b_seq_in_kuhn_poker(
    self, regret_matching_plus, initialize_cumulative_values):
  num_players = 2
  game = pyspiel.load_game(
      "kuhn_poker", {"players": pyspiel.GameParameter(num_players)})
  cfr_solver = cfr._CFRSolver(
      game,
      initialize_cumulative_values=initialize_cumulative_values,
      regret_matching_plus=regret_matching_plus,
      linear_averaging=False,
      alternating_updates=False)

  def check_avg_policy_is_uniform_random():
    policy = cfr_solver.average_policy()
    for player_info_states in policy.states_per_player:
      for info_state in player_info_states:
        state_policy = policy.policy_for_key(info_state)
        np.testing.assert_allclose(
            state_policy, [1.0 / len(state_policy)] * len(state_policy))

  check_avg_policy_is_uniform_random()

  cfr_solver.evaluate_and_update_policy()
  check_avg_policy_is_uniform_random()

  cfr_solver.evaluate_and_update_policy()

  # The acting player in 1b is player 1, and they have not acted before, so
  # the probability that this player plays to reach this information state
  # is 1, and the sequence probability of any action is just the probability
  # of that action given the information state. On the first iteration, this
  # probability is 0.5 for both actions. On the second iteration, the
  # current policy is [0, 1], so the average cumulants should be
  # [0.5, 1.5]. Normalizing this gives the average policy.
  normalization = 0.5 + 0.5 + 1
  np.testing.assert_allclose(
      cfr_solver.average_policy().policy_for_key("1b"),
      [0.5 / normalization, (0.5 + 1) / normalization])
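  # Worked numbers for the assert above (a sketch derived from the comment,
  # not from additional solver internals): the current policies at "1b" over
  # the two iterations are [0.5, 0.5] and [0, 1], so the cumulative
  # sequence-weighted policy is [0.5, 1.5]; dividing by
  # normalization = 0.5 + 0.5 + 1 = 2 gives the expected average policy
  # [0.25, 0.75].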