def test_exploitability_on_kuhn_poker_uniform_random(self):
  # NashConv of the uniform random policy (value found on Google Books):
  # https://link.springer.com/chapter/10.1007/978-3-319-75931-9_5
  game = pyspiel.load_game("kuhn_poker")
  test_policy = policy.UniformRandomPolicy(game)
  expected_nash_conv = 11 / 12
  self.assertAlmostEqual(
      exploitability.exploitability(game, test_policy),
      expected_nash_conv / 2)
def test_best_response(self, name):
  """Checks that the best-response value computation works.

  Args:
    name: Name of the game.
  """
  game = pyspiel.load_game(name)
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  br_value = best_response_value.BestResponse(
      game, dist, value.TabularValueFunction(game))
  br_val = br_value(game.new_initial_state())
  self.assertAlmostEqual(br_val, 30.029387484327486)
def policy_bots():
  random_policy = policy.UniformRandomPolicy(GAME)

  py_bot = PolicyBot(0, np.random.RandomState(4321), random_policy)
  cpp_bot = pyspiel.make_policy_bot(
      GAME, 1, 1234,
      policy.python_policy_to_pyspiel_policy(random_policy.to_tabular()))

  return [py_bot, cpp_bot]
def __init__(self, game):
  """Initializes the fictitious play iteration.

  Args:
    game: The game to analyze.
  """
  self._game = game
  self._policy = policy_std.UniformRandomPolicy(self._game)
  self._fp_step = 0
def __init__(self, game):
  """Initializes the fictitious play iteration.

  Args:
    game: The game to analyze.
  """
  self._game = game
  self._states = None  # Required to avoid an attribute error.
  self._policy = policy_std.UniformRandomPolicy(self._game)
  self._fp_step = 0
  self._states = policy_std.get_tabular_policy_states(self._game)
def test_joint_action_probabilities(self):
  """Test expected behavior of joint_action_probabilities."""
  game = pyspiel.load_game("python_iterated_prisoners_dilemma")
  uniform_policy = policy.UniformRandomPolicy(game)
  joint_action_probs = policy.joint_action_probabilities(
      game.new_initial_state(), uniform_policy)
  self.assertCountEqual(
      list(joint_action_probs), [
          ((0, 0), 0.25),
          ((1, 1), 0.25),
          ((1, 0), 0.25),
          ((0, 1), 0.25),
      ])
def test_uniform_mfg_policy_conversion_to_n_player_uniform_policy(self):
  """Tests that a uniform MFG policy converts to a uniform N-player policy."""
  mfg_game = pyspiel.load_game("python_mfg_dynamic_routing", {
      "time_step_length": 0.05,
      "max_num_time_step": 100
  })
  n_player_game = pyspiel.load_game("python_dynamic_routing", {
      "time_step_length": 0.05,
      "max_num_time_step": 100
  })
  mfg_derived_policy = (
      dynamic_routing_to_mean_field_game
      .DerivedNPlayerPolicyFromMeanFieldPolicy(
          n_player_game, policy.UniformRandomPolicy(mfg_game)))
  derived_policy_value = expected_game_score.policy_value(
      n_player_game.new_initial_state(), mfg_derived_policy)
  uniform_policy_value = expected_game_score.policy_value(
      n_player_game.new_initial_state(),
      policy.UniformRandomPolicy(n_player_game))
  self.assertSequenceAlmostEqual(derived_policy_value, uniform_policy_value)
def test_policy_value(self, name):
  """Checks that the policy value computation works.

  Args:
    name: Name of the game.
  """
  game = pyspiel.load_game(name)
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  py_value = policy_value.PolicyValue(game, dist, uniform_policy,
                                      value.TabularValueFunction(game))
  py_val = py_value(game.new_initial_state())
  self.assertAlmostEqual(py_val, 27.215850929940448)
def mean_field_uniform_policy(mfg_game,
                              number_of_iterations,
                              compute_metrics=False):
  """Returns the uniform policy for mfg_game and, if requested, its value."""
  del number_of_iterations
  uniform_policy = policy_module.UniformRandomPolicy(mfg_game)
  if compute_metrics:
    distribution_mfg = distribution_module.DistributionPolicy(
        mfg_game, uniform_policy)
    policy_value_ = policy_value.PolicyValue(
        mfg_game, distribution_mfg,
        uniform_policy).value(mfg_game.new_initial_state())
    return uniform_policy, policy_value_
  return uniform_policy
class CommonTest(parameterized.TestCase):

  @parameterized.parameters([
      policy.TabularPolicy(_LEDUC_POKER),
      policy.UniformRandomPolicy(_LEDUC_POKER),
      policy.FirstActionPolicy(_LEDUC_POKER),
  ])
  def test_policy_on_leduc(self, policy_object):
    test_policy_on_game(self, _LEDUC_POKER, policy_object)

  @parameterized.named_parameters([
      ("pyspiel.UniformRandom", pyspiel.UniformRandomPolicy(_LEDUC_POKER)),
  ])
  def test_cpp_policies_on_leduc(self, policy_object):
    test_policy_on_game(self, _LEDUC_POKER, policy_object)
def __init__(self,
             best_response_backend='cpp',
             game=None,
             all_states=None,
             state_to_information_state=None,
             **kwargs):
  """Init function for the BestResponseOracle.

  Args:
    best_response_backend: A string (either 'cpp' or 'py') specifying the
      best-response backend to use (C++ or Python, respectively). The cpp
      backend should generally be preferred, as it is significantly faster.
    game: The game on which the optimization process takes place.
    all_states: The result of calling get_all_states.get_all_states. Cached
      for improved performance.
    state_to_information_state: A dict mapping str(state) to
      state.information_state for every state in the game. Cached for
      improved performance.
    **kwargs: kwargs
  """
  super(BestResponseOracle, self).__init__(**kwargs)
  self.best_response_backend = best_response_backend
  if self.best_response_backend == 'cpp':
    # Compute all_states and state_to_information_state only once in the
    # program, as caching them speeds up TabularBestResponse tremendously.
    self.all_states, self.state_to_information_state = (
        utils.compute_states_and_info_states_if_none(
            game, all_states, state_to_information_state))

    policy = openspiel_policy.UniformRandomPolicy(game)
    policy_to_dict = policy_utils.policy_to_dict(
        policy, game, self.all_states, self.state_to_information_state)

    # pylint: disable=g-complex-comprehension
    # Cache a TabularBestResponse per player, due to their costly construction.
    # TODO(b/140426861): Use a single best-responder once the code supports
    # multiple player ids.
    self.best_response_processors = [
        pyspiel.TabularBestResponse(game, best_responder_id, policy_to_dict)
        for best_responder_id in range(game.num_players())
    ]
    self.best_responders = [
        best_response.CPPBestResponsePolicy(
            game, i_player, policy, self.all_states,
            self.state_to_information_state,
            self.best_response_processors[i_player])
        for i_player in range(game.num_players())
    ]
def test_policy_at_state(self):
  game = pyspiel.load_game("tic_tac_toe")
  uniform_random_policy = policy.UniformRandomPolicy(game)
  state = game.new_initial_state()
  state.apply_action(2)
  state.apply_action(4)
  state.apply_action(6)
  state.apply_action(8)
  self.assertEqual(
      uniform_random_policy.action_probabilities(state), {
          0: 0.2,
          1: 0.2,
          3: 0.2,
          5: 0.2,
          7: 0.2
      })
def test_policy_aggregation_random(self, game_name):
  env = rl_environment.Environment(game_name)

  policies = [[policy.UniformRandomPolicy(env.game) for _ in range(2)]
              for _ in range(2)]
  probabilities = [
      list(np.ones(len(policies)) / len(policies)) for _ in range(2)
  ]

  pol_ag = policy_aggregator.PolicyAggregator(env.game)
  aggr_policy = pol_ag.aggregate([0], policies, probabilities)

  for item in aggr_policy.policy[0].items():
    _, probs = zip(*item[1].items())
    const_probs = tuple([probs[0]] * len(probs))
    self.assertEqual(probs, const_probs)
def test_best_response_is_a_policy(self):
  game = pyspiel.load_game("kuhn_poker")
  test_policy = policy.UniformRandomPolicy(game)
  br = best_response.BestResponsePolicy(game, policy=test_policy, player_id=0)
  expected_policy = {
      "0": 1,  # Bet in case opponent folds when winning
      "1": 1,  # Bet in case opponent folds when winning
      "2": 0,  # Both equally good (we return the lowest action)
      # Some of these will never happen under the best-response policy,
      # but we have computed best-response actions anyway.
      "0pb": 0,  # Fold - we're losing
      "1pb": 1,  # Call - we're 50-50
      "2pb": 1,  # Call - we've won
  }
  self.assertEqual(
      expected_policy,
      {key: br.best_response_action(key) for key in expected_policy.keys()})
def test_greedy_cpp(self):
  """Check if the greedy policy works as expected.

  The test checks that a greedy policy with respect to an optimal value is
  an optimal policy.
  """
  game = pyspiel.load_game("mfg_crowd_modelling")
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  br_value = best_response_value.BestResponse(game, dist)
  br_val = br_value(game.new_initial_state())

  greedy_pi = greedy_policy.GreedyPolicy(game, None, br_value)
  greedy_pi = greedy_pi.to_tabular()
  pybr_value = policy_value.PolicyValue(game, dist, greedy_pi)
  pybr_val = pybr_value(game.new_initial_state())
  self.assertAlmostEqual(br_val, pybr_val)
def test_kuhn_poker_uniform_random_best_response_pid0(self):
  game = pyspiel.load_game("kuhn_poker")
  test_policy = policy.UniformRandomPolicy(game)
  results = exploitability.best_response(game, test_policy, player_id=0)
  self.assertEqual(
      results["best_response_action"],
      {
          "0": 1,  # Bet in case opponent folds when winning
          "1": 1,  # Bet in case opponent folds when winning
          "2": 0,  # Both equally good (we return the lowest action)
          # Some of these will never happen under the best-response policy,
          # but we have computed best-response actions anyway.
          "0pb": 0,  # Fold - we're losing
          "1pb": 1,  # Call - we're 50-50
          "2pb": 1,  # Call - we've won
      })
  self.assertGreater(results["nash_conv"], 0.1)
def test_kuhn_poker_uniform_random_best_response_pid1(self):
  game = pyspiel.load_game("kuhn_poker")
  test_policy = policy.UniformRandomPolicy(game)
  results = exploitability.best_response(game, test_policy, player_id=1)
  self.assertEqual(
      results["best_response_action"],
      {
          # Bet is always best
          "0p": 1,
          "1p": 1,
          "2p": 1,
          # Call unless we know we're beaten
          "0b": 0,
          "1b": 1,
          "2b": 1,
      })
  self.assertGreater(results["nash_conv"], 0.1)
def test_kuhn_poker_uniform(self):
  game = pyspiel.load_game("kuhn_poker")
  calc = action_value_vs_best_response.Calculator(game)
  expl, avvbr, cfrp = calc(0, policy.UniformRandomPolicy(game),
                           ["0", "1", "2", "0pb", "1pb", "2pb"])
  self.assertAlmostEqual(expl, 15 / 36)
  np.testing.assert_allclose(
      avvbr,
      [
          [-1.5, -2.0],  # 0 (better to pass)
          [-0.5, -0.5],  # 1 (same)
          [0.5, 1.5],  # 2 (better to bet)
          [-1.0, -2.0],  # 0pb - losing
          [-1.0, 0.0],  # 1pb - best response is bet always
          [-1.0, 2.0],  # 2pb - winning
      ])
  np.testing.assert_allclose(cfrp, [1 / 3, 1 / 3, 1 / 3, 1 / 3, 1 / 3, 1 / 3])
def test_players_have_different_legal_actions(self):
  game = pyspiel.load_game("oshi_zumo")
  uniform_random_policy = policy.UniformRandomPolicy(game)
  state = game.new_initial_state()
  state.apply_actions([46, 49])
  # Started with 50 coins each; the players now have 4 and 1 respectively.
  self.assertEqual(
      uniform_random_policy.action_probabilities(state, player_id=0), {
          0: 0.2,
          1: 0.2,
          2: 0.2,
          3: 0.2,
          4: 0.2
      })
  self.assertEqual(
      uniform_random_policy.action_probabilities(state, player_id=1), {
          0: 0.5,
          1: 0.5
      })
def test_rl_environment(self, game_name):
  """Check that the RL environment runs for a few trajectories."""
  game = pyspiel.load_game(game_name)
  uniform_policy = policy.UniformRandomPolicy(game)
  mfg_dist = distribution.DistributionPolicy(game, uniform_policy)
  envs = [
      rl_environment.Environment(
          game, distribution=mfg_dist, mfg_population=p)
      for p in range(game.num_players())
  ]
  for p, env in enumerate(envs):
    for _ in range(FLAGS.rl_env_simulations):
      time_step = env.reset()
      while not time_step.last():
        print(time_step)
        a = random.choice(time_step.observations['legal_actions'][p])
        time_step = env.step([a])
def test_best_response_prisoner_dilemma_simultaneous_game(self):
  """Test best-response computation for a simultaneous-move game."""
  game = pyspiel.load_game(
      "python_iterated_prisoners_dilemma(max_game_length=5)")
  test_policy = policy.UniformRandomPolicy(game)
  br = best_response.BestResponsePolicy(game, policy=test_policy, player_id=0)

  # The best policy is always to defect; we verify this for a handful of
  # states.
  self.assertEqual(br.best_response_action("us:CCCC op:CCCC"), 1)
  self.assertEqual(br.best_response_action("us:DDDD op:CCCC"), 1)
  self.assertEqual(br.best_response_action("us:CDCD op:DCDC"), 1)
  self.assertEqual(br.best_response_action("us:CCCC op:DDDD"), 1)

  # Expected value per turn = 5.5 (avg of 1 and 10).
  # Expected game length = sum(0.875**i for i in range(5)) = 3.896728515625.
  # Game value = 5.5 * 3.896728515625 = 21.4320068359375.
  self.assertAlmostEqual(br.value(game.new_initial_state()), 21.4320068359375)
def test_average(self):
  """Test the average of policies.

  Here we test that the average of the values is the value of the average
  policy.
  """
  game = crowd_modelling.MFGCrowdModellingGame()
  uniform_policy = policy.UniformRandomPolicy(game)
  mfg_dist = distribution.DistributionPolicy(game, uniform_policy)
  br_value = best_response_value.BestResponse(game, mfg_dist)
  py_value = policy_value.PolicyValue(game, mfg_dist, uniform_policy)
  greedy_pi = greedy_policy.GreedyPolicy(game, None, br_value)
  greedy_pi = greedy_pi.to_tabular()
  merged_pi = fictitious_play.MergedPolicy(
      game, list(range(game.num_players())),
      [uniform_policy, greedy_pi],
      [mfg_dist, distribution.DistributionPolicy(game, greedy_pi)],
      [0.5, 0.5])
  merged_pi_value = policy_value.PolicyValue(game, mfg_dist, merged_pi)
  self.assertAlmostEqual(
      merged_pi_value(game.new_initial_state()),
      (br_value(game.new_initial_state()) +
       py_value(game.new_initial_state())) / 2)
def test_cpp_and_python_implementations_are_identical(self, game_name):
  game = pyspiel.load_game(game_name)

  policy = openspiel_policy.UniformRandomPolicy(game)

  all_states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=False,
      include_chance_states=False,
      to_string=lambda s: s.information_state_string())

  for current_player in range(game.num_players()):
    # Perturb the policy for the player whose states we are about to check.
    noise = noisy_policy.NoisyPolicy(
        policy, current_player, alpha=0.5, beta=10.)
    for state in all_states.values():
      if state.current_player() != current_player:
        continue

      # TODO(b/141737795): Decide what to do about this.
      self.assertNotEqual(
          policy.action_probabilities(state),
          noise.action_probabilities(state))
def __init__(self, game, lr=0.01, root_state=None):
  """Initializes mirror descent.

  Args:
    game: The game.
    lr: The learning rate of mirror descent.
    root_state: The state of the game at which to start. If `None`, the game
      root state is used.
  """
  self._game = game
  if root_state is None:
    self._root_states = game.new_initial_states()
  else:
    self._root_states = [root_state]
  self._policy = policy_std.UniformRandomPolicy(game)
  self._distribution = distribution.DistributionPolicy(game, self._policy)
  self._md_step = 0
  self._lr = lr
  self._state_value = collections.defaultdict(float)
  self._cumulative_state_value = collections.defaultdict(float)
def test_best_response_oshi_zumo_simultaneous_game(self):
  """Test best-response computation for a simultaneous-move game."""
  game = pyspiel.load_game("oshi_zumo(horizon=5,coins=5)")
  test_policy = policy.UniformRandomPolicy(game)
  br = best_response.BestResponsePolicy(game, policy=test_policy, player_id=0)
  expected_policy = {
      "0, 0, 0, 3, 0, 2": 1,
      "0, 0, 1, 4, 3, 1": 0,
      "0, 0, 4, 1, 0, 2, 0, 2": 1,
      "0, 1, 1, 0, 1, 4": 1,
      "0, 1, 4, 1, 0, 0, 0, 1": 1,
      "0, 2, 2, 2, 3, 0, 0, 0": 0,
      "0, 5, 0, 0, 0, 0, 3, 0": 1,
  }
  self.assertEqual(
      expected_policy,
      {key: br.best_response_action(key) for key in expected_policy})
  self.assertAlmostEqual(br.value(game.new_initial_state()), 0.856471051954)
def test_greedy(self, name):
  """Check if the greedy policy works as expected.

  The test checks that a greedy policy with respect to an optimal value is
  an optimal policy.

  Args:
    name: Name of the game.
  """
  game = pyspiel.load_game(name)
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  br_value = best_response_value.BestResponse(
      game, dist, value.TabularValueFunction(game))
  br_val = br_value(game.new_initial_state())

  greedy_pi = greedy_policy.GreedyPolicy(game, None, br_value)
  greedy_pi = greedy_pi.to_tabular()
  pybr_value = policy_value.PolicyValue(game, dist, greedy_pi,
                                        value.TabularValueFunction(game))
  pybr_val = pybr_value(game.new_initial_state())
  self.assertAlmostEqual(br_val, pybr_val)
def test_policy_aggregation_random(self, game_name):
  env = rl_environment.Environment(game_name)
  num_players = 2
  num_joint_policies = 4

  joint_policies = [[
      policy.UniformRandomPolicy(env.game) for _ in range(num_players)
  ] for _ in range(num_joint_policies)]
  probabilities = np.ones(len(joint_policies))
  probabilities /= np.sum(probabilities)

  pol_ag = policy_aggregator_joint.JointPolicyAggregator(env.game)
  aggr_policy = pol_ag.aggregate([0, 1], joint_policies, probabilities)

  self.assertLen(aggr_policy.policies, num_players)
  for player in range(num_players):
    player_policy = aggr_policy.policies[player]
    self.assertNotEmpty(player_policy)
    for state_action_probs in player_policy.values():
      probs = list(state_action_probs.values())
      expected_prob = 1. / len(probs)
      for prob in probs:
        self.assertEqual(expected_prob, prob)
def test_joint_action_probabilities_failure_on_seq_game(self):
  """Tests that joint_action_probabilities fails on sequential games."""
  game = pyspiel.load_game("kuhn_poker")
  with self.assertRaises(AssertionError):
    list(policy.joint_action_probabilities(
        game.new_initial_state(), policy.UniformRandomPolicy(game)))
def main(_):
  game = pyspiel.load_game(FLAGS.game)
  expl = exploitability.exploitability(game, policy.UniformRandomPolicy(game))
  print("Exploitability: {}".format(expl))
def test_policy_attributes(self):
  game = pyspiel.load_game("tiny_bridge_4p")
  uniform_random_policy = policy.UniformRandomPolicy(game)
  self.assertEqual(uniform_random_policy.player_ids, [0, 1, 2, 3])