def test_simultaneous_game_noisy_policy(self, game_name):
  game = pyspiel.load_game(game_name)
  policy = openspiel_policy.UniformRandomPolicy(game)
  all_states = get_all_states.get_all_states(
      game,
      depth_limit=10,
      include_terminals=False,
      include_chance_states=False,
      to_string=lambda s: s.history_str())

  for current_player in range(game.num_players()):
    noise = noisy_policy.NoisyPolicy(
        policy, player_id=current_player, alpha=0.5, beta=10.)
    for state in all_states.values():
      if state.current_player() == pyspiel.PlayerId.SIMULTANEOUS:
        for player_id in range(game.num_players()):
          if player_id != current_player:
            self.assertEqual(
                policy.action_probabilities(state, player_id),
                noise.action_probabilities(state, player_id))
          else:
            self.assertNotEqual(
                policy.action_probabilities(state, player_id),
                noise.action_probabilities(state, player_id))
def test_cpp_and_python_implementations_are_identical(self, game_name):
  game = pyspiel.load_game(game_name)
  policy = openspiel_policy.UniformRandomPolicy(game)
  all_states = get_all_states.get_all_states(
      game,
      depth_limit=-1,
      include_terminals=False,
      include_chance_states=False,
      to_string=lambda s: s.information_state_string())

  for current_player in range(game.num_players()):
    noise = noisy_policy.NoisyPolicy(
        policy, player_id=current_player, alpha=0.5, beta=10.)
    for state in all_states.values():
      # Negative player ids denote chance or terminal nodes, where no
      # player policy applies.
      if state.current_player() < 0:
        continue
      if state.current_player() != current_player:
        # Only `current_player`'s policy is perturbed; everyone else's
        # probabilities must be returned unchanged.
        self.assertEqual(
            policy.action_probabilities(state),
            noise.action_probabilities(state))
      else:
        self.assertNotEqual(
            policy.action_probabilities(state),
            noise.action_probabilities(state))
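# A minimal, hypothetical sketch of the contract the two tests above check:
# `NoisyPolicy` perturbs only the wrapped `player_id`. The game name
# "kuhn_poker" and the helper's name are illustrative assumptions, not taken
# from the tests.
def _noisy_policy_contract_sketch():
  game = pyspiel.load_game("kuhn_poker")
  policy = openspiel_policy.UniformRandomPolicy(game)
  noise = noisy_policy.NoisyPolicy(policy, player_id=0, alpha=0.5, beta=10.)
  state = game.new_initial_state()
  while state.is_chance_node():
    state.apply_action(state.legal_actions()[0])  # Deal deterministic cards.
  # The state now belongs to player 0, whose probabilities are perturbed.
  assert (noise.action_probabilities(state) !=
          policy.action_probabilities(state))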
def compute_regret_policy(game,
                          policy,
                          num_random_policy_tested=10,
                          num_sample=100):
  """Approximates the regret of `policy` against sampled noisy deviations.

  Args:
    game: The game to evaluate on.
    policy: The policy whose regret is approximated.
    num_random_policy_tested: Number of noisy deviation policies to sample.
    num_sample: Number of Monte-Carlo samples per value estimate.

  Returns:
    A (worst approximate regret found, elapsed wall-clock time) tuple.
  """
  time_tick = time.time()
  expected_value_policy = get_expected_value(game, policy, num_sample)
  worse_regret = 0
  for _ in range(num_random_policy_tested):
    # With alpha=1, player 0's policy is replaced entirely by softmax noise,
    # yielding a random deviation from `policy`.
    noisy_n_policy = noisy_policy.NoisyPolicy(policy, player_id=0, alpha=1)
    expected_value_noise = get_expected_value(
        game, noisy_n_policy, num_sample, player=0)
    approximate_regret = expected_value_noise - expected_value_policy
    worse_regret = max(worse_regret, approximate_regret)
  return worse_regret, time.time() - time_tick
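# `get_expected_value` is called above but not defined in this section. A
# minimal Monte-Carlo rollout sketch consistent with those call sites
# (game, policy, num_sample, optional player) could look like the following;
# the sampling scheme is an assumption, not the original implementation.
import numpy as np  # Assumed available at module level.

def get_expected_value_sketch(game, policy, num_sample, player=0):
  total_return = 0.0
  for _ in range(num_sample):
    state = game.new_initial_state()
    while not state.is_terminal():
      if state.is_chance_node():
        outcomes, probs = zip(*state.chance_outcomes())
      else:
        outcomes, probs = zip(*policy.action_probabilities(state).items())
      action = np.random.choice(outcomes, p=probs)
      state.apply_action(action)
    total_return += state.returns()[player]
  return total_return / num_sample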
def copy_with_noise(self, alpha=0.0, beta=0.0):
  """Copies this policy and adds noise, making it a Noisy Best Response.

  The policy's new probabilities P' on each state s become

    P'(s) = alpha * epsilon + (1 - alpha) * P(s),

  with P the former policy's probabilities and epsilon ~ Softmax(beta *
  Uniform).

  Args:
    alpha: Weight of the noise component in the mixture.
    beta: Inverse temperature of the softmax applied to the uniform noise.

  Returns:
    Noisy copy of this best response.
  """
  # `NoisyPolicy` is constructed as (policy, player_id, alpha, beta) elsewhere
  # in this section, so alpha and beta must be passed by keyword: the original
  # positional call, NoisyPolicy(self, alpha, beta, self.all_states), bound
  # alpha to player_id and beta to alpha.
  return noisy_policy.NoisyPolicy(self, alpha=alpha, beta=beta)
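# A standalone numeric sketch of the mixture described in the docstring
# above: P'(s) = alpha * epsilon + (1 - alpha) * P(s), with
# epsilon ~ Softmax(beta * Uniform). The function name is illustrative.
def noise_mixture_sketch(probabilities, alpha, beta):
  u = np.random.uniform(size=len(probabilities))
  exp_u = np.exp(beta * u)
  epsilon = exp_u / exp_u.sum()  # Softmax over the uniform draws.
  # A convex combination of two distributions is still a distribution.
  return alpha * epsilon + (1 - alpha) * np.asarray(probabilities)

# For example, noise_mixture_sketch([0.5, 0.5], alpha=0.5, beta=10.) yields a
# perturbed distribution that still sums to 1, while alpha=0 returns the
# original probabilities unchanged.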