def test_cpp_and_python_value_are_identical(self, game_name, num_players):
  game = pyspiel.load_game(game_name, {"players": num_players})
  test_policy = policy.TabularPolicy(game)
  root_state = game.new_initial_state()
  for i_player in range(num_players):
    best_resp_py_backend = best_response.BestResponsePolicy(
        game, i_player, test_policy)
    best_resp_cpp_backend = best_response.CPPBestResponsePolicy(
        game, i_player, test_policy)
    value_py_backend = best_resp_py_backend.value(root_state)
    value_cpp_backend = best_resp_cpp_backend.value(root_state)
    self.assertTrue(np.allclose(value_py_backend, value_cpp_backend))
def __init__(self,
             best_response_backend='cpp',
             game=None,
             all_states=None,
             state_to_information_state=None,
             **kwargs):
  """Init function for the BestResponseOracle.

  Args:
    best_response_backend: A string (either 'cpp' or 'py'), specifying the
      best response backend to use (C++ or Python, respectively). The cpp
      backend should generally be preferred, as it is significantly faster.
    game: The game on which the optimization process takes place.
    all_states: The result of calling get_all_states.get_all_states. Cached
      for improved performance.
    state_to_information_state: A dict mapping str(state) to
      state.information_state for every state in the game. Cached for
      improved performance.
    **kwargs: Additional arguments, passed through to the parent class
      constructor.
  """
  super(BestResponseOracle, self).__init__(**kwargs)
  self.best_response_backend = best_response_backend
  if self.best_response_backend == 'cpp':
    # Should compute all_states and state_to_information_state only once in
    # the program, as caching them speeds up TabularBestResponse
    # tremendously.
    self.all_states, self.state_to_information_state = (
        utils.compute_states_and_info_states_if_none(
            game, all_states, state_to_information_state))
    policy = openspiel_policy.UniformRandomPolicy(game)
    policy_to_dict = policy_utils.policy_to_dict(
        policy, game, self.all_states, self.state_to_information_state)
    # pylint: disable=g-complex-comprehension
    # Cache a TabularBestResponse per player, as their construction is
    # costly.
    # TODO(b/140426861): Use a single best-responder once the code supports
    # multiple player ids.
    self.best_response_processors = [
        pyspiel.TabularBestResponse(game, best_responder_id, policy_to_dict)
        for best_responder_id in range(game.num_players())
    ]
    self.best_responders = [
        best_response.CPPBestResponsePolicy(
            game, i_player, policy, self.all_states,
            self.state_to_information_state,
            self.best_response_processors[i_player])
        for i_player in range(game.num_players())
    ]
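# A minimal construction sketch (hedged: the module path follows the
# OpenSpiel psro_v2 layout, and kuhn_poker is just an illustrative game).
# The state caches are optional here: when left as None, the oracle
# computes them once itself, which is exactly the caching described in the
# comment above.
import pyspiel
from open_spiel.python.algorithms.psro_v2 import best_response_oracle

game = pyspiel.load_game("kuhn_poker")
oracle = best_response_oracle.BestResponseOracle(
    best_response_backend='cpp', game=game)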
def nash_conv(game, policy, return_only_nash_conv=True, use_cpp_br=False):
  r"""Returns a measure of closeness to Nash for a policy in the game.

  See https://arxiv.org/pdf/1711.00832.pdf for the NashConv definition.

  Args:
    game: An open_spiel game, e.g. kuhn_poker.
    policy: A `policy.Policy` object. This policy should depend only on the
      information state available to the current player, but this is not
      enforced.
    return_only_nash_conv: Whether to only return the NashConv value, or a
      namedtuple containing additional statistics. Prefer using `False`, as
      we hope to change the default to that value.
    use_cpp_br: If True, compute the best responses in C++.

  Returns:
    If `return_only_nash_conv` is True, the NashConv value alone; otherwise
    an object with the following attributes:
    - player_improvements: A `[num_players]` numpy array of the improvement
      for players (i.e. value_player_p_versus_BR - value_player_p).
    - nash_conv: The sum over all players of the improvements in value that
      each player could obtain by unilaterally changing their strategy,
      i.e. sum(player_improvements).
  """
  root_state = game.new_initial_state()
  if use_cpp_br:
    best_response_values = np.array([
        pyspiel_best_response.CPPBestResponsePolicy(
            game, best_responder, policy).value(root_state)
        for best_responder in range(game.num_players())
    ])
  else:
    best_response_values = np.array([
        pyspiel_best_response.BestResponsePolicy(
            game, best_responder, policy).value(root_state)
        for best_responder in range(game.num_players())
    ])
  on_policy_values = _state_values(root_state, game.num_players(), policy)
  player_improvements = best_response_values - on_policy_values
  nash_conv_ = sum(player_improvements)
  if return_only_nash_conv:
    return nash_conv_
  else:
    return _NashConvReturn(
        nash_conv=nash_conv_, player_improvements=player_improvements)
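# Usage sketch for nash_conv (hedged: assumes the standard OpenSpiel import
# paths below, with this function living in the exploitability module).
# Measures how far the uniform random policy is from a Nash equilibrium in
# Kuhn poker; passing return_only_nash_conv=False also yields the
# per-player improvements, as the docstring recommends.
import pyspiel
from open_spiel.python import policy as openspiel_policy
from open_spiel.python.algorithms import exploitability

game = pyspiel.load_game("kuhn_poker")
uniform_policy = openspiel_policy.UniformRandomPolicy(game)
stats = exploitability.nash_conv(
    game, uniform_policy, return_only_nash_conv=False)
print(stats.nash_conv)            # Sum of all players' improvements.
print(stats.player_improvements)  # One entry per player.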
def test_cpp_and_python_best_response_are_identical(self, game_name,
                                                    num_players):
  game = pyspiel.load_game(game_name, {"players": num_players})
  test_policy = policy.TabularPolicy(game)
  for i_player in range(num_players):
    best_resp_py_backend = best_response.BestResponsePolicy(
        game, i_player, test_policy)
    best_resp_cpp_backend = best_response.CPPBestResponsePolicy(
        game, i_player, test_policy)
    for state in best_resp_cpp_backend.all_states.values():
      if i_player == state.current_player():
        py_dict = best_resp_py_backend.action_probabilities(state)
        cpp_dict = best_resp_cpp_backend.action_probabilities(state)
        # We compare in both directions, because actions with probability
        # 0. do not necessarily appear in both dicts.
        for key, value in py_dict.items():
          self.assertEqual(value, cpp_dict.get(key, 0.))
        for key, value in cpp_dict.items():
          self.assertEqual(value, py_dict.get(key, 0.))
def exploitability(game, policy):
  """Returns the exploitability of the policy in the game.

  This is implemented only for two-player constant-sum games, and in that
  case is equivalent to NashConv / num_players. Prefer using `nash_conv`.

  Args:
    game: An open_spiel game, e.g. kuhn_poker.
    policy: A `policy.Policy` object. This policy should depend only on the
      information state available to the current player, but this is not
      enforced.

  Returns:
    The value that this policy achieves when playing against the worst-case
    non-cheating opponent, averaged across both starting positions. It has
    a minimum of zero (assuming the supplied policy is non-cheating), and
    this bound is achievable in a two-player game.

  Raises:
    ValueError: If the game is not a two-player constant-sum turn-based
      game.
  """
  if game.num_players() != 2:
    raise ValueError("Game must be a 2-player game")
  game_info = game.get_type()
  if game_info.dynamics != pyspiel.GameType.Dynamics.SEQUENTIAL:
    raise ValueError("The game must be turn-based, not {}".format(
        game_info.dynamics))
  if game_info.utility not in (pyspiel.GameType.Utility.ZERO_SUM,
                               pyspiel.GameType.Utility.CONSTANT_SUM):
    raise ValueError("The game must be constant- or zero-sum, not {}".format(
        game_info.utility))
  root_state = game.new_initial_state()
  nash_conv_value = (
      sum(
          pyspiel_best_response.CPPBestResponsePolicy(
              game, best_responder, policy).value(root_state)
          for best_responder in range(game.num_players())) -
      game.utility_sum())
  return nash_conv_value / game.num_players()
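# Companion sketch for exploitability (same hedged import assumptions as
# the nash_conv example above): for a two-player constant-sum game such as
# Kuhn poker, this equals nash_conv(game, policy) / 2.
import pyspiel
from open_spiel.python import policy as openspiel_policy
from open_spiel.python.algorithms import exploitability

game = pyspiel.load_game("kuhn_poker")
uniform_policy = openspiel_policy.UniformRandomPolicy(game)
print(exploitability.exploitability(game, uniform_policy))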
def __call__(self,
             game,
             training_parameters,
             strategy_sampler=utils.sample_strategy,
             using_joint_strategies=False,
             **oracle_specific_execution_kwargs):
  """Call method for oracle, returns best responses for training_parameters.

  Args:
    game: The game on which the optimization process takes place.
    training_parameters: List of lists of dicts: one list per player, one
      dict per selected agent in the pool for each player, each dict
      containing the following fields:
      - policy: The policy from which to start training.
      - total_policies: A list of all policy.Policy strategies used for
        training, including the one for the current player. Either
        marginalized or joint strategies are accepted.
      - current_player: Integer representing the current player.
      - probabilities_of_playing_policies: A list of arrays representing,
        per player, the probabilities of playing each policy in
        total_policies for the same player.
    strategy_sampler: Callable that samples strategies from
      `total_policies` using `probabilities_of_playing_policies`. It only
      samples one joint "action" for all players. Implemented so that joint
      probabilities of actions can be taken into account.
    using_joint_strategies: Whether the meta-strategies sent are joint
      (True) or marginalized (False).
    **oracle_specific_execution_kwargs: Other arguments, present for
      compatibility purposes. They can, for example, indicate whether to
      rectify training.

  Returns:
    A list of lists of OpenSpiel Policy objects representing the expected
    best responses, following the same structure as training_parameters.
  """
  new_policies = []
  for player_parameters in training_parameters:
    player_policies = []
    for params in player_parameters:
      current_player = params['current_player']
      total_policies = params['total_policies']
      probabilities_of_playing_policies = params[
          'probabilities_of_playing_policies']
      if using_joint_strategies:
        aggr_policy = utils.aggregate_joint_policies(
            game, utils.marginal_to_joint(total_policies),
            probabilities_of_playing_policies.reshape(-1))
      else:
        aggr_policy = utils.aggregate_policies(
            game, total_policies, probabilities_of_playing_policies)
      # This takes an aggregate policy as input and computes a best
      # response for current_player at the applicable information states by
      # recursing through the game tree. At information states involving
      # other players or chance, the aggr_policy is used to compute the
      # expected value, such that a best response for current_player can be
      # computed.
      if self.best_response_backend == 'py':
        best_resp = best_response.BestResponsePolicy(game, current_player,
                                                     aggr_policy)
      else:
        self.best_response_processors[current_player].set_policy(
            policy_utils.policy_to_dict(aggr_policy, game, self.all_states,
                                        self.state_to_information_state))
        self.best_responders[current_player] = (
            best_response.CPPBestResponsePolicy(
                game, current_player, aggr_policy, self.all_states,
                self.state_to_information_state,
                self.best_response_processors[current_player]))
        best_resp = self.best_responders[current_player]
      player_policies.append(best_resp)
    new_policies.append(player_policies)
  return new_policies
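# Illustrative (hypothetical) invocation sketch for the oracle's __call__:
# one singleton pool per player, each currently holding only the uniform
# random policy, played with probability 1. The dict fields mirror the
# docstring above; the concrete values and the 'py' backend choice are
# assumptions for illustration only. Note that the 'policy' field is not
# read by this exact-best-response oracle.
import numpy as np
import pyspiel
from open_spiel.python import policy as openspiel_policy
from open_spiel.python.algorithms.psro_v2 import best_response_oracle

game = pyspiel.load_game("kuhn_poker")
oracle = best_response_oracle.BestResponseOracle(
    best_response_backend='py', game=game)
total_policies = [[openspiel_policy.UniformRandomPolicy(game)]
                  for _ in range(game.num_players())]
training_parameters = [[{
    'policy': total_policies[player][0],
    'total_policies': total_policies,
    'current_player': player,
    'probabilities_of_playing_policies': [
        np.array([1.0]) for _ in range(game.num_players())
    ],
}] for player in range(game.num_players())]
# new_policies[p][0] is a best response for player p against the
# aggregated strategies of the other players.
new_policies = oracle(game, training_parameters)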