Example #1
    def test_cpp_and_python_implementations_are_identical(self, game_name):
        game = pyspiel.load_game(game_name)

        python_policy = policy.UniformRandomPolicy(game)
        pyspiel_policy = pyspiel.UniformRandomPolicy(game)

        all_states = get_all_states.get_all_states(
            game,
            depth_limit=-1,
            include_terminals=False,
            include_chance_states=False,
            to_string=lambda s: s.information_state_string())

        for current_player in range(game.num_players()):
            python_br = best_response.BestResponsePolicy(
                game, current_player, python_policy)
            cpp_br = pyspiel.TabularBestResponse(
                game, current_player,
                pyspiel_policy).get_best_response_policy()

            for state in all_states.values():
                if state.current_player() != current_player:
                    continue

                # TODO(b/141737795): Decide what to do about this.
                self.assertEqual(
                    python_br.action_probabilities(state), {
                        a: prob
                        for a, prob in cpp_br.action_probabilities(
                            state).items() if prob != 0
                    })
Example #2
def best_response(game, policy, player_id):
  """Returns information about the specified player's best response.

  Given a game and a policy for every player, computes for a single player their
  best unilateral strategy. Returns the value improvement that player would
  get, the action they should take in each information state, and the value
  of each state when following their unilateral policy.

  Args:
    game: An open_spiel game, e.g. kuhn_poker
    policy: A `policy.Policy` object. This policy should depend only on the
      information state available to the current player, but this is not
      enforced.
    player_id: The integer id of a player in the game for whom the best response
      will be computed.

  Returns:
    A dictionary of values, with keys:
      best_response_action: The best unilateral strategy for `player_id` as a
        map from infostatekey to action_id.
      best_response_state_value: The value obtained for `player_id` when
        unilaterally switching strategy, for each state.
      best_response_value: The value obtained for `player_id` when unilaterally
        switching strategy.
      info_sets: A dict of info sets, mapping info state key to a list of
        `(state, counterfactual_reach_prob)` pairs.
      nash_conv: `best_response_value - on_policy_value`
      on_policy_value: The value for `player_id` when all players follow the
        policy
      on_policy_values: The value for each player when all players follow the
        policy
  """
  root_state = game.new_initial_state()
  br = pyspiel_best_response.BestResponsePolicy(game, player_id, policy,
                                                root_state)
  on_policy_values = _state_values(root_state, game.num_players(), policy)
  best_response_value = br.value(root_state)

  # Get best response action for unvisited states
  for infostate in set(br.infosets) - set(br.cache_best_response_action):
    br.best_response_action(infostate)

  return {
      "best_response_action": br.cache_best_response_action,
      "best_response_state_value": br.cache_value,
      "best_response_value": best_response_value,
      "info_sets": br.infosets,
      "nash_conv": best_response_value - on_policy_values[player_id],
      "on_policy_value": on_policy_values[player_id],
      "on_policy_values": on_policy_values,
  }
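A minimal usage sketch for the `best_response` function above. The import paths are assumptions based on OpenSpiel's usual layout (`open_spiel.python.algorithms.exploitability` for this module, `open_spiel.python.policy` for `UniformRandomPolicy`):

# Sketch only: module paths are assumed, not taken from the examples above.
import pyspiel
from open_spiel.python import policy
from open_spiel.python.algorithms import exploitability

game = pyspiel.load_game("kuhn_poker")
uniform = policy.UniformRandomPolicy(game)
info = exploitability.best_response(game, uniform, player_id=0)
# Keys are those documented in the docstring above.
print(info["best_response_value"], info["nash_conv"])
print(info["best_response_action"])  # maps infostate key -> action id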
Example #3
    def test_cpp_and_python_value_are_identical(self, game_name, num_players):
        game = pyspiel.load_game(game_name, {"players": num_players})
        test_policy = policy.TabularPolicy(game)
        root_state = game.new_initial_state()
        for i_player in range(num_players):
            best_resp_py_backend = best_response.BestResponsePolicy(
                game, i_player, test_policy)
            best_resp_cpp_backend = best_response.CPPBestResponsePolicy(
                game, i_player, test_policy)

            value_py_backend = best_resp_py_backend.value(root_state)
            value_cpp_backend = best_resp_cpp_backend.value(root_state)

            self.assertTrue(np.allclose(value_py_backend, value_cpp_backend))
Example #4
  def test_best_response_is_a_policy(self):
    game = pyspiel.load_game("kuhn_poker")
    test_policy = policy.UniformRandomPolicy(game)
    br = best_response.BestResponsePolicy(game, policy=test_policy, player_id=0)
    expected_policy = {
        "0": 1,  # Bet in case opponent folds when winning
        "1": 1,  # Bet in case opponent folds when winning
        "2": 0,  # Both equally good (we return the lowest action)
        # Some of these will never happen under the best-response policy,
        # but we have computed best-response actions anyway.
        "0pb": 0,  # Fold - we're losing
        "1pb": 1,  # Call - we're 50-50
        "2pb": 1,  # Call - we've won
    }
    self.assertEqual(
        expected_policy,
        {key: br.best_response_action(key) for key in expected_policy.keys()})
Example #5
  def test_best_response_tic_tac_toe_value_is_consistent(self):
    # This test was failing because of use of str(state) in the best response,
    # which is imperfect recall. We now use state.history_str() throughout.

    # Choose a policy at random; not the uniform random policy.
    game = pyspiel.load_game("tic_tac_toe")
    pi = policy.TabularPolicy(game)
    rng = np.random.RandomState(1234)
    pi.action_probability_array[:] = rng.rand(*pi.legal_actions_mask.shape)
    pi.action_probability_array *= pi.legal_actions_mask
    pi.action_probability_array /= np.sum(
        pi.action_probability_array, axis=1, keepdims=True)

    # Compute a best response and verify the best response value is consistent.
    br = best_response.BestResponsePolicy(game, 1, pi)
    self.assertAlmostEqual(
        expected_game_score.policy_value(game.new_initial_state(), [pi, br])[1],
        br.value(game.new_initial_state()))
Example #6
def nash_conv(game, policy, return_only_nash_conv=True, use_cpp_br=False):
  r"""Returns a measure of closeness to Nash for a policy in the game.

  See https://arxiv.org/pdf/1711.00832.pdf for the NashConv definition.

  Args:
    game: An open_spiel game, e.g. kuhn_poker
    policy: A `policy.Policy` object. This policy should depend only on the
      information state available to the current player, but this is not
      enforced.
    return_only_nash_conv: Whether to only return the NashConv value, or a
      namedtuple containing additional statistics. Prefer using `False`, as we
      hope to change the default to that value.
    use_cpp_br: If True, compute the best responses in C++.

  Returns:
    If `return_only_nash_conv` is True, the scalar NashConv value. Otherwise,
    an object with the following attributes:
    - player_improvements: A `[num_players]` numpy array of the improvement
      for players (i.e. value_player_p_versus_BR - value_player_p).
    - nash_conv: The sum over all players of the improvements in value that each
      player could obtain by unilaterally changing their strategy, i.e.
      sum(player_improvements).
  """
  root_state = game.new_initial_state()
  if use_cpp_br:
    best_response_values = np.array([
        pyspiel_best_response.CPPBestResponsePolicy(
            game, best_responder, policy).value(root_state)
        for best_responder in range(game.num_players())
    ])
  else:
    best_response_values = np.array([
        pyspiel_best_response.BestResponsePolicy(
            game, best_responder, policy).value(root_state)
        for best_responder in range(game.num_players())
    ])
  on_policy_values = _state_values(root_state, game.num_players(), policy)
  player_improvements = best_response_values - on_policy_values
  nash_conv_ = sum(player_improvements)
  if return_only_nash_conv:
    return nash_conv_
  else:
    return _NashConvReturn(
        nash_conv=nash_conv_, player_improvements=player_improvements)
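A minimal usage sketch for `nash_conv`, again with assumed import paths; passing `return_only_nash_conv=False` yields the namedtuple described in the docstring:

# Sketch only: module paths are assumed, not taken from the examples above.
import pyspiel
from open_spiel.python import policy
from open_spiel.python.algorithms import exploitability

game = pyspiel.load_game("kuhn_poker")
uniform = policy.UniformRandomPolicy(game)
result = exploitability.nash_conv(game, uniform, return_only_nash_conv=False)
print(result.nash_conv)            # sum of per-player improvements
print(result.player_improvements)  # numpy array of shape [num_players]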
Example #7
    def test_best_response_prisoner_dilemma_simultaneous_game(self):
        """Test best response computation for simultaneous game."""
        game = pyspiel.load_game(
            "python_iterated_prisoners_dilemma(max_game_length=5)")
        test_policy = policy.UniformRandomPolicy(game)
        br = best_response.BestResponsePolicy(game,
                                              policy=test_policy,
                                              player_id=0)

        # Best policy is always to defect; we verify this for a handful of states
        self.assertEqual(br.best_response_action("us:CCCC op:CCCC"), 1)
        self.assertEqual(br.best_response_action("us:DDDD op:CCCC"), 1)
        self.assertEqual(br.best_response_action("us:CDCD op:DCDC"), 1)
        self.assertEqual(br.best_response_action("us:CCCC op:DDDD"), 1)

        # Expected value per turn = 5.5 (avg of 1 and 10)
        # Expected game length = sum(0.875**i for i in range(5)) = 3.896728515625
        # Game value = 5.5 * 3.896728515625 = 21.4320068359375
        self.assertAlmostEqual(br.value(game.new_initial_state()),
                               21.4320068359375)
Example #8
  def test_cpp_and_python_best_response_are_identical(self, game_name,
                                                      num_players):
    game = pyspiel.load_game(game_name, {"players": num_players})

    test_policy = policy.TabularPolicy(game)
    for i_player in range(num_players):
      best_resp_py_backend = best_response.BestResponsePolicy(
          game, i_player, test_policy)
      best_resp_cpp_backend = best_response.CPPBestResponsePolicy(
          game, i_player, test_policy)
      for state in best_resp_cpp_backend.all_states.values():
        if i_player == state.current_player():
          py_dict = best_resp_py_backend.action_probabilities(state)
          cpp_dict = best_resp_cpp_backend.action_probabilities(state)

          # We compare in both directions because actions with probability 0.
          # do not necessarily appear in both dictionaries.
          for key, value in py_dict.items():
            self.assertEqual(value, cpp_dict.get(key, 0.))
          for key, value in cpp_dict.items():
            self.assertEqual(value, py_dict.get(key, 0.))
Example #9
  def test_best_response_oshi_zumo_simultaneous_game(self):
    """Test best response computation for simultaneous game."""
    game = pyspiel.load_game("oshi_zumo(horizon=5,coins=5)")
    test_policy = policy.UniformRandomPolicy(game)
    br = best_response.BestResponsePolicy(
        game, policy=test_policy, player_id=0)
    expected_policy = {
        "0, 0, 0, 3, 0, 2": 1,
        "0, 0, 1, 4, 3, 1": 0,
        "0, 0, 4, 1, 0, 2, 0, 2": 1,
        "0, 1, 1, 0, 1, 4": 1,
        "0, 1, 4, 1, 0, 0, 0, 1": 1,
        "0, 2, 2, 2, 3, 0, 0, 0": 0,
        "0, 5, 0, 0, 0, 0, 3, 0": 1
    }
    self.assertEqual(
        expected_policy,
        {key: br.best_response_action(key) for key in expected_policy})
    self.assertAlmostEqual(
        br.value(game.new_initial_state()), 0.856471051954)
Example #10
def exploitability(game, policy):
  """Returns the exploitability of the policy in the game.

  This is implemented only for 2-player, constant-sum games, and in that case
  it is equivalent to NashConv / num_players. Prefer using `nash_conv`.

  Args:
    game: An open_spiel game, e.g. kuhn_poker
    policy: A `policy.Policy` object. This policy should depend only on the
      information state available to the current player, but this is not
      enforced.

  Returns:
    The value that this policy achieves when playing against the worst-case
    non-cheating opponent, averaged across both starting positions. It has a
    minimum of zero (assuming the supplied policy is non-cheating) and
    this bound is achievable in a 2p game.

  Raises:
    ValueError if the game is not a two-player constant-sum turn-based game.
  """
  if game.num_players() != 2:
    raise ValueError("Game must be a 2-player game")
  game_info = game.get_type()
  if game_info.dynamics != pyspiel.GameType.Dynamics.SEQUENTIAL:
    raise ValueError("The game must be turn-based, not {}".format(
        game_info.dynamics))
  if game_info.utility not in (pyspiel.GameType.Utility.ZERO_SUM,
                               pyspiel.GameType.Utility.CONSTANT_SUM):
    raise ValueError(
        "The game must be constant- or zero-sum, not {}".format(
            game_info.utility))
  root_state = game.new_initial_state()
  nash_conv_value = (sum(
      pyspiel_best_response.BestResponsePolicy(
          game, best_responder, policy, root_state).value(root_state)
      for best_responder in range(game.num_players())) - game.utility_sum())
  return nash_conv_value / game.num_players()
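A minimal usage sketch for `exploitability`; in a two-player constant-sum game such as Kuhn poker this is NashConv divided by the number of players. Import paths are assumptions:

# Sketch only: module paths are assumed, not taken from the examples above.
import pyspiel
from open_spiel.python import policy
from open_spiel.python.algorithms import exploitability

game = pyspiel.load_game("kuhn_poker")
uniform = policy.UniformRandomPolicy(game)
expl = exploitability.exploitability(game, uniform)
# For this 2-player zero-sum game, expl equals nash_conv(game, uniform) / 2.
print(expl)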
Example #11
    def __call__(self,
                 game,
                 training_parameters,
                 strategy_sampler=utils.sample_strategy,
                 using_joint_strategies=False,
                 **oracle_specific_execution_kwargs):
        """Call method for oracle, returns best responses for training_parameters.

        Args:
          game: The game on which the optimization process takes place.
          training_parameters: List of lists of dicts: one list per player, one
            dict per selected agent in that player's pool, each dictionary
            containing the following fields:
            - policy: the policy from which to start training.
            - total_policies: A list of all policy.Policy strategies used for
              training, including the one for the current player. Either
              marginalized or joint strategies are accepted.
            - current_player: Integer representing the current player.
            - probabilities_of_playing_policies: A list of arrays representing,
              per player, the probabilities of playing each policy in
              total_policies for the same player.
          strategy_sampler: Callable that samples strategies from
            `total_policies` using `probabilities_of_playing_policies`. It
            samples only one joint "action" for all players, so that joint
            probabilities of actions can be taken into account.
          using_joint_strategies: Whether the meta-strategies passed in are
            joint (True) or marginalized (False).
          **oracle_specific_execution_kwargs: Other arguments, kept for
            compatibility purposes. Can for example indicate whether to use
            rectified training or not.

        Returns:
          A list of lists of OpenSpiel Policy objects representing the expected
          best responses, following the same structure as training_parameters.
        """
        new_policies = []
        for player_parameters in training_parameters:
            player_policies = []
            for params in player_parameters:
                current_player = params['current_player']
                total_policies = params['total_policies']
                probabilities_of_playing_policies = params[
                    'probabilities_of_playing_policies']
                if using_joint_strategies:
                    aggr_policy = utils.aggregate_joint_policies(
                        game, utils.marginal_to_joint(total_policies),
                        probabilities_of_playing_policies.reshape(-1))
                else:
                    aggr_policy = utils.aggregate_policies(
                        game, total_policies,
                        probabilities_of_playing_policies)

                # This takes as input an aggregate policy, and computes a best response
                # for current_player at the applicable information states by recursing
                # through the game tree. At information states involving other players
                # or chance, the aggr_policy is used to compute the expected value, such
                # that a best response for current_player can be computed.
                if self.best_response_backend == 'py':
                    best_resp = best_response.BestResponsePolicy(
                        game, current_player, aggr_policy)
                else:
                    self.best_response_processors[current_player].set_policy(
                        policy_utils.policy_to_dict(
                            aggr_policy, game, self.all_states,
                            self.state_to_information_state))

                    self.best_responders[current_player] = (
                        best_response.CPPBestResponsePolicy(
                            game, current_player, aggr_policy, self.all_states,
                            self.state_to_information_state,
                            self.best_response_processors[current_player]))
                    best_resp = self.best_responders[current_player]
                player_policies.append(best_resp)
            new_policies.append(player_policies)
        return new_policies
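A hypothetical sketch of the nested `training_parameters` structure described in the docstring above, for a two-player game with a single uniform policy in each player's pool. The variable names and the commented-out oracle call are illustrative only, not part of the API shown here:

# Hypothetical illustration of the training_parameters layout; names are
# illustrative only.
import numpy as np
import pyspiel
from open_spiel.python import policy

game = pyspiel.load_game("kuhn_poker")
num_players = game.num_players()
# One pool of policy.Policy objects per player (marginal strategies here).
total_policies = [[policy.UniformRandomPolicy(game)] for _ in range(num_players)]
# Per player, the probabilities of playing each policy in that player's pool.
probabilities = [np.array([1.0]) for _ in range(num_players)]

training_parameters = [
    [{
        "policy": total_policies[player][0],
        "total_policies": total_policies,
        "current_player": player,
        "probabilities_of_playing_policies": probabilities,
    }]
    for player in range(num_players)
]
# new_policies = oracle(game, training_parameters)  # oracle: a best-response oracle instance.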