Example #1
def test_cpp_python_cfr_kuhn(self):
    game = pyspiel.load_game("kuhn_poker")
    solver = pyspiel.CFRSolver(game)
    for _ in range(100):
        solver.evaluate_and_update_policy()
    pyspiel_average_policy = solver.tabular_average_policy()
    cpp_nash_conv = pyspiel.nash_conv(game, pyspiel_average_policy)
    python_policy = policy.pyspiel_policy_to_python_policy(
        game, pyspiel_average_policy)
    python_nash_conv = exploitability.nash_conv(game, python_policy)
    self.assertAlmostEqual(python_nash_conv, cpp_nash_conv)
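The test above converts a C++ tabular policy into a Python policy; Example #2 below uses the reverse helper, policy.python_policy_to_pyspiel_policy. The following minimal sketch combines the two directions into a round trip; it is not part of the original test, and it assumes the standard OpenSpiel imports shown.

# Round-trip sketch (assumption: standard OpenSpiel imports; identifiers below
# are illustrative and not taken from the original test).
import pyspiel
from open_spiel.python import policy

game = pyspiel.load_game("kuhn_poker")
python_tabular = policy.TabularPolicy(game)  # uniform-random tabular policy
pyspiel_tabular = policy.python_policy_to_pyspiel_policy(python_tabular)
roundtripped = policy.pyspiel_policy_to_python_policy(game, pyspiel_tabular)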
Example #2
    def test_cfr_cce_ce_dist_goofspiel(self):
        """Copy of the TestCCEDistCFRGoofSpiel in corr_dist_test.cc."""
        game = pyspiel.load_game(
            "turn_based_simultaneous_game(game=goofspiel(num_cards=3,points_order="
            "descending,returns_type=total_points))")
        for num_iterations in [1, 10, 100]:
            policies = []
            cfr_solver = cfr.CFRSolver(game)
            for _ in range(num_iterations):
                cfr_solver.evaluate_and_update_policy()
                policies.append(
                    policy.python_policy_to_pyspiel_policy(
                        cfr_solver.current_policy()))
            mu = pyspiel.uniform_correlation_device(policies)
            cce_dist_info = pyspiel.cce_dist(game, mu)
            print(
                "goofspiel, cce test num_iters: {}, cce_dist: {}, per player: {}"
                .format(num_iterations, cce_dist_info.dist_value,
                        cce_dist_info.deviation_incentives))
            # Try converting one of the BR policies:
            _ = policy.pyspiel_policy_to_python_policy(
                game, cce_dist_info.best_response_policies[0])

            # Assemble the same correlation device manually, both as an example of
            # how to build non-uniform distributions over policies and to check that
            # the Python bindings for lists of tuples work properly.
            uniform_prob = 1.0 / len(policies)
            mu2 = [(uniform_prob, policy) for policy in policies]
            cce_dist_info2 = pyspiel.cce_dist(game, mu2)
            self.assertAlmostEqual(cce_dist_info2.dist_value,
                                   sum(cce_dist_info.deviation_incentives))
            # Test the CEDist function too, why not. Disable the exact one, as it
            # takes too long for a test.
            # ce_dist_info = pyspiel.ce_dist(game, pyspiel.determinize_corr_dev(mu))
            ce_dist_info = pyspiel.ce_dist(
                game, pyspiel.sampled_determinize_corr_dev(mu, 100))
            print(
                "goofspiel, ce test num_iters: {}, ce_dist: {}, per player: {}"
                .format(num_iterations, ce_dist_info.dist_value,
                        ce_dist_info.deviation_incentives))
            print("number of conditional best responses per player:")
            for p in range(game.num_players()):
                print("  player {}, num: {}".format(
                    p,
                    len(ce_dist_info.conditional_best_response_policies[p])))
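The comment in the example above mentions non-uniform correlation devices. Below is a hedged sketch of one that weights later CFR iterates more heavily; the weighting scheme is illustrative only and not part of the original test, and the snippet reuses game and policies from the loop body above.

# Sketch only: a non-uniform correlation device over the collected policies.
# Any non-negative weights summing to 1 are valid; these are illustrative.
weights = [float(i) for i in range(1, len(policies) + 1)]
total = sum(weights)
mu_nonuniform = [(w / total, p) for w, p in zip(weights, policies)]
cce_info_nonuniform = pyspiel.cce_dist(game, mu_nonuniform)
print("non-uniform cce_dist:", cce_info_nonuniform.dist_value)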
Example #3
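Example #3 is a JPSRO-style best-response oracle that relies on module-level names from its enclosing file. The sketch below shows the assumed context; the tolerance values are illustrative placeholders rather than values taken from the original module, while the lists of valid strategies follow from the branches of the function itself.

# Assumed module-level context for find_best_response (tolerance values are
# illustrative placeholders, not taken from the original module).
import itertools

import numpy as np

import pyspiel
from open_spiel.python import policy

DIST_TOL = 1e-8  # probabilities below this are treated as zero
GAP_TOL = 1e-8   # deviation incentives below this are zeroed
UPDATE_PLAYERS_STRATEGY = ["all", "cycle", "random"]  # valid update strategies
BRS = ["cce", "ce"]  # valid target equilibria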
def find_best_response(
    game, meta_dist, meta_game, iteration, joint_policies,
    target_equilibrium, update_players_strategy):
  """Returns new best response policies."""
  num_players = meta_game.shape[0]
  per_player_num_policies = meta_dist.shape[:]

  # Player update strategy.
  if update_players_strategy == "all":
    players = list(range(num_players))
  elif update_players_strategy == "cycle":
    players = [iteration % num_players]
  elif update_players_strategy == "random":
    players = [np.random.randint(0, num_players)]
  else:
    raise ValueError(
        "update_players_strategy must be a valid player update strategy: "
        "%s. Received: %s" % (UPDATE_PLAYERS_STRATEGY, update_players_strategy))

  # Find best response.
  per_player_new_policies = []
  per_player_deviation_incentives = []

  if target_equilibrium == "cce":
    for player in range(num_players):
      if player in players:
        # Enumerate opponents' joint policies; the player's own index is fixed
        # to a single (last) policy, since only the opponents' marginal is
        # needed for the best response.
        joint_policy_ids = itertools.product(*[
            (np_-1,) if p_ == player else range(np_) for p_, np_
            in enumerate(per_player_num_policies)])
        joint_policies_slice = [
            joint_policies[jpid] for jpid in joint_policy_ids]
        # Marginalize the meta-distribution over the player's own axis, clip
        # numerical noise, and renormalize.
        meta_dist_slice = np.sum(meta_dist, axis=player)
        meta_dist_slice[meta_dist_slice < DIST_TOL] = 0.0
        meta_dist_slice[meta_dist_slice > 1.0] = 1.0
        meta_dist_slice /= np.sum(meta_dist_slice)
        meta_dist_slice = meta_dist_slice.flat

        mu = [(p, mp) for mp, p in zip(joint_policies_slice, meta_dist_slice)
              if p > 0]
        info = pyspiel.cce_dist(game, mu, player, prob_cut_threshold=0.0)

        new_policy = policy.pyspiel_policy_to_python_policy(
            game, info.best_response_policies[0], players=(player,))
        on_policy_value = np.sum(meta_game[player] * meta_dist)
        deviation_incentive = max(
            info.best_response_values[0] - on_policy_value, 0)
        if deviation_incentive < GAP_TOL:
          deviation_incentive = 0.0

        per_player_new_policies.append([new_policy])
        per_player_deviation_incentives.append([deviation_incentive])
      else:
        per_player_new_policies.append([])
        per_player_deviation_incentives.append([])

  elif target_equilibrium == "ce":
    for player in range(num_players):
      if player in players:
        per_player_new_policies.append([])
        per_player_deviation_incentives.append([])

        # For CE, condition on each of the player's own policies (the
        # "recommendation") and best-respond to the conditional distribution
        # over opponents' joint policies.
        for pid in range(per_player_num_policies[player]):
          joint_policy_ids = itertools.product(*[
              (pid,) if p_ == player else range(np_) for p_, np_
              in enumerate(per_player_num_policies)])
          joint_policies_slice = [
              joint_policies[jpid] for jpid in joint_policy_ids]
          # Slice of the meta-distribution with the player's policy fixed to pid.
          inds = tuple((pid,) if player == p_ else slice(None)
                       for p_ in range(num_players))
          meta_dist_slice = np.ravel(meta_dist[inds]).copy()
          meta_dist_slice[meta_dist_slice < DIST_TOL] = 0.0
          meta_dist_slice[meta_dist_slice > 1.0] = 1.0
          meta_dist_slice_sum = np.sum(meta_dist_slice)

          if meta_dist_slice_sum > 0.0:
            meta_dist_slice /= meta_dist_slice_sum
            mu = [(p, mp) for mp, p in
                  zip(joint_policies_slice, meta_dist_slice)
                  if p > 0]
            info = pyspiel.cce_dist(game, mu, player, prob_cut_threshold=0.0)

            new_policy = policy.pyspiel_policy_to_python_policy(
                game, info.best_response_policies[0], players=(player,))
            on_policy_value = np.sum(
                np.ravel(meta_game[player][inds]) * meta_dist_slice)
            deviation_incentive = max(
                info.best_response_values[0] - on_policy_value, 0)
            if deviation_incentive < GAP_TOL:
              deviation_incentive = 0.0

            per_player_new_policies[-1].append(new_policy)
            # Weight the conditional gain by the probability of recommendation pid.
            per_player_deviation_incentives[-1].append(
                meta_dist_slice_sum * deviation_incentive)

      else:
        per_player_new_policies.append([])
        per_player_deviation_incentives.append([])

  else:
    raise ValueError(
        "target_equilibrium must be a valid best response strategy: %s. "
        "Received: %s" % (BRS, target_equilibrium))

  return per_player_new_policies, per_player_deviation_incentives