Example #1
 def test_cfr_cce_dist_goofspiel(self):
     """Copy of the TestCCEDistCFRGoofSpiel in corr_dist_test.cc."""
     game = pyspiel.load_game(
         "turn_based_simultaneous_game(game=goofspiel(num_cards=3,points_order="
         "descending,returns_type=total_points))")
     for num_iterations in [1, 10, 100]:
         policies = []
         cfr_solver = cfr.CFRSolver(game)
         for _ in range(num_iterations):
             cfr_solver.evaluate_and_update_policy()
             policies.append(
                 policy.python_policy_to_pyspiel_policy(
                     cfr_solver.current_policy()))
         mu = pyspiel.uniform_correlation_device(policies)
         cce_dist1 = pyspiel.cce_dist(game, mu)
         print(
             "goofspiel, cce test num_iterations: {}, cce_dist: {}".format(
                 num_iterations, cce_dist1))
         # Assemble the same correlation device manually, as an example of how
         # to build non-uniform distributions and to test that the Python
         # bindings for lists of tuples work properly.
         uniform_prob = 1.0 / len(policies)
         mu2 = [(uniform_prob, policy) for policy in policies]
         cce_dist2 = pyspiel.cce_dist(game, mu2)
         self.assertAlmostEqual(cce_dist1, cce_dist2)
Example #2
def policy_bots():
    random_policy = policy.UniformRandomPolicy(GAME)

    py_bot = PolicyBot(0, np.random.RandomState(4321), random_policy)
    cpp_bot = pyspiel.make_policy_bot(
        GAME, 1, 1234,
        policy.python_policy_to_pyspiel_policy(random_policy.to_tabular()))

    return [py_bot, cpp_bot]
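
A usage sketch for the bots returned above (not part of the original example): it assumes GAME is a two-player game such as kuhn_poker and uses the evaluate_bots helper from open_spiel.python.algorithms to play a single episode.

import numpy as np
import pyspiel
from open_spiel.python import policy
from open_spiel.python.algorithms import evaluate_bots
from open_spiel.python.bots.policy import PolicyBot  # Needed by policy_bots() above.

GAME = pyspiel.load_game("kuhn_poker")  # Stand-in for the module-level constant.

bots = policy_bots()
# evaluate_bots steps one full game and returns the per-player returns.
episode_returns = evaluate_bots.evaluate_bots(
    GAME.new_initial_state(), bots, np.random.RandomState(42))
print("Episode returns:", episode_returns)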
Example #3
 def test_cfr_plus_solver_best_response_mdp(self):
     game = pyspiel.load_game("kuhn_poker")
     cfr_solver = cfr.CFRPlusSolver(game)
     for _ in range(200):
         cfr_solver.evaluate_and_update_policy()
     average_policy = cfr_solver.average_policy()
     pyspiel_avg_policy = policy.python_policy_to_pyspiel_policy(
         average_policy)
     br_computer = pyspiel.TabularBestResponseMDP(game, pyspiel_avg_policy)
     br_info = br_computer.exploitability()
     self.assertLessEqual(br_info.exploitability, 0.001)
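
As a side note, the conversion above is needed because TabularBestResponseMDP is a C++ algorithm. A sketch of an equivalent check with the pure-Python exploitability helper (an assumption, not part of the original test) can work on average_policy directly:

from open_spiel.python.algorithms import exploitability

# No python_policy_to_pyspiel_policy call required here.
expl = exploitability.exploitability(game, average_policy)
# With the same 200 CFR+ iterations this should be a similarly small value.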
Example #4
  def test_record_batched_trajectories(self):
    for game_name in ["kuhn_poker", "leduc_poker", "liars_dice"]:
      game = pyspiel.load_game(game_name)
      python_policy = policy.TabularPolicy(game)
      tabular_policy = policy.python_policy_to_pyspiel_policy(python_policy)
      policies = [tabular_policy] * 2

      # We test that we can create a batch of trajectories.
      seed = 0
      batch_size = 128
      include_full_observations = False
      pyspiel.record_batched_trajectories(game, policies,
                                          python_policy.state_lookup,
                                          batch_size, include_full_observations,
                                          seed, -1)
Example #5
    def test_cfr_cce_ce_dist_goofspiel(self):
        """Copy of the TestCCEDistCFRGoofSpiel in corr_dist_test.cc."""
        game = pyspiel.load_game(
            "turn_based_simultaneous_game(game=goofspiel(num_cards=3,points_order="
            "descending,returns_type=total_points))")
        for num_iterations in [1, 10, 100]:
            policies = []
            cfr_solver = cfr.CFRSolver(game)
            for _ in range(num_iterations):
                cfr_solver.evaluate_and_update_policy()
                policies.append(
                    policy.python_policy_to_pyspiel_policy(
                        cfr_solver.current_policy()))
            mu = pyspiel.uniform_correlation_device(policies)
            cce_dist_info = pyspiel.cce_dist(game, mu)
            print(
                "goofspiel, cce test num_iters: {}, cce_dist: {}, per player: {}"
                .format(num_iterations, cce_dist_info.dist_value,
                        cce_dist_info.deviation_incentives))
            # Try converting one of the BR policies:
            _ = policy.pyspiel_policy_to_python_policy(
                game, cce_dist_info.best_response_policies[0])

            # Assemble the same correlation device manually, as an example of how
            # to build non-uniform distributions and to test that the Python
            # bindings for lists of tuples work properly.
            uniform_prob = 1.0 / len(policies)
            mu2 = [(uniform_prob, policy) for policy in policies]
            cce_dist_info2 = pyspiel.cce_dist(game, mu2)
            self.assertAlmostEqual(cce_dist_info2.dist_value,
                                   sum(cce_dist_info.deviation_incentives))
            # Test the CEDist function too, why not. Disable the exact one, as it
            # takes too long for a test.
            # ce_dist_info = pyspiel.ce_dist(game, pyspiel.determinize_corr_dev(mu))
            ce_dist_info = pyspiel.ce_dist(
                game, pyspiel.sampled_determinize_corr_dev(mu, 100))
            print(
                "goofspiel, ce test num_iters: {}, ce_dist: {}, per player: {}"
                .format(num_iterations, ce_dist_info.dist_value,
                        ce_dist_info.deviation_incentives))
            print("number of conditional best responses per player:")
            for p in range(game.num_players()):
                print("  player {}, num: {}".format(
                    p,
                    len(ce_dist_info.conditional_best_response_policies[p])))
Example #6
 def test_matching_pennies_3p(self):
     game = pyspiel.load_game_as_turn_based('matching_pennies_3p')
     deep_cfr_solver = deep_cfr.DeepCFRSolver(game,
                                              policy_network_layers=(16, 8),
                                              advantage_network_layers=(32,
                                                                        16),
                                              num_iterations=2,
                                              num_traversals=2,
                                              learning_rate=1e-3,
                                              batch_size_advantage=None,
                                              batch_size_strategy=None,
                                              memory_capacity=1e7)
     deep_cfr_solver.solve()
     conv = pyspiel.nash_conv(
         game,
         policy.python_policy_to_pyspiel_policy(
             policy.tabular_policy_from_callable(
                 game, deep_cfr_solver.action_probabilities)))
     logging.info('Deep CFR in Matching Pennies 3p. NashConv: %.2f', conv)
Example #7
def main(unused_argv):
    logging.info("Loading %s", FLAGS.game_name)
    game = pyspiel.load_game(FLAGS.game_name)

    deep_cfr_solver = deep_cfr.DeepCFRSolver(
        game,
        policy_network_layers=(32, 32),
        advantage_network_layers=(16, 16),
        num_iterations=FLAGS.num_iterations,
        num_traversals=FLAGS.num_traversals,
        learning_rate=1e-3,
        batch_size_advantage=None,
        batch_size_strategy=None,
        memory_capacity=int(1e7))

    _, advantage_losses, policy_loss = deep_cfr_solver.solve()
    for player, losses in six.iteritems(advantage_losses):
        logging.info("Advantage for player %d: %s", player,
                     losses[:2] + ["..."] + losses[-2:])
        logging.info("Advantage Buffer Size for player %s: '%s'", player,
                     len(deep_cfr_solver.advantage_buffers[player]))
    logging.info("Strategy Buffer Size: '%s'",
                 len(deep_cfr_solver.strategy_buffer))
    logging.info("Final policy loss: '%s'", policy_loss)

    average_policy = policy.tabular_policy_from_callable(
        game, deep_cfr_solver.action_probabilities)
    pyspiel_policy = policy.python_policy_to_pyspiel_policy(average_policy)
    conv = pyspiel.nash_conv(game, pyspiel_policy)
    logging.info("Deep CFR in '%s' - NashConv: %s", FLAGS.game_name, conv)

    average_policy_values = expected_game_score.policy_value(
        game.new_initial_state(), [average_policy] * 2)
    logging.info("Computed player 0 value: %.2f (expected: %.2f).",
                 average_policy_values[0], -1 / 18)
    logging.info("Computed player 1 value: %.2f (expected: %.2f).",
                 average_policy_values[1], 1 / 18)
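
main takes unused_argv, which suggests the script is launched through absl. A minimal entry-point sketch (the FLAGS definitions are assumed to live elsewhere in the file):

from absl import app

if __name__ == "__main__":
    app.run(main)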
Example #8
    def test_can_create_cpp_tabular_policy(self):
        for game_name in ["kuhn_poker", "leduc_poker", "liars_dice"]:
            game = pyspiel.load_game(game_name)

            # We just test that we can create a tabular policy.
            policy.python_policy_to_pyspiel_policy(policy.TabularPolicy(game))
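
For completeness, a minimal round-trip sketch built on the same call (the game choice and the nash_conv check are illustrative assumptions, not part of the original test):

game = pyspiel.load_game("kuhn_poker")
python_policy = policy.TabularPolicy(game)  # Uniform random by default.
cpp_policy = policy.python_policy_to_pyspiel_policy(python_policy)
# The converted policy can be handed to C++ algorithms such as pyspiel.nash_conv...
print("NashConv of the uniform policy:", pyspiel.nash_conv(game, cpp_policy))
# ...and converted back to a Python policy if needed.
round_tripped = policy.pyspiel_policy_to_python_policy(game, cpp_policy)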
Example #9
def add_new_policies(
    per_player_new_policies,
    per_player_gaps,
    per_player_repeats,
    per_player_policies,
    joint_policies,
    joint_returns,
    game,
    br_selection):
  """Adds novel policies from new policies."""
  num_players = len(per_player_new_policies)
  per_player_num_novel_policies = [0 for _ in range(num_players)]

  # Update policies and policy counts.
  for player in range(num_players):
    new_policies = per_player_new_policies[player]
    new_gaps = per_player_gaps[player]

    repeat_policies = []
    repeat_gaps = []
    repeat_ids = []
    novel_policies = []
    novel_gaps = []
    for new_policy, new_gap in zip(new_policies, new_gaps):
      for policy_id, policy_ in enumerate(per_player_policies[player]):
        if np.all(  # New policy is not novel.
            new_policy.action_probability_array ==
            policy_.action_probability_array):
          logging.debug("Player %d's new policy is not novel.", player)
          repeat_policies.append(new_policy)
          repeat_gaps.append(new_gap)
          repeat_ids.append(policy_id)
          break
      else:  # New policy is novel.
        logging.debug("Player %d's new policy is novel.", player)
        novel_policies.append(new_policy)
        novel_gaps.append(new_gap)

    add_novel_policies = []
    add_repeat_ids = []
    if (novel_policies or repeat_policies):
      if br_selection == "all":
        add_novel_policies.extend(novel_policies)
        add_repeat_ids.extend(repeat_ids)
      elif br_selection == "all_novel":
        add_novel_policies.extend(novel_policies)
      elif br_selection == "random":
        index = np.random.randint(0, len(repeat_policies) + len(novel_policies))
        if index < len(novel_policies):
          add_novel_policies.append(novel_policies[index])
        else:
          add_repeat_ids.append(repeat_ids[index - len(novel_policies)])
      elif br_selection == "random_novel":
        if novel_policies:
          index = np.random.randint(0, len(novel_policies))
          add_novel_policies.append(novel_policies[index])
        else:  # Fall back on random.
          index = np.random.randint(0, len(repeat_policies))
          add_repeat_ids.append(repeat_ids[index])
      elif br_selection == "largest_gap":
        if novel_policies:
          index = np.argmax(novel_gaps)
          if novel_gaps[index] == 0.0:  # Fall back to random when zero.
            index = np.random.randint(0, len(novel_policies))
          add_novel_policies.append(novel_policies[index])
        else:  # Fall back on random.
          index = np.random.randint(0, len(repeat_policies))
          add_repeat_ids.append(repeat_ids[index])
      else:
        raise ValueError("Unrecognized br_selection method: %s"
                         % br_selection)

    for add_repeat_id in add_repeat_ids:
      per_player_repeats[player][add_repeat_id] += 1

    for add_novel_policy in add_novel_policies:
      per_player_policies[player].append(add_novel_policy)  # Add new policy.
      per_player_repeats[player].append(1)  # Add new count.
      per_player_num_novel_policies[player] += 1

  # Add new joint policies.
  for pids in itertools.product(*[
      range(len(policies)) for policies in per_player_policies]):
    if pids in joint_policies:
      continue
    logging.debug("Evaluating novel joint policy: %s.", pids)
    policies = [
        policies[pid] for pid, policies in zip(pids, per_player_policies)]
    python_tabular_policy = policy.merge_tabular_policies(
        policies, game)
    pyspiel_tabular_policy = policy.python_policy_to_pyspiel_policy(
        python_tabular_policy)
    joint_policies[pids] = pyspiel_tabular_policy
    joint_returns[pids] = [
        0.0 if abs(er) < RETURN_TOL else er
        for er in pyspiel.expected_returns(
            game.new_initial_state(), pyspiel_tabular_policy, -1, True)]

  return per_player_num_novel_policies
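
The joint-policy loop at the end is where python_policy_to_pyspiel_policy is used. A stripped-down sketch of just that step (the game and the per-player uniform policies are illustrative assumptions; the PSRO-style bookkeeping is omitted):

game = pyspiel.load_game("kuhn_poker")
# One tabular policy per player, each defined over that player's states only.
per_player = [policy.TabularPolicy(game, players=[p])
              for p in range(game.num_players())]
joint = policy.merge_tabular_policies(per_player, game)
cpp_joint = policy.python_policy_to_pyspiel_policy(joint)
# Same trailing arguments as in the example above: depth limit -1 and True.
joint_returns = pyspiel.expected_returns(
    game.new_initial_state(), cpp_joint, -1, True)
print("Joint expected returns:", joint_returns)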