Example #1
def compute_regret_policy_against_pure_policy_sim_game(game,
                                                       policy,
                                                       compute_true_value=False,
                                                       num_sample=100):
    time_tick = time.time()
    if compute_true_value:
        expected_value_policy = expected_game_score.policy_value(
            game.new_initial_state(), policy)[0]
    else:
        expected_value_policy = get_expected_value_sim_game(
            game, policy, num_sample)
    worse_regret = 0
    policies = [
        PathBCEResponse(game, policy, 0),
        PathBCDEResponse(game, policy, 0),
        PathBDEResponse(game, policy, 0)
    ]
    for deviation_policy in policies:
        if compute_true_value:
            expected_value_noise = expected_game_score.policy_value(
                game.new_initial_state(), deviation_policy)[0]
        else:
            expected_value_noise = get_expected_value_sim_game(
                game, deviation_policy, num_sample, player=0)
        approximate_regret = expected_value_noise - expected_value_policy
        worse_regret = max(worse_regret, approximate_regret)
    return worse_regret, time.time() - time_tick
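A minimal usage sketch for the helper above, with hedges: PathBCEResponse, PathBCDEResponse, PathBDEResponse and get_expected_value_sim_game are defined elsewhere in the same experiment code, and the game and policy chosen here are placeholders only.
# Hedged usage sketch (assumes the experiment module that defines the helper
# above, plus the OpenSpiel policy module imported as `policy`).
game = pyspiel.load_game("python_dynamic_routing")
evaluated_policy = policy.UniformRandomPolicy(game)  # placeholder policy
regret, elapsed = compute_regret_policy_against_pure_policy_sim_game(
    game, evaluated_policy, compute_true_value=False, num_sample=100)
print("Approximate regret:", regret, "computed in", elapsed, "seconds")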
Example #2
 def test_learning_and_applying_mfg_policy_in_n_player_game(self):
     """Test converting a learnt MFG policy to the default N-player game."""
     # learning the Braess MFG Nash equilibrium
     mfg_game = pyspiel.load_game("python_mfg_dynamic_routing")
     omd = mirror_descent.MirrorDescent(mfg_game, lr=1)
     for _ in range(10):
         omd.iteration()
     mfg_policy = omd.get_policy()
     n_player_game = pyspiel.load_game("python_dynamic_routing")
     mfg_derived_policy = (dynamic_routing_to_mean_field_game.
                           DerivedNPlayerPolicyFromMeanFieldPolicy(
                               n_player_game, mfg_policy))
     expected_game_score.policy_value(n_player_game.new_initial_state(),
                                      mfg_derived_policy)
Example #3
  def test_expected_game_score_uniform_random_iterated_prisoner_dilemma(self):
   game = pyspiel.load_game(
       "python_iterated_prisoners_dilemma(max_game_length=6)")
   pi = policy.UniformRandomPolicy(game)
   values = expected_game_score.policy_value(game.new_initial_state(), pi)
   # 4*(1-0.875**6)/0.125 = 17.6385498
   np.testing.assert_allclose(values, [17.6385498, 17.6385498])
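The constant in the comment is a truncated geometric series: the formula reads as a per-stage expected payoff of 4, a continuation probability of 0.875 and six stages. A quick arithmetic check, with no extra APIs assumed:
# Reproduce the constant quoted in the comment above.
per_stage, continuation, steps = 4.0, 0.875, 6
closed_form = per_stage * (1 - continuation**steps) / (1 - continuation)
print(closed_form)  # 17.6385498046875, matching the asserted value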
Example #4
def print_average_payouts():
    # Print the average payouts given the current game and the final policies
    game = pyspiel.load_game(FLAGS.game_name)
    average_policy = __tabular_policy_from_csv(game, "./leduc_best_policy.csv")
    average_policy_values = expected_game_score.policy_value(
        game.new_initial_state(), [average_policy] * 2)
    print(average_policy_values)
Example #5
 def test_discounted_cfr_on_kuhn(self):
     game = pyspiel.load_game("kuhn_poker")
     solver = discounted_cfr.DCFRSolver(game)
     for _ in range(300):
         solver.evaluate_and_update_policy()
     average_policy = solver.average_policy()
     average_policy_values = expected_game_score.policy_value(
         game.new_initial_state(), [average_policy] * 2)
     # 1/18 is the Nash value. See https://en.wikipedia.org/wiki/Kuhn_poker
     np.testing.assert_allclose(average_policy_values, [-1 / 18, 1 / 18],
                                atol=1e-3)
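Beyond matching the Nash value, the convergence of the DCFR average policy can also be checked directly. A short sketch, assuming the same exploitability helper that the CFR example further below imports:
# Optional convergence check: a well-converged Kuhn policy should have
# exploitability close to zero.
conv = exploitability.exploitability(game, average_policy)
print("Exploitability of the DCFR average policy:", conv)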
Example #6
 def test_uniform_mfg_policy_conversion_to_n_player_uniform_policy(self):
     """Test converting the MFG uniform policy to the N-player uniform policy."""
     mfg_game = pyspiel.load_game("python_mfg_dynamic_routing", {
         "time_step_length": 0.05,
         "max_num_time_step": 100
     })
     n_player_game = pyspiel.load_game("python_dynamic_routing", {
         "time_step_length": 0.05,
         "max_num_time_step": 100
     })
     mfg_derived_policy = (dynamic_routing_to_mean_field_game.
                           DerivedNPlayerPolicyFromMeanFieldPolicy(
                               n_player_game,
                               policy.UniformRandomPolicy(mfg_game)))
     derived_policy_value = expected_game_score.policy_value(
         n_player_game.new_initial_state(), mfg_derived_policy)
     uniform_policy_value = expected_game_score.policy_value(
         n_player_game.new_initial_state(),
         policy.UniformRandomPolicy(n_player_game))
     self.assertSequenceAlmostEqual(derived_policy_value,
                                    uniform_policy_value)
Example #7
def main(unused_argv):
    game = pyspiel.load_game("kuhn_poker")
    cfr_solver = cfr.CFRSolver(game)

    episodes = []
    exploits = []
    nashes = []

    # Train the agent for the specified number of episodes
    for ep in range(FLAGS.num_train_episodes):
        print("Running episode {} of {}".format(ep, FLAGS.num_train_episodes))
        cfr_solver.evaluate_and_update_policy()
        avg_pol = cfr_solver.average_policy()

        # Calculate the exploitability and nash convergence
        expl = exploitability.exploitability(game, avg_pol)
        nash = exploitability.nash_conv(game, avg_pol)

        exploits.append(expl)
        nashes.append(nash)
        episodes.append(ep)

    # Get the average policy
    average_policy = cfr_solver.average_policy()
    average_policy_values = expected_game_score.policy_value(
        game.new_initial_state(), [average_policy] * 2)
    cur_pol = cfr_solver.current_policy()

    # Plot the exploitability
    plt.plot(episodes, exploits, "-r", label="Exploitability")
    plt.xscale("log")
    plt.yscale("log")
    plt.xlim(FLAGS.eval_every, FLAGS.num_train_episodes)
    plt.legend(loc="upper right")
    # Save before show(); saving afterwards can write an empty figure.
    plt.savefig("cfr_expl.png")
    plt.show()

    plt.figure()

    # Plot the nash convergence
    plt.plot(episodes, nashes, "-r", label="NashConv")
    plt.xscale("log")
    plt.yscale("log")
    plt.xlim(FLAGS.eval_every, FLAGS.num_train_episodes)
    plt.legend(loc="upper right")
    # Save before show(); saving afterwards can write an empty figure.
    plt.savefig("cfr_nash.png")
    plt.show()

    print(average_policy)
    print(average_policy_values)
    policy_to_csv(game, average_policy, "./kuhn_policy.csv")
Example #8
def main(_):
  game = pyspiel.load_game("kuhn_poker")

  cfr_solver = cfr.CFRSolver(game)
  iterations = 1000

  for i in range(iterations):
    cfr_value = cfr_solver.evaluate_and_update_policy()
    print("Game util at iteration {}: {}".format(i, cfr_value))

  average_policy = cfr_solver.average_policy()
  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [average_policy] * 2)
  print("Computed player 0 value: {}".format(average_policy_values[0]))
  print("Expected player 0 value: {}".format(-1 / 18))
Example #9
    def test_cfr_kuhn_poker_runs_with_multiple_players(self, linear_averaging,
                                                       regret_matching_plus,
                                                       alternating_updates):
        num_players = 3

        game = pyspiel.load_game("kuhn_poker", {"players": num_players})
        cfr_solver = cfr._CFRSolver(game,
                                    regret_matching_plus=regret_matching_plus,
                                    linear_averaging=linear_averaging,
                                    alternating_updates=alternating_updates)
        for _ in range(10):
            cfr_solver.evaluate_and_update_policy()
        average_policy = cfr_solver.average_policy()
        average_policy_values = expected_game_score.policy_value(
            game.new_initial_state(), [average_policy] * num_players)
        del average_policy_values
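policy_value expects one policy per player, which is why the test passes [average_policy] * num_players. A purely illustrative variation, assuming the same imports, that evaluates seat 0 under a uniform random policy while the other seats keep the CFR average policy:
# Illustrative sketch: the per-seat policies may differ.
mixed_values = expected_game_score.policy_value(
    game.new_initial_state(),
    [policy.UniformRandomPolicy(game)] + [average_policy] * (num_players - 1))
print(mixed_values)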
Example #10
def ficticious_play(seq_game, number_of_iterations, compute_metrics=False):
    xfp_solver = fictitious_play.XFPSolver(seq_game)
    tick_time = time.time()
    for _ in range(number_of_iterations):
        xfp_solver.iteration()
    timing = time.time() - tick_time
    # print('done')
    # average_policies = xfp_solver.average_policy_tables()
    tabular_policy = policy_module.TabularPolicy(seq_game)
    if compute_metrics:
        nash_conv = exploitability.nash_conv(seq_game,
                                             xfp_solver.average_policy())
        average_policy_values = expected_game_score.policy_value(
            seq_game.new_initial_state(), [tabular_policy])
        return timing, tabular_policy, nash_conv, average_policy_values
    return timing, tabular_policy
Example #11
  def test_best_response_tic_tac_toe_value_is_consistent(self):
    # This test was failing because of use of str(state) in the best response,
    # which is imperfect recall. We now use state.history_str() throughout.

    # Choose a policy at random; not the uniform random policy.
    game = pyspiel.load_game("tic_tac_toe")
    pi = policy.TabularPolicy(game)
    rng = np.random.RandomState(1234)
    pi.action_probability_array[:] = rng.rand(*pi.legal_actions_mask.shape)
    pi.action_probability_array *= pi.legal_actions_mask
    pi.action_probability_array /= np.sum(
        pi.action_probability_array, axis=1, keepdims=True)

    # Compute a best response and verify the best response value is consistent.
    br = best_response.BestResponsePolicy(game, 1, pi)
    self.assertAlmostEqual(
        expected_game_score.policy_value(game.new_initial_state(), [pi, br])[1],
        br.value(game.new_initial_state()))
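The same consistency check works for the other seat; a minimal sketch reusing the APIs of the test above, with the best responder now playing as player 0 against pi:
    # Sketch: repeat the consistency check for player 0's best response.
    br0 = best_response.BestResponsePolicy(game, 0, pi)
    self.assertAlmostEqual(
        expected_game_score.policy_value(game.new_initial_state(),
                                         [br0, pi])[0],
        br0.value(game.new_initial_state()))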
Example #12
 def test_xfp(self):
   game = pyspiel.load_game("kuhn_poker")
   xfp_solver = fictitious_play.XFPSolver(game)
   for _ in range(100):
     xfp_solver.iteration()
   average_policies = xfp_solver.average_policy_tables()
   tabular_policy = policy.TabularPolicy(game)
   for player_id in range(2):
     for info_state, state_policy in average_policies[player_id].items():
       policy_to_update = tabular_policy.policy_for_key(info_state)
       for action, probability in state_policy.items():
         policy_to_update[action] = probability
   average_policy_values = expected_game_score.policy_value(
       game.new_initial_state(), [tabular_policy, tabular_policy])
   print("Kuhn 2P average values after 10 iterations")
   print("P0: {}".format(average_policy_values[0]))
   print("P1: {}".format(average_policy_values[1]))
   self.assertIsNotNone(average_policy_values)
   self.assertTrue(
       np.allclose(average_policy_values, [-1 / 18, 1 / 18], atol=1e-3))
Example #13
def main(unused_argv):
    logging.info("Loading %s", FLAGS.game_name)
    game = pyspiel.load_game(FLAGS.game_name)
    with tf.Session() as sess:
        deep_cfr_solver = deep_cfr.DeepCFRSolver(
            sess,
            game,
            policy_network_layers=(16, ),
            advantage_network_layers=(16, ),
            num_iterations=FLAGS.num_iterations,
            num_traversals=FLAGS.num_traversals,
            learning_rate=1e-3,
            batch_size_advantage=128,
            batch_size_strategy=1024,
            memory_capacity=1e7,
            policy_network_train_steps=400,
            advantage_network_train_steps=20,
            reinitialize_advantage_networks=False)
        sess.run(tf.global_variables_initializer())
        _, advantage_losses, policy_loss = deep_cfr_solver.solve()
        for player, losses in six.iteritems(advantage_losses):
            logging.info("Advantage for player %d: %s", player,
                         losses[:2] + ["..."] + losses[-2:])
            logging.info("Advantage Buffer Size for player %s: '%s'", player,
                         len(deep_cfr_solver.advantage_buffers[player]))
        logging.info("Strategy Buffer Size: '%s'",
                     len(deep_cfr_solver.strategy_buffer))
        logging.info("Final policy loss: '%s'", policy_loss)

        average_policy = policy.tabular_policy_from_callable(
            game, deep_cfr_solver.action_probabilities)

        conv = exploitability.nash_conv(game, average_policy)
        logging.info("Deep CFR in '%s' - NashConv: %s", FLAGS.game_name, conv)

        average_policy_values = expected_game_score.policy_value(
            game.new_initial_state(), [average_policy] * 2)
        print("Computed player 0 value: {}".format(average_policy_values[0]))
        print("Expected player 0 value: {}".format(-1 / 18))
        print("Computed player 1 value: {}".format(average_policy_values[1]))
        print("Expected player 1 value: {}".format(1 / 18))
Example #14
def main(unused_argv):
    logging.info("Loading %s", FLAGS.game_name)
    game = pyspiel.load_game(FLAGS.game_name)
    deep_cfr_solver = deep_cfr_tf2.DeepCFRSolver(
        game,
        policy_network_layers=(64, 64, 64, 64),
        advantage_network_layers=(64, 64, 64, 64),
        num_iterations=FLAGS.num_iterations,
        num_traversals=FLAGS.num_traversals,
        learning_rate=1e-3,
        batch_size_advantage=2048,
        batch_size_strategy=2048,
        memory_capacity=1e6,
        policy_network_train_steps=5000,
        advantage_network_train_steps=500,
        reinitialize_advantage_networks=True,
        infer_device="cpu",
        train_device="cpu")
    _, advantage_losses, policy_loss = deep_cfr_solver.solve()
    for player, losses in six.iteritems(advantage_losses):
        logging.info("Advantage for player %d: %s", player,
                     losses[:2] + ["..."] + losses[-2:])
        logging.info("Advantage Buffer Size for player %s: '%s'", player,
                     len(deep_cfr_solver.advantage_buffers[player]))
    logging.info("Strategy Buffer Size: '%s'",
                 len(deep_cfr_solver.strategy_buffer))
    logging.info("Final policy loss: '%s'", policy_loss)

    average_policy = policy.tabular_policy_from_callable(
        game, deep_cfr_solver.action_probabilities)

    conv = exploitability.nash_conv(game, average_policy)
    logging.info("Deep CFR in '%s' - NashConv: %s", FLAGS.game_name, conv)

    average_policy_values = expected_game_score.policy_value(
        game.new_initial_state(), [average_policy] * 2)
    print("Computed player 0 value: {}".format(average_policy_values[0]))
    print("Computed player 1 value: {}".format(average_policy_values[1]))
Example #15
def main(unused_argv):
    logging.info("Loading %s", FLAGS.game_name)
    game = pyspiel.load_game(FLAGS.game_name)

    deep_cfr_solver = deep_cfr.DeepCFRSolver(
        game,
        policy_network_layers=(32, 32),
        advantage_network_layers=(16, 16),
        num_iterations=FLAGS.num_iterations,
        num_traversals=FLAGS.num_traversals,
        learning_rate=1e-3,
        batch_size_advantage=None,
        batch_size_strategy=None,
        memory_capacity=int(1e7))

    _, advantage_losses, policy_loss = deep_cfr_solver.solve()
    for player, losses in six.iteritems(advantage_losses):
        logging.info("Advantage for player %d: %s", player,
                     losses[:2] + ["..."] + losses[-2:])
        logging.info("Advantage Buffer Size for player %s: '%s'", player,
                     len(deep_cfr_solver.advantage_buffers[player]))
    logging.info("Strategy Buffer Size: '%s'",
                 len(deep_cfr_solver.strategy_buffer))
    logging.info("Final policy loss: '%s'", policy_loss)

    average_policy = policy.tabular_policy_from_callable(
        game, deep_cfr_solver.action_probabilities)
    pyspiel_policy = policy.python_policy_to_pyspiel_policy(average_policy)
    conv = pyspiel.nash_conv(game, pyspiel_policy)
    logging.info("Deep CFR in '%s' - NashConv: %s", FLAGS.game_name, conv)

    average_policy_values = expected_game_score.policy_value(
        game.new_initial_state(), [average_policy] * 2)
    logging.info("Computed player 0 value: %.2f (expected: %.2f).",
                 average_policy_values[0], -1 / 18)
    logging.info("Computed player 1 value: %.2f (expected: %.2f).",
                 average_policy_values[1], 1 / 18)
Example #16
  def test_braess_paradox(self):
    """Test that Braess paradox can be reproduced with the mean field game."""
    num_player = 8
    braess_network = dynamic_routing_utils.Network(
        {
            "O": "A",
            "A": ["B", "C"],
            "B": ["C", "D"],
            "C": ["D"],
            "D": ["E"],
            "E": []
        },
        node_position={
            "O": (0, 0),
            "A": (1, 0),
            "B": (2, 1),
            "C": (2, -1),
            "D": (3, 0),
            "E": (4, 0)
        },
        bpr_a_coefficient={
            "O->A": 0,
            "A->B": 1.0,
            "A->C": 0,
            "B->C": 0,
            "B->D": 0,
            "C->D": 1.0,
            "D->E": 0
        },
        bpr_b_coefficient={
            "O->A": 1.0,
            "A->B": 1.0,
            "A->C": 1.0,
            "B->C": 1.0,
            "B->D": 1.0,
            "C->D": 1.0,
            "D->E": 1.0
        },
        capacity={
            "O->A": num_player,
            "A->B": num_player,
            "A->C": num_player,
            "B->C": num_player,
            "B->D": num_player,
            "C->D": num_player,
            "D->E": num_player
        },
        free_flow_travel_time={
            "O->A": 0,
            "A->B": 1.0,
            "A->C": 2.0,
            "B->C": 0.25,
            "B->D": 2.0,
            "C->D": 1.0,
            "D->E": 0
        })

    demand = [
        dynamic_routing_utils.Vehicle("O->A", "D->E") for _ in range(num_player)
    ]
    game = dynamic_routing.DynamicRoutingGame(
        {"time_step_length": 0.125, "max_num_time_step": 40},
        network=braess_network,
        vehicles=demand)

    class TruePathPolicy(policy.Policy):

      def __init__(self, game):
        super().__init__(game, list(range(num_player)))
        self._path = {}

      def action_probabilities(self, state, player_id=None):
        assert player_id is not None
        legal_actions = state.legal_actions(player_id)
        if not legal_actions:
          return {dynamic_routing_utils.NO_POSSIBLE_ACTION: 1.0}
        elif len(legal_actions) == 1:
          return {legal_actions[0]: 1.0}
        else:
          if legal_actions[0] == 2:
            if self._path[player_id] in ["top", "middle"]:
              return {2: 1.0}
            elif self._path[player_id] == "bottom":
              return {3: 1.0}
            else:
              raise ValueError()
          elif legal_actions[0] == 4:
            if self._path[player_id] == "top":
              return {5: 1.0}
            elif self._path[player_id] == "middle":
              return {4: 1.0}
            else:
              raise ValueError()
        raise ValueError(f"{legal_actions} is not correct.")

    class NashEquilibriumBraess(TruePathPolicy):

      def __init__(self, game):
        super().__init__(game)
        for player_id in range(num_player):
          if player_id % 2 == 0:
            self._path[player_id] = "middle"
          if player_id % 4 == 1:
            self._path[player_id] = "top"
          if player_id % 4 == 3:
            self._path[player_id] = "bottom"

    class SocialOptimumBraess(NashEquilibriumBraess):

      def __init__(self, game):
        super().__init__(game)
        for player_id in range(num_player):
          if player_id % 2 == 0:
            self._path[player_id] = "top"
          if player_id % 2 == 1:
            self._path[player_id] = "bottom"

    ne_policy = NashEquilibriumBraess(game)
    # TODO(cabannes): debug issue with nash conv computation and uncomment the
    # following line.
    # self.assertEqual(exploitability.nash_conv(game, ne_policy), 0.0)
    self.assertSequenceAlmostEqual(
        -expected_game_score.policy_value(game.new_initial_state(), ne_policy),
        [3.75] * num_player)

    so_policy = SocialOptimumBraess(game)
    # TODO(cabannes): debug issue with nash conv computation and uncomment the
    # following line.
    # self.assertEqual(exploitability.nash_conv(game, so_policy), 0.125)
    self.assertSequenceAlmostEqual(
        -expected_game_score.policy_value(game.new_initial_state(), so_policy),
        [3.5] * num_player)
Example #17
 def test_expected_game_score_uniform_random_kuhn_poker(self):
     game = pyspiel.load_game("kuhn_poker")
     uniform_policy = policy.UniformRandomPolicy(game)
     uniform_policy_values = expected_game_score.policy_value(
         game.new_initial_state(), [uniform_policy] * 2)
     self.assertTrue(np.allclose(uniform_policy_values, [1 / 8, -1 / 8]))
Example #18
def neural_ficticious_self_play(seq_game,
                                num_epoch,
                                sess,
                                compute_metrics=False):
    env = rl_environment.Environment(seq_game)
    # Parameters from the game.
    num_players = env.num_players
    num_actions = env.action_spec()["num_actions"]
    info_state_size = env.observation_spec()["info_state"][0]

    # Parameters for the algorithm.
    hidden_layers_sizes = [int(l) for l in [128]]

    kwargs = {
        "replay_buffer_capacity": int(2e5),
        "reservoir_buffer_capacity": int(2e6),
        "min_buffer_size_to_learn": 1000,
        "anticipatory_param": 0.1,
        "batch_size": 128,
        "learn_every": 64,
        "rl_learning_rate": 0.01,
        "sl_learning_rate": 0.01,
        "optimizer_str": "sgd",
        "loss_str": "mse",
        "update_target_network_every": 19200,
        "discount_factor": 1.0,
        "epsilon_decay_duration": int(20e6),
        "epsilon_start": 0.06,
        "epsilon_end": 0.001,
    }

    # freq_epoch_printing = num_epoch // 10
    agents = [
        nfsp.NFSP(sess, idx, info_state_size, num_actions, hidden_layers_sizes,
                  **kwargs) for idx in range(num_players)
    ]
    joint_avg_policy = NFSPPolicies(env, agents, nfsp.MODE.average_policy)

    sess.run(tf.global_variables_initializer())
    # print("TF initialized.")
    tick_time = time.time()
    for _ in range(num_epoch):
        # if ep % freq_epoch_printing == 0:
        #   print(f"Iteration {ep}")
        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            agent_output = agents[player_id].step(time_step)
            action_list = [agent_output.action]
            time_step = env.step(action_list)

        # Episode is over, step all agents with final info state.
        for agent in agents:
            agent.step(time_step)
    timing = time.time() - tick_time
    # print("Finish.")
    if compute_metrics:
        tabular_policy = joint_avg_policy.TabularPolicy(seq_game)
        average_policy_values = expected_game_score.policy_value(
            seq_game.new_initial_state(), [tabular_policy])
        nash_conv = exploitability.nash_conv(env.game, joint_avg_policy)
        return timing, joint_avg_policy, average_policy_values, nash_conv
    return timing, joint_avg_policy