def xfsp_train(_):
    """Run XFP on ``FLAGS.game`` and persist the exploitability curve and policy.

    Every 1000 episodes the exploitability of the current average policy is
    computed, recorded and printed.  After ``FLAGS.episodes`` iterations the
    recorded curve is pickled to ``<game>_xfsp_<episodes>.dat`` and the final
    average policy is exported through ``policy_to_csv``.

    Args:
        _: Unused positional argument (the body never reads it).
    """
    agent_name = "xfsp"
    exploit_history = []
    exploit_idx = []
    game = pyspiel.load_game(FLAGS.game, {"players": pyspiel.GameParameter(2)})
    fsp_solver = fictitious_play.XFPSolver(game)
    checkpoint = datetime.now()
    for ep in range(FLAGS.episodes):
        if ep % 1000 == 0:
            delta = datetime.now() - checkpoint
            pol = policy.PolicyFromCallable(
                game, fsp_solver.average_policy_callable())
            conv = exploitability.exploitability(game, pol)
            exploit_history.append(conv)
            exploit_idx.append(ep)
            # total_seconds() keeps the day component that `.seconds` drops,
            # so checkpoints more than 24h apart are reported correctly.
            print(
                "[XFSP] Iteration {} exploitability {} - {} seconds since last checkpoint"
                .format(ep, conv, int(delta.total_seconds())))
            checkpoint = datetime.now()

        fsp_solver.iteration()

    # Use a context manager so the file handle is flushed and closed
    # deterministically instead of being left to the garbage collector.
    history_path = f"{FLAGS.game}_{agent_name}_{FLAGS.episodes}.dat"
    with open(history_path, "wb") as history_file:
        pickle.dump([exploit_idx, exploit_history], history_file)

    pol = policy.PolicyFromCallable(game, fsp_solver.average_policy_callable())
    # `now` is not defined in this function; presumably a module-level
    # datetime captured at start-up -- verify.  Formatted once so both files
    # share the same timestamp.
    timestamp = now.strftime("%m-%d-%Y_%H-%M")
    # NOTE(review): iterating [1, 2] and writing `pid + 1` yields files
    # suffixed 2 and 3, both holding the same joint policy.  Looks like an
    # off-by-one, but the file names are kept as-is in case downstream
    # tooling depends on them -- confirm.
    for pid in [1, 2]:
        policy_to_csv(
            game, pol,
            "policies/policy_" + timestamp + "_" + agent_name + "_" +
            str(pid + 1) + "_+" + str(FLAGS.episodes) + "episodes.csv")
 def test_shapleys_game(self):
   """Runs 1000 XFP iterations on Shapley's game, printing NashConv every 10th."""
   shapleys = pyspiel.load_game_as_turn_based("matrix_shapleys_game")
   solver = fictitious_play.XFPSolver(shapleys)
   report = "FP in Shapley's Game. Iter: {}, NashConv: {}"
   for step in range(1000):
     solver.iteration()
     if step % 10 != 0:
       continue
     # Only every tenth iteration is evaluated and reported.
     nash_conv = exploitability.nash_conv(shapleys, solver.average_policy())
     print(report.format(step, nash_conv))
 def test_matching_pennies_3p(self):
   """Runs 1000 XFP iterations on 3-player matching pennies, logging NashConv."""
   pennies = pyspiel.load_game_as_turn_based("matching_pennies_3p")
   solver = fictitious_play.XFPSolver(pennies)
   report = "FP in Matching Pennies 3p. Iter: {}, NashConv: {}"
   for step in range(1000):
     solver.iteration()
     if step % 10 != 0:
       continue
     # Only every tenth iteration is evaluated and reported.
     nash_conv = exploitability.nash_conv(pennies, solver.average_policy())
     print(report.format(step, nash_conv))
def main(_):
    """Run XFP on ``FLAGS.game`` and periodically print the exploitability.

    Runs ``FLAGS.iterations`` solver iterations; on every iteration whose
    index is a multiple of ``FLAGS.print_freq`` the exploitability of the
    current average policy is printed and stdout is flushed.

    Args:
        _: Unused positional argument (the body never reads it).
    """
    game = pyspiel.load_game(FLAGS.game, {"players": FLAGS.players})
    xfp_solver = fictitious_play.XFPSolver(game)
    for i in range(FLAGS.iterations):
        xfp_solver.iteration()
        if i % FLAGS.print_freq == 0:
            # The exploitability value is only ever printed, so evaluate it
            # just on the iterations that report it rather than every step.
            conv = exploitability.exploitability(
                game, xfp_solver.average_policy())
            print("Iteration: {} Conv: {}".format(i, conv), flush=True)
示例#5
0
 def test_meta_game_leduc2p(self):
     """Builds an empirical metagame for Leduc poker after 3 XFP iterations."""
     print("Leduc 2p")
     leduc = pyspiel.load_game("leduc_poker")
     solver = fictitious_play.XFPSolver(leduc, save_oracles=True)
     for _ in range(3):
         solver.iteration()
     payoffs = solver.get_empirical_metagame(10, seed=86487)
     self.assertIsNotNone(payoffs)
     # One metagame utility table is printed per player (1-based labels).
     row_template = "player {}: \n{}"
     for player in (0, 1):
         print(row_template.format(player + 1, payoffs[player]))
 def test_meta_game_kuhn4p(self):
   """Builds an empirical metagame for 4-player Kuhn poker after 3 XFP iterations."""
   print("Kuhn 4p")
   kuhn = pyspiel.load_game("kuhn_poker", {"players": 4})
   solver = fictitious_play.XFPSolver(kuhn, save_oracles=True)
   for _ in range(3):
     solver.iteration()
   payoffs = solver.get_empirical_metagame(10, seed=1)
   self.assertIsNotNone(payoffs)
   # One metagame utility tensor is printed per player (1-based labels).
   row_template = "player {}: \n{}"
   for player in (0, 1, 2, 3):
     print(row_template.format(player + 1, payoffs[player]))
示例#7
0
def ficticious_play(seq_game, number_of_iterations, compute_metrics=False):
    """Time ``number_of_iterations`` XFP iterations on ``seq_game``.

    Args:
        seq_game: Game handed to ``fictitious_play.XFPSolver``.
        number_of_iterations: How many times ``iteration()`` is called.
        compute_metrics: When true, also return NashConv and policy values.

    Returns:
        ``(timing, tabular_policy)``, or
        ``(timing, tabular_policy, nash_conv, average_policy_values)`` when
        ``compute_metrics`` is set.  ``timing`` is the wall-clock duration of
        the iteration loop in seconds.
    """
    solver = fictitious_play.XFPSolver(seq_game)
    started_at = time.time()
    for _ in range(number_of_iterations):
        solver.iteration()
    elapsed = time.time() - started_at

    # NOTE(review): this policy is freshly constructed and never updated from
    # the solver inside this function, so it does not look like the learned
    # average policy -- confirm that this is intended.
    fresh_policy = policy_module.TabularPolicy(seq_game)
    if not compute_metrics:
        return elapsed, fresh_policy

    conv = exploitability.nash_conv(seq_game, solver.average_policy())
    values = expected_game_score.policy_value(
        seq_game.new_initial_state(), [fresh_policy])
    return elapsed, fresh_policy, conv, values
示例#8
0
def XFP_Solving(game, iterations, save_every=0, save_prefix='base'):
    """Run XFP on ``game`` and save the average policy as a tabular policy.

    The solver is iterated ``iterations + 1`` times (indices ``0`` through
    ``iterations``).  When ``save_every`` is non-zero, the policy is saved on
    every iteration index divisible by it; a final save always happens after
    the loop.  Policies are written via ``policy_handler.save_to_tabular_policy``
    to ``policies/XFP/<save_prefix>/<iteration index>``.

    Args:
        game: Game handed to ``fictitious_play.XFPSolver``.
        iterations: Index of the last iteration to run; must be >= 0.
        save_every: Checkpoint period in iterations; ``0`` disables
            intermediate checkpoints.
        save_prefix: Sub-directory name used in the save path.

    Raises:
        ValueError: If ``iterations`` is negative (no iteration would run, so
            there is no iteration index to save under).
    """
    if iterations < 0:
        raise ValueError(
            "iterations must be non-negative, got {}".format(iterations))

    def save_xfp(iteration):
        """Merge both players' average-policy tables and save them."""
        tables = xfp_solver.average_policy_tables()
        merged = {}
        # Player 0 first, then player 1, so player 1 wins on a key clash.
        for player in (0, 1):
            for info_state, action_probs in tables[player].items():
                # Change possible None's into 0.  Built as a real list (not a
                # single-use generator) so the saved values can be iterated
                # more than once, and per info state so that info states with
                # different numbers of actions are supported.
                merged[info_state] = [
                    prob if prob else 0 for prob in action_probs.values()
                ]
        policy_handler.save_to_tabular_policy(
            game, merged,
            "policies/XFP/{}/{}".format(save_prefix, iteration))

    xfp_solver = fictitious_play.XFPSolver(game)
    for it in range(iterations + 1):
        xfp_solver.iteration()
        # Check `save_every != 0` first to avoid a modulo by zero.
        if save_every != 0 and it % save_every == 0:
            save_xfp(it)
    # Final save under the last iteration index; this may rewrite the
    # checkpoint just produced inside the loop with identical content.
    save_xfp(iterations)
def get_kuhn_poker_data(num_players=3):
  """Returns the kuhn poker data for the number of players specified.

  Runs three XFP iterations with saved oracles on Kuhn poker and builds an
  empirical metagame using a seed chosen per player count.

  Args:
    num_players: Number of players; only 2, 3 and 4 have a seed assigned.

  Returns:
    A list with one metagame payoff table per player.

  Raises:
    ValueError: If `num_players` has no seed assigned.
  """
  # Results are seed-dependent, so show some interesting cases
  seed_by_num_players = {2: 1, 3: 5, 4: 2}
  # Fail fast with a clear message instead of hitting an unbound
  # `meta_games` after the solver has already run.
  if num_players not in seed_by_num_players:
    raise ValueError(
        'num_players must be one of {}, got {!r}'.format(
            sorted(seed_by_num_players), num_players))

  game = pyspiel.load_game('kuhn_poker', {'players': num_players})
  xfp_solver = fictitious_play.XFPSolver(game, save_oracles=True)
  for _ in range(3):
    xfp_solver.iteration()

  meta_games = xfp_solver.get_empirical_metagame(
      100, seed=seed_by_num_players[num_players])

  # Metagame utility matrices for each player
  return [meta_games[i] for i in range(num_players)]
 def test_xfp(self):
   """Checks XFP's average-policy values on 2-player Kuhn poker.

   Runs the solver, copies its per-player average-policy tables into a
   single tabular policy, and asserts that the resulting expected values
   are within 1e-3 of [-1/18, 1/18].
   """
   # Single source of truth for the iteration count, so the printed message
   # cannot drift from the loop (it previously claimed 10 while running 100).
   num_iterations = 100
   game = pyspiel.load_game("kuhn_poker")
   xfp_solver = fictitious_play.XFPSolver(game)
   for _ in range(num_iterations):
     xfp_solver.iteration()
   average_policies = xfp_solver.average_policy_tables()
   tabular_policy = policy.TabularPolicy(game)
   # Copy every player's info-state -> action -> probability entries into
   # the one tabular policy that is then used for both seats.
   for player_id in range(2):
     for info_state, state_policy in average_policies[player_id].items():
       policy_to_update = tabular_policy.policy_for_key(info_state)
       for action, probability in state_policy.items():
         policy_to_update[action] = probability
   average_policy_values = expected_game_score.policy_value(
       game.new_initial_state(), [tabular_policy, tabular_policy])
   print("Kuhn 2P average values after {} iterations".format(num_iterations))
   print("P0: {}".format(average_policy_values[0]))
   print("P1: {}".format(average_policy_values[1]))
   self.assertIsNotNone(average_policy_values)
   self.assertTrue(
       np.allclose(average_policy_values, [-1 / 18, 1 / 18], atol=1e-3))
示例#11
0
                np.save(save_prefix + '_exps', np.array(exps))
                print(f"saving to: {save_prefix + '_episodes.npy'}")
                np.save(save_prefix + '_episodes', np.array(episodes))
                if algorithm == 'cfr':
                    cfr_infostates.append(solver.num_infostates_expanded)
                    print("Num infostates expanded (mil): ",
                          solver.num_infostates_expanded / 1e6)
                    print(f"saving to: {save_prefix + '_infostates.npy'}")
                    np.save(save_prefix + '_infostates',
                            np.array(cfr_infostates))

    if algorithm == 'cfr':
        solver = cfr.CFRSolver(game)
        run(solver, iterations)
    elif algorithm == 'xfp':
        solver = fictitious_play.XFPSolver(game)
        run(solver, iterations)
    elif algorithm == 'xdo':
        brs = []
        info_test = []
        for i in range(2):
            br_info = exploitability.best_response(
                game,
                cfr.CFRSolver(game).average_policy(), i)
            full_br_policy = _full_best_response_policy(
                br_info["best_response_action"])
            info_sets = br_info['info_sets']
            info_test.append(info_sets)
            brs.append(full_br_policy)
        br_list = [brs]
        start_time = time.time()