def test_cfr_cce_dist_goofspiel(self):
  """Copy of the TestCCEDistCFRGoofSpiel in corr_dist_test.cc."""
  game = pyspiel.load_game(
      "turn_based_simultaneous_game(game=goofspiel(num_cards=3,points_order="
      "descending,returns_type=total_points))")
  for num_iterations in [1, 10, 100]:
    solver = cfr.CFRSolver(game)
    # Collect one pyspiel policy per CFR iterate.
    policies = []
    for _ in range(num_iterations):
      solver.evaluate_and_update_policy()
      policies.append(
          policy.python_policy_to_pyspiel_policy(solver.current_policy()))
    mu = pyspiel.uniform_correlation_device(policies)
    cce_dist1 = pyspiel.cce_dist(game, mu)
    print(
        "goofspiel, cce test num_iterations: {}, cce_dist: {}".format(
            num_iterations, cce_dist1))
    # Assemble the same correlation device manually, just as an example for
    # how to do non-uniform distributions of them and to test the python
    # bindings for lists of tuples works properly
    weight = 1.0 / len(policies)
    mu2 = [(weight, pol) for pol in policies]
    cce_dist2 = pyspiel.cce_dist(game, mu2)
    # Both devices are the same distribution, so the dists must agree.
    self.assertAlmostEqual(cce_dist1, cce_dist2)
def cfr_train(unused_arg):
  """Trains CFR on FLAGS.game, logging exploitability and saving policies.

  Runs FLAGS.episodes iterations of CFR on a 2-player instance of FLAGS.game,
  records exploitability every 100 iterations, pickles the
  (iteration, exploitability) history, and writes the final average policy to
  one CSV file per player.

  Args:
    unused_arg: unused (absl.app.run passes argv here).
  """
  exploit_history = []
  exploit_idx = []
  tf.enable_eager_execution()
  game = pyspiel.load_game(FLAGS.game, {"players": pyspiel.GameParameter(2)})
  agent_name = "cfr"
  cfr_solver = cfr.CFRSolver(game)
  checkpoint = datetime.now()
  for ep in range(FLAGS.episodes):
    cfr_solver.evaluate_and_update_policy()
    if ep % 100 == 0:
      delta = datetime.now() - checkpoint
      conv = exploitability.exploitability(game, cfr_solver.average_policy())
      exploit_idx.append(ep)
      exploit_history.append(conv)
      print(
          "Iteration {} exploitability {} - {} seconds since last checkpoint"
          .format(ep, conv, delta.seconds))
      checkpoint = datetime.now()
  # Bug fix: the original passed an unclosed open(...) handle to pickle.dump,
  # leaking the file descriptor; a context manager closes it even on error.
  with open(
      FLAGS.game + "_" + agent_name + "_" + str(FLAGS.episodes) + ".dat",
      "wb") as dat_file:
    pickle.dump([exploit_idx, exploit_history], dat_file)
  now = datetime.now()
  # Renamed from `policy` to avoid shadowing the open_spiel `policy` module.
  avg_policy = cfr_solver.average_policy()
  # NOTE(review): `ep` here is the last loop index; this raises NameError if
  # FLAGS.episodes == 0 — confirm a zero-episode run is not expected.
  for pid in [1, 2]:
    # Bug fix: the original used str(pid + 1), naming the files for players
    # 2 and 3 in a 2-player game; use the player id itself.
    policy_to_csv(
        game, avg_policy,
        "policies/policy_" + now.strftime("%m-%d-%Y_%H-%M") + "_" +
        agent_name + "_" + str(pid) + "_+" + str(ep) + "episodes.csv")
def main(_):
  """Runs CFR on the module-level game and dumps both policies to CSV."""
  game = pyspiel.load_game_as_turn_based(game_)
  cfr_solver = cfr.CFRSolver(game)
  print("policy_initial:", cfr_solver.current_policy().action_probability_array)
  for i in range(FLAGS.iterations):
    # Periodic debug dump of exploitability and the full policy tables.
    if i % FLAGS.print_freq == 0:
      conv = exploitability.exploitability(game, cfr_solver.average_policy())
      print("Iteration {} exploitability {}".format(i, conv))
      print("Iteration{}".format(i))
      print("policy_av:", cfr_solver.average_policy().action_probability_array)
      print("policy_cr:", cfr_solver.current_policy().action_probability_array)
    cfr_solver.evaluate_and_update_policy()
  # Persist both players' rows of the average and current policies.
  final_avg = cfr_solver.average_policy().action_probability_array
  write_csv(dir_ + game_ + "_" + algo_name + "_av.csv", final_avg[0])
  write_csv(dir_ + game_ + "_" + algo_name + "_av.csv", final_avg[1])
  final_cur = cfr_solver.current_policy().action_probability_array
  write_csv(dir_ + game_ + "_" + algo_name + "_cr.csv", final_cur[0])
  write_csv(dir_ + game_ + "_" + algo_name + "_cr.csv", final_cur[1])
def test_cfr_on_turn_based_game_with_exploitability(self):
  """Check if CFR can be applied to the sequential game."""
  game = pyspiel.load_game(
      "python_dynamic_routing(max_num_time_step=5,time_step_length=1.0)")
  seq_game = pyspiel.convert_to_turn_based(game)
  solver = cfr.CFRSolver(seq_game)
  for _ in range(_NUM_ITERATION_CFR_TEST):
    solver.evaluate_and_update_policy()
  # Smoke check only: nash_conv must run without raising on the converted game.
  exploitability.nash_conv(seq_game, solver.average_policy())
def main(_):
  """Runs CFR on FLAGS.game, printing exploitability every print_freq steps."""
  params = {"players": pyspiel.GameParameter(FLAGS.players)}
  game = pyspiel.load_game(FLAGS.game, params)
  solver = cfr.CFRSolver(game)
  for iteration in range(FLAGS.iterations):
    solver.evaluate_and_update_policy()
    if iteration % FLAGS.print_freq == 0:
      conv = exploitability.exploitability(game, solver.average_policy())
      print("Iteration {} exploitability {}".format(iteration, conv))
def test_cfr_kuhn_poker(self):
  """300 CFR iterations on Kuhn poker reach the known Nash value."""
  game = pyspiel.load_game("kuhn_poker")
  solver = cfr.CFRSolver(game)
  for _ in range(300):
    solver.evaluate_and_update_policy()
  avg_policy = solver.average_policy()
  values = expected_game_score.policy_value(
      game.new_initial_state(), [avg_policy] * 2)
  # 1/18 is the Nash value. See https://en.wikipedia.org/wiki/Kuhn_poker
  np.testing.assert_allclose(values, [-1 / 18, 1 / 18], atol=1e-3)
def CFR_Solving(game, iterations, save_every=0, save_prefix='base'):
  """Runs CFR for `iterations` steps, checkpointing the average policy.

  Saves every `save_every` iterations (0 disables periodic saves) and always
  saves once after the final iteration.
  """

  def save_cfr():
    # Snapshot the current average policy as {info_state: action_probs}.
    # `it` is read from the enclosing scope (late binding on purpose).
    tabular = cfr_solver.average_policy()
    snapshot = dict(zip(tabular.state_lookup, tabular.action_probability_array))
    policy_handler.save_to_tabular_policy(
        game, snapshot, "policies/CFR/{}/{}".format(save_prefix, it))

  cfr_solver = cfr.CFRSolver(game)
  for it in range(iterations + 1):
    if save_every != 0 and it % save_every == 0:  # order is important
      save_cfr()
    cfr_solver.evaluate_and_update_policy()
  save_cfr()
def main(unused_argv):
  """Trains CFR on Kuhn poker, plots exploitability/NashConv, saves policy.

  Args:
    unused_argv: unused (absl.app.run passes argv here).
  """
  game = pyspiel.load_game("kuhn_poker")
  cfr_solver = cfr.CFRSolver(game)
  episodes = []
  exploits = []
  nashes = []
  # Train the agent for a specific amount of episodes
  for ep in range(FLAGS.num_train_episodes):
    print("Running episode {} of {}".format(ep, FLAGS.num_train_episodes))
    cfr_solver.evaluate_and_update_policy()
    avg_pol = cfr_solver.average_policy()
    # Calculate the exploitability and nash convergence
    expl = exploitability.exploitability(game, avg_pol)
    nash = exploitability.nash_conv(game, avg_pol)
    exploits.append(expl)
    nashes.append(nash)
    episodes.append(ep)
  # Get the average policy
  average_policy = cfr_solver.average_policy()
  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [average_policy] * 2)
  # (Removed unused local `cur_pol = cfr_solver.current_policy()`.)
  # Plot the exploitability
  plt.plot(episodes, exploits, "-r", label="Exploitability")
  plt.xscale("log")
  plt.yscale("log")
  plt.xlim(FLAGS.eval_every, FLAGS.num_train_episodes)
  plt.legend(loc="upper right")
  # Bug fix: savefig must come before show(); show() can clear the current
  # figure, so saving afterwards may write a blank image.
  plt.savefig("cfr_expl.png")
  plt.show()
  plt.figure()
  # Plot the nash convergence
  plt.plot(episodes, nashes, "-r", label="NashConv")
  plt.xscale("log")
  plt.yscale("log")
  plt.xlim(FLAGS.eval_every, FLAGS.num_train_episodes)
  plt.legend(loc="upper right")
  plt.savefig("cfr_nash.png")
  plt.show()
  print(average_policy)
  print(average_policy_values)
  policy_to_csv(game, average_policy, "./kuhn_policy.csv")
def main(_):
  """Runs 1000 iterations of CFR on Kuhn poker and prints the game value."""
  game = pyspiel.load_game("kuhn_poker")
  cfr_solver = cfr.CFRSolver(game)
  iterations = 1000
  for i in range(iterations):
    # NOTE(review): evaluate_and_update_policy() has no documented return
    # value, so cfr_value is likely None and this print shows "None" —
    # confirm against the cfr module's API.
    cfr_value = cfr_solver.evaluate_and_update_policy()
    print("Game util at iteration {}: {}".format(i, cfr_value))
  average_policy = cfr_solver.average_policy()
  # Expected returns for both players under the average policy; Kuhn poker's
  # Nash value for player 0 is -1/18.
  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [average_policy] * 2)
  print("Computed player 0 value: {}".format(average_policy_values[0]))
  print("Expected player 0 value: {}".format(-1 / 18))
def test_cfr_cce_ce_dist_goofspiel(self):
  """Copy of the TestCCEDistCFRGoofSpiel in corr_dist_test.cc."""
  game = pyspiel.load_game(
      "turn_based_simultaneous_game(game=goofspiel(num_cards=3,points_order="
      "descending,returns_type=total_points))")
  for num_iterations in [1, 10, 100]:
    policies = []
    cfr_solver = cfr.CFRSolver(game)
    for _ in range(num_iterations):
      cfr_solver.evaluate_and_update_policy()
      # Convert each iterate's current policy to a pyspiel (C++) policy so
      # it can be placed in a correlation device.
      policies.append(
          policy.python_policy_to_pyspiel_policy(
              cfr_solver.current_policy()))
    mu = pyspiel.uniform_correlation_device(policies)
    cce_dist_info = pyspiel.cce_dist(game, mu)
    print(
        "goofspiel, cce test num_iters: {}, cce_dist: {}, per player: {}"
        .format(num_iterations, cce_dist_info.dist_value,
                cce_dist_info.deviation_incentives))
    # Try converting one of the BR policies:
    _ = policy.pyspiel_policy_to_python_policy(
        game, cce_dist_info.best_response_policies[0])

    # Assemble the same correlation device manually, just as an example for
    # how to do non-uniform distributions of them and to test the python
    # bindings for lists of tuples works properly
    uniform_prob = 1.0 / len(policies)
    mu2 = [(uniform_prob, policy) for policy in policies]
    cce_dist_info2 = pyspiel.cce_dist(game, mu2)
    # The total dist value equals the sum of per-player deviation incentives.
    self.assertAlmostEqual(cce_dist_info2.dist_value,
                           sum(cce_dist_info.deviation_incentives))

    # Test the CEDist function too, why not. Disable the exact one, as it
    # takes too long for a test.
    # ce_dist_info = pyspiel.ce_dist(game, pyspiel.determinize_corr_dev(mu))
    ce_dist_info = pyspiel.ce_dist(
        game, pyspiel.sampled_determinize_corr_dev(mu, 100))
    print(
        "goofspiel, ce test num_iters: {}, ce_dist: {}, per player: {}"
        .format(num_iterations, ce_dist_info.dist_value,
                ce_dist_info.deviation_incentives))
    print("number of conditional best responses per player:")
    for p in range(game.num_players()):
      print("  player {}, num: {}".format(
          p, len(ce_dist_info.conditional_best_response_policies[p])))
def counterfactual_regret_minimization(seq_game, number_of_iterations,
                                       compute_metrics=False):
  """Runs CFR on a sequential game and times the training loop.

  Returns (elapsed_seconds, average_policy), or
  (elapsed_seconds, average_policy, nash_conv) when compute_metrics is True.
  """
  solver = cfr.CFRSolver(seq_game)
  start = time.time()
  for _ in range(number_of_iterations):
    solver.evaluate_and_update_policy()
  elapsed = time.time() - start
  if compute_metrics:
    nash_conv = exploitability.nash_conv(seq_game, solver.average_policy())
    return elapsed, solver.average_policy(), nash_conv
  return elapsed, solver.average_policy()
print(f"saving to: {save_prefix + '_times.npy'}") np.save(save_prefix + '_times', np.array(times)) print(f"saving to: {save_prefix + '_exps.npy'}") np.save(save_prefix + '_exps', np.array(exps)) print(f"saving to: {save_prefix + '_episodes.npy'}") np.save(save_prefix + '_episodes', np.array(episodes)) if algorithm == 'cfr': cfr_infostates.append(solver.num_infostates_expanded) print("Num infostates expanded (mil): ", solver.num_infostates_expanded / 1e6) print(f"saving to: {save_prefix + '_infostates.npy'}") np.save(save_prefix + '_infostates', np.array(cfr_infostates)) if algorithm == 'cfr': solver = cfr.CFRSolver(game) run(solver, iterations) elif algorithm == 'xfp': solver = fictitious_play.XFPSolver(game) run(solver, iterations) elif algorithm == 'xdo': brs = [] info_test = [] for i in range(2): br_info = exploitability.best_response( game, cfr.CFRSolver(game).average_policy(), i) full_br_policy = _full_best_response_policy( br_info["best_response_action"]) info_sets = br_info['info_sets'] info_test.append(info_sets)