# Shared imports for the snippets in this section: absl utilities, six,
# TF1-style TensorFlow, and the OpenSpiel modules referenced below.
from absl import app
from absl import flags
from absl import logging

import six
import tensorflow.compat.v1 as tf

from open_spiel.python import policy
from open_spiel.python.algorithms import deep_cfr
from open_spiel.python.algorithms import expected_game_score
from open_spiel.python.algorithms import exploitability
import pyspiel

FLAGS = flags.FLAGS


def main(unused_argv):
  logging.info("Loading %s", FLAGS.game_name)
  game = pyspiel.load_game(FLAGS.game_name)

  with tf.Session() as sess:
    deep_cfr_solver = deep_cfr.DeepCFRSolver(
        sess,
        game,
        policy_network_layers=(32, 32),
        advantage_network_layers=(16, 16),
        num_iterations=FLAGS.num_iterations,
        num_traversals=FLAGS.num_traversals,
        learning_rate=1e-3,
        batch_size_advantage=None,
        batch_size_strategy=None,
        memory_capacity=1e7)
    sess.run(tf.global_variables_initializer())
    _, advantage_losses, policy_loss = deep_cfr_solver.solve()
    for player, losses in six.iteritems(advantage_losses):
      logging.info("Advantage for player %d: %s", player,
                   losses[:2] + ["..."] + losses[-2:])
      logging.info("Advantage Buffer Size for player %s: '%s'", player,
                   len(deep_cfr_solver.advantage_buffers[player]))
    logging.info("Strategy Buffer Size: '%s'",
                 len(deep_cfr_solver.strategy_buffer))
    logging.info("Final policy loss: '%s'", policy_loss)
    conv = exploitability.nash_conv(
        game,
        policy.PolicyFromCallable(game, deep_cfr_solver.action_probabilities))
    logging.info("Deep CFR in '%s' - NashConv: %s", FLAGS.game_name, conv)
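# A minimal flag setup for the main above, assuming the standard absl flags
# OpenSpiel examples use; the default values here are illustrative
# assumptions, not taken from the original file.
flags.DEFINE_string("game_name", "kuhn_poker", "Name of the game.")
flags.DEFINE_integer("num_iterations", 400, "Number of training iterations.")
flags.DEFINE_integer("num_traversals", 40, "Traversals per iteration.")

# Conventional absl entry point:
if __name__ == "__main__":
  app.run(main)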
def DEEPCFR_Solving(game, iterations, save_every=0, save_prefix='base',
                    num_travers=40, lr=1e-3, policy_layers=(32, 32),
                    advantage_layers=(16, 16)):
  """Runs Deep CFR on `game`, periodically saving the average policy."""

  def save_deepcfr():
    # Print some training statistics and save the current tabular policy.
    print("---------iteration " + str(it) + "----------")
    for player, losses in six.iteritems(advantage_losses):
      print("Advantage for player ", player, losses)
      print("Advantage Buffer Size for player", player,
            len(deep_cfr_solver.advantage_buffers[player]))
    print("Strategy Buffer Size: ", len(deep_cfr_solver.strategy_buffer))
    print("policy loss: ", policy_loss)
    # Convert the solver's average strategy into a tabular policy; a single
    # conversion suffices since action_probabilities is already a callable.
    tabular_policy = policy.tabular_policy_from_callable(
        game, deep_cfr_solver.action_probabilities)
    # Use `policy_dict` to avoid shadowing the `policy` module.
    policy_dict = dict(
        zip(tabular_policy.state_lookup,
            tabular_policy.action_probability_array))
    # Save under the folder (save_prefix)_(num_travers).
    return policy_handler.save_to_tabular_policy(
        game, policy_dict,
        "policies/DEEPCFR/{}/{}".format(save_prefix + "_" + str(num_travers),
                                        it))

  with tf.Session() as sess:
    # Use num_iterations=1 and call solve() repeatedly, to allow intermediate
    # saving and evaluation between iterations.
    deep_cfr_solver = deep_cfr.DeepCFRSolver(
        sess,
        game,
        policy_network_layers=policy_layers,
        advantage_network_layers=advantage_layers,
        num_iterations=1,
        num_traversals=num_travers,
        learning_rate=lr)
    sess.run(tf.global_variables_initializer())
    for it in range(iterations + 1):
      _, advantage_losses, policy_loss = deep_cfr_solver.solve()
      if save_every != 0 and it % save_every == 0:
        save_deepcfr()
    return save_deepcfr()
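# A short usage sketch for DEEPCFR_Solving, assuming `policy_handler` (the
# external save helper it calls) is importable; the game choice and
# hyperparameters here are illustrative.
game = pyspiel.load_game("kuhn_poker")
final_policy = DEEPCFR_Solving(
    game, iterations=100, save_every=10, save_prefix="kuhn", num_travers=40)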
def test_deep_cfr_runs(self, game_name):
  game = pyspiel.load_game(game_name)
  with tf.Session() as sess:
    deep_cfr_solver = deep_cfr.DeepCFRSolver(
        sess,
        game,
        policy_network_layers=(8, 4),
        advantage_network_layers=(4, 2),
        num_iterations=2,
        num_traversals=2,
        learning_rate=1e-3,
        batch_size_advantage=None,
        batch_size_strategy=None,
        memory_capacity=1e7)
    sess.run(tf.global_variables_initializer())
    deep_cfr_solver.solve()
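# `test_deep_cfr_runs` takes `self` and a `game_name` argument, so it is
# written as a method of a parameterized test class. A minimal harness sketch,
# assuming absl's testing utilities; the class name and game list are
# illustrative assumptions, not from the original file.
from absl.testing import parameterized


class DeepCFRSmokeTest(parameterized.TestCase):

  @parameterized.parameters("kuhn_poker", "leduc_poker")
  def test_deep_cfr_builds(self, game_name):
    # Builds the solver with tiny networks and runs the TF initializers,
    # without the full solve() loop.
    game = pyspiel.load_game(game_name)
    with tf.Session() as sess:
      deep_cfr.DeepCFRSolver(
          sess,
          game,
          policy_network_layers=(8, 4),
          advantage_network_layers=(4, 2),
          num_iterations=1,
          num_traversals=1,
          learning_rate=1e-3)
      sess.run(tf.global_variables_initializer())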
def main(unused_argv):
  logging.info("Loading %s", FLAGS.game_name)
  game = pyspiel.load_game(FLAGS.game_name)

  with tf.Session() as sess:
    deep_cfr_solver = deep_cfr.DeepCFRSolver(
        sess,
        game,
        policy_network_layers=(16,),
        advantage_network_layers=(16,),
        num_iterations=FLAGS.num_iterations,
        num_traversals=FLAGS.num_traversals,
        learning_rate=1e-3,
        batch_size_advantage=128,
        batch_size_strategy=1024,
        memory_capacity=1e7,
        policy_network_train_steps=400,
        advantage_network_train_steps=20,
        reinitialize_advantage_networks=False)
    sess.run(tf.global_variables_initializer())
    _, advantage_losses, policy_loss = deep_cfr_solver.solve()
    for player, losses in six.iteritems(advantage_losses):
      logging.info("Advantage for player %d: %s", player,
                   losses[:2] + ["..."] + losses[-2:])
      logging.info("Advantage Buffer Size for player %s: '%s'", player,
                   len(deep_cfr_solver.advantage_buffers[player]))
    logging.info("Strategy Buffer Size: '%s'",
                 len(deep_cfr_solver.strategy_buffer))
    logging.info("Final policy loss: '%s'", policy_loss)

    average_policy = policy.tabular_policy_from_callable(
        game, deep_cfr_solver.action_probabilities)
    conv = exploitability.nash_conv(game, average_policy)
    logging.info("Deep CFR in '%s' - NashConv: %s", FLAGS.game_name, conv)

    average_policy_values = expected_game_score.policy_value(
        game.new_initial_state(), [average_policy] * 2)
    print("Computed player 0 value: {}".format(average_policy_values[0]))
    print("Expected player 0 value: {}".format(-1 / 18))
    print("Computed player 1 value: {}".format(average_policy_values[1]))
    print("Expected player 1 value: {}".format(1 / 18))
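# The reference values -1/18 and +1/18 are the Nash-equilibrium expected
# payoffs of two-player Kuhn poker, so the main above assumes FLAGS.game_name
# is "kuhn_poker". A small illustrative helper for checking the computed
# values against them (the tolerance is an assumption, not from the original):
import numpy as np


def near_kuhn_nash_values(values, atol=0.05):
  """Returns True if computed player values are near Kuhn's Nash values."""
  return np.allclose(values, [-1.0 / 18, 1.0 / 18], atol=atol)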
def test_matching_pennies_3p(self):
  # We don't expect Deep CFR to necessarily converge on 3-player games, but
  # it's nonetheless interesting to see this result.
  game = pyspiel.load_game_as_turn_based('matching_pennies_3p')
  with tf.Session() as sess:
    deep_cfr_solver = deep_cfr.DeepCFRSolver(
        sess,
        game,
        policy_network_layers=(16, 8),
        advantage_network_layers=(32, 16),
        num_iterations=2,
        num_traversals=2,
        learning_rate=1e-3,
        batch_size_advantage=None,
        batch_size_strategy=None,
        memory_capacity=1e7)
    sess.run(tf.global_variables_initializer())
    deep_cfr_solver.solve()
    conv = exploitability.nash_conv(
        game,
        policy.tabular_policy_from_callable(
            game, deep_cfr_solver.action_probabilities))
    print('Deep CFR in Matching Pennies 3p. NashConv: {}'.format(conv))
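# Conventional closing boilerplate for these TF-based tests, assuming the
# usual TF test runner:
if __name__ == "__main__":
  tf.test.main()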