def test_run_kuhn(self):
  env = rl_environment.Environment("kuhn_poker")
  state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]
  with self.session() as sess:
    agents = [
        nfsp.NFSP(  # pylint: disable=g-complex-comprehension
            sess,
            player_id,
            state_representation_size=state_size,
            num_actions=num_actions,
            hidden_layers_sizes=[16],
            reservoir_buffer_capacity=10,
            anticipatory_param=0.1) for player_id in [0, 1]
    ]
    sess.run(tf.global_variables_initializer())
    for unused_ep in range(10):
      time_step = env.reset()
      while not time_step.last():
        current_player = time_step.observations["current_player"]
        current_agent = agents[current_player]
        agent_output = current_agent.step(time_step)
        time_step = env.step([agent_output.action])
      for agent in agents:
        agent.step(time_step)
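# The snippets in this listing assume TF1-style OpenSpiel imports. A minimal
# import block they appear to rely on is sketched below; the module paths
# follow the open_spiel Python API, while the FLAGS definitions vary per
# script and are assumptions here.
import time

from absl import flags
from absl import logging
import tensorflow.compat.v1 as tf

import pyspiel
from open_spiel.python import policy
from open_spiel.python import rl_environment
from open_spiel.python.algorithms import expected_game_score
from open_spiel.python.algorithms import exploitability
from open_spiel.python.algorithms import nfsp
from open_spiel.python.algorithms import random_agent

tf.disable_v2_behavior()
FLAGS = flags.FLAGS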
def nfsp_train(unused_arg):
  env = rl_environment.Environment(FLAGS.game)
  state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]
  kwargs = {
      "replay_buffer_capacity": int(2e5),  # buffer capacities are counts, so use ints
      "epsilon_decay_duration": FLAGS.episodes,
      "epsilon_start": 0.06,
      "epsilon_end": 0.001,
  }
  sess = tf.Session()
  players = [
      nfsp.NFSP(sess,
                idx,
                state_representation_size=state_size,
                num_actions=num_actions,
                hidden_layers_sizes=[64],
                reservoir_buffer_capacity=int(2e6),
                rl_learning_rate=0.1,
                sl_learning_rate=0.005,
                anticipatory_param=0.1,
                batch_size=128,
                learn_every=64,
                **kwargs) for idx in range(2)
  ]
  expl_policies_avg = NFSPPolicies(env, players, nfsp.MODE.average_policy)
  run_agents(sess, env, players, expl_policies_avg)
  sess.close()
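# nfsp_train above (and several snippets below) reference a module-level
# NFSPPolicies wrapper that is not defined in this listing. A minimal sketch,
# mirroring the nested class inside NFSP_Solving further down (and the
# standard OpenSpiel NFSP example), with env passed explicitly:
class NFSPPolicies(policy.Policy):
  """Joint policy to be evaluated."""

  def __init__(self, env, nfsp_policies, mode):
    game = env.game
    player_ids = list(range(env.num_players))
    super(NFSPPolicies, self).__init__(game, player_ids)
    self._policies = nfsp_policies
    self._mode = mode
    self._obs = {
        "info_state": [None] * env.num_players,
        "legal_actions": [None] * env.num_players
    }

  def action_probabilities(self, state, player_id=None):
    cur_player = state.current_player()
    legal_actions = state.legal_actions(cur_player)
    self._obs["current_player"] = cur_player
    self._obs["info_state"][cur_player] = (
        state.information_state_tensor(cur_player))
    self._obs["legal_actions"][cur_player] = legal_actions
    info_state = rl_environment.TimeStep(
        observations=self._obs, rewards=None, discounts=None, step_type=None)
    with self._policies[cur_player].temp_mode_as(self._mode):
      p = self._policies[cur_player].step(info_state, is_evaluation=True).probs
    return {action: p[action] for action in legal_actions}


# run_agents is likewise undefined here; a hypothetical sketch of such a
# training loop, initializing the variables (nfsp_train never does so itself)
# and periodically logging the exploitability of the average policy:
def run_agents(sess, env, agents, expl_policies_avg):
  sess.run(tf.global_variables_initializer())
  for ep in range(FLAGS.episodes):
    if (ep + 1) % 10000 == 0:
      expl = exploitability.exploitability(env.game, expl_policies_avg)
      logging.info("[%s] Exploitability AVG %s", ep + 1, expl)
    time_step = env.reset()
    while not time_step.last():
      player_id = time_step.observations["current_player"]
      agent_output = agents[player_id].step(time_step)
      time_step = env.step([agent_output.action])
    # Episode is over, step all agents with final info state.
    for agent in agents:
      agent.step(time_step)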
def main(_):
  game = "leduc_poker"
  num_players = 2
  env = rl_environment.Environment(game)
  state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
  kwargs = {
      "replay_buffer_capacity": FLAGS.replay_buffer_capacity,
      "reservoir_buffer_capacity": FLAGS.reservoir_buffer_capacity,
      "min_buffer_size_to_learn": FLAGS.min_buffer_size_to_learn,
      "anticipatory_param": FLAGS.anticipatory_param,
      "batch_size": FLAGS.batch_size,
      "learn_every": FLAGS.learn_every,
      "rl_learning_rate": FLAGS.rl_learning_rate,
      "sl_learning_rate": FLAGS.sl_learning_rate,
      "optimizer_str": FLAGS.optimizer_str,
      "loss_str": FLAGS.loss_str,
      "update_target_network_every": FLAGS.update_target_network_every,
      "discount_factor": FLAGS.discount_factor,
      "epsilon_decay_duration": FLAGS.epsilon_decay_duration,
      "epsilon_start": FLAGS.epsilon_start,
      "epsilon_end": FLAGS.epsilon_end,
  }

  with tf.Session() as sess:
    agents = [
        nfsp.NFSP(sess, idx, state_size, num_actions, hidden_layers_sizes,
                  **kwargs) for idx in range(num_players)
    ]
    # Restore pretrained weights for every agent from the checkpoint directory.
    for agent in agents:
      agent.restore(FLAGS.checkpoint_dir)

    # Evaluate the two trained agents head to head.
    r_mean = evaluateBotAgainstBot(env, agents[0], agents[1], 10000)
    logging.info("Mean episode rewards: %s", r_mean)

    # Alternatively, evaluate against uniform-random opponents:
    # random_agents = [
    #     random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
    #     for idx in range(num_players)
    # ]
    # r_mean = eval_against_random_bots(env, agents, random_agents, 10000)
    # logging.info("Mean episode rewards: %s", r_mean)
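# evaluateBotAgainstBot is not defined in this listing. A hypothetical sketch
# of such a head-to-head evaluator, stepping both agents in evaluation mode
# and averaging the per-player episode rewards:
def evaluateBotAgainstBot(env, agent0, agent1, num_episodes):
  """Hypothetical helper: mean episode rewards of agent0 vs agent1."""
  agents = [agent0, agent1]
  totals = [0.0 for _ in agents]
  for _ in range(num_episodes):
    time_step = env.reset()
    while not time_step.last():
      player_id = time_step.observations["current_player"]
      agent_output = agents[player_id].step(time_step, is_evaluation=True)
      time_step = env.step([agent_output.action])
    for i, reward in enumerate(time_step.rewards):
      totals[i] += reward
  return [total / num_episodes for total in totals]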
def main(unused_argv):
  logging.info("Loading %s", FLAGS.game_name)
  game = FLAGS.game_name
  num_players = FLAGS.num_players
  env_configs = {"players": num_players}
  env = rl_environment.Environment(game, **env_configs)
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
  kwargs = {
      "replay_buffer_capacity": FLAGS.replay_buffer_capacity,
      "reservoir_buffer_capacity": FLAGS.reservoir_buffer_capacity,
      "min_buffer_size_to_learn": FLAGS.min_buffer_size_to_learn,
      "anticipatory_param": FLAGS.anticipatory_param,
      "batch_size": FLAGS.batch_size,
      "learn_every": FLAGS.learn_every,
      "rl_learning_rate": FLAGS.rl_learning_rate,
      "sl_learning_rate": FLAGS.sl_learning_rate,
      "optimizer_str": FLAGS.optimizer_str,
      "loss_str": FLAGS.loss_str,
      "update_target_network_every": FLAGS.update_target_network_every,
      "discount_factor": FLAGS.discount_factor,
      "epsilon_decay_duration": FLAGS.epsilon_decay_duration,
      "epsilon_start": FLAGS.epsilon_start,
      "epsilon_end": FLAGS.epsilon_end,
  }

  with tf.Session() as sess:
    # pylint: disable=g-complex-comprehension
    agents = [
        nfsp.NFSP(sess, idx, info_state_size, num_actions, hidden_layers_sizes,
                  **kwargs) for idx in range(num_players)
    ]
    expl_policies_avg = NFSPPolicies(env, agents, nfsp.MODE.average_policy)

    sess.run(tf.global_variables_initializer())
    for ep in range(FLAGS.num_train_episodes):
      if (ep + 1) % FLAGS.eval_every == 0:
        losses = [agent.loss for agent in agents]
        logging.info("Losses: %s", losses)
        expl = exploitability.exploitability(env.game, expl_policies_avg)
        logging.info("[%s] Exploitability AVG %s", ep + 1, expl)
        logging.info("_____________________________________________")

      time_step = env.reset()
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = agents[player_id].step(time_step)
        action_list = [agent_output.action]
        time_step = env.step(action_list)

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)
def main(unused_argv):
  logging.info("Loading %s", FLAGS.game_name)
  game = FLAGS.game_name
  num_players = FLAGS.num_players
  env_configs = {
      "players": num_players,
      "map": 0,
      "rng_seed": -1,
      "max_turns": 90,
      "dep_abs": False,
      "atk_abs": True,
      "redist_abs": True,
      "fort_abs": True,
      "dep_q": 31,
      "atk_q": 2,
      "redist_q": 2,
      "fort_q": 2
  }
  env = rl_environment.Environment(game, **env_configs)
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
  kwargs = {
      "replay_buffer_capacity": FLAGS.replay_buffer_capacity,
      "reservoir_buffer_capacity": FLAGS.reservoir_buffer_capacity,
      "min_buffer_size_to_learn": FLAGS.min_buffer_size_to_learn,
      "anticipatory_param": FLAGS.anticipatory_param,
      "batch_size": FLAGS.batch_size,
      "learn_every": FLAGS.learn_every,
      "rl_learning_rate": FLAGS.rl_learning_rate,
      "sl_learning_rate": FLAGS.sl_learning_rate,
      "optimizer_str": FLAGS.optimizer_str,
      "loss_str": FLAGS.loss_str,
      "update_target_network_every": FLAGS.update_target_network_every,
      "discount_factor": FLAGS.discount_factor,
      "epsilon_decay_duration": FLAGS.epsilon_decay_duration,
      "epsilon_start": FLAGS.epsilon_start,
      "epsilon_end": FLAGS.epsilon_end,
  }

  with tf.Session() as sess:
    # pylint: disable=g-complex-comprehension
    agents = [
        nfsp.NFSP(sess, idx, info_state_size, num_actions, hidden_layers_sizes,
                  **kwargs) for idx in range(num_players)
    ]
    joint_avg_policy = NFSPPolicies(env, agents, nfsp.MODE.average_policy)

    sess.run(tf.global_variables_initializer())
    if FLAGS.use_checkpoints:
      for agent in agents:
        if agent.has_checkpoint(FLAGS.checkpoint_dir):
          agent.restore(FLAGS.checkpoint_dir)

    # Play a single episode with a human controlling FLAGS.human_player_id.
    time_step = env.reset()
    while not time_step.last():
      player_id = time_step.observations["current_player"]
      if FLAGS.human_player_id == player_id:
        print(time_step.observations["info_state"][player_id])
        print(time_step.observations["legal_actions"][player_id])
        visualise(time_step.observations["info_state"][player_id])
        human_action = input("Human action:")
        time_step = env.step([int(human_action)])
      else:
        agent_output = agents[player_id].step(time_step)
        action_list = [agent_output.action]
        print(action_list)
        time_step = env.step(action_list)
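# visualise is not defined in this listing and is presumably game-specific
# (the env_configs above suggest a Risk-like map game). A trivial hypothetical
# placeholder that just pretty-prints the flat info-state vector in rows:
def visualise(info_state, width=16):
  """Hypothetical placeholder for a game-specific board renderer."""
  for i in range(0, len(info_state), width):
    print(" ".join("{:5.2f}".format(v) for v in info_state[i:i + width]))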
def main(unused_argv):
  logging.info("Loading %s", FLAGS.game_name)
  game = FLAGS.game_name
  num_players = FLAGS.num_players
  env_configs = {"players": num_players}
  env = rl_environment.Environment(game, **env_configs)
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
  kwargs = {
      "replay_buffer_capacity": FLAGS.replay_buffer_capacity,
      "reservoir_buffer_capacity": FLAGS.reservoir_buffer_capacity,
      "min_buffer_size_to_learn": FLAGS.min_buffer_size_to_learn,
      "anticipatory_param": FLAGS.anticipatory_param,
      "batch_size": FLAGS.batch_size,
      "learn_every": FLAGS.learn_every,
      "rl_learning_rate": FLAGS.rl_learning_rate,
      "sl_learning_rate": FLAGS.sl_learning_rate,
      "optimizer_str": FLAGS.optimizer_str,
      "loss_str": FLAGS.loss_str,
      "update_target_network_every": FLAGS.update_target_network_every,
      "discount_factor": FLAGS.discount_factor,
      "epsilon_decay_duration": FLAGS.epsilon_decay_duration,
      "epsilon_start": FLAGS.epsilon_start,
      "epsilon_end": FLAGS.epsilon_end,
  }

  with tf.Session() as sess:
    # pylint: disable=g-complex-comprehension
    agents = [
        nfsp.NFSP(sess, idx, info_state_size, num_actions, hidden_layers_sizes,
                  **kwargs) for idx in range(num_players)
    ]
    joint_avg_policy = NFSPPolicies(env, agents, nfsp.MODE.average_policy)

    sess.run(tf.global_variables_initializer())
    if FLAGS.use_checkpoints:
      for agent in agents:
        if agent.has_checkpoint(FLAGS.checkpoint_dir):
          agent.restore(FLAGS.checkpoint_dir)

    for ep in range(FLAGS.num_train_episodes):
      if (ep + 1) % FLAGS.eval_every == 0:
        losses = [agent.loss for agent in agents]
        logging.info("Losses: %s", losses)
        if FLAGS.evaluation_metric == "exploitability":
          # Avg exploitability is implemented only for 2-player constant-sum
          # games; use nash_conv otherwise.
          expl = exploitability.exploitability(env.game, joint_avg_policy)
          logging.info("[%s] Exploitability AVG %s", ep + 1, expl)
        elif FLAGS.evaluation_metric == "nash_conv":
          nash_conv = exploitability.nash_conv(env.game, joint_avg_policy)
          logging.info("[%s] NashConv %s", ep + 1, nash_conv)
        else:
          raise ValueError(" ".join(("Invalid evaluation metric, choose from",
                                     "'exploitability', 'nash_conv'.")))
        if FLAGS.use_checkpoints:
          for agent in agents:
            agent.save(FLAGS.checkpoint_dir)
        logging.info("_____________________________________________")

      time_step = env.reset()
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = agents[player_id].step(time_step)
        action_list = [agent_output.action]
        time_step = env.step(action_list)

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)
def neural_ficticious_self_play(seq_game,
                                num_epoch,
                                sess,
                                compute_metrics=False):
  env = rl_environment.Environment(seq_game)

  # Parameters from the game.
  num_players = env.num_players
  num_actions = env.action_spec()["num_actions"]
  info_state_size = env.observation_spec()["info_state"][0]

  # Parameters for the algorithm.
  hidden_layers_sizes = [128]
  kwargs = {
      "replay_buffer_capacity": int(2e5),
      "reservoir_buffer_capacity": int(2e6),
      "min_buffer_size_to_learn": 1000,
      "anticipatory_param": 0.1,
      "batch_size": 128,
      "learn_every": 64,
      "rl_learning_rate": 0.01,
      "sl_learning_rate": 0.01,
      "optimizer_str": "sgd",
      "loss_str": "mse",
      "update_target_network_every": 19200,
      "discount_factor": 1.0,
      "epsilon_decay_duration": int(20e6),
      "epsilon_start": 0.06,
      "epsilon_end": 0.001,
  }

  agents = [
      nfsp.NFSP(sess, idx, info_state_size, num_actions, hidden_layers_sizes,
                **kwargs) for idx in range(num_players)
  ]
  joint_avg_policy = NFSPPolicies(env, agents, nfsp.MODE.average_policy)
  sess.run(tf.global_variables_initializer())

  tick_time = time.time()
  for _ in range(num_epoch):
    time_step = env.reset()
    while not time_step.last():
      player_id = time_step.observations["current_player"]
      agent_output = agents[player_id].step(time_step)
      action_list = [agent_output.action]
      time_step = env.step(action_list)

    # Episode is over, step all agents with final info state.
    for agent in agents:
      agent.step(time_step)
  timing = time.time() - tick_time

  if compute_metrics:
    # Convert the joint average policy to a tabular policy before computing
    # expected values. The original called joint_avg_policy.TabularPolicy,
    # which does not exist on policy.Policy; OpenSpiel's
    # policy.tabular_policy_from_policy helper is assumed here instead.
    tabular_policy = policy.tabular_policy_from_policy(seq_game,
                                                       joint_avg_policy)
    average_policy_values = expected_game_score.policy_value(
        seq_game.new_initial_state(), [tabular_policy])
    nash_conv = exploitability.nash_conv(env.game, joint_avg_policy)
    return timing, joint_avg_policy, average_policy_values, nash_conv
  return timing, joint_avg_policy
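# A usage sketch for neural_ficticious_self_play, assuming a sequential game
# loaded via pyspiel and the import block sketched earlier (left as comments
# so this listing stays import-safe):
# game = pyspiel.load_game("kuhn_poker")
# with tf.Session() as sess:
#   timing, avg_policy, values, nash_conv = neural_ficticious_self_play(
#       game, num_epoch=1000, sess=sess, compute_metrics=True)
#   print("Training took {:.1f}s, NashConv = {}".format(timing, nash_conv))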
def NFSP_Solving(game, iterations, save_every=0, save_prefix="base"):

  class NFSPPolicies(policy.Policy):
    """Joint policy to be evaluated."""

    def __init__(self, nfsp_policies, mode):
      player_ids = [0, 1]
      super(NFSPPolicies, self).__init__(game, player_ids)
      self._policies = nfsp_policies
      self._mode = mode
      self._obs = {"info_state": [None, None], "legal_actions": [None, None]}

    def action_probabilities(self, state, player_id=None):
      cur_player = state.current_player()
      legal_actions = state.legal_actions(cur_player)
      self._obs["current_player"] = cur_player
      self._obs["info_state"][cur_player] = (
          state.information_state_tensor(cur_player))
      self._obs["legal_actions"][cur_player] = legal_actions
      info_state = rl_environment.TimeStep(
          observations=self._obs, rewards=None, discounts=None, step_type=None)
      with self._policies[cur_player].temp_mode_as(self._mode):
        p = self._policies[cur_player].step(
            info_state, is_evaluation=True).probs
      prob_dict = {action: p[action] for action in legal_actions}
      return prob_dict

  def save_nfsp():
    tabular_policy = policy.tabular_policy_from_callable(
        game, expl_policies_avg)
    policy_handler.save_tabular_policy(
        game, tabular_policy, "policies/NFSP/{}/{}".format(save_prefix, it))

  num_players = 2
  env_configs = {"players": num_players}
  env = rl_environment.Environment(game, **env_configs)
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  hidden_layers_sizes = [128]
  replay_buffer_capacity = int(2e5)
  reservoir_buffer_capacity = int(2e6)
  anticipatory_param = 0.1
  kwargs = {
      "replay_buffer_capacity": replay_buffer_capacity,
      "epsilon_decay_duration": iterations,
      "epsilon_start": 0.06,
      "epsilon_end": 0.001,
  }

  with tf.Session() as sess:
    # pylint: disable=g-complex-comprehension
    agents = [
        nfsp.NFSP(sess, idx, info_state_size, num_actions, hidden_layers_sizes,
                  reservoir_buffer_capacity, anticipatory_param,
                  **kwargs) for idx in range(num_players)
    ]
    expl_policies_avg = NFSPPolicies(agents, nfsp.MODE.average_policy)
    sess.run(tf.global_variables_initializer())

    for it in range(iterations + 1):
      if save_every != 0 and it % save_every == 0:  # order is important
        save_nfsp()
      time_step = env.reset()
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = agents[player_id].step(time_step)
        action_list = [agent_output.action]
        time_step = env.step(action_list)

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)
    save_nfsp()
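# policy_handler is an external module not shown in this listing. A
# hypothetical sketch of its save_tabular_policy, pickling the tabular
# probability array (the real helper may serialize quite differently):
import os
import pickle


def save_tabular_policy(game, tabular_policy, path):
  """Hypothetical sketch: persist a TabularPolicy's probability array."""
  os.makedirs(os.path.dirname(path), exist_ok=True)
  with open(path, "wb") as f:
    pickle.dump(tabular_policy.action_probability_array, f)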
def train_network(
    num_episodes,
    hidden_layers_sizes,
    replay_buffer_capacity,
    reservoir_buffer_capacity,
    anticipatory_param,
    epsilon_start,
):
  # Train the NFSP network with the given hyperparameters.
  logging.info(
      "Training network with hyperparameters: LAY_SIZE={}, REPBUFCAP={}, "
      "RESBUFCAP={}, ANTPARAM={}, ESTART={}".format(
          hidden_layers_sizes,
          replay_buffer_capacity,
          reservoir_buffer_capacity,
          anticipatory_param,
          epsilon_start,
      ))
  game = FLAGS.game_name
  num_players = FLAGS.num_players

  # Set up the environment.
  env_configs = {"players": num_players}
  env = rl_environment.Environment(game, **env_configs)
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  # Set the arguments.
  hidden_layers_sizes = [hidden_layers_sizes]
  kwargs = {
      "replay_buffer_capacity": int(replay_buffer_capacity),
      "reservoir_buffer_capacity": int(reservoir_buffer_capacity),
      "min_buffer_size_to_learn": FLAGS.min_buffer_size_to_learn,
      "anticipatory_param": float(anticipatory_param),
      "batch_size": FLAGS.batch_size,
      "learn_every": FLAGS.learn_every,
      "rl_learning_rate": FLAGS.rl_learning_rate,
      "sl_learning_rate": FLAGS.sl_learning_rate,
      "optimizer_str": FLAGS.optimizer_str,
      "loss_str": FLAGS.loss_str,
      "update_target_network_every": FLAGS.update_target_network_every,
      "discount_factor": FLAGS.discount_factor,
      "epsilon_decay_duration": FLAGS.epsilon_decay_duration,
      "epsilon_start": float(epsilon_start),
      "epsilon_end": FLAGS.epsilon_end,
  }

  # Start the training session.
  with tf.Session() as sess:
    agents = [
        nfsp.NFSP(sess, idx, info_state_size, num_actions, hidden_layers_sizes,
                  **kwargs) for idx in range(num_players)
    ]
    expl_policies_avg = NFSPPolicies(env, agents, nfsp.MODE.average_policy)

    episodes = []
    exploits = []
    nashes = []

    sess.run(tf.global_variables_initializer())
    for ep in range(num_episodes):
      # Evaluate every eval_every'th episode: record the exploitability and
      # NashConv of the average policies.
      if (ep + 1) % FLAGS.eval_every == 0:
        losses = [agent.loss for agent in agents]
        expl = exploitability.exploitability(env.game, expl_policies_avg)
        nash = exploitability.nash_conv(env.game, expl_policies_avg)
        logging.info("[%s/%s] AVG Exploitability %s", ep + 1, num_episodes,
                     expl)
        episodes.append(ep + 1)
        exploits.append(expl)
        nashes.append(nash)

      time_step = env.reset()
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = agents[player_id].step(time_step)
        action_list = [agent_output.action]
        time_step = env.step(action_list)

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)

    # Export the trained policy.
    policy_to_csv(
        pyspiel.load_game("leduc_poker"),
        expl_policies_avg,
        "./leduc_best_policy.csv",
    )
  return (episodes, exploits, nashes)
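# policy_to_csv is also external (it appears in course tooling built around
# OpenSpiel rather than in the library itself). A hypothetical sketch using
# pandas, assuming open_spiel's policy.tabular_policy_from_policy helper:
import pandas as pd


def policy_to_csv(game, joint_policy, filename):
  """Hypothetical sketch: one CSV row of action probabilities per info state."""
  tabular = policy.tabular_policy_from_policy(game, joint_policy)
  df = pd.DataFrame(
      data=tabular.action_probability_array,
      index=list(tabular.state_lookup.keys()))
  df.to_csv(filename)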
def runNFSP(hidden_layers_sizes, replay_buffer_capacity,
            reservoir_buffer_capacity, epsilon_start, epsilon_end,
            anticipatory_param):
  # Data storage for the learning curve.
  episodes = []
  exploits = []

  # Initialize the game.
  game = FLAGS.game
  num_players = FLAGS.num_players
  env_configs = {"players": num_players}
  env = rl_environment.Environment(game, **env_configs)
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  kwargs = {
      "replay_buffer_capacity": replay_buffer_capacity,
      "epsilon_decay_duration": FLAGS.num_train_episodes,
      "epsilon_start": epsilon_start,
      "epsilon_end": epsilon_end,
  }

  # Start the TensorFlow session.
  with tf.Session() as sess:
    # Initialize the NFSP agents.
    agents = [
        nfsp.NFSP(sess, idx, info_state_size, num_actions, hidden_layers_sizes,
                  reservoir_buffer_capacity, anticipatory_param,
                  **kwargs) for idx in range(num_players)
    ]
    expl_policies_avg = NFSPPolicies(env, agents, nfsp.MODE.average_policy)

    sess.run(tf.global_variables_initializer())
    for ep in range(FLAGS.num_train_episodes):
      # Evaluate the agents (boolean "and", not the bitwise "&" the original
      # used).
      if ((ep + 1) % FLAGS.eval_every == 0) and ((ep + 1) >= 100):
        losses = [agent.loss for agent in agents]
        logging.info("Losses: %s", losses)
        expl = exploitability.exploitability(env.game, expl_policies_avg)
        logging.info("[%s] Exploitability AVG %s", ep + 1, expl)
        logging.info("_____________________________________________")
        episodes.append(ep + 1)
        exploits.append(expl)

      time_step = env.reset()
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = agents[player_id].step(time_step)
        action_list = [agent_output.action]
        time_step = env.step(action_list)

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)

    # Export one CSV of the joint average policy per player.
    for pid, agent in enumerate(agents):
      policy_to_csv(env.game, expl_policies_avg,
                    f"{FLAGS.modeldir}/test_p{pid + 1}.csv")
    play(agents, env)
  return episodes, exploits
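# play is not defined in this listing. A hypothetical sketch of an interactive
# loop in which a human controls seat 0 against the trained agents:
def play(agents, env):
  """Hypothetical helper: human plays seat 0, trained agents play the rest."""
  time_step = env.reset()
  while not time_step.last():
    player_id = time_step.observations["current_player"]
    if player_id == 0:
      legal = time_step.observations["legal_actions"][player_id]
      print("Legal actions:", legal)
      action = int(input("Your action: "))
    else:
      action = agents[player_id].step(time_step, is_evaluation=True).action
    time_step = env.step([action])
  print("Final rewards:", time_step.rewards)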