def test_loss_modes(self):
  loss_dict = {
      "qpg": rl_losses.BatchQPGLoss,
      "rpg": rl_losses.BatchRPGLoss,
      "rm": rl_losses.BatchRMLoss,
      "a2c": rl_losses.BatchA2CLoss,
  }
  with self.session() as sess:
    for loss_str, loss_class in loss_dict.items():
      agent_by_str = policy_gradient.PolicyGradient(
          sess,
          player_id=0,
          info_state_size=32,
          num_actions=2,
          loss_str=loss_str,
          loss_class=None)
      agent_by_class = policy_gradient.PolicyGradient(
          sess,
          player_id=0,
          info_state_size=32,
          num_actions=2,
          loss_str=None,
          loss_class=loss_class)
      self.assertEqual(agent_by_str._pi_loss.shape,
                       agent_by_class._pi_loss.shape)
      self.assertEqual(agent_by_str._pi_loss.dtype,
                       agent_by_class._pi_loss.dtype)
      self.assertEqual(agent_by_str._pi_loss.op.type,
                       agent_by_class._pi_loss.op.type)

def test_run_game(self, loss_str, game_name):
  # (loss_str, game_name) are presumably supplied by a parameterized test
  # decorator on this method.
  env = rl_environment.Environment(game_name)
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  with self.session() as sess:
    agents = [
        policy_gradient.PolicyGradient(  # pylint: disable=g-complex-comprehension
            sess,
            player_id=player_id,
            info_state_size=info_state_size,
            num_actions=num_actions,
            loss_str=loss_str,
            hidden_layers_sizes=[8, 8],
            batch_size=16,
            entropy_cost=0.001,
            critic_learning_rate=0.01,
            pi_learning_rate=0.01,
            num_critic_before_pi=4) for player_id in [0, 1]
    ]
    sess.run(tf.global_variables_initializer())

    for _ in range(2):
      time_step = env.reset()
      while not time_step.last():
        current_player = time_step.observations["current_player"]
        current_agent = agents[current_player]
        agent_output = current_agent.step(time_step)
        time_step = env.step([agent_output.action])

      for agent in agents:
        agent.step(time_step)

def main(_): game = "mst" num_players = 1 train_games, train_rewards, test_games, test_rewards = mst.game_params( FLAGS.num_nodes) env_configs = train_games[0] env = rl_environment.Environment(game, **env_configs) info_state_size = FLAGS.num_nodes * FLAGS.num_nodes * 3 num_actions = env.action_spec()["num_actions"] with tf.Session() as sess: # pylint: disable=g-complex-comprehension agents = [ policy_gradient.PolicyGradient(sess, idx, info_state_size, num_actions, loss_str=FLAGS.loss_str, hidden_layers_sizes=(128, )) for idx in range(num_players) ] expl_policies_avg = PolicyGradientPolicies(env, agents) sess.run(tf.global_variables_initializer()) for ep in range(FLAGS.num_episodes): env_configs = train_games[ep % len(train_games)] env = rl_environment.Environment(game, **env_configs) if (ep + 1) % FLAGS.eval_every == 0: losses = [agent.loss for agent in agents] #expl = exploitability.exploitability(env.game, expl_policies_avg) msg = "-" * 80 + "\n" msg += "{}: {}\n".format(ep + 1, losses) #expl, losses) logging.info("%s", msg) if (ep + 1) % FLAGS.test_every == 0: test_accuracy = test_trained_bot(test_games, test_rewards, agents[0], ep, FLAGS.num_nodes, game, FLAGS.game_version) logging.info("[%s] Test Accuracy: %s", ep + 1, test_accuracy) time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] agent_output = agents[player_id].step(time_step) action_list = [agent_output.action] time_step = env.step(action_list) # Episode is over, step all agents with final info state. for agent in agents: agent.step(time_step)
def main(_): game = "kuhn_poker" num_players = 2 env_configs = {"players": num_players} env = rl_environment.Environment(game, **env_configs) info_state_size = env.observation_spec()["info_state"][0] num_actions = env.action_spec()["num_actions"] with tf.Session() as sess: # pylint: disable=g-complex-comprehension agents = [ policy_gradient.PolicyGradient(sess, idx, info_state_size, num_actions, loss_str=FLAGS.loss_str, hidden_layers_sizes=(128, )) for idx in range(num_players) ] expl_policies_avg = PolicyGradientPolicies(env, agents) sess.run(tf.global_variables_initializer()) for ep in range(FLAGS.num_episodes): if (ep + 1) % FLAGS.eval_every == 0: losses = [agent.loss for agent in agents] expl = exploitability.exploitability(env.game, expl_policies_avg) msg = "-" * 80 + "\n" msg += "{}: {}\n{}\n".format(ep + 1, expl, losses) logging.info("%s", msg) time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] agent_output = agents[player_id].step(time_step) action_list = [agent_output.action] time_step = env.step(action_list) # Episode is over, step all agents with final info state. for agent in agents: agent.step(time_step) for pid, agent in enumerate(agents): policy_to_csv(env.game, expl_policies_avg, f"{FLAGS.modeldir}/test_p{pid+1}.csv")
def pgrad_train(unused_arg):
  sess = tf.Session()
  env = rl_environment.Environment(FLAGS.game)
  state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]
  agents = [
      policy_gradient.PolicyGradient(
          sess,
          idx,
          state_size,
          num_actions,
          loss_str="rpg",
          hidden_layers_sizes=(128,)) for idx in range(2)
  ]
  expl_policies_avg = PolicyGradientPolicies(env, agents)
  run_agents(sess, env, agents, expl_policies_avg)

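Several snippets above construct PolicyGradientPolicies(env, agents) without defining it. A minimal sketch, adapted from the nested class in the final snippet; the (env, agents) signature and the env.game lookup are assumptions made to fit the call sites:

from open_spiel.python import policy
from open_spiel.python import rl_environment


class PolicyGradientPolicies(policy.Policy):
  """Joint policy wrapper that queries each agent in evaluation mode."""

  def __init__(self, env, nfsp_policies):
    game = env.game
    player_ids = list(range(game.num_players()))
    super(PolicyGradientPolicies, self).__init__(game, player_ids)
    self._policies = nfsp_policies
    self._obs = {
        "info_state": [None] * game.num_players(),
        "legal_actions": [None] * game.num_players(),
    }

  def action_probabilities(self, state, player_id=None):
    cur_player = state.current_player()
    legal_actions = state.legal_actions(cur_player)

    # Fake a TimeStep so the RL agent can be queried outside the env loop.
    self._obs["current_player"] = cur_player
    self._obs["info_state"][cur_player] = (
        state.information_state_tensor(cur_player))
    self._obs["legal_actions"][cur_player] = legal_actions
    info_state = rl_environment.TimeStep(
        observations=self._obs, rewards=None, discounts=None, step_type=None)

    p = self._policies[cur_player].step(info_state, is_evaluation=True).probs
    return {action: p[action] for action in legal_actions}
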
def test_run_hanabi(self):
  # Hanabi is an optional game, so check we have it before running the test.
  game = "hanabi"
  if game not in pyspiel.registered_names():
    return

  num_players = 3
  env_configs = {
      "players": num_players,
      "max_life_tokens": 1,
      "colors": 2,
      "ranks": 3,
      "hand_size": 2,
      "max_information_tokens": 3,
      "discount": 0.
  }
  env = rl_environment.Environment(game, **env_configs)
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  with self.session() as sess:
    agents = [
        policy_gradient.PolicyGradient(  # pylint: disable=g-complex-comprehension
            sess,
            player_id=player_id,
            info_state_size=info_state_size,
            num_actions=num_actions,
            hidden_layers_sizes=[8, 8],
            batch_size=16,
            entropy_cost=0.001,
            critic_learning_rate=0.01,
            pi_learning_rate=0.01,
            num_critic_before_pi=4) for player_id in range(num_players)
    ]
    sess.run(tf.global_variables_initializer())
    time_step = env.reset()
    while not time_step.last():
      current_player = time_step.observations["current_player"]
      agent_output = [agent.step(time_step) for agent in agents]
      time_step = env.step([agent_output[current_player].action])

    for agent in agents:
      agent.step(time_step)

def build_graph(self,
                scope_name,
                current_player,
                info_state_size,
                num_actions,
                hidden_layers_sizes=(64, 64),
                replay_buffer_capacity=int(1e5),
                batch_size=32,
                entropy_cost=0.001,
                critic_learning_rate=0.01,
                pi_learning_rate=0.01,
                num_critic_before_pi=32):
  with tf.variable_scope(scope_name):
    if self._oracle == "dqn":
      training_agent = dqn.DQN(
          session=self._session,
          player_id=current_player,
          state_representation_size=info_state_size,
          num_actions=num_actions,
          hidden_layers_sizes=hidden_layers_sizes,
          replay_buffer_capacity=replay_buffer_capacity,
          batch_size=batch_size)
    elif self._oracle in ["rpg", "qpg", "rm", "a2c"]:
      training_agent = policy_gradient.PolicyGradient(
          session=self._session,
          player_id=current_player,
          info_state_size=info_state_size,
          num_actions=num_actions,
          loss_str=self._oracle,
          hidden_layers_sizes=hidden_layers_sizes,
          batch_size=batch_size,
          entropy_cost=entropy_cost,
          critic_learning_rate=critic_learning_rate,
          pi_learning_rate=pi_learning_rate,
          num_critic_before_pi=num_critic_before_pi)
    else:
      raise ValueError("Oracle selected is not supported.")
  return training_agent

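A hypothetical call site for this oracle builder; the scope name and the enclosing-class attributes self._session and self._oracle are assumptions:

# Build a best-response agent for player 0 under its own variable scope.
best_response_agent = oracle.build_graph(
    scope_name="oracle_player_0",
    current_player=0,
    info_state_size=info_state_size,
    num_actions=num_actions,
    hidden_layers_sizes=(64, 64))
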
def main_loop(unused_arg):
  """Trains an agent (policy gradient, DQN, or EVA) in the catch environment."""
  env = catch.Environment()
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  train_episodes = FLAGS.num_episodes

  with tf.Session() as sess:
    if FLAGS.algorithm in {"rpg", "qpg", "rm", "a2c"}:
      agent = policy_gradient.PolicyGradient(
          sess,
          player_id=0,
          info_state_size=info_state_size,
          num_actions=num_actions,
          loss_str=FLAGS.algorithm,
          hidden_layers_sizes=[128, 128],
          batch_size=128,
          entropy_cost=0.01,
          critic_learning_rate=0.1,
          pi_learning_rate=0.1,
          num_critic_before_pi=3)
    elif FLAGS.algorithm == "dqn":
      agent = dqn.DQN(
          sess,
          player_id=0,
          state_representation_size=info_state_size,
          num_actions=num_actions,
          learning_rate=0.1,
          replay_buffer_capacity=10000,
          hidden_layers_sizes=[32, 32],
          epsilon_decay_duration=2000,  # 10% total data
          update_target_network_every=250)
    elif FLAGS.algorithm == "eva":
      agent = eva.EVAAgent(
          sess,
          env,
          player_id=0,
          state_size=info_state_size,
          num_actions=num_actions,
          learning_rate=1e-3,
          trajectory_len=2,
          num_neighbours=2,
          mixing_parameter=0.95,
          memory_capacity=10000,
          dqn_hidden_layers=[32, 32],
          epsilon_decay_duration=2000,  # 10% total data
          update_target_network_every=250)
    else:
      raise ValueError("Algorithm not implemented!")

    sess.run(tf.global_variables_initializer())

    # Train agent
    for ep in range(train_episodes):
      time_step = env.reset()
      while not time_step.last():
        agent_output = agent.step(time_step)
        action_list = [agent_output.action]
        time_step = env.step(action_list)
      # Episode is over, step agent with final info state.
      agent.step(time_step)

      if ep and ep % FLAGS.eval_every == 0:
        logging.info("-" * 80)
        logging.info("Episode %s", ep)
        logging.info("Loss: %s", agent.loss)
        avg_return = _eval_agent(env, agent, 100)
        logging.info("Avg return: %s", avg_return)

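_eval_agent is called above but not defined here. A minimal sketch, assuming it averages undiscounted returns over evaluation episodes with the agent acting greedily (is_evaluation=True):

def _eval_agent(env, agent, num_episodes):
  """Evaluates `agent` for `num_episodes` and returns the average return."""
  total_returns = 0.0
  for _ in range(num_episodes):
    time_step = env.reset()
    episode_return = 0.0
    while not time_step.last():
      # Evaluation mode: act without exploring or recording transitions.
      agent_output = agent.step(time_step, is_evaluation=True)
      time_step = env.step([agent_output.action])
      episode_return += time_step.rewards[0]
    total_returns += episode_return
  return total_returns / num_episodes
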
def main(_): game = "risk" num_players = 2 env_configs = env_configs = { "players": num_players, "map": 0, "rng_seed": -1, "max_turns": 90, "dep_abs": False, "atk_abs": True, "redist_abs": True, "fort_abs": True, "dep_q": 31, "atk_q": 2, "redist_q": 2, "fort_q": 2 } env = rl_environment.Environment(game, **env_configs) info_state_size = env.observation_spec()["info_state"][0] num_actions = env.action_spec()["num_actions"] with tf.Session() as sess: # pylint: disable=g-complex-comprehension agents = [ policy_gradient.PolicyGradient(sess, idx, info_state_size, num_actions, loss_str=FLAGS.loss_str, hidden_layers_sizes=( 174, 174, 174, )) for idx in range(num_players) ] expl_policies_avg = PolicyGradientPolicies(env, agents) sess.run(tf.global_variables_initializer()) if FLAGS.use_checkpoints: for agent in agents: if agent.has_checkpoint(FLAGS.checkpoint_dir): agent.restore(FLAGS.checkpoint_dir) for ep in range(FLAGS.num_episodes): if (ep + 1) % FLAGS.eval_every == 0: losses = [agent.loss for agent in agents] msg = "-" * 80 + "\n" msg += "{}: {}\n".format(ep + 1, losses) if FLAGS.use_checkpoints: for agent in agents: agent.save(FLAGS.checkpoint_dir) logging.info("%s", msg) time_step = env.reset() while not time_step.last(): player_id = time_step.observations["current_player"] agent_output = agents[player_id].step(time_step) action_list = [agent_output.action] time_step = env.step(action_list) # Episode is over, step all agents with final info state. for agent in agents: agent.step(time_step)
def PG_Solving(game, iterations, save_every=0, save_prefix="base"):

  class PolicyGradientPolicies(policy.Policy):
    """Joint policy to be evaluated."""

    def __init__(self, nfsp_policies):
      player_ids = [0, 1]
      super(PolicyGradientPolicies, self).__init__(game, player_ids)
      self._policies = nfsp_policies
      self._obs = {"info_state": [None, None], "legal_actions": [None, None]}

    def action_probabilities(self, state, player_id=None):
      cur_player = state.current_player()
      legal_actions = state.legal_actions(cur_player)

      self._obs["current_player"] = cur_player
      self._obs["info_state"][cur_player] = (
          state.information_state_tensor(cur_player))
      self._obs["legal_actions"][cur_player] = legal_actions

      info_state = rl_environment.TimeStep(
          observations=self._obs, rewards=None, discounts=None, step_type=None)

      p = self._policies[cur_player].step(info_state, is_evaluation=True).probs
      prob_dict = {action: p[action] for action in legal_actions}
      return prob_dict

  def save_pg():
    tabular_policy = policy.tabular_policy_from_callable(
        game, expl_policies_avg)
    policy_handler.save_tabular_policy(
        game, tabular_policy, "policies/PG/{}/{}".format(save_prefix, it))

  num_players = 2
  env = rl_environment.Environment(game, **{"players": num_players})
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  with tf.Session() as sess:
    # pylint: disable=g-complex-comprehension
    agents = [
        policy_gradient.PolicyGradient(
            sess,
            idx,
            info_state_size,
            num_actions,
            loss_str="rpg",  # ["rpg", "qpg", "rm"] = PG loss to use.
            hidden_layers_sizes=(128,)) for idx in range(num_players)
    ]
    expl_policies_avg = PolicyGradientPolicies(agents)

    sess.run(tf.global_variables_initializer())
    for it in range(iterations + 1):
      if save_every != 0 and it % save_every == 0:  # Order is important.
        save_pg()

      time_step = env.reset()
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = agents[player_id].step(time_step)
        action_list = [agent_output.action]
        time_step = env.step(action_list)

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)

    save_pg()
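
A hypothetical invocation: since game is forwarded to policy.Policy and policy.tabular_policy_from_callable, it must be a pyspiel game object rather than a game-name string:

import pyspiel

# Train on Kuhn poker for 10k episodes, saving the tabular policy every
# 1000 episodes under policies/PG/base/<iteration>.
PG_Solving(pyspiel.load_game("kuhn_poker"), iterations=10000, save_every=1000)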