def test_run_games(self, game):
  env = rl_environment.Environment(game)
  num_players = env.num_players
  eva_agents = []
  num_actions = env.action_spec()["num_actions"]
  state_size = env.observation_spec()["info_state"][0]
  with tf.Session() as sess:
    for player in range(num_players):
      eva_agents.append(
          eva.EVAAgent(
              sess,
              env,
              player,
              state_size,
              num_actions,
              embedding_network_layers=(64, 32),
              embedding_size=12,
              learning_rate=1e-4,
              mixing_parameter=0.5,
              memory_capacity=int(1e6),
              discount_factor=1.0,
              epsilon_start=1.0,
              epsilon_end=0.1,
              epsilon_decay_duration=int(1e6)))
    sess.run(tf.global_variables_initializer())
    time_step = env.reset()
    while not time_step.last():
      current_player = time_step.observations["current_player"]
      current_agent = eva_agents[current_player]
      # 1. Step the agent.
      # 2. Step the environment.
      agent_output = current_agent.step(time_step)
      time_step = env.step([agent_output.action])

    # Episode is over: step all agents with the final info state.
    for agent in eva_agents:
      agent.step(time_step)
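
# The `game` argument of `test_run_games` suggests the method lives inside a
# parameterized test case. Below is a minimal sketch of the surrounding test
# boilerplate, assuming absl's parameterized framework and the TF1-style API
# used above; the class name, import paths, and game list are illustrative
# assumptions, not taken from the original file.
from absl.testing import parameterized
import tensorflow.compat.v1 as tf

from open_spiel.python import rl_environment
from open_spiel.python.algorithms import eva


class EVATest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters("tic_tac_toe", "kuhn_poker")
  def test_run_games(self, game):
    ...  # Body as shown above.


if __name__ == "__main__":
  tf.test.main()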
def main(unused_argv):
  logging.info("Loading %s", FLAGS.game_name)
  env = rl_environment.Environment(FLAGS.game_name)
  num_players = env.num_players
  num_actions = env.action_spec()["num_actions"]
  state_size = env.observation_spec()["info_state"][0]

  eva_agents = []
  with tf.Session() as sess:
    for player in range(num_players):
      eva_agents.append(
          eva.EVAAgent(
              sess,
              env,
              player,
              state_size,
              num_actions,
              embedding_network_layers=(64, 32),
              embedding_size=12,
              learning_rate=1e-4,
              mixing_parameter=0.5,
              memory_capacity=int(1e6),  # Capacity must be an int, not a float.
              discount_factor=1.0,
              epsilon_start=1.0,
              epsilon_end=0.1,
              epsilon_decay_duration=int(1e6)))
    sess.run(tf.global_variables_initializer())

    for _ in range(FLAGS.num_episodes):
      # Reset at the start of every episode; otherwise only the first episode
      # would be played, since the previous time step is already terminal.
      time_step = env.reset()
      while not time_step.last():
        current_player = time_step.observations["current_player"]
        current_agent = eva_agents[current_player]
        step_out = current_agent.step(time_step)
        time_step = env.step([step_out.action])

      # Episode is over: step all agents with the final info state.
      for agent in eva_agents:
        agent.step(time_step)

    # Evaluate the learned joint policy via NashConv.
    game = pyspiel.load_game(FLAGS.game_name)
    joint_policy = JointPolicy(eva_agents)
    conv = exploitability.nash_conv(
        game,
        policy.PolicyFromCallable(game, joint_policy.action_probabilities))
    logging.info("EVA in '%s' - NashConv: %s", FLAGS.game_name, conv)
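
# `JointPolicy` is used above but not defined in this excerpt. Below is a
# minimal sketch of such a wrapper, assuming each EVA agent follows the usual
# OpenSpiel rl_agent interface, i.e. `step(time_step, is_evaluation=True)`
# returns an output with a `probs` array over all actions. The TimeStep
# construction and the use of `information_state_tensor` are assumptions; the
# real example may define this class differently.
class JointPolicy(object):
  """Maps a pyspiel state to the current player's action probabilities."""

  def __init__(self, agents):
    self._agents = agents

  def action_probabilities(self, state):
    cur_player = state.current_player()
    legal_actions = state.legal_actions(cur_player)
    info_state = state.information_state_tensor(cur_player)
    # Build a one-step observation in the format the agents expect. Only the
    # current player's slot is actually read by its agent.
    time_step = rl_environment.TimeStep(
        observations={
            "info_state": [info_state] * len(self._agents),
            "legal_actions": [legal_actions] * len(self._agents),
            "current_player": cur_player,
        },
        rewards=None,
        discounts=None,
        step_type=rl_environment.StepType.FIRST)
    agent_out = self._agents[cur_player].step(time_step, is_evaluation=True)
    return {action: agent_out.probs[action] for action in legal_actions}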
def main_loop(unused_arg):
  """Trains an agent in the catch environment."""
  env = catch.Environment()
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  train_episodes = FLAGS.num_episodes

  with tf.Session() as sess:
    if FLAGS.algorithm in {"rpg", "qpg", "rm", "a2c"}:
      agent = policy_gradient.PolicyGradient(
          sess,
          player_id=0,
          info_state_size=info_state_size,
          num_actions=num_actions,
          loss_str=FLAGS.algorithm,
          hidden_layers_sizes=[128, 128],
          batch_size=128,
          entropy_cost=0.01,
          critic_learning_rate=0.1,
          pi_learning_rate=0.1,
          num_critic_before_pi=3)
    elif FLAGS.algorithm == "dqn":
      agent = dqn.DQN(
          sess,
          player_id=0,
          state_representation_size=info_state_size,
          num_actions=num_actions,
          learning_rate=0.1,
          replay_buffer_capacity=10000,
          hidden_layers_sizes=[32, 32],
          epsilon_decay_duration=2000,  # 10% total data
          update_target_network_every=250)
    elif FLAGS.algorithm == "eva":
      agent = eva.EVAAgent(
          sess,
          env,
          player_id=0,
          state_size=info_state_size,
          num_actions=num_actions,
          learning_rate=1e-3,
          trajectory_len=2,
          num_neighbours=2,
          mixing_parameter=0.95,
          memory_capacity=10000,
          dqn_hidden_layers=[32, 32],
          epsilon_decay_duration=2000,  # 10% total data
          update_target_network_every=250)
    else:
      raise ValueError("Algorithm not implemented!")

    sess.run(tf.global_variables_initializer())

    # Train agent.
    for ep in range(train_episodes):
      time_step = env.reset()
      while not time_step.last():
        agent_output = agent.step(time_step)
        action_list = [agent_output.action]
        time_step = env.step(action_list)
      # Episode is over, step agent with final info state.
      agent.step(time_step)

      if ep and ep % FLAGS.eval_every == 0:
        logging.info("-" * 80)
        logging.info("Episode %s", ep)
        logging.info("Loss: %s", agent.loss)
        avg_return = _eval_agent(env, agent, 100)
        logging.info("Avg return: %s", avg_return)
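
# `_eval_agent` is called above but not included in this excerpt. Below is a
# minimal sketch of such a helper, assuming the agent's `step` accepts an
# `is_evaluation` flag (as OpenSpiel RL agents do) and that the episode return
# is the sum of player 0's rewards.
def _eval_agent(env, agent, num_episodes):
  """Evaluates `agent` for `num_episodes` and returns the average return."""
  total_returns = 0.0
  for _ in range(num_episodes):
    time_step = env.reset()
    episode_return = 0.0
    while not time_step.last():
      # Evaluation mode: act without exploration updates or learning steps.
      agent_output = agent.step(time_step, is_evaluation=True)
      time_step = env.step([agent_output.action])
      episode_return += time_step.rewards[0]
    total_returns += episode_return
  return total_returns / num_episodes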