Example #1
  def test_run_games(self, game):
    env = rl_environment.Environment(game)
    num_players = env.num_players
    eva_agents = []
    num_actions = env.action_spec()["num_actions"]
    state_size = env.observation_spec()["info_state"][0]
    with tf.Session() as sess:
      for player in range(num_players):
        eva_agents.append(
            eva.EVAAgent(
                sess,
                env,
                player,
                state_size,
                num_actions,
                embedding_network_layers=(64, 32),
                embedding_size=12,
                learning_rate=1e-4,
                mixing_parameter=0.5,
                memory_capacity=int(1e6),
                discount_factor=1.0,
                epsilon_start=1.0,
                epsilon_end=0.1,
                epsilon_decay_duration=int(1e6)))
      sess.run(tf.global_variables_initializer())
      time_step = env.reset()
      while not time_step.last():
        current_player = time_step.observations["current_player"]
        current_agent = eva_agents[current_player]
        # 1. Step the agent whose turn it is.
        # 2. Step the environment with the chosen action.
        agent_output = current_agent.step(time_step)
        time_step = env.step([agent_output.action])
      # Episode is over; step all agents with the final time step so they
      # observe the terminal rewards.
      for agent in eva_agents:
        agent.step(time_step)
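These snippets are excerpts and omit their imports. A plausible preamble, assuming a recent OpenSpiel source layout and the TF1-style session API used above (via tf.compat.v1 when TensorFlow 2 is installed), would be:

# Hypothetical import preamble for the excerpts in this section; exact module
# paths may differ across OpenSpiel versions.
from absl import logging

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()  # Needed for tf.Session / tf.global_variables_initializer.

import pyspiel
from open_spiel.python import policy
from open_spiel.python import rl_environment
from open_spiel.python.algorithms import dqn
from open_spiel.python.algorithms import eva
from open_spiel.python.algorithms import exploitability
from open_spiel.python.algorithms import policy_gradient
from open_spiel.python.environments import catch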
Example #2
def main(unused_argv):
    logging.info("Loading %s", FLAGS.game_name)
    env = rl_environment.Environment(FLAGS.game_name)
    num_players = env.num_players
    num_actions = env.action_spec()["num_actions"]
    state_size = env.observation_spec()["info_state"][0]
    eva_agents = []
    with tf.Session() as sess:
        for player in range(num_players):
            eva_agents.append(
                eva.EVAAgent(sess,
                             env,
                             player,
                             state_size,
                             num_actions,
                             embedding_network_layers=(64, 32),
                             embedding_size=12,
                             learning_rate=1e-4,
                             mixing_parameter=0.5,
                             memory_capacity=int(1e6),
                             discount_factor=1.0,
                             epsilon_start=1.0,
                             epsilon_end=0.1,
                             epsilon_decay_duration=int(1e6)))
        sess.run(tf.global_variables_initializer())
        for _ in range(FLAGS.num_episodes):
            time_step = env.reset()
            while not time_step.last():
                current_player = time_step.observations["current_player"]
                current_agent = eva_agents[current_player]
                step_out = current_agent.step(time_step)
                time_step = env.step([step_out.action])

            # Episode is over; step all agents with the final time step.
            for agent in eva_agents:
                agent.step(time_step)

        game = pyspiel.load_game(FLAGS.game_name)
        joint_policy = JointPolicy(eva_agents)
        conv = exploitability.nash_conv(
            game,
            policy.PolicyFromCallable(game, joint_policy.action_probabilities))
        logging.info("EVA in '%s' - NashConv: %s", FLAGS.game_name, conv)
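Example #2 also relies on a JointPolicy helper that is not shown. A minimal sketch, assuming each EVAAgent exposes an action_probabilities(state) method returning an {action: probability} dict for the player to act, could be:

# Sketch of the JointPolicy helper referenced in Example #2 (assumption: the
# per-player agents provide an action_probabilities(state) method).
class JointPolicy(object):
    """Combines per-player agents into a single callable policy."""

    def __init__(self, agents):
        self._agents = agents

    def action_probabilities(self, state, player_id=None):
        del player_id  # The acting player is read from the state itself.
        return self._agents[state.current_player()].action_probabilities(state)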
Example #3
def main_loop(unused_arg):
  """Trains a DQN agent in the catch environment."""
  env = catch.Environment()
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  train_episodes = FLAGS.num_episodes

  with tf.Session() as sess:
    if FLAGS.algorithm in {"rpg", "qpg", "rm", "a2c"}:
      agent = policy_gradient.PolicyGradient(
          sess,
          player_id=0,
          info_state_size=info_state_size,
          num_actions=num_actions,
          loss_str=FLAGS.algorithm,
          hidden_layers_sizes=[128, 128],
          batch_size=128,
          entropy_cost=0.01,
          critic_learning_rate=0.1,
          pi_learning_rate=0.1,
          num_critic_before_pi=3)
    elif FLAGS.algorithm == "dqn":
      agent = dqn.DQN(
          sess,
          player_id=0,
          state_representation_size=info_state_size,
          num_actions=num_actions,
          learning_rate=0.1,
          replay_buffer_capacity=10000,
          hidden_layers_sizes=[32, 32],
          epsilon_decay_duration=2000,  # 10% total data
          update_target_network_every=250)
    elif FLAGS.algorithm == "eva":
      agent = eva.EVAAgent(
          sess,
          env,
          player_id=0,
          state_size=info_state_size,
          num_actions=num_actions,
          learning_rate=1e-3,
          trajectory_len=2,
          num_neighbours=2,
          mixing_parameter=0.95,
          memory_capacity=10000,
          dqn_hidden_layers=[32, 32],
          epsilon_decay_duration=2000,  # 10% total data
          update_target_network_every=250)
    else:
      raise ValueError("Algorithm not implemented!")

    sess.run(tf.global_variables_initializer())

    # Train agent
    for ep in range(train_episodes):
      time_step = env.reset()
      while not time_step.last():
        agent_output = agent.step(time_step)
        action_list = [agent_output.action]
        time_step = env.step(action_list)
      # Episode is over, step agent with final info state.
      agent.step(time_step)

      if ep and ep % FLAGS.eval_every == 0:
        logging.info("-" * 80)
        logging.info("Episode %s", ep)
        logging.info("Loss: %s", agent.loss)
        avg_return = _eval_agent(env, agent, 100)
        logging.info("Avg return: %s", avg_return)