Example #1
from absl import app
from absl import flags
from absl import logging

from open_spiel.python.algorithms import tabular_qlearner
from open_spiel.python.environments import cliff_walking

FLAGS = flags.FLAGS

# Assumed flag definition; the original snippet does not show it.
flags.DEFINE_integer("num_episodes", 5000, "Number of train episodes.")


def main_loop(unused_arg):
  """Trains a tabular Q-learner agent in the cliff walking environment."""
  env = cliff_walking.Environment(width=5, height=3)
  num_actions = env.action_spec()["num_actions"]

  train_episodes = FLAGS.num_episodes
  eval_interval = 50

  agent = tabular_qlearner.QLearner(
      player_id=0, step_size=0.05, num_actions=num_actions)

  # Train the agent
  for ep in range(train_episodes):
    time_step = env.reset()
    while not time_step.last():
      agent_output = agent.step(time_step)
      action_list = [agent_output.action]
      time_step = env.step(action_list)
    # Episode is over, step agent with final info state.
    agent.step(time_step)

    if ep and ep % eval_interval == 0:
      logging.info("-" * 80)
      logging.info("Episode %s", ep)
      logging.info("Last loss: %s", agent.loss)
      avg_return = eval_agent(env, agent, 100)
      logging.info("Avg return: %s", avg_return)
Example #2
    # Assumes: import random, plus
    # from open_spiel.python import rl_environment
    # from open_spiel.python.environments import cliff_walking
    def test_many_runs(self):
        random.seed(1234)
        for _ in range(30):
            height = random.randint(3, 10)
            width = random.randint(3, 10)
            env = cliff_walking.Environment(height=height, width=width)

            # The first time step carries no rewards yet.
            time_step = env.reset()
            self.assertEqual(time_step.step_type,
                             rl_environment.StepType.FIRST)
            self.assertIsNone(time_step.rewards)

            # Step up off the start square; every move costs -1.
            action_int = cliff_walking.UP
            time_step = env.step(action_int)
            self.assertEqual(time_step.step_type, rl_environment.StepType.MID)
            self.assertEqual(time_step.rewards, [-1.0])

            # Walk right along the row above the cliff to the goal column.
            action_int = cliff_walking.RIGHT
            for _ in range(1, width):
                time_step = env.step(action_int)
                self.assertEqual(time_step.step_type,
                                 rl_environment.StepType.MID)
                self.assertEqual(time_step.rewards, [-1.0])

            # Step back down onto the goal square, ending the episode.
            action_int = cliff_walking.DOWN
            time_step = env.step(action_int)
            self.assertEqual(time_step.step_type, rl_environment.StepType.LAST)
            self.assertEqual(time_step.rewards, [-1.0])
Example #3
    def test_obs_spec(self):
        env = cliff_walking.Environment()
        obs_specs = env.observation_spec()
        self.assertEqual(len(obs_specs), 3)
        self.assertEqual(sorted(obs_specs.keys()),
                         ["current_player", "info_state", "legal_actions"])
        self.assertEqual(obs_specs["info_state"], (2,))
Example #4
    def test_action_spec(self):
        env = cliff_walking.Environment()
        action_spec = env.action_spec()
        self.assertEqual(len(action_spec), 4)
        self.assertEqual(sorted(action_spec.keys()),
                         ["dtype", "max", "min", "num_actions"])
        self.assertEqual(action_spec["num_actions"], 4)
        self.assertEqual(action_spec["dtype"], int)
Example #5
    def test_action_interfaces(self):
        env = cliff_walking.Environment()
        time_step = env.reset()

        # Singleton list works
        action_list = [cliff_walking.UP]
        time_step = env.step(action_list)
        self.assertEqual(time_step.step_type, rl_environment.StepType.MID)

        # Integer works
        action_int = cliff_walking.UP
        time_step = env.step(action_int)
        self.assertEqual(time_step.step_type, rl_environment.StepType.MID)
Example #6
from open_spiel.python.algorithms import tabular_qlearner
from open_spiel.python.environments import cliff_walking


def main():
    # The classic 12x4 cliff walking grid.
    env = cliff_walking.Environment(width=12, height=4)
    num_actions = env.action_spec()["num_actions"]

    # Sweep the Q-learning step size over a grid of learning rates.
    learning_rates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

    for learning_rate in learning_rates:
        agent = tabular_qlearner.QLearner(player_id=0,
                                          step_size=learning_rate,
                                          num_actions=num_actions)

        train(env, agent, 100)
        avg_reward = evaluate(env, agent, 50)

        print(f"step_size={learning_rate}: average reward {avg_reward}")
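The train and evaluate helpers called above are not shown in the snippet, and the names are the snippet's own rather than OpenSpiel functions. A minimal sketch of what they might look like, assuming the same agent API as in Example #1:

def train(env, agent, num_episodes):
    """Runs num_episodes of Q-learning, updating the agent at every step."""
    for _ in range(num_episodes):
        time_step = env.reset()
        while not time_step.last():
            agent_output = agent.step(time_step)
            time_step = env.step([agent_output.action])
        agent.step(time_step)  # Final learning step on the terminal state.


def evaluate(env, agent, num_episodes):
    """Returns the average episode reward over num_episodes greedy episodes."""
    total_reward = 0.0
    for _ in range(num_episodes):
        time_step = env.reset()
        while not time_step.last():
            agent_output = agent.step(time_step, is_evaluation=True)
            time_step = env.step([agent_output.action])
            total_reward += time_step.rewards[0]
    return total_reward / num_episodes


if __name__ == "__main__":
    main()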