def main_loop(unused_arg):
  """Trains a tabular qlearner agent in the cliff walking environment."""
  env = cliff_walking.Environment(width=5, height=3)
  num_actions = env.action_spec()["num_actions"]
  train_episodes = FLAGS.num_episodes
  eval_interval = 50

  agent = tabular_qlearner.QLearner(
      player_id=0, step_size=0.05, num_actions=num_actions)

  # Train the agent
  for ep in range(train_episodes):
    time_step = env.reset()
    while not time_step.last():
      agent_output = agent.step(time_step)
      action_list = [agent_output.action]
      time_step = env.step(action_list)
    # Episode is over, step agent with final info state.
    agent.step(time_step)

    if ep and ep % eval_interval == 0:
      logging.info("-" * 80)
      logging.info("Episode %s", ep)
      logging.info("Last loss: %s", agent.loss)
      avg_return = eval_agent(env, agent, 100)
      logging.info("Avg return: %s", avg_return)
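# main_loop calls an eval_agent helper that is not shown in this section. A
# minimal sketch is given below, assuming the signature
# eval_agent(env, agent, num_episodes) and that passing is_evaluation=True to
# the QLearner makes it act greedily without updating its Q-values; it returns
# the average undiscounted episode return.
def eval_agent(env, agent, num_episodes):
  """Returns the average episode return over `num_episodes` greedy rollouts."""
  total_return = 0.0
  for _ in range(num_episodes):
    time_step = env.reset()
    episode_return = 0.0
    while not time_step.last():
      # Greedy, non-learning step.
      agent_output = agent.step(time_step, is_evaluation=True)
      time_step = env.step([agent_output.action])
      episode_return += time_step.rewards[0]
    total_return += episode_return
  return total_return / num_episodes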
def test_many_runs(self):
  random.seed(1234)
  for _ in range(30):
    height = random.randint(3, 10)
    width = random.randint(3, 10)
    env = cliff_walking.Environment(height=height, width=width)

    time_step = env.reset()
    self.assertEqual(time_step.step_type, rl_environment.StepType.FIRST)
    self.assertEqual(time_step.rewards, None)

    # Walk the safe path: up off the cliff row, right along the top of the
    # cliff, then down into the goal cell. Every step costs -1.
    action_int = cliff_walking.UP
    time_step = env.step(action_int)
    self.assertEqual(time_step.step_type, rl_environment.StepType.MID)
    self.assertEqual(time_step.rewards, [-1.0])

    action_int = cliff_walking.RIGHT
    for _ in range(1, width):
      time_step = env.step(action_int)
      self.assertEqual(time_step.step_type, rl_environment.StepType.MID)
      self.assertEqual(time_step.rewards, [-1.0])

    action_int = cliff_walking.DOWN
    time_step = env.step(action_int)
    self.assertEqual(time_step.step_type, rl_environment.StepType.LAST)
    self.assertEqual(time_step.rewards, [-1.0])
def test_obs_spec(self):
  env = cliff_walking.Environment()
  obs_specs = env.observation_spec()
  self.assertEqual(len(obs_specs), 3)
  self.assertEqual(
      sorted(obs_specs.keys()),
      ["current_player", "info_state", "legal_actions"])
  self.assertEqual(obs_specs["info_state"], (2,))
def test_action_spec(self):
  env = cliff_walking.Environment()
  action_spec = env.action_spec()
  self.assertEqual(len(action_spec), 4)
  self.assertEqual(
      sorted(action_spec.keys()), ["dtype", "max", "min", "num_actions"])
  self.assertEqual(action_spec["num_actions"], 4)
  self.assertEqual(action_spec["dtype"], int)
def test_action_interfaces(self):
  env = cliff_walking.Environment()
  time_step = env.reset()

  # Singleton list works
  action_list = [cliff_walking.UP]
  time_step = env.step(action_list)
  self.assertEqual(time_step.step_type, rl_environment.StepType.MID)

  # Integer works
  action_int = cliff_walking.UP
  time_step = env.step(action_int)
  self.assertEqual(time_step.step_type, rl_environment.StepType.MID)
def main():
  """Sweeps learning rates for a tabular Q-learner on cliff walking."""
  env = cliff_walking.Environment(width=12, height=4)
  num_actions = env.action_spec()["num_actions"]

  learning_rates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
  for learning_rate in learning_rates:
    agent = tabular_qlearner.QLearner(
        player_id=0, step_size=learning_rate, num_actions=num_actions)
    train(env, agent, 100)
    avg_reward = evaluate(env, agent, 50)
    # Report the average evaluation reward for this learning rate.
    print(avg_reward)
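# The train and evaluate helpers used by main are not defined in this section.
# Minimal sketches are given below, assuming train(env, agent, num_episodes)
# runs standard learning episodes (mirroring the loop in main_loop above) and
# evaluate(env, agent, num_episodes) mirrors the eval_agent sketch: greedy
# rollouts whose returns are averaged.
def train(env, agent, num_episodes):
  """Runs `num_episodes` learning episodes in `env` with `agent`."""
  for _ in range(num_episodes):
    time_step = env.reset()
    while not time_step.last():
      agent_output = agent.step(time_step)
      time_step = env.step([agent_output.action])
    # Final step lets the agent learn from the terminal reward.
    agent.step(time_step)


def evaluate(env, agent, num_episodes):
  """Returns the average episode return over `num_episodes` greedy rollouts."""
  total_return = 0.0
  for _ in range(num_episodes):
    time_step = env.reset()
    while not time_step.last():
      agent_output = agent.step(time_step, is_evaluation=True)
      time_step = env.step([agent_output.action])
      total_return += time_step.rewards[0]
  return total_return / num_episodes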