def test_sac_2x2_grid_world_with_container_actions(self):
    """
    Creates a SAC agent and runs it via a Runner on a simple 2x2 GridWorld
    using container actions.
    """
    # ftj = forward + turn + jump
    env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/sac_agent_for_2x2_gridworld_with_container_actions.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")

    agent = SACAgent.from_spec(
        agent_config,
        state_space=FloatBox(shape=(4,)),
        action_space=dummy_env.action_space,
    )

    time_steps = 10000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=False,
        render=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)
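# A minimal sketch (not part of the test suite) of what the flat
# "xy+orientation" state above is assumed to look like: the FloatBox(shape=(4,))
# holds the agent's (x, y) position plus its orientation encoded as a 2D unit
# vector, matching the states asserted in the container-action q-table test
# further below. The exact component order inside GridWorld is an assumption.
def xy_orientation_state_sketch(x, y, orientation_degrees):
    """Returns a hypothetical 4D state vector: (x, y, orientation-x, orientation-y)."""
    # 0=up, 90=right, 180=down, 270=left -> unit vectors (0, 1), (1, 0), (0, -1), (-1, 0).
    unit_vectors = {0: (0, 1), 90: (1, 0), 180: (0, -1), 270: (-1, 0)}
    ox, oy = unit_vectors[orientation_degrees]
    return [float(x), float(y), float(ox), float(oy)]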
def test_double_dqn_on_2x2_grid_world(self):
    """
    Creates a double DQNAgent and runs it via a Runner on a simple 2x2
    GridWorld.
    """
    env_spec = dict(world="2x2")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/dqn_agent_for_2x2_gridworld.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = DQNAgent.from_spec(
        agent_config,
        dueling_q=False,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=dummy_env.action_space,
        execution_spec=dict(seed=10),
        update_spec=dict(update_interval=4, batch_size=24, sync_interval=32),
        optimizer_spec=dict(type="adam", learning_rate=0.05),
        store_last_q_table=True
    )

    time_steps = 1000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    print("STATES:\n{}".format(agent.last_q_table["states"]))
    print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
    self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    self.assertLessEqual(results["episodes_executed"], 350)

    # Check q-table for correct values.
    expected_q_values_per_state = {
        (1.0, 0, 0, 0): (-1, -5, 0, -1),
        (0, 1.0, 0, 0): (-1, 1, 0, 0)
    }
    for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
        state, q_values = tuple(state), tuple(q_values)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
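# Hypothetical sketch (not used by the tests): the one-hot encoding assumed
# for the two non-terminal cells of the 2x2 world, matching the keys of
# expected_q_values_per_state above. The cell indexing is inferred from the
# asserted states, not taken from GridWorld itself.
GRID_2X2_ONE_HOT_SKETCH = {
    "start (top-left)": (1.0, 0.0, 0.0, 0.0),
    "bottom-left": (0.0, 1.0, 0.0, 0.0),
    # The hole (top-right) and goal (bottom-right) cells are terminal and
    # therefore never appear as source states in the q-table.
}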
def test_multi_gpu_ppo_agent_learning_test_gridworld_2x2(self):
    """
    Tests whether the multi-GPU strategy can learn successfully on a
    multi-GPU system. Also runs on a CPU-only system using fake-GPU logic
    for testing purposes.
    """
    env_spec = dict(type="grid-world", world="2x2")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/multi_gpu_ppo_for_2x2_gridworld.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = PPOAgent.from_spec(
        agent_config,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=dummy_env.action_space,
    )

    time_steps = 10000
    worker = SingleThreadedWorker(
        env_spec=env_spec,
        agent=agent,
        worker_executes_preprocessing=True,
        preprocessing_spec=preprocessing_spec
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    # Assume we have learned something.
    # TODO: This test needs more tuning. -1.0 is not a great mean reward for the 2x2 grid world.
    self.assertGreater(results["mean_episode_reward"], -1.0)
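# Conceptual numpy sketch of the synchronous multi-GPU strategy exercised by
# the two multi-GPU tests here (an illustration only, not RLgraph's actual
# implementation): each (possibly fake) GPU tower computes gradients on its
# own shard of the batch, and the tower gradients are averaged before being
# applied once to the shared weights.
import numpy as np

def averaged_multi_tower_gradients(per_gpu_gradients):
    """Averages a list of per-tower gradient arrays into one shared update."""
    return np.mean(np.stack(per_gpu_gradients, axis=0), axis=0)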
def test_double_dqn_on_2x2_grid_world_single_action_to_container(self):
    """
    Tests how DQN solves a mapping of a single integer to multiple actions
    (as opposed to using container actions).
    """
    # ftj = forward + turn + jump
    env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
    agent_config = config_from_path("configs/dqn_agent_for_2x2_gridworld_single_to_container.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    action_space = IntBox(0, 18)
    agent = DQNAgent.from_spec(
        agent_config,
        huber_loss=True,
        double_q=True,
        dueling_q=True,
        state_space=FloatBox(shape=(4,)),
        action_space=action_space,
        store_last_q_table=True
    )

    time_steps = 10000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True,
        render=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)
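# Hypothetical sketch of the single-int-to-container mapping tested above:
# the flat IntBox(0, 18) can enumerate every combination of the three
# sub-actions (forward: 3 options, turn: 3, jump: 2 -> 3 * 3 * 2 = 18).
# The decoding order actually used by GridWorld/DQNAgent is an assumption.
def flat_to_ftj_action_sketch(flat_action):
    """Decodes a flat int in [0, 18) into a (forward, turn, jump) dict."""
    assert 0 <= flat_action < 18
    jump = flat_action % 2
    turn = (flat_action // 2) % 3
    forward = flat_action // 6
    return dict(forward=forward, turn=turn, jump=jump)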
def test_ppo_on_2x2_grid_world_with_container_actions(self):
    """
    Creates a PPO agent and runs it via a Runner on a simple 2x2 GridWorld
    using container actions.
    """
    # -----
    # |^|H|
    # -----
    # | |G|   ^=start (looking up), H=hole, G=goal
    # -----
    # ftj = forward + turn + jump
    env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/ppo_agent_for_2x2_gridworld_with_container_actions.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = PPOAgent.from_spec(
        agent_config,
        state_space=FloatBox(shape=(4,)),
        action_space=dummy_env.action_space
    )

    time_steps = 5000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True,
        render=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertLessEqual(results["episodes_executed"], time_steps)
    # Assume we have learned something.
    self.assertGreaterEqual(results["mean_episode_reward"], -2.0)
def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
    """
    Tests whether the multi-GPU strategy can learn successfully on a
    multi-GPU system. Also runs on a CPU-only system using fake-GPU logic
    for testing purposes.
    """
    env_spec = dict(type="grid-world", world="2x2")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/multi_gpu_dqn_for_2x2_gridworld.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = DQNAgent.from_spec(
        agent_config,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=dummy_env.action_space,
    )

    time_steps = 1000
    worker = SingleThreadedWorker(
        env_spec=env_spec,
        agent=agent,
        worker_executes_preprocessing=True,
        preprocessing_spec=preprocessing_spec
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    # Merge the q-tables of all four (fake) GPUs into one (48, 4) table.
    agent.last_q_table["q_values"] = agent.last_q_table["q_values"].reshape((48, 4))

    print("STATES:\n{}".format(agent.last_q_table["states"]))
    print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
    self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    self.assertLessEqual(results["episodes_executed"], time_steps / 2)

    # Check q-table for correct values.
    expected_q_values_per_state = {
        (1.0, 0, 0, 0): (-1, -5, 0, -1),
        (0, 1.0, 0, 0): (-1, 1, 0, 0)
    }
    for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
        state, q_values = tuple(state), tuple(q_values)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
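# Sketch of the reshape used above to merge the per-GPU q-value shards.
# This is an illustration under the assumption that the shards are recorded
# as a stacked (num_gpus, per_gpu_rows, num_actions) block; only the
# resulting (48, 4) shape is taken from the test itself.
import numpy as np

def merge_q_value_shards_sketch(q_value_shards, num_actions=4):
    """Flattens stacked per-GPU q-value shards into one (rows, actions) table."""
    return np.asarray(q_value_shards).reshape((-1, num_actions))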
def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2_q_values(self):
    """
    Tests whether the multi-GPU strategy can learn successfully on a
    multi-GPU system and checks the learnt Q-values directly via the graph
    executor. Also runs on a CPU-only system using fake-GPU logic for
    testing purposes.
    """
    env_spec = dict(type="grid-world", world="2x2")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/multi_gpu_dqn_for_2x2_gridworld.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = DQNAgent.from_spec(
        agent_config,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=dummy_env.action_space,
    )

    time_steps = 2000
    worker = SingleThreadedWorker(
        env_spec=env_spec,
        agent=agent,
        worker_executes_preprocessing=True,
        preprocessing_spec=preprocessing_spec
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
    self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    self.assertLessEqual(results["episodes_executed"], time_steps / 2)

    # Check all learnt Q-values for the two non-terminal (one-hot encoded) states.
    q_values = agent.graph_executor.execute(("get_q_values", one_hot(np.array([0, 1]), depth=4)))[:]
    recursive_assert_almost_equal(q_values[0], (0.8, -5, 0.9, 0.8), decimals=1)
    recursive_assert_almost_equal(q_values[1], (0.8, 1.0, 0.9, 0.9), decimals=1)
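# For reference, the batch fed into "get_q_values" above one-hot encodes the
# two non-terminal grid cells (depth=4 -> one slot per cell):
#
#   one_hot(np.array([0, 1]), depth=4)
#   -> [[1., 0., 0., 0.],
#       [0., 1., 0., 0.]]
#
# A minimal numpy equivalent of such a one-hot helper (an assumption about
# the helper's behavior, shown only for clarity):
import numpy as np

def one_hot_sketch(indices, depth):
    """Returns one-hot rows (float32) for each index in `indices`."""
    return np.eye(depth, dtype=np.float32)[indices]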
def test_double_dqn_on_2x2_grid_world_with_container_actions(self):
    """
    Creates a double DQNAgent and runs it via a Runner on a simple 2x2
    GridWorld using container actions.
    """
    # ftj = forward + turn + jump
    env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/dqn_agent_for_2x2_gridworld_with_container_actions.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = DQNAgent.from_spec(
        agent_config,
        double_q=True,
        dueling_q=False,
        state_space=FloatBox(shape=(4,)),
        action_space=dummy_env.action_space,
        execution_spec=dict(seed=15),
        store_last_q_table=True
    )

    time_steps = 10000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True,
        render=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    print("LAST q-table:\n{}".format(agent.last_q_table))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -7)
    self.assertGreaterEqual(results["max_episode_reward"], -1.0)
    self.assertLessEqual(results["episodes_executed"], time_steps / 3)

    # Check q-table for correct values.
    expected_q_values_per_state = {
        (0., 0., -1., 0.): {"forward": (-5.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 0., 1., 0.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 0., 0., -1.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 0., 0., 1.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 1., -1., 0.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 1., 1., 0.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 1., 0., -1.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 1., 0., 1.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
    }
    for state, q_values_forward, q_values_jump in zip(
            agent.last_q_table["states"],
            agent.last_q_table["q_values"]["forward"],
            agent.last_q_table["q_values"]["jump"]):
        state = tuple(state)
        q_values_forward = tuple(q_values_forward)
        q_values_jump = tuple(q_values_jump)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(
            q_values_forward, expected_q_values_per_state[state]["forward"], decimals=0)
        recursive_assert_almost_equal(
            q_values_jump, expected_q_values_per_state[state]["jump"], decimals=0)