def test_sac_2x2_grid_world_with_container_actions(self):
    """
    Creates a SAC agent and runs it via a Runner on a simple 2x2 GridWorld
    using container actions.
    """
    # ftj = forward + turn + jump
    env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/sac_agent_for_2x2_gridworld_with_container_actions.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")

    agent = SACAgent.from_spec(
        agent_config,
        state_space=FloatBox(shape=(4,)),
        action_space=dummy_env.action_space,
    )

    time_steps = 10000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=False,
        render=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)
def test_double_dqn_on_2x2_grid_world(self):
    """
    Creates a double DQNAgent and runs it via a Runner on a simple 2x2 GridWorld.
    """
    env_spec = dict(world="2x2")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/dqn_agent_for_2x2_gridworld.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = DQNAgent.from_spec(
        agent_config,
        dueling_q=False,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=dummy_env.action_space,
        execution_spec=dict(seed=10),
        update_spec=dict(update_interval=4, batch_size=24, sync_interval=32),
        optimizer_spec=dict(type="adam", learning_rate=0.05),
        store_last_q_table=True
    )

    time_steps = 1000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    print("STATES:\n{}".format(agent.last_q_table["states"]))
    print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
    self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    self.assertLessEqual(results["episodes_executed"], 350)

    # Check q-table for correct values (see the arithmetic note below this test).
    expected_q_values_per_state = {
        (1.0, 0, 0, 0): (-1, -5, 0, -1),
        (0, 1.0, 0, 0): (-1, 1, 0, 0)
    }
    for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
        state, q_values = tuple(state), tuple(q_values)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
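# Sanity check on `expected_q_values_per_state` above, assuming a -1 step/bump
# penalty (as in the plain 2x2 env test in this file), hole=-5, goal=+1, action
# order 0=up/1=right/2=down/3=left, and a discount near 1:
#   start, one-hot (1,0,0,0):       up=-1 (bump, then optimal 0), right=-5 (hole),
#                                   down=0 (-1 step, then +1 goal), left=-1 (bump)
#   below start, one-hot (0,1,0,0): up=-1 (back to start, then 0), right=+1 (goal),
#                                   down=0 (bump, then +1), left=0 (bump, then +1)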
def test_2x2_grid_world_using_flow_methods(self):
    """
    Tests a minimalistic 2x2 GridWorld.
    """
    env = GridWorld(world="2x2")

    # Simple test runs with fixed actions.
    # X=player's position
    s, r, t = env.step_flow(2)  # down: [" H", "XG"]
    self.assertTrue(s == 1)
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t = env.step_flow(1)  # right: [" H", " X"]
    self.assertTrue(s == 0)
    self.assertTrue(r == 1.0)
    self.assertTrue(t)
    s, r, t = env.step_flow(1)  # right: [" X", " G"] -> in the hole
    self.assertTrue(s == 0)
    self.assertTrue(r == -5.0)
    self.assertTrue(t)

    # Run against a wall.
    s, r, t = env.step_flow(3)  # left: ["XH", " G"]
    self.assertTrue(s == 0)
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t = env.step_flow(2)  # down: [" H", "XG"]
    self.assertTrue(s == 1)
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t = env.step_flow(0)  # up: ["XH", " G"]
    self.assertTrue(s == 0)
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
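# Note on the flow API exercised above: step_flow() returns only
# (state, reward, terminal) and, when the episode ends, the returned state is
# already the auto-reset start state (hence s == 0 right after stepping into
# the goal or the hole, and the subsequent step proceeds from the start).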
def test_ppo_agent_faulty_op_visualization(self):
    """
    Creates a PPOAgent with a badly connected network and visualizes the root component.
    """
    agent_config = config_from_path("configs/ppo_agent_for_2x2_gridworld.json")
    # Sabotage the NN: the dense layer outputs a float space, which is an
    # invalid input space for the subsequent embedding (lookup) layer.
    agent_config["network_spec"] = [
        {"type": "dense", "units": 10},
        {"type": "embedding", "embed_dim": 3, "vocab_size": 4}
    ]
    env = GridWorld(world="2x2")
    # Build the Agent and hence trigger the Space error.
    try:
        ppo_agent = PPOAgent.from_spec(
            agent_config,
            state_space=GridWorld.grid_world_2x2_flattened_state_space,
            action_space=env.action_space
        )
    except RLGraphSpaceError as e:
        print("Seeing expected RLGraphSpaceError ({}). Test ok.".format(e))
    else:
        raise RLGraphError(
            "Not seeing expected RLGraphSpaceError with faulty input Space to embed layer of PPO!"
        )
def test_double_dqn_on_2x2_grid_world_single_action_to_container(self):
    """
    Tests how dqn solves a mapping of a single integer to multiple actions
    (as opposed to using container actions).
    """
    # ftj = forward + turn + jump
    env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
    agent_config = config_from_path("configs/dqn_agent_for_2x2_gridworld_single_to_container.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    action_space = IntBox(0, 18)
    agent = DQNAgent.from_spec(
        agent_config,
        huber_loss=True,
        double_q=True,
        dueling_q=True,
        state_space=FloatBox(shape=(4,)),
        action_space=action_space,
        store_last_q_table=True
    )

    time_steps = 10000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True,
        render=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)
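# Note: the flat IntBox action space above presumably enumerates the
# 3 (turn) x 3 (forward) x 2 (jump) = 18 joint actions of the ftj container
# space (see the container-action encoding note after the 4x4 test below),
# so the agent learns over a single flattened action set instead of the container.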
def test_ppo_on_2x2_grid_world(self):
    """
    Creates a PPO Agent and runs it via a Runner on the 2x2 Grid World Env.
    """
    env = GridWorld(world="2x2")
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_2x2_gridworld.json"),
        state_space=GridWorld.grid_world_2x2_flattened_state_space,
        action_space=env.action_space,
        execution_spec=dict(seed=15),
    )

    time_steps = 3000
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=True,
        preprocessing_spec=GridWorld.grid_world_2x2_preprocessing_spec
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)

    # Assume we have learned something.
    self.assertGreater(results["mean_episode_reward"], -0.2)
def test_multi_gpu_ppo_agent_learning_test_gridworld_2x2(self):
    """
    Tests if the multi gpu strategy can learn successfully on a multi gpu system, but
    also runs on a CPU-only system using fake-GPU logic for testing purposes.
    """
    env_spec = dict(type="grid-world", world="2x2")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/multi_gpu_ppo_for_2x2_gridworld.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = PPOAgent.from_spec(
        agent_config,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=dummy_env.action_space,
    )

    time_steps = 10000
    worker = SingleThreadedWorker(
        env_spec=env_spec,
        agent=agent,
        worker_executes_preprocessing=True,
        preprocessing_spec=preprocessing_spec
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    # Assume we have learned something.
    # TODO: This test needs more tuning. -1.0 is not great for the 2x2 grid world.
    self.assertGreater(results["mean_episode_reward"], -1.0)
def test_ppo_agent_visualization(self):
    """
    Creates a PPOAgent and visualizes the meta-graph (no APIs) and the NN-component.
    """
    env = GridWorld(world="2x2")
    env.render()
    ppo_agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_2x2_gridworld.json"),
        state_space=GridWorld.grid_world_2x2_flattened_state_space,
        action_space=env.action_space
    )

    # Test graphviz component-graph drawing.
    draw_meta_graph(ppo_agent.root_component, output=rlgraph_dir + "/ppo.gv", apis=False, graph_fns=False)
    self.assertTrue(os.path.isfile(rlgraph_dir + "/ppo.gv"))
    # Test graphviz component-graph w/ API drawing (only the Policy component).
    draw_meta_graph(ppo_agent.policy.neural_network, output=rlgraph_dir + "/ppo_nn.gv", apis=True)
    self.assertTrue(os.path.isfile(rlgraph_dir + "/ppo_nn.gv"))
def test_impala_single_agent_compilation(self):
    """
    Tests IMPALA agent compilation (single-node mode).
    """
    env = GridWorld("2x2")
    agent = IMPALAAgent.from_spec(
        config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
        state_space=env.state_space,
        action_space=env.action_space,
        update_spec=dict(batch_size=16),
        optimizer_spec=dict(type="adam", learning_rate=0.05)
    )
    agent.terminate()
    print("Compiled IMPALA type=single agent.")
def test_ppo_on_2x2_grid_world_with_container_actions(self):
    """
    Creates a PPO agent and runs it via a Runner on a simple 2x2 GridWorld
    using container actions.
    """
    # -----
    # |^|H|
    # -----
    # | |G|   ^=start, looking up
    # -----
    # ftj = forward + turn + jump
    env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/ppo_agent_for_2x2_gridworld_with_container_actions.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = PPOAgent.from_spec(
        agent_config,
        state_space=FloatBox(shape=(4,)),
        action_space=dummy_env.action_space
    )

    time_steps = 5000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True,
        render=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertLessEqual(results["episodes_executed"], time_steps)
    # Assume we have learned something.
    self.assertGreaterEqual(results["mean_episode_reward"], -2.0)
def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
    """
    Tests if the multi gpu strategy can learn successfully on a multi gpu system, but
    also runs on a CPU-only system using fake-GPU logic for testing purposes.
    """
    env_spec = dict(type="grid-world", world="2x2")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/multi_gpu_dqn_for_2x2_gridworld.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = DQNAgent.from_spec(
        agent_config,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=dummy_env.action_space,
    )

    time_steps = 1000
    worker = SingleThreadedWorker(
        env_spec=env_spec,
        agent=agent,
        worker_executes_preprocessing=True,
        preprocessing_spec=preprocessing_spec
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    # Merge the q-tables of all four GPUs.
    agent.last_q_table["q_values"] = agent.last_q_table["q_values"].reshape((48, 4))

    print("STATES:\n{}".format(agent.last_q_table["states"]))
    print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
    self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    self.assertLessEqual(results["episodes_executed"], time_steps / 2)

    # Check q-table for correct values.
    expected_q_values_per_state = {
        (1.0, 0, 0, 0): (-1, -5, 0, -1),
        (0, 1.0, 0, 0): (-1, 1, 0, 0)
    }
    for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
        state, q_values = tuple(state), tuple(q_values)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
def test_impala_single_agent_compilation(self):
    """
    Tests IMPALA agent compilation (single-node mode).
    """
    return  # Test currently disabled (returns immediately).

    if get_backend() == "pytorch":
        return
    env = GridWorld("2x2")
    agent = IMPALAAgent.from_spec(
        config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
        state_space=env.state_space,
        action_space=env.action_space,
        update_spec=dict(batch_size=16),
        optimizer_spec=dict(type="adam", learning_rate=0.05),
        # Prevent session-creation from hanging in docker.
        execution_spec=dict(disable_monitoring=True)
    )
    agent.terminate()
    print("Compiled {}".format(agent))
def test_impala_on_2x2_grid_world(self):
    """
    Creates a single IMPALAAgent and runs it via a simple loop on a 2x2 GridWorld.
    """
    env = GridWorld("2x2")
    agent = IMPALAAgent.from_spec(
        config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
        state_space=env.state_space,
        action_space=env.action_space,
        execution_spec=dict(seed=12),
        update_spec=dict(batch_size=16),
        optimizer_spec=dict(type="adam", learning_rate=0.05)
    )

    learn_updates = 50
    for i in range(learn_updates):
        ret = agent.update()
        mean_return = self._calc_mean_return(ret)
        print("i={} Loss={:.4} Avg-reward={:.2}".format(i, float(ret[1]), mean_return))

    # Assume we have learned something.
    self.assertGreater(mean_return, -0.1)

    # Check the last action probs for the 2 valid next_states (start (after a reset) and one below start).
    action_probs = ret[3]["action_probs"].reshape((80, 4))
    next_states = ret[3]["states"][:, 1:].reshape((80,))
    for s_, probs in zip(next_states, action_probs):
        # Start state:
        # - Assume we picked "right" in state=1 (in order to step into the goal state).
        # - OR we picked "up" or "left" in state=0 (unlikely, but possible).
        if s_ == 0:
            recursive_assert_almost_equal(probs[0], 0.0, decimals=2)
            self.assertTrue(probs[1] > 0.99 or probs[2] > 0.99)
            recursive_assert_almost_equal(probs[3], 0.0, decimals=2)
        # One below start:
        # - Assume we picked "down" in the start state with very large probability.
        # - OR we picked "left" or "down" in state=1 (unlikely, but possible).
        elif s_ == 1:
            recursive_assert_almost_equal(probs[0], 0.0, decimals=2)
            self.assertTrue(probs[1] > 0.99 or probs[2] > 0.99)
            recursive_assert_almost_equal(probs[3], 0.0, decimals=2)

    agent.terminate()
def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
    """
    Tests if the multi gpu strategy can learn successfully on a multi gpu system.
    THIS TEST REQUIRES A MULTI GPU SYSTEM.
    """
    # root_logger.setLevel(DEBUG)  # test
    env = GridWorld("2x2")
    agent = DQNAgent.from_spec(
        config_from_path("configs/multi_gpu_dqn_for_2x2_gridworld.json"),
        dueling_q=False,
        state_space=env.state_space,
        action_space=env.action_space,
        observe_spec=dict(buffer_size=100),
        # Rule of thumb for multi-GPU (with n GPUs): n-fold batch-size and learning rate w/ respect to 1 GPU.
        update_spec=dict(update_interval=4, batch_size=48, sync_interval=32),
        optimizer_spec=dict(type="adam", learning_rate=0.15),
        store_last_q_table=True
    )

    time_steps = 400
    worker = SingleThreadedWorker(env_spec=lambda: env, agent=agent, worker_executes_preprocessing=False)
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    print("STATES:\n{}".format(agent.last_q_table["states"]))
    print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
    self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    self.assertLessEqual(results["episodes_executed"], 250)

    # Check q-table for correct values.
    expected_q_values_per_state = {
        (1.0, 0, 0, 0): (-1, -5, 0, -1),
        (0, 1.0, 0, 0): (-1, 1, 0, 0)
    }
    for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
        state, q_values = tuple(state), tuple(q_values)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
def test_impala_on_2x2_grid_world(self):
    """
    Creates a single IMPALAAgent and runs it via the IMPALAWorker on a simple 2x2 GridWorld.
    """
    env = GridWorld("2x2")
    agent = IMPALAAgent.from_spec(
        config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
        state_space=env.state_space,
        action_space=env.action_space,
        execution_spec=dict(seed=12),
        update_spec=dict(update_interval=4, batch_size=16),
        optimizer_spec=dict(type="adam", learning_rate=0.05),
    )

    learn_updates = 1000
    # Setup the queue runner.
    agent.call_api_method("setup_queue_runner")
    for _ in range(learn_updates):
        agent.update()

    # print("STATES:\n{}".format(agent.last_q_table["states"]))
    # print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

    # self.assertEqual(results["timesteps_executed"], time_steps)
    # self.assertEqual(results["env_frames"], time_steps)
    # self.assertGreaterEqual(results["mean_episode_reward"], -3.5)
    # self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    # self.assertLessEqual(results["episodes_executed"], 350)

    # Check q-table for correct values.
    expected_q_values_per_state = {
        (1.0, 0, 0, 0): (-1, -5, 0, -1),
        (0, 1.0, 0, 0): (-1, 1, 0, 0)
    }
    for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
        state, q_values = tuple(state), tuple(q_values)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
def test_weights_getting_setting(self):
    """
    Tests getting and setting of the Agent's weights.
    """
    env = GridWorld(world="2x2")
    agent = Agent.from_spec(
        config_from_path("configs/dqn_agent_for_functionality_test.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )

    weights = agent.get_weights()
    new_weights = {}
    for key, weight in weights["policy_weights"].items():
        new_weights[key] = weight + 0.01

    agent.set_weights(new_weights)
    new_actual_weights = agent.get_weights()

    recursive_assert_almost_equal(new_actual_weights["policy_weights"], new_weights)
def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
    """
    Tests if the multi gpu strategy can learn successfully on a multi gpu system, but
    also runs on a CPU-only system using fake-GPU logic for testing purposes.
    """
    env_spec = dict(type="grid-world", world="2x2")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/multi_gpu_dqn_for_2x2_gridworld.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = DQNAgent.from_spec(
        agent_config,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=dummy_env.action_space,
    )

    time_steps = 2000
    worker = SingleThreadedWorker(
        env_spec=env_spec,
        agent=agent,
        worker_executes_preprocessing=True,
        preprocessing_spec=preprocessing_spec
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
    self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    self.assertLessEqual(results["episodes_executed"], time_steps / 2)

    # Check all learnt Q-values (see the arithmetic note below this test).
    q_values = agent.graph_executor.execute(("get_q_values", one_hot(np.array([0, 1]), depth=4)))[:]
    recursive_assert_almost_equal(q_values[0], (0.8, -5, 0.9, 0.8), decimals=1)
    recursive_assert_almost_equal(q_values[1], (0.8, 1.0, 0.9, 0.9), decimals=1)
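# Sanity check on the expected Q-values above, assuming a -0.1 step/bump penalty
# (as in the step_flow test in this file), hole=-5, goal=+1, action order
# 0=up/1=right/2=down/3=left, and a discount near 1:
#   state 0 (start):       up=0.8 (-0.1 + V(start)=0.9), right=-5 (hole),
#                          down=0.9 (-0.1 + V(below)=1.0), left=0.8 (bump)
#   state 1 (below start): up=0.8 (-0.1 + 0.9), right=1.0 (goal),
#                          down=0.9 (bump, -0.1 + 1.0), left=0.9 (bump, -0.1 + 1.0)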
def test_double_dqn_on_2x2_grid_world_with_container_actions(self):
    """
    Creates a double DQNAgent and runs it via a Runner on a simple 2x2 GridWorld
    using container actions.
    """
    # ftj = forward + turn + jump
    env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/dqn_agent_for_2x2_gridworld_with_container_actions.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = DQNAgent.from_spec(
        agent_config,
        double_q=True,
        dueling_q=False,
        state_space=FloatBox(shape=(4,)),
        action_space=dummy_env.action_space,
        execution_spec=dict(seed=15),
        store_last_q_table=True
    )

    time_steps = 10000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True,
        render=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    print("LAST q-table:\n{}".format(agent.last_q_table))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -7)
    self.assertGreaterEqual(results["max_episode_reward"], -1.0)
    self.assertLessEqual(results["episodes_executed"], time_steps / 3)

    # Check q-table for correct values.
    expected_q_values_per_state = {
        (0., 0., -1., 0.): {"forward": (-5.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 0., 1., 0.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 0., 0., -1.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 0., 0., 1.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 1., -1., 0.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 1., 1., 0.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 1., 0., -1.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 1., 0., 1.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
    }
    for state, q_values_forward, q_values_jump in zip(
            agent.last_q_table["states"],
            agent.last_q_table["q_values"]["forward"],
            agent.last_q_table["q_values"]["jump"]):
        state, q_values_forward, q_values_jump = tuple(state), tuple(q_values_forward), tuple(q_values_jump)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(q_values_forward, expected_q_values_per_state[state]["forward"], decimals=0)
        recursive_assert_almost_equal(q_values_jump, expected_q_values_per_state[state]["jump"], decimals=0)
def test_long_chain_grid_world(self):
    """
    Tests a minimalistic long-chain GridWorld.
    """
    env = GridWorld(world="long-chain")

    # Simple test runs with fixed actions.
    # X=player's position
    s = env.reset()  # ["X G"]
    self.assertTrue(s == 33)
    s, r, t, _ = env.step(2)  # down: ["X G"]
    self.assertTrue(s == 33)
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t, _ = env.step(1)  # right: ["SX G"]
    self.assertTrue(s == 34)
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)

    env.reset()  # ["X G"]
    # Right, left, down, up, right -> Move one right each iteration.
    for x in range(20):
        s, r, t, _ = env.step(1)
        self.assertTrue(s == x + 33 + 1)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(3)
        self.assertTrue(s == x + 33)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(2)
        self.assertTrue(s == x + 33)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(0)
        self.assertTrue(s == x + 33)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(1)
        self.assertTrue(s == x + 33 + 1)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
def test_4x4_grid_world_with_container_actions(self):
    """
    Tests a 4x4 GridWorld using forward+turn+jump container actions.
    """
    env = GridWorld(world="4x4", action_type="ftj", state_representation="xy+orientation")

    # Simple test runs with fixed actions
    # (the action encoding is summarized in the note after this test).

    # Fall into the hole.
    s = env.reset()  # [0, 0, 0, 1]: x, y, orientation (encoded as a 2D vector)
    recursive_assert_almost_equal(s, [0, 0, 0, 1])
    s, r, t, _ = env.step(dict(turn=2, forward=2))  # turn=2 (right), move=2 (forward), jump=0
    recursive_assert_almost_equal(s, [1, 0, 1, 0])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t, _ = env.step(dict(turn=2, forward=1))  # turn=2 (right), move=1 (stay), jump=0
    recursive_assert_almost_equal(s, [1, 0, 0, -1])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t, _ = env.step(dict(turn=1, forward=2))  # turn=1 (no turn), move=2 (forward), jump=0
    recursive_assert_almost_equal(s, [1, 1, 0, -1])
    self.assertTrue(r == -5.0)
    self.assertTrue(t)

    # Jump quite a lot and reach the goal.
    env.reset()  # [0, 0, 0, 1]
    s, r, t, _ = env.step(dict(turn=2, forward=1))
    recursive_assert_almost_equal(s, [0, 0, 1, 0])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t, _ = env.step(dict(turn=1, forward=1, jump=1))
    recursive_assert_almost_equal(s, [2, 0, 1, 0])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t, _ = env.step(dict(turn=2, forward=2))
    recursive_assert_almost_equal(s, [2, 1, 0, -1])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t, _ = env.step(dict(turn=1, forward=2, jump=1))
    recursive_assert_almost_equal(s, [2, 3, 0, -1])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t, _ = env.step(dict(turn=2, forward=0))
    recursive_assert_almost_equal(s, [3, 3, -1, 0])
    self.assertTrue(r == 1.0)
    self.assertTrue(t)

    # Run against a wall.
    env.reset()  # [0, 0, 0, 1]
    s, r, t, _ = env.step(dict(turn=1, forward=0))
    recursive_assert_almost_equal(s, [0, 1, 0, 1])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t, _ = env.step(dict(turn=0, forward=2))
    recursive_assert_almost_equal(s, [0, 1, -1, 0])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)

    # Jump over a hole (no reset).
    s, r, t, _ = env.step(dict(turn=2, forward=1))  # turn around
    s, r, t, _ = env.step(dict(turn=2, forward=1))
    recursive_assert_almost_equal(s, [0, 1, 1, 0])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t, _ = env.step(dict(turn=1, forward=1, jump=1))
    recursive_assert_almost_equal(s, [2, 1, 1, 0])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
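# Container-action encoding used in the ftj tests (inferred from the inline
# comments and state transitions above):
#   turn:    0=turn left, 1=no turn, 2=turn right
#   forward: 0=move backward, 1=stay, 2=move forward
#   jump:    0=no jump, 1=jump two fields ahead (in the current orientation)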
def test_dqn_functionality(self):
    """
    Creates a DQNAgent and runs it for a few steps in a GridWorld to vigorously test
    all steps of the learning process.
    """
    env = GridWorld(world="2x2", save_mode=True)  # no holes, just fire
    agent = Agent.from_spec(  # type: DQNAgent
        config_from_path("configs/dqn_agent_for_functionality_test.json"),
        double_q=True,
        dueling_q=True,
        state_space=env.state_space,
        action_space=env.action_space,
        discount=0.95
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld(world="2x2", save_mode=True),
        agent=agent
    )
    test = AgentTest(worker=worker)

    # Helper python DQNLossFunction object.
    loss_func = DQNLossFunction(backend="python", double_q=True, discount=agent.discount)
    loss_func.when_input_complete(
        input_spaces=dict(loss_per_item=[
            spaces.FloatBox(shape=(4,), add_batch_rank=True),
            spaces.IntBox(4, add_batch_rank=True),
            spaces.FloatBox(add_batch_rank=True),
            spaces.BoolBox(add_batch_rank=True),
            spaces.FloatBox(shape=(4,), add_batch_rank=True),
            spaces.FloatBox(shape=(4,), add_batch_rank=True)
        ]),
        action_space=env.action_space
    )

    matrix1_qnet = np.array([[0.9] * 2] * 4)
    matrix2_qnet = np.array([[0.8] * 5] * 2)
    matrix1_target_net = np.array([[0.9] * 2] * 4)
    matrix2_target_net = np.array([[0.8] * 5] * 2)

    a = self._calculate_action(0, matrix1_qnet, matrix2_qnet)

    # 1st step -> Expect insert into python-buffer.
    # action: up (0)
    test.step(1, reset=True)
    # Environment's new state.
    test.check_env("state", 0)
    # Agent's buffer.
    test.check_agent("states_buffer", [[1.0, 0.0, 0.0, 0.0]], key_or_index="env_0")  # <- prev state (preprocessed)
    test.check_agent("actions_buffer", [a], key_or_index="env_0")
    test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
    test.check_agent("terminals_buffer", [False], key_or_index="env_0")
    # Memory contents.
    test.check_var("replay-memory/index", 0)
    test.check_var("replay-memory/size", 0)
    test.check_var("replay-memory/memory/states", np.array([[0] * 4] * agent.memory.capacity))
    test.check_var("replay-memory/memory/actions", np.array([0] * agent.memory.capacity))
    test.check_var("replay-memory/memory/rewards", np.array([0] * agent.memory.capacity))
    test.check_var("replay-memory/memory/terminals", np.array([False] * agent.memory.capacity))
    # Check policy and target-policy weights (should be the same).
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_qnet)
    test.check_var("target-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_qnet)

    # 2nd step -> expect insert into memory (and python buffer should be empty again).
    # action: up (0)
    # Also check the policy and target-policy values (should be equal at this point).
    test.step(1)
    test.check_env("state", 0)
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_var("replay-memory/index", 2)
    test.check_var("replay-memory/size", 2)
    test.check_var(
        "replay-memory/memory/states",
        np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]] +
                 [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 2))
    )
    test.check_var("replay-memory/memory/actions",
                   np.array([0, 0] + [0] * (agent.memory.capacity - 2)))
    test.check_var("replay-memory/memory/rewards",
                   np.array([-1.0, -1.0] + [0.0] * (agent.memory.capacity - 2)))
    test.check_var("replay-memory/memory/terminals",
                   np.array([False, True] + [False] * (agent.memory.capacity - 2)))
    # Check policy and target-policy weights (should be the same).
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_qnet)
    test.check_var("target-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_qnet)

    # 3rd and 4th step -> expect another insert into memory (and python buffer should be empty again).
    # actions: down (2), up (0) <- exploring is True = more random actions
    # Expect an update to the policy variables (leave target as-is, no sync yet).
    test.step(2, use_exploration=True)
    test.check_env("state", 0)
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_var("replay-memory/index", 4)
    test.check_var("replay-memory/size", 4)
    test.check_var(
        "replay-memory/memory/states",
        np.array([[1.0, 0.0, 0.0, 0.0]] * 3 + [[0.0, 1.0, 0.0, 0.0]] +
                 [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 4))
    )
    test.check_var("replay-memory/memory/actions",
                   np.array([0, 0, 2, 0] + [0] * (agent.memory.capacity - 4)))
    test.check_var("replay-memory/memory/rewards",
                   np.array([-1.0] * 4 +  # + [-3.0] +
                            [0.0] * (agent.memory.capacity - 4)))
    test.check_var("replay-memory/memory/terminals",
                   np.array([False, True] * 2 + [False] * (agent.memory.capacity - 4)))
    # Get the latest memory batch.
    expected_batch = dict(
        states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
        actions=np.array([0, 1]),
        rewards=np.array([-1.0, -3.0]),
        terminals=np.array([False, True]),
        next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]])
    )
    test.check_agent("last_memory_batch", expected_batch)

    # Calculate the weight updates and check them against the actually updated weights of the DQNAgent.
    mat_updated = self._helper_update_matrix(
        expected_batch, matrix1_qnet, matrix2_qnet, matrix1_target_net, matrix2_target_net, agent, loss_func
    )

    # Check policy and target-policy weights (policy should be updated now).
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", mat_updated[0], decimals=4)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net)
    test.check_var("dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
                   mat_updated[1], decimals=4)
    test.check_var("target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
                   matrix2_target_net)

    matrix1_qnet = mat_updated[0]
    matrix2_qnet = mat_updated[1]

    # 5th step -> Another buffer update check.
    # action: down (2) (weights have been updated -> different actions)
    test.step(1)
    test.check_env("state", 3)
    # <- all empty b/c we reached the end of the episode (buffer gets force-flushed).
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_agent("last_memory_batch", expected_batch)
    test.check_var("replay-memory/index", 5)
    test.check_var("replay-memory/size", 5)
    test.check_var(
        "replay-memory/memory/states",
        np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                 [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 5))
    )
    test.check_var("replay-memory/memory/actions", np.array([0, 0, 0, 1, 2, 0]))
    test.check_var("replay-memory/memory/rewards", np.array([-1.0] * 3 + [-3.0, 1.0, 0.0]))
    test.check_var("replay-memory/memory/terminals", np.array([False, True] * 2 + [True, False]))
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=4)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net)
    test.check_var("dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
                   mat_updated[1], decimals=4)
    test.check_var("target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
                   matrix2_target_net)

    # 6th/7th step (with exploration enabled) -> Another buffer update check.
    # actions: up, down (0, 2)
    test.step(2, use_exploration=True)
    test.check_env("state", 1)
    # <- all empty again; flushed after the 6th step (when the buffer was full).
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_agent("last_memory_batch", expected_batch)
    test.check_var("replay-memory/index", 1)  # index has been rolled over (memory capacity is 6)
    test.check_var("replay-memory/size", 6)
    test.check_var(
        "replay-memory/memory/states",
        np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] + [[1.0, 0.0, 0.0, 0.0]])
    )
    test.check_var("replay-memory/memory/actions", np.array([2, 0, 0, 1, 2, 0]))
    test.check_var("replay-memory/memory/rewards", np.array([-1.0] * 3 + [-3.0, 1.0, -1.0]))
    test.check_var("replay-memory/memory/terminals", np.array([True, True, False, True, True, False]))
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=4)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net)
    test.check_var("dueling-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet, decimals=4)
    test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_target_net)

    # 8th step -> Another buffer update check and weights update and sync.
    # action: down (2)
    test.step(1)
    test.check_env("state", 1)
    test.check_agent("states_buffer", [1], key_or_index="env_0")
    test.check_agent("actions_buffer", [2], key_or_index="env_0")
    test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
    test.check_agent("terminals_buffer", [False], key_or_index="env_0")
    expected_batch = dict(
        states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
        actions=np.array([0, 1]),
        rewards=np.array([-1.0, -3.0]),
        terminals=np.array([True, True]),
        next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]])
        # TODO: <- This is wrong and must be fixed
        # (next-state of first item is from a previous insert and unrelated to first item).
    )
    test.check_agent("last_memory_batch", expected_batch)
    test.check_var("replay-memory/index", 1)
    test.check_var("replay-memory/size", 6)
    test.check_var(
        "replay-memory/memory/states",
        np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] + [[1.0, 0.0, 0.0, 0.0]])
    )
    test.check_var("replay-memory/memory/actions", np.array([2, 0, 0, 1, 2, 0]))
    test.check_var("replay-memory/memory/rewards", np.array([-1.0, -1.0, -1.0, -3.0, 1.0, -1.0]))
    test.check_var("replay-memory/memory/terminals", np.array([True, True, False, True, True, False]))

    # Assume that the sync happens first (matrices are already the same when updating).
    mat_updated = self._helper_update_matrix(
        expected_batch, matrix1_qnet, matrix2_qnet, matrix1_qnet, matrix2_qnet, agent, loss_func
    )

    # Now the target-net should again be 1 step behind the policy-net.
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", mat_updated[0], decimals=2)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=2)  # again: old matrix
    test.check_var("dueling-policy/dueling-action-adapter/action-layer/dense/kernel", mat_updated[1], decimals=2)
    test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet, decimals=2)
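# For reference: a minimal sketch (not part of the RLGraph API) of the double-Q
# TD-target that the `_helper_update_matrix` helper and the python-backend
# DQNLossFunction above are expected to compute for each sampled batch.
# `q_net` and `target_net` are assumed to be callables mapping a batch of
# states to Q-value arrays of shape (batch_size, num_actions).
def _double_dqn_td_targets_sketch(self, batch, q_net, target_net, discount):
    # The online net selects the argmax action in the next state ...
    a_max = np.argmax(q_net(batch["next_states"]), axis=-1)
    # ... while the target net evaluates that action (double-Q decoupling).
    q_sel = target_net(batch["next_states"])[np.arange(len(a_max)), a_max]
    # Terminal transitions contribute no bootstrapped value.
    return batch["rewards"] + discount * (1.0 - batch["terminals"]) * q_sel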
def test_2x2_grid_world(self):
    """
    Tests a minimalistic 2x2 GridWorld.
    """
    env = GridWorld(world="2x2")
    # Make everything deterministic.
    env.seed(55)

    # Simple test runs with fixed actions.
    # X=player's position
    s = env.reset()  # ["XH", " G"]
    self.assertTrue(s == 0)
    s, r, t, _ = env.step(2)  # down: [" H", "XG"]
    self.assertTrue(s == 1)
    self.assertTrue(r == -1.0)
    self.assertTrue(t is False)
    s, r, t, _ = env.step(1)  # right: [" H", " X"]
    self.assertTrue(s == 3)
    self.assertTrue(r == 1.0)
    self.assertTrue(t is True)

    env.reset()  # ["XH", " G"]
    s, r, t, _ = env.step(1)  # right: [" X", " G"] -> in the hole
    self.assertTrue(s == 2)
    self.assertTrue(r == -5.0)
    self.assertTrue(t is True)

    # Run against a wall.
    env.reset()  # ["XH", " G"]
    s, r, t, _ = env.step(3)  # left: ["XH", " G"]
    self.assertTrue(s == 0)
    self.assertTrue(r == -1.0)
    self.assertTrue(t is False)
    s, r, t, _ = env.step(2)  # down: [" H", "XG"]
    self.assertTrue(s == 1)
    self.assertTrue(r == -1.0)
    self.assertTrue(t is False)
    s, r, t, _ = env.step(0)  # up: ["XH", " G"]
    self.assertTrue(s == 0)
    self.assertTrue(r == -1.0)
    self.assertTrue(t is False)