def test_multi_gpu_ppo_agent_learning_test_gridworld_2x2(self):
    """
    Trains a PPO agent built from the multi-GPU config on the 2x2 grid world.

    Per the config name, this exercises the multi-GPU strategy; the test is also
    meant to run on CPU-only machines through the fake-GPU logic. It passes when
    the mean episode reward after 10000 time steps is greater than -1.0.
    """
    grid_world_spec = {"type": "grid-world", "world": "2x2"}
    # Throw-away env instance; only its action space is needed to build the agent.
    probe_env = GridWorld.from_spec(grid_world_spec)

    ppo_config = config_from_path("configs/multi_gpu_ppo_for_2x2_gridworld.json")
    # The worker runs the preprocessing, so its spec is taken out of the agent config.
    preprocessor_spec = ppo_config.pop("preprocessing_spec")
    ppo_agent = PPOAgent.from_spec(
        ppo_config,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=probe_env.action_space,
    )

    num_steps = 10000
    runner = SingleThreadedWorker(
        env_spec=grid_world_spec,
        agent=ppo_agent,
        worker_executes_preprocessing=True,
        preprocessing_spec=preprocessor_spec,
    )
    stats = runner.execute_timesteps(num_steps, use_exploration=True)

    # Assume we have learned something.
    # TODO: This test needs more tuning. -1.0 is not great for the 2x2 grid world.
    mean_reward = stats["mean_episode_reward"]
    self.assertGreater(mean_reward, -1.0)
def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
    """
    Tests if the multi gpu strategy can learn successfully on a multi gpu system, but
    also runs on a CPU-only system using fake-GPU logic for testing purposes.

    Runs a DQN agent (built from the multi-GPU config) for 1000 time steps on the
    2x2 grid world, then checks the run statistics and the agent's last Q-table
    against the expected (rounded) Q-values per non-terminal state.
    """
    env_spec = dict(type="grid-world", world="2x2")
    # Dummy env instance; only used to obtain the action space.
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/multi_gpu_dqn_for_2x2_gridworld.json")
    # Preprocessing is executed by the worker, so remove its spec from the agent config.
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = DQNAgent.from_spec(
        agent_config,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=dummy_env.action_space,
    )

    time_steps = 1000
    worker = SingleThreadedWorker(
        env_spec=env_spec,
        agent=agent,
        worker_executes_preprocessing=True,
        preprocessing_spec=preprocessing_spec,
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    # Merge q-tables of all four GPUs into one (48, 4) table.
    agent.last_q_table["q_values"] = agent.last_q_table["q_values"].reshape((48, 4))

    print("STATES:\n{}".format(agent.last_q_table["states"]))
    # `np.round_` was removed in NumPy 2.0; `np.round` is the supported spelling.
    print("\n\nQ(s,a)-VALUES:\n{}".format(
        np.round(agent.last_q_table["q_values"], decimals=2)))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
    self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    self.assertLessEqual(results["episodes_executed"], time_steps / 2)

    # Check q-table for correct values.
    expected_q_values_per_state = {
        (1.0, 0, 0, 0): (-1, -5, 0, -1),
        (0, 1.0, 0, 0): (-1, 1, 0, 0),
    }
    for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
        state, q_values = tuple(state), tuple(q_values)
        # Use a TestCase assertion (not a bare `assert`) so the check survives `python -O`.
        self.assertIn(
            state, expected_q_values_per_state,
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        )
        recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
def test_episodes(self):
    """
    Smoke test for the worker's episode loop.

    Runs a random agent for 5 episodes of at most 10 time steps each and checks
    that the returned result dict reports plausible counters.
    """
    random_agent = RandomAgent(
        action_space=self.environment.action_space,
        state_space=self.environment.state_space,
    )
    runner = SingleThreadedWorker(
        env_spec=lambda: self.environment,
        agent=random_agent,
        frameskip=1,
        worker_executes_preprocessing=False,
    )

    num_episodes = 5
    step_cap_per_episode = 10
    outcome = runner.execute_episodes(num_episodes, max_timesteps_per_episode=step_cap_per_episode)

    # Upper bound on steps/frames: 5 episodes * 10 steps each.
    step_budget = num_episodes * step_cap_per_episode
    self.assertLessEqual(outcome['timesteps_executed'], step_budget)
    self.assertEqual(outcome['episodes_executed'], num_episodes)
    self.assertLessEqual(outcome['env_frames'], step_budget)
    self.assertGreaterEqual(outcome['runtime'], 0.0)
def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
    """
    Tests if the multi gpu strategy can learn successfully on a multi gpu system.

    THIS TEST REQUIRES A MULTI GPU SYSTEM.

    Runs a (non-dueling) DQN agent for 400 time steps on the 2x2 grid world, then
    checks the run statistics and the agent's last Q-table against the expected
    (rounded) Q-values per non-terminal state.
    """
    # For verbose output while debugging: root_logger.setLevel(DEBUG)
    env = GridWorld("2x2")
    agent = DQNAgent.from_spec(
        config_from_path("configs/multi_gpu_dqn_for_2x2_gridworld.json"),
        dueling_q=False,
        state_space=env.state_space,
        action_space=env.action_space,
        observe_spec=dict(buffer_size=100),
        # Rule of thumb for multi-GPU (with n GPUs): n-fold batch-size and learning rate w/ respect to 1 GPU.
        update_spec=dict(update_interval=4, batch_size=48, sync_interval=32),
        optimizer_spec=dict(type="adam", learning_rate=0.15),
        # Needed so that `agent.last_q_table` is available for the checks below.
        store_last_q_table=True
    )

    time_steps = 400
    worker = SingleThreadedWorker(env_spec=lambda: env, agent=agent, worker_executes_preprocessing=False)
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    print("STATES:\n{}".format(agent.last_q_table["states"]))
    # `np.round_` was removed in NumPy 2.0; `np.round` is the supported spelling.
    print("\n\nQ(s,a)-VALUES:\n{}".format(np.round(agent.last_q_table["q_values"], decimals=2)))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
    self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    self.assertLessEqual(results["episodes_executed"], 250)

    # Check q-table for correct values.
    expected_q_values_per_state = {
        (1.0, 0, 0, 0): (-1, -5, 0, -1),
        (0, 1.0, 0, 0): (-1, 1, 0, 0),
    }
    for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
        state, q_values = tuple(state), tuple(q_values)
        # Use a TestCase assertion (not a bare `assert`) so the check survives `python -O`.
        self.assertIn(
            state, expected_q_values_per_state,
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        )
        recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
def test_ppo_on_container_state_and_action_spaces_and_very_large_rewards(self):
    """
    Tests stability of PPO on an extreme env producing strange container states and large rewards and
    requiring container actions.

    The env is a `RandomEnv` with a Dict state space, a Dict action space (two float
    components and one bool component) and hugely negative rewards. The test has no
    assertions: it passes if 1e6 time steps execute without raising.
    """
    env = RandomEnv(
        state_space=Dict({"F_position": FloatBox(shape=(2, ), low=0.01, high=0.02)}),
        action_space=Dict({
            "F_direction_low-1.0_high1.0": FloatBox(shape=(), low=-1.0, high=1.0),
            "F_forward_direction_low-1.0_high1.0": FloatBox(shape=(), low=-1.0, high=1.0),
            "B_jump": BoolBox()
        }),
        # Hugely negative rewards. The bounds were inverted before (low=-1000.0 > high=-100000.0);
        # the interval is the same, but `low` must be the smaller value for a well-formed box.
        # NOTE(review): presumably sampled uniformly between low and high - confirm against RandomEnv.
        reward_space=FloatBox(low=-100000.0, high=-1000.0),
        # Practically never terminate, so episodes become extremely long.
        terminal_prob=0.0000001
    )
    agent_config = config_from_path("configs/ppo_agent_for_random_env_with_container_spaces.json")
    agent = PPOAgent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )

    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        preprocessing_spec=None,
        worker_executes_preprocessing=True,
        # Optional per-episode debug output:
        # episode_finish_callback=lambda episode_return, duration, timesteps, env_num:
        # print("episode return {}; steps={}".format(episode_return, timesteps))
    )
    results = worker.execute_timesteps(num_timesteps=int(1e6), use_exploration=True)

    print(results)
def test_prioritized_replay_atari_throughput(self):
    """
    Throughput benchmark for a DQN agent backed by a prioritized replay memory.

    Runs one million time steps on `self.env` (an Atari environment, going by the
    test name) and prints the agent's and the environment's throughput. Nothing
    is asserted.
    """
    prioritized_memory_spec = {
        "type": 'prioritized',
        "capacity": 100000,
        "next_states": True,
    }
    dqn = DQNAgent(
        states_spec=self.env.state_space,
        action_spec=self.env.action_space,
        network_spec=self.network,
        memory_spec=prioritized_memory_spec,
    )
    runner = SingleThreadedWorker(env_spec=lambda: self.env, agent=dqn, frameskip=1)

    stats = runner.execute_timesteps(num_timesteps=1000000, use_exploration=True)

    agent_rate = stats['ops_per_second']
    print('Agent throughput = {} ops/s'.format(agent_rate))
    env_rate = stats['env_frames_per_second']
    print('Environment throughput = {} frames/s'.format(env_rate))
def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
    """
    Trains a DQN agent built from the multi-GPU config on the 2x2 grid world.

    Per its config, this exercises the multi-GPU strategy, but it is also meant to
    run on CPU-only machines via the fake-GPU logic. After 2000 time steps the run
    statistics are checked, and the learnt Q-values for states 0 and 1 are queried
    directly from the graph and compared to the expected values (1 decimal).
    """
    grid_world_spec = {"type": "grid-world", "world": "2x2"}
    # Throw-away env instance; only its action space is needed to build the agent.
    probe_env = GridWorld.from_spec(grid_world_spec)

    dqn_config = config_from_path("configs/multi_gpu_dqn_for_2x2_gridworld.json")
    # The worker runs the preprocessing, so its spec is taken out of the agent config.
    preprocessor_spec = dqn_config.pop("preprocessing_spec")
    dqn_agent = DQNAgent.from_spec(
        dqn_config,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=probe_env.action_space,
    )

    num_steps = 2000
    runner = SingleThreadedWorker(
        env_spec=grid_world_spec,
        agent=dqn_agent,
        worker_executes_preprocessing=True,
        preprocessing_spec=preprocessor_spec,
    )
    stats = runner.execute_timesteps(num_steps, use_exploration=True)

    self.assertEqual(stats["timesteps_executed"], num_steps)
    self.assertEqual(stats["env_frames"], num_steps)
    self.assertGreaterEqual(stats["mean_episode_reward"], -4.5)
    self.assertGreaterEqual(stats["max_episode_reward"], 0.0)
    self.assertLessEqual(stats["episodes_executed"], num_steps / 2)

    # Check all learnt Q-values: feed one-hot encodings of states 0 and 1.
    probe_states = one_hot(np.array([0, 1]), depth=4)
    learnt_q_values = dqn_agent.graph_executor.execute(("get_q_values", probe_states))[:]
    expected_rows = (
        (0.8, -5, 0.9, 0.8),
        (0.8, 1.0, 0.9, 0.9),
    )
    for row_index, expected_row in enumerate(expected_rows):
        recursive_assert_almost_equal(learnt_q_values[row_index], expected_row, decimals=1)
def run_experiment(self, environment, experiment_num=0):
    """
    Runs one experiment with a freshly created agent on the given environment.

    The environment is wrapped in an `RLgraphEnvironmentWrapper`, an agent is built
    from `self.config`, and a `SingleThreadedWorker` executes either a fixed number
    of time steps (if `max_timesteps` is set in the config) or a number of episodes.

    Args:
        environment: The environment to run on; gets wrapped before use.
        experiment_num (int): Index of this experiment. Only for the first
            experiment (0) is a stored model loaded from `self.load_model_file`
            (if that is set).

    Returns:
        dict: Keys `initial_reset_time` (always 0), `episode_rewards`,
            `episode_timesteps` and `episode_end_times`, the latter three taken
            from the worker after the run.
    """
    environment = RLgraphEnvironmentWrapper(environment)
    environment.add_episode_end_callback(self.episode_finished, environment, runner_id=1)

    # Work on a copy so that popping the run-control keys leaves `self.config` intact.
    config = copy(self.config)
    max_episodes = config.pop('max_episodes', None)
    max_timesteps = config.pop('max_timesteps', None)
    # Popped without a default: this key must be present in the config.
    max_episode_timesteps = config.pop('max_episode_timesteps')

    # Whatever remains in `config` is the agent spec.
    agent = Agent.from_spec(
        spec=config,
        state_space=environment.state_space,
        action_space=environment.action_space,
    )

    if experiment_num == 0 and self.load_model_file:
        # Log the same attribute that is actually loaded (was `self.load_model` before,
        # which does not match the guard or the `load_model()` argument).
        logging.info("Loading model data from file: %s", self.load_model_file)
        agent.load_model(self.load_model_file)

    runner = SingleThreadedWorker(agent=agent, environment=environment)

    environment.reset()
    agent.reset_buffers()

    # A time-step budget takes precedence over an episode budget.
    if max_timesteps:
        runner.execute_timesteps(
            num_timesteps=max_timesteps,
            max_timesteps_per_episode=max_episode_timesteps
        )
    else:
        runner.execute_episodes(
            num_episodes=max_episodes,
            max_timesteps_per_episode=max_episode_timesteps
        )

    return dict(
        initial_reset_time=0,
        episode_rewards=runner.episode_rewards,
        episode_timesteps=runner.episode_steps,
        episode_end_times=runner.episode_durations
    )
def test_dqn_functionality(self):
    """
    Creates a DQNAgent and runs it for a few steps in a GridWorld to vigorously test
    all steps of the learning process.

    The agent (double-Q, dueling-Q, discount 0.95) is stepped through a 2x2 grid
    world in small groups of steps. After each group, the test checks:
    - the env's state,
    - the agent's python-side buffers (states/actions/rewards/terminals),
    - the replay-memory variables (index, size and stored records),
    - the kernels of the policy and the target policy.

    Expected weight updates are computed by `self._helper_update_matrix`, using a
    python-backend `DQNLossFunction` and local copies of the weight matrices.
    The checks are strictly order-dependent: each one relies on the state left
    behind by all previous `test.step()` calls.
    """
    env = GridWorld(world="2x2", save_mode=True)  # no holes, just fire
    agent = Agent.from_spec(  # type: DQNAgent
        config_from_path("configs/dqn_agent_for_functionality_test.json"),
        double_q=True,
        dueling_q=True,
        state_space=env.state_space,
        action_space=env.action_space,
        discount=0.95)
    # The worker gets its own env instance (created by the lambda); `env` above is
    # only used for its state/action spaces.
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld(world="2x2", save_mode=True),
        agent=agent)
    test = AgentTest(worker=worker)

    # Helper python DQNLossFunc object.
    loss_func = DQNLossFunction(backend="python", double_q=True, discount=agent.discount)
    loss_func.when_input_complete(input_spaces=dict(loss_per_item=[
        spaces.FloatBox(shape=(4, ), add_batch_rank=True),
        spaces.IntBox(4, add_batch_rank=True),
        spaces.FloatBox(add_batch_rank=True),
        spaces.BoolBox(add_batch_rank=True),
        spaces.FloatBox(shape=(4, ), add_batch_rank=True),
        spaces.FloatBox(shape=(4, ), add_batch_rank=True)
    ]), action_space=env.action_space)

    # Local copies of the expected initial weights. Policy and target policy start
    # out identical (this is verified against the actual variables after step 1).
    matrix1_qnet = np.array([[0.9] * 2] * 4)
    matrix2_qnet = np.array([[0.8] * 5] * 2)
    matrix1_target_net = np.array([[0.9] * 2] * 4)
    matrix2_target_net = np.array([[0.8] * 5] * 2)

    # Expected action for state 0 under the initial weights.
    a = self._calculate_action(0, matrix1_qnet, matrix2_qnet)

    # 1st step -> Expect insert into python-buffer.
    # action: up (0)
    test.step(1, reset=True)
    # Environment's new state.
    test.check_env("state", 0)
    # Agent's buffer.
    test.check_agent("states_buffer", [[1.0, 0.0, 0.0, 0.0]], key_or_index="env_0")  # <- prev state (preprocessed)
    test.check_agent("actions_buffer", [a], key_or_index="env_0")
    test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
    test.check_agent("terminals_buffer", [False], key_or_index="env_0")
    # Memory contents (nothing has been inserted yet -> all zeros/False).
    test.check_var("replay-memory/index", 0)
    test.check_var("replay-memory/size", 0)
    test.check_var("replay-memory/memory/states", np.array([[0] * 4] * agent.memory.capacity))
    test.check_var("replay-memory/memory/actions", np.array([0] * agent.memory.capacity))
    test.check_var("replay-memory/memory/rewards", np.array([0] * agent.memory.capacity))
    test.check_var("replay-memory/memory/terminals", np.array([False] * agent.memory.capacity))
    # Check policy and target-policy weights (should be the same).
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var(
        "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
        matrix2_qnet)
    test.check_var(
        "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
        matrix2_qnet)

    # 2nd step -> expect insert into memory (and python buffer should be empty again).
    # action: up (0)
    # Also check the policy and target policy values (Should be equal at this point).
    test.step(1)
    test.check_env("state", 0)
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_var("replay-memory/index", 2)
    test.check_var("replay-memory/size", 2)
    test.check_var(
        "replay-memory/memory/states",
        np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]] +
                 [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 2)))
    test.check_var("replay-memory/memory/actions", np.array([0, 0] + [0] * (agent.memory.capacity - 2)))
    test.check_var(
        "replay-memory/memory/rewards",
        np.array([-1.0, -1.0] + [0.0] * (agent.memory.capacity - 2)))
    test.check_var(
        "replay-memory/memory/terminals",
        np.array([False, True] + [False] * (agent.memory.capacity - 2)))
    # Check policy and target-policy weights (should be the same).
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var(
        "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
        matrix2_qnet)
    test.check_var(
        "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
        matrix2_qnet)

    # 3rd and 4th step -> expect another insert into memory (and python buffer should be empty again).
    # actions: down (2), up (0)  <- exploring is True = more random actions
    # Expect an update to the policy variables (leave target as is (no sync yet)).
    test.step(2, use_exploration=True)
    test.check_env("state", 0)
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_var("replay-memory/index", 4)
    test.check_var("replay-memory/size", 4)
    test.check_var(
        "replay-memory/memory/states",
        np.array([[1.0, 0.0, 0.0, 0.0]] * 3 + [[0.0, 1.0, 0.0, 0.0]] +
                 [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 4)))
    test.check_var(
        "replay-memory/memory/actions",
        np.array([0, 0, 2, 0] + [0] * (agent.memory.capacity - 4)))
    test.check_var(
        "replay-memory/memory/rewards",
        np.array([-1.0] * 4 +  # + [-3.0] +
                 [0.0] * (agent.memory.capacity - 4)))
    test.check_var(
        "replay-memory/memory/terminals",
        np.array([False, True] * 2 + [False] * (agent.memory.capacity - 4)))
    # Get the latest memory batch.
    # NOTE(review): this batch (actions [0, 1], rewards [-1.0, -3.0]) does not match the
    # memory contents checked just above (actions [0, 0, 2, 0], rewards all -1.0) -
    # one of the two expectations looks stale; confirm against the actual agent behavior.
    expected_batch = dict(states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
                          actions=np.array([0, 1]),
                          rewards=np.array([-1.0, -3.0]),
                          terminals=np.array([False, True]),
                          next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]]))
    test.check_agent("last_memory_batch", expected_batch)

    # Calculate the weight updates and check against the actually updated weights of the DQN agent.
    mat_updated = self._helper_update_matrix(expected_batch, matrix1_qnet, matrix2_qnet, matrix1_target_net,
                                             matrix2_target_net, agent, loss_func)
    # Check policy and target-policy weights (policy should be updated now).
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", mat_updated[0], decimals=4)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net)
    test.check_var(
        "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
        mat_updated[1], decimals=4)
    test.check_var(
        "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
        matrix2_target_net)

    # From here on, the local policy matrices track the updated weights; the local
    # target matrices still hold the initial values.
    matrix1_qnet = mat_updated[0]
    matrix2_qnet = mat_updated[1]

    # 5th step -> Another buffer update check.
    # action: down (2) (weights have been updated -> different actions)
    test.step(1)
    test.check_env("state", 3)
    test.check_agent(
        "states_buffer", [], key_or_index="env_0"
    )  # <- all empty b/c we reached end of episode (buffer gets force-flushed)
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    # No new update expected in this step -> last sampled batch is unchanged.
    test.check_agent("last_memory_batch", expected_batch)
    test.check_var("replay-memory/index", 5)
    test.check_var("replay-memory/size", 5)
    test.check_var(
        "replay-memory/memory/states",
        np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                 [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 5)))
    # NOTE(review): the arrays below have a fixed length of 6 instead of being derived
    # from `agent.memory.capacity` (a later inline comment says capacity is 6).
    test.check_var("replay-memory/memory/actions", np.array([0, 0, 0, 1, 2, 0]))
    test.check_var("replay-memory/memory/rewards", np.array([-1.0] * 3 + [-3.0, 1.0, 0.0]))
    test.check_var("replay-memory/memory/terminals", np.array([False, True] * 2 + [True, False]))
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=4)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net)
    test.check_var(
        "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
        mat_updated[1], decimals=4)
    test.check_var(
        "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
        matrix2_target_net)

    # 6th/7th step (with exploration enabled) -> Another buffer update check.
    # action: up, down (0, 2)
    test.step(2, use_exploration=True)
    test.check_env("state", 1)
    test.check_agent(
        "states_buffer", [], key_or_index="env_0"
    )  # <- all empty again; flushed after 6th step (when buffer was full).
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_agent("last_memory_batch", expected_batch)
    test.check_var("replay-memory/index", 1)  # index has been rolled over (memory capacity is 6)
    test.check_var("replay-memory/size", 6)
    test.check_var(
        "replay-memory/memory/states",
        np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                 [[1.0, 0.0, 0.0, 0.0]]))
    test.check_var("replay-memory/memory/actions", np.array([2, 0, 0, 1, 2, 0]))
    test.check_var("replay-memory/memory/rewards", np.array([-1.0] * 3 + [-3.0, 1.0, -1.0]))
    test.check_var("replay-memory/memory/terminals", np.array([True, True, False, True, True, False]))
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=4)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net)
    # NOTE(review): from here on the action-layer variables are addressed as
    # ".../dueling-action-adapter/action-layer/..." whereas all checks above use
    # ".../action-adapter-0/action-network/action-layer/...". Presumably only one of
    # the two naming schemes matches the built graph - verify.
    test.check_var(
        "dueling-policy/dueling-action-adapter/action-layer/dense/kernel",
        matrix2_qnet, decimals=4)
    test.check_var(
        "target-policy/dueling-action-adapter/action-layer/dense/kernel",
        matrix2_target_net)

    # 8th step -> Another buffer update check and weights update and sync.
    # action: down (2)
    test.step(1)
    test.check_env("state", 1)
    test.check_agent("states_buffer", [1], key_or_index="env_0")
    test.check_agent("actions_buffer", [2], key_or_index="env_0")
    test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
    test.check_agent("terminals_buffer", [False], key_or_index="env_0")
    expected_batch = dict(
        states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
        actions=np.array([0, 1]),
        rewards=np.array([-1.0, -3.0]),
        terminals=np.array([True, True]),
        next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]])
        # TODO: <- This is wrong and must be fixed
        # (next-state of first item is from a previous insert and unrelated to first item)
    )
    test.check_agent("last_memory_batch", expected_batch)
    # The 8th step's record is still in the python buffer -> memory is unchanged.
    test.check_var("replay-memory/index", 1)
    test.check_var("replay-memory/size", 6)
    test.check_var(
        "replay-memory/memory/states",
        np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                 [[1.0, 0.0, 0.0, 0.0]]))
    test.check_var("replay-memory/memory/actions", np.array([2, 0, 0, 1, 2, 0]))
    test.check_var("replay-memory/memory/rewards", np.array([-1.0, -1.0, -1.0, -3.0, 1.0, -1.0]))
    test.check_var("replay-memory/memory/terminals", np.array([True, True, False, True, True, False]))

    # Assume that the sync happens first (matrices are already the same when updating).
    # Hence the current policy matrices are passed in for both the q-net and the target-net.
    mat_updated = self._helper_update_matrix(expected_batch, matrix1_qnet, matrix2_qnet, matrix1_qnet,
                                             matrix2_qnet, agent, loss_func)

    # Now target-net should be again 1 step behind policy-net.
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", mat_updated[0], decimals=2)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=2)  # again: old matrix
    test.check_var(
        "dueling-policy/dueling-action-adapter/action-layer/dense/kernel",
        mat_updated[1], decimals=2)
    test.check_var(
        "target-policy/dueling-action-adapter/action-layer/dense/kernel",
        matrix2_qnet, decimals=2)