def test_multi_gpu_ppo_agent_learning_test_gridworld_2x2(self):
    """
    Tests if the multi-GPU strategy can learn successfully on a multi-GPU system,
    but also runs on a CPU-only system using fake-GPU logic for testing purposes.
    """
    env_spec = dict(type="grid-world", world="2x2")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/multi_gpu_ppo_for_2x2_gridworld.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = PPOAgent.from_spec(
        agent_config,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=dummy_env.action_space,
    )
    time_steps = 10000
    worker = SingleThreadedWorker(
        env_spec=env_spec, agent=agent, worker_executes_preprocessing=True,
        preprocessing_spec=preprocessing_spec
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    # Assume we have learned something.
    # TODO: This test needs more tuning. -1.0 is not great for the 2x2 grid world.
    self.assertGreater(results["mean_episode_reward"], -1.0)
def run_experiment(self, environment, experiment_num=0):
    environment = RLgraphEnvironmentWrapper(environment)
    environment.add_episode_end_callback(self.episode_finished, environment, runner_id=1)

    config = copy(self.config)

    max_episodes = config.pop('max_episodes', None)
    max_timesteps = config.pop('max_timesteps', None)
    max_episode_timesteps = config.pop('max_episode_timesteps')

    agent = Agent.from_spec(
        spec=config,
        state_space=environment.state_space,
        action_space=environment.action_space,
    )
    if experiment_num == 0 and self.load_model_file:
        logging.info("Loading model data from file: {}".format(self.load_model_file))
        agent.load_model(self.load_model_file)

    runner = SingleThreadedWorker(agent=agent, environment=environment)

    environment.reset()
    agent.reset_buffers()

    if max_timesteps:
        runner.execute_timesteps(
            num_timesteps=max_timesteps,
            max_timesteps_per_episode=max_episode_timesteps
        )
    else:
        runner.execute_episodes(
            num_episodes=max_episodes,
            max_timesteps_per_episode=max_episode_timesteps
        )

    return dict(
        initial_reset_time=0,
        episode_rewards=runner.episode_rewards,
        episode_timesteps=runner.episode_steps,
        episode_end_times=runner.episode_durations
    )
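
# Usage sketch (not part of the test suite): run_experiment() above picks between a
# timestep-bounded and an episode-bounded run purely from which limits the config carries.
# The helper name and config values below are hypothetical, for illustration only; only
# 'max_episode_timesteps' is required by the method as written.
def _select_run_mode(config):
    """Mirror the branching logic of run_experiment() for a plain dict config."""
    config = dict(config)
    max_episodes = config.pop('max_episodes', None)
    max_timesteps = config.pop('max_timesteps', None)
    max_episode_timesteps = config.pop('max_episode_timesteps')
    if max_timesteps:
        return ('timesteps', max_timesteps, max_episode_timesteps)
    return ('episodes', max_episodes, max_episode_timesteps)

assert _select_run_mode(dict(max_timesteps=10000, max_episode_timesteps=200)) == \
    ('timesteps', 10000, 200)
assert _select_run_mode(dict(max_episodes=50, max_episode_timesteps=200)) == \
    ('episodes', 50, 200)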
def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
    """
    Tests if the multi-GPU strategy can learn successfully on a multi-GPU system,
    but also runs on a CPU-only system using fake-GPU logic for testing purposes.
    """
    env_spec = dict(type="grid-world", world="2x2")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/multi_gpu_dqn_for_2x2_gridworld.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = DQNAgent.from_spec(
        agent_config,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=dummy_env.action_space,
    )
    time_steps = 1000
    worker = SingleThreadedWorker(
        env_spec=env_spec, agent=agent, worker_executes_preprocessing=True,
        preprocessing_spec=preprocessing_spec
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    # Merge q-tables of all four GPUs:
    agent.last_q_table["q_values"] = agent.last_q_table["q_values"].reshape((48, 4))

    print("STATES:\n{}".format(agent.last_q_table["states"]))
    print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
    self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    self.assertLessEqual(results["episodes_executed"], time_steps / 2)

    # Check q-table for correct values.
    expected_q_values_per_state = {
        (1.0, 0, 0, 0): (-1, -5, 0, -1),
        (0, 1.0, 0, 0): (-1, 1, 0, 0)
    }
    for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
        state, q_values = tuple(state), tuple(q_values)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
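
# Illustration sketch (assumption: the fake-GPU DQN agent stores its last q-values as one
# block per fake GPU, e.g. shape (4, 12, 4) for 4 GPUs, 12 recorded states each, 4 actions).
# Reshaping to (-1, num_actions) then yields a single flat q-table with one row per state,
# which is what the (48, 4) reshape in the test above relies on.
import numpy as np

num_gpus, states_per_gpu, num_actions = 4, 12, 4
per_gpu_q = np.arange(num_gpus * states_per_gpu * num_actions, dtype=np.float32).reshape(
    (num_gpus, states_per_gpu, num_actions))
merged_q = per_gpu_q.reshape((-1, num_actions))
assert merged_q.shape == (48, 4)
# Row ordering after the reshape: GPU 0's states first, then GPU 1's, and so on.
assert np.array_equal(merged_q[:states_per_gpu], per_gpu_q[0])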
def test_timesteps(self):
    """
    Simply tests if the timestep execution loop works and returns a result.
    """
    agent = RandomAgent(
        action_space=self.environment.action_space,
        state_space=self.environment.state_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: self.environment, agent=agent, frameskip=1,
        worker_executes_preprocessing=False
    )
    result = worker.execute_timesteps(100)

    self.assertEqual(result['timesteps_executed'], 100)
    self.assertGreater(result['episodes_executed'], 0)
    self.assertLessEqual(result['episodes_executed'], 100)
    self.assertGreaterEqual(result['env_frames'], 100)
    self.assertGreaterEqual(result['runtime'], 0.0)
def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
    """
    Tests if the multi-GPU strategy can learn successfully on a multi-GPU system.
    THIS TEST REQUIRES A MULTI-GPU SYSTEM.
    """
    # root_logger.setLevel(DEBUG)  # test

    env = GridWorld("2x2")
    agent = DQNAgent.from_spec(
        config_from_path("configs/multi_gpu_dqn_for_2x2_gridworld.json"),
        dueling_q=False,
        state_space=env.state_space,
        action_space=env.action_space,
        observe_spec=dict(buffer_size=100),
        # Rule of thumb for multi-GPU (with n GPUs): n-fold batch-size and learning rate
        # with respect to 1 GPU.
        update_spec=dict(update_interval=4, batch_size=48, sync_interval=32),
        optimizer_spec=dict(type="adam", learning_rate=0.15),
        store_last_q_table=True
    )

    time_steps = 400
    worker = SingleThreadedWorker(
        env_spec=lambda: env, agent=agent, worker_executes_preprocessing=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    print("STATES:\n{}".format(agent.last_q_table["states"]))
    print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
    self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    self.assertLessEqual(results["episodes_executed"], 250)

    # Check q-table for correct values.
    expected_q_values_per_state = {
        (1.0, 0, 0, 0): (-1, -5, 0, -1),
        (0, 1.0, 0, 0): (-1, 1, 0, 0)
    }
    for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
        state, q_values = tuple(state), tuple(q_values)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
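
# Minimal sketch of the multi-GPU rule of thumb quoted in the comment above: with n GPUs,
# scale batch size and learning rate n-fold relative to a 1-GPU baseline. The baseline
# numbers below are hypothetical; the test above simply passes batch_size=48 and
# learning_rate=0.15 directly.
def scale_for_multi_gpu(single_gpu_batch_size, single_gpu_learning_rate, num_gpus):
    """Return (batch_size, learning_rate) scaled linearly with the number of GPUs."""
    return single_gpu_batch_size * num_gpus, single_gpu_learning_rate * num_gpus

# Example: a 1-GPU baseline of batch_size=12, lr=0.0375 scaled to 4 GPUs.
batch_size, learning_rate = scale_for_multi_gpu(12, 0.0375, num_gpus=4)
assert (batch_size, learning_rate) == (48, 0.15)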
def test_ppo_on_container_state_and_action_spaces_and_very_large_rewards(self):
    """
    Tests stability of PPO on an extreme env producing strange container states and
    large rewards and requiring container actions.
    """
    env = RandomEnv(
        state_space=Dict({
            "F_position": FloatBox(shape=(2,), low=0.01, high=0.02)
        }),
        action_space=Dict({
            "F_direction_low-1.0_high1.0": FloatBox(shape=(), low=-1.0, high=1.0),
            "F_forward_direction_low-1.0_high1.0": FloatBox(shape=(), low=-1.0, high=1.0),
            "B_jump": BoolBox()
        }),
        reward_space=FloatBox(low=-100000.0, high=-1000.0),  # hugely negative rewards
        terminal_prob=0.0000001
    )
    agent_config = config_from_path("configs/ppo_agent_for_random_env_with_container_spaces.json")
    agent = PPOAgent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env, agent=agent, preprocessing_spec=None,
        worker_executes_preprocessing=True,
        # episode_finish_callback=lambda episode_return, duration, timesteps, env_num:
        # print("episode return {}; steps={}".format(episode_return, timesteps))
    )
    results = worker.execute_timesteps(num_timesteps=int(1e6), use_exploration=True)
    print(results)
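
# Shape sketch (assumption: this mirrors the Dict action space defined above, using plain
# numpy instead of RLgraph space objects). A single container action the agent must emit
# is a dict with two bounded float scalars and one boolean flag.
import numpy as np

example_action = {
    "F_direction_low-1.0_high1.0": np.float32(0.25),           # in [-1.0, 1.0]
    "F_forward_direction_low-1.0_high1.0": np.float32(-0.75),  # in [-1.0, 1.0]
    "B_jump": np.bool_(True)
}
assert -1.0 <= float(example_action["F_direction_low-1.0_high1.0"]) <= 1.0
assert isinstance(bool(example_action["B_jump"]), bool)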
def test_prioritized_replay_atari_throughput(self):
    """
    Tests throughput on standard Atari environments using the prioritized replay memory.
    """
    agent = DQNAgent(
        state_space=self.env.state_space,
        action_space=self.env.action_space,
        network_spec=self.network,
        memory_spec=dict(
            type='prioritized',
            capacity=100000,
            next_states=True
        )
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: self.env,
        agent=agent,
        frameskip=1
    )
    result = worker.execute_timesteps(num_timesteps=1000000, use_exploration=True)

    print('Agent throughput = {} ops/s'.format(result['ops_per_second']))
    print('Environment throughput = {} frames/s'.format(result['env_frames_per_second']))
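
# Back-of-the-envelope sketch (assumption: throughput figures like the ones printed above
# are essentially executed timesteps / environment frames divided by wall-clock runtime;
# the exact accounting in the worker may differ, e.g. when frameskip > 1).
def throughput(timesteps_executed, env_frames, runtime_seconds):
    """Return (agent ops/s, environment frames/s) for a finished run."""
    return timesteps_executed / runtime_seconds, env_frames / runtime_seconds

ops_per_second, env_frames_per_second = throughput(1000000, 1000000, runtime_seconds=500.0)
assert ops_per_second == env_frames_per_second == 2000.0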
def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
    """
    Tests if the multi-GPU strategy can learn successfully on a multi-GPU system,
    but also runs on a CPU-only system using fake-GPU logic for testing purposes.
    """
    env_spec = dict(type="grid-world", world="2x2")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/multi_gpu_dqn_for_2x2_gridworld.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = DQNAgent.from_spec(
        agent_config,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=dummy_env.action_space,
    )
    time_steps = 2000
    worker = SingleThreadedWorker(
        env_spec=env_spec, agent=agent, worker_executes_preprocessing=True,
        preprocessing_spec=preprocessing_spec
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
    self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    self.assertLessEqual(results["episodes_executed"], time_steps / 2)

    # Check all learnt Q-values.
    q_values = agent.graph_executor.execute(
        ("get_q_values", one_hot(np.array([0, 1]), depth=4))
    )[:]
    recursive_assert_almost_equal(q_values[0], (0.8, -5, 0.9, 0.8), decimals=1)
    recursive_assert_almost_equal(q_values[1], (0.8, 1.0, 0.9, 0.9), decimals=1)
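
# Worked example (pure numpy, independent of RLgraph's one_hot helper): the Q-value query
# above feeds a batch of one-hot vectors for grid positions 0 and 1 with depth 4, i.e. the
# same flattened encoding used for the 2x2 grid-world state space.
import numpy as np

states = np.array([0, 1])
one_hot_batch = np.eye(4, dtype=np.float32)[states]
assert np.array_equal(one_hot_batch, np.array([[1.0, 0.0, 0.0, 0.0],
                                               [0.0, 1.0, 0.0, 0.0]], dtype=np.float32))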