def test_cartpole_with_worker(self):
    env = OpenAIGymEnv("CartPole-v0")
    agent_config = config_from_path("configs/backend_performance_dqn_cartpole.json")
    # Test CPU settings for batching here.
    agent_config["update_spec"] = None
    agent = DQNAgent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: OpenAIGymEnv("CartPole-v0"),
        agent=agent,
        frameskip=1,
        num_environments=1,
        worker_executes_preprocessing=False
    )
    result = worker.execute_timesteps(1000)
    print(result)
def test_dqn_on_cart_pole(self):
    """
    Creates a DQNAgent and runs it via a Runner on the CartPole Env.
    """
    dummy_env = OpenAIGymEnv("CartPole-v0")
    agent = DQNAgent.from_spec(
        config_from_path("configs/dqn_agent_for_cartpole.json"),
        double_q=False,
        dueling_q=False,
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space,
        execution_spec=dict(seed=15),
        update_spec=dict(update_interval=4, batch_size=24, sync_interval=64),
        optimizer_spec=dict(type="adam", learning_rate=0.05),
        store_last_q_table=True
    )

    time_steps = 3000
    worker = SingleThreadedWorker(
        env_spec=lambda: OpenAIGymEnv("CartPole-v0", seed=15),
        agent=agent,
        render=self.is_windows,
        worker_executes_preprocessing=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], 25)
    self.assertGreaterEqual(results["max_episode_reward"], 100.0)
    self.assertLessEqual(results["episodes_executed"], 200)
def test_double_dueling_dqn_on_cart_pole(self):
    """
    Creates a double and dueling DQNAgent and runs it via a Runner on the CartPole Env.
    """
    dummy_env = OpenAIGymEnv("CartPole-v0")
    agent = DQNAgent.from_spec(
        config_from_path("configs/dqn_agent_for_cartpole.json"),
        double_q=True,
        dueling_q=True,
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space,
        observe_spec=dict(buffer_size=200),
        execution_spec=dict(seed=156),
        update_spec=dict(update_interval=4, batch_size=64, sync_interval=16),
        optimizer_spec=dict(type="adam", learning_rate=0.05),
        store_last_q_table=True
    )

    time_steps = 3000
    worker = SingleThreadedWorker(
        env_spec=lambda: OpenAIGymEnv("CartPole-v0", seed=10),
        agent=agent,
        render=self.is_windows,
        worker_executes_preprocessing=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    # print("STATES:\n{}".format(agent.last_q_table["states"]))
    # print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], 15)
    self.assertGreaterEqual(results["max_episode_reward"], 160.0)
    self.assertLessEqual(results["episodes_executed"], 100)
def test_sac_on_cartpole(self):
    """
    Creates an SAC-Agent and runs it on CartPole.
    """
    env = OpenAIGymEnv("CartPole-v0")
    agent = SACAgent.from_spec(
        config_from_path("configs/sac_agent_for_cartpole.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=False,  # self.is_windows
        episode_finish_callback=lambda episode_return, duration, timesteps, **kwargs:
        print("episode: return={} ts={}".format(episode_return, timesteps))
    )
    time_steps = 5000
    results = worker.execute_timesteps(time_steps)
    print(results)

    self.assertTrue(results["timesteps_executed"] == time_steps)
    self.assertLessEqual(results["episodes_executed"], time_steps / 20)
    self.assertGreater(results["mean_episode_reward"], 40.0)
    self.assertGreater(results["max_episode_reward"], 100.0)
    self.assertGreater(results["mean_episode_reward_last_10_episodes"], 100.0)
def test_sac_on_pendulum(self):
    """
    Creates an SAC-Agent and runs it on Pendulum.
    """
    env = OpenAIGymEnv("Pendulum-v0")
    agent = SACAgent.from_spec(
        config_from_path("configs/sac_agent_for_pendulum.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=self.is_windows
    )
    # Note: SAC is more computationally expensive.
    episodes = 50
    results = worker.execute_episodes(episodes)
    print(results)

    # Pendulum-v0 episodes are always 200 time steps long.
    self.assertTrue(results["timesteps_executed"] == episodes * 200)
    self.assertTrue(results["episodes_executed"] == episodes)
    self.assertGreater(results["mean_episode_reward"], -800)
def test_ppo_on_cart_pole(self):
    """
    Creates a PPO Agent and runs it via a Runner on the CartPole env.
    """
    env = OpenAIGymEnv("CartPole-v0", seed=36)
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_cartpole.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )

    time_steps = 3000
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=False,  # self.is_windows
        episode_finish_callback=lambda episode_return, duration, timesteps, env_num:
        print("episode return {}; steps={}".format(episode_return, timesteps))
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertLessEqual(results["episodes_executed"], time_steps / 10)
    # Assume we have learned something.
    self.assertGreaterEqual(results["mean_episode_reward"], 40.0)
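# The inline episode_finish_callback lambdas used in these tests read more
# clearly as a named function. A minimal sketch (not part of the original
# tests), assuming the callback signature the workers above pass in
# (episode_return, duration, timesteps, plus further keyword arguments):
def log_episode(episode_return, duration, timesteps, **kwargs):
    # Called by SingleThreadedWorker after each finished episode.
    print("episode return {}; steps={}".format(episode_return, timesteps))

# Usage: pass episode_finish_callback=log_episode to SingleThreadedWorker.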
def test_dqn_on_pong(self):
    """
    Creates a DQNAgent and runs it via a Runner on an OpenAI Pong Env.
    """
    env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True, visualize=False)
    agent_config = config_from_path("configs/dqn_agent_for_pong.json")
    preprocessing_spec = agent_config.pop("preprocessor_spec")
    agent = Agent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=self.pong_preprocessed_state_space,
        # Try with "reduced" action space (actually only 3 actions: up, down, no-op).
        action_space=env.action_space
    )
    time_steps = 4000000
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        render=True,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
def test_ppo_on_cart_pole(self):
    """
    Creates a PPO Agent and runs it via a Runner on the CartPole Env.
    """
    env = OpenAIGymEnv("CartPole-v0", seed=36)
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_cartpole.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )

    time_steps = 3000
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=self.is_windows
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)
    print(results)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    # self.assertGreaterEqual(results["mean_episode_reward"], 23)
    # self.assertGreaterEqual(results["max_episode_reward"], 100.0)
    self.assertLessEqual(results["episodes_executed"], time_steps / 10)
def test_readme_example(self):
    """
    Tests the agent example from the repository's README.
    """
    from rlgraph.agents import DQNAgent
    from rlgraph.environments import OpenAIGymEnv

    environment = OpenAIGymEnv('CartPole-v0')
    config = config_from_path("../../examples/configs/dqn_cartpole.json")

    # Create from .json file or dict, see agent API for all
    # possible configuration parameters.
    agent = DQNAgent.from_spec(
        config,
        state_space=environment.state_space,
        action_space=environment.action_space
    )

    # Get an action, take a step, observe reward.
    state = environment.reset()
    preprocessed_state, action = agent.get_action(
        states=state,
        extra_returns="preprocessed_states"
    )

    # Execute step in environment.
    next_state, reward, terminal, info = environment.step(action)

    # Observe result.
    agent.observe(
        preprocessed_states=preprocessed_state,
        actions=action,
        internals=[],
        next_states=next_state,
        rewards=reward,
        terminals=terminal
    )

    # Call update when desired:
    loss = agent.update()
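# A minimal sketch (assumed, not part of the original test) of how the
# act/observe/update calls above compose into a full training loop using the
# same API; num_episodes and the per-step update cadence are illustrative choices:
def train_cartpole(agent, environment, num_episodes=100):
    for _ in range(num_episodes):
        state = environment.reset()
        terminal = False
        while not terminal:
            # Act, step, observe, then learn from the collected experience.
            preprocessed_state, action = agent.get_action(
                states=state, extra_returns="preprocessed_states")
            next_state, reward, terminal, info = environment.step(action)
            agent.observe(
                preprocessed_states=preprocessed_state, actions=action,
                internals=[], next_states=next_state,
                rewards=reward, terminals=terminal)
            agent.update()
            state = next_state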
def test_post_processing(self):
    """
    Tests external batch post-processing for the PPO agent.
    """
    env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True)
    agent_config = config_from_path("configs/ppo_agent_for_pong.json")
    agent = PPOAgent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )
    num_samples = 200
    states = agent.preprocessed_state_space.sample(num_samples)
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    sequence_indices_space = BoolBox(add_batch_rank=True)

    # GAE is separately tested, just testing if this API method returns results.
    pg_advantages = agent.post_process(dict(
        states=states,
        rewards=reward_space.sample(num_samples),
        terminals=terminal_space.sample(num_samples, fill_value=0),
        sequence_indices=sequence_indices_space.sample(num_samples, fill_value=0)
    ))
def test_memory_compilation(self):
    # Builds a memory and returns build stats.
    env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True)
    record_space = Dict(
        states=env.state_space,
        actions=env.action_space,
        rewards=float,
        terminals=BoolBox(),
        add_batch_rank=True
    )
    input_spaces = dict(
        # insert: records
        records=record_space,
        # get_records: num_records
        num_records=int,
        # update_records: indices, update
        indices=IntBox(add_batch_rank=True),
        update=FloatBox(add_batch_rank=True)
    )
    input_spaces.pop("num_records")

    memory = MemPrioritizedReplay(capacity=20000)
    test = ComponentTest(component=memory, input_spaces=input_spaces, auto_build=False)
    return test.build()
def test_impala_on_breakout(self):
    """
    Creates an IMPALAAgent and runs it via repeated update calls on an OpenAI Breakout Env.
    """
    env = OpenAIGymEnv("Breakout-v0", frameskip=4, max_num_noops=30, episodic_life=True, visualize=False)
    config_ = config_from_path("configs/impala_agent_for_breakout.json")
    agent = IMPALAAgent.from_spec(
        config_,
        state_space=env.state_space,
        action_space=env.action_space,
    )

    learn_updates = 4000000
    mean_returns = []
    for i in range(learn_updates):
        ret = agent.update()
        mean_return = self._calc_mean_return(ret)
        mean_returns.append(mean_return)
        print("i={} Loss={:.4} Avg-reward={:.2}".format(i, float(ret[1]), mean_return))

    time.sleep(3)
    agent.terminate()
    time.sleep(3)
def test_act(self):
    env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True)
    agent_config = config_from_path("configs/ray_apex_for_pong.json")
    if get_backend() == "pytorch":
        agent_config["memory_spec"]["type"] = "mem_prioritized_replay"
    agent = DQNAgent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions: up, down, no-op).
        action_space=env.action_space
    )
    state = env.reset()
    action = agent.get_action(state)
    print("Component call count = {}".format(Component.call_count))

    state_space = env.state_space
    count = 200

    samples = state_space.sample(count)
    start = time.perf_counter()
    for s in samples:
        action = agent.get_action(s)
    end = time.perf_counter() - start
    print("Took {} s for {} separate actions, mean = {}".format(end, count, end / count))

    # Now run all samples through a single batched get_action call instead.
    samples = state_space.sample(count)
    start = time.perf_counter()
    action = agent.get_action(samples)
    end = time.perf_counter() - start
    print("Took {} s for {} batched actions.".format(end, count))

    profile = Component.call_times
    print_call_chain(profile, False, 0.03)
def test_sac_on_pendulum(self):
    """
    Creates an SAC-Agent and runs it on Pendulum.
    """
    env = OpenAIGymEnv("Pendulum-v0")
    agent = SACAgent.from_spec(
        config_from_path("configs/sac_agent_for_pendulum.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=False,  # self.is_windows
        episode_finish_callback=lambda episode_return, duration, timesteps, **kwargs:
        print("episode: return={} ts={}".format(episode_return, timesteps))
    )
    # Note: SAC is more computationally expensive.
    episodes = 50
    results = worker.execute_episodes(episodes)
    print(results)

    # Pendulum-v0 episodes are always 200 time steps long.
    self.assertTrue(results["timesteps_executed"] == episodes * 200)
    self.assertTrue(results["episodes_executed"] == episodes)
    self.assertGreater(results["mean_episode_reward_last_10_episodes"], -700)
    self.assertGreater(results["max_episode_reward"], -100)
def test_moving_standardize_python(self):
    env = OpenAIGymEnv("Pong-v0")
    space = env.state_space
    moving_standardize = MovingStandardize(backend="python")
    moving_standardize.create_variables(
        input_spaces=dict(preprocessing_inputs=space),
        action_space=None
    )
    samples = [space.sample() for _ in range(100)]
    out = None
    for sample in samples:
        out = moving_standardize._graph_fn_apply(sample)

    # Assert shape remains intact.
    expected_shape = (1,) + space.shape
    self.assertEqual(expected_shape, moving_standardize.mean_est.shape)

    # Assert mean estimate.
    expected_mean = np.mean(samples, axis=0)
    self.assertTrue(np.allclose(moving_standardize.mean_est, expected_mean))

    # The running variance estimate is the unbiased sample variance:
    # std_sum_est / (n - 1), hence ddof=1 in the numpy reference.
    expected_variance = np.var(samples, ddof=1, axis=0)
    variance_estimate = moving_standardize.std_sum_est / (moving_standardize.sample_count - 1.0)
    self.assertEqual(expected_shape, variance_estimate.shape)
    self.assertTrue(np.allclose(variance_estimate, expected_variance))

    # Final output.
    std = np.sqrt(variance_estimate) + SMALL_NUMBER
    expected_out = (samples[-1] - moving_standardize.mean_est) / std
    self.assertTrue(np.allclose(out, expected_out))
def test_value_function_weights(self):
    """
    Tests changing of value function weights.
    """
    env = OpenAIGymEnv("Pong-v0")
    agent_config = config_from_path("configs/ppo_agent_for_pong.json")
    agent = PPOAgent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )
    weights = agent.get_weights()
    assert "value_function_weights" in weights
    assert "policy_weights" in weights

    policy_weights = weights["policy_weights"]
    value_function_weights = weights["value_function_weights"]

    # Just change vf weights.
    for key, weight in value_function_weights.items():
        value_function_weights[key] = weight + 0.01
    agent.set_weights(policy_weights, value_function_weights)
    new_actual_weights = agent.get_weights()

    recursive_assert_almost_equal(
        new_actual_weights["value_function_weights"],
        value_function_weights
    )
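# A hypothetical helper (not in the original tests) built on the same
# get_weights()/set_weights() API shown above, e.g. for copying all weights
# from one agent instance to another:
def sync_weights(source_agent, target_agent):
    weights = source_agent.get_weights()
    target_agent.set_weights(
        weights["policy_weights"], weights["value_function_weights"])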
def test_subgraph_components(self):
    return  # TODO: fix when we have built selective subgraph fetching correctly.
    # Create agent.
    agent_config = config_from_path("configs/ray_apex_for_pong.json")
    agent_config["execution_spec"].pop("ray_spec")
    environment = OpenAIGymEnv("Pong-v0", frameskip=4)

    # Do not build yet.
    agent = ApexAgent.from_spec(
        agent_config,
        state_space=environment.state_space,
        action_space=environment.action_space,
        auto_build=False
    )

    # Prepare all steps up to the build device strategy so we can test subgraph fetching.
    agent.graph_executor.init_execution()
    agent.graph_executor.setup_graph()

    # Meta graph must be built for sub-graph tracing.
    agent.graph_builder.build_meta_graph(agent.input_spaces)
    sub_graph = agent.graph_builder.get_subgraph("update_from_external_batch")
    print("Sub graph components:")
    print(sub_graph.sub_components)
    print("Sub graph API: ")
    print(sub_graph.api_methods)
class TestSingleThreadedDQN(unittest.TestCase):
    # TODO: test on the relevant Atari environments.
    env = OpenAIGymEnv(gym_env='Pong-v0')
    # TODO: define classic Atari DQN network.
    network = list()

    def test_replay_memory_atari_throughput(self):
        """
        Tests throughput on standard Atari environments using the replay memory.
        """
        agent = DQNAgent(
            states_spec=self.env.state_space,
            action_spec=self.env.action_space,
            network_spec=self.network,
            memory_spec=dict(
                type='replay_memory',
                capacity=100000,
                next_states=True
            )
        )
        worker = SingleThreadedWorker(
            env_spec=lambda: self.env,
            agent=agent,
            frameskip=1
        )
        result = worker.execute_timesteps(num_timesteps=1000000, use_exploration=True)
        print('Agent throughput = {} ops/s'.format(result['ops_per_second']))
        print('Environment throughput = {} frames/s'.format(result['env_frames_per_second']))

    def test_prioritized_replay_atari_throughput(self):
        """
        Tests throughput on standard Atari environments using the prioritized replay memory.
        """
        agent = DQNAgent(
            states_spec=self.env.state_space,
            action_spec=self.env.action_space,
            network_spec=self.network,
            memory_spec=dict(
                type='prioritized',
                capacity=100000,
                next_states=True
            )
        )
        worker = SingleThreadedWorker(
            env_spec=lambda: self.env,
            agent=agent,
            frameskip=1
        )
        result = worker.execute_timesteps(num_timesteps=1000000, use_exploration=True)
        print('Agent throughput = {} ops/s'.format(result['ops_per_second']))
        print('Environment throughput = {} frames/s'.format(result['env_frames_per_second']))
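# The two throughput tests above differ only in memory_spec["type"]. A sketch
# of a possible shared helper (an assumed refactor, not in the original suite)
# that each test could call with its memory type:
def _run_throughput_test(self, memory_type):
    agent = DQNAgent(
        states_spec=self.env.state_space,
        action_spec=self.env.action_space,
        network_spec=self.network,
        memory_spec=dict(type=memory_type, capacity=100000, next_states=True)
    )
    worker = SingleThreadedWorker(env_spec=lambda: self.env, agent=agent, frameskip=1)
    result = worker.execute_timesteps(num_timesteps=1000000, use_exploration=True)
    print('Agent throughput = {} ops/s'.format(result['ops_per_second']))
    print('Environment throughput = {} frames/s'.format(result['env_frames_per_second']))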
def test_double_dueling_dqn_on_cart_pole(self):
    """
    Creates a double and dueling DQNAgent and runs it via a Runner on the CartPole Env.
    """
    gym_env = "CartPole-v0"
    dummy_env = OpenAIGymEnv(gym_env)
    config_ = config_from_path("configs/dqn_agent_for_cartpole.json")
    # Add dueling config to agent.
    config_["policy_spec"] = {
        "units_state_value_stream": 3,
        "action_adapter_spec": {
            "pre_network_spec": [{
                "type": "dense",
                "units": 3
            }]
        }
    }
    agent = DQNAgent.from_spec(
        config_,
        double_q=True,
        dueling_q=True,
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space,
        execution_spec=dict(seed=13),
        update_spec=dict(update_interval=4, batch_size=64, sync_interval=16),
        optimizer_spec=dict(type="adam", learning_rate=0.01),
        store_last_q_table=True
    )

    time_steps = 3000
    worker = SingleThreadedWorker(
        env_spec=lambda: OpenAIGymEnv(gym_env, seed=10),
        agent=agent,
        render=self.is_windows,
        worker_executes_preprocessing=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], 25)
    self.assertLessEqual(results["episodes_executed"], 150)
def test_ppo_compilation(self):
    """
    Tests PPO agent compilation.
    """
    env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True)
    agent_config = config_from_path("configs/ppo_agent_for_pong.json")
    agent = PPOAgent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )
    print("Compiled {}".format(agent))
def test_apex_compilation(self):
    """
    Tests agent compilation without Ray to ease debugging on Windows.
    """
    agent_config = config_from_path("configs/ray_apex_for_pong.json")
    agent_config["execution_spec"].pop("ray_spec")
    environment = OpenAIGymEnv("Pong-v0", frameskip=4)
    agent = ApexAgent.from_spec(
        agent_config,
        state_space=environment.state_space,
        action_space=environment.action_space
    )
    print("Compiled {}".format(agent))
def test_dqn_compilation(self):
    """
    Creates a DQNAgent from the PyTorch test config on an OpenAI Pong Env to test compilation.
    """
    env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True)
    agent_config = config_from_path("configs/dqn_pytorch_test.json")
    agent = DQNAgent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions: up, down, no-op).
        action_space=env.action_space
    )
def test_dqn_compilation(self):
    """
    Tests DQN Agent compilation.
    """
    env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True)
    agent_config = config_from_path("configs/dqn_agent_for_pong.json")
    agent = DQNAgent.from_spec(
        # Uses 2015 DQN parameters as closely as possible.
        agent_config,
        state_space=env.state_space,
        # Try with "reduced" action space (actually only 3 actions: up, down, no-op).
        action_space=env.action_space
    )
def test_actor_critic_compilation(self):
    """
    Tests actor-critic agent compilation.
    """
    env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True)
    agent_config = config_from_path("configs/actor_critic_agent_for_pong.json")
    agent = ActorCriticAgent.from_spec(
        agent_config,
        state_space=env.state_space,
        action_space=env.action_space
    )
def test_multi_gpu_apex_agent_compilation(self):
    """
    Tests if the multi gpu strategy can compile successfully on a multi gpu system, but
    also runs on a CPU-only system using fake-GPU logic for testing purposes.
    """
    root_logger.setLevel(DEBUG)
    agent_config = config_from_path("configs/multi_gpu_ray_apex_for_pong.json")
    agent_config["execution_spec"].pop("ray_spec")
    environment = OpenAIGymEnv("Pong-v0", frameskip=4)
    agent = ApexAgent.from_spec(
        agent_config,
        state_space=environment.state_space,
        action_space=environment.action_space
    )
    print("Compiled Apex agent")
def test_multi_gpu_apex_agent_compilation(self):
    """
    Tests if the multi gpu strategy can compile successfully on a multi gpu system.

    THIS TEST REQUIRES A MULTI GPU SYSTEM.
    """
    root_logger.setLevel(DEBUG)
    agent_config = config_from_path("configs/multi_gpu_ray_apex_for_pong.json")
    agent_config["execution_spec"].pop("ray_spec")
    environment = OpenAIGymEnv("Pong-v0", frameskip=4)
    agent = ApexAgent.from_spec(
        agent_config,
        state_space=environment.state_space,
        action_space=environment.action_space
    )
    print("Compiled Apex agent")
def test_apex_compilation(self):
    """
    Tests agent compilation without Ray to ease debugging on Windows.
    """
    agent_config = config_from_path("configs/ray_apex_for_pong.json")
    agent_config["execution_spec"].pop("ray_spec")

    # TODO: remove after unified.
    if get_backend() == "pytorch":
        agent_config["memory_spec"]["type"] = "mem_prioritized_replay"

    environment = OpenAIGymEnv("Pong-v0", frameskip=4)
    agent = ApexAgent.from_spec(
        agent_config,
        state_space=environment.state_space,
        action_space=environment.action_space
    )
    print("Compiled Apex agent")
class TestSingleThreadedWorker(unittest.TestCase):

    environment = OpenAIGymEnv(gym_env='CartPole-v0')

    def test_timesteps(self):
        """
        Simply tests if the timestep execution loop works and returns a result.
        """
        agent = RandomAgent(
            action_space=self.environment.action_space,
            state_space=self.environment.state_space
        )
        worker = SingleThreadedWorker(
            env_spec=lambda: self.environment,
            agent=agent,
            frameskip=1,
            worker_executes_preprocessing=False
        )
        result = worker.execute_timesteps(100)
        self.assertEqual(result['timesteps_executed'], 100)
        self.assertGreater(result['episodes_executed'], 0)
        self.assertLessEqual(result['episodes_executed'], 100)
        self.assertGreaterEqual(result['env_frames'], 100)
        self.assertGreaterEqual(result['runtime'], 0.0)

    def test_episodes(self):
        """
        Simply tests if the episode execution loop works and returns a result.
        """
        agent = RandomAgent(
            action_space=self.environment.action_space,
            state_space=self.environment.state_space
        )
        worker = SingleThreadedWorker(
            env_spec=lambda: self.environment,
            agent=agent,
            frameskip=1,
            worker_executes_preprocessing=False
        )
        result = worker.execute_episodes(5, max_timesteps_per_episode=10)
        # At most 5 episodes * 10 timesteps each.
        self.assertLessEqual(result['timesteps_executed'], 50)
        self.assertEqual(result['episodes_executed'], 5)
        self.assertLessEqual(result['env_frames'], 50)
        self.assertGreaterEqual(result['runtime'], 0.0)
def test_ppo_on_pendulum(self):
    """
    Creates a PPO Agent and runs it via a Runner on the Pendulum env.
    """
    env = OpenAIGymEnv("Pendulum-v0")
    agent = PPOAgent.from_spec(
        config_from_path("configs/ppo_agent_for_pendulum.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=self.is_windows
    )
    results = worker.execute_episodes(500, use_exploration=True)
    print(results)
def test_sac_on_cartpole(self):
    """
    Creates an SAC-Agent and runs it on CartPole.
    """
    env = OpenAIGymEnv("CartPole-v0")
    agent = SACAgent.from_spec(
        config_from_path("configs/sac_agent_for_cartpole.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: env,
        agent=agent,
        worker_executes_preprocessing=False,
        render=self.is_windows
    )
    time_steps = 10000
    results = worker.execute_timesteps(time_steps)
    print(results)