def test_latest_batch(self):
    """
    Tests if we can fetch the latest steps.
    """
    for backend in (None, "python"):
        ring_buffer = RingBuffer(capacity=self.capacity, backend=backend)
        test = ComponentTest(component=ring_buffer, input_spaces=self.input_spaces)

        # Insert 5 random elements.
        observation = non_terminal_records(self.record_space, 5)
        test.test(("insert_records", observation), expected_outputs=None)

        # First, test if the basic computation works.
        batch = test.test(("get_records", 5), expected_outputs=None)
        recursive_assert_almost_equal(batch, observation)

        # Next, insert `capacity` more elements:
        observation = non_terminal_records(self.record_space, self.capacity)
        test.test(("insert_records", observation), expected_outputs=None)

        # If we now fetch `capacity` elements, we expect to see exactly the last `capacity` ones.
        batch = test.test(("get_records", self.capacity), expected_outputs=None)
        recursive_assert_almost_equal(batch, observation)

        # If we fetch n elements, we expect to see exactly the last n.
        for last_n in range(1, 6):
            batch = test.test(("get_records", last_n), expected_outputs=None)
            recursive_assert_almost_equal(batch["actions"]["action1"], observation["actions"]["action1"][-last_n:])
            recursive_assert_almost_equal(batch["states"]["state2"], observation["states"]["state2"][-last_n:])
            recursive_assert_almost_equal(batch["terminals"], observation["terminals"][-last_n:])
def assert_equal(outs, expected_outputs, decimals=7):
    """
    Convenience wrapper: See the implementation of `recursive_assert_almost_equal` for details.
    """
    recursive_assert_almost_equal(outs, expected_outputs, decimals=decimals)
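# For reference, a minimal sketch (not RLgraph's actual implementation) of what
# `recursive_assert_almost_equal` does: recurse through nested dicts/tuples/lists
# and compare the leaves up to `decimals` digits via numpy's testing utilities.
# The helper name below is hypothetical and only illustrates the semantics.
import numpy as np

def _recursive_assert_almost_equal_sketch(actual, expected, decimals=7):
    if isinstance(expected, dict):
        assert set(actual.keys()) == set(expected.keys())
        for key, value in expected.items():
            _recursive_assert_almost_equal_sketch(actual[key], value, decimals)
    elif isinstance(expected, (list, tuple)):
        assert len(actual) == len(expected)
        for a, e in zip(actual, expected):
            _recursive_assert_almost_equal_sketch(a, e, decimals)
    else:
        np.testing.assert_almost_equal(actual, expected, decimal=decimals)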
def test_value_function_weights(self):
    """
    Tests changing of value-function weights.
    """
    env = OpenAIGymEnv("Pong-v0")
    agent_config = config_from_path("configs/ppo_agent_for_pong.json")
    agent = PPOAgent.from_spec(agent_config, state_space=env.state_space, action_space=env.action_space)
    weights = agent.get_weights()
    assert "value_function_weights" in weights
    assert "policy_weights" in weights

    policy_weights = weights["policy_weights"]
    value_function_weights = weights["value_function_weights"]

    # Just change the value-function weights.
    for key, weight in value_function_weights.items():
        value_function_weights[key] = weight + 0.01
    agent.set_weights(policy_weights, value_function_weights)
    new_actual_weights = agent.get_weights()

    recursive_assert_almost_equal(
        new_actual_weights["value_function_weights"],
        value_function_weights
    )
def test_demos_with_container_actions(self):
    """
    Tests if DQFD can fit a set of states to a set of container actions.
    """
    vocab_size = 100
    embed_dim = 128
    # ID/state space.
    state_space = IntBox(vocab_size, shape=(10,))

    # Container action space.
    actions_space = {}
    num_outputs = 3
    for i in range(3):
        actions_space["action_{}".format(i)] = IntBox(low=0, high=num_outputs)
    actions_space = Dict(actions_space)

    agent_config = config_from_path("configs/dqfd_container.json")
    agent_config["network_spec"] = [
        dict(type="embedding", embed_dim=embed_dim, vocab_size=vocab_size),
        dict(type="reshape", flatten=True),
        dict(type="dense", units=embed_dim, activation="relu", scope="dense_1")
    ]
    agent = DQFDAgent.from_spec(agent_config, state_space=state_space, action_space=actions_space)
    terminals = BoolBox(add_batch_rank=True)
    rewards = FloatBox(add_batch_rank=True)

    # Create a set of demos.
    demo_states = agent.preprocessed_state_space.with_batch_rank().sample(20)
    demo_actions = actions_space.with_batch_rank().sample(20)
    demo_rewards = rewards.sample(20, fill_value=1.0)
    demo_next_states = agent.preprocessed_state_space.with_batch_rank().sample(20)
    demo_terminals = terminals.sample(20, fill_value=False)

    # Insert.
    agent.observe_demos(
        preprocessed_states=demo_states,
        actions=demo_actions,
        rewards=demo_rewards,
        next_states=demo_next_states,
        terminals=demo_terminals,
    )

    # Fit demos.
    agent.update_from_demos(num_updates=5000, batch_size=20)

    # Evaluate demos:
    agent_actions = agent.get_action(demo_states, apply_preprocessing=False, use_exploration=False)
    recursive_assert_almost_equal(agent_actions, demo_actions)
def test_actor_component_with_lstm_network(self):
    # State space and internal-state space.
    state_space = FloatBox(shape=(2,), add_batch_rank=True, add_time_rank=True, time_major=False)
    internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)), add_batch_rank=True)
    time_percentages_space = FloatBox()
    # Action space.
    action_space = IntBox(2, add_batch_rank=True, add_time_rank=True)

    preprocessor = PreprocessorStack.from_spec(
        [dict(type="convert_type", to_dtype="float"), dict(type="divide", divisor=10)]
    )
    policy = Policy(network_spec=config_from_path("configs/test_lstm_nn.json"), action_space=action_space)
    exploration = Exploration(epsilon_spec=dict(decay_spec=dict(
        type="linear_decay", from_=1.0, to_=0.1
    )))
    actor_component = ActorComponent(preprocessor, policy, exploration)
    test = ComponentTest(
        component=actor_component,
        input_spaces=dict(
            states=state_space,
            other_nn_inputs=Tuple(internal_states_space, add_batch_rank=True),
            time_percentage=time_percentages_space
        ),
        action_space=action_space
    )

    # Some state inputs (batch-size=2, seq-len=1000; batch-major).
    np.random.seed(10)
    states = state_space.sample(size=(1000, 2))
    initial_internal_states = internal_states_space.zeros(size=2)  # only batch
    time_percentages = time_percentages_space.sample(1000)

    # Run a single time-step n times to simulate acting and env interaction with an LSTM.
    preprocessed_states = np.ndarray(shape=(1000, 2, 2), dtype=np.float64)
    actions = np.ndarray(shape=(1000, 2, 1), dtype=np.int64)
    for i, time_percentage in enumerate(time_percentages):
        ret = test.test((
            "get_preprocessed_state_and_action",
            # Expand the time dim at the 1st slot as we are time-major == False.
            [np.expand_dims(states[i], 1), tuple([initial_internal_states]), time_percentage]
        ))
        preprocessed_states[i] = ret["preprocessed_state"][:, 0, :]  # take out the time-rank again
        actions[i] = ret["action"]
        # Check c/h-state shapes.
        self.assertEqual(ret["nn_outputs"][1][0].shape, (2, 3))  # batch-size=2, LSTM units=3
        self.assertEqual(ret["nn_outputs"][1][1].shape, (2, 3))

    # Check all preprocessed states (easy: just divided by 10).
    expected_preprocessed_state = states / 10
    recursive_assert_almost_equal(preprocessed_states, expected_preprocessed_state)

    # Check the exploration functionality over the actions.
    # Not checking the mean as we are mostly in the non-exploratory region; that's why the stddev should be small.
    stddev_actions = actions.std()
    self.assertGreater(stddev_actions, 0.4)
    self.assertLess(stddev_actions, 0.6)
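# A standalone sketch of the linear epsilon decay configured above (hypothetical
# helper, not RLgraph's actual decay component): epsilon interpolates linearly
# from `from_` to `to_` as `time_percentage` goes from 0.0 to 1.0.
def _linear_decay_sketch(time_percentage, from_=1.0, to_=0.1):
    return from_ + time_percentage * (to_ - from_)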
def check_env(self, prop, expected_value, decimals=7):
    """
    Checks a property of our environment for (almost) equality.

    Args:
        prop (str): The name of the Environment's property to check.
        expected_value (any): The expected value of the given property.
        decimals (Optional[int]): The number of digits after the floating point up to which
            to compare actual and expected values.
    """
    is_value = getattr(self.env, prop, None)
    recursive_assert_almost_equal(is_value, expected_value, decimals=decimals)
def test_sequential_vector_env(self):
    num_envs = 4
    env = SequentialVectorEnv(num_environments=num_envs, env_spec={"type": "gridworld", "world": "2x2"})

    # Simple test runs with fixed actions.
    # X=player's position.
    s = env.reset(index=0)  # ["XH", " G"]
    self.assertTrue(s == 0)
    s = env.reset_all()
    for s_ in s:
        self.assertTrue(s_ == 0)

    s, r, t, _ = env.step([2 for _ in range(num_envs)])  # down: [" H", "XG"]
    for s_, r_, t_ in zip(s, r, t):
        self.assertTrue(s_ == 1)
        recursive_assert_almost_equal(r_, -0.1)
        self.assertTrue(not t_)

    s, r, t, _ = env.step([1 for _ in range(num_envs)])  # right: [" H", " X"]
    for s_, r_, t_ in zip(s, r, t):
        self.assertTrue(s_ == 3)
        recursive_assert_almost_equal(r_, 1.0)
        self.assertTrue(t_)

    for i in range(num_envs):
        env.reset(index=i)  # ["XH", " G"]
    s, r, t, _ = env.step([1 for _ in range(num_envs)])  # right: [" X", " G"] -> in the hole
    for s_, r_, t_ in zip(s, r, t):
        self.assertTrue(s_ == 2)
        self.assertTrue(r_ == -5.0)
        self.assertTrue(t_)

    # Run against a wall.
    env.reset_all()  # ["XH", " G"]
    s, r, t, _ = env.step([3 for _ in range(num_envs)])  # left: ["XH", " G"]
    for s_, r_, t_ in zip(s, r, t):
        self.assertTrue(s_ == 0)
        recursive_assert_almost_equal(r_, -0.1)
        self.assertTrue(not t_)

    s, r, t, _ = env.step([2 for _ in range(num_envs)])  # down: [" H", "XG"]
    for s_, r_, t_ in zip(s, r, t):
        self.assertTrue(s_ == 1)
        recursive_assert_almost_equal(r_, -0.1)
        self.assertTrue(not t_)

    s, r, t, _ = env.step([0 for _ in range(num_envs)])  # up: ["XH", " G"]
    for s_, r_, t_ in zip(s, r, t):
        self.assertTrue(s_ == 0)
        recursive_assert_almost_equal(r_, -0.1)
        self.assertTrue(not t_)
def test_double_dqn_on_2x2_grid_world(self):
    """
    Creates a double-DQN agent and runs it via a Runner on a simple 2x2 GridWorld.
    """
    env_spec = dict(world="2x2")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/dqn_agent_for_2x2_gridworld.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = DQNAgent.from_spec(
        agent_config,
        dueling_q=False,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=dummy_env.action_space,
        execution_spec=dict(seed=10),
        update_spec=dict(update_interval=4, batch_size=24, sync_interval=32),
        optimizer_spec=dict(type="adam", learning_rate=0.05),
        store_last_q_table=True
    )

    time_steps = 1000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    print("STATES:\n{}".format(agent.last_q_table["states"]))
    print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
    self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    self.assertLessEqual(results["episodes_executed"], 350)

    # Check the q-table for correct values.
    expected_q_values_per_state = {
        (1.0, 0, 0, 0): (-1, -5, 0, -1),
        (0, 1.0, 0, 0): (-1, 1, 0, 0)
    }
    for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
        state, q_values = tuple(state), tuple(q_values)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
def test_2x2_grid_world_using_flow_methods(self):
    """
    Tests a minimalistic 2x2 GridWorld using the flow methods (auto-reset on terminal).
    """
    env = GridWorld(world="2x2")

    # Simple test runs with fixed actions.
    # X=player's position.
    s, r, t = env.step_flow(2)  # down: [" H", "XG"]
    self.assertTrue(s == 1)
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t = env.step_flow(1)  # right: [" H", " X"] -> goal; auto-reset back to the start
    self.assertTrue(s == 0)
    self.assertTrue(r == 1.0)
    self.assertTrue(t)
    s, r, t = env.step_flow(1)  # right: [" X", " G"] -> in the hole; auto-reset back to the start
    self.assertTrue(s == 0)
    self.assertTrue(r == -5.0)
    self.assertTrue(t)

    # Run against a wall.
    s, r, t = env.step_flow(3)  # left: ["XH", " G"]
    self.assertTrue(s == 0)
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t = env.step_flow(2)  # down: [" H", "XG"]
    self.assertTrue(s == 1)
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t = env.step_flow(0)  # up: ["XH", " G"]
    self.assertTrue(s == 0)
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
    """
    Tests if the multi-GPU strategy can learn successfully on a multi-GPU system,
    but also runs on a CPU-only system using fake-GPU logic for testing purposes.
    """
    env_spec = dict(type="grid-world", world="2x2")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/multi_gpu_dqn_for_2x2_gridworld.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = DQNAgent.from_spec(
        agent_config,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=dummy_env.action_space,
    )

    time_steps = 1000
    worker = SingleThreadedWorker(
        env_spec=env_spec,
        agent=agent,
        worker_executes_preprocessing=True,
        preprocessing_spec=preprocessing_spec
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    # Merge the q-tables of all four GPUs:
    agent.last_q_table["q_values"] = agent.last_q_table["q_values"].reshape((48, 4))
    print("STATES:\n{}".format(agent.last_q_table["states"]))
    print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
    self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    self.assertLessEqual(results["episodes_executed"], time_steps / 2)

    # Check the q-table for correct values.
    expected_q_values_per_state = {
        (1.0, 0, 0, 0): (-1, -5, 0, -1),
        (0, 1.0, 0, 0): (-1, 1, 0, 0)
    }
    for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
        state, q_values = tuple(state), tuple(q_values)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
def check_agent(self, prop, expected_value, decimals=7, key_or_index=None):
    """
    Checks a property of our Agent for (almost) equality.

    Args:
        prop (str): The name of the Agent's property to check.
        expected_value (any): The expected value of the given property.
        decimals (Optional[int]): The number of digits after the floating point up to which
            to compare actual and expected values.
        key_or_index (Optional[int, str]): Optional key or index into the property in case of
            a nested data structure.
    """
    is_value = getattr(self.agent, prop, None)
    if key_or_index is not None:
        is_value = is_value[key_or_index]
    recursive_assert_almost_equal(is_value, expected_value, decimals=decimals)
def check_var(self, variable, expected_value, decimals=7):
    """
    Checks the value of one of the Agent's variables for (almost) equality against an expected one.

    Args:
        variable (str): The global scope (within the Agent's root component) of the variable to check.
        expected_value (any): The expected value of the given variable.
        decimals (Optional[int]): The number of digits after the floating point up to which
            to compare actual and expected values.
    """
    variables_dict = self.agent.root_component.variables
    assert variable in variables_dict, "ERROR: Variable '{}' not found in Agent '{}'!".\
        format(variable, self.agent.name)
    var = variables_dict[variable]
    value = self.graph_executor.read_variable_values(var)
    recursive_assert_almost_equal(value, expected_value, decimals=decimals)
def test_learning_2x2_grid_world(self):
    """
    Tests if Ape-X can learn a simple environment using a single worker, thus replicating DQN.
    """
    env_spec = dict(type="grid-world", world="2x2", save_mode=False)
    agent_config = config_from_path("configs/apex_agent_for_2x2_gridworld.json")
    # TODO: remove after unified backends.
    if get_backend() == "pytorch":
        agent_config["memory_spec"]["type"] = "mem_prioritized_replay"

    # Define the executor and test the assembly.
    executor = ApexExecutor(
        environment_spec=env_spec,
        agent_config=agent_config,
    )
    print("Successfully created executor.")

    # Execute the actual workload.
    result = executor.execute_workload(
        workload=dict(num_timesteps=5000, report_interval=100, report_interval_min_seconds=1)
    )
    full_worker_stats = executor.result_by_worker()
    print("All finished episode rewards")
    print(full_worker_stats["episode_rewards"])

    print("STATES:\n{}".format(executor.local_agent.last_q_table["states"]))
    print("\n\nQ(s,a)-VALUES:\n{}".format(
        np.round_(executor.local_agent.last_q_table["q_values"], decimals=2)))

    # Check the q-table for correct values.
    expected_q_values_per_state = {
        (1.0, 0, 0, 0): (-1, -5, 0, -1),
        (0, 1.0, 0, 0): (-1, 1, 0, 0)
    }
    for state, q_values in zip(
            executor.local_agent.last_q_table["states"],
            executor.local_agent.last_q_table["q_values"]):
        state, q_values = tuple(state), tuple(q_values)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
    """
    Tests if the multi-GPU strategy can learn successfully on a multi-GPU system.
    THIS TEST REQUIRES A MULTI-GPU SYSTEM.
    """
    # root_logger.setLevel(DEBUG)
    env = GridWorld("2x2")
    agent = DQNAgent.from_spec(
        config_from_path("configs/multi_gpu_dqn_for_2x2_gridworld.json"),
        dueling_q=False,
        state_space=env.state_space,
        action_space=env.action_space,
        observe_spec=dict(buffer_size=100),
        # Rule of thumb for multi-GPU (with n GPUs): n-fold batch-size and learning rate w.r.t. 1 GPU.
        update_spec=dict(update_interval=4, batch_size=48, sync_interval=32),
        optimizer_spec=dict(type="adam", learning_rate=0.15),
        store_last_q_table=True
    )

    time_steps = 400
    worker = SingleThreadedWorker(env_spec=lambda: env, agent=agent, worker_executes_preprocessing=False)
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    print("STATES:\n{}".format(agent.last_q_table["states"]))
    print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
    self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    self.assertLessEqual(results["episodes_executed"], 250)

    # Check the q-table for correct values.
    expected_q_values_per_state = {
        (1.0, 0, 0, 0): (-1, -5, 0, -1),
        (0, 1.0, 0, 0): (-1, 1, 0, 0)
    }
    for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
        state, q_values = tuple(state), tuple(q_values)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
def test_impala_on_2x2_grid_world(self):
    """
    Creates a single IMPALAAgent and runs it via the IMPALAWorker on a simple 2x2 GridWorld.
    """
    env = GridWorld("2x2")
    agent = IMPALAAgent.from_spec(
        config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
        state_space=env.state_space,
        action_space=env.action_space,
        execution_spec=dict(seed=12),
        update_spec=dict(update_interval=4, batch_size=16),
        optimizer_spec=dict(type="adam", learning_rate=0.05),
    )

    learn_updates = 1000
    # Set up the queue runner.
    agent.call_api_method("setup_queue_runner")
    for _ in range(learn_updates):
        agent.update()

    # print("STATES:\n{}".format(agent.last_q_table["states"]))
    # print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))
    # self.assertEqual(results["timesteps_executed"], time_steps)
    # self.assertEqual(results["env_frames"], time_steps)
    # self.assertGreaterEqual(results["mean_episode_reward"], -3.5)
    # self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    # self.assertLessEqual(results["episodes_executed"], 350)

    # Check the q-table for correct values.
    expected_q_values_per_state = {
        (1.0, 0, 0, 0): (-1, -5, 0, -1),
        (0, 1.0, 0, 0): (-1, 1, 0, 0)
    }
    for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
        state, q_values = tuple(state), tuple(q_values)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
def test_weights_getting_setting(self):
    """
    Tests getting and setting of the Agent's weights.
    """
    env = GridWorld(world="2x2")
    agent = Agent.from_spec(
        config_from_path("configs/dqn_agent_for_functionality_test.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )

    weights = agent.get_weights()
    new_weights = {}
    for key, weight in weights["policy_weights"].items():
        new_weights[key] = weight + 0.01

    agent.set_weights(new_weights)
    new_actual_weights = agent.get_weights()

    recursive_assert_almost_equal(new_actual_weights["policy_weights"], new_weights)
def test_policy_sync(self):
    """
    Tests weight syncing of the policy (and only the policy, not the Q-functions).
    """
    env = OpenAIGymEnv("CartPole-v0")
    agent = SACAgent.from_spec(
        config_from_path("configs/sac_agent_for_cartpole.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )

    weights = agent.get_weights()
    print("weights =", weights.keys())
    new_weights = {}
    for key, value in weights["policy_weights"].items():
        new_weights[key] = value + 0.01

    agent.set_weights(policy_weights=new_weights, value_function_weights=None)
    updated_weights = agent.get_weights()["policy_weights"]

    recursive_assert_almost_equal(updated_weights, new_weights)
def test_simple_variational_auto_encoder(self):
    # Space must contain a batch dimension (otherwise, NNLayer will complain).
    input_spaces = dict(
        input_=FloatBox(shape=(3,), add_batch_rank=True),
        z_vector=FloatBox(shape=(1,), add_batch_rank=True)
    )
    variational_auto_encoder = VariationalAutoEncoder(
        z_units=1,
        encoder_network_spec=config_from_path("configs/test_vae_encoder_network.json"),
        decoder_network_spec=config_from_path("configs/test_vae_decoder_network.json")
    )

    # Do not seed; we calculate expectations manually.
    test = ComponentTest(component=variational_auto_encoder, input_spaces=input_spaces)

    # Batch of size=3.
    input_ = np.array([[0.1, 0.2, 0.3], [1.0, 2.0, 3.0], [10.0, 20.0, 30.0]])

    global_scope = "variational-auto-encoder/"
    # Calculate the output manually.
    var_dict = test.read_variable_values(variational_auto_encoder.variable_registry)

    encoder_network_out = dense_layer(
        input_,
        var_dict[global_scope + "encoder-network/encoder-layer/dense/kernel"],
        var_dict[global_scope + "encoder-network/encoder-layer/dense/bias"]
    )
    expected_mean = dense_layer(
        encoder_network_out,
        var_dict[global_scope + "mean-layer/dense/kernel"],
        var_dict[global_scope + "mean-layer/dense/bias"]
    )
    # The stddev layer outputs log-stddev, hence the exp() in the check below.
    expected_stddev = dense_layer(
        encoder_network_out,
        var_dict[global_scope + "stddev-layer/dense/kernel"],
        var_dict[global_scope + "stddev-layer/dense/bias"]
    )
    out = test.test(("encode", input_), expected_outputs=None)
    recursive_assert_almost_equal(out["mean"], expected_mean, decimals=5)
    recursive_assert_almost_equal(out["stddev"], np.exp(expected_stddev), decimals=5)
    self.assertTrue(out["z_sample"].shape == (3, 1))

    test.terminate()
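# For reference, `dense_layer` above is a plain numpy forward pass. A minimal
# equivalent sketch (hypothetical helper, assuming `numpy as np` is imported at
# module level; the mean/stddev layers checked here are linear, no activation):
def _dense_layer_sketch(x, kernel, bias):
    # y = x @ W + b
    return np.matmul(x, kernel) + bias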
def test_capacity_with_episodes(self):
    """
    Tests if inserts of non-terminals work. Note that this does not test
    the episode semantics themselves, which are tested below.
    """
    ring_buffer = RingBuffer(capacity=self.capacity)
    test = ComponentTest(component=ring_buffer, input_spaces=self.input_spaces)

    # Internal memory variables.
    ring_buffer_variables = test.get_variable_values(ring_buffer, self.ring_buffer_variables)
    size_value = ring_buffer_variables["size"]
    index_value = ring_buffer_variables["index"]
    num_episodes_value = ring_buffer_variables["num-episodes"]
    episode_index_values = ring_buffer_variables["episode-indices"]

    # Assert that all indices are 0 before the insert.
    self.assertEqual(size_value, 0)
    self.assertEqual(index_value, 0)
    self.assertEqual(num_episodes_value, 0)
    self.assertEqual(np.sum(episode_index_values), 0)

    # Insert one more element than capacity. Note: this is different from the replay
    # test because due to episode semantics, it matters whether these are terminal
    # or not. This tests if episode-index updating causes problems when none of the
    # inserted elements are terminal.
    observation = non_terminal_records(self.record_space, self.capacity + 1)
    test.test(("insert_records", observation), expected_outputs=None)

    ring_buffer_variables = test.get_variable_values(ring_buffer, self.ring_buffer_variables)
    size_value = ring_buffer_variables["size"]
    index_value = ring_buffer_variables["index"]
    num_episodes_value = ring_buffer_variables["num-episodes"]
    episode_index_values = ring_buffer_variables["episode-indices"]

    # Size should be equivalent to capacity when full.
    self.assertEqual(size_value, self.capacity)
    # Index should have wrapped around to 1 (capacity + 1, modulo capacity).
    self.assertEqual(index_value, 1)
    self.assertEqual(num_episodes_value, 0)
    self.assertEqual(np.sum(episode_index_values), 0)

    # If we fetch n elements, we expect to see exactly the last n.
    for last_n in range(1, 6):
        batch = test.test(("get_records", last_n), expected_outputs=None)
        recursive_assert_almost_equal(batch["actions"]["action1"], observation["actions"]["action1"][-last_n:])
        recursive_assert_almost_equal(batch["states"]["state2"], observation["states"]["state2"][-last_n:])
        recursive_assert_almost_equal(batch["terminals"], observation["terminals"][-last_n:])
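# Illustration (simplified and hypothetical, not RLgraph's implementation) of the
# index/size bookkeeping the assertions above rely on: after `capacity + 1`
# inserts, `size` saturates at `capacity` while `index` wraps via modulo to 1.
def _ring_buffer_bookkeeping_sketch(capacity=10):
    index, size = 0, 0
    for _ in range(capacity + 1):
        index = (index + 1) % capacity
        size = min(size + 1, capacity)
    assert size == capacity and index == 1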
def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
    """
    Tests if the multi-GPU strategy can learn successfully on a multi-GPU system,
    but also runs on a CPU-only system using fake-GPU logic for testing purposes.
    """
    env_spec = dict(type="grid-world", world="2x2")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/multi_gpu_dqn_for_2x2_gridworld.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")
    agent = DQNAgent.from_spec(
        agent_config,
        state_space=self.grid_world_2x2_flattened_state_space,
        action_space=dummy_env.action_space,
    )

    time_steps = 2000
    worker = SingleThreadedWorker(
        env_spec=env_spec,
        agent=agent,
        worker_executes_preprocessing=True,
        preprocessing_spec=preprocessing_spec
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
    self.assertGreaterEqual(results["max_episode_reward"], 0.0)
    self.assertLessEqual(results["episodes_executed"], time_steps / 2)

    # Check all learnt Q-values (one-hot encode the two non-terminal states).
    q_values = agent.graph_executor.execute(("get_q_values", one_hot(np.array([0, 1]), depth=4)))[:]
    recursive_assert_almost_equal(q_values[0], (0.8, -5, 0.9, 0.8), decimals=1)
    recursive_assert_almost_equal(q_values[1], (0.8, 1.0, 0.9, 0.9), decimals=1)
def test_random_env(self):
    """
    Tests the deterministic functionality of RandomEnv.
    """
    env = RandomEnv(state_space=FloatBox(shape=(2, 2)), action_space=IntBox(2), deterministic=True)

    # Simple test runs with fixed actions.
    s = env.reset()
    recursive_assert_almost_equal(s, np.array([[0.77132064, 0.02075195], [0.63364823, 0.74880388]]))
    s, r, t, _ = env.step(env.action_space.sample())
    recursive_assert_almost_equal(s, np.array([[0.1980629, 0.7605307], [0.1691108, 0.0883398]]))
    s, r, t, _ = env.step(env.action_space.sample())
    recursive_assert_almost_equal(r, np.array(0.7217553))
    s, r, t, _ = env.step(env.action_space.sample())
    self.assertEqual(t, False)
    s, r, t, _ = env.step(env.action_space.sample())
    recursive_assert_almost_equal(s, np.array([[0.4418332, 0.434014], [0.617767, 0.5131382]]))
    s, r, t, _ = env.step(env.action_space.sample())
def test_impala_on_2x2_grid_world(self):
    """
    Creates a single IMPALAAgent and runs it via a simple loop on a 2x2 GridWorld.
    """
    env = GridWorld("2x2")
    agent = IMPALAAgent.from_spec(
        config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
        state_space=env.state_space,
        action_space=env.action_space,
        execution_spec=dict(seed=12),
        update_spec=dict(batch_size=16),
        optimizer_spec=dict(type="adam", learning_rate=0.05)
    )

    learn_updates = 50
    for i in range(learn_updates):
        ret = agent.update()
        mean_return = self._calc_mean_return(ret)
        print("i={} Loss={:.4} Avg-reward={:.2}".format(i, float(ret[1]), mean_return))

    # Assume we have learned something.
    self.assertGreater(mean_return, -0.1)

    # Check the last action probs for the 2 valid next_states (start (after a reset) and one below start).
    action_probs = ret[3]["action_probs"].reshape((80, 4))
    next_states = ret[3]["states"][:, 1:].reshape((80,))
    for s_, probs in zip(next_states, action_probs):
        # Start state:
        # - Assume we picked "right" in state=1 (in order to step into the goal state).
        # - OR we picked "up" or "left" in state=0 (unlikely, but possible).
        if s_ == 0:
            recursive_assert_almost_equal(probs[0], 0.0, decimals=2)
            self.assertTrue(probs[1] > 0.99 or probs[2] > 0.99)
            recursive_assert_almost_equal(probs[3], 0.0, decimals=2)
        # One below start:
        # - Assume we picked "down" in the start state with very large probability.
        # - OR we picked "left" or "down" in state=1 (unlikely, but possible).
        elif s_ == 1:
            recursive_assert_almost_equal(probs[0], 0.0, decimals=2)
            self.assertTrue(probs[1] > 0.99 or probs[2] > 0.99)
            recursive_assert_almost_equal(probs[3], 0.0, decimals=2)

    agent.terminate()
def test_policy_for_bounded_continuous_action_space(self):
    """
    https://github.com/rlgraph/rlgraph/issues/43
    """
    nn_input_space = FloatBox(shape=(4,), add_batch_rank=True)
    action_space = FloatBox(low=-1.0, high=1.0, shape=(1,), add_batch_rank=True)
    # Double the shape for alpha/beta params.
    # action_space_parameters = Tuple(FloatBox(shape=(1,)), FloatBox(shape=(1,)), add_batch_rank=True)

    policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
    test = ComponentTest(
        component=policy,
        input_spaces=dict(
            nn_inputs=nn_input_space,
            actions=action_space,
        ),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variable_registry)

    # Some NN inputs.
    nn_input = nn_input_space.sample(size=3)

    # Raw NN-output.
    expected_nn_output = np.matmul(
        nn_input, ComponentTest.read_params("policy/test-network/hidden-layer", policy_params)
    )
    test.test(("get_nn_outputs", nn_input), expected_outputs=expected_nn_output)

    # Raw action-layer output.
    expected_raw_logits = np.matmul(
        expected_nn_output,
        ComponentTest.read_params("policy/action-adapter-0/action-network/action-layer", policy_params)
    )
    test.test(
        ("get_adapter_outputs", nn_input),
        expected_outputs=dict(adapter_outputs=expected_raw_logits, nn_outputs=expected_nn_output),
        decimals=5
    )

    # Parameters (alphas/betas).
    expected_alpha_parameters = np.log(np.exp(expected_raw_logits[:, 0:1]) + 1.0) + 1.0
    expected_beta_parameters = np.log(np.exp(expected_raw_logits[:, 1:]) + 1.0) + 1.0
    expected_parameters = tuple([expected_alpha_parameters, expected_beta_parameters])
    test.test(
        ("get_adapter_outputs_and_parameters", nn_input, ["adapter_outputs", "parameters"]),
        expected_outputs=dict(adapter_outputs=expected_raw_logits, parameters=expected_parameters),
        decimals=5
    )
    print("Params: {}".format(expected_parameters))

    action = test.test(("get_action", nn_input))["action"]
    self.assertTrue(action.dtype == np.float32)
    self.assertGreaterEqual(action.min(), -1.0)
    self.assertLessEqual(action.max(), 1.0)
    self.assertTrue(action.shape == (3, 1))

    out = test.test(("get_action_and_log_likelihood", nn_input))
    action = out["action"]
    llh = out["log_likelihood"]

    # Action log-likelihoods (scale the actions back from [-1, 1] to the Beta support [0, 1]).
    actions_scaled_back = (action + 1.0) / 2.0
    expected_action_log_llh_output = np.log(
        beta.pdf(actions_scaled_back, expected_alpha_parameters, expected_beta_parameters)
    )
    # expected_action_log_prob_output = np.array([[expected_action_log_prob_output[0][0]],
    #     [expected_action_log_prob_output[1][1]], [expected_action_log_prob_output[2][2]]])
    test.test(
        ("get_log_likelihood", [nn_input, action], "log_likelihood"),
        expected_outputs=dict(log_likelihood=expected_action_log_llh_output),
        decimals=5
    )
    recursive_assert_almost_equal(expected_action_log_llh_output, llh, decimals=5)

    # Stochastic sample.
    actions = test.test(("get_stochastic_action", nn_input))["action"]
    self.assertTrue(actions.dtype == np.float32)
    self.assertGreaterEqual(actions.min(), -1.0)
    self.assertLessEqual(actions.max(), 1.0)
    self.assertTrue(actions.shape == (3, 1))

    # Deterministic sample.
    actions = test.test(("get_deterministic_action", nn_input))["action"]
    self.assertTrue(actions.dtype == np.float32)
    self.assertGreaterEqual(actions.min(), -1.0)
    self.assertLessEqual(actions.max(), 1.0)
    self.assertTrue(actions.shape == (3, 1))

    # Distribution's entropy.
    entropy = test.test(("get_entropy", nn_input))["entropy"]
    self.assertTrue(entropy.dtype == np.float32)
    self.assertTrue(entropy.shape == (3, 1))
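# Hypothetical standalone sketch (not part of the test) of the bounded-action
# log-likelihood math mirrored above: raw adapter outputs are mapped to Beta
# parameters via softplus(x) + 1.0, and actions in [-1, 1] are rescaled to the
# Beta support [0, 1]. Uses the same `scipy.stats.beta` as the test above; the
# input values here are made up for illustration.
def _beta_log_likelihood_sketch():
    raw_logits = np.array([[0.2, -0.3]])                          # made-up adapter outputs
    alpha_params = np.log(np.exp(raw_logits[:, 0:1]) + 1.0) + 1.0  # softplus + 1
    beta_params = np.log(np.exp(raw_logits[:, 1:]) + 1.0) + 1.0    # softplus + 1
    action = np.array([[0.4]])                                    # an action sampled from [-1, 1]
    return np.log(beta.pdf((action + 1.0) / 2.0, alpha_params, beta_params))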
def test_policy_for_discrete_action_space_with_dueling_layer(self):
    # np.random.seed(10)
    # State space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
    nn_input_space = FloatBox(shape=(3,), add_batch_rank=True)
    # Action space (2 possible actions).
    action_space = IntBox(2, add_batch_rank=True)
    # flat_float_action_space = FloatBox(shape=(2,), add_batch_rank=True)

    # Policy with dueling logic.
    policy = DuelingPolicy(
        network_spec=config_from_path("configs/test_lrelu_nn.json"),
        action_adapter_spec=dict(pre_network_spec=[
            dict(type="dense", units=10, activation="lrelu", activation_params=[0.1])
        ]),
        units_state_value_stream=10,
        action_space=action_space
    )
    test = ComponentTest(
        component=policy,
        input_spaces=dict(
            nn_inputs=nn_input_space,
            actions=action_space,
        ),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variable_registry)

    # Some NN inputs.
    nn_input = nn_input_space.sample(size=3)

    # Raw NN-output.
    expected_nn_output = relu(
        np.matmul(
            nn_input,
            ComponentTest.read_params("dueling-policy/test-network/hidden-layer", policy_params)
        ),
        0.1
    )
    test.test(("get_nn_outputs", nn_input), expected_outputs=expected_nn_output)

    # Single state-values.
    expected_state_values = np.matmul(
        relu(
            np.matmul(
                expected_nn_output,
                ComponentTest.read_params("dueling-policy/dense-layer-state-value-stream", policy_params)
            )
        ),
        ComponentTest.read_params("dueling-policy/state-value-node", policy_params)
    )
    test.test(
        ("get_state_values", nn_input, ["state_values", "nn_outputs"]),
        expected_outputs=dict(state_values=expected_state_values, nn_outputs=expected_nn_output),
        decimals=5
    )

    # Raw action-layer output (advantages).
    expected_raw_advantages = np.matmul(
        relu(
            np.matmul(
                expected_nn_output,
                ComponentTest.read_params(
                    "dueling-policy/action-adapter-0/action-network/dense-layer", policy_params
                )
            ),
            0.1
        ),
        ComponentTest.read_params(
            "dueling-policy/action-adapter-0/action-network/action-layer", policy_params
        )
    )

    # Q-values: One for each item in the batch.
    expected_q_values_output = expected_state_values + expected_raw_advantages - \
        np.mean(expected_raw_advantages, axis=-1, keepdims=True)
    test.test(
        ("get_adapter_outputs", nn_input, ["adapter_outputs", "advantages"]),
        expected_outputs=dict(adapter_outputs=expected_q_values_output, advantages=expected_raw_advantages),
        decimals=5
    )

    # Parameters (probabilities): softmaxed q-values.
    expected_parameters_output = np.maximum(softmax(expected_q_values_output, axis=-1), SMALL_NUMBER)
    test.test(
        ("get_adapter_outputs_and_parameters", nn_input, ["adapter_outputs", "parameters"]),
        expected_outputs=dict(
            adapter_outputs=expected_q_values_output,
            parameters=expected_parameters_output
        ),
        decimals=5
    )
    print("Probs: {}".format(expected_parameters_output))

    expected_actions = np.argmax(expected_q_values_output, axis=-1)
    test.test(("get_action", nn_input, ["action"]), expected_outputs=dict(action=expected_actions))

    out = test.test(("get_action_and_log_likelihood", nn_input))
    action = out["action"]
    llh = out["log_likelihood"]

    # Action log-probs.
    expected_action_log_llh_output = np.log(np.array([
        expected_parameters_output[0][action[0]],
        expected_parameters_output[1][action[1]],
        expected_parameters_output[2][action[2]],
    ]))
    test.test(
        ("get_log_likelihood", [nn_input, action]),
        expected_outputs=dict(
            log_likelihood=expected_action_log_llh_output,
            adapter_outputs=expected_q_values_output
        ),
        decimals=5
    )
    recursive_assert_almost_equal(expected_action_log_llh_output, llh, decimals=5)

    # Stochastic sample.
    out = test.test(("get_stochastic_action", nn_input), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32 or (out["action"].dtype == np.int64))
    self.assertTrue(out["action"].shape == (3,))

    # Deterministic sample.
    out = test.test(("get_deterministic_action", nn_input), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32 or (out["action"].dtype == np.int64))
    self.assertTrue(out["action"].shape == (3,))

    # Distribution's entropy.
    out = test.test(("get_entropy", nn_input), expected_outputs=None)
    self.assertTrue(out["entropy"].dtype == np.float32)
    self.assertTrue(out["entropy"].shape == (3,))
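# The dueling aggregation verified above, in isolation. This is the standard
# formulation the test's expected values use:
#     Q(s, a) = V(s) + A(s, a) - mean_a'[A(s, a')]
# (hypothetical helper; the input values below are made up for illustration):
def _dueling_q_sketch():
    advantages = np.array([[1.0, 3.0], [0.5, -0.5]])  # A(s, a), batch of 2
    state_values = np.array([[2.0], [1.0]])           # V(s), one per batch item
    q_values = state_values + advantages - advantages.mean(axis=-1, keepdims=True)
    return q_values  # [[1.0, 3.0], [1.5, 0.5]]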
def test_policy_for_discrete_action_space(self):
    # State space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
    state_space = FloatBox(shape=(4,), add_batch_rank=True)
    # Action space (5 possible actions).
    action_space = IntBox(5, add_batch_rank=True)

    policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
    test = ComponentTest(
        component=policy,
        input_spaces=dict(
            nn_inputs=state_space,
            actions=action_space,
        ),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variable_registry)

    # Some NN inputs (4 input nodes, batch size=2).
    states = np.array([[-0.08, 0.4, -0.05, -0.55], [13.0, -14.0, 10.0, -16.0]])

    # Raw NN-output.
    expected_nn_output = np.matmul(
        states, ComponentTest.read_params("policy/test-network/hidden-layer", policy_params)
    )
    test.test(("get_nn_outputs", states), expected_outputs=expected_nn_output, decimals=5)

    # Raw action-layer output; expected shape=(2,5): 2=batch, 5=action categories.
    expected_action_layer_output = np.matmul(
        expected_nn_output,
        ComponentTest.read_params("policy/action-adapter-0/action-network/action-layer", policy_params)
    )
    test.test(
        ("get_adapter_outputs", states),
        expected_outputs=dict(adapter_outputs=expected_action_layer_output, nn_outputs=expected_nn_output),
        decimals=5
    )

    # Logits, parameters (probs) and log-probs (clamped, as log-probs are numerically
    # unstable for very small probs).
    expected_parameters_output = np.maximum(softmax(expected_action_layer_output, axis=-1), SMALL_NUMBER)
    test.test(
        ("get_adapter_outputs_and_parameters", states, ["adapter_outputs", "parameters", "log_probs"]),
        expected_outputs=dict(
            adapter_outputs=expected_action_layer_output,
            parameters=np.array(expected_parameters_output, dtype=np.float32),
            log_probs=np.log(expected_parameters_output)
        ),
        decimals=5
    )

    expected_actions = np.argmax(expected_action_layer_output, axis=-1)
    test.test(("get_action", states, ["action"]), expected_outputs=dict(action=expected_actions))

    # Get action AND log-llh.
    out = test.test(("get_action_and_log_likelihood", states))
    action = out["action"]
    llh = out["log_likelihood"]

    # Action log-probs.
    expected_action_log_llh_output = np.log(np.array([
        expected_parameters_output[0][action[0]],
        expected_parameters_output[1][action[1]]
    ]))
    test.test(
        ("get_log_likelihood", [states, action], "log_likelihood"),
        expected_outputs=dict(log_likelihood=expected_action_log_llh_output),
        decimals=5
    )
    recursive_assert_almost_equal(expected_action_log_llh_output, llh, decimals=5)

    # Stochastic sample.
    out = test.test(("get_stochastic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32 or (out["action"].dtype == np.int64))
    self.assertTrue(out["action"].shape == (2,))

    # Deterministic sample.
    out = test.test(("get_deterministic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32 or (out["action"].dtype == np.int64))
    self.assertTrue(out["action"].shape == (2,))

    # Distribution's entropy.
    out = test.test(("get_entropy", states), expected_outputs=None)
    self.assertTrue(out["entropy"].dtype == np.float32)
    self.assertTrue(out["entropy"].shape == (2,))
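# The discrete log-likelihood pattern used in the expected values above, in
# isolation (hypothetical helper; `parameters` are the softmaxed probs, and
# `numpy as np` is assumed to be imported at module level):
def _discrete_log_llh_sketch(parameters, actions):
    # parameters: shape (batch, num_actions); actions: int array of shape (batch,).
    return np.log(parameters[np.arange(len(actions)), actions])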
def test_shared_value_function_policy_for_discrete_action_space_with_time_rank_folding(self):
    # State space (NN is a simple single fc-layer network, random biases, random weights).
    state_space = FloatBox(shape=(3,), add_batch_rank=True, add_time_rank=True)
    # Action space (4 possible actions).
    action_space = IntBox(4, add_batch_rank=True, add_time_rank=True)
    flat_float_action_space = FloatBox(shape=(4,), add_batch_rank=True, add_time_rank=True)

    # Policy with baseline action adapter AND batch-apply over the entire policy (NN + ActionAdapter + distr.).
    network_spec = config_from_path("configs/test_lrelu_nn.json")
    # Add folding and unfolding to the network.
    network_spec["fold_time_rank"] = True
    network_spec["unfold_time_rank"] = True
    shared_value_function_policy = SharedValueFunctionPolicy(
        network_spec=network_spec,
        action_adapter_spec=dict(fold_time_rank=True, unfold_time_rank=True),
        action_space=action_space,
        value_fold_time_rank=True,
        value_unfold_time_rank=True
    )
    test = ComponentTest(
        component=shared_value_function_policy,
        input_spaces=dict(
            nn_inputs=state_space,
            actions=action_space,
        ),
        action_space=action_space,
    )
    policy_params = test.read_variable_values(shared_value_function_policy.variable_registry)

    # Some NN inputs (batch size=2, seq-len=3).
    states = state_space.sample(size=(2, 3))
    states_folded = np.reshape(states, newshape=(6, 3))

    # Raw NN-output (leaky ReLU over the hidden-layer matmul), unfolded back to the states' shape.
    expected_nn_output = np.reshape(
        relu(
            np.matmul(
                states_folded,
                ComponentTest.read_params(
                    "shared-value-function-policy/test-network/hidden-layer", policy_params
                )
            ),
            0.1
        ),
        newshape=states.shape
    )
    test.test(("get_nn_outputs", states), expected_outputs=expected_nn_output, decimals=5)

    # Raw action-layer output; expected shape=(2,3,4): 2=batch, 3=time-steps, 4=action categories.
    expected_action_layer_output = np.matmul(
        expected_nn_output,
        ComponentTest.read_params(
            "shared-value-function-policy/action-adapter-0/action-network/action-layer/", policy_params
        )
    )
    expected_action_layer_output = np.reshape(expected_action_layer_output, newshape=(2, 3, 4))
    test.test(
        ("get_adapter_outputs", states),
        expected_outputs=dict(
            adapter_outputs=expected_action_layer_output, nn_outputs=expected_nn_output
        ),
        decimals=5
    )

    # State-values: One for each item in the batch.
    expected_state_value_output = np.matmul(
        expected_nn_output,
        ComponentTest.read_params(
            "shared-value-function-policy/value-function-node/dense-layer", policy_params
        )
    )
    expected_state_value_output_unfolded = np.reshape(expected_state_value_output, newshape=(2, 3, 1))
    test.test(
        ("get_state_values", states, ["state_values"]),
        expected_outputs=dict(state_values=expected_state_value_output_unfolded),
        decimals=5
    )

    expected_action_layer_output_unfolded = np.reshape(expected_action_layer_output, newshape=(2, 3, 4))
    test.test(
        ("get_state_values_adapter_outputs_and_parameters", states, ["state_values", "adapter_outputs"]),
        expected_outputs=dict(
            state_values=expected_state_value_output_unfolded,
            adapter_outputs=expected_action_layer_output_unfolded
        ),
        decimals=5
    )

    # Parameters (probabilities): softmaxed logits.
    expected_parameters_output = np.maximum(
        softmax(expected_action_layer_output_unfolded, axis=-1), SMALL_NUMBER
    )
    test.test(
        ("get_adapter_outputs_and_parameters", states, ["adapter_outputs", "parameters", "nn_outputs"]),
        expected_outputs=dict(
            nn_outputs=expected_nn_output,
            adapter_outputs=expected_action_layer_output_unfolded,
            parameters=expected_parameters_output
        ),
        decimals=5
    )
    print("Probs: {}".format(expected_parameters_output))

    expected_actions = np.argmax(expected_action_layer_output_unfolded, axis=-1)
    test.test(("get_action", states, ["action"]), expected_outputs=dict(action=expected_actions))

    out = test.test(("get_action_and_log_likelihood", states))
    action = out["action"]
    llh = out["log_likelihood"]

    # Action log-llh.
    expected_action_log_llh_output = np.log(np.array([
        [
            expected_parameters_output[0][0][action[0][0]],
            expected_parameters_output[0][1][action[0][1]],
            expected_parameters_output[0][2][action[0][2]],
        ],
        [
            expected_parameters_output[1][0][action[1][0]],
            expected_parameters_output[1][1][action[1][1]],
            expected_parameters_output[1][2][action[1][2]],
        ]
    ]))
    test.test(
        ("get_log_likelihood", [states, action]),
        expected_outputs=dict(
            log_likelihood=expected_action_log_llh_output,
            adapter_outputs=expected_action_layer_output_unfolded
        ),
        decimals=5
    )
    recursive_assert_almost_equal(expected_action_log_llh_output, llh, decimals=5)

    # Deterministic sample.
    out = test.test(("get_deterministic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32 or (out["action"].dtype == np.int64))
    self.assertTrue(out["action"].shape == (2, 3))  # Make sure the output is unfolded.

    # Stochastic sample.
    out = test.test(("get_stochastic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32 or (out["action"].dtype == np.int64))
    self.assertTrue(out["action"].shape == (2, 3))  # Make sure the output is unfolded.

    # Distribution's entropy.
    out = test.test(("get_entropy", states), expected_outputs=None)
    self.assertTrue(out["entropy"].dtype == np.float32)
    self.assertTrue(out["entropy"].shape == (2, 3))  # Make sure the output is unfolded.
def test_shared_value_function_policy_for_discrete_action_space(self):
    # State space (NN is a simple single fc-layer network, random biases, random weights).
    state_space = FloatBox(shape=(4,), add_batch_rank=True)
    # Action space (3 possible actions).
    action_space = IntBox(3, add_batch_rank=True)

    # Policy with a baseline action adapter.
    shared_value_function_policy = SharedValueFunctionPolicy(
        network_spec=config_from_path("configs/test_lrelu_nn.json"),
        action_space=action_space
    )
    test = ComponentTest(
        component=shared_value_function_policy,
        input_spaces=dict(
            nn_inputs=state_space,
            actions=action_space,
        ),
        action_space=action_space,
    )
    policy_params = test.read_variable_values(shared_value_function_policy.variable_registry)

    # Some NN inputs (4 input nodes, batch size=3).
    states = state_space.sample(size=3)

    # Raw NN-output (leaky ReLU over the hidden-layer matmul).
    expected_nn_output = relu(
        np.matmul(
            states,
            ComponentTest.read_params(
                "shared-value-function-policy/test-network/hidden-layer", policy_params
            )
        ),
        0.1
    )
    test.test(("get_nn_outputs", states), expected_outputs=expected_nn_output, decimals=5)

    # Raw action-layer output; expected shape=(3,3): 3=batch, 3=action categories.
    expected_action_layer_output = np.matmul(
        expected_nn_output,
        ComponentTest.read_params(
            "shared-value-function-policy/action-adapter-0/action-network/action-layer/", policy_params
        )
    )
    test.test(
        ("get_adapter_outputs", states),
        expected_outputs=dict(
            adapter_outputs=expected_action_layer_output, nn_outputs=expected_nn_output
        ),
        decimals=5
    )

    # State-values: One for each item in the batch.
    expected_state_value_output = np.matmul(
        expected_nn_output,
        ComponentTest.read_params(
            "shared-value-function-policy/value-function-node/dense-layer", policy_params
        )
    )
    test.test(
        ("get_state_values", states, ["state_values"]),
        expected_outputs=dict(state_values=expected_state_value_output),
        decimals=5
    )

    # Logits values.
    test.test(
        ("get_state_values_adapter_outputs_and_parameters", states, ["state_values", "adapter_outputs"]),
        expected_outputs=dict(
            state_values=expected_state_value_output,
            adapter_outputs=expected_action_layer_output
        ),
        decimals=5
    )

    # Parameters (probabilities): softmaxed logits.
    expected_parameters_output = np.maximum(softmax(expected_action_layer_output, axis=-1), SMALL_NUMBER)
    test.test(
        ("get_adapter_outputs_and_parameters", states, ["adapter_outputs", "parameters"]),
        expected_outputs=dict(
            adapter_outputs=expected_action_layer_output,
            parameters=expected_parameters_output
        ),
        decimals=5
    )
    print("Probs: {}".format(expected_parameters_output))

    expected_actions = np.argmax(expected_action_layer_output, axis=-1)
    test.test(("get_action", states, ["action"]), expected_outputs=dict(action=expected_actions))

    # Get action AND log-llh.
    out = test.test(("get_action_and_log_likelihood", states))
    action = out["action"]
    llh = out["log_likelihood"]

    # Action log-llh.
    expected_action_log_llh_output = np.log(np.array([
        expected_parameters_output[0][action[0]],
        expected_parameters_output[1][action[1]],
        expected_parameters_output[2][action[2]],
    ]))
    test.test(
        ("get_log_likelihood", [states, action], "log_likelihood"),
        expected_outputs=dict(log_likelihood=expected_action_log_llh_output),
        decimals=5
    )
    recursive_assert_almost_equal(expected_action_log_llh_output, llh)

    # Stochastic sample.
    out = test.test(("get_stochastic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32 or (out["action"].dtype == np.int64))
    self.assertTrue(out["action"].shape == (3,))

    # Deterministic sample.
    out = test.test(("get_deterministic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32 or (out["action"].dtype == np.int64))
    self.assertTrue(out["action"].shape == (3,))

    # Distribution's entropy.
    out = test.test(("get_entropy", states), expected_outputs=None)
    self.assertTrue(out["entropy"].dtype == np.float32)
    self.assertTrue(out["entropy"].shape == (3,))
def test_episode_fetching(self):
    """
    Tests if we can accurately fetch the most recent episodes.
    """
    for backend in (None, "python"):
        ring_buffer = RingBuffer(capacity=self.capacity, backend=backend)
        test = ComponentTest(component=ring_buffer, input_spaces=self.input_spaces)

        # Insert 2 non-terminals, 1 terminal.
        observation = non_terminal_records(self.record_space, 2)
        test.test(("insert_records", observation), expected_outputs=None)
        observation = terminal_records(self.record_space, 1)
        test.test(("insert_records", observation), expected_outputs=None)

        ring_buffer_variables = test.get_variable_values(self.ring_buffer_variables)
        num_episodes_value = ring_buffer_variables["num-episodes"]
        episode_index_values = ring_buffer_variables["episode-indices"]

        # One episode.
        self.assertEqual(num_episodes_value, 1)
        expected_indices = [0] * self.capacity
        expected_indices[0] = 2
        recursive_assert_almost_equal(episode_index_values, expected_indices)

        # We should now be able to retrieve one episode of length 3.
        episode = test.test(("get_episodes", 1), expected_outputs=None)
        expected_terminals = [0, 0, 1]
        recursive_assert_almost_equal(episode["terminals"], expected_terminals)

        # Asking for two episodes should still return just the one existing episode.
        episode = test.test(("get_episodes", 2), expected_outputs=None)
        expected_terminals = [0, 0, 1]
        recursive_assert_almost_equal(episode["terminals"], expected_terminals)

        # Insert 7 non-terminals.
        observation = non_terminal_records(self.record_space, 7)
        test.test(("insert_records", observation), expected_outputs=None)

        ring_buffer_variables = test.get_variable_values(self.ring_buffer_variables)
        index_value = ring_buffer_variables["index"]
        episode_index_values = ring_buffer_variables["episode-indices"]

        # Episode indices should not have changed.
        expected_indices[0] = 2
        recursive_assert_almost_equal(episode_index_values, expected_indices)
        # Inserted 2 non-terminals, 1 terminal, 7 non-terminals at capacity 10 -> should be at 0 again.
        self.assertEqual(index_value, 0)

        # Now insert one terminal so the terminal buffer has layout [1 0 1 0 0 0 0 0 0 0].
        observation = terminal_records(self.record_space, 1)
        test.test(("insert_records", observation), expected_outputs=None)

        # Episode indices:
        ring_buffer_variables = test.get_variable_values(self.ring_buffer_variables)
        num_episodes_value = ring_buffer_variables["num-episodes"]
        recursive_assert_almost_equal(num_episodes_value, 2)

        # Check if we can fetch 2 episodes:
        episodes = test.test(("get_episodes", 2), expected_outputs=None)
        # We now expect to have retrieved:
        # - 10 time-steps
        # - 2 terminal values of 1
        # - Terminal values spaced 1 index apart due to the insertion order
        self.assertEqual(len(episodes["terminals"]), self.capacity)
        self.assertEqual(episodes["terminals"][0], True)
        self.assertEqual(episodes["terminals"][2], True)
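# Illustration (simplified and hypothetical, not RLgraph's implementation) of the
# episode-index bookkeeping the first assertions above check: a terminal record
# stores its buffer slot in the next free position of `episode_indices`, and
# `num_episodes` counts completed episodes.
def _episode_index_sketch(capacity=10):
    episode_indices = [0] * capacity
    num_episodes, index = 0, 0
    for terminal in [False, False, True]:  # 2 non-terminals, then 1 terminal
        if terminal:
            episode_indices[num_episodes] = index  # slot of the terminal record
            num_episodes += 1
        index = (index + 1) % capacity
    assert num_episodes == 1 and episode_indices[0] == 2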
def test_4x4_grid_world_with_container_actions(self):
    """
    Tests a 4x4 GridWorld using forward+turn+jump container actions.
    """
    env = GridWorld(world="4x4", action_type="ftj", state_representation="xy+orientation")

    # Simple test runs with fixed actions.

    # Fall into the hole.
    s = env.reset()  # Start state: x=0, y=0, orientation-vector=[0, 1].
    recursive_assert_almost_equal(s, [0, 0, 0, 1])
    s, r, t, _ = env.step(dict(turn=2, forward=2))  # turn=2 (right), move=2 (forward), jump=0
    recursive_assert_almost_equal(s, [1, 0, 1, 0])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t, _ = env.step(dict(turn=2, forward=1))  # turn=2 (right), move=1 (stay), jump=0
    recursive_assert_almost_equal(s, [1, 0, 0, -1])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t, _ = env.step(dict(turn=1, forward=2))  # turn=1 (no turn), move=2 (forward), jump=0
    recursive_assert_almost_equal(s, [1, 1, 0, -1])
    self.assertTrue(r == -5.0)
    self.assertTrue(t)

    # Jump quite a lot and reach the goal.
    env.reset()  # Back to the start state: [0, 0, 0, 1].
    s, r, t, _ = env.step(dict(turn=2, forward=1))
    recursive_assert_almost_equal(s, [0, 0, 1, 0])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t, _ = env.step(dict(turn=1, forward=1, jump=1))
    recursive_assert_almost_equal(s, [2, 0, 1, 0])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t, _ = env.step(dict(turn=2, forward=2))
    recursive_assert_almost_equal(s, [2, 1, 0, -1])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t, _ = env.step(dict(turn=1, forward=2, jump=1))
    recursive_assert_almost_equal(s, [2, 3, 0, -1])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t, _ = env.step(dict(turn=2, forward=0))
    recursive_assert_almost_equal(s, [3, 3, -1, 0])
    self.assertTrue(r == 1.0)
    self.assertTrue(t)

    # Run against a wall.
    env.reset()  # Back to the start state: [0, 0, 0, 1].
    s, r, t, _ = env.step(dict(turn=1, forward=0))
    recursive_assert_almost_equal(s, [0, 1, 0, 1])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t, _ = env.step(dict(turn=0, forward=2))
    recursive_assert_almost_equal(s, [0, 1, -1, 0])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)

    # Jump over a hole (no reset).
    s, r, t, _ = env.step(dict(turn=2, forward=1))  # turn around
    s, r, t, _ = env.step(dict(turn=2, forward=1))
    recursive_assert_almost_equal(s, [0, 1, 1, 0])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t, _ = env.step(dict(turn=1, forward=1, jump=1))
    recursive_assert_almost_equal(s, [2, 1, 1, 0])
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
def test_long_chain_grid_world(self):
    """
    Tests a minimalistic long-chain GridWorld.
    """
    env = GridWorld(world="long-chain")

    # Simple test runs with fixed actions.
    # X=player's position.
    s = env.reset()  # ["X ... G"]
    self.assertTrue(s == 33)
    s, r, t, _ = env.step(2)  # down: ["X ... G"]
    self.assertTrue(s == 33)
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)
    s, r, t, _ = env.step(1)  # right: ["SX ... G"]
    self.assertTrue(s == 34)
    recursive_assert_almost_equal(r, -0.1)
    self.assertTrue(not t)

    env.reset()  # ["X ... G"]
    # Right, left, down, up, right -> Move one to the right each iteration.
    for x in range(20):
        s, r, t, _ = env.step(1)
        self.assertTrue(s == x + 33 + 1)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(3)
        self.assertTrue(s == x + 33)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(2)
        self.assertTrue(s == x + 33)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(0)
        self.assertTrue(s == x + 33)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(1)
        self.assertTrue(s == x + 33 + 1)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)