def test_large_impala_policy_without_agent(self):
    """
    Creates a large IMPALA architecture network inside a policy and runs a few
    input samples through it, checking action shapes and LSTM internal-state
    shapes after each step.
    """
    # Build the network with a single-step worker sample size (time-rank of 1).
    large_impala_architecture = LargeIMPALANetwork(worker_sample_size=1)
    # IMPALA uses a baseline action adapter (v-trace off-policy PG with baseline value function).
    # Unused policy APIs are switched off so the test graph stays small.
    policy = SharedValueFunctionPolicy(
        network_spec=large_impala_architecture,
        action_space=self.action_space,
        switched_off_apis={
            "get_action_from_logits_and_probabilities",
            "get_log_likelihood",
        },
    )
    test = ComponentTest(
        policy,
        input_spaces=dict(
            nn_input=self.input_space,
            internal_states=self.internal_states_space,
        ),
        action_space=self.action_space,
        execution_spec=dict(disable_monitoring=True),
    )

    # Send a 1x1 sample through the network (1=sequence-length (time-rank), 1=batch-size).
    nn_input = self.input_space.sample(size=(1, 1))
    initial_internal_states = self.internal_states_space.zeros(size=1)
    # No expected outputs: we only check shapes below, not values.
    out = test.test(
        ("get_action", [nn_input, initial_internal_states]),
        expected_outputs=None,
    )
    print("First action: {}".format(out["action"]))
    # NOTE: `assertEqual`, not the deprecated `assertEquals` alias (removed in Python 3.12).
    self.assertEqual(out["action"].shape, (1, 1))
    self.assertEqual(out["last_internal_states"][0].shape, (1, 256))
    self.assertEqual(out["last_internal_states"][1].shape, (1, 256))

    # Send another 1x1 sample through the network using the previous internal-state.
    next_nn_input = self.input_space.sample(size=(1, 1))
    out = test.test(
        ("get_action", [next_nn_input, out["last_internal_states"]]),
        expected_outputs=None,
    )
    print("Second action: {}".format(out["action"]))
    self.assertEqual(out["action"].shape, (1, 1))
    self.assertEqual(out["last_internal_states"][0].shape, (1, 256))
    self.assertEqual(out["last_internal_states"][1].shape, (1, 256))

    test.terminate()
def test_large_impala_actor_component_without_agent(self):
    """
    Creates a large IMPALA architecture network inside a policy inside an actor
    component and runs a few input samples through it, checking action shapes,
    LSTM internal-state shapes, and that the (pass-through) preprocessor leaves
    the state untouched.
    """
    batch_size = 4
    time_steps = 1

    # IMPALA uses a baseline action adapter (v-trace off-policy PG with baseline value function).
    policy = SharedValueFunctionPolicy(
        LargeIMPALANetwork(worker_sample_size=time_steps),
        action_space=self.action_space,
        deterministic=False,
    )
    # No preprocessing and no exploration: the actor component should forward
    # states unchanged and sample directly from the policy.
    actor_component = ActorComponent(
        preprocessor_spec=None, policy_spec=policy, exploration_spec=None
    )
    test = ComponentTest(
        actor_component,
        input_spaces=dict(
            states=self.input_space,
            internal_states=self.internal_states_space,
        ),
        action_space=self.action_space,
        execution_spec=dict(disable_monitoring=True),
    )

    # Send a sample through the network (sequence-length (time-rank) x batch-size).
    nn_dict_input = self.input_space.sample(size=(time_steps, batch_size))
    initial_internal_states = self.internal_states_space.zeros(size=batch_size)
    # No expected outputs: we only check shapes/pass-through below, not values.
    out = test.test(
        ("get_preprocessed_state_and_action", [nn_dict_input, initial_internal_states]),
        expected_outputs=None,
    )
    print("First action: {}".format(out["action"]))
    # NOTE: `assertEqual`, not the deprecated `assertEquals` alias (removed in Python 3.12).
    self.assertEqual(out["action"].shape, (time_steps, batch_size))
    self.assertEqual(out["last_internal_states"][0].shape, (batch_size, 256))
    self.assertEqual(out["last_internal_states"][1].shape, (batch_size, 256))
    # preprocessor_spec=None, so every input field must come back unchanged.
    recursive_assert_almost_equal(
        out["preprocessed_state"],
        dict(
            RGB_INTERLEAVED=nn_dict_input["RGB_INTERLEAVED"],
            INSTR=nn_dict_input["INSTR"],
            previous_action=nn_dict_input["previous_action"],
            previous_reward=nn_dict_input["previous_reward"],
        ),
    )

    # Send another time_steps x batch_size sample through the network, feeding
    # back the previous internal-states.
    next_nn_input = self.input_space.sample(size=(time_steps, batch_size))
    out = test.test(
        ("get_preprocessed_state_and_action", [next_nn_input, out["last_internal_states"]]),
        expected_outputs=None,
    )
    print("Second action: {}".format(out["action"]))
    self.assertEqual(out["action"].shape, (time_steps, batch_size))
    self.assertEqual(out["last_internal_states"][0].shape, (batch_size, 256))
    self.assertEqual(out["last_internal_states"][1].shape, (batch_size, 256))
    # Again: no preprocessor configured, so the state passes through unchanged.
    recursive_assert_almost_equal(
        out["preprocessed_state"],
        dict(
            RGB_INTERLEAVED=next_nn_input["RGB_INTERLEAVED"],
            INSTR=next_nn_input["INSTR"],
            previous_action=next_nn_input["previous_action"],
            previous_reward=next_nn_input["previous_reward"],
        ),
    )

    test.terminate()