def test_environment_stepper_on_deterministic_env_with_returning_action_probs(self):
    """
    Steps an EnvironmentStepper (simple dense NN, no exploration) through the deterministic env and
    checks terminals, raw next-states and action probabilities over two consecutive `step` calls.
    """
    preprocessor_spec = [dict(type="divide", divisor=2)]
    network_spec = config_from_path("configs/test_simple_nn.json")
    exploration_spec = None
    actor_component = ActorComponent(
        preprocessor_spec,
        dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
        exploration_spec
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=dict(type="deterministic_env", steps_to_terminal=6),
        actor_component_spec=actor_component,
        state_space=self.deterministic_env_state_space,
        reward_space="float32",
        add_action_probs=True,
        action_probs_space=self.deterministic_action_probs_space,
        num_steps=3
    )

    test = ComponentTest(
        component=environment_stepper,
        action_space=self.deterministic_env_action_space,
    )

    # Always close the session (to shut down the Env on the server), even if a check below fails.
    try:
        weights = test.read_variable_values(environment_stepper.actor_component.policy.variable_registry)
        policy_scope = "environment-stepper/actor-component/policy/"
        weights_hid = weights[policy_scope+"test-network/hidden-layer/dense/kernel"]
        biases_hid = weights[policy_scope+"test-network/hidden-layer/dense/bias"]
        weights_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/kernel"]
        biases_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/bias"]

        def expected_action_probs(preprocessed_state):
            # Hidden dense layer -> action layer -> softmax, using the weights read from the graph.
            hidden_out = dense_layer(np.array([preprocessed_state]), weights_hid, biases_hid)
            return softmax(dense_layer(hidden_out, weights_action, biases_action))

        # Step 3 times through the Env and collect results.
        # The NN inputs are the raw states (0, 1, 2) divided by 2 (see `preprocessor_spec`).
        expected = (
            # t_
            np.array([False, False, False]),
            # s' (raw)
            np.array([[0.0], [1.0], [2.0], [3.0]]),
            # action probs
            np.array([
                expected_action_probs(0.0),
                expected_action_probs(0.5),
                expected_action_probs(1.0)
            ])
        )
        test.test("step", expected_outputs=expected, decimals=3)

        # Step again, check whether stitching of states/etc.. works.
        # The 6th overall step is expected to be terminal, after which the raw state is 0.0 again.
        expected = (
            # t_
            np.array([False, False, True]),
            # s' (raw)
            np.array([[3.0], [4.0], [5.0], [0.0]]),
            # action probs
            np.array([
                expected_action_probs(1.5),
                expected_action_probs(2.0),
                expected_action_probs(2.5)
            ])
        )
        test.test("step", expected_outputs=expected, decimals=3)
    finally:
        test.terminate()
def test_environment_stepper_on_2x2_grid_world(self):
    """
    Steps an EnvironmentStepper with a fixed-weight, deterministic policy through the 2x2 grid world and
    checks terminals, raw next-states and action probabilities over two consecutive `step` calls.
    """
    preprocessor_spec = [dict(
        type="reshape", flatten=True, flatten_categories=self.grid_world_2x2_action_space.num_categories
    )]
    network_spec = config_from_path("configs/test_simple_nn.json")
    # Try to find a NN that outputs greedy actions down in start state and right in state=1 (to reach goal).
    network_spec["layers"][0]["weights_spec"] = [[0.5, -0.5], [-0.1, 0.1], [-0.2, 0.2], [-0.4, 0.2]]
    network_spec["layers"][0]["biases_spec"] = False
    exploration_spec = None
    actor_component = ActorComponent(
        preprocessor_spec,
        dict(
            network_spec=network_spec,
            action_adapter_spec=dict(
                weights_spec=[[0.1, -0.5, 0.5, 0.1], [0.4, 0.2, -0.2, 0.2]], biases_spec=False
            ),
            action_space=self.grid_world_2x2_action_space,
            deterministic=True
        ),
        exploration_spec
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=dict(type="grid_world", world="2x2"),
        actor_component_spec=actor_component,
        state_space=self.grid_world_2x2_state_space,
        reward_space="float32",
        add_action_probs=True,
        action_probs_space=self.grid_world_2x2_action_probs_space,
        num_steps=5
    )

    test = ComponentTest(
        component=environment_stepper,
        action_space=self.grid_world_2x2_action_space,
    )

    # The expected action probs alternate between these two rows, in lockstep with the raw
    # states 0 and 1 listed in the expected s' arrays below.
    probs_in_state_0 = [0.21869287, 0.17905058, 0.36056358, 0.24169299]
    probs_in_state_1 = [0.2547221, 0.2651175, 0.23048209, 0.24967825]

    # Always close the session (to shut down the Env on the server), even if a check below fails.
    try:
        # Step 5 times through the Env and collect results.
        expected = (
            np.array([False, True, False, True, False]),  # t_
            np.array([0, 1, 0, 1, 0, 1]),  # s' (raw)
            np.array([
                probs_in_state_0, probs_in_state_1, probs_in_state_0, probs_in_state_1, probs_in_state_0
            ], dtype=np.float32)
        )
        out = test.test("step", expected_outputs=expected, decimals=2)
        print(out)

        # Step again, check whether stitching of states/etc.. works.
        expected = (
            np.array([True, False, True, False, True]),  # t_
            np.array([1, 0, 1, 0, 1, 0]),  # s' (raw)
            np.array([
                probs_in_state_1, probs_in_state_0, probs_in_state_1, probs_in_state_0, probs_in_state_1
            ], dtype=np.float32)
        )
        out = test.test("step", expected_outputs=expected)
        print(out)
    finally:
        test.terminate()
def test_compare_with_non_env_stepper(self):
    """
    Times `self.time_steps` steps of acting in a plain (out-of-graph) Pong env, with the actions coming
    from an ActorComponent, as a baseline to compare against the in-graph EnvironmentStepper.
    """
    environment_spec = dict(type="openai_gym", gym_env="Pong-v0", frameskip=4, seed=10)
    dummy_env = Environment.from_spec(environment_spec)
    state_space = dummy_env.state_space.with_batch_rank()
    action_space = dummy_env.action_space
    agent_config = config_from_path("configs/dqn_agent_for_pong.json")
    actor_component = ActorComponent(
        agent_config["preprocessing_spec"],
        dict(
            network_spec=agent_config["network_spec"],
            action_adapter_spec=agent_config["action_adapter_spec"],
            action_space=action_space
        ),
        agent_config["exploration_spec"]
    )
    test = ComponentTest(
        component=actor_component,
        input_spaces=dict(states=state_space),
        action_space=action_space,
    )

    # Release the session and the env even if stepping fails half-way through.
    try:
        s = dummy_env.reset()
        time_start = time.monotonic()
        for _ in range(self.time_steps):
            # Add a batch rank of 1 around the single state.
            out = test.test(("get_preprocessed_state_and_action", np.array([s])))
            a = out["action"]
            # Act in env.
            s, r, t, _ = dummy_env.step(a[0])  # remove batch
            # Truthiness (not `is True`) so that non-`bool` terminals (e.g. `np.bool_`) also reset.
            if t:
                s = dummy_env.reset()
        time_end = time.monotonic()
        print("Done running {} steps in bare-metal env in {}sec.".format(
            self.time_steps, time_end - time_start))
    finally:
        test.terminate()
        dummy_env.terminate()
def test_actor_component_with_dict_preprocessor(self):
    """
    Checks an ActorComponent whose Dict state space is only partially preprocessed (key "a" is scaled
    by 0.5, key "b" is passed through) against a numpy re-computation of the expected outputs.
    """
    # A Dict state Space; only its "a" entry gets preprocessed.
    state_space = Dict(a=FloatBox(shape=(2,)), b=FloatBox(shape=(5,)), add_batch_rank=True)
    action_space = IntBox(2, add_batch_rank=True)

    preprocessors_for_a = [
        dict(type="convert_type", to_dtype="float"),
        dict(type="multiply", factor=0.5),
    ]
    preprocessor_spec = dict(type="dict-preprocessor-stack", preprocessors=dict(a=preprocessors_for_a))

    # Simple custom NN with dict input (splits into 2 streams (simple dense layers) and concats at the end).
    dict_input_nn = DummyNNWithDictInput(num_units_a=2, num_units_b=3, scope="dummy-nn")
    policy = Policy(network_spec=dict_input_nn, action_space=action_space)
    exploration = None  # no exploration

    actor_component = ActorComponent(preprocessor_spec, policy, exploration)
    test = ComponentTest(
        component=actor_component,
        input_spaces=dict(states=state_space),
        action_space=action_space
    )

    # Sample a batch of 4 states and read the current variable values from the graph.
    states = state_space.sample(size=4)
    params = test.read_variable_values(actor_component.variable_registry)
    kernel_a = params["actor-component/policy/dummy-nn/dense-a/dense/kernel"]
    kernel_b = params["actor-component/policy/dummy-nn/dense-b/dense/kernel"]
    kernel_action = params["actor-component/policy/action-adapter-0/action-network/action-layer/dense/kernel"]

    # Expected NN output: one matmul per stream, concatenated along the last axis.
    preprocessed_a = states["a"] * 0.5
    stream_a = np.matmul(preprocessed_a, kernel_a)
    stream_b = np.matmul(states["b"], kernel_b)
    expected_nn_output = np.concatenate((stream_a, stream_b), axis=-1)

    # Raw action layer output, then the max-likelihood/greedy pick.
    expected_action_layer_output = np.matmul(expected_nn_output, kernel_action)
    expected_actions = np.argmax(expected_action_layer_output, axis=-1)

    expected_outputs = dict(
        preprocessed_state=dict(a=states["a"] * 0.5, b=states["b"]),
        action=expected_actions,
        nn_outputs=expected_nn_output
    )
    test.test(("get_preprocessed_state_and_action", states), expected_outputs=expected_outputs)
def test_environment_stepper_on_deterministic_env_with_action_probs_lstm(self):
    """
    Steps an EnvironmentStepper with an LSTM policy through the deterministic env and checks terminals,
    raw next-states, action probabilities and the LSTM's internal states after one `step` call.
    """
    internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)))
    preprocessor_spec = [dict(type="multiply", factor=0.1)]
    network_spec = config_from_path("configs/test_lstm_nn.json")
    exploration_spec = None
    actor_component = ActorComponent(
        preprocessor_spec,
        dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
        exploration_spec
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=dict(type="deterministic_env", steps_to_terminal=3),
        actor_component_spec=actor_component,
        state_space=self.deterministic_env_state_space,
        reward_space="float32",
        internal_states_space=internal_states_space,
        add_action_probs=True,
        action_probs_space=self.deterministic_action_probs_space,
        num_steps=4,
    )

    test = ComponentTest(
        component=environment_stepper,
        action_space=self.deterministic_env_action_space,
    )

    # Always close the session (to shut down the Env on the server), even if a check below fails.
    try:
        weights = test.read_variable_values(environment_stepper.actor_component.policy.variable_registry)
        policy_scope = "environment-stepper/actor-component/policy/"
        weights_lstm = weights[policy_scope+"test-lstm-network/lstm-layer/lstm-cell/kernel"]
        biases_lstm = weights[policy_scope+"test-lstm-network/lstm-layer/lstm-cell/bias"]
        weights_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/kernel"]
        biases_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/bias"]

        # Step 4 times through the Env and collect results.
        # NN inputs are the raw states (0, 1, 2, 0) multiplied by 0.1 (see `preprocessor_spec`).
        # Each LSTM call is fed the internal states returned by the previous one; the 4th call
        # (first step after the expected terminal) still continues from the 3rd call's states.
        lstm_1 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm)
        lstm_2 = lstm_layer(np.array([[[0.1]]]), weights_lstm, biases_lstm, lstm_1[1])
        lstm_3 = lstm_layer(np.array([[[0.2]]]), weights_lstm, biases_lstm, lstm_2[1])
        lstm_4 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm, lstm_3[1])
        lstm_results = (lstm_1, lstm_2, lstm_3, lstm_4)

        expected = (
            # t_
            np.array([False, False, True, False]),
            # s' (raw)
            np.array([[0.0], [1.0], [2.0], [0.0], [1.0]]),
            # action probs
            np.array([
                softmax(dense_layer(np.squeeze(lstm_out[0]), weights_action, biases_action))
                for lstm_out in lstm_results
            ]),
            # internal states (all-zero initial states, followed by the states after each step)
            (
                np.squeeze(np.array([[[0.0, 0.0, 0.0]]] + [lstm_out[1][0] for lstm_out in lstm_results])),
                np.squeeze(np.array([[[0.0, 0.0, 0.0]]] + [lstm_out[1][1] for lstm_out in lstm_results]))
            )
        )
        test.test("step", expected_outputs=expected)
    finally:
        test.terminate()
def test_actor_component_with_lstm_network(self):
    """
    Runs an ActorComponent with an LSTM policy and epsilon exploration one time-step at a time, then
    checks the preprocessed states, the shapes of the returned LSTM states and the spread of actions.
    """
    # state space and internal state space
    state_space = FloatBox(shape=(2,), add_batch_rank=True, add_time_rank=True, time_major=False)
    internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)), add_batch_rank=True)
    time_percentages_space = FloatBox()
    # action_space.
    action_space = IntBox(2, add_batch_rank=True, add_time_rank=True)

    preprocessor = PreprocessorStack.from_spec(
        [dict(type="convert_type", to_dtype="float"), dict(type="divide", divisor=10)]
    )
    policy = Policy(network_spec=config_from_path("configs/test_lstm_nn.json"), action_space=action_space)
    exploration = Exploration(epsilon_spec=dict(decay_spec=dict(
        type="linear_decay", from_=1.0, to_=0.1)
    ))
    actor_component = ActorComponent(preprocessor, policy, exploration)
    test = ComponentTest(
        component=actor_component,
        input_spaces=dict(
            states=state_space,
            other_nn_inputs=Tuple(internal_states_space, add_batch_rank=True),
            time_percentage=time_percentages_space
        ),
        action_space=action_space
    )

    # Some state inputs (batch size=2, seq-len=1000; batch-major).
    np.random.seed(10)
    states = state_space.sample(size=(1000, 2))
    initial_internal_states = internal_states_space.zeros(size=2)  # only batch
    time_percentages = time_percentages_space.sample(1000)

    # Run n times a single time-step to simulate acting and env interaction with an LSTM.
    # Builtin `float`/`int` replace the `np.float`/`np.int` aliases (removed in NumPy 1.24);
    # the aliases were the builtins, so the resulting dtypes are unchanged.
    preprocessed_states = np.ndarray(shape=(1000, 2, 2), dtype=float)
    actions = np.ndarray(shape=(1000, 2, 1), dtype=int)
    for i, time_percentage in enumerate(time_percentages):
        ret = test.test((
            "get_preprocessed_state_and_action",
            # expand time dim at 1st slot as we are time-major == False
            [np.expand_dims(states[i], 1), tuple([initial_internal_states]), time_percentage]
        ))
        preprocessed_states[i] = ret["preprocessed_state"][:, 0, :]  # take out time-rank again
        actions[i] = ret["action"]
        # Check c/h-state shape.
        self.assertEqual(ret["nn_outputs"][1][0].shape, (2, 3))  # batch-size=2, LSTM units=3
        self.assertEqual(ret["nn_outputs"][1][1].shape, (2, 3))

    # Check all preprocessed states (easy: just divided by 10).
    expected_preprocessed_state = states / 10
    recursive_assert_almost_equal(preprocessed_states, expected_preprocessed_state)

    # Check the exploration functionality over the actions: only the spread (stddev) of the
    # two possible actions is bounded here, not their mean.
    stddev_actions = actions.std()
    self.assertGreater(stddev_actions, 0.4)
    self.assertLess(stddev_actions, 0.6)
def test_to_find_out_what_breaks_specifiable_server_start_via_thread_pools(self):
    """
    Builds an EnvironmentStepper around a DeepMind Lab env with a large IMPALA network and only calls
    its `reset` API-method (debugging aid for the SpecifiableServer start-up).
    """
    env_spec = dict(
        type="deepmind_lab",
        level_id="seekavoid_arena_01",
        observations=["RGB_INTERLEAVED", "INSTR"],
        frameskip=4
    )
    dummy_env = Environment.from_spec(env_spec)
    state_space = dummy_env.state_space
    action_space = dummy_env.action_space

    # Preprocessor spec (only for image and prev-action/-reward channels).
    preprocessing = dict(
        type="dict-preprocessor-stack",
        preprocessors=dict(
            # The images from the env are divided by 255.
            RGB_INTERLEAVED=[dict(type="divide", divisor=255)],
            # The prev. action/reward from the env must be flattened/bumped-up-to-(1,).
            previous_action=[
                dict(type="reshape", flatten=True, flatten_categories=action_space.num_categories)
            ],
            previous_reward=[
                dict(type="reshape", new_shape=(1,)),
                dict(type="convert_type", to_dtype="float32")
            ],
        )
    )
    # Policy spec.
    policy_spec = dict(network_spec=LargeIMPALANetwork(), action_space=action_space)
    # Exploration: linearly decaying epsilon.
    epsilon_decay = dict(type="linear_decay", from_=1.0, to_=0.1, start_timestep=0, num_timesteps=100)
    exploration = Exploration(epsilon_spec=dict(decay_spec=epsilon_decay))

    actor_component = ActorComponent(preprocessing, policy_spec, exploration)

    environment_stepper = EnvironmentStepper(
        environment_spec=env_spec,
        actor_component_spec=actor_component,
        state_space=state_space,
        reward_space="float32",
        internal_states_space=self.internal_states_space,
        num_steps=100,
        # Add both prev-action and -reward into the state sent through the network.
        add_previous_action_to_state=True,
        add_previous_reward_to_state=True,
        add_action_probs=True,
        action_probs_space=self.action_probs_space
    )

    test = ComponentTest(component=environment_stepper, action_space=action_space)

    # Reset the stepper.
    test.test("reset")
def test_environment_stepper_on_pong(self):
    """
    Runs one `step` call (`self.time_steps` env steps) of an EnvironmentStepper on Pong, prints the
    elapsed time and sanity-checks types and value ranges of the returned data.
    """
    environment_spec = dict(type="openai-gym", gym_env="Pong-v0", frameskip=4, seed=10)
    # The dummy env is only needed to look up the spaces; shut it down right away.
    dummy_env = Environment.from_spec(environment_spec)
    state_space = dummy_env.state_space
    action_space = dummy_env.action_space
    dummy_env.terminate()

    agent_config = config_from_path("configs/dqn_agent_for_pong.json")
    actor_component = ActorComponent(
        agent_config["preprocessing_spec"],
        dict(
            network_spec=agent_config["network_spec"],
            action_space=action_space,
            **agent_config["policy_spec"]
        ),
        agent_config["exploration_spec"]
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=environment_spec,
        actor_component_spec=actor_component,
        state_space=state_space,
        reward_space="float",
        add_reward=True,
        num_steps=self.time_steps
    )

    test = ComponentTest(
        component=environment_stepper,
        action_space=action_space,
    )

    # Always close the session (to shut down the Env on the server), even if a check below fails.
    try:
        # Step `num_steps` times through the Env and collect results.
        # The checks below treat the result as a tuple of (terminals, raw next-states, rewards).
        time_start = time.monotonic()
        out = test.test("step")
        time_end = time.monotonic()
        print("Done running {} steps in env-stepper env in {}sec.".format(
            environment_stepper.num_steps, time_end - time_start))

        # Check types of outputs.
        self.assertIsInstance(out, DataOpTuple)  # the step results as a tuple
        # Check types of single data.
        self.assertEqual(out[0].dtype, np.bool_)  # next-state is terminal?
        self.assertEqual(out[1].dtype, np.uint8)  # next state (raw, not preprocessed)
        self.assertGreaterEqual(out[1].min(), 0)  # make sure we have pixels
        self.assertLessEqual(out[1].max(), 255)
        self.assertEqual(out[2].dtype, np.float32)  # rewards
        self.assertGreaterEqual(out[2].min(), -1.0)  # -1.0 to 1.0
        self.assertLessEqual(out[2].max(), 1.0)
    finally:
        test.terminate()
def test_environment_stepper_on_2x2_grid_world_returning_actions_and_rewards(self):
    """
    Steps an EnvironmentStepper with a fixed-weight, deterministic policy through the 2x2 grid world and
    checks terminals, raw next-states, the actions taken and the rewards received.
    """
    preprocessor_spec = [dict(
        type="reshape", flatten=True, flatten_categories=self.grid_world_2x2_action_space.num_categories
    )]
    network_spec = config_from_path("configs/test_simple_nn.json")
    # Try to find a NN that outputs greedy actions down in start state and right in state=1 (to reach goal).
    network_spec["layers"][0]["weights_spec"] = [[0.5, -0.5], [-0.1, 0.1], [-0.2, 0.2], [-0.4, 0.2]]
    network_spec["layers"][0]["biases_spec"] = False
    exploration_spec = None
    actor_component = ActorComponent(
        preprocessor_spec,
        dict(
            network_spec=network_spec,
            action_adapter_spec=dict(
                weights_spec=[[0.1, -0.5, 0.5, 0.1], [0.4, 0.2, -0.2, 0.2]], biases_spec=False
            ),
            action_space=self.grid_world_2x2_action_space,
            deterministic=True
        ),
        exploration_spec
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=dict(type="grid_world", world="2x2"),
        actor_component_spec=actor_component,
        state_space=self.grid_world_2x2_state_space,
        reward_space="float32",
        add_action=True,
        add_reward=True,
        num_steps=5
    )

    test = ComponentTest(
        component=environment_stepper,
        action_space=self.grid_world_2x2_action_space,
    )

    # Always close the session (to shut down the Env on the server), even if the check below fails.
    try:
        # Step 5 times through the Env and collect results.
        expected = (
            np.array([False, True, False, True, False]),  # t_
            np.array([0, 1, 0, 1, 0, 1]),  # s' (raw)
            np.array([2, 1, 2, 1, 2]),  # actions taken
            np.array([-1.0, 1.0, -1.0, 1.0, -1.0])  # rewards
        )
        out = test.test("step", expected_outputs=expected, decimals=2)
        print(out)
    finally:
        test.terminate()
def test_environment_stepper_on_deterministic_env(self):
    """
    Resets an EnvironmentStepper (no preprocessing, no exploration) on the deterministic env and checks
    terminals and raw states over two consecutive `step` calls.
    """
    preprocessor_spec = None
    network_spec = config_from_path("configs/test_simple_nn.json")
    exploration_spec = None
    actor_component = ActorComponent(
        preprocessor_spec,
        dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
        exploration_spec
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=dict(type="deterministic_env", steps_to_terminal=5),
        actor_component_spec=actor_component,
        state_space=self.deterministic_env_state_space,
        reward_space="float32",
        num_steps=3
    )

    test = ComponentTest(
        component=environment_stepper,
        action_space=self.deterministic_env_action_space,
    )

    # Always close the session (to shut down the Env on the server), even if a check below fails.
    try:
        # Reset the stepper.
        test.test("reset")

        # NOTE(review): unlike the sibling stepper tests, the expected output here is nested as
        # `(None, (t_, s'))` and carries num_steps + 1 terminals - confirm this still matches
        # the current return format of `step`.
        # Step 3 times through the Env and collect results.
        expected = (
            None,
            (
                np.array([True, False, False, False]),  # t_
                np.array([[0.0], [1.0], [2.0], [3.0]]),  # s' (raw)
            )
        )
        test.test("step", expected_outputs=expected)

        # Step again, check whether stitching of states/etc.. works.
        expected = (
            None,
            (
                np.array([False, False, True, False]),  # t_
                np.array([[3.0], [4.0], [0.0], [1.0]]),  # s' (raw)
            )
        )
        test.test("step", expected_outputs=expected)
    finally:
        test.terminate()
def test_simple_actor_component(self):
    """
    Checks both API-methods of a simple ActorComponent (multiply-by-2 preprocessing, single hidden
    layer NN) against a numpy re-computation of states, greedy actions and action probabilities.
    """
    hidden_kernel_key = "actor-component/policy/test-network/hidden-layer/dense/kernel"
    action_kernel_key = "actor-component/policy/action-adapter-0/action-network/action-layer/dense/kernel"

    def action_layer_output(states_, params_):
        # Preprocessed states -> hidden-layer kernel -> action-layer kernel (raw action layer output).
        nn_output = np.matmul(states_ * 2, params_[hidden_kernel_key])
        return np.matmul(nn_output, params_[action_kernel_key])

    # State Space with 5 input nodes and a batch rank; 10 discrete actions.
    state_space = FloatBox(shape=(5,), add_batch_rank=True)
    action_space = IntBox(10)

    preprocessor = PreprocessorStack.from_spec([
        dict(type="convert_type", to_dtype="float"),
        dict(type="multiply", factor=2)
    ])
    policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
    exploration = Exploration()  # no exploration
    actor_component = ActorComponent(preprocessor, policy, exploration)
    test = ComponentTest(
        component=actor_component,
        input_spaces=dict(states=state_space),
        action_space=action_space
    )

    # 1) Preprocessed state and action only (batch size=2).
    actor_component_params = test.read_variable_values(actor_component.variables)
    states = state_space.sample(2)
    logits = action_layer_output(states, actor_component_params)
    test.test(
        ("get_preprocessed_state_and_action", states),
        expected_outputs=dict(
            preprocessed_state=states * 2,
            # Final actions (max-likelihood/greedy pick).
            action=np.argmax(logits, axis=-1)
        )
    )

    # 2) Same check through the API-method that also returns action probs (batch size=5).
    states = state_space.sample(5)
    actor_component_params = test.read_variable_values(actor_component.variables)
    logits = action_layer_output(states, actor_component_params)
    test.test(
        ("get_preprocessed_state_action_and_action_probs", states),
        expected_outputs=dict(
            preprocessed_state=states * 2,
            action=np.argmax(logits, axis=-1),
            # No reshape necessary (simple action space), softmax to get probs.
            action_probs=softmax(logits)
        )
    )
def __init__(self, environment_spec, actor_component_spec, num_steps=20, state_space=None, reward_space=None,
             internal_states_space=None,
             add_action_probs=False, action_probs_space=None,
             add_action=False, add_reward=False,
             add_previous_action_to_state=False, add_previous_reward_to_state=False,
             scope="environment-stepper", **kwargs):
    """
    Args:
        environment_spec (dict): A specification dict for constructing an Environment object that will be run
            inside a SpecifiableServer for in-graph stepping.

        actor_component_spec (Union[ActorComponent,dict]): A specification dict to construct this EnvStepper's
            ActionComponent (to generate actions) or an already constructed ActionComponent object.

        num_steps (int): The number of steps to perform per `step` call.

        state_space (Optional[Space]): The state Space of the Environment. If None, will construct a dummy
            environment to get the state Space from there.

        reward_space (Optional[Space]): The reward Space of the Environment (or a spec for it). If None, will
            take one step in a dummy environment and derive the reward Space from the returned reward.

        internal_states_space (Optional[Space]): The internal states Space (when using an RNN inside the
            ActorComponent).

        add_action_probs (bool): Whether to add all action probabilities for each step to the output of the
            `step` API-method.
            Default: False.

        action_probs_space (Optional[Space]): If add_action_probs is True, the Space that the action_probs will
            have. This is usually just the flattened (one-hot) action space.

        add_action (bool): Whether to add the action to the output of the `step` API-method.
            Default: False.

        add_reward (bool): Whether to add the reward to the output of the `step` API-method.
            Default: False.

        add_previous_action_to_state (bool): Whether to add the previous action as another input channel to the
            ActionComponent's (NN's) input at each step. This is only possible if the state space is already a
            Dict. It will be added under the key "previous_action".
            Default: False.

        add_previous_reward_to_state (bool): Whether to add the previous reward as another input channel to the
            ActionComponent's (NN's) input at each step. This is only possible if the state space is already a
            Dict. It will be added under the key "previous_reward".
            Default: False.

        scope (str): The scope of this Component (passed on to the parent constructor).

    Raises:
        AssertionError: If `add_previous_action_to_state` or `add_previous_reward_to_state` is True, but the
            state space is not a Dict.
    """
    super(EnvironmentStepper, self).__init__(scope=scope, **kwargs)

    # Only to retrieve some information about the particular Env.
    dummy_env = Environment.from_spec(environment_spec)  # type: Environment
    try:
        # Resolve both spaces independently of each other: a given state_space is never overwritten and a
        # given reward_space is always converted into a Space with batch rank.
        if state_space is None:
            state_space = dummy_env.state_space
        if reward_space is None:
            _, reward, _, _ = dummy_env.step(dummy_env.action_space.sample())
            # TODO: this may break on non 64-bit machines. tf seems to interpret a python float as tf.float64.
            # Exact type comparison (not isinstance) is kept on purpose: float subclasses (e.g. numpy
            # float scalars) keep taking the `float` branch as before.
            reward_space = Space.from_spec(
                "float64" if type(reward) == float else float, shape=(1,)
            ).with_batch_rank()
        else:
            reward_space = Space.from_spec(reward_space).with_batch_rank()
        self.reward_space = reward_space
        self.action_space = dummy_env.action_space
    finally:
        # Shut the dummy env down again, also if deriving the spaces failed.
        dummy_env.terminate()

    # The state that the environment produces.
    self.state_space_env = state_space
    # The state that must be fed into the actor-component to produce an action.
    # May contain prev_action and prev_reward.
    # NOTE(review): this is the same object as `state_space_env` (and the caller's `state_space`), so the
    # "previous_action"/"previous_reward" item assignments below presumably show up there as well - confirm
    # this is intended.
    self.state_space_actor = state_space
    self.add_previous_action_to_state = add_previous_action_to_state
    self.add_previous_reward_to_state = add_previous_reward_to_state

    # Circle actions and/or rewards in `step` API-method?
    self.add_action = add_action
    self.add_reward = add_reward

    # The Problem with ContainerSpaces here is that py_func (SpecifiableServer) cannot handle container
    # spaces, which is why we need to painfully convert these into flat spaces and tuples here whenever
    # we make a call to the env. So to keep things unified, we treat all container spaces
    # (state space, preprocessed state) from here on as tuples of primitive spaces sorted by their would be
    # flat-keys in a flattened dict).
    self.state_space_env_flattened = self.state_space_env.flatten()
    # Need to flatten the state-space in case it's a ContainerSpace for the return dtypes.
    self.state_space_env_list = list(self.state_space_env_flattened.values())

    # TODO: automate this by lookup from the NN Component
    self.internal_states_space = None
    if internal_states_space is not None:
        self.internal_states_space = internal_states_space.with_batch_rank(add_batch_rank=1)

    # Add the action/reward spaces to the state space (must be Dict).
    if self.add_previous_action_to_state is True:
        assert isinstance(self.state_space_actor, Dict),\
            "ERROR: If `add_previous_action_to_state` is True as input, state_space must be a Dict!"
        self.state_space_actor["previous_action"] = self.action_space
    if self.add_previous_reward_to_state is True:
        assert isinstance(self.state_space_actor, Dict),\
            "ERROR: If `add_previous_reward_to_state` is True as input, state_space must be a Dict!"
        self.state_space_actor["previous_reward"] = self.reward_space
    self.state_space_actor_flattened = self.state_space_actor.flatten()
    self.state_space_actor_list = list(self.state_space_actor_flattened.values())

    self.add_action_probs = add_action_probs
    self.action_probs_space = action_probs_space

    # The server running the actual Environment; `terminate` is registered as its shutdown method.
    self.environment_spec = environment_spec
    self.environment_server = SpecifiableServer(
        class_=Environment,
        spec=environment_spec,
        output_spaces=dict(
            step_for_env_stepper=self.state_space_env_list + [self.reward_space, bool],
            reset_for_env_stepper=self.state_space_env_list
        ),
        shutdown_method="terminate"
    )

    # Add the sub-components.
    self.actor_component = ActorComponent.from_spec(actor_component_spec)  # type: ActorComponent
    self.preprocessed_state_space = self.actor_component.preprocessor.get_preprocessed_space(
        self.state_space_actor
    )

    self.num_steps = num_steps

    # Variables that hold information of last step through Env.
    self.current_terminal = None
    self.current_state = None
    self.current_action = None  # Only if self.add_action is True.
    self.current_reward = None  # Only if self.add_reward is True.
    self.current_internal_states = None
    self.current_action_probs = None
    self.time_step = 0

    self.has_rnn = self.actor_component.policy.neural_network.has_rnn()

    # Add all sub-components (only ActorComponent).
    self.add_components(self.actor_component)
def __init__(self, discount=0.99, fifo_queue_spec=None, architecture="large", environment_spec=None,
             feed_previous_action_through_nn=True, feed_previous_reward_through_nn=True,
             weight_pg=None, weight_baseline=None, weight_entropy=None,
             num_workers=1, worker_sample_size=100, dynamic_batching=False, visualize=False,
             **kwargs):
    """
    Sets up a single-process IMPALA agent: `num_workers` EnvironmentSteppers are driven by one
    QueueRunner that inserts their `step` results into the FIFOQueue, and the learning components
    (staging area, policy, loss function, optimizer) are added to the same root component.

    Args:
        discount (float): The discount factor gamma.
        architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored if
            `network_spec` is given explicitly in kwargs. Default: "large".
        fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA algorithm.
        environment_spec (dict): The spec for constructing an Environment object for an actor-type IMPALA
            agent. If None, `self.default_environment_spec` is used.
        feed_previous_action_through_nn (bool): Whether to add the previous action as another input channel
            to the ActionComponent's (NN's) input at each step. This is only possible if the state space is
            already a Dict. It will be added under the key "previous_action". Default: True.
        feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input channel
            to the ActionComponent's (NN's) input at each step. This is only possible if the state space is
            already a Dict. It will be added under the key "previous_reward". Default: True.
        weight_pg (float): See IMPALALossFunction Component.
        weight_baseline (float): See IMPALALossFunction Component.
        weight_entropy (float): See IMPALALossFunction Component.
        num_workers (int): How many actors (workers) should be run in separate threads.
        worker_sample_size (int): How many steps the actor will perform in the environment each sample-run.
        dynamic_batching (bool): Whether to use the deepmind's custom dynamic batching op for wrapping the
            optimizer's step call. The batcher.so file must be compiled for this to work (see Docker file).
            Default: False.
        visualize (Union[int,bool]): Whether and how many workers to visualize.
            Default: False (no visualization).
    """
    # Call the super constructor with the job-type fixed to "single".
    # `name` is popped from kwargs before the remaining kwargs are forwarded.
    super(SingleIMPALAAgent, self).__init__(
        type="single",
        discount=discount,
        architecture=architecture,
        fifo_queue_spec=fifo_queue_spec,
        environment_spec=environment_spec,
        feed_previous_action_through_nn=feed_previous_action_through_nn,
        feed_previous_reward_through_nn=feed_previous_reward_through_nn,
        weight_pg=weight_pg,
        weight_baseline=weight_baseline,
        weight_entropy=weight_entropy,
        worker_sample_size=worker_sample_size,
        name=kwargs.pop("name", "impala-single-agent"),
        **kwargs
    )
    self.dynamic_batching = dynamic_batching
    self.num_workers = num_workers
    self.visualize = visualize

    # Fall back to the class-level default spec when none was given. Without this, `None` would be
    # deep-copied and passed to every EnvironmentStepper below (and setting the "visualize" key on it
    # would raise a TypeError).
    # NOTE(review): mirrors the fallback in the "single" branch of IMPALAAgent.__init__, which is
    # presumably the super constructor called above - confirm.
    environment_spec = environment_spec or self.default_environment_spec

    # If we use dynamic batching, wrap the dynamic batcher around the policy's graph_fn that we
    # actually call below during our build.
    if self.dynamic_batching:
        self.policy = DynamicBatchingPolicy(policy_spec=self.policy, scope="")

    # The env-stepper output has one more item (4 instead of 3) if the network has an RNN.
    self.env_output_splitter = ContainerSplitter(
        tuple_length=3 if self.has_rnn is False else 4, scope="env-output-splitter"
    )
    self.fifo_output_splitter = ContainerSplitter(*self.fifo_queue_keys, scope="fifo-output-splitter")
    # NOTE(review): for non-Dict state spaces, `list("dummy")` expands into the single characters
    # "d", "u", "m", "m", "y". Kept as-is; verify whether a single "dummy" key was intended.
    self.states_dict_splitter = ContainerSplitter(
        *list(self.fifo_record_space["states"].keys() if isinstance(self.state_space, Dict) else "dummy"),
        scope="states-dict-splitter"
    )
    self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

    # Slice some data from the EnvStepper (e.g only first internal states are needed).
    if self.has_rnn:
        internal_states_slicer = Slice(scope="internal-states-slicer", squeeze=True)
    else:
        internal_states_slicer = None

    self.transposer = Transpose(scope="transposer")

    # Create an IMPALALossFunction with some parameters.
    self.loss_function = IMPALALossFunction(
        discount=self.discount,
        weight_pg=weight_pg,
        weight_baseline=weight_baseline,
        weight_entropy=weight_entropy,
        slice_actions=self.feed_previous_action_through_nn,
        slice_rewards=self.feed_previous_reward_through_nn
    )

    # Merge back to insert into FIFO.
    self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)

    # Dummy Flattener to calculate action-probs space.
    dummy_flattener = ReShape(flatten=True, flatten_categories=self.action_space.num_categories)

    # One EnvironmentStepper per worker, each with its own copies of the env- and policy specs.
    self.environment_steppers = list()
    for i in range(self.num_workers):
        environment_spec_ = copy.deepcopy(environment_spec)
        # `visualize=True` renders all workers, an int renders only the first `visualize` workers.
        if self.visualize is True or (isinstance(self.visualize, int) and i + 1 <= self.visualize):
            environment_spec_["visualize"] = True

        # Force worker_sample_size for IMPALA NNs (LSTM) in env-stepper to be 1.
        policy_spec = copy.deepcopy(self.policy_spec)
        if isinstance(policy_spec, dict) and isinstance(policy_spec["network_spec"], dict) and \
                "type" in policy_spec["network_spec"] and \
                "IMPALANetwork" in policy_spec["network_spec"]["type"]:
            policy_spec["network_spec"]["worker_sample_size"] = 1

        env_stepper = EnvironmentStepper(
            environment_spec=environment_spec_,
            actor_component_spec=ActorComponent(
                preprocessor_spec=self.preprocessing_spec,
                policy_spec=policy_spec,
                exploration_spec=self.exploration_spec
            ),
            state_space=self.state_space.with_batch_rank(),
            action_space=self.action_space.with_batch_rank(),
            reward_space=float,
            internal_states_space=self.internal_states_space,
            num_steps=self.worker_sample_size,
            # Actions/rewards are returned separately only if they are not fed through the NN.
            add_action=not self.feed_previous_action_through_nn,
            add_reward=not self.feed_previous_reward_through_nn,
            add_previous_action_to_state=self.feed_previous_action_through_nn,
            add_previous_reward_to_state=self.feed_previous_reward_through_nn,
            add_action_probs=True,
            action_probs_space=dummy_flattener.get_preprocessed_space(self.action_space),
            scope="env-stepper-{}".format(i)
        )

        if self.dynamic_batching:
            # Detach the stepper's policy, wrap it in a DynamicBatchingPolicy and re-add the wrapper.
            env_stepper.actor_component.policy.parent_component = None
            env_stepper.actor_component.policy = DynamicBatchingPolicy(
                policy_spec=env_stepper.actor_component.policy, scope=""
            )
            env_stepper.actor_component.add_components(env_stepper.actor_component.policy)

        self.environment_steppers.append(env_stepper)

    # Create the QueueRunner (handling all env-steppers).
    self.queue_runner = QueueRunner(
        self.fifo_queue, "step", -1,  # -1: Take entire return value of API-method `step` as record to insert.
        self.env_output_splitter,
        self.fifo_input_merger,
        internal_states_slicer,
        *self.environment_steppers
    )

    sub_components = [
        self.fifo_output_splitter, self.fifo_queue, self.queue_runner, self.transposer,
        self.staging_area, self.preprocessor, self.states_dict_splitter,
        self.policy, self.loss_function, self.optimizer
    ]

    # Add all the agent's sub-components to the root.
    self.root_component.add_components(*sub_components)

    # Define the Agent's (root Component's) API.
    self.define_graph_api()

    if self.auto_build:
        self._build_graph([self.root_component], self.input_spaces, optimizer=self.optimizer,
                          build_options=None)
        self.graph_built = True

        # NOTE(review): nesting of the GPU block under `auto_build` was reconstructed from the data
        # dependencies (the ops below are looked up on the built root component) - confirm.
        if self.has_gpu:
            # Get 1st return op of API-method `stage` of sub-component `staging-area` (which is the stage-op).
            self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                out_op_columns[0].op_records[0].op
            # Initialize the stage.
            self.graph_executor.monitored_session.run_step_fn(
                lambda step_context: step_context.session.run(self.stage_op)
            )
            # TODO remove after full refactor.
            self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
                out_op_columns[0].op_records[0].op
def __init__(self, discount=0.99, fifo_queue_spec=None, architecture="large", environment_spec=None,
             feed_previous_action_through_nn=True, feed_previous_reward_through_nn=True,
             weight_pg=None, weight_baseline=None, weight_entropy=None, worker_sample_size=100,
             **kwargs):
    """
    Sets up an IMPALA agent of one of three job-types ("single", "actor" or "learner"): Fixes the
    network-, preprocessing-, execution- and policy specs, calls the super constructor, creates the
    shared FIFOQueue and - for "actor" and "learner" - adds the job-specific sub-components to the
    root component and (if `auto_build`) builds the graph. For "single", nothing is added or built here.

    Args:
        discount (float): The discount factor gamma.
        architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored if
            `network_spec` is given explicitly in kwargs. Default: "large".
        fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA algorithm.
        environment_spec (dict): The spec for constructing an Environment object for an actor-type IMPALA
            agent.
        feed_previous_action_through_nn (bool): Whether to add the previous action as another input channel
            to the ActionComponent's (NN's) input at each step. This is only possible if the state space is
            already a Dict. It will be added under the key "previous_action". Default: True.
        feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input channel
            to the ActionComponent's (NN's) input at each step. This is only possible if the state space is
            already a Dict. It will be added under the key "previous_reward". Default: True.
        weight_pg (float): See IMPALALossFunction Component.
        weight_baseline (float): See IMPALALossFunction Component.
        weight_entropy (float): See IMPALALossFunction Component.
        worker_sample_size (int): How many steps the actor will perform in the environment each sample-run.

    Keyword Args:
        type (str): One of "single", "actor" or "learner". Default: "single".
    """
    type_ = kwargs.pop("type", "single")
    assert type_ in ["single", "actor", "learner"]
    self.type = type_
    self.worker_sample_size = worker_sample_size

    # Network-spec by default is a "large architecture" IMPALA network.
    self.network_spec = kwargs.pop(
        "network_spec",
        dict(type="rlgraph.components.neural_networks.impala.impala_networks.{}IMPALANetwork".format(
            "Large" if architecture == "large" else "Small"
        ))
    )
    # IMPALA networks: 1 step per call for actors, `worker_sample_size` + 1 otherwise (used as defaults).
    if isinstance(self.network_spec, dict) and "type" in self.network_spec and \
            "IMPALANetwork" in self.network_spec["type"]:
        self.network_spec = default_dict(
            self.network_spec,
            dict(worker_sample_size=1 if self.type == "actor" else self.worker_sample_size + 1)
        )

    # Depending on the job-type, remove the pieces from the Agent-spec/graph we won't need.
    self.exploration_spec = kwargs.pop("exploration_spec", None)
    optimizer_spec = kwargs.pop("optimizer_spec", None)
    observe_spec = kwargs.pop("observe_spec", None)

    self.feed_previous_action_through_nn = feed_previous_action_through_nn
    self.feed_previous_reward_through_nn = feed_previous_reward_through_nn

    # Run everything in a single process.
    if self.type == "single":
        environment_spec = environment_spec or self.default_environment_spec
        update_spec = kwargs.pop("update_spec", None)
    # Actors won't need to learn (no optimizer needed in graph).
    elif self.type == "actor":
        optimizer_spec = None
        update_spec = kwargs.pop("update_spec", dict(do_updates=False))
        environment_spec = environment_spec or self.default_environment_spec
    # Learners won't need to explore (act) or observe (insert into Queue).
    else:
        observe_spec = None
        update_spec = kwargs.pop("update_spec", None)
        environment_spec = None

    # Add previous-action/reward preprocessors to env-specific preprocessor spec.
    # TODO: remove this empty hard-coded preprocessor.
    # Note: The default below is always evaluated, so `action_space` must be present in kwargs.
    self.preprocessing_spec = kwargs.pop(
        "preprocessing_spec",
        dict(
            type="dict-preprocessor-stack",
            preprocessors=dict(
                # Flatten actions.
                previous_action=[
                    dict(type="reshape", flatten=True,
                         flatten_categories=kwargs.get("action_space").num_categories)
                ],
                # Bump reward up to shape (1,), so that it can be concatenated by the Concat layer.
                previous_reward=[
                    dict(type="reshape", new_shape=(1, ))
                ]
            )
        )
    )

    # Limit communication in distributed mode between each actor and the learner (never between actors).
    execution_spec = kwargs.pop("execution_spec", None)
    if execution_spec is not None and execution_spec.get("mode") == "distributed":
        default_dict(
            execution_spec["session_config"],
            dict(
                type="monitored-training-session",
                allow_soft_placement=True,
                device_filters=["/job:learner/task:0"] + (
                    ["/job:actor/task:{}".format(execution_spec["distributed_spec"]["task_index"])]
                    if self.type == "actor" else ["/job:learner/task:0"]
                )
            )
        )
        # If Actor, make non-chief in either case (even if task idx == 0).
        if self.type == "actor":
            execution_spec["distributed_spec"]["is_chief"] = False
            # Hard-set device to the CPU for actors.
            execution_spec["device_strategy"] = "custom"
            execution_spec["default_device"] = "/job:{}/task:{}/cpu".format(
                self.type, execution_spec["distributed_spec"]["task_index"]
            )

    self.policy_spec = kwargs.pop("policy_spec", dict())
    # TODO: Create some auto-setting based on LSTM inside the NN.
    default_dict(
        self.policy_spec,
        dict(
            type="shared-value-function-policy",
            deterministic=False,
            reuse_variable_scope="shared-policy",
            action_space=kwargs.get("action_space")
        )
    )

    # Now that we fixed the Agent's spec, call the super constructor.
    super(IMPALAAgent, self).__init__(
        discount=discount,
        preprocessing_spec=self.preprocessing_spec,
        network_spec=self.network_spec,
        policy_spec=self.policy_spec,
        exploration_spec=self.exploration_spec,
        optimizer_spec=optimizer_spec,
        observe_spec=observe_spec,
        update_spec=update_spec,
        execution_spec=execution_spec,
        name=kwargs.pop("name", "impala-{}-agent".format(self.type)),
        **kwargs
    )

    # Always use 1st learner as the parameter server for all policy variables.
    if self.execution_spec["mode"] == "distributed" and \
            self.execution_spec["distributed_spec"]["cluster_spec"]:
        self.policy.propagate_sub_component_properties(
            dict(device=dict(variables="/job:learner/task:0/cpu"))
        )

    # Check whether we have an RNN.
    self.has_rnn = self.policy.neural_network.has_rnn()
    # Check, whether we are running with GPU.
    self.has_gpu = self.execution_spec["gpu_spec"]["gpus_enabled"] is True and \
        self.execution_spec["gpu_spec"]["num_gpus"] > 0

    # Some FIFO-queue specs.
    self.fifo_queue_keys = ["terminals", "states"] + \
        (["actions"] if not self.feed_previous_action_through_nn else []) + \
        (["rewards"] if not self.feed_previous_reward_through_nn else []) + \
        ["action_probs"] + \
        (["initial_internal_states"] if self.has_rnn else [])

    # Define FIFO record space.
    # Note that only states and internal_states (RNN) contain num-steps+1 items, all other sub-records only contain
    # num-steps items.
    self.fifo_record_space = Dict(
        {
            "terminals": bool,
            "action_probs": FloatBox(shape=(self.action_space.num_categories, )),
        },
        add_batch_rank=False,
        add_time_rank=self.worker_sample_size
    )
    self.fifo_record_space["states"] = self.state_space.with_time_rank(self.worker_sample_size + 1)
    # Add action and rewards to state or do they have an extra channel?
    if self.feed_previous_action_through_nn:
        self.fifo_record_space["states"]["previous_action"] = \
            self.action_space.with_time_rank(self.worker_sample_size + 1)
    else:
        self.fifo_record_space["actions"] = self.action_space.with_time_rank(self.worker_sample_size)
    # Keyed on the *reward* flag (not the action flag), so that the record space always agrees with
    # the "rewards" entry of `self.fifo_queue_keys` above.
    if self.feed_previous_reward_through_nn:
        self.fifo_record_space["states"]["previous_reward"] = FloatBox(
            add_time_rank=self.worker_sample_size + 1
        )
    else:
        self.fifo_record_space["rewards"] = FloatBox(add_time_rank=self.worker_sample_size)

    if self.has_rnn:
        self.fifo_record_space["initial_internal_states"] = \
            self.internal_states_space.with_time_rank(False)

    # Create our FIFOQueue (actors will enqueue, learner(s) will dequeue).
    self.fifo_queue = FIFOQueue.from_spec(
        fifo_queue_spec or dict(capacity=1),
        reuse_variable_scope="shared-fifo-queue",
        only_insert_single_records=True,
        record_space=self.fifo_record_space,
        device="/job:learner/task:0/cpu" if self.execution_spec["mode"] == "distributed" and
        self.execution_spec["distributed_spec"]["cluster_spec"] else None
    )

    # Remove `states` key from input_spaces: not needed.
    del self.input_spaces["states"]

    # Add all our sub-components to the core.
    if self.type == "single":
        # Nothing is added here for "single" (no `sub_components` are defined in this branch).
        pass

    elif self.type == "actor":
        # No learning, no loss function.
        self.loss_function = None
        # A Dict Splitter to split things from the EnvStepper.
        self.env_output_splitter = ContainerSplitter(tuple_length=4, scope="env-output-splitter")

        self.states_dict_splitter = None

        # Slice some data from the EnvStepper (e.g only first internal states are needed).
        self.internal_states_slicer = Slice(scope="internal-states-slicer", squeeze=True)
        # Merge back to insert into FIFO.
        self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)

        # Dummy Flattener to calculate action-probs space.
        dummy_flattener = ReShape(flatten=True, flatten_categories=self.action_space.num_categories)
        self.environment_stepper = EnvironmentStepper(
            environment_spec=environment_spec,
            actor_component_spec=ActorComponent(self.preprocessor, self.policy, self.exploration),
            state_space=self.state_space.with_batch_rank(),
            reward_space=float,  # TODO <- float64 for deepmind? may not work for other envs
            internal_states_space=self.internal_states_space,
            num_steps=self.worker_sample_size,
            add_previous_action_to_state=True,
            add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=dummy_flattener.get_preprocessed_space(self.action_space)
        )
        sub_components = [
            self.environment_stepper, self.env_output_splitter,
            self.internal_states_slicer, self.fifo_input_merger,
            self.fifo_queue
        ]
    # Learner.
    else:
        self.environment_stepper = None

        # A Dict splitter to split up items from the queue.
        self.fifo_input_merger = None
        self.fifo_output_splitter = ContainerSplitter(*self.fifo_queue_keys, scope="fifo-output-splitter")
        self.states_dict_splitter = ContainerSplitter(
            *list(self.fifo_record_space["states"].keys()), scope="states-dict-splitter"
        )
        self.internal_states_slicer = None

        self.transposer = Transpose(scope="transposer", device=dict(ops="/job:learner/task:0/cpu"))
        self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

        # Create an IMPALALossFunction with some parameters.
        self.loss_function = IMPALALossFunction(
            discount=self.discount,
            weight_pg=weight_pg,
            weight_baseline=weight_baseline,
            weight_entropy=weight_entropy,
            slice_actions=self.feed_previous_action_through_nn,
            slice_rewards=self.feed_previous_reward_through_nn,
            device="/job:learner/task:0/gpu"
        )

        # Policy variables are placed on the learner's CPU, its ops on the learner's GPU.
        self.policy.propagate_sub_component_properties(
            dict(device=dict(variables="/job:learner/task:0/cpu", ops="/job:learner/task:0/gpu"))
        )
        for component in [self.staging_area, self.preprocessor, self.optimizer]:
            component.propagate_sub_component_properties(
                dict(device="/job:learner/task:0/gpu")
            )

        sub_components = [
            self.fifo_output_splitter, self.fifo_queue, self.states_dict_splitter,
            self.transposer, self.staging_area, self.preprocessor, self.policy,
            self.loss_function, self.optimizer
        ]

    if self.type != "single":
        # Add all the agent's sub-components to the root.
        self.root_component.add_components(*sub_components)

        # Define the Agent's (root Component's) API.
        self.define_graph_api(*sub_components)

    if self.type != "single" and self.auto_build:
        if self.type == "learner":
            build_options = dict(
                build_device_context="/job:learner/task:0/cpu",
                pin_global_variable_device="/job:learner/task:0/cpu"
            )
            self._build_graph([self.root_component], self.input_spaces, optimizer=self.optimizer,
                              build_options=build_options)
        else:
            self._build_graph([self.root_component], self.input_spaces, optimizer=self.optimizer,
                              build_options=None)

        self.graph_built = True

        # NOTE(review): nesting of the two blocks below under the build branch was reconstructed from
        # the data dependencies (ops are looked up on the built root component) - confirm.
        if self.has_gpu:
            # Get 1st return op of API-method `stage` of sub-component `staging-area` (which is the stage-op).
            self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                out_op_columns[0].op_records[0].op
            # Initialize the stage.
            self.graph_executor.monitored_session.run_step_fn(
                lambda step_context: step_context.session.run(self.stage_op)
            )

            # TODO remove after full refactor.
            self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
                out_op_columns[0].op_records[0].op
        if self.type == "actor":
            self.enqueue_op = self.root_component.sub_components["fifo-queue"].api_methods["insert_records"]. \
                out_op_columns[0].op_records[0].op
def test_environment_stepper_on_deepmind_lab(self):
    """
    Runs an EnvironmentStepper (LSTM network, epsilon exploration) on a DeepMind Lab level for
    10 `step` calls of 1000 env-steps each and checks dtypes/value ranges of the last call's results.

    Reported as skipped (instead of silently passing) if DeepMind Lab cannot be imported.
    """
    try:
        # The import only serves as an availability check.
        from rlgraph.environments.deepmind_lab import DeepmindLabEnv  # noqa: F401
    except ImportError:
        self.skipTest("DeepmindLab not installed: Skipping this test case.")

    env_spec = dict(
        type="deepmind_lab", level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED"], frameskip=4
    )
    # Throw-away Env, only used to look up the state- and action spaces.
    dummy_env = Environment.from_spec(env_spec)
    state_space = dummy_env.state_space
    action_space = dummy_env.action_space
    actor_component = ActorComponent(
        # Preprocessor spec (only divide and flatten the image).
        [
            {"type": "divide", "divisor": 255},
            {"type": "reshape", "flatten": True}
        ],
        # Policy spec.
        dict(network_spec="../configs/test_lstm_nn.json", action_space=action_space),
        # Exploration spec.
        Exploration(epsilon_spec=dict(decay_spec=dict(
            type="linear_decay", from_=1.0, to_=0.1, start_timestep=0, num_timesteps=100
        )))
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=env_spec,
        actor_component_spec=actor_component,
        state_space=state_space,
        reward_space="float32",
        internal_states_space=self.internal_states_space_test_lstm,
        num_steps=1000,
        # Add both prev-action and -reward into the state sent through the network.
        #add_previous_action_to_state=True,
        #add_previous_reward_to_state=True,
        add_action_probs=True,
        action_probs_space=FloatBox(shape=(9, ), add_batch_rank=True)
    )

    test = ComponentTest(
        component=environment_stepper,
        action_space=action_space,
    )
    # Reset the stepper.
    test.test("reset")

    # Step n times through the Env and collect results (only the last call's results are checked).
    # 1st return value is the step-op (None), 2nd return value is the tuple of step results.
    time_start = time.monotonic()
    steps = 10
    out = None
    for _ in range(steps):
        out = test.test("step")
    time_total = time.monotonic() - time_start
    print(
        "Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec. ({} actions/sec)".format(
            steps, environment_stepper.num_steps, time_total,
            environment_stepper.num_steps * steps / time_total
        )
    )

    # Check types of outputs.
    self.assertIsNone(out[0])
    self.assertIsInstance(out[1], DataOpTuple)  # the step results as a tuple (see below)

    # Check types of single data.
    #self.assertTrue(out[0].dtype == np.float32)
    #self.assertTrue(out[0].min() >= 0.0)  # make sure we have pixels / 255
    #self.assertTrue(out[0].max() <= 1.0)
    #self.assertTrue(out[1].dtype == np.int32)  # actions
    #self.assertTrue(out[2].dtype == np.float32)  # rewards
    #self.assertTrue(out[0].dtype == np.float32)  # episode return
    self.assertEqual(out[1][0].dtype, np.bool_)  # next-state is terminal?
    self.assertEqual(out[1][1].dtype, np.uint8)  # next state (raw, not preprocessed)
    self.assertGreaterEqual(out[1][1].min(), 0)  # make sure we have pixels
    self.assertLessEqual(out[1][1].max(), 255)
    # action probs (test whether sum to one).
    #self.assertTrue(out[1][6].dtype == np.float32)
    #self.assertTrue(out[1][6].min() >= 0.0)
    #self.assertTrue(out[1][6].max() <= 1.0)
    #recursive_assert_almost_equal(out[1][6].sum(axis=-1, keepdims=False),
    #                              np.ones(shape=(environment_stepper.num_steps,)), decimals=4)
    # internal states (c- and h-state)
    # NOTE(review): these index `out[3]` although the results tuple is `out[1]` (see checks above);
    # presumably `out[1][3]` is meant - confirm against the stepper's return structure.
    self.assertEqual(out[3][0].dtype, np.float32)
    self.assertEqual(out[3][1].dtype, np.float32)
    self.assertEqual(out[3][0].shape, (environment_stepper.num_steps, 3))
    self.assertEqual(out[3][1].shape, (environment_stepper.num_steps, 3))

    # Check whether episode returns match single rewards (including terminal signals).
    #episode_returns = 0.0
    #for i in range(environment_stepper.num_steps):
    #    episode_returns += out[0][i]
    #    self.assertAlmostEqual(episode_returns, out[3][i])
    #    # Terminal: Reset for next step.
    #    if out[4][i] is np.bool_(True):
    #        episode_returns = 0.0

    test.terminate()
def test_large_impala_actor_component_without_agent(self):
    """
    Creates a large IMPALA architecture network inside a policy inside an actor component and runs a few
    input samples through it.
    """
    batch_size = 4
    time_steps = 1

    # IMPALA uses a baseline action adapter (v-trace off-policy PG with baseline value function).
    policy = SharedValueFunctionPolicy(
        LargeIMPALANetwork(worker_sample_size=time_steps),
        action_space=self.action_space,
        deterministic=False
    )
    # No preprocessing and no exploration: States are expected to pass through unchanged.
    actor_component = ActorComponent(preprocessor_spec=None, policy_spec=policy, exploration_spec=None)

    test = ComponentTest(
        actor_component,
        input_spaces=dict(
            states=self.input_space,
            internal_states=self.internal_states_space
        ),
        action_space=self.action_space,
        execution_spec=dict(disable_monitoring=True)
    )

    # Send a sample through the network (sequence-length (time-rank) x batch-size).
    nn_dict_input = self.input_space.sample(size=(time_steps, batch_size))
    initial_internal_states = self.internal_states_space.zeros(size=batch_size)
    expected = None
    out = test.test(
        ("get_preprocessed_state_and_action", [nn_dict_input, initial_internal_states]),
        expected_outputs=expected
    )
    print("First action: {}".format(out["action"]))
    # `assertEqual` instead of the deprecated alias `assertEquals` (removed in Python 3.12).
    self.assertEqual(out["action"].shape, (time_steps, batch_size))
    self.assertEqual(out["last_internal_states"][0].shape, (batch_size, 256))
    self.assertEqual(out["last_internal_states"][1].shape, (batch_size, 256))
    # Check preprocessed state (all channels must equal the raw input).
    recursive_assert_almost_equal(
        out["preprocessed_state"],
        dict(
            RGB_INTERLEAVED=nn_dict_input["RGB_INTERLEAVED"],
            INSTR=nn_dict_input["INSTR"],
            previous_action=nn_dict_input["previous_action"],
            previous_reward=nn_dict_input["previous_reward"],
        )
    )

    # Send another sample through the network using the previous internal-state.
    next_nn_input = self.input_space.sample(size=(time_steps, batch_size))
    expected = None
    out = test.test(
        ("get_preprocessed_state_and_action", [next_nn_input, out["last_internal_states"]]),
        expected_outputs=expected
    )
    print("Second action: {}".format(out["action"]))
    self.assertEqual(out["action"].shape, (time_steps, batch_size))
    self.assertEqual(out["last_internal_states"][0].shape, (batch_size, 256))
    self.assertEqual(out["last_internal_states"][1].shape, (batch_size, 256))
    # Check preprocessed state (again, all channels must equal the raw input).
    recursive_assert_almost_equal(
        out["preprocessed_state"],
        dict(
            RGB_INTERLEAVED=next_nn_input["RGB_INTERLEAVED"],
            INSTR=next_nn_input["INSTR"],
            previous_action=next_nn_input["previous_action"],
            previous_reward=next_nn_input["previous_reward"],
        )
    )

    test.terminate()
def test_environment_stepper_on_pong(self):
    """
    Steps an EnvironmentStepper (actor component configured from the DQN-for-Pong config) through
    Pong-v0 for `self.time_steps` steps and checks dtypes and value ranges of the returned
    terminals, raw next-states and rewards.
    """
    environment_spec = dict(type="openai_gym", gym_env="Pong-v0", frameskip=4, seed=10)
    # Throw-away Env, only used to look up the state- and action spaces.
    dummy_env = Environment.from_spec(environment_spec)
    state_space = dummy_env.state_space
    action_space = dummy_env.action_space

    # Assemble the actor component's three specs from the agent config.
    agent_config = config_from_path("configs/dqn_agent_for_pong.json")
    preprocessor_spec = agent_config["preprocessing_spec"]
    policy_spec = dict(
        network_spec=agent_config["network_spec"],
        action_adapter_spec=agent_config["action_adapter_spec"],
        action_space=action_space
    )
    exploration_spec = agent_config["exploration_spec"]
    actor_component = ActorComponent(preprocessor_spec, policy_spec, exploration_spec)

    environment_stepper = EnvironmentStepper(
        environment_spec=environment_spec,
        actor_component_spec=actor_component,
        state_space=state_space,
        reward_space="float",
        add_reward=True,
        num_steps=self.time_steps
    )

    test = ComponentTest(component=environment_stepper, action_space=action_space)

    # Reset the stepper, then time one `step` call (`num_steps` env-steps).
    # 1st return value is the step-op (None), 2nd return value is the tuple of step results.
    test.test("reset")
    time_start = time.monotonic()
    out = test.test("step")
    time_end = time.monotonic()
    elapsed = time_end - time_start
    print("Done running {} steps in env-stepper env in {}sec.".format(
        environment_stepper.num_steps, elapsed
    ))

    # Check types of outputs.
    self.assertTrue(out[0] is None)
    step_results = out[1]
    self.assertTrue(isinstance(step_results, DataOpTuple))

    # Terminal signals.
    terminals = step_results[0]
    self.assertTrue(terminals.dtype == np.bool_)

    # Next states (raw, not preprocessed): Must be pixel values.
    next_states = step_results[1]
    self.assertTrue(next_states.dtype == np.uint8)
    self.assertTrue(next_states.min() >= 0)
    self.assertTrue(next_states.max() <= 255)

    # Rewards: Within -1.0 to 1.0.
    rewards = step_results[2]
    self.assertTrue(rewards.dtype == np.float32)
    self.assertTrue(rewards.min() >= -1.0)
    self.assertTrue(rewards.max() <= 1.0)

    # Currently not checked: preprocessed states, actions and episode returns (incl. whether the
    # accumulated episode returns match the single rewards and are reset after each terminal signal).

    # Make sure we close the session (to shut down the Env on the server).
    test.terminate()
def test_environment_stepper_component_with_large_impala_architecture(self):
    """
    Runs an EnvironmentStepper with a large IMPALA network on a DeepMind Lab level (previous action and
    reward fed back into the state) for 10 `step` calls and checks dtypes, shapes and value ranges of
    the last call's results.

    Reported as skipped (instead of silently passing) if DeepMind Lab cannot be imported.
    """
    try:
        from rlgraph.environments.deepmind_lab import DeepmindLabEnv
    except ImportError:
        self.skipTest("DeepmindLab not installed: Skipping this test case.")

    worker_sample_size = 100
    env_spec = dict(
        type="deepmind_lab", level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"],
        frameskip=4
    )
    # Throw-away Env, only used to look up the state- and action spaces.
    dummy_env = DeepmindLabEnv.from_spec(env_spec)
    state_space = dummy_env.state_space
    action_space = dummy_env.action_space
    actor_component = ActorComponent(
        # Preprocessor spec (only for the prev-action and prev-reward channels).
        dict(
            type="dict-preprocessor-stack",
            preprocessors=dict(
                # The prev. action/reward from the env must be flattened/bumped-up-to-(1,).
                previous_action=[
                    dict(type="reshape", flatten=True, flatten_categories=action_space.num_categories)
                ],
                previous_reward=[
                    dict(type="reshape", new_shape=(1, )),
                    dict(type="convert_type", to_dtype="float32")
                ],
            )
        ),
        # Policy spec. worker_sample_size=1 as its an actor network.
        dict(network_spec=LargeIMPALANetwork(worker_sample_size=1), action_space=action_space)
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=env_spec,
        actor_component_spec=actor_component,
        state_space=state_space,
        reward_space="float32",
        internal_states_space=self.internal_states_space,
        num_steps=worker_sample_size,
        # Add both prev-action and -reward into the state sent through the network.
        add_previous_action_to_state=True,
        add_previous_reward_to_state=True,
        add_action_probs=True,
        action_probs_space=self.action_probs_space
    )

    test = ComponentTest(
        component=environment_stepper,
        action_space=action_space,
        execution_spec=dict(disable_monitoring=True)
    )

    environment_stepper.environment_server.start_server()
    # Make sure server and session are shut down even if a step or an assertion below fails.
    try:
        # Step n times through the Env and collect results (only the last call's results are checked).
        # `out` is the tuple of step results: terminals, states (dict), action probs, internal states.
        time_start = time.perf_counter()
        steps = 10
        for _ in range(steps):
            out = test.test("step")
        time_total = time.perf_counter() - time_start
        print(
            "Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec ({} actions/sec).".format(
                steps, environment_stepper.num_steps, time_total,
                environment_stepper.num_steps * steps / time_total
            )
        )

        # Check types of outputs.
        self.assertTrue(isinstance(out, DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        self.assertTrue(out[0].dtype == np.bool_)  # next-state is terminal?
        # Builtin `object` instead of the alias `np.object` (removed in NumPy 1.24).
        self.assertTrue(out[1]["INSTR"].dtype == object)
        self.assertTrue(out[1]["RGB_INTERLEAVED"].dtype == np.uint8)
        self.assertTrue(
            out[1]["RGB_INTERLEAVED"].shape == (worker_sample_size + 1, ) + state_space["RGB_INTERLEAVED"].shape
        )
        self.assertTrue(out[1]["RGB_INTERLEAVED"].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[1]["RGB_INTERLEAVED"].max() <= 255)
        self.assertTrue(out[1]["previous_action"].dtype == np.int32)  # actions
        self.assertTrue(out[1]["previous_action"].shape == (worker_sample_size + 1, ))
        self.assertTrue(out[1]["previous_reward"].dtype == np.float32)  # rewards
        self.assertTrue(out[1]["previous_reward"].shape == (worker_sample_size + 1, ))
        # action probs (test whether sum to one).
        self.assertTrue(out[2].dtype == np.float32)
        self.assertTrue(out[2].shape == (worker_sample_size, action_space.num_categories))
        self.assertTrue(out[2].min() >= 0.0)
        self.assertTrue(out[2].max() <= 1.0)
        recursive_assert_almost_equal(
            out[2].sum(axis=-1, keepdims=False), np.ones(shape=(worker_sample_size, )), decimals=4
        )
        # internal states (c- and h-state)
        self.assertTrue(out[3][0].dtype == np.float32)
        self.assertTrue(out[3][0].shape == (worker_sample_size + 1, 256))
        self.assertTrue(out[3][1].dtype == np.float32)
        self.assertTrue(out[3][1].shape == (worker_sample_size + 1, 256))
    finally:
        environment_stepper.environment_server.stop_server()
        test.terminate()
def test_large_impala_actor_component_without_agent(self):
    """
    Creates a large IMPALA architecture network inside a policy inside an actor component and runs a few input
    samples through it.

    Three passes are made, each re-using the internal states returned by the previous one:
    two single-time-step passes followed by one pass over 20 time-steps. After every pass the shapes of the
    returned actions and internal states as well as the preprocessed states are checked.
    """
    batch_size = 4

    # Use IMPALA paper's preprocessor of division by 255 (only for the Image).
    preprocessor_spec_for_actor_component = dict(
        type="dict-preprocessor-stack",
        preprocessors=dict(RGB_INTERLEAVED=[dict(type="divide", divisor=255)])
    )
    # IMPALA uses a baseline action adapter (v-trace off-policy PG with baseline value function).
    policy = Policy(
        LargeIMPALANetwork(),
        action_space=self.action_space,
        action_adapter_spec=dict(type="baseline_action_adapter")
    )
    exploration = Exploration(epsilon_spec=dict(decay_spec=dict(
        type="linear_decay", from_=1.0, to_=0.1, start_timestep=0, num_timesteps=100
    )))
    actor_component = ActorComponent(preprocessor_spec_for_actor_component, policy, exploration)

    test = ComponentTest(
        actor_component,
        input_spaces=dict(states=self.input_space, internal_states=self.internal_states_space),
        action_space=self.action_space
    )

    def run_and_check(message, nn_input, internal_states, time_steps):
        """Sends `nn_input` through the actor component, checks all outputs, returns the new internal states."""
        preprocessed_states, actions, new_internal_states = test.test(
            ("get_preprocessed_state_and_action", [nn_input, internal_states]), expected_outputs=None
        )
        print(message.format(actions))
        self.assertEqual(actions.shape, (batch_size, time_steps))
        self.assertEqual(new_internal_states[0].shape, (batch_size, 256))
        self.assertEqual(new_internal_states[1].shape, (batch_size, 256))
        # Check preprocessed state (all the same except 'image' channel, which gets divided by 255).
        recursive_assert_almost_equal(
            preprocessed_states,
            dict(
                RGB_INTERLEAVED=nn_input["RGB_INTERLEAVED"] / 255,
                INSTR=nn_input["INSTR"],
                previous_action=nn_input["previous_action"],
                previous_reward=nn_input["previous_reward"],
            )
        )
        return new_internal_states

    # Close the session even if one of the checks fails.
    try:
        # Send a sample through the network (batch-size x 1 time-step), starting from zero internal states.
        nn_dict_input = self.input_space.sample(size=(batch_size, 1))
        initial_internal_states = self.internal_states_space.zeros(size=batch_size)
        last_internal_states = run_and_check("First action: {}", nn_dict_input, initial_internal_states, 1)

        # Send another single-time-step sample through the network using the previous internal-state.
        next_nn_input = self.input_space.sample(size=(batch_size, 1))
        last_internal_states = run_and_check("Second action: {}", next_nn_input, last_internal_states, 1)

        # Send batch x time states through the network to simulate agent-type=learner behavior.
        # time-steps=20, batch=batch_size (must match last-internal-states).
        next_nn_input = self.input_space.sample(size=(batch_size, 20))
        run_and_check("Actions 3 to 22: {}", next_nn_input, last_internal_states, 20)
    finally:
        test.terminate()
def test_environment_stepper_component_with_large_impala_architecture(self):
    """
    Resets and steps an EnvironmentStepper (DeepmindLab env + large IMPALA network) and checks the results.

    Calls "step" 10 times (100 env steps per call), prints the measured throughput and then verifies dtypes,
    value ranges and shapes of the last step result, including that the reported episode returns match the
    accumulated single rewards.

    NOTE(review): another method with this exact name is defined earlier in this file. If both belong to the
    same class, this later definition replaces the earlier one.
    """
    env_spec = dict(
        type="deepmind_lab", level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"],
        frameskip=4
    )
    # The dummy env is only used to obtain the state- and action-spaces.
    dummy_env = Environment.from_spec(env_spec)
    state_space = dummy_env.state_space
    action_space = dummy_env.action_space

    actor_component = ActorComponent(
        # Preprocessor spec (only for prev-action and prev-reward channels).
        # NOTE(review): no divide-by-255 preprocessor is configured for RGB_INTERLEAVED here, yet the checks
        # below expect float32 pixels in [0, 1] for the preprocessed state - TODO confirm.
        dict(
            type="dict-preprocessor-stack",
            preprocessors=dict(
                # The prev. action/reward from the env must be flattened/bumped-up-to-(1,).
                previous_action=[
                    dict(type="reshape", flatten=True, flatten_categories=action_space.num_categories)
                ],
                previous_reward=[
                    dict(type="reshape", new_shape=(1,)),
                    dict(type="convert_type", to_dtype="float32")
                ],
            )
        ),
        # Policy spec.
        dict(network_spec=LargeIMPALANetwork(), action_space=action_space),
        # Exploration spec.
        Exploration(epsilon_spec=dict(decay_spec=dict(
            type="linear_decay", from_=1.0, to_=0.1, start_timestep=0, num_timesteps=100
        )))
    )

    environment_stepper = EnvironmentStepper(
        environment_spec=env_spec,
        actor_component_spec=actor_component,
        state_space=state_space,
        reward_space="float32",
        internal_states_space=self.internal_states_space,
        num_steps=100,
        # Add both prev-action and -reward into the state sent through the network.
        add_previous_action_to_state=True,
        add_previous_reward_to_state=True,
        add_action_probs=True,
        action_probs_space=self.action_probs_space
    )

    test = ComponentTest(component=environment_stepper, action_space=action_space)

    # Make sure the session is closed even if one of the checks below fails.
    try:
        # Reset the stepper.
        test.test("reset")

        # Step n times through the Env and time it. Only the result of the last call is inspected.
        time_start = time.perf_counter()
        steps = 10
        for _ in range(steps):
            out = test.test("step")
        time_total = time.perf_counter() - time_start
        print(
            "Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec ({} actions/sec)."
            .format(steps, environment_stepper.num_steps, time_total,
                    environment_stepper.num_steps * steps / time_total)
        )

        # Layout of `out` as checked below:
        # [0]=preprocessed states, [1]=actions, [2]=rewards, [3]=episode returns, [4]=terminals,
        # [5]=(raw) next-states, [6]=action probs, [7]=internal states.
        self.assertIsInstance(out, DataOpTuple)

        # Preprocessed states.
        self.assertEqual(out[0]["INSTR"].dtype, np.object_)
        self.assertEqual(out[0]["RGB_INTERLEAVED"].dtype, np.float32)
        # Make sure we have pixels / 255.
        self.assertGreaterEqual(out[0]["RGB_INTERLEAVED"].min(), 0.0)
        self.assertLessEqual(out[0]["RGB_INTERLEAVED"].max(), 1.0)

        self.assertEqual(out[1].dtype, np.int32)  # actions
        self.assertEqual(out[2].dtype, np.float32)  # rewards
        self.assertEqual(out[3].dtype, np.float32)  # episode return
        self.assertEqual(out[4].dtype, np.bool_)  # next-state is terminal?

        # Next states (raw, not preprocessed).
        self.assertEqual(out[5]["INSTR"].dtype, np.object_)
        self.assertEqual(out[5]["RGB_INTERLEAVED"].dtype, np.uint8)
        # Make sure we have pixels.
        self.assertGreaterEqual(out[5]["RGB_INTERLEAVED"].min(), 0)
        self.assertLessEqual(out[5]["RGB_INTERLEAVED"].max(), 255)

        # Action probs (test whether sum to one).
        self.assertEqual(out[6].dtype, np.float32)
        self.assertGreaterEqual(out[6].min(), 0.0)
        self.assertLessEqual(out[6].max(), 1.0)
        recursive_assert_almost_equal(
            out[6].sum(axis=-1, keepdims=False), np.ones(shape=(environment_stepper.num_steps,)), decimals=4
        )

        # Internal states (c- and h-state).
        self.assertEqual(out[7][0].dtype, np.float32)
        self.assertEqual(out[7][1].dtype, np.float32)
        self.assertEqual(out[7][0].shape, (environment_stepper.num_steps, 256))
        self.assertEqual(out[7][1].shape, (environment_stepper.num_steps, 256))

        # Check whether episode returns match single rewards (including terminal signals).
        episode_returns = 0.0
        for i in range(environment_stepper.num_steps):
            episode_returns += out[2][i]
            self.assertAlmostEqual(episode_returns, out[3][i])
            # Terminal: Reset for next step.
            # Plain truthiness instead of an identity check against a freshly built np.bool_.
            if out[4][i]:
                episode_returns = 0.0
    finally:
        test.terminate()