Example #1
    def test_environment_stepper_on_deterministic_env_with_returning_action_probs(self):
        preprocessor_spec = [dict(type="divide", divisor=2)]
        network_spec = config_from_path("configs/test_simple_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
            exploration_spec
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env", steps_to_terminal=6),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            add_action_probs=True,
            action_probs_space=self.deterministic_action_probs_space,
            num_steps=3
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        weights = test.read_variable_values(environment_stepper.actor_component.policy.variable_registry)
        policy_scope = "environment-stepper/actor-component/policy/"
        weights_hid = weights[policy_scope+"test-network/hidden-layer/dense/kernel"]
        biases_hid = weights[policy_scope+"test-network/hidden-layer/dense/bias"]
        weights_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/kernel"]
        biases_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/bias"]

        # Step 3 times through the Env and collect results.
        expected = (
            # t_
            np.array([False, False, False]),
            # s' (raw)
            np.array([[0.0], [1.0], [2.0], [3.0]]),
            # action probs
            np.array([
                softmax(dense_layer(dense_layer(np.array([0.0]), weights_hid, biases_hid), weights_action, biases_action)),
                softmax(dense_layer(dense_layer(np.array([0.5]), weights_hid, biases_hid), weights_action, biases_action)),
                softmax(dense_layer(dense_layer(np.array([1.0]), weights_hid, biases_hid), weights_action, biases_action))
            ])
        )
        test.test("step", expected_outputs=expected, decimals=3)

        # Step again; check whether the stitching of states etc. works.
        expected = (
            np.array([False, False, True]),
            np.array([[3.0], [4.0], [5.0], [0.0]]),  # s' (raw)
            np.array([
                softmax(dense_layer(dense_layer(np.array([1.5]), weights_hid, biases_hid), weights_action, biases_action)),
                softmax(dense_layer(dense_layer(np.array([2.0]), weights_hid, biases_hid), weights_action, biases_action)),
                softmax(dense_layer(dense_layer(np.array([2.5]), weights_hid, biases_hid), weights_action, biases_action))
            ])
        )
        test.test("step", expected_outputs=expected, decimals=3)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
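
For reference, the `dense_layer` and `softmax` helpers used to build the expected outputs above are presumably thin NumPy utilities along the following lines (a sketch only, not the test suite's actual implementation; the layers are treated as purely linear here, mirroring how the later examples compute expected NN outputs by plain matrix multiplication):

import numpy as np

def dense_layer(x, kernel, bias=None):
    # Plain fully-connected forward pass: x @ W (+ b), no activation.
    out = np.matmul(x, kernel)
    return out if bias is None else out + bias

def softmax(x, axis=-1):
    # Numerically stable softmax over the given axis.
    e = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e / np.sum(e, axis=axis, keepdims=True)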
Example #2
    def test_environment_stepper_on_2x2_grid_world(self):
        preprocessor_spec = [dict(
            type="reshape", flatten=True, flatten_categories=self.grid_world_2x2_action_space.num_categories
        )]
        network_spec = config_from_path("configs/test_simple_nn.json")
        # Hand-craft NN weights whose greedy action is "down" in the start state and "right" in state=1 (to reach the goal).
        network_spec["layers"][0]["weights_spec"] = [[0.5, -0.5], [-0.1, 0.1], [-0.2, 0.2], [-0.4, 0.2]]
        network_spec["layers"][0]["biases_spec"] = False
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec, action_adapter_spec=dict(
                weights_spec=[[0.1, -0.5, 0.5, 0.1], [0.4, 0.2, -0.2, 0.2]],
                biases_spec=False
            ), action_space=self.grid_world_2x2_action_space, deterministic=True),
            exploration_spec
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="grid_world", world="2x2"),
            actor_component_spec=actor_component,
            state_space=self.grid_world_2x2_state_space,
            reward_space="float32",
            add_action_probs=True,
            action_probs_space=self.grid_world_2x2_action_probs_space,
            num_steps=5
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.grid_world_2x2_action_space,
        )

        # Step 5 times through the Env and collect results.
        expected = (
            np.array([False, True, False, True, False]),  # t_
            np.array([0, 1, 0, 1, 0, 1]),  # s' (raw)
            np.array([[0.21869287, 0.17905058, 0.36056358, 0.24169299],
                      [0.2547221, 0.2651175, 0.23048209, 0.24967825],
                      [0.21869287, 0.17905058, 0.36056358, 0.24169299],
                      [0.2547221, 0.2651175, 0.23048209, 0.24967825],
                      [0.21869287, 0.17905058, 0.36056358, 0.24169299]], dtype=np.float32)
        )
        out = test.test("step", expected_outputs=expected, decimals=2)
        print(out)

        # Step again; check whether the stitching of states etc. works.
        expected = (
            np.array([True, False, True, False, True]),  # t_
            np.array([1, 0, 1, 0, 1, 0]),  # s' (raw)
            np.array([[0.2547221, 0.2651175, 0.23048209, 0.24967825],
                      [0.21869287, 0.17905058, 0.36056358, 0.24169299],
                      [0.2547221, 0.2651175, 0.23048209, 0.24967825],
                      [0.21869287, 0.17905058, 0.36056358, 0.24169299],
                      [0.2547221, 0.2651175, 0.23048209, 0.24967825]], dtype=np.float32)
        )
        out = test.test("step", expected_outputs=expected)
        print(out)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
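
The hard-coded probability rows above follow directly from the hand-specified weight matrices (no biases): the reshape preprocessor one-hot-encodes the grid-world state, the hidden layer and the action adapter are plain matrix multiplications, and a softmax yields the probabilities. A quick NumPy check reproduces both rows:

import numpy as np
from scipy.special import softmax  # or the test utils' own softmax helper

w_hidden = np.array([[0.5, -0.5], [-0.1, 0.1], [-0.2, 0.2], [-0.4, 0.2]])
w_action = np.array([[0.1, -0.5, 0.5, 0.1], [0.4, 0.2, -0.2, 0.2]])

for state in (0, 1):  # the 2x2 world only ever visits states 0 and 1 here
    one_hot = np.eye(4)[state]
    logits = one_hot @ w_hidden @ w_action
    print(state, softmax(logits))
# 0 -> [0.2187 0.1791 0.3606 0.2417]
# 1 -> [0.2547 0.2651 0.2305 0.2497]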
    def test_compare_with_non_env_stepper(self):
        environment_spec = dict(type="openai_gym", gym_env="Pong-v0", frameskip=4, seed=10)
        dummy_env = Environment.from_spec(environment_spec)
        state_space = dummy_env.state_space.with_batch_rank()
        action_space = dummy_env.action_space
        agent_config = config_from_path("configs/dqn_agent_for_pong.json")
        actor_component = ActorComponent(
            agent_config["preprocessing_spec"],
            dict(
                network_spec=agent_config["network_spec"],
                action_adapter_spec=agent_config["action_adapter_spec"],
                action_space=action_space
            ),
            agent_config["exploration_spec"]
        )
        test = ComponentTest(
            component=actor_component,
            input_spaces=dict(states=state_space),
            action_space=action_space,
        )
        s = dummy_env.reset()
        time_start = time.monotonic()
        for i in range(self.time_steps):
            out = test.test(("get_preprocessed_state_and_action", np.array([s])))
            # preprocessed_s = out["preprocessed_state"]
            a = out["action"]
            # Act in the env (strip the batch rank off the action).
            s, r, t, _ = dummy_env.step(a[0])
            if t is True:
                s = dummy_env.reset()
        time_end = time.monotonic()
        print("Done running {} steps in bare-metal env in {}sec.".format(
            self.time_steps, time_end - time_start
        ))
        test.terminate()
    def test_actor_component_with_dict_preprocessor(self):
        # state_space (a complex Dict Space, that will be partially preprocessed).
        state_space = Dict(
            a=FloatBox(shape=(2,)),
            b=FloatBox(shape=(5,)),
            add_batch_rank=True
        )
        # action_space.
        action_space = IntBox(2, add_batch_rank=True)

        preprocessor_spec = dict(
            type="dict-preprocessor-stack",
            preprocessors=dict(
                a=[
                    dict(type="convert_type", to_dtype="float"),
                    dict(type="multiply", factor=0.5)
                ]
            )
        )
        # Simple custom NN with dict input (splits into 2 streams (simple dense layers) and concats at the end).
        policy = Policy(network_spec=DummyNNWithDictInput(num_units_a=2, num_units_b=3, scope="dummy-nn"),
                        action_space=action_space)
        exploration = None  # no exploration

        actor_component = ActorComponent(preprocessor_spec, policy, exploration)

        test = ComponentTest(
            component=actor_component,
            input_spaces=dict(states=state_space),
            action_space=action_space
        )

        # Some state inputs (batch size=4).
        states = state_space.sample(size=4)
        # Get and check some actions.
        actor_component_params = test.read_variable_values(actor_component.variable_registry)
        # Expected NN-output.
        expected_nn_output_stream_a = np.matmul(
            states["a"] * 0.5, actor_component_params["actor-component/policy/dummy-nn/dense-a/dense/kernel"]
        )
        expected_nn_output_stream_b = np.matmul(
            states["b"], actor_component_params["actor-component/policy/dummy-nn/dense-b/dense/kernel"]
        )
        expected_nn_output = np.concatenate((expected_nn_output_stream_a, expected_nn_output_stream_b), axis=-1)

        # Raw action layer output.
        expected_action_layer_output = np.matmul(
            expected_nn_output,
            actor_component_params["actor-component/policy/action-adapter-0/action-network/action-layer/dense/kernel"]
        )
        # Final actions (max-likelihood/greedy pick).
        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        expected_preprocessed_state = dict(a=states["a"] * 0.5, b=states["b"])
        test.test(
            ("get_preprocessed_state_and_action", states),
            expected_outputs=dict(
                preprocessed_state=expected_preprocessed_state, action=expected_actions,
                nn_outputs=expected_nn_output
            )
        )
Example #5
    def test_environment_stepper_on_deterministic_env_with_action_probs_lstm(self):
        internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)))
        preprocessor_spec = [dict(type="multiply", factor=0.1)]
        network_spec = config_from_path("configs/test_lstm_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
            exploration_spec
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env", steps_to_terminal=3),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            internal_states_space=internal_states_space,
            add_action_probs=True,
            action_probs_space=self.deterministic_action_probs_space,
            num_steps=4,
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        weights = test.read_variable_values(environment_stepper.actor_component.policy.variable_registry)
        policy_scope = "environment-stepper/actor-component/policy/"
        weights_lstm = weights[policy_scope+"test-lstm-network/lstm-layer/lstm-cell/kernel"]
        biases_lstm = weights[policy_scope+"test-lstm-network/lstm-layer/lstm-cell/bias"]
        weights_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/kernel"]
        biases_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/bias"]

        # Step 4 times through the Env and collect results.
        lstm_1 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm)
        lstm_2 = lstm_layer(np.array([[[0.1]]]), weights_lstm, biases_lstm, lstm_1[1])
        lstm_3 = lstm_layer(np.array([[[0.2]]]), weights_lstm, biases_lstm, lstm_2[1])
        lstm_4 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm, lstm_3[1])
        expected = (
            np.array([False, False, True, False]),
            np.array([[0.0], [1.0], [2.0], [0.0], [1.0]]),  # s' (raw)
            np.array([
                softmax(dense_layer(np.squeeze(lstm_1[0]), weights_action, biases_action)),
                softmax(dense_layer(np.squeeze(lstm_2[0]), weights_action, biases_action)),
                softmax(dense_layer(np.squeeze(lstm_3[0]), weights_action, biases_action)),
                softmax(dense_layer(np.squeeze(lstm_4[0]), weights_action, biases_action)),
            ]),  # action probs
            # internal states
            (
                np.squeeze(np.array([[[0.0, 0.0, 0.0]], lstm_1[1][0], lstm_2[1][0], lstm_3[1][0], lstm_4[1][0]])),
                np.squeeze(np.array([[[0.0, 0.0, 0.0]], lstm_1[1][1], lstm_2[1][1], lstm_3[1][1], lstm_4[1][1]]))
            )
        )
        test.test("step", expected_outputs=expected)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
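
The `lstm_layer` helper above replays TensorFlow's LSTM cell in NumPy so that the expected outputs can be computed step by step. A rough sketch of what it presumably does is given below; the i/j/f/o gate ordering, the forget bias of 1.0, and the (c, h) state ordering are assumptions based on TF's default LSTMCell and may differ in detail from the real test utility:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_layer(inputs, kernel, bias, initial_state=None, forget_bias=1.0):
    # inputs: (batch, time, input-dim); kernel: (input-dim + units, 4 * units).
    batch, time_steps, _ = inputs.shape
    units = kernel.shape[1] // 4
    c, h = initial_state if initial_state is not None else (
        np.zeros((batch, units)), np.zeros((batch, units)))
    outputs = []
    for t in range(time_steps):
        gates = np.concatenate([inputs[:, t, :], h], axis=-1) @ kernel + bias
        i, j, f, o = np.split(gates, 4, axis=-1)  # assumed TF gate order
        c = c * sigmoid(f + forget_bias) + sigmoid(i) * np.tanh(j)
        h = np.tanh(c) * sigmoid(o)
        outputs.append(h)
    # Per-step outputs plus the final (c, h) internal state.
    return np.stack(outputs, axis=1), (c, h)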
    def test_actor_component_with_lstm_network(self):
        # state space and internal state space
        state_space = FloatBox(shape=(2,), add_batch_rank=True, add_time_rank=True, time_major=False)
        internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)), add_batch_rank=True)
        time_percentages_space = FloatBox()
        # action_space.
        action_space = IntBox(2, add_batch_rank=True, add_time_rank=True)

        preprocessor = PreprocessorStack.from_spec(
            [dict(type="convert_type", to_dtype="float"), dict(type="divide", divisor=10)]
        )
        policy = Policy(network_spec=config_from_path("configs/test_lstm_nn.json"), action_space=action_space)
        exploration = Exploration(epsilon_spec=dict(decay_spec=dict(
            type="linear_decay", from_=1.0, to_=0.1)
        ))
        actor_component = ActorComponent(preprocessor, policy, exploration)
        test = ComponentTest(
            component=actor_component,
            input_spaces=dict(
                states=state_space,
                other_nn_inputs=Tuple(internal_states_space, add_batch_rank=True),
                time_percentage=time_percentages_space
            ),
            action_space=action_space
        )
        # Some state inputs (batch size=2, seq-len=1000; batch-major).
        np.random.seed(10)
        states = state_space.sample(size=(1000, 2))
        initial_internal_states = internal_states_space.zeros(size=2)  # only batch
        time_percentages = time_percentages_space.sample(1000)

        # Run a single time-step n times to simulate acting and env interaction with an LSTM.
        preprocessed_states = np.ndarray(shape=(1000, 2, 2), dtype=float)
        actions = np.ndarray(shape=(1000, 2, 1), dtype=int)
        for i, time_percentage in enumerate(time_percentages):
            ret = test.test((
                "get_preprocessed_state_and_action",
                # expand time dim at 1st slot as we are time-major == False
                [np.expand_dims(states[i], 1), tuple([initial_internal_states]), time_percentage]
            ))
            preprocessed_states[i] = ret["preprocessed_state"][:, 0, :]  # strip the time-rank again
            actions[i] = ret["action"]
            # Check c/h-state shape.
            self.assertEqual(ret["nn_outputs"][1][0].shape, (2, 3))  # batch-size=2, LSTM units=3
            self.assertEqual(ret["nn_outputs"][1][1].shape, (2, 3))

        # Check all preprocessed states (easy: just divided by 10).
        expected_preprocessed_state = states / 10
        recursive_assert_almost_equal(preprocessed_states, expected_preprocessed_state)

        # Check the exploration functionality over the actions.
        # Not checking the mean (we are mostly in the non-exploratory region); just check that the stddev of the actions falls into the expected band.
        stddev_actions = actions.std()
        self.assertGreater(stddev_actions, 0.4)
        self.assertLess(stddev_actions, 0.6)
Example #7
    def test_to_find_out_what_breaks_specifiable_server_start_via_thread_pools(
            self):
        env_spec = dict(type="deepmind_lab",
                        level_id="seekavoid_arena_01",
                        observations=["RGB_INTERLEAVED", "INSTR"],
                        frameskip=4)
        dummy_env = Environment.from_spec(env_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        actor_component = ActorComponent(
            # Preprocessor spec (only for image and prev-action channel).
            dict(
                type="dict-preprocessor-stack",
                preprocessors=dict(
                    # The images from the env are divided by 255.
                    RGB_INTERLEAVED=[dict(type="divide", divisor=255)],
                    # The previous action must be one-hot flattened; the previous reward reshaped to (1,) and cast to float32.
                    previous_action=[
                        dict(type="reshape",
                             flatten=True,
                             flatten_categories=action_space.num_categories)
                    ],
                    previous_reward=[
                        dict(type="reshape", new_shape=(1, )),
                        dict(type="convert_type", to_dtype="float32")
                    ],
                )),
            # Policy spec.
            dict(network_spec=LargeIMPALANetwork(), action_space=action_space),
            # Exploration spec.
            Exploration(epsilon_spec=dict(decay_spec=dict(type="linear_decay",
                                                          from_=1.0,
                                                          to_=0.1,
                                                          start_timestep=0,
                                                          num_timesteps=100))))
        environment_stepper = EnvironmentStepper(
            environment_spec=env_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float32",
            internal_states_space=self.internal_states_space,
            num_steps=100,
            # Add both prev-action and -reward into the state sent through the network.
            add_previous_action_to_state=True,
            add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=self.action_probs_space)

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )
        # Reset the stepper.
        test.test("reset")
Example #8
    def test_environment_stepper_on_pong(self):
        environment_spec = dict(type="openai-gym",
                                gym_env="Pong-v0",
                                frameskip=4,
                                seed=10)
        dummy_env = Environment.from_spec(environment_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        agent_config = config_from_path("configs/dqn_agent_for_pong.json")
        actor_component = ActorComponent(
            agent_config["preprocessing_spec"],
            dict(network_spec=agent_config["network_spec"],
                 action_space=action_space,
                 **agent_config["policy_spec"]),
            agent_config["exploration_spec"])
        environment_stepper = EnvironmentStepper(
            environment_spec=environment_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float",
            add_reward=True,
            num_steps=self.time_steps)

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )

        # Step `self.time_steps` times through the Env and collect results: a tuple holding, per step, the
        # terminal flags, the (raw) next states, and the rewards (types and value ranges are checked below).
        time_start = time.monotonic()
        out = test.test("step")
        time_end = time.monotonic()
        print("Done running {} steps in env-stepper env in {}sec.".format(
            environment_stepper.num_steps, time_end - time_start))

        # Check types of outputs.
        self.assertTrue(isinstance(
            out, DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        self.assertTrue(out[0].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(
            out[1].dtype == np.uint8)  # next state (raw, not preprocessed)
        self.assertTrue(out[1].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[1].max() <= 255)
        self.assertTrue(out[2].dtype == np.float32)  # rewards
        self.assertTrue(out[2].min() >= -1.0)  # -1.0 to 1.0
        self.assertTrue(out[2].max() <= 1.0)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
Example #9
    def test_environment_stepper_on_2x2_grid_world_returning_actions_and_rewards(
            self):
        preprocessor_spec = [
            dict(type="reshape",
                 flatten=True,
                 flatten_categories=self.grid_world_2x2_action_space.
                 num_categories)
        ]
        network_spec = config_from_path("configs/test_simple_nn.json")
        # Hand-craft NN weights whose greedy action is "down" in the start state and "right" in state=1 (to reach the goal).
        network_spec["layers"][0]["weights_spec"] = [[0.5, -0.5], [-0.1, 0.1],
                                                     [-0.2, 0.2], [-0.4, 0.2]]
        network_spec["layers"][0]["biases_spec"] = False
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec,
                 action_adapter_spec=dict(weights_spec=[[0.1, -0.5, 0.5, 0.1],
                                                        [0.4, 0.2, -0.2, 0.2]],
                                          biases_spec=False),
                 action_space=self.grid_world_2x2_action_space,
                 deterministic=True), exploration_spec)
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="grid_world", world="2x2"),
            actor_component_spec=actor_component,
            state_space=self.grid_world_2x2_state_space,
            reward_space="float32",
            add_action=True,
            add_reward=True,
            num_steps=5)

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.grid_world_2x2_action_space,
        )

        # Step 5 times through the Env and collect results.
        expected = (
            np.array([False, True, False, True, False]),  # t_
            np.array([0, 1, 0, 1, 0, 1]),  # s' (raw)
            np.array([2, 1, 2, 1, 2]),  # actions taken
            np.array([-1.0, 1.0, -1.0, 1.0, -1.0])  # rewards
        )
        out = test.test("step", expected_outputs=expected, decimals=2)
        print(out)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
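
The expected actions [2, 1, 2, 1, 2] are just the greedy (argmax) picks over the same logits as in the 2x2 grid-world example further up, since the action adapter is configured with deterministic=True and no biases:

import numpy as np

w_hidden = np.array([[0.5, -0.5], [-0.1, 0.1], [-0.2, 0.2], [-0.4, 0.2]])
w_action = np.array([[0.1, -0.5, 0.5, 0.1], [0.4, 0.2, -0.2, 0.2]])
greedy = [int(np.argmax(np.eye(4)[s] @ w_hidden @ w_action)) for s in (0, 1)]
print(greedy)  # [2, 1] -> the stepper alternates between states 0 and 1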
Example #10
    def test_environment_stepper_on_deterministic_env(self):
        preprocessor_spec = None
        network_spec = config_from_path("configs/test_simple_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec,
                 action_space=self.deterministic_env_action_space),
            exploration_spec)
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env",
                                  steps_to_terminal=5),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            num_steps=3)

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        # Reset the stepper.
        test.test("reset")

        # Step 3 times through the Env and collect results.
        expected = (
            None,
            (
                np.array([True, False, False, False]),  # t_
                np.array([[0.0], [1.0], [2.0], [3.0]]),  # s' (raw)
            ))
        test.test("step", expected_outputs=expected)

        # Step again; check whether the stitching of states etc. works.
        expected = (
            None,
            (
                np.array([False, False, True, False]),  # t_
                np.array([[3.0], [4.0], [0.0], [1.0]]),  # s' (raw)
            ))
        test.test("step", expected_outputs=expected)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
Example #11
    def test_simple_actor_component(self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(5, ), add_batch_rank=True)
        # action_space.
        action_space = IntBox(10)

        preprocessor = PreprocessorStack.from_spec([
            dict(type="convert_type", to_dtype="float"),
            dict(type="multiply", factor=2)
        ])
        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space)
        exploration = Exploration()  # no exploration
        actor_component = ActorComponent(preprocessor, policy, exploration)
        test = ComponentTest(component=actor_component,
                             input_spaces=dict(states=state_space),
                             action_space=action_space)
        # Get and check some actions.
        actor_component_params = test.read_variable_values(
            actor_component.variables)

        # Some state inputs (5 input nodes, batch size=2).
        states = state_space.sample(2)
        # Expected NN-output.
        expected_nn_output = np.matmul(
            states * 2, actor_component_params[
                "actor-component/policy/test-network/hidden-layer/dense/kernel"]
        )
        # Raw action layer output.
        expected_action_layer_output = np.matmul(
            expected_nn_output, actor_component_params[
                "actor-component/policy/action-adapter-0/action-network/action-layer/dense/kernel"]
        )
        # Final actions (max-likelihood/greedy pick).
        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        expected_preprocessed_state = states * 2
        test.test(("get_preprocessed_state_and_action", states),
                  expected_outputs=dict(
                      preprocessed_state=expected_preprocessed_state,
                      action=expected_actions))

        # Get actions and action-probs by calling a different API-method.
        states = state_space.sample(5)
        # Get and check some actions.
        actor_component_params = test.read_variable_values(
            actor_component.variables)
        # Expected NN-output.
        expected_nn_output = np.matmul(
            states * 2, actor_component_params[
                "actor-component/policy/test-network/hidden-layer/dense/kernel"]
        )
        # Raw action layer output.
        expected_action_layer_output = np.matmul(
            expected_nn_output, actor_component_params[
                "actor-component/policy/action-adapter-0/action-network/action-layer/dense/kernel"]
        )
        # No reshape necessary (simple action space), softmax to get probs.
        expected_action_probs = softmax(expected_action_layer_output)
        # Final actions (max-likelihood/greedy pick).
        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        expected_preprocessed_state = states * 2
        test.test(("get_preprocessed_state_action_and_action_probs", states),
                  expected_outputs=dict(
                      preprocessed_state=expected_preprocessed_state,
                      action=expected_actions,
                      action_probs=expected_action_probs))
Example #12
    def __init__(self,
                 environment_spec,
                 actor_component_spec,
                 num_steps=20,
                 state_space=None,
                 reward_space=None,
                 internal_states_space=None,
                 add_action_probs=False,
                 action_probs_space=None,
                 add_action=False,
                 add_reward=False,
                 add_previous_action_to_state=False,
                 add_previous_reward_to_state=False,
                 scope="environment-stepper",
                 **kwargs):
        """
        Args:
            environment_spec (dict): A specification dict for constructing an Environment object that will be run
                inside a SpecifiableServer for in-graph stepping.
            actor_component_spec (Union[ActorComponent,dict]): A specification dict to construct this EnvStepper's
                ActorComponent (to generate actions) or an already constructed ActorComponent object.
            num_steps (int): The number of steps to perform per `step` call.
            state_space (Optional[Space]): The state Space of the Environment. If None, will construct a dummy
                environment to get the state Space from there.
            reward_space (Optional[Space]): The reward Space of the Environment. If None, will construct a dummy
                environment to get the reward Space from there.
            internal_states_space (Optional[Space]): The internal states Space (when using an RNN inside the
                ActorComponent).
            add_action_probs (bool): Whether to add the action probabilities of each step to the output of the
                `step` API-method (they will be returned as one additional tensor). Default: False.
            action_probs_space (Optional[Space]): If add_action_probs is True, the Space that the action_probs will have.
                This is usually just the flattened (one-hot) action space.
            add_action (bool): Whether to add the action to the output of the `step` API-method.
                Default: False.
            add_reward (bool): Whether to add the reward to the output of the `step` API-method.
                Default: False.
            add_previous_action_to_state (bool): Whether to add the previous action as another input channel to the
                ActorComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_action". Default: False.
            add_previous_reward_to_state (bool): Whether to add the previous reward as another input channel to the
                ActorComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_reward". Default: False.
        """
        super(EnvironmentStepper, self).__init__(scope=scope, **kwargs)

        # Only to retrieve some information about the particular Env.
        dummy_env = Environment.from_spec(
            environment_spec)  # type: Environment

        # Create the SpecifiableServer with the given env spec.
        if state_space is None or reward_space is None:
            state_space = dummy_env.state_space
            if reward_space is None:
                _, reward, _, _ = dummy_env.step(
                    dummy_env.action_space.sample())
                # TODO: this may break on non 64-bit machines. tf seems to interpret a python float as tf.float64.
                reward_space = Space.from_spec(
                    "float64" if type(reward) == float else float,
                    shape=(1, )).with_batch_rank()
        else:
            reward_space = Space.from_spec(reward_space).with_batch_rank()

        self.reward_space = reward_space
        self.action_space = dummy_env.action_space

        dummy_env.terminate()

        # The state that the environment produces.
        self.state_space_env = state_space
        # The state that must be fed into the actor-component to produce an action.
        # May contain prev_action and prev_reward.
        self.state_space_actor = state_space
        self.add_previous_action_to_state = add_previous_action_to_state
        self.add_previous_reward_to_state = add_previous_reward_to_state

        # Also return actions and/or rewards from the `step` API-method?
        self.add_action = add_action
        self.add_reward = add_reward

        # The problem with ContainerSpaces here is that py_func (SpecifiableServer) cannot handle container
        # spaces, so we have to convert these into flat spaces and tuples whenever we make a call to the env.
        # To keep things unified, we treat all container spaces (state space, preprocessed state) from here on
        # as tuples of primitive spaces, sorted by their would-be flat-keys in a flattened dict.
        self.state_space_env_flattened = self.state_space_env.flatten()
        # Need to flatten the state-space in case it's a ContainerSpace for the return dtypes.
        self.state_space_env_list = list(
            self.state_space_env_flattened.values())

        # TODO: automate this by lookup from the NN Component
        self.internal_states_space = None
        if internal_states_space is not None:
            self.internal_states_space = internal_states_space.with_batch_rank(
                add_batch_rank=1)

        # Add the action/reward spaces to the state space (must be Dict).
        if self.add_previous_action_to_state is True:
            assert isinstance(self.state_space_actor, Dict),\
                "ERROR: If `add_previous_action_to_state` is True as input, state_space must be a Dict!"
            self.state_space_actor["previous_action"] = self.action_space
        if self.add_previous_reward_to_state is True:
            assert isinstance(self.state_space_actor, Dict),\
                "ERROR: If `add_previous_reward_to_state` is True as input, state_space must be a Dict!"
            self.state_space_actor["previous_reward"] = self.reward_space
        self.state_space_actor_flattened = self.state_space_actor.flatten()
        self.state_space_actor_list = list(
            self.state_space_actor_flattened.values())

        self.add_action_probs = add_action_probs
        self.action_probs_space = action_probs_space

        self.environment_spec = environment_spec
        self.environment_server = SpecifiableServer(
            class_=Environment,
            spec=environment_spec,
            output_spaces=dict(
                step_for_env_stepper=self.state_space_env_list +
                [self.reward_space, bool],
                reset_for_env_stepper=self.state_space_env_list),
            shutdown_method="terminate")
        # Add the sub-components.
        self.actor_component = ActorComponent.from_spec(
            actor_component_spec)  # type: ActorComponent
        self.preprocessed_state_space = self.actor_component.preprocessor.get_preprocessed_space(
            self.state_space_actor)

        self.num_steps = num_steps

        # Variables that hold information of last step through Env.
        self.current_terminal = None
        self.current_state = None
        self.current_action = None  # Only if self.add_action is True.
        self.current_reward = None  # Only if self.add_reward is True.
        self.current_internal_states = None
        self.current_action_probs = None
        self.time_step = 0

        self.has_rnn = self.actor_component.policy.neural_network.has_rnn()

        # Add all sub-components (only ActorComponent).
        self.add_components(self.actor_component)
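
Pulling these arguments together, a minimal construction sketch is shown below (the spec values and the names network_spec, action_space, state_space and action_probs_space are placeholders standing in for whatever the surrounding tests provide); each optional flag adds one more tensor to the output of the `step` API-method:

stepper = EnvironmentStepper(
    environment_spec=dict(type="deterministic_env", steps_to_terminal=5),  # placeholder env spec
    actor_component_spec=ActorComponent(
        None,  # no preprocessor
        dict(network_spec=network_spec, action_space=action_space),
        None   # no exploration
    ),
    state_space=state_space,
    reward_space="float32",
    num_steps=10,
    add_action=True,         # also return the executed actions
    add_reward=True,         # also return the received rewards
    add_action_probs=True,   # also return per-step action probabilities ...
    action_probs_space=action_probs_space  # ... living in this (flattened) space
)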
Example #13
    def __init__(self,
                 discount=0.99,
                 fifo_queue_spec=None,
                 architecture="large",
                 environment_spec=None,
                 feed_previous_action_through_nn=True,
                 feed_previous_reward_through_nn=True,
                 weight_pg=None,
                 weight_baseline=None,
                 weight_entropy=None,
                 num_workers=1,
                 worker_sample_size=100,
                 dynamic_batching=False,
                 visualize=False,
                 **kwargs):
        """
        Args:
            discount (float): The discount factor gamma.
            architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored if
                `network_spec` is given explicitly in kwargs. Default: "large".
            fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA algorithm.
            environment_spec (dict): The spec for constructing an Environment object for an actor-type IMPALA agent.
            feed_previous_action_through_nn (bool): Whether to add the previous action as another input channel to the
                ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_action". Default: True.
            feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input channel to the
                ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_reward". Default: True.
            weight_pg (float): See IMPALALossFunction Component.
            weight_baseline (float): See IMPALALossFunction Component.
            weight_entropy (float): See IMPALALossFunction Component.
            num_workers (int): How many actors (workers) should be run in separate threads.
            worker_sample_size (int): How many steps the actor will perform in the environment each sample-run.
            dynamic_batching (bool): Whether to use DeepMind's custom dynamic batching op for wrapping the
                optimizer's step call. The batcher.so file must be compiled for this to work (see Docker file).
                Default: False.
            visualize (Union[int,bool]): Whether and how many workers to visualize.
                Default: False (no visualization).
        """
        # Now that we fixed the Agent's spec, call the super constructor.
        super(SingleIMPALAAgent, self).__init__(
            type="single",
            discount=discount,
            architecture=architecture,
            fifo_queue_spec=fifo_queue_spec,
            environment_spec=environment_spec,
            feed_previous_action_through_nn=feed_previous_action_through_nn,
            feed_previous_reward_through_nn=feed_previous_reward_through_nn,
            weight_pg=weight_pg,
            weight_baseline=weight_baseline,
            weight_entropy=weight_entropy,
            worker_sample_size=worker_sample_size,
            name=kwargs.pop("name", "impala-single-agent"),
            **kwargs)
        self.dynamic_batching = dynamic_batching
        self.num_workers = num_workers
        self.visualize = visualize

        # If we use dynamic batching, wrap the dynamic batcher around the policy's graph_fn that we
        # actually call below during our build.
        if self.dynamic_batching:
            self.policy = DynamicBatchingPolicy(policy_spec=self.policy,
                                                scope="")

        self.env_output_splitter = ContainerSplitter(
            tuple_length=3 if self.has_rnn is False else 4,
            scope="env-output-splitter")
        self.fifo_output_splitter = ContainerSplitter(
            *self.fifo_queue_keys, scope="fifo-output-splitter")
        self.states_dict_splitter = ContainerSplitter(
            *list(self.fifo_record_space["states"].keys() if isinstance(self.state_space, Dict) else "dummy"),
            scope="states-dict-splitter")

        self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

        # Slice some data from the EnvStepper (e.g. only the first internal states are needed).
        if self.has_rnn:
            internal_states_slicer = Slice(scope="internal-states-slicer",
                                           squeeze=True)
        else:
            internal_states_slicer = None

        self.transposer = Transpose(scope="transposer")

        # Create an IMPALALossFunction with some parameters.
        self.loss_function = IMPALALossFunction(
            discount=self.discount,
            weight_pg=weight_pg,
            weight_baseline=weight_baseline,
            weight_entropy=weight_entropy,
            slice_actions=self.feed_previous_action_through_nn,
            slice_rewards=self.feed_previous_reward_through_nn)

        # Merge back to insert into FIFO.
        self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)

        # Dummy Flattener to calculate action-probs space.
        dummy_flattener = ReShape(
            flatten=True, flatten_categories=self.action_space.num_categories)

        self.environment_steppers = list()
        for i in range(self.num_workers):
            environment_spec_ = copy.deepcopy(environment_spec)
            if self.visualize is True or (isinstance(self.visualize, int)
                                          and i + 1 <= self.visualize):
                environment_spec_["visualize"] = True

            # Force worker_sample_size for IMPALA NNs (LSTM) in env-stepper to be 1.
            policy_spec = copy.deepcopy(self.policy_spec)
            if isinstance(policy_spec, dict) and isinstance(policy_spec["network_spec"], dict) and \
                    "type" in policy_spec["network_spec"] and "IMPALANetwork" in policy_spec["network_spec"]["type"]:
                policy_spec["network_spec"]["worker_sample_size"] = 1

            env_stepper = EnvironmentStepper(
                environment_spec=environment_spec_,
                actor_component_spec=ActorComponent(
                    preprocessor_spec=self.preprocessing_spec,
                    policy_spec=policy_spec,
                    exploration_spec=self.exploration_spec),
                state_space=self.state_space.with_batch_rank(),
                action_space=self.action_space.with_batch_rank(),
                reward_space=float,
                internal_states_space=self.internal_states_space,
                num_steps=self.worker_sample_size,
                add_action=not self.feed_previous_action_through_nn,
                add_reward=not self.feed_previous_reward_through_nn,
                add_previous_action_to_state=self.feed_previous_action_through_nn,
                add_previous_reward_to_state=self.feed_previous_reward_through_nn,
                add_action_probs=True,
                action_probs_space=dummy_flattener.get_preprocessed_space(
                    self.action_space),
                scope="env-stepper-{}".format(i))
            if self.dynamic_batching:
                env_stepper.actor_component.policy.parent_component = None
                env_stepper.actor_component.policy = DynamicBatchingPolicy(
                    policy_spec=env_stepper.actor_component.policy, scope="")
                env_stepper.actor_component.add_components(
                    env_stepper.actor_component.policy)

            self.environment_steppers.append(env_stepper)

        # Create the QueueRunners (one for each env-stepper).
        self.queue_runner = QueueRunner(
            self.fifo_queue,
            "step",
            -1,  # -1: Take entire return value of API-method `step` as record to insert.
            self.env_output_splitter,
            self.fifo_input_merger,
            internal_states_slicer,
            *self.environment_steppers)

        sub_components = [
            self.fifo_output_splitter, self.fifo_queue, self.queue_runner,
            self.transposer, self.staging_area, self.preprocessor,
            self.states_dict_splitter, self.policy, self.loss_function,
            self.optimizer
        ]

        # Add all the agent's sub-components to the root.
        self.root_component.add_components(*sub_components)

        # Define the Agent's (root Component's) API.
        self.define_graph_api()

        if self.auto_build:
            self._build_graph([self.root_component],
                              self.input_spaces,
                              optimizer=self.optimizer,
                              build_options=None)
            self.graph_built = True

            if self.has_gpu:
                # Get 1st return op of API-method `stage` of sub-component `staging-area` (which is the stage-op).
                self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                    out_op_columns[0].op_records[0].op
                # Initialize the stage.
                self.graph_executor.monitored_session.run_step_fn(
                    lambda step_context: step_context.session.run(self.stage_op
                                                                  ))
                # TODO remove after full refactor.
                self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
                    out_op_columns[0].op_records[0].op
Example #14
    def __init__(self,
                 discount=0.99,
                 fifo_queue_spec=None,
                 architecture="large",
                 environment_spec=None,
                 feed_previous_action_through_nn=True,
                 feed_previous_reward_through_nn=True,
                 weight_pg=None,
                 weight_baseline=None,
                 weight_entropy=None,
                 worker_sample_size=100,
                 **kwargs):
        """
        Args:
            discount (float): The discount factor gamma.
            architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored if
                `network_spec` is given explicitly in kwargs. Default: "large".
            fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA algorithm.
            environment_spec (dict): The spec for constructing an Environment object for an actor-type IMPALA agent.
            feed_previous_action_through_nn (bool): Whether to add the previous action as another input channel to the
                ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_action". Default: True.
            feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input channel to the
                ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_reward". Default: True.
            weight_pg (float): See IMPALALossFunction Component.
            weight_baseline (float): See IMPALALossFunction Component.
            weight_entropy (float): See IMPALALossFunction Component.
            worker_sample_size (int): How many steps the actor will perform in the environment each sample-run.

        Keyword Args:
            type (str): One of "single", "actor" or "learner". Default: "single".
        """
        type_ = kwargs.pop("type", "single")
        assert type_ in ["single", "actor", "learner"]
        self.type = type_
        self.worker_sample_size = worker_sample_size

        # Network-spec by default is a "large architecture" IMPALA network.
        self.network_spec = kwargs.pop(
            "network_spec",
            dict(
                type=
                "rlgraph.components.neural_networks.impala.impala_networks.{}IMPALANetwork"
                .format("Large" if architecture == "large" else "Small")))
        if isinstance(self.network_spec, dict) and "type" in self.network_spec and \
                "IMPALANetwork" in self.network_spec["type"]:
            self.network_spec = default_dict(
                self.network_spec,
                dict(worker_sample_size=1 if self.type ==
                     "actor" else self.worker_sample_size + 1))

        # Depending on the job-type, remove the pieces from the Agent-spec/graph we won't need.
        self.exploration_spec = kwargs.pop("exploration_spec", None)
        optimizer_spec = kwargs.pop("optimizer_spec", None)
        observe_spec = kwargs.pop("observe_spec", None)

        self.feed_previous_action_through_nn = feed_previous_action_through_nn
        self.feed_previous_reward_through_nn = feed_previous_reward_through_nn

        # Run everything in a single process.
        if self.type == "single":
            environment_spec = environment_spec or self.default_environment_spec
            update_spec = kwargs.pop("update_spec", None)
        # Actors won't need to learn (no optimizer needed in graph).
        elif self.type == "actor":
            optimizer_spec = None
            update_spec = kwargs.pop("update_spec", dict(do_updates=False))
            environment_spec = environment_spec or self.default_environment_spec
        # Learners won't need to explore (act) or observe (insert into Queue).
        else:
            observe_spec = None
            update_spec = kwargs.pop("update_spec", None)
            environment_spec = None

        # Add previous-action/reward preprocessors to env-specific preprocessor spec.
        # TODO: remove this empty hard-coded preprocessor.
        self.preprocessing_spec = kwargs.pop(
            "preprocessing_spec",
            dict(
                type="dict-preprocessor-stack",
                preprocessors=dict(
                    # Flatten actions.
                    previous_action=[
                        dict(type="reshape",
                             flatten=True,
                             flatten_categories=kwargs.get(
                                 "action_space").num_categories)
                    ],
                    # Bump reward and convert to float32, so that it can be concatenated by the Concat layer.
                    previous_reward=[dict(type="reshape", new_shape=(1, ))])))

        # Limit communication in distributed mode between each actor and the learner (never between actors).
        execution_spec = kwargs.pop("execution_spec", None)
        if execution_spec is not None and execution_spec.get(
                "mode") == "distributed":
            default_dict(
                execution_spec["session_config"],
                dict(type="monitored-training-session",
                     allow_soft_placement=True,
                     device_filters=["/job:learner/task:0"] + ([
                         "/job:actor/task:{}".format(
                             execution_spec["distributed_spec"]["task_index"])
                     ] if self.type == "actor" else ["/job:learner/task:0"])))
            # If Actor, make non-chief in either case (even if task idx == 0).
            if self.type == "actor":
                execution_spec["distributed_spec"]["is_chief"] = False
                # Hard-set device to the CPU for actors.
                execution_spec["device_strategy"] = "custom"
                execution_spec[
                    "default_device"] = "/job:{}/task:{}/cpu".format(
                        self.type,
                        execution_spec["distributed_spec"]["task_index"])

        self.policy_spec = kwargs.pop("policy_spec", dict())
        # TODO: Create some auto-setting based on LSTM inside the NN.
        default_dict(
            self.policy_spec,
            dict(type="shared-value-function-policy",
                 deterministic=False,
                 reuse_variable_scope="shared-policy",
                 action_space=kwargs.get("action_space")))

        # Now that we fixed the Agent's spec, call the super constructor.
        super(IMPALAAgent,
              self).__init__(discount=discount,
                             preprocessing_spec=self.preprocessing_spec,
                             network_spec=self.network_spec,
                             policy_spec=self.policy_spec,
                             exploration_spec=self.exploration_spec,
                             optimizer_spec=optimizer_spec,
                             observe_spec=observe_spec,
                             update_spec=update_spec,
                             execution_spec=execution_spec,
                             name=kwargs.pop(
                                 "name", "impala-{}-agent".format(self.type)),
                             **kwargs)
        # Always use 1st learner as the parameter server for all policy variables.
        if self.execution_spec["mode"] == "distributed" and self.execution_spec[
                "distributed_spec"]["cluster_spec"]:
            self.policy.propagate_sub_component_properties(
                dict(device=dict(variables="/job:learner/task:0/cpu")))

        # Check whether we have an RNN.
        self.has_rnn = self.policy.neural_network.has_rnn()
        # Check, whether we are running with GPU.
        self.has_gpu = self.execution_spec["gpu_spec"]["gpus_enabled"] is True and \
            self.execution_spec["gpu_spec"]["num_gpus"] > 0

        # Some FIFO-queue specs.
        self.fifo_queue_keys = ["terminals", "states"] + \
                               (["actions"] if not self.feed_previous_action_through_nn else []) + \
                               (["rewards"] if not self.feed_previous_reward_through_nn else []) + \
                               ["action_probs"] + \
                               (["initial_internal_states"] if self.has_rnn else [])
        # Define FIFO record space.
        # Note that only states and internal_states (RNN) contain num-steps+1 items, all other sub-records only contain
        # num-steps items.
        self.fifo_record_space = Dict(
            {
                "terminals":
                bool,
                "action_probs":
                FloatBox(shape=(self.action_space.num_categories, )),
            },
            add_batch_rank=False,
            add_time_rank=self.worker_sample_size)
        self.fifo_record_space["states"] = self.state_space.with_time_rank(
            self.worker_sample_size + 1)
        # Do previous actions/rewards travel inside the state (fed through the NN) or in their own record slots?
        if self.feed_previous_action_through_nn:
            self.fifo_record_space["states"]["previous_action"] = \
                self.action_space.with_time_rank(self.worker_sample_size + 1)
        else:
            self.fifo_record_space[
                "actions"] = self.action_space.with_time_rank(
                    self.worker_sample_size)
        if self.feed_previous_reward_through_nn:
            self.fifo_record_space["states"]["previous_reward"] = FloatBox(
                add_time_rank=self.worker_sample_size + 1)
        else:
            self.fifo_record_space["rewards"] = FloatBox(
                add_time_rank=self.worker_sample_size)

        if self.has_rnn:
            self.fifo_record_space[
                "initial_internal_states"] = self.internal_states_space.with_time_rank(
                    False)

        # Create our FIFOQueue (actors will enqueue, learner(s) will dequeue).
        self.fifo_queue = FIFOQueue.from_spec(
            fifo_queue_spec or dict(capacity=1),
            reuse_variable_scope="shared-fifo-queue",
            only_insert_single_records=True,
            record_space=self.fifo_record_space,
            device="/job:learner/task:0/cpu"
            if self.execution_spec["mode"] == "distributed"
            and self.execution_spec["distributed_spec"]["cluster_spec"] else
            None)

        # Remove `states` key from input_spaces: not needed.
        del self.input_spaces["states"]

        # Add all our sub-components to the core.
        if self.type == "single":
            pass

        elif self.type == "actor":
            # No learning, no loss function.
            self.loss_function = None
            # A Dict Splitter to split things from the EnvStepper.
            self.env_output_splitter = ContainerSplitter(
                tuple_length=4, scope="env-output-splitter")

            self.states_dict_splitter = None

            # Slice some data from the EnvStepper (e.g. only the first internal states are needed).
            self.internal_states_slicer = Slice(scope="internal-states-slicer",
                                                squeeze=True)
            # Merge back to insert into FIFO.
            self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)

            # Dummy Flattener to calculate action-probs space.
            dummy_flattener = ReShape(
                flatten=True,
                flatten_categories=self.action_space.num_categories)
            self.environment_stepper = EnvironmentStepper(
                environment_spec=environment_spec,
                actor_component_spec=ActorComponent(self.preprocessor,
                                                    self.policy,
                                                    self.exploration),
                state_space=self.state_space.with_batch_rank(),
                reward_space=float,  # TODO <- float64 for deepmind? may not work for other envs
                internal_states_space=self.internal_states_space,
                num_steps=self.worker_sample_size,
                add_previous_action_to_state=True,
                add_previous_reward_to_state=True,
                add_action_probs=True,
                action_probs_space=dummy_flattener.get_preprocessed_space(
                    self.action_space))
            sub_components = [
                self.environment_stepper, self.env_output_splitter,
                self.internal_states_slicer, self.fifo_input_merger,
                self.fifo_queue
            ]
        # Learner.
        else:
            self.environment_stepper = None

            # No input merger needed on the learner side (records are only dequeued here).
            self.fifo_input_merger = None
            # A Dict splitter to split up items dequeued from the FIFO queue.
            self.fifo_output_splitter = ContainerSplitter(
                *self.fifo_queue_keys, scope="fifo-output-splitter")
            self.states_dict_splitter = ContainerSplitter(
                *list(self.fifo_record_space["states"].keys()),
                scope="states-dict-splitter")
            self.internal_states_slicer = None

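            # The Transpose flips batch- and time-ranks of dequeued records; the StagingArea is used to
            # prefetch dequeued data onto the GPU.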
            self.transposer = Transpose(
                scope="transposer", device=dict(ops="/job:learner/task:0/cpu"))
            self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

            # Create the IMPALALossFunction (v-trace off-policy actor-critic loss).
            self.loss_function = IMPALALossFunction(
                discount=self.discount,
                weight_pg=weight_pg,
                weight_baseline=weight_baseline,
                weight_entropy=weight_entropy,
                slice_actions=self.feed_previous_action_through_nn,
                slice_rewards=self.feed_previous_reward_through_nn,
                device="/job:learner/task:0/gpu")

            self.policy.propagate_sub_component_properties(
                dict(device=dict(variables="/job:learner/task:0/cpu",
                                 ops="/job:learner/task:0/gpu")))
            for component in [
                    self.staging_area, self.preprocessor, self.optimizer
            ]:
                component.propagate_sub_component_properties(
                    dict(device="/job:learner/task:0/gpu"))

            sub_components = [
                self.fifo_output_splitter, self.fifo_queue,
                self.states_dict_splitter, self.transposer, self.staging_area,
                self.preprocessor, self.policy, self.loss_function,
                self.optimizer
            ]

        if self.type != "single":
            # Add all the agent's sub-components to the root.
            self.root_component.add_components(*sub_components)

            # Define the Agent's (root Component's) API.
            self.define_graph_api(*sub_components)

        if self.type != "single" and self.auto_build:
            if self.type == "learner":
                build_options = dict(
                    build_device_context="/job:learner/task:0/cpu",
                    pin_global_variable_device="/job:learner/task:0/cpu")
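                # Build the learner's graph, pinning global variables to the learner's CPU
                # (parameter-server style, see above).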
                self._build_graph([self.root_component],
                                  self.input_spaces,
                                  optimizer=self.optimizer,
                                  build_options=build_options)
            else:
                self._build_graph([self.root_component],
                                  self.input_spaces,
                                  optimizer=self.optimizer,
                                  build_options=None)

            self.graph_built = True

            if self.has_gpu:
                # Get 1st return op of API-method `stage` of sub-component `staging-area` (which is the stage-op).
                self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                    out_op_columns[0].op_records[0].op
                # Initialize the stage.
                self.graph_executor.monitored_session.run_step_fn(
                    lambda step_context: step_context.session.run(self.stage_op))

                # TODO remove after full refactor.
                self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
                    out_op_columns[0].op_records[0].op
            if self.type == "actor":
                self.enqueue_op = self.root_component.sub_components["fifo-queue"].api_methods["insert_records"]. \
                    out_op_columns[0].op_records[0].op
Example #15
    def test_environment_stepper_on_deepmind_lab(self):
        try:
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv
        except ImportError:
            print("DeepmindLab not installed: Skipping this test case.")
            return

        env_spec = dict(type="deepmind_lab",
                        level_id="seekavoid_arena_01",
                        observations=["RGB_INTERLEAVED"],
                        frameskip=4)
        dummy_env = Environment.from_spec(env_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        actor_component = ActorComponent(
            # Preprocessor spec (only divide and flatten the image).
            [{
                "type": "divide",
                "divisor": 255
            }, {
                "type": "reshape",
                "flatten": True
            }],
            # Policy spec.
            dict(network_spec="../configs/test_lstm_nn.json",
                 action_space=action_space),
            # Exploration spec.
            Exploration(epsilon_spec=dict(decay_spec=dict(type="linear_decay",
                                                          from_=1.0,
                                                          to_=0.1,
                                                          start_timestep=0,
                                                          num_timesteps=100))))
        environment_stepper = EnvironmentStepper(
            environment_spec=env_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float32",
            internal_states_space=self.internal_states_space_test_lstm,
            num_steps=1000,
            # Add both prev-action and -reward into the state sent through the network.
            #add_previous_action_to_state=True,
            #add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=FloatBox(shape=(9, ), add_batch_rank=True))

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )
        # Reset the stepper.
        test.test("reset")

        # Step n times through the Env and collect results.
        # 1st return value is the step-op (None), 2nd return value is the tuple of collected step data
        # (one entry per step; see the type checks below for the exact layout).
        time_start = time.monotonic()
        steps = 10
        out = None
        for _ in range(steps):
            out = test.test("step")
        time_total = time.monotonic() - time_start
        print(
            "Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec. ({} actions/sec)"
            .format(steps, environment_stepper.num_steps, time_total,
                    environment_stepper.num_steps * steps / time_total))

        # Check types of outputs.
        self.assertTrue(out[0] is None)
        self.assertTrue(isinstance(
            out[1], DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        #self.assertTrue(out[0].dtype == np.float32)
        #self.assertTrue(out[0].min() >= 0.0)  # make sure we have pixels / 255
        #self.assertTrue(out[0].max() <= 1.0)
        #self.assertTrue(out[1].dtype == np.int32)  # actions
        #self.assertTrue(out[2].dtype == np.float32)  # rewards
        #self.assertTrue(out[0].dtype == np.float32)  # episode return
        self.assertTrue(out[1][0].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(
            out[1][1].dtype == np.uint8)  # next state (raw, not preprocessed)
        self.assertTrue(out[1][1].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[1][1].max() <= 255)
        # action probs (test whether sum to one).
        #self.assertTrue(out[1][6].dtype == np.float32)
        #self.assertTrue(out[1][6].min() >= 0.0)
        #self.assertTrue(out[1][6].max() <= 1.0)
        #recursive_assert_almost_equal(out[1][6].sum(axis=-1, keepdims=False),
        #                              np.ones(shape=(environment_stepper.num_steps,)), decimals=4)
        # internal states (c- and h-state)
        self.assertTrue(out[3][0].dtype == np.float32)
        self.assertTrue(out[3][1].dtype == np.float32)
        self.assertTrue(out[3][0].shape == (environment_stepper.num_steps, 3))
        self.assertTrue(out[3][1].shape == (environment_stepper.num_steps, 3))

        # Check whether episode returns match single rewards (including terminal signals).
        #episode_returns = 0.0
        #for i in range(environment_stepper.num_steps):
        #    episode_returns += out[0][i]
        #    self.assertAlmostEqual(episode_returns, out[3][i])
        #    # Terminal: Reset for next step.
        #    if out[4][i] is np.bool_(True):
        #        episode_returns = 0.0

        test.terminate()
Example #16
    def test_large_impala_actor_component_without_agent(self):
        """
        Creates a large IMPALA architecture network inside a policy inside an actor component and runs a few input
        samples through it.
        """
        batch_size = 4
        time_steps = 1
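        # time_steps doubles as the network's worker_sample_size: actor-style, one time-step per forward pass.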

        # IMPALA uses a baseline action adapter (v-trace off-policy PG with baseline value function).
        policy = SharedValueFunctionPolicy(
            LargeIMPALANetwork(worker_sample_size=time_steps),
            action_space=self.action_space,
            deterministic=False)
        actor_component = ActorComponent(preprocessor_spec=None,
                                         policy_spec=policy,
                                         exploration_spec=None)

        test = ComponentTest(actor_component,
                             input_spaces=dict(
                                 states=self.input_space,
                                 internal_states=self.internal_states_space),
                             action_space=self.action_space,
                             execution_spec=dict(disable_monitoring=True))

        # Send a sample through the network (sequence-length (time-rank) x batch-size).
        nn_dict_input = self.input_space.sample(size=(time_steps, batch_size))
        initial_internal_states = self.internal_states_space.zeros(
            size=batch_size)
        expected = None
        out = test.test(("get_preprocessed_state_and_action",
                         [nn_dict_input, initial_internal_states]),
                        expected_outputs=expected)
        print("First action: {}".format(out["action"]))
        self.assertEquals(out["action"].shape, (time_steps, batch_size))
        self.assertEquals(out["last_internal_states"][0].shape,
                          (batch_size, 256))
        self.assertEquals(out["last_internal_states"][1].shape,
                          (batch_size, 256))
        # Check preprocessed state (no preprocessor spec in this test, so all channels pass through unchanged).
        recursive_assert_almost_equal(
            out["preprocessed_state"],
            dict(
                RGB_INTERLEAVED=nn_dict_input["RGB_INTERLEAVED"],
                INSTR=nn_dict_input["INSTR"],
                previous_action=nn_dict_input["previous_action"],
                previous_reward=nn_dict_input["previous_reward"],
            ))

        # Send another sample through the network, this time using the previous internal-states.
        next_nn_input = self.input_space.sample(size=(time_steps, batch_size))
        expected = None
        out = test.test(("get_preprocessed_state_and_action",
                         [next_nn_input, out["last_internal_states"]]),
                        expected_outputs=expected)
        print("Second action: {}".format(out["action"]))
        self.assertEquals(out["action"].shape, (time_steps, batch_size))
        self.assertEquals(out["last_internal_states"][0].shape,
                          (batch_size, 256))
        self.assertEquals(out["last_internal_states"][1].shape,
                          (batch_size, 256))
        # Check preprocessed state again (still no preprocessing, so all channels unchanged).
        recursive_assert_almost_equal(
            out["preprocessed_state"],
            dict(
                RGB_INTERLEAVED=next_nn_input["RGB_INTERLEAVED"],
                INSTR=next_nn_input["INSTR"],
                previous_action=next_nn_input["previous_action"],
                previous_reward=next_nn_input["previous_reward"],
            ))

        test.terminate()
Example #17
    def test_environment_stepper_on_pong(self):
        environment_spec = dict(type="openai_gym",
                                gym_env="Pong-v0",
                                frameskip=4,
                                seed=10)
        dummy_env = Environment.from_spec(environment_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        agent_config = config_from_path("configs/dqn_agent_for_pong.json")
        actor_component = ActorComponent(
            agent_config["preprocessing_spec"],
            dict(network_spec=agent_config["network_spec"],
                 action_adapter_spec=agent_config["action_adapter_spec"],
                 action_space=action_space), agent_config["exploration_spec"])
        environment_stepper = EnvironmentStepper(
            environment_spec=environment_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float",
            add_reward=True,
            num_steps=self.time_steps)

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )

        # Step 30 times through the Env and collect results.
        # 1st return value is the step-op (None), 2nd return value is the tuple of collected step data
        # (one entry per step; see the type checks below for the exact layout).
        # Reset the stepper.
        test.test("reset")
        time_start = time.monotonic()
        out = test.test("step")
        time_end = time.monotonic()
        print("Done running {} steps in env-stepper env in {}sec.".format(
            environment_stepper.num_steps, time_end - time_start))

        # Check types of outputs.
        self.assertTrue(out[0] is None)
        self.assertTrue(isinstance(
            out[1], DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        #self.assertTrue(out[1][0].dtype == np.float32)  # preprocessed states
        #self.assertTrue(out[1][0].min() >= 0.0)  # make sure we have pixels / 255
        #self.assertTrue(out[1][0].max() <= 1.0)
        #self.assertTrue(out[1][1].dtype == np.int32)  # actions
        #self.assertTrue(out[1][2].dtype == np.float32)  # rewards
        #self.assertTrue(out[1][3].dtype == np.float32)  # episode return
        self.assertTrue(out[1][0].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(
            out[1][1].dtype == np.uint8)  # next state (raw, not preprocessed)
        self.assertTrue(out[1][1].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[1][1].max() <= 255)
        self.assertTrue(out[1][2].dtype == np.float32)  # rewards
        self.assertTrue(out[1][2].min() >= -1.0)  # -1.0 to 1.0
        self.assertTrue(out[1][2].max() <= 1.0)

        # Check whether episode returns match single rewards (including resetting after each terminal signal).
        #episode_returns = 0.0
        #for i in range(environment_stepper.num_steps):
        #    episode_returns += out[2][i]
        #    self.assertAlmostEqual(episode_returns, out[1][3][i])
        #    # Terminal: Reset accumulated episode-return before next step.
        #    if out[1][4][i] is np.bool_(True):
        #        episode_returns = 0.0

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
Example #18
    def test_environment_stepper_component_with_large_impala_architecture(
            self):
        try:
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv
        except ImportError:
            print("DeepmindLab not installed: Skipping this test case.")
            return

        worker_sample_size = 100
        env_spec = dict(type="deepmind_lab",
                        level_id="seekavoid_arena_01",
                        observations=["RGB_INTERLEAVED", "INSTR"],
                        frameskip=4)
        dummy_env = DeepmindLabEnv.from_spec(env_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        actor_component = ActorComponent(
            # Preprocessor spec (only for image and prev-action channel).
            dict(
                type="dict-preprocessor-stack",
                preprocessors=dict(
                    # The prev. action/reward from the env must be flattened/bumped-up-to-(1,).
                    previous_action=[
                        dict(type="reshape",
                             flatten=True,
                             flatten_categories=action_space.num_categories)
                    ],
                    previous_reward=[
                        dict(type="reshape", new_shape=(1, )),
                        dict(type="convert_type", to_dtype="float32")
                    ],
                )),
            # Policy spec. worker_sample_size=1 as it's an actor network.
            dict(network_spec=LargeIMPALANetwork(worker_sample_size=1),
                 action_space=action_space))
        environment_stepper = EnvironmentStepper(
            environment_spec=env_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float32",
            internal_states_space=self.internal_states_space,
            num_steps=worker_sample_size,
            # Add both prev-action and -reward into the state sent through the network.
            add_previous_action_to_state=True,
            add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=self.action_probs_space)

        test = ComponentTest(component=environment_stepper,
                             action_space=action_space,
                             execution_spec=dict(disable_monitoring=True))

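        # The environment runs in its own server process; start it explicitly before stepping.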
        environment_stepper.environment_server.start_server()

        # Step n times through the Env and collect results.
        # The return value is the tuple of collected step data (terminals, raw next-states, action-probs,
        # internal-states; see the type checks below).
        time_start = time.perf_counter()
        steps = 10
        for _ in range(steps):
            out = test.test("step")
        time_total = time.perf_counter() - time_start
        print(
            "Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec ({} actions/sec)."
            .format(steps, environment_stepper.num_steps, time_total,
                    environment_stepper.num_steps * steps / time_total))

        # Check types of outputs.
        self.assertTrue(isinstance(
            out, DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        self.assertTrue(out[0].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(out[1]["INSTR"].dtype == np.object)
        self.assertTrue(out[1]["RGB_INTERLEAVED"].dtype == np.uint8)
        self.assertTrue(
            out[1]["RGB_INTERLEAVED"].shape == (worker_sample_size + 1, ) +
            state_space["RGB_INTERLEAVED"].shape)
        self.assertTrue(
            out[1]["RGB_INTERLEAVED"].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[1]["RGB_INTERLEAVED"].max() <= 255)
        self.assertTrue(out[1]["previous_action"].dtype == np.int32)  # actions
        self.assertTrue(
            out[1]["previous_action"].shape == (worker_sample_size + 1, ))
        self.assertTrue(
            out[1]["previous_reward"].dtype == np.float32)  # rewards
        self.assertTrue(
            out[1]["previous_reward"].shape == (worker_sample_size + 1, ))
        # action probs (test whether sum to one).
        self.assertTrue(out[2].dtype == np.float32)
        self.assertTrue(out[2].shape == (worker_sample_size, action_space.num_categories))
        self.assertTrue(out[2].min() >= 0.0)
        self.assertTrue(out[2].max() <= 1.0)
        recursive_assert_almost_equal(out[2].sum(axis=-1, keepdims=False),
                                      np.ones(shape=(worker_sample_size, )),
                                      decimals=4)
        # internal states (c- and h-state)
        self.assertTrue(out[3][0].dtype == np.float32)
        self.assertTrue(out[3][0].shape == (worker_sample_size + 1, 256))
        self.assertTrue(out[3][1].dtype == np.float32)
        self.assertTrue(out[3][1].shape == (worker_sample_size + 1, 256))

        environment_stepper.environment_server.stop_server()

        test.terminate()
Example #19
    def test_large_impala_actor_component_without_agent(self):
        """
        Creates a large IMPALA architecture network inside a policy inside an actor component and runs a few input
        samples through it.
        """
        batch_size = 4

        # Use IMPALA paper's preprocessor of division by 255 (only for the Image).
        preprocessor_spec_for_actor_component = dict(
            type="dict-preprocessor-stack",
            preprocessors=dict(
                RGB_INTERLEAVED=[dict(type="divide", divisor=255)]))
        # IMPALA uses a baseline action adapter (v-trace off-policy PG with baseline value function).
        policy = Policy(
            LargeIMPALANetwork(),
            action_space=self.action_space,
            action_adapter_spec=dict(type="baseline_action_adapter"))
        exploration = Exploration(epsilon_spec=dict(
            decay_spec=dict(type="linear_decay",
                            from_=1.0,
                            to_=0.1,
                            start_timestep=0,
                            num_timesteps=100)))
        actor_component = ActorComponent(preprocessor_spec_for_actor_component,
                                         policy, exploration)

        test = ComponentTest(actor_component,
                             input_spaces=dict(
                                 states=self.input_space,
                                 internal_states=self.internal_states_space),
                             action_space=self.action_space)

        # Send a sample through the network (batch-size x sequence-length (time-rank)).
        nn_dict_input = self.input_space.sample(size=(batch_size, 1))
        initial_internal_states = self.internal_states_space.zeros(
            size=batch_size)
        expected = None
        preprocessed_states, actions, last_internal_states = test.test(
            ("get_preprocessed_state_and_action",
             [nn_dict_input, initial_internal_states]),
            expected_outputs=expected)
        print("First action: {}".format(actions))
        self.assertEquals(actions.shape, (batch_size, 1))
        self.assertEquals(last_internal_states[0].shape, (batch_size, 256))
        self.assertEquals(last_internal_states[1].shape, (batch_size, 256))
        # Check preprocessed state (all the same except 'image' channel).
        recursive_assert_almost_equal(
            preprocessed_states,
            dict(
                RGB_INTERLEAVED=nn_dict_input["RGB_INTERLEAVED"] / 255,
                INSTR=nn_dict_input["INSTR"],
                previous_action=nn_dict_input["previous_action"],
                previous_reward=nn_dict_input["previous_reward"],
            ))

        # Send another batch_size x 1 sample through the network using the previous internal-states.
        next_nn_input = self.input_space.sample(size=(batch_size, 1))
        expected = None
        preprocessed_states, actions, last_internal_states = test.test(
            ("get_preprocessed_state_and_action",
             [next_nn_input, last_internal_states]),
            expected_outputs=expected)
        print("Second action: {}".format(actions))
        self.assertEquals(actions.shape, (batch_size, 1))
        self.assertEquals(last_internal_states[0].shape, (batch_size, 256))
        self.assertEquals(last_internal_states[1].shape, (batch_size, 256))
        # Check preprocessed state (all the same except 'image' channel, which gets divided by 255).
        recursive_assert_almost_equal(
            preprocessed_states,
            dict(
                RGB_INTERLEAVED=next_nn_input["RGB_INTERLEAVED"] / 255,
                INSTR=next_nn_input["INSTR"],
                previous_action=next_nn_input["previous_action"],
                previous_reward=next_nn_input["previous_reward"],
            ))

        # Send batch x time states through the network to simulate agent-type=learner behavior.
        # time-steps=20, batch=batch_size (must match the batch dim of last-internal-states).
        next_nn_input = self.input_space.sample(size=(batch_size, 20))
        expected = None
        preprocessed_states, actions, last_internal_states = test.test(
            ("get_preprocessed_state_and_action",
             [next_nn_input, last_internal_states]),
            expected_outputs=expected)
        print("Actions 3 to 22: {}".format(actions))
        self.assertEquals(actions.shape, (batch_size, 20))
        self.assertEquals(last_internal_states[0].shape, (batch_size, 256))
        self.assertEquals(last_internal_states[1].shape, (batch_size, 256))
        # Check preprocessed state (all the same except 'image' channel).
        recursive_assert_almost_equal(
            preprocessed_states,
            dict(
                RGB_INTERLEAVED=next_nn_input["RGB_INTERLEAVED"] / 255,
                INSTR=next_nn_input["INSTR"],
                previous_action=next_nn_input["previous_action"],
                previous_reward=next_nn_input["previous_reward"],
            ))
Example #20
    def test_environment_stepper_component_with_large_impala_architecture(
            self):
        env_spec = dict(type="deepmind_lab",
                        level_id="seekavoid_arena_01",
                        observations=["RGB_INTERLEAVED", "INSTR"],
                        frameskip=4)
        dummy_env = Environment.from_spec(env_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        actor_component = ActorComponent(
            # Preprocessor spec (only for image and prev-action channel).
            dict(
                type="dict-preprocessor-stack",
                preprocessors=dict(
                    ## The images from the env are divided by 255.
                    #RGB_INTERLEAVED=[dict(type="divide", divisor=255)],
                    # The prev. action/reward from the env must be flattened/bumped-up-to-(1,).
                    previous_action=[
                        dict(type="reshape",
                             flatten=True,
                             flatten_categories=action_space.num_categories)
                    ],
                    previous_reward=[
                        dict(type="reshape", new_shape=(1, )),
                        dict(type="convert_type", to_dtype="float32")
                    ],
                )),
            # Policy spec.
            dict(network_spec=LargeIMPALANetwork(), action_space=action_space),
            # Exploration spec.
            Exploration(epsilon_spec=dict(decay_spec=dict(type="linear_decay",
                                                          from_=1.0,
                                                          to_=0.1,
                                                          start_timestep=0,
                                                          num_timesteps=100))))
        environment_stepper = EnvironmentStepper(
            environment_spec=env_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float32",
            internal_states_space=self.internal_states_space,
            num_steps=100,
            # Add both prev-action and -reward into the state sent through the network.
            add_previous_action_to_state=True,
            add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=self.action_probs_space)

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )
        # Reset the stepper.
        test.test("reset")

        # Step n times through the Env and collect results.
        # The return value is the tuple of collected step data (one entry per step): preprocessed states,
        # actions, rewards, episode returns, terminals, raw next-states, action-probs, internal-states.
        time_start = time.perf_counter()
        steps = 10
        for _ in range(steps):
            out = test.test("step")
        time_total = time.perf_counter() - time_start
        print(
            "Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec ({} actions/sec)."
            .format(steps, environment_stepper.num_steps, time_total,
                    environment_stepper.num_steps * steps / time_total))

        # Check types of outputs.
        self.assertTrue(isinstance(
            out, DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        self.assertTrue(out[0]["INSTR"].dtype == np.object)
        self.assertTrue(out[0]["RGB_INTERLEAVED"].dtype == np.float32)
        self.assertTrue(out[0]["RGB_INTERLEAVED"].min() >=
                        0.0)  # make sure we have pixels / 255
        self.assertTrue(out[0]["RGB_INTERLEAVED"].max() <= 1.0)
        self.assertTrue(out[1].dtype == np.int32)  # actions
        self.assertTrue(out[2].dtype == np.float32)  # rewards
        self.assertTrue(out[3].dtype == np.float32)  # episode return
        self.assertTrue(out[4].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(out[5]["INSTR"].dtype ==
                        np.object)  # next state (raw, not preprocessed)
        self.assertTrue(out[5]["RGB_INTERLEAVED"].dtype ==
                        np.uint8)  # next state (raw, not preprocessed)
        self.assertTrue(
            out[5]["RGB_INTERLEAVED"].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[5]["RGB_INTERLEAVED"].max() <= 255)
        # action probs (test whether sum to one).
        self.assertTrue(out[6].dtype == np.float32)
        self.assertTrue(out[6].min() >= 0.0)
        self.assertTrue(out[6].max() <= 1.0)
        recursive_assert_almost_equal(
            out[6].sum(axis=-1, keepdims=False),
            np.ones(shape=(environment_stepper.num_steps, )),
            decimals=4)
        # internal states (c- and h-state)
        self.assertTrue(out[7][0].dtype == np.float32)
        self.assertTrue(out[7][1].dtype == np.float32)
        self.assertTrue(out[7][0].shape == (environment_stepper.num_steps,
                                            256))
        self.assertTrue(out[7][1].shape == (environment_stepper.num_steps,
                                            256))

        # Check whether episode returns match single rewards (including terminal signals).
        episode_returns = 0.0
        for i in range(environment_stepper.num_steps):
            episode_returns += out[2][i]
            self.assertAlmostEqual(episode_returns, out[3][i])
            # Terminal: Reset for next step.
            if out[4][i] is np.bool_(True):
                episode_returns = 0.0

        test.terminate()