def test_actor_component_with_lstm_network(self):
        """
        Runs an ActorComponent whose policy is built from "configs/test_lstm_nn.json" for 1000 single
        time-steps (batch size 2) and checks:
        - the preprocessed states (input divided by 10),
        - the shapes of the two returned internal (c/h) states,
        - that the standard deviation of all picked actions lies in (0.4, 0.6).
        """
        # state space and internal state space
        state_space = FloatBox(shape=(2, ),
                               add_batch_rank=True,
                               add_time_rank=True,
                               time_major=False)
        internal_states_space = Tuple(FloatBox(shape=(3, )),
                                      FloatBox(shape=(3, )),
                                      add_batch_rank=True)
        time_step_space = IntBox()
        # action_space.
        action_space = IntBox(2, add_batch_rank=True, add_time_rank=True)

        preprocessor = PreprocessorStack.from_spec([
            dict(type="convert_type", to_dtype="float"),
            dict(type="divide", divisor=10)
        ])
        policy = Policy(
            network_spec=config_from_path("configs/test_lstm_nn.json"),
            action_space=action_space)
        exploration = Exploration(epsilon_spec=dict(
            decay_spec=dict(type="linear_decay",
                            from_=1.0,
                            to_=0.1,
                            start_timestep=0,
                            num_timesteps=100)))
        actor_component = ActorComponent(preprocessor, policy, exploration)
        test = ComponentTest(component=actor_component,
                             input_spaces=dict(states=state_space,
                                               other_nn_inputs=Tuple(
                                                   internal_states_space,
                                                   add_batch_rank=True),
                                               time_step=time_step_space),
                             action_space=action_space)
        # Some state inputs: 1000 steps, each holding a batch of 2 states with 2 features
        # (the result buffers below are laid out the same way).
        np.random.seed(10)
        states = state_space.sample(size=(1000, 2))
        initial_internal_states = internal_states_space.zeros(
            size=2)  # only batch
        time_steps = time_step_space.sample(1000)

        # Run n times a single time-step to simulate acting and env interaction with an LSTM.
        # Result buffers are fully overwritten inside the loop, hence uninitialized allocation.
        # The builtin `float`/`int` dtypes replace the `np.float`/`np.int` aliases, which were
        # removed from NumPy (they were plain aliases of the builtins).
        preprocessed_states = np.empty(shape=(1000, 2, 2), dtype=float)
        actions = np.empty(shape=(1000, 2, 1), dtype=int)
        for i, time_step in enumerate(time_steps):
            # NOTE(review): the same zero-valued initial internal states are passed in on every step;
            # the returned c/h-states are only shape-checked and never fed back - confirm this is intended.
            ret = test.test((
                "get_preprocessed_state_and_action",
                # expand time dim at 1st slot as we are time-major == False
                [
                    np.expand_dims(states[i], 1),
                    tuple([initial_internal_states]), time_step
                ]))
            preprocessed_states[i] = ret[
                "preprocessed_state"][:, 0, :]  # take out time-rank again
            actions[i] = ret["action"]
            # Check c/h-state shape.
            self.assertEqual(ret["nn_outputs"][1][0].shape,
                             (2, 3))  # batch-size=2, LSTM units=3
            self.assertEqual(ret["nn_outputs"][1][1].shape, (2, 3))

        # Check all preprocessed states (easy: just divided by 10).
        expected_preprocessed_state = states / 10
        recursive_assert_almost_equal(preprocessed_states,
                                      expected_preprocessed_state)

        # Check the exploration functionality over the actions.
        # The mean is not checked (we are mostly in the non-exploratory region); only the spread of the
        # picked actions is bounded.
        stddev_actions = actions.std()
        self.assertGreater(stddev_actions, 0.4)
        self.assertLess(stddev_actions, 0.6)
    def test_simple_actor_component(self):
        """
        Checks an ActorComponent (preprocessor stack + policy + default Exploration) against expectations
        computed by hand in numpy from the component's own kernel variables.

        Two API-methods are exercised: `get_preprocessed_state_and_action` and
        `get_preprocessed_state_action_and_action_probs`.
        """
        # Keys under which the two kernels are looked up in the values read from the variable registry.
        hidden_kernel_key = "actor-component/policy/test-network/hidden-layer/dense/kernel"
        action_kernel_key = "actor-component/policy/action-adapter-0/action-network/action-layer/dense/kernel"

        # Spaces. Per the original note, the NN is a simple single fc-layer relu network (2 units) with
        # random biases and weights; the expectations below only use plain products with the kernels.
        state_space = FloatBox(shape=(5,), add_batch_rank=True)
        action_space = IntBox(10)

        # Sub-components: convert to float, then multiply by 2; policy from the test config; no exploration.
        preprocessor = PreprocessorStack.from_spec([
            dict(type="convert_type", to_dtype="float"),
            dict(type="multiply", factor=2)
        ])
        policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
        exploration = Exploration()
        actor_component = ActorComponent(preprocessor, policy, exploration)
        test = ComponentTest(
            component=actor_component, input_spaces=dict(states=state_space), action_space=action_space
        )

        # ---- 1) Preprocessed state, greedy action and NN-output. ----
        weights = test.read_variable_values(actor_component.variable_registry)
        # 5 input nodes, batch size=2.
        batch = state_space.sample(2)
        doubled = batch * 2
        # Preprocessed input times hidden kernel, then times action-layer kernel (raw action layer output).
        hidden_out = np.matmul(doubled, weights[hidden_kernel_key])
        action_layer_out = np.matmul(hidden_out, weights[action_kernel_key])
        # Max-likelihood/greedy pick over the last axis.
        greedy_actions = np.argmax(action_layer_out, axis=-1)
        test.test(
            ("get_preprocessed_state_and_action", batch),
            expected_outputs={
                "preprocessed_state": doubled,
                "action": greedy_actions,
                "nn_outputs": hidden_out
            },
            decimals=5
        )

        # ---- 2) Same again via the API-method that also returns the action probabilities. ----
        batch = state_space.sample(5)
        weights = test.read_variable_values(actor_component.variable_registry)
        doubled = batch * 2
        hidden_out = np.matmul(doubled, weights[hidden_kernel_key])
        action_layer_out = np.matmul(hidden_out, weights[action_kernel_key])
        # No reshape necessary (simple action space): softmax directly yields the probs.
        probs = softmax(action_layer_out)
        greedy_actions = np.argmax(action_layer_out, axis=-1)
        test.test(
            ("get_preprocessed_state_action_and_action_probs", batch),
            expected_outputs={
                "preprocessed_state": doubled,
                "action": greedy_actions,
                "action_probs": probs,
                "nn_outputs": hidden_out
            },
            decimals=5
        )