Example #1
    def test_multi_lstm_layer(self):
        return  # TODO: finish this test case
        # Tests a double MultiLSTMLayer.
        input_spaces = dict(inputs=FloatBox(shape=(3, ),
                                            add_batch_rank=True,
                                            add_time_rank=True),
                            initial_c_and_h_states=Tuple(
                                Tuple(FloatBox(shape=(5, )),
                                      FloatBox(shape=(5, ))),
                                Tuple(FloatBox(shape=(5, )),
                                      FloatBox(shape=(5, ))),
                                add_batch_rank=True))

        multi_lstm_layer = MultiLSTMLayer(
            num_lstms=2,
            units=5,
            # Full skip connections (x goes into both layers, out0 goes into layer1).
            skip_connections=[[True, False], [True, True]])

        # Do not seed, we calculate expectations manually.
        test = ComponentTest(component=multi_lstm_layer,
                             input_spaces=input_spaces)

        # Batch of size=2, time-steps=3.
        input_ = input_spaces["inputs"].sample((2, 3))

        global_scope = "variational-auto-encoder/"
        # Calculate output manually.
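        # NOTE: The expectation calculation below still uses variable scopes from the
        # variational-auto-encoder test and does not yet match this MultiLSTMLayer setup.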
        var_dict = test.read_variable_values(
            multi_lstm_layer.variable_registry)

        encoder_network_out = dense_layer(
            input_, var_dict[global_scope +
                             "encoder-network/encoder-layer/dense/kernel"],
            var_dict[global_scope +
                     "encoder-network/encoder-layer/dense/bias"])
        expected_mean = dense_layer(
            encoder_network_out,
            var_dict[global_scope + "mean-layer/dense/kernel"],
            var_dict[global_scope + "mean-layer/dense/bias"])
        expected_stddev = dense_layer(
            encoder_network_out,
            var_dict[global_scope + "stddev-layer/dense/kernel"],
            var_dict[global_scope + "stddev-layer/dense/bias"])
        out = test.test(("encode", input_), expected_outputs=None)
        recursive_assert_almost_equal(out["mean"], expected_mean, decimals=5)
        recursive_assert_almost_equal(out["stddev"],
                                      expected_stddev,
                                      decimals=5)
        self.assertTrue(out["z_sample"].shape == (3, 1))

        test.terminate()
Example #2
    def test_impala_actor_compilation(self):
        """
        Tests IMPALA agent compilation (actor).
        """
        try:
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv
        except ImportError:
            print("Deepmind Lab not installed: Will skip this test.")
            return

        agent_config = config_from_path("configs/impala_agent_for_deepmind_lab_env.json")
        env = DeepmindLabEnv(
            level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4
        )

        actor_agent = IMPALAAgent.from_spec(
            agent_config,
            type="actor",
            state_space=env.state_space,
            action_space=env.action_space,
            internal_states_space=Tuple(FloatBox(shape=(256,)), FloatBox(shape=(256,)), add_batch_rank=True),
            # Disable monitoring to avoid the session-creation hanging in docker.
            execution_spec=dict(disable_monitoring=True)
        )
        # Start Specifiable Server with Env manually.
        actor_agent.environment_stepper.environment_server.start()
        print("Compiled IMPALA type=actor agent.")
        actor_agent.environment_stepper.environment_server.stop()
    def test_keras_style_one_container_input_space(self):
        # Define one container input Space.
        input_space = Tuple(IntBox(3), FloatBox(shape=(4,)), add_batch_rank=True)

        # One-hot flatten the int tensor.
        flatten_layer_out = ReShape(flatten=True, flatten_categories=True)(input_space[0])
        # Run the float tensor through two dense layers.
        dense_1_out = DenseLayer(units=3, scope="d1")(input_space[1])
        dense_2_out = DenseLayer(units=5, scope="d2")(dense_1_out)
        # Concat everything.
        cat_out = ConcatLayer()(flatten_layer_out, dense_2_out)

        # Use the `outputs` arg so the network can trace the data flow back to the input Space.
        # `inputs` is not needed here as there is only a single input (the Tuple).
        neural_net = NeuralNetwork(outputs=cat_out)

        test = ComponentTest(component=neural_net, input_spaces=dict(inputs=input_space))

        var_dict = neural_net.variable_registry
        w1_value = test.read_variable_values(var_dict["neural-network/d1/dense/kernel"])
        b1_value = test.read_variable_values(var_dict["neural-network/d1/dense/bias"])
        w2_value = test.read_variable_values(var_dict["neural-network/d2/dense/kernel"])
        b2_value = test.read_variable_values(var_dict["neural-network/d2/dense/bias"])

        # Batch of size=4.
        input_ = input_space.sample(4)

        expected = np.concatenate([  # concat everything
            one_hot(input_[0]),  # int flattening
            dense_layer(dense_layer(input_[1], w1_value, b1_value), w2_value, b2_value)  # float -> 2 x dense
        ], axis=-1)
        out = test.test(("call", tuple([input_])), expected_outputs=expected)

        test.terminate()
    def test_calculate_gradients(self):
        return  # TODO: enable this test case.
        optimizer = GradientDescentOptimizer(learning_rate=0.01)

        x = tf.Variable(2, name='x', dtype=tf.float32)
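        # Toy problem: loss = (log x)^2, whose gradient w.r.t. x is 2 * log(x) / x.
        # At the initial value x=2 this evaluates to ln(2) ~= 0.693.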
        log_x = tf.log(x)
        loss = tf.square(x=log_x)

        test = ComponentTest(component=optimizer,
                             input_spaces=dict(
                                 loss=FloatBox(),
                                 variables=Dict({"x": FloatBox()}),
                                 loss_per_item=FloatBox(add_batch_rank=True),
                                 grads_and_vars=Tuple(Tuple(float, float))))

        print(
            test.test(("calculate_gradients", [dict(x=x), loss]),
                      expected_outputs=None))

        test.terminate()
    def test_environment_stepper_on_deterministic_env_with_action_probs_lstm(self):
        internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)))
        preprocessor_spec = [dict(type="multiply", factor=0.1)]
        network_spec = config_from_path("configs/test_lstm_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
            exploration_spec
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env", steps_to_terminal=3),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            internal_states_space=internal_states_space,
            add_action_probs=True,
            action_probs_space=self.deterministic_action_probs_space,
            num_steps=4,
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        weights = test.read_variable_values(environment_stepper.actor_component.policy.variable_registry)
        policy_scope = "environment-stepper/actor-component/policy/"
        weights_lstm = weights[policy_scope+"test-lstm-network/lstm-layer/lstm-cell/kernel"]
        biases_lstm = weights[policy_scope+"test-lstm-network/lstm-layer/lstm-cell/bias"]
        weights_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/kernel"]
        biases_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/bias"]

        # Step 4 times through the Env and collect results.
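        # The deterministic env emits raw states 0, 1, 2, terminates after 3 steps and resets to 0;
        # the multiply-preprocessor (factor 0.1) turns these into the LSTM inputs 0.0, 0.1, 0.2, 0.0.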
        lstm_1 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm)
        lstm_2 = lstm_layer(np.array([[[0.1]]]), weights_lstm, biases_lstm, lstm_1[1])
        lstm_3 = lstm_layer(np.array([[[0.2]]]), weights_lstm, biases_lstm, lstm_2[1])
        lstm_4 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm, lstm_3[1])
        expected = (
            np.array([False, False, True, False]),
            np.array([[0.0], [1.0], [2.0], [0.0], [1.0]]),  # s' (raw)
            np.array([
                softmax(dense_layer(np.squeeze(lstm_1[0]), weights_action, biases_action)),
                softmax(dense_layer(np.squeeze(lstm_2[0]), weights_action, biases_action)),
                softmax(dense_layer(np.squeeze(lstm_3[0]), weights_action, biases_action)),
                softmax(dense_layer(np.squeeze(lstm_4[0]), weights_action, biases_action)),
            ]),  # action probs
            # internal states
            (
                np.squeeze(np.array([[[0.0, 0.0, 0.0]], lstm_1[1][0], lstm_2[1][0], lstm_3[1][0], lstm_4[1][0]])),
                np.squeeze(np.array([[[0.0, 0.0, 0.0]], lstm_1[1][1], lstm_2[1][1], lstm_3[1][1], lstm_4[1][1]]))
            )
        )
        test.test("step", expected_outputs=expected)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
Example #6
    @staticmethod
    def _prepare_loss_function_test(loss_function):
        test = ComponentTest(
            component=loss_function,
            input_spaces=dict(
                alpha=float,
                log_probs_next_sampled=FloatBox(shape=(1, ), add_batch_rank=True),
                q_values_next_sampled=Tuple(FloatBox(shape=(1, )),
                                            FloatBox(shape=(1, )),
                                            add_batch_rank=True),
                q_values=Tuple(FloatBox(shape=(1, )),
                               FloatBox(shape=(1, )),
                               add_batch_rank=True),
                log_probs_sampled=FloatBox(shape=(1, ), add_batch_rank=True),
                q_values_sampled=Tuple(FloatBox(shape=(1, )),
                                       FloatBox(shape=(1, )),
                                       add_batch_rank=True),
                rewards=FloatBox(add_batch_rank=True),
                terminals=BoolBox(add_batch_rank=True),
                loss_per_item=FloatBox(add_batch_rank=True)),
            action_space=IntBox(2, shape=(), add_batch_rank=True))
        return test
    def test_multi_input_stream_neural_network_with_tuple(self):
        # Space must contain batch dimension (otherwise, NNLayer will complain).
        input_space = Tuple(
            IntBox(3, shape=()),
            FloatBox(shape=(8,)),
            IntBox(4, shape=()),
            add_batch_rank=True
        )

        multi_input_nn = MultiInputStreamNeuralNetwork(
            input_network_specs=(
                [{"type": "reshape", "flatten": True, "flatten_categories": True}],  # intbox -> flatten
                [{"type": "dense", "units": 2}],  # floatbox -> dense
                [{"type": "reshape", "flatten": True, "flatten_categories": True}]  # intbox -> flatten
            ),
            post_network_spec=[{"type": "dense", "units": 3}],
        )

        test = ComponentTest(component=multi_input_nn, input_spaces=dict(inputs=input_space))

        # Batch of size=3.
        nn_inputs = input_space.sample(3)

        global_scope_pre = "multi-input-stream-nn/input-stream-nn-"
        global_scope_post = "multi-input-stream-nn/post-concat-nn/dense-layer/dense/"
        # Calculate output manually.
        var_dict = test.read_variable_values()

        flat_0 = one_hot(nn_inputs[0], depth=3)
        dense_1 = dense_layer(
            nn_inputs[1], var_dict[global_scope_pre+"1/dense-layer/dense/kernel"],
            var_dict[global_scope_pre+"1/dense-layer/dense/bias"]
        )
        flat_2 = one_hot(nn_inputs[2], depth=4)
        concat_out = np.concatenate((flat_0, dense_1, flat_2), axis=-1)
        expected = dense_layer(concat_out, var_dict[global_scope_post+"kernel"], var_dict[global_scope_post+"bias"])

        test.test(("call", tuple([nn_inputs])), expected_outputs=expected)

        test.terminate()
Example #8
    def test_impala_actor_compilation(self):
        """
        Tests IMPALA agent compilation (actor).
        """
        return  # TODO: enable this test case.
        if get_backend() == "pytorch":
            return
        try:
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv
        except ImportError:
            print("Deepmind Lab not installed: Will skip this test.")
            return

        agent_config = config_from_path("configs/impala_agent_for_deepmind_lab_env.json")
        env_spec = dict(level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4)
        dummy_env = DeepmindLabEnv.from_spec(env_spec)
        agent = IMPALAAgent.from_spec(
            agent_config,
            type="actor",
            state_space=dummy_env.state_space,
            action_space=dummy_env.action_space,
            internal_states_space=Tuple(FloatBox(shape=(256,)), FloatBox(shape=(256,)), add_batch_rank=False),
            environment_spec=default_dict(dict(type="deepmind-lab"), env_spec),
            # Disable monitoring to avoid the session-creation hanging in docker.
            execution_spec=dict(
                session_config=dict(
                    type="monitored-training-session",
                    auto_start=False
                ),
                disable_monitoring=True
            )
        )
        # Start Specifiable Server with Env manually (monitoring is disabled).
        agent.environment_stepper.environment_server.start_server()
        print("Compiled {}".format(agent))
        agent.environment_stepper.environment_server.stop_server()
        agent.terminate()
Example #9
    def test_impala_learner_compilation(self):
        """
        Tests IMPALA agent compilation (learner).
        """
        try:
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv
        except ImportError:
            print("Deepmind Lab not installed: Will skip this test.")
            return

        agent_config = config_from_path("configs/impala_agent_for_deepmind_lab_env.json")
        env = DeepmindLabEnv(
            level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4
        )

        learner_agent = IMPALAAgent.from_spec(
            agent_config,
            type="learner",
            state_space=env.state_space,
            action_space=env.action_space,
            internal_states_space=Tuple(FloatBox(shape=(256,)), FloatBox(shape=(256,)), add_batch_rank=True),
        )

        print("Compiled IMPALA type=learner agent.")
Example #10
    def test_lstm_nn_with_custom_apply(self):
        # Space must contain batch dimension (otherwise, NNLayer will complain).
        units = 3
        batch_size = 2
        time_steps = 4
        input_nodes = 2
        input_space = FloatBox(shape=(input_nodes, ),
                               add_batch_rank=True,
                               add_time_rank=True)
        internal_states_space = Tuple(FloatBox(shape=(units, )),
                                      FloatBox(shape=(units, )),
                                      add_batch_rank=True)

        def custom_apply(self, input_, internal_states=None):
            d0_out = self.get_sub_component_by_name("d0").apply(input_)
            lstm_out = self.get_sub_component_by_name("lstm").apply(
                d0_out, internal_states)
            d1_out = self.get_sub_component_by_name("d1").apply(
                lstm_out["output"])
            return dict(output=d1_out,
                        last_internal_states=lstm_out["last_internal_states"])

        # Create a simple neural net with the above custom API-method.
        neural_net = NeuralNetwork(DenseLayer(units, scope="d0"),
                                   LSTMLayer(units, scope="lstm"),
                                   DenseLayer(units, scope="d1"),
                                   api_methods={("apply", custom_apply)})

        # Do not seed, we calculate expectations manually.
        test = ComponentTest(component=neural_net,
                             input_spaces=dict(
                                 input_=input_space,
                                 internal_states=internal_states_space))

        # Batch of size=2, time-steps=4.
        input_ = input_space.sample((batch_size, time_steps))
        internal_states = internal_states_space.sample(batch_size)

        # Calculate output manually.
        w0_value = test.read_variable_values(
            neural_net.variable_registry["neural-network/d0/dense/kernel"])
        b0_value = test.read_variable_values(
            neural_net.variable_registry["neural-network/d0/dense/bias"])
        w1_value = test.read_variable_values(
            neural_net.variable_registry["neural-network/d1/dense/kernel"])
        b1_value = test.read_variable_values(
            neural_net.variable_registry["neural-network/d1/dense/bias"])
        lstm_w_value = test.read_variable_values(
            neural_net.
            variable_registry["neural-network/lstm/lstm-cell/kernel"])
        lstm_b_value = test.read_variable_values(
            neural_net.variable_registry["neural-network/lstm/lstm-cell/bias"])

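        # Re-build the forward pass manually with the test helpers (mirrors `custom_apply` above): d0 -> LSTM -> d1.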
        d0_out = dense_layer(input_, w0_value, b0_value)
        lstm_out, last_internal_states = lstm_layer(
            d0_out,
            lstm_w_value,
            lstm_b_value,
            initial_internal_states=internal_states,
            time_major=False)
        d1_out = dense_layer(lstm_out, w1_value, b1_value)

        expected = dict(output=d1_out,
                        last_internal_states=last_internal_states)
        test.test(("apply", [input_, internal_states]),
                  expected_outputs=expected,
                  decimals=5)

        test.terminate()
Example #11
    def __init__(self, file_name=None, worker_id=0, base_port=5005, seed=0, docker_training=False, no_graphics=False,
                 timeout_wait=30, train_mode=True, **kwargs):
        """
        Args:
            file_name (Optional[str]): Name of the Unity environment binary.
            worker_id (int): Number to add to `base_port`. Used for asynchronous agent scenarios.
            base_port (int): Port number to connect to the Unity environment. `worker_id` increments on top of this.
            seed (int): Random seed passed to the Unity environment.
            docker_training (bool): Whether the process is being run within a container. Default: False.
            no_graphics (bool): Whether to run the Unity simulator in no-graphics mode. Default: False.
            timeout_wait (int): Time (in seconds) to wait for a connection from the environment.
            train_mode (bool): Whether to run in training mode, speeding up the simulation. Default: True.
        """
        # First create the underlying ML-Agents UnityEnvironment to derive state and action Spaces, then create the
        # RLgraph Environment instance.
        self.mlagents_env = UnityEnvironment(
            file_name, worker_id, base_port, seed, docker_training, no_graphics
        )
        all_brain_info = self.mlagents_env.reset()
        # Get all possible information from AllBrainInfo.
        # TODO: Which scene do we pick?
        self.scene_key = next(iter(all_brain_info))
        first_brain_info = all_brain_info[self.scene_key]
        num_environments = len(first_brain_info.agents)

        state_space = {}
        if len(first_brain_info.vector_observations[0]) > 0:
            state_space["vector"] = get_space_from_op(first_brain_info.vector_observations[0])
            # TODO: This is a hack.
            if state_space["vector"].dtype == np.float64:
                state_space["vector"].dtype = np.float32
        if len(first_brain_info.visual_observations) > 0:
            state_space["visual"] = get_space_from_op(first_brain_info.visual_observations[0])
        if first_brain_info.text_observations[0]:
            state_space["text"] = get_space_from_op(first_brain_info.text_observations[0])

        if len(state_space) == 1:
            self.state_key = next(iter(state_space))
            state_space = state_space[self.state_key]
        else:
            self.state_key = None
            state_space = Dict(state_space)
        brain_params = next(iter(self.mlagents_env.brains.values()))
        if brain_params.vector_action_space_type == "discrete":
            highs = brain_params.vector_action_space_size
            # MultiDiscrete (Tuple(IntBox)).
            if any(h != highs[0] for h in highs):
                action_space = Tuple([IntBox(h) for h in highs])
            # Normal IntBox:
            else:
                action_space = IntBox(
                    low=np.zeros_like(highs, dtype=np.int32),
                    high=np.array(highs, dtype=np.int32),
                    shape=(len(highs),)
                )
        else:
            action_space = get_space_from_op(first_brain_info.action_masks[0])
        if action_space.dtype == np.float64:
            action_space.dtype = np.float32

        super(MLAgentsEnv, self).__init__(
            num_environments=num_environments, state_space=state_space, action_space=action_space, **kwargs
        )

        # Caches the last observation we made (after stepping or resetting).
        self.last_state = None
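
A minimal usage sketch for the environment above (hedged: only `__init__` is shown, so the binary name is hypothetical and `reset()`/`terminate()` are assumed to follow the standard RLgraph Environment interface):

env = MLAgentsEnv(file_name="3DBall", worker_id=0)  # "3DBall" is a hypothetical Unity binary name.
first_states = env.reset()                          # Assumed standard reset call returning the initial state(s).
random_action = env.action_space.sample()           # Sample one action from the derived action Space.
env.terminate()                                     # Assumed cleanup of the underlying Unity process.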
Example #12
class IMPALAAgent(Agent):
    """
    An Agent implementing the IMPALA algorithm described in [1]. The Agent contains both learner and actor
    API-methods, which will be put into the graph depending on the agent's `type` ("single", "actor" or "learner").

    [1] IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures - Espeholt, Soyer,
        Munos et al. - 2018 (https://arxiv.org/abs/1802.01561)
    """

    default_internal_states_space = Tuple(FloatBox(shape=(256, )),
                                          FloatBox(shape=(256, )),
                                          add_batch_rank=False)
    default_environment_spec = dict(type="deepmind_lab",
                                    level_id="seekavoid_arena_01",
                                    observations=["RGB_INTERLEAVED", "INSTR"],
                                    frameskip=4)

    def __init__(self,
                 discount=0.99,
                 fifo_queue_spec=None,
                 architecture="large",
                 environment_spec=None,
                 feed_previous_action_through_nn=True,
                 feed_previous_reward_through_nn=True,
                 weight_pg=None,
                 weight_baseline=None,
                 weight_entropy=None,
                 worker_sample_size=100,
                 **kwargs):
        """
        Args:
            discount (float): The discount factor gamma.
            architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored if
                `network_spec` is given explicitly in kwargs. Default: "large".
            fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA algorithm.
            environment_spec (dict): The spec for constructing an Environment object for an actor-type IMPALA agent.
            feed_previous_action_through_nn (bool): Whether to add the previous action as another input channel to the
                ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_action". Default: True.
            feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input channel to the
                ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_reward". Default: True.
            weight_pg (float): See IMPALALossFunction Component.
            weight_baseline (float): See IMPALALossFunction Component.
            weight_entropy (float): See IMPALALossFunction Component.
            worker_sample_size (int): How many steps the actor will perform in the environment each sample-run.

        Keyword Args:
            type (str): One of "single", "actor" or "learner". Default: "single".
        """
        type_ = kwargs.pop("type", "single")
        assert type_ in ["single", "actor", "learner"]
        self.type = type_
        self.worker_sample_size = worker_sample_size

        # Network-spec by default is a "large architecture" IMPALA network.
        self.network_spec = kwargs.pop(
            "network_spec",
            dict(type="rlgraph.components.neural_networks.impala.impala_networks.{}IMPALANetwork".format(
                "Large" if architecture == "large" else "Small"
            ))
        )
        if isinstance(self.network_spec, dict) and "type" in self.network_spec and \
                "IMPALANetwork" in self.network_spec["type"]:
            self.network_spec = default_dict(
                self.network_spec,
                dict(worker_sample_size=1 if self.type == "actor" else self.worker_sample_size + 1)
            )

        # Depending on the job-type, remove the pieces from the Agent-spec/graph we won't need.
        self.exploration_spec = kwargs.pop("exploration_spec", None)
        optimizer_spec = kwargs.pop("optimizer_spec", None)
        observe_spec = kwargs.pop("observe_spec", None)

        self.feed_previous_action_through_nn = feed_previous_action_through_nn
        self.feed_previous_reward_through_nn = feed_previous_reward_through_nn

        # Run everything in a single process.
        if self.type == "single":
            environment_spec = environment_spec or self.default_environment_spec
            update_spec = kwargs.pop("update_spec", None)
        # Actors won't need to learn (no optimizer needed in graph).
        elif self.type == "actor":
            optimizer_spec = None
            update_spec = kwargs.pop("update_spec", dict(do_updates=False))
            environment_spec = environment_spec or self.default_environment_spec
        # Learners won't need to explore (act) or observe (insert into Queue).
        else:
            observe_spec = None
            update_spec = kwargs.pop("update_spec", None)
            environment_spec = None

        # Add previous-action/reward preprocessors to env-specific preprocessor spec.
        # TODO: remove this empty hard-coded preprocessor.
        self.preprocessing_spec = kwargs.pop(
            "preprocessing_spec",
            dict(
                type="dict-preprocessor-stack",
                preprocessors=dict(
                    # Flatten actions.
                    previous_action=[
                        dict(type="reshape",
                             flatten=True,
                             flatten_categories=kwargs.get(
                                 "action_space").num_categories)
                    ],
                    # Bump reward and convert to float32, so that it can be concatenated by the Concat layer.
                    previous_reward=[dict(type="reshape", new_shape=(1, ))])))

        # Limit communication in distributed mode between each actor and the learner (never between actors).
        execution_spec = kwargs.pop("execution_spec", None)
        if execution_spec is not None and execution_spec.get(
                "mode") == "distributed":
            default_dict(
                execution_spec["session_config"],
                dict(type="monitored-training-session",
                     allow_soft_placement=True,
                     device_filters=["/job:learner/task:0"] + ([
                         "/job:actor/task:{}".format(
                             execution_spec["distributed_spec"]["task_index"])
                     ] if self.type == "actor" else ["/job:learner/task:0"])))
            # If Actor, make non-chief in either case (even if task idx == 0).
            if self.type == "actor":
                execution_spec["distributed_spec"]["is_chief"] = False
                # Hard-set device to the CPU for actors.
                execution_spec["device_strategy"] = "custom"
                execution_spec[
                    "default_device"] = "/job:{}/task:{}/cpu".format(
                        self.type,
                        execution_spec["distributed_spec"]["task_index"])

        self.policy_spec = kwargs.pop("policy_spec", dict())
        # TODO: Create some auto-setting based on LSTM inside the NN.
        default_dict(
            self.policy_spec,
            dict(type="shared-value-function-policy",
                 deterministic=False,
                 reuse_variable_scope="shared-policy",
                 action_space=kwargs.get("action_space")))

        # Now that we fixed the Agent's spec, call the super constructor.
        super(IMPALAAgent,
              self).__init__(discount=discount,
                             preprocessing_spec=self.preprocessing_spec,
                             network_spec=self.network_spec,
                             policy_spec=self.policy_spec,
                             exploration_spec=self.exploration_spec,
                             optimizer_spec=optimizer_spec,
                             observe_spec=observe_spec,
                             update_spec=update_spec,
                             execution_spec=execution_spec,
                             name=kwargs.pop(
                                 "name", "impala-{}-agent".format(self.type)),
                             **kwargs)
        # Always use 1st learner as the parameter server for all policy variables.
        if self.execution_spec["mode"] == "distributed" and self.execution_spec[
                "distributed_spec"]["cluster_spec"]:
            self.policy.propagate_sub_component_properties(
                dict(device=dict(variables="/job:learner/task:0/cpu")))

        # Check whether we have an RNN.
        self.has_rnn = self.policy.neural_network.has_rnn()
        # Check, whether we are running with GPU.
        self.has_gpu = self.execution_spec["gpu_spec"]["gpus_enabled"] is True and \
            self.execution_spec["gpu_spec"]["num_gpus"] > 0

        # Some FIFO-queue specs.
        self.fifo_queue_keys = ["terminals", "states"] + \
                               (["actions"] if not self.feed_previous_action_through_nn else []) + \
                               (["rewards"] if not self.feed_previous_reward_through_nn else []) + \
                               ["action_probs"] + \
                               (["initial_internal_states"] if self.has_rnn else [])
        # Define FIFO record space.
        # Note that only states and internal_states (RNN) contain num-steps+1 items, all other sub-records only contain
        # num-steps items.
        self.fifo_record_space = Dict(
            {
                "terminals":
                bool,
                "action_probs":
                FloatBox(shape=(self.action_space.num_categories, )),
            },
            add_batch_rank=False,
            add_time_rank=self.worker_sample_size)
        self.fifo_record_space["states"] = self.state_space.with_time_rank(
            self.worker_sample_size + 1)
        # Are previous actions/rewards fed through the NN (as part of the state) or do they get their own channels?
        if self.feed_previous_action_through_nn:
            self.fifo_record_space["states"]["previous_action"] = \
                self.action_space.with_time_rank(self.worker_sample_size + 1)
        else:
            self.fifo_record_space[
                "actions"] = self.action_space.with_time_rank(
                    self.worker_sample_size)
        if self.feed_previous_reward_through_nn:
            self.fifo_record_space["states"]["previous_reward"] = FloatBox(
                add_time_rank=self.worker_sample_size + 1)
        else:
            self.fifo_record_space["rewards"] = FloatBox(
                add_time_rank=self.worker_sample_size)

        if self.has_rnn:
            self.fifo_record_space[
                "initial_internal_states"] = self.internal_states_space.with_time_rank(
                    False)

        # Create our FIFOQueue (actors will enqueue, learner(s) will dequeue).
        self.fifo_queue = FIFOQueue.from_spec(
            fifo_queue_spec or dict(capacity=1),
            reuse_variable_scope="shared-fifo-queue",
            only_insert_single_records=True,
            record_space=self.fifo_record_space,
            device="/job:learner/task:0/cpu"
            if self.execution_spec["mode"] == "distributed"
            and self.execution_spec["distributed_spec"]["cluster_spec"] else
            None)

        # Remove `states` key from input_spaces: not needed.
        del self.input_spaces["states"]

        # Add all our sub-components to the core.
        if self.type == "single":
            pass

        elif self.type == "actor":
            # No learning, no loss function.
            self.loss_function = None
            # A Dict Splitter to split things from the EnvStepper.
            self.env_output_splitter = ContainerSplitter(
                tuple_length=4, scope="env-output-splitter")

            self.states_dict_splitter = None

            # Slice some data from the EnvStepper (e.g only first internal states are needed).
            self.internal_states_slicer = Slice(scope="internal-states-slicer",
                                                squeeze=True)
            # Merge back to insert into FIFO.
            self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)

            # Dummy Flattener to calculate action-probs space.
            dummy_flattener = ReShape(
                flatten=True,
                flatten_categories=self.action_space.num_categories)
            self.environment_stepper = EnvironmentStepper(
                environment_spec=environment_spec,
                actor_component_spec=ActorComponent(self.preprocessor,
                                                    self.policy,
                                                    self.exploration),
                state_space=self.state_space.with_batch_rank(),
                reward_space=float,  # TODO <- float64 for deepmind? may not work for other envs
                internal_states_space=self.internal_states_space,
                num_steps=self.worker_sample_size,
                add_previous_action_to_state=True,
                add_previous_reward_to_state=True,
                add_action_probs=True,
                action_probs_space=dummy_flattener.get_preprocessed_space(
                    self.action_space))
            sub_components = [
                self.environment_stepper, self.env_output_splitter,
                self.internal_states_slicer, self.fifo_input_merger,
                self.fifo_queue
            ]
        # Learner.
        else:
            self.environment_stepper = None

            # A Dict splitter to split up items from the queue.
            self.fifo_input_merger = None
            self.fifo_output_splitter = ContainerSplitter(
                *self.fifo_queue_keys, scope="fifo-output-splitter")
            self.states_dict_splitter = ContainerSplitter(
                *list(self.fifo_record_space["states"].keys()),
                scope="states-dict-splitter")
            self.internal_states_slicer = None

            self.transposer = Transpose(
                scope="transposer", device=dict(ops="/job:learner/task:0/cpu"))
            self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

            # Create an IMPALALossFunction with some parameters.
            self.loss_function = IMPALALossFunction(
                discount=self.discount,
                weight_pg=weight_pg,
                weight_baseline=weight_baseline,
                weight_entropy=weight_entropy,
                slice_actions=self.feed_previous_action_through_nn,
                slice_rewards=self.feed_previous_reward_through_nn,
                device="/job:learner/task:0/gpu")

            self.policy.propagate_sub_component_properties(
                dict(device=dict(variables="/job:learner/task:0/cpu",
                                 ops="/job:learner/task:0/gpu")))
            for component in [
                    self.staging_area, self.preprocessor, self.optimizer
            ]:
                component.propagate_sub_component_properties(
                    dict(device="/job:learner/task:0/gpu"))

            sub_components = [
                self.fifo_output_splitter, self.fifo_queue,
                self.states_dict_splitter, self.transposer, self.staging_area,
                self.preprocessor, self.policy, self.loss_function,
                self.optimizer
            ]

        if self.type != "single":
            # Add all the agent's sub-components to the root.
            self.root_component.add_components(*sub_components)

            # Define the Agent's (root Component's) API.
            self.define_graph_api(*sub_components)

        if self.type != "single" and self.auto_build:
            if self.type == "learner":
                build_options = dict(
                    build_device_context="/job:learner/task:0/cpu",
                    pin_global_variable_device="/job:learner/task:0/cpu")
                self._build_graph([self.root_component],
                                  self.input_spaces,
                                  optimizer=self.optimizer,
                                  build_options=build_options)
            else:
                self._build_graph([self.root_component],
                                  self.input_spaces,
                                  optimizer=self.optimizer,
                                  build_options=None)

            self.graph_built = True

            if self.has_gpu:
                # Get 1st return op of API-method `stage` of sub-component `staging-area` (which is the stage-op).
                self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                    out_op_columns[0].op_records[0].op
                # Initialize the stage.
                self.graph_executor.monitored_session.run_step_fn(
                    lambda step_context: step_context.session.run(self.stage_op
                                                                  ))

                # TODO remove after full refactor.
                self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
                    out_op_columns[0].op_records[0].op
            if self.type == "actor":
                self.enqueue_op = self.root_component.sub_components["fifo-queue"].api_methods["insert_records"]. \
                    out_op_columns[0].op_records[0].op

    def define_graph_api(self, *sub_components):
        # TODO: Unify agents with/w/o synchronizable policy.
        # TODO: Unify Agents with/w/o get_action method (w/ env-stepper vs w/o).
        #global_scope_base = "environment-stepper/actor-component/" if self.type == "actor" else ""
        #super(IMPALAAgent, self).define_graph_api(
        #    global_scope_base+"policy",
        #    global_scope_base+"dict-preprocessor-stack"
        #)

        # Assemble the specific agent.
        if self.type == "single":
            pass
        elif self.type == "actor":
            self.define_graph_api_actor(*sub_components)
        else:
            self.define_graph_api_learner(*sub_components)

    def define_graph_api_actor(self, env_stepper, env_output_splitter,
                               internal_states_slicer, merger, fifo_queue):
        """
        Defines the API-methods used by an IMPALA actor. Actors only step through an environment (n-steps at
        a time), collect the results and push them into the FIFO queue. Results include: The actions actually
        taken, the discounted accumulated returns for each action, the probability of each taken action according to
        the behavior policy.

        Args:
            env_stepper (EnvironmentStepper): The EnvironmentStepper Component to step through the Env n steps
                in a single op call.

            env_output_splitter (ContainerSplitter): The ContainerSplitter Component used to split up the
                EnvironmentStepper's output.

            internal_states_slicer (Slice): The Slice Component used to slice off the initial internal states
                from the stepper's output.

            merger (DictMerger): The DictMerger Component used to merge the results into a single FIFO record.

            fifo_queue (FIFOQueue): The FIFOQueue Component used to enqueue env sample runs (n-step).
        """
        # Perform n-steps in the env and insert the results into our FIFO-queue.
        @rlgraph_api(component=self.root_component)
        def perform_n_steps_and_insert_into_fifo(self_):
            # Take n steps in the environment.
            step_results = env_stepper.step()

            split_output = env_output_splitter.split(step_results)
            # Slice off the initial internal state (so the learner can re-feed-forward from that internal-state).
            initial_internal_states = internal_states_slicer.slice(
                split_output[-1], 0)  # -1=internal states
            to_merge = split_output[:-1] + (initial_internal_states, )
            record = merger.merge(*to_merge)

            # Insert results into the FIFOQueue.
            insert_op = fifo_queue.insert_records(record)

            return insert_op, split_output[0]  # 0=terminals

    def define_graph_api_learner(self, fifo_output_splitter, fifo_queue,
                                 states_dict_splitter, transposer,
                                 staging_area, preprocessor, policy,
                                 loss_function, optimizer):
        """
        Defines the API-methods used by an IMPALA learner. Its job is basically: Pull a batch from the
        FIFOQueue, split it up into its components and pass these through the loss function and into the optimizer for
        a learning update.

        Args:
            fifo_output_splitter (ContainerSplitter): The ContainerSplitter Component to split up a batch from the queue
                along its items.

            fifo_queue (FIFOQueue): The FIFOQueue Component used to enqueue env sample runs (n-step).

            states_dict_splitter (ContainerSplitter): The ContainerSplitter Component to split the state components
                into its single parts.

            transposer (Transpose): A space-agnostic Transpose to flip batch- and time ranks of all state-components.
            staging_area (StagingArea): A possible GPU staging area component.

            preprocessor (PreprocessorStack): A preprocessing Component for the states (may be a DictPreprocessorStack
                as well).

            policy (Policy): The Policy Component, which to update.
            loss_function (IMPALALossFunction): The IMPALALossFunction Component.
            optimizer (Optimizer): The optimizer that we use to calculate an update and apply it.
        """
        @rlgraph_api(component=self.root_component)
        def get_queue_size(self_):
            return fifo_queue.get_size()

        @rlgraph_api(component=self.root_component)
        def update_from_memory(self_):
            # Pull n records from the queue.
            # Note that everything will come out as batch-major and must be transposed before the main-LSTM.
            # This is done by the network itself for all network inputs:
            # - preprocessed_s
            # - preprocessed_last_s_prime
            # But must still be done for actions, rewards, terminals here in this API-method via separate ReShapers.
            records = fifo_queue.get_records(self.update_spec["batch_size"])

            split_record = fifo_output_splitter.split(records)
            actions = None
            rewards = None
            if self.feed_previous_action_through_nn and self.feed_previous_reward_through_nn:
                terminals, states, action_probs_mu, initial_internal_states = split_record
            else:
                terminals, states, actions, rewards, action_probs_mu, initial_internal_states = split_record

            # Flip everything to time-major.
            # TODO: Create components that are less input-space sensitive (those that have no variables should
            # TODO: be reused for any kind of processing)
            states = transposer.apply(states)
            terminals = transposer.apply(terminals)
            action_probs_mu = transposer.apply(action_probs_mu)
            if self.feed_previous_action_through_nn is False:
                actions = transposer.apply(actions)
            if self.feed_previous_reward_through_nn is False:
                rewards = transposer.apply(rewards)

            # If we use a GPU: Put everything on staging area (adds 1 time step policy lag, but makes copying
            # data into GPU more efficient).
            if self.has_gpu:
                stage_op = staging_area.stage(states, terminals,
                                              action_probs_mu,
                                              initial_internal_states)
                # Get data from stage again and continue.
                states, terminals, action_probs_mu, initial_internal_states = staging_area.unstage(
                )
            else:
                # TODO: No-op component?
                stage_op = None

            # Preprocess actions and rewards inside the state (actions: flatten one-hot, rewards: expand).
            preprocessed_states = preprocessor.preprocess(states)

            # Only retrieve logits and do faster sparse softmax in loss.
            out = policy.get_state_values_logits_probabilities_log_probs(
                preprocessed_states, initial_internal_states)
            state_values_pi = out["state_values"]
            logits = out["logits"]
            #current_internal_states = out["last_internal_states"]

            # Isolate actions and rewards from states.
            if self.feed_previous_action_through_nn or self.feed_previous_reward_through_nn:
                states_split = states_dict_splitter.split(states)
                actions = states_split[-2]
                rewards = states_split[-1]

            # Calculate the loss.
            loss, loss_per_item = loss_function.loss(logits, action_probs_mu,
                                                     state_values_pi, actions,
                                                     rewards, terminals)
            policy_vars = policy._variables()

            # Pass vars and loss values into optimizer.
            step_op, loss, loss_per_item = optimizer.step(
                policy_vars, loss, loss_per_item)

            # Return optimizer op and all loss values.
            # TODO: Make it possible to return None from API-method without messing with the meta-graph.
            return step_op, (stage_op
                             if stage_op else step_op), loss, loss_per_item

    def get_action(self,
                   states,
                   internal_states=None,
                   use_exploration=True,
                   extra_returns=None):
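        # Not implemented: IMPALA actors act inside the graph via the EnvironmentStepper
        # (see the TODOs in `define_graph_api`), so no separate `get_action` pass is needed.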
        pass

    def _observe_graph(self, preprocessed_states, actions, internals, rewards,
                       terminals):
        self.graph_executor.execute(
            ("insert_records",
             [preprocessed_states, actions, rewards, terminals]))

    def update(self, batch=None):
        if batch is None:
            # With a GPU, fetch everything (incl. the stage-op); otherwise only fetch selected return values.
            if self.has_gpu:
                return self.graph_executor.execute("update_from_memory")
            else:
                return self.graph_executor.execute(
                    ("update_from_memory", None, ([0, 2, 3, 4])))
        else:
            raise RLGraphError(
                "Cannot call update-from-batch on an IMPALA Agent.")

    def __repr__(self):
        return "IMPALAAgent(type={})".format(self.type)
class TestEnvironmentStepper(unittest.TestCase):
    """
    Tests for the EnvironmentStepper Component using simple deterministic and OpenAI-gym environments.
    """
    deterministic_env_state_space = FloatBox(shape=(1, ))
    deterministic_env_action_space = IntBox(2)
    deterministic_action_probs_space = FloatBox(shape=(2, ),
                                                add_batch_rank=True)

    internal_states_space = Tuple(FloatBox(shape=(256, )),
                                  FloatBox(shape=(256, )),
                                  add_batch_rank=True)
    internal_states_space_test_lstm = Tuple(FloatBox(shape=(3, )),
                                            FloatBox(shape=(3, )),
                                            add_batch_rank=True)

    action_probs_space = FloatBox(shape=(4, ), add_batch_rank=True)

    time_steps = 500

    def test_environment_stepper_on_deterministic_env(self):
        preprocessor_spec = None
        network_spec = config_from_path("configs/test_simple_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec,
                 action_space=self.deterministic_env_action_space),
            exploration_spec)
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env",
                                  steps_to_terminal=5),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            num_steps=3)

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        # Reset the stepper.
        test.test("reset")

        # Step 3 times through the Env and collect results.
        expected = (
            None,
            (
                np.array([True, False, False, False]),  # t_
                np.array([[0.0], [1.0], [2.0], [3.0]]),  # s' (raw)
            ))
        test.test("step", expected_outputs=expected)

        # Step again; check whether the stitching of states etc. works.
        expected = (
            None,
            (
                np.array([False, False, True, False]),  # t_
                np.array([[3.0], [4.0], [0.0], [1.0]]),  # s' (raw)
            ))
        test.test("step", expected_outputs=expected)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()

    def test_environment_stepper_on_deterministic_env_with_returning_action_probs(
            self):
        preprocessor_spec = [dict(type="divide", divisor=2)]
        network_spec = config_from_path("configs/test_simple_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec,
                 action_space=self.deterministic_env_action_space),
            exploration_spec)
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env",
                                  steps_to_terminal=6),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            add_action_probs=True,
            action_probs_space=self.deterministic_action_probs_space,
            num_steps=3)

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        weights = test.read_variable_values(
            environment_stepper.actor_component.policy.variables)
        weights_hid = weights[
            "environment-stepper/actor-component/policy/test-network/hidden-layer/dense/kernel"]
        biases_hid = weights[
            "environment-stepper/actor-component/policy/test-network/hidden-layer/dense/bias"]
        weights_action = weights[
            "environment-stepper/actor-component/policy/action-adapter/action-layer/dense/kernel"]
        biases_action = weights[
            "environment-stepper/actor-component/policy/action-adapter/action-layer/dense/bias"]

        # Reset the stepper.
        test.test("reset")

        # Step 3 times through the Env and collect results.
        expected = (
            None,
            (
                # t_
                np.array([True, False, False, False]),
                # s' (raw)
                np.array([[0.0], [1.0], [2.0], [3.0]]),
                # action probs
                np.array([
                    [0.0, 0.0],  # <- init (no input gets sent through NN).
                    softmax(
                        dense_layer(
                            dense_layer(np.array([0.0]), weights_hid,
                                        biases_hid), weights_action,
                            biases_action)),
                    softmax(
                        dense_layer(
                            dense_layer(np.array([0.5]), weights_hid,
                                        biases_hid), weights_action,
                            biases_action)),
                    softmax(
                        dense_layer(
                            dense_layer(np.array([1.0]), weights_hid,
                                        biases_hid), weights_action,
                            biases_action))
                ])))
        test.test("step", expected_outputs=expected, decimals=3)

        # Step again; check whether the stitching of states etc. works.
        expected = (
            None,
            (
                np.array([False, False, False, True]),
                np.array([[3.0], [4.0], [5.0], [0.0]]),  # s' (raw)
                np.array([
                    [0.0, 0.0],  # <- init (no input gets sent through NN).
                    softmax(
                        dense_layer(
                            dense_layer(np.array([1.5]), weights_hid,
                                        biases_hid), weights_action,
                            biases_action)),
                    softmax(
                        dense_layer(
                            dense_layer(np.array([2.0]), weights_hid,
                                        biases_hid), weights_action,
                            biases_action)),
                    softmax(
                        dense_layer(
                            dense_layer(np.array([2.5]), weights_hid,
                                        biases_hid), weights_action,
                            biases_action))
                ])))
        test.test("step", expected_outputs=expected, decimals=3)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()

    def test_environment_stepper_on_deterministic_env_with_action_probs_lstm(
            self):
        internal_states_space = Tuple(FloatBox(shape=(3, )),
                                      FloatBox(shape=(3, )))
        preprocessor_spec = [dict(type="multiply", factor=0.1)]
        network_spec = config_from_path("configs/test_lstm_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec,
                 action_space=self.deterministic_env_action_space),
            exploration_spec)
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env",
                                  steps_to_terminal=3),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            internal_states_space=internal_states_space,
            add_action_probs=True,
            action_probs_space=self.deterministic_action_probs_space,
            num_steps=4,
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        weights = test.read_variable_values(
            environment_stepper.actor_component.policy.variables)
        weights_lstm = weights[
            "environment-stepper/actor-component/policy/test-lstm-network/"
            "lstm-layer/lstm-cell/kernel"]
        biases_lstm = weights[
            "environment-stepper/actor-component/policy/test-lstm-network/lstm-layer/lstm-cell/bias"]
        weights_action = weights[
            "environment-stepper/actor-component/policy/action-adapter/action-layer/dense/kernel"]
        biases_action = weights[
            "environment-stepper/actor-component/policy/action-adapter/action-layer/dense/bias"]

        # Reset the stepper.
        test.test("reset")

        # Step 4 times through the Env and collect results.
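        # The deterministic env emits raw states 0, 1, 2, terminates after 3 steps and resets to 0;
        # the multiply-preprocessor (factor 0.1) turns these into the LSTM inputs 0.0, 0.1, 0.2, 0.0.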
        lstm_1 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm)
        lstm_2 = lstm_layer(np.array([[[0.1]]]), weights_lstm, biases_lstm,
                            lstm_1[1])
        lstm_3 = lstm_layer(np.array([[[0.2]]]), weights_lstm, biases_lstm,
                            lstm_2[1])
        lstm_4 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm,
                            lstm_3[1])
        expected = (
            None,
            (
                np.array([True, False, False, True, False]),
                np.array([[0.0], [1.0], [2.0], [0.0], [1.0]]),  # s' (raw)
                np.array([
                    [0.0, 0.0],
                    softmax(
                        dense_layer(np.squeeze(lstm_1[0]), weights_action,
                                    biases_action)),
                    softmax(
                        dense_layer(np.squeeze(lstm_2[0]), weights_action,
                                    biases_action)),
                    softmax(
                        dense_layer(np.squeeze(lstm_3[0]), weights_action,
                                    biases_action)),
                    softmax(
                        dense_layer(np.squeeze(lstm_4[0]), weights_action,
                                    biases_action)),
                ]),  # action probs
                # internal states
                (np.squeeze(
                    np.array([[[0.0, 0.0, 0.0]], lstm_1[1][0], lstm_2[1][0],
                              lstm_3[1][0], lstm_4[1][0]])),
                 np.squeeze(
                     np.array([[[0.0, 0.0, 0.0]], lstm_1[1][1], lstm_2[1][1],
                               lstm_3[1][1], lstm_4[1][1]])))))
        test.test("step", expected_outputs=expected)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
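
# The `lstm_layer` helper used above manually unrolls an LSTM over the time rank and
# returns (outputs, (c, h)). A numpy sketch under the assumption that the variables
# follow tf's rnn_cell kernel layout (gates ordered input, new-candidate, forget,
# output, with a forget bias of 1.0); rlgraph's actual test helper may differ.
import numpy as np


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def lstm_layer(inputs, kernel, bias, initial_internal_states=None, forget_bias=1.0):
    # inputs: (batch, time, input-dim); kernel: (input-dim + units, 4 * units).
    units = kernel.shape[1] // 4
    batch_size, time_steps = inputs.shape[0], inputs.shape[1]
    if initial_internal_states is None:
        c = np.zeros((batch_size, units))
        h = np.zeros((batch_size, units))
    else:
        c, h = initial_internal_states
    outputs = []
    for t in range(time_steps):
        gates = np.matmul(np.concatenate([inputs[:, t, :], h], axis=-1), kernel) + bias
        i, j, f, o = np.split(gates, 4, axis=-1)
        c = sigmoid(f + forget_bias) * c + sigmoid(i) * np.tanh(j)
        h = sigmoid(o) * np.tanh(c)
        outputs.append(h)
    # Stack back into (batch, time, units) and also return the final internal states.
    return np.stack(outputs, axis=1), (c, h)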

    def test_environment_stepper_on_pong(self):
        environment_spec = dict(type="openai_gym",
                                gym_env="Pong-v0",
                                frameskip=4,
                                seed=10)
        dummy_env = Environment.from_spec(environment_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        agent_config = config_from_path("configs/dqn_agent_for_pong.json")
        actor_component = ActorComponent(
            agent_config["preprocessing_spec"],
            dict(network_spec=agent_config["network_spec"],
                 action_adapter_spec=agent_config["action_adapter_spec"],
                 action_space=action_space), agent_config["exploration_spec"])
        environment_stepper = EnvironmentStepper(
            environment_spec=environment_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float",
            add_reward=True,
            num_steps=self.time_steps)

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )

        # Step `self.time_steps` times through the env and collect results.
        # The 1st return value is the step-op (None); the 2nd is the tuple of per-step items, which with the
        # settings above contains: terminals, (raw) next-states, and rewards (see the assertions below).
        # Reset the stepper.
        test.test("reset")
        time_start = time.monotonic()
        out = test.test("step")
        time_end = time.monotonic()
        print("Done running {} steps in env-stepper env in {}sec.".format(
            environment_stepper.num_steps, time_end - time_start))

        # Check types of outputs.
        self.assertTrue(out[0] is None)
        self.assertTrue(isinstance(
            out[1], DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        #self.assertTrue(out[1][0].dtype == np.float32)  # preprocessed states
        #self.assertTrue(out[1][0].min() >= 0.0)  # make sure we have pixels / 255
        #self.assertTrue(out[1][0].max() <= 1.0)
        #self.assertTrue(out[1][1].dtype == np.int32)  # actions
        #self.assertTrue(out[1][2].dtype == np.float32)  # rewards
        #self.assertTrue(out[1][3].dtype == np.float32)  # episode return
        self.assertTrue(out[1][0].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(
            out[1][1].dtype == np.uint8)  # next state (raw, not preprocessed)
        self.assertTrue(out[1][1].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[1][1].max() <= 255)
        self.assertTrue(out[1][2].dtype == np.float32)  # rewards
        self.assertTrue(out[1][2].min() >= -1.0)  # -1.0 to 1.0
        self.assertTrue(out[1][2].max() <= 1.0)

        # Check whether episode returns match single rewards (including resetting after each terminal signal).
        #episode_returns = 0.0
        #for i in range(environment_stepper.num_steps):
        #    episode_returns += out[2][i]
        #    self.assertAlmostEqual(episode_returns, out[1][3][i])
        #    # Terminal: Reset accumulated episode-return before next step.
        #    if out[1][4][i] is np.bool_(True):
        #        episode_returns = 0.0

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
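
# The step output checked above is a 2-tuple: the step-op (None) plus a DataOpTuple of
# per-step arrays. A tiny self-contained sketch of the unpacking, with the field order
# inferred from the assertions (add_reward=True, no action probs, no internal states);
# the dummy arrays below are illustrative only.
import numpy as np

dummy_out = (None, (np.array([False, True]),                    # terminals
                    np.zeros((2, 210, 160, 3), dtype=np.uint8),  # raw next-states
                    np.array([0.0, 1.0], dtype=np.float32)))     # rewards
step_op, (terminals, next_states_raw, rewards) = dummy_out
assert step_op is None and terminals.dtype == np.bool_ and rewards.dtype == np.float32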

    def test_compare_with_non_env_stepper(self):
        environment_spec = dict(type="openai_gym",
                                gym_env="Pong-v0",
                                frameskip=4,
                                seed=10)
        dummy_env = Environment.from_spec(environment_spec)
        state_space = dummy_env.state_space.with_batch_rank()
        action_space = dummy_env.action_space
        agent_config = config_from_path("configs/dqn_agent_for_pong.json")
        actor_component = ActorComponent(
            agent_config["preprocessing_spec"],
            dict(network_spec=agent_config["network_spec"],
                 action_adapter_spec=agent_config["action_adapter_spec"],
                 action_space=action_space), agent_config["exploration_spec"])
        test = ComponentTest(
            component=actor_component,
            input_spaces=dict(states=state_space),
            action_space=action_space,
        )
        s = dummy_env.reset()
        time_start = time.monotonic()
        for i in range(self.time_steps):
            out = test.test(
                ("get_preprocessed_state_and_action", np.array([s])))
            #preprocessed_s = out["preprocessed_state"]
            a = out["action"]
            # Act in env.
            s, r, t, _ = dummy_env.step(a[0])  # remove batch
            if t:  # terminal -> reset the env
                s = dummy_env.reset()
        time_end = time.monotonic()
        print("Done running {} steps in bare-metal env in {}sec.".format(
            self.time_steps, time_end - time_start))
        test.terminate()

    def test_environment_stepper_on_deepmind_lab(self):
        try:
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv
        except ImportError:
            print("DeepmindLab not installed: Skipping this test case.")
            return

        env_spec = dict(type="deepmind_lab",
                        level_id="seekavoid_arena_01",
                        observations=["RGB_INTERLEAVED"],
                        frameskip=4)
        dummy_env = Environment.from_spec(env_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        actor_component = ActorComponent(
            # Preprocessor spec (only divide and flatten the image).
            [{
                "type": "divide",
                "divisor": 255
            }, {
                "type": "reshape",
                "flatten": True
            }],
            # Policy spec.
            dict(network_spec="../configs/test_lstm_nn.json",
                 action_space=action_space),
            # Exploration spec.
            Exploration(epsilon_spec=dict(decay_spec=dict(type="linear_decay",
                                                          from_=1.0,
                                                          to_=0.1,
                                                          start_timestep=0,
                                                          num_timesteps=100))))
        environment_stepper = EnvironmentStepper(
            environment_spec=env_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float32",
            internal_states_space=self.internal_states_space_test_lstm,
            num_steps=1000,
            # Add both prev-action and -reward into the state sent through the network.
            #add_previous_action_to_state=True,
            #add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=FloatBox(shape=(9, ), add_batch_rank=True))

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )
        # Reset the stepper.
        test.test("reset")

        # Step `environment_stepper.num_steps` times per call and collect results.
        # The 1st return value is the step-op (None); the 2nd is the tuple of per-step items, which with the
        # settings above contains: terminals, (raw) next-states, action probs, and the LSTM internal states.
        time_start = time.monotonic()
        steps = 10
        out = None
        for _ in range(steps):
            out = test.test("step")
        time_total = time.monotonic() - time_start
        print(
            "Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec. ({} actions/sec)"
            .format(steps, environment_stepper.num_steps, time_total,
                    environment_stepper.num_steps * steps / time_total))

        # Check types of outputs.
        self.assertTrue(out[0] is None)
        self.assertTrue(isinstance(
            out[1], DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        #self.assertTrue(out[0].dtype == np.float32)
        #self.assertTrue(out[0].min() >= 0.0)  # make sure we have pixels / 255
        #self.assertTrue(out[0].max() <= 1.0)
        #self.assertTrue(out[1].dtype == np.int32)  # actions
        #self.assertTrue(out[2].dtype == np.float32)  # rewards
        #self.assertTrue(out[0].dtype == np.float32)  # episode return
        self.assertTrue(out[1][0].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(
            out[1][1].dtype == np.uint8)  # next state (raw, not preprocessed)
        self.assertTrue(out[1][1].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[1][1].max() <= 255)
        # action probs (test whether sum to one).
        #self.assertTrue(out[1][6].dtype == np.float32)
        #self.assertTrue(out[1][6].min() >= 0.0)
        #self.assertTrue(out[1][6].max() <= 1.0)
        #recursive_assert_almost_equal(out[1][6].sum(axis=-1, keepdims=False),
        #                              np.ones(shape=(environment_stepper.num_steps,)), decimals=4)
        # internal states (c- and h-state)
        self.assertTrue(out[1][3][0].dtype == np.float32)
        self.assertTrue(out[1][3][1].dtype == np.float32)
        self.assertTrue(out[1][3][0].shape == (environment_stepper.num_steps, 3))
        self.assertTrue(out[1][3][1].shape == (environment_stepper.num_steps, 3))

        # Check whether episode returns match single rewards (including terminal signals).
        #episode_returns = 0.0
        #for i in range(environment_stepper.num_steps):
        #    episode_returns += out[0][i]
        #    self.assertAlmostEqual(episode_returns, out[3][i])
        #    # Terminal: Reset for next step.
        #    if out[4][i] is np.bool_(True):
        #        episode_returns = 0.0

        test.terminate()
Example #14
0
class TestFIFOQueue(unittest.TestCase):
    """
    Tests sampling and insertion behaviour of the FIFOQueue class.
    """
    record_space = Dict(states=dict(state1=float, state2=float, state3=bool),
                        actions=dict(action1=float,
                                     action2=Tuple(float, float)),
                        reward=float,
                        terminals=BoolBox(),
                        add_batch_rank=True)
    capacity = 10

    input_spaces = dict(records=record_space, num_records=int)

    def test_enqueue_dequeue(self):
        """
        Simply tests insert op without checking internal logic.
        """
        fifo_queue = FIFOQueue(capacity=self.capacity,
                               record_space=self.record_space)
        test = ComponentTest(component=fifo_queue,
                             input_spaces=self.input_spaces)

        first_record = self.record_space.sample(size=1)
        test.test(("insert_records", first_record), expected_outputs=None)
        test.test("get_size", expected_outputs=1)

        further_records = self.record_space.sample(size=5)
        test.test(("insert_records", further_records), expected_outputs=None)
        test.test("get_size", expected_outputs=6)

        expected = dict()
        for (k1, v1), (k2, v2) in zip(
                flatten_op(first_record).items(),
                flatten_op(further_records).items()):
            expected[k1] = np.concatenate((v1, v2[:4]))
        expected = unflatten_op(expected)

        test.test(("get_records", 5), expected_outputs=expected)
        test.test("get_size", expected_outputs=1)

    def test_capacity(self):
        """
        Tests if insert correctly blocks when capacity is reached.
        """
        fifo_queue = FIFOQueue(capacity=self.capacity,
                               record_space=self.record_space)
        test = ComponentTest(component=fifo_queue,
                             input_spaces=self.input_spaces)

        def run(expected_):
            # Wait a couple of seconds so the main thread hits the capacity limit first.
            time.sleep(2)
            # Then pull something out of the queue so the blocked insert can continue.
            test.test(("get_records", 2), expected_outputs=expected_)

        # Insert one more element than capacity to force the insert op to block.
        records = self.record_space.sample(size=self.capacity + 1)

        expected = dict()
        for key, value in flatten_op(records).items():
            expected[key] = value[:2]
        expected = unflatten_op(expected)

        # Start a helper thread that dequeues later, so this thread does not stay blocked on the over-capacity insert.
        thread = threading.Thread(target=run, args=(expected, ))
        thread.start()

        print("Going over capacity: blocking ...")
        test.test(("insert_records", records), expected_outputs=None)
        print("Dequeued some items in another thread. Unblocked.")

        thread.join()
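
# The blocking behaviour exercised above mirrors a plain bounded queue: the producer
# blocks on the over-capacity put until a consumer frees a slot. A self-contained
# illustration of the same producer/consumer pattern using Python's standard library
# (no TF involved); the 2-second sleep is only there to make the ordering obvious.
import queue
import threading
import time


def blocking_queue_demo(capacity=10):
    q = queue.Queue(maxsize=capacity)

    def consumer():
        time.sleep(2)  # Let the producer hit the capacity limit first.
        q.get()        # Freeing one slot unblocks the pending put().

    helper = threading.Thread(target=consumer)
    helper.start()
    for i in range(capacity + 1):
        q.put(i)       # The last put() blocks until the consumer dequeues.
    helper.join()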

    def test_fifo_queue_with_distributed_tf(self):
        """
        Tests if FIFO is correctly shared between two processes running in distributed tf.
        """
        cluster_spec = dict(source=["localhost:22222"],
                            target=["localhost:22223"])

        def run1():
            fifo_queue_1 = FIFOQueue(capacity=self.capacity,
                                     device="/job:source/task:0/cpu",
                                     record_space=self.record_space)
            test_1 = ComponentTest(component=fifo_queue_1,
                                   input_spaces=self.input_spaces,
                                   execution_spec=dict(
                                       mode="distributed",
                                       distributed_spec=dict(
                                           job="source",
                                           task_index=0,
                                           cluster_spec=cluster_spec)))
            # Insert elements from source.
            records = self.record_space.sample(size=self.capacity)
            print("inserting into source-side queue ...")
            test_1.test(("insert_records", records), expected_outputs=None)
            print("size of source-side queue:")
            print(test_1.test("get_size", expected_outputs=None))
            # Pull one sample out.
            print("pulling from source-side queue:")
            print(test_1.test(("get_records", 2), expected_outputs=None))

            test_1.terminate()

        def run2():
            fifo_queue_2 = FIFOQueue(capacity=self.capacity,
                                     device="/job:source/task:0/cpu",
                                     record_space=self.record_space)
            test_2 = ComponentTest(component=fifo_queue_2,
                                   input_spaces=self.input_spaces,
                                   execution_spec=dict(
                                       mode="distributed",
                                       distributed_spec=dict(
                                           job="target",
                                           task_index=0,
                                           cluster_spec=cluster_spec)))
            # Dequeue elements in target.
            print("size of target-side queue:")
            print(test_2.test("get_size", expected_outputs=None))
            print("pulling from target-side queue:")
            print(test_2.test(("get_records", 5), expected_outputs=None))

            test_2.terminate()

        # Run the source and target sides in separate threads (simulating two distributed-tf processes).
        thread_1 = threading.Thread(target=run1)
        thread_2 = threading.Thread(target=run2)
        thread_1.start()
        thread_2.start()

        thread_1.join()
        thread_2.join()
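
# What the distributed_spec above boils down to in raw (TF 1.x-style) terms: both jobs
# join the same cluster and build a queue pinned to the *source* device with a shared
# name, so enqueues on one side become visible to dequeues on the other. A sketch only,
# assuming the graph-mode tf.train/tf.FIFOQueue APIs; each function below would run in
# its own process.
import tensorflow as tf

cluster = tf.train.ClusterSpec({"source": ["localhost:22222"],
                                "target": ["localhost:22223"]})


def shared_queue(capacity=10):
    # Pinning the queue to the source job plus giving it a shared_name makes both
    # sessions resolve to the same underlying queue resource.
    with tf.device("/job:source/task:0/cpu:0"):
        return tf.FIFOQueue(capacity, dtypes=[tf.float32], shared_name="shared-fifo")


def source_process():
    server = tf.train.Server(cluster, job_name="source", task_index=0)
    enqueue_op = shared_queue().enqueue([1.0])
    with tf.Session(server.target) as sess:
        sess.run(enqueue_op)


def target_process():
    server = tf.train.Server(cluster, job_name="target", task_index=0)
    dequeue_op = shared_queue().dequeue()
    with tf.Session(server.target) as sess:
        print(sess.run(dequeue_op))  # Blocks until the source side has enqueued.
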
    def test_keras_style_complex_multi_stream_nn(self):
        # 3 inputs.
        input_spaces = [
            Dict({
                "img": FloatBox(shape=(6, 6, 3)),
                "int": IntBox(3)
            }, add_batch_rank=True, add_time_rank=True),
            FloatBox(shape=(2,), add_batch_rank=True),
            Tuple(IntBox(2), TextBox(), add_batch_rank=True, add_time_rank=True)
        ]

        # Same NN as in the test above, using only some of the sub-Spaces from the input spaces.
        # Tests whether this NN can automatically add the correct splitters.
        folded_text = ReShape(fold_time_rank=True)(input_spaces[2][1])
        # String layer will create batched AND time-ranked (individual words) hash outputs (int64).
        string_bucket_out, lengths = StringToHashBucket(num_hash_buckets=5)(folded_text)
        # Batched and time-ranked embedding output (floats) with embed dim=n.
        embedding_out = EmbeddingLookup(embed_dim=10, vocab_size=5)(string_bucket_out)
        # Pass embeddings through a text LSTM and use last output (reduce time-rank).
        string_lstm_out, _ = LSTMLayer(units=2, return_sequences=False, scope="lstm-layer-txt")(
            embedding_out, sequence_length=lengths
        )
        # Unfold to get original time-rank back.
        string_lstm_out_unfolded = ReShape(unfold_time_rank=True)(string_lstm_out, input_spaces[2][1])

        # Parallel image stream via 1 CNN layer plus dense.
        folded_img = ReShape(fold_time_rank=True, scope="img-fold")(input_spaces[0]["img"])
        cnn_out = Conv2DLayer(filters=1, kernel_size=2, strides=2)(folded_img)
        unfolded_cnn_out = ReShape(unfold_time_rank=True, scope="img-unfold")(cnn_out, input_spaces[0]["img"])
        unfolded_cnn_out_flattened = ReShape(flatten=True, scope="img-flat")(unfolded_cnn_out)
        dense_out = DenseLayer(units=2, scope="dense-0")(unfolded_cnn_out_flattened)

        # Concat everything.
        concat_out = ConcatLayer()(string_lstm_out_unfolded, dense_out)

        # LSTM output has batch+time.
        main_lstm_out, internal_states = LSTMLayer(units=2, scope="lstm-layer-main")(concat_out)

        dense1_after_lstm_out = DenseLayer(units=3, scope="dense-1")(main_lstm_out)
        dense2_after_lstm_out = DenseLayer(units=2, scope="dense-2")(dense1_after_lstm_out)
        dense3_after_lstm_out = DenseLayer(units=1, scope="dense-3")(dense2_after_lstm_out)

        # A NN with 3 outputs.
        neural_net = NeuralNetwork(inputs=input_spaces, outputs=[dense3_after_lstm_out, main_lstm_out, internal_states])

        test = ComponentTest(component=neural_net, input_spaces=dict(inputs=input_spaces))

        # Batch of size=4 with a time-rank of 2.
        sample_shape = (4, 2)
        input_ = [input_spaces[0].sample(sample_shape), input_spaces[1].sample(sample_shape[0]),
                  input_spaces[2].sample(sample_shape)]

        out = test.test(("call", tuple(input_)), expected_outputs=None)
        # Main output (Dense out after LSTM).
        self.assertTrue(out[0].shape == sample_shape + (1,))  # 1=1 unit in dense layer
        self.assertTrue(out[0].dtype == np.float32)
        # main-LSTM out.
        self.assertTrue(out[1].shape == sample_shape + (2,))  # 2=2 LSTM units
        self.assertTrue(out[1].dtype == np.float32)
        # main-LSTM internal-states.
        self.assertTrue(out[2][0].shape == sample_shape[:1] + (2,))  # 2=2 LSTM units
        self.assertTrue(out[2][0].dtype == np.float32)
        self.assertTrue(out[2][1].shape == sample_shape[:1] + (2,))  # 2=2 LSTM units
        self.assertTrue(out[2][1].dtype == np.float32)

        test.terminate()
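
# The ReShape(fold_time_rank=True)/ReShape(unfold_time_rank=True) pairs above just merge
# the batch and time ranks so that time-agnostic layers (Conv2D, the hash bucket, etc.)
# can be applied per time step, and then restore the original ranks. A plain numpy
# illustration of that shape bookkeeping (names here are illustrative, not rlgraph API).
import numpy as np

batch_size, time_steps = 4, 2
x = np.random.random(size=(batch_size, time_steps, 6, 6, 3))   # (B, T, H, W, C)

folded = x.reshape((-1,) + x.shape[2:])                        # fold -> (B*T, H, W, C)
processed = folded                                             # any per-frame op goes here
unfolded = processed.reshape((batch_size, time_steps) + processed.shape[1:])  # unfold -> (B, T, ...)
assert unfolded.shape == x.shape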