Пример #1
0
    def test_environment_stepper_on_2x2_grid_world(self):
        """
        Runs an EnvironmentStepper with a fixed-weight, deterministic policy on the 2x2 grid world and checks
        the returned terminals, raw next-states and action probs over two consecutive `step` calls.
        """
        # Flatten the incoming state, using the action space's category count for `flatten_categories`.
        preprocessor_spec = [dict(
            type="reshape", flatten=True, flatten_categories=self.grid_world_2x2_action_space.num_categories
        )]
        network_spec = config_from_path("configs/test_simple_nn.json")
        # Try to find a NN that outputs greedy actions down in start state and right in state=1 (to reach goal).
        first_layer_spec = network_spec["layers"][0]
        first_layer_spec["weights_spec"] = [[0.5, -0.5], [-0.1, 0.1], [-0.2, 0.2], [-0.4, 0.2]]
        first_layer_spec["biases_spec"] = False
        exploration_spec = None
        policy_spec = dict(
            network_spec=network_spec,
            action_adapter_spec=dict(
                weights_spec=[[0.1, -0.5, 0.5, 0.1], [0.4, 0.2, -0.2, 0.2]],
                biases_spec=False
            ),
            action_space=self.grid_world_2x2_action_space,
            deterministic=True
        )
        actor_component = ActorComponent(preprocessor_spec, policy_spec, exploration_spec)
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="grid_world", world="2x2"),
            actor_component_spec=actor_component,
            state_space=self.grid_world_2x2_state_space,
            reward_space="float32",
            add_action_probs=True,
            action_probs_space=self.grid_world_2x2_action_probs_space,
            num_steps=5
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.grid_world_2x2_action_space,
        )

        # Only two distinct action-prob rows are expected; they alternate in lockstep with the two
        # alternating states (see `s'` below).
        probs_a = [0.21869287, 0.17905058, 0.36056358, 0.24169299]
        probs_b = [0.2547221, 0.2651175, 0.23048209, 0.24967825]

        # Always shut the Env down again (even if an expectation below fails), otherwise a failing
        # assertion would leave the Env on the server running.
        try:
            # Step 5 times through the Env and collect results.
            expected = (
                np.array([False, True, False, True, False]),  # t_
                np.array([0, 1, 0, 1, 0, 1]),  # s' (raw)
                np.array([probs_a, probs_b, probs_a, probs_b, probs_a], dtype=np.float32)  # action probs
            )
            out = test.test("step", expected_outputs=expected, decimals=2)
            print(out)

            # Step again, check whether stitching of states/etc.. works.
            expected = (
                np.array([True, False, True, False, True]),  # t_
                np.array([1, 0, 1, 0, 1, 0]),  # s' (raw)
                np.array([probs_b, probs_a, probs_b, probs_a, probs_b], dtype=np.float32)  # action probs
            )
            out = test.test("step", expected_outputs=expected)
            print(out)
        finally:
            # Make sure we close the session (to shut down the Env on the server).
            test.terminate()
Пример #2
0
    def test_environment_stepper_on_deterministic_env_with_returning_action_probs(self):
        """
        Steps through a deterministic env (terminal after 6 steps) in two chunks of 3 steps and compares the
        returned action probs against a numpy re-computation using the policy's actual weights.
        """
        # States are divided by 2 before entering the network (hence the 0.0, 0.5, 1.0, ... inputs below).
        preprocessor_spec = [dict(type="divide", divisor=2)]
        network_spec = config_from_path("configs/test_simple_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
            exploration_spec
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env", steps_to_terminal=6),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            add_action_probs=True,
            action_probs_space=self.deterministic_action_probs_space,
            num_steps=3
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        # Always shut the Env down again (even if an expectation below fails), otherwise a failing
        # assertion would leave the Env on the server running.
        try:
            # Pull the (randomly initialized) weights out of the graph to re-compute the expected outputs.
            weights = test.read_variable_values(environment_stepper.actor_component.policy.variable_registry)
            policy_scope = "environment-stepper/actor-component/policy/"
            weights_hid = weights[policy_scope+"test-network/hidden-layer/dense/kernel"]
            biases_hid = weights[policy_scope+"test-network/hidden-layer/dense/bias"]
            weights_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/kernel"]
            biases_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/bias"]

            def expected_probs(preprocessed_state):
                # Hidden layer -> action layer -> softmax for one (already preprocessed) scalar state.
                hidden_out = dense_layer(np.array([preprocessed_state]), weights_hid, biases_hid)
                return softmax(dense_layer(hidden_out, weights_action, biases_action))

            # Step 3 times through the Env and collect results.
            expected = (
                # t_
                np.array([False, False, False]),
                # s' (raw)
                np.array([[0.0], [1.0], [2.0], [3.0]]),
                # action probs
                np.array([expected_probs(0.0), expected_probs(0.5), expected_probs(1.0)])
            )
            test.test("step", expected_outputs=expected, decimals=3)

            # Step again, check whether stitching of states/etc.. works.
            expected = (
                # t_
                np.array([False, False, True]),
                # s' (raw)
                np.array([[3.0], [4.0], [5.0], [0.0]]),
                # action probs
                np.array([expected_probs(1.5), expected_probs(2.0), expected_probs(2.5)])
            )
            test.test("step", expected_outputs=expected, decimals=3)
        finally:
            # Make sure we close the session (to shut down the Env on the server).
            test.terminate()
Пример #3
0
    def test_environment_stepper_on_deterministic_env_with_action_probs_lstm(self):
        """
        Steps 4 times through a deterministic env (terminal after 3 steps) with an LSTM policy and compares
        action probs and internal states against a numpy re-computation using the policy's actual weights.
        """
        internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)))
        # States are multiplied by 0.1 before entering the network (hence the 0.0, 0.1, 0.2 inputs below).
        preprocessor_spec = [dict(type="multiply", factor=0.1)]
        network_spec = config_from_path("configs/test_lstm_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
            exploration_spec
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env", steps_to_terminal=3),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            internal_states_space=internal_states_space,
            add_action_probs=True,
            action_probs_space=self.deterministic_action_probs_space,
            num_steps=4,
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        # Always shut the Env down again (even if an expectation below fails), otherwise a failing
        # assertion would leave the Env on the server running.
        try:
            # Pull the (randomly initialized) weights out of the graph to re-compute the expected outputs.
            weights = test.read_variable_values(environment_stepper.actor_component.policy.variable_registry)
            policy_scope = "environment-stepper/actor-component/policy/"
            weights_lstm = weights[policy_scope+"test-lstm-network/lstm-layer/lstm-cell/kernel"]
            biases_lstm = weights[policy_scope+"test-lstm-network/lstm-layer/lstm-cell/bias"]
            weights_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/kernel"]
            biases_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/bias"]

            # Step 4 times (`num_steps`) through the Env and collect results.
            # Each LSTM call is fed the internal states (index 1 of the result) of the previous call.
            lstm_1 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm)
            lstm_2 = lstm_layer(np.array([[[0.1]]]), weights_lstm, biases_lstm, lstm_1[1])
            lstm_3 = lstm_layer(np.array([[[0.2]]]), weights_lstm, biases_lstm, lstm_2[1])
            lstm_4 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm, lstm_3[1])
            lstm_results = (lstm_1, lstm_2, lstm_3, lstm_4)

            # The internal-state sequences start with an all-zero state, followed by the state after each step.
            zero_state = [[0.0, 0.0, 0.0]]
            expected = (
                np.array([False, False, True, False]),  # t_
                np.array([[0.0], [1.0], [2.0], [0.0], [1.0]]),  # s' (raw)
                np.array([
                    softmax(dense_layer(np.squeeze(result[0]), weights_action, biases_action))
                    for result in lstm_results
                ]),  # action probs
                # internal states
                (
                    np.squeeze(np.array([zero_state] + [result[1][0] for result in lstm_results])),
                    np.squeeze(np.array([zero_state] + [result[1][1] for result in lstm_results]))
                )
            )
            test.test("step", expected_outputs=expected)
        finally:
            # Make sure we close the session (to shut down the Env on the server).
            test.terminate()
Пример #4
0
    def test_to_find_out_what_breaks_specifiable_server_start_via_thread_pools(
            self):
        """
        Builds an EnvironmentStepper around a DeepMind Lab env with a LargeIMPALANetwork policy and only
        calls its `reset` API-method.
        """
        env_spec = {
            "type": "deepmind_lab",
            "level_id": "seekavoid_arena_01",
            "observations": ["RGB_INTERLEAVED", "INSTR"],
            "frameskip": 4,
        }
        # Throw-away env, only used to look up the state- and action-spaces.
        dummy_env = Environment.from_spec(env_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space

        # Preprocessor spec (only for image and prev-action/-reward channels).
        preprocessor_spec = {
            "type": "dict-preprocessor-stack",
            "preprocessors": {
                # The images from the env are divided by 255.
                "RGB_INTERLEAVED": [{"type": "divide", "divisor": 255}],
                # The prev. action/reward from the env must be flattened/bumped-up-to-(1,).
                "previous_action": [{
                    "type": "reshape",
                    "flatten": True,
                    "flatten_categories": action_space.num_categories,
                }],
                "previous_reward": [
                    {"type": "reshape", "new_shape": (1, )},
                    {"type": "convert_type", "to_dtype": "float32"},
                ],
            },
        }
        # Policy spec.
        policy_spec = {"network_spec": LargeIMPALANetwork(), "action_space": action_space}
        # Exploration spec: epsilon linearly decaying from 1.0 to 0.1 over 100 timesteps.
        exploration = Exploration(epsilon_spec={
            "decay_spec": {
                "type": "linear_decay",
                "from_": 1.0,
                "to_": 0.1,
                "start_timestep": 0,
                "num_timesteps": 100,
            },
        })
        actor_component = ActorComponent(preprocessor_spec, policy_spec, exploration)

        environment_stepper = EnvironmentStepper(
            environment_spec=env_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float32",
            internal_states_space=self.internal_states_space,
            num_steps=100,
            # Add both prev-action and -reward into the state sent through the network.
            add_previous_action_to_state=True,
            add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=self.action_probs_space,
        )

        test = ComponentTest(component=environment_stepper, action_space=action_space)
        # Reset the stepper.
        test.test("reset")
Пример #5
0
    def test_environment_stepper_on_pong(self):
        """
        Runs an EnvironmentStepper for `self.time_steps` steps on Pong-v0 (openai-gym), prints the time
        taken and sanity-checks the dtypes and value ranges of the returned data.
        """
        environment_spec = dict(type="openai-gym",
                                gym_env="Pong-v0",
                                frameskip=4,
                                seed=10)
        # Throw-away env, only used to look up the state- and action-spaces.
        dummy_env = Environment.from_spec(environment_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        agent_config = config_from_path("configs/dqn_agent_for_pong.json")
        actor_component = ActorComponent(
            agent_config["preprocessing_spec"],
            dict(network_spec=agent_config["network_spec"],
                 action_space=action_space,
                 **agent_config["policy_spec"]),
            agent_config["exploration_spec"])
        environment_stepper = EnvironmentStepper(
            environment_spec=environment_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float",
            add_reward=True,
            num_steps=self.time_steps)

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )

        # Always shut the Env down again (even if a check below fails), otherwise a failing
        # assertion would leave the Env on the server running.
        try:
            # Step `self.time_steps` times through the Env and time the call.
            # The returned tuple is indexed below as: 0=terminals, 1=(raw) next-states, 2=rewards.
            time_start = time.monotonic()
            out = test.test("step")
            time_end = time.monotonic()
            print("Done running {} steps in env-stepper env in {}sec.".format(
                environment_stepper.num_steps, time_end - time_start))

            # Check types of outputs.
            self.assertIsInstance(out, DataOpTuple)  # the step results as a tuple (see above)

            # Check types of single data.
            self.assertEqual(out[0].dtype, np.bool_)  # next-state is terminal?
            self.assertEqual(out[1].dtype, np.uint8)  # next state (raw, not preprocessed)
            self.assertGreaterEqual(out[1].min(), 0)  # make sure we have pixels
            self.assertLessEqual(out[1].max(), 255)
            self.assertEqual(out[2].dtype, np.float32)  # rewards
            self.assertGreaterEqual(out[2].min(), -1.0)  # -1.0 to 1.0
            self.assertLessEqual(out[2].max(), 1.0)
        finally:
            # Make sure we close the session (to shut down the Env on the server).
            test.terminate()
Пример #6
0
    def test_environment_stepper_on_2x2_grid_world_returning_actions_and_rewards(
            self):
        """
        Runs an EnvironmentStepper with a fixed-weight, deterministic policy on the 2x2 grid world and checks
        the returned terminals, raw next-states, actions and rewards of one 5-step `step` call.
        """
        # Flatten the incoming state, using the action space's category count for `flatten_categories`.
        preprocessor_spec = [dict(
            type="reshape",
            flatten=True,
            flatten_categories=self.grid_world_2x2_action_space.num_categories
        )]
        network_spec = config_from_path("configs/test_simple_nn.json")
        # Try to find a NN that outputs greedy actions down in start state and right in state=1 (to reach goal).
        first_layer_spec = network_spec["layers"][0]
        first_layer_spec["weights_spec"] = [[0.5, -0.5], [-0.1, 0.1], [-0.2, 0.2], [-0.4, 0.2]]
        first_layer_spec["biases_spec"] = False
        exploration_spec = None
        policy_spec = dict(
            network_spec=network_spec,
            action_adapter_spec=dict(
                weights_spec=[[0.1, -0.5, 0.5, 0.1], [0.4, 0.2, -0.2, 0.2]],
                biases_spec=False
            ),
            action_space=self.grid_world_2x2_action_space,
            deterministic=True
        )
        actor_component = ActorComponent(preprocessor_spec, policy_spec, exploration_spec)
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="grid_world", world="2x2"),
            actor_component_spec=actor_component,
            state_space=self.grid_world_2x2_state_space,
            reward_space="float32",
            add_action=True,
            add_reward=True,
            num_steps=5
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.grid_world_2x2_action_space,
        )

        # Always shut the Env down again (even if the expectation below fails), otherwise a failing
        # assertion would leave the Env on the server running.
        try:
            # Step 5 times through the Env and collect results.
            expected = (
                np.array([False, True, False, True, False]),  # t_
                np.array([0, 1, 0, 1, 0, 1]),  # s' (raw)
                np.array([2, 1, 2, 1, 2]),  # actions taken
                np.array([-1.0, 1.0, -1.0, 1.0, -1.0])  # rewards
            )
            out = test.test("step", expected_outputs=expected, decimals=2)
            print(out)
        finally:
            # Make sure we close the session (to shut down the Env on the server).
            test.terminate()
    def test_environment_stepper_on_deterministic_env(self):
        """
        Resets an EnvironmentStepper on a deterministic env (terminal after 5 steps, no preprocessing) and
        checks terminals and raw next-states over two consecutive `step` calls.
        """
        preprocessor_spec = None
        network_spec = config_from_path("configs/test_simple_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
            exploration_spec
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env", steps_to_terminal=5),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            num_steps=3
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        # Always shut the Env down again (even if an expectation below fails), otherwise a failing
        # assertion would leave the Env on the server running.
        try:
            # Reset the stepper.
            test.test("reset")

            # Step 3 times through the Env and collect results.
            first_terminals = np.array([True, False, False, False])  # t_
            first_states = np.array([[0.0], [1.0], [2.0], [3.0]])  # s' (raw)
            expected = (None, (first_terminals, first_states))
            test.test("step", expected_outputs=expected)

            # Step again, check whether stitching of states/etc.. works.
            second_terminals = np.array([False, False, True, False])  # t_
            second_states = np.array([[3.0], [4.0], [0.0], [1.0]])  # s' (raw)
            expected = (None, (second_terminals, second_states))
            test.test("step", expected_outputs=expected)
        finally:
            # Make sure we close the session (to shut down the Env on the server).
            test.terminate()
Пример #8
0
    def __init__(self,
                 discount=0.99,
                 fifo_queue_spec=None,
                 architecture="large",
                 environment_spec=None,
                 feed_previous_action_through_nn=True,
                 feed_previous_reward_through_nn=True,
                 weight_pg=None,
                 weight_baseline=None,
                 weight_entropy=None,
                 num_workers=1,
                 worker_sample_size=100,
                 dynamic_batching=False,
                 visualize=False,
                 **kwargs):
        """
        Sets up a single-process IMPALA agent: calls the parent constructor with type="single", creates the
        helper components (splitters, staging area, loss function, ...), one EnvironmentStepper per worker and
        a QueueRunner over them, then defines the graph API and - if `auto_build` is set - builds the graph.

        Args:
            discount (float): The discount factor gamma.
            fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA algorithm.
            architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored if
                `network_spec` is given explicitly in kwargs. Default: "large".
            environment_spec (dict): The spec for constructing an Environment object for an actor-type IMPALA agent.
            feed_previous_action_through_nn (bool): Whether to add the previous action as another input channel to the
                ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_action". Default: True.
            feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input channel to the
                ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_reward". Default: True.
            weight_pg (float): See IMPALALossFunction Component.
            weight_baseline (float): See IMPALALossFunction Component.
            weight_entropy (float): See IMPALALossFunction Component.
            num_workers (int): How many actors (workers) should be run in separate threads.
            worker_sample_size (int): How many steps the actor will perform in the environment each sample-run.
            dynamic_batching (bool): Whether to use the deepmind's custom dynamic batching op for wrapping the
                optimizer's step call. The batcher.so file must be compiled for this to work (see Docker file).
                Default: False.
            visualize (Union[int,bool]): Whether and how many workers to visualize. True visualizes all workers,
                an int visualizes the first `visualize` workers.
                Default: False (no visualization).

        Keyword Args:
            name (str): Passed on to the parent constructor. Default: "impala-single-agent".
            All remaining kwargs are forwarded unchanged to the parent constructor.
        """
        # Now that we fixed the Agent's spec, call the super constructor.
        super(SingleIMPALAAgent, self).__init__(
            type="single",
            discount=discount,
            architecture=architecture,
            fifo_queue_spec=fifo_queue_spec,
            environment_spec=environment_spec,
            feed_previous_action_through_nn=feed_previous_action_through_nn,
            feed_previous_reward_through_nn=feed_previous_reward_through_nn,
            weight_pg=weight_pg,
            weight_baseline=weight_baseline,
            weight_entropy=weight_entropy,
            worker_sample_size=worker_sample_size,
            name=kwargs.pop("name", "impala-single-agent"),
            **kwargs)
        self.dynamic_batching = dynamic_batching
        self.num_workers = num_workers
        self.visualize = visualize

        # If we use dynamic batching, wrap the dynamic batcher around the policy's graph_fn that we
        # actually call below during our build.
        if self.dynamic_batching:
            self.policy = DynamicBatchingPolicy(policy_spec=self.policy,
                                                scope="")

        # Splits the env-stepper's output tuple: 3 items without an RNN, 4 items with one
        # (presumably the 4th item are the internal states - TODO confirm).
        self.env_output_splitter = ContainerSplitter(
            tuple_length=3 if self.has_rnn is False else 4,
            scope="env-output-splitter")
        self.fifo_output_splitter = ContainerSplitter(
            *self.fifo_queue_keys, scope="fifo-output-splitter")
        # NOTE(review): for a non-Dict state space, `list("dummy")` expands to the single characters
        # "d", "u", "m", "m", "y" (not to one key "dummy") - confirm that this splitter is never used then.
        self.states_dict_splitter = ContainerSplitter(
            *list(self.fifo_record_space["states"].keys(
            ) if isinstance(self.state_space, Dict) else "dummy"),
            scope="states-dict-splitter")

        # One staging slot per FIFO-queue key.
        self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

        # Slice some data from the EnvStepper (e.g only first internal states are needed).
        if self.has_rnn:
            internal_states_slicer = Slice(scope="internal-states-slicer",
                                           squeeze=True)
        else:
            internal_states_slicer = None

        self.transposer = Transpose(scope="transposer")

        # Create an IMPALALossFunction with some parameters.
        self.loss_function = IMPALALossFunction(
            discount=self.discount,
            weight_pg=weight_pg,
            weight_baseline=weight_baseline,
            weight_entropy=weight_entropy,
            slice_actions=self.feed_previous_action_through_nn,
            slice_rewards=self.feed_previous_reward_through_nn)

        # Merge back to insert into FIFO.
        self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)

        # Dummy Flattener to calculate action-probs space.
        # (It is not added as a sub-component below; only its `get_preprocessed_space` is used.)
        dummy_flattener = ReShape(
            flatten=True, flatten_categories=self.action_space.num_categories)

        # Build one EnvironmentStepper (with its own ActorComponent) per worker.
        self.environment_steppers = list()
        for i in range(self.num_workers):
            # Work on a copy so that per-worker changes (visualize flag) don't leak into the caller's spec.
            # NOTE(review): if `environment_spec` is None here, setting "visualize" below would fail -
            # confirm that callers always pass a dict when `visualize` is used.
            environment_spec_ = copy.deepcopy(environment_spec)
            # Visualize this worker if `visualize` is True or if it is among the first `visualize` workers.
            if self.visualize is True or (isinstance(self.visualize, int)
                                          and i + 1 <= self.visualize):
                environment_spec_["visualize"] = True

            # Force worker_sample_size for IMPALA NNs (LSTM) in env-stepper to be 1.
            policy_spec = copy.deepcopy(self.policy_spec)
            if isinstance(policy_spec, dict) and isinstance(policy_spec["network_spec"], dict) and \
                    "type" in policy_spec["network_spec"] and "IMPALANetwork" in policy_spec["network_spec"]["type"]:
                policy_spec["network_spec"]["worker_sample_size"] = 1

            # Previous action/reward are either fed through the NN (added to the state) or returned as
            # separate outputs (`add_action`/`add_reward`), never both.
            env_stepper = EnvironmentStepper(
                environment_spec=environment_spec_,
                actor_component_spec=ActorComponent(
                    preprocessor_spec=self.preprocessing_spec,
                    policy_spec=policy_spec,
                    exploration_spec=self.exploration_spec),
                state_space=self.state_space.with_batch_rank(),
                action_space=self.action_space.with_batch_rank(),
                reward_space=float,
                internal_states_space=self.internal_states_space,
                num_steps=self.worker_sample_size,
                add_action=not self.feed_previous_action_through_nn,
                add_reward=not self.feed_previous_reward_through_nn,
                add_previous_action_to_state=self.
                feed_previous_action_through_nn,
                add_previous_reward_to_state=self.
                feed_previous_reward_through_nn,
                add_action_probs=True,
                action_probs_space=dummy_flattener.get_preprocessed_space(
                    self.action_space),
                scope="env-stepper-{}".format(i))
            if self.dynamic_batching:
                # Detach the stepper's policy from its parent, wrap it into a DynamicBatchingPolicy and
                # re-add the wrapper to the actor component.
                env_stepper.actor_component.policy.parent_component = None
                env_stepper.actor_component.policy = DynamicBatchingPolicy(
                    policy_spec=env_stepper.actor_component.policy, scope="")
                env_stepper.actor_component.add_components(
                    env_stepper.actor_component.policy)

            self.environment_steppers.append(env_stepper)

        # Create the QueueRunner (a single one, receiving all env-steppers).
        self.queue_runner = QueueRunner(
            self.fifo_queue,
            "step",
            -1,  # -1: Take entire return value of API-method `step` as record to insert.
            self.env_output_splitter,
            self.fifo_input_merger,
            internal_states_slicer,
            *self.environment_steppers)

        sub_components = [
            self.fifo_output_splitter, self.fifo_queue, self.queue_runner,
            self.transposer, self.staging_area, self.preprocessor,
            self.states_dict_splitter, self.policy, self.loss_function,
            self.optimizer
        ]

        # Add all the agent's sub-components to the root.
        self.root_component.add_components(*sub_components)

        # Define the Agent's (root Component's) API.
        self.define_graph_api()

        if self.auto_build:
            self._build_graph([self.root_component],
                              self.input_spaces,
                              optimizer=self.optimizer,
                              build_options=None)
            self.graph_built = True

            # NOTE: `stage_op` and `dequeue_op` are only set when a GPU is available.
            if self.has_gpu:
                # Get 1st return op of API-method `stage` of sub-component `staging-area` (which is the stage-op).
                self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                    out_op_columns[0].op_records[0].op
                # Initialize the stage.
                self.graph_executor.monitored_session.run_step_fn(
                    lambda step_context: step_context.session.run(self.stage_op
                                                                  ))
                # TODO remove after full refactor.
                self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
                    out_op_columns[0].op_records[0].op
Пример #9
0
    def __init__(self,
                 discount=0.99,
                 fifo_queue_spec=None,
                 architecture="large",
                 environment_spec=None,
                 feed_previous_action_through_nn=True,
                 feed_previous_reward_through_nn=True,
                 weight_pg=None,
                 weight_baseline=None,
                 weight_entropy=None,
                 worker_sample_size=100,
                 **kwargs):
        """
        Args:
            discount (float): The discount factor gamma.
            architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored if
                `network_spec` is given explicitly in kwargs. Default: "large".
            fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA algorithm.
            environment_spec (dict): The spec for constructing an Environment object for an actor-type IMPALA agent.
            feed_previous_action_through_nn (bool): Whether to add the previous action as another input channel to the
                ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_action". Default: True.
            feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input channel to the
                ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_reward". Default: True.
            weight_pg (float): See IMPALALossFunction Component.
            weight_baseline (float): See IMPALALossFunction Component.
            weight_entropy (float): See IMPALALossFunction Component.
            worker_sample_size (int): How many steps the actor will perform in the environment each sample-run.

        Keyword Args:
            type (str): One of "single", "actor" or "learner". Default: "single".
        """
        type_ = kwargs.pop("type", "single")
        assert type_ in ["single", "actor", "learner"]
        self.type = type_
        self.worker_sample_size = worker_sample_size

        # Network-spec by default is a "large architecture" IMPALA network.
        self.network_spec = kwargs.pop(
            "network_spec",
            dict(
                type=
                "rlgraph.components.neural_networks.impala.impala_networks.{}IMPALANetwork"
                .format("Large" if architecture == "large" else "Small")))
        if isinstance(self.network_spec, dict) and "type" in self.network_spec and \
                "IMPALANetwork" in self.network_spec["type"]:
            self.network_spec = default_dict(
                self.network_spec,
                dict(worker_sample_size=1 if self.type ==
                     "actor" else self.worker_sample_size + 1))

        # Depending on the job-type, remove the pieces from the Agent-spec/graph we won't need.
        self.exploration_spec = kwargs.pop("exploration_spec", None)
        optimizer_spec = kwargs.pop("optimizer_spec", None)
        observe_spec = kwargs.pop("observe_spec", None)

        self.feed_previous_action_through_nn = feed_previous_action_through_nn
        self.feed_previous_reward_through_nn = feed_previous_reward_through_nn

        # Run everything in a single process.
        if self.type == "single":
            environment_spec = environment_spec or self.default_environment_spec
            update_spec = kwargs.pop("update_spec", None)
        # Actors won't need to learn (no optimizer needed in graph).
        elif self.type == "actor":
            optimizer_spec = None
            update_spec = kwargs.pop("update_spec", dict(do_updates=False))
            environment_spec = environment_spec or self.default_environment_spec
        # Learners won't need to explore (act) or observe (insert into Queue).
        else:
            observe_spec = None
            update_spec = kwargs.pop("update_spec", None)
            environment_spec = None

        # Add previous-action/reward preprocessors to env-specific preprocessor spec.
        # TODO: remove this empty hard-coded preprocessor.
        self.preprocessing_spec = kwargs.pop(
            "preprocessing_spec",
            dict(
                type="dict-preprocessor-stack",
                preprocessors=dict(
                    # Flatten actions.
                    previous_action=[
                        dict(type="reshape",
                             flatten=True,
                             flatten_categories=kwargs.get(
                                 "action_space").num_categories)
                    ],
                    # Bump reward and convert to float32, so that it can be concatenated by the Concat layer.
                    previous_reward=[dict(type="reshape", new_shape=(1, ))])))

        # Limit communication in distributed mode between each actor and the learner (never between actors).
        execution_spec = kwargs.pop("execution_spec", None)
        if execution_spec is not None and execution_spec.get(
                "mode") == "distributed":
            default_dict(
                execution_spec["session_config"],
                dict(type="monitored-training-session",
                     allow_soft_placement=True,
                     device_filters=["/job:learner/task:0"] + ([
                         "/job:actor/task:{}".format(
                             execution_spec["distributed_spec"]["task_index"])
                     ] if self.type == "actor" else ["/job:learner/task:0"])))
            # If Actor, make non-chief in either case (even if task idx == 0).
            if self.type == "actor":
                execution_spec["distributed_spec"]["is_chief"] = False
                # Hard-set device to the CPU for actors.
                execution_spec["device_strategy"] = "custom"
                execution_spec[
                    "default_device"] = "/job:{}/task:{}/cpu".format(
                        self.type,
                        execution_spec["distributed_spec"]["task_index"])

        self.policy_spec = kwargs.pop("policy_spec", dict())
        # TODO: Create some auto-setting based on LSTM inside the NN.
        default_dict(
            self.policy_spec,
            dict(type="shared-value-function-policy",
                 deterministic=False,
                 reuse_variable_scope="shared-policy",
                 action_space=kwargs.get("action_space")))

        # Now that we fixed the Agent's spec, call the super constructor.
        super(IMPALAAgent,
              self).__init__(discount=discount,
                             preprocessing_spec=self.preprocessing_spec,
                             network_spec=self.network_spec,
                             policy_spec=self.policy_spec,
                             exploration_spec=self.exploration_spec,
                             optimizer_spec=optimizer_spec,
                             observe_spec=observe_spec,
                             update_spec=update_spec,
                             execution_spec=execution_spec,
                             name=kwargs.pop(
                                 "name", "impala-{}-agent".format(self.type)),
                             **kwargs)
        # Always use 1st learner as the parameter server for all policy variables.
        if self.execution_spec["mode"] == "distributed" and self.execution_spec[
                "distributed_spec"]["cluster_spec"]:
            self.policy.propagate_sub_component_properties(
                dict(device=dict(variables="/job:learner/task:0/cpu")))

        # Check whether we have an RNN.
        self.has_rnn = self.policy.neural_network.has_rnn()
        # Check, whether we are running with GPU.
        self.has_gpu = self.execution_spec["gpu_spec"]["gpus_enabled"] is True and \
            self.execution_spec["gpu_spec"]["num_gpus"] > 0

        # Some FIFO-queue specs.
        self.fifo_queue_keys = ["terminals", "states"] + \
                               (["actions"] if not self.feed_previous_action_through_nn else []) + \
                               (["rewards"] if not self.feed_previous_reward_through_nn else []) + \
                               ["action_probs"] + \
                               (["initial_internal_states"] if self.has_rnn else [])
        # Define FIFO record space.
        # Note that only states and internal_states (RNN) contain num-steps+1 items, all other sub-records only contain
        # num-steps items.
        self.fifo_record_space = Dict(
            {
                "terminals":
                bool,
                "action_probs":
                FloatBox(shape=(self.action_space.num_categories, )),
            },
            add_batch_rank=False,
            add_time_rank=self.worker_sample_size)
        self.fifo_record_space["states"] = self.state_space.with_time_rank(
            self.worker_sample_size + 1)
        # Add action and rewards to state or do they have an extra channel?
        if self.feed_previous_action_through_nn:
            self.fifo_record_space["states"]["previous_action"] = \
                self.action_space.with_time_rank(self.worker_sample_size + 1)
        else:
            self.fifo_record_space[
                "actions"] = self.action_space.with_time_rank(
                    self.worker_sample_size)
        if self.feed_previous_action_through_nn:
            self.fifo_record_space["states"]["previous_reward"] = FloatBox(
                add_time_rank=self.worker_sample_size + 1)
        else:
            self.fifo_record_space["rewards"] = FloatBox(
                add_time_rank=self.worker_sample_size)

        if self.has_rnn:
            self.fifo_record_space[
                "initial_internal_states"] = self.internal_states_space.with_time_rank(
                    False)

        # Create our FIFOQueue (actors will enqueue, learner(s) will dequeue).
        self.fifo_queue = FIFOQueue.from_spec(
            fifo_queue_spec or dict(capacity=1),
            reuse_variable_scope="shared-fifo-queue",
            only_insert_single_records=True,
            record_space=self.fifo_record_space,
            device="/job:learner/task:0/cpu"
            if self.execution_spec["mode"] == "distributed"
            and self.execution_spec["distributed_spec"]["cluster_spec"] else
            None)

        # Remove `states` key from input_spaces: not needed.
        del self.input_spaces["states"]

        # Add all our sub-components to the core.
        if self.type == "single":
            pass

        elif self.type == "actor":
            # No learning, no loss function.
            self.loss_function = None
            # A Dict Splitter to split things from the EnvStepper.
            self.env_output_splitter = ContainerSplitter(
                tuple_length=4, scope="env-output-splitter")

            self.states_dict_splitter = None

            # Slice some data from the EnvStepper (e.g only first internal states are needed).
            self.internal_states_slicer = Slice(scope="internal-states-slicer",
                                                squeeze=True)
            # Merge back to insert into FIFO.
            self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)

            # Dummy Flattener to calculate action-probs space.
            dummy_flattener = ReShape(
                flatten=True,
                flatten_categories=self.action_space.num_categories)
            self.environment_stepper = EnvironmentStepper(
                environment_spec=environment_spec,
                actor_component_spec=ActorComponent(self.preprocessor,
                                                    self.policy,
                                                    self.exploration),
                state_space=self.state_space.with_batch_rank(),
                reward_space=
                float,  # TODO <- float64 for deepmind? may not work for other envs
                internal_states_space=self.internal_states_space,
                num_steps=self.worker_sample_size,
                add_previous_action_to_state=True,
                add_previous_reward_to_state=True,
                add_action_probs=True,
                action_probs_space=dummy_flattener.get_preprocessed_space(
                    self.action_space))
            sub_components = [
                self.environment_stepper, self.env_output_splitter,
                self.internal_states_slicer, self.fifo_input_merger,
                self.fifo_queue
            ]
        # Learner.
        else:
            self.environment_stepper = None

            # A Dict splitter to split up items from the queue.
            self.fifo_input_merger = None
            self.fifo_output_splitter = ContainerSplitter(
                *self.fifo_queue_keys, scope="fifo-output-splitter")
            self.states_dict_splitter = ContainerSplitter(
                *list(self.fifo_record_space["states"].keys()),
                scope="states-dict-splitter")
            self.internal_states_slicer = None

            self.transposer = Transpose(
                scope="transposer", device=dict(ops="/job:learner/task:0/cpu"))
            self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

            # Create an IMPALALossFunction with some parameters.
            self.loss_function = IMPALALossFunction(
                discount=self.discount,
                weight_pg=weight_pg,
                weight_baseline=weight_baseline,
                weight_entropy=weight_entropy,
                slice_actions=self.feed_previous_action_through_nn,
                slice_rewards=self.feed_previous_reward_through_nn,
                device="/job:learner/task:0/gpu")

            self.policy.propagate_sub_component_properties(
                dict(device=dict(variables="/job:learner/task:0/cpu",
                                 ops="/job:learner/task:0/gpu")))
            for component in [
                    self.staging_area, self.preprocessor, self.optimizer
            ]:
                component.propagate_sub_component_properties(
                    dict(device="/job:learner/task:0/gpu"))

            sub_components = [
                self.fifo_output_splitter, self.fifo_queue,
                self.states_dict_splitter, self.transposer, self.staging_area,
                self.preprocessor, self.policy, self.loss_function,
                self.optimizer
            ]

        if self.type != "single":
            # Add all the agent's sub-components to the root.
            self.root_component.add_components(*sub_components)

            # Define the Agent's (root Component's) API.
            self.define_graph_api(*sub_components)

        if self.type != "single" and self.auto_build:
            if self.type == "learner":
                build_options = dict(
                    build_device_context="/job:learner/task:0/cpu",
                    pin_global_variable_device="/job:learner/task:0/cpu")
                self._build_graph([self.root_component],
                                  self.input_spaces,
                                  optimizer=self.optimizer,
                                  build_options=build_options)
            else:
                self._build_graph([self.root_component],
                                  self.input_spaces,
                                  optimizer=self.optimizer,
                                  build_options=None)

            self.graph_built = True

            if self.has_gpu:
                # Get 1st return op of API-method `stage` of sub-component `staging-area` (which is the stage-op).
                self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                    out_op_columns[0].op_records[0].op
                # Initialize the stage.
                self.graph_executor.monitored_session.run_step_fn(
                    lambda step_context: step_context.session.run(self.stage_op
                                                                  ))

                # TODO remove after full refactor.
                self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
                    out_op_columns[0].op_records[0].op
            if self.type == "actor":
                self.enqueue_op = self.root_component.sub_components["fifo-queue"].api_methods["insert_records"]. \
                    out_op_columns[0].op_records[0].op
Пример #10
0
    def test_environment_stepper_on_deepmind_lab(self):
        """
        Steps an EnvironmentStepper through a DeepMind Lab level and checks the collected data.

        An ActorComponent is built from a divide-by-255 + flatten preprocessor, the network config at
        "../configs/test_lstm_nn.json" and a linearly decaying epsilon exploration (1.0 -> 0.1 over 100
        timesteps). The stepper (`num_steps=1000`, action-probs enabled) is reset once and its "step" API
        is called 10 times; the last result is then checked for dtypes, value ranges and shapes.

        The test is reported as skipped when the DeepMind Lab bindings cannot be imported.
        """
        try:
            # The imported name is only an availability probe for the DeepMind Lab bindings.
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv  # noqa: F401
        except ImportError:
            # Report the test as skipped (instead of silently passing) when the dependency is missing.
            self.skipTest("DeepmindLab not installed: Skipping this test case.")

        env_spec = dict(type="deepmind_lab",
                        level_id="seekavoid_arena_01",
                        observations=["RGB_INTERLEAVED"],
                        frameskip=4)
        # Throw-away env, only used to read the state- and action-spaces.
        dummy_env = Environment.from_spec(env_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        actor_component = ActorComponent(
            # Preprocessor spec (only divide and flatten the image).
            [{
                "type": "divide",
                "divisor": 255
            }, {
                "type": "reshape",
                "flatten": True
            }],
            # Policy spec.
            dict(network_spec="../configs/test_lstm_nn.json",
                 action_space=action_space),
            # Exploration spec.
            Exploration(epsilon_spec=dict(decay_spec=dict(type="linear_decay",
                                                          from_=1.0,
                                                          to_=0.1,
                                                          start_timestep=0,
                                                          num_timesteps=100))))
        environment_stepper = EnvironmentStepper(
            environment_spec=env_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float32",
            internal_states_space=self.internal_states_space_test_lstm,
            num_steps=1000,
            # Add both prev-action and -reward into the state sent through the network.
            #add_previous_action_to_state=True,
            #add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=FloatBox(shape=(9, ), add_batch_rank=True))

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )
        # Reset the stepper.
        test.test("reset")

        # Call the "step" API `steps` times (each call runs `num_steps` env steps) and time it.
        # Only the result of the last call is checked below: `out[0]` is expected to be the step-op (None),
        # `out[1]` the tuple of collected items.
        time_start = time.monotonic()
        steps = 10
        out = None
        for _ in range(steps):
            out = test.test("step")
        time_total = time.monotonic() - time_start
        print(
            "Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec. ({} actions/sec)"
            .format(steps, environment_stepper.num_steps, time_total,
                    environment_stepper.num_steps * steps / time_total))

        # Check types of outputs.
        self.assertTrue(out[0] is None)
        self.assertTrue(isinstance(
            out[1], DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        #self.assertTrue(out[0].dtype == np.float32)
        #self.assertTrue(out[0].min() >= 0.0)  # make sure we have pixels / 255
        #self.assertTrue(out[0].max() <= 1.0)
        #self.assertTrue(out[1].dtype == np.int32)  # actions
        #self.assertTrue(out[2].dtype == np.float32)  # rewards
        #self.assertTrue(out[0].dtype == np.float32)  # episode return
        self.assertTrue(out[1][0].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(
            out[1][1].dtype == np.uint8)  # next state (raw, not preprocessed)
        self.assertTrue(out[1][1].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[1][1].max() <= 255)
        # action probs (test whether sum to one).
        #self.assertTrue(out[1][6].dtype == np.float32)
        #self.assertTrue(out[1][6].min() >= 0.0)
        #self.assertTrue(out[1][6].max() <= 1.0)
        #recursive_assert_almost_equal(out[1][6].sum(axis=-1, keepdims=False),
        #                              np.ones(shape=(environment_stepper.num_steps,)), decimals=4)
        # internal states (c- and h-state)
        # NOTE(review): all other checks index into the results tuple `out[1]`, whereas these index `out[3]`
        # directly - confirm whether `out[1][3]` is meant here.
        self.assertTrue(out[3][0].dtype == np.float32)
        self.assertTrue(out[3][1].dtype == np.float32)
        self.assertTrue(out[3][0].shape == (environment_stepper.num_steps, 3))
        self.assertTrue(out[3][1].shape == (environment_stepper.num_steps, 3))

        # Check whether episode returns match single rewards (including terminal signals).
        #episode_returns = 0.0
        #for i in range(environment_stepper.num_steps):
        #    episode_returns += out[0][i]
        #    self.assertAlmostEqual(episode_returns, out[3][i])
        #    # Terminal: Reset for next step.
        #    if out[4][i] is np.bool_(True):
        #        episode_returns = 0.0

        test.terminate()
Пример #11
0
    def test_environment_stepper_on_pong(self):
        """
        Performs one "step" call of an EnvironmentStepper on the openAI-gym "Pong-v0" env and checks the result.

        The actor is assembled from the preprocessing-, network-, action-adapter- and exploration-specs found
        in "configs/dqn_agent_for_pong.json". The stepper is created with `add_reward=True` and
        `num_steps=self.time_steps`. After the call, dtypes and value ranges of the terminals, the raw
        next-states and the rewards are checked.
        """
        env_spec = dict(type="openai_gym", gym_env="Pong-v0", frameskip=4, seed=10)
        # Throw-away env instance: Only needed to look up the two spaces.
        probe_env = Environment.from_spec(env_spec)
        pong_states = probe_env.state_space
        pong_actions = probe_env.action_space

        config = config_from_path("configs/dqn_agent_for_pong.json")
        preprocessing = config["preprocessing_spec"]
        policy = dict(
            network_spec=config["network_spec"],
            action_adapter_spec=config["action_adapter_spec"],
            action_space=pong_actions
        )
        exploration = config["exploration_spec"]

        stepper = EnvironmentStepper(
            environment_spec=env_spec,
            actor_component_spec=ActorComponent(preprocessing, policy, exploration),
            state_space=pong_states,
            reward_space="float",
            add_reward=True,
            num_steps=self.time_steps
        )
        test = ComponentTest(component=stepper, action_space=pong_actions)

        # Reset the stepper, then time one single "step" call (which runs `num_steps` env steps).
        test.test("reset")
        started = time.monotonic()
        out = test.test("step")
        elapsed = time.monotonic() - started
        print("Done running {} steps in env-stepper env in {}sec.".format(stepper.num_steps, elapsed))

        # `out[0]` is the step-op (None), `out[1]` the tuple holding the collected step results.
        self.assertIsNone(out[0])
        results = out[1]
        self.assertIsInstance(results, DataOpTuple)

        # NOTE: Checks for preprocessed states, actions and episode returns are currently disabled; the same
        # holds for the comparison of accumulated episode returns against the single rewards.

        # Slot 0: Is the next state terminal?
        self.assertEqual(results[0].dtype, np.bool_)

        # Slot 1: Next states (raw, not preprocessed); must be valid pixel values.
        self.assertEqual(results[1].dtype, np.uint8)
        self.assertGreaterEqual(results[1].min(), 0)
        self.assertLessEqual(results[1].max(), 255)

        # Slot 2: Rewards, within -1.0 and 1.0.
        self.assertEqual(results[2].dtype, np.float32)
        self.assertGreaterEqual(results[2].min(), -1.0)
        self.assertLessEqual(results[2].max(), 1.0)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
Пример #12
0
    def test_environment_stepper_component_with_large_impala_architecture(
            self):
        """
        Steps an EnvironmentStepper using a LargeIMPALANetwork policy through a DeepMind Lab level.

        The actor's dict-preprocessor flattens `previous_action` and reshapes/converts `previous_reward`;
        exploration uses a linearly decaying epsilon (1.0 -> 0.1 over 100 timesteps). The stepper
        (`num_steps=100`, previous action/reward added to the state, action-probs enabled) is reset and its
        "step" API is called 10 times. The last result is checked for dtypes, value ranges and shapes, and
        the accumulated episode returns are compared against the single rewards.

        The test is reported as skipped when the DeepMind Lab bindings cannot be imported.
        """
        try:
            # The imported name is only an availability probe for the DeepMind Lab bindings.
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv  # noqa: F401
        except ImportError:
            self.skipTest("DeepmindLab not installed: Skipping this test case.")

        env_spec = dict(type="deepmind_lab",
                        level_id="seekavoid_arena_01",
                        observations=["RGB_INTERLEAVED", "INSTR"],
                        frameskip=4)
        # Throw-away env, only used to read the state- and action-spaces.
        dummy_env = Environment.from_spec(env_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        actor_component = ActorComponent(
            # Preprocessor spec (only for image and prev-action channel).
            dict(
                type="dict-preprocessor-stack",
                preprocessors=dict(
                    ## The images from the env  are divided by 255.
                    #RGB_INTERLEAVED=[dict(type="divide", divisor=255)],
                    # The prev. action/reward from the env must be flattened/bumped-up-to-(1,).
                    previous_action=[
                        dict(type="reshape",
                             flatten=True,
                             flatten_categories=action_space.num_categories)
                    ],
                    previous_reward=[
                        dict(type="reshape", new_shape=(1, )),
                        dict(type="convert_type", to_dtype="float32")
                    ],
                )),
            # Policy spec.
            dict(network_spec=LargeIMPALANetwork(), action_space=action_space),
            # Exploration spec.
            Exploration(epsilon_spec=dict(decay_spec=dict(type="linear_decay",
                                                          from_=1.0,
                                                          to_=0.1,
                                                          start_timestep=0,
                                                          num_timesteps=100))))
        environment_stepper = EnvironmentStepper(
            environment_spec=env_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float32",
            internal_states_space=self.internal_states_space,
            num_steps=100,
            # Add both prev-action and -reward into the state sent through the network.
            add_previous_action_to_state=True,
            add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=self.action_probs_space)

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )
        # Reset the stepper.
        test.test("reset")

        # Call the "step" API `steps` times (each call runs `num_steps` env steps) and time it.
        # Only the result of the last call is checked below.
        time_start = time.perf_counter()
        steps = 10
        for _ in range(steps):
            out = test.test("step")
        time_total = time.perf_counter() - time_start
        print(
            "Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec ({} actions/sec)."
            .format(steps, environment_stepper.num_steps, time_total,
                    environment_stepper.num_steps * steps / time_total))

        # Check types of outputs.
        self.assertTrue(isinstance(
            out, DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        # `np.object_` is used because the `np.object` alias no longer exists in current NumPy releases.
        self.assertTrue(out[0]["INSTR"].dtype == np.object_)
        self.assertTrue(out[0]["RGB_INTERLEAVED"].dtype == np.float32)
        self.assertTrue(out[0]["RGB_INTERLEAVED"].min() >=
                        0.0)  # make sure we have pixels / 255
        self.assertTrue(out[0]["RGB_INTERLEAVED"].max() <= 1.0)
        self.assertTrue(out[1].dtype == np.int32)  # actions
        self.assertTrue(out[2].dtype == np.float32)  # rewards
        self.assertTrue(out[3].dtype == np.float32)  # episode return
        self.assertTrue(out[4].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(out[5]["INSTR"].dtype ==
                        np.object_)  # next state (raw, not preprocessed)
        self.assertTrue(out[5]["RGB_INTERLEAVED"].dtype ==
                        np.uint8)  # next state (raw, not preprocessed)
        self.assertTrue(
            out[5]["RGB_INTERLEAVED"].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[5]["RGB_INTERLEAVED"].max() <= 255)
        # action probs (test whether sum to one).
        self.assertTrue(out[6].dtype == np.float32)
        self.assertTrue(out[6].min() >= 0.0)
        self.assertTrue(out[6].max() <= 1.0)
        recursive_assert_almost_equal(
            out[6].sum(axis=-1, keepdims=False),
            np.ones(shape=(environment_stepper.num_steps, )),
            decimals=4)
        # internal states (c- and h-state)
        self.assertTrue(out[7][0].dtype == np.float32)
        self.assertTrue(out[7][1].dtype == np.float32)
        self.assertTrue(out[7][0].shape == (environment_stepper.num_steps,
                                            256))
        self.assertTrue(out[7][1].shape == (environment_stepper.num_steps,
                                            256))

        # Check whether episode returns match single rewards (including terminal signals).
        episode_returns = 0.0
        for i in range(environment_stepper.num_steps):
            episode_returns += out[2][i]
            self.assertAlmostEqual(episode_returns, out[3][i])
            # Terminal: Reset for next step.
            # Test the flag's truth value rather than its object identity.
            if out[4][i]:
                episode_returns = 0.0

        test.terminate()
Пример #13
0
    def test_environment_stepper_component_with_large_impala_architecture(
            self):
        """
        Steps an EnvironmentStepper using a LargeIMPALANetwork policy through a DeepMind Lab level.

        The actor consists of a dict-preprocessor (flattening `previous_action`, reshaping/converting
        `previous_reward`) and a policy around `LargeIMPALANetwork(worker_sample_size=1)`; no exploration
        is passed. The stepper runs `worker_sample_size` env steps per "step" call, adds the previous
        action/reward to the state and returns action-probs. After starting the env server, "step" is
        called 10 times and the last result is checked for dtypes, value ranges and shapes.

        The env server is stopped and the test terminated even if one of the checks fails. The test is
        reported as skipped when the DeepMind Lab bindings cannot be imported.
        """
        try:
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv
        except ImportError:
            # Report the test as skipped (instead of silently passing) when the dependency is missing.
            self.skipTest("DeepmindLab not installed: Skipping this test case.")

        worker_sample_size = 100
        env_spec = dict(type="deepmind_lab",
                        level_id="seekavoid_arena_01",
                        observations=["RGB_INTERLEAVED", "INSTR"],
                        frameskip=4)
        # Throw-away env, only used to read the state- and action-spaces.
        dummy_env = DeepmindLabEnv.from_spec(env_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        actor_component = ActorComponent(
            # Preprocessor spec (only for image and prev-action channel).
            dict(
                type="dict-preprocessor-stack",
                preprocessors=dict(
                    # The prev. action/reward from the env must be flattened/bumped-up-to-(1,).
                    previous_action=[
                        dict(type="reshape",
                             flatten=True,
                             flatten_categories=action_space.num_categories)
                    ],
                    previous_reward=[
                        dict(type="reshape", new_shape=(1, )),
                        dict(type="convert_type", to_dtype="float32")
                    ],
                )),
            # Policy spec. worker_sample_size=1 as its an actor network.
            dict(network_spec=LargeIMPALANetwork(worker_sample_size=1),
                 action_space=action_space))
        environment_stepper = EnvironmentStepper(
            environment_spec=env_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float32",
            internal_states_space=self.internal_states_space,
            num_steps=worker_sample_size,
            # Add both prev-action and -reward into the state sent through the network.
            add_previous_action_to_state=True,
            add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=self.action_probs_space)

        test = ComponentTest(component=environment_stepper,
                             action_space=action_space,
                             execution_spec=dict(disable_monitoring=True))

        environment_stepper.environment_server.start_server()
        try:
            # Call the "step" API `steps` times (each call runs `num_steps` env steps) and time it.
            # Only the result of the last call is checked below.
            time_start = time.perf_counter()
            steps = 10
            for _ in range(steps):
                out = test.test("step")
            time_total = time.perf_counter() - time_start
            print(
                "Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec ({} actions/sec)."
                .format(steps, environment_stepper.num_steps, time_total,
                        environment_stepper.num_steps * steps / time_total))

            # Check types of outputs.
            self.assertTrue(isinstance(
                out, DataOpTuple))  # the step results as a tuple (see below)

            # Check types of single data.
            self.assertTrue(out[0].dtype == np.bool_)  # next-state is terminal?
            # `np.object_` is used because the `np.object` alias no longer exists in current NumPy releases.
            self.assertTrue(out[1]["INSTR"].dtype == np.object_)
            self.assertTrue(out[1]["RGB_INTERLEAVED"].dtype == np.uint8)
            self.assertTrue(
                out[1]["RGB_INTERLEAVED"].shape == (worker_sample_size + 1, ) +
                state_space["RGB_INTERLEAVED"].shape)
            self.assertTrue(
                out[1]["RGB_INTERLEAVED"].min() >= 0)  # make sure we have pixels
            self.assertTrue(out[1]["RGB_INTERLEAVED"].max() <= 255)
            self.assertTrue(out[1]["previous_action"].dtype == np.int32)  # actions
            self.assertTrue(
                out[1]["previous_action"].shape == (worker_sample_size + 1, ))
            self.assertTrue(
                out[1]["previous_reward"].dtype == np.float32)  # rewards
            self.assertTrue(
                out[1]["previous_reward"].shape == (worker_sample_size + 1, ))
            # action probs (test whether sum to one).
            self.assertTrue(out[2].dtype == np.float32)
            # Tie the expected time dimension to `worker_sample_size` instead of a hard-coded literal.
            self.assertTrue(out[2].shape == (worker_sample_size, action_space.num_categories))
            self.assertTrue(out[2].min() >= 0.0)
            self.assertTrue(out[2].max() <= 1.0)
            recursive_assert_almost_equal(out[2].sum(axis=-1, keepdims=False),
                                          np.ones(shape=(worker_sample_size, )),
                                          decimals=4)
            # internal states (c- and h-state)
            self.assertTrue(out[3][0].dtype == np.float32)
            self.assertTrue(out[3][0].shape == (worker_sample_size + 1, 256))
            self.assertTrue(out[3][1].dtype == np.float32)
            self.assertTrue(out[3][1].shape == (worker_sample_size + 1, 256))
        finally:
            # Always shut down the env server and the test, also when a check above failed.
            environment_stepper.environment_server.stop_server()

            test.terminate()