def test_compare_with_non_env_stepper(self):
     environment_spec = dict(type="openai_gym",
                             gym_env="Pong-v0",
                             frameskip=4,
                             seed=10)
     dummy_env = Environment.from_spec(environment_spec)
     state_space = dummy_env.state_space.with_batch_rank()
     action_space = dummy_env.action_space
     agent_config = config_from_path("configs/dqn_agent_for_pong.json")
     actor_component = ActorComponent(
         agent_config["preprocessing_spec"],
         dict(network_spec=agent_config["network_spec"],
              action_adapter_spec=agent_config["action_adapter_spec"],
              action_space=action_space), agent_config["exploration_spec"])
     test = ComponentTest(
         component=actor_component,
         input_spaces=dict(states=state_space),
         action_space=action_space,
     )
     s = dummy_env.reset()
     time_start = time.monotonic()
     for i in range(self.time_steps):
         out = test.test(
             ("get_preprocessed_state_and_action", np.array([s])))
         #preprocessed_s = out["preprocessed_state"]
         a = out["action"]
         # Act in env.
         s, r, t, _ = dummy_env.step(a[0])  # remove batch
         if t is True:
             s = dummy_env.reset()
     time_end = time.monotonic()
     print("Done running {} steps in bare-metal env in {}sec.".format(
         self.time_steps, time_end - time_start))
     test.terminate()
    def test_environment_stepper_on_pong(self):
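        """
        Steps through Pong via the in-graph EnvironmentStepper and checks dtypes and value ranges of the
        data returned by a single `step` call (num_steps env steps).
        """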
        environment_spec = dict(type="openai-gym",
                                gym_env="Pong-v0",
                                frameskip=4,
                                seed=10)
        dummy_env = Environment.from_spec(environment_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        agent_config = config_from_path("configs/dqn_agent_for_pong.json")
        actor_component = ActorComponent(
            agent_config["preprocessing_spec"],
            dict(network_spec=agent_config["network_spec"],
                 action_space=action_space,
                 **agent_config["policy_spec"]),
            agent_config["exploration_spec"])
        environment_stepper = EnvironmentStepper(
            environment_spec=environment_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float",
            add_reward=True,
            num_steps=self.time_steps)

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )

        # Step `self.time_steps` times through the Env and collect results.
        # The return value is a DataOpTuple of the stacked per-step data: terminals, (raw) next-states
        # and rewards (see the type checks below).
        time_start = time.monotonic()
        out = test.test("step")
        time_end = time.monotonic()
        print("Done running {} steps in env-stepper env in {}sec.".format(
            environment_stepper.num_steps, time_end - time_start))

        # Check types of outputs.
        self.assertTrue(isinstance(
            out, DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        self.assertTrue(out[0].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(
            out[1].dtype == np.uint8)  # next state (raw, not preprocessed)
        self.assertTrue(out[1].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[1].max() <= 255)
        self.assertTrue(out[2].dtype == np.float32)  # rewards
        self.assertTrue(out[2].min() >= -1.0)  # -1.0 to 1.0
        self.assertTrue(out[2].max() <= 1.0)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
    def __init__(self,
                 environment_spec,
                 actor_component_spec,
                 num_steps=20,
                 state_space=None,
                 reward_space=None,
                 internal_states_space=None,
                 add_action_probs=False,
                 action_probs_space=None,
                 add_action=False,
                 add_reward=False,
                 add_previous_action_to_state=False,
                 add_previous_reward_to_state=False,
                 scope="environment-stepper",
                 **kwargs):
        """
        Args:
            environment_spec (dict): A specification dict for constructing an Environment object that will be run
                inside a SpecifiableServer for in-graph stepping.
            actor_component_spec (Union[ActorComponent,dict]): A specification dict to construct this EnvStepper's
                ActorComponent (to generate actions) or an already constructed ActorComponent object.
            num_steps (int): The number of steps to perform per `step` call.
            state_space (Optional[Space]): The state Space of the Environment. If None, will construct a dummy
                environment to get the state Space from there.
            reward_space (Optional[Space]): The reward Space of the Environment. If None, will construct a dummy
                environment to get the reward Space from there.
            internal_states_space (Optional[Space]): The internal states Space (when using an RNN inside the
                ActorComponent).
            add_action_probs (bool): Whether to add all action probabilities for each step to the ActorComponent's
                outputs at each step. These will be added as an additional tensor to the output of the `step`
                API-method. Default: False.
            action_probs_space (Optional[Space]): If add_action_probs is True, the Space that the action_probs will have.
                This is usually just the flattened (one-hot) action space.
            add_action (bool): Whether to add the action to the output of the `step` API-method.
                Default: False.
            add_reward (bool): Whether to add the reward to the output of the `step` API-method.
                Default: False.
            add_previous_action_to_state (bool): Whether to add the previous action as another input channel to the
                ActorComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_action". Default: False.
            add_previous_reward_to_state (bool): Whether to add the previous reward as another input channel to the
                ActorComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_reward". Default: False.

        See the commented usage sketch after this constructor for how these arguments typically fit together.
        """
        super(EnvironmentStepper, self).__init__(scope=scope, **kwargs)

        # Only to retrieve some information about the particular Env.
        dummy_env = Environment.from_spec(
            environment_spec)  # type: Environment

        # Infer the state and/or reward Space from the dummy env if they were not given explicitly.
        if state_space is None or reward_space is None:
            state_space = dummy_env.state_space
            if reward_space is None:
                _, reward, _, _ = dummy_env.step(
                    dummy_env.action_space.sample())
                # TODO: this may break on non 64-bit machines. tf seems to interpret a python float as tf.float64.
                reward_space = Space.from_spec(
                    "float64" if type(reward) == float else float,
                    shape=(1, )).with_batch_rank()
        else:
            reward_space = Space.from_spec(reward_space).with_batch_rank()

        self.reward_space = reward_space
        self.action_space = dummy_env.action_space

        dummy_env.terminate()

        # The state that the environment produces.
        self.state_space_env = state_space
        # The state that must be fed into the actor-component to produce an action.
        # May contain prev_action and prev_reward.
        self.state_space_actor = state_space
        self.add_previous_action_to_state = add_previous_action_to_state
        self.add_previous_reward_to_state = add_previous_reward_to_state

        # Include actions and/or rewards in the output of the `step` API-method?
        self.add_action = add_action
        self.add_reward = add_reward

        # The problem with ContainerSpaces here is that py_func (SpecifiableServer) cannot handle container
        # spaces, which is why we need to painfully convert these into flat spaces and tuples here whenever
        # we make a call to the env. So to keep things unified, we treat all container spaces
        # (state space, preprocessed state) from here on as tuples of primitive spaces, sorted by their
        # would-be flat-keys in a flattened dict.
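        # For example (illustrative spaces only, not taken from the configs used in these tests): a Dict
        # state space Dict(health=FloatBox(), image=FloatBox(shape=(84, 84, 3))) would be handled from
        # here on as the tuple (FloatBox(), FloatBox(shape=(84, 84, 3))), ordered by its flat keys.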
        self.state_space_env_flattened = self.state_space_env.flatten()
        # Need to flatten the state-space in case it's a ContainerSpace for the return dtypes.
        self.state_space_env_list = list(
            self.state_space_env_flattened.values())

        # TODO: automate this by lookup from the NN Component
        self.internal_states_space = None
        if internal_states_space is not None:
            self.internal_states_space = internal_states_space.with_batch_rank(
                add_batch_rank=1)

        # Add the action/reward spaces to the state space (must be Dict).
        if self.add_previous_action_to_state is True:
            assert isinstance(self.state_space_actor, Dict),\
                "ERROR: If `add_previous_action_to_state` is True as input, state_space must be a Dict!"
            self.state_space_actor["previous_action"] = self.action_space
        if self.add_previous_reward_to_state is True:
            assert isinstance(self.state_space_actor, Dict),\
                "ERROR: If `add_previous_reward_to_state` is True as input, state_space must be a Dict!"
            self.state_space_actor["previous_reward"] = self.reward_space
        self.state_space_actor_flattened = self.state_space_actor.flatten()
        self.state_space_actor_list = list(
            self.state_space_actor_flattened.values())

        self.add_action_probs = add_action_probs
        self.action_probs_space = action_probs_space

        self.environment_spec = environment_spec
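        # Create the SpecifiableServer with the given env spec. It runs the actual Environment outside
        # the graph; `output_spaces` declares the return Spaces (and thus dtypes) of the two env methods
        # that are called in-graph via py_func: flattened states + reward + terminal-bool for stepping,
        # flattened states for resetting.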
        self.environment_server = SpecifiableServer(
            class_=Environment,
            spec=environment_spec,
            output_spaces=dict(
                step_for_env_stepper=self.state_space_env_list +
                [self.reward_space, bool],
                reset_for_env_stepper=self.state_space_env_list),
            shutdown_method="terminate")
        # Add the sub-components.
        self.actor_component = ActorComponent.from_spec(
            actor_component_spec)  # type: ActorComponent
        self.preprocessed_state_space = self.actor_component.preprocessor.get_preprocessed_space(
            self.state_space_actor)

        self.num_steps = num_steps

        # Variables that hold information of last step through Env.
        self.current_terminal = None
        self.current_state = None
        self.current_action = None  # Only if self.add_action is True.
        self.current_reward = None  # Only if self.add_reward is True.
        self.current_internal_states = None
        self.current_action_probs = None
        self.time_step = 0

        self.has_rnn = self.actor_component.policy.neural_network.has_rnn()

        # Add all sub-components (only ActorComponent).
        self.add_components(self.actor_component)
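
    # A minimal usage sketch for EnvironmentStepper (values borrowed from the tests in this file;
    # `num_steps=100` is an arbitrary, illustrative choice, not a recommended setting):
    #
    #   actor_component = ActorComponent(
    #       preprocessing_spec,
    #       dict(network_spec=network_spec, action_space=action_space),
    #       exploration_spec)
    #   environment_stepper = EnvironmentStepper(
    #       environment_spec=dict(type="openai_gym", gym_env="Pong-v0", frameskip=4, seed=10),
    #       actor_component_spec=actor_component,
    #       state_space=state_space,
    #       reward_space="float",
    #       add_reward=True,
    #       num_steps=100)
    #   # Build via ComponentTest (as in the tests in this file), then:
    #   # test.test("reset"); out = test.test("step")
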
    def test_environment_stepper_on_deepmind_lab(self):
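        """
        Steps through a DeepMind Lab level via the in-graph EnvironmentStepper, using an LSTM network
        and an epsilon Exploration component, and checks the returned step data and the LSTM's internal
        states. Skipped if DeepmindLab is not installed.
        """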
        try:
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv
        except ImportError:
            print("DeepmindLab not installed: Skipping this test case.")
            return

        env_spec = dict(type="deepmind_lab",
                        level_id="seekavoid_arena_01",
                        observations=["RGB_INTERLEAVED"],
                        frameskip=4)
        dummy_env = Environment.from_spec(env_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        actor_component = ActorComponent(
            # Preprocessor spec (only divide and flatten the image).
            [{
                "type": "divide",
                "divisor": 255
            }, {
                "type": "reshape",
                "flatten": True
            }],
            # Policy spec.
            dict(network_spec="../configs/test_lstm_nn.json",
                 action_space=action_space),
            # Exploration spec.
            Exploration(epsilon_spec=dict(decay_spec=dict(type="linear_decay",
                                                          from_=1.0,
                                                          to_=0.1,
                                                          start_timestep=0,
                                                          num_timesteps=100))))
        environment_stepper = EnvironmentStepper(
            environment_spec=env_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float32",
            internal_states_space=self.internal_states_space_test_lstm,
            num_steps=1000,
            # Add both prev-action and -reward into the state sent through the network.
            #add_previous_action_to_state=True,
            #add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=FloatBox(shape=(9, ), add_batch_rank=True))

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )
        # Reset the stepper.
        test.test("reset")

        # Step n times through the Env and collect results.
        # 1st return value is the step-op (None), 2nd return value is the tuple of collected per-step
        # data (terminals, (raw) next-states, action-probs, etc.); the LSTM's internal states are returned as well.
        time_start = time.monotonic()
        steps = 10
        out = None
        for _ in range(steps):
            out = test.test("step")
        time_total = time.monotonic() - time_start
        print(
            "Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec. ({} actions/sec)"
            .format(steps, environment_stepper.num_steps, time_total,
                    environment_stepper.num_steps * steps / time_total))

        # Check types of outputs.
        self.assertTrue(out[0] is None)
        self.assertTrue(isinstance(
            out[1], DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        #self.assertTrue(out[0].dtype == np.float32)
        #self.assertTrue(out[0].min() >= 0.0)  # make sure we have pixels / 255
        #self.assertTrue(out[0].max() <= 1.0)
        #self.assertTrue(out[1].dtype == np.int32)  # actions
        #self.assertTrue(out[2].dtype == np.float32)  # rewards
        #self.assertTrue(out[0].dtype == np.float32)  # episode return
        self.assertTrue(out[1][0].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(
            out[1][1].dtype == np.uint8)  # next state (raw, not preprocessed)
        self.assertTrue(out[1][1].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[1][1].max() <= 255)
        # action probs (test whether sum to one).
        #self.assertTrue(out[1][6].dtype == np.float32)
        #self.assertTrue(out[1][6].min() >= 0.0)
        #self.assertTrue(out[1][6].max() <= 1.0)
        #recursive_assert_almost_equal(out[1][6].sum(axis=-1, keepdims=False),
        #                              np.ones(shape=(environment_stepper.num_steps,)), decimals=4)
        # internal states (c- and h-state)
        self.assertTrue(out[3][0].dtype == np.float32)
        self.assertTrue(out[3][1].dtype == np.float32)
        self.assertTrue(out[3][0].shape == (environment_stepper.num_steps, 3))
        self.assertTrue(out[3][1].shape == (environment_stepper.num_steps, 3))

        # Check whether episode returns match single rewards (including terminal signals).
        #episode_returns = 0.0
        #for i in range(environment_stepper.num_steps):
        #    episode_returns += out[0][i]
        #    self.assertAlmostEqual(episode_returns, out[3][i])
        #    # Terminal: Reset for next step.
        #    if out[4][i] is np.bool_(True):
        #        episode_returns = 0.0

        test.terminate()
    def test_environment_stepper_on_pong(self):
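        """
        Steps through Pong via the in-graph EnvironmentStepper: resets the stepper, runs one `step` call
        (num_steps env steps), and checks dtypes and value ranges of the returned per-step data.
        """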
        environment_spec = dict(type="openai_gym",
                                gym_env="Pong-v0",
                                frameskip=4,
                                seed=10)
        dummy_env = Environment.from_spec(environment_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        agent_config = config_from_path("configs/dqn_agent_for_pong.json")
        actor_component = ActorComponent(
            agent_config["preprocessing_spec"],
            dict(network_spec=agent_config["network_spec"],
                 action_adapter_spec=agent_config["action_adapter_spec"],
                 action_space=action_space), agent_config["exploration_spec"])
        environment_stepper = EnvironmentStepper(
            environment_spec=environment_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float",
            add_reward=True,
            num_steps=self.time_steps)

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )

        # Step `self.time_steps` times through the Env and collect results.
        # 1st return value is the step-op (None), 2nd return value is the DataOpTuple of the stacked
        # per-step data: terminals, (raw) next-states and rewards (see the type checks below).
        # Reset the stepper.
        test.test("reset")
        time_start = time.monotonic()
        out = test.test("step")
        time_end = time.monotonic()
        print("Done running {} steps in env-stepper env in {}sec.".format(
            environment_stepper.num_steps, time_end - time_start))

        # Check types of outputs.
        self.assertTrue(out[0] is None)
        self.assertTrue(isinstance(
            out[1], DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        #self.assertTrue(out[1][0].dtype == np.float32)  # preprocessed states
        #self.assertTrue(out[1][0].min() >= 0.0)  # make sure we have pixels / 255
        #self.assertTrue(out[1][0].max() <= 1.0)
        #self.assertTrue(out[1][1].dtype == np.int32)  # actions
        #self.assertTrue(out[1][2].dtype == np.float32)  # rewards
        #self.assertTrue(out[1][3].dtype == np.float32)  # episode return
        self.assertTrue(out[1][0].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(
            out[1][1].dtype == np.uint8)  # next state (raw, not preprocessed)
        self.assertTrue(out[1][1].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[1][1].max() <= 255)
        self.assertTrue(out[1][2].dtype == np.float32)  # rewards
        self.assertTrue(out[1][2].min() >= -1.0)  # -1.0 to 1.0
        self.assertTrue(out[1][2].max() <= 1.0)

        # Check whether episode returns match single rewards (including resetting after each terminal signal).
        #episode_returns = 0.0
        #for i in range(environment_stepper.num_steps):
        #    episode_returns += out[2][i]
        #    self.assertAlmostEqual(episode_returns, out[1][3][i])
        #    # Terminal: Reset accumulated episode-return before next step.
        #    if out[1][4][i] is np.bool_(True):
        #        episode_returns = 0.0

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()