예제 #1
0
    def __init__(self,
                 preprocessor_spec,
                 policy_spec,
                 exploration_spec,
                 max_likelihood=None,
                 **kwargs):
        """
        Creates the preprocessor, policy and exploration sub-components from their specs and registers
        them with this component.

        Args:
            preprocessor_spec (Union[list,dict,PreprocessorSpec]):
                - A dict if the state from the Env will come in as a ContainerSpace (e.g. Dict). In this case,
                    each key in this dict specifies which value in the incoming dict should go through which
                    PreprocessorStack.
                - A list with layer specs.
                - A PreprocessorStack object.
            policy_spec (Union[dict,Policy]): A specification dict for a Policy object or a Policy object directly.
            exploration_spec (Union[dict,Exploration]): A specification dict for an Exploration object or an
                Exploration object directly.
            max_likelihood (Optional[bool]): See Policy's property `max_likelihood`.
                If not None, overwrites the equally named setting in the Policy object (defined by `policy_spec`).
                This constructor only stores the value in `self.max_likelihood`.
            **kwargs: Passed on to the parent constructor. A missing `scope` defaults to "actor-component".
        """
        # Pull `scope` out of kwargs first so that it is not passed to the parent twice.
        component_scope = kwargs.pop("scope", "actor-component")
        super(ActorComponent, self).__init__(scope=component_scope, **kwargs)

        # Build the sub-components from whatever spec form was passed in.
        self.preprocessor = PreprocessorStack.from_spec(preprocessor_spec)
        self.policy = Policy.from_spec(policy_spec)
        self.exploration = Exploration.from_spec(exploration_spec)

        self.max_likelihood = max_likelihood

        sub_components = (self.policy, self.exploration, self.preprocessor)
        self.add_components(*sub_components)
예제 #2
0
    def __init__(self,
                 preprocessor_spec,
                 policy_spec,
                 exploration_spec=None,
                 **kwargs):
        """
        Creates the preprocessor, policy and exploration sub-components plus a tuple merger/splitter pair
        and registers all of them with this component.

        Args:
            preprocessor_spec (Union[list,dict,PreprocessorSpec]):
                - A dict if the state from the Env will come in as a ContainerSpace (e.g. Dict). In this case,
                    each key in this dict specifies which value in the incoming dict should go through which
                    PreprocessorStack.
                - A list with layer specs.
                - A PreprocessorStack object.

            policy_spec (Union[dict,Policy]): A specification dict for a Policy object or a Policy object directly.

            exploration_spec (Union[dict,Exploration]): A specification dict for an Exploration object or an
                Exploration object directly.

            **kwargs: Passed on to the parent constructor. A missing `scope` defaults to "actor-component".
        """
        # Pull `scope` out of kwargs first so that it is not passed to the parent twice.
        component_scope = kwargs.pop("scope", "actor-component")
        super(ActorComponent, self).__init__(scope=component_scope, **kwargs)

        self.preprocessor = PreprocessorStack.from_spec(preprocessor_spec)
        self.policy = Policy.from_spec(policy_spec)
        # The splitter below is sized by the number of inputs of the policy's neural network.
        self.num_nn_inputs = self.policy.neural_network.num_inputs
        self.exploration = Exploration.from_spec(exploration_spec)

        self.tuple_merger = ContainerMerger(is_tuple=True, merge_tuples_into_one=True)
        self.tuple_splitter = ContainerSplitter(tuple_length=self.num_nn_inputs)

        self.add_components(
            self.policy,
            self.exploration,
            self.preprocessor,
            self.tuple_merger,
            self.tuple_splitter
        )
    def test_actor_component_with_lstm_network(self):
        """
        Runs an ActorComponent with an LSTM-based policy one time step at a time and checks its outputs.

        Every call feeds one batch of states (with a time rank of length 1), the same all-zero internal
        states and one time-percentage value. The test then verifies:
        - the shapes of the two internal-state outputs returned by the network,
        - the preprocessed states (raw states divided by 10),
        - that the stddev over all collected actions lies within (0.4, 0.6).
        """
        num_steps = 1000
        batch_size = 2

        # state space and internal state space
        state_space = FloatBox(shape=(2,), add_batch_rank=True, add_time_rank=True, time_major=False)
        internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)), add_batch_rank=True)
        time_percentages_space = FloatBox()
        # action_space.
        action_space = IntBox(2, add_batch_rank=True, add_time_rank=True)

        preprocessor = PreprocessorStack.from_spec(
            [dict(type="convert_type", to_dtype="float"), dict(type="divide", divisor=10)]
        )
        policy = Policy(network_spec=config_from_path("configs/test_lstm_nn.json"), action_space=action_space)
        exploration = Exploration(epsilon_spec=dict(decay_spec=dict(
            type="linear_decay", from_=1.0, to_=0.1)
        ))
        actor_component = ActorComponent(preprocessor, policy, exploration)
        test = ComponentTest(
            component=actor_component,
            input_spaces=dict(
                states=state_space,
                other_nn_inputs=Tuple(internal_states_space, add_batch_rank=True),
                time_percentage=time_percentages_space
            ),
            action_space=action_space
        )
        # Some state inputs, sampled as (num_steps, batch_size, 2); one slice per call below.
        np.random.seed(10)
        states = state_space.sample(size=(num_steps, batch_size))
        initial_internal_states = internal_states_space.zeros(size=batch_size)  # only batch
        time_percentages = time_percentages_space.sample(num_steps)

        # Run n times a single time-step to simulate acting and env interaction with an LSTM.
        # NOTE: The builtins `float`/`int` replace the `np.float`/`np.int` aliases, which were removed in
        # NumPy 1.24 (they were plain aliases of the builtins, so the resulting dtypes are unchanged).
        # Both buffers are uninitialized; every row is written in the loop below.
        preprocessed_states = np.empty(shape=(num_steps, batch_size, 2), dtype=float)
        actions = np.empty(shape=(num_steps, batch_size, 1), dtype=int)
        for i, time_percentage in enumerate(time_percentages):
            ret = test.test((
                "get_preprocessed_state_and_action",
                # expand time dim at 1st slot as we are time-major == False
                [np.expand_dims(states[i], 1), tuple([initial_internal_states]), time_percentage]
            ))
            preprocessed_states[i] = ret["preprocessed_state"][:, 0, :]  # take out time-rank again
            actions[i] = ret["action"]
            # Check c/h-state shape.
            self.assertEqual(ret["nn_outputs"][1][0].shape, (batch_size, 3))  # batch-size=2, LSTM units=3
            self.assertEqual(ret["nn_outputs"][1][1].shape, (batch_size, 3))

        # Check all preprocessed states (easy: just divided by 10).
        expected_preprocessed_state = states / 10
        recursive_assert_almost_equal(preprocessed_states, expected_preprocessed_state)

        # Check the exploration functionality over the actions: only the spread of the collected actions
        # is bounded here, their mean is not checked.
        stddev_actions = actions.std()
        self.assertGreater(stddev_actions, 0.4)
        self.assertLess(stddev_actions, 0.6)
예제 #4
0
    def test_exploration_with_continuous_action_space(self):
        """
        Disabled test for an Exploration component that uses gaussian noise on a continuous action space.

        The method returns immediately (see the TODO below), so it currently checks nothing. Everything
        after the `return` is unreachable; presumably it is kept as a starting point for a more portable
        version of this test.
        """
        # TODO not portable, redo with more general mean/stddev checks over a sample of distributed outputs.
        return
        # Continuous (float) action space of shape 2x2.
        action_space = FloatBox(shape=(2,2), add_batch_rank=True)

        distribution = Normal()
        action_adapter = ActionAdapter(action_space=action_space)

        # Space of the (flat) NN output that is fed into the action adapter.
        nn_output_space = FloatBox(shape=(13,), add_batch_rank=True)  # 13: Any flat nn-output should be ok.

        exploration = Exploration.from_spec(dict(noise_spec=dict(type="gaussian_noise", mean=10.0, stddev=2.0)))

        # The Component to test.
        exploration_pipeline = Component(scope="continuous-plus-noise")
        exploration_pipeline.add_components(action_adapter, distribution, exploration, scope="exploration-pipeline")

        # API method: NN output -> distribution parameters -> samples -> action from the Exploration.
        # NOTE(review): this unpacks a 3-tuple from the adapter, whereas the other tests in this file index
        # the adapter's return value by key (`out["probabilities"]`) - confirm against the current
        # ActionAdapter API before re-enabling this test.
        @rlgraph_api(component=exploration_pipeline)
        def get_action(self_, nn_output):
            _, parameters, _ = action_adapter.get_logits_probabilities_log_probs(nn_output)
            sample_stochastic = distribution.sample_stochastic(parameters)
            sample_deterministic = distribution.sample_deterministic(parameters)
            action = exploration.get_action(sample_stochastic, sample_deterministic)
            return action

        # API method: exposes the noise of the Exploration's noise component.
        @rlgraph_api(component=exploration_pipeline)
        def get_noise(self_):
            return exploration.noise_component.get_noise()

        test = ComponentTest(component=exploration_pipeline, input_spaces=dict(nn_output=nn_output_space),
                             action_space=action_space)

        # Collect outputs in `collected` list to compare moments.
        collected = list()
        for _ in range_(1000):
            test.test("get_noise", fn_test=lambda component_test, outs: collected.append(outs))

        # The collected noise should match the configured mean (10.0) and stddev (2.0) to one decimal place.
        self.assertAlmostEqual(10.0, np.mean(collected), places=1)
        self.assertAlmostEqual(2.0, np.std(collected), places=1)

        # Hard-coded expected actions for a seeded input batch of size 3 (presumably the non-portable part
        # that the TODO above refers to).
        np.random.seed(10)
        input_ = nn_output_space.sample(size=3)
        expected = np.array([[[13.163095, 8.46925],
                              [10.375976, 5.4675055]],
                             [[13.239931, 7.990649],
                              [10.03761, 10.465796]],
                             [[10.280741, 7.2384844],
                              [10.040194, 8.248206]]], dtype=np.float32)
        test.test(("get_action", input_), expected_outputs=expected, decimals=3)
예제 #5
0
    def test_simple_actor_component(self):
        """
        Checks an ActorComponent (preprocessor, policy, exploration without any exploration settings)
        against expected values that are recomputed with numpy from the component's own variables.

        Two API methods are exercised: `get_preprocessed_state_and_action` and
        `get_preprocessed_state_action_and_action_probs`.
        """
        hidden_kernel_key = "actor-component/policy/test-network/hidden-layer/dense/kernel"
        action_kernel_key = \
            "actor-component/policy/action-adapter-0/action-network/action-layer/dense/kernel"

        def expected_action_layer_output(preprocessed, params):
            # Expected NN-output, followed by the raw action layer output (two plain matmuls).
            nn_output = np.matmul(preprocessed, params[hidden_kernel_key])
            return np.matmul(nn_output, params[action_kernel_key])

        # state_space: 5 input nodes.
        state_space = FloatBox(shape=(5, ), add_batch_rank=True)
        # action_space.
        action_space = IntBox(10)

        preprocessor_layers = [
            dict(type="convert_type", to_dtype="float"),
            dict(type="multiply", factor=2)
        ]
        preprocessor = PreprocessorStack.from_spec(preprocessor_layers)
        policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
        exploration = Exploration()  # no exploration
        actor_component = ActorComponent(preprocessor, policy, exploration)
        test = ComponentTest(
            component=actor_component, input_spaces=dict(states=state_space), action_space=action_space
        )

        # --- 1) Preprocessed state and action only. ---
        params = test.read_variable_values(actor_component.variables)
        # Some state inputs (5 input nodes, batch size=2).
        states = state_space.sample(2)
        # The preprocessor multiplies by 2.
        preprocessed = states * 2
        logits = expected_action_layer_output(preprocessed, params)
        # Final actions (max-likelihood/greedy pick).
        greedy_actions = np.argmax(logits, axis=-1)
        test.test(
            ("get_preprocessed_state_and_action", states),
            expected_outputs=dict(preprocessed_state=preprocessed, action=greedy_actions)
        )

        # --- 2) Preprocessed state, action and action-probs via a different API-method. ---
        states = state_space.sample(5)
        params = test.read_variable_values(actor_component.variables)
        preprocessed = states * 2
        logits = expected_action_layer_output(preprocessed, params)
        # No reshape necessary (simple action space), softmax to get probs.
        action_probs = softmax(logits)
        # Final actions (max-likelihood/greedy pick).
        greedy_actions = np.argmax(logits, axis=-1)
        test.test(
            ("get_preprocessed_state_action_and_action_probs", states),
            expected_outputs=dict(
                preprocessed_state=preprocessed, action=greedy_actions, action_probs=action_probs
            )
        )
예제 #6
0
    def test_exploration_with_discrete_action_space(self):
        """
        Tests an epsilon-based Exploration component on a discrete (IntBox) action space.

        A small pipeline (ActionAdapter -> Categorical -> Exploration) is built and queried in two phases
        with a fixed batch of NN outputs:
        1) time steps inside the decay schedule (0..9999): mean and stddev of the actions are checked,
        2) time steps shifted by 10000, i.e. past the end of the linear decay (1.0 -> 0.0 over 10000
           steps): single action components must not vary over time.
        """
        num_time_steps = 30
        batch_size = 2

        nn_output_space = FloatBox(shape=(13, ), add_batch_rank=True)
        time_step_space = IntBox(10000)
        # 2x2 action-pick, each composite action with 5 categories.
        action_space = IntBox(5, shape=(2, 2), add_batch_rank=True)

        # Our distribution to go into the Exploration object.
        distribution = Categorical()
        action_adapter = ActionAdapter(action_space=action_space)

        exploration = Exploration.from_spec(
            dict(epsilon_spec=dict(decay_spec=dict(type="linear_decay",
                                                   from_=1.0,
                                                   to_=0.0,
                                                   start_timestep=0,
                                                   num_timesteps=10000))))
        # The Component to test.
        exploration_pipeline = Component(action_adapter,
                                         distribution,
                                         exploration,
                                         scope="exploration-pipeline")

        @rlgraph_api(component=exploration_pipeline)
        def get_action(self_, nn_output, time_step):
            out = action_adapter.get_logits_probabilities_log_probs(nn_output)
            sample = distribution.sample_deterministic(out["probabilities"])
            action = exploration.get_action(sample, time_step)
            return action

        test = ComponentTest(component=exploration_pipeline,
                             input_spaces=dict(nn_output=nn_output_space,
                                               time_step=int),
                             action_space=action_space)

        def collect_actions(nn_outputs, time_steps):
            # Collect one action batch (batch_size items, each an int box of shape (2, 2)) per time step.
            # NOTE: builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24 (it was a
            # plain alias of `int`, so the dtype is unchanged). The buffer is fully written below.
            collected = np.empty(shape=(num_time_steps, batch_size, 2, 2), dtype=int)
            for i, time_step in enumerate(time_steps):
                collected[i] = test.test(("get_action", [nn_outputs, time_step]),
                                         expected_outputs=None)
            return collected

        # With exploration: Check, whether actions are equally distributed.
        nn_outputs = nn_output_space.sample(batch_size)
        time_steps = time_step_space.sample(num_time_steps)
        actions = collect_actions(nn_outputs, time_steps)

        # Assert some distribution of the actions.
        self.assertAlmostEqual(actions.mean(), 2.0, places=0)
        self.assertAlmostEqual(actions.std(), 1.0, places=0)

        # Without exploration (epsilon is force-set to 0.0): Check, whether actions are always the same
        # (given same nn_output all the time).
        nn_outputs = nn_output_space.sample(batch_size)
        time_steps = time_step_space.sample(num_time_steps) + 10000
        actions = collect_actions(nn_outputs, time_steps)

        # Assert zero stddev (over time) of single action components, each given as
        # (batch item, action row, action column).
        for batch_item, row, col in ((0, 0, 0), (1, 1, 0), (0, 0, 1), (1, 1, 1)):
            self.assertAlmostEqual(actions[:, batch_item, row, col].std(), 0.0, places=1)
        # Spread over all batch items and action components together.
        self.assertAlmostEqual(actions.std(), 1.0, places=0)
예제 #7
0
    def test_exploration_with_discrete_container_action_space(self):
        """
        Tests an epsilon-based Exploration component on a Dict action space with three IntBox members.

        One ActionAdapter/Categorical pair per action component ("a", "b", "c") feeds a merged action
        dict into the Exploration. The pipeline is queried in two phases with a fixed batch of NN outputs:
        1) time steps inside the decay schedule (0..9999): mean and stddev per component are checked,
        2) time steps shifted by 10000, i.e. past the end of the linear decay (1.0 -> 0.0 over 10000
           steps): each component of each batch item must not vary over time.
        """
        nn_output_space = FloatBox(shape=(12, ), add_batch_rank=True)
        time_step_space = IntBox(10000)
        # Some container action space.
        action_space = Dict(dict(a=IntBox(3), b=IntBox(2), c=IntBox(4)),
                            add_batch_rank=True)

        # Our distribution to go into the Exploration object.
        distribution_a = Categorical(scope="d_a")
        distribution_b = Categorical(scope="d_b")
        distribution_c = Categorical(scope="d_c")
        action_adapter_a = ActionAdapter(action_space=action_space["a"],
                                         scope="aa_a")
        action_adapter_b = ActionAdapter(action_space=action_space["b"],
                                         scope="aa_b")
        action_adapter_c = ActionAdapter(action_space=action_space["c"],
                                         scope="aa_c")

        exploration = Exploration.from_spec(
            dict(epsilon_spec=dict(decay_spec=dict(type="linear_decay",
                                                   from_=1.0,
                                                   to_=0.0,
                                                   start_timestep=0,
                                                   num_timesteps=10000))))
        # The Component to test.
        exploration_pipeline = Component(action_adapter_a,
                                         action_adapter_b,
                                         action_adapter_c,
                                         distribution_a,
                                         distribution_b,
                                         distribution_c,
                                         exploration,
                                         scope="exploration-pipeline")

        @rlgraph_api(component=exploration_pipeline)
        def get_action(self_, nn_output, time_step):
            out_a = action_adapter_a.get_logits_probabilities_log_probs(
                nn_output)
            out_b = action_adapter_b.get_logits_probabilities_log_probs(
                nn_output)
            out_c = action_adapter_c.get_logits_probabilities_log_probs(
                nn_output)
            sample_a = distribution_a.sample_deterministic(
                out_a["probabilities"])
            sample_b = distribution_b.sample_deterministic(
                out_b["probabilities"])
            sample_c = distribution_c.sample_deterministic(
                out_c["probabilities"])
            sample = self_._graph_fn_merge_actions(sample_a, sample_b,
                                                   sample_c)
            action = exploration.get_action(sample, time_step)
            return action

        # Merges the three single samples into one action dict (used by `get_action` above).
        @graph_fn(component=exploration_pipeline)
        def _graph_fn_merge_actions(self, a, b, c):
            return DataOpDict(a=a, b=b, c=c)

        test = ComponentTest(component=exploration_pipeline,
                             input_spaces=dict(nn_output=nn_output_space,
                                               time_step=int),
                             action_space=action_space)

        batch_size = 2
        num_time_steps = 30
        action_keys = ("a", "b", "c")

        def collect_actions(nn_outputs, time_steps):
            # Collect one action batch per time step, stored separately per action component.
            # NOTE: builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24 (it was a
            # plain alias of `int`, so the dtype is unchanged). All buffers are fully written below.
            collected = dict(
                (key, np.empty(shape=(num_time_steps, batch_size), dtype=int)) for key in action_keys
            )
            for i, t in enumerate(time_steps):
                out = test.test(("get_action", [nn_outputs, t]),
                                expected_outputs=None)
                for key in action_keys:
                    collected[key][i] = out[key]
            return collected

        # With exploration: Check, whether actions are equally distributed.
        nn_outputs = nn_output_space.sample(batch_size)
        time_steps = time_step_space.sample(num_time_steps)
        actions = collect_actions(nn_outputs, time_steps)

        # Assert some distribution of the actions: (component, expected mean, expected stddev).
        expected_moments = (("a", 1.0, 1.0), ("b", 0.5, 0.5), ("c", 1.5, 1.0))
        for key, expected_mean, expected_stddev in expected_moments:
            self.assertAlmostEqual(actions[key].mean(), expected_mean, places=0)
            self.assertAlmostEqual(actions[key].std(), expected_stddev, places=0)

        # Without exploration (epsilon is force-set to 0.0): Check, whether actions are always the same
        # (given same nn_output all the time).
        nn_outputs = nn_output_space.sample(batch_size)
        time_steps = time_step_space.sample(num_time_steps) + 10000
        actions = collect_actions(nn_outputs, time_steps)

        # Assert zero stddev (over time) for every action component of every batch item.
        for key in action_keys:
            for batch_item in range(batch_size):
                self.assertAlmostEqual(actions[key][:, batch_item].std(), 0.0, places=1)
    def test_environment_stepper_on_deepmind_lab(self):
        """
        Steps an EnvironmentStepper through a Deepmind Lab level and checks types/shapes of its outputs.

        If the Deepmind Lab environment cannot be imported, a message is printed and the test returns
        without checking anything. Otherwise the stepper is reset, `step` is called 10 times (each call
        configured for 1000 env steps), the throughput is printed and the last step's output is checked.
        """
        # The import only probes whether Deepmind Lab is available; the name itself is not used below.
        try:
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv
        except ImportError:
            print("DeepmindLab not installed: Skipping this test case.")
            return

        env_spec = dict(type="deepmind_lab",
                        level_id="seekavoid_arena_01",
                        observations=["RGB_INTERLEAVED"],
                        frameskip=4)
        # This env instance is only used to obtain the state- and action spaces.
        dummy_env = Environment.from_spec(env_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        actor_component = ActorComponent(
            # Preprocessor spec (only divide and flatten the image).
            [{
                "type": "divide",
                "divisor": 255
            }, {
                "type": "reshape",
                "flatten": True
            }],
            # Policy spec.
            dict(network_spec="../configs/test_lstm_nn.json",
                 action_space=action_space),
            # Exploration spec.
            Exploration(epsilon_spec=dict(decay_spec=dict(type="linear_decay",
                                                          from_=1.0,
                                                          to_=0.1,
                                                          start_timestep=0,
                                                          num_timesteps=100))))
        environment_stepper = EnvironmentStepper(
            environment_spec=env_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float32",
            internal_states_space=self.internal_states_space_test_lstm,
            num_steps=1000,
            # Add both prev-action and -reward into the state sent through the network.
            #add_previous_action_to_state=True,
            #add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=FloatBox(shape=(9, ), add_batch_rank=True))

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )
        # From here on, always terminate the test object, even if a step or an assertion below fails.
        try:
            # Reset the stepper.
            test.test("reset")

            # Step n times through the Env and keep the result of the last call.
            # NOTE(review): The exact layout of `out` is not visible from this test; per the checks below,
            # out[0] is the step-op (None) and out[1] is a tuple of per-step items - confirm the indices
            # used in the assertions against EnvironmentStepper.
            time_start = time.monotonic()
            steps = 10
            out = None
            for _ in range(steps):
                out = test.test("step")
            time_total = time.monotonic() - time_start
            print(
                "Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec. ({} actions/sec)"
                .format(steps, environment_stepper.num_steps, time_total,
                        environment_stepper.num_steps * steps / time_total))

            # Check types of outputs.
            self.assertTrue(out[0] is None)
            self.assertTrue(isinstance(
                out[1], DataOpTuple))  # the step results as a tuple (see below)

            # Check types of single data.
            #self.assertTrue(out[0].dtype == np.float32)
            #self.assertTrue(out[0].min() >= 0.0)  # make sure we have pixels / 255
            #self.assertTrue(out[0].max() <= 1.0)
            #self.assertTrue(out[1].dtype == np.int32)  # actions
            #self.assertTrue(out[2].dtype == np.float32)  # rewards
            #self.assertTrue(out[0].dtype == np.float32)  # episode return
            self.assertTrue(out[1][0].dtype == np.bool_)  # next-state is terminal?
            self.assertTrue(
                out[1][1].dtype == np.uint8)  # next state (raw, not preprocessed)
            self.assertTrue(out[1][1].min() >= 0)  # make sure we have pixels
            self.assertTrue(out[1][1].max() <= 255)
            # action probs (test whether sum to one).
            #self.assertTrue(out[1][6].dtype == np.float32)
            #self.assertTrue(out[1][6].min() >= 0.0)
            #self.assertTrue(out[1][6].max() <= 1.0)
            #recursive_assert_almost_equal(out[1][6].sum(axis=-1, keepdims=False),
            #                              np.ones(shape=(environment_stepper.num_steps,)), decimals=4)
            # internal states (c- and h-state)
            self.assertTrue(out[3][0].dtype == np.float32)
            self.assertTrue(out[3][1].dtype == np.float32)
            self.assertTrue(out[3][0].shape == (environment_stepper.num_steps, 3))
            self.assertTrue(out[3][1].shape == (environment_stepper.num_steps, 3))

            # Check whether episode returns match single rewards (including terminal signals).
            #episode_returns = 0.0
            #for i in range(environment_stepper.num_steps):
            #    episode_returns += out[0][i]
            #    self.assertAlmostEqual(episode_returns, out[3][i])
            #    # Terminal: Reset for next step.
            #    if out[4][i] is np.bool_(True):
            #        episode_returns = 0.0
        finally:
            test.terminate()