Example #1
    def test_environment_stepper_on_deterministic_env_with_action_probs_lstm(self):
        internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)))
        preprocessor_spec = [dict(type="multiply", factor=0.1)]
        network_spec = config_from_path("configs/test_lstm_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
            exploration_spec
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env", steps_to_terminal=3),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            internal_states_space=internal_states_space,
            add_action_probs=True,
            action_probs_space=self.deterministic_action_probs_space,
            num_steps=4,
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        weights = test.read_variable_values(environment_stepper.actor_component.policy.variable_registry)
        policy_scope = "environment-stepper/actor-component/policy/"
        weights_lstm = weights[policy_scope+"test-lstm-network/lstm-layer/lstm-cell/kernel"]
        biases_lstm = weights[policy_scope+"test-lstm-network/lstm-layer/lstm-cell/bias"]
        weights_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/kernel"]
        biases_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/bias"]

        # Step 4 times through the Env and collect results.
        lstm_1 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm)
        lstm_2 = lstm_layer(np.array([[[0.1]]]), weights_lstm, biases_lstm, lstm_1[1])
        lstm_3 = lstm_layer(np.array([[[0.2]]]), weights_lstm, biases_lstm, lstm_2[1])
        lstm_4 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm, lstm_3[1])
        expected = (
            np.array([False, False, True, False]),
            np.array([[0.0], [1.0], [2.0], [0.0], [1.0]]),  # s' (raw)
            np.array([
                softmax(dense_layer(np.squeeze(lstm_1[0]), weights_action, biases_action)),
                softmax(dense_layer(np.squeeze(lstm_2[0]), weights_action, biases_action)),
                softmax(dense_layer(np.squeeze(lstm_3[0]), weights_action, biases_action)),
                softmax(dense_layer(np.squeeze(lstm_4[0]), weights_action, biases_action)),
            ]),  # action probs
            # internal states
            (
                np.squeeze(np.array([[[0.0, 0.0, 0.0]], lstm_1[1][0], lstm_2[1][0], lstm_3[1][0], lstm_4[1][0]])),
                np.squeeze(np.array([[[0.0, 0.0, 0.0]], lstm_1[1][1], lstm_2[1][1], lstm_3[1][1], lstm_4[1][1]]))
            )
        )
        test.test("step", expected_outputs=expected)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
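The test above relies on a numpy helper called `lstm_layer` to compute the expected LSTM outputs. Below is a minimal sketch of what such a helper might look like, assuming TF's BasicLSTMCell conventions (kernel of shape (input_dim + units, 4 * units), gate order input/candidate/forget/output, forget bias of 1.0, return value (outputs, (c, h))); this is an illustration, not RLgraph's actual test utility.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_layer_sketch(inputs, kernel, bias, initial_states=None, forget_bias=1.0):
    # Sketch only (assumed behavior). inputs: (batch, time, input_dim).
    # Returns (outputs, final (c, h) states).
    batch, time_steps, _ = inputs.shape
    units = kernel.shape[1] // 4
    c, h = initial_states if initial_states is not None else \
        (np.zeros((batch, units)), np.zeros((batch, units)))
    outputs = []
    for t in range(time_steps):
        gates = np.matmul(np.concatenate([inputs[:, t], h], axis=-1), kernel) + bias
        i, j, f, o = np.split(gates, 4, axis=-1)  # TF gate order: i, j, f, o
        c = sigmoid(f + forget_bias) * c + sigmoid(i) * np.tanh(j)
        h = sigmoid(o) * np.tanh(c)
        outputs.append(h)
    return np.stack(outputs, axis=1), (c, h)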
Example #2
    def test_simple_action_adapter(self):
        # Last NN layer.
        last_nn_layer_space = FloatBox(shape=(16, ), add_batch_rank=True)
        # Action Space.
        action_space = IntBox(2, shape=(3, 2))

        action_adapter = ActionAdapter(action_space=action_space,
                                       weights_spec=1.0,
                                       biases_spec=False,
                                       activation="relu")
        test = ComponentTest(component=action_adapter,
                             input_spaces=dict(nn_output=last_nn_layer_space),
                             action_space=action_space)
        action_adapter_params = test.read_variable_values(
            action_adapter.variables)

        # Batch of 2 samples.
        inputs = last_nn_layer_space.sample(2)

        expected_action_layer_output = np.matmul(
            inputs,
            action_adapter_params["action-adapter/action-layer/dense/kernel"])
        test.test(("get_action_layer_output", inputs),
                  expected_outputs=dict(output=expected_action_layer_output))

        expected_logits = np.reshape(expected_action_layer_output,
                                     newshape=(2, 3, 2, 2))
        expected_probabilities = softmax(expected_logits)
        expected_log_probs = np.log(expected_probabilities)
        test.test(("get_logits_probabilities_log_probs", inputs),
                  expected_outputs=dict(logits=expected_logits,
                                        probabilities=expected_probabilities,
                                        log_probs=expected_log_probs))
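These tests also lean on two small numpy helpers, `dense_layer` and `softmax`. The names come from the tests; the bodies below are assumptions about their behavior, sketched for readers who want to reproduce the expected values by hand.

import numpy as np

def dense_layer(inputs, kernel, biases=None):
    # Plain affine transform: inputs @ kernel (+ biases, if given). Sketch only.
    out = np.matmul(inputs, kernel)
    return out if biases is None else out + biases

def softmax(x, axis=-1):
    # Subtract the per-row max for numerical stability, then normalize. Sketch only.
    e = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e / np.sum(e, axis=axis, keepdims=True)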
Example #3
    def test_action_adapter_with_complex_lstm_output(self):
        # Last NN layer (LSTM with time rank).
        last_nn_layer_space = FloatBox(shape=(4,), add_batch_rank=True, add_time_rank=True, time_major=True)
        # Action Space.
        action_space = IntBox(2, shape=(3, 2))

        action_adapter = ActionAdapter(action_space=action_space, biases_spec=False)
        test = ComponentTest(
            component=action_adapter, input_spaces=dict(
                nn_output=last_nn_layer_space,
                inputs=[last_nn_layer_space]
            ), action_space=action_space
        )
        action_adapter_params = test.read_variable_values(action_adapter.variables)

        # Batch of 2 samples, 3 timesteps.
        inputs = last_nn_layer_space.sample(size=(3, 2))
        # Fold time rank before the action layer pass through.
        inputs_reshaped = np.reshape(inputs, newshape=(6, -1))
        # Action layer pass through and unfolding of time rank.
        expected_action_layer_output = np.matmul(
            inputs_reshaped, action_adapter_params["action-adapter/action-network/action-layer/dense/kernel"]
        ).reshape((3, 2, -1))
        # Logits (already well reshaped (same as action space)).
        expected_logits = np.reshape(expected_action_layer_output, newshape=(3, 2, 3, 2, 2))
        test.test(("apply", inputs), expected_outputs=dict(output=expected_logits))
        test.test(("get_logits", inputs), expected_outputs=expected_logits)

        # Softmax (probs).
        expected_probabilities = softmax(expected_logits)
        # Log probs.
        expected_log_probs = np.log(expected_probabilities)
        test.test(("get_logits_probabilities_log_probs", inputs), expected_outputs=dict(
            logits=expected_logits, probabilities=expected_probabilities, log_probs=expected_log_probs
        ), decimals=5)
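The fold/unfold that the test performs by hand can be seen in isolation below; shapes follow this test, and the random kernel is purely illustrative.

import numpy as np

x = np.random.rand(3, 2, 4)           # (time=3, batch=2, features=4), time-major
kernel = np.random.rand(4, 12)        # 12 nodes = action shape (3, 2) x 2 categories
folded = x.reshape(6, -1)             # fold: merge time and batch ranks
out = np.matmul(folded, kernel)       # dense pass on the folded tensor
logits = out.reshape(3, 2, 3, 2, 2)   # unfold: (time, batch) + action shape (3, 2, 2)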
Example #4
    def test_categorical_cross_entropy_loss_wo_time_rank(self):
        #time_steps = 3
        labels_space = IntBox(
            2, shape=(), add_batch_rank=True)  #, add_time_rank=time_steps)
        parameters_space = labels_space.as_one_hot_float_space()
        loss_per_item_space = FloatBox(shape=(), add_batch_rank=True)
        #sequence_length_space = IntBox(low=1, high=time_steps+1, shape=(), add_batch_rank=True)

        categorical_x_entropy_loss_function = CategoricalCrossEntropyLoss()

        test = ComponentTest(
            component=categorical_x_entropy_loss_function,
            input_spaces=dict(
                labels=labels_space,
                loss_per_item=loss_per_item_space,
                #sequence_length=sequence_length_space,
                parameters=parameters_space))

        batch_size = 4
        parameters = parameters_space.sample(batch_size)  #, time_steps)))
        probs = softmax(parameters)
        positive_probs = probs[:, 1]  # parameters[:, :, 1]
        labels = labels_space.sample(batch_size)  #, time_steps))

        # Calculate the binary cross-entropy manually here: -[y*log(p) + (1-y)*log(1-p)],
        # where p is the predicted probability for class 1:
        # if label (y) is 0: -log(1 - p)
        # if label (y) is 1: -log(p)
        cross_entropy = np.where(labels == 0, -np.log(1.0 - positive_probs),
                                 -np.log(positive_probs))

        #sequence_length = sequence_length_space.sample(batch_size)

        # This code here must be adapted to the exact time-rank reduction schema set within the loss function
        # in case there is a time-rank. For now, test w/o time rank.
        #ces = []
        #for batch_item, sl in enumerate(sequence_length):
        #    weight = 0.5
        #    ce_sum = 0.0
        #    for ce in cross_entropy[batch_item][:sl]:
        #        ce_sum += ce * weight
        #        weight += 0.5 / sequence_length[batch_item]
        #    ces.append(ce_sum / sl)

        expected_loss_per_item = cross_entropy  # np.asarray(ces)
        expected_loss = np.mean(expected_loss_per_item, axis=0, keepdims=False)

        test.test(
            ("loss_per_item", [parameters, labels]),  #, sequence_length]),
            expected_outputs=expected_loss_per_item,
            decimals=4)
        test.test(("loss_average", expected_loss_per_item),
                  expected_outputs=expected_loss,
                  decimals=4)
        # Both.
        test.test(
            ("loss", [parameters, labels]),  #, sequence_length]),
            expected_outputs=[expected_loss, expected_loss_per_item],
            decimals=4)
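A quick numeric sanity check of the cross-entropy formula used above, with hypothetical values (p is the predicted probability of class 1):

import numpy as np

p = np.array([0.8, 0.3])   # hypothetical predicted probs for class 1
y = np.array([1, 0])       # hypothetical true labels
ce = np.where(y == 0, -np.log(1.0 - p), -np.log(p))
# -> [-log(0.8), -log(0.7)] ~= [0.2231, 0.3567]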
Example #5
    def test_dueling_action_adapter(self):
        # Last NN layer.
        last_nn_layer_space = FloatBox(shape=(7, ), add_batch_rank=True)
        # Action Space.
        action_space = IntBox(4, shape=(2, ))

        action_adapter = DuelingActionAdapter(
            action_space=action_space,
            units_state_value_stream=5,
            units_advantage_stream=4,
            weights_spec_state_value_stream=1.0,
            weights_spec_advantage_stream=0.5,
            activation_advantage_stream="linear",
            scope="aa")
        test = ComponentTest(component=action_adapter,
                             input_spaces=dict(nn_output=last_nn_layer_space),
                             action_space=action_space)

        # Batch of 2 samples.
        batch_size = 2
        inputs = last_nn_layer_space.sample(size=batch_size)

        dueling_action_adapter_vars = test.read_variable_values(
            action_adapter.variables)

        # Expected action layer output are the advantage nodes.
        expected_raw_advantages = np.matmul(
            np.matmul(
                inputs, dueling_action_adapter_vars[
                    "aa/dense-layer-advantage-stream/dense/kernel"]),
            dueling_action_adapter_vars["aa/action-layer/dense/kernel"])
        expected_state_values = np.matmul(
            relu(
                np.matmul(
                    inputs, dueling_action_adapter_vars[
                        "aa/dense-layer-state-value-stream/dense/kernel"])),
            dueling_action_adapter_vars["aa/state-value-node/dense/kernel"])

        test.test(("get_action_layer_output", inputs),
                  expected_outputs=dict(state_value_node=expected_state_values,
                                        output=expected_raw_advantages),
                  decimals=5)

        expected_advantages = np.reshape(expected_raw_advantages,
                                         newshape=(batch_size, 2, 4))

        # Expected q-values/logits, probabilities (softmaxed q) and log(p).
        expanded_state_values = np.expand_dims(expected_state_values, axis=1)
        expected_q_values = expanded_state_values + expected_advantages - \
            np.mean(expected_advantages, axis=-1, keepdims=True)
        expected_probs = softmax(expected_q_values)

        test.test(("get_logits_probabilities_log_probs", inputs),
                  expected_outputs=dict(state_values=expected_state_values,
                                        logits=expected_q_values,
                                        probabilities=expected_probs,
                                        log_probs=np.log(expected_probs)),
                  decimals=3)
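The dueling aggregation this test verifies, Q(s, a) = V(s) + A(s, a) - mean_a A(s, a), isolated in numpy with made-up values:

import numpy as np

state_values = np.array([[0.5]])                 # (batch=1, 1)
advantages = np.array([[[1.0, 2.0, 3.0, 4.0]]])  # (batch=1, 1, actions=4)
q_values = np.expand_dims(state_values, axis=1) + advantages - \
    np.mean(advantages, axis=-1, keepdims=True)
# -> [[[-1.0, 0.0, 1.0, 2.0]]]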
Example #6
    def test_neg_log_likelihood_loss_function_w_container_space(self):
        parameters_space = Dict(
            {
                # Make sure stddev params are not too crazy (just like our adapters do clipping for the raw NN output).
                "a":
                Tuple(FloatBox(shape=(2, 3)), FloatBox(
                    0.5, 1.0, shape=(2, 3))),  # normal (0.0 to 1.0)
                "b":
                FloatBox(shape=(4, ), low=-1.0, high=1.0)  # 4-discrete
            },
            add_batch_rank=True)
        labels_space = Dict({
            "a": FloatBox(shape=(2, 3)),
            "b": IntBox(4)
        },
                            add_batch_rank=True)
        loss_per_item_space = FloatBox(add_batch_rank=True)

        loss_function = NegativeLogLikelihoodLoss(
            distribution_spec=get_default_distribution_from_space(
                labels_space))

        test = ComponentTest(component=loss_function,
                             input_spaces=dict(
                                 parameters=parameters_space,
                                 labels=labels_space,
                                 loss_per_item=loss_per_item_space))

        parameters = parameters_space.sample(2)
        # Softmax the discrete params.
        probs_b = softmax(parameters["b"])
        #probs_b = parameters["b"]
        labels = labels_space.sample(2)

        # Expected loss per item: sum over the container components of -log(likelihood).
        log_prob_per_item_a = np.sum(np.log(
            sts.norm.pdf(labels["a"], parameters["a"][0], parameters["a"][1])),
                                     axis=(-1, -2))
        log_prob_per_item_b = np.array([
            np.log(probs_b[0][labels["b"][0]]),
            np.log(probs_b[1][labels["b"][1]])
        ])

        expected_loss_per_item = -(log_prob_per_item_a + log_prob_per_item_b)
        expected_loss = np.mean(expected_loss_per_item, axis=0, keepdims=False)

        test.test(("loss_per_item", [parameters, labels]),
                  expected_outputs=expected_loss_per_item,
                  decimals=4)
        test.test(("loss_average", expected_loss_per_item),
                  expected_outputs=expected_loss,
                  decimals=4)
        # Both.
        test.test(("loss", [parameters, labels]),
                  expected_outputs=[expected_loss, expected_loss_per_item],
                  decimals=4)
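The per-item negative log-likelihood for the normal ("a") part above can be checked standalone with scipy; shapes mirror this test, the data is random, and `logpdf` is the numerically safer equivalent of `np.log(pdf(...))`.

import numpy as np
import scipy.stats as sts

mean = np.zeros((2, 2, 3))     # (batch=2, event shape (2, 3))
stddev = np.ones((2, 2, 3))
labels = np.random.randn(2, 2, 3)
nll_per_item = -np.sum(sts.norm.logpdf(labels, mean, stddev), axis=(-1, -2))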
Example #7
    def test_environment_stepper_on_deterministic_env_with_returning_action_probs(self):
        preprocessor_spec = [dict(type="divide", divisor=2)]
        network_spec = config_from_path("configs/test_simple_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
            exploration_spec
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env", steps_to_terminal=6),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            add_action_probs=True,
            action_probs_space=self.deterministic_action_probs_space,
            num_steps=3
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        weights = test.read_variable_values(environment_stepper.actor_component.policy.variable_registry)
        policy_scope = "environment-stepper/actor-component/policy/"
        weights_hid = weights[policy_scope+"test-network/hidden-layer/dense/kernel"]
        biases_hid = weights[policy_scope+"test-network/hidden-layer/dense/bias"]
        weights_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/kernel"]
        biases_action = weights[policy_scope+"action-adapter-0/action-network/action-layer/dense/bias"]

        # Step 3 times through the Env and collect results.
        expected = (
            # terminals (t_)
            np.array([False, False, False]),
            # s' (raw)
            np.array([[0.0], [1.0], [2.0], [3.0]]),
            # action probs
            np.array([
                softmax(dense_layer(dense_layer(np.array([0.0]), weights_hid, biases_hid), weights_action, biases_action)),
                softmax(dense_layer(dense_layer(np.array([0.5]), weights_hid, biases_hid), weights_action, biases_action)),
                softmax(dense_layer(dense_layer(np.array([1.0]), weights_hid, biases_hid), weights_action, biases_action))
            ])
        )
        test.test("step", expected_outputs=expected, decimals=3)

        # Step again; check whether the stitching of states etc. works across `step` calls.
        expected = (
            np.array([False, False, True]),
            np.array([[3.0], [4.0], [5.0], [0.0]]),  # s' (raw)
            np.array([
                softmax(dense_layer(dense_layer(np.array([1.5]), weights_hid, biases_hid), weights_action, biases_action)),
                softmax(dense_layer(dense_layer(np.array([2.0]), weights_hid, biases_hid), weights_action, biases_action)),
                softmax(dense_layer(dense_layer(np.array([2.5]), weights_hid, biases_hid), weights_action, biases_action))
            ])
        )
        test.test("step", expected_outputs=expected, decimals=3)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
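Why the expected network inputs above are 0.0, 0.5, 1.0 (and later 1.5, 2.0, 2.5): the "divide" preprocessor halves each raw state before the policy sees it. A one-line illustration:

import numpy as np

raw_states = np.array([[0.0], [1.0], [2.0]])
preprocessed = raw_states / 2.0   # -> [[0.0], [0.5], [1.0]]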
Example #8
    def test_v_trace_function_more_complex(self):
        v_trace_function = VTraceFunction()
        v_trace_function_reference = VTraceFunction(backend="python")

        action_space = IntBox(9,
                              add_batch_rank=True,
                              add_time_rank=True,
                              time_major=True)
        action_space_flat = FloatBox(shape=(9, ),
                                     add_batch_rank=True,
                                     add_time_rank=True,
                                     time_major=True)
        input_spaces = dict(logits_actions_pi=self.time_x_batch_x_9_space,
                            log_probs_actions_mu=self.time_x_batch_x_9_space,
                            actions=action_space,
                            actions_flat=action_space_flat,
                            discounts=self.time_x_batch_x_1_space,
                            rewards=self.time_x_batch_x_1_space,
                            values=self.time_x_batch_x_1_space,
                            bootstrapped_values=self.time_x_batch_x_1_space)

        test = ComponentTest(component=v_trace_function,
                             input_spaces=input_spaces)

        size = (100, 16)
        logits_actions_pi = self.time_x_batch_x_9_space.sample(size=size)
        logits_actions_mu = self.time_x_batch_x_9_space.sample(size=size)
        log_probs_actions_mu = np.log(softmax(logits_actions_mu))
        actions = action_space.sample(size=size)
        actions_flat = one_hot(actions, depth=action_space.num_categories)
        # Set some discounts to 0.0 (these will mark the end of episodes, where the value is 0.0).
        discounts = np.random.choice([0.0, 0.99],
                                     size=size + (1, ),
                                     p=[0.1, 0.9])
        rewards = self.time_x_batch_x_1_space.sample(size=size)
        values = self.time_x_batch_x_1_space.sample(size=size)
        bootstrapped_values = self.time_x_batch_x_1_space.sample(
            size=(1, size[1]))

        input_ = [
            logits_actions_pi, log_probs_actions_mu, actions, actions_flat,
            discounts, rewards, values, bootstrapped_values
        ]

        vs_expected, pg_advantages_expected = v_trace_function_reference._graph_fn_calc_v_trace_values(
            *input_)

        test.test(("calc_v_trace_values", input_),
                  expected_outputs=[vs_expected, pg_advantages_expected],
                  decimals=4)
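The test uses a `one_hot` helper to flatten the integer actions. The name is from the test; the numpy body below is an assumption, indexing into an identity matrix so a trailing depth axis is appended.

import numpy as np

def one_hot_sketch(indices, depth):
    # Sketch only (assumed behavior): works for index arrays of any shape.
    return np.eye(depth, dtype=np.float32)[indices]

# one_hot_sketch(np.array([2, 0]), depth=3) -> [[0., 0., 1.], [1., 0., 0.]]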
Example #9
    def test_categorical(self):
        # Create 5 categorical distributions of 3 categories each.
        param_space = FloatBox(shape=(5, 3),
                               low=-1.0,
                               high=2.0,
                               add_batch_rank=True)
        values_space = IntBox(3, shape=(5, ), add_batch_rank=True)

        # The Component to test.
        categorical = Categorical(switched_off_apis={"kl_divergence"})
        input_spaces = dict(
            parameters=param_space,
            values=values_space,
            deterministic=bool,
        )
        test = ComponentTest(component=categorical, input_spaces=input_spaces)

        # Batch of size=3 and deterministic (True).
        input_ = [input_spaces["parameters"].sample(3), True]
        expected = np.argmax(input_[0], axis=-1)
        # Sample n times, expect always max value (max likelihood for deterministic draw).
        for _ in range(10):
            test.test(("draw", input_), expected_outputs=expected)
            test.test(("sample_deterministic", input_[0]),
                      expected_outputs=expected)

        # Batch of size=3 and non-deterministic -> expect roughly the mean.
        input_ = [input_spaces["parameters"].sample(3), False]
        outs = []
        for _ in range(20):
            out = test.test(("draw", input_))
            outs.append(out)
            out = test.test(("sample_stochastic", input_[0]))
            outs.append(out)

        recursive_assert_almost_equal(np.mean(outs), 1.0, decimals=1)

        # Test log-likelihood outputs.
        input_ = param_space.sample(1)
        labels = values_space.sample(1)
        probs = softmax(input_)
        test.test(("log_prob", [input_, labels]),
                  expected_outputs=np.log(
                      np.array([[
                          probs[0][0][labels[0][0]], probs[0][1][labels[0][1]],
                          probs[0][2][labels[0][2]], probs[0][3][labels[0][3]],
                          probs[0][4][labels[0][4]]
                      ]])),
                  decimals=4)
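The log-prob assertion above picks each label's probability out of the softmaxed parameters and takes its log. The same computation, written compactly with `np.take_along_axis` (scipy's `softmax` stands in here for the test helper):

import numpy as np
from scipy.special import softmax

probs = softmax(np.random.rand(1, 5, 3), axis=-1)   # (batch=1, 5 dists, 3 categories)
labels = np.random.randint(0, 3, size=(1, 5))
log_probs = np.log(np.take_along_axis(probs, labels[..., None], axis=-1))[..., 0]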
Example #10
    def test_simple_action_adapter_with_batch_apply(self):
        # Last NN layer.
        previous_nn_layer_space = FloatBox(shape=(16, ),
                                           add_batch_rank=True,
                                           add_time_rank=True,
                                           time_major=True)
        logits_space = FloatBox(shape=(3, 2, 2), add_batch_rank=True)
        # Action Space.
        action_space = IntBox(2, shape=(3, 2))

        action_adapter = CategoricalDistributionAdapter(
            action_space=action_space,
            weights_spec=1.0,
            biases_spec=False,
            fold_time_rank=True,
            unfold_time_rank=True,
            activation="relu")
        test = ComponentTest(component=action_adapter,
                             input_spaces=dict(
                                 nn_input=previous_nn_layer_space,
                                 logits=logits_space),
                             action_space=action_space)
        action_adapter_params = test.read_variable_values(
            action_adapter.variable_registry)

        # Sample of shape (4, 5): 4 timesteps x 5 batch items (time-major).
        inputs = previous_nn_layer_space.sample(size=(4, 5))
        inputs_folded = np.reshape(inputs, newshape=(20, -1))

        expected_action_layer_output = np.matmul(
            inputs_folded, action_adapter_params[
                "action-adapter/action-network/action-layer/dense/kernel"])
        expected_logits = np.reshape(expected_action_layer_output,
                                     newshape=(4, 5, 3, 2, 2))

        test.test(("apply", inputs),
                  expected_outputs=dict(output=expected_logits),
                  decimals=4)
        test.test(("get_logits", inputs),
                  expected_outputs=expected_logits,
                  decimals=4)

        expected_parameters = softmax(expected_logits)
        expected_log_probs = np.log(expected_parameters)
        test.test(("get_logits_parameters_log_probs", inputs),
                  expected_outputs=dict(logits=expected_logits,
                                        parameters=expected_parameters,
                                        log_probs=expected_log_probs),
                  decimals=4)
Example #11
    def test_simple_action_adapter(self):
        # Last NN layer.
        previous_nn_layer_space = FloatBox(shape=(16, ), add_batch_rank=True)
        adapter_outputs_space = FloatBox(shape=(3, 2, 2), add_batch_rank=True)
        # Action Space.
        action_space = IntBox(2, shape=(3, 2))

        action_adapter = CategoricalDistributionAdapter(
            action_space=action_space,
            weights_spec=1.0,
            biases_spec=False,
            activation="relu")
        test = ComponentTest(component=action_adapter,
                             input_spaces=dict(
                                 inputs=previous_nn_layer_space,
                                 adapter_outputs=adapter_outputs_space,
                             ),
                             action_space=action_space)
        action_adapter_params = test.read_variable_values(
            action_adapter.variable_registry)

        # Batch of 2 samples.
        inputs = previous_nn_layer_space.sample(2)

        expected_action_layer_output = np.matmul(
            inputs, action_adapter_params[
                "action-adapter/action-network/action-layer/dense/kernel"])
        expected_logits = np.reshape(expected_action_layer_output,
                                     newshape=(2, 3, 2, 2))
        test.test(("call", inputs),
                  expected_outputs=expected_logits,
                  decimals=5)
        #test.test(("get_logits", inputs), expected_outputs=expected_logits, decimals=5)  # w/o the dict

        expected_probs = softmax(expected_logits)
        expected_log_probs = np.log(expected_probs)
        test.test(("get_parameters", inputs),
                  expected_outputs=dict(adapter_outputs=expected_logits,
                                        parameters=expected_logits,
                                        probabilities=expected_probs,
                                        log_probs=expected_log_probs),
                  decimals=5)
Example #12
    def test_simple_actor_component(self):
        # state_space (the NN is the simple single fc-layer network from configs/test_simple_nn.json).
        state_space = FloatBox(shape=(5, ), add_batch_rank=True)
        # action_space.
        action_space = IntBox(10)

        preprocessor = PreprocessorStack.from_spec([
            dict(type="convert_type", to_dtype="float"),
            dict(type="multiply", factor=2)
        ])
        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space)
        exploration = Exploration()  # no exploration
        actor_component = ActorComponent(preprocessor, policy, exploration)
        test = ComponentTest(component=actor_component,
                             input_spaces=dict(states=state_space),
                             action_space=action_space)
        # Get and check some actions.
        actor_component_params = test.read_variable_values(
            actor_component.variables)

        # Some state inputs (5 input nodes, batch size=2).
        states = state_space.sample(2)
        # Expected NN-output.
        expected_nn_output = np.matmul(
            states * 2, actor_component_params[
                "actor-component/policy/test-network/hidden-layer/dense/kernel"]
        )
        # Raw action layer output.
        expected_action_layer_output = np.matmul(
            expected_nn_output, actor_component_params[
                "actor-component/policy/action-adapter-0/action-network/action-layer/dense/kernel"]
        )
        # Final actions (max-likelihood/greedy pick).
        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        expected_preprocessed_state = states * 2
        test.test(("get_preprocessed_state_and_action", states),
                  expected_outputs=dict(
                      preprocessed_state=expected_preprocessed_state,
                      action=expected_actions))

        # Get actions and action-probs by calling a different API-method.
        states = state_space.sample(5)
        # Get and check some actions.
        actor_component_params = test.read_variable_values(
            actor_component.variables)
        # Expected NN-output.
        expected_nn_output = np.matmul(
            states * 2, actor_component_params[
                "actor-component/policy/test-network/hidden-layer/dense/kernel"]
        )
        # Raw action layer output.
        expected_action_layer_output = np.matmul(
            expected_nn_output, actor_component_params[
                "actor-component/policy/action-adapter-0/action-network/action-layer/dense/kernel"]
        )
        # No reshape necessary (simple action space), softmax to get probs.
        expected_action_probs = softmax(expected_action_layer_output)
        # Final actions (max-likelihood/greedy pick).
        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        expected_preprocessed_state = states * 2
        test.test(("get_preprocessed_state_action_and_action_probs", states),
                  expected_outputs=dict(
                      preprocessed_state=expected_preprocessed_state,
                      action=expected_actions,
                      action_probs=expected_action_probs))
Example #13
    def test_joint_cumulative_distribution(self):
        param_space = Dict(
            {
                "a":
                FloatBox(shape=(4, )),  # 4-discrete
                "b":
                Dict({
                    "ba":
                    Tuple([
                        FloatBox(shape=(3, )),
                        FloatBox(0.1, 1.0, shape=(3, ))
                    ]),  # 3-variate normal
                    "bb":
                    Tuple([FloatBox(shape=(2, )),
                           FloatBox(shape=(2, ))]),  # beta -1 to 1
                    "bc":
                    Tuple([
                        FloatBox(shape=(4, )),
                        FloatBox(0.1, 1.0, shape=(4, ))
                    ]),  # normal (dim=4)
                })
            },
            add_batch_rank=True)

        values_space = Dict(
            {
                "a":
                IntBox(4),
                "b":
                Dict({
                    "ba": FloatBox(shape=(3, )),
                    "bb": FloatBox(shape=(2, )),
                    "bc": FloatBox(shape=(4, ))
                })
            },
            add_batch_rank=True)

        input_spaces = dict(parameters=param_space,
                            values=values_space,
                            deterministic=bool)

        low, high = -1.0, 1.0
        joined_cumulative_distribution = JointCumulativeDistribution(
            distribution_specs={
                "/a": Categorical(),
                "/b/ba": MultivariateNormal(),
                "/b/bb": Beta(low=low, high=high),
                "/b/bc": Normal()
            },
            switched_off_apis={"kl_divergence"})
        test = ComponentTest(component=joined_cumulative_distribution,
                             input_spaces=input_spaces)

        # Batch of size=2 and deterministic (True).
        input_ = [param_space.sample(2), True]
        input_[0]["a"] = softmax(input_[0]["a"])
        expected_mean = {
            "a": np.argmax(input_[0]["a"], axis=-1),
            "b": {
                "ba":
                input_[0]["b"]["ba"][0],  # [0]=Mean
                # Mean for a Beta distribution: 1 / [1 + (beta/alpha)] * range + low
                "bb":
                (1.0 /
                 (1.0 + input_[0]["b"]["bb"][1] / input_[0]["b"]["bb"][0])) *
                (high - low) + low,
                "bc":
                input_[0]["b"]["bc"][0],
            }
        }
        # Sample n times, expect always mean value (deterministic draw).
        for _ in range(50):
            test.test(("draw", input_), expected_outputs=expected_mean)
            test.test(("sample_deterministic", tuple([input_[0]])),
                      expected_outputs=expected_mean)

        # Batch of size=1 and non-deterministic -> expect roughly the mean.
        input_ = [param_space.sample(1), False]
        input_[0]["a"] = softmax(input_[0]["a"])
        expected_mean = {
            "a": np.sum(input_[0]["a"] * np.array([0, 1, 2, 3])),
            "b": {
                "ba":
                input_[0]["b"]["ba"][0],  # [0]=Mean
                # Mean for a Beta distribution: 1 / [1 + (beta/alpha)] * range + low
                "bb":
                (1.0 /
                 (1.0 + input_[0]["b"]["bb"][1] / input_[0]["b"]["bb"][0])) *
                (high - low) + low,
                "bc":
                input_[0]["b"]["bc"][0],
            }
        }

        outs = []
        for _ in range(100):
            out = test.test(("draw", input_))
            outs.append(out)
            out = test.test(("sample_stochastic", tuple([input_[0]])))
            outs.append(out)

        recursive_assert_almost_equal(np.mean(np.stack(
            [o["a"][0] for o in outs], axis=0),
                                              axis=0),
                                      expected_mean["a"],
                                      atol=0.2)
        recursive_assert_almost_equal(np.mean(np.stack(
            [o["b"]["ba"][0] for o in outs], axis=0),
                                              axis=0),
                                      expected_mean["b"]["ba"][0],
                                      decimals=1)
        recursive_assert_almost_equal(np.mean(np.stack(
            [o["b"]["bb"][0] for o in outs], axis=0),
                                              axis=0),
                                      expected_mean["b"]["bb"][0],
                                      decimals=1)
        recursive_assert_almost_equal(np.mean(np.stack(
            [o["b"]["bc"][0] for o in outs], axis=0),
                                              axis=0),
                                      expected_mean["b"]["bc"][0],
                                      decimals=1)

        # Test log-likelihood outputs.
        params = param_space.sample(1)
        params["a"] = softmax(params["a"])
        # Make sure beta-values are within 0.0 and 1.0 for the numpy calculation (which doesn't have scaling).
        values = values_space.sample(1)
        log_prob_beta = np.log(
            beta.pdf(values["b"]["bb"], params["b"]["bb"][0],
                     params["b"]["bb"][1]))
        # Now do the scaling for b/bb (beta values).
        values["b"]["bb"] = values["b"]["bb"] * (high - low) + low
        expected_log_llh = np.log(params["a"][0][values["a"][0]]) + \
            np.sum(np.log(norm.pdf(values["b"]["ba"][0], params["b"]["ba"][0], params["b"]["ba"][1]))) + \
            np.sum(log_prob_beta) + \
            np.sum(np.log(norm.pdf(values["b"]["bc"][0], params["b"]["bc"][0], params["b"]["bc"][1])))

        test.test(("log_prob", [params, values]),
                  expected_outputs=expected_log_llh,
                  decimals=1)
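A numeric check of the Beta-mean formula quoted in the comments above, mean = 1 / (1 + beta/alpha) * (high - low) + low, which is just alpha / (alpha + beta) rescaled from [0, 1] to [low, high]; the alpha and beta values here are hypothetical:

alpha, beta_param, low, high = 2.0, 3.0, -1.0, 1.0
mean = 1.0 / (1.0 + beta_param / alpha) * (high - low) + low
# alpha / (alpha + beta) = 0.4; rescaled: 0.4 * 2.0 - 1.0 = -0.2 == mean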
Example #14
    def test_mixture(self):
        # Create a mixture distribution consisting of 3 bivariate normals.
        num_distributions = 3
        num_events_per_multivariate = 2  # 2=bivariate
        param_space = Dict(
            {
                "categorical":
                FloatBox(shape=(num_distributions, ), low=-1.5, high=2.3),
                "parameters0":
                Tuple(
                    FloatBox(shape=(num_events_per_multivariate, )),  # mean
                    FloatBox(shape=(num_events_per_multivariate, )),  # diag
                ),
                "parameters1":
                Tuple(
                    FloatBox(shape=(num_events_per_multivariate, )),  # mean
                    FloatBox(shape=(num_events_per_multivariate, )),  # diag
                ),
                "parameters2":
                Tuple(
                    FloatBox(shape=(num_events_per_multivariate, )),  # mean
                    FloatBox(shape=(num_events_per_multivariate, )),  # diag
                ),
            },
            add_batch_rank=True)
        values_space = FloatBox(shape=(num_events_per_multivariate, ),
                                add_batch_rank=True)
        input_spaces = dict(
            parameters=param_space,
            values=values_space,
            deterministic=bool,
        )

        # The Component to test.
        mixture = MixtureDistribution(
            # Try different spec types.
            MultivariateNormal(),
            "multi-variate-normal",
            "multivariate_normal",
            switched_off_apis={"entropy", "kl_divergence"})
        test = ComponentTest(component=mixture, input_spaces=input_spaces)

        # Batch of size=n and deterministic (True).
        input_ = [input_spaces["parameters"].sample(1), True]
        # Make probs for categorical.
        categorical_probs = softmax(input_[0]["categorical"])

        # Note: Usually, the deterministic draw should return the max-likelihood value, which for a
        # 3-component bivariate mixture would be the mean of the component picked by an argmax over
        # the categorical:
        # argmax = np.argmax(input_[0]["categorical"], axis=-1)
        # expected = np.array([input_[0]["parameters{}".format(idx)][0][i] for i, idx in enumerate(argmax)])

        # The mean value is a 2D vector (bivariate distribution).
        expected = categorical_probs[:, 0:1] * input_[0]["parameters0"][0] + \
            categorical_probs[:, 1:2] * input_[0]["parameters1"][0] + \
            categorical_probs[:, 2:3] * input_[0]["parameters2"][0]

        for _ in range(50):
            test.test(("draw", input_), expected_outputs=expected)
            test.test(("sample_deterministic", tuple([input_[0]])),
                      expected_outputs=expected)

        # Batch of size=1 and non-deterministic -> expect roughly the mean.
        input_ = [input_spaces["parameters"].sample(1), False]
        # Make probs for categorical.
        categorical_probs = softmax(input_[0]["categorical"])

        expected = categorical_probs[:, 0:1] * input_[0]["parameters0"][0] + \
            categorical_probs[:, 1:2] * input_[0]["parameters1"][0] + \
            categorical_probs[:, 2:3] * input_[0]["parameters2"][0]
        outs = []
        for _ in range(50):
            out = test.test(("draw", input_))
            outs.append(out)
            out = test.test(("sample_stochastic", tuple([input_[0]])))
            outs.append(out)

        recursive_assert_almost_equal(np.mean(np.array(outs), axis=0),
                                      expected,
                                      decimals=1)

        # Test log-likelihood outputs (against scipy).
        params = param_space.sample(1)
        # Make sure categorical params are softmaxed.
        category_probs = softmax(params["categorical"][0])
        values = values_space.sample(1)
        expected = \
            category_probs[0] * \
            np.sum(np.log(norm.pdf(values[0], params["parameters0"][0][0], params["parameters0"][1][0])), axis=-1) + \
            category_probs[1] * \
            np.sum(np.log(norm.pdf(values[0], params["parameters1"][0][0], params["parameters1"][1][0])), axis=-1) + \
            category_probs[2] * \
            np.sum(np.log(norm.pdf(values[0], params["parameters2"][0][0], params["parameters2"][1][0])), axis=-1)
        test.test(("log_prob", [params, values]),
                  expected_outputs=np.array([expected]),
                  decimals=1)
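Both non-deterministic checks above rely on the mixture mean being the probability-weighted sum of the component means, sum_k p_k * mean_k. In isolation, with made-up numbers:

import numpy as np

p = np.array([0.2, 0.5, 0.3])                            # softmaxed categorical
means = np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 2.0]])   # 3 bivariate means
mixture_mean = np.sum(p[:, None] * means, axis=0)        # -> [1.1, 1.1]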
Example #15
    def _graph_fn_calc_v_trace_values(self, logits_actions_pi, log_probs_actions_mu, actions, actions_flat,
                                      discounts, rewards,
                                      values, bootstrapped_values):
        """
        Returns the V-trace values calculated from log importance weights (see [1] for details).
        Calculation:
        vs = V(xs) + SUM[t=s to s+N-1]( gamma^(t-s) * ( PROD[i=s to t-1](c_i) ) * dt_V )
        with:
            dt_V = rho_t * (r_t + gamma * V(x_t+1) - V(x_t))
            rho_t and c_i being the clipped IS-weights

        Args:
            logits_actions_pi (SingleDataOp): The raw logits output of the pi-network (one logit per discrete action).
            log_probs_actions_mu (SingleDataOp): The log-probs of the mu-network (one log-prob per discrete action).
            actions (SingleDataOp): The (int encoded) actually taken actions.
            actions_flat (SingleDataOp): The one-hot converted actually taken actions.
            discounts (SingleDataOp): DataOp (time x batch x values) holding the discounts collected when stepping
                through the environment (for the timesteps s=t to s=t+N-1).
            rewards (SingleDataOp): DataOp (time x batch x values) holding the rewards collected when stepping
                through the environment (for the timesteps s=t to s=t+N-1).
            values (SingleDataOp): DataOp (time x batch x values) holding the value function estimates
                wrt. the learner's policy (pi) (for the timesteps s=t to s=t+N-1).
            bootstrapped_values (SingleDataOp): DataOp (time(1) x batch x values) holding the last (bootstrapped)
                value estimate to use as a value function estimate after n time steps (V(xs) for s=t+N).

        Returns:
            tuple:
                - v-trace values (vs) in time x batch dimensions used to train the value-function (baseline).
                - PG-advantage values in time x batch dimensions used for training via policy gradient with baseline.
        """
        # Simplified (not performance optimized!) numpy implementation of v-trace for testing purposes.
        if get_backend() == "python" or self.backend == "python":
            probs_actions_pi = softmax(logits_actions_pi, axis=-1)
            log_probs_actions_pi = np.log(probs_actions_pi)

            log_is_weights = log_probs_actions_pi - log_probs_actions_mu  # log(a/b) = log(a) - log(b)
            log_is_weights_actions_taken = np.sum(log_is_weights * actions_flat, axis=-1, keepdims=True)
            is_weights = np.exp(log_is_weights_actions_taken)

            # rho_t = min(rho_bar, is_weights) = [1.0, 1.0], [0.67032005, 1.0], [1.0, 0.36787944]
            if self.rho_bar is not None:
                rho_t = np.minimum(self.rho_bar, is_weights)
            else:
                rho_t = is_weights

            # Same for rho-PG (policy gradients).
            if self.rho_bar_pg is not None:
                rho_t_pg = np.minimum(self.rho_bar_pg, is_weights)
            else:
                rho_t_pg = is_weights

            # Calculate ci terms for all timesteps:
            # ci = min(c_bar, is_weights) = [1.0, 1.0], [0.67032005, 1.0], [1.0, 0.36787944]
            if self.c_bar is not None:
                c_i = np.minimum(self.c_bar, is_weights)
            else:
                c_i = is_weights

            # Values t+1 -> shift by one time step.
            values_t_plus_1 = np.concatenate((values[1:], bootstrapped_values), axis=0)
            deltas = rho_t * (rewards + discounts * values_t_plus_1 - values)

            # Reverse everything for recursive v_s calculation.
            discounts_reversed = discounts[::-1]
            c_i_reversed = c_i[::-1]
            deltas_reversed = deltas[::-1]

            vs_minus_v_xs = [np.zeros_like(np.squeeze(bootstrapped_values, axis=0))]

            # Do the recursive calculations.
            for d, c, delta in zip(discounts_reversed, c_i_reversed, deltas_reversed):
                vs_minus_v_xs.append(delta + d * c * vs_minus_v_xs[-1])

            # Convert into numpy array and revert back.
            vs_minus_v_xs = np.array(vs_minus_v_xs[::-1])[:-1]

            # Add V(x_s) to get v_s.
            vs = vs_minus_v_xs + values

            # Advantage for policy gradient.
            vs_t_plus_1 = np.concatenate([vs[1:], bootstrapped_values], axis=0)
            pg_advantages = (rho_t_pg * (rewards + discounts * vs_t_plus_1 - values))

            return vs, pg_advantages

        elif get_backend() == "tf":
            # Calculate the log IS-weight values via: logIS = log(pi(a|s)) - log(mu(a|s)).
            # Use the action_probs_pi values only of the actions actually taken.
            log_probs_actions_taken_pi = tf.expand_dims(-tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits_actions_pi, labels=actions
            ), axis=-1)
            log_probs_actions_taken_mu = tf.reduce_sum(
                input_tensor=log_probs_actions_mu * actions_flat, axis=-1, keepdims=True, name="log-probs-actions-taken-mu"
            )
            log_is_weights = log_probs_actions_taken_pi - log_probs_actions_taken_mu

            is_weights = tf.exp(x=log_is_weights, name="is-weights-from-logs")

            # Apply rho-bar (also for PG) and c-bar clipping to all IS-weights.
            if self.rho_bar is not None:
                rho_t = tf.minimum(x=self.rho_bar, y=is_weights, name="clip-rho-bar")
            else:
                rho_t = is_weights

            if self.rho_bar_pg is not None:
                rho_t_pg = tf.minimum(x=self.rho_bar_pg, y=is_weights, name="clip-rho-bar-pg")
            else:
                rho_t_pg = is_weights

            if self.c_bar is not None:
                c_i = tf.minimum(x=self.c_bar, y=is_weights, name="clip-c-bar")
            else:
                c_i = is_weights

            # This is the same vector as `values` except that it will be shifted by 1 timestep to the right and
            # include - as the last item - the bootstrapped V value at s=t+N.
            values_t_plus_1 = tf.concat(values=[values[1:], bootstrapped_values], axis=0, name="values-t-plus-1")
            # Calculate the temporal difference terms (delta-t-V in the paper) for each s=t to s=t+N-1.
            dt_vs = rho_t * (rewards + discounts * values_t_plus_1 - values)

            # V-trace values can be calculated recursively (starting from the end of a trajectory) via:
            #    vs = V(xs) + dsV + gamma * cs * (vs+1 - V(s+1))
            # => (vs - V(xs)) = dsV + gamma * cs * (vs+1 - V(s+1))
            # We will thus calculate all terms: [vs - V(xs)] for all timesteps first, then add V(xs) again to get the
            # v-traces.
            elements = (
                tf.reverse(tensor=discounts, axis=[0], name="revert-discounts"),
                tf.reverse(tensor=c_i, axis=[0], name="revert-c-i"),
                tf.reverse(tensor=dt_vs, axis=[0], name="revert-dt-vs")
            )

            def scan_func(vs_minus_v_xs_, elements_):
                gamma_t, c_t, dt_v = elements_
                return dt_v + gamma_t * c_t * vs_minus_v_xs_

            vs_minus_v_xs = tf.scan(
                fn=scan_func,
                elems=elements,
                initializer=tf.zeros_like(tensor=tf.squeeze(bootstrapped_values, axis=0)),
                parallel_iterations=1,
                back_prop=False,
                name="v-trace-scan"
            )
            # Reverse the results back to original order.
            vs_minus_v_xs = tf.reverse(tensor=vs_minus_v_xs, axis=[0], name="revert-vs-minus-v-xs")

            # Add V(xs) to get vs.
            vs = tf.add(x=vs_minus_v_xs, y=values)

            # Calculate the advantage values (for policy gradient loss term) according to:
            # A = Q - V with Q based on vs (v-trace) values: qs = rs + gamma * vs and V being the
            # approximate value function output.
            vs_t_plus_1 = tf.concat(values=[vs[1:], bootstrapped_values], axis=0)
            pg_advantages = rho_t_pg * (rewards + discounts * vs_t_plus_1 - values)

            # Return v-traces and policy gradient advantage values based on: A=r+gamma*v-trace(s+1) - V(s).
            # With `r+gamma*v-trace(s+1)` also called `qs` in the paper.
            return tf.stop_gradient(vs), tf.stop_gradient(pg_advantages)
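A tiny numeric walk-through of the reverse recursion both backends implement, (vs - V(xs))_t = dt_V + gamma_t * c_t * (vs - V(xs))_{t+1}, seeded with zeros at the trajectory end (all numbers below are made up):

import numpy as np

deltas = np.array([0.1, -0.2, 0.3])      # dt_V for t = 0, 1, 2
discounts = np.array([0.9, 0.9, 0.9])
c_i = np.array([1.0, 0.5, 1.0])
acc, out = 0.0, []
for d, c, delta in zip(discounts[::-1], c_i[::-1], deltas[::-1]):
    acc = delta + d * c * acc                # accumulate from the trajectory end
    out.append(acc)
vs_minus_v_xs = np.array(out[::-1])      # -> [0.0415, -0.065, 0.3]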