Example #1
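# Assumed module-level context for these snippets (not shown in the examples;
# the exact import paths are an assumption based on the ML-Agents TF1 test suite):
#   import pytest
#   from mlagents.tf_utils import tf
#   from mlagents.trainers.distributions import GaussianDistribution
#   VECTOR_ACTION_SPACE = [2]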
def test_tanh_distribution():
    with tf.Graph().as_default():
        logits = tf.Variable(initial_value=[[0, 0]],
                             trainable=True,
                             dtype=tf.float32)
        distribution = GaussianDistribution(logits,
                                            act_size=VECTOR_ACTION_SPACE,
                                            reparameterize=False,
                                            tanh_squash=True)
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            output = sess.run(distribution.sample)
            for _ in range(10):
                output = sess.run(
                    [distribution.sample, distribution.log_probs])
                for out in output:
                    assert out.shape[1] == VECTOR_ACTION_SPACE[0]
                # Assert action never exceeds [-1,1]
                action = output[0][0]
                for act in action:
                    assert act >= -1 and act <= 1
                output = sess.run([distribution.total_log_probs])
                assert output[0].shape[0] == 1
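The [-1, 1] assertion above holds because a tanh-squashed Gaussian pushes every raw sample through tanh and corrects the log-probability with a change-of-variables term. A minimal NumPy sketch of that general technique (illustrative names only, not the ML-Agents implementation):

import numpy as np

def tanh_squashed_sample(mu, log_std, rng=np.random.default_rng(0)):
    # Sample from N(mu, exp(log_std)), squash with tanh, correct the log-prob.
    std = np.exp(log_std)
    raw = rng.normal(mu, std)                       # unsquashed Gaussian sample
    squashed = np.tanh(raw)                         # always inside (-1, 1)
    log_prob = -0.5 * ((raw - mu) / std) ** 2 - log_std - 0.5 * np.log(2 * np.pi)
    log_prob -= np.log(1.0 - squashed ** 2 + 1e-6)  # tanh change-of-variables correction
    return squashed, log_prob

action, logp = tanh_squashed_sample(np.zeros(2), np.zeros(2))
assert np.all(np.abs(action) <= 1.0)                # same bound the test checks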
Example #2
def test_gaussian_distribution():
    with tf.Graph().as_default():
        logits = tf.Variable(initial_value=[[1, 1]],
                             trainable=True,
                             dtype=tf.float32)
        distribution = GaussianDistribution(
            logits,
            act_size=VECTOR_ACTION_SPACE,
            reparameterize=False,
            tanh_squash=False,
        )
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            output = sess.run(distribution.sample)
            for _ in range(10):
                output = sess.run(
                    [distribution.sample, distribution.log_probs])
                for out in output:
                    assert out.shape[1] == VECTOR_ACTION_SPACE[0]
                output = sess.run([distribution.total_log_probs])
                assert output[0].shape[0] == 1
            # Test entropy is correct
            log_std_tensor = tf.get_default_graph().get_tensor_by_name(
                "log_std/BiasAdd:0")
            feed_dict = {log_std_tensor: [[1.0, 1.0]]}
            entropy = sess.run([distribution.entropy], feed_dict=feed_dict)
            # Entropy with log_std of 1.0 should be 2.42
            assert pytest.approx(entropy[0], 0.01) == 2.42
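The 2.42 expected above is just the closed-form differential entropy of a one-dimensional Gaussian, 0.5 * ln(2*pi*e) + log_std, evaluated at log_std = 1.0. A quick standalone check (illustrative, not part of the test file):

import numpy as np

log_std = 1.0
entropy = 0.5 * np.log(2.0 * np.pi * np.e) + log_std  # 1.4189... + 1.0
print(round(float(entropy), 2))  # 2.42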
Example #3
    def _create_cc_actor(
        self,
        encoded: tf.Tensor,
        tanh_squash: bool = False,
        reparameterize: bool = False,
        condition_sigma_on_obs: bool = True,
    ) -> None:
        """
        Creates Continuous control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: Type of visual encoder to use if visual input.
        :param tanh_squash: Whether to use a tanh function, or a clipped output.
        :param reparameterize: Whether we are using the resampling trick to update the policy.
        """
        if self.use_recurrent:
            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
                encoded,
                self.memory_in,
                self.sequence_length_ph,
                name="lstm_policy")

            self.memory_out = tf.identity(memory_policy_out,
                                          name="recurrent_out")
        else:
            hidden_policy = encoded

        with tf.variable_scope("policy"):
            distribution = GaussianDistribution(
                hidden_policy,
                self.act_size,
                reparameterize=reparameterize,
                tanh_squash=tanh_squash,
                condition_sigma=condition_sigma_on_obs,
            )

        if tanh_squash:
            self.output_pre = distribution.sample
            self.output = tf.identity(self.output_pre, name="action")
        else:
            self.output_pre = distribution.sample
            # Clip and scale output to ensure actions are always within [-1, 1] range.
            output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
            self.output = tf.identity(output_post, name="action")

        self.selected_actions = tf.stop_gradient(self.output)

        self.all_log_probs = tf.identity(distribution.log_probs,
                                         name="action_probs")
        self.entropy = distribution.entropy

        # We keep the same tensor names, but use new nodes, to keep the code parallel with the discrete-control path.
        self.total_log_probs = distribution.total_log_probs
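For the non-squashed branch above, clipping to [-3, 3] and then dividing by 3 is what keeps the exported action inside [-1, 1]. A NumPy stand-in for that TF op, for illustration only:

import numpy as np

raw = np.array([-5.0, -3.0, 0.5, 4.2])
scaled = np.clip(raw, -3, 3) / 3        # mirrors tf.clip_by_value(output_pre, -3, 3) / 3
assert np.all(np.abs(scaled) <= 1.0)    # actions stay within [-1, 1]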