示例#1
0
    def create_gradient_magnitude(self) -> tf.Tensor:
        """
        Gradient penalty from https://arxiv.org/pdf/1704.00028.

        Each expert input is blended with the matching policy input using
        uniformly sampled per-element weights, the blend is passed through the
        discriminator encoder (weights reused), and the gradient of the
        estimate w.r.t. the encoder's returned input tensor is taken.
        :return: Mean over the batch of (gradient norm - 1) squared.
        """
        expert_inputs = (self.encoded_expert, self.expert_action, self.done_expert)
        policy_inputs = (
            self.encoded_policy,
            self.policy_model.selected_actions,
            self.done_policy,
        )

        def _blend(expert_t, policy_t):
            # Random convex combination of one expert/policy tensor pair.
            weight = tf.random_uniform(tf.shape(expert_t))
            return weight * expert_t + (1 - weight) * policy_t

        mixed_state, mixed_action, mixed_done = [
            _blend(expert_t, policy_t)
            for expert_t, policy_t in zip(expert_inputs, policy_inputs)
        ]

        estimate, _, encoder_input = self.create_encoder(
            mixed_state, mixed_action, mixed_done, reuse=True
        )
        input_grad = tf.gradients(estimate, [encoder_input])[0]

        # tf.norm has a NaN gradient at 0, so the norm is computed by hand
        # with EPSILON added under the square root.
        squared_sum = tf.reduce_sum(input_grad ** 2, axis=-1)
        grad_norm = tf.sqrt(squared_sum + EPSILON)
        return tf.reduce_mean(tf.pow(grad_norm - 1, 2))
    def _create_sampled_policy(
        self, encoded: "GaussianDistribution.MuSigmaTensors"
    ) -> tf.Tensor:
        """
        Sample from the Gaussian described by ``encoded``.
        :param encoded: Object exposing the ``mu`` and ``sigma`` tensors.
        :return: ``mu + sigma * noise``, where noise is standard normal with the shape of ``mu``.
        """
        unit_noise = tf.random_normal(tf.shape(encoded.mu))
        return encoded.mu + encoded.sigma * unit_noise
示例#3
0
    def create_normalizer_update(
        vector_input: tf.Tensor,
        steps: tf.Tensor,
        running_mean: tf.Tensor,
        running_variance: tf.Tensor,
    ) -> tf.Operation:
        """
        Builds the op that folds a batch of observations into the normalizer statistics.
        :param vector_input: Batch of vector observations; its first dimension is the batch size.
        :param steps: Tensor holding the number of steps normalized so far.
        :param running_mean: Tensor holding the current running mean.
        :param running_variance: Tensor holding the current running variance accumulator.
        :return: A grouped TF operation that assigns the new mean, variance accumulator and step count.
        """
        # Batch form of Welford's algorithm for running mean / variance. Discussion here:
        # https://stackoverflow.com/questions/56402955/whats-the-formula-for-welfords-algorithm-for-variance-std-with-batch-updates
        batch_size = tf.shape(vector_input)[0]
        updated_steps = tf.add(steps, batch_size)

        # Mean moves by the summed deviations from the old mean, each divided
        # by the updated step count.
        delta_old = tf.subtract(vector_input, running_mean)
        steps_as_float = tf.cast(updated_steps, dtype=tf.float32)
        mean_shift = tf.reduce_sum(delta_old / steps_as_float, axis=0)
        updated_mean = running_mean + mean_shift

        # Welford term: (x - new_mean) * (x - old_mean), summed over the batch.
        # The sum is added as-is; no division by the step count happens here.
        delta_new = tf.subtract(vector_input, updated_mean)
        variance_shift = tf.reduce_sum(delta_new * delta_old, axis=0)
        updated_variance = running_variance + variance_shift

        assign_ops = [
            tf.assign(running_mean, updated_mean),
            tf.assign(running_variance, updated_variance),
            tf.assign(steps, updated_steps),
        ]
        return tf.group(assign_ops)
示例#4
0
    def create_encoder(
        self, state_in: tf.Tensor, action_in: tf.Tensor, done_in: tf.Tensor, reuse: bool
    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
        """
        Builds the discriminator encoder inside the "GAIL_model" variable scope.
        :param state_in: The encoded observation input
        :param action_in: The action input (only used when self.use_actions is set)
        :param done_in: The done flags input (only used when self.use_actions is set)
        :param reuse: If true, the weights will be shared with the previous encoder created
        :return: Tuple of (sigmoid estimate, latent mean or None when VAIL is off,
            the tensor that was fed to the first hidden layer).
        """
        with tf.variable_scope("GAIL_model"):
            concat_input = (
                tf.concat([state_in, action_in, done_in], axis=1)
                if self.use_actions
                else state_in
            )

            # Two swish hidden layers of width h_size.
            features = concat_input
            for layer_name in ("gail_d_hidden_1", "gail_d_hidden_2"):
                features = tf.layers.dense(
                    features,
                    self.h_size,
                    activation=ModelUtils.swish,
                    name=layer_name,
                    reuse=reuse,
                )

            if not self.use_vail:
                z_mean = None
                estimate_input = features
            else:
                # Latent mean of the variational bottleneck.
                z_mean = tf.layers.dense(
                    features,
                    self.z_size,
                    reuse=reuse,
                    name="gail_z_mean",
                    kernel_initializer=ModelUtils.scaled_init(0.01),
                )
                self.noise = tf.random_normal(tf.shape(z_mean), dtype=tf.float32)

                # Latent sample; the noise term is scaled by z_sigma and use_noise.
                self.z = z_mean + self.z_sigma * self.noise * self.use_noise
                estimate_input = self.z

            estimate = tf.layers.dense(
                estimate_input,
                1,
                activation=tf.nn.sigmoid,
                name="gail_d_estimate",
                reuse=reuse,
            )
            return estimate, z_mean, concat_input
示例#5
0
    def create_normalizer_update(self, vector_input):
        """
        Builds the op that folds a batch of observations into this model's
        normalization statistics (normalization_steps, running_mean, running_variance).
        :param vector_input: Batch of vector observations; its first dimension is the batch size.
        :return: A grouped TF operation performing the three assignments.
        """
        # Batch form of Welford's algorithm for running mean / variance. Discussion here:
        # https://stackoverflow.com/questions/56402955/whats-the-formula-for-welfords-algorithm-for-variance-std-with-batch-updates
        batch_count = tf.shape(vector_input)[0]
        step_total = tf.add(self.normalization_steps, batch_count)

        # Deviation of every sample from the mean before this update.
        diff_from_old = tf.subtract(vector_input, self.running_mean)
        scaled_diff = diff_from_old / tf.cast(step_total, dtype=tf.float32)
        mean_after = self.running_mean + tf.reduce_sum(scaled_diff, axis=0)

        # Deviation from the updated mean; the product of the two deviations
        # is the Welford increment, summed over the batch.
        diff_from_new = tf.subtract(vector_input, mean_after)
        welford_increment = tf.reduce_sum(diff_from_new * diff_from_old, axis=0)
        variance_after = self.running_variance + welford_increment

        return tf.group(
            [
                tf.assign(self.running_mean, mean_after),
                tf.assign(self.running_variance, variance_after),
                tf.assign(self.normalization_steps, step_total),
            ]
        )
示例#6
0
    def create_cc_actor(self, hidden_policy, scope):
        """
        Creates Continuous control actor for SAC.
        :param hidden_policy: Output of feature extractor (i.e. the input for vector obs, output of CNN for visual obs).
        :param scope: TF scope; it is joined with "policy" and used as the variable scope
            for the encoder and the output layers created in this block.
        """
        action_dim = self.act_size[0]

        # Create action input (continuous)
        self.action_holder = tf.placeholder(
            shape=[None, action_dim], dtype=tf.float32, name="action_holder"
        )
        self.external_action_in = self.action_holder

        scope = self.join_scopes(scope, "policy")

        with tf.variable_scope(scope):
            hidden_policy = self.create_vector_observation_encoder(
                hidden_policy,
                self.h_size,
                self.activ_fn,
                self.num_layers,
                "encoder",
                False,
            )

        # The recurrent encoder is built outside of the policy variable scope.
        if self.use_recurrent:
            hidden_policy, self.policy_memory_out = self.create_recurrent_encoder(
                hidden_policy,
                self.policy_memory_in,
                self.sequence_length,
                name="lstm_policy",
            )

        with tf.variable_scope(scope):

            def _linear_head(head_name):
                # Linear layer of width action_dim with a small-scale initializer.
                return tf.layers.dense(
                    hidden_policy,
                    action_dim,
                    activation=None,
                    name=head_name,
                    kernel_initializer=LearningModel.scaled_init(0.01),
                )

            mu = _linear_head("mu")

            # Policy-dependent log std. The attribute is called log_sigma_sq,
            # but the code below exponentiates it directly and uses it as a
            # standard deviation.
            raw_log_std = _linear_head("log_std")
            self.log_sigma_sq = tf.clip_by_value(
                raw_log_std, LOG_STD_MIN, LOG_STD_MAX
            )
            std = tf.exp(self.log_sigma_sq)

            # Reparameterization trick: sample = mu + noise * std
            noise = tf.random_normal(tf.shape(mu))
            sampled = mu + noise * std

            # Per-dimension Gaussian log-density of the sample.
            normalized = (sampled - mu) / (tf.exp(self.log_sigma_sq) + EPSILON)
            gauss_terms = (
                normalized ** 2 + 2 * self.log_sigma_sq + np.log(2 * np.pi)
            )
            per_dim_log_prob = -0.5 * gauss_terms
            log_prob = tf.reduce_sum(per_dim_log_prob, axis=1, keepdims=True)

            entropy_per_dim = self.log_sigma_sq + 0.5 * np.log(
                2.0 * np.pi * np.e
            )
            self.entropy = tf.reduce_sum(entropy_per_dim, axis=-1)

            # Squashed mean, kept around in case a deterministic action is wanted.
            self.deterministic_output = tf.tanh(mu)

            # Note that this is just for symmetry with PPO.
            self.output_pre = tf.tanh(sampled)

            # Squash correction for the tanh change of variables.
            squash_term = tf.log(1 - self.output_pre ** 2 + EPSILON)
            log_prob = log_prob - tf.reduce_sum(
                squash_term, axis=1, keepdims=True
            )

            self.all_log_probs = log_prob
            self.selected_actions = tf.stop_gradient(self.output_pre)

            self.action_probs = log_prob

        # Extract output for Barracuda
        self.output = tf.identity(self.output_pre, name="action")

        # Get all policy vars
        self.policy_vars = self.get_vars(scope)
示例#7
0
    def _create_cc_actor(
        self,
        encoded: tf.Tensor,
        tanh_squash: bool = False,
        reparameterize: bool = False,
        condition_sigma_on_obs: bool = True,
    ) -> None:
        """
        Creates Continuous control actor-critic model.

        Nothing is returned; the resulting tensors are stored on the instance
        (memory_in / memory_out when recurrent, output_pre, output,
        selected_actions, all_log_probs, entropy, log_probs, action_holder).
        :param encoded: Encoded observation tensor; fed through the recurrent encoder
            when self.use_recurrent is set, otherwise used directly.
        :param tanh_squash: Whether to use a tanh function, or a clipped output.
        :param reparameterize: Whether we are using the resampling trick to update the policy.
            When False, the sample used in the log-probability is wrapped in stop_gradient.
        :param condition_sigma_on_obs: If True, log sigma is a dense layer on the policy
            features; otherwise it is a standalone variable initialized to zeros.
        """
        if self.use_recurrent:
            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
                encoded,
                self.memory_in,
                self.sequence_length_ph,
                name="lstm_policy")

            self.memory_out = tf.identity(memory_policy_out,
                                          name="recurrent_out")
        else:
            hidden_policy = encoded

        with tf.variable_scope("policy"):
            mu = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="mu",
                kernel_initializer=ModelUtils.scaled_init(0.01),
                reuse=tf.AUTO_REUSE,
            )

            # Policy-dependent log_sigma
            if condition_sigma_on_obs:
                log_sigma = tf.layers.dense(
                    hidden_policy,
                    self.act_size[0],
                    activation=None,
                    name="log_sigma",
                    kernel_initializer=ModelUtils.scaled_init(0.01),
                )
            else:
                # Observation-independent log_sigma: one free value per action dimension.
                log_sigma = tf.get_variable(
                    "log_sigma",
                    [self.act_size[0]],
                    dtype=tf.float32,
                    initializer=tf.zeros_initializer(),
                )
            log_sigma = tf.clip_by_value(log_sigma, self.log_std_min,
                                         self.log_std_max)

            sigma = tf.exp(log_sigma)

            epsilon = tf.random_normal(tf.shape(mu))

            sampled_policy = mu + sigma * epsilon

            # Stop gradient if we're not doing the resampling trick
            if not reparameterize:
                sampled_policy_probs = tf.stop_gradient(sampled_policy)
            else:
                sampled_policy_probs = sampled_policy

            # Compute probability of model output: per-dimension Gaussian
            # log-density, summed over the action dimensions.
            _gauss_pre = -0.5 * (
                ((sampled_policy_probs - mu) /
                 (sigma + EPSILON))**2 + 2 * log_sigma + np.log(2 * np.pi))
            all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True)

        if tanh_squash:
            self.output_pre = tf.tanh(sampled_policy)

            # Squash correction for the tanh change of variables.
            all_probs -= tf.reduce_sum(tf.log(1 - self.output_pre**2 +
                                              EPSILON),
                                       axis=1,
                                       keepdims=True)
            self.output = tf.identity(self.output_pre, name="action")
        else:
            self.output_pre = sampled_policy
            # Clip and scale output to ensure actions are always within [-1, 1] range.
            output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
            self.output = tf.identity(output_post, name="action")

        self.selected_actions = tf.stop_gradient(self.output)

        self.all_log_probs = tf.identity(all_probs, name="action_probs")

        # Scalar entropy term: mean over every element of log_sigma.
        single_dim_entropy = 0.5 * tf.reduce_mean(
            tf.log(2 * np.pi * np.e) + 2 * log_sigma)
        # Make entropy the right shape: one copy of the scalar per batch row.
        self.entropy = tf.ones_like(tf.reshape(mu[:, 0],
                                               [-1])) * single_dim_entropy

        # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
        self.log_probs = tf.reduce_sum((tf.identity(self.all_log_probs)),
                                       axis=1,
                                       keepdims=True)

        self.action_holder = tf.placeholder(shape=[None, self.act_size[0]],
                                            dtype=tf.float32,
                                            name="action_holder")