Example No. 1
    def create_recurrent_encoder(input_state,
                                 memory_in,
                                 sequence_length,
                                 name="lstm"):
        """
        Builds a recurrent encoder for either state or observations (LSTM).
        :param sequence_length: Length of sequence to unroll.
        :param input_state: The input tensor to the LSTM cell.
        :param memory_in: The input memory to the LSTM cell.
        :param name: The scope of the LSTM cell.
        """
        s_size = input_state.get_shape().as_list()[1]
        m_size = memory_in.get_shape().as_list()[1]
        lstm_input_state = tf.reshape(input_state,
                                      shape=[-1, sequence_length, s_size])
        memory_in = tf.reshape(memory_in[:, :], [-1, m_size])
        half_point = int(m_size / 2)
        with tf.variable_scope(name):
            rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(half_point)
            lstm_vector_in = tf.nn.rnn_cell.LSTMStateTuple(
                memory_in[:, :half_point], memory_in[:, half_point:])
            recurrent_output, lstm_state_out = tf.nn.dynamic_rnn(
                rnn_cell, lstm_input_state, initial_state=lstm_vector_in)

        recurrent_output = tf.reshape(recurrent_output, shape=[-1, half_point])
        return recurrent_output, tf.concat(
            [lstm_state_out.c, lstm_state_out.h], axis=1)
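A minimal usage sketch for the encoder above, assuming TensorFlow 1.x; the observation size (64), memory size (256), and sequence length (16) are placeholder values chosen for illustration, and the function is called here as a free function rather than through its enclosing class:

    import tensorflow as tf

    obs_in = tf.placeholder(tf.float32, shape=[None, 64], name="obs_in")
    memory_in = tf.placeholder(tf.float32, shape=[None, 256], name="memory_in")
    sequence_length = 16  # unroll length used during training

    # hidden has shape [batch * sequence_length, 128]; memory_out concatenates
    # the LSTM cell and hidden states into shape [batch, 256].
    hidden, memory_out = create_recurrent_encoder(
        obs_in, memory_in, sequence_length, name="lstm_policy")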
Example No. 2
 def _create_entropy(
         self, encoded: "GaussianDistribution.MuSigmaTensors") -> tf.Tensor:
     single_dim_entropy = 0.5 * tf.reduce_mean(
         tf.log(2 * np.pi * np.e) + 2 * encoded.log_sigma)
     # Make entropy the right shape
     return tf.ones_like(tf.reshape(encoded.mu[:, 0],
                                    [-1])) * single_dim_entropy
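A small NumPy check of the formula used above: the entropy of a Gaussian N(mu, sigma^2) is 0.5 * log(2*pi*e*sigma^2), which is exactly the 0.5 * (log(2*pi*e) + 2*log_sigma) term being averaged (the sigma value below is arbitrary):

    import numpy as np

    sigma = 0.5
    analytic = 0.5 * np.log(2 * np.pi * np.e * sigma ** 2)
    from_log_sigma = 0.5 * (np.log(2 * np.pi * np.e) + 2 * np.log(sigma))
    assert np.isclose(analytic, from_log_sigma)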
Example No. 3
 def create_network(self) -> None:
     """
     Helper for creating the intrinsic reward nodes
     """
     if self.use_vail:
         self.z_sigma = tf.get_variable(
             "gail_sigma_vail",
             self.z_size,
             dtype=tf.float32,
             initializer=tf.ones_initializer(),
         )
         self.z_sigma_sq = self.z_sigma * self.z_sigma
         self.z_log_sigma_sq = tf.log(self.z_sigma_sq + EPSILON)
         self.use_noise = tf.placeholder(
             shape=[1], dtype=tf.float32, name="gail_NoiseLevel"
         )
     self.expert_estimate, self.z_mean_expert, _ = self.create_encoder(
         self.encoded_expert, self.expert_action, self.done_expert, reuse=False
     )
     self.policy_estimate, self.z_mean_policy, _ = self.create_encoder(
         self.encoded_policy,
         self.policy.selected_actions,
         self.done_policy,
         reuse=True,
     )
     self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)
     self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)
     self.discriminator_score = tf.reshape(
         self.policy_estimate, [-1], name="gail_reward"
     )
     self.intrinsic_reward = -tf.log(1.0 - self.discriminator_score + EPSILON)
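A hedged NumPy illustration of the reward shaping on the last line: -log(1 - D + EPSILON) grows without bound as the discriminator score D approaches 1, i.e. as policy behaviour becomes indistinguishable from the expert's (the EPSILON value below is an assumption; the module defines its own constant):

    import numpy as np

    EPSILON = 1e-7  # assumed small constant for numerical stability
    for score in (0.1, 0.5, 0.9, 0.99):
        print(score, -np.log(1.0 - score + EPSILON))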
Example No. 4
    def __init__(
        self,
        brain,
        lr=1e-4,
        lr_schedule=LearningRateSchedule.LINEAR,
        h_size=128,
        epsilon=0.2,
        beta=1e-3,
        max_step=5e6,
        normalize=False,
        use_recurrent=False,
        num_layers=2,
        m_size=None,
        seed=0,
        stream_names=None,
        vis_encode_type=EncoderType.SIMPLE,
    ):
        """
        Takes a Unity environment and model-specific hyper-parameters and returns the
        appropriate PPO agent model for the environment.
        :param brain: brain parameters used to generate specific network graph.
        :param lr: Learning rate.
        :param lr_schedule: Learning rate decay schedule.
        :param h_size: Size of hidden layers
        :param epsilon: Value for policy-divergence threshold.
        :param beta: Strength of entropy regularization.
        :param max_step: Total number of training steps.
        :param normalize: Whether to normalize vector observation input.
        :param use_recurrent: Whether to use an LSTM layer in the network.
        :param num_layers: Number of hidden layers between the encoded input and the policy & value layers.
        :param m_size: Size of brain memory.
        :param seed: Seed to use for initialization of model.
        :param stream_names: List of names of value streams. Usually, a list of the Reward Signals being used.
        :param vis_encode_type: Type of visual encoder to use if visual input.
        :return: a sub-class of PPOAgent tailored to the environment.
        """
        LearningModel.__init__(self, m_size, normalize, use_recurrent, brain,
                               seed, stream_names)

        self.optimizer: Optional[tf.train.AdamOptimizer] = None
        self.grads = None
        self.update_batch: Optional[tf.Operation] = None

        if num_layers < 1:
            num_layers = 1
        if brain.vector_action_space_type == "continuous":
            self.create_cc_actor_critic(h_size, num_layers, vis_encode_type)
            self.entropy = tf.ones_like(tf.reshape(self.value,
                                                   [-1])) * self.entropy
        else:
            self.create_dc_actor_critic(h_size, num_layers, vis_encode_type)
        self.learning_rate = self.create_learning_rate(lr_schedule, lr,
                                                       self.global_step,
                                                       max_step)
        self.create_losses(
            self.log_probs,
            self.old_log_probs,
            self.value_heads,
            self.entropy,
            beta,
            epsilon,
            lr,
            max_step,
        )
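To illustrate lr_schedule=LearningRateSchedule.LINEAR, the sketch below builds a linear decay over max_step steps the way it is commonly done in TensorFlow 1.x; whether create_learning_rate uses exactly this call is an assumption:

    import tensorflow as tf

    global_step = tf.Variable(0, trainable=False, dtype=tf.int32)
    learning_rate = tf.train.polynomial_decay(
        learning_rate=1e-4,    # lr
        global_step=global_step,
        decay_steps=int(5e6),  # max_step
        end_learning_rate=1e-10,
        power=1.0,             # power=1.0 gives a linear schedule
    )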
Example No. 5
    def _create_cc_actor(
        self,
        encoded: tf.Tensor,
        tanh_squash: bool = False,
        reparameterize: bool = False,
        condition_sigma_on_obs: bool = True,
    ) -> None:
        """
        Creates Continuous control actor-critic model.
        :param h_size: Size of hidden linear layers.
        :param num_layers: Number of hidden linear layers.
        :param vis_encode_type: Type of visual encoder to use if visual input.
        :param tanh_squash: Whether to use a tanh function, or a clipped output.
        :param reparameterize: Whether we are using the resampling trick to update the policy.
        """
        if self.use_recurrent:
            self.memory_in = tf.placeholder(shape=[None, self.m_size],
                                            dtype=tf.float32,
                                            name="recurrent_in")
            hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
                encoded,
                self.memory_in,
                self.sequence_length_ph,
                name="lstm_policy")

            self.memory_out = tf.identity(memory_policy_out,
                                          name="recurrent_out")
        else:
            hidden_policy = encoded

        with tf.variable_scope("policy"):
            mu = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="mu",
                kernel_initializer=ModelUtils.scaled_init(0.01),
                reuse=tf.AUTO_REUSE,
            )

            # Policy-dependent log_sigma
            if condition_sigma_on_obs:
                log_sigma = tf.layers.dense(
                    hidden_policy,
                    self.act_size[0],
                    activation=None,
                    name="log_sigma",
                    kernel_initializer=ModelUtils.scaled_init(0.01),
                )
            else:
                log_sigma = tf.get_variable(
                    "log_sigma",
                    [self.act_size[0]],
                    dtype=tf.float32,
                    initializer=tf.zeros_initializer(),
                )
            log_sigma = tf.clip_by_value(log_sigma, self.log_std_min,
                                         self.log_std_max)

            sigma = tf.exp(log_sigma)

            epsilon = tf.random_normal(tf.shape(mu))

            sampled_policy = mu + sigma * epsilon

            # Stop gradient if we're not doing the resampling trick
            if not reparameterize:
                sampled_policy_probs = tf.stop_gradient(sampled_policy)
            else:
                sampled_policy_probs = sampled_policy

            # Compute probability of model output.
            _gauss_pre = -0.5 * (
                ((sampled_policy_probs - mu) /
                 (sigma + EPSILON))**2 + 2 * log_sigma + np.log(2 * np.pi))
            all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True)

        if tanh_squash:
            self.output_pre = tf.tanh(sampled_policy)

            # Squash correction
            all_probs -= tf.reduce_sum(tf.log(1 - self.output_pre**2 +
                                              EPSILON),
                                       axis=1,
                                       keepdims=True)
            self.output = tf.identity(self.output_pre, name="action")
        else:
            self.output_pre = sampled_policy
            # Clip and scale output to ensure actions are always within [-1, 1] range.
            output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
            self.output = tf.identity(output_post, name="action")

        self.selected_actions = tf.stop_gradient(self.output)

        self.all_log_probs = tf.identity(all_probs, name="action_probs")

        single_dim_entropy = 0.5 * tf.reduce_mean(
            tf.log(2 * np.pi * np.e) + 2 * log_sigma)
        # Make entropy the right shape
        self.entropy = tf.ones_like(tf.reshape(mu[:, 0],
                                               [-1])) * single_dim_entropy

        # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
        self.log_probs = tf.reduce_sum((tf.identity(self.all_log_probs)),
                                       axis=1,
                                       keepdims=True)

        self.action_holder = tf.placeholder(shape=[None, self.act_size[0]],
                                            dtype=tf.float32,
                                            name="action_holder")