def create_recurrent_encoder(input_state, memory_in, sequence_length, name="lstm"): """ Builds a recurrent encoder for either state or observations (LSTM). :param sequence_length: Length of sequence to unroll. :param input_state: The input tensor to the LSTM cell. :param memory_in: The input memory to the LSTM cell. :param name: The scope of the LSTM cell. """ s_size = input_state.get_shape().as_list()[1] m_size = memory_in.get_shape().as_list()[1] lstm_input_state = tf.reshape(input_state, shape=[-1, sequence_length, s_size]) memory_in = tf.reshape(memory_in[:, :], [-1, m_size]) half_point = int(m_size / 2) with tf.variable_scope(name): rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(half_point) lstm_vector_in = tf.nn.rnn_cell.LSTMStateTuple( memory_in[:, :half_point], memory_in[:, half_point:]) recurrent_output, lstm_state_out = tf.nn.dynamic_rnn( rnn_cell, lstm_input_state, initial_state=lstm_vector_in) recurrent_output = tf.reshape(recurrent_output, shape=[-1, half_point]) return recurrent_output, tf.concat( [lstm_state_out.c, lstm_state_out.h], axis=1)
def _create_entropy( self, encoded: "GaussianDistribution.MuSigmaTensors") -> tf.Tensor: single_dim_entropy = 0.5 * tf.reduce_mean( tf.log(2 * np.pi * np.e) + 2 * encoded.log_sigma) # Make entropy the right shape return tf.ones_like(tf.reshape(encoded.mu[:, 0], [-1])) * single_dim_entropy
def create_network(self) -> None: """ Helper for creating the intrinsic reward nodes """ if self.use_vail: self.z_sigma = tf.get_variable( "gail_sigma_vail", self.z_size, dtype=tf.float32, initializer=tf.ones_initializer(), ) self.z_sigma_sq = self.z_sigma * self.z_sigma self.z_log_sigma_sq = tf.log(self.z_sigma_sq + EPSILON) self.use_noise = tf.placeholder( shape=[1], dtype=tf.float32, name="gail_NoiseLevel" ) self.expert_estimate, self.z_mean_expert, _ = self.create_encoder( self.encoded_expert, self.expert_action, self.done_expert, reuse=False ) self.policy_estimate, self.z_mean_policy, _ = self.create_encoder( self.encoded_policy, self.policy.selected_actions, self.done_policy, reuse=True, ) self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate) self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate) self.discriminator_score = tf.reshape( self.policy_estimate, [-1], name="gail_reward" ) self.intrinsic_reward = -tf.log(1.0 - self.discriminator_score + EPSILON)
def __init__( self, brain, lr=1e-4, lr_schedule=LearningRateSchedule.LINEAR, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6, normalize=False, use_recurrent=False, num_layers=2, m_size=None, seed=0, stream_names=None, vis_encode_type=EncoderType.SIMPLE, ): """ Takes a Unity environment and model-specific hyper-parameters and returns the appropriate PPO agent model for the environment. :param brain: brain parameters used to generate specific network graph. :param lr: Learning rate. :param lr_schedule: Learning rate decay schedule. :param h_size: Size of hidden layers :param epsilon: Value for policy-divergence threshold. :param beta: Strength of entropy regularization. :param max_step: Total number of training steps. :param normalize: Whether to normalize vector observation input. :param use_recurrent: Whether to use an LSTM layer in the network. :param num_layers Number of hidden layers between encoded input and policy & value layers :param m_size: Size of brain memory. :param seed: Seed to use for initialization of model. :param stream_names: List of names of value streams. Usually, a list of the Reward Signals being used. :return: a sub-class of PPOAgent tailored to the environment. """ LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed, stream_names) self.optimizer: Optional[tf.train.AdamOptimizer] = None self.grads = None self.update_batch: Optional[tf.Operation] = None if num_layers < 1: num_layers = 1 if brain.vector_action_space_type == "continuous": self.create_cc_actor_critic(h_size, num_layers, vis_encode_type) self.entropy = tf.ones_like(tf.reshape(self.value, [-1])) * self.entropy else: self.create_dc_actor_critic(h_size, num_layers, vis_encode_type) self.learning_rate = self.create_learning_rate(lr_schedule, lr, self.global_step, max_step) self.create_losses( self.log_probs, self.old_log_probs, self.value_heads, self.entropy, beta, epsilon, lr, max_step, )
def _create_cc_actor( self, encoded: tf.Tensor, tanh_squash: bool = False, reparameterize: bool = False, condition_sigma_on_obs: bool = True, ) -> None: """ Creates Continuous control actor-critic model. :param h_size: Size of hidden linear layers. :param num_layers: Number of hidden linear layers. :param vis_encode_type: Type of visual encoder to use if visual input. :param tanh_squash: Whether to use a tanh function, or a clipped output. :param reparameterize: Whether we are using the resampling trick to update the policy. """ if self.use_recurrent: self.memory_in = tf.placeholder(shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in") hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder( encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy") self.memory_out = tf.identity(memory_policy_out, name="recurrent_out") else: hidden_policy = encoded with tf.variable_scope("policy"): mu = tf.layers.dense( hidden_policy, self.act_size[0], activation=None, name="mu", kernel_initializer=ModelUtils.scaled_init(0.01), reuse=tf.AUTO_REUSE, ) # Policy-dependent log_sigma if condition_sigma_on_obs: log_sigma = tf.layers.dense( hidden_policy, self.act_size[0], activation=None, name="log_sigma", kernel_initializer=ModelUtils.scaled_init(0.01), ) else: log_sigma = tf.get_variable( "log_sigma", [self.act_size[0]], dtype=tf.float32, initializer=tf.zeros_initializer(), ) log_sigma = tf.clip_by_value(log_sigma, self.log_std_min, self.log_std_max) sigma = tf.exp(log_sigma) epsilon = tf.random_normal(tf.shape(mu)) sampled_policy = mu + sigma * epsilon # Stop gradient if we're not doing the resampling trick if not reparameterize: sampled_policy_probs = tf.stop_gradient(sampled_policy) else: sampled_policy_probs = sampled_policy # Compute probability of model output. _gauss_pre = -0.5 * ( ((sampled_policy_probs - mu) / (sigma + EPSILON))**2 + 2 * log_sigma + np.log(2 * np.pi)) all_probs = _gauss_pre all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True) if tanh_squash: self.output_pre = tf.tanh(sampled_policy) # Squash correction all_probs -= tf.reduce_sum(tf.log(1 - self.output_pre**2 + EPSILON), axis=1, keepdims=True) self.output = tf.identity(self.output_pre, name="action") else: self.output_pre = sampled_policy # Clip and scale output to ensure actions are always within [-1, 1] range. output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3 self.output = tf.identity(output_post, name="action") self.selected_actions = tf.stop_gradient(self.output) self.all_log_probs = tf.identity(all_probs, name="action_probs") single_dim_entropy = 0.5 * tf.reduce_mean( tf.log(2 * np.pi * np.e) + 2 * log_sigma) # Make entropy the right shape self.entropy = tf.ones_like(tf.reshape(mu[:, 0], [-1])) * single_dim_entropy # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control. self.log_probs = tf.reduce_sum((tf.identity(self.all_log_probs)), axis=1, keepdims=True) self.action_holder = tf.placeholder(shape=[None, self.act_size[0]], dtype=tf.float32, name="action_holder")