def create_sac_value_head(self, stream_names, hidden_input, num_layers, h_size, scope):
    """
    Builds one value-estimator head per reward signal on top of the critic encoder.

    Delegates to create_value_heads, which populates the per-stream value nodes
    and the node for the mean of all heads.
    :param stream_names: The list of reward signal names.
    :param hidden_input: The last layer of the Critic. The heads consist of one
        dense hidden encoder stacked on top of this input.
    :param num_layers: Number of hidden layers for the value network.
    :param h_size: Size of hidden layers for the value network.
    :param scope: TF scope for the value network.
    """
    with tf.variable_scope(scope):
        encoded = ModelUtils.create_vector_observation_encoder(
            hidden_input, h_size, self.activ_fn, num_layers, "encoder", False
        )
        if self.use_recurrent:
            # Recurrent critics thread their memory through an LSTM cell.
            encoded, self.value_memory_out = ModelUtils.create_recurrent_encoder(
                encoded,
                self.value_memory_in,
                self.sequence_length_ph,
                name="lstm_value",
            )
        self.create_value_heads(stream_names, encoded)
def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]:
    """
    Creates state encoders for current and future observations.
    Used for implementation of Curiosity-driven Exploration by Self-supervised Prediction
    See https://arxiv.org/abs/1705.05363 for more details.
    :return: current and future state encoder tensors.
    """
    current_encodings = []
    next_encodings = []

    # Input placeholders for the next-step (t+1) observations.
    self.next_vector_in, self.next_visual_in = ModelUtils.create_input_placeholders(
        self.policy.behavior_spec.observation_shapes, name_prefix="curiosity_next_"
    )

    if self.next_visual_in:
        # Siamese visual encoders: the second call reuses (reuse=True) the
        # weights of the first, so current and next observations share one encoder.
        current_visual = []
        next_visual = []
        for idx, (vis_in, next_vis_in) in enumerate(
            zip(self.policy.visual_in, self.next_visual_in)
        ):
            current_visual.append(
                ModelUtils.create_visual_observation_encoder(
                    vis_in,
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    f"curiosity_stream_{idx}_visual_obs_encoder",
                    False,
                )
            )
            next_visual.append(
                ModelUtils.create_visual_observation_encoder(
                    next_vis_in,
                    self.encoding_size,
                    ModelUtils.swish,
                    1,
                    f"curiosity_stream_{idx}_visual_obs_encoder",
                    True,
                )
            )
        current_encodings.append(tf.concat(current_visual, axis=1))
        next_encodings.append(tf.concat(next_visual, axis=1))

    if self.policy.vec_obs_size > 0:
        # Siamese vector encoders, sharing weights the same way.
        current_encodings.append(
            ModelUtils.create_vector_observation_encoder(
                self.policy.vector_in,
                self.encoding_size,
                ModelUtils.swish,
                2,
                "curiosity_vector_obs_encoder",
                False,
            )
        )
        next_encodings.append(
            ModelUtils.create_vector_observation_encoder(
                self.next_vector_in,
                self.encoding_size,
                ModelUtils.swish,
                2,
                "curiosity_vector_obs_encoder",
                True,
            )
        )

    encoded_state = tf.concat(current_encodings, axis=1)
    encoded_next_state = tf.concat(next_encodings, axis=1)
    return encoded_state, encoded_next_state
def create_q_heads(
    self,
    stream_names,
    hidden_input,
    num_layers,
    h_size,
    scope,
    reuse=False,
    num_outputs=1,
):
    """
    Builds twin Q-value branches (Q1 and Q2), with one output head per reward signal.

    Each branch stacks a dense encoder (plus an LSTM when recurrent) on top of
    hidden_input, then adds one dense output layer per stream.
    :param stream_names: The list of reward signal names.
    :param hidden_input: The last layer of the Critic.
    :param num_layers: Number of hidden layers for Q network.
    :param h_size: Size of hidden layers for Q network.
    :param scope: TF scope for Q network.
    :param reuse: Whether or not to reuse variables. Useful for creating Q of policy.
    :param num_outputs: Number of outputs of each Q function. If discrete, equal to number of actions.
    :return: (q1_heads, q2_heads, q1, q2) — the per-stream head dicts and the
        mean over streams for each branch.
    """
    with tf.variable_scope(self.join_scopes(scope, "q1_encoding"), reuse=reuse):
        q1_encoded = ModelUtils.create_vector_observation_encoder(
            hidden_input, h_size, self.activ_fn, num_layers, "q1_encoder", reuse
        )
        if self.use_recurrent:
            q1_encoded, self.q1_memory_out = ModelUtils.create_recurrent_encoder(
                q1_encoded,
                self.q1_memory_in,
                self.sequence_length_ph,
                name="lstm_q1",
            )
        q1_heads = {
            name: tf.layers.dense(q1_encoded, num_outputs, name=f"{name}_q1")
            for name in stream_names
        }
        q1 = tf.reduce_mean(list(q1_heads.values()), axis=0)
    with tf.variable_scope(self.join_scopes(scope, "q2_encoding"), reuse=reuse):
        q2_encoded = ModelUtils.create_vector_observation_encoder(
            hidden_input, h_size, self.activ_fn, num_layers, "q2_encoder", reuse
        )
        if self.use_recurrent:
            q2_encoded, self.q2_memory_out = ModelUtils.create_recurrent_encoder(
                q2_encoded,
                self.q2_memory_in,
                self.sequence_length_ph,
                name="lstm_q2",
            )
        q2_heads = {
            name: tf.layers.dense(q2_encoded, num_outputs, name=f"{name}_q2")
            for name in stream_names
        }
        q2 = tf.reduce_mean(list(q2_heads.values()), axis=0)
    return q1_heads, q2_heads, q1, q2