def create_normalizer_update( vector_input: tf.Tensor, steps: tf.Tensor, running_mean: tf.Tensor, running_variance: tf.Tensor, ) -> tf.Operation: """ Creates the update operation for the normalizer. :param vector_input: Vector observation to use for updating the running mean and variance. :param running_mean: Tensorflow tensor representing the current running mean. :param running_variance: Tensorflow tensor representing the current running variance. :param steps: Tensorflow tensor representing the current number of steps that have been normalized. :return: A TF operation that updates the normalization based on vector_input. """ # Based on Welford's algorithm for running mean and standard deviation, for batch updates. Discussion here: # https://stackoverflow.com/questions/56402955/whats-the-formula-for-welfords-algorithm-for-variance-std-with-batch-updates steps_increment = tf.shape(vector_input)[0] total_new_steps = tf.add(steps, steps_increment) # Compute the incremental update and divide by the number of new steps. input_to_old_mean = tf.subtract(vector_input, running_mean) new_mean = running_mean + tf.reduce_sum( input_to_old_mean / tf.cast(total_new_steps, dtype=tf.float32), axis=0) # Compute difference of input to the new mean for Welford update input_to_new_mean = tf.subtract(vector_input, new_mean) new_variance = running_variance + tf.reduce_sum( input_to_new_mean * input_to_old_mean, axis=0) update_mean = tf.assign(running_mean, new_mean) update_variance = tf.assign(running_variance, new_variance) update_norm_step = tf.assign(steps, total_new_steps) return tf.group([update_mean, update_variance, update_norm_step])
def __init__( self, m_size, normalize, use_recurrent, brain, seed, stream_names=None ): tf.set_random_seed(seed) self.brain = brain self.vector_in = None self.global_step, self.increment_step, self.steps_to_increment = ( self.create_global_steps() ) self.visual_in = [] self.batch_size = tf.placeholder(shape=None, dtype=tf.int32, name="batch_size") self.sequence_length = tf.placeholder( shape=None, dtype=tf.int32, name="sequence_length" ) self.mask_input = tf.placeholder(shape=[None], dtype=tf.float32, name="masks") self.mask = tf.cast(self.mask_input, tf.int32) self.stream_names = stream_names or [] self.use_recurrent = use_recurrent if self.use_recurrent: self.m_size = m_size else: self.m_size = 0 self.normalize = normalize self.act_size = brain.vector_action_space_size self.vec_obs_size = brain.vector_observation_space_size self.vis_obs_size = brain.number_visual_observations tf.Variable( int(brain.vector_action_space_type == "continuous"), name="is_continuous_control", trainable=False, dtype=tf.int32, ) tf.Variable( self._version_number_, name="version_number", trainable=False, dtype=tf.int32, ) tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32) if brain.vector_action_space_type == "continuous": tf.Variable( self.act_size[0], name="action_output_shape", trainable=False, dtype=tf.int32, ) else: tf.Variable( sum(self.act_size), name="action_output_shape", trainable=False, dtype=tf.int32, ) self.value_heads: Dict[str, tf.Tensor] = {} self.normalization_steps: Optional[tf.Variable] = None self.running_mean: Optional[tf.Variable] = None self.running_variance: Optional[tf.Variable] = None self.update_normalization: Optional[tf.Operation] = None self.value: Optional[tf.Tensor] = None
def normalize_vector_obs(self, vector_obs): normalized_state = tf.clip_by_value( (vector_obs - self.running_mean) / tf.sqrt(self.running_variance / (tf.cast(self.normalization_steps, tf.float32) + 1)), -5, 5, name="normalized_state", ) return normalized_state
def create_normalizer_update(self, vector_input): mean_current_observation = tf.reduce_mean(vector_input, axis=0) new_mean = self.running_mean + ( mean_current_observation - self.running_mean) / tf.cast( tf.add(self.normalization_steps, 1), tf.float32) new_variance = self.running_variance + ( mean_current_observation - new_mean) * (mean_current_observation - self.running_mean) update_mean = tf.assign(self.running_mean, new_mean) update_variance = tf.assign(self.running_variance, new_variance) update_norm_step = tf.assign(self.normalization_steps, self.normalization_steps + 1) return tf.group([update_mean, update_variance, update_norm_step])
def create_normalizer_update(self, vector_input): # Based on Welford's algorithm for running mean and standard deviation, for batch updates. Discussion here: # https://stackoverflow.com/questions/56402955/whats-the-formula-for-welfords-algorithm-for-variance-std-with-batch-updates steps_increment = tf.shape(vector_input)[0] total_new_steps = tf.add(self.normalization_steps, steps_increment) # Compute the incremental update and divide by the number of new steps. input_to_old_mean = tf.subtract(vector_input, self.running_mean) new_mean = self.running_mean + tf.reduce_sum( input_to_old_mean / tf.cast(total_new_steps, dtype=tf.float32), axis=0) # Compute difference of input to the new mean for Welford update input_to_new_mean = tf.subtract(vector_input, new_mean) new_variance = self.running_variance + tf.reduce_sum( input_to_new_mean * input_to_old_mean, axis=0) update_mean = tf.assign(self.running_mean, new_mean) update_variance = tf.assign(self.running_variance, new_variance) update_norm_step = tf.assign(self.normalization_steps, total_new_steps) return tf.group([update_mean, update_variance, update_norm_step])
def normalize_vector_obs( vector_obs: tf.Tensor, running_mean: tf.Tensor, running_variance: tf.Tensor, normalization_steps: tf.Tensor, ) -> tf.Tensor: """ Create a normalized version of an input tensor. :param vector_obs: Input vector observation tensor. :param running_mean: Tensorflow tensor representing the current running mean. :param running_variance: Tensorflow tensor representing the current running variance. :param normalization_steps: Tensorflow tensor representing the current number of normalization_steps. :return: A normalized version of vector_obs. """ normalized_state = tf.clip_by_value( (vector_obs - running_mean) / tf.sqrt(running_variance / (tf.cast(normalization_steps, tf.float32) + 1)), -5, 5, name="normalized_state", ) return normalized_state
def create_input_placeholders(self): with self.graph.as_default(): ( self.global_step, self.increment_step_op, self.steps_to_increment, ) = ModelUtils.create_global_steps() self.vector_in, self.visual_in = ModelUtils.create_input_placeholders( self.behavior_spec.observation_shapes) if self.normalize: self.first_normalization_update = True normalization_tensors = ModelUtils.create_normalizer( self.vector_in) self.update_normalization_op = normalization_tensors.update_op self.init_normalization_op = normalization_tensors.init_op self.normalization_steps = normalization_tensors.steps self.running_mean = normalization_tensors.running_mean self.running_variance = normalization_tensors.running_variance self.processed_vector_in = ModelUtils.normalize_vector_obs( self.vector_in, self.running_mean, self.running_variance, self.normalization_steps, ) else: self.processed_vector_in = self.vector_in self.update_normalization_op = None self.batch_size_ph = tf.placeholder(shape=None, dtype=tf.int32, name="batch_size") self.sequence_length_ph = tf.placeholder(shape=None, dtype=tf.int32, name="sequence_length") self.mask_input = tf.placeholder(shape=[None], dtype=tf.float32, name="masks") # Only needed for PPO, but needed for BC module self.epsilon = tf.placeholder(shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon") self.mask = tf.cast(self.mask_input, tf.int32) tf.Variable( int(self.behavior_spec.is_action_continuous()), name="is_continuous_control", trainable=False, dtype=tf.int32, ) int_version = TFPolicy._convert_version_string(__version__) major_ver_t = tf.Variable( int_version[0], name="trainer_major_version", trainable=False, dtype=tf.int32, ) minor_ver_t = tf.Variable( int_version[1], name="trainer_minor_version", trainable=False, dtype=tf.int32, ) patch_ver_t = tf.Variable( int_version[2], name="trainer_patch_version", trainable=False, dtype=tf.int32, ) self.version_tensors = (major_ver_t, minor_ver_t, patch_ver_t) tf.Variable( MODEL_FORMAT_VERSION, name="version_number", trainable=False, dtype=tf.int32, ) tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32) if self.behavior_spec.is_action_continuous(): tf.Variable( self.act_size[0], name="action_output_shape", trainable=False, dtype=tf.int32, ) else: tf.Variable( sum(self.act_size), name="action_output_shape", trainable=False, dtype=tf.int32, )
def create_input_placeholders(self): with self.graph.as_default(): ( self.global_step, self.increment_step_op, self.steps_to_increment, ) = ModelUtils.create_global_steps() self.visual_in = ModelUtils.create_visual_input_placeholders( self.brain.camera_resolutions ) self.vector_in = ModelUtils.create_vector_input(self.vec_obs_size) if self.normalize: normalization_tensors = ModelUtils.create_normalizer(self.vector_in) self.update_normalization_op = normalization_tensors.update_op self.normalization_steps = normalization_tensors.steps self.running_mean = normalization_tensors.running_mean self.running_variance = normalization_tensors.running_variance self.processed_vector_in = ModelUtils.normalize_vector_obs( self.vector_in, self.running_mean, self.running_variance, self.normalization_steps, ) else: self.processed_vector_in = self.vector_in self.update_normalization_op = None self.batch_size_ph = tf.placeholder( shape=None, dtype=tf.int32, name="batch_size" ) self.sequence_length_ph = tf.placeholder( shape=None, dtype=tf.int32, name="sequence_length" ) self.mask_input = tf.placeholder( shape=[None], dtype=tf.float32, name="masks" ) # Only needed for PPO, but needed for BC module self.epsilon = tf.placeholder( shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon" ) self.mask = tf.cast(self.mask_input, tf.int32) tf.Variable( int(self.brain.vector_action_space_type == "continuous"), name="is_continuous_control", trainable=False, dtype=tf.int32, ) tf.Variable( self._version_number_, name="version_number", trainable=False, dtype=tf.int32, ) tf.Variable( self.m_size, name="memory_size", trainable=False, dtype=tf.int32 ) if self.brain.vector_action_space_type == "continuous": tf.Variable( self.act_size[0], name="action_output_shape", trainable=False, dtype=tf.int32, ) else: tf.Variable( sum(self.act_size), name="action_output_shape", trainable=False, dtype=tf.int32, )
def __init__( self, brain, h_size=128, lr=1e-4, n_layers=2, m_size=128, normalize=False, use_recurrent=False, seed=0, ): LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed) num_streams = 1 hidden_streams = self.create_observation_streams(num_streams, h_size, n_layers) hidden = hidden_streams[0] self.dropout_rate = tf.placeholder( dtype=tf.float32, shape=[], name="dropout_rate" ) hidden_reg = tf.layers.dropout(hidden, self.dropout_rate) if self.use_recurrent: tf.Variable( self.m_size, name="memory_size", trainable=False, dtype=tf.int32 ) self.memory_in = tf.placeholder( shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in" ) hidden_reg, self.memory_out = self.create_recurrent_encoder( hidden_reg, self.memory_in, self.sequence_length ) self.memory_out = tf.identity(self.memory_out, name="recurrent_out") if brain.vector_action_space_type == "discrete": policy_branches = [] for size in self.act_size: policy_branches.append( tf.layers.dense( hidden_reg, size, activation=None, use_bias=False, kernel_initializer=tf.initializers.variance_scaling(0.01), ) ) self.action_probs = tf.concat( [tf.nn.softmax(branch) for branch in policy_branches], axis=1, name="action_probs", ) self.action_masks = tf.placeholder( shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks" ) self.sample_action_float, _, normalized_logits = self.create_discrete_action_masking_layer( tf.concat(policy_branches, axis=1), self.action_masks, self.act_size ) tf.identity(normalized_logits, name="action") self.sample_action = tf.cast(self.sample_action_float, tf.int32) self.true_action = tf.placeholder( shape=[None, len(policy_branches)], dtype=tf.int32, name="teacher_action", ) self.action_oh = tf.concat( [ tf.one_hot(self.true_action[:, i], self.act_size[i]) for i in range(len(self.act_size)) ], axis=1, ) self.loss = tf.reduce_sum( -tf.log(self.action_probs + 1e-10) * self.action_oh ) self.action_percent = tf.reduce_mean( tf.cast( tf.equal( tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32), self.sample_action, ), tf.float32, ) ) else: self.policy = tf.layers.dense( hidden_reg, self.act_size[0], activation=None, use_bias=False, name="pre_action", kernel_initializer=tf.initializers.variance_scaling(0.01), ) self.clipped_sample_action = tf.clip_by_value(self.policy, -1, 1) self.sample_action = tf.identity(self.clipped_sample_action, name="action") self.true_action = tf.placeholder( shape=[None, self.act_size[0]], dtype=tf.float32, name="teacher_action" ) self.clipped_true_action = tf.clip_by_value(self.true_action, -1, 1) self.loss = tf.reduce_sum( tf.squared_difference(self.clipped_true_action, self.sample_action) ) optimizer = tf.train.AdamOptimizer(learning_rate=lr) self.update = optimizer.minimize(self.loss)