def create_normalizer_update(
    vector_input: tf.Tensor,
    steps: tf.Tensor,
    running_mean: tf.Tensor,
    running_variance: tf.Tensor,
) -> tf.Operation:
    """
    Creates the update operation for the normalizer.
    :param vector_input: Vector observation to use for updating the running mean
        and variance.
    :param steps: Tensorflow tensor representing the current number of steps that
        have been normalized.
    :param running_mean: Tensorflow tensor representing the current running mean.
    :param running_variance: Tensorflow tensor representing the current running variance.
    :return: A TF operation that updates the normalization based on vector_input.
    """
    # Based on Welford's algorithm for running mean and standard deviation, for batch updates.
    # Discussion here:
    # https://stackoverflow.com/questions/56402955/whats-the-formula-for-welfords-algorithm-for-variance-std-with-batch-updates
    steps_increment = tf.shape(vector_input)[0]
    total_new_steps = tf.add(steps, steps_increment)

    # Compute the incremental update and divide by the number of new steps.
    input_to_old_mean = tf.subtract(vector_input, running_mean)
    new_mean = running_mean + tf.reduce_sum(
        input_to_old_mean / tf.cast(total_new_steps, dtype=tf.float32), axis=0
    )
    # Compute difference of input to the new mean for Welford update.
    input_to_new_mean = tf.subtract(vector_input, new_mean)
    new_variance = running_variance + tf.reduce_sum(
        input_to_new_mean * input_to_old_mean, axis=0
    )

    update_mean = tf.assign(running_mean, new_mean)
    update_variance = tf.assign(running_variance, new_variance)
    update_norm_step = tf.assign(steps, total_new_steps)
    return tf.group([update_mean, update_variance, update_norm_step])
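# The following is a minimal NumPy sketch, not part of the module above, that mirrors
# the batched Welford update and checks it against a direct computation over all data
# seen so far. Note that `running_variance` in the TF code accumulates the sum of
# squared deviations (Welford's M2), not the variance itself; all names here are
# illustrative.
import numpy as np

rng = np.random.default_rng(0)
old_data = rng.normal(size=(100, 3))
batch = rng.normal(size=(32, 3))

steps = old_data.shape[0]
running_mean = old_data.mean(axis=0)
running_m2 = ((old_data - running_mean) ** 2).sum(axis=0)

# Batched Welford update, mirroring create_normalizer_update.
total_new_steps = steps + batch.shape[0]
input_to_old_mean = batch - running_mean
new_mean = running_mean + input_to_old_mean.sum(axis=0) / total_new_steps
input_to_new_mean = batch - new_mean
new_m2 = running_m2 + (input_to_new_mean * input_to_old_mean).sum(axis=0)

# A direct computation over the old data plus the new batch should agree.
all_data = np.concatenate([old_data, batch], axis=0)
assert np.allclose(new_mean, all_data.mean(axis=0))
assert np.allclose(new_m2, ((all_data - all_data.mean(axis=0)) ** 2).sum(axis=0))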
def copy_normalization(self, mean, variance, steps):
    """
    Copies the mean, variance, and steps into the normalizers of the input of
    this SACNetwork. Used to copy the normalizer from the policy network to the
    target network.
    :param mean: Tensor containing the mean.
    :param variance: Tensor containing the variance.
    :param steps: Tensor containing the number of steps.
    :return: A TF operation that performs the copy.
    """
    update_mean = tf.assign(self.running_mean, mean)
    update_variance = tf.assign(self.running_variance, variance)
    update_norm_step = tf.assign(self.normalization_steps, steps)
    return tf.group([update_mean, update_variance, update_norm_step])
def create_normalizer_update(self, vector_input):
    # Incremental (per-update) estimate: the batch is first collapsed to its mean,
    # then a single Welford-style step is applied and the step counter is
    # incremented by one.
    mean_current_observation = tf.reduce_mean(vector_input, axis=0)
    new_mean = self.running_mean + (
        mean_current_observation - self.running_mean
    ) / tf.cast(tf.add(self.normalization_steps, 1), tf.float32)
    new_variance = self.running_variance + (mean_current_observation - new_mean) * (
        mean_current_observation - self.running_mean
    )
    update_mean = tf.assign(self.running_mean, new_mean)
    update_variance = tf.assign(self.running_variance, new_variance)
    update_norm_step = tf.assign(
        self.normalization_steps, self.normalization_steps + 1
    )
    return tf.group([update_mean, update_variance, update_norm_step])
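# For contrast with the batched Welford version below: a minimal NumPy sketch, not part
# of the original module, of this per-update variant. It collapses each batch to its
# mean and advances the step count by one, so it tracks statistics of batch means rather
# than of individual observations. All names are illustrative.
import numpy as np

running_mean = np.zeros(3)
running_variance = np.zeros(3)  # accumulates squared deviations of batch means
normalization_steps = 0

rng = np.random.default_rng(0)
for _ in range(10):
    batch = rng.normal(size=(32, 3))
    mean_current_observation = batch.mean(axis=0)
    new_mean = running_mean + (mean_current_observation - running_mean) / (
        normalization_steps + 1
    )
    running_variance = running_variance + (mean_current_observation - new_mean) * (
        mean_current_observation - running_mean
    )
    running_mean = new_mean
    normalization_steps += 1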
def create_normalizer_update(self, vector_input):
    # Based on Welford's algorithm for running mean and standard deviation, for batch updates.
    # Discussion here:
    # https://stackoverflow.com/questions/56402955/whats-the-formula-for-welfords-algorithm-for-variance-std-with-batch-updates
    steps_increment = tf.shape(vector_input)[0]
    total_new_steps = tf.add(self.normalization_steps, steps_increment)

    # Compute the incremental update and divide by the number of new steps.
    input_to_old_mean = tf.subtract(vector_input, self.running_mean)
    new_mean = self.running_mean + tf.reduce_sum(
        input_to_old_mean / tf.cast(total_new_steps, dtype=tf.float32), axis=0
    )
    # Compute difference of input to the new mean for Welford update.
    input_to_new_mean = tf.subtract(vector_input, new_mean)
    new_variance = self.running_variance + tf.reduce_sum(
        input_to_new_mean * input_to_old_mean, axis=0
    )

    update_mean = tf.assign(self.running_mean, new_mean)
    update_variance = tf.assign(self.running_variance, new_variance)
    update_norm_step = tf.assign(self.normalization_steps, total_new_steps)
    return tf.group([update_mean, update_variance, update_norm_step])
def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
    """
    Takes a TFPolicy and trainer settings and creates a SAC optimizer around
    the policy.
    :param policy: A TFPolicy object that will be updated by this SAC optimizer.
    :param trainer_params: Trainer settings specifying the hyperparameters
        (learning rate and schedule, tau, init_entcoef, max_steps, network
        settings, reward signals, etc.) used to build the optimizer.
    """
    # Create the graph here to give more granular control of the TF graph to the Optimizer.
    policy.create_tf_graph()

    with policy.graph.as_default():
        with tf.variable_scope(""):
            super().__init__(policy, trainer_params)
            hyperparameters: SACSettings = cast(
                SACSettings, trainer_params.hyperparameters
            )
            lr = hyperparameters.learning_rate
            lr_schedule = hyperparameters.learning_rate_schedule
            max_step = trainer_params.max_steps
            self.tau = hyperparameters.tau
            self.init_entcoef = hyperparameters.init_entcoef

            self.policy = policy
            self.act_size = policy.act_size
            policy_network_settings = policy.network_settings
            h_size = policy_network_settings.hidden_units
            num_layers = policy_network_settings.num_layers
            vis_encode_type = policy_network_settings.vis_encode_type

            self.burn_in_ratio = 0.0

            # Non-exposed SAC parameters
            self.discrete_target_entropy_scale = 0.2  # Roughly equal to e-greedy 0.05
            self.continuous_target_entropy_scale = 1.0

            stream_names = list(self.reward_signals.keys())
            # Use to reduce "survivor bonus" when using Curiosity or GAIL.
            self.gammas = [
                _val.gamma for _val in trainer_params.reward_signals.values()
            ]
            self.use_dones_in_backup = {
                name: tf.Variable(1.0) for name in stream_names
            }
            self.disable_use_dones = {
                name: self.use_dones_in_backup[name].assign(0.0)
                for name in stream_names
            }

            if num_layers < 1:
                num_layers = 1

            self.target_init_op: List[tf.Tensor] = []
            self.target_update_op: List[tf.Tensor] = []
            self.update_batch_policy: Optional[tf.Operation] = None
            self.update_batch_value: Optional[tf.Operation] = None
            self.update_batch_entropy: Optional[tf.Operation] = None

            self.policy_network = SACPolicyNetwork(
                policy=self.policy,
                m_size=self.policy.m_size,  # 3x policy.m_size
                h_size=h_size,
                normalize=self.policy.normalize,
                use_recurrent=self.policy.use_recurrent,
                num_layers=num_layers,
                stream_names=stream_names,
                vis_encode_type=vis_encode_type,
            )
            self.target_network = SACTargetNetwork(
                policy=self.policy,
                m_size=self.policy.m_size,  # 1x policy.m_size
                h_size=h_size,
                normalize=self.policy.normalize,
                use_recurrent=self.policy.use_recurrent,
                num_layers=num_layers,
                stream_names=stream_names,
                vis_encode_type=vis_encode_type,
            )
            # The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
            self.m_size = 3 * self.policy.m_size
            self._create_inputs_and_outputs()
            self.learning_rate = ModelUtils.create_schedule(
                lr_schedule,
                lr,
                self.policy.global_step,
                int(max_step),
                min_value=1e-10,
            )
            self._create_losses(
                self.policy_network.q1_heads,
                self.policy_network.q2_heads,
                lr,
                int(max_step),
                stream_names,
                discrete=not self.policy.use_continuous_act,
            )
            self._create_sac_optimizer_ops()

            self.selected_actions = (
                self.policy.selected_actions
            )  # For GAIL and other reward signals
            if self.policy.normalize:
                target_update_norm = self.target_network.copy_normalization(
                    self.policy.running_mean,
                    self.policy.running_variance,
                    self.policy.normalization_steps,
                )
                # Update the normalization of the optimizer when the policy does.
                self.policy.update_normalization_op = tf.group(
                    [self.policy.update_normalization_op, target_update_norm]
                )

            self.stats_name_to_update_name = {
                "Losses/Value Loss": "value_loss",
                "Losses/Policy Loss": "policy_loss",
                "Losses/Q1 Loss": "q1_loss",
                "Losses/Q2 Loss": "q2_loss",
                "Policy/Entropy Coeff": "entropy_coef",
                "Policy/Learning Rate": "learning_rate",
            }

            self.update_dict = {
                "value_loss": self.total_value_loss,
                "policy_loss": self.policy_loss,
                "q1_loss": self.q1_loss,
                "q2_loss": self.q2_loss,
                "entropy_coef": self.ent_coef,
                "update_batch": self.update_batch_policy,
                "update_value": self.update_batch_value,
                "update_entropy": self.update_batch_entropy,
                "learning_rate": self.learning_rate,
            }
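# `tau` above controls the soft (Polyak) update of the target network; the actual ops
# are built in _create_sac_optimizer_ops / _create_losses, which are not shown here.
# Below is a minimal, hypothetical sketch of that style of update: each target variable
# is blended toward its source variable by a factor of tau.
import tensorflow as tf


def make_soft_update_ops(target_vars, source_vars, tau):
    # target <- tau * source + (1 - tau) * target, applied variable-by-variable.
    return [
        tf.assign(target, tau * source + (1.0 - tau) * target)
        for target, source in zip(target_vars, source_vars)
    ]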
def __init__(
    self,
    brain,
    lr=1e-4,
    lr_schedule=LearningRateSchedule.CONSTANT,
    h_size=128,
    init_entcoef=0.1,
    max_step=5e6,
    normalize=False,
    use_recurrent=False,
    num_layers=2,
    m_size=None,
    seed=0,
    stream_names=None,
    tau=0.005,
    gammas=None,
    vis_encode_type=EncoderType.SIMPLE,
):
    """
    Takes a Unity environment and model-specific hyper-parameters and builds the
    SAC model for the environment.
    :param brain: BrainInfo used to generate specific network graph.
    :param lr: Learning rate.
    :param lr_schedule: Learning rate decay schedule.
    :param h_size: Size of hidden layers.
    :param init_entcoef: Initial value for entropy coefficient. Set lower to learn
        faster, set higher to explore more.
    :param max_step: Total number of training steps.
    :param normalize: Whether to normalize vector observation input.
    :param use_recurrent: Whether to use an LSTM layer in the network.
    :param num_layers: Number of hidden layers between encoded input and policy & value layers.
    :param tau: Strength of soft-Q update.
    :param m_size: Size of brain memory.
    """
    self.tau = tau
    self.gammas = gammas
    self.brain = brain
    self.init_entcoef = init_entcoef
    if stream_names is None:
        stream_names = []
    # Use to reduce "survivor bonus" when using Curiosity or GAIL.
    self.use_dones_in_backup = {name: tf.Variable(1.0) for name in stream_names}
    self.disable_use_dones = {
        name: self.use_dones_in_backup[name].assign(0.0) for name in stream_names
    }
    LearningModel.__init__(
        self, m_size, normalize, use_recurrent, brain, seed, stream_names
    )
    if num_layers < 1:
        num_layers = 1

    self.target_init_op: List[tf.Tensor] = []
    self.target_update_op: List[tf.Tensor] = []
    self.update_batch_policy: Optional[tf.Operation] = None
    self.update_batch_value: Optional[tf.Operation] = None
    self.update_batch_entropy: Optional[tf.Operation] = None

    self.policy_network = SACPolicyNetwork(
        brain=brain,
        m_size=m_size,
        h_size=h_size,
        normalize=normalize,
        use_recurrent=use_recurrent,
        num_layers=num_layers,
        seed=seed,
        stream_names=stream_names,
        vis_encode_type=vis_encode_type,
    )
    self.target_network = SACTargetNetwork(
        brain=brain,
        m_size=m_size // 4 if m_size else None,
        h_size=h_size,
        normalize=normalize,
        use_recurrent=use_recurrent,
        num_layers=num_layers,
        seed=seed,
        stream_names=stream_names,
        vis_encode_type=vis_encode_type,
    )
    self.create_inputs_and_outputs()
    self.learning_rate = self.create_learning_rate(
        lr_schedule, lr, self.global_step, max_step
    )
    self.create_losses(
        self.policy_network.q1_heads,
        self.policy_network.q2_heads,
        lr,
        max_step,
        stream_names,
        discrete=self.brain.vector_action_space_type == "discrete",
    )

    self.selected_actions = (
        self.policy_network.selected_actions
    )  # For GAIL and other reward signals
    if normalize:
        target_update_norm = self.target_network.copy_normalization(
            self.policy_network.running_mean,
            self.policy_network.running_variance,
            self.policy_network.normalization_steps,
        )
        self.update_normalization = tf.group(
            [self.policy_network.update_normalization, target_update_norm]
        )
        self.running_mean = self.policy_network.running_mean
        self.running_variance = self.policy_network.running_variance
        self.normalization_steps = self.policy_network.normalization_steps