def loss_fn(self, policy=None, value=None):
    adv = tf.placeholder(tf.float32, [None], name="advantages")
    returns = tf.placeholder(tf.float32, [None], name="returns")
    logli_old = tf.placeholder(tf.float32, [None], name="logli_old")
    value_old = tf.placeholder(tf.float32, [None], name="value_old")

    if self.subenvs:
        assert policy is not None and value is not None, \
            "Missing variables representing <policy> and <value>"
    else:
        policy, value = self.policy, self.value

    # probability ratio between the new and the old (rollout-time) policy
    ratio = tf.exp(policy.logli - logli_old)
    clipped_ratio = tf.clip_by_value(
        ratio, 1 - self.clip_ratio, 1 + self.clip_ratio)

    value_err = (value - returns)**2
    if self.clip_value > 0.0:
        # optionally clip the value prediction around the old estimate
        clipped_value = tf.clip_by_value(
            value, value_old - self.clip_value, value_old + self.clip_value)
        clipped_value_err = (clipped_value - returns)**2
        value_err = tf.maximum(value_err, clipped_value_err)

    policy_loss = -tf.reduce_mean(
        tf.minimum(adv * ratio, adv * clipped_ratio))
    value_loss = tf.reduce_mean(value_err) * self.value_coef
    entropy_loss = tf.reduce_mean(policy.entropy) * self.entropy_coef

    # we want to reduce policy and value errors, and maximize entropy,
    # but since the optimizer is minimizing, the signs are opposite
    full_loss = policy_loss + value_loss - entropy_loss

    return full_loss, [policy_loss, value_loss, entropy_loss], \
        [adv, returns, logli_old, value_old]
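# The policy term above is the standard PPO clipped surrogate objective
# (Schulman et al., 2017). With the probability ratio
#     r_t = exp(policy.logli - logli_old),
# the quantity being maximized is
#     L_CLIP = E_t[ min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) ],
# where eps is `clip_ratio` and A_t are the fed-in advantages; `policy_loss`
# is its negation so that a minimizing optimizer ascends the objective.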
def learning_rate_schedule_noam(train_steps,
                                warmup_steps=10000,
                                linear_decay_fraction=0.1,
                                multiplier=1.0):
  """Noam's favorite learning-rate schedule.

  (rsqrt(max(step_num, warmup_steps))
   * multiplier
   * min(1.0, (train_steps - step_num) / (train_steps * linear_decay_fraction)))

  Args:
    train_steps: a number, the total number of training steps
    warmup_steps: a number of initial steps during which the rate is held
      constant at rsqrt(warmup_steps)
    linear_decay_fraction: a number, the final fraction of train_steps over
      which the rate decays linearly to zero
    multiplier: a number, a constant scale factor

  Returns:
    a scalar tf.Tensor
  """
  train_steps = float(train_steps)
  step_num = tf.cast(tf.train.get_global_step(), tf.float32)
  learning_rate = tf.math.rsqrt(tf.maximum(step_num, warmup_steps))
  learning_rate *= multiplier
  if linear_decay_fraction > 0:
    learning_rate *= tf.minimum(1.0, (train_steps - step_num) /
                                (train_steps * linear_decay_fraction))
  return learning_rate
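# A minimal sketch of plugging the schedule into a TF1-style training loop.
# The toy variable/loss and the 100000-step budget are hypothetical
# stand-ins, not part of the function above.
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()  # the schedule assumes graph-mode TF1 semantics

w = tf.get_variable("w", shape=[10])                 # toy parameter
loss = tf.reduce_mean(tf.square(w))                  # toy loss
global_step = tf.train.get_or_create_global_step()   # read by the schedule
learning_rate = learning_rate_schedule_noam(train_steps=100000)
train_op = tf.train.AdamOptimizer(learning_rate).minimize(
    loss, global_step=global_step)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for _ in range(3):
    sess.run(train_op)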
def loss_fn(self):
    adv = tf.placeholder(tf.float32, [None], name="advantages")
    returns = tf.placeholder(tf.float32, [None], name="returns")
    logli_old = tf.placeholder(tf.float32, [None], name="logli_old")

    ratio = tf.exp(self.policy.logli - logli_old)
    clipped_ratio = tf.clip_by_value(ratio, 1 - self.clip_ratio,
                                     1 + self.clip_ratio)

    policy_loss = -tf.reduce_mean(tf.minimum(adv * ratio, adv * clipped_ratio))
    # TODO clip value loss
    value_loss = tf.reduce_mean((self.value - returns)**2) * self.value_coef
    entropy_loss = tf.reduce_mean(self.policy.entropy) * self.entropy_coef

    # we want to reduce policy and value errors, and maximize entropy
    # but since optimizer is minimizing the signs are opposite
    full_loss = policy_loss + value_loss - entropy_loss

    return full_loss, [policy_loss, value_loss, entropy_loss], \
        [adv, returns, logli_old]
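# A self-contained numeric check of the clipping behaviour, with NumPy
# standing in for the TF ops above and the common eps = 0.2 default:
import numpy as np

clip_ratio = 0.2
logli, logli_old = np.array([0.5, -0.5]), np.zeros(2)
adv = np.ones(2)
ratio = np.exp(logli - logli_old)  # ~[1.65, 0.61]
clipped_ratio = np.clip(ratio, 1 - clip_ratio, 1 + clip_ratio)
policy_loss = -np.mean(np.minimum(adv * ratio, adv * clipped_ratio))
# For the first sample min() picks the clipped 1.2, bounding the update;
# for the second the unclipped 0.61 is already smaller, so the clip only
# caps how much the policy is rewarded for moving in the improving direction.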
def pad_or_clip(mesh_inputs, view_indices_2d_inputs, pad_or_clip_size):
  """Pads and clips the points and correspondences."""
  if standard_fields.InputDataFields.point_positions not in mesh_inputs:
    return
  num_valid_points = tf.shape(
      mesh_inputs[standard_fields.InputDataFields.point_positions])[0]
  if pad_or_clip_size:
    num_valid_points = tf.minimum(num_valid_points, pad_or_clip_size)
    for key in sorted(mesh_inputs.keys()):
      num_channels = mesh_inputs[key].get_shape().as_list()[1]
      mesh_inputs[key] = shape_utils.pad_or_clip_nd(
          tensor=mesh_inputs[key],
          output_shape=[pad_or_clip_size, num_channels])
    for key in sorted(view_indices_2d_inputs):
      num_images = view_indices_2d_inputs[key].get_shape().as_list()[0]
      if num_images is None:
        num_images = tf.shape(view_indices_2d_inputs[key])[0]
      # shift by +1 before zero-padding and undo it afterwards, so padded
      # slots come out as -1, the invalid-index marker
      view_indices_2d_inputs[key] = shape_utils.pad_or_clip_nd(
          tensor=(view_indices_2d_inputs[key] + 1),
          output_shape=[num_images, pad_or_clip_size, 2]) - 1
  mesh_inputs[
      standard_fields.InputDataFields.num_valid_points] = num_valid_points
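# A minimal, self-contained illustration of the "+1 ... -1" shift, with
# plain tf.pad standing in for shape_utils.pad_or_clip_nd: shifting before
# zero-padding makes every padded slot read back as -1.
import tensorflow as tf

indices = tf.constant([[3, 7], [2, 5]])             # two valid index pairs
padded = tf.pad(indices + 1, [[0, 2], [0, 0]]) - 1  # pad to 4 rows with zeros
# padded == [[3, 7], [2, 5], [-1, -1], [-1, -1]]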