def testAverages(self):
  """Moving mean / sq-mean of the adaptive clipper track the gradient log-norm."""
  with self.cached_session() as session:
    scale = 2.
    grad = array_ops.ones([3, 4]) * scale
    log_norm = np.log(
        np.sqrt(scale**2 * grad.get_shape().num_elements()))
    grads_and_vars = optimizers_lib.adaptive_clipping_fn(decay=0.5)(
        [(grad, grad)])

    # Collect the two moving-average variables the clipper created.
    norm_vars = {
        v.name.split(":")[0]: v
        for v in variables.global_variables()
        if v.name.startswith("AdaptiveMaxNorm")
    }
    self.assertEqual(2, len(norm_vars))
    moving_mean = norm_vars["AdaptiveMaxNorm/mean"]
    moving_sq_mean = norm_vars["AdaptiveMaxNorm/sq_mean"]

    variables.global_variables_initializer().run()
    mean, sq_mean = session.run([moving_mean, moving_sq_mean])
    self.assertEqual([0], mean)
    self.assertEqual([0], sq_mean)
    for step in range(20):
      mean, sq_mean, _ = session.run(
          [moving_mean, moving_sq_mean, grads_and_vars[0][0]])
      if step == 0:
        # Right after the first update the averages are still biased
        # toward their zero initialization.
        self.assertLess(mean, 0.9 * log_norm)
        self.assertLess(sq_mean, 0.9 * log_norm**2)

    # After 20 constant-gradient steps the averages have converged.
    self.assertAlmostEqual(float(mean), log_norm, places=4)
    self.assertAlmostEqual(float(sq_mean), log_norm**2, places=4)
def testClip(self):
  """Steady gradients pass through unchanged; a sudden spike is clipped.

  Feeds alternating small gradients for 20 steps, then a 1000x spike: the
  spike must be clipped hard, but repeating it eventually re-adapts the
  norm estimate so the spike magnitude passes through again.
  """
  # self.test_session() is deprecated; use cached_session() as the
  # sibling tests in this file do.
  with self.cached_session() as session:
    spike = 1000.
    multiplier = array_ops.placeholder(dtypes.float32, [], "multiplier")
    step = array_ops.placeholder(dtypes.int32, [], "step")

    grad = array_ops.ones([3, 4]) * multiplier
    grads_and_vars = [(grad, grad)]
    grads_and_vars = optimizers_lib.adaptive_clipping_fn(
        decay=0.9, global_step=step)(grads_and_vars)

    variables.global_variables_initializer().run()

    def run(scale, i):
      return session.run(grads_and_vars[0][0],
                         feed_dict={multiplier: scale, step: i})

    for i in range(20):
      scale = [1., -2.][i % 2]
      clipped_grad = run(scale, i)
      if i > 3:
        # Once the moving averages warm up, ordinary gradients are
        # passed through unchanged.
        self.assertAllClose(np.ones(clipped_grad.shape) * scale,
                            clipped_grad)

    # assert that the spike will have low influence.
    clipped_grad = run(spike, 20)
    self.assertTrue((clipped_grad < 25.).all())

    # assert that a repeated spike will converge to this new value.
    for i in range(10):
      clipped_grad = run(spike, i + 21)

    self.assertAllClose(np.ones(clipped_grad.shape) * spike, clipped_grad)
def testAverages(self):
  """Moving averages of the gradient log-norm converge to the true value."""
  # self.test_session() is deprecated in favor of cached_session();
  # this also matches the cached_session() usage elsewhere in the file.
  with self.cached_session() as session:
    scale = 2.
    grad = array_ops.ones([3, 4]) * scale
    log_norm = np.log(np.sqrt(scale**2 * grad.get_shape().num_elements()))
    grads_and_vars = [(grad, grad)]
    grads_and_vars = optimizers_lib.adaptive_clipping_fn(
        decay=0.5)(grads_and_vars)

    # The clipper creates exactly two state variables: mean and sq_mean.
    var_dict = {}
    for var in variables.global_variables():
      if var.name.startswith("AdaptiveMaxNorm"):
        var_dict[var.name.split(":")[0]] = var
    self.assertEqual(2, len(var_dict))
    moving_mean = var_dict["AdaptiveMaxNorm/mean"]
    moving_sq_mean = var_dict["AdaptiveMaxNorm/sq_mean"]

    variables.global_variables_initializer().run()
    mean, sq_mean = session.run([moving_mean, moving_sq_mean])
    self.assertEqual([0], mean)
    self.assertEqual([0], sq_mean)
    for i in range(20):
      mean, sq_mean, _ = session.run(
          [moving_mean, moving_sq_mean, grads_and_vars[0][0]])
      if i == 0:
        # First step: averages are still biased toward zero init.
        self.assertLess(mean, 0.9 * log_norm)
        self.assertLess(sq_mean, 0.9 * log_norm**2)

    self.assertAlmostEqual(float(mean), log_norm, places=4)
    self.assertAlmostEqual(float(sq_mean), log_norm**2, places=4)
def testClip(self):
  """Adaptive clipping damps a gradient spike, then re-adapts if it repeats."""
  # Replaced deprecated self.test_session() with self.cached_session().
  with self.cached_session() as session:
    spike = 1000.
    multiplier = array_ops.placeholder(dtypes.float32, [], "multiplier")
    step = array_ops.placeholder(dtypes.int32, [], "step")

    grad = array_ops.ones([3, 4]) * multiplier
    grads_and_vars = [(grad, grad)]
    grads_and_vars = optimizers_lib.adaptive_clipping_fn(
        decay=0.9, global_step=step)(grads_and_vars)

    variables.global_variables_initializer().run()

    def run(scale, i):
      # Evaluate the clipped gradient for a given multiplier and step.
      return session.run(grads_and_vars[0][0],
                         feed_dict={multiplier: scale, step: i})

    for i in range(20):
      scale = [1., -2.][i % 2]
      clipped_grad = run(scale, i)
      if i > 3:
        # After warm-up, normal-sized gradients pass through unchanged.
        self.assertAllClose(np.ones(clipped_grad.shape) * scale,
                            clipped_grad)

    # assert that the spike will have low influence.
    clipped_grad = run(spike, 20)
    self.assertTrue((clipped_grad < 25.).all())

    # assert that a repeated spike will converge to this new value.
    for i in range(10):
      clipped_grad = run(spike, i + 21)

    self.assertAllClose(np.ones(clipped_grad.shape) * spike, clipped_grad)
def testAdaptiveGradientClip(self):
  """optimize_loss with an adaptive clipper trains and creates its state vars."""
  with self.cached_session() as session:
    x, var, loss, global_step = _setup_model()
    train = optimizers_lib.optimize_loss(
        loss,
        global_step,
        learning_rate=0.1,
        optimizer="SGD",
        clip_gradients=optimizers_lib.adaptive_clipping_fn())
    variables.global_variables_initializer().run()
    session.run(train, feed_dict={x: 5})

    var_value, global_step_value = session.run([var, global_step])
    self.assertAlmostEqual(var_value, 9.8916, 4)
    self.assertEqual(global_step_value, 1)

    # The clipper should have registered exactly two variables
    # (mean and sq_mean) under the OptimizeLoss scope.
    norm_var_count = sum(
        1 for v in variables.global_variables()
        if v.name.startswith("OptimizeLoss/AdaptiveMaxNorm"))
    self.assertEqual(2, norm_var_count)
def testAdaptiveGradientClip(self):
  """optimize_loss with adaptive clipping trains and registers norm variables."""
  # cached_session() replaces the deprecated test_session(), matching
  # the cached_session() usage elsewhere in the file.
  with self.cached_session() as session:
    x, var, loss, global_step = _setup_model()
    clip_gradients = optimizers_lib.adaptive_clipping_fn()
    train = optimizers_lib.optimize_loss(
        loss,
        global_step,
        learning_rate=0.1,
        optimizer="SGD",
        clip_gradients=clip_gradients)
    variables.global_variables_initializer().run()
    session.run(train, feed_dict={x: 5})
    var_value, global_step_value = session.run([var, global_step])
    self.assertAlmostEqual(var_value, 9.8916, 4)
    self.assertEqual(global_step_value, 1)
    # The clipper creates two variables: the mean and sq_mean averages.
    var_count = 0
    for var in variables.global_variables():
      if var.name.startswith("OptimizeLoss/AdaptiveMaxNorm"):
        var_count += 1
    self.assertEqual(2, var_count)
def __init__(self, meta_lr, score_fn, **kwargs):
  """Builds the meta-RL agent's score function, optimizer and grad clipper.

  Args:
    meta_lr: learning rate for the Adam meta-optimizer.
    score_fn: string key selecting the score-function architecture;
      one of 'simple_linear' or 'linear'.
    **kwargs: forwarded to the base agent constructor.

  Raises:
    NotImplementedError: if `score_fn` is not a recognized key.
  """
  super(MetaRLAgent, self).__init__(**kwargs)

  # Dispatch table: score_fn key -> (log message, model factory).
  score_fn_choices = {
      'simple_linear': ('Using simple linear score function.',
                        nn_model.SimpleLinearNN),
      'linear': ('Using linear score function with priors.',
                 nn_model.LinearNN),
  }
  if score_fn not in score_fn_choices:
    raise NotImplementedError
  log_msg, make_score_fn = score_fn_choices[score_fn]
  tf.logging.info(log_msg)
  self.score_fn = make_score_fn()

  self._init_score_fn()
  self.score_optimizer = contrib_optimizer_v2.AdamOptimizer(
      learning_rate=meta_lr)
  self._meta_train = True
  # Adaptive gradient clipping
  self._score_grad_clipping = optimizers_lib.adaptive_clipping_fn(
      decay=0.9,
      report_summary=self.log_summaries,
      static_max_norm=self.max_grad_norm / 2.0,
      global_step=self.global_step)
def _make_train_op(loss, hparams):
  """Create train op."""

  def learning_rate_decay_fn(learning_rate, global_step):
    # Exponential decay, scaled by a linear warm-up ramp that saturates
    # at 1 once global_step reaches hparams.lr_warmup_steps.
    decayed_lr = tf.train.exponential_decay(learning_rate, global_step,
                                            hparams.lr_decay_steps,
                                            hparams.lr_decay_rate)
    warmup_factor = tf.minimum(
        tf.cast(global_step / hparams.lr_warmup_steps, tf.float32),
        tf.constant(1.))
    return decayed_lr * warmup_factor

  return contrib_layers.optimize_loss(
      loss=loss,
      global_step=tf.train.get_global_step(),
      clip_gradients=optimizers_lib.adaptive_clipping_fn(
          decay=hparams.gradient_clipping_decay,
          report_summary=True,
      ),
      learning_rate=hparams.learning_rate,
      learning_rate_decay_fn=learning_rate_decay_fn,
      optimizer='Adam')
def simple_model_fn(features, labels, mode, params):
  """Model function for LN model."""
  # Reshape the flat input features into (batch, ...) tensors using the
  # sizes declared in `params`.
  features['alphas'] = tf.reshape(features['alphas'],
                                  (params['batch_size'], 1))
  features['neuron_ids'] = tf.reshape(
      features['neuron_ids'], (params['batch_size'], params['window_size']))
  features['w'] = tf.reshape(features['w'],
                             (params['batch_size'], params['window_size']))
  features['global_features'] = tf.reshape(
      features['global_features'],
      (params['batch_size'], params['N_global_features']))
  # 'X' carries window_padding extra samples on each side of the window.
  features['X'] = tf.reshape(
      features['X'],
      (params['batch_size'],
       params['window_size'] + 2 * params['window_padding']))

  # The network to run is injected via params; it returns the raw outputs
  # and per-neuron normalizer values.
  outputs, normalizers = params['network_fn'](features, mode, params)

  if mode == learn.ModeKeys.TRAIN:
    summarize_layer('labels', labels)
  output = outputs['output']
  summarize_layer('output', output)
  if params['use_normalizer']:
    # Scale each output by the normalizer of its neuron.
    multipliers = tf.gather(normalizers, features['neuron_ids'])
    output_norm = multipliers * output
  else:
    output_norm = tf.identity(output)
  # NOTE(review): output_norm_pre is assigned but not used below —
  # presumably kept for debugging/inspection; confirm before removing.
  output_norm_pre = output_norm
  output_norm = output_norm * features['alphas']

  loss = None
  train_op = None
  # Named identity op so the zero-fraction of the outputs can be tracked
  # externally by name.
  zero_fraction = tf.identity(tf.reduce_mean(
      tf.nn.zero_fraction(output_norm), keep_dims=True),
                              name="zero_fraction_tracker")

  # Calculate Loss (for both TRAIN and EVAL modes)
  if mode != learn.ModeKeys.INFER:
    # Weighted MSE over the flattened predictions; 'w' supplies
    # per-element weights.
    loss = tf.losses.mean_squared_error(labels=tf.reshape(labels,
                                                          shape=(-1, )),
                                        predictions=tf.reshape(
                                            output_norm, shape=(-1, )),
                                        weights=tf.reshape(features['w'],
                                                           shape=(-1, )))

  # Configure the Training Op (for TRAIN mode)
  if mode == learn.ModeKeys.TRAIN:
    global_step = tf.contrib.framework.get_global_step()
    # Staircase exponential decay starting from params['alpha'].
    learning_rate = tf.train.exponential_decay(params['alpha'],
                                               global_step,
                                               params['decay_every'],
                                               params['decay_multiplier'],
                                               staircase=True,
                                               name='learning_rate')
    summary.scalar('learning_rate', learning_rate)
    # NOTE(review): labels[:, 1] is summarized as a weight sum — assumes
    # the second label column holds weights; confirm against input fn.
    summary.scalar('sum_weights_train', tf.reduce_sum(labels[:, 1]))
    # Adaptive gradient clipping is applied inside optimize_loss.
    train_op = tf.contrib.layers.optimize_loss(
        loss=loss,
        global_step=global_step,
        learning_rate=learning_rate,
        optimizer="Adam",
        clip_gradients=optimizers_lib.adaptive_clipping_fn())
  elif mode == learn.ModeKeys.EVAL:
    summary.scalar('sum_weights_eval', tf.reduce_sum(labels[:, 1]))

  # Generate Predictions
  predictions = {
      "relu_output": tf.identity(output_norm, name='relu_output'),
      # Quantized copy of the output (x1e4, rounded to int32), exposed
      # under a stable op name.
      "relu_coarse": tf.cast(tf.round(tf.reshape(output_norm,
                                                 (-1, )) * 1e4),
                             tf.int32,
                             name='relu_coarse'),
  }

  if mode != learn.ModeKeys.INFER:
    # Weighted MSE metric mirroring the training loss.
    eval_metric_ops = {
        "mse": tf.metrics.mean_squared_error(labels=tf.reshape(labels,
                                                               (-1, )),
                                             predictions=tf.reshape(
                                                 output_norm, (-1, )),
                                             weights=tf.reshape(
                                                 features['w'],
                                                 shape=(-1, ))),
    }
  else:
    eval_metric_ops = None

  # Return a ModelFnOps object
  return model_fn_lib.ModelFnOps(mode=mode,
                                 predictions=predictions,
                                 loss=loss,
                                 train_op=train_op,
                                 eval_metric_ops=eval_metric_ops)