def test_interpolation_gradient(self):
    """Smoke-test that gradients can flow through `interpolate_spline`.

    Correctness of the gradients is assumed, not verified. A small
    'training' set and a more densely sampled set of query points (whose
    true values are known in advance) are created. The x locations of the
    training data are then optimized iteratively by gradient descent so
    that interpolating from them best reconstructs the function values at
    the query points. The test passes as long as every step runs.
    """
    problem = _QuadraticPlusSinProblemND()
    query_points, query_values, train_points, train_values = (
        problem.get_problem(optimizable=True))
    regularization = 0.001

    for interpolation_order in (1, 2, 3, 4):
        interpolated = interpolate_spline.interpolate_spline(
            train_points, train_values, query_points, interpolation_order,
            regularization)
        residual = query_values - interpolated
        loss = math_ops.reduce_mean(math_ops.square(residual))

        optimizer = momentum.MomentumOptimizer(0.001, 0.9)
        raw_grads = gradients.gradients(loss, [train_points])
        # Clip by global norm (limit 1.0) before the update is applied.
        clipped_grads, _ = clip_ops.clip_by_global_norm(raw_grads, 1.0)
        train_op = optimizer.apply_gradients(
            zip(clipped_grads, [train_points]))
        init_op = variables.global_variables_initializer()

        with self.test_session() as sess:
            sess.run(init_op)
            for _ in range(100):
                sess.run([loss, train_op])
def train_step(model, optimizer, dataset, step_counter, ep, class_weights,
               params, log_interval=None):
    """Run one pass over `dataset` and record the mean batch loss.

    For every batch the loss is computed under a gradient tape, the
    gradients w.r.t. ``model.trainable_weights`` are clipped by global norm
    using ``params['clip_norm']`` and applied through `optimizer` with
    `step_counter` as the global step. After the pass, the average of the
    per-batch losses is appended to ``params['train_losses']``.

    NOTE(review): `ep` and `log_interval` are accepted but not used in this
    function.
    """
    start = time.time()  # kept from the original; not read afterwards
    batch_count = 0
    loss_sum = 0
    for inputs, labels in dataset:
        days, prices, day_lens, news_lens = inputs
        batch_count += 1
        with tf.GradientTape() as tape:
            logits = model(days, prices, day_lens, news_lens, training=True)
            loss_value = loss(logits, labels, class_weights)
        loss_sum += loss_value

        raw_grads = tape.gradient(loss_value, model.trainable_weights)
        clipped_grads, _ = clip_ops.clip_by_global_norm(
            raw_grads, params['clip_norm'])
        optimizer.apply_gradients(
            zip(clipped_grads, model.trainable_weights),
            global_step=step_counter)

    params['train_losses'].append(loss_sum / batch_count)
def testThatBackpropRuns(self):
    """Run a few optimization steps to ensure gradients can be computed."""
    batch_size, image_height, image_width = 1, 9, 12
    image_shape = [batch_size, image_height, image_width, 3]
    image = variables.Variable(
        np.float32(np.random.uniform(size=image_shape)))

    # One control point and its displacement, each with a leading axis added.
    source_locations = constant_op.constant(
        np.float32(np.expand_dims([[3., 3.]], 0)))
    displacements = constant_op.constant(
        np.float32(np.expand_dims([[0.25, -0.5]], 0)))

    warped_image, _ = sparse_image_warp.sparse_image_warp(
        image,
        source_locations,
        source_locations + displacements,
        num_boundary_points=3)

    loss = math_ops.reduce_mean(math_ops.abs(warped_image - image))
    optimizer = momentum.MomentumOptimizer(0.001, 0.9)
    raw_grads = gradients.gradients(loss, [image])
    clipped_grads, _ = clip_ops.clip_by_global_norm(raw_grads, 1.0)
    train_op = optimizer.apply_gradients(zip(clipped_grads, [image]))
    init_op = variables.global_variables_initializer()

    with self.test_session() as sess:
        sess.run(init_op)
        for _ in range(5):
            sess.run([loss, train_op])
def _get_train_ops(self, features, targets):
    """See base class."""
    global_step = contrib_variables.get_global_step()
    assert global_step

    logits = self._logits(features, is_training=True)
    # The loss is built under a control dependency on the centered-bias step.
    bias_step = self._centered_bias_step(
        targets, self._get_weight_tensor(features))
    with ops.control_dependencies([bias_step]):
        loss = self._loss(logits, targets, self._get_weight_tensor(features))
    logging_ops.scalar_summary("loss", loss)

    linear_vars = self._get_linear_vars()
    dnn_vars = self._get_dnn_vars()
    # Gradients come back in the order dnn_vars followed by linear_vars.
    grads = gradients.gradients(loss, dnn_vars + linear_vars)
    if self._gradient_clip_norm:
        grads, _ = clip_ops.clip_by_global_norm(
            grads, self._gradient_clip_norm)

    num_dnn = len(dnn_vars)
    dnn_grads = grads[:num_dnn]
    linear_grads = grads[num_dnn:]

    linear_ops = self._get_linear_training_ops(linear_grads, linear_vars)
    dnn_ops = self._get_dnn_training_ops(dnn_grads, dnn_vars)
    train_ops = linear_ops + dnn_ops

    train_step = control_flow_ops.group(
        *train_ops, name="combined_training_op")
    with ops.control_dependencies([train_step]):
        with ops.get_default_graph().colocate_with(global_step):
            increment = state_ops.assign_add(global_step, 1)
            return increment.op, loss
def testThatBackpropRuns(self):
    """Run optimization steps to confirm gradients can be computed."""
    batch_size = 1
    image_height = 9
    image_width = 12
    random_pixels = np.random.uniform(
        size=[batch_size, image_height, image_width, 3])
    image = variables.Variable(np.float32(random_pixels))

    # expand_dims(..., 0) prepends an axis to the single control point.
    locations_np = np.float32(np.expand_dims([[3., 3.]], 0))
    control_point_locations = constant_op.constant(locations_np)
    displacements_np = np.float32(np.expand_dims([[0.25, -0.5]], 0))
    control_point_displacements = constant_op.constant(displacements_np)

    destination_locations = (
        control_point_locations + control_point_displacements)
    warped_image, _ = sparse_image_warp.sparse_image_warp(
        image,
        control_point_locations,
        destination_locations,
        num_boundary_points=3)

    abs_diff = math_ops.abs(warped_image - image)
    loss = math_ops.reduce_mean(abs_diff)
    optimizer = momentum.MomentumOptimizer(0.001, 0.9)
    image_grads = gradients.gradients(loss, [image])
    image_grads, _ = clip_ops.clip_by_global_norm(image_grads, 1.0)
    update_op = optimizer.apply_gradients(zip(image_grads, [image]))
    init_op = variables.global_variables_initializer()

    with self.test_session() as sess:
        sess.run(init_op)
        for _ in range(5):
            sess.run([loss, update_op])
def gradients(opt, loss, vars, step, max_gradient_norm=None, dont_clip=None):
    '''Compute, optionally clip, summarize and apply gradients.

    Args:
        opt: optimizer exposing `compute_gradients(loss, vars)` and
            `apply_gradients(grads_and_vars, global_step=...)`.
        loss: loss passed to `opt.compute_gradients`.
        vars: variables passed to `opt.compute_gradients`.
        step: value forwarded as `global_step` to `opt.apply_gradients`.
        max_gradient_norm: if not None, gradients of variables whose name is
            not in `dont_clip` are clipped together by this global norm.
        dont_clip: optional collection of variable names excluded from
            clipping. Defaults to no exclusions.

    Returns:
        Whatever `opt.apply_gradients` returns.
    '''
    # Avoid a shared mutable default; None means "nothing is excluded".
    if dont_clip is None:
        dont_clip = ()

    grads_and_vars = list(opt.compute_gradients(loss, vars))

    if max_gradient_norm is not None:
        to_clip = [(g, v) for g, v in grads_and_vars
                   if v.name not in dont_clip]
        not_clipped = [(g, v) for g, v in grads_and_vars
                       if v.name in dont_clip]
        # zip(*[]) cannot be unpacked into two names, so only clip when there
        # is at least one pair that is eligible for clipping.
        if to_clip:
            raw_grads, clip_vars = zip(*to_clip)
            clipped_grads, _ = clip_ops.clip_by_global_norm(
                raw_grads, max_gradient_norm)
            to_clip = list(zip(clipped_grads, clip_vars))
        grads_and_vars = to_clip + not_clipped

    # Add histograms for variables, gradients and gradient norms.
    for gradient, variable in grads_and_vars:
        if gradient is None:
            print('warning: missing gradient: {}'.format(variable.name))
            continue

        if isinstance(gradient, ops.IndexedSlices):
            grad_values = gradient.values
        else:
            grad_values = gradient

        if grad_values is not None:
            tf.summary.histogram(variable.name, variable)
            tf.summary.histogram(variable.name + '/gradients', grad_values)
            tf.summary.histogram(variable.name + '/gradient_norm',
                                 clip_ops.global_norm([grad_values]))

    return opt.apply_gradients(grads_and_vars, global_step=step)
def clip_gradients_by_global_norm(gradients_variables, clip_norm=20.):
    """Clips gradients of a multitask loss by their global norm.

    All-zero gradient tensors are replaced by a scalar zero before the
    global norm is computed; that norm is then passed to the clipping op
    via `use_norm`.

    Args:
      gradients_variables: a list of pairs (gradient, variable).
      clip_norm: a float Tensor, the global norm to clip on. Default is 20.0.

    Returns:
      list: A list of pairs of the same type as gradients_variables.
      fixed_global_norm: A 0-D (scalar) Tensor representing the global norm.
    """
    grads, var_list = six.moves.zip(*gradients_variables)

    def _scalar_zero_if_all_zeros(grad):
        # None gradients are passed through untouched.
        if grad is None:
            return grad
        return control_flow_ops.cond(
            _is_all_zeros(grad),
            lambda: array_ops.zeros([], dtype=dtypes.as_dtype(grad.dtype)),
            lambda: grad)

    grads_for_norm = []
    for grad in grads:
        grads_for_norm.append(_scalar_zero_if_all_zeros(grad))

    fixed_global_norm = clip_ops.global_norm(grads_for_norm)
    clipped_grads, _ = clip_ops.clip_by_global_norm(
        grads, clip_norm, use_norm=fixed_global_norm)
    clipped_pairs = list(six.moves.zip(clipped_grads, var_list))
    return clipped_pairs, fixed_global_norm
def test_interpolation_gradient(self):
    """Check that backprop through `interpolate_spline` runs.

    Gradient correctness is assumed. The test builds a small 'training'
    set plus a more densely sampled set of query points with known true
    values, then iteratively moves the training x locations by gradient
    descent so that interpolation from the training data best reconstructs
    the query values.
    """
    test_problem = _QuadraticPlusSinProblemND()
    (query_points, query_values,
     train_points, train_values) = test_problem.get_problem(optimizable=True)
    regularization = 0.001

    for order in (1, 2, 3, 4):
        prediction = interpolate_spline.interpolate_spline(
            train_points, train_values, query_points, order, regularization)
        squared_error = math_ops.square(query_values - prediction)
        loss = math_ops.reduce_mean(squared_error)

        optimizer = momentum.MomentumOptimizer(0.001, 0.9)
        point_grads = gradients.gradients(loss, [train_points])
        point_grads, _ = clip_ops.clip_by_global_norm(point_grads, 1.0)
        step_op = optimizer.apply_gradients(zip(point_grads, [train_points]))
        init_op = variables.global_variables_initializer()

        with self.cached_session() as sess:
            sess.run(init_op)
            for _ in range(100):
                sess.run([loss, step_op])
def train(model, optimizer, dataset, step_counter, ep, class_weights,
          log_interval=None):
    """Trains model on `dataset` using `optimizer`.

    Each batch is run under a summary-recording context (every 50 global
    steps), the loss and accuracy are written as scalar summaries, and the
    gradients are clipped by global norm with ``model.flags.clip_norm``
    before being applied. When `log_interval` is set, progress is printed
    every `log_interval` steps; the trainable parameter count is printed on
    the very first step of epoch 0.
    """
    start = time.time()
    for step, (inputs, labels) in enumerate(dataset):
        days, day_lens, news_lens = inputs
        with tf.contrib.summary.record_summaries_every_n_global_steps(
                50, global_step=step_counter):
            # The tape records the forward pass so the gradient of the loss
            # with respect to the variables can be computed afterwards.
            with tf.GradientTape() as tape:
                logits = model(days, day_lens, news_lens, training=True)
                loss_value = loss(logits, labels, class_weights)
                tf.contrib.summary.scalar('loss', loss_value)
                tf.contrib.summary.scalar(
                    'accuracy', compute_accuracy(logits, labels))

            raw_grads = tape.gradient(loss_value, model.trainable_weights)
            clipped_grads, _ = clip_ops.clip_by_global_norm(
                raw_grads, model.flags.clip_norm)
            optimizer.apply_gradients(
                zip(clipped_grads, model.trainable_weights),
                global_step=step_counter)

            if log_interval and (step + 1) % log_interval == 0:
                elapsed = time.time() - start
                rate = log_interval / elapsed
                print('Epoch #%d\tStep #%d\tLoss: %.6f (%.1f steps/sec)' %
                      (ep + 1, step, loss_value, rate))
                start = time.time()

            if ep == 0 and step == 0:
                print('#trainable_params', get_num_trainable_params(model))
def _get_train_ops(self, features, targets):
    """See base class."""
    global_step = contrib_variables.get_global_step()
    assert global_step

    logits = self._logits(features, is_training=True)
    # Only depend on the centered-bias update when that feature is enabled.
    bias_deps = (
        [self._centered_bias_step(targets, features)]
        if self._enable_centered_bias else [])
    with ops.control_dependencies(bias_deps):
        loss = self._loss(logits, targets, features)
    logging_ops.scalar_summary("loss", loss)

    linear_vars = self._get_linear_vars()
    dnn_vars = self._get_dnn_vars()
    # Gradients are ordered as dnn_vars followed by linear_vars.
    all_grads = gradients.gradients(loss, dnn_vars + linear_vars)
    if self._gradient_clip_norm:
        all_grads, _ = clip_ops.clip_by_global_norm(
            all_grads, self._gradient_clip_norm)

    split_at = len(dnn_vars)
    dnn_grads = all_grads[:split_at]
    linear_grads = all_grads[split_at:]

    linear_ops = self._get_linear_training_ops(linear_grads, linear_vars)
    dnn_ops = self._get_dnn_training_ops(dnn_grads, dnn_vars)

    train_step = control_flow_ops.group(
        *(linear_ops + dnn_ops), name="combined_training_op")
    with ops.control_dependencies([train_step]):
        with ops.get_default_graph().colocate_with(global_step):
            step_increment = state_ops.assign_add(global_step, 1)
            return step_increment.op, loss
def apply_update(self, optimizer, grads_and_vars):
    """Build the training op, optionally clipping and perturbing gradients.

    Behaviour is driven by `self.train_hypers`:
      * `CustomTrainer.GRADIENT_CLIP`: clip gradients by this global norm.
      * `CustomTrainer.GRADIENT_NOISE`: add normal noise scaled by the
        square root of this value to every non-None gradient.
      * `CustomTrainer.GRADIENT_NOISE_DECAY`: divide the noise value by
        ``(1 + global_step) ** decay`` first.

    Returns:
      The op returned by `optimizer.apply_gradients`.
    """
    grads, var_list = zip(*grads_and_vars)
    hypers = self.train_hypers

    # Gradient clipping
    if CustomTrainer.GRADIENT_CLIP in hypers:
        grads, _ = clip_ops.clip_by_global_norm(
            grads, hypers[CustomTrainer.GRADIENT_CLIP])

    # Gradient noise
    if CustomTrainer.GRADIENT_NOISE in hypers:
        sigma_sqr = hypers[CustomTrainer.GRADIENT_NOISE]
        if CustomTrainer.GRADIENT_NOISE_DECAY in hypers:
            sigma_sqr /= tf.pow(
                1.0 + tf.to_float(self.global_step),
                hypers[CustomTrainer.GRADIENT_NOISE_DECAY])

        def _perturb(grad):
            # Gradients that are None stay None.
            if grad is None:
                return grad
            return grad + tf.sqrt(sigma_sqr) * tf.random_normal(
                tf.shape(grad))

        grads = [_perturb(g) for g in grads]

    return optimizer.apply_gradients(
        zip(grads, var_list), global_step=self.global_step)
def clip_gradients_by_global_norm(gradients_variables, clip_norm=20.):
    """Clips gradients of a multitask loss by their global norm.

    Ignores all-zero tensors when computing the global norm: each such
    gradient is swapped for a scalar zero before `global_norm` is called,
    and the resulting norm is supplied to the clipping op as `use_norm`.

    Args:
      gradients_variables: a list of pairs (gradient, variable).
      clip_norm: a float Tensor, the global norm to clip on. Default is 20.0.

    Returns:
      list: A list of pairs of the same type as gradients_variables.
      fixed_global_norm: A 0-D (scalar) Tensor representing the global norm.
    """
    unzipped = six.moves.zip(*gradients_variables)
    gradients, variables = unzipped

    def _replace_nonexisting_grad(grad):
        if grad is None:
            return grad

        def _zero():
            return array_ops.zeros([], dtype=dtypes.as_dtype(grad.dtype))

        def _unchanged():
            return grad

        return control_flow_ops.cond(_is_all_zeros(grad), _zero, _unchanged)

    nonzero_gradients = list(map(_replace_nonexisting_grad, gradients))
    fixed_global_norm = clip_ops.global_norm(nonzero_gradients)
    clipped, _ = clip_ops.clip_by_global_norm(
        gradients, clip_norm, use_norm=fixed_global_norm)
    return list(six.moves.zip(clipped, variables)), fixed_global_norm
def _train_op_fn(loss):
    """Return the op applying (optionally clipped) gradients of `loss`.

    Variables are taken from the `parent_scope` collection; clipping by
    global norm happens only when `gradient_clip_norm` is truthy.
    """
    global_step = training_util.get_global_step()
    scope_vars = ops.get_collection(parent_scope)
    scope_grads = gradients.gradients(loss, scope_vars)
    if gradient_clip_norm:
        scope_grads, _ = clip_ops.clip_by_global_norm(
            scope_grads, gradient_clip_norm)
    opt = _get_optimizer(optimizer)
    return opt.apply_gradients(
        zip(scope_grads, scope_vars), global_step=global_step)
def _process_gradients(self, gradients_vars):
    """Process gradients (e.g. clipping) before applying them to weights.

    Returns the (gradient, variable) pairs as a `zip`, with the gradients
    clipped by global norm when `self._gradient_clipping_norm` is set.
    """
    with ops.name_scope('process_gradients'):
        grads, var_list = zip(*gradients_vars)
        if self._gradient_clipping_norm is None:
            return zip(grads, var_list)
        clipped, _ = clip_ops.clip_by_global_norm(
            grads, self._gradient_clipping_norm)
        return zip(clipped, var_list)
def _train_op_fn(loss):
    """Build the training op for the variables collected under `parent_scope`.

    Gradients of `loss` are clipped by global norm when `gradient_clip_norm`
    is truthy, then applied with the global step.
    """
    global_step = contrib_variables.get_global_step()
    collected_vars = ops.get_collection(parent_scope)
    collected_grads = gradients.gradients(loss, collected_vars)
    if gradient_clip_norm:
        collected_grads, _ = clip_ops.clip_by_global_norm(
            collected_grads, gradient_clip_norm)
    opt = _get_optimizer(optimizer)
    grads_and_vars = zip(collected_grads, collected_vars)
    return opt.apply_gradients(grads_and_vars, global_step=global_step)
def _train_op_fn(loss):
    """Build the training op for the variables in the "linear" collection.

    Gradients of `loss` are clipped by global norm when `gradient_clip_norm`
    is truthy, then applied with the global step.
    """
    global_step = contrib_variables.get_global_step()
    linear_vars = ops.get_collection("linear")
    linear_grads = gradients.gradients(loss, linear_vars)
    if gradient_clip_norm:
        linear_grads, _ = clip_ops.clip_by_global_norm(
            linear_grads, gradient_clip_norm)
    opt = _get_optimizer(optimizer)
    return opt.apply_gradients(
        zip(linear_grads, linear_vars), global_step=global_step)
def testClipByGlobalNormPreservesDenseShape(self):
    # dense_shape must survive on both the input and the clipped result.
    dense_shape = (1,)
    values = constant_op.constant([1.0])
    indices = constant_op.constant([0])
    slices = ops.IndexedSlices(values, indices, dense_shape=dense_shape)

    clipped, _ = clip_ops.clip_by_global_norm([slices], 1.0)
    modified_slices = clipped[0]

    self.assertEqual(dense_shape, slices.dense_shape)
    self.assertEqual(dense_shape, modified_slices.dense_shape)
def testClipByGlobalNormPreservesDenseShape(self):
    # Clipping an IndexedSlices must leave its dense_shape intact.
    expected_shape = (1,)
    slice_values = constant_op.constant([1.0])
    slice_indices = constant_op.constant([0])
    original = ops.IndexedSlices(
        slice_values, slice_indices, dense_shape=expected_shape)

    results, _ = clip_ops.clip_by_global_norm([original], 1.0)
    clipped = results[0]

    self.assertEqual(expected_shape, original.dense_shape)
    self.assertEqual(expected_shape, clipped.dense_shape)
def __init__(self, loss, global_step, optimizer, learning_rate,
             clip_gradients=5.0):
    """Build a trainer part of graph.

    Args:
      loss: Tensor that evaluates to model's loss.
      global_step: Tensor with global step of the model.
      optimizer: Name of the optimizer class (SGD, Adam, Adagrad) or class.
      learning_rate: If this is constant float value, no decay function is
        used. Instead, a customized decay function can be passed that
        accepts global_step as parameter and returns a Tensor.
        e.g. exponential decay function:
        def exp_decay(global_step):
            return tf.train.exponential_decay(
                learning_rate=0.1, global_step=global_step,
                decay_steps=2, decay_rate=0.001)
      clip_gradients: When greater than 0.0, gradients are clipped by this
        global norm and the norm is stored in `self.gradients_norm`.

    Raises:
      ValueError: if learning_rate is not a float or a callable.
    """
    self.loss = loss
    self.global_step = global_step

    # pylint: disable=redefined-variable-type
    if isinstance(learning_rate, float):
        lr_init = init_ops.constant_initializer(learning_rate)
        self._learning_rate = vs.get_variable(
            "learning_rate", [], initializer=lr_init)
    elif callable(learning_rate):
        self._learning_rate = learning_rate(self.global_step)
    else:
        raise ValueError(
            "learning_rate should be a float or a callable function.")

    trainable = variables.trainable_variables()
    self.gradients = gradients.gradients(loss, trainable)
    if clip_gradients > 0.0:
        clipped, norm = clip_ops.clip_by_global_norm(
            self.gradients, clip_gradients)
        self.gradients, self.gradients_norm = clipped, norm
    grads_and_vars = zip(self.gradients, trainable)

    # A string selects a class by name; anything else is used as the class.
    optimizer_cls = (
        OPTIMIZER_CLS_NAMES[optimizer]
        if isinstance(optimizer, str) else optimizer)
    self._optimizer = optimizer_cls(self._learning_rate)

    self.trainer = self._optimizer.apply_gradients(
        grads_and_vars, global_step=global_step, name="train")
    # Update ops during training, e.g. batch_norm_ops
    update_ops = ops.get_collection('update_ops')
    self.trainer = control_flow_ops.group(self.trainer, *update_ops)
    # Initializer op returned by initialize_all_variables().
    self._initializers = variables.initialize_all_variables()
def gradient_clipnorm_fn(grads_and_vars):
    """Clip the gradients in `grads_and_vars` by the global norm `clipnorm`.

    Returns:
      A list of (clipped_gradient, variable) pairs.

    Raises:
      ValueError: if the current strategy is a `CentralStorageStrategy`.
    """
    strategy = distribute_ctx.get_strategy()
    if isinstance(strategy,
                  central_storage_strategy.CentralStorageStrategy):
        raise ValueError(
            "`global_clipnorm` is not supported with `CenteralStorageStrategy`"
        )

    grads, var_list = zip(*grads_and_vars)
    clipped_grads, _ = clip_ops.clip_by_global_norm(grads, clipnorm)
    return list(zip(clipped_grads, var_list))
def testClipByGlobalNormInf(self):
    # Evaluating the norm or either clipped tensor must raise when one of
    # the inputs contains inf.
    with self.session(use_gpu=True):
        x0 = constant_op.constant(
            [-2.0, 0.0, np.inf, 4.0, 0.0, 0.0], shape=[2, 3])
        x1 = constant_op.constant([1.0, -2.0])
        clipped, global_norm = clip_ops.clip_by_global_norm([x0, x1], 6.0)

        evaluations = (
            lambda: self.evaluate(global_norm),
            clipped[0].eval,
            clipped[1].eval,
        )
        for run in evaluations:
            with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                         "global norm"):
                run()
def __init__(self, loss, global_step, optimizer, learning_rate,
             clip_gradients=5.0):
    """Build a trainer part of graph.

    Args:
      loss: Tensor that evaluates to model's loss.
      global_step: Tensor with global step of the model.
      optimizer: Name of the optimizer class (SGD, Adam, Adagrad) or class.
      learning_rate: If this is constant float value, no decay function is
        used. Instead, a customized decay function can be passed that
        accepts global_step as parameter and returns a Tensor.
        e.g. exponential decay function:
        def exp_decay(global_step):
            return tf.train.exponential_decay(
                learning_rate=0.1, global_step=global_step,
                decay_steps=2, decay_rate=0.001)
      clip_gradients: Global-norm clipping threshold; clipping is applied
        only when this is greater than 0.0.

    Raises:
      ValueError: if learning_rate is not a float or a callable.
    """
    self.loss = loss
    self.global_step = global_step

    # pylint: disable=redefined-variable-type
    if isinstance(learning_rate, float):
        self._learning_rate = vs.get_variable(
            "learning_rate",
            [],
            initializer=init_ops.constant_initializer(learning_rate))
    elif callable(learning_rate):
        self._learning_rate = learning_rate(self.global_step)
    else:
        raise ValueError(
            "learning_rate should be a float or a callable function.")

    model_params = variables.trainable_variables()
    self.gradients = gradients.gradients(loss, model_params)
    if clip_gradients > 0.0:
        self.gradients, self.gradients_norm = (
            clip_ops.clip_by_global_norm(self.gradients, clip_gradients))
    grads_and_vars = zip(self.gradients, model_params)

    if isinstance(optimizer, str):
        make_optimizer = OPTIMIZER_CLS_NAMES[optimizer]
    else:
        make_optimizer = optimizer
    self._optimizer = make_optimizer(self._learning_rate)

    apply_op = self._optimizer.apply_gradients(
        grads_and_vars, global_step=global_step, name="train")
    self.trainer = apply_op
    # Update ops during training, e.g. batch_norm_ops
    self.trainer = control_flow_ops.group(
        self.trainer, *ops.get_collection('update_ops'))
    # Initializer op returned by initialize_all_variables().
    self._initializers = variables.initialize_all_variables()
def testClipByGlobalNormInf(self):
    # Expect all NaNs when global norm is inf.
    with self.session(use_gpu=True):
        x0 = constant_op.constant(
            [-2.0, 0.0, np.inf, 4.0, 0.0, 0.0], shape=[2, 3])
        x1 = constant_op.constant([1.0, -2.0])

        clipped, global_norm = clip_ops.clip_by_global_norm([x0, x1], 6.0)
        actual_x0 = clipped[0].eval()
        actual_x1 = clipped[1].eval()
        actual_norm = self.evaluate(global_norm)

        nan = float('nan')
        self.assertAllEqual(actual_norm, float('inf'))
        self.assertAllEqual(actual_x0, np.full([2, 3], nan))
        self.assertAllEqual(actual_x1, np.full([2], nan))
def get_train_step(self, loss):
    """Returns the ops to run to perform a training step on this estimator.

    Args:
      loss: The loss to use when calculating gradients.

    Returns:
      The ops to run to perform a training step: an empty list when there
      are neither feature columns nor variables, otherwise a one-element
      list holding the optimizer's apply-gradients op.
    """
    train_vars = self._get_vars()
    if not self._get_feature_columns() and not train_vars:
        return []

    train_grads = gradients.gradients(loss, train_vars)
    if self._gradient_clip_norm:
        train_grads, _ = clip_ops.clip_by_global_norm(
            train_grads, self._gradient_clip_norm)

    apply_op = self._get_optimizer().apply_gradients(
        zip(train_grads, train_vars))
    return [apply_op]
def testClipByGlobalNormZero(self):
    # No norm clipping when norm = 0
    with self.test_session(use_gpu=True):
        x0 = constant_op.constant(
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
        x1 = constant_op.constant([0.0, 0.0])
        # Norm = 0, so both tensors are expected back unchanged.
        expected_x0 = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
        expected_x1 = [0.0, 0.0]

        clipped, global_norm = clip_ops.clip_by_global_norm([x0, x1], 6.0)
        actual_x0 = clipped[0].eval()
        actual_x1 = clipped[1].eval()
        actual_norm = global_norm.eval()

    self.assertAllClose(actual_norm, 0.0)
    self.assertAllClose(expected_x0, actual_x0)
    self.assertAllClose(expected_x1, actual_x1)
def testClipByGlobalNormZero(self):
    # No norm clipping when norm = 0
    with self.session(use_gpu=True):
        zeros_2x3 = constant_op.constant(
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
        zeros_2 = constant_op.constant([0.0, 0.0])
        # Norm = 0, no changes
        want_2x3 = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
        want_2 = [0.0, 0.0]

        outputs, norm = clip_ops.clip_by_global_norm(
            [zeros_2x3, zeros_2], 6.0)
        got_2x3 = outputs[0].eval()
        got_2 = outputs[1].eval()
        got_norm = self.evaluate(norm)

    self.assertAllClose(got_norm, 0.0)
    self.assertAllClose(want_2x3, got_2x3)
    self.assertAllClose(want_2, got_2)
def testClipByGlobalNormNotClipped(self):
    # No norm clipping when clip_norm >= 5
    with self.session(use_gpu=True):
        x0 = constant_op.constant(
            [-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
        x1 = constant_op.constant([1.0, -2.0])
        # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5, which is
        # below the clip norm of 6, so the inputs are expected back as-is.
        expected_x0 = [[-2.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
        expected_x1 = [1.0, -2.0]

        clipped, global_norm = clip_ops.clip_by_global_norm([x0, x1], 6.0)
        actual_x0 = clipped[0].eval()
        actual_x1 = clipped[1].eval()
        actual_norm = self.evaluate(global_norm)

    self.assertAllClose(actual_norm, 5.0)
    self.assertAllClose(expected_x0, actual_x0)
    self.assertAllClose(expected_x1, actual_x1)
def testClipByGlobalNormNotClipped(self):
    # No norm clipping when clip_norm >= 5
    with self.test_session(use_gpu=True):
        matrix = constant_op.constant(
            [-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
        vector = constant_op.constant([1.0, -2.0])
        # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
        want_matrix = [[-2.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
        want_vector = [1.0, -2.0]

        outputs, norm = clip_ops.clip_by_global_norm([matrix, vector], 6.0)
        got_matrix = outputs[0].eval()
        got_vector = outputs[1].eval()
        got_norm = norm.eval()

    self.assertAllClose(got_norm, 5.0)
    self.assertAllClose(want_matrix, got_matrix)
    self.assertAllClose(want_vector, got_vector)
def testClipByGlobalNormClippedTensor(self):
    # Norm clipping when clip_norm < 5, with clip_norm given as a tensor.
    with self.test_session(use_gpu=True):
        x0 = constant_op.constant(
            [-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
        x1 = constant_op.constant([1.0, -2.0])
        # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
        clip_norm = constant_op.constant(4.0)
        # Answers are the original tensors scaled by 4.0/5.0
        expected_x0 = [[-1.6, 0.0, 0.0], [3.2, 0.0, 0.0]]
        expected_x1 = [0.8, -1.6]

        clipped, global_norm = clip_ops.clip_by_global_norm(
            (x0, x1), clip_norm)
        actual_x0 = clipped[0].eval()
        actual_x1 = clipped[1].eval()
        actual_norm = global_norm.eval()

    self.assertAllClose(actual_norm, 5.0)
    self.assertAllClose(expected_x0, actual_x0)
    self.assertAllClose(expected_x1, actual_x1)
def testClipByGlobalNormClippedTensor(self):
    # Norm clipping when clip_norm < 5
    with self.session(use_gpu=True):
        matrix = constant_op.constant(
            [-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
        vector = constant_op.constant([1.0, -2.0])
        # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
        norm_limit = constant_op.constant(4.0)
        # Answers are the original tensors scaled by 4.0/5.0
        want_matrix = [[-1.6, 0.0, 0.0], [3.2, 0.0, 0.0]]
        want_vector = [0.8, -1.6]

        outputs, norm = clip_ops.clip_by_global_norm(
            (matrix, vector), norm_limit)
        got_matrix = outputs[0].eval()
        got_vector = outputs[1].eval()
        got_norm = self.evaluate(norm)

    self.assertAllClose(got_norm, 5.0)
    self.assertAllClose(want_matrix, got_matrix)
    self.assertAllClose(want_vector, got_vector)
def testClipByGlobalNormWithIndexedSlicesClipped(self):
    # Norm clipping when clip_norm < 5
    with self.session(use_gpu=True):
        x0 = constant_op.constant(
            [-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
        slice_values = constant_op.constant([1.0, -2.0])
        slice_indices = constant_op.constant([3, 4])
        x1 = ops.IndexedSlices(slice_values, slice_indices)
        # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
        # Answers are the original tensors scaled by 4.0/5.0
        expected_x0 = [[-1.6, 0.0, 0.0], [3.2, 0.0, 0.0]]
        expected_x1 = [0.8, -1.6]

        clipped, global_norm = clip_ops.clip_by_global_norm([x0, x1], 4.0)
        actual_x0 = clipped[0].eval()
        # The second result is an IndexedSlices; compare its values.
        actual_x1 = clipped[1].values.eval()
        actual_norm = self.evaluate(global_norm)

    self.assertAllClose(actual_norm, 5.0)
    self.assertAllClose(expected_x0, actual_x0)
    self.assertAllClose(expected_x1, actual_x1)
def testClipByGlobalNormWithIndexedSlicesClipped(self):
    # Norm clipping when clip_norm < 5
    with self.test_session(use_gpu=True):
        dense = constant_op.constant(
            [-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
        sparse = ops.IndexedSlices(
            constant_op.constant([1.0, -2.0]),
            constant_op.constant([3, 4]))
        # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
        # Answers are the original tensors scaled by 4.0/5.0
        want_dense = [[-1.6, 0.0, 0.0], [3.2, 0.0, 0.0]]
        want_sparse_values = [0.8, -1.6]

        outputs, norm = clip_ops.clip_by_global_norm([dense, sparse], 4.0)
        got_dense = outputs[0].eval()
        got_sparse_values = outputs[1].values.eval()
        got_norm = norm.eval()

    self.assertAllClose(got_norm, 5.0)
    self.assertAllClose(want_dense, got_dense)
    self.assertAllClose(want_sparse_values, got_sparse_values)
def testClipByGlobalNormSupportsNone(self):
    # Norm clipping when clip_norm < 5; None entries must stay None.
    with self.session(use_gpu=True):
        x0 = constant_op.constant(
            [-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
        x1 = constant_op.constant([1.0, -2.0])
        # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
        # Answers are the original tensors scaled by 4.0/5.0
        expected_x0 = [[-1.6, 0.0, 0.0], [3.2, 0.0, 0.0]]
        expected_x1 = [0.8, -1.6]

        clipped, global_norm = clip_ops.clip_by_global_norm(
            (x0, None, x1, None), 4.0)
        self.assertTrue(clipped[1] is None)
        self.assertTrue(clipped[3] is None)
        actual_x0 = clipped[0].eval()
        actual_x1 = clipped[2].eval()
        actual_norm = self.evaluate(global_norm)

    self.assertAllClose(actual_norm, 5.0)
    self.assertAllClose(expected_x0, actual_x0)
    self.assertAllClose(expected_x1, actual_x1)
def apply_update(self, optimizer, grads_and_vars):
    """Create the train op from `grads_and_vars`.

    Depending on the keys present in `self.train_hypers`, gradients are
    first clipped by global norm (`CustomTrainer.GRADIENT_CLIP`) and then
    perturbed with normal noise (`CustomTrainer.GRADIENT_NOISE`, optionally
    decayed via `CustomTrainer.GRADIENT_NOISE_DECAY`). None gradients are
    never perturbed.

    Returns:
      The op produced by `optimizer.apply_gradients`, which also receives
      `self.global_step`.
    """
    gradient_list, variable_list = zip(*grads_and_vars)

    # Gradient clipping
    if CustomTrainer.GRADIENT_CLIP in self.train_hypers:
        clip_value = self.train_hypers[CustomTrainer.GRADIENT_CLIP]
        gradient_list, _ = clip_ops.clip_by_global_norm(
            gradient_list, clip_value)

    # Gradient noise
    if CustomTrainer.GRADIENT_NOISE in self.train_hypers:
        sigma_sqr = self.train_hypers[CustomTrainer.GRADIENT_NOISE]
        if CustomTrainer.GRADIENT_NOISE_DECAY in self.train_hypers:
            # Divide by (1 + global_step) raised to the decay exponent.
            sigma_sqr /= tf.pow(
                1.0 + tf.to_float(self.global_step),
                self.train_hypers[CustomTrainer.GRADIENT_NOISE_DECAY])

        noisy = []
        for grad in gradient_list:
            if grad is None:
                noisy.append(grad)
                continue
            noise = tf.sqrt(sigma_sqr) * tf.random_normal(tf.shape(grad))
            noisy.append(grad + noise)
        gradient_list = noisy

    train_op = optimizer.apply_gradients(
        zip(gradient_list, variable_list), global_step=self.global_step)
    return train_op
def testClipByGlobalNormSupportsNone(self):
    # Norm clipping when clip_norm < 5
    with self.test_session():
        matrix = constant_op.constant(
            [-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
        vector = constant_op.constant([1.0, -2.0])
        # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
        # Answers are the original tensors scaled by 4.0/5.0
        want_matrix = [[-1.6, 0.0, 0.0], [3.2, 0.0, 0.0]]
        want_vector = [0.8, -1.6]

        outputs, norm = clip_ops.clip_by_global_norm(
            (matrix, None, vector, None), 4.0)
        # None inputs come back as None in the same positions.
        self.assertTrue(outputs[1] is None)
        self.assertTrue(outputs[3] is None)
        got_matrix = outputs[0].eval()
        got_vector = outputs[2].eval()
        got_norm = norm.eval()

    self.assertAllClose(got_norm, 5.0)
    self.assertAllClose(want_matrix, got_matrix)
    self.assertAllClose(want_vector, got_vector)
def testModelWithBuckets(self):
    """Larger tests that does full sequence-to-sequence model training.

    Builds a two-bucket GRU attention seq2seq model with a sampled-softmax
    loss, runs a handful of Adam updates on randomly generated copy-task
    batches, and asserts that, for every bucket trained more than once, the
    last recorded perplexity is below 1.1x the first one.
    """
    # We learn to copy 10 symbols in 2 buckets: length 4 and length 8.
    classes = 10
    buckets = [(4, 4), (8, 8)]
    perplexities = [[], []]  # Results for each bucket.
    # Seed every random source used below (TF, `random`, numpy).
    random_seed.set_random_seed(111)
    random.seed(111)
    np.random.seed(111)

    with self.test_session() as sess:
        # We use sampled softmax so we keep output projection separate.
        w = variable_scope.get_variable("proj_w", [24, classes])
        w_t = array_ops.transpose(w)
        b = variable_scope.get_variable("proj_b", [classes])

        # Here comes a sample Seq2Seq model using GRU cells.
        def SampleGRUSeq2Seq(enc_inp, dec_inp, weights):
            """Example sequence-to-sequence model that uses GRU cells."""

            def GRUSeq2Seq(enc_inp, dec_inp):
                # Two stacked GRU cells of size 24, projected through (w, b).
                cell = core_rnn_cell_impl.MultiRNNCell(
                    [core_rnn_cell_impl.GRUCell(24) for _ in range(2)],
                    state_is_tuple=True)
                return seq2seq_lib.embedding_attention_seq2seq(
                    enc_inp,
                    dec_inp,
                    cell,
                    num_encoder_symbols=classes,
                    num_decoder_symbols=classes,
                    embedding_size=24,
                    output_projection=(w, b))

            # Targets are the decoder inputs shifted by one, padded with 0.
            targets = [dec_inp[i + 1] for i in range(len(dec_inp) - 1)] + [0]

            def SampledLoss(labels, inputs):
                # Labels are reshaped to a column before the sampled softmax.
                labels = array_ops.reshape(labels, [-1, 1])
                return nn_impl.sampled_softmax_loss(
                    weights=w_t,
                    biases=b,
                    labels=labels,
                    inputs=inputs,
                    num_sampled=8,
                    num_classes=classes)

            return seq2seq_lib.model_with_buckets(
                enc_inp,
                dec_inp,
                targets,
                weights,
                buckets,
                GRUSeq2Seq,
                softmax_loss_function=SampledLoss)

        # Now we construct the copy model.
        batch_size = 8
        inp = [
            array_ops.placeholder(
                dtypes.int32, shape=[None]) for _ in range(8)
        ]
        out = [
            array_ops.placeholder(
                dtypes.int32, shape=[None]) for _ in range(8)
        ]
        weights = [
            array_ops.ones_like(
                inp[0], dtype=dtypes.float32) for _ in range(8)
        ]
        with variable_scope.variable_scope("root"):
            _, losses = SampleGRUSeq2Seq(inp, out, weights)
            # One clipped-gradient Adam update op per bucket.
            updates = []
            params = variables.all_variables()
            optimizer = adam.AdamOptimizer(0.03, epsilon=1e-5)
            for i in range(len(buckets)):
                full_grads = gradients_impl.gradients(losses[i], params)
                grads, _ = clip_ops.clip_by_global_norm(full_grads, 30.0)
                update = optimizer.apply_gradients(zip(grads, params))
                updates.append(update)
            sess.run([variables.global_variables_initializer()])
        steps = 6
        for _ in range(steps):
            # Pick a bucket at random and generate a batch of that length
            # with symbols drawn from 1..9.
            bucket = random.choice(np.arange(len(buckets)))
            length = buckets[bucket][0]
            i = [
                np.array(
                    [np.random.randint(9) + 1 for _ in range(batch_size)],
                    dtype=np.int32) for _ in range(length)
            ]
            # 0 is our "GO" symbol here.
            o = [np.array([0] * batch_size, dtype=np.int32)] + i
            feed = {}
            for i1, i2, o1, o2 in zip(inp[:length], i[:length], out[:length],
                                      o[:length]):
                feed[i1.name] = i2
                feed[o1.name] = o2
            if length < 8:  # For the 4-bucket, we need the 5th as target.
                feed[out[length].name] = o[length]
            res = sess.run([updates[bucket], losses[bucket]], feed)
            # res[1] is the loss; its exponential is stored as perplexity.
            perplexities[bucket].append(math.exp(float(res[1])))
        for bucket in range(len(buckets)):
            if len(perplexities[bucket]) > 1:  # Assert that perplexity went down.
                self.assertLess(perplexities[bucket][-1],  # 10% margin of error.
                                1.1 * perplexities[bucket][0])
def _linear_classifier_model_fn(features, targets, mode, params):
    """Estimator's linear model_fn.

    Args:
        features: a tensor or a dict of tensors; a bare tensor is wrapped
            into a dict under the key "".
        targets: labels handed to the loss function.
        mode: one of `estimator.ModeKeys`; no loss is built for INFER and
            training ops are only built for TRAIN.
        params: dict of hyperparameters. `n_classes`, `weight_column_name`,
            `feature_columns` and `optimizer` are required;
            `gradient_clip_norm` (default None), `enable_centered_bias`
            (default True) and `num_ps_replicas` (default 0) are optional.

    Returns:
        A `(predictions, loss, train_op)` tuple. `predictions` is a dict of
        tensors, `loss` is None in INFER mode, and `train_op` groups
        whatever training ops were created (none outside TRAIN mode).
    """
    n_classes = params["n_classes"]
    weight_column_name = params["weight_column_name"]
    feature_columns = params["feature_columns"]
    optimizer = params["optimizer"]
    gradient_clip_norm = params.get("gradient_clip_norm", None)
    enable_centered_bias = params.get("enable_centered_bias", True)
    num_ps_replicas = params.get("num_ps_replicas", 0)

    if not isinstance(features, dict):
        features = {"": features}

    # Binary classification uses a single logit column and a log loss.
    num_label_columns = 1 if n_classes == 2 else n_classes
    loss_fn = _softmax_cross_entropy_loss
    if n_classes == 2:
        loss_fn = _log_loss_with_two_classes

    partitioner = partitioned_variables.min_max_variable_partitioner(
        max_partitions=num_ps_replicas, min_slice_size=64 << 20)
    # `features` is guaranteed to be a dict at this point, so its values can
    # be handed to the scope directly.
    with variable_scope.variable_op_scope(features.values(),
                                          "linear",
                                          partitioner=partitioner) as scope:
        logits, _, _ = (layers.weighted_sum_from_feature_columns(
            columns_to_tensors=features,
            feature_columns=feature_columns,
            num_outputs=num_label_columns,
            weight_collections=["linear"],
            scope=scope))

    if enable_centered_bias:
        logits = nn.bias_add(logits, _centered_bias(num_label_columns))

    loss = None
    if mode != estimator.ModeKeys.INFER:
        loss = loss_fn(logits, targets)
        if weight_column_name:
            weight_tensor = array_ops.reshape(
                math_ops.to_float(features[weight_column_name]), shape=(-1,))
            loss = _weighted_loss(loss, weight_tensor)
        else:
            loss = math_ops.reduce_mean(loss, name="loss")

    train_ops = []
    if mode == estimator.ModeKeys.TRAIN:
        global_step = contrib_variables.get_global_step()

        # Only the variables registered in the "linear" collection above are
        # trained by this optimizer.
        my_vars = ops.get_collection("linear")
        grads = gradients.gradients(loss, my_vars)
        if gradient_clip_norm:
            grads, _ = clip_ops.clip_by_global_norm(grads, gradient_clip_norm)
        train_ops.append(
            optimizer.apply_gradients(zip(grads, my_vars),
                                      global_step=global_step))
        if enable_centered_bias:
            train_ops.append(
                _centered_bias_step(targets, loss_fn, num_label_columns))

    predictions = {}
    if n_classes == 2:
        predictions[_LOGISTIC] = math_ops.sigmoid(logits)
        # Expand the single logit to two columns so softmax/argmax apply.
        logits = array_ops.concat(1, [array_ops.zeros_like(logits), logits])
    predictions[_PROBABILITIES] = nn.softmax(logits)
    predictions[_CLASSES] = math_ops.argmax(logits, 1)

    return predictions, loss, control_flow_ops.group(*train_ops)
def _clip_gradients_by_norm(grads_and_vars, clip_gradients):
    """Clip the gradients of (gradient, variable) pairs by global norm.

    Args:
        grads_and_vars: iterable of (gradient, variable) pairs.
        clip_gradients: threshold forwarded to
            `clip_ops.clip_by_global_norm`.

    Returns:
        A list of (clipped_gradient, variable) pairs in the input order.
    """
    grads, var_list = zip(*grads_and_vars)
    clipped, _ = clip_ops.clip_by_global_norm(grads, clip_gradients)
    return [(grad, var) for grad, var in zip(clipped, var_list)]
def _finish(self, update_ops, name_scope):
    """Assemble the final training op from the per-variable update bundles.

    Args:
        update_ops: sequence of per-variable lists. Element 0 of each is a
            mutable "cache" list and the remaining elements are ops that get
            grouped with that variable's update. Judging by how the cache is
            unpacked below, a dense cache is `[step, variable, gradient]`
            and a sparse cache is
            `[step_values, variable, indices, gradient_values]`
            (NOTE(review): layout inferred from the indexing in this method;
            confirm against the code that builds the caches).
        name_scope: name given to the returned grouped op.

    Returns:
        One op grouping the variable updates together with the optional
        step/gradient bookkeeping and moving-average updates.

    Note:
        The cache lists are mutated in place: element 0 may be replaced by
        a noised and/or clipped step, and the updated variable op is
        appended to every cache.
    """
    caches = [update_op[0] for update_op in update_ops]
    update_ops = [update_op[1:] for update_op in update_ops]

    # Optionally perturb every step with Gaussian noise.
    if self._noise is not None:
        for cache in caches:
            s_t, x_tm1 = cache[:2]
            s_t += random_ops.random_normal(
                x_tm1.initialized_value().get_shape(), stddev=self._noise)
            cache[0] = s_t

    # Optionally clip all steps jointly by their global norm.
    if self._clip is not None:
        S_t = [cache[0] for cache in caches]
        S_t, _ = clip_ops.clip_by_global_norm(S_t, self._clip)
        for cache, s_t in zip(caches, S_t):
            cache[0] = s_t

    # Apply the steps. Caches of length 3 are dense, longer ones are sparse.
    new_update_ops = []
    for cache, update_op in zip(caches, update_ops):
        if len(cache) == 3:
            s_t, x_tm1 = cache[:2]
            with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
                x_t = state_ops.assign_sub(x_tm1, s_t, use_locking=self._use_locking)
                cache.append(x_t)
        else:
            s_t_, x_tm1, idxs = cache[:3]
            with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
                x_t = state_ops.scatter_sub(x_tm1, idxs, s_t_, use_locking=self._use_locking)
                cache.append(x_t)
        new_update_ops.append(control_flow_ops.group(*([x_t] + update_op)))

    # Everything below runs only after the variables have been updated.
    # Each cache has grown by one element, so dense caches now have length 4.
    with ops.control_dependencies(new_update_ops):
        more_update_ops = []
        if self._save_step:
            # Store the negated step in the 's' slot.
            for cache in caches:
                if len(cache) == 4:
                    s_t, x_tm1 = cache[:2]
                    s_tm1 = self.get_slot(x_tm1, 's')
                    with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
                        s_t = state_ops.assign(s_tm1, -s_t, use_locking=self._use_locking)
                else:
                    s_t_, x_tm1, idxs = cache[:3]
                    s_tm1 = self.get_slot(x_tm1, 's')
                    with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
                        s_t = state_ops.scatter_update(s_tm1, idxs, -s_t_, use_locking=self._use_locking)
                more_update_ops.append(s_t)
        if self._save_grad:
            # Store the gradient in the 'g' slot.
            for cache in caches:
                if len(cache) == 4:
                    x_tm1, g_t = cache[1:3]
                    g_tm1 = self.get_slot(x_tm1, 'g')
                    with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
                        g_t = state_ops.assign(g_tm1, g_t, use_locking=self._use_locking)
                else:
                    x_tm1, idxs, g_t_ = cache[1:4]
                    g_tm1 = self.get_slot(x_tm1, 'g')
                    with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
                        g_t = state_ops.scatter_update(g_tm1, idxs, g_t_, use_locking=self._use_locking)
                more_update_ops.append(g_t)
        if self._chi > 0:
            # Maintain a moving average of the updated variables ('x').
            for cache in caches:
                if len(cache) == 4:
                    _, x_tm1, _, x_t = cache
                    with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
                        x_and_t = self._dense_moving_average(x_tm1, x_t, 'x', self._chi)
                        more_update_ops.append(control_flow_ops.group(*x_and_t))
                else:
                    _, x_tm1, idxs, _, x_t = cache
                    with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
                        x_t_ = array_ops.gather(x_t, idxs)
                        x_and_t = self._sparse_moving_average(x_tm1, idxs, x_t_, 'x', self._chi)
                        more_update_ops.append(control_flow_ops.group(*x_and_t))

    return control_flow_ops.group(*(new_update_ops + more_update_ops), name=name_scope)
def _finish(self, update_ops, name_scope):
    """Build the final grouped training op from per-variable update bundles.

    Args:
        update_ops: sequence of per-variable lists whose first element is a
            mutable "cache" list; the rest are ops grouped with that
            variable's update. From the unpacking below, a dense cache
            looks like `[step, variable, gradient]` and a sparse one like
            `[step_values, variable, indices, gradient_values]`
            (NOTE(review): inferred from the indexing here; confirm against
            the producer of these caches).
        name_scope: name given to the returned grouped op.

    Returns:
        A single op that groups the variable updates and, when enabled, the
        saved-step, saved-gradient and moving-average updates.

    Note:
        Cache lists are modified in place (element 0 may be replaced, and
        the updated-variable op is appended to each cache).
    """
    caches = [update_op[0] for update_op in update_ops]
    update_ops = [update_op[1:] for update_op in update_ops]

    # Optional Gaussian noise on every step.
    if self._noise is not None:
        for cache in caches:
            s_t, x_tm1 = cache[:2]
            s_t += random_ops.random_normal(
                x_tm1.initialized_value().get_shape(), stddev=self._noise)
            cache[0] = s_t

    # Optional joint clipping of all steps; enabled by a positive threshold.
    if self._clip > 0:
        S_t = [cache[0] for cache in caches]
        S_t, _ = clip_ops.clip_by_global_norm(S_t, self._clip)
        for cache, s_t in zip(caches, S_t):
            cache[0] = s_t

    # Apply the steps: length-3 caches are dense, longer ones are sparse.
    new_update_ops = []
    for cache, update_op in zip(caches, update_ops):
        if len(cache) == 3:
            s_t, x_tm1 = cache[:2]
            with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                    x_tm1.device):
                x_t = state_ops.assign_sub(
                    x_tm1, s_t, use_locking=self._use_locking)
                cache.append(x_t)
        else:
            s_t_, x_tm1, idxs = cache[:3]
            with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                    x_tm1.device):
                x_t = state_ops.scatter_sub(
                    x_tm1, idxs, s_t_, use_locking=self._use_locking)
                cache.append(x_t)
        new_update_ops.append(control_flow_ops.group(*([x_t] + update_op)))

    # Bookkeeping is sequenced after the variable updates. Each cache gained
    # one element above, so a dense cache now has length 4.
    with ops.control_dependencies(new_update_ops):
        more_update_ops = []
        if self._save_step:
            # Record the negated step in slot 's'.
            for cache in caches:
                if len(cache) == 4:
                    s_t, x_tm1 = cache[:2]
                    s_tm1 = self.get_slot(x_tm1, 's')
                    with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                            x_tm1.device):
                        s_t = state_ops.assign(
                            s_tm1, -s_t, use_locking=self._use_locking)
                else:
                    s_t_, x_tm1, idxs = cache[:3]
                    s_tm1 = self.get_slot(x_tm1, 's')
                    with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                            x_tm1.device):
                        s_t = state_ops.scatter_update(
                            s_tm1, idxs, -s_t_, use_locking=self._use_locking)
                more_update_ops.append(s_t)
        if self._save_grad:
            # Record the gradient in slot 'g'.
            for cache in caches:
                if len(cache) == 4:
                    x_tm1, g_t = cache[1:3]
                    g_tm1 = self.get_slot(x_tm1, 'g')
                    with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                            x_tm1.device):
                        g_t = state_ops.assign(
                            g_tm1, g_t, use_locking=self._use_locking)
                else:
                    x_tm1, idxs, g_t_ = cache[1:4]
                    g_tm1 = self.get_slot(x_tm1, 'g')
                    with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                            x_tm1.device):
                        g_t = state_ops.scatter_update(
                            g_tm1, idxs, g_t_, use_locking=self._use_locking)
                more_update_ops.append(g_t)
        if self._chi > 0:
            # Track a moving average ('x') of the updated variables.
            for cache in caches:
                if len(cache) == 4:
                    _, x_tm1, _, x_t = cache
                    with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                            x_tm1.device):
                        x_and_t = self._dense_moving_average(
                            x_tm1, x_t, 'x', self._chi)
                        more_update_ops.append(
                            control_flow_ops.group(*x_and_t))
                else:
                    _, x_tm1, idxs, _, x_t = cache
                    with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                            x_tm1.device):
                        x_t_ = array_ops.gather(x_t, idxs)
                        x_and_t = self._sparse_moving_average(
                            x_tm1, idxs, x_t_, 'x', self._chi)
                        more_update_ops.append(
                            control_flow_ops.group(*x_and_t))

    return control_flow_ops.group(*(new_update_ops + more_update_ops),
                                  name=name_scope)
def build_multi_tower_graph(images, sketches, images_d,
                            image_paired_class_ids, image_paired_class_ids_d,
                            text_vocab_indiceses,
                            LSTM_hybrid, vocab_size,
                            batch_size, num_gpu, batch_portion, training,
                            learning_rates, counter, max_iter_step,
                            ld=10, data_format='NCHW', distance_map=True,
                            optimizer='Adam', block_type='MRU'):
    """Build one model tower per GPU and the ops that train them jointly.

    The inputs are split into `num_gpu` parts, `build_single_graph` is
    called once per part, the per-tower gradients are averaged (when there
    is more than one tower), optionally clipped, and handed to `optimize`.

    :param images: [batch_size, 3, H, W]
    :param sketches: [batch_size, 3, H, W]
    :param images_d: [batch_size, 3, H, W]
    :param image_paired_class_ids: [batch_size, ], class_number
    :param image_paired_class_ids_d: [batch_size, ]
    :param text_vocab_indiceses: [batch_size, 15]
    :param LSTM_hybrid: forwarded unchanged to `build_single_graph`
    :param vocab_size: forwarded unchanged to `build_single_graph`
    :param batch_size: total batch size; tower i receives
        `batch_size * batch_portion[i]`
    :param num_gpu: number of towers to build
    :param batch_portion: per-tower share of the batch, indexed by tower
    :param training: forwarded unchanged to `build_single_graph`
    :param learning_rates: mapping with 'generator' and 'discriminator'
        base learning rates
    :param counter: step counter driving the learning-rate decay
    :param max_iter_step: step count used to scale the decay
    :param ld: forwarded to `build_single_graph`
    :param data_format: passed to `models.set_param` and the towers
    :param distance_map: forwarded to `build_single_graph`
    :param optimizer: optimizer name resolved through `get_optimizer`
    :param block_type: forwarded to `build_single_graph`
    :return: (opt_g, opt_d, loss_g, loss_d, summaries)
    """
    models.set_param(data_format=data_format)

    # Split every input into per-tower pieces on the CPU.
    with tf.device('/cpu:0'):
        images_list = split_inputs(images, batch_size, batch_portion, num_gpu)  # [num_gpu, [N, C, H, W]]
        images_d_list = split_inputs(images_d, batch_size, batch_portion, num_gpu)
        sketches_list = split_inputs(sketches, batch_size, batch_portion, num_gpu)
        image_paired_class_ids_list = split_inputs(image_paired_class_ids, batch_size, batch_portion, num_gpu)
        image_paired_class_ids_d_list = split_inputs(image_paired_class_ids_d, batch_size, batch_portion, num_gpu)
        text_vocab_indiceses_list = split_inputs(text_vocab_indiceses, batch_size, batch_portion, num_gpu)

    lr_g = learning_rates['generator']
    lr_d = learning_rates['discriminator']
    # From here on `optimizer` is whatever `get_optimizer` returned; it is
    # called below with a `learning_rate` keyword.
    optimizer = get_optimizer(optimizer)
    # Linear decay multiplier from 1.0 down to a floor of 0.2.
    decay = tf.maximum(
        0.2, 1. - (tf.cast(counter, tf.float32) / max_iter_step * 0.9))
    tf.summary.scalar('learning_rate_g', lr_g * decay)
    optim_g = optimizer(learning_rate=lr_g * decay)
    optim_d = optimizer(learning_rate=lr_d * decay)

    tower_grads_g = []
    tower_grads_d = []
    for i in range(num_gpu):
        # NOTE(review): `scope` is bound but not used in this function.
        with tf.name_scope('%s_%d' % ('GPU', i)) as scope:
            # The per-tower losses bound here are overwritten by
            # `gather_losses()` before the function returns.
            loss_g, loss_d, grad_g, grad_d \
                = build_single_graph(images_list[i], sketches_list[i], images_d_list[i],
                                     image_paired_class_ids_list[i], image_paired_class_ids_d_list[i],
                                     text_vocab_indiceses_list[i],
                                     batch_size * batch_portion[i], training,
                                     LSTM_hybrid=LSTM_hybrid, vocab_size=vocab_size,
                                     ld=ld, data_format=data_format,
                                     distance_map=distance_map,
                                     optim_g=optim_g, optim_d=optim_d,
                                     block_type=block_type)

            tower_grads_g.append(grad_g)
            tower_grads_d.append(grad_d)

    assert len(tower_grads_g) == len(tower_grads_d)
    if len(tower_grads_d) == 1:
        # Single tower: use its gradients directly, no averaging needed.
        ave_grad_g = grad_g
        ave_grad_d = grad_d
    else:
        ave_grad_g, ave_grad_d = average_gradients(
            (tower_grads_g, tower_grads_d))

    # Apply gradients
    tf.get_variable_scope(
    )._reuse = False  # Hack to force initialization of optimizer variables

    if Config.sn:
        # Get the update ops
        spectral_norm_update_ops = tf.get_collection(
            Config.SPECTRAL_NORM_UPDATE_OPS)
    else:
        spectral_norm_update_ops = [tf.no_op()]
        # NOTE(review): `assign_ops` is not read anywhere below.
        assign_ops = tf.no_op()

    # Clip gradients if using WGAN/DRAGAN
    global_grad_norm_G = None
    # The two `*_clipped` norms are never reassigned in this function, so
    # `optimize` always receives None for them.
    global_grad_norm_G_clipped = None
    global_grad_norm_D = None
    global_grad_norm_D_clipped = None

    if not Config.sn:
        max_grad_norm_G = 50.
        max_grad_norm_D = 100.
        hard_clip_norm_G = 5.
        hard_clip_norm_D = 10.

        # Generator: clip by global norm first, then clip each gradient
        # tensor by its own norm.
        ave_grad_g_tensors, ave_grad_g_vars = list(zip(*ave_grad_g))
        global_grad_norm_G = clip_ops.global_norm(ave_grad_g_tensors)
        ave_grad_g_tensors, _ = clip_ops.clip_by_global_norm(
            ave_grad_g_tensors, max_grad_norm_G, global_grad_norm_G)
        ave_grad_g_tensors = [
            clip_ops.clip_by_norm(t, hard_clip_norm_G)
            for t in ave_grad_g_tensors
        ]
        ave_grad_g = list(zip(ave_grad_g_tensors, ave_grad_g_vars))

        # Discriminator: same two-stage clipping with its own thresholds.
        ave_grad_d_tensors, ave_grad_d_vars = list(zip(*ave_grad_d))
        global_grad_norm_D = clip_ops.global_norm(ave_grad_d_tensors)
        ave_grad_d_tensors, _ = clip_ops.clip_by_global_norm(
            ave_grad_d_tensors, max_grad_norm_D, global_grad_norm_D)
        ave_grad_d_tensors = [
            clip_ops.clip_by_norm(t, hard_clip_norm_D)
            for t in ave_grad_d_tensors
        ]
        ave_grad_d = list(zip(ave_grad_d_tensors, ave_grad_d_vars))

    # The training ops are created under a control dependency on the
    # collected update ops (or a single no-op when Config.sn is falsy).
    with tf.control_dependencies(spectral_norm_update_ops):
        opt_g = optimize(ave_grad_g, optim_g, None, 'gradient_norm',
                         global_norm=global_grad_norm_G,
                         global_norm_clipped=global_grad_norm_G_clipped,
                         appendix='_G')
    opt_d = optimize(ave_grad_d, optim_d, None, 'gradient_norm',
                     global_norm=global_grad_norm_D,
                     global_norm_clipped=global_grad_norm_D_clipped,
                     appendix='_D')

    summaries = gather_summaries()
    loss_g, loss_d = gather_losses()

    # Generator output from last tower
    return opt_g, opt_d, loss_g, loss_d, summaries
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  clip_gradients=None,
                  moving_average_decay=0.9,
                  learning_rate_decay_fn=None,
                  variables=None):
    """Given loss and parameters for optimizer, returns a training op.

    Args:
        loss: Tensor, 0 dimensional.
        global_step: Tensor, step counter for each update.
        learning_rate: float or 0-d Tensor, magnitude of update per each
            training step.
        optimizer: string, class or optimizer instance, used as trainer.
            A string must be a key of the OPTIMIZER_CLS_NAMES constant
            (e.g. 'SGD', 'Adam', 'Adagrad'). A class must be a sub-class of
            Optimizer; it is instantiated with the learning rate. An
            instance of an Optimizer sub-class is used as is.
        clip_gradients: float or None; when given, gradients are clipped by
            global norm with this value.
        moving_average_decay: float or None; when given, a moving average
            of the loss is tracked and summarized.
        learning_rate_decay_fn: function taking the learning rate and
            global_step Tensors and returning a Tensor, e.g.
            tf.train.exponential_decay.
        variables: list of variables to optimize, or None for all
            trainable variables.

    Returns:
        Training op.

    Raises:
        ValueError: if the learning rate or the optimizer has a wrong type,
            or the optimizer name is unknown.
    """
    # Track a decayed moving average of the loss.
    if moving_average_decay is not None:
        ema = train.ExponentialMovingAverage(moving_average_decay,
                                             name="avg")
        ema_update = ema.apply([loss])
        logging_ops.scalar_summary("loss/mean", ema.average(loss))
        loss = control_flow_ops.with_dependencies([ema_update], loss)

    # Resolve the learning rate, then apply the optional decay.
    is_scalar_tensor = (isinstance(learning_rate, ops.Tensor) and
                        len(learning_rate.get_shape()) == 0)
    if is_scalar_tensor:
        lr = learning_rate
    elif isinstance(learning_rate, float):
        lr = vs.get_variable(
            "learning_rate", [],
            trainable=False,
            initializer=init_ops.constant_initializer(learning_rate))
    else:
        raise ValueError("Learning rate should be 0d Tensor or float. Got %s" %
                         str(learning_rate))
    if learning_rate_decay_fn is not None:
        lr = learning_rate_decay_fn(lr, global_step)

    # Resolve the optimizer from a name, a class or a ready instance.
    if isinstance(optimizer, six.string_types):
        if optimizer not in OPTIMIZER_CLS_NAMES:
            raise ValueError(
                "Optimizer name should be one of [%s], you provided %s." %
                (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
        opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr)
    elif isinstance(optimizer, type) and issubclass(optimizer,
                                                    optimizer_.Optimizer):
        opt = optimizer(learning_rate=lr)
    elif isinstance(optimizer, optimizer_.Optimizer):
        opt = optimizer
    else:
        raise ValueError("Unrecognized optimizer: should be string, "
                         "subclass of Optimizer or instance of "
                         "subclass of Optimizer. Got %s." % str(optimizer))

    # Default to every trainable variable.
    if variables is None:
        variables = vars_.trainable_variables()

    # Compute the gradients and clip them when requested.
    grads_and_vars = opt.compute_gradients(loss, variables)
    if clip_gradients is not None:
        raw_grads, grad_vars = zip(*grads_and_vars)
        clipped, _ = clip_ops.clip_by_global_norm(raw_grads, clip_gradients)
        grads_and_vars = list(zip(clipped, grad_vars))

    logging_ops.scalar_summary("loss", loss)

    # Histograms for each variable, its gradient and the gradient norm.
    for grad, var in grads_and_vars:
        values = grad.values if isinstance(grad, ops.IndexedSlices) else grad
        if values is None:
            continue
        logging_ops.histogram_summary(var.name, var)
        logging_ops.histogram_summary(var.name + "/gradients", values)
        logging_ops.histogram_summary(var.name + "/gradient_norm",
                                      clip_ops.global_norm([values]))

    grad_updates = opt.apply_gradients(grads_and_vars,
                                       global_step=global_step,
                                       name="train")
    # Fail loudly on a non-finite loss.
    final_loss = array_ops.check_numerics(loss, "Loss is inf or nan")

    # Evaluating the returned tensor also runs the gradient updates.
    return control_flow_ops.with_dependencies([grad_updates], final_loss)
def clip_grads(grads_and_vars):
    """Clip the gradients of (gradient, variable) pairs by global norm.

    The threshold is the `clip_norm` name taken from the enclosing scope.
    Returns a list of (clipped_gradient, variable) pairs.
    """
    grads, var_list = zip(*grads_and_vars)
    clipped, _ = clip_ops.clip_by_global_norm(grads, clip_norm)
    return list(zip(clipped, var_list))
def _linear_classifier_model_fn(features, targets, mode, params):
    """Linear classifier model_fn.

    Args:
        features: `Tensor` or dict of `Tensor` (depends on data passed to
            `fit`).
        targets: `Tensor` of shape [batch_size, 1] or [batch_size] target
            labels of dtype `int32` or `int64` in the range
            `[0, n_classes)`.
        mode: Defines whether this is training, evaluation or prediction.
            See `ModeKeys`.
        params: A dict of hyperparameters.
            The following hyperparameters are expected:
            * feature_columns: An iterable containing all the feature
                columns used by the model.
            * n_classes: number of target classes.
            * weight_column_name: A string defining the weight feature
                column, or None if there are no weights.
            * optimizer: string, `Optimizer` object, or callable that
                defines the optimizer to use for training.
            * gradient_clip_norm: A float > 0. If provided, gradients are
                clipped to their global norm with this clipping ratio.
            * enable_centered_bias: A bool. If True, estimator will learn a
                centered bias variable for each class. Rest of the model
                structure learns the residual after centered bias.
            * num_ps_replicas: The number of parameter server replicas.
            * joint_weights: If True, the weights for all columns will be
                stored in a single (possibly partitioned) variable. It's
                more efficient, but it's incompatible with SDCAOptimizer,
                and requires all feature columns are sparse and use the
                'sum' combiner.

    Returns:
        predictions: A dict of `Tensor` objects.
        loss: A scalar containing the loss of the step.
        train_op: The op for training.

    Raises:
        ValueError: If mode is not any of the `ModeKeys`.
            NOTE(review): no explicit mode validation is visible in this
            function; confirm which callee, if any, raises.
    """
    feature_columns = params["feature_columns"]
    n_classes = params["n_classes"]
    weight_column_name = params["weight_column_name"]
    optimizer = params["optimizer"]
    gradient_clip_norm = params.get("gradient_clip_norm", None)
    enable_centered_bias = params.get("enable_centered_bias", True)
    num_ps_replicas = params.get("num_ps_replicas", 0)
    joint_weights = params.get("joint_weights", False)

    if not isinstance(features, dict):
        features = {"": features}

    # Name of both the variable scope and the collection holding the
    # model's weights.
    parent_scope = "linear"
    # Binary classification uses a single logit column and a log loss.
    num_label_columns = 1 if n_classes == 2 else n_classes
    loss_fn = _softmax_cross_entropy_loss
    if n_classes == 2:
        loss_fn = _log_loss_with_two_classes

    partitioner = partitioned_variables.min_max_variable_partitioner(
        max_partitions=num_ps_replicas,
        min_slice_size=64 << 20)

    with variable_scope.variable_op_scope(
            features.values(), parent_scope, partitioner=partitioner) as scope:
        if joint_weights:
            logits, _, _ = (
                layers.joint_weighted_sum_from_feature_columns(
                    columns_to_tensors=features,
                    feature_columns=feature_columns,
                    num_outputs=num_label_columns,
                    weight_collections=[parent_scope],
                    scope=scope))
        else:
            logits, _, _ = (
                layers.weighted_sum_from_feature_columns(
                    columns_to_tensors=features,
                    feature_columns=feature_columns,
                    num_outputs=num_label_columns,
                    weight_collections=[parent_scope],
                    scope=scope))

    if enable_centered_bias:
        logits = nn.bias_add(logits, _centered_bias(num_label_columns))

    loss = None
    if mode != estimator.ModeKeys.INFER:
        loss = loss_fn(logits, targets)
        if weight_column_name:
            weight_tensor = array_ops.reshape(
                math_ops.to_float(features[weight_column_name]), shape=(-1,))
            loss = _weighted_loss(loss, weight_tensor)
        else:
            loss = math_ops.reduce_mean(loss, name="loss")
        logging_ops.scalar_summary("loss", loss)

    train_ops = []
    if mode == estimator.ModeKeys.TRAIN:
        global_step = contrib_variables.get_global_step()

        # Read back exactly the collection the weights were registered in,
        # instead of repeating the "linear" literal.
        my_vars = ops.get_collection(parent_scope)
        grads = gradients.gradients(loss, my_vars)
        if gradient_clip_norm:
            grads, _ = clip_ops.clip_by_global_norm(grads, gradient_clip_norm)
        train_ops.append(optimizer.apply_gradients(
            zip(grads, my_vars), global_step=global_step))
        if enable_centered_bias:
            train_ops.append(
                _centered_bias_step(targets, loss_fn, num_label_columns))

    predictions = {}
    if n_classes == 2:
        predictions[_LOGISTIC] = math_ops.sigmoid(logits)
        # Expand the single logit to two columns so softmax/argmax apply.
        logits = array_ops.concat(1, [array_ops.zeros_like(logits), logits])
    predictions[_PROBABILITIES] = nn.softmax(logits)
    predictions[_CLASSES] = math_ops.argmax(logits, 1)

    return predictions, loss, control_flow_ops.group(*train_ops)
def _linear_classifier_model_fn(features, targets, mode, params):
    """Estimator's linear model_fn.

    Args:
        features: a tensor or a dict of tensors; a bare tensor is wrapped
            into a dict under the key "".
        targets: labels handed to the loss function.
        mode: one of `estimator.ModeKeys`; no loss is built for INFER and
            training ops are only built for TRAIN.
        params: dict of hyperparameters. `n_classes`, `weight_column_name`,
            `feature_columns` and `optimizer` are required;
            `gradient_clip_norm` (default None), `enable_centered_bias`
            (default True), `num_ps_replicas` (default 0) and
            `joint_weights` (default False) are optional.

    Returns:
        A `(predictions, loss, train_op)` tuple. `predictions` is a dict of
        tensors, `loss` is None in INFER mode, and `train_op` groups
        whatever training ops were created (none outside TRAIN mode).
    """
    n_classes = params["n_classes"]
    weight_column_name = params["weight_column_name"]
    feature_columns = params["feature_columns"]
    optimizer = params["optimizer"]
    gradient_clip_norm = params.get("gradient_clip_norm", None)
    enable_centered_bias = params.get("enable_centered_bias", True)
    num_ps_replicas = params.get("num_ps_replicas", 0)
    joint_weights = params.get("joint_weights", False)

    if not isinstance(features, dict):
        features = {"": features}

    # Binary classification uses a single logit column and a log loss.
    num_label_columns = 1 if n_classes == 2 else n_classes
    loss_fn = _softmax_cross_entropy_loss
    if n_classes == 2:
        loss_fn = _log_loss_with_two_classes

    partitioner = partitioned_variables.min_max_variable_partitioner(
        max_partitions=num_ps_replicas,
        min_slice_size=64 << 20)
    # `features` is always a dict here, so the former non-dict fallback for
    # the scope values could never be taken.
    with variable_scope.variable_op_scope(
            features.values(), "linear", partitioner=partitioner) as scope:
        if joint_weights:
            logits, _, _ = (
                layers.joint_weighted_sum_from_feature_columns(
                    columns_to_tensors=features,
                    feature_columns=feature_columns,
                    num_outputs=num_label_columns,
                    weight_collections=["linear"],
                    scope=scope))
        else:
            logits, _, _ = (
                layers.weighted_sum_from_feature_columns(
                    columns_to_tensors=features,
                    feature_columns=feature_columns,
                    num_outputs=num_label_columns,
                    weight_collections=["linear"],
                    scope=scope))

    if enable_centered_bias:
        logits = nn.bias_add(logits, _centered_bias(num_label_columns))

    loss = None
    if mode != estimator.ModeKeys.INFER:
        loss = loss_fn(logits, targets)
        if weight_column_name:
            weight_tensor = array_ops.reshape(
                math_ops.to_float(features[weight_column_name]), shape=(-1,))
            loss = _weighted_loss(loss, weight_tensor)
        else:
            loss = math_ops.reduce_mean(loss, name="loss")
        logging_ops.scalar_summary("loss", loss)

    train_ops = []
    if mode == estimator.ModeKeys.TRAIN:
        global_step = contrib_variables.get_global_step()

        # Only the variables registered in the "linear" collection above are
        # trained by this optimizer.
        my_vars = ops.get_collection("linear")
        grads = gradients.gradients(loss, my_vars)
        if gradient_clip_norm:
            grads, _ = clip_ops.clip_by_global_norm(grads, gradient_clip_norm)
        train_ops.append(optimizer.apply_gradients(
            zip(grads, my_vars), global_step=global_step))
        if enable_centered_bias:
            train_ops.append(
                _centered_bias_step(targets, loss_fn, num_label_columns))

    predictions = {}
    if n_classes == 2:
        predictions[_LOGISTIC] = math_ops.sigmoid(logits)
        # Expand the single logit to two columns so softmax/argmax apply.
        logits = array_ops.concat(1, [array_ops.zeros_like(logits), logits])
    predictions[_PROBABILITIES] = nn.softmax(logits)
    predictions[_CLASSES] = math_ops.argmax(logits, 1)

    return predictions, loss, control_flow_ops.group(*train_ops)
def _testNonFiniteClippingByGlobalNorm(self, inputs, max_norm):
    """Assert that clipping `inputs` by global norm yields only NaNs."""
    clip_result = clip_ops.clip_by_global_norm(inputs, max_norm)
    # Evaluation returns a pair; only its first element is checked here.
    clipped_values, _ = self.evaluate(clip_result)
    every_entry_nan = np.all(np.isnan(clipped_values))
    self.assertTrue(every_entry_nan)
def _clip_gradients_by_norm(grads_and_vars, clip_gradients):
    """Clips gradients by global norm.

    Takes an iterable of (gradient, variable) pairs and returns a list of
    (clipped_gradient, variable) pairs in the same order, using
    `clip_gradients` as the clipping threshold.
    """
    grads, var_list = zip(*grads_and_vars)
    clipped, _unused_norm = clip_ops.clip_by_global_norm(grads,
                                                         clip_gradients)
    pairs = zip(clipped, var_list)
    return list(pairs)
def _testClipTensorByGlobalNorm(self, inputs, max_norm, expected):
    """Assert that clipping `inputs` by global norm gives `expected`."""
    clip_result = clip_ops.clip_by_global_norm(inputs, max_norm)
    # Evaluation returns a pair; only its first element is compared.
    clipped_values, _ = self.evaluate(clip_result)
    self.assertAllClose(clipped_values, expected)
def _linear_classifier_model_fn(features, targets, mode, params):
    """Linear classifier model_fn.

    Args:
        features: `Tensor` or dict of `Tensor` (depends on data passed to
            `fit`).
        targets: `Tensor` of shape [batch_size, 1] or [batch_size] target
            labels of dtype `int32` or `int64` in the range
            `[0, n_classes)`.
        mode: Defines whether this is training, evaluation or prediction.
            See `ModeKeys`.
        params: A dict of hyperparameters.
            The following hyperparameters are expected:
            * feature_columns: An iterable containing all the feature
                columns used by the model.
            * n_classes: number of target classes.
            * weight_column_name: A string defining the weight feature
                column, or None if there are no weights.
            * optimizer: string, `Optimizer` object, or callable that
                defines the optimizer to use for training.
            * gradient_clip_norm: A float > 0. If provided, gradients are
                clipped to their global norm with this clipping ratio.
            * enable_centered_bias: A bool. If True, estimator will learn a
                centered bias variable for each class. Rest of the model
                structure learns the residual after centered bias.
            * num_ps_replicas: The number of parameter server replicas.
            * joint_weights: If True, the weights for all columns will be
                stored in a single (possibly partitioned) variable. It's
                more efficient, but it's incompatible with SDCAOptimizer,
                and requires all feature columns are sparse and use the
                'sum' combiner.

    Returns:
        predictions: A dict of `Tensor` objects.
        loss: A scalar containing the loss of the step.
        train_op: The op for training.

    Raises:
        ValueError: If mode is not any of the `ModeKeys`.
            NOTE(review): no explicit mode validation is visible in this
            function; confirm which callee, if any, raises.
    """
    feature_columns = params["feature_columns"]
    n_classes = params["n_classes"]
    weight_column_name = params["weight_column_name"]
    optimizer = params["optimizer"]
    gradient_clip_norm = params.get("gradient_clip_norm", None)
    enable_centered_bias = params.get("enable_centered_bias", True)
    num_ps_replicas = params.get("num_ps_replicas", 0)
    joint_weights = params.get("joint_weights", False)

    if not isinstance(features, dict):
        features = {"": features}

    # One name serves as the variable scope and as the weight collection.
    parent_scope = "linear"
    # Binary classification uses a single logit column and a log loss.
    num_label_columns = 1 if n_classes == 2 else n_classes
    loss_fn = _softmax_cross_entropy_loss
    if n_classes == 2:
        loss_fn = _log_loss_with_two_classes

    partitioner = partitioned_variables.min_max_variable_partitioner(
        max_partitions=num_ps_replicas, min_slice_size=64 << 20)

    with variable_scope.variable_op_scope(features.values(),
                                          parent_scope,
                                          partitioner=partitioner) as scope:
        if joint_weights:
            logits, _, _ = (layers.joint_weighted_sum_from_feature_columns(
                columns_to_tensors=features,
                feature_columns=feature_columns,
                num_outputs=num_label_columns,
                weight_collections=[parent_scope],
                scope=scope))
        else:
            logits, _, _ = (layers.weighted_sum_from_feature_columns(
                columns_to_tensors=features,
                feature_columns=feature_columns,
                num_outputs=num_label_columns,
                weight_collections=[parent_scope],
                scope=scope))

    if enable_centered_bias:
        logits = nn.bias_add(logits, _centered_bias(num_label_columns))

    loss = None
    if mode != estimator.ModeKeys.INFER:
        loss = loss_fn(logits, targets)
        if weight_column_name:
            weight_tensor = array_ops.reshape(math_ops.to_float(
                features[weight_column_name]),
                                              shape=(-1, ))
            loss = _weighted_loss(loss, weight_tensor)
        else:
            loss = math_ops.reduce_mean(loss, name="loss")
        logging_ops.scalar_summary("loss", loss)

    train_ops = []
    if mode == estimator.ModeKeys.TRAIN:
        global_step = contrib_variables.get_global_step()

        # Fetch the trainable weights from the same collection they were
        # registered in above rather than a separately spelled literal.
        my_vars = ops.get_collection(parent_scope)
        grads = gradients.gradients(loss, my_vars)
        if gradient_clip_norm:
            grads, _ = clip_ops.clip_by_global_norm(grads, gradient_clip_norm)
        train_ops.append(
            optimizer.apply_gradients(zip(grads, my_vars),
                                      global_step=global_step))
        if enable_centered_bias:
            train_ops.append(
                _centered_bias_step(targets, loss_fn, num_label_columns))

    predictions = {}
    if n_classes == 2:
        predictions[_LOGISTIC] = math_ops.sigmoid(logits)
        # Expand the single logit to two columns so softmax/argmax apply.
        logits = array_ops.concat(1, [array_ops.zeros_like(logits), logits])
    predictions[_PROBABILITIES] = nn.softmax(logits)
    predictions[_CLASSES] = math_ops.argmax(logits, 1)

    return predictions, loss, control_flow_ops.group(*train_ops)
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  clip_gradients=None,
                  moving_average_decay=0.9,
                  learning_rate_decay_fn=None,
                  variables=None):
    """Given loss and parameters for optimizer, returns a training op.

    Args:
        loss: Tensor, 0 dimensional.
        global_step: Tensor, step counter for each update.
        learning_rate: float or Tensor, magnitude of update per each
            training step.
        optimizer: string or function, used as optimizer for training. A
            string is looked up in OPTIMIZER_CLS_NAMES; a callable is
            invoked with a `learning_rate` keyword argument.
        clip_gradients: float or None, clips gradients by global norm with
            this value.
        moving_average_decay: float or None, takes into account previous
            loss to make learning smoother due to outliers.
        learning_rate_decay_fn: function, takes learning_rate and
            global_step Tensors, returns Tensor. Can be used to implement
            any learning rate decay functions.
            For example: tf.train.exponential_decay.
        variables: list of variables to optimize or None.

    Returns:
        Training op.

    Raises:
        ValueError: if optimizer is wrong type.
    """
    # Moving average of the loss with decay.
    if moving_average_decay is not None:
        # Generate moving averages of the loss.
        loss_averages = train.ExponentialMovingAverage(moving_average_decay,
                                                       name="avg")
        loss_averages_op = loss_averages.apply([loss])
        logging_ops.scalar_summary("loss/mean", loss_averages.average(loss))
        loss = control_flow_ops.with_dependencies([loss_averages_op], loss)

    # Convert optimizer into the optimizer class.
    if isinstance(optimizer, str):
        opt_cls = OPTIMIZER_CLS_NAMES[optimizer]
    elif callable(optimizer):
        opt_cls = optimizer
    else:
        raise ValueError("Unrecognized optimizer: should be string or function.")

    # Learning rate variable, with possible decay.
    lr = vs.get_variable(
        "learning_rate", [],
        trainable=False,
        initializer=init_ops.constant_initializer(learning_rate))
    if learning_rate_decay_fn is not None:
        lr = learning_rate_decay_fn(lr, global_step)

    # Create optimizer.
    opt = opt_cls(learning_rate=lr)

    # All trainable variables, if specific variables are not specified.
    if variables is None:
        variables = vars_.trainable_variables()

    # Compute gradients and clip them if provided.
    grads_and_vars = opt.compute_gradients(loss, variables)
    if clip_gradients is not None:
        # `compute_gradients` yields (gradient, variable) pairs, while
        # `clip_by_global_norm` expects the bare gradients, so split the
        # pairs first and pair the clipped gradients back up afterwards.
        raw_grads, grad_vars = zip(*grads_and_vars)
        clipped_grads, _ = clip_ops.clip_by_global_norm(raw_grads,
                                                        clip_gradients)
        # Keep a list: the pairs are iterated below for summaries and again
        # by `apply_gradients`, which a one-shot `zip` cannot support.
        grads_and_vars = list(zip(clipped_grads, grad_vars))

    # Add scalar summary for loss.
    logging_ops.scalar_summary("loss", loss)

    # Add histograms for variables, gradients and gradient norms.
    for gradient, variable in grads_and_vars:
        if isinstance(gradient, ops.IndexedSlices):
            grad_values = gradient.values
        else:
            grad_values = gradient
        # Variables that do not influence the loss have no gradient; there
        # is nothing to summarize for them.
        if grad_values is None:
            continue
        logging_ops.histogram_summary(variable.name, variable)
        logging_ops.histogram_summary(variable.name + "/gradients",
                                      grad_values)
        logging_ops.histogram_summary(variable.name + "/gradient_norm",
                                      clip_ops.global_norm([grad_values]))

    # Create gradient updates.
    grad_updates = opt.apply_gradients(grads_and_vars,
                                       global_step=global_step,
                                       name="train")
    # Make sure total_loss is valid.
    final_loss = array_ops.check_numerics(loss, "Loss is inf or nan")

    # Ensure the train_tensor computes grad_updates.
    train_tensor = control_flow_ops.with_dependencies([grad_updates],
                                                      final_loss)
    return train_tensor