def testMovingAverageVariables(self):
  height, width = 3, 3
  with self.test_session():
    images = tf.random_uniform((5, height, width, 3), seed=1)
    ops.batch_norm(images, scale=True)
    moving_mean = tf.moving_average_variables()[0]
    moving_variance = tf.moving_average_variables()[1]
    self.assertEquals(moving_mean.op.name, 'BatchNorm/moving_mean')
    self.assertEquals(moving_variance.op.name, 'BatchNorm/moving_variance')
def testCreateVariablesWithoutCenterWithoutScale(self):
  height, width = 3, 3
  with self.test_session():
    images = tf.random_uniform((5, height, width, 3), seed=1)
    ops.batch_norm(images, center=False, scale=False)
    beta = variables.get_variables_by_name('beta')
    self.assertEquals(beta, [])
    gamma = variables.get_variables_by_name('gamma')
    self.assertEquals(gamma, [])
    moving_mean = tf.moving_average_variables()[0]
    moving_variance = tf.moving_average_variables()[1]
    self.assertEquals(moving_mean.op.name, 'BatchNorm/moving_mean')
    self.assertEquals(moving_variance.op.name, 'BatchNorm/moving_variance')
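# --- Added sketch (not from the tests above): tf.moving_average_variables()
# simply returns the contents of the MOVING_AVERAGE_VARIABLES collection,
# which is why batch_norm's moving_mean and moving_variance show up in it.
# A minimal, self-contained illustration:
import tensorflow as tf

moving_mean = tf.get_variable(
    'moving_mean', shape=[3],
    initializer=tf.zeros_initializer(),
    trainable=False,
    collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                 tf.GraphKeys.MOVING_AVERAGE_VARIABLES])
assert moving_mean in tf.moving_average_variables()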
def variables_to_restore(self, moving_avg_variables=None):
  """Returns a map of names to `Variables` to restore with a `Saver`.

  Each variable that has a moving average is keyed by the name of its
  shadow (averaged) variable, so restoring through this map loads the
  averaged values into the live variables. Variables without a moving
  average are keyed by their own names.
  """
  name_map = {}
  if moving_avg_variables is None:
    moving_avg_variables = tf.trainable_variables()
    moving_avg_variables += tf.moving_average_variables()
  # Remove duplicates.
  moving_avg_variables = set(moving_avg_variables)
  # Collect all the variables with a moving average.
  for v in moving_avg_variables:
    name_map[self.average_name(v)] = v
  # Make sure we restore variables without a moving average as well.
  for v in list(set(tf.all_variables()) - moving_avg_variables):
    if v.op.name not in name_map:
      name_map[v.op.name] = v
  return name_map
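# --- Added usage sketch (assumes a graph whose trainable variables are
# tracked by an ExponentialMovingAverage, and a hypothetical
# `checkpoint_path`): the usual consumer of variables_to_restore() is a
# Saver built for evaluation, so the shadow values are loaded in place of
# the raw weights.
ema = tf.train.ExponentialMovingAverage(decay=0.999)
saver = tf.train.Saver(ema.variables_to_restore())
with tf.Session() as sess:
  saver.restore(sess, checkpoint_path)  # hypothetical checkpoint path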
def inception_model_fn(features, labels, mode, params):
  """Inception v4 model using Estimator API."""
  num_classes = FLAGS.num_classes
  is_training = (mode == tf.estimator.ModeKeys.TRAIN)
  is_eval = (mode == tf.estimator.ModeKeys.EVAL)

  if isinstance(features, dict):
    features = features['feature']

  features = tensor_transform_fn(features, params['model_transpose_dims'])

  # This nested function allows us to avoid duplicating the logic which
  # builds the network, for different values of --precision.
  def build_network():
    if FLAGS.precision == 'bfloat16':
      with contrib_tpu.bfloat16_scope():
        logits, end_points = inception.inception_v4(
            features, num_classes, is_training=is_training)
      logits = tf.cast(logits, tf.float32)
    elif FLAGS.precision == 'float32':
      logits, end_points = inception.inception_v4(
          features, num_classes, is_training=is_training)
    return logits, end_points

  if FLAGS.clear_update_collections:
    with arg_scope(
        inception.inception_v4_arg_scope(
            weight_decay=0.0,
            batch_norm_decay=BATCH_NORM_DECAY,
            batch_norm_epsilon=BATCH_NORM_EPSILON,
            updates_collections=None)):
      logits, end_points = build_network()
  else:
    with arg_scope(
        inception.inception_v4_arg_scope(
            batch_norm_decay=BATCH_NORM_DECAY,
            batch_norm_epsilon=BATCH_NORM_EPSILON)):
      logits, end_points = build_network()

  predictions = {
      'classes': tf.argmax(input=logits, axis=1),
      'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
  }

  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        export_outputs={
            'classify': tf.estimator.export.PredictOutput(predictions)
        })

  if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and (
      not FLAGS.use_tpu):
    with tf.control_dependencies([
        tf.Print(
            predictions['classes'], [predictions['classes']],
            summarize=FLAGS.eval_batch_size,
            message='prediction: ')
    ]):
      labels = tf.Print(
          labels, [labels],
          summarize=FLAGS.eval_batch_size,
          message='label: ')

  one_hot_labels = tf.one_hot(labels, FLAGS.num_classes, dtype=tf.int32)

  if 'AuxLogits' in end_points:
    tf.losses.softmax_cross_entropy(
        onehot_labels=one_hot_labels,
        logits=tf.cast(end_points['AuxLogits'], tf.float32),
        weights=0.4,
        label_smoothing=0.1,
        scope='aux_loss')

  tf.losses.softmax_cross_entropy(
      onehot_labels=one_hot_labels,
      logits=logits,
      weights=1.0,
      label_smoothing=0.1)

  losses = tf.add_n(tf.losses.get_losses())
  l2_loss = []
  for v in tf.trainable_variables():
    tf.logging.info(v.name)
    if 'BatchNorm' not in v.name and 'weights' in v.name:
      l2_loss.append(tf.nn.l2_loss(v))
  tf.logging.info('Number of L2-regularized weights: %d', len(l2_loss))
  loss = losses + WEIGHT_DECAY * tf.add_n(l2_loss)

  initial_learning_rate = FLAGS.learning_rate * FLAGS.train_batch_size / 256
  # Adjust the initial learning rate for warmup.
  initial_learning_rate /= (
      FLAGS.learning_rate_decay**((FLAGS.warmup_epochs + FLAGS.cold_epochs) /
                                  FLAGS.learning_rate_decay_epochs))
  final_learning_rate = 0.0001 * initial_learning_rate

  host_call = None
  train_op = None
  if is_training:
    batches_per_epoch = _NUM_TRAIN_IMAGES / FLAGS.train_batch_size
    global_step = tf.train.get_or_create_global_step()
    current_epoch = tf.cast(
        (tf.cast(global_step, tf.float32) / batches_per_epoch), tf.int32)

    clr = FLAGS.cold_learning_rate
    wlr = initial_learning_rate / (FLAGS.warmup_epochs + FLAGS.cold_epochs)
    learning_rate = tf.where(
        tf.greater_equal(current_epoch, FLAGS.cold_epochs),
        (tf.where(
            tf.greater_equal(current_epoch,
                             FLAGS.warmup_epochs + FLAGS.cold_epochs),
            tf.train.exponential_decay(
                learning_rate=initial_learning_rate,
                global_step=global_step,
                decay_steps=int(FLAGS.learning_rate_decay_epochs *
                                batches_per_epoch),
                decay_rate=FLAGS.learning_rate_decay,
                staircase=True),
            tf.multiply(tf.cast(current_epoch, tf.float32), wlr))), clr)

    # Set a minimum boundary for the learning rate.
    learning_rate = tf.maximum(
        learning_rate, final_learning_rate, name='learning_rate')

    if FLAGS.optimizer == 'sgd':
      tf.logging.info('Using SGD optimizer')
      optimizer = tf.train.GradientDescentOptimizer(
          learning_rate=learning_rate)
    elif FLAGS.optimizer == 'momentum':
      tf.logging.info('Using Momentum optimizer')
      optimizer = tf.train.MomentumOptimizer(
          learning_rate=learning_rate, momentum=0.9)
    elif FLAGS.optimizer == 'RMS':
      tf.logging.info('Using RMS optimizer')
      optimizer = tf.train.RMSPropOptimizer(
          learning_rate,
          RMSPROP_DECAY,
          momentum=RMSPROP_MOMENTUM,
          epsilon=RMSPROP_EPSILON)
    else:
      tf.logging.fatal('Unknown optimizer: %s', FLAGS.optimizer)

    if FLAGS.use_tpu:
      optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      train_op = optimizer.minimize(loss, global_step=global_step)

    if FLAGS.moving_average:
      ema = tf.train.ExponentialMovingAverage(
          decay=MOVING_AVERAGE_DECAY, num_updates=global_step)
      variables_to_average = (
          tf.trainable_variables() + tf.moving_average_variables())
      with tf.control_dependencies([train_op]), tf.name_scope(
          'moving_average'):
        train_op = ema.apply(variables_to_average)

    # To log the loss, current learning rate, and epoch for TensorBoard, the
    # summary op needs to be run on the host CPU via host_call. host_call
    # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
    # dimension. These Tensors are implicitly concatenated to
    # [params['batch_size']].
    gs_t = tf.reshape(global_step, [1])
    loss_t = tf.reshape(loss, [1])
    lr_t = tf.reshape(learning_rate, [1])
    ce_t = tf.reshape(current_epoch, [1])

    if not FLAGS.skip_host_call:

      def host_call_fn(gs, loss, lr, ce):
        """Training host call. Creates scalar summaries for training metrics.

        This function is executed on the CPU and should not directly
        reference any Tensors in the rest of the `model_fn`. To pass Tensors
        from the model to the `metric_fn`, provide them as part of the
        `host_call`. See
        https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
        for more information.

        Arguments should match the list of `Tensor` objects passed as the
        second element in the tuple passed to `host_call`.

        Args:
          gs: `Tensor` with shape `[batch]` for the global_step.
          loss: `Tensor` with shape `[batch]` for the training loss.
          lr: `Tensor` with shape `[batch]` for the learning_rate.
          ce: `Tensor` with shape `[batch]` for the current_epoch.

        Returns:
          List of summary ops to run on the CPU host.
        """
        gs = gs[0]
        with summary.create_file_writer(FLAGS.model_dir).as_default():
          with summary.always_record_summaries():
            summary.scalar('loss', tf.reduce_mean(loss), step=gs)
            summary.scalar('learning_rate', tf.reduce_mean(lr), step=gs)
            summary.scalar('current_epoch', tf.reduce_mean(ce), step=gs)
            return summary.all_summary_ops()

      host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

  eval_metrics = None
  if is_eval:

    def metric_fn(labels, logits):
      """Evaluation metric function. Evaluates accuracy.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the
      model to the `metric_fn`, provide them as part of the `eval_metrics`.
      See
      https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the
      second element in the tuple passed to `eval_metrics`.

      Args:
        labels: `Tensor` with shape `[batch]`.
        logits: `Tensor` with shape `[batch, num_classes]`.

      Returns:
        A dict of the metrics to return from evaluation.
      """
      predictions = tf.argmax(logits, axis=1)
      top_1_accuracy = tf.metrics.accuracy(labels, predictions)
      in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
      top_5_accuracy = tf.metrics.mean(in_top_5)

      return {
          'accuracy': top_1_accuracy,
          'accuracy@5': top_5_accuracy,
      }

    eval_metrics = (metric_fn, [labels, logits])

  return contrib_tpu.TPUEstimatorSpec(
      mode=mode,
      loss=loss,
      train_op=train_op,
      host_call=host_call,
      eval_metrics=eval_metrics)
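# --- Added plain-Python sketch (not from the source) of the three-phase
# learning-rate schedule built with the nested tf.where above: a fixed cold
# rate, then a linear warmup ramp, then staircase exponential decay. Epoch
# granularity only; the parameter names mirror the flags used in
# inception_model_fn.
def lr_schedule(epoch, cold_epochs, warmup_epochs, cold_lr, init_lr,
                decay_rate, decay_epochs):
  if epoch < cold_epochs:
    return cold_lr
  if epoch < cold_epochs + warmup_epochs:
    wlr = init_lr / (warmup_epochs + cold_epochs)
    return epoch * wlr
  return init_lr * decay_rate**(epoch // decay_epochs)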
def test_restore_ema(self):
  # Create 100 phony x, y data points in NumPy, y = x * 0.1 + 0.3.
  x_data = np.random.rand(100).astype(np.float32)
  y_data = x_data * 0.1 + 0.3

  # Try to find values for W and b that compute y_data = W * x_data + b.
  # (We know that W should be 0.1 and b 0.3, but TensorFlow will
  # figure that out for us.)
  W = tf.Variable(tf.random_uniform([1], -1.0, 1.0), name='W')
  b = tf.Variable(tf.zeros([1]), name='b')
  y = W * x_data + b

  # Minimize the mean squared errors.
  loss = tf.reduce_mean(tf.square(y - y_data))
  optimizer = tf.train.GradientDescentOptimizer(0.5)
  opt_op = optimizer.minimize(loss)

  # Track the moving averages of all trainable variables.
  ema = tf.train.ExponentialMovingAverage(decay=0.9999)
  averages_op = ema.apply(tf.trainable_variables())
  with tf.control_dependencies([opt_op]):
    train_op = tf.group(averages_op)

  # Before starting, initialize the variables. We will 'run' this first.
  init = tf.global_variables_initializer()
  # Save all variables, including the EMA shadow variables, so the averaged
  # values can be restored below.
  saver = tf.train.Saver()

  # Launch the graph.
  sess = tf.Session()
  sess.run(init)

  # Fit the line.
  for _ in range(201):
    sess.run(train_op)

  w_reference = sess.run('W/ExponentialMovingAverage:0')
  b_reference = sess.run('b/ExponentialMovingAverage:0')

  saver.save(sess, os.path.join(self.tmp_dir, "model_ex1"))

  tf.reset_default_graph()
  tf.train.import_meta_graph(os.path.join(self.tmp_dir, "model_ex1.meta"))
  sess = tf.Session()

  print('------------------------------------------------------')
  for var in tf.global_variables():
    print('all variables: ' + var.op.name)
  for var in tf.trainable_variables():
    print('normal variable: ' + var.op.name)
  for var in tf.moving_average_variables():
    print('ema variable: ' + var.op.name)
  print('------------------------------------------------------')

  # Two equivalent ways to map shadow (EMA) names to the live variables.
  mode = 1
  restore_vars = {}
  if mode == 0:
    ema = tf.train.ExponentialMovingAverage(1.0)
    for var in tf.trainable_variables():
      print('%s: %s' % (ema.average_name(var), var.op.name))
      restore_vars[ema.average_name(var)] = var
  elif mode == 1:
    for var in tf.trainable_variables():
      ema_name = var.op.name + '/ExponentialMovingAverage'
      print('%s: %s' % (ema_name, var.op.name))
      restore_vars[ema_name] = var

  saver = tf.train.Saver(restore_vars, name='ema_restore')
  saver.restore(sess, os.path.join(self.tmp_dir, "model_ex1"))

  w_restored = sess.run('W:0')
  b_restored = sess.run('b:0')

  self.assertAlmostEqual(
      w_reference, w_restored,
      msg='Restored model does not use the EMA filtered weight')
  self.assertAlmostEqual(
      b_reference, b_restored,
      msg='Restored model does not use the EMA filtered bias')
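# --- Added equivalent (a sketch against the same restored graph): both
# mode 0 and mode 1 above hand-build the shadow-name -> variable map that
# ExponentialMovingAverage.variables_to_restore() produces directly.
ema = tf.train.ExponentialMovingAverage(decay=0.9999)
restore_vars = ema.variables_to_restore(tf.trainable_variables())
saver = tf.train.Saver(restore_vars, name='ema_restore')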
def model_fn(features, labels, mode, params):
  """Mobilenet v1 model using Estimator API."""
  num_classes = params['num_classes']
  training_active = (mode == tf.estimator.ModeKeys.TRAIN)
  eval_active = (mode == tf.estimator.ModeKeys.EVAL)

  if isinstance(features, dict):
    features = features['feature']

  features = supervised_images.tensor_transform_fn(
      features, params['input_perm'])

  model = tf.keras.applications.MobileNet(
      input_tensor=features,
      include_top=True,
      weights=None,
      classes=num_classes)
  logits = model(features, training=training_active)
  predictions = {
      'classes': tf.argmax(input=logits, axis=1),
      'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
  }

  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        export_outputs={
            'classify': tf.estimator.export.PredictOutput(predictions)
        })

  if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and (
      not params['use_tpu']):
    with tf.control_dependencies([
        tf.Print(
            predictions['classes'], [predictions['classes']],
            summarize=params['eval_batch_size'],
            message='prediction: ')
    ]):
      labels = tf.Print(
          labels, [labels],
          summarize=params['eval_batch_size'],
          message='label: ')

  one_hot_labels = tf.one_hot(labels, params['num_classes'], dtype=tf.int32)
  tf.losses.softmax_cross_entropy(
      onehot_labels=one_hot_labels,
      logits=logits,
      weights=1.0,
      label_smoothing=0.1)
  loss = tf.losses.get_total_loss(add_regularization_losses=True)

  initial_learning_rate = (
      params['learning_rate'] * params['train_batch_size'] / 256)
  final_learning_rate = 0.0001 * initial_learning_rate

  train_op = None
  if training_active:
    batches_per_epoch = (
        params['num_train_images'] // params['train_batch_size'])
    global_step = tf.train.get_or_create_global_step()

    learning_rate = tf.train.exponential_decay(
        learning_rate=initial_learning_rate,
        global_step=global_step,
        decay_steps=params['learning_rate_decay_epochs'] * batches_per_epoch,
        decay_rate=params['learning_rate_decay'],
        staircase=True)

    # Set a minimum boundary for the learning rate.
    learning_rate = tf.maximum(
        learning_rate, final_learning_rate, name='learning_rate')

    if params['optimizer'] == 'sgd':
      absl.logging.info('Using SGD optimizer')
      optimizer = tf.train.GradientDescentOptimizer(
          learning_rate=learning_rate)
    elif params['optimizer'] == 'momentum':
      absl.logging.info('Using Momentum optimizer')
      optimizer = tf.train.MomentumOptimizer(
          learning_rate=learning_rate, momentum=0.9)
    elif params['optimizer'] == 'RMS':
      absl.logging.info('Using RMS optimizer')
      optimizer = tf.train.RMSPropOptimizer(
          learning_rate,
          RMSPROP_DECAY,
          momentum=RMSPROP_MOMENTUM,
          epsilon=RMSPROP_EPSILON)
    else:
      absl.logging.fatal('Unknown optimizer: %s', params['optimizer'])

    if params['use_tpu']:
      optimizer = tf.tpu.CrossShardOptimizer(optimizer)

    update_ops = model.updates
    with tf.control_dependencies(update_ops):
      train_op = optimizer.minimize(loss, global_step=global_step)

    if params['moving_average']:
      ema = tf.train.ExponentialMovingAverage(
          decay=MOVING_AVERAGE_DECAY, num_updates=global_step)
      variables_to_average = (
          tf.trainable_variables() + tf.moving_average_variables())
      with tf.control_dependencies([train_op]), tf.name_scope(
          'moving_average'):
        train_op = ema.apply(variables_to_average)

  eval_metrics = None
  if eval_active:

    def metric_fn(labels, predictions):
      accuracy = tf.metrics.accuracy(
          labels, tf.argmax(input=predictions, axis=1))
      return {'accuracy': accuracy}

    if params['use_logits']:
      eval_predictions = logits
      eval_metrics = (metric_fn, [labels, eval_predictions])

  return tf.estimator.tpu.TPUEstimatorSpec(
      mode=mode, loss=loss, train_op=train_op, eval_metrics=eval_metrics)
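# --- Added sketch (not part of the original model_fn): the EMA created
# above is only *updated* during training. To evaluate with the averaged
# weights, one option is a custom Saver supplied through TPUEstimatorSpec's
# scaffold_fn, which restores the shadow values over the live variables.
def eval_scaffold_fn():
  ema = tf.train.ExponentialMovingAverage(decay=MOVING_AVERAGE_DECAY)
  return tf.train.Scaffold(saver=tf.train.Saver(ema.variables_to_restore()))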
def model_fn(self, features, labels, mode, params):
  """Build the model based on features, labels, and mode.

  Args:
    features: The features dictionary containing the data Tensor
      and the number of examples.
    labels: The labels Tensor resulting from calling the model.
    mode: A string indicating the training mode.
    params: A dictionary of hyperparameters.

  Returns:
    A tf.estimator.EstimatorSpec.
  """
  del params
  is_training = (mode == tf.estimator.ModeKeys.TRAIN)
  eval_active = (mode == tf.estimator.ModeKeys.EVAL)
  is_predict = (mode == tf.estimator.ModeKeys.PREDICT)

  if is_training:
    features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

  loss, logits = self._build_network(features, labels, mode)

  if is_predict:
    predictions = {'logits': logits}
    if self.hparams.use_tpu:
      return contrib_tpu.TPUEstimatorSpec(mode=mode, predictions=predictions)
    else:
      return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  host_call = None
  train_op = None

  if is_training:
    global_step = tf.train.get_or_create_global_step()
    gs_t = tf.reshape(tf.cast(global_step, tf.int32), [1])

    # Setup learning rate schedule.
    learning_rate = self._build_learning_rate_schedule(global_step)

    # Setup optimizer.
    optimizer = self._build_optimizer(learning_rate)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      train_op = self._build_train_op(optimizer, loss,
                                      global_step=global_step)

    if self.hparams.moving_average_decay > 0:
      ema = tf.train.ExponentialMovingAverage(
          decay=self.hparams.moving_average_decay, num_updates=global_step)
      variables_to_average = (
          tf.trainable_variables() + tf.moving_average_variables())
      with tf.control_dependencies([train_op]):
        with tf.name_scope('moving_average'):
          train_op = ema.apply(variables_to_average)

    lr_t = tf.reshape(learning_rate, [1])
    host_call = None
    if self.hparams.enable_hostcall:

      def host_call_fn(gs, lr):
        # Outfeed supports int32 but global_step is expected to be int64.
        gs = tf.cast(tf.reduce_mean(gs), tf.int64)
        with tf.contrib.summary.create_file_writer(
            self.model_dir).as_default():
          with tf.contrib.summary.always_record_summaries():
            tf.contrib.summary.scalar(
                'learning_rate', tf.reduce_mean(lr), step=gs)
            return tf.contrib.summary.all_summary_ops()

      host_call = (host_call_fn, [gs_t, lr_t])

  eval_metrics = None
  eval_metric_ops = None
  if eval_active:

    def metric_fn(labels, logits):
      """Evaluation metric fn. Performed on CPU, do not reference TPU ops."""
      predictions = tf.argmax(logits, axis=1)
      categorical_labels = labels

      top_1_accuracy = tf.metrics.accuracy(categorical_labels, predictions)
      in_top_5 = tf.cast(
          tf.nn.in_top_k(logits, categorical_labels, 5), tf.float32)
      top_5_accuracy = tf.metrics.mean(in_top_5)

      return {
          'top_1_accuracy': top_1_accuracy,
          'top_5_accuracy': top_5_accuracy,
      }

    eval_metrics = (metric_fn, [labels, logits])
    eval_metric_ops = metric_fn(labels, logits)

  if self.hparams.use_tpu:
    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        host_call=host_call,
        eval_metrics=eval_metrics)
  return tf.estimator.EstimatorSpec(
      mode=mode, loss=loss, train_op=train_op,
      eval_metric_ops=eval_metric_ops)
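# --- Added illustration (hypothetical values, not from the source) of the
# top-k metric used in metric_fn: tf.nn.in_top_k yields one boolean per
# example, and tf.metrics.mean over the cast booleans is top-k accuracy.
logits = tf.constant([[0.1, 0.9, 0.0],
                      [0.8, 0.15, 0.05]])
labels = tf.constant([1, 2])
in_top_2 = tf.nn.in_top_k(logits, labels, 2)  # -> [True, False]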