def _maybe_make_cross_shard_optimizer(opt):
  if callable(opt):
    if not isinstance(opt(), tpu_optimizer.CrossShardOptimizer):
      return lambda: tpu_optimizer.CrossShardOptimizer(opt())
  elif not isinstance(opt, tpu_optimizer.CrossShardOptimizer):
    return tpu_optimizer.CrossShardOptimizer(opt)
  return opt
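A minimal usage sketch for the helper above; it assumes the TF 1.x contrib import path for `tpu_optimizer`, and the optimizer choices are illustrative only.

# Sketch only: assumes TF 1.x and the contrib import path below.
import tensorflow as tf
from tensorflow.contrib.tpu.python.tpu import tpu_optimizer

# An optimizer instance is wrapped directly.
sgd = tf.train.GradientDescentOptimizer(learning_rate=0.01)
wrapped = _maybe_make_cross_shard_optimizer(sgd)
assert isinstance(wrapped, tpu_optimizer.CrossShardOptimizer)

# An optimizer factory stays a factory; wrapping happens when it is called.
make_adam = lambda: tf.train.AdamOptimizer(learning_rate=1e-3)
wrapped_factory = _maybe_make_cross_shard_optimizer(make_adam)
assert isinstance(wrapped_factory(), tpu_optimizer.CrossShardOptimizer)

# An already-wrapped optimizer is passed through unchanged.
assert _maybe_make_cross_shard_optimizer(wrapped) is wrapped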
def model_fn(features, labels, mode, params): """TPUEstimatorSpec for the Squeezenet model.""" ProfileOptionBuilder = tf.profiler.ProfileOptionBuilder is_training = mode == tf.estimator.ModeKeys.TRAIN logits = squeezenet(features, is_training=is_training, num_classes=params["num_classes"]) loss = tf.reduce_mean( tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels)) global_batch_size = params["num_shards"] * params["batch_size"] decay_steps = 1300 * 1000 * params["num_epochs"] // global_batch_size learning_rate = tf.train.polynomial_decay( params["lr"], global_step=tf.train.get_or_create_global_step(), end_learning_rate=params["min_lr"], decay_steps=decay_steps, power=1.0, cycle=False) # TODO(power): Hack copied from resnet: remove when summaries are working. lr_repeat = tf.reshape( tf.tile(tf.expand_dims(learning_rate, 0), [ params["batch_size"], ]), [params["batch_size"], 1]) if params["optimizer"] == "adam": optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) elif params["optimizer"] == "rmsprop": optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate, momentum=params["momentum"], epsilon=1.0) else: optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=params["momentum"], use_nesterov=True) if params["use_tpu"]: optimizer = tpu_optimizer.CrossShardOptimizer(optimizer) train_op = optimizer.minimize(loss, tf.train.get_global_step()) param_stats = tf.profiler.profile( tf.get_default_graph(), options=ProfileOptionBuilder.trainable_variables_parameter()) fl_stats = tf.profiler.profile( tf.get_default_graph(), options=tf.profiler.ProfileOptionBuilder.float_operation()) return tpu_estimator.TPUEstimatorSpec( mode=mode, loss=loss, train_op=train_op, eval_metrics=(metric_fn, [labels, logits, lr_repeat]), predictions={ "classes": tf.argmax(input=logits, axis=1), "probabilities": tf.nn.softmax(logits, name="softmax_tensor") }, )
def model_fn(features, labels, mode, params):
  del params
  #with G.as_default():
  transfer = tf.nn.softplus
  hidden = tf.layers.dense(inputs=features, units=200, activation=transfer)
  reconstruction = tf.layers.dense(inputs=hidden, units=784)

  # For an autoencoder, the cost/loss is not just part of training.
  loss_op = 0.5 * tf.reduce_sum(
      tf.pow(tf.subtract(reconstruction, labels), 2.0))
  learning_rate = tf.train.exponential_decay(
      FLAGS.learning_rate, tf.train.get_global_step(), 100000, 0.96)

  if FLAGS.use_tpu:
    opt = tpu_optimizer.CrossShardOptimizer(
        tf.train.GradientDescentOptimizer(learning_rate=learning_rate))
  else:
    opt = tf.train.AdamOptimizer()

  train_op = opt.minimize(loss_op, global_step=tf.train.get_global_step())
  #return tf.estimator.EstimatorSpec(mode=mode, loss=loss_op, train_op=train_op)
  return tpu_estimator.TPUEstimatorSpec(
      mode=mode, loss=loss_op, train_op=train_op)
def model_fn(features, labels, mode, params):
  _optimizer_fn = train_utils.modelParams.optimizer_fn(
      train_utils.optimizer, FLAGS.learning_rate)
  if FLAGS.use_tpu:
    _optimizer_fn = tpu_optimizer.CrossShardOptimizer(_optimizer_fn)
  _loss_fn = train_utils.modelParams.loss_fn(train_utils.loss_fn,
                                             FLAGS.loss_fn)
  _model_graph = train_utils.modelParams.get_model(models,
                                                   FLAGS.training_model)

  if mode == tf.estimator.ModeKeys.TRAIN:
    _logits = _model_graph(features, FLAGS.num_classes)
    _loss = _loss_fn(_logits, labels)
    _train_op = _optimizer_fn.minimize(
        _loss, global_step=tf.train.get_global_step())
    return tpu_estimator.TPUEstimatorSpec(
        mode=mode,
        loss=_loss,
        train_op=_train_op,
        predictions={
            "class": tf.argmax(_logits, axis=-1),
            "probabilities": _logits
        })
def model_fn(features, labels, mode, params):
  del params  # unused
  num_classes = 8

  x = Input(tensor=features)
  #x = InputLayer(input_shape=(img_size_flat,))
  #x = Reshape(img_shape)(x)
  #model.add(Dropout(0.5, input_shape=(48, 48, 1)))
  x = Conv2D(kernel_size=5, strides=1, filters=32, padding='same',
             activation='relu')(x)
  x = Conv2D(kernel_size=5, strides=1, filters=32, padding='same',
             activation='relu')(x)
  x = MaxPooling2D(pool_size=2, strides=2)(x)

  x = Conv2D(kernel_size=10, strides=1, filters=64, padding='same',
             activation='relu')(x)
  x = Conv2D(kernel_size=10, strides=1, filters=64, padding='same',
             activation='relu')(x)
  x = Conv2D(kernel_size=10, strides=1, filters=64, padding='same',
             activation='relu')(x)
  x = MaxPooling2D(pool_size=2, strides=2)(x)

  x = Conv2D(kernel_size=15, strides=1, filters=128, padding='same',
             activation='relu')(x)
  x = Conv2D(kernel_size=15, strides=1, filters=128, padding='same',
             activation='relu')(x)
  x = Conv2D(kernel_size=15, strides=1, filters=128, padding='same',
             activation='relu')(x)
  x = MaxPooling2D(pool_size=2, strides=2)(x)

  x = Flatten()(x)
  x = Dense(128, activation='relu')(x)
  x = Dense(64, activation='relu')(x)

  # Last fully-connected / dense layer with softmax-activation
  # for use in classification.
  logits = Dense(num_classes)(x)

  loss = tf.reduce_mean(
      tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=labels))

  optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
  if FLAGS.use_tpu:
    optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

  train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
  return tpu_estimator.TPUEstimatorSpec(
      mode=mode,
      loss=loss,
      train_op=train_op,
      predictions={
          "classes": tf.argmax(input=logits, axis=1),
          "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
      })
def model_fn(features, labels, mode, params): """Define a CIFAR model in Keras.""" del params # unused #layers = tf.keras.layers #models = tf.keras.models # Pass our input tensor to initialize the Keras input layer. #mdl = layers.Input(tensor=features) #mdl = layers.Dense(2048, activation="relu")(mdl) #op = layers.Dense(3862, activation="softmax")(features) op = tf.contrib.layers.fully_connected(inputs=features, num_outputs=3862, activation_fn=tf.nn.softmax) # Instead of constructing a Keras model for training, build our loss function # and optimizer in Tensorflow. # # N.B. This construction omits some features that are important for more # complex models (e.g. regularization, batch-norm). Once # `model_to_estimator` support is added for TPUs, it should be used instead. loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits_v2(logits=op, labels=tf.cast( labels, tf.float32))) optimizer = tf.train.AdamOptimizer(0.01) if FLAGS.use_tpu: optimizer = tpu_optimizer.CrossShardOptimizer(optimizer) train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step()) return tpu_estimator.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op, predictions={"probabilities": op})
def train(features, labels, hparams, get_features_fn, embedding_fn,
          embedding_weights_initializer, bias_weights_initializer,
          global_rating_bias_initializer):
  """Constructs the matrix factorization model training graph."""
  (query_movie_ids, query_movie_ratings, query_genre_ids, query_genre_freqs,
   query_genre_ratings, candidate_movie_id, candidate_genre_id) = (
       get_features_fn(features))

  model_scores, _, _ = movie_candidate_score(
      query_movie_ids, query_movie_ratings, query_genre_ids,
      query_genre_freqs, query_genre_ratings, candidate_movie_id,
      candidate_genre_id, hparams, embedding_fn,
      embedding_weights_initializer, bias_weights_initializer,
      global_rating_bias_initializer)
  loss = tf.losses.mean_squared_error(features[LABEL_RATING_SCORE],
                                      model_scores)

  optimizer = tf.contrib.layers.OPTIMIZER_CLS_NAMES[hparams.optimizer](
      learning_rate=hparams.learning_rate)
  if hparams.use_tpu:
    optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)
  train_op = tf.contrib.layers.optimize_loss(
      loss=loss,
      summaries=[],
      global_step=tf.contrib.framework.get_global_step(),
      optimizer=optimizer,
      learning_rate=None)
  return EstimatorSpec(
      mode=TRAIN, predictions=model_scores, loss=loss, train_op=train_op)
def model_fn(features, labels, mode, params): """TPUEstimator model_fn for MobileNet.""" logits = predict_fn(features, mode, params) loss = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits( logits=logits, labels=labels)) # decay once per epoch steps_per_epoch = params["num_batches_per_epoch"] if params["decay_mode"] == "piecewise": learning_rate = params["learning_rate"] * tf.train.piecewise_constant( tf.train.get_or_create_global_step(), [steps_per_epoch * 30, steps_per_epoch * 60], [1.0, 0.1, 0.01]) else: learning_rate = tf.train.exponential_decay( params["learning_rate"], tf.train.get_or_create_global_step(), decay_rate=params["learning_rate_decay"], decay_steps=steps_per_epoch, ) if params["optimizer"] == "rmsprop": optimizer = tf.train.RMSPropOptimizer( learning_rate=learning_rate, momentum=params["momentum"], epsilon=params["rmsprop_epsilon"], decay=params["rmsprop_decay"], ) else: optimizer = tf.train.MomentumOptimizer( learning_rate=learning_rate, momentum=params["momentum"], use_nesterov=True) if params["use_tpu"]: optimizer = tpu_optimizer.CrossShardOptimizer(optimizer) # Batch norm requires update_ops to be added as a train_op dependency. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer.minimize(loss, tf.train.get_global_step()) # TODO(power): Hack copied from resnet: remove when summaries are working. lr_repeat = tf.reshape( tf.tile(tf.expand_dims(learning_rate, 0), [ params["batch_size"], ]), [params["batch_size"], 1]) return tpu_estimator.TPUEstimatorSpec( mode=mode, loss=loss, train_op=train_op, predictions={ "classes": tf.argmax(input=logits, axis=1), "probabilities": tf.nn.softmax(logits, name="softmax_tensor") }, eval_metrics=(metric_fn, [labels, logits, lr_repeat]), )
def model_fn(features, labels, mode, params):
  output_size = params['output_size']
  net = features
  if FLAGS.data_type == 'float32':
    network = resnet_model.resnet_v1(
        resnet_layers,
        block_fn,
        num_classes=output_size,
        data_format='channels_last',
        filters=filters)
    net = network(inputs=features, is_training=True)
  else:
    with tf.variable_scope('cg', custom_getter=get_custom_getter()):
      network = resnet_model.resnet_v1(
          resnet_layers,
          block_fn,
          num_classes=output_size,
          data_format='channels_last',
          filters=filters)
      net = network(inputs=features, is_training=True)
      net = tf.cast(net, tf.float32)

  onehot_labels = tf.one_hot(labels, output_size)
  loss = tf.losses.softmax_cross_entropy(
      onehot_labels=onehot_labels, logits=net)
  learning_rate = tf.train.exponential_decay(
      0.1, tf.train.get_global_step(), 25000, 0.97)

  if opt == 'sgd':
    tf.logging.info('Using SGD optimizer')
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
  elif opt == 'momentum':
    tf.logging.info('Using Momentum optimizer')
    optimizer = tf.train.MomentumOptimizer(
        learning_rate=learning_rate, momentum=0.9)
  elif opt == 'rms':
    tf.logging.info('Using RMS optimizer')
    optimizer = tf.train.RMSPropOptimizer(
        learning_rate,
        RMSPROP_DECAY,
        momentum=RMSPROP_MOMENTUM,
        epsilon=RMSPROP_EPSILON)

  if FLAGS.use_tpu:
    optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)
  train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

  param_stats = tf.profiler.profile(
      tf.get_default_graph(),
      options=ProfileOptionBuilder.trainable_variables_parameter())
  fl_stats = tf.profiler.profile(
      tf.get_default_graph(),
      options=tf.profiler.ProfileOptionBuilder.float_operation())

  return tpu_estimator.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def _replicated_optimizer(opt):
  """Wrap the optimizer `opt` with CrossShardOptimizer if applicable."""
  if tpu_function.get_tpu_context().number_of_shards == 1:
    return opt
  if isinstance(opt, keras_optimizers.TFOptimizer):
    return tpu_optimizer.CrossShardOptimizer(opt.optimizer)
  else:
    return KerasCrossShardOptimizer(opt)
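A sketch of how `_replicated_optimizer` dispatches, assuming TF 1.x Keras symbols; `KerasCrossShardOptimizer` is whatever shard-aware wrapper the surrounding module defines for native Keras optimizers, and the wrapping branches only fire when more than one TPU shard is active.

# Sketch only: TF 1.x symbols; must run where tpu_function reports >1 shard.
import tensorflow as tf
from tensorflow.python.keras import optimizers as keras_optimizers

# A tf.train.Optimizer wrapped in Keras's TFOptimizer adapter: the inner
# tf.train optimizer is what gets wrapped in CrossShardOptimizer.
tf_opt = keras_optimizers.TFOptimizer(
    tf.train.MomentumOptimizer(learning_rate=0.1, momentum=0.9))
replicated = _replicated_optimizer(tf_opt)

# A native Keras optimizer takes the KerasCrossShardOptimizer branch instead.
replicated_keras = _replicated_optimizer(tf.keras.optimizers.SGD(lr=0.1))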
def model_fn(features, labels, mode, params): """A simple CNN.""" del params if mode != tf.estimator.ModeKeys.TRAIN: raise RuntimeError("mode {} is not supported yet".format(mode)) conv1 = tf.layers.conv2d( inputs=features, filters=32, kernel_size=[5, 5], padding="same", activation=tf.nn.relu) pool1 = tf.layers.max_pooling2d( inputs=conv1, pool_size=[2, 2], strides=2, padding="same") conv2 = tf.layers.conv2d( inputs=pool1, filters=64, kernel_size=[5, 5], padding="same", activation=tf.nn.relu) pool2 = tf.layers.max_pooling2d( inputs=conv2, pool_size=[2, 2], strides=2, padding="same") pool2_flat = tf.reshape(pool2, [-1, 8 * 8 * 64]) dense = tf.layers.dense( inputs=pool2_flat, units=384, activation=tf.nn.relu) dropout = tf.layers.dropout( inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN) logits = tf.layers.dense(inputs=dropout, units=10) onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10) loss = tf.losses.softmax_cross_entropy( onehot_labels=onehot_labels, logits=logits) learning_rate = tf.train.exponential_decay( FLAGS.learning_rate, tf.train.get_global_step(), 25000, 0.96) if FLAGS.use_tpu: optimizer = tpu_optimizer.CrossShardOptimizer( tf.train.GradientDescentOptimizer(learning_rate=learning_rate)) else: optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate) train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step()) return tpu_estimator.TPUEstimatorSpec( mode=mode, loss=loss, train_op=train_op)
def get_train_op(self, loss):
  """Creates a training op.

  Args:
    loss: A float32 `Tensor` representing the total training loss.

  Returns:
    train_op: A slim.learning.create_train_op train_op.

  Raises:
    ValueError: If the specified optimizer isn't supported.
  """
  # Get variables to train (defined in subclass).
  assert self.variables_to_train

  # Define a learning rate schedule.
  decay_steps = self._config.learning.decay_steps
  decay_factor = self._config.learning.decay_factor
  learning_rate = float(self._config.learning.learning_rate)
  global_step = slim.get_or_create_global_step()
  learning_rate = tf.train.exponential_decay(
      learning_rate, global_step, decay_steps, decay_factor, staircase=True)

  # Create an optimizer.
  opt_type = self._config.learning.optimizer
  if opt_type == 'adam':
    opt = tf.train.AdamOptimizer(learning_rate)
  elif opt_type == 'momentum':
    opt = tf.train.MomentumOptimizer(learning_rate, 0.9)
  elif opt_type == 'rmsprop':
    opt = tf.train.RMSPropOptimizer(
        learning_rate, momentum=0.9, epsilon=1.0, decay=0.9)
  else:
    raise ValueError('Unsupported optimizer %s' % opt_type)

  if self._config.use_tpu:
    opt = tpu_optimizer.CrossShardOptimizer(opt)

  # Create a training op.
  # train_op = opt.minimize(loss, var_list=self.variables_to_train)
  train_op = slim.learning.create_train_op(
      loss,
      optimizer=opt,
      variables_to_train=self.variables_to_train,
      update_ops=tf.get_collection(tf.GraphKeys.UPDATE_OPS))
  return train_op
def model_fn(features, labels, mode, params): """Inception v3 model using Estimator API.""" del params if mode != tf.estimator.ModeKeys.TRAIN: raise RuntimeError('mode {} is not supported yet'.format(mode)) num_labels = FLAGS.num_labels with slim.arg_scope(inception_v3_arg_scope(is_training=True)): logits, end_points = inception.inception_v3( features, num_labels, is_training=True, depth_multiplier=FLAGS.depth_multiplier) onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=num_labels) if 'AuxLogits' in end_points: tf.losses.softmax_cross_entropy(end_points['AuxLogits'], onehot_labels, label_smoothing=0.1, weights=0.4, scope='aux_loss') tf.losses.softmax_cross_entropy(logits, onehot_labels, label_smoothing=0.1, weights=1.0) loss = tf.losses.get_total_loss() if FLAGS.optimizer == 'sgd': tf.logging.info('Using SGD optimizer') optimizer = tf.train.GradientDescentOptimizer( learning_rate=FLAGS.learning_rate) elif FLAGS.optimizer == 'momentum': tf.logging.info('Using Momentum optimizer') optimizer = tf.train.MomentumOptimizer( learning_rate=FLAGS.learning_rate, momentum=0.9) else: tf.logging.fatal('Unknown optimizer:', FLAGS.optimizer) if FLAGS.use_tpu: optimizer = tpu_optimizer.CrossShardOptimizer(optimizer) train_op = optimizer.minimize( loss, global_step=tf.train.get_or_create_global_step()) return tpu_estimator.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def model_fn(features, labels, mode, params): """Define a CIFAR model in Keras.""" del params # unused layers = tf.contrib.keras.layers # Pass our input tensor to initialize the Keras input layer. v = layers.Input(tensor=features) v = layers.Conv2D(filters=32, kernel_size=5, activation="relu", padding="same")(v) v = layers.MaxPool2D(pool_size=2)(v) v = layers.Conv2D(filters=64, kernel_size=5, activation="relu", padding="same")(v) v = layers.MaxPool2D(pool_size=2)(v) v = layers.Flatten()(v) fc1 = layers.Dense(units=512, activation="relu")(v) logits = layers.Dense(units=10)(fc1) # Instead of constructing a Keras model for training, build our loss function # and optimizer in Tensorflow. # # N.B. This construction omits some features that are important for more # complex models (e.g. regularization, batch-norm). Once # `model_to_estimator` support is added for TPUs, it should be used instead. loss = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)) optimizer = tf.train.AdamOptimizer() if FLAGS.use_tpu: optimizer = tpu_optimizer.CrossShardOptimizer(optimizer) train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step()) return tpu_estimator.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op, predictions={ "classes": tf.argmax(input=logits, axis=1), "probabilities": tf.nn.softmax( logits, name="softmax_tensor") })
def _build_train_op(self, tpu_opt):
  """Build training specific ops for the graph."""
  self.lrn_rate = tf.constant(self.hps.lrn_rate, tf.float32)
  # Commented out for TPU execution:
  #tf.summary.scalar('learning_rate', self.lrn_rate)

  # Use tf.case() instead of a SessionRunHook for changing the learning rate
  # (for TPU execution).
  def r1(): return tf.constant(0.1)
  def r2(): return tf.constant(0.01)
  def r3(): return tf.constant(0.001)
  def r4(): return tf.constant(0.0001)

  self.lrn_rate = tf.case(
      {
          tf.less(self.global_step, 40000): r1,
          tf.less(self.global_step, 60000): r2,
          tf.less(self.global_step, 80000): r3
      },
      default=r4,
      exclusive=False)

  trainable_variables = tf.trainable_variables()
  grads = tf.gradients(self.cost, trainable_variables)

  if self.hps.optimizer == 'sgd':
    optimizer = tf.train.GradientDescentOptimizer(self.lrn_rate)
  elif self.hps.optimizer == 'mom':
    optimizer = tf.train.MomentumOptimizer(self.lrn_rate, 0.9)

  # Added for TPU: cross shard optimizer for replicas.
  if tpu_opt:
    optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

  apply_op = optimizer.apply_gradients(
      zip(grads, trainable_variables),
      global_step=self.global_step,
      name='train_step')

  train_ops = [apply_op] + self._extra_train_ops
  self.train_op = tf.group(*train_ops)
def model_fn(features, labels, mode, params): """Define a Densenet model.""" logits = densenet_model.densenet_cifar_model( features, params["growth_rate"], params["layers"], is_training=(mode == tf.estimator.ModeKeys.TRAIN), num_blocks=params["blocks"]) loss = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)) learning_rate = tf.train.exponential_decay( 0.00001, tf.train.get_or_create_global_step(), decay_steps=200, decay_rate=0.5) optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9, use_nesterov=True) # N.B. We have to set this parameter manually below. if params["use_tpu"]: optimizer = tpu_optimizer.CrossShardOptimizer(optimizer) # Batch norm requires update_ops to be added as a train_op dependency. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer.minimize(loss, tf.train.get_global_step()) return tpu_estimator.TPUEstimatorSpec( mode=mode, loss=loss, train_op=train_op, predictions={ "classes": tf.argmax(input=logits, axis=1), "probabilities": tf.nn.softmax(logits, name="softmax_tensor") }, eval_metrics=(metric_fn, [labels, logits]), )
def _model_fn_train(self, mode, total_loss, batches_per_epoch,
                    num_epochs_per_decay, initial_learning_rate,
                    learning_rate_decay_factor, rmsprop_decay,
                    rmsprop_momentum, rmsprop_epsilon, moving_average_decay):
  """This is the TRAIN part of model_fn."""
  if mode != tf.estimator.ModeKeys.TRAIN:
    return None

  # Configure the learning rate using an exponential decay.
  global_step = tf.train.get_or_create_global_step()
  decay_steps = int(1.0 * batches_per_epoch * num_epochs_per_decay)
  learning_rate = tf.train.exponential_decay(
      learning_rate=initial_learning_rate,
      global_step=global_step,
      decay_steps=decay_steps,
      decay_rate=learning_rate_decay_factor,
      staircase=True)
  # Set a minimum boundary for the learning rate.
  learning_rate = tf.maximum(
      learning_rate, 0.0001 * initial_learning_rate, name='learning_rate')

  optimizer = tf.train.RMSPropOptimizer(
      learning_rate,
      rmsprop_decay,
      momentum=rmsprop_momentum,
      epsilon=rmsprop_epsilon)
  if self.use_tpu:
    optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  with tf.control_dependencies(update_ops):
    train_op = optimizer.minimize(total_loss, global_step=global_step)

  # NB. In the inception code this was "tf.trainable_variables()
  # + tf.moving_average_variables()", but we've settled on just
  # tf.model_variables() in the existing production DV2.
  variables_to_average = tf.model_variables()
  variable_averages = tf.train.ExponentialMovingAverage(
      decay=moving_average_decay, num_updates=global_step)
  with tf.control_dependencies([train_op]), tf.name_scope('moving_average'):
    train_op = variable_averages.apply(variables_to_average)

  tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, train_op)
  return train_op
def train(features, labels, hparams, embedding_weights_initializer,
          class_weights_initializer, class_biases_initializer,
          get_features_fn, embedding_fn):
  """Constructs the training graph."""
  (movie_ids_ratings, genre_ids_freqs, genre_ids_ratings) = (
      get_features_fn(features))

  query_embeddings = embed_query_features(
      movie_ids_ratings, genre_ids_freqs, genre_ids_ratings, hparams, TRAIN,
      embedding_weights_initializer, embedding_fn)
  class_weights, class_biases = class_weights_biases(
      hparams, class_weights_initializer, class_biases_initializer)
  scores = tf.matmul(
      query_embeddings, tf.transpose(class_weights)) + class_biases

  target_one_hot = tf.one_hot(
      indices=features['candidate_movie_id_values'],
      depth=MOVIE_VOCAB_SIZE,
      on_value=1.0)
  loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
          labels=target_one_hot, logits=scores))

  optimizer = tf.contrib.layers.OPTIMIZER_CLS_NAMES[hparams.optimizer](
      learning_rate=hparams.learning_rate)
  if hparams.use_tpu:
    optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)
  train_op = tf.contrib.layers.optimize_loss(
      loss=loss,
      summaries=[],
      global_step=tf.contrib.framework.get_global_step(),
      optimizer=optimizer,
      learning_rate=None)
  return EstimatorSpec(
      mode=TRAIN, predictions=scores, loss=loss, train_op=train_op)
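For reference, a minimal sketch of the string-keyed optimizer lookup used here and in the matrix factorization `train()` above; the key spelling ('SGD', 'Adam', 'Momentum', ...) follows the TF 1.x `tf.contrib.layers` convention, and the `hparams` values are placeholders.

# Sketch only: TF 1.x contrib API; hparams values are illustrative.
import tensorflow as tf

hparams = tf.contrib.training.HParams(
    optimizer='Adam', learning_rate=0.01, use_tpu=False)

# OPTIMIZER_CLS_NAMES maps a string name to an optimizer constructor that
# accepts learning_rate, so one expression instantiates the chosen optimizer.
optimizer = tf.contrib.layers.OPTIMIZER_CLS_NAMES[hparams.optimizer](
    learning_rate=hparams.learning_rate)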
def model_fn(features, labels, mode, params):
  output_dim = params['output_dim']
  net = features
  shp = net.get_shape().as_list()
  flattened_shape = shp[1] * shp[2] * shp[3]
  net = tf.reshape(net, [shp[0], flattened_shape])
  net = tf.layers.dense(inputs=net, units=4, activation=tf.nn.relu)
  net = tf.layers.dropout(inputs=net, rate=0.5)
  net = tf.layers.dense(inputs=net, units=output_dim, activation=None)

  loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=net)
  learning_rate = tf.train.exponential_decay(
      0.01, tf.train.get_global_step(), 25000, 0.97)

  if FLAGS.use_tpu:
    optimizer = tpu_optimizer.CrossShardOptimizer(
        tf.train.GradientDescentOptimizer(learning_rate=learning_rate))
  else:
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)

  train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

  param_stats = tf.profiler.profile(
      tf.get_default_graph(),
      options=ProfileOptionBuilder.trainable_variables_parameter())
  fl_stats = tf.profiler.profile(
      tf.get_default_graph(),
      options=tf.profiler.ProfileOptionBuilder.float_operation())

  return tpu_estimator.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def tpu_resnet_model_fn(features, labels, mode, params):
  """Our model_fn for ResNet to be used with our TPUEstimator."""
  del params
  model_result = resnet_model_common(features, labels, mode)

  def metric_fn(labels, logits):
    accuracy = tf.metrics.accuracy(
        tf.argmax(input=labels, axis=1), tf.argmax(input=logits, axis=1))
    return {'accuracy': accuracy}

  optimizer = tpu_optimizer.CrossShardOptimizer(model_result.optimizer)
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  with tf.control_dependencies(update_ops):
    train_op = optimizer.minimize(
        model_result.loss, global_step=tf.train.get_global_step())

  return tpu_estimator.TPUEstimatorSpec(
      mode=mode,
      loss=model_result.loss,
      predictions=model_result.predictions,
      train_op=train_op,
      eval_metrics=(metric_fn, [labels, model_result.logits]))
def _build_optimizer(self, learning_rate):
  """Build optimizer."""
  if self.hparams.optimizer == 'sgd':
    tf.logging.info('Using SGD optimizer')
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
  elif self.hparams.optimizer == 'momentum':
    tf.logging.info('Using Momentum optimizer')
    optimizer = tf.train.MomentumOptimizer(
        learning_rate=learning_rate, momentum=self.hparams.momentum_rate)
  elif self.hparams.optimizer == 'rmsprop':
    tf.logging.info('Using RMSProp optimizer')
    optimizer = tf.train.RMSPropOptimizer(
        learning_rate,
        RMSPROP_DECAY,
        momentum=RMSPROP_MOMENTUM,
        epsilon=RMSPROP_EPSILON)
  else:
    tf.logging.fatal('Unknown optimizer: %s', self.hparams.optimizer)

  if self.hparams.use_tpu:
    optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)
  return optimizer
def char_rnn_model(features, labels, mode, params):
  """Character level recurrent neural network model to predict classes."""
  byte_vectors = tf.one_hot(features[CHARS_FEATURE], 256, 1., 0.)
  byte_list = tf.unstack(byte_vectors, axis=1)

  cell = tf.nn.rnn_cell.GRUCell(HIDDEN_SIZE)
  _, encoding = tf.nn.static_rnn(cell, byte_list, dtype=tf.float32)

  logits = tf.layers.dense(encoding, MAX_LABEL, activation=None)

  predicted_classes = tf.argmax(logits, 1)
  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions={
            'class': predicted_classes,
            'prob': tf.nn.softmax(logits)
        })

  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
  if mode == tf.estimator.ModeKeys.TRAIN:
    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    if FLAGS.use_tpu:
      # When using TPU, wrap the optimizer with CrossShardOptimizer which
      # handles synchronization details between different TPU cores. To the
      # user, this should look like regular synchronous training.
      optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)
    train_op = optimizer.minimize(
        loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

  eval_metric_ops = {
      'accuracy':
          tf.metrics.accuracy(labels=labels, predictions=predicted_classes)
  }
  return tf.estimator.EstimatorSpec(
      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
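Most of these model_fns only wrap the optimizer when a use_tpu flag is set; below is a minimal sketch of the TPUEstimator wiring that would drive one of them. The TPU name, model_dir, batch size, and input_fn are placeholders, not part of the snippets above.

# Sketch only: TF 1.x contrib TPU API with placeholder names.
import tensorflow as tf
from tensorflow.contrib import tpu as contrib_tpu

resolver = tf.contrib.cluster_resolver.TPUClusterResolver('my-tpu')
run_config = contrib_tpu.RunConfig(
    cluster=resolver,
    model_dir='/tmp/model',
    tpu_config=contrib_tpu.TPUConfig(iterations_per_loop=100))

estimator = contrib_tpu.TPUEstimator(
    model_fn=model_fn,  # e.g. one of the TPUEstimatorSpec-returning model_fns above
    use_tpu=True,       # pairs with the CrossShardOptimizer wrapping
    train_batch_size=1024,  # global batch; must divide across shards
    config=run_config)
# estimator.train(input_fn=train_input_fn, max_steps=10000)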
def model_fn(features, labels, mode, params): """Our model_fn for Densenet to be used with our Estimator.""" tf.logging.info("model_fn") with tf.variable_scope('cg', custom_getter=get_custom_getter()): if FLAGS.network_depth == 169: logits = densenet_model.densenet_imagenet_169( features, is_training=(mode == tf.estimator.ModeKeys.TRAIN)) elif FLAGS.network_depth == 201: logits = densenet_model.densenet_imagenet_201( features, is_training=(mode == tf.estimator.ModeKeys.TRAIN)) elif FLAGS.network_depth == 121: logits = densenet_model.densenet_imagenet_121( features, is_training=(mode == tf.estimator.ModeKeys.TRAIN)) else: tf.logging.info("Number of layers not supported, revert to 121") logits = densenet_model.densenet_imagenet_121( features, is_training=(mode == tf.estimator.ModeKeys.TRAIN)) logits = tf.cast(logits, tf.float32) # Calculate loss, which includes softmax cross entropy and L2 regularization. one_hot_labels = tf.one_hot(labels, _LABEL_CLASSES) cross_entropy = tf.losses.softmax_cross_entropy( logits=logits, onehot_labels=one_hot_labels) # Add weight decay to the loss. We exclude weight decay on the batch # normalization variables because it slightly improves accuracy. loss = cross_entropy + _WEIGHT_DECAY * tf.add_n([ tf.nn.l2_loss(v) for v in tf.trainable_variables() if "batch_normalization" not in v.name ]) global_step = tf.train.get_global_step() current_epoch = (tf.cast(global_step, tf.float32) / params["batches_per_epoch"]) learning_rate = learning_rate_schedule(current_epoch) # TODO(chrisying): this is a hack to get the LR and epoch for Tensorboard. # Reimplement this when TPU training summaries are supported. lr_repeat = tf.reshape( tf.tile(tf.expand_dims(learning_rate, 0), [ params["batch_size"], ]), [params["batch_size"], 1]) ce_repeat = tf.reshape( tf.tile(tf.expand_dims(current_epoch, 0), [ params["batch_size"], ]), [params["batch_size"], 1]) if mode == tf.estimator.ModeKeys.TRAIN: optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=_MOMENTUM) if FLAGS.use_tpu == True: optimizer = tpu_optimizer.CrossShardOptimizer(optimizer) # Batch norm requires update_ops to be added as a train_op dependency. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer.minimize(loss, global_step) else: train_op = None eval_metrics = None if mode == tf.estimator.ModeKeys.EVAL: def metric_fn(labels, logits, lr_repeat, ce_repeat): """Evaluation metric fn. Performed on CPU, do not reference TPU ops.""" predictions = tf.argmax(logits, axis=1) accuracy = tf.metrics.accuracy(tf.argmax(labels, axis=1), predictions) lr = tf.metrics.mean(lr_repeat) ce = tf.metrics.mean(ce_repeat) return { "accuracy": accuracy, "learning_rate": lr, "current_epoch": ce } eval_metrics = (metric_fn, [labels, logits, lr_repeat, ce_repeat]) param_stats = tf.profiler.profile( tf.get_default_graph(), options=ProfileOptionBuilder.trainable_variables_parameter()) fl_stats = tf.profiler.profile( tf.get_default_graph(), options=tf.profiler.ProfileOptionBuilder.float_operation()) return tpu_estimator.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op, eval_metrics=eval_metrics)
def model_fn(features, labels, mode, params):
  # Inference will happen in another way.
  assert mode != tf.estimator.ModeKeys.PREDICT

  network = lambda images, is_training: shufflenet(
      images, is_training,
      num_classes=params['num_classes'],
      depth_multiplier=params['depth_multiplier'])

  # Tensor `features` is a half precision tensor with shape
  # [height, width, 3, batch_size]; it represents RGB images
  # with values in [0, 1].
  images = features
  images = tf.transpose(images, [3, 0, 1, 2])  # HWCN to NHWC
  is_training = mode == tf.estimator.ModeKeys.TRAIN

  if params['use_bfloat16']:
    with bfloat16.bfloat16_scope():
      logits = network(images, is_training)
      logits = tf.to_float(logits)  # to full precision
  else:
    logits = network(images, is_training)

  with tf.name_scope('weight_decay'):
    add_weight_decay(params['weight_decay'])
    regularization_loss = tf.losses.get_regularization_loss()

  with tf.name_scope('cross_entropy'):
    one_hot_labels = tf.one_hot(labels, params['num_classes'])
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=one_hot_labels,
        label_smoothing=LABEL_SMOOTHING)

  total_loss = tf.losses.get_total_loss(add_regularization_losses=True)

  if mode == tf.estimator.ModeKeys.EVAL:
    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode, loss=total_loss, eval_metrics=(metric_fn, [labels, logits]))

  assert mode == tf.estimator.ModeKeys.TRAIN
  with tf.variable_scope('learning_rate_schedule'):
    global_step = tf.train.get_global_step()
    learning_rate = get_learning_rate(global_step, params)

  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  with tf.control_dependencies(update_ops), tf.variable_scope('optimizer'):
    optimizer = tf.train.MomentumOptimizer(
        learning_rate, momentum=MOMENTUM, use_nesterov=USE_NESTEROV)
    optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)
    train_op = optimizer.minimize(total_loss, global_step)

  with tf.control_dependencies([train_op]), tf.name_scope('ema'):
    ema = tf.train.ExponentialMovingAverage(
        decay=MOVING_AVERAGE_DECAY, num_updates=global_step)
    train_op = ema.apply(tf.trainable_variables())

  with tf.name_scope('train_accuracy_calculation'):
    predictions = tf.argmax(logits, axis=1, output_type=tf.int32)
    train_accuracy = tf.reduce_mean(
        tf.to_float(tf.equal(labels, predictions)), axis=0)

  tensors_to_summarize = [
      tf.reshape(global_step, [1]),
      tf.reshape(total_loss, [1]),
      tf.reshape(cross_entropy, [1]),
      tf.reshape(regularization_loss, [1]),
      tf.reshape(learning_rate, [1]),
      tf.reshape(train_accuracy, [1])
  ]

  def host_call_fn(global_step, total_loss, cross_entropy,
                   regularization_loss, learning_rate, train_accuracy):
    global_step = global_step[0]
    with summary.create_file_writer(
        params['model_dir'],
        max_queue=params['iterations_per_loop']).as_default():
      with summary.always_record_summaries():
        summary.scalar('entire_loss', total_loss[0], step=global_step)
        summary.scalar(
            'cross_entropy_loss', cross_entropy[0], step=global_step)
        summary.scalar(
            'regularization_loss', regularization_loss[0], step=global_step)
        summary.scalar('learning_rate', learning_rate[0], step=global_step)
        summary.scalar('train_accuracy', train_accuracy[0], step=global_step)
        return summary.all_summary_ops()

  return tf.contrib.tpu.TPUEstimatorSpec(
      mode=mode,
      loss=total_loss,
      train_op=train_op,
      host_call=(host_call_fn, tensors_to_summarize))
def model_fn(features, labels, mode, params): """Constructs DCGAN from individual generator and discriminator networks.""" del labels # Unconditional GAN does not use labels if mode == tf.estimator.ModeKeys.PREDICT: ########### # PREDICT # ########### # Pass only noise to PREDICT mode random_noise = features['random_noise'] predictions = { 'generated_images': model.generator(random_noise, is_training=False) } return tpu_estimator.TPUEstimatorSpec(mode=mode, predictions=predictions) # Use params['batch_size'] for the batch size inside model_fn batch_size = params['batch_size'] # pylint: disable=unused-variable real_images = features['real_images'] random_noise = features['random_noise'] is_training = (mode == tf.estimator.ModeKeys.TRAIN) generated_images = model.generator(random_noise, is_training=is_training) # Get logits from discriminator d_on_data_logits = tf.squeeze(model.discriminator(real_images)) d_on_g_logits = tf.squeeze(model.discriminator(generated_images)) # Calculate discriminator loss d_loss_on_data = tf.nn.sigmoid_cross_entropy_with_logits( labels=tf.ones_like(d_on_data_logits), logits=d_on_data_logits) d_loss_on_gen = tf.nn.sigmoid_cross_entropy_with_logits( labels=tf.zeros_like(d_on_g_logits), logits=d_on_g_logits) d_loss = d_loss_on_data + d_loss_on_gen # Calculate generator loss g_loss = tf.nn.sigmoid_cross_entropy_with_logits( labels=tf.ones_like(d_on_g_logits), logits=d_on_g_logits) if mode == tf.estimator.ModeKeys.TRAIN: ######### # TRAIN # ######### d_loss = tf.reduce_mean(d_loss) g_loss = tf.reduce_mean(g_loss) d_optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate, beta1=0.5) g_optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate, beta1=0.5) if FLAGS.use_tpu: d_optimizer = tpu_optimizer.CrossShardOptimizer(d_optimizer) g_optimizer = tpu_optimizer.CrossShardOptimizer(g_optimizer) with tf.control_dependencies(tf.get_collection( tf.GraphKeys.UPDATE_OPS)): d_step = d_optimizer.minimize(d_loss, var_list=tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope='Discriminator')) g_step = g_optimizer.minimize(g_loss, var_list=tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope='Generator')) increment_step = tf.assign_add( tf.train.get_or_create_global_step(), 1) joint_op = tf.group([d_step, g_step, increment_step]) return tpu_estimator.TPUEstimatorSpec(mode=mode, loss=g_loss, train_op=joint_op) elif mode == tf.estimator.ModeKeys.EVAL: ######## # EVAL # ######## def _eval_metric_fn(d_loss, g_loss): # When using TPUs, this function is run on a different machine than the # rest of the model_fn and should not capture any Tensors defined there return { 'discriminator_loss': tf.metrics.mean(d_loss), 'generator_loss': tf.metrics.mean(g_loss) } return tpu_estimator.TPUEstimatorSpec(mode=mode, loss=tf.reduce_mean(g_loss), eval_metrics=(_eval_metric_fn, [d_loss, g_loss])) # Should never reach here raise ValueError('Invalid mode provided to model_fn')
def resnet_model_fn(features, labels, mode, params):
  """The model_fn for ResNet to be used with TPUEstimator.

  Args:
    features: `Tensor` of batched images.
    labels: `Tensor` of labels for the data samples.
    mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL}`.
    params: `dict` of parameters passed to the model from the TPUEstimator,
      `params['batch_size']` is always provided and should be used as the
      effective batch size.

  Returns:
    A `TPUEstimatorSpec` for the model.
  """
  if isinstance(features, dict):
    features = features['feature']

  # In most cases, the default data format NCHW instead of NHWC should be
  # used for a significant performance boost on GPU/TPU. NHWC should be used
  # only if the network needs to be run on CPU since the pooling operations
  # are only supported on NHWC.
  if FLAGS.data_format == 'channels_first':
    features = tf.transpose(features, [0, 3, 1, 2])

  network = resnet_model.resnet_v1(
      resnet_depth=FLAGS.resnet_depth,
      num_classes=LABEL_CLASSES,
      data_format=FLAGS.data_format)
  logits = network(
      inputs=features, is_training=(mode == tf.estimator.ModeKeys.TRAIN))

  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        'classes': tf.argmax(logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        export_outputs={
            'classify': tf.estimator.export.PredictOutput(predictions)
        })

  # If necessary, in the model_fn, use params['batch_size'] instead of the
  # batch size flags (--train_batch_size or --eval_batch_size).
  batch_size = params['batch_size']  # pylint: disable=unused-variable

  # Calculate loss, which includes softmax cross entropy and L2 regularization.
  one_hot_labels = tf.one_hot(labels, LABEL_CLASSES)
  cross_entropy = tf.losses.softmax_cross_entropy(
      logits=logits, onehot_labels=one_hot_labels)

  # Add weight decay to the loss for non-batch-normalization variables.
  loss = cross_entropy + WEIGHT_DECAY * tf.add_n([
      tf.nn.l2_loss(v)
      for v in tf.trainable_variables()
      if 'batch_normalization' not in v.name
  ])

  host_call = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    # Compute the current epoch and associated learning rate from global_step.
    global_step = tf.train.get_global_step()
    batches_per_epoch = NUM_TRAIN_IMAGES / FLAGS.train_batch_size
    current_epoch = (tf.cast(global_step, tf.float32) / batches_per_epoch)
    learning_rate = learning_rate_schedule(current_epoch)

    optimizer = tf.train.MomentumOptimizer(
        learning_rate=learning_rate, momentum=MOMENTUM, use_nesterov=True)
    if FLAGS.use_tpu:
      # When using TPU, wrap the optimizer with CrossShardOptimizer which
      # handles synchronization details between different TPU cores. To the
      # user, this should look like regular synchronous training.
      optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

    # Batch normalization requires UPDATE_OPS to be added as a dependency to
    # the train operation.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      train_op = optimizer.minimize(loss, global_step)

    if not FLAGS.skip_host_call:

      def host_call_fn(gs, loss, lr, ce):
        """Training host call. Creates scalar summaries for training metrics.

        This function is executed on the CPU and should not directly reference
        any Tensors in the rest of the `model_fn`. To pass Tensors from the
        model to the `metric_fn`, provide as part of the `host_call`. See
        https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
        for more information.

        Arguments should match the list of `Tensor` objects passed as the
        second element in the tuple passed to `host_call`.

        Args:
          gs: `Tensor` with shape `[batch]` for the global_step.
          loss: `Tensor` with shape `[batch]` for the training loss.
          lr: `Tensor` with shape `[batch]` for the learning_rate.
          ce: `Tensor` with shape `[batch]` for the current_epoch.

        Returns:
          List of summary ops to run on the CPU host.
        """
        gs = gs[0]
        with summary.create_file_writer(FLAGS.model_dir).as_default():
          with summary.always_record_summaries():
            summary.scalar('loss', loss[0], step=gs)
            summary.scalar('learning_rate', lr[0], step=gs)
            summary.scalar('current_epoch', ce[0], step=gs)
            return summary.all_summary_ops()

      # To log the loss, current learning rate, and epoch for Tensorboard, the
      # summary op needs to be run on the host CPU via host_call. host_call
      # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
      # dimension. These Tensors are implicitly concatenated to
      # [params['batch_size']].
      gs_t = tf.reshape(global_step, [1])
      loss_t = tf.reshape(loss, [1])
      lr_t = tf.reshape(learning_rate, [1])
      ce_t = tf.reshape(current_epoch, [1])

      host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

  else:
    train_op = None

  eval_metrics = None
  if mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(labels, logits):
      """Evaluation metric function. Evaluates accuracy.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the
      model to the `metric_fn`, provide as part of the `eval_metrics`. See
      https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the
      second element in the tuple passed to `eval_metrics`.

      Args:
        labels: `Tensor` with shape `[batch]`.
        logits: `Tensor` with shape `[batch, num_classes]`.

      Returns:
        A dict of the metrics to return from evaluation.
      """
      predictions = tf.argmax(logits, axis=1)
      top_1_accuracy = tf.metrics.accuracy(labels, predictions)
      in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
      top_5_accuracy = tf.metrics.mean(in_top_5)

      return {
          'top_1_accuracy': top_1_accuracy,
          'top_5_accuracy': top_5_accuracy,
      }

    eval_metrics = (metric_fn, [labels, logits])

  return tpu_estimator.TPUEstimatorSpec(
      mode=mode,
      loss=loss,
      train_op=train_op,
      host_call=host_call,
      eval_metrics=eval_metrics)
def model_fn(features, labels, mode, params): """Mobilenet v1 model using Estimator API.""" num_classes = FLAGS.num_classes training_active = (mode == tf.estimator.ModeKeys.TRAIN) eval_active = (mode == tf.estimator.ModeKeys.EVAL) features = tensor_transform_fn(features, params['input_perm']) with bfloat16.bfloat16_scope(): if FLAGS.clear_update_collections: # updates_collections must be set to None in order to use fused batchnorm with arg_scope(mobilenet_v1.mobilenet_v1_arg_scope()): logits, end_points = mobilenet_v1.mobilenet_v1( features, num_classes, is_training=training_active, depth_multiplier=FLAGS.depth_multiplier) else: with arg_scope(mobilenet_v1.mobilenet_v1_arg_scope()): logits, end_points = mobilenet_v1.mobilenet_v1( features, num_classes, is_training=training_active, depth_multiplier=FLAGS.depth_multiplier) logits = tf.cast(logits, tf.float32) for k in end_points.keys(): end_points[k] = tf.cast(end_points[k], tf.float32) predictions = { 'classes': tf.argmax(input=logits, axis=1), 'probabilities': tf.nn.softmax(logits, name='softmax_tensor') } if mode == tf.estimator.ModeKeys.PREDICT: return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and ( not FLAGS.use_tpu): with tf.control_dependencies([ tf.Print(predictions['classes'], [predictions['classes']], summarize=FLAGS.eval_batch_size, message='prediction: ') ]): labels = tf.Print(labels, [labels], summarize=FLAGS.eval_batch_size, message='label: ') one_hot_labels = tf.one_hot(labels, FLAGS.num_classes, dtype=tf.int32) loss = tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels, logits=logits, weights=1.0, label_smoothing=0.1) #loss = tf.losses.get_total_loss(add_regularization_losses=True) loss += WEIGHT_DECAY * tf.add_n([ tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'batch_normalization' not in v.name ]) initial_learning_rate = FLAGS.learning_rate * FLAGS.train_batch_size / 256 final_learning_rate = 0.0001 * initial_learning_rate train_op = None if training_active: batches_per_epoch = _NUM_TRAIN_IMAGES // FLAGS.train_batch_size global_step = tf.train.get_or_create_global_step() learning_rate = tf.train.exponential_decay( learning_rate=initial_learning_rate, global_step=global_step, decay_steps=FLAGS.learning_rate_decay_epochs * batches_per_epoch, decay_rate=FLAGS.learning_rate_decay, staircase=True) # Set a minimum boundary for the learning rate. 
learning_rate = tf.maximum(learning_rate, final_learning_rate, name='learning_rate') if FLAGS.optimizer == 'sgd': tf.logging.info('Using SGD optimizer') optimizer = tf.train.GradientDescentOptimizer( learning_rate=learning_rate) elif FLAGS.optimizer == 'momentum': tf.logging.info('Using Momentum optimizer') optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9) elif FLAGS.optimizer == 'RMS': tf.logging.info('Using RMS optimizer') optimizer = tf.train.RMSPropOptimizer(learning_rate, RMSPROP_DECAY, momentum=RMSPROP_MOMENTUM, epsilon=RMSPROP_EPSILON) else: tf.logging.fatal('Unknown optimizer:', FLAGS.optimizer) if FLAGS.use_tpu: optimizer = tpu_optimizer.CrossShardOptimizer(optimizer) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer.minimize(loss, global_step=global_step) if FLAGS.moving_average: ema = tf.train.ExponentialMovingAverage(decay=MOVING_AVERAGE_DECAY, num_updates=global_step) variables_to_average = (tf.trainable_variables() + tf.moving_average_variables()) with tf.control_dependencies([train_op ]), tf.name_scope('moving_average'): train_op = ema.apply(variables_to_average) eval_metrics = None if eval_active: def metric_fn(labels, predictions): accuracy = tf.metrics.accuracy( labels, tf.argmax(input=predictions, axis=1)) return {'accuracy': accuracy} if FLAGS.use_logits: eval_predictions = logits else: eval_predictions = end_points['Predictions'] eval_metrics = (metric_fn, [labels, eval_predictions]) return tpu_estimator.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op, eval_metrics=eval_metrics)
def model_fn(features, labels, mode, config, params): """Estimator model function.""" # Not sure why it does this? del labels del config del params tf.get_variable_scope().set_initializer( tf.variance_scaling_initializer(1.0, mode="fan_avg", distribution="uniform")) # PREDICTION (e.g. evaluate) if mode == tf.estimator.ModeKeys.PREDICT: predictions, _, _ = model_params.estimator_prediction_fn(features) if include_features_in_predictions: predictions.update(features) if decode_keys: # Decode the raw ids into strings in prediction. def decode_host_call(tensor_dict): for key in decode_keys: predictions[key] = public_parsing_ops.decode( tensor_dict[key], model_params.vocab_filename, model_params.encoder_type) return tensor_dict contrib_tpu.outside_compilation(decode_host_call, predictions) return tpu_estimator.TPUEstimatorSpec(mode=mode, predictions=predictions) # TRAINING training = mode == tf.estimator.ModeKeys.TRAIN # use_tpu is false by default so this skips if use_tpu and model_params.use_bfloat16: with contrib_tpu.bfloat16_scope(): loss, outputs = model_params.model()(features, training) else: XENT_loss, outputs = model_params.model()(features, training) # XENT_loss, outputs = model_params.model().double_sampling(features, training, model_params.batch_size, # features["targets"].get_shape().as_list()[1], # mixed=True) # TPU requires outputs all have batch dimension and doesn't handle scalar. # Tile all scalars to 1 dimension vector. outputs = _tile_scalar_to_batch_size(outputs, model_params.batch_size) # Create optimizer and define learning rate if mode == tf.estimator.ModeKeys.TRAIN: init_lr = model_params.learning_rate global_step = tf.train.get_global_step() lr = init_lr / 0.01 * tf.rsqrt( tf.maximum(tf.to_float(global_step), 10000)) if train_init_checkpoint: lr = tf.minimum( tf.to_float(global_step + 1) / train_warmup_steps * init_lr, lr) optimizer = adafactor.AdafactorOptimizer( learning_rate=lr, decay_rate=adafactor.adafactor_decay_rate_pow(0.8), beta1=0.0) if use_tpu: optimizer = tpu_optimizer.CrossShardOptimizer(optimizer) ############################################################################################################### ##### VARIABLES ############################################################################################### # Create index tensors to stack and get corresponding probabilities from logp # max_seq_len = outputs["targets"].get_shape().as_list()[1] # sequence_index = tf.constant(np.arange(0, max_seq_len)) # batch_index = tf.constant(np.zeros(sequence_index.get_shape().as_list()[0]), dtype=tf.int64) ##### I.I.D SAMPLING ########################################################################################## """ Here we sample the tokens that are produced by teacher forcing. 
""" # Normalise logits to log-prob, and compute Gumbel samples with location # logit_probs = tf.math.softmax(outputs["logits"], axis=2) # should not be x <= 0 # clipped_logit_probs = tf.clip_by_value(logit_probs, 1e-8, 1.0) # logp = tf.log(clipped_logit_probs) # RETURNS TEACHER FORCING SAMPLED TOKEN VARIATIONS # argmax_logp_index, soft_logp_index, topk_out, z = iid_sampling(logp, max_seq_len, greedy=True, soft=False, # topk=False, k=2) # topk_probs, topk_indices = topk_out # TEST SAMPLING METHODS PROVIDED BY PEGASUS # sampled_BxT = iid_process_logits(outputs["logits"], max_seq_len, model_params.batch_size, # outputs["logits"].get_shape().as_list()[-1], # top_k=0, top_p=0.9, temperature=1.0) ##### DECODER SAMPLING ######################################################################################## """ Here we sample the tokens using the decoder. Beam size == 1. PREDS: IDs LOGP: transformed logits SCORE: scalar score using RISK trick LOGP: [BxTxV] beam logp LOGITS: [BxTxV] beam logits the dictionary contains the following keys: {ids, logp_BxT, sent_score, logp_BxTxV} # Note: the logp_BxTxV are analogous to z -> should be used for RELAX, preds are the BxT of these -> b=H(z), and # logp are the corresponding values (score is normalised to sentence score). """ # greedy_beam_params = {"_beam": 3, "top_k": 0, "top_p": 0.0, "temperature": 0.0} # random_beam_params = {"_beam": 3, "top_k": 0, "top_p": 0.0, "temperature": 1.0} # topk_beam_params = {"_beam": 3, "top_k": 10000, "top_p": 0.0, "temperature": 1.0} # topp_beam_params = {"_beam": 3, "top_k": 0, "top_p": 0.9, "temperature": 1.0} # greedy_dict = non_beam_sampling(model_params, features, max_seq_len, # beam_params=greedy_beam_params, sentence_score=False) # random_dict = non_beam_sampling(model_params, features, max_seq_len, # beam_params=random_beam_params, sentence_score=False) # topk_dict = non_beam_sampling(model_params, features, max_seq_len, # beam_params=topk_beam_params, sentence_score=False) # topp_dict = non_beam_sampling(model_params, features, max_seq_len, # beam_params=topp_beam_params, sentence_score=False) # BEAM SEARCH # greedy_dict = beam_sampling(model_params, features, max_seq_len, batch_index, sequence_index, # beam_params=greedy_beam_params) # random_dict = beam_sampling(model_params, features, max_seq_len, batch_index, sequence_index, # beam_params=random_beam_params) # topk_dict = beam_sampling(model_params, features, max_seq_len, batch_index, sequence_index, # beam_params=topk_beam_params) # topp_dict = beam_sampling(model_params, features, max_seq_len, batch_index, sequence_index, # beam_params=topp_beam_params) ##### RELAX VARIABLES ######################################################################################### """ Here we create the variables for RELAX. Pass in the logp, logits, and z that has already been sampled/created from manipulation. Will return z_tilde [BxTxV] and logp(b) [BxT]. """ # TEACHER FORCING SAMPLING # z_tilde, logp_b = create_variables(z, logp, batch_index, sequence_index, clipped_logit_probs) # DECODER SAMPLING -> sample_b is already argmaxed in decode loop # z_tilde, logp_b = create_variables_from_samples(random_dict["logits_BxTxV"], random_dict["logp_BxTxV"], # random_dict["ids"], batch_index, sequence_index) ##### TEXT AND ROUGE ########################################################################################## """ Here we first convert sequences to text, and calculate corresponding rouge scores/losses. 
""" # target_text = rouge_decoding(outputs["targets"], model_params) # TARGET SAMPLES # argmax_pred_text = rouge_decoding(argmax_logp_index, model_params) # ARGMAX SAMPLES # soft_pred_text = rouge_decoding(soft_logp_index, model_params) # SOFTMAX SAMPLES # additional_pred_text = rouge_decoding(sampled_BxT, model_params) # ADDITIONAL SAMPLES # Token-level ROUGE # ROUGE_token = tf.py_function(rouge_token,(outputs["targets"], random_dict["ids"], 0, 0), tf.float32) # CALCULATE ROUGE LOSS: ROUGE score -> ROUGE loss = -ROUGE score # NOTE: for ROUGE variant, change value (0: precision, 1: recall, 2: f1) # rouge_loss_argmax = -tf.py_function(evaluate_rl, (target_text, argmax_pred_text, 2), tf.float32) # rouge_loss_soft = -tf.py_function(evaluate_rl, (target_text, soft_pred_text, 2), tf.float32) # rouge_loss_extra = -tf.py_function(evaluate_rl, (target_text, additional_pred_text, 2), tf.float32) ##### REINFORCE LOSS ########################################################################################## """ Calculate standard REINFORCE loss. Can be document-level (score using RISK trick), or token-level [BxT]. """ # FIND CORRESPONDING LOG_PROBS OF THE I.I.D SAMPLED TOKENS # ARGMAX -> logp(argmax(y)) # argmax_logp = iid_log_probs(argmax_logp_index, batch_index, sequence_index, logp) # SOFTMAX -> logp(sample_y) # softmax_logp = iid_log_probs(soft_logp_index, batch_index, sequence_index, logp) # ADDITIONAL # additional_logp = iid_log_probs(sampled_BxT, batch_index, sequence_index, logp) # CHANGE BELOW IF USING DECODER SAMPLED TOKENS/SCORES # weight the logp by ROUGE score (neg ROUGE_loss), sum values # reinforce_loss = tf.reduce_sum(tf.multiply(rouge_loss_argmax, argmax_logp)) ##### REINFORCE w/ BASELINE ################################################################################### """ Calculate RwB using Socher's loss function (2017). Optional: use a Q_func as baseline. """ # improve the probs of the SOFT labels (soft - hard)*soft_logp # improve the probs of the HARD labels (hard - soft)*hard_logp # BASELINE: CONTROL VARIATE # ffn_output = control_variate(source, targets) # with tf.variable_scope("Q_func"): # cv = rwb_Q_func(tf.reshape(softmax_logp, [1, 32]), tf.reshape(additional_logp, [1, 32])) # cv_loss = tf.reduce_mean(tf.square(tf.subtract(rouge_loss_argmax, cv))) # loss_difference = tf.subtract(rouge_loss_soft, rouge_loss_argmax) # reinforce_baseline = tf.reduce_sum(tf.multiply(loss_difference, softmax_logp)) # BASELINE: HINGE LOSS # rouge_soft = -rouge_loss_soft # rouge_hard = -rouge_loss_argmax # hinge = -tf.maximum((rouge_soft - rouge_hard), 0) # hinge_baseline = tf.reduce_sum(tf.multiply(hinge, softmax_logp)) ##### REINFORCE w/ THRESHOLD ################################################################################## """ Calculate REINFORCE with a constant threshold as the baseline. """ # we take output of ROUGE score as ROUGE_loss = -ROUGE score # intermediate_loss = tf.reduce_sum(tf.multiply(tf.subtract(0.3, -rouge_loss_argmax), argmax_logp)) ##### EXPECTED RISK MINIMISATION ############################################################################## """ Calculate the RISK loss using n sequences from sampling process. 
""" # L_risk = risk_loss(model_params.batch_size, max_seq_len, # rouge_losses=[rouge_loss_argmax, rouge_loss_soft, rouge_loss_extra], # logps=[topk_dict["logp1"], topk_dict["logp2"], topk_dict["logp3"]], n=3) ##### MIXED LOSS ############################################################################################## """ Implement a mixed loss function that is weighted by an alpha term. """ # combined_loss = tf.math.add(tf.multiply(tf.constant(0.3, dtype=tf.float32), XENT_loss), # tf.multiply(tf.constant(0.7, dtype=tf.float32), L_risk)) # OR conditional loss switch # constraint = tf.random_uniform(shape=(), minval=0, maxval=1, dtype=tf.float32) # combined_loss = tf.cond(constraint > 0.8, lambda: hard_reinforce_loss, lambda: XENT_loss) ##### RELAX CONTROL VARIATE ################################################################################### """ Prepare the target sequence for use in the control variate. """ # z = random_dict["logp_BxTxV"] # z_target, zt_target = create_cv_target(outputs, batch_index, sequence_index, z, z_tilde) ##### RELAX LOSS ############################################################################################## """ Manipulate z and z_tilde using the Q_func to mimic ROUGE loss. """ # with tf.variable_scope("Q_func"): # c_z = Q_func(z, z_target) # with tf.variable_scope("Q_func", reuse=True): # c_z_tilde = Q_func(z_tilde, zt_target) # Formulate RELAX as a loss function # f_y = rouge_loss_soft # negative for loss (defined above) # c_z_tilde1 = tf.stop_gradient(tf.identity(c_z_tilde)) # clone, detach, stop grad # L_relax = tf.reduce_sum(((f_y - c_z_tilde1)*logp_b) - c_z_tilde + c_z) # OR construct gradient estimator # theta = [tv for tv in tf.trainable_variables() if "Q_func" not in tv.name] # d_logp_d_theta = tf.gradients(logp_b, theta)[0] # logp # d_c_z_tilde_d_theta = tf.gradients(c_z_tilde, theta)[0] # d_c_z_d_theta = tf.gradients(c_z, theta)[0] # relax = tf.reduce_sum(f_y - c_z_tilde)*d_logp_d_theta - d_c_z_tilde_d_theta + d_c_z_d_theta # relax = tf.gradients(L_relax, theta)[0] # Calculate the first optimization step with loss # list_of_gradient_variable_pairs = optimizer.compute_gradients(L_relax) # train_op = optimizer.apply_gradients(list_of_gradient_variable_pairs, global_step=global_step) # Variance reduction objective # variance_loss = tf.reduce_mean(tf.square(relax), name="variance_loss") # initialise adafactor again for variance optimiser # var_opt = adafactor.AdafactorOptimizer( # learning_rate=lr, # decay_rate=adafactor.adafactor_decay_rate_pow(0.8), # beta1=0.0) # est_params = [eta, log_temperature] # TODO: REBAR implementation # Adds the parameters of the FFNN # nn_params = [tv for tv in tf.trainable_variables() if "Q_func" in tv.name] # est_params = nn_params # est_params = est_params + nn_params # TODO: REBAR implementation # Additional optimization step # var_gradvars = var_opt.compute_gradients(variance_loss, var_list=est_params) # var_train_op = var_opt.apply_gradients(var_gradvars) # This may allow for both train ops to be passed in the return statement below? 
# with tf.control_dependencies([train_op, var_train_op]): # train_op = tf.no_op() ############################################################################################################### # Calculate gradients # If freezing layers, only optimise wrt certain layers (find names) - speeds up, worsens performance # last_params = [tv for tv in tf.trainable_variables() if "decoder/LayerNorm/" in tv.name] # list_of_gradient_variable_pairs = optimizer.compute_gradients(combined_loss, var_list=last_params) list_of_gradient_variable_pairs = optimizer.compute_gradients( XENT_loss) train_op = optimizer.apply_gradients( list_of_gradient_variable_pairs, global_step=global_step) tf.logging.set_verbosity(tf.logging.INFO) # Debugging steps - add into logging hook directly if needed # tf.debugging.check_numerics(sum_logp, "DEBUG: sum_logp has a NaN") logging_hook = tf.train.LoggingTensorHook( { "loss": XENT_loss, # "variance_loss": variance_loss, # "cv_loss": cv_loss, "learning_rate": lr, "global_step": global_step, }, every_n_iter=5) # This is the configured estimator function that is returned to train the model return tpu_estimator.TPUEstimatorSpec( mode=mode, loss=XENT_loss, train_op=train_op, training_hooks=[logging_hook], scaffold_fn=_load_vars_from_checkpoint(use_tpu, train_init_checkpoint), host_call=add_scalars_to_summary( model_dir, { "learning_rate": lr, # "rouge_loss_hard": rouge_loss_argmax, # "rouge_loss_soft": rouge_loss_soft, # "rouge_loss_extra": rouge_loss_extra, # "reinforce_loss": reinforce_loss, # "risk_loss": L_risk, # "XENT_loss": XENT_loss, })) # EVALUATION (evaluating the performance) if mode == tf.estimator.ModeKeys.EVAL: eval_metrics = model_params.estimator_eval_metrics_fn( features, outputs) return tpu_estimator.TPUEstimatorSpec(mode=mode, loss=XENT_loss, eval_metrics=eval_metrics)
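# A minimal sketch (not part of the original file) of the self-critical
# baseline that the commented-out "REINFORCE w/ BASELINE" block above
# experiments with. The argument names are hypothetical stand-ins for
# softmax_logp, -rouge_loss_soft and -rouge_loss_argmax respectively.
def self_critical_loss(sampled_logp, sampled_reward, greedy_reward):
    """Returns -sum_b (r(y_sample_b) - r(y_greedy_b)) * logp(y_sample_b).

    Args:
        sampled_logp: [B] log-probabilities of the sampled summaries.
        sampled_reward: [B] e.g. ROUGE of the sampled summaries.
        greedy_reward: [B] e.g. ROUGE of the greedily decoded summaries,
            acting as the baseline.
    """
    # The rewards come from a non-differentiable scorer (e.g. tf.py_function),
    # so stop_gradient here is defensive rather than strictly required.
    advantage = tf.stop_gradient(sampled_reward - greedy_reward)
    # Minimising the negated advantage-weighted log-prob raises the
    # probability of samples that beat the greedy baseline.
    return -tf.reduce_sum(advantage * sampled_logp)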
def inception_model_fn(features, labels, mode, params):
    """Inception v3 model using Estimator API."""
    num_classes = FLAGS.num_classes
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    is_eval = (mode == tf.estimator.ModeKeys.EVAL)

    features = tensor_transform_fn(features, params['input_perm'])

    # This nested function allows us to avoid duplicating the logic which
    # builds the network, for different values of --precision.
    def build_network():
        if FLAGS.precision == 'bfloat16':
            with bfloat16.bfloat16_scope():
                logits, end_points = inception.inception_v3(
                    features, num_classes, is_training=is_training)
            logits = tf.cast(logits, tf.float32)
        elif FLAGS.precision == 'float32':
            logits, end_points = inception.inception_v3(
                features, num_classes, is_training=is_training)
        return logits, end_points

    if FLAGS.clear_update_collections:
        # updates_collections must be set to None in order to use fused batchnorm.
        with arg_scope(
                inception.inception_v3_arg_scope(
                    weight_decay=0.0,
                    batch_norm_decay=BATCH_NORM_DECAY,
                    batch_norm_epsilon=BATCH_NORM_EPSILON,
                    updates_collections=None)):
            logits, end_points = build_network()
    else:
        with arg_scope(
                inception.inception_v3_arg_scope(
                    batch_norm_decay=BATCH_NORM_DECAY,
                    batch_norm_epsilon=BATCH_NORM_EPSILON)):
            logits, end_points = build_network()

    predictions = end_points
    predictions.update({
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    })

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and (
            not FLAGS.use_tpu):
        with tf.control_dependencies([
                tf.Print(
                    predictions['classes'], [predictions['classes']],
                    summarize=FLAGS.eval_batch_size,
                    message='prediction: ')
        ]):
            labels = tf.Print(
                labels, [labels],
                summarize=FLAGS.eval_batch_size,
                message='label: ')

    one_hot_labels = tf.one_hot(labels, FLAGS.num_classes, dtype=tf.int32)

    if 'AuxLogits' in end_points:
        tf.losses.softmax_cross_entropy(
            onehot_labels=one_hot_labels,
            logits=tf.cast(end_points['AuxLogits'], tf.float32),
            weights=0.4,
            label_smoothing=0.1,
            scope='aux_loss')

    tf.losses.softmax_cross_entropy(
        onehot_labels=one_hot_labels,
        logits=logits,
        weights=1.0,
        label_smoothing=0.1)

    losses = tf.add_n(tf.losses.get_losses())
    l2_loss = []
    for v in tf.trainable_variables():
        if 'BatchNorm' not in v.name and 'weights' in v.name:
            l2_loss.append(tf.nn.l2_loss(v))
    loss = losses + WEIGHT_DECAY * tf.add_n(l2_loss)

    initial_learning_rate = FLAGS.learning_rate * FLAGS.train_batch_size / 256
    if FLAGS.use_learning_rate_warmup:
        # Adjust the initial learning rate to match the final warmup rate.
        warmup_decay = FLAGS.learning_rate_decay**(
            (FLAGS.warmup_epochs + FLAGS.cold_epochs) /
            FLAGS.learning_rate_decay_epochs)
        adj_initial_learning_rate = initial_learning_rate * warmup_decay

    final_learning_rate = 0.0001 * initial_learning_rate

    host_call = None
    train_op = None
    if is_training:
        batches_per_epoch = _NUM_TRAIN_IMAGES / FLAGS.train_batch_size
        global_step = tf.train.get_or_create_global_step()
        current_epoch = tf.cast(
            (tf.cast(global_step, tf.float32) / batches_per_epoch), tf.int32)

        learning_rate = tf.train.exponential_decay(
            learning_rate=initial_learning_rate,
            global_step=global_step,
            decay_steps=int(FLAGS.learning_rate_decay_epochs * batches_per_epoch),
            decay_rate=FLAGS.learning_rate_decay,
            staircase=True)

        if FLAGS.use_learning_rate_warmup:
            wlr = 0.1 * adj_initial_learning_rate
            wlr_height = tf.cast(
                0.9 * adj_initial_learning_rate /
                (FLAGS.warmup_epochs + FLAGS.learning_rate_decay_epochs - 1),
                tf.float32)
            epoch_offset = tf.cast(FLAGS.cold_epochs - 1, tf.int32)
            exp_decay_start = (FLAGS.warmup_epochs + FLAGS.cold_epochs +
                               FLAGS.learning_rate_decay_epochs)
            lin_inc_lr = tf.add(
                wlr,
                tf.multiply(
                    tf.cast(tf.subtract(current_epoch, epoch_offset), tf.float32),
                    wlr_height))
            learning_rate = tf.where(
                tf.greater_equal(current_epoch, FLAGS.cold_epochs),
                (tf.where(tf.greater_equal(current_epoch, exp_decay_start),
                          learning_rate, lin_inc_lr)), wlr)

        # Set a minimum boundary for the learning rate.
        learning_rate = tf.maximum(
            learning_rate, final_learning_rate, name='learning_rate')

        if FLAGS.optimizer == 'sgd':
            tf.logging.info('Using SGD optimizer')
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        elif FLAGS.optimizer == 'momentum':
            tf.logging.info('Using Momentum optimizer')
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=learning_rate, momentum=0.9)
        elif FLAGS.optimizer == 'RMS':
            tf.logging.info('Using RMS optimizer')
            optimizer = tf.train.RMSPropOptimizer(
                learning_rate,
                RMSPROP_DECAY,
                momentum=RMSPROP_MOMENTUM,
                epsilon=RMSPROP_EPSILON)
        else:
            tf.logging.fatal('Unknown optimizer: %s', FLAGS.optimizer)

        if FLAGS.use_tpu:
            optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=global_step)

        if FLAGS.moving_average:
            ema = tf.train.ExponentialMovingAverage(
                decay=MOVING_AVERAGE_DECAY, num_updates=global_step)
            variables_to_average = (
                tf.trainable_variables() + tf.moving_average_variables())
            with tf.control_dependencies([train_op]), tf.name_scope('moving_average'):
                train_op = ema.apply(variables_to_average)

        # To log the loss, current learning rate, and epoch for TensorBoard, the
        # summary op needs to be run on the host CPU via host_call. host_call
        # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
        # dimension. These Tensors are implicitly concatenated to
        # [params['batch_size']].
        gs_t = tf.reshape(global_step, [1])
        loss_t = tf.reshape(loss, [1])
        lr_t = tf.reshape(learning_rate, [1])
        ce_t = tf.reshape(current_epoch, [1])

        if not FLAGS.skip_host_call:
            def host_call_fn(gs, loss, lr, ce):
                """Training host call. Creates scalar summaries for training metrics.

                This function is executed on the CPU and should not directly
                reference any Tensors in the rest of the `model_fn`. To pass
                Tensors from the model to the `metric_fn`, provide them as part
                of the `host_call`. See
                https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
                for more information.

                Arguments should match the list of `Tensor` objects passed as
                the second element in the tuple passed to `host_call`.

                Args:
                    gs: `Tensor` with shape `[batch]` for the global step.
                    loss: `Tensor` with shape `[batch]` for the training loss.
                    lr: `Tensor` with shape `[batch]` for the learning rate.
                    ce: `Tensor` with shape `[batch]` for the current epoch.

                Returns:
                    List of summary ops to run on the CPU host.
                """
                gs = gs[0]
                with summary.create_file_writer(FLAGS.model_dir).as_default():
                    with summary.always_record_summaries():
                        summary.scalar('loss', tf.reduce_mean(loss), step=gs)
                        summary.scalar('learning_rate', tf.reduce_mean(lr), step=gs)
                        summary.scalar('current_epoch', tf.reduce_mean(ce), step=gs)
                        return summary.all_summary_ops()

            host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

    eval_metrics = None
    if is_eval:
        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates accuracy.

            This function is executed on the CPU and should not directly
            reference any Tensors in the rest of the `model_fn`. To pass
            Tensors from the model to the `metric_fn`, provide them as part of
            the `eval_metrics`. See
            https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
            for more information.

            Arguments should match the list of `Tensor` objects passed as the
            second element in the tuple passed to `eval_metrics`.

            Args:
                labels: `Tensor` with shape `[batch]`.
                logits: `Tensor` with shape `[batch, num_classes]`.

            Returns:
                A dict of the metrics to return from evaluation.
            """
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)
            return {
                'accuracy': top_1_accuracy,
                'accuracy@5': top_5_accuracy,
            }

        eval_metrics = (metric_fn, [labels, logits])

    return tpu_estimator.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        host_call=host_call,
        eval_metrics=eval_metrics)
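# A minimal sketch (not part of the original file) isolating the manual weight
# decay used in inception_model_fn above: L2 is applied only to variables whose
# names contain 'weights', never to BatchNorm parameters, and the result is
# scaled by a weight-decay constant (WEIGHT_DECAY in the function above).
def l2_weight_decay(weight_decay):
    l2_terms = [
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if 'BatchNorm' not in v.name and 'weights' in v.name
    ]
    # Assumes at least one matching variable exists; tf.add_n fails on [].
    return weight_decay * tf.add_n(l2_terms)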
def model_fn(features, labels, mode, params=None):
    """Constructs the object detection model.

    Args:
        features: Dictionary of feature tensors, returned from `input_fn`.
        labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
            otherwise None.
        mode: Mode key from tf.estimator.ModeKeys.
        params: Parameter dictionary passed from the estimator.

    Returns:
        An `EstimatorSpec` that encapsulates the model and its serving
        configurations.
    """
    params = params or {}
    total_loss, train_op, detections, export_outputs = None, None, None, None
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    detection_model = detection_model_fn(
        is_training=is_training, add_summaries=(not use_tpu))
    scaffold_fn = None

    if mode == tf.estimator.ModeKeys.TRAIN:
        labels = unstack_batch(
            labels,
            unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors)
    elif mode == tf.estimator.ModeKeys.EVAL:
        labels = unstack_batch(labels, unpad_groundtruth_tensors=False)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes]
        gt_classes_list = labels[fields.InputDataFields.groundtruth_classes]
        gt_masks_list = None
        if fields.InputDataFields.groundtruth_instance_masks in labels:
            gt_masks_list = labels[
                fields.InputDataFields.groundtruth_instance_masks]
        gt_keypoints_list = None
        if fields.InputDataFields.groundtruth_keypoints in labels:
            gt_keypoints_list = labels[
                fields.InputDataFields.groundtruth_keypoints]
        detection_model.provide_groundtruth(
            groundtruth_boxes_list=gt_boxes_list,
            groundtruth_classes_list=gt_classes_list,
            groundtruth_masks_list=gt_masks_list,
            groundtruth_keypoints_list=gt_keypoints_list)

    preprocessed_images = features[fields.InputDataFields.image]
    prediction_dict = detection_model.predict(
        preprocessed_images, features[fields.InputDataFields.true_image_shape])
    detections = detection_model.postprocess(
        prediction_dict, features[fields.InputDataFields.true_image_shape])

    if mode == tf.estimator.ModeKeys.TRAIN:
        if train_config.fine_tune_checkpoint and hparams.load_pretrained:
            asg_map = detection_model.restore_map(
                from_detection_checkpoint=train_config.from_detection_checkpoint,
                load_all_detection_checkpoint_vars=(
                    train_config.load_all_detection_checkpoint_vars))
            available_var_map = (
                variables_helper.get_variables_available_in_checkpoint(
                    asg_map,
                    train_config.fine_tune_checkpoint,
                    include_global_step=False))
            if use_tpu:
                def tpu_scaffold():
                    tf.train.init_from_checkpoint(
                        train_config.fine_tune_checkpoint, available_var_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(
                    train_config.fine_tune_checkpoint, available_var_map)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        losses_dict = detection_model.loss(
            prediction_dict, features[fields.InputDataFields.true_image_shape])
        # .values() rather than .itervalues(), so this also runs under Python 3.
        losses = [loss_tensor for loss_tensor in losses_dict.values()]
        total_loss = tf.add_n(losses, name='total_loss')

    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_or_create_global_step()
        training_optimizer, optimizer_summary_vars = optimizer_builder.build(
            train_config.optimizer)

        if use_tpu:
            training_optimizer = tpu_optimizer.CrossShardOptimizer(
                training_optimizer)

        # Optionally freeze some layers by setting their gradients to be zero.
        trainable_variables = None
        if train_config.freeze_variables:
            trainable_variables = tf.contrib.framework.filter_variables(
                tf.trainable_variables(),
                exclude_patterns=train_config.freeze_variables)

        clip_gradients_value = None
        if train_config.gradient_clipping_by_norm > 0:
            clip_gradients_value = train_config.gradient_clipping_by_norm

        if not use_tpu:
            for var in optimizer_summary_vars:
                tf.summary.scalar(var.op.name, var)
        summaries = [] if use_tpu else None
        train_op = tf.contrib.layers.optimize_loss(
            loss=total_loss,
            global_step=global_step,
            learning_rate=None,
            clip_gradients=clip_gradients_value,
            optimizer=training_optimizer,
            variables=trainable_variables,
            summaries=summaries,
            name='')  # Preventing scope prefix on all variables.

    if mode == tf.estimator.ModeKeys.PREDICT:
        export_outputs = {
            tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
                tf.estimator.export.PredictOutput(detections)
        }

    eval_metric_ops = None
    if mode == tf.estimator.ModeKeys.EVAL:
        # Detection summaries during eval.
        class_agnostic = (
            fields.DetectionResultFields.detection_classes not in detections)
        groundtruth = _get_groundtruth_data(detection_model, class_agnostic)
        eval_dict = eval_util.result_dict_for_single_example(
            tf.expand_dims(
                features[fields.InputDataFields.original_image][0], 0),
            features[inputs.HASH_KEY][0],
            detections,
            groundtruth,
            class_agnostic=class_agnostic,
            scale_to_absolute=False)

        if class_agnostic:
            category_index = (
                label_map_util.create_class_agnostic_category_index())
        else:
            category_index = label_map_util.create_category_index_from_labelmap(
                eval_input_config.label_map_path)
        detection_and_groundtruth = (
            vis_utils.draw_side_by_side_evaluation_image(
                eval_dict,
                category_index,
                max_boxes_to_draw=20,
                min_score_thresh=0.2))
        if not use_tpu:
            tf.summary.image('Detections_Left_Groundtruth_Right',
                             detection_and_groundtruth)

        # Eval metrics on a single image.
        detection_fields = fields.DetectionResultFields()
        input_data_fields = fields.InputDataFields()
        coco_evaluator = coco_evaluation.CocoDetectionEvaluator(
            list(category_index.values()))
        eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(
            image_id=eval_dict[input_data_fields.key],
            groundtruth_boxes=eval_dict[input_data_fields.groundtruth_boxes],
            groundtruth_classes=eval_dict[
                input_data_fields.groundtruth_classes],
            detection_boxes=eval_dict[detection_fields.detection_boxes],
            detection_scores=eval_dict[detection_fields.detection_scores],
            detection_classes=eval_dict[detection_fields.detection_classes])

    if use_tpu:
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            scaffold_fn=scaffold_fn,
            predictions=detections,
            loss=total_loss,
            train_op=train_op,
            eval_metrics=eval_metric_ops,
            export_outputs=export_outputs)
    else:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=detections,
            loss=total_loss,
            train_op=train_op,
            eval_metric_ops=eval_metric_ops,
            export_outputs=export_outputs)
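# A minimal sketch (an assumption, not part of the original file) of what the
# tf.contrib.framework.filter_variables call above does in spirit: drop every
# trainable variable whose name matches one of the exclude patterns, so the
# optimizer never updates frozen layers. `freeze_patterns` is a hypothetical
# list of regular-expression strings, mirroring train_config.freeze_variables.
import re

def variables_to_train(freeze_patterns):
    return [
        v for v in tf.trainable_variables()
        if not any(re.search(p, v.op.name) for p in freeze_patterns)
    ]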