def model_fn(features, labels, mode, params):
  im_mode = MODEKEY_TO_MODE[mode]
  model_config = configuration.ModelConfig()
  training_config = configuration.TrainingConfig()
  model = show_and_tell_model.ShowAndTellModel(
      model_config, mode=im_mode, train_inception=FLAGS.train_inception)
  model.build_model_for_tpu(images=features["images"],
                            input_seqs=features["input_seqs"],
                            target_seqs=features["target_seqs"],
                            input_mask=features["input_mask"])
  optimizer = tf.train.GradientDescentOptimizer(
      learning_rate=training_config.initial_learning_rate)
  optimizer = contrib_estimator.clip_gradients_by_norm(
      optimizer, training_config.clip_gradients)
  if FLAGS.use_tpu:
    optimizer = contrib_tpu.CrossShardOptimizer(optimizer)
  train_op = optimizer.minimize(
      model.total_loss, global_step=tf.train.get_or_create_global_step())

  def scaffold_fn():
    """Load pretrained Inception checkpoint at initialization time."""
    return tf.train.Scaffold(init_fn=model.init_fn)

  return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                      loss=model.total_loss,
                                      train_op=train_op,
                                      scaffold_fn=scaffold_fn)
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
  """Creates an optimizer training op."""
  global_step = tf.train.get_or_create_global_step()

  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

  # Implements linear decay of the learning rate.
  learning_rate = tf.train.polynomial_decay(learning_rate,
                                            global_step,
                                            num_train_steps,
                                            end_learning_rate=0.0,
                                            power=1.0,
                                            cycle=False)

  # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
  # learning rate will be `global_step/num_warmup_steps * init_lr`.
  if num_warmup_steps:
    global_steps_int = tf.cast(global_step, tf.int32)
    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = init_lr * warmup_percent_done

    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    learning_rate = ((1.0 - is_warmup) * learning_rate +
                     is_warmup * warmup_learning_rate)

  # It is recommended that you use this optimizer for fine-tuning, since this
  # is how the model was trained (note that the Adam m/v variables are NOT
  # loaded from init_checkpoint.)
  optimizer = AdamWeightDecayOptimizer(
      learning_rate=learning_rate,
      weight_decay_rate=0.01,
      beta_1=0.9,
      beta_2=0.999,
      epsilon=1e-6,
      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

  if use_tpu:
    optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

  tvars = tf.trainable_variables()
  grads = tf.gradients(loss, tvars)

  # This is how the model was pre-trained.
  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

  train_op = optimizer.apply_gradients(zip(grads, tvars),
                                       global_step=global_step)

  # Normally the global step update is done inside of `apply_gradients`.
  # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
  # a different optimizer, you should probably take this line out.
  new_global_step = global_step + 1
  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
  return train_op
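For readers who want to sanity-check the schedule built above, here is a minimal, self-contained sketch of the same linear warmup followed by linear (power=1.0) polynomial decay, written without TensorFlow ops. The function name and the numeric values (init_lr, num_train_steps, num_warmup_steps) are hypothetical, not taken from the snippet.

# Plain-Python sketch of the warmup + linear-decay schedule above.
def schedule(step, init_lr=3e-5, num_train_steps=1000, num_warmup_steps=100):
  # polynomial_decay with power=1.0, end_learning_rate=0.0, cycle=False.
  decayed = init_lr * (1.0 - min(step, num_train_steps) / num_train_steps)
  # Linear warmup overrides the decayed value while step < num_warmup_steps.
  if step < num_warmup_steps:
    return init_lr * step / num_warmup_steps
  return decayed

# The rate ramps up over the first 100 steps, then decays linearly toward 0.
print([round(schedule(s), 8) for s in (0, 50, 100, 500, 1000)])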
def optimizer(lr):
  opt = tf.train.AdamOptimizer(learning_rate=lr,
                               beta1=opt_cfg.beta1,
                               beta2=opt_cfg.beta2,
                               epsilon=opt_cfg.epsilon)
  if is_tpu:
    opt = contrib_tpu.CrossShardOptimizer(opt)
  return opt
def model_fn(features, labels, mode, params): """TPUEstimatorSpec for the Squeezenet model.""" is_training = mode == tf.estimator.ModeKeys.TRAIN logits = squeezenet(features, is_training=is_training, num_classes=params["num_classes"]) loss = tf.reduce_mean( tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels)) global_batch_size = (params["train"]["num_cores_per_replica"] * params["train"]["train_batch_size"]) decay_steps = (params["train"]["num_examples_per_epoch"] * params["train"]["num_epochs"]) // global_batch_size learning_rate = tf.train.polynomial_decay( params["train"]["learning_rate"]["init_learning_rate"], global_step=tf.train.get_or_create_global_step(), end_learning_rate=params["train"]["learning_rate"] ["end_learning_rate"], decay_steps=decay_steps, power=1.0, cycle=False) # TODO(power): Hack copied from resnet: remove when summaries are working. lr_repeat = tf.reshape( tf.tile(tf.expand_dims(learning_rate, 0), [ params["train"]["train_batch_size"], ]), [params["train"]["train_batch_size"], 1]) if params["train"]["optimizer"]["type"] == "adam": optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) elif params["train"]["optimizer"]["type"] == "rmsprop": optimizer = tf.train.RMSPropOptimizer( learning_rate=learning_rate, momentum=params["train"]["optimizer"]["momentum"], epsilon=1.0) else: optimizer = tf.train.MomentumOptimizer( learning_rate=learning_rate, momentum=params["train"]["optimizer"]["momentum"], use_nesterov=True) if params["use_tpu"]: optimizer = contrib_tpu.CrossShardOptimizer(optimizer) train_op = optimizer.minimize(loss, tf.train.get_global_step()) return contrib_tpu.TPUEstimatorSpec( mode=mode, loss=loss, train_op=train_op, eval_metrics=(metric_fn, [labels, logits, lr_repeat]), predictions={ "classes": tf.argmax(input=logits, axis=1), "probabilities": tf.nn.softmax(logits, name="softmax_tensor") }, )
def my_model(features, labels, mode, params):
  """Deep Neural Network (DNN) model.

  This is a DNN model with three layers: the first two are hidden layers with
  10 neurons each, and the last layer has one neuron per output class. The
  network is densely connected, i.e. every neuron in one layer is connected
  to every neuron in the next layer.

  Args:
    features: Feature values for the input samples.
    labels: Label/class assigned to the corresponding input sample.
    mode: "TRAIN"/"EVAL"/"PREDICT".
    params: Dictionary used to pass extra parameters to the model function
      from the main function.

  Returns:
    A TPUEstimatorSpec object.
  """
  # Create three fully connected layers.
  net = tf.feature_column.input_layer(features, params["feature_columns"])
  for units in params["hidden_units"]:
    net = tf.layers.dense(net, units=units, activation=tf.nn.relu)

  # Compute logits (1 per class).
  logits = tf.layers.dense(net, params["n_classes"], activation=None)

  # Compute predictions.
  predicted_classes = tf.argmax(logits, 1)
  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        "class_ids": predicted_classes[:, tf.newaxis],
        "probabilities": tf.nn.softmax(logits),
        "logits": logits,
    }
    return contrib_tpu.TPUEstimatorSpec(mode, predictions=predictions)

  # Compute loss.
  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

  if mode == tf.estimator.ModeKeys.EVAL:
    return contrib_tpu.TPUEstimatorSpec(
        mode=mode, loss=loss, eval_metrics=(metric_fn, [labels, logits]))

  # Create training op.
  if mode == tf.estimator.ModeKeys.TRAIN:
    optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
    if FLAGS.use_tpu:
      optimizer = contrib_tpu.CrossShardOptimizer(optimizer)
    train_op = optimizer.minimize(loss,
                                  global_step=tf.train.get_global_step())
    return contrib_tpu.TPUEstimatorSpec(mode, loss=loss, train_op=train_op)
def model_fn(features, labels, mode, params): """Inception v3 model using Estimator API.""" del params if mode != tf.estimator.ModeKeys.TRAIN: raise RuntimeError('mode {} is not supported yet'.format(mode)) num_labels = FLAGS.num_labels with slim.arg_scope(inception_v3_arg_scope(is_training=True)): logits, end_points = inception.inception_v3( features, num_labels, is_training=True, depth_multiplier=FLAGS.depth_multiplier) onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=num_labels) if 'AuxLogits' in end_points: tf.losses.softmax_cross_entropy(end_points['AuxLogits'], onehot_labels, label_smoothing=0.1, weights=0.4, scope='aux_loss') tf.losses.softmax_cross_entropy(logits, onehot_labels, label_smoothing=0.1, weights=1.0) loss = tf.losses.get_total_loss() if FLAGS.optimizer == 'sgd': tf.logging.info('Using SGD optimizer') optimizer = tf.train.GradientDescentOptimizer( learning_rate=FLAGS.learning_rate) elif FLAGS.optimizer == 'momentum': tf.logging.info('Using Momentum optimizer') optimizer = tf.train.MomentumOptimizer( learning_rate=FLAGS.learning_rate, momentum=0.9) else: tf.logging.fatal('Unknown optimizer:', FLAGS.optimizer) if FLAGS.use_tpu: optimizer = contrib_tpu.CrossShardOptimizer(optimizer) train_op = optimizer.minimize( loss, global_step=tf.train.get_or_create_global_step()) return contrib_tpu.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def model_fn(features, labels, mode, params): """Define a CIFAR model in Keras.""" del params # unused layers = contrib_keras.layers # Pass our input tensor to initialize the Keras input layer. v = layers.Input(tensor=features) v = layers.Conv2D(filters=32, kernel_size=5, activation="relu", padding="same")(v) v = layers.MaxPool2D(pool_size=2)(v) v = layers.Conv2D(filters=64, kernel_size=5, activation="relu", padding="same")(v) v = layers.MaxPool2D(pool_size=2)(v) v = layers.Flatten()(v) fc1 = layers.Dense(units=512, activation="relu")(v) logits = layers.Dense(units=10)(fc1) # Instead of constructing a Keras model for training, build our loss function # and optimizer in Tensorflow. # # N.B. This construction omits some features that are important for more # complex models (e.g. regularization, batch-norm). Once # `model_to_estimator` support is added for TPUs, it should be used instead. loss = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)) optimizer = tf.train.AdamOptimizer() if FLAGS.use_tpu: optimizer = contrib_tpu.CrossShardOptimizer(optimizer) train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step()) return contrib_tpu.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op, predictions={ "classes": tf.argmax(input=logits, axis=1), "probabilities": tf.nn.softmax( logits, name="softmax_tensor") })
def create_train_op(loss, learning_rate, var_list, global_step, use_tpu=False):
  exp_learning_rate = tf.train.exponential_decay(learning_rate,
                                                 global_step,
                                                 decay_steps=10000,
                                                 decay_rate=0.96)
  optimizer = tf.train.AdamOptimizer(learning_rate=exp_learning_rate,
                                     beta1=0.5,
                                     beta2=0.999)
  if use_tpu:
    optimizer = tpu.CrossShardOptimizer(optimizer)
  return optimizer.minimize(loss,
                            var_list=var_list,
                            global_step=global_step,
                            colocate_gradients_with_ops=True)
def get_train_op_and_metrics(loss, params): """Generate training op and metrics to save in TensorBoard.""" with tf.variable_scope("get_train_op"): learning_rate = get_learning_rate( learning_rate=params["learning_rate"], hidden_size=params["hidden_size"], learning_rate_warmup_steps=params["learning_rate_warmup_steps"]) # Create optimizer. Use LazyAdamOptimizer from TF contrib, which is faster # than the TF core Adam optimizer. optimizer = contrib_opt.LazyAdamOptimizer( learning_rate, beta1=params["optimizer_adam_beta1"], beta2=params["optimizer_adam_beta2"], epsilon=params["optimizer_adam_epsilon"]) if params["use_tpu"] and params["tpu"] != tpu_util.LOCAL: optimizer = contrib_tpu.CrossShardOptimizer(optimizer) # Uses automatic mixed precision FP16 training if on GPU. if params["dtype"] == "fp16": optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite( optimizer) # Calculate and apply gradients using LazyAdamOptimizer. global_step = tf.train.get_global_step() tvars = tf.trainable_variables() gradients = optimizer.compute_gradients( loss, tvars, colocate_gradients_with_ops=True) minimize_op = optimizer.apply_gradients(gradients, global_step=global_step, name="train") update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) train_op = tf.group(minimize_op, update_ops) train_metrics = {"learning_rate": learning_rate} if not params["use_tpu"]: # gradient norm is not included as a summary when running on TPU, as # it can cause instability between the TPU and the host controller. gradient_norm = tf.global_norm(list(zip(*gradients))[0]) train_metrics["global_norm/gradient_norm"] = gradient_norm return train_op, train_metrics
def model_fn(features, labels, mode, params): """model_fn constructs the ML model used to predict handwritten digits.""" del params image = features if isinstance(image, dict): image = features["image"] model = mnist.create_model("channels_last") if mode == tf.estimator.ModeKeys.PREDICT: logits = model(image, training=False) predictions = { 'class_ids': tf.argmax(logits, axis=1), 'probabilities': tf.nn.softmax(logits), } return contrib_tpu.TPUEstimatorSpec(mode, predictions=predictions) logits = model(image, training=(mode == tf.estimator.ModeKeys.TRAIN)) loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) if mode == tf.estimator.ModeKeys.TRAIN: learning_rate = tf.train.exponential_decay(FLAGS.learning_rate, tf.train.get_global_step(), decay_steps=100000, decay_rate=0.96) optimizer = tf.train.GradientDescentOptimizer( learning_rate=learning_rate) if FLAGS.use_tpu: optimizer = contrib_tpu.CrossShardOptimizer(optimizer) return contrib_tpu.TPUEstimatorSpec(mode=mode, loss=loss, train_op=optimizer.minimize( loss, tf.train.get_global_step())) if mode == tf.estimator.ModeKeys.EVAL: return contrib_tpu.TPUEstimatorSpec(mode=mode, loss=loss, eval_metrics=(metric_fn, [labels, logits]))
def model_fn(features, labels, mode, params):
  # Build graph
  logits = tf.layers.dense(features, 10)
  loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits)
  optim = tf.train.GradientDescentOptimizer(learning_rate=1e-2)

  # NOTE:
  # When using TPUs, you have to use CrossShardOptimizer, which aggregates
  # gradients across shards with an all-reduce.
  if params["use_tpu"]:
    optim = tpu.CrossShardOptimizer(optim)
  train_op = optim.minimize(loss=loss,
                            global_step=tf.train.get_or_create_global_step())

  # Create EstimatorSpec
  estimator_spec = tpu.TPUEstimatorSpec(
      mode=mode,
      loss=loss,
      train_op=train_op,
  )
  return estimator_spec
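None of the snippets in this listing show how such a model_fn is handed to an estimator. As a rough, hedged sketch of the TF 1.x contrib wiring (the TPU name, model_dir, batch size, and train_input_fn below are hypothetical placeholders, not taken from the original code):

import tensorflow as tf

# Hypothetical TPU name and output directory.
resolver = tf.contrib.cluster_resolver.TPUClusterResolver(tpu="my-tpu")
run_config = tf.contrib.tpu.RunConfig(
    cluster=resolver,
    model_dir="gs://my-bucket/model",
    tpu_config=tf.contrib.tpu.TPUConfig(iterations_per_loop=100))

estimator = tf.contrib.tpu.TPUEstimator(
    model_fn=model_fn,            # e.g. the model_fn defined above
    config=run_config,
    use_tpu=True,
    train_batch_size=1024,
    params={"use_tpu": True})     # matches the params["use_tpu"] check above

# train_input_fn is a hypothetical input_fn returning (features, labels).
estimator.train(input_fn=train_input_fn, max_steps=10000)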
def _build_optimizer(self, optimizer_to_use=tf.train.AdamOptimizer,
                     tpu_support=False):
  """Builds the optimizer(s) to minimize the loss(es) of the model.

  Args:
    optimizer_to_use (tf optimizer, optional): Defaults to
      tf.train.AdamOptimizer. Which optimizer to use.
    tpu_support (bool, optional): Defaults to False. Whether the optimizer
      has to be wrapped in a cross-shard optimizer, as required for TPU
      usage.
  """
  self.optimize_ops = []
  for loss in self.losses['train']:  # TODO: create an appropriate external training scheme
    optimize_op = optimizer_to_use(learning_rate=self.learning_rate)
    if tpu_support:
      optimize_op = tpu.CrossShardOptimizer(optimize_op)
    optimize_op = optimize_op.minimize(
        loss=loss, global_step=tf.train.get_global_step())
    self.optimize_ops.append(optimize_op)
  logging.info('Optimizers built')
def _build_optimizer(self, learning_rate):
  """Build optimizer."""
  if self.hparams.optimizer == 'sgd':
    tf.logging.info('Using SGD optimizer')
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
  elif self.hparams.optimizer == 'momentum':
    tf.logging.info('Using Momentum optimizer')
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                           momentum=self.hparams.momentum_rate)
  elif self.hparams.optimizer == 'rmsprop':
    tf.logging.info('Using RMSProp optimizer')
    optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                          RMSPROP_DECAY,
                                          momentum=RMSPROP_MOMENTUM,
                                          epsilon=RMSPROP_EPSILON)
  else:
    tf.logging.fatal('Unknown optimizer:', self.hparams.optimizer)
  if self.hparams.use_tpu:
    optimizer = contrib_tpu.CrossShardOptimizer(optimizer)
  return optimizer
def train_function(training_method, loss, cross_loss, reg_loss, output_dir, use_tpu): """Training script for resnet model. Args: training_method: string indicating pruning method used to compress model. loss: tensor float32 of the cross entropy + regularization losses. cross_loss: tensor, only cross entropy loss, passed for logging. reg_loss: tensor, only regularization loss, passed for logging. output_dir: string tensor indicating the directory to save summaries. use_tpu: boolean indicating whether to run script on a tpu. Returns: host_call: summary tensors to be computed at each training step. train_op: the optimization term. """ global_step = tf.train.get_global_step() steps_per_epoch = FLAGS.num_train_images / FLAGS.train_batch_size current_epoch = (tf.cast(global_step, tf.float32) / steps_per_epoch) learning_rate = lr_schedule(current_epoch) if FLAGS.use_adam: # We don't use step decrease for the learning rate. learning_rate = FLAGS.base_learning_rate * (FLAGS.train_batch_size / 256.0) optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) else: optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=FLAGS.momentum, use_nesterov=True) if use_tpu: # use CrossShardOptimizer when using TPU. optimizer = contrib_tpu.CrossShardOptimizer(optimizer) if training_method == 'set': # We override the train op to also update the mask. optimizer = sparse_optimizers.SparseSETOptimizer( optimizer, begin_step=FLAGS.maskupdate_begin_step, end_step=FLAGS.maskupdate_end_step, grow_init=FLAGS.grow_init, frequency=FLAGS.maskupdate_frequency, drop_fraction=FLAGS.drop_fraction, drop_fraction_anneal=FLAGS.drop_fraction_anneal, stateless_seed_offset=FLAGS.seed) elif training_method == 'static': # We override the train op to also update the mask. optimizer = sparse_optimizers.SparseStaticOptimizer( optimizer, begin_step=FLAGS.maskupdate_begin_step, end_step=FLAGS.maskupdate_end_step, grow_init=FLAGS.grow_init, frequency=FLAGS.maskupdate_frequency, drop_fraction=FLAGS.drop_fraction, drop_fraction_anneal=FLAGS.drop_fraction_anneal, stateless_seed_offset=FLAGS.seed) elif training_method == 'momentum': # We override the train op to also update the mask. optimizer = sparse_optimizers.SparseMomentumOptimizer( optimizer, begin_step=FLAGS.maskupdate_begin_step, end_step=FLAGS.maskupdate_end_step, momentum=FLAGS.s_momentum, frequency=FLAGS.maskupdate_frequency, drop_fraction=FLAGS.drop_fraction, grow_init=FLAGS.grow_init, stateless_seed_offset=FLAGS.seed, drop_fraction_anneal=FLAGS.drop_fraction_anneal, use_tpu=use_tpu) elif training_method == 'rigl': # We override the train op to also update the mask. 
optimizer = sparse_optimizers.SparseRigLOptimizer( optimizer, begin_step=FLAGS.maskupdate_begin_step, end_step=FLAGS.maskupdate_end_step, grow_init=FLAGS.grow_init, frequency=FLAGS.maskupdate_frequency, drop_fraction=FLAGS.drop_fraction, stateless_seed_offset=FLAGS.seed, drop_fraction_anneal=FLAGS.drop_fraction_anneal, initial_acc_scale=FLAGS.rigl_acc_scale, use_tpu=use_tpu) elif training_method == 'snip': optimizer = sparse_optimizers.SparseSnipOptimizer( optimizer, mask_init_method=FLAGS.mask_init_method, custom_sparsity_map=CUSTOM_SPARSITY_MAP, default_sparsity=FLAGS.end_sparsity, use_tpu=use_tpu) elif training_method in ('scratch', 'baseline'): pass else: raise ValueError('Unsupported pruning method: %s' % FLAGS.training_method) # UPDATE_OPS needs to be added as a dependency due to batch norm update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops), tf.name_scope('train'): train_op = optimizer.minimize(loss, global_step) metrics = { 'global_step': tf.train.get_or_create_global_step(), 'loss': loss, 'cross_loss': cross_loss, 'reg_loss': reg_loss, 'learning_rate': learning_rate, 'current_epoch': current_epoch, } # Logging drop_fraction if dynamic sparse training. if training_method in ('set', 'momentum', 'rigl', 'static'): metrics['drop_fraction'] = optimizer.drop_fraction # Let's log some statistics from a single parameter-mask couple. # This is useful for debugging. test_var = pruning.get_weights()[0] test_var_mask = pruning.get_masks()[0] metrics.update({ 'fw_nz_weight': tf.count_nonzero(test_var), 'fw_nz_mask': tf.count_nonzero(test_var_mask), 'fw_l1_weight': tf.reduce_sum(tf.abs(test_var)) }) masks = pruning.get_masks() global_sparsity = sparse_utils.calculate_sparsity(masks) metrics['global_sparsity'] = global_sparsity metrics.update( utils.mask_summaries(masks[:4] + masks[-1:], with_img=FLAGS.log_mask_imgs_each_iteration)) host_call = (functools.partial(utils.host_call_fn, output_dir), utils.format_tensors(metrics)) return host_call, train_op
def neumf_model_fn(features, labels, mode, params): """Model Function for NeuMF estimator.""" if params.get("use_seed"): tf.set_random_seed(stat_utils.random_int32()) users = features[movielens.USER_COLUMN] items = features[movielens.ITEM_COLUMN] user_input = tf.keras.layers.Input(tensor=users) item_input = tf.keras.layers.Input(tensor=items) logits = construct_model(user_input, item_input, params).output # Softmax with the first column of zeros is equivalent to sigmoid. softmax_logits = ncf_common.convert_to_softmax_logits(logits) if mode == tf.estimator.ModeKeys.EVAL: duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32) return _get_estimator_spec_with_metrics( logits, softmax_logits, duplicate_mask, params["num_neg"], params["match_mlperf"], use_tpu_spec=params["use_xla_for_gpu"]) elif mode == tf.estimator.ModeKeys.TRAIN: labels = tf.cast(labels, tf.int32) valid_pt_mask = features[rconst.VALID_POINT_MASK] mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_NAME, value="adam") mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_LR, value=params["learning_rate"]) mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA1, value=params["beta1"]) mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA2, value=params["beta2"]) mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_EPSILON, value=params["epsilon"]) optimizer = tf.compat.v1.train.AdamOptimizer( learning_rate=params["learning_rate"], beta1=params["beta1"], beta2=params["beta2"], epsilon=params["epsilon"]) if params["use_tpu"]: # TODO(seemuch): remove this contrib import optimizer = contrib_tpu.CrossShardOptimizer(optimizer) mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_LOSS_FN, value=mlperf_helper.TAGS.BCE) loss = tf.compat.v1.losses.sparse_softmax_cross_entropy( labels=labels, logits=softmax_logits, weights=tf.cast(valid_pt_mask, tf.float32)) # This tensor is used by logging hooks. tf.identity(loss, name="cross_entropy") global_step = tf.compat.v1.train.get_global_step() tvars = tf.compat.v1.trainable_variables() gradients = optimizer.compute_gradients( loss, tvars, colocate_gradients_with_ops=True) gradients = sparse_to_dense_grads(gradients) minimize_op = optimizer.apply_gradients(gradients, global_step=global_step, name="train") update_ops = tf.compat.v1.get_collection( tf.compat.v1.GraphKeys.UPDATE_OPS) train_op = tf.group(minimize_op, update_ops) return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) else: raise NotImplementedError
def train_function(pruning_method, loss, output_dir, use_tpu): """Training script for resnet model. Args: pruning_method: string indicating pruning method used to compress model. loss: tensor float32 of the cross entropy + regularization losses. output_dir: string tensor indicating the directory to save summaries. use_tpu: boolean indicating whether to run script on a tpu. Returns: host_call: summary tensors to be computed at each training step. train_op: the optimization term. """ global_step = tf.train.get_global_step() steps_per_epoch = FLAGS.num_train_images / FLAGS.train_batch_size current_epoch = (tf.cast(global_step, tf.float32) / steps_per_epoch) learning_rate = lr_schedule(current_epoch) optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=FLAGS.momentum, use_nesterov=True) if use_tpu: # use CrossShardOptimizer when using TPU. optimizer = contrib_tpu.CrossShardOptimizer(optimizer) # UPDATE_OPS needs to be added as a dependency due to batch norm update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops), tf.name_scope('train'): train_op = optimizer.minimize(loss, global_step) if not use_tpu: if FLAGS.num_workers > 0: optimizer = tf.train.SyncReplicasOptimizer( optimizer, replicas_to_aggregate=FLAGS.num_workers, total_num_replicas=FLAGS.num_workers) optimizer.make_session_run_hook(True) metrics = { 'global_step': tf.train.get_or_create_global_step(), 'loss': loss, 'learning_rate': learning_rate, 'current_epoch': current_epoch } if pruning_method == 'threshold': # construct the necessary hparams string from the FLAGS hparams_string = ('begin_pruning_step={0},' 'sparsity_function_begin_step={0},' 'end_pruning_step={1},' 'sparsity_function_end_step={1},' 'target_sparsity={2},' 'pruning_frequency={3},' 'threshold_decay=0,' 'use_tpu={4}'.format( FLAGS.sparsity_begin_step, FLAGS.sparsity_end_step, FLAGS.end_sparsity, FLAGS.pruning_frequency, FLAGS.use_tpu, )) # Parse pruning hyperparameters pruning_hparams = pruning.get_pruning_hparams().parse(hparams_string) # The first layer has so few parameters, we don't need to prune it, and # pruning it a higher sparsity levels has very negative effects. if FLAGS.prune_first_layer and FLAGS.first_layer_sparsity >= 0.: pruning_hparams.set_hparam( 'weight_sparsity_map', ['resnet_model/initial_conv:%f' % FLAGS.first_layer_sparsity]) if FLAGS.prune_last_layer and FLAGS.last_layer_sparsity >= 0: pruning_hparams.set_hparam( 'weight_sparsity_map', ['resnet_model/final_dense:%f' % FLAGS.last_layer_sparsity]) # Create a pruning object using the pruning hyperparameters pruning_obj = pruning.Pruning(pruning_hparams, global_step=global_step) # We override the train op to also update the mask. 
with tf.control_dependencies([train_op]): train_op = pruning_obj.conditional_mask_update_op() masks = pruning.get_masks() metrics.update(utils.mask_summaries(masks)) elif pruning_method == 'scratch': masks = pruning.get_masks() # make sure the masks have the sparsity we expect and that it doesn't change metrics.update(utils.mask_summaries(masks)) elif pruning_method == 'variational_dropout': masks = utils.add_vd_pruning_summaries( threshold=FLAGS.log_alpha_threshold) metrics.update(masks) elif pruning_method == 'l0_regularization': summaries = utils.add_l0_summaries() metrics.update(summaries) elif pruning_method == 'baseline': pass else: raise ValueError('Unsupported pruning method', FLAGS.pruning_method) host_call = (functools.partial(utils.host_call_fn, output_dir), utils.format_tensors(metrics)) return host_call, train_op
def model_fn(features, labels, mode, params): """TPUEstimator compatible model function.""" loss = loss_fn(features, labels, mode, params) host_call = None train_op = None if mode == tf.estimator.ModeKeys.TRAIN: num_batches_per_epoch = params['num_batches_per_epoch'] global_step = tf.train.get_global_step() current_epoch = tf.cast(global_step, tf.float32) / num_batches_per_epoch learning_rate = _get_learning_rate(params, global_step, num_batches_per_epoch) optimizer = _get_optimizer(params, learning_rate) if params['use_tpu']: optimizer = contrib_tpu.CrossShardOptimizer(optimizer) # Batch norm requires update_ops to be added as a train_op dependency. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer.minimize(loss, tf.train.get_global_step()) if params['use_host_call']: def host_call_fn(global_step, loss, learning_rate, current_epoch): """Training host call. Creates scalar summaries for training metrics. This function is executed on the CPU and should not directly reference any Tensors in the rest of the `model_fn`. To pass Tensors from the model to the `metric_fn`, provide as part of the `host_call`. See https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec for more information. Arguments should match the list of `Tensor` objects passed as the second element in the tuple passed to `host_call`. Args: global_step: `Tensor with shape `[batch, ]` for the global_step. loss: `Tensor` with shape `[batch, ]` for the training loss. learning_rate: `Tensor` with shape `[batch, ]` for the learning_rate. current_epoch: `Tensor` with shape `[batch, ]` for the current_epoch. Returns: List of summary ops to run on the CPU host. """ # Outfeed supports int32 but global_step is expected to be int64. global_step = tf.reduce_mean(global_step) with (contrib_summary.create_file_writer( params['model_dir']).as_default()): with contrib_summary.always_record_summaries(): contrib_summary.scalar('loss', tf.reduce_mean(loss), step=global_step) contrib_summary.scalar('learning_rate', tf.reduce_mean(learning_rate), step=global_step) contrib_summary.scalar('current_epoch', tf.reduce_mean(current_epoch), step=global_step) return contrib_summary.all_summary_ops() # To log the loss, current learning rate, and epoch for Tensorboard, the # summary op needs to be run on the host CPU via host_call. host_call # expects [batch_size, ...] Tensors, thus reshape to introduce a batch # dimension. These Tensors are implicitly concatenated to # [params['batch_size']]. global_step_t = tf.reshape(global_step, [1]) loss_t = tf.reshape(loss, [1]) learning_rate_t = tf.reshape(learning_rate, [1]) current_epoch_t = tf.reshape(current_epoch, [1]) host_call = (host_call_fn, [ global_step_t, loss_t, learning_rate_t, current_epoch_t ]) eval_metrics = None if mode == tf.estimator.ModeKeys.EVAL: eval_metrics = _create_eval_metric(features, labels, params) # Restore from checkpoint if available. if params['init_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN: tf.logging.info('Found an init checkpoint.') model_variant = params['model_options'].model_variant var_scope = '{}/'.format(feature_extractor.name_scope[model_variant]) def scaffold_fn(): """Create Scaffold for initialization, etc.""" tf.train.init_from_checkpoint(params['init_checkpoint'], { var_scope: var_scope, }) return tf.train.Scaffold() else: tf.logging.info('No init checkpoint found. 
Training from scratch.') scaffold_fn = None return contrib_tpu.TPUEstimatorSpec( mode=mode, loss=loss, train_op=train_op, scaffold_fn=scaffold_fn, host_call=host_call, eval_metrics=eval_metrics, )
def model_fn(features, labels, mode, params): """`model_fn` for training mode for `TPUEstimator`.""" labels = labels is_training = (mode == tf.estimator.ModeKeys.TRAIN) x = tf.transpose(features['x'], [1, 0]) y = tf.transpose(features['y'], [1, 0]) init_states, model_params = _build_params(params) (update_average_ops, moving_average_mu, use_moving_average_ops) = _create_average_ops(params) if params.moving_average: tf.logging.info('swap in moving average') with tf.control_dependencies(use_moving_average_ops): total_loss, cross_entropy_loss = _forward(params, x, y, model_params, init_states, is_training=is_training) else: if not is_training: tf.logging.info('not swap in moving average') total_loss, cross_entropy_loss = _forward(params, x, y, model_params, init_states, is_training=is_training) if is_training: tf_vars = tf.trainable_variables() global_step = tf.train.get_or_create_global_step() lr_scale = (tf.cast(tf.shape(y)[-1], dtype=tf.float32) / tf.cast(params.bptt_steps, dtype=tf.float32)) learning_rate = utils.get_lr(global_step, params) * lr_scale grads = tf.gradients(total_loss, tf_vars) clipped_grads, grad_norm = tf.clip_by_global_norm( grads, params.grad_bound) optimizer = tf.train.GradientDescentOptimizer(learning_rate) if params.use_tpu: optimizer = contrib_tpu.CrossShardOptimizer( opt=optimizer, reduction=tf.losses.Reduction.MEAN) with tf.control_dependencies(update_average_ops): train_op = optimizer.apply_gradients(zip(clipped_grads, tf_vars), global_step=global_step) names_and_tensors = [ ('learning_rate', learning_rate), ('per_example_cross_entropy', cross_entropy_loss), ('train_ppl', tf.exp(cross_entropy_loss)), ('grad_norm', grad_norm), ('moving_average_mu', moving_average_mu), ] host_call = utils.build_host_call_fn(params, names_and_tensors) return contrib_tpu.TPUEstimatorSpec(mode=mode, loss=total_loss, train_op=train_op, host_call=host_call) else: def _metric_fn(cross_entropy_loss): """Computes metrics for EstimatorSpec.""" metrics = { 'log_ppl/{0}'.format(params.task_mode): tf.metrics.mean(values=cross_entropy_loss), } return metrics return contrib_tpu.TPUEstimatorSpec(mode=tf.estimator.ModeKeys.EVAL, loss=total_loss, eval_metrics=(_metric_fn, [cross_entropy_loss ]))
def get_cross_shard_optimizer(optimizer, disable_for_cpu_debugging=False):
  if disable_for_cpu_debugging:
    return optimizer
  return contrib_tpu.CrossShardOptimizer(optimizer)
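A minimal, hypothetical usage sketch for the helper above; the SGD optimizer, learning rate, and loss tensor are placeholders, not part of the original code:

# Inside a model_fn, after `loss` has been built:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
# Wrap for TPU training; pass disable_for_cpu_debugging=True to skip the
# cross-shard all-reduce when running the same model_fn on CPU.
optimizer = get_cross_shard_optimizer(optimizer,
                                      disable_for_cpu_debugging=False)
train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())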
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu,
                     optimizer="adamw", poly_power=1.0, start_warmup_step=0,
                     colocate_gradients_with_ops=False, hvd=None,
                     use_fp16=False, manual_fp16=False):
  """Creates an optimizer training op."""
  global_step = tf.train.get_or_create_global_step()

  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

  # Implements linear decay of the learning rate.
  learning_rate = tf.train.polynomial_decay(
      learning_rate,
      global_step,
      num_train_steps,
      end_learning_rate=0.0,
      power=poly_power,
      cycle=False)

  # Implements linear warmup. I.e., if global_step - start_warmup_step <
  # num_warmup_steps, the learning rate will be
  # `(global_step - start_warmup_step)/num_warmup_steps * init_lr`.
  if num_warmup_steps:
    tf.logging.info("++++++ warmup starts at step " + str(start_warmup_step) +
                    ", for " + str(num_warmup_steps) + " steps ++++++")
    global_steps_int = tf.cast(global_step, tf.int32)
    start_warm_int = tf.constant(start_warmup_step, dtype=tf.int32)
    global_steps_int = global_steps_int - start_warm_int
    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = init_lr * warmup_percent_done

    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    learning_rate = ((1.0 - is_warmup) * learning_rate +
                     is_warmup * warmup_learning_rate)

  # It is OK to use this optimizer for fine-tuning, since this is how the
  # model was trained (note that the Adam m/v variables are NOT loaded from
  # init_checkpoint.)
  # It is also OK to use AdamW for fine-tuning even if the model was trained
  # with LAMB. As reported in the BERT public GitHub, the learning rate for
  # SQuAD 1.1 fine-tuning is 3e-5, 4e-5 or 5e-5. For LAMB, users can use
  # 3e-4, 4e-4, or 5e-4 with a batch size of 64 when fine-tuning.
  if optimizer == "adamw":
    tf.logging.info("using adamw")
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
  elif optimizer == "lamb":
    tf.logging.info("using lamb")
    optimizer = lamb_optimizer.LAMBOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
  else:
    raise ValueError("Not supported optimizer: ", optimizer)

  if use_tpu:
    optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

  # Change 9: add Horovod optimizer.
  if hvd is not None:
    optimizer = hvd.DistributedOptimizer(
        optimizer, sparse_as_dense=True,
        compression=Compression.fp16 if use_fp16 or manual_fp16
        else Compression.none)

  tvars = tf.trainable_variables()
  # grads = tf.gradients(
  #     loss, tvars, colocate_gradients_with_ops=colocate_gradients_with_ops)
  # Change 10: calculate gradients with Horovod.
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None): """Model defination for the RetinaNet model based on ResNet-50. Args: features: The input images tensor with shape [batch_size, height, width, 3]. The height and width are fixed and equal. labels: The input labels in a tensor with the same shape as input images. mode: The mode of TPUEstimator including TRAIN, EVAL, and PREDICT. params: The dictionary defines hyperparameters of model. The default settings are in default_hparams function in this file. model: The FPN segmentation model outputs class logits. variable_filter_fn: the filter function that takes trainable_variables and returns the variable list after applying the filter rule. Returns: tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction. """ def _model_outputs(): return model(features, min_level=params['min_level'], max_level=params['max_level'], num_classes=params['num_classes'], resnet_depth=params['resnet_depth'], is_training_bn=params['is_training_bn']) if params['use_bfloat16']: with contrib_tpu.bfloat16_scope(): cls_outputs = _model_outputs() cls_outputs = tf.cast(cls_outputs, tf.float32) else: cls_outputs = _model_outputs() # First check if it is in PREDICT mode. if mode == tf.estimator.ModeKeys.PREDICT: predictions = {'image': features, 'cls_outputs': cls_outputs} return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) # Load pretrained model from checkpoint. if params['resnet_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN: def scaffold_fn(): """Loads pretrained model through scaffold function.""" tf.train.init_from_checkpoint( params['resnet_checkpoint'], { '/': 'resnet%s/' % params['resnet_depth'], }) return tf.train.Scaffold() else: scaffold_fn = None # Set up training loss and learning rate. retinanet_model.update_learning_rate_schedule_parameters(params) global_step = tf.train.get_global_step() learning_rate = retinanet_model.learning_rate_schedule( params['adjusted_learning_rate'], params['lr_warmup_init'], params['lr_warmup_step'], params['first_lr_drop_step'], params['second_lr_drop_step'], global_step) cls_loss = _segmentation_loss(cls_outputs, labels, params) weight_decay_loss = params['weight_decay'] * tf.add_n([ tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'batch_normalization' not in v.name ]) # Add L2 regularization loss total_loss = cls_loss + weight_decay_loss if mode == tf.estimator.ModeKeys.TRAIN: optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=params['momentum']) if params['use_tpu']: optimizer = contrib_tpu.CrossShardOptimizer(optimizer) # Batch norm requires update_ops to be added as a train_op dependency. 
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) var_list = variable_filter_fn( tf.trainable_variables(), params['resnet_depth']) if variable_filter_fn else None with tf.control_dependencies(update_ops): train_op = optimizer.minimize(total_loss, global_step, var_list=var_list) else: train_op = None # Evaluation only works on GPU/CPU host and batch_size=1 eval_metrics = None if mode == tf.estimator.ModeKeys.EVAL: batch_size = params['batch_size'] def metric_fn(**kwargs): """Creates metric_fn for TPUEstimatorSpec.""" cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat']) total_loss = tf.metrics.mean(kwargs['total_loss_repeat']) logits = tf.image.resize_bilinear(kwargs['prediction'], tf.shape(kwargs['labels'])[1:3], align_corners=True) predictions_with_shape = tf.argmax(logits, 3, output_type=tf.int32) predictions = tf.reshape(predictions_with_shape, shape=[-1]) labels = tf.reshape(kwargs['labels'], shape=[-1]) # Background class is considered as a class. Not ignored. weights = tf.to_float(tf.not_equal(labels, params['ignore_label'])) # Set ignore_label regions to label 0, because metrics.mean_iou requires # range of labels = [0, dataset.num_classes). # Note the ignore_lable regions are not evaluated since the corresponding # regions contain weights = 0. labels = tf.where(tf.equal(labels, params['ignore_label']), tf.zeros_like(labels), labels) return { 'total_loss': total_loss, 'cls_loss': cls_loss, 'miou': tf.metrics.mean_iou(predictions, labels, params['num_classes'], weights=weights), } cls_loss_repeat = tf.reshape( tf.tile(tf.expand_dims(cls_loss, 0), [ batch_size, ]), [batch_size, 1]) total_loss_repeat = tf.reshape( tf.tile(tf.expand_dims(total_loss, 0), [ batch_size, ]), [batch_size, 1]) metric_fn_inputs = { 'cls_loss_repeat': cls_loss_repeat, 'total_loss_repeat': total_loss_repeat, 'prediction': cls_outputs, 'labels': labels, } eval_metrics = (metric_fn, metric_fn_inputs) return contrib_tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, eval_metrics=eval_metrics, scaffold_fn=scaffold_fn, )
def inception_model_fn(features, labels, mode, params): """Inception v2 model using Estimator API.""" num_classes = FLAGS.num_classes is_training = (mode == tf.estimator.ModeKeys.TRAIN) is_eval = (mode == tf.estimator.ModeKeys.EVAL) if isinstance(features, dict): features = features['feature'] features = tensor_transform_fn(features, params['input_perm']) if FLAGS.clear_update_collections: # updates_collections must be set to None in order to use fused batchnorm with arg_scope( inception.inception_v2_arg_scope( batch_norm_decay=BATCH_NORM_DECAY, batch_norm_epsilon=BATCH_NORM_EPSILON, updates_collections=None)): logits, end_points = inception.inception_v2( features, num_classes, is_training=is_training, replace_separable_convolution=True) else: with arg_scope( inception.inception_v2_arg_scope( batch_norm_decay=BATCH_NORM_DECAY, batch_norm_epsilon=BATCH_NORM_EPSILON)): logits, end_points = inception.inception_v2( features, num_classes, is_training=is_training, replace_separable_convolution=True) predictions = { 'classes': tf.argmax(input=logits, axis=1), 'probabilities': tf.nn.softmax(logits, name='softmax_tensor') } if mode == tf.estimator.ModeKeys.PREDICT: return tf.estimator.EstimatorSpec( mode=mode, predictions=predictions, export_outputs={ 'classify': tf.estimator.export.PredictOutput(predictions) }) if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and ( not FLAGS.use_tpu): with tf.control_dependencies([ tf.Print(predictions['classes'], [predictions['classes']], summarize=FLAGS.eval_batch_size, message='prediction: ') ]): labels = tf.Print(labels, [labels], summarize=FLAGS.eval_batch_size, message='label: ') one_hot_labels = tf.one_hot(labels, FLAGS.num_classes, dtype=tf.int32) tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels, logits=logits, weights=1.0, label_smoothing=0.1) loss = tf.losses.get_total_loss(add_regularization_losses=True) initial_learning_rate = FLAGS.learning_rate * FLAGS.train_batch_size / 256 if FLAGS.use_learning_rate_warmup: # Adjust initial learning rate to match final warmup rate warmup_decay = FLAGS.learning_rate_decay**( (FLAGS.warmup_epochs + FLAGS.cold_epochs) / FLAGS.learning_rate_decay_epochs) adj_initial_learning_rate = initial_learning_rate * warmup_decay final_learning_rate = 0.0001 * initial_learning_rate host_call = None train_op = None if is_training: batches_per_epoch = _NUM_TRAIN_IMAGES / FLAGS.train_batch_size global_step = tf.train.get_or_create_global_step() current_epoch = tf.cast( (tf.cast(global_step, tf.float32) / batches_per_epoch), tf.int32) learning_rate = tf.train.exponential_decay( learning_rate=initial_learning_rate, global_step=global_step, decay_steps=int(FLAGS.learning_rate_decay_epochs * batches_per_epoch), decay_rate=FLAGS.learning_rate_decay, staircase=True) if FLAGS.use_learning_rate_warmup: wlr = 0.1 * adj_initial_learning_rate wlr_height = tf.cast( 0.9 * adj_initial_learning_rate / (FLAGS.warmup_epochs + FLAGS.learning_rate_decay_epochs - 1), tf.float32) epoch_offset = tf.cast(FLAGS.cold_epochs - 1, tf.int32) exp_decay_start = (FLAGS.warmup_epochs + FLAGS.cold_epochs + FLAGS.learning_rate_decay_epochs) lin_inc_lr = tf.add( wlr, tf.multiply( tf.cast(tf.subtract(current_epoch, epoch_offset), tf.float32), wlr_height)) learning_rate = tf.where( tf.greater_equal(current_epoch, FLAGS.cold_epochs), (tf.where(tf.greater_equal(current_epoch, exp_decay_start), learning_rate, lin_inc_lr)), wlr) # Set a minimum boundary for the learning rate. 
learning_rate = tf.maximum(learning_rate, final_learning_rate, name='learning_rate') if FLAGS.optimizer == 'sgd': tf.logging.info('Using SGD optimizer') optimizer = tf.train.GradientDescentOptimizer( learning_rate=learning_rate) elif FLAGS.optimizer == 'momentum': tf.logging.info('Using Momentum optimizer') optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9) elif FLAGS.optimizer == 'RMS': tf.logging.info('Using RMS optimizer') optimizer = tf.train.RMSPropOptimizer(learning_rate, RMSPROP_DECAY, momentum=RMSPROP_MOMENTUM, epsilon=RMSPROP_EPSILON) else: tf.logging.fatal('Unknown optimizer:', FLAGS.optimizer) if FLAGS.use_tpu: optimizer = contrib_tpu.CrossShardOptimizer(optimizer) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer.minimize(loss, global_step=global_step) if FLAGS.moving_average: ema = tf.train.ExponentialMovingAverage(decay=MOVING_AVERAGE_DECAY, num_updates=global_step) variables_to_average = (tf.trainable_variables() + tf.moving_average_variables()) with tf.control_dependencies([train_op ]), tf.name_scope('moving_average'): train_op = ema.apply(variables_to_average) # To log the loss, current learning rate, and epoch for Tensorboard, the # summary op needs to be run on the host CPU via host_call. host_call # expects [batch_size, ...] Tensors, thus reshape to introduce a batch # dimension. These Tensors are implicitly concatenated to # [params['batch_size']]. gs_t = tf.reshape(global_step, [1]) loss_t = tf.reshape(loss, [1]) lr_t = tf.reshape(learning_rate, [1]) ce_t = tf.reshape(current_epoch, [1]) def host_call_fn(gs, loss, lr, ce): """Training host call. Creates scalar summaries for training metrics. This function is executed on the CPU and should not directly reference any Tensors in the rest of the `model_fn`. To pass Tensors from the model to the `metric_fn`, provide as part of the `host_call`. See https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec for more information. Arguments should match the list of `Tensor` objects passed as the second element in the tuple passed to `host_call`. Args: gs: `Tensor with shape `[batch]` for the global_step loss: `Tensor` with shape `[batch]` for the training loss. lr: `Tensor` with shape `[batch]` for the learning_rate. ce: `Tensor` with shape `[batch]` for the current_epoch. Returns: List of summary ops to run on the CPU host. """ gs = gs[0] with summary.create_file_writer(FLAGS.model_dir).as_default(): with summary.always_record_summaries(): summary.scalar('loss', tf.reduce_mean(loss), step=gs) summary.scalar('learning_rate', tf.reduce_mean(lr), step=gs) summary.scalar('current_epoch', tf.reduce_mean(ce), step=gs) return summary.all_summary_ops() host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t]) eval_metrics = None if is_eval: def metric_fn(labels, logits): """Evaluation metric function. Evaluates accuracy. This function is executed on the CPU and should not directly reference any Tensors in the rest of the `model_fn`. To pass Tensors from the model to the `metric_fn`, provide as part of the `eval_metrics`. See https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec for more information. Arguments should match the list of `Tensor` objects passed as the second element in the tuple passed to `eval_metrics`. Args: labels: `Tensor` with shape `[batch, ]`. logits: `Tensor` with shape `[batch, num_classes]`. Returns: A dict of the metrics to return from evaluation. 
""" predictions = tf.argmax(logits, axis=1) top_1_accuracy = tf.metrics.accuracy(labels, predictions) in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32) top_5_accuracy = tf.metrics.mean(in_top_5) return { 'accuracy': top_1_accuracy, 'accuracy@5': top_5_accuracy, } eval_metrics = (metric_fn, [labels, logits]) return contrib_tpu.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op, host_call=host_call, eval_metrics=eval_metrics)
def _model_fn(features, labels, mode, params, model, use_tpu_estimator_spec, variable_filter_fn=None): """Model defination for the RetinaNet model based on ResNet. Args: features: the input image tensor with shape [batch_size, height, width, 3]. The height and width are fixed and equal. labels: the input labels in a dictionary. The labels include class targets and box targets which are dense label maps. The labels are generated from get_input_fn function in dataloader.py mode: the mode of TPUEstimator/Estimator including TRAIN, EVAL, and PREDICT. params: the dictionary defines hyperparameters of model. The default settings are in default_hparams function in this file. model: the RetinaNet model outputs class logits and box regression outputs. use_tpu_estimator_spec: Whether to use TPUEstimatorSpec or EstimatorSpec. variable_filter_fn: the filter function that takes trainable_variables and returns the variable list after applying the filter rule. Returns: tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction. """ # In predict mode features is a dict with input as value of the 'inputs'. image_info = None if (mode == tf.estimator.ModeKeys.PREDICT and isinstance(features, dict) and 'inputs' in features): image_info = features['image_info'] labels = None if 'labels' in features: labels = features['labels'] features = features['inputs'] def _model_outputs(): return model(features, min_level=params['min_level'], max_level=params['max_level'], num_classes=params['num_classes'], num_anchors=len(params['aspect_ratios'] * params['num_scales']), resnet_depth=params['resnet_depth'], is_training_bn=params['is_training_bn']) if params['use_bfloat16']: with contrib_tpu.bfloat16_scope(): cls_outputs, box_outputs = _model_outputs() levels = cls_outputs.keys() for level in levels: cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32) box_outputs[level] = tf.cast(box_outputs[level], tf.float32) else: cls_outputs, box_outputs = _model_outputs() levels = cls_outputs.keys() # First check if it is in PREDICT mode. if mode == tf.estimator.ModeKeys.PREDICT: # Postprocess on host; memory layout for NMS on TPU is very inefficient. def _predict_postprocess_wrapper(args): return _predict_postprocess(*args) predictions = contrib_tpu.outside_compilation( _predict_postprocess_wrapper, (cls_outputs, box_outputs, labels, params)) # Include resizing information on prediction output to help bbox drawing. if image_info is not None: predictions.update({ 'image_info': tf.identity(image_info, 'ImageInfo'), }) return contrib_tpu.TPUEstimatorSpec(mode=tf.estimator.ModeKeys.PREDICT, predictions=predictions) # Load pretrained model from checkpoint. if params['resnet_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN: def scaffold_fn(): """Loads pretrained model through scaffold function.""" tf.train.init_from_checkpoint( params['resnet_checkpoint'], { '/': 'resnet%s/' % params['resnet_depth'], }) return tf.train.Scaffold() else: scaffold_fn = None # Set up training loss and learning rate. update_learning_rate_schedule_parameters(params) global_step = tf.train.get_global_step() learning_rate = learning_rate_schedule(params['adjusted_learning_rate'], params['lr_warmup_init'], params['lr_warmup_step'], params['first_lr_drop_step'], params['second_lr_drop_step'], global_step) # cls_loss and box_loss are for logging. only total_loss is optimized. 
total_loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs, labels, params) total_loss += _WEIGHT_DECAY * tf.add_n([ tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'batch_normalization' not in v.name ]) if mode == tf.estimator.ModeKeys.TRAIN: optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=params['momentum']) if params['use_tpu']: optimizer = contrib_tpu.CrossShardOptimizer(optimizer) else: if params['auto_mixed_precision']: optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite( optimizer) # Batch norm requires `update_ops` to be executed alongside `train_op`. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) var_list = variable_filter_fn( tf.trainable_variables(), params['resnet_depth']) if variable_filter_fn else None minimize_op = optimizer.minimize(total_loss, global_step, var_list=var_list) train_op = tf.group(minimize_op, update_ops) else: train_op = None eval_metrics = None if mode == tf.estimator.ModeKeys.EVAL: def metric_fn(**kwargs): """Returns a dictionary that has the evaluation metrics.""" batch_size = params['batch_size'] eval_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(eval_anchors, params['num_classes']) cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat']) box_loss = tf.metrics.mean(kwargs['box_loss_repeat']) coco_metrics = coco_metric_fn(batch_size, anchor_labeler, params['val_json_file'], **kwargs) # Add metrics to output. output_metrics = { 'cls_loss': cls_loss, 'box_loss': box_loss, } output_metrics.update(coco_metrics) return output_metrics cls_loss_repeat = tf.reshape( tf.tile(tf.expand_dims(cls_loss, 0), [ params['batch_size'], ]), [params['batch_size'], 1]) box_loss_repeat = tf.reshape( tf.tile(tf.expand_dims(box_loss, 0), [ params['batch_size'], ]), [params['batch_size'], 1]) metric_fn_inputs = { 'cls_loss_repeat': cls_loss_repeat, 'box_loss_repeat': box_loss_repeat, 'source_ids': labels['source_ids'], 'groundtruth_data': labels['groundtruth_data'], 'image_scales': labels['image_scales'], } add_metric_fn_inputs(params, cls_outputs, box_outputs, metric_fn_inputs) eval_metrics = (metric_fn, metric_fn_inputs) if use_tpu_estimator_spec: return contrib_tpu.TPUEstimatorSpec(mode=mode, loss=total_loss, train_op=train_op, eval_metrics=eval_metrics, scaffold_fn=scaffold_fn) else: return tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, # TODO(rostam): Fix bug to get scaffold working. # scaffold=scaffold_fn(), train_op=train_op)
def model_fn(features, labels, mode, params): """Our model_fn for Densenet to be used with our Estimator.""" tf.logging.info("model_fn") if FLAGS.network_depth == 169: logits = densenet_model.densenet_imagenet_169( features, is_training=(mode == tf.estimator.ModeKeys.TRAIN)) elif FLAGS.network_depth == 201: logits = densenet_model.densenet_imagenet_201( features, is_training=(mode == tf.estimator.ModeKeys.TRAIN)) elif FLAGS.network_depth == 121: logits = densenet_model.densenet_imagenet_121( features, is_training=(mode == tf.estimator.ModeKeys.TRAIN)) else: tf.logging.info("Number of layers not supported, revert to 121") logits = densenet_model.densenet_imagenet_121( features, is_training=(mode == tf.estimator.ModeKeys.TRAIN)) # Calculate loss, which includes softmax cross entropy and L2 regularization. cross_entropy = tf.losses.softmax_cross_entropy(logits=logits, onehot_labels=labels) # Add weight decay to the loss. We exclude weight decay on the batch # normalization variables because it slightly improves accuracy. loss = cross_entropy + _WEIGHT_DECAY * tf.add_n([ tf.nn.l2_loss(v) for v in tf.trainable_variables() if "batch_normalization" not in v.name ]) global_step = tf.train.get_global_step() current_epoch = (tf.cast(global_step, tf.float32) / params["batches_per_epoch"]) learning_rate = learning_rate_schedule(current_epoch) # TODO(chrisying): this is a hack to get the LR and epoch for Tensorboard. # Reimplement this when TPU training summaries are supported. lr_repeat = tf.reshape( tf.tile(tf.expand_dims(learning_rate, 0), [ params["batch_size"], ]), [params["batch_size"], 1]) ce_repeat = tf.reshape( tf.tile(tf.expand_dims(current_epoch, 0), [ params["batch_size"], ]), [params["batch_size"], 1]) if mode == tf.estimator.ModeKeys.TRAIN: optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=_MOMENTUM) optimizer = contrib_tpu.CrossShardOptimizer(optimizer) # Batch norm requires update_ops to be added as a train_op dependency. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer.minimize(loss, global_step) else: train_op = None eval_metrics = None if mode == tf.estimator.ModeKeys.EVAL: def metric_fn(labels, logits, lr_repeat, ce_repeat): """Evaluation metric fn. Performed on CPU, do not reference TPU ops.""" predictions = tf.argmax(logits, axis=1) accuracy = tf.metrics.accuracy(tf.argmax(labels, axis=1), predictions) lr = tf.metrics.mean(lr_repeat) ce = tf.metrics.mean(ce_repeat) return { "accuracy": accuracy, "learning_rate": lr, "current_epoch": ce } eval_metrics = (metric_fn, [labels, logits, lr_repeat, ce_repeat]) return contrib_tpu.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op, eval_metrics=eval_metrics)
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" tf.logging.info("*** Features ***") for name in sorted(features.keys()): tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) unique_ids = features["unique_ids"] input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] label_ids = features["label_ids"] # obtaining the membership variables is important since only those weights # are modified during the optimization process. membership_logits, membership_vars = create_model( bert_config=bert_config, input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, membership_features_str=membership_features_str) membership_probs = tf.nn.softmax(membership_logits, axis=-1) membership_log_probs = tf.nn.log_softmax(membership_logits, axis=-1) tvars = tf.trainable_variables() initialized_variable_names = {} scaffold_fn = None if init_checkpoint: (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint( tvars, init_checkpoint) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(init_checkpoint, assignment_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) tf.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: one_hot_positions = tf.one_hot(label_ids, depth=2, dtype=tf.float32) loss = -tf.reduce_mean( tf.reduce_sum(one_hot_positions * membership_log_probs, axis=-1)) global_step = tf.train.get_or_create_global_step() optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) if use_tpu: optimizer = contrib_tpu.CrossShardOptimizer(optimizer) train_op = optimizer.minimize(loss=loss, global_step=global_step, var_list=membership_vars) output_spec = contrib_tpu.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op, scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.EVAL: one_hot_positions = tf.one_hot(label_ids, depth=2, dtype=tf.float32) per_example_loss = -1 * tf.reduce_sum( one_hot_positions * membership_log_probs, axis=-1) total_loss = tf.reduce_mean(per_example_loss) def metric_fn(per_example_loss, label_ids, membership_logits): predictions = tf.argmax(membership_logits, axis=-1, output_type=tf.int32) loss = tf.metrics.mean(values=per_example_loss) accuracy = tf.metrics.accuracy(labels=label_ids, predictions=predictions) return {"eval_accuracy": accuracy, "eval_loss": loss} eval_metrics = (metric_fn, [per_example_loss, label_ids, membership_logits]) output_spec = contrib_tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, eval_metrics=eval_metrics, scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.PREDICT: predictions = { "unique_ids": unique_ids, "membership_probs": membership_probs } output_spec = contrib_tpu.TPUEstimatorSpec(mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) else: raise ValueError("Only TRAIN and PREDICT modes are supported: %s" % (mode)) return output_spec
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu, optimizer="adamw", poly_power=1.0, start_warmup_step=0): """Creates an optimizer training op.""" global_step = tf.train.get_or_create_global_step() learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) # Implements linear decay of the learning rate. learning_rate = tf.train.polynomial_decay(learning_rate, global_step, num_train_steps, end_learning_rate=0.0, power=poly_power, cycle=False) # Implements linear warmup. I.e., if global_step - start_warmup_step < # num_warmup_steps, the learning rate will be # `(global_step - start_warmup_step)/num_warmup_steps * init_lr`. if num_warmup_steps: tf.logging.info("++++++ warmup starts at step " + str(start_warmup_step) + ", for " + str(num_warmup_steps) + " steps ++++++") global_steps_int = tf.cast(global_step, tf.int32) start_warm_int = tf.constant(start_warmup_step, dtype=tf.int32) global_steps_int = global_steps_int - start_warm_int warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) global_steps_float = tf.cast(global_steps_int, tf.float32) warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) warmup_percent_done = global_steps_float / warmup_steps_float warmup_learning_rate = init_lr * warmup_percent_done is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) learning_rate = ((1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) # It is OK to use this optimizer for fine-tuning, since this # is how the model was trained (note that the Adam m/v variables are NOT # loaded from init_checkpoint.) # It is also OK to use AdamW for fine-tuning even if the model was trained with # LAMB. As reported in the public BERT GitHub repo, the learning rate for # SQuAD 1.1 fine-tuning is 3e-5, 4e-5, or 5e-5. For LAMB, users can use 3e-4, # 4e-4, or 5e-4 with a batch size of 64 for fine-tuning. if optimizer == "adamw": tf.logging.info("using adamw") optimizer = AdamWeightDecayOptimizer( learning_rate=learning_rate, weight_decay_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-6, exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) elif optimizer == "lamb": tf.logging.info("using lamb") optimizer = lamb_optimizer.LAMBOptimizer( learning_rate=learning_rate, weight_decay_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-6, exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) else: raise ValueError("Unsupported optimizer: %s" % optimizer) if use_tpu: optimizer = contrib_tpu.CrossShardOptimizer(optimizer) tvars = tf.trainable_variables() grads = tf.gradients(loss, tvars) # This is how the model was pre-trained. (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) train_op = optimizer.apply_gradients(list(zip(grads, tvars)), global_step=global_step) # Normally the global step update is done inside of `apply_gradients`. # However, neither `AdamWeightDecayOptimizer` nor `LAMBOptimizer` does this. # But if you use a different optimizer, you should probably take this line # out. new_global_step = global_step + 1 train_op = tf.group(train_op, [global_step.assign(new_global_step)]) return train_op
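The resulting schedule is piecewise: a linear ramp over num_warmup_steps steps measured from start_warmup_step, then polynomial decay to zero at num_train_steps. A small plain-Python sketch of the same schedule, handy for sanity-checking hyperparameters offline (unlike the graph version, this one clamps the warmup ramp at zero for steps before start_warmup_step):

def lr_at_step(step, init_lr, num_train_steps, num_warmup_steps,
               poly_power=1.0, start_warmup_step=0):
  # Polynomial decay with end_learning_rate=0.0, mirroring tf.train.polynomial_decay.
  frac = min(step, num_train_steps) / float(num_train_steps)
  decayed = init_lr * (1.0 - frac) ** poly_power
  warm = step - start_warmup_step
  if num_warmup_steps and warm < num_warmup_steps:
    return init_lr * max(warm, 0) / float(num_warmup_steps)
  return decayed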
def model_fn(features, labels, mode, params=None): """Constructs the object detection model. Args: features: Dictionary of feature tensors, returned from `input_fn`. labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL, otherwise None. mode: Mode key from tf.estimator.ModeKeys. params: Parameter dictionary passed from the estimator. Returns: An `EstimatorSpec` that encapsulates the model and its serving configurations. """ params = params or {} total_loss, train_op, detections, export_outputs = None, None, None, None is_training = mode == tf.estimator.ModeKeys.TRAIN # Make sure to set the Keras learning phase. True during training, # False for inference. tf.keras.backend.set_learning_phase(is_training) # Set policy for mixed-precision training with Keras-based models. if use_tpu and train_config.use_bfloat16: from tensorflow.python.keras.engine import base_layer_utils # pylint: disable=g-import-not-at-top # Enable v2 behavior, as `mixed_bfloat16` is only supported in TF 2.0. base_layer_utils.enable_v2_dtype_behavior() tf.compat.v2.keras.mixed_precision.experimental.set_policy( 'mixed_bfloat16') detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) scaffold_fn = None if mode == tf.estimator.ModeKeys.TRAIN: labels = unstack_batch(labels, unpad_groundtruth_tensors=train_config. unpad_groundtruth_tensors) elif mode == tf.estimator.ModeKeys.EVAL: # For evaling on train data, it is necessary to check whether groundtruth # must be unpadded. boxes_shape = (labels[fields.InputDataFields.groundtruth_boxes]. get_shape().as_list()) unpad_groundtruth_tensors = boxes_shape[ 1] is not None and not use_tpu labels = unstack_batch( labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): provide_groundtruth(detection_model, labels) preprocessed_images = features[fields.InputDataFields.image] side_inputs = detection_model.get_side_inputs(features) if use_tpu and train_config.use_bfloat16: with contrib_tpu.bfloat16_scope(): prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape], **side_inputs) prediction_dict = ops.bfloat16_to_float32_nested( prediction_dict) else: prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape], **side_inputs) def postprocess_wrapper(args): return detection_model.postprocess(args[0], args[1]) if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT): if use_tpu and postprocess_on_cpu: detections = contrib_tpu.outside_compilation( postprocess_wrapper, (prediction_dict, features[fields.InputDataFields.true_image_shape])) else: detections = postprocess_wrapper( (prediction_dict, features[fields.InputDataFields.true_image_shape])) if mode == tf.estimator.ModeKeys.TRAIN: load_pretrained = hparams.load_pretrained if hparams else False if train_config.fine_tune_checkpoint and load_pretrained: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' asg_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config. 
fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( asg_map, train_config.fine_tune_checkpoint, include_global_step=False)) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint( train_config.fine_tune_checkpoint, available_var_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint( train_config.fine_tune_checkpoint, available_var_map) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): if (mode == tf.estimator.ModeKeys.EVAL and eval_config.use_dummy_loss_in_eval): total_loss = tf.constant(1.0) losses_dict = {'Loss/total_loss': total_loss} else: losses_dict = detection_model.loss( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses = [loss_tensor for loss_tensor in losses_dict.values()] if train_config.add_regularization_loss: regularization_losses = detection_model.regularization_losses( ) if use_tpu and train_config.use_bfloat16: regularization_losses = ops.bfloat16_to_float32_nested( regularization_losses) if regularization_losses: regularization_loss = tf.add_n( regularization_losses, name='regularization_loss') losses.append(regularization_loss) losses_dict[ 'Loss/regularization_loss'] = regularization_loss total_loss = tf.add_n(losses, name='total_loss') losses_dict['Loss/total_loss'] = total_loss if 'graph_rewriter_config' in configs: graph_rewriter_fn = graph_rewriter_builder.build( configs['graph_rewriter_config'], is_training=is_training) graph_rewriter_fn() # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we # can write learning rate summaries on TPU without host calls. global_step = tf.train.get_or_create_global_step() training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: training_optimizer = contrib_tpu.CrossShardOptimizer( training_optimizer) # Optionally freeze some layers by setting their gradients to be zero. trainable_variables = None include_variables = (train_config.update_trainable_variables if train_config.update_trainable_variables else None) exclude_variables = (train_config.freeze_variables if train_config.freeze_variables else None) trainable_variables = contrib_framework.filter_variables( tf.trainable_variables(), include_patterns=include_variables, exclude_patterns=exclude_variables) clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm if not use_tpu: for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var) summaries = [] if use_tpu else None if train_config.summarize_gradients: summaries = [ 'gradients', 'gradient_norm', 'global_gradient_norm' ] train_op = contrib_layers.optimize_loss( loss=total_loss, global_step=global_step, learning_rate=None, clip_gradients=clip_gradients_value, optimizer=training_optimizer, update_ops=detection_model.updates(), variables=trainable_variables, summaries=summaries, name='') # Preventing scope prefix on all variables. 
if mode == tf.estimator.ModeKeys.PREDICT: exported_output = exporter_lib.add_output_tensor_nodes(detections) export_outputs = { tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.estimator.export.PredictOutput(exported_output) } eval_metric_ops = None scaffold = None if mode == tf.estimator.ModeKeys.EVAL: class_agnostic = (fields.DetectionResultFields.detection_classes not in detections) groundtruth = _prepare_groundtruth_for_eval( detection_model, class_agnostic, eval_input_config.max_number_of_boxes) use_original_images = fields.InputDataFields.original_image in features if use_original_images: eval_images = features[fields.InputDataFields.original_image] true_image_shapes = tf.slice( features[fields.InputDataFields.true_image_shape], [0, 0], [-1, 3]) original_image_spatial_shapes = features[ fields.InputDataFields.original_image_spatial_shape] else: eval_images = features[fields.InputDataFields.image] true_image_shapes = None original_image_spatial_shapes = None eval_dict = eval_util.result_dict_for_batched_example( eval_images, features[inputs.HASH_KEY], detections, groundtruth, class_agnostic=class_agnostic, scale_to_absolute=True, original_image_spatial_shapes=original_image_spatial_shapes, true_image_shapes=true_image_shapes) if fields.InputDataFields.image_additional_channels in features: eval_dict[fields.InputDataFields. image_additional_channels] = features[ fields.InputDataFields.image_additional_channels] if class_agnostic: category_index = label_map_util.create_class_agnostic_category_index( ) else: category_index = label_map_util.create_category_index_from_labelmap( eval_input_config.label_map_path) vis_metric_ops = None if not use_tpu and use_original_images: eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections( category_index, max_examples_to_draw=eval_config.num_visualizations, max_boxes_to_draw=eval_config.max_num_boxes_to_visualize, min_score_thresh=eval_config.min_score_threshold, use_normalized_coordinates=False) vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops( eval_dict) # Eval metrics on a single example. eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators( eval_config, list(category_index.values()), eval_dict) for loss_key, loss_tensor in iter(losses_dict.items()): eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor) for var in optimizer_summary_vars: eval_metric_ops[var.op.name] = (var, tf.no_op()) if vis_metric_ops is not None: eval_metric_ops.update(vis_metric_ops) eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()} if eval_config.use_moving_averages: variable_averages = tf.train.ExponentialMovingAverage(0.0) variables_to_restore = variable_averages.variables_to_restore() keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver( variables_to_restore, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours ) scaffold = tf.train.Scaffold(saver=saver) # EVAL executes on CPU, so use regular non-TPU EstimatorSpec. 
if use_tpu and mode != tf.estimator.ModeKeys.EVAL: return contrib_tpu.TPUEstimatorSpec(mode=mode, scaffold_fn=scaffold_fn, predictions=detections, loss=total_loss, train_op=train_op, eval_metrics=eval_metric_ops, export_outputs=export_outputs) else: if scaffold is None: keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver( sharded=True, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours, save_relative_paths=True) tf.add_to_collection(tf.GraphKeys.SAVERS, saver) scaffold = tf.train.Scaffold(saver=saver) return tf.estimator.EstimatorSpec(mode=mode, predictions=detections, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, export_outputs=export_outputs, scaffold=scaffold)
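Across the object-detection model_fn above, restoring the fine-tune checkpoint on TPU goes through a scaffold_fn: the init_from_checkpoint call has to happen inside the closure so it is replayed when TPUEstimator rebuilds the graph on the workers, whereas on CPU/GPU it can simply run at graph-construction time. A minimal sketch of that pattern, with a hypothetical helper name and placeholder checkpoint path and assignment map:

def make_scaffold_fn(checkpoint_path, assignment_map, use_tpu):
  # Sketch of the tpu_scaffold pattern used in the snippets above.
  if not use_tpu:
    tf.train.init_from_checkpoint(checkpoint_path, assignment_map)
    return None  # TPUEstimatorSpec accepts scaffold_fn=None
  def tpu_scaffold():
    tf.train.init_from_checkpoint(checkpoint_path, assignment_map)
    return tf.train.Scaffold()
  return tpu_scaffold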
def get_estimator_spec(hparams, mode, features, labels, frame_logits, onset_logits, offset_logits, velocity_values, offset_network=True): """Create TPUEstimatorSpec.""" loss_metrics = {} loss = None if (mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL): onset_losses = tf.losses.sigmoid_cross_entropy( labels.onsets[:, :, :constants.MIDI_PITCHES], onset_logits[:, :, :constants.MIDI_PITCHES], weights=tf.expand_dims( tf.sequence_mask( features.length, maxlen=tf.shape(labels.onsets)[1]), axis=2)) loss_metrics['onset'] = onset_losses if offset_network and not hparams.drums_only: offset_losses = tf.losses.sigmoid_cross_entropy( labels.offsets[:, :, :constants.MIDI_PITCHES], offset_logits[:, :, :constants.MIDI_PITCHES], weights=tf.expand_dims( tf.sequence_mask( features.length, maxlen=tf.shape(labels.offsets)[1]), axis=2)) loss_metrics['offset'] = offset_losses velocity_losses = tf.losses.mean_squared_error( labels.velocities, velocity_values, weights=labels.onsets * hparams.velocity_loss_weight) loss_metrics['velocity'] = velocity_losses if not hparams.drums_only: frame_losses = tf.losses.sigmoid_cross_entropy( labels.labels[:, :, :constants.MIDI_PITCHES], frame_logits[:, :, :constants.MIDI_PITCHES], weights=tf.expand_dims( tf.sequence_mask( features.length, maxlen=tf.shape(labels.labels)[1]), axis=2)) loss_metrics['frame'] = frame_losses loss = tf.losses.get_total_loss() if (mode == tf.estimator.ModeKeys.EVAL or mode == tf.estimator.ModeKeys.PREDICT): frame_probs = tf.sigmoid(frame_logits) onset_probs = tf.sigmoid(onset_logits) if offset_network: offset_probs = tf.sigmoid(offset_logits) else: offset_probs = tf.zeros_like(onset_probs) frame_predictions = frame_probs > hparams.predict_frame_threshold onset_predictions = onset_probs > hparams.predict_onset_threshold offset_predictions = offset_probs > hparams.predict_offset_threshold if hparams.drum_prediction_map: map_predictions = functools.partial( drum_mappings.map_pianoroll, mapping_name=hparams.drum_prediction_map, reduce_mode='any', min_pitch=constants.MIN_MIDI_PITCH) frame_predictions = tf.map_fn(map_predictions, frame_predictions) onset_predictions = tf.map_fn(map_predictions, onset_predictions) offset_predictions = tf.map_fn(map_predictions, offset_predictions) map_values = functools.partial( drum_mappings.map_pianoroll, mapping_name=hparams.drum_prediction_map, reduce_mode='max', min_pitch=constants.MIN_MIDI_PITCH) velocity_values = tf.map_fn(map_values, velocity_values) metrics_values = get_metrics(features, labels, frame_probs, onset_probs, frame_predictions, onset_predictions, offset_predictions, velocity_values, hparams) for label, loss_collection in loss_metrics.items(): loss_label = 'losses/' + label metrics_values[loss_label] = loss_collection if mode == tf.estimator.ModeKeys.TRAIN: train_op = contrib_layers.optimize_loss( name='training', loss=loss, global_step=tf.train.get_or_create_global_step(), learning_rate=hparams.learning_rate, learning_rate_decay_fn=functools.partial( tf.train.exponential_decay, decay_steps=hparams.decay_steps, decay_rate=hparams.decay_rate, staircase=True), clip_gradients=hparams.clip_norm, summaries=[], optimizer=lambda lr: contrib_tpu.CrossShardOptimizer( # pylint:disable=g-long-lambda tf.train.AdamOptimizer(lr))) return contrib_tpu.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op) elif mode == tf.estimator.ModeKeys.EVAL: metric_ops = {k: tf.metrics.mean(v) for k, v in metrics_values.items()} return tf.estimator.EstimatorSpec( mode=mode, loss=loss, 
eval_metric_ops=metric_ops) elif mode == tf.estimator.ModeKeys.PREDICT: predictions = { 'frame_probs': frame_probs, 'onset_probs': onset_probs, 'frame_predictions': frame_predictions, 'onset_predictions': onset_predictions, 'offset_predictions': offset_predictions, 'velocity_values': velocity_values, 'sequence_predictions': _predict_sequences( frame_probs=frame_probs, onset_probs=onset_probs, frame_predictions=frame_predictions, onset_predictions=onset_predictions, offset_predictions=offset_predictions, velocity_values=velocity_values, hparams=hparams), # Include some features and labels in output because Estimator 'predict' # API does not give access to them. 'sequence_ids': features.sequence_id, 'sequence_labels': labels.note_sequence, 'frame_labels': labels.labels, 'onset_labels': labels.onsets, } for k, v in metrics_values.items(): predictions[k] = tf.stack(v) return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) else: raise ValueError('Unsupported mode: %s' % mode)
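In the transcription get_estimator_spec above, the CrossShardOptimizer wrapping is injected through contrib_layers.optimize_loss, which accepts a callable that maps a learning-rate tensor to an optimizer instance. A short sketch of that hook under the same TF 1.x contrib imports (the learning-rate value is illustrative):

def tpu_adam(lr):
  # Each shard's gradients are cross-replica summed before the Adam update.
  return contrib_tpu.CrossShardOptimizer(tf.train.AdamOptimizer(lr))

train_op = contrib_layers.optimize_loss(
    loss=loss,
    global_step=tf.train.get_or_create_global_step(),
    learning_rate=1e-3,  # illustrative value
    optimizer=tpu_adam,
    summaries=[])  # summary ops are not supported inside the TPU graph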
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" tf.logging.info("*** Features ***") for name in sorted(features.keys()): tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] label_ids = features["label_ids"] is_real_example = None if "is_real_example" in features: is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) else: is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) is_training = (mode == tf_estimator.ModeKeys.TRAIN) membership_logits, membership_vars = create_model( bert_config, is_training, input_ids, input_mask, segment_ids, num_labels, use_one_hot_embeddings, membership_features_str) membership_probs = tf.nn.softmax(membership_logits, axis=-1) membership_log_probs = tf.nn.log_softmax(membership_logits, axis=-1) tvars = tf.trainable_variables() initialized_variable_names = {} scaffold_fn = None if init_checkpoint: (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint( tvars, init_checkpoint) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(init_checkpoint, assignment_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) tf.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) output_spec = None if mode == tf_estimator.ModeKeys.TRAIN: one_hot_positions = tf.one_hot(label_ids, depth=2, dtype=tf.float32) per_example_loss = -tf.reduce_sum( one_hot_positions * membership_log_probs, axis=-1) total_loss = tf.reduce_mean(per_example_loss) global_step = tf.train.get_or_create_global_step() optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) if use_tpu: optimizer = contrib_tpu.CrossShardOptimizer(optimizer) # only optimize the membership classifier variables since we want to # freeze the model. train_op = optimizer.minimize(loss=total_loss, global_step=global_step, var_list=membership_vars) output_spec = contrib_tpu.TPUEstimatorSpec(mode=mode, loss=total_loss, train_op=train_op, scaffold_fn=scaffold_fn) elif mode == tf_estimator.ModeKeys.EVAL: one_hot_positions = tf.one_hot(label_ids, depth=2, dtype=tf.float32) per_example_loss = -tf.reduce_sum( one_hot_positions * membership_log_probs, axis=-1) total_loss = tf.reduce_mean(per_example_loss) def metric_fn(per_example_loss, label_ids, logits, is_real_example): predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) accuracy = tf.metrics.accuracy(labels=label_ids, predictions=predictions, weights=is_real_example) loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) return { "eval_accuracy": accuracy, "eval_loss": loss, } eval_metrics = (metric_fn, [ per_example_loss, label_ids, membership_logits, is_real_example ]) output_spec = contrib_tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, eval_metrics=eval_metrics, scaffold_fn=scaffold_fn) else: output_spec = contrib_tpu.TPUEstimatorSpec( mode=mode, predictions={"probabilities": membership_probs}, scaffold_fn=scaffold_fn) return output_spec
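The loss in the last two BERT snippets is ordinary softmax cross-entropy written out by hand: one-hot labels multiplied with log-probabilities, summed over classes, negated, and averaged. Up to floating-point details it matches the built-in sparse cross-entropy op, as this small comparison sketch shows:

# Hand-rolled form, as in the snippets above.
one_hot_positions = tf.one_hot(label_ids, depth=2, dtype=tf.float32)
per_example_loss = -tf.reduce_sum(
    one_hot_positions * tf.nn.log_softmax(membership_logits, axis=-1), axis=-1)

# Equivalent built-in op (same values up to numerical precision).
per_example_loss_builtin = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=label_ids, logits=membership_logits)

total_loss = tf.reduce_mean(per_example_loss)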