def model_fn(self, features, labels, mode, params):
    """Build the TPUEstimatorSpec for the embedding + reasoning pipeline.

    Args:
        features: Mapping of input tensors. Which keys are read depends on
            `self.embedding_model_class` (see the branches below).
        labels: Labels passed to the sparse softmax cross-entropy loss.
        mode: A `tf.estimator.ModeKeys` value; only TRAIN and EVAL yield a
            spec.
        params: Unused.

    Returns:
        A `contrib_tpu.TPUEstimatorSpec` for TRAIN or EVAL.

    Raises:
        NotImplementedError: For any mode other than TRAIN or EVAL.
    """
    del params  # Unused.
    mode_keys = tf.estimator.ModeKeys
    training = mode == mode_keys.TRAIN
    pending_updates = []

    # Step 1: obtain embeddings for the context and answer panels.
    embedding_kind = self.embedding_model_class
    if embedding_kind == "values":
        # Ground-truth factors, given as integer values.
        context_emb = features["context_factor_values"]
        answer_emb = features["answers_factor_values"]
    elif embedding_kind == "onehot":
        # Ground-truth factors, given as one-hot vectors.
        context_emb = features["context_factors_onehot"]
        answer_emb = features["answers_factors_onehot"]
    else:
        # Anything else is treated as a callable that builds the embedder.
        embedder = embedding_kind()
        panels = [features["context"], features["answers"]]
        context_emb, answer_emb = embedder(panels, training=training)
        embedder.summary(print_fn=tf.logging.info)
        pending_updates.extend(embedder.updates)

    # Step 2: run the reasoning model on top of the embeddings.
    reasoner = self.reasoning_model_class()
    logits = reasoner([context_emb, answer_emb], training=training)
    reasoner.summary(print_fn=tf.logging.info)
    pending_updates.extend(reasoner.updates)

    per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits)
    mean_loss = tf.reduce_mean(per_example_loss)

    if training:
        # The model update ops (e.g. from batch norm) must run with the
        # optimizer step, hence the control dependency.
        with tf.control_dependencies(pending_updates):
            train_op = self.optimizer_fn().minimize(
                loss=mean_loss, global_step=tf.train.get_global_step())
        return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                            loss=mean_loss,
                                            train_op=train_op)

    if mode == mode_keys.EVAL:

        def _accuracy_metric(labels, logits):
            """Return the accuracy of the arg-max prediction."""
            predicted = tf.argmax(logits, 1)
            accuracy = tf.metrics.accuracy(labels=labels,
                                           predictions=predicted)
            return {"accuracy": accuracy}

        return contrib_tpu.TPUEstimatorSpec(
            mode=mode,
            loss=mean_loss,
            eval_metrics=(_accuracy_metric, [labels, logits]))

    raise NotImplementedError("Unsupported mode.")
def model_fn(self, features, labels, mode, params):
    """TPUEstimator compatible model function.

    Encodes `features` into a Gaussian latent, samples from it, decodes the
    sample, and combines the reconstruction loss with `self.regularizer`.

    Args:
        features: Input tensor. Its static shape without the leading
            dimension is handed to the decoder as the output shape.
        labels: Unused.
        mode: A `tf.estimator.ModeKeys` value; TRAIN and EVAL are supported.
        params: Unused; present for TPUEstimator compatibility.

    Returns:
        A `contrib_tpu.TPUEstimatorSpec` for TRAIN or EVAL.

    Raises:
        NotImplementedError: For any mode other than TRAIN or EVAL.
    """
    del labels  # Unused.
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    data_shape = features.get_shape().as_list()[1:]
    batch_size = tf.shape(features)[0]

    z_mean, z_logvar = self.gaussian_encoder(features,
                                             is_training=is_training)
    z_sampled = self.sample_from_latent_distribution(z_mean, z_logvar)
    reconstructions, group_feats_G, lie_alg_basis = self.decode_with_gfeats(
        z_sampled, data_shape, is_training)

    # Only the first `batch_size` reconstructions are scored against the
    # inputs.
    per_sample_loss = losses.make_reconstruction_loss(
        features, reconstructions[:batch_size])
    reconstruction_loss = tf.reduce_mean(per_sample_loss)
    kl_loss = compute_gaussian_kl(z_mean, z_logvar)
    regularizer = self.regularizer(kl_loss, z_mean, z_logvar, z_sampled,
                                   group_feats_G, lie_alg_basis, batch_size)
    loss = tf.add(reconstruction_loss, regularizer, name="loss")
    elbo = tf.add(reconstruction_loss, kl_loss, name="elbo")

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = optimizers.make_vae_optimizer()
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_op = optimizer.minimize(loss=loss,
                                      global_step=tf.train.get_global_step())
        # Run the collected update ops together with the optimizer step.
        train_op = tf.group([train_op, update_ops])
        tf.summary.scalar("reconstruction_loss", reconstruction_loss)
        tf.summary.scalar("elbo", -elbo)

        logging_hook = tf.train.LoggingTensorHook(
            {
                "loss": loss,
                "reconstruction_loss": reconstruction_loss,
                "elbo": -elbo
            },
            every_n_iter=100)
        return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                            loss=loss,
                                            train_op=train_op,
                                            training_hooks=[logging_hook])

    if mode == tf.estimator.ModeKeys.EVAL:
        return contrib_tpu.TPUEstimatorSpec(
            mode=mode,
            loss=loss,
            eval_metrics=(make_metric_fn("reconstruction_loss", "elbo",
                                         "regularizer", "kl_loss"),
                          [reconstruction_loss, -elbo, regularizer, kl_loss]))

    # EVAL is handled above, so the message must not claim it is unsupported.
    raise NotImplementedError(
        "Only TRAIN and EVAL modes are supported: %s" % mode)
def my_model(features, labels, mode, params):
    """Fully connected classifier packaged as a TPUEstimatorSpec.

    One ReLU dense layer is stacked per entry of `params["hidden_units"]`,
    followed by a linear layer with `params["n_classes"]` outputs.

    Args:
        features: Feature values for the input samples.
        labels: Class ids assigned to the corresponding input samples.
        mode: One of the `tf.estimator.ModeKeys` values.
        params: Dictionary providing "feature_columns", "hidden_units" and
            "n_classes".

    Returns:
        A `contrib_tpu.TPUEstimatorSpec` for PREDICT, EVAL or TRAIN; `None`
        for any other mode.
    """
    mode_keys = tf.estimator.ModeKeys

    # Input layer followed by the configured stack of hidden layers.
    hidden = tf.feature_column.input_layer(features,
                                           params["feature_columns"])
    for width in params["hidden_units"]:
        hidden = tf.layers.dense(hidden, units=width, activation=tf.nn.relu)

    # One logit per class.
    logits = tf.layers.dense(hidden, params["n_classes"], activation=None)
    class_ids = tf.argmax(logits, 1)

    if mode == mode_keys.PREDICT:
        return contrib_tpu.TPUEstimatorSpec(
            mode,
            predictions={
                "class_ids": class_ids[:, tf.newaxis],
                "probabilities": tf.nn.softmax(logits),
                "logits": logits,
            })

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                  logits=logits)

    if mode == mode_keys.EVAL:
        eval_metrics = (metric_fn, [labels, logits])
        return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                            loss=loss,
                                            eval_metrics=eval_metrics)

    if mode == mode_keys.TRAIN:
        optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
        if FLAGS.use_tpu:
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)
        step = tf.train.get_global_step()
        train_op = optimizer.minimize(loss, global_step=step)
        return contrib_tpu.TPUEstimatorSpec(mode,
                                            loss=loss,
                                            train_op=train_op)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Builds the classifier via `create_model` and returns the spec matching
    `mode`. `num_labels`, `albert_hub_module_handle`, `learning_rate`,
    `num_train_steps`, `num_warmup_steps` and `use_tpu` come from the
    enclosing scope.

    Raises:
        ValueError: If `mode` is not TRAIN, EVAL or PREDICT.
    """
    tf.logging.info("*** Features ***")
    for feature_name in sorted(features):
        tf.logging.info(" name = %s, shape = %s", feature_name,
                        features[feature_name].shape)

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    mode_keys = tf.estimator.ModeKeys
    training = mode == mode_keys.TRAIN

    model_outputs = create_model(training, input_ids, input_mask,
                                 segment_ids, label_ids, num_labels,
                                 albert_hub_module_handle)
    total_loss, per_example_loss, logits, probabilities = model_outputs

    if training:
        train_op = optimization.create_optimizer(total_loss, learning_rate,
                                                 num_train_steps,
                                                 num_warmup_steps, use_tpu)
        return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                            loss=total_loss,
                                            train_op=train_op)

    if mode == mode_keys.EVAL:

        def metric_fn(per_example_loss, label_ids, logits):
            """Report accuracy and mean loss over the evaluation set."""
            predicted = tf.argmax(logits, axis=-1, output_type=tf.int32)
            return {
                "eval_accuracy": tf.metrics.accuracy(label_ids, predicted),
                "eval_loss": tf.metrics.mean(per_example_loss),
            }

        return contrib_tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metrics=(metric_fn, [per_example_loss, label_ids, logits]))

    if mode == mode_keys.PREDICT:
        return contrib_tpu.TPUEstimatorSpec(
            mode=mode, predictions={"probabilities": probabilities})

    raise ValueError("Only TRAIN, EVAL and PREDICT modes are supported: %s" %
                     (mode))
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Builds a `MultitaskModel`, arranges checkpoint initialization (skipped
    when `config.debug` is set) and returns a TRAIN or PREDICT spec.
    `config`, `tasks`, `task_weights` and `num_train_steps` come from the
    enclosing scope.
    """
    utils.log("Building model")
    training = mode == tf.estimator.ModeKeys.TRAIN
    model = MultitaskModel(config, tasks, task_weights, training, features,
                           num_train_steps)

    # Initialize from the pre-trained checkpoint. With use_tpu the restore is
    # deferred into the scaffold function; otherwise it is set up right here.
    trainable_vars = tf.trainable_variables()
    scaffold_fn = None
    if not config.debug:
        assignment_map, _ = modeling.get_assignment_map_from_checkpoint(
            trainable_vars, config.init_checkpoint)

        def _restore_from_checkpoint():
            tf.train.init_from_checkpoint(config.init_checkpoint,
                                          assignment_map)

        if config.use_tpu:

            def tpu_scaffold():
                _restore_from_checkpoint()
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold
        else:
            _restore_from_checkpoint()

    if training:
        train_op = optimization.create_optimizer(config, model.loss,
                                                 num_train_steps)
        # The ETA hook is given no tensors to report when running on TPU.
        if config.use_tpu:
            hook_tensors = {}
        else:
            hook_tensors = dict(loss=model.loss)
        eta_hook = training_utils.ETAHook(config, hook_tensors,
                                          num_train_steps)
        output_spec = contrib_tpu.TPUEstimatorSpec(mode=mode,
                                                   loss=model.loss,
                                                   train_op=train_op,
                                                   scaffold_fn=scaffold_fn,
                                                   training_hooks=[eta_hook])
    else:
        assert mode == tf.estimator.ModeKeys.PREDICT
        output_spec = contrib_tpu.TPUEstimatorSpec(
            mode=mode,
            predictions=utils.flatten_dict(model.outputs),
            scaffold_fn=scaffold_fn)

    utils.log("Building complete")
    return output_spec
def model_fn(features, labels, mode, params):
    """Build the Show-and-Tell model and its training spec for TPUEstimator.

    Args:
        features: Dict with "images", "input_seqs", "target_seqs" and
            "input_mask" entries.
        labels: Unused.
        mode: Estimator mode key, translated through `MODEKEY_TO_MODE`.
        params: Unused.

    Returns:
        A `contrib_tpu.TPUEstimatorSpec` carrying the total loss, the train
        op and a scaffold function that uses `model.init_fn`.
    """
    model = show_and_tell_model.ShowAndTellModel(
        configuration.ModelConfig(),
        mode=MODEKEY_TO_MODE[mode],
        train_inception=FLAGS.train_inception)
    training_config = configuration.TrainingConfig()

    model.build_model_for_tpu(images=features["images"],
                              input_seqs=features["input_seqs"],
                              target_seqs=features["target_seqs"],
                              input_mask=features["input_mask"])

    # Plain SGD wrapped with gradient-norm clipping, sharded when on TPU.
    sgd = tf.train.GradientDescentOptimizer(
        learning_rate=training_config.initial_learning_rate)
    optimizer = contrib_estimator.clip_gradients_by_norm(
        sgd, training_config.clip_gradients)
    if FLAGS.use_tpu:
        optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

    step = tf.train.get_or_create_global_step()
    train_op = optimizer.minimize(model.total_loss, global_step=step)

    # The scaffold hands `model.init_fn` to the session at initialization.
    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        loss=model.total_loss,
        train_op=train_op,
        scaffold_fn=lambda: tf.train.Scaffold(init_fn=model.init_fn))
def _get_estimator_spec_with_metrics(
        logits,  # type: tf.Tensor
        softmax_logits,  # type: tf.Tensor
        duplicate_mask,  # type: tf.Tensor
        num_training_neg,  # type: int
        match_mlperf=False,  # type: bool
        use_tpu_spec=False  # type: bool
):
    """Return an EVAL-mode spec that carries the evaluation metrics.

    All arguments are forwarded unchanged to
    `compute_eval_loss_and_metrics_helper`.

    Returns:
        A `contrib_tpu.TPUEstimatorSpec` when `use_tpu_spec` is true,
        otherwise a `tf.estimator.EstimatorSpec` whose metric ops are built
        by calling the metric function directly.
    """
    (cross_entropy, metric_fn, in_top_k, ndcg,
     metric_weights) = compute_eval_loss_and_metrics_helper(
         logits, softmax_logits, duplicate_mask, num_training_neg,
         match_mlperf, use_tpu_spec)
    eval_mode = tf.estimator.ModeKeys.EVAL

    if not use_tpu_spec:
        metric_ops = metric_fn(in_top_k, ndcg, metric_weights)
        return tf.estimator.EstimatorSpec(mode=eval_mode,
                                          loss=cross_entropy,
                                          eval_metric_ops=metric_ops)

    # The TPU spec takes the metric function and its inputs separately.
    return contrib_tpu.TPUEstimatorSpec(
        mode=eval_mode,
        loss=cross_entropy,
        eval_metrics=(metric_fn, [in_top_k, ndcg, metric_weights]))
def model_fn(features, labels, mode, params):
    """TPUEstimatorSpec for the Squeezenet model.

    A single spec carrying the loss, train op, eval metrics and predictions
    is returned regardless of `mode`.

    Args:
        features: Input passed to `squeezenet`.
        labels: Sparse class labels.
        mode: A `tf.estimator.ModeKeys` value; only decides `is_training`.
        params: Dict with "num_classes", "use_tpu" and a nested "train"
            configuration (batch size, epochs, learning rate, optimizer).

    Returns:
        A `contrib_tpu.TPUEstimatorSpec`.
    """
    training = mode == tf.estimator.ModeKeys.TRAIN
    logits = squeezenet(features,
                        is_training=training,
                        num_classes=params["num_classes"])
    cross_entropy = tf.losses.sparse_softmax_cross_entropy(logits=logits,
                                                           labels=labels)
    loss = tf.reduce_mean(cross_entropy)

    train_cfg = params["train"]
    batch_size = train_cfg["train_batch_size"]
    global_batch_size = train_cfg["num_cores_per_replica"] * batch_size
    total_examples = (train_cfg["num_examples_per_epoch"] *
                      train_cfg["num_epochs"])
    decay_steps = total_examples // global_batch_size

    # Linear (power=1.0) decay from the initial to the end learning rate.
    lr_cfg = train_cfg["learning_rate"]
    learning_rate = tf.train.polynomial_decay(
        lr_cfg["init_learning_rate"],
        global_step=tf.train.get_or_create_global_step(),
        end_learning_rate=lr_cfg["end_learning_rate"],
        decay_steps=decay_steps,
        power=1.0,
        cycle=False)

    # TODO(power): Hack copied from resnet: remove when summaries are working.
    # The learning rate is repeated into a [batch_size, 1] tensor so that it
    # can be handed to the metric function below.
    lr_tiled = tf.tile(tf.expand_dims(learning_rate, 0), [batch_size])
    lr_repeat = tf.reshape(lr_tiled, [batch_size, 1])

    optimizer_cfg = train_cfg["optimizer"]
    optimizer_type = optimizer_cfg["type"]
    if optimizer_type == "adam":
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    elif optimizer_type == "rmsprop":
        optimizer = tf.train.RMSPropOptimizer(
            learning_rate=learning_rate,
            momentum=optimizer_cfg["momentum"],
            epsilon=1.0)
    else:
        # Any other type falls back to Nesterov momentum.
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=learning_rate,
            momentum=optimizer_cfg["momentum"],
            use_nesterov=True)

    if params["use_tpu"]:
        optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

    train_op = optimizer.minimize(loss, tf.train.get_global_step())
    eval_metrics = (metric_fn, [labels, logits, lr_repeat])
    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
    }
    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        eval_metrics=eval_metrics,
        predictions=predictions,
    )
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Runs a `modeling.BertModel` with `is_training=False` and exposes the
    encoder layers selected by `layer_indexes` as predictions.
    `bert_config`, `use_one_hot_embeddings`, `init_checkpoint`, `use_tpu`
    and `layer_indexes` come from the enclosing scope.

    Raises:
        ValueError: If `mode` is not PREDICT.
    """
    unique_ids = features["unique_ids"]
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    input_type_ids = features["input_type_ids"]

    model = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=input_type_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    if mode != tf.estimator.ModeKeys.PREDICT:
        raise ValueError("Only PREDICT modes are supported: %s" % (mode))

    trainable_vars = tf.trainable_variables()
    assignment_map, restored_names = (
        modeling.get_assignment_map_from_checkpoint(trainable_vars,
                                                    init_checkpoint))

    # With use_tpu the restore is deferred into the scaffold function;
    # otherwise it is set up right here.
    scaffold_fn = None
    if use_tpu:

        def tpu_scaffold():
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
            return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold
    else:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for variable in trainable_vars:
        marker = ", *INIT_FROM_CKPT*" if variable.name in restored_names else ""
        tf.logging.info(" name = %s, shape = %s%s", variable.name,
                        variable.shape, marker)

    encoder_layers = model.get_all_encoder_layers()
    predictions = {"unique_id": unique_ids}
    predictions.update({
        "layer_output_%d" % position: encoder_layers[layer_index]
        for position, layer_index in enumerate(layer_indexes)
    })

    return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                        predictions=predictions,
                                        scaffold_fn=scaffold_fn)
def model_fn(features, labels, mode, params):
    """Construct the model used to predict handwritten digits.

    Args:
        features: Either the image tensor itself or a dict holding it under
            the "image" key.
        labels: Sparse class labels (not read in PREDICT mode).
        mode: A `tf.estimator.ModeKeys` value.
        params: Unused.

    Returns:
        A `contrib_tpu.TPUEstimatorSpec` for PREDICT, TRAIN or EVAL; `None`
        for any other mode.
    """
    del params  # Unused.
    mode_keys = tf.estimator.ModeKeys

    image = features["image"] if isinstance(features, dict) else features
    model = mnist.create_model("channels_last")

    # The model runs in training mode only for TRAIN; PREDICT and EVAL both
    # run it in inference mode.
    logits = model(image, training=(mode == mode_keys.TRAIN))

    if mode == mode_keys.PREDICT:
        return contrib_tpu.TPUEstimatorSpec(
            mode,
            predictions={
                'class_ids': tf.argmax(logits, axis=1),
                'probabilities': tf.nn.softmax(logits),
            })

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                  logits=logits)

    if mode == mode_keys.TRAIN:
        decayed_lr = tf.train.exponential_decay(FLAGS.learning_rate,
                                                tf.train.get_global_step(),
                                                decay_steps=100000,
                                                decay_rate=0.96)
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=decayed_lr)
        if FLAGS.use_tpu:
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)
        train_op = optimizer.minimize(loss, tf.train.get_global_step())
        return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                            loss=loss,
                                            train_op=train_op)

    if mode == mode_keys.EVAL:
        return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                            loss=loss,
                                            eval_metrics=(metric_fn,
                                                          [labels, logits]))
def model_fn(features, labels, mode, params):
    """Inception v3 model using Estimator API.

    Only TRAIN mode is implemented. The loss is the label-smoothed softmax
    cross entropy on the main logits plus, when the network exposes an
    'AuxLogits' end point, the same loss on the auxiliary head weighted by
    0.4.

    Args:
        features: Input passed to `inception.inception_v3`.
        labels: Class ids; cast to int32 and one-hot encoded with depth
            `FLAGS.num_labels`.
        mode: A `tf.estimator.ModeKeys` value; must be TRAIN.
        params: Unused.

    Returns:
        A `contrib_tpu.TPUEstimatorSpec` with the total loss and train op.

    Raises:
        RuntimeError: If `mode` is not TRAIN.
        ValueError: If `FLAGS.optimizer` is neither 'sgd' nor 'momentum'.
    """
    del params  # Unused.

    if mode != tf.estimator.ModeKeys.TRAIN:
        raise RuntimeError('mode {} is not supported yet'.format(mode))

    num_labels = FLAGS.num_labels

    with slim.arg_scope(inception_v3_arg_scope(is_training=True)):
        logits, end_points = inception.inception_v3(
            features,
            num_labels,
            is_training=True,
            depth_multiplier=FLAGS.depth_multiplier)

    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32),
                               depth=num_labels)

    # tf.losses.softmax_cross_entropy expects (onehot_labels, logits). The
    # arguments are passed by keyword so the logits are not mistaken for the
    # labels (which would also apply label smoothing to the wrong tensor).
    if 'AuxLogits' in end_points:
        tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels,
                                        logits=end_points['AuxLogits'],
                                        label_smoothing=0.1,
                                        weights=0.4,
                                        scope='aux_loss')

    tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels,
                                    logits=logits,
                                    label_smoothing=0.1,
                                    weights=1.0)

    # Sums every loss registered above (plus any regularization losses).
    loss = tf.losses.get_total_loss()

    if FLAGS.optimizer == 'sgd':
        tf.logging.info('Using SGD optimizer')
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=FLAGS.learning_rate)
    elif FLAGS.optimizer == 'momentum':
        tf.logging.info('Using Momentum optimizer')
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=FLAGS.learning_rate, momentum=0.9)
    else:
        tf.logging.fatal('Unknown optimizer: %s', FLAGS.optimizer)
        # Not every logging backend aborts on fatal(); fail explicitly rather
        # than continuing with `optimizer` unbound.
        raise ValueError('Unknown optimizer: %s' % FLAGS.optimizer)

    if FLAGS.use_tpu:
        optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

    train_op = optimizer.minimize(
        loss, global_step=tf.train.get_or_create_global_step())

    return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                        loss=loss,
                                        train_op=train_op)
def _define_model(features, labels, mode, params):
    """Adapt `self.define_model` to the TPUEstimator `model_fn` signature.

    `self` is taken from the enclosing scope. The per-mode spec is built
    from the `(outputs, losses, others)` triple that `self.define_model`
    returns; `params` is not read.

    Returns:
        A `tpu.TPUEstimatorSpec` for EVAL, PREDICT or TRAIN; `None` for any
        other mode.
    """
    mode_keys = tf.estimator.ModeKeys

    self.outputs = {}
    self.losses = {}
    # NOTE(review): `otters` looks like a typo for `others` - confirm against
    # the rest of the class before renaming.
    self.otters = {}

    model_outputs, model_losses, extras = self.define_model(
        (features, labels), mode)

    if mode == mode_keys.EVAL:
        return tpu.TPUEstimatorSpec(mode=mode,
                                    loss=model_losses,
                                    eval_metrics=extras)

    if mode == mode_keys.PREDICT:
        return tpu.TPUEstimatorSpec(mode=mode, predictions=model_outputs)

    if mode == mode_keys.TRAIN:
        self.losses['train'] = model_losses
        self._build_optimizer(tpu_support=True)

        # Only the first optimizer op is used; anything else is reported.
        if len(self.optimize_ops) != 1:
            logging.error(
                'Implementati Error: More than one optimizer defined')
            logging.warning(' [*] Selecting only the first optimizer')

        return tpu.TPUEstimatorSpec(mode=mode,
                                    loss=model_losses[0],
                                    train_op=self.optimize_ops[0])
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Runs the BERT encoder via `_creat_bert` and feeds its output to
    `_create_cqa_modules`, which yields the loss and the logits.
    `bert_config`, `use_one_hot_embeddings`, `init_checkpoint`,
    `learning_rate`, `num_train_steps`, `num_warmup_steps` and `use_tpu`
    come from the enclosing scope.

    Returns:
        A `tpu.TPUEstimatorSpec` for TRAIN (loss and train op) or PREDICT
        (the logits as predictions).

    Raises:
        ValueError: If `mode` is neither TRAIN nor PREDICT.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # The encoder output is the input of the downstream CQA modules.
    predictions = _creat_bert(is_training, features, bert_config,
                              use_one_hot_embeddings, init_checkpoint)
    total_loss, logits = _create_cqa_modules(is_training, predictions)

    scaffold_fn = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = optimization.create_optimizer(total_loss, learning_rate,
                                                 num_train_steps,
                                                 num_warmup_steps, use_tpu)
        output_spec = tpu.TPUEstimatorSpec(mode=mode,
                                           loss=total_loss,
                                           train_op=train_op,
                                           scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.PREDICT:
        output_spec = tpu.TPUEstimatorSpec(mode=mode,
                                           predictions=logits,
                                           scaffold_fn=scaffold_fn)
    else:
        # The message names the modes that are actually handled above.
        raise ValueError("Only TRAIN and PREDICT modes are supported: %s" %
                         (mode))

    return output_spec
def model_fn(features, labels, mode, params):
    """Define a CIFAR model in Keras.

    Args:
        features: Input tensor wrapped by the Keras input layer.
        labels: Sparse class labels.
        mode: A `tf.estimator.ModeKeys` value, forwarded to the spec.
        params: Unused.

    Returns:
        A `contrib_tpu.TPUEstimatorSpec` with loss, train op and
        predictions.
    """
    del params  # unused
    layers = contrib_keras.layers

    # Layer classes with their constructor arguments, in application order.
    # Each layer is created and applied immediately, one after the other.
    conv_args = dict(kernel_size=5, activation="relu", padding="same")
    architecture = (
        (layers.Conv2D, dict(filters=32, **conv_args)),
        (layers.MaxPool2D, dict(pool_size=2)),
        (layers.Conv2D, dict(filters=64, **conv_args)),
        (layers.MaxPool2D, dict(pool_size=2)),
        (layers.Flatten, {}),
        (layers.Dense, dict(units=512, activation="relu")),
        (layers.Dense, dict(units=10)),
    )

    # Pass our input tensor to initialize the Keras input layer.
    net = layers.Input(tensor=features)
    for layer_cls, layer_kwargs in architecture:
        net = layer_cls(**layer_kwargs)(net)
    logits = net

    # Instead of constructing a Keras model for training, the loss and the
    # optimizer are built directly in TensorFlow.
    #
    # N.B. This construction omits some features that are important for more
    # complex models (e.g. regularization, batch-norm). Once
    # `model_to_estimator` support is added for TPUs, it should be used
    # instead.
    per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels)
    loss = tf.reduce_mean(per_example_loss)

    optimizer = tf.train.AdamOptimizer()
    if FLAGS.use_tpu:
        optimizer = contrib_tpu.CrossShardOptimizer(optimizer)
    train_op = optimizer.minimize(loss,
                                  global_step=tf.train.get_global_step())

    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
    }
    return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                        loss=loss,
                                        train_op=train_op,
                                        predictions=predictions)
def model_fn(features, labels, mode, params):
    """Single dense-layer classifier returned as a TPUEstimatorSpec.

    Args:
        features: Input tensor for the dense layer.
        labels: One-hot labels.
        mode: A `tf.estimator.ModeKeys` value, forwarded to the spec.
        params: Dict; `params["use_tpu"]` toggles the cross-shard optimizer.

    Returns:
        A `tpu.TPUEstimatorSpec` with the loss and train op.
    """
    # Build graph
    logits = tf.layers.dense(features, 10)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=labels,
                                           logits=logits)

    optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-2)
    # NOTE: on TPUs the optimizer has to be wrapped in CrossShardOptimizer,
    # which aggregates the gradients with an all-reduce.
    if params["use_tpu"]:
        optimizer = tpu.CrossShardOptimizer(optimizer)

    step = tf.train.get_or_create_global_step()
    train_op = optimizer.minimize(loss=loss, global_step=step)

    # Create EstimatorSpec
    return tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
    )
def _model_fn(features,
              labels,
              mode,
              params,
              model,
              use_tpu_estimator_spec,
              variable_filter_fn=None):
    """Model definition for the RetinaNet model based on ResNet.

    Builds the network outputs once and then branches on `mode`: PREDICT
    returns early with post-processed predictions, while TRAIN and EVAL share
    the loss computation and differ in whether a train op or eval metrics are
    attached to the returned spec.

    Args:
      features: the input image tensor with shape [batch_size, height, width,
        3]. The height and width are fixed and equal. In PREDICT mode this may
        instead be a dict holding the images under 'inputs' together with
        'image_info' and, optionally, 'labels'.
      labels: the input labels in a dictionary. The labels include class
        targets and box targets which are dense label maps. The labels are
        generated from get_input_fn function in dataloader.py
      mode: the mode of TPUEstimator/Estimator including TRAIN, EVAL, and
        PREDICT.
      params: the dictionary defines hyperparameters of model. The default
        settings are in default_hparams function in this file.
      model: the RetinaNet model outputs class logits and box regression
        outputs.
      use_tpu_estimator_spec: Whether to use TPUEstimatorSpec or
        EstimatorSpec.
      variable_filter_fn: the filter function that takes trainable_variables
        and returns the variable list after applying the filter rule.

    Returns:
      tpu_spec: the TPUEstimatorSpec to run training, evaluation, or
        prediction. When `use_tpu_estimator_spec` is false, a plain
        `tf.estimator.EstimatorSpec` is returned for TRAIN/EVAL instead; it
        carries neither the eval metrics nor the scaffold.
    """
    # In predict mode features is a dict with input as value of the 'inputs'.
    image_info = None
    if (mode == tf.estimator.ModeKeys.PREDICT and isinstance(features, dict)
            and 'inputs' in features):
        image_info = features['image_info']
        # Labels, when present, travel inside the features dict in this case.
        labels = None
        if 'labels' in features:
            labels = features['labels']
        features = features['inputs']

    def _model_outputs():
        # NOTE(review): presumably `aspect_ratios` is a list and `num_scales`
        # an int, so the product repeats the list and len() yields
        # len(aspect_ratios) * num_scales - confirm against default_hparams.
        return model(features,
                     min_level=params['min_level'],
                     max_level=params['max_level'],
                     num_classes=params['num_classes'],
                     num_anchors=len(params['aspect_ratios'] *
                                     params['num_scales']),
                     resnet_depth=params['resnet_depth'],
                     is_training_bn=params['is_training_bn'])

    if params['use_bfloat16']:
        with contrib_tpu.bfloat16_scope():
            cls_outputs, box_outputs = _model_outputs()
            levels = cls_outputs.keys()
            # Cast every level's outputs back to float32 for the code below.
            for level in levels:
                cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
                box_outputs[level] = tf.cast(box_outputs[level], tf.float32)
    else:
        cls_outputs, box_outputs = _model_outputs()
        # NOTE(review): `levels` is not read again in this function.
        levels = cls_outputs.keys()

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        # Postprocess on host; memory layout for NMS on TPU is very inefficient.
        def _predict_postprocess_wrapper(args):
            # Unpacks the single tuple argument into positional arguments.
            return _predict_postprocess(*args)

        predictions = contrib_tpu.outside_compilation(
            _predict_postprocess_wrapper,
            (cls_outputs, box_outputs, labels, params))

        # Include resizing information on prediction output to help bbox drawing.
        if image_info is not None:
            predictions.update({
                'image_info': tf.identity(image_info, 'ImageInfo'),
            })

        return contrib_tpu.TPUEstimatorSpec(mode=tf.estimator.ModeKeys.PREDICT,
                                            predictions=predictions)

    # Load pretrained model from checkpoint.
    if params['resnet_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN:

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            tf.train.init_from_checkpoint(
                params['resnet_checkpoint'], {
                    '/': 'resnet%s/' % params['resnet_depth'],
                })
            return tf.train.Scaffold()
    else:
        scaffold_fn = None

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_global_step()
    learning_rate = learning_rate_schedule(params['adjusted_learning_rate'],
                                           params['lr_warmup_init'],
                                           params['lr_warmup_step'],
                                           params['first_lr_drop_step'],
                                           params['second_lr_drop_step'],
                                           global_step)
    # cls_loss and box_loss are for logging. only total_loss is optimized.
    total_loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs,
                                                    labels, params)
    # L2 weight decay over every trainable variable whose name does not
    # contain 'batch_normalization'.
    total_loss += _WEIGHT_DECAY * tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.MomentumOptimizer(learning_rate,
                                               momentum=params['momentum'])
        if params['use_tpu']:
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)
        else:
            # The mixed-precision graph rewrite is only considered off-TPU.
            if params['auto_mixed_precision']:
                optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
                    optimizer)

        # Batch norm requires `update_ops` to be executed alongside `train_op`.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        # var_list=None lets the optimizer use its default variable list.
        var_list = variable_filter_fn(
            tf.trainable_variables(),
            params['resnet_depth']) if variable_filter_fn else None
        minimize_op = optimizer.minimize(total_loss,
                                         global_step,
                                         var_list=var_list)
        train_op = tf.group(minimize_op, update_ops)
    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            batch_size = params['batch_size']
            eval_anchors = anchors.Anchors(params['min_level'],
                                           params['max_level'],
                                           params['num_scales'],
                                           params['aspect_ratios'],
                                           params['anchor_scale'],
                                           params['image_size'])
            anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                                   params['num_classes'])
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
            coco_metrics = coco_metric_fn(batch_size, anchor_labeler,
                                          params['val_json_file'], **kwargs)

            # Add metrics to output.
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        # Each loss is tiled `batch_size` times and reshaped to
        # [batch_size, 1] before being handed to `metric_fn`.
        # NOTE(review): this presumes cls_loss/box_loss are scalars - confirm.
        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'source_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
        }
        # Called for its effect on `metric_fn_inputs`; the return value is
        # not used here.
        add_metric_fn_inputs(params, cls_outputs, box_outputs,
                             metric_fn_inputs)
        eval_metrics = (metric_fn, metric_fn_inputs)

    if use_tpu_estimator_spec:
        return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                            loss=total_loss,
                                            train_op=train_op,
                                            eval_metrics=eval_metrics,
                                            scaffold_fn=scaffold_fn)
    else:
        # Neither `eval_metrics` nor `scaffold_fn` is forwarded on this path.
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=total_loss,
            # TODO(rostam): Fix bug to get scaffold working.
            # scaffold=scaffold_fn(),
            train_op=train_op)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Builds the span-prediction model through `create_model` and returns a
    TRAIN or PREDICT spec. `bert_config`, `use_one_hot_embeddings`,
    `init_checkpoint`, `use_tpu`, `learning_rate`, `num_train_steps` and
    `num_warmup_steps` come from the enclosing scope.

    Raises:
        ValueError: If `mode` is neither TRAIN nor PREDICT.
    """
    tf.logging.info("*** Features ***")
    for feature_name in sorted(features):
        tf.logging.info(" name = %s, shape = %s", feature_name,
                        features[feature_name].shape)

    unique_ids = features["unique_ids"]
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]

    mode_keys = tfes.estimator.ModeKeys
    training = mode == mode_keys.TRAIN

    start_logits, end_logits = create_model(
        bert_config=bert_config,
        is_training=training,
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    trainable_vars = tf.trainable_variables()
    scaffold_fn = None
    if init_checkpoint:
        # The second return value (names of the initialized variables) is
        # not needed here.
        assignment_map, _ = modeling.get_assignment_map_from_checkpoint(
            trainable_vars, init_checkpoint)

        if use_tpu:

            def tpu_scaffold():
                tf.train.init_from_checkpoint(init_checkpoint,
                                              assignment_map)
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold
        else:
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    if training:
        seq_length = modeling.get_shape_list(input_ids)[1]

        def compute_loss(logits, positions):
            """Mean negative log-likelihood of the given positions."""
            targets = tf.one_hot(positions,
                                 depth=seq_length,
                                 dtype=tf.float32)
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            per_example = tf.reduce_sum(targets * log_probs, axis=-1)
            return -tf.reduce_mean(per_example)

        start_positions = features["start_positions"]
        end_positions = features["end_positions"]

        start_loss = compute_loss(start_logits, start_positions)
        end_loss = compute_loss(end_logits, end_positions)
        total_loss = (start_loss + end_loss) / 2.0

        train_op = optimization.create_optimizer(total_loss, learning_rate,
                                                 num_train_steps,
                                                 num_warmup_steps, use_tpu)
        return tpu.TPUEstimatorSpec(mode=mode,
                                    loss=total_loss,
                                    train_op=train_op,
                                    scaffold_fn=scaffold_fn)

    if mode == mode_keys.PREDICT:
        # A disabled variant also decoded the answer span in-graph: take the
        # outer product of the start/end softmax, keep the upper triangle
        # with 15 diagonals (so an answer spans at most 15 + 1 tokens), then
        # take the arg-max along each axis to get the start and end indices.
        predictions = {
            "unique_ids": unique_ids,
            "start_logits": start_logits,
            "end_logits": end_logits,
        }
        return tpu.TPUEstimatorSpec(mode=mode,
                                    predictions=predictions,
                                    scaffold_fn=scaffold_fn)

    raise ValueError("Only TRAIN and PREDICT modes are supported: %s" %
                     (mode))
def resnet_model_fn_w_pruning(features, labels, mode, params):
  """The model_fn for ResNet-50 (or MobileNet v1/v2) with pruning.

  Args:
    features: A float32 batch of images, or a dict holding it under 'feature'.
    labels: A int32 batch of labels.
    mode: Specifies whether training or evaluation.
    params: Dictionary of parameters passed to the model. Reads
      'training_method', 'use_tpu' and (outside PREDICT) 'output_dir'.

  Returns:
    A `tf.estimator.EstimatorSpec` in PREDICT mode, otherwise a
    `TPUEstimatorSpec` for the model.

  Raises:
    ValueError: If `FLAGS.model_architecture` or `FLAGS.precision` is not one
      of the supported values.
  """
  width = 1. if FLAGS.width <= 0 else FLAGS.width
  if isinstance(features, dict):
    features = features['feature']

  if FLAGS.data_format == 'channels_first':
    assert not FLAGS.transpose_input  # channels_first only for GPU
    features = tf.transpose(features, [0, 3, 1, 2])

  if FLAGS.transpose_input and mode != tf.estimator.ModeKeys.PREDICT:
    features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

  # Normalize the image to zero mean and unit variance.
  # NOTE(review): the [1, 1, 3] constants broadcast over the LAST axis. After
  # the channels_first transpose above the last axis is no longer the channel
  # axis -- confirm this is intended for the channels_first path.
  features -= tf.constant(MEAN_RGB, shape=[1, 1, 3], dtype=features.dtype)
  features /= tf.constant(STDDEV_RGB, shape=[1, 1, 3], dtype=features.dtype)

  training_method = params['training_method']
  use_tpu = params['use_tpu']

  def build_network():
    """Construct the network in the graph."""
    if FLAGS.model_architecture == 'mobilenet_v2':
      network_func = functools.partial(
          mobilenetv2_model.mobilenet_v2,
          expansion_factor=FLAGS.expansion_factor)
    elif FLAGS.model_architecture == 'mobilenet_v1':
      network_func = functools.partial(mobilenetv1_model.mobilenet_v1)
    elif FLAGS.model_architecture == 'resnet':
      prune_first_layer = FLAGS.first_layer_sparsity != 0.
      network_func = functools.partial(
          resnet_model.resnet_v1_,
          resnet_depth=FLAGS.resnet_depth,
          init_method=FLAGS.init_method,
          end_sparsity=FLAGS.end_sparsity,
          prune_first_layer=prune_first_layer)
    else:
      # The original read the misspelt `FLAGS.archiecture` here, which fails
      # with an attribute error instead of reporting the bad value.
      raise ValueError(
          'Unknown architecture %r' % (FLAGS.model_architecture,))

    prune_last_layer = FLAGS.last_layer_sparsity != 0.
    network = network_func(
        num_classes=FLAGS.num_label_classes,
        # TODO remove the pruning_method option.
        pruning_method='threshold',
        width=width,
        prune_last_layer=prune_last_layer,
        data_format=FLAGS.data_format,
        weight_decay=FLAGS.weight_decay)

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    if FLAGS.use_batch_statistics:
      # Force training-mode behaviour regardless of `mode`.
      is_training = True
    return network(inputs=features, is_training=is_training)

  if FLAGS.precision == 'bfloat16':
    with contrib_tpu.bfloat16_scope():
      logits = build_network()
    logits = tf.cast(logits, tf.float32)
  elif FLAGS.precision == 'float32':
    logits = build_network()
  else:
    # Previously fell through and failed later with an unbound `logits`.
    raise ValueError('Unsupported precision %r' % (FLAGS.precision,))

  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        'classes': tf.argmax(logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        export_outputs={
            'classify': tf.estimator.export.PredictOutput(predictions)
        })

  output_dir = params['output_dir']

  # Calculate loss, which includes softmax cross entropy and L2 regularization.
  one_hot_labels = tf.one_hot(labels, FLAGS.num_label_classes)

  # Make sure we reuse the same label smoothing parameter if we're doing
  # scratch / lottery ticket experiments.
  label_smoothing = FLAGS.label_smoothing
  if FLAGS.training_method == 'scratch' and FLAGS.load_mask_dir:
    # NOTE(review): the smoothing value is read from a fixed position (index
    # 15) of the mask directory path; this only works for one directory layout.
    scratch_stripped = FLAGS.load_mask_dir.replace('/scratch', '')
    label_smoothing = float(scratch_stripped.split('/')[15])
    tf.logging.info('LABEL SMOOTHING USED: %.2f', label_smoothing)

  cross_loss = tf.losses.softmax_cross_entropy(
      logits=logits,
      onehot_labels=one_hot_labels,
      label_smoothing=label_smoothing)
  # Add regularization loss term
  reg_loss = tf.losses.get_regularization_loss()
  loss = cross_loss + reg_loss

  host_call = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    host_call, train_op = train_function(training_method, loss, cross_loss,
                                         reg_loss, output_dir, use_tpu)
  else:
    train_op = None

  eval_metrics = None
  if mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(labels, logits, cross_loss, reg_loss):
      """Calculate eval metrics."""
      logging.info('In metric function')
      eval_metrics = {}
      predictions = tf.cast(tf.argmax(logits, axis=1), tf.int32)
      in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
      eval_metrics['top_5_eval_accuracy'] = tf.metrics.mean(in_top_5)
      eval_metrics['cross_loss'] = tf.metrics.mean(cross_loss)
      eval_metrics['reg_loss'] = tf.metrics.mean(reg_loss)
      eval_metrics['eval_accuracy'] = tf.metrics.accuracy(
          labels=labels, predictions=predictions)
      # If evaluating once lets also calculate sparsities.
      if FLAGS.mode == 'eval_once':
        sparsity_summaries = utils.mask_summaries(pruning.get_masks())
        # We call mean on a scalar to create tensor, update_op pairs.
        sparsity_summaries = {
            k: tf.metrics.mean(v) for k, v in sparsity_summaries.items()
        }
        eval_metrics.update(sparsity_summaries)
      return eval_metrics

    # The scalar losses are broadcast to the label shape before being handed
    # to `metric_fn` together with the per-example tensors.
    tensors = [
        labels, logits,
        tf.broadcast_to(cross_loss, tf.shape(labels)),
        tf.broadcast_to(reg_loss, tf.shape(labels))
    ]
    eval_metrics = (metric_fn, tensors)

  if (FLAGS.load_mask_dir and
      FLAGS.training_method not in ('snip', 'baseline')):

    def scaffold_fn():
      """For initialization, passed to the estimator."""
      utils.initialize_parameters_from_ckpt(FLAGS.load_mask_dir,
                                            FLAGS.output_dir, MASK_SUFFIX)
      if FLAGS.initial_value_checkpoint:
        utils.initialize_parameters_from_ckpt(FLAGS.initial_value_checkpoint,
                                              FLAGS.output_dir, PARAM_SUFFIXES)
      return tf.train.Scaffold()
  elif (FLAGS.mask_init_method and
        FLAGS.training_method not in ('snip', 'baseline')):

    def scaffold_fn():
      """For initialization, passed to the estimator."""
      if FLAGS.initial_value_checkpoint:
        utils.initialize_parameters_from_ckpt(FLAGS.initial_value_checkpoint,
                                              FLAGS.output_dir, PARAM_SUFFIXES)
      all_masks = pruning.get_masks()
      assigner = sparse_utils.get_mask_init_fn(all_masks,
                                               FLAGS.mask_init_method,
                                               FLAGS.end_sparsity,
                                               CUSTOM_SPARSITY_MAP)

      def init_fn(scaffold, session):
        """A callable for restoring variable from a checkpoint."""
        del scaffold  # Unused.
        session.run(assigner)

      return tf.train.Scaffold(init_fn=init_fn)
  else:
    assert FLAGS.training_method in ('snip', 'baseline')
    scaffold_fn = None
    tf.logging.info('No mask is set, starting dense.')

  return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                      loss=loss,
                                      train_op=train_op,
                                      host_call=host_call,
                                      eval_metrics=eval_metrics,
                                      scaffold_fn=scaffold_fn)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """Build the `TPUEstimatorSpec` for the BERT model with the CPC head.

  `bert_config`, `use_one_hot_embeddings`, `num_choices`, `add_masking`,
  `init_checkpoint`, `use_tpu`, `learning_rate`, `num_train_steps` and
  `num_warmup_steps` are not parameters; they come from the enclosing scope.

  Args:
    features: Mapping of feature name to tensor. Reads "input_ids",
      "input_mask", "segment_ids", "label_types" and "label_ids"; when
      `add_masking` is set it also reads "mask_indices", "target_token_ids"
      and "target_token_weights".
    labels: Unused.
    mode: Estimator mode key. TRAIN and EVAL get dedicated specs; every other
      mode yields a prediction spec exposing "probabilities".
    params: Unused.

  Returns:
    A `contrib_tpu.TPUEstimatorSpec` for the requested mode.
  """
  tf.logging.info("*** Features ***")
  for feature_name in sorted(features):
    tf.logging.info(" name = %s, shape = %s", feature_name,
                    features[feature_name].shape)

  def flatten_to(key, width):
    """Reshape `features[key]` to [-1, width]."""
    return tf.reshape(features[key], [-1, width])

  input_ids = flatten_to("input_ids", FLAGS.max_seq_length)
  input_mask = flatten_to("input_mask", FLAGS.max_seq_length)
  segment_ids = flatten_to("segment_ids", FLAGS.max_seq_length)
  label_types = features["label_types"]
  label_ids = features["label_ids"]

  training = mode == tf.estimator.ModeKeys.TRAIN

  # Summing the one-hot rows over axis 1 gives the per-slot weights that the
  # accuracy metrics below use.
  is_real_example = tf.reduce_sum(
      tf.one_hot(label_types, FLAGS.k_size * 2), axis=1)

  model = modeling.BertModel(
      config=bert_config,
      is_training=training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  cpc_loss, _, logits, probabilities = bilin_model_builder.create_model(
      model, label_ids, label_types, num_choices, k_size=FLAGS.k_size)

  if not add_masking:
    total_loss = cpc_loss
    # Placeholder so the eval metric function always receives an MLM loss.
    masked_lm_loss = tf.constant([0])
  else:
    mask_rate = FLAGS.mask_rate  # search alternatives?
    max_predictions_per_seq = int(math.ceil(FLAGS.max_seq_length * mask_rate))
    masked_lm_positions = flatten_to("mask_indices", max_predictions_per_seq)
    masked_lm_ids = flatten_to("target_token_ids", max_predictions_per_seq)
    masked_lm_weights = flatten_to("target_token_weights",
                                   max_predictions_per_seq)
    masked_lm_loss, _, _ = bilin_model_builder.get_masked_lm_output(
        bert_config, model.get_sequence_output(), model.get_embedding_table(),
        masked_lm_positions, masked_lm_ids, masked_lm_weights)
    total_loss = cpc_loss + masked_lm_loss

  trainable_vars = tf.trainable_variables()
  restored_names = {}
  scaffold_fn = None
  if init_checkpoint:
    assignment_map, restored_names = (
        modeling.get_assignment_map_from_checkpoint(trainable_vars,
                                                    init_checkpoint))
    if not use_tpu:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
    else:

      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold

  tf.logging.info("**** Trainable Variables ****")
  for var in trainable_vars:
    suffix = ", *INIT_FROM_CKPT*" if var.name in restored_names else ""
    tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, suffix)

  if mode == tf.estimator.ModeKeys.TRAIN:
    train_op = optimization.create_optimizer(total_loss, learning_rate,
                                             num_train_steps, num_warmup_steps,
                                             use_tpu)
    return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                        loss=total_loss,
                                        train_op=train_op,
                                        scaffold_fn=scaffold_fn)

  if mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(cpc_loss, mlm_loss, label_ids, logits, is_real_example):
      """Collect overall and per-slot accuracy plus the mean losses."""
      predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
      metric_dict = {
          "eval_accuracy":
              tf.metrics.accuracy(labels=label_ids,
                                  predictions=predictions,
                                  weights=is_real_example),
          "eval_cpc_loss":
              tf.metrics.mean(values=cpc_loss),
          "eval_mlm_loss":
              tf.metrics.mean(values=mlm_loss),
      }
      # One accuracy metric per column ("acc0", "acc1", ...).
      for slot in range(FLAGS.k_size * 2):
        metric_dict["acc" + str(slot)] = tf.metrics.accuracy(
            labels=label_ids[:, slot],
            predictions=predictions[:, slot],
            weights=is_real_example[:, slot])
      return metric_dict

    metric_inputs = [
        cpc_loss, masked_lm_loss, label_ids, logits, is_real_example
    ]
    return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                        loss=total_loss,
                                        eval_metrics=(metric_fn, metric_inputs),
                                        scaffold_fn=scaffold_fn)

  return contrib_tpu.TPUEstimatorSpec(
      mode=mode,
      predictions={"probabilities": probabilities},
      scaffold_fn=scaffold_fn)
def _model_fn(features, labels, mode, params):
  """Estimator model_fn training `classifier_mnist.Model` with K-FAC.

  Args:
    features: Input batch passed to the model.
    labels: Targets passed to `classifier_mnist.compute_loss`.
    mode: Estimator mode key. TRAIN builds the K-FAC train op; every other
      mode returns a loss-only spec.
    params: Unused.

  Returns:
    A `contrib_tpu.TPUEstimatorSpec`. In TRAIN mode it carries the train op
    and a `host_call` that passes optimizer statistics to `print_tensors`.
  """
  del params

  training_model = classifier_mnist.Model()
  layer_collection = kfac.LayerCollection()

  def loss_fn(minibatch, logits=None, return_error=False):
    """Loss (and optionally error) for `minibatch`, reusing `logits` if given."""
    batch_features, batch_labels = minibatch
    if logits is None:
      # No explicit variable-reuse scope is needed here: Sonnet shares
      # variables as long as the same `training_model` module is called.
      logits = training_model(batch_features)
    return classifier_mnist.compute_loss(logits=logits,
                                         labels=batch_labels,
                                         return_error=return_error)

  logits = training_model(features)
  pre_update_batch_loss, pre_update_batch_error = loss_fn(
      (features, labels), logits=logits, return_error=True)
  global_step = tf.train.get_or_create_global_step()

  if mode != tf.estimator.ModeKeys.TRAIN:
    # mode == tf.estimator.ModeKeys.{EVAL, PREDICT}:
    return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                        loss=pre_update_batch_loss,
                                        eval_metrics=None)

  layer_collection.register_softmax_cross_entropy_loss(logits,
                                                       seed=FLAGS.seed + 1)
  layer_collection.auto_register_layers()

  train_op, kfac_optimizer = make_train_op((features, labels),
                                           pre_update_batch_loss,
                                           layer_collection, loss_fn)

  def as_vector(tensor):
    """Add a leading axis of size one."""
    return tf.expand_dims(tensor, 0)

  tensors_to_print = {
      'learning_rate': as_vector(kfac_optimizer.learning_rate),
      'momentum': as_vector(kfac_optimizer.momentum),
      'damping': as_vector(kfac_optimizer.damping),
      'global_step': as_vector(global_step),
      'loss': as_vector(pre_update_batch_loss),
      'error': as_vector(pre_update_batch_error),
  }
  if FLAGS.adapt_damping:
    tensors_to_print['qmodel_change'] = as_vector(
        kfac_optimizer.qmodel_change)
    tensors_to_print['rho'] = as_vector(kfac_optimizer.rho)

  return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                      loss=pre_update_batch_loss,
                                      train_op=train_op,
                                      host_call=(print_tensors,
                                                 tensors_to_print),
                                      eval_metrics=None)
def estimator_spec_train(self, loss, num_async_replicas=1, use_tpu=False):
  """Constructs `tf.estimator.EstimatorSpec` for TRAIN (training) mode.

  Builds the optimizer step, chains the magnitude-pruning mask update after it
  when a pruning technique is configured, and restores variables from the
  checkpoints named in the hparams.

  Args:
    loss: Loss tensor to minimize.
    num_async_replicas: Forwarded to `self.optimize`.
    use_tpu: Whether to build a `TPUEstimatorSpec` (checkpoint restore then
      happens inside a scaffold function) or a plain `EstimatorSpec`.

  Returns:
    A `contrib_tpu.TPUEstimatorSpec` when `use_tpu` is set, otherwise a
    `tf_estimator.EstimatorSpec`.
  """
  train_op = self.optimize(loss,
                           num_async_replicas=num_async_replicas,
                           use_tpu=use_tpu)

  # `get` returns None for a missing hparam; normalise to "" so the substring
  # test below cannot raise TypeError.
  sparsity_technique = self._hparams.get("sparsity_technique") or ""
  if "pruning" in sparsity_technique:
    if not self._hparams.load_masks_from:
      # If we are loading trained masks, don't add the mask update
      # step to the training process and keep the masks static
      with tf.control_dependencies([train_op]):
        mp_hparams = pruning_hparams(self._hparams, use_tpu,
                                     sparsity_technique == "random_pruning")
        p = magnitude_pruning.Pruning(
            mp_hparams, global_step=tf.train.get_global_step())
        mask_update_op = p.conditional_mask_update_op()
        train_op = mask_update_op
    check_global_sparsity()

  hparams = self._hparams
  needs_restore = bool(hparams.warm_start_from or hparams.load_masks_from)

  def restore_from_checkpoints():
    """Restore variables from whichever checkpoints the hparams name.

    `warm_start_from` wins; otherwise masks are loaded and, if
    `load_weights_from` is also set, the non-mask variables as well.
    """
    if hparams.warm_start_from:
      self.initialize_from_ckpt(hparams.warm_start_from)
    elif hparams.load_masks_from:
      self.initialize_masks_from_ckpt(hparams.load_masks_from)
      weights_ckpt = hparams.get("load_weights_from")
      if weights_ckpt:
        self.initialize_non_masks_from_ckpt(weights_ckpt)

  if use_tpu:
    scaffold_fn = None
    if needs_restore:

      def scaffold_fn():
        restore_from_checkpoints()
        return tf.train.Scaffold()

    # Note: important to call this before remove_summaries()
    if self.hparams.tpu_enable_host_call:
      host_call = t2t_model.create_host_call(self.hparams.model_dir)
    else:
      host_call = None

    t2t_model.remove_summaries()

    return contrib_tpu.TPUEstimatorSpec(tf_estimator.ModeKeys.TRAIN,
                                        loss=loss,
                                        train_op=train_op,
                                        host_call=host_call,
                                        scaffold_fn=scaffold_fn)

  # Non-TPU path. This used to load only the masks and silently ignore
  # `load_weights_from`; it now applies the same restore rules as the TPU
  # scaffold.
  if needs_restore:
    restore_from_checkpoints()

  return tf_estimator.EstimatorSpec(tf_estimator.ModeKeys.TRAIN,
                                    loss=loss,
                                    train_op=train_op)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` for TPUEstimator (ALBERT classification/regression).

  `albert_config`, `num_labels`, `use_one_hot_embeddings`, `task_name`,
  `hub_module`, `init_checkpoint`, `use_tpu`, `learning_rate`,
  `num_train_steps`, `num_warmup_steps` and `optimizer` are not parameters;
  they come from the enclosing scope.

  Args:
    features: Mapping of feature name to tensor. Reads "input_ids",
      "input_mask", "segment_ids", "label_ids" and, when present,
      "is_real_example".
    labels: Unused.
    mode: Estimator mode key. TRAIN and EVAL get dedicated specs; every other
      mode yields a prediction spec.
    params: Unused.

  Returns:
    A `contrib_tpu.TPUEstimatorSpec` for the requested mode.
  """
  tf.logging.info("*** Features ***")
  for name in sorted(features.keys()):
    tf.logging.info(" name = %s, shape = %s", name, features[name].shape)

  input_ids = features["input_ids"]
  input_mask = features["input_mask"]
  segment_ids = features["segment_ids"]
  label_ids = features["label_ids"]

  # Per-example weights for the eval metrics; default to all ones when the
  # input pipeline does not provide them.
  if "is_real_example" in features:
    is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
  else:
    is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  (total_loss, per_example_loss, probabilities, logits, predictions) = \
      create_model(albert_config, is_training, input_ids, input_mask,
                   segment_ids, label_ids, num_labels, use_one_hot_embeddings,
                   task_name, hub_module)

  tvars = tf.trainable_variables()
  initialized_variable_names = {}
  scaffold_fn = None
  if init_checkpoint:
    (assignment_map, initialized_variable_names
    ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
    if use_tpu:

      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

  tf.logging.info("**** Trainable Variables ****")
  for var in tvars:
    init_string = ""
    if var.name in initialized_variable_names:
      init_string = ", *INIT_FROM_CKPT*"
    tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                    init_string)

  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    train_op = optimization.create_optimizer(total_loss, learning_rate,
                                             num_train_steps, num_warmup_steps,
                                             use_tpu, optimizer)
    output_spec = contrib_tpu.TPUEstimatorSpec(mode=mode,
                                               loss=total_loss,
                                               train_op=train_op,
                                               scaffold_fn=scaffold_fn)
  elif mode == tf.estimator.ModeKeys.EVAL:
    if task_name not in ["sts-b", "cola"]:

      def metric_fn(per_example_loss, label_ids, logits, is_real_example):
        """Compute accuracy and mean loss for classification tasks."""
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        accuracy = tf.metrics.accuracy(labels=label_ids,
                                       predictions=predictions,
                                       weights=is_real_example)
        loss = tf.metrics.mean(values=per_example_loss,
                               weights=is_real_example)
        return {
            "eval_accuracy": accuracy,
            "eval_loss": loss,
        }
    elif task_name == "sts-b":

      def metric_fn(per_example_loss, label_ids, logits, is_real_example):
        """Compute Pearson correlations for STS-B."""
        # Display labels and predictions
        concat1 = contrib_metrics.streaming_concat(logits)
        concat2 = contrib_metrics.streaming_concat(label_ids)

        # Compute Pearson correlation
        pearson = contrib_metrics.streaming_pearson_correlation(
            logits, label_ids, weights=is_real_example)

        # Compute MSE
        # mse = tf.metrics.mean(per_example_loss)
        mse = tf.metrics.mean_squared_error(label_ids,
                                            logits,
                                            weights=is_real_example)

        loss = tf.metrics.mean(values=per_example_loss,
                               weights=is_real_example)

        return {
            "pred": concat1,
            "label_ids": concat2,
            "pearson": pearson,
            "MSE": mse,
            "eval_loss": loss,
        }
    elif task_name == "cola":

      def metric_fn(per_example_loss, label_ids, logits, is_real_example):
        """Compute Matthew's correlation, accuracy and mean loss for CoLA."""
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        # https://en.wikipedia.org/wiki/Matthews_correlation_coefficient
        # The tf.metrics confusion-matrix functions take (labels, predictions)
        # in that order. They were previously called positionally with the
        # two swapped, which exchanged FP and FN. MCC is symmetric in FP/FN so
        # its value is the same, but keyword arguments keep each count honest.
        tp, tp_op = tf.metrics.true_positives(labels=label_ids,
                                              predictions=predictions,
                                              weights=is_real_example)
        tn, tn_op = tf.metrics.true_negatives(labels=label_ids,
                                              predictions=predictions,
                                              weights=is_real_example)
        fp, fp_op = tf.metrics.false_positives(labels=label_ids,
                                               predictions=predictions,
                                               weights=is_real_example)
        fn, fn_op = tf.metrics.false_negatives(labels=label_ids,
                                               predictions=predictions,
                                               weights=is_real_example)

        # Compute Matthew's correlation; div_no_nan yields 0 when the
        # denominator is 0.
        mcc = tf.div_no_nan(
            tp * tn - fp * fn,
            tf.pow((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn), 0.5))

        # Compute accuracy
        accuracy = tf.metrics.accuracy(labels=label_ids,
                                       predictions=predictions,
                                       weights=is_real_example)

        loss = tf.metrics.mean(values=per_example_loss,
                               weights=is_real_example)

        return {
            "matthew_corr": (mcc, tf.group(tp_op, tn_op, fp_op, fn_op)),
            "eval_accuracy": accuracy,
            "eval_loss": loss,
        }

    eval_metrics = (metric_fn,
                    [per_example_loss, label_ids, logits, is_real_example])
    output_spec = contrib_tpu.TPUEstimatorSpec(mode=mode,
                                               loss=total_loss,
                                               eval_metrics=eval_metrics,
                                               scaffold_fn=scaffold_fn)
  else:
    output_spec = contrib_tpu.TPUEstimatorSpec(mode=mode,
                                               predictions={
                                                   "probabilities":
                                                       probabilities,
                                                   "predictions": predictions
                                               },
                                               scaffold_fn=scaffold_fn)
  return output_spec
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """Build the `TPUEstimatorSpec` for ALBERT pretraining (TRAIN or EVAL).

  The total loss is the masked-LM loss plus the sentence-order loss.
  `albert_config`, `use_one_hot_embeddings`, `init_checkpoint`, `use_tpu`,
  `learning_rate`, `num_train_steps`, `num_warmup_steps`, `optimizer`,
  `poly_power` and `start_warmup_step` are not parameters; they come from the
  enclosing scope.

  Args:
    features: Mapping of feature name to tensor. Reads "input_ids",
      "input_mask", "segment_ids", "masked_lm_positions", "masked_lm_ids",
      "masked_lm_weights" and "next_sentence_labels".
    labels: Unused.
    mode: Estimator mode key; only TRAIN and EVAL are handled.
    params: Unused.

  Returns:
    A `contrib_tpu.TPUEstimatorSpec` for the requested mode.

  Raises:
    ValueError: If `mode` is neither TRAIN nor EVAL.
  """
  tf.logging.info("*** Features ***")
  for feature_name in sorted(features):
    tf.logging.info(" name = %s, shape = %s", feature_name,
                    features[feature_name].shape)

  input_ids = features["input_ids"]
  input_mask = features["input_mask"]
  segment_ids = features["segment_ids"]
  masked_lm_positions = features["masked_lm_positions"]
  masked_lm_ids = features["masked_lm_ids"]
  masked_lm_weights = features["masked_lm_weights"]
  # Note: We keep this feature name `next_sentence_labels` to be compatible
  # with the original data created by lanzhzh@. However, in the ALBERT case
  # it does represent sentence_order_labels.
  sentence_order_labels = features["next_sentence_labels"]

  training = mode == tf.estimator.ModeKeys.TRAIN

  model = modeling.AlbertModel(
      config=albert_config,
      is_training=training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs = (
      get_masked_lm_output(albert_config, model.get_sequence_output(),
                           model.get_embedding_table(), masked_lm_positions,
                           masked_lm_ids, masked_lm_weights))

  (sentence_order_loss, sentence_order_example_loss,
   sentence_order_log_probs) = get_sentence_order_output(
       albert_config, model.get_pooled_output(), sentence_order_labels)

  total_loss = masked_lm_loss + sentence_order_loss

  trainable_vars = tf.trainable_variables()
  restored_names = {}
  scaffold_fn = None
  if init_checkpoint:
    tf.logging.info("number of hidden group %d to initialize",
                    albert_config.num_hidden_groups)
    group_count = 1
    if FLAGS.init_from_group0:
      group_count = albert_config.num_hidden_groups
      if albert_config.net_structure_type > 0:
        group_count = albert_config.num_hidden_layers

    assignment_map, restored_names = (
        modeling.get_assignment_map_from_checkpoint(trainable_vars,
                                                    init_checkpoint,
                                                    group_count))

    def restore_all_groups():
      """Initialize each group in turn from its own assignment map."""
      for gid in range(group_count):
        tf.logging.info("initialize the %dth layer", gid)
        tf.logging.info(assignment_map[gid])
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map[gid])

    if use_tpu:

      def tpu_scaffold():
        restore_all_groups()
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      restore_all_groups()

  tf.logging.info("**** Trainable Variables ****")
  for var in trainable_vars:
    suffix = ", *INIT_FROM_CKPT*" if var.name in restored_names else ""
    tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, suffix)

  if mode == tf.estimator.ModeKeys.TRAIN:
    train_op = optimization.create_optimizer(total_loss, learning_rate,
                                             num_train_steps, num_warmup_steps,
                                             use_tpu, optimizer, poly_power,
                                             start_warmup_step)
    return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                        loss=total_loss,
                                        train_op=train_op,
                                        scaffold_fn=scaffold_fn)

  if mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(*args):
      """Computes the loss and accuracy of the model."""
      (mlm_example_loss, mlm_log_probs, mlm_ids, mlm_weights, sop_example_loss,
       sop_log_probs, sop_labels) = args[:7]

      # Masked-LM metrics: flatten everything to one row per prediction.
      mlm_log_probs = tf.reshape(mlm_log_probs,
                                 [-1, mlm_log_probs.shape[-1]])
      mlm_predictions = tf.argmax(mlm_log_probs,
                                  axis=-1,
                                  output_type=tf.int32)
      mlm_example_loss = tf.reshape(mlm_example_loss, [-1])
      mlm_ids = tf.reshape(mlm_ids, [-1])
      mlm_weights = tf.reshape(mlm_weights, [-1])
      metrics = {
          "masked_lm_accuracy":
              tf.metrics.accuracy(labels=mlm_ids,
                                  predictions=mlm_predictions,
                                  weights=mlm_weights),
          "masked_lm_loss":
              tf.metrics.mean(values=mlm_example_loss, weights=mlm_weights),
      }

      # Sentence-order metrics.
      sop_log_probs = tf.reshape(sop_log_probs,
                                 [-1, sop_log_probs.shape[-1]])
      sop_predictions = tf.argmax(sop_log_probs,
                                  axis=-1,
                                  output_type=tf.int32)
      sop_labels = tf.reshape(sop_labels, [-1])
      metrics["sentence_order_accuracy"] = tf.metrics.accuracy(
          labels=sop_labels, predictions=sop_predictions)
      metrics["sentence_order_loss"] = tf.metrics.mean(
          values=sop_example_loss)
      return metrics

    metric_values = [
        masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
        masked_lm_weights, sentence_order_example_loss,
        sentence_order_log_probs, sentence_order_labels
    ]
    return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                        loss=total_loss,
                                        eval_metrics=(metric_fn,
                                                      metric_values),
                                        scaffold_fn=scaffold_fn)

  raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))
def resnet_model_fn_w_pruning(features, labels, mode, params):
  """The model_fn for ResNet-50 with pruning.

  Args:
    features: A float32 batch of images, or a dict holding it under 'feature'.
    labels: A int32 batch of labels.
    mode: Specifies whether training or evaluation.
    params: Dictionary of parameters passed to the model. Reads
      'pruning_method', 'use_tpu', 'log_alpha_threshold' and (outside PREDICT)
      'output_dir'.

  Returns:
    A `tf.estimator.EstimatorSpec` in PREDICT mode, otherwise a
    `TPUEstimatorSpec` for the model.

  Raises:
    ValueError: If `FLAGS.precision` is unsupported, or if the 'scratch'
      pruning method is selected without `FLAGS.load_mask_dir`.
  """
  width = 1. if FLAGS.width <= 0 else FLAGS.width
  if isinstance(features, dict):
    features = features['feature']

  if FLAGS.data_format == 'channels_first':
    assert not FLAGS.transpose_input  # channels_first only for GPU
    features = tf.transpose(features, [0, 3, 1, 2])

  if FLAGS.transpose_input and mode != tf.estimator.ModeKeys.PREDICT:
    features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

  # Normalize the image to zero mean and unit variance.
  features -= tf.constant(MEAN_RGB, shape=[1, 1, 3], dtype=features.dtype)
  features /= tf.constant(STDDEV_RGB, shape=[1, 1, 3], dtype=features.dtype)

  pruning_method = params['pruning_method']
  use_tpu = params['use_tpu']
  log_alpha_threshold = params['log_alpha_threshold']

  def build_network():
    """Construct the network in the graph."""
    model_pruning_method = pruning_method
    if pruning_method == 'scratch':
      model_pruning_method = 'threshold'

    network = resnet_model.resnet_v1_(
        resnet_depth=FLAGS.resnet_depth,
        num_classes=FLAGS.num_label_classes,
        # we need to construct the model with the pruning masks, but they won't
        # be updated if we're doing scratch training
        pruning_method=model_pruning_method,
        init_method=FLAGS.init_method,
        width=width,
        prune_first_layer=FLAGS.prune_first_layer,
        prune_last_layer=FLAGS.prune_last_layer,
        data_format=FLAGS.data_format,
        end_sparsity=FLAGS.end_sparsity,
        clip_log_alpha=FLAGS.clip_log_alpha,
        log_alpha_threshold=log_alpha_threshold,
        weight_decay=FLAGS.weight_decay)
    return network(inputs=features,
                   is_training=(mode == tf.estimator.ModeKeys.TRAIN))

  if FLAGS.precision == 'bfloat16':
    with contrib_tpu.bfloat16_scope():
      logits = build_network()
    logits = tf.cast(logits, tf.float32)
  elif FLAGS.precision == 'float32':
    logits = build_network()
  else:
    # Previously fell through and failed later with an unbound `logits`.
    raise ValueError('Unsupported precision %r' % (FLAGS.precision,))

  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        'classes': tf.argmax(logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        export_outputs={
            'classify': tf.estimator.export.PredictOutput(predictions)
        })

  output_dir = params['output_dir']  # pylint: disable=unused-variable

  # Calculate loss, which includes softmax cross entropy and L2 regularization.
  one_hot_labels = tf.one_hot(labels, FLAGS.num_label_classes)

  # Make sure we reuse the same label smoothing parameter if we're doing
  # scratch / lottery ticket experiments.
  label_smoothing = FLAGS.label_smoothing
  if FLAGS.pruning_method == 'scratch':
    # Validate before parsing the path: without a mask directory the parse
    # below used to fail with an unrelated error before the intended
    # ValueError was reached.
    if not FLAGS.load_mask_dir:
      raise ValueError('Must supply a mask directory to use scratch method')
    # NOTE(review): the smoothing value is read from a fixed position (index
    # 15) of the mask directory path; this only works for one directory layout.
    label_smoothing = float(FLAGS.load_mask_dir.split('/')[15])

  loss = tf.losses.softmax_cross_entropy(logits=logits,
                                         onehot_labels=one_hot_labels,
                                         label_smoothing=label_smoothing)
  # Add regularization loss term
  loss += tf.losses.get_regularization_loss()

  if pruning_method == 'variational_dropout':
    reg_loss = utils.variational_dropout_dkl_loss(
        reg_scalar=FLAGS.reg_scalar,
        start_reg_ramp_up=FLAGS.sparsity_begin_step,
        end_reg_ramp_up=FLAGS.sparsity_end_step,
        warm_up=FLAGS.is_warm_up,
        use_tpu=use_tpu)
    loss += reg_loss
    tf.losses.add_loss(reg_loss, loss_collection=tf.GraphKeys.LOSSES)
  elif pruning_method == 'l0_regularization':
    reg_loss = utils.l0_regularization_loss(
        reg_scalar=FLAGS.reg_scalar,
        start_reg_ramp_up=FLAGS.sparsity_begin_step,
        end_reg_ramp_up=FLAGS.sparsity_end_step,
        warm_up=FLAGS.is_warm_up,
        use_tpu=use_tpu)
    loss += reg_loss
    tf.losses.add_loss(reg_loss, loss_collection=tf.GraphKeys.LOSSES)

  host_call = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    host_call, train_op = train_function(pruning_method, loss, output_dir,
                                         use_tpu)
  else:
    train_op = None

  eval_metrics = None
  if mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(labels, logits):
      """Calculate eval metrics."""
      logging.info('In metric function')
      eval_metrics = {}
      predictions = tf.cast(tf.argmax(logits, axis=1), tf.int32)
      in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
      eval_metrics['top_5_eval_accuracy'] = tf.metrics.mean(in_top_5)
      eval_metrics['eval_accuracy'] = tf.metrics.accuracy(
          labels=labels, predictions=predictions)
      return eval_metrics

    def vd_metric_fn(labels, logits, global_sparsity):
      """Eval metrics plus the mean global sparsity."""
      eval_metrics = metric_fn(labels, logits)
      eval_metrics['global_sparsity'] = tf.metrics.mean(global_sparsity)
      return eval_metrics

    tensors = [labels, logits]
    metric_function = metric_fn

    if FLAGS.pruning_method == 'variational_dropout':
      batch_size = labels.shape[0]
      ones = tf.ones([batch_size, 1])
      mask_metrics = utils.add_vd_pruning_summaries(
          threshold=FLAGS.log_alpha_threshold)
      # Tile the sparsity value to one row per example before passing it to
      # the metric function.
      tensors.append(mask_metrics['global_sparsity'] * ones)
      metric_function = vd_metric_fn

    eval_metrics = (metric_function, tensors)

  # define a custom scaffold function to enable initializing the mask from an
  # already trained checkpoint.
  def training_already_started():
    """True when `FLAGS.output_dir` already contains a checkpoint."""
    model_dir = FLAGS.output_dir
    return bool(model_dir) and tf.train.latest_checkpoint(
        model_dir) is not None

  def initialize_mask_from_ckpt(ckpt_path):
    """Load mask from an existing checkpoint."""
    if training_already_started():
      tf.logging.info(
          'Training already started on this model, not loading masks from '
          'previously trained model')
      return

    reader = tf.train.NewCheckpointReader(ckpt_path)
    # A set keeps the per-variable membership test below O(1).
    mask_names = {
        x for x in reader.get_variable_to_shape_map() if x.endswith('mask')
    }

    variable_map = {}
    for var in tf.global_variables():
      var_name = var.name.split(':')[0]
      if var_name in mask_names:
        tf.logging.info('Loading mask variable from checkpoint: %s', var_name)
        variable_map[var_name] = var
      elif 'mask' in var_name:
        tf.logging.info(
            'Cannot find mask variable in checkpoint, skipping: %s', var_name)
    tf.train.init_from_checkpoint(ckpt_path, variable_map)

  def initialize_parameters_from_ckpt(ckpt_path):
    """Load parameters from an existing checkpoint."""
    if training_already_started():
      tf.logging.info(
          'Training already started on this model, not loading parameters '
          'from previously trained model')
      return

    reader = tf.train.NewCheckpointReader(ckpt_path)
    param_names = {
        x for x in reader.get_variable_to_shape_map()
        if not x.endswith('mask')
    }

    variable_map = {}
    for var in tf.global_variables():
      var_name = var.name.split(':')[0]
      if var_name in param_names:
        tf.logging.info('Loading parameter variable from checkpoint: %s',
                        var_name)
        variable_map[var_name] = var
      elif 'mask' not in var_name:
        tf.logging.info(
            'Cannot find parameter variable in checkpoint, skipping: %s',
            var_name)
    tf.train.init_from_checkpoint(ckpt_path, variable_map)

  if FLAGS.pruning_method == 'scratch':
    if FLAGS.load_mask_dir:

      def scaffold_fn():
        initialize_mask_from_ckpt(FLAGS.load_mask_dir)
        if FLAGS.initial_value_checkpoint:
          initialize_parameters_from_ckpt(FLAGS.initial_value_checkpoint)
        return tf.train.Scaffold()
    else:
      raise ValueError('Must supply a mask directory to use scratch method')
  else:
    scaffold_fn = None

  return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                      loss=loss,
                                      train_op=train_op,
                                      host_call=host_call,
                                      eval_metrics=eval_metrics,
                                      scaffold_fn=scaffold_fn)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """Builds the TPUEstimator spec for a multiple-choice BERT model.

  Reads `num_choices + 1` tokenized sequences per example from `features`
  (keys `input_ids<i>`, `input_mask<i>`, `segment_ids<i>`), stacks them along
  axis 1 and flattens them to `[-1, seq_length]`, runs the result through
  BERT and hands the model to the `model_builder` head selected by
  `FLAGS.bilin_preproc`.

  Args:
    features: Mapping from feature name to tensor.
    labels: Unused; the targets are read from `features["labels"]`.
    mode: A `tf_estimator.ModeKeys` value.
    params: Unused.

  Returns:
    A `contrib_tpu.TPUEstimatorSpec` for the requested mode.
  """
  tf.logging.info("*** Features ***")
  for feature_name in sorted(features.keys()):
    tf.logging.info(" name = %s, shape = %s" %
                    (feature_name, features[feature_name].shape))

  num_choices = 2
  sequences_per_example = num_choices + 1

  def _collect(prefix):
    """Returns the tensors stored under `prefix0`, `prefix1`, ..."""
    return [
        features[prefix + str(idx)] for idx in range(sequences_per_example)
    ]

  ids_per_sequence = _collect("input_ids")
  mask_per_sequence = _collect("input_mask")
  segments_per_sequence = _collect("segment_ids")

  # Only column 4 of the label matrix is used as the target.
  label_ids = features["labels"][:, 4]

  seq_length = ids_per_sequence[0].shape[-1]

  def _flatten(per_sequence):
    """Stacks the sequences on axis 1 and folds them into the batch axis."""
    return tf.reshape(tf.stack(per_sequence, axis=1), [-1, seq_length])

  input_ids = _flatten(ids_per_sequence)
  input_mask = _flatten(mask_per_sequence)
  segment_ids = _flatten(segments_per_sequence)

  model = modeling.BertModel(
      config=bert_config,
      is_training=(mode == tf_estimator.ModeKeys.TRAIN),
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  if FLAGS.bilin_preproc:
    build_head = model_builder.create_model_bilin
  else:
    build_head = model_builder.create_model
  total_loss, per_example_loss, logits, probabilities = build_head(
      model, label_ids, num_choices)

  tvars = tf.trainable_variables()
  initialized_variable_names = {}
  scaffold_fn = None
  if init_checkpoint:
    assignment_map, initialized_variable_names = (
        modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint))
    if not use_tpu:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
    else:
      # On TPU the checkpoint restore is deferred to the scaffold function.
      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold

  tf.logging.info("**** Trainable Variables ****")
  for var in tvars:
    suffix = ""
    if var.name in initialized_variable_names:
      suffix = ", *INIT_FROM_CKPT*"
    tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, suffix)

  if mode == tf_estimator.ModeKeys.TRAIN:
    train_op = optimization.create_optimizer(
        total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        scaffold_fn=scaffold_fn)

  if mode == tf_estimator.ModeKeys.EVAL:

    def metric_fn(per_example_loss, label_ids, logits):
      """Returns accuracy and mean loss over the evaluation set."""
      predicted = tf.argmax(logits, axis=-1, output_type=tf.int32)
      return {
          "eval_accuracy":
              tf.metrics.accuracy(labels=label_ids, predictions=predicted),
          "eval_loss":
              tf.metrics.mean(values=per_example_loss),
      }

    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        eval_metrics=(metric_fn, [per_example_loss, label_ids, logits]),
        scaffold_fn=scaffold_fn)

  # Any remaining mode only exposes the class probabilities.
  return contrib_tpu.TPUEstimatorSpec(
      mode=mode,
      predictions={"probabilities": probabilities},
      scaffold_fn=scaffold_fn)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """Builds the TPUEstimator spec for a BERT classification model.

  Args:
    features: Mapping holding `input_ids`, `input_mask`, `segment_ids` and
      `label_ids` tensors.
    labels: Unused; the targets are read from `features["label_ids"]`.
    mode: A `tf.estimator.ModeKeys` value.
    params: Unused.

  Returns:
    A `contrib_tpu.TPUEstimatorSpec` for the requested mode.
  """
  tf.logging.info("*** Features ***")
  for key in sorted(features.keys()):
    tf.logging.info(" name = %s, shape = %s" % (key, features[key].shape))

  total_loss, per_example_loss, logits, probabilities = create_model(
      bert_config,
      mode == tf.estimator.ModeKeys.TRAIN,
      features["input_ids"],
      features["input_mask"],
      features["segment_ids"],
      features["label_ids"],
      num_labels,
      use_one_hot_embeddings)
  label_ids = features["label_ids"]

  tvars = tf.trainable_variables()
  scaffold_fn = None
  if not init_checkpoint:
    # Nothing is restored, so no variable is reported as initialized.
    initialized_variable_names = []
  else:
    assignment_map, initialized_variable_names = (
        modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint))
    if use_tpu:
      # On TPU the checkpoint restore is deferred to the scaffold function.
      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

  tf.logging.info("**** Trainable Variables ****")
  for variable in tvars:
    restored = variable.name in initialized_variable_names
    tf.logging.info(" name = %s, shape = %s%s", variable.name, variable.shape,
                    ", *INIT_FROM_CKPT*" if restored else "")

  if mode == tf.estimator.ModeKeys.TRAIN:
    train_op = optimization.create_optimizer(
        total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        scaffold_fn=scaffold_fn)

  if mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(per_example_loss, label_ids, logits):
      """Returns accuracy and mean loss over the evaluation set."""
      predicted = tf.argmax(logits, axis=-1, output_type=tf.int32)
      return {
          "eval_accuracy": tf.metrics.accuracy(label_ids, predicted),
          "eval_loss": tf.metrics.mean(per_example_loss),
      }

    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        eval_metrics=(metric_fn, [per_example_loss, label_ids, logits]),
        scaffold_fn=scaffold_fn)

  if mode == tf.estimator.ModeKeys.PREDICT:
    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        predictions={"probabilities": probabilities},
        scaffold_fn=scaffold_fn)

  # Fallback for any other mode: the probabilities tensor is passed unwrapped.
  return contrib_tpu.TPUEstimatorSpec(
      mode=mode, predictions=probabilities, scaffold_fn=scaffold_fn)
def inception_model_fn(features, labels, mode, params):
  """Inception v2 model using Estimator API.

  Args:
    features: Input tensor, or a dict holding it under the key 'feature'.
    labels: Class-index labels; they are one-hot encoded with `tf.one_hot`
      before the loss is computed.
    mode: A `tf.estimator.ModeKeys` value.
    params: Dict of parameters; `params['input_perm']` is forwarded to
      `tensor_transform_fn`.

  Returns:
    A `tf.estimator.EstimatorSpec` in PREDICT mode, otherwise a
    `contrib_tpu.TPUEstimatorSpec`.

  Raises:
    ValueError: If training is requested with a `FLAGS.optimizer` other than
      'sgd', 'momentum' or 'RMS'.
  """
  num_classes = FLAGS.num_classes
  is_training = (mode == tf.estimator.ModeKeys.TRAIN)
  is_eval = (mode == tf.estimator.ModeKeys.EVAL)

  if isinstance(features, dict):
    features = features['feature']

  features = tensor_transform_fn(features, params['input_perm'])

  scope_kwargs = {
      'batch_norm_decay': BATCH_NORM_DECAY,
      'batch_norm_epsilon': BATCH_NORM_EPSILON,
  }
  if FLAGS.clear_update_collections:
    # updates_collections must be set to None in order to use fused batchnorm
    scope_kwargs['updates_collections'] = None

  with arg_scope(inception.inception_v2_arg_scope(**scope_kwargs)):
    logits, _ = inception.inception_v2(
        features,
        num_classes,
        is_training=is_training,
        replace_separable_convolution=True)

  predictions = {
      'classes': tf.argmax(input=logits, axis=1),
      'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
  }

  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        export_outputs={
            'classify': tf.estimator.export.PredictOutput(predictions)
        })

  if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and (
      not FLAGS.use_tpu):
    # Chain the prints so predictions are emitted before the labels.
    with tf.control_dependencies([
        tf.Print(predictions['classes'], [predictions['classes']],
                 summarize=FLAGS.eval_batch_size,
                 message='prediction: ')
    ]):
      labels = tf.Print(labels, [labels],
                        summarize=FLAGS.eval_batch_size,
                        message='label: ')

  one_hot_labels = tf.one_hot(labels, FLAGS.num_classes, dtype=tf.int32)

  # The loss is registered in the losses collection and read back below
  # together with the regularization losses.
  tf.losses.softmax_cross_entropy(
      onehot_labels=one_hot_labels,
      logits=logits,
      weights=1.0,
      label_smoothing=0.1)
  loss = tf.losses.get_total_loss(add_regularization_losses=True)

  initial_learning_rate = FLAGS.learning_rate * FLAGS.train_batch_size / 256
  if FLAGS.use_learning_rate_warmup:
    # Adjust initial learning rate to match final warmup rate
    warmup_decay = FLAGS.learning_rate_decay**(
        (FLAGS.warmup_epochs + FLAGS.cold_epochs) /
        FLAGS.learning_rate_decay_epochs)
    adj_initial_learning_rate = initial_learning_rate * warmup_decay

  final_learning_rate = 0.0001 * initial_learning_rate

  host_call = None
  train_op = None
  if is_training:
    batches_per_epoch = _NUM_TRAIN_IMAGES / FLAGS.train_batch_size
    global_step = tf.train.get_or_create_global_step()
    current_epoch = tf.cast(
        (tf.cast(global_step, tf.float32) / batches_per_epoch), tf.int32)

    learning_rate = tf.train.exponential_decay(
        learning_rate=initial_learning_rate,
        global_step=global_step,
        decay_steps=int(FLAGS.learning_rate_decay_epochs * batches_per_epoch),
        decay_rate=FLAGS.learning_rate_decay,
        staircase=True)

    if FLAGS.use_learning_rate_warmup:
      wlr = 0.1 * adj_initial_learning_rate
      wlr_height = tf.cast(
          0.9 * adj_initial_learning_rate /
          (FLAGS.warmup_epochs + FLAGS.learning_rate_decay_epochs - 1),
          tf.float32)
      epoch_offset = tf.cast(FLAGS.cold_epochs - 1, tf.int32)
      exp_decay_start = (FLAGS.warmup_epochs + FLAGS.cold_epochs +
                         FLAGS.learning_rate_decay_epochs)
      lin_inc_lr = tf.add(
          wlr,
          tf.multiply(
              tf.cast(tf.subtract(current_epoch, epoch_offset), tf.float32),
              wlr_height))
      # Before `cold_epochs`: constant `wlr`; then the linear ramp; from
      # `exp_decay_start` on: the exponential-decay schedule.
      learning_rate = tf.where(
          tf.greater_equal(current_epoch, FLAGS.cold_epochs),
          (tf.where(tf.greater_equal(current_epoch, exp_decay_start),
                    learning_rate, lin_inc_lr)), wlr)

    # Set a minimum boundary for the learning rate.
    learning_rate = tf.maximum(
        learning_rate, final_learning_rate, name='learning_rate')

    if FLAGS.optimizer == 'sgd':
      tf.logging.info('Using SGD optimizer')
      optimizer = tf.train.GradientDescentOptimizer(
          learning_rate=learning_rate)
    elif FLAGS.optimizer == 'momentum':
      tf.logging.info('Using Momentum optimizer')
      optimizer = tf.train.MomentumOptimizer(
          learning_rate=learning_rate, momentum=0.9)
    elif FLAGS.optimizer == 'RMS':
      tf.logging.info('Using RMS optimizer')
      optimizer = tf.train.RMSPropOptimizer(
          learning_rate,
          RMSPROP_DECAY,
          momentum=RMSPROP_MOMENTUM,
          epsilon=RMSPROP_EPSILON)
    else:
      # The message needs a %s placeholder; without one the logging call
      # cannot render its argument and the optimizer name is lost.
      tf.logging.fatal('Unknown optimizer: %s', FLAGS.optimizer)
      # Fail explicitly in case the logging backend does not abort on fatal;
      # otherwise `optimizer` would be unbound below.
      raise ValueError('Unknown optimizer: %s' % FLAGS.optimizer)

    if FLAGS.use_tpu:
      optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

    # Run the ops in the UPDATE_OPS collection before each training step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      train_op = optimizer.minimize(loss, global_step=global_step)
    if FLAGS.moving_average:
      ema = tf.train.ExponentialMovingAverage(
          decay=MOVING_AVERAGE_DECAY, num_updates=global_step)
      variables_to_average = (
          tf.trainable_variables() + tf.moving_average_variables())
      with tf.control_dependencies([train_op
                                   ]), tf.name_scope('moving_average'):
        train_op = ema.apply(variables_to_average)

    # To log the loss, current learning rate, and epoch for Tensorboard, the
    # summary op needs to be run on the host CPU via host_call. host_call
    # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
    # dimension. These Tensors are implicitly concatenated to
    # [params['batch_size']].
    gs_t = tf.reshape(global_step, [1])
    loss_t = tf.reshape(loss, [1])
    lr_t = tf.reshape(learning_rate, [1])
    ce_t = tf.reshape(current_epoch, [1])

    def host_call_fn(gs, loss, lr, ce):
      """Training host call. Creates scalar summaries for training metrics.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the
      model to the `metric_fn`, provide as part of the `host_call`. See
      https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the second
      element in the tuple passed to `host_call`.

      Args:
        gs: `Tensor` with shape `[batch]` for the global_step.
        loss: `Tensor` with shape `[batch]` for the training loss.
        lr: `Tensor` with shape `[batch]` for the learning_rate.
        ce: `Tensor` with shape `[batch]` for the current_epoch.

      Returns:
        List of summary ops to run on the CPU host.
      """
      gs = gs[0]
      with summary.create_file_writer(FLAGS.model_dir).as_default():
        with summary.always_record_summaries():
          summary.scalar('loss', tf.reduce_mean(loss), step=gs)
          summary.scalar('learning_rate', tf.reduce_mean(lr), step=gs)
          summary.scalar('current_epoch', tf.reduce_mean(ce), step=gs)

          return summary.all_summary_ops()

    host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

  eval_metrics = None
  if is_eval:

    def metric_fn(labels, logits):
      """Evaluation metric function. Evaluates accuracy.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the
      model to the `metric_fn`, provide as part of the `eval_metrics`. See
      https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the second
      element in the tuple passed to `eval_metrics`.

      Args:
        labels: `Tensor` with shape `[batch, ]`.
        logits: `Tensor` with shape `[batch, num_classes]`.

      Returns:
        A dict of the metrics to return from evaluation.
      """
      predictions = tf.argmax(logits, axis=1)
      top_1_accuracy = tf.metrics.accuracy(labels, predictions)
      in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
      top_5_accuracy = tf.metrics.mean(in_top_5)

      return {
          'accuracy': top_1_accuracy,
          'accuracy@5': top_5_accuracy,
      }

    eval_metrics = (metric_fn, [labels, logits])

  return contrib_tpu.TPUEstimatorSpec(
      mode=mode,
      loss=loss,
      train_op=train_op,
      host_call=host_call,
      eval_metrics=eval_metrics)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` for TPUEstimator (BERT pretraining).

  Builds the masked-LM head and, when `FLAGS.use_next_sentence_prediction` is
  set, the next-sentence head on top of a `modeling.BertModel`.

  Args:
    features: Mapping holding `input_ids`, `input_mask`, `segment_ids`,
      `masked_lm_positions`, `masked_lm_ids`, `masked_lm_weights` and
      `next_sentence_labels` tensors.
    labels: Unused; all targets are read from `features`.
    mode: A `tf.estimator.ModeKeys` value; only TRAIN and EVAL are supported.
    params: Unused.

  Returns:
    A TPUEstimatorSpec for the requested mode.

  Raises:
    ValueError: If `mode` is neither TRAIN nor EVAL.
  """
  tf.logging.info("*** Features ***")
  for name in sorted(features.keys()):
    tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))

  input_ids = features["input_ids"]
  input_mask = features["input_mask"]
  segment_ids = features["segment_ids"]
  masked_lm_positions = features["masked_lm_positions"]
  masked_lm_ids = features["masked_lm_ids"]
  masked_lm_weights = features["masked_lm_weights"]
  next_sentence_labels = features["next_sentence_labels"]

  use_nsp = FLAGS.use_next_sentence_prediction
  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  (masked_lm_loss, masked_lm_example_loss,
   masked_lm_log_probs) = get_masked_lm_output(
       bert_config, model.get_sequence_output(), model.get_embedding_table(),
       masked_lm_positions, masked_lm_ids, masked_lm_weights)

  if use_nsp:
    sample_weights = None
    if FLAGS.no_nsp_while_masking:
      # NOTE(review): this test is true whenever the summed weights are
      # non-negative - confirm that `>= 0.0` is the intended condition.
      sample_weights = tf.cast(
          tf.math.greater_equal(
              tf.reduce_sum(masked_lm_weights, axis=1), 0.0), tf.float32)
    (next_sentence_loss, next_sentence_example_loss,
     next_sentence_log_probs) = get_next_sentence_output(
         bert_config, model.get_pooled_output(), next_sentence_labels,
         sample_weights)
    # Compute total weighted loss:
    # if mlm_loss_weight=1, this amounts to summing up the losses.
    total_loss = (bert_config.mlm_loss_weight * masked_lm_loss +
                  next_sentence_loss) / (1 + bert_config.mlm_loss_weight) * 2

    next_sentence_log_probs = tf.reshape(
        next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
    next_sentence_predictions = tf.argmax(
        next_sentence_log_probs, axis=-1, output_type=tf.int32)
    next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
    # NOTE(review): tf.metrics.accuracy returns a (value, update_op) pair;
    # the host_call inputs built from it in TRAIN mode carry both - confirm
    # this is intended.
    next_sentence_accuracy = tf.metrics.accuracy(
        labels=next_sentence_labels, predictions=next_sentence_predictions)
  else:
    total_loss = masked_lm_loss

  masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
                                   [-1, masked_lm_log_probs.shape[-1]])
  masked_lm_predictions = tf.argmax(
      masked_lm_log_probs, axis=-1, output_type=tf.int32)
  masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
  masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
  masked_lm_accuracy = tf.metrics.accuracy(
      labels=masked_lm_ids,
      predictions=masked_lm_predictions,
      weights=masked_lm_weights)

  tvars = tf.trainable_variables()

  initialized_variable_names = {}
  scaffold_fn = None
  if init_checkpoint:
    (assignment_map, initialized_variable_names
    ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
    tf.logging.info("**** Assignment map **** %s" % assignment_map)
    for x in assignment_map:
      tf.logging.info(x)
    if use_tpu:
      # On TPU the checkpoint restore is deferred to the scaffold function.
      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

  tf.logging.info("**** Trainable Variables ****")
  for var in tvars:
    init_string = ""
    if var.name in initialized_variable_names:
      init_string = ", *INIT_FROM_CKPT*"
    tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                    init_string)

  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    train_op = optimization.create_optimizer(
        total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

    # Host function for saving summaries.
    def _host_fn(**kwargs):
      """Writes one scalar summary per keyword tensor under `<output_dir>/train`."""
      global_step = kwargs.pop("global_step")[0]
      with tf.compat.v2.summary.create_file_writer(
          os.path.join(FLAGS.output_dir, "train")).as_default():
        with tf.compat.v2.summary.record_summaries_every_n_global_steps(
            FLAGS.steps_per_summary, global_step):
          for name, tensor in kwargs.items():
            tf.compat.v2.summary.scalar(
                name, tf.reduce_mean(tensor), step=global_step)
          return tf.summary.all_v2_summary_ops()

    global_step = tf.train.get_or_create_global_step()
    # Every host_call input gets a leading axis of size 1.
    if use_nsp:
      host_inputs = {
          "global_step":
              tf.expand_dims(global_step, 0),
          "loss/mlm_loss":
              tf.expand_dims(masked_lm_loss, 0),
          "loss/cls_loss":
              tf.expand_dims(next_sentence_loss, 0),
          "loss/total_loss":
              tf.expand_dims(total_loss, 0),
          "accuracy/mlm_accuracy":
              tf.expand_dims(masked_lm_accuracy, 0),
          "accuracy/cls_accuracy":
              tf.expand_dims(next_sentence_accuracy, 0),
      }
    else:
      host_inputs = {
          "global_step": tf.expand_dims(global_step, 0),
          "loss/mlm_loss": tf.expand_dims(masked_lm_loss, 0),
          "loss/total_loss": tf.expand_dims(total_loss, 0),
          "accuracy/mlm_accuracy": tf.expand_dims(masked_lm_accuracy, 0),
      }

    host_call = (_host_fn, host_inputs)

    output_spec = tf.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        host_call=host_call,
        scaffold_fn=scaffold_fn)
  elif mode == tf.estimator.ModeKeys.EVAL:

    def _masked_lm_metrics(example_loss, log_probs, ids, weights):
      """Returns weighted accuracy and mean loss of the masked-LM head."""
      log_probs = tf.reshape(log_probs, [-1, log_probs.shape[-1]])
      predicted = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
      example_loss = tf.reshape(example_loss, [-1])
      ids = tf.reshape(ids, [-1])
      weights = tf.reshape(weights, [-1])
      accuracy = tf.metrics.accuracy(
          labels=ids, predictions=predicted, weights=weights)
      mean_loss = tf.metrics.mean(values=example_loss, weights=weights)
      return {
          "masked_lm_accuracy": accuracy,
          "masked_lm_loss": mean_loss,
      }

    def _next_sentence_metrics(example_loss, log_probs, sentence_labels):
      """Returns accuracy and mean loss of the next-sentence head."""
      log_probs = tf.reshape(log_probs, [-1, log_probs.shape[-1]])
      predicted = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
      sentence_labels = tf.reshape(sentence_labels, [-1])
      accuracy = tf.metrics.accuracy(
          labels=sentence_labels, predictions=predicted)
      mean_loss = tf.metrics.mean(values=example_loss)
      return {
          "next_sentence_accuracy": accuracy,
          "next_sentence_loss": mean_loss,
      }

    # The next-sentence tensors only exist when that head was built, so the
    # metric function and its inputs are chosen to match. The function's
    # arity is kept equal to the length of the tensor list.
    if use_nsp:

      def metric_fn(masked_lm_example_loss, masked_lm_log_probs,
                    masked_lm_ids, masked_lm_weights,
                    next_sentence_example_loss, next_sentence_log_probs,
                    next_sentence_labels):
        """Computes the loss and accuracy of the model."""
        metrics = _masked_lm_metrics(masked_lm_example_loss,
                                     masked_lm_log_probs, masked_lm_ids,
                                     masked_lm_weights)
        metrics.update(
            _next_sentence_metrics(next_sentence_example_loss,
                                   next_sentence_log_probs,
                                   next_sentence_labels))
        return metrics

      metric_inputs = [
          masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
          masked_lm_weights, next_sentence_example_loss,
          next_sentence_log_probs, next_sentence_labels
      ]
    else:

      def metric_fn(masked_lm_example_loss, masked_lm_log_probs,
                    masked_lm_ids, masked_lm_weights):
        """Computes the masked-LM loss and accuracy of the model."""
        return _masked_lm_metrics(masked_lm_example_loss, masked_lm_log_probs,
                                  masked_lm_ids, masked_lm_weights)

      metric_inputs = [
          masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
          masked_lm_weights
      ]

    eval_metrics = (metric_fn, metric_inputs)
    output_spec = contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        eval_metrics=eval_metrics,
        scaffold_fn=scaffold_fn)
  else:
    raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))

  return output_spec
def get_estimator_spec(hparams, mode, features, labels, frame_logits,
                       onset_logits, offset_logits, velocity_values,
                       offset_network=True):
  """Assembles the estimator spec for the requested mode.

  Losses are built for TRAIN and EVAL, thresholded predictions and metric
  values for EVAL and PREDICT.

  Args:
    hparams: Hyperparameter object (thresholds, optimizer settings, drum
      options).
    mode: A `tf.estimator.ModeKeys` value.
    features: Object exposing `length` and `sequence_id`.
    labels: Object exposing `onsets`, `offsets`, `velocities`, `labels` and
      `note_sequence`.
    frame_logits: Logits of the frame head.
    onset_logits: Logits of the onset head.
    offset_logits: Logits of the offset head.
    velocity_values: Output of the velocity head.
    offset_network: Whether the offset head is used; when False the offset
      loss is skipped and offset probabilities are all zeros.

  Returns:
    A `contrib_tpu.TPUEstimatorSpec` for TRAIN, a
    `tf.estimator.EstimatorSpec` for EVAL and PREDICT.

  Raises:
    ValueError: If `mode` is not TRAIN, EVAL or PREDICT.
  """
  mode_keys = tf.estimator.ModeKeys

  def _masked_xent(targets, logits):
    """Sigmoid cross-entropy on the MIDI pitches, masked by sequence length."""
    return tf.losses.sigmoid_cross_entropy(
        targets[:, :, :constants.MIDI_PITCHES],
        logits[:, :, :constants.MIDI_PITCHES],
        weights=tf.expand_dims(
            tf.sequence_mask(features.length, maxlen=tf.shape(targets)[1]),
            axis=2))

  loss_metrics = {}
  loss = None
  if mode in (mode_keys.TRAIN, mode_keys.EVAL):
    loss_metrics['onset'] = _masked_xent(labels.onsets, onset_logits)

    if offset_network and not hparams.drums_only:
      loss_metrics['offset'] = _masked_xent(labels.offsets, offset_logits)

    # Velocity error only counts where an onset is labelled.
    loss_metrics['velocity'] = tf.losses.mean_squared_error(
        labels.velocities,
        velocity_values,
        weights=labels.onsets * hparams.velocity_loss_weight)

    if not hparams.drums_only:
      loss_metrics['frame'] = _masked_xent(labels.labels, frame_logits)

    loss = tf.losses.get_total_loss()

  if mode in (mode_keys.EVAL, mode_keys.PREDICT):
    frame_probs = tf.sigmoid(frame_logits)
    onset_probs = tf.sigmoid(onset_logits)
    offset_probs = (
        tf.sigmoid(offset_logits)
        if offset_network else tf.zeros_like(onset_probs))

    frame_predictions = frame_probs > hparams.predict_frame_threshold
    onset_predictions = onset_probs > hparams.predict_onset_threshold
    offset_predictions = offset_probs > hparams.predict_offset_threshold

    if hparams.drum_prediction_map:
      # Boolean rolls are reduced with 'any', velocities with 'max'.
      remap_bool = functools.partial(
          drum_mappings.map_pianoroll,
          mapping_name=hparams.drum_prediction_map,
          reduce_mode='any',
          min_pitch=constants.MIN_MIDI_PITCH)
      frame_predictions = tf.map_fn(remap_bool, frame_predictions)
      onset_predictions = tf.map_fn(remap_bool, onset_predictions)
      offset_predictions = tf.map_fn(remap_bool, offset_predictions)
      remap_values = functools.partial(
          drum_mappings.map_pianoroll,
          mapping_name=hparams.drum_prediction_map,
          reduce_mode='max',
          min_pitch=constants.MIN_MIDI_PITCH)
      velocity_values = tf.map_fn(remap_values, velocity_values)

    metrics_values = get_metrics(features, labels, frame_probs, onset_probs,
                                 frame_predictions, onset_predictions,
                                 offset_predictions, velocity_values, hparams)

    for loss_name, loss_tensor in loss_metrics.items():
      metrics_values['losses/' + loss_name] = loss_tensor

  if mode == mode_keys.TRAIN:
    decay_fn = functools.partial(
        tf.train.exponential_decay,
        decay_steps=hparams.decay_steps,
        decay_rate=hparams.decay_rate,
        staircase=True)
    train_op = contrib_layers.optimize_loss(
        name='training',
        loss=loss,
        global_step=tf.train.get_or_create_global_step(),
        learning_rate=hparams.learning_rate,
        learning_rate_decay_fn=decay_fn,
        clip_gradients=hparams.clip_norm,
        summaries=[],
        optimizer=lambda lr: contrib_tpu.CrossShardOptimizer(  # pylint:disable=g-long-lambda
            tf.train.AdamOptimizer(lr)))
    return contrib_tpu.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op)

  if mode == mode_keys.EVAL:
    metric_ops = {}
    for metric_name, metric_value in metrics_values.items():
      metric_ops[metric_name] = tf.metrics.mean(metric_value)
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, eval_metric_ops=metric_ops)

  if mode == mode_keys.PREDICT:
    predictions = {
        'frame_probs': frame_probs,
        'onset_probs': onset_probs,
        'frame_predictions': frame_predictions,
        'onset_predictions': onset_predictions,
        'offset_predictions': offset_predictions,
        'velocity_values': velocity_values,
        'sequence_predictions': _predict_sequences(
            frame_probs=frame_probs,
            onset_probs=onset_probs,
            frame_predictions=frame_predictions,
            onset_predictions=onset_predictions,
            offset_predictions=offset_predictions,
            velocity_values=velocity_values,
            hparams=hparams),
        # Include some features and labels in output because Estimator
        # 'predict' API does not give access to them.
        'sequence_ids': features.sequence_id,
        'sequence_labels': labels.note_sequence,
        'frame_labels': labels.labels,
        'onset_labels': labels.onsets,
    }
    for metric_name, metric_value in metrics_values.items():
      predictions[metric_name] = tf.stack(metric_value)
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  raise ValueError('Unsupported mode: %s' % mode)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """Builds the TPUEstimator spec for the start/end/answer-type BERT model.

  Args:
    features: Mapping holding `unique_ids`, `input_ids`, `input_mask` and
      `segment_ids`; in TRAIN mode also `start_positions`, `end_positions`
      and `answer_types`.
    labels: Unused; training targets are read from `features`.
    mode: A `tf.estimator.ModeKeys` value; only TRAIN and PREDICT are
      supported.
    params: Unused.

  Returns:
    A `contrib_tpu.TPUEstimatorSpec` for the requested mode.

  Raises:
    ValueError: If `mode` is neither TRAIN nor PREDICT.
  """
  tf.logging.info("*** Features ***")
  for key in sorted(features.keys()):
    tf.logging.info(" name = %s, shape = %s" % (key, features[key].shape))

  unique_ids = features["unique_ids"]
  input_ids = features["input_ids"]
  input_mask = features["input_mask"]
  segment_ids = features["segment_ids"]

  start_logits, end_logits, answer_type_logits = create_model(
      bert_config=bert_config,
      is_training=(mode == tf.estimator.ModeKeys.TRAIN),
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  tvars = tf.trainable_variables()
  initialized_variable_names = {}
  scaffold_fn = None
  if init_checkpoint:
    assignment_map, initialized_variable_names = (
        modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint))
    if not use_tpu:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
    else:
      # On TPU the checkpoint restore is deferred to the scaffold function.
      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold

  tf.logging.info("**** Trainable Variables ****")
  for variable in tvars:
    marker = ", *INIT_FROM_CKPT*" if (
        variable.name in initialized_variable_names) else ""
    tf.logging.info(" name = %s, shape = %s%s", variable.name, variable.shape,
                    marker)

  if mode == tf.estimator.ModeKeys.TRAIN:
    seq_length = modeling.get_shape_list(input_ids)[1]

    def _one_hot_xent(logits, targets, depth):
      """Mean negative log-likelihood of `targets` among `depth` classes."""
      one_hot_targets = tf.one_hot(targets, depth=depth, dtype=tf.float32)
      log_probs = tf.nn.log_softmax(logits, axis=-1)
      per_example = tf.reduce_sum(one_hot_targets * log_probs, axis=-1)
      return -tf.reduce_mean(per_example)

    start_positions = features["start_positions"]
    end_positions = features["end_positions"]
    answer_types = features["answer_types"]

    # Positions are classified over the sequence, answer types over the
    # members of `AnswerType`.
    start_loss = _one_hot_xent(start_logits, start_positions, seq_length)
    end_loss = _one_hot_xent(end_logits, end_positions, seq_length)
    answer_type_loss = _one_hot_xent(answer_type_logits, answer_types,
                                     len(AnswerType))

    total_loss = (start_loss + end_loss + answer_type_loss) / 3.0

    train_op = optimization.create_optimizer(
        total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        scaffold_fn=scaffold_fn)

  if mode == tf.estimator.ModeKeys.PREDICT:
    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        predictions={
            "unique_ids": unique_ids,
            "start_logits": start_logits,
            "end_logits": end_logits,
            "answer_type_logits": answer_type_logits,
        },
        scaffold_fn=scaffold_fn)

  raise ValueError("Only TRAIN and PREDICT modes are supported: %s" % (mode))