def compute_loss(log_probs, positions, depth=seq_length):
    """Return the mean negative log-probability assigned to `positions`.

    Args:
        log_probs: log-probabilities; multiplied element-wise with a one-hot
            encoding whose last axis has `depth` entries (presumably the
            output of a log-softmax -- confirm against the caller).
        positions: integer target indices to one-hot encode.
        depth: size of the one-hot axis. The default is whatever
            `seq_length` is in scope at the time this function is defined.

    Returns:
        A scalar tensor: the per-example losses averaged over all
        remaining axes.
    """
    target_mask = tf.one_hot(positions, depth=depth, dtype=tf.float32)
    # Only the entry at the target index survives the mask, so the sum over
    # the last axis picks out the target log-probability.
    per_example_nll = -tf.reduce_sum(target_mask * log_probs, axis=-1)
    return tf.reduce_mean(per_example_nll)
def kl_div_loss(student_logits, teacher_logits, temperature=1):
    """Return the mean KL divergence D_kl(teacher || student).

    With P the teacher distribution and Q the student distribution, both
    obtained by a softmax over the last axis of the logits divided by
    `temperature`:

        D_kl(P || Q) = sum(P * (log P - log Q))

    Args:
        student_logits: unnormalized student scores.
        teacher_logits: unnormalized teacher scores.
        temperature: divisor applied to both sets of logits before the
            softmax. The result is NOT rescaled by temperature ** 2.

    Returns:
        A scalar tensor: the divergence summed over the last axis and
        averaged over every other axis.
    """
    scaled_teacher = teacher_logits / temperature
    scaled_student = student_logits / temperature

    teacher_probs = tf.nn.softmax(scaled_teacher)
    teacher_log_probs = tf.nn.log_softmax(scaled_teacher)
    student_log_probs = tf.nn.log_softmax(scaled_student)

    pointwise_kl = teacher_probs * (teacher_log_probs - student_log_probs)
    per_example_kl = tf.reduce_sum(pointwise_kl, -1)
    return tf.reduce_mean(per_example_kl)
def get_race_loss(FLAGS, features, is_training, num_choices=4):
    """Loss for downstream multi-choice QA tasks such as RACE.

    Every (example, choice) pair is scored by an XLNet model followed by a
    single-unit dense layer; the scores of one example's choices are then
    normalized with a softmax and compared against the gold label.

    Args:
        FLAGS: configuration object. This function reads
            `model_config_path`, `summary_type` and `use_summ_proj` from it
            and also hands it to `xlnet.create_run_config`.
        features: dict holding "input_ids", "segment_ids", "input_mask" and
            "label_ids". The first three are assumed to be laid out as
            [bsz, num_choices * seq_len] -- TODO confirm against the input
            pipeline.
        is_training: forwarded to `xlnet.create_run_config`.
        num_choices: number of candidate answers per example. Defaults to
            4, the value that used to be hard-coded.

    Returns:
        A tuple `(total_loss, per_example_loss, logits)` where `total_loss`
        is the mean of `per_example_loss` and `logits` has shape
        [bsz, num_choices].
    """
    bsz_per_core = tf.shape(features["input_ids"])[0]

    def _transform_features(feature):
        # Split out the choice axis, then move the per-choice sequence axis
        # to the front: the result is [seq_len, bsz * num_choices].
        out = tf.reshape(feature, [bsz_per_core, num_choices, -1])
        out = tf.transpose(out, [2, 0, 1])
        out = tf.reshape(out, [-1, bsz_per_core * num_choices])
        return out

    inp = _transform_features(features["input_ids"])
    seg_id = _transform_features(features["segment_ids"])
    inp_mask = _transform_features(features["input_mask"])
    label = tf.reshape(features["label_ids"], [bsz_per_core])

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    # NOTE(review): the second positional argument is presumably the
    # fine-tuning switch -- confirm against xlnet.create_run_config.
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=inp,
        seg_ids=seg_id,
        input_mask=inp_mask)
    summary = xlnet_model.get_pooled_out(
        FLAGS.summary_type, FLAGS.use_summ_proj)

    with tf.variable_scope("logits"):
        # One scalar score per (example, choice) pair, regrouped so that
        # each row holds the scores of a single example.
        logits = tf.layers.dense(
            summary, 1, kernel_initializer=xlnet_model.get_initializer())
        logits = tf.reshape(logits, [bsz_per_core, num_choices])

        one_hot_target = tf.one_hot(label, num_choices)
        per_example_loss = -tf.reduce_sum(
            tf.nn.log_softmax(logits) * one_hot_target, -1)
        total_loss = tf.reduce_mean(per_example_loss)

    return total_loss, per_example_loss, logits
def model_fn(features, labels, mode, params):
    """Build the Estimator spec for the classification model.

    `FLAGS`, `function_builder`, `model_utils` and `logger` are taken from
    the surrounding scope. `labels` and `params` are accepted but not used.

    Args:
        features: dict of input tensors. "feature_id" is read in PREDICT
            mode and "cls" (the target class ids) otherwise; the whole dict
            is also passed to
            `function_builder.get_classification_outputs`.
        labels: unused.
        mode: a `tf.estimator.ModeKeys` value. PREDICT returns a
            prediction spec; every other mode falls through to the
            training spec (there is no dedicated EVAL branch).
        params: unused.

    Returns:
        A `tf.contrib.tpu.TPUEstimatorSpec` when `FLAGS.use_tpu` is set,
        otherwise a `tf.estimator.EstimatorSpec`.
    """
    # Training or evaluation.
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    outputs = function_builder.get_classification_outputs(
        FLAGS, features, is_training)
    cls_logits = outputs["cls_logits"]

    # Check model parameters.
    num_params = sum(
        np.prod(var.shape) for var in tf.trainable_variables())
    logger.info('#params: {}'.format(num_params))

    # Load pretrained models.
    scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            "feature_id": features["feature_id"],
            "cls_logits": cls_logits,
        }
        if FLAGS.use_tpu:
            return tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Classification loss: mean negative log-probability of the target
    # class. `num_choices` takes precedence over `num_classes` when truthy.
    cls_log_probs = outputs["cls_log_probs"]
    num_classes = FLAGS.num_choices or FLAGS.num_classes
    target_one_hot = tf.one_hot(
        features["cls"], depth=num_classes, dtype=tf.float32)
    per_example_nll = -tf.reduce_sum(target_one_hot * cls_log_probs, axis=-1)
    total_loss = tf.reduce_mean(per_example_nll)

    # Configure the optimizer.
    train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)
    monitor_dict = {'loss/cls': total_loss, "lr": learning_rate}

    if not FLAGS.use_tpu:
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=total_loss, train_op=train_op)

    # On TPU a host call reporting the monitored scalars is attached, but
    # only for non-regression runs, where accuracy is also tracked.
    host_call = None
    if not FLAGS.is_regression:
        label_ids = tf.reshape(features['cls'], [-1])
        predicted_ids = tf.argmax(
            cls_logits, axis=-1, output_type=label_ids.dtype)
        is_correct = tf.equal(predicted_ids, label_ids)
        monitor_dict["accuracy"] = tf.reduce_mean(
            tf.cast(is_correct, tf.float32))

        host_call = function_builder.construct_scalar_host_call(
            monitor_dict=monitor_dict,
            model_dir=FLAGS.model_dir,
            prefix="train/",
            reduce_fn=tf.reduce_mean)

    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        host_call=host_call,
        scaffold_fn=scaffold_fn)
def summarize_sequence(summary_type, hidden, d_model, n_head, d_head, dropout,
                       dropatt, input_mask, is_training, initializer,
                       scope=None, reuse=None, use_proj=True):
    """Reduce a sequence of hidden states to one summary per example.

    Different classification tasks may or may not share the parameters used
    to summarize the sequence features. To share them, leave `scope` at its
    default of `None`; otherwise pass a distinct `scope` for each task.

    Args:
        summary_type: one of 'last', 'first', 'mean' or 'attn'.
        hidden: hidden states; indexed and averaged along axis 0, with the
            batch size read from axis 1 (assumes [seq_len, bsz, d_model] --
            TODO confirm).
        d_model: width of the summary bias and of the output projection.
        n_head, d_head, dropatt: forwarded to `multihead_attn` ('attn'
            only).
        dropout: rate for the final dropout; also forwarded to
            `multihead_attn`.
        input_mask: optional rank-2 mask, expanded to rank 4 before being
            passed to `multihead_attn` ('attn' only).
        is_training: enables dropout.
        initializer: initializer for the summary bias and projection kernel.
        scope: variable scope name. The default name's spelling
            ('sequnece_summary') is kept as is because it becomes part of
            the variable names.
        reuse: forwarded to `tf.variable_scope`.
        use_proj: if true, apply a tanh dense projection, as in BERT.

    Returns:
        The summary tensor after the optional projection and dropout.

    Raises:
        ValueError: if `summary_type` is not one of the supported values.
    """
    with tf.variable_scope(scope, 'sequnece_summary', reuse=reuse):
        if summary_type == 'last':
            summary = hidden[-1]
        elif summary_type == 'first':
            summary = hidden[0]
        elif summary_type == 'mean':
            summary = tf.reduce_mean(hidden, axis=0)
        elif summary_type == 'attn':
            batch_size = tf.shape(hidden)[1]

            # A single learned query vector, repeated for every example.
            query = tf.get_variable(
                'summary_bias', [d_model],
                dtype=hidden.dtype, initializer=initializer)
            query = tf.tile(query[None, None], [1, batch_size, 1])

            if input_mask is None:
                attn_mask = None
            else:
                attn_mask = input_mask[None, :, :, None]

            attended = multihead_attn(
                query, hidden, hidden, attn_mask, d_model, n_head, d_head,
                dropout, dropatt, is_training, initializer, residual=False)
            # The query has length one, so keep its only position.
            summary = attended[0]
        else:
            raise ValueError(
                'Unsupported summary type {}'.format(summary_type))

        # Use another projection as in BERT.
        if use_proj:
            summary = tf.layers.dense(
                summary,
                d_model,
                activation=tf.tanh,
                kernel_initializer=initializer,
                name='summary')

        summary = tf.layers.dropout(
            summary, dropout, training=is_training, name='dropout')

    return summary