def get_regression_loss(FLAGS, features, is_training):
    """Build the XLNet regression head and compute its loss.

    Args:
        FLAGS: Settings object. `model_config_path`, `summary_type`,
            `use_summ_proj` and `task_name` are read here, and the whole
            object is forwarded to `xlnet.create_run_config`.
        features: Mapping with "input_ids", "segment_ids", "input_mask" and
            "label_ids" entries.
        is_training: Forwarded to `xlnet.create_run_config`.

    Returns:
        `(total_loss, per_example_loss, logits)`, where `total_loss` is the
        mean of `per_example_loss`.
    """
    batch_size = tf.shape(features["input_ids"])[0]

    # Every model input has its two axes swapped before it reaches XLNet.
    swap_axes = [1, 0]
    token_ids = tf.transpose(features["input_ids"], swap_axes)
    segment_ids = tf.transpose(features["segment_ids"], swap_axes)
    attention_mask = tf.transpose(features["input_mask"], swap_axes)
    targets = tf.reshape(features["label_ids"], [batch_size])

    model_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    model = xlnet.XLNetModel(
        xlnet_config=model_config,
        run_config=xlnet.create_run_config(is_training, True, FLAGS),
        input_ids=token_ids,
        seg_ids=segment_ids,
        input_mask=attention_mask)

    pooled = model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj)

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        # The head's scope is named after the lower-cased task name.
        head_scope = "regression_{}".format(FLAGS.task_name.lower())
        per_example_loss, logits = modeling.regression_loss(
            hidden=pooled,
            labels=targets,
            initializer=model.get_initializer(),
            scope=head_scope,
            return_logits=True)
        mean_loss = tf.reduce_mean(per_example_loss)

    return mean_loss, per_example_loss, logits
def _create_model(self, input_ids, input_masks, segment_ids, label_ids,
                  label_list, mode):
    """Build the XLNet token-tagging (NER) graph.

    Args:
        input_ids: Token ids; transposed with perm [1, 0] for XLNet.
        input_masks: Input mask; `1 - input_masks` is used as the mask of
            positions that contribute to predictions and loss.
        segment_ids: Segment ids; transposed with perm [1, 0] for XLNet.
        label_ids: Gold label ids, or None to skip the loss.
        label_list: Label vocabulary; its length is the width of the
            output projection.
        mode: A `tf.estimator.ModeKeys` value.

    Returns:
        `(loss, predict_ids)`. `loss` stays the constant 0.0 unless `mode`
        is TRAIN or EVAL and `label_ids` is not None.
    """
    training = mode == tf.estimator.ModeKeys.TRAIN

    run_config = xlnet.create_run_config(training, True, FLAGS)
    backbone = xlnet.XLNetModel(
        xlnet_config=self.model_config,
        run_config=run_config,
        input_ids=tf.transpose(input_ids, perm=[1, 0]),
        input_mask=tf.transpose(input_masks, perm=[1, 0]),
        seg_ids=tf.transpose(segment_ids, perm=[1, 0]))
    kernel_init = backbone.get_initializer()

    with tf.variable_scope("ner", reuse=tf.AUTO_REUSE):
        token_states = tf.transpose(backbone.get_sequence_output(),
                                    perm=[1, 0, 2])
        keep_mask = tf.cast(tf.expand_dims(1 - input_masks, axis=-1),
                            dtype=tf.float32)

        projection = tf.keras.layers.Dense(
            units=len(label_list),
            activation=None,
            use_bias=True,
            kernel_initializer=kernel_init,
            bias_initializer=tf.zeros_initializer,
            kernel_regularizer=None,
            bias_regularizer=None,
            trainable=True)
        dropout = tf.keras.layers.Dropout(rate=0.1,
                                          seed=np.random.randint(10000))

        token_logits = projection(token_states)
        if training:
            token_logits = dropout(token_logits)

        # Positions outside the mask get MIN_FLOAT in place of their logits.
        masked_logits = (token_logits * keep_mask
                         + MIN_FLOAT * (1 - keep_mask))
        token_probs = tf.nn.softmax(masked_logits, axis=-1)
        predict_ids = tf.cast(tf.argmax(token_probs, axis=-1),
                              dtype=tf.int32)

    loss = tf.constant(0.0, dtype=tf.float32)
    supervised_modes = (tf.estimator.ModeKeys.TRAIN,
                        tf.estimator.ModeKeys.EVAL)
    if mode not in supervised_modes or label_ids is None:
        return loss, predict_ids

    with tf.variable_scope("loss", reuse=tf.AUTO_REUSE):
        float_labels = tf.cast(label_ids, dtype=tf.float32)
        loss_mask = tf.cast(1 - input_masks, dtype=tf.float32)
        # Labels at masked-out positions are zeroed before the int cast.
        safe_labels = tf.cast(float_labels * loss_mask, dtype=tf.int32)
        token_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=safe_labels, logits=masked_logits)
        # Normaliser: sum over rows of each row's maximum mask value.
        normaliser = tf.reduce_sum(tf.reduce_max(loss_mask, axis=-1))
        loss = tf.reduce_sum(token_xent * loss_mask) / normaliser

    return loss, predict_ids
def load_model(self, model: str, model_path: str):
    """Load the XLNet configuration and tokenizer found under `model_path`.

    The checkpoint files are expected inside the first sub-directory of
    `model_path`, in the order `os.walk` lists them. That sub-directory is
    used to read the model config (`Embeddings.mode_config_path`) and is
    handed to `self.load_tokenizer`.

    Args:
        model: Model identifier; stored unchanged on `self.model`.
        model_path: Directory that contains the model sub-directory.

    Raises:
        FileNotFoundError: If `model_path` cannot be listed or contains no
            sub-directory. Previously this surfaced as a bare
            `StopIteration` or `IndexError`.
    """
    try:
        _, subdirs, _ = next(os.walk(model_path))
    except StopIteration:
        # os.walk yields nothing when the top directory cannot be read.
        raise FileNotFoundError(
            "Model path {!r} does not exist or is not a readable "
            "directory".format(model_path)) from None
    if not subdirs:
        raise FileNotFoundError(
            "Model path {!r} contains no model sub-directory".format(
                model_path))

    model_path = os.path.join(model_path, subdirs[0])
    self.xlnet_config = xlnet.XLNetConfig(
        json_path=os.path.join(model_path, Embeddings.mode_config_path))
    # NOTE(review): the run config is always built with is_training=True;
    # confirm this is intended for a loader.
    self.run_config = xlnet.create_run_config(is_training=True,
                                              is_finetune=True,
                                              FLAGS=Flags)
    self.load_tokenizer(model_path)
    self.model = model
    print("Model loaded Successfully !")
def __init__(self, flags, input_ids, seg_ids, input_mask):
    """Create an XLNet model and keep it on `self.model`.

    Args:
        flags: Settings object; `model_config_path` is read here and the
            whole object is forwarded to `create_run_config`.
        input_ids: Passed to `XLNetModel` as `input_ids`.
        seg_ids: Passed to `XLNetModel` as `seg_ids`.
        input_mask: Passed to `XLNetModel` as `input_mask`.
    """
    model_config = xln.XLNetConfig(json_path=flags.model_config_path)
    # The run config is always built in training + fine-tuning mode.
    model_kwargs = dict(
        xlnet_config=model_config,
        run_config=xln.create_run_config(is_training=True,
                                         is_finetune=True,
                                         FLAGS=flags),
        input_ids=input_ids,
        seg_ids=seg_ids,
        input_mask=input_mask)
    self.model = xln.XLNetModel(**model_kwargs)
def create_model(cf, input_ids, input_mask, segment_ids, labels,
                 is_training=True):
    """Build the XLNet classification model.

    Args:
        cf: Configuration object. `model_config_path`, `summary_type`,
            `use_summ_proj`, `cls_scope`, `task_name` and `num_labels` are
            read here, and the object is forwarded to
            `xlnet.create_run_config`.
        input_ids: Token ids; transposed with perm [1, 0] for XLNet.
        input_mask: Input mask; transposed with perm [1, 0] for XLNet.
        segment_ids: Segment ids; transposed with perm [1, 0] for XLNet.
        labels: Labels; reshaped to one value per row of `input_ids`.
        is_training: Forwarded to `xlnet.create_run_config`.

    Returns:
        `(total_loss, per_example_loss, logits)`, where `total_loss` is the
        mean of `per_example_loss`.
    """
    batch_size = tf.shape(input_ids)[0]

    swap_axes = [1, 0]
    token_ids = tf.transpose(input_ids, swap_axes)
    seg_ids = tf.transpose(segment_ids, swap_axes)
    attention_mask = tf.transpose(input_mask, swap_axes)
    targets = tf.reshape(labels, [batch_size])

    model_config = xlnet.XLNetConfig(json_path=cf.model_config_path)
    model = xlnet.XLNetModel(
        xlnet_config=model_config,
        run_config=xlnet.create_run_config(is_training, True, cf),
        input_ids=token_ids,
        seg_ids=seg_ids,
        input_mask=attention_mask)
    pooled = model.get_pooled_out(cf.summary_type, cf.use_summ_proj)

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        # An explicit, non-empty cls_scope wins over the task name.
        if cf.cls_scope:
            scope_suffix = cf.cls_scope
        else:
            scope_suffix = cf.task_name.lower()
        head_scope = "classification_{}".format(scope_suffix)

        per_example_loss, logits = modeling.classification_loss(
            hidden=pooled,
            labels=targets,
            n_class=cf.num_labels,
            initializer=model.get_initializer(),
            scope=head_scope,
            return_logits=True)
        mean_loss = tf.reduce_mean(per_example_loss)

    return mean_loss, per_example_loss, logits
def main(_):
    """Export a SavedModel after running a single dummy training step.

    An estimator is built from FLAGS, trained for one step on a single
    `PaddingInputExample`, and then exported to `FLAGS.export_dir`.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    tpu_run_config = model_utils.configure_tpu(FLAGS)
    xlnet_model_config = xlnet.XLNetConfig(
        json_path=FLAGS.model_config_path)
    xlnet_run_config = xlnet.create_run_config(False, True, FLAGS)

    builder = XLNetModelBuilder(
        default_model_config=xlnet_model_config,
        default_run_config=xlnet_run_config,
        default_init_checkpoint=FLAGS.init_checkpoint,
        use_tpu=FLAGS.use_tpu)
    model_fn = builder.get_model_fn(xlnet_model_config,
                                    xlnet_run_config,
                                    FLAGS.init_checkpoint,
                                    FLAGS.model_type)

    # If TPU is not available, this will fall back to normal Estimator on
    # CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=tpu_run_config,
        export_to_tpu=FLAGS.use_tpu,
        train_batch_size=1)

    converter = XLNetExampleConverter(
        label_list=[],
        max_seq_length=FLAGS.max_seq_length,
        tokenizer=XLNetTokenizer(sp_model_file=FLAGS.spiece_model_file,
                                 lower_case=FLAGS.lower_case))

    # One padding example is enough for the single training step below.
    dummy_features = converter.convert_examples_to_features(
        [PaddingInputExample()])
    dummy_input_fn = XLNetInputBuilder.get_input_builder(
        dummy_features, FLAGS.max_seq_length, True, False)
    estimator.train(dummy_input_fn, max_steps=1)

    tf.gfile.MakeDirs(FLAGS.export_dir)
    estimator.export_savedmodel(
        FLAGS.export_dir,
        XLNetInputBuilder.get_serving_input_fn(FLAGS.max_seq_length),
        as_text=False)
def __init__(self, model_config_path, is_training, FLAGS, input_ids,
             segment_ids, input_mask, label, n_class):
    """Build the XLNet classification graph with loss, train op and accuracy.

    Args:
        model_config_path: Path of the XLNet JSON model configuration.
        is_training: Forwarded to `xlnet.create_run_config`.
        FLAGS: Settings object; `cls_scope`, `summary_type` and
            `use_summ_proj` are read here, and the object is forwarded to
            `xlnet.create_run_config` and `model_utils.get_train_op`.
        input_ids: Token ids; stored transposed with perm [1, 0].
        segment_ids: Segment ids; stored transposed with perm [1, 0].
        input_mask: Input mask; stored transposed with perm [1, 0].
        label: Gold labels for the classification loss and accuracy.
        n_class: Number of classes.
    """
    self.xlnet_config = xlnet.XLNetConfig(json_path=model_config_path)
    self.run_config = xlnet.create_run_config(is_training, True, FLAGS)

    swap_axes = [1, 0]
    self.input_ids = tf.transpose(input_ids, swap_axes)
    self.segment_ids = tf.transpose(segment_ids, swap_axes)
    self.input_mask = tf.transpose(input_mask, swap_axes)

    self.model = xlnet.XLNetModel(
        xlnet_config=self.xlnet_config,
        run_config=self.run_config,
        input_ids=self.input_ids,
        seg_ids=self.segment_ids,
        input_mask=self.input_mask)

    head_scope = FLAGS.cls_scope
    pooled = self.model.get_pooled_out(FLAGS.summary_type,
                                       FLAGS.use_summ_proj)
    loss_and_logits = modeling.classification_loss(
        hidden=pooled,
        labels=label,
        n_class=n_class,
        initializer=self.model.get_initializer(),
        scope=head_scope,
        return_logits=True)
    self.per_example_loss, self.logits = loss_and_logits
    self.total_loss = tf.reduce_mean(self.per_example_loss)

    with tf.name_scope("train_op"):
        # Only the first of the three returned values is kept.
        self.train_op, _, _ = model_utils.get_train_op(FLAGS,
                                                       self.total_loss)

    with tf.name_scope("acc"):
        target_one_hot = tf.one_hot(label, n_class)
        self.acc = self.accuracy(self.logits, target_one_hot)
def get_classification_loss(options, features, n_class, is_training):
    """Build the XLNet classification head and compute its loss.

    Args:
        options: Mapping of settings; 'model_config_file', 'summary_type',
            'use_summ_proj', 'cls_scope' and 'task_name' are read here, and
            the mapping is forwarded to `xlnet.create_run_config`.
        features: Mapping with "input_ids", "segment_ids", "input_mask" and
            "label_ids" entries.
        n_class: Number of classes.
        is_training: Forwarded to `xlnet.create_run_config`.

    Returns:
        `(total_loss, per_example_loss, logits)`, where `total_loss` is the
        mean of `per_example_loss`.
    """
    batch_size = tf.shape(features["input_ids"])[0]

    swap_axes = [1, 0]
    token_ids = tf.transpose(features["input_ids"], swap_axes)
    segment_ids = tf.transpose(features["segment_ids"], swap_axes)
    attention_mask = tf.transpose(features["input_mask"], swap_axes)
    targets = tf.reshape(features["label_ids"], [batch_size])

    model_config = xlnet.XLNetConfig(json_path=options['model_config_file'])
    model = xlnet.XLNetModel(
        xlnet_config=model_config,
        run_config=xlnet.create_run_config(is_training, True, options),
        input_ids=token_ids,
        seg_ids=segment_ids,
        input_mask=attention_mask)
    pooled = model.get_pooled_out(options['summary_type'],
                                  options['use_summ_proj'])

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        # A non-empty 'cls_scope' overrides the lower-cased task name.
        if options['cls_scope']:
            scope_suffix = options['cls_scope']
        else:
            scope_suffix = options['task_name'].lower()
        head_scope = "classification_{}".format(scope_suffix)

        per_example_loss, logits = modeling.classification_loss(
            hidden=pooled,
            labels=targets,
            n_class=n_class,
            initializer=model.get_initializer(),
            scope=head_scope,
            return_logits=True)
        mean_loss = tf.reduce_mean(per_example_loss)

    return mean_loss, per_example_loss, logits
def get_race_loss(FLAGS, features, is_training):
    """Build the four-way multiple-choice head (e.g. RACE) and its loss.

    Args:
        FLAGS: Settings object; `model_config_path`, `summary_type` and
            `use_summ_proj` are read here, and the object is forwarded to
            `xlnet.create_run_config`.
        features: Mapping with "input_ids", "segment_ids", "input_mask" and
            "label_ids" entries.
        is_training: Forwarded to `xlnet.create_run_config`.

    Returns:
        `(total_loss, per_example_loss, logits)`; `logits` is reshaped to
        `[batch, 4]`.
    """
    batch_size = tf.shape(features["input_ids"])[0]

    def _to_model_layout(tensor):
        # [batch, 4 * len] -> [batch, 4, len] -> [len, batch, 4]
        # -> [len, batch * 4]
        per_choice = tf.reshape(tensor, [batch_size, 4, -1])
        length_first = tf.transpose(per_choice, [2, 0, 1])
        return tf.reshape(length_first, [-1, batch_size * 4])

    token_ids = _to_model_layout(features["input_ids"])
    segment_ids = _to_model_layout(features["segment_ids"])
    attention_mask = _to_model_layout(features["input_mask"])
    targets = tf.reshape(features["label_ids"], [batch_size])

    model_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    model = xlnet.XLNetModel(
        xlnet_config=model_config,
        run_config=xlnet.create_run_config(is_training, True, FLAGS),
        input_ids=token_ids,
        seg_ids=segment_ids,
        input_mask=attention_mask)
    pooled = model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj)

    with tf.variable_scope("logits"):
        # One score per (example, choice) pair, regrouped per example.
        choice_scores = tf.layers.dense(
            pooled, 1, kernel_initializer=model.get_initializer())
        logits = tf.reshape(choice_scores, [batch_size, 4])

        target_one_hot = tf.one_hot(targets, 4)
        log_probs = tf.nn.log_softmax(logits)
        per_example_loss = -tf.reduce_sum(log_probs * target_one_hot, -1)
        mean_loss = tf.reduce_mean(per_example_loss)

    return mean_loss, per_example_loss, logits
def main(_):
    """Run the NER pipeline stages selected by FLAGS.

    Depending on `FLAGS.do_train`, `FLAGS.do_eval`, `FLAGS.do_predict` and
    `FLAGS.do_export`, this trains the estimator, logs token-level
    precision/recall/F1, records predictions through `XLNetPredictRecorder`,
    and exports a SavedModel to `FLAGS.export_dir`.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    np.random.seed(FLAGS.random_seed)

    processor = NerProcessor(data_dir=FLAGS.data_dir,
                             input_file=FLAGS.input_file,
                             task_name=FLAGS.task_name.lower())
    label_list = processor.get_labels()
    tf.logging.info(label_list)

    tpu_config = model_utils.configure_tpu(FLAGS)
    model_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(False, True, FLAGS)

    model_builder = XLNetModelBuilder(
        default_model_config=model_config,
        default_run_config=run_config,
        default_init_checkpoint=FLAGS.init_checkpoint,
        use_tpu=FLAGS.use_tpu)
    model_fn = model_builder.get_model_fn(model_config, run_config,
                                          FLAGS.init_checkpoint, label_list)

    # If TPU is not available, this will fall back to normal Estimator on
    # CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=tpu_config,
        export_to_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    tokenizer = XLNetTokenizer(sp_model_file=FLAGS.spiece_model_file,
                               lower_case=FLAGS.lower_case)
    example_converter = XLNetExampleConverter(
        label_list=label_list,
        max_seq_length=FLAGS.max_seq_length,
        tokenizer=tokenizer)

    if FLAGS.do_train:
        train_examples = processor.get_chem_examples()

        tf.logging.info("***** Run training *****")
        tf.logging.info(" Num examples = %d", len(train_examples))
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info(" Num steps = %d", FLAGS.train_steps)

        train_features = example_converter.convert_examples_to_features(
            train_examples)
        train_input_fn = XLNetInputBuilder.get_input_builder(
            train_features, FLAGS.max_seq_length, True, True)
        estimator.train(input_fn=train_input_fn,
                        max_steps=FLAGS.train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples()

        tf.logging.info("***** Run evaluation *****")
        tf.logging.info(" Num examples = %d", len(eval_examples))
        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

        eval_features = example_converter.convert_examples_to_features(
            eval_examples)
        eval_input_fn = XLNetInputBuilder.get_input_builder(
            eval_features, FLAGS.max_seq_length, False, False)
        result = estimator.evaluate(input_fn=eval_input_fn)

        precision = result["precision"]
        recall = result["recall"]
        # When precision and recall are both zero the harmonic mean is
        # undefined; report 0.0 instead of dividing by zero.
        pr_sum = precision + recall
        if pr_sum > 0:
            f1_score = 2.0 * precision * recall / pr_sum
        else:
            f1_score = 0.0

        tf.logging.info("***** Evaluation result *****")
        tf.logging.info(" Precision (token-level) = %s", str(precision))
        tf.logging.info(" Recall (token-level) = %s", str(recall))
        tf.logging.info(" F1 score (token-level) = %s", str(f1_score))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples()
        pmids = [example.guid for example in predict_examples]

        tf.logging.info("***** Run prediction *****")
        tf.logging.info(" Num examples = %d", len(predict_examples))
        tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)

        predict_features = example_converter.convert_examples_to_features(
            predict_examples)
        predict_input_fn = XLNetInputBuilder.get_input_builder(
            predict_features, FLAGS.max_seq_length, False, False)
        result = estimator.predict(input_fn=predict_input_fn)

        predict_recorder = XLNetPredictRecorder(
            output_dir=FLAGS.output_dir,
            label_list=label_list,
            guids=pmids,
            max_seq_length=FLAGS.max_seq_length,
            tokenizer=tokenizer,
            predict_tag=FLAGS.predict_tag)

        # Pair each converted feature with its prediction, in order.
        predicts = [{
            "input_ids": feature.input_ids,
            "input_masks": feature.input_masks,
            "label_ids": feature.label_ids,
            "predict_ids": predict["predict"].tolist()
        } for feature, predict in zip(predict_features, result)]
        predict_recorder.record(predicts)

    if FLAGS.do_export:
        tf.logging.info("***** Running exporting *****")
        tf.gfile.MakeDirs(FLAGS.export_dir)
        serving_input_fn = XLNetInputBuilder.get_serving_input_fn(
            FLAGS.max_seq_length)
        estimator.export_savedmodel(FLAGS.export_dir, serving_input_fn,
                                    as_text=False)
def _create_model(self, input_ids, input_masks, segment_ids, sent_label_ids,
                  sent_label_list, mode):
    """Build the XLNet sentence-classification graph.

    Args:
        input_ids: Token ids; transposed with perm [1, 0] for XLNet.
        input_masks: Input mask; the row-wise maximum of `1 - input_masks`
            is used as the per-sentence mask.
        segment_ids: Segment ids; transposed with perm [1, 0] for XLNet.
        sent_label_ids: Gold sentence labels, or None to skip the loss.
        sent_label_list: Label vocabulary; its length is the width of the
            output projection.
        mode: A `tf.estimator.ModeKeys` value.

    Returns:
        `(loss, sent_predict_ids, sent_predict_scores, sent_predict_probs)`.
        `loss` stays the constant 0.0 outside TRAIN/EVAL or when
        `sent_label_ids` is None.
    """
    training = mode == tf.estimator.ModeKeys.TRAIN

    run_config = xlnet.create_run_config(training, True, FLAGS)
    backbone = xlnet.XLNetModel(
        xlnet_config=self.model_config,
        run_config=run_config,
        input_ids=tf.transpose(input_ids, perm=[1, 0]),
        input_mask=tf.transpose(input_masks, perm=[1, 0]),
        seg_ids=tf.transpose(segment_ids, perm=[1, 0]))
    kernel_init = backbone.get_initializer()

    with tf.variable_scope("sent", reuse=tf.AUTO_REUSE):
        pooled = backbone.get_pooled_out("last")
        keep_mask = tf.cast(
            tf.reduce_max(1 - input_masks, axis=-1, keepdims=True),
            dtype=tf.float32)

        projection = tf.keras.layers.Dense(
            units=len(sent_label_list),
            activation=None,
            use_bias=True,
            kernel_initializer=kernel_init,
            bias_initializer=tf.zeros_initializer,
            kernel_regularizer=None,
            bias_regularizer=None,
            trainable=True)
        dropout = tf.keras.layers.Dropout(rate=0.1,
                                          seed=np.random.randint(10000))

        sent_logits = projection(pooled)
        if training:
            sent_logits = dropout(sent_logits)

        # Rows whose mask is zero get MIN_FLOAT in place of their logits.
        masked_logits = (sent_logits * keep_mask
                         + MIN_FLOAT * (1 - keep_mask))
        sent_predict_probs = tf.nn.softmax(masked_logits, axis=-1)
        sent_predict_ids = tf.cast(tf.argmax(sent_predict_probs, axis=-1),
                                   dtype=tf.int32)
        sent_predict_scores = tf.reduce_max(sent_predict_probs, axis=-1)

    loss = tf.constant(0.0, dtype=tf.float32)
    outputs = (sent_predict_ids, sent_predict_scores, sent_predict_probs)

    supervised_modes = (tf.estimator.ModeKeys.TRAIN,
                        tf.estimator.ModeKeys.EVAL)
    if mode not in supervised_modes:
        return (loss,) + outputs

    if sent_label_ids is not None:
        with tf.variable_scope("sent_loss", reuse=tf.AUTO_REUSE):
            float_labels = tf.cast(sent_label_ids, dtype=tf.float32)
            loss_mask = tf.cast(tf.reduce_max(1 - input_masks, axis=-1),
                                dtype=tf.float32)
            safe_labels = tf.cast(float_labels * loss_mask, dtype=tf.int32)
            sent_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=safe_labels, logits=masked_logits)
            # NOTE(review): loss_mask has already been reduced over its
            # last axis, so this normaliser reduces it once more - confirm
            # this is the intended denominator.
            normaliser = tf.reduce_sum(tf.reduce_max(loss_mask, axis=-1))
            sent_loss = tf.reduce_sum(sent_xent * loss_mask) / normaliser
            loss = loss + sent_loss

    return (loss,) + outputs
from xlnet import xlnet
from absl.flags import FLAGS

# Usage sketch: the setup code is omitted here. Before this point the caller
# must
#   - initialize FLAGS, and
#   - create the tf.Tensor instances `input_ids`, `seg_ids` and `input_mask`.

# XLNetConfig contains hyperparameters that are specific to a model
# checkpoint.
xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)

# RunConfig contains hyperparameters that could be different between
# pretraining and finetuning.
run_config = xlnet.create_run_config(is_training=True, is_finetune=True, FLAGS=FLAGS)

# Construct an XLNet model from the two configs and the input tensors.
xlnet_model = xlnet.XLNetModel(
    xlnet_config=xlnet_config,
    run_config=run_config,
    input_ids=input_ids,
    seg_ids=seg_ids,
    input_mask=input_mask)

# Get a summary of the sequence using the last hidden state.
summary = xlnet_model.get_pooled_out(summary_type="last")

# Get a sequence output.
seq_out = xlnet_model.get_sequence_output()

# Build your applications based on `summary` or `seq_out`.
def two_stream_loss(FLAGS, features, labels, mems, is_training):
    """Compute the pretraining LM loss of the two-stream XLNet model.

    Args:
        FLAGS: Settings object; `num_predict`, `model_dir` and
            `use_bfloat16` are read here, and the object is used to build
            both the XLNet config and the run config.
        features: Mapping with "input_k", "input_q", "seg_id", "perm_mask",
            "target", "target_mask" and, when `FLAGS.num_predict` is set,
            "target_mapping".
        labels: Not used by this function.
        mems: Mapping that may hold cached memory under the key "mems".
        is_training: Forwarded to `xlnet.create_run_config`.

    Returns:
        `(total_loss, new_mems, monitor_dict)`. `new_mems` maps "mems" to
        the model's new memory and `monitor_dict` holds "total_loss".

    Side effects:
        Writes the XLNet config to `<FLAGS.model_dir>/config.json`.
    """
    mem_key = "mems"
    cached_mems = mems.get(mem_key, None)

    # Unpack the inputs; every tensor has its batch axis moved.
    content_ids = tf.transpose(features["input_k"], [1, 0])
    query_ids = tf.transpose(features["input_q"], [1, 0])
    segment_ids = tf.transpose(features["seg_id"], [1, 0])
    permutation_mask = tf.transpose(features["perm_mask"], [1, 2, 0])

    target_mapping = None
    if FLAGS.num_predict is not None:
        # [num_predict x tgt_len x bsz]
        target_mapping = tf.transpose(features["target_mapping"], [1, 2, 0])

    # Targets and their mask for the LM loss.
    targets = tf.transpose(features["target"], [1, 0])
    target_mask = tf.transpose(features["target_mask"], [1, 0])

    # Build the XLNet config from FLAGS and persist it in the model dir.
    model_config = xlnet.XLNetConfig(FLAGS=FLAGS)
    model_config.to_json(os.path.join(FLAGS.model_dir, "config.json"))

    run_config = xlnet.create_run_config(is_training, False, FLAGS)

    model = xlnet.XLNetModel(
        xlnet_config=model_config,
        run_config=run_config,
        input_ids=content_ids,
        seg_ids=segment_ids,
        input_mask=None,
        mems=cached_mems,
        perm_mask=permutation_mask,
        target_mapping=target_mapping,
        inp_q=query_ids)

    hidden = model.get_sequence_output()
    new_mems = {mem_key: model.get_new_memory()}
    embedding_table = model.get_embedding_table()
    kernel_init = model.get_initializer()

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        token_loss = modeling.lm_loss(
            hidden=hidden,
            target=targets,
            n_token=model_config.n_token,
            d_model=model_config.d_model,
            initializer=kernel_init,
            lookup_table=embedding_table,
            tie_weight=True,
            bi_data=run_config.bi_data,
            use_tpu=run_config.use_tpu)

    if FLAGS.use_bfloat16:
        # The reduction below is done in float32.
        target_mask = tf.cast(target_mask, tf.float32)
        token_loss = tf.cast(token_loss, tf.float32)

    # Mask-weighted average of the per-token loss.
    total_loss = (tf.reduce_sum(token_loss * target_mask)
                  / tf.reduce_sum(target_mask))

    monitor_dict = {"total_loss": total_loss}
    return total_loss, new_mems, monitor_dict
def get_qa_outputs(FLAGS, features, is_training):
    """Build the span-extraction QA heads (e.g. SQuAD) on top of XLNet.

    Despite computing no loss itself, this function produces every tensor a
    caller needs for one: start/end log-probabilities while training, or
    top-k start/end candidates at inference, plus an answerability logit.

    Args:
        FLAGS: Settings object; `model_config_path`, `start_n_top`,
            `end_n_top` and `dropout` are read here, and the object is
            forwarded to `xlnet.create_run_config`.
        features: Mapping with "input_ids", "segment_ids", "input_mask",
            "cls_index", "p_mask" and, when training, "start_positions".
        is_training: Python bool selecting the training or inference branch.

    Returns:
        dict with "cls_logits" and either
        "start_log_probs"/"end_log_probs" (training) or
        "start_top_log_probs"/"start_top_index"/"end_top_log_probs"/
        "end_top_index" (inference).
    """
    # Model inputs are transposed with perm [1, 0] before being fed to XLNet.
    inp = tf.transpose(features["input_ids"], [1, 0])
    seg_id = tf.transpose(features["segment_ids"], [1, 0])
    inp_mask = tf.transpose(features["input_mask"], [1, 0])
    cls_index = tf.reshape(features["cls_index"], [-1])

    # After the transpose, axis 0 of `inp` is the sequence axis.
    seq_len = tf.shape(inp)[0]

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=inp,
        seg_ids=seg_id,
        input_mask=inp_mask)
    # `output` is indexed as (l, b, h) by the einsum calls below.
    output = xlnet_model.get_sequence_output()
    initializer = xlnet_model.get_initializer()

    return_dict = {}

    # invalid position mask such as query and special symbols (PAD, SEP, CLS)
    p_mask = features["p_mask"]

    # logit of the start position
    with tf.variable_scope("start_logits"):
        start_logits = tf.layers.dense(
            output,
            1,
            kernel_initializer=initializer)
        start_logits = tf.transpose(tf.squeeze(start_logits, -1), [1, 0])
        # Positions flagged in p_mask are pushed to -1e30 so that softmax
        # gives them (near) zero probability.
        start_logits_masked = start_logits * (1 - p_mask) - 1e30 * p_mask
        start_log_probs = tf.nn.log_softmax(start_logits_masked, -1)

    # logit of the end position
    with tf.variable_scope("end_logits"):
        if is_training:
            # during training, compute the end logits based on the
            # ground truth of the start position
            start_positions = tf.reshape(features["start_positions"], [-1])
            start_index = tf.one_hot(start_positions, depth=seq_len, axis=-1,
                                     dtype=tf.float32)
            # Select the hidden state at the gold start position ...
            start_features = tf.einsum("lbh,bl->bh", output, start_index)
            # ... and repeat it along the sequence axis.
            start_features = tf.tile(start_features[None], [seq_len, 1, 1])
            end_logits = tf.layers.dense(
                tf.concat([output, start_features], axis=-1),
                xlnet_config.d_model,
                kernel_initializer=initializer,
                activation=tf.tanh,
                name="dense_0")
            end_logits = tf.contrib.layers.layer_norm(
                end_logits, begin_norm_axis=-1)

            end_logits = tf.layers.dense(
                end_logits,
                1,
                kernel_initializer=initializer,
                name="dense_1")
            end_logits = tf.transpose(tf.squeeze(end_logits, -1), [1, 0])
            end_logits_masked = end_logits * (1 - p_mask) - 1e30 * p_mask
            end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
        else:
            # during inference, compute the end logits based on beam search
            start_top_log_probs, start_top_index = tf.nn.top_k(
                start_log_probs, k=FLAGS.start_n_top)
            start_index = tf.one_hot(start_top_index,
                                     depth=seq_len, axis=-1, dtype=tf.float32)
            # One start feature per (example, start candidate).
            start_features = tf.einsum("lbh,bkl->bkh", output, start_index)
            # Repeat both operands so every position is paired with every
            # start candidate.
            end_input = tf.tile(output[:, :, None],
                                [1, 1, FLAGS.start_n_top, 1])
            start_features = tf.tile(start_features[None],
                                     [seq_len, 1, 1, 1])
            end_input = tf.concat([end_input, start_features], axis=-1)
            end_logits = tf.layers.dense(
                end_input,
                xlnet_config.d_model,
                kernel_initializer=initializer,
                activation=tf.tanh,
                name="dense_0")
            end_logits = tf.contrib.layers.layer_norm(end_logits,
                                                      begin_norm_axis=-1)
            end_logits = tf.layers.dense(
                end_logits,
                1,
                kernel_initializer=initializer,
                name="dense_1")
            end_logits = tf.reshape(
                end_logits, [
                    seq_len, -1, FLAGS.start_n_top])
            end_logits = tf.transpose(end_logits, [1, 2, 0])
            end_logits_masked = end_logits * (
                1 - p_mask[:, None]) - 1e30 * p_mask[:, None]
            end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
            end_top_log_probs, end_top_index = tf.nn.top_k(
                end_log_probs, k=FLAGS.end_n_top)
            # Flatten the (start candidate, end candidate) grid per example.
            end_top_log_probs = tf.reshape(
                end_top_log_probs,
                [-1, FLAGS.start_n_top * FLAGS.end_n_top])
            end_top_index = tf.reshape(
                end_top_index,
                [-1, FLAGS.start_n_top * FLAGS.end_n_top])

    if is_training:
        return_dict["start_log_probs"] = start_log_probs
        return_dict["end_log_probs"] = end_log_probs
    else:
        return_dict["start_top_log_probs"] = start_top_log_probs
        return_dict["start_top_index"] = start_top_index
        return_dict["end_top_log_probs"] = end_top_log_probs
        return_dict["end_top_index"] = end_top_index

    # an additional layer to predict answerability
    with tf.variable_scope("answer_class"):
        # get the representation of CLS
        cls_index = tf.one_hot(cls_index, seq_len, axis=-1, dtype=tf.float32)
        cls_feature = tf.einsum("lbh,bl->bh", output, cls_index)

        # get the representation of START
        # (softmax-weighted sum of hidden states over start positions)
        start_p = tf.nn.softmax(start_logits_masked, axis=-1,
                                name="softmax_start")
        start_feature = tf.einsum("lbh,bl->bh", output, start_p)

        # note(zhiliny): no dependency on end_feature so that we can obtain
        # one single `cls_logits` for each sample
        ans_feature = tf.concat([start_feature, cls_feature], -1)
        ans_feature = tf.layers.dense(
            ans_feature,
            xlnet_config.d_model,
            activation=tf.tanh,
            kernel_initializer=initializer, name="dense_0")
        ans_feature = tf.layers.dropout(ans_feature, FLAGS.dropout,
                                        training=is_training)
        cls_logits = tf.layers.dense(
            ans_feature,
            1,
            kernel_initializer=initializer,
            name="dense_1",
            use_bias=False)
        cls_logits = tf.squeeze(cls_logits, -1)

        return_dict["cls_logits"] = cls_logits

    return return_dict
def get_predictions_and_loss(self, input_ids, seg_ids, input_mask, text_len,
                             speaker_ids, genre, is_training, gold_starts,
                             gold_ends, cluster_ids, sentence_map):
    """Build the coreference graph on top of XLNet and compute its loss.

    The steps are: encode the document with XLNet, enumerate candidate
    spans that stay within a single sentence, keep the top-scoring spans,
    prune antecedents coarse-to-fine, optionally refine span embeddings for
    `coref_depth` rounds, and compute the loss from the antecedent scores.

    Args:
        input_ids, seg_ids, input_mask: XLNet inputs; passed to `XLNetModel`
            as given and transposed afterwards for the remaining code.
        text_len: Not used in this method.
        speaker_ids: Speaker ids; only used when `config['use_metadata']`.
        genre: Index into the "genre_embeddings" variable.
        is_training: Passed to `self.get_dropout` only.
        gold_starts, gold_ends, cluster_ids: Gold mentions, passed to
            `self.get_candidate_labels`.
        sentence_map: Sentence index per flattened word.

    Returns:
        `([candidate_starts, candidate_ends, candidate_mention_scores,
        top_span_starts, top_span_ends, top_antecedents,
        top_antecedent_scores], loss)`.
    """
    # NOTE(review): the run config is built with is_training=True regardless
    # of the `is_training` argument - confirm this is intended.
    run_config = xlnet.create_run_config(is_training=True,
                                         is_finetune=True,
                                         FLAGS=self.FLAGS)

    # Construct an XLNet model
    model = xlnet.XLNetModel(xlnet_config=self.xlnet_config,
                             run_config=run_config,
                             input_ids=input_ids,
                             seg_ids=seg_ids,
                             input_mask=input_mask)
    mention_doc = model.get_sequence_output()
    mention_doc = tf.transpose(mention_doc, perm=[1, 0, 2])
    input_ids = tf.transpose(input_ids)
    input_mask = tf.transpose(input_mask)
    # NOTE(review): the transposed seg_ids is not read again below.
    seg_ids = tf.transpose(seg_ids)
    speaker_ids = tf.transpose(speaker_ids)
    # Flip the mask: entries that were < 1 become 1.0, all others 0.0.
    flipped_mask = (input_mask < 1)
    input_mask = tf.cast(flipped_mask, tf.float32)

    self.dropout = self.get_dropout(self.config["dropout_rate"], is_training)
    # NOTE(review): num_sentences, max_sentence_length and antecedent_doc
    # are assigned but never read in this method.
    num_sentences = tf.shape(mention_doc)[0]
    max_sentence_length = tf.shape(mention_doc)[1]
    mention_doc = self.flatten_emb_by_sentence(mention_doc, input_mask)
    num_words = util_xlnet.shape(mention_doc, 0)
    antecedent_doc = mention_doc

    flattened_sentence_indices = sentence_map
    # Enumerate every (start, start + width) pair up to max_span_width.
    candidate_starts = tf.tile(
        tf.expand_dims(tf.range(num_words), 1),
        [1, self.max_span_width])  # [num_words, max_span_width]
    candidate_ends = candidate_starts + tf.expand_dims(
        tf.range(self.max_span_width), 0)  # [num_words, max_span_width]
    candidate_start_sentence_indices = tf.gather(
        flattened_sentence_indices,
        candidate_starts)  # [num_words, max_span_width]
    # Ends are clamped to the last word so the gather stays in range.
    candidate_end_sentence_indices = tf.gather(
        flattened_sentence_indices,
        tf.minimum(candidate_ends, num_words - 1))  # [num_words, max_span_width]
    # Keep spans that end inside the document and stay within one sentence.
    candidate_mask = tf.logical_and(
        candidate_ends < num_words,
        tf.equal(
            candidate_start_sentence_indices,
            candidate_end_sentence_indices))  # [num_words, max_span_width]
    flattened_candidate_mask = tf.reshape(
        candidate_mask, [-1])  # [num_words * max_span_width]
    candidate_starts = tf.boolean_mask(
        tf.reshape(candidate_starts, [-1]),
        flattened_candidate_mask)  # [num_candidates]
    candidate_ends = tf.boolean_mask(
        tf.reshape(candidate_ends, [-1]),
        flattened_candidate_mask)  # [num_candidates]
    # NOTE(review): candidate_sentence_indices is not read again below.
    candidate_sentence_indices = tf.boolean_mask(
        tf.reshape(candidate_start_sentence_indices, [-1]),
        flattened_candidate_mask)  # [num_candidates]

    candidate_cluster_ids = self.get_candidate_labels(
        candidate_starts, candidate_ends, gold_starts, gold_ends,
        cluster_ids)  # [num_candidates]

    candidate_span_emb = self.get_span_emb(
        mention_doc, mention_doc, candidate_starts,
        candidate_ends)  # [num_candidates, emb]
    candidate_mention_scores = self.get_mention_scores(
        candidate_span_emb, candidate_starts, candidate_ends)
    candidate_mention_scores = tf.squeeze(candidate_mention_scores,
                                          1)  # [num_candidates]

    # beam size: a fraction of the word count, capped at 3900
    k = tf.minimum(
        3900,
        tf.to_int32(
            tf.floor(
                tf.to_float(num_words) * self.config["top_span_ratio"])))
    c = tf.minimum(self.config["max_top_antecedents"], k)
    # pull from beam
    top_span_indices = coref_ops.extract_spans(
        tf.expand_dims(candidate_mention_scores, 0),
        tf.expand_dims(candidate_starts, 0),
        tf.expand_dims(candidate_ends, 0),
        tf.expand_dims(k, 0),
        num_words,
        True)  # [1, k]
    top_span_indices.set_shape([1, None])
    top_span_indices = tf.squeeze(top_span_indices, 0)  # [k]

    top_span_starts = tf.gather(candidate_starts, top_span_indices)  # [k]
    top_span_ends = tf.gather(candidate_ends, top_span_indices)  # [k]
    top_span_emb = tf.gather(candidate_span_emb,
                             top_span_indices)  # [k, emb]
    top_span_cluster_ids = tf.gather(candidate_cluster_ids,
                                     top_span_indices)  # [k]
    top_span_mention_scores = tf.gather(candidate_mention_scores,
                                        top_span_indices)  # [k]
    genre_emb = tf.gather(
        tf.get_variable(
            "genre_embeddings",
            [len(self.genres), self.config["feature_size"]],
            initializer=tf.truncated_normal_initializer(stddev=0.02)),
        genre)  # [emb]
    if self.config['use_metadata']:
        speaker_ids = self.flatten_emb_by_sentence(speaker_ids, input_mask)
        top_span_speaker_ids = tf.gather(speaker_ids, top_span_starts)  # [k]
    else:
        top_span_speaker_ids = None

    # Score of the "no antecedent" option, fixed at zero.
    dummy_scores = tf.zeros([k, 1])  # [k, 1]
    top_antecedents, top_antecedents_mask, top_fast_antecedent_scores, top_antecedent_offsets = self.coarse_to_fine_pruning(
        top_span_emb, top_span_mention_scores, c)
    num_segs, seg_len = util_xlnet.shape(input_ids, 0), util_xlnet.shape(
        input_ids, 1)
    # Segment index of every word, flattened with the same mask as the
    # embeddings.
    word_segments = tf.tile(tf.expand_dims(tf.range(0, num_segs), 1),
                            [1, seg_len])
    flat_word_segments = tf.boolean_mask(tf.reshape(word_segments, [-1]),
                                         tf.reshape(input_mask, [-1]))
    mention_segments = tf.expand_dims(
        tf.gather(flat_word_segments, top_span_starts), 1)  # [k, 1]
    antecedent_segments = tf.gather(flat_word_segments,
                                    tf.gather(top_span_starts,
                                              top_antecedents))  # [k, c]
    segment_distance = tf.clip_by_value(
        mention_segments - antecedent_segments, 0,
        self.config['max_training_sentences'] - 1
    ) if self.config['use_segment_distance'] else None  # [k, c]
    if self.config['fine_grained']:
        # Each round rescores antecedents, then updates the span embeddings
        # through a sigmoid gate over the attended antecedent embeddings.
        for i in range(self.config["coref_depth"]):
            with tf.variable_scope("coref_layer", reuse=(i > 0)):
                top_antecedent_emb = tf.gather(
                    top_span_emb, top_antecedents)  # [k, c, emb]
                top_antecedent_scores = top_fast_antecedent_scores + self.get_slow_antecedent_scores(
                    top_span_emb, top_antecedents, top_antecedent_emb,
                    top_antecedent_offsets, top_span_speaker_ids,
                    genre_emb, segment_distance)  # [k, c]
                top_antecedent_weights = tf.nn.softmax(
                    tf.concat([dummy_scores, top_antecedent_scores],
                              1))  # [k, c + 1]
                top_antecedent_emb = tf.concat(
                    [tf.expand_dims(top_span_emb, 1), top_antecedent_emb],
                    1)  # [k, c + 1, emb]
                attended_span_emb = tf.reduce_sum(
                    tf.expand_dims(top_antecedent_weights, 2) *
                    top_antecedent_emb, 1)  # [k, emb]
                with tf.variable_scope("f"):
                    f = tf.sigmoid(
                        util_xlnet.projection(
                            tf.concat([top_span_emb, attended_span_emb], 1),
                            util_xlnet.shape(top_span_emb, -1)))  # [k, emb]
                    top_span_emb = f * attended_span_emb + (
                        1 - f) * top_span_emb  # [k, emb]
    else:
        top_antecedent_scores = top_fast_antecedent_scores

    top_antecedent_scores = tf.concat(
        [dummy_scores, top_antecedent_scores], 1)  # [k, c + 1]

    top_antecedent_cluster_ids = tf.gather(top_span_cluster_ids,
                                           top_antecedents)  # [k, c]
    # NOTE(review): presumably log(0) for masked-out antecedents produces a
    # large negative id so they can never match a real cluster - confirm.
    top_antecedent_cluster_ids += tf.to_int32(
        tf.log(tf.to_float(top_antecedents_mask)))  # [k, c]
    same_cluster_indicator = tf.equal(top_antecedent_cluster_ids,
                                      tf.expand_dims(
                                          top_span_cluster_ids,
                                          1))  # [k, c]
    non_dummy_indicator = tf.expand_dims(top_span_cluster_ids > 0,
                                         1)  # [k, 1]
    pairwise_labels = tf.logical_and(same_cluster_indicator,
                                     non_dummy_indicator)  # [k, c]
    # The dummy antecedent is the gold choice when no real antecedent is.
    dummy_labels = tf.logical_not(
        tf.reduce_any(pairwise_labels, 1, keepdims=True))  # [k, 1]
    top_antecedent_labels = tf.concat([dummy_labels, pairwise_labels],
                                      1)  # [k, c + 1]
    loss = self.softmax_loss(top_antecedent_scores,
                             top_antecedent_labels)  # [k]
    loss = tf.reduce_sum(loss)  # []

    return [
        candidate_starts, candidate_ends, candidate_mention_scores,
        top_span_starts, top_span_ends, top_antecedents,
        top_antecedent_scores
    ], loss
def get_uda_classification_loss(options, features, n_class, is_training,
                                global_step, input_ids, input_mask,
                                segment_ids, labels):
    """Compute supervised and UDA consistency losses for classification.

    While training, the batch is sliced in this order: `sup_batch_size`
    supervised rows, then `unsup_batch_size` original unlabeled rows, then
    `unsup_batch_size` augmented unlabeled rows.

    Args:
        options: Mapping of settings; 'tsa', 'unsup_ratio',
            'num_train_steps', 'uda_softmax_temp', 'uda_confidence_thresh',
            'model_config_file', 'summary_type', 'use_summ_proj',
            'cls_scope' and 'task_name' are read here, and the mapping is
            forwarded to `xlnet.create_run_config`.
        features: Not used by this function.
        n_class: Number of classes.
        is_training: Python bool; enables batch splitting and the
            unsupervised loss.
        global_step: Passed to `model_utils.get_tsa_threshold`.
        input_ids, input_mask, segment_ids: Model inputs; transposed with
            perm [1, 0] for XLNet. `input_ids` needs a static first
            dimension.
        labels: Labels for the supervised rows.

    Returns:
        Tuple `(sup_loss, unsup_loss, sup_logits, per_example_loss,
        loss_mask, tsa_threshold, unsup_loss_mask, correct_label_probs)`.
    """
    tsa = options['tsa']
    unsup_ratio = options['unsup_ratio']
    num_train_steps = options['num_train_steps']
    # NOTE(review): the next two locals are never read; the code below
    # indexes `options` directly instead.
    uda_softmax_temp = options['uda_softmax_temp']
    uda_confidence_thresh = options['uda_confidence_thresh']

    inp = tf.transpose(input_ids, [1, 0])
    seg_id = tf.transpose(segment_ids, [1, 0])
    inp_mask = tf.transpose(input_mask, [1, 0])

    # Static batch size, used for Python-side slicing.
    num_sample = input_ids.shape[0].value
    if is_training:
        # The batch holds 1 supervised part plus 2 * unsup_ratio unlabeled
        # parts (original and augmented).
        assert num_sample % (1 + 2 * unsup_ratio) == 0
        sup_batch_size = num_sample // (1 + 2 * unsup_ratio)
        unsup_batch_size = sup_batch_size * unsup_ratio
        bsz_per_core = tf.shape(input_ids)[0] // (1 + 2 * unsup_ratio)
    else:
        sup_batch_size = num_sample
        unsup_batch_size = 0
        bsz_per_core = tf.shape(input_ids)[0]

    labels = tf.reshape(labels, [bsz_per_core])

    xlnet_config = xlnet.XLNetConfig(json_path=options['model_config_file'])
    run_config = xlnet.create_run_config(is_training, True, options)
    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)
    summary = xlnet_model.get_pooled_out(options['summary_type'],
                                         options['use_summ_proj'])

    # A non-empty 'cls_scope' overrides the lower-cased task name.
    if options['cls_scope'] is not None and options['cls_scope']:
        cls_scope = "classification_{}".format(options['cls_scope'])
    else:
        cls_scope = "classification_{}".format(options['task_name'].lower())

    clas_logits = modeling.uda_logits(
        hidden=summary,
        labels=labels,
        n_class=n_class,
        initializer=xlnet_model.get_initializer(),
        scope=cls_scope)
    log_probs = tf.nn.log_softmax(clas_logits, axis=-1)
    correct_label_probs = None

    with tf.variable_scope("sup_loss"):
        sup_log_probs = log_probs[:sup_batch_size]
        one_hot_labels = tf.one_hot(labels, depth=n_class, dtype=tf.float32)
        tgt_label_prob = one_hot_labels

        # Cross-entropy against the one-hot targets.
        per_example_loss = -tf.reduce_sum(tgt_label_prob * sup_log_probs,
                                          axis=-1)
        loss_mask = tf.ones_like(per_example_loss,
                                 dtype=per_example_loss.dtype)
        # Probability the model assigns to the gold label.
        correct_label_probs = tf.reduce_sum(
            one_hot_labels * tf.exp(sup_log_probs), axis=-1)

        if tsa:
            tf.logging.info("Applying TSA")
            # Starting threshold is just the inverse number of labels.
            tsa_start = 1. / n_class
            tsa_threshold = model_utils.get_tsa_threshold(tsa,
                                                          global_step,
                                                          num_train_steps,
                                                          tsa_start,
                                                          end=1)

            # Examples already predicted above the threshold are masked out
            # of the supervised loss.
            larger_than_threshold = tf.greater(correct_label_probs,
                                               tsa_threshold)
            loss_mask = loss_mask * (
                1 - tf.cast(larger_than_threshold, tf.float32))
        else:
            tsa_threshold = 1

        loss_mask = tf.stop_gradient(loss_mask)
        per_example_loss = per_example_loss * loss_mask
        # Average over unmasked examples; the maximum avoids division by 0.
        sup_loss = (tf.reduce_sum(per_example_loss) /
                    tf.maximum(tf.reduce_sum(loss_mask), 1))

    unsup_loss_mask = None
    if is_training and unsup_ratio > 0:
        with tf.variable_scope("unsup_loss"):
            # Row ranges of the original and augmented unlabeled examples.
            ori_start = sup_batch_size
            ori_end = ori_start + unsup_batch_size
            aug_start = sup_batch_size + unsup_batch_size
            aug_end = aug_start + unsup_batch_size

            ori_log_probs = log_probs[ori_start:ori_end]
            aug_log_probs = log_probs[aug_start:aug_end]
            unsup_loss_mask = 1
            # -1 disables temperature scaling of the target distribution.
            if options['uda_softmax_temp'] != -1:
                tgt_ori_log_probs = tf.nn.log_softmax(
                    clas_logits[ori_start:ori_end] /
                    options['uda_softmax_temp'],
                    axis=-1)
                tgt_ori_log_probs = tf.stop_gradient(tgt_ori_log_probs)
            else:
                tgt_ori_log_probs = tf.stop_gradient(ori_log_probs)

            # -1 disables confidence masking; otherwise only rows whose top
            # probability exceeds the threshold contribute.
            if options['uda_confidence_thresh'] != -1:
                largest_prob = tf.reduce_max(tf.exp(ori_log_probs), axis=-1)
                unsup_loss_mask = tf.cast(
                    tf.greater(largest_prob,
                               options['uda_confidence_thresh']),
                    tf.float32)
                unsup_loss_mask = tf.stop_gradient(unsup_loss_mask)

            per_example_kl_loss = model_utils.kl_for_log_probs(
                tgt_ori_log_probs, aug_log_probs) * unsup_loss_mask
            unsup_loss = tf.reduce_mean(per_example_kl_loss)
    else:
        unsup_loss = 0.

    return (sup_loss, unsup_loss, clas_logits[:sup_batch_size],
            per_example_loss, loss_mask, tsa_threshold, unsup_loss_mask,
            correct_label_probs)