Example #1
    def _create_model(self, input_ids, input_masks, segment_ids, label_ids,
                      label_list, mode):
        """Creates XLNet-NER model"""
        model = xlnet.XLNetModel(xlnet_config=self.model_config,
                                 run_config=xlnet.create_run_config(
                                     mode == tf.estimator.ModeKeys.TRAIN, True,
                                     FLAGS),
                                 input_ids=tf.transpose(input_ids, perm=[1,
                                                                         0]),
                                 input_mask=tf.transpose(input_masks,
                                                         perm=[1, 0]),
                                 seg_ids=tf.transpose(segment_ids, perm=[1,
                                                                         0]))

        initializer = model.get_initializer()

        with tf.variable_scope("ner", reuse=tf.AUTO_REUSE):
            result = tf.transpose(model.get_sequence_output(), perm=[1, 0, 2])
            result_mask = tf.cast(tf.expand_dims(1 - input_masks, axis=-1),
                                  dtype=tf.float32)

            dense_layer = tf.keras.layers.Dense(
                units=len(label_list),
                activation=None,
                use_bias=True,
                kernel_initializer=initializer,
                bias_initializer=tf.zeros_initializer,
                kernel_regularizer=None,
                bias_regularizer=None,
                trainable=True)

            dropout_layer = tf.keras.layers.Dropout(
                rate=0.1, seed=np.random.randint(10000))

            result = dense_layer(result)
            if mode == tf.estimator.ModeKeys.TRAIN:
                result = dropout_layer(result)

            masked_predict = result * result_mask + MIN_FLOAT * (1 -
                                                                 result_mask)
            predict_ids = tf.cast(tf.argmax(tf.nn.softmax(masked_predict,
                                                          axis=-1),
                                            axis=-1),
                                  dtype=tf.int32)

        loss = tf.constant(0.0, dtype=tf.float32)
        if mode in [tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL
                    ] and label_ids is not None:
            with tf.variable_scope("loss", reuse=tf.AUTO_REUSE):
                label = tf.cast(label_ids, dtype=tf.float32)
                label_mask = tf.cast(1 - input_masks, dtype=tf.float32)
                masked_label = tf.cast(label * label_mask, dtype=tf.int32)
                cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=masked_label, logits=masked_predict)
                loss = tf.reduce_sum(
                    cross_entropy * label_mask) / tf.reduce_sum(
                        tf.reduce_max(label_mask, axis=-1))

        return loss, predict_ids
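The masked-prediction trick above relies on a MIN_FLOAT constant defined elsewhere in the same repository. A minimal numeric sketch of the idea, with an assumed MIN_FLOAT value and toy shapes that are not part of the original code:

import numpy as np

MIN_FLOAT = -1e30  # assumed value; the real constant is defined elsewhere in the original repo
logits = np.array([[2.0, 0.5],    # logits at a real token position
                   [1.0, 3.0]])   # logits at a padded position
mask = np.array([[1.0], [0.0]])   # 1 = real token, 0 = padding (i.e. 1 - input_masks)
masked = logits * mask + MIN_FLOAT * (1 - mask)
# real positions keep their logits; padded positions collapse to a flat
# MIN_FLOAT vector, so their argmax falls back to class 0 and their loss is
# zeroed out later by label_mask in the loss branch above
print(masked.argmax(-1))  # -> [0 0]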
Example #2
    def __init__(self, FLAGS=FLAGS, n_class=2, is_training=False):
        import xlnet, modeling
        import tensorflow as tf
        init_log()
        logging.info("Init semantic model ...")
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(FLAGS.spiece_model_file)
        tf.logging.set_verbosity(tf.logging.INFO)
        tf_float = tf.bfloat16 if FLAGS.use_bfloat16 else tf.float32
        self.input_ids = tf.placeholder(dtype=tf.int64,
                                        shape=[None, None],
                                        name="input_ids")
        self.segment_ids = tf.placeholder(dtype=tf.int32,
                                          shape=[None, None],
                                          name="segment_ids")
        self.input_mask = tf.placeholder(dtype=tf_float,
                                         shape=[None, None],
                                         name="input_mask")
        self.label_ids = tf.placeholder(dtype=tf.int64,
                                        shape=[None],
                                        name="label_ids")
        bsz_per_core = tf.shape(self.input_ids)[0]
        inp = tf.transpose(self.input_ids, [1, 0])
        seg_id = tf.transpose(self.segment_ids, [1, 0])
        inp_mask = tf.transpose(self.input_mask, [1, 0])
        label = tf.reshape(self.label_ids, [bsz_per_core])
        self.sess = tf.Session()
        xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
        run_config = xlnet.create_run_config(is_training, True, FLAGS)

        xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                       run_config=run_config,
                                       input_ids=inp,
                                       seg_ids=seg_id,
                                       input_mask=inp_mask)
        self.summary = xlnet_model.get_pooled_out(FLAGS.summary_type,
                                                  FLAGS.use_summ_proj)

        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            if FLAGS.cls_scope is not None and FLAGS.cls_scope:
                cls_scope = "classification_{}".format(FLAGS.cls_scope)
            else:
                cls_scope = "classification_{}".format(FLAGS.task_name.lower())
            per_example_loss, logits = modeling.classification_loss(
                hidden=self.summary,
                labels=label,
                n_class=n_class,
                initializer=xlnet_model.get_initializer(),
                scope=cls_scope,
                return_logits=True)
            total_loss = tf.reduce_mean(per_example_loss)
        num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
        tf.logging.info('#params: {}'.format(num_params))
        xlnet_model.saver.restore(self.sess, FLAGS.init_checkpoint)
        #### load pretrained models
        #scaffold_fn = model_utils.init_from_checkpoint(FLAGS)
        logging.info("Init semantic model finished ...")
Example #3
def get_classification_outputs(FLAGS, features, is_training):
    """Loss for downstream classification tasks."""
    input_ids = features["input_ids"]
    seg_id = features["segment_ids"]
    input_mask_int = tf.cast(tf.cast(input_ids, tf.bool), tf.int32)
    input_mask = 1 - tf.cast(input_mask_int, tf.float32)
    num_choices = FLAGS.num_choices
    batch_size = tf.shape(features["input_ids"])[0]

    def _transform_features(feature):
        out = tf.reshape(feature, [batch_size, num_choices, -1])
        out = tf.transpose(out, [2, 0, 1])
        out = tf.reshape(out, [-1, batch_size * num_choices])
        return out

    if num_choices:
        input_ids = _transform_features(input_ids)
        seg_id = _transform_features(seg_id)
        input_mask = _transform_features(input_mask)
    else:
        input_ids = tf.transpose(input_ids, [1, 0])
        seg_id = tf.transpose(seg_id, [1, 0])
        input_mask = tf.transpose(input_mask, [1, 0])

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=input_ids,
        seg_ids=seg_id,
        input_mask=input_mask)
    summary = xlnet_model.get_pooled_out(FLAGS.summary_type,
                                         FLAGS.use_summ_proj)
    initializer = xlnet_model.get_initializer()
    return_dict = {}
    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        with tf.variable_scope("answer_class"):
            # race has 4 classes,
            # boolq has 2 classes
            if num_choices:
                num_classes = 1
            else:
                num_classes = FLAGS.num_classes
            cls_logits = tf.layers.dense(summary, num_classes,
                                         kernel_initializer=initializer,
                                         name="cls")
            if num_choices:
                cls_logits = tf.reshape(cls_logits, [batch_size, num_choices])
            cls_log_probs = tf.nn.log_softmax(cls_logits, -1)
    if is_training:
        return_dict["cls_log_probs"] = cls_log_probs
    return_dict["cls_logits"] = cls_logits

    return return_dict
Example #4
    def xlnet_layer(self):
        xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.xlnet_config)
        run_config = xlnet.create_run_config(self.is_training, True, FLAGS)

        xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                       run_config=run_config,
                                       input_ids=self.input_ids,
                                       seg_ids=self.segment_ids,
                                       input_mask=self.input_mask)

        self.embedded = xlnet_model.get_sequence_output()
        self.model_inputs = tf.nn.dropout(self.embedded, self.dropout)
Example #5
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        unique_ids = features["unique_ids"]
        inp = tf.transpose(features["input_ids"], [1, 0])
        seg_id = tf.transpose(features["segment_ids"], [1, 0])
        inp_mask = tf.transpose(features["input_mask"], [1, 0])

        xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)

        # no need for dropout in prediction mode
        xlnet_config.dropout = 0.0
        xlnet_config.dropatt = 0.0

        run_config = xlnet.create_run_config(False, True, FLAGS)

        # no need for dropout in prediction mode
        run_config.dropout = 0.0
        run_config.dropatt = 0.0

        xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                       run_config=run_config,
                                       input_ids=inp,
                                       seg_ids=seg_id,
                                       input_mask=inp_mask)

        # Check model parameters
        num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
        tf.logging.info('#params: {}'.format(num_params))

        # load pretrained models
        scaffold_fn = init_from_checkpoint(FLAGS)

        # Get a sequence output
        seq_out = xlnet_model.get_sequence_output()

        tokens = tf.transpose(seq_out, [1, 0, 2])

        predictions = {
            "unique_id": unique_ids,
            'tokens': tokens,
            'input_mask': tf.transpose(inp_mask, [1, 0])
        }

        if FLAGS.use_tpu:
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
        else:
            output_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                     predictions=predictions)
        return output_spec
Example #6
def get_multi_task_weight_loss(
    FLAGS, features, is_training):
  bsz_per_core = tf.shape(features["input_ids"])[0]

  inp = tf.transpose(features["input_ids"], [1, 0])
  seg_id = tf.transpose(features["segment_ids"], [1, 0])
  inp_mask = tf.transpose(features["input_mask"], [1, 0])
  label = tf.reshape(features["label_ids"], [bsz_per_core, 7])
  target = label[:, 0]
  aux = label[:, 1:]
  weight = tf.reshape(features["weight"], [bsz_per_core])

  xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
  run_config = xlnet.create_run_config(is_training, True, FLAGS)

  xlnet_model = xlnet.XLNetModel(
      xlnet_config=xlnet_config,
      run_config=run_config,
      input_ids=inp,
      seg_ids=seg_id,
      input_mask=inp_mask)

  summary = xlnet_model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj)

  with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
    with tf.variable_scope("regression_label"):
      target_logits = tf.layers.dense(
        summary,
        1,
        kernel_initializer=xlnet_model.get_initializer(),
        name='target')

      target_logits = tf.squeeze(target_logits)
      target_loss = tf.losses.sigmoid_cross_entropy(target, target_logits, weights=weight)

    with tf.variable_scope("regression_aux"):
      aux_logits = tf.layers.dense(
        summary,
        6,
        kernel_initializer=xlnet_model.get_initializer(),
        name='aux'
      )

      aux_loss = tf.losses.sigmoid_cross_entropy(aux, aux_logits)

    per_example_loss = target_loss + aux_loss
    total_loss = tf.reduce_mean(per_example_loss)

    return total_loss, per_example_loss, target_logits
Example #7
def get_decomposed_classification_outputs(FLAGS, features, is_training):
    seq1_ids = features["seq1_ids"]
    seq2_ids = features["seq2_ids"]
    seq_len = FLAGS.max_seq_length
    first_seq_len = FLAGS.max_first_length + 2
    second_seq_len = seq_len - first_seq_len
    seq1_attn_mask = get_attention_mask(seq1_ids, first_seq_len)
    seq2_attn_mask = get_attention_mask(seq2_ids, second_seq_len)
    seq_attn_mask = get_attention_mask(tf.concat([seq2_ids, seq1_ids], axis=0),
                                       seq_len)

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)
    initializer = xlnet._get_initializer(run_config)
    tfm_args = dict(
        n_token=xlnet_config.n_token,
        initializer=initializer,
        attn_type="bi",
        n_layer=xlnet_config.n_layer,
        d_model=xlnet_config.d_model,
        n_head=xlnet_config.n_head,
        d_head=xlnet_config.d_head,
        d_inner=xlnet_config.d_inner,
        ff_activation=xlnet_config.ff_activation,
        untie_r=xlnet_config.untie_r,
        is_training=run_config.is_training,
        use_bfloat16=run_config.use_bfloat16,
        use_tpu=run_config.use_tpu,
        dropout=run_config.dropout,
        dropatt=run_config.dropatt,

        # mem_len=run_config.mem_len,
        # reuse_len=run_config.reuse_len,
        # bi_data=run_config.bi_data,
        clamp_len=run_config.clamp_len,
        # same_length=run_config.same_length,
        ctx_ids=seq2_ids,
        q_ids=seq1_ids,
        q_seq_len=first_seq_len,
        ctx_seq_len=second_seq_len,
        sep_layer=FLAGS.sep_layer,
        q_attn_mask=seq1_attn_mask,
        c_attn_mask=seq2_attn_mask,
        qc_attn_mask=seq_attn_mask,
    )

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        upper_outputs = transformer_xl_decomposed(**tfm_args)
Example #8
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    tpu_config = model_utils.configure_tpu(FLAGS)
    model_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(False, True, FLAGS)

    model_builder = XLNetModelBuilder(
        default_model_config=model_config,
        default_run_config=run_config,
        default_init_checkpoint=FLAGS.init_checkpoint,
        use_tpu=FLAGS.use_tpu)

    model_fn = model_builder.get_model_fn(model_config, run_config,
                                          FLAGS.init_checkpoint,
                                          FLAGS.model_type)

    # If TPU is not available, this will fall back to normal Estimator on CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(use_tpu=FLAGS.use_tpu,
                                            model_fn=model_fn,
                                            config=tpu_config,
                                            export_to_tpu=FLAGS.use_tpu,
                                            train_batch_size=1)

    tokenizer = XLNetTokenizer(sp_model_file=FLAGS.spiece_model_file,
                               lower_case=FLAGS.lower_case)

    example_converter = XLNetExampleConverter(
        label_list=[],
        max_seq_length=FLAGS.max_seq_length,
        tokenizer=tokenizer)

    features = example_converter.convert_examples_to_features(
        [PaddingInputExample()])

    input_fn = XLNetInputBuilder.get_input_builder(features,
                                                   FLAGS.max_seq_length, True,
                                                   False)
    estimator.train(input_fn, max_steps=1)

    tf.gfile.MakeDirs(FLAGS.export_dir)
    serving_input_fn = XLNetInputBuilder.get_serving_input_fn(
        FLAGS.max_seq_length)
    estimator.export_savedmodel(FLAGS.export_dir,
                                serving_input_fn,
                                as_text=False)
Example #9
def get_classification_loss(FLAGS, features, n_class, is_training):
    """Loss for downstream classification tasks."""

    bsz_per_core = tf.shape(features['input_ids'])[0]

    inp = tf.transpose(features['input_ids'], [1, 0])
    seg_id = tf.transpose(features['segment_ids'], [1, 0])
    inp_mask = tf.transpose(features['input_mask'], [1, 0])
    label = tf.reshape(features['label_ids'], [bsz_per_core])

    xlnet_config = xlnet.XLNetConfig(json_path = FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(
        xlnet_config = xlnet_config,
        run_config = run_config,
        input_ids = inp,
        seg_ids = seg_id,
        input_mask = inp_mask,
    )

    summary = xlnet_model.get_pooled_out(
        FLAGS.summary_type, FLAGS.use_summ_proj
    )

    with tf.variable_scope('model', reuse = tf.AUTO_REUSE):

        if FLAGS.cls_scope is not None and FLAGS.cls_scope:
            cls_scope = 'classification_{}'.format(FLAGS.cls_scope)
        else:
            cls_scope = 'classification_{}'.format(FLAGS.task_name.lower())

        per_example_loss, logits = modeling.classification_loss(
            hidden = summary,
            labels = label,
            n_class = n_class,
            initializer = xlnet_model.get_initializer(),
            scope = cls_scope,
            return_logits = True,
        )

        total_loss = tf.reduce_mean(per_example_loss)

        return total_loss, per_example_loss, logits
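For context, a hedged sketch of how a loss builder like get_classification_loss is typically wired into an Estimator model_fn, mirroring the pattern in Example #23; model_utils.get_train_op comes from the XLNet repo, while the rest of the wiring here is illustrative and not part of the original example.

def classification_model_fn(features, labels, mode, params):
    # illustrative wiring only; checkpoint restoration (see Example #23) is omitted
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    total_loss, per_example_loss, logits = get_classification_loss(
        FLAGS, features, n_class=2, is_training=is_training)

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode, predictions={'logits': logits})

    train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)
    return tf.estimator.EstimatorSpec(mode=mode, loss=total_loss,
                                      train_op=train_op)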
Example #10
def get_ner_loss(FLAGS, features, is_training, num_labels):
    inp = tf.transpose(features["input_ids"], [1, 0])
    seg_id = tf.transpose(features["segment_ids"], [1, 0])
    inp_mask = tf.transpose(features["input_mask"], [1, 0])
    label_id = tf.transpose(features["label_ids"], [1, 0])

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)
    output_layer = xlnet_model.get_sequence_output()

    def hidden2tag(hiddenlayer, numclass):
        linear = tf.keras.layers.Dense(numclass, activation=None)
        return linear(hiddenlayer)

    def softmax_layer(logits, labels, num_labels, mask):
        logits = tf.reshape(logits, [-1, num_labels])
        labels = tf.reshape(labels, [-1])
        mask = tf.cast(mask, dtype=tf.float32)
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        # per-position cross entropy, so the mask weighting below has an effect
        # (tf.losses.softmax_cross_entropy would already reduce to a scalar here)
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=one_hot_labels, logits=logits)
        loss *= tf.reshape(mask, [-1])
        loss = tf.reduce_sum(loss)
        total_size = tf.reduce_sum(mask)
        total_size += 1e-12  # to avoid division by 0 for all-0 weights
        loss /= total_size
        # predictions are not masked here; padded positions can be filtered out at prediction time
        probabilities = tf.math.softmax(logits, axis=-1)
        predict = tf.math.argmax(probabilities, axis=-1)
        return loss, predict

    if is_training:
        output_layer = tf.keras.layers.Dropout(rate=0.1)(output_layer)
    logits = hidden2tag(output_layer, num_labels)
    logits = tf.reshape(logits, [-1, 128, num_labels])
    loss, predict = softmax_layer(logits, label_id, num_labels, inp_mask)

    return loss, logits, predict
Example #11
    def __init__(self, ckpt_num=156000, is_training=False):
        #init_log()
        self.logs = {}
        batch_size = 1
        logging.info("Init query weight model ...")
        self.sp = Tokenizer()
        self.lm = language_model()
        self.xgb_model = xgb.Booster(model_file=conf.rank_model)
        #self.xgb_dict = parse_xgb_dict(conf.rank_model + '.txt')
        tf.logging.set_verbosity(tf.logging.INFO)
        tf_float = tf.bfloat16 if FLAGS.use_bfloat16 else tf.float32
        self.input_ids = tf.placeholder(dtype=tf.int64,
                                        shape=[batch_size, FLAGS.seq_len],
                                        name="input_ids")
        self.segment_ids = tf.placeholder(dtype=tf.int32,
                                          shape=[batch_size, FLAGS.seq_len],
                                          name="segment_ids")
        self.input_mask = tf.placeholder(dtype=tf_float,
                                         shape=[batch_size, FLAGS.seq_len],
                                         name="input_mask")
        self.label_ids = tf.placeholder(dtype=tf.int64,
                                        shape=[batch_size],
                                        name="label_ids")
        inp = tf.transpose(self.input_ids, [1, 0])
        seg_id = tf.transpose(self.segment_ids, [1, 0])
        inp_mask = tf.transpose(self.input_mask, [1, 0])
        self.sess = tf.Session()
        xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
        run_config = xlnet.create_run_config(is_training, True, FLAGS)

        xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                       run_config=run_config,
                                       input_ids=inp,
                                       seg_ids=seg_id,
                                       input_mask=inp_mask)
        self.output, self.attn_prob, self.attention_out = xlnet_model.output_encode, xlnet_model.attn_prob, xlnet_model.attention_out

        num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
        tf.logging.info('#params: {}'.format(num_params))
        xlnet_model.saver.restore(
            self.sess, FLAGS.init_checkpoint + "/model.ckpt-" + str(ckpt_num))
        #### load pretrained models
        # scaffold_fn = model_utils.init_from_checkpoint(FLAGS)
        logging.info("Init query weight model finished ...")
Example #12
def get_race_loss(FLAGS, features, is_training):
    """Loss for downstream multi-choice QA tasks such as RACE."""

    bsz_per_core = tf.shape(features['input_ids'])[0]

    def _transform_features(feature):
        out = tf.reshape(feature, [bsz_per_core, 4, -1])
        out = tf.transpose(out, [2, 0, 1])
        out = tf.reshape(out, [-1, bsz_per_core * 4])
        return out

    inp = _transform_features(features['input_ids'])
    seg_id = _transform_features(features['segment_ids'])
    inp_mask = _transform_features(features['input_mask'])
    label = tf.reshape(features['label_ids'], [bsz_per_core])

    xlnet_config = xlnet.XLNetConfig(json_path = FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(
        xlnet_config = xlnet_config,
        run_config = run_config,
        input_ids = inp,
        seg_ids = seg_id,
        input_mask = inp_mask,
    )
    summary = xlnet_model.get_pooled_out(
        FLAGS.summary_type, FLAGS.use_summ_proj
    )

    with tf.variable_scope('logits'):
        logits = tf.layers.dense(
            summary, 1, kernel_initializer = xlnet_model.get_initializer()
        )
        logits = tf.reshape(logits, [bsz_per_core, 4])

        one_hot_target = tf.one_hot(label, 4)
        per_example_loss = -tf.reduce_sum(
            tf.nn.log_softmax(logits) * one_hot_target, -1
        )
        total_loss = tf.reduce_mean(per_example_loss)

    return total_loss, per_example_loss, logits
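A small standalone numpy check (not part of the original) of what _transform_features does: it folds the four answer choices into the batch dimension and makes the tensor sequence-major, the [seq_len, batch] layout that XLNetModel expects.

import numpy as np

bsz, num_choices, seq_len = 2, 4, 5  # toy sizes, assumptions for the check
x = np.arange(bsz * num_choices * seq_len).reshape(bsz, num_choices * seq_len)
out = (x.reshape(bsz, num_choices, -1)
        .transpose(2, 0, 1)
        .reshape(-1, bsz * num_choices))
print(out.shape)   # (5, 8) == (seq_len, bsz * num_choices)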
Example #13
def attention_net(input_x, input_val, is_training=True, scope='AttenNet', config=AttentionConfig()):
    debug_info = {}
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        bsz = tf.shape(input_x)[0]
        qlen = tf.shape(input_x)[1]
        segment_ids = tf.zeros([bsz, qlen], dtype=tf.int32)
        used = tf.ones([bsz, qlen], dtype=tf.int32)    #tf.sign(tf.abs(input_x))
        length = tf.reduce_sum(used, reduction_indices=1)
        lengths = tf.cast(length, tf.int32)
        # attention mask
        def cond(i, _length, _output):
            return tf.less(i, tf.shape(_length)[0])
        def body(i, _length, _output):
            return [i + 1, _length, _output.write(i, tf.concat([tf.zeros([_length[i]]), tf.ones(qlen - _length[i])], axis=-1))]
        Out = tf.TensorArray(size=0, dtype=tf.float32, dynamic_size=True, clear_after_read=False)
        res = tf.while_loop(cond, body, [0, lengths, Out])
        input_mask = tf.convert_to_tensor(res[-1].stack())
        #_input_mask_ = tf.equal(0, input_x)
        #input_mask_ = tf.cast(_input_mask_, tf.float32)
        inp = tf.transpose(input_x, [1, 0])
        seg_id = tf.transpose(segment_ids, [1, 0])
        inp_mask = tf.transpose(input_mask, [1, 0])
        # XLNetConfig contains hyperparameters that are specific to a model checkpoint.
        xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
        # RunConfig contains hyperparameters that could be different between pretraining and finetuning.
        run_config = xlnet.create_run_config(is_training=is_training, is_finetune=True, FLAGS=FLAGS)
        # Construct an XLNet model
        xlnet_model = xlnet.XLNetModel(
            xlnet_config=xlnet_config,
            run_config=run_config,
            input_ids=inp, input_vals=input_val,
            seg_ids=seg_id,
            input_mask=inp_mask)
        # Get a summary of the sequence using the last hidden state
        summary = xlnet_model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj)
        # Get a sequence output
        seq_out_ = xlnet_model.get_sequence_output()
        seq_out = tf.transpose(seq_out_, [1, 0, 2])
        debug_info['input_x'] = input_x
        debug_info['segment_ids'] = segment_ids
        debug_info['input_mask'] = input_mask
        debug_info['summary'] = summary
        # debug_info['input_mask_'] = input_mask_
        debug_info['lengths'] = lengths
    return summary, debug_info
Example #14
def create_model(FLAGS,
                 input_ids,
                 input_mask,
                 segment_ids,
                 labels,
                 is_training=True):
    bsz_per_core = tf.shape(input_ids)[0]
    inp = tf.transpose(input_ids, [1, 0])
    seg_id = tf.transpose(segment_ids, [1, 0])
    inp_mask = tf.transpose(input_mask, [1, 0])
    label = tf.reshape(labels, [bsz_per_core])

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)
    summary = xlnet_model.get_pooled_out(FLAGS.summary_type,
                                         FLAGS.use_summ_proj)

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):

        if FLAGS.cls_scope is not None and FLAGS.cls_scope:
            cls_scope = "classification_{}".format(FLAGS.cls_scope)
        else:
            cls_scope = "classification_{}".format(FLAGS.task_name.lower())

        per_example_loss, logits = modeling.classification_loss(
            hidden=summary,
            labels=label,
            n_class=FLAGS.num_labels,
            initializer=xlnet_model.get_initializer(),
            scope=cls_scope,
            return_logits=True)

        total_loss = tf.reduce_mean(per_example_loss)

        return total_loss, per_example_loss, logits
Example #15
def get_jigsaw_loss(FLAGS, features, n_output, is_training):
    """Loss for jigsaw task."""

    inp = tf.transpose(features["input_ids"], [1, 0])
    seg_id = tf.transpose(features["segment_ids"], [1, 0])
    inp_mask = tf.transpose(features["input_mask"], [1, 0])
    label = features["label_ids"]

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)

    summary = xlnet_model.get_pooled_out(FLAGS.summary_type,
                                         FLAGS.use_summ_proj)

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):

        if FLAGS.cls_scope is not None and FLAGS.cls_scope:
            cls_scope = "classification_{}".format(FLAGS.cls_scope)
        else:
            cls_scope = "classification_{}".format(FLAGS.task_name.lower())

        per_example_loss, logits = jigsaw_loss(
            hidden=summary,
            labels=label,
            n_output=n_output,
            initializer=xlnet_model.get_initializer(),
            scope=cls_scope,
            return_logits=True)

        total_loss = tf.reduce_mean(per_example_loss)

        return total_loss, per_example_loss, logits
Example #16
def create_model(FLAGS, features, is_training, num_labels):
    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    inp = features["input_ids"]
    seg_id = features["segment_ids"]
    inp_mask = features["input_mask"]
    label_list = features["label_list"]

    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=inp,
        seg_ids=seg_id,
        input_mask=inp_mask)

    output = xlnet_model.get_sequence_output()

    with tf.variable_scope("loss"):
        if is_training:
            # apply dropout to the sequence output; the original assigned the
            # result to an unused variable, so dropout never took effect
            output = tf.nn.dropout(output, keep_prob=0.9)
        logits = tf.layers.dense(output, num_labels, activation=None)
        logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels])

        input_mask = tf.cast(inp_mask, dtype=tf.float32)

        log_prob = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(label_list, depth=num_labels, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_prob, axis=-1)
        input_mask *= -1
        input_mask += 1
        per_example_loss *= input_mask

        loss = tf.reduce_mean(per_example_loss)

    return loss, per_example_loss, logits
Example #17
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processor = NerProcessor(data_dir=FLAGS.data_dir,
                             task_name=FLAGS.task_name.lower())

    label_list = processor.get_labels()

    tpu_config = model_utils.configure_tpu(FLAGS)
    model_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(False, True, FLAGS)

    model_builder = XLNetModelBuilder(
        default_model_config=model_config,
        default_run_config=run_config,
        default_init_checkpoint=FLAGS.init_checkpoint,
        use_tpu=FLAGS.use_tpu)

    model_fn = model_builder.get_model_fn(model_config, run_config,
                                          FLAGS.init_checkpoint, label_list)

    # If TPU is not available, this will fall back to normal Estimator on CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=tpu_config,
        export_to_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    tokenizer = XLNetTokenizer(sp_model_file=FLAGS.spiece_model_file,
                               lower_case=FLAGS.lower_case)

    example_converter = XLNetExampleConverter(
        label_list=label_list,
        max_seq_length=FLAGS.max_seq_length,
        tokenizer=tokenizer)

    if FLAGS.do_train:
        train_examples = processor.get_train_examples()

        tf.logging.info("***** Run training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", FLAGS.train_steps)

        train_features = example_converter.convert_examples_to_features(
            train_examples)
        train_input_fn = XLNetInputBuilder.get_input_builder(
            train_features, FLAGS.max_seq_length, True, True)

        estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples()

        tf.logging.info("***** Run evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_features = example_converter.convert_examples_to_features(
            eval_examples)
        eval_input_fn = XLNetInputBuilder.get_input_builder(
            eval_features, FLAGS.max_seq_length, False, False)

        result = estimator.evaluate(input_fn=eval_input_fn)

        precision = result["precision"]
        recall = result["recall"]
        f1_score = 2.0 * precision * recall / (precision + recall)

        tf.logging.info("***** Evaluation result *****")
        tf.logging.info("  Precision (token-level) = %s", str(precision))
        tf.logging.info("  Recall (token-level) = %s", str(recall))
        tf.logging.info("  F1 score (token-level) = %s", str(f1_score))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples()

        tf.logging.info("***** Run prediction *****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_features = example_converter.convert_examples_to_features(
            predict_examples)
        predict_input_fn = XLNetInputBuilder.get_input_builder(
            predict_features, FLAGS.max_seq_length, False, False)

        result = estimator.predict(input_fn=predict_input_fn)

        predict_recorder = XLNetPredictRecorder(
            output_dir=FLAGS.output_dir,
            label_list=label_list,
            max_seq_length=FLAGS.max_seq_length,
            tokenizer=tokenizer)

        predicts = [{
            "input_ids": feature.input_ids,
            "input_masks": feature.input_masks,
            "label_ids": feature.label_ids,
            "predict_ids": predict["predict"].tolist()
        } for feature, predict in zip(predict_features, result)]

        predict_recorder.record(predicts)

    if FLAGS.do_export:
        tf.logging.info("***** Running exporting *****")
        tf.gfile.MakeDirs(FLAGS.export_dir)
        serving_input_fn = XLNetInputBuilder.get_serving_input_fn(
            FLAGS.max_seq_length)
        estimator.export_savedmodel(FLAGS.export_dir,
                                    serving_input_fn,
                                    as_text=False)
Example #18
def get_qa_outputs(FLAGS, features, is_training):
    """Loss for downstream span-extraction QA tasks such as SQuAD."""

    input_ids = features["input_ids"]
    seg_id = features["segment_ids"]
    input_mask_int = tf.cast(tf.cast(input_ids, tf.bool), tf.int32)
    cls_index = tf.reshape(tf.reduce_sum(input_mask_int, axis=1), [-1])
    p_mask = tf.cast(tf.cast(seg_id, tf.bool), tf.float32)
    input_ids = tf.transpose(input_ids, [1, 0])
    input_mask = 1 - tf.cast(input_mask_int, tf.float32)
    input_mask = tf.transpose(input_mask, [1, 0])
    seg_id = tf.transpose(seg_id, [1, 0])
    seq_len = tf.shape(input_ids)[0]

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=input_ids,
        seg_ids=seg_id,
        input_mask=input_mask)
    output = xlnet_model.get_sequence_output()
    initializer = xlnet_model.get_initializer()

    return_dict = {}
    with tf.variable_scope("logits"):
        # logits: seq, batch_size, 2
        logits = tf.layers.dense(output, 2, kernel_initializer=initializer)

        # logits: 2, batch_size, seq
        logits = tf.transpose(logits, [2, 1, 0])

        # start_logits: batch_size, seq
        # end_logits: batch_size, seq
        start_logits, end_logits = tf.unstack(logits, axis=0)

        start_logits_masked = start_logits * (1 - p_mask) - 1e30 * p_mask
        start_log_probs = tf.nn.log_softmax(start_logits_masked, -1)

        end_logits_masked = end_logits * (1 - p_mask) - 1e30 * p_mask
        end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)

    if is_training:
        return_dict["start_log_probs"] = start_log_probs
        return_dict["end_log_probs"] = end_log_probs
    else:
        return_dict["start_logits"] = start_logits
        return_dict["end_logits"] = end_logits

    # an additional layer to predict answer class, 0: span, 1:yes, 2:no
    with tf.variable_scope("answer_class"):
        # get the representation of CLS
        cls_index = tf.one_hot(cls_index, seq_len, axis=-1, dtype=tf.float32)
        cls_feature = tf.einsum("lbh,bl->bh", output, cls_index)
        ans_feature = tf.layers.dense(cls_feature, xlnet_config.d_model,
                                      activation=tf.tanh,
                                      kernel_initializer=initializer,
                                      name='pooler')

        ans_feature = tf.layers.dropout(ans_feature, FLAGS.dropout,
                                        training=is_training)
        # hotpot has 3 classes,
        # squad 2.0 has 2 classes
        cls_logits = tf.layers.dense(ans_feature, FLAGS.num_classes,
                                     kernel_initializer=initializer,
                                     name="cls")
        cls_log_probs = tf.nn.log_softmax(cls_logits, -1)
    if is_training:
        return_dict["cls_log_probs"] = cls_log_probs
    return_dict["cls_logits"] = cls_logits

    return return_dict
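The one_hot/einsum pair in the answer_class scope is simply a gather of the hidden state at the CLS position of each example; a quick numpy check of the contraction (toy shapes and positions are assumptions):

import numpy as np

seq_len, bsz, hidden = 6, 2, 3
output = np.random.rand(seq_len, bsz, hidden)   # [seq_len, batch, hidden]
cls_pos = np.array([4, 2])                      # assumed CLS positions per example
one_hot = np.eye(seq_len)[cls_pos]              # [batch, seq_len]
cls_feature = np.einsum("lbh,bl->bh", output, one_hot)
# equivalent to picking output[cls_pos[b], b, :] for each example b
assert np.allclose(cls_feature, output[cls_pos, np.arange(bsz)])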
Example #19
    def _create_model(self, input_ids, input_masks, segment_ids,
                      token_label_ids, sent_label_ids, token_label_list,
                      sent_label_list, mode):
        """Creates XLNet-NLU model"""
        model = xlnet.XLNetModel(xlnet_config=self.model_config,
                                 run_config=xlnet.create_run_config(
                                     mode == tf.estimator.ModeKeys.TRAIN, True,
                                     FLAGS),
                                 input_ids=tf.transpose(input_ids, perm=[1,
                                                                         0]),
                                 input_mask=tf.transpose(input_masks,
                                                         perm=[1, 0]),
                                 seg_ids=tf.transpose(segment_ids, perm=[1,
                                                                         0]))

        initializer = model.get_initializer()

        with tf.variable_scope("token", reuse=tf.AUTO_REUSE):
            token_result = tf.transpose(model.get_sequence_output(),
                                        perm=[1, 0, 2])
            token_result_mask = tf.cast(tf.expand_dims(1 - input_masks,
                                                       axis=-1),
                                        dtype=tf.float32)

            token_dense_layer = tf.keras.layers.Dense(
                units=len(token_label_list),
                activation=None,
                use_bias=True,
                kernel_initializer=initializer,
                bias_initializer=tf.zeros_initializer,
                kernel_regularizer=None,
                bias_regularizer=None,
                trainable=True)

            token_dropout_layer = tf.keras.layers.Dropout(
                rate=0.1, seed=np.random.randint(10000))

            token_result = token_dense_layer(token_result)
            if mode == tf.estimator.ModeKeys.TRAIN:
                token_result = token_dropout_layer(token_result)

            masked_token_predict = token_result * token_result_mask + MIN_FLOAT * (
                1 - token_result_mask)
            token_predict_ids = tf.cast(tf.argmax(tf.nn.softmax(
                masked_token_predict, axis=-1),
                                                  axis=-1),
                                        dtype=tf.int32)

        with tf.variable_scope("sent", reuse=tf.AUTO_REUSE):
            sent_result = model.get_pooled_out("last")
            sent_result_mask = tf.cast(tf.reduce_max(1 - input_masks,
                                                     axis=-1,
                                                     keepdims=True),
                                       dtype=tf.float32)

            sent_dense_layer = tf.keras.layers.Dense(
                units=len(sent_label_list),
                activation=None,
                use_bias=True,
                kernel_initializer=initializer,
                bias_initializer=tf.zeros_initializer,
                kernel_regularizer=None,
                bias_regularizer=None,
                trainable=True)

            sent_dropout_layer = tf.keras.layers.Dropout(
                rate=0.1, seed=np.random.randint(10000))

            sent_result = sent_dense_layer(sent_result)
            if mode == tf.estimator.ModeKeys.TRAIN:
                sent_result = sent_dropout_layer(sent_result)

            masked_sent_predict = sent_result * sent_result_mask + MIN_FLOAT * (
                1 - sent_result_mask)
            sent_predict_ids = tf.cast(tf.argmax(tf.nn.softmax(
                masked_sent_predict, axis=-1),
                                                 axis=-1),
                                       dtype=tf.int32)

        loss = tf.constant(0.0, dtype=tf.float32)
        if mode not in [
                tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL
        ]:
            return loss, token_predict_ids, sent_predict_ids

        if token_label_ids is not None:
            with tf.variable_scope("token_loss", reuse=tf.AUTO_REUSE):
                token_label = tf.cast(token_label_ids, dtype=tf.float32)
                token_label_mask = tf.cast(1 - input_masks, dtype=tf.float32)
                masked_token_label = tf.cast(token_label * token_label_mask,
                                             dtype=tf.int32)
                cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=masked_token_label, logits=masked_token_predict)
                token_loss = tf.reduce_sum(
                    cross_entropy * token_label_mask) / tf.reduce_sum(
                        tf.reduce_max(token_label_mask, axis=-1))
                loss = loss + token_loss

        if sent_label_ids is not None:
            with tf.variable_scope("sent_loss", reuse=tf.AUTO_REUSE):
                sent_label = tf.cast(sent_label_ids, dtype=tf.float32)
                sent_label_mask = tf.cast(tf.reduce_max(1 - input_masks,
                                                        axis=-1),
                                          dtype=tf.float32)
                masked_sent_label = tf.cast(sent_label * sent_label_mask,
                                            dtype=tf.int32)
                cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=masked_sent_label, logits=masked_sent_predict)
                sent_loss = tf.reduce_sum(
                    cross_entropy * sent_label_mask) / tf.reduce_sum(
                        tf.reduce_max(sent_label_mask, axis=-1))
                loss = loss + sent_loss

        return loss, token_predict_ids, sent_predict_ids
Example #20
def get_crf_outputs(FLAGS, features, is_training):
    """Loss for downstream span-extraction QA tasks such as SQuAD."""

    inp = tf.transpose(features["input_ids"], [1, 0])
    seg_id = tf.transpose(features["segment_ids"], [1, 0])
    inp_mask = tf.transpose(features["input_mask"], [1, 0])

    if FLAGS.label_mode == "normal":
        label = features["label_ids"]
    elif FLAGS.label_mode == "X":
        label = features["label_x_id"]
    elif FLAGS.label_mode == "gather":
        label = features["label_gather"]
    else:
        raise ValueError("unsupport label mode {}".format(FLAGS.label_mode))

    if FLAGS.label_mask == "normal":
        mask = 1 - features["input_mask"]
        re_mask = features["label_mask_x"]
    elif FLAGS.label_mask == "X":
        mask = features["label_mask_x"]
        re_mask = features["label_mask_x"]
    elif FLAGS.label_mask == "gather":
        mask = features["label_mask_gather"]
        re_mask = features["label_mask_gather"]

    else:
        raise ValueError("unsupport mask mode {}".format(FLAGS.label_mode))

    max_seq_length = FLAGS.max_seq_length
    batch_size = tf.shape(inp)[1]
    if FLAGS.label_mode == "X":
        classes = FLAGS.crf_classes + 1
    else:
        classes = FLAGS.crf_classes

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)
    output = xlnet_model.get_sequence_output()
    initializer = xlnet_model.get_initializer()

    with tf.variable_scope("crf_layer"):
        output = tf.transpose(output, [1, 0, 2])
        start_logits = tf.layers.dense(output,
                                       classes,
                                       kernel_initializer=initializer)

        if FLAGS.label_mode == "gather":
            flat_offsets = tf.reshape(
                tf.range(0, batch_size, dtype=tf.int32) * max_seq_length,
                [-1, 1])
            flat_positions = tf.reshape(features["label_index"] + flat_offsets,
                                        [-1])
            flat_sequence_tensor = tf.reshape(
                start_logits, [batch_size * max_seq_length, classes])
            start_logits = tf.gather(flat_sequence_tensor, flat_positions)
            start_logits = tf.reshape(start_logits,
                                      [batch_size, max_seq_length, classes])

        if FLAGS.no_crf:
            logits = tf.nn.log_softmax(start_logits)
            one_hot_target = tf.one_hot(label, classes)
            per_example_loss = -tf.reduce_sum(logits * one_hot_target, -1)
            numerator = tf.reduce_sum(tf.reshape(mask * per_example_loss,
                                                 [-1]))
            denominator = tf.reduce_sum(tf.reshape(mask, [-1])) + 1e-5
            total_loss = numerator / denominator
            logits = tf.argmax(logits, axis=-1)
        else:
            seq_len = tf.reduce_sum(tf.cast(mask, tf.int64), axis=-1)
            transition_params = tf.get_variable(
                'trans', [classes, classes],
                dtype=tf.float32,
                initializer=tf.zeros_initializer)
            per_example_loss, transition_params = tf.contrib.crf.crf_log_likelihood(
                start_logits,
                label,
                seq_len,
                transition_params=transition_params)
            logits, tf_score = tf.contrib.crf.crf_decode(
                start_logits, transition_params, seq_len)
            per_example_loss = -per_example_loss
            total_loss = tf.reduce_mean(per_example_loss)

    return total_loss, per_example_loss, logits, label, re_mask
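The gather branch above flattens the [batch, seq] grid of logits and selects per-example label positions with flat offsets; a numpy sketch of that indexing (toy sizes and label_index values are assumptions):

import numpy as np

batch_size, max_seq_length, classes = 2, 4, 3
logits = np.arange(batch_size * max_seq_length * classes).astype(np.float32)
logits = logits.reshape(batch_size, max_seq_length, classes)
label_index = np.array([[0, 2, 3, 3],   # assumed positions to keep, per example
                        [1, 1, 2, 0]])
flat_offsets = (np.arange(batch_size) * max_seq_length).reshape(-1, 1)
flat_positions = (label_index + flat_offsets).reshape(-1)
flat = logits.reshape(batch_size * max_seq_length, classes)
gathered = flat[flat_positions].reshape(batch_size, max_seq_length, classes)
# slot i of row b in `gathered` is logits[b, label_index[b, i], :]
assert np.array_equal(gathered[0, 1], logits[0, label_index[0, 1]])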
Example #21
def get_ner_loss(FLAGS, features, is_training, lengths):
    """Loss for downstream sequence labelling such as NER."""

    bsz_per_core = tf.shape(features["input_ids"])[0]

    def _transform_features(feature):
        out = tf.reshape(feature, [bsz_per_core, 1, -1])
        out = tf.transpose(out, [2, 0, 1])
        out = tf.reshape(out, [-1, bsz_per_core * 1])
        return out

    inp = _transform_features(features["input_ids"])
    seg_id = _transform_features(features["segment_ids"])
    inp_mask = _transform_features(features["input_mask"])
    labels = tf.reshape(features["label_ids"],
                        [bsz_per_core, FLAGS.max_seq_length])

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)
    # todo summary = xlnet_model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj)
    embedded_chars = xlnet_model.get_sequence_output()
    embedding_dims = 768
    num_labels = 2
    # logits = project_crf_layer(embedded_chars)
    # loss, trans = crf_layer(logits)

    from tensorflow.contrib.layers.python.layers import initializers
    with tf.variable_scope("logits", reuse=tf.AUTO_REUSE):
        W = tf.get_variable("W",
                            shape=[embedding_dims, num_labels],
                            dtype=tf.float32,
                            initializer=initializers.xavier_initializer())

        b = tf.get_variable("b",
                            shape=[num_labels],
                            dtype=tf.float32,
                            initializer=tf.zeros_initializer())
        output = tf.reshape(embedded_chars,
                            shape=[-1, embedding_dims
                                   ])  # [batch_size, embedding_dims]
        pred = tf.tanh(tf.nn.xw_plus_b(output, W, b))
        logits = tf.reshape(pred, [-1, FLAGS.max_seq_length, num_labels])
        trans1 = tf.get_variable("transitions",
                                 shape=[num_labels, num_labels],
                                 initializer=initializers.xavier_initializer())
        # crf
        log_likelihood, trans = tf.contrib.crf.crf_log_likelihood(
            inputs=logits, tag_indices=labels, sequence_lengths=lengths)
        # log_likelihood, trans = tf.contrib.crf.crf_log_likelihood(
        #     inputs=logits,
        #     tag_indices=labels,
        #     transition_params=trans1,
        #     sequence_lengths=lengths)
        # CRF decode: pred_ids is the highest-probability tag path
        pred_ids, _ = tf.contrib.crf.crf_decode(potentials=logits,
                                                transition_params=trans,
                                                sequence_length=lengths)
        # return (loss, logits, trans, pred_ids)
        one_hot_target = tf.one_hot(labels, num_labels)
        per_example_loss = -tf.reduce_sum(
            tf.nn.log_softmax(logits) * one_hot_target, -1)
        total_loss = tf.reduce_mean(per_example_loss)
    return total_loss, per_example_loss, logits
Example #22
def get_ner_loss(FLAGS, features, is_training):  # , lengths):
    """Loss for downstream sequence labelling such as NER."""

    bsz_per_core = tf.shape(features["input_ids"])[0]

    input_ids = features["input_ids"]
    print("&&&&&&&&&&%%%%%%%%% the input_ids shape is ", input_ids.shape)
    used = tf.sign(tf.abs(input_ids))
    print("&&&&&&&&&&%%%%%%%%% the used shape is ", used.shape)
    # a [batch_size] vector holding the true sequence length of each example in the current batch
    lengths = tf.reduce_sum(used, reduction_indices=1)
    print("&&&&&&&&&&%%%%%%%%% lengths shape is ", lengths.shape)

    def _transform_features(feature):
        out = tf.reshape(feature, [bsz_per_core, 1, -1])
        out = tf.transpose(out, [2, 0, 1])
        out = tf.reshape(out, [-1, bsz_per_core * 1])
        return out

    inp = _transform_features(features["input_ids"])
    seg_id = _transform_features(features["segment_ids"])
    inp_mask = _transform_features(features["input_mask"])
    labels = tf.reshape(features["label_ids"],
                        [bsz_per_core, FLAGS.max_seq_length])
    print("&&&&&&&&&&%%%%%%%%% labels shape is ", labels.shape)

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)
    embedded_chars = xlnet_model.get_sequence_output()
    embedding_dims = embedded_chars.shape[-1]
    num_labels = 17
    with tf.variable_scope("logits", reuse=tf.AUTO_REUSE):
        W = tf.get_variable("W",
                            shape=[embedding_dims, num_labels],
                            dtype=tf.float32,
                            initializer=initializers.xavier_initializer())

        b = tf.get_variable("b",
                            shape=[num_labels],
                            dtype=tf.float32,
                            initializer=tf.zeros_initializer())
        x = tf.reshape(embedded_chars,
                       shape=[-1,
                              embedding_dims])  # [batch_size, embedding_dims]
        pred = tf.nn.xw_plus_b(x, W, b)
        print("&&&&&&&&&&%%%%%%%%% the embedded_chars shape is ",
              embedded_chars.shape)
        print("&&&&&&&&&&%%%%%%%%% W shape is ", W.shape)
        print("&&&&&&&&&&%%%%%%%%% b shape is ", b.shape)
        print("&&&&&&&&&&%%%%%%%%% output shape is ", x.shape)
        print("&&&&&&&&&&%%%%%%%%% pred shape is ", pred.shape)
        logits = tf.reshape(pred, [-1, FLAGS.max_seq_length, num_labels])
        print("&&&&&&&&&&%%%%%%%%% logits shape is ", logits.shape)
        # trans1 = tf.get_variable(
        #     "transitions",
        #     shape=[num_labels, num_labels],
        #     initializer=initializers.xavier_initializer())
        # crf
        log_likelihood, trans = tf.contrib.crf.crf_log_likelihood(
            inputs=logits, tag_indices=labels, sequence_lengths=lengths)
        # log_likelihood, trans = tf.contrib.crf.crf_log_likelihood(
        #     inputs=logits,
        #     tag_indices=labels,
        #     transition_params=trans1,
        #     sequence_lengths=lengths)
        # CRF decode: pred_ids is the highest-probability tag path
        pred_ids, _ = tf.contrib.crf.crf_decode(potentials=logits,
                                                transition_params=trans,
                                                sequence_length=lengths)
        # return (loss, logits, trans, pred_ids)
        one_hot_target = tf.one_hot(labels, num_labels)
        per_example_loss = -tf.reduce_sum(
            tf.nn.log_softmax(logits) * one_hot_target, -1)
        total_loss = tf.reduce_mean(per_example_loss)
    return total_loss, per_example_loss, logits
Example #23
    def model_fn(features, labels, mode, params):
        #### Training or Evaluation
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        #### Get loss from inputs
        #********************************************************************************************#
        bsz_per_core = tf.shape(features["input_ids"])[0]
        inp = tf.transpose(features["input_ids"], [1, 0])
        seg_id = tf.transpose(features["segment_ids"], [1, 0])
        inp_mask = tf.transpose(features["input_mask"], [1, 0])
        label_ids = features["label_ids"]

        xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
        run_config = xlnet.create_run_config(is_training, True, FLAGS)
        xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                       run_config=run_config,
                                       input_ids=inp,
                                       seg_ids=seg_id,
                                       input_mask=inp_mask)
        #summary = xlnet_model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj)
        # get the sequence embeddings, shaped [batch_size, seq_length, embedding_size]
        xlnet_model_out = xlnet_model.get_sequence_output()
        embedding = tf.transpose(xlnet_model_out, [1, 0, 2])
        max_seq_length = embedding.shape[1].value
        # compute the true (non-padding) sequence lengths
        used = tf.sign(tf.abs(features["input_ids"]))
        lengths = tf.reduce_sum(
            used, reduction_indices=1)  # a [batch_size] vector of sequence lengths in this batch
        # add the CRF output layer
        blstm_crf = BLSTM_CRF(embedded_chars=embedding,
                              hidden_unit=10,
                              cell_type="lstm",
                              num_layers=1,
                              dropout_rate=0.5,
                              initializers=initializers,
                              num_labels=n_class,
                              seq_length=max_seq_length,
                              labels=label_ids,
                              lengths=lengths,
                              is_training=is_training)
        total_loss, logits, trans, pred_ids = blstm_crf.add_blstm_crf_layer(
            crf_only=True)
        #********************************************************************************************#

        #### Check model parameters
        num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
        tf.logging.info('#params: {}'.format(num_params))

        #### load pretrained models
        scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

        #### Evaluation mode
        if mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(label_ids, pred_ids):
                return {
                    "eval_loss":
                    tf.metrics.mean_squared_error(labels=label_ids,
                                                  predictions=pred_ids),
                }

            eval_metrics = metric_fn(features["label_ids"], pred_ids)
            eval_spec = tf.estimator.EstimatorSpec(
                mode=mode, loss=total_loss, eval_metric_ops=eval_metrics)
            return eval_spec
        elif mode == tf.estimator.ModeKeys.PREDICT:
            predictions = {
                "logits": logits,
                "labels": label_ids,
                "pred_ids": pred_ids,
                "input_mask": features["input_mask"]
            }
            output_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                     predictions=predictions)
            return output_spec

        #### Configuring the optimizer
        train_op, learning_rate, _ = model_utils.get_train_op(
            FLAGS, total_loss)

        monitor_dict = {}
        monitor_dict["lr"] = learning_rate

        #### Constructing the training EstimatorSpec
        train_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                loss=total_loss,
                                                train_op=train_op)
        return train_spec
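For context, a model_fn like the one above is normally handed to a tf.estimator.Estimator. A minimal wiring sketch; the flag names and train_input_fn are assumptions, not part of this snippet:

# Hedged sketch of the Estimator wiring (assumed flags and input_fn).
run_cfg = tf.estimator.RunConfig(model_dir=FLAGS.model_dir,
                                 save_checkpoints_steps=FLAGS.save_steps)
estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_cfg)
estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)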
Exemplo n.º 24
0
def get_decomposed_qa_outputs(FLAGS, features, is_training):
    question_ids = features["question_ids"]
    context_ids = features["context_ids"]
    seq_len = FLAGS.max_seq_length
    q_seq_len = FLAGS.max_first_length + 2
    ctx_seq_len = seq_len - q_seq_len
    q_mask_int = tf.cast(tf.cast(question_ids, tf.bool), tf.int32)
    cls_index = tf.reshape(
        tf.reduce_sum(q_mask_int, axis=1) + ctx_seq_len, [-1])
    # 0 for mask out
    # q_zeros = tf.zeros_like(question_ids)
    # p_ids = tf.concat([context_ids, q_zeros], axis=1)
    # p_mask = tf.cast(tf.cast(p_ids, tf.bool), tf.float32)
    question_ids = tf.transpose(question_ids, [1, 0])
    context_ids = tf.transpose(context_ids, [1, 0])

    q_attn_mask = get_attention_mask(question_ids, q_seq_len)
    c_attn_mask = get_attention_mask(context_ids, ctx_seq_len)
    qc_attn_mask = get_attention_mask(
        tf.concat([context_ids, question_ids], axis=0), seq_len)

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)
    initializer = xlnet._get_initializer(run_config)
    tfm_args = dict(
        n_token=xlnet_config.n_token,
        initializer=initializer,
        attn_type="bi",
        n_layer=xlnet_config.n_layer,
        d_model=xlnet_config.d_model,
        n_head=xlnet_config.n_head,
        d_head=xlnet_config.d_head,
        d_inner=xlnet_config.d_inner,
        ff_activation=xlnet_config.ff_activation,
        untie_r=xlnet_config.untie_r,
        is_training=run_config.is_training,
        use_bfloat16=run_config.use_bfloat16,
        use_tpu=run_config.use_tpu,
        dropout=run_config.dropout,
        dropatt=run_config.dropatt,

        # mem_len=run_config.mem_len,
        # reuse_len=run_config.reuse_len,
        # bi_data=run_config.bi_data,
        clamp_len=run_config.clamp_len,
        # same_length=run_config.same_length,
        ctx_ids=context_ids,
        q_ids=question_ids,
        q_seq_len=q_seq_len,
        ctx_seq_len=ctx_seq_len,
        sep_layer=FLAGS.sep_layer,
        q_attn_mask=q_attn_mask,
        c_attn_mask=c_attn_mask,
        qc_attn_mask=qc_attn_mask,
    )

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        upper_outputs = transformer_xl_decomposed(**tfm_args)

    output = upper_outputs[-1]
    return_dict = {'upper_outputs': upper_outputs}
    with tf.variable_scope("logits"):
        # logits: seq, batch_size, 2
        logits = tf.layers.dense(output, 2, kernel_initializer=initializer)

        # logits: 2, batch_size, seq
        logits = tf.transpose(logits, [2, 1, 0])

        # start_logits: batch_size, seq
        # end_logits: batch_size, seq
        start_logits, end_logits = tf.unstack(logits, axis=0)

        # start_logits_masked = start_logits * p_mask - 1e30 * (1 - p_mask)
        # start_log_probs = tf.nn.log_softmax(start_logits_masked, -1)
        start_log_probs = tf.nn.log_softmax(start_logits, -1)

        # end_logits_masked = end_logits * p_mask - 1e30 * (1 - p_mask)
        # end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
        end_log_probs = tf.nn.log_softmax(end_logits, -1)

    return_dict["start_logits"] = start_logits
    return_dict["end_logits"] = end_logits
    if is_training:
        return_dict["start_log_probs"] = start_log_probs
        return_dict["end_log_probs"] = end_log_probs

    # an additional layer to predict answer class, 0: span, 1:yes, 2:no
    with tf.variable_scope("answer_class"):
        # get the representation of CLS
        cls_index = tf.one_hot(cls_index, seq_len, axis=-1, dtype=tf.float32)
        cls_feature = tf.einsum("lbh,bl->bh", output, cls_index)
        ans_feature = tf.layers.dense(cls_feature,
                                      xlnet_config.d_model,
                                      activation=tf.tanh,
                                      kernel_initializer=initializer,
                                      name='pooler')

        ans_feature = tf.layers.dropout(ans_feature,
                                        FLAGS.dropout,
                                        training=is_training)
        # hotpot has 3 classes,
        # squad 2.0 has 2 classes
        cls_logits = tf.layers.dense(ans_feature,
                                     FLAGS.num_classes,
                                     kernel_initializer=initializer,
                                     name="cls")
        cls_log_probs = tf.nn.log_softmax(cls_logits, -1)

    return_dict["cls_logits"] = cls_logits
    if is_training:
        return_dict["cls_log_probs"] = cls_log_probs

    return return_dict
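At training time, the start/end log-probabilities returned above are typically turned into a span loss against one-hot ground-truth positions. A minimal sketch, assuming features["start_positions"] and features["end_positions"] exist as in the standard SQuAD setup:

# Hedged sketch of a span loss over the returned log-probabilities.
# The feature names are assumptions; seq_len is FLAGS.max_seq_length as above.
def span_loss(log_probs, positions, seq_len):
    one_hot = tf.one_hot(positions, depth=seq_len, dtype=tf.float32)
    return tf.reduce_mean(-tf.reduce_sum(one_hot * log_probs, axis=-1))

start_loss = span_loss(return_dict["start_log_probs"],
                       tf.reshape(features["start_positions"], [-1]), seq_len)
end_loss = span_loss(return_dict["end_log_probs"],
                     tf.reshape(features["end_positions"], [-1]), seq_len)
total_loss = 0.5 * (start_loss + end_loss)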
Exemplo n.º 25
0
def prediction_graph_memory():
    """Gets features and
    return predicted tokens)
    features: Dict[str:tf.train.features] Contains following features:
              input_k
              seg_id
              input_mask
    """

    features = {
        "input_k": tf.placeholder(tf.int32, (None, None)),
        "seg_id": tf.placeholder(tf.int32, (None, None)),
        "input_mask": tf.placeholder(tf.float32, (None, None))
    }

    # Building prediction graph
    # Transpose features so the batch dimension is on the last axis
    inp = tf.transpose(features["input_k"], [1, 0])
    seg_id = tf.transpose(features["seg_id"], [1, 0])
    inp_mask = tf.transpose(features["input_mask"], [1, 0])

    # Model config
    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(False, True, FLAGS)
    run_config.mem_len = FLAGS.max_mem_length

    perm_mask = _create_mask(tf.shape(inp)[0], 0)[:, :, None]
    # Getting the hidden states for the prompts
    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask,
                                   perm_mask=perm_mask)

    # getting memory
    mems = xlnet_model.get_new_memory()

    latest_tokens = None
    prev_tokens = None
    prev_confs = None
    batch_size = tf.shape(mems[0])[1]

    def cond(*_):
        """Dummy condition since we stop based on iteration"""
        return True

    def body(mems, latest_tokens, mem_mask, prev_tokens, prev_confs):
        """The main body of sampling loop.
        mem: cache memory--calculated hidden states
             of previous tokens
        latest_tokens: latest sampled tokens
        mem_mask: masking for setting previous memory zero. Used for padding
        prev_tokens: all the previous tokens including latest_tokens
        prev_confs: confidences of respective tokens in prev_tokens
        """

        # get dummy input token and permutation mask
        input_k, seg_id, perm_mask, input_q, target_mapping = \
            inputs_and_mask(latest_tokens,
                            batch_size)

        input_k = tf.transpose(input_k, (1, 0))
        input_q = tf.transpose(input_q, (1, 0))
        seg_id = tf.transpose(seg_id, (1, 0))
        perm_mask = tf.transpose(perm_mask, (1, 2, 0))
        # Set the hidden state of the padded tokens to zero
        for i, mem in enumerate(mems):
            mems[i] = (1 - mem_mask[:, :, None]) * mems[i]
        # Get logits
        xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                       run_config=run_config,
                                       input_ids=input_k,
                                       seg_ids=seg_id,
                                       perm_mask=perm_mask,
                                       mems=mems,
                                       input_mask=None,
                                       inp_q=input_q,
                                       target_mapping=target_mapping)

        logits = get_logits(xlnet_model, xlnet_config)

        # Getting new memory
        new_mems = xlnet_model.get_new_memory()

        # sample a token
        logits = tf.transpose(logits, (1, 0, 2))
        sampled_tokens, confs = sample_token(logits)
        sampled_tokens = sampled_tokens[:, -1, :]  # Last token
        confs = confs[:, -1, :]  # Last token
        prev_tokens = sampled_tokens if prev_tokens is None \
            else tf.concat([prev_tokens, sampled_tokens], axis=1)
        prev_confs = confs if prev_confs is None \
            else tf.concat([prev_confs, confs], axis=1)
        # Cache the memory of the last latest_tokens
        if latest_tokens is not None:
            merged_mems = []

            for i, mem in enumerate(mems):
                merged_mems.append(
                    tf.concat([mems[i][1:], new_mems[i][-2:-1]], axis=0))
            mem_mask = tf.concat(
                [mem_mask[1:], tf.zeros_like(mem_mask[:1])], axis=0)
            return [
                merged_mems, sampled_tokens, mem_mask, prev_tokens, prev_confs
            ]

        return [mems, sampled_tokens, mem_mask, prev_tokens, prev_confs]

    mems, latest_tokens, mem_mask, prev_tokens, prev_confs = body(
        mems, latest_tokens, inp_mask, prev_tokens, prev_confs)

    args = tf.while_loop(
        cond=cond,
        body=body,
        maximum_iterations=FLAGS.num_toks_pred - 1,
        loop_vars=[mems, latest_tokens, mem_mask, prev_tokens, prev_confs],
        shape_invariants=[[
            tf.TensorShape([None, None, None]) for _ in range(len(mems))
        ],
                          tf.TensorShape([None, None]),
                          tf.TensorShape([None, None]),
                          tf.TensorShape([None, None]),
                          tf.TensorShape([None, None])])

    predicted_tokens, predicted_confs = args[-2:]
    return (predicted_tokens, predicted_confs), features
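A usage sketch for the graph above (an assumed caller, not part of the original file): restore a checkpoint, then feed the returned placeholders with pre-tokenized prompts.

# Hedged usage sketch. FLAGS.init_checkpoint and the numpy arrays ids/segs/mask
# (shaped [batch, prompt_len]) are assumptions prepared by the caller.
(pred_tokens, pred_confs), feats = prediction_graph_memory()
saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, FLAGS.init_checkpoint)
    tokens, confs = sess.run(
        [pred_tokens, pred_confs],
        feed_dict={feats["input_k"]: ids,
                   feats["seg_id"]: segs,
                   feats["input_mask"]: mask})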
Exemplo n.º 26
0
def prediction_graph_no_memory():
    """Builds graphs and returns prediction and input features.
    Output:
    predictions: Tuple[tf.Tensor] of sampled tokens and their confidences
    features: Dict[str, tf.Tensor] containing the following placeholders:
              input_k
              seg_id
              input_mask
    """

    features = {
        "input_k": tf.placeholder(tf.int32, (None, None)),
        "seg_id": tf.placeholder(tf.int32, (None, None)),
        "input_mask": tf.placeholder(tf.float32, (None, None))
    }

    # Building prediction graph
    # Transpose features so the batch dimension is on the last axis
    inp = tf.transpose(features["input_k"], [1, 0])
    seg_id = tf.transpose(features["seg_id"], [1, 0])
    inp_mask = tf.transpose(features["input_mask"], [1, 0])

    # Model config
    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(False, True, FLAGS)
    run_config.mem_len = FLAGS.max_mem_length

    perm_mask = _create_mask(tf.shape(inp)[0], 0)[:, :, None]
    # Getting the hidden states for the prompts

    prev_tokens = None
    prev_conf = None
    # target mapping
    seq_len = tf.shape(inp)[0]
    batch_size = tf.shape(inp)[-1]
    target_mapping = tf.concat(
        [tf.zeros((1, seq_len - 1, batch_size)),
         tf.ones((1, 1, batch_size))],
        axis=1)

    def cond(*_):
        """Dummy condition since we stop based on iteration"""
        return True

    def recalc(inp, inp_mask, seg_id, perm_mask):
        """Augment the inputs for the new token. Appends 1 row or columns accordingly"""
        input_q = tf.zeros_like(inp, dtype=tf.float32)
        inp = tf.pad(inp,
                     tf.convert_to_tensor([[0, 1], [0, 0]]),
                     constant_values=0)
        inp_mask = tf.pad(inp_mask,
                          tf.convert_to_tensor([[0, 1], [0, 0]]),
                          constant_values=0)
        seg_id = tf.pad(seg_id,
                        tf.convert_to_tensor([[0, 1], [0, 0]]),
                        constant_values=0)
        col = tf.ones(tf.shape(perm_mask)[0:1], dtype=tf.float32)
        perm_mask = tf.concat([perm_mask, col[:, None, None]], axis=1)
        row = tf.concat([
            tf.zeros(tf.shape(perm_mask)[1:2] - 1, dtype=tf.float32),
            tf.ones([1], dtype=tf.float32)
        ],
                        axis=0)
        perm_mask = tf.concat([perm_mask, row[None, :, None]], axis=0)
        input_q = tf.pad(input_q,
                         tf.convert_to_tensor([[0, 1], [0, 0]]),
                         constant_values=1)

        return inp[1:], inp_mask[1:], perm_mask[1:,
                                                1:], input_q[1:], seg_id[1:]

    def body(inp, inp_mask, seg_id, perm_mask, prev_tokens, prev_conf):
        """The main body of sampling loop.
        inp: input ids
        inp_mask: input masks for paddings, etc.
        seg_id: segment id. Zeros here.
        perm_mask: permutation mask to pass to transformer
        prev_tokens: all the previous tokens including latest_tokens
        prev_conf: confidences of respective tokens in prev_tokens
        """

        # get dummy input token and permutation mask
        input_k, input_mask, perm_mask, input_q, seg_id = recalc(
            inp, inp_mask, seg_id, perm_mask)
        # Get logits
        xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                       run_config=run_config,
                                       input_ids=input_k,
                                       seg_ids=seg_id,
                                       input_mask=inp_mask,
                                       perm_mask=perm_mask,
                                       inp_q=input_q,
                                       target_mapping=target_mapping)

        logits = get_logits(xlnet_model, xlnet_config)

        # sample a token
        logits = tf.transpose(logits, (1, 0, 2))
        sampled_tokens, confidences = sample_token(logits)
        sampled_tokens = sampled_tokens[:, -1, :]  # Last token
        confidences = confidences[:, -1, :]
        prev_tokens = sampled_tokens if prev_tokens is None \
            else tf.concat([prev_tokens, sampled_tokens], axis=1)
        prev_conf = confidences if prev_conf is None \
            else tf.concat([prev_conf, confidences], axis=1)

        input_k = tf.concat(
            [input_k[:-1], tf.transpose(sampled_tokens, (1, 0))], axis=0)
        perm_mask = _create_mask(tf.shape(input_k)[0], 0)[:, :, None]
        return [input_k, input_mask, seg_id, perm_mask, prev_tokens, prev_conf]

    inp, inp_mask, seg_id, perm_mask, prev_tokens, prev_conf = body(
        inp, inp_mask, seg_id, perm_mask, prev_tokens, prev_conf)
    args = tf.while_loop(
        cond=cond,
        body=body,
        maximum_iterations=FLAGS.num_toks_pred - 1,
        loop_vars=[inp, inp_mask, seg_id, perm_mask, prev_tokens, prev_conf],
        shape_invariants=[
            tf.TensorShape([None, None]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, None, None]),
            tf.TensorShape([None, None]),
            tf.TensorShape([None, None]),
        ])
    predicted_tokens, predicted_confs = args[-2:]
    return (predicted_tokens, predicted_confs), features
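The helper sample_token is not shown in this listing. A plausible greedy stand-in (an assumption, not the original implementation) that matches the shapes used above, returning per-position tokens and confidences with a trailing axis of size 1:

# Hedged sketch of a greedy sample_token. logits: [batch, seq, n_token].
def sample_token(logits):
    probs = tf.nn.softmax(logits, axis=-1)
    tokens = tf.argmax(probs, axis=-1, output_type=tf.int32)  # [batch, seq]
    confs = tf.reduce_max(probs, axis=-1)                     # [batch, seq]
    # keep a trailing axis so callers can slice the last position with [:, -1, :]
    return tokens[:, :, None], confs[:, :, None]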
Exemplo n.º 27
0
def get_qa_outputs(FLAGS, features, is_training):
    """Loss for downstream span-extraction QA tasks such as SQuAD."""

    inp = tf.transpose(features["input_ids"], [1, 0])
    seg_id = tf.transpose(features["segment_ids"], [1, 0])
    inp_mask = tf.transpose(features["input_mask"], [1, 0])
    cls_index = tf.reshape(features["cls_index"], [-1])

    seq_len = tf.shape(inp)[0]

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)
    output = xlnet_model.get_sequence_output()
    initializer = xlnet_model.get_initializer()

    return_dict = {}

    # invalid position mask such as query and special symbols (PAD, SEP, CLS)
    p_mask = features["p_mask"]

    # logit of the start position
    with tf.variable_scope("start_logits"):
        start_logits = tf.layers.dense(output,
                                       1,
                                       kernel_initializer=initializer)
        start_logits = tf.transpose(tf.squeeze(start_logits, -1), [1, 0])
        start_logits_masked = start_logits * (1 - p_mask) - 1e30 * p_mask
        start_log_probs = tf.nn.log_softmax(start_logits_masked, -1)

    # logit of the end position
    with tf.variable_scope("end_logits"):
        if is_training:
            # during training, compute the end logits based on the
            # ground truth of the start position

            start_positions = tf.reshape(features["start_positions"], [-1])
            start_index = tf.one_hot(start_positions,
                                     depth=seq_len,
                                     axis=-1,
                                     dtype=tf.float32)
            start_features = tf.einsum("lbh,bl->bh", output, start_index)
            start_features = tf.tile(start_features[None], [seq_len, 1, 1])
            end_logits = tf.layers.dense(tf.concat([output, start_features],
                                                   axis=-1),
                                         xlnet_config.d_model,
                                         kernel_initializer=initializer,
                                         activation=tf.tanh,
                                         name="dense_0")
            end_logits = tf.contrib.layers.layer_norm(end_logits,
                                                      begin_norm_axis=-1)

            end_logits = tf.layers.dense(end_logits,
                                         1,
                                         kernel_initializer=initializer,
                                         name="dense_1")
            end_logits = tf.transpose(tf.squeeze(end_logits, -1), [1, 0])
            end_logits_masked = end_logits * (1 - p_mask) - 1e30 * p_mask
            end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
        else:
            # during inference, compute the end logits based on beam search

            start_top_log_probs, start_top_index = tf.nn.top_k(
                start_log_probs, k=FLAGS.start_n_top)
            start_index = tf.one_hot(start_top_index,
                                     depth=seq_len,
                                     axis=-1,
                                     dtype=tf.float32)
            start_features = tf.einsum("lbh,bkl->bkh", output, start_index)
            end_input = tf.tile(output[:, :, None],
                                [1, 1, FLAGS.start_n_top, 1])
            start_features = tf.tile(start_features[None], [seq_len, 1, 1, 1])
            end_input = tf.concat([end_input, start_features], axis=-1)
            end_logits = tf.layers.dense(end_input,
                                         xlnet_config.d_model,
                                         kernel_initializer=initializer,
                                         activation=tf.tanh,
                                         name="dense_0")
            end_logits = tf.contrib.layers.layer_norm(end_logits,
                                                      begin_norm_axis=-1)
            end_logits = tf.layers.dense(end_logits,
                                         1,
                                         kernel_initializer=initializer,
                                         name="dense_1")
            end_logits = tf.reshape(end_logits,
                                    [seq_len, -1, FLAGS.start_n_top])
            end_logits = tf.transpose(end_logits, [1, 2, 0])
            end_logits_masked = end_logits * (
                1 - p_mask[:, None]) - 1e30 * p_mask[:, None]
            end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
            end_top_log_probs, end_top_index = tf.nn.top_k(end_log_probs,
                                                           k=FLAGS.end_n_top)
            end_top_log_probs = tf.reshape(
                end_top_log_probs, [-1, FLAGS.start_n_top * FLAGS.end_n_top])
            end_top_index = tf.reshape(
                end_top_index, [-1, FLAGS.start_n_top * FLAGS.end_n_top])

    if is_training:
        return_dict["start_log_probs"] = start_log_probs
        return_dict["end_log_probs"] = end_log_probs
    else:
        return_dict["start_top_log_probs"] = start_top_log_probs
        return_dict["start_top_index"] = start_top_index
        return_dict["end_top_log_probs"] = end_top_log_probs
        return_dict["end_top_index"] = end_top_index

    # an additional layer to predict answerability
    with tf.variable_scope("answer_class"):
        # get the representation of CLS
        cls_index = tf.one_hot(cls_index, seq_len, axis=-1, dtype=tf.float32)
        cls_feature = tf.einsum("lbh,bl->bh", output, cls_index)

        # get the representation of START
        start_p = tf.nn.softmax(start_logits_masked,
                                axis=-1,
                                name="softmax_start")
        start_feature = tf.einsum("lbh,bl->bh", output, start_p)

        # note(zhiliny): no dependency on end_feature so that we can obtain
        # one single `cls_logits` for each sample
        ans_feature = tf.concat([start_feature, cls_feature], -1)
        ans_feature = tf.layers.dense(ans_feature,
                                      xlnet_config.d_model,
                                      activation=tf.tanh,
                                      kernel_initializer=initializer,
                                      name="dense_0")
        ans_feature = tf.layers.dropout(ans_feature,
                                        FLAGS.dropout,
                                        training=is_training)
        cls_logits = tf.layers.dense(ans_feature,
                                     1,
                                     kernel_initializer=initializer,
                                     name="dense_1",
                                     use_bias=False)
        cls_logits = tf.squeeze(cls_logits, -1)

        return_dict["cls_logits"] = cls_logits

    return return_dict
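When answerability labels are available, cls_logits is usually trained with sigmoid cross-entropy against a binary flag. A minimal sketch, assuming a float features["is_impossible"] label as in SQuAD 2.0:

# Hedged sketch of the answerability loss; the label name is an assumption.
is_impossible = tf.reshape(features["is_impossible"], [-1])
cls_loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=is_impossible,
                                            logits=return_dict["cls_logits"]))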
Exemplo n.º 28
0
def get_qa_outputs(FLAGS, features, is_training):
  """Loss for downstream span-extraction QA tasks such as SQuAD."""
  outputs = []
  scopes = []
  with tf.variable_scope("preprocess_input",reuse=tf.AUTO_REUSE):
    inp = tf.transpose(features["input_ids"], [1, 0])
    seg_id = tf.transpose(features["segment_ids"], [1, 0])
    inp_mask = tf.transpose(features["input_mask"], [1, 0])
    cls_index = tf.reshape(features["cls_index"], [-1])

    seq_len = tf.shape(inp)[0]
    outputs.append(inp)
    scopes.append(tf.get_variable_scope().name)
  xlnet_config = xlnet.XLNetConfig(json_path="xl_net/xlnet_large/xlnet_config.json")
  run_config = xlnet.create_run_config(is_training, True, FLAGS)

  xlnet_model = xlnet.XLNetModel(
      xlnet_config=xlnet_config,
      run_config=run_config,
      input_ids=inp,
      seg_ids=seg_id,
      input_mask=inp_mask)
  output,outputs1,scopes1 = xlnet_model.get_sequence_output()
  initializer = xlnet_model.get_initializer()
  outputs = outputs+outputs1
  scopes = scopes+scopes1
  return_dict = {}

  # invalid position mask such as query and special symbols (PAD, SEP, CLS)


  # logit of the start position
  with tf.variable_scope("logits",reuse=tf.AUTO_REUSE):
    start_logits = tf.layers.dense(
        output,
        1,
        kernel_initializer=initializer)
    start_logits_masked = tf.transpose(tf.squeeze(start_logits, -1), [1, 0])  # note: no p_mask is applied in this variant
    start_log_probs = tf.nn.log_softmax(start_logits_masked, -1)
  # logit of the end position
    if True:
      # during training, compute the end logits based on the
      # ground truth of the start position

      start_positions = tf.reshape(features["start_positions"], [-1])
      start_index = tf.one_hot(start_positions, depth=seq_len, axis=-1,
                               dtype=tf.float32)
      start_features = tf.einsum("lbh,bl->bh", output, start_index)
      start_features = tf.tile(start_features[None], [seq_len, 1, 1])
      end_logits = tf.layers.dense(
          tf.concat([output, start_features], axis=-1), xlnet_config.d_model,
          kernel_initializer=initializer, activation=tf.tanh, name="dense_0")
      end_logits = tf.contrib.layers.layer_norm(
          end_logits, begin_norm_axis=-1)

      end_logits = tf.layers.dense(
          end_logits, 1,
          kernel_initializer=initializer,
          name="dense_1")
      end_logits = tf.transpose(tf.squeeze(end_logits, -1), [1, 0])
      end_log_probs = tf.nn.log_softmax(end_logits, -1)

      return_dict["start_log_probs"] = start_log_probs
      return_dict["end_log_probs"] = end_log_probs


    # an additional layer to predict answerability
    # get the representation of CLS
    cls_index = tf.one_hot(cls_index, seq_len, axis=-1, dtype=tf.float32)
    cls_feature = tf.einsum("lbh,bl->bh", output, cls_index)

    # get the representation of START
    start_p = tf.nn.softmax(start_logits_masked, axis=-1,
                            name="softmax_start")
    start_feature = tf.einsum("lbh,bl->bh", output, start_p)

    # note(zhiliny): no dependency on end_feature so that we can obtain
    # one single `cls_logits` for each sample
    ans_feature = tf.concat([start_feature, cls_feature], -1)
    ans_feature = tf.layers.dense(
        ans_feature,
        xlnet_config.d_model,
        activation=tf.tanh,
        kernel_initializer=initializer, name="dense_0")
    ans_feature = tf.layers.dropout(ans_feature, 0.1,
                                    training=is_training)
    cls_logits = tf.layers.dense(
        ans_feature,
        1,
        kernel_initializer=initializer,
        name="dense_1",
        use_bias=False)
    cls_logits = tf.squeeze(cls_logits, -1)

    return_dict["cls_logits"] = cls_logits
    outputs.append(cls_logits)
    scopes.append(tf.get_variable_scope().name)
  return return_dict,outputs,scopes
Exemplo n.º 29
0
def two_stream_loss(FLAGS, features, labels, mems, is_training):
    """Pretraining loss with two-stream attention Transformer-XL."""

    #### Unpack input
    mem_name = "mems"
    mems = mems.get(mem_name, None)

    inp_k = tf.transpose(features["input_k"], [1, 0])
    inp_q = tf.transpose(features["input_q"], [1, 0])

    seg_id = tf.transpose(features["seg_id"], [1, 0])

    inp_mask = None
    perm_mask = tf.transpose(features["perm_mask"], [1, 2, 0])

    if FLAGS.num_predict is not None:
        # [num_predict x tgt_len x bsz]
        target_mapping = tf.transpose(features["target_mapping"], [1, 2, 0])
    else:
        target_mapping = None

    # target for LM loss
    tgt = tf.transpose(features["target"], [1, 0])

    # target mask for LM loss
    tgt_mask = tf.transpose(features["target_mask"], [1, 0])

    # construct xlnet config and save to model_dir
    xlnet_config = xlnet.XLNetConfig(FLAGS=FLAGS)
    xlnet_config.to_json(os.path.join(FLAGS.model_dir, "config.json"))

    # construct run config from FLAGS
    run_config = xlnet.create_run_config(is_training, False, FLAGS)

    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp_k,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask,
                                   mems=mems,
                                   perm_mask=perm_mask,
                                   target_mapping=target_mapping,
                                   inp_q=inp_q)

    output = xlnet_model.get_sequence_output()
    new_mems = {mem_name: xlnet_model.get_new_memory()}
    lookup_table = xlnet_model.get_embedding_table()

    initializer = xlnet_model.get_initializer()

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        # LM loss
        lm_loss = modeling.lm_loss(hidden=output,
                                   target=tgt,
                                   n_token=xlnet_config.n_token,
                                   d_model=xlnet_config.d_model,
                                   initializer=initializer,
                                   lookup_table=lookup_table,
                                   tie_weight=True,
                                   bi_data=run_config.bi_data,
                                   use_tpu=run_config.use_tpu)

    #### Quantity to monitor
    monitor_dict = {}

    if FLAGS.use_bfloat16:
        tgt_mask = tf.cast(tgt_mask, tf.float32)
        lm_loss = tf.cast(lm_loss, tf.float32)

    total_loss = tf.reduce_sum(lm_loss * tgt_mask) / tf.reduce_sum(tgt_mask)
    monitor_dict["total_loss"] = total_loss

    return total_loss, new_mems, monitor_dict
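The returned new_mems dict is meant to be threaded back into the next call on the same stream so Transformer-XL can attend over the cached segment. A minimal caller sketch under assumed feature batches:

# Hedged sketch: carry memory across consecutive segments of one stream.
# features_step1 / features_step2 are assumed pre-built feature dicts.
loss1, mems_1, _ = two_stream_loss(FLAGS, features_step1, None, {}, True)
loss2, mems_2, _ = two_stream_loss(FLAGS, features_step2, None, mems_1, True)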
Exemplo n.º 30
0
def main(_):

    assert tf.gfile.Exists(FLAGS.init_checkpoint)

    if not tf.gfile.Exists(FLAGS.output_dir):
        tf.gfile.MakeDirs(FLAGS.output_dir)

    processor = SubLocProcessor()

    labels = processor.get_labels()
    train_examples = processor.get_train_examples(FLAGS.data_dir)
    test_examples = processor.get_test_examples(FLAGS.data_dir)

    train_file_path = os.path.join(FLAGS.output_dir,
                                   get_basename(FLAGS.max_seq_length, "train"))
    test_file_path = os.path.join(FLAGS.output_dir,
                                  get_basename(FLAGS.max_seq_length, "test"))

    def tokenize_fn(text):
        text = preprocess_text(text)
        return encode_ids(text)

    # Create TF-Record for train examples
    file_based_convert_examples_to_features(train_examples, labels,
                                            FLAGS.max_seq_length, tokenize_fn,
                                            train_file_path)

    # Create TF-Record for test examples
    file_based_convert_examples_to_features(test_examples, labels,
                                            FLAGS.max_seq_length, tokenize_fn,
                                            test_file_path)

    train_set = get_dataset(train_file_path, FLAGS.max_seq_length,
                            FLAGS.batch_size)
    train_iter = train_set.make_one_shot_iterator()
    example = train_iter.get_next()

    inp = tf.transpose(example["input_ids"], [1, 0])
    seg_id = tf.transpose(example["segment_ids"], [1, 0])
    inp_mask = tf.transpose(example["input_mask"], [1, 0])

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(False, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)

    output = xlnet_model.get_sequence_output()

    init_from_checkpoint(FLAGS)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        try:
            while True:
                outs = sess.run(output)
                print(outs.shape)
        except tf.errors.OutOfRangeError:
            tf.logging.info("DONE")