Exemplo n.º 1
0
    def one_column_cached_transformer(self, decoder_input, cached_layers):
        """Run a single incremental decoding step against cached layers.

        Args:
            decoder_input: Input for the step being decoded; the shape
                annotations in this method treat it as [batch, 1, hid_size].
            cached_layers: Previously computed layers. Axis 1 of the first
                entry is read statically as the number of positions decoded
                so far.

        Returns:
            The last element of the result of
            `modeling.cached_transformer_model`, annotated as
            [batch, 1, hid_size].
        """
        hp = self.hparams
        # The static cache length doubles as the position id of the new column.
        steps_so_far = cached_layers[0].shape.as_list()[1]

        with tf.variable_scope('embeddings', reuse=tf.AUTO_REUSE):
            # Positional embedding for exactly one position id.
            position_vec, _ = modeling.embedding_lookup(
                input_ids=tf.constant([steps_so_far]),  # [1]
                vocab_size=hp.max_premise,  # >= premise_len
                embedding_size=hp.hidden_size,
                initializer_range=hp.initializer_range,
                word_embedding_name='positional_embedding',
            )
            position_vec = tf.reshape(position_vec, [1, 1, hp.hidden_size])

            # [batch, 1, hid_size] + [1, 1, hid_size] -> [batch, 1, hid_size]
            embedded_step = decoder_input + position_vec
            decoder_input = modeling.layer_norm_and_dropout(
                embedded_step, hp.dropout_prob)

        with tf.variable_scope('transformer', reuse=tf.AUTO_REUSE):
            # The new column may attend to every cached position and to
            # itself, hence an all-ones mask of [batch, 1, steps_so_far + 1].
            n_batch = tf.shape(decoder_input)[0]
            attend_all_mask = tf.ones([n_batch, 1, steps_so_far + 1])

            activation_fn = modeling.get_activation(hp.hidden_act)
            dropout_prob = hp.dropout_prob

            layer_outputs = modeling.cached_transformer_model(
                input_vector=decoder_input,
                cached_layers=cached_layers,
                attention_mask=attend_all_mask,
                hidden_size=hp.hidden_size,
                num_hidden_layers=hp.num_decode_layers,
                num_attention_heads=hp.num_attention_heads,
                intermediate_size=hp.intermediate_size,
                intermediate_act_fn=activation_fn,
                hidden_dropout_prob=dropout_prob,
                attention_probs_dropout_prob=dropout_prob,
                initializer_range=hp.initializer_range,
                do_return_all_layers=True,
                attention_top_k=hp.attention_top_k,
                densify_attention_mask=hp.densify_attention_mask)

            # Only the top layer is returned: [batch, 1, hid_size]
            return layer_outputs[-1]
Exemplo n.º 2
0
def bert_crf(bert_config, is_training, input_ids, segment_ids, input_mask,
             label_ids, sequence_length, num_labels, use_one_hot_embeddings):
    """Combine bert + layer norm + linear projection + crf_layer.

    :param bert_config: forwarded to ``bert``
    :param is_training: train state; dropout is only applied when truthy
    :type is_training: bool
    :param input_ids: forwarded to ``bert``; its first dimension is used as
                      the batch size
    :param segment_ids: forwarded to ``bert``
    :param input_mask: forwarded to ``bert``
    :param label_ids: forwarded to ``crf_layer``
    :param sequence_length: forwarded to ``crf_layer``
    :param num_labels: output width of the linear layer, also forwarded to
                       ``crf_layer``
    :type num_labels: int
    :param use_one_hot_embeddings: forwarded to ``bert``
    :return: whatever ``crf_layer`` returns
    """
    batch_size = tf.shape(input_ids)[0]
    bert_out = bert(bert_config, is_training, input_ids, input_mask,
                    segment_ids, use_one_hot_embeddings)

    # Take the feature width from the encoder output's static shape so the
    # head is not tied to 768-wide encoders. The previous hard-coded value
    # is kept as a fallback for when the static width is unknown.
    static_shape = bert_out.shape
    hidden_size = static_shape.as_list()[-1] if static_shape.ndims else None
    if hidden_size is None:
        hidden_size = 768

    if is_training:
        bert_out = layer_norm_and_dropout(bert_out, 0.5)
    else:
        bert_out = layer_norm(bert_out)

    # Flatten all leading dimensions so the linear layer sees one row per
    # position.
    bert_out = tf.reshape(bert_out, [-1, hidden_size])
    linear_out = linear_layer(bert_out, hidden_size, num_labels, "linear")
    # NOTE(review): max_seq_length is neither a parameter nor a local here;
    # it has to be supplied by the enclosing module -- confirm.
    crf_out = crf_layer(linear_out, label_ids, batch_size, sequence_length,
                        num_labels, max_seq_length, "crf")
    return crf_out
Exemplo n.º 3
0
def bert_blstm_crf(bert_config, is_training, input_ids, segment_ids,
                   input_mask, label_ids, sequence_length, num_labels,
                   use_one_hot_embeddings):
    """Combine bert + blstm + crf_layer.

    :param bert_config: bert_config from model config file
    :type bert_config: dict
    :param is_training: train state
    :type is_training: bool
    :param input_ids: input text ids for each char
    :type input_ids: list
    :param segment_ids: 0 for first sentence and 1 for second sentence,
                        for this task, all is 0, length is max_seq_length
    :type segment_ids: list
    :param input_mask: mask for sentence to suit bert model,
                        for this task, all is 1, length is max_seq_length
    :type input_mask: list
    :param label_ids: BIO label ids
    :type label_ids: list
    :param sequence_length: sequence length for each input sentence before padding
    :type sequence_length: list, [length_sentence1, 2,..]
    :param num_labels: number of BIO labels
    :type num_labels: int
    :param use_one_hot_embeddings: whether to use one-hot embeddings
    :type use_one_hot_embeddings: bool
    :return: the output of ``crf_layer``; described by the original author as
             total_loss, per_example_loss, logits for ner, pred_ids using viterbi
    :rtype: tuple
    """

    batch_size = tf.shape(input_ids)[0]
    bert_out = bert(bert_config, is_training, input_ids, input_mask,
                    segment_ids, use_one_hot_embeddings)
    y_pred = blstm(is_training, bert_out)
    if is_training:
        y_pred = layer_norm_and_dropout(y_pred, 0.5)
    else:
        # Normalize the same tensor as the training branch does. Previously
        # bert_out was normalized here and then never used, so the tensor
        # fed to the linear layer was un-normalized at inference.
        y_pred = layer_norm(y_pred)

    # Prefer the statically known feature width (a plain int, as bert_crf
    # passes to linear_layer); keep the dynamic value only as a fallback.
    static_shape = y_pred.shape
    static_width = static_shape.as_list()[-1] if static_shape.ndims else None
    if static_width is not None:
        hidden_size = static_width
    else:
        hidden_size = tf.shape(y_pred)[-1]

    blstm_out = linear_layer(y_pred, hidden_size, num_labels, "linear")
    # NOTE(review): max_seq_length is neither a parameter nor a local here;
    # it has to be supplied by the enclosing module -- confirm.
    crf_out = crf_layer(blstm_out, label_ids, batch_size, sequence_length,
                        num_labels, max_seq_length, "crf")
    return crf_out
Exemplo n.º 4
0
    def body(self, features):
        """Build the encoder/decoder graph and return ``(logits, losses)``.

        When ``features`` has no 'targets' entry, the encoder output is handed
        to ``greedy_decode_8steps`` and its result is returned with an extra
        'attention_weights' entry. Otherwise a teacher-forced decoder is built
        and theorem/premise cross-entropy losses are computed.

        Args:
            features: Mapping passed to ``build_encoder``. On the path with
                targets, the keys 'targets', 'theorem' and 'depth' are read
                from it as well.

        Returns:
            A pair ``(logits, losses)``. With targets, ``logits`` is a dict
            with keys theorem_logits, theorem_labels, premise_logits and
            premise_labels, and ``losses`` is a dict with keys training,
            theorem_loss and premise_loss. Without targets, both come from
            ``greedy_decode_8steps``.
        """
        hparams = self.hparams
        if not self.is_training:
            # hparams is the same object as self.hparams, so this write is
            # visible everywhere that object is shared.
            hparams.dropout_prob = 0.0

        with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
            # attention_weights: [batch, n_head, from_len, to_len]
            sequence_output, cls_vector, attention_weights = self.build_encoder(
                features)

        if 'targets' not in features:
            # No targets: greedy decoding path, returns early.
            assert self.hparams.dropout_prob == 0.0
            logits, losses = self.greedy_decode_8steps(cls_vector,
                                                       sequence_output)
            # Keep only index 0 along the from_len axis of the weights.
            logits.update(attention_weights=attention_weights[:, :, 0, :])
            return logits, losses

        with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
            with tf.variable_scope('embeddings', reuse=tf.AUTO_REUSE):
                premise = features[
                    'targets']  # [batch, premise_len=8] -bad naming:(
                # [batch, premise_len, hid_size]
                premise_vecs = premise_gather_nd(sequence_output, premise)

                batch_size = tf.shape(premise)[0]
                premise_len = premise.shape.as_list()[-1]
                theorem = features['theorem']  # batch, 1

                # [batch, 1, hid_size] and [num_theorems, hid_size]
                theorem_vec, theorem_emb_table = modeling.embedding_lookup(
                    input_ids=theorem,  # [batch, 1]
                    vocab_size=hparams.num_theorems,
                    embedding_size=hparams.hidden_size,
                    initializer_range=hparams.initializer_range,
                    word_embedding_name='theorem_embedding',
                )
                # NOTE(review): depth is read here but not used anywhere
                # below in this method.
                depth = features['depth']  # batch, 1

                # Decoder input is [cls, theorem, premise[:-1]]: the last
                # premise vector is dropped, the two prefix vectors are added.
                decoder_input = tf.concat(
                    [
                        cls_vector,  # [batch, 1, hid_size]
                        theorem_vec,  # [batch, 1, hid_size]
                        premise_vecs[:, :
                                     -1, :]  # [batch, premise_len-1, hid_size]
                    ],
                    axis=1)  # [batch, premise_len + 1, hid_size]
                decode_length = decoder_input.shape.as_list()[1]
                assert decode_length == premise_len + 1

                # [decode_length, hid_size]
                # NOTE(review): ids run over range(decode_length), i.e. up to
                # premise_len, so max_premise presumably has to be at least
                # premise_len + 1 -- confirm.
                pos_embedding, _ = modeling.embedding_lookup(
                    input_ids=tf.range(decode_length),  # [decode_length]
                    vocab_size=hparams.max_premise,  # >= premise_len
                    embedding_size=hparams.hidden_size,
                    initializer_range=hparams.initializer_range,
                    word_embedding_name='positional_embedding',
                )
                pos_embedding = tf.reshape(
                    pos_embedding, [1, decode_length, hparams.hidden_size])

                decoder_input = modeling.layer_norm_and_dropout(
                    decoder_input +  # [batch, decode_length, hid_size]
                    pos_embedding,  # [1,     decode_length, hid_size]
                    hparams.dropout_prob)  # [batch, decode_length, hid_size]

            with tf.variable_scope('transformer', reuse=tf.AUTO_REUSE):
                causal_attention_mask = t2t_model.common_layers.ones_matrix_band_part(
                    rows=decode_length,
                    cols=decode_length,
                    num_lower=-1,  # attend to everything before
                    num_upper=0,  # attend to nothing after
                    out_shape=[1, decode_length, decode_length
                               ])  # 1, decode_length, decode_length

                # [batch, decode_length, decode_length]
                causal_attention_mask = tf.tile(causal_attention_mask,
                                                [batch_size, 1, 1])

                all_decoder_layers = modeling.transformer_model(
                    input_tensor=decoder_input,
                    attention_mask=causal_attention_mask,
                    hidden_size=hparams.hidden_size,
                    num_hidden_layers=hparams.num_decode_layers,
                    num_attention_heads=hparams.num_attention_heads,
                    intermediate_size=hparams.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        hparams.hidden_act),
                    hidden_dropout_prob=hparams.dropout_prob,
                    attention_probs_dropout_prob=hparams.dropout_prob,
                    initializer_range=hparams.initializer_range,
                    do_return_all_layers=True,
                    attention_top_k=hparams.attention_top_k)

                # The last entry is unpacked as a pair; only its first
                # element is used.
                decoder_output, _ = all_decoder_layers[
                    -1]  # [batch, dec_len, hid_size]
                # Position 0 feeds the theorem prediction, the remaining
                # positions feed the premise predictions.
                theorem_feature = decoder_output[:, 0, :]  # [batch, hid_size]
                premise_feature = decoder_output[:,
                                                 1:, :]  # [batch, tar_len, hid_size]

        with tf.variable_scope('prediction', reuse=tf.AUTO_REUSE):
            theorem_logits = tf.keras.layers.Dense(  # [batch, num_theorems]
                name='theorem',
                units=hparams.num_theorems,
                use_bias=True,
                kernel_initializer=modeling.create_initializer(
                    hparams.initializer_range))(theorem_feature)

            # Premise logits are dot products between each decoder position
            # and every encoder position.
            premise_logits = tf.matmul(
                a=premise_feature,  # [batch, premise_len, hid_size]
                b=sequence_output,  # [batch, sequence_len, hid_size]
                transpose_b=True,
            )  # [batch, premise_len, sequence_len]

            # [batch * premise_len, sequence_len]
            seq_len = premise_logits.shape.as_list()[-1]
            premise_logits = tf.reshape(premise_logits, [-1, seq_len])

            # Only positions whose target id is > 0 contribute to the premise
            # loss (presumably id 0 marks padding -- confirm).
            premise_weights = tf.cast(premise > 0,
                                      tf.float32)  # [batch, prem_len]
            premise_weights = tf.reshape(premise_weights,
                                         [-1])  # [batch * prem_len]
            premise = tf.reshape(premise, [-1, 1])  # [batch * prem_len, 1]

            theorem_loss = tf.losses.sparse_softmax_cross_entropy(
                labels=theorem,  # [batch, 1]
                logits=theorem_logits  # [batch, num_theorems]
            )
            premise_loss = tf.losses.sparse_softmax_cross_entropy(
                labels=premise,  # [batch * premise_len, 1]
                logits=premise_logits,  # [batch * premise_len, sequence_len]
                weights=premise_weights  # [batch * premise_len]
            )

            logits = dict(theorem_logits=theorem_logits,
                          theorem_labels=theorem,
                          premise_logits=premise_logits,
                          premise_labels=premise)

            # 'training' is the unweighted sum of the two losses.
            losses = dict(training=theorem_loss + premise_loss,
                          theorem_loss=theorem_loss,
                          premise_loss=premise_loss)

        return logits, losses
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """Creates a classification model: BERT + stacked attention + softmax.

    The BERT sequence output is passed through a forward and a
    sequence-reversed attention layer, the two results are concatenated and
    attended over once more, averaged over the sequence axis, projected by a
    tanh dense layer and finally classified.

    Args:
        bert_config: Config passed to `modeling.BertModel`; its
            `hidden_dropout_prob`, `hidden_size` and `initializer_range`
            are also read here.
        is_training: Whether dropout is applied.
        input_ids: Passed to `modeling.BertModel`.
        input_mask: Passed to `modeling.BertModel`.
        segment_ids: Passed to `modeling.BertModel` as `token_type_ids`.
        labels: Class ids, one-hot encoded with depth `num_labels`.
        num_labels: Number of output classes.
        use_one_hot_embeddings: Passed to `modeling.BertModel`.

    Returns:
        Tuple `(loss, per_example_loss, logits, probabilities)`.
    """
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)

    # In the demo, we are doing a simple classification task on the entire
    # segment.
    #
    # If you want to use the token-level output, use model.get_sequence_output()
    # instead.
    # output_layer = model.get_pooled_output()
    output_layer = model.get_sequence_output()
    # Alternative head: BiLSTM layer after BERT
    # with tf.variable_scope('bilstm'):
    #     cell_fw = tf.contrib.rnn.BasicLSTMCell(512)
    #     cell_bw = tf.contrib.rnn.BasicLSTMCell(512)
    #     lstm_out, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=cell_fw, cell_bw=cell_bw,
    #                                                   inputs=output_layer,
    #                                                   dtype=tf.float32)
    #     lstm_output = tf.concat(lstm_out, 2)

    # Alternative head: CNN + attention after BERT
    # output_layer_expand = tf.expand_dims(output_layer, axis=3)
    # with tf.variable_scope('cnn'):
    #     filter_shape = [3, output_layer.shape[2], 1, 512]
    #     w = tf.get_variable('w', shape=filter_shape, initializer=tf.truncated_normal_initializer())
    #     b = tf.get_variable('b', shape=[512], initializer=tf.zeros_initializer())
    #     conv = tf.nn.conv2d(output_layer_expand, w, strides=[1, 1, 1, 1], padding='VALID')
    #     conv = tf.squeeze(conv, axis=2)
    #     conv = modeling.layer_norm_and_dropout(conv, bert_config.hidden_dropout_prob)
    #
    # # attention_mask = modeling.create_attention_mask_from_input_mask(input_ids, input_mask)
    # att_output = modeling.attention_layer(from_tensor=conv, to_tensor=conv)
    # att_output = tf.reduce_mean(att_output, axis=1)
    # att_output = modeling.layer_norm_and_dropout(att_output, bert_config.hidden_dropout_prob)

    # Active head: BERT + bidirectional attention
    # NOTE(review): none of the attention layers below receives an attention
    # mask, and the mean further down averages over every position of the
    # sequence axis -- confirm that including padded positions is intended.
    output_layer_reverse = output_layer[:, ::-1, :]
    with tf.variable_scope('att_fw'):
        att_output = modeling.attention_layer(from_tensor=output_layer,
                                              to_tensor=output_layer)
    with tf.variable_scope('att_bw'):
        att_output_reverse = modeling.attention_layer(
            from_tensor=output_layer_reverse, to_tensor=output_layer_reverse)
    att_output_mix = tf.concat([att_output, att_output_reverse], axis=2)
    with tf.variable_scope('att_final'):
        att_output_final = modeling.attention_layer(from_tensor=att_output_mix,
                                                    to_tensor=att_output_mix)

    # Dropout must only be active while training. The caller's bert_config
    # still carries the training-time probability at eval/predict time, so
    # gate it here exactly like the classifier dropout in the "loss" scope.
    hidden_dropout_prob = (bert_config.hidden_dropout_prob
                           if is_training else 0.0)
    att_output_final = modeling.layer_norm_and_dropout(
        att_output_final, hidden_dropout_prob)
    att_output_final = tf.reduce_mean(att_output_final, axis=1)

    with tf.variable_scope("pooler"):
        # "Pool" the model by feeding the sequence-averaged attention output
        # through a tanh dense layer of width bert_config.hidden_size.
        # last_token_tensor = lstm_output[:, -1, :]
        last_token_tensor = att_output_final

        pooled_output = tf.layers.dense(
            last_token_tensor,
            bert_config.hidden_size,
            activation=tf.tanh,
            kernel_initializer=modeling.create_initializer(
                bert_config.initializer_range))

    #hidden_size = output_layer.shape[-1].value
    output_layer = pooled_output
    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable("output_bias", [num_labels],
                                  initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        # Cross-entropy against the one-hot labels, averaged over the batch.
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

        return (loss, per_example_loss, logits, probabilities)