Example #1
 def __init__(self, config: bert.BertConfig, is_training, num_units, inputs, segments, inputs_length=None,
              answers=None, answers_length=None, layers=None):
     if inputs_length is None:
         inputs_mask = tf.ones(tf.shape(inputs), dtype=tf.bool)
     else:
         inputs_mask = tf.sequence_mask(inputs_length)
     if answers is None:
         assert answers_length is None
     if layers is None:
         # Default to taking answers from the final encoder layer only.
         layers = [config.num_hidden_layers - 1]
     self.layers = layers
     self.bertmodel = bert.BertModel(config=config, is_training=is_training, input_ids=inputs,
                                     input_mask=inputs_mask, token_type_ids=segments, scope='bert')
     self._num_units = num_units
     outputs = self.bertmodel.get_all_encoder_layers()
     self.outputs = {i: outputs[i] for i in self.layers}
     #with tf.device("cpu:0"):
     self.answers_finders = {i: AnswerFinder(num_units, name="answer_layer_" + str(i))
                             for i in self.layers}
     self._logprobs = []
     for i, answer_finder in self.answers_finders.items():
         # Cast the token type ids to bool so that only second-segment tokens
         # remain unmasked as candidate answer positions.
         x = answer_finder(inputs=self.outputs[i], mask=tf.cast(segments, tf.bool))
         self._logprobs.append((i, x))
     self._logprobs = dict(self._logprobs)
     self._predicts = {i: predict(logprob) for i, logprob in self._logprobs.items()}
     if answers is not None:
         if answers_length is None:
             answers_mask = tf.ones(tf.shape(answers)[0:2], dtype=tf.bool)
         else:
             answers_mask = tf.sequence_mask(answers_length)
         self._losses = {i: loss(self._logprobs[i], answers, answers_mask) for i in self.layers}
         self._accuracy = {i: accuracy(self._predicts[i], answers, answers_mask) for i in self.layers}
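
The constructor above relies on helper callables AnswerFinder, predict, loss, and accuracy that are defined elsewhere in the source repository. As a rough illustration only (not the original helpers), a minimal predict could take the per-position argmax of the log-probabilities returned by the answer layer:

# Hypothetical sketch, assuming logprob carries one score per class in its last axis.
def predict(logprob):
    return tf.argmax(logprob, axis=-1, output_type=tf.int32)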
Example #2
 def _f():
     model_2 = bert.BertModel(config=config, trainable=True, name=name)
     inputs = tf.placeholder(shape=[None, None], dtype=tf.int32)
     mask = tf.placeholder(shape=[None, None], dtype=tf.int32)
     y = model_2(inputs, input_mask=mask)
     assigns = []
     variables = model_2.variables
     # Pair each variable with its lowercased name and sort both lists by name,
     # so the new model's variables line up positionally with the official BERT
     # variables (gb and official_bert_variables are defined elsewhere).
     transformer_variables = sorted(zip(
         (var.name.lower() for var in variables), variables),
                                    key=lambda t: t[0])
     off_bert_pairs = sorted(zip((var.name.lower() for var in gb),
                                 official_bert_variables),
                             key=lambda t: t[0])
     for i in range(len(transformer_variables)):
         assigns.append(
             tf.assign(transformer_variables[i][1], off_bert_pairs[i][1]))
     return model_2, assigns
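
A typical way to apply the returned assign ops (a minimal usage sketch, assuming a TF1 session and that _f above is callable in the current scope; not part of the original snippet):

# Hypothetical usage: build the model, then copy the official weights once.
model_2, assigns = _f()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(assigns)  # overwrite the fresh variables with the official BERT values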
Example #3
    def _disambiguation_layer(self, seqs):
        with tf.variable_scope('disambiguation'):
            word_embeddings = self._make_word_embeddings(seqs)

            model = bert.BertModel(self._disambiguation_bert_config,
                                   self._training, word_embeddings,
                                   self._padding)

            # (batch_size, sentence_len, embedding_size)
            reps = model.get_output()
            # (batch_size, sentence_len, n_senses)
            sense_probs = self._calculate_sense_probs(seqs, reps)

            # (batch_size, sentence_len, embedding_size)
            disambiguated_reps = self._make_word_embeddings(
                seqs, sense_weights=sense_probs)

            return disambiguated_reps, sense_probs
Example #4
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """Creates a classification model."""
    model = bert.BertModel(config=bert_config,
                           is_training=is_training,
                           input_ids=input_ids,
                           input_mask=input_mask,
                           token_type_ids=segment_ids,
                           use_one_hot_embeddings=use_one_hot_embeddings)

    # In the demo, we are doing a simple classification task on the entire
    # segment.
    #
    # If you want to use the token-level output, use model.get_sequence_output()
    # instead.
    output_layer = model.get_pooled_output()

    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable("output_bias", [num_labels],
                                  initializer=tf.zeros_initializer())

    # with tf.variable_scope("loss"):
    if is_training:
        # I.e., 0.1 dropout
        output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    probabilities = tf.nn.softmax(logits, axis=-1)
    predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

    return loss, per_example_loss, logits, probabilities, predictions
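
As the comment in create_model notes, model.get_sequence_output() exposes the token-level output instead of the pooled vector. A hypothetical sketch of a per-token classification head built on it (token_output_weights and the einsum projection are illustrative, not from the original code):

# Hypothetical sketch: per-token logits from the full sequence output.
sequence_output = model.get_sequence_output()  # (batch, seq_len, hidden_size)
hidden_size = sequence_output.shape[-1].value
token_output_weights = tf.get_variable(
    "token_output_weights", [num_labels, hidden_size],
    initializer=tf.truncated_normal_initializer(stddev=0.02))
token_logits = tf.einsum("bsh,nh->bsn", sequence_output, token_output_weights)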
Example #5
    def _f():
        config.num_hidden_layers = len(corresponding_blocks)
        model_2 = bert.BertModel(config=config, trainable=True, name=name)
        inputs = tf.placeholder(shape=[None, None], dtype=tf.int32)
        mask = tf.placeholder(shape=[None, None], dtype=tf.int32)
        y = model_2(inputs, input_mask=mask)
        assigns = []
        variables = model_2.variables

        def atoi(text):
            return int(text) if text.isdigit() else text

        transformer_variables = sorted(
            zip((var.name.lower() for var in variables), variables),
            key=lambda t: [atoi(c) for c in re.split(r'(\d+)', t[0])])
        off_bert_pairs = sorted(
            zip((var.name.lower() for var in gb), official_bert_variables),
            key=lambda t: [atoi(c) for c in re.split(r'(\d+)', t[0])])

        # Variable counts in the official BERT checkpoint layout:
        # 5 embedding variables (word/position/token-type embeddings + LayerNorm),
        # 16 variables per transformer layer, and 2 pooler variables.
        embedding_variables = 5
        layer_variables = 16
        pooling_variables = 2

        off_bert_pairs_by_block = [off_bert_pairs[0:pooling_variables]]
        for j in range(num_blocks):
            off_bert_pairs_by_block += [
                off_bert_pairs[pooling_variables +
                               j * layer_variables:pooling_variables +
                               (j + 1) * layer_variables]
            ]
        off_bert_pairs_by_block += [off_bert_pairs[-embedding_variables:]]

        transformer_variables_by_block = [
            transformer_variables[0:pooling_variables]
        ]
        for j in range(len(corresponding_blocks)):
            transformer_variables_by_block += [
                transformer_variables[pooling_variables +
                                      j * layer_variables:pooling_variables +
                                      (j + 1) * layer_variables]
            ]
        transformer_variables_by_block += [
            transformer_variables[-embedding_variables:]
        ]

        for j in range(len(corresponding_blocks) + 2):
            if j == 0:
                for k in range(pooling_variables):
                    assigns.append(
                        tf.assign(transformer_variables_by_block[j][k][1],
                                  off_bert_pairs_by_block[j][k][1]))
            elif j == len(corresponding_blocks) + 2 - 1:
                for k in range(embedding_variables):
                    assigns.append(
                        tf.assign(transformer_variables_by_block[j][k][1],
                                  off_bert_pairs_by_block[j][k][1]))
            else:
                for k in range(layer_variables):
                    assigns.append(
                        tf.assign(
                            transformer_variables_by_block[j][k][1],
                            off_bert_pairs_by_block[
                                corresponding_blocks[j]][k][1]))

        return model_2, assigns
Example #6
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" %
                            (name, features[name].shape))

        # MTF setup.
        graph = mtf.Graph()
        # mesh_shape = mtf.convert_to_shape(FLAGS.mesh_shape)
        # layout_rules = mtf.convert_to_layout_rules(FLAGS.layout)
        if FLAGS.mode == "auto_parallel":
            mesh_shape_map = {
                1: [("processor_rows", 1)],
                2: [("processor_rows", 2)],
                4: [("processor_rows", 2), ("processor_cols", 2)],
                8: [("processor_rows", 2), ("processor_cols", 4)]
            }
        elif FLAGS.mode == "data_parallel":
            mesh_shape_map = {
                1: [("processor_rows", 1)],
                2: [("processor_rows", 2)],
                4: [("processor_rows", 4)],
                8: [("processor_rows", 8)]
            }
        else:
            raise ValueError

        mesh_shape = mesh_shape_map[FLAGS.gpu_num]
        devices = [f"gpu:{i}" for i in range(FLAGS.gpu_num)]

        var_placer = None
        mesh = mtf.Mesh(graph, "bert_mesh", var_placer)
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        masked_lm_positions = features["masked_lm_positions"]
        masked_lm_ids = features["masked_lm_ids"]
        masked_lm_weights = features["masked_lm_weights"]
        next_sentence_labels = tf.squeeze(features["next_sentence_labels"], 1)

        batch_size = input_ids.get_shape()[0].value
        batch_dim = mtf.Dimension("batch", batch_size)

        seq_length = input_ids.get_shape()[1].value
        seq_dim = mtf.Dimension("seq", seq_length)
        max_predictions_per_seq = masked_lm_positions.get_shape()[1].value
        max_predictions_per_seq_dim = mtf.Dimension("max_pred_seq",
                                                    max_predictions_per_seq)

        mtf_input_ids = mtf.import_tf_tensor(mesh, input_ids,
                                             [batch_dim, seq_dim])
        mtf_input_mask = mtf.import_tf_tensor(mesh, input_mask,
                                              [batch_dim, seq_dim])
        mtf_segment_ids = mtf.import_tf_tensor(mesh, segment_ids,
                                               [batch_dim, seq_dim])
        mtf_masked_lm_positions = mtf.import_tf_tensor(
            mesh, masked_lm_positions,
            [batch_dim, max_predictions_per_seq_dim])
        mtf_masked_lm_ids = mtf.import_tf_tensor(
            mesh, masked_lm_ids, [batch_dim, max_predictions_per_seq_dim])

        mtf_masked_lm_weights = mtf.import_tf_tensor(
            mesh, masked_lm_weights, [batch_dim, max_predictions_per_seq_dim])
        mtf_next_sentence_labels = mtf.import_tf_tensor(
            mesh, next_sentence_labels, [batch_dim])

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        model = bert_lib.BertModel(config=bert_config,
                                   is_training=is_training,
                                   input_ids=mtf_input_ids,
                                   input_mask=mtf_input_mask,
                                   token_type_ids=mtf_segment_ids,
                                   mesh_shape=mesh_shape)

        (masked_lm_loss, masked_lm_example_loss,
         masked_lm_logits) = model.get_masked_lm_output(
             mtf_masked_lm_positions, mtf_masked_lm_ids, mtf_masked_lm_weights)

        (next_sentence_loss, next_sentence_example_loss, next_sentence_logits
         ) = model.get_next_sentence_output(mtf_next_sentence_labels)

        extra_loss = model.get_extra_loss()

        total_loss = masked_lm_loss + next_sentence_loss
        total_loss = mtf.anonymize(total_loss)
        masked_lm_example_loss = mtf.anonymize(masked_lm_example_loss)
        masked_lm_logits = mtf.anonymize(masked_lm_logits)
        next_sentence_example_loss = mtf.anonymize(next_sentence_example_loss)
        next_sentence_logits = mtf.anonymize(next_sentence_logits)

        outputs = [total_loss]
        if FLAGS.mode == "auto_parallel":
            layout_rules = mtf.auto_mtf.layout(graph, mesh_shape, outputs)
        elif FLAGS.mode == "data_parallel":
            layout_rules = [('batch', 'processor_rows')]
        else:
            raise ValueError

        variables = graph._all_variables
        for v in variables:
            tf.logging.info(
                "[parameter] (name,shape,dtype): ({},{},{})".format(
                    v.name, v.shape, v.dtype.master_dtype))
        tf.logging.info("layout rules: {}".format(layout_rules))
        mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
            mesh_shape, layout_rules, devices)
        # TRAIN mode
        if mode == tf.estimator.ModeKeys.TRAIN:
            _, update_ops = optimization_lib.create_optimizer(
                total_loss + extra_loss,
                learning_rate,
                num_train_steps,
                num_warmup_steps,
                optimizer=FLAGS.optimizer,
                clip_gradients=FLAGS.clip_gradients)

        lowering = mtf.Lowering(graph, {mesh: mesh_impl})

        tf_loss = tf.to_float(lowering.export_to_tf_tensor(total_loss))

        if mode == tf.estimator.ModeKeys.TRAIN:
            global_step = tf.train.get_global_step()
            tf_update_ops = [
                lowering.lowered_operation(op) for op in update_ops
            ]
            tf_update_ops.append(tf.assign_add(global_step, 1))
            tf.logging.info("tf_update_ops: {}".format(tf_update_ops))
            train_op = tf.group(tf_update_ops)

        with mtf.utils.outside_all_rewrites():
            # Copy master variables to slices. Must be called first.
            restore_hook = mtf.MtfRestoreHook(lowering)
            if mode == tf.estimator.ModeKeys.TRAIN:
                saver = tf.train.Saver(tf.global_variables(),
                                       sharded=True,
                                       max_to_keep=10,
                                       keep_checkpoint_every_n_hours=2,
                                       defer_build=False,
                                       save_relative_paths=True)
                tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
                # saver_listener = mtf.MtfCheckpointSaverListener(lowering)
                # saver_hook = tf.train.CheckpointSaverHook(
                #     FLAGS.output_dir,
                #     save_steps=1000,
                #     saver=saver,
                #     listeners=[saver_listener])

                return tf.estimator.EstimatorSpec(
                    tf.estimator.ModeKeys.TRAIN,
                    loss=tf_loss,
                    train_op=train_op,
                    training_hooks=[restore_hook])
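
Assuming the usual TF1 Estimator setup (run_config, train_input_fn, and num_train_steps are placeholders defined elsewhere), the model_fn above would typically be wired up as follows; a minimal sketch, not from the original source:

# Hypothetical wiring for the model_fn defined above.
estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)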
Example #7
print("------------------------------------")
print("Prepared data...")
print("Number of labels: ", len(labels))
print("Number of training examples: ", len(training_examples))
print("Number of training steps: ", num_training_steps)
print("Number of evaluation examples: ", len(evaluation_examples))
print("Number of evaluation steps: ", num_evaluation_steps)
print("Number of test examples: ", len(test_examples))
print("Number of test steps: ", num_test_steps)

# Define BERT Model
print("------------------------------------")
print("Define BERT Model...")
model = bert.BertModel(config=bert_config,
                       is_training=True,
                       input_ids=input_ids,
                       input_mask=input_mask,
                       token_type_ids=segment_ids,
                       use_one_hot_embeddings=False)

# In the demo, we are doing a simple classification task on the entire segment.
# If you want to use the token-level output, use model.get_sequence_output() instead.
output_layer = model.get_pooled_output()

hidden_size = output_layer.shape[-1].value

output_weights = tf.get_variable(
    "output_weights", [len(labels), hidden_size],
    initializer=tf.truncated_normal_initializer(stddev=0.02))

output_bias = tf.get_variable("output_bias", [len(labels)],
                              initializer=tf.zeros_initializer())
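
The snippet stops after creating the output projection; the usual continuation, mirroring create_model in Example #4, computes logits and a cross-entropy loss. A sketch in which label_ids is a hypothetical int32 tensor of per-example label indices:

# Sketch of the standard classification-head continuation (mirrors Example #4).
logits = tf.matmul(output_layer, output_weights, transpose_b=True)
logits = tf.nn.bias_add(logits, output_bias)
log_probs = tf.nn.log_softmax(logits, axis=-1)
one_hot_labels = tf.one_hot(label_ids, depth=len(labels), dtype=tf.float32)
per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
loss = tf.reduce_mean(per_example_loss)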
Example #8
 def _prediction_layer(self, reps):
     with tf.variable_scope('prediction'):
         model = bert.BertModel(self._prediction_bert_config,
                                self._training, reps, self._padding)
         return model.get_output()
Example #9
def create_model(bert_config, is_training, use_pcnn, input_ids, input_mask,
                 head_ids, tail_ids, num_labels, use_one_hot_embeddings,
                 segment_mask, position1, position2):
    model = bert.BertModel(config=bert_config,
                           is_training=is_training,
                           input_ids=input_ids,
                           input_mask=input_mask,
                           head_ids=head_ids,
                           tail_ids=tail_ids,
                           use_one_hot_embeddings=use_one_hot_embeddings)

    output_layer = model.get_sequence_output()

    head_embedding, neg_head_embedding = model.get_head_embedding()
    tail_embedding, neg_tail_embedding = model.get_tail_embedding()

    pos_embedding = network.pos_embedding(
        position1,
        position2,
        pos_embedding_dim=FLAGS.pos_embedding_dim,
        max_length=FLAGS.max_seq_length)
    output_layer = tf.concat([output_layer, pos_embedding], -1)

    # [batch_size, hidden_size]
    sentence_embedding = tf.layers.conv1d(
        inputs=output_layer,
        filters=bert_config.hidden_size,
        kernel_size=3,
        strides=1,
        padding="same",
        kernel_initializer=tf.contrib.layers.xavier_initializer())
    if use_pcnn:
        # Piecewise max pooling (PCNN): segment_mask assigns each token to one of
        # three pieces (0 = padding). Adding +100 to the active piece before the
        # max and subtracting it afterwards pools each piece separately.
        mask_embedding = tf.constant(
            [[0, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=tf.float32)
        mask = tf.nn.embedding_lookup(mask_embedding, segment_mask)
        sentence_embedding = tf.reduce_max(tf.expand_dims(
            mask * 100, 2) + tf.expand_dims(sentence_embedding, 3),
                                           axis=1) - 100
        return tf.reshape(sentence_embedding,
                          [-1, bert_config.hidden_size * 3])
    else:
        sentence_embedding = tf.reduce_max(sentence_embedding, axis=-2)

    # sentence_embedding = network.encoder(output_layer, segment_mask, bert_config.hidden_size, use_pcnn)

    output_weights = tf.get_variable(
        "output_weights", [num_labels, bert_config.hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable("output_bias", [num_labels],
                                  initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            sentence_embedding = tf.nn.dropout(sentence_embedding,
                                               keep_prob=0.9)
        positive = tf.add(head_embedding, tail_embedding)
        positive = abs(tf.add(positive, -sentence_embedding))
        positive = tf.reduce_sum(positive, axis=1, keep_dims=True)
        negative = tf.add(neg_head_embedding, neg_tail_embedding)
        negative = abs(tf.add(negative, -sentence_embedding))
        negative = tf.reduce_sum(negative, axis=1, keep_dims=True)

        per_trans_loss = tf.maximum(positive - negative + FLAGS.margin, 0)
        total_trans_loss = tf.reduce_mean(per_trans_loss)

        logits = tf.matmul(sentence_embedding,
                           output_weights,
                           transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        probabilities = tf.nn.softmax(logits)

        return per_trans_loss, total_trans_loss, logits, probabilities
Example #10
import tensorflow as tf
import bert
import HP

if __name__ == "__main__":
    bert_config = bert.BertConfig.from_json_file(HP.bert_config)
    # Dummy input_ids placeholder used only to build the graph; BERT expects
    # integer token ids, so the placeholder must be int32 rather than float32.
    x = tf.placeholder(dtype=tf.int32, shape=[1, 1])
    model = bert.BertModel(bert_config, False, x, scope='bert')
    config = tf.ConfigProto(device_count={'GPU': 0})  # run on CPU only
    sess = tf.Session(config=config)
    # saver1 restores with the default variable-name mapping; saver2 re-saves the
    # same weights keyed by the full tensor names.
    saver1 = tf.train.Saver()
    saver2 = tf.train.Saver({v.name: v for v in tf.global_variables()})
    saver1.restore(sess, HP.start1_checkpoint)
    saver2.save(sess, HP.start1_checkpoint)