Example #1
    def test_same_result(self):
        base_location = './google_bert/downloads/multilingual_L-12_H-768_A-12/'
        bert_config = BertConfig.from_json_file(base_location +
                                                'bert_config.json')
        init_checkpoint = base_location + 'bert_model.ckpt'

        def model_fn_builder(bert_config, init_checkpoint):
            """Returns `model_fn` closure for TPUEstimator."""
            def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
                """The `model_fn` for TPUEstimator."""

                unique_ids = features["unique_ids"]
                input_ids = features["input_ids"]
                input_mask = features["input_mask"]
                input_type_ids = features["input_type_ids"]

                model = BertModel(config=bert_config,
                                  is_training=False,
                                  input_ids=input_ids,
                                  input_mask=input_mask,
                                  token_type_ids=input_type_ids,
                                  use_one_hot_embeddings=False)

                if mode != tf.estimator.ModeKeys.PREDICT:
                    raise ValueError("Only PREDICT modes are supported: %s" %
                                     (mode))

                tvars = tf.trainable_variables()
                scaffold_fn = None
                (assignment_map, _) = get_assignment_map_from_checkpoint(
                    tvars, init_checkpoint)
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

                predictions = {
                    "unique_id": unique_ids,
                    "seq_out": model.get_sequence_output()
                }

                output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    predictions=predictions,
                    scaffold_fn=scaffold_fn)
                return output_spec

            return model_fn

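        # Random token ids, attention masks, and segment ids shared by the TF estimator and the Keras model below.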
        batch_size = 8
        seq_len = 5
        xmb = np.random.randint(106, bert_config.vocab_size - 106,
                                (batch_size, seq_len))
        xmb2 = np.random.randint(0, 2, (batch_size, seq_len), dtype=np.int32)
        xmb3 = np.random.randint(0, 2, (batch_size, seq_len), dtype=np.int32)

        def input_fn(params):
            d = tf.data.Dataset.from_tensor_slices({
                "unique_ids":
                tf.constant([0, 1, 2], shape=[batch_size], dtype=tf.int32),
                "input_ids":
                tf.constant(xmb, shape=[batch_size, seq_len], dtype=tf.int32),
                "input_mask":
                tf.constant(xmb2, shape=[batch_size, seq_len], dtype=tf.int32),
                "input_type_ids":
                tf.constant(xmb3, shape=[batch_size, seq_len], dtype=tf.int32),
            })

            d = d.batch(batch_size=batch_size, drop_remainder=False)
            return d

        model_fn = model_fn_builder(bert_config=bert_config,
                                    init_checkpoint=init_checkpoint)
        is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
        run_config = tf.contrib.tpu.RunConfig(
            master=None,
            tpu_config=tf.contrib.tpu.TPUConfig(
                num_shards=8, per_host_input_for_training=is_per_host))
        estimator = tf.contrib.tpu.TPUEstimator(use_tpu=False,
                                                model_fn=model_fn,
                                                config=run_config,
                                                predict_batch_size=batch_size)
        tf_result = [r for r in estimator.predict(input_fn)]

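        # Rebuild the same checkpoint as a Keras model and run it on identical inputs.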
        import tensorflow.keras.backend as K

        K.set_learning_phase(0)
        my_model = load_google_bert(base_location, max_len=seq_len)

        from data.dataset import create_attention_mask, generate_pos_ids

        pos = generate_pos_ids(batch_size, seq_len)
        k_mask = create_attention_mask(xmb2, False, None, None, True)
        bert_encoder = BERTTextEncoder(base_location + 'vocab.txt')
        for b in range(len(xmb)):
            xmb[b] = np.array(bert_encoder.standardize_ids(xmb[b].tolist()))
        k_output = my_model.predict([xmb, xmb3, pos, k_mask])
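        # Compare per-example sequence outputs of the two implementations.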
        max_max = 0
        for i in range(batch_size):
            if k_mask[i].mean() != 0:  # TODO (when mask == full zero, keras_res != tf_res)
                new_max = np.abs(k_output[i] - tf_result[i]['seq_out']).max()
                if new_max > max_max:
                    max_max = new_max
        assert max_max < 5e-5, max_max  # TODO reduce the error (I think it's because of the LayerNorm)
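
The final tolerance check can also be written with NumPy's testing helper. A minimal sketch of the same comparison, assuming k_output, tf_result, k_mask, and batch_size are the arrays built in the test above and reusing its 5e-5 absolute tolerance:

for i in range(batch_size):
    if k_mask[i].mean() != 0:  # skip examples with an all-zero mask (known mismatch, see TODO above)
        np.testing.assert_allclose(k_output[i], tf_result[i]['seq_out'], atol=5e-5)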
Example #2
    if args.const_folding:
        outputs = [
            tf.identity(tf.identity(logits, name="logits"),
                        name="logits_identity")
        ]
    else:
        outputs = [tf.identity(logits, name="logits")]
elif args.model_name == 'bert':
    print('>> Converting graph bert')
    batch_size = 1
    seq_len = 128
    num_layers = 2
    bert_config = BertConfig(
        vocab_size=30522,
        hidden_size=1024,  # 768 in BERT-base
        num_hidden_layers=num_layers,  # 12 in BERT-base
        num_attention_heads=16,  # 12 in BERT-base
        intermediate_size=4096,  # 3072 in BERT-base
        type_vocab_size=2,
    )
    input_ids = tf.placeholder(tf.int32, shape=(batch_size, seq_len))
    input_mask = tf.placeholder(tf.int32, shape=(batch_size, seq_len))
    segment_ids = tf.placeholder(tf.int32, shape=(batch_size, seq_len))

    model = BertModel(config=bert_config,
                      is_training=False,
                      input_ids=input_ids,
                      input_mask=input_mask,
                      token_type_ids=segment_ids,
                      use_one_hot_embeddings=False)

    output_layer = model.get_pooled_output()
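
The snippet stops at the pooled output; the export step of the surrounding script is not shown. A rough sketch of one way to freeze this TF 1.x graph for conversion (the output node name, file name, and the use of freshly initialized variables are assumptions, not the script's actual behavior):

    pooled = tf.identity(output_layer, name="pooled_output")  # hypothetical output node name
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())  # random weights; a real run would restore a checkpoint
        frozen = tf.graph_util.convert_variables_to_constants(
            sess, sess.graph_def, ["pooled_output"])
        tf.train.write_graph(frozen, ".", "bert_frozen.pb", as_text=False)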
Example #3
File: model.py Project: wqw123/BERT-keras
def load_bert(
        base_location: str = './google_bert/model/uncased_L-12_H-768_A-12/',
        use_attn_mask: bool = True,
        max_len: int = 512) -> keras.Model:
    print(1)
    import tensorflow as tf
    from google_bert.modeling import BertConfig
    print(2)
    bert_config = BertConfig.from_json_file(base_location + 'bert_config.json')
    init_checkpoint = base_location + 'bert_model.ckpt'
    print(3)
    var_names = tf.train.list_variables(init_checkpoint)
    print(4)
    check_point = tf.train.load_checkpoint(init_checkpoint)
    print(5)
    model = create_transformer(
        embedding_layer_norm=True,
        neg_inf=-10000.0,
        use_attn_mask=use_attn_mask,
        vocab_size=bert_config.vocab_size - TextEncoder.SPECIAL_COUNT,
        accurate_gelu=True,
        ln_epsilon=1e-12,
        max_len=max_len,
        use_one_embedding_dropout=True,
        d_hid=bert_config.intermediate_size,
        embedding_dim=bert_config.hidden_size,
        num_layers=bert_config.num_hidden_layers,
        num_heads=bert_config.num_attention_heads,
        residual_dropout=bert_config.hidden_dropout_prob,
        attention_dropout=bert_config.attention_probs_dropout_prob)
    print(6)
    if K.backend() == 'tensorflow':
        weights = [np.zeros(w.shape) for w in model.weights]
    else:
        weights = [np.zeros(w.get_value().shape) for w in model.weights]
    for var_name, _ in var_names:
        w_id = None
        qkv = None
        is_pos_embedding = False
        unsqueeze = False
        parts = var_name.split('/')
        first_vars_size = 5
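        # Keras weight layout: indices 0-4 are the embedding weights (token type, position,
        # word, embedding LayerNorm gamma/beta); each transformer layer then owns 12 slots:
        # 0/1 QKV kernel/bias, 2/3 attention projection kernel/bias, 4/5 attention LayerNorm
        # gamma/beta, 6/7 intermediate kernel/bias, 8/9 output kernel/bias, 10/11 output
        # LayerNorm gamma/beta.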
        if parts[1] == 'embeddings':
            n = parts[-1]
            if n == 'token_type_embeddings':  # TODO handle special_tokens
                w_id = 0
            elif n == 'position_embeddings':
                w_id = 1
                is_pos_embedding = True
            elif n == 'word_embeddings':
                w_id = 2
            elif n == 'gamma':
                w_id = 3
            elif n == 'beta':
                w_id = 4
            else:
                raise ValueError()
        elif parts[2].startswith('layer_'):
            layer_number = int(parts[2][len('layer_'):])
            if parts[3] == 'attention':
                if parts[-1] == 'beta':
                    w_id = first_vars_size + layer_number * 12 + 5
                elif parts[-1] == 'gamma':
                    w_id = first_vars_size + layer_number * 12 + 4
                elif parts[-2] == 'dense':
                    if parts[-1] == 'bias':
                        w_id = first_vars_size + layer_number * 12 + 3
                    elif parts[-1] == 'kernel':
                        w_id = first_vars_size + layer_number * 12 + 2
                        unsqueeze = True
                    else:
                        raise ValueError()
                elif parts[-2] in ('key', 'query', 'value'):
                    w_id = first_vars_size + layer_number * 12 + (
                        0 if parts[-1] == 'kernel' else 1)
                    unsqueeze = parts[-1] == 'kernel'
                    qkv = parts[-2][0]
                else:
                    raise ValueError()
            elif parts[3] == 'intermediate':
                if parts[-1] == 'bias':
                    w_id = first_vars_size + layer_number * 12 + 7
                elif parts[-1] == 'kernel':
                    w_id = first_vars_size + layer_number * 12 + 6
                    unsqueeze = True
                else:
                    raise ValueError()
            elif parts[3] == 'output':
                if parts[-1] == 'beta':
                    w_id = first_vars_size + layer_number * 12 + 11
                elif parts[-1] == 'gamma':
                    w_id = first_vars_size + layer_number * 12 + 10
                elif parts[-1] == 'bias':
                    w_id = first_vars_size + layer_number * 12 + 9
                elif parts[-1] == 'kernel':
                    w_id = first_vars_size + layer_number * 12 + 8
                    unsqueeze = True
                else:
                    raise ValueError()

        if w_id is not None and qkv is None:
            print(var_name, ' -> ', model.weights[w_id].name)
            saved = check_point.get_tensor(var_name)
            if is_pos_embedding:
                weights[w_id][:max_len, :] = saved[None, :max_len, :] if unsqueeze else saved[:max_len, :]
            else:
                weights[w_id][:] = saved[None, ...] if unsqueeze else saved
        elif w_id is not None:
            print(var_name, ' -> ', model.weights[w_id].name, '::', qkv)
            p = {'q': 0, 'k': 1, 'v': 2}[qkv]
            saved = check_point.get_tensor(var_name)
            if weights[w_id].ndim == 3:
                dim_size = weights[w_id].shape[1]
                weights[w_id][0, :, p * dim_size:(p + 1) * dim_size] = saved[None, ...] if unsqueeze else saved
            else:
                dim_size = weights[w_id].shape[0] // 3
                weights[w_id][p * dim_size:(p + 1) * dim_size] = saved
        else:
            print('not mapped: ', var_name)  # TODO pooler, cls/predictions, cls/seq_relationship
    model.set_weights(weights)
    return model
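
A minimal usage sketch, assuming the uncased_L-12_H-768_A-12 checkpoint is at the default base_location and that the model's input order matches the test in Example #1 (token ids, segment ids, position ids, attention mask):

import numpy as np
from data.dataset import create_attention_mask, generate_pos_ids

seq_len = 128
model = load_bert(max_len=seq_len)  # builds the Keras transformer and copies the TF checkpoint weights
token_ids = np.zeros((1, seq_len), dtype=np.int32)  # placeholder ids; real ids come from the project's tokenizer
segment_ids = np.zeros((1, seq_len), dtype=np.int32)
pos_ids = generate_pos_ids(1, seq_len)
attn_mask = create_attention_mask(np.ones((1, seq_len), dtype=np.int32), False, None, None, True)
seq_out = model.predict([token_ids, segment_ids, pos_ids, attn_mask])  # (1, seq_len, hidden_size)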
Example #4
def load_google_bert(
        base_location: str = './google_bert/downloads/multilingual_L-12_H-768_A-12/',
        use_attn_mask: bool = True,
        max_len: int = 512,
        verbose: bool = False) -> keras.Model:
    bert_config = BertConfig.from_json_file(base_location + 'bert_config.json')
    init_checkpoint = base_location + 'bert_model.ckpt'
    var_names = tf.train.list_variables(init_checkpoint)
    check_point = tf.train.load_checkpoint(init_checkpoint)
    vocab_size = bert_config.vocab_size - TextEncoder.BERT_SPECIAL_COUNT - TextEncoder.BERT_UNUSED_COUNT
    model = create_transformer(
        embedding_layer_norm=True,
        neg_inf=-10000.0,
        use_attn_mask=use_attn_mask,
        vocab_size=vocab_size,
        accurate_gelu=True,
        layer_norm_epsilon=1e-12,
        max_len=max_len,
        use_one_embedding_dropout=True,
        d_hid=bert_config.intermediate_size,
        embedding_dim=bert_config.hidden_size,
        num_layers=bert_config.num_hidden_layers,
        num_heads=bert_config.num_attention_heads,
        residual_dropout=bert_config.hidden_dropout_prob,
        attention_dropout=bert_config.attention_probs_dropout_prob)
    if K.backend() == 'tensorflow':
        weights = [np.zeros(w.shape) for w in model.weights]
    else:
        weights = [np.zeros(w.get_value().shape) for w in model.weights]
    for var_name, _ in var_names:
        w_id = None
        qkv = None
        unsqueeze = False
        parts = var_name.split('/')
        first_vars_size = 5
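        # Same weight layout as in Example #3: 5 embedding weights, then 12 slots per transformer layer.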
        if parts[1] == 'embeddings':
            n = parts[-1]
            if n == 'token_type_embeddings':
                w_id = 0
            elif n == 'position_embeddings':
                w_id = 1
            elif n == 'word_embeddings':
                w_id = 2
            elif n == 'gamma':
                w_id = 3
            elif n == 'beta':
                w_id = 4
            else:
                raise ValueError()
        elif parts[2].startswith('layer_'):
            layer_number = int(parts[2][len('layer_'):])
            if parts[3] == 'attention':
                if parts[-1] == 'beta':
                    w_id = first_vars_size + layer_number * 12 + 5
                elif parts[-1] == 'gamma':
                    w_id = first_vars_size + layer_number * 12 + 4
                elif parts[-2] == 'dense':
                    if parts[-1] == 'bias':
                        w_id = first_vars_size + layer_number * 12 + 3
                    elif parts[-1] == 'kernel':
                        w_id = first_vars_size + layer_number * 12 + 2
                        unsqueeze = True
                    else:
                        raise ValueError()
                elif parts[-2] in ('key', 'query', 'value'):
                    w_id = first_vars_size + layer_number * 12 + (
                        0 if parts[-1] == 'kernel' else 1)
                    unsqueeze = parts[-1] == 'kernel'
                    qkv = parts[-2][0]
                else:
                    raise ValueError()
            elif parts[3] == 'intermediate':
                if parts[-1] == 'bias':
                    w_id = first_vars_size + layer_number * 12 + 7
                elif parts[-1] == 'kernel':
                    w_id = first_vars_size + layer_number * 12 + 6
                    unsqueeze = True
                else:
                    raise ValueError()
            elif parts[3] == 'output':
                if parts[-1] == 'beta':
                    w_id = first_vars_size + layer_number * 12 + 11
                elif parts[-1] == 'gamma':
                    w_id = first_vars_size + layer_number * 12 + 10
                elif parts[-1] == 'bias':
                    w_id = first_vars_size + layer_number * 12 + 9
                elif parts[-1] == 'kernel':
                    w_id = first_vars_size + layer_number * 12 + 8
                    unsqueeze = True
                else:
                    raise ValueError()

        if w_id is not None and qkv is None:
            if verbose:
                print(var_name, ' -> ', model.weights[w_id].name)
            if w_id == 1:  # pos embedding
                saved = check_point.get_tensor(var_name)
                weights[w_id][:max_len, :] = saved[None, :max_len, :] if unsqueeze else saved[:max_len, :]
            elif w_id == 2:  # word embedding
                # ours: unk, [vocab], pad, msk(mask), bos(cls), del(use sep again), eos(sep)
                # theirs: pad, 99 unused, unk, cls, sep, mask, [vocab]
                saved = check_point.get_tensor(
                    var_name)  # vocab_size, emb_size
                # weights[our_position] = saved[their_position]
                weights[w_id][0] = saved[1 + TextEncoder.BERT_UNUSED_COUNT]  # unk
                weights[w_id][1:vocab_size] = saved[-vocab_size + 1:]
                weights[w_id][vocab_size + TextEncoder.PAD_OFFSET] = saved[0]
                weights[w_id][vocab_size + TextEncoder.MSK_OFFSET] = saved[4 + TextEncoder.BERT_UNUSED_COUNT]
                weights[w_id][vocab_size + TextEncoder.BOS_OFFSET] = saved[2 + TextEncoder.BERT_UNUSED_COUNT]
                weights[w_id][vocab_size + TextEncoder.DEL_OFFSET] = saved[3 + TextEncoder.BERT_UNUSED_COUNT]
                weights[w_id][vocab_size + TextEncoder.EOS_OFFSET] = saved[3 + TextEncoder.BERT_UNUSED_COUNT]
            else:
                saved = check_point.get_tensor(var_name)
                weights[w_id][:] = saved[None, ...] if unsqueeze else saved
        elif w_id is not None:
            if verbose:
                print(var_name, ' -> ', model.weights[w_id].name, '::', qkv)
            p = {'q': 0, 'k': 1, 'v': 2}[qkv]
            saved = check_point.get_tensor(var_name)
            if weights[w_id].ndim == 3:
                dim_size = weights[w_id].shape[1]
                weights[w_id][0, :, p * dim_size:(p + 1) * dim_size] = saved[None, ...] if unsqueeze else saved
            else:
                dim_size = weights[w_id].shape[0] // 3
                weights[w_id][p * dim_size:(p + 1) * dim_size] = saved
        else:
            if verbose:
                print('not mapped: ', var_name)  # TODO pooler, cls/predictions, cls/seq_relationship
    model.set_weights(weights)
    return model
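
Example #1 above shows the full round-trip test for this loader; a condensed usage sketch, assuming the multilingual checkpoint is at the default path and that BERTTextEncoder lives in the project's data.vocab module:

import numpy as np
from data.dataset import create_attention_mask, generate_pos_ids
from data.vocab import BERTTextEncoder  # import path is an assumption

base_location = './google_bert/downloads/multilingual_L-12_H-768_A-12/'
batch_size, seq_len = 2, 5
model = load_google_bert(base_location, max_len=seq_len, verbose=True)

# Raw Google-BERT token ids must be remapped to this model's vocab layout first.
encoder = BERTTextEncoder(base_location + 'vocab.txt')
raw_ids = np.random.randint(106, 1000, (batch_size, seq_len))
token_ids = np.array([encoder.standardize_ids(row.tolist()) for row in raw_ids])

segment_ids = np.zeros((batch_size, seq_len), dtype=np.int32)
pos_ids = generate_pos_ids(batch_size, seq_len)
attn_mask = create_attention_mask(np.ones((batch_size, seq_len), dtype=np.int32), False, None, None, True)
seq_out = model.predict([token_ids, segment_ids, pos_ids, attn_mask])  # (batch_size, seq_len, hidden_size)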