Example #1
# Method from a keras2onnx test class; `keras`, `keras2onnx`, and the
# `run_onnx_runtime` / `self._get_tokenzier` / `self._prepare_inputs` helpers
# come from the surrounding test module.
def test_TFBertForPreTraining(self):
    from transformers import BertConfig, TFBertForPreTraining
    keras.backend.clear_session()
    # pretrained_weights = 'bert-base-uncased'
    tokenizer_file = 'bert_bert-base-uncased.pickle'
    tokenizer = self._get_tokenzier(tokenizer_file)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    config = BertConfig()
    model = TFBertForPreTraining(config)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(
        run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions,
                         self.model_files, rtol=1.e-2, atol=1.e-4))
Example #2
import tensorflow as tf
from transformers import BertTokenizer, TFBertForPreTraining

def run_chinese_bert():
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    model = TFBertForPreTraining.from_pretrained('bert-base-chinese', output_attentions=True)
    # print(f"model config:{model.config}")
    # model.config.output_attentions = True
    input_ids = tf.constant(tokenizer.encode("习近平总书记讲到三个关联“做好疫情防控工作,直接关系人民生命安全和身体健康,直接关系经济社会大局稳定,也事关我国对外开放”"))[None,:]  # Batch size 1

    outputs = model(input_ids)
    print(f"outputs shapes:{outputs}")
    logits = outputs[0]
    # print(logits)
    # print("-" * 30)
    # print(f"outputs:{outputs}")
    print("-" * 30)
    # print(tf.math.argmax(logits, axis=2)[0, :])
    print(tokenizer.decode(tf.math.argmax(logits, axis=2)[0, :]))
Example #3
def test_TFBertForPreTraining(self):
    from transformers import BertTokenizer, TFBertForPreTraining
    pretrained_weights = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    model = TFBertForPreTraining.from_pretrained(pretrained_weights)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(
        run_onnx_runtime(onnx_model.graph.name,
                         onnx_model,
                         inputs_onnx,
                         predictions,
                         self.model_files,
                         rtol=1.e-2,
                         atol=1.e-4))
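
For reference, a minimal sketch of roughly what the `run_onnx_runtime` helper does with these objects (assuming `onnxruntime` is installed and that `inputs_onnx` is a dict of NumPy arrays keyed by the graph's input names, as prepared by `self._prepare_inputs`):

import onnxruntime as ort

# persist the converted graph, then execute it outside of TensorFlow
keras2onnx.save_model(onnx_model, "tf_bert_for_pretraining.onnx")
sess = ort.InferenceSession("tf_bert_for_pretraining.onnx")
print([i.name for i in sess.get_inputs()])   # the expected feed names
ort_outputs = sess.run(None, inputs_onnx)    # list of output arrays
# ort_outputs can then be compared against `predictions` with np.allclose
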
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Dropout, Input, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from transformers import TFBertForPreTraining


def get_model(label_list):
    K.clear_session()

    # `bert_path` points to a local (PyTorch) BERT checkpoint and is defined
    # elsewhere in the project.
    bert_model = TFBertForPreTraining.from_pretrained(bert_path, from_pt=True)

    input_indices = Input(shape=(None, ), dtype='int32')

    bert_output = bert_model(input_indices)
    projection_logits = bert_output[0]
    bert_cls = Lambda(lambda x: x[:, 0])(
        projection_logits)  # take the vector at the [CLS] position for classification

    dropout = Dropout(0.5)(bert_cls)
    output = Dense(len(label_list), activation='softmax')(dropout)

    model = Model(input_indices, output)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(1e-5),  # use a sufficiently small learning rate
        metrics=['accuracy'])
    model.summary()
    return model
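
A hypothetical usage sketch (not from the original project; `bert_path` must be set as above, and the dummy arrays only stand in for the project's real tokenized data):

import numpy as np

label_list = ["negative", "positive"]   # placeholder label set
clf = get_model(label_list)

# dummy data that only shows the expected shapes: int32 token ids of shape
# (num_samples, seq_len) and integer class ids in [0, len(label_list))
train_ids = np.random.randint(0, 100, size=(32, 128), dtype="int32")
train_labels = np.random.randint(0, len(label_list), size=(32,))
clf.fit(train_ids, train_labels, batch_size=8, epochs=1, validation_split=0.1)
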
from official.modeling import tf_utils

# # The BERT model
# The `BERT` model is built on the `Transformer` encoder and consists of 12 or more `EncoderLayer`s, as shown in the figure below; the right-hand side of the figure shows how the dimensions of the input data change inside the model:
# <img src="../images/bert结构.png" width="80%">
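
# A minimal sketch (assuming the `transformers` package and the public `bert-base-uncased` weights are reachable; the notebook below loads from local paths instead) that makes these dimension changes visible: the encoder maps `(batch, seq_len)` token ids to hidden states of shape `(batch, seq_len, hidden_size=768)` at every layer.

# +
from transformers import BertTokenizer, TFBertModel

_tok = BertTokenizer.from_pretrained("bert-base-uncased")
_bert = TFBertModel.from_pretrained("bert-base-uncased")

_ids = _tok("a short example sentence", return_tensors="tf")
_out = _bert(_ids, output_hidden_states=True, return_dict=True)

# 13 tensors: the embedding output plus one per encoder layer,
# each of shape (batch_size, sequence_length, 768)
for _h in _out.hidden_states:
    print(_h.shape)
# -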

# +
# the huggingface bert model
from transformers import BertTokenizer, TFBertForPreTraining

tokenizer = BertTokenizer.from_pretrained(
    '../models/bert/vocabulary.txt',  # load from a local file that holds the vocabulary
    do_lower_case=True)

model = TFBertForPreTraining.from_pretrained(
    "../../H/models/huggingface/bert-base-uncased/")

# all of the model's parameters, as a list
params = model.weights
# -

model.summary()

# ## Model configuration
# Parameters that need to be specified when creating the model (see the sketch after this list):
# - `vocab_size`, the size of the vocabulary, used for the word-embedding matrix
# - `hidden_size=768`, the size of the encoder layers
# - `num_hidden_layers=12`, the number of encoder layers
# - `num_attention_heads=12`, the number of attention heads
# - `intermediate_size=3072`, the size of the feed-forward layer in the encoder
# - `hidden_act="gelu"`, the activation function in the encoder
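
# A minimal sketch (parameter names as in `transformers.BertConfig`; the values shown are the `bert-base-uncased` defaults, and the variable names are illustrative) of how these settings are passed when a model is built from a configuration instead of from pretrained weights:

# +
from transformers import BertConfig, TFBertForPreTraining

config = BertConfig(
    vocab_size=30522,        # size of the vocabulary / word-embedding matrix
    hidden_size=768,         # size of each encoder layer
    num_hidden_layers=12,    # number of encoder layers
    num_attention_heads=12,  # number of attention heads
    intermediate_size=3072,  # size of the feed-forward layer inside the encoder
    hidden_act="gelu",       # activation function in the encoder
)
scratch_model = TFBertForPreTraining(config)  # randomly initialized, not pretrained
# -
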
# Excerpt from inside a loop over `model_name` values in a model-export script;
# `cache_dir`, `model_cache_dir`, `token_cache_dir`, `model_kwargs`,
# `pb_model_dir`, and the `HFModel` wrapper are defined elsewhere in that script.
print()
print(
    "===============================================================")
print("Processing model:", model_name)
print("Will be saved to:", cache_dir, "\n")
time.sleep(2)

if model_name.startswith("bert"):
    is_encoder_decoder = False
    if "uncased" in model_name:
        vocab_size = 30522
    else:
        vocab_size = 28996

    model = TFBertForPreTraining.from_pretrained(
        model_name, cache_dir=model_cache_dir, **model_kwargs)
    tokenizer = BertTokenizer.from_pretrained(
        model_name, cache_dir=token_cache_dir)

else:
    is_encoder_decoder = True
    vocab_size = 50265

    model = TFBartForConditionalGeneration.from_pretrained(
        model_name, cache_dir=model_cache_dir, **model_kwargs)
    tokenizer = BartTokenizer.from_pretrained(
        model_name, cache_dir=token_cache_dir)

print("Exporting Model to SavedModel at:", pb_model_dir)
hf_model = HFModel(
    model,
import numpy as np
from transformers import BertTokenizer, TFBertForPreTraining


def parse_text(filename):
    np.random.seed(42)
    max_seq_length = 512
    seq_length = 512
    max_predictions_per_seq = 20

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Store the document in ram
    with open(filename, "r") as infile:
        # Grab two sentences
        text = infile.read()
        lines = [
            line for line in text.split("\n \n \n")
            if (len(line) > 0 and not line.isspace())
        ]

    # TODO: Tokenizer batch encode
    breakpoint()

    # Let's just grab one document for now.
    idx = np.random.randint(len(lines) - 1)
    line1, line2 = lines[idx], lines[idx + 1]

    line = np.random.choice(lines)
    # And choose a subset of tokens.
    tokens = line.split(" ")
    if len(tokens) < seq_length:
        seq_length = len(tokens)
        start = 0
    else:
        start = np.random.randint(len(tokens) - seq_length)

    # dtype=object keeps whole tokens (dtype=str would silently truncate them)
    span = np.empty(max_seq_length, dtype=object)
    span[:seq_length] = np.array(tokens[start:start + seq_length])

    # Now tokenize
    mask = np.random.choice(np.arange(seq_length),
                            size=max_predictions_per_seq,
                            replace=False)

    span[mask] = "[MASK]"
    # TODO: Corrupt some of these instead of mask

    input_ids = span
    # 0 represents a padding token
    attention_mask = np.ones(max_seq_length)
    attention_mask[seq_length:] = 0
    # 0 represents sentence A, 1 is sentence B; this span is a single segment,
    # so every position belongs to sentence A
    token_type_ids = np.zeros(max_seq_length)

    print(span)

    my_dict = tokenizer.encode_plus(line, line, return_tensors="tf")

    input_dict = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
    }

    # Of the positions selected for prediction (the standard BERT recipe;
    # see the masking sketch after this function):
    # 80% should be masked
    # 10% should be corrupted (replaced by a random token)
    # 10% should remain the same

    model = TFBertForPreTraining.from_pretrained("bert-base-uncased")
    model(my_dict)

    breakpoint()
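
The 80/10/10 comments inside `parse_text` describe the BERT masked-LM recipe. A minimal NumPy sketch of that corruption (the function name, signature, and the -100 "ignore" label convention are illustrative, not part of the original script):

import numpy as np


def mask_tokens_80_10_10(token_ids, mask_token_id, vocab_size,
                         mlm_prob=0.15, rng=None):
    """Of the positions picked for prediction (~mlm_prob of all tokens):
    ~80% become [MASK], ~10% become a random token, ~10% stay unchanged."""
    rng = rng or np.random.default_rng()
    token_ids = np.array(token_ids, dtype=np.int64)
    labels = np.full_like(token_ids, -100)  # -100 = position is not predicted

    # pick ~mlm_prob of all positions as prediction targets
    picked = rng.random(token_ids.shape) < mlm_prob
    labels[picked] = token_ids[picked]

    # 80% of the picked positions -> [MASK]
    masked = picked & (rng.random(token_ids.shape) < 0.8)
    token_ids[masked] = mask_token_id

    # half of the remainder (10% of picked) -> random token; the rest stay as-is
    corrupted = picked & ~masked & (rng.random(token_ids.shape) < 0.5)
    token_ids[corrupted] = rng.integers(0, vocab_size, size=int(corrupted.sum()))

    return token_ids, labels


# e.g. for bert-base-uncased: mask_token_id=103, vocab_size=30522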