Example #1
 def test_extract_embeddings_invalid_pooling(self):
     with self.assertRaises(ValueError):
         extract_embeddings(
             self.model_path,
             [
                 ('all work and no play', 'makes jack a dull boy'),
                 ('makes jack a dull boy', 'all work and no play'),
             ],
             poolings=['invalid'],
         )
Example #2
 def test_extract_embeddings_default(self):
     embeddings = extract_embeddings(
         self.model_path,
         ['all work and no play', 'makes jack a dull boy~'])
     self.assertEqual(2, len(embeddings))
     self.assertEqual((7, 4), embeddings[0].shape)
     self.assertEqual((8, 4), embeddings[1].shape)
Example #3
 def test_extract_embeddings_pair(self):
     embeddings = extract_embeddings(
         self.model_path,
         [
             ('all work and no play', 'makes jack a dull boy'),
             ('makes jack a dull boy', 'all work and no play'),
         ],
     )
     self.assertEqual(2, len(embeddings))
     self.assertEqual((13, 4), embeddings[0].shape)
Example #4
 def test_extract_embeddings_single_pooling(self):
     embeddings = extract_embeddings(
         self.model_path,
         [
             ('all work and no play', 'makes jack a dull boy'),
             ('makes jack a dull boy', 'all work and no play'),
         ],
         poolings=POOL_NSP,
     )
     self.assertEqual(2, len(embeddings))
     self.assertEqual((4, ), embeddings[0].shape)
Example #5
 def test_extract_embeddings_multi_pooling(self):
     embeddings = extract_embeddings(
         self.model_path,
         [
             ('all work and no play', 'makes jack a dull boy'),
             ('makes jack a dull boy', 'all work and no play'),
         ],
         poolings=[POOL_NSP, POOL_MAX, POOL_AVE],
         output_layer_num=2,
     )
     self.assertEqual(2, len(embeddings))
     self.assertEqual((24, ), embeddings[0].shape)
Example #6
 def test_extract_embeddings_variable_lengths(self):
     tokens = [
         '[PAD]',
         '[UNK]',
         '[CLS]',
         '[SEP]',
         'all',
         'work',
         'and',
         'no',
         'play',
         'makes',
         'jack',
         'a',
         'dull',
         'boy',
         '~',
     ]
     token_dict = {token: i for i, token in enumerate(tokens)}
     inputs, outputs = get_model(
         token_num=len(tokens),
         pos_num=20,
         seq_len=None,
         embed_dim=13,
         transformer_num=1,
         feed_forward_dim=17,
         head_num=1,
         training=False,
     )
     model = keras.models.Model(inputs, outputs)
     embeddings = extract_embeddings(
         model,
         [
             ('all work and no play', 'makes jack'),
             ('a dull boy', 'all work and no play and no play'),
         ],
         vocabs=token_dict,
         batch_size=2,
     )
     self.assertEqual(2, len(embeddings))
     self.assertEqual((10, 13), embeddings[0].shape)
     self.assertEqual((14, 13), embeddings[1].shape)
Example #7
 def test_extract_embeddings_from_file(self):
     with codecs.open(os.path.join(self.model_path, 'vocab.txt'), 'r',
                      'utf8') as reader:
         texts = map(lambda x: x.strip(), reader)
         embeddings = extract_embeddings(self.model_path, texts)
     self.assertEqual(15, len(embeddings))
Example #8
from bertTAT.bert import extract_embeddings
from bertTAT.bert import load_trained_model_from_checkpoint

import numpy as np
import os
import codecs

# Path to the pre-trained model files
now_path = os.path.dirname(__file__)
pretrained_path = os.path.join(now_path, "..", "pretrained_model", "chinese_L-12_H-768_A-12")

# 1. If no fine-tuning is needed and you only want word/sentence features,
# e.g. the features of every token in each sentence
texts = ["世上无难事", '只要肯攀登!']
embeddings = extract_embeddings(pretrained_path, texts)
print("embedding:", np.array(embeddings[0]).shape)

# 2. The inputs are sentence pairs; use the features of the last 4 layers, and
# extract both the output at the NSP position and the max-pooling result.
# The output no longer contains per-token features; the NSP and max-pooling
# outputs are concatenated, so each numpy array has shape (768 x 4 x 2,)
from bertTAT.bert import extract_embeddings, POOL_NSP, POOL_MAX

texts = [('公司加班很严重', '但也要保持学习!'), ('算法学习', '永不止步。')]
embeddings = extract_embeddings(pretrained_path,
                                texts,
                                output_layer_num=4,
                                poolings=[POOL_NSP, POOL_MAX])
print("句子对:", np.array(embeddings).shape)

# 3. An adapter can be used to fine-tune the pre-trained model; the code below
# makes only the adapter and layer normalization layers trainable
layer_num = 12
config_path = os.path.join(pretrained_path, 'bert_config.json')
model_path = os.path.join(pretrained_path, 'bert_model.ckpt')
model = load_trained_model_from_checkpoint(