def test_re_cut(self):
    """Check that a number with thousands separators survives encode/decode."""
    here = os.path.dirname(os.path.abspath(__file__))
    tokenizer = Tokenizer(os.path.join(here, 'spiece.model'))
    text = '123,456,789.00'
    round_tripped = tokenizer.decode(tokenizer.encode(text))
    self.assertEqual(text, round_tripped)
def __init__(self, gpu_name, gpu_num, seq_max_len, batch_size):
    """Configure the GPU(s), then load the tokenizer and the fine-tuned model.

    :param gpu_name: forwarded unchanged to ``gpu_option``
    :param gpu_num: forwarded to ``gpu_option``; with 2 or more the loaded
        model is wrapped by ``multi_gpu_model``
    :param seq_max_len: stored as ``self.seq_max_len``
    :param batch_size: stored as ``self.batch_size``
    """
    rule = '--' * 10
    print(rule + ' Load xlnet model start ' + rule)
    gpu_option(gpu_name, gpu_num)

    self.seq_max_len = seq_max_len  # same value as at training time
    self.batch_size = batch_size
    self.tokenizer = Tokenizer('models/Xlnet/xlnet_model/spiece.model')

    loaded = load_model(
        'models/Xlnet/fine_tune_model/xlnet_fine_tune.hdf5',
        custom_objects=get_custom_objects(),
        compile=False)
    # A single GPU (or none) uses the model as loaded.
    if gpu_num < 2:
        self.par_model = loaded
    else:
        self.par_model = multi_gpu_model(loaded, gpus=gpu_num)
    print(rule + ' Load xlnet model end ' + rule)
def get_encode(pos, neg):
    """Encode every sample and pad the id lists to a common length.

    :param pos: positive samples
    :param neg: negative samples
    :return: the result of ``sequence.pad_sequences`` over the encoded
        samples (positives first, then negatives), padded and truncated at
        the end to the module-level ``maxlen``
    """
    samples = pos + neg
    tokenizer = Tokenizer(vocab_path)
    encoded = [tokenizer.encode(sample) for sample in samples]
    return sequence.pad_sequences(
        encoded, maxlen=maxlen, padding='post', truncating='post')
class FineTuneXlnet:
    """Batch inference wrapper around a fine-tuned XLNet text-pair model.

    The tokenizer and the saved Keras model are loaded once in ``__init__``;
    ``classify`` then feeds text pairs to the model batch by batch.
    """

    def __init__(self, gpu_name, gpu_num, seq_max_len, batch_size):
        """Configure the GPU(s), then load the tokenizer and the model.

        :param gpu_name: forwarded unchanged to ``gpu_option``
        :param gpu_num: forwarded to ``gpu_option``; with 2 or more the
            loaded model is wrapped by ``multi_gpu_model``
        :param seq_max_len: fixed length every token list is padded or
            truncated to
        :param batch_size: number of samples per yielded batch
        """
        print('--' * 10 + ' Load xlnet model start ' + '--' * 10)
        gpu_option(gpu_name, gpu_num)
        self.seq_max_len = seq_max_len  # same value as at training time
        self.batch_size = batch_size
        spiece_model = 'models/Xlnet/xlnet_model/spiece.model'
        self.tokenizer = Tokenizer(spiece_model)
        MODEL_SAVE_PATH = 'models/Xlnet/fine_tune_model/xlnet_fine_tune.hdf5'
        model = load_model(MODEL_SAVE_PATH,
                           custom_objects=get_custom_objects(),
                           compile=False)
        if gpu_num >= 2:
            self.par_model = multi_gpu_model(model, gpus=gpu_num)
        else:
            self.par_model = model
        print('--' * 10 + ' Load xlnet model end ' + '--' * 10)

    def data_generator(self, data):
        """Yield model inputs for ``data`` in batches of ``self.batch_size``.

        :param data: sequence of items whose elements 0 and 1 are the two
            texts of a pair
        :return: generator of ``(tokens, segments, memory_lengths, masks)``
            numpy arrays; the final batch may be smaller. Nothing is yielded
            for empty ``data``.
        """
        tokens_batch, segment_batch, memory_batch, mask_batch = [], [], [], []
        last_index = len(data) - 1
        for i, pair in enumerate(data):
            tokens = self.tokenizer.encode(pair[0] + '|' + pair[1])
            # Truncate first, then right-pad with 0 up to the fixed length.
            tokens = tokens[:self.seq_max_len]
            tokens = tokens + [0] * (self.seq_max_len - len(tokens))

            mask_input = [0 if ids == 0 else 1 for ids in tokens]
            # The helper gets its own copy so that the mask appended below
            # stays untouched if the helper modifies its argument.
            segment_input = create_seg_array(tokens, list(mask_input))

            tokens_batch.append(np.array(tokens))
            segment_batch.append(segment_input)
            memory_batch.append(np.zeros(1))
            mask_batch.append(mask_input)

            if len(tokens_batch) == self.batch_size or i == last_index:
                yield (np.array(tokens_batch), np.array(segment_batch),
                       np.array(memory_batch), np.array(mask_batch))
                tokens_batch, segment_batch = [], []
                memory_batch, mask_batch = [], []

    def classify(self, texts):
        """Run the model over ``texts`` and return all predictions flattened.

        :param texts: text pairs, in the form accepted by ``data_generator``
        :return: flat list with the values of every prediction row, in input
            order; an empty list when ``texts`` is empty
        """
        pred = []
        for m_token, m_segment, m_memory, m_mask in self.data_generator(texts):
            p = self.par_model.predict([m_token, m_segment, m_memory, m_mask])
            # Extend row by row; sum(rows, []) re-copies the result each time.
            for row in p.tolist():
                pred.extend(row)
        return pred
def test_tokenizer(self):
    """Round-trip a text with sampling on, then pin the ids with it off."""
    model_file = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'spiece.model')

    # Sampling tokenizer: ids may differ per call, the decoded text may not.
    sampling = Tokenizer(
        model_file,
        remove_spaces=True,
        remove_accents=True,
        cased=True,
        sample=True,
    )
    text = 'build XLNet'
    for _ in range(10):
        sampled_ids = sampling.encode(text)
        self.assertEqual(text, sampling.decode(sampled_ids))

    # Uncased tokenizer without sampling: the ids are fixed.
    plain = Tokenizer(
        model_file,
        remove_spaces=False,
        remove_accents=False,
        cased=False,
        sample=False,
    )
    plain_ids = plain.encode(text)
    self.assertEqual([1266, 3512, 368, 1942], plain_ids)
    self.assertEqual(text.lower(), plain.decode(plain_ids))
def build(self):
    """Load the pre-trained XLNet model and expose the selected layer output.

    Reads ``self.corpus_path``, ``self.xlnet_embed``, ``self.trainable``,
    ``self.batch_size`` and ``self.layer_indexes``. Sets ``self.tokenizer``,
    ``self.input``, ``self.output``, ``self.model``, ``self.embedding_size``
    and ``self.vocab_size`` (plus the path/attention/length attributes).
    """
    from keras_xlnet import load_trained_model_from_checkpoint, set_custom_objects
    from keras_xlnet import Tokenizer, ATTENTION_TYPE_BI, ATTENTION_TYPE_UNI
    self.embedding_type = 'xlnet'
    self.checkpoint_path = os.path.join(self.corpus_path, 'xlnet_model.ckpt')
    self.config_path = os.path.join(self.corpus_path, 'xlnet_config.json')
    self.spiece_model = os.path.join(self.corpus_path, 'spiece.model')
    self.attention_type = self.xlnet_embed.get('attention_type', 'bi')  # or 'uni'
    self.attention_type = ATTENTION_TYPE_BI if self.attention_type == 'bi' else ATTENTION_TYPE_UNI
    self.memory_len = self.xlnet_embed.get('memory_len', 0)
    self.target_len = self.xlnet_embed.get('target_len', 5)
    print('load xlnet model start!')
    # Load the model
    model = load_trained_model_from_checkpoint(
        checkpoint_path=self.checkpoint_path,
        attention_type=self.attention_type,
        in_train_phase=self.trainable,
        config_path=self.config_path,
        memory_len=self.memory_len,
        target_len=self.target_len,
        batch_size=self.batch_size,
        mask_index=0)
    # NOTE(review): presumably registers the keras_xlnet custom layers with
    # Keras - confirm against the keras_xlnet documentation.
    set_custom_objects()
    # Load the vocabulary (SentencePiece model)
    self.tokenizer = Tokenizer(self.spiece_model)
    # Kept so the layers can be inspected while debugging
    self.model_layers = model.layers
    len_layers = self.model_layers.__len__()
    print(len_layers)
    # Accepted layer indexes: 0..24 and -24..0
    layer_real = [i for i in range(25)] + [-i for i in range(25)]
    # Quick sanity check: any index outside layer_real is replaced by -2
    self.layer_indexes = [
        i if i in layer_real else -2 for i in self.layer_indexes
    ]
    len_couche = int((len_layers - 6) / 10)
    # Original author's notes on the model layout:
    #   246 layers in total;
    #   every block has 10 layers (MultiHeadAttention, Dropout, Add,
    #   LayerNormalization); the first 9 layers are the inputs and embeddings;
    #   24 blocks in total.
    # layer_dict[k] is the Keras layer index used for requested layer k.
    layer_dict = []
    layer_0 = 7
    for i in range(len_couche):
        layer_0 = layer_0 + 10
        layer_dict.append(layer_0)
    # NOTE(review): 247 is hard-coded and the notes above speak of 246
    # layers - confirm this index exists for the checkpoint in use.
    layer_dict.append(247)
    # Disabled probe that was used to test get_output_at:
    # def get_number(index):
    #     try:
    #         model_node = model.get_output_at(node_index=index)
    #         gg = 0
    #     except:
    #         print('node index wrong!')
    #         print(index)
    # list_index = [i for i in range(25)] + [-i for i in range(25)]
    # for li in list_index:
    #     get_number(li)
    # No layer requested: use the model output itself
    if len(self.layer_indexes) == 0:
        encoder_layer = model.output
    # Exactly one layer requested. After the sanity check above every index
    # is in layer_real, so the else branch is a safety net only.
    elif len(self.layer_indexes) == 1:
        if self.layer_indexes[0] in layer_real:
            encoder_layer = model.get_layer(
                index=layer_dict[self.layer_indexes[0]]).get_output_at(
                    node_index=0)
        else:
            encoder_layer = model.get_layer(
                index=layer_dict[-1]).get_output_at(node_index=0)
    # Several layers requested: take the output of each one and add them up
    # (original note: "shape: 768 * number of layers")
    else:
        # layer_indexes must be [0, 1, 2,3,......24]
        all_layers = [
            model.get_layer(index=layer_dict[lay]).get_output_at(
                node_index=0) if lay in layer_real else model.get_layer(
                    index=layer_dict[-1]).get_output_at(
                        node_index=0)  # invalid index: default to the last entry
            for lay in self.layer_indexes
        ]
        print(self.layer_indexes)
        print(all_layers)
        all_layers_select = []
        for all_layers_one in all_layers:
            all_layers_select.append(all_layers_one)
        encoder_layer = Add()(all_layers_select)
        print(encoder_layer.shape)
        # Disabled alternative that concatenated instead of adding:
        # def xlnet_concat(x):
        #     x_concat = K.concatenate(x, axis=1)
        #     return x_concat
        # encoder_layer = Lambda(xlnet_concat, name='xlnet_concat')(all_layers)
    self.output = NonMaskingLayer()(encoder_layer)
    self.input = model.inputs
    self.model = Model(self.input, self.output)
    print("load KerasXlnetEmbedding end")
    model.summary(132)
    self.embedding_size = self.model.output_shape[-1]
    self.vocab_size = len(self.tokenizer.sp)
import os import sys import numpy as np from keras_xlnet import PretrainedList, get_pretrained_paths from keras_xlnet import Tokenizer, load_trained_model_from_checkpoint from keras_xlnet import ATTENTION_TYPE_UNI, ATTENTION_TYPE_BI checkpoint_path = "/home/xsq/nlp_code/sentiment_analysis/pretrain_model/chinese_xlnet_mid_L-24_H-768_A-12" vocab_path = os.path.join(checkpoint_path, 'spiece.model') config_path = os.path.join(checkpoint_path, 'xlnet_config.json') model_path = os.path.join(checkpoint_path, 'xlnet_model.ckpt') # Tokenize inputs tokenizer = Tokenizer(vocab_path) text = "这个苹果很好吃" tokens = tokenizer.encode(text) print(np.array(tokens).shape) token_input = np.expand_dims(np.array(tokens), axis=0) print(token_input.shape) segment_input = np.zeros_like(token_input) print(segment_input.shape) memory_length_input = np.zeros((1, 1)) # Load pre-trained model model = load_trained_model_from_checkpoint( config_path=config_path, checkpoint_path=model_path, batch_size=1,
class KerasXlnetVector:
    """Sentence-vector extractor built on a pre-trained XLNet model.

    The TensorFlow graph and the Keras model live in the module globals
    ``graph`` and ``model`` so that web frameworks can reuse them.
    """

    def __init__(self):
        """Load the model and expose the output of the selected layer(s).

        All settings are read from the module-level ``args`` object. An
        entry of ``args.layer_indexes`` that is not a valid position in the
        layer table falls back to the second-to-last table entry.
        """
        global graph
        global model
        self.attention_type = ATTENTION_TYPE_BI if args.attention_type[
            0] == 'bi' else ATTENTION_TYPE_UNI
        self.memory_len, self.target_len, self.batch_size = args.memory_len, args.target_len, args.batch_size
        self.checkpoint_path, self.config_path = args.ckpt_name, args.config_name
        self.layer_indexes, self.in_train_phase = args.layer_indexes, False
        print("load KerasXlnetEmbedding start! ")
        # Kept global so django / flask / tornado handlers can use it
        graph = tf.get_default_graph()
        # Load the model
        model = load_trained_model_from_checkpoint(
            checkpoint_path=self.checkpoint_path,
            attention_type=self.attention_type,
            in_train_phase=self.in_train_phase,
            config_path=self.config_path,
            memory_len=self.memory_len,
            target_len=self.target_len,
            batch_size=self.batch_size,
            mask_index=0)
        # Load the vocabulary (SentencePiece model)
        self.tokenizer = Tokenizer(args.spiece_model)
        # Kept so the layers can be inspected while debugging
        self.model_layers = model.layers
        len_layers = len(self.model_layers)
        print(len_layers)
        len_couche = int((len_layers - 6) / 10)
        # Original author's notes: 246 layers in total; every block has 10
        # layers (MultiHeadAttention, Dropout, Add, LayerNormalization); the
        # first 9 layers are the inputs and embeddings; 24 blocks in total.
        # layer_dict[k] is the Keras layer index used for requested layer k.
        layer_dict = [5] + [
            6 + 10 * block - 2 for block in range(1, len_couche + 1)
        ]
        # Accepted requests are 1..len_couche. The previous check also let
        # len_couche + 1 through, which is past the end of layer_dict and
        # raised IndexError instead of using the fallback.
        valid_slots = range(1, len(layer_dict))

        if len(self.layer_indexes) == 0:
            # No layer requested: use the model output itself
            encoder_layer = model.output
        elif len(self.layer_indexes) == 1:
            # One layer requested; invalid index -> second-to-last entry
            wanted = self.layer_indexes[0]
            slot = wanted if wanted in valid_slots else -2
            encoder_layer = model.get_layer(index=layer_dict[slot]).output
        else:
            # Several layers requested: add their outputs together;
            # invalid index -> second-to-last entry
            all_layers = [
                model.get_layer(
                    index=layer_dict[lay if lay in valid_slots else -2]).output
                for lay in self.layer_indexes
            ]
            print(self.layer_indexes)
            print(all_layers)
            encoder_layer = Add()(all_layers)
            print(encoder_layer.shape)
        output_layer = NonMaskingLayer()(encoder_layer)
        # The global now refers to the wrapped model used by xlnet_encode
        model = Model(model.inputs, output_layer)
        print("load KerasXlnetEmbedding end")
        model.summary(132)

    def xlnet_encode(self, texts):
        """Return one pooled vector per text.

        Each text is tokenized, right-padded with 0 or truncated to
        ``self.target_len``, and passed through the global ``model`` one at
        a time. The output is averaged over the non-zero token positions.

        :param texts: iterable of texts
        :return: list with one pooled vector (as a Python list) per text;
            an empty list for empty input
        """

        # Pooling adapted from
        # https://github.com/terrifyzhao/bert-utils/blob/master/graph.py
        def mul_mask(x, m):
            return x * np.expand_dims(m, axis=-1)

        def masked_reduce_mean(x, m):
            return np.sum(mul_mask(x, m), axis=1) / (
                np.sum(m, axis=1, keepdims=True) + 1e-9)

        predicts = []
        for text in texts:
            tokens = self.tokenizer.encode(text)
            if len(tokens) < self.target_len:
                tokens = tokens + [0] * (self.target_len - len(tokens))
            else:
                tokens = tokens[0:self.target_len]
            token_input = np.expand_dims(np.array(tokens), axis=0)
            mask_input = np.array([0 if ids == 0 else 1 for ids in tokens])
            segment_input = np.zeros_like(token_input)
            memory_length_input = np.zeros((1, 1))
            # Predict inside the graph captured in __init__ so that calls
            # from django / flask / tornado work
            with graph.as_default():
                predict = model.predict(
                    [token_input, segment_input, memory_length_input],
                    batch_size=1)
            pooled = masked_reduce_mean(predict[0], [mask_input])
            predicts.append(pooled.tolist()[0])
        return predicts
def __init__(self, batch_size, gpu_name, gpu_num):
    """Load the XLNet model and expose the output of the selected layer(s).

    Remaining settings are read from the module-level ``args`` object.

    :param batch_size: stored as ``self.batch_size`` and passed to the loader
    :param gpu_name: forwarded unchanged to ``set_gpu_option``
    :param gpu_num: forwarded to ``set_gpu_option``; with 2 or more the final
        model is wrapped by ``multi_gpu_model``
    """
    set_gpu_option(gpu_name, gpu_num)
    self.attention_type = ATTENTION_TYPE_BI if args.attention_type[0] == 'bi' else ATTENTION_TYPE_UNI
    self.memory_len, self.target_len, self.batch_size = args.memory_len, args.target_len, batch_size
    self.checkpoint_path, self.config_path = args.ckpt_name, args.config_name
    self.layer_indexes, self.in_train_phase = args.layer_indexes, False
    print("##### load KerasXlnet start #####")
    self.graph = tf.get_default_graph()
    # Load the model
    self.model = load_trained_model_from_checkpoint(
        checkpoint_path=self.checkpoint_path,
        attention_type=self.attention_type,
        in_train_phase=self.in_train_phase,
        config_path=self.config_path,
        memory_len=self.memory_len,
        target_len=self.target_len,
        batch_size=self.batch_size,
        mask_index=0)
    # Load the vocabulary (SentencePiece model)
    self.tokenizer = Tokenizer(args.spiece_model)
    # Kept so the layers can be inspected while debugging
    self.model_layers = self.model.layers
    len_layers = len(self.model_layers)
    len_couche = int((len_layers - 6) / 10)
    # Original author's notes: 126 layers in total; every block has 10
    # layers; the first 7 layers are the inputs and embeddings; 12 blocks.
    # layer_dict[k] is the Keras layer index used for requested layer k.
    layer_dict = [5] + [
        6 + 10 * block - 2 for block in range(1, len_couche + 1)
    ]
    # Accepted requests are 1..len_couche. The previous check also let
    # len_couche + 1 through, which is past the end of layer_dict and raised
    # IndexError instead of using the fallback.
    valid_slots = range(1, len(layer_dict))

    if len(self.layer_indexes) == 0:
        # No layer requested: use the model output itself
        encoder_layer = self.model.output
    elif len(self.layer_indexes) == 1:
        # One layer requested; invalid index -> second-to-last entry
        wanted = self.layer_indexes[0]
        slot = wanted if wanted in valid_slots else -2
        encoder_layer = self.model.get_layer(index=layer_dict[slot]).output
    else:
        # Several layers requested: add their outputs together.
        # NOTE(review): the fallback here is entry -3 (the original comment
        # said "second to last") and the first selected output is dropped
        # before adding - both kept as they were; confirm the intent.
        all_layers = [
            self.model.get_layer(
                index=layer_dict[lay if lay in valid_slots else -3]).output
            for lay in self.layer_indexes
        ]
        encoder_layer = Add()(all_layers[1:])
    output_layer = NonMaskingLayer()(encoder_layer)
    model = Model(self.model.inputs, output_layer)
    if gpu_num >= 2:
        self.par_model = multi_gpu_model(model, gpus=gpu_num)
    else:
        self.par_model = model
    print("##### load KerasXlnet end #####")
data.append([row[0], row[1], int(row[2])]) return data # 读数据并划分为训练集和验证集 all_data = read_data(data_path) valid_num = int(len(all_data) * valid_data_ratio) train_num = len(all_data) - valid_num train_data = all_data[:train_num] valid_data = all_data[train_num:] print('data number:', len(all_data)) print('train data number:', len(train_data)) print('valid data number:', len(valid_data)) # 加载Tokenizer tokenizer = Tokenizer(spiece_model) def create_seg_array(tk, mask_arr): for index, i in enumerate(mask_arr[:tk.index(7505) + 1]): # |||||| mask_arr[index] = 0 return np.array(mask_arr) # 数据的生成器 class data_generator: def __init__(self, data, batch_size): self.data = data self.batch_size = batch_size self.steps = len(self.data) // self.batch_size if len(self.data) % self.batch_size != 0:
class KerasXlnetVector:
    """Batched sentence-vector extractor built on a pre-trained XLNet model."""

    def __init__(self, batch_size, gpu_name, gpu_num):
        """Load the model and expose the output of the selected layer(s).

        Remaining settings are read from the module-level ``args`` object.

        :param batch_size: stored as ``self.batch_size`` and passed to the
            loader
        :param gpu_name: forwarded unchanged to ``set_gpu_option``
        :param gpu_num: forwarded to ``set_gpu_option``; with 2 or more the
            final model is wrapped by ``multi_gpu_model``
        """
        set_gpu_option(gpu_name, gpu_num)
        self.attention_type = ATTENTION_TYPE_BI if args.attention_type[0] == 'bi' else ATTENTION_TYPE_UNI
        self.memory_len, self.target_len, self.batch_size = args.memory_len, args.target_len, batch_size
        self.checkpoint_path, self.config_path = args.ckpt_name, args.config_name
        self.layer_indexes, self.in_train_phase = args.layer_indexes, False
        print("##### load KerasXlnet start #####")
        self.graph = tf.get_default_graph()
        # Load the model
        self.model = load_trained_model_from_checkpoint(
            checkpoint_path=self.checkpoint_path,
            attention_type=self.attention_type,
            in_train_phase=self.in_train_phase,
            config_path=self.config_path,
            memory_len=self.memory_len,
            target_len=self.target_len,
            batch_size=self.batch_size,
            mask_index=0)
        # Load the vocabulary (SentencePiece model)
        self.tokenizer = Tokenizer(args.spiece_model)
        # Kept so the layers can be inspected while debugging
        self.model_layers = self.model.layers
        len_layers = len(self.model_layers)
        len_couche = int((len_layers - 6) / 10)
        # Original author's notes: 126 layers in total; every block has 10
        # layers; the first 7 layers are the inputs and embeddings; 12 blocks.
        # layer_dict[k] is the Keras layer index used for requested layer k.
        layer_dict = [5] + [
            6 + 10 * block - 2 for block in range(1, len_couche + 1)
        ]
        # Accepted requests are 1..len_couche. The previous check also let
        # len_couche + 1 through, which is past the end of layer_dict and
        # raised IndexError instead of using the fallback.
        valid_slots = range(1, len(layer_dict))

        if len(self.layer_indexes) == 0:
            # No layer requested: use the model output itself
            encoder_layer = self.model.output
        elif len(self.layer_indexes) == 1:
            # One layer requested; invalid index -> second-to-last entry
            wanted = self.layer_indexes[0]
            slot = wanted if wanted in valid_slots else -2
            encoder_layer = self.model.get_layer(index=layer_dict[slot]).output
        else:
            # Several layers requested: add their outputs together.
            # NOTE(review): the fallback here is entry -3 (the original
            # comment said "second to last") and the first selected output
            # is dropped before adding - both kept as they were; confirm.
            all_layers = [
                self.model.get_layer(
                    index=layer_dict[lay if lay in valid_slots else -3]).output
                for lay in self.layer_indexes
            ]
            encoder_layer = Add()(all_layers[1:])
        output_layer = NonMaskingLayer()(encoder_layer)
        model = Model(self.model.inputs, output_layer)
        if gpu_num >= 2:
            self.par_model = multi_gpu_model(model, gpus=gpu_num)
        else:
            self.par_model = model
        print("##### load KerasXlnet end #####")
        # model.summary()

    def xlnet_encode(self, texts):
        """Take a list of sentences and return the list of sentence vectors.

        :param texts: iterable of sentences
        :return: list with one entry per sentence: the ``tolist()`` form of
            what ``sen_embed_cal`` returns for that sentence
        """
        predicts = []

        # Convert every text to vocabulary ids, padded with 0 or truncated
        # to self.target_len
        array = []
        for text in texts:
            tokens = self.tokenizer.encode(text)
            if len(tokens) < self.target_len:
                tokens = tokens + [0] * (self.target_len - len(tokens))
            else:
                tokens = tokens[0:self.target_len]
            token_input = np.array(tokens)
            # Number of non-padding tokens, not a per-token mask
            mask_input = sum(1 for ids in tokens if ids != 0)
            segment_input = np.zeros_like(token_input)
            memory_length_input = np.zeros(1)
            array.append(
                [token_input, mask_input, segment_input, memory_length_input])

        for w1, w2, w3, w4 in data_iter(array, batch_size=self.batch_size):
            m_token_input = np.array(w1)
            m_mask_input = w2
            m_segment_input = np.array(w3)
            m_memory_length_input = np.array(w4)
            with self.graph.as_default():
                predict = self.par_model.predict(
                    [m_token_input, m_segment_input, m_memory_length_input],
                    batch_size=self.batch_size)
            for index, prob in enumerate(predict):
                # pooled is the sentence vector
                pooled = sen_embed_cal(prob, m_mask_input[index])
                predicts.append(pooled.tolist())
        return predicts
def build(self):
    """Load the pre-trained XLNet model and expose the selected layer output.

    Reads ``self.corpus_path``, ``self.xlnet_embed``, ``self.trainable``,
    ``self.batch_size`` and ``self.layer_indexes``. Sets ``self.tokenizer``,
    ``self.input``, ``self.output``, ``self.model``, ``self.embedding_size``
    and ``self.vocab_size`` (plus the path/attention/length attributes).
    An entry of ``self.layer_indexes`` that is not a valid position in the
    layer table falls back to the last table entry.
    """
    from keras_xlnet import Tokenizer, ATTENTION_TYPE_BI, ATTENTION_TYPE_UNI
    from keras_xlnet import load_trained_model_from_checkpoint

    self.embedding_type = 'xlnet'
    self.checkpoint_path = os.path.join(self.corpus_path, 'xlnet_model.ckpt')
    self.config_path = os.path.join(self.corpus_path, 'xlnet_config.json')
    self.spiece_model = os.path.join(self.corpus_path, 'spiece.model')
    self.attention_type = self.xlnet_embed.get('attention_type', 'bi')  # or 'uni'
    self.attention_type = ATTENTION_TYPE_BI if self.attention_type == 'bi' else ATTENTION_TYPE_UNI
    self.memory_len = self.xlnet_embed.get('memory_len', 0)
    self.target_len = self.xlnet_embed.get('target_len', 5)
    print('load xlnet model start!')
    # Load the model
    model = load_trained_model_from_checkpoint(
        checkpoint_path=self.checkpoint_path,
        attention_type=self.attention_type,
        in_train_phase=self.trainable,
        config_path=self.config_path,
        memory_len=self.memory_len,
        target_len=self.target_len,
        batch_size=self.batch_size,
        mask_index=0)
    # Load the vocabulary (SentencePiece model)
    self.tokenizer = Tokenizer(self.spiece_model)
    # Kept so the layers can be inspected while debugging
    self.model_layers = model.layers
    len_layers = len(self.model_layers)
    print(len_layers)
    len_couche = int((len_layers - 6) / 10)
    # Original author's notes: 246 layers in total; every block has 10
    # layers (MultiHeadAttention, Dropout, Add, LayerNormalization); the
    # first 9 layers are the inputs and embeddings; 24 blocks in total.
    # layer_dict[k] is the Keras layer index used for requested layer k.
    layer_dict = [5] + [
        6 + 10 * block - 2 for block in range(1, len_couche + 1)
    ]
    # Accepted requests are 1..len_couche. The previous check also let
    # len_couche + 1 through, which is past the end of layer_dict and raised
    # IndexError instead of using the fallback.
    valid_slots = range(1, len(layer_dict))

    if len(self.layer_indexes) == 0:
        # No layer requested: use the model output itself
        encoder_layer = model.output
    elif len(self.layer_indexes) == 1:
        # One layer requested; invalid index -> last entry
        wanted = self.layer_indexes[0]
        slot = wanted if wanted in valid_slots else -1
        encoder_layer = model.get_layer(index=layer_dict[slot]).output
    else:
        # Several layers requested: add their outputs together;
        # invalid index -> last entry
        all_layers = [
            model.get_layer(
                index=layer_dict[lay if lay in valid_slots else -1]).output
            for lay in self.layer_indexes
        ]
        print(self.layer_indexes)
        print(all_layers)
        encoder_layer = Add()(all_layers)
        print(encoder_layer.shape)
    self.output = NonMaskingLayer()(encoder_layer)
    self.input = model.inputs
    self.model = Model(model.inputs, self.output)
    print("load KerasXlnetEmbedding end")
    model.summary(132)
    self.embedding_size = self.model.output_shape[-1]
    self.vocab_size = len(self.tokenizer.sp)
class XlnetEmbedding(BaseEmbedding):
    """Embedding backed by a pre-trained XLNet model loaded via keras_xlnet."""

    def __init__(self, hyper_parameters):
        """Read the XLNet-specific settings, then defer to the base class.

        :param hyper_parameters: dict with an ``'embedding'`` section
            (optional keys ``layer_indexes``, default ``[24]``, and
            ``xlnet_embed``, default ``{}``) and a ``'model'`` section
            (optional key ``batch_size``, default ``1``)
        """
        embedding_conf = hyper_parameters['embedding']
        self.layer_indexes = embedding_conf.get('layer_indexes', [24])
        self.xlnet_embed = embedding_conf.get('xlnet_embed', {})
        self.batch_size = hyper_parameters['model'].get('batch_size', 1)
        # The attributes above are set first, before the base initializer runs
        super().__init__(hyper_parameters)

    def build(self):
        """Load the XLNet model and expose the selected layer output.

        Sets ``self.tokenizer``, ``self.input``, ``self.output``,
        ``self.model``, ``self.embedding_size`` and ``self.vocab_size``.
        An entry of ``self.layer_indexes`` that is not a valid position in
        the layer table falls back to the second-to-last table entry.
        """
        from keras_xlnet import Tokenizer, ATTENTION_TYPE_BI, ATTENTION_TYPE_UNI
        from keras_xlnet import load_trained_model_from_checkpoint

        self.embedding_type = 'xlnet'
        self.checkpoint_path = os.path.join(self.corpus_path,
                                            'xlnet_model.ckpt')
        self.config_path = os.path.join(self.corpus_path, 'xlnet_config.json')
        self.spiece_model = os.path.join(self.corpus_path, 'spiece.model')
        self.attention_type = self.xlnet_embed.get('attention_type',
                                                   'bi')  # or 'uni'
        self.attention_type = ATTENTION_TYPE_BI if self.attention_type == 'bi' else ATTENTION_TYPE_UNI
        self.memory_len = self.xlnet_embed.get('memory_len', 0)
        self.target_len = self.xlnet_embed.get('target_len', 32)
        print('load xlnet model start!')
        # Load the model
        model = load_trained_model_from_checkpoint(
            checkpoint_path=self.checkpoint_path,
            attention_type=self.attention_type,
            in_train_phase=self.trainable,
            config_path=self.config_path,
            memory_len=self.memory_len,
            target_len=self.target_len,
            batch_size=self.batch_size,
            mask_index=0)
        # Load the vocabulary (SentencePiece model)
        self.tokenizer = Tokenizer(self.spiece_model)
        # Kept so the layers can be inspected while debugging
        self.model_layers = model.layers
        len_layers = len(self.model_layers)
        print(len_layers)
        len_couche = int((len_layers - 6) / 10)
        # Original author's notes: 246 layers in total; every block has 10
        # layers (MultiHeadAttention, Dropout, Add, LayerNormalization); the
        # first 9 layers are the inputs and embeddings; 24 blocks in total.
        # layer_dict[k] is the Keras layer index used for requested layer k.
        layer_dict = [5] + [
            6 + 10 * block - 2 for block in range(1, len_couche + 1)
        ]
        # Accepted requests are 1..len_couche. The previous check also let
        # len_couche + 1 through, which is past the end of layer_dict and
        # raised IndexError instead of using the fallback.
        valid_slots = range(1, len(layer_dict))

        if len(self.layer_indexes) == 0:
            # No layer requested: use the model output itself
            encoder_layer = model.output
        elif len(self.layer_indexes) == 1:
            # One layer requested; invalid index -> second-to-last entry
            wanted = self.layer_indexes[0]
            slot = wanted if wanted in valid_slots else -2
            encoder_layer = model.get_layer(index=layer_dict[slot]).output
        else:
            # Several layers requested: add their outputs together;
            # invalid index -> second-to-last entry
            all_layers = [
                model.get_layer(
                    index=layer_dict[lay if lay in valid_slots else -2]).output
                for lay in self.layer_indexes
            ]
            print(self.layer_indexes)
            print(all_layers)
            encoder_layer = Add()(all_layers)
            print(encoder_layer.shape)
        self.output = NonMaskingLayer()(encoder_layer)
        self.input = model.inputs
        self.model = Model(model.inputs, self.output)
        print("load KerasXlnetEmbedding end")
        model.summary(132)
        self.embedding_size = self.model.output_shape[-1]
        self.vocab_size = len(self.tokenizer.sp)

    def sentence2idx(self, text):
        """Turn one text into the three input arrays for the model.

        :param text: text to encode
        :return: ``[token_input, segment_input, memory_length_input]``:
            the ids right-padded with 0 or truncated to ``self.target_len``
            with a leading axis of size 1, zeros of the same shape, and a
            ``(1, 1)`` array of zeros
        """
        tokens = self.tokenizer.encode(text)
        if len(tokens) < self.target_len:
            tokens = tokens + [0] * (self.target_len - len(tokens))
        else:
            tokens = tokens[0:self.target_len]
        token_input = np.expand_dims(np.array(tokens), axis=0)
        segment_input = np.zeros_like(token_input)
        memory_length_input = np.zeros((1, 1))
        return [token_input, segment_input, memory_length_input]
# Training settings and output file name
EPOCH = 10
BATCH_SIZE = 16
SEQ_LEN = 100
MODEL_NAME = 'RTE.h5'
# Label name -> class id
CLASSES = {'not_entailment': 0, 'entailment': 1}

# The data files are looked up next to this script
current_path = os.path.dirname(os.path.abspath(__file__))
train_path = os.path.join(current_path, 'train.tsv')
dev_path = os.path.join(current_path, 'dev.tsv')

paths = get_pretrained_paths(PretrainedList.en_cased_base)
tokenizer = Tokenizer(paths.vocab)


# Read data
class DataSequence(keras.utils.Sequence):
    """Serve ``(x, y)`` to Keras in consecutive slices of ``BATCH_SIZE``."""

    def __init__(self, x, y):
        """Keep the inputs ``x`` (a collection of sliceable items) and ``y``."""
        self.x = x
        self.y = y

    def __len__(self):
        """Return the number of batches, counting a final partial one."""
        full_batches, leftover = divmod(len(self.y), BATCH_SIZE)
        if leftover:
            return full_batches + 1
        return full_batches

    def __getitem__(self, index):
        """Return batch ``index`` as ``([slice of each x item], slice of y)``."""
        start = index * BATCH_SIZE
        window = slice(start, start + BATCH_SIZE)
        batch_inputs = [item[window] for item in self.x]
        return batch_inputs, self.y[window]
def __init__(self):
    """Load the XLNet model and expose the output of the selected layer(s).

    All settings are read from the module-level ``args`` object. The
    TensorFlow graph and the final Keras model are stored in the module
    globals ``graph`` and ``model``. An entry of ``args.layer_indexes`` that
    is not a valid position in the layer table falls back to the
    second-to-last table entry.
    """
    global graph
    global model
    self.attention_type = ATTENTION_TYPE_BI if args.attention_type[
        0] == 'bi' else ATTENTION_TYPE_UNI
    self.memory_len, self.target_len, self.batch_size = args.memory_len, args.target_len, args.batch_size
    self.checkpoint_path, self.config_path = args.ckpt_name, args.config_name
    self.layer_indexes, self.in_train_phase = args.layer_indexes, False
    print("load KerasXlnetEmbedding start! ")
    # Kept global so django / flask / tornado handlers can use it
    graph = tf.get_default_graph()
    # Load the model
    model = load_trained_model_from_checkpoint(
        checkpoint_path=self.checkpoint_path,
        attention_type=self.attention_type,
        in_train_phase=self.in_train_phase,
        config_path=self.config_path,
        memory_len=self.memory_len,
        target_len=self.target_len,
        batch_size=self.batch_size,
        mask_index=0)
    # Load the vocabulary (SentencePiece model)
    self.tokenizer = Tokenizer(args.spiece_model)
    # Kept so the layers can be inspected while debugging
    self.model_layers = model.layers
    len_layers = len(self.model_layers)
    print(len_layers)
    len_couche = int((len_layers - 6) / 10)
    # Original author's notes: 246 layers in total; every block has 10
    # layers (MultiHeadAttention, Dropout, Add, LayerNormalization); the
    # first 9 layers are the inputs and embeddings; 24 blocks in total.
    # layer_dict[k] is the Keras layer index used for requested layer k.
    layer_dict = [5] + [
        6 + 10 * block - 2 for block in range(1, len_couche + 1)
    ]
    # Accepted requests are 1..len_couche. The previous check also let
    # len_couche + 1 through, which is past the end of layer_dict and raised
    # IndexError instead of using the fallback.
    valid_slots = range(1, len(layer_dict))

    if len(self.layer_indexes) == 0:
        # No layer requested: use the model output itself
        encoder_layer = model.output
    elif len(self.layer_indexes) == 1:
        # One layer requested; invalid index -> second-to-last entry
        wanted = self.layer_indexes[0]
        slot = wanted if wanted in valid_slots else -2
        encoder_layer = model.get_layer(index=layer_dict[slot]).output
    else:
        # Several layers requested: add their outputs together;
        # invalid index -> second-to-last entry
        all_layers = [
            model.get_layer(
                index=layer_dict[lay if lay in valid_slots else -2]).output
            for lay in self.layer_indexes
        ]
        print(self.layer_indexes)
        print(all_layers)
        encoder_layer = Add()(all_layers)
        print(encoder_layer.shape)
    output_layer = NonMaskingLayer()(encoder_layer)
    # The global now refers to the wrapped model
    model = Model(model.inputs, output_layer)
    print("load KerasXlnetEmbedding end")
    model.summary(132)
# -*- coding: utf-8 -*- import os from keras_xlnet import Tokenizer, load_trained_model_from_checkpoint, ATTENTION_TYPE_BI from keras.utils import plot_model checkpoint_path = './xlnet_cased_L-12_H-768_A-12' tokenizer = Tokenizer(os.path.join(checkpoint_path, 'spiece.model')) model = load_trained_model_from_checkpoint( config_path=os.path.join(checkpoint_path, 'xlnet_config.json'), checkpoint_path=os.path.join(checkpoint_path, 'xlnet_model.ckpt'), batch_size=16, memory_len=512, target_len=128, in_train_phase=False, attention_type=ATTENTION_TYPE_BI, ) model.summary() plot_model(model, to_file="xlnet.png", show_shapes=True)
def init_tokenizer(spiece_model):
    """Create the module-level ``tokenizer`` from ``spiece_model``.

    :param spiece_model: value handed to the ``Tokenizer`` constructor
    """
    global tokenizer
    fresh_tokenizer = Tokenizer(spiece_model)
    tokenizer = fresh_tokenizer