# Module-level imports assumed by the methods below (present in the original
# files): os, codecs, keras, keras.models.Model, keras.layers (Add, Flatten,
# Dropout, Dense), and the package's NonMaskingLayer.
def build(self):
    from keras_textclassification.keras_layers.albert.albert import load_brightmart_albert_zh_checkpoint
    import keras_bert

    self.embedding_type = 'albert'
    dict_path = os.path.join(self.corpus_path, 'vocab.txt')
    print('load albert model start!')
    # Quick sanity check: replace any unsupported layer index with -1 (the last layer).
    self.layer_indexes = [i if i in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -2] else -1
                          for i in self.layer_indexes]
    self.model = load_brightmart_albert_zh_checkpoint(self.corpus_path,
                                                      training=self.trainable,
                                                      seq_len=self.len_max,
                                                      output_layers=self.layer_indexes)
    self.input = self.model.inputs
    self.output = self.model.outputs[0]
    # model_l = self.model.layers
    print('load albert model end!')
    # Keras indexes of all transformer-block outputs in the albert model.
    layer_dict = [8, 13]
    layer_0 = 13
    for i in range(10):
        layer_0 = layer_0 + 2
        layer_dict.append(layer_0)
    layer_dict.append(36)
    print(layer_dict)
    if len(self.layer_indexes) == 0:
        # No index requested: use the model output as-is.
        encoder_layer = self.model.output
    elif len(self.layer_indexes) == 1:
        # A single index: take that layer's output; on an invalid index, default to the last layer.
        if self.layer_indexes[0] in [i + 1 for i in range(13)]:
            encoder_layer = self.model.get_layer(index=layer_dict[self.layer_indexes[0] - 1]).output
        else:
            encoder_layer = self.model.get_layer(index=layer_dict[-1]).output
    else:
        # Several indexes: collect every requested layer output and merge them
        # (shape: 768 * number of layers). layer_indexes must be in [1, 2, ..., 12].
        # all_layers = [model.get_layer(index=lay).output if lay is not 1
        #               else model.get_layer(index=lay).output[0] for lay in layer_indexes]
        all_layers = [self.model.get_layer(index=layer_dict[lay - 1]).output
                      if lay in [i + 1 for i in range(13)]
                      else self.model.get_layer(index=layer_dict[-1]).output  # invalid index: fall back to the last layer
                      for lay in self.layer_indexes]
        all_layers_select = []
        for all_layers_one in all_layers:
            all_layers_select.append(all_layers_one)
        encoder_layer = Add()(all_layers_select)
    self.output = NonMaskingLayer()(encoder_layer)
    self.input = self.model.inputs
    self.model = Model(self.input, self.output)
    # self.embedding_size = self.model.output_shape[-1]

    # Read the vocab and build the tokenizer.
    self.token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            self.token_dict[token] = len(self.token_dict)
    self.vocab_size = len(self.token_dict)
    self.tokenizer = keras_bert.Tokenizer(self.token_dict)
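# A minimal usage sketch (not part of the original source) for the embedding
# built above. It assumes a built instance `emb` and the standard keras_bert
# Tokenizer API, where `encode(first, max_len=...)` returns (token_ids,
# segment_ids); the helper name `albert_encode` is hypothetical.
import numpy as np

def albert_encode(emb, text):
    # Map raw text to the [token_ids, segment_ids] pair the model expects.
    token_ids, segment_ids = emb.tokenizer.encode(first=text, max_len=emb.len_max)
    # Output shape is (1, len_max, hidden_size) for the selected/merged layer.
    return emb.model.predict([np.array([token_ids]), np.array([segment_ids])])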
def create_model(self, hyper_parameters):
    """
        Build the network.
    :param hyper_parameters: json, hyper parameters of network
    :return: tensor, model
    """
    super().create_model(hyper_parameters)
    # The word embedding itself is not used here; only the token (one-hot index) input is taken.
    # embedding = self.word_embedding.output
    # embed_layer = SpatialDropout1D(self.dropout)(embedding)
    encoder_input = keras.layers.Input(shape=(self.len_max,), name='Encoder-Input')
    encoder_embed_layer = EmbeddingRet(input_dim=self.word_embedding.vocab_size,
                                       output_dim=self.word_embedding.embed_size,
                                       mask_zero=False,
                                       weights=None,
                                       trainable=self.trainable,
                                       name='Token-Embedding')
    encoder_embedding = encoder_embed_layer(encoder_input)
    # Add sinusoidal position signals to the token embedding.
    encoder_embed = TriglePositiomEmbedding(mode=TriglePositiomEmbedding.MODE_ADD,
                                            name='Encoder-Embedding')(encoder_embedding[0])
    # Stack of transformer encoder blocks.
    encoded_layer = build_encoders(encoder_num=self.encoder_num,
                                   input_layer=encoder_embed,
                                   head_num=self.head_num,
                                   hidden_dim=self.hidden_dim,
                                   attention_activation=self.activate_classify,
                                   feed_forward_activation=self.activate_classify,
                                   dropout_rate=self.dropout,
                                   trainable=self.trainable,
                                   use_adapter=self.use_adapter,
                                   adapter_units=self.adapter_units,
                                   adapter_activation=self.adapter_activation)
    encoded_layer = NonMaskingLayer()(encoded_layer)
    encoded_layer_flat = Flatten()(encoded_layer)
    encoded_layer_drop = Dropout(self.dropout)(encoded_layer_flat)
    output = Dense(self.label, activation=self.activate_classify)(encoded_layer_drop)
    self.model = Model(inputs=encoder_input, outputs=output)
    self.model.summary(120)
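# A hypothetical `hyper_parameters` sketch for `create_model` above, listing
# the attributes the method relies on (set in `super().create_model`); the
# nesting and values are illustrative placeholders, not defaults from the
# original source.
hyper_parameters_example = {
    'model': {
        'encoder_num': 2,                # number of stacked encoder blocks
        'head_num': 8,                   # attention heads per block
        'hidden_dim': 3072,              # feed-forward hidden dimension
        'dropout': 0.1,                  # used for Dropout and the encoders
        'use_adapter': False,            # optional adapter sub-layers
        'adapter_units': 64,
        'adapter_activation': 'relu',
        'activate_classify': 'softmax',  # also reused as the encoder activations
        'label': 2,                      # number of output classes
        'trainable': True,
    },
}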
def build(self):
    from keras_textclassification.keras_layers.albert.albert import load_brightmart_albert_zh_checkpoint
    import keras_bert
    import json

    self.embedding_type = 'albert'
    dict_path = os.path.join(self.corpus_path, 'vocab.txt')
    print('load albert model start!')
    self.model = load_brightmart_albert_zh_checkpoint(self.corpus_path,
                                                      training=self.trainable,
                                                      seq_len=self.len_max,
                                                      output_layers=None)  # self.layer_indexes
    # Read num_hidden_layers from the albert config shipped with the checkpoint.
    config = {}
    for file_name in os.listdir(self.corpus_path):
        if file_name.startswith('albert_config_base.json'):
            with open(os.path.join(self.corpus_path, file_name)) as reader:
                config = json.load(reader)
            break
    num_hidden_layers = config.get('num_hidden_layers', 0)
    layer_real = [i for i in range(num_hidden_layers)] + [-i for i in range(num_hidden_layers)]
    # Quick sanity check: replace any unsupported layer index with -2.
    self.layer_indexes = [i if i in layer_real else -2 for i in self.layer_indexes]
    # self.input = self.model.inputs
    # self.output = self.model.outputs[0]
    model_l = self.model.layers
    print('load albert model end!')
    # Keras indexes of all transformer-block outputs in the albert model.
    layer_dict = [4, 8, 11, 13]
    layer_0 = 13
    for i in range(num_hidden_layers):
        layer_0 = layer_0 + 1
        layer_dict.append(layer_0)
    # layer_dict.append(34)
    print(layer_dict)
    if len(self.layer_indexes) == 0:
        # No index requested: use the model output as-is.
        encoder_layer = self.model.output
    elif len(self.layer_indexes) == 1:
        # A single index: take that layer's output; on an invalid index, fall back to layer_dict[-2].
        if self.layer_indexes[0] in layer_real:
            encoder_layer = self.model.get_layer(index=layer_dict[self.layer_indexes[0]]).output
        else:
            encoder_layer = self.model.get_layer(index=layer_dict[-2]).output
    else:
        # Several indexes: collect every requested layer output and merge them
        # (shape: 768 * number of layers).
        all_layers = [self.model.get_layer(index=layer_dict[lay]).output
                      if lay in layer_real
                      else self.model.get_layer(index=layer_dict[-2]).output  # invalid index: fall back
                      for lay in self.layer_indexes]
        all_layers_select = []
        for all_layers_one in all_layers:
            all_layers_select.append(all_layers_one)
        encoder_layer = Add()(all_layers_select)
    self.output = NonMaskingLayer()(encoder_layer)
    self.input = self.model.inputs
    self.model = Model(self.input, self.output)
    # self.embedding_size = self.model.output_shape[-1]

    # Read the vocab and build the tokenizer.
    self.token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            self.token_dict[token] = len(self.token_dict)
    self.vocab_size = len(self.token_dict)
    self.tokenizer = keras_bert.Tokenizer(self.token_dict)
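# A standalone sketch (assuming num_hidden_layers = 12, as in the base config)
# of the index bookkeeping above: layer_dict maps a transformer-block index to
# a Keras layer index, and any out-of-range request falls back to layer_dict[-2].
num_hidden_layers = 12
layer_real = [i for i in range(num_hidden_layers)] + [-i for i in range(num_hidden_layers)]
layer_dict = [4, 8, 11, 13]
layer_0 = 13
for i in range(num_hidden_layers):
    layer_0 += 1
    layer_dict.append(layer_0)
requested = [0, 11, 99]                    # 99 is invalid
sanitized = [i if i in layer_real else -2 for i in requested]
print([layer_dict[i] for i in sanitized])  # Keras layer indexes actually used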
def build(self):
    from keras_xlnet import load_trained_model_from_checkpoint, set_custom_objects
    from keras_xlnet import Tokenizer, ATTENTION_TYPE_BI, ATTENTION_TYPE_UNI

    self.embedding_type = 'xlnet'
    self.checkpoint_path = os.path.join(self.corpus_path, 'xlnet_model.ckpt')
    self.config_path = os.path.join(self.corpus_path, 'xlnet_config.json')
    self.spiece_model = os.path.join(self.corpus_path, 'spiece.model')
    self.attention_type = self.xlnet_embed.get('attention_type', 'bi')  # or 'uni'
    self.attention_type = ATTENTION_TYPE_BI if self.attention_type == 'bi' else ATTENTION_TYPE_UNI
    self.memory_len = self.xlnet_embed.get('memory_len', 0)
    self.target_len = self.xlnet_embed.get('target_len', 5)
    print('load xlnet model start!')
    # Load the pretrained model.
    model = load_trained_model_from_checkpoint(checkpoint_path=self.checkpoint_path,
                                               attention_type=self.attention_type,
                                               in_train_phase=self.trainable,
                                               config_path=self.config_path,
                                               memory_len=self.memory_len,
                                               target_len=self.target_len,
                                               batch_size=self.batch_size,
                                               mask_index=0)
    # set_custom_objects()
    # Load the sentencepiece vocabulary.
    self.tokenizer = Tokenizer(self.spiece_model)
    # Inspect the layers when debugging.
    self.model_layers = model.layers
    len_layers = self.model_layers.__len__()
    print(len_layers)
    layer_real = [i for i in range(25)] + [-i for i in range(25)]
    # Quick sanity check: replace any unsupported layer index with -2.
    self.layer_indexes = [i if i in layer_real else -2 for i in self.layer_indexes]
    # 246 Keras layers in total: each of the 24 transformer blocks contributes
    # 10 layers (MultiHeadAttention, Dropout, Add, LayerNormalization, ...),
    # preceded by the 9 input and embedding layers.
    len_couche = int((len_layers - 6) / 10)
    layer_dict = []
    layer_0 = 7
    for i in range(len_couche):
        layer_0 = layer_0 + 10
        layer_dict.append(layer_0)
    layer_dict.append(247)
    # Debug helper for probing which node indexes are valid for get_output_at:
    # def get_number(index):
    #     try:
    #         model_node = model.get_output_at(node_index=index)
    #     except:
    #         print('node index wrong!')
    #         print(index)
    # for li in [i for i in range(25)] + [-i for i in range(25)]:
    #     get_number(li)
    if len(self.layer_indexes) == 0:
        # No index requested: use the model output as-is.
        encoder_layer = model.output
    elif len(self.layer_indexes) == 1:
        # A single index: take that layer's output; on an invalid index, fall back to the last entry.
        if self.layer_indexes[0] in layer_real:
            encoder_layer = model.get_layer(index=layer_dict[self.layer_indexes[0]]).get_output_at(node_index=0)
        else:
            encoder_layer = model.get_layer(index=layer_dict[-1]).get_output_at(node_index=0)
    else:
        # Several indexes: collect every requested layer output and add them up
        # (shape: 768 * number of layers). layer_indexes must be in [0, 1, 2, ..., 24].
        all_layers = [model.get_layer(index=layer_dict[lay]).get_output_at(node_index=0)
                      if lay in layer_real
                      else model.get_layer(index=layer_dict[-1]).get_output_at(node_index=0)  # invalid index: fall back to the last entry
                      for lay in self.layer_indexes]
        print(self.layer_indexes)
        print(all_layers)
        all_layers_select = []
        for all_layers_one in all_layers:
            all_layers_select.append(all_layers_one)
        encoder_layer = Add()(all_layers_select)
        print(encoder_layer.shape)
    # def xlnet_concat(x):
    #     return K.concatenate(x, axis=1)
    # encoder_layer = Lambda(xlnet_concat, name='xlnet_concat')(all_layers)
    self.output = NonMaskingLayer()(encoder_layer)
    self.input = model.inputs
    self.model = Model(self.input, self.output)
    print('load KerasXlnetEmbedding end')
    model.summary(132)
    self.embedding_size = self.model.output_shape[-1]
    self.vocab_size = len(self.tokenizer.sp)
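# A standalone sketch of the layer arithmetic above, assuming the 24-block
# XLNet checkpoint (246 Keras layers): each block contributes 10 Keras layers,
# so block i's output sits at positional index 7 + 10 * (i + 1).
len_layers = 246
len_couche = int((len_layers - 6) / 10)   # -> 24 transformer blocks
layer_dict = []
layer_0 = 7
for i in range(len_couche):
    layer_0 += 10
    layer_dict.append(layer_0)
layer_dict.append(247)
print(len_couche, layer_dict[:3], layer_dict[-1])  # 24 [17, 27, 37] 247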
def build(self):
    import keras_bert

    self.embedding_type = 'bert'
    config_path = os.path.join(self.corpus_path, 'bert_config.json')
    check_point_path = os.path.join(self.corpus_path, 'bert_model.ckpt')
    dict_path = os.path.join(self.corpus_path, 'vocab.txt')
    print('load bert model start!')
    model = keras_bert.load_trained_model_from_checkpoint(config_path,
                                                          check_point_path,
                                                          seq_len=self.len_max,
                                                          trainable=self.trainable)
    print('load bert model end!')
    # Keras indexes of all transformer-block outputs in the bert model:
    # the embedding output is at index 6, and each of the 12 blocks adds 8 layers.
    layer_dict = [6]
    layer_0 = 7
    for i in range(12):
        layer_0 = layer_0 + 8
        layer_dict.append(layer_0)
    print(layer_dict)
    if len(self.layer_indexes) == 0:
        # No index requested: use the model output as-is.
        encoder_layer = model.output
    elif len(self.layer_indexes) == 1:
        # A single index: take that layer's output; on an invalid index, default to the last layer.
        if self.layer_indexes[0] in [i + 1 for i in range(13)]:
            encoder_layer = model.get_layer(index=layer_dict[self.layer_indexes[0] - 1]).output
        else:
            encoder_layer = model.get_layer(index=layer_dict[-1]).output
    else:
        # Several indexes: collect every requested layer output and merge them
        # (shape: 768 * number of layers). layer_indexes must be in [1, 2, ..., 12].
        # all_layers = [model.get_layer(index=lay).output if lay is not 1
        #               else model.get_layer(index=lay).output[0] for lay in layer_indexes]
        all_layers = [model.get_layer(index=layer_dict[lay - 1]).output
                      if lay in [i + 1 for i in range(13)]
                      else model.get_layer(index=layer_dict[-1]).output  # invalid index: fall back to the last layer
                      for lay in self.layer_indexes]
        all_layers_select = []
        for all_layers_one in all_layers:
            all_layers_select.append(all_layers_one)
        encoder_layer = Add()(all_layers_select)
    self.output = NonMaskingLayer()(encoder_layer)
    self.input = model.inputs
    self.model = Model(self.input, self.output)
    self.embedding_size = self.model.output_shape[-1]
    # word2idx = {}
    # with open(dict_path, 'r', encoding='utf-8') as f:
    #     words = f.read().splitlines()
    # for idx, word in enumerate(words):
    #     word2idx[word] = idx
    # for key, value in self.ot_dict.items():
    #     word2idx[key] = value
    # self.token2idx = word2idx

    # Read the vocab and build the tokenizer.
    self.token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            self.token_dict[token] = len(self.token_dict)
    self.vocab_size = len(self.token_dict)
    self.tokenizer = keras_bert.Tokenizer(self.token_dict)
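# A hedged alternative (not in the original source) to the index arithmetic
# above: keras_bert names each block's normalized output
# 'Encoder-{i}-FeedForward-Norm', so the same tensors can be fetched by name,
# which is less brittle than positional indexes if the layer layout changes.
def bert_block_output(model, block):
    # `block` is 1-based, i.e. in 1..12 for a 12-layer BERT.
    return model.get_layer('Encoder-%d-FeedForward-Norm' % block).output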
def build(self):
    from keras_xlnet import load_trained_model_from_checkpoint, set_custom_objects
    from keras_xlnet import Tokenizer, ATTENTION_TYPE_BI, ATTENTION_TYPE_UNI

    self.embedding_type = 'xlnet'
    self.checkpoint_path = os.path.join(self.corpus_path, 'xlnet_model.ckpt')
    self.config_path = os.path.join(self.corpus_path, 'xlnet_config.json')
    self.spiece_model = os.path.join(self.corpus_path, 'spiece.model')
    self.attention_type = self.xlnet_embed.get('attention_type', 'bi')  # or 'uni'
    self.attention_type = ATTENTION_TYPE_BI if self.attention_type == 'bi' else ATTENTION_TYPE_UNI
    self.memory_len = self.xlnet_embed.get('memory_len', 0)
    self.target_len = self.xlnet_embed.get('target_len', 5)
    print('load xlnet model start!')
    # Load the pretrained model.
    model = load_trained_model_from_checkpoint(checkpoint_path=self.checkpoint_path,
                                               attention_type=self.attention_type,
                                               in_train_phase=self.trainable,
                                               config_path=self.config_path,
                                               memory_len=self.memory_len,
                                               target_len=self.target_len,
                                               batch_size=self.batch_size,
                                               mask_index=0)
    # set_custom_objects()
    self.build_config(self.config_path)
    # Load the sentencepiece vocabulary.
    self.tokenizer = Tokenizer(self.spiece_model)
    # # Inspect the layers when debugging.
    # self.model_layers = model.layers
    # len_layers = self.model_layers.__len__()
    # print(len_layers)
    num_hidden_layers = self.configs.get('n_layer', 12)
    layer_real = [i for i in range(num_hidden_layers)] + [-i for i in range(num_hidden_layers)]
    # Quick sanity check: replace any unsupported layer index with -2.
    self.layer_indexes = [i if i in layer_real else -2 for i in self.layer_indexes]
    # Fetch every block's output by layer name instead of positional index.
    output_layer = 'FeedForward-Normal-{0}'
    layer_dict = [model.get_layer(output_layer.format(i + 1)).get_output_at(node_index=0)
                  for i in range(num_hidden_layers)]
    if len(self.layer_indexes) == 0:
        # No index requested: use the model output as-is.
        encoder_layer = model.output
    elif len(self.layer_indexes) == 1:
        # A single index: take that block's output; on an invalid index, use the last block.
        if self.layer_indexes[0] in layer_real:
            encoder_layer = layer_dict[self.layer_indexes[0]]
        else:
            encoder_layer = layer_dict[-1]
    else:
        # Several indexes: collect every requested block output and add them up
        # (shape: 768 * number of layers). layer_indexes must be in [0, 1, 2, ..., 24].
        all_layers = [layer_dict[lay] if lay in layer_real
                      else layer_dict[-1]  # invalid index: fall back to the last block
                      for lay in self.layer_indexes]
        print(self.layer_indexes)
        print(all_layers)
        all_layers_select = []
        for all_layers_one in all_layers:
            all_layers_select.append(all_layers_one)
        encoder_layer = Add()(all_layers_select)
        print(encoder_layer.shape)
    # def xlnet_concat(x):
    #     return K.concatenate(x, axis=1)
    # encoder_layer = Lambda(xlnet_concat, name='xlnet_concat')(all_layers)
    self.output = NonMaskingLayer()(encoder_layer)
    self.input = model.inputs
    self.model = Model(self.input, self.output)
    print('load KerasXlnetEmbedding end')
    model.summary(132)
    self.embedding_size = self.model.output_shape[-1]
    self.vocab_size = len(self.tokenizer.sp)
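# A hypothetical `xlnet_embed` config sketch for `build` above, showing the
# keys the method reads via `.get`; the values are illustrative placeholders.
xlnet_embed_example = {
    'attention_type': 'bi',  # 'bi' -> ATTENTION_TYPE_BI, anything else -> ATTENTION_TYPE_UNI
    'memory_len': 0,         # length of the recurrence memory
    'target_len': 5,         # target sequence length fed to the checkpoint loader
}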
def build(self):
    from keras_xlnet import Tokenizer, ATTENTION_TYPE_BI, ATTENTION_TYPE_UNI
    from keras_xlnet import load_trained_model_from_checkpoint

    self.embedding_type = 'xlnet'
    self.checkpoint_path = os.path.join(self.corpus_path, 'xlnet_model.ckpt')
    self.config_path = os.path.join(self.corpus_path, 'xlnet_config.json')
    self.spiece_model = os.path.join(self.corpus_path, 'spiece.model')
    self.attention_type = self.xlnet_embed.get('attention_type', 'bi')  # or 'uni'
    self.attention_type = ATTENTION_TYPE_BI if self.attention_type == 'bi' else ATTENTION_TYPE_UNI
    self.memory_len = self.xlnet_embed.get('memory_len', 0)
    self.target_len = self.xlnet_embed.get('target_len', 5)
    print('load xlnet model start!')
    # Load the pretrained model.
    model = load_trained_model_from_checkpoint(checkpoint_path=self.checkpoint_path,
                                               attention_type=self.attention_type,
                                               in_train_phase=self.trainable,
                                               config_path=self.config_path,
                                               memory_len=self.memory_len,
                                               target_len=self.target_len,
                                               batch_size=self.batch_size,
                                               mask_index=0)
    # Load the sentencepiece vocabulary.
    self.tokenizer = Tokenizer(self.spiece_model)
    # Inspect the layers when debugging.
    self.model_layers = model.layers
    len_layers = self.model_layers.__len__()
    print(len_layers)
    # 246 Keras layers in total: each of the 24 transformer blocks contributes
    # 10 layers (MultiHeadAttention, Dropout, Add, LayerNormalization, ...),
    # preceded by the input and embedding layers.
    len_couche = int((len_layers - 6) / 10)
    layer_dict = [5]
    layer_0 = 6
    for i in range(len_couche):
        layer_0 = layer_0 + 10
        layer_dict.append(layer_0 - 2)
    if len(self.layer_indexes) == 0:
        # No index requested: use the model output as-is.
        encoder_layer = model.output
    elif len(self.layer_indexes) == 1:
        # A single index: take that layer's output; on an invalid index, fall back to the last entry.
        if self.layer_indexes[0] in [i + 1 for i in range(len_couche + 1)]:
            encoder_layer = model.get_layer(index=layer_dict[self.layer_indexes[0]]).output
        else:
            encoder_layer = model.get_layer(index=layer_dict[-1]).output
    else:
        # Several indexes: collect every requested layer output and add them up
        # (shape: 768 * number of layers). layer_indexes must be in [0, 1, 2, ..., 24].
        all_layers = [model.get_layer(index=layer_dict[lay]).output
                      if lay in [i + 1 for i in range(len_couche + 1)]
                      else model.get_layer(index=layer_dict[-1]).output  # invalid index: fall back to the last entry
                      for lay in self.layer_indexes]
        print(self.layer_indexes)
        print(all_layers)
        all_layers_select = []
        for all_layers_one in all_layers:
            all_layers_select.append(all_layers_one)
        encoder_layer = Add()(all_layers_select)
        print(encoder_layer.shape)
    self.output = NonMaskingLayer()(encoder_layer)
    self.input = model.inputs
    self.model = Model(model.inputs, self.output)
    print('load KerasXlnetEmbedding end')
    model.summary(132)
    self.embedding_size = self.model.output_shape[-1]
    self.vocab_size = len(self.tokenizer.sp)
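# A minimal usage sketch (not part of the original source) for the tokenizer
# built above: keras_xlnet's Tokenizer wraps a sentencepiece model, `encode`
# maps raw text to token ids, and `sp` is the underlying SentencePieceProcessor
# whose length is the vocab size. The helper name is hypothetical.
def xlnet_ids_and_vocab(tokenizer, text):
    return tokenizer.encode(text), len(tokenizer.sp)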