def build(self):
    """Build the ALBERT embedding sub-model (fixed 12-layer variant).

    Loads a brightmart albert_zh checkpoint from ``self.corpus_path``,
    selects (or sums) the hidden layers listed in ``self.layer_indexes``
    as the embedding output, and builds the WordPiece tokenizer from
    ``vocab.txt``.

    Sets: ``self.model``, ``self.input``, ``self.output``,
    ``self.token_dict``, ``self.vocab_size``, ``self.tokenizer``.
    """
    from keras_textclassification.keras_layers.albert.albert import load_brightmart_albert_zh_checkpoint
    import keras_bert
    self.embedding_type = 'albert'
    # Vocabulary file shipped alongside the checkpoint.
    dict_path = os.path.join(self.corpus_path, 'vocab.txt')
    print('load bert model start!')
    # Quick sanity filter: any index outside 0..11 / -1 / -2 is coerced to -1.
    # NOTE(review): the validity check further below tests membership in 1..13,
    # so coerced values (-1, 0) end up on the fallback (last-layer) branch.
    self.layer_indexes = [i if i in [0,1,2,3,4,5,6,7,8,9,10,11, -1,-2] else -1 for i in self.layer_indexes]
    self.model = load_brightmart_albert_zh_checkpoint(self.corpus_path,
                                                      training=self.trainable,
                                                      seq_len=self.len_max,
                                                      output_layers = self.layer_indexes)
    self.input = self.model.inputs
    self.output = self.model.outputs[0]
    # model_l = model.layers
    print('load bert model end!')
    # albert model all layers
    # layer_dict maps a transformer-block number (1-based) to the keras layer
    # index of its output in the loaded model: [8, 13, 15, 17, ..., 33, 36]
    # (13 entries for a 12-block model plus the final output).
    layer_dict = [8, 13]
    layer_0 = 13
    for i in range(10):
        layer_0 = layer_0 + 2
        layer_dict.append(layer_0)
    layer_dict.append(36)
    print(layer_dict)
    # No layers requested: output the model as-is.
    if len(self.layer_indexes) == 0:
        encoder_layer = self.model.output
    # Exactly one layer requested: take that layer's output; an index not in
    # 1..13 falls back to the last entry of layer_dict.
    elif len(self.layer_indexes) == 1:
        if self.layer_indexes[0] in [i + 1 for i in range(13)]:
            encoder_layer = self.model.get_layer(index=layer_dict[self.layer_indexes[0] - 1]).output
        else:
            encoder_layer = self.model.get_layer(index=layer_dict[-1]).output
    # Several layers requested: gather each requested layer's output and
    # combine them element-wise with Add (hidden size per layer: 768).
    else:
        # layer_indexes must be [1,2,3,......12]
        # all_layers = [model.get_layer(index=lay).output if lay is not 1 else model.get_layer(index=lay).output[0] for lay in layer_indexes]
        all_layers = [self.model.get_layer(index=layer_dict[lay - 1]).output if lay in [i + 1 for i in range(13)]
                      # An invalid index defaults to the last layer.
                      else self.model.get_layer(index=layer_dict[-1]).output
                      for lay in self.layer_indexes]
        all_layers_select = []
        for all_layers_one in all_layers:
            all_layers_select.append(all_layers_one)
        encoder_layer = Add()(all_layers_select)
    self.output = NonMaskingLayer()(encoder_layer)
    self.input = self.model.inputs
    self.model = Model(self.input, self.output)
    # self.embedding_size = self.model.output_shape[-1]
    # reader tokenizer
    self.token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            # Token id = insertion order, i.e. the line number in vocab.txt.
            self.token_dict[token] = len(self.token_dict)
    self.vocab_size = len(self.token_dict)
    self.tokenizer = keras_bert.Tokenizer(self.token_dict)
def build(self):
    """Build the ALBERT embedding sub-model (config-driven variant).

    Loads a brightmart albert_zh checkpoint from ``self.corpus_path``,
    reads ``num_hidden_layers`` from the checkpoint's config json, picks
    (or element-wise sums) the hidden layers named in
    ``self.layer_indexes`` as the embedding output, and builds the
    WordPiece tokenizer from ``vocab.txt``.

    Sets: ``self.model``, ``self.input``, ``self.output``,
    ``self.token_dict``, ``self.vocab_size``, ``self.tokenizer``.
    """
    from keras_textclassification.keras_layers.albert.albert import load_brightmart_albert_zh_checkpoint
    import keras_bert
    import json

    self.embedding_type = 'albert'
    dict_path = os.path.join(self.corpus_path, 'vocab.txt')
    print('load bert model start!')
    self.model = load_brightmart_albert_zh_checkpoint(
        self.corpus_path,
        training=self.trainable,
        seq_len=self.len_max,
        output_layers=None)  # self.layer_indexes
    # Read the transformer depth from the checkpoint's config file; if the
    # file is missing, num_hidden_layers stays 0 and every requested index
    # falls back to -2 below.
    config = {}
    for file_name in os.listdir(self.corpus_path):
        if file_name.startswith('albert_config_base.json'):
            with open(os.path.join(self.corpus_path, file_name)) as reader:
                config = json.load(reader)
            break
    num_hidden_layers = config.get("num_hidden_layers", 0)
    # Valid layer references: 0..n-1 from the front, -1..-n from the back.
    # BUG FIX: the negative half previously used range(num_hidden_layers),
    # which yields [0, -1, ..., -(n-1)] — duplicating 0 and wrongly
    # rejecting -n, so a request for the deepest negative index was
    # silently remapped to -2.
    layer_real = [i for i in range(num_hidden_layers)] \
                 + [-i for i in range(1, num_hidden_layers + 1)]
    # Anything out of range falls back to -2 (second-to-last layer_dict entry).
    self.layer_indexes = [i if i in layer_real else -2 for i in self.layer_indexes]
    # self.input = self.model.inputs
    # self.output = self.model.outputs[0]
    print('load bert model end!')
    # layer_dict maps a layer reference to the keras layer index of its
    # output in the loaded model: fixed prefix [4, 8, 11, 13], then one
    # entry per hidden block (14, 15, ...).
    layer_dict = [4, 8, 11, 13]
    layer_0 = 13
    for i in range(num_hidden_layers):
        layer_0 = layer_0 + 1
        layer_dict.append(layer_0)
    print(layer_dict)
    if len(self.layer_indexes) == 0:
        # Nothing requested: use the model's own output.
        encoder_layer = self.model.output
    elif len(self.layer_indexes) == 1:
        # One layer requested: take it, or fall back to the second-to-last.
        if self.layer_indexes[0] in layer_real:
            encoder_layer = self.model.get_layer(
                index=layer_dict[self.layer_indexes[0]]).output
        else:
            encoder_layer = self.model.get_layer(index=layer_dict[-2]).output
    else:
        # Several layers requested: fetch each one (invalid entries default
        # to the second-to-last layer) and sum them element-wise with Add.
        all_layers = [
            self.model.get_layer(index=layer_dict[lay]).output
            if lay in layer_real
            else self.model.get_layer(index=layer_dict[-2]).output
            for lay in self.layer_indexes
        ]
        encoder_layer = Add()(all_layers)
    self.output = NonMaskingLayer()(encoder_layer)
    self.input = self.model.inputs
    self.model = Model(self.input, self.output)
    # self.embedding_size = self.model.output_shape[-1]
    # reader tokenizer: token id = line number in vocab.txt
    self.token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            self.token_dict[token] = len(self.token_dict)
    self.vocab_size = len(self.token_dict)
    self.tokenizer = keras_bert.Tokenizer(self.token_dict)