Example #1
    def build(self):
        from keras_textclassification.keras_layers.albert.albert import load_brightmart_albert_zh_checkpoint
        import keras_bert

        self.embedding_type = 'albert'
        dict_path = os.path.join(self.corpus_path, 'vocab.txt')
        print('load bert model start!')
        # quick sanity check: map any unknown layer index to the last layer (-1)
        self.layer_indexes = [i if i in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -2] else -1 for i in self.layer_indexes]
        self.model = load_brightmart_albert_zh_checkpoint(self.corpus_path,
                                                          training=self.trainable,
                                                          seq_len=self.len_max,
                                                          output_layers=self.layer_indexes)
        self.input = self.model.inputs
        self.output = self.model.outputs[0]

        # model_l = model.layers
        print('load bert model end!')
        # albert model all layers
        layer_dict = [8, 13]
        layer_0 = 13
        for i in range(10):
            layer_0 = layer_0 + 2
            layer_dict.append(layer_0)
        layer_dict.append(36)
        print(layer_dict)
        # no layer index requested: use the model's own output
        if len(self.layer_indexes) == 0:
            encoder_layer = self.model.output
        # exactly one layer requested: take that layer's output; if the index is invalid, fall back to the last layer
        elif len(self.layer_indexes) == 1:
            if self.layer_indexes[0] in [i + 1 for i in range(13)]:
                encoder_layer = self.model.get_layer(index=layer_dict[self.layer_indexes[0] - 1]).output
            else:
                encoder_layer = self.model.get_layer(index=layer_dict[-1]).output
        # otherwise collect the outputs of every requested layer and sum them element-wise
        else:
            # valid layer_indexes fall in [1, 2, ..., 13]; anything else defaults to the last layer
            # all_layers = [model.get_layer(index=lay).output if lay is not 1 else model.get_layer(index=lay).output[0] for lay in layer_indexes]
            all_layers = [self.model.get_layer(index=layer_dict[lay - 1]).output if lay in [i + 1 for i in range(13)]
                          else self.model.get_layer(index=layer_dict[-1]).output  # invalid index: default to the last layer
                          for lay in self.layer_indexes]
            # sum the selected layer outputs element-wise
            encoder_layer = Add()(all_layers)
        self.output = NonMaskingLayer()(encoder_layer)
        self.input = self.model.inputs
        self.model = Model(self.input, self.output)

        # self.embedding_size = self.model.output_shape[-1]

        # reader tokenizer
        self.token_dict = {}
        with codecs.open(dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)
        self.vocab_size = len(self.token_dict)
        self.tokenizer = keras_bert.Tokenizer(self.token_dict)
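
A minimal usage sketch for the embedding built above (hedged: `emb` stands for an instance of this class after `build()` has run, and the two-input token/segment layout follows the usual keras_bert convention; names and sample text are illustrative):

import numpy as np

# hypothetical instance of the ALBERT embedding class above
text = 'sample sentence'
# keras_bert.Tokenizer.encode returns (token indices, segment ids)
indices, segments = emb.tokenizer.encode(first=text, max_len=emb.len_max)
# the wrapped Model usually expects token-index and segment inputs
vectors = emb.model.predict([np.array([indices]), np.array([segments])])
print(vectors.shape)  # roughly (1, len_max, hidden_size)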
Example #2
 def create_model(self, hyper_parameters):
     """
         构建神经网络
     :param hyper_parameters:json,  hyper parameters of network
     :return: tensor, moedl
     """
     super().create_model(
         hyper_parameters)  # the parent embedding is not used here; only tokens (one-hot indexes) are extracted
     # embedding = self.word_embedding.output
     # embed_layer = SpatialDropout1D(self.dropout)(embedding)
     encoder_input = keras.layers.Input(shape=(self.len_max, ),
                                        name='Encoder-Input')
     encoder_embed_layer = EmbeddingRet(
         input_dim=self.word_embedding.vocab_size,
         output_dim=self.word_embedding.embed_size,
         mask_zero=False,
         weights=None,
         trainable=self.trainable,
         name='Token-Embedding',
     )
     encoder_embedding = encoder_embed_layer(encoder_input)
     encoder_embed = TriglePositiomEmbedding(
         mode=TriglePositiomEmbedding.MODE_ADD,
         name='Encoder-Embedding',
     )(encoder_embedding[0])
     encoded_layer = build_encoders(
         encoder_num=self.encoder_num,
         input_layer=encoder_embed,
         head_num=self.head_num,
         hidden_dim=self.hidden_dim,
         attention_activation=self.activate_classify,
         feed_forward_activation=self.activate_classify,
         dropout_rate=self.dropout,
         trainable=self.trainable,
         use_adapter=self.use_adapter,
         adapter_units=self.adapter_units,
         adapter_activation=self.adapter_activation,
     )
     encoded_layer = NonMaskingLayer()(encoded_layer)
     encoded_layer_flat = Flatten()(encoded_layer)
     encoded_layer_drop = Dropout(self.dropout)(encoded_layer_flat)
     output = Dense(self.label,
                    activation=self.activate_classify)(encoded_layer_drop)
     self.model = Model(inputs=encoder_input, outputs=output)
     self.model.summary(120)
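
For context, `TriglePositiomEmbedding` in `MODE_ADD` adds the classic sinusoidal Transformer position signal onto the token embeddings. A minimal NumPy sketch of that formula (shapes and names here are illustrative, not the library's implementation):

import numpy as np

def sinusoid_position_signal(seq_len, embed_dim):
    # PE(pos, 2i) = sin(pos / 10000^(2i/d)), PE(pos, 2i+1) = cos(pos / 10000^(2i/d))
    positions = np.arange(seq_len)[:, None]              # (seq_len, 1)
    dims = np.arange(embed_dim)[None, :]                 # (1, embed_dim)
    angles = positions / np.power(10000.0, (2 * (dims // 2)) / embed_dim)
    signal = np.zeros((seq_len, embed_dim))
    signal[:, 0::2] = np.sin(angles[:, 0::2])            # even dims: sine
    signal[:, 1::2] = np.cos(angles[:, 1::2])            # odd dims: cosine
    return signal

token_embed = np.random.rand(8, 16)   # toy (seq_len, embed_dim) embeddings
encoded = token_embed + sinusoid_position_signal(8, 16)  # MODE_ADD behaviour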
Example #3
    def build(self):
        from keras_textclassification.keras_layers.albert.albert import load_brightmart_albert_zh_checkpoint
        import keras_bert

        self.embedding_type = 'albert'
        dict_path = os.path.join(self.corpus_path, 'vocab.txt')
        print('load bert model start!')
        # quick sanity check on the requested layer indexes (done below, once the config is read)
        # self.layer_indexes = [i if i in layer_real else -2 for i in self.layer_indexes]
        self.model = load_brightmart_albert_zh_checkpoint(
            self.corpus_path,
            training=self.trainable,
            seq_len=self.len_max,
            output_layers=None)  # self.layer_indexes)
        import json
        config = {}
        for file_name in os.listdir(self.corpus_path):
            if file_name.startswith('albert_config_base.json'):
                with open(os.path.join(self.corpus_path, file_name)) as reader:
                    config = json.load(reader)
                break

        num_hidden_layers = config.get("num_hidden_layers", 0)
        layer_real = [i for i in range(num_hidden_layers)
                      ] + [-i for i in range(num_hidden_layers)]
        self.layer_indexes = [
            i if i in layer_real else -2 for i in self.layer_indexes
        ]

        # self.input = self.model.inputs
        # self.output = self.model.outputs[0]
        model_l = self.model.layers
        print('load bert model end!')
        # albert model all layers
        layer_dict = [4, 8, 11, 13]
        layer_0 = 13
        for i in range(num_hidden_layers):
            layer_0 = layer_0 + 1
            layer_dict.append(layer_0)
        # layer_dict.append(34)
        print(layer_dict)
        # no layer index requested: use the model's own output
        if len(self.layer_indexes) == 0:
            encoder_layer = self.model.output
        # exactly one layer requested: take that layer's output; if the index is invalid, fall back to the default entry
        elif len(self.layer_indexes) == 1:
            if self.layer_indexes[0] in layer_real:
                encoder_layer = self.model.get_layer(
                    index=layer_dict[self.layer_indexes[0]]).output
            else:
                encoder_layer = self.model.get_layer(
                    index=layer_dict[-2]).output
        # otherwise collect the outputs of every requested layer and sum them element-wise
        else:
            # valid layer_indexes fall in [0, 1, ..., num_hidden_layers - 1] (or their negatives)
            # all_layers = [model.get_layer(index=lay).output if lay is not 1 else model.get_layer(index=lay).output[0] for lay in layer_indexes]
            all_layers = [
                self.model.get_layer(index=layer_dict[lay]).output
                if lay in layer_real else self.model.get_layer(
                    index=layer_dict[-2]).output  # invalid index: fall back to the default entry
                for lay in self.layer_indexes
            ]
            # sum the selected layer outputs element-wise
            encoder_layer = Add()(all_layers)
        self.output = NonMaskingLayer()(encoder_layer)
        self.input = self.model.inputs
        self.model = Model(self.input, self.output)

        # self.embedding_size = self.model.output_shape[-1]

        # reader tokenizer
        self.token_dict = {}
        with codecs.open(dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)
        self.vocab_size = len(self.token_dict)
        self.tokenizer = keras_bert.Tokenizer(self.token_dict)
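
Note that negative entries in `layer_indexes` (and the `-2` fallback) work here simply because Python list indexing accepts negative positions. A one-liner to see which model-layer index a given entry resolves to (illustrative, run inside the method's scope):

# map sample indexes to the layer_dict entries they resolve to
for lay in [0, 3, -1, -2]:
    print(lay, '->', layer_dict[lay])   # negative values count from the end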
Example #4
    def build(self):
        from keras_xlnet import load_trained_model_from_checkpoint, set_custom_objects
        from keras_xlnet import Tokenizer, ATTENTION_TYPE_BI, ATTENTION_TYPE_UNI

        self.embedding_type = 'xlnet'
        self.checkpoint_path = os.path.join(self.corpus_path,
                                            'xlnet_model.ckpt')
        self.config_path = os.path.join(self.corpus_path, 'xlnet_config.json')
        self.spiece_model = os.path.join(self.corpus_path, 'spiece.model')

        self.attention_type = self.xlnet_embed.get('attention_type',
                                                   'bi')  # or 'uni'
        self.attention_type = ATTENTION_TYPE_BI if self.attention_type == 'bi' else ATTENTION_TYPE_UNI
        self.memory_len = self.xlnet_embed.get('memory_len', 0)
        self.target_len = self.xlnet_embed.get('target_len', 5)
        print('load xlnet model start!')
        # load the model
        model = load_trained_model_from_checkpoint(
            checkpoint_path=self.checkpoint_path,
            attention_type=self.attention_type,
            in_train_phase=self.trainable,
            config_path=self.config_path,
            memory_len=self.memory_len,
            target_len=self.target_len,
            batch_size=self.batch_size,
            mask_index=0)
        #
        set_custom_objects()
        # load the vocabulary/tokenizer
        self.tokenizer = Tokenizer(self.spiece_model)
        # inspect layers when debugging
        self.model_layers = model.layers
        len_layers = len(self.model_layers)
        print(len_layers)

        layer_real = [i for i in range(25)] + [-i for i in range(25)]
        # quick sanity check on the requested layer indexes
        self.layer_indexes = [
            i if i in layer_real else -2 for i in self.layer_indexes
        ]

        len_couche = int((len_layers - 6) / 10)
        # 246 Keras layers in total: each encoder block contributes 10 layers
        # (MultiHeadAttention, Dropout, Add, LayerNormalization, ...), preceded
        # by 9 input/embedding layers; 24 encoder blocks overall
        layer_dict = []
        layer_0 = 7
        for i in range(len_couche):
            layer_0 = layer_0 + 10
            layer_dict.append(layer_0)
        layer_dict.append(247)
        # testing get_output_at
        # def get_number(index):
        #     try:
        #        model_node = model.get_output_at(node_index=index)
        #        gg = 0
        #     except:
        #         print('node index wrong!')
        #         print(index)
        # list_index = [i for i in range(25)] + [-i for i in range(25)]
        # for li in list_index:
        #     get_number(li)

        # no layer index requested: use the model's own output
        if len(self.layer_indexes) == 0:
            encoder_layer = model.output
        # exactly one layer requested: fall back to the last entry if the index is invalid
        elif len(self.layer_indexes) == 1:
            if self.layer_indexes[0] in layer_real:
                encoder_layer = model.get_layer(
                    index=layer_dict[self.layer_indexes[0]]).get_output_at(
                        node_index=0)
            else:
                encoder_layer = model.get_layer(
                    index=layer_dict[-1]).get_output_at(node_index=0)
        # otherwise collect the outputs of every requested layer and sum them element-wise
        else:
            # valid layer_indexes fall in [0, 1, 2, ..., 24] (or their negatives)
            all_layers = [
                model.get_layer(index=layer_dict[lay]).get_output_at(
                    node_index=0) if lay in layer_real else model.get_layer(
                        index=layer_dict[-1]).get_output_at(
                            node_index=0)  # invalid index: default to the last entry
                for lay in self.layer_indexes
            ]
            print(self.layer_indexes)
            print(all_layers)
            # sum the selected layer outputs element-wise
            encoder_layer = Add()(all_layers)
            print(encoder_layer.shape)

            # def xlnet_concat(x):
            #     x_concat = K.concatenate(x, axis=1)
            #     return x_concat
            # encoder_layer = Lambda(xlnet_concat, name='xlnet_concat')(all_layers)

        self.output = NonMaskingLayer()(encoder_layer)
        self.input = model.inputs
        self.model = Model(self.input, self.output)
        print("load KerasXlnetEmbedding end")
        model.summary(132)

        self.embedding_size = self.model.output_shape[-1]
        self.vocab_size = len(self.tokenizer.sp)
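
A rough usage sketch for the XLNet embedding above (hedged: `emb` is assumed to be an instance of this class after `build()`, the three-input token/segment/memory-length layout follows the keras_xlnet convention, and the padding here is deliberately simplified):

import numpy as np

# hypothetical instance of the XLNet embedding class above
tokens = emb.tokenizer.encode('sample sentence')   # SentencePiece ids
# pad or trim to target_len; the real padding symbol depends on the checkpoint
tokens = (tokens + [0] * emb.target_len)[:emb.target_len]
token_input = np.array([tokens])
segment_input = np.zeros_like(token_input)
memory_length_input = np.zeros((1, 1))
vectors = emb.model.predict([token_input, segment_input, memory_length_input])
print(vectors.shape)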
Example #5
    def build(self):
        import keras_bert

        self.embedding_type = 'bert'
        config_path = os.path.join(self.corpus_path, 'bert_config.json')
        check_point_path = os.path.join(self.corpus_path, 'bert_model.ckpt')
        dict_path = os.path.join(self.corpus_path, 'vocab.txt')
        print('load bert model start!')
        model = keras_bert.load_trained_model_from_checkpoint(
            config_path,
            check_point_path,
            seq_len=self.len_max,
            trainable=self.trainable)
        print('load bert model end!')
        # bert model all layers
        layer_dict = [6]
        layer_0 = 7
        for i in range(12):
            layer_0 = layer_0 + 8
            layer_dict.append(layer_0)
        print(layer_dict)
        # no layer index requested: use the model's own output
        if len(self.layer_indexes) == 0:
            encoder_layer = model.output
        # exactly one layer requested: take that layer's output; if the index is invalid, fall back to the last layer
        elif len(self.layer_indexes) == 1:
            if self.layer_indexes[0] in [i + 1 for i in range(13)]:
                encoder_layer = model.get_layer(
                    index=layer_dict[self.layer_indexes[0] - 1]).output
            else:
                encoder_layer = model.get_layer(index=layer_dict[-1]).output
        # otherwise collect the outputs of every requested layer and sum them element-wise
        else:
            # valid layer_indexes fall in [1, 2, ..., 13]; anything else defaults to the last layer
            # all_layers = [model.get_layer(index=lay).output if lay is not 1 else model.get_layer(index=lay).output[0] for lay in layer_indexes]
            all_layers = [
                model.get_layer(index=layer_dict[lay - 1]).output
                if lay in [i + 1 for i in range(13)] else model.get_layer(
                    index=layer_dict[-1]).output  # invalid index: default to the last layer
                for lay in self.layer_indexes
            ]
            # sum the selected layer outputs element-wise
            encoder_layer = Add()(all_layers)
        self.output = NonMaskingLayer()(encoder_layer)
        self.input = model.inputs
        self.model = Model(self.input, self.output)

        self.embedding_size = self.model.output_shape[-1]
        # word2idx = {}
        # with open(dict_path, 'r', encoding='utf-8') as f:
        #     words = f.read().splitlines()
        # for idx, word in enumerate(words):
        #     word2idx[word] = idx
        # for key, value in self.ot_dict.items():
        #     word2idx[key] = value
        #
        # self.token2idx = word2idx

        # reader tokenizer
        self.token_dict = {}
        with codecs.open(dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)
        self.vocab_size = len(self.token_dict)
        self.tokenizer = keras_bert.Tokenizer(self.token_dict)
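
The hard-coded offsets in `layer_dict` (embedding output at index 6, then one entry every 8 Keras layers per transformer block) are easy to get wrong across keras_bert versions; a quick check is to print the index/name pairs and confirm where each block's feed-forward norm output actually sits (the `FeedForward-Norm` suffix below is keras_bert's layer-naming convention):

# sanity-check the hard-coded offsets against the real layer list
for idx, layer in enumerate(model.layers):
    if layer.name.endswith('FeedForward-Norm'):
        print(idx, layer.name)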
Example #6
    def build(self):
        from keras_xlnet import load_trained_model_from_checkpoint, set_custom_objects
        from keras_xlnet import Tokenizer, ATTENTION_TYPE_BI, ATTENTION_TYPE_UNI

        self.embedding_type = 'xlnet'
        self.checkpoint_path = os.path.join(self.corpus_path, 'xlnet_model.ckpt')
        self.config_path = os.path.join(self.corpus_path, 'xlnet_config.json')
        self.spiece_model = os.path.join(self.corpus_path, 'spiece.model')

        self.attention_type = self.xlnet_embed.get('attention_type', 'bi')  # or 'uni'
        self.attention_type = ATTENTION_TYPE_BI if self.attention_type == 'bi' else ATTENTION_TYPE_UNI
        self.memory_len = self.xlnet_embed.get('memory_len', 0)
        self.target_len = self.xlnet_embed.get('target_len', 5)
        print('load xlnet model start!')
        # load the model
        model = load_trained_model_from_checkpoint(checkpoint_path=self.checkpoint_path,
                                                   attention_type=self.attention_type,
                                                   in_train_phase=self.trainable,
                                                   config_path=self.config_path,
                                                   memory_len=self.memory_len,
                                                   target_len=self.target_len,
                                                   batch_size=self.batch_size,
                                                   mask_index=0)
        #
        set_custom_objects()
        self.build_config(self.config_path)
        # load the vocabulary/tokenizer
        self.tokenizer = Tokenizer(self.spiece_model)
        # # inspect layers when debugging
        # self.model_layers = model.layers
        # len_layers = self.model_layers.__len__()
        # print(len_layers)
        num_hidden_layers = self.configs.get("n_layer", 12)

        layer_real = [i for i in range(num_hidden_layers)] + [-i for i in range(num_hidden_layers)]
        # quick sanity check on the requested layer indexes
        self.layer_indexes = [i if i in layer_real else -2 for i in self.layer_indexes]
        output_layer = "FeedForward-Normal-{0}"
        layer_dict = [model.get_layer(output_layer.format(i + 1)).get_output_at(node_index=0)
                          for i in range(num_hidden_layers)]

        # no layer index requested: use the model's own output
        if len(self.layer_indexes) == 0:
            encoder_layer = model.output
        # exactly one layer requested: fall back to the last entry if the index is invalid
        elif len(self.layer_indexes) == 1:
            if self.layer_indexes[0] in layer_real:
                encoder_layer = layer_dict[self.layer_indexes[0]]
            else:
                encoder_layer = layer_dict[-1]
        # otherwise collect the outputs of every requested layer and sum them element-wise
        else:
            # valid layer_indexes fall in [0, 1, ..., num_hidden_layers - 1] (or their negatives)
            all_layers = [layer_dict[lay] if lay in layer_real
                          else layer_dict[-1]  # invalid index: default to the last layer
                          for lay in self.layer_indexes]
            print(self.layer_indexes)
            print(all_layers)
            # sum the selected layer outputs element-wise
            encoder_layer = Add()(all_layers)
            print(encoder_layer.shape)

            # def xlnet_concat(x):
            #     x_concat = K.concatenate(x, axis=1)
            #     return x_concat
            # encoder_layer = Lambda(xlnet_concat, name='xlnet_concat')(all_layers)

        self.output = NonMaskingLayer()(encoder_layer)
        self.input = model.inputs
        self.model = Model(self.input, self.output)
        print("load KerasXlnetEmbedding end")
        model.summary(132)

        self.embedding_size = self.model.output_shape[-1]
        self.vocab_size = len(self.tokenizer.sp)
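
Unlike Examples #4 and #7, this version looks encoder blocks up by name (`FeedForward-Normal-N`) rather than by positional index, which survives changes in the layer count. A defensive variant of the same lookup (a sketch; the assert and variable names are illustrative):

# scan layer names instead of assuming the exact count, so a
# config/model mismatch fails loudly instead of silently misindexing
ff_layers = [l for l in model.layers if l.name.startswith('FeedForward-Normal-')]
assert len(ff_layers) == num_hidden_layers, 'layer count does not match config'
layer_dict = [l.get_output_at(0) for l in ff_layers]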
Example #7
    def build(self):
        from keras_xlnet import Tokenizer, ATTENTION_TYPE_BI, ATTENTION_TYPE_UNI
        from keras_xlnet import load_trained_model_from_checkpoint

        self.embedding_type = 'xlnet'
        self.checkpoint_path = os.path.join(self.corpus_path, 'xlnet_model.ckpt')
        self.config_path = os.path.join(self.corpus_path, 'xlnet_config.json')
        self.spiece_model = os.path.join(self.corpus_path, 'spiece.model')

        self.attention_type = self.xlnet_embed.get('attention_type', 'bi')  # or 'uni'
        self.attention_type = ATTENTION_TYPE_BI if self.attention_type == 'bi' else ATTENTION_TYPE_UNI
        self.memory_len = self.xlnet_embed.get('memory_len', 0)
        self.target_len = self.xlnet_embed.get('target_len', 5)
        print('load xlnet model start!')
        # load the model
        model = load_trained_model_from_checkpoint(checkpoint_path=self.checkpoint_path,
                                                   attention_type=self.attention_type,
                                                   in_train_phase=self.trainable,
                                                   config_path=self.config_path,
                                                   memory_len=self.memory_len,
                                                   target_len=self.target_len,
                                                   batch_size=self.batch_size,
                                                   mask_index=0)
        # load the vocabulary/tokenizer
        self.tokenizer = Tokenizer(self.spiece_model)
        # inspect layers when debugging
        self.model_layers = model.layers
        len_layers = len(self.model_layers)
        print(len_layers)
        len_couche = int((len_layers - 6) / 10)
        # 246 Keras layers in total: each encoder block contributes 10 layers
        # (MultiHeadAttention, Dropout, Add, LayerNormalization, ...), preceded
        # by 9 input/embedding layers; 24 encoder blocks overall
        layer_dict = [5]
        layer_0 = 6
        for i in range(len_couche):
            layer_0 = layer_0 + 10
            layer_dict.append(layer_0 - 2)
        # no layer index requested: use the model's own output
        if len(self.layer_indexes) == 0:
            encoder_layer = model.output
        # exactly one layer requested: fall back to the last entry if the index is invalid
        elif len(self.layer_indexes) == 1:
            # range check is 0-based to match layer_dict indexing (avoids an IndexError at len_couche + 1)
            if self.layer_indexes[0] in [i for i in range(len_couche + 1)]:
                encoder_layer = model.get_layer(index=layer_dict[self.layer_indexes[0]]).output
            else:
                encoder_layer = model.get_layer(index=layer_dict[-1]).output
        # otherwise collect the outputs of every requested layer and sum them element-wise
        else:
            # valid layer_indexes fall in [0, 1, 2, ..., 24]
            all_layers = [model.get_layer(index=layer_dict[lay]).output
                          if lay in [i for i in range(len_couche + 1)]  # 0-based, matching layer_dict
                          else model.get_layer(index=layer_dict[-1]).output  # invalid index: default to the last entry
                          for lay in self.layer_indexes]
            print(self.layer_indexes)
            print(all_layers)
            # sum the selected layer outputs element-wise
            encoder_layer = Add()(all_layers)
            print(encoder_layer.shape)
        self.output = NonMaskingLayer()(encoder_layer)
        self.input = model.inputs
        self.model = Model(model.inputs, self.output)
        print("load KerasXlnetEmbedding end")
        model.summary(132)

        self.embedding_size = self.model.output_shape[-1]
        self.vocab_size = len(self.tokenizer.sp)
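
The `xlnet_embed` settings consumed by the `get` calls above can be as small as the following (a hedged example; the keys match the code, the values are illustrative defaults):

xlnet_embed = {
    'attention_type': 'bi',   # or 'uni'
    'memory_len': 0,
    'target_len': 5,
}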