Example #1: two revisions of the build() method of the albert embedding in keras_textclassification; the first hard-codes the layer mapping, the second derives it from the checkpoint config.
    def build(self):
        from keras_textclassification.keras_layers.albert.albert import load_brightmart_albert_zh_checkpoint
        from keras_textclassification.keras_layers.non_mask_layer import NonMaskingLayer  # assumed location of NonMaskingLayer in this repo
        from keras.layers import Add
        from keras.models import Model
        import keras_bert
        import codecs
        import os

        self.embedding_type = 'albert'
        dict_path = os.path.join(self.corpus_path, 'vocab.txt')
        print('load bert model start!')
        # sanity-check the requested layer indexes (note: this filter uses 0-based/negative
        # values, while the selection below expects 1..13; invalid values simply fall
        # back to the last layer there)
        self.layer_indexes = [i if i in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -2] else -1 for i in self.layer_indexes]
        self.model = load_brightmart_albert_zh_checkpoint(self.corpus_path,
                                                          training=self.trainable,
                                                          seq_len=self.len_max,
                                                          output_layers=self.layer_indexes)
        self.input = self.model.inputs
        self.output = self.model.outputs[0]

        # model_l = model.layers
        print('load bert model end!')
        # Keras graph indices of the selectable albert outputs
        # (hard-coded for a 12-layer albert_base checkpoint)
        layer_dict = [8, 13]
        layer_0 = 13
        for i in range(10):
            layer_0 = layer_0 + 2
            layer_dict.append(layer_0)
        layer_dict.append(36)
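        # layer_dict == [8, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 36]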
        print(layer_dict)
        # no indexes given: output the model as-is
        if len(self.layer_indexes) == 0:
            encoder_layer = self.model.output
        # a single index: take that layer's output; if the index is invalid, default to the last layer
        elif len(self.layer_indexes) == 1:
            if self.layer_indexes[0] in [i + 1 for i in range(13)]:
                encoder_layer = self.model.get_layer(index=layer_dict[self.layer_indexes[0] - 1]).output
            else:
                encoder_layer = self.model.get_layer(index=layer_dict[-1]).output
        # otherwise gather the outputs of all requested layers and merge them with Add() (each output has hidden size 768)
        else:
            # each entry of layer_indexes must be in 1..13; invalid entries default to the last layer
            all_layers = [self.model.get_layer(index=layer_dict[lay - 1]).output
                          if lay in [i + 1 for i in range(13)]
                          else self.model.get_layer(index=layer_dict[-1]).output
                          for lay in self.layer_indexes]
            encoder_layer = Add()(all_layers)
        self.output = NonMaskingLayer()(encoder_layer)
        self.input = self.model.inputs
        self.model = Model(self.input, self.output)

        # self.embedding_size = self.model.output_shape[-1]

        # read the vocab file and build the tokenizer
        self.token_dict = {}
        with codecs.open(dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)
        self.vocab_size = len(self.token_dict)
        self.tokenizer = keras_bert.Tokenizer(self.token_dict)
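A minimal, dependency-free sketch of the layer lookup above, useful for checking the index arithmetic in isolation. The name resolve_layer_indices is illustrative and not part of the original class; it assumes the hard-coded 12-layer albert_base graph of this first version.

def resolve_layer_indices(layer_indexes):
    # same construction as in build(): 13 selectable outputs
    layer_dict = [8, 13]
    layer_0 = 13
    for _ in range(10):
        layer_0 += 2
        layer_dict.append(layer_0)
    layer_dict.append(36)
    valid = [i + 1 for i in range(13)]  # user-facing indexes 1..13
    # invalid entries fall back to the last output, exactly as in build()
    return [layer_dict[lay - 1] if lay in valid else layer_dict[-1]
            for lay in layer_indexes]

print(resolve_layer_indices([1, 13]))  # -> [8, 36]
print(resolve_layer_indices([99]))     # -> [36] (fallback)

The revised build() below derives the same kind of mapping from the checkpoint's albert_config*.json instead of hard-coding a 12-layer graph, and switches to 0-based/negative layer indexes.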
    def build(self):
        from keras_textclassification.keras_layers.albert.albert import load_brightmart_albert_zh_checkpoint
        from keras_textclassification.keras_layers.non_mask_layer import NonMaskingLayer  # assumed location of NonMaskingLayer in this repo
        from keras.layers import Add
        from keras.models import Model
        import keras_bert
        import codecs
        import os

        self.embedding_type = 'albert'
        dict_path = os.path.join(self.corpus_path, 'vocab.txt')
        print('load bert model start!')
        # the sanity check of layer_indexes happens below, once layer_real is known
        self.model = load_brightmart_albert_zh_checkpoint(
            self.corpus_path,
            training=self.trainable,
            seq_len=self.len_max,
            output_layers=None)  # take the full graph; layers are selected below
        import json
        config = {}
        for file_name in os.listdir(self.corpus_path):
            if file_name.startswith('albert_config'):  # e.g. albert_config_base.json
                with open(os.path.join(self.corpus_path, file_name)) as reader:
                    config = json.load(reader)
                break

        num_hidden_layers = config.get("num_hidden_layers", 0)
        # valid indexes: 0..n-1 from the front, -1..-n from the back
        layer_real = ([i for i in range(num_hidden_layers)]
                      + [-(i + 1) for i in range(num_hidden_layers)])
        # sanity-check the requested indexes; invalid entries fall back to -2
        self.layer_indexes = [i if i in layer_real else -2 for i in self.layer_indexes]

        # self.input = self.model.inputs
        # self.output = self.model.outputs[0]
        model_l = self.model.layers  # full layer list, handy when checking layer_dict
        print('load bert model end!')
        # Keras graph indices of the selectable albert outputs, derived from the config
        layer_dict = [4, 8, 11, 13]
        layer_0 = 13
        for i in range(num_hidden_layers):
            layer_0 = layer_0 + 1
            layer_dict.append(layer_0)
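        # with num_hidden_layers == 12: layer_dict == [4, 8, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]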
        print(layer_dict)
        # no indexes given: output the model as-is
        if len(self.layer_indexes) == 0:
            encoder_layer = self.model.output
        # a single index: take that layer's output; if the index is invalid, default to the last layer
        elif len(self.layer_indexes) == 1:
            if self.layer_indexes[0] in layer_real:
                encoder_layer = self.model.get_layer(
                    index=layer_dict[self.layer_indexes[0]]).output
            else:
                encoder_layer = self.model.get_layer(
                    index=layer_dict[-2]).output
        # otherwise gather the outputs of all requested layers and merge them with Add() (each output has hidden size 768)
        else:
            # each entry of layer_indexes must be in layer_real; invalid entries default to layer_dict[-2]
            all_layers = [self.model.get_layer(index=layer_dict[lay]).output
                          if lay in layer_real
                          else self.model.get_layer(index=layer_dict[-2]).output
                          for lay in self.layer_indexes]
            encoder_layer = Add()(all_layers)
        self.output = NonMaskingLayer()(encoder_layer)
        self.input = self.model.inputs
        self.model = Model(self.input, self.output)

        # self.embedding_size = self.model.output_shape[-1]

        # read the vocab file and build the tokenizer
        self.token_dict = {}
        with codecs.open(dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)
        self.vocab_size = len(self.token_dict)
        self.tokenizer = keras_bert.Tokenizer(self.token_dict)
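The config-driven counterpart of the earlier sketch, again dependency-free; resolve_layer_indices is an illustrative name, and num_hidden_layers stands in for the value read from albert_config*.json above.

def resolve_layer_indices(layer_indexes, num_hidden_layers=12):
    # same construction as in the revised build()
    layer_dict = [4, 8, 11, 13]
    layer_0 = 13
    for _ in range(num_hidden_layers):
        layer_0 += 1
        layer_dict.append(layer_0)
    # valid indexes: 0..n-1 from the front, -1..-n from the back
    layer_real = ([i for i in range(num_hidden_layers)]
                  + [-(i + 1) for i in range(num_hidden_layers)])
    # invalid entries fall back to layer_dict[-2], exactly as in build()
    return [layer_dict[lay] if lay in layer_real else layer_dict[-2]
            for lay in layer_indexes]

print(resolve_layer_indices([0, -1]))  # -> [4, 25]
print(resolve_layer_indices([99]))     # -> [24] (fallback)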