import logging
import unittest

# Project-local imports; exact module paths are assumed from context.
from kashgari.embeddings import CustomEmbedding
from kashgari import macros as k

# Module-level constant referenced but not defined in the original snippet;
# the concrete value here is an assumption.
SEQUENCE_LENGTH = 30


class CustomEmbeddingsTest(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        super(CustomEmbeddingsTest, self).__init__(*args, **kwargs)
        self.embedding = CustomEmbedding('empty_embedding',
                                         sequence_length=SEQUENCE_LENGTH,
                                         embedding_size=100)

    def test_build_word2idx(self):
        corpus = [
            ['我', '们', '变', '而', '以', '书', '会', '友', '，', '以', '书', '结',
             '缘', '，', '把', '欧', '美', '、', '港', '台', '流', '行', '的', '食',
             '品', '类', '图', '谱', '、', '画', '册', '、', '工', '具', '书', '汇',
             '集', '一', '堂', '。'],
            ['为', '了', '跟', '踪', '国', '际', '最', '新', '食', '品', '工', '艺',
             '、', '流', '行', '趋', '势', '，', '大', '量', '搜', '集', '海', '外',
             '专', '业', '书', '刊', '资', '料', '是', '提', '高', '技', '艺', '的',
             '捷', '径', '。'],
            ['其', '中', '线', '装', '古', '籍', '逾', '千', '册', '；', '民', '国',
             '出', '版', '物', '几', '百', '种', '；', '珍', '本', '四', '册', '、',
             '稀', '见', '本', '四', '百', '余', '册', '，', '出', '版', '时', '间',
             '跨', '越', '三', '百', '余', '年', '。'],
            ['有', '的', '古', '木', '交', '柯', '，', '春', '机', '荣', '欣', '，',
             '从', '诗', '人', '句', '中', '得', '之', '，', '而', '入', '画', '中',
             '，', '观', '之', '令', '人', '心', '驰', '。', '我'],
        ]
        self.embedding.build_token2idx_dict(x_data=corpus, min_count=2)

    def test_build(self):
        self.test_build_word2idx()
        self.assertEqual(self.embedding.token_count, 33)
        self.assertTrue(all(isinstance(x, str)
                            for x in self.embedding.token2idx.keys()))
        self.assertTrue(all(isinstance(x, int)
                            for x in self.embedding.token2idx.values()))
        # The first four indices are reserved for the special tokens.
        self.assertEqual(self.embedding.idx2token[0], k.PAD)
        self.assertEqual(self.embedding.idx2token[1], k.BOS)
        self.assertEqual(self.embedding.idx2token[2], k.EOS)
        self.assertEqual(self.embedding.idx2token[3], k.UNK)

    def test_tokenize(self):
        self.test_build_word2idx()
        sentence = ['我', '想', '看', '电影', '%%##!$#%']
        tokens = self.embedding.tokenize(sentence)
        logging.info('tokenize test: {} -> {}'.format(sentence, tokens))
        # BOS and EOS are prepended/appended, hence the +2.
        self.assertEqual(len(tokens), len(sentence) + 2)
        # Index 3 is UNK: the garbage token maps to it.
        self.assertEqual(tokens[-2], 3, msg='check unk value')
        token_list = self.embedding.tokenize([sentence])
        self.assertEqual(len(token_list[0]), len(sentence) + 2)

    def test_embed(self):
        self.test_build_word2idx()
        sentence = ['我', '想', '看', '电影', '%%##!$#%']
        embedded_sentence = self.embedding.embed(sentence)
        logging.info('embed test: {} -> {}'.format(sentence, embedded_sentence))
        self.assertEqual(embedded_sentence.shape,
                         (SEQUENCE_LENGTH, self.embedding.embedding_size))
        embedded_sentences = self.embedding.embed([sentence])
        self.assertEqual(embedded_sentences.shape,
                         (1, SEQUENCE_LENGTH, self.embedding.embedding_size))
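# Minimal runner so the tests above can be executed directly; under pytest
# or `python -m unittest` discovery this guard is simply ignored.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    unittest.main()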
import json
import logging
import os
import pathlib
import random
from typing import Dict, List, Tuple, Union

import keras
import numpy as np
from keras.models import Model
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
# Metrics that accept lists of label sequences; seqeval is an assumption,
# since the original imports are not shown.
from seqeval.metrics import classification_report, f1_score, recall_score

# Project-local imports; exact module paths are assumed from context.
from kashgari import macros as k
from kashgari.embeddings import BaseEmbedding, CustomEmbedding
from kashgari.utils import helper


class SequenceLabelingModel(object):
    __base_hyper_parameters__ = {}

    @property
    def hyper_parameters(self):
        return self._hyper_parameters_

    def __init__(self,
                 embedding: BaseEmbedding = None,
                 hyper_parameters: Dict = None):
        if embedding is None:
            self.embedding = CustomEmbedding('custom',
                                             sequence_length=0,
                                             embedding_size=100)
        else:
            self.embedding = embedding
        self.model: Model = None
        self._hyper_parameters_ = self.__base_hyper_parameters__.copy()
        self._label2idx = {}
        self._idx2label = {}
        if hyper_parameters:
            self._hyper_parameters_.update(hyper_parameters)
        self.model_info = {}

    @property
    def label2idx(self) -> Dict[str, int]:
        return self._label2idx

    @property
    def token2idx(self) -> Dict[str, int]:
        return self.embedding.token2idx

    @label2idx.setter
    def label2idx(self, value):
        self._label2idx = value
        self._idx2label = {val: key for key, val in value.items()}

    def build_model(self, loss_f=None, optimizer=None, metrics=None, **kwargs):
        """
        Build the keras model and assign it to self.model.
        Must be implemented by concrete subclasses.
        """
        raise NotImplementedError()

    def build_token2id_label2id_dict(self,
                                     x_train: List[List[str]],
                                     y_train: List[List[str]],
                                     x_validate: List[List[str]] = None,
                                     y_validate: List[List[str]] = None):
        x_data = x_train
        y_data = y_train
        if x_validate:
            # concatenate instead of += so the caller's lists are not mutated
            x_data = x_train + x_validate
            y_data = y_train + y_validate
        self.embedding.build_token2idx_dict(x_data, 3)

        label_set = []
        for seq in y_data:
            for y in seq:
                if y not in label_set:
                    label_set.append(y)

        label2idx = {
            k.PAD: 0,
            k.BOS: 1,
            k.EOS: 2
        }
        label_set = [i for i in label_set if i not in label2idx]
        for label in label_set:
            label2idx[label] = len(label2idx)
        self.label2idx = label2idx

    def convert_labels_to_idx(self,
                              label: Union[List[List[str]], List[str]],
                              add_eos_bos: bool = True
                              ) -> Union[List[List[int]], List[int]]:

        def tokenize_tokens(seq: List[str]):
            tokens = [self._label2idx[i] for i in seq]
            if add_eos_bos:
                tokens = [self._label2idx[k.BOS]] + tokens + \
                         [self._label2idx[k.EOS]]
            return tokens

        if isinstance(label[0], str):
            return tokenize_tokens(label)
        else:
            return [tokenize_tokens(seq) for seq in label]

    def convert_idx_to_labels(self,
                              idx: Union[List[List[int]], List[int]],
                              tokens_length: Union[List[int], int],
                              remove_eos_bos: bool = True
                              ) -> Union[List[str], str]:

        def reverse_tokenize_tokens(idx_item, seq_length):
            if remove_eos_bos:
                # skip the leading BOS and cut off padding after seq_length
                seq = idx_item[1:1 + seq_length]
            else:
                seq = idx_item
            return [self._idx2label[i] for i in seq]

        if isinstance(idx[0], int):
            return reverse_tokenize_tokens(idx, tokens_length)
        else:
            labels = []
            for index in range(len(idx)):
                idx_item = idx[index]
                seq_length = tokens_length[index]
                labels.append(reverse_tokenize_tokens(idx_item, seq_length))
            return labels

    def get_data_generator(self,
                           x_data: List[List[str]],
                           y_data: List[List[str]],
                           batch_size: int = 64,
                           is_bert: bool = False):
        while True:
            page_list = list(range(len(x_data) // batch_size + 1))
            random.shuffle(page_list)
            for page in page_list:
                start_index = page * batch_size
                end_index = start_index + batch_size
                target_x = x_data[start_index:end_index]
                target_y = y_data[start_index:end_index]
                if len(target_x) == 0:
                    # the last page may be empty when len(x_data) is a
                    # multiple of batch_size; fall back to the first page
                    target_x = x_data[0:batch_size]
                    target_y = y_data[0:batch_size]

                tokenized_x = self.embedding.tokenize(target_x)
                tokenized_y = self.convert_labels_to_idx(target_y)

                padded_x = sequence.pad_sequences(
                    tokenized_x,
                    maxlen=self.embedding.sequence_length,
                    padding='post')
                padded_y = sequence.pad_sequences(
                    tokenized_y,
                    maxlen=self.embedding.sequence_length,
                    padding='post')
                one_hot_y = to_categorical(padded_y,
                                           num_classes=len(self.label2idx))

                if is_bert:
                    # BERT takes a second input of segment ids; all zeros
                    # for single-segment input
                    padded_x_seg = np.zeros(
                        shape=(len(padded_x), self.embedding.sequence_length))
                    x_input_data = [padded_x, padded_x_seg]
                else:
                    x_input_data = padded_x
                yield (x_input_data, one_hot_y)

    def fit(self,
            x_train: List[List[str]],
            y_train: List[List[str]],
            x_validate: List[List[str]] = None,
            y_validate: List[List[str]] = None,
            batch_size: int = 64,
            epochs: int = 5,
            labels_weight: bool = None,
            default_labels_weight: float = 50.0,
            fit_kwargs: Dict = None,
            **kwargs):
        """
        :param x_train: list of training data.
        :param y_train: list of training target label data.
        :param x_validate: list of validation data.
        :param y_validate: list of validation target label data.
        :param batch_size: batch size for trainer model
        :param epochs: Number of epochs to train the model.
        :param labels_weight: set class weights for imbalanced classes
        :param default_labels_weight: default weight for labels not in
               labels_weight dict
        :param fit_kwargs: additional kwargs to be passed to
               :func:`~keras.models.Model.fit`
        :return:
        """
        assert len(x_train) == len(y_train)
        self.build_token2id_label2id_dict(x_train, y_train,
                                          x_validate, y_validate)

        if len(x_train) < batch_size:
            batch_size = len(x_train) // 2

        if not self.model:
            if self.embedding.sequence_length == 0:
                # default to the 95th percentile of training sequence lengths
                self.embedding.sequence_length = sorted(
                    [len(y) for y in y_train])[int(0.95 * len(y_train))]
                logging.info('sequence length set to {}'.format(
                    self.embedding.sequence_length))
            if labels_weight:
                weights = []
                initial_weights = {
                    k.PAD: 1,
                    k.BOS: 1,
                    k.EOS: 1,
                    'O': 1
                }
                for label in self.label2idx.keys():
                    weights.append(
                        initial_weights.get(label, default_labels_weight))
                loss_f = helper.weighted_categorical_crossentropy(
                    np.array(weights))
                # the 'name' key must match the check in
                # create_custom_objects() so the model can be reloaded
                self.model_info['loss'] = {
                    'name': 'weighted_categorical_crossentropy',
                    'weights': weights
                }
                self.build_model(loss_f=loss_f,
                                 metrics=['categorical_accuracy', 'acc'])
            else:
                self.build_model()

        train_generator = self.get_data_generator(
            x_train, y_train, batch_size, is_bert=self.embedding.is_bert)

        if fit_kwargs is None:
            fit_kwargs = {}

        if x_validate:
            validation_generator = self.get_data_generator(
                x_validate, y_validate, batch_size,
                is_bert=self.embedding.is_bert)
            fit_kwargs['validation_data'] = validation_generator
            fit_kwargs['validation_steps'] = len(x_validate) // batch_size

        self.model.fit_generator(train_generator,
                                 steps_per_epoch=len(x_train) // batch_size,
                                 epochs=epochs,
                                 **fit_kwargs)

    def predict(self,
                sentence: Union[List[str], List[List[str]]],
                batch_size=None):
        tokens = self.embedding.tokenize(sentence)
        is_list = not isinstance(sentence[0], str)
        if is_list:
            seq_length = [len(item) for item in sentence]
            padded_tokens = sequence.pad_sequences(
                tokens,
                maxlen=self.embedding.sequence_length,
                padding='post')
        else:
            seq_length = [len(sentence)]
            padded_tokens = sequence.pad_sequences(
                [tokens],
                maxlen=self.embedding.sequence_length,
                padding='post')
        if self.embedding.is_bert:
            x = [
                padded_tokens,
                np.zeros(shape=(len(padded_tokens),
                                self.embedding.sequence_length))
            ]
        else:
            x = padded_tokens
        predict_result = self.model.predict(x, batch_size=batch_size).argmax(-1)
        labels = self.convert_idx_to_labels(predict_result, seq_length)

        if is_list:
            return labels
        else:
            return labels[0]

    def evaluate(self, x_data, y_data,
                 batch_size=None) -> Tuple[float, float, str]:
        y_pred = self.predict(x_data, batch_size=batch_size)
        weighted_f1 = f1_score(y_data, y_pred)
        weighted_recall = recall_score(y_data, y_pred)
        report = classification_report(y_data, y_pred)
        print(report)
        return weighted_f1, weighted_recall, report

    def save(self, model_path: str):
        pathlib.Path(model_path).mkdir(exist_ok=True, parents=True)

        with open(os.path.join(model_path, 'labels.json'), 'w',
                  encoding='utf-8') as f:
            f.write(json.dumps(self.label2idx, indent=2, ensure_ascii=False))

        with open(os.path.join(model_path, 'words.json'), 'w',
                  encoding='utf-8') as f:
            f.write(json.dumps(self.embedding.token2idx,
                               indent=2,
                               ensure_ascii=False))

        with open(os.path.join(model_path, 'model.json'), 'w',
                  encoding='utf-8') as f:
            f.write(json.dumps(self.model_info, indent=2, ensure_ascii=False))

        self.model.save(os.path.join(model_path, 'model.model'))
        logging.info('model saved to {}'.format(os.path.abspath(model_path)))

    @staticmethod
    def create_custom_objects(model_info):
        custom_objects = {}
        loss = model_info.get('loss')
        if loss and loss['name'] == 'weighted_categorical_crossentropy':
            loss_f = helper.weighted_categorical_crossentropy(
                np.array(loss['weights']))
            custom_objects['loss'] = loss_f

        if loss and loss['name'] == 'crf':
            custom_objects['CRF'] = CRF
            custom_objects['crf_loss'] = crf_loss
            # a throwaway CRF instance, used only to obtain the bound metric
            custom_objects['crf_viterbi_accuracy'] = CRF(128).accuracy
        return custom_objects

    @staticmethod
    def load_model(model_path: str):
        with open(os.path.join(model_path, 'labels.json'), 'r',
                  encoding='utf-8') as f:
            label2idx = json.load(f)

        with open(os.path.join(model_path, 'words.json'), 'r',
                  encoding='utf-8') as f:
            token2idx = json.load(f)

        with open(os.path.join(model_path, 'model.json'), 'r',
                  encoding='utf-8') as f:
            model_info = json.load(f)

        agent = SequenceLabelingModel()
        custom_objects = SequenceLabelingModel.create_custom_objects(model_info)
        if custom_objects:
            logging.debug('prepared custom objects: {}'.format(custom_objects))

        agent.model = keras.models.load_model(
            os.path.join(model_path, 'model.model'),
            custom_objects=custom_objects)
        agent.model.summary()
        agent.embedding.sequence_length = agent.model.input_shape[-1]
        agent.label2idx = label2idx
        agent.embedding.token2idx = token2idx
        logging.info('loaded model from {}'.format(
            os.path.abspath(model_path)))
        return agent
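# ---------------------------------------------------------------------------
# Usage sketch, not part of the library. Everything below is illustrative:
# `DemoModel`, the toy corpus, and the BiLSTM head are assumptions, as is the
# idea that `self.embedding.model` exposes a Keras model whose inputs/output
# a subclass builds on when implementing build_model().
# ---------------------------------------------------------------------------
from keras.layers import Bidirectional, Dense, LSTM


class DemoModel(SequenceLabelingModel):
    """Minimal concrete subclass showing the build_model() contract."""

    def build_model(self, loss_f=None, optimizer=None, metrics=None, **kwargs):
        base_model = self.embedding.model  # assumed embedding-level Keras model
        blstm = Bidirectional(LSTM(64, return_sequences=True))(base_model.output)
        dense = Dense(len(self.label2idx), activation='softmax')(blstm)
        self.model = Model(base_model.inputs, dense)
        self.model.compile(loss=loss_f or 'categorical_crossentropy',
                           optimizer=optimizer or 'adam',
                           metrics=metrics or ['acc'])


if __name__ == '__main__':
    # Toy NER-style corpus: tokenized sentences with per-token labels.
    x = [['I', 'live', 'in', 'Beijing'], ['John', 'works', 'here']] * 32
    y = [['O', 'O', 'O', 'LOC'], ['PER', 'O', 'O']] * 32

    model = DemoModel()
    model.fit(x, y, batch_size=8, epochs=1)
    print(model.predict(['I', 'live', 'in', 'Beijing']))

    model.save('./demo_model')
    loaded = SequenceLabelingModel.load_model('./demo_model')
    print(loaded.predict(['John', 'works', 'here']))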