def test_load_ner(self):
    """Loading without a split yields aligned, non-empty token/label lists."""
    tokens, labels = load_ner_data_and_labels(self.test_file)
    # One label sequence per token sequence, and at least one sentence.
    assert len(tokens) == len(labels)
    assert len(tokens) > 0
    # Within a sentence: one label per token, and the sentence is non-empty.
    assert len(tokens[0]) == len(labels[0])
    assert len(tokens[0]) > 0
    # Sanity check: tokens and labels are genuinely different sequences.
    assert tokens[:5] != labels[:5]
def test_load_ner_split(self):
    """Loading with split=True yields aligned, non-empty, distinct train/test sets."""
    x_train, y_train, x_test, y_test = load_ner_data_and_labels(
        self.test_file, split=True)
    assert len(x_train) == len(y_train) and len(x_test) == len(y_test)
    # Fixed: the original asserted `len(x_train) > 0` twice; the second
    # clause must verify that the *test* split is non-empty.
    assert len(x_train) > 0 and len(x_test) > 0
    assert len(x_train[0]) == len(y_train[0]) and \
        len(x_test[0]) == len(y_test[0])
    assert len(x_train[0]) > 0 and len(x_test[0]) > 0
    # Tokens and labels differ, and the two splits are disjoint samples.
    assert x_train[:5] != y_train[:5] and x_test[:5] != y_test[:5]
    assert x_train[:5] != x_test[:5] and y_train[:5] != y_test[:5]
def setup_class(self):
    """Load a train/valid NER split and precompute checkpoint artifact paths."""
    (self.train_data, self.train_labels,
     self.valid_data, self.valid_labels) = load_ner_data_and_labels(
        self.test_file, split=True)
    self.checkpoint_dir = os.path.dirname(__file__)
    self.model_name = 'bilstm_cnn_ner'

    def _artifact(filename):
        # Path of a serialized model artifact inside the checkpoint directory.
        return os.path.join(self.checkpoint_dir, filename)

    self.json_file = _artifact('bilstm_cnn_ner.json')
    self.weights_file = _artifact('bilstm_cnn_ner.hdf5')
    self.swa_weights_file = _artifact('bilstm_cnn_ner_swa.hdf5')
    self.preprocessor_file = _artifact('bilstm_cnn_preprocessor.pkl')
def test_ner_generator(self):
    """NERGenerator yields full batches of 64, a remainder batch, and no labels."""
    test_file = os.path.join(os.path.dirname(__file__),
                             '../../../data/ner/msra/example.txt')
    x_train, y_train = load_ner_data_and_labels(test_file)
    preprocessor = NERPreprocessor(x_train, y_train)
    generator = NERGenerator(preprocessor, x_train, batch_size=64)

    num_batches = math.ceil(len(x_train) / 64)
    assert len(generator) == num_batches

    for batch_idx, (features, y) in enumerate(generator):
        # No labels were handed to the generator, so y is always None.
        assert y is None
        if batch_idx == num_batches - 1:
            # The last batch carries whatever is left over.
            assert features.shape[0] == len(x_train) - 64 * (num_batches - 1)
        else:
            assert features.shape[0] == 64
def setup_class(self):
    """Build a char+bert+word NERPreprocessor and mirror its embedding metadata."""
    x_train, y_train = load_ner_data_and_labels(self.test_file)
    self.preprocessor = NERPreprocessor(
        x_train, y_train,
        use_char=True,
        use_bert=True,
        use_word=True,
        bert_vocab_file=self.bert_vocab_file,
        char_embed_type='word2vec',
        word_embed_type='word2vec',
        max_len=16)
    self.num_class = self.preprocessor.num_class
    # Copy the preprocessor's char/word embedding attributes onto the test
    # class so individual tests can reference them directly.
    for prefix in ('char', 'word'):
        for suffix in ('embeddings', 'vocab_size', 'embed_dim'):
            attr = prefix + '_' + suffix
            setattr(self, attr, getattr(self.preprocessor, attr))
    self.checkpoint_dir = os.path.dirname(__file__)
def setup_class(self):
    """Build a small vocabulary from the first sentence of the test corpus.

    Token ids start after the reserved entries <PAD>=0 and <UNK>=1.
    Tokens are iterated in sorted order so that the id assigned to each
    token is reproducible across runs — iterating a bare ``set`` would
    assign ids in hash-randomized order (varies with PYTHONHASHSEED).
    """
    self.test_corpus, _ = load_ner_data_and_labels(self.test_file)
    self.test_vocab = {'<PAD>': 0, '<UNK>': 1}
    for token in sorted(set(self.test_corpus[0])):
        self.test_vocab[token] = len(self.test_vocab)
def setup_class(self):
    """Prepare data, preprocessor, twin BiLSTM-CNN NER models, and a trainer.

    Builds two structurally identical models: the working model and a
    shadow model used for SWA (stochastic weight averaging) checkpoints.
    """
    self.train_data, self.train_labels, self.valid_data, self.valid_labels = \
        load_ner_data_and_labels(self.test_file, split=True)
    self.preprocessor = NERPreprocessor(
        self.train_data + self.valid_data,
        self.train_labels + self.valid_labels,
        use_bert=True,
        use_word=True,
        bert_vocab_file=self.bert_vocab_file,
        char_embed_type='word2vec',
        word_embed_type='word2vec',
        max_len=16)
    self.num_class = self.preprocessor.num_class
    self.char_embeddings = self.preprocessor.char_embeddings
    self.char_vocab_size = self.preprocessor.char_vocab_size
    self.char_embed_dim = self.preprocessor.char_embed_dim
    self.word_embeddings = self.preprocessor.word_embeddings
    self.word_vocab_size = self.preprocessor.word_vocab_size
    self.word_embed_dim = self.preprocessor.word_embed_dim
    self.checkpoint_dir = os.path.dirname(__file__)

    def _build_ner_model():
        # Both models must share one architecture for SWA weight swapping;
        # build them from a single place so the configs cannot drift apart.
        return BiLSTMCNNNER(
            num_class=self.num_class,
            use_char=True,
            char_embeddings=self.char_embeddings,
            char_vocab_size=self.char_vocab_size,
            char_embed_dim=self.char_embed_dim,
            char_embed_trainable=False,
            use_bert=True,
            bert_config_file=self.bert_config_file,
            bert_checkpoint_file=self.bert_model_file,
            use_word=True,
            word_embeddings=self.word_embeddings,
            word_vocab_size=self.word_vocab_size,
            word_embed_dim=self.word_embed_dim,
            word_embed_trainable=False,
            max_len=self.preprocessor.max_len,
            use_crf=True).build_model()

    self.ner_model = _build_ner_model()
    self.swa_model = _build_ner_model()
    self.ner_trainer = NERTrainer(self.ner_model, self.preprocessor)
    self.json_file = os.path.join(self.checkpoint_dir, 'bilstm_cnn_ner.json')
    self.weights_file = os.path.join(self.checkpoint_dir, 'bilstm_cnn_ner.hdf5')
import os

from fancy_nlp.utils import load_ner_data_and_labels
from fancy_nlp.applications import NER

# Training script: BiLSTM-CNN-CRF NER model on the MSRA corpus.
msra_train_file = 'datasets/ner/msra/train_data'
msra_dev_file = 'datasets/ner/msra/test_data'
checkpoint_dir = 'pretrained_models'
model_name = 'msra_ner_bilstm_cnn_crf'

# exist_ok=True replaces the race-prone `os.path.exists` check-then-create.
os.makedirs(checkpoint_dir, exist_ok=True)

train_data, train_labels = load_ner_data_and_labels(msra_train_file)
dev_data, dev_labels = load_ner_data_and_labels(msra_dev_file)

ner = NER(use_pretrained=False)
ner.fit(train_data, train_labels, dev_data, dev_labels,
        ner_model_type='bilstm_cnn',
        char_embed_trainable=True,
        # Save best weights, stop early, and average weights with SWA;
        # load the SWA-averaged model when training finishes.
        callback_list=['modelcheckpoint', 'earlystopping', 'swa'],
        checkpoint_dir=checkpoint_dir,
        model_name=model_name,
        load_swa_model=True)
import os import tensorflow as tf from fancy_nlp.utils import load_ner_data_and_labels from fancy_nlp.applications import NER msra_train_file = 'datasets/ner/msra/train_data' msra_dev_file = 'datasets/ner/msra/test_data' checkpoint_dir = 'pretrained_models' model_name = 'msra_ner_bert_crf' if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) train_data, train_labels = load_ner_data_and_labels(msra_train_file) valid_data, valid_labels = load_ner_data_and_labels(msra_dev_file) ner = NER(use_pretrained=False) ner.fit(train_data, train_labels, valid_data, valid_labels, ner_model_type='bert', use_char=False, use_word=False, use_bert=True, # 传入bert模型各文件的路径 bert_vocab_file='pretrained_embeddings/chinese_L-12_H-768_A-12/vocab.txt', bert_config_file='pretrained_embeddings/chinese_L-12_H-768_A-12/bert_config.json', bert_checkpoint_file='pretrained_embeddings/chinese_L-12_H-768_A-12/bert_model.ckpt', bert_trainable=True, optimizer=tf.keras.optimizers.Adam(1e-5), callback_list=['modelcheckpoint', 'earlystopping', 'swa'],