Example #1
 def test_load_ner(self):
     x_train, y_train = load_ner_data_and_labels(self.test_file)
     assert len(x_train) == len(y_train)
     assert len(x_train) > 0
     assert len(x_train[0]) == len(y_train[0])
     assert len(x_train[0]) > 0
     assert x_train[:5] != y_train[:5]
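For context, here is a minimal sketch of the input these tests assume. It is an assumption based on the CoNLL-style, one-character-per-line layout of the MSRA corpus used in the later examples, not a documented contract of load_ner_data_and_labels; the file name example.txt, the tab delimiter, and the expected output in the comments are all hypothetical:

# Hypothetical CoNLL-style input: one character and its tag per line,
# blank line between sentences (delimiter assumed to be a tab).
sample = '中\tB-ORG\n国\tI-ORG\n很\tO\n大\tO\n\n好\tO\n'
with open('example.txt', 'w', encoding='utf-8') as f:
    f.write(sample)

x, y = load_ner_data_and_labels('example.txt')
# Expected, assuming the format above is right:
# x == [['中', '国', '很', '大'], ['好']]
# y == [['B-ORG', 'I-ORG', 'O', 'O'], ['O']]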
Example #2
 def test_load_ner_split(self):
     x_train, y_train, x_test, y_test = load_ner_data_and_labels(
         self.test_file, split=True)
     assert len(x_train) == len(y_train) and len(x_test) == len(y_test)
     assert len(x_train) > 0 and len(x_test) > 0
     assert len(x_train[0]) == len(y_train[0]) and len(x_test[0]) == len(
         y_test[0])
     assert len(x_train[0]) > 0 and len(x_test[0]) > 0
     assert x_train[:5] != y_train[:5] and x_test[:5] != y_test[:5]
     assert x_train[:5] != x_test[:5] and y_train[:5] != y_test[:5]
Example #3
    def setup_class(self):
        self.train_data, self.train_labels, self.valid_data, self.valid_labels = \
            load_ner_data_and_labels(self.test_file, split=True)

        self.checkpoint_dir = os.path.dirname(__file__)
        self.model_name = 'bilstm_cnn_ner'
        self.json_file = os.path.join(self.checkpoint_dir,
                                      'bilstm_cnn_ner.json')
        self.weights_file = os.path.join(self.checkpoint_dir,
                                         'bilstm_cnn_ner.hdf5')
        self.swa_weights_file = os.path.join(self.checkpoint_dir,
                                             'bilstm_cnn_ner_swa.hdf5')
        self.preprocessor_file = os.path.join(self.checkpoint_dir,
                                              'bilstm_cnn_preprocessor.pkl')
Example #4
    def test_ner_generator(self):
        test_file = os.path.join(os.path.dirname(__file__), '../../../data/ner/msra/example.txt')
        x_train, y_train = load_ner_data_and_labels(test_file)

        preprocessor = NERPreprocessor(x_train, y_train)
        generator = NERGenerator(preprocessor, x_train, batch_size=64)
        assert len(generator) == math.ceil(len(x_train) / 64)
        for i, (features, y) in enumerate(generator):
            if i < len(generator) - 1:
                assert features.shape[0] == 64
                assert y is None
            else:
                assert features.shape[0] == len(x_train) - 64 * (len(generator) - 1)
                assert y is None
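The length assertions above are plain ceiling-division arithmetic; a standalone sanity check with hypothetical sizes (no fancy_nlp required):

import math

n_samples, batch_size = 100, 64                         # hypothetical sizes
n_batches = math.ceil(n_samples / batch_size)           # 100 / 64 rounds up to 2
assert n_batches == 2
last_batch = n_samples - batch_size * (n_batches - 1)   # whatever is left over
assert last_batch == 36                                 # the final, smaller batch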
Example #5
    def setup_class(self):
        x_train, y_train = load_ner_data_and_labels(self.test_file)
        self.preprocessor = NERPreprocessor(
            x_train,
            y_train,
            use_char=True,
            use_bert=True,
            use_word=True,
            bert_vocab_file=self.bert_vocab_file,
            char_embed_type='word2vec',
            word_embed_type='word2vec',
            max_len=16)
        self.num_class = self.preprocessor.num_class
        self.char_embeddings = self.preprocessor.char_embeddings
        self.char_vocab_size = self.preprocessor.char_vocab_size
        self.char_embed_dim = self.preprocessor.char_embed_dim

        self.word_embeddings = self.preprocessor.word_embeddings
        self.word_vocab_size = self.preprocessor.word_vocab_size
        self.word_embed_dim = self.preprocessor.word_embed_dim
        self.checkpoint_dir = os.path.dirname(__file__)
Example #6
 def setup_class(self):
     self.test_corpus, _ = load_ner_data_and_labels(self.test_file)
     self.test_vocab = {'<PAD>': 0, '<UNK>': 1}
     for token in set(self.test_corpus[0]):
         self.test_vocab[token] = len(self.test_vocab)
Example #7
    def setup_class(self):
        self.train_data, self.train_labels, self.valid_data, self.valid_labels = \
            load_ner_data_and_labels(self.test_file, split=True)
        self.preprocessor = NERPreprocessor(
            self.train_data + self.valid_data,
            self.train_labels + self.valid_labels,
            use_bert=True,
            use_word=True,
            bert_vocab_file=self.bert_vocab_file,
            char_embed_type='word2vec',
            word_embed_type='word2vec',
            max_len=16)
        self.num_class = self.preprocessor.num_class
        self.char_embeddings = self.preprocessor.char_embeddings
        self.char_vocab_size = self.preprocessor.char_vocab_size
        self.char_embed_dim = self.preprocessor.char_embed_dim

        self.word_embeddings = self.preprocessor.word_embeddings
        self.word_vocab_size = self.preprocessor.word_vocab_size
        self.word_embed_dim = self.preprocessor.word_embed_dim
        self.checkpoint_dir = os.path.dirname(__file__)

        self.ner_model = BiLSTMCNNNER(
            num_class=self.num_class,
            use_char=True,
            char_embeddings=self.char_embeddings,
            char_vocab_size=self.char_vocab_size,
            char_embed_dim=self.char_embed_dim,
            char_embed_trainable=False,
            use_bert=True,
            bert_config_file=self.bert_config_file,
            bert_checkpoint_file=self.bert_model_file,
            use_word=True,
            word_embeddings=self.word_embeddings,
            word_vocab_size=self.word_vocab_size,
            word_embed_dim=self.word_embed_dim,
            word_embed_trainable=False,
            max_len=self.preprocessor.max_len,
            use_crf=True).build_model()

        # a second, identically configured model that will hold the SWA-averaged weights
        self.swa_model = BiLSTMCNNNER(
            num_class=self.num_class,
            use_char=True,
            char_embeddings=self.char_embeddings,
            char_vocab_size=self.char_vocab_size,
            char_embed_dim=self.char_embed_dim,
            char_embed_trainable=False,
            use_bert=True,
            bert_config_file=self.bert_config_file,
            bert_checkpoint_file=self.bert_model_file,
            use_word=True,
            word_embeddings=self.word_embeddings,
            word_vocab_size=self.word_vocab_size,
            word_embed_dim=self.word_embed_dim,
            word_embed_trainable=False,
            max_len=self.preprocessor.max_len,
            use_crf=True).build_model()

        self.ner_trainer = NERTrainer(self.ner_model, self.preprocessor)

        self.json_file = os.path.join(self.checkpoint_dir,
                                      'bilstm_cnn_ner.json')
        self.weights_file = os.path.join(self.checkpoint_dir,
                                         'bilstm_cnn_ner.hdf5')
Example #8
import os

from fancy_nlp.utils import load_ner_data_and_labels
from fancy_nlp.applications import NER

msra_train_file = 'datasets/ner/msra/train_data'
msra_dev_file = 'datasets/ner/msra/test_data'

checkpoint_dir = 'pretrained_models'
model_name = 'msra_ner_bilstm_cnn_crf'

if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

train_data, train_labels = load_ner_data_and_labels(msra_train_file)
dev_data, dev_labels = load_ner_data_and_labels(msra_dev_file)

ner = NER(use_pretrained=False)

ner.fit(train_data,
        train_labels,
        dev_data,
        dev_labels,
        ner_model_type='bilstm_cnn',
        char_embed_trainable=True,
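        # checkpoint the best model, stop early on plateau, and apply stochastic weight averaging (SWA)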
        callback_list=['modelcheckpoint', 'earlystopping', 'swa'],
        checkpoint_dir=checkpoint_dir,
        model_name=model_name,
        load_swa_model=True)
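Once fit returns, the trained tagger can be evaluated and applied directly. A sketch, assuming the NER application exposes score and analyze as shown in fancy_nlp's README; treat the method names as assumptions if your version differs:

# Assumed API: score reports entity-level metrics on a labeled set,
# analyze tags a raw string and returns the recognized entities.
print(ner.score(dev_data, dev_labels))
print(ner.analyze('同济大学位于上海市杨浦区'))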
Example #9
import os
import tensorflow as tf

from fancy_nlp.utils import load_ner_data_and_labels
from fancy_nlp.applications import NER

msra_train_file = 'datasets/ner/msra/train_data'
msra_dev_file = 'datasets/ner/msra/test_data'

checkpoint_dir = 'pretrained_models'
model_name = 'msra_ner_bert_crf'

if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

train_data, train_labels = load_ner_data_and_labels(msra_train_file)
valid_data, valid_labels = load_ner_data_and_labels(msra_dev_file)

ner = NER(use_pretrained=False)
ner.fit(train_data, train_labels, valid_data, valid_labels,
        ner_model_type='bert',
        use_char=False,
        use_word=False,
        use_bert=True,
        # paths to the files of the pretrained BERT model
        bert_vocab_file='pretrained_embeddings/chinese_L-12_H-768_A-12/vocab.txt',
        bert_config_file='pretrained_embeddings/chinese_L-12_H-768_A-12/bert_config.json',
        bert_checkpoint_file='pretrained_embeddings/chinese_L-12_H-768_A-12/bert_model.ckpt',
        bert_trainable=True,
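        # fine-tune BERT end-to-end, hence the small learning rate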
        optimizer=tf.keras.optimizers.Adam(1e-5),
        callback_list=['modelcheckpoint', 'earlystopping', 'swa'],
        checkpoint_dir=checkpoint_dir,
        model_name=model_name,
        load_swa_model=True)