Пример #1
0
 def on_epoch_end(self, epoch, logs=None):
     """Keras callback hook: evaluate validation F1 at the end of each epoch.

     Runs prediction on ``self.generator``, maps predicted indices back to
     label strings, scores against ``self.valid_labels`` and records the F1
     score in ``logs['valid_f1']`` so other callbacks (e.g. a checkpoint
     monitoring 'valid_f1') can see it.
     """
     logs = logs or {}
     pred = transform_indices_to_labels(self.index_to_label,
                                        predict(self.model, self.generator))
     # compute_f1 returns (precision, recall, f1) — see the print below.
     f1 = compute_f1(pred, self.valid_labels)
     # BUG FIX: previously stored f1[0] (precision) under 'valid_f1';
     # the F1 score is the third element of the tuple.
     logs['valid_f1'] = f1[2]
     print(f'Precision: {f1[0]}, Recall: {f1[1]}, F1: {f1[2]}')
Пример #2
0
def test(model, test_set, idx2Label, package):
    """Evaluate *model* on *test_set* and report precision/recall/F1.

    :param model:     trained model to evaluate
    :param test_set:  raw test samples; batched via createBatches()
    :param idx2Label: index -> label-string mapping used by compute_f1
    :param package:   extra context forwarded to tag_dataset()
    :return:          (precision, recall, f1) on the test data
    """
    batches, batch_lengths = createBatches(test_set)
    predicted, gold = tag_dataset(batches, model, package)
    precision, recall, f1 = compute_f1(predicted, gold, idx2Label)
    print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" %
          (precision, recall, f1))
    return precision, recall, f1
Пример #3
0
    def train(self):
        """Default training loop.

        Trains for ``self.epochs`` epochs of mini-batches, evaluating F1 on
        the test and dev splits after every epoch (scores accumulated in
        ``self.f1_test_history`` / ``self.f1_dev_history``). Afterwards the
        model is saved to ``<modelName>.h5`` and its weights are reset to the
        initial snapshot so the instance can be retrained.
        """
        self.f1_test_history = []
        self.f1_dev_history = []

        for epoch in range(self.epochs):
            print("Epoch {}/{}".format(epoch, self.epochs))
            for i, batch in enumerate(
                    iterate_minibatches(self.train_batch,
                                        self.train_batch_len)):
                labels, tokens, casing, char = batch
                self.model.train_on_batch([tokens, casing, char], labels)

            # Epoch-end evaluation on the test split.
            predLabels, correctLabels = self.tag_dataset(
                self.test_batch, self.model)
            pre_test, rec_test, f1_test = compute_f1(predLabels, correctLabels,
                                                     self.idx2Label)
            self.f1_test_history.append(f1_test)
            print("f1 test ", round(f1_test, 4))

            # Epoch-end evaluation on the dev split.
            predLabels, correctLabels = self.tag_dataset(
                self.dev_batch, self.model)
            pre_dev, rec_dev, f1_dev = compute_f1(predLabels, correctLabels,
                                                  self.idx2Label)
            self.f1_dev_history.append(f1_dev)
            print("f1 dev ", round(f1_dev, 4), "\n")

        # BUG FIX: with epochs == 0 the loop never runs and f1_test would be
        # unbound (NameError); report from the history instead.
        if self.f1_test_history:
            print("Final F1 test score: ", self.f1_test_history[-1])

        print("Training finished.")

        # Save model under a name encoding the hyper-parameters.
        self.modelName = "{}_{}_{}_{}_{}_{}_{}".format(
            self.epochs, self.dropout, self.dropout_recurrent,
            self.lstm_state_size, self.conv_size, self.learning_rate,
            self.optimizer.__class__.__name__)

        modelName = self.modelName + ".h5"
        self.model.save(modelName)
        print("Model weights saved.")

        self.model.set_weights(self.init_weights)  # clear model
        print("Model weights cleared.")
Пример #4
0
    def evaluate(self, predictions, groundTruths, *args, **kwargs):
        """
        :param predictions:     Predictions from predict() function
        :param groundTruths:    ground Truth values
        :param args:            None
        :param kwargs:          None
        :return:                A tuple with precision, recall and F1 scores
        """
        scores = compute_f1(predictions, groundTruths,
                            self.cnnBLSTM_Obj.idx2Label)
        precision, recall, f1 = scores
        return (precision, recall, f1)
Пример #5
0
# Final softmax layer over the label vocabulary, applied per time step.
output = TimeDistributed(Dense(len(label2Idx), activation='softmax'))(output)
model = Model(inputs=[words_input, casing_input, character_input],
              outputs=[output])
# sparse_categorical_crossentropy: labels are integer indices, not one-hot.
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')
model.summary()
# plot_model(model, to_file='model.png')

# Manual epoch loop with a progress bar; each batch is a
# (labels, tokens, casing, char) tuple produced by iterate_minibatches.
for epoch in range(epochs):
    print("Epoch %d/%d" % (epoch, epochs))
    a = Progbar(len(train_batch_len))
    for i, batch in enumerate(iterate_minibatches(train_batch,
                                                  train_batch_len)):
        labels, tokens, casing, char = batch
        model.train_on_batch([tokens, casing, char], labels)
        a.update(i)
    a.update(i + 1)  # push the bar to 100% after the last batch
    print(' ')

model.save("models/model.h5")

#   Performance on dev dataset
predLabels, correctLabels = tag_dataset(dev_batch)
pre_dev, rec_dev, f1_dev = compute_f1(predLabels, correctLabels, idx2Label)
print("Dev-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" % (pre_dev, rec_dev, f1_dev))

#   Performance on test dataset
predLabels, correctLabels = tag_dataset(test_batch)
pre_test, rec_test, f1_test = compute_f1(predLabels, correctLabels, idx2Label)
print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" %
      (pre_test, rec_test, f1_test))
Пример #6
0
def main():
    """Train and evaluate a char-CNN + BiLSTM + CRF NER model on CoNLL-2003.

    Loads train/dev/test splits, builds label/character vocabularies and GloVe
    embeddings, trains for one epoch with per-epoch validation F1, then scores
    the test split and dumps token/gold/predicted triples to ner_results.txt.
    """
    conll_2003_data_dir = './data/conll-2003'
    conll_2003_indices = [0, 3]  # token column and NER-tag column
    conll_2003_ignore = ('-DOCSTART-', )

    char_cnn_width = 30  # max word length fed to the character CNN

    train_data, train_labels = load_conll(
        os.path.join(conll_2003_data_dir, 'train.txt'), conll_2003_indices,
        conll_2003_ignore)
    validate_data, validate_labels = load_conll(
        os.path.join(conll_2003_data_dir, 'dev.txt'), conll_2003_indices,
        conll_2003_ignore)
    test_data, test_labels = load_conll(
        os.path.join(conll_2003_data_dir, 'test.txt'), conll_2003_indices,
        conll_2003_ignore)

    # Label inventory taken over all splits so index mappings are stable.
    label_classes = sorted({
        l
        for sent_labels in train_labels + validate_labels + test_labels
        for l in sent_labels
    })
    label_to_index = build_labels_mapping(label_classes)
    index_to_label = build_indices_mapping(label_classes)

    # Character inventory for the char-CNN input.
    characters = sorted({
        ch
        for sent in train_data + validate_data + test_data for word in sent
        for ch in word
    })

    word_set = {
        w
        for sent in train_data + validate_data + test_data for w in sent
    }

    print(f'{len(word_set)} unique words found.')

    embed = Embeddings('./embeddings/eng/glove.6B.300d.txt',
                       True,
                       word_set=word_set)
    embed_matrix = embed.matrix

    train_inputs = make_ner_inputs(train_data, embed, characters,
                                   char_cnn_width)
    train_outputs = make_ner_one_hot_outputs(train_labels, label_to_index)
    validate_inputs = make_ner_inputs(validate_data, embed, characters,
                                      char_cnn_width)
    test_inputs = make_ner_inputs(test_data, embed, characters, char_cnn_width)

    # CONSISTENCY FIX: pass char_cnn_width instead of a duplicated literal 30
    # so the model width always matches the inputs built above.
    model = build_model_char_cnn_lstm_crf(len(label_classes), embed_matrix,
                                          char_cnn_width, len(characters))

    train_generator = DataGenerator(train_inputs, train_outputs, 32)

    evaluator = ModelEval(DataGenerator(validate_inputs), validate_labels,
                          index_to_label)

    # BUG FIX: monitor='valid_f1' with mode left at 'auto' is inferred as
    # 'min' by Keras (the name contains neither 'acc' nor 'fmeasure'), so
    # save_best_only would keep the WORST epoch. F1 must be maximised.
    model_saver = ModelCheckpoint(filepath='./checkpoints/' +
                                  model.name.replace(' ', '_') +
                                  '_{epoch:02d}.hdf5',
                                  verbose=1,
                                  save_best_only=True,
                                  monitor='valid_f1',
                                  mode='max')

    time_stamp = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
    csv_logger = CSVLogger(f"./logs/NER_log_{time_stamp}.csv", append=False)

    # model.load_weights('./checkpoints/char_cnn_bilstm_crf17.hdf5')

    model.fit_generator(train_generator,
                        epochs=1,
                        callbacks=[evaluator, model_saver, csv_logger])

    test_pred_indices = predict(model, DataGenerator(test_inputs))

    test_pred_labels = transform_indices_to_labels(index_to_label,
                                                   test_pred_indices)

    print(compute_f1(test_pred_labels, test_labels))

    # One "token true_label pred_label" line per token, blank line per sentence.
    with open('ner_results.txt', 'wt') as file:
        for sent, sent_true_labels, sent_pred_labels in zip(
                test_data, test_labels, test_pred_labels):
            file.writelines([
                ' '.join(z) + '\n'
                for z in zip(sent, sent_true_labels, sent_pred_labels)
            ])
            file.write('\n')
Пример #7
0
    def train(self):
        """Default training.

        If no saved model exists (``verify_model_exists()`` is False), train
        for ``self.epochs`` epochs, reporting precision/recall/F1 on the test
        and dev splits after every epoch, then persist results and weights.
        Otherwise skip training and evaluate the already-loaded model once.
        Either way the model's weights are reset to the initial snapshot at
        the end.
        """

        self.f1_test_history = []
        self.f1_dev_history = []

        if self.verify_model_exists() is False:

            for epoch in range(self.epochs):
                print("Epoch {}/{}".format(epoch, self.epochs))
                for i, batch in enumerate(
                        iterate_minibatches(self.train_batch,
                                            self.train_batch_len)):
                    labels, tokens, casing, char = batch
                    self.model.train_on_batch([tokens, casing, char], labels)

                # compute F1 scores
                predLabels, correctLabels = self.tag_dataset(
                    self.test_batch, self.model)
                pre_test, rec_test, f1_test = compute_f1(
                    predLabels, correctLabels, self.idx2Label)
                self.f1_test_history.append(f1_test)
                print("\nprec test ", round(pre_test, 4))
                print("rec test", round(rec_test, 4))
                print("f1 test ", round(f1_test, 4))

                # same evaluation on the dev split
                predLabels, correctLabels = self.tag_dataset(
                    self.dev_batch, self.model)
                pre_dev, rec_dev, f1_dev = compute_f1(predLabels,
                                                      correctLabels,
                                                      self.idx2Label)
                self.f1_dev_history.append(f1_dev)
                print("\nprec dev ", round(pre_dev, 4))
                print("rec dev ", round(rec_dev, 4))
                print("f1 dev ", round(f1_dev, 4), "\n")

                # NOTE(review): printed every epoch, so "Final" here means
                # "latest", not the post-training value.
                print("Final F1 dev score: ", f1_dev)

            self.writeToFile()

            modelName = "./models/" + self.modelName + ".h5"
            self.model.save(modelName)
            print("Model weights saved.")
            print("Training finished.")
        else:
            # Model already exists on disk: evaluate only, no training.
            # compute F1 scores
            predLabels, correctLabels = self.tag_dataset(
                self.test_batch, self.model)
            pre_test, rec_test, f1_test = compute_f1(predLabels, correctLabels,
                                                     self.idx2Label)
            self.f1_test_history.append(f1_test)

            print("\nprec test ", round(pre_test, 4))
            print("rec test", round(rec_test, 4))
            print("f1 test ", round(f1_test, 4))

            predLabels, correctLabels = self.tag_dataset(
                self.dev_batch, self.model)
            pre_dev, rec_dev, f1_dev = compute_f1(predLabels, correctLabels,
                                                  self.idx2Label)
            self.f1_dev_history.append(f1_dev)

            print("\nprec dev ", round(pre_dev, 4))
            print("rec dev ", round(rec_dev, 4))
            print("f1 dev ", round(f1_dev, 4), "\n")

            print("Final F1 dev score: ", f1_dev)

        self.model.set_weights(self.init_weights)  # clear model
        print("Model weights cleared.")
Пример #8
0
model.summary()

# Manual epoch loop with a progress bar.
epochs = 80
for epoch in range(epochs):
    print("Epoch %d/%d" % (epoch, epochs))
    a = Progbar(len(train_batch_len))
    for i, batch in enumerate(iterate_minibatches(train_batch,
                                                  train_batch_len)):
        labels, tokens, casing, char = batch
        model.train_on_batch([tokens, casing, char], labels)
        a.update(i)
    print('\n')

#   Performance on test dataset
predLabels, correctLabels, sentences = tag_dataset(model, test_batch)

# Reconstruct readable sentences from token indices.
# PERF FIX: build the inverse vocabulary once instead of scanning all of
# word2Idx for every token index (was O(len(sentence) * vocab_size)).
# Assumes word2Idx values are unique, which the original linear scan
# effectively required as well.
idx2word = {index: token for token, index in word2Idx.items()}

sents = []
for s in sentences:
    # Words are joined with a leading space, matching the original output.
    words = ""
    for number in s[0]:
        token = idx2word.get(number)
        if token is not None:
            words += " " + token
    sents.append(words)

pre_test, rec_test, f1_test = compute_f1(predLabels, correctLabels, idx2Label,
                                         sents)
print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" %
      (pre_test, rec_test, f1_test))