Пример #1
0
    def test_train(self):
        """Run the full training pipeline once and save its artifacts.

        Loads the ``train.txt``/``valid.txt`` splits from DATA_ROOT, builds
        and saves a preprocessor from the training split, loads word
        embeddings for its word vocabulary, trains a ``SeqLabeling`` model
        through ``anago.Trainer`` and finally calls ``model.save``. The test
        makes no assertions; it passes when nothing raises.
        """
        model_cfg = ModelConfig()
        train_cfg = TrainingConfig()

        # Read both dataset splits.
        x_train, y_train = load_data_and_labels(
            os.path.join(DATA_ROOT, 'train.txt'))
        x_valid, y_valid = load_data_and_labels(
            os.path.join(DATA_ROOT, 'valid.txt'))

        # The preprocessor is built from the training split and saved
        # before the embeddings are loaded.
        preprocessor = prepare_preprocessor(x_train, y_train)
        preprocessor.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
        word_vectors = load_word_embeddings(
            preprocessor.vocab_word,
            EMBEDDING_PATH,
            model_cfg.word_embedding_size)
        model_cfg.char_vocab_size = len(preprocessor.vocab_char)

        tag_count = len(preprocessor.vocab_tag)
        model = SeqLabeling(model_cfg, word_vectors, tag_count)

        trainer = anago.Trainer(
            model,
            train_cfg,
            checkpoint_path=LOG_ROOT,
            save_path=SAVE_ROOT,
            preprocessor=preprocessor,
            embeddings=word_vectors)
        trainer.train(x_train, y_train, x_valid, y_valid)

        weights_file = os.path.join(SAVE_ROOT, 'model_weights.h5')
        model.save(weights_file)
Пример #2
0
    def test_load_word_embeddings(self):
        """Check the embedding width returned for several requested dims.

        Uses the ``glove.50d.txt`` fixture in the ``data`` directory next to
        this file. The test expects the requested width back for 50 and 10,
        and a width of 50 (not 1000) when 1000 is requested.
        """
        self.DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
        glove_file = os.path.join(self.DATA_DIR, 'glove.50d.txt')
        words = load_glove_vocab(glove_file)
        word_index = {word: idx for idx, word in enumerate(words)}

        # (requested dim, width the test expects back)
        cases = ((50, 50), (10, 10), (1000, 50))
        for dim, expected_dim in cases:
            embeddings = load_word_embeddings(word_index, glove_file, dim=dim)
            if dim != expected_dim:
                # The requested width must not be honoured in this case.
                self.assertNotEqual(embeddings.shape[1], dim)
            self.assertEqual(embeddings.shape[1], expected_dim)
Пример #3
0
from anago.models import SeqLabeling
import numpy as np
from anago.reader import load_word_embeddings, load_data_and_labels

# Script: load a saved SeqLabeling model and print one "word<TAB>label" line
# per token for every sentence in the evaluation file.
DATA_ROOT = 'data/conll2003/en/ner'
LOAD_ROOT = './models'  # trained model
LOG_ROOT = './logs'  # checkpoint, tensorboard
embedding_path = '/media/jan/OS/Dataset/WordEmbeddings/wiki.en.vec'
model_config = ModelConfig()

test_path = os.path.join(DATA_ROOT, 'train.small.txt')
x_test, y_test = load_data_and_labels(test_path)

# The preprocessor (and therefore every vocabulary below) is built from the
# same data that is tagged afterwards.
p = prepare_preprocessor(x_test, y_test)

embeddings = load_word_embeddings(p.vocab_word, embedding_path,
                                  model_config.word_embedding_size)
model_config.vocab_size = len(p.vocab_word)
model_config.char_vocab_size = len(p.vocab_char)

model_path = os.path.join(LOAD_ROOT, 'mymodel.h5')
model = SeqLabeling(model_config, embeddings, len(p.vocab_tag))
model.load(model_path)
X, y = p.transform(x_test, y_test)
predictions = model.predict(X)

# X[2] is iterated here as the per-sentence length used to cut each
# prediction back to its unpadded size.
for words, prediction, sentence_length in zip(x_test, predictions, X[2]):
    nopad_prediction = prediction[:sentence_length.item()]
    # Highest-scoring tag index for each remaining position.
    label_indices = [np.argmax(scores) for scores in nopad_prediction]
    labels = p.inverse_transform(label_indices)

    # print() with a single argument behaves the same on Python 2 and 3;
    # the former print statement is a SyntaxError on Python 3.
    print("\n".join("{}\t{}".format(word, label)
                    for word, label in zip(words, labels)))
Пример #4
0
def _write_predictions(tagger, inputs, gold_labels, out_path):
    """Tag every input with *tagger* and write the results to *out_path*.

    Each token becomes one ``word<TAB>gold<TAB>predicted`` line and every
    sentence is followed by a blank line. The file is opened in a ``with``
    block so it is closed even if tagging or writing raises.
    """
    with open(out_path, "w") as f_out:
        for x, y in zip(inputs, gold_labels):
            result = tagger.predict(x)
            # x is a (words, dependency data) pair; only the words are written.
            f_out.writelines(
                "{0}\t{1}\t{2}\n".format(word, label, pred)
                for word, label, pred in zip(x[0], y, result))
            f_out.write("\n")


def train_anago(keras_model_name="WCP",
                data_name="laptops",
                task_name="ATEPC2",
                hand_features=None,
                n_splits=10):
    """Run K-fold training and evaluation for one dataset/task combination.

    Reads ``<data_name>.<task_name>.{train,test}.tsv`` and the matching
    ``.dep.tsv`` files from ``data``, builds a preprocessor over the
    concatenated train and test data, and loads word embeddings. For every
    fold it then trains a model, evaluates it on the train, validation and
    test sets, writes the test predictions to
    ``data/<data_name>.<task_name>.test.pred.tsv`` (overwritten by each fold)
    and scores that file with ``ATEPCEvaluator``. Per-fold results and their
    mean are printed.

    Args:
        keras_model_name: Model name. A ``"P"`` in it sets
            ``model_config.pos_vocab_size``; an ``"H"`` sets
            ``model_config.hand_feature_size``.
        data_name: Dataset prefix used in the input/output file names and in
            the model save path.
        task_name: Task name used in the input/output file names and in the
            model save path.
        hand_features: Optional collection of hand-feature names, forwarded
            to ``prepare_preprocessor`` and included in the model name.
        n_splits: Number of folds passed to ``KFold``. Defaults to 10.

    Returns:
        None. All results are printed.
    """
    DATA_ROOT = 'data'
    SAVE_ROOT = './models'  # trained models
    LOG_ROOT = './logs'  # checkpoint, tensorboard
    w_embedding_path = '/home/s1610434/Documents/Data/Vector/glove.twitter.27B.100d.txt'
    # NOTE: the char/POS embedding paths are only consumed by the disabled
    # embedding-loading code further down.
    c_embedding_path = '/home/s1610434/Documents/Data/Vector/AmaYelp/GloVe/glove.char.100.txt'
    pos_embedding_path = '/home/s1610434/Documents/Data/Vector/AmaYelp/GloVe/glove.pos.100.txt'
    unipos_embedding_path = '/home/s1610434/Documents/Data/Vector/AmaYelp/GloVe/glove.unipos.100.txt'

    model_config = prepare_modelconfig(keras_model_name)
    training_config = TrainingConfig()
    training_config.max_epoch = 100
    training_config.early_stopping = 30

    print("-----{0}-----{1}-----{2}-----{3}-----".format(
        task_name, data_name, keras_model_name, hand_features))
    save_path = SAVE_ROOT + "/{0}/{1}".format(data_name, task_name)
    train_path = os.path.join(DATA_ROOT,
                              '{0}.{1}.train.tsv'.format(data_name, task_name))
    test_path = os.path.join(DATA_ROOT,
                             '{0}.{1}.test.tsv'.format(data_name, task_name))
    train_dep_path = os.path.join(
        DATA_ROOT, '{0}.{1}.train.dep.tsv'.format(data_name, task_name))
    test_dep_path = os.path.join(
        DATA_ROOT, '{0}.{1}.test.dep.tsv'.format(data_name, task_name))
    # Loop-invariant: every fold writes its test predictions to this file.
    pred_path = os.path.join(
        DATA_ROOT, '{0}.{1}.test.pred.tsv'.format(data_name, task_name))

    # train set
    x_train_valid, y_train_valid, _ = collect_data_from_tsv(train_path)
    x_train_valid_dep = collect_dept_data_from_tsv(train_dep_path)

    # test set
    X_test, Y_test, _ = collect_data_from_tsv(test_path)
    X_test_dep = collect_dept_data_from_tsv(test_dep_path)

    # train_test set
    X_train_test = np.concatenate((x_train_valid, X_test), 0)
    X_train_test_dep = np.concatenate((x_train_valid_dep, X_test_dep), 0)
    Y_train_test = np.concatenate((y_train_valid, Y_test), 0)

    # preprocessor (built over train and test data together)
    p = prepare_preprocessor(list(zip(X_train_test, X_train_test_dep)),
                             Y_train_test,
                             keras_model_name=keras_model_name,
                             hand_features=hand_features)

    print(len(p.vocab_word))
    print(len(p.vocab_char))
    model_config.vocab_size = len(p.vocab_word)
    model_config.char_vocab_size = len(p.vocab_char)
    if "P" in keras_model_name:
        if hand_features is not None and "UNIPOS" in hand_features:
            pos_embedding_path = unipos_embedding_path
        model_config.pos_vocab_size = len(p.pos_extractor.features_dict)
    if "H" in keras_model_name:
        # model_config.hand_feature_size = gen_no_hand_dimension(data_name, hand_features, keras_model_name)
        model_config.hand_feature_size = 53
        print("model_config.hand_feature_size: ",
              str(model_config.hand_feature_size))

    # load embedding
    W_embeddings = load_word_embeddings(p.vocab_word, w_embedding_path,
                                        model_config.word_embedding_size)
    print("Load W_embeddings: {0}".format(W_embeddings.shape))
    C_embeddings = None
    POS_embeddings = None
    # if "C" in keras_model_name:
    #     C_embeddings = load_word_embeddings(p.vocab_char, c_embedding_path, model_config.char_embedding_size)
    #     print("Load C_embeddings: {0}".format(C_embeddings.shape))
    # if "P" in keras_model_name:
    #     POS_embeddings = load_word_embeddings(p.pos_extractor.features_dict, pos_embedding_path, model_config.pos_embedding_size)
    #     print("Load POS_embeddings: {0}".format(POS_embeddings.shape))

    atepc_evaluator = ATEPCEvaluator()
    results = []

    kf = KFold(n_splits=n_splits)
    for i_fold, (train_index, valid_index) in enumerate(
            kf.split(x_train_valid)):
        model_name = "{0}.{1}.{2}".format(keras_model_name, hand_features,
                                          i_fold)
        X_train, X_valid = x_train_valid[train_index], x_train_valid[
            valid_index]
        X_train_dep, X_valid_dep = x_train_valid_dep[
            train_index], x_train_valid_dep[valid_index]
        Y_train, Y_valid = y_train_valid[train_index], y_train_valid[
            valid_index]

        print("Data train: ", X_train.shape, Y_train.shape)
        print("Data valid: ", X_valid.shape, Y_valid.shape)
        print("Data  test: ", X_test.shape, Y_test.shape)

        trainer = Trainer(model_config=model_config,
                          training_config=training_config,
                          checkpoint_path=LOG_ROOT,
                          save_path=save_path,
                          preprocessor=p,
                          W_embeddings=W_embeddings,
                          C_embeddings=C_embeddings,
                          POS_embeddings=POS_embeddings,
                          keras_model_name=keras_model_name,
                          model_name=model_name)

        # trainer = Trainer2(model_config=model_config,
        #                         training_config=training_config,
        #                         checkpoint_path=LOG_ROOT,
        #                         save_path=save_path,
        #                         preprocessor=p,
        #                         W_embeddings=W_embeddings,
        #                         C_embeddings=C_embeddings,
        #                         POS_embeddings=POS_embeddings,
        #                         keras_model_name = keras_model_name,
        #                         model_name=model_name)

        trainer.train(list(zip(X_train, X_train_dep)), Y_train,
                      list(zip(X_valid, X_valid_dep)), Y_valid)

        evaluator = anago.Evaluator(model_config,
                                    weights=model_name,
                                    save_path=save_path,
                                    preprocessor=p,
                                    keras_model_name=keras_model_name)
        # The return values of eval() are not used here; the calls are kept
        # for whatever they report themselves.
        print("--- Test phrase --- " + model_name)
        print("Train ")
        evaluator.eval(list(zip(X_train, X_train_dep)), Y_train)
        print("Validation ")
        evaluator.eval(list(zip(X_valid, X_valid_dep)), Y_valid)
        print("Test ")
        evaluator.eval(list(zip(X_test, X_test_dep)), Y_test)
        print("---")

        tagger = anago.Tagger(model_config,
                              model_name,
                              save_path=save_path,
                              preprocessor=p,
                              keras_model_name=keras_model_name)
        _write_predictions(tagger, zip(X_test, X_test_dep), Y_test, pred_path)
        ate_f1, apc_acc, c_apc_acc = atepc_evaluator.evaluate(pred_path)
        results.append([ate_f1, apc_acc, c_apc_acc])
        print(results[-1])

    print("-----All-----{0}--{1}".format(keras_model_name, data_name))
    for result in results:
        print(result)
    print("-----AVG-----")
    results_np = np.array(results, dtype=np.float32)
    print(results_np.mean(axis=0))
    print("-------------")