def test_train(self):
    """End-to-end training smoke test.

    Loads CoNLL-style train/valid data, fits the preprocessor, builds the
    sequence-labeling model, trains it, and persists both the preprocessor
    and the trained weights under ``SAVE_ROOT``.
    """
    model_config = ModelConfig()
    training_config = TrainingConfig()

    train_path = os.path.join(DATA_ROOT, 'train.txt')
    valid_path = os.path.join(DATA_ROOT, 'valid.txt')
    x_train, y_train = load_data_and_labels(train_path)
    x_valid, y_valid = load_data_and_labels(valid_path)

    p = prepare_preprocessor(x_train, y_train)
    p.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
    embeddings = load_word_embeddings(p.vocab_word, EMBEDDING_PATH,
                                      model_config.word_embedding_size)
    # Keep the config in sync with the fitted vocabularies before building
    # the model. vocab_size was previously never set here (only
    # char_vocab_size was), leaving the word-vocabulary size at its default.
    model_config.vocab_size = len(p.vocab_word)
    model_config.char_vocab_size = len(p.vocab_char)

    model = SeqLabeling(model_config, embeddings, len(p.vocab_tag))

    trainer = anago.Trainer(model,
                            training_config,
                            checkpoint_path=LOG_ROOT,
                            save_path=SAVE_ROOT,
                            preprocessor=p,
                            embeddings=embeddings)
    trainer.train(x_train, y_train, x_valid, y_valid)

    model.save(os.path.join(SAVE_ROOT, 'model_weights.h5'))
def test_load_word_embeddings(self):
    """The embedding width follows the requested dim, capped at the file's width."""
    self.DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
    glove_file = os.path.join(self.DATA_DIR, 'glove.50d.txt')
    word_to_index = {word: idx
                     for idx, word in enumerate(load_glove_vocab(glove_file))}

    # Requesting a dim at or below the file's width (50) is honored exactly.
    for requested in (50, 10):
        matrix = load_word_embeddings(word_to_index, glove_file, dim=requested)
        self.assertEqual(matrix.shape[1], requested)

    # Requesting more columns than the file provides falls back to the
    # file's true dimensionality instead of padding.
    oversized, true_width = 1000, 50
    matrix = load_word_embeddings(word_to_index, glove_file, dim=oversized)
    self.assertNotEqual(matrix.shape[1], oversized)
    self.assertEqual(matrix.shape[1], true_width)
import os

from anago.models import SeqLabeling
import numpy as np
from anago.reader import load_word_embeddings, load_data_and_labels

# NOTE(review): ModelConfig and prepare_preprocessor are used below but not
# imported in this section — presumably imported elsewhere in the file; verify.

DATA_ROOT = 'data/conll2003/en/ner'
LOAD_ROOT = './models'  # trained model
LOG_ROOT = './logs'  # checkpoint, tensorboard
embedding_path = '/media/jan/OS/Dataset/WordEmbeddings/wiki.en.vec'

model_config = ModelConfig()

# Rebuild the preprocessor from the same data the model was trained on so
# that vocabulary indices line up with the saved weights.
test_path = os.path.join(DATA_ROOT, 'train.small.txt')
x_test, y_test = load_data_and_labels(test_path)
p = prepare_preprocessor(x_test, y_test)
embeddings = load_word_embeddings(p.vocab_word, embedding_path,
                                  model_config.word_embedding_size)
model_config.vocab_size = len(p.vocab_word)
model_config.char_vocab_size = len(p.vocab_char)

model_path = os.path.join(LOAD_ROOT, 'mymodel.h5')
model = SeqLabeling(model_config, embeddings, len(p.vocab_tag))
model.load(model_path)

X, y = p.transform(x_test, y_test)
predictions = model.predict(X)

# X[2] carries the per-sentence lengths; strip padding before decoding tags.
for words, prediction, sentence_length in zip(x_test, predictions, X[2]):
    nopad_prediction = prediction[:sentence_length.item()]
    label_indices = [np.argmax(x) for x in nopad_prediction]
    labels = p.inverse_transform(label_indices)
    # Fixed: was a Python 2 `print` statement, which is a SyntaxError under
    # Python 3 (the rest of the codebase uses the print() function).
    print("\n".join(["{}\t{}".format(w, l) for w, l in zip(words, labels)]))
def train_anago(keras_model_name="WCP", data_name="laptops", task_name="ATEPC2",
                hand_features=None):
    """Train and evaluate an aspect-term / polarity sequence model with 10-fold CV.

    For each fold: trains a model on the fold's train split, evaluates F1 on
    train/valid/test, writes test predictions to a TSV, and scores them with
    ATEPCEvaluator. Prints per-fold and averaged results.

    Args:
        keras_model_name: architecture code; the letters "P" and "H" toggle
            POS features and hand-crafted features respectively.
        data_name: dataset identifier used to build file names.
        task_name: task identifier used to build file names.
        hand_features: optional collection of hand-feature names; "UNIPOS"
            switches to the universal-POS embedding file.
    """
    DATA_ROOT = 'data'
    SAVE_ROOT = './models'  # trained models
    LOG_ROOT = './logs'  # checkpoint, tensorboard
    w_embedding_path = '/home/s1610434/Documents/Data/Vector/glove.twitter.27B.100d.txt'
    c_embedding_path = '/home/s1610434/Documents/Data/Vector/AmaYelp/GloVe/glove.char.100.txt'
    pos_embedding_path = '/home/s1610434/Documents/Data/Vector/AmaYelp/GloVe/glove.pos.100.txt'
    unipos_embedding_path = '/home/s1610434/Documents/Data/Vector/AmaYelp/GloVe/glove.unipos.100.txt'

    model_config = prepare_modelconfig(keras_model_name)
    training_config = TrainingConfig()
    training_config.max_epoch = 100
    training_config.early_stopping = 30

    print("-----{0}-----{1}-----{2}-----{3}-----".format(
        task_name, data_name, keras_model_name, hand_features))
    save_path = SAVE_ROOT + "/{0}/{1}".format(data_name, task_name)
    train_path = os.path.join(DATA_ROOT, '{0}.{1}.train.tsv'.format(data_name, task_name))
    test_path = os.path.join(DATA_ROOT, '{0}.{1}.test.tsv'.format(data_name, task_name))
    train_dep_path = os.path.join(
        DATA_ROOT, '{0}.{1}.train.dep.tsv'.format(data_name, task_name))
    test_dep_path = os.path.join(
        DATA_ROOT, '{0}.{1}.test.dep.tsv'.format(data_name, task_name))

    # train set
    x_train_valid, y_train_valid, _ = collect_data_from_tsv(train_path)
    x_train_valid_dep = collect_dept_data_from_tsv(train_dep_path)
    # test set
    X_test, Y_test, _ = collect_data_from_tsv(test_path)
    X_test_dep = collect_dept_data_from_tsv(test_dep_path)
    # Fit vocabularies on train+test combined so test-time tokens are known.
    X_train_test = np.concatenate((x_train_valid, X_test), 0)
    X_train_test_dep = np.concatenate((x_train_valid_dep, X_test_dep), 0)
    Y_train_test = np.concatenate((y_train_valid, Y_test), 0)

    # preprocessor
    p = prepare_preprocessor(list(zip(X_train_test, X_train_test_dep)),
                             Y_train_test,
                             keras_model_name=keras_model_name,
                             hand_features=hand_features)
    print(len(p.vocab_word))
    print(len(p.vocab_char))
    model_config.vocab_size = len(p.vocab_word)
    model_config.char_vocab_size = len(p.vocab_char)

    if keras_model_name.find("P") != -1:
        # "UNIPOS" hand feature swaps in the universal-POS embedding file.
        if hand_features is not None:
            if "UNIPOS" in hand_features:
                pos_embedding_path = unipos_embedding_path
        model_config.pos_vocab_size = len(p.pos_extractor.features_dict)
    if keras_model_name.find("H") != -1:
        # Hard-coded dimension; originally computed via gen_no_hand_dimension().
        model_config.hand_feature_size = 53
        print("model_config.hand_feature_size: ", str(model_config.hand_feature_size))

    # Load word embeddings; char/POS embeddings are learned from scratch
    # (their pretrained-load paths above are currently unused).
    W_embeddings = load_word_embeddings(p.vocab_word, w_embedding_path,
                                        model_config.word_embedding_size)
    print("Load W_embeddings: {0}".format(W_embeddings.shape))
    C_embeddings = None
    POS_embeddings = None

    atepc_evaluator = ATEPCEvaluator()
    results = []
    # TODO Kfold split
    kf = KFold(n_splits=10)
    i_fold = 0
    for train_index, valid_index in kf.split(x_train_valid):
        model_name = "{0}.{1}.{2}".format(
            keras_model_name, "{0}".format(hand_features), i_fold)
        X_train, X_valid = x_train_valid[train_index], x_train_valid[valid_index]
        X_train_dep, X_valid_dep = (x_train_valid_dep[train_index],
                                    x_train_valid_dep[valid_index])
        Y_train, Y_valid = y_train_valid[train_index], y_train_valid[valid_index]
        print("Data train: ", X_train.shape, Y_train.shape)
        print("Data valid: ", X_valid.shape, Y_valid.shape)
        print("Data test: ", X_test.shape, Y_test.shape)

        trainer = Trainer(model_config=model_config,
                          training_config=training_config,
                          checkpoint_path=LOG_ROOT,
                          save_path=save_path,
                          preprocessor=p,
                          W_embeddings=W_embeddings,
                          C_embeddings=C_embeddings,
                          POS_embeddings=POS_embeddings,
                          keras_model_name=keras_model_name,
                          model_name=model_name)
        trainer.train(list(zip(X_train, X_train_dep)), Y_train,
                      list(zip(X_valid, X_valid_dep)), Y_valid)

        evaluator = anago.Evaluator(model_config,
                                    weights=model_name,
                                    save_path=save_path,
                                    preprocessor=p,
                                    keras_model_name=keras_model_name)
        print("--- Test phrase --- " + model_name)
        print("Train ")
        f1_score_train = evaluator.eval(list(zip(X_train, X_train_dep)), Y_train)
        print("Validation ")
        f1_score_valid = evaluator.eval(list(zip(X_valid, X_valid_dep)), Y_valid)
        print("Test ")
        f1_score_test = evaluator.eval(list(zip(X_test, X_test_dep)), Y_test)
        print("---")
        i_fold += 1

        # Dump per-token test predictions, then score with ATEPCEvaluator.
        f_out_name = "data/{0}.{1}.test.pred.tsv".format(data_name, task_name)
        tagger = anago.Tagger(model_config,
                              model_name,
                              save_path=save_path,
                              preprocessor=p,
                              keras_model_name=keras_model_name)
        # Use a context manager so the file is closed even if prediction fails
        # (was a bare open()/close() pair).
        with open(f_out_name, "w") as f_out:
            for x, y in zip(list(zip(X_test, X_test_dep)), Y_test):
                result = tagger.predict(x)
                for word, label, pred in zip(x[0], y, result):
                    f_out.write("{0}\t{1}\t{2}\n".format(word, label, pred))
                f_out.write("\n")
        ate_f1, apc_acc, c_apc_acc = atepc_evaluator.evaluate(f_out_name)
        results.append([ate_f1, apc_acc, c_apc_acc])
        print(results[-1])

    print("-----All-----{0}--{1}".format(keras_model_name, data_name))
    for result in results:
        print(result)
    print("-----AVG-----")
    results_np = np.array(results, dtype=np.float32)
    print(results_np.mean(axis=0))
    print("-------------")