def test_tagger(tagger_name, tagger_input, test_data, **kwargs):
    """Train a tagger, time training and evaluation, and report the metrics.

    Parameters
    ----------
    tagger_name : callable
        Tagger class/factory; instantiated as
        ``tagger_name(tagger_input, **kwargs)``.
    tagger_input : object
        First positional argument for the factory (e.g. training sentences
        or a pattern list, depending on the tagger type).
    test_data : list
        Tagged sentences passed to ``tagger.evaluate`` to measure accuracy.
    **kwargs
        Extra keyword arguments forwarded to the tagger factory.

    Returns
    -------
    tuple
        ``(tagger, metrics)`` where ``metrics`` contains 'train_time',
        'test_accuracy' and 'test_time'.  The original version discarded
        both and returned None; returning them is backward compatible and
        lets callers reuse the trained tagger instead of retraining.
    """
    # initialise results
    metrics = dict()
    # train -- tic()/toc() are the project's stopwatch helpers
    tic()
    tagger = tagger_name(tagger_input, **kwargs)
    metrics['train_time'] = toc()
    # test (NOTE(review): TaggerI.evaluate was renamed to .accuracy in
    # newer NLTK releases -- left as-is to match the rest of the file)
    tic()
    metrics['test_accuracy'] = tagger.evaluate(test_data)
    metrics['test_time'] = toc()
    # show results
    display_training_metrics(metrics)
    return tagger, metrics
# Load the INTERA corpus split into train/validation/test partitions.
# PROPORTION and TAG_LENGTH are module-level constants defined elsewhere
# in the file; tag_length presumably truncates the POS tag strings --
# TODO confirm against read_corpus.
train_sents, val_sents, test_sents = read_corpus('INTERA', role='train',
                                                 proportion=PROPORTION,
                                                 tag_length=TAG_LENGTH)

"""
# =============================================================================
# investigate NLTK classification tagging options
# =============================================================================
"""

""" 1. TNT tagger """
# Trigrams'n'Tags statistical tagger; trained on the tagged sentences.
tnt_eval = dict()
# train (tic/toc are the project's stopwatch helpers)
tic()
tnt_tagger = tnt.TnT()
tnt_tagger.train(train_sents)
tnt_eval['train_time'] = toc()
# test -- accuracy on the validation split, not the held-out test split
tnt_eval['test_accuracy'] = tnt_tagger.evaluate(val_sents)
tic()
tnt_eval['test_time'] = toc()
# display results
display_training_metrics(tnt_eval)

""" 2. Naive Bayes classifier tagger """
# ClassifierBasedPOSTagger defaults to a Naive Bayes classifier with
# NLTK's built-in POS feature detector.
nb_eval = dict()
# train
tic()
nb_tagger = ClassifierBasedPOSTagger(train=train_sents)
nb_eval['train_time'] = toc()
# test
tic()
nb_eval['test_accuracy'] = nb_tagger.evaluate(val_sents)
# Continuation of the model_params dict literal (its opening
# ``model_params = {`` line precedes this chunk).  These are forwarded to
# KerasClassifier and, via build_fn, to the project's build_model helper.
'build_fn': build_model,
'input_dim': train_X.shape[1],       # one-hot/vectorised feature width
'hidden_neurons': 32,
'output_dim': train_y.shape[1],      # number of POS tag classes
'epochs': 5,
'batch_size': 256,
'verbose': 1,
'validation_data': (val_X, val_y),
'shuffle': True
}
# scikit-learn wrapper around the Keras model built by build_fn
pos_model = KerasClassifier(**model_params)
# train the model
pos_model_history = pos_model.fit(train_X, train_y)
deeplearn_eval = dict()
# NOTE(review): the matching tic() call is outside this chunk -- toc()
# presumably reads a timer started before fit(); verify ordering.
deeplearn_eval['train_time'] = toc()
print(deeplearn_eval['train_time'])
# review training results
plot_model_performance(pos_model_history)
plot_model(pos_model.model, to_file=RESULTS_DIR+'Greek_POS_deep_model.png',
           show_shapes=True)

""" 4. save and test the model """
# temporarily save the trained model, history and details
# NOTE(review): the history goes to RESULTS_DIR while the model and the
# fitted vectorizer/encoder go to RESOURCES_DIR -- confirm this split of
# output locations is intentional.
pos_model.model.save(RESOURCES_DIR+'Greek_POS_DL.h5')
save_tagger(RESULTS_DIR+'Greek_POS_DL_History.pkl', pos_model_history.history)
save_tagger(RESOURCES_DIR+'Greek_POS_DL_DictVectorizer.pkl', dict_vectorizer)
save_tagger(RESOURCES_DIR+'Greek_POS_DL_LabelEncoder.pkl', label_encoder)
# train with backoff and Brill tic() tag1_tagger = DefaultTagger('NO') tag1_tagger = AffixTagger(train_sents, affix_length=-1, backoff=tag1_tagger) tag1_tagger = AffixTagger(train_sents, affix_length=-2, backoff=tag1_tagger) tag1_tagger = AffixTagger(train_sents, affix_length=-3, backoff=tag1_tagger) tag1_tagger = AffixTagger(train_sents, affix_length=-4, backoff=tag1_tagger) tag1_tagger = AffixTagger(train_sents, affix_length=-5, backoff=tag1_tagger) tag1_tagger = UnigramTagger(train_sents, cutoff=3, backoff=tag1_tagger) tag1_tagger = BigramTagger(train_sents, backoff=tag1_tagger) tag1_tagger = TrigramTagger(train_sents, backoff=tag1_tagger) tag1b_tagger = train_brill_tagger(tag1_tagger, train_sents, True, max_rules=100) tag1_eval['train_time'] = toc() # test tic() tag1_eval['test_accuracy'] = tag1b_tagger.evaluate(val_sents) tag1_eval['test_time'] = toc() # display results display_training_metrics(tag1_eval) """ # ============================================================================= # finalise a classification-based tagger # ============================================================================= """ """ 1. Naive Bayes classifier tagger with features and Brill """ nb_eval = dict() # train tic()
# ============================================================================= # compound taggers using sequential taggers and backoff # ============================================================================= """ """ 1. create a tagger utilising: n-gram, unigram, regexp and default taggers """ tag2_eval = dict() # train with backoff tic() tag2_input = create_regexp_list('Open_Word_Patterns.xlsx', RESOURCES_DIR) tag2_tagger = DefaultTagger('NO') tag2_tagger = RegexpTagger(tag2_input, backoff=tag2_tagger) tag2_tagger = UnigramTagger(train_sents, cutoff=3, backoff=tag2_tagger) tag2_tagger = BigramTagger(train_sents, backoff=tag2_tagger) tag2_tagger = TrigramTagger(train_sents, backoff=tag2_tagger) tag2_eval['train_time'] = toc() # test tic() tag2_eval['test_accuracy'] = tag2_tagger.evaluate(val_sents) tag2_eval['test_time'] = toc() # display results display_training_metrics(tag2_eval) """ 2. create a tagger utilising: n-gram, unigram, affix and default taggers """ tag1_eval = dict() # train with backoff tic() tag1_tagger = DefaultTagger('NO') tag1_tagger = AffixTagger(train_sents, affix_length=-1, backoff=tag1_tagger) tag1_tagger = AffixTagger(train_sents, affix_length=-2, backoff=tag1_tagger) tag1_tagger = AffixTagger(train_sents, affix_length=-3, backoff=tag1_tagger)
# Continuation of a read_corpus(...) call whose opening (presumably
# assigning val_70 from a 70% split) precedes this chunk -- TODO confirm.
proportion=70, tag_length=TAG_LENGTH)
# second INTERA split at the configured PROPORTION; only the test part is
# used below (val_100 is unused in this chunk)
_, val_100, test_int = read_corpus('INTERA', role='train',
                                   proportion=PROPORTION,
                                   tag_length=TAG_LENGTH)
# out-of-domain evaluation corpora
_, _, test_ud = read_corpus('UDGreek')
_, _, test_tt = read_corpus('tagged_texts')

""" 1. sequential tagger """
seq_eval = dict()
# reload the previously pickled backoff-chain tagger
seq_tag = load_tagger(RESOURCES_DIR+'Greek_POS_seq.pkl')
# word level -- per-token accuracy on each corpus
seq_eval['verification'] = seq_tag.evaluate(val_70)
tic()
seq_eval['evaluate'] = seq_tag.evaluate(test_int)
seq_eval['evaluate_time'] = toc()
seq_eval['ud_greek'] = seq_tag.evaluate(test_ud)
seq_eval['tagged_text'] = seq_tag.evaluate(test_tt)
# sentence level -- strip gold tags, re-tag, then score whole sentences
# with the project's compute_sent_acc helper
pred_int = [seq_tag.tag(s) for s in untag(test_int)]
seq_eval['sent_evaluate'] = compute_sent_acc(test_int, pred_int)
pred_ud = [seq_tag.tag(s) for s in untag(test_ud)]
seq_eval['sent_ud_greek'] = compute_sent_acc(test_ud, pred_ud)
pred_tt = [seq_tag.tag(s) for s in untag(test_tt)]
seq_eval['sent_tagged_text'] = compute_sent_acc(test_tt, pred_tt)
print('\n')
print(seq_eval)

""" 2. classification tagger """
class_eval = dict()