def test_hmm_count_correct():
    """Smoke-test HMM tagging plus count_correct on a hand-built sentence.

    Learns HMM parameters from the annotated training corpus, tags a short
    sample sentence with the HMM, and prints the (correct, correctOOV, OOV)
    counts against the hand-annotated gold tags.
    """
    sentence = "Tamir the AP comes this story :"
    tags = "PROPN DET PROPN VERB DET NOUN PUNCT"
    # Pair each token with its gold tag; zip replaces the original
    # index loop over range(len(sentence.split())).
    gold_sentence = list(zip(sentence.split(), tags.split()))
    sentences = tagger.load_annotated_corpus(
        'trainTestData/en-ud-train.upos.tsv')
    allTagCounts, perWordTagCounts, transitionCounts, emissionCounts, A, B = \
        tagger.learn_params(sentences)
    pred_sentence = tagger.hmm_tag_sentence(sentence.split(), A, B)
    correct, correctOOV, OOV = tagger.count_correct(gold_sentence,
                                                    pred_sentence)
    print(correct, correctOOV, OOV)
def test_tag_sentence():
    """Exercise tag_sentence with every supported model dictionary."""
    sentence = "Tamir the AP comes this story :"
    corpus = tagger.load_annotated_corpus(
        'trainTestData/en-ud-train.upos.tsv')
    (allTagCounts, perWordTagCounts, transitionCounts,
     emissionCounts, A, B) = tagger.learn_params(corpus)
    tokens = sentence.split()
    # Models that must be supported (you can add more):
    model_dicts = [
        {'baseline': [perWordTagCounts, allTagCounts]},
        {'hmm': [A, B]},
        {'blstm': [{}]},
        {'cblstm': [{}]},
    ]
    for model in model_dicts:
        print(tagger.tag_sentence(tokens, model))
import tagger

# NOTE(review): absolute Windows paths — consider making these configurable
# (CLI args or environment variables) so the script runs on other machines.
train_path = r"C:\src\MastersCourses\NLP\Assign_4\data\en-ud-train.upos.tsv"
dev_path = r"C:\src\MastersCourses\NLP\Assign_4\data\en-ud-dev.upos.tsv"

train_data = tagger.load_annotated_corpus(train_path)
dev_data = tagger.load_annotated_corpus(dev_path)
[allTagCounts, perWordTagCounts, transitionCounts, emissionCounts, A,
 B] = tagger.learn_params(train_data)

# Sanity-check the baseline tagger on the first dev sentence.
gold_sentence = dev_data[0]
pred_sentence = [w[0] for w in gold_sentence]
print(f"tested sentence is {gold_sentence} of length {len(pred_sentence)}")
tagged_sentence = tagger.baseline_tag_sentence(pred_sentence,
                                               perWordTagCounts,
                                               allTagCounts)
correct, correctOOV, OOV = tagger.count_correct(gold_sentence,
                                                tagged_sentence)
print(f"correct: {correct}, correctOOV: {correctOOV}, OOV: {OOV}")

# Accumulate baseline accuracy over the whole dev set.
score_nom, score_denom = 0, 0
for gold_sentence in dev_data:
    pred_sentence = [w[0] for w in gold_sentence]
    tagged_sentence = tagger.baseline_tag_sentence(pred_sentence,
                                                   perWordTagCounts,
                                                   allTagCounts)
    correct, correctOOV, OOV = tagger.count_correct(gold_sentence,
                                                    tagged_sentence)
    score_nom += correct
    score_denom += len(pred_sentence)
# Bug fix: the accumulated counts were never reported — the whole dev-set
# loop was dead computation. Guard against an empty dev set.
if score_denom:
    print(f"baseline accuracy: {score_nom / score_denom:.4f} "
          f"({score_nom}/{score_denom})")
def test_learn_params(sentences):
    """Run learn_params on *sentences* and hand back all six artifacts.

    Returns a tuple: (allTagCounts, perWordTagCounts, transitionCounts,
    emissionCounts, A, B).
    """
    (allTagCounts, perWordTagCounts, transitionCounts,
     emissionCounts, A, B) = tagger.learn_params(sentences)
    return (allTagCounts, perWordTagCounts, transitionCounts,
            emissionCounts, A, B)
def check_sampled_sentence(gold_sentence, model_dict):
    """Tag the words of *gold_sentence* using *model_dict* and print the
    (correct, correctOOV, OOV) counts against the gold tags."""
    words = [pair[0] for pair in gold_sentence]
    tagged = tag_sentence(words, model_dict)
    correct, correctOOV, OOV = count_correct(gold_sentence, tagged)
    print(f"correct: {correct}, correctOOV: {correctOOV}, OOV: {OOV}\n")


train_path = r"en-ud-train.upos.tsv"
dev_path = r"en-ud-dev.upos.tsv"
train_data = load_annotated_corpus(train_path)
dev_data = load_annotated_corpus(dev_path)
[allTagCounts, perWordTagCounts, transitionCounts, emissionCounts, A,
 B] = learn_params(train_data)

# draw random sentence
gold_sentence = dev_data[randrange(len(dev_data))]
print(f"tested random sentence is {gold_sentence} of length {len(gold_sentence)}\n")

# test baseline
calc_score(dev_data, {'baseline': [perWordTagCounts, allTagCounts]})
check_sampled_sentence(gold_sentence,
                       {'baseline': [perWordTagCounts, allTagCounts]})

# test hmm
calc_score(dev_data, {'hmm': [A, B]})
check_sampled_sentence(gold_sentence, {'hmm': [A, B]})