class Chunker:
    """Small wrapper that tags sentences with NLTK taggers and reports accuracy.

    The instance keeps a ``DefaultTagger('NN')`` in ``self._tagger``; a
    treebank-trained ``UnigramTagger`` is built on demand in ``tag_words``.
    """

    # Placeholder only: every instance installs its own tagger in __init__.
    _tagger = None

    def __init__(self, words, sents):
        """Create the default tagger and run one tagging pass.

        Args:
            words: flat list of tokens (forwarded to ``tag_words``).
            sents: list of tokenised sentences (forwarded to ``tag_words``).
        """
        self._tagger = DefaultTagger('NN')
        self.tag_words(words, sents)

    def tag_words(self, words, sents):
        """Tag the first sentence of ``sents`` with a treebank-trained unigram tagger.

        Prints the tagger's score against its own output for that sentence and
        returns the tagged sentence.  Because the "gold" data is the tagger's
        own output, the printed score is a smoke test rather than a real
        accuracy measurement.

        Args:
            words: unused; kept for interface compatibility.
            sents: list of tokenised sentences; only ``sents[0]`` is tagged.

        Returns:
            The tagged first sentence as a list of ``(word, tag)`` pairs, or an
            empty list when ``sents`` is empty.
        """
        if not sents:
            # Nothing to tag; avoid both the IndexError and the training cost.
            return []
        train_sents = treebank.tagged_sents()
        tagger = UnigramTagger(train_sents)
        tagged_sentence = tagger.tag(sents[0])
        # evaluate() expects a list of tagged *sentences*, so the single
        # tagged sentence has to be wrapped in a list.
        print(tagger.evaluate([tagged_sentence]))
        return tagged_sentence

    def get_accuracy(self, sentences=None):
        """Print and return the accuracy of the instance's tagger.

        Args:
            sentences: gold-standard tagged sentences.  When omitted or empty,
                ``treebank.tagged_sents()[6000:]`` is used.

        Returns:
            The value produced by ``self._tagger.evaluate``.
        """
        if sentences:
            test_sents = sentences
        else:
            # NOTE(review): the NLTK sample treebank may contain fewer than
            # 6000 sentences, which would make this slice empty - confirm.
            test_sents = treebank.tagged_sents()[6000:]
        accuracy = self._tagger.evaluate(test_sents)
        print(accuracy)
        return accuracy
def test_default_tagger(self):
    """Print DefaultTagger('N') accuracy on the 90% and 10% parts of test.tsv."""
    sentences = make_sentence_list(path.join(self.test_dir, 'test.tsv'))
    baseline = DefaultTagger('N')
    cut = int(len(sentences) * .90)
    # First the leading 90% of the sentences, then the remaining 10%.
    for portion in (sentences[:cut], sentences[cut:]):
        print(baseline.evaluate(portion))
def find_accuracy(train_set, test_set):
    """Score a most-frequent-tag baseline.

    The most common tag in ``train_set`` is used to build a ``DefaultTagger``,
    which is then evaluated on ``test_set``.

    Returns:
        The value returned by ``DefaultTagger.evaluate(test_set)``.
    """
    # Open question from the original author: should all of this use the test set?
    tokens = [pair for sentence in train_set for pair in sentence]
    tag_counts = FreqDist([tag for (_word, tag) in tokens])
    baseline = DefaultTagger(tag_counts.max())
    return baseline.evaluate(test_set)
def evaluate_nltk_pos_taggers(gold_standard_filename, num_folds=10, loo=False):
    r"""Evaluate the NLTK backoff taggers on the corpus data with cross-validation.

    Every fold trains the unigram/bigram/trigram backoff chain and a Brill
    tagger on the training part and scores all taggers on the held-out part.

    :param gold_standard_filename: tsv file of format: word \t POS \n
    :param num_folds: int: number of folds for cross-validation
    :param loo: bool: whether to use Leave One Out cross-validation
        (overrides ``num_folds`` with the number of sentences)
    :return: dict mapping tagger name to its accuracy averaged over the folds
    :raises ValueError: if the fold count is below 1 or exceeds the number of
        sentences (either would produce empty test folds)
    """
    tagged_sents = make_sentence_list(gold_standard_filename)
    backoff = DefaultTagger('N')
    tagger_classes = [UnigramTagger, BigramTagger, TrigramTagger]
    scores = {
        'DefaultTagger': [],
        'UnigramTagger': [],
        'BigramTagger': [],
        'TrigramTagger': [],
        'BrillTagger': [],
    }

    # k-fold cross-validation
    if loo:
        # Leave One Out: one fold per sentence, so every sentence is tested once.
        num_folds = len(tagged_sents)
    if num_folds < 1 or num_folds > len(tagged_sents):
        raise ValueError(
            "num_folds must be between 1 and the number of sentences "
            "({}), got {}".format(len(tagged_sents), num_folds)
        )
    # Sentences left over after the last full fold stay in the training data.
    subset_size = len(tagged_sents) // num_folds

    for i in range(num_folds):
        # training and testing data for this round
        start = i * subset_size
        stop = start + subset_size
        X_test = tagged_sents[start:stop]
        X_train = tagged_sents[:start] + tagged_sents[stop:]

        # compute score for taggers; all of them are scored on the held-out
        # fold so that the averages are comparable
        default_score = backoff.evaluate(X_test)
        trigram, tagger_scores = backoff_tagger(
            X_train, X_test, tagger_classes, backoff=backoff)
        uni_score, bi_score, tri_score = tagger_scores
        brill_tagger = train_brill_tagger(trigram, X_train)
        brill_score = brill_tagger.evaluate(X_test)
        brill_tagger.print_template_statistics(printunused=False)

        # save scores
        scores['DefaultTagger'].append(default_score)
        scores['UnigramTagger'].append(uni_score)
        scores['BigramTagger'].append(bi_score)
        scores['TrigramTagger'].append(tri_score)
        scores['BrillTagger'].append(brill_score)

    # average scores across folds (only existing keys are reassigned, which
    # is safe while iterating over the dict)
    for k, v in scores.items():
        if v:
            scores[k] = sum(v) / len(v)
            print(k, ": {:2.2%}".format(scores[k]))
    return scores
def find_combined_taggers_accuracy(train_set, test_set):
    """Print the accuracy of several tagger combinations on ``test_set``.

    Reported in order: most-frequent-tag default tagger, regex tagger,
    unigram tagger backed by the default tagger, and a bigram tagger with no
    backoff, with the regex tagger as backoff, and with the unigram tagger as
    backoff.  Nothing is returned.
    """
    # Baseline: always predict the most frequent tag of the training data.
    tag_counts = FreqDist(
        [tag for sentence in train_set for (_word, tag) in sentence])
    baseline = DefaultTagger(tag_counts.max())
    print("Default Tagger accuracy: ", baseline.evaluate(test_set))

    # Suffix / shape based regex tagger.
    suffix_rules = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN'),                     # nouns (default)
    ]
    by_regex = RegexpTagger(suffix_rules)
    print("Regex Tagger Accuracy: ", by_regex.evaluate(test_set))

    # Unigram tagger that falls back to the baseline.
    by_unigram = UnigramTagger(train_set, backoff=baseline)
    print("Unigram Tagger accuracy (Backoff = Default Tagger): ",
          by_unigram.evaluate(test_set))

    # Bigram tagger: plain, then with each of the two backoffs.
    plain_bigram = BigramTagger(train_set)
    bigram_over_unigram = BigramTagger(train_set, backoff=by_unigram)
    bigram_over_regex = BigramTagger(train_set, backoff=by_regex)

    plain_score = plain_bigram.evaluate(test_set)
    regex_backoff_score = bigram_over_regex.evaluate(test_set)
    unigram_backoff_score = bigram_over_unigram.evaluate(test_set)

    print("Bigram Tagger Accuracy: ", plain_score)
    print("Bigram Tagger Accuracy (Backoff = Regex Tagger): ",
          regex_backoff_score)
    print("Bigram Tagger Accuracy (Backoff = Unigram Tagger): ",
          unigram_backoff_score)
######### DEFAULT TAGGER ###############
# Assigning the default tag: DefaultTagger gives every token the same tag.
from nltk.corpus import treebank
from nltk.tag import DefaultTagger, untag

tagger = DefaultTagger('NN')
tokens = [['Hello', 'World'], ['How', 'are', 'you', '?']]

# tag() works on ONE sentence (a flat list of tokens); passing the nested
# list would treat each inner list as a single "token".
print(tagger.tag(tokens[0]))
# tag_sents() is the call that accepts a list of sentences.
print(tagger.tag_sents(tokens))

# Untagging: strip the tags from a tagged sentence again.
tagged = tagger.tag(tokens[0])
print(untag(tagged))

# Evaluating the tagger accuracy against the treebank sentences from index
# 3000 onwards.
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
# building your own tagger # preparing the data from nltk.corpus import treebank data = treebank.tagged_sents() train_data = data[:3500] test_data = data[3500:] print train_data[0] # default tagger from nltk.tag import DefaultTagger dt = DefaultTagger('NN') print dt.evaluate(test_data) print dt.tag(tokens) # regex tagger from nltk.tag import RegexpTagger # define regex tag patterns patterns = [ (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # simple past (r'.*es$', 'VBZ'), # 3rd singular present (r'.*ould$', 'MD'), # modals (r'.*\'s$', 'NN$'), # possessive nouns (r'.*s$', 'NNS'), # plural nouns (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
import nltk
from nltk.corpus import treebank
from nltk.tag import DefaultTagger
from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger

# Treebank sentences: the first 3500 train the classifier, the rest are held out.
data = treebank.tagged_sents()
train_data, test_data = data[:3500], data[3500:]

# Baseline: tag every token as a noun.
dt = DefaultTagger('NN')
print(dt.evaluate(test_data))

# Classifier-based tagger trained with Naive Bayes.
nt = ClassifierBasedPOSTagger(
    train=train_data,
    classifier_builder=NaiveBayesClassifier.train,
)
print(nt.evaluate(test_data))
# Use the test data to evaluate taggers and see how they perform on the sample sentence. # Build a custom tagger by extending class TaggerI, from the nltk.tag package and implementing the tag function. # Use the evaluate function to assess the performance of the tagger. # --- Tagger --- # INPUT: Sentence tokens # OUTPUT: List of pairs where each item corresponds to a token of the input with its POS tag # 1. BACKOFF TAGGER (a tagger that is consulted by another when not able to tag a token): # Assigns the same tag to all tokens (tag specified as argument, NN in this case) dt = DefaultTagger("NN") # Measure accuracy on test data (i.e. Gold Standard). Test data should be tagged to compare these # tags against new ones computed by the evaluated tagger (dt in this case) print(dt.evaluate(gold=test_data)) print(dt.tag(tokens=tokens)) # 2. REGEX TAGGER: # Assigns tags to tokens by comparing their word strings to a series of regular expressions # Define regex patterns used that determine the tags of tokens. Note that when tagging a token, expressions # are evaluated bottom up and thus, the last one defines the default tag patterns = [ (r".*ing$", "VBG"), # Gerunds (r".*ed$", "VBD"), # Simple past (r".*es$", "VBZ"), # 3rd singular present (r".*ould$", "MD"), # Modals (r".*'s$", "NN$"), # Possesive pronouns (r".*s$", "NNS"), # Plural nouns (r"^-?[0-9]+(.[0-9]+)?$", "CD"), # Cardinal numbers
from nltk.corpus import treebank, wordnet
from nltk.probability import FreqDist
from nltk.tag.sequential import ClassifierBasedPOSTagger
from nltk.tag import brill, brill_trainer, tnt, SequentialBackoffTagger
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger, AffixTagger
from samples import sample

# Test and training variables: treebank sentences before index 3000 are used for
# training, the ones from 3000 onwards for testing.
test_sents = treebank.tagged_sents()[3000:]
train_sents = treebank.tagged_sents()[:3000]
# NOTE(review): `word_tokenize` is not imported in the visible import lines -
# presumably imported elsewhere; confirm.
tk_sample = word_tokenize(sample)

# Default tagger - tags every token as a noun ('NN')
df_tagger = DefaultTagger('NN')
tagged = df_tagger.tag(tk_sample)
accuracy = df_tagger.evaluate(test_sents)
print(f"Tagged text: {tagged}; acc = {accuracy}\n")

# Unigram tagger trained on the training sentences
ug_tagger = UnigramTagger(train_sents)
tagged = ug_tagger.tag(tk_sample)
accuracy = ug_tagger.evaluate(test_sents)
print(f"Tagged text: {tagged}; acc = {accuracy}\n")

# Backoff tagger: rely on another tagger (the backoff) when the current one does
# not know how to tag a token
ugb_tagger = UnigramTagger(train_sents, backoff=df_tagger)
accuracy = ugb_tagger.evaluate(test_sents)
print(f"Accuracy of backoff: {accuracy}\n")

# Saving pickle and testing it.
# NOTE(review): the body of this `with` block is outside the visible code.
with open('pickles/pos-taggers/unigram_backoff_tagger.pickle', 'wb') as file:
print(tagger.tag(['we', 'are','going']))# WRONG #SequentialBackoffTagger implements the tag() method, which calls the #choose_tag() method of the subclass for each index in the tokens list while accumulating #a history of the previously tagged tokens """DefaultTagger is a subclass of SequentialBackoffTagger. Every subclass of SequentialBackoffTagger must implement the choose_tag() method, which takes three arguments: * The list of tokens * The index of the current token whose tag we want to choose * The history, which is a list of the previous tags SequentialBackoffTagger implements the tag() method, which calls the choose_tag() method of the subclass for each index in the tokens list while accumulating a history of the previously tagged tokens. This history is the reason for the Sequential in SequentialBackoffTagger. We'll get to the backoff portion of the name in the Combining taggers with backoff tagging recipe. """ #accuracy text """So, by just choosing NN for every tag, we can achieve 14 % accuracy testing on one-fourth of the treebank corpus.""" from nltk.corpus import treebank test_sents = treebank.tagged_sents()[3000:] print(tagger.evaluate(test_sents))
# Every tagger has a tag() method.
# DefaultTagger is a subclass of SequentialBackoffTagger, which has a
# choose_tag() method.
from nltk.tag import DefaultTagger
from nltk.corpus import treebank

tagger = DefaultTagger('NN')
print(tagger.tag(['Hello', 'World']))

# Though it's too simple, we can still evaluate it.
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))

# Tagging several sentences at once.
print(
    tagger.tag_sents(
        [
            ['Hello', 'World', '.'],
            ['How', 'are', 'you', '?'],
        ]
    )
)

# Untagging: recover the plain tokens from a tagged sentence.
from nltk.tag import untag
print(untag([('Hello', 'NN'), ('World', 'NN')]))
""" """ 1. create a tagger utilising: n-gram, unigram, regexp and default taggers """ tag2_eval = dict() # train with backoff tic() tag2_input = create_regexp_list('Open_Word_Patterns.xlsx', RESOURCES_DIR) tag2_tagger = DefaultTagger('NO') tag2_tagger = RegexpTagger(tag2_input, backoff=tag2_tagger) tag2_tagger = UnigramTagger(train_sents, cutoff=3, backoff=tag2_tagger) tag2_tagger = BigramTagger(train_sents, backoff=tag2_tagger) tag2_tagger = TrigramTagger(train_sents, backoff=tag2_tagger) tag2_eval['train_time'] = toc() # test tic() tag2_eval['test_accuracy'] = tag2_tagger.evaluate(val_sents) tag2_eval['test_time'] = toc() # display results display_training_metrics(tag2_eval) """ 2. create a tagger utilising: n-gram, unigram, affix and default taggers """ tag1_eval = dict() # train with backoff tic() tag1_tagger = DefaultTagger('NO') tag1_tagger = AffixTagger(train_sents, affix_length=-1, backoff=tag1_tagger) tag1_tagger = AffixTagger(train_sents, affix_length=-2, backoff=tag1_tagger) tag1_tagger = AffixTagger(train_sents, affix_length=-3, backoff=tag1_tagger) tag1_tagger = AffixTagger(train_sents, affix_length=-4, backoff=tag1_tagger) tag1_tagger = AffixTagger(train_sents, affix_length=-5, backoff=tag1_tagger) tag1_tagger = UnigramTagger(train_sents, cutoff=3, backoff=tag1_tagger)
import nltk
from nltk.corpus import treebank  # treebank corpus
from nltk.tag import DefaultTagger

# Baseline tagger that assigns 'NN' to every token.
tagger = DefaultTagger('NN')

# Gold-standard tagged sentences of the whole treebank sample.
treebank_tagged_sents = treebank.tagged_sents()

# evaluate() strips the gold tags, retags the words and compares the result,
# so no separate tag() call is needed.  The former
# `tagger.tag(treebank_tagged_sents)` call passed already-tagged sentences to
# tag(), which expects a flat list of tokens, and discarded the result.
print('Accuracy %4.1f%%' % (100.0 * tagger.evaluate(treebank_tagged_sents)))
pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag']).T #%% from nltk.corpus import treebank data = treebank.tagged_sents() train_data = data[:3500] test_data = data[3500:] print(train_data[0]) #%% # default tagger from nltk.tag import DefaultTagger dt = DefaultTagger('NN') # accuracy on test data dt.evaluate(test_data) # tagging our sample headline dt.tag(nltk.word_tokenize(sentence)) #%% # regex tagger from nltk.tag import RegexpTagger # define refex tag patterns patterns = [ (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # simple past (r'.*es$', 'VBZ'), # 3rd singular present (r'.*ould$', 'MD'), # modals (r'.*\'s$', 'NNS'), # possessive nouns (r'.*s$', 'NNS'), # plural nouns
def indivDefault(bambara):
    """Build a DefaultTagger('n'), print its accuracy and return the tagger.

    Accuracy is measured on ``bambara.test_sents``.
    """
    baseline = DefaultTagger('n')
    accuracy = baseline.evaluate(bambara.test_sents)
    print(accuracy)
    return baseline