def getDefaultTaggerAccuracy(testingSet, defaultTag="NN"):
    """Return the accuracy, as a percentage, of a constant-tag baseline.

    A default tagger assigns the same tag to every token, so its accuracy is
    simply the share of gold tags equal to that tag. The gold tags are
    counted directly; no tagger has to be built and no sentence has to be
    tagged to obtain the figure.

    Args:
        testingSet: iterable of sentences, each an iterable of tagged words
            whose gold POS tag sits at index 1 (e.g. ``(word, tag)`` pairs).
            It is traversed only once, so a one-shot iterator also works.
        defaultTag: the tag the baseline assigns to every token. Defaults to
            ``"NN"``, an arbitrary choice.

    Returns:
        ``(matches / total) * 100`` where ``matches`` is the number of gold
        tags equal to ``defaultTag``; ``0.0`` when there are no tokens.
    """
    totalTags = 0
    matches = 0
    for sentence in testingSet:
        for taggedWord in sentence:
            totalTags += 1
            # The baseline always predicts defaultTag, so a prediction is
            # correct exactly when the gold tag equals defaultTag.
            if taggedWord[1] == defaultTag:
                matches += 1

    # Nothing to score: report 0% instead of raising ZeroDivisionError.
    if totalTags == 0:
        return 0.0
    return (matches / totalTags) * 100
def tag_sents(self, sents):
    '''
    Tag a list of sentences.

    NB: before calling this method a model file must have been specified,
    either by
        - training a new model with the ``train`` function, or
        - selecting a pre-trained model with the ``set_model_file`` function.

    :param sents: list of sentences to tag.
    :type sents: list(list(str))
    :return: list of tagged sentences.
    :rtype: list(list(tuple(str,str)))
    :raises Exception: if ``self._model_file`` is the empty string, or if the
        number of predicted labels differs from the number of tokens.
    '''
    if self._model_file == '':
        raise Exception(' No model file is found !! Please use train or set_model_file function')

    # Added by Kathrin: attach a placeholder 'None' tag to every token before
    # feature extraction (presumably the feature function expects
    # (word, tag) pairs -- confirm against _feature_func).
    placeholder_tagger = DefaultTagger('None')
    paired_sents = placeholder_tagger.tag_sents(sents)

    tagged_sents = []
    for pairs in paired_sents:
        features = [self._feature_func(pairs, idx) for idx in range(len(pairs))]
        labels = self._tagger.tag(features)

        if len(labels) != len(pairs):
            raise Exception(' Predicted Length Not Matched, Expect Errors !')

        # Added by Kathrin: strip the placeholder tags so that the output
        # pairs each original token with its predicted label.
        words = [pair[0] for pair in pairs]
        tagged_sents.append(list(zip(words, labels)))

    return tagged_sents
def getDefaultTaggerAccuracy(testingSet, defaultTag='NN'):
    """Return the accuracy, as a percentage, of a constant-tag baseline.

    A default tagger assigns the same tag to every token, so its accuracy is
    simply the share of gold tags equal to that tag. The gold tags are
    counted directly; no tagger has to be built and no sentence has to be
    tagged to obtain the figure.

    Args:
        testingSet: iterable of sentences, each an iterable of tagged words
            whose gold POS tag sits at index 1 (e.g. ``(word, tag)`` pairs).
            It is traversed only once, so a one-shot iterator also works.
        defaultTag: the tag the baseline assigns to every token. Defaults to
            ``'NN'``, an arbitrary choice.

    Returns:
        ``(matches / total) * 100`` where ``matches`` is the number of gold
        tags equal to ``defaultTag``; ``0.0`` when there are no tokens.
    """
    totalTags = 0
    matches = 0
    for sentence in testingSet:
        for taggedWord in sentence:
            totalTags += 1
            # The baseline always predicts defaultTag, so a prediction is
            # correct exactly when the gold tag equals defaultTag.
            if taggedWord[1] == defaultTag:
                matches += 1

    # Nothing to score: report 0% instead of raising ZeroDivisionError.
    if totalTags == 0:
        return 0.0
    return (matches / totalTags) * 100
######### DEFAULT TAGGER ###############
# Demonstrates DefaultTagger: tagging, untagging and evaluation.
# print() is called with a single argument throughout, so the script runs
# on Python 3 and prints the same text on Python 2.
from nltk.corpus import treebank
from nltk.tag import DefaultTagger, untag

# Assigning the default tag
tagger = DefaultTagger('NN')
tokens = [['Hello', 'World'], ['How', 'are', 'you', '?']]

# tag() takes ONE sentence (a flat list of tokens). Passing the nested list
# would pair each whole sentence with 'NN' as if it were a single token.
print(tagger.tag(tokens[0]))
# tag_sents() is the call that accepts a list of sentences.
print(tagger.tag_sents(tokens))

# Untagging: drop the tags again to get the plain tokens back.
tagged = tagger.tag(tokens[0])
print(untag(tagged))

# Evaluating the tagger accuracy on the treebank sentences from index 3000 on.
# NOTE(review): newer NLTK releases appear to deprecate evaluate() in favour
# of accuracy() -- confirm against the installed NLTK version.
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
# Every tagger exposes a tag() method.
# DefaultTagger is a subclass of SequentialBackoffTagger, which has a
# choose_tag() method.
from nltk.corpus import treebank
from nltk.tag import DefaultTagger, untag

tagger = DefaultTagger('NN')

# Tag a single sentence.
single_sentence = ['Hello', 'World']
print(tagger.tag(single_sentence))

# Simple as the tagger is, it can still be evaluated.
test_sents = treebank.tagged_sents()[3000:]
score = tagger.evaluate(test_sents)
print(score)

# Tag several sentences in one call.
sentence_batch = [['Hello', 'World', '.'], ['How', 'are', 'you', '?']]
print(tagger.tag_sents(sentence_batch))

# Untagging: recover the bare tokens from (token, tag) pairs.
tagged_pairs = [('Hello', 'NN'), ('World', 'NN')]
print(untag(tagged_pairs))
# POS Taggers
# module load python/3.5
from nltk.corpus import treebank
from nltk.tag import BigramTagger, DefaultTagger, TrigramTagger, UnigramTagger

# Treebank corpus: the first 3000 tagged sentences are used for training,
# everything after them for testing.
tagged_corpus = treebank.tagged_sents()
train_sents = tagged_corpus[:3000]
test_sents = tagged_corpus[3000:]

# First tagger: DefaultTagger constructed with the tag 'NN'.
tagger = DefaultTagger('NN')
demo_sents = [['Hello', '.'], ['My', 'name', 'is', 'Steve']]
print(tagger.tag_sents(demo_sents))
print(tagger.evaluate(test_sents))

# Unigrams: one tagger with default settings, and one with cutoff=3 that is
# the one evaluated.
unigram_tagger = UnigramTagger(train_sents)
tagger = UnigramTagger(train_sents, cutoff=3)
print(tagger.evaluate(test_sents))

# Bigrams: same pattern as above.
bigram_tagger = BigramTagger(train_sents)
tagger = BigramTagger(train_sents, cutoff=3)
print(tagger.evaluate(test_sents))

# Trigrams: same pattern as above.
trigram_tagger = TrigramTagger(train_sents)
tagger = TrigramTagger(train_sents, cutoff=3)
print(tagger.evaluate(test_sents))