def traintest_bigram_trigram_tagger(self):
    from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
    from nltk.corpus import treebank

    test_sents = treebank.tagged_sents()[3000:]
    train_sents = treebank.tagged_sents()[:3000]

    print('training BigramTagger')
    bitagger = BigramTagger(train_sents)
    print('evaluating bitagger')
    print(bitagger.evaluate(test_sents))

    print('training TrigramTagger')
    tritagger = TrigramTagger(train_sents)
    print('evaluating tritagger')
    print(tritagger.evaluate(test_sents))

    print('tagging')
# N-gram taggers
>>> from nltk.tag import UnigramTagger
>>> from nltk.tag import DefaultTagger
>>> from nltk.tag import BigramTagger
>>> from nltk.tag import TrigramTagger

# we are dividing the data into train and test sets to evaluate our taggers.
>>> train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
>>> test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]

>>> unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
>>> print(unigram_tagger.evaluate(test_data))

>>> bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
>>> print(bigram_tagger.evaluate(test_data))

>>> trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
>>> print(trigram_tagger.evaluate(test_data))

# Regex tagger
>>> from nltk.tag.sequential import RegexpTagger
>>> regexp_tagger = RegexpTagger(
...     [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
...      (r'(The|the|A|a|An|an)$', 'AT'),   # articles
...      (r'.*able$', 'JJ'),                # adjectives
...      (r'.*ness$', 'NN'),                # nouns formed from adjectives
...      (r'.*ly$', 'RB'),                  # adverbs
...      (r'.*s$', 'NNS'),                  # plural nouns
...      (r'.*ing$', 'VBG'),                # gerunds
...      (r'.*ed$', 'VBD'),                 # past tense verbs
...      (r'.*', 'NN')                      # nouns (default)
...     ])
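# --- A minimal sketch of the setup the snippet above assumes. brown_tagged_sents
# --- and default_tagger are not defined in it; presumably something like the
# --- following precedes it (the corpus category choice is an assumption):
from nltk.corpus import brown
from nltk.tag import DefaultTagger

brown_tagged_sents = brown.tagged_sents(categories='news')
default_tagger = DefaultTagger('NN')  # last-resort tagger: label everything 'NN'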
print('Bigram tagger accuracy:')
from nltk.tag import BigramTagger
bigramTagger = BigramTagger(training)
print(bigramTagger.evaluate(testing))
#-----------------------------------------------------
print('Trigram tagger accuracy:')
from nltk.tag import TrigramTagger
trigramTagger = TrigramTagger(training)
print(trigramTagger.evaluate(testing))
#-----------------------------------------------------
# Brill Tagger
from nltk.tag import brill, brill_trainer
# make sure you've got some train_sents!
#brill_tagger = train_brill_tagger(unigramTagger, training)
print('Brill tagger accuracy:')
#print(brill_tagger.evaluate(testing))
#------------------------------------------------------
# Backoff tagger
#bigram_tagger = BigramTagger(training, backoff=unigramTagger)
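# --- The commented-out call above refers to a train_brill_tagger() helper that
# --- is not shown. A plausible sketch using NLTK's stock fntbl37 template set
# --- (the helper name, parameters, and max_rules value are assumptions):
def train_brill_tagger(initial_tagger, train_sents, max_rules=100):
    trainer = brill_trainer.BrillTaggerTrainer(initial_tagger, brill.fntbl37(), trace=0)
    return trainer.train(train_sents, max_rules=max_rules)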
print("------------Brill Tagger------------") print(brillTagger.tag(sent)) print("------------Accuracy: Unigram Tagger Trained------------") unigramTagger = UnigramTagger(brown_train_sents) print(unigramTagger.evaluate(brown_test_sents)) print("------------Accuracy: Unigram Tagger Trained with cutoff = 3------------") unigramTagger = UnigramTagger(brown_train_sents, cutoff = 3) print(unigramTagger.evaluate(brown_test_sents)) print("------------Accuracy: Bigram Tagger Trained------------") print(bigramTagger.evaluate(brown_test_sents)) print("------------Accuracy: Trigram Tagger Trained------------") print(trigramTagger.evaluate(brown_test_sents)) print("------------Accuracy: Unigram Tagger with backoff enabled. Backoff Chain: UnigramTagger -> DefaultTagger------------") unigramTagger = UnigramTagger(brown_train_sents, backoff=defaultTagger) print(unigramTagger.evaluate(brown_test_sents)) print("------------Accuracy: Tagger with backoff enabled. Backoff Chain: TrigramTagger -> BigramTagger -> UnigramTagger -> DefaultTagger------------") print(initialTagger.evaluate(brown_test_sents)) print("------------Accuracy: Brill Tagger------------") print(brillTagger.evaluate(brown_test_sents)) print(brillTagger.rules()) print("------------Accuracy: TnT Tagger------------") print(tnt_tagger.evaluate(brown_test_sents))
for page in list(root):
    l = []
    text = page.find('text').text
    language = page.find('language').text
    pos = page.find('pos_tags').text
    splitText = text.split(" ")[1:-1]
    posText = pos.split(" ")[1:-1]
    for i in range(len(splitText)):
        l.append((splitText[i], posText[i]))
    data.append(l)
    count = count + 1

shuffle(data)

# Divide data into train (90%) and test (10%) sets
ninetyPercent = count * 0.9
training_set = data[0:int(ninetyPercent)]
test_set = data[int(ninetyPercent):]

# Train a backoff chain: Trigram -> Bigram -> Unigram -> Default
train_data = training_set
tag1 = DefaultTagger('NN')
tag2 = UnigramTagger(train_data, backoff=tag1)
tag3 = BigramTagger(train_data, backoff=tag2)
tag4 = TrigramTagger(train_data, backoff=tag3)

# Accuracy
# print(tag4.tag('open a start up'.split()))
# print(tag4.tag('OUT nahi KARDO ISSE BAHUT HOGAYA aaj Salman'.split()))
gold_sentences = test_set
print(tag4.evaluate(gold_sentences))
from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

# Get tagged sentences from the Brown corpus
sentences = brown.tagged_sents(categories='news')

# Use 4000 sentences for training, the remaining 623 for testing
train = sentences[:4000]
test = sentences[4000:]

# Build a backoff chain of taggers
unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)

# Check accuracy
print(trigram.evaluate(test))

# TF-IDF
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the text data
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

# Build the TF-IDF feature matrix (a sparse matrix)
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)

# Inspect the TF-IDF feature matrix and the vocabulary
print(feature_matrix.toarray())
print(tfidf.vocabulary_)
import nltk
from nltk.tag import BigramTagger, TrigramTagger
from nltk.corpus import treebank

# Keep train and test disjoint (treebank has 3914 tagged sentences;
# the original slices [:7000] and [2000:] overlapped)
training = treebank.tagged_sents()[:3000]
testing = treebank.tagged_sents()[3000:]

bigramtag = BigramTagger(training)
print(bigramtag.evaluate(testing))

trigramtag = TrigramTagger(training)
print(trigramtag.evaluate(testing))
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# testing performance of unigram tagger
print(ut.evaluate(test_data))
print(ut.tag(tokens))

# testing performance of bigram tagger
print(bt.evaluate(test_data))
print(bt.tag(tokens))

# testing performance of trigram tagger
print(tt.evaluate(test_data))
print(tt.tag(tokens))

def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff

ct = combined_tagger(train_data=train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)
print(ct.evaluate(test_data))
print(ct.tag(tokens))
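# --- rt is not defined in this snippet; it is presumably a RegexpTagger serving
# --- as the final fallback of the chain. A minimal sketch (patterns assumed):
from nltk.tag import RegexpTagger

rt = RegexpTagger([
    (r'.*ing$', 'VBG'),  # gerunds
    (r'.*ed$', 'VBD'),   # past-tense verbs
    (r'.*s$', 'NNS'),    # plural nouns
    (r'.*', 'NN'),       # everything else defaults to noun
])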
with open('pickles/pos-taggers/unigram_backoff_tagger.pickle', 'wb') as file:
    pickle.dump(ugb_tagger, file)

with open('pickles/pos-taggers/unigram_backoff_tagger.pickle', 'rb') as file:
    pk_tagger = pickle.load(file)

accuracy = pk_tagger.evaluate(test_sents)
print(f"Accuracy of pickled backoff: {accuracy}\n")

# Testing bigram and trigram taggers
bg_tagger = BigramTagger(train_sents)
accuracy = bg_tagger.evaluate(test_sents)
print(f"Accuracy of bigram: {accuracy}\n")

tg_tagger = TrigramTagger(train_sents)
accuracy = tg_tagger.evaluate(test_sents)
print(f"Accuracy of trigram: {accuracy}\n")

def make_backoffs(training, tagger_classes, backoff=None):
    """Train a chain of backoff taggers, each using the previous one as backoff."""
    for cls in tagger_classes:
        backoff = cls(training, backoff=backoff)
    return backoff

# Testing the function with all 4 taggers
# (chain: DefaultTagger -> UnigramTagger -> BigramTagger -> TrigramTagger)
bc_tagger = make_backoffs(train_sents,
                          [UnigramTagger, BigramTagger, TrigramTagger],
                          backoff=DefaultTagger('NN'))
print(f"Accuracy of backoff chain: {bc_tagger.evaluate(test_sents)}\n")
# we are dividing the data into a test and train to evaluate our taggers.
train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]

# Unigram picks the most probable tag for each word
# https://www.nltk.org/api/nltk.tag.html?highlight=postagger#nltk.tag.sequential.UnigramTagger
unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
print("Unigram Tagger: {}".format(unigram_tagger.evaluate(test_data)))

# Bigram uses the current word and the previous one to classify
# https://www.nltk.org/api/nltk.tag.html?highlight=postagger#nltk.tag.sequential.BigramTagger
bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print("Bigram Tagger: {}".format(bigram_tagger.evaluate(test_data)))

# Trigram uses the current word and the previous two
# https://www.nltk.org/api/nltk.tag.html?highlight=postagger#nltk.tag.sequential.TrigramTagger
trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
print("Trigram Tagger: {}".format(trigram_tagger.evaluate(test_data)))

'''
Here we built three N-gram taggers from a training portion of the Brown
corpus, which is already tagged. They were also chained together so that
when one tagger does not know what to do, it backs off to its (N-1)-gram
tagger, all the way down to the default tagger that labels everything NN.

#######################
###  Regexp Tagger  ###
#######################

Another option for building our own tagger is to turn to our beloved
regular expressions, with a RegexpTagger.
'''
import nltk
from nltk.tag import UnigramTagger
from nltk.tag import TrigramTagger
from nltk.tag import BigramTagger
from nltk.tag import DefaultTagger

# we are dividing the data into a test and train to evaluate our taggers.
train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]

unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
# unigram_tagger = UnigramTagger(train_data, backoff=regexp_tagger)
print(unigram_tagger.evaluate(test_data))
# 0.8361407355726104

bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print(bigram_tagger.evaluate(test_data))
# 0.8452108043456593

trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
print(trigram_tagger.evaluate(test_data))
# 0.843317053722715

# Named entity recognition
# NER tagger
from nltk import ne_chunk
from nltk import word_tokenize

sent = "Mark is studying at Stanford University in California"
print(ne_chunk(nltk.pos_tag(word_tokenize(sent)), binary=False))
print(ne_chunk(nltk.pos_tag(word_tokenize(sent)), binary=True))
# Train your own tagger
# import nltk
# nltk.download('brown')
import numpy as np
from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

sentences = brown.tagged_sents(categories='news')
print(sentences)

train = sentences[:4000]
test = sentences[4000:]

unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)
print(trigram.evaluate(test))

# Bag of words
from sklearn.feature_extraction.text import CountVectorizer

text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)
print(bag_of_words.toarray())

count_2gram = CountVectorizer(ngram_range=(1, 2),
                              stop_words="english",
                              vocabulary=['brazil', 'love'])
bag = count_2gram.fit_transform(text_data)
print(bag.toarray())
from nltk.tag import RegexpTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.tag import brill
from nltk.tbl import Template

nn_cd_tagger = RegexpTagger([(r'^-?[0-9]+(\.[0-9]+)?$', 'PUNC'),
                             (r'.*', 'NOUN_NOM')])

# Unigram tagger
unigram_tagger = UnigramTagger(training_data, backoff=nn_cd_tagger)
print("Unigram accuracy: ")
print(unigram_tagger.evaluate(evaluation_data))

# Bigram tagger
bigram_tagger = BigramTagger(training_data, backoff=unigram_tagger)
print("Bigram accuracy: ")
print(bigram_tagger.evaluate(evaluation_data))

# Trigram tagger
trigram_tagger = TrigramTagger(training_data, backoff=bigram_tagger)
print("Trigram accuracy: ")
print(trigram_tagger.evaluate(evaluation_data))

# Brill tagger templates
templates = [
    Template(brill.Pos([1, 1])),
    Template(brill.Pos([2, 2])),
    Template(brill.Pos([1, 2])),
    Template(brill.Pos([1, 3])),
    Template(brill.Word([1, 1])),
    Template(brill.Word([2, 2])),
    Template(brill.Word([1, 2])),
    Template(brill.Word([1, 3])),
    Template(brill.Pos([-1, -1]), brill.Pos([1, 1])),
    Template(brill.Word([-1, -1]), brill.Word([1, 1])),
]
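# --- How the templates above might be used: wrap the trigram backoff chain in
# --- a Brill trainer (the max_rules and trace values are arbitrary choices):
from nltk.tag.brill_trainer import BrillTaggerTrainer

trainer = BrillTaggerTrainer(trigram_tagger, templates, trace=3)
brill_tagger = trainer.train(training_data, max_rules=100)
print("Brill accuracy: ")
print(brill_tagger.evaluate(evaluation_data))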
def indivTrigram(bambara, backoff):
    trigram = TrigramTagger(bambara.train_sents, backoff=backoff)
    print("Trigram accuracy: ", trigram.evaluate(bambara.test_sents))
    return trigram
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.corpus import treebank
from tag_util import backoff_tagger

train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]

bitagger = BigramTagger(train_sents)
print(bitagger.evaluate(test_sents))

tritagger = TrigramTagger(train_sents)
print(tritagger.evaluate(test_sents))

default_tagger = DefaultTagger('NN')
combined_tagger = backoff_tagger(train_sents,
                                 [UnigramTagger, BigramTagger, TrigramTagger],
                                 backoff=default_tagger)
print(combined_tagger.evaluate(test_sents))

# # train
# default_tagger = DefaultTagger('NN')
#
# train_sents = treebank.tagged_sents()[:3000]
# tagger = UnigramTagger(train_sents, backoff=default_tagger)
#
# # test
# test_sents = treebank.tagged_sents()[3000:]
# print(tagger.evaluate(test_sents))
#
# # save to pickle
# import pickle
# with open('unitagger.pkl', 'wb') as output:
#     pickle.dump(tagger, output)
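# --- backoff_tagger comes from a local tag_util module that is not shown; a
# --- minimal sketch of what it presumably contains:
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff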
print(rt.tag(tokens))

# 3. N-GRAM TAGGERS:
# Contiguous sequences of n items from a sequence of text or speech. Items can be words, phonemes,
# letters, characters or syllables. Shingles: n-grams where the items are just words.
# UnigramTagger -> NGramTagger -> ContextTagger -> SequentialBackoffTagger

# Train the N-Gram taggers using the training_data (pre-tagged tokens, i.e. labeled observations)
ut = UnigramTagger(train=train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# Test the performance of each N-Gram tagger
print("1-Gram Tagger Accuracy: {}".format(ut.evaluate(test_data)))
print("2-Gram Tagger Accuracy: {}".format(bt.evaluate(test_data)))
print("3-Gram Tagger Accuracy: {}".format(tt.evaluate(test_data)))

print("\n1-Gram tags:")
print(ut.tag(tokens))

print("\n2-Gram tags:")
print(bt.tag(tokens))

print("\n3-Gram tags:")
print(tt.tag(tokens))

# Note that the best accuracy is provided by the 1-Gram tagger: it isn't always the case that the
# same bigrams and trigrams observed in the training data will be present in the same way in the
# testing data (e.g. pairs of words do not always appear paired in the same way). See the chaining
# sketch below.

# 4. TAGGER CHAINING WITH BACKOFF TAGGERS:
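# --- A minimal sketch of the chaining the note above motivates (assuming this
# --- is the direction section 4 takes): a sparse trigram context falls back to
# --- the bigram tagger, then to the unigram tagger:
ut_chain = UnigramTagger(train_data)
bt_chain = BigramTagger(train_data, backoff=ut_chain)
tt_chain = TrigramTagger(train_data, backoff=bt_chain)
print("Chained 3->2->1 Tagger Accuracy: {}".format(tt_chain.evaluate(test_data)))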
from nltk.tag import BigramTagger as BigT
from nltk.tag import TrigramTagger as TriT

biTagger = BigT(train_sents)
print(biTagger.evaluate(test_sents))

triTagger = TriT(train_sents)
print(triTagger.evaluate(test_sents))
print(default_tagger.evaluate(brown_tagged_sents2))
# 0.3333333333333333

train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]

unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
print(unigram_tagger.evaluate(test_data))
# 0.835841722316356

bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print(bigram_tagger.evaluate(test_data))
# 0.8454101465164956

trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
print(trigram_tagger.evaluate(test_data))
# 0.8427190272102063

regexp_tagger = RegexpTagger(
    [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
     (r'(The|the|A|a|An|an)$', 'AT'),   # articles
     (r'.*able$', 'JJ'),                # adjectives
     (r'.*ness$', 'NN'),                # nouns formed from adjectives
     (r'.*ly$', 'RB'),                  # adverbs
     (r'.*s$', 'NNS'),                  # plural nouns
     (r'.*ing$', 'VBG'),                # gerunds
     (r'.*ed$', 'VBD'),                 # past tense verbs
     (r'.*', 'NN')                      # nouns (default)
    ])
print(regexp_tagger.evaluate(test_data))
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    unigram_accuracies = []
    bigram_accuracies = []
    trigram_accuracies = []
    backoff_accuracies = []
    tnt_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # make into a list
    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part

        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]

        # next concatenate the lists together into one ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()

        print('Loop #' + str(counter))

        # make and evaluate unigram tagger
        unigram_tagger = UnigramTagger(train_sents)
        unigram_accuracy = unigram_tagger.evaluate(test_sents)
        unigram_accuracies.append(unigram_accuracy)
        print('Unigram:', unigram_accuracy)

        # make and evaluate bigram tagger
        bigram_tagger = BigramTagger(train_sents)
        bigram_accuracy = bigram_tagger.evaluate(test_sents)
        bigram_accuracies.append(bigram_accuracy)
        print('Bigram:', bigram_accuracy)

        # make and evaluate trigram tagger
        trigram_tagger = TrigramTagger(train_sents)
        trigram_accuracy = trigram_tagger.evaluate(test_sents)
        trigram_accuracies.append(trigram_accuracy)
        print('Trigram:', trigram_accuracy)

        # make and evaluate 1, 2, 3-gram backoff tagger
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger3 = TrigramTagger(train_sents, backoff=tagger2)
        backoff_accuracy = tagger3.evaluate(test_sents)
        backoff_accuracies.append(backoff_accuracy)
        print('1, 2, 3-gram backoff:', backoff_accuracy)

        # make and evaluate tnt tagger
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        tnt_accuracy = tnt_tagger.evaluate(test_sents)
        tnt_accuracies.append(tnt_accuracy)
        print('TnT:', tnt_accuracy)

    final_accuracies_list = []

    mean_accuracy_unigram = mean(unigram_accuracies)
    standard_deviation_unigram = stdev(unigram_accuracies)
    uni = {'unigram': {'mean': mean_accuracy_unigram, 'sd': standard_deviation_unigram}}
    final_accuracies_list.append(uni)

    mean_accuracy_bigram = mean(bigram_accuracies)
    standard_deviation_bigram = stdev(bigram_accuracies)
    bi = {'bigram': {'mean': mean_accuracy_bigram, 'sd': standard_deviation_bigram}}
    final_accuracies_list.append(bi)

    mean_accuracy_trigram = mean(trigram_accuracies)
    standard_deviation_trigram = stdev(trigram_accuracies)
    tri = {'trigram': {'mean': mean_accuracy_trigram, 'sd': standard_deviation_trigram}}
    final_accuracies_list.append(tri)

    mean_accuracy_backoff = mean(backoff_accuracies)
    standard_deviation_backoff = stdev(backoff_accuracies)
    back = {'1, 2, 3-gram backoff': {'mean': mean_accuracy_backoff, 'sd': standard_deviation_backoff}}
    final_accuracies_list.append(back)

    mean_accuracy_tnt = mean(tnt_accuracies)
    standard_deviation_tnt = stdev(tnt_accuracies)
    tnt_score = {'tnt': {'mean': mean_accuracy_tnt, 'sd': standard_deviation_tnt}}
    final_accuracies_list.append(tnt_score)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)

    return final_dict
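# --- cltk_pos_cv assumes several module-level imports; a sketch of the header
# --- plus an example call (the .pos filename is a placeholder):
import math
import os
import random
from statistics import mean, stdev

from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, tnt

# results = cltk_pos_cv('training_set.pos', '~/cltk_data/user_data')
# print(results)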
## N gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

print(ut.evaluate(test_data))
print(ut.tag(tokens))

print(bt.evaluate(test_data))
print(bt.tag(tokens))

print(tt.evaluate(test_data))
print(tt.tag(tokens))

def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff

ct = combined_tagger(train_data=train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)
print(ct.evaluate(test_data))
print(ct.tag(tokens))

from nltk.classify import NaiveBayesClassifier, MaxentClassifier