def train_tagger():
    """Train a classifier-based POS tagger on the treebank corpus and pickle it.

    The tagged sentences are split 90/10: the first 90% train the tagger and
    the remaining 10% are held out to score it. A ``DefaultTagger("NN")`` is
    used as the backoff. The accuracy on the held-out slice is printed and the
    trained tagger is written to ``MODEL_PATH + "tag_model.pkl"``.

    Returns:
        The trained ``ClassifierBasedPOSTagger``.
    """
    print("Training POS tagger...")
    # https://github.com/japerk/nltk3-cookbook/blob/master/chapter4.py
    tagged_sentences = treebank.tagged_sents()
    size = int(len(tagged_sentences) * 0.9)
    train_sents = tagged_sentences[:size]
    # Evaluate only on sentences the tagger has not seen. The earlier
    # hard-coded [3000:] slice overlapped the training slice whenever
    # size > 3000 (i.e. more than 3333 sentences), inflating the score.
    test_sents = tagged_sentences[size:]
    default = DefaultTagger("NN")
    tagger = ClassifierBasedPOSTagger(
        train=train_sents, backoff=default, cutoff_prob=0.3
    )
    # NOTE: the 0.9613641269156055 figure recorded here previously came from
    # the old [3000:] test slice; expect a different number on the clean split.
    print(tagger.evaluate(test_sents))

    # save model to pickle file as binary
    file_name = MODEL_PATH + "tag_model.pkl"
    with open(file_name, "wb") as fout:
        pickle.dump(tagger, fout)
    print("model written to: " + file_name)
    print("")
    return tagger
def NER_HINDINBC(): reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos') f1 = reader.fileids() print "The Files of Corpus are:", f1 sents = reader.tagged_sents() sentn = reader.sents() #words=sentn.split() ls = len(sents) #lw=len(words) print "Length of Corpus Is:", ls #print "The Words are:",lw size1 = int(ls * 0.3) test_sents = sents[:size1] train_sents = sents[size1:] nbc_tagger = ClassifierBasedPOSTagger(train=train_sents) test = nbc_tagger.evaluate(test_sents) print "The Test Result is:", test #THE GIVEN INPUT given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode( 'utf-8') gsw = given_sent.split() tag_gs = nbc_tagger.tag(gsw) print "GIVEN SENT TAG:", tag_gs ftag_gs = " ".join(list(itertools.chain(*tag_gs))) print "And its flattened Version is:", ftag_gs
def nbc_tagger(): news_text = brown.tagged_sents(categories='news') train_sents = news_text[:3230] test_sents = news_text[3230:4600] nbc_tagger = ClassifierBasedPOSTagger(train=train_sents) test = nbc_tagger.evaluate(test_sents) print "The Test Results Is:", test sent3 = "Narendra Modi won Lok Sabha election with massive majority after long years" sent_w = sent3.lower().split() print sent_w tag = nbc_tagger.tag(sent_w) print "The Tag Is:", tag
def load_tagger(self, name='backup/tagger.pickle'):
    """Return the pickled tagger at ``name``, training and caching one if needed.

    If ``name`` can be opened and unpickled, the stored object is returned
    as-is. If the file cannot be read (``IOError``) or its contents are
    truncated/corrupt, a new ``ClassifierBasedPOSTagger`` is trained on
    ``self.__train_sents`` with ``self.__default`` as backoff, its accuracy on
    ``self.__test_sents`` is printed, and it is pickled to ``name``.

    Args:
        name: Path of the pickle file used as the tagger cache.

    Returns:
        The loaded or newly trained tagger.

    Raises:
        IOError: If the newly trained tagger cannot be written to ``name``.
    """
    try:
        # SECURITY: pickle.load can execute arbitrary code; only point `name`
        # at files this application wrote itself.
        with open(name, "rb") as f:
            return pickle.load(f)
    except IOError as e:
        print("I/O error: {0}".format(e))
    except (EOFError, pickle.UnpicklingError) as e:
        # A truncated or damaged cache is treated like a missing one:
        # fall through and rebuild it instead of crashing.
        print("Corrupt tagger cache {0!r}: {1}".format(name, e))

    tagger = ClassifierBasedPOSTagger(train=self.__train_sents,
                                      backoff=self.__default,
                                      cutoff_prob=0.3)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Tagger accuracy : {}'.format(tagger.evaluate(self.__test_sents)))
    with open(name, 'wb') as f:
        pickle.dump(tagger, f)
    return tagger
print tt.evaluate(test_data) print tt.tag(tokens) def combined_tagger(train_data, taggers, backoff=None): for tagger in taggers: backoff = tagger(train_data, backoff=backoff) return backoff ct = combined_tagger(train_data=train_data, taggers=[UnigramTagger, BigramTagger, TrigramTagger], backoff=rt) print ct.evaluate(test_data) print ct.tag(tokens) from nltk.classify import NaiveBayesClassifier, MaxentClassifier from nltk.tag.sequential import ClassifierBasedPOSTagger nbt = ClassifierBasedPOSTagger(train=train_data, classifier_builder=NaiveBayesClassifier.train) print nbt.evaluate(test_data) print nbt.tag(tokens) # try this out for fun! met = ClassifierBasedPOSTagger(train=train_data, classifier_builder=MaxentClassifier.train) print met.evaluate(test_data) print met.tag(tokens)
import nltk
from nltk.corpus import treebank
from nltk.tag import DefaultTagger
from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger

# Fixed split of the treebank tagged sentences: first 3500 train, the rest test.
data = treebank.tagged_sents()
train_data, test_data = data[:3500], data[3500:]

# Baseline: a DefaultTagger constructed with 'NN', scored on the test slice.
dt = DefaultTagger('NN')
baseline_accuracy = dt.evaluate(test_data)
print(baseline_accuracy)

# Classifier-based tagger built with the Naive Bayes trainer, same test slice.
nt = ClassifierBasedPOSTagger(
    train=train_data,
    classifier_builder=NaiveBayesClassifier.train,
)
classifier_accuracy = nt.evaluate(test_data)
print(classifier_accuracy)
from nltk.tag import DefaultTagger
from nltk.tag.sequential import ClassifierBasedPOSTagger

# Backoff tagger handed to the classifier-based tagger below.
default = DefaultTagger('NN')

# Fetch the tagged sentences once and slice that, instead of calling
# treebank.tagged_sents() separately for each slice.
tagged_sents = treebank.tagged_sents()
train_sents = tagged_sents[:3000]
test_sents = tagged_sents[3000:]

tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default, cutoff_prob=0.3)
# evaluate() only returns the score; print it so running this as a script
# actually reports the result instead of silently discarding it.
print(tagger.evaluate(test_sents))

# Disabled title-cleaning draft, kept verbatim:
#token = nltk.word_tokenize(title) #title string tokenized
#removing all the punctuation marks
#punctuation = re.compile(r'[-.?,":;()`~!@#$%^*()_=+{}]')
#tword = [punctuation.sub("", word) for word in token]
#print(tword) #without punctuation
#removing all the MS smart quotes
#smart_quotes = re.compile(r'[\x80-\x9f]')
tnt_eval['train_time'] = toc() # test tic() tnt_eval['test_accuracy'] = tnt_tagger.evaluate(val_sents) tnt_eval['test_time'] = toc() # display results display_training_metrics(tnt_eval) """ 2. Naive Bayes classifier tagger """ nb_eval = dict() # train tic() nb_tagger = ClassifierBasedPOSTagger(train=train_sents) nb_eval['train_time'] = toc() # test tic() nb_eval['test_accuracy'] = nb_tagger.evaluate(val_sents) nb_eval['test_time'] = toc() # display results display_training_metrics(nb_eval) """ 3. Naive Bayes classifier tagger with features """ nb_eval = dict() # train tic() nb_tagger = ClassifierBasedTagger(train=train_sents, feature_detector=add_features) nb_eval['train_time'] = toc() # test tic() nb_eval['test_accuracy'] = nb_tagger.evaluate(val_sents) nb_eval['test_time'] = toc() # display results
# print( 'Training TnT...' )
# tnt_tagger = tnt.TnT()
# tnt_tagger.train(train_corpus)
# print( 'Testing...' )
# acc = tnt_tagger.evaluate(test_corpus)
# print( 'TnT accuracy={0}\n'.format(acc) )
#
# # ----------------------------------------------------------------------
#
# print( 'Training UnigramTagger...' )
# unigram_tagger = UnigramTagger(train_corpus)
# with open( 'unigram.pos_tagger.pickle', 'wb' ) as f:
#     pickle.dump( unigram_tagger, f )
#
# print( 'Testing...' )
# acc = unigram_tagger.evaluate(test_corpus)
# print( 'UnigramTagger accuracy={0}\n'.format(acc) )

# ----------------------------------------------------------------------

# Train the classifier-based tagger on train_corpus.
print('Training ClassifierBasedPOSTagger...')
cbt = ClassifierBasedPOSTagger(train=train_corpus)

# Score it on test_corpus and report the result.
print('Testing...')
acc = cbt.evaluate(test_corpus)
print('accuracy={0}\n'.format(acc))

# Pickle the trained tagger into model_folder.
print('Storing...')
model_path = os.path.join(model_folder, 'ClassifierBasedPOSTagger.pickle')
with open(model_path, 'wb') as model_file:
    pickle.dump(cbt, model_file)
from nltk.tag.sequential import ClassifierBasedPOSTagger
from tag_util import train_sents, test_sents

# Train on the sentences supplied by tag_util, then print the score obtained
# on its test sentences.
tagger = ClassifierBasedPOSTagger(train=train_sents)
accuracy = tagger.evaluate(test_sents)
print(accuracy)
'a': 'JJ', 'r': 'RB', 'v': 'VB' } self.fd = FreqDist(treebank.words()) def choose_tag(self, tokens, index, history): """ Choses a POS tag based on the wordnet tag """ word = tokens[index] for synset in wordnet.synsets(word): self.fd[synset.pos()] += 1 return self.wordnet_tag_map.get(self.fd.max()) # Using the wordnet tagger wn_tagger = WordNetTagger() accuracy = wn_tagger.evaluate(test_sents) print(f"Accuracy of the wordnet tagger: {accuracy}\n") # Classifier tagging cl_tagger = ClassifierBasedPOSTagger(train=train_sents) accuracy = cl_tagger.evaluate(test_sents) print(f"Accuracy of the classifier tagger: {accuracy}\n") # Saving pickle - Heavy one with open('pickles/pos-taggers/classifier_tagger.pickle', 'wb') as file: pickle.dump(cl_tagger, file)
from nltk.corpus import treebank
from nltk.tag import DefaultTagger
from nltk.tag.sequential import ClassifierBasedPOSTagger

# Backoff tagger handed to the classifier-based tagger below.
default = DefaultTagger('NN')

# Fetch the tagged sentences once and slice that, instead of calling
# treebank.tagged_sents() separately for each slice.
tagged_sents = treebank.tagged_sents()
train_sents = tagged_sents[:3000]
test_sents = tagged_sents[3000:]

tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default, cutoff_prob=0.3)
# evaluate() only returns the score; print it so running this as a script
# actually reports the result instead of silently discarding it.
print(tagger.evaluate(test_sents))

# Disabled title-cleaning draft, kept verbatim:
#token = nltk.word_tokenize(title) #title string tokenized
#removing all the punctuation marks
#punctuation = re.compile(r'[-.?,":;()`~!@#$%^*()_=+{}]')
#tword = [punctuation.sub("", word) for word in token]
#print(tword) #without punctuation
#removing all the MS smart quotes
#smart_quotes = re.compile(r'[\x80-\x9f]')