def train_tagger():
    """Train the POS tagger and pickle the resulting model."""
    print("Training POS tagger...")
    # https://github.com/japerk/nltk3-cookbook/blob/master/chapter4.py
    tagged_sentences = treebank.tagged_sents()
    size = int(len(tagged_sentences) * 0.9)
    train_sents = tagged_sentences[:size]
    # hold out the remaining 10% so the test slice does not overlap training
    test_sents = tagged_sentences[size:]
    default = DefaultTagger("NN")
    tagger = ClassifierBasedPOSTagger(
        train=train_sents, backoff=default, cutoff_prob=0.3
    )
    print(tagger.evaluate(test_sents))  # original run reported 0.9613641269156055

    # save model to pickle file as binary
    file_name = MODEL_PATH + "tag_model.pkl"
    with open(file_name, "wb") as fout:
        pickle.dump(tagger, fout)
    print("model written to: " + file_name)
    print("")
    return tagger
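# For completeness, a minimal sketch of reading the pickled model back and
# using it. MODEL_PATH is assumed to be the same externally defined constant
# used by train_tagger() above; the sample sentence is illustrative only.
import pickle

with open(MODEL_PATH + "tag_model.pkl", "rb") as fin:
    loaded_tagger = pickle.load(fin)

print(loaded_tagger.tag("The quick brown fox jumped over the fence .".split()))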
def NER_HINDINBC():
    reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos')
    f1 = reader.fileids()
    print "The Files of Corpus are:", f1
    sents = reader.tagged_sents()
    sentn = reader.sents()
    #words = sentn.split()
    ls = len(sents)
    #lw = len(words)
    print "Length of Corpus Is:", ls
    #print "The Words are:", lw
    size1 = int(ls * 0.3)
    test_sents = sents[:size1]
    train_sents = sents[size1:]
    nbc_tagger = ClassifierBasedPOSTagger(train=train_sents)
    test = nbc_tagger.evaluate(test_sents)
    print "The Test Result is:", test

    # THE GIVEN INPUT
    given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode('utf-8')
    gsw = given_sent.split()
    tag_gs = nbc_tagger.tag(gsw)
    print "GIVEN SENT TAG:", tag_gs
    ftag_gs = " ".join(list(itertools.chain(*tag_gs)))
    print "And its flattened Version is:", ftag_gs
def nbc_tagger():
    news_text = brown.tagged_sents(categories='news')
    train_sents = news_text[:3230]
    test_sents = news_text[3230:4600]
    nbc_tagger = ClassifierBasedPOSTagger(train=train_sents)
    test = nbc_tagger.evaluate(test_sents)
    print "The Test Result Is:", test
    sent3 = "Narendra Modi won Lok Sabha election with massive majority after long years"
    sent_w = sent3.lower().split()
    print sent_w
    tag = nbc_tagger.tag(sent_w)
    print "The Tag Is:", tag
def load_tagger(self, name='backup/tagger.pickle'):
    # Reuse a previously pickled tagger if one exists.
    try:
        with open(name, "rb") as f:
            return pickle.load(f)
    except IOError as e:
        print "I/O error: {0}".format(e)
    # Otherwise train a new tagger, report its accuracy and pickle it.
    tagger = ClassifierBasedPOSTagger(train=self.__train_sents,
                                      backoff=self.__default,
                                      cutoff_prob=0.3)
    print 'Tagger accuracy : {}'.format(tagger.evaluate(self.__test_sents))
    with open(name, 'wb') as f:
        pickle.dump(tagger, f)
    return tagger
def nltk_classifier_based_pos_tagger(input_dict):
    """
    A sequential tagger that uses a classifier to choose the tag for each
    token in a sentence. The featureset input for the classifier is
    generated by a feature detector function::

        feature_detector(tokens, index, history) -> featureset

    Where tokens is the list of unlabeled tokens in the sentence; index is
    the index of the token for which feature detection should be performed;
    and history is the list of the tags for all tokens before index.

    Construct a new classifier-based sequential tagger.

    :param training_corpus: A tagged corpus consisting of a list of tagged
        sentences, where each sentence is a list of (word, tag) tuples.
    :param backoff_tagger: A backoff tagger, to be used by the new tagger
        if it is unable to determine a tag for a given token (e.g. an
        unknown context).

    TODO: decide which of the following two parameters to keep
    :param classifier_builder: A function used to train a new classifier
        based on the data in *train*. It should take one argument, a list
        of labeled featuresets (i.e., (featureset, label) tuples).
    :param classifier: The classifier that should be used by the tagger.
        This is only useful if you want to manually construct the
        classifier; normally, you would use *train* instead.

    :param cutoff_prob: If specified, then this tagger will fall back on
        its backoff tagger if the probability of the most likely tag is
        less than *cutoff_prob*.
    :returns pos_tagger: A python dictionary containing the POS tagger
        object and its arguments.
    """
    chunk = input_dict['training_corpus']['chunk']
    corpus = input_dict['training_corpus']['corpus']
    training_corpus = corpus_reader(corpus, chunk)
    backoff_tagger = (input_dict['backoff_tagger']['object']
                      if input_dict['backoff_tagger'] else DefaultTagger('-None-'))
    classifier = None  #(input_dict['classifier'])
    # cutoff_prob is a probability threshold, so parse it as a float
    # (int() would fail on values such as "0.3")
    cutoff_prob = float(input_dict['cutoff_prob']) if input_dict['cutoff_prob'] else None
    tagger_object = ClassifierBasedPOSTagger(
        train=training_corpus, classifier=classifier,
        backoff=backoff_tagger, cutoff_prob=cutoff_prob)
    return {'pos_tagger': {
        'function': 'tag_sents',
        'object': tagger_object
    }}
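# The feature_detector contract documented above can be illustrated with a
# small hand-rolled detector passed to the generic ClassifierBasedTagger.
# This is a sketch: simple_detector and its feature names are assumptions
# for illustration, not NLTK's built-in detector.
from nltk.corpus import treebank
from nltk.tag.sequential import ClassifierBasedTagger

def simple_detector(tokens, index, history):
    word = tokens[index]
    return {
        'word': word.lower(),                              # the token itself
        'suffix3': word[-3:].lower(),                      # crude morphology
        'prev_tag': history[-1] if history else '<START>'  # left context
    }

demo_tagger = ClassifierBasedTagger(feature_detector=simple_detector,
                                    train=treebank.tagged_sents()[:500])
print(demo_tagger.tag(['The', 'detector', 'sees', 'each', 'token', '.']))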
def myParse(sentence):
    print("ClassifierBasedPOSTagger tag:")
    brown_tagged_sents = brown.tagged_sents(categories='news')
    # the 'news' category has far fewer than 500000 sentences, so this
    # slice simply takes the whole category
    train_sents = brown_tagged_sents[:500000]
    tagger = ClassifierBasedPOSTagger(
        train=train_sents)  #, classifier_builder=MaxentClassifier.train)
    mytagger = SQLPosTagger(tagger)
    words = nltk.word_tokenize(sentence)
    result = mytagger.tag(words)
    print(result)
def parse():
    tagger_classes = [nltk.UnigramTagger, nltk.BigramTagger]
    trained_sents, tagged_sents = trainer("WSJ_02-21.pos-chunk", "WSJ_23.pos")
    #tagger = nltk.UnigramTagger(trained_sents)
    print len(trained_sents)
    tagger = ClassifierBasedPOSTagger(
        train=trained_sents[:10000],
        classifier_builder=lambda train_feats: MaxentClassifier.train(
            train_feats, trace=0, max_iter=10))
    f = open("WSJ_23.chunk", 'w')
    #print sents
    for sents in tagged_sents:
        words, tags = sents[0], sents[1]
        chunks = tagger.tag(tags)
        #print words, chunks
        wtc = zip(words, chunks)
        for tup in wtc:
            f.write("%s\t%s\n" % (tup[0], tup[1][1]))
        f.write("\n")
    f.close()  # close the output file once all sentences are written
def get_chunks(text_string):
    # tokenization
    print('Tokenising text...')
    sentences = sent_tokenize(text_string)
    tokenized_sentences = []
    for s in sentences:
        tokenized_sentences.append(word_tokenize(s))

    # PoS tagging
    train_sents = treebank.tagged_sents()
    print('Training PoS tagger...')
    tagger = ClassifierBasedPOSTagger(train=train_sents)
    tagged_sentences = []
    print('Tagging sentences...')
    for s in tokenized_sentences:
        tagged_sentences.append(tagger.tag(s))

    # chunking
    print('Getting trained chunk classifier...')
    chunk_classifier = get_trained_classifier()
    chunked_sentences = []
    print('Chunking sentences...')
    for s in tagged_sentences:
        chunked_sentences.append(chunk_classifier.parse(s))
    return chunked_sentences
def wordTagger(self, wordlist, number):
    train_sents = treebank.tagged_sents()[:3000]
    if number == 1:
        taglist = nltk.pos_tag(wordlist)
    elif number == 2:
        tagger = DefaultTagger('NN')
        taglist = tagger.tag(wordlist)
    elif number == 3:
        tagger = UnigramTagger(train_sents)
        taglist = tagger.tag(wordlist)
    elif number == 4:
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        taglist = tnt_tagger.tag(wordlist)
    elif number == 5:
        tagger = ClassifierBasedPOSTagger(train=train_sents)
        taglist = tagger.tag(wordlist)
    return taglist
#####
#
# 3 classified tagger
#
#####
from nltk.tag.sequential import ClassifierBasedPOSTagger

print("started classified")
class_tagger = None
try:
    with open('test_pickles/class.pickle', 'rb') as fa:
        class_tagger = pickle.load(fa)
except FileNotFoundError as a:
    # training data
    print("dumping class")
    class_tagger = ClassifierBasedPOSTagger(train=train)
    with open('test_pickles/class.pickle', 'wb') as fb:
        pickle.dump(class_tagger, fb)

#print(class_tagger.evaluate(test))
print(class_tagger.tag(tokenized_words))

####
#
# 4 TnT
#
####
print("started tnt")
from nltk.tag import tnt

tnt_tagger = None
try:
import nltk
from nltk.corpus import treebank
from nltk.tag import DefaultTagger
from nltk.tag.sequential import ClassifierBasedPOSTagger

default = DefaultTagger('NN')
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]
tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default,
                                  cutoff_prob=0.3)
tagger.evaluate(test_sents)

#token = nltk.word_tokenize(title)  #title string tokenized

#removing all the punctuation marks
#punctuation = re.compile(r'[-.?,":;()`~!@#$%^*()_=+{}]')
#tword = [punctuation.sub("", word) for word in token]
#print(tword)  #without punctuation

#removing all the MS smart quotes
#adding the tagger
from nltk.corpus import treebank
from nltk.tag import DefaultTagger
from nltk.tag.sequential import ClassifierBasedPOSTagger

default = DefaultTagger('NN')
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]
tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default,
                                  cutoff_prob=0.3)

#implementing it on the url names
ntag = tagger.tag(ntoken)

#extracting all the noun phrases from URL string
nlist = []
for word, tag in ntag:
    if (tag == 'NN'):
# train
tic()
tnt_tagger = tnt.TnT()
tnt_tagger.train(train_sents)
tnt_eval['train_time'] = toc()

# test
tic()
tnt_eval['test_accuracy'] = tnt_tagger.evaluate(val_sents)
tnt_eval['test_time'] = toc()

# display results
display_training_metrics(tnt_eval)

""" 2. Naive Bayes classifier tagger """
nb_eval = dict()

# train
tic()
nb_tagger = ClassifierBasedPOSTagger(train=train_sents)
nb_eval['train_time'] = toc()

# test
tic()
nb_eval['test_accuracy'] = nb_tagger.evaluate(val_sents)
nb_eval['test_time'] = toc()

# display results
display_training_metrics(nb_eval)

""" 3. Naive Bayes classifier tagger with features """
nb_eval = dict()

# train
tic()
nb_tagger = ClassifierBasedTagger(train=train_sents,
                                  feature_detector=add_features)
nb_eval['train_time'] = toc()

# test
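# tic() and toc() above are assumed to be simple stopwatch helpers defined
# elsewhere in the benchmark script; a minimal sketch of what they could
# look like, using time.perf_counter:
import time

_t0 = None

def tic():
    """Start the stopwatch."""
    global _t0
    _t0 = time.perf_counter()

def toc():
    """Return seconds elapsed since the last tic()."""
    return time.perf_counter() - _t0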
from nltk.tag.sequential import ClassifierBasedPOSTagger
from tag_util import train_sents, test_sents

tagger = ClassifierBasedPOSTagger(train=train_sents)
print(tagger.evaluate(test_sents))
# print( 'Training TnT...' )
# tnt_tagger = tnt.TnT()
# tnt_tagger.train(train_corpus)
# print( 'Testing...' )
# acc = tnt_tagger.evaluate(test_corpus)
# print( 'TnT accuracy={0}\n'.format(acc) )
#
# ----------------------------------------------------------------------
#
# print( 'Training UnigramTagger...' )
# unigram_tagger = UnigramTagger(train_corpus)
# with open( 'unigram.pos_tagger.pickle', 'wb' ) as f:
#     pickle.dump( unigram_tagger, f )
#
# print( 'Testing...' )
# acc = unigram_tagger.evaluate(test_corpus)
# print( 'UnigramTagger accuracy={0}\n'.format(acc) )

# ----------------------------------------------------------------------

print('Training ClassifierBasedPOSTagger...')
cbt = ClassifierBasedPOSTagger(train=train_corpus)

print('Testing...')
acc = cbt.evaluate(test_corpus)
print('accuracy={0}\n'.format(acc))

print('Storing...')
with open(os.path.join(model_folder, 'ClassifierBasedPOSTagger.pickle'), 'wb') as f:
    pickle.dump(cbt, f)
class TrainingSetAnalyzer():
    '''This class handles the setting of the training set data and
    provides support for feature extraction given a text'''

    def __init__(self, limit=300, debug=True):
        '''Instance the TrainingSetAnalyzer

        Keyword arguments:
        @param: limit size of the tweets which need to be analyzed (300)
        @param: debug flag for development process
        '''
        self.__debug = debug
        self.__limit = limit
        self.__speller = SpellChecker()
        self.__splitter = Splitter("rtw")
        self.__replacer = RegexpReplacer()
        self.__ngramHandler = NgramHandler()
        train_sents = treebank.tagged_sents()[:3000]
        self.__tagger = ClassifierBasedPOSTagger(train=train_sents)

    def __analyzeSingleTweet(self, tweet):
        '''Helper function to get unigrams, emoticons, ngrams given a text

        Keyword arguments:
        @param: tweet the tweet to be analyzed
        '''
        chunks = self.__splitter.split(u'' + tweet)
        raw_feature_list_neg = []
        emot_list = []
        ngrams = []
        for subTweet in chunks:
            try:
                preprocessed_tweet = self.__replacer.preprocess(subTweet)
                acr_expanded, tmp_emot_list = self.__replacer \
                    .acr_emot_exctractor(preprocessed_tweet)
                emot_list += tmp_emot_list
                enanched_txt = self.__speller.check_and_replace(acr_expanded)
                tagged_sent = self.__tagger.tag(enanched_txt)
                raw_feature_list_neg += self.__replacer \
                    .filter_raw_feature_list(acr_expanded)
                ngrams += self.__ngramHandler.exctract_ngrams(tagged_sent)
            except Exception as e:
                print "Sorry, something went wrong: %s txt: %s" \
                    % (str(e), tweet)
        return raw_feature_list_neg, emot_list, ngrams

    def analyze(self):
        '''Analyzes a set of tweets'''
        print "Found %i elements for training" % self.__limit
        n = 0
        while n < 20:
            qs = get_tweets_for_analyzing(skip=n)
            for tweet in qs:
                raw_feature_list_neg, emot, ngrams = self.__analyzeSingleTweet(
                    tweet.text)
                if not self.__debug:
                    print "saving...."
                    tweet.set_features(raw_feature_list_neg, emot, ngrams)
            n += 1
        return

    def extract_features_for_classification(self, text):
        '''Helper function to extract features given a text

        Keyword arguments:
        @param: text the text whose features will be extracted
        '''
        raw_feature_list_neg, emot_list, ngrams = self.__analyzeSingleTweet(
            text)
        return raw_feature_list_neg, emot_list, ngrams, dict([
            (word, True)
            for word in raw_feature_list_neg + emot_list + ngrams
        ])

    def purge_useless_features(self):
        '''Helper function to prune less frequent unigram features'''
        tweets = get_tweets_for_pruning()
        print "Pruning process for %i tweets" % tweets.count()
        mrt = tweets.map_reduce(mapfunc_filter, reducefunc, "cn")
        mrt = filter(lambda status: status.value > PURGE_TRESHOLD, mrt)
        purged_qs = [item.key for item in mrt]
        for tweet in tweets:
            try:
                tweet.features.filtered_unigram = [
                    item for item in purged_qs
                    if item in tweet.features.raw_feature_list_neg
                ]
                tweet.save()
            except Exception as e:
                print e
        print "Done!"
            'a': 'JJ',
            'r': 'RB',
            'v': 'VB'
        }
        self.fd = FreqDist(treebank.words())

    def choose_tag(self, tokens, index, history):
        """Chooses a POS tag based on the wordnet tag"""
        word = tokens[index]
        for synset in wordnet.synsets(word):
            self.fd[synset.pos()] += 1
        return self.wordnet_tag_map.get(self.fd.max())


# Using the wordnet tagger
wn_tagger = WordNetTagger()
accuracy = wn_tagger.evaluate(test_sents)
print(f"Accuracy of the wordnet tagger: {accuracy}\n")

# Classifier tagging
cl_tagger = ClassifierBasedPOSTagger(train=train_sents)
accuracy = cl_tagger.evaluate(test_sents)
print(f"Accuracy of the classifier tagger: {accuracy}\n")

# Saving pickle - Heavy one
with open('pickles/pos-taggers/classifier_tagger.pickle', 'wb') as file:
    pickle.dump(cl_tagger, file)
#%%
# combined tagger with a list of taggers and a backoff tagger
def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff

ct = combined_tagger(train_data=train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)

# evaluating the new combined tagger with backoff taggers
print(ct.evaluate(test_data))
print(ct.tag(nltk.word_tokenize(sentence)))

#%%
## Training using a supervised classification algorithm
from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger

nbt = ClassifierBasedPOSTagger(train=train_data,
                               classifier_builder=NaiveBayesClassifier.train)

# evaluate tagger on test data and sample sentences
print(nbt.evaluate(test_data))
print(nbt.tag(nltk.word_tokenize(sentence)))
#adding the tagger
from nltk.corpus import treebank
from nltk.tag import DefaultTagger
from nltk.tag.sequential import ClassifierBasedPOSTagger

default = DefaultTagger('NN')
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]
tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default,
                                  cutoff_prob=0.3)

#applying the tagger
rawtag = tagger.tag(clean)
print rawtag

#extracting all the noun phrases from raw string
nlist = []
for word, tag in rawtag:
    if (tag == 'NN'):
# adding the tagger
from nltk.corpus import treebank
from nltk.tag import DefaultTagger
from nltk.tag.sequential import ClassifierBasedPOSTagger

default = DefaultTagger("NN")
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]
tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default,
                                  cutoff_prob=0.3)
# tagger.evaluate(test_sents)

# applying the tagger
htag = tagger.tag(hd_tokens)
print(htag)

# extracting all the noun phrases from raw string
nlist = []
import nltk
from nltk.corpus import treebank
from nltk.tag import DefaultTagger
from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger

data = treebank.tagged_sents()
train_data = data[:3500]
test_data = data[3500:]
#print(train_data[0])

dt = DefaultTagger('NN')
print(dt.evaluate(test_data))

nt = ClassifierBasedPOSTagger(train=train_data,
                              classifier_builder=NaiveBayesClassifier.train)
print(nt.evaluate(test_data))
def classify_based_tag_train():
    # treebank ships with fewer than 5000 tagged sentences, so this slice
    # takes the whole corpus
    train_sents = treebank.tagged_sents()[:5000]
    #train_sents = brown.tagged_sents(categories='learned', tagset='universal')
    bigram_tagger = BigramTagger(train_sents)
    cbtagger = ClassifierBasedPOSTagger(train=train_sents,
                                        backoff=bigram_tagger)
    # use a context manager so the file handle is closed after dumping
    with open('my_tagger.pkl', 'wb') as f:
        pickle.dump(cbtagger, f)
#tagger
from nltk.corpus import treebank
from nltk.tag import DefaultTagger
from nltk.tag.sequential import ClassifierBasedPOSTagger

default = DefaultTagger('NN')
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]
tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default,
                                  cutoff_prob=0.3)
#tagger.evaluate(test_sents)

#applying the tagger
rtag = tagger.tag(r)
print(rtag)

#extracting all the noun phrases from raw string
nlist = []
    #Regexp - best to treat numbers?
    regexp_tagger = RegexpTagger(patterns, backoff=nt)
    treebank_tagger = UnigramTagger(model=model, backoff=regexp_tagger)
    #skipping affix
    #skipping brill

    #TnT
    #Tried on 9/24; took a long time evaluating accuracy
    #tagger = tnt.TnT(unk=backoff, Trained=True)
    #tagger.train(train_sents)

    #Used the classifier tagger because of its accuracy. Could play around
    #with the cutoff probability for using the backoff tagger.
    tagger = ClassifierBasedPOSTagger(train=train_sents,
                                      backoff=regexp_tagger, cutoff_prob=0.3)
    print("Writing new tagger.pickle")
    # pickles must be written and read in binary mode
    f = open('tagger.pickle', 'wb')
    pickle.dump(tagger, f)
    f.close()
else:
    print("Opening existing tagger.pickle")
    f = open('tagger.pickle', 'rb')
    tagger = pickle.load(f)

#Chunker
train_new_chunker = True
if train_new_chunker:
    train_chunks = treebank_chunk.chunked_sents()[:3000]
    conll_train = conll2000.chunked_sents('train.txt')
from nltk.tag.sequential import ClassifierBasedPOSTagger
import pickle

# each blank-line-separated block in the corpus file is one sentence;
# each line within a block is "word<TAB>tag"
datas = open('Indonesian_Manually_Tagged_Corpus.tsv', 'r').read()
datas = datas.split('\n\n')

train_sents = []
for data in datas:
    train_sents.append(list(tuple(i.split('\t')) for i in data.split('\n')))

tagger = ClassifierBasedPOSTagger(train=train_sents)

tagger_files = open("indonesian_classifier_pos_tag.pickle", "wb")
pickle.dump(tagger, tagger_files)
tagger_files.close()
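# A minimal sketch of loading the pickled tagger back and tagging a sentence;
# the Indonesian example sentence is an illustrative assumption.
import pickle

with open("indonesian_classifier_pos_tag.pickle", "rb") as f:
    loaded_tagger = pickle.load(f)

print(loaded_tagger.tag("Saya sedang makan nasi goreng".split()))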
print tt.evaluate(test_data)
print tt.tag(tokens)

def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff

ct = combined_tagger(train_data=train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)
print ct.evaluate(test_data)
print ct.tag(tokens)

from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger

nbt = ClassifierBasedPOSTagger(train=train_data,
                               classifier_builder=NaiveBayesClassifier.train)
print nbt.evaluate(test_data)
print nbt.tag(tokens)

# try this out for fun!
met = ClassifierBasedPOSTagger(train=train_data,
                               classifier_builder=MaxentClassifier.train)
print met.evaluate(test_data)
print met.tag(tokens)