        templates, deterministic=True)
    return trainer.train(training, **kwargs)


# Brill tagger using the previous backoff chain tagger
br_tagger = train_brill_tagger(bc_tagger, train_sents)
accuracy = br_tagger.evaluate(test_sents)
print(f"Accuracy of the brill tagger: {accuracy}\n")

# Saving pickle
with open('pickles/pos-taggers/brill_tagger.pickle', 'wb') as file:
    pickle.dump(br_tagger, file)

# TnT tagger with default tagger for unknown words
tnt_tagger = tnt.TnT(unk=df_tagger, Trained=True, N=200)
tnt_tagger.train(train_sents)
accuracy = tnt_tagger.evaluate(test_sents)
print(f"Accuracy of the tnt tagger: {accuracy}\n")

# Saving pickle
with open('pickles/pos-taggers/tnt_tagger.pickle', 'wb') as file:
    pickle.dump(tnt_tagger, file)


# Tagging using the wordnet
class WordNetTagger(SequentialBackoffTagger):
    """
    Class implementation of the wordnet tagger
    """
    def __init__(self, *args, **kwargs):
def run_test(my_corpus):
    if my_corpus == treebank:
        print 'Corpus Info:'
        print ' Corpus: treebank'
        print ' Tagged Sents:', len(my_corpus.tagged_sents())
        print ' Tagged Words:', len(my_corpus.tagged_words())
        my_tagged_sents = my_corpus.tagged_sents()
        my_sents = my_corpus.sents()
    elif my_corpus == brown:
        print 'Corpus Info:'
        print ' Corpus: brown'
        print ' Tagged Sents:', len(my_corpus.tagged_sents())
        print ' Tagged Words:', len(my_corpus.tagged_words())
        #print ' Tagged Sents (news):', len(my_corpus.tagged_sents(categories='news'))
        #print ' Tagged Words (news):', len(my_corpus.tagged_words(categories='news'))
        #my_tagged_sents = my_corpus.tagged_sents(categories='news')
        #my_sents = my_corpus.sents(categories='news')
        print ' Tagged Sents :', len(my_corpus.tagged_sents())
        print ' Tagged Words :', len(my_corpus.tagged_words())
        my_tagged_sents = my_corpus.tagged_sents()
        my_sents = my_corpus.sents()
    else:
        return

    fold = 5
    print 'Performing', fold, 'fold cross validation on corpus ...'
    train_accuracy = []
    test_accuracy = []
    train_runtime = []
    test_runtime = []

    for k in range(fold):
        train_data = [x for i, x in enumerate(my_tagged_sents) if i % fold != k]
        validation_data = [x for i, x in enumerate(my_tagged_sents) if i % fold == k]
        #test_data = [x for i, x in enumerate(my_sents) if i % fold == k]
        print 'Fold', k, 'has', len(train_data), 'train sentences and', len(validation_data), 'test sentences'

        tnt_pos_tagger = tnt.TnT()
        begin = time.time()
        tnt_pos_tagger.train(train_data)
        end = time.time()
        train_acc = tnt_pos_tagger.evaluate(train_data)
        train_accuracy.append(train_acc)
        train_runtime.append(end - begin)
        print ' Train accuracy =', train_acc, ' runtime =', end - begin

        begin = time.time()
        test_acc = tnt_pos_tagger.evaluate(validation_data)
        end = time.time()
        test_accuracy.append(test_acc)
        test_runtime.append(end - begin)
        print ' Test accuracy =', test_acc, ' runtime =', end - begin

    print 'Results:'
    print '%15s %15s %15s %15s %15s' % ('Fold', 'Train-Accuracy', 'Train-Runtime', 'Test-Accuracy', 'Test-Runtime')
    for k in range(fold):
        print '%15d %15.3f%% %15.5f %15.3f%% %15.5f' % (
            k, train_accuracy[k] * 100, train_runtime[k], test_accuracy[k] * 100, test_runtime[k])

    avg_train_acc = sum(train_accuracy) / len(train_accuracy)
    avg_train_runtime = sum(train_runtime) / len(train_runtime)
    avg_test_acc = sum(test_accuracy) / len(test_accuracy)
    avg_test_runtime = sum(test_runtime) / len(test_runtime)
    print '%15s %15.3f%% %15.5f %15.3f%% %15.5f' % (
        'Average', avg_train_acc * 100, avg_train_runtime, avg_test_acc * 100, avg_test_runtime)
    return
        brill.Template(brill.Word([2])),
        brill.Template(brill.Word([-2, -1])),  # you can look at the combination of the previous two words to learn a transformation rule
        brill.Template(brill.Word([1, 2])),
        brill.Template(brill.Word([-3, -2, -1])),
        brill.Template(brill.Word([1, 2, 3])),
        brill.Template(brill.Word([-1]), brill.Word([1])),
    ]
    trainer = brill_trainer.BrillTaggerTrainer(initial_tagger, templates,
                                               deterministic=True)
    return trainer.train(train_sents, **kwargs)


defaultTagger = DefaultTagger('NN')
initialTagger = backoff_tagger(brown_train_sents,
                               [UnigramTagger, BigramTagger, TrigramTagger],
                               backoff=defaultTagger)
brillTagger = train_brill_tagger(initialTagger, brown_train_sents)

tnt_tagger = tnt.TnT(N=100)
tnt_tagger.train(brown_train_sents)

bigramTagger = BigramTagger(brown_train_sents)
trigramTagger = TrigramTagger(brown_train_sents)

print("------------Recommended Tagger------------")
print(nltk.pos_tag(sent))

print("------------Default Tagger------------")
print(defaultTagger.tag(sent))

print("------------Unigram Tagger Overrode------------")
unigramTagger = UnigramTagger(model={'Pierre': 'NN'})
print(unigramTagger.tag(sent))
trainer = []
for r, d, f in os.walk(source_txt):
    trainer.append(f)

text = '\n\n'.join([get_tags(source_txt + tr) for tr in trainer[0]])
list_of_tags = Pos(text).get_words()
list_of_tags.sort()
to_train = list(list_of_tags for list_of_tags, _ in itertools.groupby(list_of_tags))

all_wrd = np.asarray(Pos(text).get_words())[:, 0]
target_pos = np.asarray(Pos(text).get_words())[:, 1]

tnt_tot = tnt.TnT()
tnt_tot.train([list(zip(list(all_wrd), list(target_pos)))])

with open(path_to_save_model + 'tnt.pkl', 'wb') as h:
    pickle.dump(tnt_tot, h)

if dedup:
    all_wrd = np.asarray(to_train)[:, 0]
    target_pos = np.asarray(to_train)[:, 1]

### filter POS to keep only values that appear more than 10 times
all_string = [x for x in target_pos]
def hindi_model():
    train_data = indian.tagged_sents('hindi.pos')
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    return tnt_pos_tagger
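
# A minimal usage sketch for hindi_model() above, not part of the original
# snippet: it assumes the NLTK 'indian' corpus is available
# (nltk.download('indian')) and that `raw_hindi_text` is a hypothetical
# placeholder for a real Hindi input string.
if __name__ == '__main__':
    import nltk
    tagger = hindi_model()
    raw_hindi_text = "..."  # placeholder; replace with real Hindi text
    print(tagger.tag(nltk.word_tokenize(raw_hindi_text)))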
def updateScore(self, score):
    self.score += score


if __name__ == '__main__':
    file = codecs.open("hindi_text.txt", "r", "utf-8")
    # Read the contents of the file into memory.
    train_data_file = file.read()
    file.close()
    train_data1 = train_data_file.splitlines()

    file1 = codecs.open("hindi_output.txt", "w+", "utf-8")

    train_data = indian.tagged_sents('hindi.pos')
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)  # Training the tnt Part of speech tagger with hindi data

    for line in train_data1:
        s = tnt_pos_tagger.tag(nltk.word_tokenize(line))
        for x in s:
            for j in x:
                file1.write(j + " ")
        file1.write("\n")

    # Create dictionary  key: Fixed Part  Value: Idiom
    db_fp = defaultdict()
    # Create set containing CNFP
    db_cnfp = defaultdict(set)
    # Create set containing extra words
    db_extra_words = defaultdict(set)
          format(sys.argv[0]), file=sys.stderr)
    sys.exit(1)

posField = int(sys.argv[5]) - 1

with open(sys.argv[1], encoding='UTF-8') as FP_train,\
     open(sys.argv[2], encoding='UTF-8') as FP_test,\
     open(sys.argv[3], 'w', encoding='UTF-8') as FP_out:
    # XXX Unk not handled...
    # In Brants' version the default is:
    # sparse data : linear interpolation
    # unknown mode: statistics of singletons
    # using suffix trie up to length 10
    # case of characters is significant (Handled)
    tagger = tnt.TnT(C=True)
    sents = []
    sent = []

    print("Adding sentences...", file=sys.stderr)
    for line in FP_train:
        line = line.strip().split()
        if len(line) == 0:
            sents.append(sent)
            sent = []
        else:
            sent.append((line[posField], line[-1]))

    print("Training...", file=sys.stderr)
    tagger.train(sents)

    print("Tagging...", file=sys.stderr)
    sent = []
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    unigram_accuracies = []
    bigram_accuracies = []
    trigram_accuracies = []
    backoff_accuracies = []
    tnt_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list
    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists with ~347 sentences each
    ten_parts = list(chunks(pos_set, tenth))

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part

        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]

        # next concatenate the list together into 1 file (http://stackoverflow.com/a/952952)
        training_set = [item for sublist in training_set_lists for item in sublist]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()

        print('Loop #' + str(counter))

        # make and evaluate unigram tagger
        unigram_tagger = UnigramTagger(train_sents)
        unigram_accuracy = unigram_tagger.evaluate(test_sents)
        unigram_accuracies.append(unigram_accuracy)
        print('Unigram:', unigram_accuracy)

        # make and evaluate bigram tagger
        bigram_tagger = BigramTagger(train_sents)
        bigram_accuracy = bigram_tagger.evaluate(test_sents)
        bigram_accuracies.append(bigram_accuracy)
        print('Bigram:', bigram_accuracy)

        # make and evaluate trigram tagger
        trigram_tagger = TrigramTagger(train_sents)
        trigram_accuracy = trigram_tagger.evaluate(test_sents)
        trigram_accuracies.append(trigram_accuracy)
        print('Trigram:', trigram_accuracy)

        # make and evaluate 1, 2, 3-gram backoff tagger
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger3 = TrigramTagger(train_sents, backoff=tagger2)
        backoff_accuracy = tagger3.evaluate(test_sents)
        backoff_accuracies.append(backoff_accuracy)
        print('1, 2, 3-gram backoff:', backoff_accuracy)

        # make and evaluate tnt tagger
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        tnt_accuracy = tnt_tagger.evaluate(test_sents)
        tnt_accuracies.append(tnt_accuracy)
        print('TnT:', tnt_accuracy)

    final_accuracies_list = []

    mean_accuracy_unigram = mean(unigram_accuracies)
    standard_deviation_unigram = stdev(unigram_accuracies)
    uni = {'unigram': {'mean': mean_accuracy_unigram, 'sd': standard_deviation_unigram}}
    final_accuracies_list.append(uni)

    mean_accuracy_bigram = mean(bigram_accuracies)
    standard_deviation_bigram = stdev(bigram_accuracies)
    bi = {'bigram': {'mean': mean_accuracy_bigram, 'sd': standard_deviation_bigram}}
    final_accuracies_list.append(bi)

    mean_accuracy_trigram = mean(trigram_accuracies)
    standard_deviation_trigram = stdev(trigram_accuracies)
    tri = {'trigram': {'mean': mean_accuracy_trigram, 'sd': standard_deviation_trigram}}
    final_accuracies_list.append(tri)

    mean_accuracy_backoff = mean(backoff_accuracies)
    standard_deviation_backoff = stdev(backoff_accuracies)
    back = {'1, 2, 3-gram backoff': {'mean': mean_accuracy_backoff, 'sd': standard_deviation_backoff}}
    final_accuracies_list.append(back)

    mean_accuracy_tnt = mean(tnt_accuracies)
    standard_deviation_tnt = stdev(tnt_accuracies)
    tnt_score = {'tnt': {'mean': mean_accuracy_tnt, 'sd': standard_deviation_tnt}}
    final_accuracies_list.append(tnt_score)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    return final_dict
accuracys = []
accuracysSolo = []

# In[ ]:

for i in range(-1, -5, -1):
    # Without TnT
    affix_tagger = AffixTagger(train=train_reducido, affix_length=i)
    evalSolo = affix_tagger.evaluate(test_reducido)
    accuracysSolo.append(evalSolo)
    print("Suavizado Solo con Affix_Length = ", i, " Accuracy: ", evalSolo)

    # With TnT
    tnt_tagging = tnt.TnT(unk=affix_tagger, Trained=True)
    tnt_tagging.train(train_reducido)
    evaluacion = tnt_tagging.evaluate(test_reducido)
    accuracys.append(evaluacion)
    print("TnT Con suavizado Affix_Length = ", i, " Accuracy: ", evaluacion)

# In[ ]:

print(accuracys)

# In[ ]:
def nepali_model():
    data_path = os.path.join(os.getcwd(), 'data/nepali.pos')
    train_data = indian.tagged_sents(data_path)
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    return tnt_pos_tagger
from nltk.tag import tnt, pos_tag
from nltk.corpus import indian
from nltk.tokenize import TreebankWordTokenizer
import codecs
from googletrans import Translator
import re
import pandas as pd

translator = Translator()

train_data = indian.tagged_sents('hindi.pos')
t = tnt.TnT()
t.train(train_data)

filename = "text_regional.txt"
with codecs.open(filename, encoding="utf-8") as file:
    data = file.read()

with codecs.open("final_stopwords.txt", encoding="utf-8") as file:
    stopwordstext = file.read()

#stopwords = list(stopwords.words('hindi'))
stopwords = stopwordstext.split("\n")

words = [
    w for w in TreebankWordTokenizer().tokenize(data)
    if w not in stopwords and w not in ["|", ".", ","]
]

tags = t.tag(words)
tag_list = []
for i, j in tags:
    if j == "Unk":
        tr = translator.translate(i)
from src.appconfig import ApplicationConfig
import nltk
from nltk.tag import tnt
import pickle

#---------------------------------------Global Variables
TNT_POS_TAGGER = tnt.TnT()
#-------------------------------------------------------#


def loadPOSTaggerModelFromDisk():
    fileRef = open(ApplicationConfig.POS_TAGGER_MODEL_PICKLE_PATH, "rb")
    tnt_pos_tagger = pickle.load(fileRef)
    fileRef.close()
    return tnt_pos_tagger


def performPOSTagging(tokenList):
    #tokenList = nltk.word_tokenize("അകത്തി അടിമയായിത്തീരുക നശിക്കുക അരികെ പാളി ഉച്ചത്തിലുള്ള യായിത്")
    tnt_pos_tagger = loadPOSTaggerModelFromDisk()
    taggedOutput = tnt_pos_tagger.tag(tokenList)
    return taggedOutput


# ------------------------- TESTING MODULE ------------#
def testPosTagging():
    tokenList = nltk.word_tokenize(
from nltk.tag import tnt, RegexpTagger, DefaultTagger
from tag_util import train_sents, test_sents, patterns

tnt_tagger = tnt.TnT()
tnt_tagger.train(train_sents)
print(tnt_tagger.evaluate(test_sents))
# 0.875631340384

# deal with unknown tokens
default_tagger = DefaultTagger('NN')
unk_tagger = RegexpTagger(patterns, backoff=default_tagger)

tnt_tagger2 = tnt.TnT(unk=unk_tagger, Trained=True)
tnt_tagger2.train(train_sents)
print(tnt_tagger2.evaluate(test_sents))
# 0.896956615584
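
# An optional follow-up, an assumption rather than part of the original
# snippet: persist the trained tagger with pickle so it does not have to be
# retrained on every run. The file name 'tnt_unk_tagger.pickle' is hypothetical.
import pickle

with open('tnt_unk_tagger.pickle', 'wb') as f:
    pickle.dump(tnt_tagger2, f)

with open('tnt_unk_tagger.pickle', 'rb') as f:
    restored_tagger = pickle.load(f)

print(restored_tagger.evaluate(test_sents))  # should match the score printed above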
def train_hindi_model(model_path):
    train_data = indian.tagged_sents(model_path)
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    return tnt_pos_tagger
import nltk
from nltk.tag import DefaultTagger
from nltk.tag import tnt
from nltk.corpus import treebank

testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:7000]

# plain TnT tagger
tnt_tagger = tnt.TnT()
tnt_tagger.train(training)
print(tnt_tagger.evaluate(testing))

# TnT tagger that falls back to a default tagger for unknown words
unknown = DefaultTagger('NN')
tagger_tnt = tnt.TnT(unk=unknown, Trained=True)
tagger_tnt.train(training)
print(tagger_tnt.evaluate(testing))
# for loop splits text into sentences
training_sentences = []
sentence = []
for item in line_list:
    if '<sentence' in item[0]:
        sentence = []
    elif '/sentence' in item[0]:
        training_sentences.append(sentence)
    else:
        mapped_tag = mapping.get(item[2], 'ERROR')
        sentence.append((item[0], mapped_tag))

# initialize and train tagger
print('Training tagger...')
tnt_tagger = tnt.TnT()
tnt_tagger.train(training_sentences)

# Import Swahili development and testing data
print('Parsing test/dev data...')
old_books = open('hcs2_new_news.vrt', 'r').readlines()

# Parse Swahili development and testing files
POS_test_dev = [old_books]

# training and dev sets must be split by text first,
# not just sentences like the training data
all_texts = []
text = []
for lines in POS_test_dev:
from nltk.tag import hmm
from nltk.tag import tnt
import pickle
import numpy

with open('test', 'rb') as fp:
    test = pickle.load(fp)
with open('train', 'rb') as fp:
    train = pickle.load(fp)

# Train the tagger
#tagger_hmm = hmm.HiddenMarkovModelTagger.train(train)
tagger_tnt = tnt.TnT()
tagger_tnt.train(train)

# Evaluate the tagger
#print(tagger_hmm.evaluate(test))
print(tagger_tnt.evaluate(test))

# Tag the words of the test set
words = []
correct = []
for sentence in test:
    for word in sentence:
        words.append(word[0])
        correct.append(word)

#t = tagger_hmm.tag(words)
t = tagger_tnt.tag(words)
def evaluateTnT():
    tagger = tnt.TnT()
    train = int(len(fsents) * 0.9)
    tagger.train(rsents[:train])
    precisionTnT = tagger.evaluate(rsents[train:])
    print(precisionTnT)
def nepali_model():
    train_data = indian.tagged_sents('<path/to/nepali.pos>')
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    return tnt_pos_tagger
listeval = []
intervals = []
for iter in range(10):
    test = bloques[iter]
    train = []
    for element in bloques:
        if element != test:
            for item in element:
                train.append(item)

    # Affix tagger
    suffix_tagger = nltk.tag.AffixTagger(train=train, affix_length=-2)

    # Train the tagger
    tagger_tnt = tnt.TnT(N=100, unk=suffix_tagger, Trained=True)
    tagger_tnt.train(train)

    # Evaluate the tagger
    v = tagger_tnt.evaluate(test)
    d = 1.96 * math.sqrt((v * (1 - v)) / len(np.array(test).flatten()) / 2)
    ic = [round(v - d, 3), round(v + d, 3)]
    listeval.append(round(v, 3))
    intervals.append(ic)

v = 0
for val in listeval:
    v += val
va = v / 10
from nltk.corpus import treebank

train_data = treebank.tagged_sents()[:3]
test_data = treebank.tagged_sents()[300:400]
print test_data[0]

from nltk.tag import tnt
tnt_post_tagger = tnt.TnT()
tnt_post_tagger.train(train_data)
print tnt_post_tagger.evaluate(test_data)
# see http://textminingonline.com/dive-into-nltk-part-iii-part-of-speech-tagging-and-pos-tagger
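
# A hedged usage sketch, not in the original snippet: tagging a fresh sentence
# with the trained model. The sample sentence is illustrative only.
import nltk
print tnt_post_tagger.tag(nltk.word_tokenize("This is a simple test sentence."))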
def demo3():
    from nltk import tag
    from nltk.corpus import treebank, brown
    from nltk.tag import tnt

    d = list(treebank.tagged_sents())
    e = list(brown.tagged_sents())

    d = d[:1000]
    e = e[:1000]

    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):
        t = tnt.TnT(N=1000, C=False)
        s = tnt.TnT(N=1000, C=False)

        dtest = d[(i * d10):((i + 1) * d10)]
        etest = e[(i * e10):((i + 1) * e10)]

        dtrain = d[:(i * d10)] + d[((i + 1) * d10):]
        etrain = e[:(i * e10)] + e[((i + 1) * e10):]

        t.train(dtrain)
        s.train(etrain)

        tacc = tag.accuracy(t, dtest)
        tp_un = float(t.unknown) / float(t.known + t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = tag.accuracy(s, etest)
        sp_un = float(s.unknown) / float(s.known + s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += (tacc / tp_kn)
        sknacc += (sacc / sp_kn)
        tallacc += tacc
        sallacc += sacc

        #print i+1, (tacc / tp_kn), i+1, (sacc / sp_kn), i+1, tacc, i+1, sacc

    # t was trained on treebank (d), s on brown (e)
    print "treebank: acc over words known:", 10 * tknacc
    print "        : overall accuracy:", 10 * tallacc
    print "        : words known:", 10 * tknown
    print "brown: acc over words known:", 10 * sknacc
    print "     : overall accuracy:", 10 * sallacc
    print "     : words known:", 10 * sknown
def __init__(self, lang='en'):
    self.lang = lang
    self.stopwords = None
    self.stemmer = None
    self.sentiment_analyzer = None
    self.text_processor = None

    INDIC_NLP_RESOURCES = r"../model/indic_nlp_resources/"
    common.set_resources_path(INDIC_NLP_RESOURCES)
    self.pos_tagger = None

    if lang == 'hi':
        self.ht = HindiTokenizer.Tokenizer()
        self.sentiment_analyzer = load_learner(path="../model/hi-sentiment")
        self.stopwords = [x.strip() for x in open("../data/stopwords.txt").readlines()]
        other_exclusions = ["#ff", "ff", "rt"]
        self.stopwords.extend(other_exclusions)
        self.stemmer = None
        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'url', 'date', 'number'],
            # terms that will be annotated
            annotate={"hashtag", "allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens
        )
        loader.load()
        train_data = indian.tagged_sents('hindi.pos')
        self.tnt_pos_tagger = tnt.TnT()
        self.tnt_pos_tagger.train(train_data)

    if lang == 'en':
        self.sentiment_analyzer = VS()
        self.stopwords = nltk.corpus.stopwords.words("english")
        other_exclusions = ["#ff", "ff", "rt"]
        self.stopwords.extend(other_exclusions)
        self.stemmer = PorterStemmer()
        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'url', 'date', 'number'],
            # terms that will be annotated
            annotate={"hashtag", "allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens

            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter="twitter",

            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector="twitter",

            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=False,  # spell correction for elongated words

            # select a tokenizer. You can use SocialTokenizer, or pass your own
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,

            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionaries.
            dicts=[emoticons, slang]
        )