def no_backoff_taggers(test, train, corpus='floresta'):
    """Train and score standalone n-gram taggers (no backoff chain).

    The tagger returned by ``default_tagger_corpus(corpus)`` is scored
    first. Unigram, bigram and trigram taggers are then all trained on
    ``train`` before each is scored against ``test`` in that order.
    Every score is printed; nothing is returned.
    """
    baseline = default_tagger_corpus(corpus)

    info('training {} taggers without backoff'.format(corpus))
    info('this may take a while...\n')

    info(baseline)
    baseline_score = baseline.evaluate(test)
    print('accuracy score: {}\n'.format(baseline_score))

    # Train all three models up front, then evaluate them one at a time.
    trained = [
        tagger_cls(train)
        for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger)
    ]
    for tagger in trained:
        info(tagger)
        score = tagger.evaluate(test)
        print('accuracy score: {}\n'.format(score))
def pos_tag(self):
    """Tokenize via ``NLTKTokenize`` and tag the resulting tokens.

    ``self.options['tagger']`` selects the strategy. ``'unigram'``,
    ``'bigram'`` and ``'regex'`` use taggers trained on the Brown category
    named by ``self.options['train']`` (when listed in ``TRAINERS``,
    otherwise ``DEFAULT_TRAIN``); trained unigram/bigram taggers are cached
    as pickles under ``trained/`` next to this file. When the option
    (defaulting to ``DEFAULT_TAGGER``) equals ``'pos'`` the module-level
    ``pos_tag`` callable is used. Any other value leaves the tag list
    empty. The tags are returned through ``self._dump``.
    """

    def cached_tagger(path, build):
        # Reuse a previously pickled tagger, otherwise train one and pickle it.
        if os.path.isfile(path):
            with open(path, 'rb') as handle:
                return load(handle)
        tagger = build()
        with open(path, 'wb') as handle:
            dump(tagger, handle, -1)
        return tagger

    tokens = NLTKTokenize(self.options).tokenize()['result']
    tags = []

    # Performs Bigram / Unigram / Regex Tagging
    if self.options.get('tagger') in ['unigram', 'bigram', 'regex']:
        trainer = DEFAULT_TRAIN
        if self.options.get('train') in TRAINERS:
            trainer = self.options['train']
        train = brown.tagged_sents(categories=trainer)

        # Create your custom regex tagging pattern here
        # NOTE(review): the '.' inside the number pattern is unescaped and
        # therefore matches any character -- confirm this is intended.
        regex_patterns = [
            (r'^[-\:]?[0-9]+(.[0-9]+)?$', 'CD'),
            (r'.*able$', 'JJ'),
            (r'^[A-Z].*$', 'NNP'),
            (r'.*ly$', 'RB'),
            (r'.*s$', 'NNS'),
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            (r'.*', 'NN'),
        ]
        regex_tag = RegexpTagger(regex_patterns)

        current = os.path.dirname(os.path.abspath(__file__))

        # The unigram tagger is always prepared: it is both a strategy of
        # its own and the backoff of the bigram tagger.
        unigram_tag = cached_tagger(
            current + '/trained/unigram_' + trainer + '.pkl',
            lambda: UnigramTagger(train, backoff=regex_tag))

        selected = self.options['tagger']
        if selected == 'bigram':
            bigram_tag = cached_tagger(
                current + '/trained/bigram_' + trainer + '.pkl',
                lambda: BigramTagger(train, backoff=unigram_tag))
            tags = bigram_tag.tag(tokens)  # Bigram tagging performed here
        elif selected == 'unigram':
            tags = unigram_tag.tag(tokens)  # Unigram tagging performed here
        else:
            tags = regex_tag.tag(tokens)  # Regex tagging performed here

    # Performs default pos_tag
    elif self.options.get('tagger', DEFAULT_TAGGER) == 'pos':
        tags = pos_tag(tokens)

    return self._dump(tags)
def backoff_taggers(test, train, save, corpus='floresta'):
    """Train, score and optionally save a unigram/bigram/trigram backoff chain.

    The tagger returned by ``default_tagger_corpus(corpus)`` is scored and
    used as the last backoff. Each n-gram tagger backs off to the next
    simpler one. All scores are printed.

    Args:
        test: tagged sentences used for evaluation.
        train: tagged sentences used for training.
        save: when truthy, the best-scoring tagger is pickled to
            ``<corpus>_<unigram|bigram|trigram>_tagger_backoff.pkl`` in the
            current working directory.
        corpus: corpus name, used for the default tagger and the file name.

    Returns:
        None.
    """
    default_tagger = default_tagger_corpus(corpus)

    info('training {} taggers with backoff'.format(corpus))
    info('this may take a while...\n')

    info(default_tagger)
    default_score = default_tagger.evaluate(test)
    print('accuracy score: {}\n'.format(default_score))

    # Each tagger falls back to the next simpler model.
    uni_tagger_backoff = UnigramTagger(train, backoff=default_tagger)
    bi_tagger_backoff = BigramTagger(train, backoff=uni_tagger_backoff)
    tri_tagger_backoff = TrigramTagger(train, backoff=bi_tagger_backoff)

    scored = []
    for label, tagger in (('unigram', uni_tagger_backoff),
                          ('bigram', bi_tagger_backoff),
                          ('trigram', tri_tagger_backoff)):
        info(tagger)
        score = tagger.evaluate(test)
        print('accuracy score: {}\n'.format(score))
        scored.append((label, tagger, score))

    if not save:
        return

    # max() returns the first maximal entry, so on ties the simpler model
    # wins (unigram before bigram before trigram).
    best_label, best_tagger, _ = max(scored, key=lambda entry: entry[2])
    tagger_file = '{}_{}_tagger_backoff.pkl'.format(corpus, best_label)

    # Previously the trigram branch wrote to a file object that was never
    # opened; one code path with a context manager fixes that and
    # guarantees the handle is closed.
    with open(tagger_file, 'wb') as output:
        dump(best_tagger, output, -1)
    info('saving {}...\n'.format(tagger_file))
def __init__(self, mode, train_sents):
    """Train the tagger selected by ``mode`` on ``train_sents``.

    Args:
        mode: ``TRIGRAM`` for a unigram -> bigram -> trigram backoff chain,
            ``HDM`` for a ``HiddenMarkovModelTagger``.
        train_sents: tagged sentences used for training.

    Raises:
        ValueError: if ``mode`` is neither ``TRIGRAM`` nor ``HDM``.
    """
    if mode == TRIGRAM:
        unigram = UnigramTagger(train_sents)
        bigram = BigramTagger(train_sents, backoff=unigram)
        self.tagger = TrigramTagger(train_sents, backoff=bigram)
    elif mode == HDM:
        # The original tested the constant itself (``elif HDM``), which
        # ignores ``mode`` entirely.
        self.tagger = HiddenMarkovModelTagger.train(train_sents)
    else:
        raise ValueError('unsupported tagger mode: {!r}'.format(mode))
def __init__(self, train=None, default=None, name=None):
    """Assemble a bigram -> unigram -> regexp -> ``default`` backoff chain.

    ``train`` is passed to both n-gram taggers; ``default`` is the backoff
    of the regexp tagger; ``name`` is stored unchanged.
    """
    self.name = name
    self.default = default

    # As found on page 199 of the nltk book
    # NOTE(review): the '.' in the cardinal-number pattern is unescaped and
    # so matches any character -- confirm this is intended.
    suffix_rules = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
    ]

    # Build from the most general tagger up to the most specific one.
    self.regex = RegexpTagger(suffix_rules, backoff=default)
    self.unigram = UnigramTagger(train=train, backoff=self.regex)
    self.bigram = BigramTagger(train=train, backoff=self.unigram)
def get_pos_tagger():
    """Build a Brown-trained trigram backoff tagger with quantifier overrides.

    Lookup order: quantifier regexps -> trigram -> bigram -> unigram ->
    suffix/shape regexps, whose final catch-all pattern yields 'NN'.
    """
    from nltk.corpus import brown

    fallback_patterns = [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),  # adjectives
        (r'.*ness$', 'NN'),  # nouns formed from adjectives
        (r'.*ly$', 'RB'),  # adverbs
        (r'.*s$', 'NNS'),  # plural nouns
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # past tense verbs
        (r'.*', 'NN'),  # nouns (default)
    ]
    chain = nltk.RegexpTagger(fallback_patterns)

    # Stack the n-gram taggers, each backing off to the previous link.
    sentences = brown.tagged_sents()
    for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        chain = tagger_cls(sentences, backoff=chain)

    # Override particular words
    quantifier_overrides = [
        (r'(A|a|An|an)$', 'ex_quant'),
        (r'(Every|every|All|all)$', 'univ_quant'),
    ]
    return nltk.RegexpTagger(quantifier_overrides, backoff=chain)
def train_tagger(corpus_name, corpus): """ Train the taggers and saves them Args: corpus_name: name of the corpus used to create the tagger corpus: corpus for creating the tagger """ #List of n-gram taggers names complete_names = [corpus_name + '_' + x for x in N_GRAM_NAMES] # Training UnigramTagger tagger1 = UnigramTagger(corpus) utilities.save_pickle(tagger1, complete_names[0], TAGGER_EXTENSION, TAGGER_PATH) print "UnigramTagger trained with", corpus_name # Training BigramTagger tagger2 = BigramTagger(corpus) utilities.save_pickle(tagger2, complete_names[1], TAGGER_EXTENSION, TAGGER_PATH) print "BigramTagger trained with", corpus_name # Training TrigramTagger tagger3 = TrigramTagger(corpus) utilities.save_pickle(tagger3, complete_names[2], TAGGER_EXTENSION, TAGGER_PATH) print "TrigramTagger trained with", corpus_name
def __init__(self, train_sents, to_detect_list, n_gram=1):
    """Train an n-gram backoff tagger over the last two fields of each triple.

    ``n_gram`` of 1 gives a unigram tagger, 2 adds a bigram tagger on top,
    anything above 2 adds a trigram tagger as well. ``to_detect_list`` is
    stored unchanged.
    """
    # Each sentence item is a triple; the first field is dropped
    # (presumably (word, tag, chunk) -- verify against the caller).
    pairs = [[(second, third) for _, second, third in sentence]
             for sentence in train_sents]

    tagger = UnigramTagger(pairs)
    if n_gram > 1:
        tagger = BigramTagger(pairs, backoff=tagger)
    if n_gram > 2:
        tagger = TrigramTagger(pairs, backoff=tagger)

    self.tagger = tagger
    self.to_detect_list = to_detect_list
def TrainTaggers(training, testing):
    """Train a unigram -> bigram -> trigram chain and record its score.

    The chain ultimately backs off to the module-level ``default`` tagger.
    The trigram tagger's ``evaluate(testing)`` result is appended to the
    global ``results``.
    """
    global results

    chain = default
    stages = (
        (UnigramTagger, 'unigram trained'),
        (BigramTagger, 'bigram trained'),
        (TrigramTagger, 'trigram trained'),
    )
    for tagger_cls, progress in stages:
        chain = tagger_cls(training, backoff=chain)
        print(progress)

    score = chain.evaluate(testing)
    results += [score]
def train(self, model_path):
    """Train a bigram tagger on the lower-cased ``CORPUS`` and pickle it.

    The bigram tagger backs off to a unigram tagger, which in turn backs
    off to a default tagger emitting 'UNK'. The model is written to
    ``model_path``.
    """
    lowered = []
    for sent in CORPUS:
        lowered.append([(token.lower(), tag) for token, tag in sent])

    fallback = DefaultTagger('UNK')
    unigram = UnigramTagger(lowered, backoff=fallback)
    model = BigramTagger(lowered, backoff=unigram)

    with open(model_path, "wb") as sink:
        pickle.dump(model, sink)
def find_combined_taggers_accuracy(train_set, test_set):
    """Print the accuracy of several tagger combinations on ``test_set``.

    Evaluated, in order: a default tagger using the most frequent training
    tag, a regexp tagger, a unigram tagger backing off to the default
    tagger, and three bigram taggers (no backoff, regexp backoff, unigram
    backoff). Nothing is returned.
    """
    # Default tagger: always answers with the most frequent training tag.
    tag_counts = FreqDist(
        tag for sent in train_set for (word, tag) in sent)
    default_tagger = DefaultTagger(tag_counts.max())
    print("Default Tagger accuracy: ", default_tagger.evaluate(test_set))

    # Regexp tagger built from suffix/shape heuristics.
    regex_tagger = RegexpTagger([
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN'),  # nouns (default)
    ])
    print("Regex Tagger Accuracy: ", regex_tagger.evaluate(test_set))

    # Unigram tagger with the default tagger as backoff.
    unigram_tagger = UnigramTagger(train_set, backoff=default_tagger)
    print("Unigram Tagger accuracy (Backoff = Default Tagger): ",
          unigram_tagger.evaluate(test_set))

    # Bigram taggers: train all three variants first, then score them.
    plain_bigram = BigramTagger(train_set)
    bigram_over_unigram = BigramTagger(train_set, backoff=unigram_tagger)
    bigram_over_regex = BigramTagger(train_set, backoff=regex_tagger)

    plain_score = plain_bigram.evaluate(test_set)
    regex_backoff_score = bigram_over_regex.evaluate(test_set)
    unigram_backoff_score = bigram_over_unigram.evaluate(test_set)

    print("Bigram Tagger Accuracy: ", plain_score)
    print("Bigram Tagger Accuracy (Backoff = Regex Tagger): ",
          regex_backoff_score)
    print("Bigram Tagger Accuracy (Backoff = Unigram Tagger): ",
          unigram_backoff_score)
def __init__(self, train_sents):
    """Train a trigram tagger with a full backoff chain.

    train_sents: already-tagged sentences used to train every n-gram
    level. The chain is trigram -> bigram -> unigram -> default 'NN'.
    """
    chain = DefaultTagger('NN')
    for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        # Each new level backs off to everything built so far.
        chain = tagger_cls(train_sents, backoff=chain)
    self.tagger = chain
def ngram_tag_with_backoff():
    """Score a bigram tagger that backs off to a word -> tag lookup tagger.

    The lookup model maps Brown 'news' words to their most frequent tag.
    The bigram tagger is trained on the first 90% of the module-level
    ``brown_tagged_sents`` and evaluated on the rest. The held-out slice
    and the score are printed.
    """
    word_counts = FreqDist(brown.words(categories='news'))
    # Tag distribution per word, used to pick each word's most likely tag.
    tags_by_word = ConditionalFreqDist(
        brown.tagged_words(categories='news'))

    # Up to 1,000,000 of the most common words, each mapped to its most
    # frequent tag.
    likely_tags = {
        word: tags_by_word[word].max()
        for (word, _) in word_counts.most_common(1000000)
    }

    # A unigram tagger driven by a fixed model needs no training data.
    lookup_tagger = UnigramTagger(model=likely_tags)

    # With backoff: 90/10 train/test split.
    split_at = int(len(brown_tagged_sents) * 0.9)
    held_out = brown_tagged_sents[split_at:]
    print(held_out)

    bigram_tagger = BigramTagger(brown_tagged_sents[:split_at],
                                 backoff=lookup_tagger)
    print(bigram_tagger.evaluate(held_out))
def train_tagger(corpus_name, corpus):
    """Train and save a unigram tagger and a bigram tagger backed by it.

    The taggers are saved through ``save_tagger`` as
    ``<corpus_name>_unigram.tagger`` and ``<corpus_name>_bigram.tagger``.
    A confirmation message is written to stderr.
    """
    unigram = UnigramTagger(corpus)
    save_tagger('{}_unigram.tagger'.format(corpus_name), unigram)

    # The bigram tagger falls back to the unigram tagger trained above.
    bigram = BigramTagger(corpus, backoff=unigram)
    save_tagger('{}_bigram.tagger'.format(corpus_name), bigram)

    template = ("Tagger trained with {} using "
                "UnigramTagger and BigramTagger.")
    print(template.format(corpus_name), file=sys.stderr)
def create_tagger(sents, patterns=PATTERNS, maxngram=4):
    '''Train a backoff tagger on a corpus of tagged sentences.

    Chain, most specific first: n-gram of order ``maxngram`` -> trigram ->
    bigram -> unigram -> regexp (``patterns``) -> default 'NN'.
    '''
    chain = RegexpTagger(patterns, backoff=DefaultTagger('NN'))
    for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        chain = tagger_cls(sents, backoff=chain)
    return NgramTagger(maxngram, sents, backoff=chain)
class BigramChunker(ChunkParserI):
    """Chunker that predicts chunk tags from POS tags with a bigram tagger."""

    def __init__(self, train_sentences):
        """Train the bigram tagger on (POS, chunk tag) pairs.

        Each training sentence is converted with ``tree2conlltags`` and
        the word field of every triple is dropped.
        """
        train_data = []
        for sent in train_sentences:
            train_data.append(
                [(pos, chunk) for _, pos, chunk in tree2conlltags(sent)])
        self.tagger = BigramTagger(train_data)

    def parse(self, sentence):
        """Chunk a list of (word, POS) pairs and return the resulting tree."""
        pos_only = [pos for (_, pos) in sentence]
        predicted = self.tagger.tag(pos_only)
        # Re-attach each predicted chunk tag to its original (word, POS).
        conll_tags = [
            (word, pos, chunk_tag)
            for (word, pos), (_, chunk_tag) in zip(sentence, predicted)
        ]
        return conlltags2tree(conll_tags)
def get_tagger(type="StandfordPOSTagger"):
    """Return a POS tagger.

    ``type == "Custom"`` yields a bigram tagger trained on Brown 'news'
    with the universal tagset (backoff: unigram, then default 'NOUN').
    Any other value yields a ``StanfordPOSTagger`` built from the bundled
    model and jar paths.
    """
    if type != "Custom":
        return StanfordPOSTagger(
            'data/./models/wsj-0-18-bidirectional-distsim.tagger',
            '3rdparty_libs/stanford-postagger.jar')

    news_sents = brown.tagged_sents(categories='news', tagset='universal')
    fallback = DefaultTagger('NOUN')
    unigram = UnigramTagger(news_sents, backoff=fallback)
    return BigramTagger(news_sents, backoff=unigram)
def __init__(self):
    """Load the cached Spanish tagger, training and caching it if missing.

    The tagger is read from ``tagger_spanish.pickle`` in the current
    working directory. When that file does not exist, a unigram tagger is
    trained on the ``cess_esp`` corpus and pickled there.
    """
    if os.path.exists('tagger_spanish.pickle'):
        # Pickle data is binary; text mode fails on Python 3 and can
        # corrupt data on Windows.
        with open('tagger_spanish.pickle', 'rb') as file_obj:
            self.tagger = pickle.load(file_obj)
    else:
        print('tagger_spanish.pickle not found. Training tagger... may take a few minutes...')
        from nltk import UnigramTagger
        from nltk.corpus import cess_esp

        sents = cess_esp.tagged_sents()
        # NOTE(review): the original also trained a BigramTagger backed by
        # this unigram tagger but never used it; only the unigram tagger
        # was stored and pickled. That wasted training step is dropped.
        # If the bigram tagger was the intended model, assign it here.
        self.tagger = UnigramTagger(sents)
        with open('tagger_spanish.pickle', 'wb') as file_obj:
            pickle.dump(self.tagger, file_obj)  # Dump trained tagger
def trained_tagger():
    """Train a trigram backoff tagger, pickle it and return it.

    Training data is the concatenation of the Brown, CoNLL-2000 and
    Treebank tagged sentences. The chain is trigram -> bigram -> unigram
    -> default 'NN'. The model is written to
    ``DataBase/trained_tagger.pkl`` (relative to the working directory).

    Returns:
        The trained trigram tagger.
    """
    # Aggregate trained sentences for N-Gram Taggers
    train_sents = nltk.corpus.brown.tagged_sents()
    train_sents += nltk.corpus.conll2000.tagged_sents()
    train_sents += nltk.corpus.treebank.tagged_sents()

    t0 = DefaultTagger('NN')
    t1 = UnigramTagger(train_sents, backoff=t0)
    t2 = BigramTagger(train_sents, backoff=t1)
    trigram_tagger = TrigramTagger(train_sents, backoff=t2)

    # Context manager so the handle is flushed and closed even if
    # pickling fails.
    with open(r'DataBase/trained_tagger.pkl', 'wb') as model_file:
        pickle.dump(trigram_tagger, model_file)

    return trigram_tagger
def train_and_save_bigram_tagger():
    """Train a Brown bigram backoff tagger and pickle it.

    Chain: bigram -> unigram -> regexp heuristics (ending in a catch-all
    'NN' pattern). The model is written to
    ``../taggers/bigram_tagger.pkl`` with the highest pickle protocol.
    """
    train_text = brown.tagged_sents()

    # NOTE(review): the '.' in the cardinal-number pattern is unescaped and
    # so matches any character -- confirm this is intended.
    regexp_tagger = RegexpTagger([
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),  # adjectives
        (r'.*ness$', 'NN'),  # nouns formed from adjectives
        (r'.*ly$', 'RB'),  # adverbs
        (r'.*s$', 'NNS'),  # plural nouns
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # past tense verbs
        (r'.*', 'NN'),  # nouns (default)
    ])
    unigram_tagger = UnigramTagger(train_text, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(train_text, backoff=unigram_tagger)

    # Context manager so the file is closed even when dump() raises.
    with open('../taggers/bigram_tagger.pkl', 'wb') as output:
        dump(bigram_tagger, output, -1)
def trained_tagger():
    """Train and pickle a trigram backoff tagger unless one already exists.

    When ``DataBase/trained_tagger.pkl`` exists under the current working
    directory, a message is printed and nothing else happens. Otherwise a
    trigram -> bigram -> unigram -> default 'NN' chain is trained on the
    Brown, CoNLL-2000 and Treebank tagged sentences and pickled to that
    path.

    Returns:
        None in both cases; the tagger itself is not returned.
    """
    if os.path.exists(os.path.join(os.getcwd(), r"DataBase/trained_tagger.pkl")):
        print("Trained Tagger File already Exists..")
        return

    # Aggregate trained sentences for N-Gram Taggers
    train_sents = nltk.corpus.brown.tagged_sents()
    train_sents += nltk.corpus.conll2000.tagged_sents()
    train_sents += nltk.corpus.treebank.tagged_sents()

    t0 = DefaultTagger('NN')
    t1 = UnigramTagger(train_sents, backoff=t0)
    t2 = BigramTagger(train_sents, backoff=t1)
    trigram_tagger = TrigramTagger(train_sents, backoff=t2)

    # Context manager so the handle is flushed and closed even if
    # pickling fails.
    with open(r'DataBase/trained_tagger.pkl', 'wb') as model_file:
        pickle.dump(trigram_tagger, model_file)
def __init__(self, train_sents, load=False): if load: print 'Loading saved tagger...', self.load() print 'done.' else: time_start = time.time() print 'Training the tagger...' tag_counts = Counter([t for s in train_sents for w, t in s]) default_tag = argmax(tag_counts) def_tgr = DefaultTagger(default_tag) af_tgr = AffixTagger(train_sents, affix_length=-3, backoff=def_tgr) uni_tgr = UnigramTagger(train_sents, backoff=af_tgr) bi_tgr = BigramTagger(train_sents, backoff=uni_tgr) tri_tgr = TrigramTagger(train_sents, backoff=bi_tgr) self.tgr = tri_tgr print 'Done.' time_stop = time.time() print 'Training time: {0:.2f}s'.format(time_stop - time_start)
def prepare_toolset():
    """Build the dictionary of NLP tools used for tagging and lemmatizing.

    Returns:
        dict with keys:
            'tgr'  -- regexp tagger backing off to a trigram -> bigram ->
                      unigram -> default chain trained on the Brown
                      'learned', 'news' and 'reviews' categories with the
                      universal tagset;
            'sw'   -- English stop words;
            'lr'   -- a ``WordNetLemmatizer``;
            'wntg' -- map from universal tags to WordNet POS constants.
    """
    toolset = {}

    # NOTE(review): the first pattern's character class is [\.1-9], so
    # tokens containing '0' are not matched as NUM, and a lone '.' is --
    # verify whether [\.0-9] was intended.
    patterns = [
        (r'^[\.1-9]+$', 'NUM'),
        (r'^[^a-zA-Z]+$', '.'),
        (r'^[^a-zA-Z]*[a-zA-Z]+[-\'][a-zA-Z]+[^a-zA-Z]*$', 'NOUN'),
        (r'^.*[a-zA-Z]+[^-a-zA-Z]+[a-zA-Z]+.*$', '.'),
    ]

    train_set = (
        brown.tagged_sents(categories='learned', tagset='universal')
        + brown.tagged_sents(categories='news', tagset='universal')
        + brown.tagged_sents(categories='reviews', tagset='universal')
    )

    # The training data uses the universal tagset, so the last-resort tag
    # must be a universal tag too. The previous 'NN' is a Brown/Penn tag
    # and is missing from the 'wntg' map below.
    utgr = UnigramTagger(train=train_set, backoff=DefaultTagger('NOUN'))
    btgr = BigramTagger(train=train_set, backoff=utgr)
    ttgr = TrigramTagger(train=train_set, backoff=btgr)

    toolset['tgr'] = RegexpTagger(regexps=patterns, backoff=ttgr)
    toolset['sw'] = stopwords.words('english')
    toolset['lr'] = WordNetLemmatizer()
    toolset['wntg'] = {
        'NOUN': wordnet.NOUN,
        'VERB': wordnet.VERB,
        'ADJ': wordnet.ADJ,
        'ADV': wordnet.ADV,
        'X': wordnet.NOUN,
    }

    print('Tools Ready')
    return toolset
def train_brill_tagger(train_data):
    """Train a Brill tagger on top of a trigram backoff chain.

    The initial tagger is trigram -> bigram -> unigram -> regexp rules
    (ending in a catch-all 'CMN' pattern). Brill training uses the
    ``fntbl37`` templates with at most 10 rules. The trigram tagger (not
    the Brill tagger) is pickled to ``t2.pkl``.

    Args:
        train_data: tagged sentences used for every training stage.

    Returns:
        The trained Brill tagger.
    """
    from pickle import dump

    import nltk
    from nltk import BigramTagger, UnigramTagger, TrigramTagger
    # The brill tagger module in NLTK.
    from nltk.tag.brill_trainer import BrillTaggerTrainer

    templates = nltk.tag.brill.fntbl37()

    # Regular expression (Regex) Tagger as a default tagger
    default_tagger = nltk.RegexpTagger([
        (r'^[Jj]ing', 'ABN'),
        (r'^[pP]yn', 'CAV'),
        (r'^[nN]ga$', '1PSG'),
        (r'^[pP]hi$', '2PG'),
        (r'^[pP]ha$', '2PF'),
        (r'^[mM]e$', '2PM'),
        (r'^[iI]$', '3PSG'),
        (r'^[bB]an$', 'INP'),
        (r'^[Kk]a$', '3PSF'),
        (r'^[uU]$', '3PSM'),
        (r'^[kK]i$', '3PPG'),
        (r'(sha|da|na|hapoh|halor|ha|naduh|shaduh|hapdeng|haduh)$', 'IN'),
        (r'(bad|ruh|namar|hynrei|tangba|katba|katta)$', 'COC'),
        (r'(lada|haba|khnang|ynda)$', 'SUC'),
        (r'(katkum|kat|pat|wat|tang|lang)$', 'AD'),
        (r'(bun|baroh)$', 'QNT'),
        (r'^-?[0-9]+(.[0-9]+)?$', 'CN'),
        (r'(dei|long|don)$', 'CO'),
        (r'^[jJ]ong$', 'POP'),
        (r'^[sS]hah$', 'PAV'),
        (r'^[lL]ah$', 'MOD'),
        (r'^[lL]a$', 'VST'),
        (r'(ym|em|khlem|nym|kam)$', 'NEG'),
        (r'^hi$', 'EM'),
        (r'.*lade$', 'RFP'),
        (r'(dang|nang)$', 'VPP'),
        (r'([uU]n|[kK]an|[kK]in|[sS]a|[yY]n|[nN]gin|[pP]hin)$', 'VFT'),
        (r'(.*ngut|.*tylli)$', 'ADJ'),
        (r'^[bB]a$', 'COM'),
        (r'^\W+$', 'SYM'),
        (r'[^a-z\W]a$', 'IN'),
        (r'([vV]ote|[bB]ye|[cC]onstituency|[sS]outh)$', 'FR'),
        (r'.*', 'CMN'),
    ])

    t0 = default_tagger
    print(train_data)
    t1 = UnigramTagger(train_data, backoff=t0)
    t2 = BigramTagger(train_data, backoff=t1)
    t3 = TrigramTagger(train_data, backoff=t2)

    trainer = BrillTaggerTrainer(initial_tagger=t3,
                                 templates=templates,
                                 trace=3,
                                 deterministic=True)
    brill_tagger = trainer.train(train_data, max_rules=10)

    # Saving the Tagger for future use
    # NOTE(review): the file is named t2.pkl but holds the trigram tagger
    # t3, and the Brill tagger is not saved -- confirm this is intended.
    with open('t2.pkl', 'wb') as output:
        dump(t3, output, -1)

    return brill_tagger
def __init__(self, train_sents):
    """Train a trigram tagger backing off to bigram, unigram and 'NN'."""
    chain = DefaultTagger('NN')
    for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        # Each new level backs off to everything built so far.
        chain = tagger_cls(train_sents, backoff=chain)
    self.tagger = chain
f.close()

from nltk import DefaultTagger, UnigramTagger, BigramTagger
from nltk.corpus import brown

# This section is recompiled from the Natural Language Processing Book:
# http://nltk.googlecode.com/svn/trunk/doc/book/book.html
brown_news_tagged = brown.tagged_sents(categories='news')

# Automatic tagging of a sentence, based on the Brown News corpus.
# The first 90% of the tagged sentences are used for training.
size = int(len(brown_news_tagged) * 0.9)
brown_news_train = brown_news_tagged[:size]
# NOTE(review): this standalone tagger is not used below; kept because the
# name is module-level and may be referenced elsewhere.
unigram_tagger = UnigramTagger(brown_news_train)

# Uses BigramTagger -- if it fails, it uses the UnigramTagger -- if it
# fails, it uses DefaultTagger
t0 = DefaultTagger('NN')
t1 = UnigramTagger(brown_news_train, backoff=t0)
tagger = BigramTagger(brown_news_train, backoff=t1)

# Compiled once, outside the loops. match() anchors at the start, so this
# selects every tag beginning with "NP".
is_np = re.compile(r"NP")

# Context manager so the output file is closed even if tagging raises.
with open("/tmp/proper.txt", "w") as tmp:
    for line in text:
        tagged = tagger.tag(line.split(" "))
        for w in tagged:
            if is_np.match(w[1]):
                tmp.write(w[0] + "\n")

# Sort list and remove duplicates
os.system("cat /tmp/proper.txt | sort | uniq")
#b)
# Regexp patterns taken from nltk.org/book/chp05.html, section 4.2.
patterns = [
    (r'.*ing$', 'VBG'),  # gerunds
    (r'.*ed$', 'VBD'),  # simple past
    (r'.*es$', 'VBZ'),  # 3rd singular present
    (r'.*ould$', 'MD'),  # modal
    (r'.*\'s$', 'NN$'),  # possessive nouns
    (r'.*s$', 'NNS'),  # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN'),  # nouns (default)
]
regexp_tagger = RegexpTagger(patterns)

# Brown chain: trigram -> bigram -> unigram -> default.
uniB = UnigramTagger(brownT90, backoff=defaultTB90)
biB = BigramTagger(brownT90, backoff=uniB)
triB = TrigramTagger(brownT90, backoff=biB)

# Chat chain, built the same way. The trigram tagger previously backed off
# straight to uniC, leaving the freshly trained biC unused.
uniC = UnigramTagger(chatT50, backoff=defaultTChat50)
biC = BigramTagger(chatT50, backoff=uniC)
triC = TrigramTagger(chatT50, backoff=biC)

print("Regextag50/50: ", regexp_tagger.evaluate(brownT50))
print("Default: ", defaultTB90.evaluate(brownT50))
print("Bigram Brown 50/50: ",
      BigramTagger(brownT50, backoff=defaultTB50).evaluate(brownT50))
print("Default: ", defaultTB50.evaluate(brownT50))
# NOTE(review): both bigram lines evaluate on the same data they were
# trained on -- confirm whether held-out splits were intended.
print("Bigram Brown 90/10: ",
      BigramTagger(brownT90, backoff=defaultTB90).evaluate(brownT90))
# In[17]: display() # ## N-Gram Tagger # Unigram taggers assign to each wort $w_n$ the tag $t_n$, which is the most frequent tag for $w_n$ in the training corpus. N-Gram taggers are a generalization of Unigram-Taggers. During training they determine for each combination of $N-1$ previous tags $t_{n-1},t_{n-2},...$ and the current word $w_n$ the most frequent tag $t_n$. Tagging is then realized, by inspecting the $n-1$ previous tags and the current word $w_n$ and assigning the most frequent tag, which appeared for this combination in the training corpus. # ![NgramTagging](https://maucher.home.hdm-stuttgart.de/Pics/NGramTagging.png) # In[18]: baseline=nltk.DefaultTagger('NOUN') unigram = UnigramTagger(train=train_sents,backoff=baseline) bigram = BigramTagger(train=train_sents,backoff=unigram) # In[19]: bigram.evaluate(test_sents) # # Find most frequent nouns # The most frequent nouns usually provide information on the subject of a text. Below, the most frequent nouns of an already tagged text of the *Treebank*-corpus are determined. Let's see if we can conclude the text's subject. # In[20]: from nltk.corpus import treebank
def createModel(self):
    """Train the backoff tagger chain, pick the best scorer and pickle it.

    ``self.corpusSents`` is split by ``self.training_portion``. A regexp
    tagger built from ``self.regex_list`` plus the ``postaggers.regex``
    config section is the last backoff of a unigram -> bigram -> trigram
    -> n-gram (``self.max_ngrams``) chain. When a held-out portion exists
    (``training_portion != 1``) every tagger is scored on it and the best
    one is pickled under ``self.taggers_path``.

    Returns:
        The path of the saved model, or ``None`` when nothing was saved
        (no held-out data, or an error was caught).
    """
    # NOTE(review): the original indentation was lost. Scoring and saving
    # are assumed to sit inside the ``training_portion != 1`` check,
    # because they depend on names defined only there.
    model_name = None
    try:
        unigrams = self.buildUnigrams()
        N = len(self.corpusSents)
        # round() returns a float on Python 2; slice bounds must be ints.
        toTraining = int(round(self.training_portion * N))
        training = self.corpusSents[:toTraining]
        test = self.corpusSents[toTraining:]

        post_patterns = []
        for regex, post in self.regex_list:
            try:
                regex = regex.decode('utf-8')
            except (AttributeError, UnicodeError):
                # Already text, or not decodable: keep the pattern as given.
                pass
            post_patterns.append((regex, post))
        for regex, post in self.config.items('postaggers.regex'):
            post_patterns.append((regex.decode('utf-8'), post))

        regexpTagger = RegexpTagger(post_patterns)
        unigramTagger = UnigramTagger(unigrams + training,
                                      backoff=regexpTagger)
        bigramTagger = BigramTagger(training, backoff=unigramTagger)
        trigramTagger = TrigramTagger(training, backoff=bigramTagger)
        NTagger = NgramTagger(self.max_ngrams, training,
                              backoff=trigramTagger)

        print("Sentencias de entrenamiento para n-taggers:" + str(len(training)))
        print("Sentencias de entrenamiento para unitaggers:" + str(len(unigrams)))
        print("Cantidad de palabras ADICIONALES de DICCIONARIOS para el unitagger:" + str(len(unigrams)))
        print("Sentencias para testing:" + str(len(test)))
        print("Expresiones regulares para el Tagger:")
        for post_regex in post_patterns:
            print(post_regex)

        if self.training_portion != 1:
            score_ut = unigramTagger.evaluate(test)
            # The bigram score is reduced by 0.002 before comparison
            # (presumably a deliberate handicap -- TODO confirm).
            score_bt = bigramTagger.evaluate(test) - 0.002
            score_tt = trigramTagger.evaluate(test)
            score_nt = NTagger.evaluate(test)

            scores = [score_ut, score_bt, score_tt, score_nt]
            tagger_names = ["uTagger", "biTagger", "triTagger", "NTagger"]
            taggers = [unigramTagger, bigramTagger, trigramTagger, NTagger]
            bestTagger_index = scores.index(max(scores))

            fname = self.taggers_path + tagger_names[bestTagger_index]
            if os.path.isfile(fname + self.tagger_extension_file):
                # Name already taken: append the directory's entry count
                # to the file name.
                fname = fname + str(len(listdir(self.taggers_path))) + self.tagger_extension_file
            else:
                fname = fname + self.tagger_extension_file

            # Context manager so the file is closed even if pickling fails.
            with open(fname, 'wb') as f:
                pickle.dump(taggers[bestTagger_index], f)

            print("Guardando el tagger :" + fname)
            model_name = fname
    except Exception as e:
        print("ERRPR EN POS TAGGER GENERATOR: " + str(e))
        # NOTE(review): this drops into the interactive debugger on any
        # failure, which blocks unattended runs -- confirm it should stay.
        pdb.set_trace()
    return model_name
def __init__(self, train_sentences):
    """Train a bigram tagger on (POS, chunk tag) pairs.

    Each training sentence is converted with ``tree2conlltags`` and the
    word field of every triple is dropped.
    """
    train_data = []
    for sent in train_sentences:
        train_data.append(
            [(pos, chunk) for _, pos, chunk in tree2conlltags(sent)])
    self.tagger = BigramTagger(train_data)
def createModel(self):
    """Train the backoff tagger chain, pick the best scorer and pickle it.

    ``self.corpusSents`` is split by ``self.training_portion``. A regexp
    tagger built from ``self.regex_list`` plus the ``postaggers.regex``
    config section is the last backoff of a unigram -> bigram -> trigram
    -> n-gram (``self.max_ngrams``) chain. When a held-out portion exists
    (``training_portion != 1``) every tagger is scored on it and the best
    one is pickled under ``self.taggers_path``.

    Returns:
        The path of the saved model, or ``None`` when nothing was saved
        (no held-out data, or an error was caught).
    """
    # NOTE(review): the original indentation was lost. Scoring and saving
    # are assumed to sit inside the ``training_portion != 1`` check,
    # because they depend on names defined only there.
    model_name = None
    try:
        unigrams = self.buildUnigrams()
        N = len(self.corpusSents)
        # round() returns a float on Python 2; slice bounds must be ints.
        toTraining = int(round(self.training_portion * N))
        training = self.corpusSents[:toTraining]
        test = self.corpusSents[toTraining:]

        post_patterns = []
        for regex, post in self.regex_list:
            try:
                regex = regex.decode('utf-8')
            except (AttributeError, UnicodeError):
                # Already text, or not decodable: keep the pattern as given.
                pass
            post_patterns.append((regex, post))
        for regex, post in self.config.items('postaggers.regex'):
            post_patterns.append((regex.decode('utf-8'), post))

        regexpTagger = RegexpTagger(post_patterns)
        unigramTagger = UnigramTagger(unigrams + training,
                                      backoff=regexpTagger)
        bigramTagger = BigramTagger(training, backoff=unigramTagger)
        trigramTagger = TrigramTagger(training, backoff=bigramTagger)
        NTagger = NgramTagger(self.max_ngrams, training,
                              backoff=trigramTagger)

        print("Sentencias de entrenamiento para n-taggers:" + str(len(training)))
        print("Sentencias de entrenamiento para unitaggers:" + str(len(unigrams)))
        print("Cantidad de palabras ADICIONALES de DICCIONARIOS para el unitagger:" + str(len(unigrams)))
        print("Sentencias para testing:" + str(len(test)))
        print("Expresiones regulares para el Tagger:")
        for post_regex in post_patterns:
            print(post_regex)

        if self.training_portion != 1:
            score_ut = unigramTagger.evaluate(test)
            # The bigram score is reduced by 0.002 before comparison
            # (presumably a deliberate handicap -- TODO confirm).
            score_bt = bigramTagger.evaluate(test) - 0.002
            score_tt = trigramTagger.evaluate(test)
            score_nt = NTagger.evaluate(test)

            scores = [score_ut, score_bt, score_tt, score_nt]
            tagger_names = ["uTagger", "biTagger", "triTagger", "NTagger"]
            taggers = [unigramTagger, bigramTagger, trigramTagger, NTagger]
            bestTagger_index = scores.index(max(scores))

            fname = self.taggers_path + tagger_names[bestTagger_index]
            if os.path.isfile(fname + self.tagger_extension_file):
                # Name already taken: append the directory's entry count
                # to the file name.
                fname = fname + str(len(listdir(self.taggers_path))) + self.tagger_extension_file
            else:
                fname = fname + self.tagger_extension_file

            # Context manager so the file is closed even if pickling fails.
            with open(fname, 'wb') as f:
                pickle.dump(taggers[bestTagger_index], f)

            print("Guardando el tagger :" + fname)
            model_name = fname
    except Exception as e:
        print("ERRPR EN POS TAGGER GENERATOR: " + str(e))
        # NOTE(review): this drops into the interactive debugger on any
        # failure, which blocks unattended runs -- confirm it should stay.
        pdb.set_trace()
    return model_name
nltk.download('averaged_perceptron_tagger') from nltk.corpus import wordnet as wn from nltk.corpus import treebank, conll2000, brown, conll2002 from nltk import DefaultTagger, UnigramTagger, BigramTagger wordnet_lemmatizer = nltk.stem.WordNetLemmatizer() # The code below trains bigram part of speech tagger from various datasets. train_sents = treebank.tagged_sents() + brown.tagged_sents() + conll2000.tagged_sents() + conll2002.tagged_sents() edited_train = [] for sent in train_sents: edited_train.append([(word.lower(),tag) for (word,tag) in sent]) t0 = DefaultTagger(None) et1 = UnigramTagger(edited_train, backoff = t0) et2 = BigramTagger(edited_train, backoff = et1) # The function below converts bigram pos to wordnet pos for lemmatization def penn_to_wn(tag): nltk_wn_pos = {'J':wn.ADJ,'V':wn.VERB,'N':wn.NOUN,'R':wn.ADV} try: return nltk_wn_pos[tag[0]] except: return None # The list below is a list of unwanted tokens unwanted_tokens = ['"','!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/','”','“','–',"'s", ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~'] # The function below filters unwanted tokens from the given tokenList def filterUnwantedCharacters(tokenList):