def test_defaultTagger(tokens):
    default_tagger = nltk.DefaultTagger('NN')
    print(default_tagger.tag(tokens))
# Fill the tagged_sentences list with corpus data, keeping only the part of each
# tag that corresponds to the word's part of speech:
tagged_sentences = []
print('Extracting part-of-speech classes from the corpus tags:')
for sent in tqdm(sentences):
    tagged_sent = []
    for (word, tag) in sent:
        tagged_sent.append((word.lower(), extract_pos(tag)))
    tagged_sentences.append(tagged_sent)

# Split the corpus into a training set and a test set:
train = tagged_sentences[100:]
test = tagged_sentences[:100]

# As a baseline, use a tagger that always guesses the same class for every word:
# 'n', the most frequent part of speech in the corpus, corresponding to NOUN:
baseline = nltk.DefaultTagger('n')
print('Baseline accuracy:', baseline.evaluate(test))

# Train a unigram tagger with the baseline as backoff:
tagger1 = nltk.UnigramTagger(train, backoff=baseline)
print('Unigram Tagger accuracy:', tagger1.evaluate(test))

# Next, train a bigram tagger with the unigram tagger as backoff:
tagger2 = nltk.BigramTagger(train, backoff=tagger1)
print('Bigram Tagger accuracy:', tagger2.evaluate(test))

# Example of tagging a tokenized sentence with our best tagger:
print(tagger2.tag(['o', 'pássaro', 'segue', 'feliz', '.']))
#%%
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
print(unigram_tagger.evaluate(test_sents))

#%%
# Note that the bigram tagger can tag every word in a sentence it saw during
# training, but it performs poorly on an unseen sentence. As soon as it meets a
# new word (such as 13.5), it is unable to assign a tag. It then also fails to
# tag the following word (e.g. million), even though that word was seen during
# training, simply because it never saw it preceded by a None-tagged word.
# Consequently the tagger fails on the rest of the sentence as well, and its
# overall accuracy score is very low:
bigram_tagger = nltk.BigramTagger(train_sents)
print(bigram_tagger.tag(brown_sents[2007]))
unseen_sent = brown_sents[4203]
print(bigram_tagger.tag(unseen_sent))

#%%
import _pickle as cPickle

t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
output = open('t2.pkl', 'wb')
cPickle.dump(t2, output, -1)
output.close()

#%%
from _pickle import load

input = open('t2.pkl', 'rb')
tagger = load(input)
input.close()

#%%
text = """The board's action shows what free enterprise is up against in our complex maze of regulatory laws ."""
tokens = text.split()
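# Not part of the original snippet: the natural next step is to tag the
# tokenized text with the tagger just loaded back from t2.pkl.
print(tagger.tag(tokens))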
                 for (w, t) in sent] for sent in dataset2]
shuffle(traindata)
shuffle(traindata2)

regex_patterns = [
    (r"^[nN][ao]s?$", "ADP"),
    (r"^[dD][ao]s?$", "ADP"),
    (r"^[pP]el[ao]s?$", "ADP"),
    (r"^[nN]est[ae]s?$", "ADP"),
    (r"^[nN]um$", "ADP"),
    (r"^[nN]ess[ae]s?$", "ADP"),
    (r"^[nN]aquel[ae]s?$", "ADP"),
    (r"^\xe0$", "ADP"),
]

tagger = nltk.BigramTagger(
    traindata,
    backoff=nltk.RegexpTagger(
        regex_patterns,
        backoff=nltk.UnigramTagger(
            traindata2,
            backoff=nltk.AffixTagger(
                traindata2,
                backoff=nltk.DefaultTagger('NOUN')))))

templates = nltk.brill.fntbl37()
tagger = nltk.BrillTaggerTrainer(tagger, templates)
tagger = tagger.train(traindata, max_rules=100)

with open("tagger.pkl", "wb") as f:
    pickle.dump(tagger, f)
# for words in sent:
#     print(words[0], end=' ')
print(brown_sents)

# 1. Default tagger
# Find the most likely tag
print('\nFinding the most likely tag:')
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
print('tags: ', tags)
tag_max = nltk.FreqDist(tags).max()
print('tag_max: ', tag_max)

# Create a tagger that tags every word as tag_max
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger(tag_max)
tokens_tagged = default_tagger.tag(tokens)
print('tokens_tagged: ', tokens_tagged)

# Evaluate against the tagged Brown sentences
print('evaluate on brown_tagged_sents: ', default_tagger.evaluate(brown_tagged_sents))

# 2. Regular-expression tagger
print('\nRegular-expression tagger:')
patterns = [
    (r'.*ing$', 'VBG'),   # gerunds
    (r'.*ed$', 'VBD'),    # simple past
    (r'.*es$', 'VBZ'),    # 3rd singular present
    (r'.*ould$', 'MD'),   # modals
    (r'.*\'s$', 'NN$'),   # possessive nouns
try:
    word_tokenizer
except NameError:
    word_tokenizer = make_word_tokenizer()

try:
    sent_tokenizer
except NameError:
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

try:
    tagger
except NameError:
    brown_a = nltk.corpus.brown.tagged_sents(categories='a')
    t0 = nltk.DefaultTagger('WTF')
    t1 = nltk.UnigramTagger(brown_a, backoff=t0)
    t2 = nltk.BigramTagger(brown_a, backoff=t1)
    tagger = nltk.TrigramTagger(brown_a, backoff=t2)


class SpeechAnalyzer(object):
    NOTATIONS = {
        r'--': "##PAUSE##",
        r'\(sic\)': "##SIC##",
        r'\[mispronunciation\]': '##MISPRONUNCIATION##',
        r'\.\.\.': ' ##PAUSE## '
    }
    PHRASES = [
        "wall street",
        "main street",
def train_es_tagger(path):
    nltk.download('cess_esp')

    def convert_to_universal_tag(t):
        tagdict = {
            'Fa': '.', 'Faa': '.', 'Fat': '.', 'Fc': '.', 'Fd': '.', 'Fe': '.',
            'Fg': '.', 'Fh': '.', 'Fi': '.', 'Fia': '.', 'Fit': '.', 'Fp': '.',
            'Fpa': '.', 'Fpt': '.', 'Fs': '.', 'Fx': '.', 'Fz': '.',
            'X': 'X', 'Y': 'X', 'Zm': 'NUM', 'Zp': 'NUM',
            'ao': 'ADJ', 'ao0fp0': 'ADJ', 'ao0fs0': 'ADJ', 'ao0mp0': 'ADJ', 'ao0ms0': 'ADJ',
            'aq': 'ADJ', 'aq00000': 'ADJ', 'aq0cn0': 'ADJ', 'aq0cp0': 'ADJ', 'aq0cs0': 'ADJ',
            'aq0fp0': 'ADJ', 'aq0fpp': 'ADJ', 'aq0fs0': 'ADJ', 'aq0fsp': 'ADJ',
            'aq0mp0': 'ADJ', 'aq0mpp': 'ADJ', 'aq0ms0': 'ADJ', 'aq0msp': 'ADJ',
            'cc': 'CONJ', 'cs': 'CONJ',
            'da': 'DET', 'da0fp0': 'DET', 'da0fs0': 'DET', 'da0mp0': 'DET', 'da0ms0': 'DET',
            'da0ns0': 'DET', 'dd': 'DET', 'dd0cp0': 'DET', 'dd0cs0': 'DET', 'dd0fp0': 'DET',
            'dd0fs0': 'DET', 'dd0mp0': 'DET', 'dd0ms0': 'DET', 'de': 'DET', 'de0cn0': 'DET',
            'di': 'DET', 'di0cp0': 'DET', 'di0cs0': 'DET', 'di0fp0': 'DET', 'di0fs0': 'DET',
            'di0mp0': 'DET', 'di0ms0': 'DET', 'dn': 'DET', 'dn0cp0': 'DET', 'dn0cs0': 'DET',
            'dn0fp0': 'DET', 'dn0fs0': 'DET', 'dn0mp0': 'DET', 'dn0ms0': 'DET', 'dp': 'DET',
            'dp1cps': 'DET', 'dp1css': 'DET', 'dp1fpp': 'DET', 'dp1fsp': 'DET',
            'dp1mpp': 'DET', 'dp1msp': 'DET', 'dp1mss': 'DET', 'dp2cps': 'DET',
            'dp2css': 'DET', 'dp3cp0': 'DET', 'dp3cs0': 'DET', 'dp3fs0': 'DET',
            'dp3mp0': 'DET', 'dp3ms0': 'DET', 'dt': 'DET', 'dt0cn0': 'DET',
            'dt0fs0': 'DET', 'dt0ms0': 'DET',
            'i': 'X',
            'nc': 'NOUN', 'nc00000': 'NOUN', 'nccn000': 'NOUN', 'nccp000': 'NOUN',
            'nccs000': 'NOUN', 'ncfn000': 'NOUN', 'ncfp000': 'NOUN', 'ncfs000': 'NOUN',
            'ncmn000': 'NOUN', 'ncmp000': 'NOUN', 'ncms000': 'NOUN',
            'np': 'NOUN', 'np00000': 'NOUN', 'np0000a': 'NOUN', 'np0000l': 'NOUN',
            'np0000o': 'NOUN', 'np0000p': 'NOUN',
            'p0': 'PRON', 'p0000000': 'PRON', 'p010p000': 'PRON', 'p010s000': 'PRON',
            'p020s000': 'PRON', 'p0300000': 'PRON', 'pd': 'PRON', 'pd0cp000': 'PRON',
            'pd0cs000': 'PRON', 'pd0fp000': 'PRON', 'pd0fs000': 'PRON', 'pd0mp000': 'PRON',
            'pd0ms000': 'PRON', 'pd0ns000': 'PRON', 'pe': 'PRON', 'pe000000': 'PRON',
            'pi': 'PRON', 'pi0cp000': 'PRON', 'pi0cs000': 'PRON', 'pi0fp000': 'PRON',
            'pi0fs000': 'PRON', 'pi0mp000': 'PRON', 'pi0ms000': 'PRON', 'pn': 'PRON',
            'pn0cp000': 'PRON', 'pn0fp000': 'PRON', 'pn0fs000': 'PRON', 'pn0mp000': 'PRON',
            'pn0ms000': 'PRON', 'pp': 'PRON', 'pp1cp000': 'PRON', 'pp1cs000': 'PRON',
            'pp1csn00': 'PRON', 'pp1cso00': 'PRON', 'pp1mp000': 'PRON', 'pp2cp000': 'PRON',
            'pp2cp00p': 'PRON', 'pp2cs000': 'PRON', 'pp2cs00p': 'PRON', 'pp2csn00': 'PRON',
            'pp2cso00': 'PRON', 'pp3cn000': 'PRON', 'pp3cna00': 'PRON', 'pp3cno00': 'PRON',
            'pp3cpa00': 'PRON', 'pp3cpd00': 'PRON', 'pp3csa00': 'PRON', 'pp3csd00': 'PRON',
            'pp3fp000': 'PRON', 'pp3fpa00': 'PRON', 'pp3fs000': 'PRON', 'pp3fsa00': 'PRON',
            'pp3mp000': 'PRON', 'pp3mpa00': 'PRON', 'pp3ms000': 'PRON', 'pp3msa00': 'PRON',
            'pp3ns000': 'PRON', 'pr': 'PRON', 'pr000000': 'PRON', 'pr0cn000': 'PRON',
            'pr0cp000': 'PRON', 'pr0cs000': 'PRON', 'pr0fp000': 'PRON', 'pr0fs000': 'PRON',
            'pr0mp000': 'PRON', 'pr0ms000': 'PRON', 'pt': 'PRON', 'pt000000': 'PRON',
            'pt0cp000': 'PRON', 'pt0cs000': 'PRON', 'pt0mp000': 'PRON', 'pt0ms000': 'PRON',
            'px': 'PRON', 'px1fp0p0': 'PRON', 'px1fs0p0': 'PRON', 'px1mp0p0': 'PRON',
            'px1ms0p0': 'PRON', 'px2fs0s0': 'PRON', 'px3fs000': 'PRON', 'px3mp000': 'PRON',
            'px3ms000': 'PRON', 'px3ns000': 'PRON',
            'rg': 'ADV', 'rn': 'ADV',
            'sn': 'ADP', 'sn-SUJ': 'ADP', 'sn.co-SUJ': 'ADP', 'sn.e': 'ADP',
            'sn.e-ATR': 'ADP', 'sn.e-CD': 'ADP', 'sn.e-SUJ': 'ADP', 'sn.e.1n-SUJ': 'ADP',
            'sp': 'ADP', 'spcms': 'ADP', 'sps00': 'ADP',
            'va': 'VERB', 'vag0000': 'VERB', 'vaic1p0': 'VERB', 'vaic3p0': 'VERB',
            'vaic3s0': 'VERB', 'vaif1p0': 'VERB', 'vaif2s0': 'VERB', 'vaif3p0': 'VERB',
            'vaif3s0': 'VERB', 'vaii1p0': 'VERB', 'vaii1s0': 'VERB', 'vaii2s0': 'VERB',
            'vaii3p0': 'VERB', 'vaii3s0': 'VERB', 'vaip1p0': 'VERB', 'vaip1s0': 'VERB',
            'vaip2p0': 'VERB', 'vaip2s0': 'VERB', 'vaip3p0': 'VERB', 'vaip3s0': 'VERB',
            'vais3s0': 'VERB', 'vam02s0': 'VERB', 'vam03s0': 'VERB', 'van0000': 'VERB',
            'vap00sm': 'VERB', 'vasi1p0': 'VERB', 'vasi1s0': 'VERB', 'vasi3p0': 'VERB',
            'vasi3s0': 'VERB', 'vasp1s0': 'VERB', 'vasp3p0': 'VERB', 'vasp3s0': 'VERB',
            'vm': 'VERB', 'vmg0000': 'VERB', 'vmic1p0': 'VERB', 'vmic1s0': 'VERB',
            'vmic2s0': 'VERB', 'vmic3p0': 'VERB', 'vmic3s0': 'VERB', 'vmif1p0': 'VERB',
            'vmif1s0': 'VERB', 'vmif2s0': 'VERB', 'vmif3p0': 'VERB', 'vmif3s0': 'VERB',
            'vmii1p0': 'VERB', 'vmii1s0': 'VERB', 'vmii2p0': 'VERB', 'vmii2s0': 'VERB',
            'vmii3p0': 'VERB', 'vmii3s0': 'VERB', 'vmip1p0': 'VERB', 'vmip1s0': 'VERB',
            'vmip2p0': 'VERB', 'vmip2s0': 'VERB', 'vmip3p0': 'VERB', 'vmip3s0': 'VERB',
            'vmis1p0': 'VERB', 'vmis1s0': 'VERB', 'vmis2s0': 'VERB', 'vmis3p0': 'VERB',
            'vmis3s0': 'VERB', 'vmm01p0': 'VERB', 'vmm02s0': 'VERB', 'vmm03p0': 'VERB',
            'vmm03s0': 'VERB', 'vmn0000': 'VERB', 'vmp00pf': 'VERB', 'vmp00pm': 'VERB',
            'vmp00sf': 'VERB', 'vmp00sm': 'VERB', 'vmsi1p0': 'VERB', 'vmsi1s0': 'VERB',
            'vmsi3p0': 'VERB', 'vmsi3s0': 'VERB', 'vmsp1p0': 'VERB', 'vmsp1s0': 'VERB',
            'vmsp2p0': 'VERB', 'vmsp2s0': 'VERB', 'vmsp3p0': 'VERB', 'vmsp3s0': 'VERB',
            'vs': 'VERB', 'vsg0000': 'VERB', 'vsic1s0': 'VERB', 'vsic2s0': 'VERB',
            'vsic3p0': 'VERB', 'vsic3s0': 'VERB', 'vsif1s0': 'VERB', 'vsif3p0': 'VERB',
            'vsif3s0': 'VERB', 'vsii1p0': 'VERB', 'vsii1s0': 'VERB', 'vsii3p0': 'VERB',
            'vsii3s0': 'VERB', 'vsip1p0': 'VERB', 'vsip1s0': 'VERB', 'vsip2s0': 'VERB',
            'vsip3p0': 'VERB', 'vsip3s0': 'VERB', 'vsis1s0': 'VERB', 'vsis3p0': 'VERB',
            'vsis3s0': 'VERB', 'vsm03s0': 'VERB', 'vsn0000': 'VERB', 'vsp00sm': 'VERB',
            'vssf3s0': 'VERB', 'vssi3p0': 'VERB', 'vssi3s0': 'VERB', 'vssp1s0': 'VERB',
            'vssp2s0': 'VERB', 'vssp3p0': 'VERB', 'vssp3s0': 'VERB',
            'w': 'NOUN', 'z': 'NUM'
        }
        t = t.lower()
        return tagdict.get(t, "." if all(tt in punctuation for tt in t) else t)

    cess = [[(w, convert_to_universal_tag(t)) for (w, t) in sent]
            for sent in nltk.corpus.cess_esp.tagged_sents()]
    shuffle(cess)

    def_tagger = nltk.DefaultTagger('NOUN')
    affix_tagger = nltk.AffixTagger(cess, backoff=def_tagger)
    unitagger = nltk.UnigramTagger(cess, backoff=affix_tagger)
    tagger = nltk.BigramTagger(cess, backoff=unitagger)

    tagger = nltk.BrillTaggerTrainer(tagger, nltk.brill.fntbl37())
    tagger = tagger.train(cess, max_rules=100)

    with open(path, "wb") as f:
        pickle.dump(tagger, f)

    return tagger
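# A minimal usage sketch, not part of the original module: call train_es_tagger
# and tag a short Spanish sentence. The output path is a hypothetical example,
# and the module is assumed to import nltk, pickle, shuffle and punctuation at
# the top (they are used inside train_es_tagger).
if __name__ == "__main__":
    es_tagger = train_es_tagger("es_tagger.pkl")
    print(es_tagger.tag("El gato duerme en la casa .".split()))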
########################################################
###############   The Default Tagger    ###############
########################################################
import nltk
from nltk.corpus import brown

tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
print(set(tags))

# Finding the tag occurring most frequently
print(nltk.FreqDist(tags).max())

# Default tagger - given a tag, marks every word with that tag
default_tagger = nltk.DefaultTagger('NN')
default_tagger = nltk.DefaultTagger(nltk.FreqDist(tags).max())

text = "Gokhale conveyed that he was in Hong-Kong and could reach only past midnight even if he booked himself on the first Beijing-bound flight. He was urged to reach the Chinese capital as fast as he could, in a first clear indication that the quiet and dogged attempt to defuse the Doklam imbroglio may have borne fruit."
tokens = nltk.word_tokenize(text)
print(default_tagger.tag(tokens))

# Performance of a tagger
print(
    default_tagger.evaluate(nltk.corpus.brown.tagged_sents(categories='news')))

########################################################
###############     Regexp Tagger       ###############
########################################################
import nltk
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import floresta
import cPickle

FILENAME = "txts/floresta_trigram.pos"


def simplify_tag(t):
    if '+' in t:
        return t[t.index('+') + 1:]
    else:
        return t


tsents = floresta.tagged_sents()
tsents = [[(w.lower(), simplify_tag(t)) for (w, t) in sent] for sent in tsents if sent]
train = tsents[100:]
test = tsents[:100]

tagger0 = nltk.DefaultTagger('n')
tagger1 = nltk.UnigramTagger(train, backoff=tagger0)
tagger2 = nltk.BigramTagger(train, backoff=tagger1)
tagger = nltk.TrigramTagger(train, backoff=tagger2)
tagger.evaluate(test)

with open(FILENAME, 'wb') as outFile:
    cPickle.dump(tagger, outFile, -1)
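# A minimal usage sketch, not part of the original script: load the pickled
# trigram tagger back from FILENAME and tag a tokenized Portuguese sentence.
# Assumes the training code above has already been run.
with open(FILENAME, 'rb') as inFile:
    floresta_tagger = cPickle.load(inFile)
print(floresta_tagger.tag(['o', 'gato', 'dorme', '.']))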
data += [[(w.lower(), simplificarTag(t)) for (w, t) in sentenca]
         for sentenca in sentencas_mac_morpho if sentenca]
base = data
teste = data

print('Training tagger. This may take a while...')
_tagger = nltk.NgramTagger(4, base, backoff=nltk.TrigramTagger(
    base, backoff=nltk.BigramTagger(
        base, backoff=nltk.UnigramTagger(
            base, backoff=nltk.DefaultTagger('n')))))
print('Tagger trained successfully! Accuracy of %.1f!' % (_tagger.evaluate(teste) * 100))

try:
    print('Saving tagger...')
    output = open(CAMINHO_DUMP, 'wb')
    dump(_tagger, output, -1)
    output.close()
    print('Tagger saved to "dump_tagger.pkl"!')
except:
    print('ERROR: Could not save the tagger.')
# print('second', textoBrownTagSentNew)
fdTag2 = nltk.FreqDist(tag for m in textoBrownTagSentNew for (word, tag) in m)
print('tags2 TAG', fdTag2.most_common())
fdTag3 = nltk.FreqDist(word for m in textoBrownTagSentNew for (word, tag) in m)
# print('tags2 WORD', fdTag3.most_common())
tagTexto(textoBrownTagSentNew)

'''
1. Extend the tagger example to a TrigramTagger and analyse the model's accuracy
'''
treino = mac_morpho.tagged_sents()[1000:]
teste = mac_morpho.tagged_sents()[:1000]
etiq0 = nltk.DefaultTagger('N')
etiq1 = nltk.UnigramTagger(treino, backoff=etiq0)
print('UnigramTagger', etiq1.evaluate(teste))
etiq2 = nltk.BigramTagger(treino, backoff=etiq1)
print('BigramTagger', etiq2.evaluate(teste))
etiq3 = nltk.TrigramTagger(treino, backoff=etiq2)
print('TrigramTagger', etiq3.evaluate(teste))

doc = open('textoPT.txt', encoding='utf8')
raw = doc.read()
# texto = nltk.word_tokenize('O mundo atual possui diversos idiomas.')
texto = nltk.word_tokenize(raw)
# print('etiq2', etiq2.tag(texto))
# print('etiq3', etiq3.tag(texto))
'''
def __init__(self):
    self.train_tagged_sents = brown.tagged_sents()
    self.default_tagger = nltk.DefaultTagger('NN')
    self.unigram_tagger = nltk.UnigramTagger(self.train_tagged_sents,
                                             backoff=self.default_tagger)
    self.bigram_tagger = nltk.BigramTagger(self.train_tagged_sents,
                                           backoff=self.unigram_tagger)
    self.trigram_tagger = nltk.TrigramTagger(self.train_tagged_sents,
                                             backoff=self.bigram_tagger)
# pylint: disable=C0111
# pylint: disable=C0103
import nltk
import sents

patterns = [
    (r'(da|do|de|das|dos)$', 'PREP'),  # prepositions
    (r'.*ndo$', 'V-GER')               # gerunds
]

defaultTagger = nltk.DefaultTagger('N')
regexTagger = nltk.RegexpTagger(patterns, backoff=defaultTagger)

resultado = regexTagger.evaluate(sents.sentTeste)
print(resultado * 100.0)
# Accuracy was 23.130%
# 01/10/2017 14:34
def learnDefaultTagger(simpleSentence):
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    tagger = nltk.DefaultTagger("NN")
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)
import nltk
from nltk.corpus import cess_esp


def store_pickle_file(filename, obj):
    from pickle import dump
    output = open("./checkpoints/" + filename + ".pkl", "wb")
    dump(obj, output, -1)
    output.close()


# Creating the default tagger
default_tagger = nltk.DefaultTagger('S')

# Creating a REGEX tagger
patterns = [(r'.*o$', 'NMS'), (r'.*os$', 'NMP'), (r'.*a$', 'NFS'), (r'.*as$', 'NFP')]
regex_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)

# Creating and training a UnigramTagger on cess_esp sentences
tagged_sents = cess_esp.tagged_sents()
unigram_tagger = nltk.UnigramTagger(tagged_sents, backoff=regex_tagger)

# Saving the general tagger to disk
store_pickle_file("tagger", unigram_tagger)
print("done")
pylab.show()

disp()  # Calling function

# 29.
bts = brown.tagged_sents(categories='news')  # Getting Brown sentences
size = int(len(bts) * 0.9)
train_sents = bts[:size]   # training data, 90%
test_sents = bts[size:]    # testing data

# Noun as default tagger
t0 = nltk.DefaultTagger('NN')
# Unigram tagger with the default tagger as backoff
t1 = nltk.UnigramTagger(train_sents, backoff=t0)

# Bigram tagger on the training data
bigram_tagger = nltk.BigramTagger(train_sents)
# bigram on seen data
bigram_tagger.tag(bts[2007])
# bigram on unseen data
unseen_sent = bts[4203]
bigram_tagger.tag(unseen_sent)

# bigram on the training data
bitag1 = nltk.BigramTagger(train_sents)
# With cutoff=2, the bigram tagger ignores a context unless it was seen at least
# twice in training, so contexts that occur only once will not be recognized
bitag2 = nltk.BigramTagger(train_sents, cutoff=2)
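# A small sketch, not in the original: compare the effect of the cutoff by
# evaluating both bigram taggers on the held-out test_sents defined above.
print('Bigram, no cutoff:', bitag1.evaluate(test_sents))
print('Bigram, cutoff=2 :', bitag2.evaluate(test_sents))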
for item in all_files:
    item = periods.sub(".", item)
    all_text.append(item)

# Make raw text and then tokenize
corpus_raw = "".join(all_text)
corpus_sents = nltk.sent_tokenize(corpus_raw, language="french")
for sentence in corpus_sents:
    corpus_list = sentence.split()
    corpus_tuples = [nltk.tag.str2tuple(item) for item in corpus_list]
    corpus_tagged_sents.append(corpus_tuples)

# Split into training and held-out data
size = int(len(corpus_tagged_sents) * 0.9)
train_sents = corpus_tagged_sents[:size]
test_sents = corpus_tagged_sents[size:]

# Train taggers
tagger_default = nltk.DefaultTagger("NN")
tagger_unigram = nltk.UnigramTagger(train_sents, backoff=tagger_default)
tagger_bigram = nltk.BigramTagger(train_sents, backoff=tagger_unigram)
tagger_trigram = nltk.TrigramTagger(train_sents, backoff=tagger_bigram)

# Evaluate with disfluency chunks and print some stats
stats_dir = "./stats/"
result = tagger_trigram.evaluate(test_sents)
with open(f"{stats_dir}test_dis_ext_result.txt", "w") as file:
    file.write(str(result))
def perf2(cfd, wl):
    # Build a lookup model: map each word in the wordlist to its most frequent tag
    zz = dict((word, cfd[word].max()) for word in wl)
    bt = nltk.UnigramTagger(model=zz, backoff=nltk.DefaultTagger('NN'))
    return bt.evaluate(brown.tagged_sents(categories='news'))
def default_backoff_tagger(train_sents):
    tags = [tag for sent in train_sents for (word, tag) in sent]
    return nltk.DefaultTagger(nltk.FreqDist(tags).max())
# Spanish adaptation
from nltk.corpus import cess_esp

nltk.tag.mapping._load_universal_map("es-cast3lb")
mapdict = nltk.tag.mapping._MAPPINGS["es-cast3lb"]["universal"]

alltags = set(t for w, t in cess_esp.tagged_words())
for tag in alltags:
    if len(tag) <= 2:  # These are complete
        continue
    mapdict[tag] = mapdict[tag[:2]]

cess_esp._tagset = "es-cast3lb"

from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt

cess_sents = cess_esp.tagged_sents(tagset='universal')
uni_tag = ut(cess_sents, backoff=nltk.DefaultTagger('X'))


class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
def tagging_performance(cfd, wordlist):
    # Lookup model: most frequent tag for each word in the wordlist
    lt = dict((word, cfd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown_tagged_sents)
recipe_name = input('What meal are you looking for ? : ')
get_access_recipes(path, tagged_dataset_path, user, recipe_name)
# recipe_name = input('What meal are you looking for ? : ')
# get_access_recipes(path, tagged_dataset_path, user, recipe_name)

print("Shopping list : ")
print(*user.shop_list, sep="\n")
print("Fridge : ")
print(*user.fridge, sep="\n")

######
# Evaluation of the model: can't work because of the uncleaned dataset
######
tagged_dataset = input(
    "Location of the tagged data set which is going to be created : ")
tagged = open(tagged_dataset, "rb")
tagged_ingredients = pickle.load(tagged)

# Note: with this split the training set is the whole dataset and the test set is empty
train_set = tagged_ingredients[:len(tagged_ingredients)]
test_set = tagged_ingredients[len(tagged_ingredients):]

back = nltk.DefaultTagger('OTHER')
unigram_tagger = nltk.UnigramTagger(train_set, backoff=back)
bigram_tagger = nltk.BigramTagger(train_set, backoff=unigram_tagger)

# Problem with the dataset
# unigram_tagger.evaluate(test_set)
pos2 = nltk.Index((value, key) for (key, value) in pos.items())
pos2['ADJ']

# Automatic Tagging
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

## The default tagger
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
nltk.FreqDist(tags).max()  # 'NN'

raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(tokens)
default_tagger.evaluate(brown_tagged_sents)

## the regular expression tagger
patterns = [(r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            (r'.*es$', 'VBZ'),
            (r'.*ould$', 'MD'),
            (r'.*\'s$', 'NN$'),
            (r'.*s$', 'NNS'),
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
            (r'.*', 'NN')]
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.tag(brown_sents[3])
regexp_tagger.evaluate(brown_tagged_sents)

## the lookup tagger
fd = nltk.FreqDist(brown.words(categories='news'))
def tag_sentences_default_tagger(sentences, tag):
    default_tagger = nltk.DefaultTagger(tag)
    for s in sentences:
        tokens = nltk.word_tokenize(s)
        s_tagged = default_tagger.tag(tokens)
        print(s_tagged)
def performance(cfd, wordlist):
    lt = dict((word, cfd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
    # Evaluate the tagger on new data
    return baseline_tagger.evaluate(brown.tagged_sents(categories='fiction'))
print("{0:.4f} HiddenMarkovModelTagger".format(result)) def createDataFrame(): df = pd.DataFrame() df['word'] = [w for s in result for w in s] df['bi_tag'] = [w[1] for s in bi_tagged for w in s] df['tri_tag'] = [w[1] for s in tri_tagged for w in s] df['hmm_tag'] = [w[1] for s in hmm_tagged for w in s] return df tagged_texts = loopFiles(sys.argv[1]) # loen sisse treeninghulga test_texts = loopFiles(sys.argv[2]) # loen sisse teshulga andmed' train_sents = tagged_texts default_tagger = nltk.DefaultTagger("S") #S(nimisona) on koige sagedasem unigram_tagger_backoff = nltk.UnigramTagger(train_sents, backoff = default_tagger) bigram_tagger_backoff = nltk.BigramTagger(train_sents, backoff = unigram_tagger_backoff) trigram_tagger_backoff = nltk.TrigramTagger(train_sents, backoff = bigram_tagger_backoff) hmm_tagger = nltk.HiddenMarkovModelTagger.train(train_sents) result = get_tagged_words(os.getcwd() + '/' + sys.argv[3], 2) bi_tagged = bigram_tagger_backoff.tag_sents(result) tri_tagged = trigram_tagger_backoff.tag_sents(result) hmm_tagged = hmm_tagger.tag_sents(result) #Loome DataFrame'i df = createDataFrame() #Kirjutame faili df.to_csv("ossip_villem-oskar_4.csv", header=False)
>>> from nltk.tag.stanford import POSTagger
>>> import nltk
>>> stan_tagger = POSTagger('models/english-bidirectional-distsim.tagger',
...                         'stanford-postagger.jar')
>>> tokens = nltk.word_tokenize(s)
>>> stan_tagger.tag(tokens)

# POS tag frequency distribution
>>> from nltk.corpus import brown
>>> import nltk
>>> tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
>>> print nltk.FreqDist(tags)

# Default tagger
>>> brown_tagged_sents = brown.tagged_sents(categories='news')
>>> default_tagger = nltk.DefaultTagger('NN')
>>> print default_tagger.evaluate(brown_tagged_sents)

# N-gram taggers
>>> from nltk.tag import UnigramTagger
>>> from nltk.tag import DefaultTagger
>>> from nltk.tag import BigramTagger
>>> from nltk.tag import TrigramTagger

# We divide the data into a train and a test set to evaluate our taggers.
>>> train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
>>> test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]
>>> unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
>>> print unigram_tagger.evaluate(test_data)
>>> bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
>>> print bigram_tagger.evaluate(test_data)
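# A possible continuation, not in the original excerpt: train the TrigramTagger
# imported above, using the bigram tagger as backoff, and evaluate it.
>>> trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
>>> print trigram_tagger.evaluate(test_data)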
def combineTagger():
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    return t2.evaluate(test_sents)
        output.append(line.split())
    return output


if __name__ == "__main__":
    train_file = sys.argv[1]
    test_file = sys.argv[2]
    train_data, word_counts = read_train_data(train_file)
    test_data = read_test_data(test_file, word_counts)

    # Brill Tagger, https://www.nltk.org/book/ch05.html section 5.4
    templates = brill.fntbl37()
    t0 = nltk.DefaultTagger("NN")
    t1 = nltk.UnigramTagger(train_data, backoff=t0)
    t2 = nltk.BigramTagger(train_data, backoff=t1)
    t3 = nltk.TrigramTagger(train_data, backoff=t2)
    trainer = nltk.tag.BrillTaggerTrainer(t3, templates)
    model = t3

    #
    for sent in test_data:
        if sent:
            tagged_sent = model.tag(sent)
            output = []
            for word, tag in tagged_sent:
                output.append(word + "_" + tag)
import nltk
from nltk.corpus import cess_esp
from pickle import dump

patterns = [
    (r".*o$", "NMS"),
    (r".*os$", "NMP"),
    (r".*a$", "NFS"),
    (r".*as$", "NFP"),
]

cesp_tsents = cess_esp.tagged_sents()

td = nltk.DefaultTagger("s")
tr = nltk.RegexpTagger(patterns, backoff=td)
tu = nltk.UnigramTagger(cesp_tsents, backoff=tr)

output = open("tagger.pkl", "wb")
dump(tu, output, -1)
output.close()