def createInstance(cls, backoff):
    from nltk.corpus import brown
    brown_tagged_sents = brown.tagged_sents()
    return nltk.TrigramTagger(brown_tagged_sents, backoff=backoff)
for item in all_files:
    item = periods.sub(".", item)
    all_text.append(item)

# Make raw text and then tokenize
corpus_raw = "".join(all_text)
corpus_sents = nltk.sent_tokenize(corpus_raw, language="french")
for sentence in corpus_sents:
    corpus_list = sentence.split()
    corpus_tuples = [nltk.tag.str2tuple(item) for item in corpus_list]
    corpus_tagged_sents.append(corpus_tuples)

# Split into training and held-out data
size = int(len(corpus_tagged_sents) * 0.9)
train_sents = corpus_tagged_sents[:size]
test_sents = corpus_tagged_sents[size:]

# Train taggers
tagger_default = nltk.DefaultTagger("NN")
tagger_unigram = nltk.UnigramTagger(train_sents, backoff=tagger_default)
tagger_bigram = nltk.BigramTagger(train_sents, backoff=tagger_unigram)
tagger_trigram = nltk.TrigramTagger(train_sents, backoff=tagger_bigram)

# Evaluate with disfluency chunks and print some stats
stats_dir = "./stats/"
result = tagger_trigram.evaluate(test_sents)
with open(f"{stats_dir}test_dis_ext_result.txt", "w") as file:
    file.write(str(result))
# Unigram tagger
unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_rate = unigram_tagger.evaluate(test_sents)
print('unigram_rate', unigram_rate)  # 0.8121200039868434

# Bigram tagger
bigram_tagger = nltk.BigramTagger(train_sents)
print(bigram_tagger.tag(brown_sents[2007]))
unseen_sent = brown_sents[4203]
print(bigram_tagger.tag(unseen_sent))
bigram_rate = bigram_tagger.evaluate(test_sents)
print('bigram_rate', bigram_rate)

# Combine a bigram tagger, a unigram tagger, and a default tagger
# (here extended with a TrigramTagger using cutoff=2):
# 1. Try tagging the token with the bigram tagger.
# 2. If the bigram tagger cannot find a tag, try the unigram tagger.
# 3. If the unigram tagger also cannot find a tag, use the default tagger.
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t3 = nltk.TrigramTagger(train_sents, cutoff=2, backoff=t2)
combine_rate = t3.evaluate(test_sents)
print('combine_rate', combine_rate)

t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
combine_rate_simple = t2.evaluate(test_sents)
print('combine_rate_simple', combine_rate_simple)
from nltk.tag.sequential import UnigramTagger
from nltk import jsontags
import pprint

tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.9)
train_sents = tagged_sents[:size]
test_sents = tagged_sents[size:]

@jsontags.register_tag
class PreviousTagger(UnigramTagger):
    json_tag = 'nltk.tag.sequential.PreviousTagger'

    def context(self, tokens, index, history):
        if index == 0:
            return None
        else:
            return history[index - 1]

t0 = nltk.DefaultTagger('NN')
t1 = PreviousTagger(train_sents, backoff=t0)
t2 = nltk.UnigramTagger(train_sents, backoff=t1)
t3 = nltk.BigramTagger(train_sents, backoff=t2)
t4 = nltk.TrigramTagger(train_sents, backoff=t3)
pprint.pprint(t4.tag(['I', 'like', 'to', 'blog', 'on', "Kim's", 'blog']))
def __init__(self, train_sents):
    # Map each training chunk tree to (POS tag, chunk tag) pairs and train
    # a trigram tagger over those pairs.
    train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                  for sent in train_sents]
    self.tagger = nltk.TrigramTagger(train_data)
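# The constructor above follows the NLTK-book tagger-based chunker pattern.
# A minimal sketch of a matching parse() method, assuming it lives in the same
# class; the method body below is illustrative, not from the original.
def parse(self, sentence):
    # Tag the POS sequence with the trained trigram tagger to get chunk tags,
    # then rebuild a chunk tree from the (word, pos, chunk) triples.
    pos_tags = [pos for (word, pos) in sentence]
    tagged_pos_tags = self.tagger.tag(pos_tags)
    chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
    conlltags = [(word, pos, chunktag)
                 for ((word, pos), chunktag) in zip(sentence, chunktags)]
    return nltk.chunk.conlltags2tree(conlltags)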
import codecs
import nltk

entrada = codecs.open("catalanTagged_0_5000-utf-8.txt", "r", encoding="utf-8")
tagged_words = []
tagged_sents = []
for linia in entrada:
    linia = linia.rstrip()
    if linia.startswith("<") or len(linia) == 0:
        if len(tagged_words) > 0:
            tagged_sents.append(tagged_words)
            tagged_words = []
    else:
        camps = linia.split(" ")
        forma = camps[0]
        lema = camps[1]
        etiqueta = camps[2]
        tupla = (forma, etiqueta)
        tagged_words.append(tupla)

unigram_tagger = nltk.UnigramTagger(tagged_sents)
bigram_tagger = nltk.BigramTagger(tagged_sents, backoff=unigram_tagger)
trigram_tagger = nltk.TrigramTagger(tagged_sents, backoff=bigram_tagger)

oracio = "l'àcid desoxiribonucleic (ADN o DNA) és un àcid nucleic que conté les instruccions genètiques utilitzades en el desenvolupament i funcionament de tots els éssers vius coneguts, així com en alguns virus, des d'un punt de vista químic, l'ADN es compon de dos llargs polímers d'unitats simples anomenades nucleòtids, amb un tronc compost de sucres i grups fosfats units per enllaços èster"
tokenitzador = nltk.tokenize.RegexpTokenizer('[ldsmLDSM]\'|\w+|[^\w\s]+')
tokens = tokenitzador.tokenize(oracio)
analisi = trigram_tagger.tag(tokens)
print(analisi)
data = [[(w.lower(), simplificarTag(t)) for (w, t) in sentenca]
        for sentenca in sentencas_floresta if sentenca]
sentencas_mac_morpho = nltk.corpus.mac_morpho.tagged_sents()
data += [[(w.lower(), simplificarTag(t)) for (w, t) in sentenca]
         for sentenca in sentencas_mac_morpho if sentenca]
base = data
teste = data

print('Training tagger. This may take a while...')
_tagger = nltk.NgramTagger(4, base, backoff=nltk.TrigramTagger(
    base, backoff=nltk.BigramTagger(
        base, backoff=nltk.UnigramTagger(
            base, backoff=nltk.DefaultTagger('n')))))
print('Tagger trained successfully! Accuracy of %.1f!' % (_tagger.evaluate(teste) * 100))

try:
    print('Saving tagger...')
    output = open(CAMINHO_DUMP, 'wb')
    dump(_tagger, output, -1)
    output.close()
    print('Tagger saved to "dump_tagger.pkl"!')
except IOError:
    # assumed handler; the original excerpt breaks off inside the try block
    print('Could not save tagger.')
def get_most_likely_tag(word, cfd):
    try:
        return cfd[word].max()
    except ValueError:
        return "UNK"

cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories="news", tagset="universal"))
print(get_most_likely_tag("lol", cfd))
for word in brown.sents(categories="science_fiction")[0]:
    print(word, get_most_likely_tag(word, cfd))

# Task 8.3
# a), b)
t0 = nltk.DefaultTagger('NOUN')
train_sents = brown.tagged_sents(tagset='universal', categories='news')
t1 = nltk.UnigramTagger(train_sents)
t2 = nltk.BigramTagger(train_sents)
t3 = nltk.TrigramTagger(train_sents)
sentences = [
    [("The", "DET"), ("only", "ADJ"), ("Conservative", "NOUN"),
     ("councillor", "NOUN"), ("representing", "VERB"), ("Cambridge", "NOUN"),
     ("resigned", "VERB"), ("from", "ADP"), ("the", "DET"), ("city", "NOUN"),
     ("council", "NOUN"), (".", ".")]]
print(t0.evaluate(sentences))
print(t1.evaluate(sentences))
print(t2.evaluate(sentences))
print(t3.evaluate(sentences))
# In[13]:

# Train a UnigramTagger
unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_tagger.evaluate(test_sents)

# In[14]:

# Train a BigramTagger
Bigram_tagger = nltk.BigramTagger(train_sents)
Bigram_tagger.evaluate(test_sents)

# In[15]:

# Train a TrigramTagger
Trigram_tagger = nltk.TrigramTagger(train_sents)
Trigram_tagger.evaluate(test_sents)

# In[16]:

# Chain the unigram and bigram taggers with backoff
t1 = nltk.UnigramTagger(train_sents)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t2.evaluate(test_sents)

# In[17]:

# Chain the uni-, bi-, and trigram taggers; performance is slightly lower,
# so we stick with the unigram + bigram chain
t3 = nltk.TrigramTagger(train_sents, backoff=t2)
t3.evaluate(test_sents)
# Find the distinct n-grams that contain the word 'on'
ngrams = [item for item in set(bigram_tuples) if "on" in item]

# Create a naive default tagger
default_tagger = nltk.DefaultTagger('NN')
tagged_sentence = default_tagger.tag(tokens)

# Requires the 'tagsets' resource from nltk.download()
nltk.help.upenn_tagset('NN')

# Regular-expression tagger (note: the final '.*ed$' rule is shadowed by the
# earlier '.*ed$' rule and never fires)
patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*es$', 'VBZ'), (r'.*ed$', 'VB')]
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.tag(tokens)

from nltk.corpus import brown
training = brown.tagged_sents(categories='news')

# Create unigram, bigram, and trigram taggers based on the training set.
unigram_tagger = nltk.UnigramTagger(training)
bigram_tagger = nltk.BigramTagger(training)
trigram_tagger = nltk.TrigramTagger(training)

# Combination of taggers via backoff
default_tagger = nltk.DefaultTagger('NN')
bigram_tagger = nltk.BigramTagger(training, backoff=default_tagger)
trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
print("done!")
def add_trigram_tagger(self):
    tagger = nltk.TrigramTagger(self.tagged_sents_train, backoff=self.currentTagger)
    self.currentTagger = tagger
    return self
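# For context, a self-contained sketch of the fluent-builder pattern this
# method implies; the class name TaggerBuilder and the sibling add_* methods
# are assumptions, not part of the original.
import nltk
from nltk.corpus import brown

class TaggerBuilder:
    def __init__(self, tagged_sents_train):
        self.tagged_sents_train = tagged_sents_train
        self.currentTagger = nltk.DefaultTagger('NN')

    def add_unigram_tagger(self):
        self.currentTagger = nltk.UnigramTagger(self.tagged_sents_train,
                                                backoff=self.currentTagger)
        return self

    def add_bigram_tagger(self):
        self.currentTagger = nltk.BigramTagger(self.tagged_sents_train,
                                               backoff=self.currentTagger)
        return self

    def add_trigram_tagger(self):
        self.currentTagger = nltk.TrigramTagger(self.tagged_sents_train,
                                                backoff=self.currentTagger)
        return self

# Because each add_* method returns self, the calls chain fluently:
tagger = (TaggerBuilder(brown.tagged_sents(categories='news'))
          .add_unigram_tagger()
          .add_bigram_tagger()
          .add_trigram_tagger()
          .currentTagger)
print(tagger.tag(['The', 'cat', 'sat']))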
import nltk

tagged_sents = nltk.corpus.mac_morpho.tagged_sents()
print(tagged_sents)

t0 = nltk.DefaultTagger('N')
t1 = nltk.UnigramTagger(tagged_sents, backoff=t0)
t2 = nltk.BigramTagger(tagged_sents, backoff=t1)
t3 = nltk.TrigramTagger(tagged_sents, backoff=t2)

tagged = t3.tag(
    nltk.word_tokenize('Ontem, o João Antunes comeu peixe ao almoço.'))
print(tagged)
tups_to_file('ruso/OUTPUT.txt', russian_tags)

############################################################
################### Training the Models ####################
############################################################

# Gathering the training data from Penn Treebank
penn_sents = nltk.corpus.treebank.tagged_sents()
penn_sents_train, penn_sents_test = train_test_split(penn_sents, test_size=0.15)

# TrigramTagger with backoff chain
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(penn_sents_train, backoff=t0)
t2 = nltk.BigramTagger(penn_sents_train, backoff=t1)
t3 = nltk.TrigramTagger(penn_sents_train, backoff=t2)
print("Accuracy of the trigram tagger on a Penn Treebank random test set: ", end="")
print(t3.evaluate(penn_sents_test))

macbeth_trigram_predictions = t3.tag(macbeth_words)
macbeth_trigram_tags = [x[1] for x in macbeth_trigram_predictions]
assert len(macbeth_tags_nltk) == len(macbeth_trigram_tags)
coinc = [
    macbeth_tags_nltk[i] == macbeth_trigram_tags[i]
    for i in range(len(macbeth_trigram_tags))
]
print("Percentage of tags that coincide with the NLTK default predictor: ", end="")
def __init__(self):
    self._UNIGRAM_TRAIN_SETS = [[
        # quantities
        ("teaspoon", "QT"), ("tablespoon", "QT"), ("lbs", "QT"), ("g", "QT"),
        ("grams", "QT"), ("pounds", "QT"), ("cups", "QT"), ("whole", "QT"),
        ("chopped", "QT"), ("medium", "QT"), ("size", "QT"),
        # ingredients
        ("flour", "ING"), ("water", "ING"), ("salt", "ING"), ("sugar", "ING"),
        ("pepper", "ING"), ("oil", "ING"), ("beef", "ING"), ("butter", "ING"),
        ("mushrooms", "ING"), ("onions", "ING"), ("wine", "ING"), ("stock", "ING"),
        ("chives", "ING"), ("paneer", "ING"), ("capsicum", "ING"), ("ghee", "ING"),
        ("tomatoes", "ING"), ("coriander", "ING"), ("chillies", "ING"),
        ("garlic", "ING"), ("ginger", "ING"), ("fenugreek", "ING"),
        ("red", "ING"), ("green", "ING"), ("yellow", "ING"),
        ("Avocadoes", "ING"), ("Beans", "ING"), ("Cheese", "ING"),
        ("chipotles", "ING"), ("chocolate", "ING"), ("limes", "ING"),
        ("oregano", "ING"), ("pickles", "ING"), ("limes", "ING"),
        ("lemon", "ING"), ("tomatoes", "ING"), ("bell pepper", "ING"),
        ("capsicum", "ING"), ("eggplant", "ING"), ("lentils", "ING"),
        ("basil", "ING"), ("thyme", "ING"), ("Parsley", "ING"), ("Mint", "ING"),
        ("rosemary", "ING"), ("sage", "ING"), ("chives", "ING"), ("dill", "ING"),
        ("cilantro", "ING"), ("Tarragon", "ING"), ("saffron", "ING"),
        ("cardamom", "ING"), ("cinnamon", "ING"), ("cloves", "ING"),
        ("cumin", "ING"),
    ]]

    self._BIGRAM_TRAIN_SETS = [
        [("coriander", "ING"), ("seeds", "ING")],
        [("garlic", "ING"), ("paste", "ING")],
        [("green", "ING"), ("chillies", "ING")],
        [("chopped", "ING"), ("ginger", "ING")],
        [("fenugreek", "ING"), ("leaves", "ING")],
        [("size", "ING"), ("tomatoes", "ING")],
        [("red", "ING"), ("chillies", "ING")],
    ]

    self._TRIGRAM_TRAIN_SETS = [
        [("whole", "ING"), ("red", "ING"), ("chillies", "ING")],
        [("chopped", "ING"), ("green", "ING"), ("chillies", "ING")],
        [("medium", "ING"), ("size", "ING"), ("tomatoes", "ING")],
    ]

    # Note: nltk.tag._POS_TAGGER existed only in NLTK 2.x; on NLTK 3+ this
    # load fails, and a SequentialBackoffTagger such as nltk.DefaultTagger('NN')
    # would be needed as the final fallback instead.
    self._default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)
    self._uni_tagger = nltk.UnigramTagger(self._UNIGRAM_TRAIN_SETS,
                                          backoff=self._default_tagger)
    self._bi_tagger = nltk.BigramTagger(self._BIGRAM_TRAIN_SETS,
                                        backoff=self._uni_tagger)
    self._tri_tagger = nltk.TrigramTagger(self._TRIGRAM_TRAIN_SETS,
                                          backoff=self._bi_tagger)
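# A standalone sketch of the same trigram -> bigram -> unigram -> default
# backoff idea with a tiny toy lexicon; the training data here is illustrative
# only, not the original sets above.
import nltk

uni = nltk.UnigramTagger([[("flour", "ING"), ("cups", "QT")]],
                         backoff=nltk.DefaultTagger('NN'))
bi = nltk.BigramTagger([[("green", "ING"), ("chillies", "ING")]], backoff=uni)
tri = nltk.TrigramTagger([[("whole", "ING"), ("red", "ING"), ("chillies", "ING")]],
                         backoff=bi)
# Words seen in training come out as ING/QT; unseen words fall through to 'NN'.
print(tri.tag("whole red chillies and flour".split()))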
tagmap[""] = "X" for line in contents.splitlines(): line = line.strip() if line == "": continue fine, coarse = line.split("\t") tagmap[fine] = coarse def simplify_tag(t): if "+" in t: t = t[t.index("+") + 1:] if "|" in t: t = t[t.index("|") + 1:] t = t.lower() return tagmap[t] print "Training Tagger" dataset1 = nltk.corpus.floresta.tagged_sents() dataset2 = nltk.corpus.mac_morpho.tagged_sents() train = [[(w, simplify_tag(t)) for (w, t) in sent] for sent in dataset1 + dataset2] tagger_fast = nltk.TrigramTagger(train, backoff=nltk.BigramTagger( train, backoff=nltk.UnigramTagger( train, backoff=nltk.DefaultTagger('N')))) print "Done" with open("Models/tagger.pkl", "wb") as fid: cPickle.dump(tagger_fast, fid, cPickle.HIGHEST_PROTOCOL)
count = len(bigram_tuples)
print count
count = {item: bigram_tuples.count(item) for item in set(bigram_tuples)}
print count

default_tagger = nl.DefaultTagger('NN')
tagged_sentence = default_tagger.tag(tokens)
print tagged_sentence

patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*es$', 'VBZ'), (r'.*ed$', 'VB')]
regexp_tagger = nl.RegexpTagger(patterns)
tagged_sentence = regexp_tagger.tag(tokens)
print tagged_sentence

training = brown.tagged_sents(categories='news')
#print training
def_tagger = nl.DefaultTagger('NN')
uni_tagger = nl.UnigramTagger(training, backoff=def_tagger)
bi_tagger = nl.BigramTagger(training, backoff=uni_tagger)
tri_tagger = nl.TrigramTagger(training, backoff=bi_tagger)
print uni_tagger.tag(tokens)
print bi_tagger.tag(tokens)
print tri_tagger.tag(tokens)
# pylint: disable=C0111
# pylint: disable=C0103
import nltk
import sents
from pickle import dump

defaultTagger = nltk.DefaultTagger('N')
patterns = [(r'(da|do|de|das|dos)$', 'PREP'), (r'.*ndo$', 'V-GER')]
regexTagger = nltk.RegexpTagger(patterns, backoff=defaultTagger)
unigramTagger = nltk.UnigramTagger(sents.sentTreino, backoff=regexTagger)
bigramTagger = nltk.BigramTagger(sents.sentTreino, backoff=unigramTagger)
trigramTagger = nltk.TrigramTagger(sents.sentTreino, backoff=bigramTagger)
FinalTagger = trigramTagger

output = open('mac_morpho.pkl', 'wb')
dump(FinalTagger, output, -1)
output.close()

# resultado = unigramTagger.evaluate(sents.sentTeste)
# print(resultado*100.0)

# Accuracy was 81.521% with regex (prep), default (N), and unigram
# 2017-10-01 14:40
# Accuracy was 81.545% with regex (prep), default (N), unigram, and bigram
# 2017-10-03 07:28
# Accuracy was 81.553% with regex (prep), default (N), unigram, and bigram
# 2017-10-01 14:33
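# Sketch of reloading the pickled tagger for later use, assuming the same
# 'mac_morpho.pkl' path written above:
from pickle import load
with open('mac_morpho.pkl', 'rb') as f:
    tagger = load(f)
print(tagger.tag('O rato roeu a roupa'.split()))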
regex_tag = nltk.RegexpTagger([
    #(r'[$][0-9]+\s[MmBbTt]\S+', 'DV'),  # dollar value
    (r'^[-\:]?[0-9]+(.[0-9]+)?$', 'CD'),
    (r'.*able$', 'JJ'),
    (r'^[A-Z].*$', 'NNP'),
    (r'.*ly$', 'RB'),
    (r'.*s$', 'NNS'),
    (r'.*ing$', 'VBG'),
    (r'.*ed$', 'VBD'),
    (r'.[\/\/]\S+', 'URL'),  # URL / useless
    (r'.*', 'UNK')  # unknown terms
])
unigram_tag = nltk.UnigramTagger(train, backoff=regex_tag)
bigram_tag = nltk.BigramTagger(train, backoff=unigram_tag)
trigram_tag = nltk.TrigramTagger(train, backoff=bigram_tag)

# PoS Brown Corpus tagging: https://en.wikipedia.org/wiki/Brown_Corpus
# Custom-defined context-free grammar (CFG) by vipul
cfg = dict()
cfg['NNP+NNP'] = 'NNP'
cfg['NN+NN'] = 'NNI'
cfg['NNP+NNI'] = 'NNI'
cfg['NNI+NN'] = 'NNI'
cfg['NNI+NNI'] = 'NNI'
cfg['NNI+NNP'] = 'NNI'
cfg['JJ+JJ'] = 'JJ'
cfg['JJ+NN'] = 'NNI'
cfg['CD+CD'] = 'CD'
cfg['NPI+NNP'] = 'NNP'  # this is specific for collecting terms with the word "deal"
cfg['NNI+RP'] = 'NNI'   # collects terms like "heats up" -- RP = adverb particle
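# A minimal sketch of how such a merge table might be applied: repeatedly fuse
# adjacent (word, tag) pairs whose tag pair appears in cfg. The helper name and
# merge loop are our own illustration, not from the original.
def merge_terms(tagged):
    merged = True
    while merged:
        merged = False
        for i in range(len(tagged) - 1):
            (w1, t1), (w2, t2) = tagged[i], tagged[i + 1]
            key = t1 + '+' + t2
            if key in cfg:
                # Fuse the pair into one multi-word term with the merged tag.
                tagged = tagged[:i] + [(w1 + ' ' + w2, cfg[key])] + tagged[i + 2:]
                merged = True
                break
    return tagged

print(merge_terms([('deal', 'NNI'), ('flow', 'NN')]))  # -> [('deal flow', 'NNI')]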
def __init__(self):
    self.train_tagged_sents = brown.tagged_sents()
    self.default_tagger = nltk.DefaultTagger('NN')
    self.unigram_tagger = nltk.UnigramTagger(self.train_tagged_sents,
                                             backoff=self.default_tagger)
    self.bigram_tagger = nltk.BigramTagger(self.train_tagged_sents,
                                           backoff=self.unigram_tagger)
    self.trigram_tagger = nltk.TrigramTagger(self.train_tagged_sents,
                                             backoff=self.bigram_tagger)
r = default_tagger.evaluate(brown_tagged_sents)
print(r)

unigram_tagger = nltk.UnigramTagger(brown_tagged_sents, verbose=True)
print(unigram_tagger.tag(brown_sents[2007]))
r = unigram_tagger.evaluate(brown_tagged_sents)
print(r)

train_size = int(len(brown_tagged_sents) * 0.9)
print(train_size)
train_sents = brown_tagged_sents[:train_size]
test_sents = brown_tagged_sents[train_size:]

unigram_tagger = nltk.UnigramTagger(train_sents, verbose=True)
print(unigram_tagger.tag(brown_sents[2007]))
r = unigram_tagger.evaluate(test_sents)
print(r)

t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0, verbose=True)
t2 = nltk.BigramTagger(train_sents, backoff=t1, verbose=True)
r = t2.evaluate(test_sents)

t22 = nltk.BigramTagger(train_sents, cutoff=2, backoff=t1, verbose=True)
r = t22.evaluate(test_sents)
print(r)

t3 = nltk.TrigramTagger(train_sents, backoff=t2, verbose=True)
r = t3.evaluate(test_sents)
print(r)
#print('tags2 WORD', fdTag3.most_common())
tagTexto(textoBrownTagSentNew)

'''
1. Extend the tagger example to a TrigramTagger and analyse the model's accuracy
'''
treino = mac_morpho.tagged_sents()[1000:]
teste = mac_morpho.tagged_sents()[:1000]

etiq0 = nltk.DefaultTagger('N')
etiq1 = nltk.UnigramTagger(treino, backoff=etiq0)
print('UnigramTagger', etiq1.evaluate(teste))
etiq2 = nltk.BigramTagger(treino, backoff=etiq1)
print('BigramTagger', etiq2.evaluate(teste))
etiq3 = nltk.TrigramTagger(treino, backoff=etiq2)
print('TrigramTagger', etiq3.evaluate(teste))

doc = open('textoPT.txt', encoding='utf8')
raw = doc.read()
#texto = nltk.word_tokenize('O mundo atual possui diversos idiomas.')
texto = nltk.word_tokenize(raw)
#print('etiq2', etiq2.tag(texto))
#print('etiq3', etiq3.tag(texto))

'''
2. Implement 10-fold cross-validation and analyse the accuracy of the models.
Discuss the results. (A sketch follows below.)
'''
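# A minimal sketch of the requested 10-fold cross-validation, reusing the
# backoff chain from exercise 1; the fold slicing and averaging are our own
# choices, not from the original.
import nltk
from nltk.corpus import mac_morpho

sents = list(mac_morpho.tagged_sents())
k = 10
fold_size = len(sents) // k
scores = []
for i in range(k):
    # Hold out the i-th slice for testing and train on the remainder.
    teste = sents[i * fold_size:(i + 1) * fold_size]
    treino = sents[:i * fold_size] + sents[(i + 1) * fold_size:]
    etiq0 = nltk.DefaultTagger('N')
    etiq1 = nltk.UnigramTagger(treino, backoff=etiq0)
    etiq2 = nltk.BigramTagger(treino, backoff=etiq1)
    etiq3 = nltk.TrigramTagger(treino, backoff=etiq2)
    scores.append(etiq3.evaluate(teste))
print('mean accuracy over %d folds: %.4f' % (k, sum(scores) / k))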
    backoff_reg_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    most_freq_words = fd.keys()[:1500]
    likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
    lookup_tagger = nltk.UnigramTagger(model=likely_tags, backoff=backoff_reg_tagger)
    print "%s:test:%lf" % (method, lookup_tagger.evaluate(test))
elif method == 'unigram':
    # unigram backoff tagger
    unigram_tagger = nltk.UnigramTagger(train, backoff=default_tagger)
    print "%s:test:%lf" % (method, unigram_tagger.evaluate(test))
elif method == 'bigram':
    # bigram backoff tagger
    unigram_tagger = nltk.UnigramTagger(train, backoff=default_tagger)
    bigram_tagger = nltk.BigramTagger(train, backoff=unigram_tagger)
    print "%s:test:%lf" % (method, bigram_tagger.evaluate(test))
elif method == 'trigram':
    # trigram backoff tagger
    unigram_tagger = nltk.UnigramTagger(train, backoff=default_tagger)
    bigram_tagger = nltk.BigramTagger(train, backoff=unigram_tagger)
    trigram_tagger = nltk.TrigramTagger(train, cutoff=10, backoff=bigram_tagger)
    print "%s:test:%lf" % (method, trigram_tagger.evaluate(test))
else:
    print >> sys.stderr, "unknown method"
    sys.exit(2)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import floresta
import cPickle

FILENAME = "txts/floresta_trigram.pos"

def simplify_tag(t):
    if '+' in t:
        return t[t.index('+') + 1:]
    else:
        return t

tsents = floresta.tagged_sents()
tsents = [[(w.lower(), simplify_tag(t)) for (w, t) in sent] for sent in tsents if sent]
train = tsents[100:]
test = tsents[:100]

tagger0 = nltk.DefaultTagger('n')
tagger1 = nltk.UnigramTagger(train, backoff=tagger0)
tagger2 = nltk.BigramTagger(train, backoff=tagger1)
tagger = nltk.TrigramTagger(train, backoff=tagger2)
tagger.evaluate(test)

with open(FILENAME, 'wb') as outFile:
    cPickle.dump(tagger, outFile, -1)
if o in ('-m', '--method'):
    method = a

train_tagged_sents = brown.tagged_sents(categories=trainsection)
test_tagged_sents = brown.tagged_sents(categories=testsection)
train_tagged_words = brown.tagged_words(categories=trainsection)
test_tagged_words = brown.tagged_words(categories=testsection)
train_words = brown.words(categories=trainsection)

print_to_file("\n\nmethod = " + method + "\n")

# Note: this rebinds the name default_tag from the helper function to its result
default_tag = default_tag(train_tagged_sents)
default_tagger = nltk.DefaultTagger(default_tag)

if method in ['unigram', 'bigram', 'trigram']:
    tu = nltk.UnigramTagger(train_tagged_sents, backoff=default_tagger)
    tb = nltk.BigramTagger(train_tagged_sents, backoff=tu)
    tt = nltk.TrigramTagger(train_tagged_sents, backoff=tb)

fd = nltk.FreqDist(train_words)
cfd = nltk.ConditionalFreqDist(train_tagged_words)
d = {k: cfd[k].max() for k in fd.keys()[:1000]}
patterns = [
    (r'^the$', 'AT'),
    (r'^,$', ','),
    (r'^\.$', '.'),
    (r'^of$', 'IN'),
    (r'^and$', 'CC'),
    (r'^to$', 'TO'),
    (r'^a$', 'AT'),
    (r'^in$', 'IN'),
    (r'^that$', 'CS'),
print("{0:.4f} HiddenMarkovModelTagger".format(result)) def createDataFrame(): df = pd.DataFrame() df['word'] = [w for s in result for w in s] df['bi_tag'] = [w[1] for s in bi_tagged for w in s] df['tri_tag'] = [w[1] for s in tri_tagged for w in s] df['hmm_tag'] = [w[1] for s in hmm_tagged for w in s] return df tagged_texts = loopFiles(sys.argv[1]) # loen sisse treeninghulga test_texts = loopFiles(sys.argv[2]) # loen sisse teshulga andmed' train_sents = tagged_texts default_tagger = nltk.DefaultTagger("S") #S(nimisona) on koige sagedasem unigram_tagger_backoff = nltk.UnigramTagger(train_sents, backoff = default_tagger) bigram_tagger_backoff = nltk.BigramTagger(train_sents, backoff = unigram_tagger_backoff) trigram_tagger_backoff = nltk.TrigramTagger(train_sents, backoff = bigram_tagger_backoff) hmm_tagger = nltk.HiddenMarkovModelTagger.train(train_sents) result = get_tagged_words(os.getcwd() + '/' + sys.argv[3], 2) bi_tagged = bigram_tagger_backoff.tag_sents(result) tri_tagged = trigram_tagger_backoff.tag_sents(result) hmm_tagged = hmm_tagger.tag_sents(result) #Loome DataFrame'i df = createDataFrame() #Kirjutame faili df.to_csv("ossip_villem-oskar_4.csv", header=False)
import argparse
import os
import pickle as pkl

import nltk
import tensorflow as tf

if __name__ == "__main__":
    parser = argparse.ArgumentParser("Create a part of speech tagger")
    parser.add_argument("--tagger_file", type=str, default="tagger.pkl")
    args = parser.parse_args()
    tf.io.gfile.makedirs(os.path.dirname(args.tagger_file))

    brown_tagged_sentences = nltk.corpus.brown.tagged_sents(tagset='universal')
    training_split = int(len(brown_tagged_sentences) * 0.9)
    train_sentences = brown_tagged_sentences[:training_split]
    test_sentences = brown_tagged_sentences[training_split:]

    t0 = nltk.DefaultTagger('<unk>')
    t1 = nltk.UnigramTagger(train_sentences, backoff=t0)
    t2 = nltk.BigramTagger(train_sentences, backoff=t1)
    t3 = nltk.TrigramTagger(train_sentences, backoff=t2)

    scores = [[t0.evaluate(test_sentences), t0],
              [t1.evaluate(test_sentences), t1],
              [t2.evaluate(test_sentences), t2],
              [t3.evaluate(test_sentences), t3]]
    best_score, best_tagger = max(scores, key=lambda x: x[0])
    print("Finished building tagger with {0:.2f}% accuracy".format(best_score * 100))

    with tf.io.gfile.GFile(args.tagger_file, 'wb') as f:
        pkl.dump(best_tagger, f)
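# Sketch of loading the saved tagger back with the same GFile API; the path
# assumes the default --tagger_file value above.
with tf.io.gfile.GFile("tagger.pkl", 'rb') as f:
    tagger = pkl.load(f)
print(tagger.tag("The quick brown fox".split()))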
try:
    word_tokenizer
except NameError:
    word_tokenizer = make_word_tokenizer()

try:
    sent_tokenizer
except NameError:
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

try:
    tagger
except NameError:
    brown_a = nltk.corpus.brown.tagged_sents(categories='a')
    t0 = nltk.DefaultTagger('WTF')
    t1 = nltk.UnigramTagger(brown_a, backoff=t0)
    t2 = nltk.BigramTagger(brown_a, backoff=t1)
    tagger = nltk.TrigramTagger(brown_a, backoff=t2)

class SpeechAnalyzer(object):
    NOTATIONS = {
        r'--': "##PAUSE##",
        r'\(sic\)': "##SIC##",
        r'\[mispronunciation\]': '##MISPRONUNCIATION##',
        r'\.\.\.': ' ##PAUSE## '
    }
    PHRASES = [
        "wall street",
        "main street",
        "my friends",
        "middle class",
        "fannie mae",
def initialize_taggers(self):
    self.t0 = nltk.DefaultTagger('unk')
    self.t1 = nltk.UnigramTagger(self.train, backoff=self.t0)
    self.t2 = nltk.BigramTagger(self.train, backoff=self.t1)
    self.t3 = nltk.TrigramTagger(self.train, backoff=self.t2)
return output

if __name__ == "__main__":
    train_file = sys.argv[1]
    test_file = sys.argv[2]
    train_data, word_counts = read_train_data(train_file)
    test_data = read_test_data(test_file, word_counts)

    # Brill Tagger https://www.nltk.org/book/ch05.html 5.4
    templates = brill.fntbl37()
    t0 = nltk.DefaultTagger("NN")
    t1 = nltk.UnigramTagger(train_data, backoff=t0)
    t2 = nltk.BigramTagger(train_data, backoff=t1)
    t3 = nltk.TrigramTagger(train_data, backoff=t2)
    trainer = nltk.tag.BrillTaggerTrainer(t3, templates)
    # The Brill trainer is constructed but not run here; the plain trigram
    # chain is used (a training sketch follows this snippet).
    model = t3

    for sent in test_data:
        if sent:
            tagged_sent = model.tag(sent)
            output = []
            for word, tag in tagged_sent:
                output.append(word + "_" + tag)
            print(" ".join(output))
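# If the transformation-based Brill tagger were wanted instead of the plain
# trigram chain, the call on the trainer above would look like this;
# max_rules=100 is an assumed setting, not from the original script.
model = trainer.train(train_data, max_rules=100)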
baseline_tagger = nltk.UnigramTagger(model=likely_tags,
                                     backoff=nltk.DefaultTagger('TEGN'))
print "LU tagged w/ backoff: ", baseline_tagger.evaluate(tagged_sents)
print " "

"""Unigram tagger"""
unigram_tagger = nltk.UnigramTagger(train)
print "Unigram: ", unigram_tagger.evaluate(test)
print " "

"""Bigram tagger"""
bigram_tagger = nltk.BigramTagger(train)
print "Bigram: ", bigram_tagger.evaluate(test)
print " "

"""Trigram tagger with backoffs"""
t0 = nltk.DefaultTagger('TEGN')
t1 = nltk.UnigramTagger(train, backoff=t0)
t2 = nltk.BigramTagger(train, backoff=t1)
t3 = nltk.TrigramTagger(train, backoff=t2)
print "Trigram with backoffs: ", t3.evaluate(test)
print " "