def make_pos_model(model_type):
    """Train a POS tagger of the requested type and pickle it to disk.

    The tagger is trained on ``greek_training_set.pos`` in the current
    directory and written below ``~/greek_models_cltk/taggers/pos``.

    Args:
        model_type: One of ``'unigram'``, ``'bigram'``, ``'trigram'``,
            ``'backoff'`` (trigram backing off to bigram, then unigram)
            or ``'tnt'``.

    Returns:
        None. For an unrecognised ``model_type`` a message is printed and
        nothing is trained or written.
    """
    pickle_names = {
        'unigram': 'unigram.pickle',
        'bigram': 'bigram.pickle',
        'trigram': 'trigram.pickle',
        'backoff': '123grambackoff.pickle',
        'tnt': 'tnt.pickle',
    }
    # Validate before doing any work: previously an unknown type fell through
    # to the pickling code with neither the tagger nor the file name bound.
    if model_type not in pickle_names:
        print('Invalid model_type.')
        return None

    now = time.time()
    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()

    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
    else:  # 'tnt' is the only remaining valid type
        tagger = tnt.TnT()
        tagger.train(train_sents)

    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    # Create the target directory on first use instead of failing in open().
    if not os.path.isdir(_dir):
        os.makedirs(_dir)
    path = os.path.join(_dir, pickle_names[model_type])
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)

    print('Completed training {0} model in {1} seconds to {2}.'.format(
        model_type, time.time() - now, path))
def train(self):
    """Build the regexp -> bigram -> trigram backoff chain.

    The regexp tagger is built from ``self.patterns``; the bigram and
    trigram taggers are trained on ``brown.tagged_sents()``. The three
    stages are stored as ``re_tagger``, ``bi_tagger`` and ``tri_tagger``.
    """
    self.re_tagger = nltk.RegexpTagger(self.patterns)
    brown_sents = brown.tagged_sents()
    # Each stage falls back to the previously built one.
    self.bi_tagger = BigramTagger(brown_sents, backoff=self.re_tagger)
    self.tri_tagger = TrigramTagger(brown_sents, backoff=self.bi_tagger)
def __init__(self, idiom):
    """Create placeholder taggers, set the pickle directory and load ``idiom``.

    Args:
        idiom: Name passed on to ``self.loadIdiom``.
    """
    # NOTE(review): the second positional argument of NLTK's n-gram taggers
    # appears to be ``model`` rather than ``backoff`` -- confirm that passing
    # a tagger there is intended.
    self.tagger0 = DefaultTagger('N')
    self.tagger1 = UnigramTagger(None, self.tagger0)
    self.tagger2 = BigramTagger(None, self.tagger1)

    # Pickles live under <abs path of 'FriggAnswer'>/pickle/.
    base_dir = os.path.abspath('FriggAnswer')
    self.lang = base_dir + '/pickle/'
    # Alternative kept from the original: os.path.abspath('pickle')+'\\'

    self.loadIdiom(idiom)
def get_pos_tagger(self):
    """Build and return the POS tagger.

    The returned tagger is a regexp tagger that overrides a few quantifier
    words and otherwise backs off to a trigram -> bigram -> unigram chain
    trained on the Brown corpus 'news' category, which in turn backs off to
    a set of suffix-based regexp rules.

    Returns:
        The outermost ``RegexpTagger`` of the chain.
    """
    from nltk.corpus import brown

    regexp_tagger = RegexpTagger([
        # The dot is escaped so that only a literal decimal point is accepted;
        # unescaped it matched any character (e.g. "1a2" was tagged CD).
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),    # articles
        (r'.*able$', 'JJ'),                 # adjectives
        (r'.*ness$', 'NN'),                 # nouns formed from adjectives
        (r'.*ly$', 'RB'),                   # adverbs
        (r'.*s$', 'NNS'),                   # plural nouns
        (r'.*ing$', 'VBG'),                 # gerunds
        (r'.*ed$', 'VBD'),                  # past tense verbs
        (r'.*', 'NN'),                      # nouns (default)
    ])
    brown_train = brown.tagged_sents(categories='news')
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

    # Override particular words
    main_tagger = RegexpTagger(
        [(r'(A|a|An|an)$', 'ex_quant'),
         (r'(Every|every|All|all)$', 'univ_quant')],
        backoff=trigram_tagger,
    )
    return main_tagger
def traintest_bigram_trigram_tagger(self):
    """Train bigram and trigram taggers on treebank and print their scores.

    The first 3000 tagged treebank sentences are used for training and the
    remainder for evaluation. Progress messages and the two evaluation
    results are printed; nothing is returned.
    """
    from nltk.tag import BigramTagger, TrigramTagger
    from nltk.corpus import treebank

    test_sents = treebank.tagged_sents()[3000:]
    train_sents = treebank.tagged_sents()[:3000]

    # Single-argument print(...) behaves the same under Python 2 and 3.
    print('trainging bigramTagger')
    bitagger = BigramTagger(train_sents)
    print('evaluation bitagger')
    print(bitagger.evaluate(test_sents))

    print('trainging trigram Tagger')
    tritagger = TrigramTagger(train_sents)
    # This label previously said "bitagger" although the trigram tagger is
    # the one being evaluated here.
    print('evaluation tritagger')
    print(tritagger.evaluate(test_sents))
    print('tagging')
def get_pos_tagger(self):
    """Return a regexp tagger backed by a Brown-trained n-gram chain.

    Lookup order: quantifier overrides, then trigram, bigram and unigram
    taggers trained on the Brown 'news' category, then the suffix rules.
    """
    from nltk.corpus import brown

    # Last-resort rules; the final pattern tags everything else as a noun.
    suffix_rules = [
        (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),  # cardinal numbers
        (r"(The|the|A|a|An|an)$", "AT"),  # articles
        (r".*able$", "JJ"),  # adjectives
        (r".*ness$", "NN"),  # nouns formed from adjectives
        (r".*ly$", "RB"),  # adverbs
        (r".*s$", "NNS"),  # plural nouns
        (r".*ing$", "VBG"),  # gerunds
        (r".*ed$", "VBD"),  # past tense verbs
        (r".*", "NN"),  # nouns (default)
    ]
    chain = RegexpTagger(suffix_rules)

    news_sents = brown.tagged_sents(categories="news")
    for ngram_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        chain = ngram_cls(news_sents, backoff=chain)

    # Particular words are overridden before the statistical taggers run.
    quantifier_rules = [
        (r"(A|a|An|an)$", "ex_quant"),
        (r"(Every|every|All|all)$", "univ_quant"),
    ]
    return RegexpTagger(quantifier_rules, backoff=chain)
def train(self, sentence_list):
    """Train the tagger from the tagged sentences provided.

    A default('NN') -> affix -> unigram -> bigram -> trigram backoff chain
    is trained first and then refined by a Brill trainer; the resulting
    tagger is stored in ``self.tagger``.
    """
    # Build the backoff chain from the most general stage outwards.
    initial = DefaultTagger('NN')
    for stage_cls in (AffixTagger, UnigramTagger, BigramTagger, TrigramTagger):
        initial = stage_cls(sentence_list, backoff=initial)

    # Symmetric templates: every window for tag rules, then for word rules.
    windows = [(1, 1), (2, 2), (1, 2), (1, 3)]
    templates = [
        brill.SymmetricProximateTokensTemplate(rule, window)
        for rule in (brill.ProximateTagsRule, brill.ProximateWordsRule)
        for window in windows
    ]
    # Two-sided templates looking one token back and one token ahead.
    templates.append(
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)))
    templates.append(
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1)))

    brill_trainer = brill.FastBrillTaggerTrainer(initial, templates)
    self.tagger = brill_trainer.train(sentence_list, max_rules=100, min_score=3)
def lexical(tokens):
    """Tag ``tokens`` with a bigram backoff tagger and print the result.

    The tagger chain (default 'NN' -> unigram -> bigram) is trained on the
    module-level ``train_sent`` and evaluated on the module-level
    ``test_sents``.

    Returns:
        The list of (token, tag) pairs produced by the tagger.
    """
    # Single-argument print(...) is used so output is the same on Python 2.
    print("\n")
    print("Step 2: Lexical Analysis\n")
    print("Essentially refers to dictionary and obtains the properties of the word")
    print("Part-Of-Speech tagging")
    print("The tagset is:\n")

    fallback = DefaultTagger('NN')
    unigram_stage = UnigramTagger(train_sent, backoff=fallback)
    bigram_stage = BigramTagger(train_sent, backoff=unigram_stage)

    tagged = bigram_stage.tag(tokens)
    for word, label in tagged:
        print(word + "->" + label)

    print("\n")
    print("The acurracy of the trained pos tagger is:")
    print(bigram_stage.evaluate(test_sents))
    return tagged
def train_tagger(language, model_type, feature, train_sents):
    """Train and return a tagger of the requested type.

    Args:
        language: Used only to build the CRF model path.
        model_type: One of ``'unigram'``, ``'bigram'``, ``'trigram'``,
            ``'backoff'``, ``'crf'`` or ``'perceptron'``.
        feature: Used only to build the CRF model path.
        train_sents: Tagged sentences to train on.

    Returns:
        The trained tagger.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
            (Previously this surfaced as an unbound-local error at ``return``.)
    """
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
    elif model_type == 'crf':
        tagger = CRFTagger()
        tagger.train(train_sents,
                     'taggers/{0}/{1}/crf.pickle'.format(language, feature))
    elif model_type == 'perceptron':
        tagger = PerceptronTagger(load=False)
        tagger.train(train_sents)
    else:
        raise ValueError('Invalid model_type: {0!r}'.format(model_type))
    return tagger
def train_tagger(tagger_name):
    """Train a tagger on the first 5000 tagged treebank sentences.

    Args:
        tagger_name: ``"TnT"`` or ``'tagger'`` selects a TnT tagger; any
            other value selects an n-gram backoff tagger.

    Returns:
        The trained tagger.
    """
    train_sents = treebank.tagged_sents()[:5000]
    if tagger_name == "TnT" or tagger_name == 'tagger':
        trained_tagger = tnt.TnT()
        trained_tagger.train(train_sents)
    else:
        # Most specific model first: trigram -> bigram -> unigram -> default.
        # The chain used to be inverted (unigram outermost), so the bigram
        # and trigram models were only asked about tokens the unigram tagger
        # could not tag.
        default_tagger = DefaultTagger('NN')
        unigram_tagger = UnigramTagger(train_sents, backoff=default_tagger)
        bigram_tagger = BigramTagger(train_sents, backoff=unigram_tagger)
        trained_tagger = TrigramTagger(train_sents, backoff=bigram_tagger)
    return trained_tagger
def ngram_tagger(tagged_sents):
    """Train a trigram -> bigram -> unigram -> regexp backoff tagger.

    Args:
        tagged_sents: Tagged sentences used to train the n-gram stages.

    Returns:
        The outermost (trigram) tagger.
    """
    # NOTE(review): the "n||p" in the group below introduces an empty
    # alternative -- looks accidental; confirm before changing the pattern.
    consonants = '(b|c|d|f|g|h|j|k|l|m|n||p|q|r|s|t|v|w|x|z)'
    vowels = '(a|e|i|o|u|ä|î|ô|ü)'
    fallback_rules = [
        (consonants + 'e ' + consonants, 'MORA'),
        ('.*' + vowels + vowels, 'DOPPEL'),
        ('.*', 'MORA_HAUPT'),  # default
    ]

    chain = nltk.RegexpTagger(fallback_rules)
    # cutoff = 3 could be passed to the unigram stage, if necessary
    for ngram_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        chain = ngram_cls(tagged_sents, backoff=chain)
    return chain
def test_ngram_taggers(self):
    """Round-trip a 4-gram backoff chain through the encoder and decoder.

    Every level of the decoded chain must have the same ``repr`` as the
    corresponding original tagger, down to ``self.default_tagger``.
    """
    unitagger = UnigramTagger(self.corpus, backoff=self.default_tagger)
    bitagger = BigramTagger(self.corpus, backoff=unitagger)
    tritagger = TrigramTagger(self.corpus, backoff=bitagger)
    ntagger = NgramTagger(4, self.corpus, backoff=tritagger)

    decoded = self.decoder.decode(self.encoder.encode(ntagger))

    # Walk the decoded chain outermost-first, stepping to .backoff only
    # between comparisons.
    expected_chain = (ntagger, tritagger, bitagger, unitagger,
                      self.default_tagger)
    current = decoded
    for depth, expected in enumerate(expected_chain):
        if depth:
            current = current.backoff
        self.assertEqual(repr(expected), repr(current))
def train_brill_tagger(tagged_sents):
    """Train a Brill tagger on top of a regexp/n-gram backoff chain.

    Args:
        tagged_sents: Tagged sentences used for both the n-gram stages and
            the Brill rule learning.

    Returns:
        The tagger produced by ``BrillTaggerTrainer.train`` (at most 200
        rules).
    """
    # The brill tagger module in NLTK.
    Template._cleartemplates()
    templates = brill24()  # or fntbl37

    # NOTE(review): the "n||p" in the first pattern introduces an empty
    # alternative -- looks accidental; confirm before changing the pattern.
    patterns = [
        (r'''(b|c|d|f|g|h|j|k|l|m|n||p|q|r|s|t|v|w|x|z)e (b|c|d|f|g|h|j|k|l|m|n||p|q|r|s|t|v|w|x|z)''',
         'MORA'),
        (r'.*(a|e|i|o|u|ä|î|ô|ü)(a|e|i|o|u|ä|î|ô|ü)', 'DOPPEL'),
        (r'.*', 'MORA_HAUPT'),  # default
    ]
    regex_tagger = nltk.RegexpTagger(patterns)

    tagger1 = UnigramTagger(tagged_sents, backoff=regex_tagger)  # cutoff = 3, if necessary
    tagger2 = BigramTagger(tagged_sents, backoff=tagger1)
    tagger3 = TrigramTagger(tagged_sents, backoff=tagger2)

    tagger4 = brill_trainer.BrillTaggerTrainer(tagger3, templates, trace=3)
    tagger5 = tagger4.train(tagged_sents, max_rules=200)
    # Blank line after the trainer's trace output. A bare `print` is a no-op
    # expression on Python 3; print('') emits the newline on both 2 and 3.
    print('')
    return tagger5
class Tagger:
    """POS tagger wrapper that loads two pickled taggers for an idiom."""

    def __init__(self, idiom):
        """Create placeholder taggers, set the pickle directory, load ``idiom``.

        Args:
            idiom: Prefix of the pickle file names to load.
        """
        # NOTE(review): the second positional argument of NLTK's n-gram
        # taggers appears to be ``model`` rather than ``backoff`` -- confirm.
        # ``tagger2`` is replaced by loadIdiom() below in any case.
        self.tagger0 = DefaultTagger('N')
        self.tagger1 = UnigramTagger(None, self.tagger0)
        self.tagger2 = BigramTagger(None, self.tagger1)
        self.lang = os.path.abspath('FriggAnswer')+'/pickle/'
        #self.lang = os.path.abspath('pickle')+'\\'
        self.loadIdiom(idiom)

    def loadIdiom(self, idiom):
        """Load ``<idiom>1.pkl`` into ``self.tagger`` and ``<idiom>2.pkl``
        into ``self.tagger2`` from the ``self.lang`` directory.

        Raises:
            IOError/OSError: If either pickle file cannot be opened.
        """
        # SECURITY: `load` presumably unpickles (confirm the import);
        # unpickling executes arbitrary code, so these files must come from
        # a trusted source.
        # `with` closes each file even if loading raises.
        with open(self.lang + idiom + '1.pkl', 'rb') as first_pickle:
            self.tagger = load(first_pickle)
        with open(self.lang + idiom + '2.pkl', 'rb') as second_pickle:
            self.tagger2 = load(second_pickle)

    def classify(self, question):
        """Return ``self.tagger2.tag(question)``."""
        tags = self.tagger2.tag(question)
        return tags
def train_tagger():
    '''
    Example of training a part-of-speech tagger with a trigram model.

    A part-of-speech tagger identifies the class of each word.
    E.g.: Isso é um teste = Isso-PROSUB é-V um-ART teste-N
    (roughly: pronoun, verb, article, noun)

    Returns the trigram tagger, which backs off to bigram, unigram and
    finally a default tagger.
    '''
    # Load the Portuguese mac_morpho corpus of tagged sentences, keeping only
    # the part of each tag before the first '|' or '-'.
    data = []
    for sent in mac_morpho.tagged_sents():
        data.append([(w, re.split('[|-]', tag)[0]) for w, tag in sent])

    # Default word class. 'N' stands for noun.
    chain = DefaultTagger('N')
    stages = (
        ('train unigram', UnigramTagger),
        ('training bigram', BigramTagger),
        ('training trigram', TrigramTagger),
    )
    for message, stage_cls in stages:
        print(message)
        chain = stage_cls(data, backoff=chain)
    return chain
from flask_cors import CORS from flask_socketio import SocketIO, emit from logger import getlogger import nltk import sys import timeit import urllib.request import json import settings import requests from nltk.corpus import alpino as alp from nltk.tag import UnigramTagger, BigramTagger training_corpus = alp.tagged_sents() unitagger = UnigramTagger(training_corpus) bitagger = BigramTagger(training_corpus, backoff=unitagger) pos_tag = bitagger.tag logger = getlogger(__name__) app = Flask(__name__, template_folder='html/templates', static_folder='html/static') CORS(app) socketio = SocketIO(app) app.debug = False @app.route('/') def index(): return render_template('test4nl.html') def query_pixabay(nouns): if nouns:
# Compare stand-alone bigram/trigram taggers with a combined backoff tagger
# on treebank: first 3000 tagged sentences for training, the rest for testing.
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.corpus import treebank
from tag_util import backoff_tagger

train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]

# Bigram tagger on its own (no backoff).
bitagger = BigramTagger(train_sents)
print(bitagger.evaluate(test_sents))

# Trigram tagger on its own (no backoff).
tritagger = TrigramTagger(train_sents)
print(tritagger.evaluate(test_sents))

# Unigram/bigram/trigram taggers combined via backoff_tagger, with a
# default 'NN' tagger as the final fallback.
default_tagger = DefaultTagger('NN')
combined_tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=default_tagger)
print(combined_tagger.evaluate(test_sents))

# Earlier experiment, kept for reference (unigram tagger saved to a pickle):
# # train
# default_tagger = DefaultTagger('NN')
#
# train_sents = treebank.tagged_sents()[:3000]
# tagger = UnigramTagger(train_sents, backoff=default_tagger)
#
# # test
# test_sents = treebank.tagged_sents()[3000:]
# print(tagger.evaluate(test_sents))
#
# # save to pickle
# import pickle
# with open('unitagger.pkl', 'wb') as output:
#     pickle.dump(tagger, output)
# finalise a sequential tagger # ============================================================================= """ """ 1. run tagger with different corpus size (50% and 100%) """ # backoff tagger tag1_eval = dict() # train with backoff and Brill tic() tag1_tagger = DefaultTagger('NO') tag1_tagger = AffixTagger(train_sents, affix_length=-1, backoff=tag1_tagger) tag1_tagger = AffixTagger(train_sents, affix_length=-2, backoff=tag1_tagger) tag1_tagger = AffixTagger(train_sents, affix_length=-3, backoff=tag1_tagger) tag1_tagger = AffixTagger(train_sents, affix_length=-4, backoff=tag1_tagger) tag1_tagger = AffixTagger(train_sents, affix_length=-5, backoff=tag1_tagger) tag1_tagger = UnigramTagger(train_sents, cutoff=3, backoff=tag1_tagger) tag1_tagger = BigramTagger(train_sents, backoff=tag1_tagger) tag1_tagger = TrigramTagger(train_sents, backoff=tag1_tagger) tag1b_tagger = train_brill_tagger(tag1_tagger, train_sents, True, max_rules=100) tag1_eval['train_time'] = toc() # test tic() tag1_eval['test_accuracy'] = tag1b_tagger.evaluate(val_sents) tag1_eval['test_time'] = toc() # display results display_training_metrics(tag1_eval) """ # ============================================================================= # finalise a classification-based tagger
def train_dutch_tagger():
    """Train a bigram tagger (unigram backoff) on ``alp.tagged_sents()``.

    Returns:
        The bound ``tag`` method of the trained bigram tagger.
    """
    sentences = alp.tagged_sents()
    fallback = UnigramTagger(sentences)
    primary = BigramTagger(sentences, backoff=fallback)
    return primary.tag
default_tagger = nltk.DefaultTagger('NN') print(default_tagger.evaluate(brown_tagged_sents)) # 0.13089484257215028 brown_tagged_sents2 = [[('The', 'AT'), ('Fulton', 'NP-TL'), ('manner', 'NN')]] print(default_tagger.evaluate(brown_tagged_sents2)) # 0.3333333333333333 train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)] test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):] unigram_tagger = UnigramTagger(train_data, backoff=default_tagger) print(unigram_tagger.evaluate(test_data)) # 0.835841722316356 bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger) print(bigram_tagger.evaluate(test_data)) # 0.8454101465164956 trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger) print(trigram_tagger.evaluate(test_data)) # 0.8427190272102063 regexp_tagger = RegexpTagger( [( r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers ( r'(The|the|A|a|An|an)$', 'AT'), # articles ( r'.*able$', 'JJ'), # adjectives ( r'.*ness$', 'NN'), # nouns formed from adj ( r'.*ly$', 'RB'), # adverbs ( r'.*s$', 'NNS'), # plural nouns ( r'.*ing$', 'VBG'), # gerunds
brill.Template(brill.Word([-3, -2, -1])), brill.Template(brill.Word([1, 2, 3])), brill.Template(brill.Word([-1]), brill.Word([1])), ] trainer = brill_trainer.BrillTaggerTrainer(initial_tagger, templates, deterministic=True) return trainer.train(train_sents, **kwargs) defaultTagger = DefaultTagger('NN') initialTagger = backoff_tagger(brown_train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=defaultTagger) brillTagger = train_brill_tagger(initialTagger, brown_train_sents) tnt_tagger = tnt.TnT(N=100) tnt_tagger.train(brown_train_sents) bigramTagger = BigramTagger(brown_train_sents) trigramTagger = TrigramTagger(brown_train_sents) print("------------Recommended Tagger------------") print(nltk.pos_tag(sent)) print("------------Default Tagger------------") print(defaultTagger.tag(sent)) print("------------Unigram Tagger Overrode------------") unigramTagger = UnigramTagger(model={'Pierre': 'NN'}) print(unigramTagger.tag(sent)) print("------------Unigram Tagger Trained------------") unigramTagger = UnigramTagger(brown_train_sents) print(unigramTagger.tag(sent))
# Train a bigram tagger with unigram backoff on the alpino corpus and tag
# one sample sentence.
import nltk
from nltk.corpus import alpino as alp
from nltk.tag import UnigramTagger, BigramTagger

alpino = alp.tagged_sents()
unitagger = UnigramTagger(alpino)
# The bigram tagger falls back to the unigram tagger.
bitagger = BigramTagger(alpino, backoff=unitagger)
pos_tag = bitagger.tag

# The sample is tokenised by plain whitespace splitting.
sent = 'Een telescoop is een instrument dat een astronoom gebruikt .'.split()
print(pos_tag(sent))
# Train stand-alone bigram and trigram taggers on train_sents and evaluate
# them on test_sents (both defined outside this snippet).
# NOTE(review): the evaluate() results are not printed or stored -- this
# presumably relies on an interactive session echoing them; confirm.
from nltk.tag import BigramTagger as BigT
from nltk.tag import TrigramTagger as TriT
biTagger=BigT(train_sents)
biTagger.evaluate(test_sents)
triTagger=TriT(train_sents)
triTagger.evaluate(test_sents)
(r".*", "NN") # Nouns (default) ] rt = RegexpTagger(regexps=patterns) print(rt.evaluate(test_data)) print(rt.tag(tokens)) # 3. N-GRAM TAGGERS: # Contiguous sequences of n items from a sequence of text or speech. Items can be words, phonemes, # letters, characters or syllabes. Shingles: n-grams where items are just words. # UnigramTagger -> NGramTagger -> ContextTagger -> SequentialBackoffTagger # Train the N-Gram taggers using the training_data (pre-tagged tokens, i.e. labeled observations) ut = UnigramTagger(train=train_data) bt = BigramTagger(train_data) tt = TrigramTagger(train_data) # Test the performance of each N-Gram tagger print("1-Gram Tagger Accuracy: {}".format(ut.evaluate(test_data))) print("2-Gram Tagger Accuracy: {}".format(bt.evaluate(test_data))) print("3-Gram Tagger Accuracy: {}".format(tt.evaluate(test_data))) print("\n1-Gram tags:") print(ut.tag(tokens)) print("\n2-Gram tags:") print(bt.tag(tokens)) print("\n3-Gram tags:") print(tt.tag(tokens))
for page in list(root): l = [] text = page.find('text').text.decode('utf8') language = page.find('language').text.decode('utf8') pos = page.find('pos_tags').text.decode('utf8') splitText = text.split(" ")[1:-1] posText = pos.split(" ")[1:-1] for i in range(len(splitText)): l.append((splitText[i], posText[i])) data.append(l) count = count + 1 shuffle(data) # Divide data into train and test sets eightyPercent = count*0.9 training_set = data[0:int(eightyPercent)] test_set = data[int(eightyPercent):] # Train train_data = training_set tag1 = DefaultTagger('NN') tag2 = UnigramTagger(train_data, backoff = tag1) tag3 = BigramTagger(train_data, backoff = tag2) tag4 = TrigramTagger(train_data, backoff = tag3) # Accuracy # print tag4.tag('open a start up'.encode('utf-8').decode('utf-8').split()) # print tag4.tag('OUT nahi KARDO ISSE BAHUT HOGAYA aaj Salman'.encode('utf-8').decode('utf-8').split()) gold_sentences = test_set print tag4.evaluate(gold_sentences)
def train(self, sentence_list):
    """Train a trigram tagger with a full backoff chain on ``sentence_list``.

    Backoff order: trigram -> bigram -> unigram -> affix -> default 'NN'.
    The outermost (trigram) tagger is stored in ``self.tagger``.
    """
    chain = DefaultTagger('NN')
    # Each stage is trained on the same sentences and falls back to the
    # previously built stage.
    for stage_cls in (AffixTagger, UnigramTagger, BigramTagger, TrigramTagger):
        chain = stage_cls(sentence_list, backoff=chain)
    self.tagger = chain
# Evaluation set evaulation_data = tagged_data_list[cutoff:development_size] # print "Data is splitted!" # Regular expression tagger nn_cd_tagger = RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'PUNC'), (r'.*', 'NOUN_NOM')]) # Unigram tagger unigram_tagger = UnigramTagger(training_data, backoff=nn_cd_tagger) print "Unigram accuracy: " print unigram_tagger.evaluate(evaulation_data) # Bigram tagger bigram_tagger = BigramTagger(training_data, backoff=unigram_tagger) print "Bigram accuracy: " print bigram_tagger.evaluate(evaulation_data) # Trigram tagger trigram_tagger = TrigramTagger(training_data, backoff=bigram_tagger) print "Trigram accuracy: " print trigram_tagger.evaluate(evaulation_data) # Brill tagger templates templates = [ Template(brill.Pos([1, 1])), Template(brill.Pos([2, 2])), Template(brill.Pos([1, 2])), Template(brill.Pos([1, 3])), Template(brill.Word([1, 1])),
from nltk.corpus import treebank from nltk.corpus import wordnet as wn from os.path import isfile, join from os import listdir from pprint import pprint import gensim.downloader as api import re import nltk import os TEST_PATH = '../test/untagged' COMMON_WORDS_PATH = '../resources/1-1000.txt' TRAINING_SENTS = treebank.tagged_sents() UNIGRAM = UnigramTagger(TRAINING_SENTS, backoff=DefaultTagger('NN')) BIGRAM = BigramTagger(TRAINING_SENTS, backoff=UNIGRAM) TRIGRAM = TrigramTagger(TRAINING_SENTS, backoff=BIGRAM) STOPWORDS = set(nltk.corpus.stopwords.words('english')) WORD_VECTORS = api.load("glove-wiki-gigaword-100") TEST_FILES = [f for f in listdir(TEST_PATH) if isfile(join(TEST_PATH, f))] # Manual list of words to be considered "irrelevant" IRRELEVANT_WORDS = ["talk", "seminar", "lecture"] # manually created ontology tree, which is later extended TREE = {"science": {}, "maths": {}, "engineering": {}, "medicine": {}} # code to convert POS tags into the right form for lemmatization # https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word POS_TO_WORDNET = {
def indivBigram(bambara, backoff):
    """Train a bigram tagger on ``bambara.train_sents`` and print its score.

    Args:
        bambara: Object exposing ``train_sents`` and ``test_sents``.
        backoff: Tagger used as the bigram tagger's backoff.

    Returns:
        The trained bigram tagger.
    """
    tagger = BigramTagger(bambara.train_sents, backoff=backoff)
    accuracy = tagger.evaluate(bambara.test_sents)
    print("Bigram accuracy: ", accuracy)
    return tagger
# Train unigram, bigram and trigram taggers on `training` and print their
# scores on `testing` (both defined outside this snippet).
from nltk.tag import UnigramTagger
unigramTagger = UnigramTagger(training, cutoff=2) # same as tagger.train(training)
print('Uniigram tagger accuracy:')
print(unigramTagger.evaluate(testing))
#-----------------------------------------------------
# Bigram tagger on its own (no backoff).
print('Bigram tagger accuracy:')
from nltk.tag import BigramTagger
bigramTagger = BigramTagger(training)
print(bigramTagger.evaluate(testing))
#-----------------------------------------------------
# Trigram tagger on its own (no backoff).
print('Trigram tagger accuracy:')
from nltk.tag import TrigramTagger
trigramTagger = TrigramTagger(training)
print(trigramTagger.evaluate(testing))
#-----------------------------------------------------
#Brill Tagger
from nltk.tag import brill, brill_trainer
# make sure you've got some train_sents!
X_train = tagged_sentences[:int(len(tagged_sentences) * 0.8)] X_test = tagged_sentences[int(len(tagged_sentences) * 0.8):] ''' Question 2 - Performance of 0.13, 0.9 and 0.91 ''' # using only the default - NN - 0.1308 default_tagger = nltk.DefaultTagger('NN') print(default_tagger.evaluate(tagged_sentences)) # Unigrams - 0.902 unigram_tagger = UnigramTagger(X_train) print(unigram_tagger.evaluate(X_test)) # Bigrams with backoff of unigrams - 0.911 bigram_tagger = BigramTagger(X_train, backoff=unigram_tagger) print(bigram_tagger.evaluate(X_test)) ''' Question 3 Performace of 0.77 and 0.79 ''' treebank_tagged_sents = nltk.corpus.treebank.tagged_sents(tagset='universal') print(default_tagger.evaluate(treebank_tagged_sents)) print(unigram_tagger.evaluate(treebank_tagged_sents)) # 0.77 print(bigram_tagger.evaluate(treebank_tagged_sents)) # 0.79 ''' Question 4-5 - F1 of 0.972 for brown dataset. Better performance ''' # modified code def word2features(sent, i):
# 查看特征名 one_hot_multi.classes_ # 查看特征名 from nltk.corpus import brown from nltk.tag import UnigramTagger from nltk.tag import BigramTagger from nltk.tag import TrigramTagger # 从布朗语料库中获取文本数据,切分为句子 sentences = brown.tagged_sents(categories='news') # 将4000个句子用作训练,623个句子用作测试 train = sentences[:4000] test = sentences[4000:] # 创建回退标注器 unigram = UnigramTagger(train) bigram = BigramTagger(train, backoff=unigram) trigram = TrigramTagger(train, backoff=bigram) # 查看准确率 trigram.evaluate(test) # TF-IDF import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer # 创建文本 text_data = np.array( ['I love Brazil. Brazil!', 'Sweden is best', 'Germany beats both']) # 创建TF-IDF特征矩阵 tfidf = TfidfVectorizer() feature_matrix = tfidf.fit_transform(text_data) # 查看TF-IDF特征矩阵 feature_matrix
###---------------------------------------------------------------------------- ### Train Tagger ( Unitagger ) #Attempt to use the bigram-callout annotation identifier #If the Bigram callout cannot find the tag, try Unigram the callout #If the Unigram callout cannot find the tag, use the default callout from nltk.tag import DefaultTagger from nltk.tag import UnigramTagger from nltk.tag import BigramTagger from nltk.corpus import treebank print("Preparing, please wait\nLoading...") train = treebank.tagged_sents()[:7000] t0 = DefaultTagger('NN') t1 = UnigramTagger(train, backoff=t0) t2 = BigramTagger(train, backoff=t1) ### Train Rate_Marker ''' Sear_Edit_Dis = [None] * 10 Sear_Means1_Right = [None] * 5 Sear_Means1_Wrong = [None] * 5 Sear_Rou_Count = 0 Sear_Line_Count = 0 Sear_Total_Line = 0 Sear_Corre_Line = 0 Sear_Ques = [None] * 10 Sear_Answ = [None] * 10
# Evaluate stand-alone bigram and trigram taggers on treebank.
import nltk
from nltk.tag import BigramTagger, TrigramTagger
from nltk.corpus import treebank

# Train and test on disjoint slices. The slices used to be [:7000] and
# [2000:], which overlap, so the taggers were scored on sentences they had
# been trained on.
training = treebank.tagged_sents()[:3000]
testing = treebank.tagged_sents()[3000:]

bigramtag = BigramTagger(training)
print(bigramtag.evaluate(testing))

trigramtag = TrigramTagger(training)
print(trigramtag.evaluate(testing))
>>>brown_tagged_sents = brown.tagged_sents(categories='news') >>>default_tagger = nltk.DefaultTagger('NN') >>>print default_tagger.evaluate(brown_tagged_sents) # N-gram taggers >>>from nltk.tag import UnigramTagger >>>from nltk.tag import DefaultTagger >>>from nltk.tag import BigramTagger >>>from nltk.tag import TrigramTagger # we are dividing the data into a test and train to evaluate our taggers. >>>train_data= brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)] >>>test_data= brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):] >>>unigram_tagger = UnigramTagger(train_data,backoff=default_tagger) >>>print unigram_tagger.evaluate(test_data) >>>bigram_tagger= BigramTagger(train_data, backoff=unigram_tagger) >>>print bigram_tagger.evaluate(test_data) >>>trigram_tagger=TrigramTagger(train_data,backoff=bigram_tagger) >>>print trigram_tagger.evaluate(test_data) # Regex tagger >>>from nltk.tag.sequential import RegexpTagger >>>regexp_tagger = RegexpTagger( [( r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers ( r'(The|the|A|a|An|an)$', 'AT'), # articles ( r'.*able$', 'JJ'), # adjectives ( r'.*ness$', 'NN'), # nouns formed from adj ( r'.*ly$', 'RB'), # adverbs ( r'.*s$', 'NNS'), # plural nouns ( r'.*ing$', 'VBG'), # gerunds
(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'.*', 'NN') # nouns (default) ... ] rt = RegexpTagger(patterns) print rt.evaluate(test_data) print rt.tag(tokens) ## N gram taggers from nltk.tag import UnigramTagger from nltk.tag import BigramTagger from nltk.tag import TrigramTagger ut = UnigramTagger(train_data) bt = BigramTagger(train_data) tt = TrigramTagger(train_data) print ut.evaluate(test_data) print ut.tag(tokens) print bt.evaluate(test_data) print bt.tag(tokens) print tt.evaluate(test_data) print tt.tag(tokens) def combined_tagger(train_data, taggers, backoff=None): for tagger in taggers: backoff = tagger(train_data, backoff=backoff) return backoff
# Train a bigram tagger on treebank, tag the first sentence and score it.
import nltk
from nltk.tag import BigramTagger
from nltk.corpus import treebank

# Train and test on disjoint slices. The slices used to be [:7000] and
# [2000:], which overlap, so the tagger was scored on sentences it had been
# trained on.
training_1 = treebank.tagged_sents()[:3000]
bigramtagger = BigramTagger(training_1)

# Show the first (untagged) treebank sentence and the tagger's output for it.
print(treebank.sents()[0])
print(bigramtagger.tag(treebank.sents()[0]))

testing_1 = treebank.tagged_sents()[3000:]
print(bigramtagger.evaluate(testing_1))
'''import replacer from replacer import RegexpReplacer from replacer import RepeatReplacer''' import linecache import matplotlib.pyplot as plt ''' Train Tagger ''' from nltk.tag import DefaultTagger from nltk.tag import UnigramTagger from nltk.tag import BigramTagger from nltk.corpus import treebank train = treebank.tagged_sents()[:10000] t0 = DefaultTagger('NN') t1 = UnigramTagger(train, backoff=t0) t2 = BigramTagger(train, backoff=t1) ''' Initialize ''' my_corp = web.sents(fileids='firefox.txt') sent_count = 0 ques_count = 0 All_count = 1 NN_count = 0 NNS_count = 0 NNP_count = 0 VB_count = 0 VBN_count = 0 VBG_count = 0 VBD_count = 0 VBZ_count = 0
def _cv_chunks(items, size):
    """Yield successive ``size``-sized slices of ``items``."""
    for start in range(0, len(items), size):
        yield items[start:start + size]


def _cv_backoff_tagger(train_sents):
    """Train a trigram tagger backing off to bigram and then unigram."""
    tagger1 = UnigramTagger(train_sents)
    tagger2 = BigramTagger(train_sents, backoff=tagger1)
    return TrigramTagger(train_sents, backoff=tagger2)


def _cv_tnt_tagger(train_sents):
    """Train a TnT tagger on ``train_sents``."""
    tnt_tagger = tnt.TnT()
    tnt_tagger.train(train_sents)
    return tnt_tagger


def cltk_pos_cv(full_training_set, local_dir_rel):
    """Cross-validate several POS taggers on a tagged corpus file.

    The file is split on blank lines into sentences, shuffled with a fixed
    seed and cut into parts of ``ceil(n / 10)`` sentences. Each part in turn
    is the test set while the others form the training set; both are written
    to ``train.pos`` / ``test.pos`` in ``local_dir_rel`` and read back through
    ``TaggedCorpusReader``. Unigram, bigram, trigram, 1-2-3-gram backoff and
    TnT taggers are trained and evaluated per fold, printing each score.

    Args:
        full_training_set: Path of the tagged corpus file.
        local_dir_rel: Directory for the per-fold files (``~`` is expanded;
            created if missing).

    Returns:
        dict mapping ``'unigram'``, ``'bigram'``, ``'trigram'``,
        ``'1, 2, 3-gram backoff'`` and ``'tnt'`` to
        ``{'mean': ..., 'sd': ...}`` over the folds.
    """
    print("full_training_set", full_training_set)

    # (result key, label printed per fold, function training the tagger)
    experiments = [
        ('unigram', 'Unigram:', lambda sents: UnigramTagger(sents)),
        ('bigram', 'Bigram:', lambda sents: BigramTagger(sents)),
        ('trigram', 'Trigram:', lambda sents: TrigramTagger(sents)),
        ('1, 2, 3-gram backoff', '1, 2, 3-gram backoff:', _cv_backoff_tagger),
        ('tnt', 'TnT:', _cv_tnt_tagger),
    ]
    accuracies = {key: [] for key, _, _ in experiments}

    # Explicit UTF-8 so the text is read and written with the same encoding
    # regardless of the platform's locale default.
    with open(full_training_set, encoding='utf-8') as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # one entry per sentence
    tenth = math.ceil(len(pos_set) / 10)

    # Fixed seed keeps the fold assignment reproducible between runs.
    random.seed(0)
    random.shuffle(pos_set)

    ten_parts = list(_cv_chunks(pos_set, tenth))

    # The fold files always go to the same directory; create it once.
    local_dir = os.path.expanduser(local_dir_rel)
    if not os.path.isdir(local_dir):
        os.makedirs(local_dir)
    test_path = os.path.join(local_dir, 'test.pos')
    train_path = os.path.join(local_dir, 'train.pos')

    for counter, test_set in enumerate(ten_parts):
        # Everything except this fold's part, flattened into one list.
        training_set = [
            sentence
            for index, part in enumerate(ten_parts)
            if index != counter
            for sentence in part
        ]

        # Save the fold to files, as the NLTK corpus reader expects.
        with open(test_path, 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(test_set))
        with open(train_path, 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(training_set))

        # Read the POS corpora back.
        print("local_dir", local_dir)
        train_sents = TaggedCorpusReader(local_dir, 'train.pos').tagged_sents()
        test_sents = TaggedCorpusReader(local_dir, 'test.pos').tagged_sents()

        print('Loop #' + str(counter))

        for key, label, build_tagger in experiments:
            accuracy = build_tagger(train_sents).evaluate(test_sents)
            accuracies[key].append(accuracy)
            print(label, accuracy)

    return {
        key: {'mean': mean(scores), 'sd': stdev(scores)}
        for key, scores in accuracies.items()
    }