def load_pos():
    import re
    from nltk_lite.corpora import brown
    from itertools import islice

    sentences = list(islice(brown.tagged(), 100))

    tag_set = ["'", "''", '(', ')', '*', ',', '.', ':', '--', '``', 'abl',
        'abn', 'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz', 'beg', 'bem',
        'ben', 'ber', 'bez', 'cc', 'cd', 'cd$', 'cs', 'do', 'dod', 'doz',
        'dt', 'dt$', 'dti', 'dts', 'dtx', 'ex', 'fw', 'hv', 'hvd', 'hvg',
        'hvn', 'hvz', 'in', 'jj', 'jjr', 'jjs', 'jjt', 'md', 'nn', 'nn$',
        'nns', 'nns$', 'np', 'np$', 'nps', 'nps$', 'nr', 'nr$', 'od', 'pn',
        'pn$', 'pp$', 'ppl', 'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp',
        'rb', 'rb$', 'rbr', 'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg',
        'vbn', 'vbz', 'wdt', 'wp$', 'wpo', 'wps', 'wql', 'wrb']

    sequences = []
    sequence = []
    symbols = set()            # was Set(); the built-in avoids the deprecated sets module
    start_re = re.compile(r'[^-*+]*')

    for sentence in sentences:
        for i in range(len(sentence)):
            word, tag = sentence[i]
            word = word.lower()        # normalize
            symbols.add(word)          # log this word
            m = start_re.match(tag)    # clean up the tag
            tag = m.group(0)
            if tag not in tag_set:
                tag = '*'
            sentence[i] = (word, tag)  # store cleaned-up tagged token

    return sentences, tag_set, list(symbols)
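
# Usage sketch: load_pos() packages exactly what a supervised HMM trainer
# needs (state set, symbol set, labelled sequences).  The trainer calls below
# follow NLTK's HiddenMarkovModelTrainer API; the nltk_lite.tag.hmm import
# path is an assumption.
def demo_hmm():
    from nltk_lite.tag import hmm
    sentences, tag_set, symbols = load_pos()
    trainer = hmm.HiddenMarkovModelTrainer(tag_set, symbols)
    tagger = trainer.train_supervised(sentences[:90])  # hold out 10 sentences
    print tagger.tag([word for word, tag in sentences[95]])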
def demo():
    from nltk_lite.corpora import brown
    from itertools import islice
    from pprint import pprint

    pprint(list(islice(brown.raw('a'), 0, 5)))
    pprint(list(islice(brown.tagged('a'), 0, 5)))
def demo():
    """
    A simple demonstration function for the C{Tagger} classes.  It
    constructs a backoff tagger from a trigram tagger, a bigram tagger,
    a unigram tagger and a default tagger.  It trains and tests the
    tagger using the Brown corpus.
    """
    from nltk_lite.corpora import brown
    import sys

    print 'Training taggers.'

    # Create a chain of backoff taggers, starting from a default tagger.
    t0 = Default('nn')

    # t1a = Affix(length=-3, minlength=5, backoff=t0)
    # t1b = Unigram(cutoff=2, backoff=t1a)
    t1 = Unigram(cutoff=1, backoff=t0)
    t2 = Bigram(cutoff=1, backoff=t1)
    t3 = Trigram(backoff=t2)

    t1.train(brown.tagged('a'), verbose=True)
    t2.train(brown.tagged('a'), verbose=True)
    t3.train(brown.tagged('a'), verbose=True)

    # Tokenize the testing files
    test_tokens = []
    num_words = 0

    # Run the taggers.  For t0, t1, and t2, back off to the default
    # tagger.  This is especially important for t1 and t2, which count on
    # having known tags as contexts; if they get a context containing
    # None, then they will generate an output of None, and so all
    # words will get tagged as None.
    print '='*75
    print 'Running the taggers on test data...'
    print '  Default (nn) tagger: ',
    sys.stdout.flush()
    _demo_tagger(t0, brown.tagged('b'))
    print '  Unigram tagger:      ',
    sys.stdout.flush()
    _demo_tagger(t1, list(brown.tagged('b'))[:1000])
    print '  Bigram tagger:       ',
    sys.stdout.flush()
    _demo_tagger(t2, list(brown.tagged('b'))[:1000])
    print '  Trigram tagger:      ',
    sys.stdout.flush()
    _demo_tagger(t3, list(brown.tagged('b'))[:1000])
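    # A hedged add-on: tag a toy sentence with the full chain.  Contexts the
    # Trigram and Bigram taggers cannot resolve fall through to the Unigram
    # and finally to the Default 'nn' tagger.  (Assumes the nltk_lite
    # convention that tag() consumes tokens and yields (word, tag) pairs.)
    sent = 'the quick brown fox jumped over the lazy dog'.split()
    print '  Sample output:', list(t3.tag(sent))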
def demo():
    """
    Demonstrates how to use IndexConcordance and Aggregator.
    """
    from nltk_lite.corpora import brown

    print "Reading Brown Corpus into memory..."
    corpus = list(brown.tagged(('a','j')))
    print "Generating index..."
    ic = IndexConcordance(corpus)
    print "Showing all occurrences of 'plasma' in the Brown Corpus..."
    ic.formatted(middleRegexp="^plasma/.*", verbose=True)

    print "Investigating the collocates of 'deal' and derivatives..."
    agg = Aggregator()
    agg.add(ic.raw(middleRegexp="^deal", leftContextLength=1,
                   rightContextLength=0, leftRegexp=r"^(\w|\s|/)*$"),
            "Brown Corpus 'deal' left collocates")
    agg.add(ic.raw(middleRegexp="^deal", leftContextLength=0,
                   rightContextLength=1, rightRegexp=r"^(\w|\s|/)*$"),
            "Brown Corpus 'deal' right collocates")
    agg.formatted(showFirstX=5, usePOS=False)
def demo():
    import tnt
    from nltk_lite.corpora import brown

    sents = list(brown.tagged())
    test = list(brown.raw())

    # create and train the tagger
    tagger = tnt.Tnt()
    tagger.train(sents[200:1000])

    # tag some data
    tagged_data = tagger.tagdata(test[100:120])

    # print results, pairing each tagged sentence with its gold standard
    for j in range(len(tagged_data)):
        s = tagged_data[j]
        t = sents[j + 100]
        for i in range(len(s)):
            print s[i], '--', t[i]
        print
def demo3():
    from nltk_lite import tag
    from nltk_lite.corpora import treebank
    from nltk_lite.corpora import brown
    import tnt

    d = list(treebank.tagged())
    e = list(brown.tagged())

    d = d[:1000]
    e = e[:1000]

    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    # 10-fold cross-validation: t is trained and tested on treebank folds,
    # s on brown folds.
    for i in range(10):
        t = tnt.Tnt(N=1000, C=False)
        s = tnt.Tnt(N=1000, C=False)

        dtest = d[(i * d10):((i + 1) * d10)]
        etest = e[(i * e10):((i + 1) * e10)]

        dtrain = d[:(i * d10)] + d[((i + 1) * d10):]
        etrain = e[:(i * e10)] + e[((i + 1) * e10):]

        t.train(dtrain)
        s.train(etrain)

        tacc = tag.accuracy(t, dtest)
        tp_un = float(t.unknown) / float(t.known + t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = tag.accuracy(s, etest)
        sp_un = float(s.unknown) / float(s.known + s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += (tacc / tp_kn)
        sknacc += (sacc / sp_kn)   # was tp_kn: normalize each corpus by its own known-word rate

        tallacc += tacc
        sallacc += sacc

        #print i+1, (tacc / tp_kn), i+1, (sacc / sp_kn), i+1, tacc, i+1, sacc

    # Labels follow the training data: t was trained on treebank, s on brown.
    print "treebank: acc over words known:", 10 * tknacc
    print "        : overall accuracy:", 10 * tallacc
    print "        : words known:", 10 * tknown
    print "brown   : acc over words known:", 10 * sknacc
    print "        : overall accuracy:", 10 * sallacc
    print "        : words known:", 10 * sknown
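
# The '10 *' scaling in the final prints is easy to misread: each accumulator
# holds a sum of ten per-fold fractions in [0, 1], so multiplying the sum by
# ten reports a percentage.  A quick check with made-up fold accuracies:
fold_accs = [0.85] * 10    # ten hypothetical per-fold accuracies
total = sum(fold_accs)     # what the tallacc-style accumulators hold: 8.5
print 10 * total           # 85.0 -- the mean accuracy expressed as a percent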
# Brown corpus section letters for each text category.  (Reconstructed head
# of the call: only the categories referenced below are included, mapped to
# the standard Brown section codes.)
textCategories = dict(pressReportage = 'a',
                      pressEditorial = 'b',
                      pressReviews = 'c',
                      skillsAndHobbies = 'e',
                      popularLore = 'f',
                      fictionGeneral = 'k',
                      fictionMystery = 'l',
                      fictionScience = 'm',
                      fictionAdventure = 'n',
                      fictionRomance = 'p',
                      humour = 'r')

# set corpus basedir
set_basedir('./topicalizer/corpora')

# create tokenizer
tokenizer = analyser.Tokenizer()

# train tagging model
model = tag.Bigram()
model.train(brown.tagged([textCategories['pressReportage'],
                          textCategories['pressEditorial'],
                          textCategories['pressReviews'],
                          textCategories['skillsAndHobbies'],
                          textCategories['popularLore']]))

# tag text
text = 'I want to buy a camera'
tokens = list(tokenizer.processWhitespacesWithoutStopWords(text, 1))
taggedTokens = list(model.tag(tokens))
print tokens
print taggedTokens

# get WordNet information for each noun
for taggedToken in taggedTokens:
    if taggedToken[1] == 'nn' or taggedToken[1] is None:
        # get synsets ('impl' is assumed to be a WordNet interface defined elsewhere)
        synsets = impl.lookupSynsetsByForm(taggedToken[0])
        # print gloss
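        # A hedged continuation of the loop above: print each sense's
        # definition, assuming a pywordnet-style Synset.gloss attribute on
        # what lookupSynsetsByForm returns.
        for synset in synsets:
            print taggedToken[0], '-', synset.gloss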
        # 'context_pattern' is built based on the context's size (self._n),
        # for example:
        #   self._n = 2  ->  r'^(.+?)$',             like 'tag1'
        #   self._n = 3  ->  r'^(.+?):(.+?)$',       like 'tag1:tag2'
        #   self._n = 4  ->  r'^(.+?):(.+?):(.+?)$', like 'tag1:tag2:tag3'
        context_pattern_str = r'^(.+?)%s$' % (r':(.+?)' * (self._n - 2))
        context_pattern = re.compile(context_pattern_str, re.UNICODE)

        for line in lines[1:]:
            m = re.match(pattern, line)
            context, text, tag = m.groups()

            c_m = re.match(context_pattern, context)
            key = (c_m.groups(), text)
            self._model[key] = tag

        handler.close()

# load train corpus
train_sents = list(islice(brown.tagged(), 500))

# create taggers
tagger = MarshalNgram(3)

#tagger.train(train_sents)
#tagger.marshal("ngram.test")

tagger.unmarshal("ngram.test")
print tagger._model
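
# To see what context_pattern recovers, a quick standalone check with a
# trigram-style key (n = 3, so contexts look like 'tag1:tag2'):
import re
n = 3
context_pattern = re.compile(r'^(.+?)%s$' % (r':(.+?)' * (n - 2)), re.UNICODE)
print context_pattern.match('nn:vb').groups()   # ('nn', 'vb')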