def demo_pos_supervised(): from nltk.corpus import brown from sys import stdout print 'Loading data from Brown corpus...' tagged_tokens = [] for item in brown.items()[:5]: tagged_tokens.append(brown.tokenize(item)) tag_set = ["'", "''", '(', ')', '*', ',', '.', ':', '--', '``', 'abl', 'abn', 'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz', 'beg', 'bem', 'ben', 'ber', 'bez', 'cc', 'cd', 'cd$', 'cs', 'do', 'dod', 'doz', 'dt', 'dt$', 'dti', 'dts', 'dtx', 'ex', 'fw', 'hv', 'hvd', 'hvg', 'hvn', 'hvz', 'in', 'jj', 'jjr', 'jjs', 'jjt', 'md', 'nn', 'nn$', 'nns', 'nns$', 'np', 'np$', 'nps', 'nps$', 'nr', 'nr$', 'od', 'pn', 'pn$', 'pp$', 'ppl', 'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb', 'rb$', 'rbr', 'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbz', 'wdt', 'wp$', 'wpo', 'wps', 'wql', 'wrb'] annul_nonmatching_tags(tagged_tokens, tag_set, '*') words, word_set, tags, tag_set = _split_tagged_tokens(tagged_tokens) word_set.sort() tag_set.sort() print 'output alphabet', `word_set`[:50], '...' print 'state labels ', `tag_set`[:50], '...' print tag_set print 'Training HMM...' #print 'training data:' #print zip(words[1:], tags[1:]) trainer = HMMTrainer(tag_set, word_set) hmm = trainer.train_supervised(words[100:], tags[100:], lambda fd, bins: LidstoneProbDist(fd, 0.1, bins)) print hmm print 'Testing...' for ws, ts in zip(words[:3], tags[:3]): print ws print 'HMM >>>' print hmm.best_path(ws) print 'CORRECT >>>' print ts print '-' * 60 count = correct = 0 for ws, ts in zip(words[:100], tags[:100]): print '.', stdout.flush() pts = hmm.best_path(ws) for t, pt in zip(ts, pts): count += 1 if t == pt: correct += 1 print 'accuracy over first', count, 'tokens %.1f' % (100.0 * correct / count)
def load_pos():
    """Load the first five Brown corpus items as tagging sequences.

    Each sub-token's TEXT is replaced by a feature tuple of
    (lowercased word, 3-character suffix, word length), and its TAG is
    normalised against the known Brown tag set (unknown tags become
    '*').  Sequences are split at the sentence-final period tag '.'.

    Returns a (sequences, tag_set, num_features) triple, where
    num_features is 3.
    """
    from nltk.corpus import brown
    from nltk.tagger import TaggedTokenizer

    tagged_tokens = [brown.tokenize(item) for item in brown.items()[:5]]

    tag_set = [
        "'", "''", '(', ')', '*', ',', '.', ':', '--', '``', 'abl',
        'abn', 'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz', 'beg',
        'bem', 'ben', 'ber', 'bez', 'cc', 'cd', 'cd$', 'cs', 'do',
        'dod', 'doz', 'dt', 'dt$', 'dti', 'dts', 'dtx', 'ex', 'fw',
        'hv', 'hvd', 'hvg', 'hvn', 'hvz', 'in', 'jj', 'jjr', 'jjs',
        'jjt', 'md', 'nn', 'nn$', 'nns', 'nns$', 'np', 'np$', 'nps',
        'nps$', 'nr', 'nr$', 'od', 'pn', 'pn$', 'pp$', 'ppl', 'ppls',
        'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb', 'rb$', 'rbr', 'rbt',
        'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbz', 'wdt',
        'wp$', 'wpo', 'wps', 'wql', 'wrb']

    # Tags may carry trailing modifiers introduced by -, * or +;
    # keep only the leading portion.
    strip_tag = re.compile(r'[^-*+]*')

    sequences = []
    current = []
    for token in tagged_tokens:
        # the multi-output allows us to treat each word as a tuple of
        # features
        for sub_token in token['SUBTOKENS']:
            current.append(sub_token)
            text = sub_token['TEXT']
            # observation = (lowercased word, suffix of length 3,
            # word length)
            sub_token['TEXT'] = (text.lower(), text[-3:], len(text))
            # cleanup the tag, then map anything unknown to '*'
            tag = strip_tag.match(sub_token['TAG']).group(0)
            if tag not in tag_set:
                tag = '*'
            sub_token['TAG'] = tag
            # a period tag closes the current sequence
            if tag == '.':
                sequences.append(Token(SUBTOKENS=current))
                current = []
    return sequences, tag_set, 3
def _get_toks(file='ca01', debug=0): """ Load tokens from the given file. """ assert _chktype(1, file, types.StringType) assert _chktype(2, debug, types.IntType) _resettime() if debug: print _timestamp(), 'tokenizing', file ttoks = brown.tokenize(file) labeled_tokens = [Token(LabeledText(tok.type().base().lower(), tok.type().tag()), tok.loc()) for tok in ttoks] if debug: print _timestamp(), ' done tokenizing' return labeled_tokens
def demo_pos_supervised(): from nltk.corpus import brown from sys import stdout print 'Loading data from Brown corpus...' tagged_tokens = [] for item in brown.items()[:5]: tagged_tokens.append(brown.tokenize(item)) tag_set = [ "'", "''", '(', ')', '*', ',', '.', ':', '--', '``', 'abl', 'abn', 'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz', 'beg', 'bem', 'ben', 'ber', 'bez', 'cc', 'cd', 'cd$', 'cs', 'do', 'dod', 'doz', 'dt', 'dt$', 'dti', 'dts', 'dtx', 'ex', 'fw', 'hv', 'hvd', 'hvg', 'hvn', 'hvz', 'in', 'jj', 'jjr', 'jjs', 'jjt', 'md', 'nn', 'nn$', 'nns', 'nns$', 'np', 'np$', 'nps', 'nps$', 'nr', 'nr$', 'od', 'pn', 'pn$', 'pp$', 'ppl', 'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb', 'rb$', 'rbr', 'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbz', 'wdt', 'wp$', 'wpo', 'wps', 'wql', 'wrb' ] annul_nonmatching_tags(tagged_tokens, tag_set, '*') words, word_set, tags, tag_set = _split_tagged_tokens(tagged_tokens) word_set.sort() tag_set.sort() print 'output alphabet', ` word_set ` [:50], '...' print 'state labels ', ` tag_set ` [:50], '...' print tag_set print 'Training HMM...' #print 'training data:' #print zip(words[1:], tags[1:]) trainer = HMMTrainer(tag_set, word_set) hmm = trainer.train_supervised( words[100:], tags[100:], lambda fd, bins: LidstoneProbDist(fd, 0.1, bins)) print hmm print 'Testing...' for ws, ts in zip(words[:3], tags[:3]): print ws print 'HMM >>>' print hmm.best_path(ws) print 'CORRECT >>>' print ts print '-' * 60 count = correct = 0 for ws, ts in zip(words[:100], tags[:100]): print '.', stdout.flush() pts = hmm.best_path(ws) for t, pt in zip(ts, pts): count += 1 if t == pt: correct += 1 print 'accuracy over first', count, 'tokens %.1f' % (100.0 * correct / count)