def demo(): """ Non-interactive demonstration of the clusterers with simple 2-D data. """ # use a set of tokens with 2D indices tokens = [Token(FEATURES=Numeric.array([3, 3])), Token(FEATURES=Numeric.array([1, 2])), Token(FEATURES=Numeric.array([4, 2])), Token(FEATURES=Numeric.array([4, 0])), Token(FEATURES=Numeric.array([2, 3])), Token(FEATURES=Numeric.array([3, 1]))] # test k-means using the euclidean distance metric, 2 means and repeat # clustering 10 times with random seeds clusterer = KMeansClusterer(2, euclidean_distance, repeats=10) clusterer.cluster(tokens, True) print 'using clusterer', clusterer print 'clustered', str(tokens)[:60], '...' # classify a new token token = Token(FEATURES=Numeric.array([3, 3])) print 'classify(%s)' % token, clusterer.classify(token) print token # test the GAAC clusterer with 4 clusters clusterer = GroupAverageAgglomerativeClusterer(4) print 'using clusterer', clusterer clusterer.cluster(tokens, True) #print 'clustered', str(tokens)[:60], '...' print 'clustered', tokens # show the dendogram clusterer.dendogram().show() # classify a new token token = Token(FEATURES=Numeric.array([3, 3])) print 'classify(%s)' % token, clusterer.classify(token) print token print # test the EM clusterer with means given by k-means (2) and # dimensionality reduction clusterer = KMeansClusterer(2, euclidean_distance, svd_dimensions=1) clusterer.cluster(tokens) means = clusterer.means() clusterer = ExpectationMaximizationClusterer(means, svd_dimensions=1) clusterer.cluster(tokens, True) print 'using clusterer', clusterer print 'clustered', str(tokens)[:60], '...' 
# classify a new token token = Token(FEATURES=Numeric.array([3, 3])) print 'classify(%s)' % token, clusterer.classify(token) print token # show the classification probabilities token = Token(FEATURES=Numeric.array([2.2, 2])) print 'classification_probdist(%s)' % token clusterer.classification_probdist(token) for sample in token['CLUSTER_PROBDIST'].samples(): print '%s => %.0f%%' % (sample, token['CLUSTER_PROBDIST'].prob(sample) *100)
def demo_em(): # example from figure 14.10, page 519, Manning and Schutze tokens = [ Token(FEATURES=Numeric.array(f)) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]] ] means = [[4, 2], [4, 2.01]] clusterer = ExpectationMaximizationClusterer(means, bias=0.1) clusterer.cluster(tokens, True, trace=True) print 'clustered', tokens for c in range(2): print 'cluster %d' % c print 'prior', clusterer._priors[c] print 'mean ', clusterer._means[c] print 'covar', clusterer._covariance_matrices[c] # classify a new token token = Token(FEATURES=Numeric.array([2, 2])) print 'classify(%s)' % token, clusterer.classify(token) print token # show the classification probabilities token = Token(FEATURES=Numeric.array([2, 2])) print 'classification_probdist(%s)' % token clusterer.classification_probdist(token) for sample in token['CLUSTER_PROBDIST'].samples(): print '%s => %.0f%%' % (sample, token['CLUSTER_PROBDIST'].prob(sample) * 100)
def raw_tag(self, words):
    """
    Tag a plain sequence of words and return only the resulting tags.

    Every word is wrapped in a token under this object's C{TEXT}
    property, the word tokens are stored under C{SUBTOKENS} of a
    container token, and C{self.tag} is applied to that container.

    @param words: The words to tag.
    @return: The C{TAG} property of each subtoken of the container,
        in order.
    @rtype: C{list}
    """
    subtokens_key = self.property('SUBTOKENS')
    text_key = self.property('TEXT')
    tag_key = self.property('TAG')

    word_tokens = [Token({text_key: word}) for word in words]
    container = Token({subtokens_key: word_tokens})
    self.tag(container)

    # read the subtokens back from the container after tagging
    tags = []
    for word_token in container[subtokens_key]:
        tags.append(word_token[tag_key])
    return tags
def __init__(self, classifier, labeled_tokens):
    """
    Build a confusion matrix by running C{classifier} over the text of
    every token in C{labeled_tokens} and counting (true label, assigned
    label) pairs.

    Entry conf[i][j] is the number of times a document with label i
    was given label j.

    @param classifier: The classifier to evaluate.
    @type classifier: C{ClassifierI}
    @param labeled_tokens: The correctly labeled tokens to classify.
    @type labeled_tokens: C{list} of C{Token}
    @raise ImportError: If the C{Numeric} module cannot be imported.
    """
    assert _chktype(1, classifier, ClassifierI)
    assert _chktype(2, labeled_tokens, [Token], (Token, ))
    # Only a failed import means Numeric is missing; a bare except here
    # would also swallow KeyboardInterrupt and unrelated errors.
    try:
        import Numeric
    except ImportError:
        raise ImportError('ConfusionMatrix requires Numeric')

    # Extract the labels.
    ldict = {}
    for ltok in labeled_tokens:
        ldict[ltok.type().label()] = 1
    labels = ldict.keys()

    # Construct a label->index dictionary
    indices = {}
    for i in range(len(labels)):
        indices[labels[i]] = i

    # Rows are indexed by the true label, columns by the assigned label.
    confusion = Numeric.zeros((len(labels), len(labels)))
    for ltok in labeled_tokens:
        utok = Token(ltok.type().text(), ltok.loc())
        ctok = classifier.classify(utok)
        confusion[indices[ltok.type().label()],
                  indices[ctok.type().label()]] += 1

    self._labels = labels
    self._confusion = confusion
    # Largest single cell count, found by flattening the matrix.
    self._max_conf = max(Numeric.resize(confusion, (len(labels)**2, )))
def parseToken(self,text, interactive=0,trace=1,draw=0,print_parses=1,cumStats=None,chunker=None,trueTree=None): if chunker == None: chunker = self.period_chunker if self.stats: dirStats = parseStats() else: Parses = [] chunker.parse(text) for sent in text['TREE']: if not isinstance(sent,Tree): continue sentToken = Token(WORDS=sent.leaves(),SUBTOKENS=sent.leaves()) print 'parsing',sentToken if self.stats: parse_stats(self.Parsers, sentToken, dirStats, trace, trueTree) print_parse_summary(self.Parsers, dirStats, interactive, draw, print_parses) # Check for empty parse if (dirStats.parse_list == [] or not isinstance(dirStats.parse_list[-1][0],Tree)): chunker2 = None if chunker != self.period_chunker and sent.count('.')>1: chunker2 = self.period_chunker elif chunker != self.punct_chunker and sent.count('.')<=1: chunker2 = self.punct_chunker else: chunker2 = self.unigramTag(dirStats, sentToken) if chunker2: if __debug__: print 'No parse, retry',chunker2 dirStats = self.parseToken(sentToken,interactive,trace,draw,print_parses,dirStats,chunker2,trueTree) ###elif __debug__: print 'Good parse, no retry',dirStats.parse_list else: parse = self.Parsers[0].get_parse_list(sentToken) if parse: Parses.append(parse[0]) if self.stats: if cumStats: cumStats += dirStats.sum() return cumStats else: return dirStats.sum() else: return Parses
def log_likelihood(classifier, labeled_tokens):
    """
    Evaluate the log likelihood of the given list of labeled tokens
    for the given classifier model.  This nonpositive float gives an
    indication of how well the classifier models the data.  Values
    closer to zero indicate that it models it more accurately.

    The per-token log probabilities of the correct labels are summed
    and the sum is divided by the number of tokens, so an empty list
    raises C{ZeroDivisionError}.

    @rtype: C{float}
    @return: The log likelihood of C{labeled_tokens} for the given
        classifier model.
    @param labeled_tokens: The tokens whose log likelihood should be
        computed.
    @type labeled_tokens: C{list} of (C{Token} with type
        C{LabeledText})
    """
    assert _chktype(1, classifier, ClassifierI)
    assert _chktype(2, labeled_tokens, [Token], (Token, ))

    total = 0.0
    for labeled in labeled_tokens:
        unlabeled = Token(labeled.type().text(), labeled.loc())
        correct_label = labeled.type().label()
        probability = classifier.distribution_dictionary(unlabeled)[correct_label]
        if probability != 0:
            total += math.log(probability)
        else:
            # Use some approximation to infinity.  What this does
            # depends on your system's float implementation.
            total -= 1e1000

    return total / len(labeled_tokens)
def demo_pos(): from sys import stdout print 'Training HMM...' labelled_sequences, tag_set, num_features = load_pos() trainer = MultiOutputHMMTrainer(tag_set, [[] for x in range(num_features)]) hmm = trainer.train_supervised( Token(SUBTOKENS=labelled_sequences[100:]), estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins)) print 'Testing...' for super_token in labelled_sequences[:3]: print super_token print 'HMM >>>' print hmm.best_path(super_token.exclude('TAG')) print '-' * 60 count = correct = 0 for super_token in labelled_sequences[:100]: print '.', stdout.flush() pts = hmm.best_path(super_token.exclude('TAG')) for token, tag in zip(super_token['SUBTOKENS'], pts): count += 1 if tag == token['TAG']: correct += 1 print 'accuracy over first', count, 'tokens %.1f' % (100.0 * correct / count)
def classify(self, unlabeled_token):
    # inherit doco
    text = unlabeled_token.type()
    # first element of every assignment reported by the feature
    # detector list for this text
    assignments = self._fd_list.detect(text).assignments()
    fnums = [assignment[0] for assignment in assignments]
    # walk the tree from the root with those values; the leaf reached
    # supplies the label
    label = self._root.traverse(fnums).label()
    return Token(LabeledText(text, label), unlabeled_token.loc())
def demo(): # demonstrates HMM probability calculation # example taken from page 381, Huang et al symbols = ['up', 'down', 'unchanged'] states = ['bull', 'bear', 'static'] def pd(values, samples): d = {} for value, item in zip(values, samples): d[item] = value return DictionaryProbDist(d) def cpd(array, conditions, samples): d = {} for values, condition in zip(array, conditions): d[condition] = pd(values, samples) return DictionaryConditionalProbDist(d) A = array([[0.6, 0.2, 0.2], [0.5, 0.3, 0.2], [0.4, 0.1, 0.5]], Float64) A = cpd(A, states, states) B = array([[0.7, 0.1, 0.2], [0.1, 0.6, 0.3], [0.3, 0.3, 0.4]], Float64) B = cpd(B, states, symbols) pi = array([0.5, 0.2, 0.3], Float64) pi = pd(pi, states) model = HiddenMarkovModel(symbols=symbols, states=states, transitions=A, outputs=B, priors=pi) print 'Testing', model for test in [['up'] * 2, ['up'] * 5, ['up', 'down', 'up'], ['down'] * 5, ['unchanged'] * 5 + ['up']]: token = Token(SUBTOKENS=map(lambda t: Token(TEXT=t), test)) print 'Testing with state sequence', test print 'probability =', model.probability(token) print 'tagging = ', model.tag(token) print 'p(tagged) = ', model.probability(token) print
def random_sample(self, rng, length):
    """
    Randomly sample the HMM to generate a sentence of a given length. This
    samples the prior distribution then the observation distribution and
    transition distribution for each subsequent observation and state.
    This will mostly generate unintelligible garbage, but can provide some
    amusement.

    The generated tokens are keyed by the property names configured in
    C{self._properties} (falling back to C{SUBTOKENS}, C{TEXT} and
    C{TAG}).  The first state/observation pair is always sampled, so
    the result holds at least one subtoken even if C{length} is less
    than one.

    @return:        the randomly created state/observation sequence,
                    generated according to the HMM's probability
                    distributions. The SUBTOKENS have TEXT and TAG
                    properties containing the observation and state
                    respectively.
    @rtype:         Token
    @param rng:     random number generator
    @type rng:      Random (or any object with a random() method)
    @param length:  desired output length
    @type length:   int
    """
    assert chktype(2, length, types.IntType)

    # load the property names
    SUBTOKENS = self._properties.get('SUBTOKENS', 'SUBTOKENS')
    TEXT = self._properties.get('TEXT', 'TEXT')
    TAG = self._properties.get('TAG', 'TAG')

    # sample the starting state and symbol prob dists
    tokens = []
    state = self._sample_probdist(self._priors, rng.random(), self._states)
    symbol = self._sample_probdist(self._outputs[state],
                                   rng.random(), self._symbols)
    # Build tokens from a property dictionary so that the configured
    # names are honoured instead of the hard-coded keywords.
    tokens.append(Token({TEXT: symbol, TAG: state}))

    for i in range(1, length):
        # sample the state transition and symbol prob dists
        state = self._sample_probdist(self._transitions[state],
                                      rng.random(), self._states)
        symbol = self._sample_probdist(self._outputs[state],
                                       rng.random(), self._symbols)
        tokens.append(Token({TEXT: symbol, TAG: state}))

    return Token({SUBTOKENS: tokens})
def demo_kmeans(): # example from figure 14.9, page 517, Manning and Schutze tokens = [Token(FEATURES=Numeric.array(f)) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]] means = [[4, 3], [5, 5]] clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means) clusterer.cluster(tokens, True, trace=True) print 'clustered', tokens print 'means', clusterer.means()
def demo_pos_bw(): # demonstrates the Baum-Welch algorithm in POS tagging from nltk.set import MutableSet print 'Training HMM (supervised)...' labelled_sequences, tag_set = load_pos() symbol_set = MutableSet() for sequence in labelled_sequences: for token in sequence['SUBTOKENS']: symbol_set.insert(token['TEXT']) trainer = HiddenMarkovModelTrainer(tag_set, symbol_set.elements()) hmm = trainer.train_supervised( Token(SUBTOKENS=labelled_sequences[100:300]), estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins)) print 'Training (unsupervised)...' # it's rather slow - so only use 10 samples unlabelled = Token(SUBTOKENS=_untag(labelled_sequences[301:311])) hmm = trainer.train_unsupervised(unlabelled, model=hmm, max_iterations=5) test_pos(hmm, labelled_sequences[:100], True)
def demo_pos(): # demonstrates POS tagging using supervised training print 'Training HMM...' labelled_sequences, tag_set, symbols = load_pos() trainer = HiddenMarkovModelTrainer(tag_set, symbols) hmm = trainer.train_supervised( Token(SUBTOKENS=labelled_sequences[100:]), estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins)) print 'Testing...' test_pos(hmm, labelled_sequences[:100], True)
def randomtreetok(depth=0, left=0, bf=None):
    """
    Build a random tree token.

    Below depth 1 a node is always internal; deeper nodes become a leaf
    with probability 1/(8-depth), so a leaf is certain by depth 7.
    Leaves are C{Token}s spanning a random 1-5 unit range starting at
    C{left}; each child of an internal node starts where the previous
    child's location ends.

    @param depth: Depth of the node being generated.
    @type depth: C{int}
    @param left: Start offset for the leftmost leaf under this node.
    @type left: C{int}
    @param bf: Maximum number of children per internal node.  If
        C{None}, 1 or 2 is picked at random and used for the whole
        subtree.
    @type bf: C{int} or C{None}
    @return: A leaf C{Token} or a C{TreeToken} with random children.
    """
    if bf is None:
        bf = randint(1, 2)

    # Clamp the upper bound: randint raises ValueError on an empty range,
    # which happened when the function was called with depth > 7.
    if randint(0, max(0, 7 - depth)) == 0 and depth > 1:
        # 'width' rather than 'len', which would shadow the builtin
        width = randint(1, 5)
        return Token('L%d' % randint(0, 10), left, left + width)

    numchildren = randint(1, bf)
    children = []
    for x in range(numchildren):
        children.append(randomtreetok(depth + 1, left, bf))
        # next sibling begins where this one ended
        left = children[-1].loc().end()
    return TreeToken('Node %d' % randint(0, 10000), *children)
def load_pos():
    """
    Load the first five items of the Brown corpus as tagged sequences
    whose observations are feature tuples.

    Each subtoken's C{TEXT} is replaced by a tuple of (lower-cased word,
    last three characters, word length).  Its C{TAG} is cut down to the
    leading run of characters that are not C{-}, C{*} or C{+}, and is
    replaced by C{'*'} if the result is not in the tag set.  A sequence
    is closed each time a C{'.'} tag is seen; tokens after the final
    C{'.'} are not returned.

    @return: A tuple (sequences, tag set, number of features), where
        the number of features is 3.
    @rtype: C{tuple}
    """
    from nltk.corpus import brown
    from nltk.tagger import TaggedTokenizer

    tagged_tokens = [brown.tokenize(item) for item in brown.items()[:5]]

    tag_set = [
        "'", "''", '(', ')', '*', ',', '.', ':', '--', '``', 'abl', 'abn',
        'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz', 'beg', 'bem', 'ben',
        'ber', 'bez', 'cc', 'cd', 'cd$', 'cs', 'do', 'dod', 'doz', 'dt',
        'dt$', 'dti', 'dts', 'dtx', 'ex', 'fw', 'hv', 'hvd', 'hvg', 'hvn',
        'hvz', 'in', 'jj', 'jjr', 'jjs', 'jjt', 'md', 'nn', 'nn$', 'nns',
        'nns$', 'np', 'np$', 'nps', 'nps$', 'nr', 'nr$', 'od', 'pn', 'pn$',
        'pp$', 'ppl', 'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb',
        'rb$', 'rbr', 'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn',
        'vbz', 'wdt', 'wp$', 'wpo', 'wps', 'wql', 'wrb'
    ]

    start_re = re.compile(r'[^-*+]*')
    sequences = []
    sequence = []
    for token in tagged_tokens:
        # the multi-output allows us to treat each word as a
        # tuple of features
        for sub_token in token['SUBTOKENS']:
            sequence.append(sub_token)

            # store the observation as a tuple of features: the word in
            # lower case, its suffix of length 3, and its length
            word = sub_token['TEXT']
            sub_token['TEXT'] = (word.lower(), word[-3:], len(word))

            # cleanup the tag
            tag = start_re.match(sub_token['TAG']).group(0)
            if tag not in tag_set:
                tag = '*'
            sub_token['TAG'] = tag

            # split on the period tag
            if tag == '.':
                sequences.append(Token(SUBTOKENS=sequence))
                sequence = []

    return sequences, tag_set, 3
def _demo_stemmer(stemmer): # Tokenize a sample text. from nltk.tokenizer import WhitespaceTokenizer text = Token(TEXT='John was eating icecream') WhitespaceTokenizer().tokenize(text) # Use the stemmer to stem it. for word in text['SUBTOKENS']: stemmer.stem(word) # Print the results. print stemmer for word in text['SUBTOKENS']: print '%20s => %s' % (word['TEXT'], word['STEM']) print
def classify(self, unlabeled_token):
    # Inherit docs from ClassifierI
    assert _chktype(1, unlabeled_token, Token)
    text = unlabeled_token.type()

    # Label and likelihood of the best candidate so far.  A label only
    # wins with a strictly greater likelihood, so the result label stays
    # None if no label scores above zero.
    best_label = None
    best_likelihood = 0

    # Find the label that maximizes the non-normalized probability
    # fv_list_likelihoods.
    for label in self._labels:
        fv_list = self._fd_list.detect(LabeledText(text, label))
        likelihood = self.fv_list_likelihood(fv_list, label)
        if likelihood > best_likelihood:
            best_label = label
            best_likelihood = likelihood

    return Token(LabeledText(text, best_label), unlabeled_token.loc())
def tree2frame(Dirs, index = 0, parent = ''):
    """
    Convert a parse tree (or a single token) into a content frame.

    A tree with sub-trees is converted child by child: a child that
    yields both a frame and a proposition name is stored under that
    name, a child that yields only a frame is unified into the current
    frame or, if unification fails, stored under a C{SubFrame_<index>}
    key.  A tree without sub-trees, or a bare token, becomes a frame
    with C{TEXT}, C{MEAN} and C{INDEX} entries.

    @param Dirs: The tree or token to convert.
    @type Dirs: C{Tree} or C{Token}
    @param index: Running counter of visited nodes.
    @type index: C{int}
    @param parent: Capitalized node label of the enclosing tree, passed
        to C{extractSurfaceSemantics} for bare tokens.
    @type parent: C{str}
    @return: A tuple (frame, proposition name, index): the content
        frame representation of the surface semantics of the parse
        tree, the capitalized node label of C{Dirs} (C{None} if C{Dirs}
        is not a tree), and the updated counter.
    @rtype: C{tuple} of (C{SurfaceSemanticsStructure}, C{str} or
        C{None}, C{int})
    """
    Frame = SurfaceSemanticsStructure()
    if isinstance(Dirs,Tree):
        Prop = Dirs.node.capitalize()
        hasSubTree = True in [isinstance(child,Tree) for child in Dirs]
    else:
        Prop = None

    if isinstance(Dirs,Tree) and hasSubTree:
        for i,child in enumerate(Dirs):
            value,prop,index = tree2frame(child,index+1,Dirs.node.capitalize())
            if value and prop:
                # Account for children with the same names: try the bare
                # name, then name_<child position>, then keep counting
                # upwards.  Retrying one fixed alternative forever made
                # this loop hang when that alternative was taken too.
                key = prop
                suffix = i
                while Frame.has_key(key):
                    key = prop+'_'+str(suffix)
                    suffix += 1
                Frame[key] = value
            elif value:
                Frame1 = Frame.unify(value)
                if Frame1:
                    Frame = Frame1
                else:
                    # Unification failed: keep the child as a sub-frame.
                    # An extra counter is appended if the key is already
                    # in use, where the old loop never terminated.
                    key = 'SubFrame'+'_'+str(index)
                    duplicate = 1
                    while Frame.has_key(key):
                        key = 'SubFrame'+'_'+str(index)+'_'+str(duplicate)
                        duplicate += 1
                    Frame[key] = value
    elif ((isinstance(Dirs,Tree) and not hasSubTree and Dirs) or
          isinstance(Dirs,Token)):
        index += 1
        if isinstance(Dirs,Token):
            token = Dirs
        if isinstance(Dirs,Tree):
            # a flat tree is treated as one token with the joined text
            token = Token(TEXT=' '.join([child['TEXT'] for child in Dirs]))
            parent = Dirs.node.capitalize()
        Frame['TEXT'] = token['TEXT']
        Frame['MEAN'] = extractSurfaceSemantics(token,parent)
        Frame['INDEX']=index
    return Frame,Prop,index
def demo_bw():
    """
    Demo Baum Welch by generating some sequences from a small HMM and
    then performing unsupervised training on them (example taken from
    page 381, Huang et al).
    """
    import random

    symbols = ['up', 'down', 'unchanged']
    states = ['bull', 'bear', 'static']

    def pd(values, samples):
        # probability distribution mapping each sample to its value
        return DictionaryProbDist(dict(zip(samples, values)))

    def cpd(array, conditions, samples):
        # one distribution per condition, built from the matching row
        by_condition = {}
        for row, condition in zip(array, conditions):
            by_condition[condition] = pd(row, samples)
        return DictionaryConditionalProbDist(by_condition)

    # state transition probabilities
    A = cpd(array([[0.6, 0.2, 0.2], [0.5, 0.3, 0.2], [0.4, 0.1, 0.5]],
                  Float64),
            states, states)
    # output (emission) probabilities per state
    B = cpd(array([[0.7, 0.1, 0.2], [0.1, 0.6, 0.3], [0.3, 0.3, 0.4]],
                  Float64),
            states, symbols)
    # initial state probabilities
    pi = pd(array([0.5, 0.2, 0.3], Float64), states)

    model = HiddenMarkovModel(symbols=symbols, states=states,
                              transitions=A, outputs=B, priors=pi)

    # generate some random sequences
    rng = random.Random()
    samples = [model.random_sample(rng, 5) for i in range(10)]
    training = Token(SUBTOKENS=samples)

    # train on those examples, starting with the model that generated them
    trainer = HiddenMarkovModelTrainer(states, symbols)
    hmm = trainer.train_unsupervised(training, model=model,
                                     max_iterations=1000)
def _get_toks(file='ca01', debug=0): """ Load tokens from the given file. """ assert _chktype(1, file, types.StringType) assert _chktype(2, debug, types.IntType) _resettime() if debug: print _timestamp(), 'tokenizing', file ttoks = brown.tokenize(file) labeled_tokens = [Token(LabeledText(tok.type().base().lower(), tok.type().tag()), tok.loc()) for tok in ttoks] if debug: print _timestamp(), ' done tokenizing' return labeled_tokens
def label_tokens(unlabeled_tokens, label):
    """
    Attach one label to every token in a list.

    @return: a list of labeled tokens, whose text and location
        correspond to C{unlabeled_tokens}, and whose labels are
        C{label}.
    @rtype: C{list} of (C{Token} with type C{LabeledText})
    @param unlabeled_tokens: The list of tokens for which a labeled
        token list should be created.
    @type unlabeled_tokens: C{list} of C{Token}
    @param label: The label for the new labeled tokens.
    @type label: (immutable)
    """
    assert _chktype(1, unlabeled_tokens, [Token], (Token, ))

    labeled = []
    for tok in unlabeled_tokens:
        # same text and location as the source token, plus the label
        labeled.append(Token(LabeledText(tok.type(), label), tok.loc()))
    return labeled
def stem(self, token):
    # inherit docs from StemmerI

    # TODO - when the new token comes out, use it to get the
    # part-of-speech, thus narrowing the search (and getting eg.
    # fly/verb for the query flies, rather than the plural noun).
    # This will only match the first POS from the list below...
    word = token.type()
    for pos in [NOUN, VERB, ADJECTIVE, ADVERB]:
        stemmed = morphy(word.lower(), pos)
        if not stemmed:
            continue

        # restore the case: walk the word and the stem in step (zip
        # stops at the shorter of the two) and upper-case each stem
        # character whose counterpart in the word is upper case
        restored = []
        for original_char, stem_char in zip(word, stemmed):
            if original_char.isupper():
                restored.append(stem_char.upper())
            else:
                restored.append(stem_char)
        return Token(''.join(restored), token.loc())

    # no part of speech produced a stem: hand back the token unchanged
    return token
def load_pos():
    """
    Load the first five items of the Brown corpus as tagged sequences.

    Each word's C{TEXT} is lower-cased.  Its C{TAG} is cut down to the
    leading run of characters that are not C{-}, C{*} or C{+}, and is
    replaced by C{'*'} if the result is not in the tag set.  A sequence
    is closed each time a C{'.'} tag is seen; words after the final
    C{'.'} are not returned.

    @return: A tuple (sequences, tag set, symbols), where symbols is
        the list of distinct lower-cased words encountered.
    @rtype: C{tuple}
    """
    from nltk.corpus import brown

    tagged_tokens = [brown.read(item) for item in brown.items()[:5]]

    tag_set = [
        "'", "''", '(', ')', '*', ',', '.', ':', '--', '``', 'abl', 'abn',
        'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz', 'beg', 'bem', 'ben',
        'ber', 'bez', 'cc', 'cd', 'cd$', 'cs', 'do', 'dod', 'doz', 'dt',
        'dt$', 'dti', 'dts', 'dtx', 'ex', 'fw', 'hv', 'hvd', 'hvg', 'hvn',
        'hvz', 'in', 'jj', 'jjr', 'jjs', 'jjt', 'md', 'nn', 'nn$', 'nns',
        'nns$', 'np', 'np$', 'nps', 'nps$', 'nr', 'nr$', 'od', 'pn', 'pn$',
        'pp$', 'ppl', 'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb',
        'rb$', 'rbr', 'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn',
        'vbz', 'wdt', 'wp$', 'wpo', 'wps', 'wql', 'wrb'
    ]

    start_re = re.compile(r'[^-*+]*')
    # used as a set: the keys are the distinct words seen
    symbols = {}
    sequences = []
    sequence = []
    for token in tagged_tokens:
        for sub_token in token['WORDS']:
            sequence.append(sub_token)

            # make words lower case
            word = sub_token['TEXT'].lower()
            sub_token['TEXT'] = word
            symbols[word] = 1

            # cleanup the tag
            tag = start_re.match(sub_token['TAG']).group(0)
            if tag not in tag_set:
                tag = '*'
            sub_token['TAG'] = tag

            # split on the period tag
            if tag == '.':
                sequences.append(Token(SUBTOKENS=sequence))
                sequence = []

    return sequences, tag_set, symbols.keys()
def demo(best_path, cache_factory): # demonstrates POS tagging using supervised training print 'Training HMM...' labelled_sequences, tag_set, symbols = load_pos() trainer = HiddenMarkovModelTrainer(tag_set, symbols) hmm = trainer.train_supervised( Token(SUBTOKENS=labelled_sequences[100:]), estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins)) print 'Creating cache', cache_factory cache = cache_factory(hmm) print 'Overriding best_path with', best_path hmm.__class__.best_path = lambda self, seq: best_path(self, seq, cache) print 'Testing...' import time start = time.clock() test_pos(hmm, labelled_sequences[:100], True) print 'elapsed time', (time.clock() - start)
def accuracy(classifier, labeled_tokens):
    """
    Measure how often the classifier reproduces the given labels.

    Each token is stripped of its label, classified, and the result is
    compared for equality with the original labeled token.  An empty
    list raises C{ZeroDivisionError}.

    @rtype: C{float}
    @return: the given classifier model's accuracy on the given list
        of labeled tokens.  This float between zero and one indicates
        what proportion of the tokens the model would label correctly.
    @param labeled_tokens: The tokens for which the model's
        accuracy should be computed.
    @type labeled_tokens: C{list} of (C{Token} with type
        C{LabeledText})
    """
    assert _chktype(1, classifier, ClassifierI)
    assert _chktype(2, labeled_tokens, [Token], (Token, ))

    # one entry per token: whether the classifier's output equals it
    outcomes = []
    for ltok in labeled_tokens:
        utok = Token(ltok.type().text(), ltok.loc())
        outcomes.append(classifier.classify(utok) == ltok)

    hits = [outcome for outcome in outcomes if outcome]
    return float(len(hits)) / len(outcomes)
def test(numFiles=100, max_rules=200, min_score=2, ruleFile="dump.rules", errorOutput="errors.out", ruleOutput="rules.out", randomize=False, train=.8, trace=3): NN_CD_tagger = RegexpTagger([(r'^[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')], TAG='POS') # train is the proportion of data used in training; the rest is reserved # for testing. print "Loading tagged data..." taggedData = getWSJTokens(numFiles, randomize) trainCutoff = int(len(taggedData) * train) trainingData = Token(SUBTOKENS=taggedData[0:trainCutoff]) goldData = Token(SUBTOKENS=taggedData[trainCutoff:]) testingData = goldData.exclude('POS') # Unigram tagger print "Training unigram tagger:", u = UnigramTagger(TAG='POS') u.train(trainingData) backoff = BackoffTagger([u, NN_CD_tagger], TAG='POS') print("[accuracy: %f]" % tagger_accuracy(backoff, [goldData])) # Brill tagger templates = [ SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 1)), SymmetricProximateTokensTemplate(ProximateTagsRule, (2, 2)), SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 2)), SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 3)), # SymmetricProximateTokensTemplate(ProximateWordsRule, (1,1)), # SymmetricProximateTokensTemplate(ProximateWordsRule, (2,2)), # SymmetricProximateTokensTemplate(ProximateWordsRule, (1,2)), # SymmetricProximateTokensTemplate(ProximateWordsRule, (1,3)), ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1, 1)), # ProximateTokensTemplate(ProximateWordsRule, (-1, -1), (1,1)), ] #trainer = FastBrillTaggerTrainer(backoff, templates, trace, TAG='POS') trainer = BrillTaggerTrainer(backoff, templates, trace, TAG='POS') b = trainer.train(trainingData, max_rules, min_score) print print("Brill accuracy: %f" % tagger_accuracy(b, [goldData])) print("\nRules: ") printRules = file(ruleOutput, 'w') for rule in b.rules(): print(str(rule)) printRules.write(str(rule) + "\n\n") #b.saveRules(ruleFile) b.tag(testingData) el = errorList(goldData, testingData) errorFile = file(errorOutput, 'w') for e in el: 
errorFile.write(e + "\n\n") errorFile.close() print("Done.") return b
def raw_stem(self, text):
    """
    Stem a bare text value and return the resulting stem.

    The text is wrapped in a token under this object's C{TEXT}
    property, C{self.stem} is applied to the token, and the token's
    C{STEM} property is returned.

    @param text: The text to stem.
    @return: The C{STEM} property of the token after stemming.
    """
    text_key = self.property('TEXT')
    stem_key = self.property('STEM')

    wrapper = Token({text_key: text})
    self.stem(wrapper)
    stemmed = wrapper[stem_key]
    return stemmed
def cross_validate(trainer, labeled_tokens, n_folds=10, target=None, trace=False): """ Perform N-fold cross validation on the given classifier. This divides the tokens into N equally sized groups (subject to rounding), then performs N training and testing passes. Each pass involves testing on a single fold and testing on the remaining folds. This way every instance is used exactly once for testing. The results (predictive accuracy) are averaged over the N trials. The mean and standard deviation are returned as a tuple. """ assert len(labeled_tokens) >= n_folds # should randomly reorder labeled_tokens first? folds = [] n = len(labeled_tokens) for i in range(n_folds): start = i * n / n_folds end = (i + 1) * n / n_folds folds.append(labeled_tokens[start:end]) if trace: print 'cross_validate - using %d folds of %d items each approx' \ % (n_folds, len(folds[0])) accuracies = [] precisions = [] recalls = [] for i in range(n_folds): training = folds[:] testing = training[i] del training[i] training = reduce(operator.add, training) # flatten if trace: print 'cross_validate [%d] - training classifier...' % (i + 1) import time start = time.time() classifier = trainer.train(training) if trace: end = time.time() print 'cross_validate elapsed time %.2f seconds' % (end - start) print 'cross_validate [%d] - testing classifier...' 
% (i + 1) start = end yes = no = 0 tp = tn = fp = fn = 0 for ltok in testing: utok = Token(ltok.type().text(), ltok.loc()) if trace >= 2: print 'cross_validate [%d] - given' % (i + 1), ltok ctok = classifier.classify(utok) if trace >= 2: print 'cross_validate [%d] - classified' % (i + 1), print ctok.type().label() if ltok.type().label() == ctok.type().label(): yes += 1 else: no += 1 if target: if ltok.type().label() == target: if ltok.type().label() == ctok.type().label(): tp += 1 else: fn += 1 else: if ltok.type().label() == ctok.type().label(): fp += 1 else: tn += 1 acc = float(yes) / (yes + no) accuracies.append(acc) if target: precision = recall = None try: recall = float(tp) / (tp + fn) recalls.append(recall) except ZeroDivisionError: pass try: precision = float(tp) / (tp + fp) precisions.append(precision) except ZeroDivisionError: pass if trace: end = time.time() print 'cross_validate elapsed time %.2f seconds' % (end - start) print 'cross_validate [%d] - accuracy %.3f' % (i + 1, acc) if target: print 'cross_validate [%d] - precision %s recall %s' \ % (i + 1, precision, recall) if trace: print 'cross_validate - calculating mean and variance' # find the mean mean = reduce(operator.add, accuracies) / float(len(accuracies)) if target: recall = reduce(operator.add, recalls) / float(len(recalls)) if len(precisions) > 0: precision = reduce(operator.add, precisions) / float( len(precisions)) else: precision = None # find the standard deviation var = 0 for i in range(n_folds): var += accuracies[i] * (accuracies[i] - mean)**2 sd = var**0.5 if target: return mean, sd, precision, recall else: return mean, sd
def test( numFiles=100, max_rules=200, min_score=2, ruleFile="dump.rules", errorOutput="errors.out", ruleOutput="rules.out", randomize=False, train=0.8, trace=3, ): NN_CD_tagger = RegexpTagger([(r"^[0-9]+(.[0-9]+)?$", "CD"), (r".*", "NN")], TAG="POS") # train is the proportion of data used in training; the rest is reserved # for testing. print "Loading tagged data..." taggedData = getWSJTokens(numFiles, randomize) trainCutoff = int(len(taggedData) * train) trainingData = Token(SUBTOKENS=taggedData[0:trainCutoff]) goldData = Token(SUBTOKENS=taggedData[trainCutoff:]) testingData = goldData.exclude("POS") # Unigram tagger print "Training unigram tagger:", u = UnigramTagger(TAG="POS") u.train(trainingData) backoff = BackoffTagger([u, NN_CD_tagger], TAG="POS") print ("[accuracy: %f]" % tagger_accuracy(backoff, [goldData])) # Brill tagger templates = [ SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 1)), SymmetricProximateTokensTemplate(ProximateTagsRule, (2, 2)), SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 2)), SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 3)), # SymmetricProximateTokensTemplate(ProximateWordsRule, (1,1)), # SymmetricProximateTokensTemplate(ProximateWordsRule, (2,2)), # SymmetricProximateTokensTemplate(ProximateWordsRule, (1,2)), # SymmetricProximateTokensTemplate(ProximateWordsRule, (1,3)), ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1, 1)), # ProximateTokensTemplate(ProximateWordsRule, (-1, -1), (1,1)), ] # trainer = FastBrillTaggerTrainer(backoff, templates, trace, TAG='POS') trainer = BrillTaggerTrainer(backoff, templates, trace, TAG="POS") b = trainer.train(trainingData, max_rules, min_score) print print ("Brill accuracy: %f" % tagger_accuracy(b, [goldData])) print ("\nRules: ") printRules = file(ruleOutput, "w") for rule in b.rules(): print (str(rule)) printRules.write(str(rule) + "\n\n") # b.saveRules(ruleFile) b.tag(testingData) el = errorList(goldData, testingData) errorFile = file(errorOutput, "w") for e 
in el: errorFile.write(e + "\n\n") errorFile.close() print ("Done.") return b
##////////////////////////////////////////////////////// ## Demo Code ##////////////////////////////////////////////////////// import random if __name__ == '__main__': def fill(cw): cw['fill'] = '#%06d' % random.randint(0,999999) cf = CanvasFrame(width=550, height=450, closeenough=2) tree = Tree.parse(''' (S (NP the very big cat) (VP (Adv sorta) (V saw) (NP (Det the) (N dog)))) ''', leafparser = lambda t: Token(TEXT=t)) tc = TreeWidget(cf.canvas(), tree, draggable=1, node_font=('helvetica', -14, 'bold'), leaf_font=('helvetica', -12, 'italic'), roof_fill='white', roof_color='black', leaf_color='green4', node_color='blue2') cf.add_widget(tc,10,10) def boxit(canvas, text): big = ('helvetica', -16, 'bold') return BoxWidget(canvas, TextWidget(canvas, text, font=big), fill='green') def ovalit(canvas, text): return OvalWidget(canvas, TextWidget(canvas, text), fill='cyan')