def log_likelihood(classifier, labeled_tokens):
    """
    Evaluate the average log likelihood of the given list of labeled
    tokens for the given classifier model.  This nonpositive float
    gives an indication of how well the classifier models the data.
    Values closer to zero indicate that it models it more accurately.

    @rtype: C{float}
    @return: The average log likelihood of C{labeled_tokens} for the
        given classifier model.
    @param labeled_tokens: The tokens whose log likelihood should be
        computed.
    @type labeled_tokens: C{list} of (C{Token} with type C{LabeledText})
    """
    assert _chktype(1, classifier, ClassifierI)
    assert _chktype(2, labeled_tokens, [Token], (Token,))
    likelihood = 0.0
    for ltok in labeled_tokens:
        utok = Token(ltok.type().text(), ltok.loc())
        label = ltok.type().label()
        dist = classifier.distribution_dictionary(utok)
        if dist[label] == 0:
            # Use some approximation to infinity.  What this does
            # depends on your system's float implementation.
            likelihood -= 1e1000
        else:
            likelihood += math.log(dist[label])
    return likelihood / len(labeled_tokens)
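# A minimal usage sketch (hypothetical names: `classifier` is any trained
# ClassifierI, and `test_toks` is a list of Tokens with LabeledText types):
#
#     avg_ll = log_likelihood(classifier, test_toks)
#     print 'average log likelihood: %.4f' % avg_ll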
def __init__(self, start, productions):
    """
    Create a new context-free grammar, from the given start state
    and set of C{CFGProduction}s.

    @param start: The start symbol
    @type start: L{Nonterminal}
    @param productions: The list of productions that defines the grammar
    @type productions: C{list} of L{CFGProduction}
    """
    assert _chktype(1, start, Nonterminal)
    assert _chktype(2, productions, (CFGProduction,), [CFGProduction])
    self._start = start
    self._productions = tuple(productions)
    # Index of lhs nonterminals to rules
    self._index = {}
    # Reverse index of rhs tokens to rules
    self._rindex = {}
    # List of productions that have some terminals in the rhs
    self._lexicon_grammar = []
    # List of productions that have no terminals in the rhs
    self._nt_grammar = []
    for n, production in enumerate(self._productions):
        self._index.setdefault(production.lhs(), [])
        self._index[production.lhs()].append(n)
        # True iff every rhs element is a nonterminal.
        all_nonterminals = 1
        for token in production.rhs():
            all_nonterminals = (all_nonterminals and
                                isinstance(token, Nonterminal))
            self._rindex.setdefault(token, [])
            self._rindex[token].append(n)
        if all_nonterminals:
            self._nt_grammar.append(n)
        else:
            self._lexicon_grammar.append(n)
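# A usage sketch of the indexing above (hypothetical symbols; assumes this
# constructor belongs to HashCFG, as the PCFG constructor below suggests,
# and that CFGProduction(lhs, rhs) takes a sequence rhs, as defined
# elsewhere in this module):
#
#     S, NP, VP = Nonterminal('S'), Nonterminal('NP'), Nonterminal('VP')
#     prods = [CFGProduction(S, [NP, VP]), CFGProduction(NP, ['dog'])]
#     grammar = HashCFG(S, prods)
#     # Production 0 has an all-nonterminal rhs, so its index lands in
#     # _nt_grammar; production 1 contains a terminal, so it lands in
#     # _lexicon_grammar.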
def __init__(self, classifier, labeled_tokens):
    """
    Entry conf[i][j] is the number of times a document with label i
    was given label j.
    """
    assert _chktype(1, classifier, ClassifierI)
    assert _chktype(2, labeled_tokens, [Token], (Token,))
    try:
        import numpy.oldnumeric as Numeric
    except ImportError:
        raise ImportError("ConfusionMatrix requires Numeric")

    # Extract the labels.
    ldict = {}
    for ltok in labeled_tokens:
        ldict[ltok.type().label()] = 1
    labels = ldict.keys()

    # Construct a label->index dictionary
    indices = {}
    for i in range(len(labels)):
        indices[labels[i]] = i

    confusion = Numeric.zeros((len(labels), len(labels)))
    for ltok in labeled_tokens:
        utok = Token(ltok.type().text(), ltok.loc())
        ctok = classifier.classify(utok)
        confusion[indices[ltok.type().label()],
                  indices[ctok.type().label()]] += 1

    self._labels = labels
    self._confusion = confusion
    self._max_conf = max(Numeric.resize(confusion, (len(labels)**2,)))
def __init__(self, start, productions):
    """
    Create a new probabilistic context-free grammar, from the given
    start state and set of C{PCFGProduction}s.

    @param start: The start symbol
    @type start: L{Nonterminal}
    @param productions: The list of productions that defines the grammar
    @type productions: C{list} of C{PCFGProduction}
    @raise ValueError: if, for any left-hand side, the probabilities
        of its productions do not sum to a value within PCFG.EPSILON
        of 1.
    """
    assert _chktype(1, start, Nonterminal)
    assert _chktype(2, productions, (PCFGProduction,), [PCFGProduction])
    HashCFG.__init__(self, start, productions)

    # Make sure that the probabilities sum to one.
    probs = {}
    for production in productions:
        probs[production.lhs()] = (probs.get(production.lhs(), 0) +
                                   production.prob())
    for (lhs, p) in probs.items():
        if not ((1-PCFG.EPSILON) < p < (1+PCFG.EPSILON)):
            raise ValueError("Productions for %r do not sum to 1" % lhs)

    # Sort each lhs's productions by decreasing probability.
    for lhs in self._index:
        self._index[lhs].sort(lambda x, y: cmp(self._productions[y].prob(),
                                               self._productions[x].prob()))
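# A usage sketch of the probability check (the PCFGProduction constructor
# signature is hypothetical; only the prob() accessor is assumed from the
# code above):
#
#     prods = [PCFGProduction(S, [NP, VP], prob=0.7),
#              PCFGProduction(S, [VP], prob=0.3)]
#     pcfg = PCFG(S, prods)    # probabilities for S sum to 1.0: accepted
#     # PCFG(S, prods[:1]) would raise ValueError, since they sum to 0.7.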
def __init__(self, sub_fd_list, selected_ids):
    """
    Construct a new C{SelectedFDList}.  This feature detector list
    implements the features from C{sub_fd_list} that are indicated
    by C{selected_ids}.  In particular, the following is true for
    all feature ids M{0<=id<len(self)} and C{LabeledText}s M{lt}::

        self[id].detect(lt) = sub_fd_list[selected_ids[id]].detect(lt)

    @type sub_fd_list: C{FeatureDetectorListI}
    @param sub_fd_list: The C{FeatureDetectorList} that this
        C{SelectedFDList} is based on.
    @type selected_ids: C{sequence} of C{int}
    @param selected_ids: The feature ids for the feature detectors
        that should be included in the C{SelectedFDList}.  This
        list should not contain duplicate feature ids.
    """
    assert _chktype(1, sub_fd_list, FeatureDetectorListI)
    assert _chktype(2, selected_ids, [types.IntType], (types.IntType,))
    N = 0
    idmap = {}
    for id in selected_ids:
        if not idmap.has_key(id):
            idmap[id] = N
            N += 1
    self._N = N
    self._idmap = idmap
    self._sub_fd_list = sub_fd_list
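# A usage sketch (hypothetical base list `base`; keeps base features 3
# and 7 only):
#
#     sel = SelectedFDList(base, [3, 7])
#     # len(sel) == 2.  sel.detect(lt) reports base feature 3 under id 0
#     # and base feature 7 under id 1; all other features are dropped
#     # from the feature value list.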
def __init__(self, function, range):
    """
    Construct a new C{LabeledTextFunctionFDList}.  This feature
    detector list contains one feature detector for each value in
    C{range}.  When applied to a labeled text M{ltext}, the feature
    detector corresponding to the function value M{v} will return:

        - 1, if C{function(M{ltext})==M{v}}
        - 0, otherwise

    @type function: C{LabeledText} -> (immutable)
    @param function: The function on which this
        C{LabeledTextFunctionFDList} is based.
    @type range: C{list} of (immutable)
    @param range: The range of C{function}.
    """
    assert _chktype(1, function, types.FunctionType,
                    types.BuiltinFunctionType, types.ClassType)
    assert _chktype(2, range, [], ())
    self._func = function
    self._map = {}
    self._N = 0
    for elt in range:
        if not self._map.has_key(elt):
            self._map[elt] = self._N
            self._N += 1
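# A usage sketch (hypothetical function over labeled texts):
#
#     def first_word(ltext): return ltext.text()[0]
#     fd_list = LabeledTextFunctionFDList(first_word, ['the', 'a'])
#     # detect() turns on at most one of the two features: the one whose
#     # range value equals first_word(ltext), if any.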
def __init__(self, base_fd_list, texts, labels):
    """
    Construct a new C{MemoizedFDList}.  Pre-compute the
    C{FeatureValueList} for each C{LabeledText(M{t}, M{l})} where
    C{M{t}} is an element of C{texts} and C{M{l}} is an element of
    C{labels}.  These pre-computed C{FeatureValueList}s will be
    returned whenever C{detect} is called with the corresponding
    labeled text.

    @param base_fd_list: The base C{FeatureDetectorList}.  This
        C{MemoizedFDList} always returns the same
        C{FeatureValueList} that C{base_fd_list} would.
    @type base_fd_list: C{FeatureDetectorListI}
    @param texts: The list of texts for which C{FeatureValueList}s
        should be pre-computed.
    @type texts: C{sequence} of (immutable)
    @param labels: The list of labels for which C{FeatureValueList}s
        should be pre-computed.
    @type labels: C{sequence} of (immutable)
    """
    assert _chktype(1, base_fd_list, FeatureDetectorListI)
    assert _chktype(2, texts, (), [])
    assert _chktype(3, labels, (), [])
    self._cache = {}
    self._base_fd_list = base_fd_list
    for text in texts:
        for label in labels:
            ltext = LabeledText(text, label)
            self._cache[ltext] = base_fd_list.detect(ltext)
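# A usage sketch (hypothetical base list and training tokens):
#
#     texts = [tok.type().text() for tok in train_toks]
#     memo = MemoizedFDList(base_fd_list, texts, find_labels(train_toks))
#     # detect() on any (text, label) pair from the cross product is now
#     # a cache lookup; unseen labeled texts fall back to the base list
#     # (see the detect() method below).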
def __init__(self, start, productions):
    """
    Create a new context-free grammar, from the given start state
    and set of C{CFGProduction}s.

    @param start: The start symbol
    @type start: L{Nonterminal}
    @param productions: The list of productions that defines the grammar
    @type productions: C{list} of L{CFGProduction}
    """
    assert _chktype(1, start, Nonterminal)
    assert _chktype(2, productions, (CFGProduction,), [CFGProduction])
    self._start = start
    self._productions = tuple(productions)
def __init__(self, fd_list, **kwargs):
    """
    Construct a new classifier trainer, using the given feature
    detector list.

    @type fd_list: C{FeatureDetectorListI}
    @param fd_list: A feature detector list defining the features
        that are used by the C{NBClassifier}s generated by this
        C{NBClassifierTrainer}.
    @param kwargs: Keyword arguments.
        - C{labels}: The set of possible labels.  If none is given,
          then the set of all labels attested in the training data
          will be used instead.  (type=C{list} of (immutable)).
        - C{estimator}: The smoothing algorithm that should be
          applied to the probability estimates for feature value
          assignments.  Currently, the possible values are:
            - C{'ELE'}: The expected likelihood estimation.  This
              is currently the default value.
            - C{'MLE'}: The maximum likelihood estimation.  This
              does not apply any smoothing.
            - C{'Laplace'}: The Laplace estimation.
            - C{('Lidstone', lambda)}: The Lidstone estimation.
              Lambda is a parameter to that estimation; it is a
              positive float, typically between 0 and 1.
    """
    assert _chktype(1, fd_list, FeatureDetectorListI)
    self._fd_list = fd_list
    self._kwargs = kwargs
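# A usage sketch (hypothetical feature detector list and labeled training
# tokens; assumes the trainer exposes the usual ClassifierTrainerI
# train() method):
#
#     trainer = NBClassifierTrainer(fd_list, estimator=('Lidstone', 0.5))
#     classifier = trainer.train(labeled_tokens)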
def __init__(self, *sub_fd_lists):
    """
    Construct a new feature detector list, containing the features
    from each of the feature detector lists in C{sub_fd_lists}.

    If M{N[i]} is the length of the M{i}th feature detector list,
    then feature id M{j} in C{sub_fd_list[M{i}]} corresponds to
    feature id M{N[0]+N[1]+...+N[i-1]+j} in the merged feature
    detector list.

    @param sub_fd_lists: The feature detector lists to combine.
    @type sub_fd_lists: C{list} of C{FeatureDetectorListI}
    """
    assert _chktype('vararg', sub_fd_lists, (FeatureDetectorListI,))
    self._sub_fd_lists = []
    self._offsets = []
    offset = 0
    for sublist in sub_fd_lists:
        if isinstance(sublist, MergedFDList):
            # Create a single flat merged feature detector list,
            # rather than a tree of them.
            self._sub_fd_lists += sublist._sub_fd_lists
            self._offsets += [x+offset for x in sublist._offsets]
        else:
            self._sub_fd_lists.append(sublist)
            self._offsets.append(offset)
        offset += len(sublist)
    self._N = offset
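# A usage sketch of the offset arithmetic (hypothetical sub-lists of
# lengths 10 and 5):
#
#     merged = MergedFDList(fd_list_a, fd_list_b)
#     # len(merged) == 15, and feature j of fd_list_b appears as
#     # feature 10 + j of `merged`.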
def __add__(self, other):
    # Inherit docs from FeatureDetectorListI
    # n.b.: Slight circular dependency (since MergedFDList is
    # itself derived from AbstractFDList).
    assert _chktype(1, other, FeatureDetectorListI)
    return MergedFDList(self, other)
def select(self, fd_list):
    # Count the number of times each feature is attested.
    assert _chktype(1, fd_list, FeatureDetectorListI)
    attested = {}
    for labeled_token in self._training_data:
        text = labeled_token.type().text()
        for label in self._labels:
            fv_list = fd_list.detect(LabeledText(text, label))
            default = fv_list.default()
            for (id, val) in fv_list.assignments():
                if val != default:
                    attested[id] = attested.get(id, 0) + 1

    # Construct the list of selected ids.  This is easy if
    # min_count = 1.  Otherwise, loop through the entries of
    # attested.
    if self._min_count == 1:
        selected_ids = attested.keys()
    else:
        selected_ids = []
        for (id, count) in attested.items():
            if count >= self._min_count:
                selected_ids.append(id)

    # Return the selected feature detector list.
    return SelectedFDList(fd_list, selected_ids)
def detect(self, labeled_text):
    # Inherit docs
    assert _chktype(1, labeled_text, LabeledText)
    fv_list = self._cache.get(labeled_text, None)
    if fv_list is not None:
        return fv_list
    else:
        return self._base_fd_list.detect(labeled_text)
def distribution_dictionary(self, unlabeled_token):
    # Inherit docs from ClassifierI
    assert _chktype(1, unlabeled_token, Token)
    dist_dict = {}
    dist_list = self.distribution_list(unlabeled_token)
    for labelnum in range(len(self._labels)):
        dist_dict[self._labels[labelnum]] = dist_list[labelnum]
    return dist_dict
def __getitem__(self, feature_id):
    # Inherit docs from FeatureDetectorListI
    assert _chktype(1, feature_id, types.IntType)
    if feature_id >= len(self) or feature_id < 0:
        raise IndexError('FeatureDetectorList index out of range')
    def f(labeled_text, detect=self.detect, id=feature_id):
        return detect(labeled_text)[id]
    return FunctionFeatureDetector(f, ('f_%d()' % feature_id))
def detect(self, labeled_text):
    # Inherit docs from FeatureDetectorListI
    assert _chktype(1, labeled_text, LabeledText)
    fid = self._map.get(self._func(labeled_text), None)
    if fid is None:
        return EmptyFeatureValueList(self._N)
    else:
        return SimpleFeatureValueList(((fid, 1),), self._N)
def __getitem__(self, feature_id):
    # Inherit docs from FeatureValueListI
    assert _chktype(1, feature_id, types.IntType)
    if feature_id >= self._len:
        raise IndexError('FeatureValueList index out of range')
    for (id, val) in self._assignments:
        if id == feature_id:
            return val
    return self._default
def _get_toks(file='ca01', debug=0):
    """
    Load tokens from the given file.
    """
    assert _chktype(1, file, types.StringType)
    assert _chktype(2, debug, types.IntType)
    _resettime()
    if debug:
        print _timestamp(), 'tokenizing', file
    ttoks = brown.tokenize(file)
    labeled_tokens = [Token(LabeledText(tok.type().base().lower(),
                                        tok.type().tag()),
                            tok.loc())
                      for tok in ttoks]
    if debug:
        print _timestamp(), ' done tokenizing'
    return labeled_tokens
def __init__(self, function, name=None):
    """
    Construct a new C{FunctionFeatureDetector} from the given
    function.

    @param function: The function that this feature detector is
        based on.  When this feature detector is applied to a
        labeled text M{lt}, it will return M{C{func}(lt)}.
    @type function: C{LabeledText} -> (any)
    @param name: A name for the function used by this feature
        detector.  This name is used in the string representation
        of the feature detector.
    @type name: C{string} or C{None}
    """
    assert _chktype(1, function, types.FunctionType,
                    types.BuiltinFunctionType, types.ClassType)
    assert _chktype(2, name, types.NoneType, types.StringType)
    self._name = name
    self._func = function
def detect(self, labeled_text):
    # Inherit docs from AbstractFDList
    assert _chktype(1, labeled_text, LabeledText)
    fv_list = self._sub_fd_list.detect(labeled_text)
    assignments = [(self._idmap.get(id), val)
                   for (id, val) in fv_list.assignments()
                   if self._idmap.has_key(id)]
    return SimpleFeatureValueList(assignments, self._N,
                                  fv_list.default())
def __init__(self, grammar, trace=0, **property_names):
    """
    Create a new C{BottomUpPCFGChartParser}, that uses C{grammar}
    to parse texts.

    @type grammar: C{PCFG}
    @param grammar: The grammar used to parse texts.
    @type trace: C{int}
    @param trace: The level of tracing that should be used when
        parsing a text.  C{0} will generate no tracing output; and
        higher numbers will produce more verbose tracing output.
    """
    assert _chktype(1, grammar, PCFG)
    assert _chktype(2, trace, types.IntType)
    self._grammar = grammar
    self._trace = trace
    AbstractParser.__init__(self, **property_names)
def __init__(self, words, labels):
    """
    Construct a new C{BagOfWordsFDList}.  This feature detector
    list contains one feature detector for each M{(word, label)}
    pair, where M{word} is an element of C{words}, and M{label} is
    an element of C{labels}.  When the feature detector
    corresponding to M{(word, label)} is applied to a labeled text
    M{ltext}, it will return:

        - 1, if C{M{word} in M{ltext}.text() and
          M{ltext}.label()==M{label}}
        - 0, otherwise

    @type words: C{list} of (immutable)
    @param words: The list of words to look for.
    @type labels: C{list} of (immutable)
    @param labels: The set of labels used by this
        C{BagOfWordsFDList}.
    """
    assert _chktype(1, words, (), [])
    assert _chktype(2, labels, (), [])
    if None in words:
        raise ValueError('BagOfWordsFDList can not be used if '
                         'words contains None')
    if None in labels:
        raise ValueError('BagOfWordsFDList can not be used if '
                         'labels contains None')
    self._wmap = {}
    self._num_values = 0
    for word in words:
        if not self._wmap.has_key(word):
            self._wmap[word] = self._num_values
            self._num_values += 1
    self._lmap = {}
    self._num_labels = 0
    for label in labels:
        if not self._lmap.has_key(label):
            self._lmap[label] = self._num_labels
            self._num_labels += 1
    self._N = self._num_values * self._num_labels
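# A usage sketch (tiny hypothetical vocabulary and label set):
#
#     fd_list = BagOfWordsFDList(['spam', 'ham'], ['pos', 'neg'])
#     # len(fd_list) == 4.  detect() turns on the (word, label) feature
#     # for each vocabulary word that occurs in the text, provided the
#     # text's label matches (see the detect() method below).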
def __init__(self, function, range, labels):
    """
    Construct a new C{TextFunctionFDList}.  This feature detector
    list contains one feature detector for each M{(v, l)} pair,
    where M{v} is a value in C{range}, and M{l} is an element of
    C{labels}.  When applied to a labeled text M{ltext}, the
    feature detector corresponding to the pair M{(v, l)} will
    return:

        - 1, if C{function(M{ltext}.text())==M{v} and
          M{ltext}.label()==M{l}}
        - 0, otherwise

    @type function: (immutable) -> (immutable)
    @param function: The function on which this
        C{TextFunctionFDList} is based.
    @type range: C{list} of (immutable)
    @param range: The range of C{function}.
    @type labels: C{list} of (immutable)
    @param labels: The set of labels used by this
        C{TextFunctionFDList}.
    """
    assert _chktype(1, function, types.FunctionType,
                    types.BuiltinFunctionType, types.ClassType)
    assert _chktype(2, range, [], ())
    assert _chktype(3, labels, (), [])
    self._func = function
    self._labels = labels
    self._vmap = {}
    self._num_values = 0
    for elt in range:
        if not self._vmap.has_key(elt):
            self._vmap[elt] = self._num_values
            self._num_values += 1
    self._lmap = {}
    self._num_labels = 0
    for elt in labels:
        if not self._lmap.has_key(elt):
            self._lmap[elt] = self._num_labels
            self._num_labels += 1
    self._N = self._num_values * self._num_labels
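# A usage sketch (hypothetical: uses the built-in len as the text
# function):
#
#     fd_list = TextFunctionFDList(len, range(1, 10), ['pos', 'neg'])
#     # Feature (v, l) fires iff len(text) == v and the label is l.  Its
#     # id is vmap[v] + lmap[l]*num_values, as computed in detect() below.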
def detect(self, labeled_text):
    # Inherit docs from FeatureDetectorListI
    assert _chktype(1, labeled_text, LabeledText)
    lnum = self._lmap.get(labeled_text.label(), None)
    vnum = self._vmap.get(self._func(labeled_text.text()), None)
    if (lnum is None) or (vnum is None):
        return EmptyFeatureValueList(self._N)
    else:
        fid = vnum + lnum*self._num_values
        return SimpleFeatureValueList(((fid, 1),), self._N)
def __init__(self, beam_size, grammar, trace=0, **property_names):
    """
    Create a new C{BottomUpPCFGChartParser}, that uses C{grammar}
    to parse texts.

    @type beam_size: C{int}
    @param beam_size: The maximum length for the parser's edge queue.
    @type grammar: C{PCFG}
    @param grammar: The grammar used to parse texts.
    @type trace: C{int}
    @param trace: The level of tracing that should be used when
        parsing a text.  C{0} will generate no tracing output; and
        higher numbers will produce more verbose tracing output.
    """
    assert _chktype(1, beam_size, types.IntType)
    assert _chktype(2, grammar, PCFG)
    assert _chktype(3, trace, types.IntType)
    BottomUpPCFGChartParser.__init__(self, grammar, trace,
                                     **property_names)
    self._beam_size = beam_size
def __init__(self, fd_list, labels):
    """
    Initialize the feature detector list and label list for this
    classifier.  This constructor should be called by subclasses,
    using the statement::

        AbstractFeatureClassifier.__init__(self, fd_list, labels)

    @type fd_list: C{FeatureDetectorListI}
    @param fd_list: The feature detector list defining the features
        that are used by the C{Classifier}.
    @type labels: C{list} of (immutable)
    @param labels: A list of the labels that should be considered
        by this C{Classifier}.  Typically, labels are C{string}s or
        C{int}s.
    """
    assert _chktype(1, fd_list, FeatureDetectorListI)
    assert _chktype(2, labels, (), [])
    self._fd_list = fd_list
    self._labels = labels
def fv_list_likelihood(self, fv_list, label):
    # Inherit docs from AbstractFeatureClassifier
    assert _chktype(1, fv_list, FeatureValueListI)
    # Sum log probabilities, then exponentiate at the end.
    p = self._label_probdist.logprob(label)
    for (fid, fval) in fv_list.assignments():
        p += self._fval_probdist[label].logprob((fid, fval))
    return math.exp(p)
def attested_classes(tokens, **property_names):
    """
    @return: A list of all classes that are attested in the given
        list of tokens.
    @rtype: C{list} of (immutable)
    @param tokens: The list of tokens from which to extract classes.
    @type tokens: C{list} of (C{Token} with type C{ClassedText})
    """
    CLASS = property_names.get('CLASS', 'CLASS')
    assert _chktype(1, tokens, [Token], (Token,))
    return list(sets.Set([token[CLASS] for token in tokens]))
def __div__(self, rhs):
    """
    @return: A new nonterminal whose symbol is C{M{A}/M{B}}, where
        C{M{A}} is the symbol for this nonterminal, and C{M{B}} is
        the symbol for rhs.
    @rtype: L{Nonterminal}
    @param rhs: The nonterminal used to form the right hand side
        of the new nonterminal.
    @type rhs: L{Nonterminal}
    """
    assert _chktype(1, rhs, Nonterminal)
    return Nonterminal('%s/%s' % (self._symbol, rhs._symbol))
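# A usage sketch:
#
#     Nonterminal('NP') / Nonterminal('PP')
#     # => a new Nonterminal whose symbol is 'NP/PP'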
def __init__(self, lhs, rhs):
    """
    Construct a new C{CFGProduction}.

    @param lhs: The left-hand side of the new C{CFGProduction}.
    @type lhs: L{Nonterminal}
    @param rhs: The right-hand side of the new C{CFGProduction}.
    @type rhs: sequence of (C{Nonterminal} and (terminal))
    """
    assert _chktype(1, lhs, Nonterminal)
    self._lhs = lhs
    self._rhs = tuple(rhs)
def __init__(self, grammar, trace=0, guessUnknown=0, spellchecker=None,
             **property_names):
    """
    Create a new C{ViterbiPCFGParser}, that uses C{grammar} to
    parse texts.

    @type grammar: C{PCFG}
    @param grammar: The grammar used to parse texts.
    @type trace: C{int}
    @param trace: The level of tracing that should be used when
        parsing a text.  C{0} will generate no tracing output; and
        higher numbers will produce more verbose tracing output.
    """
    assert _chktype(1, grammar, PCFG)
    assert _chktype(2, trace, types.IntType)
    self._grammar = grammar
    self._trace = trace
    self._guessUnknown = guessUnknown
    self._guessKnown = 0
    self._spellchecker = spellchecker
    AbstractParser.__init__(self, **property_names)
def accuracy(classifier, labeled_tokens):
    """
    @rtype: C{float}
    @return: the given classifier model's accuracy on the given
        list of labeled tokens.  This float between zero and one
        indicates what proportion of the tokens the model would
        label correctly.

    @param labeled_tokens: The tokens for which the model's
        accuracy should be computed.
    @type labeled_tokens: C{list} of (C{Token} with type
        C{LabeledText})
    """
    assert _chktype(1, classifier, ClassifierI)
    assert _chktype(2, labeled_tokens, [Token], (Token,))
    total = 0
    correct = 0
    for ltok in labeled_tokens:
        utok = Token(ltok.type().text(), ltok.loc())
        if classifier.classify(utok) == ltok:
            correct += 1
        total += 1
    return float(correct) / total
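# A usage sketch (hypothetical trained classifier and held-out labeled
# tokens):
#
#     acc = accuracy(classifier, test_toks)
#     print 'accuracy: %.1f%%' % (100 * acc)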
def __init__(self, feature_detectors):
    """
    Construct a new C{SimpleFDList}.

    @param feature_detectors: The C{list} of C{FeatureDetector}s
        that make up the new C{SimpleFDList}.  The M{i}th element
        of this list is the feature detector for the feature with
        id M{i}.
    @type feature_detectors: C{sequence} of C{FeatureDetectorI}
    """
    assert _chktype(1, feature_detectors, [FeatureDetectorI],
                    (FeatureDetectorI,))
    self._feature_detectors = feature_detectors
def __init__(self, assignments, len, default=0):
    """
    Construct a new C{SimpleFeatureValueList}.

    @type assignments: C{list} of (C{tuple} of C{int} and (immutable))
    @param assignments: A list of the feature value assignments for
        each feature in this feature value list whose value is not
        the default value.  These assignments are specified as a
        list of C{(id, value)} pairs.
    @type len: C{int}
    @param len: The number of features whose values are specified
        by this feature value list.
    @type default: (immutable)
    @param default: The default value for this feature value list.
        If a feature's value is not specified by C{assignments},
        then that feature's value is the default value.
    """
    assert _chktype(1, assignments, [()], ((),))
    assert _chktype(2, len, types.IntType)
    self._assignments = assignments
    self._len = len
    self._default = default
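# A usage sketch: a 10-feature value list in which only features 2 and 5
# are on, with a default value of 0:
#
#     fv_list = SimpleFeatureValueList([(2, 1), (5, 1)], 10)
#     # fv_list[2] == 1; fv_list[0] == 0 (the default).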
def __init__(self, fd_list, labels, label_probdist, fval_probdist):
    """
    Construct a new Naive Bayes classifier model.  Typically, new
    classifier models are created by C{ClassifierTrainer}s.

    @type fd_list: C{FeatureDetectorListI}
    @param fd_list: The feature detector list defining the features
        that are used by the C{NBClassifier}.  This should be the
        same feature detector list that was used to construct the
        feature value lists that are the samples of
        C{fval_probdist}.
    @type labels: C{list} of (immutable)
    @param labels: A list of the labels that should be considered
        by this C{NBClassifier}.  Typically, labels are C{string}s
        or C{int}s.
    @type label_probdist: C{ProbDistI}
    @param label_probdist: A probability distribution that
        specifies the probability that a randomly chosen text will
        have each label.  In particular,
        C{label_probdist.prob(M{l})} is the probability that a text
        has label M{l}.
    @type fval_probdist: C{ConditionalProbDist}
    @param fval_probdist: A conditional probability distribution
        that specifies the probability of each feature value
        assignment, given a label.  In particular,
        C{fval_probdist[M{l}].prob((M{fid}, M{fval}))} is the
        probability that a text with label M{l} will assign feature
        value M{fval} to the feature whose id is M{fid}.
    """
    assert _chktype(1, fd_list, FeatureDetectorListI)
    assert _chktype(2, labels, [], ())
    assert _chktype(3, label_probdist, ProbDistI)
    assert _chktype(4, fval_probdist, ConditionalProbDist)
    self._label_probdist = label_probdist
    self._fval_probdist = fval_probdist
    AbstractFeatureClassifier.__init__(self, fd_list, labels)
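# A sketch of how the model's two distributions are consulted, following
# the fv_list_likelihood method elsewhere in this module (hypothetical
# label 'pos' and feature assignment (fid, fval)):
#
#     p = label_probdist.prob('pos')
#     p *= fval_probdist['pos'].prob((fid, fval))
#     # i.e. P(label) times a product of P(feature assignment | label),
#     # the usual naive Bayes factorization.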
def printDirs(Dirs, margin=70, indent=0):
    """
    @return: A pretty-printed string representation of this tree.
    @rtype: C{string}
    @param margin: The right margin at which to do line-wrapping.
    @type margin: C{int}
    @param indent: The indentation level at which printing begins.
        This number is used to decide how far to indent subsequent
        lines.
    @type indent: C{int}
    """
    assert _chktype(1, margin, types.IntType)
    assert _chktype(2, indent, types.IntType)
    if (isinstance(Dirs, str) or isinstance(Dirs, tuple) or
        (isinstance(Dirs, list) and not isinstance(Dirs, Tree))):
        return '\n%s%s\n' % (' '*indent, Dirs)
    rep = reprDirs(Dirs)
    if len(rep) + indent < margin:
        if indent:
            return rep
        else:
            return rep + '\n'
    if isinstance(Dirs, Token) and Dirs.has_key('TREE'):
        tree = Dirs['TREE']
    else:
        tree = Dirs
    s = ['[', tree.node]
    for child in tree:
        if isinstance(child, Tree):
            s.extend(['\n', ' '*(indent+2),
                      printDirs(child, margin, indent+2)])
        elif isinstance(child, Token):
            s.extend([' ', child['TEXT']])
        else:
            s.extend(['\n', ' '*indent, str(child)])
    s.append(']')
    if indent == 0:
        s.append('\n')
    return ''.join(s)
def find_labels(labeled_tokens):
    """
    @return: A list of all labels that are attested in the given
        list of labeled tokens.
    @rtype: C{list} of (immutable)
    @param labeled_tokens: The list of labeled tokens from which to
        extract labels.
    @type labeled_tokens: C{list} of (C{Token} with type
        C{LabeledText})
    """
    assert _chktype(1, labeled_tokens, [Token], (Token,))
    labelmap = {}
    for token in labeled_tokens:
        labelmap[token.type().label()] = 1
    return labelmap.keys()
def __init__(self, len, default=0):
    """
    Construct a new C{EmptyFeatureValueList}.

    @type len: C{int}
    @param len: The number of features whose values are specified
        by this feature value list.
    @type default: (immutable)
    @param default: The default value for this feature value list.
        This is used as the feature value for every feature.
    """
    assert _chktype(1, len, types.IntType)
    self._len = len
    self._default = default
def detect(self, labeled_text):
    # Inherit docs from FeatureDetectorListI
    assert _chktype(1, labeled_text, LabeledText)
    lnum = self._lmap.get(labeled_text.label(), None)
    if lnum is None:
        return EmptyFeatureValueList(self._N)
    offset = lnum * self._num_values
    assignments = {}
    for word in labeled_text.text():
        wnum = self._wmap.get(word)
        if wnum is not None:
            assignments[wnum + offset] = 1
    return SimpleFeatureValueList(assignments.items(), self._N)
def classify(self, unlabeled_token):
    # Inherit docs from ClassifierI
    assert _chktype(1, unlabeled_token, Token)
    text = unlabeled_token.type()

    # The (label, likelihood) pair that maximizes likelihood.
    # (Named `best` to avoid shadowing the built-in max.)
    best = (None, 0)

    # Find the label that maximizes the unnormalized likelihood
    # returned by fv_list_likelihood.
    for label in self._labels:
        fv_list = self._fd_list.detect(LabeledText(text, label))
        p = self.fv_list_likelihood(fv_list, label)
        if p > best[1]:
            best = (label, p)

    return Token(LabeledText(text, best[0]), unlabeled_token.loc())
def __init__(self, freqdist):
    """
    Use the Witten-Bell estimate to create a probability
    distribution for the experiment used to generate C{freqdist}.

    @type freqdist: C{FreqDist}
    @param freqdist: The frequency distribution that the
        probability estimates should be based on.
    """
    assert _chktype(1, freqdist, FreqDist)
    self._freqdist = freqdist
    # B*N/(B+N) approximates the count of the unseen words, in the
    # spirit of Witten-Bell smoothing.
    self._freqdist.inc('$UNKNOWN$',
                       self._freqdist.B() * self._freqdist.N() /
                       (self._freqdist.N() + self._freqdist.B()))
    self._N = self._freqdist.N()
def label_tokens(unlabeled_tokens, label):
    """
    @return: a list of labeled tokens, whose text and location
        correspond to C{unlabeled_tokens}, and whose labels are
        C{label}.
    @rtype: C{list} of (C{Token} with type C{LabeledText})

    @param unlabeled_tokens: The list of tokens for which a labeled
        token list should be created.
    @type unlabeled_tokens: C{list} of C{Token}
    @param label: The label for the new labeled tokens.
    @type label: (immutable)
    """
    assert _chktype(1, unlabeled_tokens, [Token], (Token,))
    return [Token(LabeledText(tok.type(), label), tok.loc())
            for tok in unlabeled_tokens]
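# A usage sketch (hypothetical token list):
#
#     pos_toks = label_tokens(unlabeled_toks, 'pos')
#     # Each resulting token keeps its text and location, and is
#     # labeled 'pos'.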
def detect(self, labeled_text):
    # Inherit docs from FeatureDetectorListI
    assert _chktype(1, labeled_text, LabeledText)
    assignments = []
    default = None
    for i in range(len(self._sub_fd_lists)):
        offset = self._offsets[i]
        fv_list = self._sub_fd_lists[i].detect(labeled_text)
        if default != fv_list.default():
            if default is None:
                default = fv_list.default()
            else:
                raise ValueError('MergedFDList can not merge feature '
                                 'value lists with different default '
                                 'values.')
        assignments += [(fnum+offset, val)
                        for (fnum, val) in fv_list.assignments()]
    return SimpleFeatureValueList(assignments, self._N, default)
def distribution_list(self, unlabeled_token):
    # Inherit docs from ClassifierI
    assert _chktype(1, unlabeled_token, Token)
    total_p = 0.0
    text = unlabeled_token.type()

    # Construct a list containing the likelihood of each label.
    dist_list = []
    for label in self._labels:
        fv_list = self._fd_list.detect(LabeledText(text, label))
        p = self.fv_list_likelihood(fv_list, label)
        dist_list.append(p)
        total_p += p

    # If p=0 for all labels, return a uniform distribution.
    if total_p == 0:
        return self.zero_distribution_list(unlabeled_token)

    # Normalize the likelihoods so they sum to one.
    return [p/total_p for p in dist_list]
def zero_distribution_list(self, unlabeled_token):
    """
    Return a list indicating the likelihood that
    C{unlabeled_token} is a member of each category.  This method
    is called whenever C{fv_list_likelihood} returns zero for every
    C{LabeledText} whose text is C{unlabeled_token.type()}.  Its
    default behavior is to return a uniform distribution; however,
    it can be overridden to provide a different behavior.
    Reasonable alternatives might include:
        - Return zero for each label.
        - Use a modified C{fv_list_likelihood} that allows zeros
          to "cancel out" between different label values.

    @return: a list of probabilities.  The M{i}th element of the
        list is the probability that C{unlabeled_text} belongs to
        C{labels()[M{i}]}'s category.
    @rtype: C{sequence} of C{float}
    @param unlabeled_token: The text to be classified.
    @type unlabeled_token: C{Token}
    """
    assert _chktype(1, unlabeled_token, Token)
    return [1.0/len(self._labels) for l in self._labels]
def __init__(self, training_data, **kwargs):
    """
    Construct a new C{AttestedFeatureSelector}.  Given a
    C{FeatureDetectorList} M{fd_list}, this feature selector will
    select any feature with id M{id} such that::

        fd_list[id].detect(LabeledText(t, l)) != (default)

    for any text M{t} from training_data, and any label M{l}.

    @param kwargs: Keyword arguments.
        - C{labels}: The set of labels that should be considered by
          this C{AttestedFeatureSelector} to decide whether a
          feature can apply to a text.  If none is given, then the
          set of all labels attested in the training data will be
          used instead.  (type=C{list} of (immutable)).
        - C{min_count}: The minimum number of C{LabeledText}s to
          which a feature must apply, in order to be included in
          the feature value list.  Default=1.  (type=C{int})
    """
    assert _chktype(1, training_data, [Token], (Token,))
    self._training_data = training_data

    # Process the keyword arguments.
    self._min_count = 1
    self._labels = None
    for (key, val) in kwargs.items():
        if key == 'min_count':
            self._min_count = val
        elif key == 'labels':
            self._labels = val
        else:
            raise TypeError('Unknown keyword arg %s' % key)

    # Find the labels, if necessary.
    if self._labels is None:
        self._labels = find_labels(training_data)
def __getitem__(self, feature_id):
    # Inherit docs from FeatureValueListI
    assert _chktype(1, feature_id, types.IntType)
    return self._default
def detect(self, labeled_text):
    # Inherit docs from FeatureDetectorListI
    assert _chktype(1, labeled_text, LabeledText)
    values = [fd.detect(labeled_text)
              for fd in self._feature_detectors]
    return ArrayFeatureValueList(values)
def detect(self, labeled_text):
    # Inherit docs from FeatureDetectorI
    assert _chktype(1, labeled_text, LabeledText)
    return self._func(labeled_text)
def prob(self, labeled_token):
    # Inherit docs from ClassifierI
    assert _chktype(1, labeled_token, Token)
    text = labeled_token.type().text()
    label = labeled_token.type().label()
    # Build the unlabeled token expected by
    # distribution_dictionary(), then look up this token's label.
    utok = Token(text, labeled_token.loc())
    return self.distribution_dictionary(utok)[label]
def __add__(self, other):
    # Inherit docs from FeatureDetectorListI
    assert _chktype(1, other, FeatureDetectorListI)
    return MergedFDList(self, other)
def train(self, train_toks, **kwargs):
    """
    Train a new C{ConditionalExponentialClassifier}, using the
    given training samples.  This
    C{ConditionalExponentialClassifier} should encode the model
    that maximizes entropy from all the models that are empirically
    consistent with C{train_toks}.

    @param kwargs: Keyword arguments.
        - C{iterations}: The maximum number of times IIS should
          iterate.  If IIS converges before this number of
          iterations, it may terminate.  Default=C{20}.
          (type=C{int})
        - C{debug}: The debugging level.  Higher values will cause
          more verbose output.  Default=C{0}.  (type=C{int})
        - C{classes}: The set of possible classes.  If none is
          given, then the set of all classes attested in the
          training data will be used instead.  (type=C{list} of
          (immutable)).
        - C{accuracy_cutoff}: The accuracy value that indicates
          convergence.  If the accuracy becomes closer to one than
          the specified value, then IIS will terminate.  The
          default value is C{None}, which indicates that no
          accuracy cutoff should be used.  (type=C{float})
        - C{delta_accuracy_cutoff}: The change in accuracy that
          should be taken to indicate convergence.  If the accuracy
          changes by less than this value in a single iteration,
          then IIS will terminate.  The default value is C{None},
          which indicates that no accuracy-change cutoff should be
          used.  (type=C{float})
        - C{log_likelihood_cutoff}: specifies what log-likelihood
          value should be taken to indicate convergence.  If the
          log-likelihood becomes closer to zero than the specified
          value, then IIS will terminate.  The default value is
          C{None}, which indicates that no log-likelihood cutoff
          should be used.  (type=C{float})
        - C{delta_log_likelihood_cutoff}: specifies what change in
          log-likelihood should be taken to indicate convergence.
          If the log-likelihood changes by less than this value in
          a single iteration, then IIS will terminate.  The default
          value is C{None}, which indicates that no
          log-likelihood-change cutoff should be used.
          (type=C{float})
    """
    assert _chktype(1, train_toks, [Token], (Token,))
    # Process the keyword arguments.
    iter = 20
    debug = 0
    classes = None
    ll_cutoff = lldelta_cutoff = None
    acc_cutoff = accdelta_cutoff = None
    for (key, val) in kwargs.items():
        if key in ('iterations', 'iter'):
            iter = val
        elif key == 'debug':
            debug = val
        elif key == 'classes':
            classes = val
        elif key == 'log_likelihood_cutoff':
            ll_cutoff = abs(val)
        elif key == 'delta_log_likelihood_cutoff':
            lldelta_cutoff = abs(val)
        elif key == 'accuracy_cutoff':
            acc_cutoff = abs(val)
        elif key == 'delta_accuracy_cutoff':
            accdelta_cutoff = abs(val)
        else:
            raise TypeError('Unknown keyword arg %s' % key)

    # Find the classes, if necessary.
    if classes is None:
        classes = attested_classes(train_toks)
    self._classes = classes

    # Find the length of the first token's feature vector.
    if len(train_toks) == 0:
        raise ValueError('Expected at least one training token')
    vector0 = train_toks[0]['FEATURE_VECTOR']
    self._feature_vector_len = len(vector0)
    self._weight_vector_len = (self._feature_vector_len *
                               len(self._classes))

    # Build the offsets dictionary.  This maps from a class to the
    # index in the weight vector where that class's weights begin.
    self._offsets = dict([(cls, i*self._feature_vector_len)
                          for i, cls in enumerate(classes)])

    # Find the frequency with which each feature occurs in the
    # training data.
    ffreq_emperical = self._ffreq_emperical(train_toks)

    # Find the nf map, and related variables nfarray and
    # nftranspose.  nf is the sum of the features for a given
    # labeled text.
    # nfmap compresses this sparse set of values to a dense list;
    # nfarray performs the reverse operation.  nftranspose is
    # nfarray as a column vector.
    nfmap = self._nfmap(train_toks)
    nfs = nfmap.items()
    nfs.sort(lambda x, y: cmp(x[1], y[1]))
    nfarray = numarray.array([nf for (nf, i) in nfs], 'd')
    nftranspose = numarray.reshape(nfarray, (len(nfarray), 1))

    # An array that is 1 whenever ffreq_emperical is zero.  In
    # other words, it is one for any feature that's not attested
    # in the data.  This is used to avoid division by zero.
    unattested = numarray.zeros(self._weight_vector_len, 'd')
    for i in range(len(unattested)):
        if ffreq_emperical[i] == 0:
            unattested[i] = 1

    # Build the classifier.  Start with weight=1 for each feature,
    # except for the unattested features.  Start those out at
    # zero, since we know that's the correct value.
    weights = numarray.ones(self._weight_vector_len, 'd')
    weights -= unattested
    classifier = ConditionalExponentialClassifier(classes, weights)

    if debug > 0:
        print '  ==> Training (%d iterations)' % iter
    if debug > 2:
        print
        print '      Iteration    Log Likelihood    Accuracy'
        print '      ---------------------------------------'

    # Previous values, used to detect convergence for the delta
    # cutoffs; None until the first measurement is taken.
    ll_old = acc_old = None

    # Train for a fixed number of iterations.
    for iternum in range(iter):
        if debug > 2:
            print ('     %9d    %14.5f    %9.3f' %
                   (iternum,
                    classifier_log_likelihood(classifier, train_toks),
                    classifier_accuracy(classifier, train_toks)))

        # Calculate the deltas for this iteration, using Newton's
        # method.
        deltas = self._deltas(train_toks, classifier, unattested,
                              ffreq_emperical, nfmap, nfarray,
                              nftranspose)

        # Use the deltas to update our weights.
        weights = classifier.weights()
        weights *= numarray.exp(deltas)
        classifier.set_weights(weights)

        # Check log-likelihood cutoffs.
        if ll_cutoff is not None or lldelta_cutoff is not None:
            ll = classifier_log_likelihood(classifier, train_toks)
            if ll_cutoff is not None and ll > -ll_cutoff:
                break
            if lldelta_cutoff is not None:
                if ll_old is not None and (ll - ll_old) < lldelta_cutoff:
                    break
                ll_old = ll

        # Check accuracy cutoffs.
        if acc_cutoff is not None or accdelta_cutoff is not None:
            acc = classifier_accuracy(classifier, train_toks)
            if acc_cutoff is not None and acc < acc_cutoff:
                break
            if accdelta_cutoff is not None:
                if acc_old is not None and (acc_old - acc) < accdelta_cutoff:
                    break
                acc_old = acc

    if debug > 2:
        print ('     %9d    %14.5f    %9.3f' %
               (iternum + 1,
                classifier_log_likelihood(classifier, train_toks),
                classifier_accuracy(classifier, train_toks)))
        print

    # Return the classifier.
    return classifier
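# A usage sketch (hypothetical trainer instance; assumes the training
# tokens carry the 'FEATURE_VECTOR' property used by train() above):
#
#     classifier = trainer.train(train_toks, iterations=10, debug=3,
#                                delta_log_likelihood_cutoff=1e-5)
#     print classifier_accuracy(classifier, test_toks)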
def detect(self, labeled_text):
    # Inherit docs from FeatureDetectorListI
    assert _chktype(1, labeled_text, LabeledText)
    return AlwaysOnFDList._FVLIST
def __getitem__(self, index):
    # index is an (i, j) pair of label indices, so check for a
    # tuple of ints rather than a single int.
    assert _chktype(1, index, (types.IntType,))
    return self._confusion[index[0], index[1]]
def __getitem__(self, feature_id):
    # Inherit docs from FeatureDetectorListI
    assert _chktype(1, feature_id, types.IntType)
    if feature_id >= len(self) or feature_id < 0:
        raise IndexError('FeatureDetectorList index out of range')
    return self._feature_detectors[feature_id]