def __init__(self, start, productions):
    """
    Create a new probabilistic context-free grammar, from the given
    start symbol and set of C{PCFGProduction}s.

    @param start: The start symbol
    @type start: L{Nonterminal}
    @param productions: The list of productions that defines the
        grammar
    @type productions: C{list} of C{PCFGProduction}
    @raise ValueError: if, for some left-hand side, the
        probabilities of the productions with that left-hand side
        do not sum to a value within PCFG.EPSILON of 1.
    """
    assert _chktype(1, start, Nonterminal)
    assert _chktype(2, productions, (PCFGProduction,), [PCFGProduction])
    CFG.__init__(self, start, productions)

    # Make sure that the probabilities sum to one.
    probs = {}
    for production in productions:
        probs[production.lhs()] = (probs.get(production.lhs(), 0) +
                                   production.prob())
    for (lhs, p) in probs.items():
        if not ((1-PCFG.EPSILON) < p < (1+PCFG.EPSILON)):
            raise ValueError("PCFGProductions for %r do not sum to 1"
                             % lhs)
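# A minimal usage sketch of the probability check above.  The
# PCFGProduction constructor signature is an assumption here; only
# the .lhs() and .prob() accessors are confirmed by this code:
#
#     >>> NP = Nonterminal('NP')
#     >>> prods = [PCFGProduction(NP, ['cat'], prob=0.7),
#     ...          PCFGProduction(NP, ['dog'], prob=0.3)]
#     >>> grammar = PCFG(NP, prods)   # probabilities for NP sum to 1
#     >>> PCFG(NP, prods[:1])         # raises ValueError: sums to 0.7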
def __init__(self, start, productions):
    """
    Create a new context-free grammar, from the given start symbol
    and set of C{CFGProduction}s.

    @param start: The start symbol
    @type start: L{Nonterminal}
    @param productions: The list of productions that defines the
        grammar
    @type productions: C{list} of L{CFGProduction}
    """
    assert _chktype(1, start, Nonterminal)
    assert _chktype(2, productions, (CFGProduction,), [CFGProduction])
    self._start = start
    self._productions = tuple(productions)
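# Example: a toy grammar built from the pieces defined in this
# module (CFGProduction(lhs, rhs) is the constructor shown below):
#
#     >>> S, NP, VP = Nonterminal('S'), Nonterminal('NP'), Nonterminal('VP')
#     >>> grammar = CFG(S, [CFGProduction(S, [NP, VP]),
#     ...                   CFGProduction(NP, ['John']),
#     ...                   CFGProduction(VP, ['sleeps'])])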
def __init__(self, grammar, trace=0, **property_names):
    """
    Create a new C{BottomUpPCFGChartParser} that uses C{grammar} to
    parse texts.

    @type grammar: C{PCFG}
    @param grammar: The grammar used to parse texts.
    @type trace: C{int}
    @param trace: The level of tracing that should be used when
        parsing a text.  C{0} will generate no tracing output; and
        higher numbers will produce more verbose tracing output.
    """
    assert _chktype(1, grammar, PCFG)
    assert _chktype(2, trace, types.IntType)
    self._grammar = grammar
    self._trace = trace
    AbstractParser.__init__(self, **property_names)
def __init__(self, beam_size, grammar, trace=0, **property_names):
    """
    Create a new beam-search parser that uses C{grammar} to parse
    texts.

    @type beam_size: C{int}
    @param beam_size: The maximum length for the parser's edge queue.
    @type grammar: C{PCFG}
    @param grammar: The grammar used to parse texts.
    @type trace: C{int}
    @param trace: The level of tracing that should be used when
        parsing a text.  C{0} will generate no tracing output; and
        higher numbers will produce more verbose tracing output.
    """
    assert _chktype(1, beam_size, types.IntType)
    assert _chktype(2, grammar, PCFG)
    assert _chktype(3, trace, types.IntType)
    BottomUpPCFGChartParser.__init__(self, grammar, trace,
                                     **property_names)
    self._beam_size = beam_size
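# Usage sketch (the subclass's name is not shown in this excerpt, so
# BeamPCFGChartParser is a hypothetical stand-in).  A beam of 20
# keeps at most the 20 best edges on the queue at any time, trading
# completeness for speed:
#
#     >>> parser = BeamPCFGChartParser(20, grammar, trace=1)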
def attested_classes(tokens, **property_names):
    """
    @return: A list of all classes that are attested in the given
        list of tokens.
    @rtype: C{list} of (immutable)
    @param tokens: The list of tokens from which to extract classes.
    @type tokens: C{list} of (C{Token} with type C{ClassedText})
    """
    assert _chktype(1, tokens, [Token], (Token,))
    CLASS = property_names.get('CLASS', 'CLASS')
    return list(sets.Set([token[CLASS] for token in tokens]))
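# Sketch, assuming Token accepts its properties as keyword arguments
# (that constructor is not shown in this excerpt):
#
#     >>> toks = [Token(TEXT='cat', CLASS='noun'),
#     ...         Token(TEXT='ran', CLASS='verb'),
#     ...         Token(TEXT='dog', CLASS='noun')]
#     >>> attested_classes(toks)   # order is arbitrary (set-based)
#     ['noun', 'verb']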
def __init__(self, lhs, rhs):
    """
    Construct a new C{CFGProduction}.

    @param lhs: The left-hand side of the new C{CFGProduction}.
    @type lhs: L{Nonterminal}
    @param rhs: The right-hand side of the new C{CFGProduction}.
    @type rhs: sequence of (C{Nonterminal} and (terminal))
    """
    assert _chktype(1, lhs, Nonterminal)
    self._lhs = lhs
    self._rhs = tuple(rhs)
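# For example, CFGProduction(S, [NP, VP]) encodes the rule S -> NP VP.
# The right-hand side is stored as an immutable tuple, so productions
# are hashable and can be used as dictionary keys.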
def __div__(self, rhs):
    """
    @return: A new nonterminal whose symbol is C{M{A}/M{B}}, where
        C{M{A}} is the symbol for this nonterminal, and C{M{B}} is
        the symbol for rhs.
    @rtype: L{Nonterminal}
    @param rhs: The nonterminal used to form the right hand side of
        the new nonterminal.
    @type rhs: L{Nonterminal}
    """
    assert _chktype(1, rhs, Nonterminal)
    return Nonterminal('%s/%s' % (self._symbol, rhs._symbol))
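# Example: slashed categories in the style of categorial grammar.
# The symbol() accessor is assumed here (it is not shown in this
# excerpt); the division itself is exactly the code above:
#
#     >>> (Nonterminal('VP') / Nonterminal('NP')).symbol()
#     'VP/NP'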
def trace(self, trace=2):
    """
    Set the level of tracing output that should be generated when
    parsing a text.

    @type trace: C{int}
    @param trace: The trace level.  A trace level of C{0} will
        generate no tracing output; and higher trace levels will
        produce more verbose tracing output.
    @rtype: C{None}
    """
    assert _chktype(1, trace, types.IntType)
    self._trace = trace
def get_parse_list(self, token):
    # Inherit docs from ParserI
    assert _chktype(1, token, Token)
    SUBTOKENS = self.property('SUBTOKENS')
    LEAF = self.property('LEAF')
    subtokens = token[SUBTOKENS]

    # The most likely constituent table.  This table specifies the
    # most likely constituent for a given span and type.
    # Constituents can be either Trees or Tokens.  For Trees, the
    # "type" is the Nonterminal for the tree's root node value.  For
    # Tokens, the "type" is the token's type.  The table is stored
    # as a dictionary, since it is sparse.
    constituents = {}

    # Initialize the constituents dictionary with the words from
    # the text.
    if self._trace:
        print ('Inserting tokens into the most likely' +
               ' constituents table...')
    for index in range(len(subtokens)):
        tok = subtokens[index]
        probtok = tok.copy()
        constituents[index, index+1, tok[LEAF]] = probtok
        if self._trace > 1:
            self._trace_lexical_insertion(tok, subtokens)

    # Consider each span of length 1, 2, ..., n; and add any trees
    # that might cover that span to the constituents dictionary.
    for length in range(1, len(subtokens)+1):
        if self._trace:
            if self._trace > 1: print
            print ('Finding the most likely constituents' +
                   ' spanning %d text elements...' % length)
        for start in range(len(subtokens)-length+1):
            span = (start, start+length)
            self._add_constituents_spanning(span, constituents,
                                            subtokens)

    # Find the most likely constituent that spans the entire text
    # and has the grammar's start symbol as its category.
    trees = []
    tree = constituents.get((0, len(subtokens), self._grammar.start()))
    if tree is not None:
        trees.append(tree)

    # Sort the trees, and return the requested number of them.
    trees.sort(lambda t1, t2: cmp(t2.prob(), t1.prob()))
    return trees
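# End-to-end usage sketch (hedged): the token layout (a SUBTOKENS
# property holding the leaf tokens) is taken from the code above,
# but ViterbiPCFGParser is only an assumed name for the enclosing
# class, which is not shown in this excerpt:
#
#     >>> parser = ViterbiPCFGParser(grammar, trace=1)
#     >>> parses = parser.get_parse_list(sentence_token)
#     >>> if parses: print parses[0].prob()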
def train(self, train_toks, **kwargs):
    """
    Train a new C{ConditionalExponentialClassifier}, using the
    given training samples.  This
    C{ConditionalExponentialClassifier} should encode the model
    that maximizes entropy, chosen from among all models that are
    empirically consistent with C{train_toks}.

    @param kwargs: Keyword arguments.
      - C{iterations}: The maximum number of times IIS should
        iterate.  If IIS converges before this number of
        iterations, it may terminate.  Default=C{20}.  (type=C{int})
      - C{debug}: The debugging level.  Higher values will cause
        more verbose output.  Default=C{0}.  (type=C{int})
      - C{classes}: The set of possible classes.  If none is given,
        then the set of all classes attested in the training data
        will be used instead.  (type=C{list} of (immutable))
      - C{accuracy_cutoff}: The accuracy value that indicates
        convergence.  If the accuracy becomes closer to one than
        the specified value, then IIS will terminate.  The default
        value is C{None}, which indicates that no accuracy cutoff
        should be used.  (type=C{float})
      - C{delta_accuracy_cutoff}: The change in accuracy that
        should be taken to indicate convergence.  If the accuracy
        changes by less than this value in a single iteration, then
        IIS will terminate.  The default value is C{None}, which
        indicates that no accuracy-change cutoff should be used.
        (type=C{float})
      - C{log_likelihood_cutoff}: specifies what log-likelihood
        value should be taken to indicate convergence.  If the
        log-likelihood becomes closer to zero than the specified
        value, then IIS will terminate.  The default value is
        C{None}, which indicates that no log-likelihood cutoff
        should be used.  (type=C{float})
      - C{delta_log_likelihood_cutoff}: specifies what change in
        log-likelihood should be taken to indicate convergence.  If
        the log-likelihood changes by less than this value in a
        single iteration, then IIS will terminate.  The default
        value is C{None}, which indicates that no
        log-likelihood-change cutoff should be used.  (type=C{float})
    """
    assert _chktype(1, train_toks, [Token], (Token,))

    # Process the keyword arguments.
    iterations = 20
    debug = 0
    classes = None
    ll_cutoff = lldelta_cutoff = None
    acc_cutoff = accdelta_cutoff = None
    for (key, val) in kwargs.items():
        if key in ('iterations', 'iter'): iterations = val
        elif key == 'debug': debug = val
        elif key == 'classes': classes = val
        elif key == 'log_likelihood_cutoff': ll_cutoff = abs(val)
        elif key == 'delta_log_likelihood_cutoff': lldelta_cutoff = abs(val)
        elif key == 'accuracy_cutoff': acc_cutoff = abs(val)
        elif key == 'delta_accuracy_cutoff': accdelta_cutoff = abs(val)
        else: raise TypeError('Unknown keyword arg %s' % key)

    # Find the classes, if necessary.
    if classes is None:
        classes = attested_classes(train_toks)
    self._classes = classes

    # Find the length of the first token's feature vector.
    if len(train_toks) == 0:
        raise ValueError('Expected at least one training token')
    vector0 = train_toks[0]['FEATURE_VECTOR']
    self._feature_vector_len = len(vector0)
    self._weight_vector_len = self._feature_vector_len*len(self._classes)

    # Build the offsets dictionary.  This maps from a class to the
    # index in the weight vector where that class's weights begin.
    self._offsets = dict([(cls, i*self._feature_vector_len)
                          for i, cls in enumerate(classes)])

    # Find the frequency with which each feature occurs in the
    # training data.
    ffreq_emperical = self._ffreq_emperical(train_toks)

    # Find the nf map, and related variables nfarray and
    # nftranspose.  nf is the sum of the features for a given
    # labeled text.
    # nfmap compresses this sparse set of values to a dense list.
    # nfarray performs the reverse operation.  nftranspose is
    # nfarray reshaped into a column vector.
    nfmap = self._nfmap(train_toks)
    nfs = nfmap.items()
    nfs.sort(lambda x, y: cmp(x[1], y[1]))
    nfarray = numarray.array([nf for (nf, i) in nfs], 'd')
    nftranspose = numarray.reshape(nfarray, (len(nfarray), 1))

    # An array that is 1 whenever ffreq_emperical is zero.  In
    # other words, it is one for any feature that's not attested in
    # the data.  This is used to avoid division by zero.
    unattested = numarray.zeros(self._weight_vector_len, 'd')
    for i in range(len(unattested)):
        if ffreq_emperical[i] == 0: unattested[i] = 1

    # Build the classifier.  Start with weight=1 for each feature,
    # except for the unattested features.  Start those out at zero,
    # since we know that's the correct value.
    weights = numarray.ones(self._weight_vector_len, 'd')
    weights -= unattested
    classifier = ConditionalExponentialClassifier(classes, weights)

    if debug > 0: print '  ==> Training (%d iterations)' % iterations
    if debug > 2:
        print
        print '      Iteration    Log Likelihood    Accuracy'
        print '      ---------------------------------------'

    # Previous log-likelihood and accuracy, for the delta cutoffs.
    # Initialized to None so the delta checks are skipped on the
    # first iteration, when there is no previous value to compare.
    ll_old = None
    acc_old = None

    # Train for a fixed number of iterations.
    for iternum in range(iterations):
        if debug > 2:
            print ('     %9d    %14.5f    %9.3f' %
                   (iternum,
                    classifier_log_likelihood(classifier, train_toks),
                    classifier_accuracy(classifier, train_toks)))

        # Calculate the deltas for this iteration, using Newton's
        # method.
        deltas = self._deltas(train_toks, classifier, unattested,
                              ffreq_emperical, nfmap, nfarray,
                              nftranspose)

        # Use the deltas to update our weights.
        weights = classifier.weights()
        weights *= numarray.exp(deltas)
        classifier.set_weights(weights)

        # Check log-likelihood cutoffs.
        if ll_cutoff is not None or lldelta_cutoff is not None:
            ll = classifier_log_likelihood(classifier, train_toks)
            if ll_cutoff is not None and ll > -ll_cutoff: break
            if lldelta_cutoff is not None:
                if ll_old is not None and (ll - ll_old) < lldelta_cutoff:
                    break
                ll_old = ll

        # Check accuracy cutoffs.
        if acc_cutoff is not None or accdelta_cutoff is not None:
            acc = classifier_accuracy(classifier, train_toks)
            if acc_cutoff is not None and acc > acc_cutoff: break
            if accdelta_cutoff is not None:
                if acc_old is not None and (acc - acc_old) < accdelta_cutoff:
                    break
                acc_old = acc

    if debug > 2:
        print ('     %9d    %14.5f    %9.3f' %
               (iternum+1,
                classifier_log_likelihood(classifier, train_toks),
                classifier_accuracy(classifier, train_toks)))
        print

    # Return the classifier.
    return classifier
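# Usage sketch: the keyword arguments below are exactly those
# documented in the docstring above; the trainer class's name is not
# shown in this excerpt, so `trainer` is left abstract:
#
#     >>> classifier = trainer.train(train_toks, iterations=10,
#     ...                            debug=3,
#     ...                            delta_log_likelihood_cutoff=1e-5)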