Example #1
def log_likelihood(classifier, labeled_tokens):
    """
    Evaluate the average log likelihood of the given list of
    labeled tokens for the given classifier model.  This
    nonpositive float gives an indication of how well the
    classifier models the data.  Values closer to zero indicate
    that it models it more accurately.

    @rtype: C{float}
    @return: The average log likelihood of C{labeled_tokens} for
        the given classifier model.
    @param labeled_tokens: The tokens whose log likelihood should
        be computed.
    @type labeled_tokens: C{list} of (C{Token} with type
        C{LabeledText}) 
    """
    assert _chktype(1, classifier, ClassifierI)
    assert _chktype(2, labeled_tokens, [Token], (Token,))
    likelihood = 0.0
    for ltok in labeled_tokens:
        utok = Token(ltok.type().text(), ltok.loc())
        label = ltok.type().label()
        dist = classifier.distribution_dictionary(utok)
        if dist[label] == 0:
            # Use some approximation to infinity.  What this does
            # depends on your system's float implementation.
            likelihood -= 1e1000
        else:
            likelihood += math.log(dist[label])

    return likelihood / len(labeled_tokens)
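Since the function returns the per-token average, a short list of label probabilities is enough to see the scale of the result.  Below is a minimal, dependency-free sketch of the same computation; the function name and its input (one probability per token's correct label) are illustrative stand-ins, not part of the API above.

import math

def average_log_likelihood(label_probs):
    # One probability per labeled token: the probability the model
    # assigned to that token's correct label.
    total = 0.0
    for p in label_probs:
        if p == 0:
            total -= 1e1000  # approximation to -infinity, as above
        else:
            total += math.log(p)
    return total / len(label_probs)

print(average_log_likelihood([0.9, 0.8, 0.95]))  # about -0.13: a good fit
print(average_log_likelihood([0.1, 0.05]))       # about -2.65: a poor fit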
Example #2
 def __init__(self, start, productions):
     """
      Create a new context-free grammar from the given start symbol
      and set of C{CFGProduction}s.
     
     @param start: The start symbol
     @type start: L{Nonterminal}
     @param productions: The list of productions that defines the grammar
     @type productions: C{list} of L{CFGProduction}
     """
     assert _chktype(1, start, Nonterminal)
     assert _chktype(2, productions, (CFGProduction,), [CFGProduction])
     self._start = start
     self._productions = tuple(productions)
     # Index of lhs nonterminals to rules
     self._index = {}
     # Reverse index of rhs tokens to rules
     self._rindex = {}
     # List of productions that have some terminals in the rhs
     self._lexicon_grammar = []
     # List of productions that have no terminals in the rhs
     self._nt_grammar = []
      for n, production in enumerate(self._productions):
          self._index.setdefault(production.lhs(), []).append(n)
          nonterminals = 1
          for token in production.rhs():
              nonterminals = nonterminals and isinstance(token, Nonterminal)
              self._rindex.setdefault(token, []).append(n)
          if nonterminals: self._nt_grammar.append(n)
          else: self._lexicon_grammar.append(n)
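The two dictionaries built above can be illustrated without the grammar classes.  In this sketch, productions are plain (lhs, rhs) tuples rather than C{CFGProduction} objects; the toy grammar is made up for illustration.

# lhs -> production numbers, and rhs token -> production numbers.
productions = [('S', ('NP', 'VP')), ('NP', ('dog',)), ('VP', ('barks',))]
index, rindex = {}, {}
for n, (lhs, rhs) in enumerate(productions):
    index.setdefault(lhs, []).append(n)
    for token in rhs:
        rindex.setdefault(token, []).append(n)
print(index['NP'])   # [1]: productions whose left-hand side is NP
print(rindex['VP'])  # [0]: productions with VP in their right-hand side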
Example #3
    def __init__(self, classifier, labeled_tokens):
        """
        Entry conf[i][j] is the number of times a document with label i
        was given label j.
        """
        assert _chktype(1, classifier, ClassifierI)
        assert _chktype(2, labeled_tokens, [Token], (Token,))
        try:
            import numpy.oldnumeric as Numeric
        except ImportError:
            raise ImportError("ConfusionMatrix requires Numeric")

        # Extract the labels.
        ldict = {}
        for ltok in labeled_tokens:
            ldict[ltok.type().label()] = 1
        labels = ldict.keys()

        # Construct a label->index dictionary
        indices = {}
        for i in range(len(labels)):
            indices[labels[i]] = i

        confusion = Numeric.zeros((len(labels), len(labels)))
        for ltok in labeled_tokens:
            utok = Token(ltok.type().text(), ltok.loc())
            ctok = classifier.classify(utok)
            confusion[indices[ltok.type().label()], indices[ctok.type().label()]] += 1

        self._labels = labels
        self._confusion = confusion
        self._max_conf = max(Numeric.resize(confusion, (len(labels) ** 2,)))
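The indexing scheme is easier to see without the classifier machinery.  This dependency-free sketch builds the same matrix from made-up (actual, predicted) label pairs standing in for the classifier's output.

pairs = [('pos', 'pos'), ('pos', 'neg'), ('neg', 'neg')]
labels = sorted(set(l for pair in pairs for l in pair))
indices = dict((label, i) for i, label in enumerate(labels))
# conf[i][j]: number of documents with actual label i, predicted label j.
conf = [[0] * len(labels) for _ in labels]
for actual, predicted in pairs:
    conf[indices[actual]][indices[predicted]] += 1
print(labels)  # ['neg', 'pos']
print(conf)    # [[1, 0], [1, 1]]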
Example #4
    def __init__(self, start, productions):
        """
        Create a new probabilistic context-free grammar from the given
        start symbol and set of C{PCFGProduction}s.
        
        @param start: The start symbol
        @type start: L{Nonterminal}
        @param productions: The list of productions that defines the grammar
        @type productions: C{list} of C{PCFGProduction}
        @raise ValueError: if, for any left-hand side, the probabilities
            of the productions with that left-hand side do not sum to a
            value within PCFG.EPSILON of 1.
        """
        assert _chktype(1, start, Nonterminal)
        assert _chktype(2, productions, (PCFGProduction,), [PCFGProduction])
        HashCFG.__init__(self, start, productions)

        # Make sure that the probabilities sum to one.
        probs = {}
        for production in productions:
            probs[production.lhs()] = (probs.get(production.lhs(), 0) +
                                       production.prob())
        for (lhs, p) in probs.items():
            if not ((1-PCFG.EPSILON) < p < (1+PCFG.EPSILON)):
                raise ValueError("CFGProductions for %r do not sum to 1" % lhs)
        for lhs in self._index:
            self._index[lhs].sort(lambda x,y: cmp(self._productions[y].prob(),
                                                  self._productions[x].prob()))
Example #5
def printDirs(Dirs, margin=70, indent=0):
    """
    @return: A pretty-printed string representation of this tree.
    @rtype: C{string}
    @param margin: The right margin at which to do line-wrapping.
    @type margin: C{int}
    @param indent: The indentation level at which printing
        begins.  This number is used to decide how far to indent
        subsequent lines.
    @type indent: C{int}
    """
    assert _chktype(1, margin, types.IntType)
    assert _chktype(2, indent, types.IntType)
    #    return repr(Dirs)
    if (isinstance(Dirs, str) or isinstance(Dirs, tuple)
            or (isinstance(Dirs, list) and not isinstance(Dirs, Tree))):
        return '\n%s%s\n' % (' ' * indent, Dirs)
    rep = reprDirs(Dirs)
    if len(rep) + indent < margin:
        if indent: return rep
        else: return rep + '\n'

    if isinstance(Dirs, Token) and Dirs.has_key('TREE'): tree = Dirs['TREE']
    else: tree = Dirs
    s = ['[', tree.node]
    for child in tree:
        if isinstance(child, Tree):
            s.extend(['\n', ' ' * (indent + 2), printDirs(child, margin, indent + 2)])
        elif isinstance(child, Token):
            s.extend([' ', child['TEXT']])
        else:
            s.extend(['\n', ' ' * indent, str(child)])
    s.append(']')
    if indent == 0: s.append('\n')
    return ''.join(s)
Example #6
    def __init__(self, sub_fd_list, selected_ids):
        """
        Construct a new C{SelectedFDList}.  This feature detector list
        implements the features from C{sub_fd_list} that are indicated
        by C{selected_ids}.  In particular, the following is true for
        all feature ids M{0<=id<len(self)} and C{LabeledText}s M{lt}::

          self[id].detect(lt) = sub_fd_list[selected_ids.index(id)].detect(lt)

        @type sub_fd_list: C{FeatureDetectorListI}
        @param sub_fd_list: The C{FeatureDetectorList} that this
            C{SelectedFDList} is based on.
        @type selected_ids: C{sequence} of C{int}
        @param selected_ids: The feature ids for the feature detectors
            that should be included in the C{SelectedFDList}.  This
            list should not contain duplicate feature ids.
        """
        assert _chktype(1, sub_fd_list, FeatureDetectorListI)
        assert _chktype(2, selected_ids, [types.IntType], (types.IntType,))
        N = 0
        idmap = {}
        for id in selected_ids:
            if not idmap.has_key(id):
                idmap[id] = N
                N += 1
                
        self._N = N
        self._idmap = idmap
        self._sub_fd_list = sub_fd_list
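The id remapping can be seen in isolation: each distinct selected id is assigned the next new id, in order of first occurrence, so the selected list's ids are dense.  The numbers below are made up.

selected_ids = [7, 3, 7, 9]
idmap, N = {}, 0
for old_id in selected_ids:
    if old_id not in idmap:  # duplicates are ignored
        idmap[old_id] = N
        N += 1
print(sorted(idmap.items()))  # [(3, 1), (7, 0), (9, 2)]
print(N)                      # 3 features in the selected list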
Example #7
    def __init__(self, function, range):
        """
        Construct a new C{LabeledTextFunctionFDList}.  This 
        feature detector list contains one feature detector for each
        value in C{range}.  When applied to a labeled text M{ltext},
        the feature detector corresponding to the function value M{v}
        will return:

            - 1, if C{function(M{ltext})==M{v}}
            - 0, otherwise

        @type function: C{LabeledText} -> (immutable)
        @param function: The function on which this 
            C{LabeledTextFunctionFDList} is based.
        @type range: C{list} of (immutable)
        @param range: The range of C{function}.  
        """
        assert _chktype(1, function, types.FunctionType,
                        types.BuiltinFunctionType, types.ClassType)
        assert _chktype(2, range, [], ())
        self._func = function

        self._map = {}
        self._N = 0
        for elt in range:
            if not self._map.has_key(elt):
                self._map[elt] = self._N
                self._N += 1
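The resulting detector list behaves like a one-hot encoding of the function's value.  A sketch under simplified assumptions (the function here takes a plain string rather than a C{LabeledText}, and all names are illustrative):

def first_letter(text):
    return text[0]

value_range = ['a', 'b', 'c']
fmap = dict((v, i) for i, v in enumerate(value_range))

def detect(text):
    one_hot = [0] * len(value_range)
    fid = fmap.get(first_letter(text))
    if fid is not None:  # values outside the range fire no feature
        one_hot[fid] = 1
    return one_hot

print(detect('banana'))  # [0, 1, 0]
print(detect('zebra'))   # [0, 0, 0]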
Example #8
    def __init__(self, base_fd_list, texts, labels):
        """
        Construct a new C{MemoizedFDList}.  Pre-compute the
        C{FeatureValueList} for each C{LabeledText(M{t}, M{l})} where
        C{M{t}} is an element of C{texts} and C{M{l}} is an element of
        C{labels}.  These pre-computed C{FeatureValueList}s will be
        returned whenever C{detect} is called with the corresponding
        labeled text.

        @param base_fd_list: The base C{FeatureDetectorList}.  This
            C{MemoizedFDList} always returns the same
            C{FeatureValueList} that C{base_fd_list} would.
        @type base_fd_list: C{FeatureDetectorListI}
        @param texts: The list of texts for which C{FeatureValueList}s
            should be pre-computed.
        @type texts: C{sequence} of (immutable)
        @param labels: The list of labels for which C{FeatureValueList}s
            should be pre-computed.
        @type labels: C{sequence} of (immutable)
        """
        assert _chktype(1, base_fd_list, FeatureDetectorListI)
        assert _chktype(2, texts, (), [])
        assert _chktype(3, labels, (), [])
        self._cache = {}
        self._base_fd_list = base_fd_list
        for text in texts:
            for label in labels:
                ltext = LabeledText(text, label)
                self._cache[ltext] = base_fd_list.detect(ltext)
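The caching pattern reduces to plain dictionary memoization: pay the detection cost once per (text, label) pair up front, then answer lookups from the cache.  Here expensive_detect is an illustrative stand-in for real feature detection.

def expensive_detect(pair):
    return hash(pair) % 2  # stand-in for real feature detection

texts, labels = ['dog barks', 'cat sleeps'], ['pos', 'neg']
cache = {}
for text in texts:
    for label in labels:
        cache[(text, label)] = expensive_detect((text, label))

def detect(pair):
    if pair in cache:
        return cache[pair]         # pre-computed
    return expensive_detect(pair)  # fall back for unseen pairs, as in Example #14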
Example #9
 def __init__(self, start, productions):
     """
      Create a new context-free grammar from the given start symbol
      and set of C{CFGProduction}s.
     
     @param start: The start symbol
     @type start: L{Nonterminal}
     @param productions: The list of productions that defines the grammar
     @type productions: C{list} of L{CFGProduction}
     """
     assert _chktype(1, start, Nonterminal)
     assert _chktype(2, productions, (CFGProduction,), [CFGProduction])
     self._start = start
     self._productions = tuple(productions)
Example #10
    def __init__(self, fd_list, **kwargs):
        """
        Construct a new classifier trainer, using the given feature
        detector list.

        @type fd_list: C{FeatureDetectorListI}
        @param fd_list: A feature detector list defining
            the features that are used by the C{NBClassifier}s
            generated by this C{NBClassifierTrainer}.
        @param kwargs: Keyword arguments.
            - C{labels}: The set of possible labels.  If none is
              given, then the set of all labels attested in the
              training data will be used instead.  (type=C{list} of
              (immutable)).
            - C{estimator}: The smoothing algorithm that should be
              applied to the probability estimates for feature value
              assignments.  Currently, the possible values are:
                - C{'ELE'}: The expected likelihood estimation.  This
                  is currently the default value.
                - C{'MLE'}: The maximum likelihood estimation.  This
                  does not apply any smoothing.  
                - C{'Laplace'}: The Laplace estimation.
                - C{('Lidstone', lambda)}: The Lidstone estimation.
                  Lambda is a parameter to that estimation; it is a
                  positive float, typically between 0 and 1.
        """
        assert _chktype(1, fd_list, FeatureDetectorListI)
        self._fd_list = fd_list
        self._kwargs = kwargs
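The named estimators are all instances of Lidstone smoothing with a different gamma: MLE is gamma=0, ELE is gamma=0.5, and Laplace is gamma=1.  A sketch of the standard formula (the estimator classes themselves are not shown in this excerpt):

def lidstone(count, N, B, gamma):
    # P(sample) = (count + gamma) / (N + gamma*B), where N is the total
    # number of observations and B the number of distinct bins.
    return (count + gamma) / float(N + gamma * B)

# 10 observations over 5 bins; this sample was seen 4 times:
print(lidstone(4, 10, 5, 0.0))  # 0.4      MLE: no smoothing
print(lidstone(4, 10, 5, 0.5))  # 0.36     ELE
print(lidstone(4, 10, 5, 1.0))  # 0.333..  Laplace: strongest pull toward uniform
print(lidstone(0, 10, 5, 0.5))  # 0.04     unseen samples get non-zero mass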
Example #11
    def __init__(self, *sub_fd_lists):
        """
        Construct a new feature detector list, containing the features
        from each of the feature detector lists in C{sub_fd_lists}.

        If M{N[i]} is the length of the M{i}th feature detector list,
        then feature id M{j} in C{sub_fd_list[M{i}]} corresponds to
        feature id M{N[0]+N[1]+...+N[i-1]+j} in the merged feature
        detector list.

        @param sub_fd_lists: The feature detector lists to combine.
        @type sub_fd_lists: C{list} of C{FeatureDetectorListI}
        """
        assert _chktype('vararg', sub_fd_lists, (FeatureDetectorListI,))
        self._sub_fd_lists = []
        self._offsets = []
        offset = 0
        for sublist in sub_fd_lists:
            if isinstance(sublist, MergedFDList):
                # Create a single flat merged feature detector list, 
                # rather than a tree of them.
                self._sub_fd_lists += sublist._sub_fd_lists
                self._offsets += [x+offset for x in sublist._offsets]
            else:
                self._sub_fd_lists.append(sublist)
                self._offsets.append(offset)
            offset += len(sublist)

        self._N = offset
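The offset arithmetic from the docstring, in isolation: feature M{j} of sub-list M{i} becomes feature offsets[i] + j in the merged list, where offsets[i] is the running total of the earlier sub-list lengths.  The lengths below are made up.

sub_lengths = [4, 2, 5]  # N[0], N[1], N[2]
offsets, total = [], 0
for n in sub_lengths:
    offsets.append(total)
    total += n
print(offsets)         # [0, 4, 6]
print(offsets[2] + 3)  # 9: feature 3 of sub-list 2, in merged ids
print(total)           # 11 features in the merged list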
Example #12
 def __add__(self, other):
     # Inherit docs from FeatureDetectorListI
     # n.b.: Slight circular dependency (since
     # MergedFDList is itself derived from
     # AbstractFDList).
     assert _chktype(1, other, FeatureDetectorListI)
     return MergedFDList(self, other)
Example #13
    def select(self, fd_list):
        # Count the number of times each feature is attested.
        assert _chktype(1, fd_list, FeatureDetectorListI)
        attested = {}
        for labeled_token in self._training_data:
            text = labeled_token.type().text()
            for label in self._labels:
                fv_list = fd_list.detect(LabeledText(text, label))
                default = fv_list.default()
                for (id, val) in fv_list.assignments():
                    if val != default:
                        attested[id] = attested.get(id, 0) + 1

        # Construct the list of selected ids.  This is easy if
        # min_count = 1.  Otherwise, loop through the entries of
        # attested. 
        if self._min_count == 1:
            selected_ids = attested.keys()
        else:
            selected_ids = []
            for (id, count) in attested.items():
                if count >= self._min_count:
                    selected_ids.append(id)

        # Return the selected feature detector list.
        return SelectedFDList(fd_list, selected_ids)
Example #14
 def detect(self, labeled_text):
     # Inherit docs
     assert _chktype(1, labeled_text, LabeledText)
     fv_list = self._cache.get(labeled_text, None)
     if fv_list is not None:
         return fv_list
     else:
         return self._base_fd_list.detect(labeled_text)
Example #15
 def distribution_dictionary(self, unlabeled_token):
     # Inherit docs from ClassifierI
     assert _chktype(1, unlabeled_token, Token)
     dist_dict = {}
     dist_list = self.distribution_list(unlabeled_token)
     for labelnum in range(len(self._labels)):
         dist_dict[self._labels[labelnum]] = dist_list[labelnum]
     return dist_dict
Example #16
 def __getitem__(self, feature_id):
     # Inherit docs from FeatureDetectorListI
     assert _chktype(1, feature_id, types.IntType)
     if feature_id >= len(self) or feature_id < 0:
         raise IndexError('FeatureDetectorList index out of range')
     def f(labeled_text, detect=self.detect, id=feature_id):
         return detect(labeled_text)[id]
     return FunctionFeatureDetector(f, ('f_%d()' % feature_id))
Example #17
 def detect(self, labeled_text):
     # Inherit docs from FeatureDetectorListI
     assert _chktype(1, labeled_text, LabeledText)
     fid = self._map.get(self._func(labeled_text), None)
     if fid is None:
         return EmptyFeatureValueList(self._N)
     else:
         return SimpleFeatureValueList(((fid, 1),), self._N)
Example #18
 def __getitem__(self, feature_id):
     # Inherit docs from FeatureValueListI
     assert _chktype(1, feature_id, types.IntType)
     if feature_id >= self._len:
         raise IndexError('FeatureValueList index out of range')
     for (id, val) in self._assignments:
         if id == feature_id:
             return val
     return self._default
Example #19
def _get_toks(file='ca01', debug=0):
    """
    Load tokens from the given file.  
    """
    assert _chktype(1, file, types.StringType)
    assert _chktype(2, debug, types.IntType)
    
    _resettime()
    if debug: print _timestamp(), 'tokenizing', file

    ttoks = brown.tokenize(file)

    labeled_tokens = [Token(LabeledText(tok.type().base().lower(),
                                        tok.type().tag()),
                            tok.loc())
                      for tok in ttoks]
    if debug: print _timestamp(), '  done tokenizing'
    return labeled_tokens
Example #20
    def __init__(self, function, name=None):
        """
        Construct a new C{FunctionFeatureDetector} from the given
        function.

        @param function: The function that this feature detector is based
            on.  When this feature detector is applied to a labeled
            text M{lt}, it will return M{C{func}(lt)}.
        @type function: C{LabeledText} -> (any)
        @param name: A name for the function used by this feature
            detector.  This name is used in the string representation
            of the feature detector.
        """
        assert _chktype(1, function, types.FunctionType,
                        types.BuiltinFunctionType, types.ClassType)
        assert _chktype(2, name, types.NoneType, types.StringType)
        self._name = name
        self._func = function
Example #21
 def detect(self, labeled_text):
     # Inherit docs from AbstractFDList
     assert _chktype(1, labeled_text, LabeledText)
     fv_list = self._sub_fd_list.detect(labeled_text)
     assignments = [(self._idmap.get(id), val) for (id, val)
                    in fv_list.assignments()
                    if self._idmap.has_key(id)]
     return SimpleFeatureValueList(assignments, self._N,
                                   fv_list.default())
Example #22
    def __init__(self, grammar, trace=0, **property_names):
        """
        Create a new C{BottomUpPCFGChartParser} that uses C{grammar}
        to parse texts.

        @type grammar: C{PCFG}
        @param grammar: The grammar used to parse texts.
        @type trace: C{int}
        @param trace: The level of tracing that should be used when
            parsing a text.  C{0} will generate no tracing output;
            and higher numbers will produce more verbose tracing
            output.
        """
        assert _chktype(1, grammar, PCFG)
        assert _chktype(2, trace, types.IntType)
        self._grammar = grammar
        self._trace = trace
        AbstractParser.__init__(self, **property_names)
Example #23
    def __init__(self, words, labels):
        """
        Construct a new C{BagOfWordsFDList}.  This feature
        detector list contains one feature detector for each
        M{(word,label)} pair, where M{word} is an element of C{words},
        and M{label} is an element of C{labels}.  When the feature
        detector corresponding to M{(word, label)} is applied to a
        labeled text M{ltext}, it will return:

            - 1, if C{M{word} in M{ltext}.text() and
                      M{ltext}.label()==M{label}}
            - 0, otherwise

        @type words: C{list} of (immutable)
        @param words: The list of words to look for.
        @type labels: C{list} of (immutable)
        @param labels: The set of labels used by this
            C{BagOfWordsFDList}.
        """
        assert _chktype(1, words, (), [])
        assert _chktype(2, labels, (), [])
        if None in words:
            raise ValueError('BagOfWordsFDList can not '+
                             'be used if words contains None')
        if None in labels:
            raise ValueError('BagOfWordsFDList can not '+
                             'be used if labels contains None')
        
        self._wmap = {}
        self._num_values = 0
        for word in words:
            if not self._wmap.has_key(word):
                self._wmap[word] = self._num_values
                self._num_values += 1

        self._lmap = {}
        self._num_labels = 0
        for label in labels:
            if not self._lmap.has_key(label):
                self._lmap[label] = self._num_labels
                self._num_labels += 1

        self._N = self._num_values * self._num_labels
Example #24
    def __init__(self, function, range, labels):
        """
        Construct a new C{TextFunctionFDList}.  This feature
        detector list contains one feature detector for each M{(v, l)}
        pair, where M{v} is a value in C{range}, and M{l} is an
        element of C{labels}.  When applied to a labeled text
        M{ltext}, the feature detector corresponding to the pair M{(v,
        l)} will return:

            - 1, if C{function(M{ltext}.text())==M{v} and
                      M{ltext}.label()==M{l}}
            - 0, otherwise

        @type function: (immutable) -> (immutable)
        @param function: The function on which this 
            C{TextFunctionFDList} is based.
        @type range: C{list} of (immutable)
        @param range: The range of C{function}.
        @type labels: C{list} of (immutable)
        @param labels: The set of labels used by this
            C{TextFunctionFDList}. 
        """
        assert _chktype(1, function, types.FunctionType,
                        types.BuiltinFunctionType, types.ClassType)
        assert _chktype(2, range, [], ())
        assert _chktype(3, labels, (), [])
        self._func = function
        self._labels = labels

        self._vmap = {}
        self._num_values = 0
        for elt in range:
            if not self._vmap.has_key(elt):
                self._vmap[elt] = self._num_values
                self._num_values += 1

        self._lmap = {}
        self._num_labels = 0
        for elt in labels:
            if not self._lmap.has_key(elt):
                self._lmap[elt] = self._num_labels
                self._num_labels += 1

        self._N = self._num_values * self._num_labels
Example #25
 def detect(self, labeled_text):
     # Inherit docs from FeatureDetectorListI
     assert _chktype(1, labeled_text, LabeledText)
     lnum = self._lmap.get(labeled_text.label(), None)
     vnum = self._vmap.get(self._func(labeled_text.text()), None)
      if (lnum is None) or (vnum is None):
         return EmptyFeatureValueList(self._N)
     else:
         fid = vnum + lnum*self._num_values
         return SimpleFeatureValueList(((fid, 1),), self._N)
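The feature id packs a (value, label) pair into a single integer, fid = vnum + lnum*num_values, which is unambiguous because vnum < num_values.  A sketch of the encoding and its inverse (the decode helper is added here for illustration only):

num_values = 4

def encode(vnum, lnum):
    return vnum + lnum * num_values

def decode(fid):
    return (fid % num_values, fid // num_values)

print(encode(2, 1))  # 6
print(decode(6))     # (2, 1): value number 2, label number 1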
Example #26
    def __init__(self, beam_size, grammar, trace=0, **property_names):
        """
        Create a new C{BottomUpPCFGChartParser} that uses C{grammar}
        to parse texts.

        @type beam_size: C{int}
        @param beam_size: The maximum length for the parser's edge queue.
        @type grammar: C{PCFG}
        @param grammar: The grammar used to parse texts.
        @type trace: C{int}
        @param trace: The level of tracing that should be used when
            parsing a text.  C{0} will generate no tracing output;
            and higher numbers will produce more verbose tracing
            output.
        """
        assert _chktype(1, beam_size, types.IntType)
        assert _chktype(2, grammar, PCFG)
        assert _chktype(3, trace, types.IntType)
        BottomUpPCFGChartParser.__init__(self, grammar, trace, **property_names)
        self._beam_size = beam_size
Example #27
    def __init__(self, fd_list, labels):
        """
        Initialize the feature detector list and label list for this
        classifier.  This constructor should be called by subclasses,
        using the statement::

            AbstractFeatureClassifier.__init__(self, fd_list, labels)
            
        @type fd_list: C{FeatureDetectorListI}
        @param fd_list: The feature detector list defining
            the features that are used by the C{Classifier}.
        @type labels: C{list} of (immutable)
        @param labels: A list of the labels that should be considered
            by this C{NBClassifier}.  Typically, labels are C{string}s
            or C{int}s.
        """
        assert _chktype(1, fd_list, FeatureDetectorListI)
        assert _chktype(2, labels, (), [])
        self._fd_list = fd_list
        self._labels = labels
Example #28
 def fv_list_likelihood(self, fv_list, label):
     # Inherit docs from AbstractFeatureClassifier
     assert _chktype(1, fv_list, FeatureValueListI)
     p = self._label_probdist.logprob(label)
     #DEBUG = '%20s' % label
     for fid, fval in fv_list.assignments():
         z = self._fval_probdist[label].prob((fid, fval))
         #DEBUG += '%10.5f' % z
         p += self._fval_probdist[label].logprob((fid, fval))
     #print DEBUG, '=> %10.8f (%10.8f)' % (p, math.exp(p))
     return math.exp(p)
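The computation above sums log-probabilities instead of multiplying probabilities, then exponentiates once at the end, which avoids underflow when many features fire.  A sketch with made-up numbers:

import math

label_logprob = math.log(0.4)                   # log P(label)
fval_logprobs = [math.log(0.5), math.log(0.2)]  # log P((fid, fval) | label)
p = label_logprob + sum(fval_logprobs)
print(math.exp(p))  # ~0.04 = 0.4 * 0.5 * 0.2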
Example #29
def attested_classes(tokens, **property_names):
    """
    @return: A list of all classes that are attested in the given list
        of tokens.
    @rtype: C{list} of (immutable)
    @param tokens: The list of tokens from which to extract classes.
    @type tokens: C{list} of (C{Token} with type C{ClassedText})
    """
    CLASS = property_names.get('CLASS', 'CLASS')
    assert _chktype(1, tokens, [Token], (Token,))
    return list(sets.Set([token[CLASS] for token in tokens]))
Example #30
def attested_classes(tokens, **property_names):
    """
    @return: A list of all classes that are attested in the given list
        of tokens.
    @rtype: C{list} of (immutable)
    @param tokens: The list of tokens from which to extract classes.
    @type tokens: C{list} of (C{Token} with type C{ClassedText})
    """
    CLASS = property_names.get('CLASS', 'CLASS')
    assert _chktype(1, tokens, [Token], (Token, ))
    return list(sets.Set([token[CLASS] for token in tokens]))
Example #31
    def __init__(self, fd_list, labels):
        """
        Initialize the feature detector list and label list for this
        classifier.  This constructor should be called by subclasses,
        using the statement::

            AbstractFeatureClassifier.__init__(self, fd_list, labels)
            
        @type fd_list: C{FeatureDetectorListI}
        @param fd_list: The feature detector list defining
            the features that are used by the C{Classifier}.
        @type labels: C{list} of (immutable)
        @param labels: A list of the labels that should be considered
            by this C{NBClassifier}.  Typically, labels are C{string}s
            or C{int}s.
        """
        assert _chktype(1, fd_list, FeatureDetectorListI)
        assert _chktype(2, labels, (), [])
        self._fd_list = fd_list
        self._labels = labels
Example #32
 def __div__(self, rhs):
     """
     @return: A new nonterminal whose symbol is C{M{A}/M{B}}, where
         C{M{A}} is the symbol for this nonterminal, and C{M{B}}
         is the symbol for rhs.
     @rtype: L{Nonterminal}
     @param rhs: The nonterminal used to form the right hand side
         of the new nonterminal.
     @type rhs: L{Nonterminal}
     """
     assert _chktype(1, rhs, Nonterminal)
     return Nonterminal('%s/%s' % (self._symbol, rhs._symbol))
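Per the docstring, dividing two nonterminals forms a slash category.  A minimal stand-in class showing the behavior (not the real Nonterminal implementation):

class Nonterminal:
    def __init__(self, symbol):
        self._symbol = symbol
    def __div__(self, rhs):  # Python 2 division hook, as above
        return Nonterminal('%s/%s' % (self._symbol, rhs._symbol))
    __truediv__ = __div__    # same behavior under Python 3

print((Nonterminal('S') / Nonterminal('NP'))._symbol)  # 'S/NP'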
Example #33
    def __init__(self, lhs, rhs):
        """
        Construct a new C{CFGProduction}.

        @param lhs: The left-hand side of the new C{CFGProduction}.
        @type lhs: L{Nonterminal}
        @param rhs: The right-hand side of the new C{CFGProduction}.
        @type rhs: sequence of (C{Nonterminal} or (terminal))
        """
        assert _chktype(1, lhs, Nonterminal)
        self._lhs = lhs
        self._rhs = tuple(rhs)
Example #34
    def __init__(self, grammar, trace=0, guessUnknown=0, spellchecker=None, **property_names):
        """
        Create a new C{ViterbiPCFGParser} that uses C{grammar} to
        parse texts.

        @type grammar: C{PCFG}
        @param grammar: The grammar used to parse texts.
        @type trace: C{int}
        @param trace: The level of tracing that should be used when
            parsing a text.  C{0} will generate no tracing output;
            and higher numbers will produce more verbose tracing
            output.
        """
        assert _chktype(1, grammar, PCFG)
        assert _chktype(2, trace, types.IntType)
        self._grammar = grammar
        self._trace = trace
        self._guessUnknown = guessUnknown
        self._guessKnown = 0
        self._spellchecker = spellchecker
        AbstractParser.__init__(self, **property_names)
Example #35
def accuracy(classifier, labeled_tokens):
    """
    @rtype: C{float}
    @return: the given classifier model's accuracy on the given list
        of labeled tokens.  This float between zero and one indicates
        what proportion of the tokens the model would label correctly.
    
    @param labeled_tokens: The tokens for which the model's
        accuracy should be computed.
    @type labeled_tokens: C{list} of (C{Token} with type
        C{LabeledText}) 
    """
    assert _chktype(1, classifier, ClassifierI)
    assert _chktype(2, labeled_tokens, [Token], (Token, ))
    total = 0
    correct = 0
    for ltok in labeled_tokens:
        utok = Token(ltok.type().text(), ltok.loc())
        if classifier.classify(utok) == ltok:
            correct += 1
        total += 1
    return float(correct) / total
Example #36
    def __init__(self, feature_detectors):
        """
        Construct a new C{SimpleFDList}.

        @param feature_detectors: The C{list} of C{FeatureDetector}s
            that make up the new C{SimpleFeatureDetector}.  The M{i}th
            element of this list is the feature detector for the
            feature with id M{i}.
        @type feature_detectors: C{sequence} of C{FeatureDetectorI}
        """
        assert _chktype(1, feature_detectors, [FeatureDetectorI],
                        (FeatureDetectorI, ))
        self._feature_detectors = feature_detectors
Example #37
    def __init__(self, assignments, len, default=0):
        """
        Construct a new C{SimpleFeatureValueList}.

        @type assignments: C{list} of (C{tuple} of C{int} and (immutable))
        @param assignments: A list of the feature value assignments for
            each feature in this feature value list whose value is not
            the default value.  These assignments are specified as a
            list of C{(id, value)} pairs.
        @type len: C{int}
        @param len: The number of features whose values are specified
            by this feature value list.
        @type default: (immutable)
        @param default: The default value for this feature value list.
            If a feature's value is not specified by C{assignments},
            then that feature's value is the default value.
        """
        assert _chktype(1, assignments, [()], ((), ))
        assert _chktype(2, len, types.IntType)
        self._assignments = assignments
        self._len = len
        self._default = default
Example #38
    def __init__(self, fd_list, labels, label_probdist, fval_probdist):
        """
        Construct a new Naive Bayes classifier model.  Typically, new
        classifier models are created by C{ClassifierTrainer}s.

        @type fd_list: C{FeatureDetectorListI}
        @param fd_list: The feature detector list defining
            the features that are used by the C{NBClassifier}.  This
            should be the same feature detector list that was used to
            construct the feature value lists that are the samples of
            C{prob_dist}.
        @type labels: C{list} of (immutable)
        @param labels: A list of the labels that should be considered
            by this C{NBClassifier}.  Typically, labels are C{string}s
            or C{int}s.

        @type label_probdist: C{ProbDistI}
        @param label_probdist: A probability distribution that
            specifies the probability that a randomly chosen text will
            have each label.  In particular,
            C{label_probdist.prob(M{l})} is the probability that a text
            has label M{l}.
        @type fval_probdist: C{ConditionalProbDist}
        @param fval_probdist: A conditional probability distribution
            that specifies the probability of each feature value,
            given a label and a feature id.  In particular,
            C{fval_probdist[M{l}, M{fid}].prob(M{fval})} is the
            probability that a text with label M{l} will assign
            feature value M{fval} to the feature whose id is M{fid}. 
        """
        assert _chktype(1, fd_list, FeatureDetectorListI)
        assert _chktype(2, labels, [], ())
        assert _chktype(3, label_probdist, ProbDistI)
        assert _chktype(4, fval_probdist, ConditionalProbDist)
                        
        self._label_probdist = label_probdist
        self._fval_probdist = fval_probdist
        AbstractFeatureClassifier.__init__(self, fd_list, labels)
Example #39
def printDirs(Dirs, margin=70, indent=0):
    """
    @return: A pretty-printed string representation of this tree.
    @rtype: C{string}
    @param margin: The right margin at which to do line-wrapping.
    @type margin: C{int}
    @param indent: The indentation level at which printing
        begins.  This number is used to decide how far to indent
        subsequent lines.
    @type indent: C{int}
    """
    assert _chktype(1, margin, types.IntType)
    assert _chktype(2, indent, types.IntType)
    #    return repr(Dirs)
    if (isinstance(Dirs, str) or isinstance(Dirs, tuple)
            or (isinstance(Dirs, list) and not isinstance(Dirs, Tree))):
        return '\n%s%s\n' % (' ' * (indent), Dirs)
    rep = reprDirs(Dirs)
    if len(rep) + indent < margin:
        if indent: return rep
        else: return rep + '\n'

    if isinstance(Dirs, Token) and Dirs.has_key('TREE'): tree = Dirs['TREE']
    else: tree = Dirs
    s = ['[', tree.node]
    for child in tree:
        if isinstance(child, Tree):
            s.extend([
                '\n', ' ' * (indent + 2),
                printDirs(child, margin, indent + 2)
            ])
        elif isinstance(child, Token):
            s.extend([' ', child['TEXT']])
        else:
            s.extend(['\n', ' ' * (indent), str(child)])
    s.append(']')
    if indent == 0: s.append('\n')
    return ''.join(s)
Example #40
def find_labels(labeled_tokens):
    """
    @return: A list of all labels that are attested in the given list
        of labeled tokens.
    @rtype: C{list} of (immutable)
    @param labeled_tokens: The list of labeled tokens from which to
        extract labels.
    @type labeled_tokens: C{list} of (C{Token} with type C{LabeledText})
    """
    assert _chktype(1, labeled_tokens, [Token], (Token, ))
    labelmap = {}
    for token in labeled_tokens:
        labelmap[token.type().label()] = 1
    return labelmap.keys()
Example #41
 def __init__(self, len, default=0):
     """
     Construct a new C{EmptyFeatureValueList}
     
     @type len: C{int}
     @param len: The number of features whose values are specified
         by this feature value list.
     @type default: (immutable)
     @param default: The default value for this feature value list.
         This is used as the feature value for every feature.
     """
     assert _chktype(1, len, types.IntType)
     self._len = len
     self._default = default
Example #42
    def detect(self, labeled_text):
        # Inherit docs from FeatureDetectorListI
        assert _chktype(1, labeled_text, LabeledText)
        lnum = self._lmap.get(labeled_text.label(), None)
        if lnum is None: return EmptyFeatureValueList(self._N)
        offset = lnum * self._num_values

        assignments = {}
        for word in labeled_text.text():
            wnum = self._wmap.get(word)
            if wnum is not None:
                assignments[wnum + offset] = 1

        return SimpleFeatureValueList(assignments.items(), self._N)
Example #43
    def classify(self, unlabeled_token):
        # Inherit docs from ClassifierI
        assert _chktype(1, unlabeled_token, Token)
        text = unlabeled_token.type()

        # (label, likelihood) pair that maximizes likelihood
        max = (None, 0)

        # Find the label that maximizes the non-normalized likelihood
        # returned by fv_list_likelihood.
        for label in self._labels:
            fv_list = self._fd_list.detect(LabeledText(text, label))
            p = self.fv_list_likelihood(fv_list, label)
            if p > max[1]: max = (label, p)

        return Token(LabeledText(text, max[0]), unlabeled_token.loc())
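The loop above is an argmax over labels.  Stripped of the token and feature machinery, it reduces to the following sketch, where score stands in for fv_list_likelihood and the numbers are made up:

def score(text, label):
    table = {('good movie', 'pos'): 0.8, ('good movie', 'neg'): 0.1}
    return table.get((text, label), 0.0)

def classify(text, labels):
    best = (None, 0)
    for label in labels:
        p = score(text, label)
        if p > best[1]:
            best = (label, p)
    return best[0]

print(classify('good movie', ['pos', 'neg']))  # 'pos'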
Example #44
    def __init__(self, freqdist):
        """
        Use the Witten-Bell estimate to create a probability distribution
        for the experiment used to generate C{freqdist}.

        @type freqdist: C{FreqDist}
        @param freqdist: The frequency distribution that the
            probability estimates should be based on.
        """
        assert _chktype(1, freqdist, FreqDist)

        self._freqdist = freqdist
        # B*N/(B+N) approximates the count of the unseen words, in the
        # spirit of Witten-Bell smoothing (B = number of distinct samples
        # seen so far, N = total number of observations).
        self._freqdist.inc(
            '$UNKNOWN$',
            self._freqdist.B() * self._freqdist.N() /
            (self._freqdist.N() + self._freqdist.B()))
        self._N = self._freqdist.N()
Example #45
def label_tokens(unlabeled_tokens, label):
    """
    @return: a list of labeled tokens, whose text and location
        correspond to C{unlabeled_tokens}, and whose labels are
        C{label}.
    @rtype: C{list} of (C{Token} with type C{LabeledText})

    @param unlabeled_tokens: The list of tokens for which a labeled
        token list should be created.
    @type unlabeled_tokens: C{list} of C{Token}
    @param label: The label for the new labeled tokens.
    @type label: (immutable)
    """
    assert _chktype(1, unlabeled_tokens, [Token], (Token, ))
    return [
        Token(LabeledText(tok.type(), label), tok.loc())
        for tok in unlabeled_tokens
    ]
Example #46
    def detect(self, labeled_text):
        # Inherit docs from FeatureDetectorListI
        assert _chktype(1, labeled_text, LabeledText)
        assignments = []
        default = None
        for i in range(len(self._sub_fd_lists)):
            offset = self._offsets[i]
            fv_list = self._sub_fd_lists[i].detect(labeled_text)
            if default != fv_list.default():
                if default is None:
                    default = fv_list.default()
                else:
                    raise ValueError('MergedFDList can ' +
                                     'not merge feature value lists ' +
                                     'with different default values.')
            assignments += [(fnum + offset, val)
                            for (fnum, val) in fv_list.assignments()]

        return SimpleFeatureValueList(assignments, self._N, default)
Example #47
    def distribution_list(self, unlabeled_token):
        # Inherit docs from ClassifierI
        assert _chktype(1, unlabeled_token, Token)
        total_p = 0.0
        text = unlabeled_token.type()

        # Construct a list containing the probability of each label.
        dist_list = []
        for label in self._labels:
            fv_list = self._fd_list.detect(LabeledText(text, label))
            p = self.fv_list_likelihood(fv_list, label)
            dist_list.append(p)
            total_p += p

        # If p=0 for all samples, return a uniform distribution.
        if total_p == 0:
            return self.zero_distribution_list(unlabeled_token)

        # Normalize the likelihoods into a probability distribution.
        return [p / total_p for p in dist_list]
Example #48
 def zero_distribution_list(self, unlabeled_token):
     """
     Return a list indicating the likelihood that
     C{unlabeled_token} is a member of each category.  This method
     is called whenever C{fv_list_likelihood} returns zero for every
      C{LabeledText} whose text is C{unlabeled_token.type()}.  Its
     default behavior is to return a uniform distribution; however,
     it can be overridden to provide a different behavior.
     Reasonable alternatives might include:
         - Return zero for each label.
         - Use a modified C{fv_list_likelihood} that allows zeros to
           "cancel out" between different label values.
     
     @return: a list of probabilities.  The M{i}th element of the
          list is the probability that C{unlabeled_token} belongs to
         C{labels()[M{i}]}'s category.
     @rtype: C{sequence} of C{float}
     @param unlabeled_token: The text to be classified.
     @type unlabeled_token: C{Token}
     """
     assert _chktype(1, unlabeled_token, Token)
     return [1.0 / len(self._labels) for l in self._labels]
Example #49
    def __init__(self, training_data, **kwargs):
        """
        Construct a new C{AttestedFeatureSelector}.  Given a
        C{FeatureDetectorList} M{fd_list}, this feature selector will
        select any feature with id M{id} such that::

            fd_list[id].detect(LabeledText(t, l)) != (default)

        for some text M{t} in C{training_data} and some label M{l}.
        
        @param kwargs: Keyword arguments.
          - C{labels}: The set of labels that should be considered
            by this C{AttestedFeatureSelector} to decide whether a
            feature can apply to a text.  If none is given, then the
            set of all labels attested in the training data will be
            used instead.  (type=C{list} of (immutable)).             
          - C{min_count}: The minimum number of C{LabeledText}s to 
            which a feature must apply, in order to be included in the
            feature value list.  Default=1.  (type=C{int})
        """
        assert _chktype(1, training_data, [Token], (Token, ))
        self._training_data = training_data

        # Process the keyword arguments.
        self._min_count = 1
        self._labels = None
        for (key, val) in kwargs.items():
            if key == 'min_count':
                self._min_count = val
            elif key == 'labels':
                self._labels = val
            else:
                raise TypeError('Unknown keyword arg %s' % key)

        # Find the labels, if necessary.
        if self._labels is None:
            self._labels = find_labels(training_data)
Example #50
 def __getitem__(self, feature_id):
     # Inherit docs from FeatureValueListI
     assert _chktype(1, feature_id, types.IntType)
     return self._default
Example #51
 def detect(self, labeled_text):
     # Inherit docs from FeatureDetectorListI
     assert _chktype(1, labeled_text, LabeledText)
     values = [fd.detect(labeled_text) for fd in self._feature_detectors]
     return ArrayFeatureValueList(values)
Example #52
 def detect(self, labeled_text):
     # Inherit docs from FeatureDetectorI
     assert _chktype(1, labeled_text, LabeledText)
     return self._func(labeled_text)
Example #53
 def prob(self, labeled_token):
     # Inherit docs from ClassifierI
     assert _chktype(1, labeled_token, Token)
     text = labeled_token.type().text()
     label = labeled_token.type().label()
     utok = Token(text, labeled_token.loc())
     return self.distribution_dictionary(utok)[label]
Example #54
 def __add__(self, other):
     # Inherit docs from FeatureDetectorListI
     assert _chktype(1, other, FeatureDetectorListI)
     return MergedFDList(self, other)
Example #55
    def train(self, train_toks, **kwargs):
        """
        Train a new C{ConditionalExponentialClassifier}, using the
        given training samples.  This
        C{ConditionalExponentialClassifier} should encode the model
        that maximizes entropy from among all the models that are
        empirically consistent with C{train_toks}.
        
        @param kwargs: Keyword arguments.
          - C{iterations}: The maximum number of times IIS should
            iterate.  If IIS converges before this number of
            iterations, it may terminate.  Default=C{20}.
            (type=C{int})
            
          - C{debug}: The debugging level.  Higher values will cause
            more verbose output.  Default=C{0}.  (type=C{int})
            
          - C{classes}: The set of possible classes.  If none is given,
            then the set of all classes attested in the training data
            will be used instead.  (type=C{list} of (immutable)).
            
          - C{accuracy_cutoff}: The accuracy value that indicates
            convergence.  If the accuracy becomes closer to one
            than the specified value, then IIS will terminate.  The
            default value is None, which indicates that no accuracy
            cutoff should be used. (type=C{float})

          - C{delta_accuracy_cutoff}: The change in accuracy that should
            be taken to indicate convergence.  If the accuracy changes by
            less than this value in a single iteration, then IIS will
            terminate.  The default value is C{None}, which indicates
            that no accuracy-change cutoff should be
            used. (type=C{float})

          - C{log_likelihood_cutoff}: specifies what log-likelihood
            value should be taken to indicate convergence.  If the
            log-likelihood becomes closer to zero than the specified
            value, then IIS will terminate.  The default value is
            C{None}, which indicates that no log-likelihood cutoff
            should be used. (type=C{float})

          - C{delta_log_likelihood_cutoff}: specifies what change in
            log-likelihood should be taken to indicate convergence.
            If the log-likelihood changes by less than this value in a
            single iteration, then IIS will terminate.  The default
            value is C{None}, which indicates that no
            log-likelihood-change cutoff should be used.  (type=C{float})
        """
        assert _chktype(1, train_toks, [Token], (Token, ))
        # Process the keyword arguments.
        iter = 20
        debug = 0
        classes = None
        ll_cutoff = lldelta_cutoff = None
        acc_cutoff = accdelta_cutoff = None
        for (key, val) in kwargs.items():
            if key in ('iterations', 'iter'): iter = val
            elif key == 'debug': debug = val
            elif key == 'classes': classes = val
            elif key == 'log_likelihood_cutoff':
                ll_cutoff = abs(val)
            elif key == 'delta_log_likelihood_cutoff':
                lldelta_cutoff = abs(val)
            elif key == 'accuracy_cutoff':
                acc_cutoff = abs(val)
            elif key == 'delta_accuracy_cutoff':
                accdelta_cutoff = abs(val)
            else:
                raise TypeError('Unknown keyword arg %s' % key)
        # Find the classes, if necessary, and remember them.
        if classes is None:
            classes = attested_classes(train_toks)
        self._classes = classes

        # Find the length of the first token's feature vector.
        if len(train_toks) == 0:
            raise ValueError('Expected at least one training token')
        vector0 = train_toks[0]['FEATURE_VECTOR']
        self._feature_vector_len = len(vector0)
        self._weight_vector_len = self._feature_vector_len * len(self._classes)

        # Build the offsets dictionary.  This maps from a class to the
        # index in the weight vector where that class's weights begin.
        self._offsets = dict([(cls, i * self._feature_vector_len)
                              for i, cls in enumerate(classes)])

        # Find the frequency with which each feature occurs in the
        # training data.
        ffreq_emperical = self._ffreq_emperical(train_toks)

        # Find the nf map, and related variables nfarray and
        # nftranspose.  nf is the sum of the features for a given
        # labeled text.  nfmap compresses this sparse set of values
        # to a dense list, nfarray performs the reverse operation,
        # and nftranspose is nfarray as a column vector.
        nfmap = self._nfmap(train_toks)
        nfs = nfmap.items()
        nfs.sort(lambda x, y: cmp(x[1], y[1]))
        nfarray = numarray.array([nf for (nf, i) in nfs], 'd')
        nftranspose = numarray.reshape(nfarray, (len(nfarray), 1))

        # An array that is 1 whenever ffreq_emperical is zero.  In
        # other words, it is one for any feature that's not attested
        # in the data.  This is used to avoid division by zero.
        unattested = numarray.zeros(self._weight_vector_len, 'd')
        for i in range(len(unattested)):
            if ffreq_emperical[i] == 0: unattested[i] = 1

        # Build the classifier.  Start with weight=1 for each feature,
        # except for the unattested features.  Start those out at
        # zero, since we know that's the correct value.
        weights = numarray.ones(self._weight_vector_len, 'd')
        weights -= unattested
        classifier = ConditionalExponentialClassifier(classes, weights)

        if debug > 0: print('  ==> Training (%d iterations)' % iter)
        if debug > 2:
            print('')
            print('      Iteration    Log Likelihood    Accuracy')
            print('      ---------------------------------------')

        # Train for a fixed number of iterations.
        ll_old = acc_old = None
        for iternum in range(iter):
            if debug > 2:
                print('     %9d    %14.5f    %9.3f' %
                      (iternum,
                       classifier_log_likelihood(classifier, train_toks),
                       classifier_accuracy(classifier, train_toks)))

            # Calculate the deltas for this iteration, using Newton's method.
            deltas = self._deltas(train_toks, classifier, unattested,
                                  ffreq_emperical, nfmap, nfarray, nftranspose)

            # Use the deltas to update our weights.
            weights = classifier.weights()
            weights *= numarray.exp(deltas)
            classifier.set_weights(weights)

            # Check log-likelihood cutoffs.
            if ll_cutoff is not None or lldelta_cutoff is not None:
                ll = classifier_log_likelihood(classifier, train_toks)
                if ll_cutoff is not None and ll > -ll_cutoff: break
                if lldelta_cutoff is not None:
                    if ll_old is not None and (ll - ll_old) < lldelta_cutoff:
                        break
                    ll_old = ll

            # Check accuracy cutoffs.
            if acc_cutoff is not None or accdelta_cutoff is not None:
                acc = classifier_accuracy(classifier, train_toks)
                if acc_cutoff is not None and acc < acc_cutoff: break
                if accdelta_cutoff is not None:
                    if acc_old is not None and (acc - acc_old) < accdelta_cutoff:
                        break
                    acc_old = acc

        if debug > 2:
            print('     %9d    %14.5f    %9.3f' %
                  (iternum + 1,
                   classifier_log_likelihood(classifier, train_toks),
                   classifier_accuracy(classifier, train_toks)))
            print('')

        # Return the classifier.
        return classifier
Example #56
 def detect(self, labeled_text):
     # Inherit docs from FeatureDetectorListI
     assert _chktype(1, labeled_text, LabeledText)
     return AlwaysOnFDList._FVLIST
Example #57
 def __getitem__(self, index):
     # index is an (actual label index, predicted label index) pair.
     assert _chktype(1, index, types.TupleType)
     return self._confusion[index[0], index[1]]
Example #58
 def __getitem__(self, feature_id):
     # Inherit docs from FeatureDetectorListI
     assert _chktype(1, feature_id, types.IntType)
     if feature_id >= len(self) or feature_id < 0:
         raise IndexError('FeatureDetectorList index out of range')
     return self._feature_detectors[feature_id]