def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """
    # use a set of tokens with 2-D feature vectors
    tokens = [Token(FEATURES=Numeric.array([3, 3])),
              Token(FEATURES=Numeric.array([1, 2])),
              Token(FEATURES=Numeric.array([4, 2])),
              Token(FEATURES=Numeric.array([4, 0])),
              Token(FEATURES=Numeric.array([2, 3])),
              Token(FEATURES=Numeric.array([3, 1]))]
    
    # test k-means using the euclidean distance metric, 2 means and repeat
    # clustering 10 times with random seeds
    clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
    clusterer.cluster(tokens, True)
    print 'using clusterer', clusterer
    print 'clustered', str(tokens)[:60], '...'
    # classify a new token
    token = Token(FEATURES=Numeric.array([3, 3]))
    print 'classify(%s)' % token,
    clusterer.classify(token)
    print token

    # test the GAAC clusterer with 4 clusters
    clusterer = GroupAverageAgglomerativeClusterer(4)
    print 'using clusterer', clusterer
    clusterer.cluster(tokens, True)
    #print 'clustered', str(tokens)[:60], '...'
    print 'clustered', tokens
    # show the dendogram
    clusterer.dendogram().show()
    # classify a new token
    token = Token(FEATURES=Numeric.array([3, 3]))
    print 'classify(%s)' % token,
    clusterer.classify(token)
    print token
    print

    # test the EM clusterer with means given by k-means (2) and
    # dimensionality reduction
    clusterer = KMeansClusterer(2, euclidean_distance, svd_dimensions=1)
    clusterer.cluster(tokens)
    means = clusterer.means()
    clusterer = ExpectationMaximizationClusterer(means, svd_dimensions=1)
    clusterer.cluster(tokens, True)
    print 'using clusterer', clusterer
    print 'clustered', str(tokens)[:60], '...'
    # classify a new token
    token = Token(FEATURES=Numeric.array([3, 3]))
    print 'classify(%s)' % token,
    clusterer.classify(token)
    print token
    # show the classification probabilities
    token = Token(FEATURES=Numeric.array([2.2, 2]))
    print 'classification_probdist(%s)' % token
    clusterer.classification_probdist(token)
    for sample in token['CLUSTER_PROBDIST'].samples():
        print '%s => %.0f%%' % (sample,
                                token['CLUSTER_PROBDIST'].prob(sample) * 100)
Example #2
def demo_em():
    # example from figure 14.10, page 519, Manning and Schutze
    tokens = [
        Token(FEATURES=Numeric.array(f))
        for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]
    ]
    means = [[4, 2], [4, 2.01]]

    clusterer = ExpectationMaximizationClusterer(means, bias=0.1)
    clusterer.cluster(tokens, True, trace=True)

    print 'clustered', tokens
    for c in range(2):
        print 'cluster %d' % c
        print 'prior', clusterer._priors[c]
        print 'mean ', clusterer._means[c]
        print 'covar', clusterer._covariance_matrices[c]

    # classify a new token
    token = Token(FEATURES=Numeric.array([2, 2]))
    print 'classify(%s)' % token,
    clusterer.classify(token)
    print token

    # show the classification probabilities
    token = Token(FEATURES=Numeric.array([2, 2]))
    print 'classification_probdist(%s)' % token
    clusterer.classification_probdist(token)
    for sample in token['CLUSTER_PROBDIST'].samples():
        print '%s => %.0f%%' % (sample,
                                token['CLUSTER_PROBDIST'].prob(sample) * 100)
    def raw_tag(self, words):
        SUBTOKENS = self.property('SUBTOKENS')
        TEXT = self.property('TEXT')
        TAG = self.property('TAG')

        subtoks = [Token({TEXT: w}) for w in words]
        token = Token({SUBTOKENS: subtoks})
        self.tag(token)
        return [subtok[TAG] for subtok in token[SUBTOKENS]]
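# Hedged usage sketch for raw_tag (the tagger instance below is hypothetical,
# not part of this excerpt): given a plain list of words it wraps each word
# in a Token, tags the whole sequence in place, and returns one tag per word,
# in order:
#
#     tags = some_tagger.raw_tag(['the', 'cat', 'sat'])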
    def __init__(self, classifier, labeled_tokens):
        """
        Entry conf[i][j] is the number of times a document with label i
        was given label j.
        """
        assert _chktype(1, classifier, ClassifierI)
        assert _chktype(2, labeled_tokens, [Token], (Token, ))
        try:
            import Numeric
        except:
            raise ImportError('ConfusionMatrix requires Numeric')

        # Extract the labels.
        ldict = {}
        for ltok in labeled_tokens:
            ldict[ltok.type().label()] = 1
        labels = ldict.keys()

        # Construct a label->index dictionary
        indices = {}
        for i in range(len(labels)):
            indices[labels[i]] = i

        confusion = Numeric.zeros((len(labels), len(labels)))
        for ltok in labeled_tokens:
            utok = Token(ltok.type().text(), ltok.loc())
            ctok = classifier.classify(utok)
            confusion[indices[ltok.type().label()],
                      indices[ctok.type().label()]] += 1

        self._labels = labels
        self._confusion = confusion
        self._max_conf = max(Numeric.resize(confusion, (len(labels)**2, )))
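# A tiny stand-alone illustration of the conf[i][j] convention documented
# above, with made-up labels and classifications (not tied to any corpus):
# the row indexes the true label, the column the label the classifier
# assigned, exactly as in the loop over labeled_tokens above.
import Numeric
_labels = ['pos', 'neg']
_conf = Numeric.zeros((2, 2))
for _true, _assigned in [('pos', 'pos'), ('pos', 'neg'), ('neg', 'neg')]:
    _conf[_labels.index(_true), _labels.index(_assigned)] += 1
print _conf    # [[1 1]
               #  [0 1]]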
Example #5
    def parseToken(self, text, interactive=0, trace=1, draw=0, print_parses=1,
                   cumStats=None, chunker=None, trueTree=None):
        if chunker is None: chunker = self.period_chunker
        if self.stats:
            dirStats = parseStats()
        else:
            Parses = []
        chunker.parse(text)
        for sent in text['TREE']:
            if not isinstance(sent, Tree): continue
            sentToken = Token(WORDS=sent.leaves(), SUBTOKENS=sent.leaves())
            print 'parsing', sentToken
            if self.stats:
                parse_stats(self.Parsers, sentToken, dirStats, trace, trueTree)
                print_parse_summary(self.Parsers, dirStats, interactive, draw,
                                    print_parses)
                # Check for an empty parse
                if (dirStats.parse_list == [] or
                        not isinstance(dirStats.parse_list[-1][0], Tree)):
                    chunker2 = None
                    if chunker != self.period_chunker and sent.count('.') > 1:
                        chunker2 = self.period_chunker
                    elif chunker != self.punct_chunker and sent.count('.') <= 1:
                        chunker2 = self.punct_chunker
                    else:
                        chunker2 = self.unigramTag(dirStats, sentToken)
                    if chunker2:
                        if __debug__: print 'No parse, retry', chunker2
                        dirStats = self.parseToken(sentToken, interactive,
                                                   trace, draw, print_parses,
                                                   dirStats, chunker2, trueTree)
                ###elif __debug__: print 'Good parse, no retry', dirStats.parse_list
            else:
                parse = self.Parsers[0].get_parse_list(sentToken)
                if parse: Parses.append(parse[0])
        if self.stats:
            if cumStats:
                cumStats += dirStats.sum()
                return cumStats
            else:
                return dirStats.sum()
        else:
            return Parses
def log_likelihood(classifier, labeled_tokens):
    """
    Evaluate the log likelihood of the given list of labeled
    tokens for the given classifier model.  This nonpositive float
    gives an indication of how well the classifier models the
    data.  Values closer to zero indicate that it models it more
    accurately.

    @rtype: C{float}
    @return: The log likelihood of C{labeled_tokens} for the given
        classifier model.
    @param labeled_tokens: The tokens whose log likelihood should
        be computed.
    @type labeled_tokens: C{list} of (C{Token} with type
        C{LabeledText}) 
    """
    assert _chktype(1, classifier, ClassifierI)
    assert _chktype(2, labeled_tokens, [Token], (Token, ))
    likelihood = 0.0
    for ltok in labeled_tokens:
        utok = Token(ltok.type().text(), ltok.loc())
        label = ltok.type().label()
        dist = classifier.distribution_dictionary(utok)
        if dist[label] == 0:
            # Use some approximation to infinity.  What this does
            # depends on your system's float implementation.
            likelihood -= 1e1000
        else:
            likelihood += math.log(dist[label])

    return likelihood / len(labeled_tokens)
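# Worked illustration of the average log likelihood returned above, with
# hand-made probabilities (no classifier involved): if the true labels of
# two tokens receive probabilities 0.8 and 0.5, the result is
# (log 0.8 + log 0.5) / 2, roughly -0.458.
import math
print (math.log(0.8) + math.log(0.5)) / 2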
def demo_pos():
    from sys import stdout

    print 'Training HMM...'
    labelled_sequences, tag_set, num_features = load_pos()
    trainer = MultiOutputHMMTrainer(tag_set, [[] for x in range(num_features)])
    hmm = trainer.train_supervised(
        Token(SUBTOKENS=labelled_sequences[100:]),
        estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

    print 'Testing...'

    for super_token in labelled_sequences[:3]:
        print super_token
        print 'HMM >>>'
        print hmm.best_path(super_token.exclude('TAG'))
        print '-' * 60

    count = correct = 0
    for super_token in labelled_sequences[:100]:
        print '.',
        stdout.flush()
        pts = hmm.best_path(super_token.exclude('TAG'))
        for token, tag in zip(super_token['SUBTOKENS'], pts):
            count += 1
            if tag == token['TAG']:
                correct += 1

    print 'accuracy over first %d tokens: %.1f%%' % (count,
                                                     100.0 * correct / count)
    def classify(self, unlabeled_token):
        # inherit doco
        fv_list = self._fd_list.detect(unlabeled_token.type())
        fnums = map(lambda x: x[0], fv_list.assignments())
        leaf = self._root.traverse(fnums)
        label = leaf.label()
        return Token(LabeledText(unlabeled_token.type(), label),
                     unlabeled_token.loc())
Example #9
def demo():
    # demonstrates HMM probability calculation

    # example taken from page 381, Huang et al
    symbols = ['up', 'down', 'unchanged']
    states = ['bull', 'bear', 'static']

    def pd(values, samples):
        d = {}
        for value, item in zip(values, samples):
            d[item] = value
        return DictionaryProbDist(d)

    def cpd(array, conditions, samples):
        d = {}
        for values, condition in zip(array, conditions):
            d[condition] = pd(values, samples)
        return DictionaryConditionalProbDist(d)

    A = array([[0.6, 0.2, 0.2], [0.5, 0.3, 0.2], [0.4, 0.1, 0.5]], Float64)
    A = cpd(A, states, states)
    B = array([[0.7, 0.1, 0.2], [0.1, 0.6, 0.3], [0.3, 0.3, 0.4]], Float64)
    B = cpd(B, states, symbols)
    pi = array([0.5, 0.2, 0.3], Float64)
    pi = pd(pi, states)

    model = HiddenMarkovModel(symbols=symbols,
                              states=states,
                              transitions=A,
                              outputs=B,
                              priors=pi)

    print 'Testing', model

    for test in [['up'] * 2, ['up'] * 5, ['up', 'down', 'up'], ['down'] * 5,
                 ['unchanged'] * 5 + ['up']]:

        token = Token(SUBTOKENS=map(lambda t: Token(TEXT=t), test))
        print 'Testing with observation sequence', test
        print 'probability =', model.probability(token)
        print 'tagging =    ', model.tag(token)
        print 'p(tagged) =  ', model.probability(token)
        print
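# Hand check of the first test case above (['up', 'up']), assuming that
# probability() marginalises over all state sequences for an untagged input
# (the forward probability) -- an assumption about the API, not something
# this demo asserts.  Using the matrices from the demo:
_pi = [0.5, 0.2, 0.3]                                     # bull, bear, static
_A = [[0.6, 0.2, 0.2], [0.5, 0.3, 0.2], [0.4, 0.1, 0.5]]  # row = from-state
_b_up = [0.7, 0.1, 0.3]                                   # P(up | state)
_alpha = [_pi[i] * _b_up[i] for i in range(3)]
_alpha = [sum([_alpha[i] * _A[i][j] for i in range(3)]) * _b_up[j]
          for j in range(3)]
print sum(_alpha)                                         # ~0.2234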
Example #10
    def random_sample(self, rng, length):
        """
        Randomly sample the HMM to generate a sentence of a given length. This
        samples the prior distribution then the observation distribution and
        transition distribution for each subsequent observation and state.
        This will mostly generate unintelligible garbage, but can provide some
        amusement.

        @return:        the randomly created state/observation sequence,
                        generated according to the HMM's probability
                        distributions. The SUBTOKENS have TEXT and TAG
                        properties containing the observation and state
                        respectively.
        @rtype:         Token
        @param rng:     random number generator
        @type rng:      Random (or any object with a random() method)
        @param length:  desired output length
        @type length:   int
        """
        assert chktype(2, length, types.IntType)

        # load the property names
        SUBTOKENS = self._properties.get('SUBTOKENS', 'SUBTOKENS')
        TEXT = self._properties.get('TEXT', 'TEXT')
        TAG = self._properties.get('TAG', 'TAG')

        # sample the starting state and symbol prob dists
        tokens = []
        state = self._sample_probdist(self._priors, rng.random(), self._states)
        symbol = self._sample_probdist(self._outputs[state], rng.random(),
                                       self._symbols)
        tokens.append(Token(TEXT=symbol, TAG=state))

        for i in range(1, length):
            # sample the state transition and symbol prob dists
            state = self._sample_probdist(self._transitions[state],
                                          rng.random(), self._states)
            symbol = self._sample_probdist(self._outputs[state], rng.random(),
                                           self._symbols)
            tokens.append(Token(TEXT=symbol, TAG=state))

        return Token(SUBTOKENS=tokens)
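# _sample_probdist is not shown in this excerpt; a minimal sketch of the
# inverse-CDF sampling it presumably performs (an assumption about the
# implementation, not the actual NLTK code): walk the samples, accumulating
# probability mass until the random draw p is covered.
def _sample_probdist_sketch(probdist, p, samples):
    cumulative = 0.0
    for sample in samples:
        cumulative += probdist.prob(sample)
        if p <= cumulative:
            return sample
    return samples[-1]    # guard against floating point drift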
def demo_kmeans():
    # example from figure 14.9, page 517, Manning and Schutze
    tokens = [Token(FEATURES=Numeric.array(f))
              for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
    means = [[4, 3], [5, 5]]

    clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
    clusterer.cluster(tokens, True, trace=True)

    print 'clustered', tokens
    print 'means', clusterer.means()
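# Working one iteration by hand (same data and initial means as above):
# (2, 1) and (1, 3) are closer to (4, 3), while (4, 7) and (6, 7) are closer
# to (5, 5), so the means move to (1.5, 2) and (5, 7); re-assigning the
# points then leaves the clusters unchanged, which is where k-means should
# converge for this example.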
Example #12
def demo_pos_bw():
    # demonstrates the Baum-Welch algorithm in POS tagging
    from nltk.set import MutableSet

    print 'Training HMM (supervised)...'
    labelled_sequences, tag_set = load_pos()
    symbol_set = MutableSet()
    for sequence in labelled_sequences:
        for token in sequence['SUBTOKENS']:
            symbol_set.insert(token['TEXT'])

    trainer = HiddenMarkovModelTrainer(tag_set, symbol_set.elements())
    hmm = trainer.train_supervised(
        Token(SUBTOKENS=labelled_sequences[100:300]),
        estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
    print 'Training (unsupervised)...'
    # it's rather slow - so only use 10 samples
    unlabelled = Token(SUBTOKENS=_untag(labelled_sequences[301:311]))
    hmm = trainer.train_unsupervised(unlabelled, model=hmm, max_iterations=5)
    test_pos(hmm, labelled_sequences[:100], True)
Example #13
def demo_pos():
    # demonstrates POS tagging using supervised training

    print 'Training HMM...'
    labelled_sequences, tag_set, symbols = load_pos()
    trainer = HiddenMarkovModelTrainer(tag_set, symbols)
    hmm = trainer.train_supervised(
        Token(SUBTOKENS=labelled_sequences[100:]),
        estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

    print 'Testing...'
    test_pos(hmm, labelled_sequences[:100], True)
    def randomtreetok(depth=0, left=0, bf=None):
        if bf is None: bf = randint(1, 2)
        if randint(0, 7-depth) == 0 and depth > 1:
            length = randint(1, 5)
            return Token('L%d' % randint(0, 10), left, left+length)
        else:
            numchildren = randint(1, bf)
            children = []
            for x in range(numchildren):
                children.append(randomtreetok(depth+1, left, bf))
                left = children[-1].loc().end()
            return TreeToken('Node %d' % randint(0, 10000), *children)
def load_pos():
    from nltk.corpus import brown
    from nltk.tagger import TaggedTokenizer

    tagged_tokens = []
    for item in brown.items()[:5]:
        tagged_tokens.append(brown.tokenize(item))

    tag_set = [
        "'", "''", '(', ')', '*', ',', '.', ':', '--', '``', 'abl', 'abn',
        'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz', 'beg', 'bem', 'ben',
        'ber', 'bez', 'cc', 'cd', 'cd$', 'cs', 'do', 'dod', 'doz', 'dt', 'dt$',
        'dti', 'dts', 'dtx', 'ex', 'fw', 'hv', 'hvd', 'hvg', 'hvn', 'hvz',
        'in', 'jj', 'jjr', 'jjs', 'jjt', 'md', 'nn', 'nn$', 'nns', 'nns$',
        'np', 'np$', 'nps', 'nps$', 'nr', 'nr$', 'od', 'pn', 'pn$', 'pp$',
        'ppl', 'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb', 'rb$', 'rbr',
        'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbz', 'wdt',
        'wp$', 'wpo', 'wps', 'wql', 'wrb'
    ]

    sequences = []
    sequence = []
    start_re = re.compile(r'[^-*+]*')
    for token in tagged_tokens:
        # the multi-output allows us to treat each word as a
        # tuple of features
        for sub_token in token['SUBTOKENS']:
            sequence.append(sub_token)
            # a feature for the lower-cased word
            features = [sub_token['TEXT'].lower()]
            # a feature for the 3-character word suffix
            features.append(sub_token['TEXT'][-3:])
            # a feature for the length of words
            features.append(len(sub_token['TEXT']))
            # store the observation as a tuple of features
            sub_token['TEXT'] = tuple(features)
            m = start_re.match(sub_token['TAG'])
            # cleanup the tag
            tag = m.group(0)
            if tag in tag_set:
                sub_token['TAG'] = tag
            else:
                sub_token['TAG'] = '*'
            # split on the period tag
            if sub_token['TAG'] == '.':
                sequences.append(Token(SUBTOKENS=sequence))
                sequence = []

    return sequences, tag_set, 3
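# For illustration, the feature extraction above turns a (hypothetical) word
# such as 'Fulton' into the observation tuple ('fulton', 'ton', 6): the
# lower-cased text, its 3-character suffix, and its length.
_word = 'Fulton'
print (_word.lower(), _word[-3:], len(_word))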
def _demo_stemmer(stemmer):
    # Tokenize a sample text.
    from nltk.tokenizer import WhitespaceTokenizer
    text = Token(TEXT='John was eating icecream')
    WhitespaceTokenizer().tokenize(text)

    # Use the stemmer to stem it.
    for word in text['SUBTOKENS']:
        stemmer.stem(word)

    # Print the results.
    print stemmer
    for word in text['SUBTOKENS']:
        print '%20s => %s' % (word['TEXT'], word['STEM'])
    print
    def classify(self, unlabeled_token):
        # Inherit docs from ClassifierI
        assert _chktype(1, unlabeled_token, Token)
        text = unlabeled_token.type()

        # (label, likelihood) pair that maximizes likelihood
        best = (None, 0)

        # Find the label that maximizes the non-normalized probability
        # fv_list_likelihood().
        for label in self._labels:
            fv_list = self._fd_list.detect(LabeledText(text, label))
            p = self.fv_list_likelihood(fv_list, label)
            if p > best[1]: best = (label, p)

        return Token(LabeledText(text, best[0]), unlabeled_token.loc())
def tree2frame(Dirs, index = 0, parent = ''):
    """
    @return: content frame representation of the surface semantics of the parse tree.
    @rtype: C{SurfaceSemanticsStructure}
    
    @return proposition name
    @rtype: C{str}
    
    @return index
    @rtype: C{int}
    """
    Frame = SurfaceSemanticsStructure()
    if isinstance(Dirs,Tree):
        Prop = Dirs.node.capitalize()
        hasSubTree = True in [isinstance(child,Tree) for child in Dirs]
    else: Prop = None
    if isinstance(Dirs,Tree) and hasSubTree:
        for i,child in enumerate(Dirs):
            value,prop,index = tree2frame(child,index+1,Dirs.node.capitalize())
            filed = False # Account for children with the same names
            if value and prop:
                prop_name = prop
                while not filed:
                    if not Frame.has_key(prop):
                        Frame[prop] = value
                        filed = True
                    else:
                        prop = prop_name + '_' + str(i)
            elif value:
                Frame1 = Frame.unify(value)
                if Frame1: Frame = Frame1
                else:
                    while not filed:
                        if not Frame.has_key('SubFrame'+'_'+str(index)):
                            Frame['SubFrame'+'_'+str(index)] = value
                            filed = True
    elif ((isinstance(Dirs,Tree) and not hasSubTree and Dirs)
          or isinstance(Dirs,Token)):
        index += 1
        if isinstance(Dirs,Token): token = Dirs
        if isinstance(Dirs,Tree):
            token = Token(TEXT=' '.join([child['TEXT'] for child in Dirs]))
            parent = Dirs.node.capitalize()
        Frame['TEXT'] = token['TEXT']
        Frame['MEAN'] = extractSurfaceSemantics(token,parent)
        Frame['INDEX'] = index
    return Frame,Prop,index
Example #19
def demo_bw():
    # demo Baum Welch by generating some sequences and then performing
    # unsupervised training on them

    # example taken from page 381, Huang et al
    symbols = ['up', 'down', 'unchanged']
    states = ['bull', 'bear', 'static']

    def pd(values, samples):
        d = {}
        for value, item in zip(values, samples):
            d[item] = value
        return DictionaryProbDist(d)

    def cpd(array, conditions, samples):
        d = {}
        for values, condition in zip(array, conditions):
            d[condition] = pd(values, samples)
        return DictionaryConditionalProbDist(d)

    A = array([[0.6, 0.2, 0.2], [0.5, 0.3, 0.2], [0.4, 0.1, 0.5]], Float64)
    A = cpd(A, states, states)
    B = array([[0.7, 0.1, 0.2], [0.1, 0.6, 0.3], [0.3, 0.3, 0.4]], Float64)
    B = cpd(B, states, symbols)
    pi = array([0.5, 0.2, 0.3], Float64)
    pi = pd(pi, states)

    model = HiddenMarkovModel(symbols=symbols,
                              states=states,
                              transitions=A,
                              outputs=B,
                              priors=pi)

    # generate some random sequences
    training = []
    import random
    rng = random.Random()
    for i in range(10):
        item = model.random_sample(rng, 5)
        training.append(item)
    training = Token(SUBTOKENS=training)

    # train on those examples, starting with the model that generated them
    trainer = HiddenMarkovModelTrainer(states, symbols)
    hmm = trainer.train_unsupervised(training,
                                     model=model,
                                     max_iterations=1000)
def _get_toks(file='ca01', debug=0):
    """
    Load tokens from the given file.  
    """
    assert _chktype(1, file, types.StringType)
    assert _chktype(2, debug, types.IntType)
    
    _resettime()
    if debug: print _timestamp(), 'tokenizing', file

    ttoks = brown.tokenize(file)

    labeled_tokens = [Token(LabeledText(tok.type().base().lower(),
                                        tok.type().tag()),
                            tok.loc())
                      for tok in ttoks]
    if debug: print _timestamp(), '  done tokenizing'
    return labeled_tokens
def label_tokens(unlabeled_tokens, label):
    """
    @return: a list of labeled tokens, whose text and location
        correspond to C{unlabeled_tokens}, and whose labels are
        C{label}.
    @rtype: C{list} of (C{Token} with type C{LabeledText})

    @param unlabeled_tokens: The list of tokens for which a labeled
        token list should be created.
    @type unlabeled_tokens: C{list} of C{Token}
    @param label: The label for the new labeled tokens.
    @type label: (immutable)
    """
    assert _chktype(1, unlabeled_tokens, [Token], (Token, ))
    return [
        Token(LabeledText(tok.type(), label), tok.loc())
        for tok in unlabeled_tokens
    ]
Example #22
    def stem(self, token):
        # inherit docs from StemmerI
        # TODO - when the new token comes out, use it to get the
        # part-of-speech, thus narrowing the search (and getting eg.
        # fly/verb for the query flies, rather than the plural noun).
        # This will only match the first POS from the list below...
        for pos in [NOUN, VERB, ADJECTIVE, ADVERB]:
            stemmed = morphy(token.type().lower(), pos)
            if stemmed:
                # restore the case
                new_string = ''
                for index in range(min(len(token.type()), len(stemmed))):
                    if token.type()[index].isupper():
                        new_string += stemmed[index].upper()
                    else:
                        new_string += stemmed[index]
                return Token(new_string, token.loc())

        return token
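# Stand-alone illustration of the case-restoration loop above, using
# hypothetical strings rather than an actual WordNet lookup: copying the
# case pattern of 'Flies' onto the stem 'fly' yields 'Fly'.
_original, _stemmed = 'Flies', 'fly'
_restored = ''
for _index in range(min(len(_original), len(_stemmed))):
    if _original[_index].isupper():
        _restored += _stemmed[_index].upper()
    else:
        _restored += _stemmed[_index]
print _restored    # Fly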
Example #23
def load_pos():
    from nltk.corpus import brown

    tagged_tokens = []
    for item in brown.items()[:5]:
        tagged_tokens.append(brown.read(item))

    tag_set = [
        "'", "''", '(', ')', '*', ',', '.', ':', '--', '``', 'abl', 'abn',
        'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz', 'beg', 'bem', 'ben',
        'ber', 'bez', 'cc', 'cd', 'cd$', 'cs', 'do', 'dod', 'doz', 'dt', 'dt$',
        'dti', 'dts', 'dtx', 'ex', 'fw', 'hv', 'hvd', 'hvg', 'hvn', 'hvz',
        'in', 'jj', 'jjr', 'jjs', 'jjt', 'md', 'nn', 'nn$', 'nns', 'nns$',
        'np', 'np$', 'nps', 'nps$', 'nr', 'nr$', 'od', 'pn', 'pn$', 'pp$',
        'ppl', 'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb', 'rb$', 'rbr',
        'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbz', 'wdt',
        'wp$', 'wpo', 'wps', 'wql', 'wrb'
    ]

    sequences = []
    sequence = []
    symbols = {}
    start_re = re.compile(r'[^-*+]*')
    for token in tagged_tokens:
        for sub_token in token['WORDS']:
            sequence.append(sub_token)
            # make words lower case
            sub_token['TEXT'] = sub_token['TEXT'].lower()
            symbols[sub_token['TEXT']] = 1
            m = start_re.match(sub_token['TAG'])
            # cleanup the tag
            tag = m.group(0)
            if tag in tag_set:
                sub_token['TAG'] = tag
            else:
                sub_token['TAG'] = '*'
            # split on the period tag
            if sub_token['TAG'] == '.':
                sequences.append(Token(SUBTOKENS=sequence))
                sequence = []

    return sequences, tag_set, symbols.keys()
def demo(best_path, cache_factory):
    # demonstrates POS tagging using supervised training

    print 'Training HMM...'
    labelled_sequences, tag_set, symbols = load_pos()
    trainer = HiddenMarkovModelTrainer(tag_set, symbols)
    hmm = trainer.train_supervised(
        Token(SUBTOKENS=labelled_sequences[100:]),
        estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

    print 'Creating cache', cache_factory
    cache = cache_factory(hmm)

    print 'Overriding best_path with', best_path
    hmm.__class__.best_path = lambda self, seq: best_path(self, seq, cache)

    print 'Testing...'
    import time
    start = time.clock()
    test_pos(hmm, labelled_sequences[:100], True)
    print 'elapsed time', (time.clock() - start)
def accuracy(classifier, labeled_tokens):
    """
    @rtype: C{float}
    @return: the given classifier model's accuracy on the given list
        of labeled tokens.  This float between zero and one indicates
        what proportion of the tokens the model would label correctly.
    
    @param labeled_tokens: The tokens for which the model's
        accuracy should be computed.
    @type labeled_tokens: C{list} of (C{Token} with type
        C{LabeledText}) 
    """
    assert _chktype(1, classifier, ClassifierI)
    assert _chktype(2, labeled_tokens, [Token], (Token, ))
    total = 0
    correct = 0
    for ltok in labeled_tokens:
        utok = Token(ltok.type().text(), ltok.loc())
        if classifier.classify(utok) == ltok:
            correct += 1
        total += 1
    return float(correct) / total
def test(numFiles=100,
         max_rules=200,
         min_score=2,
         ruleFile="dump.rules",
         errorOutput="errors.out",
         ruleOutput="rules.out",
         randomize=False,
         train=.8,
         trace=3):

    NN_CD_tagger = RegexpTagger([(r'^[0-9]+(\.[0-9]+)?$', 'CD'), (r'.*', 'NN')],
                                TAG='POS')

    # train is the proportion of data used in training; the rest is reserved
    # for testing.

    print "Loading tagged data..."
    taggedData = getWSJTokens(numFiles, randomize)

    trainCutoff = int(len(taggedData) * train)
    trainingData = Token(SUBTOKENS=taggedData[0:trainCutoff])
    goldData = Token(SUBTOKENS=taggedData[trainCutoff:])
    testingData = goldData.exclude('POS')

    # Unigram tagger

    print "Training unigram tagger:",
    u = UnigramTagger(TAG='POS')
    u.train(trainingData)
    backoff = BackoffTagger([u, NN_CD_tagger], TAG='POS')
    print("[accuracy: %f]" % tagger_accuracy(backoff, [goldData]))

    # Brill tagger

    templates = [
        SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 1)),
        SymmetricProximateTokensTemplate(ProximateTagsRule, (2, 2)),
        SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 2)),
        SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 3)),
        #        SymmetricProximateTokensTemplate(ProximateWordsRule, (1,1)),
        #        SymmetricProximateTokensTemplate(ProximateWordsRule, (2,2)),
        #        SymmetricProximateTokensTemplate(ProximateWordsRule, (1,2)),
        #        SymmetricProximateTokensTemplate(ProximateWordsRule, (1,3)),
        ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1, 1)),
        #        ProximateTokensTemplate(ProximateWordsRule, (-1, -1), (1,1)),
    ]

    #trainer = FastBrillTaggerTrainer(backoff, templates, trace, TAG='POS')
    trainer = BrillTaggerTrainer(backoff, templates, trace, TAG='POS')
    b = trainer.train(trainingData, max_rules, min_score)

    print
    print("Brill accuracy: %f" % tagger_accuracy(b, [goldData]))

    print("\nRules: ")
    printRules = open(ruleOutput, 'w')
    for rule in b.rules():
        print(str(rule))
        printRules.write(str(rule) + "\n\n")
    printRules.close()
    #b.saveRules(ruleFile)

    b.tag(testingData)
    el = errorList(goldData, testingData)
    errorFile = open(errorOutput, 'w')

    for e in el:
        errorFile.write(e + "\n\n")
    errorFile.close()
    print("Done.")
    return b
 def raw_stem(self, text):
     TEXT = self.property('TEXT')
     STEM = self.property('STEM')
     token = Token({TEXT: text})
     self.stem(token)
     return token[STEM]
def cross_validate(trainer,
                   labeled_tokens,
                   n_folds=10,
                   target=None,
                   trace=False):
    """
    Perform N-fold cross validation on the given classifier. This divides the
    tokens into N equally sized groups (subject to rounding), then performs N
    training and testing passes. Each pass involves testing on a single fold
    and testing on the remaining folds. This way every instance is used
    exactly once for testing. The results (predictive accuracy) are averaged
    over the N trials. The mean and standard deviation are returned as a
    tuple.
    """
    assert len(labeled_tokens) >= n_folds

    # should randomly reorder labeled_tokens first?
    folds = []
    n = len(labeled_tokens)
    for i in range(n_folds):
        start = i * n / n_folds
        end = (i + 1) * n / n_folds
        folds.append(labeled_tokens[start:end])

    if trace:
        print 'cross_validate - using %d folds of %d items each approx' \
            % (n_folds, len(folds[0]))

    accuracies = []
    precisions = []
    recalls = []
    for i in range(n_folds):
        training = folds[:]
        testing = training[i]
        del training[i]
        training = reduce(operator.add, training)  # flatten

        if trace:
            print 'cross_validate [%d] - training classifier...' % (i + 1)
            import time
            start = time.time()

        classifier = trainer.train(training)

        if trace:
            end = time.time()
            print 'cross_validate elapsed time %.2f seconds' % (end - start)
            print 'cross_validate [%d] - testing classifier...' % (i + 1)
            start = end

        yes = no = 0
        tp = tn = fp = fn = 0
        for ltok in testing:
            utok = Token(ltok.type().text(), ltok.loc())
            if trace >= 2:
                print 'cross_validate [%d] - given' % (i + 1), ltok
            ctok = classifier.classify(utok)
            if trace >= 2:
                print 'cross_validate [%d] - classified' % (i + 1),
                print ctok.type().label()

            if ltok.type().label() == ctok.type().label():
                yes += 1
            else:
                no += 1

            if target:
                if ltok.type().label() == target:
                    if ctok.type().label() == target:
                        tp += 1
                    else:
                        fn += 1
                else:
                    if ctok.type().label() == target:
                        fp += 1
                    else:
                        tn += 1

        acc = float(yes) / (yes + no)
        accuracies.append(acc)
        if target:
            precision = recall = None
            try:
                recall = float(tp) / (tp + fn)
                recalls.append(recall)
            except ZeroDivisionError:
                pass
            try:
                precision = float(tp) / (tp + fp)
                precisions.append(precision)
            except ZeroDivisionError:
                pass

        if trace:
            end = time.time()
            print 'cross_validate elapsed time %.2f seconds' % (end - start)
            print 'cross_validate [%d] - accuracy %.3f' % (i + 1, acc)
            if target:
                print 'cross_validate [%d] - precision %s recall %s' \
                    % (i + 1, precision, recall)

    if trace:
        print 'cross_validate - calculating mean and variance'

    # find the mean
    mean = reduce(operator.add, accuracies) / float(len(accuracies))
    if target:
        if len(recalls) > 0:
            recall = reduce(operator.add, recalls) / float(len(recalls))
        else:
            recall = None
        if len(precisions) > 0:
            precision = reduce(operator.add, precisions) / float(
                len(precisions))
        else:
            precision = None

    # find the standard deviation
    var = 0.0
    for i in range(len(accuracies)):
        var += (accuracies[i] - mean)**2
    var /= len(accuracies)

    sd = var**0.5

    if target:
        return mean, sd, precision, recall
    else:
        return mean, sd
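# Worked illustration of the fold boundaries computed above, with
# hypothetical sizes: n = 10 labeled tokens and n_folds = 3 give the slices
# [0:3], [3:6] and [6:10] under Python 2 integer division, so every token
# falls into exactly one fold.
_n, _n_folds = 10, 3
print [(_i * _n / _n_folds, (_i + 1) * _n / _n_folds)
       for _i in range(_n_folds)]    # [(0, 3), (3, 6), (6, 10)]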
Example #30
##//////////////////////////////////////////////////////
##  Demo Code
##//////////////////////////////////////////////////////

import random
if __name__ == '__main__':
    def fill(cw):
        cw['fill'] = '#%06d' % random.randint(0,999999)
    
    cf = CanvasFrame(width=550, height=450, closeenough=2)

    tree = Tree.parse('''
    (S (NP the very big cat)
       (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))
    ''', leafparser = lambda t: Token(TEXT=t))
                
    tc = TreeWidget(cf.canvas(), tree, draggable=1, 
                    node_font=('helvetica', -14, 'bold'),
                    leaf_font=('helvetica', -12, 'italic'),
                    roof_fill='white', roof_color='black',
                    leaf_color='green4', node_color='blue2')
    cf.add_widget(tc,10,10)
    
    def boxit(canvas, text):
        big = ('helvetica', -16, 'bold')
        return BoxWidget(canvas, TextWidget(canvas, text,
                                            font=big), fill='green')
    def ovalit(canvas, text):
        return OvalWidget(canvas, TextWidget(canvas, text),
                          fill='cyan')