Example #1
def demo_pos_supervised():
    from nltk.corpus import brown
    from sys import stdout
    print 'Loading data from Brown corpus...'
    # use the first five files of the Brown corpus
    tagged_tokens = []
    for item in brown.items()[:5]:
        tagged_tokens.append(brown.tokenize(item))

    tag_set = ["'", "''", '(', ')', '*', ',', '.', ':', '--', '``', 'abl',
        'abn', 'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz', 'beg', 'bem',
        'ben', 'ber', 'bez', 'cc', 'cd', 'cd$', 'cs', 'do', 'dod', 'doz',
        'dt', 'dt$', 'dti', 'dts', 'dtx', 'ex', 'fw', 'hv', 'hvd', 'hvg',
        'hvn', 'hvz', 'in', 'jj', 'jjr', 'jjs', 'jjt', 'md', 'nn', 'nn$',
        'nns', 'nns$', 'np', 'np$', 'nps', 'nps$', 'nr', 'nr$', 'od', 'pn',
        'pn$', 'pp$', 'ppl', 'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb',
        'rb$', 'rbr', 'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn',
        'vbz', 'wdt', 'wp$', 'wpo', 'wps', 'wql', 'wrb']
        
    # map any tag that is not in tag_set onto the catch-all '*' tag
    annul_nonmatching_tags(tagged_tokens, tag_set, '*')
    words, word_set, tags, tag_set = _split_tagged_tokens(tagged_tokens)

    word_set.sort()
    tag_set.sort()

    print 'output alphabet', repr(word_set)[:50], '...'
    print 'state labels   ', repr(tag_set)[:50], '...'
    print tag_set

    print 'Training HMM...'

    #print 'training data:'
    #print zip(words[1:], tags[1:])

    # hold out the first 100 sequences for evaluation and train on the rest,
    # smoothing the estimated distributions with Lidstone (gamma = 0.1)
    trainer = HMMTrainer(tag_set, word_set)
    hmm = trainer.train_supervised(words[100:], tags[100:],
                    lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

    print hmm
    print 'Testing...'
    
    # compare the Viterbi best path against the gold tags for a few sequences
    for ws, ts in zip(words[:3], tags[:3]):
        print ws
        print 'HMM >>>'
        print hmm.best_path(ws)
        print 'CORRECT >>>'
        print ts
        print '-' * 60

    # token-level accuracy over the held-out (first 100) sequences
    count = correct = 0
    for ws, ts in zip(words[:100], tags[:100]):
        print '.',
        stdout.flush()
        pts = hmm.best_path(ws)
        for t, pt in zip(ts, pts):
            count += 1
            if t == pt:
                correct += 1

    print 'accuracy over first', count, 'tokens %.1f' % (100.0 * correct / count)
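A minimal driver for the example above; this is a sketch that assumes demo_pos_supervised and its helpers (HMMTrainer, LidstoneProbDist, annul_nonmatching_tags, _split_tagged_tokens) are defined in the same module:

if __name__ == '__main__':
    # run the supervised HMM part-of-speech demo when executed as a script
    demo_pos_supervised()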
def load_pos():
    import re
    from nltk.corpus import brown
    from nltk.tagger import TaggedTokenizer

    tagged_tokens = []
    for item in brown.items()[:5]:
        tagged_tokens.append(brown.tokenize(item))

    tag_set = [
        "'", "''", '(', ')', '*', ',', '.', ':', '--', '``', 'abl', 'abn',
        'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz', 'beg', 'bem', 'ben',
        'ber', 'bez', 'cc', 'cd', 'cd$', 'cs', 'do', 'dod', 'doz', 'dt', 'dt$',
        'dti', 'dts', 'dtx', 'ex', 'fw', 'hv', 'hvd', 'hvg', 'hvn', 'hvz',
        'in', 'jj', 'jjr', 'jjs', 'jjt', 'md', 'nn', 'nn$', 'nns', 'nns$',
        'np', 'np$', 'nps', 'nps$', 'nr', 'nr$', 'od', 'pn', 'pn$', 'pp$',
        'ppl', 'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb', 'rb$', 'rbr',
        'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbz', 'wdt',
        'wp$', 'wpo', 'wps', 'wql', 'wrb'
    ]

    sequences = []
    sequence = []
    start_re = re.compile(r'[^-*+]*')
    for token in tagged_tokens:
        # the multi-output HMM lets us treat each word as a
        # tuple of features rather than a single atomic symbol
        for sub_token in token['SUBTOKENS']:
            sequence.append(sub_token)
            # a feature for words as lower case
            features = [sub_token['TEXT'].lower()]
            # a feature for word suffixes of length 3
            features.append(sub_token['TEXT'][-3:])
            # a feature for the length of words
            features.append(len(sub_token['TEXT']))
            # store the observation as a tuple of features
            sub_token['TEXT'] = tuple(features)
            # keep only the tag prefix before any '-', '*' or '+',
            # stripping compound-tag suffixes
            m = start_re.match(sub_token['TAG'])
            tag = m.group(0)
            if tag in tag_set:
                sub_token['TAG'] = tag
            else:
                sub_token['TAG'] = '*'
            # end the current sentence at the period tag; any subtokens after
            # the final period are silently dropped
            if sub_token['TAG'] == '.':
                sequences.append(Token(SUBTOKENS=sequence))
                sequence = []

    # the final value is the number of features per observation
    return sequences, tag_set, 3
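The helper below is a hypothetical sketch for inspecting what load_pos() returns; it assumes only the SUBTOKENS/TAG dictionary-style access already used above and counts tag frequencies with a plain dict:

def inspect_pos_data(sequences):
    # count how often each cleaned tag occurs across all sentences
    tag_counts = {}
    n_tokens = 0
    for sentence in sequences:
        for sub_token in sentence['SUBTOKENS']:
            n_tokens += 1
            tag = sub_token['TAG']
            tag_counts[tag] = tag_counts.get(tag, 0) + 1
    print 'sentences:', len(sequences)
    print 'tokens:   ', n_tokens
    # show the ten most frequent tags, largest count first
    for tag, count in sorted(tag_counts.items(),
                             key=lambda item: item[1], reverse=True)[:10]:
        print '%6d  %s' % (count, tag)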
def _get_toks(file='ca01', debug=0):
    """
    Load labeled tokens from the given Brown corpus file, pairing each
    word's lower-cased base form with its tag.
    """
    assert _chktype(1, file, types.StringType)
    assert _chktype(2, debug, types.IntType)
    
    _resettime()
    if debug: print _timestamp(), 'tokenizing', file

    ttoks = brown.tokenize(file)

    labeled_tokens = [Token(LabeledText(tok.type().base().lower(),
                                        tok.type().tag()),
                            tok.loc())
                      for tok in ttoks]
    if debug: print _timestamp(), '  done tokenizing'
    return labeled_tokens
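A quick usage sketch for _get_toks; 'ca01' is just the default file name from the signature, and debug output is enabled so the timing messages are printed:

toks = _get_toks('ca01', debug=1)
print len(toks), 'labeled tokens loaded from ca01'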