Example #1
def test_sentencetokenizer():

    tokens = SentenceTokenizer.tokenize("This is a sentence. And this is another.")
    assert tokens == [('FIRST_CAPITALIZED_STOPWORD', 'This'), ('LOWER_STOPWORD', 'is'),
                      ('LOWER_STOPWORD', 'a'), ('LOWER', 'sentence'), ('TERMINATOR', '.'),
                      ('CAPITALIZED_STOPWORD', 'And'), ('LOWER_STOPWORD', 'this'),
                      ('LOWER_STOPWORD', 'is'), ('LOWER', 'another'), ('TERMINATOR', '.')]

    tokens = SentenceTokenizer.tokenize("This Is A Sentence Of Type Allcaps.")
    assert tokens == [('FIRST_CAPITALIZED_STOPWORD', 'This'), ('CAPITALIZED_STOPWORD', 'Is'),
                      ('MIXED_STOPWORD', 'A'), ('CAPITALIZED', 'Sentence'),
                      ('CAPITALIZED_STOPWORD', 'Of'), ('CAPITALIZED', 'Type'),
                      ('CAPITALIZED', 'Allcaps'), ('TERMINATOR', '.')]
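
The SentenceTokenizer itself is not shown in these snippets. As a rough guide to what the tests expect, here is a minimal sketch that satisfies both asserts above; the stopword subset and the 'UPPER' tag for all-uppercase words are assumptions, not taken from the source.

import re

# Hypothetical stopword subset; the real tokenizer's list is presumably larger.
STOPWORDS = {'this', 'is', 'a', 'and', 'of'}

class SentenceTokenizer(object):
    @staticmethod
    def tokenize(text):
        tokens = []
        for i, word in enumerate(re.findall(r"\w+|[.!?]", text)):
            if word in ('.', '!', '?'):
                tokens.append(('TERMINATOR', word))
                continue
            # Classify the case shape of the word.
            if word.islower():
                tag = 'LOWER'
            elif word[0].isupper() and word[1:].islower():
                tag = 'CAPITALIZED'
            elif word.isupper() and len(word) > 1:
                tag = 'UPPER'  # assumed tag; not exercised by the tests above
            else:
                tag = 'MIXED'  # single capitals like 'A' land here
            if i == 0:
                tag = 'FIRST_' + tag  # only the very first token gets FIRST_
            if word.lower() in STOPWORDS:
                tag += '_STOPWORD'
            tokens.append((tag, word))
        return tokens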
Example #2
def capitalization_type(text):
    """Determine the capitalization type of the text.

    Types are:

    - REGULAR: The first letter of the first word in a sentence is
      capitalized, as is the first letter of proper nouns.

    - GERMAN: The first letter of the first word in a sentence is
      capitalized, as is the first letter of any noun.

    - ALLCAPS: The first letter of every word is capitalized.

    - SHOUT: Every letter is uppercase.

    - LOWER: Every letter is lowercase.

    - OTHER: None of the above definitions apply.
      (This may also mean mixed type.)
    """
    d = dict(text=text)
    # The feature vector is the sequence of token category tags.
    d['tokens'] = [x[0] for x in SentenceTokenizer.tokenize(d['text'])]
    result = apply_multinomial_NB(C, V, prior, condprob, d)[0]
    type_map = dict(
        REGULAR=CapType.REGULAR,
        GERMAN=CapType.GERMAN,
        ALLCAPS=CapType.ALLCAPS,
        SHOUT=CapType.SHOUT,
        LOWER=CapType.LOWER,
        OTHER=CapType.OTHER,
    )
    return type_map[result]
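
A hedged usage sketch: capitalization_type() reads the globals C, V, prior, and condprob, which are assumed here to come from the weights file pickled by the training script in Example #3 below; the predictions in the comments assume a reasonably trained model.

import pickle

# Load the Naive Bayes weights produced by the training script (Example #3).
with open('data/weights/capnorm_weights.pickle', 'rb') as f:
    C, V, prior, condprob = pickle.load(f)

print(capitalization_type('THIS SENTENCE IS SHOUTING.'))   # e.g. CapType.SHOUT
print(capitalization_type('this sentence is lowercase.'))  # e.g. CapType.LOWER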
Example #3
    'ALLCAPS'
    'LOWER'
    'SHOUT'

    and possibly 'GERMAN'
    '''
    
    # C holds our categories
    C = ['REGULAR', 'ALLCAPS', 'LOWER', 'SHOUT']

    # Now we place all of our training examples into D
    D = get_training_examples(options.corpus_filename)

    # Now extract the features for the trainer...
    for d in D:
        d['tokens'] = [x[0] for x in SentenceTokenizer.tokenize(d['text'])]

    print('Training...')
    V, prior, condprob = train_multinomial_NB(C, D)

    # Now pickle these for use by capnormalizer
    stuff_to_pickle = (C, V, prior, condprob)
    print('Pickling...')
    with open('data/weights/capnorm_weights.pickle', 'wb') as f:
        pickle.dump(stuff_to_pickle, f)
    print('Done.')

    if options.test:
        # Now test the training examples as well;
        # most should give the correct category if training went well...
        for d in D:
            result = apply_multinomial_NB(C, V, prior, condprob, d)
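            # Hedged sketch: the snippet is truncated here. One plausible
            # continuation compares the top prediction against the example's
            # gold label ('category' is an assumed key name, not from the source).
            if result[0] == d['category']:
                print('OK   %r -> %s' % (d['text'], result[0]))
            else:
                print('MISS %r -> %s' % (d['text'], result[0]))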