示例#1
0
文件: nb.py 项目: bpcb/twittervaccine
def main():
    """Train a Naive Bayes tweet classifier and report held-out accuracy.

    Loads the tweets with extract(), shuffles them with a fixed seed, holds
    out the first TEST_SET_PROPORTION share as the test set, and trains a
    NaiveBayes model on the remainder. Prints the confusion matrix, the
    accuracy as a percentage, and the wall-clock time of each phase.
    """
    # Fixed seed so the shuffle, and therefore the split, is repeatable.
    random.seed(7)

    # AAA: Use a random shuffle to select test/training sets
    print('Extracting twitter data from the database...')
    tm1 = time.time()
    tweets = extract()
    tm2 = time.time()

    print('  time=%0.3fs' % (tm2 - tm1))

    test_set_size = int(TEST_SET_PROPORTION * len(tweets))

    print('Training on %d tweets' % (len(tweets) - test_set_size))

    # The training timer deliberately includes the shuffle and the split.
    tm1 = time.time()
    random.shuffle(tweets)

    test_set = tweets[:test_set_size]
    training_set = tweets[test_set_size:]

    nb = NaiveBayes()
    for tweet in training_set:
        toks = ttokenize.tokenize(tweet.text)
        nb.train(toks, tweet.get_majority_vote())

    tm2 = time.time()

    print('  time=%0.3fs' % (tm2 - tm1))

    print('Testing accuracy on %d tweets' % test_set_size)
    tm1 = time.time()

    # Gold labels and model predictions, kept in the same order so the
    # confusion matrix pairs them up correctly.
    predictions = []
    references = []
    for tweet in test_set:
        references.append(tweet.get_majority_vote())
        toks = ttokenize.tokenize(tweet.text)
        predictions.append(nb.classify(toks))

    mat = nltk.ConfusionMatrix(references, predictions)
    tm2 = time.time()

    # Call-style print works on both Python 2 and 3 (the original used a
    # Python 2-only print statement here).
    # NOTE(review): newer NLTK releases expose this method as
    # pretty_format(); confirm against the installed NLTK version.
    print(mat.pp(show_percents=True))

    # The format string ends in '%', so scale the fraction to a percentage.
    # Guard against an empty test set to avoid dividing by zero.
    if mat._total:
        accuracy_pct = 100.0 * mat._correct / mat._total
    else:
        accuracy_pct = 0.0
    print('%d of %d correct ==> %f%%' % (mat._correct, mat._total,
                                         accuracy_pct))
    print('  time=%0.3fs' % (tm2 - tm1))
示例#2
0
def get_instance(tweet):
    """Build a (features, label) training instance from one tweet.

    The features map every token of the tweet's text to True; the label is
    the tweet's majority vote.
    """
    label = tweet.get_majority_vote()
    # Presence-only features: duplicate tokens collapse onto one key.
    features = dict.fromkeys(ttokenize.tokenize(tweet.text), True)
    return (features, label)