Пример #1
0
def classify_naive_bayes():
    """Train a naive Bayes classifier on stored reviews and report accuracy.

    Steps, in order: fetch the reviews, build and save a feature set from
    the 500 most common words, split the reviews into training and test
    portions (ratio 0.85, distributed by the "good sentiment" predicate),
    train the classifier, print its accuracy on the test portion, show its
    top features and save it.
    """
    all_reviews = get_reviews(db=db)
    corpus = get_text_for_reviews(all_reviews)
    logger.debug('Tokenizing text...')
    corpus_words = tokenize(corpus)

    # The feature set is saved so it can be loaded again later alongside
    # the saved classifier.
    feature_finder = MostCommonWordsFinder(n_words=500)
    feature_finder.process(corpus_words)
    feature_finder.save('top500reviews.featureset')

    def is_good_review(review):
        """Return True when the review is labeled with GOOD sentiment."""
        return review['sentiment'] == SentimentEnum.GOOD

    partitioner = DistributionPartitioner(
        all_reviews, pred=is_good_review, ratio=0.85)
    partitioner.partition()

    classifier = NaiveBayesClassifier()
    logger.debug('Labeling training data...')
    labeled_train = get_labeled_review_data(
        partitioner.training_data, feature_finder.find_features)
    labeled_test = get_labeled_review_data(
        partitioner.test_data, feature_finder.find_features)

    classifier.train(labeled_train)
    # accuracy() is scaled by 100 so the printed value reads as a percentage.
    print('Accuracy: {:.2f}'.format(classifier.accuracy(labeled_test) * 100))
    classifier.show_top_features()
    classifier.save()
Пример #2
0
def show_stats_for_text(text):
    """Log word statistics for *text* at INFO level.

    The text is tokenized with ``clean_filter=FILTER_ALL``; the total token
    count, the frequency distribution's ``B()`` value and the 20 most common
    words with their counts are logged.
    """
    tokens = tokenize(text, clean_filter=FILTER_ALL)
    distribution = FreqDist(tokens)

    logger.info('Total words: %s', len(tokens))
    # NOTE(review): B() is presumably the number of distinct words (bins) in
    # the distribution — confirm against the FreqDist implementation in use.
    logger.info('Recurrent words: %s', distribution.B())

    logger.info('Most common words')
    for token, occurrences in distribution.most_common(20):
        logger.info('%s\t%s', token, occurrences)
Пример #3
0
def check_text(text, verbose=True):
    """Check every token of *text* and tally what was found.

    The text is tokenized with ``tokenize(text, language='english')``. Each
    token has one leading and one trailing apostrophe stripped and is then
    run through the ``checks`` predicates in a fixed order; the first
    predicate that accepts it bumps the counter of the same name and the
    token is not checked any further. Every remaining token is split on
    ``-`` and each part is passed to ``_check_word``.

    Args:
        text: Text to check.
        verbose: When true, call ``print_unknown_context`` for each recorded
            position of every unknown token that was recorded at most twice.

    Returns:
        A ``defaultdict(int)`` of counters: one per matching predicate name,
        ``'quoted'`` for tokens that started with an apostrophe, and
        ``'unknown'`` for the number of token parts rejected by
        ``_check_word``.
    """
    unknown = defaultdict(list)
    stats = defaultdict(int)
    tokens = tokenize(text, language='english')
    # Order matters: only the first matching category is counted.
    testers = (
        ('one_letter', checks._is_one_letter),
        ('punctuation', checks._is_punct),
        ('uri', checks._is_uri),
        ('time', checks._is_time),
        ('code', checks._is_code),
        ('name', checks._is_name),
        ('variable', checks._is_variable),
        ('number', checks._is_number),
        ('aux', checks._is_aux),
    )
    for index, token in enumerate(tokens):
        # because of default nltk tokenization
        if token.startswith("'"):
            token = token[1:]
            stats['quoted'] += 1
        if token.endswith("'"):
            token = token[:-1]

        # First matching category wins; the generator stops evaluating
        # testers as soon as one accepts the token.
        category = next(
            (key for key, tester in testers if tester(token)), None)
        if category is not None:
            stats[category] += 1
            continue

        # Count every rejected part of a hyphenated token, but record the
        # token position only once so the context is not printed repeatedly
        # and the occurrence cutoff below reflects real occurrences.
        rejected_parts = sum(
            1 for word in token.split('-') if not _check_word(word))
        if rejected_parts:
            stats['unknown'] += rejected_parts
            unknown[token].append(index)

    if verbose:
        for elem, positions in unknown.items():
            # Tokens seen more than twice are skipped in the report.
            if len(positions) > 2:
                continue
            for position in positions:
                print_unknown_context(elem, position, tokens)

    return stats
Пример #4
0
def load_trained_naive_bayes_classifier():
    """Load the saved feature set and classifier and score them on BAD reviews.

    Every review fetched with ``SentimentEnum.BAD`` is classified; the
    number of reviews whose predicted sentiment equals the stored one, and
    the number that differ, are printed.
    """
    feature_finder = MostCommonWordsFinder.load('top500reviews.featureset')
    classifier = NaiveBayesClassifier.load(
        'naivebayesclassifier_20181025.classifier')

    hits = 0
    misses = 0
    for review in get_reviews_by_sentiment(SentimentEnum.BAD, db=db):
        tokens = tokenize(review['text'], clean_filter=F_LOWERCASE)
        features = feature_finder.find_features(tokens)
        expected = SentimentEnum(review['sentiment'])
        predicted = SentimentEnum(classifier.classify(features))
        if not (expected == predicted):
            misses += 1
            continue
        hits += 1

    print('Bad correct: %s, wrong: %s' % (hits, misses))
Пример #5
0
def get_labeled_review_data(reviews, feature_finder):
    """Return a list of ``(features, sentiment)`` pairs, one per review.

    Args:
        reviews: Iterable of mappings with ``'text'`` and ``'sentiment'`` keys.
        feature_finder: Callable applied to the tokens of each review's text
            (tokenized with ``clean_filter=F_LOWERCASE``).
    """
    labeled = []
    for review in reviews:
        tokens = tokenize(review['text'], clean_filter=F_LOWERCASE)
        labeled.append((feature_finder(tokens), review['sentiment']))
    return labeled
Пример #6
0
def get_most_common_words(text, n_words=20):
    """Return the *n_words* most common words of *text*, most frequent first.

    Only the words are returned; their counts are dropped.
    """
    ranked = FreqDist(tokenize(text)).most_common(n_words)
    return [word for word, _count in ranked]