def classify_naive_bayes():
    reviews = get_reviews(db=db)
    text = get_text_for_reviews(reviews)

    logger.debug('Tokenizing text...')
    words = tokenize(text)

    # Build the feature set from the 500 most common words and persist it
    # so the classifier can be reused later on unseen reviews.
    feature_finder = MostCommonWordsFinder(n_words=500)
    feature_finder.process(words)
    feature_finder.save('top500reviews.featureset')

    def pred(review):
        return review['sentiment'] == SentimentEnum.GOOD

    # Stratified 85/15 train/test split that preserves the good/bad ratio.
    partitioner = DistributionPartitioner(reviews, pred=pred, ratio=0.85)
    partitioner.partition()

    classifier = NaiveBayesClassifier()

    logger.debug('Labeling training data...')
    train_data = get_labeled_review_data(
        partitioner.training_data,
        feature_finder.find_features,
    )
    test_data = get_labeled_review_data(
        partitioner.test_data,
        feature_finder.find_features,
    )

    classifier.train(train_data)
    accuracy = classifier.accuracy(test_data)
    print('Accuracy: {:.2f}'.format(accuracy * 100))
    classifier.show_top_features()
    classifier.save()
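# `classify_naive_bayes` above depends on DistributionPartitioner for the
# stratified 85/15 split. The class itself is not shown in this section;
# the sketch below is an assumption reconstructed from the call sites
# (only the constructor arguments and the partition()/training_data/
# test_data interface appear in the original code).
import random


class DistributionPartitioner:

    def __init__(self, items, pred, ratio=0.85):
        self.items = list(items)
        self.pred = pred            # predicate that decides the class split
        self.ratio = ratio          # fraction of each class used for training
        self.training_data = []
        self.test_data = []

    def partition(self):
        # Split matching and non-matching items separately so both
        # partitions keep the original class distribution.
        matched = [item for item in self.items if self.pred(item)]
        unmatched = [item for item in self.items if not self.pred(item)]
        for group in (matched, unmatched):
            random.shuffle(group)
            cut = int(len(group) * self.ratio)
            self.training_data.extend(group[:cut])
            self.test_data.extend(group[cut:])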
def show_stats_for_text(text):
    words = tokenize(text, clean_filter=FILTER_ALL)
    fd = FreqDist(words)
    logger.info('Total words: %s', len(words))
    logger.info('Distinct words: %s', fd.B())
    logger.info('Most common words')
    for word, count in fd.most_common(20):
        logger.info('%s\t%s', word, count)
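# For reference, FreqDist (from nltk) counts token occurrences, and B()
# returns the number of distinct tokens (bins) — what the 'Distinct words'
# log line above reports. A quick self-contained illustration:
from nltk import FreqDist

fd = FreqDist('the cat sat on the mat'.split())
assert fd.B() == 5                          # 'the' occupies a single bin
assert fd.most_common(1) == [('the', 2)]    # most frequent token first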
def check_text(text, verbose=True):
    unknown = defaultdict(list)
    stats = defaultdict(int)
    tokens = tokenize(text, language='english')
    testers = (
        ('one_letter', checks._is_one_letter),
        ('punctuation', checks._is_punct),
        ('uri', checks._is_uri),
        ('time', checks._is_time),
        ('code', checks._is_code),
        ('name', checks._is_name),
        ('variable', checks._is_variable),
        ('number', checks._is_number),
        ('aux', checks._is_aux),
    )
    for index, token in enumerate(tokens):
        # NLTK's default tokenizer leaves leading/trailing apostrophes
        # on quoted tokens, so strip them before testing.
        if token.startswith("'"):
            token = token[1:]
            stats['quoted'] += 1
        if token.endswith("'"):
            token = token[:-1]
        try:
            for key, tester in testers:
                if tester(token):
                    stats[key] += 1
                    # Token matched a known category; skip spell-checking.
                    raise DoNotCheck('Test passed')
        except DoNotCheck:
            continue
        # Hyphenated tokens are checked word by word.
        for word in token.split('-'):
            if not _check_word(word):
                stats['unknown'] += 1
                unknown[token].append(index)
    if verbose:
        for elem, idx in unknown.items():
            # Tokens flagged more than twice are skipped in the report.
            if len(idx) > 2:
                continue
            for index in idx:
                print_unknown_context(elem, index, tokens)
    return stats
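# `check_text` uses DoNotCheck purely as control flow: once a tester
# matches, the token is accounted for and spell-checking is skipped.
class DoNotCheck(Exception):
    """A token matched one of the testers; do not spell-check it."""


# _check_word is not shown in the original code. A plausible sketch,
# assuming a lookup against NLTK's wordlist corpus (requires
# nltk.download('words')); the real implementation may differ.
from nltk.corpus import words as nltk_words

_VOCAB = None


def _check_word(word):
    global _VOCAB
    if _VOCAB is None:
        # Build the lookup set lazily, once per process.
        _VOCAB = {w.lower() for w in nltk_words.words()}
    return not word or word.lower() in _VOCAB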
def load_trained_naive_bayes_classifier():
    # Sanity-check the persisted classifier against reviews labeled BAD.
    feature_finder = MostCommonWordsFinder.load('top500reviews.featureset')
    classifier = NaiveBayesClassifier.load(
        'naivebayesclassifier_20181025.classifier')
    reviews = get_reviews_by_sentiment(SentimentEnum.BAD, db=db)
    correct = 0
    wrong = 0
    for review in reviews:
        featureset = feature_finder.find_features(
            tokenize(review['text'], clean_filter=F_LOWERCASE))
        original_sentiment = SentimentEnum(review['sentiment'])
        guessed_sentiment = SentimentEnum(classifier.classify(featureset))
        if original_sentiment == guessed_sentiment:
            correct += 1
        else:
            wrong += 1
    print('Bad correct: %s, wrong: %s' % (correct, wrong))
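# The NaiveBayesClassifier used above is a project-level wrapper, not
# nltk's class directly. A minimal sketch consistent with the calls made
# in this section (train/classify/accuracy/show_top_features/save/load);
# the bodies are assumptions, only the interface comes from the code above.
import pickle

import nltk


class NaiveBayesClassifier:

    def __init__(self, classifier=None):
        self._classifier = classifier

    def train(self, labeled_featuresets):
        self._classifier = nltk.NaiveBayesClassifier.train(labeled_featuresets)

    def classify(self, featureset):
        return self._classifier.classify(featureset)

    def accuracy(self, labeled_featuresets):
        return nltk.classify.accuracy(self._classifier, labeled_featuresets)

    def show_top_features(self, n=10):
        self._classifier.show_most_informative_features(n)

    def save(self, path='naivebayesclassifier.classifier'):
        with open(path, 'wb') as f:
            pickle.dump(self._classifier, f)

    @classmethod
    def load(cls, path):
        with open(path, 'rb') as f:
            return cls(pickle.load(f))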
def get_labeled_review_data(reviews, feature_finder):
    return [
        (feature_finder(tokenize(r['text'], clean_filter=F_LOWERCASE)),
         r['sentiment'])
        for r in reviews
    ]
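# The return value is a list of (featureset, label) pairs — the format
# nltk.NaiveBayesClassifier.train() consumes. Illustrative shape only
# (the feature names below are made up):
#
#   [({'contains(great)': True, 'contains(dull)': False}, SentimentEnum.GOOD),
#    ({'contains(great)': False, 'contains(dull)': True}, SentimentEnum.BAD)]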
def get_most_common_words(text, n_words=20):
    words = tokenize(text)
    fd = FreqDist(words)
    return [word for word, _count in fd.most_common(n_words)]
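# Example usage, assuming tokenize() behaves roughly like nltk.word_tokenize:
#
#   words = get_most_common_words('the cat sat on the mat while the cat slept')
#   # words[0] == 'the' (3 occurrences), words[1] == 'cat' (2 occurrences)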