def test_kaggle():
    """Train on the labelled reviews and write predictions to kaggle.txt.

    The model inputs (popular words, word lists, sentiment counts, states,
    n-gram counts, interpolation result and sentiment vectors) are built from
    ``parse_data.parse_training_file()`` called with its default arguments.
    Each review parsed from 'data/test_data_no_true_labels.txt' is then
    decoded with ``v.calculate_viterbi_matrix`` / ``v.backtrace_viterbi``.

    Output format of kaggle.txt: a header line ``Id,answer`` followed by one
    ``<id>,<label>`` row per decoded state, where the id is a running counter
    starting at 0 and the label is 1 for "pos", 0 for "neu" and -1 for "neg".

    Raises:
        Whatever ``Error`` resolves to in this module if the decoder returns
        a state other than "pos", "neu" or "neg".
    """
    N = config.N

    # Everything in this section is derived from the labelled training data.
    reviews = parse_data.parse_training_file()
    popular_words = features.get_popular_words(reviews)
    word_lists = features.create_word_lists()
    sentiment_counts = likelihood.get_sentiment_counts(reviews)
    states = v.get_states(sentiment_counts)
    ngram_counts = prior_prob.gen_ngram_counts(reviews, N)
    interpolation = prior_prob.train_interpolation(ngram_counts, N)
    vectors = likelihood.gen_sentiment_vectors(reviews, word_lists, popular_words)

    test_reviews = parse_data.parse_training_file('data/test_data_no_true_labels.txt')

    # Decoder state name -> value written in the "answer" column.
    labels = {"pos": "1", "neu": "0", "neg": "-1"}

    rid = 0
    # `with` guarantees the file is closed even if decoding fails part-way.
    with open('kaggle.txt', 'w') as out:
        out.write("Id,answer\n")
        for key in test_reviews:
            review = test_reviews[key]
            lines = []
            tags = []
            for line in review['reviews']:
                lines.append(line[1])
                # One copy of the review title per line, parallel to `lines`.
                tags.append(review['title'])

            M = v.calculate_viterbi_matrix(N, states, lines, ngram_counts,
                                           interpolation, vectors,
                                           sentiment_counts, word_lists,
                                           popular_words, tags)
            result = v.backtrace_viterbi(M, states, ngram_counts, interpolation)

            for res in result:
                if res not in labels:
                    # NOTE(review): `Error` is not defined in the visible
                    # code -- confirm it is in scope, otherwise this line
                    # raises NameError instead of the intended exception.
                    raise Error("Unexpected error: unidentified state.")
                out.write(str(rid) + "," + labels[res] + "\n")
                rid += 1

            # Single-argument parenthesised form prints the same text on
            # Python 2 and is also valid Python 3.
            print("On row " + str(rid))

    print("Done. Output saved to kaggle.txt.")
def test_training():
    """Train on one slice of the training file and score the rest.

    The model inputs are built from
    ``parse_data.parse_training_file('data/training_data.txt', 0, 150)`` and
    the held-out reviews come from
    ``parse_data.parse_training_file('data/training_data.txt', 150)``.
    NOTE(review): the numeric arguments look like a start/end range of
    reviews -- confirm against parse_data.

    For every held-out review the decoded state sequence is compared
    position by position with the first element of each entry in the
    review's 'reviews' list.

    Returns:
        float: correct / (correct + wrong) over all compared positions, or
        0.0 when nothing was compared (previously a ZeroDivisionError).
    """
    N = config.N

    reviews = parse_data.parse_training_file('data/training_data.txt', 0, 150)
    popular_words = features.get_popular_words(reviews)
    word_lists = features.create_word_lists()
    sentiment_counts = likelihood.get_sentiment_counts(reviews)
    states = v.get_states(sentiment_counts)
    ngram_counts = prior_prob.gen_ngram_counts(reviews, N)
    interpolation = prior_prob.train_interpolation(ngram_counts, N)
    vectors = likelihood.gen_sentiment_vectors(reviews, word_lists, popular_words)

    held_out = parse_data.parse_training_file('data/training_data.txt', 150)

    correct = 0
    wrong = 0
    for key in held_out:
        review = held_out[key]
        lines = []
        answer = []
        tags = []
        for line in review['reviews']:
            lines.append(line[1])
            # One copy of the review title per line, parallel to `lines`.
            tags.append(review['title'])
            answer.append(line[0])

        M = v.calculate_viterbi_matrix(N, states, lines, ngram_counts,
                                       interpolation, vectors,
                                       sentiment_counts, word_lists,
                                       popular_words, tags)
        result = v.backtrace_viterbi(M, states, ngram_counts, interpolation)

        # Index into `result` (rather than zip) so a decoder output shorter
        # than the reference still fails loudly with IndexError.
        for j, expected in enumerate(answer):
            if expected == result[j]:
                correct += 1
            else:
                wrong += 1

    total = correct + wrong
    if total == 0:
        # No held-out lines were scored; avoid dividing by zero.
        return 0.0
    return float(correct) / float(total)
import parse_data import features f = open('kaggle_baseline.txt', 'w') f.write("Id,answer\n") test_reviews = parse_data.parse_training_file("data/training_data.txt") neg_words = features.get_negative_words() pos_words = features.get_positive_words() lines = [] answer = [] tags = [] for i in test_reviews: for line in test_reviews[i]['reviews']: tags.append(line[0]) lines.append(line[1]) for line in lines: word_list = line.split(" ") pos_words_count = 0 neg_words_count = 0 for word in word_list: if word in pos_words: pos_words_count += 1 if word in neg_words: neg_words_count += 1 if pos_words_count > neg_words_count + 1: answer.append("pos")