Пример #1
0
def test_kaggle():
	"""Tag the unlabeled test file and write the predictions to kaggle.txt.

	Builds the model inputs (popular words, word lists, sentiment counts,
	states, n-gram counts, interpolation weights and sentiment vectors) from
	the reviews returned by ``parse_data.parse_training_file()``, then runs
	the Viterbi matrix/backtrace pair on every review parsed from
	``data/test_data_no_true_labels.txt``.

	Output format: a ``Id,answer`` header followed by one ``<row id>,<label>``
	line per predicted state, where the label is 1 for "pos", 0 for "neu"
	and -1 for "neg". Row ids start at 0 and run across all reviews.

	Raises:
		ValueError: if the backtrace yields a state other than
			"pos", "neu" or "neg".
	"""
	N = config.N
	reviews = parse_data.parse_training_file()
	popular_words = features.get_popular_words(reviews)
	word_lists = features.create_word_lists()
	sentiment_counts = likelihood.get_sentiment_counts(reviews)
	states = v.get_states(sentiment_counts)
	ngram_counts = prior_prob.gen_ngram_counts(reviews, N)
	l = prior_prob.train_interpolation(ngram_counts, N)
	vectors = likelihood.gen_sentiment_vectors(reviews, word_lists, popular_words)

	test_reviews = parse_data.parse_training_file('data/test_data_no_true_labels.txt')

	# Submission label written for each predicted state name.
	state_to_label = {"pos": "1", "neu": "0", "neg": "-1"}

	rid = 0

	# The context manager closes the file even if tagging raises part-way.
	with open('kaggle.txt', 'w') as f:
		f.write("Id,answer\n")

		for i in test_reviews:
			review = test_reviews[i]

			# Element 1 of each review entry is what gets tagged.
			lines = [line[1] for line in review['reviews']]
			# One copy of the review title per line, passed along as its tag.
			tags = [review['title'] for _ in lines]

			M = v.calculate_viterbi_matrix(N, states, lines, ngram_counts, l, vectors, sentiment_counts, word_lists, popular_words, tags)
			result = v.backtrace_viterbi(M, states, ngram_counts, l)

			for res in result:
				if res not in state_to_label:
					# A builtin exception is used so the message actually
					# reaches the caller instead of a NameError.
					raise ValueError("Unexpected error: unidentified state.")

				f.write(str(rid) + "," + state_to_label[res] + "\n")
				rid += 1

			# Single-argument parenthesised print gives the same output on
			# Python 2 and Python 3.
			print("On row " + str(rid))

	print("Done. Output saved to kaggle.txt.")
Пример #2
0
def test_training():
	"""Measure tagging accuracy on a held-out part of the training file.

	Model inputs are built from
	``parse_data.parse_training_file('data/training_data.txt', 0, 150)``;
	predictions are scored on the reviews returned by
	``parse_data.parse_training_file('data/training_data.txt', 150)``.
	For each review, element 0 of every entry is the expected state and
	element 1 is the text handed to the Viterbi tagger.

	Returns:
		float: the number of entries whose predicted state equals the
		expected one, divided by the number of entries scored. Returns 0.0
		when no entry was scored (no held-out reviews, or all of them empty)
		rather than failing on an unbound name or a division by zero.
	"""
	N = config.N
	reviews = parse_data.parse_training_file('data/training_data.txt', 0, 150)
	popular_words = features.get_popular_words(reviews)
	word_lists = features.create_word_lists()
	sentiment_counts = likelihood.get_sentiment_counts(reviews)
	states = v.get_states(sentiment_counts)
	ngram_counts = prior_prob.gen_ngram_counts(reviews, N)
	l = prior_prob.train_interpolation(ngram_counts, N)
	vectors = likelihood.gen_sentiment_vectors(reviews, word_lists, popular_words)

	test_reviews = parse_data.parse_training_file('data/training_data.txt', 150)

	correct = 0
	incorrect = 0

	for i in test_reviews:
		review = test_reviews[i]
		entries = review['reviews']

		lines = [entry[1] for entry in entries]
		answer = [entry[0] for entry in entries]
		# One copy of the review title per line, passed along as its tag.
		tags = [review['title'] for _ in lines]

		M = v.calculate_viterbi_matrix(N, states, lines, ngram_counts, l, vectors, sentiment_counts, word_lists, popular_words, tags)
		result = v.backtrace_viterbi(M, states, ngram_counts, l)

		# Indexing result (rather than zipping) keeps a too-short prediction
		# list loud: it raises IndexError instead of being silently ignored.
		for j, expected in enumerate(answer):
			if expected == result[j]:
				correct += 1
			else:
				incorrect += 1

	total = correct + incorrect
	if total == 0:
		return 0.0

	# Computed once after the loop; the running totals make this identical
	# to the value the last loop iteration would have produced.
	return float(correct) / float(total)
Пример #3
0
import parse_data
import features

# Open the baseline submission file and write the CSV header row.
# NOTE(review): f is not closed within this part of the script -- confirm it
# is closed further down.
f = open('kaggle_baseline.txt', 'w')
f.write("Id,answer\n")

# NOTE(review): the reviews are read from the training data file even though
# the variable is named test_reviews -- confirm this is the intended input.
test_reviews = parse_data.parse_training_file("data/training_data.txt")
# Word collections used below for membership tests on each word of a line.
neg_words = features.get_negative_words()
pos_words = features.get_positive_words()

# Parallel lists, filled in the same order:
#   lines  - element 1 of every review entry (split into words below)
#   tags   - element 0 of every review entry (presumably its label; verify
#            against parse_data)
#   answer - filled by the classification loop that follows
lines = []
answer = []
tags = []

# Flatten all reviews into the lists above, one item per review entry.
for i in test_reviews:
	for line in test_reviews[i]['reviews']:
		tags.append(line[0])
		lines.append(line[1])
for line in lines:

	word_list = line.split(" ")
	pos_words_count = 0
	neg_words_count = 0
	for word in word_list:
		if word in pos_words:
			pos_words_count += 1
		if word in neg_words:
			neg_words_count += 1
	if pos_words_count > neg_words_count + 1:
		answer.append("pos")