示例#1
0
文件: knn.py 项目: ageek/kaggle-1
def data(csv_file, vector_length, validation, word_vector_hash=None):
	"""Build the per-row vectors and lookup tables for ``csv_file``.

	Parameters:
		csv_file: path handed to ``kaggle.file_to_array``.
		vector_length: vector size forwarded to the vector helpers.
		validation: flag forwarded to ``kaggle.file_to_array``.
		word_vector_hash: optional pre-built word -> vector mapping; when it
			is None a fresh one is generated with ``word_vectors``.

	Returns:
		A tuple ``(vects, class_labels, word_vector_hash, sku_hash)`` where
		``vects`` holds the third element of every item produced by
		``sku_vectors`` and ``sku_hash`` is the result of ``sku_vector_hash``.
	"""
	# Identity test: ``== None`` would dispatch to a custom ``__eq__``.
	if word_vector_hash is None:
		word_vector_hash = word_vectors(csv_file, vector_length, validation)

	# Pair every row's label (column 1) with the word counts of its text
	# (column 3).
	array = kaggle.file_to_array(csv_file, validation)
	class_labels = kaggle.slice(array, 1)
	text = kaggle.slice(array, 3)
	sku_words = [
		[class_labels[i], kaggle.string_to_hash(entry)]
		for i, entry in enumerate(text)
	]

	# Keep only the vector component (index 2) of each generated item.
	vects = [
		triplet[2]
		for triplet in sku_vectors(sku_words, word_vector_hash, vector_length)
	]

	sku_hash = sku_vector_hash(sku_words, word_vector_hash, vector_length)
	return vects, class_labels, word_vector_hash, sku_hash
示例#2
0
def csv_with_more_data():
	"""Append the training rows plus synthetic rows to ``../data/extra.csv``.

	Every row of the training file is written first. Then, for each product
	returned by ``xml_to_array`` whose sku (element 0) occurs in column 1 of
	the training data, fifteen identical fake rows carrying the product
	name (element 1) are written; each sku is handled at most once.

	Prints the number of skus that received fake rows, the number of
	distinct training skus, and the fake-row counter. Returns None.
	"""
	target_path = "../data/extra.csv"
	products = xml_to_array()
	training_rows = kaggle.file_to_array(training)
	known_skus = set(kaggle.slice(training_rows, 1))
	handled_skus = set()
	count = 0

	# NOTE(review): the file is opened in append mode, so repeated runs keep
	# growing it -- confirm that this is intended.
	with open(target_path, "a") as out:
		out.writelines(",".join(row).strip() + "\n" for row in training_rows)

		for product in products:
			sku, name = product[0], product[1]
			if sku not in known_skus or sku in handled_skus:
				continue

			count += 1
			handled_skus.add(sku)
			fields = ['fakeuser', sku, 'fakecategory', name, 'fake_time', 'fake_time']
			fake_line = ",".join(fields).encode('utf-8').strip()
			# Emit the same synthetic row fifteen times.
			out.write((fake_line + "\n") * 15)

	print(len(handled_skus))
	print(len(known_skus))
	print(count)
	return None
示例#3
0
def train():
	"""Fit a single-estimator extra-trees model on the training file.

	Features come from ``query_times(rows, 4)`` and targets from
	``skus(rows)``; the fitted classifier is returned wrapped in a
	``ConfidenceDecorator`` together with the data it was trained on.
	"""
	rows = kaggle.file_to_array(training, 'all')
	features = query_times(rows, 4)
	targets = skus(rows)
	classifier = ExtraTreesClassifier(
		n_estimators=1,
		max_depth=None,
		min_samples_split=1,
		random_state=0,
	)
	fitted = classifier.fit(features, targets)
	return ConfidenceDecorator(fitted, features, targets)
示例#4
0
文件: tf_idf.py 项目: ageek/kaggle-1
def train_model(csv_file, ngram=1, validation=False):
	"""Train a model on ``csv_file`` and compute label popularity.

	Column 1 is used as the class label and column 3 as the input text.

	Returns:
		A tuple ``(model, popularity)``: the result of ``train`` on the
		hashed file contents and the result of ``popular.popularity_hash``.
	"""
	label_column = 1
	text_column = 3

	hashed = kaggle.file_to_hash(csv_file, label_column, text_column, validation)
	model = train(hashed, ngram)

	rows = kaggle.file_to_array(csv_file, validation)
	labels = kaggle.slice(rows, label_column)
	return model, popular.popularity_hash(labels, rows)
示例#5
0
文件: knn.py 项目: ageek/kaggle-1
def word_vectors(csv_file, vector_length, validation, word_to_skus=None, generated_sku_vectors=None):
	"""Assign a random vector to every distinct token found in column 3.

	``word_to_skus`` and ``generated_sku_vectors`` are accepted but not
	used by this function.

	Returns:
		A dict mapping each token to the vector produced by
		``vector.random_vector(vector_length)`` the first time it was seen.
	"""
	rows = kaggle.file_to_array(csv_file, validation)
	vectors_by_word = {}
	for query in kaggle.slice(rows, 3):
		for token in kaggle.tokenize(kaggle.format_string(query)):
			if token in vectors_by_word:
				# Already has a vector; do not draw a new one.
				continue
			vectors_by_word[token] = vector.random_vector(vector_length)
	return vectors_by_word
示例#6
0
文件: tf_idf.py 项目: ageek/kaggle-1
def test_data(csv_file, class_labels_index, input_data_index, validation, items_count, ngram=1):
	"""Load labels and tokenized inputs from ``csv_file``.

	Parameters:
		class_labels_index: column holding the class labels.
		input_data_index: column holding the text to tokenize.
		validation: flag forwarded to ``kaggle.file_to_array``.
		items_count: ``'All'`` to keep every row, otherwise the number of
			leading rows to keep.
		ngram: n-gram size forwarded to ``kaggle.tokenize``.

	Returns:
		A tuple ``(class_labels, formatted_test_data)``.
	"""
	rows = kaggle.file_to_array(csv_file, validation)
	labels = kaggle.slice(rows, class_labels_index)
	raw_inputs = kaggle.slice(rows, input_data_index)
	tokenized = [
		kaggle.tokenize(kaggle.format_string(entry), ngram)
		for entry in raw_inputs
	]

	if items_count == 'All':
		return labels, tokenized
	return labels[0:items_count], tokenized[0:items_count]
示例#7
0
文件: knn.py 项目: ageek/kaggle-1
def real_test():
	"""Predict a label for every query in column 3 of the training file.

	A model is trained through ``train`` on ``extra`` with 20 neighbors;
	each query is turned into a vector with ``query_vector`` and the first
	element of ``model.predict`` is kept.

	Returns:
		A list with one single-element list ``[prediction]`` per query.
	"""
	neighbor_count = 20
	model, word_vecs, _, _, _ = train(extra, neighbor_count, vector_length, False)

	rows = kaggle.file_to_array(training, True)
	predictions = []
	for query in kaggle.slice(rows, 3):
		counts = kaggle.string_to_hash(kaggle.format_string(query))
		features = query_vector(counts, word_vecs, vector_length)
		predictions.append([model.predict(features)[0]])
	return predictions
示例#8
0
def validation_test():
	"""Score the voted predictions against column 1 of the training file.

	The models come from ``decision_tree.real_test`` and are combined with
	``vote.vote``; the elapsed wall-clock time is printed.

	Returns:
		The value returned by ``score(merged, answers)``.
	"""
	# NOTE: earlier experiments merged individual model predictions with
	# vote.merge_answers and saved them via kaggle.write_predictions.
	started = time.time()
	rows = kaggle.file_to_array(training, True)
	expected = kaggle.slice(rows, 1)
	merged = vote.vote(decision_tree.real_test())
	accuracy = score(merged, expected)
	print("Duration: " + str(time.time() - started))
	return accuracy
示例#9
0
def test_data():
	"""Group the query times of the training file by sku.

	Returns:
		A dict mapping each sku to the list of ``array(...)``-wrapped time
		entries of its rows, in file order.
	"""
	# The file is read with the 'all' option.
	rows = kaggle.file_to_array(training, 'all')
	times = query_times(rows)
	row_skus = skus(rows)

	times_by_sku = {}
	for position, sku in enumerate(row_skus):
		# Index lookup (not zip) so a length mismatch still raises.
		entry = array(times[position])
		times_by_sku.setdefault(sku, []).append(entry)
	return times_by_sku
示例#10
0
def sku_to_searches():
	"""Map every sku in the training file to its normalized search strings.

	Each search (column 3) is lower-cased, tokenized with
	``kaggle.tokenize``, re-joined with single spaces and stripped of
	double quotes before being stored under the row's sku (column 1).

	Returns:
		A dict from sku to the list of its normalized searches.
	"""
	rows = kaggle.file_to_array(training, "all")

	# Seed an empty list for every distinct sku before filling them in.
	searches_by_sku = dict((sku, []) for sku in set(kaggle.slice(rows, 1)))

	for row in rows:
		normalized = " ".join(kaggle.tokenize(row[3].lower()))
		searches_by_sku[row[1]].append(re.sub("\"", '', normalized))
	return searches_by_sku
示例#11
0
文件: knn.py 项目: ageek/kaggle-1
def validation_test():
	"""Train on the training file and score the model on its validation rows.

	Uses 20 neighbors and a sample size of 10591. Prints the number of
	examples and the elapsed wall-clock time.

	Returns:
		The value returned by ``test`` for the vectorized queries.
	"""
	neighbor_count = 20
	sample_size = 10591
	started = time.time()

	model, word_vecs, _, _, sku_hash = train(training, neighbor_count, vector_length, False)
	rows = kaggle.file_to_array(training, True)
	labels = kaggle.slice(rows, 1)
	print("Examples: " + str(len(labels)))

	# Turn every query (column 3) into a vector the model can consume.
	test_vectors = [
		query_vector(
			kaggle.string_to_hash(kaggle.format_string(query)),
			word_vecs,
			vector_length,
		)
		for query in kaggle.slice(rows, 3)
	]

	score = test(model, neighbor_count, test_vectors, labels, sample_size, vector_length, sku_hash)
	print("Duration: " + str(time.time() - started))
	return score
示例#12
0
def validation_test():
	"""Evaluate the tree models on the validation rows of the training file.

	For every query (column 3) each model contributes its first five
	predictions. A query counts as correct when its sku (column 1) is among
	the distinct predicted skus of all models combined.

	Prints the total number of distinct candidates, the precision
	(correct / total candidates) and the recall (correct / queries). Both
	ratios are reported as 0.0 when their denominator is zero, so an empty
	validation set no longer raises ZeroDivisionError.

	Returns:
		A list with one entry per model; each entry holds that model's
		top-five predictions for every query, in query order.
	"""
	models, word_vectors = train_tree()

	file_array = kaggle.file_to_array(training, True)
	queries = kaggle.slice(file_array, 3)
	skus = kaggle.slice(file_array, 1)

	# One prediction list per model.
	output = [[] for _ in models]
	total = 0.
	correct = 0.

	for index, q in enumerate(queries):
		word_hash = kaggle.string_to_hash(kaggle.format_string(q))
		vect = knn.query_vector(word_hash, word_vectors, vector_length)

		# Distinct skus proposed for this query across all models.
		candidates = set()
		for i, m in enumerate(models):
			preds = m.predictions(vect)[0:5]
			output[i].append(preds)
			# The first element of each prediction is the predicted sku.
			candidates.update(p[0] for p in preds)

		total += len(candidates)
		if skus[index] in candidates:
			correct += 1.

	query_count = len(queries)
	precision = correct / total if total else 0.
	recall = correct / query_count if query_count else 0.
	print("Total: " + str(total))
	print("Precision: " + str(precision))
	print("Recall: " + str(recall))
	return output
示例#13
0
def real_data():
	"""Return ``query_times`` (called with argument 3) for the testing file."""
	rows = kaggle.file_to_array(testing, 'all')
	return query_times(rows, 3)
示例#14
0
 def skus(cls):
     """Return the class-label column of ``csv_file``."""
     # NOTE(review): csv_file and class_labels_index are looked up as free
     # names rather than through ``cls`` -- confirm this is intended.
     rows = kaggle.file_to_array(csv_file)
     return kaggle.slice(rows, class_labels_index)