def word_vectors(csv_file, vector_length, validation, word_to_skus=None, generated_sku_vectors=None):
    """Assign a random vector to every distinct token found in the query column.

    Rows are loaded with ``kaggle.file_to_array(csv_file, validation)`` and
    column 3 is taken as the query text. Each query is passed through
    ``kaggle.format_string`` and ``kaggle.tokenize``; the first time a token
    is seen it is given ``vector.random_vector(vector_length)``.

    ``word_to_skus`` and ``generated_sku_vectors`` are accepted but not used
    by this function.

    Returns a dict mapping token -> vector.
    """
    query_column = 3
    rows = kaggle.file_to_array(csv_file, validation)

    vectors_by_word = {}
    for query in kaggle.slice(rows, query_column):
        for token in kaggle.tokenize(kaggle.format_string(query)):
            if token in vectors_by_word:
                # Keep the vector assigned on first sight of this token.
                continue
            vectors_by_word[token] = vector.random_vector(vector_length)
    return vectors_by_word
def test_data(csv_file, class_labels_index, input_data_index, validation, items_count, ngram=1):
    """Load class labels and tokenized inputs from a CSV file.

    Rows come from ``kaggle.file_to_array(csv_file, validation)``. The column
    at ``class_labels_index`` supplies the labels and the column at
    ``input_data_index`` supplies the raw inputs, each of which is run through
    ``kaggle.format_string`` and then ``kaggle.tokenize(..., ngram)``.

    ``items_count`` is either the string ``'All'`` (return everything) or a
    slice bound: only the first ``items_count`` labels and inputs are returned.

    Returns a ``(class_labels, tokenized_inputs)`` pair.
    """
    rows = kaggle.file_to_array(csv_file, validation)
    class_labels = kaggle.slice(rows, class_labels_index)
    raw_inputs = kaggle.slice(rows, input_data_index)

    tokenized_inputs = [
        kaggle.tokenize(kaggle.format_string(raw), ngram)
        for raw in raw_inputs
    ]

    if items_count != 'All':
        return class_labels[:items_count], tokenized_inputs[:items_count]
    return class_labels, tokenized_inputs
def real_test():
    """Predict one value per query and return the predictions as single-item rows.

    Calls ``train(extra, 20, vector_length, False)`` and keeps the model and
    the word vectors from its five-element result. Queries are column 3 of
    ``kaggle.file_to_array(training, True)``. Each query is formatted, hashed
    with ``kaggle.string_to_hash``, turned into a vector by ``query_vector``,
    and the first element of ``model.predict(vector)`` is recorded.

    Returns a list of ``[prediction]`` lists, one per query, in query order.
    """
    neighbor_count = 20
    # Only the model and the word vectors are needed from train()'s result.
    model, vectors_by_word, _, _, _ = train(extra, neighbor_count, vector_length, False)

    rows = kaggle.file_to_array(training, True)
    predictions = []
    for query in kaggle.slice(rows, 3):
        hashed_words = kaggle.string_to_hash(kaggle.format_string(query))
        query_vect = query_vector(hashed_words, vectors_by_word, vector_length)
        predictions.append([model.predict(query_vect)[0]])
    return predictions
def validation_test():
    """Train, score the model against the validation rows, and report timing.

    Calls ``train(training, 20, vector_length, False)``, then reloads
    ``training`` through ``kaggle.file_to_array(training, True)``. Column 1
    supplies the labels and column 3 the queries; every query is formatted,
    hashed and converted with ``query_vector``. The vectors and labels are
    handed to ``test`` together with a sample size of 10591.

    Prints the number of examples and the elapsed wall-clock time.

    Returns whatever ``test`` returns.
    """
    neighbor_count = 20
    sample_size = 10591
    started_at = time.time()

    # The labels returned by train() are not used; the ones scored against
    # are read from the reloaded file below.
    model, vectors_by_word, _, _, sku_hash = train(training, neighbor_count, vector_length, False)

    rows = kaggle.file_to_array(training, True)
    labels = kaggle.slice(rows, 1)
    # A single parenthesised argument prints the same text whether print is
    # a statement or a function.
    print("Examples: " + str(len(labels)))

    query_vectors = [
        query_vector(
            kaggle.string_to_hash(kaggle.format_string(query)),
            vectors_by_word,
            vector_length,
        )
        for query in kaggle.slice(rows, 3)
    ]

    score = test(model, neighbor_count, query_vectors, labels, sample_size, vector_length, sku_hash)
    print("Duration: " + str(time.time() - started_at))
    return score
def validation_test():
    """Collect each model's top predictions per query and print precision/recall.

    Models and word vectors come from ``train_tree()``. Rows are loaded with
    ``kaggle.file_to_array(training, True)``; column 3 supplies the queries
    and column 1 the expected SKU for each query.

    For every query, each model contributes the first five entries of
    ``m.predictions(vect)``. The first element of each entry is treated as
    the predicted SKU, and the distinct SKUs across all models form that
    query's candidate set. A query counts as correct when its expected SKU
    is in the candidate set.

    Prints:
        Total     -- summed size of the candidate sets over all queries.
        Precision -- correct / Total (0.0 when Total is zero).
        Recall    -- correct / number of queries (0.0 when there are none).

    Returns:
        A list with one entry per model; each entry is the list of that
        model's (up to five) raw predictions for every query, in query order.
    """
    models, word_vectors = train_tree()

    # One prediction list per model, filled in query order.
    output = [[] for _ in models]

    file_array = kaggle.file_to_array(training, True)
    queries = kaggle.slice(file_array, 3)
    skus = kaggle.slice(file_array, 1)

    # Kept as floats so the ratios below use true division and the printed
    # values keep their float formatting.
    total = 0.
    correct = 0.

    for index, q in enumerate(queries):
        word_hash = kaggle.string_to_hash(kaggle.format_string(q))
        vect = knn.query_vector(word_hash, word_vectors, vector_length)

        candidates = set()
        for model_output, m in zip(output, models):
            preds = m.predictions(vect)[0:5]
            model_output.append(preds)
            candidates.update(p[0] for p in preds)

        total += len(candidates)
        if skus[index] in candidates:
            correct += 1.

    # Guard the ratios: with no queries or no predictions there is nothing
    # to divide by, so report 0.0 rather than raising ZeroDivisionError.
    query_count = len(queries)
    precision = correct / total if total else 0.0
    recall = correct / query_count if query_count else 0.0

    # A single parenthesised argument prints the same text whether print is
    # a statement or a function.
    print("Total: " + str(total))
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    return output