def get_prob(Dataset_train, Dataset_test, similarity_measure):
    """Predict per-category similarity scores for each test item.

    First prediction scheme, without using any machine learning
    techniques: for every test item, sum its similarity scores against
    all training items grouped by the training items' categories, then
    average each category's sum by how often that category occurs in
    the training set.

    Parameters
    ----------
    Dataset_train : dataset with `.data` (sequence of items) and
        `.target` (numpy array of integer labels; assumed to be
        0..k-1 so they can index a plain list -- TODO confirm)
    Dataset_test : dataset with `.data` (sequence of items)
    similarity_measure : callable passed through to
        `similarity_utils.iterate_combination_2d_sim`

    Returns
    -------
    numpy.ndarray of shape (test_size, n_categories) holding the
    averaged per-category similarity score of each test item.
    """
    data_train = Dataset_train
    data_test = Dataset_test

    train_size = len(data_train.data)
    test_size = len(data_test.data)

    # test_size x train_size array: sim_result[i][j] is the similarity
    # of test item i to training item j.
    sim_result = similarity_utils.iterate_combination_2d_sim(
        data_test.data, data_train.data, similarity_measure)
    # t_a = (time() - t0)
    # joblib.dump(sim_result, './'+str(t_a)+'.pkl')

    # Hoisted out of the per-test-item loop (the original recomputed
    # these for every test item, and the .count() for every category):
    # the distinct categories and each one's frequency in the train set.
    categories = set(data_train.target)
    # numpy array to list, otherwise cannot count
    train_targets_list = data_train.target.tolist()
    category_counts = {c: train_targets_list.count(c) for c in categories}
    n_categories = len(categories)

    category_scores_overall = []
    for i in range(test_size):
        # Sum (not yet averaged) of similarity scores per category for
        # test item i.
        category_scores = [0] * n_categories
        # Similarity of test item i against every training sentence.
        score_list = sim_result[i]
        for j in range(train_size):
            # Accumulate training item j's score under its category.
            category_scores[data_train.target[j]] += score_list[j]
        # Average each category's sum by the category's frequency.
        for category in categories:
            category_scores[category] = (
                category_scores[category] / float(category_counts[category]))
        category_scores_overall.append(category_scores)

    # Convert to a numpy array before returning.
    pred_array = np.array(category_scores_overall)
    return pred_array
# log file logFile = open('mylogfile.txt', 'wb') list1 = similarity_utils.load_sentences('data_not_sell') list2 = similarity_utils.load_sentences('data_sell_share') print "len(list1):", len(list1) print "len(list2):", len(list2) sentence1 = list1[1] sentence2 = list2[2] # test similarity from sentence score1 = similarity_overlap.sim_overlap(sentence1, sentence2) print "sim(list1[1], list2[2]) :", score1 # test iterate_combination_2d_sim score_array = similarity_utils.iterate_combination_2d_sim(list1, list2, similarity_overlap.sim_overlap) print "score_array[1][2] :", score_array[1][2] # Test for combined_list combined_list = list1+list2 # Sim_overlap score_array_overlap = similarity_utils.iterate_combination_2d_sim(combined_list, combined_list, similarity_overlap.sim_overlap) # pretty print score_array_overlap = [[similarity_utils.PrettyFloat(n) for n in row] for row in score_array_overlap] pprint(score_array_overlap, logFile) logFile.write('\n') # Sim_overlap_idf score_array_idf = similarity_utils.iterate_combination_2d_sim(combined_list, combined_list, similarity_overlap_idf.sim_overlap_idf)