示例#1
0
def get_prob(Dataset_train, Dataset_test, similarity_measure):
    """
    Score every test item against each training category by mean similarity.

    This is the first prediction scheme: it uses no machine learning, only
    the pairwise similarity scores between test items and training items.

    Dataset_train must expose ``data`` and ``target``; ``target`` needs a
    ``tolist()`` method (e.g. a numpy array).  Category labels are used
    directly as list indices, so they must be integers in the range
    ``0 .. n_categories - 1``.
    Dataset_test must expose ``data``.
    similarity_measure is passed through unchanged to
    ``similarity_utils.iterate_combination_2d_sim``.

    Return a numpy array built from one row per test item, where entry
    ``[i][c]`` is the average similarity between test item ``i`` and the
    training items labelled ``c``.
    """
    train_size = len(Dataset_train.data)
    test_size = len(Dataset_test.data)

    # test-vs-train similarity scores: row i holds the scores of test item i
    # against every training item
    sim_result = similarity_utils.iterate_combination_2d_sim(
        Dataset_test.data, Dataset_train.data, similarity_measure)

    # Everything derived from the training labels is identical for every test
    # item, so it is computed once here instead of inside the loops below.
    train_targets = Dataset_train.target.tolist()
    categories = set(train_targets)
    n_categories = len(categories)
    occurrences = {category: train_targets.count(category)
                   for category in categories}

    category_scores_overall = []
    for i in range(test_size):
        score_list = sim_result[i]

        # sum the scores of this test item per training category
        category_scores = [0] * n_categories
        for j in range(train_size):
            category_scores[train_targets[j]] += score_list[j]

        # turn each sum into an average over the category's training items;
        # float() keeps true division under Python 2 as well
        for category in categories:
            category_scores[category] = (
                category_scores[category] / float(occurrences[category]))

        category_scores_overall.append(category_scores)

    return np.array(category_scores_overall)
示例#2
0
# Manual check script for the similarity helpers (Python 2 syntax: print statements).
# Log file that receives the pretty-printed score matrix below; opened for binary writing.
# NOTE(review): no close() is visible in this part of the script -- confirm it is closed later.
logFile = open('mylogfile.txt', 'wb')

# Load the two sentence collections and report how many sentences each holds.
list1 = similarity_utils.load_sentences('data_not_sell')
list2 = similarity_utils.load_sentences('data_sell_share')
print "len(list1):", len(list1)
print "len(list2):", len(list2)

# Pick one sentence from each list for a single pairwise check.
sentence1 = list1[1]
sentence2 = list2[2]

# Score that single pair directly with sim_overlap.
score1 = similarity_overlap.sim_overlap(sentence1, sentence2)
print "sim(list1[1], list2[2])  :", score1
# Score every (list1, list2) combination and print entry [1][2];
# presumably this should equal the direct score printed above -- compare the two lines.
score_array = similarity_utils.iterate_combination_2d_sim(list1, list2, similarity_overlap.sim_overlap)
print "score_array[1][2]        :", score_array[1][2]


# Test for combined_list: both sentence lists concatenated into one.
combined_list = list1+list2

# Sim_overlap: score the combined list against itself with sim_overlap.
score_array_overlap = similarity_utils.iterate_combination_2d_sim(combined_list, combined_list, similarity_overlap.sim_overlap)
# Wrap every score in PrettyFloat before pretty-printing
# (presumably to shorten the float output -- confirm in similarity_utils).
score_array_overlap = [[similarity_utils.PrettyFloat(n) for n in row] for row in score_array_overlap]
pprint(score_array_overlap, logFile)
logFile.write('\n')

# Sim_overlap_idf: the same combined-list comparison using the sim_overlap_idf measure.
score_array_idf = similarity_utils.iterate_combination_2d_sim(combined_list, combined_list, similarity_overlap_idf.sim_overlap_idf)