예제 #1
0
def draw_auc(fea_num, method, noise=None):
    """Plot AUC versus number of selected features for each classifier.

    Parameters
    ----------
    fea_num : array-like
        Requested feature counts. NOTE: immediately recomputed below from the
        module-level ``data``, so the argument value is effectively ignored —
        kept only for interface compatibility with existing callers.
    method : int
        Feature-selection method index: 0 -> ReliefF, 1 -> MRMR.
    noise : int or None, optional
        Number of injected noise features; when given, the "_noise<k>" result
        file is read and the same suffix is used for the saved figure.
    """
    plt.figure()
    _method = ['ReliefF', 'MRMR']
    lines = ['KNN', 'NaiveBayes', 'SVM', 'RandomForest']
    # Feature-weight thresholds: keep the top `delta` fraction of ranked features.
    delta = np.array([1 / 6, 2 / 6, 3 / 6, 4 / 6, 5 / 6, 1])
    fea_count = data.shape[1] - 1  # total number of features per sample
    fea_num = np.array(fea_count * delta, dtype=int)  # requested feature counts
    # Build the filename suffix and title tail once instead of duplicating the
    # whole read/title/save logic in two nearly identical branches.
    # (Also fixes the `== None` / `!= None` comparisons: use `is None`.)
    if noise is None:
        suffix = _method[method]
        title_extra = ""
    else:
        suffix = _method[method] + "_noise" + str(noise)
        title_extra = ",noise=" + str(noise)
    auc = f.readpkl("result/auc_" + suffix + ".pkl")
    plt.title("The relationship between auc and different classifier(" +
              _method[method] + title_extra + ")")
    for i in range(4):
        plt.plot(fea_num, auc[:, i], label=lines[i])
    plt.legend(loc="best")
    plt.xlabel("features count")
    plt.ylabel("auc")
    plt.savefig("pic/auc_" + suffix + ".png")
예제 #2
0
def get_scores():
    """Score every test sample with the Gaussian naive-Bayes classifier
    and pickle the resulting score array to scores.pkl."""
    train_X = files.readpkl("pro_data/train_X.pkl")

    # Class prior: fraction of training samples belonging to each label.
    counts = np.array([len(samples) for samples in train_X])
    prior_pr = counts / np.sum(counts)

    # Per-class feature mean and standard deviation.
    ave = np.array([np.mean(samples, axis=0) for samples in train_X])
    std = np.array([np.std(samples, axis=0) for samples in train_X])

    test = files.readpkl("pro_data/test.pkl")  # 2-D matrix of test samples
    scores = np.array([NaiveBayes_Classifier(ave, std, sample, prior_pr)
                       for sample in test])

    files.writepkl("scores.pkl", scores)
예제 #3
0
def loadtrain_xy():
    """Split each stratum of the training set into features / labels pickles."""
    #loadtrain()
    train = files.readpkl("pro_data/train.pkl")
    train_X = [stratum[:, :-1] for stratum in train]  # all columns but the last
    train_Y = [stratum[:, -1] for stratum in train]   # last column is the label
    files.writepkl('pro_data/train_X.pkl', train_X)
    #files.writetxt('pro_data/train_X.txt', train_X)
    files.writepkl('pro_data/train_Y.pkl', train_Y)
예제 #4
0
def get_similarity(item_cnt):
    """Normalize co-occurrence counts into item-item similarity scores.

    Each block file under new/sim_matrix is updated in place with
    sim[item1][item2] = cooccurrence / (item_cnt[item1] * item_cnt[item2]).
    Blocks whose files are missing are skipped silently.
    """
    finished = 0
    for block_id in range(int(maxid / 20) + 1):
        sim_path = "new/sim_matrix/" + str(block_id) + ".pkl"
        mat_path = "new/item_matrix/" + str(block_id) + ".pkl"
        try:
            matrix = f.readpkl(mat_path)
            sim_matrix = f.readpkl(sim_path)
            # Keys of `matrix` are (item1, item2) pairs.
            for item1, item2 in matrix:
                norm = item_cnt[item1] * item_cnt[item2]
                sim_matrix.setdefault(item1, {})[item2] = matrix[(item1, item2)] / norm
            f.writepkl(sim_path, sim_matrix)
            finished += 1
            print("finish", finished)
        except FileNotFoundError:
            continue
예제 #5
0
def get_rmse():
    """Evaluate the naive-Bayes classifier with RMSE on the held-out split."""
    train = files.readpkl("rmse_data/rmse_train.pkl")
    counts = np.array([len(stratum) for stratum in train])
    prior_pr = counts / np.sum(counts)  # class prior probabilities

    # Per-class feature means and standard deviations.
    ave = np.array([np.mean(stratum, axis=0) for stratum in train])
    std = np.array([np.std(stratum, axis=0) for stratum in train])
    test = files.readpkl("rmse_data/rmse_test.pkl")

    # The stratum index doubles as the true label.
    y_true, y_hat = [], []
    for label, stratum in enumerate(test):
        print("i: ", label, "/", len(test))
        for sample in stratum:
            y_true.append(label)
            y_hat.append(NaiveBayes_Classifier(ave, std, sample, prior_pr))

    y_true, y_hat = np.array(y_true), np.array(y_hat)
    rmse = np.sqrt(1 / len(y_true) * np.sum((y_true - y_hat)**2))
    print("RMSE计算结果为: %f" % rmse)
예제 #6
0
def load_rmsedata(train_ratio=0.7):
    """Split each label stratum of train_X into RMSE train/test pickles.

    Parameters
    ----------
    train_ratio : float, optional
        Fraction of each stratum placed in the training split. Defaults to
        0.7, matching the previously hard-coded value, so existing callers
        are unaffected.
    """
    train = files.readpkl("pro_data/train_X.pkl")
    each_label_num = np.array([len(data) for data in train])
    # Per-stratum number of training samples (floor of ratio * stratum size).
    train_count = (each_label_num * train_ratio).astype(int)

    rmse_train, rmse_test = [], []
    # NOTE(review): 101 strata assumed (score labels 0..100, see loadtrain) —
    # confirm against the producer of train_X.pkl.
    for i in range(101):
        rmse_train.append(train[i][:train_count[i]])
        rmse_test.append(train[i][train_count[i]:])

    files.writepkl("rmse_data/rmse_train.pkl", rmse_train)
    files.writepkl("rmse_data/rmse_test.pkl", rmse_test)
예제 #7
0
def predict(item_record, user_item_record):
    """Predict a score for every (user, item) pair in ``item_record``.

    For each item to predict, loads the similarity-matrix shard holding that
    item's row, then accumulates (rated-item score * similarity) over the
    items the user has already rated. The result is stored back into
    ``item_record[user][item_predict]``. Items with no similarity row are
    skipped via the KeyError handler.
    """
    cnt = 0
    for user in item_record:
        for item_predict in item_record[user]:
            predict_score = 0
            # Similarity matrices are sharded by int(item_id / 10), one file each.
            filename = "data/sim_matrix/"+str(int(int(item_predict)/10))+".pkl"
            sim_matrix = f.readpkl(filename)
            try:
                for item in user_item_record[user]:
                    if item in sim_matrix[item_predict]:
                        # BUG FIX: the original split this expression over two
                        # lines with no continuation, which is a SyntaxError.
                        predict_score += (user_item_record[user][item]
                                          * sim_matrix[item_predict][item])
                item_record[user][item_predict] = predict_score  # store prediction
            except KeyError:
                continue
예제 #8
0
def loadtest():
    """Flatten the per-user test statistics into a (user_id, item_id) matrix
    and pickle it to pro_data/test.pkl."""
    data = files.readpkl("pro_data/test_stat.pkl")

    blocks = []
    for i in range(len(data)):
        if i % 500 == 0:
            print("i:", i)
        # data[i] = (user_id, n_items, {item_id: ...}); repeat the user id
        # once per rated item so the rows align with the item ids.
        user_id = np.array([data[i][0]] * data[i][1]).reshape(-1, 1)
        item_id = np.array(list(data[i][2].keys()), dtype=int).reshape(-1, 1)
        blocks.append(np.concatenate((user_id, item_id), axis=1))
    # PERF FIX: stack once at the end. The original np.vstack inside the loop
    # re-copied the growing matrix each iteration (accidental O(n^2)).
    matrix = np.vstack(blocks)

    #files.writetxt('pro_data/test.txt', matrix)
    files.writepkl('pro_data/test.pkl', matrix)
예제 #9
0
def get_item_item_matrix(item_record):
    """Accumulate the item co-occurrence matrix, ten users per block file."""
    finished = 0
    for ids in range(1, user_max, 10):
        # Load the co-occurrence block covering users [ids, ids + 10).
        filename = "data/item_matrix2/" + str(int(ids / 10)) + ".pkl"
        try:
            item_matrix = f.readpkl(filename)
            for user in range(ids, ids + 10):
                # Every ordered pair of items rated by the same user counts as
                # one co-occurrence (including the (item, item) diagonal).
                for item1 in item_record[user]:
                    for item2 in item_record[user]:
                        pair = (item1, item2)
                        item_matrix[pair] = item_matrix.get(pair, 0) + 1
            f.writepkl(filename, item_matrix)  # persist the updated block
            finished += 1
            print("finish", finished)
        except EOFError:
            continue
        except FileNotFoundError:
            continue
예제 #10
0
def find_relative_people(user_id, item_id):
    """Find up to ``relative_people_num`` similar users who rated ``item_id``.

    Reads the precomputed similarity row for ``user_id`` from the sharded
    ``data/train_res%d.pkl`` files (100 users per shard), collects every other
    user who rated ``item_id`` according to the module-level ``train_stat``,
    and keeps those whose similarity reaches the k-th largest value.

    Returns two parallel lists: the selected user indices and their
    similarities (both empty when nobody else rated the item).
    """
    people_list = []
    sim_list = []
    relative_people = []
    relative_sim = []
    file_num = int(user_id / 100)  # shard index: 100 users per result file
    tmp_list = files.readpkl('data/train_res%d.pkl' % file_num)
    index = user_id % 100
    start = index * num_of_people  # offset of user_id's similarity row in the shard
    for i in range(0, num_of_people):
        if item_id in train_stat[i][2].keys() and user_id != i:
            relative_people.append(i)
            relative_sim.append(tmp_list[start + i])
    if len(relative_people) == 0:
        return people_list, sim_list
    # Threshold = smallest of the top-k similarities; ties may admit more
    # than k people below.
    ordered = heapq.nlargest(relative_people_num, relative_sim)
    minimum = ordered[-1]
    for i in range(0, len(relative_people)):
        # NOTE(review): here ``i`` indexes relative_people, not a user id, so
        # ``i != user_id`` looks like a leftover guard (user_id was already
        # excluded in the collection loop) — confirm intent before changing.
        if relative_sim[i] >= minimum and i != user_id:
            people_list.append(relative_people[i])
            sim_list.append(relative_sim[i])
    return people_list, sim_list
예제 #11
0
def loadtrain():
    """Flatten train statistics into (user, item, score) rows, stratified by
    score, and pickle the list of strata to pro_data/train.pkl."""
    data = files.readpkl("pro_data/train_stat.pkl")

    blocks = []
    for i in range(len(data)):
        if i % 500 == 0:
            print("i:", i)
        # data[i] = (user_id, n_items, {item_id: score}); one row per rating.
        user_id = np.array([data[i][0]] * data[i][1]).reshape(-1, 1)
        item_id = np.array(list(data[i][2].keys()), dtype=int).reshape(-1, 1)
        score = np.array(list(data[i][2].values()), dtype=int).reshape(-1, 1)
        blocks.append(np.concatenate((user_id, item_id, score), axis=1))
    # PERF FIX: single vstack instead of growing the matrix inside the loop
    # (the original re-copied everything per iteration: accidental O(n^2)).
    matrix = np.vstack(blocks)

    # Stratify the rows by score value (labels 0..100).
    _score = matrix[:, 2]
    train = []
    for i in range(101):
        index = np.argwhere(_score == i).reshape(-1)
        train.append(matrix[index])

    #files.writetxt('pro_data/train.txt', train)
    files.writepkl('pro_data/train.pkl', train)
예제 #12
0
    data = f.readpkl("data/urban.pkl")  # 训练集
    delta = np.array([1/6, 2/6, 3/6, 4/6, 5/6, 1])   # 特征权重阈值(选择排序前delta的特征)
    fea_count = data.shape[1] - 1   # 样本特征总量
    fea_num = np.array(fea_count * delta, dtype=int)    # 需求特征数
    '''

    # method=0 -> ReliefF || method=1 -> MRMR
    #run(data, fea_num, 1)
    #draw_acc(fea_num, 0)
    #draw_auc(fea_num, 0)

    noise_fea_num = [50, 100, 150, 200]
    num_list = []
    for i in noise_fea_num:
        print("样本特征添加:", i)
        data = f.readpkl("data/urban.pkl")  # 训练集
        noise = np.random.normal(loc=0, scale=5, size=(data.shape[0], i))
        data = np.concatenate(
            (data[:, :-1], noise, data[:, -1].reshape(-1, 1)), axis=1)
        delta = np.array([1 / 6, 2 / 6, 3 / 6, 4 / 6, 5 / 6,
                          1])  # 特征权重阈值(选择排序前delta的特征)
        fea_count = data.shape[1] - 1  # 样本特征总量
        fea_num = np.array(fea_count * delta, dtype=int)  # 需求特征数

        #run(data, fea_num, 0, i)
        #run(data, fea_num, 1, i)
        #print("finish run!")
        draw_acc(fea_num, 0, i)
        draw_acc(fea_num, 1, i)
        draw_auc(fea_num, 0, i)
        draw_auc(fea_num, 1, i)
예제 #13
0
# Similarity precomputation script: loads per-user training statistics and
# prepares centered rating dictionaries (see initial / pre_process below).
import files

# filename = 'data/train_res.txt'
# with open(filename, 'a') as file_obj:

# presumably a list of (user_id, count, {item: score}) records — see how
# initial() takes field [2] and pre_process() treats it as a dict; confirm.
raw_data = files.readpkl('data/train_stat.pkl')
all_person = 19835  # total number of users
all_item = 624961   # total number of items
len_list = []
sim_list = []

# sim_list = [[0 for i in range(0, all_person)] for i in range(0, all_person)]


def initial(raw_data):
    """Return the rating dictionary (third field) of every user record."""
    return [record[2] for record in raw_data]


def pre_process(person_list):
    """Center each user's ratings by their mean over all items, in place.

    The average divides by the module-level ``all_item`` (not by the number
    of rated items), i.e. unrated items count as zeros.
    """
    for ratings in person_list:
        average = sum(ratings.values()) / all_item
        total_sq = 0
        for key in ratings:
            ratings[key] -= average
            total_sq += ratings[key] ** 2
        # NOTE(review): the sum of squares is computed but never used here —
        # the snippet may be truncated; confirm downstream use before removal.
예제 #14
0
import files

# Dump the k=3 test predictions to a plain-text answer file: one
# "user|count" header line per record, then one "item score" line per item.
filename = "data/k=3/answer.txt"
res = files.readpkl('data/k=3/test_res.pkl')
with open(filename, 'w') as file_obj:
    for record in res:
        file_obj.write(str(record[0]) + '|' + str(record[1]) + '\n')
        for item, score in record[2].items():
            file_obj.write(item + ' ' + str(int(score)) + '\n')
예제 #15
0
import files
import heapq

# Experiment constants for the k-NN collaborative-filtering run.
num_of_people = 19835    # total number of users
relative_people_num = 3  # k: neighbours kept per prediction
item_per_person = 6      # NOTE(review): unused in this snippet — confirm

# Precomputed per-user statistics and the first similarity shard.
train_stat = files.readpkl('data/train_stat.pkl')
test_stat = files.readpkl('data/test_stat.pkl')
train_res = files.readpkl('data/train_res0.pkl')

if __name__ == "__main__":
    # Debug walk-through for user 12: inspect neighbour selection per item.
    print(test_stat[12])
    # Similarity row of user 12 against every user.
    people = train_res[12 * num_of_people:12 * num_of_people + num_of_people]
    print('people', people)

    for key in test_stat[12][2].keys():
        print('==============')
        print(key)
        # Collect every other user who rated this item, with their similarity.
        scored, scored_sim = [], []
        for idx in range(0, len(train_stat)):
            if key in train_stat[idx][2].keys() and idx != 12:
                scored.append(idx)
                scored_sim.append(people[idx])
        print('scored', scored)
        print('scored_sim', scored_sim)
        ordered = heapq.nlargest(relative_people_num, scored_sim)
        print('ordered', ordered)
        if not ordered:
            print("warning")
예제 #16
0
                    if item in sim_matrix[item_predict]:# 计算评分
                        predict_score = predict_score+user_item_record[user][item]
                        				*sim_matrix[item_predict][item]
                item_record[user][item_predict] = predict_score #存储对应的预测评分
            except KeyError:
                continue
        cnt = cnt+1
        print ("finish",cnt)
    f.writepkl("new/result.pkl",item_record)
def ini_item_item():
    """Create an empty co-occurrence dictionary file for every user block."""
    for ids in range(1, user_max, 10):
        path = "data/item_matrix2/" + str(int(ids / 10)) + ".pkl"
        f.writepkl(path, {})
    print("ini ok")

if __name__ == '__main__':
    # Load the precomputed records and run the item-based prediction pass.
    # NOTE(review): predict2 is defined elsewhere in the file (not visible in
    # this chunk) — its contract is assumed from the keyword arguments.
    train_record = f.readpkl("data/item_record.pkl")
    test_record = f.readpkl("data/test_record.pkl")
    item_cnt = f.readpkl("data/item_cnt.pkl")
    item_dict = {}
    predict2(train_record=train_record,test_record=test_record,item_cnt=item_cnt,item_dict= item_dict)
    print ("ok")
    
    test_record = f.readpkl("test_item.pkl")
    item_cnt = get_item_cnt(item_record=train_record)
    item_dict = {}
    predict2(train_record=train_record,test_record=test_record,item_cnt=item_cnt,item_dict= item_dict)
    print("ok")'''