def draw_auc(fea_num, method, noise=None):
    plt.figure()
    _method = ['ReliefF', 'MRMR']
    lines = ['KNN', 'NaiveBayes', 'SVM', 'RandomForest']
    delta = np.array([1 / 6, 2 / 6, 3 / 6, 4 / 6, 5 / 6, 1])  # feature-weight thresholds (keep the top-delta ranked features)
    fea_count = data.shape[1] - 1                              # total number of features per sample
    fea_num = np.array(fea_count * delta, dtype=int)           # number of features to select (recomputed here, overwriting the argument)
    if noise is None:
        auc = f.readpkl("result/auc_" + _method[method] + ".pkl")
        plt.title("The relationship between AUC and different classifiers (" + _method[method] + ")")
    else:
        auc = f.readpkl("result/auc_" + _method[method] + "_noise" + str(noise) + ".pkl")
        plt.title("The relationship between AUC and different classifiers (" + _method[method] + ", noise=" + str(noise) + ")")
    for i in range(4):
        plt.plot(fea_num, auc[:, i], label=lines[i])
    plt.legend(loc="best")
    plt.xlabel("feature count")
    plt.ylabel("AUC")
    if noise is None:
        plt.savefig("pic/auc_" + _method[method] + ".png")
    else:
        plt.savefig("pic/auc_" + _method[method] + "_noise" + str(noise) + ".png")
def get_scores():
    train_X = files.readpkl("pro_data/train_X.pkl")
    each_label_num = np.array([len(data) for data in train_X])
    prior_pr = each_label_num / np.sum(each_label_num)                          # prior probability of each label
    ave = np.array([np.mean(train_X[i], axis=0) for i in range(len(train_X))])  # per-label feature means
    std = np.array([np.std(train_X[i], axis=0) for i in range(len(train_X))])   # per-label feature standard deviations
    test = files.readpkl("pro_data/test.pkl")  # 2-D matrix of test samples
    scores = []
    for t in test:
        scores.append(NaiveBayes_Classifier(ave, std, t, prior_pr))
    scores = np.array(scores)
    files.writepkl("scores.pkl", scores)
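# NaiveBayes_Classifier is called above and in get_rmse() but is not defined in
# this section. The sketch below is an assumption, not the original code: a
# minimal Gaussian naive-Bayes step consistent with the call site (per-label
# means/stds, one test sample, label priors), returning the label with the
# highest posterior.
def NaiveBayes_Classifier(ave, std, x, prior_pr, eps=1e-9):
    var = std ** 2 + eps                  # avoid division by zero for constant features
    log_likelihood = -0.5 * np.sum(np.log(2 * np.pi * var) + (x - ave) ** 2 / var, axis=1)
    log_posterior = np.log(prior_pr + eps) + log_likelihood
    return int(np.argmax(log_posterior))  # predicted label (a score in 0..100)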
def loadtrain_xy():
    # loadtrain()
    train = files.readpkl("pro_data/train.pkl")
    train_X, train_Y = [], []
    for i in range(len(train)):
        train_X.append(train[i][:, :-1])
        train_Y.append(train[i][:, -1])
    files.writepkl('pro_data/train_X.pkl', train_X)
    # files.writetxt('pro_data/train_X.txt', train_X)
    files.writepkl('pro_data/train_Y.pkl', train_Y)
def get_similarity(item_cnt):
    cnt = 0
    for ids in range(int(maxid / 20) + 1):
        filename = "new/sim_matrix/" + str(ids) + ".pkl"
        matrix_file = "new/item_matrix/" + str(ids) + ".pkl"
        try:
            matrix = f.readpkl(matrix_file)   # co-occurrence counts for this block, keyed by (item1, item2)
            sim_matrix = f.readpkl(filename)
            for pair in matrix:
                item1 = pair[0]
                item2 = pair[1]
                if item1 not in sim_matrix:
                    sim_matrix[item1] = {}
                # similarity = co-occurrence count normalized by the two items' popularity
                sim_matrix[item1][item2] = matrix[pair] / (item_cnt[item1] * item_cnt[item2])
            f.writepkl(filename, sim_matrix)
            cnt = cnt + 1
            print("finish", cnt)
        except FileNotFoundError:
            continue
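# item_cnt above is the per-item popularity used to normalize the co-occurrence
# counts; get_item_cnt is referenced in the __main__ block further down but not
# defined in this section. A minimal sketch under the assumption that
# item_cnt[item] is the number of users who rated that item:
def get_item_cnt(item_record):
    item_cnt = {}
    for user in item_record:
        for item in item_record[user]:
            item_cnt[item] = item_cnt.get(item, 0) + 1
    return item_cnt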
def get_rmse():
    train = files.readpkl("rmse_data/rmse_train.pkl")
    each_label_num = np.array([len(data) for data in train])
    prior_pr = each_label_num / np.sum(each_label_num)                      # prior probability of each label
    ave = np.array([np.mean(train[i], axis=0) for i in range(len(train))])  # per-label feature means
    std = np.array([np.std(train[i], axis=0) for i in range(len(train))])   # per-label feature standard deviations
    test = files.readpkl("rmse_data/rmse_test.pkl")
    y_true, y_hat = [], []
    for i in range(len(test)):
        print("i: ", i, "/", len(test))
        for t in test[i]:
            y_true.append(i)
            y_hat.append(NaiveBayes_Classifier(ave, std, t, prior_pr))
    y_true, y_hat = np.array(y_true), np.array(y_hat)
    rmse = np.sqrt(1 / len(y_true) * np.sum((y_true - y_hat) ** 2))
    print("Computed RMSE: %f" % rmse)
def load_rmsedata():
    train = files.readpkl("pro_data/train_X.pkl")
    each_label_num = np.array([len(data) for data in train])
    train_count = each_label_num * 0.7
    train_count = train_count.astype(int)  # per-label size of the 70% training split
    rmse_train, rmse_test = [], []
    for i in range(101):
        rmse_train.append(train[i][:train_count[i]])
        rmse_test.append(train[i][train_count[i]:])
    files.writepkl("rmse_data/rmse_train.pkl", rmse_train)
    files.writepkl("rmse_data/rmse_test.pkl", rmse_test)
def predict(item_record, user_item_record):
    cnt = 0
    for user in item_record:
        for item_predict in item_record[user]:
            predict_score = 0
            # load the item-similarity block that contains item_predict
            filename = "data/sim_matrix/" + str(int(int(item_predict) / 10)) + ".pkl"
            sim_matrix = f.readpkl(filename)
            try:
                for item in user_item_record[user]:
                    if item in sim_matrix[item_predict]:
                        # accumulate the user's rating of item weighted by sim(item_predict, item)
                        predict_score = predict_score + user_item_record[user][item] * sim_matrix[item_predict][item]
                item_record[user][item_predict] = predict_score  # store the predicted score
            except KeyError:
                continue
def loadtest():
    data = files.readpkl("pro_data/test_stat.pkl")
    for i in range(len(data)):
        if i % 500 == 0:
            print("i:", i)
        user_id = np.array([data[i][0]] * data[i][1]).reshape(-1, 1)
        item_id = np.array(list(data[i][2].keys()), dtype=int).reshape(-1, 1)
        if i == 0:
            matrix = np.concatenate((user_id, item_id), axis=1)
        else:
            m = np.concatenate((user_id, item_id), axis=1)
            matrix = np.vstack((matrix, m))
    # files.writetxt('pro_data/test.txt', matrix)
    files.writepkl('pro_data/test.pkl', matrix)
def get_item_item_matrix(item_record):
    cnt = 0
    for ids in range(1, user_max, 10):
        filename = "data/item_matrix2/" + str(int(ids / 10)) + ".pkl"  # co-occurrence matrix block for these users
        try:
            item_matrix = f.readpkl(filename)
            for user in range(ids, ids + 10):
                # every pair (item1, item2) rated by the same user co-occurs once
                for item1 in item_record[user]:
                    for item2 in item_record[user]:
                        item_pair = (item1, item2)
                        if item_pair not in item_matrix:
                            item_matrix[item_pair] = 0                        # initialize the dictionary entry
                        item_matrix[item_pair] = item_matrix[item_pair] + 1   # increment the co-occurrence count
            f.writepkl(filename, item_matrix)  # save this block
            cnt = cnt + 1
            print("finish", cnt)
        except EOFError:
            continue
        except FileNotFoundError:
            continue
def find_relative_people(user_id, item_id):
    people_list = []
    sim_list = []
    relative_people = []
    relative_sim = []
    file_num = int(user_id / 100)
    tmp_list = files.readpkl('data/train_res%d.pkl' % file_num)  # precomputed user-user similarities for this block
    index = user_id % 100
    start = index * num_of_people
    for i in range(0, num_of_people):
        # candidate neighbours: users who rated item_id, excluding user_id itself
        if item_id in train_stat[i][2].keys() and user_id != i:
            relative_people.append(i)
            relative_sim.append(tmp_list[start + i])
    if len(relative_people) == 0:
        return people_list, sim_list
    # keep only the neighbours whose similarity reaches the k-th largest value
    ordered = heapq.nlargest(relative_people_num, relative_sim)
    minimum = ordered[-1]
    for i in range(0, len(relative_people)):
        if relative_sim[i] >= minimum and relative_people[i] != user_id:
            people_list.append(relative_people[i])
            sim_list.append(relative_sim[i])
    return people_list, sim_list
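# The step that turns these neighbours into a predicted rating is not shown in
# this section; the sketch below is an assumption of how it could work -- a
# similarity-weighted average of the neighbours' scores for the item
# (predict_user_score is a hypothetical name, not part of the original code).
def predict_user_score(user_id, item_id):
    people_list, sim_list = find_relative_people(user_id, item_id)
    if not people_list:
        return 0  # no neighbour rated this item; fall back to a default score
    weighted = sum(train_stat[p][2][item_id] * s for p, s in zip(people_list, sim_list))
    norm = sum(abs(s) for s in sim_list)
    return weighted / norm if norm > 0 else 0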
def loadtrain():
    data = files.readpkl("pro_data/train_stat.pkl")
    for i in range(len(data)):
        if i % 500 == 0:
            print("i:", i)
        user_id = np.array([data[i][0]] * data[i][1]).reshape(-1, 1)           # repeat the user id once per rated item
        item_id = np.array(list(data[i][2].keys()), dtype=int).reshape(-1, 1)
        score = np.array(list(data[i][2].values()), dtype=int).reshape(-1, 1)
        if i == 0:
            matrix = np.concatenate((user_id, item_id, score), axis=1)
        else:
            m = np.concatenate((user_id, item_id, score), axis=1)
            matrix = np.vstack((matrix, m))
    _score = matrix[:, 2]
    train = []
    for i in range(101):
        # stratify the (user_id, item_id, score) rows by score value 0..100
        index = np.argwhere(_score == i).reshape(-1)
        train.append(matrix[index])
    # files.writetxt('pro_data/train.txt', train)
    files.writepkl('pro_data/train.pkl', train)
data = f.readpkl("data/urban.pkl") # 训练集 delta = np.array([1/6, 2/6, 3/6, 4/6, 5/6, 1]) # 特征权重阈值(选择排序前delta的特征) fea_count = data.shape[1] - 1 # 样本特征总量 fea_num = np.array(fea_count * delta, dtype=int) # 需求特征数 ''' # method=0 -> ReliefF || method=1 -> MRMR #run(data, fea_num, 1) #draw_acc(fea_num, 0) #draw_auc(fea_num, 0) noise_fea_num = [50, 100, 150, 200] num_list = [] for i in noise_fea_num: print("样本特征添加:", i) data = f.readpkl("data/urban.pkl") # 训练集 noise = np.random.normal(loc=0, scale=5, size=(data.shape[0], i)) data = np.concatenate( (data[:, :-1], noise, data[:, -1].reshape(-1, 1)), axis=1) delta = np.array([1 / 6, 2 / 6, 3 / 6, 4 / 6, 5 / 6, 1]) # 特征权重阈值(选择排序前delta的特征) fea_count = data.shape[1] - 1 # 样本特征总量 fea_num = np.array(fea_count * delta, dtype=int) # 需求特征数 #run(data, fea_num, 0, i) #run(data, fea_num, 1, i) #print("finish run!") draw_acc(fea_num, 0, i) draw_acc(fea_num, 1, i) draw_auc(fea_num, 0, i) draw_auc(fea_num, 1, i)
import files

# filename = 'data/train_res.txt'
# with open(filename, 'a') as file_obj:
raw_data = files.readpkl('data/train_stat.pkl')
all_person = 19835
all_item = 624961
len_list = []
sim_list = []
# sim_list = [[0 for i in range(0, all_person)] for i in range(0, all_person)]


def initial(raw_data):
    init_list = []
    for i in raw_data:
        init_list.append(i[2])  # keep only each user's {item_id: score} dict
    return init_list


def pre_process(person_list):
    for i in person_list:
        total = 0
        for value in i.values():
            total += value
        average = total / all_item     # mean rating over all items (unrated items count as 0)
        total = 0
        for key in i.keys():
            i[key] = i[key] - average  # mean-center the user's ratings
            total += i[key] ** 2       # accumulate the squared norm
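# pre_process() above ends after accumulating the squared norm; the code that
# turns these mean-centered ratings into user-user similarities is not shown in
# this section. A minimal sketch of one plausible continuation -- cosine
# similarity between two users' mean-centered rating dicts (cosine_sim is a
# hypothetical helper, not part of the original code).
import math

def cosine_sim(u, v):
    # dot product over the items both users rated
    dot = sum(u[k] * v[k] for k in u.keys() & v.keys())
    norm_u = math.sqrt(sum(x * x for x in u.values()))
    norm_v = math.sqrt(sum(x * x for x in v.values()))
    if norm_u == 0 or norm_v == 0:
        return 0.0
    return dot / (norm_u * norm_v)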
import files

filename = "data/k=3/answer.txt"
res = files.readpkl('data/k=3/test_res.pkl')
with open(filename, 'w') as file_obj:
    for i in range(0, len(res)):
        # header line "user_id|number_of_items", followed by one "item_id score" line per item
        file_obj.write(str(res[i][0]) + '|' + str(res[i][1]) + '\n')
        for key in res[i][2].keys():
            file_obj.write(key + ' ' + str(int(res[i][2][key])) + '\n')
import files
import heapq

num_of_people = 19835
relative_people_num = 3
item_per_person = 6
train_stat = files.readpkl('data/train_stat.pkl')
test_stat = files.readpkl('data/test_stat.pkl')
train_res = files.readpkl('data/train_res0.pkl')

if __name__ == "__main__":
    print(test_stat[12])
    people = train_res[12 * num_of_people:12 * num_of_people + num_of_people]
    print('people', people)
    for key in test_stat[12][2].keys():
        print('==============')
        print(key)
        scored = []
        scored_sim = []
        for i in range(0, len(train_stat)):
            if key in train_stat[i][2].keys() and i != 12:
                scored.append(i)
                scored_sim.append(people[i])
        print('scored', scored)
        print('scored_sim', scored_sim)
        ordered = heapq.nlargest(relative_people_num, scored_sim)
        print('ordered', ordered)
        if len(ordered) == 0:
            print("warning")
                    if item in sim_matrix[item_predict]:
                        # accumulate the user's rating of item weighted by sim(item_predict, item)
                        predict_score = predict_score + user_item_record[user][item] * sim_matrix[item_predict][item]
                item_record[user][item_predict] = predict_score  # store the predicted score
            except KeyError:
                continue
        cnt = cnt + 1
        print("finish", cnt)
    f.writepkl("new/result.pkl", item_record)


def ini_item_item():
    for ids in range(1, user_max, 10):
        filename = "data/item_matrix2/" + str(int(ids / 10)) + ".pkl"
        item_matrix = {}
        f.writepkl(filename, item_matrix)  # create an empty co-occurrence block for each group of users
    print("ini ok")


if __name__ == '__main__':
    train_record = f.readpkl("data/item_record.pkl")
    test_record = f.readpkl("data/test_record.pkl")
    item_cnt = f.readpkl("data/item_cnt.pkl")
    item_dict = {}
    predict2(train_record=train_record, test_record=test_record, item_cnt=item_cnt, item_dict=item_dict)
    print("ok")
    test_record = f.readpkl("test_item.pkl")
    item_cnt = get_item_cnt(item_record=train_record)
    item_dict = {}
    predict2(train_record=train_record, test_record=test_record, item_cnt=item_cnt, item_dict=item_dict)
    print("ok")'''