# Write one line of recommended items per requested user to output.txt.
#
# Relies on names defined outside this fragment: sc, record_file,
# itemIndex_file, userIndex_file, userID_file, K, and the helpers read_record,
# read_index, read_userID_file, getUserVector, knn2 and getRecommend.
vectors = read_record(sc, record_file)
itemIndex = read_index(sc, itemIndex_file, True)
userIndex = read_index(sc, userIndex_file, False)
user_id = read_userID_file(sc, userID_file)

# The context manager closes output.txt even when a lookup or a KNN step
# raises part-way through the loop.
with open('output.txt', 'w') as f:
    for u in user_id:
        uid = userIndex[u]
        user = getUserVector(uid, vectors)
        # Drop the target user's own record from the candidate set. Indexing
        # the pair replaces the Python-2-only `lambda (k, v):` parameter form.
        otherUser = vectors.filter(lambda kv: kv[0] != uid)
        neighbors = knn2(sc, user, otherUser, K)
        # Alternative implementation:
        # neighbors = knn1(sc, user, otherUser, K)
        rlt = getRecommend(neighbors)
        # Keep only results whose first element is a key of itemIndex; every
        # mapped value is followed by a single space, so the written line ends
        # with a trailing space exactly as before.
        tmp = "".join(itemIndex[r[0]] + ' ' for r in rlt if r[0] in itemIndex)
        f.write("%s: %s\n" % (u, tmp))
# coding: UTF-8
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import knn
import mnist_test

# `reload` is a builtin only on Python 2; fall back to importlib on Python 3.
try:
    reload
except NameError:
    from importlib import reload

# Classify the point [0, 0] with both classifiers, using the data returned by
# knn.createDataSet().
group, labels = knn.createDataSet()
bb = knn.classify0([0, 0], group, labels, 3)
# Call form prints the same text on Python 2 and is also valid on Python 3.
print(bb)
cc = knn.knn2([0, 0], group, labels, 3)

# Visualize the data: scatter the first two columns, with marker size and
# colour both scaled from the labels.
dataSet, labels = knn.file2matrix('data/datingTestSet2.txt')
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(dataSet[:, 0], dataSet[:, 1], s=15.0 * labels, c=15.0 * labels)
plt.show()

# Measure the misclassification rate.
reload(knn)
# testRatio is the proportion of the data used as the test set; k is the
# number of neighbours.
# NOTE(review): this path starts with '../data/' while file2matrix above reads
# 'data/...' -- confirm which one matches the working directory.
knn.knnTest('../data/datingTestSet2.txt', testRatio=0.2, k=3)

# Test handwritten digit recognition.