# NOTE(review): this chunk arrived whitespace-mangled (collapsed onto two
# physical lines). The indentation below is a reconstruction of the apparent
# structure; verify against the original file before relying on it.

# Randomly split the full data set roughly 50/50 into train and test sets
# (random.randrange(2) is a coin flip per datum).
for datum in training_data:
    if (random.randrange(2) == 0):
        train_data.append(datum)
    else:
        test_data.append(datum)

# Compute the global mean rating for a fallback.
num_train = len(train_data)
mean_rating = float(sum(map(lambda x: x['rating'], train_data)))/num_train

# Cluster users on age alone: build a (num_users x 1) feature matrix.
num_users = len(user_list)
mat = np.zeros((num_users,1))
for i in range(num_users):
    mat[i][0] = user_list[i]['age']
# kmeans_plus appears to return [centroids, assignments]; centroids unused here.
[_,resp] = kmeans_plus.kmeans_plus(mat, NUM_CLUSTERS);

# Record each user's cluster and initialise per-user accumulators.
# NOTE(review): 'total'/'count' presumably accumulate that user's ratings in
# code outside this chunk — confirm against the full file.
users = {}
clusters = []
for i in range(NUM_CLUSTERS):
    clusters.append(set())
for i in range(num_users):
    clusters[resp[i]].add(user_list[i]['user'])
    users[user_list[i]['user']] = {'index':resp[i],'total':0.,'count':0}

'''users = {}
clusters = []
for i in range(5):
    clusters.append(set())
for i in range(num_users):
    cluster = 0
    else:
        book_pref = mean_rating
    mat[user_keys[rating['user']]][book_keys[rating['isbn']]] = rating['rating'] - float(user['total']) / user['count'] + mean_rating - book_pref
    mat[user_keys[rating['user']]][book_keys[rating['isbn']]] = rating['rating'] - float(user['total']) / user['count']'''

# Center the training rating by the user's mean rating before factorisation.
# NOTE(review): 'rating' and 'user' are unbound in the visible chunk — this
# looks like a loop-body line whose enclosing "for rating in ..." header is
# either outside this chunk or was swallowed by the commented-out block
# above; confirm against the original file.
mat[user_keys[rating['user']]][book_keys[rating['isbn']]] = rating['rating'] - float(user['total']) / user['count']

# Reduce the user-book rating matrix to NUM_COMPONENTS dimensions with PCA.
pca = PCA(n_components = NUM_COMPONENTS)
reduced_mat = pca.fit_transform(mat)

'''vars = pca.explained_variance_ratio_
x = [w for w in range(NUM_COMPONENTS)]
plt.plot(x,vars,'ro')
plt.savefig('pca_variances.png')'''

# Re-cluster users in the reduced PCA space; mu = centroids, resp = assignments.
[mu,resp] = kmeans_plus.kmeans_plus(reduced_mat, NUM_CLUSTERS)
cluster_ids = []
for i in range(NUM_CLUSTERS):
    cluster_ids.append(set())
for i in range(num_users):
    cluster_ids[resp[i]].add(inv_user_keys[i])

# Make predictions for each test query.
# Map every ISBN in the full book list to a column index.
long_book_keys = {}
index = 0
for book in book_list:
    long_book_keys[book['isbn']] = index
    index += 1
training_sorted = []