def test_mse():
    """Tests for the MSE function."""
    predictions = [pred(0, 0), pred(1, 1), pred(2, 2), pred(100, 100)]
    assert mse(predictions) == 0
    predictions = [pred(0, 0), pred(0, 2)]
    assert mse(predictions) == ((0 - 2)**2) / 2
    predictions = [pred(2, 0), pred(3, 4)]
    assert mse(predictions) == ((2 - 0)**2 + (3 - 4)**2) / 2
    with pytest.raises(ValueError):
        mse([])
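# The test above assumes `pred` and `mse` helpers that the snippet does not
# define. A minimal sketch consistent with the assertions (both names and
# signatures are assumptions here, not Surprise's API) might look like:
import pytest

def pred(true_r, est):
    """Hypothetical helper pairing a true rating with its estimate."""
    return (true_r, est)

def mse(predictions):
    """Mean squared error over (true, est) pairs; empty input is an error."""
    if not predictions:
        raise ValueError('Prediction list is empty.')
    return sum((true_r - est) ** 2 for true_r, est in predictions) / len(predictions)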
def test_knn_based(data):
    """
    Parameters
    ----------
    data : dataframe
        Dataframe with columns userId, movieId, and rating in that order.

    Returns
    -------
    test_mse : float
        The mean squared error for the KNN-based algorithm.
    """
    reader = Reader(rating_scale=(1, 5))
    knn_data = Dataset.load_from_df(data, reader)
    trainset, testset = train_test_split(knn_data, test_size=.10,
                                         random_state=24)
    algo = KNNWithMeans(k=5, sim_options={
        'name': 'pearson_baseline',
        'user_based': True
    })
    algo.fit(trainset)
    predictions = algo.test(testset)
    test_mse = accuracy.mse(predictions, verbose=False)
    return test_mse
def metric(predictions, verbose=True, metric_type="rmse"):
    """Dispatch to the requested Surprise accuracy metric."""
    assert metric_type in {"mse", "fcp", "mae", "rmse"}
    if metric_type == "mse":
        score = accuracy.mse(predictions=predictions, verbose=verbose)
    elif metric_type == "fcp":
        score = accuracy.fcp(predictions=predictions, verbose=verbose)
    elif metric_type == "mae":
        score = accuracy.mae(predictions=predictions, verbose=verbose)
    else:
        score = accuracy.rmse(predictions=predictions, verbose=verbose)
    return score
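# For reference, the dispatcher accepts any Surprise prediction list; the
# `predictions` variable below is assumed to come from an already-fitted
# algorithm's .test() call:
mse_score = metric(predictions, verbose=False, metric_type="mse")
rmse_score = metric(predictions)  # default metric_type="rmse", prints the score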
def test_svd(data):
    """Return the test-set MSE for an SVD model trained on `data`."""
    reader = Reader(rating_scale=(1, 5))
    svd_data = Dataset.load_from_df(data, reader)
    trainset, testset = train_test_split(svd_data, test_size=.10,
                                         random_state=24)
    svd_model = SVD(n_factors=150, n_epochs=20, lr_all=0.008, reg_all=0.1,
                    random_state=24)
    svd_model.fit(trainset)
    predictions = svd_model.test(testset)
    test_mse = accuracy.mse(predictions, verbose=False)
    return test_mse
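# Since test_knn_based and test_svd share a signature, comparing the two models
# is direct; the ratings file name below is an assumption for illustration:
import pandas as pd

ratings = pd.read_csv('ratings.csv', usecols=['userId', 'movieId', 'rating'])
print('KNN MSE:', test_knn_based(ratings))
print('SVD MSE:', test_svd(ratings))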
def evaluateModel(self, model):
    recommendationMetrics = {}
    print("\nEvaluating accuracy of the SVD model based on MAE, MSE and RMSE scores...")
    print("\nLower Mean Absolute Error, Mean Squared Error and Root Mean "
          "Squared Error scores mean the model predicts better...")
    model.fit(self.data.getTrainingData())
    predictions = model.test(self.data.getTestData())
    # calculate the MAE score using the mae function from the surprise library
    recommendationMetrics["Mean Absolute Error"] = accuracy.mae(predictions)
    recommendationMetrics["Mean Square Error"] = accuracy.mse(predictions)
    recommendationMetrics["Root Mean Square Error"] = accuracy.rmse(predictions)
def get_metrics(predictions):
    '''
    Compute accuracy metrics

    Params
    - predictions = list of Surprise Predictions

    Returns
    - Dictionary with metrics

    TODO: Review https://hkh12.scripts.mit.edu/mlgp/Weeks/week15/evaluationRecoEngine.pdf
    for more relevant metrics
    '''
    metric_dict = {}
    metric_dict['RMSE'] = accuracy.rmse(predictions, verbose=False)
    # TODO: fcp takes a long time; is it relevant?
    # metric_dict['FCP'] = accuracy.fcp(predictions, verbose=False)
    metric_dict['MAE'] = accuracy.mae(predictions, verbose=False)
    metric_dict['MSE'] = accuracy.mse(predictions, verbose=False)
    return metric_dict
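# A quick way to inspect the returned metrics (pprint is standard library;
# `predictions` is again assumed to come from a fitted algorithm's .test() call):
from pprint import pprint
pprint(get_metrics(predictions))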
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.30)

similarityMatrix = np.zeros((10, 10))
# yourFile = 'venv\Lib\site-packages\surprise\prediction_algorithms\matrix.txt'
# np.savetxt('venv\Lib\site-packages\surprise\prediction_algorithms\matrix.txt', np.matrix(similarityMatrix))
# if os.path.isfile(yourFile) and os.access(yourFile, os.R_OK):
if True:
    print("Testset")
    print(type(testset[0]))
    algo = KNNBasic()
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions=predictions)
    accuracy.mae(predictions=predictions)
    accuracy.mse(predictions=predictions)

    tupleList = []
    # for train in trainset.all_ratings():
    #     print(train)
    for prediction in predictions:
        # print(prediction)
        tupleList.append(
            (int(prediction[0]), int(prediction[1]), int(prediction[3])))
    sortedTupleL = sorted(tupleList)
    midList = []
    for predTup in sortedTupleL:
        if predTup[0] == 1 and predTup[2] >= 4:
            midList.append(predTup[1])
    # print(midList)
def MSE(predictions):
    return accuracy.mse(predictions, verbose=False)
from surprise import KNNWithMeans
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import KFold

# Read the ratings data
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file('./knn_cf/ratings.csv', reader=reader)
# trainset = data.build_full_trainset()

# Item-based CF scoring: only the k most similar neighbors are used
# (note: `verbose` is a KNNWithMeans argument, not a sim_options key)
algo = KNNWithMeans(k=50, sim_options={'user_based': False}, verbose=True)

# 3-fold cross-validation
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    # train and predict
    algo.fit(trainset)
    pred = algo.test(testset)
    # compute RMSE and MSE
    accuracy.rmse(pred, verbose=True)
    accuracy.mse(pred, verbose=True)

uid = str(196)
iid = str(302)
pred = algo.predict(uid, iid)
print(pred)
def experiments(config_file):
    args = get_args_parser().parse_args(['@' + config_file])

    # Set seed
    np.random.seed(int(args.seed))

    # Construct output directory
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    outdir = args.outdir + str(args.dataset) + "/" + timestamp + '/'

    # Create results directory
    outdir_path = Path(outdir)
    if not outdir_path.is_dir():
        os.makedirs(outdir)

    # Logging
    logfile = outdir + 'log.txt'
    log(logfile, "Directory " + outdir + " created.")

    # Set dataset
    if str(args.dataset) == 'ml-100k':
        dataset_name = 'MovieLens 100K'
    else:
        dataset_name = 'MovieLens 1M'

    # Load the MovieLens dataset (download it if needed)
    data = Dataset.load_builtin(str(args.dataset))

    # 80-20 split
    train_dataset, test_dataset = train_test_split(data, test_size=.20,
                                                   random_state=int(args.seed))

    # Run Autoencoder
    [a_mse, a_runtime] = autoencoder(str(args.dataset), logfile, int(args.seed))

    # Set algorithms
    user_based_msd_sim_options = {'name': 'msd', 'user_based': True}
    user_based_pearson_baseline_sim_options = {
        'name': 'pearson_baseline',
        'user_based': True
    }
    user_based_msd_algo = KNNBasic(sim_options=user_based_msd_sim_options)
    user_based_pearson_baseline_algo = KNNBasic(
        sim_options=user_based_pearson_baseline_sim_options)
    item_based_sim_options = {'name': 'msd', 'user_based': False}
    item_based_pearson_baseline_sim_options = {
        'name': 'pearson_baseline',
        'user_based': False
    }
    item_based_msd_algo = KNNBasic(sim_options=item_based_sim_options)
    item_based_pearson_baseline_algo = KNNBasic(
        sim_options=item_based_pearson_baseline_sim_options)
    algorithms = (
        ("User MSD", user_based_msd_algo),
        ("User Pearson Baseline", user_based_pearson_baseline_algo),
        ("Item MSD", item_based_msd_algo),
        ("Item Pearson Baseline", item_based_pearson_baseline_algo),
    )

    # Plotting
    plt.style.use('dark_background')
    fig, ax = plt.subplots()

    # Autoencoder results
    runtimes = [a_runtime]
    mses = [a_mse]
    # ax.annotate("Autoencoder", (runtimes[0] + .001, mses[0] + .001))

    # Running
    for name, algorithm in algorithms:
        log(logfile, dataset_name + ", " + name)

        # Train
        time_start = time.time()
        algorithm.fit(train_dataset)
        time_stop = time.time()
        log(logfile,
            'Train time: {0:f}'.format(round(time_stop - time_start, 2)).strip('0'))

        # Test
        time_start = time.time()
        predictions = algorithm.test(test_dataset)
        time_stop = time.time()
        runtime = round(time_stop - time_start, 2)
        runtimes += [runtime]
        log(logfile, 'Test time: {0:f}'.format(runtime).strip('0'))

        # MSE metric
        mse = accuracy.mse(predictions, verbose=False)
        mses += [mse]
        log(logfile, 'MSE: {0:1.4f}\n'.format(mse))

    # Draw scatter plot
    ax.scatter(runtimes[1:], mses[1:], marker='x', color='red')
    # ax.scatter(runtimes, mses, marker='x', color='red')

    # Annotate scatter plot; i=0 is for the Autoencoder
    for i, (name, _) in enumerate(algorithms):
        ax.annotate(name, (runtimes[i + 1] + .001, mses[i + 1] + .001))

    # Set plot settings
    plt.title("{}".format(dataset_name), size=15)
    plt.xlabel('Runtime (s)')
    plt.ylabel('MSE')

    # Save plot
    plt.savefig(outdir + 'plot.png', bbox_inches='tight')
np.savetxt(
    os.path.join(output_dir,
                 '%s_%s_mse_predictions.txt' % (dataset_name, algo_name)),
    surprise_predictions_to_matrix(predictions))
if algo_name.startswith('1bitMC'):
    propensity_estimates = algo.propensity_estimates
    np.savetxt(
        os.path.join(
            output_dir,
            '%s_%s_mse_propensity_estimates.txt' % (dataset_name, algo_name)),
        propensity_estimates)
print(algo_name,
      'RMSE:', accuracy.rmse(predictions, verbose=False),
      'MSE:', accuracy.mse(predictions, verbose=False),
      flush=True)
print()

# MAE
print('[Dataset: %s - test set MAE]' % dataset_name, flush=True)
for algo_name in algs_to_run:
    random.seed(algorithm_deterministic_seeds[algo_name] + 2)
    np.random.seed(algorithm_deterministic_seeds[algo_name] + 2)
    algo = best_algs_mae[algo_name]
    algo.fit(train_set)
    predictions = algo.test(test_set)
    np.savetxt(
        os.path.join(output_dir,
                     '%s_%s_mae_predictions.txt' % (dataset_name, algo_name)),
# def printRest():
#     for row in range(0, 5):
#         print(weightedPayoffList[row])
#     print("----------------------------------------------------------------------")
#     for row in range(0, 5):
#         print(similarityMatrix[row][:])
#     print("----------------------------------------------------------------------")
#     # for predict in predictions:
#     #     print(predict)
#
# printRest()

print(accuracy.rmse(predictions=predictions))
print(accuracy.mae(predictions=predictions))
print(accuracy.mse(predictions=predictions))

print(accuracy.rmse(predictions=predictions2))
print(accuracy.mae(predictions=predictions2))
print(accuracy.mse(predictions=predictions2))

predTupleList = []
predMovList = []
for prediction in predictions:
    # print(prediction)
    predTupleList.append(
        (int(prediction[0]), int(prediction[1]), int(prediction[3])))
    predMovList.append(int(prediction[1]))

with open('predictions.txt', 'w') as f:
    for item in predictions:
# split into trainset and testset
trainset, testset = train_test_split(data, test_size=.10)
train_eval = trainset.build_testset()

# train a Funk SGD-SVD algorithm:
epochs = [1, 5, 10, 20, 40, 80, 100, 120, 150]
train_mse = []
test_mse = []
for n_epoch in epochs:
    print("Number of epochs trained", n_epoch)
    algo = SVD(n_factors=40, lr_all=0.001, n_epochs=n_epoch)
    algo.fit(trainset)
    train_predictions = algo.test(train_eval)
    test_predictions = algo.test(testset)
    train_mse.append(accuracy.mse(train_predictions))
    test_mse.append(accuracy.mse(test_predictions))
    print(accuracy.mse(train_predictions), accuracy.mse(test_predictions))


# function to plot the learning curve through epochs
def plot_learning_curve(iter_array, train_accuracy, test_accuracy,
                        xlabel='iterations'):
    plt.plot(iter_array, train_accuracy, label='Train mse', linewidth=5)
    plt.plot(iter_array, test_accuracy, label='Test mse', linewidth=5)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.xlabel(xlabel, fontsize=30)
    plt.ylabel('MSE', fontsize=30)
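# With the lists collected above, the helper plots both curves; the legend and
# show calls are additions here so the two labeled lines are actually displayed:
plot_learning_curve(epochs, train_mse, test_mse, xlabel='epochs')
plt.legend(loc='best', fontsize=20)
plt.show()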
import pickle

import pandas as pd
import surprise
from surprise import Reader, Dataset, accuracy, dump, SVD
from surprise.model_selection import (cross_validate, GridSearchCV, KFold,
                                      train_test_split)

data = pd.read_pickle("data.pickle")
test = pd.read_pickle("test.pickle")
kf = KFold()
trainset, testset = train_test_split(data, test_size=.80)

# dump.load returns a (predictions, algo) tuple, so the model itself is algo[1]
algo = dump.load('saved svd modelV12')
model = algo[1].fit(data.build_full_trainset())
predictions = algo[1].test(testset)
dump.dump('Complete SVD v1.12', predictions=False, algo=algo, verbose=0)

accuracy.rmse(predictions, verbose=True)
accuracy.mae(predictions, verbose=True)
accuracy.fcp(predictions, verbose=True)
accuracy.mse(predictions, verbose=True)
R_surpise = Dataset.load_from_df(R[['userId', 'movieId', 'rating']], reader)
svd = SVD(random_state=43, n_factors=25, n_epochs=500)

# train algo on the whole data
R_surpise = R_surpise.build_full_trainset()
svd.fit(R_surpise)

# test and fill in the missing ratings
testset = R_surpise.build_anti_testset()
predictions = svd.test(testset)

# score model
print(accuracy.rmse(predictions))  # 0.7426613175948654
print(accuracy.mse(predictions))   # 0.5515458326517415

# Reconstruction of original matrix
original = np.zeros((R_surpise.n_users, R_surpise.n_items))
for (u, i, r) in R_surpise.all_ratings():
    original[u][i] = r
known_entries = (original != 0)

mean = R_surpise.global_mean
bi = svd.bi.reshape(svd.bi.shape[0], 1)
bu = svd.bu.reshape(svd.bu.shape[0], 1)
qi = svd.qi
pu = svd.pu
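# The snippet stops just before the reconstruction itself. Under Surprise's SVD
# model the estimate is r_ui = mu + b_u + b_i + q_i . p_u, so a sketch of the
# missing step (R_hat and reconstructed are assumed names) could be:

# Broadcasting: bu is (n_users, 1), bi.T is (1, n_items),
# and pu @ qi.T is (n_users, n_items).
R_hat = mean + bu + bi.T + pu @ qi.T

# Keep observed ratings where known, use model predictions elsewhere.
reconstructed = np.where(known_entries, original, R_hat)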