Example no. 1
def test_mse():
    """Tests for the MSE function."""

    predictions = [pred(0, 0), pred(1, 1), pred(2, 2), pred(100, 100)]
    assert mse(predictions) == 0

    predictions = [pred(0, 0), pred(0, 2)]
    assert mse(predictions) == ((0 - 2)**2) / 2

    predictions = [pred(2, 0), pred(3, 4)]
    assert mse(predictions) == ((2 - 0)**2 + (3 - 4)**2) / 2

    with pytest.raises(ValueError):
        mse([])
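
The test above assumes two project-specific helpers, `pred` and `mse`, that are not shown. A minimal sketch of what they could look like, purely to make the assertions concrete (the names, signature, and record type here are assumptions, not the original project's code):

from collections import namedtuple

# Hypothetical prediction record: the true rating and the estimated rating.
Prediction = namedtuple("Prediction", ["true", "est"])


def pred(true, est):
    """Build a single (true rating, estimated rating) record."""
    return Prediction(true, est)


def mse(predictions):
    """Mean squared error over a list of predictions; an empty list is an error."""
    if not predictions:
        raise ValueError("predictions must not be empty")
    return sum((p.true - p.est) ** 2 for p in predictions) / len(predictions)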
Example no. 2
def test_knn_based(data):
    """
    Parameters
    ----------
    data : dataframe
        Dataframe with columns userId, movieId, and rating in that order.

    Returns
    -------
    test_mse : float
        The mean squared error for the knn based algorithm.

    """
    reader = Reader(rating_scale=(1, 5))
    knn_data = Dataset.load_from_df(data, reader)
    trainset, testset = train_test_split(knn_data,
                                         test_size=.10,
                                         random_state=24)
    algo = KNNWithMeans(k=5,
                        sim_options={
                            'name': 'pearson_baseline',
                            'user_based': True
                        })
    algo.fit(trainset)
    predictions = algo.test(testset)
    test_mse = accuracy.mse(predictions, verbose=False)
    return test_mse
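
A hypothetical call, assuming the Surprise imports the snippet relies on are in scope and that the ratings arrive as a pandas DataFrame with the column order the docstring asks for (the toy values below are made up purely to show the call shape):

import pandas as pd

ratings = pd.DataFrame({
    'userId':  [1, 1, 2, 2, 3, 3, 4, 4],
    'movieId': [10, 20, 10, 30, 20, 30, 10, 20],
    'rating':  [4.0, 3.5, 5.0, 2.0, 3.0, 4.5, 2.5, 4.0],
})

print(test_knn_based(ratings))  # MSE on the 10% hold-out split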
Example no. 3
def metric(predictions, verbose=True, metric_type="rmse"):
    assert metric_type in {"mse", "fcp", "mae", "rmse"}
    if metric_type == "mse":
        metric = accuracy.mse(predictions=predictions, verbose=verbose)
    elif metric_type == "fcp":
        metric = accuracy.fcp(predictions=predictions, verbose=verbose)
    elif metric_type == "mae":
        metric = accuracy.mae(predictions=predictions, verbose=verbose)
    else:
        metric = accuracy.rmse(predictions=predictions, verbose=verbose)
    return metric
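
A possible end-to-end use of the dispatcher on the built-in MovieLens data; the SVD model here is an arbitrary choice, and any fitted Surprise algorithm that produces predictions would do:

from surprise import Dataset, SVD, accuracy
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')  # downloads on first use
trainset, testset = train_test_split(data, test_size=.25, random_state=0)

algo = SVD(random_state=0)
algo.fit(trainset)
predictions = algo.test(testset)

print(metric(predictions, verbose=False))                     # RMSE (the default)
print(metric(predictions, verbose=False, metric_type="mse"))  # MSE
print(metric(predictions, verbose=False, metric_type="mae"))  # MAE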
Example no. 4
def test_svd(data):
    reader = Reader(rating_scale=(1, 5))
    svd_data = Dataset.load_from_df(data, reader)
    trainset, testset = train_test_split(svd_data,
                                         test_size=.10,
                                         random_state=24)
    svd_model = SVD(n_factors=150,
                    n_epochs=20,
                    lr_all=0.008,
                    reg_all=0.1,
                    random_state=24)
    svd_model.fit(trainset)
    predictions = svd_model.test(testset)
    test_mse = accuracy.mse(predictions, verbose=False)
    return test_mse
Example no. 5
    def evaluateModel(self, model):

        recommendationMetrics = {}
        print(
            "\nEvaluating accuracy of the SVD model based on MAE, MSE and RMSE scores..."
        )
        print(
            "\nLower Mean Absolute Error, Mean Squared Error and Root Mean Squared "
            "Error scores indicate better predictions...")
        model.fit(self.data.getTrainingData())
        predictions = model.test(self.data.getTestData())
        # calculate the MAE score using the mae function from the surprise library
        recommendationMetrics["Mean Absolute Error"] = accuracy.mae(
            predictions)
        recommendationMetrics["Mean Square Error"] = accuracy.mse(predictions)
        recommendationMetrics["Root Mean Square Error"] = accuracy.rmse(
            predictions)
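
The method reports MAE, MSE and RMSE side by side; as a quick sanity check (a sketch, not part of the original class), Surprise's RMSE is simply the square root of its MSE over the same prediction list:

import math

from surprise import accuracy


def check_rmse_matches_mse(predictions):
    """RMSE and MSE are computed from the same squared errors, so rmse == sqrt(mse)."""
    mse = accuracy.mse(predictions, verbose=False)
    rmse = accuracy.rmse(predictions, verbose=False)
    assert math.isclose(rmse, math.sqrt(mse), rel_tol=1e-9)
    return mse, rmse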
Example no. 6
def get_metrics(predictions):
  '''
  Compute accuracy metrics

  Params
    - predictions = list of Surprise Predictions
  Returns
    - Dictionary with metrics

  TODO: Review https://hkh12.scripts.mit.edu/mlgp/Weeks/week15/evaluationRecoEngine.pdf for more relevant metrics

  '''
  metric_dict = {}
  metric_dict['RMSE'] = accuracy.rmse(predictions, verbose=False)
  # TODO: fcp takes long time + is it relevant?
  #metric_dict['FCP'] = accuracy.fcp(predictions, verbose=False)
  metric_dict['MAE'] = accuracy.mae(predictions, verbose=False)
  metric_dict['MSE'] = accuracy.mse(predictions, verbose=False)

  return metric_dict
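
The returned dictionary can be cross-checked by recomputing the metrics directly from the prediction fields (`r_ui` is the observed rating, `est` the estimate); a minimal sketch of that check:

import numpy as np


def manual_metrics(predictions):
    """Recompute RMSE/MAE/MSE straight from the Surprise Prediction fields."""
    errors = np.array([p.r_ui - p.est for p in predictions])
    return {
        'RMSE': float(np.sqrt(np.mean(errors ** 2))),
        'MAE': float(np.mean(np.abs(errors))),
        'MSE': float(np.mean(errors ** 2)),
    }

# manual_metrics(predictions) should agree with get_metrics(predictions)
# up to floating-point noise.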
Example no. 7
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.30)
similarityMatrix = np.zeros((10, 10))
# yourFile = 'venv\Lib\site-packages\surprise\prediction_algorithms\matrix.txt'
# np.savetxt('venv\Lib\site-packages\surprise\prediction_algorithms\matrix.txt', np.matrix(similarityMatrix))

#if os.path.isfile(yourFile) and os.access(yourFile, os.R_OK):
if True:
    print("Testset")
    print(type(testset[0]))
    algo = KNNBasic()
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions=predictions)
    accuracy.mae(predictions=predictions)
    accuracy.mse(predictions=predictions)
    tupleList = []

#for train in trainset.all_ratings():
#    print(train)

for prediction in predictions:
    # print(prediction)
    tupleList.append(
        (int(prediction[0]), int(prediction[1]), int(prediction[3])))
sortedTupleL = sorted(tupleList)
midList = []
for predTup in sortedTupleL:
    if predTup[0] == 1 and predTup[2] >= 4:
        midList.append(predTup[1])
# print(midList)
Example no. 8
def MSE(predictions):
    return accuracy.mse(predictions, verbose=False)
Example no. 9
from surprise import KNNWithMeans
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import KFold

# Load the data
reader = Reader(line_format='user item rating timestamp',
                sep=',',
                skip_lines=1)
data = Dataset.load_from_file('./knn_cf/ratings.csv', reader=reader)
# trainset = data.build_full_trainset()

# ItemCF: compute scores
# When computing from the most similar users, only the k most similar are used
algo = KNNWithMeans(k=50, sim_options={'user_based': False}, verbose=True)

# Define k-fold cross-validation with k=3
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    # Train and predict
    algo.fit(trainset)
    pred = algo.test(testset)
    # Compute RMSE and MSE
    accuracy.rmse(pred, verbose=True)
    accuracy.mse(pred, verbose=True)

uid = str(196)
iid = str(302)
pred = algo.predict(uid, iid)
print(pred)
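
Continuing from the snippet above: `algo.predict` returns a `Prediction` namedtuple, and the estimated rating lives in its `est` field:

# Prediction fields: uid, iid, r_ui (None here, since no true rating was given), est, details
print('user %s, item %s -> estimated rating %.2f' % (pred.uid, pred.iid, pred.est))
print(pred.details)  # for KNN models this includes the number of neighbours actually used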
Example no. 10
def experiments(config_file):
    args = get_args_parser().parse_args(['@' + config_file])

    # Set seed
    np.random.seed(int(args.seed))

    # Construct output directory
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    outdir = args.outdir + str(args.dataset) + "/" + timestamp + '/'

    # Create results directory
    outdir_path = Path(outdir)
    if not outdir_path.is_dir():
        os.makedirs(outdir)

    # Logging
    logfile = outdir + 'log.txt'
    log(logfile, "Directory " + outdir + " created.")

    # Set dataset
    if str(args.dataset) == 'ml-100k':
        dataset_name = 'MovieLens 100K'
    else:
        dataset_name = 'MovieLens 1M'

    # Load the MovieLens dataset (download it if needed).
    data = Dataset.load_builtin(str(args.dataset))

    # 80-20 split
    train_dataset, test_dataset = train_test_split(data,
                                                   test_size=.20,
                                                   random_state=int(args.seed))

    # Run Autoencoder
    [a_mse, a_runtime] = autoencoder(str(args.dataset), logfile,
                                     int(args.seed))

    # Set algorithms
    user_based_msd_sim_options = {'name': 'msd', 'user_based': True}
    user_based_pearson_baseline_sim_options = {
        'name': 'pearson_baseline',
        'user_based': True
    }
    user_based_msd_algo = KNNBasic(sim_options=user_based_msd_sim_options)
    user_based_pearson_baseline_algo = KNNBasic(
        sim_options=user_based_pearson_baseline_sim_options)

    item_based_sim_options = {'name': 'msd', 'user_based': False}
    item_based_pearson_baseline_sim_options = {
        'name': 'pearson_baseline',
        'user_based': False
    }
    item_based_msd_algo = KNNBasic(sim_options=item_based_sim_options)
    item_based_pearson_baseline_algo = KNNBasic(
        sim_options=item_based_pearson_baseline_sim_options)

    algorithms = (
        ("User MSD", user_based_msd_algo),
        ("User Pearson Baseline", user_based_pearson_baseline_algo),
        ("Item MSD", item_based_msd_algo),
        ("Item Pearson Baseline", item_based_pearson_baseline_algo),
    )

    # Plotting
    plt.style.use('dark_background')
    fig, ax = plt.subplots()

    # Autoencoder results
    runtimes = [a_runtime]
    mses = [a_mse]
    # ax.annotate("Autoencoder", (runtimes[0] + .001, mses[0] + .001))

    # Running
    for name, algorithm in algorithms:
        log(logfile, dataset_name + ", " + name)

        # Train
        time_start = time.time()
        algorithm.fit(train_dataset)
        time_stop = time.time()
        log(
            logfile,
            'Train time: {0:f}'.format(round(time_stop - time_start,
                                             2)).strip('0'))

        # Test
        time_start = time.time()
        predictions = algorithm.test(test_dataset)
        time_stop = time.time()
        runtime = round(time_stop - time_start, 2)
        runtimes += [runtime]
        log(logfile, 'Test time: {0:f}'.format(runtime).strip('0'))

        # MSE metric
        mse = accuracy.mse(predictions, verbose=False)
        mses += [mse]
        log(logfile, 'MSE: {0:1.4f}\n'.format(mse))

    # Draw scatter plot
    ax.scatter(runtimes[1:], mses[1:], marker='x', color='red')
    # ax.scatter(runtimes, mses, marker='x', color='red')

    # Annotate scatter plot, i=0 is for Autoencoder
    for i, (name, _) in enumerate(algorithms):
        ax.annotate(name, (runtimes[i + 1] + .001, mses[i + 1] + .001))

    # Set plot settings
    plt.title("{}".format(dataset_name), size=15)
    plt.xlabel('Runtime (s)')
    plt.ylabel('MSE')

    # Save plot
    plt.savefig(outdir + 'plot.png', bbox_inches='tight')
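
A hypothetical invocation, assuming `get_args_parser` was built with argparse's `fromfile_prefix_chars='@'` convention, which the `'@' + config_file` call above suggests; the config file name and contents below are made up:

# experiments.cfg, one token per line (argparse @-file default):
#   --dataset
#   ml-100k
#   --seed
#   24
#   --outdir
#   results/
experiments('experiments.cfg')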
Example no. 11
        os.path.join(output_dir,
                     '%s_%s_mse_predictions.txt' % (dataset_name, algo_name)),
        surprise_predictions_to_matrix(predictions))

    if algo_name.startswith('1bitMC'):
        propensity_estimates = algo.propensity_estimates
        np.savetxt(
            os.path.join(
                output_dir, '%s_%s_mse_propensity_estimates.txt' %
                (dataset_name, algo_name)), propensity_estimates)

    print(algo_name,
          'RMSE:',
          accuracy.rmse(predictions, verbose=False),
          'MSE:',
          accuracy.mse(predictions, verbose=False),
          flush=True)

print()

# MAE
print('[Dataset: %s - test set MAE]' % dataset_name, flush=True)
for algo_name in algs_to_run:
    random.seed(algorithm_deterministic_seeds[algo_name] + 2)
    np.random.seed(algorithm_deterministic_seeds[algo_name] + 2)
    algo = best_algs_mae[algo_name]
    algo.fit(train_set)
    predictions = algo.test(test_set)
    np.savetxt(
        os.path.join(output_dir,
                     '%s_%s_mae_predictions.txt' % (dataset_name, algo_name)),
Example no. 12
# def printRest():
#     for row in range(0, 5):
#         print(weightedPayoffList[row])
#     print("----------------------------------------------------------------------")
#     for row in range(0, 5):
#         print(similarityMatrix[row][:])
#     print("----------------------------------------------------------------------")
#     #for predict in predictions:
#     #    print(predict)
#
#
# printRest()

print(accuracy.rmse(predictions=predictions))
print(accuracy.mae(predictions=predictions))
print(accuracy.mse(predictions=predictions))
print(accuracy.rmse(predictions=predictions2))
print(accuracy.mae(predictions=predictions2))
print(accuracy.mse(predictions=predictions2))

predTupleList = []
predMovList = []

for prediction in predictions:
    # print(prediction)
    predTupleList.append(
        (int(prediction[0]), int(prediction[1]), int(prediction[3])))
    predMovList.append(int(prediction[1]))

with open('predictions.txt', 'w') as f:
    for item in predictions:
Example no. 13
# split into trainset and testset
trainset, testset = train_test_split(data, test_size=.10)
train_eval = trainset.build_testset()

# train a Funk SGD-SVD algorithm:
epochs = [1, 5, 10, 20, 40, 80, 100, 120, 150]
train_mse = []
test_mse = []
for n_epoch in epochs:
    print("Number of epochs trained", n_epoch)
    algo = SVD(n_factors=40, lr_all=0.001, n_epochs=n_epoch)
    algo.fit(trainset)
    train_predictions = algo.test(train_eval)
    test_predictions = algo.test(testset)
    train_mse.append(accuracy.mse(train_predictions, verbose=False))
    test_mse.append(accuracy.mse(test_predictions, verbose=False))
    print(train_mse[-1], test_mse[-1])

# function to plot the learning curve through epochs
def plot_learning_curve(iter_array, train_accuracy, test_accuracy, xlabel='iterations'):
    plt.plot(iter_array, train_accuracy,
             label='Train mse', linewidth=5)
    plt.plot(iter_array, test_accuracy,
             label='Test mse', linewidth=5)


    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.xlabel(xlabel, fontsize=30)
    plt.ylabel('MSE', fontsize=30)
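
The helper only draws the two curves; one possible way to finish and render the figure with the lists collected in the training loop above:

import matplotlib.pyplot as plt  # already required by the snippet above

plot_learning_curve(epochs, train_mse, test_mse, xlabel='epochs')
plt.legend(loc='best', fontsize=16)
plt.show()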
Example no. 14
import pandas as pd
from surprise import Reader, Dataset, accuracy, dump, SVD
import surprise
import pickle
from surprise.model_selection import cross_validate, GridSearchCV, KFold, train_test_split

data = pd.read_pickle("data.pickle")
test = pd.read_pickle("test.pickle")
kf = KFold()

trainset, testset = train_test_split(data, test_size=.80)

algo = dump.load('saved svd modelV12')
model = algo[1].fit(data.build_full_trainset())
predictions = algo[1].test(testset)
dump.dump('Complete SVD v1.12', predictions=False, algo=algo, verbose=0)
accuracy.rmse(predictions, verbose=True)
accuracy.mae(predictions, verbose=True)
accuracy.fcp(predictions, verbose=True)
accuracy.mse(predictions, verbose=True)
Example no. 15
    R_surpise = Dataset.load_from_df(R[['userId', 'movieId', 'rating']],
                                     reader)

    svd = SVD(random_state=43, n_factors=25, n_epochs=500)

    # train algo on the whole data
    R_surpise = R_surpise.build_full_trainset()
    svd.fit(R_surpise)

    # test and fill in the missing ratings
    testset = R_surpise.build_anti_testset()
    predictions = svd.test(testset)
    # score model
    print(accuracy.rmse(predictions))
    # 0.7426613175948654
    print(accuracy.mse(predictions))
    # 0.5515458326517415

    # Reconstruction of original matrix

    original = np.zeros((R_surpise.n_users, R_surpise.n_items))

    for (u, i, r) in R_surpise.all_ratings():
        original[u][i] = r

    known_entries = (original != 0)
    mean = R_surpise.global_mean
    bi = svd.bi.reshape(svd.bi.shape[0], 1)
    bu = svd.bu.reshape(svd.bu.shape[0], 1)
    qi = svd.qi
    pu = svd.pu
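
The snippet stops after extracting the fitted SVD parameters; a sketch of how the full matrix could be reconstructed from them, following the model's own prediction rule r̂_ui = μ + b_u + b_i + q_i·p_u (note that `svd.test` additionally clips estimates to the rating scale, which is skipped here):

    # Dense estimate: global mean + user biases + item biases + latent term p_u · q_i
    reconstructed = mean + bu + bi.T + pu @ qi.T  # shape (n_users, n_items)

    # Keep observed ratings where available, use the model's estimates elsewhere.
    filled = np.where(known_entries, original, reconstructed)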