Example No. 1
def base_running_time(data):
    '''
        Calculates the running times for training and prediction for the Baseline algorithm

        Args:
            data(list of Dataset): datasets with different numbers of users

        Returns:
            elapsedtime_Basetrain: running times for training
            elapsedtime_Basetest: running times for predictions on the test set
    '''
    elapsedtime_Basetrain = []
    elapsedtime_Basetest = []

    # calculate running times
    for i in range(len(data)):
        # build the trainset and anti-testset outside the timed block
        training = data[i].build_full_trainset()
        testing = training.build_anti_testset()

        # training running time
        training_start = time.time()
        baseline = BaselineOnly()
        baseline.train(training)
        elapsedtime_Basetrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        baseline.test(testing)
        elapsedtime_Basetest.append(time.time() - test_start)
    return elapsedtime_Basetrain, elapsedtime_Basetest
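A minimal usage sketch (hypothetical, not part of the original source) might pass a list of Surprise datasets; here the built-in ml-100k dataset is simply reused for brevity:

from surprise import Dataset

datasets = [Dataset.load_builtin('ml-100k') for _ in range(3)]  # placeholder list of datasets
train_times, test_times = base_running_time(datasets)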
Example No. 2
def test_dump():
    """Train an algorithm, compute its predictions then dump them.
    Ensure that the predictions that are loaded back are the correct ones, and
    that the predictions of the dumped algorithm are also equal to the other
    ones."""

    random.seed(0)

    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))

    for trainset, testset in data.folds():
        pass

    algo = BaselineOnly()
    algo.train(trainset)
    predictions = algo.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        dump.dump(tmp_file.name, predictions, algo)
        predictions_dumped, algo_dumped = dump.load(tmp_file.name)

        predictions_algo_dumped = algo_dumped.test(testset)
        assert predictions == predictions_dumped
        assert predictions == predictions_algo_dumped
Example No. 3
def baseline(trainset, testset, predset):
    
    modelname = 'baseline'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return
    
    bsl_options = { 'method': 'als',
                    'reg_i': 1.e-5,
                    'reg_u': 14.6,
                    'n_epochs': 10
                   }
    
    algo = BaselineOnly(bsl_options=bsl_options)
    print('Baseline Model')
    algo.train(trainset)
    
    predictions = algo.test(trainset.build_testset())
    print('   RMSE on Train: ', accuracy.rmse(predictions, verbose=False))
    
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   RMSE on Test: ', rmse)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds, 'test')

    print('   Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds)
Example No. 4
def grid_search_knn(data_train, data_test, n_epochs, reg_us, reg_is,
                    file_name):

    print('KNN Surprise manual grid search')

    result_train = pd.DataFrame()
    result_test = pd.DataFrame()

    # loops on the parameters
    for n_epoch in n_epochs:
        for reg_u in reg_us:
            for reg_i in reg_is:

                bsl_options = {
                    'method': 'als',
                    'n_epochs': n_epoch,
                    'reg_u': reg_u,
                    'reg_i': reg_i
                }

                algo = BaselineOnly(bsl_options=bsl_options)
                # Retrieve the trainset.
                trainset = data_train.build_full_trainset()

                # Build an algorithm, and train it.
                algo.train(trainset)
                #Evaluate the performance
                perf_train = evaluate(algo, data_train, measures=['RMSE'])
                perf_test = evaluate(algo, data_test, measures=['RMSE'])

                perf_train["n_epoch"] = n_epoch
                perf_train["reg_u"] = reg_u
                perf_train["reg_i"] = reg_i
                #Store the mean performance RMSE on train
                perf_train["rmse"] = np.mean(perf_train['rmse'])

                perf_test["n_epoch"] = n_epoch
                perf_test["reg_u"] = reg_u
                perf_test["reg_i"] = reg_i
                #Store the mean performance RMSE on test
                perf_test["rmse"] = np.mean(perf_test['rmse'])

                #Store on a dataframe
                result_train = result_train.append(perf_train,
                                                   ignore_index=True)
                result_test = result_test.append(perf_test, ignore_index=True)

    # Save the dataframes so we can inspect or plot the differences if needed
    writer = pd.ExcelWriter(file_name, engine='xlsxwriter')
    result_train.to_excel(writer, 'Sheet1')
    result_test.to_excel(writer, 'Sheet2')
    writer.save()
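A brief usage sketch (the parameter grids and output file name below are hypothetical; data_train and data_test are assumed to be Surprise Dataset objects loaded elsewhere):

grid_search_knn(data_train, data_test,
                n_epochs=[5, 10],
                reg_us=[10, 15],
                reg_is=[1e-5, 1e-3],
                file_name='baseline_grid_search.xlsx')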
Example No. 5
def knn_surprise(data_train, n_epoch, reg_u, reg_i, name_file):
    print('KNN Surprise')

    # Build the BaselineOnly algorithm with the chosen ALS parameters
    bsl_options = {
        'method': 'als',
        'n_epochs': n_epoch,
        'reg_u': reg_u,
        'reg_i': reg_i
    }

    # Create the BaselineOnly algorithm
    algo = BaselineOnly(bsl_options=bsl_options)
    # Retrieve the trainset.
    trainset = data_train.build_full_trainset()

    #Build an algorithm, and train it.
    algo.train(trainset)

    #Evaluate the RMSE of the algo
    evaluate(algo, data_train, measures=['RMSE'])
    # Make the prediction
    make_prediction_surprise(algo, name_file)
Example No. 6
def baseline(training, testing):
    '''
    Calculates RMSE and top-n predictions of the Baseline model

    Args:
        training(Dataset): training dataset
        testing(Dataset): test dataset

    Returns:
        rmse: RMSE of the Baseline model on the test set
        top_n: top-5 recommendations derived from the test predictions
    '''

    # fit model
    baseline = BaselineOnly()
    baseline.train(training)

    # evaluate the model using test data
    predictions = baseline.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)

    return rmse, top_n
Example No. 7
def compute_recommendations(user_id, prediction_table,
                            numeric_prediction_table):

    algo = 'Baseline'

    algorithm = BaselineOnly()

    # add_pageview(user_id=user_id, item_id=None, page="Model Predictions", activity_type="Initialize Predictions - " + algo, rating=None) #pageview

    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(
        sessionmaker(bind=engine, autocommit=False, autoflush=False))

    # read ratings from the database

    df_ratings = pd.read_sql('SELECT * FROM ratings;', con=engine)
    df_ratings = df_ratings[['user_id', 'item_id', 'rating']]
    df_ratings = df_ratings.dropna()
    df_ratings = df_ratings.drop_duplicates()

    df_ratings2 = pd.read_csv('data/ratings.csv', low_memory=False)
    df_ratings2 = df_ratings2.rename(columns={'movie_id': 'item_id'})
    df_ratings2 = df_ratings2[['user_id', 'item_id', 'rating']]
    df_ratings2 = df_ratings2.dropna()
    df_ratings2 = df_ratings2.drop_duplicates()

    df_ratings = pd.concat([df_ratings, df_ratings2], axis=0)

    reader = Reader(line_format='user item rating',
                    sep=',',
                    rating_scale=(1, 10))
    data = Dataset.load_from_df(df_ratings, reader=reader)

    trainset = data.build_full_trainset()

    #     algorithm = eval(algo + "()")# set the algorithm...............................................

    algorithm.train(trainset)

    items = pd.read_sql('SELECT distinct id FROM items;', con=engine)
    df_user_items = df_ratings.loc[df_ratings['user_id'] == user_id]
    total_items = items.id.unique()
    user_items = df_user_items.item_id.unique()
    # user_id = str(user_id)
    prediction_items = [x for x in total_items if x not in user_items]

    predictions = pd.DataFrame(columns=['user_id', 'item_id', 'prediction'])

    predicted_ratings = []

    for i in prediction_items:
        a = user_id
        b = i
        est = algorithm.predict(a, b)
        predicted_ratings.append(est[3])

    predictions['item_id'] = prediction_items
    predictions['user_id'] = pd.Series(
        [user_id for x in range(len(predictions.index))],
        index=predictions.index)

    predictions['prediction'] = predicted_ratings

    predictions = predictions.sort_values('prediction', ascending=False)
    test_prediction = predictions
    predictions = predictions.head(n=10)

    cols = [
        'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6', 'pred_7',
        'pred_8', 'pred_9', 'pred_10'
    ]

    df_pred = predictions[['item_id']].T

    df_pred.columns = cols

    df_pred['id'] = user_id

    df_pred = df_pred[[
        'id', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6',
        'pred_7', 'pred_8', 'pred_9', 'pred_10'
    ]]

    df_pred['id'] = df_pred['id'].astype(int)

    df_pred.to_sql(prediction_table, engine, if_exists='append',
                   index=False)  #if_exists='append'
    session.commit()

    df_num_ratings = test_prediction

    df_num_ratings = df_num_ratings.head(n=20)

    df_num_ratings['algorithm'] = algo
    df_num_ratings.rename(columns={'prediction': 'predicted_rating'},
                          inplace=True)

    df_num_ratings.to_sql('numeric_predictions',
                          engine,
                          if_exists='append',
                          index=False)  #if_exists='append'
    session.commit()

    predcols = [
        'num_1', 'num_2', 'num_3', 'num_4', 'num_5', 'num_6', 'num_7', 'num_8',
        'num_9', 'num_10'
    ]

    df_num_ratings_transpose = predictions[['prediction']].T
    df_num_ratings_transpose.columns = predcols

    df_num_ratings_transpose['id'] = user_id

    df_num_ratings_transpose = df_num_ratings_transpose[[
        'id', 'num_1', 'num_2', 'num_3', 'num_4', 'num_5', 'num_6', 'num_7',
        'num_8', 'num_9', 'num_10'
    ]]

    df_num_ratings_transpose['id'] = df_num_ratings_transpose['id'].astype(int)

    df_num_ratings_transpose.to_sql(numeric_prediction_table,
                                    engine,
                                    if_exists='append',
                                    index=False)  #if_exists='append'
    session.commit()
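A hypothetical call (the user id and table names below are placeholders, not taken from the original project) might look like:

compute_recommendations(user_id=1,
                        prediction_table='predictions',
                        numeric_prediction_table='numeric_predictions_wide')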
Example No. 8
def test_trainset_testset():
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    for trainset, testset in data.folds():
        pass  # just need trainset and testset to be set

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.train(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.train(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset
Example No. 9
#reader = Reader(line_format='user item rating', sep='\t')

# A reader is still needed, but only the rating_scale param is required.
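# For example (a hypothetical sketch, not from the original source), the `data`
# object used below could have been built along these lines:
# reader = Reader(rating_scale=(1, 5))
# data = Dataset.load_from_df(ratings_df[['user', 'item', 'rating']], reader)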

data.split(n_folds=20)  # data can now be used normally

data_full = data.build_full_trainset()

bu, bi, global_mean = get_baseline(datamat_missing,
                                   lr=0.01,
                                   n_epochs=50,
                                   reg=0)
best_item_est_oma = np.mean(bu) + bi + global_mean

algo_baseline = BaselineOnly(bsl_options={'reg_u': 0, 'reg_i': 0})
algo_baseline.train(data_full)
best_item_est = algo_baseline.trainset._global_mean + np.mean(
    algo_baseline.bu) + algo_baseline.bi

algo_SVD = NMF(verbose=True,
               n_factors=5,
               n_epochs=50,
               reg_bu=0,
               reg_bi=0,
               reg_pu=0.1,
               reg_qi=0.1,
               biased=True)
algo_SVD.train(data_full)
best_item_est_svd = algo_SVD.trainset._global_mean + np.mean(
    algo_SVD.bu) + algo_SVD.bi
Example No. 10
count = 1

for i in n_epochs:
    for j in reg_u:
        for k in reg_i:
            print("================================================")
            bsl_options = {'method': 'als',
                           'n_epochs': i,
                           'reg_u': j,
                           'reg_i': k
                           }

            algo = BaselineOnly(bsl_options=bsl_options)

            algo.train(trainset)
            print("This is the #" + str(count) + " parameter combination")
            predictions = algo.test(testset)

            print("n_epochs="+str(i)+", "+"reg_u="+str(j)+", "+"reg_i="+str(k))
            accuracy.rmse(predictions, verbose=True)
            accuracy.fcp(predictions, verbose=True)
            accuracy.mae(predictions, verbose=True)
            count += 1


## baseline model using SGD
n_epochs = [5, 10]
reg = [0.2, 0.02]  # regularization term, must be > 0; default = 0.02
learning_rate = [0.05, 0.005]  # between 0 and 1; default = 0.005
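# A minimal sketch (not from the original) of how these SGD parameters might be
# searched, mirroring the ALS loop above; assumes `trainset` and `testset` are
# still available from earlier in the script.
for i in n_epochs:
    for j in reg:
        for k in learning_rate:
            bsl_options = {'method': 'sgd',
                           'n_epochs': i,
                           'reg': j,
                           'learning_rate': k}
            algo = BaselineOnly(bsl_options=bsl_options)
            algo.train(trainset)
            accuracy.rmse(algo.test(testset), verbose=True)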
Example No. 11
pred

print("Predicted Rating:")
pred[3]


# print('Using ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
algo_2 = BaselineOnly(bsl_options=bsl_options)

trainset = data.build_full_trainset()
algo_2.train(trainset)

pred = algo_2.predict('374', '500')

print("Prediction Object:")
pred

print("Predicted Rating:")
pred[3]

# Predicting all missing entries
# First, let's start by visualising our matrix of all observed entries.
# This matrix is quite sparse.
import numpy as np

n_users = trainset.n_users
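# A hedged sketch (not from the original) of one way to materialise the matrix
# of observed ratings from the trainset for visualisation:
n_items = trainset.n_items
ratings_matrix = np.zeros((n_users, n_items))
for uid, iid, rating in trainset.all_ratings():
    ratings_matrix[uid, iid] = rating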