Example #1
def nmf(data, training, testing):
    '''
    Tunes NMF parameters, then calculates the RMSE and top-n predictions of NMF

    Args:
        data(Dataset): the whole dataset divided into 5 folds
        training(Dataset): training dataset
        testing(Dataset): test dataset

    Returns:
        rmse: RMSE of NMF with optimized parameters
        top_n: top-n predictions for each user (used to measure coverage)
    '''

    # candidate parameters
    nmf_param_grid = {'n_factors': [45, 50, 55, 60], 'n_epochs': [45, 50, 55]}

    # optimize parameters
    grid_search = GridSearch(NMF, nmf_param_grid, measures=['RMSE'], verbose=False)
    grid_search.evaluate(data)
    param = grid_search.best_params['RMSE']
    print('NMF:', param)

    # fit model using the optimized parameters
    nmf = NMF(n_factors=param['n_factors'], n_epochs=param['n_epochs'])
    nmf.train(training)

    # evaluate the model using test data
    predictions = nmf.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)
    return rmse, top_n
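This snippet relies on a get_top_n helper that is defined elsewhere in the project; a minimal sketch along the lines of the well-known Surprise top-N recipe (an assumption, not necessarily the author's exact helper):

from collections import defaultdict

def get_top_n(predictions, n=5):
    # Map each user id to their n highest-estimated (item id, rating) pairs.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n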
Example #2
def nmf_running_time(data):
    '''
        Calculates the running times for training and predictions for NMF

        Args:
            data(list of Dataset): a list of datasets with different numbers of users

        Returns:
            elapsedtime_NMFtrain: running time for training
            elapsedtime_NMFtest: running time for predictions on testset
    '''
    elapsedtime_NMFtrain = []
    elapsedtime_NMFtest = []

    # tune the parameters on one of the datasets (data[3])
    param_grid = {'n_factors': [45, 50, 55, 60], 'n_epochs': [45, 50, 55]}
    grid_search = GridSearch(NMF, param_grid, measures=['RMSE'], verbose=False)
    grid_search.evaluate(data[3])
    param = grid_search.best_params['RMSE']
    n_factors = param['n_factors']
    n_epochs = param['n_epochs']

    # use the tuned parameters to measure running times
    for i in range(len(data)):
        # build the trainset/anti-testset outside the timed region so the
        # training time reflects model fitting only
        training = data[i].build_full_trainset()
        testing = training.build_anti_testset()

        # training running time
        training_start = time.time()
        nmf = NMF(n_factors=n_factors, n_epochs=n_epochs)
        nmf.train(training)
        elapsedtime_NMFtrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        nmf.test(testing)
        elapsedtime_NMFtest.append(time.time() - test_start)
    return elapsedtime_NMFtrain, elapsedtime_NMFtest
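A hypothetical call, assuming datasets is a list of Surprise Dataset objects of increasing size (the names below are made up for illustration):

import matplotlib.pyplot as plt

train_times, test_times = nmf_running_time(datasets)
plt.plot(train_times, marker='o', label='train')
plt.plot(test_times, marker='s', label='predict')
plt.xlabel('dataset index')
plt.ylabel('seconds')
plt.legend()
plt.show()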
Example #3
def compute_recommendations(user_id, prediction_table, numeric_prediction_table):
    algo = 'NMF'
    algorithm = NMF()

    # add_pageview(user_id=user_id, item_id=None, page="Model Predictions", activity_type="Initialize Predictions - " + algo, rating=None)  # pageview

    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(sessionmaker(bind=engine,
                                          autocommit=False,
                                          autoflush=False))

    # read the ratings stored in the database
    df_ratings = pd.read_sql('SELECT * FROM ratings;', con=engine)
    df_ratings = df_ratings[['user_id', 'item_id', 'rating']]
    df_ratings = df_ratings.dropna()
    df_ratings = df_ratings.drop_duplicates()

    # merge in the ratings shipped as a CSV file
    df_ratings2 = pd.read_csv('data/ratings.csv', low_memory=False)
    df_ratings2 = df_ratings2.rename(columns={'movie_id': 'item_id'})
    df_ratings2 = df_ratings2[['user_id', 'item_id', 'rating']]
    df_ratings2 = df_ratings2.dropna()
    df_ratings2 = df_ratings2.drop_duplicates()
    df_ratings = pd.concat([df_ratings, df_ratings2], axis=0)

    # format the dataset with the surprise library and fit the model
    reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 10))
    data = Dataset.load_from_df(df_ratings, reader=reader)
    trainset = data.build_full_trainset()

    # algorithm = eval(algo + "()")  # set the algorithm
    algorithm.train(trainset)

    # predict a rating for every item the user has not rated yet
    items = pd.read_sql('SELECT distinct id FROM items;', con=engine)
    df_user_items = df_ratings.loc[df_ratings['user_id'] == user_id]
    total_items = items.id.unique()
    user_items = df_user_items.item_id.unique()
    prediction_items = [x for x in total_items if x not in user_items]

    predictions = pd.DataFrame(columns=['user_id', 'item_id', 'prediction'])
    predicted_ratings = []
    for item_id in prediction_items:
        # Prediction is a (uid, iid, r_ui, est, details) namedtuple;
        # index 3 is the estimated rating
        est = algorithm.predict(user_id, item_id)
        predicted_ratings.append(est[3])

    predictions['item_id'] = prediction_items
    predictions['user_id'] = user_id  # broadcast the scalar to every row
    predictions['prediction'] = predicted_ratings

    # keep the ten highest predictions (assumes at least ten candidate items)
    predictions = predictions.sort_values('prediction', ascending=False)
    test_prediction = predictions
    predictions = predictions.head(n=10)

    # one row per user: pred_1 .. pred_10 hold the top-10 item ids
    cols = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5',
            'pred_6', 'pred_7', 'pred_8', 'pred_9', 'pred_10']
    df_pred = predictions[['item_id']].T
    df_pred.columns = cols
    df_pred['id'] = user_id
    df_pred = df_pred[['id'] + cols]
    df_pred['id'] = df_pred['id'].astype(int)

    df_pred.to_sql(prediction_table, engine, if_exists='append', index=False)
    session.commit()

    # log the twenty highest numeric predictions
    df_num_ratings = test_prediction.head(n=20)
    df_num_ratings['algorithm'] = algo
    df_num_ratings.rename(columns={'prediction': 'predicted_rating'}, inplace=True)
    df_num_ratings.to_sql('numeric_predictions', engine, if_exists='append', index=False)
    session.commit()

    # one row per user: num_1 .. num_10 hold the top-10 predicted ratings
    predcols = ['num_1', 'num_2', 'num_3', 'num_4', 'num_5',
                'num_6', 'num_7', 'num_8', 'num_9', 'num_10']
    df_num_ratings_transpose = predictions[['prediction']].T
    df_num_ratings_transpose.columns = predcols
    df_num_ratings_transpose['id'] = user_id
    df_num_ratings_transpose = df_num_ratings_transpose[['id'] + predcols]
    df_num_ratings_transpose['id'] = df_num_ratings_transpose['id'].astype(int)

    df_num_ratings_transpose.to_sql(numeric_prediction_table, engine,
                                    if_exists='append', index=False)
    session.commit()
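A hypothetical invocation, assuming the ratings and items tables exist and the two destination tables are writable (the table names below are illustrative):

compute_recommendations(user_id=42,
                        prediction_table='predictions',
                        numeric_prediction_table='numeric_predictions')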
Example #4
best_item_est_oma = np.mean(bu) + bi + global_mean

# BaselineOnly takes its regularization terms via the bsl_options dict
algo_baseline = BaselineOnly(bsl_options={'reg_u': 0, 'reg_i': 0})
algo_baseline.train(data_full)
best_item_est = algo_baseline.trainset._global_mean + np.mean(
    algo_baseline.bu) + algo_baseline.bi

# note: despite the variable name, this fits NMF (not SVD)
algo_SVD = NMF(verbose=True,
               n_factors=5,
               n_epochs=50,
               reg_bu=0,
               reg_bi=0,
               reg_pu=0.1,
               reg_qi=0.1,
               biased=True)
algo_SVD.train(data_full)
best_item_est_svd = algo_SVD.trainset._global_mean + np.mean(
    algo_SVD.bu) + algo_SVD.bi

f, axarr = plt.subplots(nrows=2)
im1 = axarr[0].imshow(datamat_missing)
plt.colorbar(im1, ax=axarr[0])

mean_rating = np.nanmean(datamat_missing, axis=1)

axarr[1].plot(mean_rating, marker='s')
axarr[1].plot(best_item_est_oma, marker='o')
axarr[1].plot(best_item_est, marker='<')
axarr[1].plot(best_item_est_svd, marker='>')
f.show()
Example #5
trainset = rating_train2.build_full_trainset()
testset = rating_test2.build_full_trainset().build_testset()

#NMF Model

n_factors=[15] # where default = 15
n_epochs=[50] # where default = 50
reg_pu=[0.06] # where default = 0.06
reg_qi=[0.06] # where default = 0.06

count = 1

for i in n_factors:
    for j in n_epochs:
        for k in reg_pu:
            for m in reg_qi:
                start = dt.datetime.today()
                print("================================================")
                algo = NMF(n_factors=i, n_epochs=j, reg_pu=k, reg_qi=m, biased=True)

                algo.train(trainset)
                print("This is the #" + str(count) + " parameter combination")
                predictions = algo.test(testset)

                print("n_factors="+str(i)+", n_epochs="+str(j)+", reg_pu="+str(k)+", reg_qi="+str(m))
                accuracy.rmse(predictions, verbose=True)
                accuracy.fcp(predictions, verbose=True)
                accuracy.mae(predictions, verbose=True)
                count += 1
                end = dt.datetime.today()
                print("Runtime: "+str(end - start))
Example #6
    del dfTest['date']
    del dfTest['test_id']

    # Set the rating scale and create the data for Surprise to use
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        dfRatings[['user_id', 'business_id', 'rating']], reader)

    # Cross validation for tuning
    # Split in 5 folds
    data.split(5)

    # This part is to use all the data to train and get the output
    train_set = data.build_full_trainset()

    # Use NMF with surprise
    algo = NMF()
    algo.train(train_set)

    f = open('PMFOutput.csv', 'w')
    f.write("test_id,rating\n")
    for i in range(len(dfTest)):
        # r_ui=4 passes a dummy 'true' rating; it only feeds the error shown
        # by verbose=True and does not affect the estimate
        prediction = algo.predict(dfTest.at[i, 'user_id'],
                                  dfTest.at[i, 'business_id'],
                                  r_ui=4,
                                  verbose=True)
        predRating = prediction.est
        f.write(str(i) + "," + str(predRating) + '\n')

    f.close()
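The same output could be built in one pass with pandas instead of manual file writes; a minimal sketch assuming the dfTest and algo objects above:

preds = [algo.predict(dfTest.at[i, 'user_id'],
                      dfTest.at[i, 'business_id']).est
         for i in range(len(dfTest))]
pd.DataFrame({'test_id': range(len(preds)),
              'rating': preds}).to_csv('PMFOutput.csv', index=False)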
Example #7
def compute_recommendations():
    #connecting to the database
    # engine = create_engine("mysql://*****:*****@localhost/ratingsx?charset=utf8", echo=True)
    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(
        sessionmaker(bind=engine, autocommit=False, autoflush=False))

    blockPrint()

    #reading in the database
    df_ratings = pd.read_sql('SELECT * FROM ratings;', con=engine)
    df_ratings = df_ratings[['user_id', 'item_id', 'rating']]
    df_ratings = df_ratings.dropna()
    df_ratings = df_ratings.drop_duplicates()

    #formatting the dataset using the surprise library
    reader = Reader(line_format='user item rating',
                    sep=',',
                    rating_scale=(1, 5))
    data = Dataset.load_from_df(df_ratings, reader=reader)
    training_set = data.build_full_trainset()

    algorithm = NMF()  # use non-negative matrix factorization

    algorithm.train(training_set)  # fit the data to the model
    testing_set = training_set.build_anti_testset()
    predictions = algorithm.test(testing_set)  # make prediction

    # helper function for top-N predictions
    def get_top_n(predictions, n=10):
        '''Return the top-N recommendations for each user from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendations to output for each user.
                Default is 10.

        Returns:
            A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        '''
        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the n highest ones.
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]

        return top_n

    # get the top 10 predictions for every user
    top_n = get_top_n(predictions, n=10)

    # Print the recommended items for each user
    a = []
    for uid, user_ratings in top_n.items():
        a.append([uid, [iid for (iid, _) in user_ratings]])
    df_list_pred = pd.DataFrame.from_records(a, columns=['A', 'B'])

    df_user = pd.DataFrame(df_list_pred.A.values.tolist())
    df_pred = pd.DataFrame(df_list_pred.B.values.tolist())

    df_pred.columns = [
        'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6', 'pred_7',
        'pred_8', 'pred_9', 'pred_10'
    ]

    df_items = pd.read_sql('SELECT * FROM items;', con=engine)

    # df_pred = df_pred.applymap(lambda x: df_items.loc[x, 'title'])
    df_pred[['id']] = df_user
    df_pred = df_pred[[
        'id', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6',
        'pred_7', 'pred_8', 'pred_9', 'pred_10'
    ]]

    df_pred['id'] = df_pred['id'].astype(int)

    # Append recommendations
    df_pred.to_sql('recommendations', engine, if_exists='append',
                   index=False)
    session.commit()

    #logging the predictions
    df_log = df_pred
    df_log['algorithm'] = 'NMF'
    df_log = df_log.rename(columns={'id': 'user_id'})
    df_log = df_log[[
        'user_id', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6',
        'pred_7', 'pred_8', 'pred_9', 'pred_10', 'algorithm'
    ]]

    df_log.to_sql('predictionlogs', engine, if_exists='append',
                  index=False)  #if_exists='append'
    session.commit()

    global mae2
    global rmse2
    mae2 = accuracy.mae(predictions)
    rmse2 = accuracy.rmse(predictions)
    mae2 = float(mae2)
    rmse2 = float(rmse2)
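A hypothetical call; note that the function reports its accuracy metrics through the module-level globals mae2 and rmse2:

compute_recommendations()
print(mae2, rmse2)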
Example #8
class MovieRecommender:
    def __init__(self):
        self._knn = None
        self._nmf = None
        self._trainset = None
        self._predictions = None

        self.initialized = False

    def initialize(self, data_filepath):
        self._data = Dataset.load_from_file(data_filepath,
                                            reader=Reader('ml-100k'))
        self._trainset = self._data.build_full_trainset()

        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        self._knn = KNNBaseline(sim_options=sim_options)
        self._nmf = NMF()

        start_new_thread(self._train, ())  # start_new_thread requires an args tuple

    def get_similar_movies(self, movie_id, k=10):
        if not self.initialized:
            return []

        model = self._knn

        movie_inner_id = model.trainset.to_inner_iid(movie_id)
        similar_movie_inner_ids = model.get_neighbors(movie_inner_id, k=k)

        to_raw_iid = model.trainset.to_raw_iid
        similar_movie_ids = (to_raw_iid(inner_id)
                             for inner_id in similar_movie_inner_ids)

        movie_ids = [
            similar_movie_id.encode('ascii')
            for similar_movie_id in similar_movie_ids
        ]
        return movie_dataset.get_movies(movie_ids)

    def get_similar_movies_for_user(self, user_id, num_movies=10):
        if not self.initialized:
            return []

        user_id = str(user_id)
        user_predictions = [
            prediction for prediction in self._predictions
            if prediction[0] == user_id
        ]

        sorted_predictions = sorted(user_predictions,
                                    key=lambda x: x.est,
                                    reverse=True)
        top_n_predictions = sorted_predictions[:num_movies]

        similar_movie_ids = (prediction.iid
                             for prediction in top_n_predictions)

        movie_ids = [
            similar_movie_id.encode('ascii')
            for similar_movie_id in similar_movie_ids
        ]
        return movie_dataset.get_movies(movie_ids)

    def update_user_ratings(self, user_id, movie_id, rating):
        if not self.initialized:
            return

        rating = float(rating)
        user_id = str(user_id)
        movie_id = str(movie_id)

        # knows_user() and trainset.ur are keyed by *inner* ids, so convert
        # the raw ids before checking for a previous rating
        has_previous_rating = False
        try:
            inner_uid = self._trainset.to_inner_uid(user_id)
            inner_iid = self._trainset.to_inner_iid(movie_id)
            has_previous_rating = inner_iid in dict(self._trainset.ur[inner_uid])
        except ValueError:
            pass  # unknown user or item: no previous rating to replace

        new_rating = (user_id, movie_id, rating, time())
        if has_previous_rating:
            for i, raw in enumerate(self._data.raw_ratings):
                if raw[0] == user_id and raw[1] == movie_id:
                    self._data.raw_ratings[i] = new_rating
                    break
        else:
            self._data.raw_ratings.append(new_rating)

        self._trainset = self._data.build_full_trainset()
        self._train()

    def _train(self):
        self._nmf.train(self._trainset)
        self._knn.train(self._trainset)

        self._predictions = self._nmf.test(self._trainset.build_anti_testset())

        self.initialized = True
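A hypothetical usage sketch, assuming a MovieLens-100k ratings file and the movie_dataset module referenced above:

recommender = MovieRecommender()
recommender.initialize('./ml-100k/u.data')
# ...once the background training thread has set initialized = True:
print(recommender.get_similar_movies('50', k=5))
print(recommender.get_similar_movies_for_user(196))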
Example #9
from surprise import Reader, Dataset
from surprise import NMF, evaluate

# creating the format for the dataset when given the user, item, rating and timestamp
data_reader = Reader(line_format="user item rating timestamp", sep="\t")

# store the data in the specific format created above
# u.data is the file we want
data = Dataset.load_from_file("./ml-100k/u.data", reader=data_reader)

# will be splitting the data into 5 folds for cross validation
data.split(n_folds=5)

# for this project I will be using the NMF algorithm
algorithm = NMF()
evaluate(algorithm, data, measures=["RMSE", "MAE"])

# train the whole data set now
training_set = data.build_full_trainset()
algorithm.train(training_set)

# set the specific user and movie I want to predict
user_id = str(200)
item_id = str(222)
actual_rating = 5

# see how it works!
print(algorithm.predict(user_id, item_id, actual_rating))
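Note that evaluate() and data.split() belong to the legacy Surprise API and were removed in later releases; an equivalent sketch with the current API:

from surprise.model_selection import cross_validate

cross_validate(NMF(), data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
algorithm = NMF()
algorithm.fit(data.build_full_trainset())  # fit() replaced train()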
Example #10
grid_search = GridSearch(NMF, param_grid_NMF, measures=['RMSE'])

# Evaluate performances of our algorithm on the dataset.
grid_search.evaluate(data)

print('best RMSE: ' + str(grid_search.best_score['RMSE']))

# combination of parameters that gave the best RMSE score
print('best params: ' + str(grid_search.best_params['RMSE']))

params = grid_search.best_params['RMSE']
algo_NMF = NMF(verbose=True,
               n_factors=params['n_factors'],
               n_epochs=params['n_epochs'],
               biased=params['biased'])
algo_NMF.train(data_full)

#%% SVD
param_grid_SVD = {
    'n_factors': [5, 10, 20],
    'n_epochs': [70],
    'lr_all': [0.005, 0.003, 0.001],
    'reg_all': [0.005, 0.01, 0.02]
}

#grid_search = GridSearch(SVDpp, param_grid, measures=['RMSE', 'MAE'])
grid_search = GridSearch(SVD, param_grid_SVD, measures=['RMSE'])

# Evaluate performances of our algorithm on the dataset.
grid_search.evaluate(data)
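For current Surprise versions, the same search would use GridSearchCV instead of the legacy GridSearch (a sketch under that assumption; note the lowercase measure keys):

from surprise.model_selection import GridSearchCV

gs = GridSearchCV(SVD, param_grid_SVD, measures=['rmse'], cv=5)
gs.fit(data)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])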