Example #1
def making_model(id_purify_data, skin_type):
    evaluate_data = making_evaluate_data(id_purify_data, skin_type)
    evaluate_data = evaluate_data.build_full_trainset()
    svd = SVD()
    svd.fit(evaluate_data)
    return svd
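making_evaluate_data() is not included in the snippet above. A minimal sketch of what such a helper could look like, assuming the purified data is a pandas DataFrame of user/item/rating columns and that skin_type is used as a filter (the column names below are guesses, not from the original):

from surprise import Dataset, Reader

def making_evaluate_data(id_purify_data, skin_type):
    # keep only ratings from users with the requested skin type (assumed column name)
    filtered = id_purify_data[id_purify_data['skin_type'] == skin_type]
    # Surprise expects the columns in user / item / rating order
    reader = Reader(rating_scale=(1, 5))
    return Dataset.load_from_df(filtered[['user_id', 'item_id', 'rating']], reader)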
Example #2
    iter = 0
    for uid, iid, ratings in trainset.all_ratings():
        # print("is uid,iid int or not?", isinstance(uid, int))
        ruid = trainset.to_raw_uid(uid)
        riid = trainset.to_raw_iid(iid)
        # print("and raw ids are:",ruid,riid)
        testset[iter] = [ruid, riid, ratings]
        # print("testset element are:", testset[iter])
        iter += 1
    # Output testset to a csv file
    PM = pd.DataFrame(testset)
    PM.to_csv("TestSet.csv")

    # Initializing algorithm with predefined options
    # algo = NMF(biased = True)
    algo = SVD(biased=True)
    # algo = KNNBaseline()

    # Initializing sizes for Adaboost parameter matrices
    size_ui = (trainset.n_users + 1, trainset.n_items + 1)
    size_mui = (m, trainset.n_users + 1, trainset.n_items + 1)
    size_wmui = (m, WholeSet.n_users + 1, WholeSet.n_items + 1)

    # Initializing weight matrix
    W = np.ones(size_ui)
    # Initializing Adaboost Prediction matrix from ABtestset
    ABPredictM = np.zeros(size_wmui)
    # Initializing weight-update Prediction matrix from T_train
    PredictM = np.zeros(size_mui)
    # Initializing RMSE vector to store RMSE of ABtestset from each model in Adaboost iteration
    ABRMSE = np.zeros(m, dtype=float)
    data = ml.loadMovieLensLatestSmall()
    print("\nComputing movie popularity ranks so we can measure novelty later...")
    rankings = ml.getPopularityRanks()
    return (ml, data, rankings)

np.random.seed(0)
random.seed(0)

# Load up common data set for the recommender algorithms
(ml, evaluationData, rankings) = LoadMovieLensData()

# Construct an Evaluator to, you know, evaluate them
evaluator = Evaluator(evaluationData, rankings)

# SVD
SVD = SVD()
evaluator.AddAlgorithm(SVD, "SVD")

# SVD++
SVDPlusPlus = SVDpp()
evaluator.AddAlgorithm(SVDPlusPlus, "SVD++")

# Just make random recommendations
Random = NormalPredictor()
evaluator.AddAlgorithm(Random, "Random")

# Fight!
evaluator.Evaluate(False)

evaluator.SampleTopNRecs(ml)
Example #4
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


# First train an SVD algorithm on the movielens dataset.
data = Dataset.load_builtin('ml-1m')
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

movies = pd.read_csv('movies.csv', index_col='id')

rec = top_n['196'][0][0]
print('Top movie recommendation for user_id 196: {}'.format( \
      movies[movies.index==int(rec)]))
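The built-in 'ml-1m' download ships movies.dat (:: separated) rather than a movies.csv, so the title lookup above assumes a pre-converted file. A hedged alternative that builds the same lookup table straight from movies.dat (the path is an assumption):

movies = pd.read_csv('ml-1m/movies.dat', sep='::', engine='python',
                     encoding='latin-1', names=['id', 'title', 'genres'],
                     index_col='id')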
Example #5
param_grid = {'n_epochs': [20, 30], 'lr_all': [0.005, 0.010],
              'n_factors': [50, 100]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(evaluationData)

# best RMSE score
print("Best RMSE score attained: ", gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

# Construct an Evaluator to, you know, evaluate them
evaluator = Evaluator(evaluationData, rankings)

params = gs.best_params['rmse']
SVDtuned = SVD(n_epochs = params['n_epochs'], lr_all = params['lr_all'], n_factors = params['n_factors'])
evaluator.AddAlgorithm(SVDtuned, "SVD - Tuned")

SVDUntuned = SVD()
evaluator.AddAlgorithm(SVDUntuned, "SVD - Untuned")

# Just make random recommendations
Random = NormalPredictor()
evaluator.AddAlgorithm(Random, "Random")

# Fight!
evaluator.Evaluate(False)

evaluator.SampleTopNRecs(gb)
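Instead of rebuilding SVD by hand from gs.best_params, Surprise's GridSearchCV also keeps an already-configured (but unfitted) algorithm per measure in gs.best_estimator; a minimal alternative sketch:

SVDtuned = gs.best_estimator['rmse']
evaluator.AddAlgorithm(SVDtuned, "SVD - Tuned (best_estimator)")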
Example #6

plt.figure(figsize=[12, 10]).set_tight_layout(True)
trainset, testset = train_test_split(data, test_size=0.1)
algo = KNNWithMeans(k=30, sim_options={'name': 'pearson'})  # find in Q11
plot_ROC_of_algo(algo=algo,
                 curvelabel='K-NN',
                 color='darkorange',
                 trainset=trainset,
                 testset=testset)
algo = NMF(n_factors=20)
plot_ROC_of_algo(algo=algo,
                 curvelabel='NNMF',
                 color='cyan',
                 trainset=trainset,
                 testset=testset)
algo = SVD(n_factors=8, biased=True)
plot_ROC_of_algo(algo=algo,
                 curvelabel='MF with bias',
                 color='lime',
                 trainset=trainset,
                 testset=testset)

plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
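plot_ROC_of_algo() is defined elsewhere in that assignment. A minimal sketch of such a helper, assuming a true rating of 3 or above counts as relevant and the estimated rating is used as the score:

from sklearn.metrics import roc_curve, auc

def plot_ROC_of_algo(algo, curvelabel, color, trainset, testset, threshold=3):
    algo.fit(trainset)
    predictions = algo.test(testset)
    y_true = [int(p.r_ui >= threshold) for p in predictions]   # relevance labels
    y_score = [p.est for p in predictions]                      # predicted ratings as scores
    fpr, tpr, _ = roc_curve(y_true, y_score)
    plt.plot(fpr, tpr, color=color,
             label='%s (AUC = %0.3f)' % (curvelabel, auc(fpr, tpr)))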
Example #7
from surprise import Dataset, SVD, Reader
import pandas as pd

train_rating_df = pd.read_csv("train_rating.txt", header=0, index_col=0)
test = pd.read_csv('test_rating.txt', header=0, index_col=0)
test['dummy_rating'] = '-1'
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    train_rating_df[['user_id', 'business_id', 'rating']], reader)
trainset = data.build_full_trainset()
algo = SVD(lr_all=0.0035, reg_all=0.04, n_factors=200, lr_bu=0.01, lr_bi=0.01)
algo.fit(trainset)
testdata = Dataset.load_from_df(
    test[['user_id', 'business_id', 'dummy_rating']], reader)
predictions = algo.test(
    testdata.construct_testset(raw_testset=testdata.raw_ratings))
df = pd.DataFrame(predictions)
newdf = df['est']
newdf.rename('rating', inplace=True)
newdf.to_csv('submission.csv', header=True, index_label='test_id')
Example #8
                           inplace=True)

# In[39]:

predictions_df.groupby('userid').head(10).reset_index(drop=True)

# ## SVD Based Recommendation

# In[40]:

from surprise import SVD
from surprise import accuracy

# In[41]:

svd_model = SVD(n_factors=50, biased=False)
svd_model.fit(trainset)
test_pred_svd = svd_model.test(testset)

# ### RMSE for SVD

# In[42]:

accuracy.rmse(test_pred_svd)
accuracy.mae(test_pred_svd)

# In[43]:

test_pred_svd[20]

# ### Parameter tuning for SVD
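The tuning cell itself is cut off here. A minimal sketch of what it might contain, using Surprise's GridSearchCV (the grid values are placeholders, and `data` is assumed to be the Dataset that trainset/testset above were built from):

from surprise.model_selection import GridSearchCV

param_grid = {'n_factors': [20, 50, 100], 'reg_all': [0.02, 0.05]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
print(gs.best_score['rmse'], gs.best_params['rmse'])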
Example #9
b = datetime.now()
print("共", (b - a).seconds, "秒")


# transfer dataCombed into surpriseLib-SVD-fitting style
data = shuffle(dataCombed)
del dataCombed
data.to_csv("dataForDump/trainingData.data",
                    sep='\t', header=False, index=False)
reader = Reader(line_format='user item rating', sep='\t')
file_path = os.path.expanduser('dataForDump/trainingData.data')
dataForTraining=Dataset.load_from_file(file_path,reader=reader)
dataForTraining = dataForTraining.build_full_trainset()

# fitting...
algo = SVD(n_factors=30, n_epochs=30, lr_all=0.009, reg_all=0.08)
algo.fit(dataForTraining)

# Dump the SVD predictions for later usage
dump.dump("interDump/svd-predictions", predictions=None, algo=algo, verbose=False)
# prediction,algor = dump.load("svd-predictions")

# ----------------Insert supplierId-bidId-score into database----------------

# get ids of all suppliers who have at least one operation
supplierId = data[['sid']].copy()
supplierId.drop_duplicates(inplace=True)

a=datetime.now()
# supplierData_dict: map each supplier to its primary/secondary materials
supplierData_dict = {}
    dataframe["itemID"] = items
    dataframe["userID"] = users
    dataframe["ratings"] = ratings
    return dataframe

# =========================================================================

# #########################################################################
# Tests against Scikit-Surprise
# #########################################################################


reader = Reader(rating_scale=(0, 1))
algo = SVD(n_factors=K,
           n_epochs=100,
           biased=False,
           reg_all=0,
           lr_all=alph,
           verbose=False)

data = Dataset.load_from_df(mlong1, reader)
trainset = data.build_full_trainset()
algo.fit(trainset)
testset = trainset.build_anti_testset()
predictions = algo.test(testset)
dfpred1 = predictions_df(predictions)
df1 = pd.concat([mlong1, dfpred1])
df1 = pd.DataFrame(df1)
df1 = df1.pivot(index="userID", columns="itemID", values="ratings")
num1 = np.array(df1)

data = Dataset.load_from_df(mlong2, reader)
Example #11
alg1 = surprise.SVD()
alg2 = surprise.KNNBasic()
alg3 = surprise.NMF()

#cross_validate(alg1, data, measures=['RMSE', 'MAE', "MSE"], cv=5, verbose=True)
#cross_validate(alg2, data, measures=['RMSE', 'MAE', "MSE"], cv=5, verbose=True)
#cross_validate(alg3, data, measures=['RMSE', 'MAE', "MSE"], cv=5, verbose=True)

##############
# EVALUATION #
##############

benchmark = []
# Iterate over all algorithms; each CV round trains on k-1 folds and tests on the remaining fold
for algorithm in [SVD(), NMF(), KNNBasic()]:
    # Perform cross validation
    results = cross_validate(algorithm,
                             data,
                             measures=['RMSE', 'MAE', "MSE"],
                             cv=5,
                             verbose=False)

    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    # Series.append was removed in pandas 2.0; use pd.concat to attach the algorithm name
    tmp = pd.concat([
        tmp,
        pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                  index=['Algorithm'])
    ])
    benchmark.append(tmp)

pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
Example #12
    print(
        "\nComputing movie popularity ranks so we can measure novelty later..."
    )
    rankings = ml.get_popularity_ranks()

    print("\nComputing item similarities so we can measure diversity later...")
    full_trainset = data.build_full_trainset()
    options = {'name': 'pearson_baseline', 'user_based': False}
    knn_model = KNNBaseline(sim_options=options)
    knn_model.fit(full_trainset)

    print("\nBuilding recommendation model...")
    train, test = train_test_split(data, test_size=.25, random_state=1)

    svd_model = SVD(random_state=10)
    svd_model.fit(train)

    print("\nComputing recommendations...")
    predictions = svd_model.test(test)

    print("\nEvaluating accuracy of model...")
    print("RMSE: ", metrics.rmse(predictions))
    print("MAE: ", metrics.mae(predictions))

    print("\nEvaluating top-10 recommendations...")

    # Set aside one rating per user for testing
    LOOCV = LeaveOneOut(n_splits=1, random_state=1)

    for train, test in LOOCV.split(data):
def main():
    # Load dataset
    df = pd.read_csv('input/combined_data_1.txt',
                     names=['Cust-Id', 'Ratings'],
                     usecols=[0, 1],
                     header=None)
    df.index = np.arange(0, len(df))

    # df_nan returns df with rows index that contain nan values
    df_nan = pd.DataFrame(pd.isnull(df.Ratings))
    df_nan = df_nan[df_nan['Ratings'] == True]
    # When reset_index is used, the old index becomes values in a column while the new index is sequential
    df_nan = df_nan.reset_index()

    # Numpy array
    movie_np = []
    movie_id = 1
    for i, j in zip(df_nan['index'][1:],
                    df_nan['index'][:-1]):  # excludes 23057834 in df_na
        temp = np.full((1, i - j - 1), movie_id)
        # i-j-1 because you want to know the number of rows in between 0 and 548.
        # The number of rows between 0 and 548 correspond to the number of customer ratings for movie 1
        movie_np = np.append(movie_np, temp)
        movie_id += 1

    last_record = np.full((1, len(df) - df_nan.iloc[-1, 0] - 1), movie_id)
    # len(df) is the last customer rating for movie 4499 and df_nan.iloc[-1,0] is first row for customer ratings for 4499
    movie_np = np.append(movie_np, last_record)

    # Adjust dataframe with notnull and datatype
    df = df[pd.notnull(df['Ratings'])]
    df['Movie_Id'] = movie_np.astype(int)

    f = ['count', 'mean']

    # Benchmark movies
    df_movie_summary = df.groupby('Movie_Id')['Ratings'].agg(f)
    df_movie_summary.index = df_movie_summary.index.map(int)
    movie_benchmark = round(df_movie_summary['count'].quantile(0.7), 0)
    movie_list = df_movie_summary[
        df_movie_summary['count'] < movie_benchmark].index
    print(f'Movie minimum times of review: {movie_benchmark}')

    # Benchmark users
    df_customer_summary = df.groupby('Cust-Id')['Ratings'].agg(f)
    customer_benchmark = round(df_customer_summary['count'].quantile(0.7), 0)
    customer_list = df_customer_summary[
        df_customer_summary['count'] < customer_benchmark].index
    print(f'Customer minimum times of review: {customer_benchmark}')

    # Slice df with benchmarked customer_list and movie_list
    df = df[~df['Movie_Id'].isin(movie_list)]
    df = df[~df['Cust-Id'].isin(customer_list)]
    df = df.reset_index(drop=True)

    # Create pivot table
    # df_p = pd.pivot_table(df, values='Ratings', index='Cust-Id', columns='Movie_Id')

    # Load movie titles into dataframe
    df_title = pd.read_csv('input\\movie_titles.csv',
                           encoding="ISO-8859-1",
                           names=['Movie_Id', 'Year', 'Name'])
    df_title.set_index('Movie_Id', inplace=True)

    # Count which user rates the most movies
    # df_count = df_p.count(axis='columns')
    # df_count = df_count.sort_values(ascending=False)
    # print(df_count)

    # Top 100K rows for faster evaluating
    reader = Reader()
    data = Dataset.load_from_df(
        df[['Cust-Id', 'Movie_Id', 'Ratings']][:100000], reader)

    # Choose algorithm
    algorithm = SVD()

    # Evaluate chosen algorithm
    cross_validate(algorithm,
                   data,
                   measures=['RMSE', 'MAE'],
                   cv=3,
                   verbose=True)

    # Viewing 5-star rated movies by chosen user
    df_chosen_user = df[(df['Cust-Id'] == '785314') & (df['Ratings'] == 5)]
    df_chosen_user = df_chosen_user.join(df_title)

    # Drop all ready seen movies from possibilities
    chosen_user = df_title.copy()
    chosen_user = chosen_user.reset_index()
    chosen_user = chosen_user[~chosen_user['Movie_Id'].isin(movie_list)]
    cond = chosen_user['Movie_Id'].isin(df_chosen_user['Movie_Id'])
    chosen_user.drop(chosen_user[cond].index, inplace=True)

    # Load complete dataset
    data = Dataset.load_from_df(df[['Cust-Id', 'Movie_Id', 'Ratings']], reader)

    # Create trainset
    trainset = data.build_full_trainset()

    # Fit algorithm
    algorithm.fit(trainset)

    # Predict
    chosen_user['Estimate_Score'] = chosen_user['Movie_Id'].apply(
        lambda x: algorithm.predict('785314', x).est)  # raw user ids from the CSV are strings

    # Sort and clean prediction to print on console
    chosen_user = chosen_user.sort_values(['Estimate_Score'], ascending=False)
    chosen_user["Year"] = chosen_user["Year"].fillna(0.0).astype(int)
    print(chosen_user.head(n=10).to_string(index=False))

    # End timer
    print(f"Total prediction time {int(time.perf_counter())} seconds")

    # Print complete results to csv
    chosen_user.to_csv("output\\recommendation_results.csv", index=False)
Example #14
    def make_predictions(self):
        """
        Predict ratings of un-rated wines based on past ratings and SVD.

        Returns
        -------
        top_k_items : defaultdict
            Top k recommended wines.
        top_k_items_pd : DataFrame
            Top k recommended wines. Columns: Username, Wine, est.
        predictions : list of surprise.prediction_algorithms.predictions.Prediction objects
            All rating predictions for all users and all wines.

        """
        # Tuning
        # if tune, always compare tuned and un-tuned cross-validation results
        if self.tune:
            tuned_algo = self.hyper_tune()
        algo = SVD()

        # Cross-validation
        # cross-validate with n_splits folds.
        # 5 folds (default) corresponds to a 80/20 split
        kf = KFold(n_splits=self.n_splits)
        # initialize cross-validation measures
        measures = ['rmse', 'mae', 'preci.@k', 'recall@k']
        if self.tune:
            rmse_tuned_vals = []
            mae_tuned_vals = []
            precision_tuned_vals = []
            recall_tuned_vals = []
            train_time_tuned = []
            test_time_tuned = []
        rmse_vals = []
        mae_vals = []
        precision_vals = []
        recall_vals = []
        train_time = []
        test_time = []

        # perform cross-validation
        for trainset, testset in kf.split(self.data_ml):
            # train and test algorithm
            if self.tune:
                start_time = time.time()
                tuned_algo.fit(trainset)  # train
                train_time_tuned.append(time.time() - start_time)
                start_time = time.time()
                tuned_predictions = tuned_algo.test(testset)  # test
                test_time_tuned.append(time.time() - start_time)
            start_time = time.time()
            algo.fit(trainset)  # train
            train_time.append(time.time() - start_time)
            start_time = time.time()
            predictions = algo.test(testset)  # test
            test_time.append(time.time() - start_time)

            # compute metrics
            if self.tune:
                # get rmse and mae
                rmse_tuned_vals.append(
                    accuracy.rmse(tuned_predictions, verbose=False))
                mae_tuned_vals.append(
                    accuracy.mae(tuned_predictions, verbose=False))
                # get precision@k and recall@k
                tuned_precisions, tuned_recalls = self.precision_recall_at_k(
                    tuned_predictions, threshold=3.5)
                # average precision@k and recall@k over all users
                precision_tuned_vals.append(
                    sum(prec for prec in tuned_precisions.values()) /
                    len(tuned_precisions))
                recall_tuned_vals.append(
                    sum(rec for rec in tuned_recalls.values()) /
                    len(tuned_recalls))
            # get rmse and mae
            rmse_vals.append(accuracy.rmse(predictions, verbose=False))
            mae_vals.append(accuracy.mae(predictions, verbose=False))
            # get precision@k and recall@k
            precisions, recalls = self.precision_recall_at_k(predictions,
                                                             threshold=3.5)
            # average precision@k and recall@k over all users
            precision_vals.append(
                sum(prec for prec in precisions.values()) / len(precisions))
            recall_vals.append(
                sum(rec for rec in recalls.values()) / len(recalls))

        # print metrics
        # take advantage of surprise.model_selection.validation.print_summary
        # test metrics results must be in the form of a dict of lists
        if self.tune:
            test_measures_tuned_dict = {}
            test_measures_tuned_list = [
                rmse_tuned_vals, mae_tuned_vals, precision_tuned_vals,
                recall_tuned_vals
            ]
        test_measures_dict = {}
        test_measures_list = [rmse_vals, mae_vals, precision_vals, recall_vals]
        for i, m in enumerate(measures):
            if self.tune:
                test_measures_tuned_dict[m] = test_measures_tuned_list[i]
            test_measures_dict[m] = test_measures_list[i]

        # use surprise.model_selection.validation.print_summary to print summary of results
        if self.tune:
            print('Tuned Cross-Validation Results:')
            surprise.model_selection.validation.print_summary(
                tuned_algo, measures, test_measures_tuned_dict, None,
                train_time_tuned, test_time_tuned, self.n_splits)
        print('Un-tuned Cross-Validation Results:')
        surprise.model_selection.validation.print_summary(
            algo, measures, test_measures_dict, None, train_time, test_time,
            self.n_splits)

        # Make recommendations
        # only recommend using tuned OR un-tuned algorithm
        # train on the full data set
        full_trainset = self.data_ml.build_full_trainset()
        if self.tune:
            start_time = time.time()
            tuned_algo.fit(full_trainset)  # train
            train_time = time.time() - start_time
            print(
                "Took {} seconds for tuned full training.".format(train_time))
        else:
            start_time = time.time()
            algo.fit(full_trainset)  # train
            train_time = time.time() - start_time
            print("Took {} seconds for un-tuned full training.".format(
                train_time))

        # all user-item pairs with no rating in the trainset (don't recommend already rated wines)
        anti_testset = full_trainset.build_anti_testset()
        if self.tune:
            start_time = time.time()
            predictions = tuned_algo.test(anti_testset)  # predict
            test_time = time.time() - start_time
            print("Took {} seconds for tuned predictions.".format(test_time))
        else:
            start_time = time.time()
            predictions = algo.test(anti_testset)  # predict
            test_time = time.time() - start_time
            print(
                "Took {} seconds for un-tuned predictions.".format(test_time))

        # Get top-k predictions for all users
        top_k_items, top_k_items_pd = self.get_top_k(predictions)

        return top_k_items, top_k_items_pd, predictions
Example #15
def main(rec='SVD', threshold=4, topK=10):
    # First train an SVD algorithm on the movielens dataset.
    print("load data...")
    '''
    data = Dataset.load_builtin('ml-1m')
    # test set is made of 40% of the ratings.
    test_size = 0.4
    trainset, testset = train_test_split(data, test_size=test_size)
    '''

    # path to dataset file
    test_data_path = r'C:\Users\abc\.surprise_data\ml-100k\ml-100k\u.data'  # not yet sure what this path is for
    file_path = os.path.expanduser(
        r'C:\Users\abc\.surprise_data\ml-100k\ml-100k\u.data')
    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)
    trainset = data.build_full_trainset()

    test_user, test_item, test_rate = read_data(test_data_path)  # split into three arrays
    #print("test size %.1f..." % test_size)
    print("training...")

    sim_options = {
        'name': 'cosine',
        'user_based': False  # compute item-item similarity
    }
    # choose the algorithm
    if rec == 'NMF':
        algo = NMF()
    elif rec == 'SVD':
        algo = SVD()
        name = ['SVD']
    else:
        algo = KNNBaseline(sim_options=sim_options)
        name = ['ItemKNN']

    train_start = time.time()
    algo.fit(trainset)
    train_end = time.time()
    print('train time:%.1f s' % (train_end - train_start))

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    # fill the missing entries: predict values for pairs absent from the trainset
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    test_end = time.time()
    print('test time:%.1f s' % (test_end - train_end))
    # top_n_est is a list of (item id, predicted rating) tuples
    top_n_est, true_ratings = get_top_n(predictions, n=10, threshold=threshold)
    # model evaluation
    f1, map, mrr, mndcg = evaluate_model_new(algo, test_user, test_item,
                                             test_rate, topK)
    eval_end = time.time()
    print('evaluate time:%.1f s' % (eval_end - test_end))
    print("algorithm : %s" % rec)
    print(
        'recommendation metrics: F1 : %0.4f, NDCG : %0.4f, MAP : %0.4f, MRR : %0.4f'
        % (f1, mndcg, map, mrr))
    print('%d users' % algo.pu.shape[0])
    print('%d items' % algo.qi.shape[0])
    return top_n_est
  qualified = qualified.sort_values('wr', ascending = False).head(10)
  return qualified

improved_recommendations('The Dark Knight')

#---------------------------------------------- Collborative Filtering Based Recommender ----------------------------------------------

reader = Reader()

ratings = pd.read_csv('ratings_small.csv')
ratings.head()

data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()
# the old data.split()/evaluate() API was removed from Surprise; cross-validate instead
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5)

trainset = data.build_full_trainset()
svd.fit(trainset)

ratings[ratings['userId'] == 1]

svd.predict(1, 302, 3)

#---------------------------------------------- Hybrid Recommender ----------------------------------------------

def convert_int(x):
  try:
    return int(x)
  except:
random.seed(0)


data = GetBookData(density_filter = False)
trainset, testset = train_test_split(data, test_size=0.25)


##Tuning Parameters
param_grid = {'n_epochs': [30, 30], 
            'lr_all': [0.001, 0.15],
              'reg_all':[0.01,0.1],
              'n_factors': [10, 200]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)
gs.fit(data)
params = gs.best_params['rmse']
SVD_TUNED = SVD(n_epochs=params['n_epochs'], lr_all=params['lr_all'], reg_all=params['reg_all'], n_factors=params['n_factors'])
SVD_TUNED.fit(trainset)
gs_predictions = SVD_TUNED.test(testset)
rmse = accuracy.rmse(gs_predictions)

precisions, recalls = precision_recall_at_k(gs_predictions, k = 10, threshold = 4.9)
avg_precision = sum(prec for prec in precisions.values()) / len(precisions)
avg_recall= sum(rec for rec in recalls.values()) / len(recalls)


metrics = {'rmse': rmse, 
               'avg_precision': avg_precision, 
               'avg_recall': avg_recall,
               'best_parameters': params}
results['SVD'] = metrics
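precision_recall_at_k() is not shown in this snippet. A minimal sketch following the definition from the Surprise FAQ (an item counts as relevant if its true rating is at or above the threshold, and as recommended if its estimated rating is at or above the threshold within the top k):

from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    # group (estimated, true) rating pairs by user
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum(true_r >= threshold for _, true_r in user_ratings)
        n_rec_k = sum(est >= threshold for est, _ in user_ratings[:k])
        n_rel_and_rec_k = sum((true_r >= threshold) and (est >= threshold)
                              for est, true_r in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel else 0
    return precisions, recalls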
Example #18
def Cal_Svd(filepath, user_id):
    # 1. raw dataset
    rating = pd.read_csv(filepath)
    rating['userId'].value_counts()
    rating['placeId'].value_counts()

    # visited vs. not-visited places
    tab = pd.crosstab(rating['userId'], rating['placeId'])
    #print(tab)

    # rating
    # group the remaining ratings by the two grouping variables
    rating_g = rating.groupby(['userId', 'placeId'])
    rating_g.sum()
    tab = rating_g.sum().unstack()  # reshape into a matrix
    #print(tab)
    #print(tab.info())
    # places user 2 has not visited: 1, 15, 39, ...

    # 2. build the rating dataset
    reader = Reader(rating_scale=(1, 5))  # rating scale
    data = Dataset.load_from_df(df=rating, reader=reader)
    # the rating DataFrame is interpreted with the reader's 1-5 rating scale
    #print(data)

    # 3. train/test set
    train = data.build_full_trainset()  # training set
    test = train.build_testset()  # test set

    # 4. build the model
    #help(SVD)
    model = SVD(n_factors=100, n_epochs=20, random_state=123)
    model.fit(train)  # model 생성

    # 5. user_id input
    # user_id = 1  # target user for recommendations
    item_ids = range(0, 2106)  # placeId range
    actual_rating = 0  # rating

    predict_result = []

    for item_id in item_ids:
        if not actual_rating in tab:
            actual_rating = 0
            predict_result.append(
                model.predict(user_id, item_id, actual_rating))
    ddff = pd.DataFrame(predict_result)
    #print(ddff)

    # top 5 recommended places for the user
    result = ddff.sort_values(by='est', ascending=False)[:5]
    #print(result)
    results.append(result)


#
# if __name__ == '__main__':
#     Cal_Svd(filepath, user_id)
#     print(results[0])
#print(type(results[0]))     #dataframe
#print(results[0]['iid'])        # placeId
Example #19
    user = prediction[0]
    book = prediction[1]
    actual_rating = prediction[2]
    recc_rating = prediction[3]
    if actual_rating == 0:
        write_str = str(user) + "," + str(book) + "," + str(
            actual_rating) + "," + str(recc_rating) + "\n"
        pred_file.write(write_str)
pred_file.close()
print("done")

# In[ ]:

from surprise import SVD

algo = SVD(n_factors=20, n_epochs=500, random_state=1)
trainSet = data.build_full_trainset()
algo.fit(trainSet)
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)

# In[ ]:

testset = trainSet.build_testset()
pred = algo.test(testset)
accuracy.rmse(pred, verbose=True), accuracy.mae(pred, verbose=True)

# In[ ]:

from surprise import KNNBasic

algo = KNNBasic()  # KNNBasic takes k / sim_options, not n_factors, n_epochs, or random_state
Example #20
def build_model():
    # Load movies data from ./ml-20m/
    movies = pd.read_csv('ml-20m/movies.csv')
    tags = pd.read_csv('ml-20m/tags.csv')
    ratings = pd.read_csv('ml-20m/ratings.csv')
    # limit ratings to users that have rated more than 55 movies;
    # this also limits the number of movies we keep, mainly because of my
    # laptop's limited power.
    ratings_f = ratings.groupby('userId').filter(lambda x: len(x) >= 55)
    movie_list_rating = ratings_f.movieId.unique().tolist()
    # filter the movies data frame
    movies = movies[movies.movieId.isin(movie_list_rating)]
    # map movie to id:
    Mapping_file = dict(zip(movies.title.tolist(), movies.movieId.tolist()))
    # remove unnecessary timestamps
    tags.drop(columns=['timestamp'], inplace=True)
    ratings_f.drop(columns=['timestamp'], inplace=True)
    # make a useful dataframe from tags and movies
    mixed = pd.merge(movies, tags, on='movieId', how='left')

    # create metadata from all tags and genres
    mixed.fillna("", inplace=True)
    mixed = pd.DataFrame(
        mixed.groupby('movieId')['tag'].apply(lambda x: "%s" % ' '.join(x)))
    Final = pd.merge(movies, mixed, on='movieId', how='left')
    Final['metadata'] = Final[['tag', 'genres']].apply(lambda x: ' '.join(x),
                                                       axis=1)

    # text transformation and truncated SVD to create a content latent matrix:
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(Final['metadata'])
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=Final.index.tolist())
    svd = TruncatedSVD(n_components=200)
    latent_matrix_1 = svd.fit_transform(tfidf_df)
    latent_matrix_1_df = pd.DataFrame(latent_matrix_1,
                                      index=Final.title.tolist())

    # text transformation and truncated SVD to create a collaborative
    # latent matrix:
    ratings_f1 = pd.merge(movies['movieId'],
                          ratings_f,
                          on="movieId",
                          how="right")
    ratings_f2 = ratings_f1.pivot(index='movieId',
                                  columns='userId',
                                  values='rating').fillna(0)
    svd = TruncatedSVD(n_components=200)
    latent_matrix_2 = svd.fit_transform(ratings_f2)
    latent_matrix_2_df = pd.DataFrame(latent_matrix_2,
                                      index=Final.title.tolist())

    # now a user collaborative-filtering model using Surprise
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(ratings_f1[['userId', 'movieId', 'rating']],
                                reader)
    trainset, testset = train_test_split(data, test_size=.25)
    algorithm = SVD()
    # Train the algorithm on the trainset, and predict ratings for the testset
    algorithm.fit(trainset)
    accuracy.rmse(algorithm.test(testset))

    # pickle all necessary files in ./Files/:
    ratings_f.to_pickle('./Files/rating.pkl')
    latent_matrix_1_df.to_pickle('./Files/latent_content.pkl')
    latent_matrix_2_df.to_pickle('./Files/latent_collaborative.pkl')
    with open('./Files/map.pkl', 'wb') as f:
        pickle.dump(Mapping_file, f, pickle.HIGHEST_PROTOCOL)
    with open('./Files/model_svd.pkl', 'wb') as f:
        pickle.dump(algorithm, f, pickle.HIGHEST_PROTOCOL)

    return
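Since build_model() pickles the fitted Surprise model to ./Files/model_svd.pkl, a usage sketch for loading it back and scoring a single (userId, movieId) pair could look like this (the ids are placeholders):

import pickle

with open('./Files/model_svd.pkl', 'rb') as f:
    svd_model = pickle.load(f)

print(svd_model.predict(1, 318).est)  # estimated rating for an unseen user/movie pair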
Example #21
    print(s)


#load data from a file
file_path = os.path.expanduser('restaurant_ratings.txt')
reader = Reader(line_format='user item rating timestamp',
                sep='\t',
                skip_lines=0)
data = Dataset.load_from_file(file_path, reader=reader)
from surprise.model_selection import KFold, cross_validate

#Splitting data into 3 folds
# (the old data.split()/evaluate()/print_perf() API was removed from Surprise)
kf = KFold(n_splits=3, shuffle=False)

#PMF Algorithm
algo = SVD(biased=False)

#Printing the result
perf = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=kf, verbose=True)

# def printPMF():
#     pt(perf)
# printPMF()
# # printPMF()
# os.chdir("C:/Users/Stark/Desktop/Programming/Everythin_else!/Work/Current/Recommender-System/Outputs/")
#
# with open('PMF.csv','w') as fo:
#     print_perf(perf,fo)
print(perf)  # per-fold RMSE/MAE results as a dict

#Visualization
Example #22
trainset = rating_train2.build_full_trainset()
testset = rating_test2.build_full_trainset().build_testset()

#SVD Model

n_factors=[100] # where default = 100
n_epochs=[5] # where default = 20
lr_all=[0.05, 0.005] # where default = 0.005
reg_all=[0.2, 0.02] # where default = 0.02

count=1

for i in n_factors:
    for j in n_epochs:
        for k in lr_all:
            for m in reg_all:
                start = dt.datetime.today()
                print("================================================")
                algo = SVD(n_factors=i, n_epochs=j, lr_all=k, reg_all=m)

                algo.fit(trainset)
                print("This is the #" + str(count) + " parameter combination")
                predictions=algo.test(testset)

                print("n_factors="+str(i)+", n_epochs="+str(j)+", lr_all="+str(k)+", reg_all="+str(m))
                accuracy.rmse(predictions, verbose=True)
                accuracy.fcp(predictions, verbose=True)
                accuracy.mae(predictions, verbose=True)
                count=count+1
                end = dt.datetime.today()
                print("Runtime: "+str(end - start))
Example #23
 def build_model(self, data):
     algo = SVD()
     cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5)
     return algo
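Note that cross_validate() only fits on individual folds, so the algo returned above would still normally be fit on the full trainset before serving predictions; a minimal follow-up sketch (the ids are placeholders):

algo.fit(data.build_full_trainset())
print(algo.predict('some_user', 'some_item').est)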
Example #24
model_user_results = cross_validate(model_user, data, measures=['RMSE'], cv=5, verbose=True)
print('\n\nModel training successful!')

# Create model object
model_item = KNNBasic(sim_options={'user_based': False})
print('Model creation successful!')

# Train on data using cross-validation with k=5 folds, measuring the RMSE
# Note, this may have a lot of print output
# You can set verbose=False to prevent this from happening
model_item_results = cross_validate(model_item, data, measures=['RMSE'], cv=5, verbose=True)
print('\n\nModel training successful!')


# Create model object
model_matrix = SVD()
print('Model creation successful!')


# Train on data using cross-validation with k=5 folds, measuring the RMSE
# Note, this may take some time (2-3 minutes) to train, so please be patient
model_matrix_results = cross_validate(model_matrix, data, measures=['RMSE'], cv=5, verbose=True)
print('\n\nModel training successful!')

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = dict()
    for uid, _, true_r, est, _ in predictions:
        current = user_est_true.get(uid, list())