Example #1
def surprise_baseline(trainset, finalset):
    "baseline model"

    algo = spr.BaselineOnly()

    algo.fit(trainset)
    predictions_final = algo.test(finalset)

    return spr_estimate_to_vect(predictions_final)
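
A minimal usage sketch, assuming spr aliases the surprise module and spr_estimate_to_vect extracts the .est fields (both come from the surrounding repo):

from surprise import Dataset
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')  # any Surprise dataset works here
trainset, finalset = train_test_split(data, test_size=0.2)
preds = surprise_baseline(trainset, finalset)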
Example #2
def baseline(train, test):
    """
    Run Baseline model from Surprise library.
    @param train: the training set in the Surprise format.
    @param test: the test set in the Surprise format.
    @return: the predictions in a numpy array.
    """
    algo = spr.BaselineOnly()
    algo.fit(train)
    predictions = algo.test(test)
    return get_predictions(predictions)
Example #3
def main(args):

    user_item_based = 'item_based' if args.item_based else 'user_based'
    filename = '_'.join([
        args.exp_name, args.algorithm, args.sim_name, user_item_based,
        str(args.num_rows)
    ]) + '.pkl'

    output_file = Path(filename)
    if output_file.exists():
        print(f'ERROR! Output file {output_file} already exists. Exiting!')
        sys.exit(1)

    print(f'Saving scores in {output_file}\n')

    reader = surprise.Reader(rating_scale=(1, 5))
    df = pq.read_table('all_ratings_with_indices.parquet',
                       columns=['user_idx', 'movie_idx',
                                'rating']).to_pandas()
    df.user_idx = df.user_idx.astype(np.uint32)
    df.movie_idx = df.movie_idx.astype(np.uint16)
    df.rating = df.rating.astype(np.uint8)
    print(df.dtypes)
    data = surprise.Dataset.load_from_df(df[:args.num_rows], reader=reader)
    del df
    sim_options = {
        'name': args.sim_name,
        'user_based': not args.item_based
    }

    if args.algorithm == 'knn':
        algo = surprise.KNNBasic(sim_options=sim_options)
    elif args.algorithm == 'baseline':
        algo = surprise.BaselineOnly()
    elif args.algorithm == 'normal':
        algo = surprise.NormalPredictor()
    elif args.algorithm == 'knn_zscore':
        algo = surprise.KNNWithZScore(sim_options=sim_options)
    elif args.algorithm == 'svd':
        algo = surprise.SVD()
    elif args.algorithm == 'nmf':
        algo = surprise.NMF()
    else:
        print(f'Algorithm {args.algorithm} is not a valid choice.')
        sys.exit(1)  # avoid falling through with an undefined algo

    scores = surprise.model_selection.cross_validate(algo,
                                                     data,
                                                     cv=args.cv_folds,
                                                     verbose=True,
                                                     n_jobs=-1)

    with open(output_file, 'wb') as f:
        pickle.dump(scores, f)
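
cross_validate returns a dict of per-fold arrays; a minimal sketch of loading the pickled scores back (the filename below is hypothetical, whatever main reported as its output file):

import pickle

with open('exp_baseline_msd_user_based_1000000.pkl', 'rb') as f:
    scores = pickle.load(f)
print(scores['test_rmse'].mean())  # mean RMSE across the folds
print(scores['fit_time'])          # per-fold fit times, in seconds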
Example #4
def get_surprise_model(train_df):
    reader = surprise.Reader(rating_scale=(train_df["rating"].min(),
                                           train_df["rating"].max()))
    surprise_inp = surprise.Dataset.load_from_df(train_df,
                                                 reader).build_full_trainset()
    model = surprise.BaselineOnly(bsl_options={
        'method': 'als',
        'n_epochs': 50,
        'reg_u': 12,
        'reg_i': 5
    })
    model.fit(surprise_inp)
    return model
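
A quick usage sketch: load_from_df expects the dataframe's first three columns to be user, item and rating, and predict takes raw ids (the ids below are hypothetical):

model = get_surprise_model(train_df)
pred = model.predict('user_42', 'item_7')
print(pred.est)  # estimated rating for that user/item pair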
Example #5
def algo_tester(data_object):
    '''
    Produce a dataframe comparing the RMSEs and the fit/test times of the
    different Surprise algorithms.

    ---Parameters---
    data_object: a Surprise Dataset, e.g. created by the read_data_surprise function

    ---Returns---
    a dataframe comparing the performance of the different algorithms
    '''
    benchmark = []
    algos = [
        sp.SVDpp(),
        sp.SVD(),
        sp.SlopeOne(),
        sp.NMF(),
        sp.NormalPredictor(),
        sp.KNNBaseline(),
        sp.KNNBasic(),
        sp.KNNWithMeans(),
        sp.KNNWithZScore(),
        sp.BaselineOnly(),
        sp.CoClustering()
    ]

    # Iterate over all algorithms
    for algorithm in algos:
        # Perform cross validation
        results = cross_validate(algorithm,
                                 data_object,
                                 measures=['RMSE'],
                                 cv=3,
                                 verbose=False)

        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        # Series.append was removed in pandas 2.0; use pd.concat instead
        tmp = pd.concat(
            [tmp, pd.Series([algorithm.__class__.__name__],
                            index=['Algorithm'])])
        benchmark.append(tmp)

    benchmark = pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
        'test_rmse')
    return benchmark
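
A usage sketch, assuming sp aliases surprise, cross_validate comes from surprise.model_selection, and ratings_df is a hypothetical dataframe with user, item and rating columns:

import surprise as sp
from surprise.model_selection import cross_validate

reader = sp.Reader(rating_scale=(1, 5))
data_object = sp.Dataset.load_from_df(
    ratings_df[['user', 'item', 'rating']], reader)
print(algo_tester(data_object))  # RMSE leaderboard, best algorithm first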
Example #6
def dic_to_train(data):
    from surprise.model_selection import KFold

    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
    algo = surprise.BaselineOnly(bsl_options)

    ax1 = data.plot.scatter(x='store', y='user', s=1, c='score')
    print(ax1)

    df_to_dict = recur_dictify(data)

    store_list = []  # list that will hold the store ids
    user_set = set()  # set that will hold the user ids

    # iterate over every store
    for store_key in df_to_dict:
        store_list.append(store_key)

        for user_key in df_to_dict[store_key]:
            user_set.add(user_key)

    user_list = list(user_set)

    pd.to_pickle(
        pd.Series(user_list).to_frame(), "../data/Item_based_user_list.pkl")
    pd.to_pickle(
        pd.Series(store_list).to_frame(), "../data/Item_based_store_list.pkl")

    rating_dic = {"store_id": [], "user_id": [], "score": []}

    # iterate over every store
    for store_key in df_to_dict:
        for name_key in df_to_dict[store_key]:
            a1 = store_list.index(store_key)
            a2 = user_list.index(name_key)
            a3 = df_to_dict[store_key][name_key]

            rating_dic["store_id"].append(a1)
            rating_dic["user_id"].append(a2)
            rating_dic["score"].append(a3)

        df = pd.DataFrame(rating_dic)
    return df.sort_values(by=['store_id'])
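
The BaselineOnly instance created at the top of dic_to_train is never fitted inside the function; presumably the returned dataframe is later fed back into Surprise. A sketch of that step, assuming the scores are on a 0-5 scale:

reader = surprise.Reader(rating_scale=(0, 5))
data = surprise.Dataset.load_from_df(
    df[['user_id', 'store_id', 'score']], reader)  # user, item, rating order
algo.fit(data.build_full_trainset())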
Example #7
    def model_fit(self):
        '''
        Train the model using the algorithm selected in self._algo_choise.
        '''
        self.build_trainset()
        algo = self._algo_choise
        if algo == 'SVD':
            self.algorithm = surprise.SVD()
        elif algo == 'Baseline':
            self.algorithm = surprise.BaselineOnly()
        elif algo == 'SlopeOne':
            self.algorithm = surprise.SlopeOne()
        elif algo == 'CoClustering':
            self.algorithm = surprise.CoClustering()
        else:
            self.algorithm = surprise.KNNBasic()

        print('Training Recommender System using %s...' % algo)

        self.algorithm.fit(self.trainset)
        self.ratings_changed = False
        print('Done')
Example #8
          epochs=2,
          validation_split=0.1,
          shuffle=True)

y_pred = model.predict([df_hybrid_test['User'], df_hybrid_test['Movie'], test_tfidf])
y_true = df_hybrid_test['Rating'].values

rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Hybrid Deep Learning: {:.4f} RMSE'.format(rmse))

# Load dataset into surprise specific data-structure
data = sp.Dataset.load_from_df(df_filterd[['User', 'Movie', 'Rating']].sample(20000), sp.Reader())

benchmark = []
# Iterate over all algorithms
for algorithm in [sp.SVD(), sp.SVDpp(), sp.SlopeOne(), sp.NMF(), sp.NormalPredictor(), sp.KNNBaseline(), sp.KNNBasic(), sp.KNNWithMeans(), sp.KNNWithZScore(), sp.BaselineOnly(), sp.CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    # Series.append was removed in pandas 2.0; use pd.concat instead
    tmp = pd.concat([tmp, pd.Series([algorithm.__class__.__name__],
                                    index=['Algorithm'])])
    
    # Store data
    benchmark.append(tmp)
    
# Collect the per-algorithm results, sorted by test RMSE (descending)
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse', ascending=False)

# Get data
data = surprise_results[['test_rmse', 'test_mae']]
Example #9
sgd_gs = sp.model_selection.GridSearchCV(sp.BaselineOnly,
                                         sgd_param_grid,
                                         measures=['rmse'],
                                         cv=3,
                                         joblib_verbose=0)

reader = sp.Reader(rating_scale=(0, 10))
data = sp.Dataset.load_from_df(
    ScoresDFHotStart[['username', 'anime_id', 'my_score']], reader)
als_gs.fit(data)  # als_gs: an ALS GridSearchCV defined earlier (not shown)

sgd_gs.fit(data)

trainset = data.build_full_trainset()
algo = sp.BaselineOnly()
algo.fit(trainset)
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

last_predictions = pd.DataFrame(
    predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
last_predictions.drop('rui', inplace=True, axis=1)
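
The usual next step is to keep the top-N estimates per user; a minimal sketch using the last_predictions dataframe built above:

top_n = (last_predictions.sort_values('est', ascending=False)
         .groupby('uid').head(10))  # the 10 highest estimates per user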

sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo_items = sp.KNNBaseline(sim_options=sim_options)
algo_items.fit(trainset)
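
The truncated helper below presumably maps a raw anime id to Surprise's inner id and asks the item-based KNN for its nearest neighbours; a minimal sketch of that pattern (raw_id is a hypothetical item id from the ratings):

inner_id = trainset.to_inner_iid(raw_id)
neighbor_inner_ids = algo_items.get_neighbors(inner_id, k=10)
neighbor_raw_ids = [trainset.to_raw_iid(i) for i in neighbor_inner_ids]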


def get_item_recommendations(anime_title, anime_id=100000, k=10):
    if anime_id == 100000:
Example #10
reader = surprise.Reader(rating_scale=(1, 5))
rating_data = surprise.Dataset.load_from_df(rating_df, reader=reader)
svd = surprise.SVD()
svd_temp = surprise.model_selection.cross_validate(svd,
                                                   rating_data,
                                                   measures=['RMSE', 'MAE'],
                                                   cv=5,
                                                   verbose=True)
print('SVD--------------')
print(svd_temp)
normalPredictor = surprise.NormalPredictor()
normalPredictor_temp = surprise.model_selection.cross_validate(
    normalPredictor, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('normalPredictor--------------')
print(normalPredictor_temp)
baselineOnly = surprise.BaselineOnly()
baselineOnly_temp = surprise.model_selection.cross_validate(
    baselineOnly, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('baselineOnly-----------------')
print(baselineOnly_temp)
knnBasic = surprise.KNNBasic()
knnBasic_temp = surprise.model_selection.cross_validate(
    knnBasic, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnBasic-----------------')
print(knnBasic_temp)
knnWithMeans = surprise.KNNWithMeans()
knnWithMeans_temp = surprise.model_selection.cross_validate(
    knnWithMeans, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnWithMeans-----------------')
print(knnWithMeans_temp)
knnBaseline = surprise.KNNBaseline()
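
The snippet is cut off here; following the pattern of the blocks above, the evaluation for knnBaseline would presumably continue as:

knnBaseline_temp = surprise.model_selection.cross_validate(
    knnBaseline, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnBaseline-----------------')
print(knnBaseline_temp)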
Example #11
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import surprise
from surprise import accuracy, dump
from surprise.model_selection import KFold, cross_validate

data = surprise.Dataset.load_builtin('ml-100k')
df = pd.DataFrame(data.raw_ratings, columns=["user", "item", "rate", "id"])
del df["id"]
df_table = df.set_index(["user", "item"]).unstack()
# the user-item matrix is sparse; fill the missing ratings with 0 so that
# np.linalg.svd gets a finite matrix to factorize
data_array = df_table.fillna(0).values

matA = np.array(data_array).astype(np.float64)
print(data_array)
# full_matrices=False keeps U, s and V shape-compatible with each other
U, s, V = np.linalg.svd(matA, full_matrices=False)

print(U, np.diag(s) @ V)  # U times (diag(s) @ V) reconstructs matA

bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
algo = surprise.BaselineOnly(bsl_options)

np.random.seed(0)
acc = np.zeros(3)
cv = KFold(3)
for i, (trainset, testset) in enumerate(cv.split(data)):
    algo.fit(trainset)
    predictions = algo.test(testset)
    acc[i] = surprise.accuracy.rmse(predictions, verbose=True)
print(acc.mean())  # mean RMSE over the three folds
Example #12
def main(train_df, target_df, cache_name="test", force_recompute=[]):
    """Train multiple models on train_df and predicts target_df

    Predictions are cached. If the indices don't match the indices of
    target_df, the cache is discarded.

    By default, if a method was already computed it is not recomputed again
    (except if the method name is listed in force_recompute). cache_name
    is the name to use to read and write the cache.

    Arguments:
        train_df {dataframe} -- Training dataframe
        target_df {dataframe} -- Testing dataframe

    Keyword Arguments:
        cache_name {str} -- Name to use for caching (default: {"test"})
        force_recompute {list} -- Name(s) of methods to recompute, whether or
        not it was already computed. Useful to only recompute single methods
        without discarding the rest. (default: {[]})

    Returns:
        Dataframe -- Dataframe with the predictions of each method as columns,
        IDs as indices
    """
    global algo_in_use
    CACHED_DF_FILENAME = os.path.dirname(
        os.path.abspath(__file__)) +\
        "/cache/cached_predictions_{}.pkl".format(cache_name)
    train_df = preprocess_df(train_df)
    trainset = pandas_to_data(train_df)
    ids_to_predict = target_df["Id"].to_list()

    # try to retrieve backup dataframe
    try:
        print("Retrieving cached predictions")
        all_algos_preds_df = pd.read_pickle(CACHED_DF_FILENAME)
        print("Ensuring cached IDs match given IDs")
        assert sorted(ids_to_predict) == sorted(
            all_algos_preds_df.index.values)
        print("Indices match, continuing")
    except (FileNotFoundError, AssertionError):
        print("No valid cached predictions found")
        all_algos_preds_df = pd.DataFrame(ids_to_predict, columns=["Id"])
        all_algos_preds_df.set_index("Id", inplace=True)

    all_algos = {
        "SVD": spr.SVD(n_factors=200, n_epochs=100),
        "Baseline": spr.BaselineOnly(),
        "NMF": spr.NMF(n_factors=30, n_epochs=100),
        "Slope One": spr.SlopeOne(),
        "KNN Basic": spr.KNNBasic(k=60),
        "KNN Means": spr.KNNWithMeans(k=60),
        "KNN Baseline": spr.KNNBaseline(),
        "KNN Zscore": spr.KNNWithZScore(k=60),
        "SVD ++": spr.SVDpp(n_factors=40, n_epochs=100),
        "Co Clustering": spr.CoClustering()
    }

    for name in all_algos:
        print("##### {} ####".format(name))
        if name in force_recompute and name in all_algos_preds_df.columns:
            all_algos_preds_df.drop(name, axis=1, inplace=True)
        if name in all_algos_preds_df.columns:
            print("Already computed {}, skipping".format(name))
            continue
        algo = all_algos[name]
        time.sleep(1)
        algo.fit(trainset)
        time.sleep(1)
        algo_in_use = algo
        print("Generating predictions...")
        predictions = parallelize_predictions(ids_to_predict, 80)
        print("Done. Merging with previous results")
        this_algo_preds_df = pd.DataFrame(predictions, columns=["Id", name])
        this_algo_preds_df.set_index("Id", inplace=True)
        all_algos_preds_df = pd.merge(all_algos_preds_df,
                                      this_algo_preds_df,
                                      left_index=True,
                                      right_index=True)
        all_algos_preds_df.to_pickle(CACHED_DF_FILENAME)
    print("DONE computing surprize")
    return all_algos_preds_df
Example #13
import os

import pandas as pd
import surprise
from surprise import SVD, accuracy
from surprise.model_selection import KFold

# specify the directory containing the data files
root_dir = os.getcwd()
dfs_path = os.path.join(root_dir, 'Data/datasets/')
ratings_df = pd.read_csv(os.path.join(dfs_path, 'ratings_expl.csv'),
                         sep=';',
                         encoding='latin-1',
                         low_memory=False)
# load the data into Surprise
reader = surprise.Reader(rating_scale=(1, 10))
data = surprise.Dataset.load_from_df(ratings_df[['user_id', 'isbn', 'rating']],
                                     reader)
kf = KFold(n_splits=5)
# data = Dataset.load_builtin('jester')
# Kfold
algo1 = SVD()
algo2 = surprise.BaselineOnly()
algo3 = surprise.KNNBasic()
algo4 = surprise.CoClustering()
for trainset, testset in kf.split(data):
    # SVD
    algo1.fit(trainset)
    pSVD = algo1.test(testset)
    # compute and print the RMSE
    print("SVD-")
    accuracy.rmse(pSVD, verbose=True)
    #Baseline
    algo2.fit(trainset)
    pBase = algo2.test(testset)
    print("BaseLine-")
    accuracy.rmse(pBase)
    #Baseline
Example #14
    hit = 0  # number of recommended items that appear in the true top-n
    n_recall = 0
    n_precision = 0
    for uid in est_top_n.keys():
        hit += len(set(est_top_n[uid]) & set(true_top_n[uid]))
        n_precision += len(est_top_n[uid])
        n_recall += len(true_top_n[uid])
    return hit / n_precision, hit / n_recall
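
The fragment above assumes est_top_n and true_top_n map each user to a list of item ids. A minimal sketch of building such a dict from a list of Surprise predictions (true_top_n would be built the same way, sorting on the true ratings instead of the estimates):

from collections import defaultdict

def top_n_from_predictions(predictions, n=10):
    # keep the n items with the highest estimated rating per user
    per_user = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        per_user[uid].append((est, iid))
    return {uid: [iid for _, iid in sorted(items, reverse=True)[:n]]
            for uid, items in per_user.items()}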


# define reader and load data
file_path = os.path.expanduser('./rates.csv')
reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 5))
data = Dataset.load_from_file(file_path, reader=reader)
kf = KFold(n_splits=5, random_state=1)

algo = surprise.BaselineOnly()
# algo = surprise.KNNBasic()
# algo = surprise.KNNWithMeans()
# algo = surprise.KNNBaseline()
# algo = surprise.SVD()

precisions = []
recalls = []
rmse = []
mae = []
#
# # GridList = [(5, 5), (10, 10), (5, 10), (5, 20)]
# with open('result.txt', 'w') as file:
#     for est_n, true_n in GridList:
#         precisions = []
#         recalls = []
Example #15
data = sp.Dataset.load_from_file(file_name, reader=reader)
print("Done.")

# defining the number of folds = 5
print("Performing splits...")
kf = sp.model_selection.KFold(n_splits=5, random_state=0)
print("Done.")

###
### PART 1.1
###
'''
Application of all the recommendation algorithms made available by the
Surprise library, using their default configurations.
'''
algorithms = [sp.NormalPredictor(), sp.BaselineOnly(), sp.KNNBasic(),\
              sp.KNNWithMeans(), sp.KNNWithZScore(), sp.KNNBaseline(),\
              sp.SVD(), sp.SVDpp(), sp.NMF(), sp.SlopeOne(), sp.CoClustering()]
for elem in algorithms:
    start_time = time.time()
    algo = elem
    sp.model_selection.cross_validate(algo, data, measures=['RMSE'], \
                                      cv=kf, n_jobs = 2, verbose=True)
    print("--- %s seconds ---" % (time.time() - start_time))
    print()

###
### PART 1.2
###
'''
Improvement of the quality of both KNNBaseline and SVD methods, 
Example #16
                            'SVD', 'SVDpp', 'NMF', 'SlopeOne', 'CoClustering'
                        ])
    args = parser.parse_args()

    train_path = path + '/Data/train_format.txt'

    train_reader = Reader(line_format='user item rating timestamp',
                          sep=',',
                          rating_scale=(0, 5))
    trainset = Dataset.load_from_file(train_path, reader=train_reader)
    trainset = trainset.build_full_trainset()

    if args.model == 'NormalPredictor':
        model = surprise.NormalPredictor()
    elif args.model == 'BaselineOnly':
        model = surprise.BaselineOnly()
    elif args.model == 'KNNBasic':
        model = surprise.KNNBasic()
    elif args.model == 'KNNWithMeans':
        model = surprise.KNNWithMeans()
    elif args.model == 'KNNWithZScore':
        model = surprise.KNNWithZScore()
    elif args.model == 'KNNBaseline':
        model = surprise.KNNBaseline()
    elif args.model == 'SVD':
        model = surprise.SVD()
    elif args.model == 'SVDpp':
        model = surprise.SVDpp(verbose=True)
    elif args.model == 'NMF':
        model = surprise.NMF()
    elif args.model == 'SlopeOne':