def surprise_slopeOne(trainset, finalset):
    """Slope One model.

    Fit Surprise's SlopeOne on *trainset*, predict *finalset*, and return
    the predictions converted to a vector.
    """
    model = spr.SlopeOne()
    model.fit(trainset)
    return spr_estimate_to_vect(model.test(finalset))
def slopeOne(train, test):
    """
    Run the Slope One model from Surprise library.

    @param train: the training set in the Surprise format.
    @param test: the test set in the Surprise format.
    @return: the predictions in a numpy array.
    """
    slope_one = spr.SlopeOne()
    slope_one.fit(train)
    return get_predictions(slope_one.test(test))
def algo_tester(data_object):
    '''
    Produces a dataframe displaying all the different RMSE's, test & train
    times of the different surprise algorithms.

    ---Parameters---
    data_object(variable) created from the read_data_surprise function

    ---Returns---
    returns a dataframe where you can compare the performance of different
    algorithms, indexed by algorithm name and sorted by test RMSE (best first)
    '''
    benchmark = []
    algos = [
        sp.SVDpp(), sp.SVD(), sp.SlopeOne(), sp.NMF(), sp.NormalPredictor(),
        sp.KNNBaseline(), sp.KNNBasic(), sp.KNNWithMeans(),
        sp.KNNWithZScore(), sp.BaselineOnly(), sp.CoClustering()
    ]

    # Iterate over all algorithms
    for algorithm in algos:
        # Perform cross validation
        results = cross_validate(algorithm, data_object, measures=['RMSE'],
                                 cv=3, verbose=False)

        # Get mean results & append the algorithm's bare class name.
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        name = str(algorithm).split(' ')[0].split('.')[-1]
        # FIX: pd.Series.append was deprecated in pandas 1.4 and removed in
        # pandas 2.0 — pd.concat is the supported replacement.
        tmp = pd.concat([tmp, pd.Series([name], index=['Algorithm'])])
        benchmark.append(tmp)

    return pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
        'test_rmse')
def model_fit(self):
    '''
    Train the recommender using the Surprise algorithm named in
    self._algo_choise ('SVD', 'Baseline', 'SlopeOne', 'CoClustering';
    any other value falls back to KNNBasic).
    '''
    self.build_trainset()
    algo = self._algo_choise
    # Dispatch table replaces the original if/elif chain; unknown names
    # fall back to KNNBasic exactly as before.
    factories = {
        'SVD': surprise.SVD,
        'Baseline': surprise.BaselineOnly,
        'SlopeOne': surprise.SlopeOne,
        'CoClustering': surprise.CoClustering,
    }
    self.algorithm = factories.get(algo, surprise.KNNBasic)()
    print('Training Recommender System using %s...' % algo)
    self.algorithm.fit(self.trainset)
    self.ratings_changed = False
    print('Done')
# NOTE(review): this chunk begins inside a fit(...) call whose opening line
# is above this excerpt — these are its trailing keyword arguments.
epochs=2, validation_split=0.1, shuffle=True)

# Evaluate the Keras hybrid model on the held-out test set and report RMSE.
y_pred = model.predict([df_hybrid_test['User'], df_hybrid_test['Movie'], test_tfidf])
y_true = df_hybrid_test['Rating'].values

rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Hybrid Deep Learning: {:.4f} RMSE'.format(rmse))

# Load dataset into surprise specific data-structure (a 20k-row sample).
data = sp.Dataset.load_from_df(df_filterd[['User', 'Movie', 'Rating']].sample(20000),
                               sp.Reader())

benchmark = []
# Iterate over all algorithms
for algorithm in [sp.SVD(), sp.SVDpp(), sp.SlopeOne(), sp.NMF(),
                  sp.NormalPredictor(), sp.KNNBaseline(), sp.KNNBasic(),
                  sp.KNNWithMeans(), sp.KNNWithZScore(), sp.BaselineOnly(),
                  sp.CoClustering()]:
    # Perform cross validation (3-fold, RMSE and MAE).
    results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=3, verbose=False)

    # Get results & append algorithm name
    # NOTE(review): pd.Series.append was removed in pandas 2.0; on modern
    # pandas this needs pd.concat([tmp, pd.Series(...)]) — confirm the
    # pinned pandas version before upgrading.
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))

    # Store data
    benchmark.append(tmp)

# Store results
# NOTE(review): ascending=False puts the *worst* RMSE first — verify this
# ordering is intentional (the other benchmark in this file sorts ascending).
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse', ascending=False)

# Get data
data = surprise_results[['test_rmse', 'test_mae']]
print("Precision: " + str(sur_precision(atstd, predictions, product_index))) # NMF: algo = surprise.NMF() algo.fit(trainset) predictions = algo.test(testset) print("model NMF: ") accuracy.rmse(predictions) print("NDCG: " + str(sur_ndcg(atstd, predictions, product_index))) print("Precision: " + str(sur_precision(atstd, predictions, product_index))) # SlopeOne: algo = surprise.SlopeOne() algo.fit(trainset) predictions = algo.test(testset) print("model SlopeOne: ") # Then compute RMSE accuracy.rmse(predictions) print("NDCG: " + str(sur_ndcg(atstd, predictions, product_index))) print("Precision: " + str(sur_precision(atstd, predictions, product_index))) continue # Make predictions with trained parameters: X_ = U1.dot(V.T) Y_ = U2.dot(V.T) A_ = U1.dot(U2.T) + H1.dot(H2.T)
def main(train_df, target_df, cache_name="test", force_recompute=None):
    """Train multiple models on train_df and predicts target_df

    Predictions are cached. If the indices don't match the indices of
    target_df, the cache is discarded.

    By default, if a method was already computed it is not recomputed again
    (except if the method name is listed in force_recompute). cache_name is
    the name to use to read and write the cache.

    Arguments:
        train_df {dataframe} -- Training dataframe
        target_df {dataframe} -- Testing dataframe

    Keyword Arguments:
        cache_name {str} -- Name to use for caching (default: {"test"})
        force_recompute {list} -- Name(s) of methods to recompute, whether or
            not it was already computed. Useful to only recompute single
            methods without discarding the rest. (default: {[]})

    Returns:
        Dataframe -- Dataframe with predictions for each methods as columns,
            IDs as indices
    """
    global algo_in_use
    # FIX: mutable default argument replaced by a None sentinel
    # (backward-compatible: callers passing a list see identical behavior).
    if force_recompute is None:
        force_recompute = []
    CACHED_DF_FILENAME = os.path.dirname(
        os.path.abspath(__file__)) +\
        "/cache/cached_predictions_{}.pkl".format(cache_name)
    train_df = preprocess_df(train_df)
    trainset = pandas_to_data(train_df)
    ids_to_predict = target_df["Id"].to_list()

    # try to retrieve backup dataframe
    try:
        print("Retrieving cached predictions")
        all_algos_preds_df = pd.read_pickle(CACHED_DF_FILENAME)
        print("Ensuring cached IDs match given IDs")
        # FIX: explicit raise instead of `assert` — asserts are stripped
        # under `python -O`, which would silently accept a stale cache.
        # AssertionError is kept so the except clause below still matches.
        if sorted(ids_to_predict) != sorted(
                all_algos_preds_df.index.values):
            raise AssertionError("Cached IDs do not match given IDs")
        print("Indices match, continuing")
    except (FileNotFoundError, AssertionError):
        print("No valid cached predictions found")
        all_algos_preds_df = pd.DataFrame(ids_to_predict, columns=["Id"])
        all_algos_preds_df.set_index("Id", inplace=True)

    all_algos = {
        "SVD": spr.SVD(n_factors=200, n_epochs=100),
        "Baseline": spr.BaselineOnly(),
        "NMF": spr.NMF(n_factors=30, n_epochs=100),
        "Slope One": spr.SlopeOne(),
        "KNN Basic": spr.KNNBasic(k=60),
        "KNN Means": spr.KNNWithMeans(k=60),
        "KNN Baseline": spr.KNNBaseline(),
        "KNN Zscore": spr.KNNWithZScore(k=60),
        "SVD ++": spr.SVDpp(n_factors=40, n_epochs=100),
        "Co Clustering": spr.CoClustering()
    }

    for name in all_algos:
        print("##### {} ####".format(name))
        # Drop a cached column when its recomputation is forced.
        if name in force_recompute and name in all_algos_preds_df.columns:
            all_algos_preds_df.drop(name, axis=1, inplace=True)
        if name in all_algos_preds_df.columns:
            print("Already computed {}, skipping".format(name))
            continue
        algo = all_algos[name]
        time.sleep(1)
        algo.fit(trainset)
        time.sleep(1)
        # Publish the fitted model through the module-level global used by
        # the prediction workers.
        algo_in_use = algo
        print("Generating predictions...")
        predictions = parallelize_predictions(ids_to_predict, 80)
        print("Done. Merging with previous results")
        this_algo_preds_df = pd.DataFrame(predictions, columns=["Id", name])
        this_algo_preds_df.set_index("Id", inplace=True)
        all_algos_preds_df = pd.merge(all_algos_preds_df,
                                      this_algo_preds_df,
                                      left_index=True,
                                      right_index=True)
        # Persist after every method so an interrupted run keeps progress.
        all_algos_preds_df.to_pickle(CACHED_DF_FILENAME)
    print("DONE computing surprize")
    return all_algos_preds_df
def build_network(self):
    """Instantiate the underlying model: a Surprise SlopeOne estimator,
    constructed with no arguments."""
    self.model = surprise.SlopeOne()
# identifier_df_train = X_train[['user_id', 'business_id']] # identifier_df_test = X_test[['user_id', 'business_id']] # A reader is needed but only the rating_scale param is requiered. reader = Reader(rating_scale=(1, 5)) data = Dataset.load_from_df(train_df, reader) sim_options = {'user_based': [False]} results = [] # Iterate over all algorithms for algorithm in [ SVD(), surprise.NMF(), surprise.SlopeOne(), surprise.CoClustering(), surprise.KNNBasic(sim_options=sim_options), surprise.KNNWithMeans(sim_options=sim_options), surprise.KNNWithZScore(sim_options=sim_options), surprise.KNNBaseline(sim_options=sim_options), surprise.NormalPredictor(), surprise.BaselineOnly() ]: # Get string of algname for naming a pickle file a useful name alg_name = str(algorithm) alg_name = alg_name[alg_name.find('.') + 1:] alg_name = alg_name[alg_name.find('.') + 1:] alg_name = alg_name[alg_name.find('.') + 1:] alg_name = alg_name[:alg_name.find('object') - 1]
# defining the number of folds = 5 print("Performing splits...") kf = sp.model_selection.KFold(n_splits=5, random_state=0) print("Done.") ### ### PART 1.1 ### ''' application of all algorithms for recommendation made available by “Surprise” libraries, according to their default configuration. ''' algorithms = [sp.NormalPredictor(), sp.BaselineOnly(), sp.KNNBasic(),\ sp.KNNWithMeans(), sp.KNNWithZScore(), sp.KNNBaseline(),\ sp.SVD(), sp.SVDpp(), sp.NMF(), sp.SlopeOne(), sp.CoClustering()] for elem in algorithms: start_time = time.time() algo = elem sp.model_selection.cross_validate(algo, data, measures=['RMSE'], \ cv=kf, n_jobs = 2, verbose=True) print("--- %s seconds ---" % (time.time() - start_time)) print() ### ### PART 1.2 ### ''' Improvement of the quality of both KNNBaseline and SVD methods, by performing hyper-parameters tuning over 5-folds Random-Search-Cross-Validation - KNN
# Continuation of a model-selection if/elif chain whose opening `if` is
# above this excerpt: map the --model CLI argument to a Surprise estimator.
elif args.model == 'KNNBasic':
    model = surprise.KNNBasic()
elif args.model == 'KNNWithMeans':
    model = surprise.KNNWithMeans()
elif args.model == 'KNNWithZScore':
    model = surprise.KNNWithZScore()
elif args.model == 'KNNBaseline':
    model = surprise.KNNBaseline()
elif args.model == 'SVD':
    model = surprise.SVD()
elif args.model == 'SVDpp':
    model = surprise.SVDpp(verbose=True)
elif args.model == 'NMF':
    model = surprise.NMF()
elif args.model == 'SlopeOne':
    model = surprise.SlopeOne()
elif args.model == 'CoClustering':
    model = surprise.CoClustering()

# cross_validate(model, trainset, cv=5, verbose=True)
model.fit(trainset)

lines = []
test_path = path + '/Data/test_format.txt'
# NOTE(review): both file handles below are never closed — consider `with`.
for line in tqdm(open(test_path, 'r').readlines()):
    # Each test line is "user,item,timestamp[,tag...]"; only user/item are
    # used for prediction.
    user_id, item_id, timestamp, *tags = line.strip().split(',')
    # .est is the numeric rating estimate from the Prediction tuple;
    # "{:.5}" formats it to 5 significant digits.
    rating = model.predict(user_id, item_id).est
    lines.append("{:.5}\n".format(float(rating)))
open(path + '/Data/submit.txt', 'w').writelines(lines)