def coClustering(trainset, testset):
    # CoClustering
    print("\n" + "-" * 5 + " CoClustering algorithm using surprise package " + "-" * 5)
    algo = CoClustering()
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
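# A minimal, self-contained way to exercise coClustering() above -- a sketch assuming
# the standard Surprise imports and the built-in MovieLens-100k dataset (the original
# project's data loading is not shown):
from surprise import CoClustering, Dataset, accuracy
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=0.2, random_state=0)
rmse, mae, predictions = coClustering(trainset, testset)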
def Coclustering(recipe_df, train_rating_df, pd, benchmark):
    print("\n###### Compute CoClustering ######")
    df = pd.merge(recipe_df, train_rating_df, on='recipe_id', how='inner')
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'recipe_id', 'rating']], reader)
    trainSet, testSet = train_test_split(data, test_size=.2, random_state=0)
    algo = CoClustering()
    algo.fit(trainSet)
    predictions = algo.test(testSet)
    Evaluators.RunAllEvals(predictions, benchmark)
def cluster(data, kwargs):
    # Read hyperparameters from kwargs
    cluster_u = kwargs.get('user_cluster')
    cluster_i = kwargs.get('item_cluster')
    n_epochs = kwargs.get('maxiter')

    # Set algorithm
    algo = CoClustering(n_cltr_u=cluster_u[0],
                        n_cltr_i=cluster_i[0],
                        n_epochs=n_epochs,
                        random_state=kwargs['random_seed'])

    # Train the algorithm on the data, and predict a rating for every (user, item) pair
    algo.fit(data)
    prediction = np.zeros([10000, 1000])
    for row in range(10000):
        for col in range(1000):
            prediction[row, col] = algo.predict(str(row + 1), str(col + 1)).est
    return prediction
def co_clustering(train, test, ids, Xtest, Xids):
    """
    Co-clustering algorithm: users and items are assigned to clusters and co-clusters.

    Arguments:
        train: the trainset
        test: the testset
        ids: unknown ratings
        Xtest: predicted ratings for the testset, to be used for final blending
        Xids: predicted ratings for the unknown ratings, to be used for final blending
    """
    print('Co-clustering')

    algo = CoClustering(n_cltr_u=1, n_cltr_i=1, n_epochs=50, random_state=15)

    # Train algorithm on training set
    algo.fit(train)

    # Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print(' Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    # Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print(' Test RMSE: ', rmse)

    preds_test = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds_test[j] = pred.est

    # Predict unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)

    return rmse, Xtest, Xids, preds_test, preds_ids
def co_clustering(self, n_cltr_u=10, n_cltr_i=10, n_epochs=20):
    """
    Co-clustering collaborative filtering algorithm: users and items are assigned
    to clusters and co-clusters.

    Args:
        n_cltr_u: Number of user clusters
        n_cltr_i: Number of item clusters
        n_epochs: Number of iterations of the optimization loop
    Returns:
        predictions_df: The predictions of the model on the test data as a pandas DataFrame
    """
    algorithm = CoClustering(n_cltr_u=n_cltr_u, n_cltr_i=n_cltr_i, n_epochs=n_epochs)
    predictions = algorithm.fit(self.train_data).test(self.test_data)
    predictions_df = self.data.test_df.copy()
    predictions_df['Rating'] = [x.est for x in predictions]
    if self.test_purpose:
        self.evalueate_model(predictions_df['Rating'], 'Surprise co_clustering')
    return predictions_df
'''
file_path1 = os.path.expanduser("../test1.csv")
reader1 = Reader(line_format="user item rating", sep=',')
data = Dataset.load_from_file(file_path, reader=reader)
data1 = Dataset.load_from_file(file_path1, reader=reader1)
'''

trainset = data.build_full_trainset()
#testset = data1.build_full_trainset()

# Use the CoClustering algorithm.
algo = CoClustering()

# Run 5-fold cross-validation and print results.
#cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

algo.fit(trainset)

'''
predictions = algo.test(testset)
print(predictions)
'''

'''
uid = str(0)      # raw user id (as in the ratings file). They are **strings**!
iid = str(35546)  # raw item id (as in the ratings file). They are **strings**!

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, verbose=True)
print(pred[3])
'''

testset = open("../test1.csv", "r")
result = open("./result/result_CoClustering.txt", "w")
for i in range(5):
    movieRecc2 = topMovies2[i]
    movieRawID2 = movieRecc2[0]
    movieName2 = movie[movieRawID2]
    print(str(i+1) + '. ' + movieName2)

############# predictions using Co-Clustering
print('')
print('Making more recommendations...')
algo3 = CoClustering()
algo3.fit(trainset)
predictions3 = algo3.test(testset)
dictMovies3 = get_top_n(predictions3)
topMovies3 = dictMovies3.get(672)
print('')
print('Here are the top 5 recommendations based on Co-Clustering!')
for i in range(5):
    movieRecc3 = topMovies3[i]
    movieRawID3 = movieRecc3[0]
    movieName3 = movie[movieRawID3]
    print(str(i+1) + '. ' + movieName3)
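# The script above calls get_top_n() without defining it here. A minimal sketch of
# such a helper, following the recipe in the Surprise documentation FAQ (the original
# project's helper may differ in details such as the default n):
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Map each user id in `predictions` to its n highest-estimated (item id, rating) pairs."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n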
class Surprise():

    def train(self, algo='SVD', like=True, test='cv', local=False):

        if local:
            csv_path = os.path.join(os.path.dirname(__file__), "data/preprocessed")
            self.recipes = pd.read_csv(f"{csv_path}/recipe_pp.csv")
            self.reviews = pd.read_csv(f"{csv_path}/review_pp.csv")
        else:
            self.recipes = storage.import_file('data/preprocessed', 'recipe_pp.csv')
            self.reviews = storage.import_file('data/preprocessed', 'review_pp.csv')

        if like:
            self.target = 'liked'
            self.s_min = 0
            self.s_max = 1
        else:
            self.target = 'rating'
            self.s_min = 1
            self.s_max = 5

        reader = Reader(rating_scale=(self.s_min, self.s_max))
        self.relevant_data = self.reviews[['user_id', 'recipe_id', self.target]]
        model_data = Dataset.load_from_df(self.relevant_data, reader)

        # Algos -- select the estimator named by the `algo` argument
        if algo == 'NormalPredictor':
            self.algorithm = NormalPredictor()
        elif algo == 'BaselineOnly':
            self.algorithm = BaselineOnly()
        elif algo == 'KNNBasic':
            self.algorithm = KNNBasic()
        elif algo == 'KNNWithMeans':
            self.algorithm = KNNWithMeans()
        elif algo == 'KNNWithZScore':
            self.algorithm = KNNWithZScore()
        elif algo == 'KNNBaseline':
            self.algorithm = KNNBaseline()
        elif algo == 'SVD':
            params = {'n_epochs': 20, 'n_factors': 100, 'lr_all': 0.002, 'reg_all': 0.02}
            self.algorithm = SVD(**params)  # Tuned with svd_grid
        elif algo == 'SVDpp':
            self.algorithm = SVDpp()
        elif algo == 'NMF':
            self.algorithm = NMF()
        elif algo == 'SlopeOne':
            self.algorithm = SlopeOne()
        elif algo == 'CoClustering':
            self.algorithm = CoClustering()

        if test == 'cv':
            cv_results = cross_validate(self.algorithm, model_data,
                                        measures=['RMSE', 'MAE'], cv=5, verbose=True)
            rmse = np.round(cv_results['test_rmse'].mean(), 3)
            mae = np.round(cv_results['test_mae'].mean(), 3)
            train_data = model_data.build_full_trainset()
            self.algorithm.fit(train_data)

        elif test == 'svd_grid':
            param_grid = {'n_epochs': [10, 20], 'n_factors': [100, 200],
                          'lr_all': [0.001, 0.002], 'reg_all': [0.01, 0.02]}
            gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
            gs.fit(model_data)
            rmse = gs.best_score['rmse']
            mae = gs.best_score['mae']
            print(gs.best_params['rmse'], gs.best_params['mae'])
            self.algorithm = gs.best_estimator['rmse']
            train_data = model_data.build_full_trainset()
            self.algorithm.fit(train_data)

        else:
            train, test = train_test_split(model_data, test_size=0.3, random_state=42)
            self.algorithm.fit(train)
            predictions = self.algorithm.test(test)
            rmse = np.round(accuracy.rmse(predictions), 3)
            mae = np.round(accuracy.mae(predictions), 3)

        return rmse, mae

    def predict(self, user_id):
        inputs = self.relevant_data[self.relevant_data['user_id'] == user_id] \
            .merge(self.recipes, on="recipe_id", how="left")[['recipe_id', 'name', self.target]]
        display(inputs)

        user_recipes = self.relevant_data[
            self.relevant_data['user_id'] == user_id].recipe_id.unique()
        recipe_list = self.relevant_data[
            self.relevant_data['user_id'] != user_id].recipe_id.unique()

        predictions = [
            self.algorithm.predict(user_id, rec)
            for rec in recipe_list if rec not in list(user_recipes)
        ]

        pdf = pd.DataFrame(predictions, columns=[
            'user_id', 'recipe_id', self.target, f'rec_{self.target}', 'details'
        ])
        pdf = pdf.drop(columns=[self.target, 'details'])
        pdf = pdf.sort_values(f'rec_{self.target}', ascending=False)
        rec_target = pdf[f'rec_{self.target}']
        pdf['rec_score'] = (rec_target - self.s_min) / (self.s_max - self.s_min)

        outputs = pdf.merge(self.recipes, on="recipe_id", how="left")[[
            'recipe_id', 'name', f'rec_{self.target}', 'rec_score'
        ]]
        display(outputs.head(10))

        return outputs
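# Hypothetical usage of the Surprise wrapper above (a sketch: it assumes the
# preprocessed recipe_pp.csv / review_pp.csv files, or the `storage` backend,
# are available, and that `a_user_id` is a user present in review_pp.csv):
#
#   recommender = Surprise()
#   rmse, mae = recommender.train(algo='CoClustering', like=True, test='cv', local=True)
#   recommendations = recommender.predict(a_user_id)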
print("Best Params\n", gs.best_params) print("Best Estimators\n", gs.best_estimator) print("Best Index\n", gs.best_index) print("Results Dicts: \n") results_df = pd.DataFrame.from_dict(gs.cv_results) print(results_df) # * define a cross-validation iterator kf = KFold(n_splits=5) # * Choosing Co-Clustering as algorithm algo = CoClustering() # * Train the algorithm on the trainset, and predict ratings for the testset for trainset, testset in kf.split(data): predictions = algo.fit(trainset).test(testset) precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4) accuracy.rmse(predictions) accuracy.mae(predictions) accuracy.mse(predictions) accuracy.fcp(predictions) print("Precision: ", sum(prec for prec in precisions.values()) / len(precisions)) print("Recall: ", sum(rec for rec in recalls.values()) / len(recalls)) df = pd.DataFrame(predictions, columns=["uid", "iid", "rui", "est", "details"]) df["err"] = abs(df.est - df.rui) df.to_csv("predictions_CoClustering.csv") # top_n = get_top_n(predictions, n=10) # * Print the recommended items for each user
# In[88]:

# We'll use the CoClustering algorithm.
from surprise import CoClustering

df_CoClustering = df_final_user_repo_star_v3.copy(deep=True)
dataCoClustering = Dataset.load_from_df(df_CoClustering, reader)

coClustering = CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=20)

# Train the algorithm on the full trainset, and predict ratings for the anti-testset
trainsetcoClustering = dataCoClustering.build_full_trainset()
coClustering.fit(trainsetcoClustering)
testcoClustering = trainsetcoClustering.build_anti_testset()
predictionscoClustering = coClustering.test(testcoClustering)

rmseCoClustering = accuracy.rmse(predictionscoClustering)
listOfRMSE.append(rmseCoClustering)
models.append('CoClustering')

# ## SlopeOne Implementation

# In[89]: