def NMF_Mat(df):
    """Return the 10-component NMF reconstruction of ``df``.

    An ``NMF`` model (10 components, random initialisation, seed 0) is fitted
    on ``df`` and its two factors are multiplied back together.

    Args:
        df: Matrix-like input accepted by ``NMF.fit_transform``.

    Returns:
        The matrix product of the transformed data and ``components_``.
    """
    factorizer = NMF(n_components=10, init='random', random_state=0)
    sample_factors = factorizer.fit_transform(df)
    # components_ is read only after the fit above has populated it.
    return sample_factors @ factorizer.components_
def nmf(data, training, testing):
    """Grid-search NMF hyper-parameters, then score the tuned model.

    Args:
        data (Dataset): dataset handed to the grid search.
        training (Dataset): data the tuned model is trained on.
        testing (Dataset): data the tuned model is tested on.

    Returns:
        tuple: ``(rmse, top_n)`` -- the value returned by ``accuracy.rmse``
        for the test predictions, and ``get_top_n(predictions, n=5)``.
    """
    # Candidate values explored by the grid search.
    search_space = {'n_factors': [45, 50, 55, 60], 'n_epochs': [45, 50, 55]}

    searcher = GridSearch(NMF, search_space, measures=['RMSE'], verbose=False)
    searcher.evaluate(data)
    best = searcher.best_params['RMSE']
    print('NMF:', best)

    # Retrain a fresh model with the winning configuration.
    tuned_model = NMF(n_factors=best['n_factors'], n_epochs=best['n_epochs'])
    tuned_model.train(training)

    # Score the tuned model on the held-out data.
    test_predictions = tuned_model.test(testing)
    top_n = get_top_n(test_predictions, n=5)
    rmse = accuracy.rmse(test_predictions, verbose=True)
    return rmse, top_n
def recommender_nmf_baseline(self, train_file, test_file, output):
    """Fit a default NMF model and print its RMSE and MAE on the test data.

    Args:
        train_file: passed to ``prepare_datasets`` as the training source.
        test_file: passed to ``prepare_datasets`` as the test source.
        output: not used by this method.

    Returns:
        The fitted ``NMF`` instance.
    """
    # Only the first two values returned by prepare_datasets are used here.
    train, test, train_dataset, test_dataset = prepare_datasets(
        train_file, test_file)

    baseline_model = NMF()
    baseline_model.fit(train)
    test_predictions = baseline_model.test(test, verbose=False)

    rmse_text = str(rmse(test_predictions, verbose=False))
    mae_text = str(mae(test_predictions, verbose=False))
    print('NMF BASELINE: ' + ' RMSE ' + rmse_text + ' MAE ' + mae_text)
    return baseline_model
def predict_NMF(userid):
    """Print and return movie titles recommended to ``userid`` by NMF.

    Ratings are read from ``ratings_small.csv``, split 75/25 into train and
    test sets, and a default NMF model is fitted on the training part. The
    top-5 test predictions per user come from ``get_top_n``; the item ids for
    ``userid`` are then looked up in ``movies_metadata.csv``.

    Args:
        userid: raw user id, compared with ``==`` against the keys of the
            mapping returned by ``get_top_n``.

    Returns:
        pandas.DataFrame: the matching ``movieId``/``title`` rows with the
        first two rows skipped. The frame is empty when ``userid`` has no
        predictions in the test split.
    """
    ratings = pd.read_csv('ratings_small.csv').drop(['timestamp'], axis=1)
    # NOTE(review): a (1, 30) rating scale looks unusual; confirm it matches
    # the values stored in ratings_small.csv.
    reader = Reader(rating_scale=(1, 30))
    # Load the ratings through the reader.
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader=reader)
    # Hold out 25% of the samples for testing and train on the other 75%.
    trainset, testset = train_test_split(data, test_size=.25)

    # Fit NMF and rank the test predictions per user.
    algo = NMF()
    algo.fit(trainset)
    top_nmf_n = get_top_n(algo.test(testset), n=5)

    movie_titles = pd.read_csv('movies_metadata.csv', usecols=['id', 'title'])
    movie_titles = movie_titles.rename(columns={'id': 'movieId'})
    # Non-numeric ids are coerced to 0 so the column can be cast to int.
    movie_titles['movieId'] = pd.to_numeric(movie_titles['movieId'],
                                            errors='coerce').fillna(0)
    movie_titles['movieId'] = movie_titles['movieId'].astype('int')
    # drop_duplicates returns a new frame; the result must be kept.
    movie_titles = movie_titles.drop_duplicates()

    user_top = next(
        (user_ratings for uid, user_ratings in top_nmf_n.items()
         if uid == userid),
        None)
    # A user absent from the test split simply has no recommendations.
    recommended_ids = [] if user_top is None else [iid for (iid, _) in user_top]

    titles = movie_titles[movie_titles.movieId.isin(recommended_ids)]
    # NOTE(review): the first two matching rows are skipped, as before;
    # confirm that this slice is intentional.
    result = titles[2:]
    print(result)
    return result
def recommendation_mf(userArray, numUsers, movieIds):
    """Recommend up to nine movies for a group of new users via NMF.

    The ratings in ``userArray`` are appended to the module-level
    ``df_ratings`` under fresh user ids (following the current maximum id).
    A default NMF model is fitted on everything, the user/item factor product
    is computed, and items are ranked by the harmonic mean of the new users'
    reconstructed scores.

    Args:
        userArray: one sequence of ratings per new user, aligned with
            ``movieIds``.
        numUsers: number of new users described by ``userArray``.
        movieIds: list of item ids rated by every new user.

    Returns:
        list[tuple]: ``(title, poster_url, score)`` tuples, best score first.
        Items missing from ``df_ML_movies`` are left out.
    """
    # Hoisted: the original recomputed this maximum for every appended rating.
    first_new_user = max(df_ratings.user_id) + 1

    ratings_dict = {
        'itemID': list(df_ratings.movie_id_ml) + list(numUsers * movieIds),
        'userID': list(df_ratings.user_id) + [
            first_new_user + x
            for x in range(numUsers)
            for _ in range(len(userArray[0]))
        ],
        'rating': list(df_ratings.rating) + [
            item for sublist in userArray for item in sublist
        ],
    }
    df = pd.DataFrame(ratings_dict)
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
    trainset = data.build_full_trainset()

    nmf = NMF()
    nmf.fit(trainset)

    userIds = [trainset.to_inner_uid(first_new_user + x)
               for x in range(numUsers)]
    mat = np.dot(nmf.pu, nmf.qi.T)
    scores = hmean(mat[userIds, :], axis=0)

    # Inner ids of the nine best-scoring items, highest score first.
    best_movies = scores.argsort()[-9:][::-1]
    scores = scores[best_movies]
    movie_ind = [trainset.to_raw_iid(x) for x in best_movies]

    # Look metadata up by id in ranking order. Filtering with isin() alone
    # keeps the frame's own row order, which pairs titles with wrong scores.
    matches = df_ML_movies[df_ML_movies.movie_id_ml.isin(movie_ind)]
    meta = matches.drop_duplicates('movie_id_ml').set_index('movie_id_ml')
    recommendation = [
        (meta.at[iid, 'title'], meta.at[iid, 'poster_url'], score)
        for iid, score in zip(movie_ind, scores)
        if iid in meta.index
    ]
    return recommendation
def do_nmf(data_raw, impute_params):
    """Fill the missing user/movie cells of ``data_raw`` with NMF estimates.

    An NMF model is fitted on every row of ``data_raw``; every (user, movie)
    pair absent from the training data is then predicted and appended to the
    known ratings before pivoting to a dense matrix.

    Args:
        data_raw: DataFrame with at least the columns ``"User"``,
            ``"Movie"`` and ``"Prediction"``.
        impute_params: mapping providing ``"FACTORS"`` and ``"EPOCHS"``.

    Returns:
        numpy.ndarray: the User x Movie pivot of known and predicted values.
    """
    reader = surprise.Reader(rating_scale=(1, 5))
    dataset = surprise.Dataset.load_from_df(
        data_raw[["User", "Movie", "Prediction"]], reader)
    trainset = dataset.build_full_trainset()

    algo = NMF(n_factors=impute_params["FACTORS"],
               n_epochs=impute_params["EPOCHS"],
               verbose=True)
    algo.fit(trainset)

    # The anti-testset holds every (user, item) pair missing from trainset.
    predicted = algo.test(trainset.build_anti_testset())

    frames = [data_raw]
    if predicted:
        # Skipped when nothing is missing: an empty prediction list would
        # give a frame without the columns selected below.
        imputed = pd.DataFrame(predicted).rename(columns={
            "uid": "User",
            "iid": "Movie",
            "est": "Prediction"
        })
        frames.append(imputed[["User", "Movie", "Prediction"]])

    completed = pd.concat(frames, ignore_index=True)
    return completed.pivot(index="User", columns="Movie",
                           values="Prediction").to_numpy()
def main():
    """Print the ten items with the highest NMF estimate for one user.

    The user's own ratings are read from ``Python/user_rated_movies.tsv``
    and appended to the ratings loaded from ``ratings.tsv``. An NMF model is
    fitted on the combined data, and an estimate is computed for the value in
    column 1 of every row of ``movies.tsv``. The predictions are made for the
    user id found on the first line of the user's rating file.
    """
    user_ratings = []
    # The context manager closes the file even if a line fails to parse.
    with open("Python/user_rated_movies.tsv", "r") as f:
        for line in f:
            if not line.strip():
                continue  # ignore blank lines
            inline = line.split('\t')
            # strip() removes the line ending without eating a digit when
            # the last line has no trailing newline or more columns follow.
            rating = float(inline[2].strip())
            # Same 4-tuple layout as the entries of Dataset.raw_ratings.
            user_ratings.append((inline[0], inline[1], rating, None))

    # data = Dataset.load_builtin(name=u'ml-1m')
    reader = Reader(line_format='user item rating', sep='\t')
    datain = pd.read_csv("ratings.tsv", sep="\t")
    data = Dataset.load_from_df(datain, reader=reader)
    data.raw_ratings.extend(user_ratings)

    movies = pd.read_csv("movies.tsv", sep="\t", header=None, low_memory=False)

    algo = NMF(n_factors=4, n_epochs=100, random_state=1)
    trainSet = data.build_full_trainset()
    algo.fit(trainSet)

    # Estimate a rating for every movie row on behalf of the first user
    # listed in the user's rating file.
    predictions = [
        algo.predict(user_ratings[0][0], row[1], r_ui=4)
        for index, row in movies.iterrows()
    ]

    # Field 3 of a prediction is the estimate and field 1 is the item id.
    sortpred = sorted(predictions, key=lambda pred: pred[3])[-10:]
    for pred in sortpred:
        print(pred[1])
def run_process(self, all_ips_data, ip_16_data, misclassifications, queue):
    """Estimate a "misclassifications" score per IP and push it to ``queue``.

    ``generate_prefix_data`` supplies a mapping ``ip -> {blacklist: score}``.
    Those scores, plus a rating of 10 on the pseudo-item
    ``"misclassifications"`` for every known IP listed in
    ``misclassifications``, form the rating matrix of an NMF model. The epoch
    count grows in steps of 100 until the mean cross-validated RMSE is <= 1,
    the next step would reach ``self.epochs``, or cross-validation fails.

    Args:
        all_ips_data: forwarded to ``generate_prefix_data``.
        ip_16_data: forwarded to ``generate_prefix_data``.
        misclassifications: iterable of IPs flagged as misclassified.
        queue: object with ``put``; receives ``"<ip>,<score>"`` strings.

    Returns:
        None. Nothing is queued when there is no history; with fewer than
        five IPs every IP is queued with a score of 0.
    """
    historical_item = generate_prefix_data(all_ips_data, ip_16_data,
                                           self.reference_end_time,
                                           self.half_life_duration)
    history_size = len(historical_item)
    if history_size == 0:
        return
    if history_size < 5:
        for ip in historical_item:
            queue.put(ip + ",0")
        return

    # Collect the CSV lines and join them once instead of growing a string.
    csv_lines = ["userId,itemId,rating"]
    ip_order = set()
    for ip, bl_name_data in historical_item.items():
        ip_order.add(ip)
        for bl_name, score in bl_name_data.items():
            csv_lines.append(ip + "," + bl_name + "," + str(score))
    for ip in misclassifications:
        if ip in ip_order:
            csv_lines.append(ip + "," + "misclassifications,10")

    ratings = pd.read_csv(StringIO("\n".join(csv_lines) + "\n"))
    reader = Reader(rating_scale=(0, 10.0))
    # load_from_df expects the columns in user, item, rating order.
    data = Dataset.load_from_df(ratings[['userId', 'itemId', 'rating']],
                                reader)

    epochs = 100
    while True:
        algo = NMF(n_epochs=epochs, n_factors=self.n_factors)
        try:
            res = model_selection.cross_validate(algo, data,
                                                 measures=['RMSE'])
        except Exception:
            # Best effort, as before: stop tuning and use this model as is.
            # Unlike the former bare except, KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            break
        mean_rmse = sum(res["test_rmse"]) / len(res["test_rmse"])
        if mean_rmse <= 1:
            break
        epochs = epochs + 100
        if epochs >= self.epochs:
            break

    # NOTE(review): `algo` is only ever fitted inside cross_validate, never
    # on the full data set -- confirm that this is intended.
    for ip in ip_order:
        prediction = algo.predict(ip, "misclassifications").est
        queue.put(ip + "," + str(round(prediction, 2)))
def colaborative_filtering_based_model(path, config, engine, df_valid_games):
    """Build per-user game recommendations with NMF and store them via SQL.

    Every line of ``path`` is a JSON object whose first entry maps a user id
    to that user's inventory (a list of dicts with ``appid`` and
    ``playtime_forever``) or to ``None``. Playtimes are log-transformed and
    rescaled to 1-5 per user to act as ratings. After fitting NMF on all
    ratings, each user's ten best-scoring games from
    ``df_valid_games.steam_appid`` that are not in the inventory are written
    to the table named by ``config.mysql_user_like_table``.

    Args:
        path: file with one JSON document per line.
        config: object exposing ``mysql_user_like_table``.
        engine: connection/engine handed to ``DataFrame.to_sql``.
        df_valid_games: DataFrame with a ``steam_appid`` column.
    """
    with open(path, 'r') as f:
        raw_strings = f.readlines()

    total_count = len(raw_strings)
    user_ratings = []
    scaler = MinMaxScaler((1, 5))
    for current_count, raw_string in enumerate(raw_strings):
        user_id, user_inventory = list(json.loads(raw_string).items())[0]
        # Truthiness also skips empty inventories, which the scaler cannot
        # be fitted on (zero samples).
        if user_inventory:
            app_ids = [item['appid'] for item in user_inventory]
            playtimes = [item['playtime_forever'] for item in user_inventory]
            # The scaler is refitted per user: each user's range maps to 1-5.
            app_scores = scaler.fit_transform(
                np.log1p(playtimes).reshape(-1, 1))
            user_ratings += [[user_id, app_id, score.item()]
                             for app_id, score in zip(app_ids, app_scores)]
        show_work_status(1, total_count, current_count)

    user_item_ratings = pd.DataFrame(
        user_ratings, columns=['user_id', 'item_id', 'rating'])

    # Prediction part
    game_ids_set = set(df_valid_games.steam_appid)
    grouped_user_item_ratings = user_item_ratings.groupby('user_id')
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        user_item_ratings[['user_id', 'item_id', 'rating']], reader)
    alg = NMF(n_factors=20)
    alg.fit(data.build_full_trainset())

    users = user_item_ratings.user_id.unique().tolist()
    total_count = len(users)
    prediction_fields = ['uid', 'iid', 'r_ui', 'est', 'details']
    dict_user_recommendations = {}
    for current_count, user in enumerate(users):
        owned = grouped_user_item_ratings.get_group(user)
        # NOTE(review): owned ids are compared as strings; this assumes
        # df_valid_games.steam_appid holds strings as well -- confirm.
        not_purchased_ids = game_ids_set - {str(x) for x in owned.item_id}
        # (user, item, rating) triples are what test() consumes, so the
        # former DataFrame -> Dataset -> trainset round-trip is not needed.
        user_test = [(user, item_id, 0.0) for item_id in not_purchased_ids]
        results = pd.DataFrame(alg.test(user_test), columns=prediction_fields)
        top_items = results.sort_values('est', ascending=False)
        dict_user_recommendations[user] = top_items.iloc[:10, 1].values.tolist()
        show_work_status(1, total_count, current_count)

    # orient='index' gives one row per user and tolerates lists that are
    # shorter than ten items.
    df_cf_based_results = pd.DataFrame.from_dict(dict_user_recommendations,
                                                 orient='index')
    df_cf_based_results.index.name = 'user_id'
    df_cf_based_results.reset_index(inplace=True)
    df_cf_based_results.to_sql(config.mysql_user_like_table, engine,
                               if_exists='replace')
def trainingRatings(movies, users, ratings):
    """Fit NMF on the given ratings and pass the model on for recommending.

    Args:
        movies: item ids, one per rating.
        users: user ids, one per rating.
        ratings: rating values, read on a 1-5 scale.

    The fitted model is forwarded to ``recommendMoviesForUsers`` together
    with ``movies`` and ``users``; nothing is returned.
    """
    frame = pd.DataFrame({"movies": movies, "users": users, "ratings": ratings})
    scale_reader = Reader(rating_scale=(1, 5))
    # Column order matters: user, item, rating.
    dataset = Dataset.load_from_df(frame[["users", "movies", "ratings"]],
                                   scale_reader)
    full_trainset = dataset.build_full_trainset()

    model = NMF(n_factors=100, n_epochs=100, reg_pu=0.01)
    model.fit(full_trainset)
    recommendMoviesForUsers(movies, users, model)
def initialize(self, data_filepath):
    """Load the ratings file, create both models and start training.

    Args:
        data_filepath: path of the ratings file, read with the ``'ml-100k'``
            reader.

    Sets ``_data``, ``_trainset``, ``_knn`` and ``_nmf`` on the instance and
    hands ``self._train`` to ``start_new_thread``.
    """
    ml_reader = Reader('ml-100k')
    self._data = Dataset.load_from_file(data_filepath, reader=ml_reader)
    self._trainset = self._data.build_full_trainset()

    # Item-based similarity using the pearson_baseline measure.
    self._knn = KNNBaseline(
        sim_options={'name': 'pearson_baseline', 'user_based': False})
    self._nmf = NMF()

    # NOTE(review): if start_new_thread is the one from `_thread`, it also
    # needs an args tuple -- confirm which helper is imported here.
    start_new_thread(self._train)
def generate_svd_recommendation_df() -> pd.DataFrame:
    """Evaluate SVD, NMF and several other algorithms on the score data.

    SVD and NMF are each cross-validated (5 folds, verbose), fitted on the
    full training set, tested on its anti-testset and passed to
    ``get_top_n``. A further series of algorithms is then cross-validated
    silently.

    NOTE(review): nothing is returned although the signature is annotated
    with ``pd.DataFrame`` -- confirm what callers expect.
    """
    # Prepare the input data.
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)

    def _fit_and_rank(algo, report_mae):
        """Cross-validate, fit, test and rank the predictions of ``algo``."""
        full_train_set = svd_data.build_full_trainset()
        test_set = full_train_set.build_anti_testset()
        # 5-fold validation, printed by cross_validate itself.
        cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5,
                       verbose=True)
        algo.fit(full_train_set)
        predictions = algo.test(test_set)
        accuracy.rmse(predictions)
        if report_mae:
            accuracy.mae(predictions)
        return get_top_n(predictions, n=5)

    # SVD first; only RMSE is reported for it.
    recommendation_df_svd = _fit_and_rank(SVD(), report_mae=False)

    # Then NMF, preceded by a silent cross-validation run of its own.
    cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)
    recommendation_df_svd = _fit_and_rank(NMF(), report_mae=True)

    # as per - https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
    # kNN-based, matrix-factorisation and other collaborative filtering
    # algorithms, cross-validated in this order; the results are not kept.
    benchmark_classes = (
        KNNBasic, KNNWithMeans, KNNWithZScore,
        SVD, SVDpp, NMF,
        SlopeOne, CoClustering,
    )
    for algo_class in benchmark_classes:
        cross_validate(algo_class(), svd_data, cv=5, n_jobs=5, verbose=False)
def get_u_v(data_raw, params):
    """Fit NMF on ``data_raw`` and return its user and item factors.

    Args:
        data_raw: DataFrame with the columns ``"User"``, ``"Movie"`` and
            ``"Prediction"``.
        params: mapping providing ``"GLOBAL_NMF_K"`` (number of factors) and
            ``"GLOBAL_NMF_EPOCHS"`` (number of epochs).

    Returns:
        tuple: ``(pu, qi)`` taken from the fitted model.
    """
    # load_from_df reads the columns as user id, item id, rating, in order.
    rating_columns = ["User", "Movie", "Prediction"]
    dataset = surprise.Dataset.load_from_df(
        data_raw[rating_columns], surprise.Reader(rating_scale=(1, 5)))

    model = NMF(n_factors=params["GLOBAL_NMF_K"],
                n_epochs=params["GLOBAL_NMF_EPOCHS"],
                verbose=False)
    model.fit(dataset.build_full_trainset())

    factors = (model.pu, model.qi)
    logging.info("return from get_u_v")
    return factors
def check_for_args():
    """Append one algorithm instance per recognised command-line argument.

    Each entry of ``sys.argv`` that exactly matches a known algorithm name
    adds a default-constructed instance to the module-level ``alg_list``;
    all other entries are ignored.

    Returns:
        The module-level ``alg_list``.
    """
    known_algorithms = {
        'SVD': SVD,
        'SVDpp': SVDpp,
        'SlopeOne': SlopeOne,
        'NMF': NMF,
        'NormalPredictor': NormalPredictor,
        'KNNBaseline': KNNBaseline,
        'KNNBasic': KNNBasic,
        'KNNWithMeans': KNNWithMeans,
        'KNNWithZScore': KNNWithZScore,
        'BaselineOnly': BaselineOnly,
        'CoClustering': CoClustering,
    }
    for arg in sys.argv:
        algo_class = known_algorithms.get(arg)
        if algo_class is not None:
            alg_list.append(algo_class())
    return alg_list
def EvaluateDifferentAlgorithms():
    """Cross-validate several algorithms and print them ranked by RMSE.

    Every algorithm is evaluated with 3-fold cross-validation on the
    module-level ``data_6months``. The mean of each result column is kept
    per algorithm and the table is printed sorted by ``test_rmse``.
    """
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF(),
            NormalPredictor(),
            KNNBaseline(),
            KNNBasic(),
            KNNWithMeans(),
            KNNWithZScore(),
            BaselineOnly(),
            CoClustering()
    ]:
        # Perform cross validation
        results = cross_validate(algorithm, data_6months, measures=['RMSE'],
                                 cv=3, verbose=False)

        # Average every result column, then tag the row with the class name
        # (the last dotted component of the object's string form).
        mean_scores = pd.DataFrame.from_dict(results).mean(axis=0)
        label = pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                          index=['Algorithm'])
        # pd.concat replaces Series.append, which pandas 2.0 removed.
        benchmark.append(pd.concat([mean_scores, label]))

    print(
        pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
            'test_rmse'))
def run_baselines(ratings_dict, compressed_test_ratings_dict, data_origin):
    """Run ``testing`` for every name in the module-level ``algos``.

    Args:
        ratings_dict: forwarded to ``testing``.
        compressed_test_ratings_dict: forwarded to ``testing``.
        data_origin: ``'netflix'``, ``'small'`` or ``'100k'``; forwarded to
            ``testing``.

    The seven metrics returned by ``testing`` are printed per algorithm.
    """
    algorithm_classes = {
        "KNNBasic": KNNBasic,
        "KNNWithZScore": KNNWithZScore,
        "SVD": SVD,
        "NMF": NMF,
        "SlopeOne": SlopeOne,
        "CoClustering": CoClustering,
    }
    supported_origins = ('netflix', 'small', '100k')

    for alg in algos:
        # NOTE(review): an unrecognised name does not rebind `algo`, so the
        # previous iteration's instance is reused (NameError on the first
        # iteration). The same holds for the metrics when data_origin is not
        # supported -- confirm whether this should be rejected explicitly.
        if alg in algorithm_classes:
            algo = algorithm_classes[alg]()

        if data_origin in supported_origins:
            nr_predictions, accuracy, rmse, mae, precision, recall, f1 = testing(
                algo, ratings_dict, compressed_test_ratings_dict, data_origin)

        # print results
        print("\n\nAlg %s" % alg)
        print("Number of user-items pairs: %d" % nr_predictions)
        print("Accuracy: %.2f " % accuracy)
        print("RMSE: %.2f" % rmse)
        print("MAE: %.2f" % mae)
        print("Precision: %.2f" % precision)
        print("Recall: %.2f" % recall)
        print("F1: %.2f" % f1)
def benchmark(data):
    """Cross-validate a set of algorithms and store the ranked results.

    Each algorithm is evaluated on ``data`` with 3-fold cross-validation for
    RMSE, MAE and FCP. The per-algorithm column means are sorted by
    ``test_rmse`` and passed to ``store_dataframe`` under the name
    ``'Algorithm_Benchmark.csv'``.

    Args:
        data: dataset accepted by ``cross_validate``.
    """
    performance = []
    algorithms = [
        SVD(),
        SVDpp(),
        SlopeOne(),
        NMF(),
        NormalPredictor(),
        KNNBaseline(),
        KNNBasic(),
        KNNWithMeans(),
        KNNWithZScore(),
        BaselineOnly(),
        CoClustering(),
        SVD_SGD_momentum(),
        SVDpp_SGD_momentum()
    ]
    for algorithm in algorithms:
        results = cross_validate(algorithm, data,
                                 measures=['RMSE', 'MAE', 'FCP'],
                                 cv=3, verbose=False)
        mean_scores = pd.DataFrame.from_dict(results).mean(axis=0)
        # Class name = last dotted component of the object's string form.
        label = pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                          index=['Algorithm'])
        # pd.concat replaces Series.append, which pandas 2.0 removed.
        performance.append(pd.concat([mean_scores, label]))

    output_df = pd.DataFrame(performance).set_index('Algorithm').sort_values(
        'test_rmse')
    store_dataframe(output_df, 'Algorithm_Benchmark.csv')
def select_model(user_review):
    """Cross-validate several algorithms on the review data and print them.

    Args:
        user_review: currently ignored, because it is replaced immediately
            by the result of ``data_prep()``.

    Each algorithm is evaluated with 3-fold cross-validation (RMSE and MAE)
    on the ``user_id`` / ``business_id`` / ``stars`` columns. The list of
    per-algorithm mean scores is printed at the end.
    """
    # NOTE(review): this overwrites the argument -- confirm whether callers
    # expect their own frame to be used instead.
    user_review = data_prep()
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        user_review[['user_id', 'business_id', 'stars']], reader)

    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            KNNBasic(),
            KNNBaseline(),
            KNNWithMeans(),
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF()
    ]:
        # Perform cross validation
        print(algorithm)
        print('start ......')
        results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'],
                                 cv=3, verbose=False)

        # Average every result column, then tag the row with the class name
        # (the last dotted component of the object's string form).
        mean_scores = pd.DataFrame.from_dict(results).mean(axis=0)
        label = pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                          index=['Algorithm'])
        # pd.concat replaces Series.append, which pandas 2.0 removed.
        benchmark.append(pd.concat([mean_scores, label]))
    print(benchmark)
def nmf():
    """Interactively pick a data set and evaluate NMF on it.

    The user chooses between the Android and WordPress rating files (paths
    come from ``configuration``). The data is split into 10 folds and NMF is
    evaluated for RMSE and MAE; the result is printed with ``print_perf``.
    """
    # The banner used to announce "Baseline Only", copied from a sibling
    # routine; this function runs NMF.
    print('Algoritmo NMF...')
    print('Que data desea utilizar?')
    print('(1) Android')
    print('(2) WordPress')
    # NOTE(review): the `== 1` test below relies on input() returning an
    # int, i.e. on Python 2 semantics, where input() evaluates what the
    # user types -- do not feed it untrusted input.
    data_utilizar = input()

    # Force the default encoding so reading the file raises no decode error.
    reload(sys)
    sys.setdefaultencoding('utf8')

    if data_utilizar == 1:
        file_path = configuration.FILE_PATH_ANDROID
        reader = Reader(line_format='user item rating', sep='\t')
    else:
        file_path = configuration.FILE_PATH_WORDPRESS
        reader = Reader(line_format='user item rating', sep=',')

    # Dataset
    data = Dataset.load_from_file(file_path, reader=reader)
    data.split(n_folds=10)

    algo = NMF()
    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    print_perf(perf)
def get_model(model_name):
    """Build an algorithm instance from an underscore-separated spec.

    KNN specs (any name containing ``'KNN'``) are read as
    ``<class>[_<U|I>[_<similarity>[_<k>]]]``: user-based unless the second
    token is exactly ``'I'``, similarity defaults to ``'msd'`` and ``k`` to
    20. Factor-model specs (names containing ``'SVD'`` or ``'NMF'``) are read
    as ``<class>[_<n_factors>]`` with 25 factors by default.

    Args:
        model_name (str): the spec string.

    Returns:
        The configured algorithm, or ``None`` when the leading token names no
        supported class.
    """
    if 'KNN' in model_name:
        parts = model_name.split('_')
        family = parts[0]
        user_based = not (len(parts) > 1 and parts[1] == 'I')
        similarity = parts[2] if len(parts) >= 3 else 'msd'
        k = int(parts[3]) if len(parts) >= 4 else 20
        sim_options = {'user_based': user_based, 'name': similarity}

        if family == 'KNNBasic':
            return KNNBasic(sim_options=sim_options, k=k)
        if family == 'KNNWithMeans':
            return KNNWithMeans(sim_options=sim_options, k=k)
        if family == 'KNNWithZScore':
            return KNNWithZScore(sim_options=sim_options, k=k)
        return None

    # 'SVD' also matches every name that contains 'SVDpp'.
    if any(tag in model_name for tag in ('SVD', 'NMF')):
        parts = model_name.split('_')
        n_factors = int(parts[1]) if len(parts) != 1 else 25
        family = parts[0]

        if family == 'SVDpp':
            return SVDpp(n_factors=n_factors)
        if family == 'SVD':
            return SVD(n_factors=n_factors)
        if family == 'NMF':
            return NMF(n_factors=n_factors)

    return None
def model(self, alg_key):
    """Evaluate the chosen algorithm, then print one sample prediction.

    Args:
        alg_key (str): ``"svd"``, ``"knn"`` or ``"nmf"``, case-insensitive.

    The ratings come from ``self.make_df()``. The algorithm is evaluated for
    RMSE and MAE, retrained on the full training set, and a prediction for a
    fixed user/place pair is printed.
    """
    reader = Reader(rating_scale = (1, 5))
    ratings_frame = self.make_df()[['user_id', 'place_id', 'score']]
    data_result = Dataset.load_from_df(ratings_frame, reader)

    # split data into 10 folds
    data_result.split(n_folds=10)

    # Pick the algorithm. There is deliberately no else branch: an unknown
    # key leaves `alg` unbound, exactly as before.
    key = alg_key.lower()
    if key == "svd":
        alg = SVD()
    elif key == "knn":
        alg = KNNBasic()
    elif key == "nmf":
        alg = NMF()

    # evaluation
    evaluate(alg, data_result, measures=['RMSE', 'MAE'])

    # prediction for one fixed user/place pair, with its known score
    alg.train(data_result.build_full_trainset())
    print(alg.predict('******', 'smallShop_7089', 4))
def crossvalidate(data):
    """Cross-validate a range of algorithms and rank them by RMSE.

    Args:
        data: dataset accepted by ``cross_validate``.

    Returns:
        pandas.DataFrame: one row per algorithm (indexed by ``Algorithm``)
        holding the mean of every 5-fold result column, sorted by
        ``test_rmse``.
    """
    results = []
    for algorithm in [
            NormalPredictor(),
            KNNBaseline(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNBasic(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNWithMeans(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNWithZScore(k=15, sim_options=similarity_measure('pearson', 1)),
            BaselineOnly(),
            SVD(),
            SVDpp(),
            NMF(),
            SlopeOne(),
            CoClustering()
    ]:
        result = cross_validate(algorithm, data, measures=['RMSE'], cv=5,
                                verbose=False)
        mean_scores = pd.DataFrame.from_dict(result).mean(axis=0)
        # Class name = last dotted component of the object's string form.
        label = pd.Series([str(algorithm).split(' ')[0].split(".")[-1]],
                          index=['Algorithm'])
        # pd.concat replaces Series.append, which pandas 2.0 removed.
        results.append(pd.concat([mean_scores, label]))

    rmse_values = pd.DataFrame(results).set_index('Algorithm').sort_values(
        'test_rmse')
    return rmse_values
def q7():
    """Evaluate NMF (RMSE and MAE) on ``restaurant_ratings.txt``.

    The file is read as tab-separated ``user item rating timestamp`` lines
    and split into 3 folds. The result is printed with ``print_perf``.
    """
    ratings_path = os.path.expanduser('restaurant_ratings.txt')
    tsv_reader = Reader(line_format='user item rating timestamp', sep='\t')

    dataset = Dataset.load_from_file(ratings_path, reader=tsv_reader)
    dataset.split(n_folds=3)

    print_perf(evaluate(NMF(), dataset, measures=['RMSE', 'MAE']))
def generate_algorithms(self, rating_data):
    """Create the untuned algorithms plus a tuned SVD.

    Untuned and tuned algorithms are kept apart because tuning can take a
    long time; the tuning part is easy to comment out when it is not needed.

    Args:
        rating_data: the main data set, handed to ``tune_and_find_param``.

    Returns:
        dict: algorithm name -> algorithm object, with the keys ``'SVD'``,
        ``'PMF'``, ``'SVD++'``, ``'NMF'`` and ``'SVD_tuned'``.
    """
    algo = {
        'SVD': SVD(),
        'PMF': SVD(biased=False),
        'SVD++': SVDpp(),
        'NMF': NMF(),
    }
    print('Generated algo object for SVD, PMF, SVD++, and NMF.')

    # generate tuned SVD algorithm
    param_grid_svd = {
        'n_factors': [130, 200],
        'n_epochs': [50, 60],
        'lr_all': [0.0015, 0.002],
        'reg_all': [0.02, 0.03]
    }
    best_params_svd = self.tune_and_find_param('SVD', SVD, rating_data,
                                               param_grid_svd)

    # Build the tuned model from every tuned hyper-parameter. reg_all is
    # part of the grid above but was previously dropped, which silently fell
    # back to the library default.
    algo['SVD_tuned'] = SVD(n_factors=best_params_svd['n_factors'],
                            n_epochs=best_params_svd['n_epochs'],
                            lr_all=best_params_svd['lr_all'],
                            reg_all=best_params_svd['reg_all'])

    # TODO: tune SVD++ and NMF the same way (define a grid, call
    # tune_and_find_param, register the results as 'SVD++_tuned' and
    # 'NMF_tuned').
    return algo
def __init__(self):
    """Tune NMF over a small grid and keep a model with the best settings.

    The parent class is initialised with the name ``"nmf"``, the ``NMF``
    class and the grid. The parent's ``tune()`` result is printed and then
    used to build ``self.algo``.
    """
    search_grid = {
        'n_factors': [15, 20],
        'n_epochs': [50, 70]
    }
    super().__init__("nmf", NMF, param_grid=search_grid)

    best_params = super().tune()
    print(best_params)

    self.algo = NMF(n_factors=best_params['n_factors'],
                    n_epochs=best_params['n_epochs'])
def get_model_old(model_name):
    """Map a model name to a configured algorithm instance.

    Exact names are checked first (``KNNBasic_U``/``_I``,
    ``KNNWithMeans_I``/``_U``, ``KNNWithZScore_I``/``_U``, ``SVDpp``,
    ``SVD``, ``NMF``). After that come parametrised names, matched by
    substring: ``NMF_<n>``, ``SVDpp_<n>`` and ``SVD_<n>`` set ``n_factors``,
    while ``KNNBasic_U_<k>`` and ``KNNBasic_I_<k>`` set ``k``.

    Args:
        model_name (str): name to resolve.

    Returns:
        The algorithm instance, or ``None`` when the name is not recognised.
    """
    # --- exact names -----------------------------------------------------
    if model_name == 'KNNBasic_U':
        return KNNBasic(sim_options={'user_based': True}, k=20)
    if model_name == 'KNNBasic_I':
        return KNNBasic(sim_options={'user_based': False}, k=20)
    if model_name == 'KNNWithMeans_I':
        return KNNWithMeans(sim_options={'user_based': False}, k=20)
    if model_name == 'KNNWithMeans_U':
        return KNNWithMeans(sim_options={'user_based': True}, k=20)
    if model_name == 'KNNWithZScore_I':
        return KNNWithZScore(sim_options={'user_based': False}, k=20)
    if model_name == 'KNNWithZScore_U':
        return KNNWithZScore(sim_options={'user_based': True}, k=20)
    if model_name == 'SVDpp':
        return SVDpp()
    if model_name == 'SVD':
        return SVD()
    if model_name == 'NMF':
        return NMF()

    # --- parametrised names (substring match, order matters) --------------
    if 'NMF_' in model_name:
        return NMF(n_factors=int(model_name.split("_")[1]))
    if 'SVDpp_' in model_name:
        return SVDpp(n_factors=int(model_name.split("_")[1]))
    if 'SVD_' in model_name:
        return SVD(n_factors=int(model_name.split("_")[1]))
    if 'KNNBasic_U_' in model_name:
        neighbours = int(model_name.split("_")[-1])
        return KNNBasic(sim_options={'user_based': True}, k=neighbours)
    if 'KNNBasic_I_' in model_name:
        neighbours = int(model_name.split("_")[-1])
        return KNNBasic(sim_options={'user_based': False}, k=neighbours)

    return None
def __init__(self, algo='knn_baseline', filepath=None):
    """Validate the data path and select the recommendation algorithm.

    Args:
        algo: ``'nmf'`` selects ``NMF``; any other value selects
            ``KNNBaseline``.
        filepath: path of the data file; it must exist.

    Raises:
        FileNotFoundError: if ``filepath`` is ``None`` or does not exist.
    """
    # os.path.exists(None) raises TypeError, so the default value used to
    # fail with an unrelated error; treat None as a missing file instead.
    if filepath is None or not os.path.exists(filepath):
        raise FileNotFoundError("{} not exist".format(filepath))
    self.filepath = filepath

    if algo == 'nmf':
        self.algo = NMF()
        self.model_name = 'nmf'
    else:
        self.algo = KNNBaseline()
        self.model_name = 'knn_baseline'

    self.convertor = DataConvertHelper()
def nmf(self, namefile, uid, iid, rati, value_uid, value_iid):
    """Fit NMF on a CSV from ``./container/`` and predict one rating.

    Args:
        namefile: file name inside ``./container/``.
        uid: name of the user-id column.
        iid: name of the item-id column.
        rati: name of the rating column.
        value_uid: user id to predict for (converted with ``int``).
        value_iid: item id to predict for (converted with ``int``).

    Returns:
        dict: ``{"uid": ..., "idd": ..., "rati": ...}`` with the estimate
        rounded to two decimals.
    """
    frame = pd.read_csv('./container/' + namefile)

    # Build the full training set on a 0-100 rating scale.
    scale_reader = Reader(rating_scale=(0, 100))
    dataset = Dataset.load_from_df(frame[[uid, iid, rati]], scale_reader)
    full_trainset = dataset.build_full_trainset()

    model = NMF()
    model.fit(full_trainset)
    pred = model.predict(int(value_uid), int(value_iid), r_ui=1, verbose=True)

    # Plain dict, ready to be serialised as JSON by the caller.
    return {
        "uid": pred.uid,
        "idd": pred.iid,
        "rati": round(pred.est, 2),
    }
def train():
    """Run ``train_helper`` for SVD and NMF on every cross-validation fold.

    The data comes from ``load_dataset()`` and is split into ``KFOLD_NUM``
    folds. The last argument given to ``train_helper`` is true only for the
    first fold.
    """
    data = load_dataset()
    named_algorithms = (("SVD", SVD()), ("NMF", NMF()))

    print("Cross Validation procedure")
    splitter = KFold(n_splits=KFOLD_NUM)
    for fold_number, (trainset_cv, testset_cv) in enumerate(
            splitter.split(data), start=1):
        print(f"===> Fold number {fold_number}")
        # Save the first fold
        is_first_fold = fold_number == 1
        for label, algorithm in named_algorithms:
            train_helper(algorithm, label, trainset_cv, testset_cv,
                         is_first_fold)
def trainFinalModels(ratingsTrainDataset, ratingsTest, bestParamsNMF,
                     bestParamsKNN):
    """Train, save and score the final NMF and KNN models.

    Args:
        ratingsTrainDataset: dataset the full training set is built from.
        ratingsTest: test set handed to each model's ``test``.
        bestParamsNMF: keyword arguments for ``NMF``.
        bestParamsKNN: keyword arguments for ``KNNWithMeans``.

    Each model is fitted, stored with ``saveModel`` and its RMSE/MAE are
    recorded with ``saveFinalResult``.
    """
    ratingsTrainTrainset = ratingsTrainDataset.build_full_trainset()

    def _fit_save_and_score(final_model, label):
        """Fit one model, persist it and record its test RMSE and MAE."""
        final_model.fit(ratingsTrainTrainset)
        saveModel(final_model, label)
        predictions = final_model.test(ratingsTest)
        rmseValue = rmse(predictions)
        maeValue = mae(predictions)
        saveFinalResult(label, rmseValue, maeValue)

    # NMF first, then KNN; each model is built only when its turn comes.
    _fit_save_and_score(NMF(**bestParamsNMF), 'NMF')
    _fit_save_and_score(KNNWithMeans(**bestParamsKNN), 'KNN')